/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2020 Alexander V. Chernikov
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_route.h"

#include <sys/param.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/sbuf.h>
#include <sys/lock.h>
#include <sys/rmlock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stdarg.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/queue.h>
#include <net/vnet.h>

#include <net/if.h>
#include <net/if_var.h>

#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#endif

#include <net/route.h>
#include <net/route/nhop.h>
#include <net/route/route_ctl.h>
#include <net/route/route_var.h>
#include <net/route/fib_algo.h>

/*
 * Fib lookup framework.
 *
 * This framework enables accelerated longest-prefix-match lookups for the
 * routing tables by adding the ability to dynamically attach/detach lookup
 * algorithm implementations to/from the datapath.
 *
 * flm - fib lookup modules - implementation of particular lookup algorithm
 * fd - fib data - instance of an flm bound to specific routing table
 *
 * This file provides main framework functionality.
 *
 * The following are the features provided by the framework
 *
 * 1) nexthop abstraction -> provides transparent referencing, indexing
 *   and efficient idx->ptr mappings for nexthops and nexthop groups.
 * 2) Routing table synchronisation
 * 3) dataplane attachment points
 * 4) automatic algorithm selection based on the provided preference.
 *
 *
 * DATAPATH
 * For each supported address family, there is an allocated array of fib_dp
 * structures, indexed by fib number. Each array entry contains a callback
 * function and its argument. The function is called with a family-specific
 * lookup key, scope and the provided argument. This array gets re-created
 * every time a new algo instance gets created. Please take a look at the
 * replace_rtables_family() function for more details.
 */
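
/*
 * Illustrative sketch (not part of the framework itself): a dataplane
 * consumer resolves a route roughly as follows, assuming it runs inside a
 * network epoch section; the real per-family wrappers live in the protocol
 * code (see the fib[46]_lookup() wrappers), and `dst'/`scopeid' below are
 * placeholders:
 *
 *	struct fib_dp *dp = &V_inet_dp[fibnum];
 *	struct flm_lookup_key key = { .addr4 = dst };
 *	struct nhop_object *nh = dp->f(dp->arg, key, scopeid);
 *
 * The returned nexthop is guaranteed to stay referenced until the end of
 * the current epoch (see fib_ref_nhop()/fib_schedule_release_nhop() below).
 */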

SYSCTL_DECL(_net_route);
SYSCTL_NODE(_net_route, OID_AUTO, algo, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Fib algorithm lookups");

/* Algorithm sync policy */

/* Time interval to bucket updates */
VNET_DEFINE_STATIC(unsigned int, update_bucket_time_ms) = 50;
#define	V_update_bucket_time_ms	VNET(update_bucket_time_ms)
SYSCTL_UINT(_net_route_algo, OID_AUTO, bucket_time_ms, CTLFLAG_RW | CTLFLAG_VNET,
    &VNET_NAME(update_bucket_time_ms), 0, "Time interval to calculate update rate");

/* Minimum update rate to delay sync */
VNET_DEFINE_STATIC(unsigned int, bucket_change_threshold_rate) = 500;
#define	V_bucket_change_threshold_rate	VNET(bucket_change_threshold_rate)
SYSCTL_UINT(_net_route_algo, OID_AUTO, bucket_change_threshold_rate, CTLFLAG_RW | CTLFLAG_VNET,
    &VNET_NAME(bucket_change_threshold_rate), 0, "Minimum update rate to delay sync");

/* Max allowed delay to sync */
VNET_DEFINE_STATIC(unsigned int, fib_max_sync_delay_ms) = 1000;
#define	V_fib_max_sync_delay_ms	VNET(fib_max_sync_delay_ms)
SYSCTL_UINT(_net_route_algo, OID_AUTO, fib_max_sync_delay_ms, CTLFLAG_RW | CTLFLAG_VNET,
    &VNET_NAME(fib_max_sync_delay_ms), 0, "Maximum time to delay sync (ms)");


#ifdef INET6
VNET_DEFINE_STATIC(bool, algo_fixed_inet6) = false;
#define	V_algo_fixed_inet6	VNET(algo_fixed_inet6)
SYSCTL_NODE(_net_route_algo, OID_AUTO, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IPv6 longest prefix match lookups");
#endif
#ifdef INET
VNET_DEFINE_STATIC(bool, algo_fixed_inet) = false;
#define	V_algo_fixed_inet	VNET(algo_fixed_inet)
SYSCTL_NODE(_net_route_algo, OID_AUTO, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IPv4 longest prefix match lookups");
#endif

/* Fib instance counter */
static uint32_t fib_gen = 0;

struct nhop_ref_table {
	uint32_t	count;
	int32_t		refcnt[0];
};

enum fib_callout_action {
	FDA_NONE,	/* No callout scheduled */
	FDA_REBUILD,	/* Asks to rebuild algo instance */
	FDA_EVAL,	/* Asks to evaluate if the current algo is still the best */
	FDA_BATCH,	/* Asks to submit batch of updates to the algo */
};

struct fib_sync_status {
	struct timeval		diverge_time;	/* ts when diverged */
	uint32_t		num_changes;	/* number of changes since sync */
	uint32_t		bucket_changes;	/* num changes within the current bucket */
	uint64_t		bucket_id;	/* 50ms bucket # */
	struct fib_change_queue	fd_change_queue;/* list of scheduled entries */
};

/*
 * Data structure for the fib lookup instance tied to the particular rib.
 */
struct fib_data {
	uint32_t		number_nhops;	/* current # of nhops */
	uint8_t			hit_nhops;	/* true if out of nhop limit */
	uint8_t			init_done;	/* true if init is completed */
	uint32_t		fd_dead:1;	/* Scheduled for deletion */
	uint32_t		fd_linked:1;	/* true if linked */
	uint32_t		fd_need_rebuild:1;	/* true if rebuild scheduled */
	uint32_t		fd_batch:1;	/* true if batched notification scheduled */
	uint8_t			fd_family;	/* family */
	uint32_t		fd_fibnum;	/* fibnum */
	uint32_t		fd_failed_rebuilds;	/* stat: failed rebuilds */
	uint32_t		fd_gen;		/* instance gen# */
	struct callout		fd_callout;	/* rebuild callout */
	enum fib_callout_action	fd_callout_action;	/* Callout action to take */
	void			*fd_algo_data;	/* algorithm data */
	struct nhop_object	**nh_idx;	/* nhop idx->ptr array */
	struct nhop_ref_table	*nh_ref_table;	/* array with # of nhop references */
	struct rib_head		*fd_rh;		/* RIB table we're attached to */
	struct rib_subscription	*fd_rs;		/* storing table subscription */
	struct fib_dp		fd_dp;		/* fib datapath data */
	struct vnet		*fd_vnet;	/* vnet fib belongs to */
	struct epoch_context	fd_epoch_ctx;	/* epoch context for deletion */
	struct fib_lookup_module	*fd_flm;/* pointer to the lookup module */
	struct fib_sync_status	fd_ss;		/* State relevant to the rib sync */
	uint32_t		fd_num_changes;	/* number of changes since last callout */
	TAILQ_ENTRY(fib_data)	entries;	/* list of all fds in vnet */
};

static bool rebuild_fd(struct fib_data *fd, const char *reason);
static bool rebuild_fd_flm(struct fib_data *fd, struct fib_lookup_module *flm_new);
static void handle_fd_callout(void *_data);
static void destroy_fd_instance_epoch(epoch_context_t ctx);
static bool is_idx_free(struct fib_data *fd, uint32_t index);
static void set_algo_fixed(struct rib_head *rh);
static bool is_algo_fixed(struct rib_head *rh);

static uint32_t fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh);
static void fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh);

static struct fib_lookup_module *fib_check_best_algo(struct rib_head *rh,
    struct fib_lookup_module *orig_flm);
static void fib_unref_algo(struct fib_lookup_module *flm);
static bool flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum);

struct mtx fib_mtx;
#define	FIB_MOD_LOCK()		mtx_lock(&fib_mtx)
#define	FIB_MOD_UNLOCK()	mtx_unlock(&fib_mtx)
#define	FIB_MOD_LOCK_ASSERT()	mtx_assert(&fib_mtx, MA_OWNED)

MTX_SYSINIT(fib_mtx, &fib_mtx, "algo list mutex", MTX_DEF);

/* Algorithm has to be this percent better than the current to switch */
#define	BEST_DIFF_PERCENT	(5 * 256 / 100)
/* Schedule algo re-evaluation X seconds after a change */
#define	ALGO_EVAL_DELAY_MS	30000
/* Force algo re-evaluation after X changes */
#define	ALGO_EVAL_NUM_ROUTES	100
/* Try to setup algorithm X times */
#define	FIB_MAX_TRIES		32
/* Max amount of supported nexthops */
#define	FIB_MAX_NHOPS		262144
#define	FIB_CALLOUT_DELAY_MS	50


/* Debug */
static int flm_debug_level = LOG_NOTICE;
SYSCTL_INT(_net_route_algo, OID_AUTO, debug_level, CTLFLAG_RW | CTLFLAG_RWTUN,
    &flm_debug_level, 0, "debuglevel");
#define	FLM_MAX_DEBUG_LEVEL	LOG_DEBUG
#ifndef	LOG_DEBUG2
#define	LOG_DEBUG2	8
#endif

#define	_PASS_MSG(_l)	(flm_debug_level >= (_l))
#define	ALGO_PRINTF(_l, _fmt, ...)	if (_PASS_MSG(_l)) {		\
	printf("[fib_algo] %s: " _fmt "\n", __func__, ##__VA_ARGS__);	\
}
#define	_ALGO_PRINTF(_fib, _fam, _aname, _gen, _func, _fmt, ...) \
    printf("[fib_algo] %s.%u (%s#%u) %s: " _fmt "\n",\
    print_family(_fam), _fib, _aname, _gen, _func, ## __VA_ARGS__)
#define	_RH_PRINTF(_fib, _fam, _func, _fmt, ...) \
    printf("[fib_algo] %s.%u %s: " _fmt "\n", print_family(_fam), _fib, _func, ## __VA_ARGS__)
#define	RH_PRINTF(_l, _rh, _fmt, ...)	if (_PASS_MSG(_l)) {	\
    _RH_PRINTF(_rh->rib_fibnum, _rh->rib_family, __func__, _fmt, ## __VA_ARGS__);\
}
#define	FD_PRINTF(_l, _fd, _fmt, ...)	FD_PRINTF_##_l(_l, _fd, _fmt, ## __VA_ARGS__)
#define	_FD_PRINTF(_l, _fd, _fmt, ...)	if (_PASS_MSG(_l)) {		\
    _ALGO_PRINTF(_fd->fd_fibnum, _fd->fd_family, _fd->fd_flm->flm_name,	\
    _fd->fd_gen, __func__, _fmt, ## __VA_ARGS__);			\
}
#if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG2
#define	FD_PRINTF_LOG_DEBUG2	_FD_PRINTF
#else
#define	FD_PRINTF_LOG_DEBUG2(_l, _fd, _fmt, ...)
#endif
#if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG
#define	FD_PRINTF_LOG_DEBUG	_FD_PRINTF
#else
#define	FD_PRINTF_LOG_DEBUG()
#endif
#if FLM_MAX_DEBUG_LEVEL>=LOG_INFO
#define	FD_PRINTF_LOG_INFO	_FD_PRINTF
#else
#define	FD_PRINTF_LOG_INFO()
#endif
#define	FD_PRINTF_LOG_NOTICE	_FD_PRINTF
#define	FD_PRINTF_LOG_ERR	_FD_PRINTF
#define	FD_PRINTF_LOG_WARNING	_FD_PRINTF


/* List of all registered lookup algorithms */
static TAILQ_HEAD(, fib_lookup_module) all_algo_list = TAILQ_HEAD_INITIALIZER(all_algo_list);

/* List of all fib lookup instances in the vnet */
VNET_DEFINE_STATIC(TAILQ_HEAD(fib_data_head, fib_data), fib_data_list);
#define	V_fib_data_list	VNET(fib_data_list)

/* Data structure for storing non-transient fib lookup module failures */
struct fib_error {
	int				fe_family;
	uint32_t			fe_fibnum;	/* failed rtable */
	struct fib_lookup_module	*fe_flm;	/* failed module */
	TAILQ_ENTRY(fib_error)		entries;/* list of all errored entries */
};
VNET_DEFINE_STATIC(TAILQ_HEAD(fib_error_head, fib_error), fib_error_list);
#define	V_fib_error_list VNET(fib_error_list)

/* Per-family array of fibnum -> {func, arg} mappings used in datapath */
struct fib_dp_header {
	struct epoch_context	fdh_epoch_ctx;
	uint32_t		fdh_num_tables;
	struct fib_dp		fdh_idx[0];
};

/*
 * Tries to add new non-transient algorithm error to the list of errors.
 * Returns true on success.
 */
static bool
flm_error_add(struct fib_lookup_module *flm, uint32_t fibnum)
{
	struct fib_error *fe;

	fe = malloc(sizeof(struct fib_error), M_TEMP, M_NOWAIT | M_ZERO);
	if (fe == NULL)
		return (false);
	fe->fe_flm = flm;
	fe->fe_family = flm->flm_family;
	fe->fe_fibnum = fibnum;

	FIB_MOD_LOCK();
	/* Avoid duplicates by checking if error already exists first */
	if (flm_error_check(flm, fibnum)) {
		FIB_MOD_UNLOCK();
		free(fe, M_TEMP);
		return (true);
	}
	TAILQ_INSERT_HEAD(&V_fib_error_list, fe, entries);
	FIB_MOD_UNLOCK();

	return (true);
}

/*
 * True if non-transient error has been registered for @flm in @fibnum.
 */
static bool
flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum)
{
	const struct fib_error *fe;

	TAILQ_FOREACH(fe, &V_fib_error_list, entries) {
		if ((fe->fe_flm == flm) && (fe->fe_fibnum == fibnum))
			return (true);
	}

	return (false);
}

/*
 * Clear all errors of algo specified by @flm.
 */
static void
fib_error_clear_flm(struct fib_lookup_module *flm)
{
	struct fib_error *fe, *fe_tmp;

	FIB_MOD_LOCK_ASSERT();

	TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
		if (fe->fe_flm == flm) {
			TAILQ_REMOVE(&V_fib_error_list, fe, entries);
			free(fe, M_TEMP);
		}
	}
}

/*
 * Clears all errors in current VNET.
 */
static void
fib_error_clear(void)
{
	struct fib_error *fe, *fe_tmp;

	FIB_MOD_LOCK_ASSERT();

	TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
		TAILQ_REMOVE(&V_fib_error_list, fe, entries);
		free(fe, M_TEMP);
	}
}

static const char *
print_op_result(enum flm_op_result result)
{
	switch (result) {
	case FLM_SUCCESS:
		return "success";
	case FLM_REBUILD:
		return "rebuild";
	case FLM_BATCH:
		return "batch";
	case FLM_ERROR:
		return "error";
	}

	return "unknown";
}

static const char *
print_family(int family)
{

	if (family == AF_INET)
		return ("inet");
	else if (family == AF_INET6)
		return ("inet6");
	else
		return ("unknown");
}

/*
 * Debug function used by lookup algorithms.
 * Outputs message denoted by @fmt, prepended by "[fib_algo] inetX.Y (algo) "
 */
void
fib_printf(int level, struct fib_data *fd, const char *func, char *fmt, ...)
{
	char buf[128];
	va_list ap;

	if (level > flm_debug_level)
		return;

	va_start(ap, fmt);
	vsnprintf(buf, sizeof(buf), fmt, ap);
	va_end(ap);

	_ALGO_PRINTF(fd->fd_fibnum, fd->fd_family, fd->fd_flm->flm_name,
	    fd->fd_gen, func, "%s", buf);
}

/*
 * Outputs list of algorithms supported by the provided address family.
 */
static int
print_algos_sysctl(struct sysctl_req *req, int family)
{
	struct fib_lookup_module *flm;
	struct sbuf sbuf;
	int error, count = 0;

	error = sysctl_wire_old_buffer(req, 0);
	if (error == 0) {
		sbuf_new_for_sysctl(&sbuf, NULL, 512, req);
		TAILQ_FOREACH(flm, &all_algo_list, entries) {
			if (flm->flm_family == family) {
				if (count++ > 0)
					sbuf_cat(&sbuf, ", ");
				sbuf_cat(&sbuf, flm->flm_name);
			}
		}
		error = sbuf_finish(&sbuf);
		sbuf_delete(&sbuf);
	}
	return (error);
}

#ifdef INET6
static int
print_algos_sysctl_inet6(SYSCTL_HANDLER_ARGS)
{

	return (print_algos_sysctl(req, AF_INET6));
}
SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo_list,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    print_algos_sysctl_inet6, "A", "List of IPv6 lookup algorithms");
#endif

#ifdef INET
static int
print_algos_sysctl_inet(SYSCTL_HANDLER_ARGS)
{

	return (print_algos_sysctl(req, AF_INET));
}
SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo_list,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    print_algos_sysctl_inet, "A", "List of IPv4 lookup algorithms");
#endif
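
/*
 * Example (illustrative): the available algorithms and the active one can
 * be inspected and changed at runtime via the sysctl tree defined in this
 * file; the name passed to the `algo' node is a placeholder here, use one
 * reported by the algo_list node on the running system:
 *
 *	# sysctl net.route.algo.inet.algo_list
 *	# sysctl net.route.algo.inet.algo=<name-from-algo_list>
 *
 * Setting the algo explicitly marks it as fixed (see set_algo_fixed()),
 * which disables automatic algorithm switching for that family.
 */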

/*
 * Calculate delay between repeated failures.
 * Returns current delay in milliseconds.
 */
static uint32_t
callout_calc_delay_ms(struct fib_data *fd)
{
	uint32_t shift;

	if (fd->fd_failed_rebuilds > 10)
		shift = 10;
	else
		shift = fd->fd_failed_rebuilds;

	return ((1 << shift) * FIB_CALLOUT_DELAY_MS);
}

static void
schedule_callout(struct fib_data *fd, enum fib_callout_action action, int delay_ms)
{

	FD_PRINTF(LOG_DEBUG, fd, "delay=%d action=%d", delay_ms, action);
	fd->fd_callout_action = action;
	callout_reset_sbt(&fd->fd_callout, SBT_1MS * delay_ms, 0,
	    handle_fd_callout, fd, 0);
}

static void
schedule_fd_rebuild(struct fib_data *fd, const char *reason)
{

	RIB_WLOCK_ASSERT(fd->fd_rh);

	if (!fd->fd_need_rebuild) {
		fd->fd_need_rebuild = true;
		/* Stop batch updates */
		fd->fd_batch = false;

		/*
		 * Potentially re-schedules pending callout
		 * initiated by schedule_algo_eval.
		 */
		FD_PRINTF(LOG_INFO, fd, "Scheduling rebuild: %s (failures=%d)",
		    reason, fd->fd_failed_rebuilds);
		schedule_callout(fd, FDA_REBUILD, callout_calc_delay_ms(fd));
	}
}

static void
sync_rib_gen(struct fib_data *fd)
{
	FD_PRINTF(LOG_DEBUG, fd, "Sync gen %u -> %u", fd->fd_rh->rnh_gen, fd->fd_rh->rnh_gen_rib);
	fd->fd_rh->rnh_gen = fd->fd_rh->rnh_gen_rib;
}

static int64_t
get_tv_diff_ms(const struct timeval *old_tv, const struct timeval *new_tv)
{
	int64_t diff = 0;

	diff = ((int64_t)(new_tv->tv_sec - old_tv->tv_sec)) * 1000;
	diff += (new_tv->tv_usec - old_tv->tv_usec) / 1000;

	return (diff);
}

static void
add_tv_diff_ms(struct timeval *tv, int ms)
{
	tv->tv_sec += ms / 1000;
	ms = ms % 1000;
	if (ms * 1000 + tv->tv_usec < 1000000)
		tv->tv_usec += ms * 1000;
	else {
		tv->tv_sec += 1;
		tv->tv_usec = ms * 1000 + tv->tv_usec - 1000000;
	}
}

/*
 * Marks the time when algo state diverges from the rib state.
 */
static void
mark_diverge_time(struct fib_data *fd)
{
	struct fib_sync_status *fd_ss = &fd->fd_ss;

	getmicrouptime(&fd_ss->diverge_time);
	fd_ss->bucket_id = 0;
	fd_ss->bucket_changes = 0;
}

/*
 * Calculates and updates the next algorithm sync time, based on the current activity.
 *
 * The intent is to provide a reasonable balance between the update latency
 * and efficient batching when changing a large number of routes.
 *
 * The high-level algorithm is as follows:
 * 1) all changes are bucketed in 50ms intervals
 * 2) If the number of changes within the bucket is greater than the threshold,
 *  the update gets delayed, up to the maximum delay threshold.
 */
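
/*
 * Worked example (based on the defaults above, assuming a steady burst of
 * route changes): with bucket_time_ms=50 and bucket_change_threshold_rate=500
 * changes/sec, the per-bucket threshold is 500 * 50 / 1000 = 25 changes.
 * Once a 50ms bucket accumulates 25 changes, the pending sync is pushed out
 * to the end of the following bucket; while the burst keeps exceeding the
 * threshold, the delay grows bucket by bucket until fib_max_sync_delay_ms
 * (1000ms) is reached, at which point the already-scheduled sync fires
 * regardless of the remaining load.
 */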

static void
update_rebuild_delay(struct fib_data *fd, enum fib_callout_action action)
{
	uint32_t bucket_id, new_delay = 0;
	struct timeval tv;

	/* Fetch all variables at once to ensure consistent reads */
	uint32_t bucket_time_ms = V_update_bucket_time_ms;
	uint32_t threshold_rate = V_bucket_change_threshold_rate;
	uint32_t max_delay_ms = V_fib_max_sync_delay_ms;

	if (bucket_time_ms == 0)
		bucket_time_ms = 50;
	/* calculate per-bucket threshold rate */
	threshold_rate = threshold_rate * bucket_time_ms / 1000;

	getmicrouptime(&tv);

	struct fib_sync_status *fd_ss = &fd->fd_ss;

	bucket_id = get_tv_diff_ms(&fd_ss->diverge_time, &tv) / bucket_time_ms;

	if (fd_ss->bucket_id == bucket_id) {
		fd_ss->bucket_changes++;
		if (fd_ss->bucket_changes == threshold_rate) {
			new_delay = (bucket_id + 2) * bucket_time_ms;
			if (new_delay <= max_delay_ms) {
				FD_PRINTF(LOG_DEBUG, fd,
				    "hit threshold of %u routes, delay update, "
				    "bucket: %u, total delay: %u",
				    threshold_rate, bucket_id + 1, new_delay);
			} else {
				new_delay = 0;
				FD_PRINTF(LOG_DEBUG, fd,
				    "maximum sync delay (%u ms) reached", max_delay_ms);
			}
		} else if ((bucket_id == 0) && (fd_ss->bucket_changes == 1))
			new_delay = bucket_time_ms;
	} else {
		fd_ss->bucket_id = bucket_id;
		fd_ss->bucket_changes = 1;
	}

	if (new_delay > 0) {
		/* Calculated time has been updated */
		struct timeval new_tv = fd_ss->diverge_time;
		add_tv_diff_ms(&new_tv, new_delay);

		int32_t delay_ms = get_tv_diff_ms(&tv, &new_tv);
		schedule_callout(fd, action, delay_ms);
	}
}

static void
update_algo_state(struct fib_data *fd)
{

	RIB_WLOCK_ASSERT(fd->fd_rh);

	if (fd->fd_batch || fd->fd_need_rebuild) {
		enum fib_callout_action action = fd->fd_need_rebuild ?
		    FDA_REBUILD : FDA_BATCH;
		update_rebuild_delay(fd, action);
		return;
	}

	if (fd->fd_num_changes++ == 0) {
		/* Start callout to consider switch */
		if (!callout_pending(&fd->fd_callout))
			schedule_callout(fd, FDA_EVAL, ALGO_EVAL_DELAY_MS);
	} else if (fd->fd_num_changes == ALGO_EVAL_NUM_ROUTES) {
		/* Reset callout to exec immediately */
		if (fd->fd_callout_action == FDA_EVAL)
			schedule_callout(fd, FDA_EVAL, 1);
	}
}

static bool
need_immediate_sync(struct fib_data *fd, struct rib_cmd_info *rc)
{
	struct nhop_object *nh;

	/* Sync addition/removal of interface routes */
	switch (rc->rc_cmd) {
	case RTM_ADD:
		nh = rc->rc_nh_new;
		if (!NH_IS_NHGRP(nh)) {
			if (!(nh->nh_flags & NHF_GATEWAY))
				return (true);
			if (nhop_get_rtflags(nh) & RTF_STATIC)
				return (true);
		}
		break;
	case RTM_DELETE:
		nh = rc->rc_nh_old;
		if (!NH_IS_NHGRP(nh)) {
			if (!(nh->nh_flags & NHF_GATEWAY))
				return (true);
			if (nhop_get_rtflags(nh) & RTF_STATIC)
				return (true);
		}
		break;
	}

	return (false);
}

static bool
apply_rtable_changes(struct fib_data *fd)
{
	enum flm_op_result result;
	struct fib_change_queue *q = &fd->fd_ss.fd_change_queue;

	result = fd->fd_flm->flm_change_rib_items_cb(fd->fd_rh, q, fd->fd_algo_data);

	if (result == FLM_SUCCESS) {
		sync_rib_gen(fd);
		for (int i = 0; i < q->count; i++)
			if (q->entries[i].nh_old)
				fib_unref_nhop(fd, q->entries[i].nh_old);
		q->count = 0;
	}
	fd->fd_batch = false;

	return (result == FLM_SUCCESS);
}

static bool
fill_change_entry(struct fib_data *fd, struct fib_change_entry *ce, struct rib_cmd_info *rc)
{
	int plen = 0;

	switch (fd->fd_family) {
#ifdef INET
	case AF_INET:
		rt_get_inet_prefix_plen(rc->rc_rt, &ce->addr4, &plen, &ce->scopeid);
		break;
#endif
#ifdef INET6
	case AF_INET6:
		rt_get_inet6_prefix_plen(rc->rc_rt, &ce->addr6, &plen, &ce->scopeid);
		break;
#endif
	}

	ce->plen = plen;
	ce->nh_old = rc->rc_nh_old;
	ce->nh_new = rc->rc_nh_new;
	if (ce->nh_new != NULL) {
		if (fib_ref_nhop(fd, ce->nh_new) == 0)
			return (false);
	}

	return (true);
}

static bool
queue_rtable_change(struct fib_data *fd, struct rib_cmd_info *rc)
{
	struct fib_change_queue *q = &fd->fd_ss.fd_change_queue;

	if (q->count >= q->size) {
		uint32_t q_size;

		if (q->size == 0)
			q_size = 256;	/* ~18k memory */
		else
			q_size = q->size * 2;

		size_t size = q_size * sizeof(struct fib_change_entry);
		void *a = realloc(q->entries, size, M_TEMP, M_NOWAIT | M_ZERO);
		if (a == NULL) {
			FD_PRINTF(LOG_INFO, fd, "Unable to realloc queue for %u elements",
			    q_size);
			return (false);
		}
		q->entries = a;
		q->size = q_size;
	}

	return (fill_change_entry(fd, &q->entries[q->count++], rc));
}

/*
 * Rib subscription handler. Checks if the algorithm is ready to
 * receive updates, handles nexthop refcounting and passes change
 * data to the algorithm callback.
 */
static void
handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
    void *_data)
{
	struct fib_data *fd = (struct fib_data *)_data;
	enum flm_op_result result;

	RIB_WLOCK_ASSERT(rnh);

	/*
	 * There is a small gap between subscribing for route changes
	 * and initiating rtable dump. Avoid receiving route changes
	 * prior to finishing rtable dump by checking `init_done`.
	 */
	if (!fd->init_done)
		return;

	bool immediate_sync = need_immediate_sync(fd, rc);

	/* Consider scheduling algorithm re-evaluation */
	update_algo_state(fd);

	/*
	 * If algo requested rebuild, stop sending updates by default.
	 * This simplifies nexthop refcount handling logic.
	 */
	if (fd->fd_need_rebuild) {
		if (immediate_sync)
			rebuild_fd(fd, "rtable change type enforced sync");
		return;
	}

	/*
	 * Algo requested updates to be delivered in batches.
	 * Add the current change to the queue and return.
	 */
	if (fd->fd_batch) {
		if (immediate_sync) {
			if (!queue_rtable_change(fd, rc) || !apply_rtable_changes(fd))
				rebuild_fd(fd, "batch sync failed");
		} else {
			if (!queue_rtable_change(fd, rc))
				schedule_fd_rebuild(fd, "batch queue failed");
		}
		return;
	}

	/*
	 * Maintain the guarantee that every nexthop returned by the dataplane
	 * lookup has > 0 refcount, so it can be safely referenced within the
	 * current epoch.
	 */
	if (rc->rc_nh_new != NULL) {
		if (fib_ref_nhop(fd, rc->rc_nh_new) == 0) {
			/* ran out of indexes */
			schedule_fd_rebuild(fd, "ran out of nhop indexes");
			return;
		}
	}

	result = fd->fd_flm->flm_change_rib_item_cb(rnh, rc, fd->fd_algo_data);

	switch (result) {
	case FLM_SUCCESS:
		sync_rib_gen(fd);
		/* Unref old nexthop on success */
		if (rc->rc_nh_old != NULL)
			fib_unref_nhop(fd, rc->rc_nh_old);
		break;
	case FLM_BATCH:

		/*
		 * Algo asks to batch the changes.
		 */
		if (queue_rtable_change(fd, rc)) {
			if (!immediate_sync) {
				fd->fd_batch = true;
				mark_diverge_time(fd);
				update_rebuild_delay(fd, FDA_BATCH);
				break;
			}
			if (apply_rtable_changes(fd))
				break;
		}
		FD_PRINTF(LOG_ERR, fd, "batched sync failed, force the rebuild");
		/* FALLTHROUGH */
	case FLM_REBUILD:

		/*
		 * Algo is not able to apply the update.
		 * Schedule algo rebuild.
		 */
		if (!immediate_sync) {
			mark_diverge_time(fd);
			schedule_fd_rebuild(fd, "algo requested rebuild");
			break;
		}

		FD_PRINTF(LOG_INFO, fd, "running sync rebuild");
		rebuild_fd(fd, "rtable change type enforced sync");
		break;
	case FLM_ERROR:

		/*
		 * Algo reported a non-recoverable error.
		 * Record the error and schedule rebuild, which will
		 * trigger best algo selection.
		 */
		FD_PRINTF(LOG_ERR, fd, "algo reported non-recoverable error");
		if (!flm_error_add(fd->fd_flm, fd->fd_fibnum))
			FD_PRINTF(LOG_ERR, fd, "failed to ban algo");
		schedule_fd_rebuild(fd, "algo reported non-recoverable error");
	}
}

static void
estimate_nhop_scale(const struct fib_data *old_fd, struct fib_data *fd)
{

	if (old_fd == NULL) {
		// TODO: read from rtable
		fd->number_nhops = 16;
		return;
	}

	if (old_fd->hit_nhops && old_fd->number_nhops < FIB_MAX_NHOPS)
		fd->number_nhops = 2 * old_fd->number_nhops;
	else
		fd->number_nhops = old_fd->number_nhops;
}

struct walk_cbdata {
	struct fib_data		*fd;
	flm_dump_t		*func;
	enum flm_op_result	result;
};

/*
 * Handler called after all rtentries have been dumped.
 * Performs post-dump framework checks and calls
 * algo:flm_dump_end_cb().
 *
 * Updates walk_cbdata result.
 */
static void
sync_algo_end_cb(struct rib_head *rnh, enum rib_walk_hook stage, void *_data)
{
	struct walk_cbdata *w = (struct walk_cbdata *)_data;
	struct fib_data *fd = w->fd;

	RIB_WLOCK_ASSERT(w->fd->fd_rh);

	if (rnh->rib_dying) {
		w->result = FLM_ERROR;
		return;
	}

	if (fd->hit_nhops) {
		FD_PRINTF(LOG_INFO, fd, "ran out of nexthops at %u nhops",
		    fd->nh_ref_table->count);
		if (w->result == FLM_SUCCESS)
			w->result = FLM_REBUILD;
		return;
	}

	if (stage != RIB_WALK_HOOK_POST || w->result != FLM_SUCCESS)
		return;

	/* Post-dump hook, dump successful */
	w->result = fd->fd_flm->flm_dump_end_cb(fd->fd_algo_data, &fd->fd_dp);

	if (w->result == FLM_SUCCESS) {
		/* Mark init as done to allow routing updates */
		fd->init_done = 1;
	}
}

/*
 * Callback for each entry in rib.
 * Calls algo:flm_dump_rib_item_cb func as a part of initial
 * route table synchronisation.
 */
static int
sync_algo_cb(struct rtentry *rt, void *_data)
{
	struct walk_cbdata *w = (struct walk_cbdata *)_data;

	RIB_WLOCK_ASSERT(w->fd->fd_rh);

	if (w->result == FLM_SUCCESS && w->func) {

		/*
		 * Reference nexthops to maintain guarantee that
		 * each nexthop returned by datapath has > 0 references
		 * and can be safely referenced within current epoch.
		 */
		struct nhop_object *nh = rt_get_raw_nhop(rt);
		if (fib_ref_nhop(w->fd, nh) != 0)
			w->result = w->func(rt, w->fd->fd_algo_data);
		else
			w->result = FLM_REBUILD;
	}

	return (0);
}

/*
 * Dump all routing table state to the algo instance.
 */
static enum flm_op_result
sync_algo(struct fib_data *fd)
{
	struct walk_cbdata w = {
		.fd = fd,
		.func = fd->fd_flm->flm_dump_rib_item_cb,
		.result = FLM_SUCCESS,
	};

	rib_walk_ext_locked(fd->fd_rh, sync_algo_cb, sync_algo_end_cb, &w);

	FD_PRINTF(LOG_INFO, fd,
	    "initial dump completed (rtable version: %d), result: %s",
	    fd->fd_rh->rnh_gen, print_op_result(w.result));

	return (w.result);
}

/*
 * Schedules epoch-backed @fd instance deletion.
 * * Unlinks @fd from the list of active algo instances.
 * * Removes rib subscription.
 * * Stops callout.
 * * Schedules actual deletion.
 *
 * Assume @fd is already unlinked from the datapath.
 */
static int
schedule_destroy_fd_instance(struct fib_data *fd, bool in_callout)
{
	bool is_dead;

	NET_EPOCH_ASSERT();
	RIB_WLOCK_ASSERT(fd->fd_rh);

	FIB_MOD_LOCK();
	is_dead = fd->fd_dead;
	if (!is_dead)
		fd->fd_dead = true;
	if (fd->fd_linked) {
		TAILQ_REMOVE(&V_fib_data_list, fd, entries);
		fd->fd_linked = false;
	}
	FIB_MOD_UNLOCK();
	if (is_dead)
		return (0);

	FD_PRINTF(LOG_INFO, fd, "DETACH");

	if (fd->fd_rs != NULL)
		rib_unsubscribe_locked(fd->fd_rs);

	/*
	 * After rib_unsubscribe() no _new_ handle_rtable_change_cb() calls
	 * will be executed, hence no _new_ callout schedules will happen.
	 */
	callout_stop(&fd->fd_callout);

	fib_epoch_call(destroy_fd_instance_epoch, &fd->fd_epoch_ctx);

	return (0);
}

/*
 * Wipe all fd instances from the list matching rib specified by @rh.
 * If @keep_first is set, remove all but the first record.
 */
static void
fib_cleanup_algo(struct rib_head *rh, bool keep_first, bool in_callout)
{
	struct fib_data_head tmp_head = TAILQ_HEAD_INITIALIZER(tmp_head);
	struct fib_data *fd, *fd_tmp;
	struct epoch_tracker et;

	FIB_MOD_LOCK();
	TAILQ_FOREACH_SAFE(fd, &V_fib_data_list, entries, fd_tmp) {
		if (fd->fd_rh == rh) {
			if (keep_first) {
				keep_first = false;
				continue;
			}
			TAILQ_REMOVE(&V_fib_data_list, fd, entries);
			fd->fd_linked = false;
			TAILQ_INSERT_TAIL(&tmp_head, fd, entries);
		}
	}
	FIB_MOD_UNLOCK();

	/* Pass 2: remove each entry */
	NET_EPOCH_ENTER(et);
	TAILQ_FOREACH_SAFE(fd, &tmp_head, entries, fd_tmp) {
		if (!in_callout)
			RIB_WLOCK(fd->fd_rh);
		schedule_destroy_fd_instance(fd, in_callout);
		if (!in_callout)
			RIB_WUNLOCK(fd->fd_rh);
	}
	NET_EPOCH_EXIT(et);
}

void
fib_destroy_rib(struct rib_head *rh)
{

	/*
	 * rnh has `is_dying` flag set, so setup of new fd's will fail at
	 * sync_algo() stage, preventing new entries from being added to the
	 * list of active algos. Remove all existing entries for the
	 * particular rib.
	 */
	fib_cleanup_algo(rh, false, false);
}

/*
 * Finalises fd destruction by freeing all fd resources.
 */
static void
destroy_fd_instance(struct fib_data *fd)
{

	FD_PRINTF(LOG_INFO, fd, "destroy fd %p", fd);

	/* Call destroy callback first */
	if (fd->fd_algo_data != NULL)
		fd->fd_flm->flm_destroy_cb(fd->fd_algo_data);

	/* Nhop table */
	if ((fd->nh_idx != NULL) && (fd->nh_ref_table != NULL)) {
		for (int i = 0; i < fd->number_nhops; i++) {
			if (!is_idx_free(fd, i)) {
				FD_PRINTF(LOG_DEBUG2, fd, " FREE nhop %d %p",
				    i, fd->nh_idx[i]);
				nhop_free_any(fd->nh_idx[i]);
			}
		}
		free(fd->nh_idx, M_RTABLE);
	}
	if (fd->nh_ref_table != NULL)
		free(fd->nh_ref_table, M_RTABLE);

	if (fd->fd_ss.fd_change_queue.entries != NULL)
		free(fd->fd_ss.fd_change_queue.entries, M_TEMP);

	fib_unref_algo(fd->fd_flm);

	free(fd, M_RTABLE);
}

/*
 * Epoch callback indicating fd is safe to destroy
 */
static void
destroy_fd_instance_epoch(epoch_context_t ctx)
{
	struct fib_data *fd;

	fd = __containerof(ctx, struct fib_data, fd_epoch_ctx);

	CURVNET_SET(fd->fd_vnet);
	destroy_fd_instance(fd);
	CURVNET_RESTORE();
}

/*
 * Tries to setup fd instance.
 * - Allocates fd/nhop table
 * - Runs algo:flm_init_cb algo init
 * - Subscribes fd to the rib
 * - Runs rtable dump
 * - Adds instance to the list of active instances.
 *
 * Returns: operation result. Fills in @pfd with resulting fd on success.
 *
 */
static enum flm_op_result
try_setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
    struct fib_data *old_fd, struct fib_data **pfd)
{
	struct fib_data *fd;
	size_t size;
	enum flm_op_result result;

	/* Allocate */
	fd = malloc(sizeof(struct fib_data), M_RTABLE, M_NOWAIT | M_ZERO);
	if (fd == NULL) {
		*pfd = NULL;
		RH_PRINTF(LOG_INFO, rh, "Unable to allocate fib_data structure");
		return (FLM_REBUILD);
	}
	*pfd = fd;

	estimate_nhop_scale(old_fd, fd);

	fd->fd_rh = rh;
	fd->fd_family = rh->rib_family;
	fd->fd_fibnum = rh->rib_fibnum;
	callout_init_rm(&fd->fd_callout, &rh->rib_lock, 0);
	fd->fd_vnet = curvnet;
	fd->fd_flm = flm;

	FIB_MOD_LOCK();
	flm->flm_refcount++;
	fd->fd_gen = ++fib_gen;
	FIB_MOD_UNLOCK();

	FD_PRINTF(LOG_DEBUG, fd, "allocated fd %p", fd);

	/* Allocate nhidx -> nhop_ptr table */
	size = fd->number_nhops * sizeof(void *);
	fd->nh_idx = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
	if (fd->nh_idx == NULL) {
		FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop table idx (sz:%zu)", size);
		return (FLM_REBUILD);
	}

	/* Allocate nhop index refcount table */
	size = sizeof(struct nhop_ref_table);
	size += fd->number_nhops * sizeof(uint32_t);
	fd->nh_ref_table = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
	if (fd->nh_ref_table == NULL) {
		FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop refcount table (sz:%zu)", size);
		return (FLM_REBUILD);
	}
	FD_PRINTF(LOG_DEBUG, fd, "Allocated %u nhop indexes", fd->number_nhops);

	/* Okay, we're ready for algo init */
	void *old_algo_data = (old_fd != NULL) ? old_fd->fd_algo_data : NULL;
	result = flm->flm_init_cb(fd->fd_fibnum, fd, old_algo_data, &fd->fd_algo_data);
	if (result != FLM_SUCCESS) {
		FD_PRINTF(LOG_INFO, fd, "%s algo init failed", flm->flm_name);
		return (result);
	}

	/* Try to subscribe */
	if (flm->flm_change_rib_item_cb != NULL) {
		fd->fd_rs = rib_subscribe_locked(fd->fd_rh,
		    handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE);
		if (fd->fd_rs == NULL) {
			FD_PRINTF(LOG_INFO, fd, "failed to subscribe to the rib changes");
			return (FLM_REBUILD);
		}
	}

	/* Dump */
	result = sync_algo(fd);
	if (result != FLM_SUCCESS) {
		FD_PRINTF(LOG_INFO, fd, "rib sync failed");
		return (result);
	}
	FD_PRINTF(LOG_INFO, fd, "DUMP completed successfully.");

	FIB_MOD_LOCK();
	/*
	 * Insert fd in the beginning of a list, to maintain invariant
	 * that first matching entry for the AF/fib is always the active
	 * one.
	 */
	TAILQ_INSERT_HEAD(&V_fib_data_list, fd, entries);
	fd->fd_linked = true;
	FIB_MOD_UNLOCK();

	return (FLM_SUCCESS);
}
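
/*
 * Illustrative sketch (an assumption, not a definition from this file): a
 * lookup module provides the callbacks invoked above and in
 * handle_rtable_change_cb(). A registration could look roughly like the
 * following; the "example4"/example_* names are hypothetical and the
 * authoritative structure layout lives in net/route/fib_algo.h:
 *
 *	static struct fib_lookup_module flm_example = {
 *		.flm_name = "example4",
 *		.flm_family = AF_INET,
 *		.flm_init_cb = example_init,
 *		.flm_destroy_cb = example_destroy,
 *		.flm_dump_rib_item_cb = example_dump_rib_item,
 *		.flm_dump_end_cb = example_dump_end,
 *		.flm_change_rib_item_cb = example_change_rib_item,
 *		.flm_change_rib_items_cb = example_change_rib_items,
 *		.flm_get_pref = example_get_pref,
 *	};
 *	...
 *	fib_module_register(&flm_example);
 *
 * The member names mirror the ones referenced throughout this file.
 */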

/*
 * Sets up algo @flm for table @rh and links it to the datapath.
 *
 */
static enum flm_op_result
setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
    struct fib_data *orig_fd, struct fib_data **pfd, bool attach)
{
	struct fib_data *prev_fd, *new_fd;
	enum flm_op_result result;

	NET_EPOCH_ASSERT();
	RIB_WLOCK_ASSERT(rh);

	prev_fd = orig_fd;
	new_fd = NULL;
	for (int i = 0; i < FIB_MAX_TRIES; i++) {
		result = try_setup_fd_instance(flm, rh, prev_fd, &new_fd);

		if ((result == FLM_SUCCESS) && attach) {
			if (fib_set_datapath_ptr(new_fd, &new_fd->fd_dp))
				sync_rib_gen(new_fd);
			else
				result = FLM_REBUILD;
		}

		if ((prev_fd != NULL) && (prev_fd != orig_fd)) {
			schedule_destroy_fd_instance(prev_fd, false);
			prev_fd = NULL;
		}

		RH_PRINTF(LOG_INFO, rh, "try %d: fib algo result: %s", i,
		    print_op_result(result));

		if (result == FLM_REBUILD) {
			prev_fd = new_fd;
			new_fd = NULL;
			continue;
		}

		break;
	}

	if (result != FLM_SUCCESS) {
		RH_PRINTF(LOG_WARNING, rh,
		    "%s algo instance setup failed, failures=%d", flm->flm_name,
		    orig_fd ? orig_fd->fd_failed_rebuilds + 1 : 0);
		/* update failure count */
		FIB_MOD_LOCK();
		if (orig_fd != NULL)
			orig_fd->fd_failed_rebuilds++;
		FIB_MOD_UNLOCK();

		/* Ban algo on non-recoverable error */
		if (result == FLM_ERROR)
			flm_error_add(flm, rh->rib_fibnum);

		if ((prev_fd != NULL) && (prev_fd != orig_fd))
			schedule_destroy_fd_instance(prev_fd, false);
		if (new_fd != NULL) {
			schedule_destroy_fd_instance(new_fd, false);
			new_fd = NULL;
		}
	}

	*pfd = new_fd;
	return (result);
}

/*
 * Tries to sync algo with the current rtable state, either
 * by executing batch update or rebuilding.
 * Returns true on success.
 */
static bool
execute_callout_action(struct fib_data *fd)
{
	enum fib_callout_action action = fd->fd_callout_action;
	struct fib_lookup_module *flm_new = NULL;
	bool result = true;

	NET_EPOCH_ASSERT();
	RIB_WLOCK_ASSERT(fd->fd_rh);

	fd->fd_need_rebuild = false;
	fd->fd_batch = false;
	fd->fd_num_changes = 0;

	/* First, check if we're still OK to use this algo */
	if (!is_algo_fixed(fd->fd_rh))
		flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm);
	if (flm_new != NULL)
		action = FDA_REBUILD;

	if (action == FDA_BATCH) {
		/* Try to sync */
		if (!apply_rtable_changes(fd))
			action = FDA_REBUILD;
	}

	if (action == FDA_REBUILD)
		result = rebuild_fd_flm(fd, flm_new != NULL ? flm_new : fd->fd_flm);
	if (flm_new != NULL)
		fib_unref_algo(flm_new);

	return (result);
}

/*
 * Callout for all scheduled fd-related work.
 * - Checks if the current algo is still the best algo
 * - Synchronises algo instance to the rtable (batch usecase)
 * - Creates a new instance of an algo for af/fib if desired.
 */
static void
handle_fd_callout(void *_data)
{
	struct fib_data *fd = (struct fib_data *)_data;
	struct epoch_tracker et;

	FD_PRINTF(LOG_INFO, fd, "running callout type=%d", fd->fd_callout_action);

	NET_EPOCH_ENTER(et);
	CURVNET_SET(fd->fd_vnet);
	execute_callout_action(fd);
	CURVNET_RESTORE();
	NET_EPOCH_EXIT(et);
}

/*
 * Tries to create new algo instance based on @fd data.
 * Returns true on success.
 */
static bool
rebuild_fd_flm(struct fib_data *fd, struct fib_lookup_module *flm_new)
{
	struct fib_data *fd_new, *fd_tmp = NULL;
	enum flm_op_result result;

	if (flm_new == fd->fd_flm)
		fd_tmp = fd;
	else
		FD_PRINTF(LOG_INFO, fd, "switching algo to %s", flm_new->flm_name);

	result = setup_fd_instance(flm_new, fd->fd_rh, fd_tmp, &fd_new, true);
	if (result != FLM_SUCCESS) {
		FD_PRINTF(LOG_NOTICE, fd, "table rebuild failed");
		return (false);
	}
	FD_PRINTF(LOG_INFO, fd_new, "switched to new instance");

	/* Remove old instance */
	schedule_destroy_fd_instance(fd, true);

	return (true);
}

static bool
rebuild_fd(struct fib_data *fd, const char *reason)
{
	struct fib_lookup_module *flm_new = NULL;
	bool result;

	if (!is_algo_fixed(fd->fd_rh))
		flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm);

	FD_PRINTF(LOG_INFO, fd, "running sync rebuild: %s", reason);
	result = rebuild_fd_flm(fd, flm_new != NULL ? flm_new : fd->fd_flm);
	if (flm_new != NULL)
		fib_unref_algo(flm_new);

	if (!result) {
		FD_PRINTF(LOG_ERR, fd, "sync rebuild failed");
		schedule_fd_rebuild(fd, "sync rebuild failed");
	}

	return (result);
}

/*
 * Finds algo by name/family.
 * Returns referenced algo or NULL.
 */
static struct fib_lookup_module *
fib_find_algo(const char *algo_name, int family)
{
	struct fib_lookup_module *flm;

	FIB_MOD_LOCK();
	TAILQ_FOREACH(flm, &all_algo_list, entries) {
		if ((strcmp(flm->flm_name, algo_name) == 0) &&
		    (family == flm->flm_family)) {
			flm->flm_refcount++;
			FIB_MOD_UNLOCK();
			return (flm);
		}
	}
	FIB_MOD_UNLOCK();

	return (NULL);
}

static void
fib_unref_algo(struct fib_lookup_module *flm)
{

	FIB_MOD_LOCK();
	flm->flm_refcount--;
	FIB_MOD_UNLOCK();
}

static int
set_fib_algo(uint32_t fibnum, int family, struct sysctl_oid *oidp, struct sysctl_req *req)
{
	struct fib_lookup_module *flm = NULL;
	struct fib_data *fd = NULL;
	char old_algo_name[32], algo_name[32];
	struct rib_head *rh = NULL;
	enum flm_op_result result;
	struct epoch_tracker et;
	int error;

	/* Fetch current algo/rib for af/family */
	FIB_MOD_LOCK();
	TAILQ_FOREACH(fd, &V_fib_data_list, entries) {
		if ((fd->fd_family == family) && (fd->fd_fibnum == fibnum))
			break;
	}
	if (fd == NULL) {
		FIB_MOD_UNLOCK();
		return (ENOENT);
	}
	rh = fd->fd_rh;
	strlcpy(old_algo_name, fd->fd_flm->flm_name,
	    sizeof(old_algo_name));
	FIB_MOD_UNLOCK();

	strlcpy(algo_name, old_algo_name, sizeof(algo_name));
	error = sysctl_handle_string(oidp, algo_name, sizeof(algo_name), req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	if (strcmp(algo_name, old_algo_name) == 0)
		return (0);

	/* New algorithm name is different */
	flm = fib_find_algo(algo_name, family);
	if (flm == NULL) {
		RH_PRINTF(LOG_INFO, rh, "unable to find algo %s", algo_name);
		return (ESRCH);
	}

	fd = NULL;
	NET_EPOCH_ENTER(et);
	RIB_WLOCK(rh);
	result = setup_fd_instance(flm, rh, NULL, &fd, true);
	RIB_WUNLOCK(rh);
	NET_EPOCH_EXIT(et);
	fib_unref_algo(flm);
	if (result != FLM_SUCCESS)
		return (EINVAL);

	/* Disable automated jumping between algos */
	FIB_MOD_LOCK();
	set_algo_fixed(rh);
	FIB_MOD_UNLOCK();
	/* Remove old instance(s) */
	fib_cleanup_algo(rh, true, false);

	/* Drain cb so user can unload the module after userret if so desired */
	NET_EPOCH_DRAIN_CALLBACKS();

	return (0);
}

#ifdef INET
static int
set_algo_inet_sysctl_handler(SYSCTL_HANDLER_ARGS)
{

	return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET, oidp, req));
}
SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo,
    CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
    set_algo_inet_sysctl_handler, "A", "Set IPv4 lookup algo");
#endif

#ifdef INET6
static int
set_algo_inet6_sysctl_handler(SYSCTL_HANDLER_ARGS)
{

	return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET6, oidp, req));
}
SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo,
    CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
    set_algo_inet6_sysctl_handler, "A", "Set IPv6 lookup algo");
#endif

static struct nhop_object *
dummy_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid)
{
	return (NULL);
}

static void
destroy_fdh_epoch(epoch_context_t ctx)
{
	struct fib_dp_header *fdh;

	fdh = __containerof(ctx, struct fib_dp_header, fdh_epoch_ctx);
	free(fdh, M_RTABLE);
}

static struct fib_dp_header *
alloc_fib_dp_array(uint32_t num_tables, bool waitok)
{
	size_t sz;
	struct fib_dp_header *fdh;

	sz = sizeof(struct fib_dp_header);
	sz += sizeof(struct fib_dp) * num_tables;
	fdh = malloc(sz, M_RTABLE, (waitok ? M_WAITOK : M_NOWAIT) | M_ZERO);
	if (fdh != NULL) {
		fdh->fdh_num_tables = num_tables;
		/*
		 * Set dummy lookup function ptr always returning NULL, so
		 * we can delay algo init.
		 */
		for (uint32_t i = 0; i < num_tables; i++)
			fdh->fdh_idx[i].f = dummy_lookup;
	}
	return (fdh);
}

static struct fib_dp_header *
get_fib_dp_header(struct fib_dp *dp)
{

	return (__containerof((void *)dp, struct fib_dp_header, fdh_idx));
}
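
/*
 * Illustrative note on the update protocol below: dataplane readers access
 * the per-family fib_dp array without taking FIB_MOD_LOCK(); they rely only
 * on running inside a network epoch section, e.g. (sketch, assuming an
 * epoch-protected caller):
 *
 *	NET_EPOCH_ENTER(et);
 *	dp = &V_inet_dp[fibnum];	// snapshot of the current array
 *	nh = dp->f(dp->arg, key, scopeid);
 *	NET_EPOCH_EXIT(et);
 *
 * Writers therefore never modify a published array in place (except for the
 * arg-only fast path in replace_rtables_family()); they build a new
 * fib_dp_header, switch the pointer and free the old header via
 * fib_epoch_call() once readers from the previous epoch are gone.
 */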

/*
 * Replace per-family index pool @pdp with a new one which
 * contains updated callback/algo data from @fd.
 * Returns true on success.
 */
static bool
replace_rtables_family(struct fib_dp **pdp, struct fib_data *fd, struct fib_dp *dp)
{
	struct fib_dp_header *new_fdh, *old_fdh;

	NET_EPOCH_ASSERT();

	FD_PRINTF(LOG_DEBUG, fd, "[vnet %p] replace with f:%p arg:%p",
	    curvnet, dp->f, dp->arg);

	FIB_MOD_LOCK();
	old_fdh = get_fib_dp_header(*pdp);

	if (old_fdh->fdh_idx[fd->fd_fibnum].f == dp->f) {
		/*
		 * Function is the same, data pointer needs update.
		 * Perform in-line replace without reallocation.
		 */
		old_fdh->fdh_idx[fd->fd_fibnum].arg = dp->arg;
		FD_PRINTF(LOG_DEBUG, fd, "FDH %p inline update", old_fdh);
		FIB_MOD_UNLOCK();
		return (true);
	}

	new_fdh = alloc_fib_dp_array(old_fdh->fdh_num_tables, false);
	FD_PRINTF(LOG_DEBUG, fd, "OLD FDH: %p NEW FDH: %p", old_fdh, new_fdh);
	if (new_fdh == NULL) {
		FIB_MOD_UNLOCK();
		FD_PRINTF(LOG_WARNING, fd, "error attaching datapath");
		return (false);
	}

	memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0],
	    old_fdh->fdh_num_tables * sizeof(struct fib_dp));
	/* Update relevant data structure for @fd */
	new_fdh->fdh_idx[fd->fd_fibnum] = *dp;

	/* Ensure memcpy() writes have completed */
	atomic_thread_fence_rel();
	/* Set new datapath pointer */
	*pdp = &new_fdh->fdh_idx[0];
	FIB_MOD_UNLOCK();
	FD_PRINTF(LOG_DEBUG, fd, "update %p -> %p", old_fdh, new_fdh);

	fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx);

	return (true);
}

static struct fib_dp **
get_family_dp_ptr(int family)
{
	switch (family) {
#ifdef INET
	case AF_INET:
		return (&V_inet_dp);
#endif
#ifdef INET6
	case AF_INET6:
		return (&V_inet6_dp);
#endif
	}
	return (NULL);
}

/*
 * Make datapath use fib instance @fd
 */
bool
fib_set_datapath_ptr(struct fib_data *fd, struct fib_dp *dp)
{
	struct fib_dp **pdp;

	pdp = get_family_dp_ptr(fd->fd_family);
	return (replace_rtables_family(pdp, fd, dp));
}

/*
 * Grow datapath pointers array.
 * Called from sysctl handler on growing number of routing tables.
 */
static void
grow_rtables_family(struct fib_dp **pdp, uint32_t new_num_tables)
{
	struct fib_dp_header *new_fdh, *old_fdh = NULL;

	new_fdh = alloc_fib_dp_array(new_num_tables, true);

	FIB_MOD_LOCK();
	if (*pdp != NULL) {
		old_fdh = get_fib_dp_header(*pdp);
		memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0],
		    old_fdh->fdh_num_tables * sizeof(struct fib_dp));
	}

	/* Wait till all writes completed */
	atomic_thread_fence_rel();

	*pdp = &new_fdh->fdh_idx[0];
	FIB_MOD_UNLOCK();

	if (old_fdh != NULL)
		fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx);
}

/*
 * Grows per-AF arrays of datapath pointers for each supported family.
 * Called from fibs resize sysctl handler.
 */
void
fib_grow_rtables(uint32_t new_num_tables)
{

#ifdef INET
	grow_rtables_family(get_family_dp_ptr(AF_INET), new_num_tables);
#endif
#ifdef INET6
	grow_rtables_family(get_family_dp_ptr(AF_INET6), new_num_tables);
#endif
}

void
fib_get_rtable_info(struct rib_head *rh, struct rib_rtable_info *rinfo)
{

	bzero(rinfo, sizeof(struct rib_rtable_info));
	rinfo->num_prefixes = rh->rnh_prefixes;
	rinfo->num_nhops = nhops_get_count(rh);
#ifdef ROUTE_MPATH
	rinfo->num_nhgrp = nhgrp_get_count(rh);
#endif
}

/*
 * Updates pointer to the algo data for the @fd.
 */
void
fib_set_algo_ptr(struct fib_data *fd, void *algo_data)
{
	RIB_WLOCK_ASSERT(fd->fd_rh);

	fd->fd_algo_data = algo_data;
}

/*
 * Calls @callback with @ctx after the end of a current epoch.
 */
void
fib_epoch_call(epoch_callback_t callback, epoch_context_t ctx)
{
	NET_EPOCH_CALL(callback, ctx);
}

/*
 * Accessor to get rib instance @fd is attached to.
 */
struct rib_head *
fib_get_rh(struct fib_data *fd)
{

	return (fd->fd_rh);
}

/*
 * Accessor to export idx->nhop array
 */
struct nhop_object **
fib_get_nhop_array(struct fib_data *fd)
{

	return (fd->nh_idx);
}

static uint32_t
get_nhop_idx(struct nhop_object *nh)
{
#ifdef ROUTE_MPATH
	if (NH_IS_NHGRP(nh))
		return (nhgrp_get_idx((struct nhgrp_object *)nh));
	else
#endif
		return (nhop_get_idx(nh));
}

uint32_t
fib_get_nhop_idx(struct fib_data *fd, struct nhop_object *nh)
{

	return (get_nhop_idx(nh));
}
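
/*
 * Illustrative sketch of how an algorithm typically consumes the two
 * accessors above (an assumption about algo-side usage, not framework
 * code): the algo stores the compact index returned by fib_get_nhop_idx()
 * in its lookup structure and converts it back to a pointer in the
 * dataplane via the exported array:
 *
 *	uint32_t idx = fib_get_nhop_idx(fd, nh);	// control path
 *	...
 *	struct nhop_object **nh_array = fib_get_nhop_array(fd);
 *	return (nh_array[idx]);				// lookup path
 *
 * fib_ref_nhop()/fib_unref_nhop() below keep every indexed nexthop
 * referenced, so such a lookup result stays valid within the epoch.
 */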

static bool
is_idx_free(struct fib_data *fd, uint32_t index)
{

	return (fd->nh_ref_table->refcnt[index] == 0);
}

static uint32_t
fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh)
{
	uint32_t idx = get_nhop_idx(nh);

	if (idx >= fd->number_nhops) {
		fd->hit_nhops = 1;
		return (0);
	}

	if (is_idx_free(fd, idx)) {
		nhop_ref_any(nh);
		fd->nh_idx[idx] = nh;
		fd->nh_ref_table->count++;
		FD_PRINTF(LOG_DEBUG2, fd, " REF nhop %u %p", idx, fd->nh_idx[idx]);
	}
	fd->nh_ref_table->refcnt[idx]++;

	return (idx);
}

struct nhop_release_data {
	struct nhop_object	*nh;
	struct epoch_context	ctx;
};

static void
release_nhop_epoch(epoch_context_t ctx)
{
	struct nhop_release_data *nrd;

	nrd = __containerof(ctx, struct nhop_release_data, ctx);
	nhop_free_any(nrd->nh);
	free(nrd, M_TEMP);
}

/*
 * Delays nexthop refcount release.
 * Datapath may not have its datastructures updated yet, so the old
 * nexthop may still be returned till the end of current epoch. Delay
 * refcount removal, as we may be removing the last instance, which will
 * trigger nexthop deletion, rendering returned nexthop invalid.
 */
static void
fib_schedule_release_nhop(struct fib_data *fd, struct nhop_object *nh)
{
	struct nhop_release_data *nrd;

	nrd = malloc(sizeof(struct nhop_release_data), M_TEMP, M_NOWAIT | M_ZERO);
	if (nrd != NULL) {
		nrd->nh = nh;
		fib_epoch_call(release_nhop_epoch, &nrd->ctx);
	} else {
		/*
		 * Unable to allocate memory. Leak nexthop to maintain guarantee
		 * that each nhop can be referenced.
		 */
		FD_PRINTF(LOG_ERR, fd, "unable to schedule nhop %p deletion", nh);
	}
}

static void
fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh)
{
	uint32_t idx = get_nhop_idx(nh);

	KASSERT((idx < fd->number_nhops), ("invalid nhop index"));
	KASSERT((nh == fd->nh_idx[idx]), ("index table contains wrong nh"));

	fd->nh_ref_table->refcnt[idx]--;
	if (fd->nh_ref_table->refcnt[idx] == 0) {
		FD_PRINTF(LOG_DEBUG, fd, " FREE nhop %d %p", idx, fd->nh_idx[idx]);
		fib_schedule_release_nhop(fd, fd->nh_idx[idx]);
	}
}

static void
set_algo_fixed(struct rib_head *rh)
{
	switch (rh->rib_family) {
#ifdef INET
	case AF_INET:
		V_algo_fixed_inet = true;
		break;
#endif
#ifdef INET6
	case AF_INET6:
		V_algo_fixed_inet6 = true;
		break;
#endif
	}
}

static bool
is_algo_fixed(struct rib_head *rh)
{

	switch (rh->rib_family) {
#ifdef INET
	case AF_INET:
		return (V_algo_fixed_inet);
#endif
#ifdef INET6
	case AF_INET6:
		return (V_algo_fixed_inet6);
#endif
	}
	return (false);
}

/*
 * Runs the check on what would be the best algo for rib @rh, assuming
 * that the current algo is the one specified by @orig_flm. Note that
 * it can be NULL for initial selection.
 *
 * Returns referenced new algo or NULL if the current one is the best.
 */
static struct fib_lookup_module *
fib_check_best_algo(struct rib_head *rh, struct fib_lookup_module *orig_flm)
{
	uint8_t preference, curr_preference = 0, best_preference = 0;
	struct fib_lookup_module *flm, *best_flm = NULL;
	struct rib_rtable_info rinfo;
	int candidate_algos = 0;

	fib_get_rtable_info(rh, &rinfo);

	FIB_MOD_LOCK();
	TAILQ_FOREACH(flm, &all_algo_list, entries) {
		if (flm->flm_family != rh->rib_family)
			continue;
		candidate_algos++;
		preference = flm->flm_get_pref(&rinfo);
		if (preference > best_preference) {
			if (!flm_error_check(flm, rh->rib_fibnum)) {
				best_preference = preference;
				best_flm = flm;
			}
		}
		if (flm == orig_flm)
			curr_preference = preference;
	}
	if ((best_flm != NULL) && (curr_preference + BEST_DIFF_PERCENT < best_preference))
		best_flm->flm_refcount++;
	else
		best_flm = NULL;
	FIB_MOD_UNLOCK();

	RH_PRINTF(LOG_DEBUG, rh, "candidate_algos: %d, curr: %s(%d) result: %s(%d)",
	    candidate_algos, orig_flm ? orig_flm->flm_name : "NULL", curr_preference,
	    best_flm ? best_flm->flm_name : (orig_flm ? orig_flm->flm_name : "NULL"),
	    best_preference);

	return (best_flm);
}

/*
 * Called when new route table is created.
 * Selects, allocates and attaches fib algo for the table.
 */
static bool
fib_select_algo_initial(struct rib_head *rh, struct fib_dp *dp)
{
	struct fib_lookup_module *flm;
	struct fib_data *fd = NULL;
	enum flm_op_result result;
	struct epoch_tracker et;

	flm = fib_check_best_algo(rh, NULL);
	if (flm == NULL) {
		RH_PRINTF(LOG_CRIT, rh, "no algo selected");
		return (false);
	}
	RH_PRINTF(LOG_INFO, rh, "selected algo %s", flm->flm_name);

	NET_EPOCH_ENTER(et);
	RIB_WLOCK(rh);
	result = setup_fd_instance(flm, rh, NULL, &fd, false);
	RIB_WUNLOCK(rh);
	NET_EPOCH_EXIT(et);

	RH_PRINTF(LOG_DEBUG, rh, "result=%d fd=%p", result, fd);
	if (result == FLM_SUCCESS)
		*dp = fd->fd_dp;
	else
		RH_PRINTF(LOG_CRIT, rh, "unable to setup algo %s", flm->flm_name);

	fib_unref_algo(flm);

	return (result == FLM_SUCCESS);
}

/*
 * Sets up fib algo instances for the non-initialized RIBs in the @family.
 * Allocates temporary datapath index to amortize datapath index updates
 * with large @num_tables.
 */
void
fib_setup_family(int family, uint32_t num_tables)
{
	struct fib_dp_header *new_fdh = alloc_fib_dp_array(num_tables, false);
	if (new_fdh == NULL) {
		ALGO_PRINTF(LOG_CRIT, "Unable to setup framework for %s", print_family(family));
		return;
	}

	for (int i = 0; i < num_tables; i++) {
		struct rib_head *rh = rt_tables_get_rnh(i, family);
		if (rh->rib_algo_init)
			continue;
		if (!fib_select_algo_initial(rh, &new_fdh->fdh_idx[i]))
			continue;

		rh->rib_algo_init = true;
	}

	FIB_MOD_LOCK();
	struct fib_dp **pdp = get_family_dp_ptr(family);
	struct fib_dp_header *old_fdh = get_fib_dp_header(*pdp);

	/* Update the items not touched by the new init, from the old data pointer */
	for (int i = 0; i < num_tables; i++) {
		if (new_fdh->fdh_idx[i].f == dummy_lookup)
			new_fdh->fdh_idx[i] = old_fdh->fdh_idx[i];
	}

	/* Ensure all index writes have completed */
	atomic_thread_fence_rel();
	/* Set new datapath pointer */
	*pdp = &new_fdh->fdh_idx[0];

	FIB_MOD_UNLOCK();

	fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx);
}

/*
 * Registers fib lookup module within the subsystem.
 */
int
fib_module_register(struct fib_lookup_module *flm)
{

	FIB_MOD_LOCK();
	ALGO_PRINTF(LOG_INFO, "attaching %s to %s", flm->flm_name,
	    print_family(flm->flm_family));
	TAILQ_INSERT_TAIL(&all_algo_list, flm, entries);
	FIB_MOD_UNLOCK();

	return (0);
}

/*
 * Tries to unregister fib lookup module.
 *
 * Returns 0 on success, EBUSY if module is still used
 * by some of the tables.
 */
int
fib_module_unregister(struct fib_lookup_module *flm)
{

	FIB_MOD_LOCK();
	if (flm->flm_refcount > 0) {
		FIB_MOD_UNLOCK();
		return (EBUSY);
	}
	fib_error_clear_flm(flm);
	ALGO_PRINTF(LOG_INFO, "detaching %s from %s", flm->flm_name,
	    print_family(flm->flm_family));
	TAILQ_REMOVE(&all_algo_list, flm, entries);
	FIB_MOD_UNLOCK();

	return (0);
}

void
vnet_fib_init(void)
{

	TAILQ_INIT(&V_fib_data_list);
}

void
vnet_fib_destroy(void)
{

	FIB_MOD_LOCK();
	fib_error_clear();
	FIB_MOD_UNLOCK();
}