/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2020 Alexander V. Chernikov
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_route.h"

#include <sys/param.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/sbuf.h>
#include <sys/lock.h>
#include <sys/rmlock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/kernel.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/queue.h>
#include <net/vnet.h>

#include <net/if.h>
#include <net/if_var.h>

#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#endif

#include <net/route.h>
#include <net/route/nhop.h>
#include <net/route/route_ctl.h>
#include <net/route/route_var.h>
#include <net/route/fib_algo.h>

#include <machine/stdarg.h>

/*
 * Fib lookup framework.
 *
 * This framework enables accelerated longest-prefix-match lookups for the
 * routing tables by adding the ability to dynamically attach/detach lookup
 * algorithm implementations to/from the datapath.
 *
 * flm - fib lookup modules - implementation of particular lookup algorithm
 * fd - fib data - instance of an flm bound to specific routing table
 *
 * This file provides main framework functionality.
 *
 * The following are the features provided by the framework
 *
 * 1) nexthops abstraction -> provides transparent referencing, indexing
 *   and efficient idx->ptr mappings for nexthops and nexthop groups.
 * 2) Routing table synchronisation
 * 3) dataplane attachment points
 * 4) automatic algorithm selection based on the provided preference.
 *
 *
 * DATAPATH
 * For each supported address family, there is an allocated array of fib_dp
 * structures, indexed by fib number. Each array entry contains a callback
 * function and its argument. This function will be called with a
 * family-specific lookup key, scope and the provided argument. This array
 * gets re-created every time a new algo instance is created. Please take a
 * look at the replace_rtables_family() function for more details.
 *
 */
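/*
 * Illustrative sketch (not part of this file): a family-specific lookup
 * wrapper is expected to dispatch through the per-fib fib_dp slot roughly
 * as follows, where `key` stands for the family-specific lookup key and
 * `V_inet_dp` matches the pointer used by get_family_dp_ptr() below.
 *
 *	struct fib_dp *dp = &V_inet_dp[fibnum];
 *	struct nhop_object *nh = dp->f(dp->arg, key, scopeid);
 *
 * The array itself is swapped as a whole under an epoch (see
 * replace_rtables_family()), so lookups never observe a partially
 * updated table.
 */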
SYSCTL_DECL(_net_route);
SYSCTL_NODE(_net_route, OID_AUTO, algo, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Fib algorithm lookups");

/* Algorithm sync policy */

/* Time interval to bucket updates */
VNET_DEFINE(unsigned int, bucket_time_ms) = 50;
#define	V_bucket_time_ms	VNET(bucket_time_ms)
SYSCTL_UINT(_net_route_algo, OID_AUTO, bucket_time_ms, CTLFLAG_RW | CTLFLAG_VNET,
    &VNET_NAME(bucket_time_ms), 0, "Time interval to calculate update rate");

/* Minimum update rate to delay sync */
VNET_DEFINE(unsigned int, bucket_change_threshold_rate) = 500;
#define	V_bucket_change_threshold_rate	VNET(bucket_change_threshold_rate)
SYSCTL_UINT(_net_route_algo, OID_AUTO, bucket_change_threshold_rate, CTLFLAG_RW | CTLFLAG_VNET,
    &VNET_NAME(bucket_change_threshold_rate), 0, "Minimum update rate to delay sync");

/* Max allowed delay to sync */
VNET_DEFINE(unsigned int, fib_max_sync_delay_ms) = 1000;
#define	V_fib_max_sync_delay_ms	VNET(fib_max_sync_delay_ms)
SYSCTL_UINT(_net_route_algo, OID_AUTO, fib_max_sync_delay_ms, CTLFLAG_RW | CTLFLAG_VNET,
    &VNET_NAME(fib_max_sync_delay_ms), 0, "Maximum time to delay sync (ms)");


#ifdef INET6
VNET_DEFINE_STATIC(bool, algo_fixed_inet6) = false;
#define	V_algo_fixed_inet6	VNET(algo_fixed_inet6)
SYSCTL_NODE(_net_route_algo, OID_AUTO, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IPv6 longest prefix match lookups");
#endif
#ifdef INET
VNET_DEFINE_STATIC(bool, algo_fixed_inet) = false;
#define	V_algo_fixed_inet	VNET(algo_fixed_inet)
SYSCTL_NODE(_net_route_algo, OID_AUTO, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IPv4 longest prefix match lookups");
#endif

/* Fib instance counter */
static uint32_t fib_gen = 0;

struct nhop_ref_table {
	uint32_t		count;
	int32_t			refcnt[0];
};

enum fib_callout_action {
	FDA_NONE,	/* No callout scheduled */
	FDA_REBUILD,	/* Asks to rebuild algo instance */
	FDA_EVAL,	/* Asks to evaluate if the current algo is still the best */
	FDA_BATCH,	/* Asks to submit batch of updates to the algo */
};

struct fib_sync_status {
	struct timeval		diverge_time;	/* ts when diverged */
	uint32_t		num_changes;	/* number of changes since sync */
	uint32_t		bucket_changes;	/* num changes within the current bucket */
	uint64_t		bucket_id;	/* 50ms bucket # */
	struct fib_change_queue	fd_change_queue;/* list of scheduled entries */
};

/*
 * Data structure for the fib lookup instance tied to the particular rib.
 */
struct fib_data {
	uint32_t		number_nhops;	/* current # of nhops */
	uint8_t			hit_nhops;	/* true if out of nhop limit */
	uint8_t			init_done;	/* true if init is completed */
	uint32_t		fd_dead:1;	/* Scheduled for deletion */
	uint32_t		fd_linked:1;	/* true if linked */
	uint32_t		fd_need_rebuild:1;	/* true if rebuild scheduled */
	uint32_t		fd_batch:1;	/* true if batched notification scheduled */
	uint8_t			fd_family;	/* family */
	uint32_t		fd_fibnum;	/* fibnum */
	uint32_t		fd_failed_rebuilds;	/* stat: failed rebuilds */
	uint32_t		fd_gen;		/* instance gen# */
	struct callout		fd_callout;	/* rebuild callout */
	enum fib_callout_action	fd_callout_action;	/* Callout action to take */
	void			*fd_algo_data;	/* algorithm data */
	struct nhop_object	**nh_idx;	/* nhop idx->ptr array */
	struct nhop_ref_table	*nh_ref_table;	/* array with # of nhop references */
	struct rib_head		*fd_rh;		/* RIB table we're attached to */
	struct rib_subscription	*fd_rs;		/* storing table subscription */
	struct fib_dp		fd_dp;		/* fib datapath data */
	struct vnet		*fd_vnet;	/* vnet fib belongs to */
	struct epoch_context	fd_epoch_ctx;	/* epoch context for deletion */
	struct fib_lookup_module	*fd_flm;/* pointer to the lookup module */
	struct fib_sync_status	fd_ss;		/* State relevant to the rib sync */
	uint32_t		fd_num_changes;	/* number of changes since last callout */
	TAILQ_ENTRY(fib_data)	entries;	/* list of all fds in vnet */
};

static bool rebuild_fd(struct fib_data *fd, const char *reason);
static bool rebuild_fd_flm(struct fib_data *fd, struct fib_lookup_module *flm_new);
static void handle_fd_callout(void *_data);
static void destroy_fd_instance_epoch(epoch_context_t ctx);
static bool is_idx_free(struct fib_data *fd, uint32_t index);
static void set_algo_fixed(struct rib_head *rh);
static bool is_algo_fixed(struct rib_head *rh);

static uint32_t fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh);
static void fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh);

static struct fib_lookup_module *fib_check_best_algo(struct rib_head *rh,
    struct fib_lookup_module *orig_flm);
static void fib_unref_algo(struct fib_lookup_module *flm);
static bool flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum);

struct mtx fib_mtx;
#define	FIB_MOD_LOCK()		mtx_lock(&fib_mtx)
#define	FIB_MOD_UNLOCK()	mtx_unlock(&fib_mtx)
#define	FIB_MOD_LOCK_ASSERT()	mtx_assert(&fib_mtx, MA_OWNED)

MTX_SYSINIT(fib_mtx, &fib_mtx, "algo list mutex", MTX_DEF);

/* Algorithm has to be this percent better than the current to switch */
#define	BEST_DIFF_PERCENT	(5 * 256 / 100)
/* Schedule algo re-evaluation X seconds after a change */
#define	ALGO_EVAL_DELAY_MS	30000
/* Force algo re-evaluation after X changes */
#define	ALGO_EVAL_NUM_ROUTES	100
/* Try to setup algorithm X times */
#define	FIB_MAX_TRIES		32
/* Max amount of supported nexthops */
#define	FIB_MAX_NHOPS		262144
#define	FIB_CALLOUT_DELAY_MS	50


/* Debug */
static int flm_debug_level = LOG_NOTICE;
SYSCTL_INT(_net_route_algo, OID_AUTO, debug_level, CTLFLAG_RW | CTLFLAG_RWTUN,
    &flm_debug_level, 0, "debuglevel");
#define	FLM_MAX_DEBUG_LEVEL	LOG_DEBUG
#ifndef	LOG_DEBUG2
#define	LOG_DEBUG2	8
#endif

#define	_PASS_MSG(_l)	(flm_debug_level >= (_l))
#define	ALGO_PRINTF(_fmt, ...)	printf("[fib_algo] %s: " _fmt "\n", __func__, ##__VA_ARGS__)
#define	_ALGO_PRINTF(_fib, _fam, _aname, _gen, _func, _fmt, ...) \
    printf("[fib_algo] %s.%u (%s#%u) %s: " _fmt "\n",\
    print_family(_fam), _fib, _aname, _gen, _func, ## __VA_ARGS__)
#define	_RH_PRINTF(_fib, _fam, _func, _fmt, ...) \
    printf("[fib_algo] %s.%u %s: " _fmt "\n", print_family(_fam), _fib, _func, ## __VA_ARGS__)
#define	RH_PRINTF(_l, _rh, _fmt, ...)	if (_PASS_MSG(_l)) {	\
    _RH_PRINTF(_rh->rib_fibnum, _rh->rib_family, __func__, _fmt, ## __VA_ARGS__);\
}
#define	FD_PRINTF(_l, _fd, _fmt, ...)	FD_PRINTF_##_l(_l, _fd, _fmt, ## __VA_ARGS__)
#define	_FD_PRINTF(_l, _fd, _fmt, ...)	if (_PASS_MSG(_l)) {		\
    _ALGO_PRINTF(_fd->fd_fibnum, _fd->fd_family, _fd->fd_flm->flm_name,	\
	_fd->fd_gen, __func__, _fmt, ## __VA_ARGS__);			\
}
#if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG2
#define	FD_PRINTF_LOG_DEBUG2	_FD_PRINTF
#else
#define	FD_PRINTF_LOG_DEBUG2(_l, _fd, _fmt, ...)
#endif
#if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG
#define	FD_PRINTF_LOG_DEBUG	_FD_PRINTF
#else
#define	FD_PRINTF_LOG_DEBUG()
#endif
#if FLM_MAX_DEBUG_LEVEL>=LOG_INFO
#define	FD_PRINTF_LOG_INFO	_FD_PRINTF
#else
#define	FD_PRINTF_LOG_INFO()
#endif
#define	FD_PRINTF_LOG_NOTICE	_FD_PRINTF
#define	FD_PRINTF_LOG_ERR	_FD_PRINTF
#define	FD_PRINTF_LOG_WARNING	_FD_PRINTF


/* List of all registered lookup algorithms */
static TAILQ_HEAD(, fib_lookup_module) all_algo_list = TAILQ_HEAD_INITIALIZER(all_algo_list);

/* List of all fib lookup instances in the vnet */
VNET_DEFINE_STATIC(TAILQ_HEAD(fib_data_head, fib_data), fib_data_list);
#define	V_fib_data_list	VNET(fib_data_list)

/* Datastructure for storing non-transient fib lookup module failures */
struct fib_error {
	int				fe_family;
	uint32_t			fe_fibnum;	/* failed rtable */
	struct fib_lookup_module	*fe_flm;	/* failed module */
	TAILQ_ENTRY(fib_error)		entries;/* list of all errored entries */
};
VNET_DEFINE_STATIC(TAILQ_HEAD(fib_error_head, fib_error), fib_error_list);
#define	V_fib_error_list VNET(fib_error_list)

/* Per-family array of fibnum -> {func, arg} mappings used in datapath */
struct fib_dp_header {
	struct epoch_context	fdh_epoch_ctx;
	uint32_t		fdh_num_tables;
	struct fib_dp		fdh_idx[0];
};

/*
 * Tries to add new non-transient algorithm error to the list of
 * errors.
 * Returns true on success.
 */
static bool
flm_error_add(struct fib_lookup_module *flm, uint32_t fibnum)
{
	struct fib_error *fe;

	fe = malloc(sizeof(struct fib_error), M_TEMP, M_NOWAIT | M_ZERO);
	if (fe == NULL)
		return (false);
	fe->fe_flm = flm;
	fe->fe_family = flm->flm_family;
	fe->fe_fibnum = fibnum;

	FIB_MOD_LOCK();
	/* Avoid duplicates by checking if error already exists first */
	if (flm_error_check(flm, fibnum)) {
		FIB_MOD_UNLOCK();
		free(fe, M_TEMP);
		return (true);
	}
	TAILQ_INSERT_HEAD(&V_fib_error_list, fe, entries);
	FIB_MOD_UNLOCK();

	return (true);
}

/*
 * True if non-transient error has been registered for @flm in @fibnum.
 */
static bool
flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum)
{
	const struct fib_error *fe;

	TAILQ_FOREACH(fe, &V_fib_error_list, entries) {
		if ((fe->fe_flm == flm) && (fe->fe_fibnum == fibnum))
			return (true);
	}

	return (false);
}

/*
 * Clear all errors of algo specified by @flm.
 */
static void
fib_error_clear_flm(struct fib_lookup_module *flm)
{
	struct fib_error *fe, *fe_tmp;

	FIB_MOD_LOCK_ASSERT();

	TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
		if (fe->fe_flm == flm) {
			TAILQ_REMOVE(&V_fib_error_list, fe, entries);
			free(fe, M_TEMP);
		}
	}
}

/*
 * Clears all errors in current VNET.
 */
static void
fib_error_clear()
{
	struct fib_error *fe, *fe_tmp;

	FIB_MOD_LOCK_ASSERT();

	TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
		TAILQ_REMOVE(&V_fib_error_list, fe, entries);
		free(fe, M_TEMP);
	}
}

static const char *
print_op_result(enum flm_op_result result)
{
	switch (result) {
	case FLM_SUCCESS:
		return "success";
	case FLM_REBUILD:
		return "rebuild";
	case FLM_BATCH:
		return "batch";
	case FLM_ERROR:
		return "error";
	}

	return "unknown";
}

static const char *
print_family(int family)
{

	if (family == AF_INET)
		return ("inet");
	else if (family == AF_INET6)
		return ("inet6");
	else
		return ("unknown");
}

/*
 * Debug function used by lookup algorithms.
 * Outputs message denoted by @fmt, prepended by "[fib_algo] inetX.Y (algo) "
 */
void
fib_printf(int level, struct fib_data *fd, const char *func, char *fmt, ...)
{
	char buf[128];
	va_list ap;

	if (level > flm_debug_level)
		return;

	va_start(ap, fmt);
	vsnprintf(buf, sizeof(buf), fmt, ap);
	va_end(ap);

	_ALGO_PRINTF(fd->fd_fibnum, fd->fd_family, fd->fd_flm->flm_name,
	    fd->fd_gen, func, "%s", buf);
}

/*
 * Outputs list of algorithms supported by the provided address family.
 */
static int
print_algos_sysctl(struct sysctl_req *req, int family)
{
	struct fib_lookup_module *flm;
	struct sbuf sbuf;
	int error, count = 0;

	error = sysctl_wire_old_buffer(req, 0);
	if (error == 0) {
		sbuf_new_for_sysctl(&sbuf, NULL, 512, req);
		TAILQ_FOREACH(flm, &all_algo_list, entries) {
			if (flm->flm_family == family) {
				if (count++ > 0)
					sbuf_cat(&sbuf, ", ");
				sbuf_cat(&sbuf, flm->flm_name);
			}
		}
		error = sbuf_finish(&sbuf);
		sbuf_delete(&sbuf);
	}
	return (error);
}

#ifdef INET6
static int
print_algos_sysctl_inet6(SYSCTL_HANDLER_ARGS)
{

	return (print_algos_sysctl(req, AF_INET6));
}
SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo_list,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    print_algos_sysctl_inet6, "A", "List of IPv6 lookup algorithms");
#endif

#ifdef INET
static int
print_algos_sysctl_inet(SYSCTL_HANDLER_ARGS)
{

	return (print_algos_sysctl(req, AF_INET));
}
SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo_list,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    print_algos_sysctl_inet, "A", "List of IPv4 lookup algorithms");
#endif

/*
 * Calculate delay between repeated failures.
 * Returns current delay in milliseconds.
 */
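/*
 * Example with the defaults in this file: the delay doubles with every
 * failed rebuild, starting at FIB_CALLOUT_DELAY_MS (50 ms) for zero
 * failures, 100 ms after one failure, and capped at 10 doublings
 * (1024 * 50 ms = 51.2 s).
 */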
static uint32_t
callout_calc_delay_ms(struct fib_data *fd)
{
	uint32_t shift;

	if (fd->fd_failed_rebuilds > 10)
		shift = 10;
	else
		shift = fd->fd_failed_rebuilds;

	return ((1 << shift) * FIB_CALLOUT_DELAY_MS);
}

static void
schedule_callout(struct fib_data *fd, enum fib_callout_action action, int delay_ms)
{

	FD_PRINTF(LOG_DEBUG, fd, "delay=%d action=%d", delay_ms, action);
	fd->fd_callout_action = action;
	callout_reset_sbt(&fd->fd_callout, SBT_1MS * delay_ms, 0,
	    handle_fd_callout, fd, 0);
}

static void
schedule_fd_rebuild(struct fib_data *fd, const char *reason)
{

	RIB_WLOCK_ASSERT(fd->fd_rh);

	if (!fd->fd_need_rebuild) {
		fd->fd_need_rebuild = true;
		/* Stop batch updates */
		fd->fd_batch = false;

		/*
		 * Potentially re-schedules pending callout
		 * initiated by schedule_algo_eval.
		 */
		FD_PRINTF(LOG_INFO, fd, "Scheduling rebuild: %s (failures=%d)",
		    reason, fd->fd_failed_rebuilds);
		schedule_callout(fd, FDA_REBUILD, callout_calc_delay_ms(fd));
	}
}

static void
sync_rib_gen(struct fib_data *fd)
{
	FD_PRINTF(LOG_DEBUG, fd, "Sync gen %u -> %u", fd->fd_rh->rnh_gen, fd->fd_rh->rnh_gen_rib);
	fd->fd_rh->rnh_gen = fd->fd_rh->rnh_gen_rib;
}

static int64_t
get_tv_diff_ms(const struct timeval *old_tv, const struct timeval *new_tv)
{
	int64_t diff = 0;

	diff = ((int64_t)(new_tv->tv_sec - old_tv->tv_sec)) * 1000;
	diff += (new_tv->tv_usec - old_tv->tv_usec) / 1000;

	return (diff);
}

static void
add_tv_diff_ms(struct timeval *tv, int ms)
{
	tv->tv_sec += ms / 1000;
	ms = ms % 1000;
	if (ms * 1000 + tv->tv_usec < 1000000)
		tv->tv_usec += ms * 1000;
	else {
		tv->tv_sec += 1;
		tv->tv_usec = ms * 1000 + tv->tv_usec - 1000000;
	}
}

/*
 * Marks the time when algo state diverges from the rib state.
 */
static void
mark_diverge_time(struct fib_data *fd)
{
	struct fib_sync_status *fd_ss = &fd->fd_ss;

	getmicrouptime(&fd_ss->diverge_time);
	fd_ss->bucket_id = 0;
	fd_ss->bucket_changes = 0;
}

/*
 * Calculates and updates the next algorithm sync time, based on the current activity.
 *
 * The intent is to provide reasonable balance between the update
 * latency and efficient batching when changing a large number of routes.
 *
 * The high-level algorithm is as follows:
 * 1) all changes are bucketed in 50ms intervals
 * 2) If the number of changes within the bucket exceeds the threshold,
 *    the update gets delayed, up to the maximum delay threshold.
 */
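/*
 * Worked example with the defaults above (bucket_time_ms = 50,
 * bucket_change_threshold_rate = 500/s, fib_max_sync_delay_ms = 1000):
 * the per-bucket threshold is 500 * 50 / 1000 = 25 changes.  The first
 * change after divergence schedules a sync one bucket (50 ms) ahead; if a
 * later bucket N receives its 25th change, the sync is pushed out to
 * (N + 2) * 50 ms after the divergence point.  Once that value would
 * exceed 1000 ms, the delay is no longer extended.
 */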
static void
update_rebuild_delay(struct fib_data *fd, enum fib_callout_action action)
{
	uint32_t bucket_id, new_delay = 0;
	struct timeval tv;

	/* Fetch all variables at once to ensure consistent reads */
	uint32_t bucket_time_ms = V_bucket_time_ms;
	uint32_t threshold_rate = V_bucket_change_threshold_rate;
	uint32_t max_delay_ms = V_fib_max_sync_delay_ms;

	if (bucket_time_ms == 0)
		bucket_time_ms = 50;
	/* calculate per-bucket threshold rate */
	threshold_rate = threshold_rate * bucket_time_ms / 1000;

	getmicrouptime(&tv);

	struct fib_sync_status *fd_ss = &fd->fd_ss;

	bucket_id = get_tv_diff_ms(&fd_ss->diverge_time, &tv) / bucket_time_ms;

	if (fd_ss->bucket_id == bucket_id) {
		fd_ss->bucket_changes++;
		if (fd_ss->bucket_changes == threshold_rate) {
			new_delay = (bucket_id + 2) * bucket_time_ms;
			if (new_delay <= max_delay_ms) {
				FD_PRINTF(LOG_DEBUG, fd,
				    "hit threshold of %u routes, delay update, "
				    "bucket: %u, total delay: %u",
				    threshold_rate, bucket_id + 1, new_delay);
			} else {
				new_delay = 0;
				FD_PRINTF(LOG_DEBUG, fd,
				    "maximum sync delay (%u ms) reached", max_delay_ms);
			}
		} else if ((bucket_id == 0) && (fd_ss->bucket_changes == 1))
			new_delay = bucket_time_ms;
	} else {
		fd_ss->bucket_id = bucket_id;
		fd_ss->bucket_changes = 1;
	}

	if (new_delay > 0) {
		/* Calculated time has been updated */
		struct timeval new_tv = fd_ss->diverge_time;
		add_tv_diff_ms(&new_tv, new_delay);

		int32_t delay_ms = get_tv_diff_ms(&tv, &new_tv);
		schedule_callout(fd, action, delay_ms);
	}
}

static void
update_algo_state(struct fib_data *fd)
{

	RIB_WLOCK_ASSERT(fd->fd_rh);

	if (fd->fd_batch || fd->fd_need_rebuild) {
		enum fib_callout_action action = fd->fd_need_rebuild ? FDA_REBUILD : FDA_BATCH;
		update_rebuild_delay(fd, action);
		return;
	}

	if (fd->fd_num_changes++ == 0) {
		/* Start callout to consider switch */
		if (!callout_pending(&fd->fd_callout))
			schedule_callout(fd, FDA_EVAL, ALGO_EVAL_DELAY_MS);
	} else if (fd->fd_num_changes == ALGO_EVAL_NUM_ROUTES) {
		/* Reset callout to exec immediately */
		if (fd->fd_callout_action == FDA_EVAL)
			schedule_callout(fd, FDA_EVAL, 1);
	}
}

static bool
need_immediate_sync(struct fib_data *fd, struct rib_cmd_info *rc)
{
	struct nhop_object *nh;

	/* Sync addition/removal of interface routes */
	switch (rc->rc_cmd) {
	case RTM_ADD:
		nh = rc->rc_nh_new;
		if (!NH_IS_NHGRP(nh) && (!(nh->nh_flags & NHF_GATEWAY)))
			return (true);
		break;
	case RTM_DELETE:
		nh = rc->rc_nh_old;
		if (!NH_IS_NHGRP(nh) && (!(nh->nh_flags & NHF_GATEWAY)))
			return (true);
		break;
	}

	return (false);
}

static bool
apply_rtable_changes(struct fib_data *fd)
{
	enum flm_op_result result;
	struct fib_change_queue *q = &fd->fd_ss.fd_change_queue;

	result = fd->fd_flm->flm_change_rib_items_cb(fd->fd_rh, q, fd->fd_algo_data);

	if (result == FLM_SUCCESS) {
		sync_rib_gen(fd);
		for (int i = 0; i < q->count; i++)
			if (q->entries[i].nh_old)
				fib_unref_nhop(fd, q->entries[i].nh_old);
		q->count = 0;
	}
	fd->fd_batch = false;

	return (result == FLM_SUCCESS);
}

static bool
fill_change_entry(struct fib_data *fd, struct fib_change_entry *ce, struct rib_cmd_info *rc)
{
	int plen = 0;

	switch (fd->fd_family) {
	case AF_INET:
		rt_get_inet_prefix_plen(rc->rc_rt, &ce->addr4, &plen, &ce->scopeid);
		break;
	case AF_INET6:
		rt_get_inet6_prefix_plen(rc->rc_rt, &ce->addr6, &plen, &ce->scopeid);
		break;
	}

	ce->plen = plen;
	ce->nh_old = rc->rc_nh_old;
	ce->nh_new = rc->rc_nh_new;
	if (ce->nh_new != NULL) {
		if (fib_ref_nhop(fd, ce->nh_new) == 0)
			return (false);
	}

	return (true);
}

static bool
queue_rtable_change(struct fib_data *fd, struct rib_cmd_info *rc)
{
	struct fib_change_queue *q = &fd->fd_ss.fd_change_queue;

	if (q->count >= q->size) {
		uint32_t q_size;

		if (q->size == 0)
			q_size = 256;	/* ~18k memory */
		else
			q_size = q->size * 2;

		size_t size = q_size * sizeof(struct fib_change_entry);
		void *a = realloc(q->entries, size, M_TEMP, M_NOWAIT | M_ZERO);
		if (a == NULL) {
			FD_PRINTF(LOG_INFO, fd, "Unable to realloc queue for %u elements",
			    q_size);
			return (false);
		}
		q->entries = a;
		q->size = q_size;
	}

	return (fill_change_entry(fd, &q->entries[q->count++], rc));
}

/*
 * Rib subscription handler. Checks if the algorithm is ready to
 * receive updates, handles nexthop refcounting and passes change
 * data to the algorithm callback.
 */
static void
handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
    void *_data)
{
	struct fib_data *fd = (struct fib_data *)_data;
	enum flm_op_result result;

	RIB_WLOCK_ASSERT(rnh);

	/*
	 * There is a small gap between subscribing for route changes
	 * and initiating rtable dump. Avoid receiving route changes
	 * prior to finishing rtable dump by checking `init_done`.
	 */
	if (!fd->init_done)
		return;

	bool immediate_sync = need_immediate_sync(fd, rc);

	/* Consider scheduling algorithm re-evaluation */
	update_algo_state(fd);

	/*
	 * If algo requested rebuild, stop sending updates by default.
	 * This simplifies nexthop refcount handling logic.
	 */
	if (fd->fd_need_rebuild) {
		if (immediate_sync)
			rebuild_fd(fd, "rtable change type enforced sync");
		return;
	}

	/*
	 * Algo requested updates to be delivered in batches.
	 * Add the current change to the queue and return.
	 */
	if (fd->fd_batch) {
		if (immediate_sync) {
			if (!queue_rtable_change(fd, rc) || !apply_rtable_changes(fd))
				rebuild_fd(fd, "batch sync failed");
		} else {
			if (!queue_rtable_change(fd, rc))
				schedule_fd_rebuild(fd, "batch queue failed");
		}
		return;
	}

	/*
	 * Maintain guarantee that every nexthop returned by the dataplane
	 * lookup has > 0 refcount, so it can be safely referenced within the
	 * current epoch.
	 */
	if (rc->rc_nh_new != NULL) {
		if (fib_ref_nhop(fd, rc->rc_nh_new) == 0) {
			/* ran out of indexes */
			schedule_fd_rebuild(fd, "ran out of nhop indexes");
			return;
		}
	}

	result = fd->fd_flm->flm_change_rib_item_cb(rnh, rc, fd->fd_algo_data);

	switch (result) {
	case FLM_SUCCESS:
		sync_rib_gen(fd);
		/* Unref old nexthop on success */
		if (rc->rc_nh_old != NULL)
			fib_unref_nhop(fd, rc->rc_nh_old);
		break;
	case FLM_BATCH:

		/*
		 * Algo asks to batch the changes.
		 */
		if (queue_rtable_change(fd, rc)) {
			if (!immediate_sync) {
				fd->fd_batch = true;
				mark_diverge_time(fd);
				update_rebuild_delay(fd, FDA_BATCH);
				break;
			}
			if (apply_rtable_changes(fd))
				break;
		}
		FD_PRINTF(LOG_ERR, fd, "batched sync failed, force the rebuild");

	case FLM_REBUILD:

		/*
		 * Algo is not able to apply the update.
		 * Schedule algo rebuild.
		 */
		if (!immediate_sync) {
			mark_diverge_time(fd);
			schedule_fd_rebuild(fd, "algo requested rebuild");
			break;
		}

		FD_PRINTF(LOG_INFO, fd, "running sync rebuild");
		rebuild_fd(fd, "rtable change type enforced sync");
		break;
	case FLM_ERROR:

		/*
		 * Algo reported a non-recoverable error.
		 * Record the error and schedule rebuild, which will
		 * trigger best algo selection.
		 */
		FD_PRINTF(LOG_ERR, fd, "algo reported non-recoverable error");
		if (!flm_error_add(fd->fd_flm, fd->fd_fibnum))
			FD_PRINTF(LOG_ERR, fd, "failed to ban algo");
		schedule_fd_rebuild(fd, "algo reported non-recoverable error");
	}
}

static void
estimate_nhop_scale(const struct fib_data *old_fd, struct fib_data *fd)
{

	if (old_fd == NULL) {
		// TODO: read from rtable
		fd->number_nhops = 16;
		return;
	}

	if (old_fd->hit_nhops && old_fd->number_nhops < FIB_MAX_NHOPS)
		fd->number_nhops = 2 * old_fd->number_nhops;
	else
		fd->number_nhops = old_fd->number_nhops;
}

struct walk_cbdata {
	struct fib_data		*fd;
	flm_dump_t		*func;
	enum flm_op_result	result;
};

/*
 * Handler called after all rtentries have been dumped.
 * Performs post-dump framework checks and calls
 * algo:flm_dump_end_cb().
 *
 * Updates walk_cbdata result.
 */
static void
sync_algo_end_cb(struct rib_head *rnh, enum rib_walk_hook stage, void *_data)
{
	struct walk_cbdata *w = (struct walk_cbdata *)_data;
	struct fib_data *fd = w->fd;

	RIB_WLOCK_ASSERT(w->fd->fd_rh);

	if (rnh->rib_dying) {
		w->result = FLM_ERROR;
		return;
	}

	if (fd->hit_nhops) {
		FD_PRINTF(LOG_INFO, fd, "ran out of nexthops at %u nhops",
		    fd->nh_ref_table->count);
		if (w->result == FLM_SUCCESS)
			w->result = FLM_REBUILD;
		return;
	}

	if (stage != RIB_WALK_HOOK_POST || w->result != FLM_SUCCESS)
		return;

	/* Post-dump hook, dump successful */
	w->result = fd->fd_flm->flm_dump_end_cb(fd->fd_algo_data, &fd->fd_dp);

	if (w->result == FLM_SUCCESS) {
		/* Mark init as done to allow routing updates */
		fd->init_done = 1;
	}
}

/*
 * Callback for each entry in rib.
 * Calls algo:flm_dump_rib_item_cb func as a part of initial
 * route table synchronisation.
 */
static int
sync_algo_cb(struct rtentry *rt, void *_data)
{
	struct walk_cbdata *w = (struct walk_cbdata *)_data;

	RIB_WLOCK_ASSERT(w->fd->fd_rh);

	if (w->result == FLM_SUCCESS && w->func) {

		/*
		 * Reference nexthops to maintain guarantee that
		 * each nexthop returned by datapath has > 0 references
		 * and can be safely referenced within current epoch.
		 */
		struct nhop_object *nh = rt_get_raw_nhop(rt);
		if (fib_ref_nhop(w->fd, nh) != 0)
			w->result = w->func(rt, w->fd->fd_algo_data);
		else
			w->result = FLM_REBUILD;
	}

	return (0);
}

/*
 * Dump all routing table state to the algo instance.
 */
static enum flm_op_result
sync_algo(struct fib_data *fd)
{
	struct walk_cbdata w = {
		.fd = fd,
		.func = fd->fd_flm->flm_dump_rib_item_cb,
		.result = FLM_SUCCESS,
	};

	rib_walk_ext_locked(fd->fd_rh, sync_algo_cb, sync_algo_end_cb, &w);

	FD_PRINTF(LOG_INFO, fd,
	    "initial dump completed (rtable version: %d), result: %s",
	    fd->fd_rh->rnh_gen, print_op_result(w.result));

	return (w.result);
}

/*
 * Schedules epoch-backed @fd instance deletion.
 * * Unlinks @fd from the list of active algo instances.
 * * Removes rib subscription.
 * * Stops callout.
 * * Schedules actual deletion.
 *
 * Assume @fd is already unlinked from the datapath.
 */
static int
schedule_destroy_fd_instance(struct fib_data *fd, bool in_callout)
{
	bool is_dead;

	NET_EPOCH_ASSERT();
	RIB_WLOCK_ASSERT(fd->fd_rh);

	FIB_MOD_LOCK();
	is_dead = fd->fd_dead;
	if (!is_dead)
		fd->fd_dead = true;
	if (fd->fd_linked) {
		TAILQ_REMOVE(&V_fib_data_list, fd, entries);
		fd->fd_linked = false;
	}
	FIB_MOD_UNLOCK();
	if (is_dead)
		return (0);

	FD_PRINTF(LOG_INFO, fd, "DETACH");

	if (fd->fd_rs != NULL)
		rib_unsibscribe_locked(fd->fd_rs);

	/*
	 * After rib_unsubscribe() no _new_ handle_rtable_change_cb() calls
	 * will be executed, hence no _new_ callout schedules will happen.
	 */
	callout_stop(&fd->fd_callout);

	fib_epoch_call(destroy_fd_instance_epoch, &fd->fd_epoch_ctx);

	return (0);
}

/*
 * Wipe all fd instances from the list matching rib specified by @rh.
 * If @keep_first is set, remove all but the first record.
 */
static void
fib_cleanup_algo(struct rib_head *rh, bool keep_first, bool in_callout)
{
	struct fib_data_head tmp_head = TAILQ_HEAD_INITIALIZER(tmp_head);
	struct fib_data *fd, *fd_tmp;
	struct epoch_tracker et;

	FIB_MOD_LOCK();
	TAILQ_FOREACH_SAFE(fd, &V_fib_data_list, entries, fd_tmp) {
		if (fd->fd_rh == rh) {
			if (keep_first) {
				keep_first = false;
				continue;
			}
			TAILQ_REMOVE(&V_fib_data_list, fd, entries);
			fd->fd_linked = false;
			TAILQ_INSERT_TAIL(&tmp_head, fd, entries);
		}
	}
	FIB_MOD_UNLOCK();

	/* Pass 2: remove each entry */
	NET_EPOCH_ENTER(et);
	TAILQ_FOREACH_SAFE(fd, &tmp_head, entries, fd_tmp) {
		if (!in_callout)
			RIB_WLOCK(fd->fd_rh);
		schedule_destroy_fd_instance(fd, in_callout);
		if (!in_callout)
			RIB_WUNLOCK(fd->fd_rh);
	}
	NET_EPOCH_EXIT(et);
}

void
fib_destroy_rib(struct rib_head *rh)
{

	/*
	 * rnh has `is_dying` flag set, so setup of new fd's will fail at
	 * sync_algo() stage, preventing new entries from being added to the
	 * list of active algos. Remove all existing entries for the
	 * particular rib.
	 */
	fib_cleanup_algo(rh, false, false);
}

/*
 * Finalises fd destruction by freeing all fd resources.
 */
static void
destroy_fd_instance(struct fib_data *fd)
{

	FD_PRINTF(LOG_INFO, fd, "destroy fd %p", fd);

	/* Call destroy callback first */
	if (fd->fd_algo_data != NULL)
		fd->fd_flm->flm_destroy_cb(fd->fd_algo_data);

	/* Nhop table */
	if ((fd->nh_idx != NULL) && (fd->nh_ref_table != NULL)) {
		for (int i = 0; i < fd->number_nhops; i++) {
			if (!is_idx_free(fd, i)) {
				FD_PRINTF(LOG_DEBUG2, fd, " FREE nhop %d %p",
				    i, fd->nh_idx[i]);
				nhop_free_any(fd->nh_idx[i]);
			}
		}
		free(fd->nh_idx, M_RTABLE);
	}
	if (fd->nh_ref_table != NULL)
		free(fd->nh_ref_table, M_RTABLE);

	if (fd->fd_ss.fd_change_queue.entries != NULL)
		free(fd->fd_ss.fd_change_queue.entries, M_TEMP);

	fib_unref_algo(fd->fd_flm);

	free(fd, M_RTABLE);
}

/*
 * Epoch callback indicating fd is safe to destroy
 */
static void
destroy_fd_instance_epoch(epoch_context_t ctx)
{
	struct fib_data *fd;

	fd = __containerof(ctx, struct fib_data, fd_epoch_ctx);

	destroy_fd_instance(fd);
}

/*
 * Tries to set up fd instance.
 * - Allocates fd/nhop table
 * - Runs algo:flm_init_cb algo init
 * - Subscribes fd to the rib
 * - Runs rtable dump
 * - Adds instance to the list of active instances.
 *
 * Returns: operation result. Fills in @pfd with resulting fd on success.
 *
 */
static enum flm_op_result
try_setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
    struct fib_data *old_fd, struct fib_data **pfd)
{
	struct fib_data *fd;
	size_t size;
	enum flm_op_result result;

	/* Allocate */
	fd = malloc(sizeof(struct fib_data), M_RTABLE, M_NOWAIT | M_ZERO);
	if (fd == NULL) {
		*pfd = NULL;
		RH_PRINTF(LOG_INFO, rh, "Unable to allocate fib_data structure");
		return (FLM_REBUILD);
	}
	*pfd = fd;

	estimate_nhop_scale(old_fd, fd);

	fd->fd_rh = rh;
	fd->fd_gen = ++fib_gen;
	fd->fd_family = rh->rib_family;
	fd->fd_fibnum = rh->rib_fibnum;
	callout_init_rm(&fd->fd_callout, &rh->rib_lock, 0);
	fd->fd_vnet = curvnet;
	fd->fd_flm = flm;

	FD_PRINTF(LOG_DEBUG, fd, "allocated fd %p", fd);

	FIB_MOD_LOCK();
	flm->flm_refcount++;
	FIB_MOD_UNLOCK();

	/* Allocate nhidx -> nhop_ptr table */
	size = fd->number_nhops * sizeof(void *);
	fd->nh_idx = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
	if (fd->nh_idx == NULL) {
		FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop table idx (sz:%zu)", size);
		return (FLM_REBUILD);
	}

	/* Allocate nhop index refcount table */
	size = sizeof(struct nhop_ref_table);
	size += fd->number_nhops * sizeof(uint32_t);
	fd->nh_ref_table = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
	if (fd->nh_ref_table == NULL) {
		FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop refcount table (sz:%zu)", size);
		return (FLM_REBUILD);
	}
	FD_PRINTF(LOG_DEBUG, fd, "Allocated %u nhop indexes", fd->number_nhops);

	/* Okay, we're ready for algo init */
	void *old_algo_data = (old_fd != NULL) ? old_fd->fd_algo_data : NULL;
	result = flm->flm_init_cb(fd->fd_fibnum, fd, old_algo_data, &fd->fd_algo_data);
	if (result != FLM_SUCCESS) {
		FD_PRINTF(LOG_INFO, fd, "%s algo init failed", flm->flm_name);
		return (result);
	}

	/* Try to subscribe */
	if (flm->flm_change_rib_item_cb != NULL) {
		fd->fd_rs = rib_subscribe_locked(fd->fd_rh,
		    handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE);
		if (fd->fd_rs == NULL) {
			FD_PRINTF(LOG_INFO, fd, "failed to subscribe to the rib changes");
			return (FLM_REBUILD);
		}
	}

	/* Dump */
	result = sync_algo(fd);
	if (result != FLM_SUCCESS) {
		FD_PRINTF(LOG_INFO, fd, "rib sync failed");
		return (result);
	}
	FD_PRINTF(LOG_INFO, fd, "DUMP completed successfully.");

	FIB_MOD_LOCK();
	/*
	 * Insert fd at the beginning of the list, to maintain the invariant
	 * that the first matching entry for the AF/fib is always the active
	 * one.
	 */
	TAILQ_INSERT_HEAD(&V_fib_data_list, fd, entries);
	fd->fd_linked = true;
	FIB_MOD_UNLOCK();

	return (FLM_SUCCESS);
}

/*
 * Sets up algo @flm for table @rh and links it to the datapath.
 *
 */
static enum flm_op_result
setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
    struct fib_data *orig_fd, struct fib_data **pfd, bool attach)
{
	struct fib_data *prev_fd, *new_fd;
	enum flm_op_result result;

	NET_EPOCH_ASSERT();
	RIB_WLOCK_ASSERT(rh);

	prev_fd = orig_fd;
	new_fd = NULL;
	for (int i = 0; i < FIB_MAX_TRIES; i++) {
		result = try_setup_fd_instance(flm, rh, prev_fd, &new_fd);

		if ((result == FLM_SUCCESS) && attach) {
			if (fib_set_datapath_ptr(new_fd, &new_fd->fd_dp))
				sync_rib_gen(new_fd);
			else
				result = FLM_REBUILD;
		}

		if ((prev_fd != NULL) && (prev_fd != orig_fd)) {
			schedule_destroy_fd_instance(prev_fd, false);
			prev_fd = NULL;
		}

		RH_PRINTF(LOG_INFO, rh, "try %d: fib algo result: %s", i,
		    print_op_result(result));

		if (result == FLM_REBUILD) {
			prev_fd = new_fd;
			new_fd = NULL;
			continue;
		}

		break;
	}

	if (result != FLM_SUCCESS) {
		RH_PRINTF(LOG_WARNING, rh,
		    "%s algo instance setup failed, failures=%d", flm->flm_name,
		    orig_fd ? orig_fd->fd_failed_rebuilds + 1 : 0);
		/* update failure count */
		FIB_MOD_LOCK();
		if (orig_fd != NULL)
			orig_fd->fd_failed_rebuilds++;
		FIB_MOD_UNLOCK();

		/* Ban algo on non-recoverable error */
		if (result == FLM_ERROR)
			flm_error_add(flm, rh->rib_fibnum);

		if ((prev_fd != NULL) && (prev_fd != orig_fd))
			schedule_destroy_fd_instance(prev_fd, false);
		if (new_fd != NULL) {
			schedule_destroy_fd_instance(new_fd, false);
			new_fd = NULL;
		}
	}

	*pfd = new_fd;
	return (result);
}

/*
 * Tries to sync algo with the current rtable state, either
 * by executing batch update or rebuilding.
 * Returns true on success.
 */
static bool
execute_callout_action(struct fib_data *fd)
{
	enum fib_callout_action action = fd->fd_callout_action;
	struct fib_lookup_module *flm_new = NULL;
	bool result = true;

	NET_EPOCH_ASSERT();
	RIB_WLOCK_ASSERT(fd->fd_rh);

	fd->fd_need_rebuild = false;
	fd->fd_batch = false;
	fd->fd_num_changes = 0;

	/* First, check if we're still OK to use this algo */
	if (!is_algo_fixed(fd->fd_rh))
		flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm);
	if (flm_new != NULL)
		action = FDA_REBUILD;

	if (action == FDA_BATCH) {
		/* Try to sync */
		if (!apply_rtable_changes(fd))
			action = FDA_REBUILD;
	}

	if (action == FDA_REBUILD)
		result = rebuild_fd_flm(fd, flm_new != NULL ? flm_new : fd->fd_flm);
	if (flm_new != NULL)
		fib_unref_algo(flm_new);

	return (result);
}

/*
 * Callout for all scheduled fd-related work.
 * - Checks if the current algo is still the best algo
 * - Synchronises algo instance to the rtable (batch usecase)
 * - Creates a new instance of an algo for af/fib if desired.
 */
static void
handle_fd_callout(void *_data)
{
	struct fib_data *fd = (struct fib_data *)_data;
	struct epoch_tracker et;

	FD_PRINTF(LOG_INFO, fd, "running callout type=%d", fd->fd_callout_action);

	NET_EPOCH_ENTER(et);
	CURVNET_SET(fd->fd_vnet);
	execute_callout_action(fd);
	CURVNET_RESTORE();
	NET_EPOCH_EXIT(et);
}

/*
 * Tries to create new algo instance based on @fd data.
 * Returns true on success.
 */
static bool
rebuild_fd_flm(struct fib_data *fd, struct fib_lookup_module *flm_new)
{
	struct fib_data *fd_new, *fd_tmp = NULL;
	enum flm_op_result result;

	if (flm_new == fd->fd_flm)
		fd_tmp = fd;
	else
		FD_PRINTF(LOG_NOTICE, fd, "switching algo to %s", flm_new->flm_name);

	result = setup_fd_instance(flm_new, fd->fd_rh, fd_tmp, &fd_new, true);
	if (result != FLM_SUCCESS) {
		FD_PRINTF(LOG_NOTICE, fd, "table rebuild failed");
		return (false);
	}
	FD_PRINTF(LOG_INFO, fd_new, "switched to new instance");

	/* Remove old instance */
	schedule_destroy_fd_instance(fd, true);

	return (true);
}

static bool
rebuild_fd(struct fib_data *fd, const char *reason)
{
	struct fib_lookup_module *flm_new = NULL;
	bool result;

	if (!is_algo_fixed(fd->fd_rh))
		flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm);

	FD_PRINTF(LOG_INFO, fd, "running sync rebuild: %s", reason);
	result = rebuild_fd_flm(fd, flm_new != NULL ? flm_new : fd->fd_flm);
	if (flm_new != NULL)
		fib_unref_algo(flm_new);

	if (!result) {
		FD_PRINTF(LOG_ERR, fd, "sync rebuild failed");
		schedule_fd_rebuild(fd, "sync rebuild failed");
	}

	return (result);
}

/*
 * Finds algo by name/family.
 * Returns referenced algo or NULL.
 */
static struct fib_lookup_module *
fib_find_algo(const char *algo_name, int family)
{
	struct fib_lookup_module *flm;

	FIB_MOD_LOCK();
	TAILQ_FOREACH(flm, &all_algo_list, entries) {
		if ((strcmp(flm->flm_name, algo_name) == 0) &&
		    (family == flm->flm_family)) {
			flm->flm_refcount++;
			FIB_MOD_UNLOCK();
			return (flm);
		}
	}
	FIB_MOD_UNLOCK();

	return (NULL);
}

static void
fib_unref_algo(struct fib_lookup_module *flm)
{

	FIB_MOD_LOCK();
	flm->flm_refcount--;
	FIB_MOD_UNLOCK();
}

static int
set_fib_algo(uint32_t fibnum, int family, struct sysctl_oid *oidp, struct sysctl_req *req)
{
	struct fib_lookup_module *flm = NULL;
	struct fib_data *fd = NULL;
	char old_algo_name[32], algo_name[32];
	struct rib_head *rh = NULL;
	enum flm_op_result result;
	struct epoch_tracker et;
	int error;

	/* Fetch current algo/rib for af/family */
	FIB_MOD_LOCK();
	TAILQ_FOREACH(fd, &V_fib_data_list, entries) {
		if ((fd->fd_family == family) && (fd->fd_fibnum == fibnum))
			break;
	}
	if (fd == NULL) {
		FIB_MOD_UNLOCK();
		return (ENOENT);
	}
	rh = fd->fd_rh;
	strlcpy(old_algo_name, fd->fd_flm->flm_name,
	    sizeof(old_algo_name));
	FIB_MOD_UNLOCK();

	strlcpy(algo_name, old_algo_name, sizeof(algo_name));
	error = sysctl_handle_string(oidp, algo_name, sizeof(algo_name), req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	if (strcmp(algo_name, old_algo_name) == 0)
		return (0);

	/* New algorithm name is different */
	flm = fib_find_algo(algo_name, family);
	if (flm == NULL) {
		RH_PRINTF(LOG_INFO, rh, "unable to find algo %s", algo_name);
		return (ESRCH);
	}

	fd = NULL;
	NET_EPOCH_ENTER(et);
	RIB_WLOCK(rh);
	result = setup_fd_instance(flm, rh, NULL, &fd, true);
	RIB_WUNLOCK(rh);
	NET_EPOCH_EXIT(et);
	fib_unref_algo(flm);
	if (result != FLM_SUCCESS)
		return (EINVAL);

	/* Disable automated jumping between algos */
	FIB_MOD_LOCK();
	set_algo_fixed(rh);
	FIB_MOD_UNLOCK();
	/* Remove old instance(s) */
	fib_cleanup_algo(rh, true, false);

	/* Drain cb so user can unload the module after userret if so desired */
	epoch_drain_callbacks(net_epoch_preempt);

	return (0);
}
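/*
 * Usage example (illustrative; the set of available algorithm names
 * depends on the lookup modules registered for the family, see the
 * algo_list sysctls above):
 *
 *	# sysctl net.route.algo.inet.algo_list
 *	# sysctl net.route.algo.inet.algo=<algo name>
 *
 * Setting the value pins the table to the chosen algorithm and disables
 * automatic switching for that family (see set_algo_fixed()).
 */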
#ifdef INET
static int
set_algo_inet_sysctl_handler(SYSCTL_HANDLER_ARGS)
{

	return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET, oidp, req));
}
SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo,
    CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
    set_algo_inet_sysctl_handler, "A", "Set IPv4 lookup algo");
#endif

#ifdef INET6
static int
set_algo_inet6_sysctl_handler(SYSCTL_HANDLER_ARGS)
{

	return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET6, oidp, req));
}
SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo,
    CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
    set_algo_inet6_sysctl_handler, "A", "Set IPv6 lookup algo");
#endif

static void
destroy_fdh_epoch(epoch_context_t ctx)
{
	struct fib_dp_header *fdh;

	fdh = __containerof(ctx, struct fib_dp_header, fdh_epoch_ctx);
	free(fdh, M_RTABLE);
}

static struct fib_dp_header *
alloc_fib_dp_array(uint32_t num_tables, bool waitok)
{
	size_t sz;
	struct fib_dp_header *fdh;

	sz = sizeof(struct fib_dp_header);
	sz += sizeof(struct fib_dp) * num_tables;
	fdh = malloc(sz, M_RTABLE, (waitok ? M_WAITOK : M_NOWAIT) | M_ZERO);
	if (fdh != NULL)
		fdh->fdh_num_tables = num_tables;
	return (fdh);
}

static struct fib_dp_header *
get_fib_dp_header(struct fib_dp *dp)
{

	return (__containerof((void *)dp, struct fib_dp_header, fdh_idx));
}

/*
 * Replace per-family index pool @pdp with a new one which
 * contains updated callback/algo data from @fd.
 * Returns true on success.
 */
static bool
replace_rtables_family(struct fib_dp **pdp, struct fib_data *fd, struct fib_dp *dp)
{
	struct fib_dp_header *new_fdh, *old_fdh;

	NET_EPOCH_ASSERT();

	FD_PRINTF(LOG_DEBUG, fd, "[vnet %p] replace with f:%p arg:%p",
	    curvnet, dp->f, dp->arg);

	FIB_MOD_LOCK();
	old_fdh = get_fib_dp_header(*pdp);

	if (old_fdh->fdh_idx[fd->fd_fibnum].f == dp->f) {
		/*
		 * Function is the same, data pointer needs update.
		 * Perform in-line replace without reallocation.
		 */
		old_fdh->fdh_idx[fd->fd_fibnum].arg = dp->arg;
		FD_PRINTF(LOG_DEBUG, fd, "FDH %p inline update", old_fdh);
		FIB_MOD_UNLOCK();
		return (true);
	}

	new_fdh = alloc_fib_dp_array(old_fdh->fdh_num_tables, false);
	FD_PRINTF(LOG_DEBUG, fd, "OLD FDH: %p NEW FDH: %p", old_fdh, new_fdh);
	if (new_fdh == NULL) {
		FIB_MOD_UNLOCK();
		FD_PRINTF(LOG_WARNING, fd, "error attaching datapath");
		return (false);
	}

	memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0],
	    old_fdh->fdh_num_tables * sizeof(struct fib_dp));
	/* Update relevant data structure for @fd */
	new_fdh->fdh_idx[fd->fd_fibnum] = *dp;

	/* Ensure memcpy() writes have completed */
	atomic_thread_fence_rel();
	/* Set new datapath pointer */
	*pdp = &new_fdh->fdh_idx[0];
	FIB_MOD_UNLOCK();
	FD_PRINTF(LOG_DEBUG, fd, "update %p -> %p", old_fdh, new_fdh);

	fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx);

	return (true);
}

static struct fib_dp **
get_family_dp_ptr(int family)
{
	switch (family) {
	case AF_INET:
		return (&V_inet_dp);
	case AF_INET6:
		return (&V_inet6_dp);
	}
	return (NULL);
}

/*
 * Make datapath use fib instance @fd
 */
bool
fib_set_datapath_ptr(struct fib_data *fd, struct fib_dp *dp)
{
	struct fib_dp **pdp;

	pdp = get_family_dp_ptr(fd->fd_family);
	return (replace_rtables_family(pdp, fd, dp));
}

/*
 * Grow datapath pointers array.
 * Called from sysctl handler on growing number of routing tables.
 */
static void
grow_rtables_family(struct fib_dp **pdp, uint32_t new_num_tables)
{
	struct fib_dp_header *new_fdh, *old_fdh = NULL;

	new_fdh = alloc_fib_dp_array(new_num_tables, true);

	FIB_MOD_LOCK();
	if (*pdp != NULL) {
		old_fdh = get_fib_dp_header(*pdp);
		memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0],
		    old_fdh->fdh_num_tables * sizeof(struct fib_dp));
	}

	/* Wait till all writes completed */
	atomic_thread_fence_rel();

	*pdp = &new_fdh->fdh_idx[0];
	FIB_MOD_UNLOCK();

	if (old_fdh != NULL)
		fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx);
}

/*
 * Grows per-AF arrays of datapath pointers for each supported family.
 * Called from fibs resize sysctl handler.
 */
void
fib_grow_rtables(uint32_t new_num_tables)
{

#ifdef INET
	grow_rtables_family(get_family_dp_ptr(AF_INET), new_num_tables);
#endif
#ifdef INET6
	grow_rtables_family(get_family_dp_ptr(AF_INET6), new_num_tables);
#endif
}

void
fib_get_rtable_info(struct rib_head *rh, struct rib_rtable_info *rinfo)
{

	bzero(rinfo, sizeof(struct rib_rtable_info));
	rinfo->num_prefixes = rh->rnh_prefixes;
	rinfo->num_nhops = nhops_get_count(rh);
#ifdef ROUTE_MPATH
	rinfo->num_nhgrp = nhgrp_get_count(rh);
#endif
}

/*
 * Updates pointer to the algo data for the @fd.
 */
void
fib_set_algo_ptr(struct fib_data *fd, void *algo_data)
{
	RIB_WLOCK_ASSERT(fd->fd_rh);

	fd->fd_algo_data = algo_data;
}

/*
 * Calls @callback with @ctx after the end of a current epoch.
 */
void
fib_epoch_call(epoch_callback_t callback, epoch_context_t ctx)
{
	epoch_call(net_epoch_preempt, callback, ctx);
}

/*
 * Accessor to get rib instance @fd is attached to.
 */
struct rib_head *
fib_get_rh(struct fib_data *fd)
{

	return (fd->fd_rh);
}

/*
 * Accessor to export idx->nhop array
 */
struct nhop_object **
fib_get_nhop_array(struct fib_data *fd)
{

	return (fd->nh_idx);
}

static uint32_t
get_nhop_idx(struct nhop_object *nh)
{
#ifdef ROUTE_MPATH
	if (NH_IS_NHGRP(nh))
		return (nhgrp_get_idx((struct nhgrp_object *)nh) * 2 - 1);
	else
		return (nhop_get_idx(nh) * 2);
#else
	return (nhop_get_idx(nh));
#endif
}

uint32_t
fib_get_nhop_idx(struct fib_data *fd, struct nhop_object *nh)
{

	return (get_nhop_idx(nh));
}
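/*
 * Index layout example: with ROUTE_MPATH the single nh_idx[] array is
 * shared between nexthops and nexthop groups by interleaving the two
 * namespaces - nexthop K maps to slot 2*K and nexthop group K to slot
 * 2*K - 1, so they never collide (e.g. nhop #3 -> 6, nhgrp #3 -> 5).
 * Without ROUTE_MPATH the nexthop index is used as-is.
 */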
static bool
is_idx_free(struct fib_data *fd, uint32_t index)
{

	return (fd->nh_ref_table->refcnt[index] == 0);
}

static uint32_t
fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh)
{
	uint32_t idx = get_nhop_idx(nh);

	if (idx >= fd->number_nhops) {
		fd->hit_nhops = 1;
		return (0);
	}

	if (is_idx_free(fd, idx)) {
		nhop_ref_any(nh);
		fd->nh_idx[idx] = nh;
		fd->nh_ref_table->count++;
		FD_PRINTF(LOG_DEBUG2, fd, " REF nhop %u %p", idx, fd->nh_idx[idx]);
	}
	fd->nh_ref_table->refcnt[idx]++;

	return (idx);
}

struct nhop_release_data {
	struct nhop_object	*nh;
	struct epoch_context	ctx;
};

static void
release_nhop_epoch(epoch_context_t ctx)
{
	struct nhop_release_data *nrd;

	nrd = __containerof(ctx, struct nhop_release_data, ctx);
	nhop_free_any(nrd->nh);
	free(nrd, M_TEMP);
}

/*
 * Delays nexthop refcount release.
 * Datapath may not have its data structures updated yet, so the old
 * nexthop may still be returned till the end of current epoch. Delay
 * refcount removal, as we may be removing the last instance, which will
 * trigger nexthop deletion, rendering returned nexthop invalid.
 */
static void
fib_schedule_release_nhop(struct fib_data *fd, struct nhop_object *nh)
{
	struct nhop_release_data *nrd;

	nrd = malloc(sizeof(struct nhop_release_data), M_TEMP, M_NOWAIT | M_ZERO);
	if (nrd != NULL) {
		nrd->nh = nh;
		fib_epoch_call(release_nhop_epoch, &nrd->ctx);
	} else {
		/*
		 * Unable to allocate memory. Leak nexthop to maintain guarantee
		 * that each nhop can be referenced.
		 */
		FD_PRINTF(LOG_ERR, fd, "unable to schedule nhop %p deletion", nh);
	}
}

static void
fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh)
{
	uint32_t idx = get_nhop_idx(nh);

	KASSERT((idx < fd->number_nhops), ("invalid nhop index"));
	KASSERT((nh == fd->nh_idx[idx]), ("index table contains wrong nh"));

	fd->nh_ref_table->refcnt[idx]--;
	if (fd->nh_ref_table->refcnt[idx] == 0) {
		FD_PRINTF(LOG_DEBUG, fd, " FREE nhop %d %p", idx, fd->nh_idx[idx]);
		fib_schedule_release_nhop(fd, fd->nh_idx[idx]);
	}
}

static void
set_algo_fixed(struct rib_head *rh)
{
	switch (rh->rib_family) {
#ifdef INET
	case AF_INET:
		V_algo_fixed_inet = true;
		break;
#endif
#ifdef INET6
	case AF_INET6:
		V_algo_fixed_inet6 = true;
		break;
#endif
	}
}

static bool
is_algo_fixed(struct rib_head *rh)
{

	switch (rh->rib_family) {
#ifdef INET
	case AF_INET:
		return (V_algo_fixed_inet);
#endif
#ifdef INET6
	case AF_INET6:
		return (V_algo_fixed_inet6);
#endif
	}
	return (false);
}

/*
 * Runs the check on what would be the best algo for rib @rh, assuming
 * that the current algo is the one specified by @orig_flm. Note that
 * it can be NULL for initial selection.
 *
 * Returns referenced new algo or NULL if the current one is the best.
 */
static struct fib_lookup_module *
fib_check_best_algo(struct rib_head *rh, struct fib_lookup_module *orig_flm)
{
	uint8_t preference, curr_preference = 0, best_preference = 0;
	struct fib_lookup_module *flm, *best_flm = NULL;
	struct rib_rtable_info rinfo;
	int candidate_algos = 0;

	fib_get_rtable_info(rh, &rinfo);

	FIB_MOD_LOCK();
	TAILQ_FOREACH(flm, &all_algo_list, entries) {
		if (flm->flm_family != rh->rib_family)
			continue;
		candidate_algos++;
		preference = flm->flm_get_pref(&rinfo);
		if (preference > best_preference) {
			if (!flm_error_check(flm, rh->rib_fibnum)) {
				best_preference = preference;
				best_flm = flm;
			}
		}
		if (flm == orig_flm)
			curr_preference = preference;
	}
	if ((best_flm != NULL) && (curr_preference + BEST_DIFF_PERCENT < best_preference))
		best_flm->flm_refcount++;
	else
		best_flm = NULL;
	FIB_MOD_UNLOCK();

	RH_PRINTF(LOG_DEBUG, rh, "candidate_algos: %d, curr: %s(%d) result: %s(%d)",
	    candidate_algos, orig_flm ? orig_flm->flm_name : "NULL", curr_preference,
	    best_flm ? best_flm->flm_name : (orig_flm ? orig_flm->flm_name : "NULL"),
	    best_preference);

	return (best_flm);
}

/*
 * Called when new route table is created.
 * Selects, allocates and attaches fib algo for the table.
 */
int
fib_select_algo_initial(struct rib_head *rh)
{
	struct fib_lookup_module *flm;
	struct fib_data *fd = NULL;
	enum flm_op_result result;
	struct epoch_tracker et;
	int error = 0;

	flm = fib_check_best_algo(rh, NULL);
	if (flm == NULL) {
		RH_PRINTF(LOG_CRIT, rh, "no algo selected");
		return (ENOENT);
	}
	RH_PRINTF(LOG_INFO, rh, "selected algo %s", flm->flm_name);

	NET_EPOCH_ENTER(et);
	RIB_WLOCK(rh);
	result = setup_fd_instance(flm, rh, NULL, &fd, false);
	RIB_WUNLOCK(rh);
	NET_EPOCH_EXIT(et);

	RH_PRINTF(LOG_DEBUG, rh, "result=%d fd=%p", result, fd);
	if (result == FLM_SUCCESS) {

		/*
		 * Attach datapath directly to avoid multiple reallocations
		 * during fib growth
		 */
		struct fib_dp_header *fdp;
		struct fib_dp **pdp;

		pdp = get_family_dp_ptr(rh->rib_family);
		if (pdp != NULL) {
			fdp = get_fib_dp_header(*pdp);
			fdp->fdh_idx[fd->fd_fibnum] = fd->fd_dp;
			FD_PRINTF(LOG_INFO, fd, "datapath attached");
		}
	} else {
		error = EINVAL;
		RH_PRINTF(LOG_CRIT, rh, "unable to setup algo %s", flm->flm_name);
	}

	fib_unref_algo(flm);

	return (error);
}

/*
 * Registers fib lookup module within the subsystem.
 */
int
fib_module_register(struct fib_lookup_module *flm)
{

	FIB_MOD_LOCK();
	ALGO_PRINTF("attaching %s to %s", flm->flm_name,
	    print_family(flm->flm_family));
	TAILQ_INSERT_TAIL(&all_algo_list, flm, entries);
	FIB_MOD_UNLOCK();

	return (0);
}

/*
 * Tries to unregister fib lookup module.
 *
 * Returns 0 on success, EBUSY if module is still used
 * by some of the tables.
 */
int
fib_module_unregister(struct fib_lookup_module *flm)
{

	FIB_MOD_LOCK();
	if (flm->flm_refcount > 0) {
		FIB_MOD_UNLOCK();
		return (EBUSY);
	}
	fib_error_clear_flm(flm);
	ALGO_PRINTF("detaching %s from %s", flm->flm_name,
	    print_family(flm->flm_family));
	TAILQ_REMOVE(&all_algo_list, flm, entries);
	FIB_MOD_UNLOCK();

	return (0);
}

void
vnet_fib_init(void)
{

	TAILQ_INIT(&V_fib_data_list);
}

void
vnet_fib_destroy(void)
{

	FIB_MOD_LOCK();
	fib_error_clear();
	FIB_MOD_UNLOCK();
}