/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2020 Alexander V. Chernikov
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_route.h"

#include <sys/param.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/sbuf.h>
#include <sys/lock.h>
#include <sys/rmlock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/queue.h>
#include <net/vnet.h>

#include <net/if.h>
#include <net/if_var.h>

#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#endif

#include <net/route.h>
#include <net/route/nhop.h>
#include <net/route/route_ctl.h>
#include <net/route/route_var.h>
#include <net/route/fib_algo.h>

#include <machine/stdarg.h>

/*
 * Fib lookup framework.
 *
 * This framework enables accelerated longest-prefix-match lookups for the
 * routing tables by adding the ability to dynamically attach/detach lookup
 * algorithm implementations to/from the datapath.
 *
 * flm - fib lookup modules - implementation of a particular lookup algorithm
 * fd - fib data - instance of an flm bound to a specific routing table
 *
 * This file provides the main framework functionality.
 *
 * The following are the features provided by the framework:
 *
 * 1) nexthop abstraction -> provides transparent referencing, indexing
 *   and efficient idx->ptr mappings for nexthops and nexthop groups.
 * 2) Routing table synchronisation
 * 3) dataplane attachment points
 * 4) automatic algorithm selection based on the provided preference.
 *
 *
 * DATAPATH
 * For each supported address family, there is an allocated array of fib_dp
 * structures, indexed by fib number. Each array entry contains a callback
 * function and its argument. This function will be called with a
 * family-specific lookup key, scope and the provided argument. This array
 * gets re-created every time a new algo instance gets created. Please take
 * a look at the replace_rtables_family() function for more details.
 *
 */
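/*
 * Illustrative sketch (not compiled here): a per-family dataplane consumer
 * is expected to pick the fib_dp entry for its fib and call the attached
 * algorithm roughly as follows; `key' and `scopeid' stand for the
 * family-specific lookup arguments mentioned above.
 *
 *	const struct fib_dp *dp = &V_inet_dp[fibnum];
 *	struct nhop_object *nh = dp->f(dp->arg, key, scopeid);
 *
 * The returned nexthop is guaranteed to stay referenced for the duration of
 * the current network epoch (see fib_ref_nhop()/fib_unref_nhop() below).
 */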
SYSCTL_DECL(_net_route);
SYSCTL_NODE(_net_route, OID_AUTO, algo, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Fib algorithm lookups");

/* Algorithm sync policy */

/* Time interval to bucket updates */
VNET_DEFINE(unsigned int, bucket_time_ms) = 50;
#define	V_bucket_time_ms	VNET(bucket_time_ms)
SYSCTL_UINT(_net_route_algo, OID_AUTO, bucket_time_ms, CTLFLAG_RW | CTLFLAG_VNET,
    &VNET_NAME(bucket_time_ms), 0, "Time interval to calculate update rate");

/* Minimum update rate to delay sync */
VNET_DEFINE(unsigned int, bucket_change_threshold_rate) = 500;
#define	V_bucket_change_threshold_rate	VNET(bucket_change_threshold_rate)
SYSCTL_UINT(_net_route_algo, OID_AUTO, bucket_change_threshold_rate, CTLFLAG_RW | CTLFLAG_VNET,
    &VNET_NAME(bucket_change_threshold_rate), 0, "Minimum update rate to delay sync");

/* Max allowed delay to sync */
VNET_DEFINE(unsigned int, fib_max_sync_delay_ms) = 1000;
#define	V_fib_max_sync_delay_ms	VNET(fib_max_sync_delay_ms)
SYSCTL_UINT(_net_route_algo, OID_AUTO, fib_max_sync_delay_ms, CTLFLAG_RW | CTLFLAG_VNET,
    &VNET_NAME(fib_max_sync_delay_ms), 0, "Maximum time to delay sync (ms)");


#ifdef INET6
VNET_DEFINE_STATIC(bool, algo_fixed_inet6) = false;
#define	V_algo_fixed_inet6	VNET(algo_fixed_inet6)
SYSCTL_NODE(_net_route_algo, OID_AUTO, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IPv6 longest prefix match lookups");
#endif
#ifdef INET
VNET_DEFINE_STATIC(bool, algo_fixed_inet) = false;
#define	V_algo_fixed_inet	VNET(algo_fixed_inet)
SYSCTL_NODE(_net_route_algo, OID_AUTO, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IPv4 longest prefix match lookups");
#endif

/* Fib instance counter */
static uint32_t fib_gen = 0;

struct nhop_ref_table {
	uint32_t	count;
	int32_t		refcnt[0];
};

enum fib_callout_action {
	FDA_NONE,	/* No callout scheduled */
	FDA_REBUILD,	/* Asks to rebuild algo instance */
	FDA_EVAL,	/* Asks to evaluate if the current algo is still the best */
};

struct fib_sync_status {
	struct timeval	diverge_time;	/* ts when diverged */
	uint32_t	num_changes;	/* number of changes since sync */
	uint32_t	bucket_changes;	/* num changes within the current bucket */
	uint64_t	bucket_id;	/* 50ms bucket # */
};

/*
 * Data structure for the fib lookup instance tied to the particular rib.
 */
struct fib_data {
	uint32_t	number_nhops;	/* current # of nhops */
	uint8_t		hit_nhops;	/* true if out of nhop limit */
	uint8_t		init_done;	/* true if init is completed */
	uint32_t	fd_dead:1;	/* Scheduled for deletion */
	uint32_t	fd_linked:1;	/* true if linked */
	uint32_t	fd_need_rebuild:1;	/* true if rebuild scheduled */
	uint8_t		fd_family;	/* family */
	uint32_t	fd_fibnum;	/* fibnum */
	uint32_t	fd_failed_rebuilds;	/* stat: failed rebuilds */
	uint32_t	fd_gen;		/* instance gen# */
	struct callout	fd_callout;	/* rebuild callout */
	enum fib_callout_action	fd_callout_action;	/* Callout action to take */
	void		*fd_algo_data;	/* algorithm data */
	struct nhop_object	**nh_idx;	/* nhop idx->ptr array */
	struct nhop_ref_table	*nh_ref_table;	/* array with # of nhop references */
	struct rib_head	*fd_rh;		/* RIB table we're attached to */
	struct rib_subscription	*fd_rs;	/* storing table subscription */
	struct fib_dp	fd_dp;		/* fib datapath data */
	struct vnet	*fd_vnet;	/* vnet fib belongs to */
	struct epoch_context	fd_epoch_ctx;	/* epoch context for deletion */
	struct fib_lookup_module	*fd_flm;	/* pointer to the lookup module */
	struct fib_sync_status	fd_ss;	/* State relevant to the rib sync */
	uint32_t	fd_num_changes;	/* number of changes since last callout */
	TAILQ_ENTRY(fib_data)	entries;	/* list of all fds in vnet */
};

static bool rebuild_fd(struct fib_data *fd, const char *reason);
static bool rebuild_fd_flm(struct fib_data *fd, struct fib_lookup_module *flm_new);
static void handle_fd_callout(void *_data);
static void destroy_fd_instance_epoch(epoch_context_t ctx);
static enum flm_op_result attach_datapath(struct fib_data *fd);
static bool is_idx_free(struct fib_data *fd, uint32_t index);
static void set_algo_fixed(struct rib_head *rh);
static bool is_algo_fixed(struct rib_head *rh);

static uint32_t fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh);
static void fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh);

static struct fib_lookup_module *fib_check_best_algo(struct rib_head *rh,
    struct fib_lookup_module *orig_flm);
static void fib_unref_algo(struct fib_lookup_module *flm);
static bool flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum);

struct mtx fib_mtx;
#define	FIB_MOD_LOCK()		mtx_lock(&fib_mtx)
#define	FIB_MOD_UNLOCK()	mtx_unlock(&fib_mtx)
#define	FIB_MOD_LOCK_ASSERT()	mtx_assert(&fib_mtx, MA_OWNED)

MTX_SYSINIT(fib_mtx, &fib_mtx, "algo list mutex", MTX_DEF);

/* Algorithm has to be this percent better than the current to switch */
#define	BEST_DIFF_PERCENT	(5 * 256 / 100)
/* Schedule algo re-evaluation X seconds after a change */
#define	ALGO_EVAL_DELAY_MS	30000
/* Force algo re-evaluation after X changes */
#define	ALGO_EVAL_NUM_ROUTES	100
/* Try to setup algorithm X times */
#define	FIB_MAX_TRIES		32
/* Max amount of supported nexthops */
#define	FIB_MAX_NHOPS		262144
#define	FIB_CALLOUT_DELAY_MS	50


/* Debug */
static int flm_debug_level = LOG_NOTICE;
SYSCTL_INT(_net_route_algo, OID_AUTO, debug_level, CTLFLAG_RW | CTLFLAG_RWTUN,
    &flm_debug_level, 0, "debuglevel");
#define	FLM_MAX_DEBUG_LEVEL	LOG_DEBUG
#ifndef	LOG_DEBUG2
#define	LOG_DEBUG2	8
#endif

#define	_PASS_MSG(_l)	(flm_debug_level >= (_l))
#define	ALGO_PRINTF(_fmt, ...)	\
    printf("[fib_algo] %s: " _fmt "\n", __func__, ##__VA_ARGS__)
#define	_ALGO_PRINTF(_fib, _fam, _aname, _gen, _func, _fmt, ...) \
    printf("[fib_algo] %s.%u (%s#%u) %s: " _fmt "\n",\
    print_family(_fam), _fib, _aname, _gen, _func, ## __VA_ARGS__)
#define	_RH_PRINTF(_fib, _fam, _func, _fmt, ...) \
    printf("[fib_algo] %s.%u %s: " _fmt "\n", print_family(_fam), _fib, _func, ## __VA_ARGS__)
#define	RH_PRINTF(_l, _rh, _fmt, ...)	if (_PASS_MSG(_l)) {	\
    _RH_PRINTF(_rh->rib_fibnum, _rh->rib_family, __func__, _fmt, ## __VA_ARGS__);\
}
#define	FD_PRINTF(_l, _fd, _fmt, ...)	FD_PRINTF_##_l(_l, _fd, _fmt, ## __VA_ARGS__)
#define	_FD_PRINTF(_l, _fd, _fmt, ...)	if (_PASS_MSG(_l)) {	\
    _ALGO_PRINTF(_fd->fd_fibnum, _fd->fd_family, _fd->fd_flm->flm_name,	\
    _fd->fd_gen, __func__, _fmt, ## __VA_ARGS__);	\
}
#if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG2
#define	FD_PRINTF_LOG_DEBUG2	_FD_PRINTF
#else
#define	FD_PRINTF_LOG_DEBUG2(_l, _fd, _fmt, ...)
#endif
#if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG
#define	FD_PRINTF_LOG_DEBUG	_FD_PRINTF
#else
#define	FD_PRINTF_LOG_DEBUG(_l, _fd, _fmt, ...)
#endif
#if FLM_MAX_DEBUG_LEVEL>=LOG_INFO
#define	FD_PRINTF_LOG_INFO	_FD_PRINTF
#else
#define	FD_PRINTF_LOG_INFO(_l, _fd, _fmt, ...)
#endif
#define	FD_PRINTF_LOG_NOTICE	_FD_PRINTF
#define	FD_PRINTF_LOG_ERR	_FD_PRINTF
#define	FD_PRINTF_LOG_WARNING	_FD_PRINTF


/* List of all registered lookup algorithms */
static TAILQ_HEAD(, fib_lookup_module) all_algo_list = TAILQ_HEAD_INITIALIZER(all_algo_list);

/* List of all fib lookup instances in the vnet */
VNET_DEFINE_STATIC(TAILQ_HEAD(fib_data_head, fib_data), fib_data_list);
#define	V_fib_data_list		VNET(fib_data_list)

/* Data structure for storing non-transient fib lookup module failures */
struct fib_error {
	int				fe_family;
	uint32_t			fe_fibnum;	/* failed rtable */
	struct fib_lookup_module	*fe_flm;	/* failed module */
	TAILQ_ENTRY(fib_error)		entries;	/* list of all errored entries */
};
VNET_DEFINE_STATIC(TAILQ_HEAD(fib_error_head, fib_error), fib_error_list);
#define	V_fib_error_list	VNET(fib_error_list)

/* Per-family array of fibnum -> {func, arg} mappings used in datapath */
struct fib_dp_header {
	struct epoch_context	fdh_epoch_ctx;
	uint32_t		fdh_num_tables;
	struct fib_dp		fdh_idx[0];
};

/*
 * Tries to add new non-transient algorithm error to the list of
 * errors.
 * Returns true on success.
 */
static bool
flm_error_add(struct fib_lookup_module *flm, uint32_t fibnum)
{
	struct fib_error *fe;

	fe = malloc(sizeof(struct fib_error), M_TEMP, M_NOWAIT | M_ZERO);
	if (fe == NULL)
		return (false);
	fe->fe_flm = flm;
	fe->fe_family = flm->flm_family;
	fe->fe_fibnum = fibnum;

	FIB_MOD_LOCK();
	/* Avoid duplicates by checking if error already exists first */
	if (flm_error_check(flm, fibnum)) {
		FIB_MOD_UNLOCK();
		free(fe, M_TEMP);
		return (true);
	}
	TAILQ_INSERT_HEAD(&V_fib_error_list, fe, entries);
	FIB_MOD_UNLOCK();

	return (true);
}

/*
 * True if non-transient error has been registered for @flm in @fibnum.
 */
static bool
flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum)
{
	const struct fib_error *fe;

	TAILQ_FOREACH(fe, &V_fib_error_list, entries) {
		if ((fe->fe_flm == flm) && (fe->fe_fibnum == fibnum))
			return (true);
	}

	return (false);
}

/*
 * Clear all errors of algo specified by @flm.
 */
static void
fib_error_clear_flm(struct fib_lookup_module *flm)
{
	struct fib_error *fe, *fe_tmp;

	FIB_MOD_LOCK_ASSERT();

	TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
		if (fe->fe_flm == flm) {
			TAILQ_REMOVE(&V_fib_error_list, fe, entries);
			free(fe, M_TEMP);
		}
	}
}

/*
 * Clears all errors in current VNET.
 */
static void
fib_error_clear()
{
	struct fib_error *fe, *fe_tmp;

	FIB_MOD_LOCK_ASSERT();

	TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
		TAILQ_REMOVE(&V_fib_error_list, fe, entries);
		free(fe, M_TEMP);
	}
}

static const char *
print_op_result(enum flm_op_result result)
{
	switch (result) {
	case FLM_SUCCESS:
		return "success";
	case FLM_REBUILD:
		return "rebuild";
	case FLM_ERROR:
		return "error";
	}

	return "unknown";
}

static const char *
print_family(int family)
{

	if (family == AF_INET)
		return ("inet");
	else if (family == AF_INET6)
		return ("inet6");
	else
		return ("unknown");
}

/*
 * Debug function used by lookup algorithms.
 * Outputs message denoted by @fmt, prepended by "[fib_algo] inetX.Y (algo) "
 */
void
fib_printf(int level, struct fib_data *fd, const char *func, char *fmt, ...)
{
	char buf[128];
	va_list ap;

	if (level > flm_debug_level)
		return;

	va_start(ap, fmt);
	vsnprintf(buf, sizeof(buf), fmt, ap);
	va_end(ap);

	_ALGO_PRINTF(fd->fd_fibnum, fd->fd_family, fd->fd_flm->flm_name,
	    fd->fd_gen, func, "%s", buf);
}
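/*
 * Example (illustrative only): a lookup module would typically log through
 * the wrapper above, e.g.
 *
 *	fib_printf(LOG_DEBUG, fd, __func__, "inserted %u prefixes", count);
 *
 * which results in a "[fib_algo] inetX.Y (algo#gen) func: ..." line, subject
 * to the net.route.algo.debug_level sysctl defined earlier.
 */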
/*
 * Outputs list of algorithms supported by the provided address family.
 */
static int
print_algos_sysctl(struct sysctl_req *req, int family)
{
	struct fib_lookup_module *flm;
	struct sbuf sbuf;
	int error, count = 0;

	error = sysctl_wire_old_buffer(req, 0);
	if (error == 0) {
		sbuf_new_for_sysctl(&sbuf, NULL, 512, req);
		TAILQ_FOREACH(flm, &all_algo_list, entries) {
			if (flm->flm_family == family) {
				if (count++ > 0)
					sbuf_cat(&sbuf, ", ");
				sbuf_cat(&sbuf, flm->flm_name);
			}
		}
		error = sbuf_finish(&sbuf);
		sbuf_delete(&sbuf);
	}
	return (error);
}

#ifdef INET6
static int
print_algos_sysctl_inet6(SYSCTL_HANDLER_ARGS)
{

	return (print_algos_sysctl(req, AF_INET6));
}
SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo_list,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    print_algos_sysctl_inet6, "A", "List of IPv6 lookup algorithms");
#endif

#ifdef INET
static int
print_algos_sysctl_inet(SYSCTL_HANDLER_ARGS)
{

	return (print_algos_sysctl(req, AF_INET));
}
SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo_list,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    print_algos_sysctl_inet, "A", "List of IPv4 lookup algorithms");
#endif

/*
 * Calculate delay between repeated failures.
 * Returns current delay in milliseconds.
 */
static uint32_t
callout_calc_delay_ms(struct fib_data *fd)
{
	uint32_t shift;

	if (fd->fd_failed_rebuilds > 10)
		shift = 10;
	else
		shift = fd->fd_failed_rebuilds;

	return ((1 << shift) * FIB_CALLOUT_DELAY_MS);
}
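/*
 * For illustration: with FIB_CALLOUT_DELAY_MS of 50 the retry delay grows
 * exponentially with the number of failed rebuilds and is capped at
 * 2^10 * 50ms: 0 failures -> 50ms, 3 failures -> 400ms,
 * 10 or more failures -> 51200ms (~51s).
 */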
static void
schedule_callout(struct fib_data *fd, enum fib_callout_action action, int delay_ms)
{

	FD_PRINTF(LOG_DEBUG, fd, "delay=%d action=%d", delay_ms, action);
	fd->fd_callout_action = action;
	callout_reset_sbt(&fd->fd_callout, SBT_1MS * delay_ms, 0,
	    handle_fd_callout, fd, 0);
}

static void
schedule_fd_rebuild(struct fib_data *fd, const char *reason)
{

	RIB_WLOCK_ASSERT(fd->fd_rh);

	if (!fd->fd_need_rebuild) {
		fd->fd_need_rebuild = true;

		/*
		 * Potentially re-schedules pending callout
		 * initiated by schedule_algo_eval.
		 */
		FD_PRINTF(LOG_INFO, fd, "Scheduling rebuild: %s (failures=%d)",
		    reason, fd->fd_failed_rebuilds);
		schedule_callout(fd, FDA_REBUILD, callout_calc_delay_ms(fd));
	}
}

static int64_t
get_tv_diff_ms(const struct timeval *old_tv, const struct timeval *new_tv)
{
	int64_t diff = 0;

	diff = ((int64_t)(new_tv->tv_sec - old_tv->tv_sec)) * 1000;
	diff += (new_tv->tv_usec - old_tv->tv_usec) / 1000;

	return (diff);
}

static void
add_tv_diff_ms(struct timeval *tv, int ms)
{
	tv->tv_sec += ms / 1000;
	ms = ms % 1000;
	if (ms * 1000 + tv->tv_usec < 1000000)
		tv->tv_usec += ms * 1000;
	else {
		tv->tv_sec += 1;
		tv->tv_usec = ms * 1000 + tv->tv_usec - 1000000;
	}
}

/*
 * Marks the time when algo state diverges from the rib state.
 */
static void
mark_diverge_time(struct fib_data *fd)
{
	struct fib_sync_status *fd_ss = &fd->fd_ss;

	getmicrouptime(&fd_ss->diverge_time);
	fd_ss->bucket_id = 0;
	fd_ss->bucket_changes = 0;
}

/*
 * Calculates and updates the next algorithm sync time, based on the current activity.
 *
 * The intent is to provide a reasonable balance between the update
 * latency and efficient batching when changing a large number of routes.
 *
 * The high-level algorithm is the following:
 * 1) all changes are bucketed in 50ms intervals
 * 2) If the number of changes within the bucket is greater than the threshold,
 *  the update gets delayed, up to the maximum delay threshold.
 */
static void
update_rebuild_delay(struct fib_data *fd)
{
	uint32_t bucket_id, new_delay = 0;
	struct timeval tv;

	/* Fetch all variables at once to ensure consistent reads */
	uint32_t bucket_time_ms = V_bucket_time_ms;
	uint32_t threshold_rate = V_bucket_change_threshold_rate;
	uint32_t max_delay_ms = V_fib_max_sync_delay_ms;

	if (bucket_time_ms == 0)
		bucket_time_ms = 50;
	/* calculate per-bucket threshold rate */
	threshold_rate = threshold_rate * bucket_time_ms / 1000;

	getmicrouptime(&tv);

	struct fib_sync_status *fd_ss = &fd->fd_ss;

	bucket_id = get_tv_diff_ms(&fd_ss->diverge_time, &tv) / bucket_time_ms;

	if (fd_ss->bucket_id == bucket_id) {
		fd_ss->bucket_changes++;
		if (fd_ss->bucket_changes == threshold_rate) {
			new_delay = (bucket_id + 2) * bucket_time_ms;
			if (new_delay <= max_delay_ms) {
				FD_PRINTF(LOG_DEBUG, fd,
				    "hit threshold of %u routes, delay update, "
				    "bucket: %u, total delay: %u",
				    threshold_rate, bucket_id + 1, new_delay);
			} else {
				new_delay = 0;
				FD_PRINTF(LOG_DEBUG, fd,
				    "maximum sync delay (%u ms) reached", max_delay_ms);
			}
		} else if ((bucket_id == 0) && (fd_ss->bucket_changes == 1))
			new_delay = bucket_time_ms;
	} else {
		fd_ss->bucket_id = bucket_id;
		fd_ss->bucket_changes = 1;
	}

	if (new_delay > 0) {
		/* Calculated time has been updated */
		struct timeval new_tv = fd_ss->diverge_time;
		add_tv_diff_ms(&new_tv, new_delay);

		int32_t delay_ms = get_tv_diff_ms(&tv, &new_tv);
		schedule_callout(fd, FDA_REBUILD, delay_ms);
	}
}
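/*
 * Worked example with the default sysctl values (50ms buckets, threshold
 * rate of 500 changes/sec, 1000ms max delay): the per-bucket threshold is
 * 500 * 50 / 1000 = 25 changes.  The first change after divergence schedules
 * a sync 50ms later; if a subsequent 50ms bucket sees 25 or more changes,
 * the sync is pushed out to the end of the following bucket, but is never
 * pushed past 1000ms after the divergence point.
 */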
static void
update_algo_state(struct fib_data *fd)
{

	RIB_WLOCK_ASSERT(fd->fd_rh);

	if (fd->fd_need_rebuild) {
		update_rebuild_delay(fd);
		return;
	}

	if (fd->fd_num_changes++ == 0) {
		/* Start callout to consider switch */
		if (!callout_pending(&fd->fd_callout))
			schedule_callout(fd, FDA_EVAL, ALGO_EVAL_DELAY_MS);
	} else if (fd->fd_num_changes == ALGO_EVAL_NUM_ROUTES) {
		/* Reset callout to exec immediately */
		if (fd->fd_callout_action == FDA_EVAL)
			schedule_callout(fd, FDA_EVAL, 1);
	}
}

static bool
need_immediate_sync(struct fib_data *fd, struct rib_cmd_info *rc)
{
	struct nhop_object *nh;

	/* Sync addition/removal of interface routes */
	switch (rc->rc_cmd) {
	case RTM_ADD:
		nh = rc->rc_nh_new;
		if (!NH_IS_NHGRP(nh) && (!(nh->nh_flags & NHF_GATEWAY)))
			return (true);
		break;
	case RTM_DELETE:
		nh = rc->rc_nh_old;
		if (!NH_IS_NHGRP(nh) && (!(nh->nh_flags & NHF_GATEWAY)))
			return (true);
		break;
	}

	return (false);
}

/*
 * Rib subscription handler. Checks if the algorithm is ready to
 * receive updates, handles nexthop refcounting and passes change
 * data to the algorithm callback.
 */
static void
handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
    void *_data)
{
	struct fib_data *fd = (struct fib_data *)_data;
	enum flm_op_result result;

	RIB_WLOCK_ASSERT(rnh);

	/*
	 * There is a small gap between subscribing for route changes
	 * and initiating rtable dump. Avoid receiving route changes
	 * prior to finishing rtable dump by checking `init_done`.
	 */
	if (!fd->init_done)
		return;

	bool immediate_sync = need_immediate_sync(fd, rc);

	/* Consider scheduling algorithm re-evaluation */
	update_algo_state(fd);

	/*
	 * If algo requested rebuild, stop sending updates by default.
	 * This simplifies nexthop refcount handling logic.
	 */
	if (fd->fd_need_rebuild)
		return;

	/*
	 * Maintain guarantee that every nexthop returned by the dataplane
	 * lookup has > 0 refcount, so it can be safely referenced within the
	 * current epoch.
	 */
	if (rc->rc_nh_new != NULL) {
		if (fib_ref_nhop(fd, rc->rc_nh_new) == 0) {
			/* ran out of indexes */
			schedule_fd_rebuild(fd, "ran out of nhop indexes");
			return;
		}
	}

	result = fd->fd_flm->flm_change_rib_item_cb(rnh, rc, fd->fd_algo_data);

	switch (result) {
	case FLM_SUCCESS:
		/* Unref old nexthop on success */
		if (rc->rc_nh_old != NULL)
			fib_unref_nhop(fd, rc->rc_nh_old);
		break;
	case FLM_REBUILD:

		/*
		 * Algo is not able to apply the update.
		 * Schedule algo rebuild.
		 */
		if (!immediate_sync) {
			mark_diverge_time(fd);
			schedule_fd_rebuild(fd, "algo requested rebuild");
			break;
		}

		FD_PRINTF(LOG_INFO, fd, "running sync rebuild");
		rebuild_fd(fd, "rtable change type enforced sync");
		break;
	case FLM_ERROR:

		/*
		 * Algo reported a non-recoverable error.
		 * Record the error and schedule rebuild, which will
		 * trigger best algo selection.
		 */
		FD_PRINTF(LOG_ERR, fd, "algo reported non-recoverable error");
		if (!flm_error_add(fd->fd_flm, fd->fd_fibnum))
			FD_PRINTF(LOG_ERR, fd, "failed to ban algo");
		schedule_fd_rebuild(fd, "algo reported non-recoverable error");
	}
}

static void
estimate_nhop_scale(const struct fib_data *old_fd, struct fib_data *fd)
{

	if (old_fd == NULL) {
		// TODO: read from rtable
		fd->number_nhops = 16;
		return;
	}

	if (old_fd->hit_nhops && old_fd->number_nhops < FIB_MAX_NHOPS)
		fd->number_nhops = 2 * old_fd->number_nhops;
	else
		fd->number_nhops = old_fd->number_nhops;
}

struct walk_cbdata {
	struct fib_data		*fd;
	flm_dump_t		*func;
	enum flm_op_result	result;
};

/*
 * Handler called after all rtentries have been dumped.
 * Performs post-dump framework checks and calls
 * algo:flm_dump_end_cb().
 *
 * Updates walk_cbdata result.
 */
static void
sync_algo_end_cb(struct rib_head *rnh, enum rib_walk_hook stage, void *_data)
{
	struct walk_cbdata *w = (struct walk_cbdata *)_data;
	struct fib_data *fd = w->fd;

	RIB_WLOCK_ASSERT(w->fd->fd_rh);

	if (rnh->rib_dying) {
		w->result = FLM_ERROR;
		return;
	}

	if (fd->hit_nhops) {
		FD_PRINTF(LOG_INFO, fd, "ran out of nexthops at %u nhops",
		    fd->nh_ref_table->count);
		if (w->result == FLM_SUCCESS)
			w->result = FLM_REBUILD;
		return;
	}

	if (stage != RIB_WALK_HOOK_POST || w->result != FLM_SUCCESS)
		return;

	/* Post-dump hook, dump successful */
	w->result = fd->fd_flm->flm_dump_end_cb(fd->fd_algo_data, &fd->fd_dp);

	if (w->result == FLM_SUCCESS) {
		/* Mark init as done to allow routing updates */
		fd->init_done = 1;
	}
}

/*
 * Callback for each entry in rib.
 * Calls algo:flm_dump_rib_item_cb func as a part of initial
 * route table synchronisation.
 */
static int
sync_algo_cb(struct rtentry *rt, void *_data)
{
	struct walk_cbdata *w = (struct walk_cbdata *)_data;

	RIB_WLOCK_ASSERT(w->fd->fd_rh);

	if (w->result == FLM_SUCCESS && w->func) {

		/*
		 * Reference nexthops to maintain guarantee that
		 * each nexthop returned by datapath has > 0 references
		 * and can be safely referenced within current epoch.
		 */
		struct nhop_object *nh = rt_get_raw_nhop(rt);
		if (fib_ref_nhop(w->fd, nh) != 0)
			w->result = w->func(rt, w->fd->fd_algo_data);
		else
			w->result = FLM_REBUILD;
	}

	return (0);
}

/*
 * Dump all routing table state to the algo instance.
 */
static enum flm_op_result
sync_algo(struct fib_data *fd)
{
	struct walk_cbdata w = {
		.fd = fd,
		.func = fd->fd_flm->flm_dump_rib_item_cb,
		.result = FLM_SUCCESS,
	};

	rib_walk_ext_locked(fd->fd_rh, sync_algo_cb, sync_algo_end_cb, &w);

	FD_PRINTF(LOG_INFO, fd,
	    "initial dump completed (rtable version: %d), result: %s",
	    fd->fd_rh->rnh_gen, print_op_result(w.result));

	return (w.result);
}

/*
 * Schedules epoch-backed @fd instance deletion.
 * * Unlinks @fd from the list of active algo instances.
 * * Removes rib subscription.
 * * Stops callout.
 * * Schedules actual deletion.
 *
 * Assumes @fd is already unlinked from the datapath.
 */
static int
schedule_destroy_fd_instance(struct fib_data *fd, bool in_callout)
{
	bool is_dead;

	NET_EPOCH_ASSERT();
	RIB_WLOCK_ASSERT(fd->fd_rh);

	FIB_MOD_LOCK();
	is_dead = fd->fd_dead;
	if (!is_dead)
		fd->fd_dead = true;
	if (fd->fd_linked) {
		TAILQ_REMOVE(&V_fib_data_list, fd, entries);
		fd->fd_linked = false;
	}
	FIB_MOD_UNLOCK();
	if (is_dead)
		return (0);

	FD_PRINTF(LOG_INFO, fd, "DETACH");

	if (fd->fd_rs != NULL)
		rib_unsibscribe_locked(fd->fd_rs);

	/*
	 * After rib_unsubscribe() no _new_ handle_rtable_change_cb() calls
	 * will be executed, hence no _new_ callout schedules will happen.
	 */
	callout_stop(&fd->fd_callout);

	epoch_call(net_epoch_preempt, destroy_fd_instance_epoch,
	    &fd->fd_epoch_ctx);

	return (0);
}

/*
 * Wipe all fd instances from the list matching rib specified by @rh.
 * If @keep_first is set, remove all but the first record.
 */
static void
fib_cleanup_algo(struct rib_head *rh, bool keep_first, bool in_callout)
{
	struct fib_data_head tmp_head = TAILQ_HEAD_INITIALIZER(tmp_head);
	struct fib_data *fd, *fd_tmp;
	struct epoch_tracker et;

	FIB_MOD_LOCK();
	TAILQ_FOREACH_SAFE(fd, &V_fib_data_list, entries, fd_tmp) {
		if (fd->fd_rh == rh) {
			if (keep_first) {
				keep_first = false;
				continue;
			}
			TAILQ_REMOVE(&V_fib_data_list, fd, entries);
			fd->fd_linked = false;
			TAILQ_INSERT_TAIL(&tmp_head, fd, entries);
		}
	}
	FIB_MOD_UNLOCK();

	/* Pass 2: remove each entry */
	NET_EPOCH_ENTER(et);
	TAILQ_FOREACH_SAFE(fd, &tmp_head, entries, fd_tmp) {
		if (!in_callout)
			RIB_WLOCK(fd->fd_rh);
		schedule_destroy_fd_instance(fd, in_callout);
		if (!in_callout)
			RIB_WUNLOCK(fd->fd_rh);
	}
	NET_EPOCH_EXIT(et);
}

void
fib_destroy_rib(struct rib_head *rh)
{

	/*
	 * rnh has the `is_dying` flag set, so setup of new fd's will fail at
	 * the sync_algo() stage, preventing new entries from being added to
	 * the list of active algos.
	 * Remove all existing entries for the particular rib.
	 */
	fib_cleanup_algo(rh, false, false);
}

/*
 * Finalises fd destruction by freeing all fd resources.
 */
static void
destroy_fd_instance(struct fib_data *fd)
{

	FD_PRINTF(LOG_INFO, fd, "destroy fd %p", fd);

	/* Call destroy callback first */
	if (fd->fd_algo_data != NULL)
		fd->fd_flm->flm_destroy_cb(fd->fd_algo_data);

	/* Nhop table */
	if ((fd->nh_idx != NULL) && (fd->nh_ref_table != NULL)) {
		for (int i = 0; i < fd->number_nhops; i++) {
			if (!is_idx_free(fd, i)) {
				FD_PRINTF(LOG_DEBUG2, fd, " FREE nhop %d %p",
				    i, fd->nh_idx[i]);
				nhop_free_any(fd->nh_idx[i]);
			}
		}
		free(fd->nh_idx, M_RTABLE);
	}
	if (fd->nh_ref_table != NULL)
		free(fd->nh_ref_table, M_RTABLE);

	fib_unref_algo(fd->fd_flm);

	free(fd, M_RTABLE);
}

/*
 * Epoch callback indicating fd is safe to destroy
 */
static void
destroy_fd_instance_epoch(epoch_context_t ctx)
{
	struct fib_data *fd;

	fd = __containerof(ctx, struct fib_data, fd_epoch_ctx);

	destroy_fd_instance(fd);
}

/*
 * Tries to setup fd instance.
 * - Allocates fd/nhop table
 * - Runs algo:flm_init_cb algo init
 * - Subscribes fd to the rib
 * - Runs rtable dump
 * - Adds instance to the list of active instances.
 *
 * Returns: operation result. Fills in @pfd with resulting fd on success.
 *
 */
static enum flm_op_result
try_setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
    struct fib_data *old_fd, struct fib_data **pfd)
{
	struct fib_data *fd;
	size_t size;
	enum flm_op_result result;

	/* Allocate */
	fd = malloc(sizeof(struct fib_data), M_RTABLE, M_NOWAIT | M_ZERO);
	if (fd == NULL) {
		*pfd = NULL;
		RH_PRINTF(LOG_INFO, rh, "Unable to allocate fib_data structure");
		return (FLM_REBUILD);
	}
	*pfd = fd;

	estimate_nhop_scale(old_fd, fd);

	fd->fd_rh = rh;
	fd->fd_gen = ++fib_gen;
	fd->fd_family = rh->rib_family;
	fd->fd_fibnum = rh->rib_fibnum;
	callout_init_rm(&fd->fd_callout, &rh->rib_lock, 0);
	fd->fd_vnet = curvnet;
	fd->fd_flm = flm;

	FD_PRINTF(LOG_DEBUG, fd, "allocated fd %p", fd);

	FIB_MOD_LOCK();
	flm->flm_refcount++;
	FIB_MOD_UNLOCK();

	/* Allocate nhidx -> nhop_ptr table */
	size = fd->number_nhops * sizeof(void *);
	fd->nh_idx = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
	if (fd->nh_idx == NULL) {
		FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop table idx (sz:%zu)", size);
		return (FLM_REBUILD);
	}

	/* Allocate nhop index refcount table */
	size = sizeof(struct nhop_ref_table);
	size += fd->number_nhops * sizeof(uint32_t);
	fd->nh_ref_table = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
	if (fd->nh_ref_table == NULL) {
		FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop refcount table (sz:%zu)", size);
		return (FLM_REBUILD);
	}
	FD_PRINTF(LOG_DEBUG, fd, "Allocated %u nhop indexes", fd->number_nhops);

	/* Okay, we're ready for algo init */
	void *old_algo_data = (old_fd != NULL) ? old_fd->fd_algo_data : NULL;
	result = flm->flm_init_cb(fd->fd_fibnum, fd, old_algo_data, &fd->fd_algo_data);
	if (result != FLM_SUCCESS) {
		FD_PRINTF(LOG_INFO, fd, "%s algo init failed", flm->flm_name);
		return (result);
	}

	/* Try to subscribe */
	if (flm->flm_change_rib_item_cb != NULL) {
		fd->fd_rs = rib_subscribe_locked(fd->fd_rh,
		    handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE);
		if (fd->fd_rs == NULL) {
			FD_PRINTF(LOG_INFO, fd, "failed to subscribe to the rib changes");
			return (FLM_REBUILD);
		}
	}

	/* Dump */
	result = sync_algo(fd);
	if (result != FLM_SUCCESS) {
		FD_PRINTF(LOG_INFO, fd, "rib sync failed");
		return (result);
	}
	FD_PRINTF(LOG_INFO, fd, "DUMP completed successfully.");

	FIB_MOD_LOCK();
	/*
	 * Insert fd in the beginning of a list, to maintain invariant
	 * that first matching entry for the AF/fib is always the active
	 * one.
	 */
	TAILQ_INSERT_HEAD(&V_fib_data_list, fd, entries);
	fd->fd_linked = true;
	FIB_MOD_UNLOCK();

	return (FLM_SUCCESS);
}

/*
 * Sets up algo @flm for table @rh and links it to the datapath.
 *
 */
static enum flm_op_result
setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
    struct fib_data *orig_fd, struct fib_data **pfd, bool attach)
{
	struct fib_data *prev_fd, *new_fd;
	enum flm_op_result result;

	NET_EPOCH_ASSERT();
	RIB_WLOCK_ASSERT(rh);

	prev_fd = orig_fd;
	new_fd = NULL;
	for (int i = 0; i < FIB_MAX_TRIES; i++) {
		result = try_setup_fd_instance(flm, rh, prev_fd, &new_fd);

		if ((result == FLM_SUCCESS) && attach)
			result = attach_datapath(new_fd);

		if ((prev_fd != NULL) && (prev_fd != orig_fd)) {
			schedule_destroy_fd_instance(prev_fd, false);
			prev_fd = NULL;
		}

		RH_PRINTF(LOG_INFO, rh, "try %d: fib algo result: %s", i,
		    print_op_result(result));

		if (result == FLM_REBUILD) {
			prev_fd = new_fd;
			new_fd = NULL;
			continue;
		}

		break;
	}

	if (result != FLM_SUCCESS) {
		RH_PRINTF(LOG_WARNING, rh,
		    "%s algo instance setup failed, failures=%d", flm->flm_name,
		    orig_fd ? orig_fd->fd_failed_rebuilds + 1 : 0);
		/* update failure count */
		FIB_MOD_LOCK();
		if (orig_fd != NULL)
			orig_fd->fd_failed_rebuilds++;
		FIB_MOD_UNLOCK();

		/* Ban algo on non-recoverable error */
		if (result == FLM_ERROR)
			flm_error_add(flm, rh->rib_fibnum);

		if ((prev_fd != NULL) && (prev_fd != orig_fd))
			schedule_destroy_fd_instance(prev_fd, false);
		if (new_fd != NULL) {
			schedule_destroy_fd_instance(new_fd, false);
			new_fd = NULL;
		}
	}

	*pfd = new_fd;
	return (result);
}

/*
 * Tries to sync algo with the current rtable state, either
 * by executing batch update or rebuilding.
 * Returns true on success.
 */
static bool
execute_callout_action(struct fib_data *fd)
{
	enum fib_callout_action action = fd->fd_callout_action;
	struct fib_lookup_module *flm_new = NULL;
	bool result = true;

	NET_EPOCH_ASSERT();
	RIB_WLOCK_ASSERT(fd->fd_rh);

	fd->fd_need_rebuild = false;
	fd->fd_num_changes = 0;

	/* First, check if we're still OK to use this algo */
	if (!is_algo_fixed(fd->fd_rh))
		flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm);
	if (flm_new != NULL)
		action = FDA_REBUILD;

	if (action == FDA_REBUILD)
		result = rebuild_fd_flm(fd, flm_new != NULL ? flm_new : fd->fd_flm);
	if (flm_new != NULL)
		fib_unref_algo(flm_new);

	return (result);
}

/*
 * Callout for all scheduled fd-related work.
 * - Checks if the current algo is still the best algo
 * - Creates a new instance of an algo for af/fib if desired.
 */
static void
handle_fd_callout(void *_data)
{
	struct fib_data *fd = (struct fib_data *)_data;
	struct epoch_tracker et;

	FD_PRINTF(LOG_INFO, fd, "running callout type=%d", fd->fd_callout_action);

	NET_EPOCH_ENTER(et);
	CURVNET_SET(fd->fd_vnet);
	execute_callout_action(fd);
	CURVNET_RESTORE();
	NET_EPOCH_EXIT(et);
}

/*
 * Tries to create new algo instance based on @fd data.
 * Returns true on success.
 */
static bool
rebuild_fd_flm(struct fib_data *fd, struct fib_lookup_module *flm_new)
{
	struct fib_data *fd_new, *fd_tmp = NULL;
	enum flm_op_result result;

	if (flm_new == fd->fd_flm)
		fd_tmp = fd;
	else
		FD_PRINTF(LOG_NOTICE, fd, "switching algo to %s", flm_new->flm_name);

	result = setup_fd_instance(flm_new, fd->fd_rh, fd_tmp, &fd_new, true);
	if (result != FLM_SUCCESS) {
		FD_PRINTF(LOG_NOTICE, fd, "table rebuild failed");
		return (false);
	}
	FD_PRINTF(LOG_INFO, fd_new, "switched to new instance");

	/* Remove old instance */
	schedule_destroy_fd_instance(fd, true);

	return (true);
}

static bool
rebuild_fd(struct fib_data *fd, const char *reason)
{
	struct fib_lookup_module *flm_new = NULL;
	bool result;

	if (!is_algo_fixed(fd->fd_rh))
		flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm);

	FD_PRINTF(LOG_INFO, fd, "running sync rebuild: %s", reason);
	result = rebuild_fd_flm(fd, flm_new != NULL ? flm_new : fd->fd_flm);
	if (flm_new != NULL)
		fib_unref_algo(flm_new);

	if (!result) {
		FD_PRINTF(LOG_ERR, fd, "sync rebuild failed");
		schedule_fd_rebuild(fd, "sync rebuild failed");
	}

	return (result);
}

/*
 * Finds algo by name/family.
 * Returns referenced algo or NULL.
 */
static struct fib_lookup_module *
fib_find_algo(const char *algo_name, int family)
{
	struct fib_lookup_module *flm;

	FIB_MOD_LOCK();
	TAILQ_FOREACH(flm, &all_algo_list, entries) {
		if ((strcmp(flm->flm_name, algo_name) == 0) &&
		    (family == flm->flm_family)) {
			flm->flm_refcount++;
			FIB_MOD_UNLOCK();
			return (flm);
		}
	}
	FIB_MOD_UNLOCK();

	return (NULL);
}

static void
fib_unref_algo(struct fib_lookup_module *flm)
{

	FIB_MOD_LOCK();
	flm->flm_refcount--;
	FIB_MOD_UNLOCK();
}

static int
set_fib_algo(uint32_t fibnum, int family, struct sysctl_oid *oidp, struct sysctl_req *req)
{
	struct fib_lookup_module *flm = NULL;
	struct fib_data *fd = NULL;
	char old_algo_name[32], algo_name[32];
	struct rib_head *rh = NULL;
	enum flm_op_result result;
	struct epoch_tracker et;
	int error;

	/* Fetch current algo/rib for af/family */
	FIB_MOD_LOCK();
	TAILQ_FOREACH(fd, &V_fib_data_list, entries) {
		if ((fd->fd_family == family) && (fd->fd_fibnum == fibnum))
			break;
	}
	if (fd == NULL) {
		FIB_MOD_UNLOCK();
		return (ENOENT);
	}
	rh = fd->fd_rh;
	strlcpy(old_algo_name, fd->fd_flm->flm_name,
	    sizeof(old_algo_name));
	FIB_MOD_UNLOCK();

	strlcpy(algo_name, old_algo_name, sizeof(algo_name));
	error = sysctl_handle_string(oidp, algo_name, sizeof(algo_name), req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	if (strcmp(algo_name, old_algo_name) == 0)
		return (0);

	/* New algorithm name is different */
	flm = fib_find_algo(algo_name, family);
	if (flm == NULL) {
		RH_PRINTF(LOG_INFO, rh, "unable to find algo %s", algo_name);
		return (ESRCH);
	}

	fd = NULL;
	NET_EPOCH_ENTER(et);
	RIB_WLOCK(rh);
	result = setup_fd_instance(flm, rh, NULL, &fd, true);
	RIB_WUNLOCK(rh);
	NET_EPOCH_EXIT(et);
	fib_unref_algo(flm);
	if (result != FLM_SUCCESS)
		return (EINVAL);

	/* Disable automated jumping between algos */
	FIB_MOD_LOCK();
	set_algo_fixed(rh);
	FIB_MOD_UNLOCK();
	/* Remove old instance(s) */
	fib_cleanup_algo(rh, true, false);

	/* Drain cb so user can unload the module after userret if so desired */
	epoch_drain_callbacks(net_epoch_preempt);

	return (0);
}

#ifdef INET
static int
set_algo_inet_sysctl_handler(SYSCTL_HANDLER_ARGS)
{

	return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET, oidp, req));
}
SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo,
    CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
    set_algo_inet_sysctl_handler, "A", "Set IPv4 lookup algo");
#endif

#ifdef INET6
static int
set_algo_inet6_sysctl_handler(SYSCTL_HANDLER_ARGS)
{

	return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET6, oidp, req));
}
SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo,
    CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
    set_algo_inet6_sysctl_handler, "A", "Set IPv6 lookup algo");
#endif
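/*
 * Usage note (example, not compiled): the handlers above allow the active
 * algorithm to be listed and switched at runtime, e.g.
 *
 *	sysctl net.route.algo.inet.algo_list
 *	sysctl net.route.algo.inet.algo=<algo_name>
 *
 * where <algo_name> is one of the names reported by algo_list. Setting the
 * value pins the table to that algorithm (see set_algo_fixed() below).
 */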
static void
destroy_fdh_epoch(epoch_context_t ctx)
{
	struct fib_dp_header *fdh;

	fdh = __containerof(ctx, struct fib_dp_header, fdh_epoch_ctx);
	free(fdh, M_RTABLE);
}

static struct fib_dp_header *
alloc_fib_dp_array(uint32_t num_tables, bool waitok)
{
	size_t sz;
	struct fib_dp_header *fdh;

	sz = sizeof(struct fib_dp_header);
	sz += sizeof(struct fib_dp) * num_tables;
	fdh = malloc(sz, M_RTABLE, (waitok ? M_WAITOK : M_NOWAIT) | M_ZERO);
	if (fdh != NULL)
		fdh->fdh_num_tables = num_tables;
	return (fdh);
}

static struct fib_dp_header *
get_fib_dp_header(struct fib_dp *dp)
{

	return (__containerof((void *)dp, struct fib_dp_header, fdh_idx));
}

/*
 * Replace per-family index pool @pdp with a new one which
 * contains updated callback/algo data from @fd.
 * Returns FLM_SUCCESS on success.
 */
static enum flm_op_result
replace_rtables_family(struct fib_dp **pdp, struct fib_data *fd)
{
	struct fib_dp_header *new_fdh, *old_fdh;

	NET_EPOCH_ASSERT();

	FD_PRINTF(LOG_DEBUG, fd, "[vnet %p] replace with f:%p arg:%p",
	    curvnet, fd->fd_dp.f, fd->fd_dp.arg);

	FIB_MOD_LOCK();
	old_fdh = get_fib_dp_header(*pdp);
	new_fdh = alloc_fib_dp_array(old_fdh->fdh_num_tables, false);
	FD_PRINTF(LOG_DEBUG, fd, "OLD FDH: %p NEW FDH: %p", old_fdh, new_fdh);
	if (new_fdh == NULL) {
		FIB_MOD_UNLOCK();
		FD_PRINTF(LOG_WARNING, fd, "error attaching datapath");
		return (FLM_REBUILD);
	}

	memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0],
	    old_fdh->fdh_num_tables * sizeof(struct fib_dp));
	/* Update relevant data structure for @fd */
	new_fdh->fdh_idx[fd->fd_fibnum] = fd->fd_dp;

	/* Ensure memcpy() writes have completed */
	atomic_thread_fence_rel();
	/* Set new datapath pointer */
	*pdp = &new_fdh->fdh_idx[0];
	FIB_MOD_UNLOCK();
	FD_PRINTF(LOG_DEBUG, fd, "update %p -> %p", old_fdh, new_fdh);

	epoch_call(net_epoch_preempt, destroy_fdh_epoch,
	    &old_fdh->fdh_epoch_ctx);

	return (FLM_SUCCESS);
}

static struct fib_dp **
get_family_dp_ptr(int family)
{
	switch (family) {
	case AF_INET:
		return (&V_inet_dp);
	case AF_INET6:
		return (&V_inet6_dp);
	}
	return (NULL);
}

/*
 * Make datapath use fib instance @fd
 */
static enum flm_op_result
attach_datapath(struct fib_data *fd)
{
	struct fib_dp **pdp;

	pdp = get_family_dp_ptr(fd->fd_family);
	return (replace_rtables_family(pdp, fd));
}

/*
 * Grow datapath pointers array.
 * Called from sysctl handler on growing number of routing tables.
 */
static void
grow_rtables_family(struct fib_dp **pdp, uint32_t new_num_tables)
{
	struct fib_dp_header *new_fdh, *old_fdh = NULL;

	new_fdh = alloc_fib_dp_array(new_num_tables, true);

	FIB_MOD_LOCK();
	if (*pdp != NULL) {
		old_fdh = get_fib_dp_header(*pdp);
		memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0],
		    old_fdh->fdh_num_tables * sizeof(struct fib_dp));
	}

	/* Wait till all writes completed */
	atomic_thread_fence_rel();

	*pdp = &new_fdh->fdh_idx[0];
	FIB_MOD_UNLOCK();

	if (old_fdh != NULL)
		epoch_call(net_epoch_preempt, destroy_fdh_epoch,
		    &old_fdh->fdh_epoch_ctx);
}

/*
 * Grows per-AF arrays of datapath pointers for each supported family.
 * Called from fibs resize sysctl handler.
 */
void
fib_grow_rtables(uint32_t new_num_tables)
{

#ifdef INET
	grow_rtables_family(get_family_dp_ptr(AF_INET), new_num_tables);
#endif
#ifdef INET6
	grow_rtables_family(get_family_dp_ptr(AF_INET6), new_num_tables);
#endif
}

void
fib_get_rtable_info(struct rib_head *rh, struct rib_rtable_info *rinfo)
{

	bzero(rinfo, sizeof(struct rib_rtable_info));
	rinfo->num_prefixes = rh->rnh_prefixes;
	rinfo->num_nhops = nhops_get_count(rh);
#ifdef ROUTE_MPATH
	rinfo->num_nhgrp = nhgrp_get_count(rh);
#endif
}

/*
 * Accessor to get rib instance @fd is attached to.
 */
struct rib_head *
fib_get_rh(struct fib_data *fd)
{

	return (fd->fd_rh);
}

/*
 * Accessor to export idx->nhop array
 */
struct nhop_object **
fib_get_nhop_array(struct fib_data *fd)
{

	return (fd->nh_idx);
}

static uint32_t
get_nhop_idx(struct nhop_object *nh)
{
#ifdef ROUTE_MPATH
	if (NH_IS_NHGRP(nh))
		return (nhgrp_get_idx((struct nhgrp_object *)nh) * 2 - 1);
	else
		return (nhop_get_idx(nh) * 2);
#else
	return (nhop_get_idx(nh));
#endif
}

uint32_t
fib_get_nhop_idx(struct fib_data *fd, struct nhop_object *nh)
{

	return (get_nhop_idx(nh));
}
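/*
 * For illustration: with ROUTE_MPATH the nexthop and nexthop-group index
 * spaces are interleaved into a single one, so plain nexthops map to even
 * indexes and groups to odd ones, e.g. nhop idx 1 -> 2, nhop idx 2 -> 4,
 * nhgrp idx 1 -> 1, nhgrp idx 2 -> 3.  This keeps the per-fd nh_idx/refcnt
 * arrays usable for both object types.
 */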
static bool
is_idx_free(struct fib_data *fd, uint32_t index)
{

	return (fd->nh_ref_table->refcnt[index] == 0);
}

static uint32_t
fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh)
{
	uint32_t idx = get_nhop_idx(nh);

	if (idx >= fd->number_nhops) {
		fd->hit_nhops = 1;
		return (0);
	}

	if (is_idx_free(fd, idx)) {
		nhop_ref_any(nh);
		fd->nh_idx[idx] = nh;
		fd->nh_ref_table->count++;
		FD_PRINTF(LOG_DEBUG2, fd, " REF nhop %u %p", idx, fd->nh_idx[idx]);
	}
	fd->nh_ref_table->refcnt[idx]++;

	return (idx);
}

struct nhop_release_data {
	struct nhop_object	*nh;
	struct epoch_context	ctx;
};

static void
release_nhop_epoch(epoch_context_t ctx)
{
	struct nhop_release_data *nrd;

	nrd = __containerof(ctx, struct nhop_release_data, ctx);
	nhop_free_any(nrd->nh);
	free(nrd, M_TEMP);
}

/*
 * Delays nexthop refcount release.
 * Datapath may have the datastructures not updated yet, so the old
 * nexthop may still be returned till the end of current epoch. Delay
 * refcount removal, as we may be removing the last instance, which will
 * trigger nexthop deletion, rendering returned nexthop invalid.
 */
static void
fib_schedule_release_nhop(struct fib_data *fd, struct nhop_object *nh)
{
	struct nhop_release_data *nrd;

	nrd = malloc(sizeof(struct nhop_release_data), M_TEMP, M_NOWAIT | M_ZERO);
	if (nrd != NULL) {
		nrd->nh = nh;
		epoch_call(net_epoch_preempt, release_nhop_epoch, &nrd->ctx);
	} else {
		/*
		 * Unable to allocate memory. Leak nexthop to maintain guarantee
		 * that each nhop can be referenced.
		 */
		FD_PRINTF(LOG_ERR, fd, "unable to schedule nhop %p deletion", nh);
	}
}

static void
fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh)
{
	uint32_t idx = get_nhop_idx(nh);

	KASSERT((idx < fd->number_nhops), ("invalid nhop index"));
	KASSERT((nh == fd->nh_idx[idx]), ("index table contains wrong nh"));

	fd->nh_ref_table->refcnt[idx]--;
	if (fd->nh_ref_table->refcnt[idx] == 0) {
		FD_PRINTF(LOG_DEBUG, fd, " FREE nhop %d %p", idx, fd->nh_idx[idx]);
		fib_schedule_release_nhop(fd, fd->nh_idx[idx]);
	}
}

static void
set_algo_fixed(struct rib_head *rh)
{
	switch (rh->rib_family) {
#ifdef INET
	case AF_INET:
		V_algo_fixed_inet = true;
		break;
#endif
#ifdef INET6
	case AF_INET6:
		V_algo_fixed_inet6 = true;
		break;
#endif
	}
}

static bool
is_algo_fixed(struct rib_head *rh)
{

	switch (rh->rib_family) {
#ifdef INET
	case AF_INET:
		return (V_algo_fixed_inet);
#endif
#ifdef INET6
	case AF_INET6:
		return (V_algo_fixed_inet6);
#endif
	}
	return (false);
}

/*
 * Runs the check on what would be the best algo for rib @rh, assuming
 * that the current algo is the one specified by @orig_flm. Note that
 * it can be NULL for initial selection.
 *
 * Returns referenced new algo or NULL if the current one is the best.
 */
static struct fib_lookup_module *
fib_check_best_algo(struct rib_head *rh, struct fib_lookup_module *orig_flm)
{
	uint8_t preference, curr_preference = 0, best_preference = 0;
	struct fib_lookup_module *flm, *best_flm = NULL;
	struct rib_rtable_info rinfo;
	int candidate_algos = 0;

	fib_get_rtable_info(rh, &rinfo);

	FIB_MOD_LOCK();
	TAILQ_FOREACH(flm, &all_algo_list, entries) {
		if (flm->flm_family != rh->rib_family)
			continue;
		candidate_algos++;
		preference = flm->flm_get_pref(&rinfo);
		if (preference > best_preference) {
			if (!flm_error_check(flm, rh->rib_fibnum)) {
				best_preference = preference;
				best_flm = flm;
			}
		}
		if (flm == orig_flm)
			curr_preference = preference;
	}
	if ((best_flm != NULL) && (curr_preference + BEST_DIFF_PERCENT < best_preference))
		best_flm->flm_refcount++;
	else
		best_flm = NULL;
	FIB_MOD_UNLOCK();

	RH_PRINTF(LOG_DEBUG, rh, "candidate_algos: %d, curr: %s(%d) result: %s(%d)",
	    candidate_algos, orig_flm ? orig_flm->flm_name : "NULL", curr_preference,
	    best_flm ? best_flm->flm_name : (orig_flm ? orig_flm->flm_name : "NULL"),
	    best_preference);

	return (best_flm);
}
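/*
 * For illustration: preferences are on a 0..255 scale and BEST_DIFF_PERCENT
 * is 5 * 256 / 100 = 12, so with a current preference of, say, 100 a
 * candidate needs a preference of at least 113 to trigger a switch.  This
 * hysteresis avoids flapping between algorithms with near-equal scores.
 */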
/*
 * Called when new route table is created.
 * Selects, allocates and attaches fib algo for the table.
 */
int
fib_select_algo_initial(struct rib_head *rh)
{
	struct fib_lookup_module *flm;
	struct fib_data *fd = NULL;
	enum flm_op_result result;
	struct epoch_tracker et;
	int error = 0;

	flm = fib_check_best_algo(rh, NULL);
	if (flm == NULL) {
		RH_PRINTF(LOG_CRIT, rh, "no algo selected");
		return (ENOENT);
	}
	RH_PRINTF(LOG_INFO, rh, "selected algo %s", flm->flm_name);

	NET_EPOCH_ENTER(et);
	RIB_WLOCK(rh);
	result = setup_fd_instance(flm, rh, NULL, &fd, false);
	RIB_WUNLOCK(rh);
	NET_EPOCH_EXIT(et);

	RH_PRINTF(LOG_DEBUG, rh, "result=%d fd=%p", result, fd);
	if (result == FLM_SUCCESS) {

		/*
		 * Attach datapath directly to avoid multiple reallocations
		 * during fib growth
		 */
		struct fib_dp_header *fdp;
		struct fib_dp **pdp;

		pdp = get_family_dp_ptr(rh->rib_family);
		if (pdp != NULL) {
			fdp = get_fib_dp_header(*pdp);
			fdp->fdh_idx[fd->fd_fibnum] = fd->fd_dp;
			FD_PRINTF(LOG_INFO, fd, "datapath attached");
		}
	} else {
		error = EINVAL;
		RH_PRINTF(LOG_CRIT, rh, "unable to setup algo %s", flm->flm_name);
	}

	fib_unref_algo(flm);

	return (error);
}

/*
 * Registers fib lookup module within the subsystem.
 */
int
fib_module_register(struct fib_lookup_module *flm)
{

	FIB_MOD_LOCK();
	ALGO_PRINTF("attaching %s to %s", flm->flm_name,
	    print_family(flm->flm_family));
	TAILQ_INSERT_TAIL(&all_algo_list, flm, entries);
	FIB_MOD_UNLOCK();

	return (0);
}

/*
 * Tries to unregister fib lookup module.
 *
 * Returns 0 on success, EBUSY if module is still used
 * by some of the tables.
 */
int
fib_module_unregister(struct fib_lookup_module *flm)
{

	FIB_MOD_LOCK();
	if (flm->flm_refcount > 0) {
		FIB_MOD_UNLOCK();
		return (EBUSY);
	}
	fib_error_clear_flm(flm);
	ALGO_PRINTF("detaching %s from %s", flm->flm_name,
	    print_family(flm->flm_family));
	TAILQ_REMOVE(&all_algo_list, flm, entries);
	FIB_MOD_UNLOCK();

	return (0);
}

void
vnet_fib_init(void)
{

	TAILQ_INIT(&V_fib_data_list);
}

void
vnet_fib_destroy(void)
{

	FIB_MOD_LOCK();
	fib_error_clear();
	FIB_MOD_UNLOCK();
}
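/*
 * Module-side sketch (illustrative only; the authoritative definition of
 * struct fib_lookup_module lives in net/route/fib_algo.h): a lookup
 * algorithm describes itself with the callbacks used throughout this file
 * and registers at load time, roughly as follows, where the example_*
 * names are hypothetical:
 *
 *	static struct fib_lookup_module flm_example = {
 *		.flm_name = "example",
 *		.flm_family = AF_INET,
 *		.flm_init_cb = example_init,
 *		.flm_destroy_cb = example_destroy,
 *		.flm_dump_rib_item_cb = example_dump_rib_item,
 *		.flm_dump_end_cb = example_dump_end,
 *		.flm_change_rib_item_cb = example_change_rib_item,
 *		.flm_get_pref = example_get_pref,
 *	};
 *
 *	fib_module_register(&flm_example);	(on load)
 *	fib_module_unregister(&flm_example);	(on unload; may return EBUSY)
 */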