/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2020 Alexander V. Chernikov
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_route.h"

#include <sys/param.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/sbuf.h>
#include <sys/lock.h>
#include <sys/rmlock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/queue.h>
#include <net/vnet.h>

#include <net/if.h>
#include <net/if_var.h>

#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#endif

#include <net/route.h>
#include <net/route/nhop.h>
#include <net/route/route_ctl.h>
#include <net/route/route_var.h>
#include <net/route/fib_algo.h>

#include <machine/stdarg.h>

/*
 * Fib lookup framework.
 *
 * This framework enables accelerated longest-prefix-match lookups for the
 * routing tables by adding the ability to dynamically attach/detach lookup
 * algorithm implementations to/from the datapath.
 *
 * flm - fib lookup modules - implementation of a particular lookup algorithm
 * fd - fib data - instance of an flm bound to a specific routing table
 *
 * This file provides the main framework functionality.
 *
 * The framework provides the following features:
 *
 * 1) nexthop abstraction -> provides transparent referencing, indexing
 *   and efficient idx->ptr mappings for nexthops and nexthop groups.
 * 2) Routing table synchronisation
 * 3) dataplane attachment points
 * 4) automatic algorithm selection based on the provided preference.
 *
 *
 * DATAPATH
 * For each supported address family, there is an allocated array of fib_dp
 * structures, indexed by fib number. Each array entry contains a callback
 * function and its argument. This function will be called with a
 * family-specific lookup key, scope and the provided argument. The array is
 * re-created every time a new algo instance gets created. Please take a look
 * at the replace_rtables_family() function for more details.
 *
 */
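
/*
 * Illustrative sketch (not part of this file): a per-family datapath
 * consumer is expected to resolve the fib_dp entry for its fib and call
 * the attached function. This is loosely modelled on the per-family
 * lookup code (e.g. fib4_lookup()); `key' and `scopeid' stand in for the
 * family-specific arguments:
 *
 *	struct fib_dp *dp = &V_inet_dp[fibnum];
 *	struct nhop_object *nh = dp->f(dp->arg, key, scopeid);
 */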

SYSCTL_DECL(_net_route);
SYSCTL_NODE(_net_route, OID_AUTO, algo, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Fib algorithm lookups");

/* Algorithm sync policy */

/* Time interval to bucket updates */
VNET_DEFINE_STATIC(unsigned int, update_bucket_time_ms) = 50;
#define	V_update_bucket_time_ms	VNET(update_bucket_time_ms)
SYSCTL_UINT(_net_route_algo, OID_AUTO, bucket_time_ms, CTLFLAG_RW | CTLFLAG_VNET,
    &VNET_NAME(update_bucket_time_ms), 0, "Time interval to calculate update rate");

/* Minimum update rate to delay sync */
VNET_DEFINE_STATIC(unsigned int, bucket_change_threshold_rate) = 500;
#define	V_bucket_change_threshold_rate	VNET(bucket_change_threshold_rate)
SYSCTL_UINT(_net_route_algo, OID_AUTO, bucket_change_threshold_rate, CTLFLAG_RW | CTLFLAG_VNET,
    &VNET_NAME(bucket_change_threshold_rate), 0, "Minimum update rate to delay sync");

/* Max allowed delay to sync */
VNET_DEFINE_STATIC(unsigned int, fib_max_sync_delay_ms) = 1000;
#define	V_fib_max_sync_delay_ms	VNET(fib_max_sync_delay_ms)
SYSCTL_UINT(_net_route_algo, OID_AUTO, fib_max_sync_delay_ms, CTLFLAG_RW | CTLFLAG_VNET,
    &VNET_NAME(fib_max_sync_delay_ms), 0, "Maximum time to delay sync (ms)");


#ifdef INET6
VNET_DEFINE_STATIC(bool, algo_fixed_inet6) = false;
#define	V_algo_fixed_inet6	VNET(algo_fixed_inet6)
SYSCTL_NODE(_net_route_algo, OID_AUTO, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IPv6 longest prefix match lookups");
#endif
#ifdef INET
VNET_DEFINE_STATIC(bool, algo_fixed_inet) = false;
#define	V_algo_fixed_inet	VNET(algo_fixed_inet)
SYSCTL_NODE(_net_route_algo, OID_AUTO, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IPv4 longest prefix match lookups");
#endif

/* Fib instance counter */
static uint32_t fib_gen = 0;

struct nhop_ref_table {
	uint32_t	count;
	int32_t		refcnt[0];
};

enum fib_callout_action {
	FDA_NONE,	/* No callout scheduled */
	FDA_REBUILD,	/* Asks to rebuild algo instance */
	FDA_EVAL,	/* Asks to evaluate if the current algo is still the best */
	FDA_BATCH,	/* Asks to submit batch of updates to the algo */
};

struct fib_sync_status {
	struct timeval		diverge_time;	/* ts when diverged */
	uint32_t		num_changes;	/* number of changes since sync */
	uint32_t		bucket_changes;	/* num changes within the current bucket */
	uint64_t		bucket_id;	/* 50ms bucket # */
	struct fib_change_queue	fd_change_queue;/* list of scheduled entries */
};

/*
 * Data structure for the fib lookup instance tied to the particular rib.
 */
struct fib_data {
	uint32_t	number_nhops;	/* current # of nhops */
	uint8_t		hit_nhops;	/* true if out of nhop limit */
	uint8_t		init_done;	/* true if init is completed */
	uint32_t	fd_dead:1;	/* Scheduled for deletion */
	uint32_t	fd_linked:1;	/* true if linked */
	uint32_t	fd_need_rebuild:1;	/* true if rebuild scheduled */
	uint32_t	fd_batch:1;	/* true if batched notification scheduled */
	uint8_t		fd_family;	/* family */
	uint32_t	fd_fibnum;	/* fibnum */
	uint32_t	fd_failed_rebuilds;	/* stat: failed rebuilds */
	uint32_t	fd_gen;		/* instance gen# */
	struct callout	fd_callout;	/* rebuild callout */
	enum fib_callout_action	fd_callout_action;	/* Callout action to take */
	void		*fd_algo_data;	/* algorithm data */
	struct nhop_object	**nh_idx;	/* nhop idx->ptr array */
	struct nhop_ref_table	*nh_ref_table;	/* array with # of nhop references */
	struct rib_head		*fd_rh;		/* RIB table we're attached to */
	struct rib_subscription	*fd_rs;		/* storing table subscription */
	struct fib_dp		fd_dp;		/* fib datapath data */
	struct vnet		*fd_vnet;	/* vnet fib belongs to */
	struct epoch_context	fd_epoch_ctx;	/* epoch context for deletion */
	struct fib_lookup_module	*fd_flm;/* pointer to the lookup module */
	struct fib_sync_status	fd_ss;		/* State relevant to the rib sync */
	uint32_t		fd_num_changes;	/* number of changes since last callout */
	TAILQ_ENTRY(fib_data)	entries;	/* list of all fds in vnet */
};

static bool rebuild_fd(struct fib_data *fd, const char *reason);
static bool rebuild_fd_flm(struct fib_data *fd, struct fib_lookup_module *flm_new);
static void handle_fd_callout(void *_data);
static void destroy_fd_instance_epoch(epoch_context_t ctx);
static bool is_idx_free(struct fib_data *fd, uint32_t index);
static void set_algo_fixed(struct rib_head *rh);
static bool is_algo_fixed(struct rib_head *rh);

static uint32_t fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh);
static void fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh);

static struct fib_lookup_module *fib_check_best_algo(struct rib_head *rh,
    struct fib_lookup_module *orig_flm);
static void fib_unref_algo(struct fib_lookup_module *flm);
static bool flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum);

struct mtx fib_mtx;
#define	FIB_MOD_LOCK()		mtx_lock(&fib_mtx)
#define	FIB_MOD_UNLOCK()	mtx_unlock(&fib_mtx)
#define	FIB_MOD_LOCK_ASSERT()	mtx_assert(&fib_mtx, MA_OWNED)

MTX_SYSINIT(fib_mtx, &fib_mtx, "algo list mutex", MTX_DEF);

/* Algorithm has to be this percent better than the current to switch */
#define	BEST_DIFF_PERCENT	(5 * 256 / 100)
/* Schedule algo re-evaluation X seconds after a change */
#define	ALGO_EVAL_DELAY_MS	30000
/* Force algo re-evaluation after X changes */
#define	ALGO_EVAL_NUM_ROUTES	100
/* Try to set up algorithm X times */
#define	FIB_MAX_TRIES		32
/* Max amount of supported nexthops */
#define	FIB_MAX_NHOPS		262144
#define	FIB_CALLOUT_DELAY_MS	50
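
/*
 * Example (informative): preferences returned by flm_get_pref() fit in a
 * uint8_t, i.e. a 0..255 scale, so BEST_DIFF_PERCENT evaluates to 12.
 * fib_check_best_algo() below only switches when a candidate scores
 * strictly higher than the current preference plus 12.
 */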


/* Debug */
static int flm_debug_level = LOG_NOTICE;
SYSCTL_INT(_net_route_algo, OID_AUTO, debug_level, CTLFLAG_RW | CTLFLAG_RWTUN,
    &flm_debug_level, 0, "debuglevel");
#define	FLM_MAX_DEBUG_LEVEL	LOG_DEBUG
#ifndef	LOG_DEBUG2
#define	LOG_DEBUG2	8
#endif

#define	_PASS_MSG(_l)	(flm_debug_level >= (_l))
#define	ALGO_PRINTF(_l, _fmt, ...)	if (_PASS_MSG(_l)) {		\
	printf("[fib_algo] %s: " _fmt "\n", __func__, ##__VA_ARGS__);	\
}
#define	_ALGO_PRINTF(_fib, _fam, _aname, _gen, _func, _fmt, ...) \
    printf("[fib_algo] %s.%u (%s#%u) %s: " _fmt "\n",\
    print_family(_fam), _fib, _aname, _gen, _func, ## __VA_ARGS__)
#define	_RH_PRINTF(_fib, _fam, _func, _fmt, ...) \
    printf("[fib_algo] %s.%u %s: " _fmt "\n", print_family(_fam), _fib, _func, ## __VA_ARGS__)
#define	RH_PRINTF(_l, _rh, _fmt, ...)	if (_PASS_MSG(_l)) {	\
    _RH_PRINTF(_rh->rib_fibnum, _rh->rib_family, __func__, _fmt, ## __VA_ARGS__);\
}
#define	FD_PRINTF(_l, _fd, _fmt, ...)	FD_PRINTF_##_l(_l, _fd, _fmt, ## __VA_ARGS__)
#define	_FD_PRINTF(_l, _fd, _fmt, ...)	if (_PASS_MSG(_l)) {		\
    _ALGO_PRINTF(_fd->fd_fibnum, _fd->fd_family, _fd->fd_flm->flm_name,	\
    _fd->fd_gen, __func__, _fmt, ## __VA_ARGS__);			\
}
#if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG2
#define	FD_PRINTF_LOG_DEBUG2	_FD_PRINTF
#else
#define	FD_PRINTF_LOG_DEBUG2(_l, _fd, _fmt, ...)
#endif
#if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG
#define	FD_PRINTF_LOG_DEBUG	_FD_PRINTF
#else
/* Consume the arguments so disabled levels still compile */
#define	FD_PRINTF_LOG_DEBUG(_l, _fd, _fmt, ...)
#endif
#if FLM_MAX_DEBUG_LEVEL>=LOG_INFO
#define	FD_PRINTF_LOG_INFO	_FD_PRINTF
#else
#define	FD_PRINTF_LOG_INFO(_l, _fd, _fmt, ...)
#endif
#define	FD_PRINTF_LOG_NOTICE	_FD_PRINTF
#define	FD_PRINTF_LOG_ERR	_FD_PRINTF
#define	FD_PRINTF_LOG_WARNING	_FD_PRINTF


/* List of all registered lookup algorithms */
static TAILQ_HEAD(, fib_lookup_module) all_algo_list = TAILQ_HEAD_INITIALIZER(all_algo_list);

/* List of all fib lookup instances in the vnet */
VNET_DEFINE_STATIC(TAILQ_HEAD(fib_data_head, fib_data), fib_data_list);
#define	V_fib_data_list	VNET(fib_data_list)

/* Datastructure for storing non-transient fib lookup module failures */
struct fib_error {
	int				fe_family;
	uint32_t			fe_fibnum;	/* failed rtable */
	struct fib_lookup_module	*fe_flm;	/* failed module */
	TAILQ_ENTRY(fib_error)		entries;/* list of all errored entries */
};
VNET_DEFINE_STATIC(TAILQ_HEAD(fib_error_head, fib_error), fib_error_list);
#define	V_fib_error_list	VNET(fib_error_list)

/* Per-family array of fibnum -> {func, arg} mappings used in datapath */
struct fib_dp_header {
	struct epoch_context	fdh_epoch_ctx;
	uint32_t		fdh_num_tables;
	struct fib_dp		fdh_idx[0];
};

/*
 * Tries to add a new non-transient algorithm error to the list of
 * errors.
 * Returns true on success.
 */
static bool
flm_error_add(struct fib_lookup_module *flm, uint32_t fibnum)
{
	struct fib_error *fe;

	fe = malloc(sizeof(struct fib_error), M_TEMP, M_NOWAIT | M_ZERO);
	if (fe == NULL)
		return (false);
	fe->fe_flm = flm;
	fe->fe_family = flm->flm_family;
	fe->fe_fibnum = fibnum;

	FIB_MOD_LOCK();
	/* Avoid duplicates by checking if the error already exists first */
	if (flm_error_check(flm, fibnum)) {
		FIB_MOD_UNLOCK();
		free(fe, M_TEMP);
		return (true);
	}
	TAILQ_INSERT_HEAD(&V_fib_error_list, fe, entries);
	FIB_MOD_UNLOCK();

	return (true);
}

/*
 * True if a non-transient error has been registered for @flm in @fibnum.
 */
static bool
flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum)
{
	const struct fib_error *fe;

	TAILQ_FOREACH(fe, &V_fib_error_list, entries) {
		if ((fe->fe_flm == flm) && (fe->fe_fibnum == fibnum))
			return (true);
	}

	return (false);
}

/*
 * Clears all errors of the algo specified by @flm.
 */
static void
fib_error_clear_flm(struct fib_lookup_module *flm)
{
	struct fib_error *fe, *fe_tmp;

	FIB_MOD_LOCK_ASSERT();

	TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
		if (fe->fe_flm == flm) {
			TAILQ_REMOVE(&V_fib_error_list, fe, entries);
			free(fe, M_TEMP);
		}
	}
}

/*
 * Clears all errors in the current VNET.
 */
static void
fib_error_clear(void)
{
	struct fib_error *fe, *fe_tmp;

	FIB_MOD_LOCK_ASSERT();

	TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
		TAILQ_REMOVE(&V_fib_error_list, fe, entries);
		free(fe, M_TEMP);
	}
}

static const char *
print_op_result(enum flm_op_result result)
{
	switch (result) {
	case FLM_SUCCESS:
		return "success";
	case FLM_REBUILD:
		return "rebuild";
	case FLM_BATCH:
		return "batch";
	case FLM_ERROR:
		return "error";
	}

	return "unknown";
}

static const char *
print_family(int family)
{

	if (family == AF_INET)
		return ("inet");
	else if (family == AF_INET6)
		return ("inet6");
	else
		return ("unknown");
}

/*
 * Debug function used by lookup algorithms.
 * Outputs the message denoted by @fmt, prepended by "[fib_algo] inetX.Y (algo) "
 */
void
fib_printf(int level, struct fib_data *fd, const char *func, char *fmt, ...)
{
	char buf[128];
	va_list ap;

	if (level > flm_debug_level)
		return;

	va_start(ap, fmt);
	vsnprintf(buf, sizeof(buf), fmt, ap);
	va_end(ap);

	_ALGO_PRINTF(fd->fd_fibnum, fd->fd_family, fd->fd_flm->flm_name,
	    fd->fd_gen, func, "%s", buf);
}

/*
 * Outputs the list of algorithms supported by the provided address family.
 */
static int
print_algos_sysctl(struct sysctl_req *req, int family)
{
	struct fib_lookup_module *flm;
	struct sbuf sbuf;
	int error, count = 0;

	error = sysctl_wire_old_buffer(req, 0);
	if (error == 0) {
		sbuf_new_for_sysctl(&sbuf, NULL, 512, req);
		TAILQ_FOREACH(flm, &all_algo_list, entries) {
			if (flm->flm_family == family) {
				if (count++ > 0)
					sbuf_cat(&sbuf, ", ");
				sbuf_cat(&sbuf, flm->flm_name);
			}
		}
		error = sbuf_finish(&sbuf);
		sbuf_delete(&sbuf);
	}
	return (error);
}

#ifdef INET6
static int
print_algos_sysctl_inet6(SYSCTL_HANDLER_ARGS)
{

	return (print_algos_sysctl(req, AF_INET6));
}
SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo_list,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    print_algos_sysctl_inet6, "A", "List of IPv6 lookup algorithms");
#endif

#ifdef INET
static int
print_algos_sysctl_inet(SYSCTL_HANDLER_ARGS)
{

	return (print_algos_sysctl(req, AF_INET));
}
SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo_list,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    print_algos_sysctl_inet, "A", "List of IPv4 lookup algorithms");
#endif

/*
 * Calculates the delay between repeated failures.
 * Returns the current delay in milliseconds.
 */
static uint32_t
callout_calc_delay_ms(struct fib_data *fd)
{
	uint32_t shift;

	if (fd->fd_failed_rebuilds > 10)
		shift = 10;
	else
		shift = fd->fd_failed_rebuilds;

	return ((1 << shift) * FIB_CALLOUT_DELAY_MS);
}
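
/*
 * Worked example: with FIB_CALLOUT_DELAY_MS == 50, the retry delay grows
 * exponentially with the number of failed rebuilds: 50ms with no recorded
 * failures, 100ms after one, 200ms after two, and so on, capped at
 * (1 << 10) * 50 == 51200ms once 10 or more rebuilds have failed.
 */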

static void
schedule_callout(struct fib_data *fd, enum fib_callout_action action, int delay_ms)
{

	FD_PRINTF(LOG_DEBUG, fd, "delay=%d action=%d", delay_ms, action);
	fd->fd_callout_action = action;
	callout_reset_sbt(&fd->fd_callout, SBT_1MS * delay_ms, 0,
	    handle_fd_callout, fd, 0);
}

static void
schedule_fd_rebuild(struct fib_data *fd, const char *reason)
{

	RIB_WLOCK_ASSERT(fd->fd_rh);

	if (!fd->fd_need_rebuild) {
		fd->fd_need_rebuild = true;
		/* Stop batch updates */
		fd->fd_batch = false;

		/*
		 * Potentially re-schedules pending callout
		 * initiated by schedule_algo_eval.
		 */
		FD_PRINTF(LOG_INFO, fd, "Scheduling rebuild: %s (failures=%d)",
		    reason, fd->fd_failed_rebuilds);
		schedule_callout(fd, FDA_REBUILD, callout_calc_delay_ms(fd));
	}
}

static void
sync_rib_gen(struct fib_data *fd)
{
	FD_PRINTF(LOG_DEBUG, fd, "Sync gen %u -> %u", fd->fd_rh->rnh_gen, fd->fd_rh->rnh_gen_rib);
	fd->fd_rh->rnh_gen = fd->fd_rh->rnh_gen_rib;
}

static int64_t
get_tv_diff_ms(const struct timeval *old_tv, const struct timeval *new_tv)
{
	int64_t diff = 0;

	diff = ((int64_t)(new_tv->tv_sec - old_tv->tv_sec)) * 1000;
	diff += (new_tv->tv_usec - old_tv->tv_usec) / 1000;

	return (diff);
}

static void
add_tv_diff_ms(struct timeval *tv, int ms)
{
	tv->tv_sec += ms / 1000;
	ms = ms % 1000;
	if (ms * 1000 + tv->tv_usec < 1000000)
		tv->tv_usec += ms * 1000;
	else {
		tv->tv_sec += 1;
		tv->tv_usec = ms * 1000 + tv->tv_usec - 1000000;
	}
}
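
/*
 * Worked example: for tv = { .tv_sec = 3, .tv_usec = 900000 } and
 * ms = 1250, the whole seconds are added first (tv_sec = 4, 250ms
 * remaining); 250000 + 900000 >= 1000000, so the carry path yields
 * tv_sec = 5, tv_usec = 150000.
 */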

/*
 * Marks the time when the algo state diverges from the rib state.
 */
static void
mark_diverge_time(struct fib_data *fd)
{
	struct fib_sync_status *fd_ss = &fd->fd_ss;

	getmicrouptime(&fd_ss->diverge_time);
	fd_ss->bucket_id = 0;
	fd_ss->bucket_changes = 0;
}

/*
 * Calculates and updates the next algorithm sync time, based on the current activity.
 *
 * The intent is to provide a reasonable balance between the update
 * latency and efficient batching when changing a large number of routes.
 *
 * The high-level algorithm is as follows:
 * 1) all changes are bucketed in 50ms intervals
 * 2) If the number of changes within the bucket exceeds the threshold,
 *    the update gets delayed, up to the maximum delay threshold.
 */
static void
update_rebuild_delay(struct fib_data *fd, enum fib_callout_action action)
{
	uint32_t bucket_id, new_delay = 0;
	struct timeval tv;

	/* Fetch all variables at once to ensure consistent reads */
	uint32_t bucket_time_ms = V_update_bucket_time_ms;
	uint32_t threshold_rate = V_bucket_change_threshold_rate;
	uint32_t max_delay_ms = V_fib_max_sync_delay_ms;

	if (bucket_time_ms == 0)
		bucket_time_ms = 50;
	/* calculate per-bucket threshold rate */
	threshold_rate = threshold_rate * bucket_time_ms / 1000;

	getmicrouptime(&tv);

	struct fib_sync_status *fd_ss = &fd->fd_ss;

	bucket_id = get_tv_diff_ms(&fd_ss->diverge_time, &tv) / bucket_time_ms;

	if (fd_ss->bucket_id == bucket_id) {
		fd_ss->bucket_changes++;
		if (fd_ss->bucket_changes == threshold_rate) {
			new_delay = (bucket_id + 2) * bucket_time_ms;
			if (new_delay <= max_delay_ms) {
				FD_PRINTF(LOG_DEBUG, fd,
				    "hit threshold of %u routes, delay update, "
				    "bucket: %u, total delay: %u",
				    threshold_rate, bucket_id + 1, new_delay);
			} else {
				new_delay = 0;
				FD_PRINTF(LOG_DEBUG, fd,
				    "maximum sync delay (%u ms) reached", max_delay_ms);
			}
		} else if ((bucket_id == 0) && (fd_ss->bucket_changes == 1))
			new_delay = bucket_time_ms;
	} else {
		fd_ss->bucket_id = bucket_id;
		fd_ss->bucket_changes = 1;
	}

	if (new_delay > 0) {
		/* Calculated time has been updated */
		struct timeval new_tv = fd_ss->diverge_time;
		add_tv_diff_ms(&new_tv, new_delay);

		int32_t delay_ms = get_tv_diff_ms(&tv, &new_tv);
		schedule_callout(fd, action, delay_ms);
	}
}
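
/*
 * Worked example, assuming the defaults above (50ms buckets,
 * 500 changes/sec threshold -> 25 changes per bucket, 1000ms cap): the
 * first change after divergence schedules a sync roughly 50ms out; if a
 * burst later delivers its 25th change within bucket N, the sync is
 * pushed to (N + 2) * 50ms after the diverge time; once that total would
 * exceed 1000ms, no further reschedule happens and the already pending
 * callout fires.
 */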

static void
update_algo_state(struct fib_data *fd)
{

	RIB_WLOCK_ASSERT(fd->fd_rh);

	if (fd->fd_batch || fd->fd_need_rebuild) {
		enum fib_callout_action action = fd->fd_need_rebuild ? FDA_REBUILD : FDA_BATCH;
		update_rebuild_delay(fd, action);
		return;
	}

	if (fd->fd_num_changes++ == 0) {
		/* Start callout to consider switch */
		if (!callout_pending(&fd->fd_callout))
			schedule_callout(fd, FDA_EVAL, ALGO_EVAL_DELAY_MS);
	} else if (fd->fd_num_changes == ALGO_EVAL_NUM_ROUTES) {
		/* Reset callout to exec immediately */
		if (fd->fd_callout_action == FDA_EVAL)
			schedule_callout(fd, FDA_EVAL, 1);
	}
}

static bool
need_immediate_sync(struct fib_data *fd, struct rib_cmd_info *rc)
{
	struct nhop_object *nh;

	/* Sync addition/removal of interface routes */
	switch (rc->rc_cmd) {
	case RTM_ADD:
		nh = rc->rc_nh_new;
		if (!NH_IS_NHGRP(nh)) {
			if (!(nh->nh_flags & NHF_GATEWAY))
				return (true);
			if (nhop_get_rtflags(nh) & RTF_STATIC)
				return (true);
		}
		break;
	case RTM_DELETE:
		nh = rc->rc_nh_old;
		if (!NH_IS_NHGRP(nh)) {
			if (!(nh->nh_flags & NHF_GATEWAY))
				return (true);
			if (nhop_get_rtflags(nh) & RTF_STATIC)
				return (true);
		}
		break;
	}

	return (false);
}

static bool
apply_rtable_changes(struct fib_data *fd)
{
	enum flm_op_result result;
	struct fib_change_queue *q = &fd->fd_ss.fd_change_queue;

	result = fd->fd_flm->flm_change_rib_items_cb(fd->fd_rh, q, fd->fd_algo_data);

	if (result == FLM_SUCCESS) {
		sync_rib_gen(fd);
		for (int i = 0; i < q->count; i++)
			if (q->entries[i].nh_old)
				fib_unref_nhop(fd, q->entries[i].nh_old);
		q->count = 0;
	}
	fd->fd_batch = false;

	return (result == FLM_SUCCESS);
}

static bool
fill_change_entry(struct fib_data *fd, struct fib_change_entry *ce, struct rib_cmd_info *rc)
{
	int plen = 0;

	switch (fd->fd_family) {
#ifdef INET
	case AF_INET:
		rt_get_inet_prefix_plen(rc->rc_rt, &ce->addr4, &plen, &ce->scopeid);
		break;
#endif
#ifdef INET6
	case AF_INET6:
		rt_get_inet6_prefix_plen(rc->rc_rt, &ce->addr6, &plen, &ce->scopeid);
		break;
#endif
	}

	ce->plen = plen;
	ce->nh_old = rc->rc_nh_old;
	ce->nh_new = rc->rc_nh_new;
	if (ce->nh_new != NULL) {
		if (fib_ref_nhop(fd, ce->nh_new) == 0)
			return (false);
	}

	return (true);
}

static bool
queue_rtable_change(struct fib_data *fd, struct rib_cmd_info *rc)
{
	struct fib_change_queue *q = &fd->fd_ss.fd_change_queue;

	if (q->count >= q->size) {
		uint32_t q_size;

		if (q->size == 0)
			q_size = 256; /* ~18k memory */
		else
			q_size = q->size * 2;

		size_t size = q_size * sizeof(struct fib_change_entry);
		void *a = realloc(q->entries, size, M_TEMP, M_NOWAIT | M_ZERO);
		if (a == NULL) {
			FD_PRINTF(LOG_INFO, fd, "Unable to realloc queue for %u elements",
			    q_size);
			return (false);
		}
		q->entries = a;
		q->size = q_size;
	}

	return (fill_change_entry(fd, &q->entries[q->count++], rc));
}

/*
 * Rib subscription handler. Checks if the algorithm is ready to
 * receive updates, handles nexthop refcounting and passes change
 * data to the algorithm callback.
 */
static void
handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
    void *_data)
{
	struct fib_data *fd = (struct fib_data *)_data;
	enum flm_op_result result;

	RIB_WLOCK_ASSERT(rnh);

	/*
	 * There is a small gap between subscribing for route changes
	 * and initiating the rtable dump. Avoid receiving route changes
	 * prior to finishing the rtable dump by checking `init_done`.
	 */
	if (!fd->init_done)
		return;

	bool immediate_sync = need_immediate_sync(fd, rc);

	/* Consider scheduling algorithm re-evaluation */
	update_algo_state(fd);

	/*
	 * If the algo requested a rebuild, stop sending updates by default.
	 * This simplifies the nexthop refcount handling logic.
	 */
	if (fd->fd_need_rebuild) {
		if (immediate_sync)
			rebuild_fd(fd, "rtable change type enforced sync");
		return;
	}

	/*
	 * The algo requested updates to be delivered in batches.
	 * Add the current change to the queue and return.
	 */
	if (fd->fd_batch) {
		if (immediate_sync) {
			if (!queue_rtable_change(fd, rc) || !apply_rtable_changes(fd))
				rebuild_fd(fd, "batch sync failed");
		} else {
			if (!queue_rtable_change(fd, rc))
				schedule_fd_rebuild(fd, "batch queue failed");
		}
		return;
	}

	/*
	 * Maintain the guarantee that every nexthop returned by the dataplane
	 * lookup has > 0 refcount, so it can be safely referenced within the
	 * current epoch.
	 */
	if (rc->rc_nh_new != NULL) {
		if (fib_ref_nhop(fd, rc->rc_nh_new) == 0) {
			/* ran out of indexes */
			schedule_fd_rebuild(fd, "ran out of nhop indexes");
			return;
		}
	}

	result = fd->fd_flm->flm_change_rib_item_cb(rnh, rc, fd->fd_algo_data);

	switch (result) {
	case FLM_SUCCESS:
		sync_rib_gen(fd);
		/* Unref old nexthop on success */
		if (rc->rc_nh_old != NULL)
			fib_unref_nhop(fd, rc->rc_nh_old);
		break;
	case FLM_BATCH:

		/*
		 * The algo asks to batch the changes.
		 */
		if (queue_rtable_change(fd, rc)) {
			if (!immediate_sync) {
				fd->fd_batch = true;
				mark_diverge_time(fd);
				update_rebuild_delay(fd, FDA_BATCH);
				break;
			}
			if (apply_rtable_changes(fd))
				break;
		}
		FD_PRINTF(LOG_ERR, fd, "batched sync failed, force the rebuild");

		/* FALLTHROUGH */
	case FLM_REBUILD:

		/*
		 * The algo is not able to apply the update.
		 * Schedule an algo rebuild.
		 */
		if (!immediate_sync) {
			mark_diverge_time(fd);
			schedule_fd_rebuild(fd, "algo requested rebuild");
			break;
		}

		FD_PRINTF(LOG_INFO, fd, "running sync rebuild");
		rebuild_fd(fd, "rtable change type enforced sync");
		break;
	case FLM_ERROR:

		/*
		 * The algo reported a non-recoverable error.
		 * Record the error and schedule a rebuild, which will
		 * trigger best algo selection.
		 */
		FD_PRINTF(LOG_ERR, fd, "algo reported non-recoverable error");
		if (!flm_error_add(fd->fd_flm, fd->fd_fibnum))
			FD_PRINTF(LOG_ERR, fd, "failed to ban algo");
		schedule_fd_rebuild(fd, "algo reported non-recoverable error");
	}
}

static void
estimate_nhop_scale(const struct fib_data *old_fd, struct fib_data *fd)
{

	if (old_fd == NULL) {
		/* TODO: read from rtable */
		fd->number_nhops = 16;
		return;
	}

	if (old_fd->hit_nhops && old_fd->number_nhops < FIB_MAX_NHOPS)
		fd->number_nhops = 2 * old_fd->number_nhops;
	else
		fd->number_nhops = old_fd->number_nhops;
}

struct walk_cbdata {
	struct fib_data		*fd;
	flm_dump_t		*func;
	enum flm_op_result	result;
};

/*
 * Handler called after all rtentries have been dumped.
 * Performs post-dump framework checks and calls
 * algo:flm_dump_end_cb().
 *
 * Updates the walk_cbdata result.
 */
static void
sync_algo_end_cb(struct rib_head *rnh, enum rib_walk_hook stage, void *_data)
{
	struct walk_cbdata *w = (struct walk_cbdata *)_data;
	struct fib_data *fd = w->fd;

	RIB_WLOCK_ASSERT(w->fd->fd_rh);

	if (rnh->rib_dying) {
		w->result = FLM_ERROR;
		return;
	}

	if (fd->hit_nhops) {
		FD_PRINTF(LOG_INFO, fd, "ran out of nexthops at %u nhops",
		    fd->nh_ref_table->count);
		if (w->result == FLM_SUCCESS)
			w->result = FLM_REBUILD;
		return;
	}

	if (stage != RIB_WALK_HOOK_POST || w->result != FLM_SUCCESS)
		return;

	/* Post-dump hook, dump successful */
	w->result = fd->fd_flm->flm_dump_end_cb(fd->fd_algo_data, &fd->fd_dp);

	if (w->result == FLM_SUCCESS) {
		/* Mark init as done to allow routing updates */
		fd->init_done = 1;
	}
}

/*
 * Callback for each entry in the rib.
 * Calls algo:flm_dump_rib_item_cb func as a part of the initial
 * route table synchronisation.
 */
static int
sync_algo_cb(struct rtentry *rt, void *_data)
{
	struct walk_cbdata *w = (struct walk_cbdata *)_data;

	RIB_WLOCK_ASSERT(w->fd->fd_rh);

	if (w->result == FLM_SUCCESS && w->func) {

		/*
		 * Reference nexthops to maintain the guarantee that
		 * each nexthop returned by the datapath has > 0 references
		 * and can be safely referenced within the current epoch.
		 */
		struct nhop_object *nh = rt_get_raw_nhop(rt);
		if (fib_ref_nhop(w->fd, nh) != 0)
			w->result = w->func(rt, w->fd->fd_algo_data);
		else
			w->result = FLM_REBUILD;
	}

	return (0);
}

/*
 * Dumps all routing table state to the algo instance.
 */
static enum flm_op_result
sync_algo(struct fib_data *fd)
{
	struct walk_cbdata w = {
		.fd = fd,
		.func = fd->fd_flm->flm_dump_rib_item_cb,
		.result = FLM_SUCCESS,
	};

	rib_walk_ext_locked(fd->fd_rh, sync_algo_cb, sync_algo_end_cb, &w);

	FD_PRINTF(LOG_INFO, fd,
	    "initial dump completed (rtable version: %d), result: %s",
	    fd->fd_rh->rnh_gen, print_op_result(w.result));

	return (w.result);
}

/*
 * Schedules epoch-backed @fd instance deletion.
 * * Unlinks @fd from the list of active algo instances.
 * * Removes the rib subscription.
 * * Stops the callout.
 * * Schedules the actual deletion.
 *
 * Assumes @fd is already unlinked from the datapath.
 */
static int
schedule_destroy_fd_instance(struct fib_data *fd, bool in_callout)
{
	bool is_dead;

	NET_EPOCH_ASSERT();
	RIB_WLOCK_ASSERT(fd->fd_rh);

	FIB_MOD_LOCK();
	is_dead = fd->fd_dead;
	if (!is_dead)
		fd->fd_dead = true;
	if (fd->fd_linked) {
		TAILQ_REMOVE(&V_fib_data_list, fd, entries);
		fd->fd_linked = false;
	}
	FIB_MOD_UNLOCK();
	if (is_dead)
		return (0);

	FD_PRINTF(LOG_INFO, fd, "DETACH");

	if (fd->fd_rs != NULL)
		rib_unsubscribe_locked(fd->fd_rs);

	/*
	 * After rib_unsubscribe() no _new_ handle_rtable_change_cb() calls
	 * will be executed, hence no _new_ callout schedules will happen.
	 */
	callout_stop(&fd->fd_callout);

	fib_epoch_call(destroy_fd_instance_epoch, &fd->fd_epoch_ctx);

	return (0);
}

/*
 * Wipes all fd instances from the list matching the rib specified by @rh.
 * If @keep_first is set, removes all but the first record.
 */
static void
fib_cleanup_algo(struct rib_head *rh, bool keep_first, bool in_callout)
{
	struct fib_data_head tmp_head = TAILQ_HEAD_INITIALIZER(tmp_head);
	struct fib_data *fd, *fd_tmp;
	struct epoch_tracker et;

	FIB_MOD_LOCK();
	TAILQ_FOREACH_SAFE(fd, &V_fib_data_list, entries, fd_tmp) {
		if (fd->fd_rh == rh) {
			if (keep_first) {
				keep_first = false;
				continue;
			}
			TAILQ_REMOVE(&V_fib_data_list, fd, entries);
			fd->fd_linked = false;
			TAILQ_INSERT_TAIL(&tmp_head, fd, entries);
		}
	}
	FIB_MOD_UNLOCK();

	/* Pass 2: remove each entry */
	NET_EPOCH_ENTER(et);
	TAILQ_FOREACH_SAFE(fd, &tmp_head, entries, fd_tmp) {
		if (!in_callout)
			RIB_WLOCK(fd->fd_rh);
		schedule_destroy_fd_instance(fd, in_callout);
		if (!in_callout)
			RIB_WUNLOCK(fd->fd_rh);
	}
	NET_EPOCH_EXIT(et);
}

void
fib_destroy_rib(struct rib_head *rh)
{

	/*
	 * rnh has the rib_dying flag set, so the setup of new fd's will fail
	 * at the sync_algo() stage, preventing new entries from being added
	 * to the list of active algos. Remove all existing entries for the
	 * particular rib.
	 */
	fib_cleanup_algo(rh, false, false);
}

/*
 * Finalises fd destruction by freeing all fd resources.
 */
static void
destroy_fd_instance(struct fib_data *fd)
{

	FD_PRINTF(LOG_INFO, fd, "destroy fd %p", fd);

	/* Call the destroy callback first */
	if (fd->fd_algo_data != NULL)
		fd->fd_flm->flm_destroy_cb(fd->fd_algo_data);

	/* Nhop table */
	if ((fd->nh_idx != NULL) && (fd->nh_ref_table != NULL)) {
		for (int i = 0; i < fd->number_nhops; i++) {
			if (!is_idx_free(fd, i)) {
				FD_PRINTF(LOG_DEBUG2, fd, " FREE nhop %d %p",
				    i, fd->nh_idx[i]);
				nhop_free_any(fd->nh_idx[i]);
			}
		}
		free(fd->nh_idx, M_RTABLE);
	}
	if (fd->nh_ref_table != NULL)
		free(fd->nh_ref_table, M_RTABLE);

	if (fd->fd_ss.fd_change_queue.entries != NULL)
		free(fd->fd_ss.fd_change_queue.entries, M_TEMP);

	fib_unref_algo(fd->fd_flm);

	free(fd, M_RTABLE);
}

/*
 * Epoch callback indicating fd is safe to destroy
 */
static void
destroy_fd_instance_epoch(epoch_context_t ctx)
{
	struct fib_data *fd;

	fd = __containerof(ctx, struct fib_data, fd_epoch_ctx);

	CURVNET_SET(fd->fd_vnet);
	destroy_fd_instance(fd);
	CURVNET_RESTORE();
}

/*
 * Tries to set up the fd instance.
 * - Allocates the fd/nhop table
 * - Runs the algo:flm_init_cb algo init
 * - Subscribes the fd to the rib
 * - Runs the rtable dump
 * - Adds the instance to the list of active instances.
 *
 * Returns: operation result. Fills in @pfd with the resulting fd on success.
 *
 */
static enum flm_op_result
try_setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
    struct fib_data *old_fd, struct fib_data **pfd)
{
	struct fib_data *fd;
	size_t size;
	enum flm_op_result result;

	/* Allocate */
	fd = malloc(sizeof(struct fib_data), M_RTABLE, M_NOWAIT | M_ZERO);
	if (fd == NULL) {
		*pfd = NULL;
		RH_PRINTF(LOG_INFO, rh, "Unable to allocate fib_data structure");
		return (FLM_REBUILD);
	}
	*pfd = fd;

	estimate_nhop_scale(old_fd, fd);

	fd->fd_rh = rh;
	fd->fd_family = rh->rib_family;
	fd->fd_fibnum = rh->rib_fibnum;
	callout_init_rm(&fd->fd_callout, &rh->rib_lock, 0);
	fd->fd_vnet = curvnet;
	fd->fd_flm = flm;

	FIB_MOD_LOCK();
	flm->flm_refcount++;
	fd->fd_gen = ++fib_gen;
	FIB_MOD_UNLOCK();

	FD_PRINTF(LOG_DEBUG, fd, "allocated fd %p", fd);

	/* Allocate the nhidx -> nhop_ptr table */
	size = fd->number_nhops * sizeof(void *);
	fd->nh_idx = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
	if (fd->nh_idx == NULL) {
		FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop table idx (sz:%zu)", size);
		return (FLM_REBUILD);
	}

	/* Allocate the nhop index refcount table */
	size = sizeof(struct nhop_ref_table);
	size += fd->number_nhops * sizeof(uint32_t);
	fd->nh_ref_table = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
	if (fd->nh_ref_table == NULL) {
		FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop refcount table (sz:%zu)", size);
		return (FLM_REBUILD);
	}
	FD_PRINTF(LOG_DEBUG, fd, "Allocated %u nhop indexes", fd->number_nhops);

	/* Okay, we're ready for the algo init */
	void *old_algo_data = (old_fd != NULL) ? old_fd->fd_algo_data : NULL;
	result = flm->flm_init_cb(fd->fd_fibnum, fd, old_algo_data, &fd->fd_algo_data);
	if (result != FLM_SUCCESS) {
		FD_PRINTF(LOG_INFO, fd, "%s algo init failed", flm->flm_name);
		return (result);
	}

	/* Try to subscribe */
	if (flm->flm_change_rib_item_cb != NULL) {
		fd->fd_rs = rib_subscribe_locked(fd->fd_rh,
		    handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE);
		if (fd->fd_rs == NULL) {
			FD_PRINTF(LOG_INFO, fd, "failed to subscribe to the rib changes");
			return (FLM_REBUILD);
		}
	}

	/* Dump */
	result = sync_algo(fd);
	if (result != FLM_SUCCESS) {
		FD_PRINTF(LOG_INFO, fd, "rib sync failed");
		return (result);
	}
	FD_PRINTF(LOG_INFO, fd, "DUMP completed successfully.");

	FIB_MOD_LOCK();
	/*
	 * Insert fd at the beginning of the list, to maintain the invariant
	 * that the first matching entry for the AF/fib is always the active
	 * one.
	 */
	TAILQ_INSERT_HEAD(&V_fib_data_list, fd, entries);
	fd->fd_linked = true;
	FIB_MOD_UNLOCK();

	return (FLM_SUCCESS);
}

/*
 * Sets up algo @flm for table @rh and links it to the datapath.
 *
 */
static enum flm_op_result
setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
    struct fib_data *orig_fd, struct fib_data **pfd, bool attach)
{
	struct fib_data *prev_fd, *new_fd;
	enum flm_op_result result;

	NET_EPOCH_ASSERT();
	RIB_WLOCK_ASSERT(rh);

	prev_fd = orig_fd;
	new_fd = NULL;
	for (int i = 0; i < FIB_MAX_TRIES; i++) {
		result = try_setup_fd_instance(flm, rh, prev_fd, &new_fd);

		if ((result == FLM_SUCCESS) && attach) {
			if (fib_set_datapath_ptr(new_fd, &new_fd->fd_dp))
				sync_rib_gen(new_fd);
			else
				result = FLM_REBUILD;
		}

		if ((prev_fd != NULL) && (prev_fd != orig_fd)) {
			schedule_destroy_fd_instance(prev_fd, false);
			prev_fd = NULL;
		}

		RH_PRINTF(LOG_INFO, rh, "try %d: fib algo result: %s", i,
		    print_op_result(result));

		if (result == FLM_REBUILD) {
			prev_fd = new_fd;
			new_fd = NULL;
			continue;
		}

		break;
	}

	if (result != FLM_SUCCESS) {
		RH_PRINTF(LOG_WARNING, rh,
		    "%s algo instance setup failed, failures=%d", flm->flm_name,
		    orig_fd ? orig_fd->fd_failed_rebuilds + 1 : 0);
		/* update failure count */
		FIB_MOD_LOCK();
		if (orig_fd != NULL)
			orig_fd->fd_failed_rebuilds++;
		FIB_MOD_UNLOCK();

		/* Ban the algo on non-recoverable error */
		if (result == FLM_ERROR)
			flm_error_add(flm, rh->rib_fibnum);

		if ((prev_fd != NULL) && (prev_fd != orig_fd))
			schedule_destroy_fd_instance(prev_fd, false);
		if (new_fd != NULL) {
			schedule_destroy_fd_instance(new_fd, false);
			new_fd = NULL;
		}
	}

	*pfd = new_fd;
	return (result);
}

/*
 * Tries to sync the algo with the current rtable state, either
 * by executing a batch update or rebuilding.
 * Returns true on success.
 */
static bool
execute_callout_action(struct fib_data *fd)
{
	enum fib_callout_action action = fd->fd_callout_action;
	struct fib_lookup_module *flm_new = NULL;
	bool result = true;

	NET_EPOCH_ASSERT();
	RIB_WLOCK_ASSERT(fd->fd_rh);

	fd->fd_need_rebuild = false;
	fd->fd_batch = false;
	fd->fd_num_changes = 0;

	/* First, check if we're still OK to use this algo */
	if (!is_algo_fixed(fd->fd_rh))
		flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm);
	if (flm_new != NULL)
		action = FDA_REBUILD;

	if (action == FDA_BATCH) {
		/* Try to sync */
		if (!apply_rtable_changes(fd))
			action = FDA_REBUILD;
	}

	if (action == FDA_REBUILD)
		result = rebuild_fd_flm(fd, flm_new != NULL ? flm_new : fd->fd_flm);
	if (flm_new != NULL)
		fib_unref_algo(flm_new);

	return (result);
}

/*
 * Callout for all scheduled fd-related work.
 * - Checks if the current algo is still the best algo
 * - Synchronises the algo instance to the rtable (batch usecase)
 * - Creates a new instance of an algo for af/fib if desired.
 */
static void
handle_fd_callout(void *_data)
{
	struct fib_data *fd = (struct fib_data *)_data;
	struct epoch_tracker et;

	FD_PRINTF(LOG_INFO, fd, "running callout type=%d", fd->fd_callout_action);

	NET_EPOCH_ENTER(et);
	CURVNET_SET(fd->fd_vnet);
	execute_callout_action(fd);
	CURVNET_RESTORE();
	NET_EPOCH_EXIT(et);
}

/*
 * Tries to create a new algo instance based on the @fd data.
 * Returns true on success.
 */
static bool
rebuild_fd_flm(struct fib_data *fd, struct fib_lookup_module *flm_new)
{
	struct fib_data *fd_new, *fd_tmp = NULL;
	enum flm_op_result result;

	if (flm_new == fd->fd_flm)
		fd_tmp = fd;
	else
		FD_PRINTF(LOG_INFO, fd, "switching algo to %s", flm_new->flm_name);

	result = setup_fd_instance(flm_new, fd->fd_rh, fd_tmp, &fd_new, true);
	if (result != FLM_SUCCESS) {
		FD_PRINTF(LOG_NOTICE, fd, "table rebuild failed");
		return (false);
	}
	FD_PRINTF(LOG_INFO, fd_new, "switched to new instance");

	/* Remove the old instance */
	schedule_destroy_fd_instance(fd, true);

	return (true);
}

static bool
rebuild_fd(struct fib_data *fd, const char *reason)
{
	struct fib_lookup_module *flm_new = NULL;
	bool result;

	if (!is_algo_fixed(fd->fd_rh))
		flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm);

	FD_PRINTF(LOG_INFO, fd, "running sync rebuild: %s", reason);
	result = rebuild_fd_flm(fd, flm_new != NULL ? flm_new : fd->fd_flm);
	if (flm_new != NULL)
		fib_unref_algo(flm_new);

	if (!result) {
		FD_PRINTF(LOG_ERR, fd, "sync rebuild failed");
		schedule_fd_rebuild(fd, "sync rebuild failed");
	}

	return (result);
}

/*
 * Finds an algo by name/family.
 * Returns the referenced algo or NULL.
 */
static struct fib_lookup_module *
fib_find_algo(const char *algo_name, int family)
{
	struct fib_lookup_module *flm;

	FIB_MOD_LOCK();
	TAILQ_FOREACH(flm, &all_algo_list, entries) {
		if ((strcmp(flm->flm_name, algo_name) == 0) &&
		    (family == flm->flm_family)) {
			flm->flm_refcount++;
			FIB_MOD_UNLOCK();
			return (flm);
		}
	}
	FIB_MOD_UNLOCK();

	return (NULL);
}

static void
fib_unref_algo(struct fib_lookup_module *flm)
{

	FIB_MOD_LOCK();
	flm->flm_refcount--;
	FIB_MOD_UNLOCK();
}

static int
set_fib_algo(uint32_t fibnum, int family, struct sysctl_oid *oidp, struct sysctl_req *req)
{
	struct fib_lookup_module *flm = NULL;
	struct fib_data *fd = NULL;
	char old_algo_name[32], algo_name[32];
	struct rib_head *rh = NULL;
	enum flm_op_result result;
	struct epoch_tracker et;
	int error;

	/* Fetch the current algo/rib for the af/family */
	FIB_MOD_LOCK();
	TAILQ_FOREACH(fd, &V_fib_data_list, entries) {
		if ((fd->fd_family == family) && (fd->fd_fibnum == fibnum))
			break;
	}
	if (fd == NULL) {
		FIB_MOD_UNLOCK();
		return (ENOENT);
	}
	rh = fd->fd_rh;
	strlcpy(old_algo_name, fd->fd_flm->flm_name,
	    sizeof(old_algo_name));
	FIB_MOD_UNLOCK();

	strlcpy(algo_name, old_algo_name, sizeof(algo_name));
	error = sysctl_handle_string(oidp, algo_name, sizeof(algo_name), req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	if (strcmp(algo_name, old_algo_name) == 0)
		return (0);

	/* The new algorithm name is different */
	flm = fib_find_algo(algo_name, family);
	if (flm == NULL) {
		RH_PRINTF(LOG_INFO, rh, "unable to find algo %s", algo_name);
		return (ESRCH);
	}

	fd = NULL;
	NET_EPOCH_ENTER(et);
	RIB_WLOCK(rh);
	result = setup_fd_instance(flm, rh, NULL, &fd, true);
	RIB_WUNLOCK(rh);
	NET_EPOCH_EXIT(et);
	fib_unref_algo(flm);
	if (result != FLM_SUCCESS)
		return (EINVAL);

	/* Disable automated jumping between algos */
	FIB_MOD_LOCK();
	set_algo_fixed(rh);
	FIB_MOD_UNLOCK();
	/* Remove the old instance(s) */
	fib_cleanup_algo(rh, true, false);

	/* Drain the cb so the user can unload the module after userret if so desired */
	NET_EPOCH_DRAIN_CALLBACKS();

	return (0);
}
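
/*
 * Example (informative): the handler above backs the per-family sysctl,
 * so an administrator can list the available algorithms and pin one at
 * runtime, e.g.:
 *
 *	# sysctl net.route.algo.inet.algo_list
 *	# sysctl net.route.algo.inet.algo=radix4_lockless
 *
 * Algorithm names are module-provided; radix4_lockless is just one
 * possible value.
 */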

#ifdef INET
static int
set_algo_inet_sysctl_handler(SYSCTL_HANDLER_ARGS)
{

	return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET, oidp, req));
}
SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo,
    CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
    set_algo_inet_sysctl_handler, "A", "Set IPv4 lookup algo");
#endif

#ifdef INET6
static int
set_algo_inet6_sysctl_handler(SYSCTL_HANDLER_ARGS)
{

	return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET6, oidp, req));
}
SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo,
    CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
    set_algo_inet6_sysctl_handler, "A", "Set IPv6 lookup algo");
#endif

static struct nhop_object *
dummy_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid)
{
	return (NULL);
}

static void
destroy_fdh_epoch(epoch_context_t ctx)
{
	struct fib_dp_header *fdh;

	fdh = __containerof(ctx, struct fib_dp_header, fdh_epoch_ctx);
	free(fdh, M_RTABLE);
}

static struct fib_dp_header *
alloc_fib_dp_array(uint32_t num_tables, bool waitok)
{
	size_t sz;
	struct fib_dp_header *fdh;

	sz = sizeof(struct fib_dp_header);
	sz += sizeof(struct fib_dp) * num_tables;
	fdh = malloc(sz, M_RTABLE, (waitok ? M_WAITOK : M_NOWAIT) | M_ZERO);
	if (fdh != NULL) {
		fdh->fdh_num_tables = num_tables;
		/*
		 * Set dummy lookup function ptr always returning NULL, so
		 * we can delay algo init.
		 */
		for (uint32_t i = 0; i < num_tables; i++)
			fdh->fdh_idx[i].f = dummy_lookup;
	}
	return (fdh);
}

static struct fib_dp_header *
get_fib_dp_header(struct fib_dp *dp)
{

	return (__containerof((void *)dp, struct fib_dp_header, fdh_idx));
}

/*
 * Replace per-family index pool @pdp with a new one which
 * contains updated callback/algo data from @fd.
 * Returns true on success.
 */
static bool
replace_rtables_family(struct fib_dp **pdp, struct fib_data *fd, struct fib_dp *dp)
{
	struct fib_dp_header *new_fdh, *old_fdh;

	NET_EPOCH_ASSERT();

	FD_PRINTF(LOG_DEBUG, fd, "[vnet %p] replace with f:%p arg:%p",
	    curvnet, dp->f, dp->arg);

	FIB_MOD_LOCK();
	old_fdh = get_fib_dp_header(*pdp);

	if (old_fdh->fdh_idx[fd->fd_fibnum].f == dp->f) {
		/*
		 * Function is the same, data pointer needs update.
		 * Perform in-line replace without reallocation.
		 */
		old_fdh->fdh_idx[fd->fd_fibnum].arg = dp->arg;
		FD_PRINTF(LOG_DEBUG, fd, "FDH %p inline update", old_fdh);
		FIB_MOD_UNLOCK();
		return (true);
	}

	new_fdh = alloc_fib_dp_array(old_fdh->fdh_num_tables, false);
	FD_PRINTF(LOG_DEBUG, fd, "OLD FDH: %p NEW FDH: %p", old_fdh, new_fdh);
	if (new_fdh == NULL) {
		FIB_MOD_UNLOCK();
		FD_PRINTF(LOG_WARNING, fd, "error attaching datapath");
		return (false);
	}

	memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0],
	    old_fdh->fdh_num_tables * sizeof(struct fib_dp));
	/* Update relevant data structure for @fd */
	new_fdh->fdh_idx[fd->fd_fibnum] = *dp;

	/* Ensure memcpy() writes have completed */
	atomic_thread_fence_rel();
	/* Set new datapath pointer */
	*pdp = &new_fdh->fdh_idx[0];
	FIB_MOD_UNLOCK();
	FD_PRINTF(LOG_DEBUG, fd, "update %p -> %p", old_fdh, new_fdh);

	fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx);

	return (true);
}
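
/*
 * Illustrative reader-side sketch (assumed, not defined in this file):
 * since the pointer swap above is paired with a release fence and the
 * old array is only freed via fib_epoch_call(), a datapath reader
 * running inside the net epoch can use the array without locks:
 *
 *	NET_EPOCH_ENTER(et);
 *	const struct fib_dp *dp = &(*pdp)[fibnum];
 *	nh = dp->f(dp->arg, key, scopeid);
 *	NET_EPOCH_EXIT(et);
 */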

static struct fib_dp **
get_family_dp_ptr(int family)
{
	switch (family) {
#ifdef INET
	case AF_INET:
		return (&V_inet_dp);
#endif
#ifdef INET6
	case AF_INET6:
		return (&V_inet6_dp);
#endif
	}
	return (NULL);
}

/*
 * Makes the datapath use fib instance @fd
 */
bool
fib_set_datapath_ptr(struct fib_data *fd, struct fib_dp *dp)
{
	struct fib_dp **pdp;

	pdp = get_family_dp_ptr(fd->fd_family);
	return (replace_rtables_family(pdp, fd, dp));
}

/*
 * Grows the datapath pointers array.
 * Called from the sysctl handler when the number of routing tables grows.
 */
static void
grow_rtables_family(struct fib_dp **pdp, uint32_t new_num_tables)
{
	struct fib_dp_header *new_fdh, *old_fdh = NULL;

	new_fdh = alloc_fib_dp_array(new_num_tables, true);

	FIB_MOD_LOCK();
	if (*pdp != NULL) {
		old_fdh = get_fib_dp_header(*pdp);
		memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0],
		    old_fdh->fdh_num_tables * sizeof(struct fib_dp));
	}

	/* Ensure all writes have completed */
	atomic_thread_fence_rel();

	*pdp = &new_fdh->fdh_idx[0];
	FIB_MOD_UNLOCK();

	if (old_fdh != NULL)
		fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx);
}

/*
 * Grows the per-AF arrays of datapath pointers for each supported family.
 * Called from the fibs resize sysctl handler.
 */
void
fib_grow_rtables(uint32_t new_num_tables)
{

#ifdef INET
	grow_rtables_family(get_family_dp_ptr(AF_INET), new_num_tables);
#endif
#ifdef INET6
	grow_rtables_family(get_family_dp_ptr(AF_INET6), new_num_tables);
#endif
}

void
fib_get_rtable_info(struct rib_head *rh, struct rib_rtable_info *rinfo)
{

	bzero(rinfo, sizeof(struct rib_rtable_info));
	rinfo->num_prefixes = rh->rnh_prefixes;
	rinfo->num_nhops = nhops_get_count(rh);
#ifdef ROUTE_MPATH
	rinfo->num_nhgrp = nhgrp_get_count(rh);
#endif
}

/*
 * Updates the pointer to the algo data for @fd.
 */
void
fib_set_algo_ptr(struct fib_data *fd, void *algo_data)
{
	RIB_WLOCK_ASSERT(fd->fd_rh);

	fd->fd_algo_data = algo_data;
}

/*
 * Calls @callback with @ctx after the end of the current epoch.
 */
void
fib_epoch_call(epoch_callback_t callback, epoch_context_t ctx)
{
	NET_EPOCH_CALL(callback, ctx);
}

/*
 * Accessor to get the rib instance @fd is attached to.
 */
struct rib_head *
fib_get_rh(struct fib_data *fd)
{

	return (fd->fd_rh);
}

/*
 * Accessor to export the idx->nhop array
 */
struct nhop_object **
fib_get_nhop_array(struct fib_data *fd)
{

	return (fd->nh_idx);
}

static uint32_t
get_nhop_idx(struct nhop_object *nh)
{
#ifdef ROUTE_MPATH
	if (NH_IS_NHGRP(nh))
		return (nhgrp_get_idx((struct nhgrp_object *)nh));
	else
#endif
		return (nhop_get_idx(nh));
}

uint32_t
fib_get_nhop_idx(struct fib_data *fd, struct nhop_object *nh)
{

	return (get_nhop_idx(nh));
}

static bool
is_idx_free(struct fib_data *fd, uint32_t index)
{

	return (fd->nh_ref_table->refcnt[index] == 0);
}

static uint32_t
fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh)
{
	uint32_t idx = get_nhop_idx(nh);

	if (idx >= fd->number_nhops) {
		fd->hit_nhops = 1;
		return (0);
	}

	if (is_idx_free(fd, idx)) {
		nhop_ref_any(nh);
		fd->nh_idx[idx] = nh;
		fd->nh_ref_table->count++;
		FD_PRINTF(LOG_DEBUG2, fd, " REF nhop %u %p", idx, fd->nh_idx[idx]);
	}
	fd->nh_ref_table->refcnt[idx]++;

	return (idx);
}

struct nhop_release_data {
	struct nhop_object	*nh;
	struct epoch_context	ctx;
};

static void
release_nhop_epoch(epoch_context_t ctx)
{
	struct nhop_release_data *nrd;

	nrd = __containerof(ctx, struct nhop_release_data, ctx);
	nhop_free_any(nrd->nh);
	free(nrd, M_TEMP);
}

/*
 * Delays the nexthop refcount release.
 * The datapath may not have updated its data structures yet, so the old
 * nexthop may still be returned till the end of the current epoch. Delay
 * the refcount removal, as we may be removing the last instance, which
 * would trigger nexthop deletion, rendering the returned nexthop invalid.
 */
static void
fib_schedule_release_nhop(struct fib_data *fd, struct nhop_object *nh)
{
	struct nhop_release_data *nrd;

	nrd = malloc(sizeof(struct nhop_release_data), M_TEMP, M_NOWAIT | M_ZERO);
	if (nrd != NULL) {
		nrd->nh = nh;
		fib_epoch_call(release_nhop_epoch, &nrd->ctx);
	} else {
		/*
		 * Unable to allocate memory. Leak the nexthop to maintain
		 * the guarantee that each nhop can be referenced.
		 */
		FD_PRINTF(LOG_ERR, fd, "unable to schedule nhop %p deletion", nh);
	}
}
1848
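/*
 * Releases the reference on @nh held on behalf of @fd. When the last
 * reference is dropped, the final nhop_free_any() is deferred via the
 * epoch machinery above.
 */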
static void
fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh)
{
	uint32_t idx = get_nhop_idx(nh);

	KASSERT((idx < fd->number_nhops), ("invalid nhop index"));
	KASSERT((nh == fd->nh_idx[idx]), ("index table contains wrong nh"));

	fd->nh_ref_table->refcnt[idx]--;
	if (fd->nh_ref_table->refcnt[idx] == 0) {
		FD_PRINTF(LOG_DEBUG, fd, " FREE nhop %d %p", idx, fd->nh_idx[idx]);
		fib_schedule_release_nhop(fd, fd->nh_idx[idx]);
	}
}

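/*
 * Pins the current algorithm selection for @rh's family, preventing
 * the framework from automatically switching to another algo.
 */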
static void
set_algo_fixed(struct rib_head *rh)
{
	switch (rh->rib_family) {
#ifdef INET
	case AF_INET:
		V_algo_fixed_inet = true;
		break;
#endif
#ifdef INET6
	case AF_INET6:
		V_algo_fixed_inet6 = true;
		break;
#endif
	}
}

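/*
 * Reports whether the algorithm selection for @rh's family has been
 * pinned by set_algo_fixed().
 */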
static bool
is_algo_fixed(struct rib_head *rh)
{

	switch (rh->rib_family) {
#ifdef INET
	case AF_INET:
		return (V_algo_fixed_inet);
#endif
#ifdef INET6
	case AF_INET6:
		return (V_algo_fixed_inet6);
#endif
	}
	return (false);
}

/*
 * Runs the check on what would be the best algo for rib @rh, assuming
 * that the current algo is the one specified by @orig_flm. Note that
 * @orig_flm can be NULL during the initial selection.
 *
 * Returns a referenced new algo or NULL if the current one is the best.
 */
static struct fib_lookup_module *
fib_check_best_algo(struct rib_head *rh, struct fib_lookup_module *orig_flm)
{
	uint8_t preference, curr_preference = 0, best_preference = 0;
	struct fib_lookup_module *flm, *best_flm = NULL;
	struct rib_rtable_info rinfo;
	int candidate_algos = 0;

	fib_get_rtable_info(rh, &rinfo);

	FIB_MOD_LOCK();
	TAILQ_FOREACH(flm, &all_algo_list, entries) {
		if (flm->flm_family != rh->rib_family)
			continue;
		candidate_algos++;
		preference = flm->flm_get_pref(&rinfo);
		if (preference > best_preference) {
			if (!flm_error_check(flm, rh->rib_fibnum)) {
				best_preference = preference;
				best_flm = flm;
			}
		}
		if (flm == orig_flm)
			curr_preference = preference;
	}
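	/*
	 * Require the best candidate to beat the current algo by at least
	 * BEST_DIFF_PERCENT to avoid needless flapping between algos with
	 * similar preference values.
	 */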
	if ((best_flm != NULL) && (curr_preference + BEST_DIFF_PERCENT < best_preference))
		best_flm->flm_refcount++;
	else
		best_flm = NULL;
	FIB_MOD_UNLOCK();

	RH_PRINTF(LOG_DEBUG, rh, "candidate_algos: %d, curr: %s(%d) result: %s(%d)",
	    candidate_algos, orig_flm ? orig_flm->flm_name : "NULL", curr_preference,
	    best_flm ? best_flm->flm_name : (orig_flm ? orig_flm->flm_name : "NULL"),
	    best_preference);

	return (best_flm);
}

/*
 * Called when a new route table is created.
 * Selects, allocates and attaches a fib algo for the table.
 */
static bool
fib_select_algo_initial(struct rib_head *rh, struct fib_dp *dp)
{
	struct fib_lookup_module *flm;
	struct fib_data *fd = NULL;
	enum flm_op_result result;
	struct epoch_tracker et;

	flm = fib_check_best_algo(rh, NULL);
	if (flm == NULL) {
		RH_PRINTF(LOG_CRIT, rh, "no algo selected");
		return (false);
	}
	RH_PRINTF(LOG_INFO, rh, "selected algo %s", flm->flm_name);

	NET_EPOCH_ENTER(et);
	RIB_WLOCK(rh);
	result = setup_fd_instance(flm, rh, NULL, &fd, false);
	RIB_WUNLOCK(rh);
	NET_EPOCH_EXIT(et);

	RH_PRINTF(LOG_DEBUG, rh, "result=%d fd=%p", result, fd);
	if (result == FLM_SUCCESS)
		*dp = fd->fd_dp;
	else
		RH_PRINTF(LOG_CRIT, rh, "unable to setup algo %s", flm->flm_name);

	fib_unref_algo(flm);

	return (result == FLM_SUCCESS);
}

/*
 * Sets up fib algo instances for the non-initialized RIBs in the @family.
 * Allocates a temporary datapath index to amortize datapath index updates
 * with large @num_tables.
 */
void
fib_setup_family(int family, uint32_t num_tables)
{
	struct fib_dp_header *new_fdh = alloc_fib_dp_array(num_tables, false);
	if (new_fdh == NULL) {
		ALGO_PRINTF(LOG_CRIT, "Unable to setup framework for %s", print_family(family));
		return;
	}

	for (int i = 0; i < num_tables; i++) {
		struct rib_head *rh = rt_tables_get_rnh(i, family);
		if (rh->rib_algo_init)
			continue;
		if (!fib_select_algo_initial(rh, &new_fdh->fdh_idx[i]))
			continue;

		rh->rib_algo_init = true;
	}

	FIB_MOD_LOCK();
	struct fib_dp **pdp = get_family_dp_ptr(family);
	struct fib_dp_header *old_fdh = get_fib_dp_header(*pdp);

	/* Copy the entries not touched by the new init from the old datapath array. */
	for (int i = 0; i < num_tables; i++) {
		if (new_fdh->fdh_idx[i].f == dummy_lookup)
			new_fdh->fdh_idx[i] = old_fdh->fdh_idx[i];
	}

	/* Ensure all index writes have completed */
	atomic_thread_fence_rel();
	/* Set the new datapath pointer */
	*pdp = &new_fdh->fdh_idx[0];

	FIB_MOD_UNLOCK();

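	/* Defer freeing the old array until current-epoch readers drain. */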
	fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx);
}

/*
 * Registers a fib lookup module within the subsystem.
 */
int
fib_module_register(struct fib_lookup_module *flm)
{

	FIB_MOD_LOCK();
	ALGO_PRINTF(LOG_INFO, "attaching %s to %s", flm->flm_name,
	    print_family(flm->flm_family));
	TAILQ_INSERT_TAIL(&all_algo_list, flm, entries);
	FIB_MOD_UNLOCK();

	return (0);
}

/*
 * Tries to unregister a fib lookup module.
 *
 * Returns 0 on success, EBUSY if the module is still in use
 * by some of the tables.
 */
int
fib_module_unregister(struct fib_lookup_module *flm)
{

	FIB_MOD_LOCK();
	if (flm->flm_refcount > 0) {
		FIB_MOD_UNLOCK();
		return (EBUSY);
	}
	fib_error_clear_flm(flm);
	ALGO_PRINTF(LOG_INFO, "detaching %s from %s", flm->flm_name,
	    print_family(flm->flm_family));
	TAILQ_REMOVE(&all_algo_list, flm, entries);
	FIB_MOD_UNLOCK();

	return (0);
}

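/*
 * Per-vnet initializer: prepares the list of fib_data instances
 * belonging to this vnet.
 */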
void
vnet_fib_init(void)
{

	TAILQ_INIT(&V_fib_data_list);
}

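/*
 * Per-vnet teardown: clears the per-vnet algo error history.
 */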
void
vnet_fib_destroy(void)
{

	FIB_MOD_LOCK();
	fib_error_clear();
	FIB_MOD_UNLOCK();
}