xref: /freebsd/sys/net/route/fib_algo.c (revision b51f459a2098622c31ed54f5c1bf0e03efce403b)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2020 Alexander V. Chernikov
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 #include "opt_inet.h"
31 #include "opt_inet6.h"
32 #include "opt_route.h"
33 
34 #include <sys/param.h>
35 #include <sys/eventhandler.h>
36 #include <sys/kernel.h>
37 #include <sys/sbuf.h>
38 #include <sys/lock.h>
39 #include <sys/rmlock.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/module.h>
43 #include <sys/kernel.h>
44 #include <sys/priv.h>
45 #include <sys/proc.h>
46 #include <sys/socket.h>
47 #include <sys/socketvar.h>
48 #include <sys/sysctl.h>
49 #include <sys/syslog.h>
50 #include <sys/queue.h>
51 #include <net/vnet.h>
52 
53 #include <net/if.h>
54 #include <net/if_var.h>
55 
56 #include <netinet/in.h>
57 #include <netinet/in_var.h>
58 #include <netinet/ip.h>
59 #include <netinet/ip_var.h>
60 #ifdef INET6
61 #include <netinet/ip6.h>
62 #include <netinet6/ip6_var.h>
63 #endif
64 
65 #include <net/route.h>
66 #include <net/route/nhop.h>
67 #include <net/route/route_ctl.h>
68 #include <net/route/route_var.h>
69 #include <net/route/fib_algo.h>
70 
71 #include <machine/stdarg.h>
72 
73 /*
74  * Fib lookup framework.
75  *
76  * This framework enables accelerated longest-prefix-match lookups for the
77  *  routing tables by adding the ability to dynamically attach/detach lookup
78  *  algorithm implementations to/from the datapath.
79  *
80  * flm - fib lookup modules - implementation of particular lookup algorithm
81  * fd - fib data - instance of an flm bound to specific routing table
82  *
83  * This file provides main framework functionality.
84  *
85  * The following are the features provided by the framework
86  *
87  * 1) nexthops abstraction -> provides transparent referencing, indexing
88  *   and efficient idx->ptr mappings for nexthop and nexthop groups.
89  * 2) Routing table synchronisation
90  * 3) dataplane attachment points
91  * 4) automatic algorithm selection based on the provided preference.
92  *
93  *
94  * DATAPATH
95  * For each supported address family, there is an allocated array of fib_dp
96  *  structures, indexed by fib number. Each array entry contains a callback function
97  *  and its argument. This function will be called with a family-specific lookup key,
98  *  scope and provided argument. This array gets re-created every time a new algo
99  *  instance gets created. Please take a look at the replace_rtables_family() function
100  *  for more details.
101  *
102  */
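/*
 * Illustrative sketch only: a lookup module fills in a struct
 *  fib_lookup_module with its callbacks and registers it with the framework.
 *  The field names below are the ones referenced later in this file; the
 *  "example_*" identifiers are hypothetical, and the full contract (including
 *  the datapath lookup callback) is defined in net/route/fib_algo.h.
 *
 *	static struct fib_lookup_module flm_example = {
 *		.flm_name = "example",
 *		.flm_family = AF_INET,
 *		.flm_get_pref = example_get_pref,
 *		.flm_init_cb = example_init,
 *		.flm_destroy_cb = example_destroy,
 *		.flm_dump_rib_item_cb = example_dump_rib_item,
 *		.flm_dump_end_cb = example_dump_end,
 *		.flm_change_rib_item_cb = example_change_rib_item,
 *	};
 *	...
 *	fib_module_register(&flm_example);
 */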
103 
104 SYSCTL_DECL(_net_route);
105 SYSCTL_NODE(_net_route, OID_AUTO, algo, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
106     "Fib algorithm lookups");
107 
108 /* Algorithm sync policy */
109 
110 /* Time interval to bucket updates */
111 VNET_DEFINE(unsigned int, bucket_time_ms) = 50;
112 #define	V_bucket_time_ms	VNET(bucket_time_ms)
113 SYSCTL_UINT(_net_route_algo, OID_AUTO, bucket_time_ms, CTLFLAG_RW | CTLFLAG_VNET,
114     &VNET_NAME(bucket_time_ms), 0, "Time interval to calculate update rate");
115 
116 /* Minimum update rate to delay sync */
117 VNET_DEFINE(unsigned int, bucket_change_threshold_rate) = 500;
118 #define	V_bucket_change_threshold_rate	VNET(bucket_change_threshold_rate)
119 SYSCTL_UINT(_net_route_algo, OID_AUTO, bucket_change_threshold_rate, CTLFLAG_RW | CTLFLAG_VNET,
120     &VNET_NAME(bucket_change_threshold_rate), 0, "Minimum update rate to delay sync");
121 
122 /* Max allowed delay to sync */
123 VNET_DEFINE(unsigned int, fib_max_sync_delay_ms) = 1000;
124 #define	V_fib_max_sync_delay_ms	VNET(fib_max_sync_delay_ms)
125 SYSCTL_UINT(_net_route_algo, OID_AUTO, fib_max_sync_delay_ms, CTLFLAG_RW | CTLFLAG_VNET,
126     &VNET_NAME(fib_max_sync_delay_ms), 0, "Maximum time to delay sync (ms)");
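/*
 * The knobs above are exposed as sysctls under net.route.algo (per-vnet).
 *  An illustrative tuning session, using the default values for reference:
 *
 *	# sysctl net.route.algo.bucket_time_ms=50
 *	# sysctl net.route.algo.bucket_change_threshold_rate=500
 *	# sysctl net.route.algo.fib_max_sync_delay_ms=1000
 */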
127 
128 
129 #ifdef INET6
130 VNET_DEFINE_STATIC(bool, algo_fixed_inet6) = false;
131 #define	V_algo_fixed_inet6	VNET(algo_fixed_inet6)
132 SYSCTL_NODE(_net_route_algo, OID_AUTO, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
133     "IPv6 longest prefix match lookups");
134 #endif
135 #ifdef INET
136 VNET_DEFINE_STATIC(bool, algo_fixed_inet) = false;
137 #define	V_algo_fixed_inet	VNET(algo_fixed_inet)
138 SYSCTL_NODE(_net_route_algo, OID_AUTO, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
139     "IPv4 longest prefix match lookups");
140 #endif
141 
142 /* Fib instance counter */
143 static uint32_t fib_gen = 0;
144 
145 struct nhop_ref_table {
146 	uint32_t		count;
147 	int32_t			refcnt[0];
148 };
149 
150 enum fib_callout_action {
151 	FDA_NONE,	/* No callout scheduled */
152 	FDA_REBUILD,	/* Asks to rebuild algo instance */
153 	FDA_EVAL,	/* Asks to evaluate if the current algo is still the best */
154 	FDA_BATCH,	/* Asks to submit batch of updates to the algo */
155 };
156 
157 struct fib_sync_status {
158 	struct timeval		diverge_time;	/* ts when diverged */
159 	uint32_t		num_changes;	/* number of changes since sync */
160 	uint32_t		bucket_changes;	/* num changes within the current bucket */
161 	uint64_t		bucket_id;	/* 50ms bucket # */
162 	struct fib_change_queue	fd_change_queue;/* list of scheduled entries */
163 };
164 
165 /*
166  * Data structure for the fib lookup instance tied to the particular rib.
167  */
168 struct fib_data {
169 	uint32_t		number_nhops;	/* current # of nhops */
170 	uint8_t			hit_nhops;	/* true if out of nhop limit */
171 	uint8_t			init_done;	/* true if init is completed */
172 	uint32_t		fd_dead:1;	/* Scheduled for deletion */
173 	uint32_t		fd_linked:1;	/* true if linked */
174 	uint32_t		fd_need_rebuild:1;	/* true if rebuild scheduled */
175 	uint32_t		fd_batch:1;	/* true if batched notification scheduled */
176 	uint8_t			fd_family;	/* family */
177 	uint32_t		fd_fibnum;	/* fibnum */
178 	uint32_t		fd_failed_rebuilds;	/* stat: failed rebuilds */
179 	uint32_t		fd_gen;		/* instance gen# */
180 	struct callout		fd_callout;	/* rebuild callout */
181 	enum fib_callout_action	fd_callout_action;	/* Callout action to take */
182 	void			*fd_algo_data;	/* algorithm data */
183 	struct nhop_object	**nh_idx;	/* nhop idx->ptr array */
184 	struct nhop_ref_table	*nh_ref_table;	/* array with # of nhop references */
185 	struct rib_head		*fd_rh;		/* RIB table we're attached to */
186 	struct rib_subscription	*fd_rs;		/* storing table subscription */
187 	struct fib_dp		fd_dp;		/* fib datapath data */
188 	struct vnet		*fd_vnet;	/* vnet fib belongs to */
189 	struct epoch_context	fd_epoch_ctx;	/* epoch context for deletion */
190 	struct fib_lookup_module	*fd_flm;/* pointer to the lookup module */
191 	struct fib_sync_status	fd_ss;		/* State relevant to the rib sync  */
192 	uint32_t		fd_num_changes;	/* number of changes since last callout */
193 	TAILQ_ENTRY(fib_data)	entries;	/* list of all fds in vnet */
194 };
195 
196 static bool rebuild_fd(struct fib_data *fd, const char *reason);
197 static bool rebuild_fd_flm(struct fib_data *fd, struct fib_lookup_module *flm_new);
198 static void handle_fd_callout(void *_data);
199 static void destroy_fd_instance_epoch(epoch_context_t ctx);
200 static enum flm_op_result attach_datapath(struct fib_data *fd);
201 static bool is_idx_free(struct fib_data *fd, uint32_t index);
202 static void set_algo_fixed(struct rib_head *rh);
203 static bool is_algo_fixed(struct rib_head *rh);
204 
205 static uint32_t fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh);
206 static void fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh);
207 
208 static struct fib_lookup_module *fib_check_best_algo(struct rib_head *rh,
209     struct fib_lookup_module *orig_flm);
210 static void fib_unref_algo(struct fib_lookup_module *flm);
211 static bool flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum);
212 
213 struct mtx fib_mtx;
214 #define	FIB_MOD_LOCK()		mtx_lock(&fib_mtx)
215 #define	FIB_MOD_UNLOCK()	mtx_unlock(&fib_mtx)
216 #define	FIB_MOD_LOCK_ASSERT()	mtx_assert(&fib_mtx, MA_OWNED)
217 
218 MTX_SYSINIT(fib_mtx, &fib_mtx, "algo list mutex", MTX_DEF);
219 
220 /* Algorithm has to be this percent better than the current to switch */
221 #define	BEST_DIFF_PERCENT	(5 * 256 / 100)
222 /* Schedule algo re-evaluation X seconds after a change */
223 #define	ALGO_EVAL_DELAY_MS	30000
224 /* Force algo re-evaluation after X changes */
225 #define	ALGO_EVAL_NUM_ROUTES	100
226 /* Try to setup algorithm X times */
227 #define	FIB_MAX_TRIES		32
228 /* Max amount of supported nexthops */
229 #define	FIB_MAX_NHOPS		262144
230 #define	FIB_CALLOUT_DELAY_MS	50
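/*
 * Note on BEST_DIFF_PERCENT: algorithm preference is a uint8_t returned by
 *  flm_get_pref() (see fib_check_best_algo()), so the constant works out to
 *  5 * 256 / 100 = 12 preference points: a candidate algo has to score ~5%
 *  better than the current one before an automatic switch is considered.
 */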
231 
232 
233 /* Debug */
234 static int flm_debug_level = LOG_NOTICE;
235 SYSCTL_INT(_net_route_algo, OID_AUTO, debug_level, CTLFLAG_RW | CTLFLAG_RWTUN,
236     &flm_debug_level, 0, "debuglevel");
237 #define	FLM_MAX_DEBUG_LEVEL	LOG_DEBUG
238 #ifndef	LOG_DEBUG2
239 #define	LOG_DEBUG2	8
240 #endif
241 
242 #define	_PASS_MSG(_l)	(flm_debug_level >= (_l))
243 #define	ALGO_PRINTF(_fmt, ...)	printf("[fib_algo] %s: " _fmt "\n", __func__, ##__VA_ARGS__)
244 #define	_ALGO_PRINTF(_fib, _fam, _aname, _gen, _func, _fmt, ...) \
245     printf("[fib_algo] %s.%u (%s#%u) %s: " _fmt "\n",\
246     print_family(_fam), _fib, _aname, _gen, _func, ## __VA_ARGS__)
247 #define	_RH_PRINTF(_fib, _fam, _func, _fmt, ...) \
248     printf("[fib_algo] %s.%u %s: " _fmt "\n", print_family(_fam), _fib, _func, ## __VA_ARGS__)
249 #define	RH_PRINTF(_l, _rh, _fmt, ...)	if (_PASS_MSG(_l)) {	\
250     _RH_PRINTF(_rh->rib_fibnum, _rh->rib_family, __func__, _fmt, ## __VA_ARGS__);\
251 }
252 #define	FD_PRINTF(_l, _fd, _fmt, ...)	FD_PRINTF_##_l(_l, _fd, _fmt, ## __VA_ARGS__)
253 #define	_FD_PRINTF(_l, _fd, _fmt, ...)	if (_PASS_MSG(_l)) {		\
254     _ALGO_PRINTF(_fd->fd_fibnum, _fd->fd_family, _fd->fd_flm->flm_name,	\
255     _fd->fd_gen, __func__, _fmt, ## __VA_ARGS__);			\
256 }
257 #if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG2
258 #define	FD_PRINTF_LOG_DEBUG2	_FD_PRINTF
259 #else
260 #define	FD_PRINTF_LOG_DEBUG2(_l, _fd, _fmt, ...)
261 #endif
262 #if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG
263 #define	FD_PRINTF_LOG_DEBUG	_FD_PRINTF
264 #else
265 #define	FD_PRINTF_LOG_DEBUG(_l, _fd, _fmt, ...)
266 #endif
267 #if FLM_MAX_DEBUG_LEVEL>=LOG_INFO
268 #define	FD_PRINTF_LOG_INFO	_FD_PRINTF
269 #else
270 #define	FD_PRINTF_LOG_INFO(_l, _fd, _fmt, ...)
271 #endif
272 #define	FD_PRINTF_LOG_NOTICE	_FD_PRINTF
273 #define	FD_PRINTF_LOG_ERR	_FD_PRINTF
274 #define	FD_PRINTF_LOG_WARNING	_FD_PRINTF
275 
276 
277 /* List of all registered lookup algorithms */
278 static TAILQ_HEAD(, fib_lookup_module) all_algo_list = TAILQ_HEAD_INITIALIZER(all_algo_list);
279 
280 /* List of all fib lookup instances in the vnet */
281 VNET_DEFINE_STATIC(TAILQ_HEAD(fib_data_head, fib_data), fib_data_list);
282 #define	V_fib_data_list	VNET(fib_data_list)
283 
284 /* Datastructure for storing non-transient fib lookup module failures */
285 struct fib_error {
286 	int				fe_family;
287 	uint32_t			fe_fibnum;	/* failed rtable */
288 	struct fib_lookup_module	*fe_flm;	/* failed module */
289 	TAILQ_ENTRY(fib_error)		entries;/* list of all errored entries */
290 };
291 VNET_DEFINE_STATIC(TAILQ_HEAD(fib_error_head, fib_error), fib_error_list);
292 #define	V_fib_error_list VNET(fib_error_list)
293 
294 /* Per-family array of fibnum -> {func, arg} mappings used in datapath */
295 struct fib_dp_header {
296 	struct epoch_context	fdh_epoch_ctx;
297 	uint32_t		fdh_num_tables;
298 	struct fib_dp		fdh_idx[0];
299 };
300 
301 /*
302  * Tries to add a new non-transient algorithm error to the list of
303  *  errors.
304  * Returns true on success.
305  */
306 static bool
307 flm_error_add(struct fib_lookup_module *flm, uint32_t fibnum)
308 {
309 	struct fib_error *fe;
310 
311 	fe = malloc(sizeof(struct fib_error), M_TEMP, M_NOWAIT | M_ZERO);
312 	if (fe == NULL)
313 		return (false);
314 	fe->fe_flm = flm;
315 	fe->fe_family = flm->flm_family;
316 	fe->fe_fibnum = fibnum;
317 
318 	FIB_MOD_LOCK();
319 	/* Avoid duplicates by checking if error already exists first */
320 	if (flm_error_check(flm, fibnum)) {
321 		FIB_MOD_UNLOCK();
322 		free(fe, M_TEMP);
323 		return (true);
324 	}
325 	TAILQ_INSERT_HEAD(&V_fib_error_list, fe, entries);
326 	FIB_MOD_UNLOCK();
327 
328 	return (true);
329 }
330 
331 /*
332  * True if non-transient error has been registered for @flm in @fibnum.
333  */
334 static bool
335 flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum)
336 {
337 	const struct fib_error *fe;
338 
339 	TAILQ_FOREACH(fe, &V_fib_error_list, entries) {
340 		if ((fe->fe_flm == flm) && (fe->fe_fibnum == fibnum))
341 			return (true);
342 	}
343 
344 	return (false);
345 }
346 
347 /*
348  * Clear all errors of algo specified by @flm.
349  */
350 static void
351 fib_error_clear_flm(struct fib_lookup_module *flm)
352 {
353 	struct fib_error *fe, *fe_tmp;
354 
355 	FIB_MOD_LOCK_ASSERT();
356 
357 	TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
358 		if (fe->fe_flm == flm) {
359 			TAILQ_REMOVE(&V_fib_error_list, fe, entries);
360 			free(fe, M_TEMP);
361 		}
362 	}
363 }
364 
365 /*
366  * Clears all errors in current VNET.
367  */
368 static void
369 fib_error_clear()
370 {
371 	struct fib_error *fe, *fe_tmp;
372 
373 	FIB_MOD_LOCK_ASSERT();
374 
375 	TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
376 		TAILQ_REMOVE(&V_fib_error_list, fe, entries);
377 		free(fe, M_TEMP);
378 	}
379 }
380 
381 static const char *
382 print_op_result(enum flm_op_result result)
383 {
384 	switch (result) {
385 	case FLM_SUCCESS:
386 		return "success";
387 	case FLM_REBUILD:
388 		return "rebuild";
389 	case FLM_BATCH:
390 		return "batch";
391 	case FLM_ERROR:
392 		return "error";
393 	}
394 
395 	return "unknown";
396 }
397 
398 static const char *
399 print_family(int family)
400 {
401 
402 	if (family == AF_INET)
403 		return ("inet");
404 	else if (family == AF_INET6)
405 		return ("inet6");
406 	else
407 		return ("unknown");
408 }
409 
410 /*
411  * Debug function used by lookup algorithms.
412  * Outputs message denoted by @fmt, prepended by "[fib_algo] inetX.Y (algo) "
413  */
414 void
415 fib_printf(int level, struct fib_data *fd, const char *func, char *fmt, ...)
416 {
417 	char buf[128];
418 	va_list ap;
419 
420 	if (level > flm_debug_level)
421 		return;
422 
423 	va_start(ap, fmt);
424 	vsnprintf(buf, sizeof(buf), fmt, ap);
425 	va_end(ap);
426 
427 	_ALGO_PRINTF(fd->fd_fibnum, fd->fd_family, fd->fd_flm->flm_name,
428 	    fd->fd_gen, func, "%s", buf);
429 }
430 
431 /*
432  * Outputs list of algorithms supported by the provided address family.
433  */
434 static int
435 print_algos_sysctl(struct sysctl_req *req, int family)
436 {
437 	struct fib_lookup_module *flm;
438 	struct sbuf sbuf;
439 	int error, count = 0;
440 
441 	error = sysctl_wire_old_buffer(req, 0);
442 	if (error == 0) {
443 		sbuf_new_for_sysctl(&sbuf, NULL, 512, req);
444 		TAILQ_FOREACH(flm, &all_algo_list, entries) {
445 			if (flm->flm_family == family) {
446 				if (count++ > 0)
447 					sbuf_cat(&sbuf, ", ");
448 				sbuf_cat(&sbuf, flm->flm_name);
449 			}
450 		}
451 		error = sbuf_finish(&sbuf);
452 		sbuf_delete(&sbuf);
453 	}
454 	return (error);
455 }
456 
457 #ifdef INET6
458 static int
459 print_algos_sysctl_inet6(SYSCTL_HANDLER_ARGS)
460 {
461 
462 	return (print_algos_sysctl(req, AF_INET6));
463 }
464 SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo_list,
465     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
466     print_algos_sysctl_inet6, "A", "List of IPv6 lookup algorithms");
467 #endif
468 
469 #ifdef INET
470 static int
471 print_algos_sysctl_inet(SYSCTL_HANDLER_ARGS)
472 {
473 
474 	return (print_algos_sysctl(req, AF_INET));
475 }
476 SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo_list,
477     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
478     print_algos_sysctl_inet, "A", "List of IPv4 lookup algorithms");
479 #endif
480 
481 /*
482  * Calculate delay between repeated failures.
483  * Returns current delay in milliseconds.
484  */
485 static uint32_t
486 callout_calc_delay_ms(struct fib_data *fd)
487 {
488 	uint32_t shift;
489 
490 	if (fd->fd_failed_rebuilds > 10)
491 		shift = 10;
492 	else
493 		shift = fd->fd_failed_rebuilds;
494 
495 	return ((1 << shift) * FIB_CALLOUT_DELAY_MS);
496 }
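/*
 * Example of the backoff above: with FIB_CALLOUT_DELAY_MS = 50 the rebuild
 *  delay doubles with each failed attempt (50ms, 100ms, 200ms, ...), capped
 *  at (1 << 10) * 50ms = 51200ms (~51s) once fd_failed_rebuilds exceeds 10.
 */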
497 
498 static void
499 schedule_callout(struct fib_data *fd, enum fib_callout_action action, int delay_ms)
500 {
501 
502 	FD_PRINTF(LOG_DEBUG, fd, "delay=%d action=%d", delay_ms, action);
503 	fd->fd_callout_action = action;
504 	callout_reset_sbt(&fd->fd_callout, SBT_1MS * delay_ms, 0,
505 	    handle_fd_callout, fd, 0);
506 }
507 
508 static void
509 schedule_fd_rebuild(struct fib_data *fd, const char *reason)
510 {
511 
512 	RIB_WLOCK_ASSERT(fd->fd_rh);
513 
514 	if (!fd->fd_need_rebuild) {
515 		fd->fd_need_rebuild = true;
516 		/* Stop batch updates */
517 		fd->fd_batch = false;
518 
519 		/*
520 		 * Potentially re-schedules pending callout
521 		 *  initiated by schedule_algo_eval.
522 		 */
523 		FD_PRINTF(LOG_INFO, fd, "Scheduling rebuild: %s (failures=%d)",
524 		    reason, fd->fd_failed_rebuilds);
525 		schedule_callout(fd, FDA_REBUILD, callout_calc_delay_ms(fd));
526 	}
527 }
528 
529 static int64_t
530 get_tv_diff_ms(const struct timeval *old_tv, const struct timeval *new_tv)
531 {
532 	int64_t diff = 0;
533 
534 	diff = ((int64_t)(new_tv->tv_sec - old_tv->tv_sec)) * 1000;
535 	diff += (new_tv->tv_usec - old_tv->tv_usec) / 1000;
536 
537 	return (diff);
538 }
539 
540 static void
541 add_tv_diff_ms(struct timeval *tv, int ms)
542 {
543 	tv->tv_sec += ms / 1000;
544 	ms = ms % 1000;
545 	if (ms * 1000 + tv->tv_usec < 1000000)
546 		tv->tv_usec += ms * 1000;
547 	else {
548 		tv->tv_sec += 1;
549 		tv->tv_usec = ms * 1000 + tv->tv_usec - 1000000;
550 	}
551 }
552 
553 /*
554  * Marks the time when algo state diverges from the rib state.
555  */
556 static void
557 mark_diverge_time(struct fib_data *fd)
558 {
559 	struct fib_sync_status *fd_ss = &fd->fd_ss;
560 
561 	getmicrouptime(&fd_ss->diverge_time);
562 	fd_ss->bucket_id = 0;
563 	fd_ss->bucket_changes = 0;
564 }
565 
566 /*
567  * Calculates and updates the next algorithm sync time, based on the current activity.
568  *
569  * The intent is to provide a reasonable balance between the update
570  *  latency and efficient batching when changing a large number of routes.
571  *
572  * The high-level algorithm works as follows:
573  * 1) all changes are bucketed in 50ms intervals
574  * 2) If the number of changes within the bucket is greater than the threshold,
575  *   the update gets delayed, up to the maximum delay threshold.
576  */
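/*
 * Worked example with the defaults above (illustrative): bucket_time_ms=50
 *  and bucket_change_threshold_rate=500 give a per-bucket threshold of
 *  500 * 50 / 1000 = 25 changes.  Once 25 changes land within the same 50ms
 *  bucket, the sync callout is pushed out to (bucket_id + 2) * 50ms past the
 *  divergence time, and keeps sliding on further bursts until
 *  fib_max_sync_delay_ms (1000ms by default) is reached.
 */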
577 static void
578 update_rebuild_delay(struct fib_data *fd, enum fib_callout_action action)
579 {
580 	uint32_t bucket_id, new_delay = 0;
581 	struct timeval tv;
582 
583 	/* Fetch all variables at once to ensure consistent reads */
584 	uint32_t bucket_time_ms = V_bucket_time_ms;
585 	uint32_t threshold_rate = V_bucket_change_threshold_rate;
586 	uint32_t max_delay_ms = V_fib_max_sync_delay_ms;
587 
588 	if (bucket_time_ms == 0)
589 		bucket_time_ms = 50;
590 	/* calculate per-bucket threshold rate */
591 	threshold_rate = threshold_rate * bucket_time_ms / 1000;
592 
593 	getmicrouptime(&tv);
594 
595 	struct fib_sync_status *fd_ss = &fd->fd_ss;
596 
597 	bucket_id = get_tv_diff_ms(&fd_ss->diverge_time, &tv) / bucket_time_ms;
598 
599 	if (fd_ss->bucket_id == bucket_id) {
600 		fd_ss->bucket_changes++;
601 		if (fd_ss->bucket_changes == threshold_rate) {
602 			new_delay = (bucket_id + 2) * bucket_time_ms;
603 			if (new_delay <= max_delay_ms) {
604 				FD_PRINTF(LOG_DEBUG, fd,
605 				    "hit threshold of %u routes, delay update, "
606 				    "bucket: %u, total delay: %u",
607 				    threshold_rate, bucket_id + 1, new_delay);
608 			} else {
609 				new_delay = 0;
610 				FD_PRINTF(LOG_DEBUG, fd,
611 				    "maximum sync delay (%u ms) reached", max_delay_ms);
612 			}
613 		} else if ((bucket_id == 0) && (fd_ss->bucket_changes == 1))
614 			new_delay = bucket_time_ms;
615 	} else {
616 		fd_ss->bucket_id = bucket_id;
617 		fd_ss->bucket_changes = 1;
618 	}
619 
620 	if (new_delay > 0) {
621 		/* Calculated time has been updated */
622 		struct timeval new_tv = fd_ss->diverge_time;
623 		add_tv_diff_ms(&new_tv, new_delay);
624 
625 		int32_t delay_ms = get_tv_diff_ms(&tv, &new_tv);
626 		schedule_callout(fd, action, delay_ms);
627 	}
628 }
629 
630 static void
631 update_algo_state(struct fib_data *fd)
632 {
633 
634 	RIB_WLOCK_ASSERT(fd->fd_rh);
635 
636 	if (fd->fd_batch || fd->fd_need_rebuild) {
637 		enum fib_callout_action action = fd->fd_need_rebuild ? FDA_REBUILD : FDA_BATCH;
638 		update_rebuild_delay(fd, action);
639 		return;
640 	}
641 
642 	if (fd->fd_num_changes++ == 0) {
643 		/* Start callout to consider switch */
644 		if (!callout_pending(&fd->fd_callout))
645 			schedule_callout(fd, FDA_EVAL, ALGO_EVAL_DELAY_MS);
646 	} else if (fd->fd_num_changes == ALGO_EVAL_NUM_ROUTES) {
647 		/* Reset callout to exec immediately */
648 		if (fd->fd_callout_action == FDA_EVAL)
649 			schedule_callout(fd, FDA_EVAL, 1);
650 	}
651 }
652 
653 static bool
654 need_immediate_sync(struct fib_data *fd, struct rib_cmd_info *rc)
655 {
656 	struct nhop_object *nh;
657 
658 	/* Sync addition/removal of interface routes */
659 	switch (rc->rc_cmd) {
660 	case RTM_ADD:
661 		nh = rc->rc_nh_new;
662 		if (!NH_IS_NHGRP(nh) && (!(nh->nh_flags & NHF_GATEWAY)))
663 			return (true);
664 		break;
665 	case RTM_DELETE:
666 		nh = rc->rc_nh_old;
667 		if (!NH_IS_NHGRP(nh) && (!(nh->nh_flags & NHF_GATEWAY)))
668 			return (true);
669 		break;
670 	}
671 
672 	return (false);
673 }
674 
675 static bool
676 apply_rtable_changes(struct fib_data *fd)
677 {
678 	enum flm_op_result result;
679 	struct fib_change_queue *q = &fd->fd_ss.fd_change_queue;
680 
681 	result = fd->fd_flm->flm_change_rib_items_cb(fd->fd_rh, q, fd->fd_algo_data);
682 
683 	if (result == FLM_SUCCESS) {
684 		for (int i = 0; i < q->count; i++)
685 			if (q->entries[i].nh_old)
686 				fib_unref_nhop(fd, q->entries[i].nh_old);
687 		q->count = 0;
688 	}
689 	fd->fd_batch = false;
690 
691 	return (result == FLM_SUCCESS);
692 }
693 
694 static bool
695 fill_change_entry(struct fib_data *fd, struct fib_change_entry *ce, struct rib_cmd_info *rc)
696 {
697 	int plen = 0;
698 
699 	switch (fd->fd_family) {
700 	case AF_INET:
701 		rt_get_inet_prefix_plen(rc->rc_rt, &ce->addr4, &plen, &ce->scopeid);
702 		break;
703 	case AF_INET6:
704 		rt_get_inet6_prefix_plen(rc->rc_rt, &ce->addr6, &plen, &ce->scopeid);
705 		break;
706 	}
707 
708 	ce->plen = plen;
709 	ce->nh_old = rc->rc_nh_old;
710 	ce->nh_new = rc->rc_nh_new;
711 	if (ce->nh_new != NULL) {
712 		if (fib_ref_nhop(fd, ce->nh_new) == 0)
713 			return (false);
714 	}
715 
716 	return (true);
717 }
718 
719 static bool
720 queue_rtable_change(struct fib_data *fd, struct rib_cmd_info *rc)
721 {
722 	struct fib_change_queue *q = &fd->fd_ss.fd_change_queue;
723 
724 	if (q->count >= q->size) {
725 		uint32_t q_size;
726 
727 		if (q->size == 0)
728 			q_size = 256; /* ~18k memory */
729 		else
730 			q_size = q->size * 2;
731 
732 		size_t size = q_size * sizeof(struct fib_change_entry);
733 		void *a = realloc(q->entries, size, M_TEMP, M_NOWAIT | M_ZERO);
734 		if (a == NULL) {
735 			FD_PRINTF(LOG_INFO, fd, "Unable to realloc queue for %u elements",
736 			    q_size);
737 			return (false);
738 		}
739 		q->entries = a;
740 		q->size = q_size;
741 	}
742 
743 	return (fill_change_entry(fd, &q->entries[q->count++], rc));
744 }
745 
746 /*
747  * Rib subscription handler. Checks if the algorithm is ready to
748  *  receive updates, handles nexthop refcounting and passes change
749  *  data to the algorithm callback.
750  */
751 static void
752 handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
753     void *_data)
754 {
755 	struct fib_data *fd = (struct fib_data *)_data;
756 	enum flm_op_result result;
757 
758 	RIB_WLOCK_ASSERT(rnh);
759 
760 	/*
761 	 * There is a small gap between subscribing for route changes
762 	 *  and initiating rtable dump. Avoid receiving route changes
763 	 *  prior to finishing rtable dump by checking `init_done`.
764 	 */
765 	if (!fd->init_done)
766 		return;
767 
768 	bool immediate_sync = need_immediate_sync(fd, rc);
769 
770 	/* Consider scheduling algorithm re-evaluation */
771 	update_algo_state(fd);
772 
773 	/*
774 	 * If algo requested rebuild, stop sending updates by default.
775 	 * This simplifies nexthop refcount handling logic.
776 	 */
777 	if (fd->fd_need_rebuild) {
778 		if (immediate_sync)
779 			rebuild_fd(fd, "rtable change type enforced sync");
780 		return;
781 	}
782 
783 	/*
784 	 * Algo requested updates to be delivered in batches.
785 	 * Add the current change to the queue and return.
786 	 */
787 	if (fd->fd_batch) {
788 		if (immediate_sync) {
789 			if (!queue_rtable_change(fd, rc) || !apply_rtable_changes(fd))
790 				rebuild_fd(fd, "batch sync failed");
791 		} else {
792 			if (!queue_rtable_change(fd, rc))
793 				schedule_fd_rebuild(fd, "batch queue failed");
794 		}
795 		return;
796 	}
797 
798 	/*
799 	 * Maintain guarantee that every nexthop returned by the dataplane
800 	 *  lookup has > 0 refcount, so it can be safely referenced within the
801 	 *  current epoch.
802 	 */
803 	if (rc->rc_nh_new != NULL) {
804 		if (fib_ref_nhop(fd, rc->rc_nh_new) == 0) {
805 			/* ran out of indexes */
806 			schedule_fd_rebuild(fd, "ran out of nhop indexes");
807 			return;
808 		}
809 	}
810 
811 	result = fd->fd_flm->flm_change_rib_item_cb(rnh, rc, fd->fd_algo_data);
812 
813 	switch (result) {
814 	case FLM_SUCCESS:
815 		/* Unref old nexthop on success */
816 		if (rc->rc_nh_old != NULL)
817 			fib_unref_nhop(fd, rc->rc_nh_old);
818 		break;
819 	case FLM_BATCH:
820 
821 		/*
822 		 * Algo asks to batch the changes.
823 		 */
824 		if (queue_rtable_change(fd, rc)) {
825 			if (!immediate_sync) {
826 				fd->fd_batch = true;
827 				mark_diverge_time(fd);
828 				update_rebuild_delay(fd, FDA_BATCH);
829 				break;
830 			}
831 			if (apply_rtable_changes(fd))
832 				break;
833 		}
834 		FD_PRINTF(LOG_ERR, fd, "batched sync failed, force the rebuild");
835 
836 	case FLM_REBUILD:
837 
838 		/*
839 		 * Algo is not able to apply the update.
840 		 * Schedule algo rebuild.
841 		 */
842 		if (!immediate_sync) {
843 			mark_diverge_time(fd);
844 			schedule_fd_rebuild(fd, "algo requested rebuild");
845 			break;
846 		}
847 
848 		FD_PRINTF(LOG_INFO, fd, "running sync rebuild");
849 		rebuild_fd(fd, "rtable change type enforced sync");
850 		break;
851 	case FLM_ERROR:
852 
853 		/*
854 		 * Algo reported a non-recoverable error.
855 		 * Record the error and schedule rebuild, which will
856 		 *  trigger best algo selection.
857 		 */
858 		FD_PRINTF(LOG_ERR, fd, "algo reported non-recoverable error");
859 		if (!flm_error_add(fd->fd_flm, fd->fd_fibnum))
860 			FD_PRINTF(LOG_ERR, fd, "failed to ban algo");
861 		schedule_fd_rebuild(fd, "algo reported non-recoverable error");
862 	}
863 }
864 
865 static void
866 estimate_nhop_scale(const struct fib_data *old_fd, struct fib_data *fd)
867 {
868 
869 	if (old_fd == NULL) {
870 		// TODO: read from rtable
871 		fd->number_nhops = 16;
872 		return;
873 	}
874 
875 	if (old_fd->hit_nhops && old_fd->number_nhops < FIB_MAX_NHOPS)
876 		fd->number_nhops = 2 * old_fd->number_nhops;
877 	else
878 		fd->number_nhops = old_fd->number_nhops;
879 }
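/*
 * Example: a rebuild caused by running out of nhop slots (hit_nhops set)
 *  doubles the table on each retry: 16 -> 32 -> 64 -> ..., bounded by
 *  FIB_MAX_NHOPS; rebuilds triggered for other reasons keep the old size.
 */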
880 
881 struct walk_cbdata {
882 	struct fib_data		*fd;
883 	flm_dump_t		*func;
884 	enum flm_op_result	result;
885 };
886 
887  * Handler called after all rtentries have been dumped.
888  * Handler called after all rtenties have been dumped.
889  * Performs post-dump framework checks and calls
890  * algo:flm_dump_end_cb().
891  *
892  * Updates walk_cbdata result.
893  */
894 static void
895 sync_algo_end_cb(struct rib_head *rnh, enum rib_walk_hook stage, void *_data)
896 {
897 	struct walk_cbdata *w = (struct walk_cbdata *)_data;
898 	struct fib_data *fd = w->fd;
899 
900 	RIB_WLOCK_ASSERT(w->fd->fd_rh);
901 
902 	if (rnh->rib_dying) {
903 		w->result = FLM_ERROR;
904 		return;
905 	}
906 
907 	if (fd->hit_nhops) {
908 		FD_PRINTF(LOG_INFO, fd, "ran out of nexthops at %u nhops",
909 		    fd->nh_ref_table->count);
910 		if (w->result == FLM_SUCCESS)
911 			w->result = FLM_REBUILD;
912 		return;
913 	}
914 
915 	if (stage != RIB_WALK_HOOK_POST || w->result != FLM_SUCCESS)
916 		return;
917 
918 	/* Post-dump hook, dump successful */
919 	w->result = fd->fd_flm->flm_dump_end_cb(fd->fd_algo_data, &fd->fd_dp);
920 
921 	if (w->result == FLM_SUCCESS) {
922 		/* Mark init as done to allow routing updates */
923 		fd->init_done = 1;
924 	}
925 }
926 
927 /*
928  * Callback for each entry in rib.
929  * Calls algo:flm_dump_rib_item_cb func as a part of initial
930  *  route table synchronisation.
931  */
932 static int
933 sync_algo_cb(struct rtentry *rt, void *_data)
934 {
935 	struct walk_cbdata *w = (struct walk_cbdata *)_data;
936 
937 	RIB_WLOCK_ASSERT(w->fd->fd_rh);
938 
939 	if (w->result == FLM_SUCCESS && w->func) {
940 
941 		/*
942 		 * Reference nexthops to maintain guarantee that
943 		 *  each nexthop returned by datapath has > 0 references
944 		 *  and can be safely referenced within current epoch.
945 		 */
946 		struct nhop_object *nh = rt_get_raw_nhop(rt);
947 		if (fib_ref_nhop(w->fd, nh) != 0)
948 			w->result = w->func(rt, w->fd->fd_algo_data);
949 		else
950 			w->result = FLM_REBUILD;
951 	}
952 
953 	return (0);
954 }
955 
956 /*
957  * Dump all routing table state to the algo instance.
958  */
959 static enum flm_op_result
960 sync_algo(struct fib_data *fd)
961 {
962 	struct walk_cbdata w = {
963 		.fd = fd,
964 		.func = fd->fd_flm->flm_dump_rib_item_cb,
965 		.result = FLM_SUCCESS,
966 	};
967 
968 	rib_walk_ext_locked(fd->fd_rh, sync_algo_cb, sync_algo_end_cb, &w);
969 
970 	FD_PRINTF(LOG_INFO, fd,
971 	    "initial dump completed (rtable version: %d), result: %s",
972 	    fd->fd_rh->rnh_gen, print_op_result(w.result));
973 
974 	return (w.result);
975 }
976 
977 /*
978  * Schedules epoch-backed @fd instance deletion.
979  * * Unlinks @fd from the list of active algo instances.
980  * * Removes rib subscription.
981  * * Stops callout.
982  * * Schedules actual deletion.
983  *
984  * Assumes @fd is already unlinked from the datapath.
985  */
986 static int
987 schedule_destroy_fd_instance(struct fib_data *fd, bool in_callout)
988 {
989 	bool is_dead;
990 
991 	NET_EPOCH_ASSERT();
992 	RIB_WLOCK_ASSERT(fd->fd_rh);
993 
994 	FIB_MOD_LOCK();
995 	is_dead = fd->fd_dead;
996 	if (!is_dead)
997 		fd->fd_dead = true;
998 	if (fd->fd_linked) {
999 		TAILQ_REMOVE(&V_fib_data_list, fd, entries);
1000 		fd->fd_linked = false;
1001 	}
1002 	FIB_MOD_UNLOCK();
1003 	if (is_dead)
1004 		return (0);
1005 
1006 	FD_PRINTF(LOG_INFO, fd, "DETACH");
1007 
1008 	if (fd->fd_rs != NULL)
1009 		rib_unsibscribe_locked(fd->fd_rs);
1010 
1011 	/*
1012 	 * After rib_unsubscribe() no _new_ handle_rtable_change_cb() calls
1013 	 * will be executed, hence no _new_ callout schedules will happen.
1014 	 */
1015 	callout_stop(&fd->fd_callout);
1016 
1017 	epoch_call(net_epoch_preempt, destroy_fd_instance_epoch,
1018 	    &fd->fd_epoch_ctx);
1019 
1020 	return (0);
1021 }
1022 
1023 /*
1024  * Wipe all fd instances from the list matching rib specified by @rh.
1025  * If @keep_first is set, remove all but the first record.
1026  */
1027 static void
1028 fib_cleanup_algo(struct rib_head *rh, bool keep_first, bool in_callout)
1029 {
1030 	struct fib_data_head tmp_head = TAILQ_HEAD_INITIALIZER(tmp_head);
1031 	struct fib_data *fd, *fd_tmp;
1032 	struct epoch_tracker et;
1033 
1034 	FIB_MOD_LOCK();
1035 	TAILQ_FOREACH_SAFE(fd, &V_fib_data_list, entries, fd_tmp) {
1036 		if (fd->fd_rh == rh) {
1037 			if (keep_first) {
1038 				keep_first = false;
1039 				continue;
1040 			}
1041 			TAILQ_REMOVE(&V_fib_data_list, fd, entries);
1042 			fd->fd_linked = false;
1043 			TAILQ_INSERT_TAIL(&tmp_head, fd, entries);
1044 		}
1045 	}
1046 	FIB_MOD_UNLOCK();
1047 
1048 	/* Pass 2: remove each entry */
1049 	NET_EPOCH_ENTER(et);
1050 	TAILQ_FOREACH_SAFE(fd, &tmp_head, entries, fd_tmp) {
1051 		if (!in_callout)
1052 			RIB_WLOCK(fd->fd_rh);
1053 		schedule_destroy_fd_instance(fd, in_callout);
1054 		if (!in_callout)
1055 			RIB_WUNLOCK(fd->fd_rh);
1056 	}
1057 	NET_EPOCH_EXIT(et);
1058 }
1059 
1060 void
1061 fib_destroy_rib(struct rib_head *rh)
1062 {
1063 
1064 	/*
1065 	 * rnh has `is_dying` flag set, so setup of new fd's will fail at
1066  *  sync_algo() stage, preventing new entries from being added to the list
1067 	 *  of active algos. Remove all existing entries for the particular rib.
1068 	 */
1069 	fib_cleanup_algo(rh, false, false);
1070 }
1071 
1072 /*
1073  * Finalises fd destruction by freeing all fd resources.
1074  */
1075 static void
1076 destroy_fd_instance(struct fib_data *fd)
1077 {
1078 
1079 	FD_PRINTF(LOG_INFO, fd, "destroy fd %p", fd);
1080 
1081 	/* Call destroy callback first */
1082 	if (fd->fd_algo_data != NULL)
1083 		fd->fd_flm->flm_destroy_cb(fd->fd_algo_data);
1084 
1085 	/* Nhop table */
1086 	if ((fd->nh_idx != NULL) && (fd->nh_ref_table != NULL)) {
1087 		for (int i = 0; i < fd->number_nhops; i++) {
1088 			if (!is_idx_free(fd, i)) {
1089 				FD_PRINTF(LOG_DEBUG2, fd, " FREE nhop %d %p",
1090 				    i, fd->nh_idx[i]);
1091 				nhop_free_any(fd->nh_idx[i]);
1092 			}
1093 		}
1094 		free(fd->nh_idx, M_RTABLE);
1095 	}
1096 	if (fd->nh_ref_table != NULL)
1097 		free(fd->nh_ref_table, M_RTABLE);
1098 
1099 	if (fd->fd_ss.fd_change_queue.entries != NULL)
1100 		free(fd->fd_ss.fd_change_queue.entries, M_TEMP);
1101 
1102 	fib_unref_algo(fd->fd_flm);
1103 
1104 	free(fd, M_RTABLE);
1105 }
1106 
1107 /*
1108  * Epoch callback indicating fd is safe to destroy
1109  */
1110 static void
1111 destroy_fd_instance_epoch(epoch_context_t ctx)
1112 {
1113 	struct fib_data *fd;
1114 
1115 	fd = __containerof(ctx, struct fib_data, fd_epoch_ctx);
1116 
1117 	destroy_fd_instance(fd);
1118 }
1119 
1120 /*
1121  * Tries to setup fd instance.
1122  * - Allocates fd/nhop table
1123  * - Runs algo:flm_init_cb algo init
1124  * - Subscribes fd to the rib
1125  * - Runs rtable dump
1126  * - Adds instance to the list of active instances.
1127  *
1128  * Returns: operation result. Fills in @pfd with resulting fd on success.
1129  *
1130  */
1131 static enum flm_op_result
1132 try_setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
1133     struct fib_data *old_fd, struct fib_data **pfd)
1134 {
1135 	struct fib_data *fd;
1136 	size_t size;
1137 	enum flm_op_result result;
1138 
1139 	/* Allocate */
1140 	fd = malloc(sizeof(struct fib_data), M_RTABLE, M_NOWAIT | M_ZERO);
1141 	if (fd == NULL)  {
1142 		*pfd = NULL;
1143 		RH_PRINTF(LOG_INFO, rh, "Unable to allocate fib_data structure");
1144 		return (FLM_REBUILD);
1145 	}
1146 	*pfd = fd;
1147 
1148 	estimate_nhop_scale(old_fd, fd);
1149 
1150 	fd->fd_rh = rh;
1151 	fd->fd_gen = ++fib_gen;
1152 	fd->fd_family = rh->rib_family;
1153 	fd->fd_fibnum = rh->rib_fibnum;
1154 	callout_init_rm(&fd->fd_callout, &rh->rib_lock, 0);
1155 	fd->fd_vnet = curvnet;
1156 	fd->fd_flm = flm;
1157 
1158 	FD_PRINTF(LOG_DEBUG, fd, "allocated fd %p", fd);
1159 
1160 	FIB_MOD_LOCK();
1161 	flm->flm_refcount++;
1162 	FIB_MOD_UNLOCK();
1163 
1164 	/* Allocate nhidx -> nhop_ptr table */
1165 	size = fd->number_nhops * sizeof(void *);
1166 	fd->nh_idx = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
1167 	if (fd->nh_idx == NULL) {
1168 		FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop table idx (sz:%zu)", size);
1169 		return (FLM_REBUILD);
1170 	}
1171 
1172 	/* Allocate nhop index refcount table */
1173 	size = sizeof(struct nhop_ref_table);
1174 	size += fd->number_nhops * sizeof(uint32_t);
1175 	fd->nh_ref_table = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
1176 	if (fd->nh_ref_table == NULL) {
1177 		FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop refcount table (sz:%zu)", size);
1178 		return (FLM_REBUILD);
1179 	}
1180 	FD_PRINTF(LOG_DEBUG, fd, "Allocated %u nhop indexes", fd->number_nhops);
1181 
1182 	/* Okay, we're ready for algo init */
1183 	void *old_algo_data = (old_fd != NULL) ? old_fd->fd_algo_data : NULL;
1184 	result = flm->flm_init_cb(fd->fd_fibnum, fd, old_algo_data, &fd->fd_algo_data);
1185 	if (result != FLM_SUCCESS) {
1186 		FD_PRINTF(LOG_INFO, fd, "%s algo init failed", flm->flm_name);
1187 		return (result);
1188 	}
1189 
1190 	/* Try to subscribe */
1191 	if (flm->flm_change_rib_item_cb != NULL) {
1192 		fd->fd_rs = rib_subscribe_locked(fd->fd_rh,
1193 		    handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE);
1194 		if (fd->fd_rs == NULL) {
1195 			FD_PRINTF(LOG_INFO, fd, "failed to subscribe to the rib changes");
1196 			return (FLM_REBUILD);
1197 		}
1198 	}
1199 
1200 	/* Dump */
1201 	result = sync_algo(fd);
1202 	if (result != FLM_SUCCESS) {
1203 		FD_PRINTF(LOG_INFO, fd, "rib sync failed");
1204 		return (result);
1205 	}
1206 	FD_PRINTF(LOG_INFO, fd, "DUMP completed successfully.");
1207 
1208 	FIB_MOD_LOCK();
1209 	/*
1210 	 * Insert fd at the beginning of the list, to maintain the invariant
1211 	 *  that the first matching entry for the AF/fib is always the active
1212 	 *  one.
1213 	 */
1214 	TAILQ_INSERT_HEAD(&V_fib_data_list, fd, entries);
1215 	fd->fd_linked = true;
1216 	FIB_MOD_UNLOCK();
1217 
1218 	return (FLM_SUCCESS);
1219 }
1220 
1221 /*
1222  * Sets up algo @flm for table @rh and links it to the datapath.
1223  *
1224  */
1225 static enum flm_op_result
1226 setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
1227     struct fib_data *orig_fd, struct fib_data **pfd, bool attach)
1228 {
1229 	struct fib_data *prev_fd, *new_fd;
1230 	enum flm_op_result result;
1231 
1232 	NET_EPOCH_ASSERT();
1233 	RIB_WLOCK_ASSERT(rh);
1234 
1235 	prev_fd = orig_fd;
1236 	new_fd = NULL;
1237 	for (int i = 0; i < FIB_MAX_TRIES; i++) {
1238 		result = try_setup_fd_instance(flm, rh, prev_fd, &new_fd);
1239 
1240 		if ((result == FLM_SUCCESS) && attach)
1241 			result = attach_datapath(new_fd);
1242 
1243 		if ((prev_fd != NULL) && (prev_fd != orig_fd)) {
1244 			schedule_destroy_fd_instance(prev_fd, false);
1245 			prev_fd = NULL;
1246 		}
1247 
1248 		RH_PRINTF(LOG_INFO, rh, "try %d: fib algo result: %s", i,
1249 		    print_op_result(result));
1250 
1251 		if (result == FLM_REBUILD) {
1252 			prev_fd = new_fd;
1253 			new_fd = NULL;
1254 			continue;
1255 		}
1256 
1257 		break;
1258 	}
1259 
1260 	if (result != FLM_SUCCESS) {
1261 		RH_PRINTF(LOG_WARNING, rh,
1262 		    "%s algo instance setup failed, failures=%d", flm->flm_name,
1263 		    orig_fd ? orig_fd->fd_failed_rebuilds + 1 : 0);
1264 		/* update failure count */
1265 		FIB_MOD_LOCK();
1266 		if (orig_fd != NULL)
1267 			orig_fd->fd_failed_rebuilds++;
1268 		FIB_MOD_UNLOCK();
1269 
1270 		/* Ban algo on non-recoverable error */
1271 		if (result == FLM_ERROR)
1272 			flm_error_add(flm, rh->rib_fibnum);
1273 
1274 		if ((prev_fd != NULL) && (prev_fd != orig_fd))
1275 			schedule_destroy_fd_instance(prev_fd, false);
1276 		if (new_fd != NULL) {
1277 			schedule_destroy_fd_instance(new_fd, false);
1278 			new_fd = NULL;
1279 		}
1280 	}
1281 
1282 	*pfd = new_fd;
1283 	return (result);
1284 }
1285 
1286 /*
1287  * Tries to sync algo with the current rtable state, either
1288  * by executing batch update or rebuilding.
1289  * Returns true on success.
1290  */
1291 static bool
1292 execute_callout_action(struct fib_data *fd)
1293 {
1294 	enum fib_callout_action action = fd->fd_callout_action;
1295 	struct fib_lookup_module *flm_new = NULL;
1296 	bool result = true;
1297 
1298 	NET_EPOCH_ASSERT();
1299 	RIB_WLOCK_ASSERT(fd->fd_rh);
1300 
1301 	fd->fd_need_rebuild = false;
1302 	fd->fd_batch = false;
1303 	fd->fd_num_changes = 0;
1304 
1305 	/* First, check if we're still OK to use this algo */
1306 	if (!is_algo_fixed(fd->fd_rh))
1307 		flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm);
1308 	if (flm_new != NULL)
1309 		action = FDA_REBUILD;
1310 
1311 	if (action == FDA_BATCH) {
1312 		/* Try to sync */
1313 		if (!apply_rtable_changes(fd))
1314 			action = FDA_REBUILD;
1315 	}
1316 
1317 	if (action == FDA_REBUILD)
1318 		result = rebuild_fd_flm(fd, flm_new != NULL ? flm_new : fd->fd_flm);
1319 	if (flm_new != NULL)
1320 		fib_unref_algo(flm_new);
1321 
1322 	return (result);
1323 }
1324 
1325 /*
1326  * Callout for all scheduled fd-related work.
1327  * - Checks if the current algo is still the best algo
1328  * - Synchronises algo instance to the rtable (batch usecase)
1329  * - Creates a new instance of an algo for af/fib if desired.
1330  */
1331 static void
1332 handle_fd_callout(void *_data)
1333 {
1334 	struct fib_data *fd = (struct fib_data *)_data;
1335 	struct epoch_tracker et;
1336 
1337 	FD_PRINTF(LOG_INFO, fd, "running callout type=%d", fd->fd_callout_action);
1338 
1339 	NET_EPOCH_ENTER(et);
1340 	CURVNET_SET(fd->fd_vnet);
1341 	execute_callout_action(fd);
1342 	CURVNET_RESTORE();
1343 	NET_EPOCH_EXIT(et);
1344 }
1345 
1346 /*
1347  * Tries to create new algo instance based on @fd data.
1348  * Returns true on success.
1349  */
1350 static bool
1351 rebuild_fd_flm(struct fib_data *fd, struct fib_lookup_module *flm_new)
1352 {
1353 	struct fib_data *fd_new, *fd_tmp = NULL;
1354 	bool result;
1355 
1356 	if (flm_new == fd->fd_flm)
1357 		fd_tmp = fd;
1358 	else
1359 		FD_PRINTF(LOG_NOTICE, fd, "switching algo to %s", flm_new->flm_name);
1360 
1361 	result = setup_fd_instance(flm_new, fd->fd_rh, fd_tmp, &fd_new, true);
1362 	if (result != FLM_SUCCESS) {
1363 		FD_PRINTF(LOG_NOTICE, fd, "table rebuild failed");
1364 		return (false);
1365 	}
1366 	FD_PRINTF(LOG_INFO, fd_new, "switched to new instance");
1367 
1368 	/* Remove old instance */
1369 	schedule_destroy_fd_instance(fd, true);
1370 
1371 	return (true);
1372 }
1373 
1374 static bool
1375 rebuild_fd(struct fib_data *fd, const char *reason)
1376 {
1377 	struct fib_lookup_module *flm_new = NULL;
1378 	bool result;
1379 
1380 	if (!is_algo_fixed(fd->fd_rh))
1381 		flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm);
1382 
1383 	FD_PRINTF(LOG_INFO, fd, "running sync rebuild: %s", reason);
1384 	result = rebuild_fd_flm(fd, flm_new != NULL ? flm_new : fd->fd_flm);
1385 	if (flm_new != NULL)
1386 		fib_unref_algo(flm_new);
1387 
1388 	if (!result) {
1389 		FD_PRINTF(LOG_ERR, fd, "sync rebuild failed");
1390 		schedule_fd_rebuild(fd, "sync rebuild failed");
1391 	}
1392 
1393 	return (result);
1394 }
1395 
1396 /*
1397  * Finds algo by name/family.
1398  * Returns referenced algo or NULL.
1399  */
1400 static struct fib_lookup_module *
1401 fib_find_algo(const char *algo_name, int family)
1402 {
1403 	struct fib_lookup_module *flm;
1404 
1405 	FIB_MOD_LOCK();
1406 	TAILQ_FOREACH(flm, &all_algo_list, entries) {
1407 		if ((strcmp(flm->flm_name, algo_name) == 0) &&
1408 		    (family == flm->flm_family)) {
1409 			flm->flm_refcount++;
1410 			FIB_MOD_UNLOCK();
1411 			return (flm);
1412 		}
1413 	}
1414 	FIB_MOD_UNLOCK();
1415 
1416 	return (NULL);
1417 }
1418 
1419 static void
1420 fib_unref_algo(struct fib_lookup_module *flm)
1421 {
1422 
1423 	FIB_MOD_LOCK();
1424 	flm->flm_refcount--;
1425 	FIB_MOD_UNLOCK();
1426 }
1427 
1428 static int
1429 set_fib_algo(uint32_t fibnum, int family, struct sysctl_oid *oidp, struct sysctl_req *req)
1430 {
1431 	struct fib_lookup_module *flm = NULL;
1432 	struct fib_data *fd = NULL;
1433 	char old_algo_name[32], algo_name[32];
1434 	struct rib_head *rh = NULL;
1435 	enum flm_op_result result;
1436 	struct epoch_tracker et;
1437 	int error;
1438 
1439 	/* Fetch current algo/rib for af/family */
1440 	FIB_MOD_LOCK();
1441 	TAILQ_FOREACH(fd, &V_fib_data_list, entries) {
1442 		if ((fd->fd_family == family) && (fd->fd_fibnum == fibnum))
1443 			break;
1444 	}
1445 	if (fd == NULL) {
1446 		FIB_MOD_UNLOCK();
1447 		return (ENOENT);
1448 	}
1449 	rh = fd->fd_rh;
1450 	strlcpy(old_algo_name, fd->fd_flm->flm_name,
1451 	    sizeof(old_algo_name));
1452 	FIB_MOD_UNLOCK();
1453 
1454 	strlcpy(algo_name, old_algo_name, sizeof(algo_name));
1455 	error = sysctl_handle_string(oidp, algo_name, sizeof(algo_name), req);
1456 	if (error != 0 || req->newptr == NULL)
1457 		return (error);
1458 
1459 	if (strcmp(algo_name, old_algo_name) == 0)
1460 		return (0);
1461 
1462 	/* New algorithm name is different */
1463 	flm = fib_find_algo(algo_name, family);
1464 	if (flm == NULL) {
1465 		RH_PRINTF(LOG_INFO, rh, "unable to find algo %s", algo_name);
1466 		return (ESRCH);
1467 	}
1468 
1469 	fd = NULL;
1470 	NET_EPOCH_ENTER(et);
1471 	RIB_WLOCK(rh);
1472 	result = setup_fd_instance(flm, rh, NULL, &fd, true);
1473 	RIB_WUNLOCK(rh);
1474 	NET_EPOCH_EXIT(et);
1475 	fib_unref_algo(flm);
1476 	if (result != FLM_SUCCESS)
1477 		return (EINVAL);
1478 
1479 	/* Disable automated jumping between algos */
1480 	FIB_MOD_LOCK();
1481 	set_algo_fixed(rh);
1482 	FIB_MOD_UNLOCK();
1483 	/* Remove old instance(s) */
1484 	fib_cleanup_algo(rh, true, false);
1485 
1486 	/* Drain cb so user can unload the module after userret if so desired */
1487 	epoch_drain_callbacks(net_epoch_preempt);
1488 
1489 	return (0);
1490 }
1491 
1492 #ifdef INET
1493 static int
1494 set_algo_inet_sysctl_handler(SYSCTL_HANDLER_ARGS)
1495 {
1496 
1497 	return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET, oidp, req));
1498 }
1499 SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo,
1500     CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
1501     set_algo_inet_sysctl_handler, "A", "Set IPv4 lookup algo");
1502 #endif
1503 
1504 #ifdef INET6
1505 static int
1506 set_algo_inet6_sysctl_handler(SYSCTL_HANDLER_ARGS)
1507 {
1508 
1509 	return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET6, oidp, req));
1510 }
1511 SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo,
1512     CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
1513     set_algo_inet6_sysctl_handler, "A", "Set IPv6 lookup algo");
1514 #endif
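/*
 * Illustrative administrative usage of the handlers above (per-vnet):
 *
 *	# sysctl net.route.algo.inet.algo_list
 *	# sysctl net.route.algo.inet.algo="<one of the listed names>"
 *
 * Setting the algo explicitly pins it via set_algo_fixed(), disabling
 *  automatic algorithm switching for that family in the current vnet.
 */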
1515 
1516 static void
1517 destroy_fdh_epoch(epoch_context_t ctx)
1518 {
1519 	struct fib_dp_header *fdh;
1520 
1521 	fdh = __containerof(ctx, struct fib_dp_header, fdh_epoch_ctx);
1522 	free(fdh, M_RTABLE);
1523 }
1524 
1525 static struct fib_dp_header *
1526 alloc_fib_dp_array(uint32_t num_tables, bool waitok)
1527 {
1528 	size_t sz;
1529 	struct fib_dp_header *fdh;
1530 
1531 	sz = sizeof(struct fib_dp_header);
1532 	sz += sizeof(struct fib_dp) * num_tables;
1533 	fdh = malloc(sz, M_RTABLE, (waitok ? M_WAITOK : M_NOWAIT) | M_ZERO);
1534 	if (fdh != NULL)
1535 		fdh->fdh_num_tables = num_tables;
1536 	return (fdh);
1537 }
1538 
1539 static struct fib_dp_header *
1540 get_fib_dp_header(struct fib_dp *dp)
1541 {
1542 
1543 	return (__containerof((void *)dp, struct fib_dp_header, fdh_idx));
1544 }
1545 
1546 /*
1547  * Replace per-family index pool @pdp with a new one which
1548  * contains updated callback/algo data from @fd.
1549  * Returns FLM_SUCCESS on success.
1550  */
1551 static enum flm_op_result
1552 replace_rtables_family(struct fib_dp **pdp, struct fib_data *fd)
1553 {
1554 	struct fib_dp_header *new_fdh, *old_fdh;
1555 
1556 	NET_EPOCH_ASSERT();
1557 
1558 	FD_PRINTF(LOG_DEBUG, fd, "[vnet %p] replace with f:%p arg:%p",
1559 	    curvnet, fd->fd_dp.f, fd->fd_dp.arg);
1560 
1561 	FIB_MOD_LOCK();
1562 	old_fdh = get_fib_dp_header(*pdp);
1563 	new_fdh = alloc_fib_dp_array(old_fdh->fdh_num_tables, false);
1564 	FD_PRINTF(LOG_DEBUG, fd, "OLD FDH: %p NEW FDH: %p", old_fdh, new_fdh);
1565 	if (new_fdh == NULL) {
1566 		FIB_MOD_UNLOCK();
1567 		FD_PRINTF(LOG_WARNING, fd, "error attaching datapath");
1568 		return (FLM_REBUILD);
1569 	}
1570 
1571 	memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0],
1572 	    old_fdh->fdh_num_tables * sizeof(struct fib_dp));
1573 	/* Update relevant data structure for @fd */
1574 	new_fdh->fdh_idx[fd->fd_fibnum] = fd->fd_dp;
1575 
1576 	/* Ensure memcpy() writes have completed */
1577 	atomic_thread_fence_rel();
1578 	/* Set new datapath pointer */
1579 	*pdp = &new_fdh->fdh_idx[0];
1580 	FIB_MOD_UNLOCK();
1581 	FD_PRINTF(LOG_DEBUG, fd, "update %p -> %p", old_fdh, new_fdh);
1582 
1583 	epoch_call(net_epoch_preempt, destroy_fdh_epoch,
1584 	    &old_fdh->fdh_epoch_ctx);
1585 
1586 	return (FLM_SUCCESS);
1587 }
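/*
 * Reader-side sketch (not part of this file): datapath consumers dereference
 *  the per-family pointer inside a network epoch section, which is why the
 *  swap above only needs the release fence plus an epoch-deferred free of the
 *  old array.  Roughly (the exact lookup prototype lives in fib_algo.h):
 *
 *	NET_EPOCH_ENTER(et);
 *	dp = &(*pdp)[fibnum];
 *	nh = dp->f(dp->arg, key, scopeid);
 *	NET_EPOCH_EXIT(et);
 */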
1588 
1589 static struct fib_dp **
1590 get_family_dp_ptr(int family)
1591 {
1592 	switch (family) {
1593 	case AF_INET:
1594 		return (&V_inet_dp);
1595 	case AF_INET6:
1596 		return (&V_inet6_dp);
1597 	}
1598 	return (NULL);
1599 }
1600 
1601 /*
1602  * Make datapath use fib instance @fd
1603  */
1604 static enum flm_op_result
1605 attach_datapath(struct fib_data *fd)
1606 {
1607 	struct fib_dp **pdp;
1608 
1609 	pdp = get_family_dp_ptr(fd->fd_family);
1610 	return (replace_rtables_family(pdp, fd));
1611 }
1612 
1613 /*
1614  * Grows the per-family datapath pointer array.
1615  * Called from the sysctl handler when the number of routing tables grows.
1616  */
1617 static void
1618 grow_rtables_family(struct fib_dp **pdp, uint32_t new_num_tables)
1619 {
1620 	struct fib_dp_header *new_fdh, *old_fdh = NULL;
1621 
1622 	new_fdh = alloc_fib_dp_array(new_num_tables, true);
1623 
1624 	FIB_MOD_LOCK();
1625 	if (*pdp != NULL) {
1626 		old_fdh = get_fib_dp_header(*pdp);
1627 		memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0],
1628 		    old_fdh->fdh_num_tables * sizeof(struct fib_dp));
1629 	}
1630 
1631 	/* Wait till all writes completed */
1632 	atomic_thread_fence_rel();
1633 
1634 	*pdp = &new_fdh->fdh_idx[0];
1635 	FIB_MOD_UNLOCK();
1636 
1637 	if (old_fdh != NULL)
1638 		epoch_call(net_epoch_preempt, destroy_fdh_epoch,
1639 		    &old_fdh->fdh_epoch_ctx);
1640 }
1641 
1642 /*
1643  * Grows per-AF arrays of datapath pointers for each supported family.
1644  * Called from fibs resize sysctl handler.
1645  */
1646 void
1647 fib_grow_rtables(uint32_t new_num_tables)
1648 {
1649 
1650 #ifdef INET
1651 	grow_rtables_family(get_family_dp_ptr(AF_INET), new_num_tables);
1652 #endif
1653 #ifdef INET6
1654 	grow_rtables_family(get_family_dp_ptr(AF_INET6), new_num_tables);
1655 #endif
1656 }
1657 
1658 void
1659 fib_get_rtable_info(struct rib_head *rh, struct rib_rtable_info *rinfo)
1660 {
1661 
1662 	bzero(rinfo, sizeof(struct rib_rtable_info));
1663 	rinfo->num_prefixes = rh->rnh_prefixes;
1664 	rinfo->num_nhops = nhops_get_count(rh);
1665 #ifdef ROUTE_MPATH
1666 	rinfo->num_nhgrp = nhgrp_get_count(rh);
1667 #endif
1668 }
1669 
1670 /*
1671  * Accessor to get rib instance @fd is attached to.
1672  */
1673 struct rib_head *
1674 fib_get_rh(struct fib_data *fd)
1675 {
1676 
1677 	return (fd->fd_rh);
1678 }
1679 
1680 /*
1681  * Accessor to export idx->nhop array
1682  */
1683 struct nhop_object **
1684 fib_get_nhop_array(struct fib_data *fd)
1685 {
1686 
1687 	return (fd->nh_idx);
1688 }
1689 
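/*
 * Maps a nexthop or nexthop group to its slot in the fd->nh_idx array.
 * With ROUTE_MPATH both kinds share a single index space: plain nexthops
 *  take the even slots (idx * 2) and nexthop groups the odd ones
 *  (idx * 2 - 1), e.g. nhop #3 -> slot 6 while nhgrp #3 -> slot 5.
 */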
1690 static uint32_t
1691 get_nhop_idx(struct nhop_object *nh)
1692 {
1693 #ifdef ROUTE_MPATH
1694 	if (NH_IS_NHGRP(nh))
1695 		return (nhgrp_get_idx((struct nhgrp_object *)nh) * 2 - 1);
1696 	else
1697 		return (nhop_get_idx(nh) * 2);
1698 #else
1699 	return (nhop_get_idx(nh));
1700 #endif
1701 }
1702 
1703 uint32_t
1704 fib_get_nhop_idx(struct fib_data *fd, struct nhop_object *nh)
1705 {
1706 
1707 	return (get_nhop_idx(nh));
1708 }
1709 
1710 static bool
1711 is_idx_free(struct fib_data *fd, uint32_t index)
1712 {
1713 
1714 	return (fd->nh_ref_table->refcnt[index] == 0);
1715 }
1716 
1717 static uint32_t
1718 fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh)
1719 {
1720 	uint32_t idx = get_nhop_idx(nh);
1721 
1722 	if (idx >= fd->number_nhops) {
1723 		fd->hit_nhops = 1;
1724 		return (0);
1725 	}
1726 
1727 	if (is_idx_free(fd, idx)) {
1728 		nhop_ref_any(nh);
1729 		fd->nh_idx[idx] = nh;
1730 		fd->nh_ref_table->count++;
1731 		FD_PRINTF(LOG_DEBUG2, fd, " REF nhop %u %p", idx, fd->nh_idx[idx]);
1732 	}
1733 	fd->nh_ref_table->refcnt[idx]++;
1734 
1735 	return (idx);
1736 }
1737 
1738 struct nhop_release_data {
1739 	struct nhop_object	*nh;
1740 	struct epoch_context	ctx;
1741 };
1742 
1743 static void
1744 release_nhop_epoch(epoch_context_t ctx)
1745 {
1746 	struct nhop_release_data *nrd;
1747 
1748 	nrd = __containerof(ctx, struct nhop_release_data, ctx);
1749 	nhop_free_any(nrd->nh);
1750 	free(nrd, M_TEMP);
1751 }
1752 
1753 /*
1754  * Delays nexthop refcount release.
1755  * The datapath may not have updated its data structures yet, so the old
1756  *  nexthop may still be returned till the end of the current epoch. Delay
1757  *  refcount removal, as we may be removing the last instance, which will
1758  *  trigger nexthop deletion, rendering returned nexthop invalid.
1759  */
1760 static void
1761 fib_schedule_release_nhop(struct fib_data *fd, struct nhop_object *nh)
1762 {
1763 	struct nhop_release_data *nrd;
1764 
1765 	nrd = malloc(sizeof(struct nhop_release_data), M_TEMP, M_NOWAIT | M_ZERO);
1766 	if (nrd != NULL) {
1767 		nrd->nh = nh;
1768 		epoch_call(net_epoch_preempt, release_nhop_epoch, &nrd->ctx);
1769 	} else {
1770 		/*
1771 		 * Unable to allocate memory. Leak nexthop to maintain guarantee
1772 		 *  that each nhop can be referenced.
1773 		 */
1774 		FD_PRINTF(LOG_ERR, fd, "unable to schedule nhop %p deletion", nh);
1775 	}
1776 }
1777 
1778 static void
1779 fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh)
1780 {
1781 	uint32_t idx = get_nhop_idx(nh);
1782 
1783 	KASSERT((idx < fd->number_nhops), ("invalid nhop index"));
1784 	KASSERT((nh == fd->nh_idx[idx]), ("index table contains wrong nh"));
1785 
1786 	fd->nh_ref_table->refcnt[idx]--;
1787 	if (fd->nh_ref_table->refcnt[idx] == 0) {
1788 		FD_PRINTF(LOG_DEBUG, fd, " FREE nhop %d %p", idx, fd->nh_idx[idx]);
1789 		fib_schedule_release_nhop(fd, fd->nh_idx[idx]);
1790 	}
1791 }
1792 
1793 static void
1794 set_algo_fixed(struct rib_head *rh)
1795 {
1796 	switch (rh->rib_family) {
1797 #ifdef INET
1798 	case AF_INET:
1799 		V_algo_fixed_inet = true;
1800 		break;
1801 #endif
1802 #ifdef INET6
1803 	case AF_INET6:
1804 		V_algo_fixed_inet6 = true;
1805 		break;
1806 #endif
1807 	}
1808 }
1809 
1810 static bool
1811 is_algo_fixed(struct rib_head *rh)
1812 {
1813 
1814 	switch (rh->rib_family) {
1815 #ifdef INET
1816 	case AF_INET:
1817 		return (V_algo_fixed_inet);
1818 #endif
1819 #ifdef INET6
1820 	case AF_INET6:
1821 		return (V_algo_fixed_inet6);
1822 #endif
1823 	}
1824 	return (false);
1825 }
1826 
1827 /*
1828  * Runs the check on what would be the best algo for rib @rh, assuming
1829  *  that the current algo is the one specified by @orig_flm. Note that
1830  *  it can be NULL for initial selection.
1831  *
1832  * Returns referenced new algo or NULL if the current one is the best.
1833  */
1834 static struct fib_lookup_module *
1835 fib_check_best_algo(struct rib_head *rh, struct fib_lookup_module *orig_flm)
1836 {
1837 	uint8_t preference, curr_preference = 0, best_preference = 0;
1838 	struct fib_lookup_module *flm, *best_flm = NULL;
1839 	struct rib_rtable_info rinfo;
1840 	int candidate_algos = 0;
1841 
1842 	fib_get_rtable_info(rh, &rinfo);
1843 
1844 	FIB_MOD_LOCK();
1845 	TAILQ_FOREACH(flm, &all_algo_list, entries) {
1846 		if (flm->flm_family != rh->rib_family)
1847 			continue;
1848 		candidate_algos++;
1849 		preference = flm->flm_get_pref(&rinfo);
1850 		if (preference > best_preference) {
1851 			if (!flm_error_check(flm, rh->rib_fibnum)) {
1852 				best_preference = preference;
1853 				best_flm = flm;
1854 			}
1855 		}
1856 		if (flm == orig_flm)
1857 			curr_preference = preference;
1858 	}
1859 	if ((best_flm != NULL) && (curr_preference + BEST_DIFF_PERCENT < best_preference))
1860 		best_flm->flm_refcount++;
1861 	else
1862 		best_flm = NULL;
1863 	FIB_MOD_UNLOCK();
1864 
1865 	RH_PRINTF(LOG_DEBUG, rh, "candidate_algos: %d, curr: %s(%d) result: %s(%d)",
1866 	    candidate_algos, orig_flm ? orig_flm->flm_name : "NULL", curr_preference,
1867 	    best_flm ? best_flm->flm_name : (orig_flm ? orig_flm->flm_name : "NULL"),
1868 	    best_preference);
1869 
1870 	return (best_flm);
1871 }
1872 
1873 /*
1874  * Called when new route table is created.
1875  * Selects, allocates and attaches fib algo for the table.
1876  */
1877 int
1878 fib_select_algo_initial(struct rib_head *rh)
1879 {
1880 	struct fib_lookup_module *flm;
1881 	struct fib_data *fd = NULL;
1882 	enum flm_op_result result;
1883 	struct epoch_tracker et;
1884 	int error = 0;
1885 
1886 	flm = fib_check_best_algo(rh, NULL);
1887 	if (flm == NULL) {
1888 		RH_PRINTF(LOG_CRIT, rh, "no algo selected");
1889 		return (ENOENT);
1890 	}
1891 	RH_PRINTF(LOG_INFO, rh, "selected algo %s", flm->flm_name);
1892 
1893 	NET_EPOCH_ENTER(et);
1894 	RIB_WLOCK(rh);
1895 	result = setup_fd_instance(flm, rh, NULL, &fd, false);
1896 	RIB_WUNLOCK(rh);
1897 	NET_EPOCH_EXIT(et);
1898 
1899 	RH_PRINTF(LOG_DEBUG, rh, "result=%d fd=%p", result, fd);
1900 	if (result == FLM_SUCCESS) {
1901 
1902 		/*
1903 		 * Attach datapath directly to avoid multiple reallocations
1904 		 * during fib growth
1905 		 */
1906 		struct fib_dp_header *fdp;
1907 		struct fib_dp **pdp;
1908 
1909 		pdp = get_family_dp_ptr(rh->rib_family);
1910 		if (pdp != NULL) {
1911 			fdp = get_fib_dp_header(*pdp);
1912 			fdp->fdh_idx[fd->fd_fibnum] = fd->fd_dp;
1913 			FD_PRINTF(LOG_INFO, fd, "datapath attached");
1914 		}
1915 	} else {
1916 		error = EINVAL;
1917 		RH_PRINTF(LOG_CRIT, rh, "unable to setup algo %s", flm->flm_name);
1918 	}
1919 
1920 	fib_unref_algo(flm);
1921 
1922 	return (error);
1923 }
1924 
1925 /*
1926  * Registers fib lookup module within the subsystem.
1927  */
1928 int
1929 fib_module_register(struct fib_lookup_module *flm)
1930 {
1931 
1932 	FIB_MOD_LOCK();
1933 	ALGO_PRINTF("attaching %s to %s", flm->flm_name,
1934 	    print_family(flm->flm_family));
1935 	TAILQ_INSERT_TAIL(&all_algo_list, flm, entries);
1936 	FIB_MOD_UNLOCK();
1937 
1938 	return (0);
1939 }
1940 
1941 /*
1942  * Tries to unregister fib lookup module.
1943  *
1944  * Returns 0 on success, EBUSY if module is still used
1945  *  by some of the tables.
1946  */
1947 int
1948 fib_module_unregister(struct fib_lookup_module *flm)
1949 {
1950 
1951 	FIB_MOD_LOCK();
1952 	if (flm->flm_refcount > 0) {
1953 		FIB_MOD_UNLOCK();
1954 		return (EBUSY);
1955 	}
1956 	fib_error_clear_flm(flm);
1957 	ALGO_PRINTF("detaching %s from %s", flm->flm_name,
1958 	    print_family(flm->flm_family));
1959 	TAILQ_REMOVE(&all_algo_list, flm, entries);
1960 	FIB_MOD_UNLOCK();
1961 
1962 	return (0);
1963 }
1964 
1965 void
1966 vnet_fib_init(void)
1967 {
1968 
1969 	TAILQ_INIT(&V_fib_data_list);
1970 }
1971 
1972 void
1973 vnet_fib_destroy(void)
1974 {
1975 
1976 	FIB_MOD_LOCK();
1977 	fib_error_clear();
1978 	FIB_MOD_UNLOCK();
1979 }
1980