xref: /freebsd/sys/net/route/fib_algo.c (revision b64c5a0ace59af62eff52bfe110a521dc73c937b)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2020 Alexander V. Chernikov
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 #include "opt_inet.h"
30 #include "opt_inet6.h"
31 #include "opt_route.h"
32 
33 #include <sys/param.h>
34 #include <sys/eventhandler.h>
35 #include <sys/kernel.h>
36 #include <sys/sbuf.h>
37 #include <sys/lock.h>
38 #include <sys/rmlock.h>
39 #include <sys/malloc.h>
40 #include <sys/mbuf.h>
41 #include <sys/module.h>
42 #include <sys/kernel.h>
43 #include <sys/priv.h>
44 #include <sys/proc.h>
45 #include <sys/socket.h>
46 #include <sys/socketvar.h>
47 #include <sys/sysctl.h>
48 #include <sys/syslog.h>
49 #include <sys/queue.h>
50 #include <net/vnet.h>
51 
52 #include <net/if.h>
53 #include <net/if_var.h>
54 
55 #include <netinet/in.h>
56 #include <netinet/in_var.h>
57 #include <netinet/ip.h>
58 #include <netinet/ip_var.h>
59 #ifdef INET6
60 #include <netinet/ip6.h>
61 #include <netinet6/ip6_var.h>
62 #endif
63 
64 #include <net/route.h>
65 #include <net/route/nhop.h>
66 #include <net/route/route_ctl.h>
67 #include <net/route/route_var.h>
68 #include <net/route/fib_algo.h>
69 
70 #include <machine/stdarg.h>
71 
72 /*
73  * Fib lookup framework.
74  *
75  * This framework enables accelerated longest-prefix-match lookups for the
76  *  routing tables by adding the ability to dynamically attach/detach lookup
77  *  algorithms implementation to/from the datapath.
78  *
79  * flm - fib lookup modules - implementation of particular lookup algorithm
80  * fd - fib data - instance of an flm bound to specific routing table
81  *
82  * This file provides main framework functionality.
83  *
84  * The following are the features provided by the framework
85  *
86  * 1) nexhops abstraction -> provides transparent referencing, indexing
87  *   and efficient idx->ptr mappings for nexthop and nexthop groups.
88  * 2) Routing table synchronisation
89  * 3) dataplane attachment points
90  * 4) automatic algorithm selection based on the provided preference.
91  *
92  *
93  * DATAPATH
94  * For each supported address family, there is a an allocated array of fib_dp
95  *  structures, indexed by fib number. Each array entry contains callback function
96  *  and its argument. This function will be called with a family-specific lookup key,
97  *  scope and provided argument. This array gets re-created every time when new algo
98  *  instance gets created. Please take a look at the replace_rtables_family() function
99  *  for more details.
100  *
101  */
102 
103 SYSCTL_DECL(_net_route);
104 SYSCTL_NODE(_net_route, OID_AUTO, algo, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
105     "Fib algorithm lookups");
106 
107 /* Algorithm sync policy */
108 
109 /* Time interval to bucket updates */
110 VNET_DEFINE_STATIC(unsigned int, update_bucket_time_ms) = 50;
111 #define	V_update_bucket_time_ms	VNET(update_bucket_time_ms)
112 SYSCTL_UINT(_net_route_algo, OID_AUTO, bucket_time_ms, CTLFLAG_RW | CTLFLAG_VNET,
113     &VNET_NAME(update_bucket_time_ms), 0, "Time interval to calculate update rate");
114 
115 /* Minimum update rate to delay sync */
116 VNET_DEFINE_STATIC(unsigned int, bucket_change_threshold_rate) = 500;
117 #define	V_bucket_change_threshold_rate	VNET(bucket_change_threshold_rate)
118 SYSCTL_UINT(_net_route_algo, OID_AUTO, bucket_change_threshold_rate, CTLFLAG_RW | CTLFLAG_VNET,
119     &VNET_NAME(bucket_change_threshold_rate), 0, "Minimum update rate to delay sync");
120 
121 /* Max allowed delay to sync */
122 VNET_DEFINE_STATIC(unsigned int, fib_max_sync_delay_ms) = 1000;
123 #define	V_fib_max_sync_delay_ms	VNET(fib_max_sync_delay_ms)
124 SYSCTL_UINT(_net_route_algo, OID_AUTO, fib_max_sync_delay_ms, CTLFLAG_RW | CTLFLAG_VNET,
125     &VNET_NAME(fib_max_sync_delay_ms), 0, "Maximum time to delay sync (ms)");
126 
127 
128 #ifdef INET6
129 VNET_DEFINE_STATIC(bool, algo_fixed_inet6) = false;
130 #define	V_algo_fixed_inet6	VNET(algo_fixed_inet6)
131 SYSCTL_NODE(_net_route_algo, OID_AUTO, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
132     "IPv6 longest prefix match lookups");
133 #endif
134 #ifdef INET
135 VNET_DEFINE_STATIC(bool, algo_fixed_inet) = false;
136 #define	V_algo_fixed_inet	VNET(algo_fixed_inet)
137 SYSCTL_NODE(_net_route_algo, OID_AUTO, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
138     "IPv4 longest prefix match lookups");
139 #endif
140 
141 /* Fib instance counter */
142 static uint32_t fib_gen = 0;
143 
144 struct nhop_ref_table {
145 	uint32_t		count;
146 	int32_t			refcnt[0];
147 };
148 
149 enum fib_callout_action {
150 	FDA_NONE,	/* No callout scheduled */
151 	FDA_REBUILD,	/* Asks to rebuild algo instance */
152 	FDA_EVAL,	/* Asks to evaluate if the current algo is still be best */
153 	FDA_BATCH,	/* Asks to submit batch of updates to the algo */
154 };
155 
156 struct fib_sync_status {
157 	struct timeval		diverge_time;	/* ts when diverged */
158 	uint32_t		num_changes;	/* number of changes since sync */
159 	uint32_t		bucket_changes;	/* num changes within the current bucket */
160 	uint64_t		bucket_id;	/* 50ms bucket # */
161 	struct fib_change_queue	fd_change_queue;/* list of scheduled entries */
162 };
163 
164 /*
165  * Data structure for the fib lookup instance tied to the particular rib.
166  */
167 struct fib_data {
168 	uint32_t		number_nhops;	/* current # of nhops */
169 	uint8_t			hit_nhops;	/* true if out of nhop limit */
170 	uint8_t			init_done;	/* true if init is competed */
171 	uint32_t		fd_dead:1;	/* Scheduled for deletion */
172 	uint32_t		fd_linked:1;	/* true if linked */
173 	uint32_t		fd_need_rebuild:1;	/* true if rebuild scheduled */
174 	uint32_t		fd_batch:1;	/* true if batched notification scheduled */
175 	uint8_t			fd_family;	/* family */
176 	uint32_t		fd_fibnum;	/* fibnum */
177 	uint32_t		fd_failed_rebuilds;	/* stat: failed rebuilds */
178 	uint32_t		fd_gen;		/* instance gen# */
179 	struct callout		fd_callout;	/* rebuild callout */
180 	enum fib_callout_action	fd_callout_action;	/* Callout action to take */
181 	void			*fd_algo_data;	/* algorithm data */
182 	struct nhop_object	**nh_idx;	/* nhop idx->ptr array */
183 	struct nhop_ref_table	*nh_ref_table;	/* array with # of nhop references */
184 	struct rib_head		*fd_rh;		/* RIB table we're attached to */
185 	struct rib_subscription	*fd_rs;		/* storing table subscription */
186 	struct fib_dp		fd_dp;		/* fib datapath data */
187 	struct vnet		*fd_vnet;	/* vnet fib belongs to */
188 	struct epoch_context	fd_epoch_ctx;	/* epoch context for deletion */
189 	struct fib_lookup_module	*fd_flm;/* pointer to the lookup module */
190 	struct fib_sync_status	fd_ss;		/* State relevant to the rib sync  */
191 	uint32_t		fd_num_changes;	/* number of changes since last callout */
192 	TAILQ_ENTRY(fib_data)	entries;	/* list of all fds in vnet */
193 };
194 
195 static bool rebuild_fd(struct fib_data *fd, const char *reason);
196 static bool rebuild_fd_flm(struct fib_data *fd, struct fib_lookup_module *flm_new);
197 static void handle_fd_callout(void *_data);
198 static void destroy_fd_instance_epoch(epoch_context_t ctx);
199 static bool is_idx_free(struct fib_data *fd, uint32_t index);
200 static void set_algo_fixed(struct rib_head *rh);
201 static bool is_algo_fixed(struct rib_head *rh);
202 
203 static uint32_t fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh);
204 static void fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh);
205 
206 static struct fib_lookup_module *fib_check_best_algo(struct rib_head *rh,
207     struct fib_lookup_module *orig_flm);
208 static void fib_unref_algo(struct fib_lookup_module *flm);
209 static bool flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum);
210 
211 struct mtx fib_mtx;
212 #define	FIB_MOD_LOCK()		mtx_lock(&fib_mtx)
213 #define	FIB_MOD_UNLOCK()	mtx_unlock(&fib_mtx)
214 #define	FIB_MOD_LOCK_ASSERT()	mtx_assert(&fib_mtx, MA_OWNED)
215 
216 MTX_SYSINIT(fib_mtx, &fib_mtx, "algo list mutex", MTX_DEF);
217 
218 /* Algorithm has to be this percent better than the current to switch */
219 #define	BEST_DIFF_PERCENT	(5 * 256 / 100)
220 /* Schedule algo re-evaluation X seconds after a change */
221 #define	ALGO_EVAL_DELAY_MS	30000
222 /* Force algo re-evaluation after X changes */
223 #define	ALGO_EVAL_NUM_ROUTES	100
224 /* Try to setup algorithm X times */
225 #define	FIB_MAX_TRIES		32
226 /* Max amount of supported nexthops */
227 #define	FIB_MAX_NHOPS		262144
228 #define	FIB_CALLOUT_DELAY_MS	50
229 
230 
231 /* Debug */
232 static int flm_debug_level = LOG_NOTICE;
233 SYSCTL_INT(_net_route_algo, OID_AUTO, debug_level, CTLFLAG_RW | CTLFLAG_RWTUN,
234     &flm_debug_level, 0, "debuglevel");
235 #define	FLM_MAX_DEBUG_LEVEL	LOG_DEBUG
236 #ifndef	LOG_DEBUG2
237 #define	LOG_DEBUG2	8
238 #endif
239 
240 #define	_PASS_MSG(_l)	(flm_debug_level >= (_l))
241 #define	ALGO_PRINTF(_l, _fmt, ...)	if (_PASS_MSG(_l)) {		\
242 	printf("[fib_algo] %s: " _fmt "\n", __func__, ##__VA_ARGS__);	\
243 }
244 #define	_ALGO_PRINTF(_fib, _fam, _aname, _gen, _func, _fmt, ...) \
245     printf("[fib_algo] %s.%u (%s#%u) %s: " _fmt "\n",\
246     print_family(_fam), _fib, _aname, _gen, _func, ## __VA_ARGS__)
247 #define	_RH_PRINTF(_fib, _fam, _func, _fmt, ...) \
248     printf("[fib_algo] %s.%u %s: " _fmt "\n", print_family(_fam), _fib, _func, ## __VA_ARGS__)
249 #define	RH_PRINTF(_l, _rh, _fmt, ...)	if (_PASS_MSG(_l)) {	\
250     _RH_PRINTF(_rh->rib_fibnum, _rh->rib_family, __func__, _fmt, ## __VA_ARGS__);\
251 }
252 #define	FD_PRINTF(_l, _fd, _fmt, ...)	FD_PRINTF_##_l(_l, _fd, _fmt, ## __VA_ARGS__)
253 #define	_FD_PRINTF(_l, _fd, _fmt, ...)	if (_PASS_MSG(_l)) {		\
254     _ALGO_PRINTF(_fd->fd_fibnum, _fd->fd_family, _fd->fd_flm->flm_name,	\
255     _fd->fd_gen, __func__, _fmt, ## __VA_ARGS__);			\
256 }
257 #if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG2
258 #define	FD_PRINTF_LOG_DEBUG2	_FD_PRINTF
259 #else
260 #define	FD_PRINTF_LOG_DEBUG2(_l, _fd, _fmt, ...)
261 #endif
262 #if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG
263 #define	FD_PRINTF_LOG_DEBUG	_FD_PRINTF
264 #else
265 #define	FD_PRINTF_LOG_DEBUG()
266 #endif
267 #if FLM_MAX_DEBUG_LEVEL>=LOG_INFO
268 #define	FD_PRINTF_LOG_INFO	_FD_PRINTF
269 #else
270 #define	FD_PRINTF_LOG_INFO()
271 #endif
272 #define	FD_PRINTF_LOG_NOTICE	_FD_PRINTF
273 #define	FD_PRINTF_LOG_ERR	_FD_PRINTF
274 #define	FD_PRINTF_LOG_WARNING	_FD_PRINTF
275 
276 
277 /* List of all registered lookup algorithms */
278 static TAILQ_HEAD(, fib_lookup_module) all_algo_list = TAILQ_HEAD_INITIALIZER(all_algo_list);
279 
280 /* List of all fib lookup instances in the vnet */
281 VNET_DEFINE_STATIC(TAILQ_HEAD(fib_data_head, fib_data), fib_data_list);
282 #define	V_fib_data_list	VNET(fib_data_list)
283 
284 /* Datastructure for storing non-transient fib lookup module failures */
285 struct fib_error {
286 	int				fe_family;
287 	uint32_t			fe_fibnum;	/* failed rtable */
288 	struct fib_lookup_module	*fe_flm;	/* failed module */
289 	TAILQ_ENTRY(fib_error)		entries;/* list of all errored entries */
290 };
291 VNET_DEFINE_STATIC(TAILQ_HEAD(fib_error_head, fib_error), fib_error_list);
292 #define	V_fib_error_list VNET(fib_error_list)
293 
294 /* Per-family array of fibnum -> {func, arg} mappings used in datapath */
295 struct fib_dp_header {
296 	struct epoch_context	fdh_epoch_ctx;
297 	uint32_t		fdh_num_tables;
298 	struct fib_dp		fdh_idx[0];
299 };
300 
301 /*
302  * Tries to add new non-transient algorithm error to the list of
303  *  errors.
304  * Returns true on success.
305  */
306 static bool
307 flm_error_add(struct fib_lookup_module *flm, uint32_t fibnum)
308 {
309 	struct fib_error *fe;
310 
311 	fe = malloc(sizeof(struct fib_error), M_TEMP, M_NOWAIT | M_ZERO);
312 	if (fe == NULL)
313 		return (false);
314 	fe->fe_flm = flm;
315 	fe->fe_family = flm->flm_family;
316 	fe->fe_fibnum = fibnum;
317 
318 	FIB_MOD_LOCK();
319 	/* Avoid duplicates by checking if error already exists first */
320 	if (flm_error_check(flm, fibnum)) {
321 		FIB_MOD_UNLOCK();
322 		free(fe, M_TEMP);
323 		return (true);
324 	}
325 	TAILQ_INSERT_HEAD(&V_fib_error_list, fe, entries);
326 	FIB_MOD_UNLOCK();
327 
328 	return (true);
329 }
330 
331 /*
332  * True if non-transient error has been registered for @flm in @fibnum.
333  */
334 static bool
335 flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum)
336 {
337 	const struct fib_error *fe;
338 
339 	TAILQ_FOREACH(fe, &V_fib_error_list, entries) {
340 		if ((fe->fe_flm == flm) && (fe->fe_fibnum == fibnum))
341 			return (true);
342 	}
343 
344 	return (false);
345 }
346 
347 /*
348  * Clear all errors of algo specified by @flm.
349  */
350 static void
351 fib_error_clear_flm(struct fib_lookup_module *flm)
352 {
353 	struct fib_error *fe, *fe_tmp;
354 
355 	FIB_MOD_LOCK_ASSERT();
356 
357 	TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
358 		if (fe->fe_flm == flm) {
359 			TAILQ_REMOVE(&V_fib_error_list, fe, entries);
360 			free(fe, M_TEMP);
361 		}
362 	}
363 }
364 
365 /*
366  * Clears all errors in current VNET.
367  */
368 static void
369 fib_error_clear(void)
370 {
371 	struct fib_error *fe, *fe_tmp;
372 
373 	FIB_MOD_LOCK_ASSERT();
374 
375 	TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
376 		TAILQ_REMOVE(&V_fib_error_list, fe, entries);
377 		free(fe, M_TEMP);
378 	}
379 }
380 
381 static const char *
382 print_op_result(enum flm_op_result result)
383 {
384 	switch (result) {
385 	case FLM_SUCCESS:
386 		return "success";
387 	case FLM_REBUILD:
388 		return "rebuild";
389 	case FLM_BATCH:
390 		return "batch";
391 	case FLM_ERROR:
392 		return "error";
393 	}
394 
395 	return "unknown";
396 }
397 
398 static const char *
399 print_family(int family)
400 {
401 
402 	if (family == AF_INET)
403 		return ("inet");
404 	else if (family == AF_INET6)
405 		return ("inet6");
406 	else
407 		return ("unknown");
408 }
409 
410 /*
411  * Debug function used by lookup algorithms.
412  * Outputs message denoted by @fmt, prepended by "[fib_algo] inetX.Y (algo) "
413  */
414 void
415 fib_printf(int level, struct fib_data *fd, const char *func, char *fmt, ...)
416 {
417 	char buf[128];
418 	va_list ap;
419 
420 	if (level > flm_debug_level)
421 		return;
422 
423 	va_start(ap, fmt);
424 	vsnprintf(buf, sizeof(buf), fmt, ap);
425 	va_end(ap);
426 
427 	_ALGO_PRINTF(fd->fd_fibnum, fd->fd_family, fd->fd_flm->flm_name,
428 	    fd->fd_gen, func, "%s", buf);
429 }
430 
431 /*
432  * Outputs list of algorithms supported by the provided address family.
433  */
434 static int
435 print_algos_sysctl(struct sysctl_req *req, int family)
436 {
437 	struct fib_lookup_module *flm;
438 	struct sbuf sbuf;
439 	int error, count = 0;
440 
441 	error = sysctl_wire_old_buffer(req, 0);
442 	if (error == 0) {
443 		sbuf_new_for_sysctl(&sbuf, NULL, 512, req);
444 		TAILQ_FOREACH(flm, &all_algo_list, entries) {
445 			if (flm->flm_family == family) {
446 				if (count++ > 0)
447 					sbuf_cat(&sbuf, ", ");
448 				sbuf_cat(&sbuf, flm->flm_name);
449 			}
450 		}
451 		error = sbuf_finish(&sbuf);
452 		sbuf_delete(&sbuf);
453 	}
454 	return (error);
455 }
456 
457 #ifdef INET6
458 static int
459 print_algos_sysctl_inet6(SYSCTL_HANDLER_ARGS)
460 {
461 
462 	return (print_algos_sysctl(req, AF_INET6));
463 }
464 SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo_list,
465     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
466     print_algos_sysctl_inet6, "A", "List of IPv6 lookup algorithms");
467 #endif
468 
469 #ifdef INET
470 static int
471 print_algos_sysctl_inet(SYSCTL_HANDLER_ARGS)
472 {
473 
474 	return (print_algos_sysctl(req, AF_INET));
475 }
476 SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo_list,
477     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
478     print_algos_sysctl_inet, "A", "List of IPv4 lookup algorithms");
479 #endif
480 
481 /*
482  * Calculate delay between repeated failures.
483  * Returns current delay in milliseconds.
484  */
485 static uint32_t
486 callout_calc_delay_ms(struct fib_data *fd)
487 {
488 	uint32_t shift;
489 
490 	if (fd->fd_failed_rebuilds > 10)
491 		shift = 10;
492 	else
493 		shift = fd->fd_failed_rebuilds;
494 
495 	return ((1 << shift) * FIB_CALLOUT_DELAY_MS);
496 }
497 
498 static void
499 schedule_callout(struct fib_data *fd, enum fib_callout_action action, int delay_ms)
500 {
501 
502 	FD_PRINTF(LOG_DEBUG, fd, "delay=%d action=%d", delay_ms, action);
503 	fd->fd_callout_action = action;
504 	callout_reset_sbt(&fd->fd_callout, SBT_1MS * delay_ms, 0,
505 	    handle_fd_callout, fd, 0);
506 }
507 
508 static void
509 schedule_fd_rebuild(struct fib_data *fd, const char *reason)
510 {
511 
512 	RIB_WLOCK_ASSERT(fd->fd_rh);
513 
514 	if (!fd->fd_need_rebuild) {
515 		fd->fd_need_rebuild = true;
516 		/* Stop batch updates */
517 		fd->fd_batch = false;
518 
519 		/*
520 		 * Potentially re-schedules pending callout
521 		 *  initiated by schedule_algo_eval.
522 		 */
523 		FD_PRINTF(LOG_INFO, fd, "Scheduling rebuild: %s (failures=%d)",
524 		    reason, fd->fd_failed_rebuilds);
525 		schedule_callout(fd, FDA_REBUILD, callout_calc_delay_ms(fd));
526 	}
527 }
528 
529 static void
530 sync_rib_gen(struct fib_data *fd)
531 {
532 	FD_PRINTF(LOG_DEBUG, fd, "Sync gen %u -> %u", fd->fd_rh->rnh_gen, fd->fd_rh->rnh_gen_rib);
533 	fd->fd_rh->rnh_gen = fd->fd_rh->rnh_gen_rib;
534 }
535 
536 static int64_t
537 get_tv_diff_ms(const struct timeval *old_tv, const struct timeval *new_tv)
538 {
539 	int64_t diff = 0;
540 
541 	diff = ((int64_t)(new_tv->tv_sec - old_tv->tv_sec)) * 1000;
542 	diff += (new_tv->tv_usec - old_tv->tv_usec) / 1000;
543 
544 	return (diff);
545 }
546 
547 static void
548 add_tv_diff_ms(struct timeval *tv, int ms)
549 {
550 	tv->tv_sec += ms / 1000;
551 	ms = ms % 1000;
552 	if (ms * 1000 + tv->tv_usec < 1000000)
553 		tv->tv_usec += ms * 1000;
554 	else {
555 		tv->tv_sec += 1;
556 		tv->tv_usec = ms * 1000 + tv->tv_usec - 1000000;
557 	}
558 }
559 
560 /*
561  * Marks the time when algo state diverges from the rib state.
562  */
563 static void
564 mark_diverge_time(struct fib_data *fd)
565 {
566 	struct fib_sync_status *fd_ss = &fd->fd_ss;
567 
568 	getmicrouptime(&fd_ss->diverge_time);
569 	fd_ss->bucket_id = 0;
570 	fd_ss->bucket_changes = 0;
571 }
572 
573 /*
574  * Calculates and updates the next algorithm sync time, based on the current activity.
575  *
576  * The intent is to provide reasonable balance between the update
577  *  latency and efficient batching when changing large amount of routes.
578  *
579  * High-level algorithm looks the following:
580  * 1) all changes are bucketed in 50ms intervals
581  * 2) If amount of changes within the bucket is greater than the threshold,
582  *   the update gets delayed, up to maximum delay threshold.
583  */
584 static void
585 update_rebuild_delay(struct fib_data *fd, enum fib_callout_action action)
586 {
587 	uint32_t bucket_id, new_delay = 0;
588 	struct timeval tv;
589 
590 	/* Fetch all variables at once to ensure consistent reads */
591 	uint32_t bucket_time_ms = V_update_bucket_time_ms;
592 	uint32_t threshold_rate = V_bucket_change_threshold_rate;
593 	uint32_t max_delay_ms = V_fib_max_sync_delay_ms;
594 
595 	if (bucket_time_ms == 0)
596 		bucket_time_ms = 50;
597 	/* calculate per-bucket threshold rate */
598 	threshold_rate = threshold_rate * bucket_time_ms / 1000;
599 
600 	getmicrouptime(&tv);
601 
602 	struct fib_sync_status *fd_ss = &fd->fd_ss;
603 
604 	bucket_id = get_tv_diff_ms(&fd_ss->diverge_time, &tv) / bucket_time_ms;
605 
606 	if (fd_ss->bucket_id == bucket_id) {
607 		fd_ss->bucket_changes++;
608 		if (fd_ss->bucket_changes == threshold_rate) {
609 			new_delay = (bucket_id + 2) * bucket_time_ms;
610 			if (new_delay <= max_delay_ms) {
611 				FD_PRINTF(LOG_DEBUG, fd,
612 				    "hit threshold of %u routes, delay update,"
613 				    "bucket: %u, total delay: %u",
614 				    threshold_rate, bucket_id + 1, new_delay);
615 			} else {
616 				new_delay = 0;
617 				FD_PRINTF(LOG_DEBUG, fd,
618 				    "maximum sync delay (%u ms) reached", max_delay_ms);
619 			}
620 		} else if ((bucket_id == 0) && (fd_ss->bucket_changes == 1))
621 			new_delay = bucket_time_ms;
622 	} else {
623 		fd_ss->bucket_id = bucket_id;
624 		fd_ss->bucket_changes = 1;
625 	}
626 
627 	if (new_delay > 0) {
628 		/* Calculated time has been updated */
629 		struct timeval new_tv = fd_ss->diverge_time;
630 		add_tv_diff_ms(&new_tv, new_delay);
631 
632 		int32_t delay_ms = get_tv_diff_ms(&tv, &new_tv);
633 		schedule_callout(fd, action, delay_ms);
634 	}
635 }
636 
637 static void
638 update_algo_state(struct fib_data *fd)
639 {
640 
641 	RIB_WLOCK_ASSERT(fd->fd_rh);
642 
643 	if (fd->fd_batch || fd->fd_need_rebuild) {
644 		enum fib_callout_action action = fd->fd_need_rebuild ? FDA_REBUILD : FDA_BATCH;
645 		update_rebuild_delay(fd, action);
646 		return;
647 	}
648 
649 	if (fd->fd_num_changes++ == 0) {
650 		/* Start callout to consider switch */
651 		if (!callout_pending(&fd->fd_callout))
652 			schedule_callout(fd, FDA_EVAL, ALGO_EVAL_DELAY_MS);
653 	} else if (fd->fd_num_changes == ALGO_EVAL_NUM_ROUTES) {
654 		/* Reset callout to exec immediately */
655 		if (fd->fd_callout_action == FDA_EVAL)
656 			schedule_callout(fd, FDA_EVAL, 1);
657 	}
658 }
659 
660 static bool
661 need_immediate_sync(struct fib_data *fd, struct rib_cmd_info *rc)
662 {
663 	struct nhop_object *nh;
664 
665 	/* Sync addition/removal of interface routes */
666 	switch (rc->rc_cmd) {
667 	case RTM_ADD:
668 		nh = rc->rc_nh_new;
669 		if (!NH_IS_NHGRP(nh)) {
670 			if (!(nh->nh_flags & NHF_GATEWAY))
671 				return (true);
672 			if (nhop_get_rtflags(nh) & RTF_STATIC)
673 				return (true);
674 		}
675 		break;
676 	case RTM_DELETE:
677 		nh = rc->rc_nh_old;
678 		if (!NH_IS_NHGRP(nh)) {
679 			if (!(nh->nh_flags & NHF_GATEWAY))
680 				return (true);
681 			if (nhop_get_rtflags(nh) & RTF_STATIC)
682 				return (true);
683 		}
684 		break;
685 	}
686 
687 	return (false);
688 }
689 
690 static bool
691 apply_rtable_changes(struct fib_data *fd)
692 {
693 	enum flm_op_result result;
694 	struct fib_change_queue *q = &fd->fd_ss.fd_change_queue;
695 
696 	result = fd->fd_flm->flm_change_rib_items_cb(fd->fd_rh, q, fd->fd_algo_data);
697 
698 	if (result == FLM_SUCCESS) {
699 		sync_rib_gen(fd);
700 		for (int i = 0; i < q->count; i++)
701 			if (q->entries[i].nh_old)
702 				fib_unref_nhop(fd, q->entries[i].nh_old);
703 		q->count = 0;
704 	}
705 	fd->fd_batch = false;
706 
707 	return (result == FLM_SUCCESS);
708 }
709 
710 static bool
711 fill_change_entry(struct fib_data *fd, struct fib_change_entry *ce, struct rib_cmd_info *rc)
712 {
713 	int plen = 0;
714 
715 	switch (fd->fd_family) {
716 #ifdef INET
717 	case AF_INET:
718 		rt_get_inet_prefix_plen(rc->rc_rt, &ce->addr4, &plen, &ce->scopeid);
719 		break;
720 #endif
721 #ifdef INET6
722 	case AF_INET6:
723 		rt_get_inet6_prefix_plen(rc->rc_rt, &ce->addr6, &plen, &ce->scopeid);
724 		break;
725 #endif
726 	}
727 
728 	ce->plen = plen;
729 	ce->nh_old = rc->rc_nh_old;
730 	ce->nh_new = rc->rc_nh_new;
731 	if (ce->nh_new != NULL) {
732 		if (fib_ref_nhop(fd, ce->nh_new) == 0)
733 			return (false);
734 	}
735 
736 	return (true);
737 }
738 
739 static bool
740 queue_rtable_change(struct fib_data *fd, struct rib_cmd_info *rc)
741 {
742 	struct fib_change_queue *q = &fd->fd_ss.fd_change_queue;
743 
744 	if (q->count >= q->size) {
745 		uint32_t q_size;
746 
747 		if (q->size == 0)
748 			q_size = 256; /* ~18k memory */
749 		else
750 			q_size = q->size * 2;
751 
752 		size_t size = q_size * sizeof(struct fib_change_entry);
753 		void *a = realloc(q->entries, size, M_TEMP, M_NOWAIT | M_ZERO);
754 		if (a == NULL) {
755 			FD_PRINTF(LOG_INFO, fd, "Unable to realloc queue for %u elements",
756 			    q_size);
757 			return (false);
758 		}
759 		q->entries = a;
760 		q->size = q_size;
761 	}
762 
763 	return (fill_change_entry(fd, &q->entries[q->count++], rc));
764 }
765 
766 /*
767  * Rib subscription handler. Checks if the algorithm is ready to
768  *  receive updates, handles nexthop refcounting and passes change
769  *  data to the algorithm callback.
770  */
771 static void
772 handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
773     void *_data)
774 {
775 	struct fib_data *fd = (struct fib_data *)_data;
776 	enum flm_op_result result;
777 
778 	RIB_WLOCK_ASSERT(rnh);
779 
780 	/*
781 	 * There is a small gap between subscribing for route changes
782 	 *  and initiating rtable dump. Avoid receiving route changes
783 	 *  prior to finishing rtable dump by checking `init_done`.
784 	 */
785 	if (!fd->init_done)
786 		return;
787 
788 	bool immediate_sync = need_immediate_sync(fd, rc);
789 
790 	/* Consider scheduling algorithm re-evaluation */
791 	update_algo_state(fd);
792 
793 	/*
794 	 * If algo requested rebuild, stop sending updates by default.
795 	 * This simplifies nexthop refcount handling logic.
796 	 */
797 	if (fd->fd_need_rebuild) {
798 		if (immediate_sync)
799 			rebuild_fd(fd, "rtable change type enforced sync");
800 		return;
801 	}
802 
803 	/*
804 	 * Algo requested updates to be delivered in batches.
805 	 * Add the current change to the queue and return.
806 	 */
807 	if (fd->fd_batch) {
808 		if (immediate_sync) {
809 			if (!queue_rtable_change(fd, rc) || !apply_rtable_changes(fd))
810 				rebuild_fd(fd, "batch sync failed");
811 		} else {
812 			if (!queue_rtable_change(fd, rc))
813 				schedule_fd_rebuild(fd, "batch queue failed");
814 		}
815 		return;
816 	}
817 
818 	/*
819 	 * Maintain guarantee that every nexthop returned by the dataplane
820 	 *  lookup has > 0 refcount, so can be safely referenced within current
821 	 *  epoch.
822 	 */
823 	if (rc->rc_nh_new != NULL) {
824 		if (fib_ref_nhop(fd, rc->rc_nh_new) == 0) {
825 			/* ran out of indexes */
826 			schedule_fd_rebuild(fd, "ran out of nhop indexes");
827 			return;
828 		}
829 	}
830 
831 	result = fd->fd_flm->flm_change_rib_item_cb(rnh, rc, fd->fd_algo_data);
832 
833 	switch (result) {
834 	case FLM_SUCCESS:
835 		sync_rib_gen(fd);
836 		/* Unref old nexthop on success */
837 		if (rc->rc_nh_old != NULL)
838 			fib_unref_nhop(fd, rc->rc_nh_old);
839 		break;
840 	case FLM_BATCH:
841 
842 		/*
843 		 * Algo asks to batch the changes.
844 		 */
845 		if (queue_rtable_change(fd, rc)) {
846 			if (!immediate_sync) {
847 				fd->fd_batch = true;
848 				mark_diverge_time(fd);
849 				update_rebuild_delay(fd, FDA_BATCH);
850 				break;
851 			}
852 			if (apply_rtable_changes(fd))
853 				break;
854 		}
855 		FD_PRINTF(LOG_ERR, fd, "batched sync failed, force the rebuild");
856 
857 	case FLM_REBUILD:
858 
859 		/*
860 		 * Algo is not able to apply the update.
861 		 * Schedule algo rebuild.
862 		 */
863 		if (!immediate_sync) {
864 			mark_diverge_time(fd);
865 			schedule_fd_rebuild(fd, "algo requested rebuild");
866 			break;
867 		}
868 
869 		FD_PRINTF(LOG_INFO, fd, "running sync rebuild");
870 		rebuild_fd(fd, "rtable change type enforced sync");
871 		break;
872 	case FLM_ERROR:
873 
874 		/*
875 		 * Algo reported a non-recoverable error.
876 		 * Record the error and schedule rebuild, which will
877 		 *  trigger best algo selection.
878 		 */
879 		FD_PRINTF(LOG_ERR, fd, "algo reported non-recoverable error");
880 		if (!flm_error_add(fd->fd_flm, fd->fd_fibnum))
881 			FD_PRINTF(LOG_ERR, fd, "failed to ban algo");
882 		schedule_fd_rebuild(fd, "algo reported non-recoverable error");
883 	}
884 }
885 
886 static void
887 estimate_nhop_scale(const struct fib_data *old_fd, struct fib_data *fd)
888 {
889 
890 	if (old_fd == NULL) {
891 		// TODO: read from rtable
892 		fd->number_nhops = 16;
893 		return;
894 	}
895 
896 	if (old_fd->hit_nhops && old_fd->number_nhops < FIB_MAX_NHOPS)
897 		fd->number_nhops = 2 * old_fd->number_nhops;
898 	else
899 		fd->number_nhops = old_fd->number_nhops;
900 }
901 
902 struct walk_cbdata {
903 	struct fib_data		*fd;
904 	flm_dump_t		*func;
905 	enum flm_op_result	result;
906 };
907 
908 /*
909  * Handler called after all rtenties have been dumped.
910  * Performs post-dump framework checks and calls
911  * algo:flm_dump_end_cb().
912  *
913  * Updates walk_cbdata result.
914  */
915 static void
916 sync_algo_end_cb(struct rib_head *rnh, enum rib_walk_hook stage, void *_data)
917 {
918 	struct walk_cbdata *w = (struct walk_cbdata *)_data;
919 	struct fib_data *fd = w->fd;
920 
921 	RIB_WLOCK_ASSERT(w->fd->fd_rh);
922 
923 	if (rnh->rib_dying) {
924 		w->result = FLM_ERROR;
925 		return;
926 	}
927 
928 	if (fd->hit_nhops) {
929 		FD_PRINTF(LOG_INFO, fd, "ran out of nexthops at %u nhops",
930 		    fd->nh_ref_table->count);
931 		if (w->result == FLM_SUCCESS)
932 			w->result = FLM_REBUILD;
933 		return;
934 	}
935 
936 	if (stage != RIB_WALK_HOOK_POST || w->result != FLM_SUCCESS)
937 		return;
938 
939 	/* Post-dump hook, dump successful */
940 	w->result = fd->fd_flm->flm_dump_end_cb(fd->fd_algo_data, &fd->fd_dp);
941 
942 	if (w->result == FLM_SUCCESS) {
943 		/* Mark init as done to allow routing updates */
944 		fd->init_done = 1;
945 	}
946 }
947 
948 /*
949  * Callback for each entry in rib.
950  * Calls algo:flm_dump_rib_item_cb func as a part of initial
951  *  route table synchronisation.
952  */
953 static int
954 sync_algo_cb(struct rtentry *rt, void *_data)
955 {
956 	struct walk_cbdata *w = (struct walk_cbdata *)_data;
957 
958 	RIB_WLOCK_ASSERT(w->fd->fd_rh);
959 
960 	if (w->result == FLM_SUCCESS && w->func) {
961 
962 		/*
963 		 * Reference nexthops to maintain guarantee that
964 		 *  each nexthop returned by datapath has > 0 references
965 		 *  and can be safely referenced within current epoch.
966 		 */
967 		struct nhop_object *nh = rt_get_raw_nhop(rt);
968 		if (fib_ref_nhop(w->fd, nh) != 0)
969 			w->result = w->func(rt, w->fd->fd_algo_data);
970 		else
971 			w->result = FLM_REBUILD;
972 	}
973 
974 	return (0);
975 }
976 
977 /*
978  * Dump all routing table state to the algo instance.
979  */
980 static enum flm_op_result
981 sync_algo(struct fib_data *fd)
982 {
983 	struct walk_cbdata w = {
984 		.fd = fd,
985 		.func = fd->fd_flm->flm_dump_rib_item_cb,
986 		.result = FLM_SUCCESS,
987 	};
988 
989 	rib_walk_ext_locked(fd->fd_rh, sync_algo_cb, sync_algo_end_cb, &w);
990 
991 	FD_PRINTF(LOG_INFO, fd,
992 	    "initial dump completed (rtable version: %d), result: %s",
993 	    fd->fd_rh->rnh_gen, print_op_result(w.result));
994 
995 	return (w.result);
996 }
997 
998 /*
999  * Schedules epoch-backed @fd instance deletion.
1000  * * Unlinks @fd from the list of active algo instances.
1001  * * Removes rib subscription.
1002  * * Stops callout.
1003  * * Schedules actual deletion.
1004  *
1005  * Assume @fd is already unlinked from the datapath.
1006  */
1007 static int
1008 schedule_destroy_fd_instance(struct fib_data *fd, bool in_callout)
1009 {
1010 	bool is_dead;
1011 
1012 	NET_EPOCH_ASSERT();
1013 	RIB_WLOCK_ASSERT(fd->fd_rh);
1014 
1015 	FIB_MOD_LOCK();
1016 	is_dead = fd->fd_dead;
1017 	if (!is_dead)
1018 		fd->fd_dead = true;
1019 	if (fd->fd_linked) {
1020 		TAILQ_REMOVE(&V_fib_data_list, fd, entries);
1021 		fd->fd_linked = false;
1022 	}
1023 	FIB_MOD_UNLOCK();
1024 	if (is_dead)
1025 		return (0);
1026 
1027 	FD_PRINTF(LOG_INFO, fd, "DETACH");
1028 
1029 	if (fd->fd_rs != NULL)
1030 		rib_unsubscribe_locked(fd->fd_rs);
1031 
1032 	/*
1033 	 * After rib_unsubscribe() no _new_ handle_rtable_change_cb() calls
1034 	 * will be executed, hence no _new_ callout schedules will happen.
1035 	 */
1036 	callout_stop(&fd->fd_callout);
1037 
1038 	fib_epoch_call(destroy_fd_instance_epoch, &fd->fd_epoch_ctx);
1039 
1040 	return (0);
1041 }
1042 
1043 /*
1044  * Wipe all fd instances from the list matching rib specified by @rh.
1045  * If @keep_first is set, remove all but the first record.
1046  */
1047 static void
1048 fib_cleanup_algo(struct rib_head *rh, bool keep_first, bool in_callout)
1049 {
1050 	struct fib_data_head tmp_head = TAILQ_HEAD_INITIALIZER(tmp_head);
1051 	struct fib_data *fd, *fd_tmp;
1052 	struct epoch_tracker et;
1053 
1054 	FIB_MOD_LOCK();
1055 	TAILQ_FOREACH_SAFE(fd, &V_fib_data_list, entries, fd_tmp) {
1056 		if (fd->fd_rh == rh) {
1057 			if (keep_first) {
1058 				keep_first = false;
1059 				continue;
1060 			}
1061 			TAILQ_REMOVE(&V_fib_data_list, fd, entries);
1062 			fd->fd_linked = false;
1063 			TAILQ_INSERT_TAIL(&tmp_head, fd, entries);
1064 		}
1065 	}
1066 	FIB_MOD_UNLOCK();
1067 
1068 	/* Pass 2: remove each entry */
1069 	NET_EPOCH_ENTER(et);
1070 	TAILQ_FOREACH_SAFE(fd, &tmp_head, entries, fd_tmp) {
1071 		if (!in_callout)
1072 			RIB_WLOCK(fd->fd_rh);
1073 		schedule_destroy_fd_instance(fd, in_callout);
1074 		if (!in_callout)
1075 			RIB_WUNLOCK(fd->fd_rh);
1076 	}
1077 	NET_EPOCH_EXIT(et);
1078 }
1079 
1080 void
1081 fib_destroy_rib(struct rib_head *rh)
1082 {
1083 
1084 	/*
1085 	 * rnh has `is_dying` flag set, so setup of new fd's will fail at
1086 	 *  sync_algo() stage, preventing new entries to be added to the list
1087 	 *  of active algos. Remove all existing entries for the particular rib.
1088 	 */
1089 	fib_cleanup_algo(rh, false, false);
1090 }
1091 
1092 /*
1093  * Finalises fd destruction by freeing all fd resources.
1094  */
1095 static void
1096 destroy_fd_instance(struct fib_data *fd)
1097 {
1098 
1099 	FD_PRINTF(LOG_INFO, fd, "destroy fd %p", fd);
1100 
1101 	/* Call destroy callback first */
1102 	if (fd->fd_algo_data != NULL)
1103 		fd->fd_flm->flm_destroy_cb(fd->fd_algo_data);
1104 
1105 	/* Nhop table */
1106 	if ((fd->nh_idx != NULL) && (fd->nh_ref_table != NULL)) {
1107 		for (int i = 0; i < fd->number_nhops; i++) {
1108 			if (!is_idx_free(fd, i)) {
1109 				FD_PRINTF(LOG_DEBUG2, fd, " FREE nhop %d %p",
1110 				    i, fd->nh_idx[i]);
1111 				nhop_free_any(fd->nh_idx[i]);
1112 			}
1113 		}
1114 		free(fd->nh_idx, M_RTABLE);
1115 	}
1116 	if (fd->nh_ref_table != NULL)
1117 		free(fd->nh_ref_table, M_RTABLE);
1118 
1119 	if (fd->fd_ss.fd_change_queue.entries != NULL)
1120 		free(fd->fd_ss.fd_change_queue.entries, M_TEMP);
1121 
1122 	fib_unref_algo(fd->fd_flm);
1123 
1124 	free(fd, M_RTABLE);
1125 }
1126 
1127 /*
1128  * Epoch callback indicating fd is safe to destroy
1129  */
1130 static void
1131 destroy_fd_instance_epoch(epoch_context_t ctx)
1132 {
1133 	struct fib_data *fd;
1134 
1135 	fd = __containerof(ctx, struct fib_data, fd_epoch_ctx);
1136 
1137 	CURVNET_SET(fd->fd_vnet);
1138 	destroy_fd_instance(fd);
1139 	CURVNET_RESTORE();
1140 }
1141 
1142 /*
1143  * Tries to setup fd instance.
1144  * - Allocates fd/nhop table
1145  * - Runs algo:flm_init_cb algo init
1146  * - Subscribes fd to the rib
1147  * - Runs rtable dump
1148  * - Adds instance to the list of active instances.
1149  *
1150  * Returns: operation result. Fills in @pfd with resulting fd on success.
1151  *
1152  */
1153 static enum flm_op_result
1154 try_setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
1155     struct fib_data *old_fd, struct fib_data **pfd)
1156 {
1157 	struct fib_data *fd;
1158 	size_t size;
1159 	enum flm_op_result result;
1160 
1161 	/* Allocate */
1162 	fd = malloc(sizeof(struct fib_data), M_RTABLE, M_NOWAIT | M_ZERO);
1163 	if (fd == NULL)  {
1164 		*pfd = NULL;
1165 		RH_PRINTF(LOG_INFO, rh, "Unable to allocate fib_data structure");
1166 		return (FLM_REBUILD);
1167 	}
1168 	*pfd = fd;
1169 
1170 	estimate_nhop_scale(old_fd, fd);
1171 
1172 	fd->fd_rh = rh;
1173 	fd->fd_family = rh->rib_family;
1174 	fd->fd_fibnum = rh->rib_fibnum;
1175 	callout_init_rm(&fd->fd_callout, &rh->rib_lock, 0);
1176 	fd->fd_vnet = curvnet;
1177 	fd->fd_flm = flm;
1178 
1179 	FIB_MOD_LOCK();
1180 	flm->flm_refcount++;
1181 	fd->fd_gen = ++fib_gen;
1182 	FIB_MOD_UNLOCK();
1183 
1184 	FD_PRINTF(LOG_DEBUG, fd, "allocated fd %p", fd);
1185 
1186 	/* Allocate nhidx -> nhop_ptr table */
1187 	size = fd->number_nhops * sizeof(void *);
1188 	fd->nh_idx = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
1189 	if (fd->nh_idx == NULL) {
1190 		FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop table idx (sz:%zu)", size);
1191 		return (FLM_REBUILD);
1192 	}
1193 
1194 	/* Allocate nhop index refcount table */
1195 	size = sizeof(struct nhop_ref_table);
1196 	size += fd->number_nhops * sizeof(uint32_t);
1197 	fd->nh_ref_table = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
1198 	if (fd->nh_ref_table == NULL) {
1199 		FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop refcount table (sz:%zu)", size);
1200 		return (FLM_REBUILD);
1201 	}
1202 	FD_PRINTF(LOG_DEBUG, fd, "Allocated %u nhop indexes", fd->number_nhops);
1203 
1204 	/* Okay, we're ready for algo init */
1205 	void *old_algo_data = (old_fd != NULL) ? old_fd->fd_algo_data : NULL;
1206 	result = flm->flm_init_cb(fd->fd_fibnum, fd, old_algo_data, &fd->fd_algo_data);
1207 	if (result != FLM_SUCCESS) {
1208 		FD_PRINTF(LOG_INFO, fd, "%s algo init failed", flm->flm_name);
1209 		return (result);
1210 	}
1211 
1212 	/* Try to subscribe */
1213 	if (flm->flm_change_rib_item_cb != NULL) {
1214 		fd->fd_rs = rib_subscribe_locked(fd->fd_rh,
1215 		    handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE);
1216 		if (fd->fd_rs == NULL) {
1217 			FD_PRINTF(LOG_INFO, fd, "failed to subscribe to the rib changes");
1218 			return (FLM_REBUILD);
1219 		}
1220 	}
1221 
1222 	/* Dump */
1223 	result = sync_algo(fd);
1224 	if (result != FLM_SUCCESS) {
1225 		FD_PRINTF(LOG_INFO, fd, "rib sync failed");
1226 		return (result);
1227 	}
1228 	FD_PRINTF(LOG_INFO, fd, "DUMP completed successfully.");
1229 
1230 	FIB_MOD_LOCK();
1231 	/*
1232 	 * Insert fd in the beginning of a list, to maintain invariant
1233 	 *  that first matching entry for the AF/fib is always the active
1234 	 *  one.
1235 	 */
1236 	TAILQ_INSERT_HEAD(&V_fib_data_list, fd, entries);
1237 	fd->fd_linked = true;
1238 	FIB_MOD_UNLOCK();
1239 
1240 	return (FLM_SUCCESS);
1241 }
1242 
1243 /*
1244  * Sets up algo @flm for table @rh and links it to the datapath.
1245  *
1246  */
1247 static enum flm_op_result
1248 setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
1249     struct fib_data *orig_fd, struct fib_data **pfd, bool attach)
1250 {
1251 	struct fib_data *prev_fd, *new_fd;
1252 	enum flm_op_result result;
1253 
1254 	NET_EPOCH_ASSERT();
1255 	RIB_WLOCK_ASSERT(rh);
1256 
1257 	prev_fd = orig_fd;
1258 	new_fd = NULL;
1259 	for (int i = 0; i < FIB_MAX_TRIES; i++) {
1260 		result = try_setup_fd_instance(flm, rh, prev_fd, &new_fd);
1261 
1262 		if ((result == FLM_SUCCESS) && attach) {
1263 			if (fib_set_datapath_ptr(new_fd, &new_fd->fd_dp))
1264 				sync_rib_gen(new_fd);
1265 			else
1266 				result = FLM_REBUILD;
1267 		}
1268 
1269 		if ((prev_fd != NULL) && (prev_fd != orig_fd)) {
1270 			schedule_destroy_fd_instance(prev_fd, false);
1271 			prev_fd = NULL;
1272 		}
1273 
1274 		RH_PRINTF(LOG_INFO, rh, "try %d: fib algo result: %s", i,
1275 		    print_op_result(result));
1276 
1277 		if (result == FLM_REBUILD) {
1278 			prev_fd = new_fd;
1279 			new_fd = NULL;
1280 			continue;
1281 		}
1282 
1283 		break;
1284 	}
1285 
1286 	if (result != FLM_SUCCESS) {
1287 		RH_PRINTF(LOG_WARNING, rh,
1288 		    "%s algo instance setup failed, failures=%d", flm->flm_name,
1289 		    orig_fd ? orig_fd->fd_failed_rebuilds + 1 : 0);
1290 		/* update failure count */
1291 		FIB_MOD_LOCK();
1292 		if (orig_fd != NULL)
1293 			orig_fd->fd_failed_rebuilds++;
1294 		FIB_MOD_UNLOCK();
1295 
1296 		/* Ban algo on non-recoverable error */
1297 		if (result == FLM_ERROR)
1298 			flm_error_add(flm, rh->rib_fibnum);
1299 
1300 		if ((prev_fd != NULL) && (prev_fd != orig_fd))
1301 			schedule_destroy_fd_instance(prev_fd, false);
1302 		if (new_fd != NULL) {
1303 			schedule_destroy_fd_instance(new_fd, false);
1304 			new_fd = NULL;
1305 		}
1306 	}
1307 
1308 	*pfd = new_fd;
1309 	return (result);
1310 }
1311 
1312 /*
1313  * Tries to sync algo with the current rtable state, either
1314  * by executing batch update or rebuilding.
1315  * Returns true on success.
1316  */
1317 static bool
1318 execute_callout_action(struct fib_data *fd)
1319 {
1320 	enum fib_callout_action action = fd->fd_callout_action;
1321 	struct fib_lookup_module *flm_new = NULL;
1322 	bool result = true;
1323 
1324 	NET_EPOCH_ASSERT();
1325 	RIB_WLOCK_ASSERT(fd->fd_rh);
1326 
1327 	fd->fd_need_rebuild = false;
1328 	fd->fd_batch = false;
1329 	fd->fd_num_changes = 0;
1330 
1331 	/* First, check if we're still OK to use this algo */
1332 	if (!is_algo_fixed(fd->fd_rh))
1333 		flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm);
1334 	if (flm_new != NULL)
1335 		action = FDA_REBUILD;
1336 
1337 	if (action == FDA_BATCH) {
1338 		/* Try to sync */
1339 		if (!apply_rtable_changes(fd))
1340 			action = FDA_REBUILD;
1341 	}
1342 
1343 	if (action == FDA_REBUILD)
1344 		result = rebuild_fd_flm(fd, flm_new != NULL ? flm_new : fd->fd_flm);
1345 	if (flm_new != NULL)
1346 		fib_unref_algo(flm_new);
1347 
1348 	return (result);
1349 }
1350 
1351 /*
1352  * Callout for all scheduled fd-related work.
1353  * - Checks if the current algo is still the best algo
1354  * - Synchronises algo instance to the rtable (batch usecase)
1355  * - Creates a new instance of an algo for af/fib if desired.
1356  */
1357 static void
1358 handle_fd_callout(void *_data)
1359 {
1360 	struct fib_data *fd = (struct fib_data *)_data;
1361 	struct epoch_tracker et;
1362 
1363 	FD_PRINTF(LOG_INFO, fd, "running callout type=%d", fd->fd_callout_action);
1364 
1365 	NET_EPOCH_ENTER(et);
1366 	CURVNET_SET(fd->fd_vnet);
1367 	execute_callout_action(fd);
1368 	CURVNET_RESTORE();
1369 	NET_EPOCH_EXIT(et);
1370 }
1371 
1372 /*
1373  * Tries to create new algo instance based on @fd data.
1374  * Returns true on success.
1375  */
1376 static bool
1377 rebuild_fd_flm(struct fib_data *fd, struct fib_lookup_module *flm_new)
1378 {
1379 	struct fib_data *fd_new, *fd_tmp = NULL;
1380 	bool result;
1381 
1382 	if (flm_new == fd->fd_flm)
1383 		fd_tmp = fd;
1384 	else
1385 		FD_PRINTF(LOG_INFO, fd, "switching algo to %s", flm_new->flm_name);
1386 
1387 	result = setup_fd_instance(flm_new, fd->fd_rh, fd_tmp, &fd_new, true);
1388 	if (result != FLM_SUCCESS) {
1389 		FD_PRINTF(LOG_NOTICE, fd, "table rebuild failed");
1390 		return (false);
1391 	}
1392 	FD_PRINTF(LOG_INFO, fd_new, "switched to new instance");
1393 
1394 	/* Remove old instance */
1395 	schedule_destroy_fd_instance(fd, true);
1396 
1397 	return (true);
1398 }
1399 
1400 static bool
1401 rebuild_fd(struct fib_data *fd, const char *reason)
1402 {
1403 	struct fib_lookup_module *flm_new = NULL;
1404 	bool result;
1405 
1406 	if (!is_algo_fixed(fd->fd_rh))
1407 		flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm);
1408 
1409 	FD_PRINTF(LOG_INFO, fd, "running sync rebuild: %s", reason);
1410 	result = rebuild_fd_flm(fd, flm_new != NULL ? flm_new : fd->fd_flm);
1411 	if (flm_new != NULL)
1412 		fib_unref_algo(flm_new);
1413 
1414 	if (!result) {
1415 		FD_PRINTF(LOG_ERR, fd, "sync rebuild failed");
1416 		schedule_fd_rebuild(fd, "sync rebuild failed");
1417 	}
1418 
1419 	return (result);
1420 }
1421 
1422 /*
1423  * Finds algo by name/family.
1424  * Returns referenced algo or NULL.
1425  */
1426 static struct fib_lookup_module *
1427 fib_find_algo(const char *algo_name, int family)
1428 {
1429 	struct fib_lookup_module *flm;
1430 
1431 	FIB_MOD_LOCK();
1432 	TAILQ_FOREACH(flm, &all_algo_list, entries) {
1433 		if ((strcmp(flm->flm_name, algo_name) == 0) &&
1434 		    (family == flm->flm_family)) {
1435 			flm->flm_refcount++;
1436 			FIB_MOD_UNLOCK();
1437 			return (flm);
1438 		}
1439 	}
1440 	FIB_MOD_UNLOCK();
1441 
1442 	return (NULL);
1443 }
1444 
1445 static void
1446 fib_unref_algo(struct fib_lookup_module *flm)
1447 {
1448 
1449 	FIB_MOD_LOCK();
1450 	flm->flm_refcount--;
1451 	FIB_MOD_UNLOCK();
1452 }
1453 
1454 static int
1455 set_fib_algo(uint32_t fibnum, int family, struct sysctl_oid *oidp, struct sysctl_req *req)
1456 {
1457 	struct fib_lookup_module *flm = NULL;
1458 	struct fib_data *fd = NULL;
1459 	char old_algo_name[32], algo_name[32];
1460 	struct rib_head *rh = NULL;
1461 	enum flm_op_result result;
1462 	struct epoch_tracker et;
1463 	int error;
1464 
1465 	/* Fetch current algo/rib for af/family */
1466 	FIB_MOD_LOCK();
1467 	TAILQ_FOREACH(fd, &V_fib_data_list, entries) {
1468 		if ((fd->fd_family == family) && (fd->fd_fibnum == fibnum))
1469 			break;
1470 	}
1471 	if (fd == NULL) {
1472 		FIB_MOD_UNLOCK();
1473 		return (ENOENT);
1474 	}
1475 	rh = fd->fd_rh;
1476 	strlcpy(old_algo_name, fd->fd_flm->flm_name,
1477 	    sizeof(old_algo_name));
1478 	FIB_MOD_UNLOCK();
1479 
1480 	strlcpy(algo_name, old_algo_name, sizeof(algo_name));
1481 	error = sysctl_handle_string(oidp, algo_name, sizeof(algo_name), req);
1482 	if (error != 0 || req->newptr == NULL)
1483 		return (error);
1484 
1485 	if (strcmp(algo_name, old_algo_name) == 0)
1486 		return (0);
1487 
1488 	/* New algorithm name is different */
1489 	flm = fib_find_algo(algo_name, family);
1490 	if (flm == NULL) {
1491 		RH_PRINTF(LOG_INFO, rh, "unable to find algo %s", algo_name);
1492 		return (ESRCH);
1493 	}
1494 
1495 	fd = NULL;
1496 	NET_EPOCH_ENTER(et);
1497 	RIB_WLOCK(rh);
1498 	result = setup_fd_instance(flm, rh, NULL, &fd, true);
1499 	RIB_WUNLOCK(rh);
1500 	NET_EPOCH_EXIT(et);
1501 	fib_unref_algo(flm);
1502 	if (result != FLM_SUCCESS)
1503 		return (EINVAL);
1504 
1505 	/* Disable automated jumping between algos */
1506 	FIB_MOD_LOCK();
1507 	set_algo_fixed(rh);
1508 	FIB_MOD_UNLOCK();
1509 	/* Remove old instance(s) */
1510 	fib_cleanup_algo(rh, true, false);
1511 
1512 	/* Drain cb so user can unload the module after userret if so desired */
1513 	NET_EPOCH_DRAIN_CALLBACKS();
1514 
1515 	return (0);
1516 }
1517 
1518 #ifdef INET
1519 static int
1520 set_algo_inet_sysctl_handler(SYSCTL_HANDLER_ARGS)
1521 {
1522 
1523 	return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET, oidp, req));
1524 }
1525 SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo,
1526     CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
1527     set_algo_inet_sysctl_handler, "A", "Set IPv4 lookup algo");
1528 #endif
1529 
1530 #ifdef INET6
1531 static int
1532 set_algo_inet6_sysctl_handler(SYSCTL_HANDLER_ARGS)
1533 {
1534 
1535 	return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET6, oidp, req));
1536 }
1537 SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo,
1538     CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
1539     set_algo_inet6_sysctl_handler, "A", "Set IPv6 lookup algo");
1540 #endif
1541 
1542 static struct nhop_object *
1543 dummy_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid)
1544 {
1545 	return (NULL);
1546 }
1547 
1548 static void
1549 destroy_fdh_epoch(epoch_context_t ctx)
1550 {
1551 	struct fib_dp_header *fdh;
1552 
1553 	fdh = __containerof(ctx, struct fib_dp_header, fdh_epoch_ctx);
1554 	free(fdh, M_RTABLE);
1555 }
1556 
1557 static struct fib_dp_header *
1558 alloc_fib_dp_array(uint32_t num_tables, bool waitok)
1559 {
1560 	size_t sz;
1561 	struct fib_dp_header *fdh;
1562 
1563 	sz = sizeof(struct fib_dp_header);
1564 	sz += sizeof(struct fib_dp) * num_tables;
1565 	fdh = malloc(sz, M_RTABLE, (waitok ? M_WAITOK : M_NOWAIT) | M_ZERO);
1566 	if (fdh != NULL) {
1567 		fdh->fdh_num_tables = num_tables;
1568 		/*
1569 		 * Set dummy lookup function ptr always returning NULL, so
1570 		 * we can delay algo init.
1571 		 */
1572 		for (uint32_t i = 0; i < num_tables; i++)
1573 			fdh->fdh_idx[i].f = dummy_lookup;
1574 	}
1575 	return (fdh);
1576 }
1577 
1578 static struct fib_dp_header *
1579 get_fib_dp_header(struct fib_dp *dp)
1580 {
1581 
1582 	return (__containerof((void *)dp, struct fib_dp_header, fdh_idx));
1583 }
1584 
1585 /*
1586  * Replace per-family index pool @pdp with a new one which
1587  * contains updated callback/algo data from @fd.
1588  * Returns true on success.
1589  */
1590 static bool
1591 replace_rtables_family(struct fib_dp **pdp, struct fib_data *fd, struct fib_dp *dp)
1592 {
1593 	struct fib_dp_header *new_fdh, *old_fdh;
1594 
1595 	NET_EPOCH_ASSERT();
1596 
1597 	FD_PRINTF(LOG_DEBUG, fd, "[vnet %p] replace with f:%p arg:%p",
1598 	    curvnet, dp->f, dp->arg);
1599 
1600 	FIB_MOD_LOCK();
1601 	old_fdh = get_fib_dp_header(*pdp);
1602 
1603 	if (old_fdh->fdh_idx[fd->fd_fibnum].f == dp->f) {
1604 		/*
1605 		 * Function is the same, data pointer needs update.
1606 		 * Perform in-line replace without reallocation.
1607 		 */
1608 		old_fdh->fdh_idx[fd->fd_fibnum].arg = dp->arg;
1609 		FD_PRINTF(LOG_DEBUG, fd, "FDH %p inline update", old_fdh);
1610 		FIB_MOD_UNLOCK();
1611 		return (true);
1612 	}
1613 
1614 	new_fdh = alloc_fib_dp_array(old_fdh->fdh_num_tables, false);
1615 	FD_PRINTF(LOG_DEBUG, fd, "OLD FDH: %p NEW FDH: %p", old_fdh, new_fdh);
1616 	if (new_fdh == NULL) {
1617 		FIB_MOD_UNLOCK();
1618 		FD_PRINTF(LOG_WARNING, fd, "error attaching datapath");
1619 		return (false);
1620 	}
1621 
1622 	memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0],
1623 	    old_fdh->fdh_num_tables * sizeof(struct fib_dp));
1624 	/* Update relevant data structure for @fd */
1625 	new_fdh->fdh_idx[fd->fd_fibnum] = *dp;
1626 
1627 	/* Ensure memcpy() writes have completed */
1628 	atomic_thread_fence_rel();
1629 	/* Set new datapath pointer */
1630 	*pdp = &new_fdh->fdh_idx[0];
1631 	FIB_MOD_UNLOCK();
1632 	FD_PRINTF(LOG_DEBUG, fd, "update %p -> %p", old_fdh, new_fdh);
1633 
1634 	fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx);
1635 
1636 	return (true);
1637 }
1638 
1639 static struct fib_dp **
1640 get_family_dp_ptr(int family)
1641 {
1642 	switch (family) {
1643 #ifdef INET
1644 	case AF_INET:
1645 		return (&V_inet_dp);
1646 #endif
1647 #ifdef INET6
1648 	case AF_INET6:
1649 		return (&V_inet6_dp);
1650 #endif
1651 	}
1652 	return (NULL);
1653 }
1654 
1655 /*
1656  * Make datapath use fib instance @fd
1657  */
1658 bool
1659 fib_set_datapath_ptr(struct fib_data *fd, struct fib_dp *dp)
1660 {
1661 	struct fib_dp **pdp;
1662 
1663 	pdp = get_family_dp_ptr(fd->fd_family);
1664 	return (replace_rtables_family(pdp, fd, dp));
1665 }
1666 
1667 /*
1668  * Grow datapath pointers array.
1669  * Called from sysctl handler on growing number of routing tables.
1670  */
1671 static void
1672 grow_rtables_family(struct fib_dp **pdp, uint32_t new_num_tables)
1673 {
1674 	struct fib_dp_header *new_fdh, *old_fdh = NULL;
1675 
1676 	new_fdh = alloc_fib_dp_array(new_num_tables, true);
1677 
1678 	FIB_MOD_LOCK();
1679 	if (*pdp != NULL) {
1680 		old_fdh = get_fib_dp_header(*pdp);
1681 		memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0],
1682 		    old_fdh->fdh_num_tables * sizeof(struct fib_dp));
1683 	}
1684 
1685 	/* Wait till all writes completed */
1686 	atomic_thread_fence_rel();
1687 
1688 	*pdp = &new_fdh->fdh_idx[0];
1689 	FIB_MOD_UNLOCK();
1690 
1691 	if (old_fdh != NULL)
1692 		fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx);
1693 }
1694 
1695 /*
1696  * Grows per-AF arrays of datapath pointers for each supported family.
1697  * Called from fibs resize sysctl handler.
1698  */
1699 void
1700 fib_grow_rtables(uint32_t new_num_tables)
1701 {
1702 
1703 #ifdef INET
1704 	grow_rtables_family(get_family_dp_ptr(AF_INET), new_num_tables);
1705 #endif
1706 #ifdef INET6
1707 	grow_rtables_family(get_family_dp_ptr(AF_INET6), new_num_tables);
1708 #endif
1709 }
1710 
1711 void
1712 fib_get_rtable_info(struct rib_head *rh, struct rib_rtable_info *rinfo)
1713 {
1714 
1715 	bzero(rinfo, sizeof(struct rib_rtable_info));
1716 	rinfo->num_prefixes = rh->rnh_prefixes;
1717 	rinfo->num_nhops = nhops_get_count(rh);
1718 #ifdef ROUTE_MPATH
1719 	rinfo->num_nhgrp = nhgrp_get_count(rh);
1720 #endif
1721 }
1722 
1723 /*
1724  * Updates pointer to the algo data for the @fd.
1725  */
1726 void
1727 fib_set_algo_ptr(struct fib_data *fd, void *algo_data)
1728 {
1729 	RIB_WLOCK_ASSERT(fd->fd_rh);
1730 
1731 	fd->fd_algo_data = algo_data;
1732 }
1733 
1734 /*
1735  * Calls @callback with @ctx after the end of a current epoch.
1736  */
1737 void
1738 fib_epoch_call(epoch_callback_t callback, epoch_context_t ctx)
1739 {
1740 	NET_EPOCH_CALL(callback, ctx);
1741 }
1742 
1743 /*
1744  * Accessor to get rib instance @fd is attached to.
1745  */
1746 struct rib_head *
1747 fib_get_rh(struct fib_data *fd)
1748 {
1749 
1750 	return (fd->fd_rh);
1751 }
1752 
1753 /*
1754  * Accessor to export idx->nhop array
1755  */
1756 struct nhop_object **
1757 fib_get_nhop_array(struct fib_data *fd)
1758 {
1759 
1760 	return (fd->nh_idx);
1761 }
1762 
1763 static uint32_t
1764 get_nhop_idx(struct nhop_object *nh)
1765 {
1766 #ifdef ROUTE_MPATH
1767 	if (NH_IS_NHGRP(nh))
1768 		return (nhgrp_get_idx((struct nhgrp_object *)nh));
1769 	else
1770 #endif
1771 		return (nhop_get_idx(nh));
1772 }
1773 
1774 uint32_t
1775 fib_get_nhop_idx(struct fib_data *fd, struct nhop_object *nh)
1776 {
1777 
1778 	return (get_nhop_idx(nh));
1779 }
1780 
1781 static bool
1782 is_idx_free(struct fib_data *fd, uint32_t index)
1783 {
1784 
1785 	return (fd->nh_ref_table->refcnt[index] == 0);
1786 }
1787 
1788 static uint32_t
1789 fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh)
1790 {
1791 	uint32_t idx = get_nhop_idx(nh);
1792 
1793 	if (idx >= fd->number_nhops) {
1794 		fd->hit_nhops = 1;
1795 		return (0);
1796 	}
1797 
1798 	if (is_idx_free(fd, idx)) {
1799 		nhop_ref_any(nh);
1800 		fd->nh_idx[idx] = nh;
1801 		fd->nh_ref_table->count++;
1802 		FD_PRINTF(LOG_DEBUG2, fd, " REF nhop %u %p", idx, fd->nh_idx[idx]);
1803 	}
1804 	fd->nh_ref_table->refcnt[idx]++;
1805 
1806 	return (idx);
1807 }
1808 
1809 struct nhop_release_data {
1810 	struct nhop_object	*nh;
1811 	struct epoch_context	ctx;
1812 };
1813 
1814 static void
1815 release_nhop_epoch(epoch_context_t ctx)
1816 {
1817 	struct nhop_release_data *nrd;
1818 
1819 	nrd = __containerof(ctx, struct nhop_release_data, ctx);
1820 	nhop_free_any(nrd->nh);
1821 	free(nrd, M_TEMP);
1822 }
1823 
1824 /*
1825  * Delays nexthop refcount release.
1826  * Datapath may have the datastructures not updated yet, so the old
1827  *  nexthop may still be returned till the end of current epoch. Delay
1828  *  refcount removal, as we may be removing the last instance, which will
1829  *  trigger nexthop deletion, rendering returned nexthop invalid.
1830  */
1831 static void
1832 fib_schedule_release_nhop(struct fib_data *fd, struct nhop_object *nh)
1833 {
1834 	struct nhop_release_data *nrd;
1835 
1836 	nrd = malloc(sizeof(struct nhop_release_data), M_TEMP, M_NOWAIT | M_ZERO);
1837 	if (nrd != NULL) {
1838 		nrd->nh = nh;
1839 		fib_epoch_call(release_nhop_epoch, &nrd->ctx);
1840 	} else {
1841 		/*
1842 		 * Unable to allocate memory. Leak nexthop to maintain guarantee
1843 		 *  that each nhop can be referenced.
1844 		 */
1845 		FD_PRINTF(LOG_ERR, fd, "unable to schedule nhop %p deletion", nh);
1846 	}
1847 }
1848 
1849 static void
1850 fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh)
1851 {
1852 	uint32_t idx = get_nhop_idx(nh);
1853 
1854 	KASSERT((idx < fd->number_nhops), ("invalid nhop index"));
1855 	KASSERT((nh == fd->nh_idx[idx]), ("index table contains whong nh"));
1856 
1857 	fd->nh_ref_table->refcnt[idx]--;
1858 	if (fd->nh_ref_table->refcnt[idx] == 0) {
1859 		FD_PRINTF(LOG_DEBUG, fd, " FREE nhop %d %p", idx, fd->nh_idx[idx]);
1860 		fib_schedule_release_nhop(fd, fd->nh_idx[idx]);
1861 	}
1862 }
1863 
1864 static void
1865 set_algo_fixed(struct rib_head *rh)
1866 {
1867 	switch (rh->rib_family) {
1868 #ifdef INET
1869 	case AF_INET:
1870 		V_algo_fixed_inet = true;
1871 		break;
1872 #endif
1873 #ifdef INET6
1874 	case AF_INET6:
1875 		V_algo_fixed_inet6 = true;
1876 		break;
1877 #endif
1878 	}
1879 }
1880 
1881 static bool
1882 is_algo_fixed(struct rib_head *rh)
1883 {
1884 
1885 	switch (rh->rib_family) {
1886 #ifdef INET
1887 	case AF_INET:
1888 		return (V_algo_fixed_inet);
1889 #endif
1890 #ifdef INET6
1891 	case AF_INET6:
1892 		return (V_algo_fixed_inet6);
1893 #endif
1894 	}
1895 	return (false);
1896 }
1897 
1898 /*
1899  * Runs the check on what would be the best algo for rib @rh, assuming
1900  *  that the current algo is the one specified by @orig_flm. Note that
1901  *  it can be NULL for initial selection.
1902  *
1903  * Returns referenced new algo or NULL if the current one is the best.
1904  */
1905 static struct fib_lookup_module *
1906 fib_check_best_algo(struct rib_head *rh, struct fib_lookup_module *orig_flm)
1907 {
1908 	uint8_t preference, curr_preference = 0, best_preference = 0;
1909 	struct fib_lookup_module *flm, *best_flm = NULL;
1910 	struct rib_rtable_info rinfo;
1911 	int candidate_algos = 0;
1912 
1913 	fib_get_rtable_info(rh, &rinfo);
1914 
1915 	FIB_MOD_LOCK();
1916 	TAILQ_FOREACH(flm, &all_algo_list, entries) {
1917 		if (flm->flm_family != rh->rib_family)
1918 			continue;
1919 		candidate_algos++;
1920 		preference = flm->flm_get_pref(&rinfo);
1921 		if (preference > best_preference) {
1922 			if (!flm_error_check(flm, rh->rib_fibnum)) {
1923 				best_preference = preference;
1924 				best_flm = flm;
1925 			}
1926 		}
1927 		if (flm == orig_flm)
1928 			curr_preference = preference;
1929 	}
1930 	if ((best_flm != NULL) && (curr_preference + BEST_DIFF_PERCENT < best_preference))
1931 		best_flm->flm_refcount++;
1932 	else
1933 		best_flm = NULL;
1934 	FIB_MOD_UNLOCK();
1935 
1936 	RH_PRINTF(LOG_DEBUG, rh, "candidate_algos: %d, curr: %s(%d) result: %s(%d)",
1937 	    candidate_algos, orig_flm ? orig_flm->flm_name : "NULL", curr_preference,
1938 	    best_flm ? best_flm->flm_name : (orig_flm ? orig_flm->flm_name : "NULL"),
1939 	    best_preference);
1940 
1941 	return (best_flm);
1942 }
1943 
1944 /*
1945  * Called when new route table is created.
1946  * Selects, allocates and attaches fib algo for the table.
1947  */
1948 static bool
1949 fib_select_algo_initial(struct rib_head *rh, struct fib_dp *dp)
1950 {
1951 	struct fib_lookup_module *flm;
1952 	struct fib_data *fd = NULL;
1953 	enum flm_op_result result;
1954 	struct epoch_tracker et;
1955 
1956 	flm = fib_check_best_algo(rh, NULL);
1957 	if (flm == NULL) {
1958 		RH_PRINTF(LOG_CRIT, rh, "no algo selected");
1959 		return (false);
1960 	}
1961 	RH_PRINTF(LOG_INFO, rh, "selected algo %s", flm->flm_name);
1962 
1963 	NET_EPOCH_ENTER(et);
1964 	RIB_WLOCK(rh);
1965 	result = setup_fd_instance(flm, rh, NULL, &fd, false);
1966 	RIB_WUNLOCK(rh);
1967 	NET_EPOCH_EXIT(et);
1968 
1969 	RH_PRINTF(LOG_DEBUG, rh, "result=%d fd=%p", result, fd);
1970 	if (result == FLM_SUCCESS)
1971 		*dp = fd->fd_dp;
1972 	else
1973 		RH_PRINTF(LOG_CRIT, rh, "unable to setup algo %s", flm->flm_name);
1974 
1975 	fib_unref_algo(flm);
1976 
1977 	return (result == FLM_SUCCESS);
1978 }
1979 
1980 /*
1981  * Sets up fib algo instances for the non-initialized RIBs in the @family.
1982  * Allocates temporary datapath index to amortize datapaint index updates
1983  * with large @num_tables.
1984  */
1985 void
1986 fib_setup_family(int family, uint32_t num_tables)
1987 {
1988 	struct fib_dp_header *new_fdh = alloc_fib_dp_array(num_tables, false);
1989 	if (new_fdh == NULL) {
1990 		ALGO_PRINTF(LOG_CRIT, "Unable to setup framework for %s", print_family(family));
1991 		return;
1992 	}
1993 
1994 	for (int i = 0; i < num_tables; i++) {
1995 		struct rib_head *rh = rt_tables_get_rnh(i, family);
1996 		if (rh->rib_algo_init)
1997 			continue;
1998 		if (!fib_select_algo_initial(rh, &new_fdh->fdh_idx[i]))
1999 			continue;
2000 
2001 		rh->rib_algo_init = true;
2002 	}
2003 
2004 	FIB_MOD_LOCK();
2005 	struct fib_dp **pdp = get_family_dp_ptr(family);
2006 	struct fib_dp_header *old_fdh = get_fib_dp_header(*pdp);
2007 
2008 	/* Update the items not touched by the new init, from the old data pointer */
2009 	for (int i = 0; i < num_tables; i++) {
2010 		if (new_fdh->fdh_idx[i].f == dummy_lookup)
2011 			new_fdh->fdh_idx[i] = old_fdh->fdh_idx[i];
2012 	}
2013 
2014 	/* Ensure all index writes have completed */
2015 	atomic_thread_fence_rel();
2016 	/* Set new datapath pointer */
2017 	*pdp = &new_fdh->fdh_idx[0];
2018 
2019 	FIB_MOD_UNLOCK();
2020 
2021 	fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx);
2022 }
2023 
2024 /*
2025  * Registers fib lookup module within the subsystem.
2026  */
2027 int
2028 fib_module_register(struct fib_lookup_module *flm)
2029 {
2030 
2031 	FIB_MOD_LOCK();
2032 	ALGO_PRINTF(LOG_INFO, "attaching %s to %s", flm->flm_name,
2033 	    print_family(flm->flm_family));
2034 	TAILQ_INSERT_TAIL(&all_algo_list, flm, entries);
2035 	FIB_MOD_UNLOCK();
2036 
2037 	return (0);
2038 }
2039 
2040 /*
2041  * Tries to unregister fib lookup module.
2042  *
2043  * Returns 0 on success, EBUSY if module is still used
2044  *  by some of the tables.
2045  */
2046 int
2047 fib_module_unregister(struct fib_lookup_module *flm)
2048 {
2049 
2050 	FIB_MOD_LOCK();
2051 	if (flm->flm_refcount > 0) {
2052 		FIB_MOD_UNLOCK();
2053 		return (EBUSY);
2054 	}
2055 	fib_error_clear_flm(flm);
2056 	ALGO_PRINTF(LOG_INFO, "detaching %s from %s", flm->flm_name,
2057 	    print_family(flm->flm_family));
2058 	TAILQ_REMOVE(&all_algo_list, flm, entries);
2059 	FIB_MOD_UNLOCK();
2060 
2061 	return (0);
2062 }
2063 
2064 void
2065 vnet_fib_init(void)
2066 {
2067 
2068 	TAILQ_INIT(&V_fib_data_list);
2069 }
2070 
2071 void
2072 vnet_fib_destroy(void)
2073 {
2074 
2075 	FIB_MOD_LOCK();
2076 	fib_error_clear();
2077 	FIB_MOD_UNLOCK();
2078 }
2079