xref: /freebsd/sys/net/route/fib_algo.c (revision 221622ec0c8e184dd1ea7e1f77fb45d2d32cb6e2)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2020 Alexander V. Chernikov
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 #include "opt_inet.h"
31 #include "opt_inet6.h"
32 #include "opt_route.h"
33 
34 #include <sys/param.h>
35 #include <sys/eventhandler.h>
36 #include <sys/kernel.h>
37 #include <sys/sbuf.h>
38 #include <sys/lock.h>
39 #include <sys/rmlock.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/module.h>
43 #include <sys/kernel.h>
44 #include <sys/priv.h>
45 #include <sys/proc.h>
46 #include <sys/socket.h>
47 #include <sys/socketvar.h>
48 #include <sys/sysctl.h>
49 #include <sys/syslog.h>
50 #include <sys/queue.h>
51 #include <net/vnet.h>
52 
53 #include <net/if.h>
54 #include <net/if_var.h>
55 
56 #include <netinet/in.h>
57 #include <netinet/in_var.h>
58 #include <netinet/ip.h>
59 #include <netinet/ip_var.h>
60 #ifdef INET6
61 #include <netinet/ip6.h>
62 #include <netinet6/ip6_var.h>
63 #endif
64 
65 #include <net/route.h>
66 #include <net/route/nhop.h>
67 #include <net/route/route_ctl.h>
68 #include <net/route/route_var.h>
69 #include <net/route/fib_algo.h>
70 
71 #include <machine/stdarg.h>
72 
73 /*
74  * Fib lookup framework.
75  *
76  * This framework enables accelerated longest-prefix-match lookups for the
77  *  routing tables by adding the ability to dynamically attach/detach lookup
78  *  algorithms implementation to/from the datapath.
79  *
80  * flm - fib lookup modules - implementation of particular lookup algorithm
81  * fd - fib data - instance of an flm bound to specific routing table
82  *
83  * This file provides main framework functionality.
84  *
85  * The following are the features provided by the framework
86  *
 * 1) nexthops abstraction -> provides transparent referencing, indexing
 *   and efficient idx->ptr mappings for nexthops and nexthop groups.
89  * 2) Routing table synchronisation
90  * 3) dataplane attachment points
91  * 4) automatic algorithm selection based on the provided preference.
92  *
93  *
94  * DATAPATH
 * For each supported address family, there is an allocated array of fib_dp
96  *  structures, indexed by fib number. Each array entry contains callback function
97  *  and its argument. This function will be called with a family-specific lookup key,
98  *  scope and provided argument. This array gets re-created every time when new algo
99  *  instance gets created. Please take a look at the replace_rtables_family() function
100  *  for more details.
101  *
102  */
103 
/* net.route.algo sysctl subtree; the per-family nodes below hang off it. */
SYSCTL_DECL(_net_route);
SYSCTL_NODE(_net_route, OID_AUTO, algo, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Fib algorithm lookups");

/*
 * Per-vnet prefix count threshold consulted by need_immediate_rebuild():
 *  when it is 0 or the table holds no more prefixes than this value,
 *  algo-requested rebuilds are executed synchronously.
 */
VNET_DEFINE(int, fib_sync_limit) = 100;
#define	V_fib_sync_limit	VNET(fib_sync_limit)
SYSCTL_INT(_net_route_algo, OID_AUTO, fib_sync_limit, CTLFLAG_RW | CTLFLAG_VNET,
    &VNET_NAME(fib_sync_limit), 0, "Guarantee synchronous fib till route limit");

#ifdef INET6
/*
 * Per-vnet flag for IPv6.
 * NOTE(review): presumably backs set_algo_fixed()/is_algo_fixed(), whose
 *  bodies are not visible here - confirm.
 */
VNET_DEFINE_STATIC(bool, algo_fixed_inet6) = false;
#define	V_algo_fixed_inet6	VNET(algo_fixed_inet6)
SYSCTL_NODE(_net_route_algo, OID_AUTO, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IPv6 longest prefix match lookups");
#endif
#ifdef INET
/* IPv4 counterpart of the flag above (same caveat applies). */
VNET_DEFINE_STATIC(bool, algo_fixed_inet) = false;
#define	V_algo_fixed_inet	VNET(algo_fixed_inet)
SYSCTL_NODE(_net_route_algo, OID_AUTO, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IPv4 longest prefix match lookups");
#endif

/*
 * Fib instance counter.
 * Incremented by try_setup_fd_instance() to stamp every new instance
 *  with its fd_gen value, which is printed in the debug messages.
 */
static uint32_t fib_gen = 0;
128 
/*
 * Table of per-index nexthop reference counters.
 * Allocated by try_setup_fd_instance() as a single chunk:
 *  sizeof(struct nhop_ref_table) plus one counter per nexthop index.
 */
struct nhop_ref_table {
	uint32_t		count;		/* printed as # of nhops when the table runs out */
	int32_t			refcnt[];	/* per-index counters (flexible array member) */
};
133 
/*
 * Data structure for the fib lookup instance tied to the particular rib.
 *
 * Instances are allocated by try_setup_fd_instance() and freed by
 *  destroy_fd_instance(), which runs from an epoch callback scheduled
 *  by schedule_destroy_fd_instance().
 */
struct fib_data {
	uint32_t		number_nhops;	/* # of slots in the nhop tables */
	uint8_t			hit_nhops;	/* true if out of nhop limit */
	uint8_t			init_done;	/* true if init is completed */
	uint32_t		fd_dead:1;	/* Scheduled for deletion */
	uint32_t		fd_linked:1;	/* true if linked */
	uint32_t		fd_need_rebuild:1;	/* true if rebuild scheduled */
	uint32_t		fd_force_eval:1;/* true if immediate algo re-evaluation scheduled */
	uint8_t			fd_family;	/* family */
	uint32_t		fd_fibnum;	/* fibnum */
	uint32_t		fd_failed_rebuilds;	/* stat: failed rebuilds */
	uint32_t		fd_gen;		/* instance gen# */
	struct callout		fd_callout;	/* rebuild callout */
	void			*fd_algo_data;	/* algorithm data */
	struct nhop_object	**nh_idx;	/* nhop idx->ptr array */
	struct nhop_ref_table	*nh_ref_table;	/* array with # of nhop references */
	struct rib_head		*fd_rh;		/* RIB table we're attached to */
	struct rib_subscription	*fd_rs;		/* storing table subscription */
	struct fib_dp		fd_dp;		/* fib datapath data */
	struct vnet		*fd_vnet;	/* vnet fib belongs to */
	struct epoch_context	fd_epoch_ctx;	/* epoch context for deletion */
	struct fib_lookup_module	*fd_flm;/* pointer to the lookup module */
	uint32_t		fd_num_changes;	/* number of changes since last callout */
	TAILQ_ENTRY(fib_data)	entries;	/* list of all fds in vnet */
};
162 
/* Forward declarations for the helpers used before their definition. */
static bool rebuild_fd(struct fib_data *fd);
static void rebuild_fd_callout(void *_data);
static void destroy_fd_instance_epoch(epoch_context_t ctx);
static enum flm_op_result attach_datapath(struct fib_data *fd);
static bool is_idx_free(struct fib_data *fd, uint32_t index);
static void set_algo_fixed(struct rib_head *rh);
static bool is_algo_fixed(struct rib_head *rh);

static uint32_t fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh);
static void fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh);

static struct fib_lookup_module *fib_check_best_algo(struct rib_head *rh,
    struct fib_lookup_module *orig_flm);
static void fib_unref_algo(struct fib_lookup_module *flm);
static bool flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum);

/*
 * Module-wide mutex. In this file it is held around updates of the
 *  instance list, the error list, flm_refcount and the
 *  fd_dead/fd_linked/fd_failed_rebuilds fields.
 */
struct mtx fib_mtx;
#define	FIB_MOD_LOCK()		mtx_lock(&fib_mtx)
#define	FIB_MOD_UNLOCK()	mtx_unlock(&fib_mtx)
#define	FIB_MOD_LOCK_ASSERT()	mtx_assert(&fib_mtx, MA_OWNED)

MTX_SYSINIT(fib_mtx, &fib_mtx, "algo list mutex", MTX_DEF);
185 
/* Algorithm has to be this percent better than the current to switch */
#define	BEST_DIFF_PERCENT	(5 * 256 / 100)
/* Schedule algo re-evaluation X milliseconds after a change */
#define	ALGO_EVAL_DELAY_MS	30000
/* Force algo re-evaluation after X changes */
#define	ALGO_EVAL_NUM_ROUTES	100
/* Try to setup algorithm X times */
#define	FIB_MAX_TRIES		32
/* Max amount of supported nexthops; nhop table doubling stops at this size */
#define	FIB_MAX_NHOPS		262144
/* Base rebuild delay, doubled for each failed rebuild (see callout_calc_delay_ms) */
#define	FIB_CALLOUT_DELAY_MS	50
197 
198 /* Debug */
199 static int flm_debug_level = LOG_NOTICE;
200 SYSCTL_INT(_net_route_algo, OID_AUTO, debug_level, CTLFLAG_RW | CTLFLAG_RWTUN,
201     &flm_debug_level, 0, "debuglevel");
202 #define	FLM_MAX_DEBUG_LEVEL	LOG_DEBUG
203 #ifndef	LOG_DEBUG2
204 #define	LOG_DEBUG2	8
205 #endif
206 
207 #define	_PASS_MSG(_l)	(flm_debug_level >= (_l))
208 #define	ALGO_PRINTF(_fmt, ...)	printf("[fib_algo] %s: " _fmt "\n", __func__, ##__VA_ARGS__)
209 #define	_ALGO_PRINTF(_fib, _fam, _aname, _gen, _func, _fmt, ...) \
210     printf("[fib_algo] %s.%u (%s#%u) %s: " _fmt "\n",\
211     print_family(_fam), _fib, _aname, _gen, _func, ## __VA_ARGS__)
212 #define	_RH_PRINTF(_fib, _fam, _func, _fmt, ...) \
213     printf("[fib_algo] %s.%u %s: " _fmt "\n", print_family(_fam), _fib, _func, ## __VA_ARGS__)
214 #define	RH_PRINTF(_l, _rh, _fmt, ...)	if (_PASS_MSG(_l)) {	\
215     _RH_PRINTF(_rh->rib_fibnum, _rh->rib_family, __func__, _fmt, ## __VA_ARGS__);\
216 }
217 #define	FD_PRINTF(_l, _fd, _fmt, ...)	FD_PRINTF_##_l(_l, _fd, _fmt, ## __VA_ARGS__)
218 #define	_FD_PRINTF(_l, _fd, _fmt, ...)	if (_PASS_MSG(_l)) {		\
219     _ALGO_PRINTF(_fd->fd_fibnum, _fd->fd_family, _fd->fd_flm->flm_name,	\
220     _fd->fd_gen, __func__, _fmt, ## __VA_ARGS__);			\
221 }
222 #if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG2
223 #define	FD_PRINTF_LOG_DEBUG2	_FD_PRINTF
224 #else
225 #define	FD_PRINTF_LOG_DEBUG2(_l, _fd, _fmt, ...)
226 #endif
227 #if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG
228 #define	FD_PRINTF_LOG_DEBUG	_FD_PRINTF
229 #else
230 #define	FD_PRINTF_LOG_DEBUG()
231 #endif
232 #if FLM_MAX_DEBUG_LEVEL>=LOG_INFO
233 #define	FD_PRINTF_LOG_INFO	_FD_PRINTF
234 #else
235 #define	FD_PRINTF_LOG_INFO()
236 #endif
237 #define	FD_PRINTF_LOG_NOTICE	_FD_PRINTF
238 #define	FD_PRINTF_LOG_ERR	_FD_PRINTF
239 #define	FD_PRINTF_LOG_WARNING	_FD_PRINTF
240 
241 
/* List of all registered lookup algorithms (global, not per-vnet) */
static TAILQ_HEAD(, fib_lookup_module) all_algo_list = TAILQ_HEAD_INITIALIZER(all_algo_list);

/*
 * List of all fib lookup instances in the vnet.
 * New instances are inserted at the head by try_setup_fd_instance().
 */
VNET_DEFINE_STATIC(TAILQ_HEAD(fib_data_head, fib_data), fib_data_list);
#define	V_fib_data_list	VNET(fib_data_list)
248 
/*
 * Datastructure for storing non-transient fib lookup module failures.
 * Records are created by flm_error_add() and looked up by
 *  flm_error_check() using the (module, fibnum) pair.
 */
struct fib_error {
	int				fe_family;	/* family of the failed module */
	uint32_t			fe_fibnum;	/* failed rtable */
	struct fib_lookup_module	*fe_flm;	/* failed module */
	TAILQ_ENTRY(fib_error)		entries;/* list of all errored entries */
};
/* Per-vnet list of the recorded failures */
VNET_DEFINE_STATIC(TAILQ_HEAD(fib_error_head, fib_error), fib_error_list);
#define	V_fib_error_list VNET(fib_error_list)
258 
259 /* Per-family array of fibnum -> {func, arg} mappings used in datapath */
260 struct fib_dp_header {
261 	struct epoch_context	fdh_epoch_ctx;
262 	uint32_t		fdh_num_tables;
263 	struct fib_dp		fdh_idx[0];
264 };
265 
266 /*
267  * Tries to add new non-transient algorithm error to the list of
268  *  errors.
269  * Returns true on success.
270  */
271 static bool
272 flm_error_add(struct fib_lookup_module *flm, uint32_t fibnum)
273 {
274 	struct fib_error *fe;
275 
276 	fe = malloc(sizeof(struct fib_error), M_TEMP, M_NOWAIT | M_ZERO);
277 	if (fe == NULL)
278 		return (false);
279 	fe->fe_flm = flm;
280 	fe->fe_family = flm->flm_family;
281 	fe->fe_fibnum = fibnum;
282 
283 	FIB_MOD_LOCK();
284 	/* Avoid duplicates by checking if error already exists first */
285 	if (flm_error_check(flm, fibnum)) {
286 		FIB_MOD_UNLOCK();
287 		free(fe, M_TEMP);
288 		return (true);
289 	}
290 	TAILQ_INSERT_HEAD(&V_fib_error_list, fe, entries);
291 	FIB_MOD_UNLOCK();
292 
293 	return (true);
294 }
295 
296 /*
297  * True if non-transient error has been registered for @flm in @fibnum.
298  */
299 static bool
300 flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum)
301 {
302 	const struct fib_error *fe;
303 
304 	TAILQ_FOREACH(fe, &V_fib_error_list, entries) {
305 		if ((fe->fe_flm == flm) && (fe->fe_fibnum == fibnum))
306 			return (true);
307 	}
308 
309 	return (false);
310 }
311 
312 /*
313  * Clear all errors of algo specified by @flm.
314  */
315 static void
316 fib_error_clear_flm(struct fib_lookup_module *flm)
317 {
318 	struct fib_error *fe, *fe_tmp;
319 
320 	FIB_MOD_LOCK_ASSERT();
321 
322 	TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
323 		if (fe->fe_flm == flm) {
324 			TAILQ_REMOVE(&V_fib_error_list, fe, entries);
325 			free(fe, M_TEMP);
326 		}
327 	}
328 }
329 
330 /*
331  * Clears all errors in current VNET.
332  */
333 static void
334 fib_error_clear()
335 {
336 	struct fib_error *fe, *fe_tmp;
337 
338 	FIB_MOD_LOCK_ASSERT();
339 
340 	TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
341 		TAILQ_REMOVE(&V_fib_error_list, fe, entries);
342 		free(fe, M_TEMP);
343 	}
344 }
345 
346 static const char *
347 print_op_result(enum flm_op_result result)
348 {
349 	switch (result) {
350 	case FLM_SUCCESS:
351 		return "success";
352 	case FLM_REBUILD:
353 		return "rebuild";
354 	case FLM_ERROR:
355 		return "error";
356 	}
357 
358 	return "unknown";
359 }
360 
/*
 * Maps address family @family to the name used in the debug messages:
 *  "inet" for AF_INET, "inet6" for AF_INET6, "unknown" for the rest.
 */
static const char *
print_family(int family)
{

	switch (family) {
	case AF_INET:
		return ("inet");
	case AF_INET6:
		return ("inet6");
	default:
		return ("unknown");
	}
}
372 
373 /*
374  * Debug function used by lookup algorithms.
375  * Outputs message denoted by @fmt, prepended by "[fib_algo] inetX.Y (algo) "
376  */
377 void
378 fib_printf(int level, struct fib_data *fd, const char *func, char *fmt, ...)
379 {
380 	char buf[128];
381 	va_list ap;
382 
383 	if (level > flm_debug_level)
384 		return;
385 
386 	va_start(ap, fmt);
387 	vsnprintf(buf, sizeof(buf), fmt, ap);
388 	va_end(ap);
389 
390 	_ALGO_PRINTF(fd->fd_fibnum, fd->fd_family, fd->fd_flm->flm_name,
391 	    fd->fd_gen, func, "%s", buf);
392 }
393 
394 /*
395  * Outputs list of algorithms supported by the provided address family.
396  */
397 static int
398 print_algos_sysctl(struct sysctl_req *req, int family)
399 {
400 	struct fib_lookup_module *flm;
401 	struct sbuf sbuf;
402 	int error, count = 0;
403 
404 	error = sysctl_wire_old_buffer(req, 0);
405 	if (error == 0) {
406 		sbuf_new_for_sysctl(&sbuf, NULL, 512, req);
407 		TAILQ_FOREACH(flm, &all_algo_list, entries) {
408 			if (flm->flm_family == family) {
409 				if (count++ > 0)
410 					sbuf_cat(&sbuf, ", ");
411 				sbuf_cat(&sbuf, flm->flm_name);
412 			}
413 		}
414 		error = sbuf_finish(&sbuf);
415 		sbuf_delete(&sbuf);
416 	}
417 	return (error);
418 }
419 
#ifdef INET6
/*
 * Handler of the read-only net.route.algo.inet6.algo_list sysctl:
 *  reports the registered lookup algorithms of the AF_INET6 family.
 */
static int
print_algos_sysctl_inet6(SYSCTL_HANDLER_ARGS)
{

	return (print_algos_sysctl(req, AF_INET6));
}
SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo_list,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    print_algos_sysctl_inet6, "A", "List of IPv6 lookup algorithms");
#endif
431 
#ifdef INET
/*
 * Handler of the read-only net.route.algo.inet.algo_list sysctl:
 *  reports the registered lookup algorithms of the AF_INET family.
 */
static int
print_algos_sysctl_inet(SYSCTL_HANDLER_ARGS)
{

	return (print_algos_sysctl(req, AF_INET));
}
SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo_list,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    print_algos_sysctl_inet, "A", "List of IPv4 lookup algorithms");
#endif
443 
444 /*
445  * Calculate delay between repeated failures.
446  * Returns current delay in milliseconds.
447  */
448 static uint32_t
449 callout_calc_delay_ms(struct fib_data *fd)
450 {
451 	uint32_t shift;
452 
453 	if (fd->fd_failed_rebuilds > 10)
454 		shift = 10;
455 	else
456 		shift = fd->fd_failed_rebuilds;
457 
458 	return ((1 << shift) * FIB_CALLOUT_DELAY_MS);
459 }
460 
461 static void
462 schedule_callout(struct fib_data *fd, int delay_ms)
463 {
464 
465 	callout_reset_sbt(&fd->fd_callout, 0, SBT_1MS * delay_ms,
466 	    rebuild_fd_callout, fd, 0);
467 }
468 
469 static void
470 schedule_fd_rebuild(struct fib_data *fd, const char *reason)
471 {
472 
473 	RIB_WLOCK_ASSERT(fd->fd_rh);
474 
475 	if (!fd->fd_need_rebuild) {
476 		fd->fd_need_rebuild = true;
477 
478 		/*
479 		 * Potentially re-schedules pending callout
480 		 *  initiated by schedule_algo_eval.
481 		 */
482 		FD_PRINTF(LOG_INFO, fd, "Scheduling rebuild: %s (failures=%d)",
483 		    reason, fd->fd_failed_rebuilds);
484 		schedule_callout(fd, callout_calc_delay_ms(fd));
485 	}
486 }
487 
488 static void
489 schedule_algo_eval(struct fib_data *fd)
490 {
491 
492 	RIB_WLOCK_ASSERT(fd->fd_rh);
493 
494 	if (fd->fd_num_changes++ == 0) {
495 		/* Start callout to consider switch */
496 		if (!callout_pending(&fd->fd_callout))
497 			schedule_callout(fd, ALGO_EVAL_DELAY_MS);
498 	} else if (fd->fd_num_changes > ALGO_EVAL_NUM_ROUTES && !fd->fd_force_eval) {
499 		/* Reset callout to exec immediately */
500 		if (!fd->fd_need_rebuild) {
501 			fd->fd_force_eval = true;
502 			schedule_callout(fd, 1);
503 		}
504 	}
505 }
506 
507 static bool
508 need_immediate_rebuild(struct fib_data *fd, struct rib_cmd_info *rc)
509 {
510 	struct nhop_object *nh;
511 
512 	if ((V_fib_sync_limit == 0) || (fd->fd_rh->rnh_prefixes <= V_fib_sync_limit))
513 		return (true);
514 
515 	/* Sync addition/removal of interface routes */
516 	switch (rc->rc_cmd) {
517 	case RTM_ADD:
518 		nh = rc->rc_nh_new;
519 		if (!NH_IS_NHGRP(nh) && (!(nh->nh_flags & NHF_GATEWAY)))
520 			return (true);
521 		break;
522 	case RTM_DELETE:
523 		nh = rc->rc_nh_old;
524 		if (!NH_IS_NHGRP(nh) && (!(nh->nh_flags & NHF_GATEWAY)))
525 			return (true);
526 		break;
527 	}
528 
529 	return (false);
530 }
531 
/*
 * Rib subscription handler. Checks if the algorithm is ready to
 *  receive updates, handles nexthop refcounting and passes change
 *  data to the algorithm callback.
 *
 * @rnh: routing table the change belongs to, write-locked by the caller.
 * @rc: description of the change (command, old and new nexthops).
 * @_data: struct fib_data instance passed at the subscription time.
 *
 * The change is silently dropped when the initial dump has not been
 *  completed or when a rebuild of the instance is already pending.
 */
static void
handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
    void *_data)
{
	struct fib_data *fd = (struct fib_data *)_data;
	enum flm_op_result result;

	RIB_WLOCK_ASSERT(rnh);

	/*
	 * There is a small gap between subscribing for route changes
	 *  and initiating rtable dump. Avoid receiving route changes
	 *  prior to finishing rtable dump by checking `init_done`.
	 */
	if (!fd->init_done)
		return;
	/*
	 * If algo requested rebuild, stop sending updates by default.
	 * This simplifies nexthop refcount handling logic.
	 */
	if (fd->fd_need_rebuild)
		return;

	/* Consider scheduling algorithm re-evaluation */
	schedule_algo_eval(fd);

	/*
	 * Maintain guarantee that every nexthop returned by the dataplane
	 *  lookup has > 0 refcount, so can be safely referenced within current
	 *  epoch.
	 */
	if (rc->rc_nh_new != NULL) {
		/* Zero return value is treated as "no free nexthop index" */
		if (fib_ref_nhop(fd, rc->rc_nh_new) == 0) {
			/* ran out of indexes */
			schedule_fd_rebuild(fd, "ran out of nhop indexes");
			return;
		}
	}

	/* Hand the change over to the algorithm */
	result = fd->fd_flm->flm_change_rib_item_cb(rnh, rc, fd->fd_algo_data);

	switch (result) {
	case FLM_SUCCESS:
		/* Unref old nexthop on success */
		if (rc->rc_nh_old != NULL)
			fib_unref_nhop(fd, rc->rc_nh_old);
		break;
	case FLM_REBUILD:

		/*
		 * Algo is not able to apply the update.
		 * Schedule algo rebuild.
		 */
		if (!need_immediate_rebuild(fd, rc)) {
			/* Deferred path: rebuild runs from the callout */
			schedule_fd_rebuild(fd, "algo requested rebuild");
			break;
		}

		/*
		 * Synchronous path: rebuild right here. The flag is set
		 *  first as rebuild_fd() reads it to find out whether the
		 *  rebuild was requested.
		 */
		fd->fd_need_rebuild = true;
		FD_PRINTF(LOG_INFO, fd, "running sync rebuild");
		if (!rebuild_fd(fd))
			schedule_fd_rebuild(fd, "sync rebuild failed");
		break;
	case FLM_ERROR:

		/*
		 * Algo reported a non-recoverable error.
		 * Record the error and schedule rebuild, which will
		 *  trigger best algo selection.
		 */
		FD_PRINTF(LOG_ERR, fd, "algo reported non-recoverable error");
		if (!flm_error_add(fd->fd_flm, fd->fd_fibnum))
			FD_PRINTF(LOG_ERR, fd, "failed to ban algo");
		schedule_fd_rebuild(fd, "algo reported non-recoverable error");
	}
}
613 
614 static void
615 estimate_nhop_scale(const struct fib_data *old_fd, struct fib_data *fd)
616 {
617 
618 	if (old_fd == NULL) {
619 		// TODO: read from rtable
620 		fd->number_nhops = 16;
621 		return;
622 	}
623 
624 	if (old_fd->hit_nhops && old_fd->number_nhops < FIB_MAX_NHOPS)
625 		fd->number_nhops = 2 * old_fd->number_nhops;
626 	else
627 		fd->number_nhops = old_fd->number_nhops;
628 }
629 
/*
 * State shared by the rib walk callbacks (sync_algo_cb() and
 *  sync_algo_end_cb()) during the initial table dump.
 */
struct walk_cbdata {
	struct fib_data		*fd;	/* instance being filled in */
	flm_dump_t		*func;	/* algo per-route dump callback, may be NULL */
	enum flm_op_result	result;	/* dump status; routes are skipped once it is not FLM_SUCCESS */
};
635 
636 /*
637  * Handler called after all rtenties have been dumped.
638  * Performs post-dump framework checks and calls
639  * algo:flm_dump_end_cb().
640  *
641  * Updates walk_cbdata result.
642  */
643 static void
644 sync_algo_end_cb(struct rib_head *rnh, enum rib_walk_hook stage, void *_data)
645 {
646 	struct walk_cbdata *w = (struct walk_cbdata *)_data;
647 	struct fib_data *fd = w->fd;
648 
649 	RIB_WLOCK_ASSERT(w->fd->fd_rh);
650 
651 	if (rnh->rib_dying) {
652 		w->result = FLM_ERROR;
653 		return;
654 	}
655 
656 	if (fd->hit_nhops) {
657 		FD_PRINTF(LOG_INFO, fd, "ran out of nexthops at %u nhops",
658 		    fd->nh_ref_table->count);
659 		if (w->result == FLM_SUCCESS)
660 			w->result = FLM_REBUILD;
661 		return;
662 	}
663 
664 	if (stage != RIB_WALK_HOOK_POST || w->result != FLM_SUCCESS)
665 		return;
666 
667 	/* Post-dump hook, dump successful */
668 	w->result = fd->fd_flm->flm_dump_end_cb(fd->fd_algo_data, &fd->fd_dp);
669 
670 	if (w->result == FLM_SUCCESS) {
671 		/* Mark init as done to allow routing updates */
672 		fd->init_done = 1;
673 	}
674 }
675 
676 /*
677  * Callback for each entry in rib.
678  * Calls algo:flm_dump_rib_item_cb func as a part of initial
679  *  route table synchronisation.
680  */
681 static int
682 sync_algo_cb(struct rtentry *rt, void *_data)
683 {
684 	struct walk_cbdata *w = (struct walk_cbdata *)_data;
685 
686 	RIB_WLOCK_ASSERT(w->fd->fd_rh);
687 
688 	if (w->result == FLM_SUCCESS && w->func) {
689 
690 		/*
691 		 * Reference nexthops to maintain guarantee that
692 		 *  each nexthop returned by datapath has > 0 references
693 		 *  and can be safely referenced within current epoch.
694 		 */
695 		struct nhop_object *nh = rt_get_raw_nhop(rt);
696 		if (fib_ref_nhop(w->fd, nh) != 0)
697 			w->result = w->func(rt, w->fd->fd_algo_data);
698 		else
699 			w->result = FLM_REBUILD;
700 	}
701 
702 	return (0);
703 }
704 
705 /*
706  * Dump all routing table state to the algo instance.
707  */
708 static enum flm_op_result
709 sync_algo(struct fib_data *fd)
710 {
711 	struct walk_cbdata w = {
712 		.fd = fd,
713 		.func = fd->fd_flm->flm_dump_rib_item_cb,
714 		.result = FLM_SUCCESS,
715 	};
716 
717 	rib_walk_ext_locked(fd->fd_rh, sync_algo_cb, sync_algo_end_cb, &w);
718 
719 	FD_PRINTF(LOG_INFO, fd,
720 	    "initial dump completed (rtable version: %d), result: %s",
721 	    fd->fd_rh->rnh_gen, print_op_result(w.result));
722 
723 	return (w.result);
724 }
725 
/*
 * Schedules epoch-backed @fd instance deletion.
 * * Unlinks @fd from the list of active algo instances.
 * * Removes rib subscription.
 * * Stops callout.
 * * Schedules actual deletion.
 *
 * Assume @fd is already unlinked from the datapath.
 *
 * Has to be called within the network epoch with the rib write lock
 *  held. Repeated calls for the same @fd are harmless: only the first
 *  one schedules the deletion.
 * @in_callout is not used by the body.
 * Always returns 0.
 */
static int
schedule_destroy_fd_instance(struct fib_data *fd, bool in_callout)
{
	bool is_dead;

	NET_EPOCH_ASSERT();
	RIB_WLOCK_ASSERT(fd->fd_rh);

	/* Atomically mark the instance dead and take it off the list */
	FIB_MOD_LOCK();
	is_dead = fd->fd_dead;
	if (!is_dead)
		fd->fd_dead = true;
	if (fd->fd_linked) {
		TAILQ_REMOVE(&V_fib_data_list, fd, entries);
		fd->fd_linked = false;
	}
	FIB_MOD_UNLOCK();
	/* Deletion has already been scheduled by the previous call */
	if (is_dead)
		return (0);

	FD_PRINTF(LOG_INFO, fd, "DETACH");

	/* Subscription is absent if setup failed early or algo has no change callback */
	if (fd->fd_rs != NULL)
		rib_unsibscribe_locked(fd->fd_rs);

	/*
	 * After rib_unsubscribe() no _new_ handle_rtable_change_cb() calls
	 * will be executed, hence no _new_ callout schedules will happen.
	 */
	callout_stop(&fd->fd_callout);

	/* Free the instance via destroy_fd_instance_epoch() */
	epoch_call(net_epoch_preempt, destroy_fd_instance_epoch,
	    &fd->fd_epoch_ctx);

	return (0);
}
771 
/*
 * Wipe all fd instances from the list matching rib specified by @rh.
 * If @keep_first is set, remove all but the first record.
 *
 * If @in_callout is set, the rib write lock is not taken around the
 *  destruction of each instance.
 * NOTE(review): presumably the caller already holds it in that case
 *  (schedule_destroy_fd_instance() asserts it) - confirm.
 */
static void
fib_cleanup_algo(struct rib_head *rh, bool keep_first, bool in_callout)
{
	struct fib_data_head tmp_head = TAILQ_HEAD_INITIALIZER(tmp_head);
	struct fib_data *fd, *fd_tmp;
	struct epoch_tracker et;

	/* Pass 1: move matching entries to the private list under the module lock */
	FIB_MOD_LOCK();
	TAILQ_FOREACH_SAFE(fd, &V_fib_data_list, entries, fd_tmp) {
		if (fd->fd_rh == rh) {
			if (keep_first) {
				/* Skip the first match only */
				keep_first = false;
				continue;
			}
			TAILQ_REMOVE(&V_fib_data_list, fd, entries);
			fd->fd_linked = false;
			TAILQ_INSERT_TAIL(&tmp_head, fd, entries);
		}
	}
	FIB_MOD_UNLOCK();

	/* Pass 2: remove each entry */
	NET_EPOCH_ENTER(et);
	TAILQ_FOREACH_SAFE(fd, &tmp_head, entries, fd_tmp) {
		if (!in_callout)
			RIB_WLOCK(fd->fd_rh);
		schedule_destroy_fd_instance(fd, in_callout);
		if (!in_callout)
			RIB_WUNLOCK(fd->fd_rh);
	}
	NET_EPOCH_EXIT(et);
}
808 
/*
 * Destroys all algo instances attached to the routing table @rh.
 */
void
fib_destroy_rib(struct rib_head *rh)
{

	/*
	 * rnh has `rib_dying` flag set, so setup of new fd's will fail at
	 *  sync_algo() stage, preventing new entries to be added to the list
	 *  of active algos. Remove all existing entries for the particular rib.
	 */
	fib_cleanup_algo(rh, false, false);
}
820 
821 /*
822  * Finalises fd destruction by freeing all fd resources.
823  */
824 static void
825 destroy_fd_instance(struct fib_data *fd)
826 {
827 
828 	FD_PRINTF(LOG_INFO, fd, "destroy fd %p", fd);
829 
830 	/* Call destroy callback first */
831 	if (fd->fd_algo_data != NULL)
832 		fd->fd_flm->flm_destroy_cb(fd->fd_algo_data);
833 
834 	/* Nhop table */
835 	if ((fd->nh_idx != NULL) && (fd->nh_ref_table != NULL)) {
836 		for (int i = 0; i < fd->number_nhops; i++) {
837 			if (!is_idx_free(fd, i)) {
838 				FD_PRINTF(LOG_DEBUG2, fd, " FREE nhop %d %p",
839 				    i, fd->nh_idx[i]);
840 				nhop_free_any(fd->nh_idx[i]);
841 			}
842 		}
843 		free(fd->nh_idx, M_RTABLE);
844 	}
845 	if (fd->nh_ref_table != NULL)
846 		free(fd->nh_ref_table, M_RTABLE);
847 
848 	fib_unref_algo(fd->fd_flm);
849 
850 	free(fd, M_RTABLE);
851 }
852 
853 /*
854  * Epoch callback indicating fd is safe to destroy
855  */
856 static void
857 destroy_fd_instance_epoch(epoch_context_t ctx)
858 {
859 	struct fib_data *fd;
860 
861 	fd = __containerof(ctx, struct fib_data, fd_epoch_ctx);
862 
863 	destroy_fd_instance(fd);
864 }
865 
/*
 * Tries to setup fd instance.
 * - Allocates fd/nhop table
 * - Runs algo:flm_init_cb algo init
 * - Subscribes fd to the rib
 * - Runs rtable dump
 * - Adds instance to the list of active instances.
 *
 * @flm: lookup module to instantiate.
 * @rh: routing table to attach to.
 * @old_fd: previous instance (may be NULL), used to size the nexthop
 *  tables and passed to the algo init callback as the old algo data.
 * @pfd: receives the new instance.
 *
 * Returns: operation result. Fills in @pfd with resulting fd on success.
 * On failure @pfd holds the partially set up instance (NULL only if the
 *  fd allocation itself failed); nothing is freed here, so the caller
 *  has to dispose of it.
 */
static enum flm_op_result
try_setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
    struct fib_data *old_fd, struct fib_data **pfd)
{
	struct fib_data *fd;
	size_t size;
	enum flm_op_result result;

	/* Allocate */
	fd = malloc(sizeof(struct fib_data), M_RTABLE, M_NOWAIT | M_ZERO);
	if (fd == NULL)  {
		*pfd = NULL;
		RH_PRINTF(LOG_INFO, rh, "Unable to allocate fib_data structure");
		return (FLM_REBUILD);
	}
	*pfd = fd;

	/* Sets fd->number_nhops, used for the table allocations below */
	estimate_nhop_scale(old_fd, fd);

	fd->fd_rh = rh;
	fd->fd_gen = ++fib_gen;
	fd->fd_family = rh->rib_family;
	fd->fd_fibnum = rh->rib_fibnum;
	/* Callout is tied to the rib lock */
	callout_init_rm(&fd->fd_callout, &rh->rib_lock, 0);
	fd->fd_vnet = curvnet;
	fd->fd_flm = flm;

	FD_PRINTF(LOG_DEBUG, fd, "allocated fd %p", fd);

	/* Instance holds a module reference, dropped in destroy_fd_instance() */
	FIB_MOD_LOCK();
	flm->flm_refcount++;
	FIB_MOD_UNLOCK();

	/* Allocate nhidx -> nhop_ptr table */
	size = fd->number_nhops * sizeof(void *);
	fd->nh_idx = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
	if (fd->nh_idx == NULL) {
		FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop table idx (sz:%zu)", size);
		return (FLM_REBUILD);
	}

	/* Allocate nhop index refcount table */
	size = sizeof(struct nhop_ref_table);
	/* NOTE(review): element size is given as uint32_t while refcnt[] is int32_t (same width) */
	size += fd->number_nhops * sizeof(uint32_t);
	fd->nh_ref_table = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
	if (fd->nh_ref_table == NULL) {
		FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop refcount table (sz:%zu)", size);
		return (FLM_REBUILD);
	}
	FD_PRINTF(LOG_DEBUG, fd, "Allocated %u nhop indexes", fd->number_nhops);

	/* Okay, we're ready for algo init */
	void *old_algo_data = (old_fd != NULL) ? old_fd->fd_algo_data : NULL;
	result = flm->flm_init_cb(fd->fd_fibnum, fd, old_algo_data, &fd->fd_algo_data);
	if (result != FLM_SUCCESS) {
		FD_PRINTF(LOG_INFO, fd, "%s algo init failed", flm->flm_name);
		return (result);
	}

	/* Try to subscribe; algos without the change callback get no updates */
	if (flm->flm_change_rib_item_cb != NULL) {
		fd->fd_rs = rib_subscribe_locked(fd->fd_rh,
		    handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE);
		if (fd->fd_rs == NULL) {
			FD_PRINTF(LOG_INFO, fd, "failed to subscribe to the rib changes");
			return (FLM_REBUILD);
		}
	}

	/* Dump */
	result = sync_algo(fd);
	if (result != FLM_SUCCESS) {
		FD_PRINTF(LOG_INFO, fd, "rib sync failed");
		return (result);
	}
	FD_PRINTF(LOG_INFO, fd, "DUMP completed successfully.");

	FIB_MOD_LOCK();
	/*
	 * Insert fd in the beginning of a list, to maintain invariant
	 *  that first matching entry for the AF/fib is always the active
	 *  one.
	 */
	TAILQ_INSERT_HEAD(&V_fib_data_list, fd, entries);
	fd->fd_linked = true;
	FIB_MOD_UNLOCK();

	return (FLM_SUCCESS);
}
966 
967 /*
968  * Sets up algo @flm for table @rh and links it to the datapath.
969  *
970  */
971 static enum flm_op_result
972 setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
973     struct fib_data *orig_fd, struct fib_data **pfd, bool attach)
974 {
975 	struct fib_data *prev_fd, *new_fd;
976 	enum flm_op_result result;
977 
978 	NET_EPOCH_ASSERT();
979 	RIB_WLOCK_ASSERT(rh);
980 
981 	prev_fd = orig_fd;
982 	new_fd = NULL;
983 	for (int i = 0; i < FIB_MAX_TRIES; i++) {
984 		result = try_setup_fd_instance(flm, rh, prev_fd, &new_fd);
985 
986 		if ((result == FLM_SUCCESS) && attach)
987 			result = attach_datapath(new_fd);
988 
989 		if ((prev_fd != NULL) && (prev_fd != orig_fd)) {
990 			schedule_destroy_fd_instance(prev_fd, false);
991 			prev_fd = NULL;
992 		}
993 
994 		RH_PRINTF(LOG_INFO, rh, "try %d: fib algo result: %s", i,
995 		    print_op_result(result));
996 
997 		if (result == FLM_REBUILD) {
998 			prev_fd = new_fd;
999 			new_fd = NULL;
1000 			continue;
1001 		}
1002 
1003 		break;
1004 	}
1005 
1006 	if (result != FLM_SUCCESS) {
1007 		RH_PRINTF(LOG_WARNING, rh,
1008 		    "%s algo instance setup failed, failures=%d", flm->flm_name,
1009 		    orig_fd ? orig_fd->fd_failed_rebuilds + 1 : 0);
1010 		/* update failure count */
1011 		FIB_MOD_LOCK();
1012 		if (orig_fd != NULL)
1013 			orig_fd->fd_failed_rebuilds++;
1014 		FIB_MOD_UNLOCK();
1015 
1016 		/* Ban algo on non-recoverable error */
1017 		if (result == FLM_ERROR)
1018 			flm_error_add(flm, rh->rib_fibnum);
1019 
1020 		if ((prev_fd != NULL) && (prev_fd != orig_fd))
1021 			schedule_destroy_fd_instance(prev_fd, false);
1022 		if (new_fd != NULL) {
1023 			schedule_destroy_fd_instance(new_fd, false);
1024 			new_fd = NULL;
1025 		}
1026 	}
1027 
1028 	*pfd = new_fd;
1029 	return (result);
1030 }
1031 
1032 /*
1033  * Callout for all scheduled fd-related work.
1034  * - Checks if the current algo is still the best algo
1035  * - Creates a new instance of an algo for af/fib if desired.
1036  */
1037 static void
1038 rebuild_fd_callout(void *_data)
1039 {
1040 	struct fib_data *fd = (struct fib_data *)_data;
1041 	struct epoch_tracker et;
1042 
1043 	FD_PRINTF(LOG_INFO, fd, "running callout rebuild");
1044 
1045 	NET_EPOCH_ENTER(et);
1046 	CURVNET_SET(fd->fd_vnet);
1047 	rebuild_fd(fd);
1048 	CURVNET_RESTORE();
1049 	NET_EPOCH_EXIT(et);
1050 }
1051 
1052 /*
1053  * Tries to create new algo instance based on @fd data.
1054  * Returns true on success.
1055  */
1056 static bool
1057 rebuild_fd(struct fib_data *fd)
1058 {
1059 	struct fib_data *fd_new, *fd_tmp;
1060 	struct fib_lookup_module *flm_new = NULL;
1061 	enum flm_op_result result;
1062 	bool need_rebuild = false;
1063 
1064 	NET_EPOCH_ASSERT();
1065 	RIB_WLOCK_ASSERT(fd->fd_rh);
1066 
1067 	need_rebuild = fd->fd_need_rebuild;
1068 	fd->fd_need_rebuild = false;
1069 	fd->fd_force_eval = false;
1070 	fd->fd_num_changes = 0;
1071 
1072 	/* First, check if we're still OK to use this algo */
1073 	if (!is_algo_fixed(fd->fd_rh))
1074 		flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm);
1075 	if ((flm_new == NULL) && (!need_rebuild)) {
1076 		/* Keep existing algo, no need to rebuild. */
1077 		return (true);
1078 	}
1079 
1080 	if (flm_new == NULL) {
1081 		flm_new = fd->fd_flm;
1082 		fd_tmp = fd;
1083 	} else {
1084 		fd_tmp = NULL;
1085 		FD_PRINTF(LOG_NOTICE, fd, "switching algo to %s", flm_new->flm_name);
1086 	}
1087 	result = setup_fd_instance(flm_new, fd->fd_rh, fd_tmp, &fd_new, true);
1088 	if (fd_tmp == NULL) {
1089 		/* fd_new represents new algo */
1090 		fib_unref_algo(flm_new);
1091 	}
1092 	if (result != FLM_SUCCESS) {
1093 		FD_PRINTF(LOG_NOTICE, fd, "table rebuild failed");
1094 		return (false);
1095 	}
1096 	FD_PRINTF(LOG_INFO, fd_new, "switched to new instance");
1097 
1098 	/* Remove old instance */
1099 	schedule_destroy_fd_instance(fd, true);
1100 
1101 	return (true);
1102 }
1103 
1104 /*
1105  * Finds algo by name/family.
1106  * Returns referenced algo or NULL.
1107  */
1108 static struct fib_lookup_module *
1109 fib_find_algo(const char *algo_name, int family)
1110 {
1111 	struct fib_lookup_module *flm;
1112 
1113 	FIB_MOD_LOCK();
1114 	TAILQ_FOREACH(flm, &all_algo_list, entries) {
1115 		if ((strcmp(flm->flm_name, algo_name) == 0) &&
1116 		    (family == flm->flm_family)) {
1117 			flm->flm_refcount++;
1118 			FIB_MOD_UNLOCK();
1119 			return (flm);
1120 		}
1121 	}
1122 	FIB_MOD_UNLOCK();
1123 
1124 	return (NULL);
1125 }
1126 
1127 static void
1128 fib_unref_algo(struct fib_lookup_module *flm)
1129 {
1130 
1131 	FIB_MOD_LOCK();
1132 	flm->flm_refcount--;
1133 	FIB_MOD_UNLOCK();
1134 }
1135 
1136 static int
1137 set_fib_algo(uint32_t fibnum, int family, struct sysctl_oid *oidp, struct sysctl_req *req)
1138 {
1139 	struct fib_lookup_module *flm = NULL;
1140 	struct fib_data *fd = NULL;
1141 	char old_algo_name[32], algo_name[32];
1142 	struct rib_head *rh = NULL;
1143 	enum flm_op_result result;
1144 	struct epoch_tracker et;
1145 	int error;
1146 
1147 	/* Fetch current algo/rib for af/family */
1148 	FIB_MOD_LOCK();
1149 	TAILQ_FOREACH(fd, &V_fib_data_list, entries) {
1150 		if ((fd->fd_family == family) && (fd->fd_fibnum == fibnum))
1151 			break;
1152 	}
1153 	if (fd == NULL) {
1154 		FIB_MOD_UNLOCK();
1155 		return (ENOENT);
1156 	}
1157 	rh = fd->fd_rh;
1158 	strlcpy(old_algo_name, fd->fd_flm->flm_name,
1159 	    sizeof(old_algo_name));
1160 	FIB_MOD_UNLOCK();
1161 
1162 	strlcpy(algo_name, old_algo_name, sizeof(algo_name));
1163 	error = sysctl_handle_string(oidp, algo_name, sizeof(algo_name), req);
1164 	if (error != 0 || req->newptr == NULL)
1165 		return (error);
1166 
1167 	if (strcmp(algo_name, old_algo_name) == 0)
1168 		return (0);
1169 
1170 	/* New algorithm name is different */
1171 	flm = fib_find_algo(algo_name, family);
1172 	if (flm == NULL) {
1173 		RH_PRINTF(LOG_INFO, rh, "unable to find algo %s", algo_name);
1174 		return (ESRCH);
1175 	}
1176 
1177 	fd = NULL;
1178 	NET_EPOCH_ENTER(et);
1179 	RIB_WLOCK(rh);
1180 	result = setup_fd_instance(flm, rh, NULL, &fd, true);
1181 	RIB_WUNLOCK(rh);
1182 	NET_EPOCH_EXIT(et);
1183 	fib_unref_algo(flm);
1184 	if (result != FLM_SUCCESS)
1185 		return (EINVAL);
1186 
1187 	/* Disable automated jumping between algos */
1188 	FIB_MOD_LOCK();
1189 	set_algo_fixed(rh);
1190 	FIB_MOD_UNLOCK();
1191 	/* Remove old instance(s) */
1192 	fib_cleanup_algo(rh, true, false);
1193 
1194 	/* Drain cb so user can unload the module after userret if so desired */
1195 	epoch_drain_callbacks(net_epoch_preempt);
1196 
1197 	return (0);
1198 }
1199 
#ifdef INET
/*
 * Handler of the IPv4 "algo" sysctl.
 * Operates on the fib of the calling process.
 */
static int
set_algo_inet_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	uint32_t fibnum;

	fibnum = curthread->td_proc->p_fibnum;
	return (set_fib_algo(fibnum, AF_INET, oidp, req));
}
SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_VNET | CTLFLAG_MPSAFE, NULL, 0,
    set_algo_inet_sysctl_handler, "A", "Set IPv4 lookup algo");
#endif
1211 
#ifdef INET6
/*
 * Handler of the IPv6 "algo" sysctl.
 * Operates on the fib of the calling process.
 */
static int
set_algo_inet6_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	uint32_t fibnum;

	fibnum = curthread->td_proc->p_fibnum;
	return (set_fib_algo(fibnum, AF_INET6, oidp, req));
}
SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_VNET | CTLFLAG_MPSAFE, NULL, 0,
    set_algo_inet6_sysctl_handler, "A", "Set IPv6 lookup algo");
#endif
1223 
1224 static void
1225 destroy_fdh_epoch(epoch_context_t ctx)
1226 {
1227 	struct fib_dp_header *fdh;
1228 
1229 	fdh = __containerof(ctx, struct fib_dp_header, fdh_epoch_ctx);
1230 	free(fdh, M_RTABLE);
1231 }
1232 
1233 static struct fib_dp_header *
1234 alloc_fib_dp_array(uint32_t num_tables, bool waitok)
1235 {
1236 	size_t sz;
1237 	struct fib_dp_header *fdh;
1238 
1239 	sz = sizeof(struct fib_dp_header);
1240 	sz += sizeof(struct fib_dp) * num_tables;
1241 	fdh = malloc(sz, M_RTABLE, (waitok ? M_WAITOK : M_NOWAIT) | M_ZERO);
1242 	if (fdh != NULL)
1243 		fdh->fdh_num_tables = num_tables;
1244 	return (fdh);
1245 }
1246 
1247 static struct fib_dp_header *
1248 get_fib_dp_header(struct fib_dp *dp)
1249 {
1250 
1251 	return (__containerof((void *)dp, struct fib_dp_header, fdh_idx));
1252 }
1253 
1254 /*
1255  * Replace per-family index pool @pdp with a new one which
1256  * contains updated callback/algo data from @fd.
1257  * Returns 0 on success.
1258  */
1259 static enum flm_op_result
1260 replace_rtables_family(struct fib_dp **pdp, struct fib_data *fd)
1261 {
1262 	struct fib_dp_header *new_fdh, *old_fdh;
1263 
1264 	NET_EPOCH_ASSERT();
1265 
1266 	FD_PRINTF(LOG_DEBUG, fd, "[vnet %p] replace with f:%p arg:%p",
1267 	    curvnet, fd->fd_dp.f, fd->fd_dp.arg);
1268 
1269 	FIB_MOD_LOCK();
1270 	old_fdh = get_fib_dp_header(*pdp);
1271 	new_fdh = alloc_fib_dp_array(old_fdh->fdh_num_tables, false);
1272 	FD_PRINTF(LOG_DEBUG, fd, "OLD FDH: %p NEW FDH: %p", old_fdh, new_fdh);
1273 	if (new_fdh == NULL) {
1274 		FIB_MOD_UNLOCK();
1275 		FD_PRINTF(LOG_WARNING, fd, "error attaching datapath");
1276 		return (FLM_REBUILD);
1277 	}
1278 
1279 	memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0],
1280 	    old_fdh->fdh_num_tables * sizeof(struct fib_dp));
1281 	/* Update relevant data structure for @fd */
1282 	new_fdh->fdh_idx[fd->fd_fibnum] = fd->fd_dp;
1283 
1284 	/* Ensure memcpy() writes have completed */
1285 	atomic_thread_fence_rel();
1286 	/* Set new datapath pointer */
1287 	*pdp = &new_fdh->fdh_idx[0];
1288 	FIB_MOD_UNLOCK();
1289 	FD_PRINTF(LOG_DEBUG, fd, "update %p -> %p", old_fdh, new_fdh);
1290 
1291 	epoch_call(net_epoch_preempt, destroy_fdh_epoch,
1292 	    &old_fdh->fdh_epoch_ctx);
1293 
1294 	return (FLM_SUCCESS);
1295 }
1296 
1297 static struct fib_dp **
1298 get_family_dp_ptr(int family)
1299 {
1300 	switch (family) {
1301 	case AF_INET:
1302 		return (&V_inet_dp);
1303 	case AF_INET6:
1304 		return (&V_inet6_dp);
1305 	}
1306 	return (NULL);
1307 }
1308 
1309 /*
1310  * Make datapath use fib instance @fd
1311  */
1312 static enum flm_op_result
1313 attach_datapath(struct fib_data *fd)
1314 {
1315 	struct fib_dp **pdp;
1316 
1317 	pdp = get_family_dp_ptr(fd->fd_family);
1318 	return (replace_rtables_family(pdp, fd));
1319 }
1320 
1321 /*
1322  * Grow datapath pointers array.
1323  * Called from sysctl handler on growing number of routing tables.
1324  */
1325 static void
1326 grow_rtables_family(struct fib_dp **pdp, uint32_t new_num_tables)
1327 {
1328 	struct fib_dp_header *new_fdh, *old_fdh = NULL;
1329 
1330 	new_fdh = alloc_fib_dp_array(new_num_tables, true);
1331 
1332 	FIB_MOD_LOCK();
1333 	if (*pdp != NULL) {
1334 		old_fdh = get_fib_dp_header(*pdp);
1335 		memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0],
1336 		    old_fdh->fdh_num_tables * sizeof(struct fib_dp));
1337 	}
1338 
1339 	/* Wait till all writes completed */
1340 	atomic_thread_fence_rel();
1341 
1342 	*pdp = &new_fdh->fdh_idx[0];
1343 	FIB_MOD_UNLOCK();
1344 
1345 	if (old_fdh != NULL)
1346 		epoch_call(net_epoch_preempt, destroy_fdh_epoch,
1347 		    &old_fdh->fdh_epoch_ctx);
1348 }
1349 
/*
 * Resizes the per-family datapath pointer arrays of every address family
 * compiled into the kernel to @new_num_tables entries.
 * Called from fibs resize sysctl handler.
 */
void
fib_grow_rtables(uint32_t new_num_tables)
{

#ifdef INET
	/* IPv4 datapath array */
	grow_rtables_family(&V_inet_dp, new_num_tables);
#endif
#ifdef INET6
	/* IPv6 datapath array */
	grow_rtables_family(&V_inet6_dp, new_num_tables);
#endif
}
1365 
1366 void
1367 fib_get_rtable_info(struct rib_head *rh, struct rib_rtable_info *rinfo)
1368 {
1369 
1370 	bzero(rinfo, sizeof(struct rib_rtable_info));
1371 	rinfo->num_prefixes = rh->rnh_prefixes;
1372 	rinfo->num_nhops = nhops_get_count(rh);
1373 #ifdef ROUTE_MPATH
1374 	rinfo->num_nhgrp = nhgrp_get_count(rh);
1375 #endif
1376 }
1377 
1378 /*
1379  * Accessor to get rib instance @fd is attached to.
1380  */
1381 struct rib_head *
1382 fib_get_rh(struct fib_data *fd)
1383 {
1384 
1385 	return (fd->fd_rh);
1386 }
1387 
1388 /*
1389  * Accessor to export idx->nhop array
1390  */
1391 struct nhop_object **
1392 fib_get_nhop_array(struct fib_data *fd)
1393 {
1394 
1395 	return (fd->nh_idx);
1396 }
1397 
/*
 * Computes the index table slot for @nh.
 *
 * With ROUTE_MPATH plain nexthops and nexthop groups share one table:
 * a nexthop maps to twice its own index, a group to twice its own index
 * minus one.  Without ROUTE_MPATH the nexthop index is used as is.
 */
static uint32_t
get_nhop_idx(struct nhop_object *nh)
{
#ifdef ROUTE_MPATH
	if (!NH_IS_NHGRP(nh))
		return (nhop_get_idx(nh) * 2);

	return (nhgrp_get_idx((struct nhgrp_object *)nh) * 2 - 1);
#else
	return (nhop_get_idx(nh));
#endif
}
1410 
/*
 * Public wrapper around get_nhop_idx(): returns the index table slot
 * for @nh.  @fd is not consulted.
 */
uint32_t
fib_get_nhop_idx(struct fib_data *fd, struct nhop_object *nh)
{
	uint32_t idx;

	idx = get_nhop_idx(nh);
	return (idx);
}
1417 
1418 static bool
1419 is_idx_free(struct fib_data *fd, uint32_t index)
1420 {
1421 
1422 	return (fd->nh_ref_table->refcnt[index] == 0);
1423 }
1424 
1425 static uint32_t
1426 fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh)
1427 {
1428 	uint32_t idx = get_nhop_idx(nh);
1429 
1430 	if (idx >= fd->number_nhops) {
1431 		fd->hit_nhops = 1;
1432 		return (0);
1433 	}
1434 
1435 	if (is_idx_free(fd, idx)) {
1436 		nhop_ref_any(nh);
1437 		fd->nh_idx[idx] = nh;
1438 		fd->nh_ref_table->count++;
1439 		FD_PRINTF(LOG_DEBUG2, fd, " REF nhop %u %p", idx, fd->nh_idx[idx]);
1440 	}
1441 	fd->nh_ref_table->refcnt[idx]++;
1442 
1443 	return (idx);
1444 }
1445 
/*
 * Carries a nexthop whose reference release has been deferred to an
 * epoch callback (see release_nhop_epoch()).
 */
struct nhop_release_data {
	struct nhop_object	*nh;	/* nexthop to hand to nhop_free_any() */
	struct epoch_context	ctx;	/* context passed to epoch_call() */
};
1450 
1451 static void
1452 release_nhop_epoch(epoch_context_t ctx)
1453 {
1454 	struct nhop_release_data *nrd;
1455 
1456 	nrd = __containerof(ctx, struct nhop_release_data, ctx);
1457 	nhop_free_any(nrd->nh);
1458 	free(nrd, M_TEMP);
1459 }
1460 
1461 /*
1462  * Delays nexthop refcount release.
1463  * Datapath may have the datastructures not updated yet, so the old
1464  *  nexthop may still be returned till the end of current epoch. Delay
1465  *  refcount removal, as we may be removing the last instance, which will
1466  *  trigger nexthop deletion, rendering returned nexthop invalid.
1467  */
1468 static void
1469 fib_schedule_release_nhop(struct fib_data *fd, struct nhop_object *nh)
1470 {
1471 	struct nhop_release_data *nrd;
1472 
1473 	nrd = malloc(sizeof(struct nhop_release_data), M_TEMP, M_NOWAIT | M_ZERO);
1474 	if (nrd != NULL) {
1475 		nrd->nh = nh;
1476 		epoch_call(net_epoch_preempt, release_nhop_epoch, &nrd->ctx);
1477 	} else {
1478 		/*
1479 		 * Unable to allocate memory. Leak nexthop to maintain guarantee
1480 		 *  that each nhop can be referenced.
1481 		 */
1482 		FD_PRINTF(LOG_ERR, fd, "unable to schedule nhop %p deletion", nh);
1483 	}
1484 }
1485 
1486 static void
1487 fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh)
1488 {
1489 	uint32_t idx = get_nhop_idx(nh);
1490 
1491 	KASSERT((idx < fd->number_nhops), ("invalid nhop index"));
1492 	KASSERT((nh == fd->nh_idx[idx]), ("index table contains whong nh"));
1493 
1494 	fd->nh_ref_table->refcnt[idx]--;
1495 	if (fd->nh_ref_table->refcnt[idx] == 0) {
1496 		FD_PRINTF(LOG_DEBUG, fd, " FREE nhop %d %p", idx, fd->nh_idx[idx]);
1497 		fib_schedule_release_nhop(fd, fd->nh_idx[idx]);
1498 	}
1499 }
1500 
/*
 * Marks the algo as fixed for the address family of @rh.
 * Does nothing for families other than the compiled-in AF_INET/AF_INET6.
 * set_fib_algo() calls this with the module lock held.
 */
static void
set_algo_fixed(struct rib_head *rh)
{

#ifdef INET
	if (rh->rib_family == AF_INET)
		V_algo_fixed_inet = true;
#endif
#ifdef INET6
	if (rh->rib_family == AF_INET6)
		V_algo_fixed_inet6 = true;
#endif
}
1517 
/*
 * Tells whether the algo has been marked as fixed for the address family
 * of @rh.  Families other than the compiled-in AF_INET/AF_INET6 are
 * reported as not fixed.
 */
static bool
is_algo_fixed(struct rib_head *rh)
{

#ifdef INET
	if (rh->rib_family == AF_INET)
		return (V_algo_fixed_inet);
#endif
#ifdef INET6
	if (rh->rib_family == AF_INET6)
		return (V_algo_fixed_inet6);
#endif
	return (false);
}
1534 
1535 /*
1536  * Runs the check on what would be the best algo for rib @rh, assuming
1537  *  that the current algo is the one specified by @orig_flm. Note that
1538  *  it can be NULL for initial selection.
1539  *
1540  * Returns referenced new algo or NULL if the current one is the best.
1541  */
1542 static struct fib_lookup_module *
1543 fib_check_best_algo(struct rib_head *rh, struct fib_lookup_module *orig_flm)
1544 {
1545 	uint8_t preference, curr_preference = 0, best_preference = 0;
1546 	struct fib_lookup_module *flm, *best_flm = NULL;
1547 	struct rib_rtable_info rinfo;
1548 	int candidate_algos = 0;
1549 
1550 	fib_get_rtable_info(rh, &rinfo);
1551 
1552 	FIB_MOD_LOCK();
1553 	TAILQ_FOREACH(flm, &all_algo_list, entries) {
1554 		if (flm->flm_family != rh->rib_family)
1555 			continue;
1556 		candidate_algos++;
1557 		preference = flm->flm_get_pref(&rinfo);
1558 		if (preference > best_preference) {
1559 			if (!flm_error_check(flm, rh->rib_fibnum)) {
1560 				best_preference = preference;
1561 				best_flm = flm;
1562 			}
1563 		}
1564 		if (flm == orig_flm)
1565 			curr_preference = preference;
1566 	}
1567 	if ((best_flm != NULL) && (curr_preference + BEST_DIFF_PERCENT < best_preference))
1568 		best_flm->flm_refcount++;
1569 	else
1570 		best_flm = NULL;
1571 	FIB_MOD_UNLOCK();
1572 
1573 	RH_PRINTF(LOG_DEBUG, rh, "candidate_algos: %d, curr: %s(%d) result: %s(%d)",
1574 	    candidate_algos, orig_flm ? orig_flm->flm_name : "NULL", curr_preference,
1575 	    best_flm ? best_flm->flm_name : (orig_flm ? orig_flm->flm_name : "NULL"),
1576 	    best_preference);
1577 
1578 	return (best_flm);
1579 }
1580 
1581 /*
1582  * Called when new route table is created.
1583  * Selects, allocates and attaches fib algo for the table.
1584  */
1585 int
1586 fib_select_algo_initial(struct rib_head *rh)
1587 {
1588 	struct fib_lookup_module *flm;
1589 	struct fib_data *fd = NULL;
1590 	enum flm_op_result result;
1591 	struct epoch_tracker et;
1592 	int error = 0;
1593 
1594 	flm = fib_check_best_algo(rh, NULL);
1595 	if (flm == NULL) {
1596 		RH_PRINTF(LOG_CRIT, rh, "no algo selected");
1597 		return (ENOENT);
1598 	}
1599 	RH_PRINTF(LOG_INFO, rh, "selected algo %s", flm->flm_name);
1600 
1601 	NET_EPOCH_ENTER(et);
1602 	RIB_WLOCK(rh);
1603 	result = setup_fd_instance(flm, rh, NULL, &fd, false);
1604 	RIB_WUNLOCK(rh);
1605 	NET_EPOCH_EXIT(et);
1606 
1607 	RH_PRINTF(LOG_DEBUG, rh, "result=%d fd=%p", result, fd);
1608 	if (result == FLM_SUCCESS) {
1609 
1610 		/*
1611 		 * Attach datapath directly to avoid multiple reallocations
1612 		 * during fib growth
1613 		 */
1614 		struct fib_dp_header *fdp;
1615 		struct fib_dp **pdp;
1616 
1617 		pdp = get_family_dp_ptr(rh->rib_family);
1618 		if (pdp != NULL) {
1619 			fdp = get_fib_dp_header(*pdp);
1620 			fdp->fdh_idx[fd->fd_fibnum] = fd->fd_dp;
1621 			FD_PRINTF(LOG_INFO, fd, "datapath attached");
1622 		}
1623 	} else {
1624 		error = EINVAL;
1625 		RH_PRINTF(LOG_CRIT, rh, "unable to setup algo %s", flm->flm_name);
1626 	}
1627 
1628 	fib_unref_algo(flm);
1629 
1630 	return (error);
1631 }
1632 
1633 /*
1634  * Registers fib lookup module within the subsystem.
1635  */
1636 int
1637 fib_module_register(struct fib_lookup_module *flm)
1638 {
1639 
1640 	FIB_MOD_LOCK();
1641 	ALGO_PRINTF("attaching %s to %s", flm->flm_name,
1642 	    print_family(flm->flm_family));
1643 	TAILQ_INSERT_TAIL(&all_algo_list, flm, entries);
1644 	FIB_MOD_UNLOCK();
1645 
1646 	return (0);
1647 }
1648 
1649 /*
1650  * Tries to unregister fib lookup module.
1651  *
1652  * Returns 0 on success, EBUSY if module is still used
1653  *  by some of the tables.
1654  */
1655 int
1656 fib_module_unregister(struct fib_lookup_module *flm)
1657 {
1658 
1659 	FIB_MOD_LOCK();
1660 	if (flm->flm_refcount > 0) {
1661 		FIB_MOD_UNLOCK();
1662 		return (EBUSY);
1663 	}
1664 	fib_error_clear_flm(flm);
1665 	ALGO_PRINTF("detaching %s from %s", flm->flm_name,
1666 	    print_family(flm->flm_family));
1667 	TAILQ_REMOVE(&all_algo_list, flm, entries);
1668 	FIB_MOD_UNLOCK();
1669 
1670 	return (0);
1671 }
1672 
1673 void
1674 vnet_fib_init(void)
1675 {
1676 
1677 	TAILQ_INIT(&V_fib_data_list);
1678 }
1679 
/*
 * Per-vnet teardown: clears the recorded algo errors under the module lock.
 */
void
vnet_fib_destroy(void)
{
	FIB_MOD_LOCK();
	fib_error_clear();
	FIB_MOD_UNLOCK();
}
1688