1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2020 Alexander V. Chernikov
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 #include "opt_inet.h"
31 #include "opt_inet6.h"
32 #include "opt_route.h"
33 
34 #include <sys/param.h>
35 #include <sys/eventhandler.h>
36 #include <sys/kernel.h>
37 #include <sys/sbuf.h>
38 #include <sys/lock.h>
39 #include <sys/rmlock.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/module.h>
44 #include <sys/priv.h>
45 #include <sys/proc.h>
46 #include <sys/socket.h>
47 #include <sys/socketvar.h>
48 #include <sys/sysctl.h>
49 #include <sys/syslog.h>
50 #include <sys/queue.h>
51 #include <net/vnet.h>
52 
53 #include <net/if.h>
54 #include <net/if_var.h>
55 
56 #include <netinet/in.h>
57 #include <netinet/in_var.h>
58 #include <netinet/ip.h>
59 #include <netinet/ip_var.h>
60 #ifdef INET6
61 #include <netinet/ip6.h>
62 #include <netinet6/ip6_var.h>
63 #endif
64 
65 #include <net/route.h>
66 #include <net/route/nhop.h>
67 #include <net/route/route_ctl.h>
68 #include <net/route/route_var.h>
69 #include <net/route/fib_algo.h>
70 
71 #include <machine/stdarg.h>
72 
73 /*
74  * Fib lookup framework.
75  *
76  * This framework enables accelerated longest-prefix-match lookups for the
77  *  routing tables by adding the ability to dynamically attach/detach lookup
78  *  algorithm implementations to/from the datapath.
79  *
80  * flm - fib lookup modules - implementation of particular lookup algorithm
81  * fd - fib data - instance of an flm bound to specific routing table
82  *
83  * This file provides main framework functionality.
84  *
85  * The following are the features provided by the framework
86  *
87  * 1) nexthop abstraction -> provides transparent referencing, indexing
88  *   and efficient idx->ptr mappings for nexthops and nexthop groups.
89  * 2) Routing table synchronisation
90  * 3) dataplane attachment points
91  * 4) automatic algorithm selection based on the provided preference.
92  *
93  *
94  * DATAPATH
95  * For each supported address family, there is an allocated array of fib_dp
96  *  structures, indexed by fib number. Each array entry contains a callback
97  *  function and its argument. This function will be called with a
98  *  family-specific lookup key, scope and the provided argument. This array
99  *  gets re-created every time a new algo instance gets created. Please take
100  *  a look at the replace_rtables_family() function for more details.
101  *
102  */
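
/*
 * Illustrative dispatch sketch (not part of this file; the exact lookup
 * callback signature lives in fib_algo.h, and "key"/"scopeid" below are
 * hypothetical names):
 *
 *	const struct fib_dp *dp = &V_inet_dp[fibnum];
 *	struct nhop_object *nh = dp->f(dp->arg, key, scopeid);
 *
 * Since the per-family array is only ever replaced as a whole, a reader
 * inside an epoch section always sees a consistent {func, arg} pair.
 */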
103 
104 SYSCTL_DECL(_net_route);
105 SYSCTL_NODE(_net_route, OID_AUTO, algo, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
106     "Fib algorithm lookups");
107 
108 #ifdef INET6
109 bool algo_fixed_inet6 = false;
110 SYSCTL_NODE(_net_route_algo, OID_AUTO, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
111     "IPv6 longest prefix match lookups");
112 #endif
113 #ifdef INET
114 bool algo_fixed_inet = false;
115 SYSCTL_NODE(_net_route_algo, OID_AUTO, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
116     "IPv4 longest prefix match lookups");
117 #endif
118 
119 struct nhop_ref_table {
120 	uint32_t		count;
121 	int32_t			refcnt[0];
122 };
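
/*
 * refcnt[0] is a zero-length ("flexible") array: the table is allocated
 * with space for number_nhops counters appended to this header; see the
 * sizing math in try_setup_fd_instance().
 */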
123 
124 /*
125  * Data structure for the fib lookup instance tied to the particular rib.
126  */
127 struct fib_data {
128 	uint32_t		number_nhops;	/* current # of nhops */
129 	uint8_t			hit_nhops;	/* true if out of nhop limit */
130 	uint8_t			init_done;	/* true if init is completed */
131 	uint32_t		fd_dead:1;	/* Scheduled for deletion */
132 	uint32_t		fd_linked:1;	/* true if linked */
133 	uint32_t		fd_need_rebuild:1;	/* true if rebuild scheduled */
134 	uint32_t		fd_force_eval:1;/* true if forced re-evaluation scheduled */
135 	uint8_t			fd_family;	/* family */
136 	uint32_t		fd_fibnum;	/* fibnum */
137 	uint32_t		fd_failed_rebuilds;	/* stat: failed rebuilds */
138 	uint32_t		fd_algo_mask;	/* bitmask for algo data */
139 	struct callout		fd_callout;	/* rebuild callout */
140 	void			*fd_algo_data;	/* algorithm data */
141 	struct nhop_object	**nh_idx;	/* nhop idx->ptr array */
142 	struct nhop_ref_table	*nh_ref_table;	/* array with # of nhop references */
143 	struct rib_head		*fd_rh;		/* RIB table we're attached to */
144 	struct rib_subscription	*fd_rs;		/* storing table subscription */
145 	struct fib_dp		fd_dp;		/* fib datapath data */
146 	struct vnet		*fd_vnet;	/* vnet fib belongs to */
147 	struct epoch_context	fd_epoch_ctx;	/* epoch context for deletion */
148 	struct fib_lookup_module	*fd_flm;/* pointer to the lookup module */
149 	uint32_t		fd_num_changes;	/* number of changes since last callout */
150 	TAILQ_ENTRY(fib_data)	entries;	/* list of all fds in vnet */
151 };
152 
153 static void rebuild_fd_callout(void *_data);
154 static void destroy_fd_instance_epoch(epoch_context_t ctx);
155 static enum flm_op_result attach_datapath(struct fib_data *fd);
156 static bool is_idx_free(struct fib_data *fd, uint32_t index);
157 static void set_algo_fixed(struct rib_head *rh);
158 
159 static uint32_t fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh);
160 static void fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh);
161 
162 static struct fib_lookup_module *fib_check_best_algo(struct rib_head *rh,
163     struct fib_lookup_module *orig_flm);
164 static void fib_unref_algo(struct fib_lookup_module *flm);
165 static bool flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum);
166 
167 struct mtx fib_mtx;
168 #define	FIB_MOD_LOCK()		mtx_lock(&fib_mtx)
169 #define	FIB_MOD_UNLOCK()	mtx_unlock(&fib_mtx)
170 #define	FIB_MOD_LOCK_ASSERT()	mtx_assert(&fib_mtx, MA_OWNED)
171 
172 MTX_SYSINIT(fib_mtx, &fib_mtx, "algo list mutex", MTX_DEF);
173 
174 /* Algorithm has to be this percent better than the current to switch */
175 #define	BEST_DIFF_PERCENT	(5 * 256 / 100)
176 /* Schedule algo re-evaluation X seconds after a change */
177 #define	ALGO_EVAL_DELAY_MS	30000
178 /* Force algo re-evaluation after X changes */
179 #define	ALGO_EVAL_NUM_ROUTES	100
180 /* Try to setup algorithm X times */
181 #define	FIB_MAX_TRIES		32
182 /* Max number of supported nexthops */
183 #define	FIB_MAX_NHOPS		262144
184 #define	FIB_CALLOUT_DELAY_MS	50
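
/*
 * Worked example for the switch threshold above: preferences use a
 * 0..255 scale, so BEST_DIFF_PERCENT evaluates to (5 * 256 / 100) == 12.
 * If the current algo reports preference 100, a candidate needs a
 * preference above 112 before fib_check_best_algo() switches to it.
 */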
185 
186 /* Debug */
187 static int flm_debug_level = LOG_NOTICE;
188 SYSCTL_INT(_net_route_algo, OID_AUTO, debug_level, CTLFLAG_RW | CTLFLAG_RWTUN,
189     &flm_debug_level, 0, "debuglevel");
190 #define	FLM_MAX_DEBUG_LEVEL	LOG_DEBUG
191 
192 #define	_PASS_MSG(_l)	(flm_debug_level >= (_l))
193 #define	ALGO_PRINTF(_fmt, ...)	printf("[fib_algo] %s: " _fmt "\n", __func__, ##__VA_ARGS__)
194 #define	_ALGO_PRINTF(_fib, _fam, _aname, _func, _fmt, ...) \
195     printf("[fib_algo] %s.%u (%s) %s: " _fmt "\n",\
196     print_family(_fam), _fib, _aname, _func, ## __VA_ARGS__)
197 #define	_RH_PRINTF(_fib, _fam, _func, _fmt, ...) \
198     printf("[fib_algo] %s.%u %s: " _fmt "\n", print_family(_fam), _fib, _func, ## __VA_ARGS__)
199 #define	RH_PRINTF(_l, _rh, _fmt, ...)	if (_PASS_MSG(_l)) {	\
200     _RH_PRINTF(_rh->rib_fibnum, _rh->rib_family, __func__, _fmt, ## __VA_ARGS__);\
201 }
202 #define	FD_PRINTF(_l, _fd, _fmt, ...)	FD_PRINTF_##_l(_l, _fd, _fmt, ## __VA_ARGS__)
203 #define	_FD_PRINTF(_l, _fd, _fmt, ...)	if (_PASS_MSG(_l)) {		\
204     _ALGO_PRINTF(_fd->fd_fibnum, _fd->fd_family, _fd->fd_flm->flm_name,	\
205     __func__, _fmt, ## __VA_ARGS__);					\
206 }
207 #if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG
208 #define	FD_PRINTF_LOG_DEBUG	_FD_PRINTF
209 #else
210 #define	FD_PRINTF_LOG_DEBUG(...)
211 #endif
212 #if FLM_MAX_DEBUG_LEVEL>=LOG_INFO
213 #define	FD_PRINTF_LOG_INFO	_FD_PRINTF
214 #else
215 #define	FD_PRINTF_LOG_INFO(...)
216 #endif
217 #define	FD_PRINTF_LOG_NOTICE	_FD_PRINTF
218 #define	FD_PRINTF_LOG_ERR	_FD_PRINTF
219 #define	FD_PRINTF_LOG_WARNING	_FD_PRINTF
220 
221 
222 /* List of all registered lookup algorithms */
223 static TAILQ_HEAD(, fib_lookup_module) all_algo_list = TAILQ_HEAD_INITIALIZER(all_algo_list);
224 
225 /* List of all fib lookup instances in the vnet */
226 VNET_DEFINE_STATIC(TAILQ_HEAD(fib_data_head, fib_data), fib_data_list);
227 #define	V_fib_data_list	VNET(fib_data_list)
228 
229 /* Data structure for storing non-transient fib lookup module failures */
230 struct fib_error {
231 	int				fe_family;
232 	uint32_t			fe_fibnum;	/* failed rtable */
233 	struct fib_lookup_module	*fe_flm;	/* failed module */
234 	TAILQ_ENTRY(fib_error)		entries;/* list of all errored entries */
235 };
236 VNET_DEFINE_STATIC(TAILQ_HEAD(fib_error_head, fib_error), fib_error_list);
237 #define	V_fib_error_list VNET(fib_error_list)
238 
239 /* Per-family array of fibnum -> {func, arg} mappings used in datapath */
240 struct fib_dp_header {
241 	struct epoch_context	fdh_epoch_ctx;
242 	uint32_t		fdh_num_tables;
243 	struct fib_dp		fdh_idx[0];
244 };
245 
246 /*
247  * Tries to add a new non-transient algorithm error to the list of
248  *  errors.
249  * Returns true on success.
250  */
251 static bool
252 flm_error_add(struct fib_lookup_module *flm, uint32_t fibnum)
253 {
254 	struct fib_error *fe;
255 
256 	fe = malloc(sizeof(struct fib_error), M_TEMP, M_NOWAIT | M_ZERO);
257 	if (fe == NULL)
258 		return (false);
259 	fe->fe_flm = flm;
260 	fe->fe_family = flm->flm_family;
261 	fe->fe_fibnum = fibnum;
262 
263 	FIB_MOD_LOCK();
264 	/* Avoid duplicates by checking if error already exists first */
265 	if (flm_error_check(flm, fibnum)) {
266 		FIB_MOD_UNLOCK();
267 		free(fe, M_TEMP);
268 		return (true);
269 	}
270 	TAILQ_INSERT_HEAD(&V_fib_error_list, fe, entries);
271 	FIB_MOD_UNLOCK();
272 
273 	return (true);
274 }
275 
276 /*
277  * True if non-transient error has been registered for @flm in @fibnum.
278  */
279 static bool
280 flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum)
281 {
282 	const struct fib_error *fe;
283 
284 	TAILQ_FOREACH(fe, &V_fib_error_list, entries) {
285 		if ((fe->fe_flm == flm) && (fe->fe_fibnum == fibnum))
286 			return (true);
287 	}
288 
289 	return (false);
290 }
291 
292 /*
293  * Clear all errors of algo specified by @flm.
294  */
295 static void
296 fib_error_clear_flm(struct fib_lookup_module *flm)
297 {
298 	struct fib_error *fe, *fe_tmp;
299 
300 	FIB_MOD_LOCK_ASSERT();
301 
302 	TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
303 		if (fe->fe_flm == flm) {
304 			TAILQ_REMOVE(&V_fib_error_list, fe, entries);
305 			free(fe, M_TEMP);
306 		}
307 	}
308 }
309 
310 /*
311  * Clears all errors in current VNET.
312  */
313 static void
314 fib_error_clear(void)
315 {
316 	struct fib_error *fe, *fe_tmp;
317 
318 	FIB_MOD_LOCK_ASSERT();
319 
320 	TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
321 		TAILQ_REMOVE(&V_fib_error_list, fe, entries);
322 		free(fe, M_TEMP);
323 	}
324 }
325 
326 static const char *
327 print_op_result(enum flm_op_result result)
328 {
329 	switch (result) {
330 	case FLM_SUCCESS:
331 		return "success";
332 	case FLM_REBUILD:
333 		return "rebuild";
334 	case FLM_ERROR:
335 		return "error";
336 	}
337 
338 	return "unknown";
339 }
340 
341 static const char *
342 print_family(int family)
343 {
344 
345 	if (family == AF_INET)
346 		return ("inet");
347 	else if (family == AF_INET6)
348 		return ("inet6");
349 	else
350 		return ("unknown");
351 }
352 
353 /*
354  * Debug function used by lookup algorithms.
355  * Outputs message denoted by @fmt, prepended by "[fib_algo] inetX.Y (algo) "
356  */
357 void
358 fib_printf(int level, struct fib_data *fd, const char *func, char *fmt, ...)
359 {
360 	char buf[128];
361 	va_list ap;
362 
363 	if (level > flm_debug_level)
364 		return;
365 
366 	va_start(ap, fmt);
367 	vsnprintf(buf, sizeof(buf), fmt, ap);
368 	va_end(ap);
369 
370 	_ALGO_PRINTF(fd->fd_fibnum, fd->fd_family, fd->fd_flm->flm_name,
371 	    func, "%s", buf);
372 }
373 
374 /*
375  * Outputs list of algorithms supported by the provided address family.
376  */
377 static int
378 print_algos_sysctl(struct sysctl_req *req, int family)
379 {
380 	struct fib_lookup_module *flm;
381 	struct sbuf sbuf;
382 	int error, count = 0;
383 
384 	error = sysctl_wire_old_buffer(req, 0);
385 	if (error == 0) {
386 		sbuf_new_for_sysctl(&sbuf, NULL, 512, req);
387 		TAILQ_FOREACH(flm, &all_algo_list, entries) {
388 			if (flm->flm_family == family) {
389 				if (count++ > 0)
390 					sbuf_cat(&sbuf, ", ");
391 				sbuf_cat(&sbuf, flm->flm_name);
392 			}
393 		}
394 		error = sbuf_finish(&sbuf);
395 		sbuf_delete(&sbuf);
396 	}
397 	return (error);
398 }
399 
400 #ifdef INET6
401 static int
402 print_algos_sysctl_inet6(SYSCTL_HANDLER_ARGS)
403 {
404 
405 	return (print_algos_sysctl(req, AF_INET6));
406 }
407 SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo_list,
408     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
409     print_algos_sysctl_inet6, "A", "List of IPv6 lookup algorithms");
410 #endif
411 
412 #ifdef INET
413 static int
414 print_algos_sysctl_inet(SYSCTL_HANDLER_ARGS)
415 {
416 
417 	return (print_algos_sysctl(req, AF_INET));
418 }
419 SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo_list,
420     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
421     print_algos_sysctl_inet, "A", "List of IPv4 lookup algorithms");
422 #endif
423 
424 /*
425  * Calculate delay between repeated failures.
426  * Returns current delay in milliseconds.
427  */
428 static uint32_t
429 callout_calc_delay_ms(struct fib_data *fd)
430 {
431 	uint32_t shift;
432 
433 	if (fd->fd_failed_rebuilds > 10)
434 		shift = 10;
435 	else
436 		shift = fd->fd_failed_rebuilds;
437 
438 	return ((1 << shift) * FIB_CALLOUT_DELAY_MS);
439 }
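
/*
 * Backoff example: after 3 failed rebuilds the delay is
 * (1 << 3) * FIB_CALLOUT_DELAY_MS == 400ms; the shift is capped at 10,
 * bounding the delay at (1 << 10) * 50 == 51200ms (~51s).
 */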
440 
441 static void
442 schedule_callout(struct fib_data *fd, int delay_ms)
443 {
444 
445 	callout_reset_sbt(&fd->fd_callout, 0, SBT_1MS * delay_ms,
446 	    rebuild_fd_callout, fd, 0);
447 }
448 
449 static void
450 schedule_fd_rebuild(struct fib_data *fd)
451 {
452 
453 	FIB_MOD_LOCK();
454 	if (!fd->fd_need_rebuild) {
455 		fd->fd_need_rebuild = true;
456 
457 		/*
458 		 * Potentially re-schedules pending callout
459 		 *  initiated by schedule_algo_eval.
460 		 */
461 		FD_PRINTF(LOG_INFO, fd, "Scheduling rebuild");
462 		schedule_callout(fd, callout_calc_delay_ms(fd));
463 	}
464 	FIB_MOD_UNLOCK();
465 }
466 
467 static void
468 schedule_algo_eval(struct fib_data *fd)
469 {
470 
471 	if (fd->fd_num_changes++ == 0) {
472 		/* Start callout to consider switch */
473 		FIB_MOD_LOCK();
474 		if (!callout_pending(&fd->fd_callout))
475 			schedule_callout(fd, ALGO_EVAL_DELAY_MS);
476 		FIB_MOD_UNLOCK();
477 	} else if (fd->fd_num_changes > ALGO_EVAL_NUM_ROUTES && !fd->fd_force_eval) {
478 		/* Reset callout to exec immediately */
479 		FIB_MOD_LOCK();
480 		if (!fd->fd_need_rebuild) {
481 			fd->fd_force_eval = true;
482 			schedule_callout(fd, 1);
483 		}
484 		FIB_MOD_UNLOCK();
485 	}
486 }
487 
488 /*
489  * Rib subscription handler. Checks if the algorithm is ready to
490  *  receive updates, handles nexthop refcounting and passes change
491  *  data to the algorithm callback.
492  */
493 static void
494 handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
495     void *_data)
496 {
497 	struct fib_data *fd = (struct fib_data *)_data;
498 	enum flm_op_result result;
499 
500 	RIB_WLOCK_ASSERT(rnh);
501 
502 	/*
503 	 * There is a small gap between subscribing for route changes
504 	 *  and initiating rtable dump. Avoid receiving route changes
505 	 *  prior to finishing rtable dump by checking `init_done`.
506 	 */
507 	if (!fd->init_done)
508 		return;
509 	/*
510 	 * If algo requested rebuild, stop sending updates by default.
511 	 * This simplifies nexthop refcount handling logic.
512 	 */
513 	if (fd->fd_need_rebuild)
514 		return;
515 
516 	/* Consider scheduling algorithm re-evaluation */
517 	schedule_algo_eval(fd);
518 
519 	/*
520 	 * Maintain the guarantee that every nexthop returned by the dataplane
521 	 *  lookup has > 0 refcount, so it can be safely referenced within the
522 	 *  current epoch.
523 	 */
524 	if (rc->rc_nh_new != NULL) {
525 		if (fib_ref_nhop(fd, rc->rc_nh_new) == 0) {
526 			/* ran out of indexes */
527 			schedule_fd_rebuild(fd);
528 			return;
529 		}
530 	}
531 
532 	result = fd->fd_flm->flm_change_rib_item_cb(rnh, rc, fd->fd_algo_data);
533 
534 	switch (result) {
535 	case FLM_SUCCESS:
536 		/* Unref old nexthop on success */
537 		if (rc->rc_nh_old != NULL)
538 			fib_unref_nhop(fd, rc->rc_nh_old);
539 		break;
540 	case FLM_REBUILD:
541 
542 		/*
543 		 * Algo is not able to apply the update.
544 		 * Schedule algo rebuild.
545 		 */
546 		schedule_fd_rebuild(fd);
547 		break;
548 	case FLM_ERROR:
549 
550 		/*
551 		 * Algo reported a non-recoverable error.
552 		 * Record the error and schedule rebuild, which will
553 		 *  trigger best algo selection.
554 		 */
555 		FD_PRINTF(LOG_ERR, fd, "algo reported non-recoverable error");
556 		if (!flm_error_add(fd->fd_flm, fd->fd_fibnum))
557 			FD_PRINTF(LOG_ERR, fd, "failed to ban algo");
558 		schedule_fd_rebuild(fd);
559 	}
560 }
561 
562 static void
563 estimate_nhop_scale(const struct fib_data *old_fd, struct fib_data *fd)
564 {
565 
566 	if (old_fd == NULL) {
567 		// TODO: read from rtable
568 		fd->number_nhops = 16;
569 		return;
570 	}
571 
572 	if (old_fd->hit_nhops && old_fd->number_nhops < FIB_MAX_NHOPS)
573 		fd->number_nhops = 2 * old_fd->number_nhops;
574 	else
575 		fd->number_nhops = old_fd->number_nhops;
576 }
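
/*
 * Growth example: a table starting with the default 16 nhop slots that
 * keeps hitting the limit gets 32, 64, ... slots on successive rebuilds,
 * until the FIB_MAX_NHOPS (262144) cap is reached.
 */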
577 
578 struct walk_cbdata {
579 	struct fib_data		*fd;
580 	flm_dump_t		*func;
581 	enum flm_op_result	result;
582 };
583 
584 /*
585  * Handler called after all rtentries have been dumped.
586  * Performs post-dump framework checks and calls
587  * algo:flm_dump_end_cb().
588  *
589  * Updates walk_cbdata result.
590  */
591 static void
592 sync_algo_end_cb(struct rib_head *rnh, enum rib_walk_hook stage, void *_data)
593 {
594 	struct walk_cbdata *w = (struct walk_cbdata *)_data;
595 	struct fib_data *fd = w->fd;
596 
597 	RIB_WLOCK_ASSERT(w->fd->fd_rh);
598 
599 	if (rnh->rib_dying) {
600 		w->result = FLM_ERROR;
601 		return;
602 	}
603 
604 	if (fd->hit_nhops) {
605 		FD_PRINTF(LOG_INFO, fd, "ran out of nexthops at %u nhops",
606 		    fd->nh_ref_table->count);
607 		if (w->result == FLM_SUCCESS)
608 			w->result = FLM_REBUILD;
609 		return;
610 	}
611 
612 	if (stage != RIB_WALK_HOOK_POST || w->result != FLM_SUCCESS)
613 		return;
614 
615 	/* Post-dump hook, dump successful */
616 	w->result = fd->fd_flm->flm_dump_end_cb(fd->fd_algo_data, &fd->fd_dp);
617 
618 	if (w->result == FLM_SUCCESS) {
619 		/* Mark init as done to allow routing updates */
620 		fd->init_done = 1;
621 	}
622 }
623 
624 /*
625  * Callback for each entry in rib.
626  * Calls algo:flm_dump_rib_item_cb func as a part of initial
627  *  route table synchronisation.
628  */
629 static int
630 sync_algo_cb(struct rtentry *rt, void *_data)
631 {
632 	struct walk_cbdata *w = (struct walk_cbdata *)_data;
633 
634 	RIB_WLOCK_ASSERT(w->fd->fd_rh);
635 
636 	if (w->result == FLM_SUCCESS && w->func) {
637 
638 		/*
639 		 * Reference nexthops to maintain guarantee that
640 		 *  each nexthop returned by datapath has > 0 references
641 		 *  and can be safely referenced within current epoch.
642 		 */
643 		struct nhop_object *nh = rt_get_raw_nhop(rt);
644 		if (fib_ref_nhop(w->fd, nh) != 0)
645 			w->result = w->func(rt, w->fd->fd_algo_data);
646 		else
647 			w->result = FLM_REBUILD;
648 	}
649 
650 	return (0);
651 }
652 
653 /*
654  * Dump all routing table state to the algo instance.
655  */
656 static enum flm_op_result
657 sync_algo(struct fib_data *fd)
658 {
659 	struct walk_cbdata w = {
660 		.fd = fd,
661 		.func = fd->fd_flm->flm_dump_rib_item_cb,
662 		.result = FLM_SUCCESS,
663 	};
664 
665 	rib_walk_ext_internal(fd->fd_rh, true, sync_algo_cb, sync_algo_end_cb, &w);
666 
667 	FD_PRINTF(LOG_INFO, fd, "initial dump completed, result: %s",
668 	    print_op_result(w.result));
669 
670 	return (w.result);
671 }
672 
673 /*
674  * Schedules epoch-backed @fd instance deletion.
675  * * Unlinks @fd from the list of active algo instances.
676  * * Removes rib subscription.
677  * * Stops callout.
678  * * Schedules actual deletion.
679  *
680  * Assume @fd is already unlinked from the datapath.
681  */
682 static int
683 schedule_destroy_fd_instance(struct fib_data *fd, bool in_callout)
684 {
685 	bool is_dead;
686 
687 	NET_EPOCH_ASSERT();
688 
689 	FIB_MOD_LOCK();
690 	is_dead = fd->fd_dead;
691 	if (!is_dead)
692 		fd->fd_dead = true;
693 	if (fd->fd_linked) {
694 		TAILQ_REMOVE(&V_fib_data_list, fd, entries);
695 		fd->fd_linked = false;
696 	}
697 	FIB_MOD_UNLOCK();
698 	if (is_dead)
699 		return (0);
700 
701 	FD_PRINTF(LOG_INFO, fd, "DETACH");
702 
703 	if (fd->fd_rs != NULL)
704 		rib_unsibscribe(fd->fd_rs);
705 
706 	/*
707 	 * After rib_unsubscribe() no _new_ handle_rtable_change_cb() calls
708 	 * will be executed, hence no _new_ callout schedules will happen.
709 	 *
710 	 * There can be 2 possible scenarios here:
711 	 * 1) we're running inside a callout when we're deleting ourselves
712 	 *  due to migration to a newer fd
713 	 * 2) we're running from rt_table_destroy() and callout is scheduled
714 	 *  for execution OR is executing
715 	 *
716 	 * For (2) we need to wait for the callout termination, as the routing table
717 	 * will be destroyed after this function returns.
718 	 * For (1) we cannot call drain, but can ensure that this is the last invocation.
719 	 */
720 
721 	if (in_callout)
722 		callout_stop(&fd->fd_callout);
723 	else
724 		callout_drain(&fd->fd_callout);
725 
726 	epoch_call(net_epoch_preempt, destroy_fd_instance_epoch,
727 	    &fd->fd_epoch_ctx);
728 
729 	return (0);
730 }
731 
732 /*
733  * Wipe all fd instances from the list matching rib specified by @rh.
734  * If @keep_first is set, remove all but the first record.
735  */
736 static void
737 fib_cleanup_algo(struct rib_head *rh, bool keep_first, bool in_callout)
738 {
739 	struct fib_data_head tmp_head = TAILQ_HEAD_INITIALIZER(tmp_head);
740 	struct fib_data *fd, *fd_tmp;
741 	struct epoch_tracker et;
742 
743 	FIB_MOD_LOCK();
744 	TAILQ_FOREACH_SAFE(fd, &V_fib_data_list, entries, fd_tmp) {
745 		if (fd->fd_rh == rh) {
746 			if (keep_first) {
747 				keep_first = false;
748 				continue;
749 			}
750 			TAILQ_REMOVE(&V_fib_data_list, fd, entries);
751 			fd->fd_linked = false;
752 			TAILQ_INSERT_TAIL(&tmp_head, fd, entries);
753 		}
754 	}
755 	FIB_MOD_UNLOCK();
756 
757 	/* Pass 2: remove each entry */
758 	NET_EPOCH_ENTER(et);
759 	TAILQ_FOREACH_SAFE(fd, &tmp_head, entries, fd_tmp) {
760 		schedule_destroy_fd_instance(fd, in_callout);
761 	}
762 	NET_EPOCH_EXIT(et);
763 }
764 
765 void
766 fib_destroy_rib(struct rib_head *rh)
767 {
768 
769 	/*
770 	 * rnh has the `rib_dying` flag set, so setup of new fd's will fail at
771 	 *  the sync_algo() stage, preventing new entries from being added to the
772 	 *  list of active algos. Remove all existing entries for the particular rib.
773 	 */
774 	fib_cleanup_algo(rh, false, false);
775 }
776 
777 /*
778  * Finalises fd destruction by freeing all fd resources.
779  */
780 static void
781 destroy_fd_instance(struct fib_data *fd)
782 {
783 
784 	FD_PRINTF(LOG_INFO, fd, "destroy fd %p", fd);
785 
786 	/* Call destroy callback first */
787 	if (fd->fd_algo_data != NULL)
788 		fd->fd_flm->flm_destroy_cb(fd->fd_algo_data);
789 
790 	/* Nhop table */
791 	if ((fd->nh_idx != NULL) && (fd->nh_ref_table != NULL)) {
792 		for (int i = 0; i < fd->number_nhops; i++) {
793 			if (!is_idx_free(fd, i)) {
794 				FD_PRINTF(LOG_DEBUG, fd, " FREE nhop %d %p",
795 				    i, fd->nh_idx[i]);
796 				nhop_free_any(fd->nh_idx[i]);
797 			}
798 		}
799 		free(fd->nh_idx, M_RTABLE);
800 	}
801 	if (fd->nh_ref_table != NULL)
802 		free(fd->nh_ref_table, M_RTABLE);
803 
804 	fib_unref_algo(fd->fd_flm);
805 
806 	free(fd, M_RTABLE);
807 }
808 
809 /*
810  * Epoch callback indicating fd is safe to destroy
811  */
812 static void
813 destroy_fd_instance_epoch(epoch_context_t ctx)
814 {
815 	struct fib_data *fd;
816 
817 	fd = __containerof(ctx, struct fib_data, fd_epoch_ctx);
818 
819 	destroy_fd_instance(fd);
820 }
821 
822 /*
823  * Tries to set up an fd instance.
824  * - Allocates fd/nhop table
825  * - Runs algo:flm_init_cb algo init
826  * - Subscribes fd to the rib
827  * - Runs rtable dump
828  * - Adds instance to the list of active instances.
829  *
830  * Returns: operation result. Fills in @pfd with resulting fd on success.
831  *
832  */
833 static enum flm_op_result
834 try_setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
835     struct fib_data *old_fd, struct fib_data **pfd)
836 {
837 	struct fib_data *fd;
838 	size_t size;
839 	enum flm_op_result result;
840 
841 	/* Allocate */
842 	fd = malloc(sizeof(struct fib_data), M_RTABLE, M_NOWAIT | M_ZERO);
843 	if (fd == NULL)  {
844 		*pfd = NULL;
845 		return (FLM_REBUILD);
846 	}
847 	*pfd = fd;
848 
849 	estimate_nhop_scale(old_fd, fd);
850 
851 	fd->fd_rh = rh;
852 	fd->fd_family = rh->rib_family;
853 	fd->fd_fibnum = rh->rib_fibnum;
854 	callout_init(&fd->fd_callout, 1);
855 	fd->fd_vnet = curvnet;
856 	fd->fd_flm = flm;
857 
858 	FIB_MOD_LOCK();
859 	flm->flm_refcount++;
860 	FIB_MOD_UNLOCK();
861 
862 	/* Allocate nhidx -> nhop_ptr table */
863 	size = fd->number_nhops * sizeof(void *);
864 	fd->nh_idx = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
865 	if (fd->nh_idx == NULL) {
866 		FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop table idx (sz:%zu)", size);
867 		return (FLM_REBUILD);
868 	}
869 
870 	/* Allocate nhop index refcount table */
871 	size = sizeof(struct nhop_ref_table);
872 	size += fd->number_nhops * sizeof(uint32_t);
873 	fd->nh_ref_table = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
874 	if (fd->nh_ref_table == NULL) {
875 		FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop refcount table (sz:%zu)", size);
876 		return (FLM_REBUILD);
877 	}
878 	FD_PRINTF(LOG_DEBUG, fd, "Allocated %u nhop indexes", fd->number_nhops);
879 
880 	/* Okay, we're ready for algo init */
881 	void *old_algo_data = (old_fd != NULL) ? old_fd->fd_algo_data : NULL;
882 	result = flm->flm_init_cb(fd->fd_fibnum, fd, old_algo_data, &fd->fd_algo_data);
883 	if (result != FLM_SUCCESS)
884 		return (result);
885 
886 	/* Try to subscribe */
887 	if (flm->flm_change_rib_item_cb != NULL) {
888 		fd->fd_rs = rib_subscribe_internal(fd->fd_rh,
889 		    handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE, 0);
890 		if (fd->fd_rs == NULL)
891 			return (FLM_REBUILD);
892 	}
893 
894 	/* Dump */
895 	result = sync_algo(fd);
896 	if (result != FLM_SUCCESS)
897 		return (result);
898 	FD_PRINTF(LOG_INFO, fd, "DUMP completed successfully.");
899 
900 	FIB_MOD_LOCK();
901 	/*
902 	 * Insert fd at the beginning of the list, to maintain the invariant
903 	 *  that the first matching entry for the AF/fib is always the active
904 	 *  one.
905 	 */
906 	TAILQ_INSERT_HEAD(&V_fib_data_list, fd, entries);
907 	fd->fd_linked = true;
908 	FIB_MOD_UNLOCK();
909 
910 	return (FLM_SUCCESS);
911 }
912 
913 /*
914  * Sets up algo @flm for table @rh and links it to the datapath.
915  *
916  */
917 static enum flm_op_result
918 setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
919     struct fib_data *orig_fd, struct fib_data **pfd, bool attach)
920 {
921 	struct fib_data *prev_fd, *new_fd;
922 	struct epoch_tracker et;
923 	enum flm_op_result result;
924 
925 	prev_fd = orig_fd;
926 	new_fd = NULL;
927 	for (int i = 0; i < FIB_MAX_TRIES; i++) {
928 		NET_EPOCH_ENTER(et);
929 		result = try_setup_fd_instance(flm, rh, prev_fd, &new_fd);
930 
931 		if ((result == FLM_SUCCESS) && attach)
932 			result = attach_datapath(new_fd);
933 
934 		if ((prev_fd != NULL) && (prev_fd != orig_fd)) {
935 			schedule_destroy_fd_instance(prev_fd, false);
936 			prev_fd = NULL;
937 		}
938 		NET_EPOCH_EXIT(et);
939 
940 		RH_PRINTF(LOG_INFO, rh, "try %d: fib algo result: %s", i,
941 		    print_op_result(result));
942 
943 		if (result == FLM_REBUILD) {
944 			prev_fd = new_fd;
945 			new_fd = NULL;
946 			continue;
947 		}
948 
949 		break;
950 	}
951 
952 	if (result != FLM_SUCCESS) {
953 		/* update failure count */
954 		FIB_MOD_LOCK();
955 		if (orig_fd != NULL)
956 			orig_fd->fd_failed_rebuilds++;
957 		FIB_MOD_UNLOCK();
958 
959 		/* Ban algo on non-recoverable error */
960 		if (result == FLM_ERROR)
961 			flm_error_add(flm, rh->rib_fibnum);
962 
963 		NET_EPOCH_ENTER(et);
964 		if ((prev_fd != NULL) && (prev_fd != orig_fd))
965 			schedule_destroy_fd_instance(prev_fd, false);
966 		if (new_fd != NULL) {
967 			schedule_destroy_fd_instance(new_fd, false);
968 			new_fd = NULL;
969 		}
970 		NET_EPOCH_EXIT(et);
971 	}
972 
973 	*pfd = new_fd;
974 	return (result);
975 }
976 
977 /*
978  * Callout for all scheduled fd-related work.
979  * - Checks if the current algo is still the best algo
980  * - Creates a new instance of an algo for af/fib if desired.
981  */
982 static void
983 rebuild_fd_callout(void *_data)
984 {
985 	struct fib_data *fd, *fd_new, *fd_tmp;
986 	struct fib_lookup_module *flm_new;
987 	struct epoch_tracker et;
988 	enum flm_op_result result;
989 	bool need_rebuild = false;
990 
991 	fd = (struct fib_data *)_data;
992 
993 	FIB_MOD_LOCK();
994 	need_rebuild = fd->fd_need_rebuild;
995 	fd->fd_need_rebuild = false;
996 	fd->fd_force_eval = false;
997 	fd->fd_num_changes = 0;
998 	FIB_MOD_UNLOCK();
999 
1000 	CURVNET_SET(fd->fd_vnet);
1001 
1002 	/* First, check if we're still OK to use this algo */
1003 	flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm);
1004 	if ((flm_new == NULL) && (!need_rebuild)) {
1005 		/* Keep existing algo, no need to rebuild. */
1006 		CURVNET_RESTORE();
1007 		return;
1008 	}
1009 
1010 	if (flm_new == NULL) {
1011 		flm_new = fd->fd_flm;
1012 		fd_tmp = fd;
1013 	} else {
1014 		fd_tmp = NULL;
1015 		FD_PRINTF(LOG_NOTICE, fd, "switching algo to %s", flm_new->flm_name);
1016 	}
1017 	result = setup_fd_instance(flm_new, fd->fd_rh, fd_tmp, &fd_new, true);
1018 	if (fd_tmp == NULL) {
1019 		/* fd_new represents new algo */
1020 		fib_unref_algo(flm_new);
1021 	}
1022 	if (result != FLM_SUCCESS) {
1023 		FD_PRINTF(LOG_NOTICE, fd, "table rebuild failed");
1024 		CURVNET_RESTORE();
1025 		return;
1026 	}
1027 	FD_PRINTF(LOG_INFO, fd_new, "switched to new instance");
1028 
1029 	/* Schedule removal of the old instance */
1030 	if (fd != NULL) {
1031 		NET_EPOCH_ENTER(et);
1032 		schedule_destroy_fd_instance(fd, true);
1033 		NET_EPOCH_EXIT(et);
1034 	}
1035 
1036 	CURVNET_RESTORE();
1037 }
1038 
1039 /*
1040  * Finds algo by name/family.
1041  * Returns referenced algo or NULL.
1042  */
1043 static struct fib_lookup_module *
1044 fib_find_algo(const char *algo_name, int family)
1045 {
1046 	struct fib_lookup_module *flm;
1047 
1048 	FIB_MOD_LOCK();
1049 	TAILQ_FOREACH(flm, &all_algo_list, entries) {
1050 		if ((strcmp(flm->flm_name, algo_name) == 0) &&
1051 		    (family == flm->flm_family)) {
1052 			flm->flm_refcount++;
1053 			FIB_MOD_UNLOCK();
1054 			return (flm);
1055 		}
1056 	}
1057 	FIB_MOD_UNLOCK();
1058 
1059 	return (NULL);
1060 }
1061 
1062 static void
1063 fib_unref_algo(struct fib_lookup_module *flm)
1064 {
1065 
1066 	FIB_MOD_LOCK();
1067 	flm->flm_refcount--;
1068 	FIB_MOD_UNLOCK();
1069 }
1070 
1071 static int
1072 set_fib_algo(uint32_t fibnum, int family, struct sysctl_oid *oidp, struct sysctl_req *req)
1073 {
1074 	struct fib_lookup_module *flm = NULL;
1075 	struct fib_data *fd = NULL;
1076 	char old_algo_name[32], algo_name[32];
1077 	struct rib_head *rh = NULL;
1078 	enum flm_op_result result;
1079 	int error;
1080 
1081 	/* Fetch the current algo/rib for the given family and fibnum */
1082 	FIB_MOD_LOCK();
1083 	TAILQ_FOREACH(fd, &V_fib_data_list, entries) {
1084 		if ((fd->fd_family == family) && (fd->fd_fibnum == fibnum))
1085 			break;
1086 	}
1087 	if (fd == NULL) {
1088 		FIB_MOD_UNLOCK();
1089 		return (ENOENT);
1090 	}
1091 	rh = fd->fd_rh;
1092 	strlcpy(old_algo_name, fd->fd_flm->flm_name,
1093 	    sizeof(old_algo_name));
1094 	FIB_MOD_UNLOCK();
1095 
1096 	strlcpy(algo_name, old_algo_name, sizeof(algo_name));
1097 	error = sysctl_handle_string(oidp, algo_name, sizeof(algo_name), req);
1098 	if (error != 0 || req->newptr == NULL)
1099 		return (error);
1100 
1101 	if (strcmp(algo_name, old_algo_name) == 0)
1102 		return (0);
1103 
1104 	/* New algorithm name is different */
1105 	flm = fib_find_algo(algo_name, family);
1106 	if (flm == NULL) {
1107 		RH_PRINTF(LOG_INFO, rh, "unable to find algo %s", algo_name);
1108 		return (ESRCH);
1109 	}
1110 
1111 	fd = NULL;
1112 	result = setup_fd_instance(flm, rh, NULL, &fd, true);
1113 	fib_unref_algo(flm);
1114 	if (result != FLM_SUCCESS)
1115 		return (EINVAL);
1116 
1117 	/* Disable automated jumping between algos */
1118 	FIB_MOD_LOCK();
1119 	set_algo_fixed(rh);
1120 	FIB_MOD_UNLOCK();
1121 	/* Remove old instance(s) */
1122 	fib_cleanup_algo(rh, true, false);
1123 
1124 	/* Drain cb so user can unload the module after userret if so desired */
1125 	epoch_drain_callbacks(net_epoch_preempt);
1126 
1127 	return (0);
1128 }
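
/*
 * Usage example (algorithm names come from the per-family modules that
 * are compiled in; the algo_list sysctls report the available ones):
 *
 *	# sysctl net.route.algo.inet.algo_list
 *	# sysctl net.route.algo.inet.algo=radix4_lockless
 *
 * Setting the algo explicitly disables automatic algo selection for the
 * family (see set_algo_fixed()).
 */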
1129 
1130 #ifdef INET
1131 static int
1132 set_algo_inet_sysctl_handler(SYSCTL_HANDLER_ARGS)
1133 {
1134 
1135 	return (set_fib_algo(RT_DEFAULT_FIB, AF_INET, oidp, req));
1136 }
1137 SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo,
1138     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
1139     set_algo_inet_sysctl_handler, "A", "Set IPv4 lookup algo");
1140 #endif
1141 
1142 #ifdef INET6
1143 static int
1144 set_algo_inet6_sysctl_handler(SYSCTL_HANDLER_ARGS)
1145 {
1146 
1147 	return (set_fib_algo(RT_DEFAULT_FIB, AF_INET6, oidp, req));
1148 }
1149 SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo,
1150     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
1151     set_algo_inet6_sysctl_handler, "A", "Set IPv6 lookup algo");
1152 #endif
1153 
1154 static void
1155 destroy_fdh_epoch(epoch_context_t ctx)
1156 {
1157 	struct fib_dp_header *fdh;
1158 
1159 	fdh = __containerof(ctx, struct fib_dp_header, fdh_epoch_ctx);
1160 	free(fdh, M_RTABLE);
1161 }
1162 
1163 static struct fib_dp_header *
1164 alloc_fib_dp_array(uint32_t num_tables, bool waitok)
1165 {
1166 	size_t sz;
1167 	struct fib_dp_header *fdh;
1168 
1169 	sz = sizeof(struct fib_dp_header);
1170 	sz += sizeof(struct fib_dp) * num_tables;
1171 	fdh = malloc(sz, M_RTABLE, (waitok ? M_WAITOK : M_NOWAIT) | M_ZERO);
1172 	if (fdh != NULL)
1173 		fdh->fdh_num_tables = num_tables;
1174 	return (fdh);
1175 }
1176 
1177 static struct fib_dp_header *
1178 get_fib_dp_header(struct fib_dp *dp)
1179 {
1180 
1181 	return (__containerof((void *)dp, struct fib_dp_header, fdh_idx));
1182 }
1183 
1184 /*
1185  * Replace per-family index pool @pdp with a new one which
1186  * contains updated callback/algo data from @fd.
1187  * Returns FLM_SUCCESS on success.
1188  */
1189 static enum flm_op_result
1190 replace_rtables_family(struct fib_dp **pdp, struct fib_data *fd)
1191 {
1192 	struct fib_dp_header *new_fdh, *old_fdh;
1193 
1194 	NET_EPOCH_ASSERT();
1195 
1196 	FD_PRINTF(LOG_DEBUG, fd, "[vnet %p] replace with f:%p arg:%p",
1197 	    curvnet, fd->fd_dp.f, fd->fd_dp.arg);
1198 
1199 	FIB_MOD_LOCK();
1200 	old_fdh = get_fib_dp_header(*pdp);
1201 	new_fdh = alloc_fib_dp_array(old_fdh->fdh_num_tables, false);
1202 	FD_PRINTF(LOG_DEBUG, fd, "OLD FDH: %p NEW FDH: %p", old_fdh, new_fdh);
1203 	if (new_fdh == NULL) {
1204 		FIB_MOD_UNLOCK();
1205 		FD_PRINTF(LOG_WARNING, fd, "error attaching datapath");
1206 		return (FLM_REBUILD);
1207 	}
1208 
1209 	memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0],
1210 	    old_fdh->fdh_num_tables * sizeof(struct fib_dp));
1211 	/* Update relevant data structure for @fd */
1212 	new_fdh->fdh_idx[fd->fd_fibnum] = fd->fd_dp;
1213 
1214 	/* Ensure memcpy() writes have completed */
1215 	atomic_thread_fence_rel();
1216 	/* Set new datapath pointer */
1217 	*pdp = &new_fdh->fdh_idx[0];
1218 	FIB_MOD_UNLOCK();
1219 	FD_PRINTF(LOG_DEBUG, fd, "update %p -> %p", old_fdh, new_fdh);
1220 
1221 	epoch_call(net_epoch_preempt, destroy_fdh_epoch,
1222 	    &old_fdh->fdh_epoch_ctx);
1223 
1224 	return (FLM_SUCCESS);
1225 }
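
/*
 * Ordering sketch for the swap above (reader-side names hypothetical;
 * the actual datapath code lives in the per-family lookup paths):
 *
 *	NET_EPOCH_ENTER(et);
 *	dp = &(*pdp)[fibnum];		// old or new array, both valid
 *	nh = dp->f(dp->arg, key, scopeid);
 *	NET_EPOCH_EXIT(et);
 *
 * The release fence makes the memcpy() visible before the new pointer,
 * and epoch_call() defers freeing the old array until all in-flight
 * readers have left their epoch section.
 */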
1226 
1227 static struct fib_dp **
1228 get_family_dp_ptr(int family)
1229 {
1230 	switch (family) {
1231 	case AF_INET:
1232 		return (&V_inet_dp);
1233 	case AF_INET6:
1234 		return (&V_inet6_dp);
1235 	}
1236 	return (NULL);
1237 }
1238 
1239 /*
1240  * Make datapath use fib instance @fd
1241  */
1242 static enum flm_op_result
1243 attach_datapath(struct fib_data *fd)
1244 {
1245 	struct fib_dp **pdp;
1246 
1247 	pdp = get_family_dp_ptr(fd->fd_family);
1248 	return (replace_rtables_family(pdp, fd));
1249 }
1250 
1251 /*
1252  * Grows the datapath pointer array.
1253  * Called from the sysctl handler when the number of routing tables grows.
1254  */
1255 static void
1256 grow_rtables_family(struct fib_dp **pdp, uint32_t new_num_tables)
1257 {
1258 	struct fib_dp_header *new_fdh, *old_fdh = NULL;
1259 
1260 	new_fdh = alloc_fib_dp_array(new_num_tables, true);
1261 
1262 	FIB_MOD_LOCK();
1263 	if (*pdp != NULL) {
1264 		old_fdh = get_fib_dp_header(*pdp);
1265 		memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0],
1266 		    old_fdh->fdh_num_tables * sizeof(struct fib_dp));
1267 	}
1268 
1269 	/* Wait till all writes completed */
1270 	atomic_thread_fence_rel();
1271 
1272 	*pdp = &new_fdh->fdh_idx[0];
1273 	FIB_MOD_UNLOCK();
1274 
1275 	if (old_fdh != NULL)
1276 		epoch_call(net_epoch_preempt, destroy_fdh_epoch,
1277 		    &old_fdh->fdh_epoch_ctx);
1278 }
1279 
1280 /*
1281  * Grows per-AF arrays of datapath pointers for each supported family.
1282  * Called from fibs resize sysctl handler.
1283  */
1284 void
1285 fib_grow_rtables(uint32_t new_num_tables)
1286 {
1287 
1288 #ifdef INET
1289 	grow_rtables_family(get_family_dp_ptr(AF_INET), new_num_tables);
1290 #endif
1291 #ifdef INET6
1292 	grow_rtables_family(get_family_dp_ptr(AF_INET6), new_num_tables);
1293 #endif
1294 }
1295 
1296 void
1297 fib_get_rtable_info(struct rib_head *rh, struct rib_rtable_info *rinfo)
1298 {
1299 
1300 	bzero(rinfo, sizeof(struct rib_rtable_info));
1301 	rinfo->num_prefixes = rh->rnh_prefixes;
1302 	rinfo->num_nhops = nhops_get_count(rh);
1303 #ifdef ROUTE_MPATH
1304 	rinfo->num_nhgrp = nhgrp_get_count(rh);
1305 #endif
1306 }
1307 
1308 /*
1309  * Accessor to get rib instance @fd is attached to.
1310  */
1311 struct rib_head *
1312 fib_get_rh(struct fib_data *fd)
1313 {
1314 
1315 	return (fd->fd_rh);
1316 }
1317 
1318 /*
1319  * Accessor to export idx->nhop array
1320  */
1321 struct nhop_object **
1322 fib_get_nhop_array(struct fib_data *fd)
1323 {
1324 
1325 	return (fd->nh_idx);
1326 }
1327 
1328 static uint32_t
1329 get_nhop_idx(struct nhop_object *nh)
1330 {
1331 #ifdef ROUTE_MPATH
1332 	if (NH_IS_NHGRP(nh))
1333 		return (nhgrp_get_idx((struct nhgrp_object *)nh) * 2 - 1);
1334 	else
1335 		return (nhop_get_idx(nh) * 2);
1336 #else
1337 	return (nhop_get_idx(nh));
1338 #endif
1339 }
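
/*
 * With ROUTE_MPATH the single index namespace is shared by interleaving:
 * nexthops land in even slots (nhop idx 1 -> 2, 2 -> 4, ...) and nexthop
 * groups in odd slots (nhgrp idx 1 -> 1, 2 -> 3, ...), so the two kinds
 * never collide in nh_idx[].
 */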
1340 
1341 uint32_t
1342 fib_get_nhop_idx(struct fib_data *fd, struct nhop_object *nh)
1343 {
1344 
1345 	return (get_nhop_idx(nh));
1346 }
1347 
1348 static bool
1349 is_idx_free(struct fib_data *fd, uint32_t index)
1350 {
1351 
1352 	return (fd->nh_ref_table->refcnt[index] == 0);
1353 }
1354 
1355 static uint32_t
1356 fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh)
1357 {
1358 	uint32_t idx = get_nhop_idx(nh);
1359 
1360 	if (idx >= fd->number_nhops) {
1361 		fd->hit_nhops = 1;
1362 		return (0);
1363 	}
1364 
1365 	if (is_idx_free(fd, idx)) {
1366 		nhop_ref_any(nh);
1367 		fd->nh_idx[idx] = nh;
1368 		fd->nh_ref_table->count++;
1369 		FD_PRINTF(LOG_DEBUG, fd, " REF nhop %u %p", idx, fd->nh_idx[idx]);
1370 	}
1371 	fd->nh_ref_table->refcnt[idx]++;
1372 
1373 	return (idx);
1374 }
1375 
1376 struct nhop_release_data {
1377 	struct nhop_object	*nh;
1378 	struct epoch_context	ctx;
1379 };
1380 
1381 static void
1382 release_nhop_epoch(epoch_context_t ctx)
1383 {
1384 	struct nhop_release_data *nrd;
1385 
1386 	nrd = __containerof(ctx, struct nhop_release_data, ctx);
1387 	nhop_free_any(nrd->nh);
1388 	free(nrd, M_TEMP);
1389 }
1390 
1391 /*
1392  * Delays nexthop refcount release.
1393  * The datapath may not have its data structures updated yet, so the old
1394  *  nexthop may still be returned till the end of the current epoch. Delay
1395  *  the refcount release, as we may be dropping the last reference, which
1396  *  would trigger nexthop deletion, rendering the returned nexthop invalid.
1397  */
1398 static void
1399 fib_schedule_release_nhop(struct fib_data *fd, struct nhop_object *nh)
1400 {
1401 	struct nhop_release_data *nrd;
1402 
1403 	nrd = malloc(sizeof(struct nhop_release_data), M_TEMP, M_NOWAIT | M_ZERO);
1404 	if (nrd != NULL) {
1405 		nrd->nh = nh;
1406 		epoch_call(net_epoch_preempt, release_nhop_epoch, &nrd->ctx);
1407 	} else {
1408 		/*
1409 		 * Unable to allocate memory. Leak nexthop to maintain guarantee
1410 		 *  that each nhop can be referenced.
1411 		 */
1412 		FD_PRINTF(LOG_ERR, fd, "unable to schedule nhop %p deletion", nh);
1413 	}
1414 }
1415 
1416 static void
1417 fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh)
1418 {
1419 	uint32_t idx = get_nhop_idx(nh);
1420 
1421 	KASSERT((idx < fd->number_nhops), ("invalid nhop index"));
1422 	KASSERT((nh == fd->nh_idx[idx]), ("index table contains wrong nh"));
1423 
1424 	fd->nh_ref_table->refcnt[idx]--;
1425 	if (fd->nh_ref_table->refcnt[idx] == 0) {
1426 		FD_PRINTF(LOG_DEBUG, fd, " FREE nhop %u %p", idx, fd->nh_idx[idx]);
1427 		fib_schedule_release_nhop(fd, fd->nh_idx[idx]);
1428 	}
1429 }
1430 
1431 static void
1432 set_algo_fixed(struct rib_head *rh)
1433 {
1434 	switch (rh->rib_family) {
1435 #ifdef INET
1436 	case AF_INET:
1437 		algo_fixed_inet = true;
1438 		break;
1439 #endif
1440 #ifdef INET6
1441 	case AF_INET6:
1442 		algo_fixed_inet6 = true;
1443 		break;
1444 #endif
1445 	}
1446 }
1447 
1448 static bool
1449 is_algo_fixed(struct rib_head *rh)
1450 {
1451 
1452 	switch (rh->rib_family) {
1453 #ifdef INET
1454 	case AF_INET:
1455 		return (algo_fixed_inet);
1456 #endif
1457 #ifdef INET6
1458 	case AF_INET6:
1459 		return (algo_fixed_inet6);
1460 #endif
1461 	}
1462 	return (false);
1463 }
1464 
1465 /*
1466  * Runs the check on what would be the best algo for rib @rh, assuming
1467  *  that the current algo is the one specified by @orig_flm. Note that
1468  *  it can be NULL for initial selection.
1469  *
1470  * Returns referenced new algo or NULL if the current one is the best.
1471  */
1472 static struct fib_lookup_module *
1473 fib_check_best_algo(struct rib_head *rh, struct fib_lookup_module *orig_flm)
1474 {
1475 	uint8_t preference, curr_preference = 0, best_preference = 0;
1476 	struct fib_lookup_module *flm, *best_flm = NULL;
1477 	struct rib_rtable_info rinfo;
1478 	int candidate_algos = 0;
1479 
1480 	fib_get_rtable_info(rh, &rinfo);
1481 
1482 	FIB_MOD_LOCK();
1483 	if (is_algo_fixed(rh)) {
1484 		FIB_MOD_UNLOCK();
1485 		return (NULL);
1486 	}
1487 
1488 	TAILQ_FOREACH(flm, &all_algo_list, entries) {
1489 		if (flm->flm_family != rh->rib_family)
1490 			continue;
1491 		candidate_algos++;
1492 		preference = flm->flm_get_pref(&rinfo);
1493 		if (preference > best_preference) {
1494 			if (!flm_error_check(flm, rh->rib_fibnum)) {
1495 				best_preference = preference;
1496 				best_flm = flm;
1497 			}
1498 		}
1499 		if (flm == orig_flm)
1500 			curr_preference = preference;
1501 	}
1502 	if ((best_flm != NULL) && (curr_preference + BEST_DIFF_PERCENT < best_preference))
1503 		best_flm->flm_refcount++;
1504 	else
1505 		best_flm = NULL;
1506 	FIB_MOD_UNLOCK();
1507 
1508 	RH_PRINTF(LOG_DEBUG, rh, "candidate_algos: %d, curr: %s(%d) result: %s(%d)",
1509 	    candidate_algos, orig_flm ? orig_flm->flm_name : "NULL", curr_preference,
1510 	    best_flm ? best_flm->flm_name : (orig_flm ? orig_flm->flm_name : "NULL"),
1511 	    best_preference);
1512 
1513 	return (best_flm);
1514 }
1515 
1516 /*
1517  * Called when a new route table is created.
1518  * Selects, allocates and attaches a fib algo for the table.
1519  */
1520 int
1521 fib_select_algo_initial(struct rib_head *rh)
1522 {
1523 	struct fib_lookup_module *flm;
1524 	struct fib_data *fd = NULL;
1525 	enum flm_op_result result;
1526 	int error = 0;
1527 
1528 	flm = fib_check_best_algo(rh, NULL);
1529 	if (flm == NULL) {
1530 		RH_PRINTF(LOG_CRIT, rh, "no algo selected");
1531 		return (ENOENT);
1532 	}
1533 	RH_PRINTF(LOG_INFO, rh, "selected algo %s", flm->flm_name);
1534 
1535 	result = setup_fd_instance(flm, rh, NULL, &fd, false);
1536 	RH_PRINTF(LOG_DEBUG, rh, "result=%d fd=%p", result, fd);
1537 	if (result == FLM_SUCCESS) {
1538 
1539 		/*
1540 		 * Attach datapath directly to avoid multiple reallocations
1541 		 * during fib growth
1542 		 */
1543 		struct fib_dp_header *fdp;
1544 		struct fib_dp **pdp;
1545 
1546 		pdp = get_family_dp_ptr(rh->rib_family);
1547 		if (pdp != NULL) {
1548 			fdp = get_fib_dp_header(*pdp);
1549 			fdp->fdh_idx[fd->fd_fibnum] = fd->fd_dp;
1550 			FD_PRINTF(LOG_INFO, fd, "datapath attached");
1551 		}
1552 	} else {
1553 		error = EINVAL;
1554 		RH_PRINTF(LOG_CRIT, rh, "unable to setup algo %s", flm->flm_name);
1555 	}
1556 
1557 	fib_unref_algo(flm);
1558 
1559 	return (error);
1560 }
1561 
1562 /*
1563  * Registers fib lookup module within the subsystem.
1564  */
1565 int
1566 fib_module_register(struct fib_lookup_module *flm)
1567 {
1568 
1569 	FIB_MOD_LOCK();
1570 	ALGO_PRINTF("attaching %s to %s", flm->flm_name,
1571 	    print_family(flm->flm_family));
1572 	TAILQ_INSERT_TAIL(&all_algo_list, flm, entries);
1573 	FIB_MOD_UNLOCK();
1574 
1575 	return (0);
1576 }
1577 
1578 /*
1579  * Tries to unregister fib lookup module.
1580  *
1581  * Returns 0 on success, EBUSY if module is still used
1582  *  by some of the tables.
1583  */
1584 int
1585 fib_module_unregister(struct fib_lookup_module *flm)
1586 {
1587 
1588 	FIB_MOD_LOCK();
1589 	if (flm->flm_refcount > 0) {
1590 		FIB_MOD_UNLOCK();
1591 		return (EBUSY);
1592 	}
1593 	fib_error_clear_flm(flm);
1594 	ALGO_PRINTF("detaching %s from %s", flm->flm_name,
1595 	    print_family(flm->flm_family));
1596 	TAILQ_REMOVE(&all_algo_list, flm, entries);
1597 	FIB_MOD_UNLOCK();
1598 
1599 	return (0);
1600 }
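
/*
 * A minimal registration sketch (hypothetical module; the callback
 * fields shown are the ones this file invokes, declared in fib_algo.h):
 *
 *	static struct fib_lookup_module flm_example = {
 *		.flm_name = "example4",
 *		.flm_family = AF_INET,
 *		.flm_init_cb = example_init,
 *		.flm_destroy_cb = example_destroy,
 *		.flm_dump_rib_item_cb = example_dump_rib_item,
 *		.flm_dump_end_cb = example_dump_end,
 *		.flm_change_rib_item_cb = example_change_rib_item,
 *		.flm_get_pref = example_get_pref,
 *	};
 *
 * The module calls fib_module_register(&flm_example) on load and must
 * handle EBUSY from fib_module_unregister() on unload.
 */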
1601 
1602 void
1603 vnet_fib_init(void)
1604 {
1605 
1606 	TAILQ_INIT(&V_fib_data_list);
1607 }
1608 
1609 void
1610 vnet_fib_destroy(void)
1611 {
1612 
1613 	FIB_MOD_LOCK();
1614 	fib_error_clear();
1615 	FIB_MOD_UNLOCK();
1616 }
1617