xref: /freebsd/sys/net/route/fib_algo.c (revision 3110d4ebd6c0848cf5e25890d01791bb407e2a9b)
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2020 Alexander V. Chernikov
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_route.h"

#include <sys/param.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/sbuf.h>
#include <sys/lock.h>
#include <sys/rmlock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/queue.h>
#include <net/vnet.h>

#include <net/if.h>
#include <net/if_var.h>

#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#endif

#include <net/route.h>
#include <net/route/nhop.h>
#include <net/route/route_ctl.h>
#include <net/route/route_var.h>
#include <net/route/fib_algo.h>

#include <machine/stdarg.h>

/*
 * Fib lookup framework.
 *
 * This framework enables accelerated longest-prefix-match lookups for the
 *  routing tables by adding the ability to dynamically attach/detach lookup
 *  algorithm implementations to/from the datapath.
 *
 * flm - fib lookup module - implementation of a particular lookup algorithm
 * fd - fib data - instance of an flm bound to a specific routing table
 *
 * This file provides the main framework functionality.
 *
 * The following are the features provided by the framework:
 *
 * 1) nexthop abstraction -> provides transparent referencing, indexing
 *   and efficient idx->ptr mappings for nexthops and nexthop groups.
 * 2) Routing table synchronisation
 * 3) dataplane attachment points
 * 4) automatic algorithm selection based on the provided preference.
 *
 *
 * DATAPATH
 * For each supported address family, there is an allocated array of fib_dp
 *  structures, indexed by fib number. Each array entry contains a callback
 *  function and its argument. This function will be called with a
 *  family-specific lookup key, scope and the provided argument. This array
 *  gets re-created every time a new algo instance gets created. Please take
 *  a look at the replace_rtables_family() function for more details.
 *
 */
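
/*
 * Illustrative sketch of the dispatch described above; not part of the
 * framework itself. `key' and `scope' stand in for the family-specific
 * lookup arguments (the real entry points live in the per-family code,
 * e.g. fib4_lookup() for inet):
 *
 *	struct fib_dp *dp = &V_inet_dp[fibnum];
 *	struct nhop_object *nh;
 *
 *	nh = dp->f(dp->arg, key, scope);
 */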

SYSCTL_DECL(_net_route);
SYSCTL_NODE(_net_route, OID_AUTO, algo, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Fib algorithm lookups");

#ifdef INET6
VNET_DEFINE_STATIC(bool, algo_fixed_inet6) = false;
#define	V_algo_fixed_inet6	VNET(algo_fixed_inet6)
SYSCTL_NODE(_net_route_algo, OID_AUTO, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IPv6 longest prefix match lookups");
#endif
#ifdef INET
VNET_DEFINE_STATIC(bool, algo_fixed_inet) = false;
#define	V_algo_fixed_inet	VNET(algo_fixed_inet)
SYSCTL_NODE(_net_route_algo, OID_AUTO, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IPv4 longest prefix match lookups");
#endif

struct nhop_ref_table {
	uint32_t		count;
	int32_t			refcnt[0];
};
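
/*
 * Sketch of the flexible-array allocation used for the table above,
 * mirroring what try_setup_fd_instance() does below:
 *
 *	size_t sz = sizeof(struct nhop_ref_table) +
 *	    num_nhops * sizeof(int32_t);
 *	struct nhop_ref_table *tbl = malloc(sz, M_RTABLE, M_NOWAIT | M_ZERO);
 *
 * refcnt[] then provides one reference counter per nexthop index; index 0
 * doubles as the failure marker returned by fib_ref_nhop().
 */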

/*
 * Data structure for the fib lookup instance tied to the particular rib.
 */
struct fib_data {
	uint32_t		number_nhops;	/* current # of nhops */
	uint8_t			hit_nhops;	/* true if out of nhop limit */
	uint8_t			init_done;	/* true if init is completed */
	uint32_t		fd_dead:1;	/* Scheduled for deletion */
	uint32_t		fd_linked:1;	/* true if linked */
	uint32_t		fd_need_rebuild:1;	/* true if rebuild scheduled */
	uint32_t		fd_force_eval:1;/* true if forced re-evaluation scheduled */
	uint8_t			fd_family;	/* family */
	uint32_t		fd_fibnum;	/* fibnum */
	uint32_t		fd_failed_rebuilds;	/* stat: failed rebuilds */
	uint32_t		fd_algo_mask;	/* bitmask for algo data */
	struct callout		fd_callout;	/* rebuild callout */
	void			*fd_algo_data;	/* algorithm data */
	struct nhop_object	**nh_idx;	/* nhop idx->ptr array */
	struct nhop_ref_table	*nh_ref_table;	/* array with # of nhop references */
	struct rib_head		*fd_rh;		/* RIB table we're attached to */
	struct rib_subscription	*fd_rs;		/* storing table subscription */
	struct fib_dp		fd_dp;		/* fib datapath data */
	struct vnet		*fd_vnet;	/* vnet fib belongs to */
	struct epoch_context	fd_epoch_ctx;	/* epoch context for deletion */
	struct fib_lookup_module	*fd_flm;/* pointer to the lookup module */
	uint32_t		fd_num_changes;	/* number of changes since last callout */
	TAILQ_ENTRY(fib_data)	entries;	/* list of all fds in vnet */
};

static void rebuild_fd_callout(void *_data);
static void destroy_fd_instance_epoch(epoch_context_t ctx);
static enum flm_op_result attach_datapath(struct fib_data *fd);
static bool is_idx_free(struct fib_data *fd, uint32_t index);
static void set_algo_fixed(struct rib_head *rh);
static bool is_algo_fixed(struct rib_head *rh);

static uint32_t fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh);
static void fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh);

static struct fib_lookup_module *fib_check_best_algo(struct rib_head *rh,
    struct fib_lookup_module *orig_flm);
static void fib_unref_algo(struct fib_lookup_module *flm);
static bool flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum);

struct mtx fib_mtx;
#define	FIB_MOD_LOCK()		mtx_lock(&fib_mtx)
#define	FIB_MOD_UNLOCK()	mtx_unlock(&fib_mtx)
#define	FIB_MOD_LOCK_ASSERT()	mtx_assert(&fib_mtx, MA_OWNED)

MTX_SYSINIT(fib_mtx, &fib_mtx, "algo list mutex", MTX_DEF);

/* Algorithm has to be this percent better than the current to switch */
#define	BEST_DIFF_PERCENT	(5 * 256 / 100)
/* Schedule algo re-evaluation X seconds after a change */
#define	ALGO_EVAL_DELAY_MS	30000
/* Force algo re-evaluation after X changes */
#define	ALGO_EVAL_NUM_ROUTES	100
/* Try to setup algorithm X times */
#define	FIB_MAX_TRIES		32
/* Max amount of supported nexthops */
#define	FIB_MAX_NHOPS		262144
#define	FIB_CALLOUT_DELAY_MS	50

/* Debug */
static int flm_debug_level = LOG_NOTICE;
SYSCTL_INT(_net_route_algo, OID_AUTO, debug_level, CTLFLAG_RW | CTLFLAG_RWTUN,
    &flm_debug_level, 0, "debug level");
#define	FLM_MAX_DEBUG_LEVEL	LOG_DEBUG

#define	_PASS_MSG(_l)	(flm_debug_level >= (_l))
#define	ALGO_PRINTF(_fmt, ...)	printf("[fib_algo] %s: " _fmt "\n", __func__, ##__VA_ARGS__)
#define	_ALGO_PRINTF(_fib, _fam, _aname, _func, _fmt, ...) \
    printf("[fib_algo] %s.%u (%s) %s: " _fmt "\n",\
    print_family(_fam), _fib, _aname, _func, ## __VA_ARGS__)
#define	_RH_PRINTF(_fib, _fam, _func, _fmt, ...) \
    printf("[fib_algo] %s.%u %s: " _fmt "\n", print_family(_fam), _fib, _func, ## __VA_ARGS__)
#define	RH_PRINTF(_l, _rh, _fmt, ...)	if (_PASS_MSG(_l)) {	\
    _RH_PRINTF(_rh->rib_fibnum, _rh->rib_family, __func__, _fmt, ## __VA_ARGS__);\
}
#define	FD_PRINTF(_l, _fd, _fmt, ...)	FD_PRINTF_##_l(_l, _fd, _fmt, ## __VA_ARGS__)
#define	_FD_PRINTF(_l, _fd, _fmt, ...)	if (_PASS_MSG(_l)) {		\
    _ALGO_PRINTF(_fd->fd_fibnum, _fd->fd_family, _fd->fd_flm->flm_name,	\
    __func__, _fmt, ## __VA_ARGS__);					\
}
#if FLM_MAX_DEBUG_LEVEL >= LOG_DEBUG
#define	FD_PRINTF_LOG_DEBUG	_FD_PRINTF
#else
/* Compiled out; swallow any arguments */
#define	FD_PRINTF_LOG_DEBUG(...)
#endif
#if FLM_MAX_DEBUG_LEVEL >= LOG_INFO
#define	FD_PRINTF_LOG_INFO	_FD_PRINTF
#else
#define	FD_PRINTF_LOG_INFO(...)
#endif
#define	FD_PRINTF_LOG_NOTICE	_FD_PRINTF
#define	FD_PRINTF_LOG_ERR	_FD_PRINTF
#define	FD_PRINTF_LOG_WARNING	_FD_PRINTF

/* List of all registered lookup algorithms */
static TAILQ_HEAD(, fib_lookup_module) all_algo_list = TAILQ_HEAD_INITIALIZER(all_algo_list);

/* List of all fib lookup instances in the vnet */
VNET_DEFINE_STATIC(TAILQ_HEAD(fib_data_head, fib_data), fib_data_list);
#define	V_fib_data_list	VNET(fib_data_list)

/* Data structure for storing non-transient fib lookup module failures */
struct fib_error {
	int				fe_family;
	uint32_t			fe_fibnum;	/* failed rtable */
	struct fib_lookup_module	*fe_flm;	/* failed module */
	TAILQ_ENTRY(fib_error)		entries;/* list of all errored entries */
};
VNET_DEFINE_STATIC(TAILQ_HEAD(fib_error_head, fib_error), fib_error_list);
#define	V_fib_error_list VNET(fib_error_list)

/* Per-family array of fibnum -> {func, arg} mappings used in datapath */
struct fib_dp_header {
	struct epoch_context	fdh_epoch_ctx;
	uint32_t		fdh_num_tables;
	struct fib_dp		fdh_idx[0];
};
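
/*
 * The datapath is handed only a pointer to fdh_idx[0]; the enclosing
 * header is recovered via __containerof() when the array has to be
 * replaced or freed. Round-trip sketch (using the helpers defined below):
 *
 *	struct fib_dp_header *fdh = alloc_fib_dp_array(num_tables, true);
 *	struct fib_dp *dp = &fdh->fdh_idx[0];	(the published pointer)
 *	MPASS(get_fib_dp_header(dp) == fdh);
 */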

/*
 * Tries to add a new non-transient algorithm error to the list of
 *  errors.
 * Returns true on success.
 */
static bool
flm_error_add(struct fib_lookup_module *flm, uint32_t fibnum)
{
	struct fib_error *fe;

	fe = malloc(sizeof(struct fib_error), M_TEMP, M_NOWAIT | M_ZERO);
	if (fe == NULL)
		return (false);
	fe->fe_flm = flm;
	fe->fe_family = flm->flm_family;
	fe->fe_fibnum = fibnum;

	FIB_MOD_LOCK();
	/* Avoid duplicates by checking if the error already exists first */
	if (flm_error_check(flm, fibnum)) {
		FIB_MOD_UNLOCK();
		free(fe, M_TEMP);
		return (true);
	}
	TAILQ_INSERT_HEAD(&V_fib_error_list, fe, entries);
	FIB_MOD_UNLOCK();

	return (true);
}

/*
 * True if a non-transient error has been registered for @flm in @fibnum.
 */
static bool
flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum)
{
	const struct fib_error *fe;

	TAILQ_FOREACH(fe, &V_fib_error_list, entries) {
		if ((fe->fe_flm == flm) && (fe->fe_fibnum == fibnum))
			return (true);
	}

	return (false);
}

/*
 * Clears all errors of the algo specified by @flm.
 */
static void
fib_error_clear_flm(struct fib_lookup_module *flm)
{
	struct fib_error *fe, *fe_tmp;

	FIB_MOD_LOCK_ASSERT();

	TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
		if (fe->fe_flm == flm) {
			TAILQ_REMOVE(&V_fib_error_list, fe, entries);
			free(fe, M_TEMP);
		}
	}
}

/*
 * Clears all errors in the current VNET.
 */
static void
fib_error_clear(void)
{
	struct fib_error *fe, *fe_tmp;

	FIB_MOD_LOCK_ASSERT();

	TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
		TAILQ_REMOVE(&V_fib_error_list, fe, entries);
		free(fe, M_TEMP);
	}
}

static const char *
print_op_result(enum flm_op_result result)
{
	switch (result) {
	case FLM_SUCCESS:
		return ("success");
	case FLM_REBUILD:
		return ("rebuild");
	case FLM_ERROR:
		return ("error");
	}

	return ("unknown");
}

static const char *
print_family(int family)
{

	if (family == AF_INET)
		return ("inet");
	else if (family == AF_INET6)
		return ("inet6");
	else
		return ("unknown");
}

/*
 * Debug function used by lookup algorithms.
 * Outputs a message denoted by @fmt, prepended by "[fib_algo] inetX.Y (algo) "
 */
void
fib_printf(int level, struct fib_data *fd, const char *func, char *fmt, ...)
{
	char buf[128];
	va_list ap;

	if (level > flm_debug_level)
		return;

	va_start(ap, fmt);
	vsnprintf(buf, sizeof(buf), fmt, ap);
	va_end(ap);

	_ALGO_PRINTF(fd->fd_fibnum, fd->fd_family, fd->fd_flm->flm_name,
	    func, "%s", buf);
}

/*
 * Outputs the list of algorithms supported by the provided address family.
 */
static int
print_algos_sysctl(struct sysctl_req *req, int family)
{
	struct fib_lookup_module *flm;
	struct sbuf sbuf;
	int error, count = 0;

	error = sysctl_wire_old_buffer(req, 0);
	if (error == 0) {
		sbuf_new_for_sysctl(&sbuf, NULL, 512, req);
		TAILQ_FOREACH(flm, &all_algo_list, entries) {
			if (flm->flm_family == family) {
				if (count++ > 0)
					sbuf_cat(&sbuf, ", ");
				sbuf_cat(&sbuf, flm->flm_name);
			}
		}
		error = sbuf_finish(&sbuf);
		sbuf_delete(&sbuf);
	}
	return (error);
}

#ifdef INET6
static int
print_algos_sysctl_inet6(SYSCTL_HANDLER_ARGS)
{

	return (print_algos_sysctl(req, AF_INET6));
}
SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo_list,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    print_algos_sysctl_inet6, "A", "List of IPv6 lookup algorithms");
#endif

#ifdef INET
static int
print_algos_sysctl_inet(SYSCTL_HANDLER_ARGS)
{

	return (print_algos_sysctl(req, AF_INET));
}
SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo_list,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    print_algos_sysctl_inet, "A", "List of IPv4 lookup algorithms");
#endif

/*
 * Calculate delay between repeated failures.
 * Returns current delay in milliseconds.
 */
static uint32_t
callout_calc_delay_ms(struct fib_data *fd)
{
	uint32_t shift;

	if (fd->fd_failed_rebuilds > 10)
		shift = 10;
	else
		shift = fd->fd_failed_rebuilds;

	return ((1 << shift) * FIB_CALLOUT_DELAY_MS);
}
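
/*
 * Worked example of the backoff above with FIB_CALLOUT_DELAY_MS == 50:
 * 0 failed rebuilds -> 50ms, 1 -> 100ms, 2 -> 200ms, ..., doubling up to
 * the cap of (1 << 10) * 50 == 51200ms (~51s) from ten failures onwards.
 */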

static void
schedule_callout(struct fib_data *fd, int delay_ms)
{

	callout_reset_sbt(&fd->fd_callout, 0, SBT_1MS * delay_ms,
	    rebuild_fd_callout, fd, 0);
}

static void
schedule_fd_rebuild(struct fib_data *fd)
{

	FIB_MOD_LOCK();
	if (!fd->fd_need_rebuild) {
		fd->fd_need_rebuild = true;

		/*
		 * Potentially re-schedules the pending callout
		 *  initiated by schedule_algo_eval.
		 */
		FD_PRINTF(LOG_INFO, fd, "Scheduling rebuild");
		schedule_callout(fd, callout_calc_delay_ms(fd));
	}
	FIB_MOD_UNLOCK();
}

static void
schedule_algo_eval(struct fib_data *fd)
{

	if (fd->fd_num_changes++ == 0) {
		/* Start callout to consider switch */
		FIB_MOD_LOCK();
		if (!callout_pending(&fd->fd_callout))
			schedule_callout(fd, ALGO_EVAL_DELAY_MS);
		FIB_MOD_UNLOCK();
	} else if (fd->fd_num_changes > ALGO_EVAL_NUM_ROUTES && !fd->fd_force_eval) {
		/* Reset callout to exec immediately */
		FIB_MOD_LOCK();
		if (!fd->fd_need_rebuild) {
			fd->fd_force_eval = true;
			schedule_callout(fd, 1);
		}
		FIB_MOD_UNLOCK();
	}
}

/*
 * Rib subscription handler. Checks if the algorithm is ready to
 *  receive updates, handles nexthop refcounting and passes change
 *  data to the algorithm callback.
 */
static void
handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
    void *_data)
{
	struct fib_data *fd = (struct fib_data *)_data;
	enum flm_op_result result;

	RIB_WLOCK_ASSERT(rnh);

	/*
	 * There is a small gap between subscribing for route changes
	 *  and initiating the rtable dump. Avoid receiving route changes
	 *  prior to finishing the rtable dump by checking `init_done`.
	 */
	if (!fd->init_done)
		return;
	/*
	 * If the algo requested a rebuild, stop sending updates by default.
	 * This simplifies nexthop refcount handling logic.
	 */
	if (fd->fd_need_rebuild)
		return;

	/* Consider scheduling algorithm re-evaluation */
	schedule_algo_eval(fd);

	/*
	 * Maintain the guarantee that every nexthop returned by the dataplane
	 *  lookup has > 0 refcount, so it can be safely referenced within the
	 *  current epoch.
	 */
	if (rc->rc_nh_new != NULL) {
		if (fib_ref_nhop(fd, rc->rc_nh_new) == 0) {
			/* ran out of indexes */
			schedule_fd_rebuild(fd);
			return;
		}
	}

	result = fd->fd_flm->flm_change_rib_item_cb(rnh, rc, fd->fd_algo_data);

	switch (result) {
	case FLM_SUCCESS:
		/* Unref the old nexthop on success */
		if (rc->rc_nh_old != NULL)
			fib_unref_nhop(fd, rc->rc_nh_old);
		break;
	case FLM_REBUILD:

		/*
		 * The algo is not able to apply the update.
		 * Schedule an algo rebuild.
		 */
		schedule_fd_rebuild(fd);
		break;
	case FLM_ERROR:

		/*
		 * The algo reported a non-recoverable error.
		 * Record the error and schedule a rebuild, which will
		 *  trigger best algo selection.
		 */
		FD_PRINTF(LOG_ERR, fd, "algo reported non-recoverable error");
		if (!flm_error_add(fd->fd_flm, fd->fd_fibnum))
			FD_PRINTF(LOG_ERR, fd, "failed to ban algo");
		schedule_fd_rebuild(fd);
	}
}

static void
estimate_nhop_scale(const struct fib_data *old_fd, struct fib_data *fd)
{

	if (old_fd == NULL) {
		/* TODO: read from rtable */
		fd->number_nhops = 16;
		return;
	}

	if (old_fd->hit_nhops && old_fd->number_nhops < FIB_MAX_NHOPS)
		fd->number_nhops = 2 * old_fd->number_nhops;
	else
		fd->number_nhops = old_fd->number_nhops;
}

struct walk_cbdata {
	struct fib_data		*fd;
	flm_dump_t		*func;
	enum flm_op_result	result;
};

/*
 * Handler called after all rtentries have been dumped.
 * Performs post-dump framework checks and calls
 * algo:flm_dump_end_cb().
 *
 * Updates walk_cbdata result.
 */
static void
sync_algo_end_cb(struct rib_head *rnh, enum rib_walk_hook stage, void *_data)
{
	struct walk_cbdata *w = (struct walk_cbdata *)_data;
	struct fib_data *fd = w->fd;

	RIB_WLOCK_ASSERT(w->fd->fd_rh);

	if (rnh->rib_dying) {
		w->result = FLM_ERROR;
		return;
	}

	if (fd->hit_nhops) {
		FD_PRINTF(LOG_INFO, fd, "ran out of nexthops at %u nhops",
		    fd->nh_ref_table->count);
		if (w->result == FLM_SUCCESS)
			w->result = FLM_REBUILD;
		return;
	}

	if (stage != RIB_WALK_HOOK_POST || w->result != FLM_SUCCESS)
		return;

	/* Post-dump hook, dump successful */
	w->result = fd->fd_flm->flm_dump_end_cb(fd->fd_algo_data, &fd->fd_dp);

	if (w->result == FLM_SUCCESS) {
		/* Mark init as done to allow routing updates */
		fd->init_done = 1;
	}
}

/*
 * Callback for each entry in the rib.
 * Calls algo:flm_dump_rib_item_cb func as a part of the initial
 *  route table synchronisation.
 */
static int
sync_algo_cb(struct rtentry *rt, void *_data)
{
	struct walk_cbdata *w = (struct walk_cbdata *)_data;

	RIB_WLOCK_ASSERT(w->fd->fd_rh);

	if (w->result == FLM_SUCCESS && w->func) {

		/*
		 * Reference nexthops to maintain the guarantee that
		 *  each nexthop returned by the datapath has > 0 references
		 *  and can be safely referenced within the current epoch.
		 */
		struct nhop_object *nh = rt_get_raw_nhop(rt);
		if (fib_ref_nhop(w->fd, nh) != 0)
			w->result = w->func(rt, w->fd->fd_algo_data);
		else
			w->result = FLM_REBUILD;
	}

	return (0);
}

/*
 * Dumps all routing table state to the algo instance.
 */
static enum flm_op_result
sync_algo(struct fib_data *fd)
{
	struct walk_cbdata w = {
		.fd = fd,
		.func = fd->fd_flm->flm_dump_rib_item_cb,
		.result = FLM_SUCCESS,
	};

	rib_walk_ext_internal(fd->fd_rh, true, sync_algo_cb, sync_algo_end_cb, &w);

	FD_PRINTF(LOG_INFO, fd, "initial dump completed, result: %s",
	    print_op_result(w.result));

	return (w.result);
}

/*
 * Schedules epoch-backed @fd instance deletion.
 * * Unlinks @fd from the list of active algo instances.
 * * Removes rib subscription.
 * * Stops callout.
 * * Schedules actual deletion.
 *
 * Assumes @fd is already unlinked from the datapath.
 */
static int
schedule_destroy_fd_instance(struct fib_data *fd, bool in_callout)
{
	bool is_dead;

	NET_EPOCH_ASSERT();

	FIB_MOD_LOCK();
	is_dead = fd->fd_dead;
	if (!is_dead)
		fd->fd_dead = true;
	if (fd->fd_linked) {
		TAILQ_REMOVE(&V_fib_data_list, fd, entries);
		fd->fd_linked = false;
	}
	FIB_MOD_UNLOCK();
	if (is_dead)
		return (0);

	FD_PRINTF(LOG_INFO, fd, "DETACH");

	if (fd->fd_rs != NULL)
		rib_unsibscribe(fd->fd_rs);

	/*
	 * After rib_unsubscribe() no _new_ handle_rtable_change_cb() calls
	 * will be executed, hence no _new_ callout schedules will happen.
	 *
	 * There can be 2 possible scenarios here:
	 * 1) we're running inside a callout when we're deleting ourselves
	 *  due to migration to a newer fd
	 * 2) we're running from rt_table_destroy() and the callout is scheduled
	 *  for execution OR is executing
	 *
	 * For (2) we need to wait for the callout termination, as the routing table
	 * will be destroyed after this function returns.
	 * For (1) we cannot call drain, but can ensure that this is the last invocation.
	 */

	if (in_callout)
		callout_stop(&fd->fd_callout);
	else
		callout_drain(&fd->fd_callout);

	epoch_call(net_epoch_preempt, destroy_fd_instance_epoch,
	    &fd->fd_epoch_ctx);

	return (0);
}

/*
 * Wipes all fd instances from the list matching the rib specified by @rh.
 * If @keep_first is set, remove all but the first record.
 */
static void
fib_cleanup_algo(struct rib_head *rh, bool keep_first, bool in_callout)
{
	struct fib_data_head tmp_head = TAILQ_HEAD_INITIALIZER(tmp_head);
	struct fib_data *fd, *fd_tmp;
	struct epoch_tracker et;

	FIB_MOD_LOCK();
	TAILQ_FOREACH_SAFE(fd, &V_fib_data_list, entries, fd_tmp) {
		if (fd->fd_rh == rh) {
			if (keep_first) {
				keep_first = false;
				continue;
			}
			TAILQ_REMOVE(&V_fib_data_list, fd, entries);
			fd->fd_linked = false;
			TAILQ_INSERT_TAIL(&tmp_head, fd, entries);
		}
	}
	FIB_MOD_UNLOCK();

	/* Pass 2: remove each entry */
	NET_EPOCH_ENTER(et);
	TAILQ_FOREACH_SAFE(fd, &tmp_head, entries, fd_tmp) {
		schedule_destroy_fd_instance(fd, in_callout);
	}
	NET_EPOCH_EXIT(et);
}

void
fib_destroy_rib(struct rib_head *rh)
{

	/*
	 * rnh has the `rib_dying` flag set, so setup of new fd's will fail at
	 *  the sync_algo() stage, preventing new entries from being added to
	 *  the list of active algos. Remove all existing entries for the
	 *  particular rib.
	 */
	fib_cleanup_algo(rh, false, false);
}

/*
 * Finalises fd destruction by freeing all fd resources.
 */
static void
destroy_fd_instance(struct fib_data *fd)
{

	FD_PRINTF(LOG_INFO, fd, "destroy fd %p", fd);

	/* Call destroy callback first */
	if (fd->fd_algo_data != NULL)
		fd->fd_flm->flm_destroy_cb(fd->fd_algo_data);

	/* Nhop table */
	if ((fd->nh_idx != NULL) && (fd->nh_ref_table != NULL)) {
		for (int i = 0; i < fd->number_nhops; i++) {
			if (!is_idx_free(fd, i)) {
				FD_PRINTF(LOG_DEBUG, fd, " FREE nhop %d %p",
				    i, fd->nh_idx[i]);
				nhop_free_any(fd->nh_idx[i]);
			}
		}
		free(fd->nh_idx, M_RTABLE);
	}
	if (fd->nh_ref_table != NULL)
		free(fd->nh_ref_table, M_RTABLE);

	fib_unref_algo(fd->fd_flm);

	free(fd, M_RTABLE);
}

/*
 * Epoch callback indicating fd is safe to destroy
 */
static void
destroy_fd_instance_epoch(epoch_context_t ctx)
{
	struct fib_data *fd;

	fd = __containerof(ctx, struct fib_data, fd_epoch_ctx);

	destroy_fd_instance(fd);
}

/*
 * Tries to setup fd instance.
 * - Allocates fd/nhop table
 * - Runs algo:flm_init_cb algo init
 * - Subscribes fd to the rib
 * - Runs rtable dump
 * - Adds instance to the list of active instances.
 *
 * Returns: operation result. Fills in @pfd with resulting fd on success.
 */
static enum flm_op_result
try_setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
    struct fib_data *old_fd, struct fib_data **pfd)
{
	struct fib_data *fd;
	size_t size;
	enum flm_op_result result;

	/* Allocate */
	fd = malloc(sizeof(struct fib_data), M_RTABLE, M_NOWAIT | M_ZERO);
	if (fd == NULL) {
		*pfd = NULL;
		return (FLM_REBUILD);
	}
	*pfd = fd;

	estimate_nhop_scale(old_fd, fd);

	fd->fd_rh = rh;
	fd->fd_family = rh->rib_family;
	fd->fd_fibnum = rh->rib_fibnum;
	callout_init(&fd->fd_callout, 1);
	fd->fd_vnet = curvnet;
	fd->fd_flm = flm;

	FIB_MOD_LOCK();
	flm->flm_refcount++;
	FIB_MOD_UNLOCK();

	/* Allocate nhidx -> nhop_ptr table */
	size = fd->number_nhops * sizeof(void *);
	fd->nh_idx = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
	if (fd->nh_idx == NULL) {
		FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop table idx (sz:%zu)", size);
		return (FLM_REBUILD);
	}

	/* Allocate nhop index refcount table */
	size = sizeof(struct nhop_ref_table);
	size += fd->number_nhops * sizeof(uint32_t);
	fd->nh_ref_table = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
	if (fd->nh_ref_table == NULL) {
		FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop refcount table (sz:%zu)", size);
		return (FLM_REBUILD);
	}
	FD_PRINTF(LOG_DEBUG, fd, "Allocated %u nhop indexes", fd->number_nhops);

	/* Okay, we're ready for algo init */
	void *old_algo_data = (old_fd != NULL) ? old_fd->fd_algo_data : NULL;
	result = flm->flm_init_cb(fd->fd_fibnum, fd, old_algo_data, &fd->fd_algo_data);
	if (result != FLM_SUCCESS)
		return (result);

	/* Try to subscribe */
	if (flm->flm_change_rib_item_cb != NULL) {
		fd->fd_rs = rib_subscribe_internal(fd->fd_rh,
		    handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE, 0);
		if (fd->fd_rs == NULL)
			return (FLM_REBUILD);
	}

	/* Dump */
	result = sync_algo(fd);
	if (result != FLM_SUCCESS)
		return (result);
	FD_PRINTF(LOG_INFO, fd, "DUMP completed successfully.");

	FIB_MOD_LOCK();
	/*
	 * Insert fd at the beginning of the list, to maintain the invariant
	 *  that the first matching entry for the AF/fib is always the active
	 *  one.
	 */
	TAILQ_INSERT_HEAD(&V_fib_data_list, fd, entries);
	fd->fd_linked = true;
	FIB_MOD_UNLOCK();

	return (FLM_SUCCESS);
}

/*
 * Sets up algo @flm for table @rh and links it to the datapath.
 */
static enum flm_op_result
setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
    struct fib_data *orig_fd, struct fib_data **pfd, bool attach)
{
	struct fib_data *prev_fd, *new_fd;
	struct epoch_tracker et;
	enum flm_op_result result;

	prev_fd = orig_fd;
	new_fd = NULL;
	for (int i = 0; i < FIB_MAX_TRIES; i++) {
		NET_EPOCH_ENTER(et);
		result = try_setup_fd_instance(flm, rh, prev_fd, &new_fd);

		if ((result == FLM_SUCCESS) && attach)
			result = attach_datapath(new_fd);

		if ((prev_fd != NULL) && (prev_fd != orig_fd)) {
			schedule_destroy_fd_instance(prev_fd, false);
			prev_fd = NULL;
		}
		NET_EPOCH_EXIT(et);

		RH_PRINTF(LOG_INFO, rh, "try %d: fib algo result: %s", i,
		    print_op_result(result));

		if (result == FLM_REBUILD) {
			prev_fd = new_fd;
			new_fd = NULL;
			continue;
		}

		break;
	}

	if (result != FLM_SUCCESS) {
		/* update failure count */
		FIB_MOD_LOCK();
		if (orig_fd != NULL)
			orig_fd->fd_failed_rebuilds++;
		FIB_MOD_UNLOCK();

		/* Ban algo on non-recoverable error */
		if (result == FLM_ERROR)
			flm_error_add(flm, rh->rib_fibnum);

		NET_EPOCH_ENTER(et);
		if ((prev_fd != NULL) && (prev_fd != orig_fd))
			schedule_destroy_fd_instance(prev_fd, false);
		if (new_fd != NULL) {
			schedule_destroy_fd_instance(new_fd, false);
			new_fd = NULL;
		}
		NET_EPOCH_EXIT(et);
	}

	*pfd = new_fd;
	return (result);
}

/*
 * Callout for all scheduled fd-related work.
 * - Checks if the current algo is still the best algo
 * - Creates a new instance of an algo for af/fib if desired.
 */
static void
rebuild_fd_callout(void *_data)
{
	struct fib_data *fd, *fd_new, *fd_tmp;
	struct fib_lookup_module *flm_new = NULL;
	struct epoch_tracker et;
	enum flm_op_result result;
	bool need_rebuild = false;

	fd = (struct fib_data *)_data;

	FIB_MOD_LOCK();
	need_rebuild = fd->fd_need_rebuild;
	fd->fd_need_rebuild = false;
	fd->fd_force_eval = false;
	fd->fd_num_changes = 0;
	FIB_MOD_UNLOCK();

	CURVNET_SET(fd->fd_vnet);

	/* First, check if we're still OK to use this algo */
	if (!is_algo_fixed(fd->fd_rh))
		flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm);
	if ((flm_new == NULL) && (!need_rebuild)) {
		/* Keep the existing algo, no need to rebuild. */
		CURVNET_RESTORE();
		return;
	}

	if (flm_new == NULL) {
		flm_new = fd->fd_flm;
		fd_tmp = fd;
	} else {
		fd_tmp = NULL;
		FD_PRINTF(LOG_NOTICE, fd, "switching algo to %s", flm_new->flm_name);
	}
	result = setup_fd_instance(flm_new, fd->fd_rh, fd_tmp, &fd_new, true);
	if (fd_tmp == NULL) {
		/* fd_new represents the new algo */
		fib_unref_algo(flm_new);
	}
	if (result != FLM_SUCCESS) {
		FD_PRINTF(LOG_NOTICE, fd, "table rebuild failed");
		CURVNET_RESTORE();
		return;
	}
	FD_PRINTF(LOG_INFO, fd_new, "switched to new instance");

	/* Schedule removal of the old instance */
	if (fd != NULL) {
		NET_EPOCH_ENTER(et);
		schedule_destroy_fd_instance(fd, true);
		NET_EPOCH_EXIT(et);
	}

	CURVNET_RESTORE();
}

/*
 * Finds algo by name/family.
 * Returns referenced algo or NULL.
 */
static struct fib_lookup_module *
fib_find_algo(const char *algo_name, int family)
{
	struct fib_lookup_module *flm;

	FIB_MOD_LOCK();
	TAILQ_FOREACH(flm, &all_algo_list, entries) {
		if ((strcmp(flm->flm_name, algo_name) == 0) &&
		    (family == flm->flm_family)) {
			flm->flm_refcount++;
			FIB_MOD_UNLOCK();
			return (flm);
		}
	}
	FIB_MOD_UNLOCK();

	return (NULL);
}

static void
fib_unref_algo(struct fib_lookup_module *flm)
{

	FIB_MOD_LOCK();
	flm->flm_refcount--;
	FIB_MOD_UNLOCK();
}

static int
set_fib_algo(uint32_t fibnum, int family, struct sysctl_oid *oidp, struct sysctl_req *req)
{
	struct fib_lookup_module *flm = NULL;
	struct fib_data *fd = NULL;
	char old_algo_name[32], algo_name[32];
	struct rib_head *rh = NULL;
	enum flm_op_result result;
	int error;

	/* Fetch the current algo/rib for the af/family */
	FIB_MOD_LOCK();
	TAILQ_FOREACH(fd, &V_fib_data_list, entries) {
		if ((fd->fd_family == family) && (fd->fd_fibnum == fibnum))
			break;
	}
	if (fd == NULL) {
		FIB_MOD_UNLOCK();
		return (ENOENT);
	}
	rh = fd->fd_rh;
	strlcpy(old_algo_name, fd->fd_flm->flm_name,
	    sizeof(old_algo_name));
	FIB_MOD_UNLOCK();

	strlcpy(algo_name, old_algo_name, sizeof(algo_name));
	error = sysctl_handle_string(oidp, algo_name, sizeof(algo_name), req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	if (strcmp(algo_name, old_algo_name) == 0)
		return (0);

	/* The new algorithm name is different */
	flm = fib_find_algo(algo_name, family);
	if (flm == NULL) {
		RH_PRINTF(LOG_INFO, rh, "unable to find algo %s", algo_name);
		return (ESRCH);
	}

	fd = NULL;
	result = setup_fd_instance(flm, rh, NULL, &fd, true);
	fib_unref_algo(flm);
	if (result != FLM_SUCCESS)
		return (EINVAL);

	/* Disable automated jumping between algos */
	FIB_MOD_LOCK();
	set_algo_fixed(rh);
	FIB_MOD_UNLOCK();
	/* Remove old instance(s) */
	fib_cleanup_algo(rh, true, false);

	/* Drain cb so the user can unload the module after userret if so desired */
	epoch_drain_callbacks(net_epoch_preempt);

	return (0);
}

#ifdef INET
static int
set_algo_inet_sysctl_handler(SYSCTL_HANDLER_ARGS)
{

	return (set_fib_algo(RT_DEFAULT_FIB, AF_INET, oidp, req));
}
SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo,
    CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
    set_algo_inet_sysctl_handler, "A", "Set IPv4 lookup algo");
#endif

#ifdef INET6
static int
set_algo_inet6_sysctl_handler(SYSCTL_HANDLER_ARGS)
{

	return (set_fib_algo(RT_DEFAULT_FIB, AF_INET6, oidp, req));
}
SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo,
    CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
    set_algo_inet6_sysctl_handler, "A", "Set IPv6 lookup algo");
#endif
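
/*
 * Userland view of the handlers above (algo names come from the lookup
 * modules compiled into the kernel; "bsearch4" is one example for inet):
 *
 *	# sysctl net.route.algo.inet.algo_list
 *	# sysctl net.route.algo.inet.algo=bsearch4
 *
 * Writing the knob pins the chosen algorithm via set_algo_fixed(),
 * disabling automatic re-evaluation for that family in this vnet.
 */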

static void
destroy_fdh_epoch(epoch_context_t ctx)
{
	struct fib_dp_header *fdh;

	fdh = __containerof(ctx, struct fib_dp_header, fdh_epoch_ctx);
	free(fdh, M_RTABLE);
}

static struct fib_dp_header *
alloc_fib_dp_array(uint32_t num_tables, bool waitok)
{
	size_t sz;
	struct fib_dp_header *fdh;

	sz = sizeof(struct fib_dp_header);
	sz += sizeof(struct fib_dp) * num_tables;
	fdh = malloc(sz, M_RTABLE, (waitok ? M_WAITOK : M_NOWAIT) | M_ZERO);
	if (fdh != NULL)
		fdh->fdh_num_tables = num_tables;
	return (fdh);
}

static struct fib_dp_header *
get_fib_dp_header(struct fib_dp *dp)
{

	return (__containerof((void *)dp, struct fib_dp_header, fdh_idx));
}

/*
 * Replaces the per-family index pool @pdp with a new one which
 * contains updated callback/algo data from @fd.
 * Returns FLM_SUCCESS on success.
 */
static enum flm_op_result
replace_rtables_family(struct fib_dp **pdp, struct fib_data *fd)
{
	struct fib_dp_header *new_fdh, *old_fdh;

	NET_EPOCH_ASSERT();

	FD_PRINTF(LOG_DEBUG, fd, "[vnet %p] replace with f:%p arg:%p",
	    curvnet, fd->fd_dp.f, fd->fd_dp.arg);

	FIB_MOD_LOCK();
	old_fdh = get_fib_dp_header(*pdp);
	new_fdh = alloc_fib_dp_array(old_fdh->fdh_num_tables, false);
	FD_PRINTF(LOG_DEBUG, fd, "OLD FDH: %p NEW FDH: %p", old_fdh, new_fdh);
	if (new_fdh == NULL) {
		FIB_MOD_UNLOCK();
		FD_PRINTF(LOG_WARNING, fd, "error attaching datapath");
		return (FLM_REBUILD);
	}

	memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0],
	    old_fdh->fdh_num_tables * sizeof(struct fib_dp));
	/* Update the relevant data structure for @fd */
	new_fdh->fdh_idx[fd->fd_fibnum] = fd->fd_dp;

	/* Ensure memcpy() writes have completed */
	atomic_thread_fence_rel();
	/* Set the new datapath pointer */
	*pdp = &new_fdh->fdh_idx[0];
	FIB_MOD_UNLOCK();
	FD_PRINTF(LOG_DEBUG, fd, "update %p -> %p", old_fdh, new_fdh);

	epoch_call(net_epoch_preempt, destroy_fdh_epoch,
	    &old_fdh->fdh_epoch_ctx);

	return (FLM_SUCCESS);
}
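
/*
 * Reader side of the swap above, sketched for illustration: the datapath
 * dereferences the per-family pointer inside the network epoch and never
 * takes FIB_MOD_LOCK, so the old array must stay readable until all
 * in-epoch readers have left, hence epoch_call() instead of free():
 *
 *	NET_EPOCH_ENTER(et);
 *	dp = &V_inet_dp[fibnum];	(may still be the old array)
 *	nh = dp->f(dp->arg, key, scope);
 *	NET_EPOCH_EXIT(et);
 */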

static struct fib_dp **
get_family_dp_ptr(int family)
{
	switch (family) {
	case AF_INET:
		return (&V_inet_dp);
	case AF_INET6:
		return (&V_inet6_dp);
	}
	return (NULL);
}

/*
 * Makes the datapath use fib instance @fd
 */
static enum flm_op_result
attach_datapath(struct fib_data *fd)
{
	struct fib_dp **pdp;

	pdp = get_family_dp_ptr(fd->fd_family);
	return (replace_rtables_family(pdp, fd));
}

/*
 * Grows the datapath pointers array.
 * Called from the sysctl handler on growing the number of routing tables.
 */
static void
grow_rtables_family(struct fib_dp **pdp, uint32_t new_num_tables)
{
	struct fib_dp_header *new_fdh, *old_fdh = NULL;

	new_fdh = alloc_fib_dp_array(new_num_tables, true);

	FIB_MOD_LOCK();
	if (*pdp != NULL) {
		old_fdh = get_fib_dp_header(*pdp);
		memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0],
		    old_fdh->fdh_num_tables * sizeof(struct fib_dp));
	}

	/* Ensure all array writes have completed */
	atomic_thread_fence_rel();

	*pdp = &new_fdh->fdh_idx[0];
	FIB_MOD_UNLOCK();

	if (old_fdh != NULL)
		epoch_call(net_epoch_preempt, destroy_fdh_epoch,
		    &old_fdh->fdh_epoch_ctx);
}

/*
 * Grows per-AF arrays of datapath pointers for each supported family.
 * Called from the fibs resize sysctl handler.
 */
void
fib_grow_rtables(uint32_t new_num_tables)
{

#ifdef INET
	grow_rtables_family(get_family_dp_ptr(AF_INET), new_num_tables);
#endif
#ifdef INET6
	grow_rtables_family(get_family_dp_ptr(AF_INET6), new_num_tables);
#endif
}

void
fib_get_rtable_info(struct rib_head *rh, struct rib_rtable_info *rinfo)
{

	bzero(rinfo, sizeof(struct rib_rtable_info));
	rinfo->num_prefixes = rh->rnh_prefixes;
	rinfo->num_nhops = nhops_get_count(rh);
#ifdef ROUTE_MPATH
	rinfo->num_nhgrp = nhgrp_get_count(rh);
#endif
}

/*
 * Accessor to get the rib instance @fd is attached to.
 */
struct rib_head *
fib_get_rh(struct fib_data *fd)
{

	return (fd->fd_rh);
}

/*
 * Accessor to export the idx->nhop array
 */
struct nhop_object **
fib_get_nhop_array(struct fib_data *fd)
{

	return (fd->nh_idx);
}

static uint32_t
get_nhop_idx(struct nhop_object *nh)
{
#ifdef ROUTE_MPATH
	if (NH_IS_NHGRP(nh))
		return (nhgrp_get_idx((struct nhgrp_object *)nh) * 2 - 1);
	else
		return (nhop_get_idx(nh) * 2);
#else
	return (nhop_get_idx(nh));
#endif
}
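
/*
 * Worked example of the mapping above under ROUTE_MPATH: nexthops land in
 * even slots and nexthop groups in odd ones, so both kinds can share
 * nh_idx[] without colliding:
 *
 *	nhop  idx: 1 -> 2, 2 -> 4, 3 -> 6, ...
 *	nhgrp idx: 1 -> 1, 2 -> 3, 3 -> 5, ...
 *
 * This assumes private nhop/nhgrp indexes start from 1, which also keeps
 * slot 0 free to serve as the failure marker returned by fib_ref_nhop().
 */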

uint32_t
fib_get_nhop_idx(struct fib_data *fd, struct nhop_object *nh)
{

	return (get_nhop_idx(nh));
}

static bool
is_idx_free(struct fib_data *fd, uint32_t index)
{

	return (fd->nh_ref_table->refcnt[index] == 0);
}

static uint32_t
fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh)
{
	uint32_t idx = get_nhop_idx(nh);

	if (idx >= fd->number_nhops) {
		fd->hit_nhops = 1;
		return (0);
	}

	if (is_idx_free(fd, idx)) {
		nhop_ref_any(nh);
		fd->nh_idx[idx] = nh;
		fd->nh_ref_table->count++;
		FD_PRINTF(LOG_DEBUG, fd, " REF nhop %u %p", idx, fd->nh_idx[idx]);
	}
	fd->nh_ref_table->refcnt[idx]++;

	return (idx);
}

struct nhop_release_data {
	struct nhop_object	*nh;
	struct epoch_context	ctx;
};

static void
release_nhop_epoch(epoch_context_t ctx)
{
	struct nhop_release_data *nrd;

	nrd = __containerof(ctx, struct nhop_release_data, ctx);
	nhop_free_any(nrd->nh);
	free(nrd, M_TEMP);
}

/*
 * Delays nexthop refcount release.
 * The datapath may not have its data structures updated yet, so the old
 *  nexthop may still be returned till the end of the current epoch. Delay
 *  refcount removal, as we may be removing the last instance, which will
 *  trigger nexthop deletion, rendering the returned nexthop invalid.
 */
static void
fib_schedule_release_nhop(struct fib_data *fd, struct nhop_object *nh)
{
	struct nhop_release_data *nrd;

	nrd = malloc(sizeof(struct nhop_release_data), M_TEMP, M_NOWAIT | M_ZERO);
	if (nrd != NULL) {
		nrd->nh = nh;
		epoch_call(net_epoch_preempt, release_nhop_epoch, &nrd->ctx);
	} else {
		/*
		 * Unable to allocate memory. Leak the nexthop to maintain the
		 *  guarantee that each nhop can be referenced.
		 */
		FD_PRINTF(LOG_ERR, fd, "unable to schedule nhop %p deletion", nh);
	}
}

static void
fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh)
{
	uint32_t idx = get_nhop_idx(nh);

	KASSERT((idx < fd->number_nhops), ("invalid nhop index"));
	KASSERT((nh == fd->nh_idx[idx]), ("index table contains wrong nh"));

	fd->nh_ref_table->refcnt[idx]--;
	if (fd->nh_ref_table->refcnt[idx] == 0) {
		FD_PRINTF(LOG_DEBUG, fd, " FREE nhop %u %p", idx, fd->nh_idx[idx]);
		fib_schedule_release_nhop(fd, fd->nh_idx[idx]);
	}
}

static void
set_algo_fixed(struct rib_head *rh)
{
	switch (rh->rib_family) {
#ifdef INET
	case AF_INET:
		V_algo_fixed_inet = true;
		break;
#endif
#ifdef INET6
	case AF_INET6:
		V_algo_fixed_inet6 = true;
		break;
#endif
	}
}

static bool
is_algo_fixed(struct rib_head *rh)
{

	switch (rh->rib_family) {
#ifdef INET
	case AF_INET:
		return (V_algo_fixed_inet);
#endif
#ifdef INET6
	case AF_INET6:
		return (V_algo_fixed_inet6);
#endif
	}
	return (false);
}

/*
 * Runs the check on what would be the best algo for rib @rh, assuming
 *  that the current algo is the one specified by @orig_flm. Note that
 *  it can be NULL for the initial selection.
 *
 * Returns a referenced new algo or NULL if the current one is the best.
 */
static struct fib_lookup_module *
fib_check_best_algo(struct rib_head *rh, struct fib_lookup_module *orig_flm)
{
	uint8_t preference, curr_preference = 0, best_preference = 0;
	struct fib_lookup_module *flm, *best_flm = NULL;
	struct rib_rtable_info rinfo;
	int candidate_algos = 0;

	fib_get_rtable_info(rh, &rinfo);

	FIB_MOD_LOCK();
	TAILQ_FOREACH(flm, &all_algo_list, entries) {
		if (flm->flm_family != rh->rib_family)
			continue;
		candidate_algos++;
		preference = flm->flm_get_pref(&rinfo);
		if (preference > best_preference) {
			if (!flm_error_check(flm, rh->rib_fibnum)) {
				best_preference = preference;
				best_flm = flm;
			}
		}
		if (flm == orig_flm)
			curr_preference = preference;
	}
	if ((best_flm != NULL) && (curr_preference + BEST_DIFF_PERCENT < best_preference))
		best_flm->flm_refcount++;
	else
		best_flm = NULL;
	FIB_MOD_UNLOCK();

	RH_PRINTF(LOG_DEBUG, rh, "candidate_algos: %d, curr: %s(%d) result: %s(%d)",
	    candidate_algos, orig_flm ? orig_flm->flm_name : "NULL", curr_preference,
	    best_flm ? best_flm->flm_name : (orig_flm ? orig_flm->flm_name : "NULL"),
	    best_preference);

	return (best_flm);
}
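
/*
 * Preference arithmetic example: flm_get_pref() reports on a 0..255
 * scale and BEST_DIFF_PERCENT is (5 * 256 / 100) == 12. With the current
 * algo at preference 200, a candidate is switched to only if it scores
 * at least 213, i.e. is roughly 5% better.
 */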

/*
 * Called when a new route table is created.
 * Selects, allocates and attaches the fib algo for the table.
 */
int
fib_select_algo_initial(struct rib_head *rh)
{
	struct fib_lookup_module *flm;
	struct fib_data *fd = NULL;
	enum flm_op_result result;
	int error = 0;

	flm = fib_check_best_algo(rh, NULL);
	if (flm == NULL) {
		RH_PRINTF(LOG_CRIT, rh, "no algo selected");
		return (ENOENT);
	}
	RH_PRINTF(LOG_INFO, rh, "selected algo %s", flm->flm_name);

	result = setup_fd_instance(flm, rh, NULL, &fd, false);
	RH_PRINTF(LOG_DEBUG, rh, "result=%d fd=%p", result, fd);
	if (result == FLM_SUCCESS) {

		/*
		 * Attach the datapath directly to avoid multiple reallocations
		 * during fib growth
		 */
		struct fib_dp_header *fdp;
		struct fib_dp **pdp;

		pdp = get_family_dp_ptr(rh->rib_family);
		if (pdp != NULL) {
			fdp = get_fib_dp_header(*pdp);
			fdp->fdh_idx[fd->fd_fibnum] = fd->fd_dp;
			FD_PRINTF(LOG_INFO, fd, "datapath attached");
		}
	} else {
		error = EINVAL;
		RH_PRINTF(LOG_CRIT, rh, "unable to setup algo %s", flm->flm_name);
	}

	fib_unref_algo(flm);

	return (error);
}

/*
 * Registers a fib lookup module within the subsystem.
 */
int
fib_module_register(struct fib_lookup_module *flm)
{

	FIB_MOD_LOCK();
	ALGO_PRINTF("attaching %s to %s", flm->flm_name,
	    print_family(flm->flm_family));
	TAILQ_INSERT_TAIL(&all_algo_list, flm, entries);
	FIB_MOD_UNLOCK();

	return (0);
}
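
/*
 * Hypothetical module-side usage; "example_*" names are illustrative and
 * not real kernel symbols. A lookup module fills in the callbacks consumed
 * by this file and registers itself, typically from a module init hook:
 *
 *	static struct fib_lookup_module example_flm = {
 *		.flm_name = "example4",
 *		.flm_family = AF_INET,
 *		.flm_init_cb = example_init,
 *		.flm_destroy_cb = example_destroy,
 *		.flm_dump_rib_item_cb = example_dump_rib_item,
 *		.flm_dump_end_cb = example_dump_end,
 *		.flm_change_rib_item_cb = example_change_rib_item,
 *		.flm_get_pref = example_get_pref,
 *	};
 *
 *	error = fib_module_register(&example_flm);
 *
 * Unregistration (fib_module_unregister() below) requires flm_refcount
 * to have dropped to zero.
 */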

/*
 * Tries to unregister a fib lookup module.
 *
 * Returns 0 on success, EBUSY if the module is still used
 *  by some of the tables.
 */
int
fib_module_unregister(struct fib_lookup_module *flm)
{

	FIB_MOD_LOCK();
	if (flm->flm_refcount > 0) {
		FIB_MOD_UNLOCK();
		return (EBUSY);
	}
	fib_error_clear_flm(flm);
	ALGO_PRINTF("detaching %s from %s", flm->flm_name,
	    print_family(flm->flm_family));
	TAILQ_REMOVE(&all_algo_list, flm, entries);
	FIB_MOD_UNLOCK();

	return (0);
}

void
vnet_fib_init(void)
{

	TAILQ_INIT(&V_fib_data_list);
	TAILQ_INIT(&V_fib_error_list);
}

void
vnet_fib_destroy(void)
{

	FIB_MOD_LOCK();
	fib_error_clear();
	FIB_MOD_UNLOCK();
}