1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2020 Alexander V. Chernikov
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 #include "opt_inet.h"
30 #include "opt_inet6.h"
31 #include "opt_route.h"
32 
33 #include <sys/param.h>
34 #include <sys/eventhandler.h>
35 #include <sys/kernel.h>
36 #include <sys/sbuf.h>
37 #include <sys/lock.h>
38 #include <sys/rmlock.h>
39 #include <sys/malloc.h>
40 #include <sys/mbuf.h>
41 #include <sys/module.h>
42 #include <sys/kernel.h>
43 #include <sys/priv.h>
44 #include <sys/proc.h>
45 #include <sys/socket.h>
46 #include <sys/socketvar.h>
47 #include <sys/stdarg.h>
48 #include <sys/sysctl.h>
49 #include <sys/syslog.h>
50 #include <sys/queue.h>
51 #include <net/vnet.h>
52 
53 #include <net/if.h>
54 #include <net/if_var.h>
55 
56 #include <netinet/in.h>
57 #include <netinet/in_var.h>
58 #include <netinet/ip.h>
59 #include <netinet/ip_var.h>
60 #ifdef INET6
61 #include <netinet/ip6.h>
62 #include <netinet6/ip6_var.h>
63 #endif
64 
65 #include <net/route.h>
66 #include <net/route/nhop.h>
67 #include <net/route/route_ctl.h>
68 #include <net/route/route_var.h>
69 #include <net/route/fib_algo.h>
70 
71 /*
72  * Fib lookup framework.
73  *
74  * This framework enables accelerated longest-prefix-match lookups for the
75  *  routing tables by adding the ability to dynamically attach/detach lookup
76  *  algorithms implementation to/from the datapath.
77  *
78  * flm - fib lookup modules - implementation of particular lookup algorithm
79  * fd - fib data - instance of an flm bound to specific routing table
80  *
81  * This file provides main framework functionality.
82  *
83  * The following are the features provided by the framework
84  *
85  * 1) nexhops abstraction -> provides transparent referencing, indexing
86  *   and efficient idx->ptr mappings for nexthop and nexthop groups.
87  * 2) Routing table synchronisation
88  * 3) dataplane attachment points
89  * 4) automatic algorithm selection based on the provided preference.
90  *
91  *
92  * DATAPATH
93  * For each supported address family, there is a an allocated array of fib_dp
94  *  structures, indexed by fib number. Each array entry contains callback function
95  *  and its argument. This function will be called with a family-specific lookup key,
96  *  scope and provided argument. This array gets re-created every time when new algo
97  *  instance gets created. Please take a look at the replace_rtables_family() function
98  *  for more details.
99  *
100  */
101 
102 SYSCTL_DECL(_net_route);
103 SYSCTL_NODE(_net_route, OID_AUTO, algo, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
104     "Fib algorithm lookups");
105 
106 /* Algorithm sync policy */
107 
108 /* Time interval to bucket updates */
109 VNET_DEFINE_STATIC(unsigned int, update_bucket_time_ms) = 50;
110 #define	V_update_bucket_time_ms	VNET(update_bucket_time_ms)
111 SYSCTL_UINT(_net_route_algo, OID_AUTO, bucket_time_ms, CTLFLAG_RW | CTLFLAG_VNET,
112     &VNET_NAME(update_bucket_time_ms), 0, "Time interval to calculate update rate");
113 
114 /* Minimum update rate to delay sync */
115 VNET_DEFINE_STATIC(unsigned int, bucket_change_threshold_rate) = 500;
116 #define	V_bucket_change_threshold_rate	VNET(bucket_change_threshold_rate)
117 SYSCTL_UINT(_net_route_algo, OID_AUTO, bucket_change_threshold_rate, CTLFLAG_RW | CTLFLAG_VNET,
118     &VNET_NAME(bucket_change_threshold_rate), 0, "Minimum update rate to delay sync");
119 
120 /* Max allowed delay to sync */
121 VNET_DEFINE_STATIC(unsigned int, fib_max_sync_delay_ms) = 1000;
122 #define	V_fib_max_sync_delay_ms	VNET(fib_max_sync_delay_ms)
123 SYSCTL_UINT(_net_route_algo, OID_AUTO, fib_max_sync_delay_ms, CTLFLAG_RW | CTLFLAG_VNET,
124     &VNET_NAME(fib_max_sync_delay_ms), 0, "Maximum time to delay sync (ms)");
125 
126 
127 #ifdef INET6
128 VNET_DEFINE_STATIC(bool, algo_fixed_inet6) = false;
129 #define	V_algo_fixed_inet6	VNET(algo_fixed_inet6)
130 SYSCTL_NODE(_net_route_algo, OID_AUTO, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
131     "IPv6 longest prefix match lookups");
132 #endif
133 #ifdef INET
134 VNET_DEFINE_STATIC(bool, algo_fixed_inet) = false;
135 #define	V_algo_fixed_inet	VNET(algo_fixed_inet)
136 SYSCTL_NODE(_net_route_algo, OID_AUTO, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
137     "IPv4 longest prefix match lookups");
138 #endif
139 
140 /* Fib instance counter */
141 static uint32_t fib_gen = 0;
142 
143 struct nhop_ref_table {
144 	uint32_t		count;
145 	int32_t			refcnt[0];
146 };
147 
148 enum fib_callout_action {
149 	FDA_NONE,	/* No callout scheduled */
150 	FDA_REBUILD,	/* Asks to rebuild algo instance */
151 	FDA_EVAL,	/* Asks to evaluate if the current algo is still be best */
152 	FDA_BATCH,	/* Asks to submit batch of updates to the algo */
153 };
154 
155 struct fib_sync_status {
156 	struct timeval		diverge_time;	/* ts when diverged */
157 	uint32_t		num_changes;	/* number of changes since sync */
158 	uint32_t		bucket_changes;	/* num changes within the current bucket */
159 	uint64_t		bucket_id;	/* 50ms bucket # */
160 	struct fib_change_queue	fd_change_queue;/* list of scheduled entries */
161 };
162 
163 /*
164  * Data structure for the fib lookup instance tied to the particular rib.
165  */
166 struct fib_data {
167 	uint32_t		number_nhops;	/* current # of nhops */
168 	uint8_t			hit_nhops;	/* true if out of nhop limit */
169 	uint8_t			init_done;	/* true if init is competed */
170 	uint32_t		fd_dead:1;	/* Scheduled for deletion */
171 	uint32_t		fd_linked:1;	/* true if linked */
172 	uint32_t		fd_need_rebuild:1;	/* true if rebuild scheduled */
173 	uint32_t		fd_batch:1;	/* true if batched notification scheduled */
174 	uint8_t			fd_family;	/* family */
175 	uint32_t		fd_fibnum;	/* fibnum */
176 	uint32_t		fd_failed_rebuilds;	/* stat: failed rebuilds */
177 	uint32_t		fd_gen;		/* instance gen# */
178 	struct callout		fd_callout;	/* rebuild callout */
179 	enum fib_callout_action	fd_callout_action;	/* Callout action to take */
180 	void			*fd_algo_data;	/* algorithm data */
181 	struct nhop_object	**nh_idx;	/* nhop idx->ptr array */
182 	struct nhop_ref_table	*nh_ref_table;	/* array with # of nhop references */
183 	struct rib_head		*fd_rh;		/* RIB table we're attached to */
184 	struct rib_subscription	*fd_rs;		/* storing table subscription */
185 	struct fib_dp		fd_dp;		/* fib datapath data */
186 	struct vnet		*fd_vnet;	/* vnet fib belongs to */
187 	struct epoch_context	fd_epoch_ctx;	/* epoch context for deletion */
188 	struct fib_lookup_module	*fd_flm;/* pointer to the lookup module */
189 	struct fib_sync_status	fd_ss;		/* State relevant to the rib sync  */
190 	uint32_t		fd_num_changes;	/* number of changes since last callout */
191 	TAILQ_ENTRY(fib_data)	entries;	/* list of all fds in vnet */
192 };
193 
194 static bool rebuild_fd(struct fib_data *fd, const char *reason);
195 static bool rebuild_fd_flm(struct fib_data *fd, struct fib_lookup_module *flm_new);
196 static void handle_fd_callout(void *_data);
197 static void destroy_fd_instance_epoch(epoch_context_t ctx);
198 static bool is_idx_free(struct fib_data *fd, uint32_t index);
199 static void set_algo_fixed(struct rib_head *rh);
200 static bool is_algo_fixed(struct rib_head *rh);
201 
202 static uint32_t fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh);
203 static void fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh);
204 
205 static struct fib_lookup_module *fib_check_best_algo(struct rib_head *rh,
206     struct fib_lookup_module *orig_flm);
207 static void fib_unref_algo(struct fib_lookup_module *flm);
208 static bool flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum);
209 
210 struct mtx fib_mtx;
211 #define	FIB_MOD_LOCK()		mtx_lock(&fib_mtx)
212 #define	FIB_MOD_UNLOCK()	mtx_unlock(&fib_mtx)
213 #define	FIB_MOD_LOCK_ASSERT()	mtx_assert(&fib_mtx, MA_OWNED)
214 
215 MTX_SYSINIT(fib_mtx, &fib_mtx, "algo list mutex", MTX_DEF);
216 
217 /* Algorithm has to be this percent better than the current to switch */
218 #define	BEST_DIFF_PERCENT	(5 * 256 / 100)
219 /* Schedule algo re-evaluation X seconds after a change */
220 #define	ALGO_EVAL_DELAY_MS	30000
221 /* Force algo re-evaluation after X changes */
222 #define	ALGO_EVAL_NUM_ROUTES	100
223 /* Try to setup algorithm X times */
224 #define	FIB_MAX_TRIES		32
225 /* Max amount of supported nexthops */
226 #define	FIB_MAX_NHOPS		262144
227 #define	FIB_CALLOUT_DELAY_MS	50
228 
229 
230 /* Debug */
231 static int flm_debug_level = LOG_NOTICE;
232 SYSCTL_INT(_net_route_algo, OID_AUTO, debug_level, CTLFLAG_RW | CTLFLAG_RWTUN,
233     &flm_debug_level, 0, "debuglevel");
234 #define	FLM_MAX_DEBUG_LEVEL	LOG_DEBUG
235 #ifndef	LOG_DEBUG2
236 #define	LOG_DEBUG2	8
237 #endif
238 
239 #define	_PASS_MSG(_l)	(flm_debug_level >= (_l))
240 #define	ALGO_PRINTF(_l, _fmt, ...)	if (_PASS_MSG(_l)) {		\
241 	printf("[fib_algo] %s: " _fmt "\n", __func__, ##__VA_ARGS__);	\
242 }
243 #define	_ALGO_PRINTF(_fib, _fam, _aname, _gen, _func, _fmt, ...) \
244     printf("[fib_algo] %s.%u (%s#%u) %s: " _fmt "\n",\
245     print_family(_fam), _fib, _aname, _gen, _func, ## __VA_ARGS__)
246 #define	_RH_PRINTF(_fib, _fam, _func, _fmt, ...) \
247     printf("[fib_algo] %s.%u %s: " _fmt "\n", print_family(_fam), _fib, _func, ## __VA_ARGS__)
248 #define	RH_PRINTF(_l, _rh, _fmt, ...)	if (_PASS_MSG(_l)) {	\
249     _RH_PRINTF(_rh->rib_fibnum, _rh->rib_family, __func__, _fmt, ## __VA_ARGS__);\
250 }
251 #define	FD_PRINTF(_l, _fd, _fmt, ...)	FD_PRINTF_##_l(_l, _fd, _fmt, ## __VA_ARGS__)
252 #define	_FD_PRINTF(_l, _fd, _fmt, ...)	if (_PASS_MSG(_l)) {		\
253     _ALGO_PRINTF(_fd->fd_fibnum, _fd->fd_family, _fd->fd_flm->flm_name,	\
254     _fd->fd_gen, __func__, _fmt, ## __VA_ARGS__);			\
255 }
256 #if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG2
257 #define	FD_PRINTF_LOG_DEBUG2	_FD_PRINTF
258 #else
259 #define	FD_PRINTF_LOG_DEBUG2(_l, _fd, _fmt, ...)
260 #endif
261 #if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG
262 #define	FD_PRINTF_LOG_DEBUG	_FD_PRINTF
263 #else
264 #define	FD_PRINTF_LOG_DEBUG()
265 #endif
266 #if FLM_MAX_DEBUG_LEVEL>=LOG_INFO
267 #define	FD_PRINTF_LOG_INFO	_FD_PRINTF
268 #else
269 #define	FD_PRINTF_LOG_INFO()
270 #endif
271 #define	FD_PRINTF_LOG_NOTICE	_FD_PRINTF
272 #define	FD_PRINTF_LOG_ERR	_FD_PRINTF
273 #define	FD_PRINTF_LOG_WARNING	_FD_PRINTF
274 
275 
276 /* List of all registered lookup algorithms */
277 static TAILQ_HEAD(, fib_lookup_module) all_algo_list = TAILQ_HEAD_INITIALIZER(all_algo_list);
278 
279 /* List of all fib lookup instances in the vnet */
280 VNET_DEFINE_STATIC(TAILQ_HEAD(fib_data_head, fib_data), fib_data_list);
281 #define	V_fib_data_list	VNET(fib_data_list)
282 
283 /* Datastructure for storing non-transient fib lookup module failures */
284 struct fib_error {
285 	int				fe_family;
286 	uint32_t			fe_fibnum;	/* failed rtable */
287 	struct fib_lookup_module	*fe_flm;	/* failed module */
288 	TAILQ_ENTRY(fib_error)		entries;/* list of all errored entries */
289 };
290 VNET_DEFINE_STATIC(TAILQ_HEAD(fib_error_head, fib_error), fib_error_list);
291 #define	V_fib_error_list VNET(fib_error_list)
292 
293 /* Per-family array of fibnum -> {func, arg} mappings used in datapath */
294 struct fib_dp_header {
295 	struct epoch_context	fdh_epoch_ctx;
296 	uint32_t		fdh_num_tables;
297 	struct fib_dp		fdh_idx[0];
298 };
299 
300 /*
301  * Tries to add new non-transient algorithm error to the list of
302  *  errors.
303  * Returns true on success.
304  */
305 static bool
flm_error_add(struct fib_lookup_module * flm,uint32_t fibnum)306 flm_error_add(struct fib_lookup_module *flm, uint32_t fibnum)
307 {
308 	struct fib_error *fe;
309 
310 	fe = malloc(sizeof(struct fib_error), M_TEMP, M_NOWAIT | M_ZERO);
311 	if (fe == NULL)
312 		return (false);
313 	fe->fe_flm = flm;
314 	fe->fe_family = flm->flm_family;
315 	fe->fe_fibnum = fibnum;
316 
317 	FIB_MOD_LOCK();
318 	/* Avoid duplicates by checking if error already exists first */
319 	if (flm_error_check(flm, fibnum)) {
320 		FIB_MOD_UNLOCK();
321 		free(fe, M_TEMP);
322 		return (true);
323 	}
324 	TAILQ_INSERT_HEAD(&V_fib_error_list, fe, entries);
325 	FIB_MOD_UNLOCK();
326 
327 	return (true);
328 }
329 
330 /*
331  * True if non-transient error has been registered for @flm in @fibnum.
332  */
333 static bool
flm_error_check(const struct fib_lookup_module * flm,uint32_t fibnum)334 flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum)
335 {
336 	const struct fib_error *fe;
337 
338 	TAILQ_FOREACH(fe, &V_fib_error_list, entries) {
339 		if ((fe->fe_flm == flm) && (fe->fe_fibnum == fibnum))
340 			return (true);
341 	}
342 
343 	return (false);
344 }
345 
346 /*
347  * Clear all errors of algo specified by @flm.
348  */
349 static void
fib_error_clear_flm(struct fib_lookup_module * flm)350 fib_error_clear_flm(struct fib_lookup_module *flm)
351 {
352 	struct fib_error *fe, *fe_tmp;
353 
354 	FIB_MOD_LOCK_ASSERT();
355 
356 	TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
357 		if (fe->fe_flm == flm) {
358 			TAILQ_REMOVE(&V_fib_error_list, fe, entries);
359 			free(fe, M_TEMP);
360 		}
361 	}
362 }
363 
364 /*
365  * Clears all errors in current VNET.
366  */
367 static void
fib_error_clear(void)368 fib_error_clear(void)
369 {
370 	struct fib_error *fe, *fe_tmp;
371 
372 	FIB_MOD_LOCK_ASSERT();
373 
374 	TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
375 		TAILQ_REMOVE(&V_fib_error_list, fe, entries);
376 		free(fe, M_TEMP);
377 	}
378 }
379 
380 static const char *
print_op_result(enum flm_op_result result)381 print_op_result(enum flm_op_result result)
382 {
383 	switch (result) {
384 	case FLM_SUCCESS:
385 		return "success";
386 	case FLM_REBUILD:
387 		return "rebuild";
388 	case FLM_BATCH:
389 		return "batch";
390 	case FLM_ERROR:
391 		return "error";
392 	}
393 
394 	return "unknown";
395 }
396 
397 static const char *
print_family(int family)398 print_family(int family)
399 {
400 
401 	if (family == AF_INET)
402 		return ("inet");
403 	else if (family == AF_INET6)
404 		return ("inet6");
405 	else
406 		return ("unknown");
407 }
408 
409 /*
410  * Debug function used by lookup algorithms.
411  * Outputs message denoted by @fmt, prepended by "[fib_algo] inetX.Y (algo) "
412  */
413 void
fib_printf(int level,struct fib_data * fd,const char * func,char * fmt,...)414 fib_printf(int level, struct fib_data *fd, const char *func, char *fmt, ...)
415 {
416 	char buf[128];
417 	va_list ap;
418 
419 	if (level > flm_debug_level)
420 		return;
421 
422 	va_start(ap, fmt);
423 	vsnprintf(buf, sizeof(buf), fmt, ap);
424 	va_end(ap);
425 
426 	_ALGO_PRINTF(fd->fd_fibnum, fd->fd_family, fd->fd_flm->flm_name,
427 	    fd->fd_gen, func, "%s", buf);
428 }
429 
430 /*
431  * Outputs list of algorithms supported by the provided address family.
432  */
433 static int
print_algos_sysctl(struct sysctl_req * req,int family)434 print_algos_sysctl(struct sysctl_req *req, int family)
435 {
436 	struct fib_lookup_module *flm;
437 	struct sbuf sbuf;
438 	int error, count = 0;
439 
440 	error = sysctl_wire_old_buffer(req, 0);
441 	if (error == 0) {
442 		sbuf_new_for_sysctl(&sbuf, NULL, 512, req);
443 		TAILQ_FOREACH(flm, &all_algo_list, entries) {
444 			if (flm->flm_family == family) {
445 				if (count++ > 0)
446 					sbuf_cat(&sbuf, ", ");
447 				sbuf_cat(&sbuf, flm->flm_name);
448 			}
449 		}
450 		error = sbuf_finish(&sbuf);
451 		sbuf_delete(&sbuf);
452 	}
453 	return (error);
454 }
455 
456 #ifdef INET6
457 static int
print_algos_sysctl_inet6(SYSCTL_HANDLER_ARGS)458 print_algos_sysctl_inet6(SYSCTL_HANDLER_ARGS)
459 {
460 
461 	return (print_algos_sysctl(req, AF_INET6));
462 }
463 SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo_list,
464     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
465     print_algos_sysctl_inet6, "A", "List of IPv6 lookup algorithms");
466 #endif
467 
468 #ifdef INET
469 static int
print_algos_sysctl_inet(SYSCTL_HANDLER_ARGS)470 print_algos_sysctl_inet(SYSCTL_HANDLER_ARGS)
471 {
472 
473 	return (print_algos_sysctl(req, AF_INET));
474 }
475 SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo_list,
476     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
477     print_algos_sysctl_inet, "A", "List of IPv4 lookup algorithms");
478 #endif
479 
480 /*
481  * Calculate delay between repeated failures.
482  * Returns current delay in milliseconds.
483  */
484 static uint32_t
callout_calc_delay_ms(struct fib_data * fd)485 callout_calc_delay_ms(struct fib_data *fd)
486 {
487 	uint32_t shift;
488 
489 	if (fd->fd_failed_rebuilds > 10)
490 		shift = 10;
491 	else
492 		shift = fd->fd_failed_rebuilds;
493 
494 	return ((1 << shift) * FIB_CALLOUT_DELAY_MS);
495 }
496 
497 static void
schedule_callout(struct fib_data * fd,enum fib_callout_action action,int delay_ms)498 schedule_callout(struct fib_data *fd, enum fib_callout_action action, int delay_ms)
499 {
500 
501 	FD_PRINTF(LOG_DEBUG, fd, "delay=%d action=%d", delay_ms, action);
502 	fd->fd_callout_action = action;
503 	callout_reset_sbt(&fd->fd_callout, SBT_1MS * delay_ms, 0,
504 	    handle_fd_callout, fd, 0);
505 }
506 
507 static void
schedule_fd_rebuild(struct fib_data * fd,const char * reason)508 schedule_fd_rebuild(struct fib_data *fd, const char *reason)
509 {
510 
511 	RIB_WLOCK_ASSERT(fd->fd_rh);
512 
513 	if (!fd->fd_need_rebuild) {
514 		fd->fd_need_rebuild = true;
515 		/* Stop batch updates */
516 		fd->fd_batch = false;
517 
518 		/*
519 		 * Potentially re-schedules pending callout
520 		 *  initiated by schedule_algo_eval.
521 		 */
522 		FD_PRINTF(LOG_INFO, fd, "Scheduling rebuild: %s (failures=%d)",
523 		    reason, fd->fd_failed_rebuilds);
524 		schedule_callout(fd, FDA_REBUILD, callout_calc_delay_ms(fd));
525 	}
526 }
527 
528 static void
sync_rib_gen(struct fib_data * fd)529 sync_rib_gen(struct fib_data *fd)
530 {
531 	FD_PRINTF(LOG_DEBUG, fd, "Sync gen %u -> %u", fd->fd_rh->rnh_gen, fd->fd_rh->rnh_gen_rib);
532 	fd->fd_rh->rnh_gen = fd->fd_rh->rnh_gen_rib;
533 }
534 
535 static int64_t
get_tv_diff_ms(const struct timeval * old_tv,const struct timeval * new_tv)536 get_tv_diff_ms(const struct timeval *old_tv, const struct timeval *new_tv)
537 {
538 	int64_t diff = 0;
539 
540 	diff = ((int64_t)(new_tv->tv_sec - old_tv->tv_sec)) * 1000;
541 	diff += (new_tv->tv_usec - old_tv->tv_usec) / 1000;
542 
543 	return (diff);
544 }
545 
546 static void
add_tv_diff_ms(struct timeval * tv,int ms)547 add_tv_diff_ms(struct timeval *tv, int ms)
548 {
549 	tv->tv_sec += ms / 1000;
550 	ms = ms % 1000;
551 	if (ms * 1000 + tv->tv_usec < 1000000)
552 		tv->tv_usec += ms * 1000;
553 	else {
554 		tv->tv_sec += 1;
555 		tv->tv_usec = ms * 1000 + tv->tv_usec - 1000000;
556 	}
557 }
558 
559 /*
560  * Marks the time when algo state diverges from the rib state.
561  */
562 static void
mark_diverge_time(struct fib_data * fd)563 mark_diverge_time(struct fib_data *fd)
564 {
565 	struct fib_sync_status *fd_ss = &fd->fd_ss;
566 
567 	getmicrouptime(&fd_ss->diverge_time);
568 	fd_ss->bucket_id = 0;
569 	fd_ss->bucket_changes = 0;
570 }
571 
572 /*
573  * Calculates and updates the next algorithm sync time, based on the current activity.
574  *
575  * The intent is to provide reasonable balance between the update
576  *  latency and efficient batching when changing large amount of routes.
577  *
578  * High-level algorithm looks the following:
579  * 1) all changes are bucketed in 50ms intervals
580  * 2) If amount of changes within the bucket is greater than the threshold,
581  *   the update gets delayed, up to maximum delay threshold.
582  */
583 static void
update_rebuild_delay(struct fib_data * fd,enum fib_callout_action action)584 update_rebuild_delay(struct fib_data *fd, enum fib_callout_action action)
585 {
586 	uint32_t bucket_id, new_delay = 0;
587 	struct timeval tv;
588 
589 	/* Fetch all variables at once to ensure consistent reads */
590 	uint32_t bucket_time_ms = V_update_bucket_time_ms;
591 	uint32_t threshold_rate = V_bucket_change_threshold_rate;
592 	uint32_t max_delay_ms = V_fib_max_sync_delay_ms;
593 
594 	if (bucket_time_ms == 0)
595 		bucket_time_ms = 50;
596 	/* calculate per-bucket threshold rate */
597 	threshold_rate = threshold_rate * bucket_time_ms / 1000;
598 
599 	getmicrouptime(&tv);
600 
601 	struct fib_sync_status *fd_ss = &fd->fd_ss;
602 
603 	bucket_id = get_tv_diff_ms(&fd_ss->diverge_time, &tv) / bucket_time_ms;
604 
605 	if (fd_ss->bucket_id == bucket_id) {
606 		fd_ss->bucket_changes++;
607 		if (fd_ss->bucket_changes == threshold_rate) {
608 			new_delay = (bucket_id + 2) * bucket_time_ms;
609 			if (new_delay <= max_delay_ms) {
610 				FD_PRINTF(LOG_DEBUG, fd,
611 				    "hit threshold of %u routes, delay update,"
612 				    "bucket: %u, total delay: %u",
613 				    threshold_rate, bucket_id + 1, new_delay);
614 			} else {
615 				new_delay = 0;
616 				FD_PRINTF(LOG_DEBUG, fd,
617 				    "maximum sync delay (%u ms) reached", max_delay_ms);
618 			}
619 		} else if ((bucket_id == 0) && (fd_ss->bucket_changes == 1))
620 			new_delay = bucket_time_ms;
621 	} else {
622 		fd_ss->bucket_id = bucket_id;
623 		fd_ss->bucket_changes = 1;
624 	}
625 
626 	if (new_delay > 0) {
627 		/* Calculated time has been updated */
628 		struct timeval new_tv = fd_ss->diverge_time;
629 		add_tv_diff_ms(&new_tv, new_delay);
630 
631 		int32_t delay_ms = get_tv_diff_ms(&tv, &new_tv);
632 		schedule_callout(fd, action, delay_ms);
633 	}
634 }
635 
636 static void
update_algo_state(struct fib_data * fd)637 update_algo_state(struct fib_data *fd)
638 {
639 
640 	RIB_WLOCK_ASSERT(fd->fd_rh);
641 
642 	if (fd->fd_batch || fd->fd_need_rebuild) {
643 		enum fib_callout_action action = fd->fd_need_rebuild ? FDA_REBUILD : FDA_BATCH;
644 		update_rebuild_delay(fd, action);
645 		return;
646 	}
647 
648 	if (fd->fd_num_changes++ == 0) {
649 		/* Start callout to consider switch */
650 		if (!callout_pending(&fd->fd_callout))
651 			schedule_callout(fd, FDA_EVAL, ALGO_EVAL_DELAY_MS);
652 	} else if (fd->fd_num_changes == ALGO_EVAL_NUM_ROUTES) {
653 		/* Reset callout to exec immediately */
654 		if (fd->fd_callout_action == FDA_EVAL)
655 			schedule_callout(fd, FDA_EVAL, 1);
656 	}
657 }
658 
659 static bool
need_immediate_sync(struct fib_data * fd,struct rib_cmd_info * rc)660 need_immediate_sync(struct fib_data *fd, struct rib_cmd_info *rc)
661 {
662 	struct nhop_object *nh;
663 
664 	/* Sync addition/removal of interface routes */
665 	switch (rc->rc_cmd) {
666 	case RTM_ADD:
667 		nh = rc->rc_nh_new;
668 		if (!NH_IS_NHGRP(nh)) {
669 			if (!(nh->nh_flags & NHF_GATEWAY))
670 				return (true);
671 			if (nhop_get_rtflags(nh) & RTF_STATIC)
672 				return (true);
673 		}
674 		break;
675 	case RTM_DELETE:
676 		nh = rc->rc_nh_old;
677 		if (!NH_IS_NHGRP(nh)) {
678 			if (!(nh->nh_flags & NHF_GATEWAY))
679 				return (true);
680 			if (nhop_get_rtflags(nh) & RTF_STATIC)
681 				return (true);
682 		}
683 		break;
684 	}
685 
686 	return (false);
687 }
688 
689 static bool
apply_rtable_changes(struct fib_data * fd)690 apply_rtable_changes(struct fib_data *fd)
691 {
692 	enum flm_op_result result;
693 	struct fib_change_queue *q = &fd->fd_ss.fd_change_queue;
694 
695 	result = fd->fd_flm->flm_change_rib_items_cb(fd->fd_rh, q, fd->fd_algo_data);
696 
697 	if (result == FLM_SUCCESS) {
698 		sync_rib_gen(fd);
699 		for (int i = 0; i < q->count; i++)
700 			if (q->entries[i].nh_old)
701 				fib_unref_nhop(fd, q->entries[i].nh_old);
702 		q->count = 0;
703 	}
704 	fd->fd_batch = false;
705 
706 	return (result == FLM_SUCCESS);
707 }
708 
709 static bool
fill_change_entry(struct fib_data * fd,struct fib_change_entry * ce,struct rib_cmd_info * rc)710 fill_change_entry(struct fib_data *fd, struct fib_change_entry *ce, struct rib_cmd_info *rc)
711 {
712 	int plen = 0;
713 
714 	switch (fd->fd_family) {
715 #ifdef INET
716 	case AF_INET:
717 		rt_get_inet_prefix_plen(rc->rc_rt, &ce->addr4, &plen, &ce->scopeid);
718 		break;
719 #endif
720 #ifdef INET6
721 	case AF_INET6:
722 		rt_get_inet6_prefix_plen(rc->rc_rt, &ce->addr6, &plen, &ce->scopeid);
723 		break;
724 #endif
725 	}
726 
727 	ce->plen = plen;
728 	ce->nh_old = rc->rc_nh_old;
729 	ce->nh_new = rc->rc_nh_new;
730 	if (ce->nh_new != NULL) {
731 		if (fib_ref_nhop(fd, ce->nh_new) == 0)
732 			return (false);
733 	}
734 
735 	return (true);
736 }
737 
738 static bool
queue_rtable_change(struct fib_data * fd,struct rib_cmd_info * rc)739 queue_rtable_change(struct fib_data *fd, struct rib_cmd_info *rc)
740 {
741 	struct fib_change_queue *q = &fd->fd_ss.fd_change_queue;
742 
743 	if (q->count >= q->size) {
744 		uint32_t q_size;
745 
746 		if (q->size == 0)
747 			q_size = 256; /* ~18k memory */
748 		else
749 			q_size = q->size * 2;
750 
751 		size_t size = q_size * sizeof(struct fib_change_entry);
752 		void *a = realloc(q->entries, size, M_TEMP, M_NOWAIT | M_ZERO);
753 		if (a == NULL) {
754 			FD_PRINTF(LOG_INFO, fd, "Unable to realloc queue for %u elements",
755 			    q_size);
756 			return (false);
757 		}
758 		q->entries = a;
759 		q->size = q_size;
760 	}
761 
762 	return (fill_change_entry(fd, &q->entries[q->count++], rc));
763 }
764 
765 /*
766  * Rib subscription handler. Checks if the algorithm is ready to
767  *  receive updates, handles nexthop refcounting and passes change
768  *  data to the algorithm callback.
769  */
770 static void
handle_rtable_change_cb(struct rib_head * rnh,struct rib_cmd_info * rc,void * _data)771 handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
772     void *_data)
773 {
774 	struct fib_data *fd = (struct fib_data *)_data;
775 	enum flm_op_result result;
776 
777 	RIB_WLOCK_ASSERT(rnh);
778 
779 	/*
780 	 * There is a small gap between subscribing for route changes
781 	 *  and initiating rtable dump. Avoid receiving route changes
782 	 *  prior to finishing rtable dump by checking `init_done`.
783 	 */
784 	if (!fd->init_done)
785 		return;
786 
787 	bool immediate_sync = need_immediate_sync(fd, rc);
788 
789 	/* Consider scheduling algorithm re-evaluation */
790 	update_algo_state(fd);
791 
792 	/*
793 	 * If algo requested rebuild, stop sending updates by default.
794 	 * This simplifies nexthop refcount handling logic.
795 	 */
796 	if (fd->fd_need_rebuild) {
797 		if (immediate_sync)
798 			rebuild_fd(fd, "rtable change type enforced sync");
799 		return;
800 	}
801 
802 	/*
803 	 * Algo requested updates to be delivered in batches.
804 	 * Add the current change to the queue and return.
805 	 */
806 	if (fd->fd_batch) {
807 		if (immediate_sync) {
808 			if (!queue_rtable_change(fd, rc) || !apply_rtable_changes(fd))
809 				rebuild_fd(fd, "batch sync failed");
810 		} else {
811 			if (!queue_rtable_change(fd, rc))
812 				schedule_fd_rebuild(fd, "batch queue failed");
813 		}
814 		return;
815 	}
816 
817 	/*
818 	 * Maintain guarantee that every nexthop returned by the dataplane
819 	 *  lookup has > 0 refcount, so can be safely referenced within current
820 	 *  epoch.
821 	 */
822 	if (rc->rc_nh_new != NULL) {
823 		if (fib_ref_nhop(fd, rc->rc_nh_new) == 0) {
824 			/* ran out of indexes */
825 			schedule_fd_rebuild(fd, "ran out of nhop indexes");
826 			return;
827 		}
828 	}
829 
830 	result = fd->fd_flm->flm_change_rib_item_cb(rnh, rc, fd->fd_algo_data);
831 
832 	switch (result) {
833 	case FLM_SUCCESS:
834 		sync_rib_gen(fd);
835 		/* Unref old nexthop on success */
836 		if (rc->rc_nh_old != NULL)
837 			fib_unref_nhop(fd, rc->rc_nh_old);
838 		break;
839 	case FLM_BATCH:
840 
841 		/*
842 		 * Algo asks to batch the changes.
843 		 */
844 		if (queue_rtable_change(fd, rc)) {
845 			if (!immediate_sync) {
846 				fd->fd_batch = true;
847 				mark_diverge_time(fd);
848 				update_rebuild_delay(fd, FDA_BATCH);
849 				break;
850 			}
851 			if (apply_rtable_changes(fd))
852 				break;
853 		}
854 		FD_PRINTF(LOG_ERR, fd, "batched sync failed, force the rebuild");
855 
856 	case FLM_REBUILD:
857 
858 		/*
859 		 * Algo is not able to apply the update.
860 		 * Schedule algo rebuild.
861 		 */
862 		if (!immediate_sync) {
863 			mark_diverge_time(fd);
864 			schedule_fd_rebuild(fd, "algo requested rebuild");
865 			break;
866 		}
867 
868 		FD_PRINTF(LOG_INFO, fd, "running sync rebuild");
869 		rebuild_fd(fd, "rtable change type enforced sync");
870 		break;
871 	case FLM_ERROR:
872 
873 		/*
874 		 * Algo reported a non-recoverable error.
875 		 * Record the error and schedule rebuild, which will
876 		 *  trigger best algo selection.
877 		 */
878 		FD_PRINTF(LOG_ERR, fd, "algo reported non-recoverable error");
879 		if (!flm_error_add(fd->fd_flm, fd->fd_fibnum))
880 			FD_PRINTF(LOG_ERR, fd, "failed to ban algo");
881 		schedule_fd_rebuild(fd, "algo reported non-recoverable error");
882 	}
883 }
884 
885 static void
estimate_nhop_scale(const struct fib_data * old_fd,struct fib_data * fd)886 estimate_nhop_scale(const struct fib_data *old_fd, struct fib_data *fd)
887 {
888 
889 	if (old_fd == NULL) {
890 		// TODO: read from rtable
891 		fd->number_nhops = 16;
892 		return;
893 	}
894 
895 	if (old_fd->hit_nhops && old_fd->number_nhops < FIB_MAX_NHOPS)
896 		fd->number_nhops = 2 * old_fd->number_nhops;
897 	else
898 		fd->number_nhops = old_fd->number_nhops;
899 }
900 
901 struct walk_cbdata {
902 	struct fib_data		*fd;
903 	flm_dump_t		*func;
904 	enum flm_op_result	result;
905 };
906 
907 /*
908  * Handler called after all rtenties have been dumped.
909  * Performs post-dump framework checks and calls
910  * algo:flm_dump_end_cb().
911  *
912  * Updates walk_cbdata result.
913  */
914 static void
sync_algo_end_cb(struct rib_head * rnh,enum rib_walk_hook stage,void * _data)915 sync_algo_end_cb(struct rib_head *rnh, enum rib_walk_hook stage, void *_data)
916 {
917 	struct walk_cbdata *w = (struct walk_cbdata *)_data;
918 	struct fib_data *fd = w->fd;
919 
920 	RIB_WLOCK_ASSERT(w->fd->fd_rh);
921 
922 	if (rnh->rib_dying) {
923 		w->result = FLM_ERROR;
924 		return;
925 	}
926 
927 	if (fd->hit_nhops) {
928 		FD_PRINTF(LOG_INFO, fd, "ran out of nexthops at %u nhops",
929 		    fd->nh_ref_table->count);
930 		if (w->result == FLM_SUCCESS)
931 			w->result = FLM_REBUILD;
932 		return;
933 	}
934 
935 	if (stage != RIB_WALK_HOOK_POST || w->result != FLM_SUCCESS)
936 		return;
937 
938 	/* Post-dump hook, dump successful */
939 	w->result = fd->fd_flm->flm_dump_end_cb(fd->fd_algo_data, &fd->fd_dp);
940 
941 	if (w->result == FLM_SUCCESS) {
942 		/* Mark init as done to allow routing updates */
943 		fd->init_done = 1;
944 	}
945 }
946 
947 /*
948  * Callback for each entry in rib.
949  * Calls algo:flm_dump_rib_item_cb func as a part of initial
950  *  route table synchronisation.
951  */
952 static int
sync_algo_cb(struct rtentry * rt,void * _data)953 sync_algo_cb(struct rtentry *rt, void *_data)
954 {
955 	struct walk_cbdata *w = (struct walk_cbdata *)_data;
956 
957 	RIB_WLOCK_ASSERT(w->fd->fd_rh);
958 
959 	if (w->result == FLM_SUCCESS && w->func) {
960 
961 		/*
962 		 * Reference nexthops to maintain guarantee that
963 		 *  each nexthop returned by datapath has > 0 references
964 		 *  and can be safely referenced within current epoch.
965 		 */
966 		struct nhop_object *nh = rt_get_raw_nhop(rt);
967 		if (fib_ref_nhop(w->fd, nh) != 0)
968 			w->result = w->func(rt, w->fd->fd_algo_data);
969 		else
970 			w->result = FLM_REBUILD;
971 	}
972 
973 	return (0);
974 }
975 
976 /*
977  * Dump all routing table state to the algo instance.
978  */
979 static enum flm_op_result
sync_algo(struct fib_data * fd)980 sync_algo(struct fib_data *fd)
981 {
982 	struct walk_cbdata w = {
983 		.fd = fd,
984 		.func = fd->fd_flm->flm_dump_rib_item_cb,
985 		.result = FLM_SUCCESS,
986 	};
987 
988 	rib_walk_ext_locked(fd->fd_rh, sync_algo_cb, sync_algo_end_cb, &w);
989 
990 	FD_PRINTF(LOG_INFO, fd,
991 	    "initial dump completed (rtable version: %d), result: %s",
992 	    fd->fd_rh->rnh_gen, print_op_result(w.result));
993 
994 	return (w.result);
995 }
996 
997 /*
998  * Schedules epoch-backed @fd instance deletion.
999  * * Unlinks @fd from the list of active algo instances.
1000  * * Removes rib subscription.
1001  * * Stops callout.
1002  * * Schedules actual deletion.
1003  *
1004  * Assume @fd is already unlinked from the datapath.
1005  */
1006 static int
schedule_destroy_fd_instance(struct fib_data * fd,bool in_callout)1007 schedule_destroy_fd_instance(struct fib_data *fd, bool in_callout)
1008 {
1009 	bool is_dead;
1010 
1011 	NET_EPOCH_ASSERT();
1012 	RIB_WLOCK_ASSERT(fd->fd_rh);
1013 
1014 	FIB_MOD_LOCK();
1015 	is_dead = fd->fd_dead;
1016 	if (!is_dead)
1017 		fd->fd_dead = true;
1018 	if (fd->fd_linked) {
1019 		TAILQ_REMOVE(&V_fib_data_list, fd, entries);
1020 		fd->fd_linked = false;
1021 	}
1022 	FIB_MOD_UNLOCK();
1023 	if (is_dead)
1024 		return (0);
1025 
1026 	FD_PRINTF(LOG_INFO, fd, "DETACH");
1027 
1028 	if (fd->fd_rs != NULL)
1029 		rib_unsubscribe_locked(fd->fd_rs);
1030 
1031 	/*
1032 	 * After rib_unsubscribe() no _new_ handle_rtable_change_cb() calls
1033 	 * will be executed, hence no _new_ callout schedules will happen.
1034 	 */
1035 	callout_stop(&fd->fd_callout);
1036 
1037 	fib_epoch_call(destroy_fd_instance_epoch, &fd->fd_epoch_ctx);
1038 
1039 	return (0);
1040 }
1041 
1042 /*
1043  * Wipe all fd instances from the list matching rib specified by @rh.
1044  * If @keep_first is set, remove all but the first record.
1045  */
1046 static void
fib_cleanup_algo(struct rib_head * rh,bool keep_first,bool in_callout)1047 fib_cleanup_algo(struct rib_head *rh, bool keep_first, bool in_callout)
1048 {
1049 	struct fib_data_head tmp_head = TAILQ_HEAD_INITIALIZER(tmp_head);
1050 	struct fib_data *fd, *fd_tmp;
1051 	struct epoch_tracker et;
1052 
1053 	FIB_MOD_LOCK();
1054 	TAILQ_FOREACH_SAFE(fd, &V_fib_data_list, entries, fd_tmp) {
1055 		if (fd->fd_rh == rh) {
1056 			if (keep_first) {
1057 				keep_first = false;
1058 				continue;
1059 			}
1060 			TAILQ_REMOVE(&V_fib_data_list, fd, entries);
1061 			fd->fd_linked = false;
1062 			TAILQ_INSERT_TAIL(&tmp_head, fd, entries);
1063 		}
1064 	}
1065 	FIB_MOD_UNLOCK();
1066 
1067 	/* Pass 2: remove each entry */
1068 	NET_EPOCH_ENTER(et);
1069 	TAILQ_FOREACH_SAFE(fd, &tmp_head, entries, fd_tmp) {
1070 		if (!in_callout)
1071 			RIB_WLOCK(fd->fd_rh);
1072 		schedule_destroy_fd_instance(fd, in_callout);
1073 		if (!in_callout)
1074 			RIB_WUNLOCK(fd->fd_rh);
1075 	}
1076 	NET_EPOCH_EXIT(et);
1077 }
1078 
1079 void
fib_destroy_rib(struct rib_head * rh)1080 fib_destroy_rib(struct rib_head *rh)
1081 {
1082 
1083 	/*
1084 	 * rnh has `is_dying` flag set, so setup of new fd's will fail at
1085 	 *  sync_algo() stage, preventing new entries to be added to the list
1086 	 *  of active algos. Remove all existing entries for the particular rib.
1087 	 */
1088 	fib_cleanup_algo(rh, false, false);
1089 }
1090 
1091 /*
1092  * Finalises fd destruction by freeing all fd resources.
1093  */
1094 static void
destroy_fd_instance(struct fib_data * fd)1095 destroy_fd_instance(struct fib_data *fd)
1096 {
1097 
1098 	FD_PRINTF(LOG_INFO, fd, "destroy fd %p", fd);
1099 
1100 	/* Call destroy callback first */
1101 	if (fd->fd_algo_data != NULL)
1102 		fd->fd_flm->flm_destroy_cb(fd->fd_algo_data);
1103 
1104 	/* Nhop table */
1105 	if ((fd->nh_idx != NULL) && (fd->nh_ref_table != NULL)) {
1106 		for (int i = 0; i < fd->number_nhops; i++) {
1107 			if (!is_idx_free(fd, i)) {
1108 				FD_PRINTF(LOG_DEBUG2, fd, " FREE nhop %d %p",
1109 				    i, fd->nh_idx[i]);
1110 				nhop_free_any(fd->nh_idx[i]);
1111 			}
1112 		}
1113 		free(fd->nh_idx, M_RTABLE);
1114 	}
1115 	if (fd->nh_ref_table != NULL)
1116 		free(fd->nh_ref_table, M_RTABLE);
1117 
1118 	if (fd->fd_ss.fd_change_queue.entries != NULL)
1119 		free(fd->fd_ss.fd_change_queue.entries, M_TEMP);
1120 
1121 	fib_unref_algo(fd->fd_flm);
1122 
1123 	free(fd, M_RTABLE);
1124 }
1125 
1126 /*
1127  * Epoch callback indicating fd is safe to destroy
1128  */
1129 static void
destroy_fd_instance_epoch(epoch_context_t ctx)1130 destroy_fd_instance_epoch(epoch_context_t ctx)
1131 {
1132 	struct fib_data *fd;
1133 
1134 	fd = __containerof(ctx, struct fib_data, fd_epoch_ctx);
1135 
1136 	CURVNET_SET(fd->fd_vnet);
1137 	destroy_fd_instance(fd);
1138 	CURVNET_RESTORE();
1139 }
1140 
1141 /*
1142  * Tries to setup fd instance.
1143  * - Allocates fd/nhop table
1144  * - Runs algo:flm_init_cb algo init
1145  * - Subscribes fd to the rib
1146  * - Runs rtable dump
1147  * - Adds instance to the list of active instances.
1148  *
1149  * Returns: operation result. Fills in @pfd with resulting fd on success.
1150  *
1151  */
1152 static enum flm_op_result
try_setup_fd_instance(struct fib_lookup_module * flm,struct rib_head * rh,struct fib_data * old_fd,struct fib_data ** pfd)1153 try_setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
1154     struct fib_data *old_fd, struct fib_data **pfd)
1155 {
1156 	struct fib_data *fd;
1157 	size_t size;
1158 	enum flm_op_result result;
1159 
1160 	/* Allocate */
1161 	fd = malloc(sizeof(struct fib_data), M_RTABLE, M_NOWAIT | M_ZERO);
1162 	if (fd == NULL)  {
1163 		*pfd = NULL;
1164 		RH_PRINTF(LOG_INFO, rh, "Unable to allocate fib_data structure");
1165 		return (FLM_REBUILD);
1166 	}
1167 	*pfd = fd;
1168 
1169 	estimate_nhop_scale(old_fd, fd);
1170 
1171 	fd->fd_rh = rh;
1172 	fd->fd_family = rh->rib_family;
1173 	fd->fd_fibnum = rh->rib_fibnum;
1174 	callout_init_rm(&fd->fd_callout, &rh->rib_lock, 0);
1175 	fd->fd_vnet = curvnet;
1176 	fd->fd_flm = flm;
1177 
1178 	FIB_MOD_LOCK();
1179 	flm->flm_refcount++;
1180 	fd->fd_gen = ++fib_gen;
1181 	FIB_MOD_UNLOCK();
1182 
1183 	FD_PRINTF(LOG_DEBUG, fd, "allocated fd %p", fd);
1184 
1185 	/* Allocate nhidx -> nhop_ptr table */
1186 	size = fd->number_nhops * sizeof(void *);
1187 	fd->nh_idx = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
1188 	if (fd->nh_idx == NULL) {
1189 		FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop table idx (sz:%zu)", size);
1190 		return (FLM_REBUILD);
1191 	}
1192 
1193 	/* Allocate nhop index refcount table */
1194 	size = sizeof(struct nhop_ref_table);
1195 	size += fd->number_nhops * sizeof(uint32_t);
1196 	fd->nh_ref_table = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
1197 	if (fd->nh_ref_table == NULL) {
1198 		FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop refcount table (sz:%zu)", size);
1199 		return (FLM_REBUILD);
1200 	}
1201 	FD_PRINTF(LOG_DEBUG, fd, "Allocated %u nhop indexes", fd->number_nhops);
1202 
1203 	/* Okay, we're ready for algo init */
1204 	void *old_algo_data = (old_fd != NULL) ? old_fd->fd_algo_data : NULL;
1205 	result = flm->flm_init_cb(fd->fd_fibnum, fd, old_algo_data, &fd->fd_algo_data);
1206 	if (result != FLM_SUCCESS) {
1207 		FD_PRINTF(LOG_INFO, fd, "%s algo init failed", flm->flm_name);
1208 		return (result);
1209 	}
1210 
1211 	/* Try to subscribe */
1212 	if (flm->flm_change_rib_item_cb != NULL) {
1213 		fd->fd_rs = rib_subscribe_locked(fd->fd_rh,
1214 		    handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE);
1215 		if (fd->fd_rs == NULL) {
1216 			FD_PRINTF(LOG_INFO, fd, "failed to subscribe to the rib changes");
1217 			return (FLM_REBUILD);
1218 		}
1219 	}
1220 
1221 	/* Dump */
1222 	result = sync_algo(fd);
1223 	if (result != FLM_SUCCESS) {
1224 		FD_PRINTF(LOG_INFO, fd, "rib sync failed");
1225 		return (result);
1226 	}
1227 	FD_PRINTF(LOG_INFO, fd, "DUMP completed successfully.");
1228 
1229 	FIB_MOD_LOCK();
1230 	/*
1231 	 * Insert fd in the beginning of a list, to maintain invariant
1232 	 *  that first matching entry for the AF/fib is always the active
1233 	 *  one.
1234 	 */
1235 	TAILQ_INSERT_HEAD(&V_fib_data_list, fd, entries);
1236 	fd->fd_linked = true;
1237 	FIB_MOD_UNLOCK();
1238 
1239 	return (FLM_SUCCESS);
1240 }
1241 
1242 /*
1243  * Sets up algo @flm for table @rh and links it to the datapath.
1244  *
1245  */
1246 static enum flm_op_result
setup_fd_instance(struct fib_lookup_module * flm,struct rib_head * rh,struct fib_data * orig_fd,struct fib_data ** pfd,bool attach)1247 setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
1248     struct fib_data *orig_fd, struct fib_data **pfd, bool attach)
1249 {
1250 	struct fib_data *prev_fd, *new_fd;
1251 	enum flm_op_result result;
1252 
1253 	NET_EPOCH_ASSERT();
1254 	RIB_WLOCK_ASSERT(rh);
1255 
1256 	prev_fd = orig_fd;
1257 	new_fd = NULL;
1258 	for (int i = 0; i < FIB_MAX_TRIES; i++) {
1259 		result = try_setup_fd_instance(flm, rh, prev_fd, &new_fd);
1260 
1261 		if ((result == FLM_SUCCESS) && attach) {
1262 			if (fib_set_datapath_ptr(new_fd, &new_fd->fd_dp))
1263 				sync_rib_gen(new_fd);
1264 			else
1265 				result = FLM_REBUILD;
1266 		}
1267 
1268 		if ((prev_fd != NULL) && (prev_fd != orig_fd)) {
1269 			schedule_destroy_fd_instance(prev_fd, false);
1270 			prev_fd = NULL;
1271 		}
1272 
1273 		RH_PRINTF(LOG_INFO, rh, "try %d: fib algo result: %s", i,
1274 		    print_op_result(result));
1275 
1276 		if (result == FLM_REBUILD) {
1277 			prev_fd = new_fd;
1278 			new_fd = NULL;
1279 			continue;
1280 		}
1281 
1282 		break;
1283 	}
1284 
1285 	if (result != FLM_SUCCESS) {
1286 		RH_PRINTF(LOG_WARNING, rh,
1287 		    "%s algo instance setup failed, failures=%d", flm->flm_name,
1288 		    orig_fd ? orig_fd->fd_failed_rebuilds + 1 : 0);
1289 		/* update failure count */
1290 		FIB_MOD_LOCK();
1291 		if (orig_fd != NULL)
1292 			orig_fd->fd_failed_rebuilds++;
1293 		FIB_MOD_UNLOCK();
1294 
1295 		/* Ban algo on non-recoverable error */
1296 		if (result == FLM_ERROR)
1297 			flm_error_add(flm, rh->rib_fibnum);
1298 
1299 		if ((prev_fd != NULL) && (prev_fd != orig_fd))
1300 			schedule_destroy_fd_instance(prev_fd, false);
1301 		if (new_fd != NULL) {
1302 			schedule_destroy_fd_instance(new_fd, false);
1303 			new_fd = NULL;
1304 		}
1305 	}
1306 
1307 	*pfd = new_fd;
1308 	return (result);
1309 }
1310 
1311 /*
1312  * Tries to sync algo with the current rtable state, either
1313  * by executing batch update or rebuilding.
1314  * Returns true on success.
1315  */
1316 static bool
execute_callout_action(struct fib_data * fd)1317 execute_callout_action(struct fib_data *fd)
1318 {
1319 	enum fib_callout_action action = fd->fd_callout_action;
1320 	struct fib_lookup_module *flm_new = NULL;
1321 	bool result = true;
1322 
1323 	NET_EPOCH_ASSERT();
1324 	RIB_WLOCK_ASSERT(fd->fd_rh);
1325 
1326 	fd->fd_need_rebuild = false;
1327 	fd->fd_batch = false;
1328 	fd->fd_num_changes = 0;
1329 
1330 	/* First, check if we're still OK to use this algo */
1331 	if (!is_algo_fixed(fd->fd_rh))
1332 		flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm);
1333 	if (flm_new != NULL)
1334 		action = FDA_REBUILD;
1335 
1336 	if (action == FDA_BATCH) {
1337 		/* Try to sync */
1338 		if (!apply_rtable_changes(fd))
1339 			action = FDA_REBUILD;
1340 	}
1341 
1342 	if (action == FDA_REBUILD)
1343 		result = rebuild_fd_flm(fd, flm_new != NULL ? flm_new : fd->fd_flm);
1344 	if (flm_new != NULL)
1345 		fib_unref_algo(flm_new);
1346 
1347 	return (result);
1348 }
1349 
1350 /*
1351  * Callout for all scheduled fd-related work.
1352  * - Checks if the current algo is still the best algo
1353  * - Synchronises algo instance to the rtable (batch usecase)
1354  * - Creates a new instance of an algo for af/fib if desired.
1355  */
1356 static void
handle_fd_callout(void * _data)1357 handle_fd_callout(void *_data)
1358 {
1359 	struct fib_data *fd = (struct fib_data *)_data;
1360 	struct epoch_tracker et;
1361 
1362 	FD_PRINTF(LOG_INFO, fd, "running callout type=%d", fd->fd_callout_action);
1363 
1364 	NET_EPOCH_ENTER(et);
1365 	CURVNET_SET(fd->fd_vnet);
1366 	execute_callout_action(fd);
1367 	CURVNET_RESTORE();
1368 	NET_EPOCH_EXIT(et);
1369 }
1370 
1371 /*
1372  * Tries to create new algo instance based on @fd data.
1373  * Returns true on success.
1374  */
1375 static bool
rebuild_fd_flm(struct fib_data * fd,struct fib_lookup_module * flm_new)1376 rebuild_fd_flm(struct fib_data *fd, struct fib_lookup_module *flm_new)
1377 {
1378 	struct fib_data *fd_new, *fd_tmp = NULL;
1379 	bool result;
1380 
1381 	if (flm_new == fd->fd_flm)
1382 		fd_tmp = fd;
1383 	else
1384 		FD_PRINTF(LOG_INFO, fd, "switching algo to %s", flm_new->flm_name);
1385 
1386 	result = setup_fd_instance(flm_new, fd->fd_rh, fd_tmp, &fd_new, true);
1387 	if (result != FLM_SUCCESS) {
1388 		FD_PRINTF(LOG_NOTICE, fd, "table rebuild failed");
1389 		return (false);
1390 	}
1391 	FD_PRINTF(LOG_INFO, fd_new, "switched to new instance");
1392 
1393 	/* Remove old instance */
1394 	schedule_destroy_fd_instance(fd, true);
1395 
1396 	return (true);
1397 }
1398 
1399 static bool
rebuild_fd(struct fib_data * fd,const char * reason)1400 rebuild_fd(struct fib_data *fd, const char *reason)
1401 {
1402 	struct fib_lookup_module *flm_new = NULL;
1403 	bool result;
1404 
1405 	if (!is_algo_fixed(fd->fd_rh))
1406 		flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm);
1407 
1408 	FD_PRINTF(LOG_INFO, fd, "running sync rebuild: %s", reason);
1409 	result = rebuild_fd_flm(fd, flm_new != NULL ? flm_new : fd->fd_flm);
1410 	if (flm_new != NULL)
1411 		fib_unref_algo(flm_new);
1412 
1413 	if (!result) {
1414 		FD_PRINTF(LOG_ERR, fd, "sync rebuild failed");
1415 		schedule_fd_rebuild(fd, "sync rebuild failed");
1416 	}
1417 
1418 	return (result);
1419 }
1420 
1421 /*
1422  * Finds algo by name/family.
1423  * Returns referenced algo or NULL.
1424  */
1425 static struct fib_lookup_module *
fib_find_algo(const char * algo_name,int family)1426 fib_find_algo(const char *algo_name, int family)
1427 {
1428 	struct fib_lookup_module *flm;
1429 
1430 	FIB_MOD_LOCK();
1431 	TAILQ_FOREACH(flm, &all_algo_list, entries) {
1432 		if ((strcmp(flm->flm_name, algo_name) == 0) &&
1433 		    (family == flm->flm_family)) {
1434 			flm->flm_refcount++;
1435 			FIB_MOD_UNLOCK();
1436 			return (flm);
1437 		}
1438 	}
1439 	FIB_MOD_UNLOCK();
1440 
1441 	return (NULL);
1442 }
1443 
1444 static void
fib_unref_algo(struct fib_lookup_module * flm)1445 fib_unref_algo(struct fib_lookup_module *flm)
1446 {
1447 
1448 	FIB_MOD_LOCK();
1449 	flm->flm_refcount--;
1450 	FIB_MOD_UNLOCK();
1451 }
1452 
1453 static int
set_fib_algo(uint32_t fibnum,int family,struct sysctl_oid * oidp,struct sysctl_req * req)1454 set_fib_algo(uint32_t fibnum, int family, struct sysctl_oid *oidp, struct sysctl_req *req)
1455 {
1456 	struct fib_lookup_module *flm = NULL;
1457 	struct fib_data *fd = NULL;
1458 	char old_algo_name[32], algo_name[32];
1459 	struct rib_head *rh = NULL;
1460 	enum flm_op_result result;
1461 	struct epoch_tracker et;
1462 	int error;
1463 
1464 	/* Fetch current algo/rib for af/family */
1465 	FIB_MOD_LOCK();
1466 	TAILQ_FOREACH(fd, &V_fib_data_list, entries) {
1467 		if ((fd->fd_family == family) && (fd->fd_fibnum == fibnum))
1468 			break;
1469 	}
1470 	if (fd == NULL) {
1471 		FIB_MOD_UNLOCK();
1472 		return (ENOENT);
1473 	}
1474 	rh = fd->fd_rh;
1475 	strlcpy(old_algo_name, fd->fd_flm->flm_name,
1476 	    sizeof(old_algo_name));
1477 	FIB_MOD_UNLOCK();
1478 
1479 	strlcpy(algo_name, old_algo_name, sizeof(algo_name));
1480 	error = sysctl_handle_string(oidp, algo_name, sizeof(algo_name), req);
1481 	if (error != 0 || req->newptr == NULL)
1482 		return (error);
1483 
1484 	if (strcmp(algo_name, old_algo_name) == 0)
1485 		return (0);
1486 
1487 	/* New algorithm name is different */
1488 	flm = fib_find_algo(algo_name, family);
1489 	if (flm == NULL) {
1490 		RH_PRINTF(LOG_INFO, rh, "unable to find algo %s", algo_name);
1491 		return (ESRCH);
1492 	}
1493 
1494 	fd = NULL;
1495 	NET_EPOCH_ENTER(et);
1496 	RIB_WLOCK(rh);
1497 	result = setup_fd_instance(flm, rh, NULL, &fd, true);
1498 	RIB_WUNLOCK(rh);
1499 	NET_EPOCH_EXIT(et);
1500 	fib_unref_algo(flm);
1501 	if (result != FLM_SUCCESS)
1502 		return (EINVAL);
1503 
1504 	/* Disable automated jumping between algos */
1505 	FIB_MOD_LOCK();
1506 	set_algo_fixed(rh);
1507 	FIB_MOD_UNLOCK();
1508 	/* Remove old instance(s) */
1509 	fib_cleanup_algo(rh, true, false);
1510 
1511 	/* Drain cb so user can unload the module after userret if so desired */
1512 	NET_EPOCH_DRAIN_CALLBACKS();
1513 
1514 	return (0);
1515 }
1516 
1517 #ifdef INET
1518 static int
set_algo_inet_sysctl_handler(SYSCTL_HANDLER_ARGS)1519 set_algo_inet_sysctl_handler(SYSCTL_HANDLER_ARGS)
1520 {
1521 
1522 	return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET, oidp, req));
1523 }
1524 SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo,
1525     CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
1526     set_algo_inet_sysctl_handler, "A", "Set IPv4 lookup algo");
1527 #endif
1528 
1529 #ifdef INET6
1530 static int
set_algo_inet6_sysctl_handler(SYSCTL_HANDLER_ARGS)1531 set_algo_inet6_sysctl_handler(SYSCTL_HANDLER_ARGS)
1532 {
1533 
1534 	return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET6, oidp, req));
1535 }
1536 SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo,
1537     CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
1538     set_algo_inet6_sysctl_handler, "A", "Set IPv6 lookup algo");
1539 #endif
1540 
1541 static struct nhop_object *
dummy_lookup(void * algo_data,const struct flm_lookup_key key,uint32_t scopeid)1542 dummy_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid)
1543 {
1544 	return (NULL);
1545 }
1546 
1547 static void
destroy_fdh_epoch(epoch_context_t ctx)1548 destroy_fdh_epoch(epoch_context_t ctx)
1549 {
1550 	struct fib_dp_header *fdh;
1551 
1552 	fdh = __containerof(ctx, struct fib_dp_header, fdh_epoch_ctx);
1553 	free(fdh, M_RTABLE);
1554 }
1555 
1556 static struct fib_dp_header *
alloc_fib_dp_array(uint32_t num_tables,bool waitok)1557 alloc_fib_dp_array(uint32_t num_tables, bool waitok)
1558 {
1559 	size_t sz;
1560 	struct fib_dp_header *fdh;
1561 
1562 	sz = sizeof(struct fib_dp_header);
1563 	sz += sizeof(struct fib_dp) * num_tables;
1564 	fdh = malloc(sz, M_RTABLE, (waitok ? M_WAITOK : M_NOWAIT) | M_ZERO);
1565 	if (fdh != NULL) {
1566 		fdh->fdh_num_tables = num_tables;
1567 		/*
1568 		 * Set dummy lookup function ptr always returning NULL, so
1569 		 * we can delay algo init.
1570 		 */
1571 		for (uint32_t i = 0; i < num_tables; i++)
1572 			fdh->fdh_idx[i].f = dummy_lookup;
1573 	}
1574 	return (fdh);
1575 }
1576 
1577 static struct fib_dp_header *
get_fib_dp_header(struct fib_dp * dp)1578 get_fib_dp_header(struct fib_dp *dp)
1579 {
1580 
1581 	return (__containerof((void *)dp, struct fib_dp_header, fdh_idx));
1582 }
1583 
1584 /*
1585  * Replace per-family index pool @pdp with a new one which
1586  * contains updated callback/algo data from @fd.
1587  * Returns true on success.
1588  */
1589 static bool
replace_rtables_family(struct fib_dp ** pdp,struct fib_data * fd,struct fib_dp * dp)1590 replace_rtables_family(struct fib_dp **pdp, struct fib_data *fd, struct fib_dp *dp)
1591 {
1592 	struct fib_dp_header *new_fdh, *old_fdh;
1593 
1594 	NET_EPOCH_ASSERT();
1595 
1596 	FD_PRINTF(LOG_DEBUG, fd, "[vnet %p] replace with f:%p arg:%p",
1597 	    curvnet, dp->f, dp->arg);
1598 
1599 	FIB_MOD_LOCK();
1600 	old_fdh = get_fib_dp_header(*pdp);
1601 
1602 	if (old_fdh->fdh_idx[fd->fd_fibnum].f == dp->f) {
1603 		/*
1604 		 * Function is the same, data pointer needs update.
1605 		 * Perform in-line replace without reallocation.
1606 		 */
1607 		old_fdh->fdh_idx[fd->fd_fibnum].arg = dp->arg;
1608 		FD_PRINTF(LOG_DEBUG, fd, "FDH %p inline update", old_fdh);
1609 		FIB_MOD_UNLOCK();
1610 		return (true);
1611 	}
1612 
1613 	new_fdh = alloc_fib_dp_array(old_fdh->fdh_num_tables, false);
1614 	FD_PRINTF(LOG_DEBUG, fd, "OLD FDH: %p NEW FDH: %p", old_fdh, new_fdh);
1615 	if (new_fdh == NULL) {
1616 		FIB_MOD_UNLOCK();
1617 		FD_PRINTF(LOG_WARNING, fd, "error attaching datapath");
1618 		return (false);
1619 	}
1620 
1621 	memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0],
1622 	    old_fdh->fdh_num_tables * sizeof(struct fib_dp));
1623 	/* Update relevant data structure for @fd */
1624 	new_fdh->fdh_idx[fd->fd_fibnum] = *dp;
1625 
1626 	/* Ensure memcpy() writes have completed */
1627 	atomic_thread_fence_rel();
1628 	/* Set new datapath pointer */
1629 	*pdp = &new_fdh->fdh_idx[0];
1630 	FIB_MOD_UNLOCK();
1631 	FD_PRINTF(LOG_DEBUG, fd, "update %p -> %p", old_fdh, new_fdh);
1632 
1633 	fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx);
1634 
1635 	return (true);
1636 }
1637 
1638 static struct fib_dp **
get_family_dp_ptr(int family)1639 get_family_dp_ptr(int family)
1640 {
1641 	switch (family) {
1642 #ifdef INET
1643 	case AF_INET:
1644 		return (&V_inet_dp);
1645 #endif
1646 #ifdef INET6
1647 	case AF_INET6:
1648 		return (&V_inet6_dp);
1649 #endif
1650 	}
1651 	return (NULL);
1652 }
1653 
1654 /*
1655  * Make datapath use fib instance @fd
1656  */
1657 bool
fib_set_datapath_ptr(struct fib_data * fd,struct fib_dp * dp)1658 fib_set_datapath_ptr(struct fib_data *fd, struct fib_dp *dp)
1659 {
1660 	struct fib_dp **pdp;
1661 
1662 	pdp = get_family_dp_ptr(fd->fd_family);
1663 	return (replace_rtables_family(pdp, fd, dp));
1664 }
1665 
1666 /*
1667  * Grow datapath pointers array.
1668  * Called from sysctl handler on growing number of routing tables.
1669  */
1670 static void
grow_rtables_family(struct fib_dp ** pdp,uint32_t new_num_tables)1671 grow_rtables_family(struct fib_dp **pdp, uint32_t new_num_tables)
1672 {
1673 	struct fib_dp_header *new_fdh, *old_fdh = NULL;
1674 
1675 	new_fdh = alloc_fib_dp_array(new_num_tables, true);
1676 
1677 	FIB_MOD_LOCK();
1678 	if (*pdp != NULL) {
1679 		old_fdh = get_fib_dp_header(*pdp);
1680 		memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0],
1681 		    old_fdh->fdh_num_tables * sizeof(struct fib_dp));
1682 	}
1683 
1684 	/* Wait till all writes completed */
1685 	atomic_thread_fence_rel();
1686 
1687 	*pdp = &new_fdh->fdh_idx[0];
1688 	FIB_MOD_UNLOCK();
1689 
1690 	if (old_fdh != NULL)
1691 		fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx);
1692 }
1693 
1694 /*
1695  * Grows per-AF arrays of datapath pointers for each supported family.
1696  * Called from fibs resize sysctl handler.
1697  */
1698 void
fib_grow_rtables(uint32_t new_num_tables)1699 fib_grow_rtables(uint32_t new_num_tables)
1700 {
1701 
1702 #ifdef INET
1703 	grow_rtables_family(get_family_dp_ptr(AF_INET), new_num_tables);
1704 #endif
1705 #ifdef INET6
1706 	grow_rtables_family(get_family_dp_ptr(AF_INET6), new_num_tables);
1707 #endif
1708 }
1709 
1710 void
fib_get_rtable_info(struct rib_head * rh,struct rib_rtable_info * rinfo)1711 fib_get_rtable_info(struct rib_head *rh, struct rib_rtable_info *rinfo)
1712 {
1713 
1714 	bzero(rinfo, sizeof(struct rib_rtable_info));
1715 	rinfo->num_prefixes = rh->rnh_prefixes;
1716 	rinfo->num_nhops = nhops_get_count(rh);
1717 #ifdef ROUTE_MPATH
1718 	rinfo->num_nhgrp = nhgrp_get_count(rh);
1719 #endif
1720 }
1721 
1722 /*
1723  * Updates pointer to the algo data for the @fd.
1724  */
1725 void
fib_set_algo_ptr(struct fib_data * fd,void * algo_data)1726 fib_set_algo_ptr(struct fib_data *fd, void *algo_data)
1727 {
1728 	RIB_WLOCK_ASSERT(fd->fd_rh);
1729 
1730 	fd->fd_algo_data = algo_data;
1731 }
1732 
1733 /*
1734  * Calls @callback with @ctx after the end of a current epoch.
1735  */
1736 void
fib_epoch_call(epoch_callback_t callback,epoch_context_t ctx)1737 fib_epoch_call(epoch_callback_t callback, epoch_context_t ctx)
1738 {
1739 	NET_EPOCH_CALL(callback, ctx);
1740 }
1741 
1742 /*
1743  * Accessor to get rib instance @fd is attached to.
1744  */
1745 struct rib_head *
fib_get_rh(struct fib_data * fd)1746 fib_get_rh(struct fib_data *fd)
1747 {
1748 
1749 	return (fd->fd_rh);
1750 }
1751 
1752 /*
1753  * Accessor to export idx->nhop array
1754  */
1755 struct nhop_object **
fib_get_nhop_array(struct fib_data * fd)1756 fib_get_nhop_array(struct fib_data *fd)
1757 {
1758 
1759 	return (fd->nh_idx);
1760 }
1761 
1762 static uint32_t
get_nhop_idx(struct nhop_object * nh)1763 get_nhop_idx(struct nhop_object *nh)
1764 {
1765 #ifdef ROUTE_MPATH
1766 	if (NH_IS_NHGRP(nh))
1767 		return (nhgrp_get_idx((struct nhgrp_object *)nh));
1768 	else
1769 #endif
1770 		return (nhop_get_idx(nh));
1771 }
1772 
1773 uint32_t
fib_get_nhop_idx(struct fib_data * fd,struct nhop_object * nh)1774 fib_get_nhop_idx(struct fib_data *fd, struct nhop_object *nh)
1775 {
1776 
1777 	return (get_nhop_idx(nh));
1778 }
1779 
1780 static bool
is_idx_free(struct fib_data * fd,uint32_t index)1781 is_idx_free(struct fib_data *fd, uint32_t index)
1782 {
1783 
1784 	return (fd->nh_ref_table->refcnt[index] == 0);
1785 }
1786 
1787 static uint32_t
fib_ref_nhop(struct fib_data * fd,struct nhop_object * nh)1788 fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh)
1789 {
1790 	uint32_t idx = get_nhop_idx(nh);
1791 
1792 	if (idx >= fd->number_nhops) {
1793 		fd->hit_nhops = 1;
1794 		return (0);
1795 	}
1796 
1797 	if (is_idx_free(fd, idx)) {
1798 		nhop_ref_any(nh);
1799 		fd->nh_idx[idx] = nh;
1800 		fd->nh_ref_table->count++;
1801 		FD_PRINTF(LOG_DEBUG2, fd, " REF nhop %u %p", idx, fd->nh_idx[idx]);
1802 	}
1803 	fd->nh_ref_table->refcnt[idx]++;
1804 
1805 	return (idx);
1806 }
1807 
1808 struct nhop_release_data {
1809 	struct nhop_object	*nh;
1810 	struct epoch_context	ctx;
1811 };
1812 
1813 static void
release_nhop_epoch(epoch_context_t ctx)1814 release_nhop_epoch(epoch_context_t ctx)
1815 {
1816 	struct nhop_release_data *nrd;
1817 
1818 	nrd = __containerof(ctx, struct nhop_release_data, ctx);
1819 	nhop_free_any(nrd->nh);
1820 	free(nrd, M_TEMP);
1821 }
1822 
1823 /*
1824  * Delays nexthop refcount release.
1825  * Datapath may have the datastructures not updated yet, so the old
1826  *  nexthop may still be returned till the end of current epoch. Delay
1827  *  refcount removal, as we may be removing the last instance, which will
1828  *  trigger nexthop deletion, rendering returned nexthop invalid.
1829  */
1830 static void
fib_schedule_release_nhop(struct fib_data * fd,struct nhop_object * nh)1831 fib_schedule_release_nhop(struct fib_data *fd, struct nhop_object *nh)
1832 {
1833 	struct nhop_release_data *nrd;
1834 
1835 	nrd = malloc(sizeof(struct nhop_release_data), M_TEMP, M_NOWAIT | M_ZERO);
1836 	if (nrd != NULL) {
1837 		nrd->nh = nh;
1838 		fib_epoch_call(release_nhop_epoch, &nrd->ctx);
1839 	} else {
1840 		/*
1841 		 * Unable to allocate memory. Leak nexthop to maintain guarantee
1842 		 *  that each nhop can be referenced.
1843 		 */
1844 		FD_PRINTF(LOG_ERR, fd, "unable to schedule nhop %p deletion", nh);
1845 	}
1846 }
1847 
1848 static void
fib_unref_nhop(struct fib_data * fd,struct nhop_object * nh)1849 fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh)
1850 {
1851 	uint32_t idx = get_nhop_idx(nh);
1852 
1853 	KASSERT((idx < fd->number_nhops), ("invalid nhop index"));
1854 	KASSERT((nh == fd->nh_idx[idx]), ("index table contains whong nh"));
1855 
1856 	fd->nh_ref_table->refcnt[idx]--;
1857 	if (fd->nh_ref_table->refcnt[idx] == 0) {
1858 		FD_PRINTF(LOG_DEBUG, fd, " FREE nhop %d %p", idx, fd->nh_idx[idx]);
1859 		fib_schedule_release_nhop(fd, fd->nh_idx[idx]);
1860 	}
1861 }
1862 
1863 static void
set_algo_fixed(struct rib_head * rh)1864 set_algo_fixed(struct rib_head *rh)
1865 {
1866 	switch (rh->rib_family) {
1867 #ifdef INET
1868 	case AF_INET:
1869 		V_algo_fixed_inet = true;
1870 		break;
1871 #endif
1872 #ifdef INET6
1873 	case AF_INET6:
1874 		V_algo_fixed_inet6 = true;
1875 		break;
1876 #endif
1877 	}
1878 }
1879 
1880 static bool
is_algo_fixed(struct rib_head * rh)1881 is_algo_fixed(struct rib_head *rh)
1882 {
1883 
1884 	switch (rh->rib_family) {
1885 #ifdef INET
1886 	case AF_INET:
1887 		return (V_algo_fixed_inet);
1888 #endif
1889 #ifdef INET6
1890 	case AF_INET6:
1891 		return (V_algo_fixed_inet6);
1892 #endif
1893 	}
1894 	return (false);
1895 }
1896 
1897 /*
1898  * Runs the check on what would be the best algo for rib @rh, assuming
1899  *  that the current algo is the one specified by @orig_flm. Note that
1900  *  it can be NULL for initial selection.
1901  *
1902  * Returns referenced new algo or NULL if the current one is the best.
1903  */
1904 static struct fib_lookup_module *
fib_check_best_algo(struct rib_head * rh,struct fib_lookup_module * orig_flm)1905 fib_check_best_algo(struct rib_head *rh, struct fib_lookup_module *orig_flm)
1906 {
1907 	uint8_t preference, curr_preference = 0, best_preference = 0;
1908 	struct fib_lookup_module *flm, *best_flm = NULL;
1909 	struct rib_rtable_info rinfo;
1910 	int candidate_algos = 0;
1911 
1912 	fib_get_rtable_info(rh, &rinfo);
1913 
1914 	FIB_MOD_LOCK();
1915 	TAILQ_FOREACH(flm, &all_algo_list, entries) {
1916 		if (flm->flm_family != rh->rib_family)
1917 			continue;
1918 		candidate_algos++;
1919 		preference = flm->flm_get_pref(&rinfo);
1920 		if (preference > best_preference) {
1921 			if (!flm_error_check(flm, rh->rib_fibnum)) {
1922 				best_preference = preference;
1923 				best_flm = flm;
1924 			}
1925 		}
1926 		if (flm == orig_flm)
1927 			curr_preference = preference;
1928 	}
1929 	if ((best_flm != NULL) && (curr_preference + BEST_DIFF_PERCENT < best_preference))
1930 		best_flm->flm_refcount++;
1931 	else
1932 		best_flm = NULL;
1933 	FIB_MOD_UNLOCK();
1934 
1935 	RH_PRINTF(LOG_DEBUG, rh, "candidate_algos: %d, curr: %s(%d) result: %s(%d)",
1936 	    candidate_algos, orig_flm ? orig_flm->flm_name : "NULL", curr_preference,
1937 	    best_flm ? best_flm->flm_name : (orig_flm ? orig_flm->flm_name : "NULL"),
1938 	    best_preference);
1939 
1940 	return (best_flm);
1941 }
1942 
1943 /*
1944  * Called when new route table is created.
1945  * Selects, allocates and attaches fib algo for the table.
1946  */
1947 static bool
fib_select_algo_initial(struct rib_head * rh,struct fib_dp * dp)1948 fib_select_algo_initial(struct rib_head *rh, struct fib_dp *dp)
1949 {
1950 	struct fib_lookup_module *flm;
1951 	struct fib_data *fd = NULL;
1952 	enum flm_op_result result;
1953 	struct epoch_tracker et;
1954 
1955 	flm = fib_check_best_algo(rh, NULL);
1956 	if (flm == NULL) {
1957 		RH_PRINTF(LOG_CRIT, rh, "no algo selected");
1958 		return (false);
1959 	}
1960 	RH_PRINTF(LOG_INFO, rh, "selected algo %s", flm->flm_name);
1961 
1962 	NET_EPOCH_ENTER(et);
1963 	RIB_WLOCK(rh);
1964 	result = setup_fd_instance(flm, rh, NULL, &fd, false);
1965 	RIB_WUNLOCK(rh);
1966 	NET_EPOCH_EXIT(et);
1967 
1968 	RH_PRINTF(LOG_DEBUG, rh, "result=%d fd=%p", result, fd);
1969 	if (result == FLM_SUCCESS)
1970 		*dp = fd->fd_dp;
1971 	else
1972 		RH_PRINTF(LOG_CRIT, rh, "unable to setup algo %s", flm->flm_name);
1973 
1974 	fib_unref_algo(flm);
1975 
1976 	return (result == FLM_SUCCESS);
1977 }
1978 
1979 /*
1980  * Sets up fib algo instances for the non-initialized RIBs in the @family.
1981  * Allocates temporary datapath index to amortize datapaint index updates
1982  * with large @num_tables.
1983  */
1984 void
fib_setup_family(int family,uint32_t num_tables)1985 fib_setup_family(int family, uint32_t num_tables)
1986 {
1987 	struct fib_dp_header *new_fdh = alloc_fib_dp_array(num_tables, false);
1988 	if (new_fdh == NULL) {
1989 		ALGO_PRINTF(LOG_CRIT, "Unable to setup framework for %s", print_family(family));
1990 		return;
1991 	}
1992 
1993 	for (int i = 0; i < num_tables; i++) {
1994 		struct rib_head *rh = rt_tables_get_rnh(i, family);
1995 		if (rh->rib_algo_init)
1996 			continue;
1997 		if (!fib_select_algo_initial(rh, &new_fdh->fdh_idx[i]))
1998 			continue;
1999 
2000 		rh->rib_algo_init = true;
2001 	}
2002 
2003 	FIB_MOD_LOCK();
2004 	struct fib_dp **pdp = get_family_dp_ptr(family);
2005 	struct fib_dp_header *old_fdh = get_fib_dp_header(*pdp);
2006 
2007 	/* Update the items not touched by the new init, from the old data pointer */
2008 	for (int i = 0; i < num_tables; i++) {
2009 		if (new_fdh->fdh_idx[i].f == dummy_lookup)
2010 			new_fdh->fdh_idx[i] = old_fdh->fdh_idx[i];
2011 	}
2012 
2013 	/* Ensure all index writes have completed */
2014 	atomic_thread_fence_rel();
2015 	/* Set new datapath pointer */
2016 	*pdp = &new_fdh->fdh_idx[0];
2017 
2018 	FIB_MOD_UNLOCK();
2019 
2020 	fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx);
2021 }
2022 
2023 /*
2024  * Registers fib lookup module within the subsystem.
2025  */
2026 int
fib_module_register(struct fib_lookup_module * flm)2027 fib_module_register(struct fib_lookup_module *flm)
2028 {
2029 
2030 	FIB_MOD_LOCK();
2031 	ALGO_PRINTF(LOG_INFO, "attaching %s to %s", flm->flm_name,
2032 	    print_family(flm->flm_family));
2033 	TAILQ_INSERT_TAIL(&all_algo_list, flm, entries);
2034 	FIB_MOD_UNLOCK();
2035 
2036 	return (0);
2037 }
2038 
2039 /*
2040  * Tries to unregister fib lookup module.
2041  *
2042  * Returns 0 on success, EBUSY if module is still used
2043  *  by some of the tables.
2044  */
2045 int
fib_module_unregister(struct fib_lookup_module * flm)2046 fib_module_unregister(struct fib_lookup_module *flm)
2047 {
2048 
2049 	FIB_MOD_LOCK();
2050 	if (flm->flm_refcount > 0) {
2051 		FIB_MOD_UNLOCK();
2052 		return (EBUSY);
2053 	}
2054 	fib_error_clear_flm(flm);
2055 	ALGO_PRINTF(LOG_INFO, "detaching %s from %s", flm->flm_name,
2056 	    print_family(flm->flm_family));
2057 	TAILQ_REMOVE(&all_algo_list, flm, entries);
2058 	FIB_MOD_UNLOCK();
2059 
2060 	return (0);
2061 }
2062 
2063 void
vnet_fib_init(void)2064 vnet_fib_init(void)
2065 {
2066 
2067 	TAILQ_INIT(&V_fib_data_list);
2068 }
2069 
2070 void
vnet_fib_destroy(void)2071 vnet_fib_destroy(void)
2072 {
2073 
2074 	FIB_MOD_LOCK();
2075 	fib_error_clear();
2076 	FIB_MOD_UNLOCK();
2077 }
2078