xref: /freebsd/sys/net/route/route_ctl.c (revision b197d4b893974c9eb4d7b38704c6d5c486235d6f)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2020 Alexander V. Chernikov
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 #include "opt_inet.h"
31 #include "opt_inet6.h"
32 #include "opt_route.h"
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/malloc.h>
37 #include <sys/mbuf.h>
38 #include <sys/socket.h>
39 #include <sys/sysctl.h>
40 #include <sys/syslog.h>
41 #include <sys/kernel.h>
42 #include <sys/lock.h>
43 #include <sys/rmlock.h>
44 
45 #include <net/if.h>
46 #include <net/if_var.h>
47 #include <net/if_dl.h>
48 #include <net/vnet.h>
49 #include <net/route.h>
50 #include <net/route/route_ctl.h>
51 #include <net/route/route_var.h>
52 #include <net/route/nhop_utils.h>
53 #include <net/route/nhop.h>
54 #include <net/route/nhop_var.h>
55 #include <netinet/in.h>
56 #include <netinet6/scope6_var.h>
57 #include <netinet6/in6_var.h>
58 
59 #define	DEBUG_MOD_NAME	route_ctl
60 #define	DEBUG_MAX_LEVEL	LOG_DEBUG
61 #include <net/route/route_debug.h>
62 _DECLARE_DEBUG(LOG_INFO);
63 
/*
 * This file contains control plane routing table functions.
 *
 * All functions assume they are called in net epoch.
 */
69 
/*
 * Scratch storage able to hold a generic, IPv4 or IPv6 sockaddr.
 * Used in this file as on-stack space for netmasks built from a
 * prefix length.
 */
union sockaddr_union {
	struct sockaddr		sa;	/* generic view */
	struct sockaddr_in	sin;	/* IPv4 view */
	struct sockaddr_in6	sin6;	/* IPv6 view */
	char			_buf[32];	/* keeps the union at least 32 bytes large */
};
76 
/* Helpers operating on requests described by struct rt_addrinfo. */
static int add_route_byinfo(struct rib_head *rnh, struct rt_addrinfo *info,
    struct rib_cmd_info *rc);
static int change_route_byinfo(struct rib_head *rnh, struct rtentry *rt,
    struct rt_addrinfo *info, struct route_nhop_data *nhd_orig,
    struct rib_cmd_info *rc);

/* Flag-driven (RTM_F_*) insertion of an rtentry / nexthop data pair. */
static int add_route_flags(struct rib_head *rnh, struct rtentry *rt,
    struct route_nhop_data *rnd_add, int op_flags, struct rib_cmd_info *rc);
#ifdef ROUTE_MPATH
static int add_route_flags_mpath(struct rib_head *rnh, struct rtentry *rt,
    struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_orig,
    int op_flags, struct rib_cmd_info *rc);
#endif

/* Low-level add/delete primitives. */
static int add_route(struct rib_head *rnh, struct rtentry *rt,
    struct route_nhop_data *rnd, struct rib_cmd_info *rc);
static int delete_route(struct rib_head *rnh, struct rtentry *rt,
    struct rib_cmd_info *rc);
static int rt_delete_conditional(struct rib_head *rnh, struct rtentry *rt,
    int prio, rib_filter_f_t *cb, void *cbdata, struct rib_cmd_info *rc);

/* File-local route priority helpers (see NH_PRIORITY_* below). */
static int get_prio_from_info(const struct rt_addrinfo *info);
static int nhop_get_prio(const struct nhop_object *nh);

#ifdef ROUTE_MPATH
static bool rib_can_multipath(struct rib_head *rh);
#endif
104 
/*
 * Per-vnet multipath routing configuration.
 * The net.route.multipath sysctl is writable only when the kernel is
 * built with ROUTE_MPATH; otherwise it is exported read-only.
 */
SYSCTL_DECL(_net_route);
#define	V_rib_route_multipath	VNET(rib_route_multipath)
#ifdef ROUTE_MPATH
#define _MP_FLAGS	CTLFLAG_RW
#else
#define _MP_FLAGS	CTLFLAG_RD
#endif
/* Defaults to 1 (enabled). */
VNET_DEFINE(u_int, rib_route_multipath) = 1;
SYSCTL_UINT(_net_route, OID_AUTO, multipath, _MP_FLAGS | CTLFLAG_VNET,
    &VNET_NAME(rib_route_multipath), 0, "Enable route multipath");
#undef _MP_FLAGS
117 
#ifdef ROUTE_MPATH
/*
 * Per-vnet read-only knob (net.route.hash_outbound), initially 0.
 * It is switched to 1 by add_route_flags_mpath() once the first
 * nexthop group gets installed.
 */
VNET_DEFINE(u_int, fib_hash_outbound) = 0;
SYSCTL_UINT(_net_route, OID_AUTO, hash_outbound, CTLFLAG_RD | CTLFLAG_VNET,
    &VNET_NAME(fib_hash_outbound), 0,
    "Compute flowid for locally-originated packets");

/* Default entropy to add to the hash calculation for the outbound connections */
uint8_t mpath_entropy_key[MPATH_ENTROPY_KEY_LEN] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
};
#endif
133 
#if defined(INET) && defined(INET6)
/*
 * Per-vnet read-write knob (net.route.ipv6_nexthop), enabled by default.
 * Its value is reported to callers by rib_can_4o6_nhop().
 */
FEATURE(ipv4_rfc5549_support, "Route IPv4 packets via IPv6 nexthops");
#define V_rib_route_ipv6_nexthop VNET(rib_route_ipv6_nexthop)
VNET_DEFINE_STATIC(u_int, rib_route_ipv6_nexthop) = 1;
SYSCTL_UINT(_net_route, OID_AUTO, ipv6_nexthop, CTLFLAG_RW | CTLFLAG_VNET,
    &VNET_NAME(rib_route_ipv6_nexthop), 0, "Enable IPv4 route via IPv6 Next Hop address");
#endif
141 
/* Debug bits: parent sysctl node net.route.debug (read-only, MPSAFE). */
SYSCTL_NODE(_net_route, OID_AUTO, debug, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
144 
145 static struct rib_head *
146 get_rnh(uint32_t fibnum, const struct rt_addrinfo *info)
147 {
148 	struct rib_head *rnh;
149 	struct sockaddr *dst;
150 
151 	KASSERT((fibnum < rt_numfibs), ("rib_add_route: bad fibnum"));
152 
153 	dst = info->rti_info[RTAX_DST];
154 	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
155 
156 	return (rnh);
157 }
158 
#if defined(INET) && defined(INET6)
/*
 * Returns true when the per-vnet rib_route_ipv6_nexthop knob is non-zero.
 */
bool
rib_can_4o6_nhop(void)
{
	return (V_rib_route_ipv6_nexthop != 0);
}
#endif
166 
167 #ifdef ROUTE_MPATH
168 static bool
169 rib_can_multipath(struct rib_head *rh)
170 {
171 	int result;
172 
173 	CURVNET_SET(rh->rib_vnet);
174 	result = !!V_rib_route_multipath;
175 	CURVNET_RESTORE();
176 
177 	return (result);
178 }
179 
180 /*
181  * Check is nhop is multipath-eligible.
182  * Avoid nhops without gateways and redirects.
183  *
184  * Returns 1 for multipath-eligible nexthop,
185  * 0 otherwise.
186  */
187 bool
188 nhop_can_multipath(const struct nhop_object *nh)
189 {
190 
191 	if ((nh->nh_flags & NHF_MULTIPATH) != 0)
192 		return (1);
193 	if ((nh->nh_flags & NHF_GATEWAY) == 0)
194 		return (0);
195 	if ((nh->nh_flags & NHF_REDIRECT) != 0)
196 		return (0);
197 
198 	return (1);
199 }
200 #endif
201 
202 static int
203 get_info_weight(const struct rt_addrinfo *info, uint32_t default_weight)
204 {
205 	uint32_t weight;
206 
207 	if (info->rti_mflags & RTV_WEIGHT)
208 		weight = info->rti_rmx->rmx_weight;
209 	else
210 		weight = default_weight;
211 	/* Keep upper 1 byte for adm distance purposes */
212 	if (weight > RT_MAX_WEIGHT)
213 		weight = RT_MAX_WEIGHT;
214 	else if (weight == 0)
215 		weight = default_weight;
216 
217 	return (weight);
218 }
219 
220 /*
221  * File-local concept for distingushing between the normal and
222  * RTF_PINNED routes tha can override the "normal" one.
223  */
224 #define	NH_PRIORITY_HIGH	2
225 #define	NH_PRIORITY_NORMAL	1
226 static int
227 get_prio_from_info(const struct rt_addrinfo *info)
228 {
229 	if (info->rti_flags & RTF_PINNED)
230 		return (NH_PRIORITY_HIGH);
231 	return (NH_PRIORITY_NORMAL);
232 }
233 
234 static int
235 nhop_get_prio(const struct nhop_object *nh)
236 {
237 	if (NH_IS_PINNED(nh))
238 		return (NH_PRIORITY_HIGH);
239 	return (NH_PRIORITY_NORMAL);
240 }
241 
242 /*
243  * Check if specified @gw matches gw data in the nexthop @nh.
244  *
245  * Returns true if matches, false otherwise.
246  */
247 bool
248 match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw)
249 {
250 
251 	if (nh->gw_sa.sa_family != gw->sa_family)
252 		return (false);
253 
254 	switch (gw->sa_family) {
255 	case AF_INET:
256 		return (nh->gw4_sa.sin_addr.s_addr ==
257 		    ((const struct sockaddr_in *)gw)->sin_addr.s_addr);
258 	case AF_INET6:
259 		{
260 			const struct sockaddr_in6 *gw6;
261 			gw6 = (const struct sockaddr_in6 *)gw;
262 
263 			/*
264 			 * Currently (2020-09) IPv6 gws in kernel have their
265 			 * scope embedded. Once this becomes false, this code
266 			 * has to be revisited.
267 			 */
268 			if (IN6_ARE_ADDR_EQUAL(&nh->gw6_sa.sin6_addr,
269 			    &gw6->sin6_addr))
270 				return (true);
271 			return (false);
272 		}
273 	case AF_LINK:
274 		{
275 			const struct sockaddr_dl *sdl;
276 			sdl = (const struct sockaddr_dl *)gw;
277 			return (nh->gwl_sa.sdl_index == sdl->sdl_index);
278 		}
279 	default:
280 		return (memcmp(&nh->gw_sa, gw, nh->gw_sa.sa_len) == 0);
281 	}
282 
283 	/* NOTREACHED */
284 	return (false);
285 }
286 
/*
 * rib_filter_f compatible callback matching every nexthop whose
 * gateway equals the sockaddr passed as @gw_sa. @rt is not used.
 */
int
rib_match_gw(const struct rtentry *rt, const struct nhop_object *nh, void *gw_sa)
{
	return (match_nhop_gw(nh, (const struct sockaddr *)gw_sa));
}
298 
/* State for match_gw_one(): gateway to look for and number of hits so far. */
struct gw_filter_data {
	const struct sockaddr *gw;	/* gateway to match */
	int count;			/* matches seen; only the first is reported */
};
303 
304 /*
305  * Matches first occurence of the gateway provided in @gwd
306  */
307 static int
308 match_gw_one(const struct rtentry *rt, const struct nhop_object *nh, void *_data)
309 {
310 	struct gw_filter_data *gwd = (struct gw_filter_data *)_data;
311 
312 	/* Return only first match to make rtsock happy */
313 	if (match_nhop_gw(nh, gwd->gw) && gwd->count++ == 0)
314 		return (1);
315 	return (0);
316 }
317 
318 /*
319  * Checks if data in @info matches nexhop @nh.
320  *
321  * Returns 0 on success,
322  * ESRCH if not matched,
323  * ENOENT if filter function returned false
324  */
325 int
326 check_info_match_nhop(const struct rt_addrinfo *info, const struct rtentry *rt,
327     const struct nhop_object *nh)
328 {
329 	const struct sockaddr *gw = info->rti_info[RTAX_GATEWAY];
330 
331 	if (info->rti_filter != NULL) {
332 	    if (info->rti_filter(rt, nh, info->rti_filterdata) == 0)
333 		    return (ENOENT);
334 	    else
335 		    return (0);
336 	}
337 	if ((gw != NULL) && !match_nhop_gw(nh, gw))
338 		return (ESRCH);
339 
340 	return (0);
341 }
342 
343 /*
344  * Runs exact prefix match based on @dst and @netmask.
345  * Returns matched @rtentry if found or NULL.
346  * If rtentry was found, saves nexthop / weight value into @rnd.
347  */
348 static struct rtentry *
349 lookup_prefix_bysa(struct rib_head *rnh, const struct sockaddr *dst,
350     const struct sockaddr *netmask, struct route_nhop_data *rnd)
351 {
352 	struct rtentry *rt;
353 
354 	RIB_LOCK_ASSERT(rnh);
355 
356 	rt = (struct rtentry *)rnh->rnh_lookup(dst, netmask, &rnh->head);
357 	if (rt != NULL) {
358 		rnd->rnd_nhop = rt->rt_nhop;
359 		rnd->rnd_weight = rt->rt_weight;
360 	} else {
361 		rnd->rnd_nhop = NULL;
362 		rnd->rnd_weight = 0;
363 	}
364 
365 	return (rt);
366 }
367 
/*
 * Runs exact prefix match using the key and mask of @rt.
 * Same return contract as lookup_prefix_bysa().
 */
struct rtentry *
lookup_prefix_rt(struct rib_head *rnh, const struct rtentry *rt,
    struct route_nhop_data *rnd)
{
	const struct sockaddr *key = rt_key_const(rt);
	const struct sockaddr *mask = rt_mask_const(rt);

	return (lookup_prefix_bysa(rnh, key, mask, rnd));
}
374 
375 /*
376  * Runs exact prefix match based on dst/netmask from @info.
377  * Assumes RIB lock is held.
378  * Returns matched @rtentry if found or NULL.
379  * If rtentry was found, saves nexthop / weight value into @rnd.
380  */
381 struct rtentry *
382 lookup_prefix(struct rib_head *rnh, const struct rt_addrinfo *info,
383     struct route_nhop_data *rnd)
384 {
385 	struct rtentry *rt;
386 
387 	rt = lookup_prefix_bysa(rnh, info->rti_info[RTAX_DST],
388 	    info->rti_info[RTAX_NETMASK], rnd);
389 
390 	return (rt);
391 }
392 
/*
 * Builds a netmask for @plen in the storage pointed to by *@pmask and
 * masks the address in @_dst in place.
 *
 * @family: address family of @_dst (AF_INET / AF_INET6 when compiled in)
 * @plen: prefix length, or -1 when no mask is applicable
 * @_dst: destination sockaddr, host bits are cleared for partial prefixes
 * @pmask: in: pointer to mask storage; out: the filled mask, or NULL for
 *  plen == -1 and for full-length (host) prefixes
 *
 * Returns true on success, false for an unsupported family or a prefix
 * length outside of the family range.
 */
static bool
fill_pxmask_family(int family, int plen, struct sockaddr *_dst,
    struct sockaddr **pmask)
{
	if (plen == -1) {
		*pmask = NULL;
		return (true);
	}

	switch (family) {
#ifdef INET
	case AF_INET:
		{
			struct sockaddr_in *mask = (struct sockaddr_in *)(*pmask);
			struct sockaddr_in *dst = (struct sockaddr_in *)_dst;

			memset(mask, 0, sizeof(*mask));
			mask->sin_family = family;
			mask->sin_len = sizeof(*mask);
			if (plen == 32)
				*pmask = NULL;
			else if (plen > 32 || plen < 0)
				return (false);
			else {
				uint32_t daddr, maddr;
				/*
				 * Shift an unsigned one: (1 << 31) on a signed
				 * int (plen == 1) is undefined behavior.
				 */
				maddr = htonl(plen ? ~((1U << (32 - plen)) - 1) : 0);
				mask->sin_addr.s_addr = maddr;
				daddr = dst->sin_addr.s_addr;
				daddr = htonl(ntohl(daddr) & ntohl(maddr));
				dst->sin_addr.s_addr = daddr;
			}
			return (true);
		}
#endif
#ifdef INET6
	case AF_INET6:
		{
			struct sockaddr_in6 *mask = (struct sockaddr_in6 *)(*pmask);
			struct sockaddr_in6 *dst = (struct sockaddr_in6 *)_dst;

			memset(mask, 0, sizeof(*mask));
			mask->sin6_family = family;
			mask->sin6_len = sizeof(*mask);
			if (plen == 128)
				*pmask = NULL;
			else if (plen > 128 || plen < 0)
				return (false);
			else {
				ip6_writemask(&mask->sin6_addr, plen);
				IN6_MASK_ADDR(&dst->sin6_addr, &mask->sin6_addr);
			}
			return (true);
		}
#endif
	}
	return (false);
}
452 
453 /*
454  * Attempts to add @dst/plen prefix with nexthop/nexhopgroup data @rnd
455  * to the routing table.
456  *
457  * @fibnum: rtable id to insert route to
458  * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
459  * @plen: prefix length (or -1 if host route or not applicable for AF)
460  * @op_flags: combination of RTM_F_ flags
461  * @rc: storage to report operation result
462  *
463  * Returns 0 on success.
464  */
465 int
466 rib_add_route_px(uint32_t fibnum, struct sockaddr *dst, int plen,
467     struct route_nhop_data *rnd, int op_flags, struct rib_cmd_info *rc)
468 {
469 	union sockaddr_union mask_storage;
470 	struct sockaddr *netmask = &mask_storage.sa;
471 	struct rtentry *rt = NULL;
472 
473 	NET_EPOCH_ASSERT();
474 
475 	bzero(rc, sizeof(struct rib_cmd_info));
476 	rc->rc_cmd = RTM_ADD;
477 
478 	struct rib_head *rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
479 	if (rnh == NULL)
480 		return (EAFNOSUPPORT);
481 
482 	if (!fill_pxmask_family(dst->sa_family, plen, dst, &netmask)) {
483 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: invalid plen %d", plen);
484 		return (EINVAL);
485 	}
486 
487 	if (op_flags & RTM_F_CREATE) {
488 		if ((rt = rt_alloc(rnh, dst, netmask)) == NULL) {
489 			FIB_RH_LOG(LOG_INFO, rnh, "rtentry allocation failed");
490 			return (ENOMEM);
491 		}
492 	}
493 
494 	return (add_route_flags(rnh, rt, rnd, op_flags, rc));
495 }
496 
497 /*
498  * Attempts to delete @dst/plen prefix matching gateway @gw from the
499  *  routing rable.
500  *
501  * @fibnum: rtable id to remove route from
502  * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
503  * @plen: prefix length (or -1 if host route or not applicable for AF)
504  * @gw: gateway to match
505  * @op_flags: combination of RTM_F_ flags
506  * @rc: storage to report operation result
507  *
508  * Returns 0 on success.
509  */
510 int
511 rib_del_route_px_gw(uint32_t fibnum, struct sockaddr *dst, int plen,
512     const struct sockaddr *gw, int op_flags, struct rib_cmd_info *rc)
513 {
514 	struct gw_filter_data gwd = { .gw = gw };
515 
516 	return (rib_del_route_px(fibnum, dst, plen, match_gw_one, &gwd, op_flags, rc));
517 }
518 
519 /*
520  * Attempts to delete @dst/plen prefix matching @filter_func from the
521  *  routing rable.
522  *
523  * @fibnum: rtable id to remove route from
524  * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
525  * @plen: prefix length (or -1 if host route or not applicable for AF)
526  * @filter_func: func to be called for each nexthop of the prefix for matching
527  * @filter_arg: argument to pass to @filter_func
528  * @op_flags: combination of RTM_F_ flags
529  * @rc: storage to report operation result
530  *
531  * Returns 0 on success.
532  */
533 int
534 rib_del_route_px(uint32_t fibnum, struct sockaddr *dst, int plen,
535     rib_filter_f_t *filter_func, void *filter_arg, int op_flags,
536     struct rib_cmd_info *rc)
537 {
538 	union sockaddr_union mask_storage;
539 	struct sockaddr *netmask = &mask_storage.sa;
540 	int error;
541 
542 	NET_EPOCH_ASSERT();
543 
544 	bzero(rc, sizeof(struct rib_cmd_info));
545 	rc->rc_cmd = RTM_DELETE;
546 
547 	struct rib_head *rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
548 	if (rnh == NULL)
549 		return (EAFNOSUPPORT);
550 
551 	if (dst->sa_len > sizeof(mask_storage)) {
552 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too big: %d", dst->sa_len);
553 		return (EINVAL);
554 	}
555 
556 	if (!fill_pxmask_family(dst->sa_family, plen, dst, &netmask)) {
557 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: invalid plen %d", plen);
558 		return (EINVAL);
559 	}
560 
561 	int prio = (op_flags & RTM_F_FORCE) ? NH_PRIORITY_HIGH : NH_PRIORITY_NORMAL;
562 
563 	RIB_WLOCK(rnh);
564 	struct route_nhop_data rnd;
565 	struct rtentry *rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd);
566 	if (rt != NULL) {
567 		error = rt_delete_conditional(rnh, rt, prio, filter_func,
568 		    filter_arg, rc);
569 	} else
570 		error = ESRCH;
571 	RIB_WUNLOCK(rnh);
572 
573 	if (error != 0)
574 		return (error);
575 
576 	rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
577 
578 	if (rc->rc_cmd == RTM_DELETE)
579 		rt_free(rc->rc_rt);
580 #ifdef ROUTE_MPATH
581 	else {
582 		/*
583 		 * Deleting 1 path may result in RTM_CHANGE to
584 		 * a different mpath group/nhop.
585 		 * Free old mpath group.
586 		 */
587 		nhop_free_any(rc->rc_nh_old);
588 	}
589 #endif
590 
591 	return (0);
592 }
593 
594 /*
595  * Tries to copy route @rt from one rtable to the rtable specified by @dst_rh.
596  * @rt: route to copy.
597  * @rnd_src: nhop and weight. Multipath routes are not supported
598  * @rh_dst: target rtable.
599  * @rc: operation result storage
600  *
601  * Return 0 on success.
602  */
603 int
604 rib_copy_route(struct rtentry *rt, const struct route_nhop_data *rnd_src,
605     struct rib_head *rh_dst, struct rib_cmd_info *rc)
606 {
607 	struct nhop_object __diagused *nh_src = rnd_src->rnd_nhop;
608 	int error;
609 
610 	MPASS((nh_src->nh_flags & NHF_MULTIPATH) == 0);
611 
612 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
613 		char nhbuf[NHOP_PRINT_BUFSIZE], rtbuf[NHOP_PRINT_BUFSIZE];
614 		nhop_print_buf_any(nh_src, nhbuf, sizeof(nhbuf));
615 		rt_print_buf(rt, rtbuf, sizeof(rtbuf));
616 		FIB_RH_LOG(LOG_DEBUG2, rh_dst, "copying %s -> %s from fib %u",
617 		    rtbuf, nhbuf, nhop_get_fibnum(nh_src));
618 	}
619 	struct nhop_object *nh = nhop_alloc(rh_dst->rib_fibnum, rh_dst->rib_family);
620 	if (nh == NULL) {
621 		FIB_RH_LOG(LOG_INFO, rh_dst, "unable to allocate new nexthop");
622 		return (ENOMEM);
623 	}
624 	nhop_copy(nh, rnd_src->rnd_nhop);
625 	nhop_set_fibnum(nh, rh_dst->rib_fibnum);
626 	nh = nhop_get_nhop_internal(rh_dst, nh, &error);
627 	if (error != 0) {
628 		FIB_RH_LOG(LOG_INFO, rh_dst,
629 		    "unable to finalize new nexthop: error %d", error);
630 		return (ENOMEM);
631 	}
632 
633 	struct rtentry *rt_new = rt_alloc(rh_dst, rt_key(rt), rt_mask(rt));
634 	if (rt_new == NULL) {
635 		FIB_RH_LOG(LOG_INFO, rh_dst, "unable to create new rtentry");
636 		nhop_free(nh);
637 		return (ENOMEM);
638 	}
639 
640 	struct route_nhop_data rnd = {
641 		.rnd_nhop = nh,
642 		.rnd_weight = rnd_src->rnd_weight
643 	};
644 	int op_flags = RTM_F_CREATE | (NH_IS_PINNED(nh) ? RTM_F_FORCE : 0);
645 	error = add_route_flags(rh_dst, rt_new, &rnd, op_flags, rc);
646 
647 	if (error != 0) {
648 		IF_DEBUG_LEVEL(LOG_DEBUG2) {
649 			char buf[NHOP_PRINT_BUFSIZE];
650 			rt_print_buf(rt_new, buf, sizeof(buf));
651 			FIB_RH_LOG(LOG_DEBUG, rh_dst,
652 			    "Unable to add route %s: error %d", buf, error);
653 		}
654 		nhop_free(nh);
655 		rt_free_immediate(rt_new);
656 	}
657 	return (error);
658 }
659 
660 /*
661  * Adds route defined by @info into the kernel table specified by @fibnum and
662  * sa_family in @info->rti_info[RTAX_DST].
663  *
664  * Returns 0 on success and fills in operation metadata into @rc.
665  */
666 int
667 rib_add_route(uint32_t fibnum, struct rt_addrinfo *info,
668     struct rib_cmd_info *rc)
669 {
670 	struct rib_head *rnh;
671 	int error;
672 
673 	NET_EPOCH_ASSERT();
674 
675 	rnh = get_rnh(fibnum, info);
676 	if (rnh == NULL)
677 		return (EAFNOSUPPORT);
678 
679 	/*
680 	 * Check consistency between RTF_HOST flag and netmask
681 	 * existence.
682 	 */
683 	if (info->rti_flags & RTF_HOST)
684 		info->rti_info[RTAX_NETMASK] = NULL;
685 	else if (info->rti_info[RTAX_NETMASK] == NULL) {
686 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: no RTF_HOST and empty netmask");
687 		return (EINVAL);
688 	}
689 
690 	bzero(rc, sizeof(struct rib_cmd_info));
691 	rc->rc_cmd = RTM_ADD;
692 
693 	error = add_route_byinfo(rnh, info, rc);
694 	if (error == 0)
695 		rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
696 
697 	return (error);
698 }
699 
700 static int
701 add_route_byinfo(struct rib_head *rnh, struct rt_addrinfo *info,
702     struct rib_cmd_info *rc)
703 {
704 	struct route_nhop_data rnd_add;
705 	struct nhop_object *nh;
706 	struct rtentry *rt;
707 	struct sockaddr *dst, *gateway, *netmask;
708 	int error;
709 
710 	dst = info->rti_info[RTAX_DST];
711 	gateway = info->rti_info[RTAX_GATEWAY];
712 	netmask = info->rti_info[RTAX_NETMASK];
713 
714 	if ((info->rti_flags & RTF_GATEWAY) && !gateway) {
715 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: RTF_GATEWAY set with empty gw");
716 		return (EINVAL);
717 	}
718 	if (dst && gateway && !nhop_check_gateway(dst->sa_family, gateway->sa_family)) {
719 		FIB_RH_LOG(LOG_DEBUG, rnh,
720 		    "error: invalid dst/gateway family combination (%d, %d)",
721 		    dst->sa_family, gateway->sa_family);
722 		return (EINVAL);
723 	}
724 
725 	if (dst->sa_len > sizeof(((struct rtentry *)NULL)->rt_dstb)) {
726 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too large: %d",
727 		    dst->sa_len);
728 		return (EINVAL);
729 	}
730 
731 	if (info->rti_ifa == NULL) {
732 		error = rt_getifa_fib(info, rnh->rib_fibnum);
733 		if (error)
734 			return (error);
735 	}
736 
737 	if ((rt = rt_alloc(rnh, dst, netmask)) == NULL)
738 		return (ENOBUFS);
739 
740 	error = nhop_create_from_info(rnh, info, &nh);
741 	if (error != 0) {
742 		rt_free_immediate(rt);
743 		return (error);
744 	}
745 
746 	rnd_add.rnd_nhop = nh;
747 	rnd_add.rnd_weight = get_info_weight(info, RT_DEFAULT_WEIGHT);
748 
749 	int op_flags = RTM_F_CREATE;
750 	if (get_prio_from_info(info) == NH_PRIORITY_HIGH)
751 		op_flags |= RTM_F_FORCE;
752 	else
753 		op_flags |= RTM_F_APPEND;
754 	return (add_route_flags(rnh, rt, &rnd_add, op_flags, rc));
755 
756 }
757 
758 static int
759 add_route_flags(struct rib_head *rnh, struct rtentry *rt, struct route_nhop_data *rnd_add,
760     int op_flags, struct rib_cmd_info *rc)
761 {
762 	struct route_nhop_data rnd_orig;
763 	struct nhop_object *nh;
764 	struct rtentry *rt_orig;
765 	int error = 0;
766 
767 	nh = rnd_add->rnd_nhop;
768 
769 	RIB_WLOCK(rnh);
770 
771 	rt_orig = lookup_prefix_rt(rnh, rt, &rnd_orig);
772 
773 	if (rt_orig == NULL) {
774 		if (op_flags & RTM_F_CREATE)
775 			error = add_route(rnh, rt, rnd_add, rc);
776 		else
777 			error = ESRCH; /* no entry but creation was not required */
778 		RIB_WUNLOCK(rnh);
779 		if (error != 0)
780 			goto out;
781 		return (0);
782 	}
783 
784 	if (op_flags & RTM_F_EXCL) {
785 		/* We have existing route in the RIB but not allowed to replace. */
786 		RIB_WUNLOCK(rnh);
787 		error = EEXIST;
788 		goto out;
789 	}
790 
791 	/* Now either append or replace */
792 	if (op_flags & RTM_F_REPLACE) {
793 		if (nhop_get_prio(rnd_orig.rnd_nhop) > nhop_get_prio(rnd_add->rnd_nhop)) {
794 			/* Old path is "better" (e.g. has PINNED flag set) */
795 			error = EEXIST;
796 			goto out;
797 		}
798 		change_route(rnh, rt_orig, rnd_add, rc);
799 		RIB_WUNLOCK(rnh);
800 		nh = rc->rc_nh_old;
801 		goto out;
802 	}
803 
804 	RIB_WUNLOCK(rnh);
805 
806 #ifdef ROUTE_MPATH
807 	if ((op_flags & RTM_F_APPEND) && rib_can_multipath(rnh) &&
808 	    nhop_can_multipath(rnd_add->rnd_nhop) &&
809 	    nhop_can_multipath(rnd_orig.rnd_nhop)) {
810 
811 		for (int i = 0; i < RIB_MAX_RETRIES; i++) {
812 			error = add_route_flags_mpath(rnh, rt_orig, rnd_add, &rnd_orig,
813 			    op_flags, rc);
814 			if (error != EAGAIN)
815 				break;
816 			RTSTAT_INC(rts_add_retry);
817 		}
818 
819 		/*
820 		 *  Original nhop reference is unused in any case.
821 		 */
822 		nhop_free_any(rnd_add->rnd_nhop);
823 		if (op_flags & RTM_F_CREATE) {
824 			if (error != 0 || rc->rc_cmd != RTM_ADD)
825 				rt_free_immediate(rt);
826 		}
827 		return (error);
828 	}
829 #endif
830 	/* Out of options - free state and return error */
831 	error = EEXIST;
832 out:
833 	if (op_flags & RTM_F_CREATE)
834 		rt_free_immediate(rt);
835 	nhop_free_any(nh);
836 
837 	return (error);
838 }
839 
840 #ifdef ROUTE_MPATH
841 static int
842 add_route_flags_mpath(struct rib_head *rnh, struct rtentry *rt,
843     struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_orig,
844     int op_flags, struct rib_cmd_info *rc)
845 {
846 	RIB_RLOCK_TRACKER;
847 	struct route_nhop_data rnd_new;
848 	int error = 0;
849 
850 	error = nhgrp_get_addition_group(rnh, rnd_orig, rnd_add, &rnd_new);
851 	if (error != 0) {
852 		if (error == EAGAIN) {
853 			/*
854 			 * Group creation failed, most probably because
855 			 * @rnd_orig data got scheduled for deletion.
856 			 * Refresh @rnd_orig data and retry.
857 			 */
858 			RIB_RLOCK(rnh);
859 			lookup_prefix_rt(rnh, rt, rnd_orig);
860 			RIB_RUNLOCK(rnh);
861 			if (rnd_orig == NULL && !(op_flags & RTM_F_CREATE)) {
862 				/* In this iteration route doesn't exist */
863 				error = ENOENT;
864 			}
865 		}
866 		return (error);
867 	}
868 	error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
869 	if (error != 0)
870 		return (error);
871 
872 	if (V_fib_hash_outbound == 0 && NH_IS_NHGRP(rc->rc_nh_new)) {
873 		/*
874 		 * First multipath route got installed. Enable local
875 		 * outbound connections hashing.
876 		 */
877 		if (bootverbose)
878 			printf("FIB: enabled flowid calculation for locally-originated packets\n");
879 		V_fib_hash_outbound = 1;
880 	}
881 
882 	return (0);
883 }
884 #endif
885 
886 /*
887  * Removes route defined by @info from the kernel table specified by @fibnum and
888  * sa_family in @info->rti_info[RTAX_DST].
889  *
890  * Returns 0 on success and fills in operation metadata into @rc.
891  */
892 int
893 rib_del_route(uint32_t fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc)
894 {
895 	struct rib_head *rnh;
896 	struct sockaddr *dst, *netmask;
897 	struct sockaddr_storage mdst;
898 	int error;
899 
900 	NET_EPOCH_ASSERT();
901 
902 	rnh = get_rnh(fibnum, info);
903 	if (rnh == NULL)
904 		return (EAFNOSUPPORT);
905 
906 	bzero(rc, sizeof(struct rib_cmd_info));
907 	rc->rc_cmd = RTM_DELETE;
908 
909 	dst = info->rti_info[RTAX_DST];
910 	netmask = info->rti_info[RTAX_NETMASK];
911 
912 	if (netmask != NULL) {
913 		/* Ensure @dst is always properly masked */
914 		if (dst->sa_len > sizeof(mdst)) {
915 			FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too large");
916 			return (EINVAL);
917 		}
918 		rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask);
919 		dst = (struct sockaddr *)&mdst;
920 	}
921 
922 	rib_filter_f_t *filter_func = NULL;
923 	void *filter_arg = NULL;
924 	struct gw_filter_data gwd = { .gw = info->rti_info[RTAX_GATEWAY] };
925 
926 	if (info->rti_filter != NULL) {
927 		filter_func = info->rti_filter;
928 		filter_arg = info->rti_filterdata;
929 	} else if (gwd.gw != NULL) {
930 		filter_func = match_gw_one;
931 		filter_arg = &gwd;
932 	}
933 
934 	int prio = get_prio_from_info(info);
935 
936 	RIB_WLOCK(rnh);
937 	struct route_nhop_data rnd;
938 	struct rtentry *rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd);
939 	if (rt != NULL) {
940 		error = rt_delete_conditional(rnh, rt, prio, filter_func,
941 		    filter_arg, rc);
942 	} else
943 		error = ESRCH;
944 	RIB_WUNLOCK(rnh);
945 
946 	if (error != 0)
947 		return (error);
948 
949 	rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
950 
951 	if (rc->rc_cmd == RTM_DELETE)
952 		rt_free(rc->rc_rt);
953 #ifdef ROUTE_MPATH
954 	else {
955 		/*
956 		 * Deleting 1 path may result in RTM_CHANGE to
957 		 * a different mpath group/nhop.
958 		 * Free old mpath group.
959 		 */
960 		nhop_free_any(rc->rc_nh_old);
961 	}
962 #endif
963 
964 	return (0);
965 }
966 
967 /*
968  * Conditionally unlinks rtentry paths from @rnh matching @cb.
969  * Returns 0 on success with operation result stored in @rc.
970  * On error, returns:
971  * ESRCH - if prefix was not found or filter function failed to match
972  * EADDRINUSE - if trying to delete higher priority route.
973  */
974 static int
975 rt_delete_conditional(struct rib_head *rnh, struct rtentry *rt,
976     int prio, rib_filter_f_t *cb, void *cbdata, struct rib_cmd_info *rc)
977 {
978 	struct nhop_object *nh = rt->rt_nhop;
979 
980 #ifdef ROUTE_MPATH
981 	if (NH_IS_NHGRP(nh)) {
982 		struct nhgrp_object *nhg = (struct nhgrp_object *)nh;
983 		struct route_nhop_data rnd;
984 		int error;
985 
986 		if (cb == NULL)
987 			return (ESRCH);
988 		error = nhgrp_get_filtered_group(rnh, rt, nhg, cb, cbdata, &rnd);
989 		if (error == 0) {
990 			if (rnd.rnd_nhgrp == nhg) {
991 				/* No match, unreference new group and return. */
992 				nhop_free_any(rnd.rnd_nhop);
993 				return (ESRCH);
994 			}
995 			error = change_route(rnh, rt, &rnd, rc);
996 		}
997 		return (error);
998 	}
999 #endif
1000 	if (cb != NULL && !cb(rt, nh, cbdata))
1001 		return (ESRCH);
1002 
1003 	if (prio < nhop_get_prio(nh))
1004 		return (EADDRINUSE);
1005 
1006 	return (delete_route(rnh, rt, rc));
1007 }
1008 
1009 int
1010 rib_change_route(uint32_t fibnum, struct rt_addrinfo *info,
1011     struct rib_cmd_info *rc)
1012 {
1013 	RIB_RLOCK_TRACKER;
1014 	struct route_nhop_data rnd_orig;
1015 	struct rib_head *rnh;
1016 	struct rtentry *rt;
1017 	int error;
1018 
1019 	NET_EPOCH_ASSERT();
1020 
1021 	rnh = get_rnh(fibnum, info);
1022 	if (rnh == NULL)
1023 		return (EAFNOSUPPORT);
1024 
1025 	bzero(rc, sizeof(struct rib_cmd_info));
1026 	rc->rc_cmd = RTM_CHANGE;
1027 
1028 	/* Check if updated gateway exists */
1029 	if ((info->rti_flags & RTF_GATEWAY) &&
1030 	    (info->rti_info[RTAX_GATEWAY] == NULL)) {
1031 
1032 		/*
1033 		 * route(8) adds RTF_GATEWAY flag if -interface is not set.
1034 		 * Remove RTF_GATEWAY to enforce consistency and maintain
1035 		 * compatibility..
1036 		 */
1037 		info->rti_flags &= ~RTF_GATEWAY;
1038 	}
1039 
1040 	/*
1041 	 * route change is done in multiple steps, with dropping and
1042 	 * reacquiring lock. In the situations with multiple processes
1043 	 * changes the same route in can lead to the case when route
1044 	 * is changed between the steps. Address it by retrying the operation
1045 	 * multiple times before failing.
1046 	 */
1047 
1048 	RIB_RLOCK(rnh);
1049 	rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST],
1050 	    info->rti_info[RTAX_NETMASK], &rnh->head);
1051 
1052 	if (rt == NULL) {
1053 		RIB_RUNLOCK(rnh);
1054 		return (ESRCH);
1055 	}
1056 
1057 	rnd_orig.rnd_nhop = rt->rt_nhop;
1058 	rnd_orig.rnd_weight = rt->rt_weight;
1059 
1060 	RIB_RUNLOCK(rnh);
1061 
1062 	for (int i = 0; i < RIB_MAX_RETRIES; i++) {
1063 		error = change_route_byinfo(rnh, rt, info, &rnd_orig, rc);
1064 		if (error != EAGAIN)
1065 			break;
1066 	}
1067 
1068 	return (error);
1069 }
1070 
1071 static int
1072 change_nhop(struct rib_head *rnh, struct rt_addrinfo *info,
1073     struct nhop_object *nh_orig, struct nhop_object **nh_new)
1074 {
1075 	int error;
1076 
1077 	/*
1078 	 * New gateway could require new ifaddr, ifp;
1079 	 * flags may also be different; ifp may be specified
1080 	 * by ll sockaddr when protocol address is ambiguous
1081 	 */
1082 	if (((nh_orig->nh_flags & NHF_GATEWAY) &&
1083 	    info->rti_info[RTAX_GATEWAY] != NULL) ||
1084 	    info->rti_info[RTAX_IFP] != NULL ||
1085 	    (info->rti_info[RTAX_IFA] != NULL &&
1086 	     !sa_equal(info->rti_info[RTAX_IFA], nh_orig->nh_ifa->ifa_addr))) {
1087 		error = rt_getifa_fib(info, rnh->rib_fibnum);
1088 
1089 		if (error != 0) {
1090 			info->rti_ifa = NULL;
1091 			return (error);
1092 		}
1093 	}
1094 
1095 	error = nhop_create_from_nhop(rnh, nh_orig, info, nh_new);
1096 	info->rti_ifa = NULL;
1097 
1098 	return (error);
1099 }
1100 
1101 #ifdef ROUTE_MPATH
1102 static int
1103 change_mpath_route(struct rib_head *rnh, struct rtentry *rt,
1104     struct rt_addrinfo *info, struct route_nhop_data *rnd_orig,
1105     struct rib_cmd_info *rc)
1106 {
1107 	int error = 0, found_idx = 0;
1108 	struct nhop_object *nh_orig = NULL, *nh_new;
1109 	struct route_nhop_data rnd_new = {};
1110 	const struct weightened_nhop *wn = NULL;
1111 	struct weightened_nhop *wn_new;
1112 	uint32_t num_nhops;
1113 
1114 	wn = nhgrp_get_nhops(rnd_orig->rnd_nhgrp, &num_nhops);
1115 	for (int i = 0; i < num_nhops; i++) {
1116 		if (check_info_match_nhop(info, NULL, wn[i].nh) == 0) {
1117 			nh_orig = wn[i].nh;
1118 			found_idx = i;
1119 			break;
1120 		}
1121 	}
1122 
1123 	if (nh_orig == NULL)
1124 		return (ESRCH);
1125 
1126 	error = change_nhop(rnh, info, nh_orig, &nh_new);
1127 	if (error != 0)
1128 		return (error);
1129 
1130 	wn_new = mallocarray(num_nhops, sizeof(struct weightened_nhop),
1131 	    M_TEMP, M_NOWAIT | M_ZERO);
1132 	if (wn_new == NULL) {
1133 		nhop_free(nh_new);
1134 		return (EAGAIN);
1135 	}
1136 
1137 	memcpy(wn_new, wn, num_nhops * sizeof(struct weightened_nhop));
1138 	wn_new[found_idx].nh = nh_new;
1139 	wn_new[found_idx].weight = get_info_weight(info, wn[found_idx].weight);
1140 
1141 	error = nhgrp_get_group(rnh, wn_new, num_nhops, 0, &rnd_new.rnd_nhgrp);
1142 	nhop_free(nh_new);
1143 	free(wn_new, M_TEMP);
1144 
1145 	if (error != 0)
1146 		return (error);
1147 
1148 	error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
1149 
1150 	return (error);
1151 }
1152 #endif
1153 
1154 static int
1155 change_route_byinfo(struct rib_head *rnh, struct rtentry *rt,
1156     struct rt_addrinfo *info, struct route_nhop_data *rnd_orig,
1157     struct rib_cmd_info *rc)
1158 {
1159 	int error = 0;
1160 	struct nhop_object *nh_orig;
1161 	struct route_nhop_data rnd_new;
1162 
1163 	nh_orig = rnd_orig->rnd_nhop;
1164 	if (nh_orig == NULL)
1165 		return (ESRCH);
1166 
1167 #ifdef ROUTE_MPATH
1168 	if (NH_IS_NHGRP(nh_orig))
1169 		return (change_mpath_route(rnh, rt, info, rnd_orig, rc));
1170 #endif
1171 
1172 	rnd_new.rnd_weight = get_info_weight(info, rnd_orig->rnd_weight);
1173 	error = change_nhop(rnh, info, nh_orig, &rnd_new.rnd_nhop);
1174 	if (error != 0)
1175 		return (error);
1176 	error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
1177 
1178 	return (error);
1179 }
1180 
1181 /*
1182  * Insert @rt with nhop data from @rnd_new to @rnh.
1183  * Returns 0 on success and stores operation results in @rc.
1184  */
1185 static int
1186 add_route(struct rib_head *rnh, struct rtentry *rt,
1187     struct route_nhop_data *rnd, struct rib_cmd_info *rc)
1188 {
1189 	struct radix_node *rn;
1190 
1191 	RIB_WLOCK_ASSERT(rnh);
1192 
1193 	rt->rt_nhop = rnd->rnd_nhop;
1194 	rt->rt_weight = rnd->rnd_weight;
1195 	rn = rnh->rnh_addaddr(rt_key(rt), rt_mask_const(rt), &rnh->head, rt->rt_nodes);
1196 
1197 	if (rn != NULL) {
1198 		if (!NH_IS_NHGRP(rnd->rnd_nhop) && nhop_get_expire(rnd->rnd_nhop))
1199 			tmproutes_update(rnh, rt, rnd->rnd_nhop);
1200 
1201 		/* Finalize notification */
1202 		rib_bump_gen(rnh);
1203 		rnh->rnh_prefixes++;
1204 
1205 		rc->rc_cmd = RTM_ADD;
1206 		rc->rc_rt = rt;
1207 		rc->rc_nh_old = NULL;
1208 		rc->rc_nh_new = rnd->rnd_nhop;
1209 		rc->rc_nh_weight = rnd->rnd_weight;
1210 
1211 		rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1212 		return (0);
1213 	}
1214 
1215 	/* Existing route or memory allocation failure. */
1216 	return (EEXIST);
1217 }
1218 
1219 /*
1220  * Unconditionally deletes @rt from @rnh.
1221  */
1222 static int
1223 delete_route(struct rib_head *rnh, struct rtentry *rt, struct rib_cmd_info *rc)
1224 {
1225 	RIB_WLOCK_ASSERT(rnh);
1226 
1227 	/* Route deletion requested. */
1228 	struct radix_node *rn;
1229 
1230 	rn = rnh->rnh_deladdr(rt_key_const(rt), rt_mask_const(rt), &rnh->head);
1231 	if (rn == NULL)
1232 		return (ESRCH);
1233 	rt = RNTORT(rn);
1234 	rt->rte_flags &= ~RTF_UP;
1235 
1236 	rib_bump_gen(rnh);
1237 	rnh->rnh_prefixes--;
1238 
1239 	rc->rc_cmd = RTM_DELETE;
1240 	rc->rc_rt = rt;
1241 	rc->rc_nh_old = rt->rt_nhop;
1242 	rc->rc_nh_new = NULL;
1243 	rc->rc_nh_weight = rt->rt_weight;
1244 
1245 	rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1246 
1247 	return (0);
1248 }
1249 
1250 /*
1251  * Switch @rt nhop/weigh to the ones specified in @rnd.
1252  * Returns 0 on success.
1253  */
1254 int
1255 change_route(struct rib_head *rnh, struct rtentry *rt,
1256     struct route_nhop_data *rnd, struct rib_cmd_info *rc)
1257 {
1258 	struct nhop_object *nh_orig;
1259 
1260 	RIB_WLOCK_ASSERT(rnh);
1261 
1262 	nh_orig = rt->rt_nhop;
1263 
1264 	if (rnd->rnd_nhop == NULL)
1265 		return (delete_route(rnh, rt, rc));
1266 
1267 	/* Changing nexthop & weight to a new one */
1268 	rt->rt_nhop = rnd->rnd_nhop;
1269 	rt->rt_weight = rnd->rnd_weight;
1270 	if (!NH_IS_NHGRP(rnd->rnd_nhop) && nhop_get_expire(rnd->rnd_nhop))
1271 		tmproutes_update(rnh, rt, rnd->rnd_nhop);
1272 
1273 	/* Finalize notification */
1274 	rib_bump_gen(rnh);
1275 	rc->rc_cmd = RTM_CHANGE;
1276 	rc->rc_rt = rt;
1277 	rc->rc_nh_old = nh_orig;
1278 	rc->rc_nh_new = rnd->rnd_nhop;
1279 	rc->rc_nh_weight = rnd->rnd_weight;
1280 
1281 	rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1282 
1283 	return (0);
1284 }
1285 
1286 /*
1287  * Conditionally update route nhop/weight IFF data in @nhd_orig is
1288  *  consistent with the current route data.
1289  * Nexthop in @nhd_new is consumed.
1290  */
1291 int
1292 change_route_conditional(struct rib_head *rnh, struct rtentry *rt,
1293     struct route_nhop_data *rnd_orig, struct route_nhop_data *rnd_new,
1294     struct rib_cmd_info *rc)
1295 {
1296 	struct rtentry *rt_new;
1297 	int error = 0;
1298 
1299 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
1300 		char buf_old[NHOP_PRINT_BUFSIZE], buf_new[NHOP_PRINT_BUFSIZE];
1301 		nhop_print_buf_any(rnd_orig->rnd_nhop, buf_old, NHOP_PRINT_BUFSIZE);
1302 		nhop_print_buf_any(rnd_new->rnd_nhop, buf_new, NHOP_PRINT_BUFSIZE);
1303 		FIB_LOG(LOG_DEBUG2, rnh->rib_fibnum, rnh->rib_family,
1304 		    "trying change %s -> %s", buf_old, buf_new);
1305 	}
1306 	RIB_WLOCK(rnh);
1307 
1308 	struct route_nhop_data rnd;
1309 	rt_new = lookup_prefix_rt(rnh, rt, &rnd);
1310 
1311 	if (rt_new == NULL) {
1312 		if (rnd_orig->rnd_nhop == NULL)
1313 			error = add_route(rnh, rt, rnd_new, rc);
1314 		else {
1315 			/*
1316 			 * Prefix does not exist, which was not our assumption.
1317 			 * Update @rnd_orig with the new data and return
1318 			 */
1319 			rnd_orig->rnd_nhop = NULL;
1320 			rnd_orig->rnd_weight = 0;
1321 			error = EAGAIN;
1322 		}
1323 	} else {
1324 		/* Prefix exists, try to update */
1325 		if (rnd_orig->rnd_nhop == rt_new->rt_nhop) {
1326 			/*
1327 			 * Nhop/mpath group hasn't changed. Flip
1328 			 * to the new precalculated one and return
1329 			 */
1330 			error = change_route(rnh, rt_new, rnd_new, rc);
1331 		} else {
1332 			/* Update and retry */
1333 			rnd_orig->rnd_nhop = rt_new->rt_nhop;
1334 			rnd_orig->rnd_weight = rt_new->rt_weight;
1335 			error = EAGAIN;
1336 		}
1337 	}
1338 
1339 	RIB_WUNLOCK(rnh);
1340 
1341 	if (error == 0) {
1342 		rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
1343 
1344 		if (rnd_orig->rnd_nhop != NULL)
1345 			nhop_free_any(rnd_orig->rnd_nhop);
1346 
1347 	} else {
1348 		if (rnd_new->rnd_nhop != NULL)
1349 			nhop_free_any(rnd_new->rnd_nhop);
1350 	}
1351 
1352 	return (error);
1353 }
1354 
1355 /*
1356  * Performs modification of routing table specificed by @action.
1357  * Table is specified by @fibnum and sa_family in @info->rti_info[RTAX_DST].
1358  * Needs to be run in network epoch.
1359  *
1360  * Returns 0 on success and fills in @rc with action result.
1361  */
1362 int
1363 rib_action(uint32_t fibnum, int action, struct rt_addrinfo *info,
1364     struct rib_cmd_info *rc)
1365 {
1366 	int error;
1367 
1368 	switch (action) {
1369 	case RTM_ADD:
1370 		error = rib_add_route(fibnum, info, rc);
1371 		break;
1372 	case RTM_DELETE:
1373 		error = rib_del_route(fibnum, info, rc);
1374 		break;
1375 	case RTM_CHANGE:
1376 		error = rib_change_route(fibnum, info, rc);
1377 		break;
1378 	default:
1379 		error = ENOTSUP;
1380 	}
1381 
1382 	return (error);
1383 }
1384 
1385 struct rt_delinfo
1386 {
1387 	struct rib_head *rnh;
1388 	struct rtentry *head;
1389 	rib_filter_f_t *filter_f;
1390 	void *filter_arg;
1391 	int prio;
1392 	struct rib_cmd_info rc;
1393 };
1394 
1395 /*
1396  * Conditionally unlinks rtenties or paths from radix tree based
1397  * on the callback data passed in @arg.
1398  */
1399 static int
1400 rt_checkdelroute(struct radix_node *rn, void *arg)
1401 {
1402 	struct rt_delinfo *di = (struct rt_delinfo *)arg;
1403 	struct rtentry *rt = (struct rtentry *)rn;
1404 
1405 	if (rt_delete_conditional(di->rnh, rt, di->prio,
1406 	    di->filter_f, di->filter_arg, &di->rc) != 0)
1407 		return (0);
1408 
1409 	/*
1410 	 * Add deleted rtentries to the list to GC them
1411 	 *  after dropping the lock.
1412 	 *
1413 	 * XXX: Delayed notifications not implemented
1414 	 *  for nexthop updates.
1415 	 */
1416 	if (di->rc.rc_cmd == RTM_DELETE) {
1417 		/* Add to the list and return */
1418 		rt->rt_chain = di->head;
1419 		di->head = rt;
1420 #ifdef ROUTE_MPATH
1421 	} else {
1422 		/*
1423 		 * RTM_CHANGE to a different nexthop or nexthop group.
1424 		 * Free old multipath group.
1425 		 */
1426 		nhop_free_any(di->rc.rc_nh_old);
1427 #endif
1428 	}
1429 
1430 	return (0);
1431 }
1432 
1433 /*
1434  * Iterates over a routing table specified by @fibnum and @family and
1435  *  deletes elements marked by @filter_f.
1436  * @fibnum: rtable id
1437  * @family: AF_ address family
1438  * @filter_f: function returning non-zero value for items to delete
1439  * @arg: data to pass to the @filter_f function
1440  * @report: true if rtsock notification is needed.
1441  */
1442 void
1443 rib_walk_del(u_int fibnum, int family, rib_filter_f_t *filter_f, void *filter_arg,
1444     bool report)
1445 {
1446 	struct rib_head *rnh;
1447 	struct rtentry *rt;
1448 	struct nhop_object *nh;
1449 	struct epoch_tracker et;
1450 
1451 	rnh = rt_tables_get_rnh(fibnum, family);
1452 	if (rnh == NULL)
1453 		return;
1454 
1455 	struct rt_delinfo di = {
1456 		.rnh = rnh,
1457 		.filter_f = filter_f,
1458 		.filter_arg = filter_arg,
1459 		.prio = NH_PRIORITY_NORMAL,
1460 	};
1461 
1462 	NET_EPOCH_ENTER(et);
1463 
1464 	RIB_WLOCK(rnh);
1465 	rnh->rnh_walktree(&rnh->head, rt_checkdelroute, &di);
1466 	RIB_WUNLOCK(rnh);
1467 
1468 	/* We might have something to reclaim. */
1469 	bzero(&di.rc, sizeof(di.rc));
1470 	di.rc.rc_cmd = RTM_DELETE;
1471 	while (di.head != NULL) {
1472 		rt = di.head;
1473 		di.head = rt->rt_chain;
1474 		rt->rt_chain = NULL;
1475 		nh = rt->rt_nhop;
1476 
1477 		di.rc.rc_rt = rt;
1478 		di.rc.rc_nh_old = nh;
1479 		rib_notify(rnh, RIB_NOTIFY_DELAYED, &di.rc);
1480 
1481 		if (report) {
1482 #ifdef ROUTE_MPATH
1483 			struct nhgrp_object *nhg;
1484 			const struct weightened_nhop *wn;
1485 			uint32_t num_nhops;
1486 			if (NH_IS_NHGRP(nh)) {
1487 				nhg = (struct nhgrp_object *)nh;
1488 				wn = nhgrp_get_nhops(nhg, &num_nhops);
1489 				for (int i = 0; i < num_nhops; i++)
1490 					rt_routemsg(RTM_DELETE, rt, wn[i].nh, fibnum);
1491 			} else
1492 #endif
1493 			rt_routemsg(RTM_DELETE, rt, nh, fibnum);
1494 		}
1495 		rt_free(rt);
1496 	}
1497 
1498 	NET_EPOCH_EXIT(et);
1499 }
1500 
1501 static int
1502 rt_delete_unconditional(struct radix_node *rn, void *arg)
1503 {
1504 	struct rtentry *rt = RNTORT(rn);
1505 	struct rib_head *rnh = (struct rib_head *)arg;
1506 
1507 	rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), &rnh->head);
1508 	if (RNTORT(rn) == rt)
1509 		rt_free(rt);
1510 
1511 	return (0);
1512 }
1513 
1514 /*
1515  * Removes all routes from the routing table without executing notifications.
1516  * rtentres will be removed after the end of a current epoch.
1517  */
1518 static void
1519 rib_flush_routes(struct rib_head *rnh)
1520 {
1521 	RIB_WLOCK(rnh);
1522 	rnh->rnh_walktree(&rnh->head, rt_delete_unconditional, rnh);
1523 	RIB_WUNLOCK(rnh);
1524 }
1525 
1526 void
1527 rib_flush_routes_family(int family)
1528 {
1529 	struct rib_head *rnh;
1530 
1531 	for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) {
1532 		if ((rnh = rt_tables_get_rnh(fibnum, family)) != NULL)
1533 			rib_flush_routes(rnh);
1534 	}
1535 }
1536 
/*
 * Returns a constant, human-readable name for the address family
 * @family: "inet", "inet6" or "link"; "unknown" for anything else.
 */
const char *
rib_print_family(int family)
{
	if (family == AF_INET)
		return ("inet");
	if (family == AF_INET6)
		return ("inet6");
	if (family == AF_LINK)
		return ("link");

	return ("unknown");
}
1550 
1551