xref: /freebsd/sys/net/route/route_ctl.c (revision 7543a9c0280a0f4262489671936a6e03b9b2c563)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2020 Alexander V. Chernikov
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 #include "opt_inet.h"
31 #include "opt_inet6.h"
32 #include "opt_route.h"
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/malloc.h>
37 #include <sys/mbuf.h>
38 #include <sys/socket.h>
39 #include <sys/sysctl.h>
40 #include <sys/syslog.h>
41 #include <sys/kernel.h>
42 #include <sys/lock.h>
43 #include <sys/rmlock.h>
44 
45 #include <net/if.h>
46 #include <net/if_var.h>
47 #include <net/if_dl.h>
48 #include <net/vnet.h>
49 #include <net/route.h>
50 #include <net/route/route_ctl.h>
51 #include <net/route/route_var.h>
52 #include <net/route/nhop_utils.h>
53 #include <net/route/nhop.h>
54 #include <net/route/nhop_var.h>
55 #include <netinet/in.h>
56 #include <netinet6/scope6_var.h>
57 #include <netinet6/in6_var.h>
58 
59 #define	DEBUG_MOD_NAME	route_ctl
60 #define	DEBUG_MAX_LEVEL	LOG_DEBUG
61 #include <net/route/route_debug.h>
62 _DECLARE_DEBUG(LOG_INFO);
63 
64 /*
65  * This file contains control plane routing tables functions.
66  *
 * All functions assume they are called in net epoch.
68  */
69 
/*
 * Scratch storage that can be viewed as a generic, IPv4 or IPv6 sockaddr.
 * The prefix-based add/delete entry points below use it as on-stack space
 * for the netmask built from a prefix length.
 */
union sockaddr_union {
	struct sockaddr		sa;	/* generic view */
	struct sockaddr_in	sin;	/* AF_INET view */
	struct sockaddr_in6	sin6;	/* AF_INET6 view */
	char			_buf[32];	/* keeps the union at least 32 bytes */
};
76 
/* rt_addrinfo-driven helpers backing rib_add_route() / rib_change_route(). */
static int add_route_byinfo(struct rib_head *rnh, struct rt_addrinfo *info,
    struct rib_cmd_info *rc);
static int change_route_byinfo(struct rib_head *rnh, struct rtentry *rt,
    struct rt_addrinfo *info, struct route_nhop_data *nhd_orig,
    struct rib_cmd_info *rc);

/* Insert/append/replace logic driven by RTM_F_* operation flags. */
static int add_route_flags(struct rib_head *rnh, struct rtentry *rt,
    struct route_nhop_data *rnd_add, int op_flags, struct rib_cmd_info *rc);
#ifdef ROUTE_MPATH
static int add_route_flags_mpath(struct rib_head *rnh, struct rtentry *rt,
    struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_orig,
    int op_flags, struct rib_cmd_info *rc);
#endif

/* Low-level table manipulation primitives. */
static int add_route(struct rib_head *rnh, struct rtentry *rt,
    struct route_nhop_data *rnd, struct rib_cmd_info *rc);
static int delete_route(struct rib_head *rnh, struct rtentry *rt,
    struct rib_cmd_info *rc);
static int rt_delete_conditional(struct rib_head *rnh, struct rtentry *rt,
    int prio, rib_filter_f_t *cb, void *cbdata, struct rib_cmd_info *rc);

/* File-local route priority (normal vs. pinned) accessors. */
static int get_prio_from_info(const struct rt_addrinfo *info);
static int nhop_get_prio(const struct nhop_object *nh);

#ifdef ROUTE_MPATH
/* Reports the per-vnet multipath knob for the vnet owning @rh. */
static bool rib_can_multipath(struct rib_head *rh);
#endif
104 
/*
 * Per-vnet multipath routing configuration.
 *
 * net.route.multipath defaults to 1.  It is writable only when the kernel
 * is built with ROUTE_MPATH; otherwise it is exported read-only.
 */
SYSCTL_DECL(_net_route);
#define	V_rib_route_multipath	VNET(rib_route_multipath)
#ifdef ROUTE_MPATH
#define _MP_FLAGS	CTLFLAG_RW
#else
#define _MP_FLAGS	CTLFLAG_RD
#endif
VNET_DEFINE(u_int, rib_route_multipath) = 1;
SYSCTL_UINT(_net_route, OID_AUTO, multipath, _MP_FLAGS | CTLFLAG_VNET,
    &VNET_NAME(rib_route_multipath), 0, "Enable route multipath");
#undef _MP_FLAGS

#ifdef ROUTE_MPATH
/*
 * net.route.hash_outbound: read-only, starts at 0.  It is switched to 1
 * by add_route_flags_mpath() once a nexthop group gets installed.
 */
VNET_DEFINE(u_int, fib_hash_outbound) = 0;
SYSCTL_UINT(_net_route, OID_AUTO, hash_outbound, CTLFLAG_RD | CTLFLAG_VNET,
    &VNET_NAME(fib_hash_outbound), 0,
    "Compute flowid for locally-originated packets");

/* Default entropy to add to the hash calculation for outbound connections. */
uint8_t mpath_entropy_key[MPATH_ENTROPY_KEY_LEN] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
};
#endif

#if defined(INET) && defined(INET6)
/*
 * net.route.ipv6_nexthop: per-vnet, read-write, defaults to 1.
 * Queried through rib_can_4o6_nhop().
 */
FEATURE(ipv4_rfc5549_support, "Route IPv4 packets via IPv6 nexthops");
#define V_rib_route_ipv6_nexthop VNET(rib_route_ipv6_nexthop)
VNET_DEFINE_STATIC(u_int, rib_route_ipv6_nexthop) = 1;
SYSCTL_UINT(_net_route, OID_AUTO, ipv6_nexthop, CTLFLAG_RW | CTLFLAG_VNET,
    &VNET_NAME(rib_route_ipv6_nexthop), 0, "Enable IPv4 route via IPv6 Next Hop address");
#endif

/* Debug bits: parent node for net.route.debug.* */
SYSCTL_NODE(_net_route, OID_AUTO, debug, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
144 
145 static struct rib_head *
146 get_rnh(uint32_t fibnum, const struct rt_addrinfo *info)
147 {
148 	struct rib_head *rnh;
149 	struct sockaddr *dst;
150 
151 	KASSERT((fibnum < rt_numfibs), ("rib_add_route: bad fibnum"));
152 
153 	dst = info->rti_info[RTAX_DST];
154 	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
155 
156 	return (rnh);
157 }
158 
#if defined(INET) && defined(INET6)
/*
 * Reports whether the net.route.ipv6_nexthop knob is non-zero in the
 * current vnet.
 */
bool
rib_can_4o6_nhop(void)
{
	return (V_rib_route_ipv6_nexthop != 0);
}
#endif
166 
167 #ifdef ROUTE_MPATH
168 static bool
169 rib_can_multipath(struct rib_head *rh)
170 {
171 	int result;
172 
173 	CURVNET_SET(rh->rib_vnet);
174 	result = !!V_rib_route_multipath;
175 	CURVNET_RESTORE();
176 
177 	return (result);
178 }
179 
180 /*
181  * Check is nhop is multipath-eligible.
182  * Avoid nhops without gateways and redirects.
183  *
184  * Returns 1 for multipath-eligible nexthop,
185  * 0 otherwise.
186  */
187 bool
188 nhop_can_multipath(const struct nhop_object *nh)
189 {
190 
191 	if ((nh->nh_flags & NHF_MULTIPATH) != 0)
192 		return (1);
193 	if ((nh->nh_flags & NHF_GATEWAY) == 0)
194 		return (0);
195 	if ((nh->nh_flags & NHF_REDIRECT) != 0)
196 		return (0);
197 
198 	return (1);
199 }
200 #endif
201 
202 static int
203 get_info_weight(const struct rt_addrinfo *info, uint32_t default_weight)
204 {
205 	uint32_t weight;
206 
207 	if (info->rti_mflags & RTV_WEIGHT)
208 		weight = info->rti_rmx->rmx_weight;
209 	else
210 		weight = default_weight;
211 	/* Keep upper 1 byte for adm distance purposes */
212 	if (weight > RT_MAX_WEIGHT)
213 		weight = RT_MAX_WEIGHT;
214 	else if (weight == 0)
215 		weight = default_weight;
216 
217 	return (weight);
218 }
219 
220 /*
221  * File-local concept for distingushing between the normal and
222  * RTF_PINNED routes tha can override the "normal" one.
223  */
224 #define	NH_PRIORITY_HIGH	2
225 #define	NH_PRIORITY_NORMAL	1
226 static int
227 get_prio_from_info(const struct rt_addrinfo *info)
228 {
229 	if (info->rti_flags & RTF_PINNED)
230 		return (NH_PRIORITY_HIGH);
231 	return (NH_PRIORITY_NORMAL);
232 }
233 
234 static int
235 nhop_get_prio(const struct nhop_object *nh)
236 {
237 	if (NH_IS_PINNED(nh))
238 		return (NH_PRIORITY_HIGH);
239 	return (NH_PRIORITY_NORMAL);
240 }
241 
242 /*
243  * Check if specified @gw matches gw data in the nexthop @nh.
244  *
245  * Returns true if matches, false otherwise.
246  */
247 bool
248 match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw)
249 {
250 
251 	if (nh->gw_sa.sa_family != gw->sa_family)
252 		return (false);
253 
254 	switch (gw->sa_family) {
255 	case AF_INET:
256 		return (nh->gw4_sa.sin_addr.s_addr ==
257 		    ((const struct sockaddr_in *)gw)->sin_addr.s_addr);
258 	case AF_INET6:
259 		{
260 			const struct sockaddr_in6 *gw6;
261 			gw6 = (const struct sockaddr_in6 *)gw;
262 
263 			/*
264 			 * Currently (2020-09) IPv6 gws in kernel have their
265 			 * scope embedded. Once this becomes false, this code
266 			 * has to be revisited.
267 			 */
268 			if (IN6_ARE_ADDR_EQUAL(&nh->gw6_sa.sin6_addr,
269 			    &gw6->sin6_addr))
270 				return (true);
271 			return (false);
272 		}
273 	case AF_LINK:
274 		{
275 			const struct sockaddr_dl *sdl;
276 			sdl = (const struct sockaddr_dl *)gw;
277 			return (nh->gwl_sa.sdl_index == sdl->sdl_index);
278 		}
279 	default:
280 		return (memcmp(&nh->gw_sa, gw, nh->gw_sa.sa_len) == 0);
281 	}
282 
283 	/* NOTREACHED */
284 	return (false);
285 }
286 
/*
 * rib_filter_f-compatible wrapper around match_nhop_gw().
 *
 * @gw_sa is interpreted as a const struct sockaddr pointer; @rt is not
 * consulted.  Returns non-zero for every nexthop whose gateway matches.
 */
int
rib_match_gw(const struct rtentry *rt, const struct nhop_object *nh, void *gw_sa)
{

	return (match_nhop_gw(nh, (const struct sockaddr *)gw_sa));
}
298 
/*
 * State for the match_gw_one() filter callback.
 */
struct gw_filter_data {
	const struct sockaddr *gw;	/* gateway to compare nexthops against */
	int count;			/* number of matches seen so far */
};
303 
304 /*
305  * Matches first occurence of the gateway provided in @gwd
306  */
307 static int
308 match_gw_one(const struct rtentry *rt, const struct nhop_object *nh, void *_data)
309 {
310 	struct gw_filter_data *gwd = (struct gw_filter_data *)_data;
311 
312 	/* Return only first match to make rtsock happy */
313 	if (match_nhop_gw(nh, gwd->gw) && gwd->count++ == 0)
314 		return (1);
315 	return (0);
316 }
317 
318 /*
319  * Checks if data in @info matches nexhop @nh.
320  *
321  * Returns 0 on success,
322  * ESRCH if not matched,
323  * ENOENT if filter function returned false
324  */
325 int
326 check_info_match_nhop(const struct rt_addrinfo *info, const struct rtentry *rt,
327     const struct nhop_object *nh)
328 {
329 	const struct sockaddr *gw = info->rti_info[RTAX_GATEWAY];
330 
331 	if (info->rti_filter != NULL) {
332 	    if (info->rti_filter(rt, nh, info->rti_filterdata) == 0)
333 		    return (ENOENT);
334 	    else
335 		    return (0);
336 	}
337 	if ((gw != NULL) && !match_nhop_gw(nh, gw))
338 		return (ESRCH);
339 
340 	return (0);
341 }
342 
343 /*
344  * Runs exact prefix match based on @dst and @netmask.
345  * Returns matched @rtentry if found or NULL.
346  * If rtentry was found, saves nexthop / weight value into @rnd.
347  */
348 static struct rtentry *
349 lookup_prefix_bysa(struct rib_head *rnh, const struct sockaddr *dst,
350     const struct sockaddr *netmask, struct route_nhop_data *rnd)
351 {
352 	struct rtentry *rt;
353 
354 	RIB_LOCK_ASSERT(rnh);
355 
356 	rt = (struct rtentry *)rnh->rnh_lookup(dst, netmask, &rnh->head);
357 	if (rt != NULL) {
358 		rnd->rnd_nhop = rt->rt_nhop;
359 		rnd->rnd_weight = rt->rt_weight;
360 	} else {
361 		rnd->rnd_nhop = NULL;
362 		rnd->rnd_weight = 0;
363 	}
364 
365 	return (rt);
366 }
367 
/*
 * Looks up in @rnh the prefix described by the key and mask of @rt.
 * Returns the matched rtentry or NULL; @rnd receives the nexthop and
 * weight of the match (NULL / 0 when nothing was found).
 */
struct rtentry *
lookup_prefix_rt(struct rib_head *rnh, const struct rtentry *rt,
    struct route_nhop_data *rnd)
{
	const struct sockaddr *key = rt_key_const(rt);
	const struct sockaddr *mask = rt_mask_const(rt);

	return (lookup_prefix_bysa(rnh, key, mask, rnd));
}
374 
375 /*
376  * Runs exact prefix match based on dst/netmask from @info.
377  * Assumes RIB lock is held.
378  * Returns matched @rtentry if found or NULL.
379  * If rtentry was found, saves nexthop / weight value into @rnd.
380  */
381 struct rtentry *
382 lookup_prefix(struct rib_head *rnh, const struct rt_addrinfo *info,
383     struct route_nhop_data *rnd)
384 {
385 	struct rtentry *rt;
386 
387 	rt = lookup_prefix_bysa(rnh, info->rti_info[RTAX_DST],
388 	    info->rti_info[RTAX_NETMASK], rnd);
389 
390 	return (rt);
391 }
392 
/*
 * Builds a netmask for prefix length @plen of address family @family and
 * applies it to @_dst in place.
 *
 * @family: AF_INET or AF_INET6 (each only when compiled in)
 * @plen: prefix length, or -1 for "no mask"
 * @_dst: destination sockaddr; host bits are cleared for non-host prefixes
 * @pmask: in: pointer to caller storage for the mask;
 *         out: left pointing at the filled mask, or set to NULL when no
 *         mask applies (@plen is -1 or a full-length host prefix)
 *
 * Returns true on success, false for an unsupported family or a prefix
 * length outside the valid range.  Note that the mask storage is cleared
 * before the prefix length is validated.
 */
static bool
fill_pxmask_family(int family, int plen, struct sockaddr *_dst,
    struct sockaddr **pmask)
{
	if (plen == -1) {
		*pmask = NULL;
		return (true);
	}

	switch (family) {
#ifdef INET
	case AF_INET:
		{
			struct sockaddr_in *mask = (struct sockaddr_in *)(*pmask);
			struct sockaddr_in *dst= (struct sockaddr_in *)_dst;

			memset(mask, 0, sizeof(*mask));
			mask->sin_family = family;
			mask->sin_len = sizeof(*mask);
			if (plen == 32)
				*pmask = NULL;
			else if (plen > 32 || plen < 0)
				return (false);
			else {
				uint32_t daddr, maddr;
				/*
				 * Shift an unsigned 1: with plen == 1 the
				 * shift count is 31, which overflows a
				 * signed int.
				 */
				maddr = htonl(plen ? ~((1U << (32 - plen)) - 1) : 0);
				mask->sin_addr.s_addr = maddr;
				daddr = dst->sin_addr.s_addr;
				daddr = htonl(ntohl(daddr) & ntohl(maddr));
				dst->sin_addr.s_addr = daddr;
			}
			return (true);
		}
		break;
#endif
#ifdef INET6
	case AF_INET6:
		{
			struct sockaddr_in6 *mask = (struct sockaddr_in6 *)(*pmask);
			struct sockaddr_in6 *dst = (struct sockaddr_in6 *)_dst;

			memset(mask, 0, sizeof(*mask));
			mask->sin6_family = family;
			mask->sin6_len = sizeof(*mask);
			if (plen == 128)
				*pmask = NULL;
			else if (plen > 128 || plen < 0)
				return (false);
			else {
				ip6_writemask(&mask->sin6_addr, plen);
				IN6_MASK_ADDR(&dst->sin6_addr, &mask->sin6_addr);
			}
			return (true);
		}
		break;
#endif
	}
	return (false);
}
452 
453 /*
454  * Attempts to add @dst/plen prefix with nexthop/nexhopgroup data @rnd
455  * to the routing table.
456  *
457  * @fibnum: rtable id to insert route to
458  * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
459  * @plen: prefix length (or -1 if host route or not applicable for AF)
460  * @op_flags: combination of RTM_F_ flags
461  * @rc: storage to report operation result
462  *
463  * Returns 0 on success.
464  */
465 int
466 rib_add_route_px(uint32_t fibnum, struct sockaddr *dst, int plen,
467     struct route_nhop_data *rnd, int op_flags, struct rib_cmd_info *rc)
468 {
469 	union sockaddr_union mask_storage;
470 	struct sockaddr *netmask = &mask_storage.sa;
471 	struct rtentry *rt = NULL;
472 
473 	NET_EPOCH_ASSERT();
474 
475 	bzero(rc, sizeof(struct rib_cmd_info));
476 	rc->rc_cmd = RTM_ADD;
477 
478 	struct rib_head *rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
479 	if (rnh == NULL)
480 		return (EAFNOSUPPORT);
481 
482 	if (!fill_pxmask_family(dst->sa_family, plen, dst, &netmask)) {
483 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: invalid plen %d", plen);
484 		return (EINVAL);
485 	}
486 
487 	if (op_flags & RTM_F_CREATE) {
488 		if ((rt = rt_alloc(rnh, dst, netmask)) == NULL) {
489 			FIB_RH_LOG(LOG_INFO, rnh, "rtentry allocation failed");
490 			return (ENOMEM);
491 		}
492 	}
493 
494 	return (add_route_flags(rnh, rt, rnd, op_flags, rc));
495 }
496 
497 /*
498  * Attempts to delete @dst/plen prefix matching gateway @gw from the
499  *  routing rable.
500  *
501  * @fibnum: rtable id to remove route from
502  * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
503  * @plen: prefix length (or -1 if host route or not applicable for AF)
504  * @gw: gateway to match
505  * @op_flags: combination of RTM_F_ flags
506  * @rc: storage to report operation result
507  *
508  * Returns 0 on success.
509  */
510 int
511 rib_del_route_px_gw(uint32_t fibnum, struct sockaddr *dst, int plen,
512     const struct sockaddr *gw, int op_flags, struct rib_cmd_info *rc)
513 {
514 	struct gw_filter_data gwd = { .gw = gw };
515 
516 	return (rib_del_route_px(fibnum, dst, plen, match_gw_one, &gwd, op_flags, rc));
517 }
518 
519 /*
520  * Attempts to delete @dst/plen prefix matching @filter_func from the
521  *  routing rable.
522  *
523  * @fibnum: rtable id to remove route from
524  * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
525  * @plen: prefix length (or -1 if host route or not applicable for AF)
526  * @filter_func: func to be called for each nexthop of the prefix for matching
527  * @filter_arg: argument to pass to @filter_func
528  * @op_flags: combination of RTM_F_ flags
529  * @rc: storage to report operation result
530  *
531  * Returns 0 on success.
532  */
533 int
534 rib_del_route_px(uint32_t fibnum, struct sockaddr *dst, int plen,
535     rib_filter_f_t *filter_func, void *filter_arg, int op_flags,
536     struct rib_cmd_info *rc)
537 {
538 	union sockaddr_union mask_storage;
539 	struct sockaddr *netmask = &mask_storage.sa;
540 	int error;
541 
542 	NET_EPOCH_ASSERT();
543 
544 	bzero(rc, sizeof(struct rib_cmd_info));
545 	rc->rc_cmd = RTM_DELETE;
546 
547 	struct rib_head *rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
548 	if (rnh == NULL)
549 		return (EAFNOSUPPORT);
550 
551 	if (dst->sa_len > sizeof(mask_storage)) {
552 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too big: %d", dst->sa_len);
553 		return (EINVAL);
554 	}
555 
556 	if (!fill_pxmask_family(dst->sa_family, plen, dst, &netmask)) {
557 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: invalid plen %d", plen);
558 		return (EINVAL);
559 	}
560 
561 	int prio = (op_flags & RTM_F_FORCE) ? NH_PRIORITY_HIGH : NH_PRIORITY_NORMAL;
562 
563 	RIB_WLOCK(rnh);
564 	struct route_nhop_data rnd;
565 	struct rtentry *rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd);
566 	if (rt != NULL) {
567 		error = rt_delete_conditional(rnh, rt, prio, filter_func,
568 		    filter_arg, rc);
569 	} else
570 		error = ESRCH;
571 	RIB_WUNLOCK(rnh);
572 
573 	if (error != 0)
574 		return (error);
575 
576 	rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
577 
578 	if (rc->rc_cmd == RTM_DELETE)
579 		rt_free(rc->rc_rt);
580 #ifdef ROUTE_MPATH
581 	else {
582 		/*
583 		 * Deleting 1 path may result in RTM_CHANGE to
584 		 * a different mpath group/nhop.
585 		 * Free old mpath group.
586 		 */
587 		nhop_free_any(rc->rc_nh_old);
588 	}
589 #endif
590 
591 	return (0);
592 }
593 
594 /*
595  * Tries to copy route @rt from one rtable to the rtable specified by @dst_rh.
596  * @rt: route to copy.
597  * @rnd_src: nhop and weight. Multipath routes are not supported
598  * @rh_dst: target rtable.
599  * @rc: operation result storage
600  *
601  * Return 0 on success.
602  */
603 int
604 rib_copy_route(struct rtentry *rt, const struct route_nhop_data *rnd_src,
605     struct rib_head *rh_dst, struct rib_cmd_info *rc)
606 {
607 	struct nhop_object __diagused *nh_src = rnd_src->rnd_nhop;
608 	int error;
609 
610 	MPASS((nh_src->nh_flags & NHF_MULTIPATH) == 0);
611 
612 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
613 		char nhbuf[NHOP_PRINT_BUFSIZE], rtbuf[NHOP_PRINT_BUFSIZE];
614 		nhop_print_buf_any(nh_src, nhbuf, sizeof(nhbuf));
615 		rt_print_buf(rt, rtbuf, sizeof(rtbuf));
616 		FIB_RH_LOG(LOG_DEBUG2, rh_dst, "copying %s -> %s from fib %u",
617 		    rtbuf, nhbuf, nhop_get_fibnum(nh_src));
618 	}
619 	struct nhop_object *nh = nhop_alloc(rh_dst->rib_fibnum, rh_dst->rib_family);
620 	if (nh == NULL) {
621 		FIB_RH_LOG(LOG_INFO, rh_dst, "unable to allocate new nexthop");
622 		return (ENOMEM);
623 	}
624 	nhop_copy(nh, rnd_src->rnd_nhop);
625 	nhop_set_origin(nh, nhop_get_origin(rnd_src->rnd_nhop));
626 	nhop_set_fibnum(nh, rh_dst->rib_fibnum);
627 	nh = nhop_get_nhop_internal(rh_dst, nh, &error);
628 	if (error != 0) {
629 		FIB_RH_LOG(LOG_INFO, rh_dst,
630 		    "unable to finalize new nexthop: error %d", error);
631 		return (ENOMEM);
632 	}
633 
634 	struct rtentry *rt_new = rt_alloc(rh_dst, rt_key(rt), rt_mask(rt));
635 	if (rt_new == NULL) {
636 		FIB_RH_LOG(LOG_INFO, rh_dst, "unable to create new rtentry");
637 		nhop_free(nh);
638 		return (ENOMEM);
639 	}
640 
641 	struct route_nhop_data rnd = {
642 		.rnd_nhop = nh,
643 		.rnd_weight = rnd_src->rnd_weight
644 	};
645 	int op_flags = RTM_F_CREATE | (NH_IS_PINNED(nh) ? RTM_F_FORCE : 0);
646 	error = add_route_flags(rh_dst, rt_new, &rnd, op_flags, rc);
647 
648 	if (error != 0) {
649 		IF_DEBUG_LEVEL(LOG_DEBUG2) {
650 			char buf[NHOP_PRINT_BUFSIZE];
651 			rt_print_buf(rt_new, buf, sizeof(buf));
652 			FIB_RH_LOG(LOG_DEBUG, rh_dst,
653 			    "Unable to add route %s: error %d", buf, error);
654 		}
655 		nhop_free(nh);
656 		rt_free_immediate(rt_new);
657 	}
658 	return (error);
659 }
660 
661 /*
662  * Adds route defined by @info into the kernel table specified by @fibnum and
663  * sa_family in @info->rti_info[RTAX_DST].
664  *
665  * Returns 0 on success and fills in operation metadata into @rc.
666  */
667 int
668 rib_add_route(uint32_t fibnum, struct rt_addrinfo *info,
669     struct rib_cmd_info *rc)
670 {
671 	struct rib_head *rnh;
672 	int error;
673 
674 	NET_EPOCH_ASSERT();
675 
676 	rnh = get_rnh(fibnum, info);
677 	if (rnh == NULL)
678 		return (EAFNOSUPPORT);
679 
680 	/*
681 	 * Check consistency between RTF_HOST flag and netmask
682 	 * existence.
683 	 */
684 	if (info->rti_flags & RTF_HOST)
685 		info->rti_info[RTAX_NETMASK] = NULL;
686 	else if (info->rti_info[RTAX_NETMASK] == NULL) {
687 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: no RTF_HOST and empty netmask");
688 		return (EINVAL);
689 	}
690 
691 	bzero(rc, sizeof(struct rib_cmd_info));
692 	rc->rc_cmd = RTM_ADD;
693 
694 	error = add_route_byinfo(rnh, info, rc);
695 	if (error == 0)
696 		rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
697 
698 	return (error);
699 }
700 
701 static int
702 add_route_byinfo(struct rib_head *rnh, struct rt_addrinfo *info,
703     struct rib_cmd_info *rc)
704 {
705 	struct route_nhop_data rnd_add;
706 	struct nhop_object *nh;
707 	struct rtentry *rt;
708 	struct sockaddr *dst, *gateway, *netmask;
709 	int error;
710 
711 	dst = info->rti_info[RTAX_DST];
712 	gateway = info->rti_info[RTAX_GATEWAY];
713 	netmask = info->rti_info[RTAX_NETMASK];
714 
715 	if ((info->rti_flags & RTF_GATEWAY) && !gateway) {
716 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: RTF_GATEWAY set with empty gw");
717 		return (EINVAL);
718 	}
719 	if (dst && gateway && !nhop_check_gateway(dst->sa_family, gateway->sa_family)) {
720 		FIB_RH_LOG(LOG_DEBUG, rnh,
721 		    "error: invalid dst/gateway family combination (%d, %d)",
722 		    dst->sa_family, gateway->sa_family);
723 		return (EINVAL);
724 	}
725 
726 	if (dst->sa_len > sizeof(((struct rtentry *)NULL)->rt_dstb)) {
727 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too large: %d",
728 		    dst->sa_len);
729 		return (EINVAL);
730 	}
731 
732 	if (info->rti_ifa == NULL) {
733 		error = rt_getifa_fib(info, rnh->rib_fibnum);
734 		if (error)
735 			return (error);
736 	}
737 
738 	if ((rt = rt_alloc(rnh, dst, netmask)) == NULL)
739 		return (ENOBUFS);
740 
741 	error = nhop_create_from_info(rnh, info, &nh);
742 	if (error != 0) {
743 		rt_free_immediate(rt);
744 		return (error);
745 	}
746 
747 	rnd_add.rnd_nhop = nh;
748 	rnd_add.rnd_weight = get_info_weight(info, RT_DEFAULT_WEIGHT);
749 
750 	int op_flags = RTM_F_CREATE;
751 	if (get_prio_from_info(info) == NH_PRIORITY_HIGH)
752 		op_flags |= RTM_F_FORCE;
753 	else
754 		op_flags |= RTM_F_APPEND;
755 	return (add_route_flags(rnh, rt, &rnd_add, op_flags, rc));
756 
757 }
758 
759 static int
760 add_route_flags(struct rib_head *rnh, struct rtentry *rt, struct route_nhop_data *rnd_add,
761     int op_flags, struct rib_cmd_info *rc)
762 {
763 	struct route_nhop_data rnd_orig;
764 	struct nhop_object *nh;
765 	struct rtentry *rt_orig;
766 	int error = 0;
767 
768 	nh = rnd_add->rnd_nhop;
769 
770 	RIB_WLOCK(rnh);
771 
772 	rt_orig = lookup_prefix_rt(rnh, rt, &rnd_orig);
773 
774 	if (rt_orig == NULL) {
775 		if (op_flags & RTM_F_CREATE)
776 			error = add_route(rnh, rt, rnd_add, rc);
777 		else
778 			error = ESRCH; /* no entry but creation was not required */
779 		RIB_WUNLOCK(rnh);
780 		if (error != 0)
781 			goto out;
782 		return (0);
783 	}
784 
785 	if (op_flags & RTM_F_EXCL) {
786 		/* We have existing route in the RIB but not allowed to replace. */
787 		RIB_WUNLOCK(rnh);
788 		error = EEXIST;
789 		goto out;
790 	}
791 
792 	/* Now either append or replace */
793 	if (op_flags & RTM_F_REPLACE) {
794 		if (nhop_get_prio(rnd_orig.rnd_nhop) > nhop_get_prio(rnd_add->rnd_nhop)) {
795 			/* Old path is "better" (e.g. has PINNED flag set) */
796 			error = EEXIST;
797 			goto out;
798 		}
799 		change_route(rnh, rt_orig, rnd_add, rc);
800 		RIB_WUNLOCK(rnh);
801 		nh = rc->rc_nh_old;
802 		goto out;
803 	}
804 
805 	RIB_WUNLOCK(rnh);
806 
807 #ifdef ROUTE_MPATH
808 	if ((op_flags & RTM_F_APPEND) && rib_can_multipath(rnh) &&
809 	    nhop_can_multipath(rnd_add->rnd_nhop) &&
810 	    nhop_can_multipath(rnd_orig.rnd_nhop)) {
811 
812 		for (int i = 0; i < RIB_MAX_RETRIES; i++) {
813 			error = add_route_flags_mpath(rnh, rt_orig, rnd_add, &rnd_orig,
814 			    op_flags, rc);
815 			if (error != EAGAIN)
816 				break;
817 			RTSTAT_INC(rts_add_retry);
818 		}
819 
820 		/*
821 		 *  Original nhop reference is unused in any case.
822 		 */
823 		nhop_free_any(rnd_add->rnd_nhop);
824 		if (op_flags & RTM_F_CREATE) {
825 			if (error != 0 || rc->rc_cmd != RTM_ADD)
826 				rt_free_immediate(rt);
827 		}
828 		return (error);
829 	}
830 #endif
831 	/* Out of options - free state and return error */
832 	error = EEXIST;
833 out:
834 	if (op_flags & RTM_F_CREATE)
835 		rt_free_immediate(rt);
836 	nhop_free_any(nh);
837 
838 	return (error);
839 }
840 
#ifdef ROUTE_MPATH
/*
 * Makes one attempt to merge nexthop data @rnd_add into the existing
 * nexthop data @rnd_orig of route @rt and to install the result.
 *
 * @rnd_orig is the caller's snapshot of the route's nexthop data.  When
 * the group cannot be built because the snapshot went stale (EAGAIN),
 * the snapshot is refreshed from the table so the caller can retry.
 *
 * Returns 0 on success with the result stored in @rc,
 * EAGAIN if the caller should retry with the refreshed @rnd_orig,
 * ENOENT if the route disappeared and RTM_F_CREATE is not set,
 * or another error from group creation / route change.
 */
static int
add_route_flags_mpath(struct rib_head *rnh, struct rtentry *rt,
    struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_orig,
    int op_flags, struct rib_cmd_info *rc)
{
	RIB_RLOCK_TRACKER;
	struct route_nhop_data rnd_new;
	int error = 0;

	error = nhgrp_get_addition_group(rnh, rnd_orig, rnd_add, &rnd_new);
	if (error != 0) {
		if (error == EAGAIN) {
			/*
			 * Group creation failed, most probably because
			 * @rnd_orig data got scheduled for deletion.
			 * Refresh @rnd_orig data and retry.
			 */
			RIB_RLOCK(rnh);
			lookup_prefix_rt(rnh, rt, rnd_orig);
			RIB_RUNLOCK(rnh);
			/*
			 * A failed lookup leaves a NULL nexthop in the
			 * snapshot; the @rnd_orig pointer itself is
			 * never NULL.
			 */
			if (rnd_orig->rnd_nhop == NULL &&
			    !(op_flags & RTM_F_CREATE)) {
				/* In this iteration route doesn't exist */
				error = ENOENT;
			}
		}
		return (error);
	}
	error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
	if (error != 0)
		return (error);

	if (V_fib_hash_outbound == 0 && NH_IS_NHGRP(rc->rc_nh_new)) {
		/*
		 * First multipath route got installed. Enable local
		 * outbound connections hashing.
		 */
		if (bootverbose)
			printf("FIB: enabled flowid calculation for locally-originated packets\n");
		V_fib_hash_outbound = 1;
	}

	return (0);
}
#endif
886 
887 /*
888  * Removes route defined by @info from the kernel table specified by @fibnum and
889  * sa_family in @info->rti_info[RTAX_DST].
890  *
891  * Returns 0 on success and fills in operation metadata into @rc.
892  */
893 int
894 rib_del_route(uint32_t fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc)
895 {
896 	struct rib_head *rnh;
897 	struct sockaddr *dst, *netmask;
898 	struct sockaddr_storage mdst;
899 	int error;
900 
901 	NET_EPOCH_ASSERT();
902 
903 	rnh = get_rnh(fibnum, info);
904 	if (rnh == NULL)
905 		return (EAFNOSUPPORT);
906 
907 	bzero(rc, sizeof(struct rib_cmd_info));
908 	rc->rc_cmd = RTM_DELETE;
909 
910 	dst = info->rti_info[RTAX_DST];
911 	netmask = info->rti_info[RTAX_NETMASK];
912 
913 	if (netmask != NULL) {
914 		/* Ensure @dst is always properly masked */
915 		if (dst->sa_len > sizeof(mdst)) {
916 			FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too large");
917 			return (EINVAL);
918 		}
919 		rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask);
920 		dst = (struct sockaddr *)&mdst;
921 	}
922 
923 	rib_filter_f_t *filter_func = NULL;
924 	void *filter_arg = NULL;
925 	struct gw_filter_data gwd = { .gw = info->rti_info[RTAX_GATEWAY] };
926 
927 	if (info->rti_filter != NULL) {
928 		filter_func = info->rti_filter;
929 		filter_arg = info->rti_filterdata;
930 	} else if (gwd.gw != NULL) {
931 		filter_func = match_gw_one;
932 		filter_arg = &gwd;
933 	}
934 
935 	int prio = get_prio_from_info(info);
936 
937 	RIB_WLOCK(rnh);
938 	struct route_nhop_data rnd;
939 	struct rtentry *rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd);
940 	if (rt != NULL) {
941 		error = rt_delete_conditional(rnh, rt, prio, filter_func,
942 		    filter_arg, rc);
943 	} else
944 		error = ESRCH;
945 	RIB_WUNLOCK(rnh);
946 
947 	if (error != 0)
948 		return (error);
949 
950 	rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
951 
952 	if (rc->rc_cmd == RTM_DELETE)
953 		rt_free(rc->rc_rt);
954 #ifdef ROUTE_MPATH
955 	else {
956 		/*
957 		 * Deleting 1 path may result in RTM_CHANGE to
958 		 * a different mpath group/nhop.
959 		 * Free old mpath group.
960 		 */
961 		nhop_free_any(rc->rc_nh_old);
962 	}
963 #endif
964 
965 	return (0);
966 }
967 
968 /*
969  * Conditionally unlinks rtentry paths from @rnh matching @cb.
970  * Returns 0 on success with operation result stored in @rc.
971  * On error, returns:
972  * ESRCH - if prefix was not found or filter function failed to match
973  * EADDRINUSE - if trying to delete higher priority route.
974  */
975 static int
976 rt_delete_conditional(struct rib_head *rnh, struct rtentry *rt,
977     int prio, rib_filter_f_t *cb, void *cbdata, struct rib_cmd_info *rc)
978 {
979 	struct nhop_object *nh = rt->rt_nhop;
980 
981 #ifdef ROUTE_MPATH
982 	if (NH_IS_NHGRP(nh)) {
983 		struct nhgrp_object *nhg = (struct nhgrp_object *)nh;
984 		struct route_nhop_data rnd;
985 		int error;
986 
987 		if (cb == NULL)
988 			return (ESRCH);
989 		error = nhgrp_get_filtered_group(rnh, rt, nhg, cb, cbdata, &rnd);
990 		if (error == 0) {
991 			if (rnd.rnd_nhgrp == nhg) {
992 				/* No match, unreference new group and return. */
993 				nhop_free_any(rnd.rnd_nhop);
994 				return (ESRCH);
995 			}
996 			error = change_route(rnh, rt, &rnd, rc);
997 		}
998 		return (error);
999 	}
1000 #endif
1001 	if (cb != NULL && !cb(rt, nh, cbdata))
1002 		return (ESRCH);
1003 
1004 	if (prio < nhop_get_prio(nh))
1005 		return (EADDRINUSE);
1006 
1007 	return (delete_route(rnh, rt, rc));
1008 }
1009 
1010 int
1011 rib_change_route(uint32_t fibnum, struct rt_addrinfo *info,
1012     struct rib_cmd_info *rc)
1013 {
1014 	RIB_RLOCK_TRACKER;
1015 	struct route_nhop_data rnd_orig;
1016 	struct rib_head *rnh;
1017 	struct rtentry *rt;
1018 	int error;
1019 
1020 	NET_EPOCH_ASSERT();
1021 
1022 	rnh = get_rnh(fibnum, info);
1023 	if (rnh == NULL)
1024 		return (EAFNOSUPPORT);
1025 
1026 	bzero(rc, sizeof(struct rib_cmd_info));
1027 	rc->rc_cmd = RTM_CHANGE;
1028 
1029 	/* Check if updated gateway exists */
1030 	if ((info->rti_flags & RTF_GATEWAY) &&
1031 	    (info->rti_info[RTAX_GATEWAY] == NULL)) {
1032 
1033 		/*
1034 		 * route(8) adds RTF_GATEWAY flag if -interface is not set.
1035 		 * Remove RTF_GATEWAY to enforce consistency and maintain
1036 		 * compatibility..
1037 		 */
1038 		info->rti_flags &= ~RTF_GATEWAY;
1039 	}
1040 
1041 	/*
1042 	 * route change is done in multiple steps, with dropping and
1043 	 * reacquiring lock. In the situations with multiple processes
1044 	 * changes the same route in can lead to the case when route
1045 	 * is changed between the steps. Address it by retrying the operation
1046 	 * multiple times before failing.
1047 	 */
1048 
1049 	RIB_RLOCK(rnh);
1050 	rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST],
1051 	    info->rti_info[RTAX_NETMASK], &rnh->head);
1052 
1053 	if (rt == NULL) {
1054 		RIB_RUNLOCK(rnh);
1055 		return (ESRCH);
1056 	}
1057 
1058 	rnd_orig.rnd_nhop = rt->rt_nhop;
1059 	rnd_orig.rnd_weight = rt->rt_weight;
1060 
1061 	RIB_RUNLOCK(rnh);
1062 
1063 	for (int i = 0; i < RIB_MAX_RETRIES; i++) {
1064 		error = change_route_byinfo(rnh, rt, info, &rnd_orig, rc);
1065 		if (error != EAGAIN)
1066 			break;
1067 	}
1068 
1069 	return (error);
1070 }
1071 
1072 static int
1073 change_nhop(struct rib_head *rnh, struct rt_addrinfo *info,
1074     struct nhop_object *nh_orig, struct nhop_object **nh_new)
1075 {
1076 	int error;
1077 
1078 	/*
1079 	 * New gateway could require new ifaddr, ifp;
1080 	 * flags may also be different; ifp may be specified
1081 	 * by ll sockaddr when protocol address is ambiguous
1082 	 */
1083 	if (((nh_orig->nh_flags & NHF_GATEWAY) &&
1084 	    info->rti_info[RTAX_GATEWAY] != NULL) ||
1085 	    info->rti_info[RTAX_IFP] != NULL ||
1086 	    (info->rti_info[RTAX_IFA] != NULL &&
1087 	     !sa_equal(info->rti_info[RTAX_IFA], nh_orig->nh_ifa->ifa_addr))) {
1088 		error = rt_getifa_fib(info, rnh->rib_fibnum);
1089 
1090 		if (error != 0) {
1091 			info->rti_ifa = NULL;
1092 			return (error);
1093 		}
1094 	}
1095 
1096 	error = nhop_create_from_nhop(rnh, nh_orig, info, nh_new);
1097 	info->rti_ifa = NULL;
1098 
1099 	return (error);
1100 }
1101 
1102 #ifdef ROUTE_MPATH
1103 static int
1104 change_mpath_route(struct rib_head *rnh, struct rtentry *rt,
1105     struct rt_addrinfo *info, struct route_nhop_data *rnd_orig,
1106     struct rib_cmd_info *rc)
1107 {
1108 	int error = 0, found_idx = 0;
1109 	struct nhop_object *nh_orig = NULL, *nh_new;
1110 	struct route_nhop_data rnd_new = {};
1111 	const struct weightened_nhop *wn = NULL;
1112 	struct weightened_nhop *wn_new;
1113 	uint32_t num_nhops;
1114 
1115 	wn = nhgrp_get_nhops(rnd_orig->rnd_nhgrp, &num_nhops);
1116 	for (int i = 0; i < num_nhops; i++) {
1117 		if (check_info_match_nhop(info, NULL, wn[i].nh) == 0) {
1118 			nh_orig = wn[i].nh;
1119 			found_idx = i;
1120 			break;
1121 		}
1122 	}
1123 
1124 	if (nh_orig == NULL)
1125 		return (ESRCH);
1126 
1127 	error = change_nhop(rnh, info, nh_orig, &nh_new);
1128 	if (error != 0)
1129 		return (error);
1130 
1131 	wn_new = mallocarray(num_nhops, sizeof(struct weightened_nhop),
1132 	    M_TEMP, M_NOWAIT | M_ZERO);
1133 	if (wn_new == NULL) {
1134 		nhop_free(nh_new);
1135 		return (EAGAIN);
1136 	}
1137 
1138 	memcpy(wn_new, wn, num_nhops * sizeof(struct weightened_nhop));
1139 	wn_new[found_idx].nh = nh_new;
1140 	wn_new[found_idx].weight = get_info_weight(info, wn[found_idx].weight);
1141 
1142 	error = nhgrp_get_group(rnh, wn_new, num_nhops, 0, &rnd_new.rnd_nhgrp);
1143 	nhop_free(nh_new);
1144 	free(wn_new, M_TEMP);
1145 
1146 	if (error != 0)
1147 		return (error);
1148 
1149 	error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
1150 
1151 	return (error);
1152 }
1153 #endif
1154 
1155 static int
1156 change_route_byinfo(struct rib_head *rnh, struct rtentry *rt,
1157     struct rt_addrinfo *info, struct route_nhop_data *rnd_orig,
1158     struct rib_cmd_info *rc)
1159 {
1160 	int error = 0;
1161 	struct nhop_object *nh_orig;
1162 	struct route_nhop_data rnd_new;
1163 
1164 	nh_orig = rnd_orig->rnd_nhop;
1165 	if (nh_orig == NULL)
1166 		return (ESRCH);
1167 
1168 #ifdef ROUTE_MPATH
1169 	if (NH_IS_NHGRP(nh_orig))
1170 		return (change_mpath_route(rnh, rt, info, rnd_orig, rc));
1171 #endif
1172 
1173 	rnd_new.rnd_weight = get_info_weight(info, rnd_orig->rnd_weight);
1174 	error = change_nhop(rnh, info, nh_orig, &rnd_new.rnd_nhop);
1175 	if (error != 0)
1176 		return (error);
1177 	error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
1178 
1179 	return (error);
1180 }
1181 
1182 /*
1183  * Insert @rt with nhop data from @rnd_new to @rnh.
1184  * Returns 0 on success and stores operation results in @rc.
1185  */
1186 static int
1187 add_route(struct rib_head *rnh, struct rtentry *rt,
1188     struct route_nhop_data *rnd, struct rib_cmd_info *rc)
1189 {
1190 	struct radix_node *rn;
1191 
1192 	RIB_WLOCK_ASSERT(rnh);
1193 
1194 	rt->rt_nhop = rnd->rnd_nhop;
1195 	rt->rt_weight = rnd->rnd_weight;
1196 	rn = rnh->rnh_addaddr(rt_key(rt), rt_mask_const(rt), &rnh->head, rt->rt_nodes);
1197 
1198 	if (rn != NULL) {
1199 		if (!NH_IS_NHGRP(rnd->rnd_nhop) && nhop_get_expire(rnd->rnd_nhop))
1200 			tmproutes_update(rnh, rt, rnd->rnd_nhop);
1201 
1202 		/* Finalize notification */
1203 		rib_bump_gen(rnh);
1204 		rnh->rnh_prefixes++;
1205 
1206 		rc->rc_cmd = RTM_ADD;
1207 		rc->rc_rt = rt;
1208 		rc->rc_nh_old = NULL;
1209 		rc->rc_nh_new = rnd->rnd_nhop;
1210 		rc->rc_nh_weight = rnd->rnd_weight;
1211 
1212 		rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1213 		return (0);
1214 	}
1215 
1216 	/* Existing route or memory allocation failure. */
1217 	return (EEXIST);
1218 }
1219 
1220 /*
1221  * Unconditionally deletes @rt from @rnh.
1222  */
1223 static int
1224 delete_route(struct rib_head *rnh, struct rtentry *rt, struct rib_cmd_info *rc)
1225 {
1226 	RIB_WLOCK_ASSERT(rnh);
1227 
1228 	/* Route deletion requested. */
1229 	struct radix_node *rn;
1230 
1231 	rn = rnh->rnh_deladdr(rt_key_const(rt), rt_mask_const(rt), &rnh->head);
1232 	if (rn == NULL)
1233 		return (ESRCH);
1234 	rt = RNTORT(rn);
1235 	rt->rte_flags &= ~RTF_UP;
1236 
1237 	rib_bump_gen(rnh);
1238 	rnh->rnh_prefixes--;
1239 
1240 	rc->rc_cmd = RTM_DELETE;
1241 	rc->rc_rt = rt;
1242 	rc->rc_nh_old = rt->rt_nhop;
1243 	rc->rc_nh_new = NULL;
1244 	rc->rc_nh_weight = rt->rt_weight;
1245 
1246 	rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1247 
1248 	return (0);
1249 }
1250 
1251 /*
1252  * Switch @rt nhop/weigh to the ones specified in @rnd.
1253  * Returns 0 on success.
1254  */
1255 int
1256 change_route(struct rib_head *rnh, struct rtentry *rt,
1257     struct route_nhop_data *rnd, struct rib_cmd_info *rc)
1258 {
1259 	struct nhop_object *nh_orig;
1260 
1261 	RIB_WLOCK_ASSERT(rnh);
1262 
1263 	nh_orig = rt->rt_nhop;
1264 
1265 	if (rnd->rnd_nhop == NULL)
1266 		return (delete_route(rnh, rt, rc));
1267 
1268 	/* Changing nexthop & weight to a new one */
1269 	rt->rt_nhop = rnd->rnd_nhop;
1270 	rt->rt_weight = rnd->rnd_weight;
1271 	if (!NH_IS_NHGRP(rnd->rnd_nhop) && nhop_get_expire(rnd->rnd_nhop))
1272 		tmproutes_update(rnh, rt, rnd->rnd_nhop);
1273 
1274 	/* Finalize notification */
1275 	rib_bump_gen(rnh);
1276 	rc->rc_cmd = RTM_CHANGE;
1277 	rc->rc_rt = rt;
1278 	rc->rc_nh_old = nh_orig;
1279 	rc->rc_nh_new = rnd->rnd_nhop;
1280 	rc->rc_nh_weight = rnd->rnd_weight;
1281 
1282 	rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1283 
1284 	return (0);
1285 }
1286 
1287 /*
1288  * Conditionally update route nhop/weight IFF data in @nhd_orig is
1289  *  consistent with the current route data.
1290  * Nexthop in @nhd_new is consumed.
1291  */
1292 int
1293 change_route_conditional(struct rib_head *rnh, struct rtentry *rt,
1294     struct route_nhop_data *rnd_orig, struct route_nhop_data *rnd_new,
1295     struct rib_cmd_info *rc)
1296 {
1297 	struct rtentry *rt_new;
1298 	int error = 0;
1299 
1300 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
1301 		char buf_old[NHOP_PRINT_BUFSIZE], buf_new[NHOP_PRINT_BUFSIZE];
1302 		nhop_print_buf_any(rnd_orig->rnd_nhop, buf_old, NHOP_PRINT_BUFSIZE);
1303 		nhop_print_buf_any(rnd_new->rnd_nhop, buf_new, NHOP_PRINT_BUFSIZE);
1304 		FIB_LOG(LOG_DEBUG2, rnh->rib_fibnum, rnh->rib_family,
1305 		    "trying change %s -> %s", buf_old, buf_new);
1306 	}
1307 	RIB_WLOCK(rnh);
1308 
1309 	struct route_nhop_data rnd;
1310 	rt_new = lookup_prefix_rt(rnh, rt, &rnd);
1311 
1312 	if (rt_new == NULL) {
1313 		if (rnd_orig->rnd_nhop == NULL)
1314 			error = add_route(rnh, rt, rnd_new, rc);
1315 		else {
1316 			/*
1317 			 * Prefix does not exist, which was not our assumption.
1318 			 * Update @rnd_orig with the new data and return
1319 			 */
1320 			rnd_orig->rnd_nhop = NULL;
1321 			rnd_orig->rnd_weight = 0;
1322 			error = EAGAIN;
1323 		}
1324 	} else {
1325 		/* Prefix exists, try to update */
1326 		if (rnd_orig->rnd_nhop == rt_new->rt_nhop) {
1327 			/*
1328 			 * Nhop/mpath group hasn't changed. Flip
1329 			 * to the new precalculated one and return
1330 			 */
1331 			error = change_route(rnh, rt_new, rnd_new, rc);
1332 		} else {
1333 			/* Update and retry */
1334 			rnd_orig->rnd_nhop = rt_new->rt_nhop;
1335 			rnd_orig->rnd_weight = rt_new->rt_weight;
1336 			error = EAGAIN;
1337 		}
1338 	}
1339 
1340 	RIB_WUNLOCK(rnh);
1341 
1342 	if (error == 0) {
1343 		rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
1344 
1345 		if (rnd_orig->rnd_nhop != NULL)
1346 			nhop_free_any(rnd_orig->rnd_nhop);
1347 
1348 	} else {
1349 		if (rnd_new->rnd_nhop != NULL)
1350 			nhop_free_any(rnd_new->rnd_nhop);
1351 	}
1352 
1353 	return (error);
1354 }
1355 
1356 /*
1357  * Performs modification of routing table specificed by @action.
1358  * Table is specified by @fibnum and sa_family in @info->rti_info[RTAX_DST].
1359  * Needs to be run in network epoch.
1360  *
1361  * Returns 0 on success and fills in @rc with action result.
1362  */
1363 int
1364 rib_action(uint32_t fibnum, int action, struct rt_addrinfo *info,
1365     struct rib_cmd_info *rc)
1366 {
1367 	int error;
1368 
1369 	switch (action) {
1370 	case RTM_ADD:
1371 		error = rib_add_route(fibnum, info, rc);
1372 		break;
1373 	case RTM_DELETE:
1374 		error = rib_del_route(fibnum, info, rc);
1375 		break;
1376 	case RTM_CHANGE:
1377 		error = rib_change_route(fibnum, info, rc);
1378 		break;
1379 	default:
1380 		error = ENOTSUP;
1381 	}
1382 
1383 	return (error);
1384 }
1385 
1386 struct rt_delinfo
1387 {
1388 	struct rib_head *rnh;
1389 	struct rtentry *head;
1390 	rib_filter_f_t *filter_f;
1391 	void *filter_arg;
1392 	int prio;
1393 	struct rib_cmd_info rc;
1394 };
1395 
1396 /*
1397  * Conditionally unlinks rtenties or paths from radix tree based
1398  * on the callback data passed in @arg.
1399  */
1400 static int
1401 rt_checkdelroute(struct radix_node *rn, void *arg)
1402 {
1403 	struct rt_delinfo *di = (struct rt_delinfo *)arg;
1404 	struct rtentry *rt = (struct rtentry *)rn;
1405 
1406 	if (rt_delete_conditional(di->rnh, rt, di->prio,
1407 	    di->filter_f, di->filter_arg, &di->rc) != 0)
1408 		return (0);
1409 
1410 	/*
1411 	 * Add deleted rtentries to the list to GC them
1412 	 *  after dropping the lock.
1413 	 *
1414 	 * XXX: Delayed notifications not implemented
1415 	 *  for nexthop updates.
1416 	 */
1417 	if (di->rc.rc_cmd == RTM_DELETE) {
1418 		/* Add to the list and return */
1419 		rt->rt_chain = di->head;
1420 		di->head = rt;
1421 #ifdef ROUTE_MPATH
1422 	} else {
1423 		/*
1424 		 * RTM_CHANGE to a different nexthop or nexthop group.
1425 		 * Free old multipath group.
1426 		 */
1427 		nhop_free_any(di->rc.rc_nh_old);
1428 #endif
1429 	}
1430 
1431 	return (0);
1432 }
1433 
1434 /*
1435  * Iterates over a routing table specified by @fibnum and @family and
1436  *  deletes elements marked by @filter_f.
1437  * @fibnum: rtable id
1438  * @family: AF_ address family
1439  * @filter_f: function returning non-zero value for items to delete
1440  * @arg: data to pass to the @filter_f function
1441  * @report: true if rtsock notification is needed.
1442  */
1443 void
1444 rib_walk_del(u_int fibnum, int family, rib_filter_f_t *filter_f, void *filter_arg,
1445     bool report)
1446 {
1447 	struct rib_head *rnh;
1448 	struct rtentry *rt;
1449 	struct nhop_object *nh;
1450 	struct epoch_tracker et;
1451 
1452 	rnh = rt_tables_get_rnh(fibnum, family);
1453 	if (rnh == NULL)
1454 		return;
1455 
1456 	struct rt_delinfo di = {
1457 		.rnh = rnh,
1458 		.filter_f = filter_f,
1459 		.filter_arg = filter_arg,
1460 		.prio = NH_PRIORITY_NORMAL,
1461 	};
1462 
1463 	NET_EPOCH_ENTER(et);
1464 
1465 	RIB_WLOCK(rnh);
1466 	rnh->rnh_walktree(&rnh->head, rt_checkdelroute, &di);
1467 	RIB_WUNLOCK(rnh);
1468 
1469 	/* We might have something to reclaim. */
1470 	bzero(&di.rc, sizeof(di.rc));
1471 	di.rc.rc_cmd = RTM_DELETE;
1472 	while (di.head != NULL) {
1473 		rt = di.head;
1474 		di.head = rt->rt_chain;
1475 		rt->rt_chain = NULL;
1476 		nh = rt->rt_nhop;
1477 
1478 		di.rc.rc_rt = rt;
1479 		di.rc.rc_nh_old = nh;
1480 		rib_notify(rnh, RIB_NOTIFY_DELAYED, &di.rc);
1481 
1482 		if (report) {
1483 #ifdef ROUTE_MPATH
1484 			struct nhgrp_object *nhg;
1485 			const struct weightened_nhop *wn;
1486 			uint32_t num_nhops;
1487 			if (NH_IS_NHGRP(nh)) {
1488 				nhg = (struct nhgrp_object *)nh;
1489 				wn = nhgrp_get_nhops(nhg, &num_nhops);
1490 				for (int i = 0; i < num_nhops; i++)
1491 					rt_routemsg(RTM_DELETE, rt, wn[i].nh, fibnum);
1492 			} else
1493 #endif
1494 			rt_routemsg(RTM_DELETE, rt, nh, fibnum);
1495 		}
1496 		rt_free(rt);
1497 	}
1498 
1499 	NET_EPOCH_EXIT(et);
1500 }
1501 
1502 static int
1503 rt_delete_unconditional(struct radix_node *rn, void *arg)
1504 {
1505 	struct rtentry *rt = RNTORT(rn);
1506 	struct rib_head *rnh = (struct rib_head *)arg;
1507 
1508 	rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), &rnh->head);
1509 	if (RNTORT(rn) == rt)
1510 		rt_free(rt);
1511 
1512 	return (0);
1513 }
1514 
1515 /*
1516  * Removes all routes from the routing table without executing notifications.
1517  * rtentres will be removed after the end of a current epoch.
1518  */
1519 static void
1520 rib_flush_routes(struct rib_head *rnh)
1521 {
1522 	RIB_WLOCK(rnh);
1523 	rnh->rnh_walktree(&rnh->head, rt_delete_unconditional, rnh);
1524 	RIB_WUNLOCK(rnh);
1525 }
1526 
1527 void
1528 rib_flush_routes_family(int family)
1529 {
1530 	struct rib_head *rnh;
1531 
1532 	for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) {
1533 		if ((rnh = rt_tables_get_rnh(fibnum, family)) != NULL)
1534 			rib_flush_routes(rnh);
1535 	}
1536 }
1537 
/*
 * Returns a static, human-readable name for the address family @family:
 * "inet", "inet6" or "link" for AF_INET, AF_INET6 and AF_LINK, and
 * "unknown" for anything else.
 */
const char *
rib_print_family(int family)
{

	if (family == AF_INET)
		return ("inet");
	if (family == AF_INET6)
		return ("inet6");
	if (family == AF_LINK)
		return ("link");

	return ("unknown");
}
1551 
1552