xref: /freebsd/sys/net/route/route_ctl.c (revision c07d6445eb89d9dd3950361b065b7bd110e3a043)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2020 Alexander V. Chernikov
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 #include "opt_inet.h"
31 #include "opt_inet6.h"
32 #include "opt_route.h"
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/malloc.h>
37 #include <sys/mbuf.h>
38 #include <sys/socket.h>
39 #include <sys/sysctl.h>
40 #include <sys/syslog.h>
41 #include <sys/kernel.h>
42 #include <sys/lock.h>
43 #include <sys/rmlock.h>
44 
45 #include <net/if.h>
46 #include <net/if_var.h>
47 #include <net/if_dl.h>
48 #include <net/vnet.h>
49 #include <net/route.h>
50 #include <net/route/route_ctl.h>
51 #include <net/route/route_var.h>
52 #include <net/route/nhop_utils.h>
53 #include <net/route/nhop.h>
54 #include <net/route/nhop_var.h>
55 #include <netinet/in.h>
56 #include <netinet6/scope6_var.h>
57 #include <netinet6/in6_var.h>
58 
59 #define	DEBUG_MOD_NAME	route_ctl
60 #define	DEBUG_MAX_LEVEL	LOG_DEBUG
61 #include <net/route/route_debug.h>
62 _DECLARE_DEBUG(LOG_INFO);
63 
64 /*
65  * This file contains control plane routing tables functions.
66  *
 * All functions assume they are called in net epoch.
68  */
69 
/*
 * Scratch storage for sockaddrs built on the stack in this file
 * (e.g. the netmask produced by fill_pxmask_family()).
 */
union sockaddr_union {
	struct sockaddr		sa;	/* generic view */
	struct sockaddr_in	sin;	/* IPv4 view */
	struct sockaddr_in6	sin6;	/* IPv6 view */
	char			_buf[32];	/* makes the union at least 32 bytes */
};
76 
/* Add/change entry points driven by a caller-supplied rt_addrinfo. */
static int add_route_byinfo(struct rib_head *rnh, struct rt_addrinfo *info,
    struct rib_cmd_info *rc);
static int change_route_byinfo(struct rib_head *rnh, struct rtentry *rt,
    struct rt_addrinfo *info, struct route_nhop_data *nhd_orig,
    struct rib_cmd_info *rc);

/*
 * Install a prepared rtentry/nexthop pair according to RTM_F_ op_flags;
 * the _mpath variant appends a path to an already existing prefix.
 */
static int add_route_flags(struct rib_head *rnh, struct rtentry *rt,
    struct route_nhop_data *rnd_add, int op_flags, struct rib_cmd_info *rc);
#ifdef ROUTE_MPATH
static int add_route_flags_mpath(struct rib_head *rnh, struct rtentry *rt,
    struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_orig,
    int op_flags, struct rib_cmd_info *rc);
#endif

/* Insertion and (conditional) removal of a single rtentry. */
static int add_route(struct rib_head *rnh, struct rtentry *rt,
    struct route_nhop_data *rnd, struct rib_cmd_info *rc);
static int delete_route(struct rib_head *rnh, struct rtentry *rt,
    struct rib_cmd_info *rc);
static int rt_delete_conditional(struct rib_head *rnh, struct rtentry *rt,
    int prio, rib_filter_f_t *cb, void *cbdata, struct rib_cmd_info *rc);

/* Normal vs. pinned priority helpers (see NH_PRIORITY_* below). */
static int get_prio_from_info(const struct rt_addrinfo *info);
static int nhop_get_prio(const struct nhop_object *nh);

#ifdef ROUTE_MPATH
/* Reads the per-vnet multipath switch. */
static bool rib_can_multipath(struct rib_head *rh);
#endif
104 
/*
 * Per-vnet multipath routing configuration.
 *
 * net.route.multipath defaults to 1.  It is writable only when the
 * kernel is built with ROUTE_MPATH; otherwise it is exported read-only.
 */
SYSCTL_DECL(_net_route);
#define	V_rib_route_multipath	VNET(rib_route_multipath)
#ifdef ROUTE_MPATH
#define _MP_FLAGS	CTLFLAG_RW
#else
#define _MP_FLAGS	CTLFLAG_RD
#endif
VNET_DEFINE(u_int, rib_route_multipath) = 1;
SYSCTL_UINT(_net_route, OID_AUTO, multipath, _MP_FLAGS | CTLFLAG_VNET,
    &VNET_NAME(rib_route_multipath), 0, "Enable route multipath");
#undef _MP_FLAGS

#ifdef ROUTE_MPATH
/*
 * Read-only knob, initially 0.  add_route_flags_mpath() flips it to 1
 * once the first nexthop group gets installed.
 */
VNET_DEFINE(u_int, fib_hash_outbound) = 0;
SYSCTL_UINT(_net_route, OID_AUTO, hash_outbound, CTLFLAG_RD | CTLFLAG_VNET,
    &VNET_NAME(fib_hash_outbound), 0,
    "Compute flowid for locally-originated packets");

/* Default entropy to add to the hash calculation for the outbound connections */
uint8_t mpath_entropy_key[MPATH_ENTROPY_KEY_LEN] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
};
#endif

#if defined(INET) && defined(INET6)
/*
 * net.route.ipv6_nexthop (default 1, read-write, per-vnet); its value is
 * what rib_can_4o6_nhop() reports.
 */
FEATURE(ipv4_rfc5549_support, "Route IPv4 packets via IPv6 nexthops");
#define V_rib_route_ipv6_nexthop VNET(rib_route_ipv6_nexthop)
VNET_DEFINE_STATIC(u_int, rib_route_ipv6_nexthop) = 1;
SYSCTL_UINT(_net_route, OID_AUTO, ipv6_nexthop, CTLFLAG_RW | CTLFLAG_VNET,
    &VNET_NAME(rib_route_ipv6_nexthop), 0, "Enable IPv4 route via IPv6 Next Hop address");
#endif

/* Debug bits: parent node net.route.debug */
SYSCTL_NODE(_net_route, OID_AUTO, debug, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
144 
145 static struct rib_head *
146 get_rnh(uint32_t fibnum, const struct rt_addrinfo *info)
147 {
148 	struct rib_head *rnh;
149 	struct sockaddr *dst;
150 
151 	KASSERT((fibnum < rt_numfibs), ("rib_add_route: bad fibnum"));
152 
153 	dst = info->rti_info[RTAX_DST];
154 	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
155 
156 	return (rnh);
157 }
158 
#if defined(INET) && defined(INET6)
/*
 * Reports whether IPv4 routes may use IPv6 nexthops, i.e. whether the
 * net.route.ipv6_nexthop knob of the current vnet is non-zero.
 */
bool
rib_can_4o6_nhop(void)
{
	const bool enabled = (V_rib_route_ipv6_nexthop != 0);

	return (enabled);
}
#endif
166 
167 #ifdef ROUTE_MPATH
168 static bool
169 rib_can_multipath(struct rib_head *rh)
170 {
171 	int result;
172 
173 	CURVNET_SET(rh->rib_vnet);
174 	result = !!V_rib_route_multipath;
175 	CURVNET_RESTORE();
176 
177 	return (result);
178 }
179 
180 /*
181  * Check is nhop is multipath-eligible.
182  * Avoid nhops without gateways and redirects.
183  *
184  * Returns 1 for multipath-eligible nexthop,
185  * 0 otherwise.
186  */
187 bool
188 nhop_can_multipath(const struct nhop_object *nh)
189 {
190 
191 	if ((nh->nh_flags & NHF_MULTIPATH) != 0)
192 		return (1);
193 	if ((nh->nh_flags & NHF_GATEWAY) == 0)
194 		return (0);
195 	if ((nh->nh_flags & NHF_REDIRECT) != 0)
196 		return (0);
197 
198 	return (1);
199 }
200 #endif
201 
202 static int
203 get_info_weight(const struct rt_addrinfo *info, uint32_t default_weight)
204 {
205 	uint32_t weight;
206 
207 	if (info->rti_mflags & RTV_WEIGHT)
208 		weight = info->rti_rmx->rmx_weight;
209 	else
210 		weight = default_weight;
211 	/* Keep upper 1 byte for adm distance purposes */
212 	if (weight > RT_MAX_WEIGHT)
213 		weight = RT_MAX_WEIGHT;
214 	else if (weight == 0)
215 		weight = default_weight;
216 
217 	return (weight);
218 }
219 
220 /*
221  * File-local concept for distingushing between the normal and
222  * RTF_PINNED routes tha can override the "normal" one.
223  */
224 #define	NH_PRIORITY_HIGH	2
225 #define	NH_PRIORITY_NORMAL	1
226 static int
227 get_prio_from_info(const struct rt_addrinfo *info)
228 {
229 	if (info->rti_flags & RTF_PINNED)
230 		return (NH_PRIORITY_HIGH);
231 	return (NH_PRIORITY_NORMAL);
232 }
233 
234 static int
235 nhop_get_prio(const struct nhop_object *nh)
236 {
237 	if (NH_IS_PINNED(nh))
238 		return (NH_PRIORITY_HIGH);
239 	return (NH_PRIORITY_NORMAL);
240 }
241 
242 /*
243  * Check if specified @gw matches gw data in the nexthop @nh.
244  *
245  * Returns true if matches, false otherwise.
246  */
247 bool
248 match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw)
249 {
250 
251 	if (nh->gw_sa.sa_family != gw->sa_family)
252 		return (false);
253 
254 	switch (gw->sa_family) {
255 	case AF_INET:
256 		return (nh->gw4_sa.sin_addr.s_addr ==
257 		    ((const struct sockaddr_in *)gw)->sin_addr.s_addr);
258 	case AF_INET6:
259 		{
260 			const struct sockaddr_in6 *gw6;
261 			gw6 = (const struct sockaddr_in6 *)gw;
262 
263 			/*
264 			 * Currently (2020-09) IPv6 gws in kernel have their
265 			 * scope embedded. Once this becomes false, this code
266 			 * has to be revisited.
267 			 */
268 			if (IN6_ARE_ADDR_EQUAL(&nh->gw6_sa.sin6_addr,
269 			    &gw6->sin6_addr))
270 				return (true);
271 			return (false);
272 		}
273 	case AF_LINK:
274 		{
275 			const struct sockaddr_dl *sdl;
276 			sdl = (const struct sockaddr_dl *)gw;
277 			return (nh->gwl_sa.sdl_index == sdl->sdl_index);
278 		}
279 	default:
280 		return (memcmp(&nh->gw_sa, gw, nh->gw_sa.sa_len) == 0);
281 	}
282 
283 	/* NOTREACHED */
284 	return (false);
285 }
286 
/*
 * rib_filter_f compatible callback matching every nexthop whose gateway
 * equals the sockaddr passed via @gw_sa.  @rt is not consulted.
 */
int
rib_match_gw(const struct rtentry *rt, const struct nhop_object *nh, void *gw_sa)
{
	return (match_nhop_gw(nh, (const struct sockaddr *)gw_sa));
}
298 
/* State shared with match_gw_one() across the nexthops of one prefix. */
struct gw_filter_data {
	const struct sockaddr *gw;	/* gateway to look for */
	int count;			/* number of matches seen so far */
};
303 
304 /*
305  * Matches first occurence of the gateway provided in @gwd
306  */
307 static int
308 match_gw_one(const struct rtentry *rt, const struct nhop_object *nh, void *_data)
309 {
310 	struct gw_filter_data *gwd = (struct gw_filter_data *)_data;
311 
312 	/* Return only first match to make rtsock happy */
313 	if (match_nhop_gw(nh, gwd->gw) && gwd->count++ == 0)
314 		return (1);
315 	return (0);
316 }
317 
318 /*
319  * Checks if data in @info matches nexhop @nh.
320  *
321  * Returns 0 on success,
322  * ESRCH if not matched,
323  * ENOENT if filter function returned false
324  */
325 int
326 check_info_match_nhop(const struct rt_addrinfo *info, const struct rtentry *rt,
327     const struct nhop_object *nh)
328 {
329 	const struct sockaddr *gw = info->rti_info[RTAX_GATEWAY];
330 
331 	if (info->rti_filter != NULL) {
332 	    if (info->rti_filter(rt, nh, info->rti_filterdata) == 0)
333 		    return (ENOENT);
334 	    else
335 		    return (0);
336 	}
337 	if ((gw != NULL) && !match_nhop_gw(nh, gw))
338 		return (ESRCH);
339 
340 	return (0);
341 }
342 
343 /*
344  * Runs exact prefix match based on @dst and @netmask.
345  * Returns matched @rtentry if found or NULL.
346  * If rtentry was found, saves nexthop / weight value into @rnd.
347  */
348 static struct rtentry *
349 lookup_prefix_bysa(struct rib_head *rnh, const struct sockaddr *dst,
350     const struct sockaddr *netmask, struct route_nhop_data *rnd)
351 {
352 	struct rtentry *rt;
353 
354 	RIB_LOCK_ASSERT(rnh);
355 
356 	rt = (struct rtentry *)rnh->rnh_lookup(dst, netmask, &rnh->head);
357 	if (rt != NULL) {
358 		rnd->rnd_nhop = rt->rt_nhop;
359 		rnd->rnd_weight = rt->rt_weight;
360 	} else {
361 		rnd->rnd_nhop = NULL;
362 		rnd->rnd_weight = 0;
363 	}
364 
365 	return (rt);
366 }
367 
/*
 * Runs exact prefix match using the key and mask of @rt.
 * Same contract as lookup_prefix_bysa().
 */
struct rtentry *
lookup_prefix_rt(struct rib_head *rnh, const struct rtentry *rt,
    struct route_nhop_data *rnd)
{
	const struct sockaddr *key = rt_key_const(rt);
	const struct sockaddr *mask = rt_mask_const(rt);

	return (lookup_prefix_bysa(rnh, key, mask, rnd));
}
374 
375 /*
376  * Runs exact prefix match based on dst/netmask from @info.
377  * Assumes RIB lock is held.
378  * Returns matched @rtentry if found or NULL.
379  * If rtentry was found, saves nexthop / weight value into @rnd.
380  */
381 struct rtentry *
382 lookup_prefix(struct rib_head *rnh, const struct rt_addrinfo *info,
383     struct route_nhop_data *rnd)
384 {
385 	struct rtentry *rt;
386 
387 	rt = lookup_prefix_bysa(rnh, info->rti_info[RTAX_DST],
388 	    info->rti_info[RTAX_NETMASK], rnd);
389 
390 	return (rt);
391 }
392 
/*
 * Builds the netmask for prefix length @plen of address family @family
 * and clears the host bits of the destination @_dst in place.
 *
 * @family: address family; AF_INET / AF_INET6 are handled when the
 *   respective protocol is compiled in
 * @plen: prefix length, or -1 when no mask applies
 * @_dst: destination sockaddr of @family, masked in place
 * @pmask: on entry points to caller-provided mask storage; on success it
 *   points to the filled mask, or is set to NULL for host routes
 *   (@plen == -1 or a full-length prefix)
 *
 * Returns true on success, false when @plen is out of range for @family
 * or @family is not supported.
 */
static bool
fill_pxmask_family(int family, int plen, struct sockaddr *_dst,
    struct sockaddr **pmask)
{
	if (plen == -1) {
		*pmask = NULL;
		return (true);
	}

	switch (family) {
#ifdef INET
	case AF_INET:
		{
			struct sockaddr_in *mask = (struct sockaddr_in *)(*pmask);
			struct sockaddr_in *dst = (struct sockaddr_in *)_dst;
			uint32_t maddr;

			if (plen < 0 || plen > 32)
				return (false);

			memset(mask, 0, sizeof(*mask));
			mask->sin_family = family;
			mask->sin_len = sizeof(*mask);
			if (plen == 32) {
				*pmask = NULL;
				return (true);
			}

			/*
			 * Unsigned arithmetic on purpose: for plen == 1 a
			 * signed "1 << 31" overflows int, which is undefined.
			 * plen == 0 is special-cased as a shift by 32 would
			 * be undefined as well.
			 */
			maddr = (plen != 0) ?
			    htonl(~((1U << (32 - plen)) - 1)) : 0;
			mask->sin_addr.s_addr = maddr;
			/* Both operands are in network byte order. */
			dst->sin_addr.s_addr &= maddr;
			return (true);
		}
#endif
#ifdef INET6
	case AF_INET6:
		{
			struct sockaddr_in6 *mask = (struct sockaddr_in6 *)(*pmask);
			struct sockaddr_in6 *dst = (struct sockaddr_in6 *)_dst;

			if (plen < 0 || plen > 128)
				return (false);

			memset(mask, 0, sizeof(*mask));
			mask->sin6_family = family;
			mask->sin6_len = sizeof(*mask);
			if (plen == 128) {
				*pmask = NULL;
				return (true);
			}

			ip6_writemask(&mask->sin6_addr, plen);
			IN6_MASK_ADDR(&dst->sin6_addr, &mask->sin6_addr);
			return (true);
		}
#endif
	default:
		break;
	}

	/* Unsupported address family. */
	return (false);
}
452 
453 /*
454  * Attempts to add @dst/plen prefix with nexthop/nexhopgroup data @rnd
455  * to the routing table.
456  *
457  * @fibnum: rtable id to insert route to
458  * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
459  * @plen: prefix length (or -1 if host route or not applicable for AF)
460  * @op_flags: combination of RTM_F_ flags
461  * @rc: storage to report operation result
462  *
463  * Returns 0 on success.
464  */
465 int
466 rib_add_route_px(uint32_t fibnum, struct sockaddr *dst, int plen,
467     struct route_nhop_data *rnd, int op_flags, struct rib_cmd_info *rc)
468 {
469 	union sockaddr_union mask_storage;
470 	struct sockaddr *netmask = &mask_storage.sa;
471 	struct rtentry *rt = NULL;
472 
473 	NET_EPOCH_ASSERT();
474 
475 	bzero(rc, sizeof(struct rib_cmd_info));
476 	rc->rc_cmd = RTM_ADD;
477 
478 	struct rib_head *rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
479 	if (rnh == NULL)
480 		return (EAFNOSUPPORT);
481 
482 	if (!fill_pxmask_family(dst->sa_family, plen, dst, &netmask)) {
483 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: invalid plen %d", plen);
484 		return (EINVAL);
485 	}
486 
487 	if (op_flags & RTM_F_CREATE) {
488 		if ((rt = rt_alloc(rnh, dst, netmask)) == NULL) {
489 			FIB_RH_LOG(LOG_INFO, rnh, "rtentry allocation failed");
490 			return (ENOMEM);
491 		}
492 	}
493 
494 	return (add_route_flags(rnh, rt, rnd, op_flags, rc));
495 }
496 
497 /*
498  * Attempts to delete @dst/plen prefix matching gateway @gw from the
499  *  routing rable.
500  *
501  * @fibnum: rtable id to remove route from
502  * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
503  * @plen: prefix length (or -1 if host route or not applicable for AF)
504  * @gw: gateway to match
505  * @op_flags: combination of RTM_F_ flags
506  * @rc: storage to report operation result
507  *
508  * Returns 0 on success.
509  */
510 int
511 rib_del_route_px_gw(uint32_t fibnum, struct sockaddr *dst, int plen,
512     const struct sockaddr *gw, int op_flags, struct rib_cmd_info *rc)
513 {
514 	struct gw_filter_data gwd = { .gw = gw };
515 
516 	return (rib_del_route_px(fibnum, dst, plen, match_gw_one, &gwd, op_flags, rc));
517 }
518 
519 /*
520  * Attempts to delete @dst/plen prefix matching @filter_func from the
521  *  routing rable.
522  *
523  * @fibnum: rtable id to remove route from
524  * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
525  * @plen: prefix length (or -1 if host route or not applicable for AF)
526  * @filter_func: func to be called for each nexthop of the prefix for matching
527  * @filter_arg: argument to pass to @filter_func
528  * @op_flags: combination of RTM_F_ flags
529  * @rc: storage to report operation result
530  *
531  * Returns 0 on success.
532  */
533 int
534 rib_del_route_px(uint32_t fibnum, struct sockaddr *dst, int plen,
535     rib_filter_f_t *filter_func, void *filter_arg, int op_flags,
536     struct rib_cmd_info *rc)
537 {
538 	union sockaddr_union mask_storage;
539 	struct sockaddr *netmask = &mask_storage.sa;
540 	int error;
541 
542 	NET_EPOCH_ASSERT();
543 
544 	bzero(rc, sizeof(struct rib_cmd_info));
545 	rc->rc_cmd = RTM_DELETE;
546 
547 	struct rib_head *rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
548 	if (rnh == NULL)
549 		return (EAFNOSUPPORT);
550 
551 	if (dst->sa_len > sizeof(mask_storage)) {
552 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too big: %d", dst->sa_len);
553 		return (EINVAL);
554 	}
555 
556 	if (!fill_pxmask_family(dst->sa_family, plen, dst, &netmask)) {
557 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: invalid plen %d", plen);
558 		return (EINVAL);
559 	}
560 
561 	int prio = (op_flags & RTM_F_FORCE) ? NH_PRIORITY_HIGH : NH_PRIORITY_NORMAL;
562 
563 	RIB_WLOCK(rnh);
564 	struct route_nhop_data rnd;
565 	struct rtentry *rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd);
566 	if (rt != NULL) {
567 		error = rt_delete_conditional(rnh, rt, prio, filter_func,
568 		    filter_arg, rc);
569 	} else
570 		error = ESRCH;
571 	RIB_WUNLOCK(rnh);
572 
573 	if (error != 0)
574 		return (error);
575 
576 	rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
577 
578 	if (rc->rc_cmd == RTM_DELETE)
579 		rt_free(rc->rc_rt);
580 #ifdef ROUTE_MPATH
581 	else {
582 		/*
583 		 * Deleting 1 path may result in RTM_CHANGE to
584 		 * a different mpath group/nhop.
585 		 * Free old mpath group.
586 		 */
587 		nhop_free_any(rc->rc_nh_old);
588 	}
589 #endif
590 
591 	return (0);
592 }
593 
594 /*
595  * Tries to copy route @rt from one rtable to the rtable specified by @dst_rh.
596  * @rt: route to copy.
597  * @rnd_src: nhop and weight. Multipath routes are not supported
598  * @rh_dst: target rtable.
599  * @rc: operation result storage
600  *
601  * Return 0 on success.
602  */
603 int
604 rib_copy_route(struct rtentry *rt, const struct route_nhop_data *rnd_src,
605     struct rib_head *rh_dst, struct rib_cmd_info *rc)
606 {
607 	struct nhop_object __diagused *nh_src = rnd_src->rnd_nhop;
608 	int error;
609 
610 	MPASS((nh_src->nh_flags & NHF_MULTIPATH) == 0);
611 
612 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
613 		char nhbuf[NHOP_PRINT_BUFSIZE], rtbuf[NHOP_PRINT_BUFSIZE];
614 		nhop_print_buf_any(nh_src, nhbuf, sizeof(nhbuf));
615 		rt_print_buf(rt, rtbuf, sizeof(rtbuf));
616 		FIB_RH_LOG(LOG_DEBUG2, rh_dst, "copying %s -> %s from fib %u",
617 		    rtbuf, nhbuf, nhop_get_fibnum(nh_src));
618 	}
619 	struct nhop_object *nh = nhop_alloc(rh_dst->rib_fibnum, rh_dst->rib_family);
620 	if (nh == NULL) {
621 		FIB_RH_LOG(LOG_INFO, rh_dst, "unable to allocate new nexthop");
622 		return (ENOMEM);
623 	}
624 	nhop_copy(nh, rnd_src->rnd_nhop);
625 	nhop_set_origin(nh, nhop_get_origin(rnd_src->rnd_nhop));
626 	nhop_set_fibnum(nh, rh_dst->rib_fibnum);
627 	nh = nhop_get_nhop_internal(rh_dst, nh, &error);
628 	if (error != 0) {
629 		FIB_RH_LOG(LOG_INFO, rh_dst,
630 		    "unable to finalize new nexthop: error %d", error);
631 		return (ENOMEM);
632 	}
633 
634 	struct rtentry *rt_new = rt_alloc(rh_dst, rt_key(rt), rt_mask(rt));
635 	if (rt_new == NULL) {
636 		FIB_RH_LOG(LOG_INFO, rh_dst, "unable to create new rtentry");
637 		nhop_free(nh);
638 		return (ENOMEM);
639 	}
640 
641 	struct route_nhop_data rnd = {
642 		.rnd_nhop = nh,
643 		.rnd_weight = rnd_src->rnd_weight
644 	};
645 	int op_flags = RTM_F_CREATE | (NH_IS_PINNED(nh) ? RTM_F_FORCE : 0);
646 	error = add_route_flags(rh_dst, rt_new, &rnd, op_flags, rc);
647 
648 	if (error != 0) {
649 		IF_DEBUG_LEVEL(LOG_DEBUG2) {
650 			char buf[NHOP_PRINT_BUFSIZE];
651 			rt_print_buf(rt_new, buf, sizeof(buf));
652 			FIB_RH_LOG(LOG_DEBUG, rh_dst,
653 			    "Unable to add route %s: error %d", buf, error);
654 		}
655 		nhop_free(nh);
656 		rt_free_immediate(rt_new);
657 	}
658 	return (error);
659 }
660 
661 /*
662  * Adds route defined by @info into the kernel table specified by @fibnum and
663  * sa_family in @info->rti_info[RTAX_DST].
664  *
665  * Returns 0 on success and fills in operation metadata into @rc.
666  */
667 int
668 rib_add_route(uint32_t fibnum, struct rt_addrinfo *info,
669     struct rib_cmd_info *rc)
670 {
671 	struct rib_head *rnh;
672 	int error;
673 
674 	NET_EPOCH_ASSERT();
675 
676 	rnh = get_rnh(fibnum, info);
677 	if (rnh == NULL)
678 		return (EAFNOSUPPORT);
679 
680 	/*
681 	 * Check consistency between RTF_HOST flag and netmask
682 	 * existence.
683 	 */
684 	if (info->rti_flags & RTF_HOST)
685 		info->rti_info[RTAX_NETMASK] = NULL;
686 	else if (info->rti_info[RTAX_NETMASK] == NULL) {
687 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: no RTF_HOST and empty netmask");
688 		return (EINVAL);
689 	}
690 
691 	bzero(rc, sizeof(struct rib_cmd_info));
692 	rc->rc_cmd = RTM_ADD;
693 
694 	error = add_route_byinfo(rnh, info, rc);
695 	if (error == 0)
696 		rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
697 
698 	return (error);
699 }
700 
701 static int
702 add_route_byinfo(struct rib_head *rnh, struct rt_addrinfo *info,
703     struct rib_cmd_info *rc)
704 {
705 	struct route_nhop_data rnd_add;
706 	struct nhop_object *nh;
707 	struct rtentry *rt;
708 	struct sockaddr *dst, *gateway, *netmask;
709 	int error;
710 
711 	dst = info->rti_info[RTAX_DST];
712 	gateway = info->rti_info[RTAX_GATEWAY];
713 	netmask = info->rti_info[RTAX_NETMASK];
714 
715 	if ((info->rti_flags & RTF_GATEWAY) && !gateway) {
716 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: RTF_GATEWAY set with empty gw");
717 		return (EINVAL);
718 	}
719 	if (dst && gateway && !nhop_check_gateway(dst->sa_family, gateway->sa_family)) {
720 		FIB_RH_LOG(LOG_DEBUG, rnh,
721 		    "error: invalid dst/gateway family combination (%d, %d)",
722 		    dst->sa_family, gateway->sa_family);
723 		return (EINVAL);
724 	}
725 
726 	if (dst->sa_len > sizeof(((struct rtentry *)NULL)->rt_dstb)) {
727 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too large: %d",
728 		    dst->sa_len);
729 		return (EINVAL);
730 	}
731 
732 	if (info->rti_ifa == NULL) {
733 		error = rt_getifa_fib(info, rnh->rib_fibnum);
734 		if (error)
735 			return (error);
736 	}
737 
738 	if ((rt = rt_alloc(rnh, dst, netmask)) == NULL)
739 		return (ENOBUFS);
740 
741 	error = nhop_create_from_info(rnh, info, &nh);
742 	if (error != 0) {
743 		rt_free_immediate(rt);
744 		return (error);
745 	}
746 
747 	rnd_add.rnd_nhop = nh;
748 	rnd_add.rnd_weight = get_info_weight(info, RT_DEFAULT_WEIGHT);
749 
750 	int op_flags = RTM_F_CREATE;
751 	if (get_prio_from_info(info) == NH_PRIORITY_HIGH)
752 		op_flags |= RTM_F_FORCE;
753 	else
754 		op_flags |= RTM_F_APPEND;
755 	return (add_route_flags(rnh, rt, &rnd_add, op_flags, rc));
756 
757 }
758 
759 static int
760 add_route_flags(struct rib_head *rnh, struct rtentry *rt, struct route_nhop_data *rnd_add,
761     int op_flags, struct rib_cmd_info *rc)
762 {
763 	struct route_nhop_data rnd_orig;
764 	struct nhop_object *nh;
765 	struct rtentry *rt_orig;
766 	int error = 0;
767 
768 	nh = rnd_add->rnd_nhop;
769 
770 	RIB_WLOCK(rnh);
771 
772 	rt_orig = lookup_prefix_rt(rnh, rt, &rnd_orig);
773 
774 	if (rt_orig == NULL) {
775 		if (op_flags & RTM_F_CREATE)
776 			error = add_route(rnh, rt, rnd_add, rc);
777 		else
778 			error = ESRCH; /* no entry but creation was not required */
779 		RIB_WUNLOCK(rnh);
780 		if (error != 0)
781 			goto out;
782 		return (0);
783 	}
784 
785 	if (op_flags & RTM_F_EXCL) {
786 		/* We have existing route in the RIB but not allowed to replace. */
787 		RIB_WUNLOCK(rnh);
788 		error = EEXIST;
789 		goto out;
790 	}
791 
792 	/* Now either append or replace */
793 	if (op_flags & RTM_F_REPLACE) {
794 		if (nhop_get_prio(rnd_orig.rnd_nhop) > nhop_get_prio(rnd_add->rnd_nhop)) {
795 			/* Old path is "better" (e.g. has PINNED flag set) */
796 			RIB_WUNLOCK(rnh);
797 			error = EEXIST;
798 			goto out;
799 		}
800 		change_route(rnh, rt_orig, rnd_add, rc);
801 		RIB_WUNLOCK(rnh);
802 		nh = rc->rc_nh_old;
803 		goto out;
804 	}
805 
806 	RIB_WUNLOCK(rnh);
807 
808 #ifdef ROUTE_MPATH
809 	if ((op_flags & RTM_F_APPEND) && rib_can_multipath(rnh) &&
810 	    nhop_can_multipath(rnd_add->rnd_nhop) &&
811 	    nhop_can_multipath(rnd_orig.rnd_nhop)) {
812 
813 		for (int i = 0; i < RIB_MAX_RETRIES; i++) {
814 			error = add_route_flags_mpath(rnh, rt_orig, rnd_add, &rnd_orig,
815 			    op_flags, rc);
816 			if (error != EAGAIN)
817 				break;
818 			RTSTAT_INC(rts_add_retry);
819 		}
820 
821 		/*
822 		 *  Original nhop reference is unused in any case.
823 		 */
824 		nhop_free_any(rnd_add->rnd_nhop);
825 		if (op_flags & RTM_F_CREATE) {
826 			if (error != 0 || rc->rc_cmd != RTM_ADD)
827 				rt_free_immediate(rt);
828 		}
829 		return (error);
830 	}
831 #endif
832 	/* Out of options - free state and return error */
833 	error = EEXIST;
834 out:
835 	if (op_flags & RTM_F_CREATE)
836 		rt_free_immediate(rt);
837 	nhop_free_any(nh);
838 
839 	return (error);
840 }
841 
842 #ifdef ROUTE_MPATH
843 static int
844 add_route_flags_mpath(struct rib_head *rnh, struct rtentry *rt,
845     struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_orig,
846     int op_flags, struct rib_cmd_info *rc)
847 {
848 	RIB_RLOCK_TRACKER;
849 	struct route_nhop_data rnd_new;
850 	int error = 0;
851 
852 	error = nhgrp_get_addition_group(rnh, rnd_orig, rnd_add, &rnd_new);
853 	if (error != 0) {
854 		if (error == EAGAIN) {
855 			/*
856 			 * Group creation failed, most probably because
857 			 * @rnd_orig data got scheduled for deletion.
858 			 * Refresh @rnd_orig data and retry.
859 			 */
860 			RIB_RLOCK(rnh);
861 			lookup_prefix_rt(rnh, rt, rnd_orig);
862 			RIB_RUNLOCK(rnh);
863 			if (rnd_orig == NULL && !(op_flags & RTM_F_CREATE)) {
864 				/* In this iteration route doesn't exist */
865 				error = ENOENT;
866 			}
867 		}
868 		return (error);
869 	}
870 	error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
871 	if (error != 0)
872 		return (error);
873 
874 	if (V_fib_hash_outbound == 0 && NH_IS_NHGRP(rc->rc_nh_new)) {
875 		/*
876 		 * First multipath route got installed. Enable local
877 		 * outbound connections hashing.
878 		 */
879 		if (bootverbose)
880 			printf("FIB: enabled flowid calculation for locally-originated packets\n");
881 		V_fib_hash_outbound = 1;
882 	}
883 
884 	return (0);
885 }
886 #endif
887 
888 /*
889  * Removes route defined by @info from the kernel table specified by @fibnum and
890  * sa_family in @info->rti_info[RTAX_DST].
891  *
892  * Returns 0 on success and fills in operation metadata into @rc.
893  */
894 int
895 rib_del_route(uint32_t fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc)
896 {
897 	struct rib_head *rnh;
898 	struct sockaddr *dst, *netmask;
899 	struct sockaddr_storage mdst;
900 	int error;
901 
902 	NET_EPOCH_ASSERT();
903 
904 	rnh = get_rnh(fibnum, info);
905 	if (rnh == NULL)
906 		return (EAFNOSUPPORT);
907 
908 	bzero(rc, sizeof(struct rib_cmd_info));
909 	rc->rc_cmd = RTM_DELETE;
910 
911 	dst = info->rti_info[RTAX_DST];
912 	netmask = info->rti_info[RTAX_NETMASK];
913 
914 	if (netmask != NULL) {
915 		/* Ensure @dst is always properly masked */
916 		if (dst->sa_len > sizeof(mdst)) {
917 			FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too large");
918 			return (EINVAL);
919 		}
920 		rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask);
921 		dst = (struct sockaddr *)&mdst;
922 	}
923 
924 	rib_filter_f_t *filter_func = NULL;
925 	void *filter_arg = NULL;
926 	struct gw_filter_data gwd = { .gw = info->rti_info[RTAX_GATEWAY] };
927 
928 	if (info->rti_filter != NULL) {
929 		filter_func = info->rti_filter;
930 		filter_arg = info->rti_filterdata;
931 	} else if (gwd.gw != NULL) {
932 		filter_func = match_gw_one;
933 		filter_arg = &gwd;
934 	}
935 
936 	int prio = get_prio_from_info(info);
937 
938 	RIB_WLOCK(rnh);
939 	struct route_nhop_data rnd;
940 	struct rtentry *rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd);
941 	if (rt != NULL) {
942 		error = rt_delete_conditional(rnh, rt, prio, filter_func,
943 		    filter_arg, rc);
944 	} else
945 		error = ESRCH;
946 	RIB_WUNLOCK(rnh);
947 
948 	if (error != 0)
949 		return (error);
950 
951 	rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
952 
953 	if (rc->rc_cmd == RTM_DELETE)
954 		rt_free(rc->rc_rt);
955 #ifdef ROUTE_MPATH
956 	else {
957 		/*
958 		 * Deleting 1 path may result in RTM_CHANGE to
959 		 * a different mpath group/nhop.
960 		 * Free old mpath group.
961 		 */
962 		nhop_free_any(rc->rc_nh_old);
963 	}
964 #endif
965 
966 	return (0);
967 }
968 
969 /*
970  * Conditionally unlinks rtentry paths from @rnh matching @cb.
971  * Returns 0 on success with operation result stored in @rc.
972  * On error, returns:
973  * ESRCH - if prefix was not found or filter function failed to match
974  * EADDRINUSE - if trying to delete higher priority route.
975  */
976 static int
977 rt_delete_conditional(struct rib_head *rnh, struct rtentry *rt,
978     int prio, rib_filter_f_t *cb, void *cbdata, struct rib_cmd_info *rc)
979 {
980 	struct nhop_object *nh = rt->rt_nhop;
981 
982 #ifdef ROUTE_MPATH
983 	if (NH_IS_NHGRP(nh)) {
984 		struct nhgrp_object *nhg = (struct nhgrp_object *)nh;
985 		struct route_nhop_data rnd;
986 		int error;
987 
988 		if (cb == NULL)
989 			return (ESRCH);
990 		error = nhgrp_get_filtered_group(rnh, rt, nhg, cb, cbdata, &rnd);
991 		if (error == 0) {
992 			if (rnd.rnd_nhgrp == nhg) {
993 				/* No match, unreference new group and return. */
994 				nhop_free_any(rnd.rnd_nhop);
995 				return (ESRCH);
996 			}
997 			error = change_route(rnh, rt, &rnd, rc);
998 		}
999 		return (error);
1000 	}
1001 #endif
1002 	if (cb != NULL && !cb(rt, nh, cbdata))
1003 		return (ESRCH);
1004 
1005 	if (prio < nhop_get_prio(nh))
1006 		return (EADDRINUSE);
1007 
1008 	return (delete_route(rnh, rt, rc));
1009 }
1010 
1011 int
1012 rib_change_route(uint32_t fibnum, struct rt_addrinfo *info,
1013     struct rib_cmd_info *rc)
1014 {
1015 	RIB_RLOCK_TRACKER;
1016 	struct route_nhop_data rnd_orig;
1017 	struct rib_head *rnh;
1018 	struct rtentry *rt;
1019 	int error;
1020 
1021 	NET_EPOCH_ASSERT();
1022 
1023 	rnh = get_rnh(fibnum, info);
1024 	if (rnh == NULL)
1025 		return (EAFNOSUPPORT);
1026 
1027 	bzero(rc, sizeof(struct rib_cmd_info));
1028 	rc->rc_cmd = RTM_CHANGE;
1029 
1030 	/* Check if updated gateway exists */
1031 	if ((info->rti_flags & RTF_GATEWAY) &&
1032 	    (info->rti_info[RTAX_GATEWAY] == NULL)) {
1033 
1034 		/*
1035 		 * route(8) adds RTF_GATEWAY flag if -interface is not set.
1036 		 * Remove RTF_GATEWAY to enforce consistency and maintain
1037 		 * compatibility..
1038 		 */
1039 		info->rti_flags &= ~RTF_GATEWAY;
1040 	}
1041 
1042 	/*
1043 	 * route change is done in multiple steps, with dropping and
1044 	 * reacquiring lock. In the situations with multiple processes
1045 	 * changes the same route in can lead to the case when route
1046 	 * is changed between the steps. Address it by retrying the operation
1047 	 * multiple times before failing.
1048 	 */
1049 
1050 	RIB_RLOCK(rnh);
1051 	rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST],
1052 	    info->rti_info[RTAX_NETMASK], &rnh->head);
1053 
1054 	if (rt == NULL) {
1055 		RIB_RUNLOCK(rnh);
1056 		return (ESRCH);
1057 	}
1058 
1059 	rnd_orig.rnd_nhop = rt->rt_nhop;
1060 	rnd_orig.rnd_weight = rt->rt_weight;
1061 
1062 	RIB_RUNLOCK(rnh);
1063 
1064 	for (int i = 0; i < RIB_MAX_RETRIES; i++) {
1065 		error = change_route_byinfo(rnh, rt, info, &rnd_orig, rc);
1066 		if (error != EAGAIN)
1067 			break;
1068 	}
1069 
1070 	return (error);
1071 }
1072 
1073 static int
1074 change_nhop(struct rib_head *rnh, struct rt_addrinfo *info,
1075     struct nhop_object *nh_orig, struct nhop_object **nh_new)
1076 {
1077 	int error;
1078 
1079 	/*
1080 	 * New gateway could require new ifaddr, ifp;
1081 	 * flags may also be different; ifp may be specified
1082 	 * by ll sockaddr when protocol address is ambiguous
1083 	 */
1084 	if (((nh_orig->nh_flags & NHF_GATEWAY) &&
1085 	    info->rti_info[RTAX_GATEWAY] != NULL) ||
1086 	    info->rti_info[RTAX_IFP] != NULL ||
1087 	    (info->rti_info[RTAX_IFA] != NULL &&
1088 	     !sa_equal(info->rti_info[RTAX_IFA], nh_orig->nh_ifa->ifa_addr))) {
1089 		error = rt_getifa_fib(info, rnh->rib_fibnum);
1090 
1091 		if (error != 0) {
1092 			info->rti_ifa = NULL;
1093 			return (error);
1094 		}
1095 	}
1096 
1097 	error = nhop_create_from_nhop(rnh, nh_orig, info, nh_new);
1098 	info->rti_ifa = NULL;
1099 
1100 	return (error);
1101 }
1102 
#ifdef ROUTE_MPATH
/*
 * Changes a single path of the multipath route @rt.
 *
 * The path is the first nexthop of the group in @rnd_orig for which
 * check_info_match_nhop() returns 0. A replacement nexthop is built with
 * change_nhop(), a new group is requested for the updated nexthop array and
 * the route is switched to it via change_route_conditional().
 *
 * Returns ESRCH if no path matches, EAGAIN if the temporary array cannot
 * be allocated, or the error of the failing step.
 */
static int
change_mpath_route(struct rib_head *rnh, struct rtentry *rt,
    struct rt_addrinfo *info, struct route_nhop_data *rnd_orig,
    struct rib_cmd_info *rc)
{
	struct route_nhop_data rnd_new = {};
	const struct weightened_nhop *wn_cur;
	struct weightened_nhop *wn_copy;
	struct nhop_object *nh_match, *nh_repl;
	uint32_t count;
	int idx, error;

	wn_cur = nhgrp_get_nhops(rnd_orig->rnd_nhgrp, &count);

	/* Locate the first path matching the request; idx keeps its position. */
	nh_match = NULL;
	for (idx = 0; idx < count; idx++) {
		if (check_info_match_nhop(info, NULL, wn_cur[idx].nh) == 0) {
			nh_match = wn_cur[idx].nh;
			break;
		}
	}
	if (nh_match == NULL)
		return (ESRCH);

	error = change_nhop(rnh, info, nh_match, &nh_repl);
	if (error != 0)
		return (error);

	wn_copy = mallocarray(count, sizeof(struct weightened_nhop),
	    M_TEMP, M_NOWAIT | M_ZERO);
	if (wn_copy == NULL) {
		nhop_free(nh_repl);
		return (EAGAIN);
	}

	/* Clone the existing paths and substitute the changed one. */
	memcpy(wn_copy, wn_cur, count * sizeof(struct weightened_nhop));
	wn_copy[idx].nh = nh_repl;
	wn_copy[idx].weight = get_info_weight(info, wn_cur[idx].weight);

	error = nhgrp_get_group(rnh, wn_copy, count, 0, &rnd_new.rnd_nhgrp);

	/* Drop the local nexthop reference and the scratch array either way. */
	nhop_free(nh_repl);
	free(wn_copy, M_TEMP);

	if (error != 0)
		return (error);

	return (change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc));
}
#endif
1155 
1156 static int
1157 change_route_byinfo(struct rib_head *rnh, struct rtentry *rt,
1158     struct rt_addrinfo *info, struct route_nhop_data *rnd_orig,
1159     struct rib_cmd_info *rc)
1160 {
1161 	int error = 0;
1162 	struct nhop_object *nh_orig;
1163 	struct route_nhop_data rnd_new;
1164 
1165 	nh_orig = rnd_orig->rnd_nhop;
1166 	if (nh_orig == NULL)
1167 		return (ESRCH);
1168 
1169 #ifdef ROUTE_MPATH
1170 	if (NH_IS_NHGRP(nh_orig))
1171 		return (change_mpath_route(rnh, rt, info, rnd_orig, rc));
1172 #endif
1173 
1174 	rnd_new.rnd_weight = get_info_weight(info, rnd_orig->rnd_weight);
1175 	error = change_nhop(rnh, info, nh_orig, &rnd_new.rnd_nhop);
1176 	if (error != 0)
1177 		return (error);
1178 	error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
1179 
1180 	return (error);
1181 }
1182 
1183 /*
1184  * Insert @rt with nhop data from @rnd_new to @rnh.
1185  * Returns 0 on success and stores operation results in @rc.
1186  */
1187 static int
1188 add_route(struct rib_head *rnh, struct rtentry *rt,
1189     struct route_nhop_data *rnd, struct rib_cmd_info *rc)
1190 {
1191 	struct radix_node *rn;
1192 
1193 	RIB_WLOCK_ASSERT(rnh);
1194 
1195 	rt->rt_nhop = rnd->rnd_nhop;
1196 	rt->rt_weight = rnd->rnd_weight;
1197 	rn = rnh->rnh_addaddr(rt_key(rt), rt_mask_const(rt), &rnh->head, rt->rt_nodes);
1198 
1199 	if (rn != NULL) {
1200 		if (!NH_IS_NHGRP(rnd->rnd_nhop) && nhop_get_expire(rnd->rnd_nhop))
1201 			tmproutes_update(rnh, rt, rnd->rnd_nhop);
1202 
1203 		/* Finalize notification */
1204 		rib_bump_gen(rnh);
1205 		rnh->rnh_prefixes++;
1206 
1207 		rc->rc_cmd = RTM_ADD;
1208 		rc->rc_rt = rt;
1209 		rc->rc_nh_old = NULL;
1210 		rc->rc_nh_new = rnd->rnd_nhop;
1211 		rc->rc_nh_weight = rnd->rnd_weight;
1212 
1213 		rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1214 		return (0);
1215 	}
1216 
1217 	/* Existing route or memory allocation failure. */
1218 	return (EEXIST);
1219 }
1220 
1221 /*
1222  * Unconditionally deletes @rt from @rnh.
1223  */
1224 static int
1225 delete_route(struct rib_head *rnh, struct rtentry *rt, struct rib_cmd_info *rc)
1226 {
1227 	RIB_WLOCK_ASSERT(rnh);
1228 
1229 	/* Route deletion requested. */
1230 	struct radix_node *rn;
1231 
1232 	rn = rnh->rnh_deladdr(rt_key_const(rt), rt_mask_const(rt), &rnh->head);
1233 	if (rn == NULL)
1234 		return (ESRCH);
1235 	rt = RNTORT(rn);
1236 	rt->rte_flags &= ~RTF_UP;
1237 
1238 	rib_bump_gen(rnh);
1239 	rnh->rnh_prefixes--;
1240 
1241 	rc->rc_cmd = RTM_DELETE;
1242 	rc->rc_rt = rt;
1243 	rc->rc_nh_old = rt->rt_nhop;
1244 	rc->rc_nh_new = NULL;
1245 	rc->rc_nh_weight = rt->rt_weight;
1246 
1247 	rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1248 
1249 	return (0);
1250 }
1251 
1252 /*
1253  * Switch @rt nhop/weigh to the ones specified in @rnd.
1254  * Returns 0 on success.
1255  */
1256 int
1257 change_route(struct rib_head *rnh, struct rtentry *rt,
1258     struct route_nhop_data *rnd, struct rib_cmd_info *rc)
1259 {
1260 	struct nhop_object *nh_orig;
1261 
1262 	RIB_WLOCK_ASSERT(rnh);
1263 
1264 	nh_orig = rt->rt_nhop;
1265 
1266 	if (rnd->rnd_nhop == NULL)
1267 		return (delete_route(rnh, rt, rc));
1268 
1269 	/* Changing nexthop & weight to a new one */
1270 	rt->rt_nhop = rnd->rnd_nhop;
1271 	rt->rt_weight = rnd->rnd_weight;
1272 	if (!NH_IS_NHGRP(rnd->rnd_nhop) && nhop_get_expire(rnd->rnd_nhop))
1273 		tmproutes_update(rnh, rt, rnd->rnd_nhop);
1274 
1275 	/* Finalize notification */
1276 	rib_bump_gen(rnh);
1277 	rc->rc_cmd = RTM_CHANGE;
1278 	rc->rc_rt = rt;
1279 	rc->rc_nh_old = nh_orig;
1280 	rc->rc_nh_new = rnd->rnd_nhop;
1281 	rc->rc_nh_weight = rnd->rnd_weight;
1282 
1283 	rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1284 
1285 	return (0);
1286 }
1287 
1288 /*
1289  * Conditionally update route nhop/weight IFF data in @nhd_orig is
1290  *  consistent with the current route data.
1291  * Nexthop in @nhd_new is consumed.
1292  */
1293 int
1294 change_route_conditional(struct rib_head *rnh, struct rtentry *rt,
1295     struct route_nhop_data *rnd_orig, struct route_nhop_data *rnd_new,
1296     struct rib_cmd_info *rc)
1297 {
1298 	struct rtentry *rt_new;
1299 	int error = 0;
1300 
1301 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
1302 		char buf_old[NHOP_PRINT_BUFSIZE], buf_new[NHOP_PRINT_BUFSIZE];
1303 		nhop_print_buf_any(rnd_orig->rnd_nhop, buf_old, NHOP_PRINT_BUFSIZE);
1304 		nhop_print_buf_any(rnd_new->rnd_nhop, buf_new, NHOP_PRINT_BUFSIZE);
1305 		FIB_LOG(LOG_DEBUG2, rnh->rib_fibnum, rnh->rib_family,
1306 		    "trying change %s -> %s", buf_old, buf_new);
1307 	}
1308 	RIB_WLOCK(rnh);
1309 
1310 	struct route_nhop_data rnd;
1311 	rt_new = lookup_prefix_rt(rnh, rt, &rnd);
1312 
1313 	if (rt_new == NULL) {
1314 		if (rnd_orig->rnd_nhop == NULL)
1315 			error = add_route(rnh, rt, rnd_new, rc);
1316 		else {
1317 			/*
1318 			 * Prefix does not exist, which was not our assumption.
1319 			 * Update @rnd_orig with the new data and return
1320 			 */
1321 			rnd_orig->rnd_nhop = NULL;
1322 			rnd_orig->rnd_weight = 0;
1323 			error = EAGAIN;
1324 		}
1325 	} else {
1326 		/* Prefix exists, try to update */
1327 		if (rnd_orig->rnd_nhop == rt_new->rt_nhop) {
1328 			/*
1329 			 * Nhop/mpath group hasn't changed. Flip
1330 			 * to the new precalculated one and return
1331 			 */
1332 			error = change_route(rnh, rt_new, rnd_new, rc);
1333 		} else {
1334 			/* Update and retry */
1335 			rnd_orig->rnd_nhop = rt_new->rt_nhop;
1336 			rnd_orig->rnd_weight = rt_new->rt_weight;
1337 			error = EAGAIN;
1338 		}
1339 	}
1340 
1341 	RIB_WUNLOCK(rnh);
1342 
1343 	if (error == 0) {
1344 		rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
1345 
1346 		if (rnd_orig->rnd_nhop != NULL)
1347 			nhop_free_any(rnd_orig->rnd_nhop);
1348 
1349 	} else {
1350 		if (rnd_new->rnd_nhop != NULL)
1351 			nhop_free_any(rnd_new->rnd_nhop);
1352 	}
1353 
1354 	return (error);
1355 }
1356 
1357 /*
1358  * Performs modification of routing table specificed by @action.
1359  * Table is specified by @fibnum and sa_family in @info->rti_info[RTAX_DST].
1360  * Needs to be run in network epoch.
1361  *
1362  * Returns 0 on success and fills in @rc with action result.
1363  */
1364 int
1365 rib_action(uint32_t fibnum, int action, struct rt_addrinfo *info,
1366     struct rib_cmd_info *rc)
1367 {
1368 	int error;
1369 
1370 	switch (action) {
1371 	case RTM_ADD:
1372 		error = rib_add_route(fibnum, info, rc);
1373 		break;
1374 	case RTM_DELETE:
1375 		error = rib_del_route(fibnum, info, rc);
1376 		break;
1377 	case RTM_CHANGE:
1378 		error = rib_change_route(fibnum, info, rc);
1379 		break;
1380 	default:
1381 		error = ENOTSUP;
1382 	}
1383 
1384 	return (error);
1385 }
1386 
/*
 * State shared between rib_walk_del() and its tree-walk callback
 * rt_checkdelroute().
 */
struct rt_delinfo
{
	struct rib_head *rnh;		/* table being walked */
	struct rtentry *head;		/* deleted rtentries, linked via rt_chain */
	rib_filter_f_t *filter_f;	/* callback passed to rt_delete_conditional() */
	void *filter_arg;		/* opaque argument for @filter_f */
	int prio;			/* priority passed to rt_delete_conditional() */
	struct rib_cmd_info rc;		/* result of the last per-entry operation */
};
1396 
1397 /*
1398  * Conditionally unlinks rtenties or paths from radix tree based
1399  * on the callback data passed in @arg.
1400  */
1401 static int
1402 rt_checkdelroute(struct radix_node *rn, void *arg)
1403 {
1404 	struct rt_delinfo *di = (struct rt_delinfo *)arg;
1405 	struct rtentry *rt = (struct rtentry *)rn;
1406 
1407 	if (rt_delete_conditional(di->rnh, rt, di->prio,
1408 	    di->filter_f, di->filter_arg, &di->rc) != 0)
1409 		return (0);
1410 
1411 	/*
1412 	 * Add deleted rtentries to the list to GC them
1413 	 *  after dropping the lock.
1414 	 *
1415 	 * XXX: Delayed notifications not implemented
1416 	 *  for nexthop updates.
1417 	 */
1418 	if (di->rc.rc_cmd == RTM_DELETE) {
1419 		/* Add to the list and return */
1420 		rt->rt_chain = di->head;
1421 		di->head = rt;
1422 #ifdef ROUTE_MPATH
1423 	} else {
1424 		/*
1425 		 * RTM_CHANGE to a different nexthop or nexthop group.
1426 		 * Free old multipath group.
1427 		 */
1428 		nhop_free_any(di->rc.rc_nh_old);
1429 #endif
1430 	}
1431 
1432 	return (0);
1433 }
1434 
1435 /*
1436  * Iterates over a routing table specified by @fibnum and @family and
1437  *  deletes elements marked by @filter_f.
1438  * @fibnum: rtable id
1439  * @family: AF_ address family
1440  * @filter_f: function returning non-zero value for items to delete
1441  * @arg: data to pass to the @filter_f function
1442  * @report: true if rtsock notification is needed.
1443  */
1444 void
1445 rib_walk_del(u_int fibnum, int family, rib_filter_f_t *filter_f, void *filter_arg,
1446     bool report)
1447 {
1448 	struct rib_head *rnh;
1449 	struct rtentry *rt;
1450 	struct nhop_object *nh;
1451 	struct epoch_tracker et;
1452 
1453 	rnh = rt_tables_get_rnh(fibnum, family);
1454 	if (rnh == NULL)
1455 		return;
1456 
1457 	struct rt_delinfo di = {
1458 		.rnh = rnh,
1459 		.filter_f = filter_f,
1460 		.filter_arg = filter_arg,
1461 		.prio = NH_PRIORITY_NORMAL,
1462 	};
1463 
1464 	NET_EPOCH_ENTER(et);
1465 
1466 	RIB_WLOCK(rnh);
1467 	rnh->rnh_walktree(&rnh->head, rt_checkdelroute, &di);
1468 	RIB_WUNLOCK(rnh);
1469 
1470 	/* We might have something to reclaim. */
1471 	bzero(&di.rc, sizeof(di.rc));
1472 	di.rc.rc_cmd = RTM_DELETE;
1473 	while (di.head != NULL) {
1474 		rt = di.head;
1475 		di.head = rt->rt_chain;
1476 		rt->rt_chain = NULL;
1477 		nh = rt->rt_nhop;
1478 
1479 		di.rc.rc_rt = rt;
1480 		di.rc.rc_nh_old = nh;
1481 		rib_notify(rnh, RIB_NOTIFY_DELAYED, &di.rc);
1482 
1483 		if (report) {
1484 #ifdef ROUTE_MPATH
1485 			struct nhgrp_object *nhg;
1486 			const struct weightened_nhop *wn;
1487 			uint32_t num_nhops;
1488 			if (NH_IS_NHGRP(nh)) {
1489 				nhg = (struct nhgrp_object *)nh;
1490 				wn = nhgrp_get_nhops(nhg, &num_nhops);
1491 				for (int i = 0; i < num_nhops; i++)
1492 					rt_routemsg(RTM_DELETE, rt, wn[i].nh, fibnum);
1493 			} else
1494 #endif
1495 			rt_routemsg(RTM_DELETE, rt, nh, fibnum);
1496 		}
1497 		rt_free(rt);
1498 	}
1499 
1500 	NET_EPOCH_EXIT(et);
1501 }
1502 
1503 static int
1504 rt_delete_unconditional(struct radix_node *rn, void *arg)
1505 {
1506 	struct rtentry *rt = RNTORT(rn);
1507 	struct rib_head *rnh = (struct rib_head *)arg;
1508 
1509 	rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), &rnh->head);
1510 	if (RNTORT(rn) == rt)
1511 		rt_free(rt);
1512 
1513 	return (0);
1514 }
1515 
/*
 * Removes all routes from the routing table without executing notifications.
 * rtentries will be removed after the end of the current epoch.
 */
static void
rib_flush_routes(struct rib_head *rnh)
{
	/* Walk the whole tree under the write lock, unlinking every entry. */
	RIB_WLOCK(rnh);
	rnh->rnh_walktree(&rnh->head, rt_delete_unconditional, rnh);
	RIB_WUNLOCK(rnh);
}
1527 
1528 void
1529 rib_flush_routes_family(int family)
1530 {
1531 	struct rib_head *rnh;
1532 
1533 	for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) {
1534 		if ((rnh = rt_tables_get_rnh(fibnum, family)) != NULL)
1535 			rib_flush_routes(rnh);
1536 	}
1537 }
1538 
/*
 * Returns a static, human-readable name for address family @family:
 * "inet", "inet6" or "link"; any other value yields "unknown".
 */
const char *
rib_print_family(int family)
{
	if (family == AF_INET)
		return ("inet");
	if (family == AF_INET6)
		return ("inet6");
	if (family == AF_LINK)
		return ("link");

	return ("unknown");
}
1552 
1553