xref: /freebsd/sys/net/route/route_ctl.c (revision 0c428864495af9dc7d2af4d0a5ae21732af9c739)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2020 Alexander V. Chernikov
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 #include "opt_inet.h"
31 #include "opt_inet6.h"
32 #include "opt_route.h"
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/malloc.h>
37 #include <sys/mbuf.h>
38 #include <sys/socket.h>
39 #include <sys/sysctl.h>
40 #include <sys/syslog.h>
41 #include <sys/kernel.h>
42 #include <sys/lock.h>
43 #include <sys/rmlock.h>
44 
45 #include <net/if.h>
46 #include <net/if_var.h>
47 #include <net/if_private.h>
48 #include <net/if_dl.h>
49 #include <net/vnet.h>
50 #include <net/route.h>
51 #include <net/route/route_ctl.h>
52 #include <net/route/route_var.h>
53 #include <net/route/nhop_utils.h>
54 #include <net/route/nhop.h>
55 #include <net/route/nhop_var.h>
56 #include <netinet/in.h>
57 #include <netinet6/scope6_var.h>
58 #include <netinet6/in6_var.h>
59 
60 #define	DEBUG_MOD_NAME	route_ctl
61 #define	DEBUG_MAX_LEVEL	LOG_DEBUG
62 #include <net/route/route_debug.h>
63 _DECLARE_DEBUG(LOG_INFO);
64 
/*
 * This file contains the control plane functions for the routing tables.
 *
 * All functions assume that they are called within the net epoch.
 */
70 
/*
 * On-stack scratch storage for a sockaddr built locally in this file
 * (used for the generated netmasks in the prefix-based add/delete paths).
 */
union sockaddr_union {
	struct sockaddr		sa;		/* generic view */
	struct sockaddr_in	sin;		/* IPv4 view */
	struct sockaddr_in6	sin6;		/* IPv6 view */
	char			_buf[32];	/* guarantees at least 32 bytes */
};
77 
78 static int add_route_byinfo(struct rib_head *rnh, struct rt_addrinfo *info,
79     struct rib_cmd_info *rc);
80 static int change_route_byinfo(struct rib_head *rnh, struct rtentry *rt,
81     struct rt_addrinfo *info, struct route_nhop_data *nhd_orig,
82     struct rib_cmd_info *rc);
83 
84 static int add_route_flags(struct rib_head *rnh, struct rtentry *rt,
85     struct route_nhop_data *rnd_add, int op_flags, struct rib_cmd_info *rc);
86 #ifdef ROUTE_MPATH
87 static int add_route_flags_mpath(struct rib_head *rnh, struct rtentry *rt,
88     struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_orig,
89     int op_flags, struct rib_cmd_info *rc);
90 #endif
91 
92 static int add_route(struct rib_head *rnh, struct rtentry *rt,
93     struct route_nhop_data *rnd, struct rib_cmd_info *rc);
94 static int delete_route(struct rib_head *rnh, struct rtentry *rt,
95     struct rib_cmd_info *rc);
96 static int rt_delete_conditional(struct rib_head *rnh, struct rtentry *rt,
97     int prio, rib_filter_f_t *cb, void *cbdata, struct rib_cmd_info *rc);
98 
99 static int get_prio_from_info(const struct rt_addrinfo *info);
100 static int nhop_get_prio(const struct nhop_object *nh);
101 
102 #ifdef ROUTE_MPATH
103 static bool rib_can_multipath(struct rib_head *rh);
104 #endif
105 
106 /* Per-vnet multipath routing configuration */
107 SYSCTL_DECL(_net_route);
108 #define	V_rib_route_multipath	VNET(rib_route_multipath)
109 #ifdef ROUTE_MPATH
110 #define _MP_FLAGS	CTLFLAG_RW
111 #else
112 #define _MP_FLAGS	CTLFLAG_RD
113 #endif
114 VNET_DEFINE(u_int, rib_route_multipath) = 1;
115 SYSCTL_UINT(_net_route, OID_AUTO, multipath, _MP_FLAGS | CTLFLAG_VNET,
116     &VNET_NAME(rib_route_multipath), 0, "Enable route multipath");
117 #undef _MP_FLAGS
118 
#ifdef ROUTE_MPATH
/*
 * Per-vnet flag exported read-only as net.route.hash_outbound.
 * It starts at 0; add_route_flags_mpath() in this file flips it to 1 once
 * the first nexthop group gets installed.
 */
VNET_DEFINE(u_int, fib_hash_outbound) = 0;
SYSCTL_UINT(_net_route, OID_AUTO, hash_outbound, CTLFLAG_RD | CTLFLAG_VNET,
    &VNET_NAME(fib_hash_outbound), 0,
    "Compute flowid for locally-originated packets");

/*
 * Default entropy to add to the hash calculation for the outbound
 * connections.
 */
uint8_t mpath_entropy_key[MPATH_ENTROPY_KEY_LEN] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
};
#endif
134 
/*
 * IPv4-over-IPv6-nexthop support; only compiled when both address
 * families are present.  The per-vnet knob defaults to enabled and is
 * exported read-write as net.route.ipv6_nexthop.
 */
#if defined(INET) && defined(INET6)
FEATURE(ipv4_rfc5549_support, "Route IPv4 packets via IPv6 nexthops");
#define V_rib_route_ipv6_nexthop VNET(rib_route_ipv6_nexthop)
VNET_DEFINE_STATIC(u_int, rib_route_ipv6_nexthop) = 1;
SYSCTL_UINT(_net_route, OID_AUTO, ipv6_nexthop, CTLFLAG_RW | CTLFLAG_VNET,
    &VNET_NAME(rib_route_ipv6_nexthop), 0, "Enable IPv4 route via IPv6 Next Hop address");
#endif
142 
/*
 * Debug bits: declares the net.route.debug sysctl node.
 * NOTE(review): the node description string is empty.
 */
SYSCTL_NODE(_net_route, OID_AUTO, debug, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
145 
146 static struct rib_head *
147 get_rnh(uint32_t fibnum, const struct rt_addrinfo *info)
148 {
149 	struct rib_head *rnh;
150 	struct sockaddr *dst;
151 
152 	KASSERT((fibnum < rt_numfibs), ("rib_add_route: bad fibnum"));
153 
154 	dst = info->rti_info[RTAX_DST];
155 	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
156 
157 	return (rnh);
158 }
159 
#if defined(INET) && defined(INET6)
/*
 * Reports whether IPv4 routes via IPv6 nexthops are enabled in the
 * current vnet (non-zero net.route.ipv6_nexthop).
 */
bool
rib_can_4o6_nhop(void)
{

	return (V_rib_route_ipv6_nexthop != 0);
}
#endif
167 
168 #ifdef ROUTE_MPATH
169 static bool
170 rib_can_multipath(struct rib_head *rh)
171 {
172 	int result;
173 
174 	CURVNET_SET(rh->rib_vnet);
175 	result = !!V_rib_route_multipath;
176 	CURVNET_RESTORE();
177 
178 	return (result);
179 }
180 
181 /*
182  * Check is nhop is multipath-eligible.
183  * Avoid nhops without gateways and redirects.
184  *
185  * Returns 1 for multipath-eligible nexthop,
186  * 0 otherwise.
187  */
188 bool
189 nhop_can_multipath(const struct nhop_object *nh)
190 {
191 
192 	if ((nh->nh_flags & NHF_MULTIPATH) != 0)
193 		return (1);
194 	if ((nh->nh_flags & NHF_GATEWAY) == 0)
195 		return (0);
196 	if ((nh->nh_flags & NHF_REDIRECT) != 0)
197 		return (0);
198 
199 	return (1);
200 }
201 #endif
202 
203 static int
204 get_info_weight(const struct rt_addrinfo *info, uint32_t default_weight)
205 {
206 	uint32_t weight;
207 
208 	if (info->rti_mflags & RTV_WEIGHT)
209 		weight = info->rti_rmx->rmx_weight;
210 	else
211 		weight = default_weight;
212 	/* Keep upper 1 byte for adm distance purposes */
213 	if (weight > RT_MAX_WEIGHT)
214 		weight = RT_MAX_WEIGHT;
215 	else if (weight == 0)
216 		weight = default_weight;
217 
218 	return (weight);
219 }
220 
221 /*
222  * File-local concept for distingushing between the normal and
223  * RTF_PINNED routes tha can override the "normal" one.
224  */
225 #define	NH_PRIORITY_HIGH	2
226 #define	NH_PRIORITY_NORMAL	1
227 static int
228 get_prio_from_info(const struct rt_addrinfo *info)
229 {
230 	if (info->rti_flags & RTF_PINNED)
231 		return (NH_PRIORITY_HIGH);
232 	return (NH_PRIORITY_NORMAL);
233 }
234 
235 static int
236 nhop_get_prio(const struct nhop_object *nh)
237 {
238 	if (NH_IS_PINNED(nh))
239 		return (NH_PRIORITY_HIGH);
240 	return (NH_PRIORITY_NORMAL);
241 }
242 
243 /*
244  * Check if specified @gw matches gw data in the nexthop @nh.
245  *
246  * Returns true if matches, false otherwise.
247  */
248 bool
249 match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw)
250 {
251 
252 	if (nh->gw_sa.sa_family != gw->sa_family)
253 		return (false);
254 
255 	switch (gw->sa_family) {
256 	case AF_INET:
257 		return (nh->gw4_sa.sin_addr.s_addr ==
258 		    ((const struct sockaddr_in *)gw)->sin_addr.s_addr);
259 	case AF_INET6:
260 		{
261 			const struct sockaddr_in6 *gw6;
262 			gw6 = (const struct sockaddr_in6 *)gw;
263 
264 			/*
265 			 * Currently (2020-09) IPv6 gws in kernel have their
266 			 * scope embedded. Once this becomes false, this code
267 			 * has to be revisited.
268 			 */
269 			if (IN6_ARE_ADDR_EQUAL(&nh->gw6_sa.sin6_addr,
270 			    &gw6->sin6_addr))
271 				return (true);
272 			return (false);
273 		}
274 	case AF_LINK:
275 		{
276 			const struct sockaddr_dl *sdl;
277 			sdl = (const struct sockaddr_dl *)gw;
278 			return (nh->gwl_sa.sdl_index == sdl->sdl_index);
279 		}
280 	default:
281 		return (memcmp(&nh->gw_sa, gw, nh->gw_sa.sa_len) == 0);
282 	}
283 
284 	/* NOTREACHED */
285 	return (false);
286 }
287 
/*
 * rib_filter_f compatible callback: matches every nexthop whose gateway
 * equals the sockaddr passed through @gw_sa. @rt is not inspected.
 */
int
rib_match_gw(const struct rtentry *rt, const struct nhop_object *nh, void *gw_sa)
{

	return (match_nhop_gw(nh, (const struct sockaddr *)gw_sa));
}
299 
/* State for match_gw_one(): the gateway to look for and a match counter. */
struct gw_filter_data {
	const struct sockaddr	*gw;	/* gateway to match against */
	int			count;	/* incremented on every gateway match */
};
304 
305 /*
306  * Matches first occurence of the gateway provided in @gwd
307  */
308 static int
309 match_gw_one(const struct rtentry *rt, const struct nhop_object *nh, void *_data)
310 {
311 	struct gw_filter_data *gwd = (struct gw_filter_data *)_data;
312 
313 	/* Return only first match to make rtsock happy */
314 	if (match_nhop_gw(nh, gwd->gw) && gwd->count++ == 0)
315 		return (1);
316 	return (0);
317 }
318 
319 /*
320  * Checks if data in @info matches nexhop @nh.
321  *
322  * Returns 0 on success,
323  * ESRCH if not matched,
324  * ENOENT if filter function returned false
325  */
326 int
327 check_info_match_nhop(const struct rt_addrinfo *info, const struct rtentry *rt,
328     const struct nhop_object *nh)
329 {
330 	const struct sockaddr *gw = info->rti_info[RTAX_GATEWAY];
331 
332 	if (info->rti_filter != NULL) {
333 	    if (info->rti_filter(rt, nh, info->rti_filterdata) == 0)
334 		    return (ENOENT);
335 	    else
336 		    return (0);
337 	}
338 	if ((gw != NULL) && !match_nhop_gw(nh, gw))
339 		return (ESRCH);
340 
341 	return (0);
342 }
343 
344 /*
345  * Runs exact prefix match based on @dst and @netmask.
346  * Returns matched @rtentry if found or NULL.
347  * If rtentry was found, saves nexthop / weight value into @rnd.
348  */
349 static struct rtentry *
350 lookup_prefix_bysa(struct rib_head *rnh, const struct sockaddr *dst,
351     const struct sockaddr *netmask, struct route_nhop_data *rnd)
352 {
353 	struct rtentry *rt;
354 
355 	RIB_LOCK_ASSERT(rnh);
356 
357 	rt = (struct rtentry *)rnh->rnh_lookup(dst, netmask, &rnh->head);
358 	if (rt != NULL) {
359 		rnd->rnd_nhop = rt->rt_nhop;
360 		rnd->rnd_weight = rt->rt_weight;
361 	} else {
362 		rnd->rnd_nhop = NULL;
363 		rnd->rnd_weight = 0;
364 	}
365 
366 	return (rt);
367 }
368 
/*
 * Runs an exact prefix match in @rnh using the key and mask of @rt.
 * Returns the matched rtentry or NULL; @rnd is filled in the same way
 * as by lookup_prefix_bysa().
 */
struct rtentry *
lookup_prefix_rt(struct rib_head *rnh, const struct rtentry *rt,
    struct route_nhop_data *rnd)
{
	const struct sockaddr *key = rt_key_const(rt);
	const struct sockaddr *mask = rt_mask_const(rt);

	return (lookup_prefix_bysa(rnh, key, mask, rnd));
}
375 
376 /*
377  * Runs exact prefix match based on dst/netmask from @info.
378  * Assumes RIB lock is held.
379  * Returns matched @rtentry if found or NULL.
380  * If rtentry was found, saves nexthop / weight value into @rnd.
381  */
382 struct rtentry *
383 lookup_prefix(struct rib_head *rnh, const struct rt_addrinfo *info,
384     struct route_nhop_data *rnd)
385 {
386 	struct rtentry *rt;
387 
388 	rt = lookup_prefix_bysa(rnh, info->rti_info[RTAX_DST],
389 	    info->rti_info[RTAX_NETMASK], rnd);
390 
391 	return (rt);
392 }
393 
/*
 * Builds the netmask for a @family prefix of length @plen and masks the
 * destination address accordingly.
 *
 * @family: address family of @_dst
 * @plen: prefix length, or -1 for "no mask" (host route / not applicable)
 * @_dst: destination sockaddr; its address is masked in place
 * @pmask: in: pointer to caller-provided mask storage;
 *         out: left pointing at the filled mask, or set to NULL when the
 *         prefix is a host route (plen == -1 or full-length prefix)
 *
 * Returns true on success, false when @plen is out of range for @family
 * or the family is not supported. The mask storage is not touched when
 * @plen is rejected.
 */
static bool
fill_pxmask_family(int family, int plen, struct sockaddr *_dst,
    struct sockaddr **pmask)
{
	if (plen == -1) {
		*pmask = NULL;
		return (true);
	}

	switch (family) {
#ifdef INET
	case AF_INET:
		{
			struct sockaddr_in *mask = (struct sockaddr_in *)(*pmask);
			struct sockaddr_in *dst = (struct sockaddr_in *)_dst;

			if (plen < 0 || plen > 32)
				return (false);

			memset(mask, 0, sizeof(*mask));
			mask->sin_family = family;
			mask->sin_len = sizeof(*mask);
			if (plen == 32)
				*pmask = NULL;
			else {
				uint32_t daddr, maddr;

				/*
				 * Use an unsigned constant: with plen == 1 the
				 * shift count is 31, which overflows a signed
				 * int (undefined behaviour).
				 */
				maddr = htonl(plen ? ~((1U << (32 - plen)) - 1) : 0);
				mask->sin_addr.s_addr = maddr;
				daddr = dst->sin_addr.s_addr;
				daddr = htonl(ntohl(daddr) & ntohl(maddr));
				dst->sin_addr.s_addr = daddr;
			}
			return (true);
		}
#endif
#ifdef INET6
	case AF_INET6:
		{
			struct sockaddr_in6 *mask = (struct sockaddr_in6 *)(*pmask);
			struct sockaddr_in6 *dst = (struct sockaddr_in6 *)_dst;

			if (plen < 0 || plen > 128)
				return (false);

			memset(mask, 0, sizeof(*mask));
			mask->sin6_family = family;
			mask->sin6_len = sizeof(*mask);
			if (plen == 128)
				*pmask = NULL;
			else {
				ip6_writemask(&mask->sin6_addr, plen);
				IN6_MASK_ADDR(&dst->sin6_addr, &mask->sin6_addr);
			}
			return (true);
		}
#endif
	default:
		break;
	}

	/* Unsupported address family. */
	return (false);
}
453 
454 /*
455  * Attempts to add @dst/plen prefix with nexthop/nexhopgroup data @rnd
456  * to the routing table.
457  *
458  * @fibnum: verified kernel rtable id to insert route to
459  * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
460  * @plen: prefix length (or -1 if host route or not applicable for AF)
461  * @op_flags: combination of RTM_F_ flags
462  * @rc: storage to report operation result
463  *
464  * Returns 0 on success.
465  */
466 int
467 rib_add_route_px(uint32_t fibnum, struct sockaddr *dst, int plen,
468     struct route_nhop_data *rnd, int op_flags, struct rib_cmd_info *rc)
469 {
470 	union sockaddr_union mask_storage;
471 	struct sockaddr *netmask = &mask_storage.sa;
472 	struct rtentry *rt = NULL;
473 
474 	NET_EPOCH_ASSERT();
475 
476 	bzero(rc, sizeof(struct rib_cmd_info));
477 	rc->rc_cmd = RTM_ADD;
478 
479 	struct rib_head *rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
480 	if (rnh == NULL)
481 		return (EAFNOSUPPORT);
482 
483 	if (!fill_pxmask_family(dst->sa_family, plen, dst, &netmask)) {
484 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: invalid plen %d", plen);
485 		return (EINVAL);
486 	}
487 
488 	if (op_flags & RTM_F_CREATE) {
489 		if ((rt = rt_alloc(rnh, dst, netmask)) == NULL) {
490 			FIB_RH_LOG(LOG_INFO, rnh, "rtentry allocation failed");
491 			return (ENOMEM);
492 		}
493 	} else {
494 		struct route_nhop_data rnd_tmp;
495 		RIB_RLOCK_TRACKER;
496 
497 		RIB_RLOCK(rnh);
498 		rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd_tmp);
499 		RIB_RUNLOCK(rnh);
500 
501 		if (rt == NULL)
502 			return (ESRCH);
503 	}
504 
505 	return (add_route_flags(rnh, rt, rnd, op_flags, rc));
506 }
507 
508 /*
509  * Attempts to delete @dst/plen prefix matching gateway @gw from the
510  *  routing rable.
511  *
512  * @fibnum: rtable id to remove route from
513  * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
514  * @plen: prefix length (or -1 if host route or not applicable for AF)
515  * @gw: gateway to match
516  * @op_flags: combination of RTM_F_ flags
517  * @rc: storage to report operation result
518  *
519  * Returns 0 on success.
520  */
521 int
522 rib_del_route_px_gw(uint32_t fibnum, struct sockaddr *dst, int plen,
523     const struct sockaddr *gw, int op_flags, struct rib_cmd_info *rc)
524 {
525 	struct gw_filter_data gwd = { .gw = gw };
526 
527 	return (rib_del_route_px(fibnum, dst, plen, match_gw_one, &gwd, op_flags, rc));
528 }
529 
530 /*
531  * Attempts to delete @dst/plen prefix matching @filter_func from the
532  *  routing rable.
533  *
534  * @fibnum: rtable id to remove route from
535  * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
536  * @plen: prefix length (or -1 if host route or not applicable for AF)
537  * @filter_func: func to be called for each nexthop of the prefix for matching
538  * @filter_arg: argument to pass to @filter_func
539  * @op_flags: combination of RTM_F_ flags
540  * @rc: storage to report operation result
541  *
542  * Returns 0 on success.
543  */
544 int
545 rib_del_route_px(uint32_t fibnum, struct sockaddr *dst, int plen,
546     rib_filter_f_t *filter_func, void *filter_arg, int op_flags,
547     struct rib_cmd_info *rc)
548 {
549 	union sockaddr_union mask_storage;
550 	struct sockaddr *netmask = &mask_storage.sa;
551 	int error;
552 
553 	NET_EPOCH_ASSERT();
554 
555 	bzero(rc, sizeof(struct rib_cmd_info));
556 	rc->rc_cmd = RTM_DELETE;
557 
558 	struct rib_head *rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
559 	if (rnh == NULL)
560 		return (EAFNOSUPPORT);
561 
562 	if (dst->sa_len > sizeof(mask_storage)) {
563 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too big: %d", dst->sa_len);
564 		return (EINVAL);
565 	}
566 
567 	if (!fill_pxmask_family(dst->sa_family, plen, dst, &netmask)) {
568 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: invalid plen %d", plen);
569 		return (EINVAL);
570 	}
571 
572 	int prio = (op_flags & RTM_F_FORCE) ? NH_PRIORITY_HIGH : NH_PRIORITY_NORMAL;
573 
574 	RIB_WLOCK(rnh);
575 	struct route_nhop_data rnd;
576 	struct rtentry *rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd);
577 	if (rt != NULL) {
578 		error = rt_delete_conditional(rnh, rt, prio, filter_func,
579 		    filter_arg, rc);
580 	} else
581 		error = ESRCH;
582 	RIB_WUNLOCK(rnh);
583 
584 	if (error != 0)
585 		return (error);
586 
587 	rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
588 
589 	if (rc->rc_cmd == RTM_DELETE)
590 		rt_free(rc->rc_rt);
591 #ifdef ROUTE_MPATH
592 	else {
593 		/*
594 		 * Deleting 1 path may result in RTM_CHANGE to
595 		 * a different mpath group/nhop.
596 		 * Free old mpath group.
597 		 */
598 		nhop_free_any(rc->rc_nh_old);
599 	}
600 #endif
601 
602 	return (0);
603 }
604 
605 /*
606  * Tries to copy route @rt from one rtable to the rtable specified by @dst_rh.
607  * @rt: route to copy.
608  * @rnd_src: nhop and weight. Multipath routes are not supported
609  * @rh_dst: target rtable.
610  * @rc: operation result storage
611  *
612  * Return 0 on success.
613  */
614 int
615 rib_copy_route(struct rtentry *rt, const struct route_nhop_data *rnd_src,
616     struct rib_head *rh_dst, struct rib_cmd_info *rc)
617 {
618 	struct nhop_object __diagused *nh_src = rnd_src->rnd_nhop;
619 	int error;
620 
621 	MPASS((nh_src->nh_flags & NHF_MULTIPATH) == 0);
622 
623 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
624 		char nhbuf[NHOP_PRINT_BUFSIZE], rtbuf[NHOP_PRINT_BUFSIZE];
625 		nhop_print_buf_any(nh_src, nhbuf, sizeof(nhbuf));
626 		rt_print_buf(rt, rtbuf, sizeof(rtbuf));
627 		FIB_RH_LOG(LOG_DEBUG2, rh_dst, "copying %s -> %s from fib %u",
628 		    rtbuf, nhbuf, nhop_get_fibnum(nh_src));
629 	}
630 	struct nhop_object *nh = nhop_alloc(rh_dst->rib_fibnum, rh_dst->rib_family);
631 	if (nh == NULL) {
632 		FIB_RH_LOG(LOG_INFO, rh_dst, "unable to allocate new nexthop");
633 		return (ENOMEM);
634 	}
635 	nhop_copy(nh, rnd_src->rnd_nhop);
636 	nhop_set_origin(nh, nhop_get_origin(rnd_src->rnd_nhop));
637 	nhop_set_fibnum(nh, rh_dst->rib_fibnum);
638 	nh = nhop_get_nhop_internal(rh_dst, nh, &error);
639 	if (error != 0) {
640 		FIB_RH_LOG(LOG_INFO, rh_dst,
641 		    "unable to finalize new nexthop: error %d", error);
642 		return (ENOMEM);
643 	}
644 
645 	struct rtentry *rt_new = rt_alloc(rh_dst, rt_key(rt), rt_mask(rt));
646 	if (rt_new == NULL) {
647 		FIB_RH_LOG(LOG_INFO, rh_dst, "unable to create new rtentry");
648 		nhop_free(nh);
649 		return (ENOMEM);
650 	}
651 
652 	struct route_nhop_data rnd = {
653 		.rnd_nhop = nh,
654 		.rnd_weight = rnd_src->rnd_weight
655 	};
656 	int op_flags = RTM_F_CREATE | (NH_IS_PINNED(nh) ? RTM_F_FORCE : 0);
657 	error = add_route_flags(rh_dst, rt_new, &rnd, op_flags, rc);
658 
659 	if (error != 0) {
660 		IF_DEBUG_LEVEL(LOG_DEBUG2) {
661 			char buf[NHOP_PRINT_BUFSIZE];
662 			rt_print_buf(rt_new, buf, sizeof(buf));
663 			FIB_RH_LOG(LOG_DEBUG, rh_dst,
664 			    "Unable to add route %s: error %d", buf, error);
665 		}
666 		nhop_free(nh);
667 		rt_free_immediate(rt_new);
668 	}
669 	return (error);
670 }
671 
672 /*
673  * Adds route defined by @info into the kernel table specified by @fibnum and
674  * sa_family in @info->rti_info[RTAX_DST].
675  *
676  * Returns 0 on success and fills in operation metadata into @rc.
677  */
678 int
679 rib_add_route(uint32_t fibnum, struct rt_addrinfo *info,
680     struct rib_cmd_info *rc)
681 {
682 	struct rib_head *rnh;
683 	int error;
684 
685 	NET_EPOCH_ASSERT();
686 
687 	rnh = get_rnh(fibnum, info);
688 	if (rnh == NULL)
689 		return (EAFNOSUPPORT);
690 
691 	/*
692 	 * Check consistency between RTF_HOST flag and netmask
693 	 * existence.
694 	 */
695 	if (info->rti_flags & RTF_HOST)
696 		info->rti_info[RTAX_NETMASK] = NULL;
697 	else if (info->rti_info[RTAX_NETMASK] == NULL) {
698 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: no RTF_HOST and empty netmask");
699 		return (EINVAL);
700 	}
701 
702 	bzero(rc, sizeof(struct rib_cmd_info));
703 	rc->rc_cmd = RTM_ADD;
704 
705 	error = add_route_byinfo(rnh, info, rc);
706 	if (error == 0)
707 		rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
708 
709 	return (error);
710 }
711 
712 static int
713 add_route_byinfo(struct rib_head *rnh, struct rt_addrinfo *info,
714     struct rib_cmd_info *rc)
715 {
716 	struct route_nhop_data rnd_add;
717 	struct nhop_object *nh;
718 	struct rtentry *rt;
719 	struct sockaddr *dst, *gateway, *netmask;
720 	int error;
721 
722 	dst = info->rti_info[RTAX_DST];
723 	gateway = info->rti_info[RTAX_GATEWAY];
724 	netmask = info->rti_info[RTAX_NETMASK];
725 
726 	if ((info->rti_flags & RTF_GATEWAY) && !gateway) {
727 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: RTF_GATEWAY set with empty gw");
728 		return (EINVAL);
729 	}
730 	if (dst && gateway && !nhop_check_gateway(dst->sa_family, gateway->sa_family)) {
731 		FIB_RH_LOG(LOG_DEBUG, rnh,
732 		    "error: invalid dst/gateway family combination (%d, %d)",
733 		    dst->sa_family, gateway->sa_family);
734 		return (EINVAL);
735 	}
736 
737 	if (dst->sa_len > sizeof(((struct rtentry *)NULL)->rt_dstb)) {
738 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too large: %d",
739 		    dst->sa_len);
740 		return (EINVAL);
741 	}
742 
743 	if (info->rti_ifa == NULL) {
744 		error = rt_getifa_fib(info, rnh->rib_fibnum);
745 		if (error)
746 			return (error);
747 	}
748 
749 	if ((rt = rt_alloc(rnh, dst, netmask)) == NULL)
750 		return (ENOBUFS);
751 
752 	error = nhop_create_from_info(rnh, info, &nh);
753 	if (error != 0) {
754 		rt_free_immediate(rt);
755 		return (error);
756 	}
757 
758 	rnd_add.rnd_nhop = nh;
759 	rnd_add.rnd_weight = get_info_weight(info, RT_DEFAULT_WEIGHT);
760 
761 	int op_flags = RTM_F_CREATE;
762 	if (get_prio_from_info(info) == NH_PRIORITY_HIGH)
763 		op_flags |= RTM_F_FORCE;
764 	else
765 		op_flags |= RTM_F_APPEND;
766 	return (add_route_flags(rnh, rt, &rnd_add, op_flags, rc));
767 
768 }
769 
770 static int
771 add_route_flags(struct rib_head *rnh, struct rtentry *rt, struct route_nhop_data *rnd_add,
772     int op_flags, struct rib_cmd_info *rc)
773 {
774 	struct route_nhop_data rnd_orig;
775 	struct nhop_object *nh;
776 	struct rtentry *rt_orig;
777 	int error = 0;
778 
779 	MPASS(rt != NULL);
780 
781 	nh = rnd_add->rnd_nhop;
782 
783 	RIB_WLOCK(rnh);
784 
785 	rt_orig = lookup_prefix_rt(rnh, rt, &rnd_orig);
786 
787 	if (rt_orig == NULL) {
788 		if (op_flags & RTM_F_CREATE)
789 			error = add_route(rnh, rt, rnd_add, rc);
790 		else
791 			error = ESRCH; /* no entry but creation was not required */
792 		RIB_WUNLOCK(rnh);
793 		if (error != 0)
794 			goto out;
795 		return (0);
796 	}
797 
798 	if (op_flags & RTM_F_EXCL) {
799 		/* We have existing route in the RIB but not allowed to replace. */
800 		RIB_WUNLOCK(rnh);
801 		error = EEXIST;
802 		goto out;
803 	}
804 
805 	/* Now either append or replace */
806 	if (op_flags & RTM_F_REPLACE) {
807 		if (nhop_get_prio(rnd_orig.rnd_nhop) > nhop_get_prio(rnd_add->rnd_nhop)) {
808 			/* Old path is "better" (e.g. has PINNED flag set) */
809 			RIB_WUNLOCK(rnh);
810 			error = EEXIST;
811 			goto out;
812 		}
813 		change_route(rnh, rt_orig, rnd_add, rc);
814 		RIB_WUNLOCK(rnh);
815 		nh = rc->rc_nh_old;
816 		goto out;
817 	}
818 
819 	RIB_WUNLOCK(rnh);
820 
821 #ifdef ROUTE_MPATH
822 	if ((op_flags & RTM_F_APPEND) && rib_can_multipath(rnh) &&
823 	    nhop_can_multipath(rnd_add->rnd_nhop) &&
824 	    nhop_can_multipath(rnd_orig.rnd_nhop)) {
825 
826 		for (int i = 0; i < RIB_MAX_RETRIES; i++) {
827 			error = add_route_flags_mpath(rnh, rt_orig, rnd_add, &rnd_orig,
828 			    op_flags, rc);
829 			if (error != EAGAIN)
830 				break;
831 			RTSTAT_INC(rts_add_retry);
832 		}
833 
834 		/*
835 		 *  Original nhop reference is unused in any case.
836 		 */
837 		nhop_free_any(rnd_add->rnd_nhop);
838 		if (op_flags & RTM_F_CREATE) {
839 			if (error != 0 || rc->rc_cmd != RTM_ADD)
840 				rt_free_immediate(rt);
841 		}
842 		return (error);
843 	}
844 #endif
845 	/* Out of options - free state and return error */
846 	error = EEXIST;
847 out:
848 	if (op_flags & RTM_F_CREATE)
849 		rt_free_immediate(rt);
850 	nhop_free_any(nh);
851 
852 	return (error);
853 }
854 
855 #ifdef ROUTE_MPATH
856 static int
857 add_route_flags_mpath(struct rib_head *rnh, struct rtentry *rt,
858     struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_orig,
859     int op_flags, struct rib_cmd_info *rc)
860 {
861 	RIB_RLOCK_TRACKER;
862 	struct route_nhop_data rnd_new;
863 	int error = 0;
864 
865 	error = nhgrp_get_addition_group(rnh, rnd_orig, rnd_add, &rnd_new);
866 	if (error != 0) {
867 		if (error == EAGAIN) {
868 			/*
869 			 * Group creation failed, most probably because
870 			 * @rnd_orig data got scheduled for deletion.
871 			 * Refresh @rnd_orig data and retry.
872 			 */
873 			RIB_RLOCK(rnh);
874 			lookup_prefix_rt(rnh, rt, rnd_orig);
875 			RIB_RUNLOCK(rnh);
876 			if (rnd_orig == NULL && !(op_flags & RTM_F_CREATE)) {
877 				/* In this iteration route doesn't exist */
878 				error = ENOENT;
879 			}
880 		}
881 		return (error);
882 	}
883 	error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
884 	if (error != 0)
885 		return (error);
886 
887 	if (V_fib_hash_outbound == 0 && NH_IS_NHGRP(rc->rc_nh_new)) {
888 		/*
889 		 * First multipath route got installed. Enable local
890 		 * outbound connections hashing.
891 		 */
892 		if (bootverbose)
893 			printf("FIB: enabled flowid calculation for locally-originated packets\n");
894 		V_fib_hash_outbound = 1;
895 	}
896 
897 	return (0);
898 }
899 #endif
900 
901 /*
902  * Removes route defined by @info from the kernel table specified by @fibnum and
903  * sa_family in @info->rti_info[RTAX_DST].
904  *
905  * Returns 0 on success and fills in operation metadata into @rc.
906  */
907 int
908 rib_del_route(uint32_t fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc)
909 {
910 	struct rib_head *rnh;
911 	struct sockaddr *dst, *netmask;
912 	struct sockaddr_storage mdst;
913 	int error;
914 
915 	NET_EPOCH_ASSERT();
916 
917 	rnh = get_rnh(fibnum, info);
918 	if (rnh == NULL)
919 		return (EAFNOSUPPORT);
920 
921 	bzero(rc, sizeof(struct rib_cmd_info));
922 	rc->rc_cmd = RTM_DELETE;
923 
924 	dst = info->rti_info[RTAX_DST];
925 	netmask = info->rti_info[RTAX_NETMASK];
926 
927 	if (netmask != NULL) {
928 		/* Ensure @dst is always properly masked */
929 		if (dst->sa_len > sizeof(mdst)) {
930 			FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too large");
931 			return (EINVAL);
932 		}
933 		rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask);
934 		dst = (struct sockaddr *)&mdst;
935 	}
936 
937 	rib_filter_f_t *filter_func = NULL;
938 	void *filter_arg = NULL;
939 	struct gw_filter_data gwd = { .gw = info->rti_info[RTAX_GATEWAY] };
940 
941 	if (info->rti_filter != NULL) {
942 		filter_func = info->rti_filter;
943 		filter_arg = info->rti_filterdata;
944 	} else if (gwd.gw != NULL) {
945 		filter_func = match_gw_one;
946 		filter_arg = &gwd;
947 	}
948 
949 	int prio = get_prio_from_info(info);
950 
951 	RIB_WLOCK(rnh);
952 	struct route_nhop_data rnd;
953 	struct rtentry *rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd);
954 	if (rt != NULL) {
955 		error = rt_delete_conditional(rnh, rt, prio, filter_func,
956 		    filter_arg, rc);
957 	} else
958 		error = ESRCH;
959 	RIB_WUNLOCK(rnh);
960 
961 	if (error != 0)
962 		return (error);
963 
964 	rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
965 
966 	if (rc->rc_cmd == RTM_DELETE)
967 		rt_free(rc->rc_rt);
968 #ifdef ROUTE_MPATH
969 	else {
970 		/*
971 		 * Deleting 1 path may result in RTM_CHANGE to
972 		 * a different mpath group/nhop.
973 		 * Free old mpath group.
974 		 */
975 		nhop_free_any(rc->rc_nh_old);
976 	}
977 #endif
978 
979 	return (0);
980 }
981 
982 /*
983  * Conditionally unlinks rtentry paths from @rnh matching @cb.
984  * Returns 0 on success with operation result stored in @rc.
985  * On error, returns:
986  * ESRCH - if prefix was not found or filter function failed to match
987  * EADDRINUSE - if trying to delete higher priority route.
988  */
989 static int
990 rt_delete_conditional(struct rib_head *rnh, struct rtentry *rt,
991     int prio, rib_filter_f_t *cb, void *cbdata, struct rib_cmd_info *rc)
992 {
993 	struct nhop_object *nh = rt->rt_nhop;
994 
995 #ifdef ROUTE_MPATH
996 	if (NH_IS_NHGRP(nh)) {
997 		struct nhgrp_object *nhg = (struct nhgrp_object *)nh;
998 		struct route_nhop_data rnd;
999 		int error;
1000 
1001 		if (cb == NULL)
1002 			return (ESRCH);
1003 		error = nhgrp_get_filtered_group(rnh, rt, nhg, cb, cbdata, &rnd);
1004 		if (error == 0) {
1005 			if (rnd.rnd_nhgrp == nhg) {
1006 				/* No match, unreference new group and return. */
1007 				nhop_free_any(rnd.rnd_nhop);
1008 				return (ESRCH);
1009 			}
1010 			error = change_route(rnh, rt, &rnd, rc);
1011 		}
1012 		return (error);
1013 	}
1014 #endif
1015 	if (cb != NULL && !cb(rt, nh, cbdata))
1016 		return (ESRCH);
1017 
1018 	if (prio < nhop_get_prio(nh))
1019 		return (EADDRINUSE);
1020 
1021 	return (delete_route(rnh, rt, rc));
1022 }
1023 
1024 int
1025 rib_change_route(uint32_t fibnum, struct rt_addrinfo *info,
1026     struct rib_cmd_info *rc)
1027 {
1028 	RIB_RLOCK_TRACKER;
1029 	struct route_nhop_data rnd_orig;
1030 	struct rib_head *rnh;
1031 	struct rtentry *rt;
1032 	int error;
1033 
1034 	NET_EPOCH_ASSERT();
1035 
1036 	rnh = get_rnh(fibnum, info);
1037 	if (rnh == NULL)
1038 		return (EAFNOSUPPORT);
1039 
1040 	bzero(rc, sizeof(struct rib_cmd_info));
1041 	rc->rc_cmd = RTM_CHANGE;
1042 
1043 	/* Check if updated gateway exists */
1044 	if ((info->rti_flags & RTF_GATEWAY) &&
1045 	    (info->rti_info[RTAX_GATEWAY] == NULL)) {
1046 
1047 		/*
1048 		 * route(8) adds RTF_GATEWAY flag if -interface is not set.
1049 		 * Remove RTF_GATEWAY to enforce consistency and maintain
1050 		 * compatibility..
1051 		 */
1052 		info->rti_flags &= ~RTF_GATEWAY;
1053 	}
1054 
1055 	/*
1056 	 * route change is done in multiple steps, with dropping and
1057 	 * reacquiring lock. In the situations with multiple processes
1058 	 * changes the same route in can lead to the case when route
1059 	 * is changed between the steps. Address it by retrying the operation
1060 	 * multiple times before failing.
1061 	 */
1062 
1063 	RIB_RLOCK(rnh);
1064 	rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST],
1065 	    info->rti_info[RTAX_NETMASK], &rnh->head);
1066 
1067 	if (rt == NULL) {
1068 		RIB_RUNLOCK(rnh);
1069 		return (ESRCH);
1070 	}
1071 
1072 	rnd_orig.rnd_nhop = rt->rt_nhop;
1073 	rnd_orig.rnd_weight = rt->rt_weight;
1074 
1075 	RIB_RUNLOCK(rnh);
1076 
1077 	for (int i = 0; i < RIB_MAX_RETRIES; i++) {
1078 		error = change_route_byinfo(rnh, rt, info, &rnd_orig, rc);
1079 		if (error != EAGAIN)
1080 			break;
1081 	}
1082 
1083 	return (error);
1084 }
1085 
1086 static int
1087 change_nhop(struct rib_head *rnh, struct rt_addrinfo *info,
1088     struct nhop_object *nh_orig, struct nhop_object **nh_new)
1089 {
1090 	int error;
1091 
1092 	/*
1093 	 * New gateway could require new ifaddr, ifp;
1094 	 * flags may also be different; ifp may be specified
1095 	 * by ll sockaddr when protocol address is ambiguous
1096 	 */
1097 	if (((nh_orig->nh_flags & NHF_GATEWAY) &&
1098 	    info->rti_info[RTAX_GATEWAY] != NULL) ||
1099 	    info->rti_info[RTAX_IFP] != NULL ||
1100 	    (info->rti_info[RTAX_IFA] != NULL &&
1101 	     !sa_equal(info->rti_info[RTAX_IFA], nh_orig->nh_ifa->ifa_addr))) {
1102 		error = rt_getifa_fib(info, rnh->rib_fibnum);
1103 
1104 		if (error != 0) {
1105 			info->rti_ifa = NULL;
1106 			return (error);
1107 		}
1108 	}
1109 
1110 	error = nhop_create_from_nhop(rnh, nh_orig, info, nh_new);
1111 	info->rti_ifa = NULL;
1112 
1113 	return (error);
1114 }
1115 
#ifdef ROUTE_MPATH
/*
 * Changes a single path of the multipath route @rt.
 *
 * The first nexthop in the group @rnd_orig->rnd_nhgrp that matches
 * @info (per check_info_match_nhop()) is replaced by a nexthop built
 * with change_nhop().  A new group is requested for the resulting
 * set, and the route is switched to it via change_route_conditional().
 *
 * Returns ESRCH if no path matches, EAGAIN if the temporary array
 * cannot be allocated, or the error of the failing helper.
 */
static int
change_mpath_route(struct rib_head *rnh, struct rtentry *rt,
    struct rt_addrinfo *info, struct route_nhop_data *rnd_orig,
    struct rib_cmd_info *rc)
{
	struct route_nhop_data rnd_new = {};
	struct nhop_object *nh_orig = NULL, *nh_new;
	const struct weightened_nhop *wn;
	struct weightened_nhop *wn_new;
	uint32_t num_nhops, found_idx = 0;
	int error;

	/* Find the path the request refers to. */
	wn = nhgrp_get_nhops(rnd_orig->rnd_nhgrp, &num_nhops);
	for (uint32_t i = 0; i < num_nhops; i++) {
		if (check_info_match_nhop(info, NULL, wn[i].nh) != 0)
			continue;
		nh_orig = wn[i].nh;
		found_idx = i;
		break;
	}
	if (nh_orig == NULL)
		return (ESRCH);

	/* Build the replacement nexthop for that path. */
	error = change_nhop(rnh, info, nh_orig, &nh_new);
	if (error != 0)
		return (error);

	/* Copy the path list, substituting the changed entry. */
	wn_new = mallocarray(num_nhops, sizeof(*wn_new), M_TEMP,
	    M_NOWAIT | M_ZERO);
	if (wn_new == NULL) {
		nhop_free(nh_new);
		return (EAGAIN);
	}
	memcpy(wn_new, wn, num_nhops * sizeof(*wn_new));
	wn_new[found_idx].nh = nh_new;
	wn_new[found_idx].weight = get_info_weight(info, wn[found_idx].weight);

	/*
	 * Request a group for the new path set.  The local nexthop
	 * reference and the scratch array are released whether or not
	 * that succeeds.
	 */
	error = nhgrp_get_group(rnh, wn_new, num_nhops, 0, &rnd_new.rnd_nhgrp);
	nhop_free(nh_new);
	free(wn_new, M_TEMP);
	if (error != 0)
		return (error);

	return (change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc));
}
#endif
1168 
1169 static int
1170 change_route_byinfo(struct rib_head *rnh, struct rtentry *rt,
1171     struct rt_addrinfo *info, struct route_nhop_data *rnd_orig,
1172     struct rib_cmd_info *rc)
1173 {
1174 	int error = 0;
1175 	struct nhop_object *nh_orig;
1176 	struct route_nhop_data rnd_new;
1177 
1178 	nh_orig = rnd_orig->rnd_nhop;
1179 	if (nh_orig == NULL)
1180 		return (ESRCH);
1181 
1182 #ifdef ROUTE_MPATH
1183 	if (NH_IS_NHGRP(nh_orig))
1184 		return (change_mpath_route(rnh, rt, info, rnd_orig, rc));
1185 #endif
1186 
1187 	rnd_new.rnd_weight = get_info_weight(info, rnd_orig->rnd_weight);
1188 	error = change_nhop(rnh, info, nh_orig, &rnd_new.rnd_nhop);
1189 	if (error != 0)
1190 		return (error);
1191 	error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
1192 
1193 	return (error);
1194 }
1195 
1196 /*
1197  * Insert @rt with nhop data from @rnd_new to @rnh.
1198  * Returns 0 on success and stores operation results in @rc.
1199  */
1200 static int
1201 add_route(struct rib_head *rnh, struct rtentry *rt,
1202     struct route_nhop_data *rnd, struct rib_cmd_info *rc)
1203 {
1204 	struct radix_node *rn;
1205 
1206 	RIB_WLOCK_ASSERT(rnh);
1207 
1208 	rt->rt_nhop = rnd->rnd_nhop;
1209 	rt->rt_weight = rnd->rnd_weight;
1210 	rn = rnh->rnh_addaddr(rt_key(rt), rt_mask_const(rt), &rnh->head, rt->rt_nodes);
1211 
1212 	if (rn != NULL) {
1213 		if (!NH_IS_NHGRP(rnd->rnd_nhop) && nhop_get_expire(rnd->rnd_nhop))
1214 			tmproutes_update(rnh, rt, rnd->rnd_nhop);
1215 
1216 		/* Finalize notification */
1217 		rib_bump_gen(rnh);
1218 		rnh->rnh_prefixes++;
1219 
1220 		rc->rc_cmd = RTM_ADD;
1221 		rc->rc_rt = rt;
1222 		rc->rc_nh_old = NULL;
1223 		rc->rc_nh_new = rnd->rnd_nhop;
1224 		rc->rc_nh_weight = rnd->rnd_weight;
1225 
1226 		rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1227 		return (0);
1228 	}
1229 
1230 	/* Existing route or memory allocation failure. */
1231 	return (EEXIST);
1232 }
1233 
1234 /*
1235  * Unconditionally deletes @rt from @rnh.
1236  */
1237 static int
1238 delete_route(struct rib_head *rnh, struct rtentry *rt, struct rib_cmd_info *rc)
1239 {
1240 	RIB_WLOCK_ASSERT(rnh);
1241 
1242 	/* Route deletion requested. */
1243 	struct radix_node *rn;
1244 
1245 	rn = rnh->rnh_deladdr(rt_key_const(rt), rt_mask_const(rt), &rnh->head);
1246 	if (rn == NULL)
1247 		return (ESRCH);
1248 	rt = RNTORT(rn);
1249 	rt->rte_flags &= ~RTF_UP;
1250 
1251 	rib_bump_gen(rnh);
1252 	rnh->rnh_prefixes--;
1253 
1254 	rc->rc_cmd = RTM_DELETE;
1255 	rc->rc_rt = rt;
1256 	rc->rc_nh_old = rt->rt_nhop;
1257 	rc->rc_nh_new = NULL;
1258 	rc->rc_nh_weight = rt->rt_weight;
1259 
1260 	rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1261 
1262 	return (0);
1263 }
1264 
1265 /*
1266  * Switch @rt nhop/weigh to the ones specified in @rnd.
1267  * Returns 0 on success.
1268  */
1269 int
1270 change_route(struct rib_head *rnh, struct rtentry *rt,
1271     struct route_nhop_data *rnd, struct rib_cmd_info *rc)
1272 {
1273 	struct nhop_object *nh_orig;
1274 
1275 	RIB_WLOCK_ASSERT(rnh);
1276 
1277 	nh_orig = rt->rt_nhop;
1278 
1279 	if (rnd->rnd_nhop == NULL)
1280 		return (delete_route(rnh, rt, rc));
1281 
1282 	/* Changing nexthop & weight to a new one */
1283 	rt->rt_nhop = rnd->rnd_nhop;
1284 	rt->rt_weight = rnd->rnd_weight;
1285 	if (!NH_IS_NHGRP(rnd->rnd_nhop) && nhop_get_expire(rnd->rnd_nhop))
1286 		tmproutes_update(rnh, rt, rnd->rnd_nhop);
1287 
1288 	/* Finalize notification */
1289 	rib_bump_gen(rnh);
1290 	rc->rc_cmd = RTM_CHANGE;
1291 	rc->rc_rt = rt;
1292 	rc->rc_nh_old = nh_orig;
1293 	rc->rc_nh_new = rnd->rnd_nhop;
1294 	rc->rc_nh_weight = rnd->rnd_weight;
1295 
1296 	rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1297 
1298 	return (0);
1299 }
1300 
1301 /*
1302  * Conditionally update route nhop/weight IFF data in @nhd_orig is
1303  *  consistent with the current route data.
1304  * Nexthop in @nhd_new is consumed.
1305  */
1306 int
1307 change_route_conditional(struct rib_head *rnh, struct rtentry *rt,
1308     struct route_nhop_data *rnd_orig, struct route_nhop_data *rnd_new,
1309     struct rib_cmd_info *rc)
1310 {
1311 	struct rtentry *rt_new;
1312 	int error = 0;
1313 
1314 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
1315 		char buf_old[NHOP_PRINT_BUFSIZE], buf_new[NHOP_PRINT_BUFSIZE];
1316 		nhop_print_buf_any(rnd_orig->rnd_nhop, buf_old, NHOP_PRINT_BUFSIZE);
1317 		nhop_print_buf_any(rnd_new->rnd_nhop, buf_new, NHOP_PRINT_BUFSIZE);
1318 		FIB_LOG(LOG_DEBUG2, rnh->rib_fibnum, rnh->rib_family,
1319 		    "trying change %s -> %s", buf_old, buf_new);
1320 	}
1321 	RIB_WLOCK(rnh);
1322 
1323 	struct route_nhop_data rnd;
1324 	rt_new = lookup_prefix_rt(rnh, rt, &rnd);
1325 
1326 	if (rt_new == NULL) {
1327 		if (rnd_orig->rnd_nhop == NULL)
1328 			error = add_route(rnh, rt, rnd_new, rc);
1329 		else {
1330 			/*
1331 			 * Prefix does not exist, which was not our assumption.
1332 			 * Update @rnd_orig with the new data and return
1333 			 */
1334 			rnd_orig->rnd_nhop = NULL;
1335 			rnd_orig->rnd_weight = 0;
1336 			error = EAGAIN;
1337 		}
1338 	} else {
1339 		/* Prefix exists, try to update */
1340 		if (rnd_orig->rnd_nhop == rt_new->rt_nhop) {
1341 			/*
1342 			 * Nhop/mpath group hasn't changed. Flip
1343 			 * to the new precalculated one and return
1344 			 */
1345 			error = change_route(rnh, rt_new, rnd_new, rc);
1346 		} else {
1347 			/* Update and retry */
1348 			rnd_orig->rnd_nhop = rt_new->rt_nhop;
1349 			rnd_orig->rnd_weight = rt_new->rt_weight;
1350 			error = EAGAIN;
1351 		}
1352 	}
1353 
1354 	RIB_WUNLOCK(rnh);
1355 
1356 	if (error == 0) {
1357 		rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
1358 
1359 		if (rnd_orig->rnd_nhop != NULL)
1360 			nhop_free_any(rnd_orig->rnd_nhop);
1361 
1362 	} else {
1363 		if (rnd_new->rnd_nhop != NULL)
1364 			nhop_free_any(rnd_new->rnd_nhop);
1365 	}
1366 
1367 	return (error);
1368 }
1369 
1370 /*
1371  * Performs modification of routing table specificed by @action.
1372  * Table is specified by @fibnum and sa_family in @info->rti_info[RTAX_DST].
1373  * Needs to be run in network epoch.
1374  *
1375  * Returns 0 on success and fills in @rc with action result.
1376  */
1377 int
1378 rib_action(uint32_t fibnum, int action, struct rt_addrinfo *info,
1379     struct rib_cmd_info *rc)
1380 {
1381 	int error;
1382 
1383 	switch (action) {
1384 	case RTM_ADD:
1385 		error = rib_add_route(fibnum, info, rc);
1386 		break;
1387 	case RTM_DELETE:
1388 		error = rib_del_route(fibnum, info, rc);
1389 		break;
1390 	case RTM_CHANGE:
1391 		error = rib_change_route(fibnum, info, rc);
1392 		break;
1393 	default:
1394 		error = ENOTSUP;
1395 	}
1396 
1397 	return (error);
1398 }
1399 
1400 struct rt_delinfo
1401 {
1402 	struct rib_head *rnh;
1403 	struct rtentry *head;
1404 	rib_filter_f_t *filter_f;
1405 	void *filter_arg;
1406 	int prio;
1407 	struct rib_cmd_info rc;
1408 };
1409 
1410 /*
1411  * Conditionally unlinks rtenties or paths from radix tree based
1412  * on the callback data passed in @arg.
1413  */
1414 static int
1415 rt_checkdelroute(struct radix_node *rn, void *arg)
1416 {
1417 	struct rt_delinfo *di = (struct rt_delinfo *)arg;
1418 	struct rtentry *rt = (struct rtentry *)rn;
1419 
1420 	if (rt_delete_conditional(di->rnh, rt, di->prio,
1421 	    di->filter_f, di->filter_arg, &di->rc) != 0)
1422 		return (0);
1423 
1424 	/*
1425 	 * Add deleted rtentries to the list to GC them
1426 	 *  after dropping the lock.
1427 	 *
1428 	 * XXX: Delayed notifications not implemented
1429 	 *  for nexthop updates.
1430 	 */
1431 	if (di->rc.rc_cmd == RTM_DELETE) {
1432 		/* Add to the list and return */
1433 		rt->rt_chain = di->head;
1434 		di->head = rt;
1435 #ifdef ROUTE_MPATH
1436 	} else {
1437 		/*
1438 		 * RTM_CHANGE to a different nexthop or nexthop group.
1439 		 * Free old multipath group.
1440 		 */
1441 		nhop_free_any(di->rc.rc_nh_old);
1442 #endif
1443 	}
1444 
1445 	return (0);
1446 }
1447 
1448 /*
1449  * Iterates over a routing table specified by @fibnum and @family and
1450  *  deletes elements marked by @filter_f.
1451  * @fibnum: rtable id
1452  * @family: AF_ address family
1453  * @filter_f: function returning non-zero value for items to delete
1454  * @arg: data to pass to the @filter_f function
1455  * @report: true if rtsock notification is needed.
1456  */
1457 void
1458 rib_walk_del(u_int fibnum, int family, rib_filter_f_t *filter_f, void *filter_arg,
1459     bool report)
1460 {
1461 	struct rib_head *rnh;
1462 	struct rtentry *rt;
1463 	struct nhop_object *nh;
1464 	struct epoch_tracker et;
1465 
1466 	rnh = rt_tables_get_rnh(fibnum, family);
1467 	if (rnh == NULL)
1468 		return;
1469 
1470 	struct rt_delinfo di = {
1471 		.rnh = rnh,
1472 		.filter_f = filter_f,
1473 		.filter_arg = filter_arg,
1474 		.prio = NH_PRIORITY_NORMAL,
1475 	};
1476 
1477 	NET_EPOCH_ENTER(et);
1478 
1479 	RIB_WLOCK(rnh);
1480 	rnh->rnh_walktree(&rnh->head, rt_checkdelroute, &di);
1481 	RIB_WUNLOCK(rnh);
1482 
1483 	/* We might have something to reclaim. */
1484 	bzero(&di.rc, sizeof(di.rc));
1485 	di.rc.rc_cmd = RTM_DELETE;
1486 	while (di.head != NULL) {
1487 		rt = di.head;
1488 		di.head = rt->rt_chain;
1489 		rt->rt_chain = NULL;
1490 		nh = rt->rt_nhop;
1491 
1492 		di.rc.rc_rt = rt;
1493 		di.rc.rc_nh_old = nh;
1494 		rib_notify(rnh, RIB_NOTIFY_DELAYED, &di.rc);
1495 
1496 		if (report) {
1497 #ifdef ROUTE_MPATH
1498 			struct nhgrp_object *nhg;
1499 			const struct weightened_nhop *wn;
1500 			uint32_t num_nhops;
1501 			if (NH_IS_NHGRP(nh)) {
1502 				nhg = (struct nhgrp_object *)nh;
1503 				wn = nhgrp_get_nhops(nhg, &num_nhops);
1504 				for (int i = 0; i < num_nhops; i++)
1505 					rt_routemsg(RTM_DELETE, rt, wn[i].nh, fibnum);
1506 			} else
1507 #endif
1508 			rt_routemsg(RTM_DELETE, rt, nh, fibnum);
1509 		}
1510 		rt_free(rt);
1511 	}
1512 
1513 	NET_EPOCH_EXIT(et);
1514 }
1515 
1516 static int
1517 rt_delete_unconditional(struct radix_node *rn, void *arg)
1518 {
1519 	struct rtentry *rt = RNTORT(rn);
1520 	struct rib_head *rnh = (struct rib_head *)arg;
1521 
1522 	rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), &rnh->head);
1523 	if (RNTORT(rn) == rt)
1524 		rt_free(rt);
1525 
1526 	return (0);
1527 }
1528 
1529 /*
1530  * Removes all routes from the routing table without executing notifications.
1531  * rtentres will be removed after the end of a current epoch.
1532  */
1533 static void
1534 rib_flush_routes(struct rib_head *rnh)
1535 {
1536 	RIB_WLOCK(rnh);
1537 	rnh->rnh_walktree(&rnh->head, rt_delete_unconditional, rnh);
1538 	RIB_WUNLOCK(rnh);
1539 }
1540 
1541 void
1542 rib_flush_routes_family(int family)
1543 {
1544 	struct rib_head *rnh;
1545 
1546 	for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) {
1547 		if ((rnh = rt_tables_get_rnh(fibnum, family)) != NULL)
1548 			rib_flush_routes(rnh);
1549 	}
1550 }
1551 
1552 const char *
1553 rib_print_family(int family)
1554 {
1555 	switch (family) {
1556 	case AF_INET:
1557 		return ("inet");
1558 	case AF_INET6:
1559 		return ("inet6");
1560 	case AF_LINK:
1561 		return ("link");
1562 	}
1563 	return ("unknown");
1564 }
1565 
1566