xref: /freebsd/sys/net/route/route_ctl.c (revision a64729f5077d77e13b9497cb33ecb3c82e606ee8)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2020 Alexander V. Chernikov
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 #include "opt_inet.h"
30 #include "opt_inet6.h"
31 #include "opt_route.h"
32 
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/malloc.h>
36 #include <sys/mbuf.h>
37 #include <sys/socket.h>
38 #include <sys/sysctl.h>
39 #include <sys/syslog.h>
40 #include <sys/kernel.h>
41 #include <sys/lock.h>
42 #include <sys/rmlock.h>
43 
44 #include <net/if.h>
45 #include <net/if_var.h>
46 #include <net/if_private.h>
47 #include <net/if_dl.h>
48 #include <net/vnet.h>
49 #include <net/route.h>
50 #include <net/route/route_ctl.h>
51 #include <net/route/route_var.h>
52 #include <net/route/nhop_utils.h>
53 #include <net/route/nhop.h>
54 #include <net/route/nhop_var.h>
55 #include <netinet/in.h>
56 #include <netinet6/scope6_var.h>
57 #include <netinet6/in6_var.h>
58 
59 #define	DEBUG_MOD_NAME	route_ctl
60 #define	DEBUG_MAX_LEVEL	LOG_DEBUG
61 #include <net/route/route_debug.h>
62 _DECLARE_DEBUG(LOG_INFO);
63 
64 /*
65  * This file contains control plane routing table functions.
66  *
67  * All functions assume they are called in the net epoch.
68  */
69 
/*
 * Stack storage for the sockaddrs built in this file (prefix masks).
 * All members overlay the same memory.
 */
union sockaddr_union {
	struct sockaddr		sa;		/* generic view */
	struct sockaddr_in	sin;		/* IPv4 view */
	struct sockaddr_in6	sin6;		/* IPv6 view */
	char			_buf[32];	/* keeps the union at least 32 bytes */
};
76 
77 static int add_route_byinfo(struct rib_head *rnh, struct rt_addrinfo *info,
78     struct rib_cmd_info *rc);
79 static int change_route_byinfo(struct rib_head *rnh, struct rtentry *rt,
80     struct rt_addrinfo *info, struct route_nhop_data *nhd_orig,
81     struct rib_cmd_info *rc);
82 
83 static int add_route_flags(struct rib_head *rnh, struct rtentry *rt,
84     struct route_nhop_data *rnd_add, int op_flags, struct rib_cmd_info *rc);
85 #ifdef ROUTE_MPATH
86 static int add_route_flags_mpath(struct rib_head *rnh, struct rtentry *rt,
87     struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_orig,
88     int op_flags, struct rib_cmd_info *rc);
89 #endif
90 
91 static int add_route(struct rib_head *rnh, struct rtentry *rt,
92     struct route_nhop_data *rnd, struct rib_cmd_info *rc);
93 static int delete_route(struct rib_head *rnh, struct rtentry *rt,
94     struct rib_cmd_info *rc);
95 static int rt_delete_conditional(struct rib_head *rnh, struct rtentry *rt,
96     int prio, rib_filter_f_t *cb, void *cbdata, struct rib_cmd_info *rc);
97 
98 static bool fill_pxmask_family(int family, int plen, struct sockaddr *_dst,
99     struct sockaddr **pmask);
100 static int get_prio_from_info(const struct rt_addrinfo *info);
101 static int nhop_get_prio(const struct nhop_object *nh);
102 
103 #ifdef ROUTE_MPATH
104 static bool rib_can_multipath(struct rib_head *rh);
105 #endif
106 
107 /* Per-vnet multipath routing configuration */
108 SYSCTL_DECL(_net_route);
109 #define	V_rib_route_multipath	VNET(rib_route_multipath)
110 #ifdef ROUTE_MPATH
111 #define _MP_FLAGS	CTLFLAG_RW
112 #else
113 #define _MP_FLAGS	CTLFLAG_RD
114 #endif
115 VNET_DEFINE(u_int, rib_route_multipath) = 1;
116 SYSCTL_UINT(_net_route, OID_AUTO, multipath, _MP_FLAGS | CTLFLAG_VNET,
117     &VNET_NAME(rib_route_multipath), 0, "Enable route multipath");
118 #undef _MP_FLAGS
119 
#ifdef ROUTE_MPATH
/*
 * Per-vnet flag, exported read-only through the "hash_outbound" sysctl
 * under _net_route.  Starts out as 0.
 */
VNET_DEFINE(u_int, fib_hash_outbound) = 0;
SYSCTL_UINT(_net_route, OID_AUTO, hash_outbound,
    CTLFLAG_RD | CTLFLAG_VNET,
    &VNET_NAME(fib_hash_outbound), 0,
    "Compute flowid for locally-originated packets");

/*
 * Default entropy to add to the hash calculation for the outbound
 * connections.
 */
uint8_t mpath_entropy_key[MPATH_ENTROPY_KEY_LEN] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
};
#endif
135 
#if defined(INET) && defined(INET6)
/*
 * IPv4 routes via IPv6 nexthops (advertised as the ipv4_rfc5549_support
 * feature).  Controlled per-vnet by the "ipv6_nexthop" sysctl under
 * _net_route; enabled by default.
 */
FEATURE(ipv4_rfc5549_support, "Route IPv4 packets via IPv6 nexthops");
#define	V_rib_route_ipv6_nexthop	VNET(rib_route_ipv6_nexthop)
VNET_DEFINE_STATIC(u_int, rib_route_ipv6_nexthop) = 1;
SYSCTL_UINT(_net_route, OID_AUTO, ipv6_nexthop,
    CTLFLAG_RW | CTLFLAG_VNET,
    &VNET_NAME(rib_route_ipv6_nexthop), 0,
    "Enable IPv4 route via IPv6 Next Hop address");
#endif
143 
144 /* Debug bits */
145 SYSCTL_NODE(_net_route, OID_AUTO, debug, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
146 
147 static struct rib_head *
148 get_rnh(uint32_t fibnum, const struct rt_addrinfo *info)
149 {
150 	struct rib_head *rnh;
151 	struct sockaddr *dst;
152 
153 	KASSERT((fibnum < rt_numfibs), ("rib_add_route: bad fibnum"));
154 
155 	dst = info->rti_info[RTAX_DST];
156 	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
157 
158 	return (rnh);
159 }
160 
#if defined(INET) && defined(INET6)
/*
 * Returns true if the current vnet has the ipv6_nexthop knob
 * (V_rib_route_ipv6_nexthop) set to a non-zero value.
 */
bool
rib_can_4o6_nhop(void)
{

	return (V_rib_route_ipv6_nexthop != 0);
}
#endif
168 
169 #ifdef ROUTE_MPATH
170 static bool
171 rib_can_multipath(struct rib_head *rh)
172 {
173 	int result;
174 
175 	CURVNET_SET(rh->rib_vnet);
176 	result = !!V_rib_route_multipath;
177 	CURVNET_RESTORE();
178 
179 	return (result);
180 }
181 
182 /*
183  * Check is nhop is multipath-eligible.
184  * Avoid nhops without gateways and redirects.
185  *
186  * Returns 1 for multipath-eligible nexthop,
187  * 0 otherwise.
188  */
189 bool
190 nhop_can_multipath(const struct nhop_object *nh)
191 {
192 
193 	if ((nh->nh_flags & NHF_MULTIPATH) != 0)
194 		return (1);
195 	if ((nh->nh_flags & NHF_GATEWAY) == 0)
196 		return (0);
197 	if ((nh->nh_flags & NHF_REDIRECT) != 0)
198 		return (0);
199 
200 	return (1);
201 }
202 #endif
203 
204 static int
205 get_info_weight(const struct rt_addrinfo *info, uint32_t default_weight)
206 {
207 	uint32_t weight;
208 
209 	if (info->rti_mflags & RTV_WEIGHT)
210 		weight = info->rti_rmx->rmx_weight;
211 	else
212 		weight = default_weight;
213 	/* Keep upper 1 byte for adm distance purposes */
214 	if (weight > RT_MAX_WEIGHT)
215 		weight = RT_MAX_WEIGHT;
216 	else if (weight == 0)
217 		weight = default_weight;
218 
219 	return (weight);
220 }
221 
222 /*
223  * File-local concept for distingushing between the normal and
224  * RTF_PINNED routes tha can override the "normal" one.
225  */
226 #define	NH_PRIORITY_HIGH	2
227 #define	NH_PRIORITY_NORMAL	1
228 static int
229 get_prio_from_info(const struct rt_addrinfo *info)
230 {
231 	if (info->rti_flags & RTF_PINNED)
232 		return (NH_PRIORITY_HIGH);
233 	return (NH_PRIORITY_NORMAL);
234 }
235 
236 static int
237 nhop_get_prio(const struct nhop_object *nh)
238 {
239 	if (NH_IS_PINNED(nh))
240 		return (NH_PRIORITY_HIGH);
241 	return (NH_PRIORITY_NORMAL);
242 }
243 
244 /*
245  * Check if specified @gw matches gw data in the nexthop @nh.
246  *
247  * Returns true if matches, false otherwise.
248  */
249 bool
250 match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw)
251 {
252 
253 	if (nh->gw_sa.sa_family != gw->sa_family)
254 		return (false);
255 
256 	switch (gw->sa_family) {
257 	case AF_INET:
258 		return (nh->gw4_sa.sin_addr.s_addr ==
259 		    ((const struct sockaddr_in *)gw)->sin_addr.s_addr);
260 	case AF_INET6:
261 		{
262 			const struct sockaddr_in6 *gw6;
263 			gw6 = (const struct sockaddr_in6 *)gw;
264 
265 			/*
266 			 * Currently (2020-09) IPv6 gws in kernel have their
267 			 * scope embedded. Once this becomes false, this code
268 			 * has to be revisited.
269 			 */
270 			if (IN6_ARE_ADDR_EQUAL(&nh->gw6_sa.sin6_addr,
271 			    &gw6->sin6_addr))
272 				return (true);
273 			return (false);
274 		}
275 	case AF_LINK:
276 		{
277 			const struct sockaddr_dl *sdl;
278 			sdl = (const struct sockaddr_dl *)gw;
279 			return (nh->gwl_sa.sdl_index == sdl->sdl_index);
280 		}
281 	default:
282 		return (memcmp(&nh->gw_sa, gw, nh->gw_sa.sa_len) == 0);
283 	}
284 
285 	/* NOTREACHED */
286 	return (false);
287 }
288 
/*
 * rib_filter_f style callback matching every nexthop whose gateway
 * equals the sockaddr passed in @gw_sa.  @rt is not examined.
 *
 * Returns non-zero on match, 0 otherwise.
 */
int
rib_match_gw(const struct rtentry *rt, const struct nhop_object *nh, void *gw_sa)
{

	return (match_nhop_gw(nh, (const struct sockaddr *)gw_sa));
}
300 
/*
 * State shared with the match_gw_one() filter callback.
 */
struct gw_filter_data {
	const struct sockaddr *gw;	/* gateway to compare nexthops against */
	int count;			/* matches seen so far */
};
305 
306 /*
307  * Matches first occurence of the gateway provided in @gwd
308  */
309 static int
310 match_gw_one(const struct rtentry *rt, const struct nhop_object *nh, void *_data)
311 {
312 	struct gw_filter_data *gwd = (struct gw_filter_data *)_data;
313 
314 	/* Return only first match to make rtsock happy */
315 	if (match_nhop_gw(nh, gwd->gw) && gwd->count++ == 0)
316 		return (1);
317 	return (0);
318 }
319 
320 /*
321  * Checks if data in @info matches nexhop @nh.
322  *
323  * Returns 0 on success,
324  * ESRCH if not matched,
325  * ENOENT if filter function returned false
326  */
327 int
328 check_info_match_nhop(const struct rt_addrinfo *info, const struct rtentry *rt,
329     const struct nhop_object *nh)
330 {
331 	const struct sockaddr *gw = info->rti_info[RTAX_GATEWAY];
332 
333 	if (info->rti_filter != NULL) {
334 	    if (info->rti_filter(rt, nh, info->rti_filterdata) == 0)
335 		    return (ENOENT);
336 	    else
337 		    return (0);
338 	}
339 	if ((gw != NULL) && !match_nhop_gw(nh, gw))
340 		return (ESRCH);
341 
342 	return (0);
343 }
344 
345 /*
346  * Runs exact prefix match based on @dst and @netmask.
347  * Returns matched @rtentry if found or NULL.
348  * If rtentry was found, saves nexthop / weight value into @rnd.
349  */
350 static struct rtentry *
351 lookup_prefix_bysa(struct rib_head *rnh, const struct sockaddr *dst,
352     const struct sockaddr *netmask, struct route_nhop_data *rnd)
353 {
354 	struct rtentry *rt;
355 
356 	RIB_LOCK_ASSERT(rnh);
357 
358 	rt = (struct rtentry *)rnh->rnh_lookup(dst, netmask, &rnh->head);
359 	if (rt != NULL) {
360 		rnd->rnd_nhop = rt->rt_nhop;
361 		rnd->rnd_weight = rt->rt_weight;
362 	} else {
363 		rnd->rnd_nhop = NULL;
364 		rnd->rnd_weight = 0;
365 	}
366 
367 	return (rt);
368 }
369 
/*
 * Runs exact prefix match using the key and mask of rtentry @rt.
 * Same return value and @rnd handling as lookup_prefix_bysa().
 */
struct rtentry *
lookup_prefix_rt(struct rib_head *rnh, const struct rtentry *rt,
    struct route_nhop_data *rnd)
{
	const struct sockaddr *dst = rt_key_const(rt);
	const struct sockaddr *netmask = rt_mask_const(rt);

	return (lookup_prefix_bysa(rnh, dst, netmask, rnd));
}
376 
377 /*
378  * Runs exact prefix match based on dst/netmask from @info.
379  * Assumes RIB lock is held.
380  * Returns matched @rtentry if found or NULL.
381  * If rtentry was found, saves nexthop / weight value into @rnd.
382  */
383 struct rtentry *
384 lookup_prefix(struct rib_head *rnh, const struct rt_addrinfo *info,
385     struct route_nhop_data *rnd)
386 {
387 	struct rtentry *rt;
388 
389 	rt = lookup_prefix_bysa(rnh, info->rti_info[RTAX_DST],
390 	    info->rti_info[RTAX_NETMASK], rnd);
391 
392 	return (rt);
393 }
394 
395 const struct rtentry *
396 rib_lookup_prefix_plen(struct rib_head *rnh, struct sockaddr *dst, int plen,
397     struct route_nhop_data *rnd)
398 {
399 	union sockaddr_union mask_storage;
400 	struct sockaddr *netmask = &mask_storage.sa;
401 
402 	if (fill_pxmask_family(dst->sa_family, plen, dst, &netmask))
403 		return (lookup_prefix_bysa(rnh, dst, netmask, rnd));
404 	return (NULL);
405 }
406 
/*
 * Builds the netmask for a @family prefix of length @plen and applies
 * it to the destination @_dst.
 *
 * @family: address family of @_dst
 * @plen: prefix length, or -1 for "no mask"
 * @_dst: destination sockaddr; masked in place for non-host prefixes
 * @pmask: in: pointer to caller-provided mask storage;
 *         out: the filled mask, or NULL for host routes
 *         (@plen == -1 or a full-length prefix)
 *
 * Returns true on success, false if @plen is out of range for @family
 * or @family is not supported by this kernel configuration.
 */
static bool
fill_pxmask_family(int family, int plen, struct sockaddr *_dst,
    struct sockaddr **pmask)
{
	if (plen == -1) {
		*pmask = NULL;
		return (true);
	}

	switch (family) {
#ifdef INET
	case AF_INET:
		{
			struct sockaddr_in *mask = (struct sockaddr_in *)(*pmask);
			struct sockaddr_in *dst = (struct sockaddr_in *)_dst;

			memset(mask, 0, sizeof(*mask));
			mask->sin_family = family;
			mask->sin_len = sizeof(*mask);
			if (plen == 32)
				*pmask = NULL;
			else if (plen > 32 || plen < 0)
				return (false);
			else {
				uint32_t daddr, maddr;

				/*
				 * Shift an unsigned 1: for plen == 1 the shift
				 * count is 31, which is undefined for a
				 * signed int.
				 */
				maddr = htonl(plen ? ~((1U << (32 - plen)) - 1) : 0);
				mask->sin_addr.s_addr = maddr;
				daddr = dst->sin_addr.s_addr;
				daddr = htonl(ntohl(daddr) & ntohl(maddr));
				dst->sin_addr.s_addr = daddr;
			}
			return (true);
		}
#endif
#ifdef INET6
	case AF_INET6:
		{
			struct sockaddr_in6 *mask = (struct sockaddr_in6 *)(*pmask);
			struct sockaddr_in6 *dst = (struct sockaddr_in6 *)_dst;

			memset(mask, 0, sizeof(*mask));
			mask->sin6_family = family;
			mask->sin6_len = sizeof(*mask);
			if (plen == 128)
				*pmask = NULL;
			else if (plen > 128 || plen < 0)
				return (false);
			else {
				ip6_writemask(&mask->sin6_addr, plen);
				IN6_MASK_ADDR(&dst->sin6_addr, &mask->sin6_addr);
			}
			return (true);
		}
#endif
	default:
		break;
	}

	/* Address family without prefix-length support */
	return (false);
}
466 
467 /*
468  * Attempts to add @dst/plen prefix with nexthop/nexhopgroup data @rnd
469  * to the routing table.
470  *
471  * @fibnum: verified kernel rtable id to insert route to
472  * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
473  * @plen: prefix length (or -1 if host route or not applicable for AF)
474  * @op_flags: combination of RTM_F_ flags
475  * @rc: storage to report operation result
476  *
477  * Returns 0 on success.
478  */
479 int
480 rib_add_route_px(uint32_t fibnum, struct sockaddr *dst, int plen,
481     struct route_nhop_data *rnd, int op_flags, struct rib_cmd_info *rc)
482 {
483 	union sockaddr_union mask_storage;
484 	struct sockaddr *netmask = &mask_storage.sa;
485 	struct rtentry *rt = NULL;
486 
487 	NET_EPOCH_ASSERT();
488 
489 	bzero(rc, sizeof(struct rib_cmd_info));
490 	rc->rc_cmd = RTM_ADD;
491 
492 	struct rib_head *rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
493 	if (rnh == NULL)
494 		return (EAFNOSUPPORT);
495 
496 	if (!fill_pxmask_family(dst->sa_family, plen, dst, &netmask)) {
497 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: invalid plen %d", plen);
498 		return (EINVAL);
499 	}
500 
501 	if (op_flags & RTM_F_CREATE) {
502 		if ((rt = rt_alloc(rnh, dst, netmask)) == NULL) {
503 			FIB_RH_LOG(LOG_INFO, rnh, "rtentry allocation failed");
504 			return (ENOMEM);
505 		}
506 	} else {
507 		struct route_nhop_data rnd_tmp;
508 		RIB_RLOCK_TRACKER;
509 
510 		RIB_RLOCK(rnh);
511 		rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd_tmp);
512 		RIB_RUNLOCK(rnh);
513 
514 		if (rt == NULL)
515 			return (ESRCH);
516 	}
517 
518 	return (add_route_flags(rnh, rt, rnd, op_flags, rc));
519 }
520 
521 /*
522  * Attempts to delete @dst/plen prefix matching gateway @gw from the
523  *  routing rable.
524  *
525  * @fibnum: rtable id to remove route from
526  * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
527  * @plen: prefix length (or -1 if host route or not applicable for AF)
528  * @gw: gateway to match
529  * @op_flags: combination of RTM_F_ flags
530  * @rc: storage to report operation result
531  *
532  * Returns 0 on success.
533  */
534 int
535 rib_del_route_px_gw(uint32_t fibnum, struct sockaddr *dst, int plen,
536     const struct sockaddr *gw, int op_flags, struct rib_cmd_info *rc)
537 {
538 	struct gw_filter_data gwd = { .gw = gw };
539 
540 	return (rib_del_route_px(fibnum, dst, plen, match_gw_one, &gwd, op_flags, rc));
541 }
542 
543 /*
544  * Attempts to delete @dst/plen prefix matching @filter_func from the
545  *  routing rable.
546  *
547  * @fibnum: rtable id to remove route from
548  * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
549  * @plen: prefix length (or -1 if host route or not applicable for AF)
550  * @filter_func: func to be called for each nexthop of the prefix for matching
551  * @filter_arg: argument to pass to @filter_func
552  * @op_flags: combination of RTM_F_ flags
553  * @rc: storage to report operation result
554  *
555  * Returns 0 on success.
556  */
557 int
558 rib_del_route_px(uint32_t fibnum, struct sockaddr *dst, int plen,
559     rib_filter_f_t *filter_func, void *filter_arg, int op_flags,
560     struct rib_cmd_info *rc)
561 {
562 	union sockaddr_union mask_storage;
563 	struct sockaddr *netmask = &mask_storage.sa;
564 	int error;
565 
566 	NET_EPOCH_ASSERT();
567 
568 	bzero(rc, sizeof(struct rib_cmd_info));
569 	rc->rc_cmd = RTM_DELETE;
570 
571 	struct rib_head *rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
572 	if (rnh == NULL)
573 		return (EAFNOSUPPORT);
574 
575 	if (dst->sa_len > sizeof(mask_storage)) {
576 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too big: %d", dst->sa_len);
577 		return (EINVAL);
578 	}
579 
580 	if (!fill_pxmask_family(dst->sa_family, plen, dst, &netmask)) {
581 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: invalid plen %d", plen);
582 		return (EINVAL);
583 	}
584 
585 	int prio = (op_flags & RTM_F_FORCE) ? NH_PRIORITY_HIGH : NH_PRIORITY_NORMAL;
586 
587 	RIB_WLOCK(rnh);
588 	struct route_nhop_data rnd;
589 	struct rtentry *rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd);
590 	if (rt != NULL) {
591 		error = rt_delete_conditional(rnh, rt, prio, filter_func,
592 		    filter_arg, rc);
593 	} else
594 		error = ESRCH;
595 	RIB_WUNLOCK(rnh);
596 
597 	if (error != 0)
598 		return (error);
599 
600 	rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
601 
602 	if (rc->rc_cmd == RTM_DELETE)
603 		rt_free(rc->rc_rt);
604 #ifdef ROUTE_MPATH
605 	else {
606 		/*
607 		 * Deleting 1 path may result in RTM_CHANGE to
608 		 * a different mpath group/nhop.
609 		 * Free old mpath group.
610 		 */
611 		nhop_free_any(rc->rc_nh_old);
612 	}
613 #endif
614 
615 	return (0);
616 }
617 
618 /*
619  * Tries to copy route @rt from one rtable to the rtable specified by @dst_rh.
620  * @rt: route to copy.
621  * @rnd_src: nhop and weight. Multipath routes are not supported
622  * @rh_dst: target rtable.
623  * @rc: operation result storage
624  *
625  * Return 0 on success.
626  */
627 int
628 rib_copy_route(struct rtentry *rt, const struct route_nhop_data *rnd_src,
629     struct rib_head *rh_dst, struct rib_cmd_info *rc)
630 {
631 	struct nhop_object __diagused *nh_src = rnd_src->rnd_nhop;
632 	int error;
633 
634 	MPASS((nh_src->nh_flags & NHF_MULTIPATH) == 0);
635 
636 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
637 		char nhbuf[NHOP_PRINT_BUFSIZE], rtbuf[NHOP_PRINT_BUFSIZE];
638 		nhop_print_buf_any(nh_src, nhbuf, sizeof(nhbuf));
639 		rt_print_buf(rt, rtbuf, sizeof(rtbuf));
640 		FIB_RH_LOG(LOG_DEBUG2, rh_dst, "copying %s -> %s from fib %u",
641 		    rtbuf, nhbuf, nhop_get_fibnum(nh_src));
642 	}
643 	struct nhop_object *nh = nhop_alloc(rh_dst->rib_fibnum, rh_dst->rib_family);
644 	if (nh == NULL) {
645 		FIB_RH_LOG(LOG_INFO, rh_dst, "unable to allocate new nexthop");
646 		return (ENOMEM);
647 	}
648 	nhop_copy(nh, rnd_src->rnd_nhop);
649 	nhop_set_origin(nh, nhop_get_origin(rnd_src->rnd_nhop));
650 	nhop_set_fibnum(nh, rh_dst->rib_fibnum);
651 	nh = nhop_get_nhop_internal(rh_dst, nh, &error);
652 	if (error != 0) {
653 		FIB_RH_LOG(LOG_INFO, rh_dst,
654 		    "unable to finalize new nexthop: error %d", error);
655 		return (ENOMEM);
656 	}
657 
658 	struct rtentry *rt_new = rt_alloc(rh_dst, rt_key(rt), rt_mask(rt));
659 	if (rt_new == NULL) {
660 		FIB_RH_LOG(LOG_INFO, rh_dst, "unable to create new rtentry");
661 		nhop_free(nh);
662 		return (ENOMEM);
663 	}
664 
665 	struct route_nhop_data rnd = {
666 		.rnd_nhop = nh,
667 		.rnd_weight = rnd_src->rnd_weight
668 	};
669 	int op_flags = RTM_F_CREATE | (NH_IS_PINNED(nh) ? RTM_F_FORCE : 0);
670 	error = add_route_flags(rh_dst, rt_new, &rnd, op_flags, rc);
671 
672 	if (error != 0) {
673 		IF_DEBUG_LEVEL(LOG_DEBUG2) {
674 			char buf[NHOP_PRINT_BUFSIZE];
675 			rt_print_buf(rt_new, buf, sizeof(buf));
676 			FIB_RH_LOG(LOG_DEBUG, rh_dst,
677 			    "Unable to add route %s: error %d", buf, error);
678 		}
679 		nhop_free(nh);
680 		rt_free_immediate(rt_new);
681 	}
682 	return (error);
683 }
684 
685 /*
686  * Adds route defined by @info into the kernel table specified by @fibnum and
687  * sa_family in @info->rti_info[RTAX_DST].
688  *
689  * Returns 0 on success and fills in operation metadata into @rc.
690  */
691 int
692 rib_add_route(uint32_t fibnum, struct rt_addrinfo *info,
693     struct rib_cmd_info *rc)
694 {
695 	struct rib_head *rnh;
696 	int error;
697 
698 	NET_EPOCH_ASSERT();
699 
700 	rnh = get_rnh(fibnum, info);
701 	if (rnh == NULL)
702 		return (EAFNOSUPPORT);
703 
704 	/*
705 	 * Check consistency between RTF_HOST flag and netmask
706 	 * existence.
707 	 */
708 	if (info->rti_flags & RTF_HOST)
709 		info->rti_info[RTAX_NETMASK] = NULL;
710 	else if (info->rti_info[RTAX_NETMASK] == NULL) {
711 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: no RTF_HOST and empty netmask");
712 		return (EINVAL);
713 	}
714 
715 	bzero(rc, sizeof(struct rib_cmd_info));
716 	rc->rc_cmd = RTM_ADD;
717 
718 	error = add_route_byinfo(rnh, info, rc);
719 	if (error == 0)
720 		rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
721 
722 	return (error);
723 }
724 
725 static int
726 add_route_byinfo(struct rib_head *rnh, struct rt_addrinfo *info,
727     struct rib_cmd_info *rc)
728 {
729 	struct route_nhop_data rnd_add;
730 	struct nhop_object *nh;
731 	struct rtentry *rt;
732 	struct sockaddr *dst, *gateway, *netmask;
733 	int error;
734 
735 	dst = info->rti_info[RTAX_DST];
736 	gateway = info->rti_info[RTAX_GATEWAY];
737 	netmask = info->rti_info[RTAX_NETMASK];
738 
739 	if ((info->rti_flags & RTF_GATEWAY) && !gateway) {
740 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: RTF_GATEWAY set with empty gw");
741 		return (EINVAL);
742 	}
743 	if (dst && gateway && !nhop_check_gateway(dst->sa_family, gateway->sa_family)) {
744 		FIB_RH_LOG(LOG_DEBUG, rnh,
745 		    "error: invalid dst/gateway family combination (%d, %d)",
746 		    dst->sa_family, gateway->sa_family);
747 		return (EINVAL);
748 	}
749 
750 	if (dst->sa_len > sizeof(((struct rtentry *)NULL)->rt_dstb)) {
751 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too large: %d",
752 		    dst->sa_len);
753 		return (EINVAL);
754 	}
755 
756 	if (info->rti_ifa == NULL) {
757 		error = rt_getifa_fib(info, rnh->rib_fibnum);
758 		if (error)
759 			return (error);
760 	}
761 
762 	if ((rt = rt_alloc(rnh, dst, netmask)) == NULL)
763 		return (ENOBUFS);
764 
765 	error = nhop_create_from_info(rnh, info, &nh);
766 	if (error != 0) {
767 		rt_free_immediate(rt);
768 		return (error);
769 	}
770 
771 	rnd_add.rnd_nhop = nh;
772 	rnd_add.rnd_weight = get_info_weight(info, RT_DEFAULT_WEIGHT);
773 
774 	int op_flags = RTM_F_CREATE;
775 	if (get_prio_from_info(info) == NH_PRIORITY_HIGH)
776 		op_flags |= RTM_F_FORCE;
777 	else
778 		op_flags |= RTM_F_APPEND;
779 	return (add_route_flags(rnh, rt, &rnd_add, op_flags, rc));
780 
781 }
782 
783 static int
784 add_route_flags(struct rib_head *rnh, struct rtentry *rt, struct route_nhop_data *rnd_add,
785     int op_flags, struct rib_cmd_info *rc)
786 {
787 	struct route_nhop_data rnd_orig;
788 	struct nhop_object *nh;
789 	struct rtentry *rt_orig;
790 	int error = 0;
791 
792 	MPASS(rt != NULL);
793 
794 	nh = rnd_add->rnd_nhop;
795 
796 	RIB_WLOCK(rnh);
797 
798 	rt_orig = lookup_prefix_rt(rnh, rt, &rnd_orig);
799 
800 	if (rt_orig == NULL) {
801 		if (op_flags & RTM_F_CREATE)
802 			error = add_route(rnh, rt, rnd_add, rc);
803 		else
804 			error = ESRCH; /* no entry but creation was not required */
805 		RIB_WUNLOCK(rnh);
806 		if (error != 0)
807 			goto out;
808 		return (0);
809 	}
810 
811 	if (op_flags & RTM_F_EXCL) {
812 		/* We have existing route in the RIB but not allowed to replace. */
813 		RIB_WUNLOCK(rnh);
814 		error = EEXIST;
815 		goto out;
816 	}
817 
818 	/* Now either append or replace */
819 	if (op_flags & RTM_F_REPLACE) {
820 		if (nhop_get_prio(rnd_orig.rnd_nhop) > nhop_get_prio(rnd_add->rnd_nhop)) {
821 			/* Old path is "better" (e.g. has PINNED flag set) */
822 			RIB_WUNLOCK(rnh);
823 			error = EEXIST;
824 			goto out;
825 		}
826 		change_route(rnh, rt_orig, rnd_add, rc);
827 		RIB_WUNLOCK(rnh);
828 		nh = rc->rc_nh_old;
829 		goto out;
830 	}
831 
832 	RIB_WUNLOCK(rnh);
833 
834 #ifdef ROUTE_MPATH
835 	if ((op_flags & RTM_F_APPEND) && rib_can_multipath(rnh) &&
836 	    nhop_can_multipath(rnd_add->rnd_nhop) &&
837 	    nhop_can_multipath(rnd_orig.rnd_nhop)) {
838 
839 		for (int i = 0; i < RIB_MAX_RETRIES; i++) {
840 			error = add_route_flags_mpath(rnh, rt_orig, rnd_add, &rnd_orig,
841 			    op_flags, rc);
842 			if (error != EAGAIN)
843 				break;
844 			RTSTAT_INC(rts_add_retry);
845 		}
846 
847 		/*
848 		 *  Original nhop reference is unused in any case.
849 		 */
850 		nhop_free_any(rnd_add->rnd_nhop);
851 		if (op_flags & RTM_F_CREATE) {
852 			if (error != 0 || rc->rc_cmd != RTM_ADD)
853 				rt_free_immediate(rt);
854 		}
855 		return (error);
856 	}
857 #endif
858 	/* Out of options - free state and return error */
859 	error = EEXIST;
860 out:
861 	if (op_flags & RTM_F_CREATE)
862 		rt_free_immediate(rt);
863 	nhop_free_any(nh);
864 
865 	return (error);
866 }
867 
868 #ifdef ROUTE_MPATH
869 static int
870 add_route_flags_mpath(struct rib_head *rnh, struct rtentry *rt,
871     struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_orig,
872     int op_flags, struct rib_cmd_info *rc)
873 {
874 	RIB_RLOCK_TRACKER;
875 	struct route_nhop_data rnd_new;
876 	int error = 0;
877 
878 	error = nhgrp_get_addition_group(rnh, rnd_orig, rnd_add, &rnd_new);
879 	if (error != 0) {
880 		if (error == EAGAIN) {
881 			/*
882 			 * Group creation failed, most probably because
883 			 * @rnd_orig data got scheduled for deletion.
884 			 * Refresh @rnd_orig data and retry.
885 			 */
886 			RIB_RLOCK(rnh);
887 			lookup_prefix_rt(rnh, rt, rnd_orig);
888 			RIB_RUNLOCK(rnh);
889 			if (rnd_orig == NULL && !(op_flags & RTM_F_CREATE)) {
890 				/* In this iteration route doesn't exist */
891 				error = ENOENT;
892 			}
893 		}
894 		return (error);
895 	}
896 	error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
897 	if (error != 0)
898 		return (error);
899 
900 	if (V_fib_hash_outbound == 0 && NH_IS_NHGRP(rc->rc_nh_new)) {
901 		/*
902 		 * First multipath route got installed. Enable local
903 		 * outbound connections hashing.
904 		 */
905 		if (bootverbose)
906 			printf("FIB: enabled flowid calculation for locally-originated packets\n");
907 		V_fib_hash_outbound = 1;
908 	}
909 
910 	return (0);
911 }
912 #endif
913 
914 /*
915  * Removes route defined by @info from the kernel table specified by @fibnum and
916  * sa_family in @info->rti_info[RTAX_DST].
917  *
918  * Returns 0 on success and fills in operation metadata into @rc.
919  */
920 int
921 rib_del_route(uint32_t fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc)
922 {
923 	struct rib_head *rnh;
924 	struct sockaddr *dst, *netmask;
925 	struct sockaddr_storage mdst;
926 	int error;
927 
928 	NET_EPOCH_ASSERT();
929 
930 	rnh = get_rnh(fibnum, info);
931 	if (rnh == NULL)
932 		return (EAFNOSUPPORT);
933 
934 	bzero(rc, sizeof(struct rib_cmd_info));
935 	rc->rc_cmd = RTM_DELETE;
936 
937 	dst = info->rti_info[RTAX_DST];
938 	netmask = info->rti_info[RTAX_NETMASK];
939 
940 	if (netmask != NULL) {
941 		/* Ensure @dst is always properly masked */
942 		if (dst->sa_len > sizeof(mdst)) {
943 			FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too large");
944 			return (EINVAL);
945 		}
946 		rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask);
947 		dst = (struct sockaddr *)&mdst;
948 	}
949 
950 	rib_filter_f_t *filter_func = NULL;
951 	void *filter_arg = NULL;
952 	struct gw_filter_data gwd = { .gw = info->rti_info[RTAX_GATEWAY] };
953 
954 	if (info->rti_filter != NULL) {
955 		filter_func = info->rti_filter;
956 		filter_arg = info->rti_filterdata;
957 	} else if (gwd.gw != NULL) {
958 		filter_func = match_gw_one;
959 		filter_arg = &gwd;
960 	}
961 
962 	int prio = get_prio_from_info(info);
963 
964 	RIB_WLOCK(rnh);
965 	struct route_nhop_data rnd;
966 	struct rtentry *rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd);
967 	if (rt != NULL) {
968 		error = rt_delete_conditional(rnh, rt, prio, filter_func,
969 		    filter_arg, rc);
970 	} else
971 		error = ESRCH;
972 	RIB_WUNLOCK(rnh);
973 
974 	if (error != 0)
975 		return (error);
976 
977 	rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
978 
979 	if (rc->rc_cmd == RTM_DELETE)
980 		rt_free(rc->rc_rt);
981 #ifdef ROUTE_MPATH
982 	else {
983 		/*
984 		 * Deleting 1 path may result in RTM_CHANGE to
985 		 * a different mpath group/nhop.
986 		 * Free old mpath group.
987 		 */
988 		nhop_free_any(rc->rc_nh_old);
989 	}
990 #endif
991 
992 	return (0);
993 }
994 
995 /*
996  * Conditionally unlinks rtentry paths from @rnh matching @cb.
997  * Returns 0 on success with operation result stored in @rc.
998  * On error, returns:
999  * ESRCH - if prefix was not found or filter function failed to match
1000  * EADDRINUSE - if trying to delete higher priority route.
1001  */
1002 static int
1003 rt_delete_conditional(struct rib_head *rnh, struct rtentry *rt,
1004     int prio, rib_filter_f_t *cb, void *cbdata, struct rib_cmd_info *rc)
1005 {
1006 	struct nhop_object *nh = rt->rt_nhop;
1007 
1008 #ifdef ROUTE_MPATH
1009 	if (NH_IS_NHGRP(nh)) {
1010 		struct nhgrp_object *nhg = (struct nhgrp_object *)nh;
1011 		struct route_nhop_data rnd;
1012 		int error;
1013 
1014 		if (cb == NULL)
1015 			return (ESRCH);
1016 		error = nhgrp_get_filtered_group(rnh, rt, nhg, cb, cbdata, &rnd);
1017 		if (error == 0) {
1018 			if (rnd.rnd_nhgrp == nhg) {
1019 				/* No match, unreference new group and return. */
1020 				nhop_free_any(rnd.rnd_nhop);
1021 				return (ESRCH);
1022 			}
1023 			error = change_route(rnh, rt, &rnd, rc);
1024 		}
1025 		return (error);
1026 	}
1027 #endif
1028 	if (cb != NULL && !cb(rt, nh, cbdata))
1029 		return (ESRCH);
1030 
1031 	if (prio < nhop_get_prio(nh))
1032 		return (EADDRINUSE);
1033 
1034 	return (delete_route(rnh, rt, rc));
1035 }
1036 
1037 int
1038 rib_change_route(uint32_t fibnum, struct rt_addrinfo *info,
1039     struct rib_cmd_info *rc)
1040 {
1041 	RIB_RLOCK_TRACKER;
1042 	struct route_nhop_data rnd_orig;
1043 	struct rib_head *rnh;
1044 	struct rtentry *rt;
1045 	int error;
1046 
1047 	NET_EPOCH_ASSERT();
1048 
1049 	rnh = get_rnh(fibnum, info);
1050 	if (rnh == NULL)
1051 		return (EAFNOSUPPORT);
1052 
1053 	bzero(rc, sizeof(struct rib_cmd_info));
1054 	rc->rc_cmd = RTM_CHANGE;
1055 
1056 	/* Check if updated gateway exists */
1057 	if ((info->rti_flags & RTF_GATEWAY) &&
1058 	    (info->rti_info[RTAX_GATEWAY] == NULL)) {
1059 
1060 		/*
1061 		 * route(8) adds RTF_GATEWAY flag if -interface is not set.
1062 		 * Remove RTF_GATEWAY to enforce consistency and maintain
1063 		 * compatibility..
1064 		 */
1065 		info->rti_flags &= ~RTF_GATEWAY;
1066 	}
1067 
1068 	/*
1069 	 * route change is done in multiple steps, with dropping and
1070 	 * reacquiring lock. In the situations with multiple processes
1071 	 * changes the same route in can lead to the case when route
1072 	 * is changed between the steps. Address it by retrying the operation
1073 	 * multiple times before failing.
1074 	 */
1075 
1076 	RIB_RLOCK(rnh);
1077 	rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST],
1078 	    info->rti_info[RTAX_NETMASK], &rnh->head);
1079 
1080 	if (rt == NULL) {
1081 		RIB_RUNLOCK(rnh);
1082 		return (ESRCH);
1083 	}
1084 
1085 	rnd_orig.rnd_nhop = rt->rt_nhop;
1086 	rnd_orig.rnd_weight = rt->rt_weight;
1087 
1088 	RIB_RUNLOCK(rnh);
1089 
1090 	for (int i = 0; i < RIB_MAX_RETRIES; i++) {
1091 		error = change_route_byinfo(rnh, rt, info, &rnd_orig, rc);
1092 		if (error != EAGAIN)
1093 			break;
1094 	}
1095 
1096 	return (error);
1097 }
1098 
1099 static int
1100 change_nhop(struct rib_head *rnh, struct rt_addrinfo *info,
1101     struct nhop_object *nh_orig, struct nhop_object **nh_new)
1102 {
1103 	int error;
1104 
1105 	/*
1106 	 * New gateway could require new ifaddr, ifp;
1107 	 * flags may also be different; ifp may be specified
1108 	 * by ll sockaddr when protocol address is ambiguous
1109 	 */
1110 	if (((nh_orig->nh_flags & NHF_GATEWAY) &&
1111 	    info->rti_info[RTAX_GATEWAY] != NULL) ||
1112 	    info->rti_info[RTAX_IFP] != NULL ||
1113 	    (info->rti_info[RTAX_IFA] != NULL &&
1114 	     !sa_equal(info->rti_info[RTAX_IFA], nh_orig->nh_ifa->ifa_addr))) {
1115 		error = rt_getifa_fib(info, rnh->rib_fibnum);
1116 
1117 		if (error != 0) {
1118 			info->rti_ifa = NULL;
1119 			return (error);
1120 		}
1121 	}
1122 
1123 	error = nhop_create_from_nhop(rnh, nh_orig, info, nh_new);
1124 	info->rti_ifa = NULL;
1125 
1126 	return (error);
1127 }
1128 
#ifdef ROUTE_MPATH
/*
 * Changes a single path of the multipath route @rt.
 *
 * The path to change is the first nexthop of the group in @rnd_orig
 * for which check_info_match_nhop() returns 0. A copy of the group
 * with that path replaced is then installed via
 * change_route_conditional().
 *
 * Returns ESRCH if no path matches, EAGAIN if the temporary array
 * cannot be allocated, or the error of the failing step.
 */
static int
change_mpath_route(struct rib_head *rnh, struct rtentry *rt,
    struct rt_addrinfo *info, struct route_nhop_data *rnd_orig,
    struct rib_cmd_info *rc)
{
	struct route_nhop_data rnd_new = {};
	struct nhop_object *target = NULL, *replacement;
	const struct weightened_nhop *paths;
	struct weightened_nhop *paths_new;
	uint32_t npaths;
	int error, idx = 0;

	/* Find the path the request refers to. */
	paths = nhgrp_get_nhops(rnd_orig->rnd_nhgrp, &npaths);
	for (int i = 0; i < npaths; i++) {
		if (check_info_match_nhop(info, NULL, paths[i].nh) != 0)
			continue;
		target = paths[i].nh;
		idx = i;
		break;
	}
	if (target == NULL)
		return (ESRCH);

	/* Build the replacement nexthop for that path. */
	error = change_nhop(rnh, info, target, &replacement);
	if (error != 0)
		return (error);

	paths_new = mallocarray(npaths, sizeof(*paths_new),
	    M_TEMP, M_NOWAIT | M_ZERO);
	if (paths_new == NULL) {
		nhop_free(replacement);
		return (EAGAIN);
	}

	/* Copy the path list and patch the selected slot. */
	memcpy(paths_new, paths, npaths * sizeof(*paths_new));
	paths_new[idx].nh = replacement;
	paths_new[idx].weight = get_info_weight(info, paths[idx].weight);

	error = nhgrp_get_group(rnh, paths_new, npaths, 0,
	    &rnd_new.rnd_nhgrp);
	/*
	 * The local nexthop reference and the temporary array are
	 * released regardless of the outcome.
	 * NOTE(review): presumably the group holds its own reference
	 * on success - confirm against nhgrp_get_group().
	 */
	nhop_free(replacement);
	free(paths_new, M_TEMP);
	if (error != 0)
		return (error);

	return (change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc));
}
#endif
1181 
1182 static int
1183 change_route_byinfo(struct rib_head *rnh, struct rtentry *rt,
1184     struct rt_addrinfo *info, struct route_nhop_data *rnd_orig,
1185     struct rib_cmd_info *rc)
1186 {
1187 	int error = 0;
1188 	struct nhop_object *nh_orig;
1189 	struct route_nhop_data rnd_new;
1190 
1191 	nh_orig = rnd_orig->rnd_nhop;
1192 	if (nh_orig == NULL)
1193 		return (ESRCH);
1194 
1195 #ifdef ROUTE_MPATH
1196 	if (NH_IS_NHGRP(nh_orig))
1197 		return (change_mpath_route(rnh, rt, info, rnd_orig, rc));
1198 #endif
1199 
1200 	rnd_new.rnd_weight = get_info_weight(info, rnd_orig->rnd_weight);
1201 	error = change_nhop(rnh, info, nh_orig, &rnd_new.rnd_nhop);
1202 	if (error != 0)
1203 		return (error);
1204 	error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
1205 
1206 	return (error);
1207 }
1208 
1209 /*
1210  * Insert @rt with nhop data from @rnd_new to @rnh.
1211  * Returns 0 on success and stores operation results in @rc.
1212  */
1213 static int
1214 add_route(struct rib_head *rnh, struct rtentry *rt,
1215     struct route_nhop_data *rnd, struct rib_cmd_info *rc)
1216 {
1217 	struct radix_node *rn;
1218 
1219 	RIB_WLOCK_ASSERT(rnh);
1220 
1221 	rt->rt_nhop = rnd->rnd_nhop;
1222 	rt->rt_weight = rnd->rnd_weight;
1223 	rn = rnh->rnh_addaddr(rt_key(rt), rt_mask_const(rt), &rnh->head, rt->rt_nodes);
1224 
1225 	if (rn != NULL) {
1226 		if (!NH_IS_NHGRP(rnd->rnd_nhop) && nhop_get_expire(rnd->rnd_nhop))
1227 			tmproutes_update(rnh, rt, rnd->rnd_nhop);
1228 
1229 		/* Finalize notification */
1230 		rib_bump_gen(rnh);
1231 		rnh->rnh_prefixes++;
1232 
1233 		rc->rc_cmd = RTM_ADD;
1234 		rc->rc_rt = rt;
1235 		rc->rc_nh_old = NULL;
1236 		rc->rc_nh_new = rnd->rnd_nhop;
1237 		rc->rc_nh_weight = rnd->rnd_weight;
1238 
1239 		rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1240 		return (0);
1241 	}
1242 
1243 	/* Existing route or memory allocation failure. */
1244 	return (EEXIST);
1245 }
1246 
1247 /*
1248  * Unconditionally deletes @rt from @rnh.
1249  */
1250 static int
1251 delete_route(struct rib_head *rnh, struct rtentry *rt, struct rib_cmd_info *rc)
1252 {
1253 	RIB_WLOCK_ASSERT(rnh);
1254 
1255 	/* Route deletion requested. */
1256 	struct radix_node *rn;
1257 
1258 	rn = rnh->rnh_deladdr(rt_key_const(rt), rt_mask_const(rt), &rnh->head);
1259 	if (rn == NULL)
1260 		return (ESRCH);
1261 	rt = RNTORT(rn);
1262 	rt->rte_flags &= ~RTF_UP;
1263 
1264 	rib_bump_gen(rnh);
1265 	rnh->rnh_prefixes--;
1266 
1267 	rc->rc_cmd = RTM_DELETE;
1268 	rc->rc_rt = rt;
1269 	rc->rc_nh_old = rt->rt_nhop;
1270 	rc->rc_nh_new = NULL;
1271 	rc->rc_nh_weight = rt->rt_weight;
1272 
1273 	rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1274 
1275 	return (0);
1276 }
1277 
1278 /*
1279  * Switch @rt nhop/weigh to the ones specified in @rnd.
1280  * Returns 0 on success.
1281  */
1282 int
1283 change_route(struct rib_head *rnh, struct rtentry *rt,
1284     struct route_nhop_data *rnd, struct rib_cmd_info *rc)
1285 {
1286 	struct nhop_object *nh_orig;
1287 
1288 	RIB_WLOCK_ASSERT(rnh);
1289 
1290 	nh_orig = rt->rt_nhop;
1291 
1292 	if (rnd->rnd_nhop == NULL)
1293 		return (delete_route(rnh, rt, rc));
1294 
1295 	/* Changing nexthop & weight to a new one */
1296 	rt->rt_nhop = rnd->rnd_nhop;
1297 	rt->rt_weight = rnd->rnd_weight;
1298 	if (!NH_IS_NHGRP(rnd->rnd_nhop) && nhop_get_expire(rnd->rnd_nhop))
1299 		tmproutes_update(rnh, rt, rnd->rnd_nhop);
1300 
1301 	/* Finalize notification */
1302 	rib_bump_gen(rnh);
1303 	rc->rc_cmd = RTM_CHANGE;
1304 	rc->rc_rt = rt;
1305 	rc->rc_nh_old = nh_orig;
1306 	rc->rc_nh_new = rnd->rnd_nhop;
1307 	rc->rc_nh_weight = rnd->rnd_weight;
1308 
1309 	rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1310 
1311 	return (0);
1312 }
1313 
1314 /*
1315  * Conditionally update route nhop/weight IFF data in @nhd_orig is
1316  *  consistent with the current route data.
1317  * Nexthop in @nhd_new is consumed.
1318  */
1319 int
1320 change_route_conditional(struct rib_head *rnh, struct rtentry *rt,
1321     struct route_nhop_data *rnd_orig, struct route_nhop_data *rnd_new,
1322     struct rib_cmd_info *rc)
1323 {
1324 	struct rtentry *rt_new;
1325 	int error = 0;
1326 
1327 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
1328 		char buf_old[NHOP_PRINT_BUFSIZE], buf_new[NHOP_PRINT_BUFSIZE];
1329 		nhop_print_buf_any(rnd_orig->rnd_nhop, buf_old, NHOP_PRINT_BUFSIZE);
1330 		nhop_print_buf_any(rnd_new->rnd_nhop, buf_new, NHOP_PRINT_BUFSIZE);
1331 		FIB_LOG(LOG_DEBUG2, rnh->rib_fibnum, rnh->rib_family,
1332 		    "trying change %s -> %s", buf_old, buf_new);
1333 	}
1334 	RIB_WLOCK(rnh);
1335 
1336 	struct route_nhop_data rnd;
1337 	rt_new = lookup_prefix_rt(rnh, rt, &rnd);
1338 
1339 	if (rt_new == NULL) {
1340 		if (rnd_orig->rnd_nhop == NULL)
1341 			error = add_route(rnh, rt, rnd_new, rc);
1342 		else {
1343 			/*
1344 			 * Prefix does not exist, which was not our assumption.
1345 			 * Update @rnd_orig with the new data and return
1346 			 */
1347 			rnd_orig->rnd_nhop = NULL;
1348 			rnd_orig->rnd_weight = 0;
1349 			error = EAGAIN;
1350 		}
1351 	} else {
1352 		/* Prefix exists, try to update */
1353 		if (rnd_orig->rnd_nhop == rt_new->rt_nhop) {
1354 			/*
1355 			 * Nhop/mpath group hasn't changed. Flip
1356 			 * to the new precalculated one and return
1357 			 */
1358 			error = change_route(rnh, rt_new, rnd_new, rc);
1359 		} else {
1360 			/* Update and retry */
1361 			rnd_orig->rnd_nhop = rt_new->rt_nhop;
1362 			rnd_orig->rnd_weight = rt_new->rt_weight;
1363 			error = EAGAIN;
1364 		}
1365 	}
1366 
1367 	RIB_WUNLOCK(rnh);
1368 
1369 	if (error == 0) {
1370 		rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
1371 
1372 		if (rnd_orig->rnd_nhop != NULL)
1373 			nhop_free_any(rnd_orig->rnd_nhop);
1374 
1375 	} else {
1376 		if (rnd_new->rnd_nhop != NULL)
1377 			nhop_free_any(rnd_new->rnd_nhop);
1378 	}
1379 
1380 	return (error);
1381 }
1382 
1383 /*
1384  * Performs modification of routing table specificed by @action.
1385  * Table is specified by @fibnum and sa_family in @info->rti_info[RTAX_DST].
1386  * Needs to be run in network epoch.
1387  *
1388  * Returns 0 on success and fills in @rc with action result.
1389  */
1390 int
1391 rib_action(uint32_t fibnum, int action, struct rt_addrinfo *info,
1392     struct rib_cmd_info *rc)
1393 {
1394 	int error;
1395 
1396 	switch (action) {
1397 	case RTM_ADD:
1398 		error = rib_add_route(fibnum, info, rc);
1399 		break;
1400 	case RTM_DELETE:
1401 		error = rib_del_route(fibnum, info, rc);
1402 		break;
1403 	case RTM_CHANGE:
1404 		error = rib_change_route(fibnum, info, rc);
1405 		break;
1406 	default:
1407 		error = ENOTSUP;
1408 	}
1409 
1410 	return (error);
1411 }
1412 
1413 struct rt_delinfo
1414 {
1415 	struct rib_head *rnh;
1416 	struct rtentry *head;
1417 	rib_filter_f_t *filter_f;
1418 	void *filter_arg;
1419 	int prio;
1420 	struct rib_cmd_info rc;
1421 };
1422 
1423 /*
1424  * Conditionally unlinks rtenties or paths from radix tree based
1425  * on the callback data passed in @arg.
1426  */
1427 static int
1428 rt_checkdelroute(struct radix_node *rn, void *arg)
1429 {
1430 	struct rt_delinfo *di = (struct rt_delinfo *)arg;
1431 	struct rtentry *rt = (struct rtentry *)rn;
1432 
1433 	if (rt_delete_conditional(di->rnh, rt, di->prio,
1434 	    di->filter_f, di->filter_arg, &di->rc) != 0)
1435 		return (0);
1436 
1437 	/*
1438 	 * Add deleted rtentries to the list to GC them
1439 	 *  after dropping the lock.
1440 	 *
1441 	 * XXX: Delayed notifications not implemented
1442 	 *  for nexthop updates.
1443 	 */
1444 	if (di->rc.rc_cmd == RTM_DELETE) {
1445 		/* Add to the list and return */
1446 		rt->rt_chain = di->head;
1447 		di->head = rt;
1448 #ifdef ROUTE_MPATH
1449 	} else {
1450 		/*
1451 		 * RTM_CHANGE to a different nexthop or nexthop group.
1452 		 * Free old multipath group.
1453 		 */
1454 		nhop_free_any(di->rc.rc_nh_old);
1455 #endif
1456 	}
1457 
1458 	return (0);
1459 }
1460 
1461 /*
1462  * Iterates over a routing table specified by @fibnum and @family and
1463  *  deletes elements marked by @filter_f.
1464  * @fibnum: rtable id
1465  * @family: AF_ address family
1466  * @filter_f: function returning non-zero value for items to delete
1467  * @arg: data to pass to the @filter_f function
1468  * @report: true if rtsock notification is needed.
1469  */
1470 void
1471 rib_walk_del(u_int fibnum, int family, rib_filter_f_t *filter_f, void *filter_arg,
1472     bool report)
1473 {
1474 	struct rib_head *rnh;
1475 	struct rtentry *rt;
1476 	struct nhop_object *nh;
1477 	struct epoch_tracker et;
1478 
1479 	rnh = rt_tables_get_rnh(fibnum, family);
1480 	if (rnh == NULL)
1481 		return;
1482 
1483 	struct rt_delinfo di = {
1484 		.rnh = rnh,
1485 		.filter_f = filter_f,
1486 		.filter_arg = filter_arg,
1487 		.prio = NH_PRIORITY_NORMAL,
1488 	};
1489 
1490 	NET_EPOCH_ENTER(et);
1491 
1492 	RIB_WLOCK(rnh);
1493 	rnh->rnh_walktree(&rnh->head, rt_checkdelroute, &di);
1494 	RIB_WUNLOCK(rnh);
1495 
1496 	/* We might have something to reclaim. */
1497 	bzero(&di.rc, sizeof(di.rc));
1498 	di.rc.rc_cmd = RTM_DELETE;
1499 	while (di.head != NULL) {
1500 		rt = di.head;
1501 		di.head = rt->rt_chain;
1502 		rt->rt_chain = NULL;
1503 		nh = rt->rt_nhop;
1504 
1505 		di.rc.rc_rt = rt;
1506 		di.rc.rc_nh_old = nh;
1507 		rib_notify(rnh, RIB_NOTIFY_DELAYED, &di.rc);
1508 
1509 		if (report) {
1510 #ifdef ROUTE_MPATH
1511 			struct nhgrp_object *nhg;
1512 			const struct weightened_nhop *wn;
1513 			uint32_t num_nhops;
1514 			if (NH_IS_NHGRP(nh)) {
1515 				nhg = (struct nhgrp_object *)nh;
1516 				wn = nhgrp_get_nhops(nhg, &num_nhops);
1517 				for (int i = 0; i < num_nhops; i++)
1518 					rt_routemsg(RTM_DELETE, rt, wn[i].nh, fibnum);
1519 			} else
1520 #endif
1521 			rt_routemsg(RTM_DELETE, rt, nh, fibnum);
1522 		}
1523 		rt_free(rt);
1524 	}
1525 
1526 	NET_EPOCH_EXIT(et);
1527 }
1528 
1529 static int
1530 rt_delete_unconditional(struct radix_node *rn, void *arg)
1531 {
1532 	struct rtentry *rt = RNTORT(rn);
1533 	struct rib_head *rnh = (struct rib_head *)arg;
1534 
1535 	rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), &rnh->head);
1536 	if (RNTORT(rn) == rt)
1537 		rt_free(rt);
1538 
1539 	return (0);
1540 }
1541 
1542 /*
1543  * Removes all routes from the routing table without executing notifications.
1544  * rtentres will be removed after the end of a current epoch.
1545  */
1546 static void
1547 rib_flush_routes(struct rib_head *rnh)
1548 {
1549 	RIB_WLOCK(rnh);
1550 	rnh->rnh_walktree(&rnh->head, rt_delete_unconditional, rnh);
1551 	RIB_WUNLOCK(rnh);
1552 }
1553 
1554 void
1555 rib_flush_routes_family(int family)
1556 {
1557 	struct rib_head *rnh;
1558 
1559 	for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) {
1560 		if ((rnh = rt_tables_get_rnh(fibnum, family)) != NULL)
1561 			rib_flush_routes(rnh);
1562 	}
1563 }
1564 
/*
 * Returns a static, human-readable name for the address family
 * @family: "inet", "inet6", "link", or "unknown" for anything else.
 */
const char *
rib_print_family(int family)
{
	const char *name;

	switch (family) {
	case AF_INET:
		name = "inet";
		break;
	case AF_INET6:
		name = "inet6";
		break;
	case AF_LINK:
		name = "link";
		break;
	default:
		name = "unknown";
		break;
	}

	return (name);
}
1578 
1579