xref: /freebsd/sys/net/route/route_ctl.c (revision 35c0a8c449fd2b7f75029ebed5e10852240f0865)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2020 Alexander V. Chernikov
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 #include "opt_inet.h"
30 #include "opt_inet6.h"
31 #include "opt_route.h"
32 
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/malloc.h>
36 #include <sys/mbuf.h>
37 #include <sys/socket.h>
38 #include <sys/sysctl.h>
39 #include <sys/syslog.h>
40 #include <sys/kernel.h>
41 #include <sys/lock.h>
42 #include <sys/rmlock.h>
43 
44 #include <net/if.h>
45 #include <net/if_var.h>
46 #include <net/if_private.h>
47 #include <net/if_dl.h>
48 #include <net/vnet.h>
49 #include <net/route.h>
50 #include <net/route/route_ctl.h>
51 #include <net/route/route_var.h>
52 #include <net/route/nhop_utils.h>
53 #include <net/route/nhop.h>
54 #include <net/route/nhop_var.h>
55 #include <netinet/in.h>
56 #include <netinet6/scope6_var.h>
57 #include <netinet6/in6_var.h>
58 
59 #define	DEBUG_MOD_NAME	route_ctl
60 #define	DEBUG_MAX_LEVEL	LOG_DEBUG
61 #include <net/route/route_debug.h>
62 _DECLARE_DEBUG(LOG_INFO);
63 
/*
 * This file contains the control plane functions for the routing tables.
 *
 * All functions assume they are called within the net epoch.
 */
69 
70 union sockaddr_union {
71 	struct sockaddr		sa;
72 	struct sockaddr_in	sin;
73 	struct sockaddr_in6	sin6;
74 	char			_buf[32];
75 };
76 
77 static int add_route_byinfo(struct rib_head *rnh, struct rt_addrinfo *info,
78     struct rib_cmd_info *rc);
79 static int change_route_byinfo(struct rib_head *rnh, struct rtentry *rt,
80     struct rt_addrinfo *info, struct route_nhop_data *nhd_orig,
81     struct rib_cmd_info *rc);
82 
83 static int add_route_flags(struct rib_head *rnh, struct rtentry *rt,
84     struct route_nhop_data *rnd_add, int op_flags, struct rib_cmd_info *rc);
85 #ifdef ROUTE_MPATH
86 static int add_route_flags_mpath(struct rib_head *rnh, struct rtentry *rt,
87     struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_orig,
88     int op_flags, struct rib_cmd_info *rc);
89 #endif
90 
91 static int add_route(struct rib_head *rnh, struct rtentry *rt,
92     struct route_nhop_data *rnd, struct rib_cmd_info *rc);
93 static int delete_route(struct rib_head *rnh, struct rtentry *rt,
94     struct rib_cmd_info *rc);
95 static int rt_delete_conditional(struct rib_head *rnh, struct rtentry *rt,
96     int prio, rib_filter_f_t *cb, void *cbdata, struct rib_cmd_info *rc);
97 
98 static bool fill_pxmask_family(int family, int plen, struct sockaddr *_dst,
99     struct sockaddr **pmask);
100 static int get_prio_from_info(const struct rt_addrinfo *info);
101 static int nhop_get_prio(const struct nhop_object *nh);
102 
103 #ifdef ROUTE_MPATH
104 static bool rib_can_multipath(struct rib_head *rh);
105 #endif
106 
107 /* Per-vnet multipath routing configuration */
108 SYSCTL_DECL(_net_route);
109 #define	V_rib_route_multipath	VNET(rib_route_multipath)
110 #ifdef ROUTE_MPATH
111 #define _MP_FLAGS	CTLFLAG_RW
112 #else
113 #define _MP_FLAGS	CTLFLAG_RD
114 #endif
115 VNET_DEFINE(u_int, rib_route_multipath) = 1;
116 SYSCTL_UINT(_net_route, OID_AUTO, multipath, _MP_FLAGS | CTLFLAG_VNET,
117     &VNET_NAME(rib_route_multipath), 0, "Enable route multipath");
118 #undef _MP_FLAGS
119 
120 #ifdef ROUTE_MPATH
121 VNET_DEFINE(u_int, fib_hash_outbound) = 0;
122 SYSCTL_UINT(_net_route, OID_AUTO, hash_outbound, CTLFLAG_RD | CTLFLAG_VNET,
123     &VNET_NAME(fib_hash_outbound), 0,
124     "Compute flowid for locally-originated packets");
125 
126 /* Default entropy to add to the hash calculation for the outbound connections*/
127 uint8_t mpath_entropy_key[MPATH_ENTROPY_KEY_LEN] = {
128 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
129 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
130 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
131 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
132 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
133 };
134 #endif
135 
136 #if defined(INET) && defined(INET6)
137 FEATURE(ipv4_rfc5549_support, "Route IPv4 packets via IPv6 nexthops");
138 #define V_rib_route_ipv6_nexthop VNET(rib_route_ipv6_nexthop)
139 VNET_DEFINE_STATIC(u_int, rib_route_ipv6_nexthop) = 1;
140 SYSCTL_UINT(_net_route, OID_AUTO, ipv6_nexthop, CTLFLAG_RW | CTLFLAG_VNET,
141     &VNET_NAME(rib_route_ipv6_nexthop), 0, "Enable IPv4 route via IPv6 Next Hop address");
142 #endif
143 
144 /* Debug bits */
145 SYSCTL_NODE(_net_route, OID_AUTO, debug, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
146 
147 static struct rib_head *
148 get_rnh(uint32_t fibnum, const struct rt_addrinfo *info)
149 {
150 	struct rib_head *rnh;
151 	struct sockaddr *dst;
152 
153 	KASSERT((fibnum < rt_numfibs), ("rib_add_route: bad fibnum"));
154 
155 	dst = info->rti_info[RTAX_DST];
156 	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
157 
158 	return (rnh);
159 }
160 
#if defined(INET) && defined(INET6)
/*
 * Reports whether the current vnet permits IPv4 routes with IPv6
 * nexthops (rib_route_ipv6_nexthop is non-zero).
 */
bool
rib_can_4o6_nhop(void)
{
	return (V_rib_route_ipv6_nexthop != 0);
}
#endif
168 
169 #ifdef ROUTE_MPATH
170 static bool
171 rib_can_multipath(struct rib_head *rh)
172 {
173 	int result;
174 
175 	CURVNET_SET(rh->rib_vnet);
176 	result = !!V_rib_route_multipath;
177 	CURVNET_RESTORE();
178 
179 	return (result);
180 }
181 
182 /*
183  * Check is nhop is multipath-eligible.
184  * Avoid nhops without gateways and redirects.
185  *
186  * Returns 1 for multipath-eligible nexthop,
187  * 0 otherwise.
188  */
189 bool
190 nhop_can_multipath(const struct nhop_object *nh)
191 {
192 
193 	if ((nh->nh_flags & NHF_MULTIPATH) != 0)
194 		return (1);
195 	if ((nh->nh_flags & NHF_GATEWAY) == 0)
196 		return (0);
197 	if ((nh->nh_flags & NHF_REDIRECT) != 0)
198 		return (0);
199 
200 	return (1);
201 }
202 #endif
203 
204 static int
205 get_info_weight(const struct rt_addrinfo *info, uint32_t default_weight)
206 {
207 	uint32_t weight;
208 
209 	if (info->rti_mflags & RTV_WEIGHT)
210 		weight = info->rti_rmx->rmx_weight;
211 	else
212 		weight = default_weight;
213 	/* Keep upper 1 byte for adm distance purposes */
214 	if (weight > RT_MAX_WEIGHT)
215 		weight = RT_MAX_WEIGHT;
216 	else if (weight == 0)
217 		weight = default_weight;
218 
219 	return (weight);
220 }
221 
222 /*
223  * File-local concept for distingushing between the normal and
224  * RTF_PINNED routes tha can override the "normal" one.
225  */
226 #define	NH_PRIORITY_HIGH	2
227 #define	NH_PRIORITY_NORMAL	1
228 static int
229 get_prio_from_info(const struct rt_addrinfo *info)
230 {
231 	if (info->rti_flags & RTF_PINNED)
232 		return (NH_PRIORITY_HIGH);
233 	return (NH_PRIORITY_NORMAL);
234 }
235 
236 static int
237 nhop_get_prio(const struct nhop_object *nh)
238 {
239 	if (NH_IS_PINNED(nh))
240 		return (NH_PRIORITY_HIGH);
241 	return (NH_PRIORITY_NORMAL);
242 }
243 
244 /*
245  * Check if specified @gw matches gw data in the nexthop @nh.
246  *
247  * Returns true if matches, false otherwise.
248  */
249 bool
250 match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw)
251 {
252 
253 	if (nh->gw_sa.sa_family != gw->sa_family)
254 		return (false);
255 
256 	switch (gw->sa_family) {
257 	case AF_INET:
258 		return (nh->gw4_sa.sin_addr.s_addr ==
259 		    ((const struct sockaddr_in *)gw)->sin_addr.s_addr);
260 	case AF_INET6:
261 		{
262 			const struct sockaddr_in6 *gw6;
263 			gw6 = (const struct sockaddr_in6 *)gw;
264 
265 			/*
266 			 * Currently (2020-09) IPv6 gws in kernel have their
267 			 * scope embedded. Once this becomes false, this code
268 			 * has to be revisited.
269 			 */
270 			if (IN6_ARE_ADDR_EQUAL(&nh->gw6_sa.sin6_addr,
271 			    &gw6->sin6_addr))
272 				return (true);
273 			return (false);
274 		}
275 	case AF_LINK:
276 		{
277 			const struct sockaddr_dl *sdl;
278 			sdl = (const struct sockaddr_dl *)gw;
279 			return (nh->gwl_sa.sdl_index == sdl->sdl_index);
280 		}
281 	default:
282 		return (memcmp(&nh->gw_sa, gw, nh->gw_sa.sa_len) == 0);
283 	}
284 
285 	/* NOTREACHED */
286 	return (false);
287 }
288 
/*
 * rib_filter_f-compatible callback matching every nexthop whose gateway
 * equals the sockaddr passed via @gw_sa.  @rt is not examined.
 *
 * Returns non-zero on match, 0 otherwise.
 */
int
rib_match_gw(const struct rtentry *rt, const struct nhop_object *nh, void *gw_sa)
{
	return (match_nhop_gw(nh, (const struct sockaddr *)gw_sa));
}
300 
/* State for match_gw_one(): gateway to look for and number of hits so far. */
struct gw_filter_data {
	const struct sockaddr *gw;
	int count;
};

/*
 * Filter callback accepting only the first nexthop whose gateway matches
 * the one stored in the struct gw_filter_data passed via @_data.
 * Every gateway match bumps the hit counter; @rt is not examined.
 *
 * Returns 1 for the first match, 0 otherwise.
 */
static int
match_gw_one(const struct rtentry *rt, const struct nhop_object *nh, void *_data)
{
	struct gw_filter_data *filter = (struct gw_filter_data *)_data;

	if (!match_nhop_gw(nh, filter->gw))
		return (0);

	/* Return only first match to make rtsock happy */
	return (filter->count++ == 0 ? 1 : 0);
}
319 
320 /*
321  * Checks if data in @info matches nexhop @nh.
322  *
323  * Returns 0 on success,
324  * ESRCH if not matched,
325  * ENOENT if filter function returned false
326  */
327 int
328 check_info_match_nhop(const struct rt_addrinfo *info, const struct rtentry *rt,
329     const struct nhop_object *nh)
330 {
331 	const struct sockaddr *gw = info->rti_info[RTAX_GATEWAY];
332 
333 	if (info->rti_filter != NULL) {
334 	    if (info->rti_filter(rt, nh, info->rti_filterdata) == 0)
335 		    return (ENOENT);
336 	    else
337 		    return (0);
338 	}
339 	if ((gw != NULL) && !match_nhop_gw(nh, gw))
340 		return (ESRCH);
341 
342 	return (0);
343 }
344 
345 /*
346  * Runs exact prefix match based on @dst and @netmask.
347  * Returns matched @rtentry if found or NULL.
348  * If rtentry was found, saves nexthop / weight value into @rnd.
349  */
350 static struct rtentry *
351 lookup_prefix_bysa(struct rib_head *rnh, const struct sockaddr *dst,
352     const struct sockaddr *netmask, struct route_nhop_data *rnd)
353 {
354 	struct rtentry *rt;
355 
356 	RIB_LOCK_ASSERT(rnh);
357 
358 	rt = (struct rtentry *)rnh->rnh_lookup(dst, netmask, &rnh->head);
359 	if (rt != NULL) {
360 		rnd->rnd_nhop = rt->rt_nhop;
361 		rnd->rnd_weight = rt->rt_weight;
362 	} else {
363 		rnd->rnd_nhop = NULL;
364 		rnd->rnd_weight = 0;
365 	}
366 
367 	return (rt);
368 }
369 
/*
 * Performs an exact prefix lookup in @rnh using the key and mask of the
 * existing route @rt.  Result and @rnd handling are those of
 * lookup_prefix_bysa().
 */
struct rtentry *
lookup_prefix_rt(struct rib_head *rnh, const struct rtentry *rt,
    struct route_nhop_data *rnd)
{
	const struct sockaddr *key = rt_key_const(rt);
	const struct sockaddr *mask = rt_mask_const(rt);

	return (lookup_prefix_bysa(rnh, key, mask, rnd));
}
376 
377 /*
378  * Runs exact prefix match based on dst/netmask from @info.
379  * Assumes RIB lock is held.
380  * Returns matched @rtentry if found or NULL.
381  * If rtentry was found, saves nexthop / weight value into @rnd.
382  */
383 struct rtentry *
384 lookup_prefix(struct rib_head *rnh, const struct rt_addrinfo *info,
385     struct route_nhop_data *rnd)
386 {
387 	struct rtentry *rt;
388 
389 	rt = lookup_prefix_bysa(rnh, info->rti_info[RTAX_DST],
390 	    info->rti_info[RTAX_NETMASK], rnd);
391 
392 	return (rt);
393 }
394 
395 const struct rtentry *
396 rib_lookup_prefix_plen(struct rib_head *rnh, struct sockaddr *dst, int plen,
397     struct route_nhop_data *rnd)
398 {
399 	union sockaddr_union mask_storage;
400 	struct sockaddr *netmask = &mask_storage.sa;
401 
402 	if (fill_pxmask_family(dst->sa_family, plen, dst, &netmask))
403 		return (lookup_prefix_bysa(rnh, dst, netmask, rnd));
404 	return (NULL);
405 }
406 
407 static bool
408 fill_pxmask_family(int family, int plen, struct sockaddr *_dst,
409     struct sockaddr **pmask)
410 {
411 	if (plen == -1) {
412 		*pmask = NULL;
413 		return (true);
414 	}
415 
416 	switch (family) {
417 #ifdef INET
418 	case AF_INET:
419 		{
420 			struct sockaddr_in *mask = (struct sockaddr_in *)(*pmask);
421 			struct sockaddr_in *dst= (struct sockaddr_in *)_dst;
422 
423 			memset(mask, 0, sizeof(*mask));
424 			mask->sin_family = family;
425 			mask->sin_len = sizeof(*mask);
426 			if (plen == 32)
427 				*pmask = NULL;
428 			else if (plen > 32 || plen < 0)
429 				return (false);
430 			else {
431 				uint32_t daddr, maddr;
432 				maddr = htonl(plen ? ~((1 << (32 - plen)) - 1) : 0);
433 				mask->sin_addr.s_addr = maddr;
434 				daddr = dst->sin_addr.s_addr;
435 				daddr = htonl(ntohl(daddr) & ntohl(maddr));
436 				dst->sin_addr.s_addr = daddr;
437 			}
438 			return (true);
439 		}
440 		break;
441 #endif
442 #ifdef INET6
443 	case AF_INET6:
444 		{
445 			struct sockaddr_in6 *mask = (struct sockaddr_in6 *)(*pmask);
446 			struct sockaddr_in6 *dst = (struct sockaddr_in6 *)_dst;
447 
448 			memset(mask, 0, sizeof(*mask));
449 			mask->sin6_family = family;
450 			mask->sin6_len = sizeof(*mask);
451 			if (plen == 128)
452 				*pmask = NULL;
453 			else if (plen > 128 || plen < 0)
454 				return (false);
455 			else {
456 				ip6_writemask(&mask->sin6_addr, plen);
457 				IN6_MASK_ADDR(&dst->sin6_addr, &mask->sin6_addr);
458 			}
459 			return (true);
460 		}
461 		break;
462 #endif
463 	}
464 	return (false);
465 }
466 
467 /*
468  * Attempts to add @dst/plen prefix with nexthop/nexhopgroup data @rnd
469  * to the routing table.
470  *
471  * @fibnum: verified kernel rtable id to insert route to
472  * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
473  * @plen: prefix length (or -1 if host route or not applicable for AF)
474  * @op_flags: combination of RTM_F_ flags
475  * @rc: storage to report operation result
476  *
477  * Returns 0 on success.
478  */
479 int
480 rib_add_route_px(uint32_t fibnum, struct sockaddr *dst, int plen,
481     struct route_nhop_data *rnd, int op_flags, struct rib_cmd_info *rc)
482 {
483 	union sockaddr_union mask_storage;
484 	struct sockaddr *netmask = &mask_storage.sa;
485 	struct rtentry *rt = NULL;
486 
487 	NET_EPOCH_ASSERT();
488 
489 	bzero(rc, sizeof(struct rib_cmd_info));
490 	rc->rc_cmd = RTM_ADD;
491 
492 	struct rib_head *rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
493 	if (rnh == NULL)
494 		return (EAFNOSUPPORT);
495 
496 	if (!fill_pxmask_family(dst->sa_family, plen, dst, &netmask)) {
497 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: invalid plen %d", plen);
498 		return (EINVAL);
499 	}
500 
501 	if (op_flags & RTM_F_CREATE) {
502 		if ((rt = rt_alloc(rnh, dst, netmask)) == NULL) {
503 			FIB_RH_LOG(LOG_INFO, rnh, "rtentry allocation failed");
504 			return (ENOMEM);
505 		}
506 	} else {
507 		struct route_nhop_data rnd_tmp;
508 		RIB_RLOCK_TRACKER;
509 
510 		RIB_RLOCK(rnh);
511 		rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd_tmp);
512 		RIB_RUNLOCK(rnh);
513 
514 		if (rt == NULL)
515 			return (ESRCH);
516 	}
517 
518 	return (add_route_flags(rnh, rt, rnd, op_flags, rc));
519 }
520 
521 /*
522  * Attempts to delete @dst/plen prefix matching gateway @gw from the
523  *  routing rable.
524  *
525  * @fibnum: rtable id to remove route from
526  * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
527  * @plen: prefix length (or -1 if host route or not applicable for AF)
528  * @gw: gateway to match
529  * @op_flags: combination of RTM_F_ flags
530  * @rc: storage to report operation result
531  *
532  * Returns 0 on success.
533  */
534 int
535 rib_del_route_px_gw(uint32_t fibnum, struct sockaddr *dst, int plen,
536     const struct sockaddr *gw, int op_flags, struct rib_cmd_info *rc)
537 {
538 	struct gw_filter_data gwd = { .gw = gw };
539 
540 	return (rib_del_route_px(fibnum, dst, plen, match_gw_one, &gwd, op_flags, rc));
541 }
542 
543 /*
544  * Attempts to delete @dst/plen prefix matching @filter_func from the
545  *  routing rable.
546  *
547  * @fibnum: rtable id to remove route from
548  * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
549  * @plen: prefix length (or -1 if host route or not applicable for AF)
550  * @filter_func: func to be called for each nexthop of the prefix for matching
551  * @filter_arg: argument to pass to @filter_func
552  * @op_flags: combination of RTM_F_ flags
553  * @rc: storage to report operation result
554  *
555  * Returns 0 on success.
556  */
557 int
558 rib_del_route_px(uint32_t fibnum, struct sockaddr *dst, int plen,
559     rib_filter_f_t *filter_func, void *filter_arg, int op_flags,
560     struct rib_cmd_info *rc)
561 {
562 	union sockaddr_union mask_storage;
563 	struct sockaddr *netmask = &mask_storage.sa;
564 	int error;
565 
566 	NET_EPOCH_ASSERT();
567 
568 	bzero(rc, sizeof(struct rib_cmd_info));
569 	rc->rc_cmd = RTM_DELETE;
570 
571 	struct rib_head *rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
572 	if (rnh == NULL)
573 		return (EAFNOSUPPORT);
574 
575 	if (dst->sa_len > sizeof(mask_storage)) {
576 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too big: %d", dst->sa_len);
577 		return (EINVAL);
578 	}
579 
580 	if (!fill_pxmask_family(dst->sa_family, plen, dst, &netmask)) {
581 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: invalid plen %d", plen);
582 		return (EINVAL);
583 	}
584 
585 	int prio = (op_flags & RTM_F_FORCE) ? NH_PRIORITY_HIGH : NH_PRIORITY_NORMAL;
586 
587 	RIB_WLOCK(rnh);
588 	struct route_nhop_data rnd;
589 	struct rtentry *rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd);
590 	if (rt != NULL) {
591 		error = rt_delete_conditional(rnh, rt, prio, filter_func,
592 		    filter_arg, rc);
593 	} else
594 		error = ESRCH;
595 	RIB_WUNLOCK(rnh);
596 
597 	if (error != 0)
598 		return (error);
599 
600 	rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
601 
602 	if (rc->rc_cmd == RTM_DELETE)
603 		rt_free(rc->rc_rt);
604 #ifdef ROUTE_MPATH
605 	else {
606 		/*
607 		 * Deleting 1 path may result in RTM_CHANGE to
608 		 * a different mpath group/nhop.
609 		 * Free old mpath group.
610 		 */
611 		nhop_free_any(rc->rc_nh_old);
612 	}
613 #endif
614 
615 	return (0);
616 }
617 
618 /*
619  * Tries to copy route @rt from one rtable to the rtable specified by @dst_rh.
620  * @rt: route to copy.
621  * @rnd_src: nhop and weight. Multipath routes are not supported
622  * @rh_dst: target rtable.
623  * @rc: operation result storage
624  *
625  * Return 0 on success.
626  */
627 int
628 rib_copy_route(struct rtentry *rt, const struct route_nhop_data *rnd_src,
629     struct rib_head *rh_dst, struct rib_cmd_info *rc)
630 {
631 	struct nhop_object __diagused *nh_src = rnd_src->rnd_nhop;
632 	int error;
633 
634 	MPASS((nh_src->nh_flags & NHF_MULTIPATH) == 0);
635 
636 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
637 		char nhbuf[NHOP_PRINT_BUFSIZE], rtbuf[NHOP_PRINT_BUFSIZE];
638 		nhop_print_buf_any(nh_src, nhbuf, sizeof(nhbuf));
639 		rt_print_buf(rt, rtbuf, sizeof(rtbuf));
640 		FIB_RH_LOG(LOG_DEBUG2, rh_dst, "copying %s -> %s from fib %u",
641 		    rtbuf, nhbuf, nhop_get_fibnum(nh_src));
642 	}
643 	struct nhop_object *nh = nhop_alloc(rh_dst->rib_fibnum, rh_dst->rib_family);
644 	if (nh == NULL) {
645 		FIB_RH_LOG(LOG_INFO, rh_dst, "unable to allocate new nexthop");
646 		return (ENOMEM);
647 	}
648 	nhop_copy(nh, rnd_src->rnd_nhop);
649 	nhop_set_origin(nh, nhop_get_origin(rnd_src->rnd_nhop));
650 	nhop_set_fibnum(nh, rh_dst->rib_fibnum);
651 	nh = nhop_get_nhop_internal(rh_dst, nh, &error);
652 	if (error != 0) {
653 		FIB_RH_LOG(LOG_INFO, rh_dst,
654 		    "unable to finalize new nexthop: error %d", error);
655 		return (ENOMEM);
656 	}
657 
658 	struct rtentry *rt_new = rt_alloc(rh_dst, rt_key(rt), rt_mask(rt));
659 	if (rt_new == NULL) {
660 		FIB_RH_LOG(LOG_INFO, rh_dst, "unable to create new rtentry");
661 		nhop_free(nh);
662 		return (ENOMEM);
663 	}
664 
665 	struct route_nhop_data rnd = {
666 		.rnd_nhop = nh,
667 		.rnd_weight = rnd_src->rnd_weight
668 	};
669 	int op_flags = RTM_F_CREATE | (NH_IS_PINNED(nh) ? RTM_F_FORCE : 0);
670 	error = add_route_flags(rh_dst, rt_new, &rnd, op_flags, rc);
671 
672 	if (error != 0) {
673 		IF_DEBUG_LEVEL(LOG_DEBUG2) {
674 			char buf[NHOP_PRINT_BUFSIZE];
675 			rt_print_buf(rt_new, buf, sizeof(buf));
676 			FIB_RH_LOG(LOG_DEBUG, rh_dst,
677 			    "Unable to add route %s: error %d", buf, error);
678 		}
679 		nhop_free(nh);
680 		rt_free_immediate(rt_new);
681 	}
682 	return (error);
683 }
684 
685 /*
686  * Adds route defined by @info into the kernel table specified by @fibnum and
687  * sa_family in @info->rti_info[RTAX_DST].
688  *
689  * Returns 0 on success and fills in operation metadata into @rc.
690  */
691 int
692 rib_add_route(uint32_t fibnum, struct rt_addrinfo *info,
693     struct rib_cmd_info *rc)
694 {
695 	struct rib_head *rnh;
696 	int error;
697 
698 	NET_EPOCH_ASSERT();
699 
700 	rnh = get_rnh(fibnum, info);
701 	if (rnh == NULL)
702 		return (EAFNOSUPPORT);
703 
704 	/*
705 	 * Check consistency between RTF_HOST flag and netmask
706 	 * existence.
707 	 */
708 	if (info->rti_flags & RTF_HOST)
709 		info->rti_info[RTAX_NETMASK] = NULL;
710 	else if (info->rti_info[RTAX_NETMASK] == NULL) {
711 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: no RTF_HOST and empty netmask");
712 		return (EINVAL);
713 	}
714 
715 	bzero(rc, sizeof(struct rib_cmd_info));
716 	rc->rc_cmd = RTM_ADD;
717 
718 	error = add_route_byinfo(rnh, info, rc);
719 	if (error == 0)
720 		rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
721 
722 	return (error);
723 }
724 
725 static int
726 add_route_byinfo(struct rib_head *rnh, struct rt_addrinfo *info,
727     struct rib_cmd_info *rc)
728 {
729 	struct route_nhop_data rnd_add;
730 	struct nhop_object *nh;
731 	struct rtentry *rt;
732 	struct sockaddr *dst, *gateway, *netmask;
733 	int error;
734 
735 	dst = info->rti_info[RTAX_DST];
736 	gateway = info->rti_info[RTAX_GATEWAY];
737 	netmask = info->rti_info[RTAX_NETMASK];
738 
739 	if ((info->rti_flags & RTF_GATEWAY) && !gateway) {
740 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: RTF_GATEWAY set with empty gw");
741 		return (EINVAL);
742 	}
743 	if (dst && gateway && !nhop_check_gateway(dst->sa_family, gateway->sa_family)) {
744 		FIB_RH_LOG(LOG_DEBUG, rnh,
745 		    "error: invalid dst/gateway family combination (%d, %d)",
746 		    dst->sa_family, gateway->sa_family);
747 		return (EINVAL);
748 	}
749 
750 	if (dst->sa_len > sizeof(((struct rtentry *)NULL)->rt_dstb)) {
751 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too large: %d",
752 		    dst->sa_len);
753 		return (EINVAL);
754 	}
755 
756 	if (info->rti_ifa == NULL) {
757 		error = rt_getifa_fib(info, rnh->rib_fibnum);
758 		if (error)
759 			return (error);
760 	}
761 
762 	if ((rt = rt_alloc(rnh, dst, netmask)) == NULL)
763 		return (ENOBUFS);
764 
765 	error = nhop_create_from_info(rnh, info, &nh);
766 	if (error != 0) {
767 		rt_free_immediate(rt);
768 		return (error);
769 	}
770 
771 	rnd_add.rnd_nhop = nh;
772 	rnd_add.rnd_weight = get_info_weight(info, RT_DEFAULT_WEIGHT);
773 
774 	int op_flags = RTM_F_CREATE;
775 
776 	/*
777 	 * Set the desired action when the route already exists:
778 	 * If RTF_PINNED is present, assume the direct kernel routes that cannot be multipath.
779 	 * Otherwise, append the path.
780 	 */
781 	op_flags |= (info->rti_flags & RTF_PINNED) ? RTM_F_REPLACE : RTM_F_APPEND;
782 
783 	return (add_route_flags(rnh, rt, &rnd_add, op_flags, rc));
784 }
785 
786 static int
787 add_route_flags(struct rib_head *rnh, struct rtentry *rt, struct route_nhop_data *rnd_add,
788     int op_flags, struct rib_cmd_info *rc)
789 {
790 	struct route_nhop_data rnd_orig;
791 	struct nhop_object *nh;
792 	struct rtentry *rt_orig;
793 	int error = 0;
794 
795 	MPASS(rt != NULL);
796 
797 	nh = rnd_add->rnd_nhop;
798 
799 	RIB_WLOCK(rnh);
800 
801 	rt_orig = lookup_prefix_rt(rnh, rt, &rnd_orig);
802 
803 	if (rt_orig == NULL) {
804 		if (op_flags & RTM_F_CREATE)
805 			error = add_route(rnh, rt, rnd_add, rc);
806 		else
807 			error = ESRCH; /* no entry but creation was not required */
808 		RIB_WUNLOCK(rnh);
809 		if (error != 0)
810 			goto out;
811 		return (0);
812 	}
813 
814 	if (op_flags & RTM_F_EXCL) {
815 		/* We have existing route in the RIB but not allowed to replace. */
816 		RIB_WUNLOCK(rnh);
817 		error = EEXIST;
818 		goto out;
819 	}
820 
821 	/* Now either append or replace */
822 	if (op_flags & RTM_F_REPLACE) {
823 		if (nhop_get_prio(rnd_orig.rnd_nhop) > nhop_get_prio(rnd_add->rnd_nhop)) {
824 			/* Old path is "better" (e.g. has PINNED flag set) */
825 			RIB_WUNLOCK(rnh);
826 			error = EEXIST;
827 			goto out;
828 		}
829 		change_route(rnh, rt_orig, rnd_add, rc);
830 		RIB_WUNLOCK(rnh);
831 		nh = rc->rc_nh_old;
832 		goto out;
833 	}
834 
835 	RIB_WUNLOCK(rnh);
836 
837 #ifdef ROUTE_MPATH
838 	if ((op_flags & RTM_F_APPEND) && rib_can_multipath(rnh) &&
839 	    nhop_can_multipath(rnd_add->rnd_nhop) &&
840 	    nhop_can_multipath(rnd_orig.rnd_nhop)) {
841 
842 		for (int i = 0; i < RIB_MAX_RETRIES; i++) {
843 			error = add_route_flags_mpath(rnh, rt_orig, rnd_add, &rnd_orig,
844 			    op_flags, rc);
845 			if (error != EAGAIN)
846 				break;
847 			RTSTAT_INC(rts_add_retry);
848 		}
849 
850 		/*
851 		 *  Original nhop reference is unused in any case.
852 		 */
853 		nhop_free_any(rnd_add->rnd_nhop);
854 		if (op_flags & RTM_F_CREATE) {
855 			if (error != 0 || rc->rc_cmd != RTM_ADD)
856 				rt_free_immediate(rt);
857 		}
858 		return (error);
859 	}
860 #endif
861 	/* Out of options - free state and return error */
862 	error = EEXIST;
863 out:
864 	if (op_flags & RTM_F_CREATE)
865 		rt_free_immediate(rt);
866 	nhop_free_any(nh);
867 
868 	return (error);
869 }
870 
871 #ifdef ROUTE_MPATH
872 static int
873 add_route_flags_mpath(struct rib_head *rnh, struct rtentry *rt,
874     struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_orig,
875     int op_flags, struct rib_cmd_info *rc)
876 {
877 	RIB_RLOCK_TRACKER;
878 	struct route_nhop_data rnd_new;
879 	int error = 0;
880 
881 	error = nhgrp_get_addition_group(rnh, rnd_orig, rnd_add, &rnd_new);
882 	if (error != 0) {
883 		if (error == EAGAIN) {
884 			/*
885 			 * Group creation failed, most probably because
886 			 * @rnd_orig data got scheduled for deletion.
887 			 * Refresh @rnd_orig data and retry.
888 			 */
889 			RIB_RLOCK(rnh);
890 			lookup_prefix_rt(rnh, rt, rnd_orig);
891 			RIB_RUNLOCK(rnh);
892 			if (rnd_orig == NULL && !(op_flags & RTM_F_CREATE)) {
893 				/* In this iteration route doesn't exist */
894 				error = ENOENT;
895 			}
896 		}
897 		return (error);
898 	}
899 	error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
900 	if (error != 0)
901 		return (error);
902 
903 	if (V_fib_hash_outbound == 0 && NH_IS_NHGRP(rc->rc_nh_new)) {
904 		/*
905 		 * First multipath route got installed. Enable local
906 		 * outbound connections hashing.
907 		 */
908 		if (bootverbose)
909 			printf("FIB: enabled flowid calculation for locally-originated packets\n");
910 		V_fib_hash_outbound = 1;
911 	}
912 
913 	return (0);
914 }
915 #endif
916 
917 /*
918  * Removes route defined by @info from the kernel table specified by @fibnum and
919  * sa_family in @info->rti_info[RTAX_DST].
920  *
921  * Returns 0 on success and fills in operation metadata into @rc.
922  */
923 int
924 rib_del_route(uint32_t fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc)
925 {
926 	struct rib_head *rnh;
927 	struct sockaddr *dst, *netmask;
928 	struct sockaddr_storage mdst;
929 	int error;
930 
931 	NET_EPOCH_ASSERT();
932 
933 	rnh = get_rnh(fibnum, info);
934 	if (rnh == NULL)
935 		return (EAFNOSUPPORT);
936 
937 	bzero(rc, sizeof(struct rib_cmd_info));
938 	rc->rc_cmd = RTM_DELETE;
939 
940 	dst = info->rti_info[RTAX_DST];
941 	netmask = info->rti_info[RTAX_NETMASK];
942 
943 	if (netmask != NULL) {
944 		/* Ensure @dst is always properly masked */
945 		if (dst->sa_len > sizeof(mdst)) {
946 			FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too large");
947 			return (EINVAL);
948 		}
949 		rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask);
950 		dst = (struct sockaddr *)&mdst;
951 	}
952 
953 	rib_filter_f_t *filter_func = NULL;
954 	void *filter_arg = NULL;
955 	struct gw_filter_data gwd = { .gw = info->rti_info[RTAX_GATEWAY] };
956 
957 	if (info->rti_filter != NULL) {
958 		filter_func = info->rti_filter;
959 		filter_arg = info->rti_filterdata;
960 	} else if (gwd.gw != NULL) {
961 		filter_func = match_gw_one;
962 		filter_arg = &gwd;
963 	}
964 
965 	int prio = get_prio_from_info(info);
966 
967 	RIB_WLOCK(rnh);
968 	struct route_nhop_data rnd;
969 	struct rtentry *rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd);
970 	if (rt != NULL) {
971 		error = rt_delete_conditional(rnh, rt, prio, filter_func,
972 		    filter_arg, rc);
973 	} else
974 		error = ESRCH;
975 	RIB_WUNLOCK(rnh);
976 
977 	if (error != 0)
978 		return (error);
979 
980 	rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
981 
982 	if (rc->rc_cmd == RTM_DELETE)
983 		rt_free(rc->rc_rt);
984 #ifdef ROUTE_MPATH
985 	else {
986 		/*
987 		 * Deleting 1 path may result in RTM_CHANGE to
988 		 * a different mpath group/nhop.
989 		 * Free old mpath group.
990 		 */
991 		nhop_free_any(rc->rc_nh_old);
992 	}
993 #endif
994 
995 	return (0);
996 }
997 
998 /*
999  * Conditionally unlinks rtentry paths from @rnh matching @cb.
1000  * Returns 0 on success with operation result stored in @rc.
1001  * On error, returns:
1002  * ESRCH - if prefix was not found or filter function failed to match
1003  * EADDRINUSE - if trying to delete higher priority route.
1004  */
1005 static int
1006 rt_delete_conditional(struct rib_head *rnh, struct rtentry *rt,
1007     int prio, rib_filter_f_t *cb, void *cbdata, struct rib_cmd_info *rc)
1008 {
1009 	struct nhop_object *nh = rt->rt_nhop;
1010 
1011 #ifdef ROUTE_MPATH
1012 	if (NH_IS_NHGRP(nh)) {
1013 		struct nhgrp_object *nhg = (struct nhgrp_object *)nh;
1014 		struct route_nhop_data rnd;
1015 		int error;
1016 
1017 		if (cb == NULL)
1018 			return (ESRCH);
1019 		error = nhgrp_get_filtered_group(rnh, rt, nhg, cb, cbdata, &rnd);
1020 		if (error == 0) {
1021 			if (rnd.rnd_nhgrp == nhg) {
1022 				/* No match, unreference new group and return. */
1023 				nhop_free_any(rnd.rnd_nhop);
1024 				return (ESRCH);
1025 			}
1026 			error = change_route(rnh, rt, &rnd, rc);
1027 		}
1028 		return (error);
1029 	}
1030 #endif
1031 	if (cb != NULL && !cb(rt, nh, cbdata))
1032 		return (ESRCH);
1033 
1034 	if (prio < nhop_get_prio(nh))
1035 		return (EADDRINUSE);
1036 
1037 	return (delete_route(rnh, rt, rc));
1038 }
1039 
1040 int
1041 rib_change_route(uint32_t fibnum, struct rt_addrinfo *info,
1042     struct rib_cmd_info *rc)
1043 {
1044 	RIB_RLOCK_TRACKER;
1045 	struct route_nhop_data rnd_orig;
1046 	struct rib_head *rnh;
1047 	struct rtentry *rt;
1048 	int error;
1049 
1050 	NET_EPOCH_ASSERT();
1051 
1052 	rnh = get_rnh(fibnum, info);
1053 	if (rnh == NULL)
1054 		return (EAFNOSUPPORT);
1055 
1056 	bzero(rc, sizeof(struct rib_cmd_info));
1057 	rc->rc_cmd = RTM_CHANGE;
1058 
1059 	/* Check if updated gateway exists */
1060 	if ((info->rti_flags & RTF_GATEWAY) &&
1061 	    (info->rti_info[RTAX_GATEWAY] == NULL)) {
1062 
1063 		/*
1064 		 * route(8) adds RTF_GATEWAY flag if -interface is not set.
1065 		 * Remove RTF_GATEWAY to enforce consistency and maintain
1066 		 * compatibility..
1067 		 */
1068 		info->rti_flags &= ~RTF_GATEWAY;
1069 	}
1070 
1071 	/*
1072 	 * route change is done in multiple steps, with dropping and
1073 	 * reacquiring lock. In the situations with multiple processes
1074 	 * changes the same route in can lead to the case when route
1075 	 * is changed between the steps. Address it by retrying the operation
1076 	 * multiple times before failing.
1077 	 */
1078 
1079 	RIB_RLOCK(rnh);
1080 	rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST],
1081 	    info->rti_info[RTAX_NETMASK], &rnh->head);
1082 
1083 	if (rt == NULL) {
1084 		RIB_RUNLOCK(rnh);
1085 		return (ESRCH);
1086 	}
1087 
1088 	rnd_orig.rnd_nhop = rt->rt_nhop;
1089 	rnd_orig.rnd_weight = rt->rt_weight;
1090 
1091 	RIB_RUNLOCK(rnh);
1092 
1093 	for (int i = 0; i < RIB_MAX_RETRIES; i++) {
1094 		error = change_route_byinfo(rnh, rt, info, &rnd_orig, rc);
1095 		if (error != EAGAIN)
1096 			break;
1097 	}
1098 
1099 	return (error);
1100 }
1101 
1102 static int
1103 change_nhop(struct rib_head *rnh, struct rt_addrinfo *info,
1104     struct nhop_object *nh_orig, struct nhop_object **nh_new)
1105 {
1106 	int error;
1107 
1108 	/*
1109 	 * New gateway could require new ifaddr, ifp;
1110 	 * flags may also be different; ifp may be specified
1111 	 * by ll sockaddr when protocol address is ambiguous
1112 	 */
1113 	if (((nh_orig->nh_flags & NHF_GATEWAY) &&
1114 	    info->rti_info[RTAX_GATEWAY] != NULL) ||
1115 	    info->rti_info[RTAX_IFP] != NULL ||
1116 	    (info->rti_info[RTAX_IFA] != NULL &&
1117 	     !sa_equal(info->rti_info[RTAX_IFA], nh_orig->nh_ifa->ifa_addr))) {
1118 		error = rt_getifa_fib(info, rnh->rib_fibnum);
1119 
1120 		if (error != 0) {
1121 			info->rti_ifa = NULL;
1122 			return (error);
1123 		}
1124 	}
1125 
1126 	error = nhop_create_from_nhop(rnh, nh_orig, info, nh_new);
1127 	info->rti_ifa = NULL;
1128 
1129 	return (error);
1130 }
1131 
#ifdef ROUTE_MPATH
/*
 * Changes a single path of the multipath route @rt.
 *
 * The path to change is the first member of the group in @rnd_orig for
 * which check_info_match_nhop() reports a match against @info.  A
 * replacement nexthop is built for it with change_nhop(), a group with
 * that member swapped (and its weight taken from get_info_weight()) is
 * requested from nhgrp_get_group(), and the result is installed with
 * change_route_conditional().
 *
 * Returns 0 on success with the result stored in @rc.
 * On error, returns:
 * ESRCH - no path in the group matches @info.
 * EAGAIN - the temporary nexthop array could not be allocated.
 * Otherwise the error from change_nhop(), nhgrp_get_group() or
 *  change_route_conditional().
 */
static int
change_mpath_route(struct rib_head *rnh, struct rtentry *rt,
    struct rt_addrinfo *info, struct route_nhop_data *rnd_orig,
    struct rib_cmd_info *rc)
{
	struct route_nhop_data rnd_new = {};
	struct nhop_object *nh_orig = NULL, *nh_new;
	const struct weightened_nhop *wn;
	struct weightened_nhop *wn_new;
	uint32_t found_idx = 0, num_nhops;
	int error;

	/* Locate the path the request refers to. */
	wn = nhgrp_get_nhops(rnd_orig->rnd_nhgrp, &num_nhops);
	for (uint32_t i = 0; i < num_nhops; i++) {
		if (check_info_match_nhop(info, NULL, wn[i].nh) == 0) {
			nh_orig = wn[i].nh;
			found_idx = i;
			break;
		}
	}

	if (nh_orig == NULL)
		return (ESRCH);

	error = change_nhop(rnh, info, nh_orig, &nh_new);
	if (error != 0)
		return (error);

	wn_new = mallocarray(num_nhops, sizeof(struct weightened_nhop),
	    M_TEMP, M_NOWAIT | M_ZERO);
	if (wn_new == NULL) {
		nhop_free(nh_new);
		return (EAGAIN);
	}

	/* Copy the member list, replacing only the matched entry. */
	memcpy(wn_new, wn, num_nhops * sizeof(struct weightened_nhop));
	wn_new[found_idx].nh = nh_new;
	wn_new[found_idx].weight = get_info_weight(info, wn[found_idx].weight);

	error = nhgrp_get_group(rnh, wn_new, num_nhops, 0, &rnd_new.rnd_nhgrp);

	/* Release nh_new and the scratch array whatever the outcome was. */
	nhop_free(nh_new);
	free(wn_new, M_TEMP);

	if (error != 0)
		return (error);

	return (change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc));
}
#endif
1184 
1185 static int
1186 change_route_byinfo(struct rib_head *rnh, struct rtentry *rt,
1187     struct rt_addrinfo *info, struct route_nhop_data *rnd_orig,
1188     struct rib_cmd_info *rc)
1189 {
1190 	int error = 0;
1191 	struct nhop_object *nh_orig;
1192 	struct route_nhop_data rnd_new;
1193 
1194 	nh_orig = rnd_orig->rnd_nhop;
1195 	if (nh_orig == NULL)
1196 		return (ESRCH);
1197 
1198 #ifdef ROUTE_MPATH
1199 	if (NH_IS_NHGRP(nh_orig))
1200 		return (change_mpath_route(rnh, rt, info, rnd_orig, rc));
1201 #endif
1202 
1203 	rnd_new.rnd_weight = get_info_weight(info, rnd_orig->rnd_weight);
1204 	error = change_nhop(rnh, info, nh_orig, &rnd_new.rnd_nhop);
1205 	if (error != 0)
1206 		return (error);
1207 	error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
1208 
1209 	return (error);
1210 }
1211 
1212 /*
1213  * Insert @rt with nhop data from @rnd_new to @rnh.
1214  * Returns 0 on success and stores operation results in @rc.
1215  */
1216 static int
1217 add_route(struct rib_head *rnh, struct rtentry *rt,
1218     struct route_nhop_data *rnd, struct rib_cmd_info *rc)
1219 {
1220 	struct radix_node *rn;
1221 
1222 	RIB_WLOCK_ASSERT(rnh);
1223 
1224 	rt->rt_nhop = rnd->rnd_nhop;
1225 	rt->rt_weight = rnd->rnd_weight;
1226 	rn = rnh->rnh_addaddr(rt_key(rt), rt_mask_const(rt), &rnh->head, rt->rt_nodes);
1227 
1228 	if (rn != NULL) {
1229 		if (!NH_IS_NHGRP(rnd->rnd_nhop) && nhop_get_expire(rnd->rnd_nhop))
1230 			tmproutes_update(rnh, rt, rnd->rnd_nhop);
1231 
1232 		/* Finalize notification */
1233 		rib_bump_gen(rnh);
1234 		rnh->rnh_prefixes++;
1235 
1236 		rc->rc_cmd = RTM_ADD;
1237 		rc->rc_rt = rt;
1238 		rc->rc_nh_old = NULL;
1239 		rc->rc_nh_new = rnd->rnd_nhop;
1240 		rc->rc_nh_weight = rnd->rnd_weight;
1241 
1242 		rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1243 		return (0);
1244 	}
1245 
1246 	/* Existing route or memory allocation failure. */
1247 	return (EEXIST);
1248 }
1249 
1250 /*
1251  * Unconditionally deletes @rt from @rnh.
1252  */
1253 static int
1254 delete_route(struct rib_head *rnh, struct rtentry *rt, struct rib_cmd_info *rc)
1255 {
1256 	RIB_WLOCK_ASSERT(rnh);
1257 
1258 	/* Route deletion requested. */
1259 	struct radix_node *rn;
1260 
1261 	rn = rnh->rnh_deladdr(rt_key_const(rt), rt_mask_const(rt), &rnh->head);
1262 	if (rn == NULL)
1263 		return (ESRCH);
1264 	rt = RNTORT(rn);
1265 	rt->rte_flags &= ~RTF_UP;
1266 
1267 	rib_bump_gen(rnh);
1268 	rnh->rnh_prefixes--;
1269 
1270 	rc->rc_cmd = RTM_DELETE;
1271 	rc->rc_rt = rt;
1272 	rc->rc_nh_old = rt->rt_nhop;
1273 	rc->rc_nh_new = NULL;
1274 	rc->rc_nh_weight = rt->rt_weight;
1275 
1276 	rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1277 
1278 	return (0);
1279 }
1280 
1281 /*
1282  * Switch @rt nhop/weigh to the ones specified in @rnd.
1283  * Returns 0 on success.
1284  */
1285 int
1286 change_route(struct rib_head *rnh, struct rtentry *rt,
1287     struct route_nhop_data *rnd, struct rib_cmd_info *rc)
1288 {
1289 	struct nhop_object *nh_orig;
1290 
1291 	RIB_WLOCK_ASSERT(rnh);
1292 
1293 	nh_orig = rt->rt_nhop;
1294 
1295 	if (rnd->rnd_nhop == NULL)
1296 		return (delete_route(rnh, rt, rc));
1297 
1298 	/* Changing nexthop & weight to a new one */
1299 	rt->rt_nhop = rnd->rnd_nhop;
1300 	rt->rt_weight = rnd->rnd_weight;
1301 	if (!NH_IS_NHGRP(rnd->rnd_nhop) && nhop_get_expire(rnd->rnd_nhop))
1302 		tmproutes_update(rnh, rt, rnd->rnd_nhop);
1303 
1304 	/* Finalize notification */
1305 	rib_bump_gen(rnh);
1306 	rc->rc_cmd = RTM_CHANGE;
1307 	rc->rc_rt = rt;
1308 	rc->rc_nh_old = nh_orig;
1309 	rc->rc_nh_new = rnd->rnd_nhop;
1310 	rc->rc_nh_weight = rnd->rnd_weight;
1311 
1312 	rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1313 
1314 	return (0);
1315 }
1316 
1317 /*
1318  * Conditionally update route nhop/weight IFF data in @nhd_orig is
1319  *  consistent with the current route data.
1320  * Nexthop in @nhd_new is consumed.
1321  */
1322 int
1323 change_route_conditional(struct rib_head *rnh, struct rtentry *rt,
1324     struct route_nhop_data *rnd_orig, struct route_nhop_data *rnd_new,
1325     struct rib_cmd_info *rc)
1326 {
1327 	struct rtentry *rt_new;
1328 	int error = 0;
1329 
1330 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
1331 		char buf_old[NHOP_PRINT_BUFSIZE], buf_new[NHOP_PRINT_BUFSIZE];
1332 		nhop_print_buf_any(rnd_orig->rnd_nhop, buf_old, NHOP_PRINT_BUFSIZE);
1333 		nhop_print_buf_any(rnd_new->rnd_nhop, buf_new, NHOP_PRINT_BUFSIZE);
1334 		FIB_LOG(LOG_DEBUG2, rnh->rib_fibnum, rnh->rib_family,
1335 		    "trying change %s -> %s", buf_old, buf_new);
1336 	}
1337 	RIB_WLOCK(rnh);
1338 
1339 	struct route_nhop_data rnd;
1340 	rt_new = lookup_prefix_rt(rnh, rt, &rnd);
1341 
1342 	if (rt_new == NULL) {
1343 		if (rnd_orig->rnd_nhop == NULL)
1344 			error = add_route(rnh, rt, rnd_new, rc);
1345 		else {
1346 			/*
1347 			 * Prefix does not exist, which was not our assumption.
1348 			 * Update @rnd_orig with the new data and return
1349 			 */
1350 			rnd_orig->rnd_nhop = NULL;
1351 			rnd_orig->rnd_weight = 0;
1352 			error = EAGAIN;
1353 		}
1354 	} else {
1355 		/* Prefix exists, try to update */
1356 		if (rnd_orig->rnd_nhop == rt_new->rt_nhop) {
1357 			/*
1358 			 * Nhop/mpath group hasn't changed. Flip
1359 			 * to the new precalculated one and return
1360 			 */
1361 			error = change_route(rnh, rt_new, rnd_new, rc);
1362 		} else {
1363 			/* Update and retry */
1364 			rnd_orig->rnd_nhop = rt_new->rt_nhop;
1365 			rnd_orig->rnd_weight = rt_new->rt_weight;
1366 			error = EAGAIN;
1367 		}
1368 	}
1369 
1370 	RIB_WUNLOCK(rnh);
1371 
1372 	if (error == 0) {
1373 		rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
1374 
1375 		if (rnd_orig->rnd_nhop != NULL)
1376 			nhop_free_any(rnd_orig->rnd_nhop);
1377 
1378 	} else {
1379 		if (rnd_new->rnd_nhop != NULL)
1380 			nhop_free_any(rnd_new->rnd_nhop);
1381 	}
1382 
1383 	return (error);
1384 }
1385 
1386 /*
1387  * Performs modification of routing table specificed by @action.
1388  * Table is specified by @fibnum and sa_family in @info->rti_info[RTAX_DST].
1389  * Needs to be run in network epoch.
1390  *
1391  * Returns 0 on success and fills in @rc with action result.
1392  */
1393 int
1394 rib_action(uint32_t fibnum, int action, struct rt_addrinfo *info,
1395     struct rib_cmd_info *rc)
1396 {
1397 	int error;
1398 
1399 	switch (action) {
1400 	case RTM_ADD:
1401 		error = rib_add_route(fibnum, info, rc);
1402 		break;
1403 	case RTM_DELETE:
1404 		error = rib_del_route(fibnum, info, rc);
1405 		break;
1406 	case RTM_CHANGE:
1407 		error = rib_change_route(fibnum, info, rc);
1408 		break;
1409 	default:
1410 		error = ENOTSUP;
1411 	}
1412 
1413 	return (error);
1414 }
1415 
1416 struct rt_delinfo
1417 {
1418 	struct rib_head *rnh;
1419 	struct rtentry *head;
1420 	rib_filter_f_t *filter_f;
1421 	void *filter_arg;
1422 	int prio;
1423 	struct rib_cmd_info rc;
1424 };
1425 
1426 /*
1427  * Conditionally unlinks rtenties or paths from radix tree based
1428  * on the callback data passed in @arg.
1429  */
1430 static int
1431 rt_checkdelroute(struct radix_node *rn, void *arg)
1432 {
1433 	struct rt_delinfo *di = (struct rt_delinfo *)arg;
1434 	struct rtentry *rt = (struct rtentry *)rn;
1435 
1436 	if (rt_delete_conditional(di->rnh, rt, di->prio,
1437 	    di->filter_f, di->filter_arg, &di->rc) != 0)
1438 		return (0);
1439 
1440 	/*
1441 	 * Add deleted rtentries to the list to GC them
1442 	 *  after dropping the lock.
1443 	 *
1444 	 * XXX: Delayed notifications not implemented
1445 	 *  for nexthop updates.
1446 	 */
1447 	if (di->rc.rc_cmd == RTM_DELETE) {
1448 		/* Add to the list and return */
1449 		rt->rt_chain = di->head;
1450 		di->head = rt;
1451 #ifdef ROUTE_MPATH
1452 	} else {
1453 		/*
1454 		 * RTM_CHANGE to a different nexthop or nexthop group.
1455 		 * Free old multipath group.
1456 		 */
1457 		nhop_free_any(di->rc.rc_nh_old);
1458 #endif
1459 	}
1460 
1461 	return (0);
1462 }
1463 
1464 /*
1465  * Iterates over a routing table specified by @fibnum and @family and
1466  *  deletes elements marked by @filter_f.
1467  * @fibnum: rtable id
1468  * @family: AF_ address family
1469  * @filter_f: function returning non-zero value for items to delete
1470  * @arg: data to pass to the @filter_f function
1471  * @report: true if rtsock notification is needed.
1472  */
1473 void
1474 rib_walk_del(u_int fibnum, int family, rib_filter_f_t *filter_f, void *filter_arg,
1475     bool report)
1476 {
1477 	struct rib_head *rnh;
1478 	struct rtentry *rt;
1479 	struct nhop_object *nh;
1480 	struct epoch_tracker et;
1481 
1482 	rnh = rt_tables_get_rnh(fibnum, family);
1483 	if (rnh == NULL)
1484 		return;
1485 
1486 	struct rt_delinfo di = {
1487 		.rnh = rnh,
1488 		.filter_f = filter_f,
1489 		.filter_arg = filter_arg,
1490 		.prio = NH_PRIORITY_NORMAL,
1491 	};
1492 
1493 	NET_EPOCH_ENTER(et);
1494 
1495 	RIB_WLOCK(rnh);
1496 	rnh->rnh_walktree(&rnh->head, rt_checkdelroute, &di);
1497 	RIB_WUNLOCK(rnh);
1498 
1499 	/* We might have something to reclaim. */
1500 	bzero(&di.rc, sizeof(di.rc));
1501 	di.rc.rc_cmd = RTM_DELETE;
1502 	while (di.head != NULL) {
1503 		rt = di.head;
1504 		di.head = rt->rt_chain;
1505 		rt->rt_chain = NULL;
1506 		nh = rt->rt_nhop;
1507 
1508 		di.rc.rc_rt = rt;
1509 		di.rc.rc_nh_old = nh;
1510 		rib_notify(rnh, RIB_NOTIFY_DELAYED, &di.rc);
1511 
1512 		if (report) {
1513 #ifdef ROUTE_MPATH
1514 			struct nhgrp_object *nhg;
1515 			const struct weightened_nhop *wn;
1516 			uint32_t num_nhops;
1517 			if (NH_IS_NHGRP(nh)) {
1518 				nhg = (struct nhgrp_object *)nh;
1519 				wn = nhgrp_get_nhops(nhg, &num_nhops);
1520 				for (int i = 0; i < num_nhops; i++)
1521 					rt_routemsg(RTM_DELETE, rt, wn[i].nh, fibnum);
1522 			} else
1523 #endif
1524 			rt_routemsg(RTM_DELETE, rt, nh, fibnum);
1525 		}
1526 		rt_free(rt);
1527 	}
1528 
1529 	NET_EPOCH_EXIT(et);
1530 }
1531 
1532 static int
1533 rt_delete_unconditional(struct radix_node *rn, void *arg)
1534 {
1535 	struct rtentry *rt = RNTORT(rn);
1536 	struct rib_head *rnh = (struct rib_head *)arg;
1537 
1538 	rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), &rnh->head);
1539 	if (RNTORT(rn) == rt)
1540 		rt_free(rt);
1541 
1542 	return (0);
1543 }
1544 
1545 /*
1546  * Removes all routes from the routing table without executing notifications.
1547  * rtentres will be removed after the end of a current epoch.
1548  */
1549 static void
1550 rib_flush_routes(struct rib_head *rnh)
1551 {
1552 	RIB_WLOCK(rnh);
1553 	rnh->rnh_walktree(&rnh->head, rt_delete_unconditional, rnh);
1554 	RIB_WUNLOCK(rnh);
1555 }
1556 
1557 void
1558 rib_flush_routes_family(int family)
1559 {
1560 	struct rib_head *rnh;
1561 
1562 	for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) {
1563 		if ((rnh = rt_tables_get_rnh(fibnum, family)) != NULL)
1564 			rib_flush_routes(rnh);
1565 	}
1566 }
1567 
/*
 * Maps an address family constant to a short printable name.
 * Returns "inet", "inet6" or "link" for AF_INET, AF_INET6 and AF_LINK
 * respectively, and "unknown" for everything else.
 * The returned string is a literal and must not be freed or modified.
 */
const char *
rib_print_family(int family)
{
	if (family == AF_INET)
		return ("inet");
	if (family == AF_INET6)
		return ("inet6");
	if (family == AF_LINK)
		return ("link");

	return ("unknown");
}
1581 
1582