xref: /freebsd/sys/net/route/route_ctl.c (revision f2d48b5e2c3b45850585e4d7aee324fe148afbf2)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2020 Alexander V. Chernikov
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 #include "opt_inet.h"
31 #include "opt_inet6.h"
32 #include "opt_route.h"
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/malloc.h>
37 #include <sys/mbuf.h>
38 #include <sys/socket.h>
39 #include <sys/sysctl.h>
40 #include <sys/syslog.h>
41 #include <sys/kernel.h>
42 #include <sys/lock.h>
43 #include <sys/rmlock.h>
44 
45 #include <net/if.h>
46 #include <net/if_var.h>
47 #include <net/if_dl.h>
48 #include <net/vnet.h>
49 #include <net/route.h>
50 #include <net/route/route_ctl.h>
51 #include <net/route/route_var.h>
52 #include <net/route/nhop_utils.h>
53 #include <net/route/nhop.h>
54 #include <net/route/nhop_var.h>
55 #include <netinet/in.h>
56 #include <netinet6/scope6_var.h>
57 
58 #include <vm/uma.h>
59 
/*
 * This file contains control plane routing table functions.
 *
 * All functions assume they are called within the net epoch.
 */
65 
66 struct rib_subscription {
67 	CK_STAILQ_ENTRY(rib_subscription)	next;
68 	rib_subscription_cb_t			*func;
69 	void					*arg;
70 	struct rib_head				*rnh;
71 	enum rib_subscription_type		type;
72 	struct epoch_context			epoch_ctx;
73 };
74 
75 static int add_route(struct rib_head *rnh, struct rt_addrinfo *info,
76     struct rib_cmd_info *rc);
77 static int add_route_nhop(struct rib_head *rnh, struct rtentry *rt,
78     struct rt_addrinfo *info, struct route_nhop_data *rnd,
79     struct rib_cmd_info *rc);
80 static int del_route(struct rib_head *rnh, struct rt_addrinfo *info,
81     struct rib_cmd_info *rc);
82 static int change_route(struct rib_head *rnh, struct rt_addrinfo *info,
83     struct route_nhop_data *nhd_orig, struct rib_cmd_info *rc);
84 
85 static int rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info,
86     struct rib_cmd_info *rc);
87 
88 static void rib_notify(struct rib_head *rnh, enum rib_subscription_type type,
89     struct rib_cmd_info *rc);
90 
91 static void destroy_subscription_epoch(epoch_context_t ctx);
92 #ifdef ROUTE_MPATH
93 static bool rib_can_multipath(struct rib_head *rh);
94 #endif
95 
96 /* Per-vnet multipath routing configuration */
97 SYSCTL_DECL(_net_route);
98 #define	V_rib_route_multipath	VNET(rib_route_multipath)
99 #ifdef ROUTE_MPATH
100 #define _MP_FLAGS	CTLFLAG_RW
101 #else
102 #define _MP_FLAGS	CTLFLAG_RD
103 #endif
104 VNET_DEFINE(u_int, rib_route_multipath) = 0;
105 SYSCTL_UINT(_net_route, OID_AUTO, multipath, _MP_FLAGS | CTLFLAG_VNET,
106     &VNET_NAME(rib_route_multipath), 0, "Enable route multipath");
107 #undef _MP_FLAGS
108 
/* Per-vnet UMA zone that rtentry structures are allocated from */
VNET_DEFINE_STATIC(uma_zone_t, rtzone);
#define	V_rtzone	VNET(rtzone)
112 
113 void
114 vnet_rtzone_init()
115 {
116 
117 	V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry),
118 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
119 }
120 
121 #ifdef VIMAGE
122 void
123 vnet_rtzone_destroy()
124 {
125 
126 	uma_zdestroy(V_rtzone);
127 }
128 #endif
129 
130 static void
131 destroy_rtentry(struct rtentry *rt)
132 {
133 
134 	/*
135 	 * At this moment rnh, nh_control may be already freed.
136 	 * nhop interface may have been migrated to a different vnet.
137 	 * Use vnet stored in the nexthop to delete the entry.
138 	 */
139 	CURVNET_SET(nhop_get_vnet(rt->rt_nhop));
140 
141 	/* Unreference nexthop */
142 	nhop_free_any(rt->rt_nhop);
143 
144 	uma_zfree(V_rtzone, rt);
145 
146 	CURVNET_RESTORE();
147 }
148 
149 /*
150  * Epoch callback indicating rtentry is safe to destroy
151  */
152 static void
153 destroy_rtentry_epoch(epoch_context_t ctx)
154 {
155 	struct rtentry *rt;
156 
157 	rt = __containerof(ctx, struct rtentry, rt_epoch_ctx);
158 
159 	destroy_rtentry(rt);
160 }
161 
162 /*
163  * Schedule rtentry deletion
164  */
165 static void
166 rtfree(struct rtentry *rt)
167 {
168 
169 	KASSERT(rt != NULL, ("%s: NULL rt", __func__));
170 
171 	epoch_call(net_epoch_preempt, destroy_rtentry_epoch,
172 	    &rt->rt_epoch_ctx);
173 }
174 
175 static struct rib_head *
176 get_rnh(uint32_t fibnum, const struct rt_addrinfo *info)
177 {
178 	struct rib_head *rnh;
179 	struct sockaddr *dst;
180 
181 	KASSERT((fibnum < rt_numfibs), ("rib_add_route: bad fibnum"));
182 
183 	dst = info->rti_info[RTAX_DST];
184 	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
185 
186 	return (rnh);
187 }
188 
189 #ifdef ROUTE_MPATH
190 static bool
191 rib_can_multipath(struct rib_head *rh)
192 {
193 	int result;
194 
195 	CURVNET_SET(rh->rib_vnet);
196 	result = !!V_rib_route_multipath;
197 	CURVNET_RESTORE();
198 
199 	return (result);
200 }
201 
202 /*
203  * Check is nhop is multipath-eligible.
204  * Avoid nhops without gateways and redirects.
205  *
206  * Returns 1 for multipath-eligible nexthop,
207  * 0 otherwise.
208  */
209 bool
210 nhop_can_multipath(const struct nhop_object *nh)
211 {
212 
213 	if ((nh->nh_flags & NHF_MULTIPATH) != 0)
214 		return (1);
215 	if ((nh->nh_flags & NHF_GATEWAY) == 0)
216 		return (0);
217 	if ((nh->nh_flags & NHF_REDIRECT) != 0)
218 		return (0);
219 
220 	return (1);
221 }
222 #endif
223 
224 static int
225 get_info_weight(const struct rt_addrinfo *info, uint32_t default_weight)
226 {
227 	uint32_t weight;
228 
229 	if (info->rti_mflags & RTV_WEIGHT)
230 		weight = info->rti_rmx->rmx_weight;
231 	else
232 		weight = default_weight;
233 	/* Keep upper 1 byte for adm distance purposes */
234 	if (weight > RT_MAX_WEIGHT)
235 		weight = RT_MAX_WEIGHT;
236 
237 	return (weight);
238 }
239 
240 bool
241 rt_is_host(const struct rtentry *rt)
242 {
243 
244 	return (rt->rte_flags & RTF_HOST);
245 }
246 
247 /*
248  * Returns pointer to nexthop or nexthop group
249  * associated with @rt
250  */
251 struct nhop_object *
252 rt_get_raw_nhop(const struct rtentry *rt)
253 {
254 
255 	return (rt->rt_nhop);
256 }
257 
258 #ifdef INET
259 /*
260  * Stores IPv4 address and prefix length of @rt inside
261  *  @paddr and @plen.
262  * @pscopeid is currently always set to 0.
263  */
264 void
265 rt_get_inet_prefix_plen(const struct rtentry *rt, struct in_addr *paddr,
266     int *plen, uint32_t *pscopeid)
267 {
268 	const struct sockaddr_in *dst;
269 
270 	dst = (const struct sockaddr_in *)rt_key_const(rt);
271 	KASSERT((dst->sin_family == AF_INET),
272 	    ("rt family is %d, not inet", dst->sin_family));
273 	*paddr = dst->sin_addr;
274 	dst = (const struct sockaddr_in *)rt_mask_const(rt);
275 	if (dst == NULL)
276 		*plen = 32;
277 	else
278 		*plen = bitcount32(dst->sin_addr.s_addr);
279 	*pscopeid = 0;
280 }
281 
282 /*
283  * Stores IPv4 address and prefix mask of @rt inside
284  *  @paddr and @pmask. Sets mask to INADDR_ANY for host routes.
285  * @pscopeid is currently always set to 0.
286  */
287 void
288 rt_get_inet_prefix_pmask(const struct rtentry *rt, struct in_addr *paddr,
289     struct in_addr *pmask, uint32_t *pscopeid)
290 {
291 	const struct sockaddr_in *dst;
292 
293 	dst = (const struct sockaddr_in *)rt_key_const(rt);
294 	KASSERT((dst->sin_family == AF_INET),
295 	    ("rt family is %d, not inet", dst->sin_family));
296 	*paddr = dst->sin_addr;
297 	dst = (const struct sockaddr_in *)rt_mask_const(rt);
298 	if (dst == NULL)
299 		pmask->s_addr = INADDR_BROADCAST;
300 	else
301 		*pmask = dst->sin_addr;
302 	*pscopeid = 0;
303 }
304 #endif
305 
306 #ifdef INET6
307 static int
308 inet6_get_plen(const struct in6_addr *addr)
309 {
310 
311 	return (bitcount32(addr->s6_addr32[0]) + bitcount32(addr->s6_addr32[1]) +
312 	    bitcount32(addr->s6_addr32[2]) + bitcount32(addr->s6_addr32[3]));
313 }
314 
315 /*
316  * Stores IPv6 address and prefix length of @rt inside
317  *  @paddr and @plen. Addresses are returned in de-embedded form.
318  * Scopeid is set to 0 for non-LL addresses.
319  */
320 void
321 rt_get_inet6_prefix_plen(const struct rtentry *rt, struct in6_addr *paddr,
322     int *plen, uint32_t *pscopeid)
323 {
324 	const struct sockaddr_in6 *dst;
325 
326 	dst = (const struct sockaddr_in6 *)rt_key_const(rt);
327 	KASSERT((dst->sin6_family == AF_INET6),
328 	    ("rt family is %d, not inet6", dst->sin6_family));
329 	if (IN6_IS_SCOPE_LINKLOCAL(&dst->sin6_addr))
330 		in6_splitscope(&dst->sin6_addr, paddr, pscopeid);
331 	else
332 		*paddr = dst->sin6_addr;
333 	dst = (const struct sockaddr_in6 *)rt_mask_const(rt);
334 	if (dst == NULL)
335 		*plen = 128;
336 	else
337 		*plen = inet6_get_plen(&dst->sin6_addr);
338 }
339 
340 /*
341  * Stores IPv6 address and prefix mask of @rt inside
342  *  @paddr and @pmask. Addresses are returned in de-embedded form.
343  * Scopeid is set to 0 for non-LL addresses.
344  */
345 void
346 rt_get_inet6_prefix_pmask(const struct rtentry *rt, struct in6_addr *paddr,
347     struct in6_addr *pmask, uint32_t *pscopeid)
348 {
349 	const struct sockaddr_in6 *dst;
350 
351 	dst = (const struct sockaddr_in6 *)rt_key_const(rt);
352 	KASSERT((dst->sin6_family == AF_INET6),
353 	    ("rt family is %d, not inet", dst->sin6_family));
354 	if (IN6_IS_SCOPE_LINKLOCAL(&dst->sin6_addr))
355 		in6_splitscope(&dst->sin6_addr, paddr, pscopeid);
356 	else
357 		*paddr = dst->sin6_addr;
358 	dst = (const struct sockaddr_in6 *)rt_mask_const(rt);
359 	if (dst == NULL)
360 		memset(pmask, 0xFF, sizeof(struct in6_addr));
361 	else
362 		*pmask = dst->sin6_addr;
363 }
364 #endif
365 
366 static void
367 rt_set_expire_info(struct rtentry *rt, const struct rt_addrinfo *info)
368 {
369 
370 	/* Kernel -> userland timebase conversion. */
371 	if (info->rti_mflags & RTV_EXPIRE)
372 		rt->rt_expire = info->rti_rmx->rmx_expire ?
373 		    info->rti_rmx->rmx_expire - time_second + time_uptime : 0;
374 }
375 
376 /*
377  * Check if specified @gw matches gw data in the nexthop @nh.
378  *
379  * Returns true if matches, false otherwise.
380  */
381 bool
382 match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw)
383 {
384 
385 	if (nh->gw_sa.sa_family != gw->sa_family)
386 		return (false);
387 
388 	switch (gw->sa_family) {
389 	case AF_INET:
390 		return (nh->gw4_sa.sin_addr.s_addr ==
391 		    ((const struct sockaddr_in *)gw)->sin_addr.s_addr);
392 	case AF_INET6:
393 		{
394 			const struct sockaddr_in6 *gw6;
395 			gw6 = (const struct sockaddr_in6 *)gw;
396 
397 			/*
398 			 * Currently (2020-09) IPv6 gws in kernel have their
399 			 * scope embedded. Once this becomes false, this code
400 			 * has to be revisited.
401 			 */
402 			if (IN6_ARE_ADDR_EQUAL(&nh->gw6_sa.sin6_addr,
403 			    &gw6->sin6_addr))
404 				return (true);
405 			return (false);
406 		}
407 	case AF_LINK:
408 		{
409 			const struct sockaddr_dl *sdl;
410 			sdl = (const struct sockaddr_dl *)gw;
411 			return (nh->gwl_sa.sdl_index == sdl->sdl_index);
412 		}
413 	default:
414 		return (memcmp(&nh->gw_sa, gw, nh->gw_sa.sa_len) == 0);
415 	}
416 
417 	/* NOTREACHED */
418 	return (false);
419 }
420 
421 /*
422  * Checks if data in @info matches nexhop @nh.
423  *
424  * Returns 0 on success,
425  * ESRCH if not matched,
426  * ENOENT if filter function returned false
427  */
428 int
429 check_info_match_nhop(const struct rt_addrinfo *info, const struct rtentry *rt,
430     const struct nhop_object *nh)
431 {
432 	const struct sockaddr *gw = info->rti_info[RTAX_GATEWAY];
433 
434 	if (info->rti_filter != NULL) {
435 	    if (info->rti_filter(rt, nh, info->rti_filterdata) == 0)
436 		    return (ENOENT);
437 	    else
438 		    return (0);
439 	}
440 	if ((gw != NULL) && !match_nhop_gw(nh, gw))
441 		return (ESRCH);
442 
443 	return (0);
444 }
445 
446 /*
447  * Checks if nexhop @nh can be rewritten by data in @info because
448  *  of higher "priority". Currently the only case for such scenario
449  *  is kernel installing interface routes, marked by RTF_PINNED flag.
450  *
451  * Returns:
452  * 1 if @info data has higher priority
453  * 0 if priority is the same
454  * -1 if priority is lower
455  */
456 int
457 can_override_nhop(const struct rt_addrinfo *info, const struct nhop_object *nh)
458 {
459 
460 	if (info->rti_flags & RTF_PINNED) {
461 		return (NH_IS_PINNED(nh)) ? 0 : 1;
462 	} else {
463 		return (NH_IS_PINNED(nh)) ? -1 : 0;
464 	}
465 }
466 
467 /*
468  * Runs exact prefix match based on @dst and @netmask.
469  * Returns matched @rtentry if found or NULL.
470  * If rtentry was found, saves nexthop / weight value into @rnd.
471  */
472 static struct rtentry *
473 lookup_prefix_bysa(struct rib_head *rnh, const struct sockaddr *dst,
474     const struct sockaddr *netmask, struct route_nhop_data *rnd)
475 {
476 	struct rtentry *rt;
477 
478 	RIB_LOCK_ASSERT(rnh);
479 
480 	rt = (struct rtentry *)rnh->rnh_lookup(__DECONST(void *, dst),
481 	    __DECONST(void *, netmask), &rnh->head);
482 	if (rt != NULL) {
483 		rnd->rnd_nhop = rt->rt_nhop;
484 		rnd->rnd_weight = rt->rt_weight;
485 	} else {
486 		rnd->rnd_nhop = NULL;
487 		rnd->rnd_weight = 0;
488 	}
489 
490 	return (rt);
491 }
492 
493 /*
494  * Runs exact prefix match based on dst/netmask from @info.
495  * Assumes RIB lock is held.
496  * Returns matched @rtentry if found or NULL.
497  * If rtentry was found, saves nexthop / weight value into @rnd.
498  */
499 struct rtentry *
500 lookup_prefix(struct rib_head *rnh, const struct rt_addrinfo *info,
501     struct route_nhop_data *rnd)
502 {
503 	struct rtentry *rt;
504 
505 	rt = lookup_prefix_bysa(rnh, info->rti_info[RTAX_DST],
506 	    info->rti_info[RTAX_NETMASK], rnd);
507 
508 	return (rt);
509 }
510 
511 /*
512  * Adds route defined by @info into the kernel table specified by @fibnum and
513  * sa_family in @info->rti_info[RTAX_DST].
514  *
515  * Returns 0 on success and fills in operation metadata into @rc.
516  */
517 int
518 rib_add_route(uint32_t fibnum, struct rt_addrinfo *info,
519     struct rib_cmd_info *rc)
520 {
521 	struct rib_head *rnh;
522 	int error;
523 
524 	NET_EPOCH_ASSERT();
525 
526 	rnh = get_rnh(fibnum, info);
527 	if (rnh == NULL)
528 		return (EAFNOSUPPORT);
529 
530 	/*
531 	 * Check consistency between RTF_HOST flag and netmask
532 	 * existence.
533 	 */
534 	if (info->rti_flags & RTF_HOST)
535 		info->rti_info[RTAX_NETMASK] = NULL;
536 	else if (info->rti_info[RTAX_NETMASK] == NULL)
537 		return (EINVAL);
538 
539 	bzero(rc, sizeof(struct rib_cmd_info));
540 	rc->rc_cmd = RTM_ADD;
541 
542 	error = add_route(rnh, info, rc);
543 	if (error == 0)
544 		rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
545 
546 	return (error);
547 }
548 
549 /*
550  * Creates rtentry and nexthop based on @info data.
551  * Return 0 and fills in rtentry into @prt on success,
552  * return errno otherwise.
553  */
554 static int
555 create_rtentry(struct rib_head *rnh, struct rt_addrinfo *info,
556     struct rtentry **prt)
557 {
558 	struct sockaddr *dst, *ndst, *gateway, *netmask;
559 	struct rtentry *rt;
560 	struct nhop_object *nh;
561 	struct ifaddr *ifa;
562 	int error, flags;
563 
564 	dst = info->rti_info[RTAX_DST];
565 	gateway = info->rti_info[RTAX_GATEWAY];
566 	netmask = info->rti_info[RTAX_NETMASK];
567 	flags = info->rti_flags;
568 
569 	if ((flags & RTF_GATEWAY) && !gateway)
570 		return (EINVAL);
571 	if (dst && gateway && (dst->sa_family != gateway->sa_family) &&
572 	    (gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK))
573 		return (EINVAL);
574 
575 	if (dst->sa_len > sizeof(((struct rtentry *)NULL)->rt_dstb))
576 		return (EINVAL);
577 
578 	if (info->rti_ifa == NULL) {
579 		error = rt_getifa_fib(info, rnh->rib_fibnum);
580 		if (error)
581 			return (error);
582 	} else {
583 		ifa_ref(info->rti_ifa);
584 	}
585 
586 	error = nhop_create_from_info(rnh, info, &nh);
587 	if (error != 0) {
588 		ifa_free(info->rti_ifa);
589 		return (error);
590 	}
591 
592 	rt = uma_zalloc(V_rtzone, M_NOWAIT | M_ZERO);
593 	if (rt == NULL) {
594 		ifa_free(info->rti_ifa);
595 		nhop_free(nh);
596 		return (ENOBUFS);
597 	}
598 	rt->rte_flags = (RTF_UP | flags) & RTE_RT_FLAG_MASK;
599 	rt->rt_nhop = nh;
600 
601 	/* Fill in dst */
602 	memcpy(&rt->rt_dst, dst, dst->sa_len);
603 	rt_key(rt) = &rt->rt_dst;
604 
605 	/*
606 	 * point to the (possibly newly malloc'd) dest address.
607 	 */
608 	ndst = (struct sockaddr *)rt_key(rt);
609 
610 	/*
611 	 * make sure it contains the value we want (masked if needed).
612 	 */
613 	if (netmask) {
614 		rt_maskedcopy(dst, ndst, netmask);
615 	} else
616 		bcopy(dst, ndst, dst->sa_len);
617 
618 	/*
619 	 * We use the ifa reference returned by rt_getifa_fib().
620 	 * This moved from below so that rnh->rnh_addaddr() can
621 	 * examine the ifa and  ifa->ifa_ifp if it so desires.
622 	 */
623 	ifa = info->rti_ifa;
624 	rt->rt_weight = get_info_weight(info, RT_DEFAULT_WEIGHT);
625 	rt_set_expire_info(rt, info);
626 
627 	*prt = rt;
628 	return (0);
629 }
630 
631 static int
632 add_route(struct rib_head *rnh, struct rt_addrinfo *info,
633     struct rib_cmd_info *rc)
634 {
635 	struct nhop_object *nh_orig;
636 	struct route_nhop_data rnd_orig, rnd_add;
637 	struct nhop_object *nh;
638 	struct rtentry *rt, *rt_orig;
639 	int error;
640 
641 	error = create_rtentry(rnh, info, &rt);
642 	if (error != 0)
643 		return (error);
644 
645 	rnd_add.rnd_nhop = rt->rt_nhop;
646 	rnd_add.rnd_weight = rt->rt_weight;
647 	nh = rt->rt_nhop;
648 
649 	RIB_WLOCK(rnh);
650 	error = add_route_nhop(rnh, rt, info, &rnd_add, rc);
651 	if (error == 0) {
652 		RIB_WUNLOCK(rnh);
653 		return (0);
654 	}
655 
656 	/* addition failed. Lookup prefix in the rib to determine the cause */
657 	rt_orig = lookup_prefix(rnh, info, &rnd_orig);
658 	if (rt_orig == NULL) {
659 		/* No prefix -> rnh_addaddr() failed to allocate memory */
660 		RIB_WUNLOCK(rnh);
661 		nhop_free(nh);
662 		uma_zfree(V_rtzone, rt);
663 		return (ENOMEM);
664 	}
665 
666 	/* We have existing route in the RIB. */
667 	nh_orig = rnd_orig.rnd_nhop;
668 	/* Check if new route has higher preference */
669 	if (can_override_nhop(info, nh_orig) > 0) {
670 		/* Update nexthop to the new route */
671 		change_route_nhop(rnh, rt_orig, info, &rnd_add, rc);
672 		RIB_WUNLOCK(rnh);
673 		uma_zfree(V_rtzone, rt);
674 		nhop_free(nh_orig);
675 		return (0);
676 	}
677 
678 	RIB_WUNLOCK(rnh);
679 
680 #ifdef ROUTE_MPATH
681 	if (rib_can_multipath(rnh) && nhop_can_multipath(rnd_add.rnd_nhop) &&
682 	    nhop_can_multipath(rnd_orig.rnd_nhop))
683 		error = add_route_mpath(rnh, info, rt, &rnd_add, &rnd_orig, rc);
684 	else
685 #endif
686 	/* Unable to add - another route with the same preference exists */
687 	error = EEXIST;
688 
689 	/*
690 	 * ROUTE_MPATH disabled: failed to add route, free both nhop and rt.
691 	 * ROUTE_MPATH enabled: original nhop reference is unused in any case,
692 	 *  free rt only if not _adding_ new route to rib (e.g. the case
693 	 *  when initial lookup returned existing route, but then it got
694 	 *  deleted prior to multipath group insertion, leading to a simple
695 	 *  non-multipath add as a result).
696 	 */
697 	nhop_free(nh);
698 	if ((error != 0) || rc->rc_cmd != RTM_ADD)
699 		uma_zfree(V_rtzone, rt);
700 
701 	return (error);
702 }
703 
704 /*
705  * Removes route defined by @info from the kernel table specified by @fibnum and
706  * sa_family in @info->rti_info[RTAX_DST].
707  *
708  * Returns 0 on success and fills in operation metadata into @rc.
709  */
710 int
711 rib_del_route(uint32_t fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc)
712 {
713 	struct rib_head *rnh;
714 	struct sockaddr *dst_orig, *netmask;
715 	struct sockaddr_storage mdst;
716 	int error;
717 
718 	NET_EPOCH_ASSERT();
719 
720 	rnh = get_rnh(fibnum, info);
721 	if (rnh == NULL)
722 		return (EAFNOSUPPORT);
723 
724 	bzero(rc, sizeof(struct rib_cmd_info));
725 	rc->rc_cmd = RTM_DELETE;
726 
727 	dst_orig = info->rti_info[RTAX_DST];
728 	netmask = info->rti_info[RTAX_NETMASK];
729 
730 	if (netmask != NULL) {
731 		/* Ensure @dst is always properly masked */
732 		if (dst_orig->sa_len > sizeof(mdst))
733 			return (EINVAL);
734 		rt_maskedcopy(dst_orig, (struct sockaddr *)&mdst, netmask);
735 		info->rti_info[RTAX_DST] = (struct sockaddr *)&mdst;
736 	}
737 	error = del_route(rnh, info, rc);
738 	info->rti_info[RTAX_DST] = dst_orig;
739 
740 	return (error);
741 }
742 
743 /*
744  * Conditionally unlinks rtentry matching data inside @info from @rnh.
745  * Returns 0 on success with operation result stored in @rc.
746  * On error, returns:
747  * ESRCH - if prefix was not found,
748  * EADDRINUSE - if trying to delete higher priority route.
749  * ENOENT - if supplied filter function returned 0 (not matched).
750  */
751 static int
752 rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, struct rib_cmd_info *rc)
753 {
754 	struct rtentry *rt;
755 	struct nhop_object *nh;
756 	struct radix_node *rn;
757 	struct route_nhop_data rnd;
758 	int error;
759 
760 	rt = lookup_prefix(rnh, info, &rnd);
761 	if (rt == NULL)
762 		return (ESRCH);
763 
764 	nh = rt->rt_nhop;
765 #ifdef ROUTE_MPATH
766 	if (NH_IS_NHGRP(nh)) {
767 		error = del_route_mpath(rnh, info, rt,
768 		    (struct nhgrp_object *)nh, rc);
769 		return (error);
770 	}
771 #endif
772 	error = check_info_match_nhop(info, rt, nh);
773 	if (error != 0)
774 		return (error);
775 
776 	if (can_override_nhop(info, nh) < 0)
777 		return (EADDRINUSE);
778 
779 	/*
780 	 * Remove the item from the tree and return it.
781 	 * Complain if it is not there and do no more processing.
782 	 */
783 	rn = rnh->rnh_deladdr(info->rti_info[RTAX_DST],
784 	    info->rti_info[RTAX_NETMASK], &rnh->head);
785 	if (rn == NULL)
786 		return (ESRCH);
787 
788 	if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT))
789 		panic ("rtrequest delete");
790 
791 	rt = RNTORT(rn);
792 	rt->rte_flags &= ~RTF_UP;
793 
794 	/* Finalize notification */
795 	rnh->rnh_gen++;
796 	rnh->rnh_prefixes--;
797 
798 	rc->rc_cmd = RTM_DELETE;
799 	rc->rc_rt = rt;
800 	rc->rc_nh_old = rt->rt_nhop;
801 	rc->rc_nh_weight = rt->rt_weight;
802 	rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
803 
804 	return (0);
805 }
806 
807 static int
808 del_route(struct rib_head *rnh, struct rt_addrinfo *info,
809     struct rib_cmd_info *rc)
810 {
811 	int error;
812 
813 	RIB_WLOCK(rnh);
814 	error = rt_unlinkrte(rnh, info, rc);
815 	RIB_WUNLOCK(rnh);
816 	if (error != 0)
817 		return (error);
818 
819 	rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
820 
821 	/*
822 	 * If the caller wants it, then it can have it,
823 	 * the entry will be deleted after the end of the current epoch.
824 	 */
825 	if (rc->rc_cmd == RTM_DELETE)
826 		rtfree(rc->rc_rt);
827 #ifdef ROUTE_MPATH
828 	else {
829 		/*
830 		 * Deleting 1 path may result in RTM_CHANGE to
831 		 * a different mpath group/nhop.
832 		 * Free old mpath group.
833 		 */
834 		nhop_free_any(rc->rc_nh_old);
835 	}
836 #endif
837 
838 	return (0);
839 }
840 
841 int
842 rib_change_route(uint32_t fibnum, struct rt_addrinfo *info,
843     struct rib_cmd_info *rc)
844 {
845 	RIB_RLOCK_TRACKER;
846 	struct route_nhop_data rnd_orig;
847 	struct rib_head *rnh;
848 	struct rtentry *rt;
849 	int error;
850 
851 	NET_EPOCH_ASSERT();
852 
853 	rnh = get_rnh(fibnum, info);
854 	if (rnh == NULL)
855 		return (EAFNOSUPPORT);
856 
857 	bzero(rc, sizeof(struct rib_cmd_info));
858 	rc->rc_cmd = RTM_CHANGE;
859 
860 	/* Check if updated gateway exists */
861 	if ((info->rti_flags & RTF_GATEWAY) &&
862 	    (info->rti_info[RTAX_GATEWAY] == NULL)) {
863 
864 		/*
865 		 * route(8) adds RTF_GATEWAY flag if -interface is not set.
866 		 * Remove RTF_GATEWAY to enforce consistency and maintain
867 		 * compatibility..
868 		 */
869 		info->rti_flags &= ~RTF_GATEWAY;
870 	}
871 
872 	/*
873 	 * route change is done in multiple steps, with dropping and
874 	 * reacquiring lock. In the situations with multiple processes
875 	 * changes the same route in can lead to the case when route
876 	 * is changed between the steps. Address it by retrying the operation
877 	 * multiple times before failing.
878 	 */
879 
880 	RIB_RLOCK(rnh);
881 	rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST],
882 	    info->rti_info[RTAX_NETMASK], &rnh->head);
883 
884 	if (rt == NULL) {
885 		RIB_RUNLOCK(rnh);
886 		return (ESRCH);
887 	}
888 
889 	rnd_orig.rnd_nhop = rt->rt_nhop;
890 	rnd_orig.rnd_weight = rt->rt_weight;
891 
892 	RIB_RUNLOCK(rnh);
893 
894 	for (int i = 0; i < RIB_MAX_RETRIES; i++) {
895 		error = change_route(rnh, info, &rnd_orig, rc);
896 		if (error != EAGAIN)
897 			break;
898 	}
899 
900 	return (error);
901 }
902 
903 static int
904 change_nhop(struct rib_head *rnh, struct rt_addrinfo *info,
905     struct nhop_object *nh_orig, struct nhop_object **nh_new)
906 {
907 	int free_ifa = 0;
908 	int error;
909 
910 	/*
911 	 * New gateway could require new ifaddr, ifp;
912 	 * flags may also be different; ifp may be specified
913 	 * by ll sockaddr when protocol address is ambiguous
914 	 */
915 	if (((nh_orig->nh_flags & NHF_GATEWAY) &&
916 	    info->rti_info[RTAX_GATEWAY] != NULL) ||
917 	    info->rti_info[RTAX_IFP] != NULL ||
918 	    (info->rti_info[RTAX_IFA] != NULL &&
919 	     !sa_equal(info->rti_info[RTAX_IFA], nh_orig->nh_ifa->ifa_addr))) {
920 		error = rt_getifa_fib(info, rnh->rib_fibnum);
921 		if (info->rti_ifa != NULL)
922 			free_ifa = 1;
923 
924 		if (error != 0) {
925 			if (free_ifa) {
926 				ifa_free(info->rti_ifa);
927 				info->rti_ifa = NULL;
928 			}
929 
930 			return (error);
931 		}
932 	}
933 
934 	error = nhop_create_from_nhop(rnh, nh_orig, info, nh_new);
935 	if (free_ifa) {
936 		ifa_free(info->rti_ifa);
937 		info->rti_ifa = NULL;
938 	}
939 
940 	return (error);
941 }
942 
943 #ifdef ROUTE_MPATH
944 static int
945 change_mpath_route(struct rib_head *rnh, struct rt_addrinfo *info,
946     struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc)
947 {
948 	int error = 0;
949 	struct nhop_object *nh, *nh_orig, *nh_new;
950 	struct route_nhop_data rnd_new;
951 
952 	nh = NULL;
953 	nh_orig = rnd_orig->rnd_nhop;
954 
955 	struct weightened_nhop *wn = NULL, *wn_new;
956 	uint32_t num_nhops;
957 
958 	wn = nhgrp_get_nhops((struct nhgrp_object *)nh_orig, &num_nhops);
959 	nh_orig = NULL;
960 	for (int i = 0; i < num_nhops; i++) {
961 		if (check_info_match_nhop(info, NULL, wn[i].nh)) {
962 			nh_orig = wn[i].nh;
963 			break;
964 		}
965 	}
966 
967 	if (nh_orig == NULL)
968 		return (ESRCH);
969 
970 	error = change_nhop(rnh, info, nh_orig, &nh_new);
971 	if (error != 0)
972 		return (error);
973 
974 	wn_new = mallocarray(num_nhops, sizeof(struct weightened_nhop),
975 	    M_TEMP, M_NOWAIT | M_ZERO);
976 	if (wn_new == NULL) {
977 		nhop_free(nh_new);
978 		return (EAGAIN);
979 	}
980 
981 	memcpy(wn_new, wn, num_nhops * sizeof(struct weightened_nhop));
982 	for (int i = 0; i < num_nhops; i++) {
983 		if (wn[i].nh == nh_orig) {
984 			wn[i].nh = nh_new;
985 			wn[i].weight = get_info_weight(info, rnd_orig->rnd_weight);
986 			break;
987 		}
988 	}
989 
990 	error = nhgrp_get_group(rnh, wn_new, num_nhops, &rnd_new);
991 	nhop_free(nh_new);
992 	free(wn_new, M_TEMP);
993 
994 	if (error != 0)
995 		return (error);
996 
997 	error = change_route_conditional(rnh, NULL, info, rnd_orig, &rnd_new, rc);
998 
999 	return (error);
1000 }
1001 #endif
1002 
1003 static int
1004 change_route(struct rib_head *rnh, struct rt_addrinfo *info,
1005     struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc)
1006 {
1007 	int error = 0;
1008 	struct nhop_object *nh, *nh_orig;
1009 	struct route_nhop_data rnd_new;
1010 
1011 	nh = NULL;
1012 	nh_orig = rnd_orig->rnd_nhop;
1013 	if (nh_orig == NULL)
1014 		return (ESRCH);
1015 
1016 #ifdef ROUTE_MPATH
1017 	if (NH_IS_NHGRP(nh_orig))
1018 		return (change_mpath_route(rnh, info, rnd_orig, rc));
1019 #endif
1020 
1021 	rnd_new.rnd_weight = get_info_weight(info, rnd_orig->rnd_weight);
1022 	error = change_nhop(rnh, info, nh_orig, &rnd_new.rnd_nhop);
1023 	if (error != 0)
1024 		return (error);
1025 	error = change_route_conditional(rnh, NULL, info, rnd_orig, &rnd_new, rc);
1026 
1027 	return (error);
1028 }
1029 
1030 /*
1031  * Insert @rt with nhop data from @rnd_new to @rnh.
1032  * Returns 0 on success and stores operation results in @rc.
1033  */
1034 static int
1035 add_route_nhop(struct rib_head *rnh, struct rtentry *rt,
1036     struct rt_addrinfo *info, struct route_nhop_data *rnd,
1037     struct rib_cmd_info *rc)
1038 {
1039 	struct sockaddr *ndst, *netmask;
1040 	struct radix_node *rn;
1041 	int error = 0;
1042 
1043 	RIB_WLOCK_ASSERT(rnh);
1044 
1045 	ndst = (struct sockaddr *)rt_key(rt);
1046 	netmask = info->rti_info[RTAX_NETMASK];
1047 
1048 	rt->rt_nhop = rnd->rnd_nhop;
1049 	rt->rt_weight = rnd->rnd_weight;
1050 	rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head, rt->rt_nodes);
1051 
1052 	if (rn != NULL) {
1053 		if (rt->rt_expire > 0)
1054 			tmproutes_update(rnh, rt);
1055 
1056 		/* Finalize notification */
1057 		rnh->rnh_gen++;
1058 		rnh->rnh_prefixes++;
1059 
1060 		rc->rc_cmd = RTM_ADD;
1061 		rc->rc_rt = rt;
1062 		rc->rc_nh_old = NULL;
1063 		rc->rc_nh_new = rnd->rnd_nhop;
1064 		rc->rc_nh_weight = rnd->rnd_weight;
1065 
1066 		rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1067 	} else {
1068 		/* Existing route or memory allocation failure */
1069 		error = EEXIST;
1070 	}
1071 
1072 	return (error);
1073 }
1074 
1075 /*
1076  * Switch @rt nhop/weigh to the ones specified in @rnd.
1077  *  Conditionally set rt_expire if set in @info.
1078  * Returns 0 on success.
1079  */
1080 int
1081 change_route_nhop(struct rib_head *rnh, struct rtentry *rt,
1082     struct rt_addrinfo *info, struct route_nhop_data *rnd,
1083     struct rib_cmd_info *rc)
1084 {
1085 	struct nhop_object *nh_orig;
1086 
1087 	RIB_WLOCK_ASSERT(rnh);
1088 
1089 	nh_orig = rt->rt_nhop;
1090 
1091 	if (rnd->rnd_nhop != NULL) {
1092 		/* Changing expiration & nexthop & weight to a new one */
1093 		rt_set_expire_info(rt, info);
1094 		rt->rt_nhop = rnd->rnd_nhop;
1095 		rt->rt_weight = rnd->rnd_weight;
1096 		if (rt->rt_expire > 0)
1097 			tmproutes_update(rnh, rt);
1098 	} else {
1099 		/* Route deletion requested. */
1100 		struct sockaddr *ndst, *netmask;
1101 		struct radix_node *rn;
1102 
1103 		ndst = (struct sockaddr *)rt_key(rt);
1104 		netmask = info->rti_info[RTAX_NETMASK];
1105 		rn = rnh->rnh_deladdr(ndst, netmask, &rnh->head);
1106 		if (rn == NULL)
1107 			return (ESRCH);
1108 		rt = RNTORT(rn);
1109 		rt->rte_flags &= ~RTF_UP;
1110 	}
1111 
1112 	/* Finalize notification */
1113 	rnh->rnh_gen++;
1114 	if (rnd->rnd_nhop == NULL)
1115 		rnh->rnh_prefixes--;
1116 
1117 	rc->rc_cmd = (rnd->rnd_nhop != NULL) ? RTM_CHANGE : RTM_DELETE;
1118 	rc->rc_rt = rt;
1119 	rc->rc_nh_old = nh_orig;
1120 	rc->rc_nh_new = rnd->rnd_nhop;
1121 	rc->rc_nh_weight = rnd->rnd_weight;
1122 
1123 	rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1124 
1125 	return (0);
1126 }
1127 
1128 /*
1129  * Conditionally update route nhop/weight IFF data in @nhd_orig is
1130  *  consistent with the current route data.
1131  * Nexthop in @nhd_new is consumed.
1132  */
1133 int
1134 change_route_conditional(struct rib_head *rnh, struct rtentry *rt,
1135     struct rt_addrinfo *info, struct route_nhop_data *rnd_orig,
1136     struct route_nhop_data *rnd_new, struct rib_cmd_info *rc)
1137 {
1138 	struct rtentry *rt_new;
1139 	int error = 0;
1140 
1141 	RIB_WLOCK(rnh);
1142 
1143 	rt_new = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST],
1144 	    info->rti_info[RTAX_NETMASK], &rnh->head);
1145 
1146 	if (rt_new == NULL) {
1147 		if (rnd_orig->rnd_nhop == NULL)
1148 			error = add_route_nhop(rnh, rt, info, rnd_new, rc);
1149 		else {
1150 			/*
1151 			 * Prefix does not exist, which was not our assumption.
1152 			 * Update @rnd_orig with the new data and return
1153 			 */
1154 			rnd_orig->rnd_nhop = NULL;
1155 			rnd_orig->rnd_weight = 0;
1156 			error = EAGAIN;
1157 		}
1158 	} else {
1159 		/* Prefix exists, try to update */
1160 		if (rnd_orig->rnd_nhop == rt_new->rt_nhop) {
1161 			/*
1162 			 * Nhop/mpath group hasn't changed. Flip
1163 			 * to the new precalculated one and return
1164 			 */
1165 			error = change_route_nhop(rnh, rt_new, info, rnd_new, rc);
1166 		} else {
1167 			/* Update and retry */
1168 			rnd_orig->rnd_nhop = rt_new->rt_nhop;
1169 			rnd_orig->rnd_weight = rt_new->rt_weight;
1170 			error = EAGAIN;
1171 		}
1172 	}
1173 
1174 	RIB_WUNLOCK(rnh);
1175 
1176 	if (error == 0) {
1177 		rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
1178 
1179 		if (rnd_orig->rnd_nhop != NULL)
1180 			nhop_free_any(rnd_orig->rnd_nhop);
1181 
1182 	} else {
1183 		if (rnd_new->rnd_nhop != NULL)
1184 			nhop_free_any(rnd_new->rnd_nhop);
1185 	}
1186 
1187 	return (error);
1188 }
1189 
1190 /*
1191  * Performs modification of routing table specificed by @action.
1192  * Table is specified by @fibnum and sa_family in @info->rti_info[RTAX_DST].
1193  * Needs to be run in network epoch.
1194  *
1195  * Returns 0 on success and fills in @rc with action result.
1196  */
1197 int
1198 rib_action(uint32_t fibnum, int action, struct rt_addrinfo *info,
1199     struct rib_cmd_info *rc)
1200 {
1201 	int error;
1202 
1203 	switch (action) {
1204 	case RTM_ADD:
1205 		error = rib_add_route(fibnum, info, rc);
1206 		break;
1207 	case RTM_DELETE:
1208 		error = rib_del_route(fibnum, info, rc);
1209 		break;
1210 	case RTM_CHANGE:
1211 		error = rib_change_route(fibnum, info, rc);
1212 		break;
1213 	default:
1214 		error = ENOTSUP;
1215 	}
1216 
1217 	return (error);
1218 }
1219 
1220 struct rt_delinfo
1221 {
1222 	struct rt_addrinfo info;
1223 	struct rib_head *rnh;
1224 	struct rtentry *head;
1225 	struct rib_cmd_info rc;
1226 };
1227 
1228 /*
1229  * Conditionally unlinks @rn from radix tree based
1230  * on info data passed in @arg.
1231  */
1232 static int
1233 rt_checkdelroute(struct radix_node *rn, void *arg)
1234 {
1235 	struct rt_delinfo *di;
1236 	struct rt_addrinfo *info;
1237 	struct rtentry *rt;
1238 	int error;
1239 
1240 	di = (struct rt_delinfo *)arg;
1241 	rt = (struct rtentry *)rn;
1242 	info = &di->info;
1243 
1244 	info->rti_info[RTAX_DST] = rt_key(rt);
1245 	info->rti_info[RTAX_NETMASK] = rt_mask(rt);
1246 
1247 	error = rt_unlinkrte(di->rnh, info, &di->rc);
1248 
1249 	/*
1250 	 * Add deleted rtentries to the list to GC them
1251 	 *  after dropping the lock.
1252 	 *
1253 	 * XXX: Delayed notifications not implemented
1254 	 *  for nexthop updates.
1255 	 */
1256 	if ((error == 0) && (di->rc.rc_cmd == RTM_DELETE)) {
1257 		/* Add to the list and return */
1258 		rt->rt_chain = di->head;
1259 		di->head = rt;
1260 	}
1261 
1262 	return (0);
1263 }
1264 
1265 /*
1266  * Iterates over a routing table specified by @fibnum and @family and
1267  *  deletes elements marked by @filter_f.
1268  * @fibnum: rtable id
1269  * @family: AF_ address family
1270  * @filter_f: function returning non-zero value for items to delete
1271  * @arg: data to pass to the @filter_f function
1272  * @report: true if rtsock notification is needed.
1273  */
1274 void
1275 rib_walk_del(u_int fibnum, int family, rib_filter_f_t *filter_f, void *arg, bool report)
1276 {
1277 	struct rib_head *rnh;
1278 	struct rt_delinfo di;
1279 	struct rtentry *rt;
1280 	struct nhop_object *nh;
1281 	struct epoch_tracker et;
1282 
1283 	rnh = rt_tables_get_rnh(fibnum, family);
1284 	if (rnh == NULL)
1285 		return;
1286 
1287 	bzero(&di, sizeof(di));
1288 	di.info.rti_filter = filter_f;
1289 	di.info.rti_filterdata = arg;
1290 	di.rnh = rnh;
1291 	di.rc.rc_cmd = RTM_DELETE;
1292 
1293 	NET_EPOCH_ENTER(et);
1294 
1295 	RIB_WLOCK(rnh);
1296 	rnh->rnh_walktree(&rnh->head, rt_checkdelroute, &di);
1297 	RIB_WUNLOCK(rnh);
1298 
1299 	/* We might have something to reclaim. */
1300 	bzero(&di.rc, sizeof(di.rc));
1301 	di.rc.rc_cmd = RTM_DELETE;
1302 	while (di.head != NULL) {
1303 		rt = di.head;
1304 		di.head = rt->rt_chain;
1305 		rt->rt_chain = NULL;
1306 		nh = rt->rt_nhop;
1307 
1308 		di.rc.rc_rt = rt;
1309 		di.rc.rc_nh_old = nh;
1310 		rib_notify(rnh, RIB_NOTIFY_DELAYED, &di.rc);
1311 
1312 		/* TODO std rt -> rt_addrinfo export */
1313 		di.info.rti_info[RTAX_DST] = rt_key(rt);
1314 		di.info.rti_info[RTAX_NETMASK] = rt_mask(rt);
1315 
1316 		if (report) {
1317 #ifdef ROUTE_MPATH
1318 			struct nhgrp_object *nhg;
1319 			struct weightened_nhop *wn;
1320 			uint32_t num_nhops;
1321 			if (NH_IS_NHGRP(nh)) {
1322 				nhg = (struct nhgrp_object *)nh;
1323 				wn = nhgrp_get_nhops(nhg, &num_nhops);
1324 				for (int i = 0; i < num_nhops; i++)
1325 					rt_routemsg(RTM_DELETE, rt,
1326 					    wn[i].nh->nh_ifp, 0, fibnum);
1327 			} else
1328 #endif
1329 			rt_routemsg(RTM_DELETE, rt, nh->nh_ifp, 0, fibnum);
1330 		}
1331 		rtfree(rt);
1332 	}
1333 
1334 	NET_EPOCH_EXIT(et);
1335 }
1336 
1337 static void
1338 rib_notify(struct rib_head *rnh, enum rib_subscription_type type,
1339     struct rib_cmd_info *rc)
1340 {
1341 	struct rib_subscription *rs;
1342 
1343 	CK_STAILQ_FOREACH(rs, &rnh->rnh_subscribers, next) {
1344 		if (rs->type == type)
1345 			rs->func(rnh, rc, rs->arg);
1346 	}
1347 }
1348 
1349 static struct rib_subscription *
1350 allocate_subscription(rib_subscription_cb_t *f, void *arg,
1351     enum rib_subscription_type type, bool waitok)
1352 {
1353 	struct rib_subscription *rs;
1354 	int flags = M_ZERO | (waitok ? M_WAITOK : M_NOWAIT);
1355 
1356 	rs = malloc(sizeof(struct rib_subscription), M_RTABLE, flags);
1357 	if (rs == NULL)
1358 		return (NULL);
1359 
1360 	rs->func = f;
1361 	rs->arg = arg;
1362 	rs->type = type;
1363 
1364 	return (rs);
1365 }
1366 
1367 /*
1368  * Subscribe for the changes in the routing table specified by @fibnum and
1369  *  @family.
1370  *
1371  * Returns pointer to the subscription structure on success.
1372  */
1373 struct rib_subscription *
1374 rib_subscribe(uint32_t fibnum, int family, rib_subscription_cb_t *f, void *arg,
1375     enum rib_subscription_type type, bool waitok)
1376 {
1377 	struct rib_head *rnh;
1378 	struct epoch_tracker et;
1379 
1380 	NET_EPOCH_ENTER(et);
1381 	KASSERT((fibnum < rt_numfibs), ("%s: bad fibnum", __func__));
1382 	rnh = rt_tables_get_rnh(fibnum, family);
1383 	NET_EPOCH_EXIT(et);
1384 
1385 	return (rib_subscribe_internal(rnh, f, arg, type, waitok));
1386 }
1387 
1388 struct rib_subscription *
1389 rib_subscribe_internal(struct rib_head *rnh, rib_subscription_cb_t *f, void *arg,
1390     enum rib_subscription_type type, bool waitok)
1391 {
1392 	struct rib_subscription *rs;
1393 	struct epoch_tracker et;
1394 
1395 	if ((rs = allocate_subscription(f, arg, type, waitok)) == NULL)
1396 		return (NULL);
1397 	rs->rnh = rnh;
1398 
1399 	NET_EPOCH_ENTER(et);
1400 	RIB_WLOCK(rnh);
1401 	CK_STAILQ_INSERT_TAIL(&rnh->rnh_subscribers, rs, next);
1402 	RIB_WUNLOCK(rnh);
1403 	NET_EPOCH_EXIT(et);
1404 
1405 	return (rs);
1406 }
1407 
1408 /*
1409  * Remove rtable subscription @rs from the routing table.
1410  * Needs to be run in network epoch.
1411  */
1412 void
1413 rib_unsibscribe(struct rib_subscription *rs)
1414 {
1415 	struct rib_head *rnh = rs->rnh;
1416 
1417 	NET_EPOCH_ASSERT();
1418 
1419 	RIB_WLOCK(rnh);
1420 	CK_STAILQ_REMOVE(&rnh->rnh_subscribers, rs, rib_subscription, next);
1421 	RIB_WUNLOCK(rnh);
1422 
1423 	epoch_call(net_epoch_preempt, destroy_subscription_epoch,
1424 	    &rs->epoch_ctx);
1425 }
1426 
1427 /*
1428  * Epoch callback indicating subscription is safe to destroy
1429  */
1430 static void
1431 destroy_subscription_epoch(epoch_context_t ctx)
1432 {
1433 	struct rib_subscription *rs;
1434 
1435 	rs = __containerof(ctx, struct rib_subscription, epoch_ctx);
1436 
1437 	free(rs, M_RTABLE);
1438 }
1439 
1440 void
1441 rib_init_subscriptions(struct rib_head *rnh)
1442 {
1443 
1444 	CK_STAILQ_INIT(&rnh->rnh_subscribers);
1445 }
1446 
1447 void
1448 rib_destroy_subscriptions(struct rib_head *rnh)
1449 {
1450 	struct rib_subscription *rs;
1451 	struct epoch_tracker et;
1452 
1453 	NET_EPOCH_ENTER(et);
1454 	RIB_WLOCK(rnh);
1455 	while ((rs = CK_STAILQ_FIRST(&rnh->rnh_subscribers)) != NULL) {
1456 		CK_STAILQ_REMOVE_HEAD(&rnh->rnh_subscribers, next);
1457 		epoch_call(net_epoch_preempt, destroy_subscription_epoch,
1458 		    &rs->epoch_ctx);
1459 	}
1460 	RIB_WUNLOCK(rnh);
1461 	NET_EPOCH_EXIT(et);
1462 }
1463