xref: /freebsd/sys/net/route/route_ctl.c (revision a3d9bf49b57923118c339642594246ef73872ee8)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2020 Alexander V. Chernikov
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 #include "opt_inet.h"
31 #include "opt_inet6.h"
32 #include "opt_mpath.h"
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/malloc.h>
37 #include <sys/mbuf.h>
38 #include <sys/socket.h>
39 #include <sys/sysctl.h>
40 #include <sys/syslog.h>
41 #include <sys/kernel.h>
42 #include <sys/lock.h>
43 #include <sys/rmlock.h>
44 
45 #include <net/if.h>
46 #include <net/if_var.h>
47 #include <net/if_dl.h>
48 #include <net/vnet.h>
49 #include <net/route.h>
50 #include <net/route/route_ctl.h>
51 #include <net/route/route_var.h>
52 #include <net/route/nhop_utils.h>
53 #include <net/route/nhop.h>
54 #include <net/route/nhop_var.h>
55 #include <netinet/in.h>
56 
57 #ifdef RADIX_MPATH
58 #include <net/radix_mpath.h>
59 #endif
60 
61 #include <vm/uma.h>
62 
63 /*
64  * This file contains control plane routing tables functions.
65  *
66  * All functions assumes they are called in net epoch.
67  */
68 
/* A single routing table change subscriber. */
struct rib_subscription {
	CK_STAILQ_ENTRY(rib_subscription)	next;	/* rnh_subscribers linkage */
	rib_subscription_cb_t			*func;	/* callback invoked on change */
	void					*arg;	/* opaque callback argument */
	enum rib_subscription_type		type;	/* immediate vs delayed notification */
	struct epoch_context			epoch_ctx;	/* for deferred free */
};
76 
77 static int add_route(struct rib_head *rnh, struct rt_addrinfo *info,
78     struct rib_cmd_info *rc);
79 static int add_route_nhop(struct rib_head *rnh, struct rtentry *rt,
80     struct rt_addrinfo *info, struct route_nhop_data *rnd,
81     struct rib_cmd_info *rc);
82 static int del_route(struct rib_head *rnh, struct rt_addrinfo *info,
83     struct rib_cmd_info *rc);
84 static int change_route(struct rib_head *rnh, struct rt_addrinfo *info,
85     struct route_nhop_data *nhd_orig, struct rib_cmd_info *rc);
86 static int change_route_nhop(struct rib_head *rnh, struct rtentry *rt,
87     struct rt_addrinfo *info, struct route_nhop_data *rnd,
88     struct rib_cmd_info *rc);
89 
90 static int rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info,
91     struct rib_cmd_info *rc);
92 
93 static void rib_notify(struct rib_head *rnh, enum rib_subscription_type type,
94     struct rib_cmd_info *rc);
95 
96 static void destroy_subscription_epoch(epoch_context_t ctx);
97 
98 /* Routing table UMA zone */
99 VNET_DEFINE_STATIC(uma_zone_t, rtzone);
100 #define	V_rtzone	VNET(rtzone)
101 
102 void
103 vnet_rtzone_init()
104 {
105 
106 	V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry),
107 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
108 }
109 
110 #ifdef VIMAGE
111 void
112 vnet_rtzone_destroy()
113 {
114 
115 	uma_zdestroy(V_rtzone);
116 }
117 #endif
118 
/*
 * Actually frees @rt and drops its nexthop reference.
 * Runs only after the epoch callback fired, so no reader can
 * still hold a pointer to @rt.
 */
static void
destroy_rtentry(struct rtentry *rt)
{

	/*
	 * At this moment rnh, nh_control may be already freed.
	 * nhop interface may have been migrated to a different vnet.
	 * Use vnet stored in the nexthop to delete the entry.
	 */
	CURVNET_SET(nhop_get_vnet(rt->rt_nhop));

	/* Unreference nexthop */
	nhop_free(rt->rt_nhop);

	uma_zfree(V_rtzone, rt);

	CURVNET_RESTORE();
}
137 
138 /*
139  * Epoch callback indicating rtentry is safe to destroy
140  */
141 static void
142 destroy_rtentry_epoch(epoch_context_t ctx)
143 {
144 	struct rtentry *rt;
145 
146 	rt = __containerof(ctx, struct rtentry, rt_epoch_ctx);
147 
148 	destroy_rtentry(rt);
149 }
150 
151 /*
152  * Schedule rtentry deletion
153  */
static void
rtfree(struct rtentry *rt)
{

	KASSERT(rt != NULL, ("%s: NULL rt", __func__));

	/*
	 * Defer the actual destruction until the current net epoch
	 * ends, so concurrent readers can keep using @rt until then.
	 */
	epoch_call(net_epoch_preempt, destroy_rtentry_epoch,
	    &rt->rt_epoch_ctx);
}
163 
164 static struct rib_head *
165 get_rnh(uint32_t fibnum, const struct rt_addrinfo *info)
166 {
167 	struct rib_head *rnh;
168 	struct sockaddr *dst;
169 
170 	KASSERT((fibnum < rt_numfibs), ("rib_add_route: bad fibnum"));
171 
172 	dst = info->rti_info[RTAX_DST];
173 	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
174 
175 	return (rnh);
176 }
177 
178 static int
179 get_info_weight(const struct rt_addrinfo *info, uint32_t default_weight)
180 {
181 	uint32_t weight;
182 
183 	if (info->rti_mflags & RTV_WEIGHT)
184 		weight = info->rti_rmx->rmx_weight;
185 	else
186 		weight = default_weight;
187 	/* Keep upper 1 byte for adm distance purposes */
188 	if (weight > RT_MAX_WEIGHT)
189 		weight = RT_MAX_WEIGHT;
190 
191 	return (weight);
192 }
193 
/*
 * Sets rt_expire on @rt from @info when RTV_EXPIRE is specified.
 * A zero rmx_expire clears the expiration.
 */
static void
rt_set_expire_info(struct rtentry *rt, const struct rt_addrinfo *info)
{

	/* Kernel -> userland timebase conversion. */
	if (info->rti_mflags & RTV_EXPIRE)
		rt->rt_expire = info->rti_rmx->rmx_expire ?
		    info->rti_rmx->rmx_expire - time_second + time_uptime : 0;
}
203 
204 /*
205  * Check if specified @gw matches gw data in the nexthop @nh.
206  *
207  * Returns true if matches, false otherwise.
208  */
static bool
match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw)
{

	/* Different families can never match. */
	if (nh->gw_sa.sa_family != gw->sa_family)
		return (false);

	switch (gw->sa_family) {
	case AF_INET:
		return (nh->gw4_sa.sin_addr.s_addr ==
		    ((const struct sockaddr_in *)gw)->sin_addr.s_addr);
	case AF_INET6:
		{
			const struct sockaddr_in6 *gw6;
			gw6 = (const struct sockaddr_in6 *)gw;

			/*
			 * Currently (2020-09) IPv6 gws in kernel have their
			 * scope embedded. Once this becomes false, this code
			 * has to be revisited.
			 */
			if (IN6_ARE_ADDR_EQUAL(&nh->gw6_sa.sin6_addr,
			    &gw6->sin6_addr))
				return (true);
			return (false);
		}
	case AF_LINK:
		{
			/* Link-level gateways compare by interface index. */
			const struct sockaddr_dl *sdl;
			sdl = (const struct sockaddr_dl *)gw;
			return (nh->gwl_sa.sdl_index == sdl->sdl_index);
		}
	default:
		/* Unknown family: fall back to raw sockaddr comparison. */
		return (memcmp(&nh->gw_sa, gw, nh->gw_sa.sa_len) == 0);
	}

	/* NOTREACHED */
	return (false);
}
248 
249 /*
250  * Checks if data in @info matches nexhop @nh.
251  *
252  * Returns 0 on success,
253  * ESRCH if not matched,
254  * ENOENT if filter function returned false
255  */
256 int
257 check_info_match_nhop(const struct rt_addrinfo *info, const struct rtentry *rt,
258     const struct nhop_object *nh)
259 {
260 	const struct sockaddr *gw = info->rti_info[RTAX_GATEWAY];
261 
262 	if (info->rti_filter != NULL) {
263 	    if (info->rti_filter(rt, nh, info->rti_filterdata) == 0)
264 		    return (ENOENT);
265 	    else
266 		    return (0);
267 	}
268 	if ((gw != NULL) && !match_nhop_gw(nh, gw))
269 		return (ESRCH);
270 
271 	return (0);
272 }
273 
274 /*
275  * Checks if nexhop @nh can be rewritten by data in @info because
276  *  of higher "priority". Currently the only case for such scenario
277  *  is kernel installing interface routes, marked by RTF_PINNED flag.
278  *
279  * Returns:
280  * 1 if @info data has higher priority
281  * 0 if priority is the same
282  * -1 if priority is lower
283  */
284 int
285 can_override_nhop(const struct rt_addrinfo *info, const struct nhop_object *nh)
286 {
287 
288 	if (info->rti_flags & RTF_PINNED) {
289 		return (NH_IS_PINNED(nh)) ? 0 : 1;
290 	} else {
291 		return (NH_IS_PINNED(nh)) ? -1 : 0;
292 	}
293 }
294 
295 /*
296  * Runs exact prefix match based on @dst and @netmask.
297  * Returns matched @rtentry if found or NULL.
298  * If rtentry was found, saves nexthop / weight value into @rnd.
299  */
300 static struct rtentry *
301 lookup_prefix_bysa(struct rib_head *rnh, const struct sockaddr *dst,
302     const struct sockaddr *netmask, struct route_nhop_data *rnd)
303 {
304 	struct rtentry *rt;
305 
306 	RIB_LOCK_ASSERT(rnh);
307 
308 	rt = (struct rtentry *)rnh->rnh_lookup(__DECONST(void *, dst),
309 	    __DECONST(void *, netmask), &rnh->head);
310 	if (rt != NULL) {
311 		rnd->rnd_nhop = rt->rt_nhop;
312 		rnd->rnd_weight = rt->rt_weight;
313 	} else {
314 		rnd->rnd_nhop = NULL;
315 		rnd->rnd_weight = 0;
316 	}
317 
318 	return (rt);
319 }
320 
321 /*
322  * Runs exact prefix match based on dst/netmask from @info.
323  * Assumes RIB lock is held.
324  * Returns matched @rtentry if found or NULL.
325  * If rtentry was found, saves nexthop / weight value into @rnd.
326  */
327 struct rtentry *
328 lookup_prefix(struct rib_head *rnh, const struct rt_addrinfo *info,
329     struct route_nhop_data *rnd)
330 {
331 	struct rtentry *rt;
332 
333 	rt = lookup_prefix_bysa(rnh, info->rti_info[RTAX_DST],
334 	    info->rti_info[RTAX_NETMASK], rnd);
335 
336 	return (rt);
337 }
338 
339 /*
340  * Adds route defined by @info into the kernel table specified by @fibnum and
341  * sa_family in @info->rti_info[RTAX_DST].
342  *
343  * Returns 0 on success and fills in operation metadata into @rc.
344  */
int
rib_add_route(uint32_t fibnum, struct rt_addrinfo *info,
    struct rib_cmd_info *rc)
{
	struct rib_head *rnh;
	int error;

	NET_EPOCH_ASSERT();

	rnh = get_rnh(fibnum, info);
	if (rnh == NULL)
		return (EAFNOSUPPORT);

	/*
	 * Check consistency between RTF_HOST flag and netmask
	 * existence.
	 */
	if (info->rti_flags & RTF_HOST)
		info->rti_info[RTAX_NETMASK] = NULL;
	else if (info->rti_info[RTAX_NETMASK] == NULL)
		return (EINVAL);

	bzero(rc, sizeof(struct rib_cmd_info));
	rc->rc_cmd = RTM_ADD;

	/* IMMEDIATE subscribers are notified under the RIB lock inside. */
	error = add_route(rnh, info, rc);
	if (error == 0)
		rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);

	return (error);
}
376 
377 /*
378  * Creates rtentry and nexthop based on @info data.
379  * Return 0 and fills in rtentry into @prt on success,
380  * return errno otherwise.
381  */
382 static int
383 create_rtentry(struct rib_head *rnh, struct rt_addrinfo *info,
384     struct rtentry **prt)
385 {
386 	struct sockaddr *dst, *ndst, *gateway, *netmask;
387 	struct rtentry *rt;
388 	struct nhop_object *nh;
389 	struct ifaddr *ifa;
390 	int error, flags;
391 
392 	dst = info->rti_info[RTAX_DST];
393 	gateway = info->rti_info[RTAX_GATEWAY];
394 	netmask = info->rti_info[RTAX_NETMASK];
395 	flags = info->rti_flags;
396 
397 	if ((flags & RTF_GATEWAY) && !gateway)
398 		return (EINVAL);
399 	if (dst && gateway && (dst->sa_family != gateway->sa_family) &&
400 	    (gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK))
401 		return (EINVAL);
402 
403 	if (dst->sa_len > sizeof(((struct rtentry *)NULL)->rt_dstb))
404 		return (EINVAL);
405 
406 	if (info->rti_ifa == NULL) {
407 		error = rt_getifa_fib(info, rnh->rib_fibnum);
408 		if (error)
409 			return (error);
410 	} else {
411 		ifa_ref(info->rti_ifa);
412 	}
413 
414 	error = nhop_create_from_info(rnh, info, &nh);
415 	if (error != 0) {
416 		ifa_free(info->rti_ifa);
417 		return (error);
418 	}
419 
420 	rt = uma_zalloc(V_rtzone, M_NOWAIT | M_ZERO);
421 	if (rt == NULL) {
422 		ifa_free(info->rti_ifa);
423 		nhop_free(nh);
424 		return (ENOBUFS);
425 	}
426 	rt->rte_flags = (RTF_UP | flags) & RTE_RT_FLAG_MASK;
427 	rt->rt_nhop = nh;
428 
429 	/* Fill in dst */
430 	memcpy(&rt->rt_dst, dst, dst->sa_len);
431 	rt_key(rt) = &rt->rt_dst;
432 
433 	/*
434 	 * point to the (possibly newly malloc'd) dest address.
435 	 */
436 	ndst = (struct sockaddr *)rt_key(rt);
437 
438 	/*
439 	 * make sure it contains the value we want (masked if needed).
440 	 */
441 	if (netmask) {
442 		rt_maskedcopy(dst, ndst, netmask);
443 	} else
444 		bcopy(dst, ndst, dst->sa_len);
445 
446 	/*
447 	 * We use the ifa reference returned by rt_getifa_fib().
448 	 * This moved from below so that rnh->rnh_addaddr() can
449 	 * examine the ifa and  ifa->ifa_ifp if it so desires.
450 	 */
451 	ifa = info->rti_ifa;
452 	rt->rt_weight = get_info_weight(info, RT_DEFAULT_WEIGHT);
453 	rt_set_expire_info(rt, info);
454 
455 	*prt = rt;
456 	return (0);
457 }
458 
/*
 * Creates rtentry/nexthop from @info and attempts to insert it into @rnh.
 * On insertion conflict, replaces the existing route if @info has higher
 * priority (RTF_PINNED), otherwise fails with EEXIST.
 * Returns 0 on success and fills in @rc.
 */
static int
add_route(struct rib_head *rnh, struct rt_addrinfo *info,
    struct rib_cmd_info *rc)
{
	struct nhop_object *nh_orig;
	struct route_nhop_data rnd;
	struct nhop_object *nh;
	struct rtentry *rt, *rt_orig;
	int error;

	/* Allocate rtentry + nexthop before taking the RIB lock. */
	error = create_rtentry(rnh, info, &rt);
	if (error != 0)
		return (error);

	rnd.rnd_nhop = rt->rt_nhop;
	rnd.rnd_weight = rt->rt_weight;
	nh = rt->rt_nhop;

	RIB_WLOCK(rnh);
#ifdef RADIX_MPATH
	struct sockaddr *netmask;
	netmask = info->rti_info[RTAX_NETMASK];
	/* do not permit exactly the same dst/mask/gw pair */
	if (rt_mpath_capable(rnh) &&
		rt_mpath_conflict(rnh, rt, netmask)) {
		RIB_WUNLOCK(rnh);

		nhop_free(nh);
		uma_zfree(V_rtzone, rt);
		return (EEXIST);
	}
#endif
	error = add_route_nhop(rnh, rt, info, &rnd, rc);
	if (error == 0) {
		RIB_WUNLOCK(rnh);
		return (0);
	}

	/* addition failed. Lookup prefix in the rib to determine the cause */
	rt_orig = lookup_prefix(rnh, info, &rnd);
	if (rt_orig == NULL) {
		/* No prefix -> rnh_addaddr() failed to allocate memory */
		RIB_WUNLOCK(rnh);
		nhop_free(nh);
		uma_zfree(V_rtzone, rt);
		return (ENOMEM);
	}

	/* We have existing route in the RIB. */
	nh_orig = rnd.rnd_nhop;
	/* Check if new route has higher preference */
	if (can_override_nhop(info, nh_orig) > 0) {
		/* Update nexthop to the new route */
		change_route_nhop(rnh, rt_orig, info, &rnd, rc);
		RIB_WUNLOCK(rnh);
		/* New nexthop was consumed by rt_orig; free the spare rt. */
		uma_zfree(V_rtzone, rt);
		nhop_free(nh_orig);
		return (0);
	}

	RIB_WUNLOCK(rnh);

	/* Unable to add - another route with the same preference exists */
	error = EEXIST;

	nhop_free(nh);
	uma_zfree(V_rtzone, rt);

	return (error);
}
529 
530 /*
531  * Removes route defined by @info from the kernel table specified by @fibnum and
532  * sa_family in @info->rti_info[RTAX_DST].
533  *
534  * Returns 0 on success and fills in operation metadata into @rc.
535  */
int
rib_del_route(uint32_t fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc)
{
	struct rib_head *rnh;
	struct sockaddr *dst_orig, *netmask;
	struct sockaddr_storage mdst;
	int error;

	NET_EPOCH_ASSERT();

	rnh = get_rnh(fibnum, info);
	if (rnh == NULL)
		return (EAFNOSUPPORT);

	bzero(rc, sizeof(struct rib_cmd_info));
	rc->rc_cmd = RTM_DELETE;

	dst_orig = info->rti_info[RTAX_DST];
	netmask = info->rti_info[RTAX_NETMASK];

	if (netmask != NULL) {
		/* Ensure @dst is always properly masked */
		if (dst_orig->sa_len > sizeof(mdst))
			return (EINVAL);
		rt_maskedcopy(dst_orig, (struct sockaddr *)&mdst, netmask);
		info->rti_info[RTAX_DST] = (struct sockaddr *)&mdst;
	}
	error = del_route(rnh, info, rc);
	/* Restore caller-provided dst pointer: mdst lives on our stack. */
	info->rti_info[RTAX_DST] = dst_orig;

	return (error);
}
568 
569 /*
570  * Conditionally unlinks rtentry matching data inside @info from @rnh.
571  * Returns 0 on success with operation result stored in @rc.
572  * On error, returns:
573  * ESRCH - if prefix was not found,
574  * EADDRINUSE - if trying to delete higher priority route.
575  * ENOENT - if supplied filter function returned 0 (not matched).
576  */
static int
rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, struct rib_cmd_info *rc)
{
	struct rtentry *rt;
	struct nhop_object *nh;
	struct radix_node *rn;
	struct route_nhop_data rnd;
	int error;

	/* Caller holds the RIB write lock (rnh_deladdr below mutates). */
	rt = lookup_prefix(rnh, info, &rnd);
	if (rt == NULL)
		return (ESRCH);

	nh = rt->rt_nhop;

	/* Apply gateway/filter matching before unlinking. */
	error = check_info_match_nhop(info, rt, nh);
	if (error != 0)
		return (error);

	/* Refuse to delete higher-priority (pinned) routes. */
	if (can_override_nhop(info, nh) < 0)
		return (EADDRINUSE);

	/*
	 * Remove the item from the tree and return it.
	 * Complain if it is not there and do no more processing.
	 */
#ifdef RADIX_MPATH
	info->rti_info[RTAX_GATEWAY] = &nh->gw_sa;
	if (rt_mpath_capable(rnh)) {
		rn = rt_mpath_unlink(rnh, info, rt, &error);
		if (error != 0)
			return (error);
	} else
#endif
	rn = rnh->rnh_deladdr(info->rti_info[RTAX_DST],
	    info->rti_info[RTAX_NETMASK], &rnh->head);
	if (rn == NULL)
		return (ESRCH);

	if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT))
		panic ("rtrequest delete");

	rt = RNTORT(rn);
	rt->rte_flags &= ~RTF_UP;

	/* Finalize notification */
	rnh->rnh_gen++;
	rc->rc_cmd = RTM_DELETE;
	rc->rc_rt = rt;
	rc->rc_nh_old = rt->rt_nhop;
	rc->rc_nh_weight = rt->rt_weight;
	rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);

	return (0);
}
632 
/*
 * Unlinks the route matching @info under the RIB write lock, then
 * emits the delayed notification and schedules rtentry reclamation.
 * Returns 0 on success with the result stored in @rc.
 */
static int
del_route(struct rib_head *rnh, struct rt_addrinfo *info,
    struct rib_cmd_info *rc)
{
	int error;

	RIB_WLOCK(rnh);
	error = rt_unlinkrte(rnh, info, rc);
	RIB_WUNLOCK(rnh);
	if (error != 0)
		return (error);

	rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);

	/*
	 * If the caller wants it, then it can have it,
	 * the entry will be deleted after the end of the current epoch.
	 */
	rtfree(rc->rc_rt);

	return (0);
}
655 
/*
 * Changes the route matching @info in the table specified by @fibnum.
 * Returns 0 on success and fills in operation metadata into @rc.
 */
int
rib_change_route(uint32_t fibnum, struct rt_addrinfo *info,
    struct rib_cmd_info *rc)
{
	RIB_RLOCK_TRACKER;
	struct route_nhop_data rnd_orig;
	struct rib_head *rnh;
	struct rtentry *rt;
	int error;

	NET_EPOCH_ASSERT();

	rnh = get_rnh(fibnum, info);
	if (rnh == NULL)
		return (EAFNOSUPPORT);

	bzero(rc, sizeof(struct rib_cmd_info));
	rc->rc_cmd = RTM_CHANGE;

	/* Check if updated gateway exists */
	if ((info->rti_flags & RTF_GATEWAY) &&
	    (info->rti_info[RTAX_GATEWAY] == NULL))
		return (EINVAL);

	/*
	 * Route change is done in multiple steps, with dropping and
	 * reacquiring the lock. When multiple processes change the same
	 * route concurrently, the route may be modified between the
	 * steps. Address it by retrying the operation multiple times
	 * before failing.
	 */

	RIB_RLOCK(rnh);
	rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST],
	    info->rti_info[RTAX_NETMASK], &rnh->head);

	if (rt == NULL) {
		RIB_RUNLOCK(rnh);
		return (ESRCH);
	}

#ifdef RADIX_MPATH
	/*
	 * If we got multipath routes,
	 * we require users to specify a matching RTAX_GATEWAY.
	 */
	if (rt_mpath_capable(rnh)) {
		rt = rt_mpath_matchgate(rt, info->rti_info[RTAX_GATEWAY]);
		if (rt == NULL) {
			RIB_RUNLOCK(rnh);
			return (ESRCH);
		}
	}
#endif
	/* Snapshot current nhop/weight to detect concurrent changes. */
	rnd_orig.rnd_nhop = rt->rt_nhop;
	rnd_orig.rnd_weight = rt->rt_weight;

	RIB_RUNLOCK(rnh);

	for (int i = 0; i < RIB_MAX_RETRIES; i++) {
		error = change_route(rnh, info, &rnd_orig, rc);
		if (error != EAGAIN)
			break;
	}

	return (error);
}
723 
/*
 * One attempt at changing the route described by @info, assuming the
 * route currently has the data snapshotted in @rnd_orig.
 * Returns 0 on success, EAGAIN if the route changed under us (the
 * caller retries with the updated @rnd_orig), or another errno.
 */
static int
change_route(struct rib_head *rnh, struct rt_addrinfo *info,
    struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc)
{
	int error = 0;
	int free_ifa = 0;
	struct nhop_object *nh, *nh_orig;
	struct route_nhop_data rnd_new;

	nh = NULL;
	nh_orig = rnd_orig->rnd_nhop;
	if (nh_orig == NULL)
		return (ESRCH);

	/*
	 * New gateway could require new ifaddr, ifp;
	 * flags may also be different; ifp may be specified
	 * by ll sockaddr when protocol address is ambiguous
	 */
	if (((nh_orig->nh_flags & NHF_GATEWAY) &&
	    info->rti_info[RTAX_GATEWAY] != NULL) ||
	    info->rti_info[RTAX_IFP] != NULL ||
	    (info->rti_info[RTAX_IFA] != NULL &&
	     !sa_equal(info->rti_info[RTAX_IFA], nh_orig->nh_ifa->ifa_addr))) {
		error = rt_getifa_fib(info, rnh->rib_fibnum);
		/* rt_getifa_fib() may set rti_ifa even on failure. */
		if (info->rti_ifa != NULL)
			free_ifa = 1;

		if (error != 0) {
			if (free_ifa) {
				ifa_free(info->rti_ifa);
				info->rti_ifa = NULL;
			}

			return (error);
		}
	}

	error = nhop_create_from_nhop(rnh, nh_orig, info, &nh);
	if (free_ifa) {
		ifa_free(info->rti_ifa);
		info->rti_ifa = NULL;
	}
	if (error != 0)
		return (error);

	rnd_new.rnd_nhop = nh;
	if (info->rti_mflags & RTV_WEIGHT)
		rnd_new.rnd_weight = info->rti_rmx->rmx_weight;
	else
		rnd_new.rnd_weight = rnd_orig->rnd_weight;

	/* Consumes @nh regardless of the outcome. */
	error = change_route_conditional(rnh, NULL, info, rnd_orig, &rnd_new, rc);

	return (error);
}
780 
781 /*
782  * Insert @rt with nhop data from @rnd_new to @rnh.
783  * Returns 0 on success and stores operation results in @rc.
784  */
static int
add_route_nhop(struct rib_head *rnh, struct rtentry *rt,
    struct rt_addrinfo *info, struct route_nhop_data *rnd,
    struct rib_cmd_info *rc)
{
	struct sockaddr *ndst, *netmask;
	struct radix_node *rn;
	int error = 0;

	RIB_WLOCK_ASSERT(rnh);

	ndst = (struct sockaddr *)rt_key(rt);
	netmask = info->rti_info[RTAX_NETMASK];

	rt->rt_nhop = rnd->rnd_nhop;
	rt->rt_weight = rnd->rnd_weight;
	/* rnh_addaddr() returns NULL on duplicate key or alloc failure. */
	rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head, rt->rt_nodes);

	if (rn != NULL) {
		/* Track temporary (expiring) routes for periodic GC. */
		if (rt->rt_expire > 0)
			tmproutes_update(rnh, rt);

		/* Finalize notification */
		rnh->rnh_gen++;

		rc->rc_cmd = RTM_ADD;
		rc->rc_rt = rt;
		rc->rc_nh_old = NULL;
		rc->rc_nh_new = rnd->rnd_nhop;
		rc->rc_nh_weight = rnd->rnd_weight;

		rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
	} else {
		/* Existing route or memory allocation failure */
		error = EEXIST;
	}

	return (error);
}
824 
825 /*
826  * Switch @rt nhop/weigh to the ones specified in @rnd.
827  *  Conditionally set rt_expire if set in @info.
828  * Returns 0 on success.
829  */
static int
change_route_nhop(struct rib_head *rnh, struct rtentry *rt,
    struct rt_addrinfo *info, struct route_nhop_data *rnd,
    struct rib_cmd_info *rc)
{
	struct nhop_object *nh_orig;

	RIB_WLOCK_ASSERT(rnh);

	nh_orig = rt->rt_nhop;

	if (rnd->rnd_nhop != NULL) {
		/* Changing expiration & nexthop & weight to a new one */
		rt_set_expire_info(rt, info);
		rt->rt_nhop = rnd->rnd_nhop;
		rt->rt_weight = rnd->rnd_weight;
		if (rt->rt_expire > 0)
			tmproutes_update(rnh, rt);
	} else {
		/* Route deletion requested. */
		struct sockaddr *ndst, *netmask;
		struct radix_node *rn;

		ndst = (struct sockaddr *)rt_key(rt);
		netmask = info->rti_info[RTAX_NETMASK];
		rn = rnh->rnh_deladdr(ndst, netmask, &rnh->head);
		if (rn == NULL)
			return (ESRCH);
	}

	/* Finalize notification */
	rnh->rnh_gen++;

	rc->rc_cmd = (rnd->rnd_nhop != NULL) ? RTM_CHANGE : RTM_DELETE;
	rc->rc_rt = rt;
	rc->rc_nh_old = nh_orig;
	rc->rc_nh_new = rnd->rnd_nhop;
	rc->rc_nh_weight = rnd->rnd_weight;

	rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);

	return (0);
}
873 
874 /*
875  * Conditionally update route nhop/weight IFF data in @nhd_orig is
876  *  consistent with the current route data.
877  * Nexthop in @nhd_new is consumed.
878  */
int
change_route_conditional(struct rib_head *rnh, struct rtentry *rt,
    struct rt_addrinfo *info, struct route_nhop_data *rnd_orig,
    struct route_nhop_data *rnd_new, struct rib_cmd_info *rc)
{
	struct rtentry *rt_new;
	int error = 0;

	RIB_WLOCK(rnh);

	/* Re-lookup: the route may have changed since the snapshot. */
	rt_new = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST],
	    info->rti_info[RTAX_NETMASK], &rnh->head);

	if (rt_new == NULL) {
		if (rnd_orig->rnd_nhop == NULL)
			error = add_route_nhop(rnh, rt, info, rnd_new, rc);
		else {
			/*
			 * Prefix does not exist, which was not our assumption.
			 * Update @rnd_orig with the new data and return
			 */
			rnd_orig->rnd_nhop = NULL;
			rnd_orig->rnd_weight = 0;
			error = EAGAIN;
		}
	} else {
		/* Prefix exists, try to update */
		if (rnd_orig->rnd_nhop == rt_new->rt_nhop) {
			/*
			 * Nhop/mpath group hasn't changed. Flip
			 * to the new precalculated one and return
			 */
			error = change_route_nhop(rnh, rt_new, info, rnd_new, rc);
		} else {
			/* Update and retry */
			rnd_orig->rnd_nhop = rt_new->rt_nhop;
			rnd_orig->rnd_weight = rt_new->rt_weight;
			error = EAGAIN;
		}
	}

	RIB_WUNLOCK(rnh);

	if (error == 0) {
		/* Success: drop the reference on the replaced nexthop. */
		rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);

		if (rnd_orig->rnd_nhop != NULL)
			nhop_free_any(rnd_orig->rnd_nhop);

	} else {
		/* Failure: the new (unused) nexthop is consumed here. */
		if (rnd_new->rnd_nhop != NULL)
			nhop_free_any(rnd_new->rnd_nhop);
	}

	return (error);
}
935 
936 /*
937  * Performs modification of routing table specificed by @action.
938  * Table is specified by @fibnum and sa_family in @info->rti_info[RTAX_DST].
939  * Needs to be run in network epoch.
940  *
941  * Returns 0 on success and fills in @rc with action result.
942  */
943 int
944 rib_action(uint32_t fibnum, int action, struct rt_addrinfo *info,
945     struct rib_cmd_info *rc)
946 {
947 	int error;
948 
949 	switch (action) {
950 	case RTM_ADD:
951 		error = rib_add_route(fibnum, info, rc);
952 		break;
953 	case RTM_DELETE:
954 		error = rib_del_route(fibnum, info, rc);
955 		break;
956 	case RTM_CHANGE:
957 		error = rib_change_route(fibnum, info, rc);
958 		break;
959 	default:
960 		error = ENOTSUP;
961 	}
962 
963 	return (error);
964 }
965 
/* Walk state for rib_walk_del() / rt_checkdelroute(). */
struct rt_delinfo
{
	struct rt_addrinfo info;	/* match criteria (filter func/data) */
	struct rib_head *rnh;		/* table being walked */
	struct rtentry *head;		/* unlinked entries, chained via rt_chain */
	struct rib_cmd_info rc;		/* scratch result for rt_unlinkrte() */
};
973 
974 /*
975  * Conditionally unlinks @rn from radix tree based
976  * on info data passed in @arg.
977  */
static int
rt_checkdelroute(struct radix_node *rn, void *arg)
{
	struct rt_delinfo *di;
	struct rt_addrinfo *info;
	struct rtentry *rt;
	int error;

	di = (struct rt_delinfo *)arg;
	rt = (struct rtentry *)rn;
	info = &di->info;

	/* Fill in lookup keys for this particular entry. */
	info->rti_info[RTAX_DST] = rt_key(rt);
	info->rti_info[RTAX_NETMASK] = rt_mask(rt);
	info->rti_info[RTAX_GATEWAY] = &rt->rt_nhop->gw_sa;

	error = rt_unlinkrte(di->rnh, info, &di->rc);

	/*
	 * Add deleted rtentries to the list to GC them
	 *  after dropping the lock.
	 *
	 * XXX: Delayed notifications not implemented
	 *  for nexthop updates.
	 */
	if (error == 0) {
		/* Add to the list and return */
		rt->rt_chain = di->head;
		di->head = rt;
	}

	/* Always 0: walk continues regardless of per-entry result. */
	return (0);
}
1011 
1012 /*
1013  * Iterates over a routing table specified by @fibnum and @family and
1014  *  deletes elements marked by @filter_f.
1015  * @fibnum: rtable id
1016  * @family: AF_ address family
1017  * @filter_f: function returning non-zero value for items to delete
1018  * @arg: data to pass to the @filter_f function
1019  * @report: true if rtsock notification is needed.
1020  */
void
rib_walk_del(u_int fibnum, int family, rt_filter_f_t *filter_f, void *arg, bool report)
{
	struct rib_head *rnh;
	struct rt_delinfo di;
	struct rtentry *rt;
	struct epoch_tracker et;

	rnh = rt_tables_get_rnh(fibnum, family);
	if (rnh == NULL)
		return;

	bzero(&di, sizeof(di));
	di.info.rti_filter = filter_f;
	di.info.rti_filterdata = arg;
	di.rnh = rnh;
	di.rc.rc_cmd = RTM_DELETE;

	NET_EPOCH_ENTER(et);

	/* Phase 1: unlink all matching entries under the write lock. */
	RIB_WLOCK(rnh);
	rnh->rnh_walktree(&rnh->head, rt_checkdelroute, &di);
	RIB_WUNLOCK(rnh);

	/* We might have something to reclaim. */
	bzero(&di.rc, sizeof(di.rc));
	di.rc.rc_cmd = RTM_DELETE;
	/* Phase 2: notify and reclaim the chained entries lock-free. */
	while (di.head != NULL) {
		rt = di.head;
		di.head = rt->rt_chain;
		rt->rt_chain = NULL;

		di.rc.rc_rt = rt;
		di.rc.rc_nh_old = rt->rt_nhop;
		rib_notify(rnh, RIB_NOTIFY_DELAYED, &di.rc);

		/* TODO std rt -> rt_addrinfo export */
		di.info.rti_info[RTAX_DST] = rt_key(rt);
		di.info.rti_info[RTAX_NETMASK] = rt_mask(rt);

		if (report)
			rt_routemsg(RTM_DELETE, rt, rt->rt_nhop->nh_ifp, 0,
			    fibnum);
		rtfree(rt);
	}

	NET_EPOCH_EXIT(et);
}
1069 
1070 static void
1071 rib_notify(struct rib_head *rnh, enum rib_subscription_type type,
1072     struct rib_cmd_info *rc)
1073 {
1074 	struct rib_subscription *rs;
1075 
1076 	CK_STAILQ_FOREACH(rs, &rnh->rnh_subscribers, next) {
1077 		if (rs->type == type)
1078 			rs->func(rnh, rc, rs->arg);
1079 	}
1080 }
1081 
1082 static struct rib_subscription *
1083 allocate_subscription(rib_subscription_cb_t *f, void *arg,
1084     enum rib_subscription_type type, bool waitok)
1085 {
1086 	struct rib_subscription *rs;
1087 	int flags = M_ZERO | (waitok ? M_WAITOK : 0);
1088 
1089 	rs = malloc(sizeof(struct rib_subscription), M_RTABLE, flags);
1090 	if (rs == NULL)
1091 		return (NULL);
1092 
1093 	rs->func = f;
1094 	rs->arg = arg;
1095 	rs->type = type;
1096 
1097 	return (rs);
1098 }
1099 
1100 /*
1101  * Subscribe for the changes in the routing table specified by @fibnum and
1102  *  @family.
1103  *
1104  * Returns pointer to the subscription structure on success.
1105  */
1106 struct rib_subscription *
1107 rib_subscribe(uint32_t fibnum, int family, rib_subscription_cb_t *f, void *arg,
1108     enum rib_subscription_type type, bool waitok)
1109 {
1110 	struct rib_head *rnh;
1111 	struct rib_subscription *rs;
1112 	struct epoch_tracker et;
1113 
1114 	if ((rs = allocate_subscription(f, arg, type, waitok)) == NULL)
1115 		return (NULL);
1116 
1117 	NET_EPOCH_ENTER(et);
1118 	KASSERT((fibnum < rt_numfibs), ("%s: bad fibnum", __func__));
1119 	rnh = rt_tables_get_rnh(fibnum, family);
1120 
1121 	RIB_WLOCK(rnh);
1122 	CK_STAILQ_INSERT_TAIL(&rnh->rnh_subscribers, rs, next);
1123 	RIB_WUNLOCK(rnh);
1124 	NET_EPOCH_EXIT(et);
1125 
1126 	return (rs);
1127 }
1128 
/*
 * Same as rib_subscribe(), but takes an already resolved @rnh.
 * Returns pointer to the subscription structure on success,
 * NULL on allocation failure.
 */
struct rib_subscription *
rib_subscribe_internal(struct rib_head *rnh, rib_subscription_cb_t *f, void *arg,
    enum rib_subscription_type type, bool waitok)
{
	struct rib_subscription *rs;
	struct epoch_tracker et;

	if ((rs = allocate_subscription(f, arg, type, waitok)) == NULL)
		return (NULL);

	NET_EPOCH_ENTER(et);
	RIB_WLOCK(rnh);
	CK_STAILQ_INSERT_TAIL(&rnh->rnh_subscribers, rs, next);
	RIB_WUNLOCK(rnh);
	NET_EPOCH_EXIT(et);

	return (rs);
}
1147 
1148 /*
1149  * Remove rtable subscription @rs from the table specified by @fibnum
1150  *  and @family.
1151  * Needs to be run in network epoch.
1152  *
1153  * Returns 0 on success.
1154  */
/*
 * NOTE(review): function name is misspelled ("unsibscribe"); it is part
 * of the public interface, so renaming requires updating all callers.
 */
int
rib_unsibscribe(uint32_t fibnum, int family, struct rib_subscription *rs)
{
	struct rib_head *rnh;

	NET_EPOCH_ASSERT();
	KASSERT((fibnum < rt_numfibs), ("%s: bad fibnum", __func__));
	rnh = rt_tables_get_rnh(fibnum, family);

	if (rnh == NULL)
		return (ENOENT);

	RIB_WLOCK(rnh);
	CK_STAILQ_REMOVE(&rnh->rnh_subscribers, rs, rib_subscription, next);
	RIB_WUNLOCK(rnh);

	/* Defer the free: concurrent readers may still traverse the list. */
	epoch_call(net_epoch_preempt, destroy_subscription_epoch,
	    &rs->epoch_ctx);

	return (0);
}
1176 
1177 /*
1178  * Epoch callback indicating subscription is safe to destroy
1179  */
1180 static void
1181 destroy_subscription_epoch(epoch_context_t ctx)
1182 {
1183 	struct rib_subscription *rs;
1184 
1185 	rs = __containerof(ctx, struct rib_subscription, epoch_ctx);
1186 
1187 	free(rs, M_RTABLE);
1188 }
1189 
/* Initializes the (empty) subscriber list of @rnh. */
void
rib_init_subscriptions(struct rib_head *rnh)
{

	CK_STAILQ_INIT(&rnh->rnh_subscribers);
}
1196 
/*
 * Unlinks every subscriber of @rnh and schedules their epoch-deferred
 * destruction. Used on RIB teardown.
 */
void
rib_destroy_subscriptions(struct rib_head *rnh)
{
	struct rib_subscription *rs;
	struct epoch_tracker et;

	NET_EPOCH_ENTER(et);
	RIB_WLOCK(rnh);
	while ((rs = CK_STAILQ_FIRST(&rnh->rnh_subscribers)) != NULL) {
		CK_STAILQ_REMOVE_HEAD(&rnh->rnh_subscribers, next);
		epoch_call(net_epoch_preempt, destroy_subscription_epoch,
		    &rs->epoch_ctx);
	}
	RIB_WUNLOCK(rnh);
	NET_EPOCH_EXIT(et);
}
1213