xref: /freebsd/sys/net/route.c (revision 6ff6d951ade3f3379932df7f878ef3ea272cfc59)
1 /*-
2  * Copyright (c) 1980, 1986, 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 4. Neither the name of the University nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  *	@(#)route.c	8.3.1.1 (Berkeley) 2/23/95
30  * $FreeBSD$
31  */
32 
33 #include "opt_inet.h"
34 #include "opt_mrouting.h"
35 #include "opt_mpath.h"
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/malloc.h>
40 #include <sys/mbuf.h>
41 #include <sys/socket.h>
42 #include <sys/domain.h>
43 #include <sys/kernel.h>
44 
45 #include <net/if.h>
46 #include <net/route.h>
47 
48 #ifdef RADIX_MPATH
49 #include <net/radix_mpath.h>
50 #endif
51 
52 #include <netinet/in.h>
53 #include <netinet/ip_mroute.h>
54 
55 #include <vm/uma.h>
56 
/*
 * Routing statistics.  The code in this file bumps rts_unreach,
 * rts_dynamic, rts_newgateway and rts_badredirect.
 */
57 static struct rtstat rtstat;
/*
 * Radix tree heads, indexed by address family (sa_family).  The slots are
 * handed to each domain's dom_rtattach hook by rtable_init().
 */
58 struct radix_node_head *rt_tables[AF_MAX+1];
59 
60 static int	rttrash;		/* routes not in table but not freed */
61 
62 static void rt_maskedcopy(struct sockaddr *,
63 	    struct sockaddr *, struct sockaddr *);
64 static void rtable_init(void **);
65 
66 /* compare two sockaddr structures */
/*
 * NB: exactly (a1)->sa_len bytes are compared, and a1 is evaluated twice,
 * so do not pass an expression with side effects as the first argument.
 */
67 #define	sa_equal(a1, a2) (bcmp((a1), (a2), (a1)->sa_len) == 0)
68 
69 /*
70  * Convert a 'struct radix_node *' to a 'struct rtentry *'.
71  * The operation can be done safely (in this code) because a
72  * 'struct rtentry' starts with two 'struct radix_node''s, the first
73  * one representing leaf nodes in the routing tree, which is
74  * what the code in radix.c passes us as a 'struct radix_node'.
75  *
76  * But because there are a lot of assumptions in this conversion,
77  * do not cast explicitly, but always use the macro below.
78  */
79 #define RNTORT(p)	((struct rtentry *)(p))
80 
81 static void
82 rtable_init(void **table)
83 {
84 	struct domain *dom;
85 	for (dom = domains; dom; dom = dom->dom_next)
86 		if (dom->dom_rtattach)
87 			dom->dom_rtattach(&table[dom->dom_family],
88 			    dom->dom_rtoffset);
89 }
90 
91 static uma_zone_t rtzone;		/* Routing table UMA zone. */
92 
93 static void
94 route_init(void)
95 {
96 	rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), NULL, NULL,
97 	    NULL, NULL, UMA_ALIGN_PTR, 0);
98 	rn_init();	/* initialize all zeroes, all ones, mask table */
99 	rtable_init((void **)rt_tables);
100 }
101 
102 /*
103  * Packet routing routines.
104  */
105 void
106 rtalloc(struct route *ro)
107 {
108 	rtalloc_ign(ro, 0UL);
109 }
110 
111 void
112 rtalloc_ign(struct route *ro, u_long ignore)
113 {
114 	struct rtentry *rt;
115 
116 	if ((rt = ro->ro_rt) != NULL) {
117 		if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP)
118 			return;
119 		RTFREE(rt);
120 		ro->ro_rt = NULL;
121 	}
122 	ro->ro_rt = rtalloc1(&ro->ro_dst, 1, ignore);
123 	if (ro->ro_rt)
124 		RT_UNLOCK(ro->ro_rt);
125 }
126 
127 /*
128  * Look up the route that matches the address given
129  * Or, at least try.. Create a cloned route if needed.
130  *
131  * The returned route, if any, is locked.
132  */
133 struct rtentry *
134 rtalloc1(struct sockaddr *dst, int report, u_long ignflags)
135 {
136 	struct radix_node_head *rnh = rt_tables[dst->sa_family];
137 	struct rtentry *rt;
138 	struct radix_node *rn;
139 	struct rtentry *newrt;
140 	struct rt_addrinfo info;
141 	u_long nflags;
142 	int err = 0, msgtype = RTM_MISS;
143 
144 	newrt = NULL;
145 	/*
146 	 * Look up the address in the table for that Address Family
147 	 */
148 	if (rnh == NULL) {
149 		rtstat.rts_unreach++;
150 		goto miss2;
151 	}
152 	RADIX_NODE_HEAD_LOCK(rnh);
153 	if ((rn = rnh->rnh_matchaddr(dst, rnh)) &&
154 	    (rn->rn_flags & RNF_ROOT) == 0) {
155 		/*
156 		 * If we find it and it's not the root node, then
157 		 * get a reference on the rtentry associated.
158 		 */
159 		newrt = rt = RNTORT(rn);
160 		nflags = rt->rt_flags & ~ignflags;
161 		if (report && (nflags & RTF_CLONING)) {
162 			/*
163 			 * We are apparently adding (report = 0 in delete).
164 			 * If it requires that it be cloned, do so.
165 			 * (This implies it wasn't a HOST route.)
166 			 */
167 			err = rtrequest(RTM_RESOLVE, dst, NULL,
168 					      NULL, 0, &newrt);
169 			if (err) {
170 				/*
171 				 * If the cloning didn't succeed, maybe
172 				 * what we have will do. Return that.
173 				 */
174 				newrt = rt;		/* existing route */
175 				RT_LOCK(newrt);
176 				RT_ADDREF(newrt);
177 				goto miss;
178 			}
179 			KASSERT(newrt, ("no route and no error"));
180 			RT_LOCK(newrt);
181 			if (newrt->rt_flags & RTF_XRESOLVE) {
182 				/*
183 				 * If the new route specifies it be
184 				 * externally resolved, then go do that.
185 				 */
186 				msgtype = RTM_RESOLVE;
187 				goto miss;
188 			}
189 			/* Inform listeners of the new route. */
190 			bzero(&info, sizeof(info));
191 			info.rti_info[RTAX_DST] = rt_key(newrt);
192 			info.rti_info[RTAX_NETMASK] = rt_mask(newrt);
193 			info.rti_info[RTAX_GATEWAY] = newrt->rt_gateway;
194 			if (newrt->rt_ifp != NULL) {
195 				info.rti_info[RTAX_IFP] =
196 				    newrt->rt_ifp->if_addr->ifa_addr;
197 				info.rti_info[RTAX_IFA] = newrt->rt_ifa->ifa_addr;
198 			}
199 			rt_missmsg(RTM_ADD, &info, newrt->rt_flags, 0);
200 		} else {
201 			RT_LOCK(newrt);
202 			RT_ADDREF(newrt);
203 		}
204 		RADIX_NODE_HEAD_UNLOCK(rnh);
205 	} else {
206 		/*
207 		 * Either we hit the root or couldn't find any match,
208 		 * Which basically means
209 		 * "caint get there frm here"
210 		 */
211 		rtstat.rts_unreach++;
212 	miss:
213 		RADIX_NODE_HEAD_UNLOCK(rnh);
214 	miss2:	if (report) {
215 			/*
216 			 * If required, report the failure to the supervising
217 			 * Authorities.
218 			 * For a delete, this is not an error. (report == 0)
219 			 */
220 			bzero(&info, sizeof(info));
221 			info.rti_info[RTAX_DST] = dst;
222 			rt_missmsg(msgtype, &info, 0, err);
223 		}
224 	}
225 	if (newrt)
226 		RT_LOCK_ASSERT(newrt);
227 	return (newrt);
228 }
229 
230 /*
231  * Remove a reference count from an rtentry.
232  * If the count gets low enough, take it out of the routing table
233  */
234 void
235 rtfree(struct rtentry *rt)
236 {
237 	struct radix_node_head *rnh;
238 
239 	KASSERT(rt != NULL,("%s: NULL rt", __func__));
240 	rnh = rt_tables[rt_key(rt)->sa_family];
241 	KASSERT(rnh != NULL,("%s: NULL rnh", __func__));
242 
243 	RT_LOCK_ASSERT(rt);
244 
245 	/*
246 	 * The callers should use RTFREE_LOCKED() or RTFREE(), so
247 	 * we should come here exactly with the last reference.
248 	 */
249 	RT_REMREF(rt);
250 	if (rt->rt_refcnt > 0) {
251 		printf("%s: %p has %lu refs\n", __func__, rt, rt->rt_refcnt);
252 		goto done;
253 	}
254 
255 	/*
256 	 * On last reference give the "close method" a chance
257 	 * to cleanup private state.  This also permits (for
258 	 * IPv4 and IPv6) a chance to decide if the routing table
259 	 * entry should be purged immediately or at a later time.
260 	 * When an immediate purge is to happen the close routine
261 	 * typically calls rtexpunge which clears the RTF_UP flag
262 	 * on the entry so that the code below reclaims the storage.
263 	 */
264 	if (rt->rt_refcnt == 0 && rnh->rnh_close)
265 		rnh->rnh_close((struct radix_node *)rt, rnh);
266 
267 	/*
268 	 * If we are no longer "up" (and ref == 0)
269 	 * then we can free the resources associated
270 	 * with the route.
271 	 */
272 	if ((rt->rt_flags & RTF_UP) == 0) {
273 		if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT))
274 			panic("rtfree 2");
275 		/*
276 		 * the rtentry must have been removed from the routing table
277 		 * so it is represented in rttrash.. remove that now.
278 		 */
279 		rttrash--;
280 #ifdef	DIAGNOSTIC
281 		if (rt->rt_refcnt < 0) {
282 			printf("rtfree: %p not freed (neg refs)\n", rt);
283 			goto done;
284 		}
285 #endif
286 		/*
287 		 * release references on items we hold them on..
288 		 * e.g other routes and ifaddrs.
289 		 */
290 		if (rt->rt_ifa)
291 			IFAFREE(rt->rt_ifa);
292 		rt->rt_parent = NULL;		/* NB: no refcnt on parent */
293 
294 		/*
295 		 * The key is separatly alloc'd so free it (see rt_setgate()).
296 		 * This also frees the gateway, as they are always malloc'd
297 		 * together.
298 		 */
299 		Free(rt_key(rt));
300 
301 		/*
302 		 * and the rtentry itself of course
303 		 */
304 		RT_LOCK_DESTROY(rt);
305 		uma_zfree(rtzone, rt);
306 		return;
307 	}
308 done:
309 	RT_UNLOCK(rt);
310 }
311 
312 
313 /*
314  * Force a routing table entry to the specified
315  * destination to go through the given gateway.
316  * Normally called as a result of a routing redirect
317  * message from the network layer.
318  */
319 void
320 rtredirect(struct sockaddr *dst,
321 	struct sockaddr *gateway,
322 	struct sockaddr *netmask,
323 	int flags,
324 	struct sockaddr *src)
325 {
326 	struct rtentry *rt, *rt0 = NULL;
327 	int error = 0;
328 	short *stat = NULL;
329 	struct rt_addrinfo info;
330 	struct ifaddr *ifa;
331 
332 	/* verify the gateway is directly reachable */
333 	if ((ifa = ifa_ifwithnet(gateway)) == NULL) {
334 		error = ENETUNREACH;
335 		goto out;
336 	}
337 	rt = rtalloc1(dst, 0, 0UL);	/* NB: rt is locked */
338 	/*
339 	 * If the redirect isn't from our current router for this dst,
340 	 * it's either old or wrong.  If it redirects us to ourselves,
341 	 * we have a routing loop, perhaps as a result of an interface
342 	 * going down recently.
343 	 */
344 	if (!(flags & RTF_DONE) && rt &&
345 	     (!sa_equal(src, rt->rt_gateway) || rt->rt_ifa != ifa))
346 		error = EINVAL;
347 	else if (ifa_ifwithaddr(gateway))
348 		error = EHOSTUNREACH;
349 	if (error)
350 		goto done;
351 	/*
352 	 * Create a new entry if we just got back a wildcard entry
353 	 * or the the lookup failed.  This is necessary for hosts
354 	 * which use routing redirects generated by smart gateways
355 	 * to dynamically build the routing tables.
356 	 */
357 	if (rt == NULL || (rt_mask(rt) && rt_mask(rt)->sa_len < 2))
358 		goto create;
359 	/*
360 	 * Don't listen to the redirect if it's
361 	 * for a route to an interface.
362 	 */
363 	if (rt->rt_flags & RTF_GATEWAY) {
364 		if (((rt->rt_flags & RTF_HOST) == 0) && (flags & RTF_HOST)) {
365 			/*
366 			 * Changing from route to net => route to host.
367 			 * Create new route, rather than smashing route to net.
368 			 */
369 		create:
370 			rt0 = rt;
371 			rt = NULL;
372 
373 			flags |=  RTF_GATEWAY | RTF_DYNAMIC;
374 			bzero((caddr_t)&info, sizeof(info));
375 			info.rti_info[RTAX_DST] = dst;
376 			info.rti_info[RTAX_GATEWAY] = gateway;
377 			info.rti_info[RTAX_NETMASK] = netmask;
378 			info.rti_ifa = ifa;
379 			info.rti_flags = flags;
380 			error = rtrequest1(RTM_ADD, &info, &rt);
381 			if (rt != NULL) {
382 				RT_LOCK(rt);
383 				EVENTHANDLER_INVOKE(route_redirect_event, rt0, rt, dst);
384 				flags = rt->rt_flags;
385 			}
386 			if (rt0)
387 				RTFREE_LOCKED(rt0);
388 
389 			stat = &rtstat.rts_dynamic;
390 		} else {
391 			struct rtentry *gwrt;
392 
393 			/*
394 			 * Smash the current notion of the gateway to
395 			 * this destination.  Should check about netmask!!!
396 			 */
397 			rt->rt_flags |= RTF_MODIFIED;
398 			flags |= RTF_MODIFIED;
399 			stat = &rtstat.rts_newgateway;
400 			/*
401 			 * add the key and gateway (in one malloc'd chunk).
402 			 */
403 			rt_setgate(rt, rt_key(rt), gateway);
404 			gwrt = rtalloc1(gateway, 1, 0);
405 			EVENTHANDLER_INVOKE(route_redirect_event, rt, gwrt, dst);
406 			RTFREE_LOCKED(gwrt);
407 		}
408 	} else
409 		error = EHOSTUNREACH;
410 done:
411 	if (rt)
412 		RTFREE_LOCKED(rt);
413 out:
414 	if (error)
415 		rtstat.rts_badredirect++;
416 	else if (stat != NULL)
417 		(*stat)++;
418 	bzero((caddr_t)&info, sizeof(info));
419 	info.rti_info[RTAX_DST] = dst;
420 	info.rti_info[RTAX_GATEWAY] = gateway;
421 	info.rti_info[RTAX_NETMASK] = netmask;
422 	info.rti_info[RTAX_AUTHOR] = src;
423 	rt_missmsg(RTM_REDIRECT, &info, flags, error);
424 }
425 
426 /*
427  * Routing table ioctl interface.
428  */
429 int
430 rtioctl(u_long req, caddr_t data)
431 {
432 
433 	/*
434 	 * If more ioctl commands are added here, make sure the proper
435 	 * super-user checks are being performed because it is possible for
436 	 * prison-root to make it this far if raw sockets have been enabled
437 	 * in jails.
438 	 */
439 #ifdef INET
440 	/* Multicast goop, grrr... */
441 	return mrt_ioctl ? mrt_ioctl(req, data) : EOPNOTSUPP;
442 #else /* INET */
443 	return ENXIO;
444 #endif /* INET */
445 }
446 
447 struct ifaddr *
448 ifa_ifwithroute(int flags, struct sockaddr *dst, struct sockaddr *gateway)
449 {
450 	register struct ifaddr *ifa;
451 	int not_found = 0;
452 
453 	if ((flags & RTF_GATEWAY) == 0) {
454 		/*
455 		 * If we are adding a route to an interface,
456 		 * and the interface is a pt to pt link
457 		 * we should search for the destination
458 		 * as our clue to the interface.  Otherwise
459 		 * we can use the local address.
460 		 */
461 		ifa = NULL;
462 		if (flags & RTF_HOST)
463 			ifa = ifa_ifwithdstaddr(dst);
464 		if (ifa == NULL)
465 			ifa = ifa_ifwithaddr(gateway);
466 	} else {
467 		/*
468 		 * If we are adding a route to a remote net
469 		 * or host, the gateway may still be on the
470 		 * other end of a pt to pt link.
471 		 */
472 		ifa = ifa_ifwithdstaddr(gateway);
473 	}
474 	if (ifa == NULL)
475 		ifa = ifa_ifwithnet(gateway);
476 	if (ifa == NULL) {
477 		struct rtentry *rt = rtalloc1(gateway, 0, 0UL);
478 		if (rt == NULL)
479 			return (NULL);
480 		/*
481 		 * dismiss a gateway that is reachable only
482 		 * through the default router
483 		 */
484 		switch (gateway->sa_family) {
485 		case AF_INET:
486 			if (satosin(rt_key(rt))->sin_addr.s_addr == INADDR_ANY)
487 				not_found = 1;
488 			break;
489 		case AF_INET6:
490 			if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(rt))->sin6_addr))
491 				not_found = 1;
492 			break;
493 		default:
494 			break;
495 		}
496 		RT_REMREF(rt);
497 		RT_UNLOCK(rt);
498 		if (not_found)
499 			return (NULL);
500 		if ((ifa = rt->rt_ifa) == NULL)
501 			return (NULL);
502 	}
503 	if (ifa->ifa_addr->sa_family != dst->sa_family) {
504 		struct ifaddr *oifa = ifa;
505 		ifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp);
506 		if (ifa == NULL)
507 			ifa = oifa;
508 	}
509 	return (ifa);
510 }
511 
/*
 * Tree-walk callbacks handed to rnh_walktree_from(): rt_fixdelete is used
 * when a cloning route is removed, rt_fixchange after a network route is
 * added.  Both are defined further down in this file.
 */
512 static walktree_f_t rt_fixdelete;
513 static walktree_f_t rt_fixchange;
514 
/*
 * Argument bundle passed to rt_fixchange() through the walker's void *
 * parameter.
 */
515 struct rtfc_arg {
516 	struct rtentry *rt0;	/* route being added; candidates are tested against it */
517 	struct radix_node_head *rnh;	/* tree head; rt_fixchange reads rnh_treetop->rn_offset */
518 };
519 
520 /*
521  * Do appropriate manipulations of a routing tree given
522  * all the bits of info needed
523  */
524 int
525 rtrequest(int req,
526 	struct sockaddr *dst,
527 	struct sockaddr *gateway,
528 	struct sockaddr *netmask,
529 	int flags,
530 	struct rtentry **ret_nrt)
531 {
532 	struct rt_addrinfo info;
533 
534 	if (dst->sa_len == 0)
535 		return(EINVAL);
536 
537 	bzero((caddr_t)&info, sizeof(info));
538 	info.rti_flags = flags;
539 	info.rti_info[RTAX_DST] = dst;
540 	info.rti_info[RTAX_GATEWAY] = gateway;
541 	info.rti_info[RTAX_NETMASK] = netmask;
542 	return rtrequest1(req, &info, ret_nrt);
543 }
544 
545 /*
546  * These (questionable) definitions of apparent local variables apply
547  * to the next two functions.  XXXXXX!!!
548  */
/*
 * NB: each macro expands to a member of a pointer named `info', so it is
 * only usable where such a `struct rt_addrinfo *info' is in scope
 * (rt_getifa() and rtrequest1()).  The macros remain defined across every
 * function up to the matching #undef lines that follow rtrequest1(), so
 * code in between must not use these names as ordinary identifiers.
 */
549 #define	dst	info->rti_info[RTAX_DST]
550 #define	gateway	info->rti_info[RTAX_GATEWAY]
551 #define	netmask	info->rti_info[RTAX_NETMASK]
552 #define	ifaaddr	info->rti_info[RTAX_IFA]
553 #define	ifpaddr	info->rti_info[RTAX_IFP]
554 #define	flags	info->rti_flags
555 
556 int
557 rt_getifa(struct rt_addrinfo *info)
558 {
559 	struct ifaddr *ifa;
560 	int error = 0;
561 
562 	/*
563 	 * ifp may be specified by sockaddr_dl
564 	 * when protocol address is ambiguous.
565 	 */
566 	if (info->rti_ifp == NULL && ifpaddr != NULL &&
567 	    ifpaddr->sa_family == AF_LINK &&
568 	    (ifa = ifa_ifwithnet(ifpaddr)) != NULL)
569 		info->rti_ifp = ifa->ifa_ifp;
570 	if (info->rti_ifa == NULL && ifaaddr != NULL)
571 		info->rti_ifa = ifa_ifwithaddr(ifaaddr);
572 	if (info->rti_ifa == NULL) {
573 		struct sockaddr *sa;
574 
575 		sa = ifaaddr != NULL ? ifaaddr :
576 		    (gateway != NULL ? gateway : dst);
577 		if (sa != NULL && info->rti_ifp != NULL)
578 			info->rti_ifa = ifaof_ifpforaddr(sa, info->rti_ifp);
579 		else if (dst != NULL && gateway != NULL)
580 			info->rti_ifa = ifa_ifwithroute(flags, dst, gateway);
581 		else if (sa != NULL)
582 			info->rti_ifa = ifa_ifwithroute(flags, sa, sa);
583 	}
584 	if ((ifa = info->rti_ifa) != NULL) {
585 		if (info->rti_ifp == NULL)
586 			info->rti_ifp = ifa->ifa_ifp;
587 	} else
588 		error = ENETUNREACH;
589 	return (error);
590 }
591 
592 /*
593  * Expunges references to a route that's about to be reclaimed.
594  * The route must be locked.
595  */
596 int
597 rtexpunge(struct rtentry *rt)
598 {
599 	struct radix_node *rn;
600 	struct radix_node_head *rnh;
601 	struct ifaddr *ifa;
602 	int error = 0;
603 
604 	RT_LOCK_ASSERT(rt);
605 #if 0
606 	/*
607 	 * We cannot assume anything about the reference count
608 	 * because protocols call us in many situations; often
609 	 * before unwinding references to the table entry.
610 	 */
611 	KASSERT(rt->rt_refcnt <= 1, ("bogus refcnt %ld", rt->rt_refcnt));
612 #endif
613 	/*
614 	 * Find the correct routing tree to use for this Address Family
615 	 */
616 	rnh = rt_tables[rt_key(rt)->sa_family];
617 	if (rnh == NULL)
618 		return (EAFNOSUPPORT);
619 
620 	RADIX_NODE_HEAD_LOCK(rnh);
621 
622 	/*
623 	 * Remove the item from the tree; it should be there,
624 	 * but when callers invoke us blindly it may not (sigh).
625 	 */
626 	rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), rnh);
627 	if (rn == NULL) {
628 		error = ESRCH;
629 		goto bad;
630 	}
631 	KASSERT((rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) == 0,
632 		("unexpected flags 0x%x", rn->rn_flags));
633 	KASSERT(rt == RNTORT(rn),
634 		("lookup mismatch, rt %p rn %p", rt, rn));
635 
636 	rt->rt_flags &= ~RTF_UP;
637 
638 	/*
639 	 * Now search what's left of the subtree for any cloned
640 	 * routes which might have been formed from this node.
641 	 */
642 	if ((rt->rt_flags & RTF_CLONING) && rt_mask(rt))
643 		rnh->rnh_walktree_from(rnh, rt_key(rt), rt_mask(rt),
644 				       rt_fixdelete, rt);
645 
646 	/*
647 	 * Remove any external references we may have.
648 	 * This might result in another rtentry being freed if
649 	 * we held its last reference.
650 	 */
651 	if (rt->rt_gwroute) {
652 		RTFREE(rt->rt_gwroute);
653 		rt->rt_gwroute = NULL;
654 	}
655 
656 	/*
657 	 * Give the protocol a chance to keep things in sync.
658 	 */
659 	if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest) {
660 		struct rt_addrinfo info;
661 
662 		bzero((caddr_t)&info, sizeof(info));
663 		info.rti_flags = rt->rt_flags;
664 		info.rti_info[RTAX_DST] = rt_key(rt);
665 		info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
666 		info.rti_info[RTAX_NETMASK] = rt_mask(rt);
667 		ifa->ifa_rtrequest(RTM_DELETE, rt, &info);
668 	}
669 
670 	/*
671 	 * one more rtentry floating around that is not
672 	 * linked to the routing table.
673 	 */
674 	rttrash++;
675 bad:
676 	RADIX_NODE_HEAD_UNLOCK(rnh);
677 	return (error);
678 }
679 
680 int
681 rtrequest1(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt)
682 {
683 	int error = 0;
684 	register struct rtentry *rt;
685 	register struct radix_node *rn;
686 	register struct radix_node_head *rnh;
687 	struct ifaddr *ifa;
688 	struct sockaddr *ndst;
689 #define senderr(x) { error = x ; goto bad; }
690 
691 	/*
692 	 * Find the correct routing tree to use for this Address Family
693 	 */
694 	rnh = rt_tables[dst->sa_family];
695 	if (rnh == NULL)
696 		return (EAFNOSUPPORT);
697 	RADIX_NODE_HEAD_LOCK(rnh);
698 	/*
699 	 * If we are adding a host route then we don't want to put
700 	 * a netmask in the tree, nor do we want to clone it.
701 	 */
702 	if (flags & RTF_HOST) {
703 		netmask = NULL;
704 		flags &= ~RTF_CLONING;
705 	}
706 	switch (req) {
707 	case RTM_DELETE:
708 #ifdef RADIX_MPATH
709 		/*
710 		 * if we got multipath routes, we require users to specify
711 		 * a matching RTAX_GATEWAY.
712 		 */
713 		if (rn_mpath_capable(rnh)) {
714 			struct rtentry *rto = NULL;
715 
716 			rn = rnh->rnh_matchaddr(dst, rnh);
717 			if (rn == NULL)
718 				senderr(ESRCH);
719  			rto = rt = RNTORT(rn);
720 			rt = rt_mpath_matchgate(rt, gateway);
721 			if (!rt)
722 				senderr(ESRCH);
723 			/*
724 			 * this is the first entry in the chain
725 			 */
726 			if (rto == rt) {
727 				rn = rn_mpath_next((struct radix_node *)rt);
728 				/*
729 				 * there is another entry, now it's active
730 				 */
731 				if (rn) {
732 					rto = RNTORT(rn);
733 					RT_LOCK(rto);
734 					rto->rt_flags |= RTF_UP;
735 					RT_UNLOCK(rto);
736 				} else if (rt->rt_flags & RTF_GATEWAY) {
737 					/*
738 					 * For gateway routes, we need to
739 					 * make sure that we we are deleting
740 					 * the correct gateway.
741 					 * rt_mpath_matchgate() does not
742 					 * check the case when there is only
743 					 * one route in the chain.
744 					 */
745 					if (gateway &&
746 					    (rt->rt_gateway->sa_len != gateway->sa_len ||
747 					    memcmp(rt->rt_gateway, gateway, gateway->sa_len)))
748 						senderr(ESRCH);
749 				}
750 				/*
751 				 * use the normal delete code to remove
752 				 * the first entry
753 				 */
754 				goto normal_rtdel;
755 			}
756 			/*
757 			 * if the entry is 2nd and on up
758 			 */
759 			if (!rt_mpath_deldup(rto, rt))
760 				panic ("rtrequest1: rt_mpath_deldup");
761 			RT_LOCK(rt);
762 			RT_ADDREF(rt);
763 			rt->rt_flags &= ~RTF_UP;
764 			goto deldone;  /* done with the RTM_DELETE command */
765 		}
766 
767 normal_rtdel:
768 #endif
769 		/*
770 		 * Remove the item from the tree and return it.
771 		 * Complain if it is not there and do no more processing.
772 		 */
773 		rn = rnh->rnh_deladdr(dst, netmask, rnh);
774 		if (rn == NULL)
775 			senderr(ESRCH);
776 		if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT))
777 			panic ("rtrequest delete");
778 		rt = RNTORT(rn);
779 		RT_LOCK(rt);
780 		RT_ADDREF(rt);
781 		rt->rt_flags &= ~RTF_UP;
782 
783 		/*
784 		 * Now search what's left of the subtree for any cloned
785 		 * routes which might have been formed from this node.
786 		 */
787 		if ((rt->rt_flags & RTF_CLONING) &&
788 		    rt_mask(rt)) {
789 			rnh->rnh_walktree_from(rnh, dst, rt_mask(rt),
790 					       rt_fixdelete, rt);
791 		}
792 
793 		/*
794 		 * Remove any external references we may have.
795 		 * This might result in another rtentry being freed if
796 		 * we held its last reference.
797 		 */
798 		if (rt->rt_gwroute) {
799 			RTFREE(rt->rt_gwroute);
800 			rt->rt_gwroute = NULL;
801 		}
802 
803 		/*
804 		 * give the protocol a chance to keep things in sync.
805 		 */
806 		if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest)
807 			ifa->ifa_rtrequest(RTM_DELETE, rt, info);
808 
809 #ifdef RADIX_MPATH
810 deldone:
811 #endif
812 		/*
813 		 * One more rtentry floating around that is not
814 		 * linked to the routing table. rttrash will be decremented
815 		 * when RTFREE(rt) is eventually called.
816 		 */
817 		rttrash++;
818 
819 		/*
820 		 * If the caller wants it, then it can have it,
821 		 * but it's up to it to free the rtentry as we won't be
822 		 * doing it.
823 		 */
824 		if (ret_nrt) {
825 			*ret_nrt = rt;
826 			RT_UNLOCK(rt);
827 		} else
828 			RTFREE_LOCKED(rt);
829 		break;
830 
831 	case RTM_RESOLVE:
832 		if (ret_nrt == NULL || (rt = *ret_nrt) == NULL)
833 			senderr(EINVAL);
834 		ifa = rt->rt_ifa;
835 		/* XXX locking? */
836 		flags = rt->rt_flags &
837 		    ~(RTF_CLONING | RTF_STATIC);
838 		flags |= RTF_WASCLONED;
839 		gateway = rt->rt_gateway;
840 		if ((netmask = rt->rt_genmask) == NULL)
841 			flags |= RTF_HOST;
842 		goto makeroute;
843 
844 	case RTM_ADD:
845 		if ((flags & RTF_GATEWAY) && !gateway)
846 			senderr(EINVAL);
847 		if (dst && gateway && (dst->sa_family != gateway->sa_family) &&
848 		    (gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK))
849 			senderr(EINVAL);
850 
851 		if (info->rti_ifa == NULL && (error = rt_getifa(info)))
852 			senderr(error);
853 		ifa = info->rti_ifa;
854 
855 	makeroute:
856 		rt = uma_zalloc(rtzone, M_NOWAIT | M_ZERO);
857 		if (rt == NULL)
858 			senderr(ENOBUFS);
859 		RT_LOCK_INIT(rt);
860 		rt->rt_flags = RTF_UP | flags;
861 		/*
862 		 * Add the gateway. Possibly re-malloc-ing the storage for it
863 		 * also add the rt_gwroute if possible.
864 		 */
865 		RT_LOCK(rt);
866 		if ((error = rt_setgate(rt, dst, gateway)) != 0) {
867 			RT_LOCK_DESTROY(rt);
868 			uma_zfree(rtzone, rt);
869 			senderr(error);
870 		}
871 
872 		/*
873 		 * point to the (possibly newly malloc'd) dest address.
874 		 */
875 		ndst = (struct sockaddr *)rt_key(rt);
876 
877 		/*
878 		 * make sure it contains the value we want (masked if needed).
879 		 */
880 		if (netmask) {
881 			rt_maskedcopy(dst, ndst, netmask);
882 		} else
883 			bcopy(dst, ndst, dst->sa_len);
884 
885 		/*
886 		 * Note that we now have a reference to the ifa.
887 		 * This moved from below so that rnh->rnh_addaddr() can
888 		 * examine the ifa and  ifa->ifa_ifp if it so desires.
889 		 */
890 		IFAREF(ifa);
891 		rt->rt_ifa = ifa;
892 		rt->rt_ifp = ifa->ifa_ifp;
893 
894 #ifdef RADIX_MPATH
895 		/* do not permit exactly the same dst/mask/gw pair */
896 		if (rn_mpath_capable(rnh) &&
897 			rt_mpath_conflict(rnh, rt, netmask)) {
898 			if (rt->rt_gwroute)
899 				RTFREE(rt->rt_gwroute);
900 			if (rt->rt_ifa) {
901 				IFAFREE(rt->rt_ifa);
902 			}
903 			Free(rt_key(rt));
904 			RT_LOCK_DESTROY(rt);
905 			uma_zfree(rtzone, rt);
906 			senderr(EEXIST);
907 		}
908 #endif
909 
910 		/* XXX mtu manipulation will be done in rnh_addaddr -- itojun */
911 		rn = rnh->rnh_addaddr(ndst, netmask, rnh, rt->rt_nodes);
912 		if (rn == NULL) {
913 			struct rtentry *rt2;
914 			/*
915 			 * Uh-oh, we already have one of these in the tree.
916 			 * We do a special hack: if the route that's already
917 			 * there was generated by the cloning mechanism
918 			 * then we just blow it away and retry the insertion
919 			 * of the new one.
920 			 */
921 			rt2 = rtalloc1(dst, 0, 0);
922 			if (rt2 && rt2->rt_parent) {
923 				rtexpunge(rt2);
924 				RT_UNLOCK(rt2);
925 				rn = rnh->rnh_addaddr(ndst, netmask,
926 						      rnh, rt->rt_nodes);
927 			} else if (rt2) {
928 				/* undo the extra ref we got */
929 				RTFREE_LOCKED(rt2);
930 			}
931 		}
932 
933 		/*
934 		 * If it still failed to go into the tree,
935 		 * then un-make it (this should be a function)
936 		 */
937 		if (rn == NULL) {
938 			if (rt->rt_gwroute)
939 				RTFREE(rt->rt_gwroute);
940 			if (rt->rt_ifa)
941 				IFAFREE(rt->rt_ifa);
942 			Free(rt_key(rt));
943 			RT_LOCK_DESTROY(rt);
944 			uma_zfree(rtzone, rt);
945 			senderr(EEXIST);
946 		}
947 
948 		rt->rt_parent = NULL;
949 
950 		/*
951 		 * If we got here from RESOLVE, then we are cloning
952 		 * so clone the rest, and note that we
953 		 * are a clone (and increment the parent's references)
954 		 */
955 		if (req == RTM_RESOLVE) {
956 			KASSERT(ret_nrt && *ret_nrt,
957 				("no route to clone from"));
958 			rt->rt_rmx = (*ret_nrt)->rt_rmx; /* copy metrics */
959 			rt->rt_rmx.rmx_pksent = 0; /* reset packet counter */
960 			if ((*ret_nrt)->rt_flags & RTF_CLONING) {
961 				/*
962 				 * NB: We do not bump the refcnt on the parent
963 				 * entry under the assumption that it will
964 				 * remain so long as we do.  This is
965 				 * important when deleting the parent route
966 				 * as this operation requires traversing
967 				 * the tree to delete all clones and futzing
968 				 * with refcnts requires us to double-lock
969 				 * parent through this back reference.
970 				 */
971 				rt->rt_parent = *ret_nrt;
972 			}
973 		}
974 
975 		/*
976 		 * If this protocol has something to add to this then
977 		 * allow it to do that as well.
978 		 */
979 		if (ifa->ifa_rtrequest)
980 			ifa->ifa_rtrequest(req, rt, info);
981 
982 		/*
983 		 * We repeat the same procedure from rt_setgate() here because
984 		 * it doesn't fire when we call it there because the node
985 		 * hasn't been added to the tree yet.
986 		 */
987 		if (req == RTM_ADD &&
988 		    !(rt->rt_flags & RTF_HOST) && rt_mask(rt) != NULL) {
989 			struct rtfc_arg arg;
990 			arg.rnh = rnh;
991 			arg.rt0 = rt;
992 			rnh->rnh_walktree_from(rnh, rt_key(rt), rt_mask(rt),
993 					       rt_fixchange, &arg);
994 		}
995 
996 		/*
997 		 * actually return a resultant rtentry and
998 		 * give the caller a single reference.
999 		 */
1000 		if (ret_nrt) {
1001 			*ret_nrt = rt;
1002 			RT_ADDREF(rt);
1003 		}
1004 		RT_UNLOCK(rt);
1005 		break;
1006 	default:
1007 		error = EOPNOTSUPP;
1008 	}
1009 bad:
1010 	RADIX_NODE_HEAD_UNLOCK(rnh);
1011 	return (error);
1012 #undef senderr
1013 }
1014 
1015 #undef dst
1016 #undef gateway
1017 #undef netmask
1018 #undef ifaaddr
1019 #undef ifpaddr
1020 #undef flags
1021 
1022 /*
1023  * Called from rtrequest(RTM_DELETE, ...) to fix up the route's ``family''
1024  * (i.e., the routes related to it by the operation of cloning).  This
1025  * routine is iterated over all potential former-child-routes by way of
1026  * rnh->rnh_walktree_from() above, and those that actually are children of
1027  * the late parent (passed in as VP here) are themselves deleted.
1028  */
1029 static int
1030 rt_fixdelete(struct radix_node *rn, void *vp)
1031 {
1032 	struct rtentry *rt = RNTORT(rn);
1033 	struct rtentry *rt0 = vp;
1034 
1035 	if (rt->rt_parent == rt0 &&
1036 	    !(rt->rt_flags & (RTF_PINNED | RTF_CLONING))) {
1037 		return rtrequest(RTM_DELETE, rt_key(rt), NULL, rt_mask(rt),
1038 				 rt->rt_flags, NULL);
1039 	}
1040 	return 0;
1041 }
1042 
1043 /*
1044  * This routine is called from rt_setgate() to do the analogous thing for
1045  * adds and changes.  There is the added complication in this case of a
1046  * middle insert; i.e., insertion of a new network route between an older
1047  * network route and (cloned) host routes.  For this reason, a simple check
1048  * of rt->rt_parent is insufficient; each candidate route must be tested
1049  * against the (mask, value) of the new route (passed as before in vp)
1050  * to see if the new route matches it.
1051  *
1052  * XXX - it may be possible to do fixdelete() for changes and reserve this
1053  * routine just for adds.  I'm not sure why I thought it was necessary to do
1054  * changes this way.
1055  */
1056 
1057 static int
1058 rt_fixchange(struct radix_node *rn, void *vp)
1059 {
1060 	struct rtentry *rt = RNTORT(rn);
1061 	struct rtfc_arg *ap = vp;
1062 	struct rtentry *rt0 = ap->rt0;
1063 	struct radix_node_head *rnh = ap->rnh;
1064 	u_char *xk1, *xm1, *xk2, *xmp;
1065 	int i, len, mlen;
1066 
1067 	/* make sure we have a parent, and route is not pinned or cloning */
1068 	if (!rt->rt_parent ||
1069 	    (rt->rt_flags & (RTF_PINNED | RTF_CLONING)))
1070 		return 0;
1071 
1072 	if (rt->rt_parent == rt0)	/* parent match */
1073 		goto delete_rt;
1074 	/*
1075 	 * There probably is a function somewhere which does this...
1076 	 * if not, there should be.
1077 	 */
1078 	len = imin(rt_key(rt0)->sa_len, rt_key(rt)->sa_len);
1079 
1080 	xk1 = (u_char *)rt_key(rt0);
1081 	xm1 = (u_char *)rt_mask(rt0);
1082 	xk2 = (u_char *)rt_key(rt);
1083 
1084 	/* avoid applying a less specific route */
1085 	xmp = (u_char *)rt_mask(rt->rt_parent);
1086 	mlen = rt_key(rt->rt_parent)->sa_len;
1087 	if (mlen > rt_key(rt0)->sa_len)		/* less specific route */
1088 		return 0;
1089 	for (i = rnh->rnh_treetop->rn_offset; i < mlen; i++)
1090 		if ((xmp[i] & ~(xmp[i] ^ xm1[i])) != xmp[i])
1091 			return 0;	/* less specific route */
1092 
1093 	for (i = rnh->rnh_treetop->rn_offset; i < len; i++)
1094 		if ((xk2[i] & xm1[i]) != xk1[i])
1095 			return 0;	/* no match */
1096 
1097 	/*
1098 	 * OK, this node is a clone, and matches the node currently being
1099 	 * changed/added under the node's mask.  So, get rid of it.
1100 	 */
1101 delete_rt:
1102 	return rtrequest(RTM_DELETE, rt_key(rt), NULL,
1103 			 rt_mask(rt), rt->rt_flags, NULL);
1104 }
1105 
1106 int
1107 rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate)
1108 {
1109 	/* XXX dst may be overwritten, can we move this to below */
1110 	struct radix_node_head *rnh = rt_tables[dst->sa_family];
1111 	int dlen = SA_SIZE(dst), glen = SA_SIZE(gate);
1112 
1113 again:
1114 	RT_LOCK_ASSERT(rt);
1115 
1116 	/*
1117 	 * A host route with the destination equal to the gateway
1118 	 * will interfere with keeping LLINFO in the routing
1119 	 * table, so disallow it.
1120 	 */
1121 	if (((rt->rt_flags & (RTF_HOST|RTF_GATEWAY|RTF_LLINFO)) ==
1122 					(RTF_HOST|RTF_GATEWAY)) &&
1123 	    dst->sa_len == gate->sa_len &&
1124 	    bcmp(dst, gate, dst->sa_len) == 0) {
1125 		/*
1126 		 * The route might already exist if this is an RTM_CHANGE
1127 		 * or a routing redirect, so try to delete it.
1128 		 */
1129 		if (rt_key(rt))
1130 			rtexpunge(rt);
1131 		return EADDRNOTAVAIL;
1132 	}
1133 
1134 	/*
1135 	 * Cloning loop avoidance in case of bad configuration.
1136 	 */
1137 	if (rt->rt_flags & RTF_GATEWAY) {
1138 		struct rtentry *gwrt;
1139 
1140 		RT_UNLOCK(rt);		/* XXX workaround LOR */
1141 		gwrt = rtalloc1(gate, 1, 0);
1142 		if (gwrt == rt) {
1143 			RT_REMREF(rt);
1144 			return (EADDRINUSE); /* failure */
1145 		}
1146 		/*
1147 		 * Try to reacquire the lock on rt, and if it fails,
1148 		 * clean state and restart from scratch.
1149 		 */
1150 		if (!RT_TRYLOCK(rt)) {
1151 			RTFREE_LOCKED(gwrt);
1152 			RT_LOCK(rt);
1153 			goto again;
1154 		}
1155 		/*
1156 		 * If there is already a gwroute, then drop it. If we
1157 		 * are asked to replace route with itself, then do
1158 		 * not leak its refcounter.
1159 		 */
1160 		if (rt->rt_gwroute != NULL) {
1161 			if (rt->rt_gwroute == gwrt) {
1162 				RT_REMREF(rt->rt_gwroute);
1163 			} else
1164 				RTFREE(rt->rt_gwroute);
1165 		}
1166 
1167 		if ((rt->rt_gwroute = gwrt) != NULL)
1168 			RT_UNLOCK(rt->rt_gwroute);
1169 	}
1170 
1171 	/*
1172 	 * Prepare to store the gateway in rt->rt_gateway.
1173 	 * Both dst and gateway are stored one after the other in the same
1174 	 * malloc'd chunk. If we have room, we can reuse the old buffer,
1175 	 * rt_gateway already points to the right place.
1176 	 * Otherwise, malloc a new block and update the 'dst' address.
1177 	 */
1178 	if (rt->rt_gateway == NULL || glen > SA_SIZE(rt->rt_gateway)) {
1179 		caddr_t new;
1180 
1181 		R_Malloc(new, caddr_t, dlen + glen);
1182 		if (new == NULL)
1183 			return ENOBUFS;
1184 		/*
1185 		 * XXX note, we copy from *dst and not *rt_key(rt) because
1186 		 * rt_setgate() can be called to initialize a newly
1187 		 * allocated route entry, in which case rt_key(rt) == NULL
1188 		 * (and also rt->rt_gateway == NULL).
1189 		 * Free()/free() handle a NULL argument just fine.
1190 		 */
1191 		bcopy(dst, new, dlen);
1192 		Free(rt_key(rt));	/* free old block, if any */
1193 		rt_key(rt) = (struct sockaddr *)new;
1194 		rt->rt_gateway = (struct sockaddr *)(new + dlen);
1195 	}
1196 
1197 	/*
1198 	 * Copy the new gateway value into the memory chunk.
1199 	 */
1200 	bcopy(gate, rt->rt_gateway, glen);
1201 
1202 	/*
1203 	 * This isn't going to do anything useful for host routes, so
1204 	 * don't bother.  Also make sure we have a reasonable mask
1205 	 * (we don't yet have one during adds).
1206 	 */
1207 	if (!(rt->rt_flags & RTF_HOST) && rt_mask(rt) != 0) {
1208 		struct rtfc_arg arg;
1209 
1210 		arg.rnh = rnh;
1211 		arg.rt0 = rt;
1212 		RT_UNLOCK(rt);		/* XXX workaround LOR */
1213 		RADIX_NODE_HEAD_LOCK(rnh);
1214 		RT_LOCK(rt);
1215 		rnh->rnh_walktree_from(rnh, rt_key(rt), rt_mask(rt),
1216 				       rt_fixchange, &arg);
1217 		RADIX_NODE_HEAD_UNLOCK(rnh);
1218 	}
1219 
1220 	return 0;
1221 }
1222 
/*
 * Copy the sockaddr at src to dst, AND-ing the address bytes with netmask.
 *
 * The first byte of src and of netmask is taken as the respective length.
 * Bytes 0 and 1 (sa_len and sa_family) are copied from src unmasked.
 * Bytes from offset 2 up to the shorter of the two lengths are copied as
 * src & netmask; any bytes left up to src's length are cleared.
 */
static void
rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, struct sockaddr *netmask)
{
	const u_char *in = (const u_char *)src;
	const u_char *mask = (const u_char *)netmask;
	u_char *out = (u_char *)dst;
	unsigned total = in[0];		/* length of the source address */
	unsigned masked = mask[0];	/* length covered by the netmask */
	unsigned pos;

	/* Never mask beyond the end of the source address. */
	if (masked > total)
		masked = total;

	/* sa_len and sa_family are carried over verbatim. */
	out[0] = in[0];
	out[1] = in[1];

	for (pos = 2; pos < masked; pos++)
		out[pos] = in[pos] & mask[pos];

	/* Whatever the mask did not reach is zero-filled. */
	if (pos < total)
		bzero((caddr_t)(out + pos), total - pos);
}
1241 
1242 /*
1243  * Set up a routing table entry, normally
1244  * for an interface.
1245  */
/*
 * Parameters:
 *   ifa    interface address the route is built from; it supplies the
 *          destination, the netmask and the gateway (ifa->ifa_addr).
 *   cmd    routing request handed to rtrequest1(); RTM_DELETE and RTM_ADD
 *          get extra handling here.
 *   flags  route flags; they are OR-ed with ifa->ifa_flags for the request.
 *          RTF_HOST selects a host route to ifa->ifa_dstaddr with no mask.
 *
 * Returns 0 on success, otherwise:
 *   EINVAL                       the chosen destination has sa_len == 0;
 *   ENOBUFS                      no mbuf for the masked delete key;
 *   EHOSTUNREACH / ENETUNREACH   RTM_DELETE pre-check failed (host route /
 *                                network route respectively);
 *   anything rtrequest1() returns.
 */
1246 int
1247 rtinit(struct ifaddr *ifa, int cmd, int flags)
1248 {
1249 	struct sockaddr *dst;
1250 	struct sockaddr *netmask;
1251 	struct mbuf *m = NULL;
1252 	struct rtentry *rt = NULL;
1253 	struct rt_addrinfo info;
1254 	int error=0;
1255 
	/*
	 * Host routes point at the peer (destination) address and carry
	 * no mask; all others use the interface's own address and netmask.
	 */
1256 	if (flags & RTF_HOST) {
1257 		dst = ifa->ifa_dstaddr;
1258 		netmask = NULL;
1259 	} else {
1260 		dst = ifa->ifa_addr;
1261 		netmask = ifa->ifa_netmask;
1262 	}
1263 	if (dst->sa_len == 0)
1264 		return(EINVAL);
1265 
1266 	/*
1267 	 * If it's a delete, check that if it exists, it's on the correct
1268 	 * interface or we might scrub a route to another ifa which would
1269 	 * be confusing at best and possibly worse.
1270 	 */
1271 	if (cmd == RTM_DELETE) {
1272 		struct sockaddr *deldst;
1273 		struct radix_node_head *rnh;
1274 		struct radix_node *rn;
1275 
1276 		/*
1277 		 * It's a delete, so it should already exist..
1278 		 * If it's a net, mask off the host bits
1279 		 * (Assuming we have a mask)
1280 		 */
1281 		if (netmask != NULL) {
			/* The mbuf is only scratch space for the masked key. */
1282 			m = m_get(M_DONTWAIT, MT_SONAME);
1283 			if (m == NULL)
1284 				return(ENOBUFS);
1285 			deldst = mtod(m, struct sockaddr *);
1286 			rt_maskedcopy(dst, deldst, netmask);
1287 			dst = deldst;
1288 		}
1289 		/*
1290 		 * Look up an rtentry that is in the routing tree and
1291 		 * contains the correct info.
1292 		 */
		/*
		 * No routing table for this address family: jump to the
		 * shared failure exit inside the if (error) block below.
		 */
1293 		if ((rnh = rt_tables[dst->sa_family]) == NULL)
1294 			goto bad;
1295 		RADIX_NODE_HEAD_LOCK(rnh);
1296 #ifdef RADIX_MPATH
1297 		if (rn_mpath_capable(rnh)) {
1298 
1299 			rn = rnh->rnh_matchaddr(dst, rnh);
1300 			if (rn == NULL)
1301 				error = ESRCH;
1302 			else {
1303 				rt = RNTORT(rn);
1304 				/*
1305 				 * for interface route the rt->rt_gateway is
1306 				 * sockaddr_intf for cloning ARP entries, so
1307 				 * rt_mpath_matchgate must use the interface
1308 				 * address
1309 				 */
1310 				rt = rt_mpath_matchgate(rt, ifa->ifa_addr);
1311 				if (!rt)
1312 					error = ESRCH;
1313 			}
1314 		}
1315 		else
1316 #endif
		/*
		 * Non-multipath lookup: flag an error unless there is a
		 * non-root entry for dst/netmask whose key equals dst and
		 * whose rt_ifa is this ifa.
		 */
1317 		error = ((rn = rnh->rnh_lookup(dst, netmask, rnh)) == NULL ||
1318 		    (rn->rn_flags & RNF_ROOT) ||
1319 		    RNTORT(rn)->rt_ifa != ifa ||
1320 		    !sa_equal((struct sockaddr *)rn->rn_key, dst));
1321 
1322 		RADIX_NODE_HEAD_UNLOCK(rnh);
1323 		if (error) {
1324 bad:
			/*
			 * The specific lookup error is not propagated; the
			 * caller only sees host or network unreachable.
			 */
1325 			if (m)
1326 				(void) m_free(m);
1327 			return (flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH);
1328 		}
1329 	}
1330 	/*
1331 	 * Do the actual request
1332 	 */
1333 	bzero((caddr_t)&info, sizeof(info));
1334 	info.rti_ifa = ifa;
1335 	info.rti_flags = flags | ifa->ifa_flags;
1336 	info.rti_info[RTAX_DST] = dst;
1337 	info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr;
1338 	info.rti_info[RTAX_NETMASK] = netmask;
1339 	error = rtrequest1(cmd, &info, &rt);
1340 	if (error == 0 && rt != NULL) {
1341 		/*
1342 		 * notify any listening routing agents of the change
1343 		 */
1344 		RT_LOCK(rt);
1345 #ifdef RADIX_MPATH
1346 		/*
1347 		 * in case address alias finds the first address
1348 		 * e.g. ifconfig bge0 192.103.54.246/24
1349 		 * e.g. ifconfig bge0 192.103.54.247/24
1350 		 * the address set in the route is 192.103.54.246
1351 		 * so we need to replace it with 192.103.54.247
1352 		 */
1353 		if (memcmp(rt->rt_ifa->ifa_addr, ifa->ifa_addr, ifa->ifa_addr->sa_len)) {
1354 			IFAFREE(rt->rt_ifa);
1355 			IFAREF(ifa);
1356 			rt->rt_ifp = ifa->ifa_ifp;
1357 			rt->rt_ifa = ifa;
1358 		}
1359 #endif
1360 		rt_newaddrmsg(cmd, ifa, error, rt);
1361 		if (cmd == RTM_DELETE) {
1362 			/*
1363 			 * If we are deleting, and we found an entry, then
1364 			 * it's been removed from the tree.. now throw it away.
1365 			 */
1366 			RTFREE_LOCKED(rt);
1367 		} else {
1368 			if (cmd == RTM_ADD) {
1369 				/*
1370 				 * We just wanted to add it.. we don't actually
1371 				 * need a reference.
1372 				 */
1373 				RT_REMREF(rt);
1374 			}
1375 			RT_UNLOCK(rt);
1376 		}
1377 	}
	/* Release the scratch mbuf that held the masked key, if any. */
1378 	if (m)
1379 		(void) m_free(m);
1380 	return (error);
1381 }
1382 
1383 /*
1384  * rt_check() is invoked on each layer 2 output path, prior to
1385  * encapsulating outbound packets.
1386  *
1387  * The function is mostly used to find a routing entry for the gateway,
1388  * which in some protocol families could also point to the link-level
1389  * address for the gateway itself (the side effect of revalidating the
1390  * route to the destination is rather pointless at this stage, we did it
1391  * already a moment before in the pr_output() routine to locate the ifp
1392  * and gateway to use).
1393  *
1394  * When we remove the layer-3 to layer-2 mapping tables from the
1395  * routing table, this function can be removed.
1396  *
1397  * === On input ===
1398  *   *dst is the address of the NEXT HOP (which coincides with the
1399  *	final destination if directly reachable);
1400  *   *lrt0 points to the cached route to the final destination;
1401  *   *lrt is not meaningful;
1402  *
1403  * === Operation ===
1404  * If the route is marked down try to find a new route.  If the route
1405  * to the gateway is gone, try to setup a new route.  Otherwise,
1406  * if the route is marked for packets to be rejected, enforce that.
1407  *
1408  * === On return ===
1409  *   *dst is unchanged;
1410  *   *lrt0 points to the (possibly new) route to the final destination
1411  *   *lrt points to the route to the next hop
1412  *
1413  * Their values are meaningful ONLY if no error is returned.
1414  */
/*
 * Return values as coded below: 0 on success; EHOSTUNREACH when no route
 * to dst or to the gateway can be found, or when the gateway route rejects;
 * ENETUNREACH when the gateway lookup yields the destination route itself;
 * EHOSTDOWN when the destination route itself rejects.
 *
 * Locking as visible in the code below: nothing on the success path
 * unlocks the route stored in *lrt, so it is handed back locked; when a
 * gateway route was followed, the lock on rt0 has been released by then.
 * Each error return unlocks the route it holds before returning.
 */
1415 int
1416 rt_check(struct rtentry **lrt, struct rtentry **lrt0, struct sockaddr *dst)
1417 {
1418 	struct rtentry *rt;
1419 	struct rtentry *rt0;
1420 	int error;
1421 
1422 	KASSERT(*lrt0 != NULL, ("rt_check"));
1423 	rt = rt0 = *lrt0;
1424 
1425 	/* NB: the locking here is tortuous... */
1426 	RT_LOCK(rt);
	/* Cached route is down: look dst up again and carry on with that. */
1427 	if ((rt->rt_flags & RTF_UP) == 0) {
1428 		RT_UNLOCK(rt);
1429 		rt = rtalloc1(dst, 1, 0UL);
1430 		if (rt != NULL) {
1431 			RT_REMREF(rt);
1432 			/* XXX what about if change? */
1433 		} else
1434 			return (EHOSTUNREACH);
1435 		rt0 = rt;
1436 	}
1437 	/* XXX BSD/OS checks dst->sa_family != AF_NS */
1438 	if (rt->rt_flags & RTF_GATEWAY) {
1439 		if (rt->rt_gwroute == NULL)
1440 			goto lookup;
1441 		rt = rt->rt_gwroute;
1442 		RT_LOCK(rt);		/* NB: gwroute */
1443 		if ((rt->rt_flags & RTF_UP) == 0) {
1444 			RTFREE_LOCKED(rt);	/* unlock gwroute */
1445 			rt = rt0;
1446 			rt0->rt_gwroute = NULL;
			/*
			 * Reached by falling through (cached gateway route
			 * is down) and by the goto above (no gateway route
			 * cached yet); in both cases rt == rt0 here.
			 */
1447 		lookup:
1448 			RT_UNLOCK(rt0);
1449 			rt = rtalloc1(rt->rt_gateway, 1, 0UL);
			/* The gateway resolved back to this very route. */
1450 			if (rt == rt0) {
1451 				RT_REMREF(rt0);
1452 				RT_UNLOCK(rt0);
1453 				return (ENETUNREACH);
1454 			}
1455 			RT_LOCK(rt0);
			/* Replace any gateway route installed in the meantime. */
1456 			if (rt0->rt_gwroute != NULL)
1457 				RTFREE(rt0->rt_gwroute);
1458 			rt0->rt_gwroute = rt;
1459 			if (rt == NULL) {
1460 				RT_UNLOCK(rt0);
1461 				return (EHOSTUNREACH);
1462 			}
1463 		}
1464 		RT_UNLOCK(rt0);
1465 	}
1466 	/* XXX why are we inspecting rmx_expire? */
	/*
	 * Reject only while RTF_REJECT is set and the entry either has no
	 * expiry time or has not reached it yet.
	 */
1467 	error = (rt->rt_flags & RTF_REJECT) &&
1468 		(rt->rt_rmx.rmx_expire == 0 ||
1469 			time_uptime < rt->rt_rmx.rmx_expire);
1470 	if (error) {
1471 		RT_UNLOCK(rt);
		/*
		 * rt == rt0: the destination route itself rejects;
		 * otherwise it is the gateway route that rejects.
		 */
1472 		return (rt == rt0 ? EHOSTDOWN : EHOSTUNREACH);
1473 	}
1474 
1475 	*lrt = rt;
1476 	*lrt0 = rt0;
1477 	return (0);
1478 }
1479 
/*
 * Register route_init() (defined elsewhere in this file) with SYSINIT in
 * the SI_SUB_PROTO_DOMAIN subsystem at order SI_ORDER_THIRD; the final 0
 * is the argument given to the registration -- see SYSINIT(9).
 */
1480 /* This must be before ip6_init2(), which is now SI_ORDER_MIDDLE */
1481 SYSINIT(route, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, 0);
1482