xref: /freebsd/sys/net/route.c (revision 595e514d0df2bac5b813d35f83e32875dbf16a83)
1 /*-
2  * Copyright (c) 1980, 1986, 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 4. Neither the name of the University nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  *	@(#)route.c	8.3.1.1 (Berkeley) 2/23/95
30  * $FreeBSD$
31  */
32 /************************************************************************
33  * Note: In this file a 'fib' is a "forwarding information base"	*
34  * Which is the new name for an in kernel routing (next hop) table.	*
35  ***********************************************************************/
36 
37 #include "opt_inet.h"
38 #include "opt_inet6.h"
39 #include "opt_route.h"
40 #include "opt_mrouting.h"
41 #include "opt_mpath.h"
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/syslog.h>
46 #include <sys/malloc.h>
47 #include <sys/mbuf.h>
48 #include <sys/socket.h>
49 #include <sys/sysctl.h>
50 #include <sys/syslog.h>
51 #include <sys/sysproto.h>
52 #include <sys/proc.h>
53 #include <sys/domain.h>
54 #include <sys/kernel.h>
55 
56 #include <net/if.h>
57 #include <net/if_dl.h>
58 #include <net/route.h>
59 #include <net/vnet.h>
60 #include <net/flowtable.h>
61 
62 #ifdef RADIX_MPATH
63 #include <net/radix_mpath.h>
64 #endif
65 
66 #include <netinet/in.h>
67 #include <netinet/ip_mroute.h>
68 
69 #include <vm/uma.h>
70 
71 #define	RT_MAXFIBS	UINT16_MAX
72 
73 /* Kernel config default option. */
74 #ifdef ROUTETABLES
75 #if ROUTETABLES <= 0
76 #error "ROUTETABLES defined too low"
77 #endif
78 #if ROUTETABLES > RT_MAXFIBS
79 #error "ROUTETABLES defined too big"
80 #endif
81 #define	RT_NUMFIBS	ROUTETABLES
82 #endif /* ROUTETABLES */
83 /* Initialize to default if not otherwise set. */
84 #ifndef	RT_NUMFIBS
85 #define	RT_NUMFIBS	1
86 #endif
87 
88 /* This is read-only.. */
89 u_int rt_numfibs = RT_NUMFIBS;
90 SYSCTL_UINT(_net, OID_AUTO, fibs, CTLFLAG_RD, &rt_numfibs, 0, "");
91 /* and this can be set too big but will be fixed before it is used */
92 TUNABLE_INT("net.fibs", &rt_numfibs);
93 
94 /*
95  * By default add routes to all fibs for new interfaces.
96  * Once this is set to 0 then only allocate routes on interface
97  * changes for the FIB of the caller when adding a new set of addresses
98  * to an interface.  XXX this is a shotgun approach to a problem that needs
99  * a more fine grained solution.. that will come.
100  * XXX also has the problems getting the FIB from curthread which will not
101  * always work given the fib can be overridden and prefixes can be added
102  * from the network stack context.
103  */
104 u_int rt_add_addr_allfibs = 1;
105 SYSCTL_UINT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RW,
106     &rt_add_addr_allfibs, 0, "");
107 TUNABLE_INT("net.add_addr_allfibs", &rt_add_addr_allfibs);
108 
109 VNET_DEFINE(struct rtstat, rtstat);
110 #define	V_rtstat	VNET(rtstat)
111 
112 VNET_DEFINE(struct radix_node_head *, rt_tables);
113 #define	V_rt_tables	VNET(rt_tables)
114 
115 VNET_DEFINE(int, rttrash);		/* routes not in table but not freed */
116 #define	V_rttrash	VNET(rttrash)
117 
118 
119 /* compare two sockaddr structures */
120 #define	sa_equal(a1, a2) (bcmp((a1), (a2), (a1)->sa_len) == 0)
121 
122 /*
123  * Convert a 'struct radix_node *' to a 'struct rtentry *'.
124  * The operation can be done safely (in this code) because a
125  * 'struct rtentry' starts with two 'struct radix_node''s, the first
126  * one representing leaf nodes in the routing tree, which is
127  * what the code in radix.c passes us as a 'struct radix_node'.
128  *
129  * But because there are a lot of assumptions in this conversion,
130  * do not cast explicitly, but always use the macro below.
131  */
132 #define RNTORT(p)	((struct rtentry *)(p))
133 
134 static VNET_DEFINE(uma_zone_t, rtzone);		/* Routing table UMA zone. */
135 #define	V_rtzone	VNET(rtzone)
136 
137 /*
138  * handler for net.my_fibnum
139  */
140 static int
141 sysctl_my_fibnum(SYSCTL_HANDLER_ARGS)
142 {
143         int fibnum;
144         int error;
145 
146         fibnum = curthread->td_proc->p_fibnum;
147         error = sysctl_handle_int(oidp, &fibnum, 0, req);
148         return (error);
149 }
150 
151 SYSCTL_PROC(_net, OID_AUTO, my_fibnum, CTLTYPE_INT|CTLFLAG_RD,
152             NULL, 0, &sysctl_my_fibnum, "I", "default FIB of caller");
153 
154 static __inline struct radix_node_head **
155 rt_tables_get_rnh_ptr(int table, int fam)
156 {
157 	struct radix_node_head **rnh;
158 
159 	KASSERT(table >= 0 && table < rt_numfibs, ("%s: table out of bounds.",
160 	    __func__));
161 	KASSERT(fam >= 0 && fam < (AF_MAX+1), ("%s: fam out of bounds.",
162 	    __func__));
163 
164 	/* rnh is [fib=0][af=0]. */
165 	rnh = (struct radix_node_head **)V_rt_tables;
166 	/* Get the offset to the requested table and fam. */
167 	rnh += table * (AF_MAX+1) + fam;
168 
169 	return (rnh);
170 }
171 
/*
 * Return the radix head stored for (table, fam); the stored pointer is
 * returned as-is, so the result is NULL when the slot holds no head.
 */
struct radix_node_head *
rt_tables_get_rnh(int table, int fam)
{
	struct radix_node_head **slot;

	slot = rt_tables_get_rnh_ptr(table, fam);
	return (*slot);
}
178 
179 /*
180  * route initialization must occur before ip6_init2(), which happenas at
181  * SI_ORDER_MIDDLE.
182  */
183 static void
184 route_init(void)
185 {
186 	struct domain *dom;
187 	int max_keylen = 0;
188 
189 	/* whack the tunable ints into  line. */
190 	if (rt_numfibs > RT_MAXFIBS)
191 		rt_numfibs = RT_MAXFIBS;
192 	if (rt_numfibs == 0)
193 		rt_numfibs = 1;
194 
195 	for (dom = domains; dom; dom = dom->dom_next)
196 		if (dom->dom_maxrtkey > max_keylen)
197 			max_keylen = dom->dom_maxrtkey;
198 
199 	rn_init(max_keylen);	/* init all zeroes, all ones, mask table */
200 }
201 SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, 0);
202 
203 static void
204 vnet_route_init(const void *unused __unused)
205 {
206 	struct domain *dom;
207 	struct radix_node_head **rnh;
208 	int table;
209 	int fam;
210 
211 	V_rt_tables = malloc(rt_numfibs * (AF_MAX+1) *
212 	    sizeof(struct radix_node_head *), M_RTABLE, M_WAITOK|M_ZERO);
213 
214 	V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), NULL, NULL,
215 	    NULL, NULL, UMA_ALIGN_PTR, 0);
216 	for (dom = domains; dom; dom = dom->dom_next) {
217 		if (dom->dom_rtattach == NULL)
218 			continue;
219 
220 		for  (table = 0; table < rt_numfibs; table++) {
221 			fam = dom->dom_family;
222 			if (table != 0 && fam != AF_INET6 && fam != AF_INET)
223 				break;
224 
225 			/*
226 			 * XXX MRT rtattach will be also called from
227 			 * vfs_export.c but the offset will be 0 (only for
228 			 * AF_INET and AF_INET6 which don't need it anyhow).
229 			 */
230 			rnh = rt_tables_get_rnh_ptr(table, fam);
231 			if (rnh == NULL)
232 				panic("%s: rnh NULL", __func__);
233 			dom->dom_rtattach((void **)rnh, dom->dom_rtoffset);
234 		}
235 	}
236 }
237 VNET_SYSINIT(vnet_route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH,
238     vnet_route_init, 0);
239 
240 #ifdef VIMAGE
241 static void
242 vnet_route_uninit(const void *unused __unused)
243 {
244 	int table;
245 	int fam;
246 	struct domain *dom;
247 	struct radix_node_head **rnh;
248 
249 	for (dom = domains; dom; dom = dom->dom_next) {
250 		if (dom->dom_rtdetach == NULL)
251 			continue;
252 
253 		for (table = 0; table < rt_numfibs; table++) {
254 			fam = dom->dom_family;
255 
256 			if (table != 0 && fam != AF_INET6 && fam != AF_INET)
257 				break;
258 
259 			rnh = rt_tables_get_rnh_ptr(table, fam);
260 			if (rnh == NULL)
261 				panic("%s: rnh NULL", __func__);
262 			dom->dom_rtdetach((void **)rnh, dom->dom_rtoffset);
263 		}
264 	}
265 }
266 VNET_SYSUNINIT(vnet_route_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
267     vnet_route_uninit, 0);
268 #endif
269 
270 #ifndef _SYS_SYSPROTO_H_
271 struct setfib_args {
272 	int     fibnum;
273 };
274 #endif
275 int
276 sys_setfib(struct thread *td, struct setfib_args *uap)
277 {
278 	if (uap->fibnum < 0 || uap->fibnum >= rt_numfibs)
279 		return EINVAL;
280 	td->td_proc->p_fibnum = uap->fibnum;
281 	return (0);
282 }
283 
284 /*
285  * Packet routing routines.
286  */
287 void
288 rtalloc(struct route *ro)
289 {
290 
291 	rtalloc_ign_fib(ro, 0UL, RT_DEFAULT_FIB);
292 }
293 
294 void
295 rtalloc_fib(struct route *ro, u_int fibnum)
296 {
297 	rtalloc_ign_fib(ro, 0UL, fibnum);
298 }
299 
300 void
301 rtalloc_ign(struct route *ro, u_long ignore)
302 {
303 	struct rtentry *rt;
304 
305 	if ((rt = ro->ro_rt) != NULL) {
306 		if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP)
307 			return;
308 		RTFREE(rt);
309 		ro->ro_rt = NULL;
310 	}
311 	ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, RT_DEFAULT_FIB);
312 	if (ro->ro_rt)
313 		RT_UNLOCK(ro->ro_rt);
314 }
315 
316 void
317 rtalloc_ign_fib(struct route *ro, u_long ignore, u_int fibnum)
318 {
319 	struct rtentry *rt;
320 
321 	if ((rt = ro->ro_rt) != NULL) {
322 		if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP)
323 			return;
324 		RTFREE(rt);
325 		ro->ro_rt = NULL;
326 	}
327 	ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, fibnum);
328 	if (ro->ro_rt)
329 		RT_UNLOCK(ro->ro_rt);
330 }
331 
332 /*
333  * Look up the route that matches the address given
334  * Or, at least try.. Create a cloned route if needed.
335  *
336  * The returned route, if any, is locked.
337  */
338 struct rtentry *
339 rtalloc1(struct sockaddr *dst, int report, u_long ignflags)
340 {
341 
342 	return (rtalloc1_fib(dst, report, ignflags, RT_DEFAULT_FIB));
343 }
344 
345 struct rtentry *
346 rtalloc1_fib(struct sockaddr *dst, int report, u_long ignflags,
347 		    u_int fibnum)
348 {
349 	struct radix_node_head *rnh;
350 	struct radix_node *rn;
351 	struct rtentry *newrt;
352 	struct rt_addrinfo info;
353 	int err = 0, msgtype = RTM_MISS;
354 	int needlock;
355 
356 	KASSERT((fibnum < rt_numfibs), ("rtalloc1_fib: bad fibnum"));
357 	switch (dst->sa_family) {
358 	case AF_INET6:
359 	case AF_INET:
360 		/* We support multiple FIBs. */
361 		break;
362 	default:
363 		fibnum = RT_DEFAULT_FIB;
364 		break;
365 	}
366 	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
367 	newrt = NULL;
368 	if (rnh == NULL)
369 		goto miss;
370 
371 	/*
372 	 * Look up the address in the table for that Address Family
373 	 */
374 	needlock = !(ignflags & RTF_RNH_LOCKED);
375 	if (needlock)
376 		RADIX_NODE_HEAD_RLOCK(rnh);
377 #ifdef INVARIANTS
378 	else
379 		RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
380 #endif
381 	rn = rnh->rnh_matchaddr(dst, rnh);
382 	if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) {
383 		newrt = RNTORT(rn);
384 		RT_LOCK(newrt);
385 		RT_ADDREF(newrt);
386 		if (needlock)
387 			RADIX_NODE_HEAD_RUNLOCK(rnh);
388 		goto done;
389 
390 	} else if (needlock)
391 		RADIX_NODE_HEAD_RUNLOCK(rnh);
392 
393 	/*
394 	 * Either we hit the root or couldn't find any match,
395 	 * Which basically means
396 	 * "caint get there frm here"
397 	 */
398 miss:
399 	V_rtstat.rts_unreach++;
400 
401 	if (report) {
402 		/*
403 		 * If required, report the failure to the supervising
404 		 * Authorities.
405 		 * For a delete, this is not an error. (report == 0)
406 		 */
407 		bzero(&info, sizeof(info));
408 		info.rti_info[RTAX_DST] = dst;
409 		rt_missmsg_fib(msgtype, &info, 0, err, fibnum);
410 	}
411 done:
412 	if (newrt)
413 		RT_LOCK_ASSERT(newrt);
414 	return (newrt);
415 }
416 
417 /*
418  * Remove a reference count from an rtentry.
419  * If the count gets low enough, take it out of the routing table
420  */
421 void
422 rtfree(struct rtentry *rt)
423 {
424 	struct radix_node_head *rnh;
425 
426 	KASSERT(rt != NULL,("%s: NULL rt", __func__));
427 	rnh = rt_tables_get_rnh(rt->rt_fibnum, rt_key(rt)->sa_family);
428 	KASSERT(rnh != NULL,("%s: NULL rnh", __func__));
429 
430 	RT_LOCK_ASSERT(rt);
431 
432 	/*
433 	 * The callers should use RTFREE_LOCKED() or RTFREE(), so
434 	 * we should come here exactly with the last reference.
435 	 */
436 	RT_REMREF(rt);
437 	if (rt->rt_refcnt > 0) {
438 		log(LOG_DEBUG, "%s: %p has %d refs\n", __func__, rt, rt->rt_refcnt);
439 		goto done;
440 	}
441 
442 	/*
443 	 * On last reference give the "close method" a chance
444 	 * to cleanup private state.  This also permits (for
445 	 * IPv4 and IPv6) a chance to decide if the routing table
446 	 * entry should be purged immediately or at a later time.
447 	 * When an immediate purge is to happen the close routine
448 	 * typically calls rtexpunge which clears the RTF_UP flag
449 	 * on the entry so that the code below reclaims the storage.
450 	 */
451 	if (rt->rt_refcnt == 0 && rnh->rnh_close)
452 		rnh->rnh_close((struct radix_node *)rt, rnh);
453 
454 	/*
455 	 * If we are no longer "up" (and ref == 0)
456 	 * then we can free the resources associated
457 	 * with the route.
458 	 */
459 	if ((rt->rt_flags & RTF_UP) == 0) {
460 		if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT))
461 			panic("rtfree 2");
462 		/*
463 		 * the rtentry must have been removed from the routing table
464 		 * so it is represented in rttrash.. remove that now.
465 		 */
466 		V_rttrash--;
467 #ifdef	DIAGNOSTIC
468 		if (rt->rt_refcnt < 0) {
469 			printf("rtfree: %p not freed (neg refs)\n", rt);
470 			goto done;
471 		}
472 #endif
473 		/*
474 		 * release references on items we hold them on..
475 		 * e.g other routes and ifaddrs.
476 		 */
477 		if (rt->rt_ifa)
478 			ifa_free(rt->rt_ifa);
479 		/*
480 		 * The key is separatly alloc'd so free it (see rt_setgate()).
481 		 * This also frees the gateway, as they are always malloc'd
482 		 * together.
483 		 */
484 		Free(rt_key(rt));
485 
486 		/*
487 		 * and the rtentry itself of course
488 		 */
489 		RT_LOCK_DESTROY(rt);
490 		uma_zfree(V_rtzone, rt);
491 		return;
492 	}
493 done:
494 	RT_UNLOCK(rt);
495 }
496 
497 
498 /*
499  * Force a routing table entry to the specified
500  * destination to go through the given gateway.
501  * Normally called as a result of a routing redirect
502  * message from the network layer.
503  */
504 void
505 rtredirect(struct sockaddr *dst,
506 	struct sockaddr *gateway,
507 	struct sockaddr *netmask,
508 	int flags,
509 	struct sockaddr *src)
510 {
511 
512 	rtredirect_fib(dst, gateway, netmask, flags, src, RT_DEFAULT_FIB);
513 }
514 
515 void
516 rtredirect_fib(struct sockaddr *dst,
517 	struct sockaddr *gateway,
518 	struct sockaddr *netmask,
519 	int flags,
520 	struct sockaddr *src,
521 	u_int fibnum)
522 {
523 	struct rtentry *rt, *rt0 = NULL;
524 	int error = 0;
525 	short *stat = NULL;
526 	struct rt_addrinfo info;
527 	struct ifaddr *ifa;
528 	struct radix_node_head *rnh;
529 
530 	ifa = NULL;
531 	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
532 	if (rnh == NULL) {
533 		error = EAFNOSUPPORT;
534 		goto out;
535 	}
536 
537 	/* verify the gateway is directly reachable */
538 	if ((ifa = ifa_ifwithnet(gateway, 0)) == NULL) {
539 		error = ENETUNREACH;
540 		goto out;
541 	}
542 	rt = rtalloc1_fib(dst, 0, 0UL, fibnum);	/* NB: rt is locked */
543 	/*
544 	 * If the redirect isn't from our current router for this dst,
545 	 * it's either old or wrong.  If it redirects us to ourselves,
546 	 * we have a routing loop, perhaps as a result of an interface
547 	 * going down recently.
548 	 */
549 	if (!(flags & RTF_DONE) && rt &&
550 	     (!sa_equal(src, rt->rt_gateway) || rt->rt_ifa != ifa))
551 		error = EINVAL;
552 	else if (ifa_ifwithaddr_check(gateway))
553 		error = EHOSTUNREACH;
554 	if (error)
555 		goto done;
556 	/*
557 	 * Create a new entry if we just got back a wildcard entry
558 	 * or the lookup failed.  This is necessary for hosts
559 	 * which use routing redirects generated by smart gateways
560 	 * to dynamically build the routing tables.
561 	 */
562 	if (rt == NULL || (rt_mask(rt) && rt_mask(rt)->sa_len < 2))
563 		goto create;
564 	/*
565 	 * Don't listen to the redirect if it's
566 	 * for a route to an interface.
567 	 */
568 	if (rt->rt_flags & RTF_GATEWAY) {
569 		if (((rt->rt_flags & RTF_HOST) == 0) && (flags & RTF_HOST)) {
570 			/*
571 			 * Changing from route to net => route to host.
572 			 * Create new route, rather than smashing route to net.
573 			 */
574 		create:
575 			rt0 = rt;
576 			rt = NULL;
577 
578 			flags |=  RTF_GATEWAY | RTF_DYNAMIC;
579 			bzero((caddr_t)&info, sizeof(info));
580 			info.rti_info[RTAX_DST] = dst;
581 			info.rti_info[RTAX_GATEWAY] = gateway;
582 			info.rti_info[RTAX_NETMASK] = netmask;
583 			info.rti_ifa = ifa;
584 			info.rti_flags = flags;
585 			if (rt0 != NULL)
586 				RT_UNLOCK(rt0);	/* drop lock to avoid LOR with RNH */
587 			error = rtrequest1_fib(RTM_ADD, &info, &rt, fibnum);
588 			if (rt != NULL) {
589 				RT_LOCK(rt);
590 				if (rt0 != NULL)
591 					EVENTHANDLER_INVOKE(route_redirect_event, rt0, rt, dst);
592 				flags = rt->rt_flags;
593 			}
594 			if (rt0 != NULL)
595 				RTFREE(rt0);
596 
597 			stat = &V_rtstat.rts_dynamic;
598 		} else {
599 			struct rtentry *gwrt;
600 
601 			/*
602 			 * Smash the current notion of the gateway to
603 			 * this destination.  Should check about netmask!!!
604 			 */
605 			rt->rt_flags |= RTF_MODIFIED;
606 			flags |= RTF_MODIFIED;
607 			stat = &V_rtstat.rts_newgateway;
608 			/*
609 			 * add the key and gateway (in one malloc'd chunk).
610 			 */
611 			RT_UNLOCK(rt);
612 			RADIX_NODE_HEAD_LOCK(rnh);
613 			RT_LOCK(rt);
614 			rt_setgate(rt, rt_key(rt), gateway);
615 			gwrt = rtalloc1(gateway, 1, RTF_RNH_LOCKED);
616 			RADIX_NODE_HEAD_UNLOCK(rnh);
617 			EVENTHANDLER_INVOKE(route_redirect_event, rt, gwrt, dst);
618 			RTFREE_LOCKED(gwrt);
619 		}
620 	} else
621 		error = EHOSTUNREACH;
622 done:
623 	if (rt)
624 		RTFREE_LOCKED(rt);
625 out:
626 	if (error)
627 		V_rtstat.rts_badredirect++;
628 	else if (stat != NULL)
629 		(*stat)++;
630 	bzero((caddr_t)&info, sizeof(info));
631 	info.rti_info[RTAX_DST] = dst;
632 	info.rti_info[RTAX_GATEWAY] = gateway;
633 	info.rti_info[RTAX_NETMASK] = netmask;
634 	info.rti_info[RTAX_AUTHOR] = src;
635 	rt_missmsg_fib(RTM_REDIRECT, &info, flags, error, fibnum);
636 	if (ifa != NULL)
637 		ifa_free(ifa);
638 }
639 
640 int
641 rtioctl(u_long req, caddr_t data)
642 {
643 
644 	return (rtioctl_fib(req, data, RT_DEFAULT_FIB));
645 }
646 
647 /*
648  * Routing table ioctl interface.
649  */
650 int
651 rtioctl_fib(u_long req, caddr_t data, u_int fibnum)
652 {
653 
654 	/*
655 	 * If more ioctl commands are added here, make sure the proper
656 	 * super-user checks are being performed because it is possible for
657 	 * prison-root to make it this far if raw sockets have been enabled
658 	 * in jails.
659 	 */
660 #ifdef INET
661 	/* Multicast goop, grrr... */
662 	return mrt_ioctl ? mrt_ioctl(req, data, fibnum) : EOPNOTSUPP;
663 #else /* INET */
664 	return ENXIO;
665 #endif /* INET */
666 }
667 
668 /*
669  * For both ifa_ifwithroute() routines, 'ifa' is returned referenced.
670  */
671 struct ifaddr *
672 ifa_ifwithroute(int flags, struct sockaddr *dst, struct sockaddr *gateway)
673 {
674 
675 	return (ifa_ifwithroute_fib(flags, dst, gateway, RT_DEFAULT_FIB));
676 }
677 
678 struct ifaddr *
679 ifa_ifwithroute_fib(int flags, struct sockaddr *dst, struct sockaddr *gateway,
680 				u_int fibnum)
681 {
682 	register struct ifaddr *ifa;
683 	int not_found = 0;
684 
685 	if ((flags & RTF_GATEWAY) == 0) {
686 		/*
687 		 * If we are adding a route to an interface,
688 		 * and the interface is a pt to pt link
689 		 * we should search for the destination
690 		 * as our clue to the interface.  Otherwise
691 		 * we can use the local address.
692 		 */
693 		ifa = NULL;
694 		if (flags & RTF_HOST)
695 			ifa = ifa_ifwithdstaddr(dst);
696 		if (ifa == NULL)
697 			ifa = ifa_ifwithaddr(gateway);
698 	} else {
699 		/*
700 		 * If we are adding a route to a remote net
701 		 * or host, the gateway may still be on the
702 		 * other end of a pt to pt link.
703 		 */
704 		ifa = ifa_ifwithdstaddr(gateway);
705 	}
706 	if (ifa == NULL)
707 		ifa = ifa_ifwithnet(gateway, 0);
708 	if (ifa == NULL) {
709 		struct rtentry *rt = rtalloc1_fib(gateway, 0, RTF_RNH_LOCKED, fibnum);
710 		if (rt == NULL)
711 			return (NULL);
712 		/*
713 		 * dismiss a gateway that is reachable only
714 		 * through the default router
715 		 */
716 		switch (gateway->sa_family) {
717 		case AF_INET:
718 			if (satosin(rt_key(rt))->sin_addr.s_addr == INADDR_ANY)
719 				not_found = 1;
720 			break;
721 		case AF_INET6:
722 			if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(rt))->sin6_addr))
723 				not_found = 1;
724 			break;
725 		default:
726 			break;
727 		}
728 		if (!not_found && rt->rt_ifa != NULL) {
729 			ifa = rt->rt_ifa;
730 			ifa_ref(ifa);
731 		}
732 		RT_REMREF(rt);
733 		RT_UNLOCK(rt);
734 		if (not_found || ifa == NULL)
735 			return (NULL);
736 	}
737 	if (ifa->ifa_addr->sa_family != dst->sa_family) {
738 		struct ifaddr *oifa = ifa;
739 		ifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp);
740 		if (ifa == NULL)
741 			ifa = oifa;
742 		else
743 			ifa_free(oifa);
744 	}
745 	return (ifa);
746 }
747 
748 /*
749  * Do appropriate manipulations of a routing tree given
750  * all the bits of info needed
751  */
752 int
753 rtrequest(int req,
754 	struct sockaddr *dst,
755 	struct sockaddr *gateway,
756 	struct sockaddr *netmask,
757 	int flags,
758 	struct rtentry **ret_nrt)
759 {
760 
761 	return (rtrequest_fib(req, dst, gateway, netmask, flags, ret_nrt,
762 	    RT_DEFAULT_FIB));
763 }
764 
765 int
766 rtrequest_fib(int req,
767 	struct sockaddr *dst,
768 	struct sockaddr *gateway,
769 	struct sockaddr *netmask,
770 	int flags,
771 	struct rtentry **ret_nrt,
772 	u_int fibnum)
773 {
774 	struct rt_addrinfo info;
775 
776 	if (dst->sa_len == 0)
777 		return(EINVAL);
778 
779 	bzero((caddr_t)&info, sizeof(info));
780 	info.rti_flags = flags;
781 	info.rti_info[RTAX_DST] = dst;
782 	info.rti_info[RTAX_GATEWAY] = gateway;
783 	info.rti_info[RTAX_NETMASK] = netmask;
784 	return rtrequest1_fib(req, &info, ret_nrt, fibnum);
785 }
786 
787 /*
788  * These (questionable) definitions of apparent local variables apply
789  * to the next two functions.  XXXXXX!!!
790  */
791 #define	dst	info->rti_info[RTAX_DST]
792 #define	gateway	info->rti_info[RTAX_GATEWAY]
793 #define	netmask	info->rti_info[RTAX_NETMASK]
794 #define	ifaaddr	info->rti_info[RTAX_IFA]
795 #define	ifpaddr	info->rti_info[RTAX_IFP]
796 #define	flags	info->rti_flags
797 
798 int
799 rt_getifa(struct rt_addrinfo *info)
800 {
801 
802 	return (rt_getifa_fib(info, RT_DEFAULT_FIB));
803 }
804 
805 /*
806  * Look up rt_addrinfo for a specific fib.  Note that if rti_ifa is defined,
807  * it will be referenced so the caller must free it.
808  */
809 int
810 rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum)
811 {
812 	struct ifaddr *ifa;
813 	int error = 0;
814 
815 	/*
816 	 * ifp may be specified by sockaddr_dl
817 	 * when protocol address is ambiguous.
818 	 */
819 	if (info->rti_ifp == NULL && ifpaddr != NULL &&
820 	    ifpaddr->sa_family == AF_LINK &&
821 	    (ifa = ifa_ifwithnet(ifpaddr, 0)) != NULL) {
822 		info->rti_ifp = ifa->ifa_ifp;
823 		ifa_free(ifa);
824 	}
825 	if (info->rti_ifa == NULL && ifaaddr != NULL)
826 		info->rti_ifa = ifa_ifwithaddr(ifaaddr);
827 	if (info->rti_ifa == NULL) {
828 		struct sockaddr *sa;
829 
830 		sa = ifaaddr != NULL ? ifaaddr :
831 		    (gateway != NULL ? gateway : dst);
832 		if (sa != NULL && info->rti_ifp != NULL)
833 			info->rti_ifa = ifaof_ifpforaddr(sa, info->rti_ifp);
834 		else if (dst != NULL && gateway != NULL)
835 			info->rti_ifa = ifa_ifwithroute_fib(flags, dst, gateway,
836 							fibnum);
837 		else if (sa != NULL)
838 			info->rti_ifa = ifa_ifwithroute_fib(flags, sa, sa,
839 							fibnum);
840 	}
841 	if ((ifa = info->rti_ifa) != NULL) {
842 		if (info->rti_ifp == NULL)
843 			info->rti_ifp = ifa->ifa_ifp;
844 	} else
845 		error = ENETUNREACH;
846 	return (error);
847 }
848 
849 /*
850  * Expunges references to a route that's about to be reclaimed.
851  * The route must be locked.
852  */
853 int
854 rtexpunge(struct rtentry *rt)
855 {
856 #if !defined(RADIX_MPATH)
857 	struct radix_node *rn;
858 #else
859 	struct rt_addrinfo info;
860 	int fib;
861 	struct rtentry *rt0;
862 #endif
863 	struct radix_node_head *rnh;
864 	struct ifaddr *ifa;
865 	int error = 0;
866 
867 	/*
868 	 * Find the correct routing tree to use for this Address Family
869 	 */
870 	rnh = rt_tables_get_rnh(rt->rt_fibnum, rt_key(rt)->sa_family);
871 	RT_LOCK_ASSERT(rt);
872 	if (rnh == NULL)
873 		return (EAFNOSUPPORT);
874 	RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
875 
876 #ifdef RADIX_MPATH
877 	fib = rt->rt_fibnum;
878 	bzero(&info, sizeof(info));
879 	info.rti_ifp = rt->rt_ifp;
880 	info.rti_flags = RTF_RNH_LOCKED;
881 	info.rti_info[RTAX_DST] = rt_key(rt);
882 	info.rti_info[RTAX_GATEWAY] = rt->rt_ifa->ifa_addr;
883 
884 	RT_UNLOCK(rt);
885 	error = rtrequest1_fib(RTM_DELETE, &info, &rt0, fib);
886 
887 	if (error == 0 && rt0 != NULL) {
888 		rt = rt0;
889 		RT_LOCK(rt);
890 	} else if (error != 0) {
891 		RT_LOCK(rt);
892 		return (error);
893 	}
894 #else
895 	/*
896 	 * Remove the item from the tree; it should be there,
897 	 * but when callers invoke us blindly it may not (sigh).
898 	 */
899 	rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), rnh);
900 	if (rn == NULL) {
901 		error = ESRCH;
902 		goto bad;
903 	}
904 	KASSERT((rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) == 0,
905 		("unexpected flags 0x%x", rn->rn_flags));
906 	KASSERT(rt == RNTORT(rn),
907 		("lookup mismatch, rt %p rn %p", rt, rn));
908 #endif /* RADIX_MPATH */
909 
910 	rt->rt_flags &= ~RTF_UP;
911 
912 	/*
913 	 * Give the protocol a chance to keep things in sync.
914 	 */
915 	if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest) {
916 		struct rt_addrinfo info;
917 
918 		bzero((caddr_t)&info, sizeof(info));
919 		info.rti_flags = rt->rt_flags;
920 		info.rti_info[RTAX_DST] = rt_key(rt);
921 		info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
922 		info.rti_info[RTAX_NETMASK] = rt_mask(rt);
923 		ifa->ifa_rtrequest(RTM_DELETE, rt, &info);
924 	}
925 
926 	/*
927 	 * one more rtentry floating around that is not
928 	 * linked to the routing table.
929 	 */
930 	V_rttrash++;
931 #if !defined(RADIX_MPATH)
932 bad:
933 #endif
934 	return (error);
935 }
936 
937 #ifdef RADIX_MPATH
938 static int
939 rn_mpath_update(int req, struct rt_addrinfo *info,
940     struct radix_node_head *rnh, struct rtentry **ret_nrt)
941 {
942 	/*
943 	 * if we got multipath routes, we require users to specify
944 	 * a matching RTAX_GATEWAY.
945 	 */
946 	struct rtentry *rt, *rto = NULL;
947 	register struct radix_node *rn;
948 	int error = 0;
949 
950 	rn = rnh->rnh_matchaddr(dst, rnh);
951 	if (rn == NULL)
952 		return (ESRCH);
953 	rto = rt = RNTORT(rn);
954 	rt = rt_mpath_matchgate(rt, gateway);
955 	if (rt == NULL)
956 		return (ESRCH);
957 	/*
958 	 * this is the first entry in the chain
959 	 */
960 	if (rto == rt) {
961 		rn = rn_mpath_next((struct radix_node *)rt);
962 		/*
963 		 * there is another entry, now it's active
964 		 */
965 		if (rn) {
966 			rto = RNTORT(rn);
967 			RT_LOCK(rto);
968 			rto->rt_flags |= RTF_UP;
969 			RT_UNLOCK(rto);
970 		} else if (rt->rt_flags & RTF_GATEWAY) {
971 			/*
972 			 * For gateway routes, we need to
973 			 * make sure that we we are deleting
974 			 * the correct gateway.
975 			 * rt_mpath_matchgate() does not
976 			 * check the case when there is only
977 			 * one route in the chain.
978 			 */
979 			if (gateway &&
980 			    (rt->rt_gateway->sa_len != gateway->sa_len ||
981 				memcmp(rt->rt_gateway, gateway, gateway->sa_len)))
982 				error = ESRCH;
983 			else {
984 				/*
985 				 * remove from tree before returning it
986 				 * to the caller
987 				 */
988 				rn = rnh->rnh_deladdr(dst, netmask, rnh);
989 				KASSERT(rt == RNTORT(rn), ("radix node disappeared"));
990 				goto gwdelete;
991 			}
992 
993 		}
994 		/*
995 		 * use the normal delete code to remove
996 		 * the first entry
997 		 */
998 		if (req != RTM_DELETE)
999 			goto nondelete;
1000 
1001 		error = ENOENT;
1002 		goto done;
1003 	}
1004 
1005 	/*
1006 	 * if the entry is 2nd and on up
1007 	 */
1008 	if ((req == RTM_DELETE) && !rt_mpath_deldup(rto, rt))
1009 		panic ("rtrequest1: rt_mpath_deldup");
1010 gwdelete:
1011 	RT_LOCK(rt);
1012 	RT_ADDREF(rt);
1013 	if (req == RTM_DELETE) {
1014 		rt->rt_flags &= ~RTF_UP;
1015 		/*
1016 		 * One more rtentry floating around that is not
1017 		 * linked to the routing table. rttrash will be decremented
1018 		 * when RTFREE(rt) is eventually called.
1019 		 */
1020 		V_rttrash++;
1021 	}
1022 
1023 nondelete:
1024 	if (req != RTM_DELETE)
1025 		panic("unrecognized request %d", req);
1026 
1027 
1028 	/*
1029 	 * If the caller wants it, then it can have it,
1030 	 * but it's up to it to free the rtentry as we won't be
1031 	 * doing it.
1032 	 */
1033 	if (ret_nrt) {
1034 		*ret_nrt = rt;
1035 		RT_UNLOCK(rt);
1036 	} else
1037 		RTFREE_LOCKED(rt);
1038 done:
1039 	return (error);
1040 }
1041 #endif
1042 
/*
 * rtrequest1_fib: carry out one routing-table request against the radix
 * tree selected by fibnum and by the address family of the destination.
 *
 * req      RTM_DELETE or RTM_ADD do real work; RTM_RESOLVE is accepted and
 *          does nothing; any other value yields EOPNOTSUPP.
 * info     description of the request.  The bare names dst, gateway,
 *          netmask and flags used below are macros (they are #undef-ed
 *          right after this function); their definitions are outside this
 *          excerpt -- presumably they expand to fields of info (TODO
 *          confirm).
 * ret_nrt  if non-NULL it receives the affected rtentry.  For RTM_DELETE
 *          this is the unlinked entry together with the reference taken
 *          here; for RTM_ADD it is the new entry with one extra reference.
 *          In both cases the caller has to release it.
 * fibnum   FIB index; replaced by RT_DEFAULT_FIB unless the destination
 *          family is AF_INET or AF_INET6.
 *
 * Returns 0 on success, otherwise an errno value: EAFNOSUPPORT (no tree
 * for this family), EADDRINUSE, ESRCH, EINVAL, ENOBUFS, EEXIST, EOPNOTSUPP,
 * or whatever rt_getifa_fib(), rt_setgate() or rn_mpath_update() reported.
 *
 * Locking: the radix head lock is taken here unless the caller passed
 * RTF_RNH_LOCKED in flags, in which case it is only asserted.  It is
 * released at the bad label only if it was taken here.
 */
1043 int
1044 rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt,
1045 				u_int fibnum)
1046 {
1047 	int error = 0, needlock = 0;
1048 	register struct rtentry *rt;
1049 #ifdef FLOWTABLE
1050 	register struct rtentry *rt0;
1051 #endif
1052 	register struct radix_node *rn;
1053 	register struct radix_node_head *rnh;
1054 	struct ifaddr *ifa;
1055 	struct sockaddr *ndst;
1056 	struct sockaddr_storage mdst;
/* Record an error code and jump to the common unlock-and-return path. */
1057 #define senderr(x) { error = x ; goto bad; }
1058 
1059 	KASSERT((fibnum < rt_numfibs), ("rtrequest1_fib: bad fibnum"));
	/* Only AF_INET and AF_INET6 keep the FIB the caller asked for. */
1060 	switch (dst->sa_family) {
1061 	case AF_INET6:
1062 	case AF_INET:
1063 		/* We support multiple FIBs. */
1064 		break;
1065 	default:
1066 		fibnum = RT_DEFAULT_FIB;
1067 		break;
1068 	}
1069 
1070 	/*
1071 	 * Find the correct routing tree to use for this Address Family
1072 	 */
1073 	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
	/* No lock is held yet, so a plain return is fine here. */
1074 	if (rnh == NULL)
1075 		return (EAFNOSUPPORT);
	/*
	 * RTF_RNH_LOCKED only tells us whether the caller already holds the
	 * head lock.  It is stripped from flags right away, so it is not
	 * part of the value later stored in rt_flags.
	 */
1076 	needlock = ((flags & RTF_RNH_LOCKED) == 0);
1077 	flags &= ~RTF_RNH_LOCKED;
1078 	if (needlock)
1079 		RADIX_NODE_HEAD_LOCK(rnh);
1080 	else
1081 		RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
1082 	/*
1083 	 * If we are adding a host route then we don't want to put
1084 	 * a netmask in the tree, nor do we want to clone it.
1085 	 */
1086 	if (flags & RTF_HOST)
1087 		netmask = NULL;
1088 
1089 	switch (req) {
1090 	case RTM_DELETE:
		/*
		 * With a netmask, look the route up by its masked
		 * destination; mdst is the on-stack scratch copy.
		 */
1091 		if (netmask) {
1092 			rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask);
1093 			dst = (struct sockaddr *)&mdst;
1094 		}
1095 #ifdef RADIX_MPATH
		/*
		 * Let the multipath code try first.  Only ENOENT falls
		 * through to the ordinary delete below; every other
		 * result, success included, finishes the request.
		 */
1096 		if (rn_mpath_capable(rnh)) {
1097 			error = rn_mpath_update(req, info, rnh, ret_nrt);
1098 			/*
1099 			 * "bad" holds true for the success case
1100 			 * as well
1101 			 */
1102 			if (error != ENOENT)
1103 				goto bad;
1104 			error = 0;
1105 		}
1106 #endif
		/*
		 * Unless the request itself carries RTF_PINNED, refuse to
		 * delete a route that is marked RTF_PINNED.
		 */
1107 		if ((flags & RTF_PINNED) == 0) {
1108 			/* Check if target route can be deleted */
1109 			rt = (struct rtentry *)rnh->rnh_lookup(dst,
1110 			    netmask, rnh);
1111 			if ((rt != NULL) && (rt->rt_flags & RTF_PINNED))
1112 				senderr(EADDRINUSE);
1113 		}
1114 
1115 		/*
1116 		 * Remove the item from the tree and return it.
1117 		 * Complain if it is not there and do no more processing.
1118 		 */
1119 		rn = rnh->rnh_deladdr(dst, netmask, rnh);
1120 		if (rn == NULL)
1121 			senderr(ESRCH);
1122 		if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT))
1123 			panic ("rtrequest delete");
		/*
		 * The entry is out of the tree now.  Take a reference so it
		 * stays valid while we finish, and clear RTF_UP.
		 */
1124 		rt = RNTORT(rn);
1125 		RT_LOCK(rt);
1126 		RT_ADDREF(rt);
1127 		rt->rt_flags &= ~RTF_UP;
1128 
1129 		/*
1130 		 * give the protocol a chance to keep things in sync.
1131 		 */
1132 		if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest)
1133 			ifa->ifa_rtrequest(RTM_DELETE, rt, info);
1134 
1135 		/*
1136 		 * One more rtentry floating around that is not
1137 		 * linked to the routing table. rttrash will be decremented
1138 		 * when RTFREE(rt) is eventually called.
1139 		 */
1140 		V_rttrash++;
1141 
1142 		/*
1143 		 * If the caller wants it, then it can have it,
1144 		 * but it's up to it to free the rtentry as we won't be
1145 		 * doing it.
1146 		 */
1147 		if (ret_nrt) {
1148 			*ret_nrt = rt;
1149 			RT_UNLOCK(rt);
1150 		} else
1151 			RTFREE_LOCKED(rt);
1152 		break;
1153 	case RTM_RESOLVE:
1154 		/*
1155 		 * resolve was only used for route cloning
1156 		 * here for compat
1157 		 */
1158 		break;
1159 	case RTM_ADD:
		/*
		 * Sanity checks: a gateway route needs a gateway, and the
		 * gateway family must match the destination family unless
		 * it is AF_UNSPEC or AF_LINK.
		 */
1160 		if ((flags & RTF_GATEWAY) && !gateway)
1161 			senderr(EINVAL);
1162 		if (dst && gateway && (dst->sa_family != gateway->sa_family) &&
1163 		    (gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK))
1164 			senderr(EINVAL);
1165 
		/*
		 * Either branch leaves us holding one ifa reference; every
		 * failure path below gives it back with ifa_free().
		 */
1166 		if (info->rti_ifa == NULL) {
1167 			error = rt_getifa_fib(info, fibnum);
1168 			if (error)
1169 				senderr(error);
1170 		} else
1171 			ifa_ref(info->rti_ifa);
1172 		ifa = info->rti_ifa;
		/* M_ZERO: the key and gateway pointers of rt start out NULL. */
1173 		rt = uma_zalloc(V_rtzone, M_NOWAIT | M_ZERO);
1174 		if (rt == NULL) {
1175 			ifa_free(ifa);
1176 			senderr(ENOBUFS);
1177 		}
1178 		RT_LOCK_INIT(rt);
1179 		rt->rt_flags = RTF_UP | flags;
1180 		rt->rt_fibnum = fibnum;
1181 		/*
1182 		 * Add the gateway. Possibly re-malloc-ing the storage for it.
1183 		 */
1184 		RT_LOCK(rt);
		/*
		 * rt_setgate() only fails when its allocation fails, which
		 * is before it attaches a key/gateway buffer to rt, so this
		 * failure path has no buffer to Free().
		 */
1185 		if ((error = rt_setgate(rt, dst, gateway)) != 0) {
1186 			RT_LOCK_DESTROY(rt);
1187 			ifa_free(ifa);
1188 			uma_zfree(V_rtzone, rt);
1189 			senderr(error);
1190 		}
1191 
1192 		/*
1193 		 * point to the (possibly newly malloc'd) dest address.
1194 		 */
1195 		ndst = (struct sockaddr *)rt_key(rt);
1196 
1197 		/*
1198 		 * make sure it contains the value we want (masked if needed).
1199 		 */
1200 		if (netmask) {
1201 			rt_maskedcopy(dst, ndst, netmask);
1202 		} else
1203 			bcopy(dst, ndst, dst->sa_len);
1204 
1205 		/*
1206 		 * We use the ifa reference returned by rt_getifa_fib().
1207 		 * This moved from below so that rnh->rnh_addaddr() can
1208 		 * examine the ifa and  ifa->ifa_ifp if it so desires.
1209 		 */
1210 		rt->rt_ifa = ifa;
1211 		rt->rt_ifp = ifa->ifa_ifp;
1212 		rt->rt_rmx.rmx_weight = 1;
1213 
1214 #ifdef RADIX_MPATH
1215 		/* do not permit exactly the same dst/mask/gw pair */
1216 		if (rn_mpath_capable(rnh) &&
1217 			rt_mpath_conflict(rnh, rt, netmask)) {
1218 			ifa_free(rt->rt_ifa);
1219 			Free(rt_key(rt));
1220 			RT_LOCK_DESTROY(rt);
1221 			uma_zfree(V_rtzone, rt);
1222 			senderr(EEXIST);
1223 		}
1224 #endif
1225 
1226 #ifdef FLOWTABLE
		/*
		 * rt0, when it is left non-NULL by this section, is a
		 * referenced existing route that currently matches dst.
		 * Its flow-table entries are flushed once the new route
		 * has been inserted, and the reference is dropped again.
		 */
1227 		rt0 = NULL;
1228 		/* "flow-table" only supports IPv6 and IPv4 at the moment. */
1229 		switch (dst->sa_family) {
1230 #ifdef INET6
1231 		case AF_INET6:
1232 #endif
1233 #ifdef INET
1234 		case AF_INET:
1235 #endif
1236 #if defined(INET6) || defined(INET)
1237 			rn = rnh->rnh_matchaddr(dst, rnh);
1238 			if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) {
1239 				struct sockaddr *mask;
1240 				u_char *m, *n;
1241 				int len;
1242 
1243 				/*
1244 				 * compare mask to see if the new route is
1245 				 * more specific than the existing one
1246 				 */
1247 				rt0 = RNTORT(rn);
1248 				RT_LOCK(rt0);
1249 				RT_ADDREF(rt0);
1250 				RT_UNLOCK(rt0);
1251 				/*
1252 				 * A host route is already present, so
1253 				 * leave the flow-table entries as is.
1254 				 */
1255 				if (rt0->rt_flags & RTF_HOST) {
1256 					RTFREE(rt0);
1257 					rt0 = NULL;
1258 				} else if (!(flags & RTF_HOST) && netmask) {
1259 					mask = rt_mask(rt0);
1260 					len = mask->sa_len;
1261 					m = (u_char *)mask;
1262 					n = (u_char *)netmask;
1263 					while (len-- > 0) {
1264 						if (*n != *m)
1265 							break;
1266 						n++;
1267 						m++;
1268 					}
					/*
					 * NOTE(review): if the loop above
					 * runs to completion, len ends at
					 * -1 rather than 0, and *n and *m
					 * are then read one byte past the
					 * range that was compared.  Confirm
					 * that this is intended.
					 */
1269 					if (len == 0 || (*n < *m)) {
1270 						RTFREE(rt0);
1271 						rt0 = NULL;
1272 					}
1273 				}
1274 			}
1275 #endif/* INET6 || INET */
1276 		}
1277 #endif /* FLOWTABLE */
1278 
1279 		/* XXX mtu manipulation will be done in rnh_addaddr -- itojun */
1280 		rn = rnh->rnh_addaddr(ndst, netmask, rnh, rt->rt_nodes);
1281 		/*
1282 		 * If it still failed to go into the tree,
1283 		 * then un-make it (this should be a function)
1284 		 */
1285 		if (rn == NULL) {
1286 			ifa_free(rt->rt_ifa);
1287 			Free(rt_key(rt));
1288 			RT_LOCK_DESTROY(rt);
1289 			uma_zfree(V_rtzone, rt);
1290 #ifdef FLOWTABLE
1291 			if (rt0 != NULL)
1292 				RTFREE(rt0);
1293 #endif
1294 			senderr(EEXIST);
1295 		}
1296 #ifdef FLOWTABLE
1297 		else if (rt0 != NULL) {
1298 			switch (dst->sa_family) {
1299 #ifdef INET6
1300 			case AF_INET6:
1301 				flowtable_route_flush(V_ip6_ft, rt0);
1302 				break;
1303 #endif
1304 #ifdef INET
1305 			case AF_INET:
1306 				flowtable_route_flush(V_ip_ft, rt0);
1307 				break;
1308 #endif
1309 			}
1310 			RTFREE(rt0);
1311 		}
1312 #endif
1313 
1314 		/*
1315 		 * If this protocol has something to add to this then
1316 		 * allow it to do that as well.
1317 		 */
1318 		if (ifa->ifa_rtrequest)
1319 			ifa->ifa_rtrequest(req, rt, info);
1320 
1321 		/*
1322 		 * actually return a resultant rtentry and
1323 		 * give the caller a single reference.
1324 		 */
1325 		if (ret_nrt) {
1326 			*ret_nrt = rt;
1327 			RT_ADDREF(rt);
1328 		}
		/* rt has been locked since the RT_LOCK() before rt_setgate(). */
1329 		RT_UNLOCK(rt);
1330 		break;
1331 	default:
1332 		error = EOPNOTSUPP;
1333 	}
1334 bad:
	/* Common exit: drop the head lock only if this function took it. */
1335 	if (needlock)
1336 		RADIX_NODE_HEAD_UNLOCK(rnh);
1337 	return (error);
1338 #undef senderr
1339 }
1340 
/*
 * End of scope for the shorthand macros.  dst, gateway, netmask and flags
 * are used as such by rtrequest1_fib() above; from here on these names are
 * ordinary identifiers again (rt_setgate() below has a parameter named
 * dst, for instance).
 */
1341 #undef dst
1342 #undef gateway
1343 #undef netmask
1344 #undef ifaaddr
1345 #undef ifpaddr
1346 #undef flags
1347 
1348 int
1349 rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate)
1350 {
1351 	/* XXX dst may be overwritten, can we move this to below */
1352 	int dlen = SA_SIZE(dst), glen = SA_SIZE(gate);
1353 #ifdef INVARIANTS
1354 	struct radix_node_head *rnh;
1355 
1356 	rnh = rt_tables_get_rnh(rt->rt_fibnum, dst->sa_family);
1357 #endif
1358 
1359 	RT_LOCK_ASSERT(rt);
1360 	RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
1361 
1362 	/*
1363 	 * Prepare to store the gateway in rt->rt_gateway.
1364 	 * Both dst and gateway are stored one after the other in the same
1365 	 * malloc'd chunk. If we have room, we can reuse the old buffer,
1366 	 * rt_gateway already points to the right place.
1367 	 * Otherwise, malloc a new block and update the 'dst' address.
1368 	 */
1369 	if (rt->rt_gateway == NULL || glen > SA_SIZE(rt->rt_gateway)) {
1370 		caddr_t new;
1371 
1372 		R_Malloc(new, caddr_t, dlen + glen);
1373 		if (new == NULL)
1374 			return ENOBUFS;
1375 		/*
1376 		 * XXX note, we copy from *dst and not *rt_key(rt) because
1377 		 * rt_setgate() can be called to initialize a newly
1378 		 * allocated route entry, in which case rt_key(rt) == NULL
1379 		 * (and also rt->rt_gateway == NULL).
1380 		 * Free()/free() handle a NULL argument just fine.
1381 		 */
1382 		bcopy(dst, new, dlen);
1383 		Free(rt_key(rt));	/* free old block, if any */
1384 		rt_key(rt) = (struct sockaddr *)new;
1385 		rt->rt_gateway = (struct sockaddr *)(new + dlen);
1386 	}
1387 
1388 	/*
1389 	 * Copy the new gateway value into the memory chunk.
1390 	 */
1391 	bcopy(gate, rt->rt_gateway, glen);
1392 
1393 	return (0);
1394 }
1395 
/*
 * rt_maskedcopy: write into 'dst' a copy of 'src' whose address bytes are
 * AND-ed with the matching bytes of 'netmask'.
 *
 * All three arguments are treated as raw byte strings whose first byte is
 * the length (sa_len) and whose second byte is the family (sa_family).
 * The first two bytes of 'src' are copied unchanged.  Bytes from offset 2
 * up to the shorter of the two lengths are masked; any remaining bytes up
 * to the length of 'src' are set to zero.  Nothing beyond the length of
 * 'src' is written, apart from the two header bytes which are always
 * stored.
 */
void
rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, struct sockaddr *netmask)
{
	const unsigned char *from = (const unsigned char *)src;
	const unsigned char *mask = (const unsigned char *)netmask;
	unsigned char *to = (unsigned char *)dst;
	unsigned int srclen, masklen, andlen, i;

	/* Sample both lengths before anything is stored into dst. */
	masklen = mask[0];
	srclen = from[0];
	andlen = (masklen > srclen) ? srclen : masklen;

	/* sa_len and sa_family are carried over verbatim. */
	to[0] = from[0];
	to[1] = from[1];

	/* Masked part of the address. */
	for (i = 2; i < andlen; i++)
		to[i] = from[i] & mask[i];

	/* Whatever the mask does not cover is cleared. */
	for (; i < srclen; i++)
		to[i] = 0;
}
1414 
1415 /*
1416  * Set up a routing table entry, normally
1417  * for an interface.
1418  */
/*
 * rtinit1: issue request cmd for the route that belongs to interface
 * address ifa, in one FIB or in a range of FIBs.
 *
 * ifa     interface address.  With RTF_HOST in flags the destination is
 *         ifa->ifa_dstaddr and no netmask is used; otherwise it is
 *         ifa->ifa_addr with ifa->ifa_netmask.
 * cmd     request code handed to rtrequest1_fib(); RTM_ADD and RTM_DELETE
 *         get special treatment here.
 * flags   route flags; ifa->ifa_flags (without IFA_RTSELF) and RTF_PINNED
 *         are OR-ed in for each request.
 * fibnum  FIB index, or -1 to let this function choose: the FIB of the
 *         current process for RTM_ADD when rt_add_addr_allfibs is 0, all
 *         FIBs otherwise.  For destination families other than AF_INET
 *         and AF_INET6 the argument is replaced by RT_DEFAULT_FIB.
 *
 * Returns EINVAL if the destination has sa_len 0.  For RTM_DELETE returns
 * 0 if a route was removed from at least one FIB, otherwise EHOSTUNREACH
 * (RTF_HOST) or ENETUNREACH.  For any other cmd returns the last non-zero
 * error seen over the FIBs, or 0 if every request succeeded.
 */
1419 #define _SOCKADDR_TMPSIZE 128 /* Not too big.. kernel stack size is limited */
1420 static inline  int
1421 rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum)
1422 {
1423 	struct sockaddr *dst;
1424 	struct sockaddr *netmask;
1425 	struct rtentry *rt = NULL;
1426 	struct rt_addrinfo info;
1427 	int error = 0;
1428 	int startfib, endfib;
1429 	char tempbuf[_SOCKADDR_TMPSIZE];
1430 	int didwork = 0;
1431 	int a_failure = 0;
	/*
	 * Placeholder AF_LINK gateway used for RTM_ADD; the type and index
	 * of the copy stored in the new route are filled in after the add.
	 */
1432 	static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK};
1433 	struct radix_node_head *rnh;
1434 
1435 	if (flags & RTF_HOST) {
1436 		dst = ifa->ifa_dstaddr;
1437 		netmask = NULL;
1438 	} else {
1439 		dst = ifa->ifa_addr;
1440 		netmask = ifa->ifa_netmask;
1441 	}
1442 	if (dst->sa_len == 0)
1443 		return(EINVAL);
1444 	switch (dst->sa_family) {
1445 	case AF_INET6:
1446 	case AF_INET:
1447 		/* We support multiple FIBs. */
1448 		break;
1449 	default:
1450 		fibnum = RT_DEFAULT_FIB;
1451 		break;
1452 	}
	/* Turn fibnum into the inclusive range [startfib, endfib]. */
1453 	if (fibnum == -1) {
1454 		if (rt_add_addr_allfibs == 0 && cmd == (int)RTM_ADD) {
1455 			startfib = endfib = curthread->td_proc->p_fibnum;
1456 		} else {
1457 			startfib = 0;
1458 			endfib = rt_numfibs - 1;
1459 		}
1460 	} else {
1461 		KASSERT((fibnum < rt_numfibs), ("rtinit1: bad fibnum"));
1462 		startfib = fibnum;
1463 		endfib = fibnum;
1464 	}
1465 
1466 	/*
1467 	 * If it's a delete, check that if it exists,
1468 	 * it's on the correct interface or we might scrub
1469 	 * a route to another ifa which would
1470 	 * be confusing at best and possibly worse.
1471 	 */
1472 	if (cmd == RTM_DELETE) {
1473 		/*
1474 		 * It's a delete, so it should already exist..
1475 		 * If it's a net, mask off the host bits
1476 		 * (Assuming we have a mask)
1477 		 * XXX this is kinda inet specific..
1478 		 */
1479 		if (netmask != NULL) {
1480 			rt_maskedcopy(dst, (struct sockaddr *)tempbuf, netmask);
1481 			dst = (struct sockaddr *)tempbuf;
1482 		}
1483 	}
1484 	/*
1485 	 * Now go through all the requested tables (fibs) and do the
1486 	 * requested action. Realistically, this will either be fib 0
1487 	 * for protocols that don't do multiple tables or all the
1488 	 * tables for those that do.
1489 	 */
1490 	for ( fibnum = startfib; fibnum <= endfib; fibnum++) {
1491 		if (cmd == RTM_DELETE) {
1492 			struct radix_node *rn;
1493 			/*
1494 			 * Look up an rtentry that is in the routing tree and
1495 			 * contains the correct info.
1496 			 */
1497 			rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
1498 			if (rnh == NULL)
1499 				/* this table doesn't exist but others might */
1500 				continue;
1501 			RADIX_NODE_HEAD_RLOCK(rnh);
1502 #ifdef RADIX_MPATH
1503 			if (rn_mpath_capable(rnh)) {
1504 
1505 				rn = rnh->rnh_matchaddr(dst, rnh);
1506 				if (rn == NULL)
1507 					error = ESRCH;
1508 				else {
1509 					rt = RNTORT(rn);
1510 					/*
1511 					 * for interface route the
1512 					 * rt->rt_gateway is sockaddr_intf
1513 					 * for cloning ARP entries, so
1514 					 * rt_mpath_matchgate must use the
1515 					 * interface address
1516 					 */
1517 					rt = rt_mpath_matchgate(rt,
1518 					    ifa->ifa_addr);
1519 					if (!rt)
1520 						error = ESRCH;
1521 				}
1522 			}
1523 			else
1524 #endif
1525 			rn = rnh->rnh_lookup(dst, netmask, rnh);
			/*
			 * Skip this FIB unless the entry found is a real
			 * (non-root) node that belongs to this ifa and whose
			 * key equals dst.
			 * NOTE(review): this assignment is unconditional, so
			 * an ESRCH stored by the RADIX_MPATH branch above is
			 * overwritten here.
			 */
1526 			error = (rn == NULL ||
1527 			    (rn->rn_flags & RNF_ROOT) ||
1528 			    RNTORT(rn)->rt_ifa != ifa ||
1529 			    !sa_equal((struct sockaddr *)rn->rn_key, dst));
1530 			RADIX_NODE_HEAD_RUNLOCK(rnh);
1531 			if (error) {
1532 				/* this is only an error if bad on ALL tables */
1533 				continue;
1534 			}
1535 		}
1536 		/*
1537 		 * Do the actual request
1538 		 */
1539 		bzero((caddr_t)&info, sizeof(info));
1540 		info.rti_ifa = ifa;
1541 		info.rti_flags = flags |
1542 		    (ifa->ifa_flags & ~IFA_RTSELF) | RTF_PINNED;
1543 		info.rti_info[RTAX_DST] = dst;
1544 		/*
1545 		 * doing this for compatibility reasons
1546 		 */
1547 		if (cmd == RTM_ADD)
1548 			info.rti_info[RTAX_GATEWAY] =
1549 			    (struct sockaddr *)&null_sdl;
1550 		else
1551 			info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr;
1552 		info.rti_info[RTAX_NETMASK] = netmask;
1553 		error = rtrequest1_fib(cmd, &info, &rt, fibnum);
1554 
1555 		if ((error == EEXIST) && (cmd == RTM_ADD)) {
1556 			/*
1557 			 * Interface route addition failed.
1558 			 * Atomically delete current prefix generating
1559 			 * RTM_DELETE message, and retry adding
1560 			 * interface prefix.
1561 			 */
			/*
			 * The head lock is held across both requests;
			 * RTF_RNH_LOCKED tells rtrequest1_fib() not to take
			 * it again.
			 */
1562 			rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
1563 			RADIX_NODE_HEAD_LOCK(rnh);
1564 
1565 			/* Delete old prefix */
			/*
			 * RTF_PINNED is not set for this delete, so
			 * rtrequest1_fib() answers EADDRINUSE if the
			 * existing route is pinned, and no re-add happens.
			 */
1566 			info.rti_ifa = NULL;
1567 			info.rti_flags = RTF_RNH_LOCKED;
1568 
1569 			error = rtrequest1_fib(RTM_DELETE, &info, NULL, fibnum);
1570 			if (error == 0) {
1571 				info.rti_ifa = ifa;
1572 				info.rti_flags = flags | RTF_RNH_LOCKED |
1573 				    (ifa->ifa_flags & ~IFA_RTSELF) | RTF_PINNED;
1574 				error = rtrequest1_fib(cmd, &info, &rt, fibnum);
1575 			}
1576 
1577 			RADIX_NODE_HEAD_UNLOCK(rnh);
1578 		}
1579 
1580 
1581 		if (error == 0 && rt != NULL) {
1582 			/*
1583 			 * notify any listening routing agents of the change
1584 			 */
1585 			RT_LOCK(rt);
1586 #ifdef RADIX_MPATH
1587 			/*
1588 			 * in case address alias finds the first address
1589 			 * e.g. ifconfig bge0 192.0.2.246/24
1590 			 * e.g. ifconfig bge0 192.0.2.247/24
1591 			 * the address set in the route is 192.0.2.246
1592 			 * so we need to replace it with 192.0.2.247
1593 			 */
1594 			if (memcmp(rt->rt_ifa->ifa_addr,
1595 			    ifa->ifa_addr, ifa->ifa_addr->sa_len)) {
1596 				ifa_free(rt->rt_ifa);
1597 				ifa_ref(ifa);
1598 				rt->rt_ifp = ifa->ifa_ifp;
1599 				rt->rt_ifa = ifa;
1600 			}
1601 #endif
1602 			/*
1603 			 * doing this for compatibility reasons
1604 			 */
1605 			if (cmd == RTM_ADD) {
1606 			    ((struct sockaddr_dl *)rt->rt_gateway)->sdl_type  =
1607 				rt->rt_ifp->if_type;
1608 			    ((struct sockaddr_dl *)rt->rt_gateway)->sdl_index =
1609 				rt->rt_ifp->if_index;
1610 			}
			/*
			 * A temporary reference keeps rt valid while its
			 * lock is dropped around rt_newaddrmsg_fib().
			 */
1611 			RT_ADDREF(rt);
1612 			RT_UNLOCK(rt);
1613 			rt_newaddrmsg_fib(cmd, ifa, error, rt, fibnum);
1614 			RT_LOCK(rt);
1615 			RT_REMREF(rt);
1616 			if (cmd == RTM_DELETE) {
1617 				/*
1618 				 * If we are deleting, and we found an entry,
1619 				 * then it's been removed from the tree..
1620 				 * now throw it away.
1621 				 */
1622 				RTFREE_LOCKED(rt);
1623 			} else {
1624 				if (cmd == RTM_ADD) {
1625 					/*
1626 					 * We just wanted to add it..
1627 					 * we don't actually need a reference.
1628 					 */
1629 					RT_REMREF(rt);
1630 				}
1631 				RT_UNLOCK(rt);
1632 			}
1633 			didwork = 1;
1634 		}
		/* Remember the most recent failure for the final result. */
1635 		if (error)
1636 			a_failure = error;
1637 	}
1638 	if (cmd == RTM_DELETE) {
1639 		if (didwork) {
1640 			error = 0;
1641 		} else {
1642 			/* we only give an error if it wasn't in any table */
1643 			error = ((flags & RTF_HOST) ?
1644 			    EHOSTUNREACH : ENETUNREACH);
1645 		}
1646 	} else {
1647 		if (a_failure) {
1648 			/* return an error if any of them failed */
1649 			error = a_failure;
1650 		}
1651 	}
1652 	return (error);
1653 }
1654 
1655 #ifndef BURN_BRIDGES
/*
 * rtinit_fib: variant kept for internal use by the inet code (the original
 * note reads: special one for inet internal use, may not use).
 * It always hands -1 to rtinit1() as the FIB number, which lets rtinit1()
 * choose the FIB or range of FIBs to work on.
 */
int
rtinit_fib(struct ifaddr *ifa, int cmd, int flags)
{
	const int pick_fibs = -1;

	return (rtinit1(ifa, cmd, flags, pick_fibs));
}
1662 #endif
1663 
1664 /*
1665  * Set up a routing table entry, normally
1666  * for an interface.
1667  */
1668 int
1669 rtinit(struct ifaddr *ifa, int cmd, int flags)
1670 {
1671 	struct sockaddr *dst;
1672 	int fib = RT_DEFAULT_FIB;
1673 
1674 	if (flags & RTF_HOST) {
1675 		dst = ifa->ifa_dstaddr;
1676 	} else {
1677 		dst = ifa->ifa_addr;
1678 	}
1679 
1680 	switch (dst->sa_family) {
1681 	case AF_INET6:
1682 	case AF_INET:
1683 		/* We do support multiple FIBs. */
1684 		fib = -1;
1685 		break;
1686 	}
1687 	return (rtinit1(ifa, cmd, flags, fib));
1688 }
1689