xref: /freebsd/sys/net/route.c (revision 70e0bbedef95258a4dadc996d641a9bebd3f107d)
1 /*-
2  * Copyright (c) 1980, 1986, 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 4. Neither the name of the University nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  *	@(#)route.c	8.3.1.1 (Berkeley) 2/23/95
30  * $FreeBSD$
31  */
32 /************************************************************************
33  * Note: In this file a 'fib' is a "forwarding information base"	*
34  * Which is the new name for an in kernel routing (next hop) table.	*
35  ***********************************************************************/
36 
37 #include "opt_inet.h"
38 #include "opt_route.h"
39 #include "opt_mrouting.h"
40 #include "opt_mpath.h"
41 
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/syslog.h>
45 #include <sys/malloc.h>
46 #include <sys/mbuf.h>
47 #include <sys/socket.h>
48 #include <sys/sysctl.h>
49 #include <sys/syslog.h>
50 #include <sys/sysproto.h>
51 #include <sys/proc.h>
52 #include <sys/domain.h>
53 #include <sys/kernel.h>
54 
55 #include <net/if.h>
56 #include <net/if_dl.h>
57 #include <net/route.h>
58 #include <net/vnet.h>
59 #include <net/flowtable.h>
60 
61 #ifdef RADIX_MPATH
62 #include <net/radix_mpath.h>
63 #endif
64 
65 #include <netinet/in.h>
66 #include <netinet/ip_mroute.h>
67 
68 #include <vm/uma.h>
69 
/*
 * Number of FIBs in use.  Starts at RT_NUMFIBS, may be overridden by the
 * "net.fibs" loader tunable and is exported read-only as the sysctl net.fibs.
 * route_init() later clamps the value into the range [1, RT_MAXFIBS].
 */
70 u_int rt_numfibs = RT_NUMFIBS;
71 SYSCTL_UINT(_net, OID_AUTO, fibs, CTLFLAG_RD, &rt_numfibs, 0, "");
72 /*
73  * Allow the boot code to allow LESS than RT_MAXFIBS to be used.
74  * We can't do more because storage is statically allocated for now.
75  * (for compatibility reasons.. this will change).
76  */
77 TUNABLE_INT("net.fibs", &rt_numfibs);
78 
79 /*
80  * By default add routes to all fibs for new interfaces.
81  * Once this is set to 0 then only allocate routes on interface
82  * changes for the FIB of the caller when adding a new set of addresses
83  * to an interface.  XXX this is a shotgun approach to a problem that needs
84  * a more fine grained solution.. that will come.
85  */
86 u_int rt_add_addr_allfibs = 1;
87 SYSCTL_UINT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RW,
88     &rt_add_addr_allfibs, 0, "");
89 TUNABLE_INT("net.add_addr_allfibs", &rt_add_addr_allfibs);
90 
/* Per-vnet routing statistics (e.g. rts_unreach, rts_badredirect). */
91 VNET_DEFINE(struct rtstat, rtstat);
92 #define	V_rtstat	VNET(rtstat)
93 
/*
 * Per-vnet table of radix head pointers.  It is allocated in
 * vnet_route_init() and indexed as [fib][address family] by
 * rt_tables_get_rnh_ptr().
 */
94 VNET_DEFINE(struct radix_node_head *, rt_tables);
95 #define	V_rt_tables	VNET(rt_tables)
96 
97 VNET_DEFINE(int, rttrash);		/* routes not in table but not freed */
98 #define	V_rttrash	VNET(rttrash)
99 
100 
101 /*
 * Compare two sockaddr structures.  Only the first (a1)->sa_len bytes are
 * compared; the length stored in a2 is not consulted separately.
 * Note that a1 is evaluated twice.
 */
102 #define	sa_equal(a1, a2) (bcmp((a1), (a2), (a1)->sa_len) == 0)
103 
104 /*
105  * Convert a 'struct radix_node *' to a 'struct rtentry *'.
106  * The operation can be done safely (in this code) because a
107  * 'struct rtentry' starts with two 'struct radix_node''s, the first
108  * one representing leaf nodes in the routing tree, which is
109  * what the code in radix.c passes us as a 'struct radix_node'.
110  *
111  * But because there are a lot of assumptions in this conversion,
112  * do not cast explicitly, but always use the macro below.
113  */
114 #define RNTORT(p)	((struct rtentry *)(p))
115 
/* The zone is created in vnet_route_init() and used for every rtentry. */
116 static VNET_DEFINE(uma_zone_t, rtzone);		/* Routing table UMA zone. */
117 #define	V_rtzone	VNET(rtzone)
118 
119 /*
120  * handler for net.my_fibnum
121  */
122 static int
123 sysctl_my_fibnum(SYSCTL_HANDLER_ARGS)
124 {
125         int fibnum;
126         int error;
127 
128         fibnum = curthread->td_proc->p_fibnum;
129         error = sysctl_handle_int(oidp, &fibnum, 0, req);
130         return (error);
131 }
132 
133 SYSCTL_PROC(_net, OID_AUTO, my_fibnum, CTLTYPE_INT|CTLFLAG_RD,
134             NULL, 0, &sysctl_my_fibnum, "I", "default FIB of caller");
135 
136 static __inline struct radix_node_head **
137 rt_tables_get_rnh_ptr(int table, int fam)
138 {
139 	struct radix_node_head **rnh;
140 
141 	KASSERT(table >= 0 && table < rt_numfibs, ("%s: table out of bounds.",
142 	    __func__));
143 	KASSERT(fam >= 0 && fam < (AF_MAX+1), ("%s: fam out of bounds.",
144 	    __func__));
145 
146 	/* rnh is [fib=0][af=0]. */
147 	rnh = (struct radix_node_head **)V_rt_tables;
148 	/* Get the offset to the requested table and fam. */
149 	rnh += table * (AF_MAX+1) + fam;
150 
151 	return (rnh);
152 }
153 
/*
 * Fetch the radix head stored for (table, fam).  The slot content is
 * returned unchanged, so callers must be prepared for NULL when no
 * routing tree has been attached there.
 */
struct radix_node_head *
rt_tables_get_rnh(int table, int fam)
{
	struct radix_node_head **slot;

	slot = rt_tables_get_rnh_ptr(table, fam);
	return (*slot);
}
160 
161 /*
162  * route initialization must occur before ip6_init2(), which happenas at
163  * SI_ORDER_MIDDLE.
164  */
165 static void
166 route_init(void)
167 {
168 	struct domain *dom;
169 	int max_keylen = 0;
170 
171 	/* whack the tunable ints into  line. */
172 	if (rt_numfibs > RT_MAXFIBS)
173 		rt_numfibs = RT_MAXFIBS;
174 	if (rt_numfibs == 0)
175 		rt_numfibs = 1;
176 
177 	for (dom = domains; dom; dom = dom->dom_next)
178 		if (dom->dom_maxrtkey > max_keylen)
179 			max_keylen = dom->dom_maxrtkey;
180 
181 	rn_init(max_keylen);	/* init all zeroes, all ones, mask table */
182 }
183 SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, 0);
184 
185 static void
186 vnet_route_init(const void *unused __unused)
187 {
188 	struct domain *dom;
189 	struct radix_node_head **rnh;
190 	int table;
191 	int fam;
192 
193 	V_rt_tables = malloc(rt_numfibs * (AF_MAX+1) *
194 	    sizeof(struct radix_node_head *), M_RTABLE, M_WAITOK|M_ZERO);
195 
196 	V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), NULL, NULL,
197 	    NULL, NULL, UMA_ALIGN_PTR, 0);
198 	for (dom = domains; dom; dom = dom->dom_next) {
199 		if (dom->dom_rtattach)  {
200 			for  (table = 0; table < rt_numfibs; table++) {
201 				if ( (fam = dom->dom_family) == AF_INET ||
202 				    table == 0) {
203  			        	/* for now only AF_INET has > 1 table */
204 					/* XXX MRT
205 					 * rtattach will be also called
206 					 * from vfs_export.c but the
207 					 * offset will be 0
208 					 * (only for AF_INET and AF_INET6
209 					 * which don't need it anyhow)
210 					 */
211 					rnh = rt_tables_get_rnh_ptr(table, fam);
212 					if (rnh == NULL)
213 						panic("%s: rnh NULL", __func__);
214 					dom->dom_rtattach((void **)rnh,
215 				    	    dom->dom_rtoffset);
216 				} else {
217 					break;
218 				}
219 			}
220 		}
221 	}
222 }
223 VNET_SYSINIT(vnet_route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH,
224     vnet_route_init, 0);
225 
226 #ifdef VIMAGE
227 static void
228 vnet_route_uninit(const void *unused __unused)
229 {
230 	int table;
231 	int fam;
232 	struct domain *dom;
233 	struct radix_node_head **rnh;
234 
235 	for (dom = domains; dom; dom = dom->dom_next) {
236 		if (dom->dom_rtdetach) {
237 			for (table = 0; table < rt_numfibs; table++) {
238 				if ( (fam = dom->dom_family) == AF_INET ||
239 				    table == 0) {
240 					/* For now only AF_INET has > 1 tbl. */
241 					rnh = rt_tables_get_rnh_ptr(table, fam);
242 					if (rnh == NULL)
243 						panic("%s: rnh NULL", __func__);
244 					dom->dom_rtdetach((void **)rnh,
245 					    dom->dom_rtoffset);
246 				} else {
247 					break;
248 				}
249 			}
250 		}
251 	}
252 }
253 VNET_SYSUNINIT(vnet_route_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
254     vnet_route_uninit, 0);
255 #endif
256 
257 #ifndef _SYS_SYSPROTO_H_
258 struct setfib_args {
259 	int     fibnum;
260 };
261 #endif
262 int
263 sys_setfib(struct thread *td, struct setfib_args *uap)
264 {
265 	if (uap->fibnum < 0 || uap->fibnum >= rt_numfibs)
266 		return EINVAL;
267 	td->td_proc->p_fibnum = uap->fibnum;
268 	return (0);
269 }
270 
271 /*
272  * Packet routing routines.
273  */
/*
 * Fill in ro->ro_rt for ro->ro_dst using FIB 0 and no ignore flags.
 */
void
rtalloc(struct route *ro)
{

	rtalloc_fib(ro, 0);
}
279 
280 void
281 rtalloc_fib(struct route *ro, u_int fibnum)
282 {
283 	rtalloc_ign_fib(ro, 0UL, fibnum);
284 }
285 
286 void
287 rtalloc_ign(struct route *ro, u_long ignore)
288 {
289 	struct rtentry *rt;
290 
291 	if ((rt = ro->ro_rt) != NULL) {
292 		if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP)
293 			return;
294 		RTFREE(rt);
295 		ro->ro_rt = NULL;
296 	}
297 	ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, 0);
298 	if (ro->ro_rt)
299 		RT_UNLOCK(ro->ro_rt);
300 }
301 
302 void
303 rtalloc_ign_fib(struct route *ro, u_long ignore, u_int fibnum)
304 {
305 	struct rtentry *rt;
306 
307 	if ((rt = ro->ro_rt) != NULL) {
308 		if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP)
309 			return;
310 		RTFREE(rt);
311 		ro->ro_rt = NULL;
312 	}
313 	ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, fibnum);
314 	if (ro->ro_rt)
315 		RT_UNLOCK(ro->ro_rt);
316 }
317 
318 /*
319  * Look up the route that matches the address given
320  * Or, at least try.. Create a cloned route if needed.
321  *
322  * The returned route, if any, is locked.
323  */
324 struct rtentry *
325 rtalloc1(struct sockaddr *dst, int report, u_long ignflags)
326 {
327 	return (rtalloc1_fib(dst, report, ignflags, 0));
328 }
329 
330 struct rtentry *
331 rtalloc1_fib(struct sockaddr *dst, int report, u_long ignflags,
332 		    u_int fibnum)
333 {
334 	struct radix_node_head *rnh;
335 	struct radix_node *rn;
336 	struct rtentry *newrt;
337 	struct rt_addrinfo info;
338 	int err = 0, msgtype = RTM_MISS;
339 	int needlock;
340 
341 	KASSERT((fibnum < rt_numfibs), ("rtalloc1_fib: bad fibnum"));
342 	if (dst->sa_family != AF_INET)	/* Only INET supports > 1 fib now */
343 		fibnum = 0;
344 	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
345 	newrt = NULL;
346 	if (rnh == NULL)
347 		goto miss;
348 
349 	/*
350 	 * Look up the address in the table for that Address Family
351 	 */
352 	needlock = !(ignflags & RTF_RNH_LOCKED);
353 	if (needlock)
354 		RADIX_NODE_HEAD_RLOCK(rnh);
355 #ifdef INVARIANTS
356 	else
357 		RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
358 #endif
359 	rn = rnh->rnh_matchaddr(dst, rnh);
360 	if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) {
361 		newrt = RNTORT(rn);
362 		RT_LOCK(newrt);
363 		RT_ADDREF(newrt);
364 		if (needlock)
365 			RADIX_NODE_HEAD_RUNLOCK(rnh);
366 		goto done;
367 
368 	} else if (needlock)
369 		RADIX_NODE_HEAD_RUNLOCK(rnh);
370 
371 	/*
372 	 * Either we hit the root or couldn't find any match,
373 	 * Which basically means
374 	 * "caint get there frm here"
375 	 */
376 miss:
377 	V_rtstat.rts_unreach++;
378 
379 	if (report) {
380 		/*
381 		 * If required, report the failure to the supervising
382 		 * Authorities.
383 		 * For a delete, this is not an error. (report == 0)
384 		 */
385 		bzero(&info, sizeof(info));
386 		info.rti_info[RTAX_DST] = dst;
387 		rt_missmsg_fib(msgtype, &info, 0, err, fibnum);
388 	}
389 done:
390 	if (newrt)
391 		RT_LOCK_ASSERT(newrt);
392 	return (newrt);
393 }
394 
395 /*
396  * Remove a reference count from an rtentry.
397  * If the count gets low enough, take it out of the routing table
398  */
399 void
400 rtfree(struct rtentry *rt)
401 {
402 	struct radix_node_head *rnh;
403 
404 	KASSERT(rt != NULL,("%s: NULL rt", __func__));
405 	rnh = rt_tables_get_rnh(rt->rt_fibnum, rt_key(rt)->sa_family);
406 	KASSERT(rnh != NULL,("%s: NULL rnh", __func__));
407 
408 	RT_LOCK_ASSERT(rt);
409 
410 	/*
411 	 * The callers should use RTFREE_LOCKED() or RTFREE(), so
412 	 * we should come here exactly with the last reference.
413 	 */
414 	RT_REMREF(rt);
415 	if (rt->rt_refcnt > 0) {
416 		log(LOG_DEBUG, "%s: %p has %d refs\n", __func__, rt, rt->rt_refcnt);
417 		goto done;
418 	}
419 
420 	/*
421 	 * On last reference give the "close method" a chance
422 	 * to cleanup private state.  This also permits (for
423 	 * IPv4 and IPv6) a chance to decide if the routing table
424 	 * entry should be purged immediately or at a later time.
425 	 * When an immediate purge is to happen the close routine
426 	 * typically calls rtexpunge which clears the RTF_UP flag
427 	 * on the entry so that the code below reclaims the storage.
428 	 */
429 	if (rt->rt_refcnt == 0 && rnh->rnh_close)
430 		rnh->rnh_close((struct radix_node *)rt, rnh);
431 
432 	/*
433 	 * If we are no longer "up" (and ref == 0)
434 	 * then we can free the resources associated
435 	 * with the route.
436 	 */
437 	if ((rt->rt_flags & RTF_UP) == 0) {
438 		if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT))
439 			panic("rtfree 2");
440 		/*
441 		 * the rtentry must have been removed from the routing table
442 		 * so it is represented in rttrash.. remove that now.
443 		 */
444 		V_rttrash--;
445 #ifdef	DIAGNOSTIC
446 		if (rt->rt_refcnt < 0) {
447 			printf("rtfree: %p not freed (neg refs)\n", rt);
448 			goto done;
449 		}
450 #endif
451 		/*
452 		 * release references on items we hold them on..
453 		 * e.g other routes and ifaddrs.
454 		 */
455 		if (rt->rt_ifa)
456 			ifa_free(rt->rt_ifa);
457 		/*
458 		 * The key is separatly alloc'd so free it (see rt_setgate()).
459 		 * This also frees the gateway, as they are always malloc'd
460 		 * together.
461 		 */
462 		Free(rt_key(rt));
463 
464 		/*
465 		 * and the rtentry itself of course
466 		 */
467 		RT_LOCK_DESTROY(rt);
468 		uma_zfree(V_rtzone, rt);
469 		return;
470 	}
471 done:
472 	RT_UNLOCK(rt);
473 }
474 
475 
476 /*
477  * Force a routing table entry to the specified
478  * destination to go through the given gateway.
479  * Normally called as a result of a routing redirect
480  * message from the network layer.
481  */
482 void
483 rtredirect(struct sockaddr *dst,
484 	struct sockaddr *gateway,
485 	struct sockaddr *netmask,
486 	int flags,
487 	struct sockaddr *src)
488 {
489 	rtredirect_fib(dst, gateway, netmask, flags, src, 0);
490 }
491 
492 void
493 rtredirect_fib(struct sockaddr *dst,
494 	struct sockaddr *gateway,
495 	struct sockaddr *netmask,
496 	int flags,
497 	struct sockaddr *src,
498 	u_int fibnum)
499 {
500 	struct rtentry *rt, *rt0 = NULL;
501 	int error = 0;
502 	short *stat = NULL;
503 	struct rt_addrinfo info;
504 	struct ifaddr *ifa;
505 	struct radix_node_head *rnh;
506 
507 	ifa = NULL;
508 	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
509 	if (rnh == NULL) {
510 		error = EAFNOSUPPORT;
511 		goto out;
512 	}
513 
514 	/* verify the gateway is directly reachable */
515 	if ((ifa = ifa_ifwithnet(gateway, 0)) == NULL) {
516 		error = ENETUNREACH;
517 		goto out;
518 	}
519 	rt = rtalloc1_fib(dst, 0, 0UL, fibnum);	/* NB: rt is locked */
520 	/*
521 	 * If the redirect isn't from our current router for this dst,
522 	 * it's either old or wrong.  If it redirects us to ourselves,
523 	 * we have a routing loop, perhaps as a result of an interface
524 	 * going down recently.
525 	 */
526 	if (!(flags & RTF_DONE) && rt &&
527 	     (!sa_equal(src, rt->rt_gateway) || rt->rt_ifa != ifa))
528 		error = EINVAL;
529 	else if (ifa_ifwithaddr_check(gateway))
530 		error = EHOSTUNREACH;
531 	if (error)
532 		goto done;
533 	/*
534 	 * Create a new entry if we just got back a wildcard entry
535 	 * or the lookup failed.  This is necessary for hosts
536 	 * which use routing redirects generated by smart gateways
537 	 * to dynamically build the routing tables.
538 	 */
539 	if (rt == NULL || (rt_mask(rt) && rt_mask(rt)->sa_len < 2))
540 		goto create;
541 	/*
542 	 * Don't listen to the redirect if it's
543 	 * for a route to an interface.
544 	 */
545 	if (rt->rt_flags & RTF_GATEWAY) {
546 		if (((rt->rt_flags & RTF_HOST) == 0) && (flags & RTF_HOST)) {
547 			/*
548 			 * Changing from route to net => route to host.
549 			 * Create new route, rather than smashing route to net.
550 			 */
551 		create:
552 			rt0 = rt;
553 			rt = NULL;
554 
555 			flags |=  RTF_GATEWAY | RTF_DYNAMIC;
556 			bzero((caddr_t)&info, sizeof(info));
557 			info.rti_info[RTAX_DST] = dst;
558 			info.rti_info[RTAX_GATEWAY] = gateway;
559 			info.rti_info[RTAX_NETMASK] = netmask;
560 			info.rti_ifa = ifa;
561 			info.rti_flags = flags;
562 			if (rt0 != NULL)
563 				RT_UNLOCK(rt0);	/* drop lock to avoid LOR with RNH */
564 			error = rtrequest1_fib(RTM_ADD, &info, &rt, fibnum);
565 			if (rt != NULL) {
566 				RT_LOCK(rt);
567 				if (rt0 != NULL)
568 					EVENTHANDLER_INVOKE(route_redirect_event, rt0, rt, dst);
569 				flags = rt->rt_flags;
570 			}
571 			if (rt0 != NULL)
572 				RTFREE(rt0);
573 
574 			stat = &V_rtstat.rts_dynamic;
575 		} else {
576 			struct rtentry *gwrt;
577 
578 			/*
579 			 * Smash the current notion of the gateway to
580 			 * this destination.  Should check about netmask!!!
581 			 */
582 			rt->rt_flags |= RTF_MODIFIED;
583 			flags |= RTF_MODIFIED;
584 			stat = &V_rtstat.rts_newgateway;
585 			/*
586 			 * add the key and gateway (in one malloc'd chunk).
587 			 */
588 			RT_UNLOCK(rt);
589 			RADIX_NODE_HEAD_LOCK(rnh);
590 			RT_LOCK(rt);
591 			rt_setgate(rt, rt_key(rt), gateway);
592 			gwrt = rtalloc1(gateway, 1, RTF_RNH_LOCKED);
593 			RADIX_NODE_HEAD_UNLOCK(rnh);
594 			EVENTHANDLER_INVOKE(route_redirect_event, rt, gwrt, dst);
595 			RTFREE_LOCKED(gwrt);
596 		}
597 	} else
598 		error = EHOSTUNREACH;
599 done:
600 	if (rt)
601 		RTFREE_LOCKED(rt);
602 out:
603 	if (error)
604 		V_rtstat.rts_badredirect++;
605 	else if (stat != NULL)
606 		(*stat)++;
607 	bzero((caddr_t)&info, sizeof(info));
608 	info.rti_info[RTAX_DST] = dst;
609 	info.rti_info[RTAX_GATEWAY] = gateway;
610 	info.rti_info[RTAX_NETMASK] = netmask;
611 	info.rti_info[RTAX_AUTHOR] = src;
612 	rt_missmsg_fib(RTM_REDIRECT, &info, flags, error, fibnum);
613 	if (ifa != NULL)
614 		ifa_free(ifa);
615 }
616 
617 int
618 rtioctl(u_long req, caddr_t data)
619 {
620 	return (rtioctl_fib(req, data, 0));
621 }
622 
623 /*
624  * Routing table ioctl interface.
625  */
626 int
627 rtioctl_fib(u_long req, caddr_t data, u_int fibnum)
628 {
629 
630 	/*
631 	 * If more ioctl commands are added here, make sure the proper
632 	 * super-user checks are being performed because it is possible for
633 	 * prison-root to make it this far if raw sockets have been enabled
634 	 * in jails.
635 	 */
636 #ifdef INET
637 	/* Multicast goop, grrr... */
638 	return mrt_ioctl ? mrt_ioctl(req, data, fibnum) : EOPNOTSUPP;
639 #else /* INET */
640 	return ENXIO;
641 #endif /* INET */
642 }
643 
/*
 * For both ifa_ifwithroute() routines, 'ifa' is returned referenced.
 *
 * This one is the FIB 0 front end of ifa_ifwithroute_fib().
 */
struct ifaddr *
ifa_ifwithroute(int flags, struct sockaddr *dst, struct sockaddr *gateway)
{
	struct ifaddr *ifa;

	ifa = ifa_ifwithroute_fib(flags, dst, gateway, 0);
	return (ifa);
}
652 
653 struct ifaddr *
654 ifa_ifwithroute_fib(int flags, struct sockaddr *dst, struct sockaddr *gateway,
655 				u_int fibnum)
656 {
657 	register struct ifaddr *ifa;
658 	int not_found = 0;
659 
660 	if ((flags & RTF_GATEWAY) == 0) {
661 		/*
662 		 * If we are adding a route to an interface,
663 		 * and the interface is a pt to pt link
664 		 * we should search for the destination
665 		 * as our clue to the interface.  Otherwise
666 		 * we can use the local address.
667 		 */
668 		ifa = NULL;
669 		if (flags & RTF_HOST)
670 			ifa = ifa_ifwithdstaddr(dst);
671 		if (ifa == NULL)
672 			ifa = ifa_ifwithaddr(gateway);
673 	} else {
674 		/*
675 		 * If we are adding a route to a remote net
676 		 * or host, the gateway may still be on the
677 		 * other end of a pt to pt link.
678 		 */
679 		ifa = ifa_ifwithdstaddr(gateway);
680 	}
681 	if (ifa == NULL)
682 		ifa = ifa_ifwithnet(gateway, 0);
683 	if (ifa == NULL) {
684 		struct rtentry *rt = rtalloc1_fib(gateway, 0, RTF_RNH_LOCKED, fibnum);
685 		if (rt == NULL)
686 			return (NULL);
687 		/*
688 		 * dismiss a gateway that is reachable only
689 		 * through the default router
690 		 */
691 		switch (gateway->sa_family) {
692 		case AF_INET:
693 			if (satosin(rt_key(rt))->sin_addr.s_addr == INADDR_ANY)
694 				not_found = 1;
695 			break;
696 		case AF_INET6:
697 			if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(rt))->sin6_addr))
698 				not_found = 1;
699 			break;
700 		default:
701 			break;
702 		}
703 		if (!not_found && rt->rt_ifa != NULL) {
704 			ifa = rt->rt_ifa;
705 			ifa_ref(ifa);
706 		}
707 		RT_REMREF(rt);
708 		RT_UNLOCK(rt);
709 		if (not_found || ifa == NULL)
710 			return (NULL);
711 	}
712 	if (ifa->ifa_addr->sa_family != dst->sa_family) {
713 		struct ifaddr *oifa = ifa;
714 		ifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp);
715 		if (ifa == NULL)
716 			ifa = oifa;
717 		else
718 			ifa_free(oifa);
719 	}
720 	return (ifa);
721 }
722 
/*
 * Do appropriate manipulations of a routing tree given
 * all the bits of info needed.
 *
 * This is the FIB 0 front end of rtrequest_fib().
 */
int
rtrequest(int req,
	struct sockaddr *dst,
	struct sockaddr *gateway,
	struct sockaddr *netmask,
	int flags,
	struct rtentry **ret_nrt)
{
	int error;

	error = rtrequest_fib(req, dst, gateway, netmask, flags, ret_nrt, 0);
	return (error);
}
737 
738 int
739 rtrequest_fib(int req,
740 	struct sockaddr *dst,
741 	struct sockaddr *gateway,
742 	struct sockaddr *netmask,
743 	int flags,
744 	struct rtentry **ret_nrt,
745 	u_int fibnum)
746 {
747 	struct rt_addrinfo info;
748 
749 	if (dst->sa_len == 0)
750 		return(EINVAL);
751 
752 	bzero((caddr_t)&info, sizeof(info));
753 	info.rti_flags = flags;
754 	info.rti_info[RTAX_DST] = dst;
755 	info.rti_info[RTAX_GATEWAY] = gateway;
756 	info.rti_info[RTAX_NETMASK] = netmask;
757 	return rtrequest1_fib(req, &info, ret_nrt, fibnum);
758 }
759 
760 /*
761  * These (questionable) definitions of apparent local variables apply
762  * to the next two functions.  XXXXXX!!!
 *
 * Each name expands to a field of a variable called 'info', which must
 * be in scope as a 'struct rt_addrinfo *' wherever a macro is used.
 * From this point on, code in this file cannot use any of these names
 * as an ordinary identifier.
763  */
764 #define	dst	info->rti_info[RTAX_DST]
765 #define	gateway	info->rti_info[RTAX_GATEWAY]
766 #define	netmask	info->rti_info[RTAX_NETMASK]
767 #define	ifaaddr	info->rti_info[RTAX_IFA]
768 #define	ifpaddr	info->rti_info[RTAX_IFP]
769 #define	flags	info->rti_flags
770 
/*
 * Resolve rti_ifa/rti_ifp for 'info' using FIB 0; see rt_getifa_fib().
 */
int
rt_getifa(struct rt_addrinfo *info)
{
	int error;

	error = rt_getifa_fib(info, 0);
	return (error);
}
776 
777 /*
778  * Look up rt_addrinfo for a specific fib.  Note that if rti_ifa is defined,
779  * it will be referenced so the caller must free it.
780  */
781 int
782 rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum)
783 {
784 	struct ifaddr *ifa;
785 	int error = 0;
786 
787 	/*
788 	 * ifp may be specified by sockaddr_dl
789 	 * when protocol address is ambiguous.
790 	 */
791 	if (info->rti_ifp == NULL && ifpaddr != NULL &&
792 	    ifpaddr->sa_family == AF_LINK &&
793 	    (ifa = ifa_ifwithnet(ifpaddr, 0)) != NULL) {
794 		info->rti_ifp = ifa->ifa_ifp;
795 		ifa_free(ifa);
796 	}
797 	if (info->rti_ifa == NULL && ifaaddr != NULL)
798 		info->rti_ifa = ifa_ifwithaddr(ifaaddr);
799 	if (info->rti_ifa == NULL) {
800 		struct sockaddr *sa;
801 
802 		sa = ifaaddr != NULL ? ifaaddr :
803 		    (gateway != NULL ? gateway : dst);
804 		if (sa != NULL && info->rti_ifp != NULL)
805 			info->rti_ifa = ifaof_ifpforaddr(sa, info->rti_ifp);
806 		else if (dst != NULL && gateway != NULL)
807 			info->rti_ifa = ifa_ifwithroute_fib(flags, dst, gateway,
808 							fibnum);
809 		else if (sa != NULL)
810 			info->rti_ifa = ifa_ifwithroute_fib(flags, sa, sa,
811 							fibnum);
812 	}
813 	if ((ifa = info->rti_ifa) != NULL) {
814 		if (info->rti_ifp == NULL)
815 			info->rti_ifp = ifa->ifa_ifp;
816 	} else
817 		error = ENETUNREACH;
818 	return (error);
819 }
820 
821 /*
822  * Expunges references to a route that's about to be reclaimed.
823  * The route must be locked.
 *
 * The radix head for the route's (fib, family) must be locked as well;
 * this is asserted below.  On success the entry has been removed from
 * the tree, RTF_UP is cleared, the ifa's ifa_rtrequest hook (if any) has
 * been told about the RTM_DELETE, and V_rttrash has been incremented.
 *
 * Returns 0 on success, EAFNOSUPPORT when there is no radix head for the
 * route, and otherwise ESRCH (entry not found in the tree, non-MPATH
 * build) or the error from rtrequest1_fib() (RADIX_MPATH build).
824  */
825 int
826 rtexpunge(struct rtentry *rt)
827 {
828 #if !defined(RADIX_MPATH)
829 	struct radix_node *rn;
830 #else
831 	struct rt_addrinfo info;
832 	int fib;
833 	struct rtentry *rt0;
834 #endif
835 	struct radix_node_head *rnh;
836 	struct ifaddr *ifa;
837 	int error = 0;
838 
839 	/*
840 	 * Find the correct routing tree to use for this Address Family
841 	 */
842 	rnh = rt_tables_get_rnh(rt->rt_fibnum, rt_key(rt)->sa_family);
843 	RT_LOCK_ASSERT(rt);
844 	if (rnh == NULL)
845 		return (EAFNOSUPPORT);
846 	RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
847 
848 #ifdef RADIX_MPATH
	/*
	 * Multipath build: delete through rtrequest1_fib(), telling it via
	 * RTF_RNH_LOCKED that the radix head is already locked.  The rt lock
	 * is dropped around that call and re-taken afterwards, either on the
	 * entry handed back in rt0 or, on error, on the original entry.
	 * NOTE(review): when the call succeeds but rt0 comes back NULL, rt
	 * stays unlocked for the rest of the function -- confirm intent.
	 */
849 	fib = rt->rt_fibnum;
850 	bzero(&info, sizeof(info));
851 	info.rti_ifp = rt->rt_ifp;
852 	info.rti_flags = RTF_RNH_LOCKED;
853 	info.rti_info[RTAX_DST] = rt_key(rt);
854 	info.rti_info[RTAX_GATEWAY] = rt->rt_ifa->ifa_addr;
855 
856 	RT_UNLOCK(rt);
857 	error = rtrequest1_fib(RTM_DELETE, &info, &rt0, fib);
858 
859 	if (error == 0 && rt0 != NULL) {
860 		rt = rt0;
861 		RT_LOCK(rt);
862 	} else if (error != 0) {
863 		RT_LOCK(rt);
864 		return (error);
865 	}
866 #else
867 	/*
868 	 * Remove the item from the tree; it should be there,
869 	 * but when callers invoke us blindly it may not (sigh).
870 	 */
871 	rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), rnh);
872 	if (rn == NULL) {
873 		error = ESRCH;
874 		goto bad;
875 	}
876 	KASSERT((rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) == 0,
877 		("unexpected flags 0x%x", rn->rn_flags));
878 	KASSERT(rt == RNTORT(rn),
879 		("lookup mismatch, rt %p rn %p", rt, rn));
880 #endif /* RADIX_MPATH */
881 
	/* Mark the entry as no longer part of the table. */
882 	rt->rt_flags &= ~RTF_UP;
883 
884 	/*
885 	 * Give the protocol a chance to keep things in sync.
886 	 */
887 	if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest) {
888 		struct rt_addrinfo info;
889 
890 		bzero((caddr_t)&info, sizeof(info));
891 		info.rti_flags = rt->rt_flags;
892 		info.rti_info[RTAX_DST] = rt_key(rt);
893 		info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
894 		info.rti_info[RTAX_NETMASK] = rt_mask(rt);
895 		ifa->ifa_rtrequest(RTM_DELETE, rt, &info);
896 	}
897 
898 	/*
899 	 * one more rtentry floating around that is not
900 	 * linked to the routing table.
901 	 */
902 	V_rttrash++;
903 #if !defined(RADIX_MPATH)
904 bad:
905 #endif
906 	return (error);
907 }
908 
909 #ifdef RADIX_MPATH
/*
 * Multipath-aware part of a route delete request.
 *
 * The names dst, gateway and netmask used below are macros that expand
 * to the matching info->rti_info[] entries.
 *
 * req      request type; anything other than RTM_DELETE ends in a panic
 *          at the 'nondelete' label
 * info     addresses describing the route (destination, gateway, mask)
 * rnh      radix head to operate on
 * ret_nrt  when non-NULL, receives the removed entry (unlocked, with the
 *          reference added here); otherwise the entry is released
 *
 * Returns 0 when the entry was handled here, ESRCH when no matching
 * entry/gateway exists, and ENOENT when the matching entry is the first
 * one in its chain and is left for the caller's normal delete code.
 */
910 static int
911 rn_mpath_update(int req, struct rt_addrinfo *info,
912     struct radix_node_head *rnh, struct rtentry **ret_nrt)
913 {
914 	/*
915 	 * if we got multipath routes, we require users to specify
916 	 * a matching RTAX_GATEWAY.
917 	 */
918 	struct rtentry *rt, *rto = NULL;
919 	register struct radix_node *rn;
920 	int error = 0;
921 
	/* Find the chain for the destination, then the entry for the gateway. */
922 	rn = rnh->rnh_matchaddr(dst, rnh);
923 	if (rn == NULL)
924 		return (ESRCH);
925 	rto = rt = RNTORT(rn);
926 	rt = rt_mpath_matchgate(rt, gateway);
927 	if (rt == NULL)
928 		return (ESRCH);
929 	/*
930 	 * this is the first entry in the chain
931 	 */
932 	if (rto == rt) {
933 		rn = rn_mpath_next((struct radix_node *)rt);
934 		/*
935 		 * there is another entry, now it's active
936 		 */
937 		if (rn) {
938 			rto = RNTORT(rn);
939 			RT_LOCK(rto);
940 			rto->rt_flags |= RTF_UP;
941 			RT_UNLOCK(rto);
942 		} else if (rt->rt_flags & RTF_GATEWAY) {
943 			/*
944 			 * For gateway routes, we need to
945 			 * make sure that we we are deleting
946 			 * the correct gateway.
947 			 * rt_mpath_matchgate() does not
948 			 * check the case when there is only
949 			 * one route in the chain.
950 			 */
951 			if (gateway &&
952 			    (rt->rt_gateway->sa_len != gateway->sa_len ||
953 				memcmp(rt->rt_gateway, gateway, gateway->sa_len)))
954 				error = ESRCH;
955 			else {
956 				/*
957 				 * remove from tree before returning it
958 				 * to the caller
959 				 */
960 				rn = rnh->rnh_deladdr(dst, netmask, rnh);
961 				KASSERT(rt == RNTORT(rn), ("radix node disappeared"));
962 				goto gwdelete;
963 			}
964 
965 		}
966 		/*
967 		 * use the normal delete code to remove
968 		 * the first entry
969 		 */
970 		if (req != RTM_DELETE)
971 			goto nondelete;
972 
		/*
		 * NOTE(review): this overwrites an ESRCH set by the gateway
		 * comparison above, so the caller still goes on to its normal
		 * delete path -- confirm that this is intended.
		 */
973 		error = ENOENT;
974 		goto done;
975 	}
976 
977 	/*
978 	 * if the entry is 2nd and on up
979 	 */
980 	if ((req == RTM_DELETE) && !rt_mpath_deldup(rto, rt))
981 		panic ("rtrequest1: rt_mpath_deldup");
982 gwdelete:
	/* Lock and reference the entry that is being taken out. */
983 	RT_LOCK(rt);
984 	RT_ADDREF(rt);
985 	if (req == RTM_DELETE) {
986 		rt->rt_flags &= ~RTF_UP;
987 		/*
988 		 * One more rtentry floating around that is not
989 		 * linked to the routing table. rttrash will be decremented
990 		 * when RTFREE(rt) is eventually called.
991 		 */
992 		V_rttrash++;
993 	}
994 
995 nondelete:
	/* Only RTM_DELETE is implemented; everything else is fatal. */
996 	if (req != RTM_DELETE)
997 		panic("unrecognized request %d", req);
998 
999 
1000 	/*
1001 	 * If the caller wants it, then it can have it,
1002 	 * but it's up to it to free the rtentry as we won't be
1003 	 * doing it.
1004 	 */
1005 	if (ret_nrt) {
1006 		*ret_nrt = rt;
1007 		RT_UNLOCK(rt);
1008 	} else
1009 		RTFREE_LOCKED(rt);
1010 done:
1011 	return (error);
1012 }
1013 #endif
1014 
1015 int
1016 rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt,
1017 				u_int fibnum)
1018 {
1019 	int error = 0, needlock = 0;
1020 	register struct rtentry *rt;
1021 #ifdef FLOWTABLE
1022 	register struct rtentry *rt0;
1023 #endif
1024 	register struct radix_node *rn;
1025 	register struct radix_node_head *rnh;
1026 	struct ifaddr *ifa;
1027 	struct sockaddr *ndst;
1028 	struct sockaddr_storage mdst;
1029 #define senderr(x) { error = x ; goto bad; }
1030 
1031 	KASSERT((fibnum < rt_numfibs), ("rtrequest1_fib: bad fibnum"));
1032 	if (dst->sa_family != AF_INET)	/* Only INET supports > 1 fib now */
1033 		fibnum = 0;
1034 	/*
1035 	 * Find the correct routing tree to use for this Address Family
1036 	 */
1037 	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
1038 	if (rnh == NULL)
1039 		return (EAFNOSUPPORT);
1040 	needlock = ((flags & RTF_RNH_LOCKED) == 0);
1041 	flags &= ~RTF_RNH_LOCKED;
1042 	if (needlock)
1043 		RADIX_NODE_HEAD_LOCK(rnh);
1044 	else
1045 		RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
1046 	/*
1047 	 * If we are adding a host route then we don't want to put
1048 	 * a netmask in the tree, nor do we want to clone it.
1049 	 */
1050 	if (flags & RTF_HOST)
1051 		netmask = NULL;
1052 
1053 	switch (req) {
1054 	case RTM_DELETE:
1055 		if (netmask) {
1056 			rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask);
1057 			dst = (struct sockaddr *)&mdst;
1058 		}
1059 #ifdef RADIX_MPATH
1060 		if (rn_mpath_capable(rnh)) {
1061 			error = rn_mpath_update(req, info, rnh, ret_nrt);
1062 			/*
1063 			 * "bad" holds true for the success case
1064 			 * as well
1065 			 */
1066 			if (error != ENOENT)
1067 				goto bad;
1068 			error = 0;
1069 		}
1070 #endif
1071 		/*
1072 		 * Remove the item from the tree and return it.
1073 		 * Complain if it is not there and do no more processing.
1074 		 */
1075 		rn = rnh->rnh_deladdr(dst, netmask, rnh);
1076 		if (rn == NULL)
1077 			senderr(ESRCH);
1078 		if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT))
1079 			panic ("rtrequest delete");
1080 		rt = RNTORT(rn);
1081 		RT_LOCK(rt);
1082 		RT_ADDREF(rt);
1083 		rt->rt_flags &= ~RTF_UP;
1084 
1085 		/*
1086 		 * give the protocol a chance to keep things in sync.
1087 		 */
1088 		if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest)
1089 			ifa->ifa_rtrequest(RTM_DELETE, rt, info);
1090 
1091 		/*
1092 		 * One more rtentry floating around that is not
1093 		 * linked to the routing table. rttrash will be decremented
1094 		 * when RTFREE(rt) is eventually called.
1095 		 */
1096 		V_rttrash++;
1097 
1098 		/*
1099 		 * If the caller wants it, then it can have it,
1100 		 * but it's up to it to free the rtentry as we won't be
1101 		 * doing it.
1102 		 */
1103 		if (ret_nrt) {
1104 			*ret_nrt = rt;
1105 			RT_UNLOCK(rt);
1106 		} else
1107 			RTFREE_LOCKED(rt);
1108 		break;
1109 	case RTM_RESOLVE:
1110 		/*
1111 		 * resolve was only used for route cloning
1112 		 * here for compat
1113 		 */
1114 		break;
1115 	case RTM_ADD:
1116 		if ((flags & RTF_GATEWAY) && !gateway)
1117 			senderr(EINVAL);
1118 		if (dst && gateway && (dst->sa_family != gateway->sa_family) &&
1119 		    (gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK))
1120 			senderr(EINVAL);
1121 
1122 		if (info->rti_ifa == NULL) {
1123 			error = rt_getifa_fib(info, fibnum);
1124 			if (error)
1125 				senderr(error);
1126 		} else
1127 			ifa_ref(info->rti_ifa);
1128 		ifa = info->rti_ifa;
1129 		rt = uma_zalloc(V_rtzone, M_NOWAIT | M_ZERO);
1130 		if (rt == NULL) {
1131 			ifa_free(ifa);
1132 			senderr(ENOBUFS);
1133 		}
1134 		RT_LOCK_INIT(rt);
1135 		rt->rt_flags = RTF_UP | flags;
1136 		rt->rt_fibnum = fibnum;
1137 		/*
1138 		 * Add the gateway. Possibly re-malloc-ing the storage for it
1139 		 *
1140 		 */
1141 		RT_LOCK(rt);
1142 		if ((error = rt_setgate(rt, dst, gateway)) != 0) {
1143 			RT_LOCK_DESTROY(rt);
1144 			ifa_free(ifa);
1145 			uma_zfree(V_rtzone, rt);
1146 			senderr(error);
1147 		}
1148 
1149 		/*
1150 		 * point to the (possibly newly malloc'd) dest address.
1151 		 */
1152 		ndst = (struct sockaddr *)rt_key(rt);
1153 
1154 		/*
1155 		 * make sure it contains the value we want (masked if needed).
1156 		 */
1157 		if (netmask) {
1158 			rt_maskedcopy(dst, ndst, netmask);
1159 		} else
1160 			bcopy(dst, ndst, dst->sa_len);
1161 
1162 		/*
1163 		 * We use the ifa reference returned by rt_getifa_fib().
1164 		 * This moved from below so that rnh->rnh_addaddr() can
1165 		 * examine the ifa and  ifa->ifa_ifp if it so desires.
1166 		 */
1167 		rt->rt_ifa = ifa;
1168 		rt->rt_ifp = ifa->ifa_ifp;
1169 		rt->rt_rmx.rmx_weight = 1;
1170 
1171 #ifdef RADIX_MPATH
1172 		/* do not permit exactly the same dst/mask/gw pair */
1173 		if (rn_mpath_capable(rnh) &&
1174 			rt_mpath_conflict(rnh, rt, netmask)) {
1175 			ifa_free(rt->rt_ifa);
1176 			Free(rt_key(rt));
1177 			RT_LOCK_DESTROY(rt);
1178 			uma_zfree(V_rtzone, rt);
1179 			senderr(EEXIST);
1180 		}
1181 #endif
1182 
1183 #ifdef FLOWTABLE
1184 		rt0 = NULL;
1185 		/* XXX
1186 		 * "flow-table" only support IPv4 at the moment.
1187 		 * XXX-BZ as of r205066 it would support IPv6.
1188 		 */
1189 #ifdef INET
1190 		if (dst->sa_family == AF_INET) {
1191 			rn = rnh->rnh_matchaddr(dst, rnh);
1192 			if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) {
1193 				struct sockaddr *mask;
1194 				u_char *m, *n;
1195 				int len;
1196 
1197 				/*
1198 				 * compare mask to see if the new route is
1199 				 * more specific than the existing one
1200 				 */
1201 				rt0 = RNTORT(rn);
1202 				RT_LOCK(rt0);
1203 				RT_ADDREF(rt0);
1204 				RT_UNLOCK(rt0);
1205 				/*
1206 				 * A host route is already present, so
1207 				 * leave the flow-table entries as is.
1208 				 */
1209 				if (rt0->rt_flags & RTF_HOST) {
1210 					RTFREE(rt0);
1211 					rt0 = NULL;
1212 				} else if (!(flags & RTF_HOST) && netmask) {
1213 					mask = rt_mask(rt0);
1214 					len = mask->sa_len;
1215 					m = (u_char *)mask;
1216 					n = (u_char *)netmask;
1217 					while (len-- > 0) {
1218 						if (*n != *m)
1219 							break;
1220 						n++;
1221 						m++;
1222 					}
1223 					if (len == 0 || (*n < *m)) {
1224 						RTFREE(rt0);
1225 						rt0 = NULL;
1226 					}
1227 				}
1228 			}
1229 		}
1230 #endif
1231 #endif
1232 
1233 		/* XXX mtu manipulation will be done in rnh_addaddr -- itojun */
1234 		rn = rnh->rnh_addaddr(ndst, netmask, rnh, rt->rt_nodes);
1235 		/*
1236 		 * If it still failed to go into the tree,
1237 		 * then un-make it (this should be a function)
1238 		 */
1239 		if (rn == NULL) {
1240 			ifa_free(rt->rt_ifa);
1241 			Free(rt_key(rt));
1242 			RT_LOCK_DESTROY(rt);
1243 			uma_zfree(V_rtzone, rt);
1244 #ifdef FLOWTABLE
1245 			if (rt0 != NULL)
1246 				RTFREE(rt0);
1247 #endif
1248 			senderr(EEXIST);
1249 		}
1250 #ifdef FLOWTABLE
1251 		else if (rt0 != NULL) {
1252 #ifdef INET
1253 			flowtable_route_flush(V_ip_ft, rt0);
1254 #endif
1255 			RTFREE(rt0);
1256 		}
1257 #endif
1258 
1259 		/*
1260 		 * If this protocol has something to add to this then
1261 		 * allow it to do that as well.
1262 		 */
1263 		if (ifa->ifa_rtrequest)
1264 			ifa->ifa_rtrequest(req, rt, info);
1265 
1266 		/*
1267 		 * actually return a resultant rtentry and
1268 		 * give the caller a single reference.
1269 		 */
1270 		if (ret_nrt) {
1271 			*ret_nrt = rt;
1272 			RT_ADDREF(rt);
1273 		}
1274 		RT_UNLOCK(rt);
1275 		break;
1276 	default:
1277 		error = EOPNOTSUPP;
1278 	}
1279 bad:
1280 	if (needlock)
1281 		RADIX_NODE_HEAD_UNLOCK(rnh);
1282 	return (error);
1283 #undef senderr
1284 }
1285 
/*
 * Release the short-hand macro names so that they are ordinary identifiers
 * again: the functions that follow use dst, netmask and flags as plain
 * parameter and local variable names.
 * NOTE(review): the matching #define lines are not visible in this part of
 * the file -- presumably they sit just above rtrequest1_fib(); confirm.
 */
1286 #undef dst
1287 #undef gateway
1288 #undef netmask
1289 #undef ifaaddr
1290 #undef ifpaddr
1291 #undef flags
1292 
1293 int
1294 rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate)
1295 {
1296 	/* XXX dst may be overwritten, can we move this to below */
1297 	int dlen = SA_SIZE(dst), glen = SA_SIZE(gate);
1298 #ifdef INVARIANTS
1299 	struct radix_node_head *rnh;
1300 
1301 	rnh = rt_tables_get_rnh(rt->rt_fibnum, dst->sa_family);
1302 #endif
1303 
1304 	RT_LOCK_ASSERT(rt);
1305 	RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
1306 
1307 	/*
1308 	 * Prepare to store the gateway in rt->rt_gateway.
1309 	 * Both dst and gateway are stored one after the other in the same
1310 	 * malloc'd chunk. If we have room, we can reuse the old buffer,
1311 	 * rt_gateway already points to the right place.
1312 	 * Otherwise, malloc a new block and update the 'dst' address.
1313 	 */
1314 	if (rt->rt_gateway == NULL || glen > SA_SIZE(rt->rt_gateway)) {
1315 		caddr_t new;
1316 
1317 		R_Malloc(new, caddr_t, dlen + glen);
1318 		if (new == NULL)
1319 			return ENOBUFS;
1320 		/*
1321 		 * XXX note, we copy from *dst and not *rt_key(rt) because
1322 		 * rt_setgate() can be called to initialize a newly
1323 		 * allocated route entry, in which case rt_key(rt) == NULL
1324 		 * (and also rt->rt_gateway == NULL).
1325 		 * Free()/free() handle a NULL argument just fine.
1326 		 */
1327 		bcopy(dst, new, dlen);
1328 		Free(rt_key(rt));	/* free old block, if any */
1329 		rt_key(rt) = (struct sockaddr *)new;
1330 		rt->rt_gateway = (struct sockaddr *)(new + dlen);
1331 	}
1332 
1333 	/*
1334 	 * Copy the new gateway value into the memory chunk.
1335 	 */
1336 	bcopy(gate, rt->rt_gateway, glen);
1337 
1338 	return (0);
1339 }
1340 
/*
 * Copy the socket address at src to dst with netmask applied.
 *
 * The first two bytes of src (sa_len and sa_family) are copied verbatim.
 * Every following byte is stored as (src byte & netmask byte) for as long
 * as both the source and the mask still have bytes left, going by the
 * length byte that leads each of them.  Whatever is left of dst up to the
 * source length is then zero-filled.
 *
 * dst must provide room for the source length, and for at least the two
 * header bytes.
 */
void
rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, struct sockaddr *netmask)
{
	const unsigned char *in = (const unsigned char *)src;
	const unsigned char *mask = (const unsigned char *)netmask;
	unsigned char *out = (unsigned char *)dst;
	unsigned int srclen = in[0];	/* length byte of the source */
	unsigned int masked = mask[0];	/* length byte of the mask */
	unsigned int i;

	/* Never mask past the end of the source address. */
	if (masked > srclen)
		masked = srclen;

	/* sa_len and sa_family are taken over unmodified. */
	out[0] = in[0];
	out[1] = in[1];

	for (i = 2; i < masked; i++)
		out[i] = in[i] & mask[i];

	/* Bytes not covered by the mask count as masked out. */
	if (i < srclen)
		bzero(out + i, srclen - i);
}
1359 
1360 /*
1361  * Set up a routing table entry, normally
1362  * for an interface.
1363  */
1364 #define _SOCKADDR_TMPSIZE 128 /* Not too big.. kernel stack size is limited */
1365 static inline  int
1366 rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum)
1367 {
1368 	struct sockaddr *dst;
1369 	struct sockaddr *netmask;
1370 	struct rtentry *rt = NULL;
1371 	struct rt_addrinfo info;
1372 	int error = 0;
1373 	int startfib, endfib;
1374 	char tempbuf[_SOCKADDR_TMPSIZE];
1375 	int didwork = 0;
1376 	int a_failure = 0;
1377 	static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK};
1378 
1379 	if (flags & RTF_HOST) {
1380 		dst = ifa->ifa_dstaddr;
1381 		netmask = NULL;
1382 	} else {
1383 		dst = ifa->ifa_addr;
1384 		netmask = ifa->ifa_netmask;
1385 	}
1386 	if ( dst->sa_family != AF_INET)
1387 		fibnum = 0;
1388 	if (fibnum == -1) {
1389 		if (rt_add_addr_allfibs == 0 && cmd == (int)RTM_ADD) {
1390 			startfib = endfib = curthread->td_proc->p_fibnum;
1391 		} else {
1392 			startfib = 0;
1393 			endfib = rt_numfibs - 1;
1394 		}
1395 	} else {
1396 		KASSERT((fibnum < rt_numfibs), ("rtinit1: bad fibnum"));
1397 		startfib = fibnum;
1398 		endfib = fibnum;
1399 	}
1400 	if (dst->sa_len == 0)
1401 		return(EINVAL);
1402 
1403 	/*
1404 	 * If it's a delete, check that if it exists,
1405 	 * it's on the correct interface or we might scrub
1406 	 * a route to another ifa which would
1407 	 * be confusing at best and possibly worse.
1408 	 */
1409 	if (cmd == RTM_DELETE) {
1410 		/*
1411 		 * It's a delete, so it should already exist..
1412 		 * If it's a net, mask off the host bits
1413 		 * (Assuming we have a mask)
1414 		 * XXX this is kinda inet specific..
1415 		 */
1416 		if (netmask != NULL) {
1417 			rt_maskedcopy(dst, (struct sockaddr *)tempbuf, netmask);
1418 			dst = (struct sockaddr *)tempbuf;
1419 		}
1420 	}
1421 	/*
1422 	 * Now go through all the requested tables (fibs) and do the
1423 	 * requested action. Realistically, this will either be fib 0
1424 	 * for protocols that don't do multiple tables or all the
1425 	 * tables for those that do. XXX For this version only AF_INET.
1426 	 * When that changes code should be refactored to protocol
1427 	 * independent parts and protocol dependent parts.
1428 	 */
1429 	for ( fibnum = startfib; fibnum <= endfib; fibnum++) {
1430 		if (cmd == RTM_DELETE) {
1431 			struct radix_node_head *rnh;
1432 			struct radix_node *rn;
1433 			/*
1434 			 * Look up an rtentry that is in the routing tree and
1435 			 * contains the correct info.
1436 			 */
1437 			rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
1438 			if (rnh == NULL)
1439 				/* this table doesn't exist but others might */
1440 				continue;
1441 			RADIX_NODE_HEAD_LOCK(rnh);
1442 #ifdef RADIX_MPATH
1443 			if (rn_mpath_capable(rnh)) {
1444 
1445 				rn = rnh->rnh_matchaddr(dst, rnh);
1446 				if (rn == NULL)
1447 					error = ESRCH;
1448 				else {
1449 					rt = RNTORT(rn);
1450 					/*
1451 					 * for interface route the
1452 					 * rt->rt_gateway is sockaddr_intf
1453 					 * for cloning ARP entries, so
1454 					 * rt_mpath_matchgate must use the
1455 					 * interface address
1456 					 */
1457 					rt = rt_mpath_matchgate(rt,
1458 					    ifa->ifa_addr);
1459 					if (!rt)
1460 						error = ESRCH;
1461 				}
1462 			}
1463 			else
1464 #endif
1465 			rn = rnh->rnh_lookup(dst, netmask, rnh);
1466 			error = (rn == NULL ||
1467 			    (rn->rn_flags & RNF_ROOT) ||
1468 			    RNTORT(rn)->rt_ifa != ifa ||
1469 			    !sa_equal((struct sockaddr *)rn->rn_key, dst));
1470 			RADIX_NODE_HEAD_UNLOCK(rnh);
1471 			if (error) {
1472 				/* this is only an error if bad on ALL tables */
1473 				continue;
1474 			}
1475 		}
1476 		/*
1477 		 * Do the actual request
1478 		 */
1479 		bzero((caddr_t)&info, sizeof(info));
1480 		info.rti_ifa = ifa;
1481 		info.rti_flags = flags | (ifa->ifa_flags & ~IFA_RTSELF);
1482 		info.rti_info[RTAX_DST] = dst;
1483 		/*
1484 		 * doing this for compatibility reasons
1485 		 */
1486 		if (cmd == RTM_ADD)
1487 			info.rti_info[RTAX_GATEWAY] =
1488 			    (struct sockaddr *)&null_sdl;
1489 		else
1490 			info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr;
1491 		info.rti_info[RTAX_NETMASK] = netmask;
1492 		error = rtrequest1_fib(cmd, &info, &rt, fibnum);
1493 		if (error == 0 && rt != NULL) {
1494 			/*
1495 			 * notify any listening routing agents of the change
1496 			 */
1497 			RT_LOCK(rt);
1498 #ifdef RADIX_MPATH
1499 			/*
1500 			 * in case address alias finds the first address
1501 			 * e.g. ifconfig bge0 192.103.54.246/24
1502 			 * e.g. ifconfig bge0 192.103.54.247/24
1503 			 * the address set in the route is 192.103.54.246
1504 			 * so we need to replace it with 192.103.54.247
1505 			 */
1506 			if (memcmp(rt->rt_ifa->ifa_addr,
1507 			    ifa->ifa_addr, ifa->ifa_addr->sa_len)) {
1508 				ifa_free(rt->rt_ifa);
1509 				ifa_ref(ifa);
1510 				rt->rt_ifp = ifa->ifa_ifp;
1511 				rt->rt_ifa = ifa;
1512 			}
1513 #endif
1514 			/*
1515 			 * doing this for compatibility reasons
1516 			 */
1517 			if (cmd == RTM_ADD) {
1518 			    ((struct sockaddr_dl *)rt->rt_gateway)->sdl_type  =
1519 				rt->rt_ifp->if_type;
1520 			    ((struct sockaddr_dl *)rt->rt_gateway)->sdl_index =
1521 				rt->rt_ifp->if_index;
1522 			}
1523 			RT_ADDREF(rt);
1524 			RT_UNLOCK(rt);
1525 			rt_newaddrmsg_fib(cmd, ifa, error, rt, fibnum);
1526 			RT_LOCK(rt);
1527 			RT_REMREF(rt);
1528 			if (cmd == RTM_DELETE) {
1529 				/*
1530 				 * If we are deleting, and we found an entry,
1531 				 * then it's been removed from the tree..
1532 				 * now throw it away.
1533 				 */
1534 				RTFREE_LOCKED(rt);
1535 			} else {
1536 				if (cmd == RTM_ADD) {
1537 					/*
1538 					 * We just wanted to add it..
1539 					 * we don't actually need a reference.
1540 					 */
1541 					RT_REMREF(rt);
1542 				}
1543 				RT_UNLOCK(rt);
1544 			}
1545 			didwork = 1;
1546 		}
1547 		if (error)
1548 			a_failure = error;
1549 	}
1550 	if (cmd == RTM_DELETE) {
1551 		if (didwork) {
1552 			error = 0;
1553 		} else {
1554 			/* we only give an error if it wasn't in any table */
1555 			error = ((flags & RTF_HOST) ?
1556 			    EHOSTUNREACH : ENETUNREACH);
1557 		}
1558 	} else {
1559 		if (a_failure) {
1560 			/* return an error if any of them failed */
1561 			error = a_failure;
1562 		}
1563 	}
1564 	return (error);
1565 }
1566 
/*
 * Variant of rtinit() meant for internal use by inet; other code should
 * probably not call it.  Always hands -1 to rtinit1() as the fib number.
 */
int
rtinit_fib(struct ifaddr *ifa, int cmd, int flags)
{
	int fib = -1;	/* not one fixed table: rtinit1() works out the range */

	return (rtinit1(ifa, cmd, flags, fib));
}
1573 
1574 /*
1575  * Set up a routing table entry, normally
1576  * for an interface.
1577  */
1578 int
1579 rtinit(struct ifaddr *ifa, int cmd, int flags)
1580 {
1581 	struct sockaddr *dst;
1582 	int fib = 0;
1583 
1584 	if (flags & RTF_HOST) {
1585 		dst = ifa->ifa_dstaddr;
1586 	} else {
1587 		dst = ifa->ifa_addr;
1588 	}
1589 
1590 	if (dst->sa_family == AF_INET)
1591 		fib = -1;
1592 	return (rtinit1(ifa, cmd, flags, fib));
1593 }
1594