xref: /freebsd/sys/net/route.c (revision 6d732c66bca5da4d261577aad2c8ea84519b0bea)
1 /*-
2  * Copyright (c) 1980, 1986, 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 4. Neither the name of the University nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  *	@(#)route.c	8.3.1.1 (Berkeley) 2/23/95
30  * $FreeBSD$
31  */
32 /************************************************************************
33  * Note: In this file a 'fib' is a "forwarding information base"	*
34  * Which is the new name for an in kernel routing (next hop) table.	*
35  ***********************************************************************/
36 
37 #include "opt_inet.h"
38 #include "opt_inet6.h"
39 #include "opt_route.h"
40 #include "opt_sctp.h"
41 #include "opt_mrouting.h"
42 #include "opt_mpath.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/syslog.h>
47 #include <sys/malloc.h>
48 #include <sys/mbuf.h>
49 #include <sys/socket.h>
50 #include <sys/sysctl.h>
51 #include <sys/syslog.h>
52 #include <sys/sysproto.h>
53 #include <sys/proc.h>
54 #include <sys/domain.h>
55 #include <sys/kernel.h>
56 #include <sys/kdb.h>
57 
58 #include <net/if.h>
59 #include <net/if_var.h>
60 #include <net/if_dl.h>
61 #include <net/route.h>
62 #include <net/vnet.h>
63 #include <net/flowtable.h>
64 
65 #ifdef RADIX_MPATH
66 #include <net/radix_mpath.h>
67 #endif
68 
69 #include <netinet/in.h>
70 #include <netinet/ip_mroute.h>
71 
72 #include <vm/uma.h>
73 
/*
 * Upper bound on the number of FIBs (routing tables).  route_init()
 * clamps the rt_numfibs tunable to this value.
 */
#define	RT_MAXFIBS	UINT16_MAX

/*
 * Kernel config default option.  When ROUTETABLES is defined it must lie
 * in the range 1..RT_MAXFIBS; anything else is rejected at compile time.
 */
#ifdef ROUTETABLES
#if ROUTETABLES <= 0
#error "ROUTETABLES defined too low"
#endif
#if ROUTETABLES > RT_MAXFIBS
#error "ROUTETABLES defined too big"
#endif
#define	RT_NUMFIBS	ROUTETABLES
#endif /* ROUTETABLES */
/* Initialize to default (a single FIB) if not otherwise set. */
#ifndef	RT_NUMFIBS
#define	RT_NUMFIBS	1
#endif

/*
 * Prototype for the SCTP address-change hook; only declared when SCTP and
 * at least one of INET/INET6 are configured.
 * NOTE(review): its caller is not visible in this part of the file.
 */
#if defined(INET) || defined(INET6)
#ifdef SCTP
extern void sctp_addr_change(struct ifaddr *ifa, int cmd);
#endif /* SCTP */
#endif
96 
97 
/*
 * Number of FIBs in use.  Exported read-only through the net.fibs sysctl;
 * the initial value is RT_NUMFIBS.
 */
u_int rt_numfibs = RT_NUMFIBS;
SYSCTL_UINT(_net, OID_AUTO, fibs, CTLFLAG_RD, &rt_numfibs, 0, "");
/*
 * ...and this can be set too big (or to zero) by the loader tunable, but
 * route_init() fixes it up before it is used.
 * NOTE(review): TUNABLE_INT is handed the address of a u_int here.
 */
TUNABLE_INT("net.fibs", &rt_numfibs);

/*
 * By default add routes to all fibs for new interfaces.
 * Once this is set to 0 then only allocate routes on interface
 * changes for the FIB of the caller when adding a new set of addresses
 * to an interface.  XXX this is a shotgun approach to a problem that needs
 * a more fine grained solution.. that will come.
 * XXX also has the problems getting the FIB from curthread which will not
 * always work given the fib can be overridden and prefixes can be added
 * from the network stack context.
 */
u_int rt_add_addr_allfibs = 1;
SYSCTL_UINT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RW,
    &rt_add_addr_allfibs, 0, "");
TUNABLE_INT("net.add_addr_allfibs", &rt_add_addr_allfibs);

/* Per-vnet routing statistics (e.g. rts_unreach, rts_badredirect). */
VNET_DEFINE(struct rtstat, rtstat);
#define	V_rtstat	VNET(rtstat)

/*
 * Per-vnet array of radix tree heads, indexed as [fib][address family];
 * see rt_tables_get_rnh_ptr() for the index computation.
 */
VNET_DEFINE(struct radix_node_head *, rt_tables);
#define	V_rt_tables	VNET(rt_tables)

VNET_DEFINE(int, rttrash);		/* routes not in table but not freed */
#define	V_rttrash	VNET(rttrash)
127 
128 
/*
 * Compare two sockaddr structures: they are equal when the lengths match
 * and the first sa_len bytes are identical.  Note that both arguments are
 * evaluated more than once, so do not pass expressions with side effects.
 */
#define	sa_equal(a1, a2) (((a1)->sa_len == (a2)->sa_len) && \
    (bcmp((a1), (a2), (a1)->sa_len) == 0))

/*
 * Convert a 'struct radix_node *' to a 'struct rtentry *'.
 * The operation can be done safely (in this code) because a
 * 'struct rtentry' starts with two 'struct radix_node''s, the first
 * one representing leaf nodes in the routing tree, which is
 * what the code in radix.c passes us as a 'struct radix_node'.
 *
 * But because there are a lot of assumptions in this conversion,
 * do not cast explicitly, but always use the macro below.
 */
#define RNTORT(p)	((struct rtentry *)(p))

/* Per-vnet UMA zone; created in vnet_route_init() with rtentry-sized items. */
static VNET_DEFINE(uma_zone_t, rtzone);		/* Routing table UMA zone. */
#define	V_rtzone	VNET(rtzone)
147 
148 /*
149  * handler for net.my_fibnum
150  */
151 static int
152 sysctl_my_fibnum(SYSCTL_HANDLER_ARGS)
153 {
154         int fibnum;
155         int error;
156 
157         fibnum = curthread->td_proc->p_fibnum;
158         error = sysctl_handle_int(oidp, &fibnum, 0, req);
159         return (error);
160 }
161 
162 SYSCTL_PROC(_net, OID_AUTO, my_fibnum, CTLTYPE_INT|CTLFLAG_RD,
163             NULL, 0, &sysctl_my_fibnum, "I", "default FIB of caller");
164 
165 static __inline struct radix_node_head **
166 rt_tables_get_rnh_ptr(int table, int fam)
167 {
168 	struct radix_node_head **rnh;
169 
170 	KASSERT(table >= 0 && table < rt_numfibs, ("%s: table out of bounds.",
171 	    __func__));
172 	KASSERT(fam >= 0 && fam < (AF_MAX+1), ("%s: fam out of bounds.",
173 	    __func__));
174 
175 	/* rnh is [fib=0][af=0]. */
176 	rnh = (struct radix_node_head **)V_rt_tables;
177 	/* Get the offset to the requested table and fam. */
178 	rnh += table * (AF_MAX+1) + fam;
179 
180 	return (rnh);
181 }
182 
/*
 * Return the radix head stored for FIB 'table' and address family 'fam'
 * (whatever value currently sits in that slot of the per-vnet table array).
 */
struct radix_node_head *
rt_tables_get_rnh(int table, int fam)
{
	struct radix_node_head **slot;

	slot = rt_tables_get_rnh_ptr(table, fam);
	return (*slot);
}
189 
/*
 * route initialization must occur before ip6_init2(), which happens at
 * SI_ORDER_MIDDLE.
 */
194 static void
195 route_init(void)
196 {
197 
198 	/* whack the tunable ints into  line. */
199 	if (rt_numfibs > RT_MAXFIBS)
200 		rt_numfibs = RT_MAXFIBS;
201 	if (rt_numfibs == 0)
202 		rt_numfibs = 1;
203 }
204 SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, 0);
205 
206 static void
207 vnet_route_init(const void *unused __unused)
208 {
209 	struct domain *dom;
210 	struct radix_node_head **rnh;
211 	int table;
212 	int fam;
213 
214 	V_rt_tables = malloc(rt_numfibs * (AF_MAX+1) *
215 	    sizeof(struct radix_node_head *), M_RTABLE, M_WAITOK|M_ZERO);
216 
217 	V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), NULL, NULL,
218 	    NULL, NULL, UMA_ALIGN_PTR, 0);
219 	for (dom = domains; dom; dom = dom->dom_next) {
220 		if (dom->dom_rtattach == NULL)
221 			continue;
222 
223 		for  (table = 0; table < rt_numfibs; table++) {
224 			fam = dom->dom_family;
225 			if (table != 0 && fam != AF_INET6 && fam != AF_INET)
226 				break;
227 
228 			/*
229 			 * XXX MRT rtattach will be also called from
230 			 * vfs_export.c but the offset will be 0 (only for
231 			 * AF_INET and AF_INET6 which don't need it anyhow).
232 			 */
233 			rnh = rt_tables_get_rnh_ptr(table, fam);
234 			if (rnh == NULL)
235 				panic("%s: rnh NULL", __func__);
236 			dom->dom_rtattach((void **)rnh, dom->dom_rtoffset);
237 		}
238 	}
239 }
240 VNET_SYSINIT(vnet_route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH,
241     vnet_route_init, 0);
242 
#ifdef VIMAGE
/*
 * Per-vnet routing teardown: call dom_rtdetach for every table slot that
 * vnet_route_init() would have attached (all FIBs for AF_INET/AF_INET6,
 * FIB 0 only for other families), then release the table array and the
 * rtentry zone.
 */
static void
vnet_route_uninit(const void *unused __unused)
{
	struct radix_node_head **slot;
	struct domain *d;
	int fib;
	int af;

	for (d = domains; d != NULL; d = d->dom_next) {
		if (d->dom_rtdetach == NULL)
			continue;

		af = d->dom_family;
		for (fib = 0; fib < rt_numfibs; fib++) {
			if (fib != 0 && !(af == AF_INET6 || af == AF_INET))
				break;

			slot = rt_tables_get_rnh_ptr(fib, af);
			if (slot == NULL)
				panic("%s: rnh NULL", __func__);
			d->dom_rtdetach((void **)slot, d->dom_rtoffset);
		}
	}

	free(V_rt_tables, M_RTABLE);
	uma_zdestroy(V_rtzone);
}
VNET_SYSUNINIT(vnet_route_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
    vnet_route_uninit, 0);
#endif
275 
276 #ifndef _SYS_SYSPROTO_H_
277 struct setfib_args {
278 	int     fibnum;
279 };
280 #endif
281 int
282 sys_setfib(struct thread *td, struct setfib_args *uap)
283 {
284 	if (uap->fibnum < 0 || uap->fibnum >= rt_numfibs)
285 		return EINVAL;
286 	td->td_proc->p_fibnum = uap->fibnum;
287 	return (0);
288 }
289 
290 /*
291  * Packet routing routines.
292  */
293 void
294 rtalloc(struct route *ro)
295 {
296 
297 	rtalloc_ign_fib(ro, 0UL, RT_DEFAULT_FIB);
298 }
299 
300 void
301 rtalloc_fib(struct route *ro, u_int fibnum)
302 {
303 	rtalloc_ign_fib(ro, 0UL, fibnum);
304 }
305 
306 void
307 rtalloc_ign(struct route *ro, u_long ignore)
308 {
309 	struct rtentry *rt;
310 
311 	if ((rt = ro->ro_rt) != NULL) {
312 		if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP)
313 			return;
314 		RTFREE(rt);
315 		ro->ro_rt = NULL;
316 	}
317 	ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, RT_DEFAULT_FIB);
318 	if (ro->ro_rt)
319 		RT_UNLOCK(ro->ro_rt);
320 }
321 
322 void
323 rtalloc_ign_fib(struct route *ro, u_long ignore, u_int fibnum)
324 {
325 	struct rtentry *rt;
326 
327 	if ((rt = ro->ro_rt) != NULL) {
328 		if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP)
329 			return;
330 		RTFREE(rt);
331 		ro->ro_rt = NULL;
332 	}
333 	ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, fibnum);
334 	if (ro->ro_rt)
335 		RT_UNLOCK(ro->ro_rt);
336 }
337 
338 /*
339  * Look up the route that matches the address given
340  * Or, at least try.. Create a cloned route if needed.
341  *
342  * The returned route, if any, is locked.
343  */
344 struct rtentry *
345 rtalloc1(struct sockaddr *dst, int report, u_long ignflags)
346 {
347 
348 	return (rtalloc1_fib(dst, report, ignflags, RT_DEFAULT_FIB));
349 }
350 
351 struct rtentry *
352 rtalloc1_fib(struct sockaddr *dst, int report, u_long ignflags,
353 		    u_int fibnum)
354 {
355 	struct radix_node_head *rnh;
356 	struct radix_node *rn;
357 	struct rtentry *newrt;
358 	struct rt_addrinfo info;
359 	int err = 0, msgtype = RTM_MISS;
360 	int needlock;
361 
362 	KASSERT((fibnum < rt_numfibs), ("rtalloc1_fib: bad fibnum"));
363 	switch (dst->sa_family) {
364 	case AF_INET6:
365 	case AF_INET:
366 		/* We support multiple FIBs. */
367 		break;
368 	default:
369 		fibnum = RT_DEFAULT_FIB;
370 		break;
371 	}
372 	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
373 	newrt = NULL;
374 	if (rnh == NULL)
375 		goto miss;
376 
377 	/*
378 	 * Look up the address in the table for that Address Family
379 	 */
380 	needlock = !(ignflags & RTF_RNH_LOCKED);
381 	if (needlock)
382 		RADIX_NODE_HEAD_RLOCK(rnh);
383 #ifdef INVARIANTS
384 	else
385 		RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
386 #endif
387 	rn = rnh->rnh_matchaddr(dst, rnh);
388 	if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) {
389 		newrt = RNTORT(rn);
390 		RT_LOCK(newrt);
391 		RT_ADDREF(newrt);
392 		if (needlock)
393 			RADIX_NODE_HEAD_RUNLOCK(rnh);
394 		goto done;
395 
396 	} else if (needlock)
397 		RADIX_NODE_HEAD_RUNLOCK(rnh);
398 
399 	/*
400 	 * Either we hit the root or couldn't find any match,
401 	 * Which basically means
402 	 * "caint get there frm here"
403 	 */
404 miss:
405 	V_rtstat.rts_unreach++;
406 
407 	if (report) {
408 		/*
409 		 * If required, report the failure to the supervising
410 		 * Authorities.
411 		 * For a delete, this is not an error. (report == 0)
412 		 */
413 		bzero(&info, sizeof(info));
414 		info.rti_info[RTAX_DST] = dst;
415 		rt_missmsg_fib(msgtype, &info, 0, err, fibnum);
416 	}
417 done:
418 	if (newrt)
419 		RT_LOCK_ASSERT(newrt);
420 	return (newrt);
421 }
422 
423 /*
424  * Remove a reference count from an rtentry.
425  * If the count gets low enough, take it out of the routing table
426  */
/*
 * Drop one reference on 'rt'; the caller must hold the route lock.
 *
 * If references remain afterwards, a debug message is logged and the route
 * is simply unlocked.  Otherwise the radix head's rnh_close hook (if any)
 * is invoked, and if the route is no longer RTF_UP its ifaddr reference,
 * key storage, lock and zone item are released.  In that last case the
 * function returns without unlocking, because the lock has been destroyed.
 */
void
rtfree(struct rtentry *rt)
{
	struct radix_node_head *rnh;

	KASSERT(rt != NULL,("%s: NULL rt", __func__));
	rnh = rt_tables_get_rnh(rt->rt_fibnum, rt_key(rt)->sa_family);
	KASSERT(rnh != NULL,("%s: NULL rnh", __func__));

	RT_LOCK_ASSERT(rt);

	/*
	 * The callers should use RTFREE_LOCKED() or RTFREE(), so
	 * we should come here exactly with the last reference.
	 */
	RT_REMREF(rt);
	if (rt->rt_refcnt > 0) {
		log(LOG_DEBUG, "%s: %p has %d refs\n", __func__, rt, rt->rt_refcnt);
		goto done;
	}

	/*
	 * On last reference give the "close method" a chance
	 * to cleanup private state.  This also permits (for
	 * IPv4 and IPv6) a chance to decide if the routing table
	 * entry should be purged immediately or at a later time.
	 * When an immediate purge is to happen the close routine
	 * typically calls rtexpunge which clears the RTF_UP flag
	 * on the entry so that the code below reclaims the storage.
	 */
	if (rt->rt_refcnt == 0 && rnh->rnh_close)
		rnh->rnh_close((struct radix_node *)rt, rnh);

	/*
	 * If we are no longer "up" (and ref == 0)
	 * then we can free the resources associated
	 * with the route.
	 */
	if ((rt->rt_flags & RTF_UP) == 0) {
		/* A route that is not "up" must already be out of the tree. */
		if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT))
			panic("rtfree 2");
		/*
		 * the rtentry must have been removed from the routing table
		 * so it is represented in rttrash.. remove that now.
		 */
		V_rttrash--;
#ifdef	DIAGNOSTIC
		/*
		 * NOTE(review): rttrash has already been decremented above
		 * when this bail-out is taken.
		 */
		if (rt->rt_refcnt < 0) {
			printf("rtfree: %p not freed (neg refs)\n", rt);
			goto done;
		}
#endif
		/*
		 * release references on items we hold them on..
		 * e.g other routes and ifaddrs.
		 */
		if (rt->rt_ifa)
			ifa_free(rt->rt_ifa);
		/*
		 * The key is separately alloc'd so free it (see rt_setgate()).
		 * This also frees the gateway, as they are always malloc'd
		 * together.
		 */
		Free(rt_key(rt));

		/*
		 * and the rtentry itself of course
		 */
		RT_LOCK_DESTROY(rt);
		uma_zfree(V_rtzone, rt);
		return;
	}
done:
	RT_UNLOCK(rt);
}
502 
503 
504 /*
505  * Force a routing table entry to the specified
506  * destination to go through the given gateway.
507  * Normally called as a result of a routing redirect
508  * message from the network layer.
509  */
510 void
511 rtredirect(struct sockaddr *dst,
512 	struct sockaddr *gateway,
513 	struct sockaddr *netmask,
514 	int flags,
515 	struct sockaddr *src)
516 {
517 
518 	rtredirect_fib(dst, gateway, netmask, flags, src, RT_DEFAULT_FIB);
519 }
520 
521 void
522 rtredirect_fib(struct sockaddr *dst,
523 	struct sockaddr *gateway,
524 	struct sockaddr *netmask,
525 	int flags,
526 	struct sockaddr *src,
527 	u_int fibnum)
528 {
529 	struct rtentry *rt, *rt0 = NULL;
530 	int error = 0;
531 	short *stat = NULL;
532 	struct rt_addrinfo info;
533 	struct ifaddr *ifa;
534 	struct radix_node_head *rnh;
535 
536 	ifa = NULL;
537 	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
538 	if (rnh == NULL) {
539 		error = EAFNOSUPPORT;
540 		goto out;
541 	}
542 
543 	/* verify the gateway is directly reachable */
544 	if ((ifa = ifa_ifwithnet(gateway, 0)) == NULL) {
545 		error = ENETUNREACH;
546 		goto out;
547 	}
548 	rt = rtalloc1_fib(dst, 0, 0UL, fibnum);	/* NB: rt is locked */
549 	/*
550 	 * If the redirect isn't from our current router for this dst,
551 	 * it's either old or wrong.  If it redirects us to ourselves,
552 	 * we have a routing loop, perhaps as a result of an interface
553 	 * going down recently.
554 	 */
555 	if (!(flags & RTF_DONE) && rt &&
556 	     (!sa_equal(src, rt->rt_gateway) || rt->rt_ifa != ifa))
557 		error = EINVAL;
558 	else if (ifa_ifwithaddr_check(gateway))
559 		error = EHOSTUNREACH;
560 	if (error)
561 		goto done;
562 	/*
563 	 * Create a new entry if we just got back a wildcard entry
564 	 * or the lookup failed.  This is necessary for hosts
565 	 * which use routing redirects generated by smart gateways
566 	 * to dynamically build the routing tables.
567 	 */
568 	if (rt == NULL || (rt_mask(rt) && rt_mask(rt)->sa_len < 2))
569 		goto create;
570 	/*
571 	 * Don't listen to the redirect if it's
572 	 * for a route to an interface.
573 	 */
574 	if (rt->rt_flags & RTF_GATEWAY) {
575 		if (((rt->rt_flags & RTF_HOST) == 0) && (flags & RTF_HOST)) {
576 			/*
577 			 * Changing from route to net => route to host.
578 			 * Create new route, rather than smashing route to net.
579 			 */
580 		create:
581 			rt0 = rt;
582 			rt = NULL;
583 
584 			flags |=  RTF_GATEWAY | RTF_DYNAMIC;
585 			bzero((caddr_t)&info, sizeof(info));
586 			info.rti_info[RTAX_DST] = dst;
587 			info.rti_info[RTAX_GATEWAY] = gateway;
588 			info.rti_info[RTAX_NETMASK] = netmask;
589 			info.rti_ifa = ifa;
590 			info.rti_flags = flags;
591 			if (rt0 != NULL)
592 				RT_UNLOCK(rt0);	/* drop lock to avoid LOR with RNH */
593 			error = rtrequest1_fib(RTM_ADD, &info, &rt, fibnum);
594 			if (rt != NULL) {
595 				RT_LOCK(rt);
596 				if (rt0 != NULL)
597 					EVENTHANDLER_INVOKE(route_redirect_event, rt0, rt, dst);
598 				flags = rt->rt_flags;
599 			}
600 			if (rt0 != NULL)
601 				RTFREE(rt0);
602 
603 			stat = &V_rtstat.rts_dynamic;
604 		} else {
605 			struct rtentry *gwrt;
606 
607 			/*
608 			 * Smash the current notion of the gateway to
609 			 * this destination.  Should check about netmask!!!
610 			 */
611 			rt->rt_flags |= RTF_MODIFIED;
612 			flags |= RTF_MODIFIED;
613 			stat = &V_rtstat.rts_newgateway;
614 			/*
615 			 * add the key and gateway (in one malloc'd chunk).
616 			 */
617 			RT_UNLOCK(rt);
618 			RADIX_NODE_HEAD_LOCK(rnh);
619 			RT_LOCK(rt);
620 			rt_setgate(rt, rt_key(rt), gateway);
621 			gwrt = rtalloc1(gateway, 1, RTF_RNH_LOCKED);
622 			RADIX_NODE_HEAD_UNLOCK(rnh);
623 			EVENTHANDLER_INVOKE(route_redirect_event, rt, gwrt, dst);
624 			RTFREE_LOCKED(gwrt);
625 		}
626 	} else
627 		error = EHOSTUNREACH;
628 done:
629 	if (rt)
630 		RTFREE_LOCKED(rt);
631 out:
632 	if (error)
633 		V_rtstat.rts_badredirect++;
634 	else if (stat != NULL)
635 		(*stat)++;
636 	bzero((caddr_t)&info, sizeof(info));
637 	info.rti_info[RTAX_DST] = dst;
638 	info.rti_info[RTAX_GATEWAY] = gateway;
639 	info.rti_info[RTAX_NETMASK] = netmask;
640 	info.rti_info[RTAX_AUTHOR] = src;
641 	rt_missmsg_fib(RTM_REDIRECT, &info, flags, error, fibnum);
642 	if (ifa != NULL)
643 		ifa_free(ifa);
644 }
645 
646 int
647 rtioctl(u_long req, caddr_t data)
648 {
649 
650 	return (rtioctl_fib(req, data, RT_DEFAULT_FIB));
651 }
652 
653 /*
654  * Routing table ioctl interface.
655  */
656 int
657 rtioctl_fib(u_long req, caddr_t data, u_int fibnum)
658 {
659 
660 	/*
661 	 * If more ioctl commands are added here, make sure the proper
662 	 * super-user checks are being performed because it is possible for
663 	 * prison-root to make it this far if raw sockets have been enabled
664 	 * in jails.
665 	 */
666 #ifdef INET
667 	/* Multicast goop, grrr... */
668 	return mrt_ioctl ? mrt_ioctl(req, data, fibnum) : EOPNOTSUPP;
669 #else /* INET */
670 	return ENXIO;
671 #endif /* INET */
672 }
673 
674 /*
675  * For both ifa_ifwithroute() routines, 'ifa' is returned referenced.
676  */
677 struct ifaddr *
678 ifa_ifwithroute(int flags, struct sockaddr *dst, struct sockaddr *gateway)
679 {
680 
681 	return (ifa_ifwithroute_fib(flags, dst, gateway, RT_DEFAULT_FIB));
682 }
683 
684 struct ifaddr *
685 ifa_ifwithroute_fib(int flags, struct sockaddr *dst, struct sockaddr *gateway,
686 				u_int fibnum)
687 {
688 	register struct ifaddr *ifa;
689 	int not_found = 0;
690 
691 	if ((flags & RTF_GATEWAY) == 0) {
692 		/*
693 		 * If we are adding a route to an interface,
694 		 * and the interface is a pt to pt link
695 		 * we should search for the destination
696 		 * as our clue to the interface.  Otherwise
697 		 * we can use the local address.
698 		 */
699 		ifa = NULL;
700 		if (flags & RTF_HOST)
701 			ifa = ifa_ifwithdstaddr(dst);
702 		if (ifa == NULL)
703 			ifa = ifa_ifwithaddr(gateway);
704 	} else {
705 		/*
706 		 * If we are adding a route to a remote net
707 		 * or host, the gateway may still be on the
708 		 * other end of a pt to pt link.
709 		 */
710 		ifa = ifa_ifwithdstaddr(gateway);
711 	}
712 	if (ifa == NULL)
713 		ifa = ifa_ifwithnet(gateway, 0);
714 	if (ifa == NULL) {
715 		struct rtentry *rt = rtalloc1_fib(gateway, 0, RTF_RNH_LOCKED, fibnum);
716 		if (rt == NULL)
717 			return (NULL);
718 		/*
719 		 * dismiss a gateway that is reachable only
720 		 * through the default router
721 		 */
722 		switch (gateway->sa_family) {
723 		case AF_INET:
724 			if (satosin(rt_key(rt))->sin_addr.s_addr == INADDR_ANY)
725 				not_found = 1;
726 			break;
727 		case AF_INET6:
728 			if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(rt))->sin6_addr))
729 				not_found = 1;
730 			break;
731 		default:
732 			break;
733 		}
734 		if (!not_found && rt->rt_ifa != NULL) {
735 			ifa = rt->rt_ifa;
736 			ifa_ref(ifa);
737 		}
738 		RT_REMREF(rt);
739 		RT_UNLOCK(rt);
740 		if (not_found || ifa == NULL)
741 			return (NULL);
742 	}
743 	if (ifa->ifa_addr->sa_family != dst->sa_family) {
744 		struct ifaddr *oifa = ifa;
745 		ifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp);
746 		if (ifa == NULL)
747 			ifa = oifa;
748 		else
749 			ifa_free(oifa);
750 	}
751 	return (ifa);
752 }
753 
754 /*
755  * Do appropriate manipulations of a routing tree given
756  * all the bits of info needed
757  */
758 int
759 rtrequest(int req,
760 	struct sockaddr *dst,
761 	struct sockaddr *gateway,
762 	struct sockaddr *netmask,
763 	int flags,
764 	struct rtentry **ret_nrt)
765 {
766 
767 	return (rtrequest_fib(req, dst, gateway, netmask, flags, ret_nrt,
768 	    RT_DEFAULT_FIB));
769 }
770 
771 int
772 rtrequest_fib(int req,
773 	struct sockaddr *dst,
774 	struct sockaddr *gateway,
775 	struct sockaddr *netmask,
776 	int flags,
777 	struct rtentry **ret_nrt,
778 	u_int fibnum)
779 {
780 	struct rt_addrinfo info;
781 
782 	if (dst->sa_len == 0)
783 		return(EINVAL);
784 
785 	bzero((caddr_t)&info, sizeof(info));
786 	info.rti_flags = flags;
787 	info.rti_info[RTAX_DST] = dst;
788 	info.rti_info[RTAX_GATEWAY] = gateway;
789 	info.rti_info[RTAX_NETMASK] = netmask;
790 	return rtrequest1_fib(req, &info, ret_nrt, fibnum);
791 }
792 
/*
 * These (questionable) definitions of apparent local variables apply
 * to the functions that follow.  XXXXXX!!!
 *
 * Each name expands to a field of a variable called 'info', which must
 * therefore be a 'struct rt_addrinfo *' in scope wherever they are used.
 * Code below this point must not declare locals or parameters with these
 * names.
 * NOTE(review): the matching #undef, if any, is not visible in this part
 * of the file.
 */
#define	dst	info->rti_info[RTAX_DST]
#define	gateway	info->rti_info[RTAX_GATEWAY]
#define	netmask	info->rti_info[RTAX_NETMASK]
#define	ifaaddr	info->rti_info[RTAX_IFA]
#define	ifpaddr	info->rti_info[RTAX_IFP]
#define	flags	info->rti_flags
803 
804 int
805 rt_getifa(struct rt_addrinfo *info)
806 {
807 
808 	return (rt_getifa_fib(info, RT_DEFAULT_FIB));
809 }
810 
811 /*
812  * Look up rt_addrinfo for a specific fib.  Note that if rti_ifa is defined,
813  * it will be referenced so the caller must free it.
814  */
815 int
816 rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum)
817 {
818 	struct ifaddr *ifa;
819 	int error = 0;
820 
821 	/*
822 	 * ifp may be specified by sockaddr_dl
823 	 * when protocol address is ambiguous.
824 	 */
825 	if (info->rti_ifp == NULL && ifpaddr != NULL &&
826 	    ifpaddr->sa_family == AF_LINK &&
827 	    (ifa = ifa_ifwithnet(ifpaddr, 0)) != NULL) {
828 		info->rti_ifp = ifa->ifa_ifp;
829 		ifa_free(ifa);
830 	}
831 	if (info->rti_ifa == NULL && ifaaddr != NULL)
832 		info->rti_ifa = ifa_ifwithaddr(ifaaddr);
833 	if (info->rti_ifa == NULL) {
834 		struct sockaddr *sa;
835 
836 		sa = ifaaddr != NULL ? ifaaddr :
837 		    (gateway != NULL ? gateway : dst);
838 		if (sa != NULL && info->rti_ifp != NULL)
839 			info->rti_ifa = ifaof_ifpforaddr(sa, info->rti_ifp);
840 		else if (dst != NULL && gateway != NULL)
841 			info->rti_ifa = ifa_ifwithroute_fib(flags, dst, gateway,
842 							fibnum);
843 		else if (sa != NULL)
844 			info->rti_ifa = ifa_ifwithroute_fib(flags, sa, sa,
845 							fibnum);
846 	}
847 	if ((ifa = info->rti_ifa) != NULL) {
848 		if (info->rti_ifp == NULL)
849 			info->rti_ifp = ifa->ifa_ifp;
850 	} else
851 		error = ENETUNREACH;
852 	return (error);
853 }
854 
855 /*
856  * Expunges references to a route that's about to be reclaimed.
857  * The route must be locked.
858  */
/*
 * Remove 'rt' from its routing tree and mark it as no longer RTF_UP.
 *
 * The route must be locked and the radix head lock must be held (both are
 * asserted).  After removal the interface address' ifa_rtrequest hook, if
 * any, is told about the RTM_DELETE and rttrash is incremented.
 *
 * Returns 0 on success, EAFNOSUPPORT when no table exists for the route's
 * family, ESRCH (non-multipath build) when the route was not in the tree,
 * or the error from rtrequest1_fib() in the RADIX_MPATH build.
 */
int
rtexpunge(struct rtentry *rt)
{
#if !defined(RADIX_MPATH)
	struct radix_node *rn;
#else
	struct rt_addrinfo info;
	int fib;
	struct rtentry *rt0;
#endif
	struct radix_node_head *rnh;
	struct ifaddr *ifa;
	int error = 0;

	/*
	 * Find the correct routing tree to use for this Address Family
	 */
	rnh = rt_tables_get_rnh(rt->rt_fibnum, rt_key(rt)->sa_family);
	RT_LOCK_ASSERT(rt);
	if (rnh == NULL)
		return (EAFNOSUPPORT);
	RADIX_NODE_HEAD_LOCK_ASSERT(rnh);

#ifdef RADIX_MPATH
	/*
	 * Multipath build: delete through rtrequest1_fib(), telling it that
	 * the radix head is already locked.  The route lock is dropped
	 * around the call and re-taken afterwards.
	 */
	fib = rt->rt_fibnum;
	bzero(&info, sizeof(info));
	info.rti_ifp = rt->rt_ifp;
	info.rti_flags = RTF_RNH_LOCKED;
	info.rti_info[RTAX_DST] = rt_key(rt);
	info.rti_info[RTAX_GATEWAY] = rt->rt_ifa->ifa_addr;

	RT_UNLOCK(rt);
	error = rtrequest1_fib(RTM_DELETE, &info, &rt0, fib);

	if (error == 0 && rt0 != NULL) {
		/* Continue with the entry that was actually removed. */
		rt = rt0;
		RT_LOCK(rt);
	} else if (error != 0) {
		/* Restore the caller's locked state before failing. */
		RT_LOCK(rt);
		return (error);
	}
#else
	/*
	 * Remove the item from the tree; it should be there,
	 * but when callers invoke us blindly it may not (sigh).
	 */
	rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), rnh);
	if (rn == NULL) {
		error = ESRCH;
		goto bad;
	}
	KASSERT((rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) == 0,
		("unexpected flags 0x%x", rn->rn_flags));
	KASSERT(rt == RNTORT(rn),
		("lookup mismatch, rt %p rn %p", rt, rn));
#endif /* RADIX_MPATH */

	rt->rt_flags &= ~RTF_UP;

	/*
	 * Give the protocol a chance to keep things in sync.
	 */
	if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest) {
		struct rt_addrinfo info;

		bzero((caddr_t)&info, sizeof(info));
		info.rti_flags = rt->rt_flags;
		info.rti_info[RTAX_DST] = rt_key(rt);
		info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
		info.rti_info[RTAX_NETMASK] = rt_mask(rt);
		ifa->ifa_rtrequest(RTM_DELETE, rt, &info);
	}

	/*
	 * one more rtentry floating around that is not
	 * linked to the routing table.
	 */
	V_rttrash++;
#if !defined(RADIX_MPATH)
bad:
#endif
	return (error);
}
942 
943 #if 0
944 int p_sockaddr(char *buf, int buflen, struct sockaddr *s);
945 int rt_print(char *buf, int buflen, struct rtentry *rt);
946 
/*
 * Format the address held in 's' as text into 'buf' (at most 'buflen'
 * bytes).  Only AF_INET and AF_INET6 are handled.  Returns the length of
 * the resulting string, or 0 if the family is unsupported or the
 * conversion fails.
 */
int
p_sockaddr(char *buf, int buflen, struct sockaddr *s)
{
	void *raw;

	if (s->sa_family == AF_INET)
		raw = &((struct sockaddr_in *)s)->sin_addr;
	else if (s->sa_family == AF_INET6)
		raw = &((struct sockaddr_in6 *)s)->sin6_addr;
	else
		return (0);

	if (inet_ntop(s->sa_family, raw, buf, buflen) == NULL)
		return (0);

	return (strlen(buf));
}
969 
970 int
971 rt_print(char *buf, int buflen, struct rtentry *rt)
972 {
973 	struct sockaddr *addr, *mask;
974 	int i = 0;
975 
976 	addr = rt_key(rt);
977 	mask = rt_mask(rt);
978 
979 	i = p_sockaddr(buf, buflen, addr);
980 	if (!(rt->rt_flags & RTF_HOST)) {
981 		buf[i++] = '/';
982 		i += p_sockaddr(buf + i, buflen - i, mask);
983 	}
984 
985 	if (rt->rt_flags & RTF_GATEWAY) {
986 		buf[i++] = '>';
987 		i += p_sockaddr(buf + i, buflen - i, rt->rt_gateway);
988 	}
989 
990 	return (i);
991 }
992 #endif
993 
994 #ifdef RADIX_MPATH
/*
 * Multipath-aware removal of the route described by 'info' from 'rnh'.
 *
 * The entry is located by destination/netmask and then narrowed to the
 * chain member whose gateway matches RTAX_GATEWAY.  Return values:
 *   0       the matching entry was unlinked; it is handed back through
 *           'ret_nrt' (unlocked, with a reference) or released
 *   ESRCH   no matching destination or gateway
 *   ENOENT  the match is the first entry of the chain and must be removed
 *           by the caller's normal delete code
 * Any request other than RTM_DELETE that reaches the end of the function
 * panics.
 */
static int
rn_mpath_update(int req, struct rt_addrinfo *info,
    struct radix_node_head *rnh, struct rtentry **ret_nrt)
{
	/*
	 * if we got multipath routes, we require users to specify
	 * a matching RTAX_GATEWAY.
	 */
	struct rtentry *rt, *rto = NULL;
	register struct radix_node *rn;
	int error = 0;

	rn = rnh->rnh_lookup(dst, netmask, rnh);
	if (rn == NULL)
		return (ESRCH);
	/* rto remembers the head of the chain; rt becomes the match. */
	rto = rt = RNTORT(rn);

	rt = rt_mpath_matchgate(rt, gateway);
	if (rt == NULL)
		return (ESRCH);
	/*
	 * this is the first entry in the chain
	 */
	if (rto == rt) {
		rn = rn_mpath_next((struct radix_node *)rt);
		/*
		 * there is another entry, now it's active
		 */
		if (rn) {
			rto = RNTORT(rn);
			RT_LOCK(rto);
			rto->rt_flags |= RTF_UP;
			RT_UNLOCK(rto);
		} else if (rt->rt_flags & RTF_GATEWAY) {
			/*
			 * For gateway routes, we need to
			 * make sure that we are deleting
			 * the correct gateway.
			 * rt_mpath_matchgate() does not
			 * check the case when there is only
			 * one route in the chain.
			 */
			if (gateway &&
			    (rt->rt_gateway->sa_len != gateway->sa_len ||
				memcmp(rt->rt_gateway, gateway, gateway->sa_len)))
				error = ESRCH;
			else {
				/*
				 * remove from tree before returning it
				 * to the caller
				 */
				rn = rnh->rnh_deladdr(dst, netmask, rnh);
				KASSERT(rt == RNTORT(rn), ("radix node disappeared"));
				goto gwdelete;
			}

		}
		/*
		 * use the normal delete code to remove
		 * the first entry
		 */
		if (req != RTM_DELETE)
			goto nondelete;

		/*
		 * NOTE(review): this overwrites an ESRCH set by the gateway
		 * mismatch check above.
		 */
		error = ENOENT;
		goto done;
	}

	/*
	 * if the entry is 2nd and on up
	 */
	if ((req == RTM_DELETE) && !rt_mpath_deldup(rto, rt))
		panic ("rtrequest1: rt_mpath_deldup");
gwdelete:
	/* Take the lock and a reference for the hand-off below. */
	RT_LOCK(rt);
	RT_ADDREF(rt);
	if (req == RTM_DELETE) {
		rt->rt_flags &= ~RTF_UP;
		/*
		 * One more rtentry floating around that is not
		 * linked to the routing table. rttrash will be decremented
		 * when RTFREE(rt) is eventually called.
		 */
		V_rttrash++;
	}

nondelete:
	if (req != RTM_DELETE)
		panic("unrecognized request %d", req);


	/*
	 * If the caller wants it, then it can have it,
	 * but it's up to it to free the rtentry as we won't be
	 * doing it.
	 */
	if (ret_nrt) {
		*ret_nrt = rt;
		RT_UNLOCK(rt);
	} else
		RTFREE_LOCKED(rt);
done:
	return (error);
}
1099 #endif
1100 
/*
 * rtrequest1_fib: carry out one routing-table request on a single FIB.
 *
 *   req     - RTM_ADD, RTM_DELETE or RTM_RESOLVE.  RTM_RESOLVE is accepted
 *             but does nothing; any other value returns EOPNOTSUPP.
 *   info    - addresses and flags describing the route.  The names dst,
 *             gateway, netmask and flags used below are file-level macros
 *             (they are #undef'd right after this function); presumably
 *             they expand to fields of *info -- confirm at their definition.
 *   ret_nrt - optional.  When non-NULL it receives the affected rtentry:
 *             on RTM_DELETE the entry just unlinked from the tree, on
 *             RTM_ADD the new entry with an extra reference taken for the
 *             caller.  In both cases the entry is returned unlocked.
 *   fibnum  - FIB to operate on.  For address families other than
 *             AF_INET/AF_INET6 it is replaced by RT_DEFAULT_FIB.
 *
 * The radix head lock is taken here unless the caller passes
 * RTF_RNH_LOCKED in flags, in which case it is only asserted.
 *
 * Returns 0 on success, otherwise an errno value set in this function
 * (EAFNOSUPPORT, EADDRINUSE, ESRCH, EINVAL, ENOBUFS, EEXIST, EOPNOTSUPP)
 * or propagated from rn_mpath_update(), rt_getifa_fib() or rt_setgate().
 */
1101 int
1102 rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt,
1103 				u_int fibnum)
1104 {
1105 	int error = 0, needlock = 0;
1106 	register struct rtentry *rt;
1107 #ifdef FLOWTABLE
1108 	register struct rtentry *rt0;
1109 #endif
1110 	register struct radix_node *rn;
1111 	register struct radix_node_head *rnh;
1112 	struct ifaddr *ifa;
1113 	struct sockaddr *ndst;
1114 	struct sockaddr_storage mdst;
	/*
	 * Record an error and leave through the common exit at "bad",
	 * which releases the head lock if this function took it.
	 */
1115 #define senderr(x) { error = x ; goto bad; }
1116 
1117 	KASSERT((fibnum < rt_numfibs), ("rtrequest1_fib: bad fibnum"));
1118 	switch (dst->sa_family) {
1119 	case AF_INET6:
1120 	case AF_INET:
1121 		/* We support multiple FIBs. */
1122 		break;
1123 	default:
1124 		fibnum = RT_DEFAULT_FIB;
1125 		break;
1126 	}
1127 
1128 	/*
1129 	 * Find the correct routing tree to use for this Address Family
1130 	 */
1131 	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
1132 	if (rnh == NULL)
1133 		return (EAFNOSUPPORT);
	/*
	 * RTF_RNH_LOCKED means the caller already holds the head lock.
	 * The bit is cleared from flags before flags is used any further.
	 */
1134 	needlock = ((flags & RTF_RNH_LOCKED) == 0);
1135 	flags &= ~RTF_RNH_LOCKED;
1136 	if (needlock)
1137 		RADIX_NODE_HEAD_LOCK(rnh);
1138 	else
1139 		RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
1140 	/*
1141 	 * If we are adding a host route then we don't want to put
1142 	 * a netmask in the tree, nor do we want to clone it.
1143 	 */
1144 	if (flags & RTF_HOST)
1145 		netmask = NULL;
1146 
1147 	switch (req) {
1148 	case RTM_DELETE:
		/* Delete by the masked destination, built in local storage. */
1149 		if (netmask) {
1150 			rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask);
1151 			dst = (struct sockaddr *)&mdst;
1152 		}
1153 #ifdef RADIX_MPATH
1154 		if (rn_mpath_capable(rnh)) {
1155 			error = rn_mpath_update(req, info, rnh, ret_nrt);
1156 			/*
1157 			 * "bad" holds true for the success case
1158 			 * as well
1159 			 */
1160 			if (error != ENOENT)
1161 				goto bad;
			/* ENOENT: fall through to the ordinary delete below. */
1162 			error = 0;
1163 		}
1164 #endif
		/*
		 * A pinned route may only be removed by a request that
		 * itself carries RTF_PINNED.
		 */
1165 		if ((flags & RTF_PINNED) == 0) {
1166 			/* Check if target route can be deleted */
1167 			rt = (struct rtentry *)rnh->rnh_lookup(dst,
1168 			    netmask, rnh);
1169 			if ((rt != NULL) && (rt->rt_flags & RTF_PINNED))
1170 				senderr(EADDRINUSE);
1171 		}
1172 
1173 		/*
1174 		 * Remove the item from the tree and return it.
1175 		 * Complain if it is not there and do no more processing.
1176 		 */
1177 		rn = rnh->rnh_deladdr(dst, netmask, rnh);
1178 		if (rn == NULL)
1179 			senderr(ESRCH);
1180 		if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT))
1181 			panic ("rtrequest delete");
1182 		rt = RNTORT(rn);
		/*
		 * Lock the entry and take a reference; that reference is
		 * either handed to the caller through ret_nrt or dropped by
		 * RTFREE_LOCKED() at the end of this case.
		 */
1183 		RT_LOCK(rt);
1184 		RT_ADDREF(rt);
1185 		rt->rt_flags &= ~RTF_UP;
1186 
1187 		/*
1188 		 * give the protocol a chance to keep things in sync.
1189 		 */
1190 		if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest)
1191 			ifa->ifa_rtrequest(RTM_DELETE, rt, info);
1192 
1193 		/*
1194 		 * One more rtentry floating around that is not
1195 		 * linked to the routing table. rttrash will be decremented
1196 		 * when RTFREE(rt) is eventually called.
1197 		 */
1198 		V_rttrash++;
1199 
1200 		/*
1201 		 * If the caller wants it, then it can have it,
1202 		 * but it's up to it to free the rtentry as we won't be
1203 		 * doing it.
1204 		 */
1205 		if (ret_nrt) {
1206 			*ret_nrt = rt;
1207 			RT_UNLOCK(rt);
1208 		} else
1209 			RTFREE_LOCKED(rt);
1210 		break;
1211 	case RTM_RESOLVE:
1212 		/*
1213 		 * resolve was only used for route cloning
1214 		 * here for compat
1215 		 */
1216 		break;
1217 	case RTM_ADD:
		/* A gateway route needs a gateway address. */
1218 		if ((flags & RTF_GATEWAY) && !gateway)
1219 			senderr(EINVAL);
		/*
		 * The gateway must be of the destination's family, or
		 * AF_UNSPEC, or AF_LINK.
		 */
1220 		if (dst && gateway && (dst->sa_family != gateway->sa_family) &&
1221 		    (gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK))
1222 			senderr(EINVAL);
1223 
		/*
		 * Either way we end up owning one ifa reference: the one
		 * rt_getifa_fib() is expected to return (see the comment
		 * further down), or one taken here on the caller's ifa.
		 * Every failure path below releases it with ifa_free().
		 */
1224 		if (info->rti_ifa == NULL) {
1225 			error = rt_getifa_fib(info, fibnum);
1226 			if (error)
1227 				senderr(error);
1228 		} else
1229 			ifa_ref(info->rti_ifa);
1230 		ifa = info->rti_ifa;
		/* M_NOWAIT allocation: a NULL result is handled just below. */
1231 		rt = uma_zalloc(V_rtzone, M_NOWAIT | M_ZERO);
1232 		if (rt == NULL) {
1233 			ifa_free(ifa);
1234 			senderr(ENOBUFS);
1235 		}
1236 		RT_LOCK_INIT(rt);
1237 		rt->rt_flags = RTF_UP | flags;
1238 		rt->rt_fibnum = fibnum;
1239 		/*
1240 		 * Add the gateway. Possibly re-malloc-ing the storage for it.
1241 		 */
1242 		RT_LOCK(rt);
		/*
		 * NOTE(review): on the failure paths below (here, the mpath
		 * conflict and the rnh_addaddr failure) the rt lock taken
		 * above is destroyed without a preceding RT_UNLOCK -- confirm
		 * that RT_LOCK_DESTROY tolerates a held lock.
		 */
1243 		if ((error = rt_setgate(rt, dst, gateway)) != 0) {
1244 			RT_LOCK_DESTROY(rt);
1245 			ifa_free(ifa);
1246 			uma_zfree(V_rtzone, rt);
1247 			senderr(error);
1248 		}
1249 
1250 		/*
1251 		 * point to the (possibly newly malloc'd) dest address.
1252 		 */
1253 		ndst = (struct sockaddr *)rt_key(rt);
1254 
1255 		/*
1256 		 * make sure it contains the value we want (masked if needed).
1257 		 */
1258 		if (netmask) {
1259 			rt_maskedcopy(dst, ndst, netmask);
1260 		} else
1261 			bcopy(dst, ndst, dst->sa_len);
1262 
1263 		/*
1264 		 * We use the ifa reference returned by rt_getifa_fib().
1265 		 * This moved from below so that rnh->rnh_addaddr() can
1266 		 * examine the ifa and  ifa->ifa_ifp if it so desires.
1267 		 */
1268 		rt->rt_ifa = ifa;
1269 		rt->rt_ifp = ifa->ifa_ifp;
1270 		rt->rt_rmx.rmx_weight = 1;
1271 
1272 #ifdef RADIX_MPATH
1273 		/* do not permit exactly the same dst/mask/gw pair */
1274 		if (rn_mpath_capable(rnh) &&
1275 			rt_mpath_conflict(rnh, rt, netmask)) {
1276 			ifa_free(rt->rt_ifa);
1277 			Free(rt_key(rt));
1278 			RT_LOCK_DESTROY(rt);
1279 			uma_zfree(V_rtzone, rt);
1280 			senderr(EEXIST);
1281 		}
1282 #endif
1283 
1284 #ifdef FLOWTABLE
		/*
		 * Before inserting, find the route that currently matches
		 * dst.  If it is kept in rt0 (referenced), its flow-table
		 * entries are flushed after the insert succeeds.
		 */
1285 		rt0 = NULL;
1286 		/* "flow-table" only supports IPv6 and IPv4 at the moment. */
1287 		switch (dst->sa_family) {
1288 #ifdef INET6
1289 		case AF_INET6:
1290 #endif
1291 #ifdef INET
1292 		case AF_INET:
1293 #endif
1294 #if defined(INET6) || defined(INET)
1295 			rn = rnh->rnh_matchaddr(dst, rnh);
1296 			if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) {
1297 				struct sockaddr *mask;
1298 				u_char *m, *n;
1299 				int len;
1300 
1301 				/*
1302 				 * compare mask to see if the new route is
1303 				 * more specific than the existing one
1304 				 */
1305 				rt0 = RNTORT(rn);
1306 				RT_LOCK(rt0);
1307 				RT_ADDREF(rt0);
1308 				RT_UNLOCK(rt0);
1309 				/*
1310 				 * A host route is already present, so
1311 				 * leave the flow-table entries as is.
1312 				 */
1313 				if (rt0->rt_flags & RTF_HOST) {
1314 					RTFREE(rt0);
1315 					rt0 = NULL;
1316 				} else if (!(flags & RTF_HOST) && netmask) {
1317 					mask = rt_mask(rt0);
1318 					len = mask->sa_len;
1319 					m = (u_char *)mask;
1320 					n = (u_char *)netmask;
1321 					while (len-- > 0) {
1322 						if (*n != *m)
1323 							break;
1324 						n++;
1325 						m++;
1326 					}
					/*
					 * NOTE(review): if all sa_len bytes
					 * are equal the loop exits with
					 * len == -1 and n, m one past the
					 * compared bytes, so "len == 0" is
					 * only true for a mismatch in the
					 * final byte and the dereferences
					 * read beyond the compared range --
					 * confirm this is intended.
					 */
1327 					if (len == 0 || (*n < *m)) {
1328 						RTFREE(rt0);
1329 						rt0 = NULL;
1330 					}
1331 				}
1332 			}
1333 #endif/* INET6 || INET */
1334 		}
1335 #endif /* FLOWTABLE */
1336 
1337 		/* XXX mtu manipulation will be done in rnh_addaddr -- itojun */
1338 		rn = rnh->rnh_addaddr(ndst, netmask, rnh, rt->rt_nodes);
1339 		/*
1340 		 * If it still failed to go into the tree,
1341 		 * then un-make it (this should be a function)
1342 		 */
1343 		if (rn == NULL) {
1344 			ifa_free(rt->rt_ifa);
1345 			Free(rt_key(rt));
1346 			RT_LOCK_DESTROY(rt);
1347 			uma_zfree(V_rtzone, rt);
1348 #ifdef FLOWTABLE
1349 			if (rt0 != NULL)
1350 				RTFREE(rt0);
1351 #endif
1352 			senderr(EEXIST);
1353 		}
1354 #ifdef FLOWTABLE
1355 		else if (rt0 != NULL) {
1356 			switch (dst->sa_family) {
1357 #ifdef INET6
1358 			case AF_INET6:
1359 				flowtable_route_flush(V_ip6_ft, rt0);
1360 				break;
1361 #endif
1362 #ifdef INET
1363 			case AF_INET:
1364 				flowtable_route_flush(V_ip_ft, rt0);
1365 				break;
1366 #endif
1367 			}
1368 			RTFREE(rt0);
1369 		}
1370 #endif
1371 
1372 		/*
1373 		 * If this protocol has something to add to this then
1374 		 * allow it to do that as well.
1375 		 */
1376 		if (ifa->ifa_rtrequest)
1377 			ifa->ifa_rtrequest(req, rt, info);
1378 
1379 		/*
1380 		 * actually return a resultant rtentry and
1381 		 * give the caller a single reference.
1382 		 */
1383 		if (ret_nrt) {
1384 			*ret_nrt = rt;
1385 			RT_ADDREF(rt);
1386 		}
1387 		RT_UNLOCK(rt);
1388 		break;
1389 	default:
1390 		error = EOPNOTSUPP;
1391 	}
	/* Common exit, reached on success as well as through senderr(). */
1392 bad:
1393 	if (needlock)
1394 		RADIX_NODE_HEAD_UNLOCK(rnh);
1395 	return (error);
1396 #undef senderr
1397 }
1398 
1399 #undef dst
1400 #undef gateway
1401 #undef netmask
1402 #undef ifaaddr
1403 #undef ifpaddr
1404 #undef flags
1405 
1406 int
1407 rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate)
1408 {
1409 	/* XXX dst may be overwritten, can we move this to below */
1410 	int dlen = SA_SIZE(dst), glen = SA_SIZE(gate);
1411 #ifdef INVARIANTS
1412 	struct radix_node_head *rnh;
1413 
1414 	rnh = rt_tables_get_rnh(rt->rt_fibnum, dst->sa_family);
1415 #endif
1416 
1417 	RT_LOCK_ASSERT(rt);
1418 	RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
1419 
1420 	/*
1421 	 * Prepare to store the gateway in rt->rt_gateway.
1422 	 * Both dst and gateway are stored one after the other in the same
1423 	 * malloc'd chunk. If we have room, we can reuse the old buffer,
1424 	 * rt_gateway already points to the right place.
1425 	 * Otherwise, malloc a new block and update the 'dst' address.
1426 	 */
1427 	if (rt->rt_gateway == NULL || glen > SA_SIZE(rt->rt_gateway)) {
1428 		caddr_t new;
1429 
1430 		R_Malloc(new, caddr_t, dlen + glen);
1431 		if (new == NULL)
1432 			return ENOBUFS;
1433 		/*
1434 		 * XXX note, we copy from *dst and not *rt_key(rt) because
1435 		 * rt_setgate() can be called to initialize a newly
1436 		 * allocated route entry, in which case rt_key(rt) == NULL
1437 		 * (and also rt->rt_gateway == NULL).
1438 		 * Free()/free() handle a NULL argument just fine.
1439 		 */
1440 		bcopy(dst, new, dlen);
1441 		Free(rt_key(rt));	/* free old block, if any */
1442 		rt_key(rt) = (struct sockaddr *)new;
1443 		rt->rt_gateway = (struct sockaddr *)(new + dlen);
1444 	}
1445 
1446 	/*
1447 	 * Copy the new gateway value into the memory chunk.
1448 	 */
1449 	bcopy(gate, rt->rt_gateway, glen);
1450 
1451 	return (0);
1452 }
1453 
/*
 * Copy sockaddr 'src' to 'dst' with its address bytes ANDed by 'netmask'.
 *
 * All three arguments are treated as raw byte strings whose first byte is
 * the length.  The first two bytes of src (sa_len and sa_family) are copied
 * unchanged.  The bytes after them are masked up to the shorter of src's
 * and netmask's length; any remainder up to src's length is zeroed in dst.
 * Two bytes are always written, so dst needs room for at least that much.
 */
void
rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, struct sockaddr *netmask)
{
	unsigned char *from = (unsigned char *)src;
	unsigned char *to = (unsigned char *)dst;
	unsigned char *mask = (unsigned char *)netmask;
	/* Both limits are read before anything is written to dst. */
	unsigned char *dst_end = to + from[0];
	unsigned char *mask_end = to + mask[0];

	/* Never mask beyond what src says it contains. */
	if (mask_end > dst_end)
		mask_end = dst_end;

	/* sa_len and sa_family are taken verbatim. */
	to[0] = from[0];
	to[1] = from[1];
	to += 2;
	from += 2;
	mask += 2;

	for (; to < mask_end; to++, from++, mask++)
		*to = *from & *mask;

	/* The mask ran out first: the rest of the address is zero. */
	if (to < dst_end)
		bzero(to, dst_end - to);
}
1472 
1473 /*
1474  * Set up a routing table entry, normally
1475  * for an interface.
1476  */
1477 #define _SOCKADDR_TMPSIZE 128 /* Not too big.. kernel stack size is limited */
/*
 * rtinit1: add or delete the route belonging to interface address 'ifa'.
 *
 *   ifa    - interface address.  With RTF_HOST the route key is
 *            ifa->ifa_dstaddr and no netmask is used; otherwise the key is
 *            ifa->ifa_addr with ifa->ifa_netmask.
 *   cmd    - request passed on to rtrequest1_fib(); RTM_ADD and RTM_DELETE
 *            get special handling here.
 *   flags  - RTF_* flags.  The ifa's own flags (minus IFA_RTSELF) and
 *            RTF_PINNED are OR-ed in before the request is issued.
 *   fibnum - one FIB, or RT_ALL_FIBS.  Address families other than
 *            AF_INET/AF_INET6 always use RT_DEFAULT_FIB.  With RT_ALL_FIBS,
 *            an RTM_ADD while rt_add_addr_allfibs is 0 touches only the
 *            current process's FIB; everything else walks all FIBs.
 *
 * Returns EINVAL if the key has sa_len 0.  For RTM_DELETE: 0 if the route
 * was removed from at least one FIB, else EHOSTUNREACH (RTF_HOST) or
 * ENETUNREACH.  For other commands: the error of the last FIB that
 * failed, or 0 if none did.
 */
1478 static inline  int
1479 rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum)
1480 {
1481 	struct sockaddr *dst;
1482 	struct sockaddr *netmask;
1483 	struct rtentry *rt = NULL;
1484 	struct rt_addrinfo info;
1485 	int error = 0;
1486 	int startfib, endfib;
1487 	char tempbuf[_SOCKADDR_TMPSIZE];
	/* didwork: some FIB succeeded; a_failure: last non-zero error seen. */
1488 	int didwork = 0;
1489 	int a_failure = 0;
	/* Placeholder AF_LINK gateway used for RTM_ADD requests (see below). */
1490 	static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK};
1491 	struct radix_node_head *rnh;
1492 
1493 	if (flags & RTF_HOST) {
1494 		dst = ifa->ifa_dstaddr;
1495 		netmask = NULL;
1496 	} else {
1497 		dst = ifa->ifa_addr;
1498 		netmask = ifa->ifa_netmask;
1499 	}
1500 	if (dst->sa_len == 0)
1501 		return(EINVAL);
1502 	switch (dst->sa_family) {
1503 	case AF_INET6:
1504 	case AF_INET:
1505 		/* We support multiple FIBs. */
1506 		break;
1507 	default:
1508 		fibnum = RT_DEFAULT_FIB;
1509 		break;
1510 	}
	/* Work out the inclusive range of FIBs to visit. */
1511 	if (fibnum == RT_ALL_FIBS) {
1512 		if (rt_add_addr_allfibs == 0 && cmd == (int)RTM_ADD) {
1513 			startfib = endfib = curthread->td_proc->p_fibnum;
1514 		} else {
1515 			startfib = 0;
1516 			endfib = rt_numfibs - 1;
1517 		}
1518 	} else {
1519 		KASSERT((fibnum < rt_numfibs), ("rtinit1: bad fibnum"));
1520 		startfib = fibnum;
1521 		endfib = fibnum;
1522 	}
1523 
1524 	/*
1525 	 * If it's a delete, check that if it exists,
1526 	 * it's on the correct interface or we might scrub
1527 	 * a route to another ifa which would
1528 	 * be confusing at best and possibly worse.
1529 	 */
1530 	if (cmd == RTM_DELETE) {
1531 		/*
1532 		 * It's a delete, so it should already exist..
1533 		 * If it's a net, mask off the host bits
1534 		 * (Assuming we have a mask)
1535 		 * XXX this is kinda inet specific..
1536 		 */
		/*
		 * NOTE(review): rt_maskedcopy() writes dst->sa_len bytes into
		 * tempbuf, which holds _SOCKADDR_TMPSIZE bytes; no check that
		 * sa_len fits is visible here -- confirm callers guarantee it.
		 */
1537 		if (netmask != NULL) {
1538 			rt_maskedcopy(dst, (struct sockaddr *)tempbuf, netmask);
1539 			dst = (struct sockaddr *)tempbuf;
1540 		}
1541 	}
1542 	/*
1543 	 * Now go through all the requested tables (fibs) and do the
1544 	 * requested action. Realistically, this will either be fib 0
1545 	 * for protocols that don't do multiple tables or all the
1546 	 * tables for those that do.
1547 	 */
1548 	for ( fibnum = startfib; fibnum <= endfib; fibnum++) {
1549 		if (cmd == RTM_DELETE) {
1550 			struct radix_node *rn;
1551 			/*
1552 			 * Look up an rtentry that is in the routing tree and
1553 			 * contains the correct info.
1554 			 */
1555 			rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
1556 			if (rnh == NULL)
1557 				/* this table doesn't exist but others might */
1558 				continue;
1559 			RADIX_NODE_HEAD_RLOCK(rnh);
1560 			rn = rnh->rnh_lookup(dst, netmask, rnh);
1561 #ifdef RADIX_MPATH
			/*
			 * NOTE(review): the error value set in this block is
			 * overwritten unconditionally by the assignment right
			 * after the #endif, and rt is overwritten later by
			 * rtrequest1_fib() -- confirm whether the multipath
			 * match is meant to influence the outcome.
			 */
1562 			if (rn_mpath_capable(rnh)) {
1563 
1564 				if (rn == NULL)
1565 					error = ESRCH;
1566 				else {
1567 					rt = RNTORT(rn);
1568 					/*
1569 					 * for interface route the
1570 					 * rt->rt_gateway is sockaddr_intf
1571 					 * for cloning ARP entries, so
1572 					 * rt_mpath_matchgate must use the
1573 					 * interface address
1574 					 */
1575 					rt = rt_mpath_matchgate(rt,
1576 					    ifa->ifa_addr);
1577 					if (rt == NULL)
1578 						error = ESRCH;
1579 				}
1580 			}
1581 #endif
			/*
			 * error is used as a boolean here: non-zero means the
			 * route is absent, is a root node, or belongs to a
			 * different ifa, so this FIB is skipped.
			 */
1582 			error = (rn == NULL ||
1583 			    (rn->rn_flags & RNF_ROOT) ||
1584 			    RNTORT(rn)->rt_ifa != ifa);
1585 			RADIX_NODE_HEAD_RUNLOCK(rnh);
1586 			if (error) {
1587 				/* this is only an error if bad on ALL tables */
1588 				continue;
1589 			}
1590 		}
1591 		/*
1592 		 * Do the actual request
1593 		 */
1594 		bzero((caddr_t)&info, sizeof(info));
1595 		info.rti_ifa = ifa;
1596 		info.rti_flags = flags |
1597 		    (ifa->ifa_flags & ~IFA_RTSELF) | RTF_PINNED;
1598 		info.rti_info[RTAX_DST] = dst;
1599 		/*
1600 		 * doing this for compatibility reasons
1601 		 */
1602 		if (cmd == RTM_ADD)
1603 			info.rti_info[RTAX_GATEWAY] =
1604 			    (struct sockaddr *)&null_sdl;
1605 		else
1606 			info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr;
1607 		info.rti_info[RTAX_NETMASK] = netmask;
1608 		error = rtrequest1_fib(cmd, &info, &rt, fibnum);
1609 
1610 		if ((error == EEXIST) && (cmd == RTM_ADD)) {
1611 			/*
1612 			 * Interface route addition failed.
1613 			 * Atomically delete current prefix generating
1614 			 * RTM_DELETE message, and retry adding
1615 			 * interface prefix.
1616 			 */
1617 			rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
1618 			RADIX_NODE_HEAD_LOCK(rnh);
1619 
			/*
			 * The head lock is held across both requests, so each
			 * one is issued with RTF_RNH_LOCKED.  The delete does
			 * not carry RTF_PINNED.
			 */
1620 			/* Delete old prefix */
1621 			info.rti_ifa = NULL;
1622 			info.rti_flags = RTF_RNH_LOCKED;
1623 
1624 			error = rtrequest1_fib(RTM_DELETE, &info, NULL, fibnum);
1625 			if (error == 0) {
1626 				info.rti_ifa = ifa;
1627 				info.rti_flags = flags | RTF_RNH_LOCKED |
1628 				    (ifa->ifa_flags & ~IFA_RTSELF) | RTF_PINNED;
1629 				error = rtrequest1_fib(cmd, &info, &rt, fibnum);
1630 			}
1631 
1632 			RADIX_NODE_HEAD_UNLOCK(rnh);
1633 		}
1634 
1635 
1636 		if (error == 0 && rt != NULL) {
1637 			/*
1638 			 * notify any listening routing agents of the change
1639 			 */
1640 			RT_LOCK(rt);
1641 #ifdef RADIX_MPATH
1642 			/*
1643 			 * in case address alias finds the first address
1644 			 * e.g. ifconfig bge0 192.0.2.246/24
1645 			 * e.g. ifconfig bge0 192.0.2.247/24
1646 			 * the address set in the route is 192.0.2.246
1647 			 * so we need to replace it with 192.0.2.247
1648 			 */
1649 			if (memcmp(rt->rt_ifa->ifa_addr,
1650 			    ifa->ifa_addr, ifa->ifa_addr->sa_len)) {
1651 				ifa_free(rt->rt_ifa);
1652 				ifa_ref(ifa);
1653 				rt->rt_ifp = ifa->ifa_ifp;
1654 				rt->rt_ifa = ifa;
1655 			}
1656 #endif
1657 			/*
1658 			 * doing this for compatibility reasons
1659 			 */
			/*
			 * Fill in the interface type and index of the AF_LINK
			 * gateway that was passed as null_sdl for the add.
			 */
1660 			if (cmd == RTM_ADD) {
1661 			    ((struct sockaddr_dl *)rt->rt_gateway)->sdl_type  =
1662 				rt->rt_ifp->if_type;
1663 			    ((struct sockaddr_dl *)rt->rt_gateway)->sdl_index =
1664 				rt->rt_ifp->if_index;
1665 			}
			/*
			 * Hold an extra reference while the rt lock is dropped
			 * around the notification call, then give it back.
			 */
1666 			RT_ADDREF(rt);
1667 			RT_UNLOCK(rt);
1668 			rt_newaddrmsg_fib(cmd, ifa, error, rt, fibnum);
1669 			RT_LOCK(rt);
1670 			RT_REMREF(rt);
1671 			if (cmd == RTM_DELETE) {
1672 				/*
1673 				 * If we are deleting, and we found an entry,
1674 				 * then it's been removed from the tree..
1675 				 * now throw it away.
1676 				 */
1677 				RTFREE_LOCKED(rt);
1678 			} else {
1679 				if (cmd == RTM_ADD) {
1680 					/*
1681 					 * We just wanted to add it..
1682 					 * we don't actually need a reference.
1683 					 */
1684 					RT_REMREF(rt);
1685 				}
1686 				RT_UNLOCK(rt);
1687 			}
1688 			didwork = 1;
1689 		}
1690 		if (error)
1691 			a_failure = error;
1692 	}
1693 	if (cmd == RTM_DELETE) {
1694 		if (didwork) {
1695 			error = 0;
1696 		} else {
1697 			/* we only give an error if it wasn't in any table */
1698 			error = ((flags & RTF_HOST) ?
1699 			    EHOSTUNREACH : ENETUNREACH);
1700 		}
1701 	} else {
1702 		if (a_failure) {
1703 			/* return an error if any of them failed */
1704 			error = a_failure;
1705 		}
1706 	}
1707 	return (error);
1708 }
1709 
1710 #ifndef BURN_BRIDGES
/*
 * rtinit_fib: run rtinit1() for 'ifa' with the FIB argument fixed to
 * RT_ALL_FIBS, regardless of the address family, and return its result.
 * Only compiled when BURN_BRIDGES is not defined.
 */
1711 /* special one for inet internal use. may not use. */
1712 int
1713 rtinit_fib(struct ifaddr *ifa, int cmd, int flags)
1714 {
1715 	return (rtinit1(ifa, cmd, flags, RT_ALL_FIBS));
1716 }
1717 #endif
1718 
1719 /*
1720  * Set up a routing table entry, normally
1721  * for an interface.
1722  */
1723 int
1724 rtinit(struct ifaddr *ifa, int cmd, int flags)
1725 {
1726 	struct sockaddr *dst;
1727 	int fib = RT_DEFAULT_FIB;
1728 
1729 	if (flags & RTF_HOST) {
1730 		dst = ifa->ifa_dstaddr;
1731 	} else {
1732 		dst = ifa->ifa_addr;
1733 	}
1734 
1735 	switch (dst->sa_family) {
1736 	case AF_INET6:
1737 	case AF_INET:
1738 		/* We do support multiple FIBs. */
1739 		fib = RT_ALL_FIBS;
1740 		break;
1741 	}
1742 	return (rtinit1(ifa, cmd, flags, fib));
1743 }
1744 
1745 /*
1746  * Announce interface address arrival/withdraw
1747  * Returns 0 on success.
1748  */
1749 int
1750 rt_addrmsg(int cmd, struct ifaddr *ifa, int fibnum)
1751 {
1752 
1753 	KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE,
1754 		("unexpected cmd %u", cmd));
1755 
1756 	if (fibnum != RT_ALL_FIBS) {
1757 		KASSERT(fibnum >= 0 && fibnum < rt_numfibs, ("%s: "
1758 		    "fibnum out of range 0 <= %d < %d", __func__,
1759 		     fibnum, rt_numfibs));
1760 	}
1761 
1762 	return (rtsock_addrmsg(cmd, ifa, fibnum));
1763 }
1764 
1765 
1766 /*
1767  * Announce route addition/removal
1768  * Users of this function MUST validate input data BEFORE calling.
1769  * However we have to be able to handle invalid data:
1770  * if some userland app sends us "invalid" route message (invalid mask,
1771  * no dst, wrokg address families, etc...) we need to pass it back
1772  * to app (and any other rtsock consumers) with rtm_errno field set to
1773  * non-zero value.
1774  * Returns 0 on success.
1775  */
1776 int
1777 rt_routemsg(int cmd, struct ifnet *ifp, int error, struct rtentry *rt,
1778     int fibnum)
1779 {
1780 
1781 	KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE,
1782 		("unexpected cmd %u", cmd));
1783 
1784 	if (fibnum != RT_ALL_FIBS) {
1785 		KASSERT(fibnum >= 0 && fibnum < rt_numfibs, ("%s: "
1786 		    "fibnum out of range 0 <= %d < %d", __func__,
1787 		     fibnum, rt_numfibs));
1788 	}
1789 
1790 	KASSERT(rt_key(rt) != NULL, (":%s: rt_key must be supplied", __func__));
1791 
1792 	return (rtsock_routemsg(cmd, ifp, error, rt, fibnum));
1793 }
1794 
/*
 * rt_newaddrmsg: convenience form of rt_newaddrmsg_fib() that always
 * passes RT_ALL_FIBS as the FIB argument.
 */
1795 void
1796 rt_newaddrmsg(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt)
1797 {
1798 
1799 	rt_newaddrmsg_fib(cmd, ifa, error, rt, RT_ALL_FIBS);
1800 }
1801 
1802 /*
1803  * This is called to generate messages from the routing socket
1804  * indicating a network interface has had addresses associated with it.
1805  */
1806 void
1807 rt_newaddrmsg_fib(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt,
1808     int fibnum)
1809 {
1810 
1811 	KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE,
1812 		("unexpected cmd %u", cmd));
1813 	if (fibnum != RT_ALL_FIBS) {
1814 		KASSERT(fibnum >= 0 && fibnum < rt_numfibs, ("%s: "
1815 		    "fibnum out of range 0 <= %d < %d", __func__,
1816 		     fibnum, rt_numfibs));
1817 	}
1818 
1819 #if defined(INET) || defined(INET6)
1820 #ifdef SCTP
1821 	/*
1822 	 * notify the SCTP stack
1823 	 * this will only get called when an address is added/deleted
1824 	 * XXX pass the ifaddr struct instead if ifa->ifa_addr...
1825 	 */
1826 	sctp_addr_change(ifa, cmd);
1827 #endif /* SCTP */
1828 #endif
1829 	if (cmd == RTM_ADD) {
1830 		rt_addrmsg(cmd, ifa, fibnum);
1831 		if (rt != NULL)
1832 			rt_routemsg(cmd, ifa->ifa_ifp, error, rt, fibnum);
1833 	} else {
1834 		if (rt != NULL)
1835 			rt_routemsg(cmd, ifa->ifa_ifp, error, rt, fibnum);
1836 		rt_addrmsg(cmd, ifa, fibnum);
1837 	}
1838 }
1839 
1840