xref: /titanic_41/usr/src/uts/common/inet/ip/ip_rts.c (revision fd9cb95cbb2f626355a60efb9d02c5f0a33c10e6)
1 /*
2  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
3  * Use is subject to license terms.
4  */
5 
6 /*
7  * Copyright (c) 1988, 1991, 1993
8  *	The Regents of the University of California.  All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)rtsock.c	8.6 (Berkeley) 2/11/95
39  */
40 
41 #pragma ident	"%Z%%M%	%I%	%E% SMI"
42 
43 /*
44  * This file contains routines that process routing socket requests.
45  */
46 
47 #include <sys/types.h>
48 #include <sys/stream.h>
49 #include <sys/stropts.h>
50 #include <sys/strlog.h>
51 #include <sys/dlpi.h>
52 #include <sys/ddi.h>
53 #include <sys/cmn_err.h>
54 #include <sys/debug.h>
55 #include <sys/policy.h>
56 #include <sys/zone.h>
57 
58 #include <sys/systm.h>
59 #include <sys/param.h>
60 #include <sys/socket.h>
61 #define	_SUN_TPI_VERSION	2
62 #include <sys/tihdr.h>
63 #include <sys/strsun.h>
64 #include <net/if.h>
65 #include <net/route.h>
66 #include <netinet/in.h>
67 #include <net/if_dl.h>
68 #include <netinet/ip6.h>
69 
70 #include <inet/common.h>
71 #include <inet/mi.h>
72 #include <inet/ip.h>
73 #include <inet/ip6.h>
74 #include <inet/ip_if.h>
75 #include <inet/ip_ire.h>
76 #include <inet/ip_rts.h>
77 #include <inet/ip_multi.h>
78 
79 #include <inet/ipclassifier.h>
80 
/*
 * Size of a routing message of the given type: the header size reported by
 * rts_header_msg_size() for that type plus the data size reported by
 * rts_data_msg_size() for the given address bitmask and address family.
 */
#define	RTS_MSG_SIZE(type, rtm_addrs, af) \
	(rts_header_msg_size(type) + rts_data_msg_size(rtm_addrs, af))
83 
84 static size_t	rts_copyfromsockaddr(struct sockaddr *sa, in6_addr_t *addrp);
85 static void	rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst,
86     ipaddr_t mask, ipaddr_t gateway, ipaddr_t src_addr, ipaddr_t brd_addr,
87     ipaddr_t author, ipif_t *ipif, mblk_t *mp);
88 static int	rts_getaddrs(rt_msghdr_t *rtm, in6_addr_t *dst_addrp,
89     in6_addr_t *gw_addrp, in6_addr_t *net_maskp, in6_addr_t *authorp,
90     in6_addr_t *if_addrp, in6_addr_t *src_addrp, ushort_t *indexp,
91     ushort_t *src_indexp, sa_family_t *afp);
92 static void	rts_getifdata(if_data_t *if_data, ipif_t *ipif);
93 static int	rts_getmetrics(ire_t *ire, rt_metrics_t *metrics);
94 static mblk_t	*rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *sire,
95     sa_family_t af);
96 static void	rts_setmetrics(ire_t *ire, uint_t which, rt_metrics_t *metrics);
97 static void	ip_rts_request_retry(ipsq_t *, queue_t *q, mblk_t *mp, void *);
98 
99 /*
100  * Send the ack to all the routing queues.  In case of the originating queue,
101  * send it only if the loopback is set.
102  *
103  * Messages are sent upstream only on routing sockets that did not specify an
104  * address family when they were created or when the address family matches the
105  * one specified by the caller.
106  *
107  */
108 void
109 rts_queue_input(mblk_t *mp, queue_t *q, sa_family_t af)
110 {
111 	mblk_t	*mp1;
112 	int	checkqfull;
113 	conn_t 	*connp, *next_connp;
114 
115 	mutex_enter(&rts_clients.connf_lock);
116 	connp = rts_clients.connf_head;
117 
118 	while (connp != NULL) {
119 		/*
120 		 * If there was a family specified when this routing socket was
121 		 * created and it doesn't match the family of the message to
122 		 * copy, then continue.
123 		 */
124 		if ((connp->conn_proto != AF_UNSPEC) &&
125 		    (connp->conn_proto != af)) {
126 			connp = connp->conn_next;
127 			continue;
128 		}
129 		/*
130 		 * For the originating queue, we only copy the message upstream
131 		 * if loopback is set.  For others reading on the routing
132 		 * socket, we check if there is room upstream for a copy of the
133 		 * message.
134 		 */
135 		if ((q != NULL) && (CONNP_TO_RQ(connp) == RD(q))) {
136 			if (connp->conn_loopback == 0) {
137 				connp = connp->conn_next;
138 				continue;
139 			}
140 			checkqfull = B_FALSE;
141 		} else {
142 			checkqfull = B_TRUE;
143 		}
144 		CONN_INC_REF(connp);
145 		mutex_exit(&rts_clients.connf_lock);
146 		if (!checkqfull || canputnext(CONNP_TO_RQ(connp))) {
147 			mp1 = dupmsg(mp);
148 			if (mp1 == NULL)
149 				mp1 = copymsg(mp);
150 			if (mp1 != NULL)
151 				putnext(CONNP_TO_RQ(connp), mp1);
152 		}
153 
154 		mutex_enter(&rts_clients.connf_lock);
155 		/* Follow the next pointer before releasing the conn. */
156 		next_connp = connp->conn_next;
157 		CONN_DEC_REF(connp);
158 		connp = next_connp;
159 	}
160 	mutex_exit(&rts_clients.connf_lock);
161 	freemsg(mp);
162 }
163 
164 /*
165  * Takes an ire and sends an ack to all the routing sockets. This
166  * routine is used
167  * - when a route is created/deleted through the ioctl interface.
168  * - when ire_expire deletes a stale redirect
169  */
170 void
171 ip_rts_rtmsg(int type, ire_t *ire, int error)
172 {
173 	mblk_t		*mp;
174 	rt_msghdr_t	*rtm;
175 	int		rtm_addrs = (RTA_DST | RTA_NETMASK | RTA_GATEWAY);
176 	sa_family_t	af;
177 	in6_addr_t	gw_addr_v6;
178 
179 	if (ire == NULL)
180 		return;
181 	ASSERT(ire->ire_ipversion == IPV4_VERSION ||
182 	    ire->ire_ipversion == IPV6_VERSION);
183 
184 	if (ire->ire_flags & RTF_SETSRC)
185 		rtm_addrs |= RTA_SRC;
186 
187 	switch (ire->ire_ipversion) {
188 	case IPV4_VERSION:
189 		af = AF_INET;
190 		mp = rts_alloc_msg(type, rtm_addrs, af);
191 		if (mp == NULL)
192 			return;
193 		rts_fill_msg(type, rtm_addrs, ire->ire_addr, ire->ire_mask,
194 		    ire->ire_gateway_addr, ire->ire_src_addr, 0, 0, NULL, mp);
195 		break;
196 	case IPV6_VERSION:
197 		af = AF_INET6;
198 		mp = rts_alloc_msg(type, rtm_addrs, af);
199 		if (mp == NULL)
200 			return;
201 		mutex_enter(&ire->ire_lock);
202 		gw_addr_v6 = ire->ire_gateway_addr_v6;
203 		mutex_exit(&ire->ire_lock);
204 		rts_fill_msg_v6(type, rtm_addrs, &ire->ire_addr_v6,
205 		    &ire->ire_mask_v6, &gw_addr_v6,
206 		    &ire->ire_src_addr_v6, &ipv6_all_zeros, &ipv6_all_zeros,
207 		    NULL, mp);
208 		break;
209 	}
210 	rtm = (rt_msghdr_t *)mp->b_rptr;
211 	mp->b_wptr = (uchar_t *)&mp->b_rptr[rtm->rtm_msglen];
212 	rtm->rtm_addrs = rtm_addrs;
213 	rtm->rtm_flags = ire->ire_flags;
214 	if (error != 0)
215 		rtm->rtm_errno = error;
216 	else
217 		rtm->rtm_flags |= RTF_DONE;
218 	rts_queue_input(mp, NULL, af);
219 }
220 
221 /* ARGSUSED */
222 static void
223 ip_rts_request_retry(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, void *dummy)
224 {
225 	(void) ip_rts_request(q, mp, DB_CRED(mp));
226 }
227 
228 /*
229  * Processes requests received on a routing socket. It extracts all the
230  * arguments and calls the appropriate function to process the request.
231  *
232  * RTA_SRC bit flag requests are sent by mipagent and 'route -setsrc'.
233  * RTA_SRCIFP bit flag requests are sent by mipagent only.
234  *
235  * In general, this function does not consume the message supplied but rather
236  * sends the message upstream with an appropriate UNIX errno.
237  *
238  * We may need to restart this operation if the ipif cannot be looked up
239  * due to an exclusive operation that is currently in progress. The restart
240  * entry point is ip_rts_request_retry. While the request is enqueud in the
241  * ipsq the ioctl could be aborted and the conn close. To ensure that we don't
242  * have stale conn pointers, ip_wput_ioctl does a conn refhold. This is
243  * released at the completion of the rts ioctl at the end of this function
244  * by calling CONN_OPER_PENDING_DONE or when the ioctl is aborted and
245  * conn close occurs in conn_ioctl_cleanup.
246  */
247 int
248 ip_rts_request(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
249 {
250 	rt_msghdr_t	*rtm = NULL;
251 	in6_addr_t	dst_addr_v6;
252 	in6_addr_t	src_addr_v6;
253 	in6_addr_t	gw_addr_v6;
254 	in6_addr_t	net_mask_v6;
255 	in6_addr_t	author_v6;
256 	in6_addr_t	if_addr_v6;
257 	mblk_t		*mp1, *ioc_mp = mp;
258 	ire_t		*ire = NULL;
259 	ire_t		*sire = NULL;
260 	int		error = 0;
261 	int		match_flags = MATCH_IRE_DSTONLY;
262 	int		found_addrs;
263 	sa_family_t	af;
264 	ipaddr_t	dst_addr;
265 	ipaddr_t	gw_addr;
266 	ipaddr_t	src_addr;
267 	ipaddr_t	net_mask;
268 	ushort_t	index;
269 	ushort_t	src_index;
270 	ipif_t		*ipif = NULL;
271 	ipif_t		*src_ipif = NULL;
272 	ipif_t		*tmp_ipif = NULL;
273 	IOCP		iocp = (IOCP)mp->b_rptr;
274 	conn_t		*connp;
275 
276 	ip1dbg(("ip_rts_request: mp is %x\n", DB_TYPE(mp)));
277 
278 	ASSERT(CONN_Q(q));
279 	connp = Q_TO_CONN(q);
280 
281 	ASSERT(mp->b_cont != NULL);
282 	/* ioc_mp holds mp */
283 	mp = mp->b_cont;
284 
285 	/*
286 	 * The Routing Socket data starts on
287 	 * next block. If there is no next block
288 	 * this is an indication from routing module
289 	 * that it is a routing socket stream queue.
290 	 */
291 	if (mp->b_cont != NULL) {
292 		mp1 = dupmsg(mp->b_cont);
293 		if (mp1 == NULL) {
294 			freemsg(mp);
295 			error  = ENOBUFS;
296 			goto done;
297 		}
298 		mp = mp1;
299 	} else {
300 		/*
301 		 * This is a message from RTS module
302 		 * indicating that this is a Routing Socket
303 		 * Stream. Insert this conn_t in routing
304 		 * socket client list.
305 		 */
306 
307 		connp->conn_loopback = 1;
308 		ipcl_hash_insert_wildcard(&rts_clients, connp);
309 
310 		goto done;
311 	}
312 	if (mp->b_cont != NULL && !pullupmsg(mp, -1)) {
313 		freemsg(mp);
314 		error =  EINVAL;
315 		goto done;
316 	}
317 	if ((mp->b_wptr - mp->b_rptr) < sizeof (rt_msghdr_t)) {
318 		freemsg(mp);
319 		error = EINVAL;
320 		goto done;
321 	}
322 
323 	/*
324 	 * Check the routing message for basic consistency including the
325 	 * version number and that the number of octets written is the same
326 	 * as specified by the rtm_msglen field.
327 	 *
328 	 * At this point, an error can be delivered back via rtm_errno.
329 	 */
330 	rtm = (rt_msghdr_t *)mp->b_rptr;
331 	if ((mp->b_wptr - mp->b_rptr) != rtm->rtm_msglen) {
332 		error = EINVAL;
333 		goto done;
334 	}
335 	if (rtm->rtm_version != RTM_VERSION) {
336 		error = EPROTONOSUPPORT;
337 		goto done;
338 	}
339 
340 	/* Only allow RTM_GET or RTM_RESOLVE for unprivileged process */
341 	if (rtm->rtm_type != RTM_GET &&
342 	    rtm->rtm_type != RTM_RESOLVE &&
343 	    (ioc_cr == NULL ||
344 	    secpolicy_net_config(ioc_cr, B_FALSE) != 0)) {
345 		error = EPERM;
346 		goto done;
347 	}
348 
349 	found_addrs = rts_getaddrs(rtm, &dst_addr_v6, &gw_addr_v6, &net_mask_v6,
350 	    &author_v6, &if_addr_v6, &src_addr_v6, &index, &src_index, &af);
351 	if ((found_addrs & RTA_DST) == 0) {
352 		error = EINVAL;
353 		goto done;
354 	}
355 
356 	/*
357 	 * Based on the address family of the destination address, determine
358 	 * the destination, gateway and netmask and return the appropriate error
359 	 * if an unknown address family was specified (following the errno
360 	 * values that 4.4BSD-Lite2 returns.)
361 	 */
362 	switch (af) {
363 	case AF_INET:
364 		/*
365 		 * RTA_SRCIFP is supported for interface route only.
366 		 * Thus a gateway route with srcifindex is rejected,
367 		 * except if it's a request to add reverse tunnel
368 		 * route.
369 		 */
370 		if ((rtm->rtm_flags & RTF_GATEWAY) &&
371 		    (found_addrs & RTA_SRCIFP) &&
372 		    !(found_addrs & RTA_SRC)) {
373 			error = EINVAL;
374 			goto done;
375 		}
376 		IN6_V4MAPPED_TO_IPADDR(&dst_addr_v6, dst_addr);
377 		IN6_V4MAPPED_TO_IPADDR(&src_addr_v6, src_addr);
378 		IN6_V4MAPPED_TO_IPADDR(&gw_addr_v6, gw_addr);
379 		if (((found_addrs & RTA_NETMASK) == 0) ||
380 		    (rtm->rtm_flags & RTF_HOST))
381 			net_mask = IP_HOST_MASK;
382 		else
383 			IN6_V4MAPPED_TO_IPADDR(&net_mask_v6, net_mask);
384 		break;
385 	case AF_INET6:
386 		/*
387 		 * RTA_SRCIFP is not a valid flag for IPv6 routes.
388 		 */
389 		if (found_addrs & RTA_SRCIFP) {
390 			error = EINVAL;
391 			goto done;
392 		}
393 		if (((found_addrs & RTA_NETMASK) == 0) ||
394 		    (rtm->rtm_flags & RTF_HOST))
395 			net_mask_v6 = ipv6_all_ones;
396 		break;
397 	default:
398 		/*
399 		 * These errno values are meant to be compatible with
400 		 * 4.4BSD-Lite2 for the given message types.
401 		 */
402 		switch (rtm->rtm_type) {
403 		case RTM_ADD:
404 		case RTM_DELETE:
405 			error = ESRCH;
406 			goto done;
407 		case RTM_GET:
408 		case RTM_CHANGE:
409 			error = EAFNOSUPPORT;
410 			goto done;
411 		default:
412 			error = EOPNOTSUPP;
413 			goto done;
414 		}
415 	}
416 
417 	/*
418 	 * At this point, the address family must be something known.
419 	 */
420 	ASSERT(af == AF_INET || af == AF_INET6);
421 
422 	if (index != 0) {
423 		ill_t   *ill;
424 
425 		/*
426 		 * IPC must be refheld somewhere in ip_wput_nondata or
427 		 * ip_wput_ioctl etc... and cleaned up if ioctl is killed.
428 		 * If ILL_CHANGING the request is queued in the ipsq.
429 		 */
430 		ill = ill_lookup_on_ifindex(index, af == AF_INET6,
431 		    CONNP_TO_WQ(connp), ioc_mp, ip_rts_request_retry, &error);
432 		if (ill == NULL) {
433 			if (error != EINPROGRESS)
434 				error = EINVAL;
435 			goto done;
436 		}
437 
438 		ipif = ipif_get_next_ipif(NULL, ill);
439 		ill_refrele(ill);
440 		/*
441 		 * If this is replacement ipif, prevent a route from
442 		 * being added.
443 		 */
444 		if (ipif != NULL && ipif->ipif_replace_zero) {
445 			error = ENETDOWN;
446 			goto done;
447 		}
448 		match_flags |= MATCH_IRE_ILL;
449 	}
450 
451 	/* RTA_SRCIFP is unsupported on AF_INET6. */
452 	if (af == AF_INET && src_index != 0) {
453 		ill_t   *ill;
454 
455 		/* If ILL_CHANGING the request is queued in the ipsq. */
456 		ill = ill_lookup_on_ifindex(src_index, B_FALSE,
457 		    CONNP_TO_WQ(connp), ioc_mp, ip_rts_request_retry, &error);
458 		if (ill == NULL) {
459 			if (error != EINPROGRESS)
460 				error = EINVAL;
461 			goto done;
462 		}
463 
464 		src_ipif = ipif_get_next_ipif(NULL, ill);
465 		ill_refrele(ill);
466 	}
467 	/*
468 	 * If a netmask was supplied in the message, then subsequent route
469 	 * lookups will attempt to match on the netmask as well.
470 	 */
471 	if ((found_addrs & RTA_NETMASK) != 0)
472 		match_flags |= MATCH_IRE_MASK;
473 
474 	switch (rtm->rtm_type) {
475 	case RTM_ADD:
476 		/* if we are adding a route, gateway is a must */
477 		if ((found_addrs & RTA_GATEWAY) == 0) {
478 			error = EINVAL;
479 			goto done;
480 		}
481 
482 		/* Multirouting does not support net routes. */
483 		if ((rtm->rtm_flags & (RTF_MULTIRT | RTF_HOST)) ==
484 		    RTF_MULTIRT) {
485 			error = EADDRNOTAVAIL;
486 			goto done;
487 		}
488 
489 		/*
490 		 * Multirouting and user-specified source addresses
491 		 * do not support interface based routing.
492 		 * Assigning a source address to an interface based
493 		 * route is achievable by plumbing a new ipif and
494 		 * setting up the interface route via this ipif,
495 		 * though.
496 		 */
497 		if (rtm->rtm_flags & (RTF_MULTIRT | RTF_SETSRC)) {
498 			if ((rtm->rtm_flags & RTF_GATEWAY) == 0) {
499 				error = EADDRNOTAVAIL;
500 				goto done;
501 			}
502 		}
503 
504 		switch (af) {
505 		case AF_INET:
506 			if (src_addr != INADDR_ANY) {
507 				/*
508 				 * If there is a source address, but
509 				 * no RTF_SETSRC modifier, setup a MobileIP
510 				 * reverse tunnel.
511 				 */
512 				if ((rtm->rtm_flags & RTF_SETSRC) == 0) {
513 					error = ip_mrtun_rt_add(src_addr,
514 					    rtm->rtm_flags, ipif,
515 					    src_ipif, &ire, CONNP_TO_WQ(connp),
516 					    ioc_mp, ip_rts_request_retry);
517 					break;
518 				}
519 				/*
520 				 * The RTF_SETSRC flag is present, check that
521 				 * the supplied src address is not the loopback
522 				 * address. This would produce martian packets.
523 				 */
524 				if (src_addr == htonl(INADDR_LOOPBACK)) {
525 					error = EINVAL;
526 					goto done;
527 				}
528 				/*
529 				 * Also check that the supplied address is a
530 				 * valid, local one.
531 				 */
532 				tmp_ipif = ipif_lookup_addr(src_addr, NULL,
533 				    ALL_ZONES, CONNP_TO_WQ(connp), ioc_mp,
534 				    ip_rts_request_retry, &error);
535 				if (tmp_ipif == NULL) {
536 					if (error != EINPROGRESS)
537 						error = EADDRNOTAVAIL;
538 					goto done;
539 				}
540 				if (!(tmp_ipif->ipif_flags & IPIF_UP) ||
541 				    (tmp_ipif->ipif_flags &
542 				    (IPIF_NOLOCAL | IPIF_ANYCAST))) {
543 					error = EINVAL;
544 					goto done;
545 				}
546 			} else {
547 				/*
548 				 * The RTF_SETSRC modifier must be associated
549 				 * to a non-null source address.
550 				 */
551 				if (rtm->rtm_flags & RTF_SETSRC) {
552 					error = EINVAL;
553 					goto done;
554 				}
555 			}
556 
557 			error = ip_rt_add(dst_addr, net_mask,
558 			    gw_addr, src_addr,
559 			    rtm->rtm_flags, ipif, src_ipif, &ire, B_FALSE,
560 			    CONNP_TO_WQ(connp), ioc_mp, ip_rts_request_retry);
561 			if (ipif != NULL)
562 				ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
563 			break;
564 		case AF_INET6:
565 			if (!IN6_IS_ADDR_UNSPECIFIED(&src_addr_v6)) {
566 				/*
567 				 * If there is a source address, but
568 				 * no RTF_SETSRC modifier, reject, as
569 				 * MobileIP IPv6 reverse tunnels are
570 				 * not supported.
571 				 */
572 				if ((rtm->rtm_flags & RTF_SETSRC) == 0) {
573 					error = EINVAL;
574 					goto done;
575 				}
576 				/*
577 				 * The RTF_SETSRC flag is present, check that
578 				 * the supplied src address is not the loopback
579 				 * address. This would produce martian packets.
580 				 */
581 				if (IN6_IS_ADDR_LOOPBACK(&src_addr_v6)) {
582 					error = EINVAL;
583 					goto done;
584 				}
585 				/*
586 				 * Also check that the supplied address is a
587 				 * valid, local one.
588 				 */
589 				tmp_ipif = ipif_lookup_addr_v6(&src_addr_v6,
590 				    NULL, ALL_ZONES, CONNP_TO_WQ(connp), ioc_mp,
591 				    ip_rts_request_retry, &error);
592 				if (tmp_ipif == NULL) {
593 					if (error != EINPROGRESS)
594 						error = EADDRNOTAVAIL;
595 					goto done;
596 				}
597 
598 				if (!(tmp_ipif->ipif_flags & IPIF_UP) ||
599 				    (tmp_ipif->ipif_flags &
600 				    (IPIF_NOLOCAL | IPIF_ANYCAST))) {
601 					error = EINVAL;
602 					goto done;
603 				}
604 
605 				error = ip_rt_add_v6(&dst_addr_v6, &net_mask_v6,
606 				    &gw_addr_v6, &src_addr_v6, rtm->rtm_flags,
607 				    ipif, &ire, CONNP_TO_WQ(connp), ioc_mp,
608 				    ip_rts_request_retry);
609 				break;
610 			}
611 			/*
612 			 * The RTF_SETSRC modifier must be associated
613 			 * to a non-null source address.
614 			 */
615 			if (rtm->rtm_flags & RTF_SETSRC) {
616 				error = EINVAL;
617 				goto done;
618 			}
619 			error = ip_rt_add_v6(&dst_addr_v6, &net_mask_v6,
620 			    &gw_addr_v6, NULL, rtm->rtm_flags,
621 			    ipif, &ire, CONNP_TO_WQ(connp), ioc_mp,
622 			    ip_rts_request_retry);
623 			if (ipif != NULL)
624 				ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
625 			break;
626 		}
627 		if (error != 0)
628 			goto done;
629 		ASSERT(ire != NULL);
630 		rts_setmetrics(ire, rtm->rtm_inits, &rtm->rtm_rmx);
631 		break;
632 	case RTM_DELETE:
633 		/* if we are deleting a route, gateway is a must */
634 		if ((found_addrs & RTA_GATEWAY) == 0) {
635 			error = EINVAL;
636 			goto done;
637 		}
638 		/*
639 		 * The RTF_SETSRC modifier does not make sense
640 		 * when deleting a route.
641 		 */
642 		if (rtm->rtm_flags & RTF_SETSRC) {
643 			error = EINVAL;
644 			goto done;
645 		}
646 
647 		switch (af) {
648 		case AF_INET:
649 			/*
650 			 * If there is a source address, delete
651 			 * a MobileIP reverse tunnel.
652 			 */
653 			if (src_addr != INADDR_ANY) {
654 				error = ip_mrtun_rt_delete(src_addr,
655 				    src_ipif);
656 				break;
657 			}
658 			error = ip_rt_delete(dst_addr, net_mask, gw_addr,
659 			    found_addrs, rtm->rtm_flags, ipif, src_ipif,
660 			    B_FALSE, CONNP_TO_WQ(connp), ioc_mp,
661 			    ip_rts_request_retry);
662 			break;
663 		case AF_INET6:
664 			error = ip_rt_delete_v6(&dst_addr_v6, &net_mask_v6,
665 			    &gw_addr_v6, found_addrs, rtm->rtm_flags, ipif,
666 			    CONNP_TO_WQ(connp), ioc_mp, ip_rts_request_retry);
667 			break;
668 		}
669 		break;
670 	case RTM_GET:
671 	case RTM_CHANGE:
672 		/*
673 		 * In the case of RTM_GET, the forwarding table should be
674 		 * searched recursively with default being matched if the
675 		 * specific route doesn't exist.  Also, if a gateway was
676 		 * specified then the gateway address must also be matched.
677 		 *
678 		 * In the case of RTM_CHANGE, the gateway address (if supplied)
679 		 * is the new gateway address so matching on the gateway address
680 		 * is not done.  This can lead to ambiguity when looking up the
681 		 * route to change as usually only the destination (and netmask,
682 		 * if supplied) is used for the lookup.  However if a RTA_IFP
683 		 * sockaddr is also supplied, it can disambiguate which route to
684 		 * change provided the ambigous routes are tied to distinct
685 		 * ill's (or interface indices).  If the routes are not tied to
686 		 * any particular interfaces (for example, with traditional
687 		 * gateway routes), then a RTA_IFP sockaddr will be of no use as
688 		 * it won't match any such routes.
689 		 * RTA_SRC is not supported for RTM_GET and RTM_CHANGE,
690 		 * except when RTM_CHANGE is combined to RTF_SETSRC.
691 		 */
692 		if (((found_addrs & RTA_SRC) != 0) &&
693 		    ((rtm->rtm_type == RTM_GET) ||
694 		    !(rtm->rtm_flags & RTF_SETSRC))) {
695 			error = EOPNOTSUPP;
696 			goto done;
697 		}
698 
699 		if (rtm->rtm_type == RTM_GET) {
700 			match_flags |=
701 			    (MATCH_IRE_DEFAULT | MATCH_IRE_RECURSIVE);
702 			if ((found_addrs & RTA_GATEWAY) != 0)
703 				match_flags |= MATCH_IRE_GW;
704 		}
705 		if (rtm->rtm_type == RTM_CHANGE) {
706 			if ((found_addrs & RTA_GATEWAY) &&
707 			    (rtm->rtm_flags & RTF_SETSRC)) {
708 				/*
709 				 * Do not want to change the gateway,
710 				 * but rather the source address.
711 				 */
712 				match_flags |= MATCH_IRE_GW;
713 			}
714 		}
715 
716 		/*
717 		 * If the netmask is all ones (either as supplied or as derived
718 		 * above), then first check for an IRE_LOOPBACK or
719 		 * IRE_LOCAL entry.
720 		 *
721 		 * If we didn't check for or find an IRE_LOOPBACK or IRE_LOCAL
722 		 * entry, then look in the forwarding table.
723 		 */
724 		switch (af) {
725 		case AF_INET:
726 			if (net_mask == IP_HOST_MASK) {
727 				ire = ire_ctable_lookup(dst_addr, gw_addr,
728 				    IRE_LOCAL | IRE_LOOPBACK, NULL, ALL_ZONES,
729 				    MATCH_IRE_TYPE | MATCH_IRE_GW);
730 			}
731 			if (ire == NULL) {
732 				ire = ire_ftable_lookup(dst_addr, net_mask,
733 				    gw_addr, 0, ipif, &sire, ALL_ZONES, 0,
734 				    match_flags);
735 			}
736 			break;
737 		case AF_INET6:
738 			if (IN6_ARE_ADDR_EQUAL(&net_mask_v6, &ipv6_all_ones)) {
739 				ire = ire_ctable_lookup_v6(&dst_addr_v6,
740 				    &gw_addr_v6, IRE_LOCAL | IRE_LOOPBACK, NULL,
741 				    ALL_ZONES, MATCH_IRE_TYPE | MATCH_IRE_GW);
742 			}
743 			if (ire == NULL) {
744 				ire = ire_ftable_lookup_v6(&dst_addr_v6,
745 				    &net_mask_v6, &gw_addr_v6, 0, ipif, &sire,
746 				    ALL_ZONES, 0, match_flags);
747 			}
748 			break;
749 		}
750 
751 		if (ire == NULL) {
752 			error = ESRCH;
753 			goto done;
754 		}
755 		/* we know the IRE before we come here */
756 		switch (rtm->rtm_type) {
757 		case RTM_GET:
758 			mp1 = rts_rtmget(mp, ire, sire, af);
759 			if (mp1 == NULL) {
760 				error = ENOBUFS;
761 				goto done;
762 			}
763 			freemsg(mp);
764 			mp = mp1;
765 			rtm = (rt_msghdr_t *)mp->b_rptr;
766 			break;
767 		case RTM_CHANGE:
768 			/*
769 			 * Do not allow to the multirouting state of a route
770 			 * to be changed. This aims to prevent undesirable
771 			 * stages where both multirt and non-multirt routes
772 			 * for the same destination are declared.
773 			 */
774 			if ((ire->ire_flags & RTF_MULTIRT) !=
775 			    (rtm->rtm_flags & RTF_MULTIRT)) {
776 				error = EINVAL;
777 				goto done;
778 			}
779 			/*
780 			 * Note that we do not need to do
781 			 * ire_flush_cache_*(IRE_FLUSH_ADD) as a change
782 			 * in metrics or gateway will not affect existing
783 			 * routes since it does not create a more specific
784 			 * route.
785 			 */
786 			switch (af) {
787 			case AF_INET:
788 				ire_flush_cache_v4(ire, IRE_FLUSH_DELETE);
789 				if ((found_addrs & RTA_GATEWAY) != 0 &&
790 				    (ire->ire_gateway_addr != gw_addr)) {
791 					ire->ire_gateway_addr = gw_addr;
792 				}
793 				if ((found_addrs & RTA_SRC) != 0 &&
794 				    (rtm->rtm_flags & RTF_SETSRC) != 0 &&
795 				    (ire->ire_src_addr != src_addr)) {
796 
797 					if (src_addr != INADDR_ANY) {
798 						/*
799 						 * The RTF_SETSRC flag is
800 						 * present, check that the
801 						 * supplied src address is not
802 						 * the loopback address. This
803 						 * would produce martian
804 						 * packets.
805 						 */
806 						if (src_addr ==
807 						    htonl(INADDR_LOOPBACK)) {
808 							error = EINVAL;
809 							goto done;
810 						}
811 						/*
812 						 * Also check that the the
813 						 * supplied addr is a valid
814 						 * local address.
815 						 */
816 						tmp_ipif = ipif_lookup_addr(
817 						    src_addr, NULL, ALL_ZONES,
818 						    CONNP_TO_WQ(connp), ioc_mp,
819 						    ip_rts_request_retry,
820 						    &error);
821 						if (tmp_ipif == NULL) {
822 							error = (error ==
823 							    EINPROGRESS) ?
824 							    error :
825 							    EADDRNOTAVAIL;
826 							goto done;
827 						}
828 
829 						if (!(tmp_ipif->ipif_flags &
830 						    IPIF_UP) ||
831 						    (tmp_ipif->ipif_flags &
832 						    (IPIF_NOLOCAL |
833 						    IPIF_ANYCAST))) {
834 							error = EINVAL;
835 							goto done;
836 						}
837 						ire->ire_flags |= RTF_SETSRC;
838 					} else {
839 						ire->ire_flags &= ~RTF_SETSRC;
840 					}
841 					ire->ire_src_addr = src_addr;
842 				}
843 				break;
844 			case AF_INET6:
845 				ire_flush_cache_v6(ire, IRE_FLUSH_DELETE);
846 				mutex_enter(&ire->ire_lock);
847 				if ((found_addrs & RTA_GATEWAY) != 0 &&
848 				    !IN6_ARE_ADDR_EQUAL(
849 				    &ire->ire_gateway_addr_v6, &gw_addr_v6)) {
850 					ire->ire_gateway_addr_v6 = gw_addr_v6;
851 				}
852 				if ((found_addrs & RTA_SRC) != 0 &&
853 				    (rtm->rtm_flags & RTF_SETSRC) != 0 &&
854 				    !IN6_ARE_ADDR_EQUAL(
855 					&ire->ire_src_addr_v6, &src_addr_v6)) {
856 
857 					if (!IN6_IS_ADDR_UNSPECIFIED(
858 					    &src_addr_v6)) {
859 						/*
860 						 * The RTF_SETSRC flag is
861 						 * present, check that the
862 						 * supplied src address is not
863 						 * the loopback address. This
864 						 * would produce martian
865 						 * packets.
866 						 */
867 						if (IN6_IS_ADDR_LOOPBACK(
868 						    &src_addr_v6)) {
869 							mutex_exit(
870 							    &ire->ire_lock);
871 							error = EINVAL;
872 							goto done;
873 						}
874 						/*
875 						 * Also check that the the
876 						 * supplied addr is a valid
877 						 * local address.
878 						 */
879 						tmp_ipif = ipif_lookup_addr_v6(
880 						    &src_addr_v6, NULL,
881 						    ALL_ZONES,
882 						    CONNP_TO_WQ(connp), ioc_mp,
883 						    ip_rts_request_retry,
884 						    &error);
885 						if (tmp_ipif == NULL) {
886 							mutex_exit(
887 							    &ire->ire_lock);
888 							error = (error ==
889 							    EINPROGRESS) ?
890 							    error :
891 							    EADDRNOTAVAIL;
892 							goto done;
893 						}
894 						if (!(tmp_ipif->ipif_flags &
895 						    IPIF_UP) ||
896 						    (tmp_ipif->ipif_flags &
897 						    (IPIF_NOLOCAL |
898 						    IPIF_ANYCAST))) {
899 							mutex_exit(
900 							    &ire->ire_lock);
901 							error = EINVAL;
902 							goto done;
903 						}
904 						ire->ire_flags |= RTF_SETSRC;
905 					} else {
906 						ire->ire_flags &= ~RTF_SETSRC;
907 					}
908 					ire->ire_src_addr_v6 = src_addr_v6;
909 				}
910 				mutex_exit(&ire->ire_lock);
911 				break;
912 			}
913 			rts_setmetrics(ire, rtm->rtm_inits, &rtm->rtm_rmx);
914 			break;
915 		}
916 		break;
917 	default:
918 		error = EOPNOTSUPP;
919 		break;
920 	}
921 done:
922 	if (ire != NULL)
923 		ire_refrele(ire);
924 	if (sire != NULL)
925 		ire_refrele(sire);
926 	if (ipif != NULL)
927 		ipif_refrele(ipif);
928 	if (src_ipif != NULL)
929 		ipif_refrele(src_ipif);
930 	if (tmp_ipif != NULL)
931 		ipif_refrele(tmp_ipif);
932 
933 	if (error == EINPROGRESS)
934 		return (error);
935 	if (rtm != NULL) {
936 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
937 		if (error != 0) {
938 			rtm->rtm_errno = error;
939 			/* Send error ACK */
940 			ip1dbg(("ip_rts_request: error %d\n", error));
941 		} else {
942 			rtm->rtm_flags |= RTF_DONE;
943 			/* OK ACK already set up by caller except this */
944 			ip2dbg(("ip_rts_request: OK ACK\n"));
945 		}
946 		rts_queue_input(mp, q, af);
947 	}
948 	iocp->ioc_error = error;
949 	ioc_mp->b_datap->db_type = M_IOCACK;
950 	if (iocp->ioc_error != 0)
951 		iocp->ioc_count = 0;
952 	qreply(q, ioc_mp);
953 	/* conn was refheld in ip_wput_ioctl. */
954 	CONN_OPER_PENDING_DONE(connp);
955 
956 	return (error);
957 }
958 
959 /*
960  * Build a reply to the RTM_GET request contained in the given message block
961  * using the retrieved IRE of the destination address, the parent IRE (if it
962  * exists) and the address family.
963  *
964  * Returns a pointer to a message block containing the reply if successful,
965  * otherwise NULL is returned.
966  */
967 mblk_t *
968 rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *sire, sa_family_t af)
969 {
970 	rt_msghdr_t	*rtm;
971 	rt_msghdr_t	*new_rtm;
972 	mblk_t		*new_mp;
973 	int		rtm_addrs;
974 	int		rtm_flags;
975 	in6_addr_t	gw_addr_v6;
976 
977 	ASSERT(ire->ire_ipif != NULL);
978 	rtm = (rt_msghdr_t *)mp->b_rptr;
979 
980 	/*
981 	 * Always return RTA_DST, RTA_GATEWAY and RTA_NETMASK.
982 	 *
983 	 * The 4.4BSD-Lite2 code (net/rtsock.c) returns both
984 	 * RTA_IFP and RTA_IFA if either is defined, and also
985 	 * returns RTA_BRD if the appropriate interface is
986 	 * point-to-point.
987 	 */
988 	rtm_addrs = (RTA_DST | RTA_GATEWAY | RTA_NETMASK);
989 	if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) {
990 		rtm_addrs |= (RTA_IFP | RTA_IFA);
991 		if (ire->ire_ipif->ipif_flags & IPIF_POINTOPOINT)
992 			rtm_addrs |= RTA_BRD;
993 	}
994 
995 	new_mp = rts_alloc_msg(RTM_GET, rtm_addrs, af);
996 	if (new_mp == NULL)
997 		return (NULL);
998 
999 	/*
1000 	 * We set the destination address, gateway address,
1001 	 * netmask and flags in the RTM_GET response depending
1002 	 * on whether we found a parent IRE or not.
1003 	 * In particular, if we did find a parent IRE during the
1004 	 * recursive search, use that IRE's gateway address.
1005 	 * Otherwise, we use the IRE's source address for the
1006 	 * gateway address.
1007 	 */
1008 	ASSERT(af == AF_INET || af == AF_INET6);
1009 	switch (af) {
1010 	case AF_INET:
1011 		if (sire == NULL) {
1012 			rtm_flags = ire->ire_flags;
1013 			rts_fill_msg(RTM_GET, rtm_addrs, ire->ire_addr,
1014 			    ire->ire_mask, ire->ire_src_addr, ire->ire_src_addr,
1015 			    ire->ire_ipif->ipif_pp_dst_addr, 0, ire->ire_ipif,
1016 			    new_mp);
1017 		} else {
1018 			if (sire->ire_flags & RTF_SETSRC)
1019 				rtm_addrs |= RTA_SRC;
1020 
1021 			rtm_flags = sire->ire_flags;
1022 			rts_fill_msg(RTM_GET, rtm_addrs, sire->ire_addr,
1023 			    sire->ire_mask, sire->ire_gateway_addr,
1024 			    (sire->ire_flags & RTF_SETSRC) ?
1025 				sire->ire_src_addr : ire->ire_src_addr,
1026 			    ire->ire_ipif->ipif_pp_dst_addr,
1027 			    0, ire->ire_ipif, new_mp);
1028 		}
1029 		break;
1030 	case AF_INET6:
1031 		if (sire == NULL) {
1032 			rtm_flags = ire->ire_flags;
1033 			rts_fill_msg_v6(RTM_GET, rtm_addrs, &ire->ire_addr_v6,
1034 			    &ire->ire_mask_v6, &ire->ire_src_addr_v6,
1035 			    &ire->ire_src_addr_v6,
1036 			    &ire->ire_ipif->ipif_v6pp_dst_addr,
1037 			    &ipv6_all_zeros, ire->ire_ipif, new_mp);
1038 		} else {
1039 			if (sire->ire_flags & RTF_SETSRC)
1040 				rtm_addrs |= RTA_SRC;
1041 
1042 			rtm_flags = sire->ire_flags;
1043 			mutex_enter(&sire->ire_lock);
1044 			gw_addr_v6 = sire->ire_gateway_addr_v6;
1045 			mutex_exit(&sire->ire_lock);
1046 			rts_fill_msg_v6(RTM_GET, rtm_addrs, &sire->ire_addr_v6,
1047 			    &sire->ire_mask_v6, &gw_addr_v6,
1048 			    (sire->ire_flags & RTF_SETSRC) ?
1049 				&sire->ire_src_addr_v6 : &ire->ire_src_addr_v6,
1050 			    &ire->ire_ipif->ipif_v6pp_dst_addr, &ipv6_all_zeros,
1051 			    ire->ire_ipif, new_mp);
1052 		}
1053 		break;
1054 	}
1055 	new_rtm = (rt_msghdr_t *)new_mp->b_rptr;
1056 
1057 	/*
1058 	 * The rtm_msglen, rtm_version and rtm_type fields in
1059 	 * RTM_GET response are filled in by rts_fill_msg.
1060 	 *
1061 	 * rtm_addrs and rtm_flags are filled in based on what
1062 	 * was requested and the state of the IREs looked up
1063 	 * above.
1064 	 *
1065 	 * rtm_inits and rtm_rmx are filled in with metrics
1066 	 * based on whether a parent IRE was found or not.
1067 	 *
1068 	 * TODO: rtm_index and rtm_use should probably be
1069 	 * filled in with something resonable here and not just
1070 	 * copied from the request.
1071 	 */
1072 	new_rtm->rtm_index = rtm->rtm_index;
1073 	new_rtm->rtm_pid = rtm->rtm_pid;
1074 	new_rtm->rtm_seq = rtm->rtm_seq;
1075 	new_rtm->rtm_use = rtm->rtm_use;
1076 	new_rtm->rtm_addrs = rtm_addrs;
1077 	new_rtm->rtm_flags = rtm_flags;
1078 	if (sire == NULL)
1079 		new_rtm->rtm_inits = rts_getmetrics(ire, &new_rtm->rtm_rmx);
1080 	else
1081 		new_rtm->rtm_inits = rts_getmetrics(sire, &new_rtm->rtm_rmx);
1082 	return (new_mp);
1083 }
1084 
1085 /*
1086  * Fill the given if_data_t with interface statistics.
1087  */
1088 static void
1089 rts_getifdata(if_data_t *if_data, ipif_t *ipif)
1090 {
1091 	if_data->ifi_type = ipif->ipif_type;	/* ethernet, tokenring, etc */
1092 	if_data->ifi_addrlen = 0;		/* media address length */
1093 	if_data->ifi_hdrlen = 0;		/* media header length */
1094 	if_data->ifi_mtu = ipif->ipif_mtu;	/* maximum transmission unit */
1095 	if_data->ifi_metric = ipif->ipif_metric; /* metric (external only) */
1096 	if_data->ifi_baudrate = 0;		/* linespeed */
1097 
1098 	if_data->ifi_ipackets = 0;		/* packets received on if */
1099 	if_data->ifi_ierrors = 0;		/* input errors on interface */
1100 	if_data->ifi_opackets = 0;		/* packets sent on interface */
1101 	if_data->ifi_oerrors = 0;		/* output errors on if */
1102 	if_data->ifi_collisions = 0;		/* collisions on csma if */
1103 	if_data->ifi_ibytes = 0;		/* total number received */
1104 	if_data->ifi_obytes = 0;		/* total number sent */
1105 	if_data->ifi_imcasts = 0;		/* multicast packets received */
1106 	if_data->ifi_omcasts = 0;		/* multicast packets sent */
1107 	if_data->ifi_iqdrops = 0;		/* dropped on input */
1108 	if_data->ifi_noproto = 0;		/* destined for unsupported */
1109 						/* protocol. */
1110 }
1111 
1112 /*
1113  * Set the metrics on a forwarding table route.
1114  */
1115 static void
1116 rts_setmetrics(ire_t *ire, uint_t which, rt_metrics_t *metrics)
1117 {
1118 	clock_t		rtt;
1119 	clock_t		rtt_sd;
1120 	ipif_t		*ipif;
1121 	ifrt_t		*ifrt;
1122 	mblk_t		*mp;
1123 	in6_addr_t	gw_addr_v6;
1124 
1125 	/*
1126 	 * Bypass obtaining the lock and searching ipif_saved_ire_mp in the
1127 	 * common case of no metrics.
1128 	 */
1129 	if (which == 0)
1130 		return;
1131 	ire->ire_uinfo.iulp_set = B_TRUE;
1132 
1133 	/*
1134 	 * iulp_rtt and iulp_rtt_sd are in milliseconds, but 4.4BSD-Lite2's
1135 	 * <net/route.h> says: rmx_rtt and rmx_rttvar are stored as
1136 	 * microseconds.
1137 	 */
1138 	if (which & RTV_RTT)
1139 		rtt = metrics->rmx_rtt / 1000;
1140 	if (which & RTV_RTTVAR)
1141 		rtt_sd = metrics->rmx_rttvar / 1000;
1142 
1143 	/*
1144 	 * Update the metrics in the IRE itself.
1145 	 */
1146 	mutex_enter(&ire->ire_lock);
1147 	if (which & RTV_MTU)
1148 		ire->ire_max_frag = metrics->rmx_mtu;
1149 	if (which & RTV_RTT)
1150 		ire->ire_uinfo.iulp_rtt = rtt;
1151 	if (which & RTV_SSTHRESH)
1152 		ire->ire_uinfo.iulp_ssthresh = metrics->rmx_ssthresh;
1153 	if (which & RTV_RTTVAR)
1154 		ire->ire_uinfo.iulp_rtt_sd = rtt_sd;
1155 	if (which & RTV_SPIPE)
1156 		ire->ire_uinfo.iulp_spipe = metrics->rmx_sendpipe;
1157 	if (which & RTV_RPIPE)
1158 		ire->ire_uinfo.iulp_rpipe = metrics->rmx_recvpipe;
1159 	mutex_exit(&ire->ire_lock);
1160 
1161 	/*
1162 	 * Search through the ifrt_t chain hanging off the IPIF in order to
1163 	 * reflect the metric change there.
1164 	 */
1165 	ipif = ire->ire_ipif;
1166 	if (ipif == NULL)
1167 		return;
1168 	ASSERT((ipif->ipif_isv6 && ire->ire_ipversion == IPV6_VERSION) ||
1169 	    ((!ipif->ipif_isv6 && ire->ire_ipversion == IPV4_VERSION)));
1170 	if (ipif->ipif_isv6) {
1171 		mutex_enter(&ire->ire_lock);
1172 		gw_addr_v6 = ire->ire_gateway_addr_v6;
1173 		mutex_exit(&ire->ire_lock);
1174 	}
1175 	mutex_enter(&ipif->ipif_saved_ire_lock);
1176 	for (mp = ipif->ipif_saved_ire_mp; mp != NULL; mp = mp->b_cont) {
1177 		/*
1178 		 * On a given ipif, the triple of address, gateway and mask is
1179 		 * unique for each saved IRE (in the case of ordinary interface
1180 		 * routes, the gateway address is all-zeroes).
1181 		 */
1182 		ifrt = (ifrt_t *)mp->b_rptr;
1183 		if (ipif->ipif_isv6) {
1184 			if (!IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6addr,
1185 			    &ire->ire_addr_v6) ||
1186 			    !IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6gateway_addr,
1187 			    &gw_addr_v6) ||
1188 			    !IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6mask,
1189 			    &ire->ire_mask_v6))
1190 				continue;
1191 		} else {
1192 			if (ifrt->ifrt_addr != ire->ire_addr ||
1193 			    ifrt->ifrt_gateway_addr != ire->ire_gateway_addr ||
1194 			    ifrt->ifrt_mask != ire->ire_mask)
1195 				continue;
1196 		}
1197 		if (which & RTV_MTU)
1198 			ifrt->ifrt_max_frag = metrics->rmx_mtu;
1199 		if (which & RTV_RTT)
1200 			ifrt->ifrt_iulp_info.iulp_rtt = rtt;
1201 		if (which & RTV_SSTHRESH) {
1202 			ifrt->ifrt_iulp_info.iulp_ssthresh =
1203 			    metrics->rmx_ssthresh;
1204 		}
1205 		if (which & RTV_RTTVAR)
1206 			ifrt->ifrt_iulp_info.iulp_rtt_sd = metrics->rmx_rttvar;
1207 		if (which & RTV_SPIPE)
1208 			ifrt->ifrt_iulp_info.iulp_spipe = metrics->rmx_sendpipe;
1209 		if (which & RTV_RPIPE)
1210 			ifrt->ifrt_iulp_info.iulp_rpipe = metrics->rmx_recvpipe;
1211 		break;
1212 	}
1213 	mutex_exit(&ipif->ipif_saved_ire_lock);
1214 }
1215 
1216 /*
1217  * Get the metrics from a forwarding table route.
1218  */
1219 static int
1220 rts_getmetrics(ire_t *ire, rt_metrics_t *metrics)
1221 {
1222 	int	metrics_set = 0;
1223 
1224 	bzero(metrics, sizeof (rt_metrics_t));
1225 	/*
1226 	 * iulp_rtt and iulp_rtt_sd are in milliseconds, but 4.4BSD-Lite2's
1227 	 * <net/route.h> says: rmx_rtt and rmx_rttvar are stored as
1228 	 * microseconds.
1229 	 */
1230 	metrics->rmx_rtt = ire->ire_uinfo.iulp_rtt * 1000;
1231 	metrics_set |= RTV_RTT;
1232 	metrics->rmx_mtu = ire->ire_max_frag;
1233 	metrics_set |= RTV_MTU;
1234 	metrics->rmx_ssthresh = ire->ire_uinfo.iulp_ssthresh;
1235 	metrics_set |= RTV_SSTHRESH;
1236 	metrics->rmx_rttvar = ire->ire_uinfo.iulp_rtt_sd * 1000;
1237 	metrics_set |= RTV_RTTVAR;
1238 	metrics->rmx_sendpipe = ire->ire_uinfo.iulp_spipe;
1239 	metrics_set |= RTV_SPIPE;
1240 	metrics->rmx_recvpipe = ire->ire_uinfo.iulp_rpipe;
1241 	metrics_set |= RTV_RPIPE;
1242 	return (metrics_set);
1243 }
1244 
1245 /*
1246  * Takes a pointer to a routing message and extracts necessary info by looking
1247  * at the rtm->rtm_addrs bits and store the requested sockaddrs in the pointers
1248  * passed (all of which must be valid).
1249  *
1250  * The bitmask of sockaddrs actually found in the message is returned, or zero
1251  * is returned in the case of an error.
1252  */
1253 static int
1254 rts_getaddrs(rt_msghdr_t *rtm, in6_addr_t *dst_addrp, in6_addr_t *gw_addrp,
1255     in6_addr_t *net_maskp, in6_addr_t *authorp, in6_addr_t *if_addrp,
1256     in6_addr_t *in_src_addrp, ushort_t *indexp, ushort_t *src_indexp,
1257     sa_family_t *afp)
1258 {
1259 	struct sockaddr *sa;
1260 	int	i;
1261 	int	addr_bits;
1262 	int	length;
1263 	int	found_addrs = 0;
1264 	caddr_t	cp;
1265 	size_t	size;
1266 	struct sockaddr_dl *sdl;
1267 
1268 	*dst_addrp = ipv6_all_zeros;
1269 	*gw_addrp = ipv6_all_zeros;
1270 	*net_maskp = ipv6_all_zeros;
1271 	*authorp = ipv6_all_zeros;
1272 	*if_addrp = ipv6_all_zeros;
1273 	*in_src_addrp = ipv6_all_zeros;
1274 	*indexp = 0;
1275 	*src_indexp = 0;
1276 	*afp = AF_UNSPEC;
1277 
1278 	/*
1279 	 * At present we handle only RTA_DST, RTA_GATEWAY, RTA_NETMASK, RTA_IFP,
1280 	 * RTA_IFA and RTA_AUTHOR.  The rest will be added as we need them.
1281 	 */
1282 	cp = (caddr_t)&rtm[1];
1283 	length = rtm->rtm_msglen;
1284 	for (i = 0; (i < RTA_NUMBITS) && ((cp - (caddr_t)rtm) < length); i++) {
1285 		/*
1286 		 * The address family we are working with starts out as
1287 		 * AF_UNSPEC, but is set to the one specified with the
1288 		 * destination address.
1289 		 *
1290 		 * If the "working" address family that has been set to
1291 		 * something other than AF_UNSPEC, then the address family of
1292 		 * subsequent sockaddrs must either be AF_UNSPEC (for
1293 		 * compatibility with older programs) or must be the same as our
1294 		 * "working" one.
1295 		 *
1296 		 * This code assumes that RTA_DST (1) comes first in the loop.
1297 		 */
1298 		sa = (struct sockaddr *)cp;
1299 		addr_bits = (rtm->rtm_addrs & (1 << i));
1300 		if (addr_bits == 0)
1301 			continue;
1302 		switch (addr_bits) {
1303 		case RTA_DST:
1304 			size = rts_copyfromsockaddr(sa, dst_addrp);
1305 			*afp = sa->sa_family;
1306 			break;
1307 		case RTA_GATEWAY:
1308 			if (sa->sa_family != *afp && sa->sa_family != AF_UNSPEC)
1309 				return (0);
1310 			size = rts_copyfromsockaddr(sa, gw_addrp);
1311 			break;
1312 		case RTA_NETMASK:
1313 			if (sa->sa_family != *afp && sa->sa_family != AF_UNSPEC)
1314 				return (0);
1315 			size = rts_copyfromsockaddr(sa, net_maskp);
1316 			break;
1317 		case RTA_IFP:
1318 			if (sa->sa_family != AF_LINK &&
1319 			    sa->sa_family != AF_UNSPEC)
1320 				return (0);
1321 			sdl = (struct sockaddr_dl *)cp;
1322 			*indexp = sdl->sdl_index;
1323 			size = sizeof (struct sockaddr_dl);
1324 			break;
1325 		case RTA_SRC:
1326 			/* Source address of the incoming packet */
1327 			size = rts_copyfromsockaddr(sa, in_src_addrp);
1328 			*afp = sa->sa_family;
1329 			break;
1330 		case RTA_SRCIFP:
1331 			/* Return incoming interface index pointer */
1332 			if (sa->sa_family != AF_LINK &&
1333 			    sa->sa_family != AF_UNSPEC)
1334 				return (0);
1335 			sdl = (struct sockaddr_dl *)cp;
1336 			*src_indexp = sdl->sdl_index;
1337 			size = sizeof (struct sockaddr_dl);
1338 			break;
1339 		case RTA_IFA:
1340 			if (sa->sa_family != *afp && sa->sa_family != AF_UNSPEC)
1341 				return (0);
1342 			size = rts_copyfromsockaddr(sa, if_addrp);
1343 			break;
1344 		case RTA_AUTHOR:
1345 			if (sa->sa_family != *afp && sa->sa_family != AF_UNSPEC)
1346 				return (0);
1347 			size = rts_copyfromsockaddr(sa, authorp);
1348 			break;
1349 		default:
1350 			return (0);
1351 		}
1352 		if (size == 0)
1353 			return (0);
1354 		cp += size;
1355 		found_addrs |= addr_bits;
1356 	}
1357 	return (found_addrs);
1358 }
1359 
1360 /*
1361  * Fills the message with the given info.
1362  */
1363 static void
1364 rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst, ipaddr_t mask,
1365     ipaddr_t gateway, ipaddr_t src_addr, ipaddr_t brd_addr, ipaddr_t author,
1366     ipif_t *ipif, mblk_t *mp)
1367 {
1368 	rt_msghdr_t	*rtm;
1369 	sin_t		*sin;
1370 	size_t		data_size, header_size;
1371 	uchar_t		*cp;
1372 	int		i;
1373 
1374 	ASSERT(mp != NULL);
1375 	/*
1376 	 * First find the type of the message
1377 	 * and its length.
1378 	 */
1379 	header_size = rts_header_msg_size(type);
1380 	/*
1381 	 * Now find the size of the data
1382 	 * that follows the message header.
1383 	 */
1384 	data_size = rts_data_msg_size(rtm_addrs, AF_INET);
1385 
1386 	rtm = (rt_msghdr_t *)mp->b_rptr;
1387 	mp->b_wptr = &mp->b_rptr[header_size];
1388 	cp = mp->b_wptr;
1389 	bzero(cp, data_size);
1390 	for (i = 0; i < RTA_NUMBITS; i++) {
1391 		sin = (sin_t *)cp;
1392 		switch (rtm_addrs & (1 << i)) {
1393 		case RTA_DST:
1394 			sin->sin_addr.s_addr = dst;
1395 			sin->sin_family = AF_INET;
1396 			cp += sizeof (sin_t);
1397 			break;
1398 		case RTA_GATEWAY:
1399 			sin->sin_addr.s_addr = gateway;
1400 			sin->sin_family = AF_INET;
1401 			cp += sizeof (sin_t);
1402 			break;
1403 		case RTA_NETMASK:
1404 			sin->sin_addr.s_addr = mask;
1405 			sin->sin_family = AF_INET;
1406 			cp += sizeof (sin_t);
1407 			break;
1408 		case RTA_IFP:
1409 			cp += ill_dls_info((struct sockaddr_dl *)cp, ipif);
1410 			break;
1411 		case RTA_SRCIFP:
1412 			/*
1413 			 * RTA_SRCIFP is not yet supported
1414 			 * for RTM_GET and RTM_CHANGE
1415 			 */
1416 			break;
1417 		case RTA_IFA:
1418 		case RTA_SRC:
1419 			sin->sin_addr.s_addr = src_addr;
1420 			sin->sin_family = AF_INET;
1421 			cp += sizeof (sin_t);
1422 			break;
1423 		case RTA_AUTHOR:
1424 			sin->sin_addr.s_addr = author;
1425 			sin->sin_family = AF_INET;
1426 			cp += sizeof (sin_t);
1427 			break;
1428 		case RTA_BRD:
1429 			/*
1430 			 * RTA_BRD is used typically to specify a point-to-point
1431 			 * destination address.
1432 			 */
1433 			sin->sin_addr.s_addr = brd_addr;
1434 			sin->sin_family = AF_INET;
1435 			cp += sizeof (sin_t);
1436 			break;
1437 		}
1438 	}
1439 	mp->b_wptr = cp;
1440 	mp->b_cont = NULL;
1441 	/*
1442 	 * set the fields that are common to
1443 	 * to different messages.
1444 	 */
1445 	rtm->rtm_msglen = (short)(header_size + data_size);
1446 	rtm->rtm_version = RTM_VERSION;
1447 	rtm->rtm_type = (uchar_t)type;
1448 }
1449 
1450 /*
1451  * Allocates and initializes a routing socket message.
1452  */
1453 mblk_t *
1454 rts_alloc_msg(int type, int rtm_addrs, sa_family_t af)
1455 {
1456 	size_t	length;
1457 	mblk_t	*mp;
1458 
1459 	length = RTS_MSG_SIZE(type, rtm_addrs, af);
1460 	mp = allocb(length, BPRI_MED);
1461 	if (mp == NULL)
1462 		return (mp);
1463 	bzero(mp->b_rptr, length);
1464 	return (mp);
1465 }
1466 
1467 /*
1468  * Returns the size of the routing
1469  * socket message header size.
1470  */
1471 size_t
1472 rts_header_msg_size(int type)
1473 {
1474 	switch (type) {
1475 	case RTM_DELADDR:
1476 	case RTM_NEWADDR:
1477 		return (sizeof (ifa_msghdr_t));
1478 	case RTM_IFINFO:
1479 		return (sizeof (if_msghdr_t));
1480 	default:
1481 		return (sizeof (rt_msghdr_t));
1482 	}
1483 }
1484 
1485 /*
1486  * Returns the size of the message needed with the given rtm_addrs and family.
1487  *
1488  * It is assumed that all of the sockaddrs (with the exception of RTA_IFP) are
1489  * of the same family (currently either AF_INET or AF_INET6).
1490  */
1491 size_t
1492 rts_data_msg_size(int rtm_addrs, sa_family_t af)
1493 {
1494 	int	i;
1495 	size_t	length = 0;
1496 
1497 	for (i = 0; i < RTA_NUMBITS; i++) {
1498 		switch (rtm_addrs & (1 << i)) {
1499 		case RTA_IFP:
1500 			length += sizeof (struct sockaddr_dl);
1501 			break;
1502 		case RTA_DST:
1503 		case RTA_GATEWAY:
1504 		case RTA_NETMASK:
1505 		case RTA_SRC:
1506 		case RTA_SRCIFP:
1507 		case RTA_IFA:
1508 		case RTA_AUTHOR:
1509 		case RTA_BRD:
1510 			ASSERT(af == AF_INET || af == AF_INET6);
1511 			switch (af) {
1512 			case AF_INET:
1513 				length += sizeof (sin_t);
1514 				break;
1515 			case AF_INET6:
1516 				length += sizeof (sin6_t);
1517 				break;
1518 			}
1519 			break;
1520 		}
1521 	}
1522 	return (length);
1523 }
1524 
1525 /*
1526  * This routine is called to generate a message to the routing
1527  * socket indicating that a redirect has occured, a routing lookup
1528  * has failed, or that a protocol has detected timeouts to a particular
1529  * destination. This routine is called for message types RTM_LOSING,
1530  * RTM_REDIRECT, and RTM_MISS.
1531  */
1532 void
1533 ip_rts_change(int type, ipaddr_t dst_addr, ipaddr_t gw_addr, ipaddr_t net_mask,
1534     ipaddr_t source, ipaddr_t author, int flags, int error, int rtm_addrs)
1535 {
1536 	rt_msghdr_t	*rtm;
1537 	mblk_t		*mp;
1538 
1539 	if (rtm_addrs == 0)
1540 		return;
1541 	mp = rts_alloc_msg(type, rtm_addrs, AF_INET);
1542 	if (mp == NULL)
1543 		return;
1544 	rts_fill_msg(type, rtm_addrs, dst_addr, net_mask, gw_addr, source, 0,
1545 	    author, NULL, mp);
1546 	rtm = (rt_msghdr_t *)mp->b_rptr;
1547 	rtm->rtm_flags = flags;
1548 	rtm->rtm_errno = error;
1549 	rtm->rtm_flags |= RTF_DONE;
1550 	rtm->rtm_addrs = rtm_addrs;
1551 	rts_queue_input(mp, NULL, AF_INET);
1552 }
1553 
1554 /*
1555  * This routine is called to generate a message to the routing
1556  * socket indicating that the status of a network interface has changed.
1557  * Message type generated RTM_IFINFO.
1558  */
1559 void
1560 ip_rts_ifmsg(ipif_t *ipif)
1561 {
1562 	if_msghdr_t	*ifm;
1563 	mblk_t		*mp;
1564 	sa_family_t	af;
1565 
1566 	/*
1567 	 * This message should be generated only
1568 	 * when the physical device is changing
1569 	 * state.
1570 	 */
1571 	if (ipif->ipif_id != 0)
1572 		return;
1573 	if (ipif->ipif_isv6) {
1574 		af = AF_INET6;
1575 		mp = rts_alloc_msg(RTM_IFINFO, RTA_IFP, af);
1576 		if (mp == NULL)
1577 			return;
1578 		rts_fill_msg_v6(RTM_IFINFO, RTA_IFP, &ipv6_all_zeros,
1579 		    &ipv6_all_zeros, &ipv6_all_zeros, &ipv6_all_zeros,
1580 		    &ipv6_all_zeros, &ipv6_all_zeros, ipif, mp);
1581 	} else {
1582 		af = AF_INET;
1583 		mp = rts_alloc_msg(RTM_IFINFO, RTA_IFP, af);
1584 		if (mp == NULL)
1585 			return;
1586 		rts_fill_msg(RTM_IFINFO, RTA_IFP, 0, 0, 0, 0, 0, 0, ipif, mp);
1587 	}
1588 	ifm = (if_msghdr_t *)mp->b_rptr;
1589 	ifm->ifm_index = ipif->ipif_ill->ill_phyint->phyint_ifindex;
1590 	ifm->ifm_flags = ipif->ipif_flags | ipif->ipif_ill->ill_flags |
1591 	    ipif->ipif_ill->ill_phyint->phyint_flags;
1592 	rts_getifdata(&ifm->ifm_data, ipif);
1593 	ifm->ifm_addrs = RTA_IFP;
1594 	rts_queue_input(mp, NULL, af);
1595 }
1596 
1597 /*
1598  * This is called to generate messages to the routing socket
1599  * indicating a network interface has had addresses associated with it.
1600  * The structure of the code is based on the 4.4BSD-Lite2 <net/rtsock.c>.
1601  */
1602 void
1603 ip_rts_newaddrmsg(int cmd, int error, ipif_t *ipif)
1604 {
1605 	int		pass;
1606 	int		ncmd;
1607 	int		rtm_addrs;
1608 	mblk_t		*mp;
1609 	ifa_msghdr_t	*ifam;
1610 	rt_msghdr_t	*rtm;
1611 	sa_family_t	af;
1612 
1613 	if (ipif->ipif_isv6)
1614 		af = AF_INET6;
1615 	else
1616 		af = AF_INET;
1617 	/*
1618 	 * If the request is DELETE, send RTM_DELETE and RTM_DELADDR.
1619 	 * if the request is ADD, send RTM_NEWADDR and RTM_ADD.
1620 	 */
1621 	for (pass = 1; pass < 3; pass++) {
1622 		if ((cmd == RTM_ADD && pass == 1) ||
1623 		    (cmd == RTM_DELETE && pass == 2)) {
1624 			ncmd = ((cmd == RTM_ADD) ? RTM_NEWADDR : RTM_DELADDR);
1625 
1626 			rtm_addrs = (RTA_IFA | RTA_NETMASK | RTA_BRD);
1627 			mp = rts_alloc_msg(ncmd, rtm_addrs, af);
1628 			if (mp == NULL)
1629 				continue;
1630 			switch (af) {
1631 			case AF_INET:
1632 				rts_fill_msg(ncmd, rtm_addrs, 0,
1633 				    ipif->ipif_net_mask, 0, ipif->ipif_lcl_addr,
1634 				    ipif->ipif_pp_dst_addr, 0, NULL, mp);
1635 				break;
1636 			case AF_INET6:
1637 				rts_fill_msg_v6(ncmd, rtm_addrs,
1638 				    &ipv6_all_zeros, &ipif->ipif_v6net_mask,
1639 				    &ipv6_all_zeros, &ipif->ipif_v6lcl_addr,
1640 				    &ipif->ipif_v6pp_dst_addr, &ipv6_all_zeros,
1641 				    NULL, mp);
1642 				break;
1643 			}
1644 			ifam = (ifa_msghdr_t *)mp->b_rptr;
1645 			ifam->ifam_index =
1646 			    ipif->ipif_ill->ill_phyint->phyint_ifindex;
1647 			ifam->ifam_metric = ipif->ipif_metric;
1648 			ifam->ifam_flags = ((cmd == RTM_ADD) ? RTF_UP : 0);
1649 			ifam->ifam_addrs = rtm_addrs;
1650 			rts_queue_input(mp, NULL, af);
1651 		}
1652 		if ((cmd == RTM_ADD && pass == 2) ||
1653 		    (cmd == RTM_DELETE && pass == 1)) {
1654 			rtm_addrs = (RTA_DST | RTA_NETMASK);
1655 			mp = rts_alloc_msg(cmd, rtm_addrs, af);
1656 			if (mp == NULL)
1657 				continue;
1658 			switch (af) {
1659 			case AF_INET:
1660 				rts_fill_msg(cmd, rtm_addrs,
1661 				    ipif->ipif_lcl_addr, ipif->ipif_net_mask, 0,
1662 				    0, 0, 0, NULL, mp);
1663 				break;
1664 			case AF_INET6:
1665 				rts_fill_msg_v6(cmd, rtm_addrs,
1666 				    &ipif->ipif_v6lcl_addr,
1667 				    &ipif->ipif_v6net_mask, &ipv6_all_zeros,
1668 				    &ipv6_all_zeros, &ipv6_all_zeros,
1669 				    &ipv6_all_zeros, NULL, mp);
1670 				break;
1671 			}
1672 			rtm = (rt_msghdr_t *)mp->b_rptr;
1673 			rtm->rtm_index =
1674 			    ipif->ipif_ill->ill_phyint->phyint_ifindex;
1675 			rtm->rtm_flags = ((cmd == RTM_ADD) ? RTF_UP : 0);
1676 			rtm->rtm_errno = error;
1677 			if (error == 0)
1678 				rtm->rtm_flags |= RTF_DONE;
1679 			rtm->rtm_addrs = rtm_addrs;
1680 			rts_queue_input(mp, NULL, af);
1681 		}
1682 	}
1683 }
1684 
1685 /*
1686  * Based on the address family specified in a sockaddr, copy the address field
1687  * into an in6_addr_t.
1688  *
1689  * In the case of AF_UNSPEC, we assume the family is actually AF_INET for
1690  * compatibility with programs that leave the family cleared in the sockaddr.
1691  * Callers of rts_copyfromsockaddr should check the family themselves if they
1692  * wish to verify its value.
1693  *
1694  * In the case of AF_INET6, a check is made to ensure that address is not an
1695  * IPv4-mapped address.
1696  */
1697 size_t
1698 rts_copyfromsockaddr(struct sockaddr *sa, in6_addr_t *addrp)
1699 {
1700 	switch (sa->sa_family) {
1701 	case AF_INET:
1702 	case AF_UNSPEC:
1703 		IN6_IPADDR_TO_V4MAPPED(((sin_t *)sa)->sin_addr.s_addr, addrp);
1704 		return (sizeof (sin_t));
1705 	case AF_INET6:
1706 		*addrp = ((sin6_t *)sa)->sin6_addr;
1707 		if (IN6_IS_ADDR_V4MAPPED(addrp))
1708 			return (0);
1709 		return (sizeof (sin6_t));
1710 	default:
1711 		return (0);
1712 	}
1713 }
1714