xref: /illumos-gate/usr/src/uts/common/inet/ip/ip_rts.c (revision 12042ab213b3af68474f48555504db816a449211)
1 /*
2  * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
3  */
4 
5 /*
6  * Copyright (c) 1988, 1991, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  * 3. All advertising materials mentioning features or use of this software
18  *    must display the following acknowledgement:
19  *	This product includes software developed by the University of
20  *	California, Berkeley and its contributors.
21  * 4. Neither the name of the University nor the names of its contributors
22  *    may be used to endorse or promote products derived from this software
23  *    without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  *
37  *	@(#)rtsock.c	8.6 (Berkeley) 2/11/95
38  */
39 
40 /*
41  * This file contains routines that process routing socket requests.
42  */
43 
44 #include <sys/types.h>
45 #include <sys/stream.h>
46 #include <sys/stropts.h>
47 #include <sys/ddi.h>
48 #include <sys/strsubr.h>
49 #include <sys/cmn_err.h>
50 #include <sys/debug.h>
51 #include <sys/policy.h>
52 #include <sys/zone.h>
53 
54 #include <sys/systm.h>
55 #include <sys/param.h>
56 #include <sys/socket.h>
57 #include <sys/strsun.h>
58 #include <net/if.h>
59 #include <net/route.h>
60 #include <netinet/in.h>
61 #include <net/if_dl.h>
62 #include <netinet/ip6.h>
63 
64 #include <inet/common.h>
65 #include <inet/ip.h>
66 #include <inet/ip6.h>
67 #include <inet/ip_if.h>
68 #include <inet/ip_ire.h>
69 #include <inet/ip_ftable.h>
70 #include <inet/ip_rts.h>
71 
72 #include <inet/ipclassifier.h>
73 
74 #include <sys/tsol/tndb.h>
75 #include <sys/tsol/tnet.h>
76 
/*
 * Size of a complete routing socket message: the value returned by
 * rts_header_msg_size() for `type' added to the value returned by
 * rts_data_msg_size() for `rtm_addrs', `af' and `sacnt'.
 */
#define	RTS_MSG_SIZE(type, rtm_addrs, af, sacnt) \
	(rts_header_msg_size(type) + rts_data_msg_size(rtm_addrs, af, sacnt))
79 
80 static size_t	rts_copyfromsockaddr(struct sockaddr *sa, in6_addr_t *addrp);
81 static void	rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst,
82     ipaddr_t mask, ipaddr_t gateway, ipaddr_t src_addr, ipaddr_t brd_addr,
83     ipaddr_t author, ipaddr_t ifaddr, const ill_t *ill, mblk_t *mp,
84     const tsol_gc_t *);
85 static int	rts_getaddrs(rt_msghdr_t *rtm, in6_addr_t *dst_addrp,
86     in6_addr_t *gw_addrp, in6_addr_t *net_maskp, in6_addr_t *authorp,
87     in6_addr_t *if_addrp, in6_addr_t *src_addrp, ushort_t *indexp,
88     sa_family_t *afp, tsol_rtsecattr_t *rtsecattr, int *error);
89 static void	rts_getifdata(if_data_t *if_data, const ipif_t *ipif);
90 static int	rts_getmetrics(ire_t *ire, ill_t *ill, rt_metrics_t *metrics);
91 static mblk_t	*rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *ifire,
92     const in6_addr_t *setsrc, tsol_ire_gw_secattr_t *attrp, sa_family_t af);
93 static void	rts_setmetrics(ire_t *ire, uint_t which, rt_metrics_t *metrics);
94 static ire_t	*ire_lookup_v4(ipaddr_t dst_addr, ipaddr_t net_mask,
95     ipaddr_t gw_addr, const ill_t *ill, zoneid_t zoneid,
96     const ts_label_t *tsl, int match_flags, ip_stack_t *ipst, ire_t **pifire,
97     ipaddr_t *v4setsrcp, tsol_ire_gw_secattr_t **gwattrp);
98 static ire_t	*ire_lookup_v6(const in6_addr_t *dst_addr_v6,
99     const in6_addr_t *net_mask_v6, const in6_addr_t *gw_addr_v6,
100     const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, int match_flags,
101     ip_stack_t *ipst, ire_t **pifire,
102     in6_addr_t *v6setsrcp, tsol_ire_gw_secattr_t **gwattrp);
103 
104 /*
105  * Send `mp' to all eligible routing queues.  A queue is ineligible if:
106  *
107  *  1. SO_USELOOPBACK is off and it is not the originating queue.
108  *  2. RTA_UNDER_IPMP is on and RTSQ_UNDER_IPMP is not set in `flags'.
109  *  3. RTA_UNDER_IPMP is off and RTSQ_NORMAL is not set in `flags'.
110  *  4. It is not the same address family as `af', and `af' isn't AF_UNSPEC.
111  */
112 void
113 rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, uint_t flags,
114     ip_stack_t *ipst)
115 {
116 	mblk_t	*mp1;
117 	conn_t	*connp, *next_connp;
118 
119 	/*
120 	 * Since we don't have an ill_t here, RTSQ_DEFAULT must already be
121 	 * resolved to one or more of RTSQ_NORMAL|RTSQ_UNDER_IPMP at this point.
122 	 */
123 	ASSERT(!(flags & RTSQ_DEFAULT));
124 
125 	mutex_enter(&ipst->ips_rts_clients->connf_lock);
126 	connp = ipst->ips_rts_clients->connf_head;
127 
128 	for (; connp != NULL; connp = next_connp) {
129 		next_connp = connp->conn_next;
130 		/*
131 		 * If there was a family specified when this routing socket was
132 		 * created and it doesn't match the family of the message to
133 		 * copy, then continue.
134 		 */
135 		if ((connp->conn_proto != AF_UNSPEC) &&
136 		    (connp->conn_proto != af))
137 			continue;
138 
139 		/*
140 		 * Queue the message only if the conn_t and flags match.
141 		 */
142 		if (connp->conn_rtaware & RTAW_UNDER_IPMP) {
143 			if (!(flags & RTSQ_UNDER_IPMP))
144 				continue;
145 		} else {
146 			if (!(flags & RTSQ_NORMAL))
147 				continue;
148 		}
149 		/*
150 		 * For the originating queue, we only copy the message upstream
151 		 * if loopback is set.  For others reading on the routing
152 		 * socket, we check if there is room upstream for a copy of the
153 		 * message.
154 		 */
155 		if ((o_connp == connp) && connp->conn_useloopback == 0) {
156 			connp = connp->conn_next;
157 			continue;
158 		}
159 		CONN_INC_REF(connp);
160 		mutex_exit(&ipst->ips_rts_clients->connf_lock);
161 		/* Pass to rts_input */
162 		if (IPCL_IS_NONSTR(connp) ? !connp->conn_flow_cntrld :
163 		    canputnext(connp->conn_rq)) {
164 			mp1 = dupmsg(mp);
165 			if (mp1 == NULL)
166 				mp1 = copymsg(mp);
167 			/* Note that we pass a NULL ira to rts_input */
168 			if (mp1 != NULL)
169 				(connp->conn_recv)(connp, mp1, NULL, NULL);
170 		}
171 
172 		mutex_enter(&ipst->ips_rts_clients->connf_lock);
173 		/* reload next_connp since conn_next may have changed */
174 		next_connp = connp->conn_next;
175 		CONN_DEC_REF(connp);
176 	}
177 	mutex_exit(&ipst->ips_rts_clients->connf_lock);
178 	freemsg(mp);
179 }
180 
181 /*
182  * Takes an ire and sends an ack to all the routing sockets. This
183  * routine is used
184  * - when a route is created/deleted through the ioctl interface.
185  * - when a stale redirect is deleted
186  */
187 void
188 ip_rts_rtmsg(int type, ire_t *ire, int error, ip_stack_t *ipst)
189 {
190 	mblk_t		*mp;
191 	rt_msghdr_t	*rtm;
192 	int		rtm_addrs = (RTA_DST | RTA_NETMASK | RTA_GATEWAY);
193 	sa_family_t	af = { 0 };
194 	in6_addr_t	gw_addr_v6;
195 
196 	if (ire == NULL)
197 		return;
198 	ASSERT(ire->ire_ipversion == IPV4_VERSION ||
199 	    ire->ire_ipversion == IPV6_VERSION);
200 
201 	ASSERT(!(ire->ire_type & IRE_IF_CLONE));
202 	mp = NULL;
203 
204 	if (ire->ire_flags & RTF_SETSRC)
205 		rtm_addrs |= RTA_SRC;
206 
207 	switch (ire->ire_ipversion) {
208 	case IPV4_VERSION:
209 		af = AF_INET;
210 		mp = rts_alloc_msg(type, rtm_addrs, af, 0);
211 		if (mp == NULL)
212 			return;
213 		rts_fill_msg(type, rtm_addrs, ire->ire_addr, ire->ire_mask,
214 		    ire->ire_gateway_addr, ire->ire_setsrc_addr, 0, 0, 0, NULL,
215 		    mp, NULL);
216 		break;
217 	case IPV6_VERSION:
218 		af = AF_INET6;
219 		mp = rts_alloc_msg(type, rtm_addrs, af, 0);
220 		if (mp == NULL)
221 			return;
222 		mutex_enter(&ire->ire_lock);
223 		gw_addr_v6 = ire->ire_gateway_addr_v6;
224 		mutex_exit(&ire->ire_lock);
225 		rts_fill_msg_v6(type, rtm_addrs, &ire->ire_addr_v6,
226 		    &ire->ire_mask_v6, &gw_addr_v6,
227 		    &ire->ire_setsrc_addr_v6, &ipv6_all_zeros, &ipv6_all_zeros,
228 		    &ipv6_all_zeros, NULL, mp, NULL);
229 		break;
230 	}
231 	rtm = (rt_msghdr_t *)mp->b_rptr;
232 	mp->b_wptr = (uchar_t *)&mp->b_rptr[rtm->rtm_msglen];
233 	rtm->rtm_addrs = rtm_addrs;
234 	rtm->rtm_flags = ire->ire_flags;
235 	if (error != 0)
236 		rtm->rtm_errno = error;
237 	else
238 		rtm->rtm_flags |= RTF_DONE;
239 	rts_queue_input(mp, NULL, af, RTSQ_ALL, ipst);
240 }
241 
242 /*
243  * This is a call from the RTS module
244  * indicating that this is a Routing Socket
245  * Stream. Insert this conn_t in routing
246  * socket client list.
247  */
248 void
249 ip_rts_register(conn_t *connp)
250 {
251 	ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
252 
253 	connp->conn_useloopback = 1;
254 	ipcl_hash_insert_wildcard(ipst->ips_rts_clients, connp);
255 }
256 
257 /*
258  * This is a call from the RTS module indicating that it is closing.
259  */
260 void
261 ip_rts_unregister(conn_t *connp)
262 {
263 	ipcl_hash_remove(connp);
264 }
265 
266 /*
267  * Processes requests received on a routing socket. It extracts all the
268  * arguments and calls the appropriate function to process the request.
269  *
270  * RTA_SRC bit flag requests are sent by 'route -setsrc'.
271  *
272  * In general, this function does not consume the message supplied but rather
273  * sends the message upstream with an appropriate UNIX errno.
274  */
275 int
276 ip_rts_request_common(mblk_t *mp, conn_t *connp, cred_t *ioc_cr)
277 {
278 	rt_msghdr_t	*rtm = NULL;
279 	in6_addr_t	dst_addr_v6;
280 	in6_addr_t	src_addr_v6;
281 	in6_addr_t	gw_addr_v6;
282 	in6_addr_t	net_mask_v6;
283 	in6_addr_t	author_v6;
284 	in6_addr_t	if_addr_v6;
285 	mblk_t		*mp1;
286 	ire_t		*ire = NULL;
287 	ire_t		*ifire = NULL;
288 	ipaddr_t	v4setsrc;
289 	in6_addr_t	v6setsrc = ipv6_all_zeros;
290 	tsol_ire_gw_secattr_t *gwattr = NULL;
291 	int		error = 0;
292 	int		match_flags = MATCH_IRE_DSTONLY;
293 	int		match_flags_local = MATCH_IRE_TYPE | MATCH_IRE_GW;
294 	int		found_addrs;
295 	sa_family_t	af;
296 	ipaddr_t	dst_addr;
297 	ipaddr_t	gw_addr;
298 	ipaddr_t	src_addr;
299 	ipaddr_t	net_mask;
300 	ushort_t	index;
301 	boolean_t	gcgrp_xtraref = B_FALSE;
302 	tsol_gcgrp_addr_t ga;
303 	tsol_rtsecattr_t rtsecattr;
304 	struct rtsa_s	*rtsap = NULL;
305 	tsol_gcgrp_t	*gcgrp = NULL;
306 	tsol_gc_t	*gc = NULL;
307 	ts_label_t	*tsl = NULL;
308 	zoneid_t	zoneid;
309 	ip_stack_t	*ipst;
310 	ill_t		*ill = NULL;
311 
312 	zoneid = connp->conn_zoneid;
313 	ipst = connp->conn_netstack->netstack_ip;
314 	net_mask = 0;
315 	src_addr = 0;
316 	dst_addr = 0;
317 	gw_addr = 0;
318 
319 	if (mp->b_cont != NULL && !pullupmsg(mp, -1)) {
320 		freemsg(mp);
321 		error =  EINVAL;
322 		goto done;
323 	}
324 	if ((mp->b_wptr - mp->b_rptr) < sizeof (rt_msghdr_t)) {
325 		freemsg(mp);
326 		error = EINVAL;
327 		goto done;
328 	}
329 
330 	/*
331 	 * Check the routing message for basic consistency including the
332 	 * version number and that the number of octets written is the same
333 	 * as specified by the rtm_msglen field.
334 	 *
335 	 * At this point, an error can be delivered back via rtm_errno.
336 	 */
337 	rtm = (rt_msghdr_t *)mp->b_rptr;
338 	if ((mp->b_wptr - mp->b_rptr) != rtm->rtm_msglen) {
339 		error = EINVAL;
340 		goto done;
341 	}
342 	if (rtm->rtm_version != RTM_VERSION) {
343 		error = EPROTONOSUPPORT;
344 		goto done;
345 	}
346 
347 	/* Only allow RTM_GET or RTM_RESOLVE for unprivileged process */
348 	if (rtm->rtm_type != RTM_GET &&
349 	    rtm->rtm_type != RTM_RESOLVE &&
350 	    (ioc_cr == NULL ||
351 	    secpolicy_ip_config(ioc_cr, B_FALSE) != 0)) {
352 		error = EPERM;
353 		goto done;
354 	}
355 
356 	found_addrs = rts_getaddrs(rtm, &dst_addr_v6, &gw_addr_v6, &net_mask_v6,
357 	    &author_v6, &if_addr_v6, &src_addr_v6, &index, &af, &rtsecattr,
358 	    &error);
359 
360 	if (error != 0)
361 		goto done;
362 
363 	if ((found_addrs & RTA_DST) == 0) {
364 		error = EINVAL;
365 		goto done;
366 	}
367 
368 	/*
369 	 * Based on the address family of the destination address, determine
370 	 * the destination, gateway and netmask and return the appropriate error
371 	 * if an unknown address family was specified (following the errno
372 	 * values that 4.4BSD-Lite2 returns.)
373 	 */
374 	switch (af) {
375 	case AF_INET:
376 		IN6_V4MAPPED_TO_IPADDR(&dst_addr_v6, dst_addr);
377 		IN6_V4MAPPED_TO_IPADDR(&src_addr_v6, src_addr);
378 		IN6_V4MAPPED_TO_IPADDR(&gw_addr_v6, gw_addr);
379 		if (((found_addrs & RTA_NETMASK) == 0) ||
380 		    (rtm->rtm_flags & RTF_HOST))
381 			net_mask = IP_HOST_MASK;
382 		else
383 			IN6_V4MAPPED_TO_IPADDR(&net_mask_v6, net_mask);
384 		break;
385 	case AF_INET6:
386 		if (((found_addrs & RTA_NETMASK) == 0) ||
387 		    (rtm->rtm_flags & RTF_HOST))
388 			net_mask_v6 = ipv6_all_ones;
389 		break;
390 	default:
391 		/*
392 		 * These errno values are meant to be compatible with
393 		 * 4.4BSD-Lite2 for the given message types.
394 		 */
395 		switch (rtm->rtm_type) {
396 		case RTM_ADD:
397 		case RTM_DELETE:
398 			error = ESRCH;
399 			goto done;
400 		case RTM_GET:
401 		case RTM_CHANGE:
402 			error = EAFNOSUPPORT;
403 			goto done;
404 		default:
405 			error = EOPNOTSUPP;
406 			goto done;
407 		}
408 	}
409 
410 	/*
411 	 * At this point, the address family must be something known.
412 	 */
413 	ASSERT(af == AF_INET || af == AF_INET6);
414 
415 	/* Handle RTA_IFP */
416 	if (index != 0) {
417 		ipif_t		*ipif;
418 lookup:
419 		ill = ill_lookup_on_ifindex(index, af == AF_INET6, ipst);
420 		if (ill == NULL) {
421 			error = EINVAL;
422 			goto done;
423 		}
424 
425 		/*
426 		 * Since all interfaces in an IPMP group must be equivalent,
427 		 * we prevent changes to a specific underlying interface's
428 		 * routing configuration.  However, for backward compatibility,
429 		 * we intepret a request to add a route on an underlying
430 		 * interface as a request to add a route on its IPMP interface.
431 		 */
432 		if (IS_UNDER_IPMP(ill)) {
433 			switch (rtm->rtm_type) {
434 			case RTM_CHANGE:
435 			case RTM_DELETE:
436 				error = EINVAL;
437 				goto done;
438 			case RTM_ADD:
439 				index = ipmp_ill_get_ipmp_ifindex(ill);
440 				ill_refrele(ill);
441 				if (index == 0) {
442 					ill = NULL; /* already refrele'd */
443 					error = EINVAL;
444 					goto done;
445 				}
446 				goto lookup;
447 			}
448 		}
449 
450 		match_flags |= MATCH_IRE_ILL;
451 		/*
452 		 * This provides the same zoneid as in Solaris 10
453 		 * that -ifp picks the zoneid from the first ipif on the ill.
454 		 * But it might not be useful since the first ipif will always
455 		 * have the same zoneid as the ill.
456 		 */
457 		ipif = ipif_get_next_ipif(NULL, ill);
458 		if (ipif != NULL) {
459 			zoneid = ipif->ipif_zoneid;
460 			ipif_refrele(ipif);
461 		}
462 	}
463 
464 	/*
465 	 * If a netmask was supplied in the message, then subsequent route
466 	 * lookups will attempt to match on the netmask as well.
467 	 */
468 	if ((found_addrs & RTA_NETMASK) != 0)
469 		match_flags |= MATCH_IRE_MASK;
470 
471 	/*
472 	 * We only process any passed-in route security attributes for
473 	 * either RTM_ADD or RTM_CHANGE message; We overload them
474 	 * to do an RTM_GET as a different label; ignore otherwise.
475 	 */
476 	if (rtm->rtm_type == RTM_ADD || rtm->rtm_type == RTM_CHANGE ||
477 	    rtm->rtm_type == RTM_GET) {
478 		ASSERT(rtsecattr.rtsa_cnt <= TSOL_RTSA_REQUEST_MAX);
479 		if (rtsecattr.rtsa_cnt > 0)
480 			rtsap = &rtsecattr.rtsa_attr[0];
481 	}
482 
483 	switch (rtm->rtm_type) {
484 	case RTM_ADD:
485 		/* if we are adding a route, gateway is a must */
486 		if ((found_addrs & RTA_GATEWAY) == 0) {
487 			error = EINVAL;
488 			goto done;
489 		}
490 
491 		/* Multirouting does not support net routes. */
492 		if ((rtm->rtm_flags & (RTF_MULTIRT | RTF_HOST)) ==
493 		    RTF_MULTIRT) {
494 			error = EADDRNOTAVAIL;
495 			goto done;
496 		}
497 
498 		/*
499 		 * Multirouting and user-specified source addresses
500 		 * do not support interface based routing.
501 		 * Assigning a source address to an interface based
502 		 * route is achievable by plumbing a new ipif and
503 		 * setting up the interface route via this ipif,
504 		 * though.
505 		 */
506 		if (rtm->rtm_flags & (RTF_MULTIRT | RTF_SETSRC)) {
507 			if ((rtm->rtm_flags & RTF_GATEWAY) == 0) {
508 				error = EADDRNOTAVAIL;
509 				goto done;
510 			}
511 		}
512 
513 		switch (af) {
514 		case AF_INET:
515 			if (src_addr != INADDR_ANY) {
516 				uint_t type;
517 
518 				/*
519 				 * The RTF_SETSRC flag is present, check that
520 				 * the supplied src address is not the loopback
521 				 * address. This would produce martian packets.
522 				 */
523 				if (src_addr == htonl(INADDR_LOOPBACK)) {
524 					error = EINVAL;
525 					goto done;
526 				}
527 				/*
528 				 * Also check that the supplied address is a
529 				 * valid, local one. Only allow IFF_UP ones
530 				 */
531 				type = ip_type_v4(src_addr, ipst);
532 				if (!(type & (IRE_LOCAL|IRE_LOOPBACK))) {
533 					error = EADDRNOTAVAIL;
534 					goto done;
535 				}
536 			} else {
537 				/*
538 				 * The RTF_SETSRC modifier must be associated
539 				 * to a non-null source address.
540 				 */
541 				if (rtm->rtm_flags & RTF_SETSRC) {
542 					error = EINVAL;
543 					goto done;
544 				}
545 			}
546 
547 			error = ip_rt_add(dst_addr, net_mask, gw_addr, src_addr,
548 			    rtm->rtm_flags, ill, &ire, B_FALSE,
549 			    rtsap, ipst, zoneid);
550 			if (ill != NULL)
551 				ASSERT(!MUTEX_HELD(&ill->ill_lock));
552 			break;
553 		case AF_INET6:
554 			if (!IN6_IS_ADDR_UNSPECIFIED(&src_addr_v6)) {
555 				uint_t type;
556 
557 				/*
558 				 * The RTF_SETSRC flag is present, check that
559 				 * the supplied src address is not the loopback
560 				 * address. This would produce martian packets.
561 				 */
562 				if (IN6_IS_ADDR_LOOPBACK(&src_addr_v6)) {
563 					error = EINVAL;
564 					goto done;
565 				}
566 				/*
567 				 * Also check that the supplied address is a
568 				 * valid, local one. Only allow UP ones.
569 				 */
570 				type = ip_type_v6(&src_addr_v6, ipst);
571 				if (!(type & (IRE_LOCAL|IRE_LOOPBACK))) {
572 					error = EADDRNOTAVAIL;
573 					goto done;
574 				}
575 
576 				error = ip_rt_add_v6(&dst_addr_v6, &net_mask_v6,
577 				    &gw_addr_v6, &src_addr_v6, rtm->rtm_flags,
578 				    ill, &ire, rtsap, ipst, zoneid);
579 				break;
580 			}
581 			/*
582 			 * The RTF_SETSRC modifier must be associated
583 			 * to a non-null source address.
584 			 */
585 			if (rtm->rtm_flags & RTF_SETSRC) {
586 				error = EINVAL;
587 				goto done;
588 			}
589 			error = ip_rt_add_v6(&dst_addr_v6, &net_mask_v6,
590 			    &gw_addr_v6, NULL, rtm->rtm_flags,
591 			    ill, &ire, rtsap, ipst, zoneid);
592 			if (ill != NULL)
593 				ASSERT(!MUTEX_HELD(&ill->ill_lock));
594 			break;
595 		}
596 		if (error != 0)
597 			goto done;
598 		ASSERT(ire != NULL);
599 		rts_setmetrics(ire, rtm->rtm_inits, &rtm->rtm_rmx);
600 		break;
601 	case RTM_DELETE:
602 		/* if we are deleting a route, gateway is a must */
603 		if ((found_addrs & RTA_GATEWAY) == 0) {
604 			error = EINVAL;
605 			goto done;
606 		}
607 		/*
608 		 * The RTF_SETSRC modifier does not make sense
609 		 * when deleting a route.
610 		 */
611 		if (rtm->rtm_flags & RTF_SETSRC) {
612 			error = EINVAL;
613 			goto done;
614 		}
615 
616 		switch (af) {
617 		case AF_INET:
618 			error = ip_rt_delete(dst_addr, net_mask, gw_addr,
619 			    found_addrs, rtm->rtm_flags, ill, B_FALSE,
620 			    ipst, zoneid);
621 			break;
622 		case AF_INET6:
623 			error = ip_rt_delete_v6(&dst_addr_v6, &net_mask_v6,
624 			    &gw_addr_v6, found_addrs, rtm->rtm_flags, ill,
625 			    ipst, zoneid);
626 			break;
627 		}
628 		break;
629 	case RTM_GET:
630 	case RTM_CHANGE:
631 		/*
632 		 * In the case of RTM_GET, the forwarding table should be
633 		 * searched recursively.  Also, if a gateway was
634 		 * specified then the gateway address must also be matched.
635 		 *
636 		 * In the case of RTM_CHANGE, the gateway address (if supplied)
637 		 * is the new gateway address so matching on the gateway address
638 		 * is not done.  This can lead to ambiguity when looking up the
639 		 * route to change as usually only the destination (and netmask,
640 		 * if supplied) is used for the lookup.  However if a RTA_IFP
641 		 * sockaddr is also supplied, it can disambiguate which route to
642 		 * change provided the ambigous routes are tied to distinct
643 		 * ill's (or interface indices).  If the routes are not tied to
644 		 * any particular interfaces (for example, with traditional
645 		 * gateway routes), then a RTA_IFP sockaddr will be of no use as
646 		 * it won't match any such routes.
647 		 * RTA_SRC is not supported for RTM_GET and RTM_CHANGE,
648 		 * except when RTM_CHANGE is combined to RTF_SETSRC.
649 		 */
650 		if (((found_addrs & RTA_SRC) != 0) &&
651 		    ((rtm->rtm_type == RTM_GET) ||
652 		    !(rtm->rtm_flags & RTF_SETSRC))) {
653 			error = EOPNOTSUPP;
654 			goto done;
655 		}
656 
657 		if (rtm->rtm_type == RTM_GET) {
658 			match_flags |= MATCH_IRE_SECATTR;
659 			match_flags_local |= MATCH_IRE_SECATTR;
660 			if ((found_addrs & RTA_GATEWAY) != 0)
661 				match_flags |= MATCH_IRE_GW;
662 			if (ioc_cr)
663 				tsl = crgetlabel(ioc_cr);
664 			if (rtsap != NULL) {
665 				if (rtsa_validate(rtsap) != 0) {
666 					error = EINVAL;
667 					goto done;
668 				}
669 				if (tsl != NULL &&
670 				    crgetzoneid(ioc_cr) != GLOBAL_ZONEID &&
671 				    (tsl->tsl_doi != rtsap->rtsa_doi ||
672 				    !bldominates(&tsl->tsl_label,
673 				    &rtsap->rtsa_slrange.lower_bound))) {
674 					error = EPERM;
675 					goto done;
676 				}
677 				tsl = labelalloc(
678 				    &rtsap->rtsa_slrange.lower_bound,
679 				    rtsap->rtsa_doi, KM_NOSLEEP);
680 			}
681 		}
682 		if (rtm->rtm_type == RTM_CHANGE) {
683 			if ((found_addrs & RTA_GATEWAY) &&
684 			    (rtm->rtm_flags & RTF_SETSRC)) {
685 				/*
686 				 * Do not want to change the gateway,
687 				 * but rather the source address.
688 				 */
689 				match_flags |= MATCH_IRE_GW;
690 			}
691 		}
692 
693 		/*
694 		 * If the netmask is all ones (either as supplied or as derived
695 		 * above), then first check for an IRE_LOOPBACK or
696 		 * IRE_LOCAL entry.
697 		 *
698 		 * If we didn't check for or find an IRE_LOOPBACK or IRE_LOCAL
699 		 * entry, then look for any other type of IRE.
700 		 */
701 		switch (af) {
702 		case AF_INET:
703 			if (net_mask == IP_HOST_MASK) {
704 				ire = ire_ftable_lookup_v4(dst_addr, 0, gw_addr,
705 				    IRE_LOCAL | IRE_LOOPBACK, NULL, zoneid,
706 				    tsl, match_flags_local, 0, ipst, NULL);
707 			}
708 			if (ire == NULL) {
709 				ire = ire_lookup_v4(dst_addr, net_mask,
710 				    gw_addr, ill, zoneid, tsl, match_flags,
711 				    ipst, &ifire, &v4setsrc, &gwattr);
712 				IN6_IPADDR_TO_V4MAPPED(v4setsrc, &v6setsrc);
713 			}
714 			break;
715 		case AF_INET6:
716 			if (IN6_ARE_ADDR_EQUAL(&net_mask_v6, &ipv6_all_ones)) {
717 				ire = ire_ftable_lookup_v6(&dst_addr_v6, NULL,
718 				    &gw_addr_v6, IRE_LOCAL | IRE_LOOPBACK, NULL,
719 				    zoneid, tsl, match_flags_local, 0, ipst,
720 				    NULL);
721 			}
722 			if (ire == NULL) {
723 				ire = ire_lookup_v6(&dst_addr_v6,
724 				    &net_mask_v6, &gw_addr_v6, ill, zoneid,
725 				    tsl, match_flags, ipst, &ifire, &v6setsrc,
726 				    &gwattr);
727 			}
728 			break;
729 		}
730 		if (tsl != NULL && tsl != crgetlabel(ioc_cr))
731 			label_rele(tsl);
732 
733 		if (ire == NULL) {
734 			error = ESRCH;
735 			goto done;
736 		}
737 		/*
738 		 * Want to return failure if we get an IRE_NOROUTE from
739 		 * ire_route_recursive
740 		 */
741 		if (ire->ire_type & IRE_NOROUTE) {
742 			ire_refrele(ire);
743 			ire = NULL;
744 			error = ESRCH;
745 			goto done;
746 		}
747 
748 		/* we know the IRE before we come here */
749 		switch (rtm->rtm_type) {
750 		case RTM_GET:
751 			mp1 = rts_rtmget(mp, ire, ifire, &v6setsrc, gwattr, af);
752 			if (mp1 == NULL) {
753 				error = ENOBUFS;
754 				goto done;
755 			}
756 			freemsg(mp);
757 			mp = mp1;
758 			rtm = (rt_msghdr_t *)mp->b_rptr;
759 			break;
760 		case RTM_CHANGE:
761 			/*
762 			 * Do not allow to the multirouting state of a route
763 			 * to be changed. This aims to prevent undesirable
764 			 * stages where both multirt and non-multirt routes
765 			 * for the same destination are declared.
766 			 */
767 			if ((ire->ire_flags & RTF_MULTIRT) !=
768 			    (rtm->rtm_flags & RTF_MULTIRT)) {
769 				error = EINVAL;
770 				goto done;
771 			}
772 			/*
773 			 * Note that we do not need to do
774 			 * ire_flush_cache_*(IRE_FLUSH_ADD) as a change
775 			 * in metrics or gateway will not affect existing
776 			 * routes since it does not create a more specific
777 			 * route.
778 			 */
779 			switch (af) {
780 			case AF_INET:
781 				if ((found_addrs & RTA_GATEWAY) != 0 &&
782 				    (ire->ire_gateway_addr != gw_addr)) {
783 					ire->ire_gateway_addr = gw_addr;
784 				}
785 
786 				if (rtsap != NULL) {
787 					ga.ga_af = AF_INET;
788 					IN6_IPADDR_TO_V4MAPPED(
789 					    ire->ire_gateway_addr, &ga.ga_addr);
790 
791 					gcgrp = gcgrp_lookup(&ga, B_TRUE);
792 					if (gcgrp == NULL) {
793 						error = ENOMEM;
794 						goto done;
795 					}
796 				}
797 
798 				if ((found_addrs & RTA_SRC) != 0 &&
799 				    (rtm->rtm_flags & RTF_SETSRC) != 0 &&
800 				    (ire->ire_setsrc_addr != src_addr)) {
801 					if (src_addr != INADDR_ANY) {
802 						uint_t type;
803 
804 						/*
805 						 * The RTF_SETSRC flag is
806 						 * present, check that the
807 						 * supplied src address is not
808 						 * the loopback address. This
809 						 * would produce martian
810 						 * packets.
811 						 */
812 						if (src_addr ==
813 						    htonl(INADDR_LOOPBACK)) {
814 							error = EINVAL;
815 							goto done;
816 						}
817 						/*
818 						 * Also check that the
819 						 * supplied addr is a valid
820 						 * local address.
821 						 */
822 						type = ip_type_v4(src_addr,
823 						    ipst);
824 						if (!(type &
825 						    (IRE_LOCAL|IRE_LOOPBACK))) {
826 							error = EADDRNOTAVAIL;
827 							goto done;
828 						}
829 						ire->ire_flags |= RTF_SETSRC;
830 						ire->ire_setsrc_addr =
831 						    src_addr;
832 					} else {
833 						ire->ire_flags &= ~RTF_SETSRC;
834 						ire->ire_setsrc_addr =
835 						    INADDR_ANY;
836 					}
837 					/*
838 					 * Let conn_ixa caching know that
839 					 * source address selection changed
840 					 */
841 					ip_update_source_selection(ipst);
842 				}
843 				ire_flush_cache_v4(ire, IRE_FLUSH_GWCHANGE);
844 				break;
845 			case AF_INET6:
846 				mutex_enter(&ire->ire_lock);
847 				if ((found_addrs & RTA_GATEWAY) != 0 &&
848 				    !IN6_ARE_ADDR_EQUAL(
849 				    &ire->ire_gateway_addr_v6, &gw_addr_v6)) {
850 					ire->ire_gateway_addr_v6 = gw_addr_v6;
851 				}
852 				mutex_exit(&ire->ire_lock);
853 
854 				if (rtsap != NULL) {
855 					ga.ga_af = AF_INET6;
856 					mutex_enter(&ire->ire_lock);
857 					ga.ga_addr = ire->ire_gateway_addr_v6;
858 					mutex_exit(&ire->ire_lock);
859 
860 					gcgrp = gcgrp_lookup(&ga, B_TRUE);
861 					if (gcgrp == NULL) {
862 						error = ENOMEM;
863 						goto done;
864 					}
865 				}
866 
867 				if ((found_addrs & RTA_SRC) != 0 &&
868 				    (rtm->rtm_flags & RTF_SETSRC) != 0 &&
869 				    !IN6_ARE_ADDR_EQUAL(
870 				    &ire->ire_setsrc_addr_v6, &src_addr_v6)) {
871 					if (!IN6_IS_ADDR_UNSPECIFIED(
872 					    &src_addr_v6)) {
873 						uint_t type;
874 
875 						/*
876 						 * The RTF_SETSRC flag is
877 						 * present, check that the
878 						 * supplied src address is not
879 						 * the loopback address. This
880 						 * would produce martian
881 						 * packets.
882 						 */
883 						if (IN6_IS_ADDR_LOOPBACK(
884 						    &src_addr_v6)) {
885 							error = EINVAL;
886 							goto done;
887 						}
888 						/*
889 						 * Also check that the
890 						 * supplied addr is a valid
891 						 * local address.
892 						 */
893 						type = ip_type_v6(&src_addr_v6,
894 						    ipst);
895 						if (!(type &
896 						    (IRE_LOCAL|IRE_LOOPBACK))) {
897 							error = EADDRNOTAVAIL;
898 							goto done;
899 						}
900 						mutex_enter(&ire->ire_lock);
901 						ire->ire_flags |= RTF_SETSRC;
902 						ire->ire_setsrc_addr_v6 =
903 						    src_addr_v6;
904 						mutex_exit(&ire->ire_lock);
905 					} else {
906 						mutex_enter(&ire->ire_lock);
907 						ire->ire_flags &= ~RTF_SETSRC;
908 						ire->ire_setsrc_addr_v6 =
909 						    ipv6_all_zeros;
910 						mutex_exit(&ire->ire_lock);
911 					}
912 					/*
913 					 * Let conn_ixa caching know that
914 					 * source address selection changed
915 					 */
916 					ip_update_source_selection(ipst);
917 				}
918 				ire_flush_cache_v6(ire, IRE_FLUSH_GWCHANGE);
919 				break;
920 			}
921 
922 			if (rtsap != NULL) {
923 				ASSERT(gcgrp != NULL);
924 
925 				/*
926 				 * Create and add the security attribute to
927 				 * prefix IRE; it will add a reference to the
928 				 * group upon allocating a new entry.  If it
929 				 * finds an already-existing entry for the
930 				 * security attribute, it simply returns it
931 				 * and no new group reference is made.
932 				 */
933 				gc = gc_create(rtsap, gcgrp, &gcgrp_xtraref);
934 				if (gc == NULL ||
935 				    (error = tsol_ire_init_gwattr(ire,
936 				    ire->ire_ipversion, gc)) != 0) {
937 					if (gc != NULL) {
938 						GC_REFRELE(gc);
939 					} else {
940 						/* gc_create failed */
941 						error = ENOMEM;
942 					}
943 					goto done;
944 				}
945 			}
946 			rts_setmetrics(ire, rtm->rtm_inits, &rtm->rtm_rmx);
947 			break;
948 		}
949 		break;
950 	default:
951 		error = EOPNOTSUPP;
952 		break;
953 	}
954 done:
955 	if (ire != NULL)
956 		ire_refrele(ire);
957 	if (ifire != NULL)
958 		ire_refrele(ifire);
959 	if (ill != NULL)
960 		ill_refrele(ill);
961 
962 	if (gcgrp_xtraref)
963 		GCGRP_REFRELE(gcgrp);
964 
965 	if (rtm != NULL) {
966 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
967 		if (error != 0) {
968 			rtm->rtm_errno = error;
969 			/* Send error ACK */
970 			ip1dbg(("ip_rts_request: error %d\n", error));
971 		} else {
972 			rtm->rtm_flags |= RTF_DONE;
973 			/* OK ACK already set up by caller except this */
974 			ip2dbg(("ip_rts_request: OK ACK\n"));
975 		}
976 		rts_queue_input(mp, connp, af, RTSQ_ALL, ipst);
977 	}
978 	return (error);
979 }
980 
981 /*
982  * Helper function that can do recursive lookups including when
983  * MATCH_IRE_GW and/or MATCH_IRE_MASK is set.
984  */
985 static ire_t *
986 ire_lookup_v4(ipaddr_t dst_addr, ipaddr_t net_mask, ipaddr_t gw_addr,
987     const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl,
988     int match_flags, ip_stack_t *ipst, ire_t **pifire, ipaddr_t *v4setsrcp,
989     tsol_ire_gw_secattr_t **gwattrp)
990 {
991 	ire_t		*ire;
992 	ire_t		*ifire = NULL;
993 	uint_t		ire_type;
994 
995 	*pifire = NULL;
996 	*v4setsrcp = INADDR_ANY;
997 	*gwattrp = NULL;
998 
999 	/* Skip IRE_IF_CLONE */
1000 	match_flags |= MATCH_IRE_TYPE;
1001 	ire_type = (IRE_ONLINK|IRE_OFFLINK) & ~IRE_IF_CLONE;
1002 
1003 	/*
1004 	 * ire_route_recursive can't match gateway or mask thus if they are
1005 	 * set we have to do two steps of lookups
1006 	 */
1007 	if (match_flags & (MATCH_IRE_GW|MATCH_IRE_MASK)) {
1008 		ire = ire_ftable_lookup_v4(dst_addr, net_mask, gw_addr,
1009 		    ire_type, ill, zoneid, tsl, match_flags, 0, ipst, NULL);
1010 
1011 		if (ire == NULL ||(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)))
1012 			return (ire);
1013 
1014 		if (ire->ire_type & IRE_ONLINK)
1015 			return (ire);
1016 
1017 		if (ire->ire_flags & RTF_SETSRC) {
1018 			ASSERT(ire->ire_setsrc_addr != INADDR_ANY);
1019 			*v4setsrcp = ire->ire_setsrc_addr;
1020 			v4setsrcp = NULL;
1021 		}
1022 
1023 		/* The first ire_gw_secattr is passed back */
1024 		if (ire->ire_gw_secattr != NULL) {
1025 			*gwattrp = ire->ire_gw_secattr;
1026 			gwattrp = NULL;
1027 		}
1028 
1029 		/* Look for an interface ire recursively based on the gateway */
1030 		dst_addr = ire->ire_gateway_addr;
1031 		match_flags &= ~(MATCH_IRE_GW|MATCH_IRE_MASK);
1032 		/*
1033 		 * Don't allow anything unusual past the first iteration.
1034 		 * After the first lookup, we should no longer look for
1035 		 * (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST) or RTF_INDIRECT
1036 		 * routes.
1037 		 *
1038 		 * In addition, after we have found a direct IRE_OFFLINK,
1039 		 * we should only look for interface or clone routes.
1040 		 */
1041 		match_flags |= MATCH_IRE_DIRECT; /* no more RTF_INDIRECTs */
1042 
1043 		if ((ire->ire_type & IRE_OFFLINK) &&
1044 		    !(ire->ire_flags & RTF_INDIRECT)) {
1045 			ire_type = IRE_IF_ALL;
1046 		} else {
1047 			/*
1048 			 * no more local, loopback, broadcast routes
1049 			 */
1050 			if (!(match_flags & MATCH_IRE_TYPE))
1051 				ire_type = (IRE_OFFLINK|IRE_ONLINK);
1052 			ire_type &= ~(IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST);
1053 		}
1054 		match_flags |= MATCH_IRE_TYPE;
1055 
1056 		ifire = ire_route_recursive_v4(dst_addr, ire_type, ill, zoneid,
1057 		    tsl, match_flags, IRR_INCOMPLETE, 0, ipst, v4setsrcp,
1058 		    gwattrp, NULL);
1059 	} else {
1060 		ire = ire_route_recursive_v4(dst_addr, ire_type, ill, zoneid,
1061 		    tsl, match_flags, IRR_INCOMPLETE, 0, ipst, v4setsrcp,
1062 		    gwattrp, NULL);
1063 	}
1064 	*pifire = ifire;
1065 	return (ire);
1066 }
1067 
1068 static ire_t *
1069 ire_lookup_v6(const in6_addr_t *dst_addr_v6,
1070     const in6_addr_t *net_mask_v6, const in6_addr_t *gw_addr_v6,
1071     const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, int match_flags,
1072     ip_stack_t *ipst, ire_t **pifire,
1073     in6_addr_t *v6setsrcp, tsol_ire_gw_secattr_t **gwattrp)
1074 {
1075 	ire_t		*ire;
1076 	ire_t		*ifire = NULL;
1077 	uint_t		ire_type;
1078 
1079 	*pifire = NULL;
1080 	*v6setsrcp = ipv6_all_zeros;
1081 	*gwattrp = NULL;
1082 
1083 	/* Skip IRE_IF_CLONE */
1084 	match_flags |= MATCH_IRE_TYPE;
1085 	ire_type = (IRE_ONLINK|IRE_OFFLINK) & ~IRE_IF_CLONE;
1086 
1087 	/*
1088 	 * ire_route_recursive can't match gateway or mask thus if they are
1089 	 * set we have to do two steps of lookups
1090 	 */
1091 	if (match_flags & (MATCH_IRE_GW|MATCH_IRE_MASK)) {
1092 		in6_addr_t dst;
1093 
1094 		ire = ire_ftable_lookup_v6(dst_addr_v6, net_mask_v6,
1095 		    gw_addr_v6, ire_type, ill, zoneid, tsl, match_flags, 0,
1096 		    ipst, NULL);
1097 
1098 		if (ire == NULL ||(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)))
1099 			return (ire);
1100 
1101 		if (ire->ire_type & IRE_ONLINK)
1102 			return (ire);
1103 
1104 		if (ire->ire_flags & RTF_SETSRC) {
1105 			ASSERT(!IN6_IS_ADDR_UNSPECIFIED(
1106 			    &ire->ire_setsrc_addr_v6));
1107 			*v6setsrcp = ire->ire_setsrc_addr_v6;
1108 			v6setsrcp = NULL;
1109 		}
1110 
1111 		/* The first ire_gw_secattr is passed back */
1112 		if (ire->ire_gw_secattr != NULL) {
1113 			*gwattrp = ire->ire_gw_secattr;
1114 			gwattrp = NULL;
1115 		}
1116 
1117 		mutex_enter(&ire->ire_lock);
1118 		dst = ire->ire_gateway_addr_v6;
1119 		mutex_exit(&ire->ire_lock);
1120 		match_flags &= ~(MATCH_IRE_GW|MATCH_IRE_MASK);
1121 		/*
1122 		 * Don't allow anything unusual past the first iteration.
1123 		 * After the first lookup, we should no longer look for
1124 		 * (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST) or RTF_INDIRECT
1125 		 * routes.
1126 		 *
1127 		 * In addition, after we have found a direct IRE_OFFLINK,
1128 		 * we should only look for interface or clone routes.
1129 		 */
1130 		match_flags |= MATCH_IRE_DIRECT; /* no more RTF_INDIRECTs */
1131 
1132 		if ((ire->ire_type & IRE_OFFLINK) &&
1133 		    !(ire->ire_flags & RTF_INDIRECT)) {
1134 			ire_type = IRE_IF_ALL;
1135 		} else {
1136 			/*
1137 			 * no more local, loopback routes
1138 			 */
1139 			if (!(match_flags & MATCH_IRE_TYPE))
1140 				ire_type = (IRE_OFFLINK|IRE_ONLINK);
1141 			ire_type &= ~(IRE_LOCAL|IRE_LOOPBACK);
1142 		}
1143 		match_flags |= MATCH_IRE_TYPE;
1144 
1145 		ifire = ire_route_recursive_v6(&dst, ire_type, ill, zoneid, tsl,
1146 		    match_flags, IRR_INCOMPLETE, 0, ipst, v6setsrcp, gwattrp,
1147 		    NULL);
1148 	} else {
1149 		ire = ire_route_recursive_v6(dst_addr_v6, ire_type, ill, zoneid,
1150 		    tsl, match_flags, IRR_INCOMPLETE, 0, ipst, v6setsrcp,
1151 		    gwattrp, NULL);
1152 	}
1153 	*pifire = ifire;
1154 	return (ire);
1155 }
1156 
1157 
1158 /*
1159  * Handle IP_IOC_RTS_REQUEST ioctls
1160  */
1161 int
1162 ip_rts_request(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
1163 {
1164 	conn_t	*connp = Q_TO_CONN(q);
1165 	IOCP	iocp = (IOCP)mp->b_rptr;
1166 	mblk_t	*mp1, *ioc_mp = mp;
1167 	int	error = 0;
1168 	ip_stack_t	*ipst;
1169 
1170 	ipst = connp->conn_netstack->netstack_ip;
1171 
1172 	ASSERT(mp->b_cont != NULL);
1173 	/* ioc_mp holds mp */
1174 	mp = mp->b_cont;
1175 
1176 	/*
1177 	 * The Routing Socket data starts on
1178 	 * next block. If there is no next block
1179 	 * this is an indication from routing module
1180 	 * that it is a routing socket stream queue.
1181 	 * We need to support that for compatibility with SDP since
1182 	 * it has a contract private interface to use IP_IOC_RTS_REQUEST.
1183 	 * Note: SDP no longer uses IP_IOC_RTS_REQUEST - we can remove this.
1184 	 */
1185 	if (mp->b_cont == NULL) {
1186 		/*
1187 		 * This is a message from SDP
1188 		 * indicating that this is a Routing Socket
1189 		 * Stream. Insert this conn_t in routing
1190 		 * socket client list.
1191 		 */
1192 		connp->conn_useloopback = 1;
1193 		ipcl_hash_insert_wildcard(ipst->ips_rts_clients, connp);
1194 		goto done;
1195 	}
1196 	mp1 = dupmsg(mp->b_cont);
1197 	if (mp1 == NULL) {
1198 		error  = ENOBUFS;
1199 		goto done;
1200 	}
1201 	mp = mp1;
1202 
1203 	error = ip_rts_request_common(mp, connp, ioc_cr);
1204 done:
1205 	iocp->ioc_error = error;
1206 	ioc_mp->b_datap->db_type = M_IOCACK;
1207 	if (iocp->ioc_error != 0)
1208 		iocp->ioc_count = 0;
1209 	/* Note that we pass a NULL ira to rts_input */
1210 	(connp->conn_recv)(connp, ioc_mp, NULL, NULL);
1211 
1212 	/* conn was refheld in ip_wput_ioctl. */
1213 	CONN_DEC_IOCTLREF(connp);
1214 	CONN_OPER_PENDING_DONE(connp);
1215 
1216 	return (error);
1217 }
1218 
1219 /*
1220  * Build a reply to the RTM_GET request contained in the given message block
1221  * using the retrieved IRE of the destination address, the parent IRE (if it
1222  * exists) and the address family.
1223  *
1224  * Returns a pointer to a message block containing the reply if successful,
1225  * otherwise NULL is returned.
1226  */
1227 static mblk_t *
1228 rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *ifire, const in6_addr_t *setsrc,
1229     tsol_ire_gw_secattr_t *attrp, sa_family_t af)
1230 {
1231 	rt_msghdr_t	*rtm;
1232 	rt_msghdr_t	*new_rtm;
1233 	mblk_t		*new_mp;
1234 	int		rtm_addrs;
1235 	int		rtm_flags;
1236 	tsol_gc_t	*gc = NULL;
1237 	tsol_gcgrp_t	*gcgrp = NULL;
1238 	ill_t		*ill;
1239 	ipif_t		*ipif = NULL;
1240 	ipaddr_t	brdaddr;	/* IFF_POINTOPOINT destination */
1241 	ipaddr_t	ifaddr;
1242 	in6_addr_t	brdaddr6;	/* IFF_POINTOPOINT destination */
1243 	in6_addr_t	ifaddr6;
1244 	ipaddr_t	v4setsrc;
1245 
1246 	rtm = (rt_msghdr_t *)mp->b_rptr;
1247 	ifaddr = 0;
1248 	brdaddr = 0;
1249 	rtm_flags = 0;
1250 
1251 	/*
1252 	 * Find the ill used to send packets. This will be NULL in case
1253 	 * of a reject or blackhole.
1254 	 */
1255 	if (ifire != NULL)
1256 		ill = ire_nexthop_ill(ifire);
1257 	else
1258 		ill = ire_nexthop_ill(ire);
1259 
1260 	if (attrp != NULL) {
1261 		mutex_enter(&attrp->igsa_lock);
1262 		if ((gc = attrp->igsa_gc) != NULL) {
1263 			gcgrp = gc->gc_grp;
1264 			ASSERT(gcgrp != NULL);
1265 			rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
1266 		}
1267 		mutex_exit(&attrp->igsa_lock);
1268 	}
1269 
1270 	/*
1271 	 * Always return RTA_DST, RTA_GATEWAY and RTA_NETMASK.
1272 	 *
1273 	 * The 4.4BSD-Lite2 code (net/rtsock.c) returns both
1274 	 * RTA_IFP and RTA_IFA if either is defined, and also
1275 	 * returns RTA_BRD if the appropriate interface is
1276 	 * point-to-point.
1277 	 */
1278 	rtm_addrs = (RTA_DST | RTA_GATEWAY | RTA_NETMASK);
1279 	if ((rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) && ill != NULL) {
1280 		rtm_addrs |= (RTA_IFP | RTA_IFA);
1281 		/*
1282 		 * We associate an IRE with an ILL, hence we don't exactly
1283 		 * know what might make sense for RTA_IFA and RTA_BRD. We
1284 		 * pick the first ipif on the ill.
1285 		 */
1286 		ipif = ipif_get_next_ipif(NULL, ill);
1287 		if (ipif != NULL) {
1288 			if (ipif->ipif_isv6)
1289 				ifaddr6 = ipif->ipif_v6lcl_addr;
1290 			else
1291 				ifaddr = ipif->ipif_lcl_addr;
1292 			if (ipif->ipif_flags & IPIF_POINTOPOINT) {
1293 				rtm_addrs |= RTA_BRD;
1294 				if (ipif->ipif_isv6)
1295 					brdaddr6 = ipif->ipif_v6pp_dst_addr;
1296 				else
1297 					brdaddr = ipif->ipif_pp_dst_addr;
1298 			}
1299 			ipif_refrele(ipif);
1300 		}
1301 	}
1302 
1303 	new_mp = rts_alloc_msg(RTM_GET, rtm_addrs, af, gc != NULL ? 1 : 0);
1304 	if (new_mp == NULL) {
1305 		if (gcgrp != NULL)
1306 			rw_exit(&gcgrp->gcgrp_rwlock);
1307 		if (ill != NULL)
1308 			ill_refrele(ill);
1309 		return (NULL);
1310 	}
1311 
1312 	/*
1313 	 * We set the destination address, gateway address,
1314 	 * netmask and flags in the RTM_GET response depending
1315 	 * on whether we found a parent IRE or not.
1316 	 * In particular, if we did find a parent IRE during the
1317 	 * recursive search, use that IRE's gateway address.
1318 	 * Otherwise, we use the IRE's source address for the
1319 	 * gateway address.
1320 	 */
1321 	ASSERT(af == AF_INET || af == AF_INET6);
1322 	switch (af) {
1323 	case AF_INET:
1324 		IN6_V4MAPPED_TO_IPADDR(setsrc, v4setsrc);
1325 		if (v4setsrc != INADDR_ANY)
1326 			rtm_addrs |= RTA_SRC;
1327 
1328 		rtm_flags = ire->ire_flags;
1329 		rts_fill_msg(RTM_GET, rtm_addrs, ire->ire_addr,
1330 		    ire->ire_mask, ire->ire_gateway_addr, v4setsrc,
1331 		    brdaddr, 0, ifaddr, ill, new_mp, gc);
1332 		break;
1333 	case AF_INET6:
1334 		if (!IN6_IS_ADDR_UNSPECIFIED(setsrc))
1335 			rtm_addrs |= RTA_SRC;
1336 
1337 		rtm_flags = ire->ire_flags;
1338 		rts_fill_msg_v6(RTM_GET, rtm_addrs, &ire->ire_addr_v6,
1339 		    &ire->ire_mask_v6, &ire->ire_gateway_addr_v6,
1340 		    setsrc, &brdaddr6, &ipv6_all_zeros,
1341 		    &ifaddr6, ill, new_mp, gc);
1342 		break;
1343 	}
1344 
1345 	if (gcgrp != NULL)
1346 		rw_exit(&gcgrp->gcgrp_rwlock);
1347 
1348 	new_rtm = (rt_msghdr_t *)new_mp->b_rptr;
1349 
1350 	/*
1351 	 * The rtm_msglen, rtm_version and rtm_type fields in
1352 	 * RTM_GET response are filled in by rts_fill_msg.
1353 	 *
1354 	 * rtm_addrs and rtm_flags are filled in based on what
1355 	 * was requested and the state of the IREs looked up
1356 	 * above.
1357 	 *
1358 	 * rtm_inits and rtm_rmx are filled in with metrics
1359 	 * based on whether a parent IRE was found or not.
1360 	 *
1361 	 * TODO: rtm_index and rtm_use should probably be
1362 	 * filled in with something resonable here and not just
1363 	 * copied from the request.
1364 	 */
1365 	new_rtm->rtm_index = rtm->rtm_index;
1366 	new_rtm->rtm_pid = rtm->rtm_pid;
1367 	new_rtm->rtm_seq = rtm->rtm_seq;
1368 	new_rtm->rtm_use = rtm->rtm_use;
1369 	new_rtm->rtm_addrs = rtm_addrs;
1370 	new_rtm->rtm_flags = rtm_flags;
1371 	new_rtm->rtm_inits = rts_getmetrics(ire, ill, &new_rtm->rtm_rmx);
1372 	if (ill != NULL)
1373 		ill_refrele(ill);
1374 	return (new_mp);
1375 }
1376 
1377 /*
1378  * Fill the given if_data_t with interface statistics.
1379  */
1380 static void
1381 rts_getifdata(if_data_t *if_data, const ipif_t *ipif)
1382 {
1383 	if_data->ifi_type = ipif->ipif_ill->ill_type;
1384 						/* ethernet, tokenring, etc */
1385 	if_data->ifi_addrlen = 0;		/* media address length */
1386 	if_data->ifi_hdrlen = 0;		/* media header length */
1387 	if_data->ifi_mtu = ipif->ipif_ill->ill_mtu;	/* mtu */
1388 						/* metric (external only) */
1389 	if_data->ifi_metric = ipif->ipif_ill->ill_metric;
1390 	if_data->ifi_baudrate = 0;		/* linespeed */
1391 
1392 	if_data->ifi_ipackets = 0;		/* packets received on if */
1393 	if_data->ifi_ierrors = 0;		/* input errors on interface */
1394 	if_data->ifi_opackets = 0;		/* packets sent on interface */
1395 	if_data->ifi_oerrors = 0;		/* output errors on if */
1396 	if_data->ifi_collisions = 0;		/* collisions on csma if */
1397 	if_data->ifi_ibytes = 0;		/* total number received */
1398 	if_data->ifi_obytes = 0;		/* total number sent */
1399 	if_data->ifi_imcasts = 0;		/* multicast packets received */
1400 	if_data->ifi_omcasts = 0;		/* multicast packets sent */
1401 	if_data->ifi_iqdrops = 0;		/* dropped on input */
1402 	if_data->ifi_noproto = 0;		/* destined for unsupported */
1403 						/* protocol. */
1404 }
1405 
1406 /*
1407  * Set the metrics on a forwarding table route.
1408  */
1409 static void
1410 rts_setmetrics(ire_t *ire, uint_t which, rt_metrics_t *metrics)
1411 {
1412 	clock_t		rtt;
1413 	clock_t		rtt_sd;
1414 	ill_t		*ill;
1415 	ifrt_t		*ifrt;
1416 	mblk_t		*mp;
1417 	in6_addr_t	gw_addr_v6 = { 0 };
1418 
1419 	/* Need to add back some metrics to the IRE? */
1420 	/*
1421 	 * Bypass obtaining the lock and searching ill_saved_ire_mp in the
1422 	 * common case of no metrics.
1423 	 */
1424 	if (which == 0)
1425 		return;
1426 	ire->ire_metrics.iulp_set = B_TRUE;
1427 
1428 	/*
1429 	 * iulp_rtt and iulp_rtt_sd are in milliseconds, but 4.4BSD-Lite2's
1430 	 * <net/route.h> says: rmx_rtt and rmx_rttvar are stored as
1431 	 * microseconds.
1432 	 */
1433 	rtt = 0;
1434 	if (which & RTV_RTT)
1435 		rtt = metrics->rmx_rtt / 1000;
1436 	if (which & RTV_RTTVAR)
1437 		rtt_sd = metrics->rmx_rttvar / 1000;
1438 
1439 	/*
1440 	 * Update the metrics in the IRE itself.
1441 	 */
1442 	mutex_enter(&ire->ire_lock);
1443 	if (which & RTV_MTU)
1444 		ire->ire_metrics.iulp_mtu = metrics->rmx_mtu;
1445 	if (which & RTV_RTT)
1446 		ire->ire_metrics.iulp_rtt = rtt;
1447 	if (which & RTV_SSTHRESH)
1448 		ire->ire_metrics.iulp_ssthresh = metrics->rmx_ssthresh;
1449 	if (which & RTV_RTTVAR)
1450 		ire->ire_metrics.iulp_rtt_sd = rtt_sd;
1451 	if (which & RTV_SPIPE)
1452 		ire->ire_metrics.iulp_spipe = metrics->rmx_sendpipe;
1453 	if (which & RTV_RPIPE)
1454 		ire->ire_metrics.iulp_rpipe = metrics->rmx_recvpipe;
1455 	mutex_exit(&ire->ire_lock);
1456 
1457 	/*
1458 	 * Search through the ifrt_t chain hanging off the ILL in order to
1459 	 * reflect the metric change there.
1460 	 */
1461 	ill = ire->ire_ill;
1462 	if (ill == NULL)
1463 		return;
1464 	ASSERT((ill->ill_isv6 && ire->ire_ipversion == IPV6_VERSION) ||
1465 	    ((!ill->ill_isv6 && ire->ire_ipversion == IPV4_VERSION)));
1466 	if (ill->ill_isv6) {
1467 		mutex_enter(&ire->ire_lock);
1468 		gw_addr_v6 = ire->ire_gateway_addr_v6;
1469 		mutex_exit(&ire->ire_lock);
1470 	}
1471 	mutex_enter(&ill->ill_saved_ire_lock);
1472 	for (mp = ill->ill_saved_ire_mp; mp != NULL; mp = mp->b_cont) {
1473 		/*
1474 		 * On a given ill, the tuple of address, gateway, mask,
1475 		 * ire_type and zoneid unique for each saved IRE.
1476 		 */
1477 		ifrt = (ifrt_t *)mp->b_rptr;
1478 		if (ill->ill_isv6) {
1479 			if (!IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6addr,
1480 			    &ire->ire_addr_v6) ||
1481 			    !IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6gateway_addr,
1482 			    &gw_addr_v6) ||
1483 			    !IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6mask,
1484 			    &ire->ire_mask_v6))
1485 				continue;
1486 		} else {
1487 			if (ifrt->ifrt_addr != ire->ire_addr ||
1488 			    ifrt->ifrt_gateway_addr != ire->ire_gateway_addr ||
1489 			    ifrt->ifrt_mask != ire->ire_mask)
1490 				continue;
1491 		}
1492 		if (ifrt->ifrt_zoneid != ire->ire_zoneid ||
1493 		    ifrt->ifrt_type != ire->ire_type)
1494 			continue;
1495 
1496 		if (which & RTV_MTU)
1497 			ifrt->ifrt_metrics.iulp_mtu = metrics->rmx_mtu;
1498 		if (which & RTV_RTT)
1499 			ifrt->ifrt_metrics.iulp_rtt = rtt;
1500 		if (which & RTV_SSTHRESH) {
1501 			ifrt->ifrt_metrics.iulp_ssthresh =
1502 			    metrics->rmx_ssthresh;
1503 		}
1504 		if (which & RTV_RTTVAR)
1505 			ifrt->ifrt_metrics.iulp_rtt_sd = metrics->rmx_rttvar;
1506 		if (which & RTV_SPIPE)
1507 			ifrt->ifrt_metrics.iulp_spipe = metrics->rmx_sendpipe;
1508 		if (which & RTV_RPIPE)
1509 			ifrt->ifrt_metrics.iulp_rpipe = metrics->rmx_recvpipe;
1510 		break;
1511 	}
1512 	mutex_exit(&ill->ill_saved_ire_lock);
1513 
1514 	/*
1515 	 * Update any IRE_IF_CLONE hanging created from this IRE_IF so they
1516 	 * get any new iulp_mtu.
1517 	 * We do that by deleting them; ire_create_if_clone will pick
1518 	 * up the new metrics.
1519 	 */
1520 	if ((ire->ire_type & IRE_INTERFACE) && ire->ire_dep_children != 0)
1521 		ire_dep_delete_if_clone(ire);
1522 }
1523 
1524 /*
1525  * Get the metrics from a forwarding table route.
1526  */
1527 static int
1528 rts_getmetrics(ire_t *ire, ill_t *ill, rt_metrics_t *metrics)
1529 {
1530 	int	metrics_set = 0;
1531 
1532 	bzero(metrics, sizeof (rt_metrics_t));
1533 
1534 	/*
1535 	 * iulp_rtt and iulp_rtt_sd are in milliseconds, but 4.4BSD-Lite2's
1536 	 * <net/route.h> says: rmx_rtt and rmx_rttvar are stored as
1537 	 * microseconds.
1538 	 */
1539 	metrics->rmx_rtt = ire->ire_metrics.iulp_rtt * 1000;
1540 	metrics_set |= RTV_RTT;
1541 	if (ire->ire_metrics.iulp_mtu != 0) {
1542 		metrics->rmx_mtu = ire->ire_metrics.iulp_mtu;
1543 		metrics_set |= RTV_MTU;
1544 	} else if (ill != NULL) {
1545 		metrics->rmx_mtu = ill->ill_mtu;
1546 		metrics_set |= RTV_MTU;
1547 	}
1548 	metrics->rmx_ssthresh = ire->ire_metrics.iulp_ssthresh;
1549 	metrics_set |= RTV_SSTHRESH;
1550 	metrics->rmx_rttvar = ire->ire_metrics.iulp_rtt_sd * 1000;
1551 	metrics_set |= RTV_RTTVAR;
1552 	metrics->rmx_sendpipe = ire->ire_metrics.iulp_spipe;
1553 	metrics_set |= RTV_SPIPE;
1554 	metrics->rmx_recvpipe = ire->ire_metrics.iulp_rpipe;
1555 	metrics_set |= RTV_RPIPE;
1556 	return (metrics_set);
1557 }
1558 
1559 /*
1560  * Given two sets of metrics (src and dst), use the dst values if they are
1561  * set. If a dst value is not set but the src value is set, then we use
1562  * the src value.
1563  * dst is updated with the new values.
1564  * This is used to merge information from a dce_t and ire_metrics, where the
1565  * dce values takes precedence.
1566  */
1567 void
1568 rts_merge_metrics(iulp_t *dst, const iulp_t *src)
1569 {
1570 	if (!src->iulp_set)
1571 		return;
1572 
1573 	if (dst->iulp_ssthresh == 0)
1574 		dst->iulp_ssthresh = src->iulp_ssthresh;
1575 	if (dst->iulp_rtt == 0)
1576 		dst->iulp_rtt = src->iulp_rtt;
1577 	if (dst->iulp_rtt_sd == 0)
1578 		dst->iulp_rtt_sd = src->iulp_rtt_sd;
1579 	if (dst->iulp_spipe == 0)
1580 		dst->iulp_spipe = src->iulp_spipe;
1581 	if (dst->iulp_rpipe == 0)
1582 		dst->iulp_rpipe = src->iulp_rpipe;
1583 	if (dst->iulp_rtomax == 0)
1584 		dst->iulp_rtomax = src->iulp_rtomax;
1585 	if (dst->iulp_sack == 0)
1586 		dst->iulp_sack = src->iulp_sack;
1587 	if (dst->iulp_tstamp_ok == 0)
1588 		dst->iulp_tstamp_ok = src->iulp_tstamp_ok;
1589 	if (dst->iulp_wscale_ok == 0)
1590 		dst->iulp_wscale_ok = src->iulp_wscale_ok;
1591 	if (dst->iulp_ecn_ok == 0)
1592 		dst->iulp_ecn_ok = src->iulp_ecn_ok;
1593 	if (dst->iulp_pmtud_ok == 0)
1594 		dst->iulp_pmtud_ok = src->iulp_pmtud_ok;
1595 	if (dst->iulp_mtu == 0)
1596 		dst->iulp_mtu = src->iulp_mtu;
1597 }
1598 
1599 
1600 /*
1601  * Takes a pointer to a routing message and extracts necessary info by looking
1602  * at the rtm->rtm_addrs bits and store the requested sockaddrs in the pointers
1603  * passed (all of which must be valid).
1604  *
1605  * The bitmask of sockaddrs actually found in the message is returned, or zero
1606  * is returned in the case of an error.
1607  */
1608 static int
1609 rts_getaddrs(rt_msghdr_t *rtm, in6_addr_t *dst_addrp, in6_addr_t *gw_addrp,
1610     in6_addr_t *net_maskp, in6_addr_t *authorp, in6_addr_t *if_addrp,
1611     in6_addr_t *in_src_addrp, ushort_t *indexp, sa_family_t *afp,
1612     tsol_rtsecattr_t *rtsecattr, int *error)
1613 {
1614 	struct sockaddr *sa;
1615 	int	i;
1616 	int	addr_bits;
1617 	int	length;
1618 	int	found_addrs = 0;
1619 	caddr_t	cp;
1620 	size_t	size;
1621 	struct sockaddr_dl *sdl;
1622 
1623 	*dst_addrp = ipv6_all_zeros;
1624 	*gw_addrp = ipv6_all_zeros;
1625 	*net_maskp = ipv6_all_zeros;
1626 	*authorp = ipv6_all_zeros;
1627 	*if_addrp = ipv6_all_zeros;
1628 	*in_src_addrp = ipv6_all_zeros;
1629 	*indexp = 0;
1630 	*afp = AF_UNSPEC;
1631 	rtsecattr->rtsa_cnt = 0;
1632 	*error = 0;
1633 
1634 	/*
1635 	 * At present we handle only RTA_DST, RTA_GATEWAY, RTA_NETMASK, RTA_IFP,
1636 	 * RTA_IFA and RTA_AUTHOR.  The rest will be added as we need them.
1637 	 */
1638 	cp = (caddr_t)&rtm[1];
1639 	length = rtm->rtm_msglen;
1640 	for (i = 0; (i < RTA_NUMBITS) && ((cp - (caddr_t)rtm) < length); i++) {
1641 		/*
1642 		 * The address family we are working with starts out as
1643 		 * AF_UNSPEC, but is set to the one specified with the
1644 		 * destination address.
1645 		 *
1646 		 * If the "working" address family that has been set to
1647 		 * something other than AF_UNSPEC, then the address family of
1648 		 * subsequent sockaddrs must either be AF_UNSPEC (for
1649 		 * compatibility with older programs) or must be the same as our
1650 		 * "working" one.
1651 		 *
1652 		 * This code assumes that RTA_DST (1) comes first in the loop.
1653 		 */
1654 		sa = (struct sockaddr *)cp;
1655 		addr_bits = (rtm->rtm_addrs & (1 << i));
1656 		if (addr_bits == 0)
1657 			continue;
1658 		switch (addr_bits) {
1659 		case RTA_DST:
1660 			size = rts_copyfromsockaddr(sa, dst_addrp);
1661 			*afp = sa->sa_family;
1662 			break;
1663 		case RTA_GATEWAY:
1664 			if (sa->sa_family != *afp && sa->sa_family != AF_UNSPEC)
1665 				return (0);
1666 			size = rts_copyfromsockaddr(sa, gw_addrp);
1667 			break;
1668 		case RTA_NETMASK:
1669 			if (sa->sa_family != *afp && sa->sa_family != AF_UNSPEC)
1670 				return (0);
1671 			size = rts_copyfromsockaddr(sa, net_maskp);
1672 			break;
1673 		case RTA_IFP:
1674 			if (sa->sa_family != AF_LINK &&
1675 			    sa->sa_family != AF_UNSPEC)
1676 				return (0);
1677 			sdl = (struct sockaddr_dl *)cp;
1678 			*indexp = sdl->sdl_index;
1679 			size = sizeof (struct sockaddr_dl);
1680 			break;
1681 		case RTA_SRC:
1682 			/* Source address of the incoming packet */
1683 			size = rts_copyfromsockaddr(sa, in_src_addrp);
1684 			*afp = sa->sa_family;
1685 			break;
1686 		case RTA_IFA:
1687 			if (sa->sa_family != *afp && sa->sa_family != AF_UNSPEC)
1688 				return (0);
1689 			size = rts_copyfromsockaddr(sa, if_addrp);
1690 			break;
1691 		case RTA_AUTHOR:
1692 			if (sa->sa_family != *afp && sa->sa_family != AF_UNSPEC)
1693 				return (0);
1694 			size = rts_copyfromsockaddr(sa, authorp);
1695 			break;
1696 		default:
1697 			return (0);
1698 		}
1699 		if (size == 0)
1700 			return (0);
1701 		cp += size;
1702 		found_addrs |= addr_bits;
1703 	}
1704 
1705 	/*
1706 	 * Parse the routing message and look for any security-
1707 	 * related attributes for the route.  For each valid
1708 	 * attribute, allocate/obtain the corresponding kernel
1709 	 * route security attributes.
1710 	 */
1711 	if (((cp - (caddr_t)rtm) < length) && is_system_labeled()) {
1712 		*error = tsol_rtsa_init(rtm, rtsecattr, cp);
1713 		ASSERT(rtsecattr->rtsa_cnt <= TSOL_RTSA_REQUEST_MAX);
1714 	}
1715 
1716 	return (found_addrs);
1717 }
1718 
1719 /*
1720  * Fills the message with the given info.
1721  */
1722 static void
1723 rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst, ipaddr_t mask,
1724     ipaddr_t gateway, ipaddr_t src_addr, ipaddr_t brd_addr, ipaddr_t author,
1725     ipaddr_t ifaddr, const ill_t *ill, mblk_t *mp,
1726     const tsol_gc_t *gc)
1727 {
1728 	rt_msghdr_t	*rtm;
1729 	sin_t		*sin;
1730 	size_t		data_size, header_size;
1731 	uchar_t		*cp;
1732 	int		i;
1733 
1734 	ASSERT(mp != NULL);
1735 	/*
1736 	 * First find the type of the message
1737 	 * and its length.
1738 	 */
1739 	header_size = rts_header_msg_size(type);
1740 	/*
1741 	 * Now find the size of the data
1742 	 * that follows the message header.
1743 	 */
1744 	data_size = rts_data_msg_size(rtm_addrs, AF_INET, gc != NULL ? 1 : 0);
1745 
1746 	rtm = (rt_msghdr_t *)mp->b_rptr;
1747 	mp->b_wptr = &mp->b_rptr[header_size];
1748 	cp = mp->b_wptr;
1749 	bzero(cp, data_size);
1750 	for (i = 0; i < RTA_NUMBITS; i++) {
1751 		sin = (sin_t *)cp;
1752 		switch (rtm_addrs & (1 << i)) {
1753 		case RTA_DST:
1754 			sin->sin_addr.s_addr = dst;
1755 			sin->sin_family = AF_INET;
1756 			cp += sizeof (sin_t);
1757 			break;
1758 		case RTA_GATEWAY:
1759 			sin->sin_addr.s_addr = gateway;
1760 			sin->sin_family = AF_INET;
1761 			cp += sizeof (sin_t);
1762 			break;
1763 		case RTA_NETMASK:
1764 			sin->sin_addr.s_addr = mask;
1765 			sin->sin_family = AF_INET;
1766 			cp += sizeof (sin_t);
1767 			break;
1768 		case RTA_IFP:
1769 			cp += ill_dls_info((struct sockaddr_dl *)cp, ill);
1770 			break;
1771 		case RTA_IFA:
1772 			sin->sin_addr.s_addr = ifaddr;
1773 			sin->sin_family = AF_INET;
1774 			cp += sizeof (sin_t);
1775 			break;
1776 		case RTA_SRC:
1777 			sin->sin_addr.s_addr = src_addr;
1778 			sin->sin_family = AF_INET;
1779 			cp += sizeof (sin_t);
1780 			break;
1781 		case RTA_AUTHOR:
1782 			sin->sin_addr.s_addr = author;
1783 			sin->sin_family = AF_INET;
1784 			cp += sizeof (sin_t);
1785 			break;
1786 		case RTA_BRD:
1787 			/*
1788 			 * RTA_BRD is used typically to specify a point-to-point
1789 			 * destination address.
1790 			 */
1791 			sin->sin_addr.s_addr = brd_addr;
1792 			sin->sin_family = AF_INET;
1793 			cp += sizeof (sin_t);
1794 			break;
1795 		}
1796 	}
1797 
1798 	if (gc != NULL) {
1799 		rtm_ext_t *rtm_ext;
1800 		struct rtsa_s *rp_dst;
1801 		tsol_rtsecattr_t *rsap;
1802 
1803 		ASSERT(gc->gc_grp != NULL);
1804 		ASSERT(RW_LOCK_HELD(&gc->gc_grp->gcgrp_rwlock));
1805 
1806 		rtm_ext = (rtm_ext_t *)cp;
1807 		rtm_ext->rtmex_type = RTMEX_GATEWAY_SECATTR;
1808 		rtm_ext->rtmex_len = TSOL_RTSECATTR_SIZE(1);
1809 
1810 		rsap = (tsol_rtsecattr_t *)(rtm_ext + 1);
1811 		rsap->rtsa_cnt = 1;
1812 		rp_dst = rsap->rtsa_attr;
1813 
1814 		ASSERT(gc->gc_db != NULL);
1815 		bcopy(&gc->gc_db->gcdb_attr, rp_dst, sizeof (*rp_dst));
1816 		cp = (uchar_t *)rp_dst;
1817 	}
1818 
1819 	mp->b_wptr = cp;
1820 	mp->b_cont = NULL;
1821 	/*
1822 	 * set the fields that are common to
1823 	 * to different messages.
1824 	 */
1825 	rtm->rtm_msglen = (short)(header_size + data_size);
1826 	rtm->rtm_version = RTM_VERSION;
1827 	rtm->rtm_type = (uchar_t)type;
1828 }
1829 
1830 /*
1831  * Allocates and initializes a routing socket message.
1832  * Note that sacnt is either zero or one.
1833  */
1834 mblk_t *
1835 rts_alloc_msg(int type, int rtm_addrs, sa_family_t af, uint_t sacnt)
1836 {
1837 	size_t	length;
1838 	mblk_t	*mp;
1839 
1840 	length = RTS_MSG_SIZE(type, rtm_addrs, af, sacnt);
1841 	mp = allocb(length, BPRI_MED);
1842 	if (mp == NULL)
1843 		return (mp);
1844 	bzero(mp->b_rptr, length);
1845 	return (mp);
1846 }
1847 
1848 /*
1849  * Returns the size of the routing
1850  * socket message header size.
1851  */
1852 size_t
1853 rts_header_msg_size(int type)
1854 {
1855 	switch (type) {
1856 	case RTM_DELADDR:
1857 	case RTM_NEWADDR:
1858 	case RTM_CHGADDR:
1859 	case RTM_FREEADDR:
1860 		return (sizeof (ifa_msghdr_t));
1861 	case RTM_IFINFO:
1862 		return (sizeof (if_msghdr_t));
1863 	default:
1864 		return (sizeof (rt_msghdr_t));
1865 	}
1866 }
1867 
1868 /*
1869  * Returns the size of the message needed with the given rtm_addrs and family.
1870  *
1871  * It is assumed that all of the sockaddrs (with the exception of RTA_IFP) are
1872  * of the same family (currently either AF_INET or AF_INET6).
1873  */
1874 size_t
1875 rts_data_msg_size(int rtm_addrs, sa_family_t af, uint_t sacnt)
1876 {
1877 	int	i;
1878 	size_t	length = 0;
1879 
1880 	for (i = 0; i < RTA_NUMBITS; i++) {
1881 		switch (rtm_addrs & (1 << i)) {
1882 		case RTA_IFP:
1883 			length += sizeof (struct sockaddr_dl);
1884 			break;
1885 		case RTA_DST:
1886 		case RTA_GATEWAY:
1887 		case RTA_NETMASK:
1888 		case RTA_SRC:
1889 		case RTA_IFA:
1890 		case RTA_AUTHOR:
1891 		case RTA_BRD:
1892 			ASSERT(af == AF_INET || af == AF_INET6);
1893 			switch (af) {
1894 			case AF_INET:
1895 				length += sizeof (sin_t);
1896 				break;
1897 			case AF_INET6:
1898 				length += sizeof (sin6_t);
1899 				break;
1900 			}
1901 			break;
1902 		}
1903 	}
1904 	if (sacnt > 0)
1905 		length += sizeof (rtm_ext_t) + TSOL_RTSECATTR_SIZE(sacnt);
1906 
1907 	return (length);
1908 }
1909 
1910 /*
1911  * This routine is called to generate a message to the routing
1912  * socket indicating that a redirect has occured, a routing lookup
1913  * has failed, or that a protocol has detected timeouts to a particular
1914  * destination. This routine is called for message types RTM_LOSING,
1915  * RTM_REDIRECT, and RTM_MISS.
1916  */
1917 void
1918 ip_rts_change(int type, ipaddr_t dst_addr, ipaddr_t gw_addr, ipaddr_t net_mask,
1919     ipaddr_t source, ipaddr_t author, int flags, int error, int rtm_addrs,
1920     ip_stack_t *ipst)
1921 {
1922 	rt_msghdr_t	*rtm;
1923 	mblk_t		*mp;
1924 
1925 	if (rtm_addrs == 0)
1926 		return;
1927 	mp = rts_alloc_msg(type, rtm_addrs, AF_INET, 0);
1928 	if (mp == NULL)
1929 		return;
1930 	rts_fill_msg(type, rtm_addrs, dst_addr, net_mask, gw_addr, source, 0,
1931 	    author, 0, NULL, mp, NULL);
1932 	rtm = (rt_msghdr_t *)mp->b_rptr;
1933 	rtm->rtm_flags = flags;
1934 	rtm->rtm_errno = error;
1935 	rtm->rtm_flags |= RTF_DONE;
1936 	rtm->rtm_addrs = rtm_addrs;
1937 	rts_queue_input(mp, NULL, AF_INET, RTSQ_ALL, ipst);
1938 }
1939 
/*
 * This routine is called to generate a message to the routing
 * socket indicating that the status of a network interface has changed.
 * Message type generated RTM_IFINFO.
 *
 * This is a convenience wrapper around ip_rts_xifmsg() that passes zero
 * for both the "set" and "clear" flag masks, so the interface flags are
 * reported unmodified.  The flags argument is forwarded unchanged.
 */
void
ip_rts_ifmsg(const ipif_t *ipif, uint_t flags)
{
	ip_rts_xifmsg(ipif, 0, 0, flags);
}
1950 
1951 void
1952 ip_rts_xifmsg(const ipif_t *ipif, uint64_t set, uint64_t clear, uint_t flags)
1953 {
1954 	if_msghdr_t	*ifm;
1955 	mblk_t		*mp;
1956 	sa_family_t	af;
1957 	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;
1958 
1959 	/*
1960 	 * This message should be generated only
1961 	 * when the physical device is changing
1962 	 * state.
1963 	 */
1964 	if (ipif->ipif_id != 0)
1965 		return;
1966 	if (ipif->ipif_isv6) {
1967 		af = AF_INET6;
1968 		mp = rts_alloc_msg(RTM_IFINFO, RTA_IFP, af, 0);
1969 		if (mp == NULL)
1970 			return;
1971 		rts_fill_msg_v6(RTM_IFINFO, RTA_IFP, &ipv6_all_zeros,
1972 		    &ipv6_all_zeros, &ipv6_all_zeros, &ipv6_all_zeros,
1973 		    &ipv6_all_zeros, &ipv6_all_zeros, &ipv6_all_zeros,
1974 		    ipif->ipif_ill, mp, NULL);
1975 	} else {
1976 		af = AF_INET;
1977 		mp = rts_alloc_msg(RTM_IFINFO, RTA_IFP, af, 0);
1978 		if (mp == NULL)
1979 			return;
1980 		rts_fill_msg(RTM_IFINFO, RTA_IFP, 0, 0, 0, 0, 0, 0, 0,
1981 		    ipif->ipif_ill, mp, NULL);
1982 	}
1983 	ifm = (if_msghdr_t *)mp->b_rptr;
1984 	ifm->ifm_index = ipif->ipif_ill->ill_phyint->phyint_ifindex;
1985 	ifm->ifm_flags = (ipif->ipif_flags | ipif->ipif_ill->ill_flags |
1986 	    ipif->ipif_ill->ill_phyint->phyint_flags | set) & ~clear;
1987 	rts_getifdata(&ifm->ifm_data, ipif);
1988 	ifm->ifm_addrs = RTA_IFP;
1989 
1990 	if (flags & RTSQ_DEFAULT) {
1991 		flags = RTSQ_ALL;
1992 		/*
1993 		 * If this message is for an underlying interface, prevent
1994 		 * "normal" (IPMP-unaware) routing sockets from seeing it.
1995 		 */
1996 		if (IS_UNDER_IPMP(ipif->ipif_ill))
1997 			flags &= ~RTSQ_NORMAL;
1998 	}
1999 
2000 	rts_queue_input(mp, NULL, af, flags, ipst);
2001 }
2002 
2003 /*
2004  * If cmd is RTM_ADD or RTM_DELETE, generate the rt_msghdr_t message;
2005  * otherwise (RTM_NEWADDR, RTM_DELADDR, RTM_CHGADDR and RTM_FREEADDR)
2006  * generate the ifa_msghdr_t message.
2007  */
2008 static void
2009 rts_new_rtsmsg(int cmd, int error, const ipif_t *ipif, uint_t flags)
2010 {
2011 	int		rtm_addrs;
2012 	mblk_t		*mp;
2013 	ifa_msghdr_t	*ifam;
2014 	rt_msghdr_t	*rtm;
2015 	sa_family_t	af;
2016 	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;
2017 
2018 	/*
2019 	 * Do not report unspecified address if this is the RTM_CHGADDR or
2020 	 * RTM_FREEADDR message.
2021 	 */
2022 	if (cmd == RTM_CHGADDR || cmd == RTM_FREEADDR) {
2023 		if (!ipif->ipif_isv6) {
2024 			if (ipif->ipif_lcl_addr == INADDR_ANY)
2025 				return;
2026 		} else if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) {
2027 			return;
2028 		}
2029 	}
2030 
2031 	if (ipif->ipif_isv6)
2032 		af = AF_INET6;
2033 	else
2034 		af = AF_INET;
2035 
2036 	if (cmd == RTM_ADD || cmd == RTM_DELETE)
2037 		rtm_addrs = (RTA_DST | RTA_NETMASK);
2038 	else
2039 		rtm_addrs = (RTA_IFA | RTA_NETMASK | RTA_BRD | RTA_IFP);
2040 
2041 	mp = rts_alloc_msg(cmd, rtm_addrs, af, 0);
2042 	if (mp == NULL)
2043 		return;
2044 
2045 	if (cmd != RTM_ADD && cmd != RTM_DELETE) {
2046 		switch (af) {
2047 		case AF_INET:
2048 			rts_fill_msg(cmd, rtm_addrs, 0,
2049 			    ipif->ipif_net_mask, 0, ipif->ipif_lcl_addr,
2050 			    ipif->ipif_pp_dst_addr, 0,
2051 			    ipif->ipif_lcl_addr, ipif->ipif_ill,
2052 			    mp, NULL);
2053 			break;
2054 		case AF_INET6:
2055 			rts_fill_msg_v6(cmd, rtm_addrs,
2056 			    &ipv6_all_zeros, &ipif->ipif_v6net_mask,
2057 			    &ipv6_all_zeros, &ipif->ipif_v6lcl_addr,
2058 			    &ipif->ipif_v6pp_dst_addr, &ipv6_all_zeros,
2059 			    &ipif->ipif_v6lcl_addr, ipif->ipif_ill,
2060 			    mp, NULL);
2061 			break;
2062 		}
2063 		ifam = (ifa_msghdr_t *)mp->b_rptr;
2064 		ifam->ifam_index =
2065 		    ipif->ipif_ill->ill_phyint->phyint_ifindex;
2066 		ifam->ifam_metric = ipif->ipif_ill->ill_metric;
2067 		ifam->ifam_flags = ((cmd == RTM_NEWADDR) ? RTF_UP : 0);
2068 		ifam->ifam_addrs = rtm_addrs;
2069 	} else {
2070 		switch (af) {
2071 		case AF_INET:
2072 			rts_fill_msg(cmd, rtm_addrs,
2073 			    ipif->ipif_lcl_addr, ipif->ipif_net_mask, 0,
2074 			    0, 0, 0, 0, NULL, mp, NULL);
2075 			break;
2076 		case AF_INET6:
2077 			rts_fill_msg_v6(cmd, rtm_addrs,
2078 			    &ipif->ipif_v6lcl_addr,
2079 			    &ipif->ipif_v6net_mask, &ipv6_all_zeros,
2080 			    &ipv6_all_zeros, &ipv6_all_zeros,
2081 			    &ipv6_all_zeros, &ipv6_all_zeros,
2082 			    NULL, mp, NULL);
2083 			break;
2084 		}
2085 		rtm = (rt_msghdr_t *)mp->b_rptr;
2086 		rtm->rtm_index =
2087 		    ipif->ipif_ill->ill_phyint->phyint_ifindex;
2088 		rtm->rtm_flags = ((cmd == RTM_ADD) ? RTF_UP : 0);
2089 		rtm->rtm_errno = error;
2090 		if (error == 0)
2091 			rtm->rtm_flags |= RTF_DONE;
2092 		rtm->rtm_addrs = rtm_addrs;
2093 	}
2094 	rts_queue_input(mp, NULL, af, flags, ipst);
2095 }
2096 
2097 /*
2098  * This is called to generate messages to the routing socket
2099  * indicating a network interface has had addresses associated with it.
2100  * The structure of the code is based on the 4.4BSD-Lite2 <net/rtsock.c>.
2101  */
2102 void
2103 ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif, uint_t flags)
2104 {
2105 	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;
2106 
2107 	if (flags & RTSQ_DEFAULT) {
2108 		flags = RTSQ_ALL;
2109 		/*
2110 		 * If this message is for an underlying interface, prevent
2111 		 * "normal" (IPMP-unaware) routing sockets from seeing it.
2112 		 */
2113 		if (IS_UNDER_IPMP(ipif->ipif_ill))
2114 			flags &= ~RTSQ_NORMAL;
2115 	}
2116 
2117 	/*
2118 	 * Let conn_ixa caching know that source address selection
2119 	 * changed
2120 	 */
2121 	if (cmd == RTM_ADD || cmd == RTM_DELETE)
2122 		ip_update_source_selection(ipst);
2123 
2124 	/*
2125 	 * If the request is DELETE, send RTM_DELETE and RTM_DELADDR.
2126 	 * if the request is ADD, send RTM_NEWADDR and RTM_ADD.
2127 	 * otherwise simply send the request.
2128 	 */
2129 	switch (cmd) {
2130 	case RTM_ADD:
2131 		rts_new_rtsmsg(RTM_NEWADDR, error, ipif, flags);
2132 		rts_new_rtsmsg(RTM_ADD, error, ipif, flags);
2133 		break;
2134 	case RTM_DELETE:
2135 		rts_new_rtsmsg(RTM_DELETE, error, ipif, flags);
2136 		rts_new_rtsmsg(RTM_DELADDR, error, ipif, flags);
2137 		break;
2138 	default:
2139 		rts_new_rtsmsg(cmd, error, ipif, flags);
2140 		break;
2141 	}
2142 }
2143 
2144 /*
2145  * Based on the address family specified in a sockaddr, copy the address field
2146  * into an in6_addr_t.
2147  *
2148  * In the case of AF_UNSPEC, we assume the family is actually AF_INET for
2149  * compatibility with programs that leave the family cleared in the sockaddr.
2150  * Callers of rts_copyfromsockaddr should check the family themselves if they
2151  * wish to verify its value.
2152  *
2153  * In the case of AF_INET6, a check is made to ensure that address is not an
2154  * IPv4-mapped address.
2155  */
2156 size_t
2157 rts_copyfromsockaddr(struct sockaddr *sa, in6_addr_t *addrp)
2158 {
2159 	switch (sa->sa_family) {
2160 	case AF_INET:
2161 	case AF_UNSPEC:
2162 		IN6_IPADDR_TO_V4MAPPED(((sin_t *)sa)->sin_addr.s_addr, addrp);
2163 		return (sizeof (sin_t));
2164 	case AF_INET6:
2165 		*addrp = ((sin6_t *)sa)->sin6_addr;
2166 		if (IN6_IS_ADDR_V4MAPPED(addrp))
2167 			return (0);
2168 		return (sizeof (sin6_t));
2169 	default:
2170 		return (0);
2171 	}
2172 }
2173