xref: /titanic_44/usr/src/uts/common/inet/ip/ip_if.c (revision ee519a1f9541a20bb76ef306dfc8e5616f8a5e26)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /* Copyright (c) 1990 Mentat Inc. */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * This file contains the interface control functions for IP.
31  */
32 
33 #include <sys/types.h>
34 #include <sys/stream.h>
35 #include <sys/dlpi.h>
36 #include <sys/stropts.h>
37 #include <sys/strsun.h>
38 #include <sys/sysmacros.h>
39 #include <sys/strlog.h>
40 #include <sys/ddi.h>
41 #include <sys/sunddi.h>
42 #include <sys/cmn_err.h>
43 #include <sys/kstat.h>
44 #include <sys/debug.h>
45 #include <sys/zone.h>
46 
47 #include <sys/kmem.h>
48 #include <sys/systm.h>
49 #include <sys/param.h>
50 #include <sys/socket.h>
51 #define	_SUN_TPI_VERSION	2
52 #include <sys/tihdr.h>
53 #include <sys/isa_defs.h>
54 #include <net/if.h>
55 #include <net/if_arp.h>
56 #include <net/if_types.h>
57 #include <net/if_dl.h>
58 #include <net/route.h>
59 #include <sys/sockio.h>
60 #include <netinet/in.h>
61 #include <netinet/ip6.h>
62 #include <netinet/icmp6.h>
63 #include <netinet/igmp_var.h>
64 #include <sys/strsun.h>
65 #include <sys/policy.h>
66 #include <sys/ethernet.h>
67 
68 #include <inet/common.h>   /* for various inet/mi.h and inet/nd.h needs */
69 #include <inet/mi.h>
70 #include <inet/nd.h>
71 #include <inet/arp.h>
72 #include <inet/mib2.h>
73 #include <inet/ip.h>
74 #include <inet/ip6.h>
75 #include <inet/ip6_asp.h>
76 #include <inet/tcp.h>
77 #include <inet/ip_multi.h>
78 #include <inet/ip_ire.h>
79 #include <inet/ip_rts.h>
80 #include <inet/ip_ndp.h>
81 #include <inet/ip_if.h>
82 #include <inet/ip_impl.h>
83 #include <inet/tun.h>
84 #include <inet/sctp_ip.h>
85 
86 #include <net/pfkeyv2.h>
87 #include <inet/ipsec_info.h>
88 #include <inet/sadb.h>
89 #include <inet/ipsec_impl.h>
90 #include <sys/iphada.h>
91 
92 
93 #include <netinet/igmp.h>
94 #include <inet/ip_listutils.h>
95 #include <netinet/ip_mroute.h>
96 #include <inet/ipclassifier.h>
97 #include <sys/mac.h>
98 
99 #include <sys/systeminfo.h>
100 #include <sys/bootconf.h>
101 
102 /* The character which tells where the ill_name ends */
103 #define	IPIF_SEPARATOR_CHAR	':'
104 
105 /* IP ioctl function table entry */
106 typedef struct ipft_s {
107 	int	ipft_cmd;
108 	pfi_t	ipft_pfi;
109 	int	ipft_min_size;
110 	int	ipft_flags;
111 } ipft_t;
112 #define	IPFT_F_NO_REPLY		0x1	/* IP ioctl does not expect any reply */
113 #define	IPFT_F_SELF_REPLY	0x2	/* ioctl callee does the ioctl reply */
114 
115 typedef struct ip_sock_ar_s {
116 	union {
117 		area_t	ip_sock_area;
118 		ared_t	ip_sock_ared;
119 		areq_t	ip_sock_areq;
120 	} ip_sock_ar_u;
121 	queue_t	*ip_sock_ar_q;
122 } ip_sock_ar_t;
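/*
 * The union above makes sizeof (ip_sock_ar_t) large enough to hold any
 * of the three ARP command headers. The area templates below use
 * sizeof (ip_sock_ar_t) in their offset fields, which is what lets the
 * socket ioctl code overlay the same buffer (see the note above
 * ip_area_template).
 */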
123 
124 static int	nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *);
125 static int	nd_ill_forward_set(queue_t *q, mblk_t *mp,
126 		    char *value, caddr_t cp, cred_t *ioc_cr);
127 
128 static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask);
129 static ip_m_t	*ip_m_lookup(t_uscalar_t mac_type);
130 static int	ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
131     mblk_t *mp, boolean_t need_up);
132 static int	ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
133     mblk_t *mp, boolean_t need_up);
134 static int	ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
135     queue_t *q, mblk_t *mp, boolean_t need_up);
136 static int	ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q,
137     mblk_t *mp, boolean_t need_up);
138 static int	ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
139     mblk_t *mp);
140 static int	ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t,
141     queue_t *q, mblk_t *mp, boolean_t need_up);
142 static int	ip_sioctl_arp_common(ill_t *ill, queue_t *q, mblk_t *mp,
143     sin_t *sin, boolean_t x_arp_ioctl, boolean_t if_arp_ioctl);
144 static ipaddr_t	ip_subnet_mask(ipaddr_t addr, ipif_t **);
145 static void	ip_wput_ioctl(queue_t *q, mblk_t *mp);
146 static void	ipsq_flush(ill_t *ill);
147 static void	ipsq_clean_all(ill_t *ill);
148 static void	ipsq_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring);
149 static	int	ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen,
150     queue_t *q, mblk_t *mp, boolean_t need_up);
151 static void	ipsq_delete(ipsq_t *);
152 
153 static ipif_t	*ipif_allocate(ill_t *ill, int id, uint_t ire_type,
154 		    boolean_t initialize);
155 static void	ipif_check_bcast_ires(ipif_t *test_ipif);
156 static void	ipif_down_delete_ire(ire_t *ire, char *ipif);
157 static void	ipif_delete_cache_ire(ire_t *, char *);
158 static int	ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp);
159 static void	ipif_down_tail(ipif_t *ipif);
160 static void	ipif_free(ipif_t *ipif);
161 static void	ipif_free_tail(ipif_t *ipif);
162 static void	ipif_mask_reply(ipif_t *);
163 static void	ipif_mtu_change(ire_t *ire, char *ipif_arg);
164 static void	ipif_multicast_down(ipif_t *ipif);
165 static void	ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif);
166 static void	ipif_set_default(ipif_t *ipif);
167 static int	ipif_set_values(queue_t *q, mblk_t *mp,
168     char *interf_name, uint_t *ppa);
169 static int	ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp,
170     queue_t *q);
171 static ipif_t	*ipif_lookup_on_name(char *name, size_t namelen,
172     boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid,
173     queue_t *q, mblk_t *mp, ipsq_func_t func, int *error);
174 static int	ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp);
175 static void	ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp);
176 
177 static int	ill_alloc_ppa(ill_if_t *, ill_t *);
178 static int	ill_arp_off(ill_t *ill);
179 static int	ill_arp_on(ill_t *ill);
180 static void	ill_delete_interface_type(ill_if_t *);
181 static int	ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q);
182 static void	ill_down(ill_t *ill);
183 static void	ill_downi(ire_t *ire, char *ill_arg);
184 static void	ill_downi_mrtun_srcif(ire_t *ire, char *ill_arg);
185 static void	ill_down_tail(ill_t *ill);
186 static void	ill_free_mib(ill_t *ill);
187 static void	ill_glist_delete(ill_t *);
188 static boolean_t ill_has_usable_ipif(ill_t *);
189 static int	ill_lock_ipsq_ills(ipsq_t *sq, ill_t **list, int);
190 static void	ill_nominate_bcast_rcv(ill_group_t *illgrp);
191 static void	ill_phyint_free(ill_t *ill);
192 static void	ill_phyint_reinit(ill_t *ill);
193 static void	ill_set_nce_router_flags(ill_t *, boolean_t);
194 static void	ill_signal_ipsq_ills(ipsq_t *, boolean_t);
195 static boolean_t ill_split_ipsq(ipsq_t *cur_sq);
196 static void	ill_stq_cache_delete(ire_t *, char *);
197 
198 static boolean_t ip_ether_v6intfid(uint_t, uint8_t *, in6_addr_t *);
199 static boolean_t ip_nodef_v6intfid(uint_t, uint8_t *, in6_addr_t *);
200 static boolean_t ip_ether_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
201     in6_addr_t *);
202 static boolean_t ip_ether_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
203     ipaddr_t *);
204 static boolean_t ip_ib_v6intfid(uint_t, uint8_t *, in6_addr_t *);
205 static boolean_t ip_ib_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
206     in6_addr_t *);
207 static boolean_t ip_ib_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
208     ipaddr_t *);
209 
210 static void	ipif_save_ire(ipif_t *, ire_t *);
211 static void	ipif_remove_ire(ipif_t *, ire_t *);
212 static void 	ip_cgtp_bcast_add(ire_t *, ire_t *);
213 static void 	ip_cgtp_bcast_delete(ire_t *);
214 
215 /*
216  * Per-ill IPsec capabilities management.
217  */
218 static ill_ipsec_capab_t *ill_ipsec_capab_alloc(void);
219 static void	ill_ipsec_capab_free(ill_ipsec_capab_t *);
220 static void	ill_ipsec_capab_add(ill_t *, uint_t, boolean_t);
221 static void	ill_ipsec_capab_delete(ill_t *, uint_t);
222 static boolean_t ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *, int);
223 static void ill_capability_proto(ill_t *, int, mblk_t *);
224 static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *,
225     boolean_t);
226 static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
227 static void ill_capability_mdt_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
228 static void ill_capability_mdt_reset(ill_t *, mblk_t **);
229 static void ill_capability_ipsec_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
230 static void ill_capability_ipsec_reset(ill_t *, mblk_t **);
231 static void ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
232 static void ill_capability_hcksum_reset(ill_t *, mblk_t **);
233 static void ill_capability_zerocopy_ack(ill_t *, mblk_t *,
234     dl_capability_sub_t *);
235 static void ill_capability_zerocopy_reset(ill_t *, mblk_t **);
236 
237 static void ill_capability_dls_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
238 static mac_resource_handle_t ill_ring_add(void *, mac_resource_t *);
239 static void ill_capability_dls_reset(ill_t *, mblk_t **);
240 static void ill_capability_dls_disable(ill_t *);
241 
242 static void	illgrp_cache_delete(ire_t *, char *);
243 static void	illgrp_delete(ill_t *ill);
244 static void	illgrp_reset_schednext(ill_t *ill);
245 
246 static ill_t	*ill_prev_usesrc(ill_t *);
247 static int	ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t);
248 static void	ill_disband_usesrc_group(ill_t *);
249 
250 static void	conn_cleanup_stale_ire(conn_t *, caddr_t);
251 
252 /*
253  * if we go over the memory footprint limit more than once in this msec
254  * interval, we'll start pruning aggressively.
255  */
256 int ip_min_frag_prune_time = 0;
257 
258 /*
259  * max # of IPsec algorithms supported.  Limited to 1 byte by PF_KEY
260  * and the IPsec DOI
261  */
262 #define	MAX_IPSEC_ALGS	256
263 
264 #define	BITSPERBYTE	8
265 #define	BITS(type)	(BITSPERBYTE * (long)sizeof (type))
266 
267 #define	IPSEC_ALG_ENABLE(algs, algid) \
268 		((algs)[(algid) / BITS(ipsec_capab_elem_t)] |= \
269 		(1 << ((algid) % BITS(ipsec_capab_elem_t))))
270 
271 #define	IPSEC_ALG_IS_ENABLED(algid, algs) \
272 		((algs)[(algid) / BITS(ipsec_capab_elem_t)] & \
273 		(1 << ((algid) % BITS(ipsec_capab_elem_t))))
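/*
 * Illustrative example: with ipsec_capab_elem_t defined below as
 * uint8_t, BITS(ipsec_capab_elem_t) is 8, so IPSEC_ALG_ENABLE(algs, 9)
 * sets bit (9 % 8) == 1 in element algs[9 / 8] == algs[1].
 */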
274 
275 typedef uint8_t ipsec_capab_elem_t;
276 
277 /*
278  * Per-algorithm parameters.  Note that at present, only encryption
279  * algorithms have variable keysize (IKE does not provide a way to negotiate
280  * auth algorithm keysize).
281  *
282  * All sizes here are in bits.
283  */
284 typedef struct
285 {
286 	uint16_t	minkeylen;
287 	uint16_t	maxkeylen;
288 } ipsec_capab_algparm_t;
289 
290 /*
291  * Per-ill capabilities.
292  */
293 struct ill_ipsec_capab_s {
294 	ipsec_capab_elem_t *encr_hw_algs;
295 	ipsec_capab_elem_t *auth_hw_algs;
296 	uint32_t algs_size;	/* size of _hw_algs in bytes */
297 	/* algorithm key lengths */
298 	ipsec_capab_algparm_t *encr_algparm;
299 	uint32_t encr_algparm_size;
300 	uint32_t encr_algparm_end;
301 };
302 
303 /*
304  * List of AH and ESP IPsec acceleration capable ills
305  */
306 typedef struct ipsec_capab_ill_s {
307 	uint_t ill_index;
308 	boolean_t ill_isv6;
309 	struct ipsec_capab_ill_s *next;
310 } ipsec_capab_ill_t;
311 
312 static ipsec_capab_ill_t *ipsec_capab_ills_ah;
313 static ipsec_capab_ill_t *ipsec_capab_ills_esp;
314 krwlock_t ipsec_capab_ills_lock;
315 
316 /*
317  * The field values are larger than strictly necessary for simple
318  * AR_ENTRY_ADDs, but the padding lets us accommodate the socket ioctls.
319  */
320 static area_t	ip_area_template = {
321 	AR_ENTRY_ADD,			/* area_cmd */
322 	sizeof (ip_sock_ar_t) + (IP_ADDR_LEN*2) + sizeof (struct sockaddr_dl),
323 					/* area_name_offset */
324 	/* area_name_length temporarily holds this structure length */
325 	sizeof (area_t),			/* area_name_length */
326 	IP_ARP_PROTO_TYPE,		/* area_proto */
327 	sizeof (ip_sock_ar_t),		/* area_proto_addr_offset */
328 	IP_ADDR_LEN,			/* area_proto_addr_length */
329 	sizeof (ip_sock_ar_t) + IP_ADDR_LEN,
330 					/* area_proto_mask_offset */
331 	0,				/* area_flags */
332 	sizeof (ip_sock_ar_t) + IP_ADDR_LEN + IP_ADDR_LEN,
333 					/* area_hw_addr_offset */
334 	/* Zero length hw_addr_length means 'use your idea of the address' */
335 	0				/* area_hw_addr_length */
336 };
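/*
 * Illustrative layout of the message built from ip_area_template by
 * ill_arp_alloc() (offsets are from the start of the message):
 *
 *	0			area_t, padded out to an ip_sock_ar_t
 *	area_proto_addr_offset	protocol address (IP_ADDR_LEN bytes)
 *	area_proto_mask_offset	protocol mask (IP_ADDR_LEN bytes)
 *	area_hw_addr_offset	hardware address (struct sockaddr_dl room)
 *	area_name_offset	ill name, appended by ill_arp_alloc()
 */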
337 
338 /*
339  * AR_ENTRY_ADD/DELETE templates have been added for IPv6 external resolver
340  * support
341  */
342 static area_t	ip6_area_template = {
343 	AR_ENTRY_ADD,			/* area_cmd */
344 	sizeof (ip_sock_ar_t) + (IPV6_ADDR_LEN*2) + sizeof (sin6_t),
345 					/* area_name_offset */
346 	/* area_name_length temporarily holds this structure length */
347 	sizeof (area_t),			/* area_name_length */
348 	IP_ARP_PROTO_TYPE,		/* area_proto */
349 	sizeof (ip_sock_ar_t),		/* area_proto_addr_offset */
350 	IPV6_ADDR_LEN,			/* area_proto_addr_length */
351 	sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN,
352 					/* area_proto_mask_offset */
353 	0,				/* area_flags */
354 	sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN + IPV6_ADDR_LEN,
355 					/* area_hw_addr_offset */
356 	/* Zero length hw_addr_length means 'use your idea of the address' */
357 	0				/* area_hw_addr_length */
358 };
359 
360 static ared_t	ip_ared_template = {
361 	AR_ENTRY_DELETE,
362 	sizeof (ared_t) + IP_ADDR_LEN,
363 	sizeof (ared_t),
364 	IP_ARP_PROTO_TYPE,
365 	sizeof (ared_t),
366 	IP_ADDR_LEN
367 };
368 
369 static ared_t	ip6_ared_template = {
370 	AR_ENTRY_DELETE,
371 	sizeof (ared_t) + IPV6_ADDR_LEN,
372 	sizeof (ared_t),
373 	IP_ARP_PROTO_TYPE,
374 	sizeof (ared_t),
375 	IPV6_ADDR_LEN
376 };
377 
378 /*
379  * A template for an IPv6 AR_ENTRY_QUERY has not been created, as the
380  * areq doesn't include an IP address in ill_dl_up() (the only place an
381  * areq is used).
382  */
383 static areq_t	ip_areq_template = {
384 	AR_ENTRY_QUERY,			/* cmd */
385 	sizeof (areq_t)+(2*IP_ADDR_LEN),	/* name offset */
386 	sizeof (areq_t),	/* name len (filled by ill_arp_alloc) */
387 	IP_ARP_PROTO_TYPE,		/* protocol, from arps perspective */
388 	sizeof (areq_t),			/* target addr offset */
389 	IP_ADDR_LEN,			/* target addr_length */
390 	0,				/* flags */
391 	sizeof (areq_t) + IP_ADDR_LEN,	/* sender addr offset */
392 	IP_ADDR_LEN,			/* sender addr length */
393 	6,				/* xmit_count */
394 	1000,				/* (re)xmit_interval in milliseconds */
395 	4				/* max # of requests to buffer */
396 	/* anything else filled in by the code */
397 };
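/*
 * With the values above, ARP sends up to 6 queries at 1000 ms
 * intervals and buffers at most 4 requests while resolution is
 * pending.
 */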
398 
399 static arc_t	ip_aru_template = {
400 	AR_INTERFACE_UP,
401 	sizeof (arc_t),		/* Name offset */
402 	sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
403 };
404 
405 static arc_t	ip_ard_template = {
406 	AR_INTERFACE_DOWN,
407 	sizeof (arc_t),		/* Name offset */
408 	sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
409 };
410 
411 static arc_t	ip_aron_template = {
412 	AR_INTERFACE_ON,
413 	sizeof (arc_t),		/* Name offset */
414 	sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
415 };
416 
417 static arc_t	ip_aroff_template = {
418 	AR_INTERFACE_OFF,
419 	sizeof (arc_t),		/* Name offset */
420 	sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
421 };
422 
423 
424 static arma_t	ip_arma_multi_template = {
425 	AR_MAPPING_ADD,
426 	sizeof (arma_t) + 3*IP_ADDR_LEN + IP_MAX_HW_LEN,
427 				/* Name offset */
428 	sizeof (arma_t),	/* Name length (set by ill_arp_alloc) */
429 	IP_ARP_PROTO_TYPE,
430 	sizeof (arma_t),			/* proto_addr_offset */
431 	IP_ADDR_LEN,				/* proto_addr_length */
432 	sizeof (arma_t) + IP_ADDR_LEN,		/* proto_mask_offset */
433 	sizeof (arma_t) + 2*IP_ADDR_LEN,	/* proto_extract_mask_offset */
434 	ACE_F_PERMANENT | ACE_F_MAPPING,	/* flags */
435 	sizeof (arma_t) + 3*IP_ADDR_LEN,	/* hw_addr_offset */
436 	IP_MAX_HW_LEN,				/* hw_addr_length */
437 	0,					/* hw_mapping_start */
438 };
439 
440 static ipft_t	ip_ioctl_ftbl[] = {
441 	{ IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 },
442 	{ IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t),
443 		IPFT_F_NO_REPLY },
444 	{ IP_IOC_IRE_ADVISE_NO_REPLY, ip_ire_advise, sizeof (ipic_t),
445 		IPFT_F_NO_REPLY },
446 	{ IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY },
447 	{ 0 }
448 };
449 
450 /* Simple ICMP IP Header Template */
451 static ipha_t icmp_ipha = {
452 	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
453 };
454 
455 /* Flag descriptors for ip_ipif_report */
456 static nv_t	ipif_nv_tbl[] = {
457 	{ IPIF_UP,		"UP" },
458 	{ IPIF_BROADCAST,	"BROADCAST" },
459 	{ ILLF_DEBUG,		"DEBUG" },
460 	{ PHYI_LOOPBACK,	"LOOPBACK" },
461 	{ IPIF_POINTOPOINT,	"POINTOPOINT" },
462 	{ ILLF_NOTRAILERS,	"NOTRAILERS" },
463 	{ PHYI_RUNNING,		"RUNNING" },
464 	{ ILLF_NOARP,		"NOARP" },
465 	{ PHYI_PROMISC,		"PROMISC" },
466 	{ PHYI_ALLMULTI,	"ALLMULTI" },
467 	{ PHYI_INTELLIGENT,	"INTELLIGENT" },
468 	{ ILLF_MULTICAST,	"MULTICAST" },
469 	{ PHYI_MULTI_BCAST,	"MULTI_BCAST" },
470 	{ IPIF_UNNUMBERED,	"UNNUMBERED" },
471 	{ IPIF_DHCPRUNNING,	"DHCP" },
472 	{ IPIF_PRIVATE,		"PRIVATE" },
473 	{ IPIF_NOXMIT,		"NOXMIT" },
474 	{ IPIF_NOLOCAL,		"NOLOCAL" },
475 	{ IPIF_DEPRECATED,	"DEPRECATED" },
476 	{ IPIF_PREFERRED,	"PREFERRED" },
477 	{ IPIF_TEMPORARY,	"TEMPORARY" },
478 	{ IPIF_ADDRCONF,	"ADDRCONF" },
479 	{ PHYI_VIRTUAL,		"VIRTUAL" },
480 	{ ILLF_ROUTER,		"ROUTER" },
481 	{ ILLF_NONUD,		"NONUD" },
482 	{ IPIF_ANYCAST,		"ANYCAST" },
483 	{ ILLF_NORTEXCH,	"NORTEXCH" },
484 	{ ILLF_IPV4,		"IPV4" },
485 	{ ILLF_IPV6,		"IPV6" },
486 	{ IPIF_MIPRUNNING,	"MIP" },
487 	{ IPIF_NOFAILOVER,	"NOFAILOVER" },
488 	{ PHYI_FAILED,		"FAILED" },
489 	{ PHYI_STANDBY,		"STANDBY" },
490 	{ PHYI_INACTIVE,	"INACTIVE" },
491 	{ PHYI_OFFLINE,		"OFFLINE" },
492 };
493 
494 static uchar_t	ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
495 
496 static ip_m_t	ip_m_tbl[] = {
497 	{ DL_ETHER, IFT_ETHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
498 	    ip_ether_v6intfid },
499 	{ DL_CSMACD, IFT_ISO88023, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
500 	    ip_nodef_v6intfid },
501 	{ DL_TPB, IFT_ISO88024, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
502 	    ip_nodef_v6intfid },
503 	{ DL_TPR, IFT_ISO88025, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
504 	    ip_nodef_v6intfid },
505 	{ DL_FDDI, IFT_FDDI, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
506 	    ip_ether_v6intfid },
507 	{ DL_IB, IFT_IB, ip_ib_v4mapinfo, ip_ib_v6mapinfo,
508 	    ip_ib_v6intfid },
509 	{ SUNW_DL_VNI, IFT_OTHER, NULL, NULL, NULL},
510 	{ DL_OTHER, IFT_OTHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
511 	    ip_nodef_v6intfid }
512 };
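/*
 * ip_m_lookup() (forward-declared above) presumably selects an entry
 * from this table by its DL_* mac type, with the DL_OTHER entry
 * serving as the fallback for unrecognized media.
 */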
513 
514 static ill_t	ill_null;		/* Empty ILL for init. */
515 char	ipif_loopback_name[] = "lo0";
516 static char *ipv4_forward_suffix = ":ip_forwarding";
517 static char *ipv6_forward_suffix = ":ip6_forwarding";
518 static kstat_t *loopback_ksp = NULL;
519 static	sin6_t	sin6_null;	/* Zero address for quick clears */
520 static	sin_t	sin_null;	/* Zero address for quick clears */
521 static	uint_t	ill_index = 1;	/* Used to assign interface indices */
522 /* When set search for unused index */
523 static boolean_t ill_index_wrap = B_FALSE;
524 /* When set search for unused ipif_seqid */
525 static ipif_t	ipif_zero;
526 uint_t	ipif_src_random;
527 
528 /*
529  * For details on the protection offered by these locks please refer
530  * to the notes under the Synchronization section at the start of ip.c
531  */
532 krwlock_t	ill_g_lock;		/* The global ill_g_lock */
533 kmutex_t	ip_addr_avail_lock;	/* Address availability check lock */
534 ipsq_t		*ipsq_g_head;		/* List of all ipsq's on the system */
535 
536 krwlock_t	ill_g_usesrc_lock;	/* Protects usesrc related fields */
537 
538 /*
539  * illgrp_head/ifgrp_head is protected by IP's perimeter.
540  */
541 static  ill_group_t *illgrp_head_v4;	/* Head of IPv4 ill groups */
542 ill_group_t *illgrp_head_v6;		/* Head of IPv6 ill groups */
543 
544 ill_g_head_t	ill_g_heads[MAX_G_HEADS];   /* ILL List Head */
545 
546 /*
547  * The ppa arena is created after this many
548  * interfaces have been plumbed.
549  */
550 uint_t	ill_no_arena = 12;
551 
552 #pragma align CACHE_ALIGN_SIZE(phyint_g_list)
553 static phyint_list_t phyint_g_list;	/* start of phyint list */
554 
555 /*
556  * Reflects value of FAILBACK variable in IPMP config file
557  * /etc/default/mpathd. Default value is B_TRUE.
558  * Set to B_FALSE if the user disabled failback by configuring "FAILBACK=no".
559  * in.mpathd uses the SIOCSIPMPFAILBACK ioctl to pass this setting to the kernel.
560  */
561 static boolean_t ipmp_enable_failback = B_TRUE;
562 
563 /*
564  * Enable soft rings if ip_squeue_soft_ring or ip_squeue_fanout
565  * is set and ip_soft_rings_cnt > 0. ip_squeue_soft_ring is
566  * set through platform specific code (Niagara/Ontario).
567  */
568 #define	SOFT_RINGS_ENABLED()	(ip_soft_rings_cnt ? \
569 		(ip_squeue_soft_ring || ip_squeue_fanout) : B_FALSE)
570 
571 #define	ILL_CAPAB_DLS	(ILL_CAPAB_SOFT_RING | ILL_CAPAB_POLL)
572 
573 static uint_t
574 ipif_rand(void)
575 {
576 	ipif_src_random = ipif_src_random * 1103515245 + 12345;
577 	return ((ipif_src_random >> 16) & 0x7fff);
578 }
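/*
 * The constants above are those of the classic ANSI C example rand()
 * linear congruential generator; the shift and mask extract 15 of the
 * better-mixed high-order bits of the state.
 */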
579 
580 /*
581  * Allocate per-interface mibs. Only used for IPv6.
582  * Returns B_TRUE if ok, B_FALSE otherwise.
583  * The ipsq may not yet be allocated (loopback case).
584  */
585 static boolean_t
586 ill_allocate_mibs(ill_t *ill)
587 {
588 	ASSERT(ill->ill_isv6);
589 
590 	/* Already allocated? */
591 	if (ill->ill_ip6_mib != NULL) {
592 		ASSERT(ill->ill_icmp6_mib != NULL);
593 		return (B_TRUE);
594 	}
595 
596 	ill->ill_ip6_mib = kmem_zalloc(sizeof (*ill->ill_ip6_mib),
597 	    KM_NOSLEEP);
598 	if (ill->ill_ip6_mib == NULL) {
599 		return (B_FALSE);
600 	}
601 	ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib),
602 	    KM_NOSLEEP);
603 	if (ill->ill_icmp6_mib == NULL) {
604 		kmem_free(ill->ill_ip6_mib, sizeof (*ill->ill_ip6_mib));
605 		ill->ill_ip6_mib = NULL;
606 		return (B_FALSE);
607 	}
608 	/*
609 	 * The ipv6Ifindex and ipv6IfIcmpIndex will be assigned later
610 	 * after the phyint merge occurs in ipif_set_values -> ill_glist_insert
611 	 * -> ill_phyint_reinit
612 	 */
613 	return (B_TRUE);
614 }
615 
616 /*
617  * Common code for preparation of ARP commands.  Two points to remember:
618  * 	1) The ill_name is tacked on at the end of the allocated space so
619  *	   the template's name_offset field must contain the total space
620  *	   to allocate less the name length.
621  *
622  *	2) The template's name_length field should contain the *template*
623  *	   length.  We use it as a parameter to bcopy() and then write
624  *	   the real ill_name_length into the name_length field of the copy.
625  * (Always called as writer.)
626  */
627 mblk_t *
628 ill_arp_alloc(ill_t *ill, uchar_t *template, caddr_t addr)
629 {
630 	arc_t	*arc = (arc_t *)template;
631 	char	*cp;
632 	int	len;
633 	mblk_t	*mp;
634 	uint_t	name_length = ill->ill_name_length;
635 	uint_t	template_len = arc->arc_name_length;
636 
637 	len = arc->arc_name_offset + name_length;
638 	mp = allocb(len, BPRI_HI);
639 	if (mp == NULL)
640 		return (NULL);
641 	cp = (char *)mp->b_rptr;
642 	mp->b_wptr = (uchar_t *)&cp[len];
643 	if (template_len)
644 		bcopy(template, cp, template_len);
645 	if (len > template_len)
646 		bzero(&cp[template_len], len - template_len);
647 	mp->b_datap->db_type = M_PROTO;
648 
649 	arc = (arc_t *)cp;
650 	arc->arc_name_length = name_length;
651 	cp = (char *)arc + arc->arc_name_offset;
652 	bcopy(ill->ill_name, cp, name_length);
653 
654 	if (addr) {
655 		area_t	*area = (area_t *)mp->b_rptr;
656 
657 		cp = (char *)area + area->area_proto_addr_offset;
658 		bcopy(addr, cp, area->area_proto_addr_length);
659 		if (area->area_cmd == AR_ENTRY_ADD) {
660 			cp = (char *)area;
661 			len = area->area_proto_addr_length;
662 			if (area->area_proto_mask_offset)
663 				cp += area->area_proto_mask_offset;
664 			else
665 				cp += area->area_proto_addr_offset + len;
666 			while (len-- > 0)
667 				*cp++ = (char)~0;
668 		}
669 	}
670 	return (mp);
671 }
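/*
 * Illustrative use (a sketch, not lifted from a particular caller):
 * deleting the ARP entry for the IPv4 address in 'addr' would be set
 * up as
 *
 *	mblk_t *mp = ill_arp_alloc(ill, (uchar_t *)&ip_ared_template,
 *	    (caddr_t)&addr);
 *
 * ill_arp_alloc() appends ill->ill_name to the copied template, copies
 * the address into place and, for AR_ENTRY_ADD commands only, fills
 * the protocol mask with all-ones.
 */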
672 
673 /*
674  * Completely vaporize a lower level tap and all associated interfaces.
675  * ill_delete is called only out of ip_close when the device control
676  * stream is being closed.
677  */
678 void
679 ill_delete(ill_t *ill)
680 {
681 	ipif_t	*ipif;
682 	ill_t	*prev_ill;
683 
684 	/*
685 	 * ill_delete may be forcibly entering the ipsq. The previous
686 	 * ioctl may not have completed and may need to be aborted.
687 	 * ipsq_flush takes care of it. If we don't need to enter
688 	 * the ipsq forcibly, the 2nd invocation of ipsq_flush in
689 	 * ill_delete_tail is sufficient.
690 	 */
691 	ipsq_flush(ill);
692 
693 	/*
694 	 * Nuke all interfaces.  ipif_free will take down the interface,
695 	 * remove it from the list, and free the data structure.
696 	 * Walk down the ipif list and remove the logical interfaces
697 	 * first before removing the main ipif. We can't unplumb
698 	 * the zeroth interface first in the case of IPv6 as reset_conn_ill
699 	 * -> ip_ll_delmulti_v6 de-references ill_ipif for checking
700 	 * POINTOPOINT.
701 	 *
702 	 * If ill_ipif was not properly initialized (i.e., low on memory),
703 	 * then there are no interfaces to clean up. In this case just clean up the
704 	 * ill.
705 	 */
706 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
707 		ipif_free(ipif);
708 
709 	/*
710 	 * Used only by ill_arp_on and ill_arp_off, which are writers.
711 	 * So nobody can be using this mp now. Free the mp allocated for
712 	 * honoring ILLF_NOARP
713 	 */
714 	freemsg(ill->ill_arp_on_mp);
715 	ill->ill_arp_on_mp = NULL;
716 
717 	/* Clean up msgs on pending upcalls for mrouted */
718 	reset_mrt_ill(ill);
719 
720 	/*
721 	 * ipif_free -> reset_conn_ipif will remove all multicast
722 	 * references for IPv4. For IPv6, we need to do it here as
723 	 * it points only at ills.
724 	 */
725 	reset_conn_ill(ill);
726 
727 	/*
728 	 * ill_down will arrange to blow off any IRE's dependent on this
729 	 * ILL, and shut down fragmentation reassembly.
730 	 */
731 	ill_down(ill);
732 
733 	/* Let SCTP know, so that it can remove this from its list. */
734 	sctp_update_ill(ill, SCTP_ILL_REMOVE);
735 
736 	/*
737 	 * If an address on this ILL is being used as a source address then
738 	 * clear out the pointers in other ILLs that point to this ILL.
739 	 */
740 	rw_enter(&ill_g_usesrc_lock, RW_WRITER);
741 	if (ill->ill_usesrc_grp_next != NULL) {
742 		if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? */
743 			ill_disband_usesrc_group(ill);
744 		} else {	/* consumer of the usesrc ILL */
745 			prev_ill = ill_prev_usesrc(ill);
746 			prev_ill->ill_usesrc_grp_next =
747 			    ill->ill_usesrc_grp_next;
748 		}
749 	}
750 	rw_exit(&ill_g_usesrc_lock);
751 }
752 
753 /*
754  * ill_delete_tail is called from ip_modclose after all references
755  * to the closing ill are gone. The wait is done in ip_modclose.
756  */
757 void
758 ill_delete_tail(ill_t *ill)
759 {
760 	mblk_t	**mpp;
761 	ipif_t	*ipif;
762 
763 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
764 		ipif_down_tail(ipif);
765 
766 	/*
767 	 * Send the detach if there's one to send (i.e., if we're above a
768 	 * style 2 DLPI driver).
769 	 */
770 	if (ill->ill_detach_mp != NULL) {
771 		ill_dlpi_send(ill, ill->ill_detach_mp);
772 		ill->ill_detach_mp = NULL;
773 	}
774 
775 	/*
776 	 * If polling capability is enabled (which signifies direct
777 	 * upcall into IP and that the driver has the ill saved as a handle),
778 	 * we need to make sure that unbind has completed before we
779 	 * let the ill disappear, so that the driver no longer holds any
780 	 * reference to this ill.
781 	 */
782 	mutex_enter(&ill->ill_lock);
783 	if (ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING)) {
784 		while (!(ill->ill_state_flags & ILL_DL_UNBIND_DONE))
785 			cv_wait(&ill->ill_cv, &ill->ill_lock);
786 	}
787 	mutex_exit(&ill->ill_lock);
788 
789 	if (ill->ill_net_type != IRE_LOOPBACK)
790 		qprocsoff(ill->ill_rq);
791 
792 	/*
793 	 * We do an ipsq_flush once again now. New messages could have
794 	 * arrived from below (M_ERROR or M_HANGUP). Similarly, ioctls
795 	 * could also have arrived if an ioctl thread had looked up
796 	 * the ill before we set the ILL_CONDEMNED flag, but not yet
797 	 * enqueued the ioctl when we did the ipsq_flush last time.
798 	 */
799 	ipsq_flush(ill);
800 
801 	/*
802 	 * Free capabilities.
803 	 */
804 	if (ill->ill_ipsec_capab_ah != NULL) {
805 		ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_AH);
806 		ill_ipsec_capab_free(ill->ill_ipsec_capab_ah);
807 		ill->ill_ipsec_capab_ah = NULL;
808 	}
809 
810 	if (ill->ill_ipsec_capab_esp != NULL) {
811 		ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_ESP);
812 		ill_ipsec_capab_free(ill->ill_ipsec_capab_esp);
813 		ill->ill_ipsec_capab_esp = NULL;
814 	}
815 
816 	if (ill->ill_mdt_capab != NULL) {
817 		kmem_free(ill->ill_mdt_capab, sizeof (ill_mdt_capab_t));
818 		ill->ill_mdt_capab = NULL;
819 	}
820 
821 	if (ill->ill_hcksum_capab != NULL) {
822 		kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t));
823 		ill->ill_hcksum_capab = NULL;
824 	}
825 
826 	if (ill->ill_zerocopy_capab != NULL) {
827 		kmem_free(ill->ill_zerocopy_capab,
828 		    sizeof (ill_zerocopy_capab_t));
829 		ill->ill_zerocopy_capab = NULL;
830 	}
831 
832 	/*
833 	 * Clean up polling and soft ring capabilities
834 	 */
835 	if (ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING))
836 		ill_capability_dls_disable(ill);
837 
838 	if (ill->ill_dls_capab != NULL) {
839 		CONN_DEC_REF(ill->ill_dls_capab->ill_unbind_conn);
840 		ill->ill_dls_capab->ill_unbind_conn = NULL;
841 		kmem_free(ill->ill_dls_capab,
842 		    sizeof (ill_dls_capab_t) +
843 		    (sizeof (ill_rx_ring_t) * ILL_MAX_RINGS));
844 		ill->ill_dls_capab = NULL;
845 	}
846 
847 	ASSERT(!(ill->ill_capabilities & ILL_CAPAB_POLL));
848 
849 	while (ill->ill_ipif != NULL)
850 		ipif_free_tail(ill->ill_ipif);
851 
852 	ill_down_tail(ill);
853 
854 	/*
855 	 * We have removed all references to ilm from conn and the ones joined
856 	 * within the kernel.
857 	 *
858 	 * We don't walk conns, mrts and ires because
859 	 *
860 	 * 1) reset_conn_ill and reset_mrt_ill clean up conns and mrts.
861 	 * 2) ill_down -> ill_downi walks all the ires and cleans up
862 	 *    ill references.
863 	 */
864 	ASSERT(ilm_walk_ill(ill) == 0);
865 	/*
866 	 * Take us out of the list of ILLs. ill_glist_delete -> ill_phyint_free
867 	 * could free the phyint. No more reference to the phyint after this
868 	 * point.
869 	 */
870 	(void) ill_glist_delete(ill);
871 
872 	rw_enter(&ip_g_nd_lock, RW_WRITER);
873 	if (ill->ill_ndd_name != NULL)
874 		nd_unload(&ip_g_nd, ill->ill_ndd_name);
875 	rw_exit(&ip_g_nd_lock);
876 
877 
878 	if (ill->ill_frag_ptr != NULL) {
879 		uint_t count;
880 
881 		for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
882 			mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock);
883 		}
884 		mi_free(ill->ill_frag_ptr);
885 		ill->ill_frag_ptr = NULL;
886 		ill->ill_frag_hash_tbl = NULL;
887 	}
888 	if (ill->ill_nd_lla_mp != NULL)
889 		freemsg(ill->ill_nd_lla_mp);
890 	/* Free all retained control messages. */
891 	mpp = &ill->ill_first_mp_to_free;
892 	do {
893 		while (mpp[0]) {
894 			mblk_t  *mp;
895 			mblk_t  *mp1;
896 
897 			mp = mpp[0];
898 			mpp[0] = mp->b_next;
899 			for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) {
900 				mp1->b_next = NULL;
901 				mp1->b_prev = NULL;
902 			}
903 			freemsg(mp);
904 		}
905 	} while (mpp++ != &ill->ill_last_mp_to_free);
906 
907 	ill_free_mib(ill);
908 	ILL_TRACE_CLEANUP(ill);
909 }
910 
911 static void
912 ill_free_mib(ill_t *ill)
913 {
914 	if (ill->ill_ip6_mib != NULL) {
915 		kmem_free(ill->ill_ip6_mib, sizeof (*ill->ill_ip6_mib));
916 		ill->ill_ip6_mib = NULL;
917 	}
918 	if (ill->ill_icmp6_mib != NULL) {
919 		kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib));
920 		ill->ill_icmp6_mib = NULL;
921 	}
922 }
923 
924 /*
925  * Concatenate together a physical address and a sap.
926  *
927  * Sap_lengths are interpreted as follows:
928  *   sap_length == 0	==>	no sap
929  *   sap_length > 0	==>	sap is at the head of the dlpi address
930  *   sap_length < 0	==>	sap is at the tail of the dlpi address
931  */
932 static void
933 ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length,
934     t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst)
935 {
936 	uint16_t sap_addr = (uint16_t)sap_src;
937 
938 	if (sap_length == 0) {
939 		if (phys_src == NULL)
940 			bzero(dst, phys_length);
941 		else
942 			bcopy(phys_src, dst, phys_length);
943 	} else if (sap_length < 0) {
944 		if (phys_src == NULL)
945 			bzero(dst, phys_length);
946 		else
947 			bcopy(phys_src, dst, phys_length);
948 		bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr));
949 	} else {
950 		bcopy(&sap_addr, dst, sizeof (sap_addr));
951 		if (phys_src == NULL)
952 			bzero((char *)dst + sap_length, phys_length);
953 		else
954 			bcopy(phys_src, (char *)dst + sap_length, phys_length);
955 	}
956 }
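/*
 * For example (illustrative): an Ethernet DLPI provider advertises a
 * sap_length of -2, so the sap_length < 0 arm above lays dst out as a
 * 6-byte physical address followed by the 2-byte sap.
 */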
957 
958 /*
959  * Generate a dl_unitdata_req mblk for the device and address given.
960  * addr_length is the length of the physical portion of the address.
961  * If addr is NULL, include an all-zero address of the specified length.
962  * In any case, addr_length is taken to be the entire length of the
963  * dlpi address, including the absolute value of sap_length.
964  */
965 mblk_t *
966 ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap,
967 		t_scalar_t sap_length)
968 {
969 	dl_unitdata_req_t *dlur;
970 	mblk_t	*mp;
971 	t_scalar_t	abs_sap_length;		/* absolute value */
972 
973 	abs_sap_length = ABS(sap_length);
974 	mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length,
975 		DL_UNITDATA_REQ);
976 	if (mp == NULL)
977 		return (NULL);
978 	dlur = (dl_unitdata_req_t *)mp->b_rptr;
979 	/* HACK: accommodate incompatible DLPI drivers */
980 	if (addr_length == 8)
981 		addr_length = 6;
982 	dlur->dl_dest_addr_length = addr_length + abs_sap_length;
983 	dlur->dl_dest_addr_offset = sizeof (*dlur);
984 	dlur->dl_priority.dl_min = 0;
985 	dlur->dl_priority.dl_max = 0;
986 	ill_dlur_copy_address(addr, addr_length, sap, sap_length,
987 	    (uchar_t *)&dlur[1]);
988 	return (mp);
989 }
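/*
 * Illustrative call (a sketch): for an Ethernet ill one might build
 * the unitdata template with
 *
 *	mp = ill_dlur_gen(mac_addr, 6, ETHERTYPE_IP, -2);
 *
 * yielding a DL_UNITDATA_REQ whose destination address is the 6-byte
 * MAC followed by the 2-byte sap 0x0800.
 */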
990 
991 /*
992  * Add the 'mp' to the list of pending mp's headed by ill_pending_mp.
993  * Returns B_FALSE if the conn has started closing, B_TRUE otherwise.
994  * This is used only for non-exclusive ioctls. Currently this is used
995  * for SIOC*ARP and SIOCGTUNPARAM ioctls. Most set ioctls are exclusive
996  * and thus need to use ipsq_pending_mp_add.
997  */
998 boolean_t
999 ill_pending_mp_add(ill_t *ill, conn_t *connp, mblk_t *add_mp)
1000 {
1001 	ASSERT(MUTEX_HELD(&ill->ill_lock));
1002 	ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
1003 	/*
1004 	 * M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls.
1005 	 */
1006 	ASSERT((add_mp->b_datap->db_type == M_IOCDATA) ||
1007 	    (add_mp->b_datap->db_type == M_IOCTL));
1008 
1009 	ASSERT(MUTEX_HELD(&connp->conn_lock));
1010 	/*
1011 	 * Return error if the conn has started closing. The conn
1012 	 * could have finished cleaning up the pending mp list.
1013 	 * If so, we should not add another mp to the list, negating
1014 	 * the cleanup.
1015 	 */
1016 	if (connp->conn_state_flags & CONN_CLOSING)
1017 		return (B_FALSE);
1018 	/*
1019 	 * Add the pending mp to the head of the list, chained by b_next.
1020 	 * Note down the queue on which the ioctl request came, in b_queue.
1021 	 * This will be used to later get the conn, when we get a response
1022 	 * on the ill queue, from some other module (typically arp)
1023 	 */
1024 	add_mp->b_next = (void *)ill->ill_pending_mp;
1025 	add_mp->b_queue = CONNP_TO_WQ(connp);
1026 	ill->ill_pending_mp = add_mp;
1027 	if (connp != NULL)
1028 		connp->conn_oper_pending_ill = ill;
1029 	return (B_TRUE);
1030 }
1031 
1032 /*
1033  * Retrieve the ill_pending_mp and return it. We have to walk the list
1034  * of mblks starting at ill_pending_mp, and match based on the ioc_id.
1035  */
1036 mblk_t *
1037 ill_pending_mp_get(ill_t *ill, conn_t **connpp, uint_t ioc_id)
1038 {
1039 	mblk_t	*prev = NULL;
1040 	mblk_t	*curr = NULL;
1041 	uint_t	id;
1042 	conn_t	*connp;
1043 
1044 	/*
1045 	 * When the conn closes, conn_ioctl_cleanup needs to clean
1046 	 * up the pending mp, but it does not know the ioc_id and
1047 	 * passes in a zero for it.
1048 	 */
1049 	mutex_enter(&ill->ill_lock);
1050 	if (ioc_id != 0)
1051 		*connpp = NULL;
1052 
1053 	/* Search the list for the appropriate ioctl based on ioc_id */
1054 	for (prev = NULL, curr = ill->ill_pending_mp; curr != NULL;
1055 	    prev = curr, curr = curr->b_next) {
1056 		id = ((struct iocblk *)curr->b_rptr)->ioc_id;
1057 		connp = Q_TO_CONN(curr->b_queue);
1058 		/* Match based on the ioc_id or based on the conn */
1059 		if ((id == ioc_id) || (ioc_id == 0 && connp == *connpp))
1060 			break;
1061 	}
1062 
1063 	if (curr != NULL) {
1064 		/* Unlink the mblk from the pending mp list */
1065 		if (prev != NULL) {
1066 			prev->b_next = curr->b_next;
1067 		} else {
1068 			ASSERT(ill->ill_pending_mp == curr);
1069 			ill->ill_pending_mp = curr->b_next;
1070 		}
1071 
1072 		/*
1073 		 * conn refcnt must have been bumped up at the start of
1074 		 * the ioctl. So we can safely access the conn.
1075 		 */
1076 		ASSERT(CONN_Q(curr->b_queue));
1077 		*connpp = Q_TO_CONN(curr->b_queue);
1078 		curr->b_next = NULL;
1079 		curr->b_queue = NULL;
1080 	}
1081 
1082 	mutex_exit(&ill->ill_lock);
1083 
1084 	return (curr);
1085 }
1086 
1087 /*
1088  * Add the pending mp to the list. There can be only 1 pending mp
1089  * in the list. Any exclusive ioctl that needs to wait for a response
1090  * from another module or driver needs to use this function to set
1091  * the ipsq_pending_mp to the ioctl mblk and wait for the response from
1092  * the other module/driver. This is also used while waiting for the
1093  * ipif/ill/ire refcnts to drop to zero in bringing down an ipif.
1094  */
1095 boolean_t
1096 ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp,
1097     int waitfor)
1098 {
1099 	ipsq_t	*ipsq;
1100 
1101 	ASSERT(IAM_WRITER_IPIF(ipif));
1102 	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
1103 	ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
1104 	/*
1105 	 * M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls,
1106 	 * M_ERROR/M_HANGUP from driver
1107 	 */
1108 	ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_IOCTL) ||
1109 	    (DB_TYPE(add_mp) == M_ERROR) || (DB_TYPE(add_mp) == M_HANGUP));
1110 
1111 	ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq;
1112 	if (connp != NULL) {
1113 		ASSERT(MUTEX_HELD(&connp->conn_lock));
1114 		/*
1115 		 * Return error if the conn has started closing. The conn
1116 		 * could have finished cleaning up the pending mp list.
1117 		 * If so, we should not add another mp to the list, negating
1118 		 * the cleanup.
1119 		 */
1120 		if (connp->conn_state_flags & CONN_CLOSING)
1121 			return (B_FALSE);
1122 	}
1123 	mutex_enter(&ipsq->ipsq_lock);
1124 	ipsq->ipsq_pending_ipif = ipif;
1125 	/*
1126 	 * Note down the queue in b_queue. This will be returned by
1127 	 * ipsq_pending_mp_get. Caller will then use these values to restart
1128 	 * the processing
1129 	 */
1130 	add_mp->b_next = NULL;
1131 	add_mp->b_queue = q;
1132 	ipsq->ipsq_pending_mp = add_mp;
1133 	ipsq->ipsq_waitfor = waitfor;
1134 	/*
1135 	 * ipsq_current_ipif is needed to restart the operation from
1136 	 * ipif_ill_refrele_tail when the last reference to the ipif/ill
1137 	 * is gone. Since this is not an ioctl, ipsq_current_ipif has not
1138 	 * been set until now.
1139 	 */
1140 	if (DB_TYPE(add_mp) == M_ERROR || DB_TYPE(add_mp) == M_HANGUP) {
1141 		ASSERT(ipsq->ipsq_current_ipif == NULL);
1142 		ipsq->ipsq_current_ipif = ipif;
1143 		ipsq->ipsq_last_cmd = DB_TYPE(add_mp);
1144 	}
1145 	if (connp != NULL)
1146 		connp->conn_oper_pending_ill = ipif->ipif_ill;
1147 	mutex_exit(&ipsq->ipsq_lock);
1148 	return (B_TRUE);
1149 }
1150 
1151 /*
1152  * Retrieve the ipsq_pending_mp and return it. There can be only 1 mp
1153  * queued in the list.
1154  */
1155 mblk_t *
1156 ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp)
1157 {
1158 	mblk_t	*curr = NULL;
1159 
1160 	mutex_enter(&ipsq->ipsq_lock);
1161 	*connpp = NULL;
1162 	if (ipsq->ipsq_pending_mp == NULL) {
1163 		mutex_exit(&ipsq->ipsq_lock);
1164 		return (NULL);
1165 	}
1166 
1167 	/* There can be only 1 such excl message */
1168 	curr = ipsq->ipsq_pending_mp;
1169 	ASSERT(curr != NULL && curr->b_next == NULL);
1170 	ipsq->ipsq_pending_ipif = NULL;
1171 	ipsq->ipsq_pending_mp = NULL;
1172 	ipsq->ipsq_waitfor = 0;
1173 	mutex_exit(&ipsq->ipsq_lock);
1174 
1175 	if (CONN_Q(curr->b_queue)) {
1176 		/*
1177 		 * This mp did a refhold on the conn, at the start of the ioctl.
1178 		 * So we can safely return a pointer to the conn to the caller.
1179 		 */
1180 		*connpp = Q_TO_CONN(curr->b_queue);
1181 	} else {
1182 		*connpp = NULL;
1183 	}
1184 	curr->b_next = NULL;
1185 	curr->b_prev = NULL;
1186 	return (curr);
1187 }
1188 
1189 /*
1190  * Cleanup the ioctl mp queued in ipsq_pending_mp
1191  * - Called in the ill_delete path
1192  * - Called in the M_ERROR or M_HANGUP path on the ill.
1193  * - Called in the conn close path.
1194  */
1195 boolean_t
1196 ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp)
1197 {
1198 	mblk_t	*mp;
1199 	ipsq_t	*ipsq;
1200 	queue_t	*q;
1201 	ipif_t	*ipif;
1202 
1203 	ASSERT(IAM_WRITER_ILL(ill));
1204 	ipsq = ill->ill_phyint->phyint_ipsq;
1205 	mutex_enter(&ipsq->ipsq_lock);
1206 	/*
1207 	 * If connp is null, unconditionally clean up the ipsq_pending_mp.
1208 	 * This happens in M_ERROR/M_HANGUP. We need to abort the current ioctl
1209 	 * even if it is meant for another ill, since we have to enqueue
1210 	 * a new mp now in ipsq_pending_mp to complete the ipif_down.
1211 	 * If connp is non-null we are called from the conn close path.
1212 	 */
1213 	mp = ipsq->ipsq_pending_mp;
1214 	if (mp == NULL || (connp != NULL &&
1215 	    mp->b_queue != CONNP_TO_WQ(connp))) {
1216 		mutex_exit(&ipsq->ipsq_lock);
1217 		return (B_FALSE);
1218 	}
1219 	/* Now remove from the ipsq_pending_mp */
1220 	ipsq->ipsq_pending_mp = NULL;
1221 	q = mp->b_queue;
1222 	mp->b_next = NULL;
1223 	mp->b_prev = NULL;
1224 	mp->b_queue = NULL;
1225 
1226 	/* If MOVE was in progress, clear the move_in_progress fields also. */
1227 	ill = ipsq->ipsq_pending_ipif->ipif_ill;
1228 	if (ill->ill_move_in_progress) {
1229 		ILL_CLEAR_MOVE(ill);
1230 	} else if (ill->ill_up_ipifs) {
1231 		ill_group_cleanup(ill);
1232 	}
1233 
1234 	ipif = ipsq->ipsq_pending_ipif;
1235 	ipsq->ipsq_pending_ipif = NULL;
1236 	ipsq->ipsq_waitfor = 0;
1237 	ipsq->ipsq_current_ipif = NULL;
1238 	mutex_exit(&ipsq->ipsq_lock);
1239 
1240 	if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) {
1241 		ip_ioctl_finish(q, mp, ENXIO, connp != NULL ? CONN_CLOSE :
1242 		    NO_COPYOUT, connp != NULL ? ipif : NULL, NULL);
1243 	} else {
1244 		/*
1245 		 * IP-MT XXX In the case of TLI/XTI bind / optmgmt this can't
1246 		 * be just inet_freemsg; we have to restart it,
1247 		 * otherwise the thread will be stuck.
1248 		 */
1249 		inet_freemsg(mp);
1250 	}
1251 	return (B_TRUE);
1252 }
1253 
1254 /*
1255  * The ill is closing. Cleanup all the pending mps. Called exclusively
1256  * towards the end of ill_delete. The refcount has gone to 0. So nobody
1257  * knows this ill, and hence nobody can add an mp to this list.
1258  */
1259 static void
1260 ill_pending_mp_cleanup(ill_t *ill)
1261 {
1262 	mblk_t	*mp;
1263 	queue_t	*q;
1264 
1265 	ASSERT(IAM_WRITER_ILL(ill));
1266 
1267 	mutex_enter(&ill->ill_lock);
1268 	/*
1269 	 * Every mp on the pending mp list originating from an ioctl
1270 	 * added 1 to the conn refcnt, at the start of the ioctl.
1271 	 * So bump it down now.  See comments in ip_wput_nondata()
1272 	 */
1273 	while (ill->ill_pending_mp != NULL) {
1274 		mp = ill->ill_pending_mp;
1275 		ill->ill_pending_mp = mp->b_next;
1276 		mutex_exit(&ill->ill_lock);
1277 
1278 		q = mp->b_queue;
1279 		ASSERT(CONN_Q(q));
1280 		mp->b_next = NULL;
1281 		mp->b_prev = NULL;
1282 		mp->b_queue = NULL;
1283 		ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL, NULL);
1284 		mutex_enter(&ill->ill_lock);
1285 	}
1286 	ill->ill_pending_ipif = NULL;
1287 
1288 	mutex_exit(&ill->ill_lock);
1289 }
1290 
1291 /*
1292  * Called in the conn close path and ill delete path
1293  */
1294 static void
1295 ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp)
1296 {
1297 	ipsq_t	*ipsq;
1298 	mblk_t	*prev;
1299 	mblk_t	*curr;
1300 	mblk_t	*next;
1301 	queue_t	*q;
1302 	mblk_t	*tmp_list = NULL;
1303 
1304 	ASSERT(IAM_WRITER_ILL(ill));
1305 	if (connp != NULL)
1306 		q = CONNP_TO_WQ(connp);
1307 	else
1308 		q = ill->ill_wq;
1309 
1310 	ipsq = ill->ill_phyint->phyint_ipsq;
1311 	/*
1312 	 * Cleanup the ioctl mp's queued in ipsq_xopq_pending_mp if any.
1313 	 * In the case of an ioctl from a conn, there can be only 1 mp
1314 	 * queued on the ipsq. If an ill is being unplumbed, only messages
1315 	 * related to this ill are flushed, like M_ERROR or M_HANGUP messages.
1316 	 * Ioctls meant for this ill from conns are not flushed. They will
1317 	 * be processed during ipsq_exit and will not find the ill and will
1318 	 * return error.
1319 	 */
1320 	mutex_enter(&ipsq->ipsq_lock);
1321 	for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL;
1322 	    curr = next) {
1323 		next = curr->b_next;
1324 		if (curr->b_queue == q || curr->b_queue == RD(q)) {
1325 			/* Unlink the mblk from the pending mp list */
1326 			if (prev != NULL) {
1327 				prev->b_next = curr->b_next;
1328 			} else {
1329 				ASSERT(ipsq->ipsq_xopq_mphead == curr);
1330 				ipsq->ipsq_xopq_mphead = curr->b_next;
1331 			}
1332 			if (ipsq->ipsq_xopq_mptail == curr)
1333 				ipsq->ipsq_xopq_mptail = prev;
1334 			/*
1335 			 * Create a temporary list and release the ipsq lock.
1336 			 * New elements are added to the head of the tmp_list.
1337 			 */
1338 			curr->b_next = tmp_list;
1339 			tmp_list = curr;
1340 		} else {
1341 			prev = curr;
1342 		}
1343 	}
1344 	mutex_exit(&ipsq->ipsq_lock);
1345 
1346 	while (tmp_list != NULL) {
1347 		curr = tmp_list;
1348 		tmp_list = curr->b_next;
1349 		curr->b_next = NULL;
1350 		curr->b_prev = NULL;
1351 		curr->b_queue = NULL;
1352 		if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) {
1353 			ip_ioctl_finish(q, curr, ENXIO, connp != NULL ?
1354 			    CONN_CLOSE : NO_COPYOUT, NULL, NULL);
1355 		} else {
1356 			/*
1357 			 * IP-MT XXX In the case of TLI/XTI bind / optmgmt
1358 			 * this can't be just inet_freemsg; we have to
1359 			 * restart it, otherwise the thread will be stuck.
1360 			 */
1361 			inet_freemsg(curr);
1362 		}
1363 	}
1364 }
1365 
1366 /*
1367  * This conn has started closing. Cleanup any pending ioctl from this conn.
1368  * STREAMS ensures that there can be at most 1 ioctl pending on a stream.
1369  */
1370 void
1371 conn_ioctl_cleanup(conn_t *connp)
1372 {
1373 	mblk_t *curr;
1374 	ipsq_t	*ipsq;
1375 	ill_t	*ill;
1376 	boolean_t refheld;
1377 
1378 	/*
1379 	 * Is any exclusive ioctl pending? If so, clean it up. If the
1380 	 * ioctl has not yet started, the mp is pending in the list headed by
1381 	 * ipsq_xopq_head. If the ioctl has started the mp could be present in
1382 	 * ipsq_pending_mp. If the ioctl timed out in the streamhead but
1383 	 * is currently executing, then the mp is not queued anywhere and
1384 	 * conn_oper_pending_ill is null. The conn close will wait
1385 	 * till the conn_ref drops to zero.
1386 	 */
1387 	mutex_enter(&connp->conn_lock);
1388 	ill = connp->conn_oper_pending_ill;
1389 	if (ill == NULL) {
1390 		mutex_exit(&connp->conn_lock);
1391 		return;
1392 	}
1393 
1394 	curr = ill_pending_mp_get(ill, &connp, 0);
1395 	if (curr != NULL) {
1396 		mutex_exit(&connp->conn_lock);
1397 		CONN_DEC_REF(connp);
1398 		inet_freemsg(curr);
1399 		return;
1400 	}
1401 	/*
1402 	 * We may not be able to refhold the ill if the ill/ipif
1403 	 * is changing. But we need to make sure that the ill will
1404 	 * not vanish. So we just bump up the ill_waiter count.
1405 	 */
1406 	refheld = ill_waiter_inc(ill);
1407 	mutex_exit(&connp->conn_lock);
1408 	if (refheld) {
1409 		if (ipsq_enter(ill, B_TRUE)) {
1410 			ill_waiter_dcr(ill);
1411 			/*
1412 			 * Check whether this ioctl has started and is
1413 			 * pending now in ipsq_pending_mp. If it is not
1414 			 * found there then check whether this ioctl has
1415 			 * not even started and is in the ipsq_xopq list.
1416 			 */
1417 			if (!ipsq_pending_mp_cleanup(ill, connp))
1418 				ipsq_xopq_mp_cleanup(ill, connp);
1419 			ipsq = ill->ill_phyint->phyint_ipsq;
1420 			ipsq_exit(ipsq, B_TRUE, B_TRUE);
1421 			return;
1422 		}
1423 	}
1424 
1425 	/*
1426 	 * The ill is also closing and we could not bump up the
1427 	 * ill_waiter_count or we could not enter the ipsq. Leave
1428 	 * the cleanup to ill_delete
1429 	 */
1430 	mutex_enter(&connp->conn_lock);
1431 	while (connp->conn_oper_pending_ill != NULL)
1432 		cv_wait(&connp->conn_refcv, &connp->conn_lock);
1433 	mutex_exit(&connp->conn_lock);
1434 	if (refheld)
1435 		ill_waiter_dcr(ill);
1436 }
1437 
1438 /*
1439  * ipcl_walk function for cleaning up conn_*_ill fields.
1440  */
1441 static void
1442 conn_cleanup_ill(conn_t *connp, caddr_t arg)
1443 {
1444 	ill_t	*ill = (ill_t *)arg;
1445 	ire_t	*ire;
1446 
1447 	mutex_enter(&connp->conn_lock);
1448 	if (connp->conn_multicast_ill == ill) {
1449 		/* Revert to late binding */
1450 		connp->conn_multicast_ill = NULL;
1451 		connp->conn_orig_multicast_ifindex = 0;
1452 	}
1453 	if (connp->conn_incoming_ill == ill)
1454 		connp->conn_incoming_ill = NULL;
1455 	if (connp->conn_outgoing_ill == ill)
1456 		connp->conn_outgoing_ill = NULL;
1457 	if (connp->conn_outgoing_pill == ill)
1458 		connp->conn_outgoing_pill = NULL;
1459 	if (connp->conn_nofailover_ill == ill)
1460 		connp->conn_nofailover_ill = NULL;
1461 	if (connp->conn_xmit_if_ill == ill)
1462 		connp->conn_xmit_if_ill = NULL;
1463 	if (connp->conn_ire_cache != NULL) {
1464 		ire = connp->conn_ire_cache;
1465 		/*
1466 		 * ip_newroute creates IRE_CACHE with ire_stq coming from
1467 		 * interface X and ipif coming from interface Y, if interface
1468 		 * X and Y are part of the same IPMP group. Thus whenever
1469 		 * interface X goes down, remove all references to it by
1470 		 * checking both on ire_ipif and ire_stq.
1471 		 */
1472 		if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) ||
1473 		    (ire->ire_type == IRE_CACHE &&
1474 		    ire->ire_stq == ill->ill_wq)) {
1475 			connp->conn_ire_cache = NULL;
1476 			mutex_exit(&connp->conn_lock);
1477 			ire_refrele_notr(ire);
1478 			return;
1479 		}
1480 	}
1481 	mutex_exit(&connp->conn_lock);
1482 
1483 }
1484 
1485 /* ARGSUSED */
1486 void
1487 ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
1488 {
1489 	ill_t	*ill = q->q_ptr;
1490 	ipif_t	*ipif;
1491 
1492 	ASSERT(IAM_WRITER_IPSQ(ipsq));
1493 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
1494 		ipif_down_tail(ipif);
1495 	ill_down_tail(ill);
1496 	freemsg(mp);
1497 	ipsq->ipsq_current_ipif = NULL;
1498 }
1499 
1500 /*
1501  * ill_down_start is called when we want to down this ill and bring it up again.
1502  * It is called when we receive an M_ERROR / M_HANGUP. In this case we shut down
1503  * all interfaces, but don't tear down any plumbing.
1504  */
1505 boolean_t
1506 ill_down_start(queue_t *q, mblk_t *mp)
1507 {
1508 	ill_t	*ill;
1509 	ipif_t	*ipif;
1510 
1511 	ill = q->q_ptr;
1512 
1513 	ASSERT(IAM_WRITER_ILL(ill));
1514 
1515 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
1516 		(void) ipif_down(ipif, NULL, NULL);
1517 
1518 	ill_down(ill);
1519 
1520 	(void) ipsq_pending_mp_cleanup(ill, NULL);
1521 	mutex_enter(&ill->ill_lock);
1522 	/*
1523 	 * Atomically test and add the pending mp if references are
1524 	 * still active.
1525 	 */
1526 	if (!ill_is_quiescent(ill)) {
1527 		/*
1528 		 * Get rid of any pending mps and cleanup. Call will
1529 		 * not fail since we are passing a null connp.
1530 		 */
1531 		(void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
1532 		    mp, ILL_DOWN);
1533 		mutex_exit(&ill->ill_lock);
1534 		return (B_FALSE);
1535 	}
1536 	mutex_exit(&ill->ill_lock);
1537 	return (B_TRUE);
1538 }
1539 
1540 static void
1541 ill_down(ill_t *ill)
1542 {
1543 	/* Blow off any IREs dependent on this ILL. */
1544 	ire_walk(ill_downi, (char *)ill);
1545 
1546 	mutex_enter(&ire_mrtun_lock);
1547 	if (ire_mrtun_count != 0) {
1548 		mutex_exit(&ire_mrtun_lock);
1549 		ire_walk_ill_mrtun(0, 0, ill_downi_mrtun_srcif,
1550 		    (char *)ill, NULL);
1551 	} else {
1552 		mutex_exit(&ire_mrtun_lock);
1553 	}
1554 
1555 	/*
1556 	 * If any interface-based forwarding table exists,
1557 	 * blow off the ires there that depend on this ill.
1558 	 */
1559 	mutex_enter(&ire_srcif_table_lock);
1560 	if (ire_srcif_table_count > 0) {
1561 		mutex_exit(&ire_srcif_table_lock);
1562 		ire_walk_srcif_table_v4(ill_downi_mrtun_srcif, (char *)ill);
1563 	} else {
1564 		mutex_exit(&ire_srcif_table_lock);
1565 	}
1566 
1567 	/* Remove any conn_*_ill depending on this ill */
1568 	ipcl_walk(conn_cleanup_ill, (caddr_t)ill);
1569 
1570 	if (ill->ill_group != NULL) {
1571 		illgrp_delete(ill);
1572 	}
1573 
1574 }
1575 
1576 static void
1577 ill_down_tail(ill_t *ill)
1578 {
1579 	int	i;
1580 
1581 	/* Destroy ill_srcif_table if it exists */
1582 	/* Lock not really required because nobody should be able to access it */
1583 	mutex_enter(&ill->ill_lock);
1584 	if (ill->ill_srcif_table != NULL) {
1585 		ill->ill_srcif_refcnt = 0;
1586 		for (i = 0; i < IP_SRCIF_TABLE_SIZE; i++) {
1587 			rw_destroy(&ill->ill_srcif_table[i].irb_lock);
1588 		}
1589 		kmem_free(ill->ill_srcif_table,
1590 		    IP_SRCIF_TABLE_SIZE * sizeof (irb_t));
1591 		ill->ill_srcif_table = NULL;
1592 		ill->ill_srcif_refcnt = 0;
1593 		ill->ill_mrtun_refcnt = 0;
1594 	}
1595 	mutex_exit(&ill->ill_lock);
1596 }
1597 
1598 /*
1599  * ire_walk routine used to delete every IRE that depends on queues
1600  * associated with 'ill'.  (Always called as writer.)
1601  */
1602 static void
1603 ill_downi(ire_t *ire, char *ill_arg)
1604 {
1605 	ill_t	*ill = (ill_t *)ill_arg;
1606 
1607 	/*
1608 	 * ip_newroute creates IRE_CACHE with ire_stq coming from
1609 	 * interface X and ipif coming from interface Y, if interface
1610 	 * X and Y are part of the same IPMP group. Thus whenever interface
1611 	 * X goes down, remove all references to it by checking both
1612 	 * on ire_ipif and ire_stq.
1613 	 */
1614 	if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) ||
1615 	    (ire->ire_type == IRE_CACHE && ire->ire_stq == ill->ill_wq)) {
1616 		ire_delete(ire);
1617 	}
1618 }
1619 
1620 /*
1621  * A separate routine for deleting revtun and srcif based routes
1622  * is needed because these ires are only deleted when the interface
1623  * is unplumbed, and they have ire_in_ill non-null as well.
1624  * We also want to keep the mobile IP specific code separate.
1625  */
1626 static void
1627 ill_downi_mrtun_srcif(ire_t *ire, char *ill_arg)
1628 {
1629 	ill_t   *ill = (ill_t *)ill_arg;
1630 
1631 	ASSERT(ire->ire_in_ill != NULL);
1632 
1633 	if ((ire->ire_in_ill != NULL && ire->ire_in_ill == ill) ||
1634 	    (ire->ire_stq == ill->ill_wq) || (ire->ire_stq == ill->ill_rq)) {
1635 		ire_delete(ire);
1636 	}
1637 }
1638 
1639 /*
1640  * Remove ire/nce from the fastpath list.
1641  */
1642 void
1643 ill_fastpath_nack(ill_t *ill)
1644 {
1645 	if (ill->ill_isv6) {
1646 		nce_fastpath_list_dispatch(ill, NULL, NULL);
1647 	} else {
1648 		ire_fastpath_list_dispatch(ill, NULL, NULL);
1649 	}
1650 }
1651 
1652 /* Consume an M_IOCACK of the fastpath probe. */
1653 void
1654 ill_fastpath_ack(ill_t *ill, mblk_t *mp)
1655 {
1656 	mblk_t	*mp1 = mp;
1657 
1658 	/*
1659 	 * If this was the first attempt, turn on fastpath probing.
1660 	 */
1661 	mutex_enter(&ill->ill_lock);
1662 	if (ill->ill_dlpi_fastpath_state == IDMS_INPROGRESS)
1663 		ill->ill_dlpi_fastpath_state = IDMS_OK;
1664 	mutex_exit(&ill->ill_lock);
1665 
1666 	/* Free the M_IOCACK mblk, hold on to the data */
1667 	mp = mp->b_cont;
1668 	freeb(mp1);
1669 	if (mp == NULL)
1670 		return;
1671 	if (mp->b_cont != NULL) {
1672 		/*
1673 		 * Update all IRE's or NCE's that are waiting for
1674 		 * fastpath update.
1675 		 */
1676 		if (ill->ill_isv6) {
1677 			/*
1678 			 * update nce's in the fastpath list.
1679 			 */
1680 			nce_fastpath_list_dispatch(ill,
1681 			    ndp_fastpath_update, mp);
1682 		} else {
1684 			/*
1685 			 * update ire's in the fastpath list.
1686 			 */
1687 			ire_fastpath_list_dispatch(ill,
1688 			    ire_fastpath_update, mp);
1689 			/*
1690 			 * Check if we need to traverse the reverse tunnel
1691 			 * table.  Since there is only a single ire_type
1692 			 * (IRE_MIPRTUN) there, we don't need to match on it.
1693 			 * We have to check ire_mrtun_count and not the
1694 			 * ill_mrtun_refcnt since ill_mrtun_refcnt is set
1695 			 * on the incoming ill and here we are dealing with
1696 			 * outgoing ill.
1697 			 */
1698 			mutex_enter(&ire_mrtun_lock);
1699 			if (ire_mrtun_count != 0) {
1700 				mutex_exit(&ire_mrtun_lock);
1701 				ire_walk_ill_mrtun(MATCH_IRE_WQ, IRE_MIPRTUN,
1702 				    (void (*)(ire_t *, void *))
1703 					ire_fastpath_update, mp, ill);
1704 			} else {
1705 				mutex_exit(&ire_mrtun_lock);
1706 			}
1707 		}
1708 		mp1 = mp->b_cont;
1709 		freeb(mp);
1710 		mp = mp1;
1711 	} else {
1712 		ip0dbg(("ill_fastpath_ack:  no b_cont\n"));
1713 	}
1714 
1715 	freeb(mp);
1716 }
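
/*
 * For reference, a sketch of the three-block M_IOCACK chain that
 * ill_fastpath_ack() above consumes (assuming the driver replied
 * with a fastpath header template):
 *
 *	[M_IOCACK] -> [DLPI unitdata template] -> [fastpath header]
 *
 * The first block is freed immediately, the second once the IRE/NCE
 * fastpath lists have been dispatched with the remainder of the
 * chain, and whatever is left is freed at the end.
 */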
1717 
1718 /*
1719  * Throw an M_IOCTL message downstream asking "do you know fastpath?"
1720  * The data portion of the request is a dl_unitdata_req_t template for
1721  * what we would send downstream in the absence of a fastpath confirmation.
1722  */
1723 int
1724 ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp)
1725 {
1726 	struct iocblk	*ioc;
1727 	mblk_t	*mp;
1728 
1729 	if (dlur_mp == NULL)
1730 		return (EINVAL);
1731 
1732 	mutex_enter(&ill->ill_lock);
1733 	switch (ill->ill_dlpi_fastpath_state) {
1734 	case IDMS_FAILED:
1735 		/*
1736 		 * Driver NAKed the first fastpath ioctl - assume it doesn't
1737 		 * support it.
1738 		 */
1739 		mutex_exit(&ill->ill_lock);
1740 		return (ENOTSUP);
1741 	case IDMS_UNKNOWN:
1742 		/* This is the first probe */
1743 		ill->ill_dlpi_fastpath_state = IDMS_INPROGRESS;
1744 		break;
1745 	default:
1746 		break;
1747 	}
1748 	mutex_exit(&ill->ill_lock);
1749 
1750 	if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL)
1751 		return (EAGAIN);
1752 
1753 	mp->b_cont = copyb(dlur_mp);
1754 	if (mp->b_cont == NULL) {
1755 		freeb(mp);
1756 		return (EAGAIN);
1757 	}
1758 
1759 	ioc = (struct iocblk *)mp->b_rptr;
1760 	ioc->ioc_count = msgdsize(mp->b_cont);
1761 
1762 	putnext(ill->ill_wq, mp);
1763 	return (0);
1764 }
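
/*
 * Illustrative sketch, compiled out: how a caller might drive a
 * fastpath probe once it has a dl_unitdata_req_t template.  The
 * function name is hypothetical; the reply is consumed later by
 * ill_fastpath_ack() or ill_fastpath_nack().
 */
#ifdef notdef
static void
example_fastpath_start(ill_t *ill, mblk_t *dlur_mp)
{
	switch (ill_fastpath_probe(ill, dlur_mp)) {
	case 0:
		/* DL_IOC_HDR_INFO ioctl is on its way downstream */
		break;
	case ENOTSUP:
		/* driver NAKed an earlier probe; don't bother again */
		break;
	case EINVAL:	/* no template supplied */
	case EAGAIN:	/* allocation failure; may retry later */
	default:
		break;
	}
}
#endif /* notdef */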
1765 
1766 void
1767 ill_capability_probe(ill_t *ill)
1768 {
1769 	/*
1770 	 * Probe only if capabilities are still unknown or a renegotiation
1771 	 * has been requested; otherwise negotiation is in progress or done.
1772 	 */
1773 	if (ill->ill_capab_state != IDMS_UNKNOWN &&
1774 	    ill->ill_capab_state != IDMS_RENEG)
1775 		return;
1776 
1777 	ill->ill_capab_state = IDMS_INPROGRESS;
1778 	ip1dbg(("ill_capability_probe: starting capability negotiation\n"));
1779 	ill_capability_proto(ill, DL_CAPABILITY_REQ, NULL);
1780 }
1781 
1782 void
1783 ill_capability_reset(ill_t *ill)
1784 {
1785 	mblk_t *sc_mp = NULL;
1786 	mblk_t *tmp;
1787 
1788 	/*
1789 	 * Note here that we reset the state to UNKNOWN, and later send
1790 	 * down the DL_CAPABILITY_REQ without first setting the state to
1791 	 * INPROGRESS.  We do this in order to distinguish the
1792 	 * DL_CAPABILITY_ACK response which may come back in response to
1793 	 * a "reset" apart from the "probe" DL_CAPABILITY_REQ.  This would
1794 	 * also handle the case where the driver doesn't send us back
1795 	 * a DL_CAPABILITY_ACK in response, since the "probe" routine
1796 	 * requires the state to be in UNKNOWN anyway.  In any case, all
1797 	 * features are turned off until the state reaches IDMS_OK.
1798 	 */
1799 	ill->ill_capab_state = IDMS_UNKNOWN;
1800 
1801 	/*
1802 	 * Disable sub-capabilities and request a list of sub-capability
1803 	 * messages which will be sent down to the driver.  Each handler
1804 	 * allocates the corresponding dl_capability_sub_t inside an
1805 	 * mblk, and links it to the existing sc_mp mblk, or returns it
1806 	 * as sc_mp if it's the first sub-capability (the passed-in
1807 	 * sc_mp is NULL).  Upon returning from all capability handlers,
1808 	 * sc_mp is pulled up before being passed downstream.
1809 	 */
1810 	ill_capability_mdt_reset(ill, &sc_mp);
1811 	ill_capability_hcksum_reset(ill, &sc_mp);
1812 	ill_capability_zerocopy_reset(ill, &sc_mp);
1813 	ill_capability_ipsec_reset(ill, &sc_mp);
1814 	ill_capability_dls_reset(ill, &sc_mp);
1815 
1816 	/* Nothing to send down in order to disable the capabilities? */
1817 	if (sc_mp == NULL)
1818 		return;
1819 
1820 	tmp = msgpullup(sc_mp, -1);
1821 	freemsg(sc_mp);
1822 	if ((sc_mp = tmp) == NULL) {
1823 		cmn_err(CE_WARN, "ill_capability_reset: unable to send down "
1824 		    "DL_CAPABILITY_REQ (ENOMEM)\n");
1825 		return;
1826 	}
1827 
1828 	ip1dbg(("ill_capability_reset: resetting negotiated capabilities\n"));
1829 	ill_capability_proto(ill, DL_CAPABILITY_REQ, sc_mp);
1830 }
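
/*
 * Illustrative sketch, compiled out: the sc_mp chaining convention
 * that each ill_capability_*_reset() handler called above follows.
 * The handler name and the DL_CAPAB_MDT payload choice here are
 * placeholders, not a real handler.
 */
#ifdef notdef
static void
example_capability_reset(ill_t *ill, mblk_t **sc_mp)
{
	dl_capability_sub_t *dl_subcap;
	mblk_t *mp;
	int size = sizeof (*dl_subcap);

	mp = allocb(size, BPRI_HI);
	if (mp == NULL)
		return;			/* best effort; nothing linked */
	mp->b_wptr = mp->b_rptr + size;

	dl_subcap = (dl_capability_sub_t *)mp->b_rptr;
	dl_subcap->dl_cap = DL_CAPAB_MDT;	/* whichever sub-capability */
	dl_subcap->dl_length = 0;		/* plus its payload size */

	/* The first handler seeds *sc_mp; later ones append via linkb() */
	if (*sc_mp != NULL)
		linkb(*sc_mp, mp);
	else
		*sc_mp = mp;
}
#endif /* notdef */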
1831 
1832 /*
1833  * Request or set new-style hardware capabilities supported by DLS provider.
1834  */
1835 static void
1836 ill_capability_proto(ill_t *ill, int type, mblk_t *reqp)
1837 {
1838 	mblk_t *mp;
1839 	dl_capability_req_t *capb;
1840 	size_t size = 0;
1841 	uint8_t *ptr;
1842 
1843 	if (reqp != NULL)
1844 		size = MBLKL(reqp);
1845 
1846 	mp = ip_dlpi_alloc(sizeof (dl_capability_req_t) + size, type);
1847 	if (mp == NULL) {
1848 		freemsg(reqp);
1849 		return;
1850 	}
1851 	ptr = mp->b_rptr;
1852 
1853 	capb = (dl_capability_req_t *)ptr;
1854 	ptr += sizeof (dl_capability_req_t);
1855 
1856 	if (reqp != NULL) {
1857 		capb->dl_sub_offset = sizeof (dl_capability_req_t);
1858 		capb->dl_sub_length = size;
1859 		bcopy(reqp->b_rptr, ptr, size);
1860 		ptr += size;
1861 		mp->b_cont = reqp->b_cont;
1862 		freeb(reqp);
1863 	}
1864 	ASSERT(ptr == mp->b_wptr);
1865 
1866 	ill_dlpi_send(ill, mp);
1867 }
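
/*
 * For reference, a sketch of the message ill_capability_proto()
 * builds (offsets relative to b_rptr):
 *
 *	+--------------------------+  <- b_rptr
 *	| dl_capability_req_t      |  dl_sub_offset = sizeof (req)
 *	|                          |  dl_sub_length = MBLKL(reqp)
 *	+--------------------------+
 *	| dl_capability_sub_t      |  \
 *	| sub-capability payload   |   copied from reqp, if supplied
 *	| ... possibly more subs   |  /
 *	+--------------------------+  <- b_wptr
 */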
1868 
1869 static void
1870 ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers)
1871 {
1872 	dl_capab_id_t *id_ic;
1873 	uint_t sub_dl_cap = outers->dl_cap;
1874 	dl_capability_sub_t *inners;
1875 	uint8_t *capend;
1876 
1877 	ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER);
1878 
1879 	/*
1880 	 * Note: range checks here are not absolutely sufficient to
1881 	 * make us robust against malformed messages sent by drivers;
1882 	 * this is in keeping with the rest of IP's dlpi handling.
1883 	 * (Remember, it's coming from something else in the kernel
1884 	 * address space)
1885 	 */
1886 
1887 	capend = (uint8_t *)(outers + 1) + outers->dl_length;
1888 	if (capend > mp->b_wptr) {
1889 		cmn_err(CE_WARN, "ill_capability_id_ack: "
1890 		    "malformed sub-capability too long for mblk");
1891 		return;
1892 	}
1893 
1894 	id_ic = (dl_capab_id_t *)(outers + 1);
1895 
1896 	if (outers->dl_length < sizeof (*id_ic) ||
1897 	    (inners = &id_ic->id_subcap,
1898 	    inners->dl_length > (outers->dl_length - sizeof (*inners)))) {
1899 		cmn_err(CE_WARN, "ill_capability_id_ack: malformed "
1900 		    "encapsulated capab type %d too long for mblk",
1901 		    inners->dl_cap);
1902 		return;
1903 	}
1904 
1905 	if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) {
1906 		ip1dbg(("ill_capability_id_ack: mid token for capab type %d "
1907 		    "isn't as expected; pass-thru module(s) detected, "
1908 		    "discarding capability\n", inners->dl_cap));
1909 		return;
1910 	}
1911 
1912 	/* Process the encapsulated sub-capability */
1913 	ill_capability_dispatch(ill, mp, inners, B_TRUE);
1914 }
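
/*
 * For reference, a sketch of the wrapped sub-capability layout that
 * ill_capability_id_ack() above unwraps:
 *
 *	dl_capability_sub_t	outers (dl_cap == DL_CAPAB_ID_WRAPPER)
 *	dl_capab_id_t		id_mid (module ID token checked against
 *				ill_lmod_rq) and id_subcap, the inner
 *				dl_capability_sub_t header
 *	...			inner sub-capability payload
 */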
1915 
1916 /*
1917  * Process Multidata Transmit capability negotiation ack received from a
1918  * DLS Provider.  isub must point to the sub-capability (DL_CAPAB_MDT) of a
1919  * DL_CAPABILITY_ACK message.
1920  */
1921 static void
1922 ill_capability_mdt_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
1923 {
1924 	mblk_t *nmp = NULL;
1925 	dl_capability_req_t *oc;
1926 	dl_capab_mdt_t *mdt_ic, *mdt_oc;
1927 	ill_mdt_capab_t **ill_mdt_capab;
1928 	uint_t sub_dl_cap = isub->dl_cap;
1929 	uint8_t *capend;
1930 
1931 	ASSERT(sub_dl_cap == DL_CAPAB_MDT);
1932 
1933 	ill_mdt_capab = (ill_mdt_capab_t **)&ill->ill_mdt_capab;
1934 
1935 	/*
1936 	 * Note: range checks here are not absolutely sufficient to
1937 	 * make us robust against malformed messages sent by drivers;
1938 	 * this is in keeping with the rest of IP's dlpi handling.
1939 	 * (Remember, it's coming from something else in the kernel
1940 	 * address space)
1941 	 */
1942 
1943 	capend = (uint8_t *)(isub + 1) + isub->dl_length;
1944 	if (capend > mp->b_wptr) {
1945 		cmn_err(CE_WARN, "ill_capability_mdt_ack: "
1946 		    "malformed sub-capability too long for mblk");
1947 		return;
1948 	}
1949 
1950 	mdt_ic = (dl_capab_mdt_t *)(isub + 1);
1951 
1952 	if (mdt_ic->mdt_version != MDT_VERSION_2) {
1953 		cmn_err(CE_CONT, "ill_capability_mdt_ack: "
1954 		    "unsupported MDT sub-capability (version %d, expected %d)",
1955 		    mdt_ic->mdt_version, MDT_VERSION_2);
1956 		return;
1957 	}
1958 
1959 	if (!dlcapabcheckqid(&mdt_ic->mdt_mid, ill->ill_lmod_rq)) {
1960 		ip1dbg(("ill_capability_mdt_ack: mid token for MDT "
1961 		    "capability isn't as expected; pass-thru module(s) "
1962 		    "detected, discarding capability\n"));
1963 		return;
1964 	}
1965 
1966 	if (mdt_ic->mdt_flags & DL_CAPAB_MDT_ENABLE) {
1967 
1968 		if (*ill_mdt_capab == NULL) {
1969 			*ill_mdt_capab = kmem_zalloc(sizeof (ill_mdt_capab_t),
1970 			    KM_NOSLEEP);
1971 
1972 			if (*ill_mdt_capab == NULL) {
1973 				cmn_err(CE_WARN, "ill_capability_mdt_ack: "
1974 				    "could not enable MDT version %d "
1975 				    "for %s (ENOMEM)\n", MDT_VERSION_2,
1976 				    ill->ill_name);
1977 				return;
1978 			}
1979 		}
1980 
1981 		ip1dbg(("ill_capability_mdt_ack: interface %s supports "
1982 		    "MDT version %d (%d bytes leading, %d bytes trailing "
1983 		    "header spaces, %d max pld bufs, %d span limit)\n",
1984 		    ill->ill_name, MDT_VERSION_2,
1985 		    mdt_ic->mdt_hdr_head, mdt_ic->mdt_hdr_tail,
1986 		    mdt_ic->mdt_max_pld, mdt_ic->mdt_span_limit));
1987 
1988 		(*ill_mdt_capab)->ill_mdt_version = MDT_VERSION_2;
1989 		(*ill_mdt_capab)->ill_mdt_on = 1;
1990 		/*
1991 		 * Round the following values to the nearest 32-bit boundary;
1992 		 * the ULP may further adjust them to accommodate additional
1993 		 * protocol headers.  We pass these values to the ULP during
1994 		 * bind time.
1995 		 */
1996 		(*ill_mdt_capab)->ill_mdt_hdr_head =
1997 		    roundup(mdt_ic->mdt_hdr_head, 4);
1998 		(*ill_mdt_capab)->ill_mdt_hdr_tail =
1999 		    roundup(mdt_ic->mdt_hdr_tail, 4);
2000 		(*ill_mdt_capab)->ill_mdt_max_pld = mdt_ic->mdt_max_pld;
2001 		(*ill_mdt_capab)->ill_mdt_span_limit = mdt_ic->mdt_span_limit;
2002 
2003 		ill->ill_capabilities |= ILL_CAPAB_MDT;
2004 	} else {
2005 		uint_t size;
2006 		uchar_t *rptr;
2007 
2008 		size = sizeof (dl_capability_req_t) +
2009 		    sizeof (dl_capability_sub_t) + sizeof (dl_capab_mdt_t);
2010 
2011 		if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
2012 			cmn_err(CE_WARN, "ill_capability_mdt_ack: "
2013 			    "could not enable MDT for %s (ENOMEM)\n",
2014 			    ill->ill_name);
2015 			return;
2016 		}
2017 
2018 		rptr = nmp->b_rptr;
2019 		/* initialize dl_capability_req_t */
2020 		oc = (dl_capability_req_t *)nmp->b_rptr;
2021 		oc->dl_sub_offset = sizeof (dl_capability_req_t);
2022 		oc->dl_sub_length = sizeof (dl_capability_sub_t) +
2023 		    sizeof (dl_capab_mdt_t);
2024 		nmp->b_rptr += sizeof (dl_capability_req_t);
2025 
2026 		/* initialize dl_capability_sub_t */
2027 		bcopy(isub, nmp->b_rptr, sizeof (*isub));
2028 		nmp->b_rptr += sizeof (*isub);
2029 
2030 		/* initialize dl_capab_mdt_t */
2031 		mdt_oc = (dl_capab_mdt_t *)nmp->b_rptr;
2032 		bcopy(mdt_ic, mdt_oc, sizeof (*mdt_ic));
2033 
2034 		nmp->b_rptr = rptr;
2035 
2036 		ip1dbg(("ill_capability_mdt_ack: asking interface %s "
2037 		    "to enable MDT version %d\n", ill->ill_name,
2038 		    MDT_VERSION_2));
2039 
2040 		/* set ENABLE flag */
2041 		mdt_oc->mdt_flags |= DL_CAPAB_MDT_ENABLE;
2042 
2043 		/* nmp points to a DL_CAPABILITY_REQ message to enable MDT */
2044 		ill_dlpi_send(ill, nmp);
2045 	}
2046 }
2047 
2048 static void
2049 ill_capability_mdt_reset(ill_t *ill, mblk_t **sc_mp)
2050 {
2051 	mblk_t *mp;
2052 	dl_capab_mdt_t *mdt_subcap;
2053 	dl_capability_sub_t *dl_subcap;
2054 	int size;
2055 
2056 	if (!ILL_MDT_CAPABLE(ill))
2057 		return;
2058 
2059 	ASSERT(ill->ill_mdt_capab != NULL);
2060 	/*
2061 	 * Clear the capability flag for MDT but retain the ill_mdt_capab
2062 	 * structure since it's possible that another thread is still
2063 	 * referring to it.  The structure only gets deallocated when
2064 	 * we destroy the ill.
2065 	 */
2066 	ill->ill_capabilities &= ~ILL_CAPAB_MDT;
2067 
2068 	size = sizeof (*dl_subcap) + sizeof (*mdt_subcap);
2069 
2070 	mp = allocb(size, BPRI_HI);
2071 	if (mp == NULL) {
2072 		ip1dbg(("ill_capability_mdt_reset: unable to allocate "
2073 		    "request to disable MDT\n"));
2074 		return;
2075 	}
2076 
2077 	mp->b_wptr = mp->b_rptr + size;
2078 
2079 	dl_subcap = (dl_capability_sub_t *)mp->b_rptr;
2080 	dl_subcap->dl_cap = DL_CAPAB_MDT;
2081 	dl_subcap->dl_length = sizeof (*mdt_subcap);
2082 
2083 	mdt_subcap = (dl_capab_mdt_t *)(dl_subcap + 1);
2084 	mdt_subcap->mdt_version = ill->ill_mdt_capab->ill_mdt_version;
2085 	mdt_subcap->mdt_flags = 0;
2086 	mdt_subcap->mdt_hdr_head = 0;
2087 	mdt_subcap->mdt_hdr_tail = 0;
2088 
2089 	if (*sc_mp != NULL)
2090 		linkb(*sc_mp, mp);
2091 	else
2092 		*sc_mp = mp;
2093 }
2094 
2095 /*
2096  * Send a DL_NOTIFY_REQ to the specified ill to enable
2097  * DL_NOTE_PROMISC_ON/OFF_PHYS notifications.
2098  * Invoked by ill_capability_ipsec_ack() before enabling IPsec hardware
2099  * acceleration.
2100  * Returns B_TRUE on success, B_FALSE if the message could not be sent.
2101  */
2102 static boolean_t
2103 ill_enable_promisc_notify(ill_t *ill)
2104 {
2105 	mblk_t *mp;
2106 	dl_notify_req_t *req;
2107 
2108 	IPSECHW_DEBUG(IPSECHW_PKT, ("ill_enable_promisc_notify:\n"));
2109 
2110 	mp = ip_dlpi_alloc(sizeof (dl_notify_req_t), DL_NOTIFY_REQ);
2111 	if (mp == NULL)
2112 		return (B_FALSE);
2113 
2114 	req = (dl_notify_req_t *)mp->b_rptr;
2115 	req->dl_notifications = DL_NOTE_PROMISC_ON_PHYS |
2116 	    DL_NOTE_PROMISC_OFF_PHYS;
2117 
2118 	ill_dlpi_send(ill, mp);
2119 
2120 	return (B_TRUE);
2121 }
2122 
2123 
2124 /*
2125  * Allocate an IPsec capability request which will be filled by our
2126  * caller to turn on support for one or more algorithms.
2127  */
2128 static mblk_t *
2129 ill_alloc_ipsec_cap_req(ill_t *ill, dl_capability_sub_t *isub)
2130 {
2131 	mblk_t *nmp;
2132 	dl_capability_req_t	*ocap;
2133 	dl_capab_ipsec_t	*ocip;
2134 	dl_capab_ipsec_t	*icip;
2135 	uint8_t			*ptr;
2136 	icip = (dl_capab_ipsec_t *)(isub + 1);
2137 
2138 	/*
2139 	 * The first time around, we send a DL_NOTIFY_REQ to enable
2140 	 * PROMISC_ON/OFF notification from the provider. We need to
2141 	 * do this before enabling the algorithms to avoid leakage of
2142 	 * cleartext packets.
2143 	 */
2144 
2145 	if (!ill_enable_promisc_notify(ill))
2146 		return (NULL);
2147 
2148 	/*
2149 	 * Allocate new mblk which will contain a new capability
2150 	 * request to enable the capabilities.
2151 	 */
2152 
2153 	nmp = ip_dlpi_alloc(sizeof (dl_capability_req_t) +
2154 	    sizeof (dl_capability_sub_t) + isub->dl_length, DL_CAPABILITY_REQ);
2155 	if (nmp == NULL)
2156 		return (NULL);
2157 
2158 	ptr = nmp->b_rptr;
2159 
2160 	/* initialize dl_capability_req_t */
2161 	ocap = (dl_capability_req_t *)ptr;
2162 	ocap->dl_sub_offset = sizeof (dl_capability_req_t);
2163 	ocap->dl_sub_length = sizeof (dl_capability_sub_t) + isub->dl_length;
2164 	ptr += sizeof (dl_capability_req_t);
2165 
2166 	/* initialize dl_capability_sub_t */
2167 	bcopy(isub, ptr, sizeof (*isub));
2168 	ptr += sizeof (*isub);
2169 
2170 	/* initialize dl_capab_ipsec_t */
2171 	ocip = (dl_capab_ipsec_t *)ptr;
2172 	bcopy(icip, ocip, sizeof (*icip));
2173 
2174 	nmp->b_wptr = (uchar_t *)(&ocip->cip_data[0]);
2175 	return (nmp);
2176 }
2177 
2178 /*
2179  * Process an IPsec capability negotiation ack received from a DLS Provider.
2180  * isub must point to the sub-capability (DL_CAPAB_IPSEC_AH or
2181  * DL_CAPAB_IPSEC_ESP) of a DL_CAPABILITY_ACK message.
2182  */
2183 static void
2184 ill_capability_ipsec_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
2185 {
2186 	dl_capab_ipsec_t	*icip;
2187 	dl_capab_ipsec_alg_t	*ialg;	/* ptr to input alg spec. */
2188 	dl_capab_ipsec_alg_t	*oalg;	/* ptr to output alg spec. */
2189 	uint_t cipher, nciphers;
2190 	mblk_t *nmp;
2191 	uint_t alg_len;
2192 	boolean_t need_sadb_dump;
2193 	uint_t sub_dl_cap = isub->dl_cap;
2194 	ill_ipsec_capab_t **ill_capab;
2195 	uint64_t ill_capab_flag;
2196 	uint8_t *capend, *ciphend;
2197 	boolean_t sadb_resync;
2198 
2199 	ASSERT(sub_dl_cap == DL_CAPAB_IPSEC_AH ||
2200 	    sub_dl_cap == DL_CAPAB_IPSEC_ESP);
2201 
2202 	if (sub_dl_cap == DL_CAPAB_IPSEC_AH) {
2203 		ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_ah;
2204 		ill_capab_flag = ILL_CAPAB_AH;
2205 	} else {
2206 		ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_esp;
2207 		ill_capab_flag = ILL_CAPAB_ESP;
2208 	}
2209 
2210 	/*
2211 	 * If the ill capability structure exists, then this incoming
2212 	 * DL_CAPABILITY_ACK is a response to a "renegotiation" cycle.
2213 	 * If this is so, then we'd need to resynchronize the SADB
2214 	 * after re-enabling the offloaded ciphers.
2215 	 */
2216 	sadb_resync = (*ill_capab != NULL);
2217 
2218 	/*
2219 	 * Note: range checks here are not absolutely sufficient to
2220 	 * make us robust against malformed messages sent by drivers;
2221 	 * this is in keeping with the rest of IP's dlpi handling.
2222 	 * (Remember, it's coming from something else in the kernel
2223 	 * address space)
2224 	 */
2225 
2226 	capend = (uint8_t *)(isub + 1) + isub->dl_length;
2227 	if (capend > mp->b_wptr) {
2228 		cmn_err(CE_WARN, "ill_capability_ipsec_ack: "
2229 		    "malformed sub-capability too long for mblk");
2230 		return;
2231 	}
2232 
2233 	/*
2234 	 * There are two types of acks we process here:
2235 	 * 1. acks in reply to a (first form) generic capability req
2236 	 *    (no ENABLE flag set)
2237 	 * 2. acks in reply to a ENABLE capability req.
2238 	 * 2. acks in reply to an ENABLE capability req.
2239 	 *
2240 	 * We process the subcapability passed as argument as follows:
2241 	 * 1 do initializations
2242 	 *   1.1 initialize nmp = NULL
2243 	 *   1.2 set need_sadb_dump to B_FALSE
2244 	 * 2 for each cipher in subcapability:
2245 	 *   2.1 if ENABLE flag is set:
2246 	 *	2.1.1 update per-ill ipsec capabilities info
2247 	 *	2.1.2 set need_sadb_dump to B_TRUE
2248 	 *   2.2 if ENABLE flag is not set:
2249 	 *	2.2.1 if nmp is NULL:
2250 	 *		2.2.1.1 allocate and initialize nmp
2251 	 *		2.2.1.2 init current pos in nmp
2252 	 *	2.2.2 copy current cipher to current pos in nmp
2253 	 *	2.2.3 set ENABLE flag in nmp
2254 	 *	2.2.4 update current pos
2255 	 * 3 if nmp is not equal to NULL, send enable request
2256 	 *   3.1 send capability request
2257 	 * 4 if need_sadb_dump is B_TRUE
2258 	 *   4.1 enable promiscuous on/off notifications
2259 	 *   4.2 call ill_dlpi_send(isub->dlcap) to send all
2260 	 *	AH or ESP SA's to interface.
2261 	 */
2262 
2263 	nmp = NULL;
2264 	oalg = NULL;
2265 	need_sadb_dump = B_FALSE;
2266 	icip = (dl_capab_ipsec_t *)(isub + 1);
2267 	ialg = (dl_capab_ipsec_alg_t *)(&icip->cip_data[0]);
2268 
2269 	nciphers = icip->cip_nciphers;
2270 	ciphend = (uint8_t *)(ialg + icip->cip_nciphers);
2271 
2272 	if (ciphend > capend) {
2273 		cmn_err(CE_WARN, "ill_capability_ipsec_ack: "
2274 		    "too many ciphers for sub-capability len");
2275 		return;
2276 	}
2277 
2278 	for (cipher = 0; cipher < nciphers; cipher++) {
2279 		alg_len = sizeof (dl_capab_ipsec_alg_t);
2280 
2281 		if (ialg->alg_flag & DL_CAPAB_ALG_ENABLE) {
2282 			/*
2283 			 * TBD: when we provide a way to disable capabilities
2284 			 * from above, we need to manage the request-pending
2285 			 * state and fail if we were not expecting this ACK.
2286 			 */
2287 			IPSECHW_DEBUG(IPSECHW_CAPAB,
2288 			    ("ill_capability_ipsec_ack: got ENABLE ACK\n"));
2289 
2290 			/*
2291 			 * Update IPsec capabilities for this ill
2292 			 */
2293 
2294 			if (*ill_capab == NULL) {
2295 				IPSECHW_DEBUG(IPSECHW_CAPAB,
2296 				    ("ill_capability_ipsec_ack: "
2297 					"allocating ipsec_capab for ill\n"));
2298 				*ill_capab = ill_ipsec_capab_alloc();
2299 
2300 				if (*ill_capab == NULL) {
2301 					cmn_err(CE_WARN,
2302 					    "ill_capability_ipsec_ack: "
2303 					    "could not enable IPsec Hardware "
2304 					    "acceleration for %s (ENOMEM)\n",
2305 					    ill->ill_name);
2306 					return;
2307 				}
2308 			}
2309 
2310 			ASSERT(ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH ||
2311 			    ialg->alg_type == DL_CAPAB_IPSEC_ALG_ENCR);
2312 
2313 			if (ialg->alg_prim >= MAX_IPSEC_ALGS) {
2314 				cmn_err(CE_WARN,
2315 				    "ill_capability_ipsec_ack: "
2316 				    "malformed IPsec algorithm id %d",
2317 				    ialg->alg_prim);
2318 				continue;
2319 			}
2320 
2321 			if (ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH) {
2322 				IPSEC_ALG_ENABLE((*ill_capab)->auth_hw_algs,
2323 				    ialg->alg_prim);
2324 			} else {
2325 				ipsec_capab_algparm_t *alp;
2326 
2327 				IPSEC_ALG_ENABLE((*ill_capab)->encr_hw_algs,
2328 				    ialg->alg_prim);
2329 				if (!ill_ipsec_capab_resize_algparm(*ill_capab,
2330 				    ialg->alg_prim)) {
2331 					cmn_err(CE_WARN,
2332 					    "ill_capability_ipsec_ack: "
2333 					    "no space for IPsec alg id %d",
2334 					    ialg->alg_prim);
2335 					continue;
2336 				}
2337 				alp = &((*ill_capab)->encr_algparm[
2338 						ialg->alg_prim]);
2339 				alp->minkeylen = ialg->alg_minbits;
2340 				alp->maxkeylen = ialg->alg_maxbits;
2341 			}
2342 			ill->ill_capabilities |= ill_capab_flag;
2343 			/*
2344 			 * indicate that a capability was enabled, which
2345 			 * will be used below to kick off a SADB dump
2346 			 * to the ill.
2347 			 */
2348 			need_sadb_dump = B_TRUE;
2349 		} else {
2350 			IPSECHW_DEBUG(IPSECHW_CAPAB,
2351 			    ("ill_capability_ipsec_ack: enabling alg 0x%x\n",
2352 				ialg->alg_prim));
2353 
2354 			if (nmp == NULL) {
2355 				nmp = ill_alloc_ipsec_cap_req(ill, isub);
2356 				if (nmp == NULL) {
2357 					/*
2358 					 * Sending the PROMISC_ON/OFF
2359 					 * notification request failed.
2360 					 * We cannot enable the algorithms
2361 					 * since the Provider will not
					 * notify IP of promiscuous mode
2363 					 * changes, which could lead
2364 					 * to leakage of packets.
2365 					 */
2366 					cmn_err(CE_WARN,
2367 					    "ill_capability_ipsec_ack: "
2368 					    "could not enable IPsec Hardware "
2369 					    "acceleration for %s (ENOMEM)\n",
2370 					    ill->ill_name);
2371 					return;
2372 				}
2373 				/* ptr to current output alg specifier */
2374 				oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr;
2375 			}
2376 
2377 			/*
2378 			 * Copy current alg specifier, set ENABLE
2379 			 * flag, and advance to next output alg.
2380 			 * For now we enable all IPsec capabilities.
2381 			 */
2382 			ASSERT(oalg != NULL);
2383 			bcopy(ialg, oalg, alg_len);
2384 			oalg->alg_flag |= DL_CAPAB_ALG_ENABLE;
2385 			nmp->b_wptr += alg_len;
2386 			oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr;
2387 		}
2388 
2389 		/* move to next input algorithm specifier */
2390 		ialg = (dl_capab_ipsec_alg_t *)
2391 		    ((char *)ialg + alg_len);
2392 	}
2393 
2394 	if (nmp != NULL)
2395 		/*
2396 		 * nmp points to a DL_CAPABILITY_REQ message to enable
2397 		 * IPsec hardware acceleration.
2398 		 */
2399 		ill_dlpi_send(ill, nmp);
2400 
2401 	if (need_sadb_dump)
2402 		/*
2403 		 * An acknowledgement corresponding to a request to
2404 		 * enable acceleration was received, notify SADB.
2405 		 */
2406 		ill_ipsec_capab_add(ill, sub_dl_cap, sadb_resync);
2407 }
2408 
2409 /*
2410  * Given an mblk with enough space in it, create sub-capability entries for
2411  * DL_CAPAB_IPSEC_{AH,ESP} types which consist of previously-advertised
2412  * offloaded ciphers (both AUTH and ENCR) with their enable flags cleared,
2413  * in preparation for the reset DL_CAPABILITY_REQ message.
2414  */
2415 static void
2416 ill_fill_ipsec_reset(uint_t nciphers, int stype, uint_t slen,
2417     ill_ipsec_capab_t *ill_cap, mblk_t *mp)
2418 {
2419 	dl_capab_ipsec_t *oipsec;
2420 	dl_capab_ipsec_alg_t *oalg;
2421 	dl_capability_sub_t *dl_subcap;
2422 	int i, k;
2423 
2424 	ASSERT(nciphers > 0);
2425 	ASSERT(ill_cap != NULL);
2426 	ASSERT(mp != NULL);
2427 	ASSERT(MBLKTAIL(mp) >= sizeof (*dl_subcap) + sizeof (*oipsec) + slen);
2428 
2429 	/* dl_capability_sub_t for "stype" */
2430 	dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
2431 	dl_subcap->dl_cap = stype;
2432 	dl_subcap->dl_length = sizeof (dl_capab_ipsec_t) + slen;
2433 	mp->b_wptr += sizeof (dl_capability_sub_t);
2434 
2435 	/* dl_capab_ipsec_t for "stype" */
2436 	oipsec = (dl_capab_ipsec_t *)mp->b_wptr;
2437 	oipsec->cip_version = 1;
2438 	oipsec->cip_nciphers = nciphers;
2439 	mp->b_wptr = (uchar_t *)&oipsec->cip_data[0];
2440 
2441 	/* create entries for "stype" AUTH ciphers */
2442 	for (i = 0; i < ill_cap->algs_size; i++) {
2443 		for (k = 0; k < BITSPERBYTE; k++) {
2444 			if ((ill_cap->auth_hw_algs[i] & (1 << k)) == 0)
2445 				continue;
2446 
2447 			oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr;
2448 			bzero((void *)oalg, sizeof (*oalg));
2449 			oalg->alg_type = DL_CAPAB_IPSEC_ALG_AUTH;
2450 			oalg->alg_prim = k + (BITSPERBYTE * i);
2451 			mp->b_wptr += sizeof (dl_capab_ipsec_alg_t);
2452 		}
2453 	}
2454 	/* create entries for "stype" ENCR ciphers */
2455 	for (i = 0; i < ill_cap->algs_size; i++) {
2456 		for (k = 0; k < BITSPERBYTE; k++) {
2457 			if ((ill_cap->encr_hw_algs[i] & (1 << k)) == 0)
2458 				continue;
2459 
2460 			oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr;
2461 			bzero((void *)oalg, sizeof (*oalg));
2462 			oalg->alg_type = DL_CAPAB_IPSEC_ALG_ENCR;
2463 			oalg->alg_prim = k + (BITSPERBYTE * i);
2464 			mp->b_wptr += sizeof (dl_capab_ipsec_alg_t);
2465 		}
2466 	}
2467 }
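
/*
 * A worked example of the bitmask walk above: if auth_hw_algs[1] is
 * 0x05 (bits 0 and 2 set), the inner loop emits two cleared AUTH
 * entries with alg_prim = 0 + 8 * 1 = 8 and alg_prim = 2 + 8 * 1 = 10;
 * that is, bit position plus byte index times BITSPERBYTE recovers
 * the algorithm identifier recorded earlier by IPSEC_ALG_ENABLE().
 */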
2468 
2469 /*
2470  * Macro to count number of 1s in a byte (8-bit word).  The total count is
2471  * accumulated into the passed-in argument (sum).  We could use SPARCv9's
2472  * POPC instruction, but our macro is more flexible for an arbitrary length
2473  * of bytes, such as {auth,encr}_hw_algs.  These variables are currently
2474  * 256-bits long (MAX_IPSEC_ALGS), so if we know for sure that the length
2475  * stays that way, we can reduce the number of iterations required.
2476  */
2477 #define	COUNT_1S(val, sum) {					\
2478 	uint8_t x = val & 0xff;					\
2479 	x = (x & 0x55) + ((x >> 1) & 0x55);			\
2480 	x = (x & 0x33) + ((x >> 2) & 0x33);			\
2481 	sum += (x & 0xf) + ((x >> 4) & 0xf);			\
2482 }
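
/*
 * A worked example: COUNT_1S(0xb7, sum), where 0xb7 = 10110111 has
 * six 1-bits:
 *
 *	x = (x & 0x55) + ((x >> 1) & 0x55);	x == 0x66 (pair sums 1,2,1,2)
 *	x = (x & 0x33) + ((x >> 2) & 0x33);	x == 0x33 (nibble sums 3,3)
 *	sum += (x & 0xf) + ((x >> 4) & 0xf);	sum += 3 + 3 == 6
 */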
2483 
2484 /* ARGSUSED */
2485 static void
2486 ill_capability_ipsec_reset(ill_t *ill, mblk_t **sc_mp)
2487 {
2488 	mblk_t *mp;
2489 	ill_ipsec_capab_t *cap_ah = ill->ill_ipsec_capab_ah;
2490 	ill_ipsec_capab_t *cap_esp = ill->ill_ipsec_capab_esp;
2491 	uint64_t ill_capabilities = ill->ill_capabilities;
2492 	int ah_cnt = 0, esp_cnt = 0;
2493 	int ah_len = 0, esp_len = 0;
2494 	int i, size = 0;
2495 
2496 	if (!(ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP)))
2497 		return;
2498 
2499 	ASSERT(cap_ah != NULL || !(ill_capabilities & ILL_CAPAB_AH));
2500 	ASSERT(cap_esp != NULL || !(ill_capabilities & ILL_CAPAB_ESP));
2501 
2502 	/* Find out the number of ciphers for AH */
2503 	if (cap_ah != NULL) {
2504 		for (i = 0; i < cap_ah->algs_size; i++) {
2505 			COUNT_1S(cap_ah->auth_hw_algs[i], ah_cnt);
2506 			COUNT_1S(cap_ah->encr_hw_algs[i], ah_cnt);
2507 		}
2508 		if (ah_cnt > 0) {
2509 			size += sizeof (dl_capability_sub_t) +
2510 			    sizeof (dl_capab_ipsec_t);
2511 			/* dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */
2512 			ah_len = (ah_cnt - 1) * sizeof (dl_capab_ipsec_alg_t);
2513 			size += ah_len;
2514 		}
2515 	}
2516 
2517 	/* Find out the number of ciphers for ESP */
2518 	if (cap_esp != NULL) {
2519 		for (i = 0; i < cap_esp->algs_size; i++) {
2520 			COUNT_1S(cap_esp->auth_hw_algs[i], esp_cnt);
2521 			COUNT_1S(cap_esp->encr_hw_algs[i], esp_cnt);
2522 		}
2523 		if (esp_cnt > 0) {
2524 			size += sizeof (dl_capability_sub_t) +
2525 			    sizeof (dl_capab_ipsec_t);
2526 			/* dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */
2527 			esp_len = (esp_cnt - 1) * sizeof (dl_capab_ipsec_alg_t);
2528 			size += esp_len;
2529 		}
2530 	}
2531 
2532 	if (size == 0) {
2533 		ip1dbg(("ill_capability_ipsec_reset: capabilities exist but "
2534 		    "there's nothing to reset\n"));
2535 		return;
2536 	}
2537 
2538 	mp = allocb(size, BPRI_HI);
2539 	if (mp == NULL) {
2540 		ip1dbg(("ill_capability_ipsec_reset: unable to allocate "
2541 		    "request to disable IPSEC Hardware Acceleration\n"));
2542 		return;
2543 	}
2544 
2545 	/*
2546 	 * Clear the capability flags for IPSec HA but retain the ill
2547 	 * capability structures since it's possible that another thread
2548 	 * is still referring to them.  The structures only get deallocated
2549 	 * when we destroy the ill.
2550 	 *
2551 	 * Various places check the flags to see if the ill is capable of
2552 	 * hardware acceleration, and by clearing them we ensure that new
2553 	 * outbound IPSec packets are sent down encrypted.
2554 	 */
2555 	ill->ill_capabilities &= ~(ILL_CAPAB_AH | ILL_CAPAB_ESP);
2556 
2557 	/* Fill in DL_CAPAB_IPSEC_AH sub-capability entries */
2558 	if (ah_cnt > 0) {
2559 		ill_fill_ipsec_reset(ah_cnt, DL_CAPAB_IPSEC_AH, ah_len,
2560 		    cap_ah, mp);
2561 		ASSERT(mp->b_rptr + size >= mp->b_wptr);
2562 	}
2563 
2564 	/* Fill in DL_CAPAB_IPSEC_ESP sub-capability entries */
2565 	if (esp_cnt > 0) {
2566 		ill_fill_ipsec_reset(esp_cnt, DL_CAPAB_IPSEC_ESP, esp_len,
2567 		    cap_esp, mp);
2568 		ASSERT(mp->b_rptr + size >= mp->b_wptr);
2569 	}
2570 
2571 	/*
2572 	 * At this point we've composed a bunch of sub-capabilities to be
2573 	 * encapsulated in a DL_CAPABILITY_REQ and later sent downstream
2574 	 * by the caller.  Upon receiving this reset message, the driver
2575 	 * must stop inbound decryption (by destroying all inbound SAs)
2576 	 * and let the corresponding packets come in encrypted.
2577 	 */
2578 
2579 	if (*sc_mp != NULL)
2580 		linkb(*sc_mp, mp);
2581 	else
2582 		*sc_mp = mp;
2583 }
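
/*
 * A sizing example for the reset message built above: with, say,
 * three AH ciphers and two ESP ciphers advertised, the allocation is
 *
 *	size = sizeof (dl_capability_sub_t) + sizeof (dl_capab_ipsec_t)
 *	    + 2 * sizeof (dl_capab_ipsec_alg_t)		(AH)
 *	    + sizeof (dl_capability_sub_t) + sizeof (dl_capab_ipsec_t)
 *	    + 1 * sizeof (dl_capab_ipsec_alg_t)		(ESP)
 *
 * since, as noted above, dl_capab_ipsec_t already accounts for the
 * first dl_capab_ipsec_alg_t.
 */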
2584 
2585 static void
2586 ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp,
2587     boolean_t encapsulated)
2588 {
2589 	boolean_t legacy = B_FALSE;
2590 
2591 	/*
2592 	 * If this DL_CAPABILITY_ACK came in as a response to our "reset"
2593 	 * DL_CAPABILITY_REQ, ignore it during this cycle.  We've just
2594 	 * instructed the driver to disable its advertised capabilities,
2595 	 * so there's no point in accepting any response at this moment.
2596 	 */
2597 	if (ill->ill_capab_state == IDMS_UNKNOWN)
2598 		return;
2599 
2600 	/*
2601 	 * Note that only the following two sub-capabilities may be
2602 	 * considered as "legacy", since their original definitions
2603 	 * do not incorporate the dl_mid_t module ID token, and hence
2604 	 * may require the use of the wrapper sub-capability.
2605 	 */
2606 	switch (subp->dl_cap) {
2607 	case DL_CAPAB_IPSEC_AH:
2608 	case DL_CAPAB_IPSEC_ESP:
2609 		legacy = B_TRUE;
2610 		break;
2611 	}
2612 
2613 	/*
2614 	 * For legacy sub-capabilities which don't incorporate a queue_t
2615 	 * pointer in their structures, discard them if we detect that
2616 	 * there are intermediate modules in between IP and the driver.
2617 	 */
2618 	if (!encapsulated && legacy && ill->ill_lmod_cnt > 1) {
2619 		ip1dbg(("ill_capability_dispatch: unencapsulated capab type "
2620 		    "%d discarded; %d module(s) present below IP\n",
2621 		    subp->dl_cap, ill->ill_lmod_cnt));
2622 		return;
2623 	}
2624 
2625 	switch (subp->dl_cap) {
2626 	case DL_CAPAB_IPSEC_AH:
2627 	case DL_CAPAB_IPSEC_ESP:
2628 		ill_capability_ipsec_ack(ill, mp, subp);
2629 		break;
2630 	case DL_CAPAB_MDT:
2631 		ill_capability_mdt_ack(ill, mp, subp);
2632 		break;
2633 	case DL_CAPAB_HCKSUM:
2634 		ill_capability_hcksum_ack(ill, mp, subp);
2635 		break;
2636 	case DL_CAPAB_ZEROCOPY:
2637 		ill_capability_zerocopy_ack(ill, mp, subp);
2638 		break;
2639 	case DL_CAPAB_POLL:
2640 		if (!SOFT_RINGS_ENABLED())
2641 			ill_capability_dls_ack(ill, mp, subp);
2642 		break;
2643 	case DL_CAPAB_SOFT_RING:
2644 		if (SOFT_RINGS_ENABLED())
2645 			ill_capability_dls_ack(ill, mp, subp);
2646 		break;
2647 	default:
2648 		ip1dbg(("ill_capability_dispatch: unknown capab type %d\n",
2649 		    subp->dl_cap));
2650 	}
2651 }
2652 
2653 /*
2654  * As part of negotiating polling capability, the driver tells us
2655  * the default (or normal) blanking interval and packet threshold
2656  * (the receive timer fires if blanking interval is reached or
2657  * the packet threshold is reached).
2658  *
2659  * As part of manipulating the polling interval, we always use our
2660  * estimated interval (avg service time * number of packets queued
2661  * on the squeue) but we try to blank for a minimum of
2662  * rr_normal_blank_time * rr_max_blank_ratio. We disable the
2663  * packet threshold during this time. When we are not in polling mode
2664  * we set the blank interval typically lower, rr_normal_blank_time *
2665  * rr_min_blank_ratio, but raise the packet count by a ratio of
2666  * rr_min_pkt_cnt_ratio so that we still get chains when
2667  * possible, although for a shorter interval.
2668  */
2669 #define	RR_MAX_BLANK_RATIO	20
2670 #define	RR_MIN_BLANK_RATIO	10
2671 #define	RR_MAX_PKT_CNT_RATIO	3
2672 #define	RR_MIN_PKT_CNT_RATIO	3
2673 
2674 /*
2675  * These can be tuned via /etc/system.
2676  */
2677 int rr_max_blank_ratio = RR_MAX_BLANK_RATIO;
2678 int rr_min_blank_ratio = RR_MIN_BLANK_RATIO;
2679 int rr_max_pkt_cnt_ratio = RR_MAX_PKT_CNT_RATIO;
2680 int rr_min_pkt_cnt_ratio = RR_MIN_PKT_CNT_RATIO;
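
/*
 * A worked example with the defaults above: for a driver that
 * advertises a normal blanking interval of 128 and a normal packet
 * count of 8 (hypothetical figures), ill_ring_add() below computes
 *
 *	rr_max_blank_time = 128 * 20 = 2560
 *	rr_min_blank_time = 128 * 10 = 1280
 *	rr_max_pkt_cnt	  =   8 *  3 =   24
 *	rr_min_pkt_cnt	  =   8 *  3 =   24
 */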
2681 
2682 static mac_resource_handle_t
2683 ill_ring_add(void *arg, mac_resource_t *mrp)
2684 {
2685 	ill_t			*ill = (ill_t *)arg;
2686 	mac_rx_fifo_t		*mrfp = (mac_rx_fifo_t *)mrp;
2687 	ill_rx_ring_t		*rx_ring;
2688 	int			ip_rx_index;
2689 
2690 	ASSERT(mrp != NULL);
2691 	if (mrp->mr_type != MAC_RX_FIFO) {
2692 		return (NULL);
2693 	}
2694 	ASSERT(ill != NULL);
2695 	ASSERT(ill->ill_dls_capab != NULL);
2696 
2697 	mutex_enter(&ill->ill_lock);
2698 	for (ip_rx_index = 0; ip_rx_index < ILL_MAX_RINGS; ip_rx_index++) {
2699 		rx_ring = &ill->ill_dls_capab->ill_ring_tbl[ip_rx_index];
2700 		ASSERT(rx_ring != NULL);
2701 
2702 		if (rx_ring->rr_ring_state == ILL_RING_FREE) {
2703 			time_t normal_blank_time =
2704 			    mrfp->mrf_normal_blank_time;
2705 			uint_t normal_pkt_cnt =
2706 			    mrfp->mrf_normal_pkt_count;
2707 
2708 			bzero(rx_ring, sizeof (ill_rx_ring_t));
2709 
2710 			rx_ring->rr_blank = mrfp->mrf_blank;
2711 			rx_ring->rr_handle = mrfp->mrf_arg;
2712 			rx_ring->rr_ill = ill;
2713 			rx_ring->rr_normal_blank_time = normal_blank_time;
2714 			rx_ring->rr_normal_pkt_cnt = normal_pkt_cnt;
2715 
2716 			rx_ring->rr_max_blank_time =
2717 			    normal_blank_time * rr_max_blank_ratio;
2718 			rx_ring->rr_min_blank_time =
2719 			    normal_blank_time * rr_min_blank_ratio;
2720 			rx_ring->rr_max_pkt_cnt =
2721 			    normal_pkt_cnt * rr_max_pkt_cnt_ratio;
2722 			rx_ring->rr_min_pkt_cnt =
2723 			    normal_pkt_cnt * rr_min_pkt_cnt_ratio;
2724 
2725 			rx_ring->rr_ring_state = ILL_RING_INUSE;
2726 			mutex_exit(&ill->ill_lock);
2727 
2728 			DTRACE_PROBE2(ill__ring__add, (void *), ill,
2729 			    (int), ip_rx_index);
2730 			return ((mac_resource_handle_t)rx_ring);
2731 		}
2732 	}
2733 
2734 	/*
2735 	 * We ran out of ILL_MAX_RINGS worth of rx_ring structures.  If
2736 	 * we have devices which can overwhelm this limit, ILL_MAX_RINGS
2737 	 * should be made configurable.  Meanwhile this causes no panic:
2738 	 * the driver will pass ip_input a NULL handle, which makes
2739 	 * IP allocate the default squeue, and polling mode will not
2740 	 * be used for this ring.
2741 	 */
2742 	cmn_err(CE_NOTE, "Reached maximum number of receiving rings (%d) "
2743 	    "for %s\n", ILL_MAX_RINGS, ill->ill_name);
2744 
2745 	mutex_exit(&ill->ill_lock);
2746 	return (NULL);
2747 }
2748 
2749 static boolean_t
2750 ill_capability_dls_init(ill_t *ill)
2751 {
2752 	ill_dls_capab_t	*ill_dls = ill->ill_dls_capab;
2753 	conn_t 			*connp;
2754 	size_t			sz;
2755 
2756 	if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
2757 		if (ill_dls == NULL) {
2758 			cmn_err(CE_PANIC, "ill_capability_dls_init: "
2759 			    "soft_ring enabled for ill=%s (%p) but data "
2760 			    "structs uninitialized\n", ill->ill_name,
2761 			    (void *)ill);
2762 		}
2763 		return (B_TRUE);
2764 	} else if (ill->ill_capabilities & ILL_CAPAB_POLL) {
2765 		if (ill_dls == NULL) {
2766 			cmn_err(CE_PANIC, "ill_capability_dls_init: "
2767 			    "polling enabled for ill=%s (%p) but data "
2768 			    "structs uninitialized\n", ill->ill_name,
2769 			(void *)ill);
2770 		}
2771 		return (B_TRUE);
2772 	}
2773 
2774 	if (ill_dls != NULL) {
2775 		ill_rx_ring_t 	*rx_ring = ill_dls->ill_ring_tbl;
2776 		/* Soft_Ring or polling is being re-enabled */
2777 
2778 		connp = ill_dls->ill_unbind_conn;
2779 		ASSERT(rx_ring != NULL);
2780 		bzero((void *)ill_dls, sizeof (ill_dls_capab_t));
2781 		bzero((void *)rx_ring,
2782 		    sizeof (ill_rx_ring_t) * ILL_MAX_RINGS);
2783 		ill_dls->ill_ring_tbl = rx_ring;
2784 		ill_dls->ill_unbind_conn = connp;
2785 		return (B_TRUE);
2786 	}
2787 
2788 	if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP)) == NULL)
2789 		return (B_FALSE);
2790 
2791 	sz = sizeof (ill_dls_capab_t);
2792 	sz += sizeof (ill_rx_ring_t) * ILL_MAX_RINGS;
2793 
2794 	ill_dls = kmem_zalloc(sz, KM_NOSLEEP);
2795 	if (ill_dls == NULL) {
2796 		cmn_err(CE_WARN, "ill_capability_dls_init: could not "
2797 		    "allocate dls_capab for %s (%p)\n", ill->ill_name,
2798 		    (void *)ill);
2799 		CONN_DEC_REF(connp);
2800 		return (B_FALSE);
2801 	}
2802 
2803 	/* Allocate space to hold ring table */
2804 	ill_dls->ill_ring_tbl = (ill_rx_ring_t *)&ill_dls[1];
2805 	ill->ill_dls_capab = ill_dls;
2806 	ill_dls->ill_unbind_conn = connp;
2807 	return (B_TRUE);
2808 }
2809 
2810 /*
2811  * ill_capability_dls_disable: disable soft_ring and/or polling
2812  * capability. Since any of the rings might already be in use, we need
2813  * to call ipsq_clean_all(), which gets behind the squeue to disable
2814  * direct calls if necessary.
2815  */
2816 static void
2817 ill_capability_dls_disable(ill_t *ill)
2818 {
2819 	ill_dls_capab_t	*ill_dls = ill->ill_dls_capab;
2820 
2821 	if (ill->ill_capabilities & ILL_CAPAB_DLS) {
2822 		ipsq_clean_all(ill);
2823 		ill_dls->ill_tx = NULL;
2824 		ill_dls->ill_tx_handle = NULL;
2825 		ill_dls->ill_dls_change_status = NULL;
2826 		ill_dls->ill_dls_bind = NULL;
2827 		ill_dls->ill_dls_unbind = NULL;
2828 	}
2829 
2830 	ASSERT(!(ill->ill_capabilities & ILL_CAPAB_DLS));
2831 }
2832 
2833 static void
2834 ill_capability_dls_capable(ill_t *ill, dl_capab_dls_t *idls,
2835     dl_capability_sub_t *isub)
2836 {
2837 	uint_t			size;
2838 	uchar_t			*rptr;
2839 	dl_capab_dls_t	dls, *odls;
2840 	ill_dls_capab_t	*ill_dls;
2841 	mblk_t			*nmp = NULL;
2842 	dl_capability_req_t	*ocap;
2843 	uint_t			sub_dl_cap = isub->dl_cap;
2844 
2845 	if (!ill_capability_dls_init(ill))
2846 		return;
2847 	ill_dls = ill->ill_dls_capab;
2848 
2849 	/* Copy locally to get the members aligned */
2850 	bcopy((void *)idls, (void *)&dls,
2851 	    sizeof (dl_capab_dls_t));
2852 
2853 	/* Get the tx function and handle from dld */
2854 	ill_dls->ill_tx = (ip_dld_tx_t)dls.dls_tx;
2855 	ill_dls->ill_tx_handle = (void *)dls.dls_tx_handle;
2856 
2857 	if (sub_dl_cap == DL_CAPAB_SOFT_RING) {
2858 		ill_dls->ill_dls_change_status =
2859 		    (ip_dls_chg_soft_ring_t)dls.dls_ring_change_status;
2860 		ill_dls->ill_dls_bind = (ip_dls_bind_t)dls.dls_ring_bind;
2861 		ill_dls->ill_dls_unbind =
2862 		    (ip_dls_unbind_t)dls.dls_ring_unbind;
2863 		ill_dls->ill_dls_soft_ring_cnt = ip_soft_rings_cnt;
2864 	}
2865 
2866 	size = sizeof (dl_capability_req_t) + sizeof (dl_capability_sub_t) +
2867 	    isub->dl_length;
2868 
2869 	if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
2870 		cmn_err(CE_WARN, "ill_capability_dls_capable: could "
2871 		    "not allocate memory for CAPAB_REQ for %s (%p)\n",
2872 		    ill->ill_name, (void *)ill);
2873 		return;
2874 	}
2875 
2876 	/* initialize dl_capability_req_t */
2877 	rptr = nmp->b_rptr;
2878 	ocap = (dl_capability_req_t *)rptr;
2879 	ocap->dl_sub_offset = sizeof (dl_capability_req_t);
2880 	ocap->dl_sub_length = sizeof (dl_capability_sub_t) + isub->dl_length;
2881 	rptr += sizeof (dl_capability_req_t);
2882 
2883 	/* initialize dl_capability_sub_t */
2884 	bcopy(isub, rptr, sizeof (*isub));
2885 	rptr += sizeof (*isub);
2886 
2887 	odls = (dl_capab_dls_t *)rptr;
2888 	rptr += sizeof (dl_capab_dls_t);
2889 
2890 	/* initialize dl_capab_dls_t to be sent down */
2891 	dls.dls_rx_handle = (uintptr_t)ill;
2892 	dls.dls_rx = (uintptr_t)ip_input;
2893 	dls.dls_ring_add = (uintptr_t)ill_ring_add;
2894 
2895 	if (sub_dl_cap == DL_CAPAB_SOFT_RING) {
2896 		dls.dls_ring_cnt = ip_soft_rings_cnt;
2897 		dls.dls_ring_assign = (uintptr_t)ip_soft_ring_assignment;
2898 		dls.dls_flags = SOFT_RING_ENABLE;
2899 	} else {
2900 		dls.dls_flags = POLL_ENABLE;
2901 		ip1dbg(("ill_capability_dls_capable: asking interface %s "
2902 		    "to enable polling\n", ill->ill_name));
2903 	}
2904 	bcopy((void *)&dls, (void *)odls,
2905 	    sizeof (dl_capab_dls_t));
2906 	ASSERT(nmp->b_wptr == (nmp->b_rptr + size));
2907 	/*
2908 	 * nmp points to a DL_CAPABILITY_REQ message to
2909 	 * enable either soft_ring or polling
2910 	 */
2911 	ill_dlpi_send(ill, nmp);
2912 }
2913 
2914 static void
2915 ill_capability_dls_reset(ill_t *ill, mblk_t **sc_mp)
2916 {
2917 	mblk_t *mp;
2918 	dl_capab_dls_t *idls;
2919 	dl_capability_sub_t *dl_subcap;
2920 	int size;
2921 
2922 	if (!(ill->ill_capabilities & ILL_CAPAB_DLS))
2923 		return;
2924 
2925 	ASSERT(ill->ill_dls_capab != NULL);
2926 
2927 	size = sizeof (*dl_subcap) + sizeof (*idls);
2928 
2929 	mp = allocb(size, BPRI_HI);
2930 	if (mp == NULL) {
2931 		ip1dbg(("ill_capability_dls_reset: unable to allocate "
2932 		    "request to disable soft_ring\n"));
2933 		return;
2934 	}
2935 
2936 	mp->b_wptr = mp->b_rptr + size;
2937 
2938 	dl_subcap = (dl_capability_sub_t *)mp->b_rptr;
2939 	dl_subcap->dl_length = sizeof (*idls);
2940 	if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING)
2941 		dl_subcap->dl_cap = DL_CAPAB_SOFT_RING;
2942 	else
2943 		dl_subcap->dl_cap = DL_CAPAB_POLL;
2944 
2945 	idls = (dl_capab_dls_t *)(dl_subcap + 1);
2946 	if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING)
2947 		idls->dls_flags = SOFT_RING_DISABLE;
2948 	else
2949 		idls->dls_flags = POLL_DISABLE;
2950 
2951 	if (*sc_mp != NULL)
2952 		linkb(*sc_mp, mp);
2953 	else
2954 		*sc_mp = mp;
2955 }
2956 
2957 /*
2958  * Process a soft_ring/poll capability negotiation ack received
2959  * from a DLS Provider.  isub must point to the sub-capability
2960  * (DL_CAPAB_SOFT_RING/DL_CAPAB_POLL) of a DL_CAPABILITY_ACK message.
2961  */
2962 static void
2963 ill_capability_dls_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
2964 {
2965 	dl_capab_dls_t		*idls;
2966 	uint_t			sub_dl_cap = isub->dl_cap;
2967 	uint8_t			*capend;
2968 
2969 	ASSERT(sub_dl_cap == DL_CAPAB_SOFT_RING ||
2970 	    sub_dl_cap == DL_CAPAB_POLL);
2971 
2972 	if (ill->ill_isv6)
2973 		return;
2974 
2975 	/*
2976 	 * Note: range checks here are not absolutely sufficient to
2977 	 * make us robust against malformed messages sent by drivers;
2978 	 * this is in keeping with the rest of IP's dlpi handling.
2979 	 * (Remember, it's coming from something else in the kernel
2980 	 * address space)
2981 	 */
2982 	capend = (uint8_t *)(isub + 1) + isub->dl_length;
2983 	if (capend > mp->b_wptr) {
2984 		cmn_err(CE_WARN, "ill_capability_dls_ack: "
2985 		    "malformed sub-capability too long for mblk");
2986 		return;
2987 	}
2988 
2989 	/*
2990 	 * There are two types of acks we process here:
2991 	 * 1. acks in reply to a (first form) generic capability req
2992 	 *    (dls_flag will be set to SOFT_RING_CAPABLE or POLL_CAPABLE)
2993 	 * 2. acks in reply to a SOFT_RING_ENABLE or POLL_ENABLE
2994 	 *    capability req.
2995 	 */
2996 	idls = (dl_capab_dls_t *)(isub + 1);
2997 
2998 	if (!dlcapabcheckqid(&idls->dls_mid, ill->ill_lmod_rq)) {
2999 		ip1dbg(("ill_capability_dls_ack: mid token for dls "
3000 		    "capability isn't as expected; pass-thru "
3001 		    "module(s) detected, discarding capability\n"));
3002 		if (ill->ill_capabilities & ILL_CAPAB_DLS) {
3003 			/*
3004 			 * This is a capability renegotiation case.
3005 			 * The interface had better be unusable at
3006 			 * this point; otherwise bad things will
3007 			 * happen if we disable direct calls on a
3008 			 * running, up interface.
3009 			 */
3010 			ill_capability_dls_disable(ill);
3011 		}
3012 		return;
3013 	}
3014 
3015 	switch (idls->dls_flags) {
3016 	default:
3017 		/* Disable if unknown flag */
3018 	case SOFT_RING_DISABLE:
3019 	case POLL_DISABLE:
3020 		ill_capability_dls_disable(ill);
3021 		break;
3022 	case SOFT_RING_CAPABLE:
3023 	case POLL_CAPABLE:
3024 		/*
3025 		 * If the capability was already enabled, it's safe
3026 		 * to disable it first to get rid of stale information
3027 		 * and then start enabling it again.
3028 		 */
3029 		ill_capability_dls_disable(ill);
3030 		ill_capability_dls_capable(ill, idls, isub);
3031 		break;
3032 	case SOFT_RING_ENABLE:
3033 	case POLL_ENABLE:
3034 		mutex_enter(&ill->ill_lock);
3035 		if (sub_dl_cap == DL_CAPAB_SOFT_RING &&
3036 		    !(ill->ill_capabilities & ILL_CAPAB_SOFT_RING)) {
3037 			ASSERT(ill->ill_dls_capab != NULL);
3038 			ill->ill_capabilities |= ILL_CAPAB_SOFT_RING;
3039 		}
3040 		if (sub_dl_cap == DL_CAPAB_POLL &&
3041 		    !(ill->ill_capabilities & ILL_CAPAB_POLL)) {
3042 			ASSERT(ill->ill_dls_capab != NULL);
3043 			ill->ill_capabilities |= ILL_CAPAB_POLL;
3044 			ip1dbg(("ill_capability_dls_ack: interface %s "
3045 			    "has enabled polling\n", ill->ill_name));
3046 		}
3047 		mutex_exit(&ill->ill_lock);
3048 		break;
3049 	}
3050 }
3051 
3052 /*
3053  * Process a hardware checksum offload capability negotiation ack received
3054  * from a DLS Provider.  isub must point to the sub-capability
3055  * (DL_CAPAB_HCKSUM) of a DL_CAPABILITY_ACK message.
3056  */
3057 static void
3058 ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
3059 {
3060 	dl_capability_req_t	*ocap;
3061 	dl_capab_hcksum_t	*ihck, *ohck;
3062 	ill_hcksum_capab_t	**ill_hcksum;
3063 	mblk_t			*nmp = NULL;
3064 	uint_t			sub_dl_cap = isub->dl_cap;
3065 	uint8_t			*capend;
3066 
3067 	ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM);
3068 
3069 	ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab;
3070 
3071 	/*
3072 	 * Note: range checks here are not absolutely sufficient to
3073 	 * make us robust against malformed messages sent by drivers;
3074 	 * this is in keeping with the rest of IP's dlpi handling.
3075 	 * (Remember, it's coming from something else in the kernel
3076 	 * address space)
3077 	 */
3078 	capend = (uint8_t *)(isub + 1) + isub->dl_length;
3079 	if (capend > mp->b_wptr) {
3080 		cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
3081 		    "malformed sub-capability too long for mblk");
3082 		return;
3083 	}
3084 
3085 	/*
3086 	 * There are two types of acks we process here:
3087 	 * 1. acks in reply to a (first form) generic capability req
3088 	 *    (no ENABLE flag set)
3089 	 * 2. acks in reply to a ENABLE capability req.
3090 	 *    (ENABLE flag set)
3091 	 */
3092 	ihck = (dl_capab_hcksum_t *)(isub + 1);
3093 
3094 	if (ihck->hcksum_version != HCKSUM_VERSION_1) {
3095 		cmn_err(CE_CONT, "ill_capability_hcksum_ack: "
3096 		    "unsupported hardware checksum "
3097 		    "sub-capability (version %d, expected %d)",
3098 		    ihck->hcksum_version, HCKSUM_VERSION_1);
3099 		return;
3100 	}
3101 
3102 	if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) {
3103 		ip1dbg(("ill_capability_hcksum_ack: mid token for hardware "
3104 		    "checksum capability isn't as expected; pass-thru "
3105 		    "module(s) detected, discarding capability\n"));
3106 		return;
3107 	}
3108 
3109 #define	CURR_HCKSUM_CAPAB				\
3110 	(HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 |	\
3111 	HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM)
3112 
3113 	if ((ihck->hcksum_txflags & HCKSUM_ENABLE) &&
3114 	    (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) {
3115 		/* do ENABLE processing */
3116 		if (*ill_hcksum == NULL) {
3117 			*ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t),
3118 			    KM_NOSLEEP);
3119 
3120 			if (*ill_hcksum == NULL) {
3121 				cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
3122 				    "could not enable hcksum version %d "
3123 				    "for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION,
3124 				    ill->ill_name);
3125 				return;
3126 			}
3127 		}
3128 
3129 		(*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version;
3130 		(*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags;
3131 		ill->ill_capabilities |= ILL_CAPAB_HCKSUM;
3132 		ip1dbg(("ill_capability_hcksum_ack: interface %s "
3133 		    "has enabled hardware checksumming\n",
3134 		    ill->ill_name));
3135 	} else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) {
3136 		/*
3137 		 * Enable hardware checksum offload.  IP
3138 		 * currently supports {TCP,UDP}/IPv4 partial
3139 		 * and full checksum offload and IPv4 header
3140 		 * checksum offload.  Allocate a new mblk
3141 		 * which will contain a new capability
3142 		 * request to enable hardware checksum
3143 		 * offload.
3144 		 */
3145 		uint_t	size;
3146 		uchar_t	*rptr;
3147 
3148 		size = sizeof (dl_capability_req_t) +
3149 		    sizeof (dl_capability_sub_t) + isub->dl_length;
3150 
3151 		if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
3152 			cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
3153 			    "could not enable hardware cksum for %s (ENOMEM)\n",
3154 			    ill->ill_name);
3155 			return;
3156 		}
3157 
3158 		rptr = nmp->b_rptr;
3159 		/* initialize dl_capability_req_t */
3160 		ocap = (dl_capability_req_t *)nmp->b_rptr;
3161 		ocap->dl_sub_offset =
3162 		    sizeof (dl_capability_req_t);
3163 		ocap->dl_sub_length =
3164 		    sizeof (dl_capability_sub_t) +
3165 		    isub->dl_length;
3166 		nmp->b_rptr += sizeof (dl_capability_req_t);
3167 
3168 		/* initialize dl_capability_sub_t */
3169 		bcopy(isub, nmp->b_rptr, sizeof (*isub));
3170 		nmp->b_rptr += sizeof (*isub);
3171 
3172 		/* initialize dl_capab_hcksum_t */
3173 		ohck = (dl_capab_hcksum_t *)nmp->b_rptr;
3174 		bcopy(ihck, ohck, sizeof (*ihck));
3175 
3176 		nmp->b_rptr = rptr;
3177 		ASSERT(nmp->b_wptr == (nmp->b_rptr + size));
3178 
3179 		/* Set ENABLE flag */
3180 		ohck->hcksum_txflags &= CURR_HCKSUM_CAPAB;
3181 		ohck->hcksum_txflags |= HCKSUM_ENABLE;
3182 
3183 		/*
3184 		 * nmp points to a DL_CAPABILITY_REQ message to enable
3185 		 * hardware checksum acceleration.
3186 		 */
3187 		ill_dlpi_send(ill, nmp);
3188 	} else {
3189 		ip1dbg(("ill_capability_hcksum_ack: interface %s has "
3190 		    "advertised %x hardware checksum capability flags\n",
3191 		    ill->ill_name, ihck->hcksum_txflags));
3192 	}
3193 }
3194 
3195 static void
3196 ill_capability_hcksum_reset(ill_t *ill, mblk_t **sc_mp)
3197 {
3198 	mblk_t *mp;
3199 	dl_capab_hcksum_t *hck_subcap;
3200 	dl_capability_sub_t *dl_subcap;
3201 	int size;
3202 
3203 	if (!ILL_HCKSUM_CAPABLE(ill))
3204 		return;
3205 
3206 	ASSERT(ill->ill_hcksum_capab != NULL);
3207 	/*
3208 	 * Clear the capability flag for hardware checksum offload but
3209 	 * retain the ill_hcksum_capab structure since it's possible that
3210 	 * another thread is still referring to it.  The structure only
3211 	 * gets deallocated when we destroy the ill.
3212 	 */
3213 	ill->ill_capabilities &= ~ILL_CAPAB_HCKSUM;
3214 
3215 	size = sizeof (*dl_subcap) + sizeof (*hck_subcap);
3216 
3217 	mp = allocb(size, BPRI_HI);
3218 	if (mp == NULL) {
3219 		ip1dbg(("ill_capability_hcksum_reset: unable to allocate "
3220 		    "request to disable hardware checksum offload\n"));
3221 		return;
3222 	}
3223 
3224 	mp->b_wptr = mp->b_rptr + size;
3225 
3226 	dl_subcap = (dl_capability_sub_t *)mp->b_rptr;
3227 	dl_subcap->dl_cap = DL_CAPAB_HCKSUM;
3228 	dl_subcap->dl_length = sizeof (*hck_subcap);
3229 
3230 	hck_subcap = (dl_capab_hcksum_t *)(dl_subcap + 1);
3231 	hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version;
3232 	hck_subcap->hcksum_txflags = 0;
3233 
3234 	if (*sc_mp != NULL)
3235 		linkb(*sc_mp, mp);
3236 	else
3237 		*sc_mp = mp;
3238 }
3239 
3240 static void
3241 ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
3242 {
3243 	mblk_t *nmp = NULL;
3244 	dl_capability_req_t *oc;
3245 	dl_capab_zerocopy_t *zc_ic, *zc_oc;
3246 	ill_zerocopy_capab_t **ill_zerocopy_capab;
3247 	uint_t sub_dl_cap = isub->dl_cap;
3248 	uint8_t *capend;
3249 
3250 	ASSERT(sub_dl_cap == DL_CAPAB_ZEROCOPY);
3251 
3252 	ill_zerocopy_capab = (ill_zerocopy_capab_t **)&ill->ill_zerocopy_capab;
3253 
3254 	/*
3255 	 * Note: range checks here are not absolutely sufficient to
3256 	 * make us robust against malformed messages sent by drivers;
3257 	 * this is in keeping with the rest of IP's dlpi handling.
3258 	 * (Remember, it's coming from something else in the kernel
3259 	 * address space)
3260 	 */
3261 	capend = (uint8_t *)(isub + 1) + isub->dl_length;
3262 	if (capend > mp->b_wptr) {
3263 		cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
3264 		    "malformed sub-capability too long for mblk");
3265 		return;
3266 	}
3267 
3268 	zc_ic = (dl_capab_zerocopy_t *)(isub + 1);
3269 	if (zc_ic->zerocopy_version != ZEROCOPY_VERSION_1) {
3270 		cmn_err(CE_CONT, "ill_capability_zerocopy_ack: "
3271 		    "unsupported ZEROCOPY sub-capability (version %d, "
3272 		    "expected %d)", zc_ic->zerocopy_version,
3273 		    ZEROCOPY_VERSION_1);
3274 		return;
3275 	}
3276 
3277 	if (!dlcapabcheckqid(&zc_ic->zerocopy_mid, ill->ill_lmod_rq)) {
3278 		ip1dbg(("ill_capability_zerocopy_ack: mid token for zerocopy "
3279 		    "capability isn't as expected; pass-thru module(s) "
3280 		    "detected, discarding capability\n"));
3281 		return;
3282 	}
3283 
3284 	if ((zc_ic->zerocopy_flags & DL_CAPAB_VMSAFE_MEM) != 0) {
3285 		if (*ill_zerocopy_capab == NULL) {
3286 			*ill_zerocopy_capab =
3287 			    kmem_zalloc(sizeof (ill_zerocopy_capab_t),
3288 			    KM_NOSLEEP);
3289 
3290 			if (*ill_zerocopy_capab == NULL) {
3291 				cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
3292 				    "could not enable Zero-copy version %d "
3293 				    "for %s (ENOMEM)\n", ZEROCOPY_VERSION_1,
3294 				    ill->ill_name);
3295 				return;
3296 			}
3297 		}
3298 
3299 		ip1dbg(("ill_capability_zerocopy_ack: interface %s "
3300 		    "supports Zero-copy version %d\n", ill->ill_name,
3301 		    ZEROCOPY_VERSION_1));
3302 
3303 		(*ill_zerocopy_capab)->ill_zerocopy_version =
3304 		    zc_ic->zerocopy_version;
3305 		(*ill_zerocopy_capab)->ill_zerocopy_flags =
3306 		    zc_ic->zerocopy_flags;
3307 
3308 		ill->ill_capabilities |= ILL_CAPAB_ZEROCOPY;
3309 	} else {
3310 		uint_t size;
3311 		uchar_t *rptr;
3312 
3313 		size = sizeof (dl_capability_req_t) +
3314 		    sizeof (dl_capability_sub_t) +
3315 		    sizeof (dl_capab_zerocopy_t);
3316 
3317 		if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
3318 			cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
3319 			    "could not enable zerocopy for %s (ENOMEM)\n",
3320 			    ill->ill_name);
3321 			return;
3322 		}
3323 
3324 		rptr = nmp->b_rptr;
3325 		/* initialize dl_capability_req_t */
3326 		oc = (dl_capability_req_t *)rptr;
3327 		oc->dl_sub_offset = sizeof (dl_capability_req_t);
3328 		oc->dl_sub_length = sizeof (dl_capability_sub_t) +
3329 		    sizeof (dl_capab_zerocopy_t);
3330 		rptr += sizeof (dl_capability_req_t);
3331 
3332 		/* initialize dl_capability_sub_t */
3333 		bcopy(isub, rptr, sizeof (*isub));
3334 		rptr += sizeof (*isub);
3335 
3336 		/* initialize dl_capab_zerocopy_t */
3337 		zc_oc = (dl_capab_zerocopy_t *)rptr;
3338 		*zc_oc = *zc_ic;
3339 
3340 		ip1dbg(("ill_capability_zerocopy_ack: asking interface %s "
3341 		    "to enable zero-copy version %d\n", ill->ill_name,
3342 		    ZEROCOPY_VERSION_1));
3343 
3344 		/* set VMSAFE_MEM flag */
3345 		zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM;
3346 
3347 		/* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */
3348 		ill_dlpi_send(ill, nmp);
3349 	}
3350 }
3351 
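/*
 * Disable the zero-copy capability: clear ILL_CAPAB_ZEROCOPY and link a
 * DL_CAPAB_ZEROCOPY sub-capability with zerocopy_flags cleared onto the
 * reset chain *sc_mp, to be sent down to the driver later as part of a
 * capability reset request.
 */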
3352 static void
3353 ill_capability_zerocopy_reset(ill_t *ill, mblk_t **sc_mp)
3354 {
3355 	mblk_t *mp;
3356 	dl_capab_zerocopy_t *zerocopy_subcap;
3357 	dl_capability_sub_t *dl_subcap;
3358 	int size;
3359 
3360 	if (!(ill->ill_capabilities & ILL_CAPAB_ZEROCOPY))
3361 		return;
3362 
3363 	ASSERT(ill->ill_zerocopy_capab != NULL);
3364 	/*
3365 	 * Clear the capability flag for Zero-copy but retain the
3366 	 * ill_zerocopy_capab structure since it's possible that another
3367 	 * thread is still referring to it.  The structure only gets
3368 	 * deallocated when we destroy the ill.
3369 	 */
3370 	ill->ill_capabilities &= ~ILL_CAPAB_ZEROCOPY;
3371 
3372 	size = sizeof (*dl_subcap) + sizeof (*zerocopy_subcap);
3373 
3374 	mp = allocb(size, BPRI_HI);
3375 	if (mp == NULL) {
3376 		ip1dbg(("ill_capability_zerocopy_reset: unable to allocate "
3377 		    "request to disable Zero-copy\n"));
3378 		return;
3379 	}
3380 
3381 	mp->b_wptr = mp->b_rptr + size;
3382 
3383 	dl_subcap = (dl_capability_sub_t *)mp->b_rptr;
3384 	dl_subcap->dl_cap = DL_CAPAB_ZEROCOPY;
3385 	dl_subcap->dl_length = sizeof (*zerocopy_subcap);
3386 
3387 	zerocopy_subcap = (dl_capab_zerocopy_t *)(dl_subcap + 1);
3388 	zerocopy_subcap->zerocopy_version =
3389 	    ill->ill_zerocopy_capab->ill_zerocopy_version;
3390 	zerocopy_subcap->zerocopy_flags = 0;
3391 
3392 	if (*sc_mp != NULL)
3393 		linkb(*sc_mp, mp);
3394 	else
3395 		*sc_mp = mp;
3396 }
3397 
3398 /*
3399  * Consume a new-style hardware capabilities negotiation ack.
3400  * Called from ip_rput_dlpi_writer().
3401  */
3402 void
3403 ill_capability_ack(ill_t *ill, mblk_t *mp)
3404 {
3405 	dl_capability_ack_t *capp;
3406 	dl_capability_sub_t *subp, *endp;
3407 
3408 	if (ill->ill_capab_state == IDMS_INPROGRESS)
3409 		ill->ill_capab_state = IDMS_OK;
3410 
3411 	capp = (dl_capability_ack_t *)mp->b_rptr;
3412 
3413 	if (capp->dl_sub_length == 0)
3414 		/* no new-style capabilities */
3415 		return;
3416 
3417 	/* make sure the driver supplied correct dl_sub_length */
3418 	if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) {
3419 		ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, "
3420 		    "invalid dl_sub_length (%d)\n", capp->dl_sub_length));
3421 		return;
3422 	}
3423 
3424 #define	SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset))
3425 	/*
3426 	 * There are sub-capabilities. Process the ones we know about.
3427 	 * Loop until we don't have room for another sub-cap header.
3428 	 */
3429 	for (subp = SC(capp, capp->dl_sub_offset),
3430 	    endp = SC(subp, capp->dl_sub_length - sizeof (*subp));
3431 	    subp <= endp;
3432 	    subp = SC(subp, sizeof (dl_capability_sub_t) + subp->dl_length)) {
3433 
3434 		switch (subp->dl_cap) {
3435 		case DL_CAPAB_ID_WRAPPER:
3436 			ill_capability_id_ack(ill, mp, subp);
3437 			break;
3438 		default:
3439 			ill_capability_dispatch(ill, mp, subp, B_FALSE);
3440 			break;
3441 		}
3442 	}
3443 #undef SC
3444 }
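/*
 * For reference, the DL_CAPABILITY_ACK walked above is laid out as:
 *
 *	dl_capability_ack_t	(carries dl_sub_offset, dl_sub_length)
 *	...
 *	dl_capability_sub_t	<- subp (at dl_sub_offset)
 *	sub-capability data	(subp->dl_length bytes)
 *	dl_capability_sub_t
 *	sub-capability data
 *	...			(dl_sub_length bytes of subs in total)
 */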
3445 
3446 /*
3447  * This routine is called to scan the fragmentation reassembly table for
3448  * the specified ILL for any packets that are starting to smell.
3449  * dead_interval is the maximum time in seconds that will be tolerated.  It
3450  * will either be the value specified in ip_g_frag_timeout, or zero if the
3451  * ILL is shutting down and it is time to blow everything off.
3452  *
3453  * It returns the number of seconds (as a time_t) that the next frag timer
3454  * should be scheduled for, 0 meaning that the timer doesn't need to be
3455  * re-started.  Note that the method of calculating next_timeout isn't
3456  * entirely accurate since time will flow between the time we grab
3457  * current_time and the time we schedule the next timeout.  This isn't a
3458  * big problem since this is the timer for sending ICMP reassembly time
3459  * exceeded messages, and it doesn't have to be exactly accurate.
3460  *
3461  * This function is sometimes called as writer, although this is not
3462  * required.
3463  */
3464 time_t
3465 ill_frag_timeout(ill_t *ill, time_t dead_interval)
3466 {
3467 	ipfb_t	*ipfb;
3468 	ipfb_t	*endp;
3469 	ipf_t	*ipf;
3470 	ipf_t	*ipfnext;
3471 	mblk_t	*mp;
3472 	time_t	current_time = gethrestime_sec();
3473 	time_t	next_timeout = 0;
3474 	uint32_t	hdr_length;
3475 	mblk_t	*send_icmp_head;
3476 	mblk_t	*send_icmp_head_v6;
3477 
3478 	ipfb = ill->ill_frag_hash_tbl;
3479 	if (ipfb == NULL)
3480 		return (0);
3481 	endp = &ipfb[ILL_FRAG_HASH_TBL_COUNT];
3482 	/* Walk the frag hash table. */
3483 	for (; ipfb < endp; ipfb++) {
3484 		send_icmp_head = NULL;
3485 		send_icmp_head_v6 = NULL;
3486 		mutex_enter(&ipfb->ipfb_lock);
3487 		while ((ipf = ipfb->ipfb_ipf) != NULL) {
3488 			time_t frag_time = current_time - ipf->ipf_timestamp;
3489 			time_t frag_timeout;
3490 
3491 			if (frag_time < dead_interval) {
3492 				/*
3493 				 * There are some outstanding fragments
3494 				 * that will timeout later.  Make note of
3495 				 * the time so that we can reschedule the
3496 				 * next timeout appropriately.
3497 				 */
3498 				frag_timeout = dead_interval - frag_time;
3499 				if (next_timeout == 0 ||
3500 				    frag_timeout < next_timeout) {
3501 					next_timeout = frag_timeout;
3502 				}
3503 				break;
3504 			}
3505 			/* Time's up.  Get it out of here. */
3506 			hdr_length = ipf->ipf_nf_hdr_len;
3507 			ipfnext = ipf->ipf_hash_next;
3508 			if (ipfnext)
3509 				ipfnext->ipf_ptphn = ipf->ipf_ptphn;
3510 			*ipf->ipf_ptphn = ipfnext;
3511 			mp = ipf->ipf_mp->b_cont;
3512 			for (; mp; mp = mp->b_cont) {
3513 				/* Extra points for neatness. */
3514 				IP_REASS_SET_START(mp, 0);
3515 				IP_REASS_SET_END(mp, 0);
3516 			}
3517 			mp = ipf->ipf_mp->b_cont;
3518 			ill->ill_frag_count -= ipf->ipf_count;
3519 			ASSERT(ipfb->ipfb_count >= ipf->ipf_count);
3520 			ipfb->ipfb_count -= ipf->ipf_count;
3521 			ASSERT(ipfb->ipfb_frag_pkts > 0);
3522 			ipfb->ipfb_frag_pkts--;
3523 			/*
3524 			 * We do not send any icmp message from here because
3525 			 * we currently are holding the ipfb_lock for this
3526 			 * hash chain. If we try and send any icmp messages
3527 			 * from here we may end up via a put back into ip
3528 			 * trying to get the same lock, causing a recursive
3529 			 * mutex panic. Instead we build a list and send all
3530 			 * the icmp messages after we have dropped the lock.
3531 			 */
3532 			if (ill->ill_isv6) {
3533 				BUMP_MIB(ill->ill_ip6_mib, ipv6ReasmFails);
3534 				if (hdr_length != 0) {
3535 					mp->b_next = send_icmp_head_v6;
3536 					send_icmp_head_v6 = mp;
3537 				} else {
3538 					freemsg(mp);
3539 				}
3540 			} else {
3541 				BUMP_MIB(&ip_mib, ipReasmFails);
3542 				if (hdr_length != 0) {
3543 					mp->b_next = send_icmp_head;
3544 					send_icmp_head = mp;
3545 				} else {
3546 					freemsg(mp);
3547 				}
3548 			}
3549 			freeb(ipf->ipf_mp);
3550 		}
3551 		mutex_exit(&ipfb->ipfb_lock);
3552 		/*
3553 		 * Now send any icmp messages that we delayed while holding
3554 		 * the bucket lock above.
3555 		 */
3556 		while (send_icmp_head_v6 != NULL) {
3557 			mp = send_icmp_head_v6;
3558 			send_icmp_head_v6 = send_icmp_head_v6->b_next;
3559 			mp->b_next = NULL;
3560 			icmp_time_exceeded_v6(ill->ill_wq, mp,
3561 			    ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE, B_FALSE);
3562 		}
3563 		while (send_icmp_head != NULL) {
3564 			mp = send_icmp_head;
3565 			send_icmp_head = send_icmp_head->b_next;
3566 			mp->b_next = NULL;
3567 			icmp_time_exceeded(ill->ill_wq, mp,
3568 			    ICMP_REASSEMBLY_TIME_EXCEEDED);
3569 		}
3570 	}
3571 	/*
3572 	 * A non-dying ILL will use the return value to decide whether to
3573 	 * restart the frag timer, and for how long.
3574 	 */
3575 	return (next_timeout);
3576 }
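/*
 * A sketch of the expected caller pattern (illustrative only; the actual
 * timer plumbing lives elsewhere in IP):
 *
 *	next = ill_frag_timeout(ill, ip_g_frag_timeout);
 *	if (next != 0)
 *		... restart the frag timer to fire in 'next' seconds ...
 */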
3577 
3578 /*
3579  * This routine is called when the approximate count of mblk memory used
3580  * for the specified ILL has exceeded max_count.
3581  */
3582 void
3583 ill_frag_prune(ill_t *ill, uint_t max_count)
3584 {
3585 	ipfb_t	*ipfb;
3586 	ipf_t	*ipf;
3587 	size_t	count;
3588 
3589 	/*
3590 	 * If we are called again within ip_min_frag_prune_time msecs of the
3591 	 * last prune, increment ill_frag_free_num_pkts so that more of the
3592 	 * oldest packets are freed from each bucket; otherwise reset it to zero.
3593 	 */
3594 	mutex_enter(&ill->ill_lock);
3595 	if (TICK_TO_MSEC(lbolt - ill->ill_last_frag_clean_time) <=
3596 	    (ip_min_frag_prune_time != 0 ?
3597 	    ip_min_frag_prune_time : msec_per_tick)) {
3598 
3599 		ill->ill_frag_free_num_pkts++;
3600 
3601 	} else {
3602 		ill->ill_frag_free_num_pkts = 0;
3603 	}
3604 	ill->ill_last_frag_clean_time = lbolt;
3605 	mutex_exit(&ill->ill_lock);
3606 
3607 	/*
3608 	 * free ill_frag_free_num_pkts oldest packets from each bucket.
3609 	 */
3610 	if (ill->ill_frag_free_num_pkts != 0) {
3611 		int ix;
3612 
3613 		for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) {
3614 			ipfb = &ill->ill_frag_hash_tbl[ix];
3615 			mutex_enter(&ipfb->ipfb_lock);
3616 			if (ipfb->ipfb_ipf != NULL) {
3617 				ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf,
3618 				    ill->ill_frag_free_num_pkts);
3619 			}
3620 			mutex_exit(&ipfb->ipfb_lock);
3621 		}
3622 	}
3623 	/*
3624 	 * While the reassembly list for this ILL is too big, prune a fragment
3625 	 * queue by age, oldest first.  Note that the per ILL count is
3626 	 * approximate, while the per frag hash bucket counts are accurate.
3627 	 */
3628 	while (ill->ill_frag_count > max_count) {
3629 		int	ix;
3630 		ipfb_t	*oipfb = NULL;
3631 		uint_t	oldest = UINT_MAX;
3632 
3633 		count = 0;
3634 		for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) {
3635 			ipfb = &ill->ill_frag_hash_tbl[ix];
3636 			mutex_enter(&ipfb->ipfb_lock);
3637 			ipf = ipfb->ipfb_ipf;
3638 			if (ipf != NULL && ipf->ipf_gen < oldest) {
3639 				oldest = ipf->ipf_gen;
3640 				oipfb = ipfb;
3641 			}
3642 			count += ipfb->ipfb_count;
3643 			mutex_exit(&ipfb->ipfb_lock);
3644 		}
3645 		/* Refresh the per ILL count */
3646 		ill->ill_frag_count = count;
3647 		if (oipfb == NULL) {
3648 			ill->ill_frag_count = 0;
3649 			break;
3650 		}
3651 		if (count <= max_count)
3652 			return;	/* Somebody beat us to it, nothing to do */
3653 		mutex_enter(&oipfb->ipfb_lock);
3654 		ipf = oipfb->ipfb_ipf;
3655 		if (ipf != NULL) {
3656 			ill_frag_free_pkts(ill, oipfb, ipf, 1);
3657 		}
3658 		mutex_exit(&oipfb->ipfb_lock);
3659 	}
3660 }
3661 
3662 /*
3663  * free 'free_cnt' fragmented packets starting at ipf.
3664  */
3665 void
3666 ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt)
3667 {
3668 	size_t	count;
3669 	mblk_t	*mp;
3670 	mblk_t	*tmp;
3671 	ipf_t **ipfp = ipf->ipf_ptphn;
3672 
3673 	ASSERT(MUTEX_HELD(&ipfb->ipfb_lock));
3674 	ASSERT(ipfp != NULL);
3675 	ASSERT(ipf != NULL);
3676 
3677 	while (ipf != NULL && free_cnt-- > 0) {
3678 		count = ipf->ipf_count;
3679 		mp = ipf->ipf_mp;
3680 		ipf = ipf->ipf_hash_next;
3681 		for (tmp = mp; tmp; tmp = tmp->b_cont) {
3682 			IP_REASS_SET_START(tmp, 0);
3683 			IP_REASS_SET_END(tmp, 0);
3684 		}
3685 		ill->ill_frag_count -= count;
3686 		ASSERT(ipfb->ipfb_count >= count);
3687 		ipfb->ipfb_count -= count;
3688 		ASSERT(ipfb->ipfb_frag_pkts > 0);
3689 		ipfb->ipfb_frag_pkts--;
3690 		freemsg(mp);
3691 		BUMP_MIB(&ip_mib, ipReasmFails);
3692 	}
3693 
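	/*
	 * Relink the chain: the first surviving packet (if any) points back
	 * at the saved previous-next pointer, which in turn is made to point
	 * at it.
	 */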
3694 	if (ipf)
3695 		ipf->ipf_ptphn = ipfp;
3696 	ipfp[0] = ipf;
3697 }
3698 
3699 #define	ND_FORWARD_WARNING	"The <if>:ip*_forwarding ndd variables are " \
3700 	"obsolete and may be removed in a future release of Solaris.  Use " \
3701 	"ifconfig(1M) to manipulate the forwarding status of an interface."
3702 
3703 /*
3704  * For obsolete per-interface forwarding configuration;
3705  * called in response to ND_GET.
3706  */
3707 /* ARGSUSED */
3708 static int
3709 nd_ill_forward_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr)
3710 {
3711 	ill_t *ill = (ill_t *)cp;
3712 
3713 	cmn_err(CE_WARN, ND_FORWARD_WARNING);
3714 
3715 	(void) mi_mpprintf(mp, "%d", (ill->ill_flags & ILLF_ROUTER) != 0);
3716 	return (0);
3717 }
3718 
3719 /*
3720  * For obsolete per-interface forwarding configuration;
3721  * called in response to ND_SET.
3722  */
3723 /* ARGSUSED */
3724 static int
3725 nd_ill_forward_set(queue_t *q, mblk_t *mp, char *valuestr, caddr_t cp,
3726     cred_t *ioc_cr)
3727 {
3728 	long value;
3729 	int retval;
3730 
3731 	cmn_err(CE_WARN, ND_FORWARD_WARNING);
3732 
3733 	if (ddi_strtol(valuestr, NULL, 10, &value) != 0 ||
3734 	    value < 0 || value > 1) {
3735 		return (EINVAL);
3736 	}
3737 
3738 	rw_enter(&ill_g_lock, RW_READER);
3739 	retval = ill_forward_set(q, mp, (value != 0), cp);
3740 	rw_exit(&ill_g_lock);
3741 	return (retval);
3742 }
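/*
 * For illustration, these handlers back ndd variables of the form
 * <if>:ip_forwarding / <if>:ip6_forwarding (built in ill_set_ndd_name()
 * below), driven from userland as, e.g. (hme0 is just an example device):
 *
 *	ndd -get /dev/ip hme0:ip_forwarding
 *	ndd -set /dev/ip hme0:ip_forwarding 1
 */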
3743 
3744 /*
3745  * Set an ill's ILLF_ROUTER flag appropriately.  If the ill is part of an
3746  * IPMP group, make sure all ills in the group adopt the new policy.  Send
3747  * up RTS_IFINFO routing socket messages for each interface whose flags we
3748  * change.
3749  */
3750 /* ARGSUSED */
3751 int
3752 ill_forward_set(queue_t *q, mblk_t *mp, boolean_t enable, caddr_t cp)
3753 {
3754 	ill_t *ill = (ill_t *)cp;
3755 	ill_group_t *illgrp;
3756 
3757 	ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ill_g_lock));
3758 
3759 	if ((enable && (ill->ill_flags & ILLF_ROUTER)) ||
3760 	    (!enable && !(ill->ill_flags & ILLF_ROUTER)) ||
3761 	    (ill->ill_phyint->phyint_flags & PHYI_LOOPBACK))
3762 		return (EINVAL);
3763 
3764 	/*
3765 	 * If the ill is in an IPMP group, set the forwarding policy on all
3766 	 * members of the group to the same value.
3767 	 */
3768 	illgrp = ill->ill_group;
3769 	if (illgrp != NULL) {
3770 		ill_t *tmp_ill;
3771 
3772 		for (tmp_ill = illgrp->illgrp_ill; tmp_ill != NULL;
3773 		    tmp_ill = tmp_ill->ill_group_next) {
3774 			ip1dbg(("ill_forward_set: %s %s forwarding on %s",
3775 			    (enable ? "Enabling" : "Disabling"),
3776 			    (tmp_ill->ill_isv6 ? "IPv6" : "IPv4"),
3777 			    tmp_ill->ill_name));
3778 			mutex_enter(&tmp_ill->ill_lock);
3779 			if (enable)
3780 				tmp_ill->ill_flags |= ILLF_ROUTER;
3781 			else
3782 				tmp_ill->ill_flags &= ~ILLF_ROUTER;
3783 			mutex_exit(&tmp_ill->ill_lock);
3784 			if (tmp_ill->ill_isv6)
3785 				ill_set_nce_router_flags(tmp_ill, enable);
3786 			/* Notify routing socket listeners of this change. */
3787 			ip_rts_ifmsg(tmp_ill->ill_ipif);
3788 		}
3789 	} else {
3790 		ip1dbg(("ill_forward_set: %s %s forwarding on %s",
3791 		    (enable ? "Enabling" : "Disabling"),
3792 		    (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name));
3793 		mutex_enter(&ill->ill_lock);
3794 		if (enable)
3795 			ill->ill_flags |= ILLF_ROUTER;
3796 		else
3797 			ill->ill_flags &= ~ILLF_ROUTER;
3798 		mutex_exit(&ill->ill_lock);
3799 		if (ill->ill_isv6)
3800 			ill_set_nce_router_flags(ill, enable);
3801 		/* Notify routing socket listeners of this change. */
3802 		ip_rts_ifmsg(ill->ill_ipif);
3803 	}
3804 
3805 	return (0);
3806 }
3807 
3808 /*
3809  * Based on the ILLF_ROUTER flag of an ill, make sure all local nce's for
3810  * addresses assigned to the ill have the NCE_F_ISROUTER flag appropriately
3811  * set or clear.
3812  */
3813 static void
3814 ill_set_nce_router_flags(ill_t *ill, boolean_t enable)
3815 {
3816 	ipif_t *ipif;
3817 	nce_t *nce;
3818 
3819 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
3820 		nce = ndp_lookup(ill, &ipif->ipif_v6lcl_addr, B_FALSE);
3821 		if (nce != NULL) {
3822 			mutex_enter(&nce->nce_lock);
3823 			if (enable)
3824 				nce->nce_flags |= NCE_F_ISROUTER;
3825 			else
3826 				nce->nce_flags &= ~NCE_F_ISROUTER;
3827 			mutex_exit(&nce->nce_lock);
3828 			NCE_REFRELE(nce);
3829 		}
3830 	}
3831 }
3832 
3833 /*
3834  * Given an ill with a _valid_ name, add the ip_forwarding ndd variable
3835  * for this ill.  Make sure the v6/v4 question has been answered about this
3836  * ill.  The creation of this ndd variable is only for backwards compatibility.
3837  * The preferred way to control per-interface IP forwarding is through the
3838  * ILLF_ROUTER interface flag.
3839  */
3840 static int
3841 ill_set_ndd_name(ill_t *ill)
3842 {
3843 	char *suffix;
3844 
3845 	ASSERT(IAM_WRITER_ILL(ill));
3846 
3847 	if (ill->ill_isv6)
3848 		suffix = ipv6_forward_suffix;
3849 	else
3850 		suffix = ipv4_forward_suffix;
3851 
3852 	ill->ill_ndd_name = ill->ill_name + ill->ill_name_length;
3853 	bcopy(ill->ill_name, ill->ill_ndd_name, ill->ill_name_length - 1);
3854 	/*
3855 	 * Copies over the '\0'.
3856 	 * Note that strlen(suffix) is always bounded.
3857 	 */
3858 	bcopy(suffix, ill->ill_ndd_name + ill->ill_name_length - 1,
3859 	    strlen(suffix) + 1);
3860 
3861 	/*
3862 	 * Use of the nd table requires holding the reader lock.
3863 	 * Modifying the nd table thru nd_load/nd_unload requires
3864 	 * the writer lock.
3865 	 */
3866 	rw_enter(&ip_g_nd_lock, RW_WRITER);
3867 	if (!nd_load(&ip_g_nd, ill->ill_ndd_name, nd_ill_forward_get,
3868 	    nd_ill_forward_set, (caddr_t)ill)) {
3869 		/*
3870 		 * If the nd_load failed, it only means that it could not
3871 		 * allocate more room for further NDD expansion.
3872 		 * Because of that, ill_ndd_name is set back to NULL, and
3873 		 * this interface is at the mercy of the global ip_forwarding
3874 		 * variable.
3875 		 */
3876 		rw_exit(&ip_g_nd_lock);
3877 		ill->ill_ndd_name = NULL;
3878 		return (ENOMEM);
3879 	}
3880 	rw_exit(&ip_g_nd_lock);
3881 	return (0);
3882 }
3883 
3884 /*
3885  * Initializes the context structure and returns the first ill in the list.
3886  * Currently start_list and end_list can have values:
3887  * MAX_G_HEADS		Traverse both IPV4 and IPV6 lists.
3888  * IP_V4_G_HEAD		Traverse IPV4 list only.
3889  * IP_V6_G_HEAD		Traverse IPV6 list only.
3890  */
3891 
3892 /*
3893  * We don't check for CONDEMNED ills here. Caller must do that if
3894  * necessary under the ill lock.
3895  */
3896 ill_t *
3897 ill_first(int start_list, int end_list, ill_walk_context_t *ctx)
3898 {
3899 	ill_if_t *ifp;
3900 	ill_t *ill;
3901 	avl_tree_t *avl_tree;
3902 
3903 	ASSERT(RW_LOCK_HELD(&ill_g_lock));
3904 	ASSERT(end_list <= MAX_G_HEADS && start_list >= 0);
3905 
3906 	/*
3907 	 * setup the lists to search
3908 	 */
3909 	if (end_list != MAX_G_HEADS) {
3910 		ctx->ctx_current_list = start_list;
3911 		ctx->ctx_last_list = end_list;
3912 	} else {
3913 		ctx->ctx_last_list = MAX_G_HEADS - 1;
3914 		ctx->ctx_current_list = 0;
3915 	}
3916 
3917 	while (ctx->ctx_current_list <= ctx->ctx_last_list) {
3918 		ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list);
3919 		if (ifp != (ill_if_t *)
3920 		    &IP_VX_ILL_G_LIST(ctx->ctx_current_list)) {
3921 			avl_tree = &ifp->illif_avl_by_ppa;
3922 			ill = avl_first(avl_tree);
3923 			/*
3924 			 * ill is guaranteed to be non-NULL, or ifp would
3925 			 * not exist.
3926 			 */
3927 			ASSERT(ill != NULL);
3928 			return (ill);
3929 		}
3930 		ctx->ctx_current_list++;
3931 	}
3932 
3933 	return (NULL);
3934 }
3935 
3936 /*
3937  * Returns the next ill in the list.  ill_first() must have been called
3938  * before calling ill_next(), or bad things will happen.
3939  */
3940 
3941 /*
3942  * We don't check for CONDEMNED ills here. Caller must do that if
3943  * necessary under the ill lock.
3944  */
3945 ill_t *
3946 ill_next(ill_walk_context_t *ctx, ill_t *lastill)
3947 {
3948 	ill_if_t *ifp;
3949 	ill_t *ill;
3950 
3952 	ASSERT(RW_LOCK_HELD(&ill_g_lock));
3953 	ASSERT(lastill->ill_ifptr != (ill_if_t *)
3954 	    &IP_VX_ILL_G_LIST(ctx->ctx_current_list));
3955 	if ((ill = avl_walk(&lastill->ill_ifptr->illif_avl_by_ppa, lastill,
3956 	    AVL_AFTER)) != NULL) {
3957 		return (ill);
3958 	}
3959 
3960 	/* goto next ill_ifp in the list. */
3961 	ifp = lastill->ill_ifptr->illif_next;
3962 
3963 	/* make sure not at end of circular list */
3964 	while (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(ctx->ctx_current_list)) {
3965 		if (++ctx->ctx_current_list > ctx->ctx_last_list)
3966 			return (NULL);
3967 		ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list);
3968 	}
3969 
3970 	return (avl_first(&ifp->illif_avl_by_ppa));
3971 }
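/*
 * A typical walk over all ills, for illustration (this is the pattern
 * used by ip_ill_report() below):
 *
 *	ill_walk_context_t ctx;
 *	ill_t *ill;
 *
 *	rw_enter(&ill_g_lock, RW_READER);
 *	ill = ILL_START_WALK_ALL(&ctx);
 *	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
 *		... use ill; check for CONDEMNED under ill_lock if needed ...
 *	}
 *	rw_exit(&ill_g_lock);
 */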
3972 
3973 /*
3974  * Check that an interface name is in the correct format, which is
3975  * name+ppa.  The name may contain letters and digits; the rightmost
3976  * digits make up the ppa number.  Use of octal is not allowed, and the
3977  * name must contain a ppa.  Returns a pointer to the start of the ppa;
3978  * in case of error returns NULL.
3979  */
3980 static char *
3981 ill_get_ppa_ptr(char *name)
3982 {
3983 	int namelen = mi_strlen(name);
3984 
3985 	int len = namelen;
3986 
3987 	name += len;
3988 	while (len > 0) {
3989 		name--;
3990 		if (*name < '0' || *name > '9')
3991 			break;
3992 		len--;
3993 	}
3994 
3995 	/* empty string, all digits, or no trailing digits */
3996 	if (len == 0 || len == (int)namelen)
3997 		return (NULL);
3998 
3999 	name++;
4000 	/* check for attempted use of octal */
4001 	if (*name == '0' && len != (int)namelen - 1)
4002 		return (NULL);
4003 	return (name);
4004 }
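/*
 * Examples: for "hme0" this returns a pointer to the "0"; for "hme10" it
 * returns "10".  "hme" (no trailing digits), "123" (all digits), and
 * "hme01" (leading zero, i.e. attempted octal) all return NULL.
 */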
4005 
4006 /*
4007  * use avl tree to locate the ill.
4008  */
4009 static ill_t *
4010 ill_find_by_name(char *name, boolean_t isv6, queue_t *q, mblk_t *mp,
4011     ipsq_func_t func, int *error)
4012 {
4013 	char *ppa_ptr = NULL;
4014 	int len;
4015 	uint_t ppa;
4016 	ill_t *ill = NULL;
4017 	ill_if_t *ifp;
4018 	int list;
4019 	ipsq_t *ipsq;
4020 
4021 	if (error != NULL)
4022 		*error = 0;
4023 
4024 	/*
4025 	 * get ppa ptr
4026 	 */
4027 	if (isv6)
4028 		list = IP_V6_G_HEAD;
4029 	else
4030 		list = IP_V4_G_HEAD;
4031 
4032 	if ((ppa_ptr = ill_get_ppa_ptr(name)) == NULL) {
4033 		if (error != NULL)
4034 			*error = ENXIO;
4035 		return (NULL);
4036 	}
4037 
4038 	len = ppa_ptr - name + 1;
4039 
4040 	ppa = stoi(&ppa_ptr);
4041 
4042 	ifp = IP_VX_ILL_G_LIST(list);
4043 
4044 	while (ifp != (ill_if_t *)&IP_VX_ILL_G_LIST(list)) {
4045 		/*
4046 		 * Match is done on len - 1, as the name is not null
4047 		 * terminated; it contains the ppa in addition to the
4048 		 * interface name.
4049 		 */
4050 		if ((ifp->illif_name_len == len) &&
4051 		    bcmp(ifp->illif_name, name, len - 1) == 0) {
4052 			break;
4053 		} else {
4054 			ifp = ifp->illif_next;
4055 		}
4056 	}
4057 
4059 	if (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(list)) {
4060 		/*
4061 		 * Even the interface type does not exist.
4062 		 */
4063 		if (error != NULL)
4064 			*error = ENXIO;
4065 		return (NULL);
4066 	}
4067 
4068 	ill = avl_find(&ifp->illif_avl_by_ppa, (void *) &ppa, NULL);
4069 	if (ill != NULL) {
4070 		/*
4071 		 * The block comment at the start of ipif_down
4072 		 * explains the use of the macros used below
4073 		 */
4074 		GRAB_CONN_LOCK(q);
4075 		mutex_enter(&ill->ill_lock);
4076 		if (ILL_CAN_LOOKUP(ill)) {
4077 			ill_refhold_locked(ill);
4078 			mutex_exit(&ill->ill_lock);
4079 			RELEASE_CONN_LOCK(q);
4080 			return (ill);
4081 		} else if (ILL_CAN_WAIT(ill, q)) {
4082 			ipsq = ill->ill_phyint->phyint_ipsq;
4083 			mutex_enter(&ipsq->ipsq_lock);
4084 			mutex_exit(&ill->ill_lock);
4085 			ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
4086 			mutex_exit(&ipsq->ipsq_lock);
4087 			RELEASE_CONN_LOCK(q);
4088 			*error = EINPROGRESS;
4089 			return (NULL);
4090 		}
4091 		mutex_exit(&ill->ill_lock);
4092 		RELEASE_CONN_LOCK(q);
4093 	}
4094 	if (error != NULL)
4095 		*error = ENXIO;
4096 	return (NULL);
4097 }
4098 
4099 /*
4100  * comparison function for use with avl.
4101  */
4102 static int
4103 ill_compare_ppa(const void *ppa_ptr, const void *ill_ptr)
4104 {
4105 	uint_t ppa;
4106 	uint_t ill_ppa;
4107 
4108 	ASSERT(ppa_ptr != NULL && ill_ptr != NULL);
4109 
4110 	ppa = *((uint_t *)ppa_ptr);
4111 	ill_ppa = ((const ill_t *)ill_ptr)->ill_ppa;
4112 	/*
4113 	 * We want the ill with the lowest ppa to be on the
4114 	 * top.
4115 	 */
4116 	if (ill_ppa < ppa)
4117 		return (1);
4118 	if (ill_ppa > ppa)
4119 		return (-1);
4120 	return (0);
4121 }
4122 
4123 /*
4124  * remove an interface type from the global list.
4125  */
4126 static void
4127 ill_delete_interface_type(ill_if_t *interface)
4128 {
4129 	ASSERT(RW_WRITE_HELD(&ill_g_lock));
4130 
4131 	ASSERT(interface != NULL);
4132 	ASSERT(avl_numnodes(&interface->illif_avl_by_ppa) == 0);
4133 
4134 	avl_destroy(&interface->illif_avl_by_ppa);
4135 	if (interface->illif_ppa_arena != NULL)
4136 		vmem_destroy(interface->illif_ppa_arena);
4137 
4138 	remque(interface);
4139 
4140 	mi_free(interface);
4141 }
4142 
4143 /*
4144  * remove ill from the global list.
4145  */
4146 static void
4147 ill_glist_delete(ill_t *ill)
4148 {
4149 	if (ill == NULL)
4150 		return;
4151 
4152 	rw_enter(&ill_g_lock, RW_WRITER);
4153 	/*
4154 	 * If the ill was never inserted into the AVL tree
4155 	 * we skip the if branch.
4156 	 */
4157 	if (ill->ill_ifptr != NULL) {
4158 		/*
4159 		 * remove from AVL tree and free ppa number
4160 		 */
4161 		avl_remove(&ill->ill_ifptr->illif_avl_by_ppa, ill);
4162 
4163 		if (ill->ill_ifptr->illif_ppa_arena != NULL) {
4164 			vmem_free(ill->ill_ifptr->illif_ppa_arena,
4165 			    (void *)(uintptr_t)(ill->ill_ppa+1), 1);
4166 		}
4167 		if (avl_numnodes(&ill->ill_ifptr->illif_avl_by_ppa) == 0) {
4168 			ill_delete_interface_type(ill->ill_ifptr);
4169 		}
4170 
4171 		/*
4172 		 * Indicate ill is no longer in the list.
4173 		 */
4174 		ill->ill_ifptr = NULL;
4175 		ill->ill_name_length = 0;
4176 		ill->ill_name[0] = '\0';
4177 		ill->ill_ppa = UINT_MAX;
4178 	}
4179 	ill_phyint_free(ill);
4180 	rw_exit(&ill_g_lock);
4181 }
4182 
4183 /*
4184  * Allocate a ppa.  If the number of plumbed interfaces of this type is
4185  * less than ill_no_arena, do a linear search to find an unused ppa.
4186  * When the number goes beyond ill_no_arena, switch to using an arena.
4187  * Note: a ppa value of zero cannot be allocated from the vmem arena, as
4188  * zero is the return value for an error condition; thus allocation
4189  * starts at one and the result is decremented by one.
4190  */
4191 static int
4192 ill_alloc_ppa(ill_if_t *ifp, ill_t *ill)
4193 {
4194 	ill_t *tmp_ill;
4195 	uint_t start, end;
4196 	int ppa;
4197 
4198 	if (ifp->illif_ppa_arena == NULL &&
4199 	    (avl_numnodes(&ifp->illif_avl_by_ppa) + 1 > ill_no_arena)) {
4200 		/*
4201 		 * Create an arena.
4202 		 */
4203 		ifp->illif_ppa_arena = vmem_create(ifp->illif_name,
4204 		    (void *)1, UINT_MAX - 1, 1, NULL, NULL,
4205 		    NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
4206 		/* allocate what has already been assigned */
4207 		for (tmp_ill = avl_first(&ifp->illif_avl_by_ppa);
4208 		    tmp_ill != NULL; tmp_ill = avl_walk(&ifp->illif_avl_by_ppa,
4209 		    tmp_ill, AVL_AFTER)) {
4210 			ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena,
4211 			    1,		/* size */
4212 			    1,		/* align/quantum */
4213 			    0,		/* phase */
4214 			    0,		/* nocross */
4215 		(void *)((uintptr_t)tmp_ill->ill_ppa + 1), /* minaddr */
4216 		(void *)((uintptr_t)tmp_ill->ill_ppa + 2), /* maxaddr */
4217 			    VM_NOSLEEP|VM_FIRSTFIT);
4218 			if (ppa == 0) {
4219 				ip1dbg(("ill_alloc_ppa: ppa allocation"
4220 				    " failed while switching"));
4221 				vmem_destroy(ifp->illif_ppa_arena);
4222 				ifp->illif_ppa_arena = NULL;
4223 				break;
4224 			}
4225 		}
4226 	}
4227 
4228 	if (ifp->illif_ppa_arena != NULL) {
4229 		if (ill->ill_ppa == UINT_MAX) {
4230 			ppa = (int)(uintptr_t)vmem_alloc(ifp->illif_ppa_arena,
4231 			    1, VM_NOSLEEP|VM_FIRSTFIT);
4232 			if (ppa == 0)
4233 				return (EAGAIN);
4234 			ill->ill_ppa = --ppa;
4235 		} else {
4236 			ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena,
4237 			    1, 		/* size */
4238 			    1, 		/* align/quantum */
4239 			    0, 		/* phase */
4240 			    0, 		/* nocross */
4241 			    (void *)(uintptr_t)(ill->ill_ppa + 1), /* minaddr */
4242 			    (void *)(uintptr_t)(ill->ill_ppa + 2), /* maxaddr */
4243 			    VM_NOSLEEP|VM_FIRSTFIT);
4244 			/*
4245 			 * Most likely the allocation failed because
4246 			 * the requested ppa was in use.
4247 			 */
4248 			if (ppa == 0)
4249 				return (EEXIST);
4250 		}
4251 		return (0);
4252 	}
4253 
4254 	/*
4255 	 * No arena is in use and not enough (>ill_no_arena) interfaces have
4256 	 * been plumbed to create one.  Do a linear search to get an unused ppa.
4257 	 */
4258 	if (ill->ill_ppa == UINT_MAX) {
4259 		end = UINT_MAX - 1;
4260 		start = 0;
4261 	} else {
4262 		end = start = ill->ill_ppa;
4263 	}
4264 
4265 	tmp_ill = avl_find(&ifp->illif_avl_by_ppa, (void *)&start, NULL);
4266 	while (tmp_ill != NULL && tmp_ill->ill_ppa == start) {
4267 		if (start++ >= end) {
4268 			if (ill->ill_ppa == UINT_MAX)
4269 				return (EAGAIN);
4270 			else
4271 				return (EEXIST);
4272 		}
4273 		tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, tmp_ill, AVL_AFTER);
4274 	}
4275 	ill->ill_ppa = start;
4276 	return (0);
4277 }
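/*
 * For illustration: a ppa of 5 is stored in the arena as the single
 * address 6 (i.e. the range [6, 7)), since vmem cannot return 0 on
 * success.  The vmem_xalloc() calls above therefore use minaddr ppa + 1
 * and maxaddr ppa + 2, and vmem_alloc() results are decremented by one
 * before use.
 */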
4278 
4279 /*
4280  * Insert ill into the list of configured ills.  Once this function completes,
4281  * the ill is globally visible and is available through lookups. More precisely
4282  * this happens after the caller drops the ill_g_lock.
4283  */
4284 static int
4285 ill_glist_insert(ill_t *ill, char *name, boolean_t isv6)
4286 {
4287 	ill_if_t *ill_interface;
4288 	avl_index_t where = 0;
4289 	int error;
4290 	int name_length;
4291 	int index;
4292 	boolean_t check_length = B_FALSE;
4293 
4294 	ASSERT(RW_WRITE_HELD(&ill_g_lock));
4295 
4296 	name_length = mi_strlen(name) + 1;
4297 
4298 	if (isv6)
4299 		index = IP_V6_G_HEAD;
4300 	else
4301 		index = IP_V4_G_HEAD;
4302 
4303 	ill_interface = IP_VX_ILL_G_LIST(index);
4304 	/*
4305 	 * Search for interface type based on name
4306 	 */
4307 	while (ill_interface != (ill_if_t *)&IP_VX_ILL_G_LIST(index)) {
4308 		if ((ill_interface->illif_name_len == name_length) &&
4309 		    (strcmp(ill_interface->illif_name, name) == 0)) {
4310 			break;
4311 		}
4312 		ill_interface = ill_interface->illif_next;
4313 	}
4314 
4315 	/*
4316 	 * Interface type not found, create one.
4317 	 */
4318 	if (ill_interface == (ill_if_t *)&IP_VX_ILL_G_LIST(index)) {
4319 
4320 		ill_g_head_t ghead;
4321 
4322 		/*
4323 		 * allocate ill_if_t structure
4324 		 */
4325 
4326 		ill_interface = (ill_if_t *)mi_zalloc(sizeof (ill_if_t));
4327 		if (ill_interface == NULL) {
4328 			return (ENOMEM);
4329 		}
4330 
4333 		(void) strcpy(ill_interface->illif_name, name);
4334 		ill_interface->illif_name_len = name_length;
4335 
4336 		avl_create(&ill_interface->illif_avl_by_ppa,
4337 		    ill_compare_ppa, sizeof (ill_t),
4338 		    offsetof(struct ill_s, ill_avl_byppa));
4339 
4340 		/*
4341 		 * link the structure in the back to maintain order
4342 		 * of configuration for ifconfig output.
4343 		 */
4344 		ghead = ill_g_heads[index];
4345 		insque(ill_interface, ghead.ill_g_list_tail);
4346 
4347 	}
4348 
4349 	if (ill->ill_ppa == UINT_MAX)
4350 		check_length = B_TRUE;
4351 
4352 	error = ill_alloc_ppa(ill_interface, ill);
4353 	if (error != 0) {
4354 		if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0)
4355 			ill_delete_interface_type(ill->ill_ifptr);
4356 		return (error);
4357 	}
4358 
4359 	/*
4360 	 * When the ppa is chosen by the system, check that there is
4361 	 * enough space to insert the ppa.  If a specific ppa was passed in,
4362 	 * this check is not required, as the interface name passed in will
4363 	 * have the right ppa in it.
4364 	 */
4365 	if (check_length) {
4366 		/*
4367 		 * UINT_MAX - 1 should fit in 10 chars, alloc 12 chars.
4368 		 */
4369 		char buf[sizeof (uint_t) * 3];
4370 
4371 		/*
4372 		 * convert ppa to string to calculate the amount of space
4373 		 * required for it in the name.
4374 		 */
4375 		numtos(ill->ill_ppa, buf);
4376 
4377 		/* Do we have enough space to insert ppa ? */
4378 
4379 		if ((mi_strlen(name) + mi_strlen(buf) + 1) > LIFNAMSIZ) {
4380 			/* Free ppa and interface type struct */
4381 			if (ill_interface->illif_ppa_arena != NULL) {
4382 				vmem_free(ill_interface->illif_ppa_arena,
4383 				    (void *)(uintptr_t)(ill->ill_ppa+1), 1);
4384 			}
4385 			if (avl_numnodes(&ill_interface->illif_avl_by_ppa) ==
4386 			    0) {
4387 				ill_delete_interface_type(ill->ill_ifptr);
4388 			}
4389 
4390 			return (EINVAL);
4391 		}
4392 	}
4393 
4394 	(void) sprintf(ill->ill_name, "%s%u", name, ill->ill_ppa);
4395 	ill->ill_name_length = mi_strlen(ill->ill_name) + 1;
4396 
4397 	(void) avl_find(&ill_interface->illif_avl_by_ppa, &ill->ill_ppa,
4398 	    &where);
4399 	ill->ill_ifptr = ill_interface;
4400 	avl_insert(&ill_interface->illif_avl_by_ppa, ill, where);
4401 
4402 	ill_phyint_reinit(ill);
4403 	return (0);
4404 }
4405 
4406 /* Initialize the per phyint (per IPMP group) ipsq used for serialization */
4407 static boolean_t
4408 ipsq_init(ill_t *ill)
4409 {
4410 	ipsq_t  *ipsq;
4411 
4412 	/* Init the ipsq and implicitly enter as writer */
4413 	ill->ill_phyint->phyint_ipsq =
4414 	    kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP);
4415 	if (ill->ill_phyint->phyint_ipsq == NULL)
4416 		return (B_FALSE);
4417 	ipsq = ill->ill_phyint->phyint_ipsq;
4418 	ipsq->ipsq_phyint_list = ill->ill_phyint;
4419 	ill->ill_phyint->phyint_ipsq_next = NULL;
4420 	mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, 0);
4421 	ipsq->ipsq_refs = 1;
4422 	ipsq->ipsq_writer = curthread;
4423 	ipsq->ipsq_reentry_cnt = 1;
4424 #ifdef ILL_DEBUG
4425 	ipsq->ipsq_depth = getpcstack((pc_t *)ipsq->ipsq_stack, IP_STACK_DEPTH);
4426 #endif
4427 	(void) strcpy(ipsq->ipsq_name, ill->ill_name);
4428 	return (B_TRUE);
4429 }
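/*
 * Note that ipsq_init() leaves the calling thread as the implicit writer
 * (ipsq_writer == curthread, ipsq_reentry_cnt == 1).  A caller that does
 * not want this, such as the loopback-creation path in
 * ill_lookup_on_name(), must undo it explicitly.
 */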
4430 
4431 /*
4432  * ill_init is called by ip_open when a device control stream is opened.
4433  * It does a few initializations, and shoots a DL_INFO_REQ message down
4434  * to the driver.  The response is later picked up in ip_rput_dlpi and
4435  * used to set up default mechanisms for talking to the driver.  (Always
4436  * called as writer.)
4437  *
4438  * If this function returns error, ip_open will call ip_close which in
4439  * turn will call ill_delete to clean up any memory allocated here that
4440  * is not yet freed.
4441  */
4442 int
4443 ill_init(queue_t *q, ill_t *ill)
4444 {
4445 	int	count;
4446 	dl_info_req_t	*dlir;
4447 	mblk_t	*info_mp;
4448 	uchar_t *frag_ptr;
4449 
4450 	/*
4451 	 * The ill is initialized to zero by mi_alloc*(). In addition
4452 	 * some fields already contain valid values, initialized in
4453 	 * ip_open(), before we reach here.
4454 	 */
4455 	mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, 0);
4456 
4457 	ill->ill_rq = q;
4458 	ill->ill_wq = WR(q);
4459 
4460 	info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)),
4461 	    BPRI_HI);
4462 	if (info_mp == NULL)
4463 		return (ENOMEM);
4464 
4465 	/*
4466 	 * Allocate sufficient space to contain our fragment hash table and
4467 	 * the device name.
4468 	 */
4469 	frag_ptr = (uchar_t *)mi_zalloc(ILL_FRAG_HASH_TBL_SIZE +
4470 	    2 * LIFNAMSIZ + 5 + strlen(ipv6_forward_suffix));
4471 	if (frag_ptr == NULL) {
4472 		freemsg(info_mp);
4473 		return (ENOMEM);
4474 	}
4475 	ill->ill_frag_ptr = frag_ptr;
4476 	ill->ill_frag_free_num_pkts = 0;
4477 	ill->ill_last_frag_clean_time = 0;
4478 	ill->ill_frag_hash_tbl = (ipfb_t *)frag_ptr;
4479 	ill->ill_name = (char *)(frag_ptr + ILL_FRAG_HASH_TBL_SIZE);
4480 	for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
4481 		mutex_init(&ill->ill_frag_hash_tbl[count].ipfb_lock,
4482 		    NULL, MUTEX_DEFAULT, NULL);
4483 	}
4484 
4485 	ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t));
4486 	if (ill->ill_phyint == NULL) {
4487 		freemsg(info_mp);
4488 		mi_free(frag_ptr);
4489 		return (ENOMEM);
4490 	}
4491 
4492 	mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0);
4493 	/*
4494 	 * For now pretend this is a v4 ill.  We need to set phyint_ill*
4495 	 * at this point for the following reason: if we can't enter the
4496 	 * ipsq at some point and have to cv_wait, the writer that wakes
4497 	 * us up tries to locate us using the list of all phyints in an
4498 	 * ipsq and the ills from the phyint through the phyint_ill*
4499 	 * pointers.  If we don't set it now, we risk a missed wakeup.
4500 	 */
4501 	ill->ill_phyint->phyint_illv4 = ill;
4502 	ill->ill_ppa = UINT_MAX;
4503 	ill->ill_fastpath_list = &ill->ill_fastpath_list;
4504 
4505 	if (!ipsq_init(ill)) {
4506 		freemsg(info_mp);
4507 		mi_free(frag_ptr);
4508 		mi_free(ill->ill_phyint);
4509 		return (ENOMEM);
4510 	}
4511 
4512 	ill->ill_state_flags |= ILL_LL_SUBNET_PENDING;
4513 
4515 	/* Frag queue limit stuff */
4516 	ill->ill_frag_count = 0;
4517 	ill->ill_ipf_gen = 0;
4518 
4519 	ill->ill_global_timer = INFINITY;
4520 	ill->ill_mcast_type = IGMP_V3_ROUTER;	/* == MLD_V2_ROUTER */
4521 	ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0;
4522 	ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0;
4523 	ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
4524 	ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL;
4525 
4526 	/*
4527 	 * Initialize IPv6 configuration variables.  The IP module is always
4528 	 * opened as an IPv4 module.  Instead of tracking down the cases where
4529 	 * it switches to do IPv6, we'll just initialize the IPv6 configuration
4530 	 * here for convenience; this has no effect until the ill is set to do
4531 	 * IPv6.
4532 	 */
4533 	ill->ill_reachable_time = ND_REACHABLE_TIME;
4534 	ill->ill_reachable_retrans_time = ND_RETRANS_TIMER;
4535 	ill->ill_xmit_count = ND_MAX_MULTICAST_SOLICIT;
4536 	ill->ill_max_buf = ND_MAX_Q;
4537 	ill->ill_refcnt = 0;
4538 
4539 	/* Send down the Info Request to the driver. */
4540 	info_mp->b_datap->db_type = M_PCPROTO;
4541 	dlir = (dl_info_req_t *)info_mp->b_rptr;
4542 	info_mp->b_wptr = (uchar_t *)&dlir[1];
4543 	dlir->dl_primitive = DL_INFO_REQ;
4544 
4545 	ill->ill_dlpi_pending = DL_PRIM_INVAL;
4546 
4547 	qprocson(q);
4548 	ill_dlpi_send(ill, info_mp);
4549 
4550 	return (0);
4551 }
4552 
4553 /*
4554  * ill_dls_info
4555  * creates datalink socket info from the device.
4556  */
4557 int
4558 ill_dls_info(struct sockaddr_dl *sdl, ipif_t *ipif)
4559 {
4560 	size_t	length;
4561 	ill_t	*ill = ipif->ipif_ill;
4562 
4563 	sdl->sdl_family = AF_LINK;
4564 	sdl->sdl_index = ill->ill_phyint->phyint_ifindex;
4565 	sdl->sdl_type = ipif->ipif_type;
4566 	(void) ipif_get_name(ipif, sdl->sdl_data, sizeof (sdl->sdl_data));
4567 	length = mi_strlen(sdl->sdl_data);
4568 	ASSERT(length < 256);
4569 	sdl->sdl_nlen = (uchar_t)length;
4570 	sdl->sdl_alen = ill->ill_phys_addr_length;
4571 	mutex_enter(&ill->ill_lock);
4572 	if (ill->ill_phys_addr_length != 0 && ill->ill_phys_addr != NULL) {
4573 		bcopy(ill->ill_phys_addr, &sdl->sdl_data[length],
4574 		    ill->ill_phys_addr_length);
4575 	}
4576 	mutex_exit(&ill->ill_lock);
4577 	sdl->sdl_slen = 0;
4578 	return (sizeof (struct sockaddr_dl));
4579 }
4580 
4581 /*
4582  * ill_xarp_info
4583  * creates xarp info from the device.
4584  */
4585 static int
4586 ill_xarp_info(struct sockaddr_dl *sdl, ill_t *ill)
4587 {
4588 	sdl->sdl_family = AF_LINK;
4589 	sdl->sdl_index = ill->ill_phyint->phyint_ifindex;
4590 	sdl->sdl_type = ill->ill_type;
4591 	(void) ipif_get_name(ill->ill_ipif, sdl->sdl_data,
4592 	    sizeof (sdl->sdl_data));
4593 	sdl->sdl_nlen = (uchar_t)mi_strlen(sdl->sdl_data);
4594 	sdl->sdl_alen = ill->ill_phys_addr_length;
4595 	sdl->sdl_slen = 0;
4596 	return (sdl->sdl_nlen);
4597 }
4598 
4599 static int
4600 loopback_kstat_update(kstat_t *ksp, int rw)
4601 {
4602 	kstat_named_t *kn = KSTAT_NAMED_PTR(ksp);
4603 
4604 	if (rw == KSTAT_WRITE)
4605 		return (EACCES);
4606 	kn[0].value.ui32 = loopback_packets;
4607 	kn[1].value.ui32 = loopback_packets;
4608 	return (0);
4609 }
4610 
4612 /*
4613  * Has the ifindex already been plumbed?
4614  */
4615 static boolean_t
4616 phyint_exists(uint_t index)
4617 {
4618 	phyint_t *phyi;
4619 
4620 	ASSERT(RW_LOCK_HELD(&ill_g_lock));
4621 	/*
4622 	 * Indexes are stored in the phyint - a common structure
4623 	 * to both IPv4 and IPv6.
4624 	 */
4625 	phyi = avl_find(&phyint_g_list.phyint_list_avl_by_index,
4626 	    (void *) &index, NULL);
4627 	return (phyi != NULL);
4628 }
4629 
4630 /*
4631  * Assign a unique interface index for the phyint.
4632  */
4633 static boolean_t
4634 phyint_assign_ifindex(phyint_t *phyi)
4635 {
4636 	uint_t starting_index;
4637 
4638 	ASSERT(phyi->phyint_ifindex == 0);
4639 	if (!ill_index_wrap) {
4640 		phyi->phyint_ifindex = ill_index++;
4641 		if (ill_index == 0) {
4642 			/* Reached the uint_t limit; wrap next time */
4643 			ill_index_wrap = B_TRUE;
4644 		}
4645 		return (B_TRUE);
4646 	}
4647 
4648 	/*
4649 	 * Start reusing unused indexes. Note that we hold the ill_g_lock
4650 	 * at this point and don't want to call any function that attempts
4651 	 * to get the lock again.
4652 	 */
4653 	starting_index = ill_index++;
4654 	for (; ill_index != starting_index; ill_index++) {
4655 		if (ill_index != 0 && !phyint_exists(ill_index)) {
4656 			/* found unused index - use it */
4657 			phyi->phyint_ifindex = ill_index;
4658 			return (B_TRUE);
4659 		}
4660 	}
4661 
4662 	/*
4663 	 * All interface indices are in use.
4664 	 */
4665 	return (B_FALSE);
4666 }
4667 
4668 /*
4669  * Return a pointer to the ill which matches the supplied name.  Note that
4670  * the ill name length includes the null termination character.  (May be
4671  * called as writer.)
4672  * If do_alloc and the interface is "lo0" it will be automatically created.
4673  * Cannot bump up reference on condemned ills. So dup detect can't be done
4674  * using this func.
4675  */
4676 ill_t *
4677 ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6,
4678     queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, boolean_t *did_alloc)
4679 {
4680 	ill_t	*ill;
4681 	ipif_t	*ipif;
4682 	kstat_named_t	*kn;
4683 	boolean_t isloopback;
4684 	ipsq_t *old_ipsq;
4685 
4686 	isloopback = mi_strcmp(name, ipif_loopback_name) == 0;
4687 
4688 	rw_enter(&ill_g_lock, RW_READER);
4689 	ill = ill_find_by_name(name, isv6, q, mp, func, error);
4690 	rw_exit(&ill_g_lock);
4691 	if (ill != NULL || (error != NULL && *error == EINPROGRESS))
4692 		return (ill);
4693 
4694 	/*
4695 	 * Couldn't find it.  Does this happen to be a lookup for the
4696 	 * loopback device and are we allowed to allocate it?
4697 	 */
4698 	if (!isloopback || !do_alloc)
4699 		return (NULL);
4700 
4701 	rw_enter(&ill_g_lock, RW_WRITER);
4702 
4703 	ill = ill_find_by_name(name, isv6, q, mp, func, error);
4704 	if (ill != NULL || (error != NULL && *error == EINPROGRESS)) {
4705 		rw_exit(&ill_g_lock);
4706 		return (ill);
4707 	}
4708 
4709 	/* Create the loopback device on demand */
4710 	ill = (ill_t *)(mi_alloc(sizeof (ill_t) +
4711 	    sizeof (ipif_loopback_name), BPRI_MED));
4712 	if (ill == NULL)
4713 		goto done;
4714 
4715 	*ill = ill_null;
4716 	mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, NULL);
4717 	ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t));
4718 	if (ill->ill_phyint == NULL)
4719 		goto done;
4720 
4721 	if (isv6)
4722 		ill->ill_phyint->phyint_illv6 = ill;
4723 	else
4724 		ill->ill_phyint->phyint_illv4 = ill;
4725 	mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0);
4726 	ill->ill_max_frag = IP_LOOPBACK_MTU;
4727 	/* Add room for tcp+ip headers */
4728 	if (isv6) {
4729 		ill->ill_isv6 = B_TRUE;
4730 		ill->ill_max_frag += IPV6_HDR_LEN + 20;	/* for TCP */
4731 		if (!ill_allocate_mibs(ill))
4732 			goto done;
4733 	} else {
4734 		ill->ill_max_frag += IP_SIMPLE_HDR_LENGTH + 20;
4735 	}
4736 	ill->ill_max_mtu = ill->ill_max_frag;
4737 	/*
4738 	 * ipif_loopback_name can't be pointed at directly because it's used
4739 	 * by both the ipv4 and ipv6 interfaces.  When the ill is removed
4740 	 * from the glist, ill_glist_delete() sets the first character of
4741 	 * ill_name to '\0'.
4742 	 */
4743 	ill->ill_name = (char *)ill + sizeof (*ill);
4744 	(void) strcpy(ill->ill_name, ipif_loopback_name);
4745 	ill->ill_name_length = sizeof (ipif_loopback_name);
4746 	/* Set ill_name_set for ill_phyint_reinit to work properly */
4747 
4748 	ill->ill_global_timer = INFINITY;
4749 	ill->ill_mcast_type = IGMP_V3_ROUTER;	/* == MLD_V2_ROUTER */
4750 	ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0;
4751 	ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0;
4752 	ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
4753 	ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL;
4754 
4755 	/* No resolver here. */
4756 	ill->ill_net_type = IRE_LOOPBACK;
4757 
4758 	/* Initialize the ipsq */
4759 	if (!ipsq_init(ill))
4760 		goto done;
4761 
4762 	ill->ill_phyint->phyint_ipsq->ipsq_writer = NULL;
4763 	ill->ill_phyint->phyint_ipsq->ipsq_reentry_cnt--;
4764 	ASSERT(ill->ill_phyint->phyint_ipsq->ipsq_reentry_cnt == 0);
4765 #ifdef ILL_DEBUG
4766 	ill->ill_phyint->phyint_ipsq->ipsq_depth = 0;
4767 #endif
4768 	ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE);
4769 	if (ipif == NULL)
4770 		goto done;
4771 
4772 	ill->ill_flags = ILLF_MULTICAST;
4773 
4774 	/* Set up default loopback address and mask. */
4775 	if (!isv6) {
4776 		ipaddr_t inaddr_loopback = htonl(INADDR_LOOPBACK);
4777 
4778 		IN6_IPADDR_TO_V4MAPPED(inaddr_loopback, &ipif->ipif_v6lcl_addr);
4779 		ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr;
4780 		V4MASK_TO_V6(htonl(IN_CLASSA_NET), ipif->ipif_v6net_mask);
4781 		V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
4782 		    ipif->ipif_v6subnet);
4783 		ill->ill_flags |= ILLF_IPV4;
4784 	} else {
4785 		ipif->ipif_v6lcl_addr = ipv6_loopback;
4786 		ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr;
4787 		ipif->ipif_v6net_mask = ipv6_all_ones;
4788 		V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
4789 		    ipif->ipif_v6subnet);
4790 		ill->ill_flags |= ILLF_IPV6;
4791 	}
4792 
4793 	/*
4794 	 * Chain us in at the end of the ill list.  Hold the ill
4795 	 * before we make it globally visible; 1 for the lookup.
4796 	 */
4797 	ill->ill_refcnt = 0;
4798 	ill_refhold(ill);
4799 
4800 	ill->ill_frag_count = 0;
4801 	ill->ill_frag_free_num_pkts = 0;
4802 	ill->ill_last_frag_clean_time = 0;
4803 
4804 	old_ipsq = ill->ill_phyint->phyint_ipsq;
4805 
4806 	if (ill_glist_insert(ill, "lo", isv6) != 0)
4807 		cmn_err(CE_PANIC, "cannot insert loopback interface");
4808 
4809 	/* Let SCTP know so that it can add this to its list */
4810 	sctp_update_ill(ill, SCTP_ILL_INSERT);
4811 
4812 	/* Let SCTP know about this IPIF, so that it can add it to its list */
4813 	sctp_update_ipif(ipif, SCTP_IPIF_INSERT);
4814 
4815 	/*
4816 	 * If the ipsq was changed in ill_phyint_reinit free the old ipsq.
4817 	 */
4818 	if (old_ipsq != ill->ill_phyint->phyint_ipsq) {
4819 		/* Loopback ills aren't in any IPMP group */
4820 		ASSERT(!(old_ipsq->ipsq_flags & IPSQ_GROUP));
4821 		ipsq_delete(old_ipsq);
4822 	}
4823 
4824 	/*
4825 	 * Delay this until the ipif is allocated, as ipif_allocate
4826 	 * dereferences ill_phyint to get the ifindex.  We
4827 	 * can't do this before ipif_allocate because ill_phyint_reinit
4828 	 * -> phyint_assign_ifindex expects ipif to be present.
4829 	 */
4830 	mutex_enter(&ill->ill_phyint->phyint_lock);
4831 	ill->ill_phyint->phyint_flags |= PHYI_LOOPBACK | PHYI_VIRTUAL;
4832 	mutex_exit(&ill->ill_phyint->phyint_lock);
4833 
4834 	if (loopback_ksp == NULL) {
4835 		/* Export loopback interface statistics */
4836 		loopback_ksp = kstat_create("lo", 0, ipif_loopback_name, "net",
4837 		    KSTAT_TYPE_NAMED, 2, 0);
4838 		if (loopback_ksp != NULL) {
4839 			loopback_ksp->ks_update = loopback_kstat_update;
4840 			kn = KSTAT_NAMED_PTR(loopback_ksp);
4841 			kstat_named_init(&kn[0], "ipackets", KSTAT_DATA_UINT32);
4842 			kstat_named_init(&kn[1], "opackets", KSTAT_DATA_UINT32);
4843 			kstat_install(loopback_ksp);
4844 		}
4845 	}
4846 
4847 	if (error != NULL)
4848 		*error = 0;
4849 	*did_alloc = B_TRUE;
4850 	rw_exit(&ill_g_lock);
4851 	return (ill);
4852 done:
4853 	if (ill != NULL) {
4854 		if (ill->ill_phyint != NULL) {
4855 			ipsq_t	*ipsq;
4856 
4857 			ipsq = ill->ill_phyint->phyint_ipsq;
4858 			if (ipsq != NULL)
4859 				kmem_free(ipsq, sizeof (ipsq_t));
4860 			mi_free(ill->ill_phyint);
4861 		}
4862 		ill_free_mib(ill);
4863 		mi_free(ill);
4864 	}
4865 	rw_exit(&ill_g_lock);
4866 	if (error != NULL)
4867 		*error = ENOMEM;
4868 	return (NULL);
4869 }
4870 
4871 /*
4872  * Return a pointer to the ill which matches the index and IP version type.
4873  */
4874 ill_t *
4875 ill_lookup_on_ifindex(uint_t index, boolean_t isv6, queue_t *q, mblk_t *mp,
4876     ipsq_func_t func, int *err)
4877 {
4878 	ill_t	*ill;
4879 	ipsq_t  *ipsq;
4880 	phyint_t *phyi;
4881 
4882 	ASSERT((q == NULL && mp == NULL && func == NULL && err == NULL) ||
4883 	    (q != NULL && mp != NULL && func != NULL && err != NULL));
4884 
4885 	if (err != NULL)
4886 		*err = 0;
4887 
4888 	/*
4889 	 * Indexes are stored in the phyint - a common structure
4890 	 * to both IPv4 and IPv6.
4891 	 */
4892 	rw_enter(&ill_g_lock, RW_READER);
4893 	phyi = avl_find(&phyint_g_list.phyint_list_avl_by_index,
4894 	    (void *) &index, NULL);
4895 	if (phyi != NULL) {
4896 		ill = isv6 ? phyi->phyint_illv6 : phyi->phyint_illv4;
4897 		if (ill != NULL) {
4898 			/*
4899 			 * The block comment at the start of ipif_down
4900 			 * explains the use of the macros used below
4901 			 */
4902 			GRAB_CONN_LOCK(q);
4903 			mutex_enter(&ill->ill_lock);
4904 			if (ILL_CAN_LOOKUP(ill)) {
4905 				ill_refhold_locked(ill);
4906 				mutex_exit(&ill->ill_lock);
4907 				RELEASE_CONN_LOCK(q);
4908 				rw_exit(&ill_g_lock);
4909 				return (ill);
4910 			} else if (ILL_CAN_WAIT(ill, q)) {
4911 				ipsq = ill->ill_phyint->phyint_ipsq;
4912 				mutex_enter(&ipsq->ipsq_lock);
4913 				rw_exit(&ill_g_lock);
4914 				mutex_exit(&ill->ill_lock);
4915 				ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
4916 				mutex_exit(&ipsq->ipsq_lock);
4917 				RELEASE_CONN_LOCK(q);
4918 				*err = EINPROGRESS;
4919 				return (NULL);
4920 			}
4921 			RELEASE_CONN_LOCK(q);
4922 			mutex_exit(&ill->ill_lock);
4923 		}
4924 	}
4925 	rw_exit(&ill_g_lock);
4926 	if (err != NULL)
4927 		*err = ENXIO;
4928 	return (NULL);
4929 }
4930 
4931 /*
4932  * Obtain a reference to the ill. The ill_refcnt is a dynamic refcnt
4933  * that gives a running thread a reference to the ill. This reference must be
4934  * released by the thread when it is done accessing the ill and related
4935  * objects.  ill_refcnt cannot be used to account for static references
4936  * such as other structures pointing to an ill.  Callers must generally
4937  * check whether an ill can be refheld by using the ILL_CAN_LOOKUP macro,
4938  * or be sure that the ill is not being deleted or changing state before
4939  * calling the refhold functions. A non-zero ill_refcnt ensures that the
4940  * ill won't change any of its critical state such as address, netmask etc.
4941  */
4942 void
4943 ill_refhold(ill_t *ill)
4944 {
4945 	mutex_enter(&ill->ill_lock);
4946 	ill->ill_refcnt++;
4947 	ILL_TRACE_REF(ill);
4948 	mutex_exit(&ill->ill_lock);
4949 }
4950 
4951 void
4952 ill_refhold_locked(ill_t *ill)
4953 {
4954 	ASSERT(MUTEX_HELD(&ill->ill_lock));
4955 	ill->ill_refcnt++;
4956 	ILL_TRACE_REF(ill);
4957 }
4958 
4959 int
4960 ill_check_and_refhold(ill_t *ill)
4961 {
4962 	mutex_enter(&ill->ill_lock);
4963 	if (ILL_CAN_LOOKUP(ill)) {
4964 		ill_refhold_locked(ill);
4965 		mutex_exit(&ill->ill_lock);
4966 		return (0);
4967 	}
4968 	mutex_exit(&ill->ill_lock);
4969 	return (ILL_LOOKUP_FAILED);
4970 }
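/*
 * Typical usage (sketch):
 *
 *	if (ill_check_and_refhold(ill) != 0)
 *		return;		... the ill is condemned or changing ...
 *	... access the ill and related objects ...
 *	ill_refrele(ill);
 */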
4971 
4972 /*
4973  * Must not be called while holding any locks. Otherwise if this is
4974  * the last reference to be released, there is a chance of recursive mutex
4975  * panic due to ill_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying
4976  * to restart an ioctl.
4977  */
4978 void
4979 ill_refrele(ill_t *ill)
4980 {
4981 	mutex_enter(&ill->ill_lock);
4982 	ASSERT(ill->ill_refcnt != 0);
4983 	ill->ill_refcnt--;
4984 	ILL_UNTRACE_REF(ill);
4985 	if (ill->ill_refcnt != 0) {
4986 		/* Every ire pointing to the ill adds 1 to ill_refcnt */
4987 		mutex_exit(&ill->ill_lock);
4988 		return;
4989 	}
4990 
4991 	/* Drops the ill_lock */
4992 	ipif_ill_refrele_tail(ill);
4993 }
4994 
4995 /*
4996  * Obtain a weak reference count on the ill. This reference ensures the
4997  * ill won't be freed, but the ill may change any of its critical state
4998  * such as netmask, address etc. Returns an error if the ill has started
4999  * closing.
5000  */
5001 boolean_t
5002 ill_waiter_inc(ill_t *ill)
5003 {
5004 	mutex_enter(&ill->ill_lock);
5005 	if (ill->ill_state_flags & ILL_CONDEMNED) {
5006 		mutex_exit(&ill->ill_lock);
5007 		return (B_FALSE);
5008 	}
5009 	ill->ill_waiters++;
5010 	mutex_exit(&ill->ill_lock);
5011 	return (B_TRUE);
5012 }
5013 
5014 void
5015 ill_waiter_dcr(ill_t *ill)
5016 {
5017 	mutex_enter(&ill->ill_lock);
5018 	ill->ill_waiters--;
5019 	if (ill->ill_waiters == 0)
5020 		cv_broadcast(&ill->ill_cv);
5021 	mutex_exit(&ill->ill_lock);
5022 }
5023 
5024 /*
5025  * Named Dispatch routine to produce a formatted report on all ILLs.
5026  * This report is accessed by using the ndd utility to "get" ND variable
5027  * "ip_ill_status".
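 *
 * For example (illustrative usage from userland):
 *	ndd /dev/ip ip_ill_status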
5028  */
5029 /* ARGSUSED */
5030 int
5031 ip_ill_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr)
5032 {
5033 	ill_t		*ill;
5034 	ill_walk_context_t ctx;
5035 
5036 	(void) mi_mpprintf(mp,
5037 	    "ILL      " MI_COL_HDRPAD_STR
5038 	/*   01234567[89ABCDEF] */
5039 	    "rq       " MI_COL_HDRPAD_STR
5040 	/*   01234567[89ABCDEF] */
5041 	    "wq       " MI_COL_HDRPAD_STR
5042 	/*   01234567[89ABCDEF] */
5043 	    "upcnt mxfrg err name");
5044 	/*   12345 12345 123 xxxxxxxx  */
5045 
5046 	rw_enter(&ill_g_lock, RW_READER);
5047 	ill = ILL_START_WALK_ALL(&ctx);
5048 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
5049 		(void) mi_mpprintf(mp,
5050 		    MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR
5051 		    "%05u %05u %03d %s",
5052 		    (void *)ill, (void *)ill->ill_rq, (void *)ill->ill_wq,
5053 		    ill->ill_ipif_up_count,
5054 		    ill->ill_max_frag, ill->ill_error, ill->ill_name);
5055 	}
5056 	rw_exit(&ill_g_lock);
5057 
5058 	return (0);
5059 }
5060 
5061 /*
5062  * Named Dispatch routine to produce a formatted report on all IPIFs.
5063  * This report is accessed by using the ndd utility to "get" ND variable
5064  * "ip_ipif_status".
5065  */
5066 /* ARGSUSED */
5067 int
5068 ip_ipif_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr)
5069 {
5070 	char	buf1[INET6_ADDRSTRLEN];
5071 	char	buf2[INET6_ADDRSTRLEN];
5072 	char	buf3[INET6_ADDRSTRLEN];
5073 	char	buf4[INET6_ADDRSTRLEN];
5074 	char	buf5[INET6_ADDRSTRLEN];
5075 	char	buf6[INET6_ADDRSTRLEN];
5076 	char	buf[LIFNAMSIZ];
5077 	ill_t	*ill;
5078 	ipif_t	*ipif;
5079 	nv_t	*nvp;
5080 	uint64_t flags;
5081 	zoneid_t zoneid;
5082 	ill_walk_context_t ctx;
5083 
5084 	(void) mi_mpprintf(mp,
5085 	    "IPIF metric mtu in/out/forward name zone flags...\n"
5086 	    "\tlocal address\n"
5087 	    "\tsrc address\n"
5088 	    "\tsubnet\n"
5089 	    "\tmask\n"
5090 	    "\tbroadcast\n"
5091 	    "\tp-p-dst");
5092 
5093 	ASSERT(q->q_next == NULL);
5094 	zoneid = Q_TO_CONN(q)->conn_zoneid;	/* IP is a driver */
5095 
5096 	rw_enter(&ill_g_lock, RW_READER);
5097 	ill = ILL_START_WALK_ALL(&ctx);
5098 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
5099 		for (ipif = ill->ill_ipif; ipif; ipif = ipif->ipif_next) {
5100 			if (zoneid != GLOBAL_ZONEID &&
5101 			    zoneid != ipif->ipif_zoneid)
5102 				continue;
5103 			(void) mi_mpprintf(mp,
5104 			    MI_COL_PTRFMT_STR
5105 			    "%04u %05u %u/%u/%u %s %d",
5106 			    (void *)ipif,
5107 			    ipif->ipif_metric, ipif->ipif_mtu,
5108 			    ipif->ipif_ib_pkt_count,
5109 			    ipif->ipif_ob_pkt_count,
5110 			    ipif->ipif_fo_pkt_count,
5111 			    ipif_get_name(ipif, buf, sizeof (buf)),
5112 			    ipif->ipif_zoneid);
5113 
5114 			flags = ipif->ipif_flags | ipif->ipif_ill->ill_flags |
5115 			    ipif->ipif_ill->ill_phyint->phyint_flags;
5116 
5117 			/* Tack on text strings for any flags. */
5118 			nvp = ipif_nv_tbl;
5119 			for (; nvp < A_END(ipif_nv_tbl); nvp++) {
5120 				if (nvp->nv_value & flags)
5121 					(void) mi_mpprintf_nr(mp, " %s",
5122 					    nvp->nv_name);
5123 			}
5124 			(void) mi_mpprintf(mp,
5125 			    "\t%s\n\t%s\n\t%s\n\t%s\n\t%s\n\t%s",
5126 			    inet_ntop(AF_INET6,
5127 				&ipif->ipif_v6lcl_addr, buf1, sizeof (buf1)),
5128 			    inet_ntop(AF_INET6,
5129 				&ipif->ipif_v6src_addr, buf2, sizeof (buf2)),
5130 			    inet_ntop(AF_INET6,
5131 				&ipif->ipif_v6subnet, buf3, sizeof (buf3)),
5132 			    inet_ntop(AF_INET6,
5133 				&ipif->ipif_v6net_mask, buf4, sizeof (buf4)),
5134 			    inet_ntop(AF_INET6,
5135 				&ipif->ipif_v6brd_addr, buf5, sizeof (buf5)),
5136 			    inet_ntop(AF_INET6,
5137 				&ipif->ipif_v6pp_dst_addr,
5138 				buf6, sizeof (buf6)));
5139 		}
5140 	}
5141 	rw_exit(&ill_g_lock);
5142 	return (0);
5143 }
5144 
5145 /*
5146  * ip_ll_subnet_defaults is called when we get the DL_INFO_ACK back from the
5147  * driver.  We construct best guess defaults for lower level information that
5148  * we need.  If an interface is brought up without injection of any overriding
5149  * information from outside, we have to be ready to go with these defaults.
5150  * When we get the first DL_INFO_ACK (from ip_open() sending a DL_INFO_REQ)
5151  * we primarily want the dl_provider_style.
5152  * The subsequent DL_INFO_ACK is received after doing a DL_ATTACH and DL_BIND
5153  * at which point we assume the other part of the information is valid.
5154  */
5155 void
5156 ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp)
5157 {
5158 	uchar_t		*brdcst_addr;
5159 	uint_t		brdcst_addr_length, phys_addr_length;
5160 	t_scalar_t	sap_length;
5161 	dl_info_ack_t	*dlia;
5162 	ip_m_t		*ipm;
5163 	dl_qos_cl_sel1_t *sel1;
5164 
5165 	ASSERT(IAM_WRITER_ILL(ill));
5166 
5167 	/*
5168 	 * Until the ill is fully up, ILL_CHANGING will be set and
5169 	 * the ill is not globally visible, so there is no need for a lock.
5170 	 */
5171 	dlia = (dl_info_ack_t *)mp->b_rptr;
5172 	ill->ill_mactype = dlia->dl_mac_type;
5173 
5174 	ipm = ip_m_lookup(dlia->dl_mac_type);
5175 	if (ipm == NULL) {
5176 		ipm = ip_m_lookup(DL_OTHER);
5177 		ASSERT(ipm != NULL);
5178 	}
5179 	ill->ill_media = ipm;
5180 
5181 	/*
5182 	 * When the new DLPI stuff is ready we'll pull lengths
5183 	 * from dlia.
5184 	 */
5185 	if (dlia->dl_version == DL_VERSION_2) {
5186 		brdcst_addr_length = dlia->dl_brdcst_addr_length;
5187 		brdcst_addr = mi_offset_param(mp, dlia->dl_brdcst_addr_offset,
5188 		    brdcst_addr_length);
5189 		if (brdcst_addr == NULL) {
5190 			brdcst_addr_length = 0;
5191 		}
5192 		sap_length = dlia->dl_sap_length;
5193 		phys_addr_length = dlia->dl_addr_length - ABS(sap_length);
5194 		ip1dbg(("ip: bcast_len %d, sap_len %d, phys_len %d\n",
5195 		    brdcst_addr_length, sap_length, phys_addr_length));
5196 	} else {
5197 		brdcst_addr_length = 6;
5198 		brdcst_addr = ip_six_byte_all_ones;
5199 		sap_length = -2;
5200 		phys_addr_length = brdcst_addr_length;
5201 	}
5202 
5203 	ill->ill_bcast_addr_length = brdcst_addr_length;
5204 	ill->ill_phys_addr_length = phys_addr_length;
5205 	ill->ill_sap_length = sap_length;
5206 	ill->ill_max_frag = dlia->dl_max_sdu;
5207 	ill->ill_max_mtu = ill->ill_max_frag;
5208 
5209 	ill->ill_type = ipm->ip_m_type;
5210 
5211 	if (!ill->ill_dlpi_style_set) {
5212 		if (dlia->dl_provider_style == DL_STYLE2)
5213 			ill->ill_needs_attach = 1;
5214 
5215 		/*
5216 		 * Allocate the first ipif on this ill. We don't delay it
5217 		 * further as ioctl handling assumes at least one ipif to
5218 		 * be present.
5219 		 *
5220 		 * At this point we don't know whether the ill is v4 or v6.
5221 		 * We will know this when the SIOCSLIFNAME happens and
5222 		 * the correct value for ill_isv6 will be assigned in
5223 		 * ipif_set_values(). We need to hold the ill lock and
5224 		 * clear the ILL_LL_SUBNET_PENDING flag and atomically do
5225 		 * the wakeup.
5226 		 */
5227 		(void) ipif_allocate(ill, 0, IRE_LOCAL,
5228 		    dlia->dl_provider_style == DL_STYLE2 ? B_FALSE : B_TRUE);
5229 		mutex_enter(&ill->ill_lock);
5230 		ASSERT(ill->ill_dlpi_style_set == 0);
5231 		ill->ill_dlpi_style_set = 1;
5232 		ill->ill_state_flags &= ~ILL_LL_SUBNET_PENDING;
5233 		cv_broadcast(&ill->ill_cv);
5234 		mutex_exit(&ill->ill_lock);
5235 		freemsg(mp);
5236 		return;
5237 	}
5238 	ASSERT(ill->ill_ipif != NULL);
5239 	/*
5240 	 * We know whether it is IPv4 or IPv6 now, as this is the
5241 	 * second DL_INFO_ACK we are receiving in response to the
5242 	 * DL_INFO_REQ sent in ipif_set_values.
5243 	 */
5244 	if (ill->ill_isv6)
5245 		ill->ill_sap = IP6_DL_SAP;
5246 	else
5247 		ill->ill_sap = IP_DL_SAP;
5248 	/*
5249 	 * Set ipif_mtu which is used to set the IRE's
5250 	 * ire_max_frag value. The driver could have sent
5251 	 * a different mtu from what it sent last time. No
5252 	 * need to call ipif_mtu_change because IREs have
5253 	 * not yet been created.
5254 	 */
5255 	ill->ill_ipif->ipif_mtu = ill->ill_max_mtu;
5256 	/*
5257 	 * Clear all the flags that were set based on ill_bcast_addr_length
5258 	 * and ill_phys_addr_length (in ipif_set_values) as these could have
5259 	 * changed now and we need to re-evaluate.
5260 	 */
5261 	ill->ill_flags &= ~(ILLF_MULTICAST | ILLF_NONUD | ILLF_NOARP);
5262 	ill->ill_ipif->ipif_flags &= ~(IPIF_BROADCAST | IPIF_POINTOPOINT);
5263 
5264 	/*
5265 	 * Free ill_resolver_mp and ill_bcast_mp as things could have
5266 	 * changed now.
5267 	 */
5268 	if (ill->ill_bcast_addr_length == 0) {
5269 		if (ill->ill_resolver_mp != NULL)
5270 			freemsg(ill->ill_resolver_mp);
5271 		if (ill->ill_bcast_mp != NULL)
5272 			freemsg(ill->ill_bcast_mp);
5273 		if (ill->ill_flags & ILLF_XRESOLV)
5274 			ill->ill_net_type = IRE_IF_RESOLVER;
5275 		else
5276 			ill->ill_net_type = IRE_IF_NORESOLVER;
5277 		ill->ill_resolver_mp = ill_dlur_gen(NULL,
5278 		    ill->ill_phys_addr_length,
5279 		    ill->ill_sap,
5280 		    ill->ill_sap_length);
5281 		ill->ill_bcast_mp = copymsg(ill->ill_resolver_mp);
5282 
5283 		if (ill->ill_isv6)
5284 			/*
5285 			 * Note: xresolv interfaces will eventually need NOARP
5286 			 * set here as well, but that will require those
5287 			 * external resolvers to have some knowledge of
5288 			 * that flag and act appropriately. Not to be changed
5289 			 * at present.
5290 			 */
5291 			ill->ill_flags |= ILLF_NONUD;
5292 		else
5293 			ill->ill_flags |= ILLF_NOARP;
5294 
5295 		if (ill->ill_phys_addr_length == 0) {
5296 			if (ill->ill_media->ip_m_mac_type == SUNW_DL_VNI) {
5297 				ill->ill_ipif->ipif_flags |= IPIF_NOXMIT;
5298 				ill->ill_phyint->phyint_flags |= PHYI_VIRTUAL;
5299 			} else {
5300 				/* pt-pt supports multicast. */
5301 				ill->ill_flags |= ILLF_MULTICAST;
5302 				ill->ill_ipif->ipif_flags |= IPIF_POINTOPOINT;
5303 			}
5304 		}
5305 	} else {
5306 		ill->ill_net_type = IRE_IF_RESOLVER;
5307 		if (ill->ill_bcast_mp != NULL)
5308 			freemsg(ill->ill_bcast_mp);
5309 		ill->ill_bcast_mp = ill_dlur_gen(brdcst_addr,
5310 		    ill->ill_bcast_addr_length, ill->ill_sap,
5311 		    ill->ill_sap_length);
5312 		/*
5313 		 * Later detect lack of DLPI driver multicast
5314 		 * capability by catching DL_ENABMULTI errors in
5315 		 * ip_rput_dlpi.
5316 		 */
5317 		ill->ill_flags |= ILLF_MULTICAST;
5318 		if (!ill->ill_isv6)
5319 			ill->ill_ipif->ipif_flags |= IPIF_BROADCAST;
5320 	}
5321 	/* By default an interface does not support any CoS marking */
5322 	ill->ill_flags &= ~ILLF_COS_ENABLED;
5323 
5324 	/*
5325 	 * If we get QoS information in DL_INFO_ACK, the device supports
5326 	 * some form of CoS marking, set ILLF_COS_ENABLED.
5327 	 */
5328 	sel1 = (dl_qos_cl_sel1_t *)mi_offset_param(mp, dlia->dl_qos_offset,
5329 	    dlia->dl_qos_length);
5330 	if ((sel1 != NULL) && (sel1->dl_qos_type == DL_QOS_CL_SEL1)) {
5331 		ill->ill_flags |= ILLF_COS_ENABLED;
5332 	}
5333 
5334 	/* Clear any previous error indication. */
5335 	ill->ill_error = 0;
5336 	freemsg(mp);
5337 }
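
/*
 * Worked example of the pre-DL_VERSION_2 defaults chosen above (purely
 * illustrative; real drivers report their own values in the ack):
 *
 *	brdcst_addr_length = 6	 ff:ff:ff:ff:ff:ff broadcast
 *	sap_length	   = -2	 2-byte SAP; the sign encodes where the
 *				 SAP sits within the DLSAP address
 *	phys_addr_length   = 6
 */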
5338 
5339 /*
5340  * Perform various checks to verify that an address would make sense as a
5341  * local, remote, or subnet interface address.
5342  */
5343 static boolean_t
5344 ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask)
5345 {
5346 	ipaddr_t	net_mask;
5347 
5348 	/*
5349 	 * Don't allow an all-zeroes, all-ones, or experimental address, but
5350 	 * allow an all-ones netmask.
5351 	 */
5352 	if ((net_mask = ip_net_mask(addr)) == 0)
5353 		return (B_FALSE);
5354 	/* A given netmask overrides the "guess" netmask */
5355 	if (subnet_mask != 0)
5356 		net_mask = subnet_mask;
5357 	if ((net_mask != ~(ipaddr_t)0) && ((addr == (addr & net_mask)) ||
5358 	    (addr == (addr | ~net_mask)))) {
5359 		return (B_FALSE);
5360 	}
5361 	if (CLASSD(addr))
5362 		return (B_FALSE);
5363 
5364 	return (B_TRUE);
5365 }
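
/*
 * Worked example for the checks above (illustrative only): with
 * subnet_mask 255.255.255.0, 192.0.2.0 matches (addr & net_mask) and
 * 192.0.2.255 matches (addr | ~net_mask), so both are rejected;
 * 192.0.2.33 passes; a Class D address such as 224.0.0.1 is rejected
 * by the CLASSD() test.
 */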
5366 
5367 /*
5368  * ipif_lookup_group
5369  * Returns held ipif
5370  */
5371 ipif_t *
5372 ipif_lookup_group(ipaddr_t group, zoneid_t zoneid)
5373 {
5374 	ire_t	*ire;
5375 	ipif_t	*ipif;
5376 
5377 	ire = ire_lookup_multi(group, zoneid);
5378 	if (ire == NULL)
5379 		return (NULL);
5380 	ipif = ire->ire_ipif;
5381 	ipif_refhold(ipif);
5382 	ire_refrele(ire);
5383 	return (ipif);
5384 }
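
/*
 * Sketch of the expected caller pattern (an assumption based on the
 * "returns held ipif" contract, not code copied from a caller):
 *
 *	ipif = ipif_lookup_group(group, zoneid);
 *	if (ipif != NULL) {
 *		... use the ipif ...
 *		ipif_refrele(ipif);	 drop the hold taken for us
 *	}
 */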
5385 
5386 /*
5387  * Look for an ipif with the specified interface address and destination.
5388  * The destination address is used only for matching point-to-point interfaces.
5389  */
5390 ipif_t *
5391 ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp,
5392     ipsq_func_t func, int *error)
5393 {
5394 	ipif_t	*ipif;
5395 	ill_t	*ill;
5396 	ill_walk_context_t ctx;
5397 	ipsq_t	*ipsq;
5398 
5399 	if (error != NULL)
5400 		*error = 0;
5401 
5402 	/*
5403 	 * First match all the point-to-point interfaces
5404 	 * before looking at non-point-to-point interfaces.
5405 	 * This is done to avoid returning a non-point-to-point
5406 	 * ipif instead of an unnumbered point-to-point ipif.
5407 	 */
5408 	rw_enter(&ill_g_lock, RW_READER);
5409 	ill = ILL_START_WALK_V4(&ctx);
5410 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
5411 		GRAB_CONN_LOCK(q);
5412 		mutex_enter(&ill->ill_lock);
5413 		for (ipif = ill->ill_ipif; ipif != NULL;
5414 		    ipif = ipif->ipif_next) {
5415 			/* Allow the ipif to be down */
5416 			if ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
5417 			    (ipif->ipif_lcl_addr == if_addr) &&
5418 			    (ipif->ipif_pp_dst_addr == dst)) {
5419 				/*
5420 				 * The block comment at the start of ipif_down
5421 				 * explains the use of the macros used below
5422 				 */
5423 				if (IPIF_CAN_LOOKUP(ipif)) {
5424 					ipif_refhold_locked(ipif);
5425 					mutex_exit(&ill->ill_lock);
5426 					RELEASE_CONN_LOCK(q);
5427 					rw_exit(&ill_g_lock);
5428 					return (ipif);
5429 				} else if (IPIF_CAN_WAIT(ipif, q)) {
5430 					ipsq = ill->ill_phyint->phyint_ipsq;
5431 					mutex_enter(&ipsq->ipsq_lock);
5432 					mutex_exit(&ill->ill_lock);
5433 					rw_exit(&ill_g_lock);
5434 					ipsq_enq(ipsq, q, mp, func, NEW_OP,
5435 						ill);
5436 					mutex_exit(&ipsq->ipsq_lock);
5437 					RELEASE_CONN_LOCK(q);
5438 					*error = EINPROGRESS;
5439 					return (NULL);
5440 				}
5441 			}
5442 		}
5443 		mutex_exit(&ill->ill_lock);
5444 		RELEASE_CONN_LOCK(q);
5445 	}
5446 	rw_exit(&ill_g_lock);
5447 
5448 	/* lookup the ipif based on interface address */
5449 	ipif = ipif_lookup_addr(if_addr, NULL, ALL_ZONES, q, mp, func, error);
5450 	ASSERT(ipif == NULL || !ipif->ipif_isv6);
5451 	return (ipif);
5452 }
5453 
5454 /*
5455  * Look for an ipif with the specified address. For point-to-point links
5456  * we look for matches on either the destination address or the local
5457  * address, but we ignore the check on the local address if IPIF_UNNUMBERED
5458  * is set.
5459  * Matches on a specific ill if match_ill is set.
5460  */
5461 ipif_t *
5462 ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q,
5463     mblk_t *mp, ipsq_func_t func, int *error)
5464 {
5465 	ipif_t  *ipif;
5466 	ill_t   *ill;
5467 	boolean_t ptp = B_FALSE;
5468 	ipsq_t	*ipsq;
5469 	ill_walk_context_t	ctx;
5470 
5471 	if (error != NULL)
5472 		*error = 0;
5473 
5474 	rw_enter(&ill_g_lock, RW_READER);
5475 	/*
5476 	 * Make two passes: the first matches local addresses, the
5477 	 * second matches point-to-point destination addresses.
5478 	 */
5479 repeat:
5480 	ill = ILL_START_WALK_V4(&ctx);
5481 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
5482 		if (match_ill != NULL && ill != match_ill) {
5483 			continue;
5484 		}
5485 		GRAB_CONN_LOCK(q);
5486 		mutex_enter(&ill->ill_lock);
5487 		for (ipif = ill->ill_ipif; ipif != NULL;
5488 		    ipif = ipif->ipif_next) {
5489 			if (zoneid != ALL_ZONES && zoneid != ipif->ipif_zoneid)
5490 				continue;
5491 			/* Allow the ipif to be down */
5492 			if ((!ptp && (ipif->ipif_lcl_addr == addr) &&
5493 			    ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) ||
5494 			    (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) &&
5495 			    (ipif->ipif_pp_dst_addr == addr))) {
5496 				/*
5497 				 * The block comment at the start of ipif_down
5498 				 * explains the use of the macros used below
5499 				 */
5500 				if (IPIF_CAN_LOOKUP(ipif)) {
5501 					ipif_refhold_locked(ipif);
5502 					mutex_exit(&ill->ill_lock);
5503 					RELEASE_CONN_LOCK(q);
5504 					rw_exit(&ill_g_lock);
5505 					return (ipif);
5506 				} else if (IPIF_CAN_WAIT(ipif, q)) {
5507 					ipsq = ill->ill_phyint->phyint_ipsq;
5508 					mutex_enter(&ipsq->ipsq_lock);
5509 					mutex_exit(&ill->ill_lock);
5510 					rw_exit(&ill_g_lock);
5511 					ipsq_enq(ipsq, q, mp, func, NEW_OP,
5512 						ill);
5513 					mutex_exit(&ipsq->ipsq_lock);
5514 					RELEASE_CONN_LOCK(q);
5515 					*error = EINPROGRESS;
5516 					return (NULL);
5517 				}
5518 			}
5519 		}
5520 		mutex_exit(&ill->ill_lock);
5521 		RELEASE_CONN_LOCK(q);
5522 	}
5523 
5524 	/* If the ptp pass is already done, fail; otherwise repeat for ptp */
5525 	if (ptp) {
5526 		rw_exit(&ill_g_lock);
5527 		if (error != NULL)
5528 			*error = ENXIO;
5529 		return (NULL);
5530 	}
5531 	ptp = B_TRUE;
5532 	goto repeat;
5533 }
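
/*
 * Sketch of the restart pattern used with the lookup above (hedged;
 * variable names are generic). On NULL with *error == EINPROGRESS the
 * mp has been enqueued on the ipsq and 'func' will be re-invoked when
 * the exclusive operation finishes, so the caller just returns:
 *
 *	ipif = ipif_lookup_addr(addr, NULL, zoneid, q, mp, func, &err);
 *	if (ipif == NULL) {
 *		if (err == EINPROGRESS)
 *			return (err);	 restarted later via 'func'
 *		... not found (ENXIO) ...
 *	}
 */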
5534 
5535 /*
5536  * Look for an ipif that matches the specified remote address i.e. the
5537  * ipif that would receive the specified packet.
5538  * First look for directly connected interfaces and then do a recursive
5539  * IRE lookup and pick the first ipif corresponding to the source address in the
5540  * ire.
5541  * Returns: held ipif
5542  */
5543 ipif_t *
5544 ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid)
5545 {
5546 	ipif_t	*ipif;
5547 	ire_t	*ire;
5548 
5549 	ASSERT(!ill->ill_isv6);
5550 
5551 	/*
5552 	 * Someone could be changing this ipif currently or change it
5553 	 * after we return it. Thus a few packets could use the old
5554 	 * values. However, structure updates/creates (ire, ilg, ilm etc.)
5555 	 * will atomically be updated or cleaned up with the new value.
5556 	 * Thus we don't need a lock to check the flags or other attrs below.
5557 	 */
5558 	mutex_enter(&ill->ill_lock);
5559 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
5560 		if (!IPIF_CAN_LOOKUP(ipif))
5561 			continue;
5562 		if (zoneid != ALL_ZONES && zoneid != ipif->ipif_zoneid)
5563 			continue;
5564 		/* Allow the ipif to be down */
5565 		if (ipif->ipif_flags & IPIF_POINTOPOINT) {
5566 			if ((ipif->ipif_pp_dst_addr == addr) ||
5567 			    (!(ipif->ipif_flags & IPIF_UNNUMBERED) &&
5568 			    ipif->ipif_lcl_addr == addr)) {
5569 				ipif_refhold_locked(ipif);
5570 				mutex_exit(&ill->ill_lock);
5571 				return (ipif);
5572 			}
5573 		} else if (ipif->ipif_subnet == (addr & ipif->ipif_net_mask)) {
5574 			ipif_refhold_locked(ipif);
5575 			mutex_exit(&ill->ill_lock);
5576 			return (ipif);
5577 		}
5578 	}
5579 	mutex_exit(&ill->ill_lock);
5580 	ire = ire_route_lookup(addr, 0, 0, 0, NULL, NULL, zoneid,
5581 	    MATCH_IRE_RECURSIVE);
5582 	if (ire != NULL) {
5583 		/*
5584 		 * The callers of this function want to know the
5585 		 * interface on which they have to send the replies
5586 		 * back. For IRE_CACHES that have ire_stq and ire_ipif
5587 		 * derived from different ills, we really don't care
5588 		 * what we return here.
5589 		 */
5590 		ipif = ire->ire_ipif;
5591 		if (ipif != NULL) {
5592 			ipif_refhold(ipif);
5593 			ire_refrele(ire);
5594 			return (ipif);
5595 		}
5596 		ire_refrele(ire);
5597 	}
5598 	/* Pick the first interface */
5599 	ipif = ipif_get_next_ipif(NULL, ill);
5600 	return (ipif);
5601 }
5602 
5603 /*
5604  * This func does not prevent refcnt from increasing. But if
5605  * the caller has taken steps to that effect, then this func
5606  * can be used to determine whether the ill has become quiescent
5607  */
5608 boolean_t
5609 ill_is_quiescent(ill_t *ill)
5610 {
5611 	ipif_t	*ipif;
5612 
5613 	ASSERT(MUTEX_HELD(&ill->ill_lock));
5614 
5615 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
5616 		if (ipif->ipif_refcnt != 0 || ipif->ipif_ire_cnt != 0)
5617 			return (B_FALSE);
5618 	}
5619 	if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0 ||
5620 	    ill->ill_nce_cnt != 0 || ill->ill_srcif_refcnt != 0 ||
5621 	    ill->ill_mrtun_refcnt != 0)
5622 		return (B_FALSE);
5623 	return (B_TRUE);
5624 }
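
/*
 * Sketch of how a waiter might use the predicate above (an assumption
 * consistent with the cv_broadcast done in ipif_ill_refrele_tail(),
 * not a copy of ill_close):
 *
 *	mutex_enter(&ill->ill_lock);
 *	while (!ill_is_quiescent(ill))
 *		cv_wait(&ill->ill_cv, &ill->ill_lock);
 *	mutex_exit(&ill->ill_lock);
 */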
5625 
5626 /*
5627  * This func does not prevent refcnt from increasing. But if
5628  * the caller has taken steps to that effect, then this func
5629  * can be used to determine whether the ipif has become quiescent
5630  */
5631 static boolean_t
5632 ipif_is_quiescent(ipif_t *ipif)
5633 {
5634 	ill_t *ill;
5635 
5636 	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
5637 
5638 	if (ipif->ipif_refcnt != 0 || ipif->ipif_ire_cnt != 0)
5639 		return (B_FALSE);
5640 
5641 	ill = ipif->ipif_ill;
5642 	if (ill->ill_ipif_up_count != 0 || ill->ill_logical_down)
5643 		return (B_TRUE);
5644 
5645 	/* This is the last ipif going down or being deleted on this ill */
5646 	if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0)
5647 		return (B_FALSE);
5648 
5649 	return (B_TRUE);
5650 }
5651 
5652 /*
5653  * This func does not prevent refcnt from increasing. But if
5654  * the caller has taken steps to that effect, then this func
5655  * can be used to determine whether the ipifs marked with IPIF_MOVING
5656  * have become quiescent and can be moved in a failover/failback.
5657  */
5658 static ipif_t *
5659 ill_quiescent_to_move(ill_t *ill)
5660 {
5661 	ipif_t  *ipif;
5662 
5663 	ASSERT(MUTEX_HELD(&ill->ill_lock));
5664 
5665 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
5666 		if (ipif->ipif_state_flags & IPIF_MOVING) {
5667 			if (ipif->ipif_refcnt != 0 || ipif->ipif_ire_cnt != 0) {
5668 				return (ipif);
5669 			}
5670 		}
5671 	}
5672 	return (NULL);
5673 }
5674 
5675 /*
5676  * The ipif/ill/ire has been refreled. Do the tail processing.
5677  * Determine if the ipif or ill in question has become quiescent and if so
5678  * wakeup close and/or restart any queued pending ioctl that is waiting
5679  * for the ipif_down (or ill_down)
5680  */
5681 void
5682 ipif_ill_refrele_tail(ill_t *ill)
5683 {
5684 	mblk_t	*mp;
5685 	conn_t	*connp;
5686 	ipsq_t	*ipsq;
5687 	ipif_t	*ipif;
5688 
5689 	ASSERT(MUTEX_HELD(&ill->ill_lock));
5690 
5691 	if ((ill->ill_state_flags & ILL_CONDEMNED) &&
5692 	    ill_is_quiescent(ill)) {
5693 		/* ill_close may be waiting */
5694 		cv_broadcast(&ill->ill_cv);
5695 	}
5696 
5697 	/* ipsq can't change because ill_lock  is held */
5698 	/* ipsq can't change because ill_lock is held */
5699 	if (ipsq->ipsq_waitfor == 0) {
5700 		/* Not waiting for anything, just return. */
5701 		mutex_exit(&ill->ill_lock);
5702 		return;
5703 	}
5704 	ASSERT(ipsq->ipsq_pending_mp != NULL &&
5705 		ipsq->ipsq_pending_ipif != NULL);
5706 	/*
5707 	 * ipif->ipif_refcnt must go down to zero for restarting REMOVEIF.
5708 	 * Last ipif going down needs to down the ill, so ill_ire_cnt must
5709 	 * be zero for restarting an ioctl that ends up downing the ill.
5710 	 */
5711 	ipif = ipsq->ipsq_pending_ipif;
5712 	if (ipif->ipif_ill != ill) {
5713 		/* The ioctl is pending on some other ill. */
5714 		mutex_exit(&ill->ill_lock);
5715 		return;
5716 	}
5717 
5718 	switch (ipsq->ipsq_waitfor) {
5719 	case IPIF_DOWN:
5720 	case IPIF_FREE:
5721 		if (!ipif_is_quiescent(ipif)) {
5722 			mutex_exit(&ill->ill_lock);
5723 			return;
5724 		}
5725 		break;
5726 
5727 	case ILL_DOWN:
5728 	case ILL_FREE:
5729 		/*
5730 		 * case ILL_FREE arises only for loopback. Otherwise ill_delete
5731 		 * waits synchronously in ip_close, and no message is queued in
5732 		 * ipsq_pending_mp at all in this case
5733 		 */
5734 		if (!ill_is_quiescent(ill)) {
5735 			mutex_exit(&ill->ill_lock);
5736 			return;
5737 		}
5738 
5739 		break;
5740 
5741 	case ILL_MOVE_OK:
5742 		if (ill_quiescent_to_move(ill) != NULL) {
5743 			mutex_exit(&ill->ill_lock);
5744 			return;
5745 		}
5746 
5747 		break;
5748 	default:
5749 		cmn_err(CE_PANIC, "ipsq: %p unknown ipsq_waitfor %d\n",
5750 		    (void *)ipsq, ipsq->ipsq_waitfor);
5751 	}
5752 
5753 	/*
5754 	 * Incr refcnt for the qwriter_ip call below which
5755 	 * does a refrele
5756 	 */
5757 	ill_refhold_locked(ill);
5758 	mutex_exit(&ill->ill_lock);
5759 
5760 	mp = ipsq_pending_mp_get(ipsq, &connp);
5761 	ASSERT(mp != NULL);
5762 
5763 	switch (mp->b_datap->db_type) {
5764 	case M_ERROR:
5765 	case M_HANGUP:
5766 		(void) qwriter_ip(NULL, ill, ill->ill_rq, mp,
5767 		    ipif_all_down_tail, CUR_OP, B_TRUE);
5768 		return;
5769 
5770 	case M_IOCTL:
5771 	case M_IOCDATA:
5772 		(void) qwriter_ip(NULL, ill,
5773 		    (connp != NULL ? CONNP_TO_WQ(connp) : ill->ill_wq), mp,
5774 		    ip_reprocess_ioctl, CUR_OP, B_TRUE);
5775 		return;
5776 
5777 	default:
5778 		cmn_err(CE_PANIC, "ipif_ill_refrele_tail mp %p "
5779 		    "db_type %d\n", (void *)mp, mp->b_datap->db_type);
5780 	}
5781 }
5782 
5783 #ifdef ILL_DEBUG
5784 /* Reuse trace buffer from beginning (if reached the end) and record trace */
5785 void
5786 th_trace_rrecord(th_trace_t *th_trace)
5787 {
5788 	tr_buf_t *tr_buf;
5789 	uint_t lastref;
5790 
5791 	lastref = th_trace->th_trace_lastref;
5792 	lastref++;
5793 	if (lastref == TR_BUF_MAX)
5794 		lastref = 0;
5795 	th_trace->th_trace_lastref = lastref;
5796 	tr_buf = &th_trace->th_trbuf[lastref];
5797 	tr_buf->tr_depth = getpcstack(tr_buf->tr_stack, IP_STACK_DEPTH);
5798 }
5799 
5800 th_trace_t *
5801 th_trace_ipif_lookup(ipif_t *ipif)
5802 {
5803 	int bucket_id;
5804 	th_trace_t *th_trace;
5805 
5806 	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
5807 
5808 	bucket_id = IP_TR_HASH(curthread);
5809 	ASSERT(bucket_id < IP_TR_HASH_MAX);
5810 
5811 	for (th_trace = ipif->ipif_trace[bucket_id]; th_trace != NULL;
5812 	    th_trace = th_trace->th_next) {
5813 		if (th_trace->th_id == curthread)
5814 			return (th_trace);
5815 	}
5816 	return (NULL);
5817 }
5818 
5819 void
5820 ipif_trace_ref(ipif_t *ipif)
5821 {
5822 	int bucket_id;
5823 	th_trace_t *th_trace;
5824 
5825 	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
5826 
5827 	if (ipif->ipif_trace_disable)
5828 		return;
5829 
5830 	/*
5831 	 * Attempt to locate the trace buffer for the curthread.
5832 	 * If it does not exist, then allocate a new trace buffer
5833 	 * and link it in list of trace bufs for this ipif, at the head
5834 	 */
5835 	th_trace = th_trace_ipif_lookup(ipif);
5836 	if (th_trace == NULL) {
5837 		bucket_id = IP_TR_HASH(curthread);
5838 		th_trace = (th_trace_t *)kmem_zalloc(sizeof (th_trace_t),
5839 		    KM_NOSLEEP);
5840 		if (th_trace == NULL) {
5841 			ipif->ipif_trace_disable = B_TRUE;
5842 			ipif_trace_cleanup(ipif);
5843 			return;
5844 		}
5845 		th_trace->th_id = curthread;
5846 		th_trace->th_next = ipif->ipif_trace[bucket_id];
5847 		th_trace->th_prev = &ipif->ipif_trace[bucket_id];
5848 		if (th_trace->th_next != NULL)
5849 			th_trace->th_next->th_prev = &th_trace->th_next;
5850 		ipif->ipif_trace[bucket_id] = th_trace;
5851 	}
5852 	ASSERT(th_trace->th_refcnt >= 0 &&
5853 		th_trace->th_refcnt < TR_BUF_MAX - 1);
5854 	th_trace->th_refcnt++;
5855 	th_trace_rrecord(th_trace);
5856 }
5857 
5858 void
5859 ipif_untrace_ref(ipif_t *ipif)
5860 {
5861 	th_trace_t *th_trace;
5862 
5863 	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
5864 
5865 	if (ipif->ipif_trace_disable)
5866 		return;
5867 	th_trace = th_trace_ipif_lookup(ipif);
5868 	ASSERT(th_trace != NULL);
5869 	ASSERT(th_trace->th_refcnt > 0);
5870 
5871 	th_trace->th_refcnt--;
5872 	th_trace_rrecord(th_trace);
5873 }
5874 
5875 th_trace_t *
5876 th_trace_ill_lookup(ill_t *ill)
5877 {
5878 	th_trace_t *th_trace;
5879 	int bucket_id;
5880 
5881 	ASSERT(MUTEX_HELD(&ill->ill_lock));
5882 
5883 	bucket_id = IP_TR_HASH(curthread);
5884 	ASSERT(bucket_id < IP_TR_HASH_MAX);
5885 
5886 	for (th_trace = ill->ill_trace[bucket_id]; th_trace != NULL;
5887 	    th_trace = th_trace->th_next) {
5888 		if (th_trace->th_id == curthread)
5889 			return (th_trace);
5890 	}
5891 	return (NULL);
5892 }
5893 
5894 void
5895 ill_trace_ref(ill_t *ill)
5896 {
5897 	int bucket_id;
5898 	th_trace_t *th_trace;
5899 
5900 	ASSERT(MUTEX_HELD(&ill->ill_lock));
5901 	if (ill->ill_trace_disable)
5902 		return;
5903 	/*
5904 	 * Attempt to locate the trace buffer for the curthread.
5905 	 * If it does not exist, then allocate a new trace buffer
5906 	 * and link it in list of trace bufs for this ill, at the head
5907 	 */
5908 	th_trace = th_trace_ill_lookup(ill);
5909 	if (th_trace == NULL) {
5910 		bucket_id = IP_TR_HASH(curthread);
5911 		th_trace = (th_trace_t *)kmem_zalloc(sizeof (th_trace_t),
5912 		    KM_NOSLEEP);
5913 		if (th_trace == NULL) {
5914 			ill->ill_trace_disable = B_TRUE;
5915 			ill_trace_cleanup(ill);
5916 			return;
5917 		}
5918 		th_trace->th_id = curthread;
5919 		th_trace->th_next = ill->ill_trace[bucket_id];
5920 		th_trace->th_prev = &ill->ill_trace[bucket_id];
5921 		if (th_trace->th_next != NULL)
5922 			th_trace->th_next->th_prev = &th_trace->th_next;
5923 		ill->ill_trace[bucket_id] = th_trace;
5924 	}
5925 	ASSERT(th_trace->th_refcnt >= 0 &&
5926 		th_trace->th_refcnt < TR_BUF_MAX - 1);
5927 
5928 	th_trace->th_refcnt++;
5929 	th_trace_rrecord(th_trace);
5930 }
5931 
5932 void
5933 ill_untrace_ref(ill_t *ill)
5934 {
5935 	th_trace_t *th_trace;
5936 
5937 	ASSERT(MUTEX_HELD(&ill->ill_lock));
5938 
5939 	if (ill->ill_trace_disable)
5940 		return;
5941 	th_trace = th_trace_ill_lookup(ill);
5942 	ASSERT(th_trace != NULL);
5943 	ASSERT(th_trace->th_refcnt > 0);
5944 
5945 	th_trace->th_refcnt--;
5946 	th_trace_rrecord(th_trace);
5947 }
5948 
5949 /*
5950  * Verify that this thread has no refs to the ipif and free
5951  * the trace buffers
5952  */
5953 /* ARGSUSED */
5954 void
5955 ipif_thread_exit(ipif_t *ipif, void *dummy)
5956 {
5957 	th_trace_t *th_trace;
5958 
5959 	mutex_enter(&ipif->ipif_ill->ill_lock);
5960 
5961 	th_trace = th_trace_ipif_lookup(ipif);
5962 	if (th_trace == NULL) {
5963 		mutex_exit(&ipif->ipif_ill->ill_lock);
5964 		return;
5965 	}
5966 	ASSERT(th_trace->th_refcnt == 0);
5967 	/* unlink th_trace and free it */
5968 	*th_trace->th_prev = th_trace->th_next;
5969 	if (th_trace->th_next != NULL)
5970 		th_trace->th_next->th_prev = th_trace->th_prev;
5971 	th_trace->th_next = NULL;
5972 	th_trace->th_prev = NULL;
5973 	kmem_free(th_trace, sizeof (th_trace_t));
5974 
5975 	mutex_exit(&ipif->ipif_ill->ill_lock);
5976 }
5977 
5978 /*
5979  * Verify that this thread has no refs to the ill and free
5980  * the trace buffers
5981  */
5982 /* ARGSUSED */
5983 void
5984 ill_thread_exit(ill_t *ill, void *dummy)
5985 {
5986 	th_trace_t *th_trace;
5987 
5988 	mutex_enter(&ill->ill_lock);
5989 
5990 	th_trace = th_trace_ill_lookup(ill);
5991 	if (th_trace == NULL) {
5992 		mutex_exit(&ill->ill_lock);
5993 		return;
5994 	}
5995 	ASSERT(th_trace->th_refcnt == 0);
5996 	/* unlink th_trace and free it */
5997 	*th_trace->th_prev = th_trace->th_next;
5998 	if (th_trace->th_next != NULL)
5999 		th_trace->th_next->th_prev = th_trace->th_prev;
6000 	th_trace->th_next = NULL;
6001 	th_trace->th_prev = NULL;
6002 	kmem_free(th_trace, sizeof (th_trace_t));
6003 
6004 	mutex_exit(&ill->ill_lock);
6005 }
6006 #endif
6007 
6008 #ifdef ILL_DEBUG
6009 void
6010 ip_thread_exit(void)
6011 {
6012 	ill_t	*ill;
6013 	ipif_t	*ipif;
6014 	ill_walk_context_t	ctx;
6015 
6016 	rw_enter(&ill_g_lock, RW_READER);
6017 	ill = ILL_START_WALK_ALL(&ctx);
6018 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
6019 		for (ipif = ill->ill_ipif; ipif != NULL;
6020 		    ipif = ipif->ipif_next) {
6021 			ipif_thread_exit(ipif, NULL);
6022 		}
6023 		ill_thread_exit(ill, NULL);
6024 	}
6025 	rw_exit(&ill_g_lock);
6026 
6027 	ire_walk(ire_thread_exit, NULL);
6028 	ndp_walk_impl(NULL, nce_thread_exit, NULL, B_FALSE);
6029 }
6030 
6031 /*
6032  * Called when ipif is unplumbed or when memory alloc fails
6033  */
6034 void
6035 ipif_trace_cleanup(ipif_t *ipif)
6036 {
6037 	int	i;
6038 	th_trace_t	*th_trace;
6039 	th_trace_t	*th_trace_next;
6040 
6041 	for (i = 0; i < IP_TR_HASH_MAX; i++) {
6042 		for (th_trace = ipif->ipif_trace[i]; th_trace != NULL;
6043 		    th_trace = th_trace_next) {
6044 			th_trace_next = th_trace->th_next;
6045 			kmem_free(th_trace, sizeof (th_trace_t));
6046 		}
6047 		ipif->ipif_trace[i] = NULL;
6048 	}
6049 }
6050 
6051 /*
6052  * Called when ill is unplumbed or when memory alloc fails
6053  */
6054 void
6055 ill_trace_cleanup(ill_t *ill)
6056 {
6057 	int	i;
6058 	th_trace_t	*th_trace;
6059 	th_trace_t	*th_trace_next;
6060 
6061 	for (i = 0; i < IP_TR_HASH_MAX; i++) {
6062 		for (th_trace = ill->ill_trace[i]; th_trace != NULL;
6063 		    th_trace = th_trace_next) {
6064 			th_trace_next = th_trace->th_next;
6065 			kmem_free(th_trace, sizeof (th_trace_t));
6066 		}
6067 		ill->ill_trace[i] = NULL;
6068 	}
6069 }
6070 
6071 #else
6072 void ip_thread_exit(void) {}
6073 #endif
6074 
6075 void
6076 ipif_refhold_locked(ipif_t *ipif)
6077 {
6078 	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
6079 	ipif->ipif_refcnt++;
6080 	IPIF_TRACE_REF(ipif);
6081 }
6082 
6083 void
6084 ipif_refhold(ipif_t *ipif)
6085 {
6086 	ill_t	*ill;
6087 
6088 	ill = ipif->ipif_ill;
6089 	mutex_enter(&ill->ill_lock);
6090 	ipif->ipif_refcnt++;
6091 	IPIF_TRACE_REF(ipif);
6092 	mutex_exit(&ill->ill_lock);
6093 }
6094 
6095 /*
6096  * Must not be called while holding any locks. Otherwise, if this is
6097  * the last reference to be released, there is a chance of a recursive mutex
6098  * panic due to ipif_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying
6099  * to restart an ioctl.
6100  */
6101 void
6102 ipif_refrele(ipif_t *ipif)
6103 {
6104 	ill_t	*ill;
6105 
6106 	ill = ipif->ipif_ill;
6107 
6108 	mutex_enter(&ill->ill_lock);
6109 	ASSERT(ipif->ipif_refcnt != 0);
6110 	ipif->ipif_refcnt--;
6111 	IPIF_UNTRACE_REF(ipif);
6112 	if (ipif->ipif_refcnt != 0) {
6113 		mutex_exit(&ill->ill_lock);
6114 		return;
6115 	}
6116 
6117 	/* Drops the ill_lock */
6118 	ipif_ill_refrele_tail(ill);
6119 }
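
/*
 * Example of the rule in the comment above (a sketch): any lock taken
 * around the hold must be dropped before the final refrele, e.g.
 *
 *	mutex_enter(&ill->ill_lock);
 *	ipif_refhold_locked(ipif);
 *	mutex_exit(&ill->ill_lock);
 *	...
 *	ipif_refrele(ipif);	 no locks may be held here
 */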
6120 
6121 ipif_t *
6122 ipif_get_next_ipif(ipif_t *curr, ill_t *ill)
6123 {
6124 	ipif_t	*ipif;
6125 
6126 	mutex_enter(&ill->ill_lock);
6127 	for (ipif = (curr == NULL ? ill->ill_ipif : curr->ipif_next);
6128 	    ipif != NULL; ipif = ipif->ipif_next) {
6129 		if (!IPIF_CAN_LOOKUP(ipif))
6130 			continue;
6131 		ipif_refhold_locked(ipif);
6132 		mutex_exit(&ill->ill_lock);
6133 		return (ipif);
6134 	}
6135 	mutex_exit(&ill->ill_lock);
6136 	return (NULL);
6137 }
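
/*
 * Sketch of walking every lookupable ipif on an ill with the helper
 * above (hedged; the 'next' bookkeeping is an assumption about usage):
 *
 *	for (ipif = ipif_get_next_ipif(NULL, ill); ipif != NULL;
 *	    ipif = next) {
 *		next = ipif_get_next_ipif(ipif, ill);
 *		... use ipif ...
 *		ipif_refrele(ipif);
 *	}
 */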
6138 
6139 /*
6140  * TODO: make this table extendible at run time
6141  * Return a pointer to the mac type info for 'mac_type'
6142  */
6143 static ip_m_t *
6144 ip_m_lookup(t_uscalar_t mac_type)
6145 {
6146 	ip_m_t	*ipm;
6147 
6148 	for (ipm = ip_m_tbl; ipm < A_END(ip_m_tbl); ipm++)
6149 		if (ipm->ip_m_mac_type == mac_type)
6150 			return (ipm);
6151 	return (NULL);
6152 }
6153 
6154 /*
6155  * ip_rt_add is called to add an IPv4 route to the forwarding table.
6156  * ipif_arg is passed in to associate it with the correct interface.
6157  * We may need to restart this operation if the ipif cannot be looked up
6158  * due to an exclusive operation that is currently in progress. The restart
6159  * entry point is specified by 'func'
6160  */
6161 int
6162 ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
6163     ipaddr_t src_addr, int flags, ipif_t *ipif_arg, ipif_t *src_ipif,
6164     ire_t **ire_arg, boolean_t ioctl_msg, queue_t *q, mblk_t *mp,
6165     ipsq_func_t func)
6166 {
6167 	ire_t	*ire;
6168 	ire_t	*gw_ire = NULL;
6169 	ipif_t	*ipif = NULL;
6170 	boolean_t ipif_refheld = B_FALSE;
6171 	uint_t	type;
6172 	int	match_flags = MATCH_IRE_TYPE;
6173 	int	error;
6174 
6175 	ip1dbg(("ip_rt_add:"));
6176 
6177 	if (ire_arg != NULL)
6178 		*ire_arg = NULL;
6179 
6180 	/*
6181 	 * If this is the case of RTF_HOST being set, then we set the netmask
6182 	 * to all ones (regardless if one was supplied).
6183 	 */
6184 	if (flags & RTF_HOST)
6185 		mask = IP_HOST_MASK;
6186 
6187 	/*
6188 	 * Prevent routes with a zero gateway from being created (since
6189 	 * interfaces can currently be plumbed and brought up with no assigned
6190 	 * address).
6191 	 * For routes with RTA_SRCIFP, the gateway address can be 0.0.0.0.
6192 	 */
6193 	if (gw_addr == 0 && src_ipif == NULL)
6194 		return (ENETUNREACH);
6195 	/*
6196 	 * Get the ipif, if any, corresponding to the gw_addr
6197 	 */
6198 	if (gw_addr != 0) {
6199 		ipif = ipif_lookup_interface(gw_addr, dst_addr, q, mp, func,
6200 		    &error);
6201 		if (ipif != NULL) {
6202 			if (IS_VNI(ipif->ipif_ill)) {
6203 				ipif_refrele(ipif);
6204 				return (EINVAL);
6205 			}
6206 			ipif_refheld = B_TRUE;
6207 		} else if (error == EINPROGRESS) {
6208 			ip1dbg(("ip_rt_add: null and EINPROGRESS"));
6209 			return (EINPROGRESS);
6210 		} else {
6211 			error = 0;
6212 		}
6213 	}
6214 
6215 	if (ipif != NULL) {
6216 		ip1dbg(("ip_rt_add: ipif_lookup_interface done ipif nonnull"));
6217 		ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
6218 	} else {
6219 		ip1dbg(("ip_rt_add: ipif_lookup_interface done ipif is null"));
6220 	}
6221 
6222 	/*
6223 	 * GateD will attempt to create routes with a loopback interface
6224 	 * address as the gateway and with RTF_GATEWAY set.  We allow
6225 	 * these routes to be added, but create them as interface routes
6226 	 * since the gateway is an interface address.
6227 	 */
6228 	if ((ipif != NULL) && (ipif->ipif_ire_type == IRE_LOOPBACK))
6229 		flags &= ~RTF_GATEWAY;
6230 
6231 	/*
6232 	 * Traditionally, interface routes are ones where RTF_GATEWAY isn't set
6233 	 * and the gateway address provided is one of the system's interface
6234 	 * addresses.  By using the routing socket interface and supplying an
6235 	 * RTA_IFP sockaddr with an interface index, an alternate method of
6236 	 * specifying an interface route to be created is available which uses
6237 	 * the interface index that specifies the outgoing interface rather than
6238 	 * the address of an outgoing interface (which may not be able to
6239 	 * uniquely identify an interface).  When coupled with the RTF_GATEWAY
6240 	 * flag, routes can be specified which not only specify the next-hop to
6241 	 * be used when routing to a certain prefix, but also which outgoing
6242 	 * interface should be used.
6243 	 *
6244 	 * Previously, interfaces would have unique addresses assigned to them
6245 	 * and so the address assigned to a particular interface could be used
6246 	 * to identify a particular interface.  One exception to this was the
6247 	 * case of an unnumbered interface (where IPIF_UNNUMBERED was set).
6248 	 *
6249 	 * With the advent of IPv6 and its link-local addresses, this
6250 	 * restriction was relaxed and interfaces could share addresses between
6251 	 * themselves.  In fact, typically all of the link-local interfaces on
6252 	 * an IPv6 node or router will have the same link-local address.  In
6253 	 * order to differentiate between these interfaces, the use of an
6254 	 * interface index is necessary and this index can be carried inside a
6255 	 * RTA_IFP sockaddr (which is actually a sockaddr_dl).  One restriction
6256 	 * of using the interface index, however, is that all of the ipif's that
6257 	 * are part of an ill have the same index and so the RTA_IFP sockaddr
6258 	 * cannot be used to differentiate between ipif's (or logical
6259 	 * interfaces) that belong to the same ill (physical interface).
6260 	 *
6261 	 * For example, in the following case involving IPv4 interfaces and
6262 	 * logical interfaces
6263 	 *
6264 	 *	192.0.2.32	255.255.255.224	192.0.2.33	U	if0
6265 	 *	192.0.2.32	255.255.255.224	192.0.2.34	U	if0:1
6266 	 *	192.0.2.32	255.255.255.224	192.0.2.35	U	if0:2
6267 	 *
6268 	 * the ipif's corresponding to each of these interface routes can be
6269 	 * uniquely identified by the "gateway" (actually interface address).
6270 	 *
6271 	 * In this case involving multiple IPv6 default routes to a particular
6272 	 * link-local gateway, the use of RTA_IFP is necessary to specify which
6273 	 * default route is of interest:
6274 	 *
6275 	 *	default		fe80::123:4567:89ab:cdef	U	if0
6276 	 *	default		fe80::123:4567:89ab:cdef	U	if1
6277 	 */
6278 
6279 	/* RTF_GATEWAY not set */
6280 	if (!(flags & RTF_GATEWAY)) {
6281 		queue_t	*stq;
6282 		queue_t	*rfq = NULL;
6283 		ill_t	*in_ill = NULL;
6284 
6285 		/*
6286 		 * As the interface index specified with the RTA_IFP sockaddr is
6287 		 * the same for all ipif's off of an ill, the matching logic
6288 		 * below uses MATCH_IRE_ILL if such an index was specified.
6289 		 * This means that routes sharing the same prefix when added
6290 		 * using a RTA_IFP sockaddr must have distinct interface
6291 		 * indices (namely, they must be on distinct ill's).
6292 		 *
6293 		 * On the other hand, since the gateway address will usually be
6294 		 * different for each ipif on the system, the matching logic
6295 		 * uses MATCH_IRE_IPIF in the case of a traditional interface
6296 		 * route.  This means that interface routes for the same prefix
6297 		 * can be created if they belong to distinct ipif's and if a
6298 		 * RTA_IFP sockaddr is not present.
6299 		 */
6300 		if (ipif_arg != NULL) {
6301 			if (ipif_refheld)  {
6302 				ipif_refrele(ipif);
6303 				ipif_refheld = B_FALSE;
6304 			}
6305 			ipif = ipif_arg;
6306 			match_flags |= MATCH_IRE_ILL;
6307 		} else {
6308 			/*
6309 			 * Check the ipif corresponding to the gw_addr
6310 			 */
6311 			if (ipif == NULL)
6312 				return (ENETUNREACH);
6313 			match_flags |= MATCH_IRE_IPIF;
6314 		}
6315 		ASSERT(ipif != NULL);
6316 		/*
6317 		 * If src_ipif is not NULL, we have to create
6318 		 * an ire with non-null ire_in_ill value
6319 		 */
6320 		if (src_ipif != NULL) {
6321 			in_ill = src_ipif->ipif_ill;
6322 		}
6323 
6324 		/*
6325 		 * We check for an existing entry at this point.
6326 		 *
6327 		 * Since a netmask isn't passed in via the ioctl interface
6328 		 * (SIOCADDRT), we don't check for a matching netmask in that
6329 		 * case.
6330 		 */
6331 		if (!ioctl_msg)
6332 			match_flags |= MATCH_IRE_MASK;
6333 		if (src_ipif != NULL) {
6334 			/* Look up in the special table */
6335 			ire = ire_srcif_table_lookup(dst_addr, IRE_INTERFACE,
6336 			    ipif, src_ipif->ipif_ill, match_flags);
6337 		} else {
6338 			ire = ire_ftable_lookup(dst_addr, mask, 0,
6339 			    IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0,
6340 			    match_flags);
6341 		}
6342 		if (ire != NULL) {
6343 			ire_refrele(ire);
6344 			if (ipif_refheld)
6345 				ipif_refrele(ipif);
6346 			return (EEXIST);
6347 		}
6348 
6349 		if (src_ipif != NULL) {
6350 			/*
6351 			 * Create the special ire for the IRE table
6352 			 * which hangs out of ire_in_ill. This ire
6353 			 * is in-between IRE_CACHE and IRE_INTERFACE.
6354 			 * Thus rfq is non-NULL.
6355 			 */
6356 			rfq = ipif->ipif_rq;
6357 		}
6358 		/* Create the usual interface ires */
6359 
6360 		stq = (ipif->ipif_net_type == IRE_IF_RESOLVER)
6361 		    ? ipif->ipif_rq : ipif->ipif_wq;
6362 
6363 		/*
6364 		 * Create a copy of the IRE_LOOPBACK,
6365 		 * IRE_IF_NORESOLVER or IRE_IF_RESOLVER with
6366 		 * the modified address and netmask.
6367 		 */
6368 		ire = ire_create(
6369 		    (uchar_t *)&dst_addr,
6370 		    (uint8_t *)&mask,
6371 		    (uint8_t *)&ipif->ipif_src_addr,
6372 		    NULL,
6373 		    NULL,
6374 		    &ipif->ipif_mtu,
6375 		    NULL,
6376 		    rfq,
6377 		    stq,
6378 		    ipif->ipif_net_type,
6379 		    ipif->ipif_resolver_mp,
6380 		    ipif,
6381 		    in_ill,
6382 		    0,
6383 		    0,
6384 		    0,
6385 		    flags,
6386 		    &ire_uinfo_null);
6387 		if (ire == NULL) {
6388 			if (ipif_refheld)
6389 				ipif_refrele(ipif);
6390 			return (ENOMEM);
6391 		}
6392 
6393 		/*
6394 		 * Some software (for example, GateD and Sun Cluster) attempts
6395 		 * to create (what amount to) IRE_PREFIX routes with the
6396 		 * loopback address as the gateway.  This is primarily done to
6397 		 * set up prefixes with the RTF_REJECT flag set (for example,
6398 		 * when generating aggregate routes.)
6399 		 *
6400 		 * If the IRE type (as defined by ipif->ipif_net_type) is
6401 		 * IRE_LOOPBACK, then we map the request into a
6402 		 * IRE_IF_NORESOLVER.
6403 		 *
6404 		 * Needless to say, the real IRE_LOOPBACK is NOT created by this
6405 		 * routine, but rather using ire_create() directly.
6406 		 */
6407 		if (ipif->ipif_net_type == IRE_LOOPBACK)
6408 			ire->ire_type = IRE_IF_NORESOLVER;
6409 		error = ire_add(&ire, q, mp, func);
6410 		if (error == 0)
6411 			goto save_ire;
6412 
6413 		/*
6414 		 * In the event of failure, ire_add() will have already
6415 		 * deleted the ire in question, so there is no need to
6416 		 * do that here.
6417 		 */
6418 		if (ipif_refheld)
6419 			ipif_refrele(ipif);
6420 		return (error);
6421 	}
6422 	if (ipif_refheld) {
6423 		ipif_refrele(ipif);
6424 		ipif_refheld = B_FALSE;
6425 	}
6426 
6427 	if (src_ipif != NULL) {
6428 		/* RTA_SRCIFP is not supported on RTF_GATEWAY */
6429 		ip2dbg(("ip_rt_add: SRCIF cannot be set with gateway route\n"));
6430 		return (EINVAL);
6431 	}
6432 	/*
6433 	 * Get an interface IRE for the specified gateway.
6434 	 * If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the
6435 	 * gateway, it is currently unreachable and we fail the request
6436 	 * accordingly.
6437 	 */
6438 	ipif = ipif_arg;
6439 	if (ipif_arg != NULL)
6440 		match_flags |= MATCH_IRE_ILL;
6441 	gw_ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE, ipif_arg, NULL,
6442 	    ALL_ZONES, 0, match_flags);
6443 	if (gw_ire == NULL)
6444 		return (ENETUNREACH);
6445 
6446 	/*
6447 	 * We create one of three types of IREs as a result of this request
6448 	 * based on the netmask.  A netmask of all ones (which is automatically
6449 	 * assumed when RTF_HOST is set) results in an IRE_HOST being created.
6450 	 * An all zeroes netmask implies a default route so an IRE_DEFAULT is
6451 	 * created.  Otherwise, an IRE_PREFIX route is created for the
6452 	 * destination prefix.
6453 	 */
6454 	if (mask == IP_HOST_MASK)
6455 		type = IRE_HOST;
6456 	else if (mask == 0)
6457 		type = IRE_DEFAULT;
6458 	else
6459 		type = IRE_PREFIX;
6460 
6461 	/* check for a duplicate entry */
6462 	ire = ire_ftable_lookup(dst_addr, mask, gw_addr, type, ipif_arg,
6463 	    NULL, ALL_ZONES, 0, match_flags | MATCH_IRE_MASK | MATCH_IRE_GW);
6464 	if (ire != NULL) {
6465 		ire_refrele(gw_ire);
6466 		ire_refrele(ire);
6467 		return (EEXIST);
6468 	}
6469 
6470 	/* Create the IRE. */
6471 	ire = ire_create(
6472 	    (uchar_t *)&dst_addr,		/* dest address */
6473 	    (uchar_t *)&mask,			/* mask */
6474 	    /* src address assigned by the caller? */
6475 	    (uchar_t *)(((src_addr != INADDR_ANY) &&
6476 		(flags & RTF_SETSRC)) ?  &src_addr : NULL),
6477 	    (uchar_t *)&gw_addr,		/* gateway address */
6478 	    NULL,				/* no in-srcaddress */
6479 	    &gw_ire->ire_max_frag,
6480 	    NULL,				/* no Fast Path header */
6481 	    NULL,				/* no recv-from queue */
6482 	    NULL,				/* no send-to queue */
6483 	    (ushort_t)type,			/* IRE type */
6484 	    NULL,
6485 	    ipif_arg,
6486 	    NULL,
6487 	    0,
6488 	    0,
6489 	    0,
6490 	    flags,
6491 	    &gw_ire->ire_uinfo);		/* Inherit ULP info from gw */
6492 	if (ire == NULL) {
6493 		ire_refrele(gw_ire);
6494 		return (ENOMEM);
6495 	}
6496 
6497 	/*
6498 	 * POLICY: should we allow an RTF_HOST with address INADDR_ANY?
6499 	 * SunOS socket code does, but do we really want to allow 0.0.0.0?
6500 	 */
6501 
6502 	/* Add the new IRE. */
6503 	error = ire_add(&ire, q, mp, func);
6504 	if (error != 0) {
6505 		/*
6506 		 * In the event of failure, ire_add() will have already
6507 		 * deleted the ire in question, so there is no need to
6508 		 * do that here.
6509 		 */
6510 		ire_refrele(gw_ire);
6511 		return (error);
6512 	}
6513 
6514 	if (flags & RTF_MULTIRT) {
6515 		/*
6516 		 * Invoke the CGTP (multirouting) filtering module
6517 		 * to add the dst address in the filtering database.
6518 		 * Replicated inbound packets coming from that address
6519 		 * will be filtered to discard the duplicates.
6520 		 * It is not necessary to call the CGTP filter hook
6521 		 * when the dst address is a broadcast or multicast,
6522 		 * because an IP source address cannot be a broadcast
6523 		 * or a multicast.
6524 		 */
6525 		ire_t *ire_dst = ire_ctable_lookup(ire->ire_addr, 0,
6526 		    IRE_BROADCAST, NULL, NULL, MATCH_IRE_TYPE);
6527 		if (ire_dst != NULL) {
6528 			ip_cgtp_bcast_add(ire, ire_dst);
6529 			ire_refrele(ire_dst);
6530 			goto save_ire;
6531 		}
6532 		if ((ip_cgtp_filter_ops != NULL) && !CLASSD(ire->ire_addr)) {
6533 			int res = ip_cgtp_filter_ops->cfo_add_dest_v4(
6534 			    ire->ire_addr,
6535 			    ire->ire_gateway_addr,
6536 			    ire->ire_src_addr,
6537 			    gw_ire->ire_src_addr);
6538 			if (res != 0) {
6539 				ire_refrele(gw_ire);
6540 				ire_delete(ire);
6541 				return (res);
6542 			}
6543 		}
6544 	}
6545 
6546 save_ire:
6547 	if (gw_ire != NULL) {
6548 		ire_refrele(gw_ire);
6549 	}
6550 	/*
6551 	 * We do not do save_ire for routes added with the RTA_SRCIFP
6552 	 * flag. Such routes are only added and deleted by mipagent.
6553 	 * So, for simplicity of design, we refrain from saving
6554 	 * ires that are created with a srcif value. This may change
6555 	 * in the future if we find more usage of the srcifp feature.
6556 	 */
6557 	if (ipif != NULL && src_ipif == NULL) {
6558 		/*
6559 		 * Save enough information so that we can recreate the IRE if
6560 		 * the interface goes down and then up.  The metrics associated
6561 		 * with the route will be saved as well when rts_setmetrics() is
6562 		 * called after the IRE has been created.  In the case where
6563 		 * memory cannot be allocated, none of this information will be
6564 		 * saved.
6565 		 */
6566 		ipif_save_ire(ipif, ire);
6567 	}
6568 	if (ioctl_msg)
6569 		ip_rts_rtmsg(RTM_OLDADD, ire, 0);
6570 	if (ire_arg != NULL) {
6571 		/*
6572 		 * Store the ire that was successfully added into where ire_arg
6573 		 * points to so that callers don't have to look it up
6574 		 * themselves (but they are responsible for ire_refrele()ing
6575 		 * the ire when they are finished with it).
6576 		 */
6577 		*ire_arg = ire;
6578 	} else {
6579 		ire_refrele(ire);		/* Held in ire_add */
6580 	}
6581 	if (ipif_refheld)
6582 		ipif_refrele(ipif);
6583 	return (0);
6584 }
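
/*
 * Illustrative call (a sketch; q, mp and the restart entry point 'func'
 * come from the ioctl or routing-socket path): adding a default route
 * via gateway 192.0.2.1 passes a zero dst/mask, yielding an IRE_DEFAULT:
 *
 *	error = ip_rt_add(0, 0, htonl(0xc0000201), 0, RTF_GATEWAY,
 *	    NULL, NULL, NULL, B_FALSE, q, mp, func);
 */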
6585 
6586 /*
6587  * ip_rt_delete is called to delete an IPv4 route.
6588  * ipif_arg is passed in to associate it with the correct interface.
6589  * src_ipif is passed to associate the incoming interface of the packet.
6590  * We may need to restart this operation if the ipif cannot be looked up
6591  * due to an exclusive operation that is currently in progress. The restart
6592  * entry point is specified by 'func'
6593  */
6594 /* ARGSUSED4 */
6595 int
6596 ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
6597     uint_t rtm_addrs, int flags, ipif_t *ipif_arg, ipif_t *src_ipif,
6598     boolean_t ioctl_msg, queue_t *q, mblk_t *mp, ipsq_func_t func)
6599 {
6600 	ire_t	*ire = NULL;
6601 	ipif_t	*ipif;
6602 	boolean_t ipif_refheld = B_FALSE;
6603 	uint_t	type;
6604 	uint_t	match_flags = MATCH_IRE_TYPE;
6605 	int	err = 0;
6606 
6607 	ip1dbg(("ip_rt_delete:"));
6608 	/*
6609 	 * If this is the case of RTF_HOST being set, then we set the netmask
6610 	 * to all ones.  Otherwise, we use the netmask if one was supplied.
6611 	 */
6612 	if (flags & RTF_HOST) {
6613 		mask = IP_HOST_MASK;
6614 		match_flags |= MATCH_IRE_MASK;
6615 	} else if (rtm_addrs & RTA_NETMASK) {
6616 		match_flags |= MATCH_IRE_MASK;
6617 	}
6618 
6619 	/*
6620 	 * Note that RTF_GATEWAY is never set on a delete, therefore
6621 	 * we check if the gateway address is one of our interfaces first,
6622 	 * and fall back on RTF_GATEWAY routes.
6623 	 *
6624 	 * This makes it possible to delete an original
6625 	 * IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1.
6626 	 *
6627 	 * As the interface index specified with the RTA_IFP sockaddr is the
6628 	 * same for all ipif's off of an ill, the matching logic below uses
6629 	 * MATCH_IRE_ILL if such an index was specified.  This means a route
6630 	 * sharing the same prefix and interface index as the route
6631 	 * intended to be deleted might be deleted instead if a RTA_IFP sockaddr
6632 	 * is specified in the request.
6633 	 *
6634 	 * On the other hand, since the gateway address will usually be
6635 	 * different for each ipif on the system, the matching logic
6636 	 * uses MATCH_IRE_IPIF in the case of a traditional interface
6637 	 * route.  This means that interface routes for the same prefix can be
6638 	 * uniquely identified if they belong to distinct ipif's and if a
6639 	 * RTA_IFP sockaddr is not present.
6640 	 *
6641 	 * For more detail on specifying routes by gateway address and by
6642 	 * interface index, see the comments in ip_rt_add().
6643 	 * gw_addr could be zero in some cases when both RTA_SRCIFP and
6644 	 * RTA_IFP are specified. If RTA_SRCIFP is specified and both
6645 	 * RTA_IFP and gateway_addr are NULL/zero, then delete will not
6646 	 * succeed.
6647 	 */
6648 	if (src_ipif != NULL) {
6649 		if (ipif_arg == NULL && gw_addr != 0) {
6650 			ipif_arg = ipif_lookup_interface(gw_addr, dst_addr,
6651 			    q, mp, func, &err);
6652 			if (ipif_arg != NULL)
6653 				ipif_refheld = B_TRUE;
6654 		}
6655 		if (ipif_arg == NULL) {
6656 			err = (err == EINPROGRESS) ? err : ESRCH;
6657 			return (err);
6658 		}
6659 		ipif = ipif_arg;
6660 	} else {
6661 		ipif = ipif_lookup_interface(gw_addr, dst_addr,
6662 		    q, mp, func, &err);
6663 		if (ipif != NULL)
6664 			ipif_refheld = B_TRUE;
6665 		else if (err == EINPROGRESS)
6666 			return (err);
6667 		else
6668 			err = 0;
6669 	}
6670 	if (ipif != NULL) {
6671 		if (ipif_arg != NULL) {
6672 			if (ipif_refheld) {
6673 				ipif_refrele(ipif);
6674 				ipif_refheld = B_FALSE;
6675 			}
6676 			ipif = ipif_arg;
6677 			match_flags |= MATCH_IRE_ILL;
6678 		} else {
6679 			match_flags |= MATCH_IRE_IPIF;
6680 		}
6681 		if (src_ipif != NULL) {
6682 			ire = ire_srcif_table_lookup(dst_addr, IRE_INTERFACE,
6683 			    ipif, src_ipif->ipif_ill, match_flags);
6684 		} else {
6685 			if (ipif->ipif_ire_type == IRE_LOOPBACK) {
6686 				ire = ire_ctable_lookup(dst_addr, 0,
6687 				    IRE_LOOPBACK, ipif, ALL_ZONES, match_flags);
6688 			}
6689 			if (ire == NULL) {
6690 				ire = ire_ftable_lookup(dst_addr, mask, 0,
6691 				    IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0,
6692 				    match_flags);
6693 			}
6694 		}
6695 	}
6696 
6697 	if (ire == NULL) {
6698 		/*
6699 		 * At this point, the gateway address is not one of our own
6700 		 * addresses or a matching interface route was not found.  We
6701 		 * set the IRE type to lookup based on whether
6702 		 * this is a host route, a default route or just a prefix.
6703 		 *
6704 		 * If an ipif_arg was passed in, then the lookup is based on an
6705 		 * interface index so MATCH_IRE_ILL is added to match_flags.
6706 		 * In any case, MATCH_IRE_IPIF is cleared and MATCH_IRE_GW is
6707 		 * set as the route being looked up is not a traditional
6708 		 * interface route.
6709 		 * Since we do not add gateway route with srcipif, we don't
6710 		 * expect to find it either.
6711 		 */
6712 		if (src_ipif != NULL) {
6713 			if (ipif_refheld)
6714 				ipif_refrele(ipif);
6715 			return (ESRCH);
6716 		} else {
6717 			match_flags &= ~MATCH_IRE_IPIF;
6718 			match_flags |= MATCH_IRE_GW;
6719 			if (ipif_arg != NULL)
6720 				match_flags |= MATCH_IRE_ILL;
6721 			if (mask == IP_HOST_MASK)
6722 				type = IRE_HOST;
6723 			else if (mask == 0)
6724 				type = IRE_DEFAULT;
6725 			else
6726 				type = IRE_PREFIX;
6727 			ire = ire_ftable_lookup(dst_addr, mask, gw_addr, type,
6728 			    ipif_arg, NULL, ALL_ZONES, 0, match_flags);
6729 			if (ire == NULL && type == IRE_HOST) {
6730 				ire = ire_ftable_lookup(dst_addr, mask, gw_addr,
6731 				    IRE_HOST_REDIRECT, ipif_arg, NULL,
6732 				    ALL_ZONES, 0, match_flags);
6733 			}
6734 		}
6735 	}
6736 
6737 	if (ipif_refheld)
6738 		ipif_refrele(ipif);
6739 
6740 	/* ipif is not refheld anymore */
6741 	if (ire == NULL)
6742 		return (ESRCH);
6743 
6744 	if (ire->ire_flags & RTF_MULTIRT) {
6745 		/*
6746 		 * Invoke the CGTP (multirouting) filtering module
6747 		 * to remove the dst address from the filtering database.
6748 		 * Packets coming from that address will no longer be
6749 		 * filtered to remove duplicates.
6750 		 */
6751 		if (ip_cgtp_filter_ops != NULL) {
6752 			err = ip_cgtp_filter_ops->cfo_del_dest_v4(ire->ire_addr,
6753 			    ire->ire_gateway_addr);
6754 		}
6755 		ip_cgtp_bcast_delete(ire);
6756 	}
6757 
6758 	ipif = ire->ire_ipif;
6759 	/*
6760 	 * Removing from ipif_saved_ire_mp is not necessary
6761 	 * when src_ipif is non-NULL, as ip_rt_add does not
6762 	 * save ires when src_ipif is non-NULL.
6763 	 */
6764 	if (ipif != NULL && src_ipif == NULL) {
6765 		ipif_remove_ire(ipif, ire);
6766 	}
6767 	if (ioctl_msg)
6768 		ip_rts_rtmsg(RTM_OLDDEL, ire, 0);
6769 	ire_delete(ire);
6770 	ire_refrele(ire);
6771 	return (err);
6772 }
6773 
6774 /*
6775  * ip_siocaddrt is called to complete processing of an SIOCADDRT IOCTL.
6776  */
6777 /* ARGSUSED */
6778 int
6779 ip_siocaddrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
6780     ip_ioctl_cmd_t *ipip, void *dummy_if_req)
6781 {
6782 	ipaddr_t dst_addr;
6783 	ipaddr_t gw_addr;
6784 	ipaddr_t mask;
6785 	int error = 0;
6786 	mblk_t *mp1;
6787 	struct rtentry *rt;
6788 	ipif_t *ipif = NULL;
6789 
6790 	ip1dbg(("ip_siocaddrt:"));
6791 	/* Existence of mp1 verified in ip_wput_nondata */
6792 	mp1 = mp->b_cont->b_cont;
6793 	rt = (struct rtentry *)mp1->b_rptr;
6794 
6795 	dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr;
6796 	gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr;
6797 
6798 	/*
6799 	 * If the RTF_HOST flag is on, this is a request to assign a gateway
6800 	 * to a particular host address.  In this case, we set the netmask to
6801 	 * all ones for the particular destination address.  Otherwise,
6802 	 * determine the netmask to be used based on dst_addr and the interfaces
6803 	 * in use.
6804 	 */
6805 	if (rt->rt_flags & RTF_HOST) {
6806 		mask = IP_HOST_MASK;
6807 	} else {
6808 		/*
6809 		 * Note that ip_subnet_mask returns a zero mask in the case of
6810 		 * default (an all-zeroes address).
6811 		 */
6812 		mask = ip_subnet_mask(dst_addr, &ipif);
6813 	}
6814 
6815 	error = ip_rt_add(dst_addr, mask, gw_addr, 0, rt->rt_flags,
6816 	    NULL, NULL, NULL, B_TRUE, q, mp, ip_process_ioctl);
6817 	if (ipif != NULL)
6818 		ipif_refrele(ipif);
6819 	return (error);
6820 }
6821 
6822 /*
6823  * ip_siocdelrt is called to complete processing of an SIOCDELRT IOCTL.
6824  */
6825 /* ARGSUSED */
6826 int
6827 ip_siocdelrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
6828     ip_ioctl_cmd_t *ipip, void *dummy_if_req)
6829 {
6830 	ipaddr_t dst_addr;
6831 	ipaddr_t gw_addr;
6832 	ipaddr_t mask;
6833 	int error;
6834 	mblk_t *mp1;
6835 	struct rtentry *rt;
6836 	ipif_t *ipif = NULL;
6837 
6838 	ip1dbg(("ip_siocdelrt:"));
6839 	/* Existence of mp1 verified in ip_wput_nondata */
6840 	mp1 = mp->b_cont->b_cont;
6841 	rt = (struct rtentry *)mp1->b_rptr;
6842 
6843 	dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr;
6844 	gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr;
6845 
6846 	/*
6847 	 * If the RTF_HOST flag is on, this is a request to delete a gateway
6848 	 * to a particular host address.  In this case, we set the netmask to
6849 	 * all ones for the particular destination address.  Otherwise,
6850 	 * determine the netmask to be used based on dst_addr and the interfaces
6851 	 * in use.
6852 	 */
6853 	if (rt->rt_flags & RTF_HOST) {
6854 		mask = IP_HOST_MASK;
6855 	} else {
6856 		/*
6857 		 * Note that ip_subnet_mask returns a zero mask in the case of
6858 		 * default (an all-zeroes address).
6859 		 */
6860 		mask = ip_subnet_mask(dst_addr, &ipif);
6861 	}
6862 
6863 	error = ip_rt_delete(dst_addr, mask, gw_addr,
6864 	    RTA_DST | RTA_GATEWAY | RTA_NETMASK, rt->rt_flags, NULL, NULL,
6865 	    B_TRUE, q, mp, ip_process_ioctl);
6866 	if (ipif != NULL)
6867 		ipif_refrele(ipif);
6868 	return (error);
6869 }
6870 
6871 /*
6872  * Enqueue the mp onto the ipsq, chained by b_next.
6873  * b_prev stores the function to be executed later, and b_queue the queue
6874  * where this mp originated.
6875  */
6876 void
6877 ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type,
6878     ill_t *pending_ill)
6879 {
6880 	conn_t	*connp = NULL;
6881 
6882 	ASSERT(MUTEX_HELD(&ipsq->ipsq_lock));
6883 	ASSERT(func != NULL);
6884 
6885 	mp->b_queue = q;
6886 	mp->b_prev = (void *)func;
6887 	mp->b_next = NULL;
6888 
6889 	switch (type) {
6890 	case CUR_OP:
6891 		if (ipsq->ipsq_mptail != NULL) {
6892 			ASSERT(ipsq->ipsq_mphead != NULL);
6893 			ipsq->ipsq_mptail->b_next = mp;
6894 		} else {
6895 			ASSERT(ipsq->ipsq_mphead == NULL);
6896 			ipsq->ipsq_mphead = mp;
6897 		}
6898 		ipsq->ipsq_mptail = mp;
6899 		break;
6900 
6901 	case NEW_OP:
6902 		if (ipsq->ipsq_xopq_mptail != NULL) {
6903 			ASSERT(ipsq->ipsq_xopq_mphead != NULL);
6904 			ipsq->ipsq_xopq_mptail->b_next = mp;
6905 		} else {
6906 			ASSERT(ipsq->ipsq_xopq_mphead == NULL);
6907 			ipsq->ipsq_xopq_mphead = mp;
6908 		}
6909 		ipsq->ipsq_xopq_mptail = mp;
6910 		break;
6911 	default:
6912 		cmn_err(CE_PANIC, "ipsq_enq: unknown type %d\n", type);
6913 	}
6914 
6915 	if (CONN_Q(q) && pending_ill != NULL) {
6916 		connp = Q_TO_CONN(q);
6917 
6918 		ASSERT(MUTEX_HELD(&connp->conn_lock));
6919 		connp->conn_oper_pending_ill = pending_ill;
6920 	}
6921 }
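
/*
 * For reference, the dequeue side (see ipsq_exit() below) recovers the
 * stashed fields before dispatching the mp:
 *
 *	func = (ipsq_func_t)mp->b_prev;
 *	q = (queue_t *)mp->b_queue;
 *	mp->b_prev = NULL;
 *	mp->b_queue = NULL;
 *	(*func)(ipsq, q, mp, NULL);
 */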
6922 
6923 /*
6924  * Return the mp at the head of the ipsq. After emptying the ipsq
6925  * look at the next ioctl if the current ioctl is complete. Otherwise
6926  * return; we will resume when the current ioctl completes.
6927  * The current ioctl will wait till it gets a response from the
6928  * driver below.
6929  */
6930 static mblk_t *
6931 ipsq_dq(ipsq_t *ipsq)
6932 {
6933 	mblk_t	*mp;
6934 
6935 	ASSERT(MUTEX_HELD(&ipsq->ipsq_lock));
6936 
6937 	mp = ipsq->ipsq_mphead;
6938 	if (mp != NULL) {
6939 		ipsq->ipsq_mphead = mp->b_next;
6940 		if (ipsq->ipsq_mphead == NULL)
6941 			ipsq->ipsq_mptail = NULL;
6942 		mp->b_next = NULL;
6943 		return (mp);
6944 	}
6945 	if (ipsq->ipsq_current_ipif != NULL)
6946 		return (NULL);
6947 	mp = ipsq->ipsq_xopq_mphead;
6948 	if (mp != NULL) {
6949 		ipsq->ipsq_xopq_mphead = mp->b_next;
6950 		if (ipsq->ipsq_xopq_mphead == NULL)
6951 			ipsq->ipsq_xopq_mptail = NULL;
6952 		mp->b_next = NULL;
6953 		return (mp);
6954 	}
6955 	return (NULL);
6956 }
6957 
6958 /*
6959  * Enter the ipsq corresponding to ill, by waiting synchronously till
6960  * we can enter the ipsq exclusively. Unless 'force' is used, the ipsq
6961  * will have to drain completely before ipsq_enter returns success.
6962  * ipsq_current_ipif will be set if some exclusive ioctl is in progress,
6963  * and the ipsq_exit logic will start the next enqueued ioctl after
6964  * completion of the current ioctl. If 'force' is used, we don't wait
6965  * for the enqueued ioctls. This is needed when a conn_close wants to
6966  * enter the ipsq and abort an ioctl that is somehow stuck. Unplumb
6967  * of an ill can also use this option. But we don't use it currently.
6968  */
6969 #define	ENTER_SQ_WAIT_TICKS 100
6970 boolean_t
6971 ipsq_enter(ill_t *ill, boolean_t force)
6972 {
6973 	ipsq_t	*ipsq;
6974 	boolean_t waited_enough = B_FALSE;
6975 
6976 	/*
6977 	 * Holding the ill_lock prevents <ill-ipsq> assocs from changing.
6978 	 * Since the <ill-ipsq> assocs could change while we wait for the
6979 	 * writer, it is easier to wait on a fixed global rather than try to
6980 	 * cv_wait on a changing ipsq.
6981 	 */
6982 	mutex_enter(&ill->ill_lock);
6983 	for (;;) {
6984 		if (ill->ill_state_flags & ILL_CONDEMNED) {
6985 			mutex_exit(&ill->ill_lock);
6986 			return (B_FALSE);
6987 		}
6988 
6989 		ipsq = ill->ill_phyint->phyint_ipsq;
6990 		mutex_enter(&ipsq->ipsq_lock);
6991 		if (ipsq->ipsq_writer == NULL &&
6992 		    (ipsq->ipsq_current_ipif == NULL || waited_enough)) {
6993 			break;
6994 		} else if (ipsq->ipsq_writer != NULL) {
6995 			mutex_exit(&ipsq->ipsq_lock);
6996 			cv_wait(&ill->ill_cv, &ill->ill_lock);
6997 		} else {
6998 			mutex_exit(&ipsq->ipsq_lock);
6999 			if (force) {
7000 				(void) cv_timedwait(&ill->ill_cv,
7001 				    &ill->ill_lock,
7002 				    lbolt + ENTER_SQ_WAIT_TICKS);
7003 				waited_enough = B_TRUE;
7004 				continue;
7005 			} else {
7006 				cv_wait(&ill->ill_cv, &ill->ill_lock);
7007 			}
7008 		}
7009 	}
7010 
7011 	ASSERT(ipsq->ipsq_mphead == NULL && ipsq->ipsq_mptail == NULL);
7012 	ASSERT(ipsq->ipsq_reentry_cnt == 0);
7013 	ipsq->ipsq_writer = curthread;
7014 	ipsq->ipsq_reentry_cnt++;
7015 #ifdef ILL_DEBUG
7016 	ipsq->ipsq_depth = getpcstack(ipsq->ipsq_stack, IP_STACK_DEPTH);
7017 #endif
7018 	mutex_exit(&ipsq->ipsq_lock);
7019 	mutex_exit(&ill->ill_lock);
7020 	return (B_TRUE);
7021 }
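
/*
 * A minimal usage sketch (hypothetical caller, not code from this file):
 * a successful ipsq_enter() must be paired with ipsq_exit() on the ipsq
 * that was current when the enter succeeded.
 *
 *	if (!ipsq_enter(ill, B_FALSE))
 *		return (ENXIO);			(ill is condemned)
 *	...perform the exclusive operation as writer...
 *	ipsq_exit(ill->ill_phyint->phyint_ipsq, B_TRUE, B_TRUE);
 */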
7022 
7023 /*
7024  * The ipsq_t (ipsq) is the synchronization data structure used to serialize
7025  * certain critical operations like plumbing (i.e. most set ioctls),
7026  * multicast joins, igmp/mld timers, IPMP operations etc. On a non-IPMP
7027  * system there is 1 ipsq per phyint. On an IPMP system there is 1 ipsq per
7028  * IPMP group. The ipsq serializes exclusive ioctls issued by applications
7029  * on a per ipsq basis in ipsq_xopq_mphead. It also protects against multiple
7030  * threads executing in the ipsq. Responses from the driver pertain to the
7031  * current ioctl (say a DL_BIND_ACK in response to a DL_BIND_REQ initiated
7032  * as part of bringing up the interface) and are enqueued in ipsq_mphead.
7033  *
7034  * If a thread does not want to reenter the ipsq when it is already writer,
7035  * it must make sure that neither the specified reentry point (to be called
7036  * later when the ipsq is empty) nor any code path starting from that reentry
7037  * point ever tries to enter the ipsq again. Otherwise it can lead
7038  * to an infinite loop. The reentry point ip_rput_dlpi_writer is an example.
7039  * When the thread that is currently exclusive finishes, it (ipsq_exit)
7040  * dequeues the requests waiting to become exclusive in ipsq_mphead and calls
7041  * the reentry point. When the list at ipsq_mphead becomes empty ipsq_exit
7042  * proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next
7043  * ioctl if the current ioctl has completed. If the current ioctl is still
7044  * in progress it simply returns. The current ioctl could be waiting for
7045  * a response from another module (arp or the driver), or could be waiting for
7046  * the ipif/ill/ire refcnts to drop to zero. In such a case the ipsq_pending_mp
7047  * and ipsq_pending_ipif are set. ipsq_current_ipif is set throughout the
7048  * execution of the ioctl and ipsq_exit does not start the next ioctl unless
7049  * ipsq_current_ipif is clear which happens only on ioctl completion.
7050  */
7051 
7052 /*
7053  * Try to enter the ipsq exclusively, corresponding to ipif or ill. (only 1 of
7054  * ipif or ill can be specified). The caller ensures ipif or ill is valid by
7055  * ref-holding it if necessary. If the ipsq cannot be entered, the mp is queued
7056  * for later completion.
7057  */
7058 ipsq_t *
7059 ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp,
7060     ipsq_func_t func, int type, boolean_t reentry_ok)
7061 {
7062 	ipsq_t	*ipsq;
7063 
7064 	/* Only 1 of ipif or ill can be specified */
7065 	ASSERT((ipif != NULL) ^ (ill != NULL));
7066 	if (ipif != NULL)
7067 		ill = ipif->ipif_ill;
7068 
7069 	/*
7070 	 * lock ordering ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock
7071 	 * ipsq of an ill can't change when ill_lock is held.
7072 	 */
7073 	GRAB_CONN_LOCK(q);
7074 	mutex_enter(&ill->ill_lock);
7075 	ipsq = ill->ill_phyint->phyint_ipsq;
7076 	mutex_enter(&ipsq->ipsq_lock);
7077 
7078 	/*
7079 	 * 1. Enter the ipsq if we are already writer and reentry is ok.
7080 	 *    (Note: If the caller does not specify reentry_ok then neither
7081 	 *    'func' nor any of its callees must ever attempt to enter the ipsq
7082  *    again. Otherwise it can lead to an infinite loop.)
7083 	 * 2. Enter the ipsq if there is no current writer and this attempted
7084 	 *    entry is part of the current ioctl or operation
7085 	 * 3. Enter the ipsq if there is no current writer and this is a new
7086 	 *    ioctl (or operation) and the ioctl (or operation) queue is
7087 	 *    empty and there is no ioctl (or operation) currently in progress
7088 	 */
7089 	if ((ipsq->ipsq_writer == NULL && ((type == CUR_OP) ||
7090 	    (type == NEW_OP && ipsq->ipsq_xopq_mphead == NULL &&
7091 	    ipsq->ipsq_current_ipif == NULL))) ||
7092 	    (ipsq->ipsq_writer == curthread && reentry_ok)) {
7093 		/* Success. */
7094 		ipsq->ipsq_reentry_cnt++;
7095 		ipsq->ipsq_writer = curthread;
7096 		mutex_exit(&ipsq->ipsq_lock);
7097 		mutex_exit(&ill->ill_lock);
7098 		RELEASE_CONN_LOCK(q);
7099 #ifdef ILL_DEBUG
7100 		ipsq->ipsq_depth = getpcstack(ipsq->ipsq_stack, IP_STACK_DEPTH);
7101 #endif
7102 		return (ipsq);
7103 	}
7104 
7105 	ipsq_enq(ipsq, q, mp, func, type, ill);
7106 
7107 	mutex_exit(&ipsq->ipsq_lock);
7108 	mutex_exit(&ill->ill_lock);
7109 	RELEASE_CONN_LOCK(q);
7110 	return (NULL);
7111 }
7112 
7113 /*
7114  * Try to enter the ipsq exclusively, corresponding to ipif or ill. (only 1 of
7115  * ipif or ill can be specified). The caller ensures ipif or ill is valid by
7116  * ref-holding it if necessary. If the ipsq cannot be entered, the mp is queued
7117  * for later completion.
7118  *
7119  * This function does a refrele on the ipif/ill.
7120  */
7121 void
7122 qwriter_ip(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp,
7123     ipsq_func_t func, int type, boolean_t reentry_ok)
7124 {
7125 	ipsq_t	*ipsq;
7126 
7127 	ipsq = ipsq_try_enter(ipif, ill, q, mp, func, type, reentry_ok);
7128 	/*
7129 	 * Caller must have done a refhold on the ipif. ipif_refrele
7130 	 * happens on the passed ipif. We can do this since we are
7131 	 * already exclusive, or we won't access ipif henceforth. Both
7132 	 * this func and the caller will just return if ipsq_try_enter
7133 	 * fails above. This is needed because func needs to
7134 	 * see the correct refcount. E.g. removeif can work only then.
7135 	 */
7136 	if (ipif != NULL)
7137 		ipif_refrele(ipif);
7138 	else
7139 		ill_refrele(ill);
7140 	if (ipsq != NULL) {
7141 		(*func)(ipsq, q, mp, NULL);
7142 		ipsq_exit(ipsq, B_TRUE, B_TRUE);
7143 	}
7144 }
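
/*
 * A minimal sketch of the qwriter_ip() pattern (ip_sioctl_foo_writer is a
 * hypothetical handler; the refhold below is consumed by qwriter_ip):
 *
 *	ipif_refhold(ipif);
 *	qwriter_ip(ipif, NULL, q, mp, ip_sioctl_foo_writer, NEW_OP, B_FALSE);
 *
 * Control returns immediately; ip_sioctl_foo_writer runs either inline (if
 * the ipsq could be entered) or later when the ipsq drains.
 */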
7145 
7146 /*
7147  * If there are more than ILL_GRP_CNT ills in a group,
7148  * we use kmem alloc'd buffers, else use the stack
7149  */
7150 #define	ILL_GRP_CNT	14
7151 /*
7152  * Drain the ipsq, if there are messages on it, and then leave the ipsq.
7153  * Called by a thread that is currently exclusive on this ipsq.
7154  */
7155 void
7156 ipsq_exit(ipsq_t *ipsq, boolean_t start_igmp_timer, boolean_t start_mld_timer)
7157 {
7158 	queue_t	*q;
7159 	mblk_t	*mp;
7160 	ipsq_func_t	func;
7161 	int	next;
7162 	ill_t	**ill_list = NULL;
7163 	size_t	ill_list_size = 0;
7164 	int	cnt = 0;
7165 	boolean_t need_ipsq_free = B_FALSE;
7166 
7167 	ASSERT(IAM_WRITER_IPSQ(ipsq));
7168 	mutex_enter(&ipsq->ipsq_lock);
7169 	ASSERT(ipsq->ipsq_reentry_cnt >= 1);
7170 	if (ipsq->ipsq_reentry_cnt != 1) {
7171 		ipsq->ipsq_reentry_cnt--;
7172 		mutex_exit(&ipsq->ipsq_lock);
7173 		return;
7174 	}
7175 
7176 	mp = ipsq_dq(ipsq);
7177 	while (mp != NULL) {
7178 again:
7179 		mutex_exit(&ipsq->ipsq_lock);
7180 		func = (ipsq_func_t)mp->b_prev;
7181 		q = (queue_t *)mp->b_queue;
7182 		mp->b_prev = NULL;
7183 		mp->b_queue = NULL;
7184 
7185 		/*
7186 		 * If 'q' is a conn queue, it is valid, since we did
7187 		 * a refhold on the connp at the start of the ioctl.
7188 		 * If 'q' is an ill queue, it is valid, since close of an
7189 		 * ill will clean up the 'ipsq'.
7190 		 */
7191 		(*func)(ipsq, q, mp, NULL);
7192 
7193 		mutex_enter(&ipsq->ipsq_lock);
7194 		mp = ipsq_dq(ipsq);
7195 	}
7196 
7197 	mutex_exit(&ipsq->ipsq_lock);
7198 
7199 	/*
7200 	 * Need to grab the locks in the right order. Need to
7201 	 * atomically check (under ipsq_lock) that there are no
7202 	 * messages before relinquishing the ipsq. Also need to
7203 	 * atomically wakeup waiters on ill_cv while holding ill_lock.
7204 	 * Holding ill_g_lock ensures that ipsq list of ills is stable.
7205 	 * If we need to call ill_split_ipsq and change <ill-ipsq> we need
7206 	 * to grab ill_g_lock as writer.
7207 	 */
7208 	rw_enter(&ill_g_lock, ipsq->ipsq_split ? RW_WRITER : RW_READER);
7209 
7210 	/* ipsq_refs can't change while ill_g_lock is held as reader */
7211 	if (ipsq->ipsq_refs != 0) {
7212 		/* At most 2 ills v4/v6 per phyint */
7213 		cnt = ipsq->ipsq_refs << 1;
7214 		ill_list_size = cnt * sizeof (ill_t *);
7215 		/*
7216 		 * If memory allocation fails, we will do the split
7217 		 * the next time ipsq_exit is called for whatever reason.
7218 		 * As long as the ipsq_split flag is set the need to
7219 		 * split is remembered.
7220 		 */
7221 		ill_list = kmem_zalloc(ill_list_size, KM_NOSLEEP);
7222 		if (ill_list != NULL)
7223 			cnt = ill_lock_ipsq_ills(ipsq, ill_list, cnt);
7224 	}
7225 	mutex_enter(&ipsq->ipsq_lock);
7226 	mp = ipsq_dq(ipsq);
7227 	if (mp != NULL) {
7228 		/* oops, some message has landed up, we can't get out */
7229 		if (ill_list != NULL)
7230 			ill_unlock_ills(ill_list, cnt);
7231 		rw_exit(&ill_g_lock);
7232 		if (ill_list != NULL)
7233 			kmem_free(ill_list, ill_list_size);
7234 		ill_list = NULL;
7235 		ill_list_size = 0;
7236 		cnt = 0;
7237 		goto again;
7238 	}
7239 
7240 	/*
7241 	 * Split only if no ioctl is pending and if memory alloc succeeded
7242 	 * above.
7243 	 */
7244 	if (ipsq->ipsq_split && ipsq->ipsq_current_ipif == NULL &&
7245 	    ill_list != NULL) {
7246 		/*
7247 		 * No new ill can join this ipsq since we are holding the
7248 		 * ill_g_lock. Hence ill_split_ipsq can safely traverse the
7249 		 * ipsq. ill_split_ipsq may fail due to memory shortage.
7250 		 * If so we will retry on the next ipsq_exit.
7251 		 */
7252 		ipsq->ipsq_split = ill_split_ipsq(ipsq);
7253 	}
7254 
7255 	/*
7256 	 * We are holding the ipsq lock, hence no new messages can
7257 	 * land up on the ipsq, and there are no messages currently.
7258 	 * Now safe to get out. Wake up waiters and relinquish ipsq
7259 	 * atomically while holding ill locks.
7260 	 */
7261 	ipsq->ipsq_writer = NULL;
7262 	ipsq->ipsq_reentry_cnt--;
7263 	ASSERT(ipsq->ipsq_reentry_cnt == 0);
7264 #ifdef ILL_DEBUG
7265 	ipsq->ipsq_depth = 0;
7266 #endif
7267 	mutex_exit(&ipsq->ipsq_lock);
7268 	/*
7269 	 * For IPMP this should wake up all ills in this ipsq.
7270 	 * We need to hold the ill_lock while waking up waiters to
7271 	 * avoid missed wakeups. But there is no need to acquire all
7272 	 * the ill locks and then wakeup. If we have not acquired all
7273 	 * the locks (due to memory failure above) ill_signal_ipsq_ills
7274 	 * wakes up ills one at a time after getting the right ill_lock
7275 	 */
7276 	ill_signal_ipsq_ills(ipsq, ill_list != NULL);
7277 	if (ill_list != NULL)
7278 		ill_unlock_ills(ill_list, cnt);
7279 	if (ipsq->ipsq_refs == 0)
7280 		need_ipsq_free = B_TRUE;
7281 	rw_exit(&ill_g_lock);
7282 	if (ill_list != NULL)
7283 		kmem_free(ill_list, ill_list_size);
7284 
7285 	if (need_ipsq_free) {
7286 		/*
7287 		 * Free the ipsq. ipsq_refs can't increase because ipsq can't be
7288 		 * looked up. ipsq can be looked up only thru ill or phyint
7289 		 * and there are no ills/phyint on this ipsq.
7290 		 */
7291 		ipsq_delete(ipsq);
7292 	}
7293 	/*
7294 	 * Now start any igmp or mld timers that could not be started
7295 	 * while inside the ipsq. The timers can't be started while inside
7296 	 * the ipsq, since igmp_start_timers may need to call untimeout()
7297 	 * which can't be done while holding a lock i.e. the ipsq. Otherwise
7298 	 * there could be a deadlock since the timeout handlers
7299 	 * mld_timeout_handler / igmp_timeout_handler also synchronously
7300 	 * wait in ipsq_enter() trying to get the ipsq.
7301 	 *
7302 	 * However there is one exception to the above. If this thread is
7303 	 * itself the igmp/mld timeout handler thread, then we don't want
7304 	 * to start any new timer until the current handler is done. The
7305 	 * handler thread passes in B_FALSE for start_igmp/mld_timers, while
7306 	 * all others pass B_TRUE.
7307 	 */
7308 	if (start_igmp_timer) {
7309 		mutex_enter(&igmp_timer_lock);
7310 		next = igmp_deferred_next;
7311 		igmp_deferred_next = INFINITY;
7312 		mutex_exit(&igmp_timer_lock);
7313 
7314 		if (next != INFINITY)
7315 			igmp_start_timers(next);
7316 	}
7317 
7318 	if (start_mld_timer) {
7319 		mutex_enter(&mld_timer_lock);
7320 		next = mld_deferred_next;
7321 		mld_deferred_next = INFINITY;
7322 		mutex_exit(&mld_timer_lock);
7323 
7324 		if (next != INFINITY)
7325 			mld_start_timers(next);
7326 	}
7327 }
7328 
7329 /*
7330  * The ill is closing. Flush all messages on the ipsq that originated
7331  * from this ill. Usually there won't be any messages on the ipsq_xopq_mphead
7332  * for this ill since ipsq_enter could not have entered until then.
7333  * New messages can't be queued since the CONDEMNED flag is set.
7334  */
7335 static void
7336 ipsq_flush(ill_t *ill)
7337 {
7338 	queue_t	*q;
7339 	mblk_t	*prev;
7340 	mblk_t	*mp;
7341 	mblk_t	*mp_next;
7342 	ipsq_t	*ipsq;
7343 
7344 	ASSERT(IAM_WRITER_ILL(ill));
7345 	ipsq = ill->ill_phyint->phyint_ipsq;
7346 	/*
7347 	 * Flush any messages sent up by the driver.
7348 	 */
7349 	mutex_enter(&ipsq->ipsq_lock);
7350 	for (prev = NULL, mp = ipsq->ipsq_mphead; mp != NULL; mp = mp_next) {
7351 		mp_next = mp->b_next;
7352 		q = mp->b_queue;
7353 		if (q == ill->ill_rq || q == ill->ill_wq) {
7354 			/* Remove the mp from the ipsq */
7355 			if (prev == NULL)
7356 				ipsq->ipsq_mphead = mp->b_next;
7357 			else
7358 				prev->b_next = mp->b_next;
7359 			if (ipsq->ipsq_mptail == mp) {
7360 				ASSERT(mp_next == NULL);
7361 				ipsq->ipsq_mptail = prev;
7362 			}
7363 			inet_freemsg(mp);
7364 		} else {
7365 			prev = mp;
7366 		}
7367 	}
7368 	mutex_exit(&ipsq->ipsq_lock);
7369 	(void) ipsq_pending_mp_cleanup(ill, NULL);
7370 	ipsq_xopq_mp_cleanup(ill, NULL);
7371 	ill_pending_mp_cleanup(ill);
7372 }
7373 
7374 /*
7375  * Clean up one squeue element. ill_inuse_ref is protected by ill_lock.
7376  * The real cleanup happens behind the squeue via the ip_squeue_clean function,
7377  * but we need to protect ourselves from 2 threads trying to clean up at the
7378  * same time (possible with one port going down for aggr and someone tearing
7379  * down the entire aggr simultaneously). So we use ill_inuse_ref protected by
7380  * ill_lock to indicate when the cleanup has started (1 ref) and when the
7381  * cleanup is done (0 ref). When a new ring gets assigned to a squeue, we
7382  * start by putting 2 refs on ill_inuse_ref.
7383  */
7384 static void
7385 ipsq_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
7386 {
7387 	conn_t *connp;
7388 	squeue_t *sqp;
7389 	mblk_t *mp;
7390 
7391 	ASSERT(rx_ring != NULL);
7392 
7393 	/* Just clean one squeue */
7394 	mutex_enter(&ill->ill_lock);
7395 	/*
7396 	 * Reset the ILL_SOFT_RING_ASSIGN bit so that
7397 	 * ip_squeue_soft_ring_affinity() will not go
7398 	 * ahead with assigning rings.
7399 	 */
7400 	ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
7401 	while (rx_ring->rr_ring_state == ILL_RING_INPROC)
7402 		/* Some operations pending on the ring. Wait */
7403 		cv_wait(&ill->ill_cv, &ill->ill_lock);
7404 
7405 	if (rx_ring->rr_ring_state != ILL_RING_INUSE) {
7406 		/*
7407 		 * Someone is already trying to clean
7408 		 * this squeue, or it has already been cleaned.
7409 		 */
7410 		mutex_exit(&ill->ill_lock);
7411 		return;
7412 	}
7413 	sqp = rx_ring->rr_sqp;
7414 
7415 	if (sqp == NULL) {
7416 		/*
7417 		 * The rx_ring never had a squeue assigned to it.
7418 		 * We are under ill_lock so we can clean it up
7419 		 * here itself since no one can get to it.
7420 		 */
7421 		rx_ring->rr_blank = NULL;
7422 		rx_ring->rr_handle = NULL;
7423 		rx_ring->rr_sqp = NULL;
7424 		rx_ring->rr_ring_state = ILL_RING_FREE;
7425 		mutex_exit(&ill->ill_lock);
7426 		return;
7427 	}
7428 
7429 	/* Set the state to indicate that it's being cleaned */
7430 	rx_ring->rr_ring_state = ILL_RING_BEING_FREED;
7431 	ASSERT(sqp != NULL);
7432 	mutex_exit(&ill->ill_lock);
7433 
7434 	/*
7435 	 * Use the preallocated ill_unbind_conn for this purpose
7436 	 */
7437 	connp = ill->ill_dls_capab->ill_unbind_conn;
7438 	mp = &connp->conn_tcp->tcp_closemp;
7439 	CONN_INC_REF(connp);
7440 	squeue_enter(sqp, mp, ip_squeue_clean, connp, NULL);
7441 
7442 	mutex_enter(&ill->ill_lock);
7443 	while (rx_ring->rr_ring_state != ILL_RING_FREE)
7444 		cv_wait(&ill->ill_cv, &ill->ill_lock);
7445 
7446 	mutex_exit(&ill->ill_lock);
7447 }
7448 
7449 static void
7450 ipsq_clean_all(ill_t *ill)
7451 {
7452 	int idx;
7453 
7454 	/*
7455 	 * No need to clean if the ill has neither polling nor soft ring capability
7456 	 */
7457 	if (!(ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING)))
7458 		return;
7459 
7460 	for (idx = 0; idx < ILL_MAX_RINGS; idx++) {
7461 		ill_rx_ring_t *ipr = &ill->ill_dls_capab->ill_ring_tbl[idx];
7462 		ipsq_clean_ring(ill, ipr);
7463 	}
7464 
7465 	ill->ill_capabilities &= ~(ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING);
7466 }
7467 
7468 /* ARGSUSED */
7469 int
7470 ip_sioctl_slifoindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
7471     ip_ioctl_cmd_t *ipip, void *ifreq)
7472 {
7473 	ill_t	*ill;
7474 	struct lifreq	*lifr = (struct lifreq *)ifreq;
7475 	boolean_t isv6;
7476 	conn_t	*connp;
7477 
7478 	connp = Q_TO_CONN(q);
7479 	isv6 = connp->conn_af_isv6;
7480 	/*
7481 	 * Set original index.
7482 	 * Failover and failback move logical interfaces
7483 	 * from one physical interface to another.  The
7484 	 * original index indicates the parent of a logical
7485 	 * interface, in other words, the physical interface
7486 	 * the logical interface will be moved back to on
7487 	 * failback.
7488 	 */
7489 
7490 	/*
7491 	 * Don't allow the original index to be changed
7492 	 * for non-failover addresses, autoconfigured
7493 	 * addresses, or IPv6 link local addresses.
7494 	 */
7495 	if (((ipif->ipif_flags & (IPIF_NOFAILOVER | IPIF_ADDRCONF)) != 0) ||
7496 	    (isv6 && IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr))) {
7497 		return (EINVAL);
7498 	}
7499 	/*
7500 	 * The new original index must be in use by some
7501 	 * physical interface.
7502 	 */
7503 	ill = ill_lookup_on_ifindex(lifr->lifr_index, isv6, NULL, NULL,
7504 	    NULL, NULL);
7505 	if (ill == NULL)
7506 		return (ENXIO);
7507 	ill_refrele(ill);
7508 
7509 	ipif->ipif_orig_ifindex = lifr->lifr_index;
7510 	/*
7511 	 * When this ipif gets failed back, don't
7512 	 * preserve the original id, as it is no
7513 	 * longer applicable.
7514 	 */
7515 	ipif->ipif_orig_ipifid = 0;
7516 	/*
7517 	 * For IPv4, change the original index of any
7518 	 * multicast addresses associated with the
7519 	 * ipif to the new value.
7520 	 */
7521 	if (!isv6) {
7522 		ilm_t *ilm;
7523 
7524 		mutex_enter(&ipif->ipif_ill->ill_lock);
7525 		for (ilm = ipif->ipif_ill->ill_ilm; ilm != NULL;
7526 		    ilm = ilm->ilm_next) {
7527 			if (ilm->ilm_ipif == ipif) {
7528 				ilm->ilm_orig_ifindex = lifr->lifr_index;
7529 			}
7530 		}
7531 		mutex_exit(&ipif->ipif_ill->ill_lock);
7532 	}
7533 	return (0);
7534 }
7535 
7536 /* ARGSUSED */
7537 int
7538 ip_sioctl_get_oindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
7539     ip_ioctl_cmd_t *ipip, void *ifreq)
7540 {
7541 	struct lifreq *lifr = (struct lifreq *)ifreq;
7542 
7543 	/*
7544 	 * Get the original interface index i.e the one
7545 	 * before FAILOVER if it ever happened.
7546 	 */
7547 	lifr->lifr_index = ipif->ipif_orig_ifindex;
7548 	return (0);
7549 }
7550 
7551 /*
7552  * Parse an iftun_req structure coming down SIOC[GS]TUNPARAM ioctls,
7553  * refhold and return the associated ipif
7554  */
7555 int
7556 ip_extract_tunreq(queue_t *q, mblk_t *mp, ipif_t **ipifp, ipsq_func_t func)
7557 {
7558 	boolean_t exists;
7559 	struct iftun_req *ta;
7560 	ipif_t	*ipif;
7561 	ill_t	*ill;
7562 	boolean_t isv6;
7563 	mblk_t	*mp1;
7564 	int	error;
7565 	conn_t	*connp;
7566 
7567 	/* Existence verified in ip_wput_nondata */
7568 	mp1 = mp->b_cont->b_cont;
7569 	ta = (struct iftun_req *)mp1->b_rptr;
7570 	/*
7571 	 * Null terminate the string to protect against buffer
7572 	 * overrun. String was generated by user code and may not
7573 	 * be trusted.
7574 	 */
7575 	ta->ifta_lifr_name[LIFNAMSIZ - 1] = '\0';
7576 
7577 	connp = Q_TO_CONN(q);
7578 	isv6 = connp->conn_af_isv6;
7579 
7580 	/* Disallows implicit create */
7581 	ipif = ipif_lookup_on_name(ta->ifta_lifr_name,
7582 	    mi_strlen(ta->ifta_lifr_name), B_FALSE, &exists, isv6,
7583 	    connp->conn_zoneid, CONNP_TO_WQ(connp), mp, func, &error);
7584 	if (ipif == NULL)
7585 		return (error);
7586 
7587 	if (ipif->ipif_id != 0) {
7588 		/*
7589 		 * We really don't want to set/get tunnel parameters
7590 		 * on virtual tunnel interfaces.  Only allow the
7591 		 * base tunnel to do these.
7592 		 */
7593 		ipif_refrele(ipif);
7594 		return (EINVAL);
7595 	}
7596 
7597 	/*
7598 	 * Send down to tunnel mod for ioctl processing.
7599 	 * Will finish ioctl in ip_rput_other().
7600 	 */
7601 	ill = ipif->ipif_ill;
7602 	if (ill->ill_net_type == IRE_LOOPBACK) {
7603 		ipif_refrele(ipif);
7604 		return (EOPNOTSUPP);
7605 	}
7606 
7607 	if (ill->ill_wq == NULL) {
7608 		ipif_refrele(ipif);
7609 		return (ENXIO);
7610 	}
7611 	/*
7612 	 * Mark the ioctl as coming from an IPv6 interface for
7613 	 * tun's convenience.
7614 	 */
7615 	if (ill->ill_isv6)
7616 		ta->ifta_flags |= 0x80000000;
7617 	*ipifp = ipif;
7618 	return (0);
7619 }
7620 
7621 /*
7622  * Parse an ifreq or lifreq struct coming down ioctls and refhold
7623  * and return the associated ipif.
7624  * Return value:
7625  *	Nonzero: An error has occurred. ci may not be filled out.
7626  *	Zero: ci is filled out with the ioctl cmd in ci.ci_name, and
7627  *	a held ipif in ci.ci_ipif.
7628  */
7629 int
7630 ip_extract_lifreq_cmn(queue_t *q, mblk_t *mp, int cmd_type, int flags,
7631     cmd_info_t *ci, ipsq_func_t func)
7632 {
7633 	sin_t		*sin;
7634 	sin6_t		*sin6;
7635 	char		*name;
7636 	struct ifreq    *ifr;
7637 	struct lifreq    *lifr;
7638 	ipif_t		*ipif = NULL;
7639 	ill_t		*ill;
7640 	conn_t		*connp;
7641 	boolean_t	isv6;
7642 	struct iocblk   *iocp = (struct iocblk *)mp->b_rptr;
7643 	boolean_t	exists;
7644 	int		err;
7645 	mblk_t		*mp1;
7646 	zoneid_t	zoneid;
7647 
7648 	if (q->q_next != NULL) {
7649 		ill = (ill_t *)q->q_ptr;
7650 		isv6 = ill->ill_isv6;
7651 		connp = NULL;
7652 		zoneid = ALL_ZONES;
7653 	} else {
7654 		ill = NULL;
7655 		connp = Q_TO_CONN(q);
7656 		isv6 = connp->conn_af_isv6;
7657 		zoneid = connp->conn_zoneid;
7658 		if (zoneid == GLOBAL_ZONEID) {
7659 			/* global zone can access ipifs in all zones */
7660 			zoneid = ALL_ZONES;
7661 		}
7662 	}
7663 
7664 	/* Has been checked in ip_wput_nondata */
7665 	mp1 = mp->b_cont->b_cont;
7666 
7668 	if (cmd_type == IF_CMD) {
7669 		/* This is an old style SIOC[GS]IF* command */
7670 		ifr = (struct ifreq *)mp1->b_rptr;
7671 		/*
7672 		 * Null terminate the string to protect against buffer
7673 		 * overrun. String was generated by user code and may not
7674 		 * be trusted.
7675 		 */
7676 		ifr->ifr_name[IFNAMSIZ - 1] = '\0';
7677 		sin = (sin_t *)&ifr->ifr_addr;
7678 		name = ifr->ifr_name;
7679 		ci->ci_sin = sin;
7680 		ci->ci_sin6 = NULL;
7681 		ci->ci_lifr = (struct lifreq *)ifr;
7682 	} else {
7683 		/* This is a new style SIOC[GS]LIF* command */
7684 		ASSERT(cmd_type == LIF_CMD);
7685 		lifr = (struct lifreq *)mp1->b_rptr;
7686 		/*
7687 		 * Null terminate the string to protect against buffer
7688 		 * overrun. String was generated by user code and may not
7689 		 * be trusted.
7690 		 */
7691 		lifr->lifr_name[LIFNAMSIZ - 1] = '\0';
7692 		name = lifr->lifr_name;
7693 		sin = (sin_t *)&lifr->lifr_addr;
7694 		sin6 = (sin6_t *)&lifr->lifr_addr;
7695 		if (iocp->ioc_cmd == SIOCSLIFGROUPNAME) {
7696 			(void) strncpy(ci->ci_groupname, lifr->lifr_groupname,
7697 			    LIFNAMSIZ);
7698 		}
7699 		ci->ci_sin = sin;
7700 		ci->ci_sin6 = sin6;
7701 		ci->ci_lifr = lifr;
7702 	}
7703 
7705 	if (iocp->ioc_cmd == SIOCSLIFNAME) {
7706 		/*
7707 		 * The ioctl will fail if it comes down
7708 		 * a conn stream.
7709 		 */
7710 		if (ill == NULL) {
7711 			/*
7712 			 * Not an ill queue; fail with ENXIO.
7714 			 */
7715 			return (ENXIO);
7716 		}
7717 		ipif = ill->ill_ipif;
7718 		ipif_refhold(ipif);
7719 	} else {
7720 		ipif = ipif_lookup_on_name(name, mi_strlen(name), B_FALSE,
7721 		    &exists, isv6, zoneid,
7722 		    (connp == NULL) ? q : CONNP_TO_WQ(connp), mp, func, &err);
7723 		if (ipif == NULL) {
7724 			if (err == EINPROGRESS)
7725 				return (err);
7726 			if (iocp->ioc_cmd == SIOCLIFFAILOVER ||
7727 			    iocp->ioc_cmd == SIOCLIFFAILBACK) {
7728 				/*
7729 				 * Need to try both v4 and v6 since this
7730 				 * ioctl can come down either v4 or v6
7731 				 * socket. The lifreq.lifr_family passed
7732 				 * down by this ioctl is AF_UNSPEC.
7733 				 */
7734 				ipif = ipif_lookup_on_name(name,
7735 				    mi_strlen(name), B_FALSE, &exists, !isv6,
7736 				    zoneid, (connp == NULL) ? q :
7737 				    CONNP_TO_WQ(connp), mp, func, &err);
7738 				if (err == EINPROGRESS)
7739 					return (err);
7740 			}
7741 			err = 0;	/* Ensure we don't use it below */
7742 		}
7743 	}
7744 
7745 	/*
7746 	 * Old style [GS]IFCMD does not admit IPv6 ipif
7747 	 */
7748 	if (ipif != NULL && ipif->ipif_isv6 && cmd_type == IF_CMD) {
7749 		ipif_refrele(ipif);
7750 		return (ENXIO);
7751 	}
7752 
7753 	if (ipif == NULL && ill != NULL && ill->ill_ipif != NULL &&
7754 	    name[0] == '\0') {
7755 		/*
7756 		 * Handle a SIOC?IF* with a null name
7757 		 * during plumb (on the ill queue before the I_PLINK).
7758 		 */
7759 		ipif = ill->ill_ipif;
7760 		ipif_refhold(ipif);
7761 	}
7762 
7763 	if (ipif == NULL)
7764 		return (ENXIO);
7765 
7766 	/*
7767 	 * Allow only GET operations if this ipif has been created
7768 	 * temporarily due to a MOVE operation.
7769 	 */
7770 	if (ipif->ipif_replace_zero && !(flags & IPI_REPL)) {
7771 		ipif_refrele(ipif);
7772 		return (EINVAL);
7773 	}
7774 
7775 	ci->ci_ipif = ipif;
7776 	return (0);
7777 }
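
/*
 * A minimal caller sketch (hypothetical; ipi_flags is assumed to carry
 * the IPI_* flags for the command): on success the held ipif in
 * ci.ci_ipif must be released by the caller.
 *
 *	cmd_info_t ci;
 *	int err;
 *
 *	err = ip_extract_lifreq_cmn(q, mp, LIF_CMD, ipip->ipi_flags,
 *	    &ci, ip_process_ioctl);
 *	if (err != 0)
 *		return;		(EINPROGRESS means we will be re-invoked)
 *	...operate on ci.ci_ipif and ci.ci_sin/ci.ci_sin6...
 *	ipif_refrele(ci.ci_ipif);
 */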
7778 
7779 /*
7780  * Return the total number of IPv4 ipifs in the given zone.
7781  */
7782 static uint_t
7783 ip_get_numifs(zoneid_t zoneid)
7784 {
7785 	uint_t numifs = 0;
7786 	ill_t	*ill;
7787 	ill_walk_context_t	ctx;
7788 	ipif_t	*ipif;
7789 
7790 	rw_enter(&ill_g_lock, RW_READER);
7791 	ill = ILL_START_WALK_V4(&ctx);
7792 
7793 	while (ill != NULL) {
7794 		for (ipif = ill->ill_ipif; ipif != NULL;
7795 		    ipif = ipif->ipif_next) {
7796 			if (ipif->ipif_zoneid == zoneid)
7797 				numifs++;
7798 		}
7799 		ill = ill_next(&ctx, ill);
7800 	}
7801 	rw_exit(&ill_g_lock);
7802 	return (numifs);
7803 }
7804 
7805 /*
7806  * Return the number of ipifs matching the given family, flags and zone.
7807  */
7808 static uint_t
7809 ip_get_numlifs(int family, int lifn_flags, zoneid_t zoneid)
7810 {
7811 	uint_t numifs = 0;
7812 	ill_t	*ill;
7813 	ipif_t	*ipif;
7814 	ill_walk_context_t	ctx;
7815 
7816 	ip1dbg(("ip_get_numlifs(%d %u %d)\n", family, lifn_flags, (int)zoneid));
7817 
7818 	rw_enter(&ill_g_lock, RW_READER);
7819 	if (family == AF_INET)
7820 		ill = ILL_START_WALK_V4(&ctx);
7821 	else if (family == AF_INET6)
7822 		ill = ILL_START_WALK_V6(&ctx);
7823 	else
7824 		ill = ILL_START_WALK_ALL(&ctx);
7825 
7826 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
7827 		for (ipif = ill->ill_ipif; ipif != NULL;
7828 		    ipif = ipif->ipif_next) {
7829 			if ((ipif->ipif_flags & IPIF_NOXMIT) &&
7830 			    !(lifn_flags & LIFC_NOXMIT))
7831 				continue;
7832 			if ((ipif->ipif_flags & IPIF_TEMPORARY) &&
7833 			    !(lifn_flags & LIFC_TEMPORARY))
7834 				continue;
7835 			if (((ipif->ipif_flags &
7836 			    (IPIF_NOXMIT|IPIF_NOLOCAL|
7837 			    IPIF_DEPRECATED)) ||
7838 			    (ill->ill_phyint->phyint_flags &
7839 			    PHYI_LOOPBACK) ||
7840 			    !(ipif->ipif_flags & IPIF_UP)) &&
7841 			    (lifn_flags & LIFC_EXTERNAL_SOURCE))
7842 				continue;
7843 
7844 			if (zoneid != ipif->ipif_zoneid &&
7845 			    (zoneid != GLOBAL_ZONEID ||
7846 			    !(lifn_flags & LIFC_ALLZONES)))
7847 				continue;
7848 
7849 			numifs++;
7850 		}
7851 	}
7852 	rw_exit(&ill_g_lock);
7853 	return (numifs);
7854 }
7855 
7856 uint_t
7857 ip_get_lifsrcofnum(ill_t *ill)
7858 {
7859 	uint_t numifs = 0;
7860 	ill_t	*ill_head = ill;
7861 
7862 	/*
7863 	 * ill_g_usesrc_lock protects ill_usesrc_grp_next, for example, some
7864 	 * other thread may be trying to relink the ILLs in this usesrc group
7865 	 * and adjusting the ill_usesrc_grp_next pointers
7866 	 */
7867 	rw_enter(&ill_g_usesrc_lock, RW_READER);
7868 	if ((ill->ill_usesrc_ifindex == 0) &&
7869 	    (ill->ill_usesrc_grp_next != NULL)) {
7870 		for (; (ill != NULL) && (ill->ill_usesrc_grp_next != ill_head);
7871 		    ill = ill->ill_usesrc_grp_next)
7872 			numifs++;
7873 	}
7874 	rw_exit(&ill_g_usesrc_lock);
7875 
7876 	return (numifs);
7877 }
7878 
7879 /* Null values are passed in for ipif, sin, and ifreq */
7880 /* ARGSUSED */
7881 int
7882 ip_sioctl_get_ifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
7883     mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
7884 {
7885 	int *nump;
7886 
7887 	ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */
7888 
7889 	/* Existence of b_cont->b_cont checked in ip_wput_nondata */
7890 	nump = (int *)mp->b_cont->b_cont->b_rptr;
7891 
7892 	*nump = ip_get_numifs(Q_TO_CONN(q)->conn_zoneid);
7893 	ip1dbg(("ip_sioctl_get_ifnum numifs %d", *nump));
7894 	return (0);
7895 }
7896 
7897 /* Null values are passed in for ipif, sin, and ifreq */
7898 /* ARGSUSED */
7899 int
7900 ip_sioctl_get_lifnum(ipif_t *dummy_ipif, sin_t *dummy_sin,
7901     queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
7902 {
7903 	struct lifnum *lifn;
7904 	mblk_t	*mp1;
7905 
7906 	ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */
7907 
7908 	/* Existence checked in ip_wput_nondata */
7909 	mp1 = mp->b_cont->b_cont;
7910 
7911 	lifn = (struct lifnum *)mp1->b_rptr;
7912 	switch (lifn->lifn_family) {
7913 	case AF_UNSPEC:
7914 	case AF_INET:
7915 	case AF_INET6:
7916 		break;
7917 	default:
7918 		return (EAFNOSUPPORT);
7919 	}
7920 
7921 	lifn->lifn_count = ip_get_numlifs(lifn->lifn_family, lifn->lifn_flags,
7922 	    Q_TO_CONN(q)->conn_zoneid);
7923 	ip1dbg(("ip_sioctl_get_lifnum numifs %d", lifn->lifn_count));
7924 	return (0);
7925 }
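
/*
 * Userland counterpart (illustrative sketch, not code from this file):
 * SIOCGLIFNUM is typically used to size the buffer for a subsequent
 * SIOCGLIFCONF.
 *
 *	struct lifnum lifn;
 *
 *	lifn.lifn_family = AF_UNSPEC;
 *	lifn.lifn_flags = 0;
 *	if (ioctl(s, SIOCGLIFNUM, &lifn) < 0)
 *		...error...
 *	buf = malloc(lifn.lifn_count * sizeof (struct lifreq));
 */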
7926 
7927 /* ARGSUSED */
7928 int
7929 ip_sioctl_get_ifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
7930     mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
7931 {
7932 	STRUCT_HANDLE(ifconf, ifc);
7933 	mblk_t *mp1;
7934 	struct iocblk *iocp;
7935 	struct ifreq *ifr;
7936 	ill_walk_context_t	ctx;
7937 	ill_t	*ill;
7938 	ipif_t	*ipif;
7939 	struct sockaddr_in *sin;
7940 	int32_t	ifclen;
7941 	zoneid_t zoneid;
7942 
7943 	ASSERT(q->q_next == NULL); /* not valid ioctls for ip as a module */
7944 
7945 	ip1dbg(("ip_sioctl_get_ifconf"));
7946 	/* Existence verified in ip_wput_nondata */
7947 	mp1 = mp->b_cont->b_cont;
7948 	iocp = (struct iocblk *)mp->b_rptr;
7949 	zoneid = Q_TO_CONN(q)->conn_zoneid;
7950 
7951 	/*
7952 	 * The original SIOCGIFCONF passed in a struct ifconf which specified
7953 	 * the user buffer address and length into which the list of struct
7954 	 * ifreqs was to be copied.  Since AT&T Streams does not seem to
7955 	 * allow M_COPYOUT to be used in conjunction with I_STR IOCTLS,
7956 	 * the SIOCGIFCONF operation was redefined to simply provide
7957 	 * a large output buffer into which we are supposed to jam the ifreq
7958 	 * array.  The same ioctl command code was used, despite the fact that
7959 	 * both the applications and the kernel code had to change, thus making
7960 	 * it impossible to support both interfaces.
7961 	 *
7962 	 * For reasons not good enough to try to explain, the following
7963 	 * algorithm is used for deciding what to do with one of these:
7964 	 * If the IOCTL comes in as an I_STR, it is assumed to be of the new
7965 	 * form with the output buffer coming down as the continuation message.
7966 	 * If it arrives as a TRANSPARENT IOCTL, it is assumed to be old style,
7967 	 * and we have to copy in the ifconf structure to find out how big the
7968 	 * output buffer is and where to copy out to.  Sure no problem...
7969 	 *
7970 	 */
7971 	STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, NULL);
7972 	if ((mp1->b_wptr - mp1->b_rptr) == STRUCT_SIZE(ifc)) {
7973 		int numifs = 0;
7974 		size_t ifc_bufsize;
7975 
7976 		/*
7977 		 * Must be (better be!) continuation of a TRANSPARENT
7978 		 * IOCTL.  We just copied in the ifconf structure.
7979 		 */
7980 		STRUCT_SET_HANDLE(ifc, iocp->ioc_flag,
7981 		    (struct ifconf *)mp1->b_rptr);
7982 
7983 		/*
7984 		 * Allocate a buffer to hold requested information.
7985 		 *
7986 		 * If ifc_len is larger than what is needed, we only
7987 		 * allocate what we will use.
7988 		 *
7989 		 * If ifc_len is smaller than what is needed, return
7990 		 * EINVAL.
7991 		 *
7992 		 * XXX: the ill_t structure can have 2 counters, for
7993 		 * v4 and v6 (not just ill_ipif_up_count) to store the
7994 		 * number of interfaces for a device, so we don't need
7995 		 * to count them here...
7996 		 */
7997 		numifs = ip_get_numifs(zoneid);
7998 
7999 		ifclen = STRUCT_FGET(ifc, ifc_len);
8000 		ifc_bufsize = numifs * sizeof (struct ifreq);
8001 		if (ifc_bufsize > ifclen) {
8002 			if (iocp->ioc_cmd == O_SIOCGIFCONF) {
8003 				/* old behaviour */
8004 				return (EINVAL);
8005 			} else {
8006 				ifc_bufsize = ifclen;
8007 			}
8008 		}
8009 
8010 		mp1 = mi_copyout_alloc(q, mp,
8011 		    STRUCT_FGETP(ifc, ifc_buf), ifc_bufsize, B_FALSE);
8012 		if (mp1 == NULL)
8013 			return (ENOMEM);
8014 
8015 		mp1->b_wptr = mp1->b_rptr + ifc_bufsize;
8016 	}
8017 	bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr);
8018 	/*
8019 	 * the SIOCGIFCONF ioctl only knows about
8020 	 * IPv4 addresses, so don't try to tell
8021 	 * it about interfaces with IPv6-only
8022 	 * addresses. (Last parm 'isv6' is B_FALSE)
8023 	 */
8024 
8025 	ifr = (struct ifreq *)mp1->b_rptr;
8026 
8027 	rw_enter(&ill_g_lock, RW_READER);
8028 	ill = ILL_START_WALK_V4(&ctx);
8029 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
8030 		for (ipif = ill->ill_ipif; ipif != NULL;
8031 		    ipif = ipif->ipif_next) {
8032 			if (zoneid != ipif->ipif_zoneid)
8033 				continue;
8034 			if ((uchar_t *)&ifr[1] > mp1->b_wptr) {
8035 				if (iocp->ioc_cmd == O_SIOCGIFCONF) {
8036 					/* old behaviour */
8037 					rw_exit(&ill_g_lock);
8038 					return (EINVAL);
8039 				} else {
8040 					goto if_copydone;
8041 				}
8042 			}
8043 			(void) ipif_get_name(ipif,
8044 			    ifr->ifr_name,
8045 			    sizeof (ifr->ifr_name));
8046 			sin = (sin_t *)&ifr->ifr_addr;
8047 			*sin = sin_null;
8048 			sin->sin_family = AF_INET;
8049 			sin->sin_addr.s_addr = ipif->ipif_lcl_addr;
8050 			ifr++;
8051 		}
8052 	}
8053 if_copydone:
8054 	rw_exit(&ill_g_lock);
8055 	mp1->b_wptr = (uchar_t *)ifr;
8056 
8057 	if (STRUCT_BUF(ifc) != NULL) {
8058 		STRUCT_FSET(ifc, ifc_len,
8059 		    (int)((uchar_t *)ifr - mp1->b_rptr));
8060 	}
8061 	return (0);
8062 }
8063 
8064 /*
8065  * Get the interfaces using the address hosted on the interface passed in,
8066  * as a source address.
8067  */
8068 /* ARGSUSED */
8069 int
8070 ip_sioctl_get_lifsrcof(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
8071     mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
8072 {
8073 	mblk_t *mp1;
8074 	ill_t	*ill, *ill_head;
8075 	ipif_t	*ipif, *orig_ipif;
8076 	int	numlifs = 0;
8077 	size_t	lifs_bufsize, lifsmaxlen;
8078 	struct	lifreq *lifr;
8079 	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
8080 	uint_t	ifindex;
8081 	zoneid_t zoneid;
8082 	int err = 0;
8083 	boolean_t isv6 = B_FALSE;
8084 	struct	sockaddr_in	*sin;
8085 	struct	sockaddr_in6	*sin6;
8086 
8087 	STRUCT_HANDLE(lifsrcof, lifs);
8088 
8089 	ASSERT(q->q_next == NULL);
8090 
8091 	zoneid = Q_TO_CONN(q)->conn_zoneid;
8092 
8093 	/* Existence verified in ip_wput_nondata */
8094 	mp1 = mp->b_cont->b_cont;
8095 
8096 	/*
8097 	 * Must be (better be!) continuation of a TRANSPARENT
8098 	 * IOCTL.  We just copied in the lifsrcof structure.
8099 	 */
8100 	STRUCT_SET_HANDLE(lifs, iocp->ioc_flag,
8101 	    (struct lifsrcof *)mp1->b_rptr);
8102 
8103 	if (MBLKL(mp1) != STRUCT_SIZE(lifs))
8104 		return (EINVAL);
8105 
8106 	ifindex = STRUCT_FGET(lifs, lifs_ifindex);
8107 	isv6 = (Q_TO_CONN(q))->conn_af_isv6;
8108 	ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid, q, mp,
8109 	    ip_process_ioctl, &err);
8110 	if (ipif == NULL) {
8111 		ip1dbg(("ip_sioctl_get_lifsrcof: no ipif for ifindex %d\n",
8112 		    ifindex));
8113 		return (err);
8114 	}
8115 
8117 	/* Allocate a buffer to hold requested information */
8118 	numlifs = ip_get_lifsrcofnum(ipif->ipif_ill);
8119 	lifs_bufsize = numlifs * sizeof (struct lifreq);
8120 	lifsmaxlen = STRUCT_FGET(lifs, lifs_maxlen);
8121 	/* The actual size needed is always returned in lifs_len */
8122 	STRUCT_FSET(lifs, lifs_len, lifs_bufsize);
8123 
8124 	/* If the amount we need is more than what is passed in, abort */
8125 	if (lifs_bufsize > lifsmaxlen || lifs_bufsize == 0) {
8126 		ipif_refrele(ipif);
8127 		return (0);
8128 	}
8129 
8130 	mp1 = mi_copyout_alloc(q, mp,
8131 	    STRUCT_FGETP(lifs, lifs_buf), lifs_bufsize, B_FALSE);
8132 	if (mp1 == NULL) {
8133 		ipif_refrele(ipif);
8134 		return (ENOMEM);
8135 	}
8136 
8137 	mp1->b_wptr = mp1->b_rptr + lifs_bufsize;
8138 	bzero(mp1->b_rptr, lifs_bufsize);
8139 
8140 	lifr = (struct lifreq *)mp1->b_rptr;
8141 
8142 	ill = ill_head = ipif->ipif_ill;
8143 	orig_ipif = ipif;
8144 
8145 	/* ill_g_usesrc_lock protects ill_usesrc_grp_next */
8146 	rw_enter(&ill_g_usesrc_lock, RW_READER);
8147 	rw_enter(&ill_g_lock, RW_READER);
8148 
8149 	ill = ill->ill_usesrc_grp_next; /* start from next ill */
8150 	for (; (ill != NULL) && (ill != ill_head);
8151 	    ill = ill->ill_usesrc_grp_next) {
8152 
8153 		if ((uchar_t *)&lifr[1] > mp1->b_wptr)
8154 			break;
8155 
8156 		ipif = ill->ill_ipif;
8157 		(void) ipif_get_name(ipif,
8158 		    lifr->lifr_name, sizeof (lifr->lifr_name));
8159 		if (ipif->ipif_isv6) {
8160 			sin6 = (sin6_t *)&lifr->lifr_addr;
8161 			*sin6 = sin6_null;
8162 			sin6->sin6_family = AF_INET6;
8163 			sin6->sin6_addr = ipif->ipif_v6lcl_addr;
8164 			lifr->lifr_addrlen = ip_mask_to_plen_v6(
8165 			    &ipif->ipif_v6net_mask);
8166 		} else {
8167 			sin = (sin_t *)&lifr->lifr_addr;
8168 			*sin = sin_null;
8169 			sin->sin_family = AF_INET;
8170 			sin->sin_addr.s_addr = ipif->ipif_lcl_addr;
8171 			lifr->lifr_addrlen = ip_mask_to_plen(
8172 			    ipif->ipif_net_mask);
8173 		}
8174 		lifr++;
8175 	}
8176 	rw_exit(&ill_g_usesrc_lock);
8177 	rw_exit(&ill_g_lock);
8178 	ipif_refrele(orig_ipif);
8179 	mp1->b_wptr = (uchar_t *)lifr;
8180 	STRUCT_FSET(lifs, lifs_len, (int)((uchar_t *)lifr - mp1->b_rptr));
8181 
8182 	return (0);
8183 }
8184 
8185 /* ARGSUSED */
8186 int
8187 ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
8188     mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
8189 {
8190 	mblk_t *mp1;
8191 	int	list;
8192 	ill_t	*ill;
8193 	ipif_t	*ipif;
8194 	int	flags;
8195 	int	numlifs = 0;
8196 	size_t	lifc_bufsize;
8197 	struct	lifreq *lifr;
8198 	sa_family_t	family;
8199 	struct	sockaddr_in	*sin;
8200 	struct	sockaddr_in6	*sin6;
8201 	ill_walk_context_t	ctx;
8202 	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
8203 	int32_t	lifclen;
8204 	zoneid_t zoneid;
8205 	STRUCT_HANDLE(lifconf, lifc);
8206 
8207 	ip1dbg(("ip_sioctl_get_lifconf"));
8208 
8209 	ASSERT(q->q_next == NULL);
8210 
8211 	zoneid = Q_TO_CONN(q)->conn_zoneid;
8212 
8213 	/* Existence verified in ip_wput_nondata */
8214 	mp1 = mp->b_cont->b_cont;
8215 
8216 	/*
8217 	 * An extended version of SIOCGIFCONF that takes an
8218 	 * additional address family and flags field.
8219 	 * AF_UNSPEC retrieves both IPv4 and IPv6 interfaces.
8220 	 * Unless LIFC_NOXMIT is specified the IPIF_NOXMIT
8221 	 * interfaces are omitted.
8222 	 * Similarly, IPIF_TEMPORARY interfaces are omitted
8223 	 * unless LIFC_TEMPORARY is specified.
8224 	 * If LIFC_EXTERNAL_SOURCE is specified, IPIF_NOXMIT,
8225 	 * IPIF_NOLOCAL, PHYI_LOOPBACK, IPIF_DEPRECATED and
8226 	 * IPIF_NOLOCAL, PHYI_LOOPBACK, IPIF_DEPRECATED and non-IPIF_UP
8227 	 * interfaces are omitted. LIFC_EXTERNAL_SOURCE
8228 	 */
8229 	STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, NULL);
8230 
8231 	if ((mp1->b_wptr - mp1->b_rptr) != STRUCT_SIZE(lifc))
8232 		return (EINVAL);
8233 
8234 	/*
8235 	 * Must be (better be!) continuation of a TRANSPARENT
8236 	 * IOCTL.  We just copied in the lifconf structure.
8237 	 */
8238 	STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, (struct lifconf *)mp1->b_rptr);
8239 
8240 	family = STRUCT_FGET(lifc, lifc_family);
8241 	flags = STRUCT_FGET(lifc, lifc_flags);
8242 
8243 	switch (family) {
8244 	case AF_UNSPEC:
8245 		/*
8246 		 * walk all ILL's.
8247 		 */
8248 		list = MAX_G_HEADS;
8249 		break;
8250 	case AF_INET:
8251 		/*
8252 		 * walk only IPV4 ILL's.
8253 		 */
8254 		list = IP_V4_G_HEAD;
8255 		break;
8256 	case AF_INET6:
8257 		/*
8258 		 * walk only IPV6 ILL's.
8259 		 */
8260 		list = IP_V6_G_HEAD;
8261 		break;
8262 	default:
8263 		return (EAFNOSUPPORT);
8264 	}
8265 
8266 	/*
8267 	 * Allocate a buffer to hold requested information.
8268 	 *
8269 	 * If lifc_len is larger than what is needed, we only
8270 	 * allocate what we will use.
8271 	 *
8272 	 * If lifc_len is smaller than what is needed, return
8273 	 * EINVAL.
8274 	 */
8275 	numlifs = ip_get_numlifs(family, flags, zoneid);
8276 	lifc_bufsize = numlifs * sizeof (struct lifreq);
8277 	lifclen = STRUCT_FGET(lifc, lifc_len);
8278 	if (lifc_bufsize > lifclen) {
8279 		if (iocp->ioc_cmd == O_SIOCGLIFCONF)
8280 			return (EINVAL);
8281 		else
8282 			lifc_bufsize = lifclen;
8283 	}
8284 
8285 	mp1 = mi_copyout_alloc(q, mp,
8286 	    STRUCT_FGETP(lifc, lifc_buf), lifc_bufsize, B_FALSE);
8287 	if (mp1 == NULL)
8288 		return (ENOMEM);
8289 
8290 	mp1->b_wptr = mp1->b_rptr + lifc_bufsize;
8291 	bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr);
8292 
8293 	lifr = (struct lifreq *)mp1->b_rptr;
8294 
8295 	rw_enter(&ill_g_lock, RW_READER);
8296 	ill = ill_first(list, list, &ctx);
8297 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
8298 		for (ipif = ill->ill_ipif; ipif != NULL;
8299 		    ipif = ipif->ipif_next) {
8300 			if ((ipif->ipif_flags & IPIF_NOXMIT) &&
8301 			    !(flags & LIFC_NOXMIT))
8302 				continue;
8303 
8304 			if ((ipif->ipif_flags & IPIF_TEMPORARY) &&
8305 			    !(flags & LIFC_TEMPORARY))
8306 				continue;
8307 
8308 			if (((ipif->ipif_flags &
8309 			    (IPIF_NOXMIT|IPIF_NOLOCAL|
8310 			    IPIF_DEPRECATED)) ||
8311 			    (ill->ill_phyint->phyint_flags &
8312 			    PHYI_LOOPBACK) ||
8313 			    !(ipif->ipif_flags & IPIF_UP)) &&
8314 			    (flags & LIFC_EXTERNAL_SOURCE))
8315 				continue;
8316 
8317 			if (zoneid != ipif->ipif_zoneid &&
8318 			    (zoneid != GLOBAL_ZONEID ||
8319 			    !(flags & LIFC_ALLZONES)))
8320 				continue;
8321 
8322 			if ((uchar_t *)&lifr[1] > mp1->b_wptr) {
8323 				if (iocp->ioc_cmd == O_SIOCGLIFCONF) {
8324 					rw_exit(&ill_g_lock);
8325 					return (EINVAL);
8326 				} else {
8327 					goto lif_copydone;
8328 				}
8329 			}
8330 
8331 			(void) ipif_get_name(ipif,
8332 			    lifr->lifr_name,
8333 			    sizeof (lifr->lifr_name));
8334 			if (ipif->ipif_isv6) {
8335 				sin6 = (sin6_t *)&lifr->lifr_addr;
8336 				*sin6 = sin6_null;
8337 				sin6->sin6_family = AF_INET6;
8338 				sin6->sin6_addr =
8339 				    ipif->ipif_v6lcl_addr;
8340 				lifr->lifr_addrlen =
8341 				    ip_mask_to_plen_v6(
8342 				    &ipif->ipif_v6net_mask);
8343 			} else {
8344 				sin = (sin_t *)&lifr->lifr_addr;
8345 				*sin = sin_null;
8346 				sin->sin_family = AF_INET;
8347 				sin->sin_addr.s_addr =
8348 				    ipif->ipif_lcl_addr;
8349 				lifr->lifr_addrlen =
8350 				    ip_mask_to_plen(
8351 				    ipif->ipif_net_mask);
8352 			}
8353 			lifr++;
8354 		}
8355 	}
8356 lif_copydone:
8357 	rw_exit(&ill_g_lock);
8358 
8359 	mp1->b_wptr = (uchar_t *)lifr;
8360 	if (STRUCT_BUF(lifc) != NULL) {
8361 		STRUCT_FSET(lifc, lifc_len,
8362 		    (int)((uchar_t *)lifr - mp1->b_rptr));
8363 	}
8364 	return (0);
8365 }
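
/*
 * Userland counterpart (illustrative sketch, continuing the SIOCGLIFNUM
 * example above): on return lifc_len holds the number of bytes actually
 * copied out.
 *
 *	struct lifconf lifc;
 *
 *	lifc.lifc_family = AF_UNSPEC;
 *	lifc.lifc_flags = 0;
 *	lifc.lifc_len = lifn.lifn_count * sizeof (struct lifreq);
 *	lifc.lifc_buf = buf;
 *	if (ioctl(s, SIOCGLIFCONF, &lifc) < 0)
 *		...error...
 *	nlifs = lifc.lifc_len / sizeof (struct lifreq);
 */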
8366 
8367 /* ARGSUSED */
8368 int
8369 ip_sioctl_set_ipmpfailback(ipif_t *dummy_ipif, sin_t *dummy_sin,
8370     queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
8371 {
8372 	/* Existence of b_cont->b_cont checked in ip_wput_nondata */
8373 	ipmp_enable_failback = *(int *)mp->b_cont->b_cont->b_rptr;
8374 	return (0);
8375 }
8376 
8377 static void
8378 ip_sioctl_ip6addrpolicy(queue_t *q, mblk_t *mp)
8379 {
8380 	ip6_asp_t *table;
8381 	size_t table_size;
8382 	mblk_t *data_mp;
8383 	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
8384 
8385 	/* These two ioctls are I_STR only */
8386 	if (iocp->ioc_count == TRANSPARENT) {
8387 		miocnak(q, mp, 0, EINVAL);
8388 		return;
8389 	}
8390 
8391 	data_mp = mp->b_cont;
8392 	if (data_mp == NULL) {
8393 		/* The user passed us a NULL argument */
8394 		table = NULL;
8395 		table_size = iocp->ioc_count;
8396 	} else {
8397 		/*
8398 		 * The user provided a table.  The stream head
8399 		 * may have copied in the user data in chunks,
8400 		 * so make sure everything is pulled up
8401 		 * properly.
8402 		 */
8403 		if (MBLKL(data_mp) < iocp->ioc_count) {
8404 			mblk_t *new_data_mp;
8405 			if ((new_data_mp = msgpullup(data_mp, -1)) ==
8406 			    NULL) {
8407 				miocnak(q, mp, 0, ENOMEM);
8408 				return;
8409 			}
8410 			freemsg(data_mp);
8411 			data_mp = new_data_mp;
8412 			mp->b_cont = data_mp;
8413 		}
8414 		table = (ip6_asp_t *)data_mp->b_rptr;
8415 		table_size = iocp->ioc_count;
8416 	}
8417 
8418 	switch (iocp->ioc_cmd) {
8419 	case SIOCGIP6ADDRPOLICY:
8420 		iocp->ioc_rval = ip6_asp_get(table, table_size);
8421 		if (iocp->ioc_rval == -1)
8422 			iocp->ioc_error = EINVAL;
8423 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4
8424 		else if (table != NULL &&
8425 		    (iocp->ioc_flag & IOC_MODELS) == IOC_ILP32) {
8426 			ip6_asp_t *src = table;
8427 			ip6_asp32_t *dst = (void *)table;
8428 			int count = table_size / sizeof (ip6_asp_t);
8429 			int i;
8430 
8431 			/*
8432 			 * We need to do an in-place shrink of the array
8433 			 * to match the alignment attributes of the
8434 			 * 32-bit ABI looking at it.
8435 			 */
8436 			/* LINTED: logical expression always true: op "||" */
8437 			ASSERT(sizeof (*src) > sizeof (*dst));
8438 			for (i = 1; i < count; i++)
8439 				bcopy(src + i, dst + i, sizeof (*dst));
8440 		}
8441 #endif
8442 		break;
8443 
8444 	case SIOCSIP6ADDRPOLICY:
8445 		ASSERT(mp->b_prev == NULL);
8446 		mp->b_prev = (void *)q;
8447 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4
8448 		/*
8449 		 * We pass in the datamodel here so that the ip6_asp_replace()
8450 		 * routine can handle converting from 32-bit to native formats
8451 		 * where necessary.
8452 		 *
8453 		 * A better way to handle this might be to convert the inbound
8454 		 * data structure here, and hang it off a new 'mp'; thus the
8455 		 * ip6_asp_replace() logic would always be dealing with native
8456 		 * format data structures.
8457 		 *
8458 		 * (An even simpler way to handle these ioctls is to just
8459 		 * add a 32-bit trailing 'pad' field to the ip6_asp_t structure
8460 		 * and just recompile everything that depends on it.)
8461 		 */
8462 #endif
8463 		ip6_asp_replace(mp, table, table_size, B_FALSE,
8464 		    iocp->ioc_flag & IOC_MODELS);
8465 		return;
8466 	}
8467 
8468 	DB_TYPE(mp) = (iocp->ioc_error == 0) ? M_IOCACK : M_IOCNAK;
8469 	qreply(q, mp);
8470 }
8471 
8472 static void
8473 ip_sioctl_dstinfo(queue_t *q, mblk_t *mp)
8474 {
8475 	mblk_t 		*data_mp;
8476 	struct dstinforeq	*dir;
8477 	uint8_t		*end, *cur;
8478 	in6_addr_t	*daddr, *saddr;
8479 	ipaddr_t	v4daddr;
8480 	ire_t		*ire;
8481 	char		*slabel, *dlabel;
8482 	boolean_t	isipv4;
8483 	int		match_ire;
8484 	ill_t		*dst_ill;
8485 	ipif_t		*src_ipif, *ire_ipif;
8486 	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
8487 	zoneid_t	zoneid;
8488 
8489 	ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */
8490 	zoneid = Q_TO_CONN(q)->conn_zoneid;
8491 
8492 	/*
8493 	 * This ioctl is I_STR only, and must have a
8494 	 * data mblk following the M_IOCTL mblk.
8495 	 */
8496 	data_mp = mp->b_cont;
8497 	if (iocp->ioc_count == TRANSPARENT || data_mp == NULL) {
8498 		miocnak(q, mp, 0, EINVAL);
8499 		return;
8500 	}
8501 
8502 	if (MBLKL(data_mp) < iocp->ioc_count) {
8503 		mblk_t *new_data_mp;
8504 
8505 		if ((new_data_mp = msgpullup(data_mp, -1)) == NULL) {
8506 			miocnak(q, mp, 0, ENOMEM);
8507 			return;
8508 		}
8509 		freemsg(data_mp);
8510 		data_mp = new_data_mp;
8511 		mp->b_cont = data_mp;
8512 	}
8513 	match_ire = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_PARENT;
8514 
8515 	for (cur = data_mp->b_rptr, end = data_mp->b_wptr;
8516 	    end - cur >= sizeof (struct dstinforeq);
8517 	    cur += sizeof (struct dstinforeq)) {
8518 		dir = (struct dstinforeq *)cur;
8519 		daddr = &dir->dir_daddr;
8520 		saddr = &dir->dir_saddr;
8521 
8522 		/*
8523 		 * ip_addr_scope_v6() and ip6_asp_lookup() handle
8524 		 * v4 mapped addresses; ire_ftable_lookup[_v6]()
8525 		 * and ipif_select_source[_v6]() do not.
8526 		 */
8527 		dir->dir_dscope = ip_addr_scope_v6(daddr);
8528 		dlabel = ip6_asp_lookup(daddr, &dir->dir_precedence);
8529 
8530 		isipv4 = IN6_IS_ADDR_V4MAPPED(daddr);
8531 		if (isipv4) {
8532 			IN6_V4MAPPED_TO_IPADDR(daddr, v4daddr);
8533 			ire = ire_ftable_lookup(v4daddr, NULL, NULL,
8534 			    0, NULL, NULL, zoneid, 0, match_ire);
8535 		} else {
8536 			ire = ire_ftable_lookup_v6(daddr, NULL, NULL,
8537 			    0, NULL, NULL, zoneid, 0, match_ire);
8538 		}
8539 		if (ire == NULL) {
8540 			dir->dir_dreachable = 0;
8541 
8542 			/* move on to next dst addr */
8543 			continue;
8544 		}
8545 		dir->dir_dreachable = 1;
8546 
8547 		ire_ipif = ire->ire_ipif;
8548 		if (ire_ipif == NULL)
8549 			goto next_dst;
8550 
8551 		/*
8552 		 * We expect to get back an interface ire or a
8553 		 * gateway ire cache entry.  For both types, the
8554 		 * output interface is ire_ipif->ipif_ill.
8555 		 */
8556 		dst_ill = ire_ipif->ipif_ill;
8557 		dir->dir_dmactype = dst_ill->ill_mactype;
8558 
8559 		if (isipv4) {
8560 			src_ipif = ipif_select_source(dst_ill, v4daddr, zoneid);
8561 		} else {
8562 			src_ipif = ipif_select_source_v6(dst_ill,
8563 			    daddr, B_FALSE, IPV6_PREFER_SRC_DEFAULT,
8564 			    zoneid);
8565 		}
8566 		if (src_ipif == NULL)
8567 			goto next_dst;
8568 
8569 		*saddr = src_ipif->ipif_v6lcl_addr;
8570 		dir->dir_sscope = ip_addr_scope_v6(saddr);
8571 		slabel = ip6_asp_lookup(saddr, NULL);
8572 		dir->dir_labelmatch = ip6_asp_labelcmp(dlabel, slabel);
8573 		dir->dir_sdeprecated =
8574 		    (src_ipif->ipif_flags & IPIF_DEPRECATED) ? 1 : 0;
8575 		ipif_refrele(src_ipif);
8576 next_dst:
8577 		ire_refrele(ire);
8578 	}
8579 	miocack(q, mp, iocp->ioc_count, 0);
8580 }
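
/*
 * Illustrative userland sketch (hypothetical caller): since SIOCGDSTINFO
 * is I_STR-only, an array of dstinforeq structures is wrapped in a
 * strioctl rather than issued as a transparent ioctl:
 *
 *	struct dstinforeq dreq[1];
 *	struct strioctl si;
 *
 *	(void) memset(dreq, 0, sizeof (dreq));
 *	dreq[0].dir_daddr = dst6;		(destination to probe)
 *	si.ic_cmd = SIOCGDSTINFO;
 *	si.ic_timout = -1;
 *	si.ic_len = sizeof (dreq);
 *	si.ic_dp = (char *)dreq;
 *	if (ioctl(s, I_STR, &si) < 0)
 *		...error...
 *	(dir_dreachable, dir_saddr, etc. are filled in on return)
 */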
8581 
8583 /*
8584  * Check if this is an address assigned to this machine.
8585  * Skips interfaces that are down by using ire checks.
8586  * Translates mapped addresses to v4 addresses and then
8587  * treats them as such, returning true if the v4 address
8588  * associated with this mapped address is configured.
8589  * Note: Applications will have to be careful what they do
8590  * with the response; use of mapped addresses limits
8591  * what can be done with the socket, especially with
8592  * respect to socket options and ioctls - neither IPv4
8593  * options nor IPv6 sticky options/ancillary data options
8594  * may be used.
8595  */
8596 /* ARGSUSED */
8597 int
8598 ip_sioctl_tmyaddr(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
8599     ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
8600 {
8601 	struct sioc_addrreq *sia;
8602 	sin_t *sin;
8603 	ire_t *ire;
8604 	mblk_t *mp1;
8605 	zoneid_t zoneid;
8606 
8607 	ip1dbg(("ip_sioctl_tmyaddr"));
8608 
8609 	ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */
8610 	zoneid = Q_TO_CONN(q)->conn_zoneid;
8611 
8612 	/* Existence verified in ip_wput_nondata */
8613 	mp1 = mp->b_cont->b_cont;
8614 	sia = (struct sioc_addrreq *)mp1->b_rptr;
8615 	sin = (sin_t *)&sia->sa_addr;
8616 	switch (sin->sin_family) {
8617 	case AF_INET6: {
8618 		sin6_t *sin6 = (sin6_t *)sin;
8619 
8620 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
8621 			ipaddr_t v4_addr;
8622 
8623 			IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr,
8624 			    v4_addr);
8625 			ire = ire_ctable_lookup(v4_addr, 0,
8626 			    IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid,
8627 			    MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY);
8628 		} else {
8629 			in6_addr_t v6addr;
8630 
8631 			v6addr = sin6->sin6_addr;
8632 			ire = ire_ctable_lookup_v6(&v6addr, 0,
8633 			    IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid,
8634 			    MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY);
8635 		}
8636 		break;
8637 	}
8638 	case AF_INET: {
8639 		ipaddr_t v4addr;
8640 
8641 		v4addr = sin->sin_addr.s_addr;
8642 		ire = ire_ctable_lookup(v4addr, 0,
8643 		    IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid,
8644 		    MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY);
8645 		break;
8646 	}
8647 	default:
8648 		return (EAFNOSUPPORT);
8649 	}
8650 	if (ire != NULL) {
8651 		sia->sa_res = 1;
8652 		ire_refrele(ire);
8653 	} else {
8654 		sia->sa_res = 0;
8655 	}
8656 	return (0);
8657 }
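
/*
 * Illustrative userland sketch (hypothetical caller): SIOCTMYADDR comes
 * down as a transparent ioctl carrying a sioc_addrreq; sa_res is set to
 * 1 iff the address is assigned to this node.
 *
 *	struct sioc_addrreq ar;
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&ar.sa_addr;
 *
 *	(void) memset(&ar, 0, sizeof (ar));
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = addr;
 *	if (ioctl(s, SIOCTMYADDR, &ar) >= 0 && ar.sa_res != 0)
 *		...address is local...
 */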
8658 
8659 /*
8660  * Checks if this is an address assigned on-link, i.e. to a neighbor,
8661  * and makes sure it's reachable from the current zone.
8662  * Returns true for our own addresses as well.
8663  * Translates mapped addresses to v4 addresses and then
8664  * treats them as such, returning true if the v4 address
8665  * associated with this mapped address is configured.
8666  * Note: Applications will have to be careful what they do
8667  * with the response; use of mapped addresses limits
8668  * what can be done with the socket, especially with
8669  * respect to socket options and ioctls - neither IPv4
8670  * options nor IPv6 sticky options/ancillary data options
8671  * may be used.
8672  */
8673 /* ARGSUSED */
8674 int
8675 ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
8676     ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
8677 {
8678 	struct sioc_addrreq *sia;
8679 	sin_t *sin;
8680 	mblk_t	*mp1;
8681 	ire_t *ire = NULL;
8682 	zoneid_t zoneid;
8683 
8684 	ip1dbg(("ip_sioctl_tonlink"));
8685 
8686 	ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */
8687 	zoneid = Q_TO_CONN(q)->conn_zoneid;
8688 
8689 	/* Existence verified in ip_wput_nondata */
8690 	mp1 = mp->b_cont->b_cont;
8691 	sia = (struct sioc_addrreq *)mp1->b_rptr;
8692 	sin = (sin_t *)&sia->sa_addr;
8693 
8694 	/*
8695 	 * Match addresses with a zero gateway field to avoid
8696 	 * routes going through a router.
8697 	 * Exclude broadcast and multicast addresses.
8698 	 */
8699 	switch (sin->sin_family) {
8700 	case AF_INET6: {
8701 		sin6_t *sin6 = (sin6_t *)sin;
8702 
8703 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
8704 			ipaddr_t v4_addr;
8705 
8706 			IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr,
8707 			    v4_addr);
8708 			if (!CLASSD(v4_addr)) {
8709 				ire = ire_route_lookup(v4_addr, 0, 0, 0,
8710 				    NULL, NULL, zoneid, MATCH_IRE_GW);
8711 			}
8712 		} else {
8713 			in6_addr_t v6addr;
8714 			in6_addr_t v6gw;
8715 
8716 			v6addr = sin6->sin6_addr;
8717 			v6gw = ipv6_all_zeros;
8718 			if (!IN6_IS_ADDR_MULTICAST(&v6addr)) {
8719 				ire = ire_route_lookup_v6(&v6addr, 0,
8720 				    &v6gw, 0, NULL, NULL, zoneid,
8721 				    MATCH_IRE_GW);
8722 			}
8723 		}
8724 		break;
8725 	}
8726 	case AF_INET: {
8727 		ipaddr_t v4addr;
8728 
8729 		v4addr = sin->sin_addr.s_addr;
8730 		if (!CLASSD(v4addr)) {
8731 			ire = ire_route_lookup(v4addr, 0, 0, 0,
8732 			    NULL, NULL, zoneid, MATCH_IRE_GW);
8733 		}
8734 		break;
8735 	}
8736 	default:
8737 		return (EAFNOSUPPORT);
8738 	}
8739 	sia->sa_res = 0;
8740 	if (ire != NULL) {
8741 		if (ire->ire_type & (IRE_INTERFACE|IRE_CACHE|
8742 		    IRE_LOCAL|IRE_LOOPBACK)) {
8743 			sia->sa_res = 1;
8744 		}
8745 		ire_refrele(ire);
8746 	}
8747 	return (0);
8748 }
8749 
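/*
 * Illustrative only: a hedged userland sketch of driving SIOCTMYADDR and
 * SIOCTONLINK (handled by the two routines above) down an AF_INET socket
 * "s"; the sioc_addrreq layout is assumed from <sys/sockio.h> and the
 * address is hypothetical.
 *
 *	struct sioc_addrreq sar;
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&sar.sa_addr;
 *
 *	(void) memset(&sar, 0, sizeof (sar));
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("192.168.1.1");
 *	if (ioctl(s, SIOCTMYADDR, &sar) == 0 && sar.sa_res != 0)
 *		(void) printf("address is configured on this machine\n");
 *	if (ioctl(s, SIOCTONLINK, &sar) == 0 && sar.sa_res != 0)
 *		(void) printf("address is on-link\n");
 */
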
8750 /*
8751  * TBD: implement when the kernel maintains a list of site prefixes.
8752  */
8753 /* ARGSUSED */
8754 int
8755 ip_sioctl_tmysite(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
8756     ip_ioctl_cmd_t *ipip, void *ifreq)
8757 {
8758 	return (ENXIO);
8759 }
8760 
8761 /* ARGSUSED */
8762 int
8763 ip_sioctl_tunparam(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
8764     ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
8765 {
8766 	ill_t  		*ill;
8767 	mblk_t		*mp1;
8768 	conn_t		*connp;
8769 	boolean_t	success;
8770 
8771 	ip1dbg(("ip_sioctl_tunparam(%s:%u %p)\n",
8772 		ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
8773 	/* ioctl comes down on a conn */
8774 	ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
8775 	connp = Q_TO_CONN(q);
8776 
8777 	mp->b_datap->db_type = M_IOCTL;
8778 
8779 	/*
8780 	 * Send down a copy. (copymsg does not copy b_next/b_prev).
8781 	 * The original mp contains contaminated b_next values due to 'mi',
8782 	 * which is needed to do the mi_copy_done. Unfortunately, if we
8783 	 * send down the original mblk itself and we are popped due to
8784 	 * an unplumb before the response comes back from the tunnel,
8785 	 * the stream head (which does a freemsg) will see this contaminated
8786 	 * message and the assertion in freemsg about non-null b_next/b_prev
8787 	 * will panic a DEBUG kernel.
8788 	 */
8789 	mp1 = copymsg(mp);
8790 	if (mp1 == NULL)
8791 		return (ENOMEM);
8792 
8793 	ill = ipif->ipif_ill;
8794 	mutex_enter(&connp->conn_lock);
8795 	mutex_enter(&ill->ill_lock);
8796 	if (ipip->ipi_cmd == SIOCSTUNPARAM || ipip->ipi_cmd == OSIOCSTUNPARAM) {
8797 		success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp),
8798 		    mp, 0);
8799 	} else {
8800 		success = ill_pending_mp_add(ill, connp, mp);
8801 	}
8802 	mutex_exit(&ill->ill_lock);
8803 	mutex_exit(&connp->conn_lock);
8804 
8805 	if (success) {
8806 		ip1dbg(("sending down tunparam request "));
8807 		putnext(ill->ill_wq, mp1);
8808 		return (EINPROGRESS);
8809 	} else {
8810 		/* The conn has started closing */
8811 		freemsg(mp1);
8812 		return (EINTR);
8813 	}
8814 }
8815 
8816 static int
8817 ip_sioctl_arp_common(ill_t *ill, queue_t *q, mblk_t *mp, sin_t *sin,
8818     boolean_t x_arp_ioctl, boolean_t if_arp_ioctl)
8819 {
8820 	mblk_t *mp1;
8821 	mblk_t *mp2;
8822 	mblk_t *pending_mp;
8823 	ipaddr_t ipaddr;
8824 	area_t *area;
8825 	struct iocblk *iocp;
8826 	conn_t *connp;
8827 	struct arpreq *ar;
8828 	struct xarpreq *xar;
8829 	boolean_t success;
8830 	int flags, alength;
8831 	char *lladdr;
8832 
8833 	ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
8834 	connp = Q_TO_CONN(q);
8835 
8836 	iocp = (struct iocblk *)mp->b_rptr;
8837 	/*
8838 	 * ill has already been set depending on whether
8839 	 * bsd style or interface style ioctl.
8840 	 */
8841 	ASSERT(ill != NULL);
8842 
8843 	/*
8844 	 * Is this one of the new SIOC*XARP ioctls?
8845 	 */
8846 	if (x_arp_ioctl) {
8847 		/* We have a chain - M_IOCTL-->MI_COPY_MBLK-->XARPREQ_MBLK */
8848 		xar = (struct xarpreq *)mp->b_cont->b_cont->b_rptr;
8849 		ar = NULL;
8850 
8851 		flags = xar->xarp_flags;
8852 		lladdr = LLADDR(&xar->xarp_ha);
8853 		/*
8854 		 * Validate the user's link-layer address length input,
8855 		 * and the name and address length limits.
8856 		 */
8857 		alength = ill->ill_phys_addr_length;
8858 		if (iocp->ioc_cmd == SIOCSXARP) {
8859 			if (alength != xar->xarp_ha.sdl_alen ||
8860 			    (alength + xar->xarp_ha.sdl_nlen >
8861 			    sizeof (xar->xarp_ha.sdl_data)))
8862 				return (EINVAL);
8863 		}
8864 	} else {
8865 		/* We have a chain - M_IOCTL-->MI_COPY_MBLK-->ARPREQ_MBLK */
8866 		ar = (struct arpreq *)mp->b_cont->b_cont->b_rptr;
8867 		xar = NULL;
8868 
8869 		flags = ar->arp_flags;
8870 		lladdr = ar->arp_ha.sa_data;
8871 		/*
8872 		 * Theoretically, the sa_family could tell us what link
8873 		 * layer type this operation is trying to deal with. By
8874 		 * common usage AF_UNSPEC means ethernet. We'll assume
8875 		 * any attempt to use the SIOC?ARP ioctls is for ethernet,
8876 		 * for now. Our new SIOC*XARP ioctls can be used more
8877 		 * generally.
8878 		 *
8879 		 * If the underlying media happens to have a non-6-byte
8880 		 * address, the arp module will fail set/get, but the delete
8881 		 * operation will succeed.
8882 		 */
8883 		alength = 6;
8884 		if ((iocp->ioc_cmd != SIOCDARP) &&
8885 		    (alength != ill->ill_phys_addr_length)) {
8886 			return (EINVAL);
8887 		}
8888 	}
8889 
8890 	/*
8891 	 * We are going to pass up to ARP a packet chain that looks
8892 	 * like:
8893 	 *
8894 	 * M_IOCTL-->ARP_op_MBLK-->ORIG_M_IOCTL-->MI_COPY_MBLK-->[X]ARPREQ_MBLK
8895 	 *
8896 	 * Get a copy of the original IOCTL mblk to head the chain,
8897 	 * to be sent up (in mp1). Also get another copy to store
8898 	 * in the ill_pending_mp list, for matching the response
8899 	 * when it comes back from ARP.
8900 	 */
8901 	mp1 = copyb(mp);
8902 	pending_mp = copymsg(mp);
8903 	if (mp1 == NULL || pending_mp == NULL) {
8904 		if (mp1 != NULL)
8905 			freeb(mp1);
8906 		if (pending_mp != NULL)
8907 			inet_freemsg(pending_mp);
8908 		return (ENOMEM);
8909 	}
8910 
8911 	ipaddr = sin->sin_addr.s_addr;
8912 
8913 	mp2 = ill_arp_alloc(ill, (uchar_t *)&ip_area_template,
8914 	    (caddr_t)&ipaddr);
8915 	if (mp2 == NULL) {
8916 		freeb(mp1);
8917 		inet_freemsg(pending_mp);
8918 		return (ENOMEM);
8919 	}
8920 	/* Put together the chain. */
8921 	mp1->b_cont = mp2;
8922 	mp1->b_datap->db_type = M_IOCTL;
8923 	mp2->b_cont = mp;
8924 	mp2->b_datap->db_type = M_DATA;
8925 
8926 	iocp = (struct iocblk *)mp1->b_rptr;
8927 
8928 	/*
8929 	 * An M_IOCDATA's payload (struct copyresp) is mostly the same as an
8930 	 * M_IOCTL's payload (struct iocblk), but 'struct copyresp' has a
8931 	 * cp_private field (or cp_rval on 32-bit systems) in place of the
8932 	 * ioc_count field; set ioc_count to be correct.
8933 	 */
8934 	iocp->ioc_count = MBLKL(mp1->b_cont);
8935 
8936 	/*
8937 	 * Set the proper command in the ARP message.
8938 	 * Convert the SIOC{G|S|D}ARP calls into our
8939 	 * AR_ENTRY_xxx calls.
8940 	 */
8941 	area = (area_t *)mp2->b_rptr;
8942 	switch (iocp->ioc_cmd) {
8943 	case SIOCDARP:
8944 	case SIOCDXARP:
8945 		/*
8946 		 * We defer deleting the corresponding IRE until
8947 		 * we return from arp.
8948 		 */
8949 		area->area_cmd = AR_ENTRY_DELETE;
8950 		area->area_proto_mask_offset = 0;
8951 		break;
8952 	case SIOCGARP:
8953 	case SIOCGXARP:
8954 		area->area_cmd = AR_ENTRY_SQUERY;
8955 		area->area_proto_mask_offset = 0;
8956 		break;
8957 	case SIOCSARP:
8958 	case SIOCSXARP: {
8959 		/*
8960 		 * Delete the corresponding ire to make sure IP will
8961 		 * pick up any change from arp.
8962 		 */
8963 		if (!if_arp_ioctl) {
8964 			(void) ip_ire_clookup_and_delete(ipaddr, NULL);
8965 			break;
8966 		} else {
8967 			ipif_t *ipif = ipif_get_next_ipif(NULL, ill);
8968 			if (ipif != NULL) {
8969 				(void) ip_ire_clookup_and_delete(ipaddr, ipif);
8970 				ipif_refrele(ipif);
8971 			}
8972 			break;
8973 		}
8974 	}
8975 	}
8976 	iocp->ioc_cmd = area->area_cmd;
8977 
8978 	/*
8979 	 * Before sending 'mp' to ARP, we have to clear the b_next
8980 	 * and b_prev. Otherwise if STREAMS encounters such a message
8981 	 * in freemsg(), (because ARP can close any time) it can cause
8982 	 * a panic. But mi code needs the b_next and b_prev values of
8983 	 * mp->b_cont to complete the ioctl. So we store them here
8984 	 * in pending_mp->b_cont, and restore them in ip_sioctl_iocack()
8985 	 * when the response comes down from ARP.
8986 	 */
8987 	pending_mp->b_cont->b_next = mp->b_cont->b_next;
8988 	pending_mp->b_cont->b_prev = mp->b_cont->b_prev;
8989 	mp->b_cont->b_next = NULL;
8990 	mp->b_cont->b_prev = NULL;
8991 
8992 	mutex_enter(&connp->conn_lock);
8993 	mutex_enter(&ill->ill_lock);
8994 	/* conn has not yet started closing, hence this can't fail */
8995 	success = ill_pending_mp_add(ill, connp, pending_mp);
8996 	ASSERT(success);
8997 	mutex_exit(&ill->ill_lock);
8998 	mutex_exit(&connp->conn_lock);
8999 
9000 	/*
9001 	 * Fill in the rest of the ARP operation fields.
9002 	 */
9003 	area->area_hw_addr_length = alength;
9004 	bcopy(lladdr,
9005 	    (char *)area + area->area_hw_addr_offset,
9006 	    area->area_hw_addr_length);
9007 	/* Translate the flags. */
9008 	if (flags & ATF_PERM)
9009 		area->area_flags |= ACE_F_PERMANENT;
9010 	if (flags & ATF_PUBL)
9011 		area->area_flags |= ACE_F_PUBLISH;
9012 
9013 	/*
9014 	 * Up to ARP it goes.  The response will come
9015 	 * back in ip_wput as an M_IOCACK message, and
9016 	 * will be handed to ip_sioctl_iocack for
9017 	 * completion.
9018 	 */
9019 	putnext(ill->ill_rq, mp1);
9020 	return (EINPROGRESS);
9021 }
9022 
9023 /* ARGSUSED */
9024 int
9025 ip_sioctl_xarp(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
9026     ip_ioctl_cmd_t *ipip, void *ifreq)
9027 {
9028 	struct xarpreq *xar;
9029 	boolean_t isv6;
9030 	mblk_t	*mp1;
9031 	int	err;
9032 	conn_t	*connp;
9033 	int ifnamelen;
9034 	ire_t	*ire = NULL;
9035 	ill_t	*ill = NULL;
9036 	struct sockaddr_in *sin;
9037 	boolean_t if_arp_ioctl = B_FALSE;
9038 
9039 	/* ioctl comes down on a conn */
9040 	ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
9041 	connp = Q_TO_CONN(q);
9042 	isv6 = connp->conn_af_isv6;
9043 
9044 	/* Existence verified in ip_wput_nondata */
9045 	mp1 = mp->b_cont->b_cont;
9046 
9047 	ASSERT(MBLKL(mp1) >= sizeof (*xar));
9048 	xar = (struct xarpreq *)mp1->b_rptr;
9049 	sin = (sin_t *)&xar->xarp_pa;
9050 
9051 	if (isv6 || (xar->xarp_ha.sdl_family != AF_LINK) ||
9052 	    (xar->xarp_pa.ss_family != AF_INET))
9053 		return (ENXIO);
9054 
9055 	ifnamelen = xar->xarp_ha.sdl_nlen;
9056 	if (ifnamelen != 0) {
9057 		char	*cptr, cval;
9058 
9059 		if (ifnamelen >= LIFNAMSIZ)
9060 			return (EINVAL);
9061 
9062 		/*
9063 		 * Instead of bcopying a bunch of bytes,
9064 		 * null-terminate the string in situ.
9065 		 */
9066 		cptr = xar->xarp_ha.sdl_data + ifnamelen;
9067 		cval = *cptr;
9068 		*cptr = '\0';
9069 		ill = ill_lookup_on_name(xar->xarp_ha.sdl_data,
9070 		    B_FALSE, isv6, CONNP_TO_WQ(connp), mp, ip_process_ioctl,
9071 		    &err, NULL);
9072 		*cptr = cval;
9073 		if (ill == NULL)
9074 			return (err);
9075 		if (ill->ill_net_type != IRE_IF_RESOLVER) {
9076 			ill_refrele(ill);
9077 			return (ENXIO);
9078 		}
9079 
9080 		if_arp_ioctl = B_TRUE;
9081 	} else {
9082 		/*
9083 		 * PSARC 2003/088 states that if sdl_nlen == 0, it behaves
9084 		 * as an extended BSD ioctl. The kernel uses the IP address
9085 		 * to figure out the network interface.
9086 		 */
9087 		ire = ire_cache_lookup(sin->sin_addr.s_addr, ALL_ZONES);
9088 		if ((ire == NULL) || (ire->ire_type == IRE_LOOPBACK) ||
9089 		    ((ill = ire_to_ill(ire)) == NULL)) {
9090 			if (ire != NULL)
9091 				ire_refrele(ire);
9092 			ire = ire_ftable_lookup(sin->sin_addr.s_addr,
9093 			    0, 0, IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0,
9094 			    MATCH_IRE_TYPE);
9095 			if ((ire == NULL) ||
9096 			    ((ill = ire_to_ill(ire)) == NULL)) {
9097 				if (ire != NULL)
9098 					ire_refrele(ire);
9099 				return (ENXIO);
9100 			}
9101 		}
9102 		ASSERT(ire != NULL && ill != NULL);
9103 	}
9104 
9105 	err = ip_sioctl_arp_common(ill, q, mp, sin, B_TRUE, if_arp_ioctl);
9106 	if (if_arp_ioctl)
9107 		ill_refrele(ill);
9108 	if (ire != NULL)
9109 		ire_refrele(ire);
9110 
9111 	return (err);
9112 }
9113 
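/*
 * A hedged sketch of the two SIOCGXARP lookup styles ip_sioctl_xarp()
 * distinguishes above ("bge0" and the address are hypothetical): a
 * non-zero sdl_nlen names the interface explicitly (PSARC 2003/088
 * interface style), while sdl_nlen == 0 makes the kernel derive the
 * interface from the IP address (extended BSD style).
 *
 *	struct xarpreq xar;
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&xar.xarp_pa;
 *
 *	(void) memset(&xar, 0, sizeof (xar));
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("10.0.0.1");
 *	xar.xarp_ha.sdl_family = AF_LINK;
 *	(void) strlcpy(xar.xarp_ha.sdl_data, "bge0",
 *	    sizeof (xar.xarp_ha.sdl_data));
 *	xar.xarp_ha.sdl_nlen = strlen("bge0");	(0 for BSD-style lookup)
 *	if (ioctl(s, SIOCGXARP, &xar) == 0)
 *		(void) printf("got %d-byte lladdr\n", xar.xarp_ha.sdl_alen);
 */
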
9114 /*
9115  * ARP IOCTLs.
9116  * How does IP get in the business of fronting ARP configuration/queries?
9117  * Well, it's like this: the Berkeley ARP IOCTLs (SIOCGARP, SIOCDARP, SIOCSARP)
9118  * are by tradition passed in through a datagram socket.  That lands in IP.
9119  * As it happens, this is just as well since the interface is quite crude in
9120  * that it passes in no information about protocol or hardware types, or
9121  * interface association.  After making the protocol assumption, IP is in
9122  * the position to look up the name of the ILL, which ARP will need, and
9123  * format a request that can be handled by ARP.  The request is passed
9124  * upstream to ARP, and the original IOCTL is completed by IP when ARP passes
9125  * back a response.  ARP supports its own set of more general IOCTLs, in
9126  * case anyone is interested.
9127  */
9128 /* ARGSUSED */
9129 int
9130 ip_sioctl_arp(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
9131     ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
9132 {
9133 	struct arpreq *ar;
9134 	struct sockaddr_in *sin;
9135 	ire_t	*ire;
9136 	boolean_t isv6;
9137 	mblk_t	*mp1;
9138 	int	err;
9139 	conn_t	*connp;
9140 	ill_t	*ill;
9141 
9142 	/* ioctl comes down on a conn */
9143 	ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
9144 	connp = Q_TO_CONN(q);
9145 	isv6 = connp->conn_af_isv6;
9146 	if (isv6)
9147 		return (ENXIO);
9148 
9149 	/* Existence verified in ip_wput_nondata */
9150 	mp1 = mp->b_cont->b_cont;
9151 
9152 	ar = (struct arpreq *)mp1->b_rptr;
9153 	sin = (sin_t *)&ar->arp_pa;
9154 
9155 	/*
9156 	 * We need to let ARP know on which interface the IP
9157 	 * address has an ARP mapping. In the IPMP case, a
9158 	 * simple forwarding table lookup will return the
9159 	 * IRE_IF_RESOLVER for the first interface in the group,
9160 	 * which might not be the interface on which the
9161 	 * requested IP address was resolved due to the ill
9162 	 * selection algorithm (see ip_newroute_get_dst_ill()).
9163 	 * So we do a cache table lookup first: if the IRE cache
9164 	 * entry for the IP address is still there, it will
9165 	 * contain the ill pointer for the right interface, so
9166 	 * we use that. If the cache entry has been flushed, we
9167 	 * fall back to the forwarding table lookup. This should
9168 	 * be rare enough since IRE cache entries have a longer
9169 	 * life expectancy than ARP cache entries.
9170 	 */
9171 	ire = ire_cache_lookup(sin->sin_addr.s_addr, ALL_ZONES);
9172 	if ((ire == NULL) || (ire->ire_type == IRE_LOOPBACK) ||
9173 	    ((ill = ire_to_ill(ire)) == NULL)) {
9174 		if (ire != NULL)
9175 			ire_refrele(ire);
9176 		ire = ire_ftable_lookup(sin->sin_addr.s_addr,
9177 		    0, 0, IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0,
9178 		    MATCH_IRE_TYPE);
9179 		if ((ire == NULL) || ((ill = ire_to_ill(ire)) == NULL)) {
9180 			if (ire != NULL)
9181 				ire_refrele(ire);
9182 			return (ENXIO);
9183 		}
9184 	}
9185 	ASSERT(ire != NULL && ill != NULL);
9186 
9187 	err = ip_sioctl_arp_common(ill, q, mp, sin, B_FALSE, B_FALSE);
9188 	ire_refrele(ire);
9189 	return (err);
9190 }
9191 
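/*
 * For reference, a minimal userland sketch (address hypothetical) of the
 * traditional BSD-style SIOCGARP described above, issued over a datagram
 * socket; per the 6-byte media assumption in ip_sioctl_arp_common(),
 * arp_ha.sa_data receives an ethernet address.
 *
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *	struct arpreq ar;
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&ar.arp_pa;
 *
 *	(void) memset(&ar, 0, sizeof (ar));
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("10.0.0.1");
 *	if (ioctl(s, SIOCGARP, &ar) == 0 && (ar.arp_flags & ATF_COM))
 *		(void) printf("entry complete; MAC in ar.arp_ha.sa_data\n");
 */
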
9192 /*
9193  * Do I_PLINK/I_LINK or I_PUNLINK/I_UNLINK with consistency checks and also
9194  * atomically set/clear the muxids. Also complete the ioctl by acking or
9195  * naking it.  Note that the code is structured such that the link type,
9196  * whether it's persistent or not, is treated equally.  ifconfig(1M) and
9197  * its clones use the persistent link, while pppd(1M) and perhaps many
9198  * other daemons may use a non-persistent link.  When combined with some
9199  * ill_t states, linking and unlinking lower streams may be used as
9200  * indicators of dynamic re-plumbing events [see PSARC/1999/348].
9201  */
9202 /* ARGSUSED */
9203 void
9204 ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
9205 {
9206 	mblk_t *mp1;
9207 	mblk_t *mp2;
9208 	struct linkblk *li;
9209 	queue_t	*ipwq;
9210 	char	*name;
9211 	struct qinit *qinfo;
9212 	struct ipmx_s *ipmxp;
9213 	ill_t	*ill = NULL;
9214 	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
9215 	int	err = 0;
9216 	boolean_t	entered_ipsq = B_FALSE;
9217 	boolean_t islink;
9218 	queue_t *dwq = NULL;
9219 
9220 	ASSERT(iocp->ioc_cmd == I_PLINK || iocp->ioc_cmd == I_PUNLINK ||
9221 	    iocp->ioc_cmd == I_LINK || iocp->ioc_cmd == I_UNLINK);
9222 
9223 	islink = (iocp->ioc_cmd == I_PLINK || iocp->ioc_cmd == I_LINK) ?
9224 	    B_TRUE : B_FALSE;
9225 
9226 	mp1 = mp->b_cont;	/* This is the linkblk info */
9227 	li = (struct linkblk *)mp1->b_rptr;
9228 
9229 	/*
9230 	 * ARP has added this special mblk, and the utility is asking us
9231 	 * to perform consistency checks, and also atomically set the
9232 	 * muxid. Ifconfig is an example.  It achieves this by using
9233 	 * /dev/arp as the mux to plink the arp stream, and pushes arp on
9234 	 * to /dev/udp[6] stream for use as the mux when plinking the IP
9235 	 * stream. SIOCSLIFMUXID is not required.  See ifconfig.c, arp.c
9236 	 * and other comments in this routine for more details.
9237 	 */
9238 	mp2 = mp1->b_cont;	/* This is added by ARP */
9239 
9240 	/*
9241 	 * If I_{P}LINK/I_{P}UNLINK is issued by a utility other than
9242 	 * ifconfig which didn't push ARP on top of the dummy mux, we won't
9243 	 * get the special mblk above.  For backward compatibility, we just
9244 	 * return success.  The utility will use SIOCSLIFMUXID to store
9245 	 * the muxids.  This is not atomic, and can leave the streams
9246  * unplumbable if the utility is interrupted before it does the
9247 	 * SIOCSLIFMUXID.
9248 	 */
9249 	if (mp2 == NULL) {
9250 		/*
9251 		 * At this point we don't know whether or not this is the
9252 		 * IP module stream or the ARP device stream.  We need to
9253 		 * walk the lower stream in order to find this out, since
9254 		 * the capability negotiation is done only on the IP module
9255 		 * stream.  An IP module instance is identified by the module
9256 		 * name IP, a non-null q_next, and its wput not being ip_lwput.
9257 		 * STREAMS ensures that the lower stream (l_qbot) will not
9258 		 * vanish until this ioctl completes. So we can safely walk
9259 		 * the stream or refer to the q_ptr.
9260 		 */
9261 		ipwq = li->l_qbot;
9262 		while (ipwq != NULL) {
9263 			qinfo = ipwq->q_qinfo;
9264 			name = qinfo->qi_minfo->mi_idname;
9265 			if (name != NULL && name[0] != '\0' &&
9266 			    (strcmp(name, ip_mod_info.mi_idname) == 0) &&
9267 			    ((void *)(qinfo->qi_putp) != (void *)ip_lwput) &&
9268 			    (ipwq->q_next != NULL)) {
9269 				break;
9270 			}
9271 			ipwq = ipwq->q_next;
9272 		}
9273 		/*
9274 		 * This looks like an IP module stream, so trigger
9275 		 * the capability reset or re-negotiation if necessary.
9276 		 */
9277 		if (ipwq != NULL) {
9278 			ill = ipwq->q_ptr;
9279 			ASSERT(ill != NULL);
9280 
9281 			if (ipsq == NULL) {
9282 				ipsq = ipsq_try_enter(NULL, ill, q, mp,
9283 				    ip_sioctl_plink, NEW_OP, B_TRUE);
9284 				if (ipsq == NULL)
9285 					return;
9286 				entered_ipsq = B_TRUE;
9287 			}
9288 			ASSERT(IAM_WRITER_ILL(ill));
9289 			/*
9290 			 * Store the upper read queue of the module
9291 			 * immediately below IP, and count the total
9292 			 * number of lower modules.  Do this only
9293 			 * for I_PLINK or I_LINK event.
9294 			 */
9295 			ill->ill_lmod_rq = NULL;
9296 			ill->ill_lmod_cnt = 0;
9297 			if (islink && (dwq = ipwq->q_next) != NULL) {
9298 				ill->ill_lmod_rq = RD(dwq);
9299 
9300 				while (dwq != NULL) {
9301 					ill->ill_lmod_cnt++;
9302 					dwq = dwq->q_next;
9303 				}
9304 			}
9305 			/*
9306 			 * There's no point in resetting or re-negotiating if
9307 			 * we are not bound to the driver, so only do this if
9308 			 * the DLPI state is idle (up); we assume such state
9309 			 * since ill_ipif_up_count gets incremented in
9310 			 * ipif_up_done(), which is after we are bound to the
9311 			 * driver.  Note that in the case of logical
9312 			 * interfaces, IP won't rebind to the driver unless
9313 			 * the ill_ipif_up_count is 0, meaning that all other
9314 			 * IP interfaces (including the main ipif) are in the
9315 			 * down state.  Because of this, we use such counter
9316 			 * as an indicator, instead of relying on the IPIF_UP
9317 			 * flag, which is per ipif instance.
9318 			 */
9319 			if (ill->ill_ipif_up_count > 0) {
9320 				if (islink)
9321 					ill_capability_probe(ill);
9322 				else
9323 					ill_capability_reset(ill);
9324 			}
9325 		}
9326 		goto done;
9327 	}
9328 
9329 	/*
9330 	 * This is an I_{P}LINK sent down by ifconfig on
9331 	 * /dev/arp. ARP has appended this last (3rd) mblk,
9332 	 * giving more info. STREAMS ensures that the lower
9333 	 * stream (l_qbot) will not vanish until this ioctl
9334 	 * completes. So we can safely walk the stream or refer
9335 	 * to the q_ptr.
9336 	 */
9337 	ipmxp = (struct ipmx_s *)mp2->b_rptr;
9338 	if (ipmxp->ipmx_arpdev_stream) {
9339 		/*
9340 		 * The operation is occurring on the arp-device
9341 		 * stream.
9342 		 */
9343 		ill = ill_lookup_on_name(ipmxp->ipmx_name, B_FALSE, B_FALSE,
9344 		    q, mp, ip_sioctl_plink, &err, NULL);
9345 		if (ill == NULL) {
9346 			if (err == EINPROGRESS) {
9347 				return;
9348 			} else {
9349 				err = EINVAL;
9350 				goto done;
9351 			}
9352 		}
9353 
9354 		if (ipsq == NULL) {
9355 			ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink,
9356 			    NEW_OP, B_TRUE);
9357 			if (ipsq == NULL) {
9358 				ill_refrele(ill);
9359 				return;
9360 			}
9361 			entered_ipsq = B_TRUE;
9362 		}
9363 		ASSERT(IAM_WRITER_ILL(ill));
9364 		ill_refrele(ill);
9365 		/*
9366 		 * To ensure consistency between IP and ARP,
9367 		 * the following LIFO scheme is used in
9368 		 * plink/punlink. (IP first, ARP last).
9369 		 * This is because the muxids are stored
9370 		 * in the IP stream on the ill.
9371 		 *
9372 		 * I_{P}LINK: ifconfig plinks the IP stream before
9373 		 * plinking the ARP stream. On an arp-dev
9374 		 * stream, IP checks that it is not yet
9375 		 * plinked, and it also checks that the
9376 		 * corresponding IP stream is already plinked.
9377 		 *
9378 		 * I_{P}UNLINK: ifconfig punlinks the ARP stream
9379 		 * before punlinking the IP stream. IP does
9380 		 * not allow punlink of the IP stream unless
9381 		 * the arp stream has been punlinked.
9382 		 *
9383 		 */
9384 		if ((islink &&
9385 		    (ill->ill_arp_muxid != 0 || ill->ill_ip_muxid == 0)) ||
9386 		    (!islink &&
9387 		    ill->ill_arp_muxid != li->l_index)) {
9388 			err = EINVAL;
9389 			goto done;
9390 		}
9391 		if (islink) {
9392 			ill->ill_arp_muxid = li->l_index;
9393 		} else {
9394 			ill->ill_arp_muxid = 0;
9395 		}
9396 	} else {
9397 		/*
9398 		 * This must be the IP module stream with or
9399 		 * without arp. Walk the stream and locate the
9400 		 * IP module. An IP module instance is
9401 		 * identified by the module name IP, a non-null
9402 		 * q_next, and its wput not being ip_lwput.
9403 		 */
9404 		ipwq = li->l_qbot;
9405 		while (ipwq != NULL) {
9406 			qinfo = ipwq->q_qinfo;
9407 			name = qinfo->qi_minfo->mi_idname;
9408 			if (name != NULL && name[0] != '\0' &&
9409 			    (strcmp(name, ip_mod_info.mi_idname) == 0) &&
9410 			    ((void *)(qinfo->qi_putp) != (void *)ip_lwput) &&
9411 			    (ipwq->q_next != NULL)) {
9412 				break;
9413 			}
9414 			ipwq = ipwq->q_next;
9415 		}
9416 		if (ipwq != NULL) {
9417 			ill = ipwq->q_ptr;
9418 			ASSERT(ill != NULL);
9419 
9420 			if (ipsq == NULL) {
9421 				ipsq = ipsq_try_enter(NULL, ill, q, mp,
9422 				    ip_sioctl_plink, NEW_OP, B_TRUE);
9423 				if (ipsq == NULL)
9424 					return;
9425 				entered_ipsq = B_TRUE;
9426 			}
9427 			ASSERT(IAM_WRITER_ILL(ill));
9428 			/*
9429 			 * Return an error if ill_ip_muxid is
9430 			 * non-zero and the command is I_{P}LINK.
9431 			 * If the command is I_{P}UNLINK, return
9432 			 * an error if the arp-device stream is
9433 			 * not yet punlinked.
9434 			 */
9435 			if ((islink && ill->ill_ip_muxid != 0) ||
9436 			    (!islink && ill->ill_arp_muxid != 0)) {
9437 				err = EINVAL;
9438 				goto done;
9439 			}
9440 			ill->ill_lmod_rq = NULL;
9441 			ill->ill_lmod_cnt = 0;
9442 			if (islink) {
9443 				/*
9444 				 * Store the upper read queue of the module
9445 				 * immediately below IP, and count the total
9446 				 * number of lower modules.
9447 				 */
9448 				if ((dwq = ipwq->q_next) != NULL) {
9449 					ill->ill_lmod_rq = RD(dwq);
9450 
9451 					while (dwq != NULL) {
9452 						ill->ill_lmod_cnt++;
9453 						dwq = dwq->q_next;
9454 					}
9455 				}
9456 				ill->ill_ip_muxid = li->l_index;
9457 			} else {
9458 				ill->ill_ip_muxid = 0;
9459 			}
9460 
9461 			/*
9462 			 * See comments above about resetting/re-
9463 			 * negotiating driver sub-capabilities.
9464 			 */
9465 			if (ill->ill_ipif_up_count > 0) {
9466 				if (islink)
9467 					ill_capability_probe(ill);
9468 				else
9469 					ill_capability_reset(ill);
9470 			}
9471 		}
9472 	}
9473 done:
9474 	iocp->ioc_count = 0;
9475 	iocp->ioc_error = err;
9476 	if (err == 0)
9477 		mp->b_datap->db_type = M_IOCACK;
9478 	else
9479 		mp->b_datap->db_type = M_IOCNAK;
9480 	qreply(q, mp);
9481 
9482 	/* Conn was refheld in ip_sioctl_copyin_setup */
9483 	if (CONN_Q(q))
9484 		CONN_OPER_PENDING_DONE(Q_TO_CONN(q));
9485 	if (entered_ipsq)
9486 		ipsq_exit(ipsq, B_TRUE, B_TRUE);
9487 }
9488 
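/*
 * A rough, hedged sketch of the ifconfig-style plumbing sequence that
 * ip_sioctl_plink() services; the device name is hypothetical and the
 * naming ioctls are elided (see ifconfig.c and arp.c for the
 * authoritative flow).
 *
 *	ip_fd = open("/dev/bge", O_RDWR);		DLPI device
 *	(void) ioctl(ip_fd, I_PUSH, "ip");		IP module stream
 *	... SIOCSLIFNAME / IF_UNITSEL ...
 *	mux_fd = open("/dev/udp", O_RDWR);
 *	(void) ioctl(mux_fd, I_PUSH, "arp");		arp appends ipmx_s
 *	ip_muxid = ioctl(mux_fd, I_PLINK, ip_fd);	IP plinked first
 *	... the ARP stream is then plinked under /dev/arp (ARP last) ...
 */
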
9489 /*
9490  * Search the ioctl command in the ioctl tables and return a pointer
9491  * to the ioctl command information. The ioctl command tables are
9492  * static and fully populated at compile time.
9493  */
9494 ip_ioctl_cmd_t *
9495 ip_sioctl_lookup(int ioc_cmd)
9496 {
9497 	int index;
9498 	ip_ioctl_cmd_t *ipip;
9499 	ip_ioctl_cmd_t *ipip_end;
9500 
9501 	if (ioc_cmd == IPI_DONTCARE)
9502 		return (NULL);
9503 
9504 	/*
9505 	 * Do a 2 step search. First search the indexed table
9506 	 * based on the least significant byte of the ioctl cmd.
9507 	 * If we don't find a match, then search the misc table
9508 	 * serially.
9509 	 */
9510 	index = ioc_cmd & 0xFF;
9511 	if (index < ip_ndx_ioctl_count) {
9512 		ipip = &ip_ndx_ioctl_table[index];
9513 		if (ipip->ipi_cmd == ioc_cmd) {
9514 			/* Found a match in the ndx table */
9515 			return (ipip);
9516 		}
9517 	}
9518 
9519 	/* Search the misc table */
9520 	ipip_end = &ip_misc_ioctl_table[ip_misc_ioctl_count];
9521 	for (ipip = ip_misc_ioctl_table; ipip < ipip_end; ipip++) {
9522 		if (ipip->ipi_cmd == ioc_cmd)
9523 			/* Found a match in the misc table */
9524 			return (ipip);
9525 	}
9526 
9527 	return (NULL);
9528 }
9529 
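/*
 * Worked example (values illustrative): for a command whose least
 * significant byte is 0x2a, the first probe is ip_ndx_ioctl_table[0x2a];
 * only if that slot's ipi_cmd doesn't match (or 0x2a is beyond
 * ip_ndx_ioctl_count) does the linear scan of ip_misc_ioctl_table run.
 * ip_sioctl_copyin_setup() below uses the result like this:
 *
 *	ip_ioctl_cmd_t *ipip = ip_sioctl_lookup(iocp->ioc_cmd);
 *	if (ipip != NULL && ipip->ipi_copyin_size != 0)
 *		mi_copyin(q, mp, NULL, ipip->ipi_copyin_size);
 */
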
9530 /*
9531  * Wrapper function for resuming deferred ioctl processing
9532  * Used for SIOCGDSTINFO, SIOCGIP6ADDRPOLICY, SIOCGMSFILTER,
9533  * SIOCSMSFILTER, SIOCGIPMSFILTER, and SIOCSIPMSFILTER currently.
9534  */
9535 /* ARGSUSED */
9536 void
9537 ip_sioctl_copyin_resume(ipsq_t *dummy_ipsq, queue_t *q, mblk_t *mp,
9538     void *dummy_arg)
9539 {
9540 	ip_sioctl_copyin_setup(q, mp);
9541 }
9542 
9543 /*
9544  * ip_sioctl_copyin_setup is called by ip_wput with any M_IOCTL message
9545  * that arrives.  Most of the IOCTLs are "socket" IOCTLs which we handle
9546  * in either I_STR or TRANSPARENT form, using the mi_copy facility.
9547  * We establish here the size of the block to be copied in.  mi_copyin
9548  * arranges for this to happen, and processing continues in ip_wput with
9549  * an M_IOCDATA message.
9550  */
9551 void
9552 ip_sioctl_copyin_setup(queue_t *q, mblk_t *mp)
9553 {
9554 	int	copyin_size;
9555 	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
9556 	ip_ioctl_cmd_t *ipip;
9557 	cred_t *cr;
9558 
9559 	ipip = ip_sioctl_lookup(iocp->ioc_cmd);
9560 	if (ipip == NULL) {
9561 		/*
9562 		 * The ioctl is not one we understand or own.
9563 		 * Pass it along to be processed down stream,
9564 		 * if this is a module instance of IP, else nak
9565 		 * the ioctl.
9566 		 */
9567 		if (q->q_next == NULL) {
9568 			goto nak;
9569 		} else {
9570 			putnext(q, mp);
9571 			return;
9572 		}
9573 	}
9574 
9575 	/*
9576 	 * If this is deferred, then we will do all the checks when we
9577 	 * come back.
9578 	 */
9579 	if ((iocp->ioc_cmd == SIOCGDSTINFO ||
9580 	    iocp->ioc_cmd == SIOCGIP6ADDRPOLICY) && !ip6_asp_can_lookup()) {
9581 		ip6_asp_pending_op(q, mp, ip_sioctl_copyin_resume);
9582 		return;
9583 	}
9584 
9585 	/*
9586 	 * Only allow a very small subset of IP ioctls on this stream if
9587 	 * IP is a module and not a driver. Allowing ioctls to be processed
9588 	 * in this case may cause assert failures or data corruption.
9589 	 * Typically G[L]IFFLAGS, SLIFNAME/IF_UNITSEL are the only few
9590 	 * ioctls allowed on an IP module stream, after which this stream
9591 	 * normally becomes a multiplexor (at which time the stream head
9592 	 * will fail all ioctls).
9593 	 */
9594 	if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) {
9595 		if (ipip->ipi_flags & IPI_PASS_DOWN) {
9596 			/*
9597 			 * Pass common Streams ioctls which the IP
9598 			 * module does not own or consume along to
9599 			 * be processed down stream.
9600 			 */
9601 			putnext(q, mp);
9602 			return;
9603 		} else {
9604 			goto nak;
9605 		}
9606 	}
9607 
9608 	/* Make sure we have ioctl data to process. */
9609 	if (mp->b_cont == NULL && !(ipip->ipi_flags & IPI_NULL_BCONT))
9610 		goto nak;
9611 
9612 	/*
9613 	 * Prefer dblk credential over ioctl credential; some synthesized
9614 	 * ioctls have kcred set because there's no way to crhold()
9615  * a credential in some contexts.  (ioc_cr is not crfree()'d by
9616  * the framework; the caller of the ioctl needs to hold the reference
9617 	 * for the duration of the call).
9618 	 */
9619 	cr = DB_CREDDEF(mp, iocp->ioc_cr);
9620 
9621 	/* Make sure normal users don't send down privileged ioctls */
9622 	if ((ipip->ipi_flags & IPI_PRIV) &&
9623 	    (cr != NULL) && secpolicy_net_config(cr, B_TRUE) != 0) {
9624 		/* We checked the privilege earlier but log it here */
9625 		miocnak(q, mp, 0, secpolicy_net_config(cr, B_FALSE));
9626 		return;
9627 	}
9628 
9629 	/*
9630 	 * The ioctl command tables can only encode fixed length
9631 	 * ioctl data. If the length is variable, the table will
9632 	 * encode the length as zero. Such special cases are handled
9633 	 * below in the switch.
9634 	 */
9635 	if (ipip->ipi_copyin_size != 0) {
9636 		mi_copyin(q, mp, NULL, ipip->ipi_copyin_size);
9637 		return;
9638 	}
9639 
9640 	switch (iocp->ioc_cmd) {
9641 	case O_SIOCGIFCONF:
9642 	case SIOCGIFCONF:
9643 		/*
9644 		 * This IOCTL is hilarious.  See comments in
9645 		 * ip_sioctl_get_ifconf for the story.
9646 		 */
9647 		if (iocp->ioc_count == TRANSPARENT)
9648 			copyin_size = SIZEOF_STRUCT(ifconf,
9649 			    iocp->ioc_flag);
9650 		else
9651 			copyin_size = iocp->ioc_count;
9652 		mi_copyin(q, mp, NULL, copyin_size);
9653 		return;
9654 
9655 	case O_SIOCGLIFCONF:
9656 	case SIOCGLIFCONF:
9657 		copyin_size = SIZEOF_STRUCT(lifconf, iocp->ioc_flag);
9658 		mi_copyin(q, mp, NULL, copyin_size);
9659 		return;
9660 
9661 	case SIOCGLIFSRCOF:
9662 		copyin_size = SIZEOF_STRUCT(lifsrcof, iocp->ioc_flag);
9663 		mi_copyin(q, mp, NULL, copyin_size);
9664 		return;
9665 	case SIOCGIP6ADDRPOLICY:
9666 		ip_sioctl_ip6addrpolicy(q, mp);
9667 		ip6_asp_table_refrele();
9668 		return;
9669 
9670 	case SIOCSIP6ADDRPOLICY:
9671 		ip_sioctl_ip6addrpolicy(q, mp);
9672 		return;
9673 
9674 	case SIOCGDSTINFO:
9675 		ip_sioctl_dstinfo(q, mp);
9676 		ip6_asp_table_refrele();
9677 		return;
9678 
9679 	case I_PLINK:
9680 	case I_PUNLINK:
9681 	case I_LINK:
9682 	case I_UNLINK:
9683 		/*
9684 		 * We treat non-persistent link similarly as the persistent
9685 		 * link case, in terms of plumbing/unplumbing, as well as
9686 		 * dynamic re-plumbing events indicator.  See comments
9687 		 * in ip_sioctl_plink() for more.
9688 		 *
9689 		 * Request can be enqueued in the 'ipsq' while waiting
9690 		 * to become exclusive. So bump up the conn ref.
9691 		 */
9692 		if (CONN_Q(q))
9693 			CONN_INC_REF(Q_TO_CONN(q));
9694 		ip_sioctl_plink(NULL, q, mp, NULL);
9695 		return;
9696 
9697 	case ND_GET:
9698 	case ND_SET:
9699 		/*
9700 		 * Use of the nd table requires holding the reader lock.
9701 		 * Modifying the nd table thru nd_load/nd_unload requires
9702 		 * the writer lock.
9703 		 */
9704 		rw_enter(&ip_g_nd_lock, RW_READER);
9705 		if (nd_getset(q, ip_g_nd, mp)) {
9706 			rw_exit(&ip_g_nd_lock);
9707 
9708 			if (iocp->ioc_error)
9709 				iocp->ioc_count = 0;
9710 			mp->b_datap->db_type = M_IOCACK;
9711 			qreply(q, mp);
9712 			return;
9713 		}
9714 		rw_exit(&ip_g_nd_lock);
9715 		/*
9716 		 * We don't understand this subioctl of ND_GET / ND_SET.
9717 		 * Maybe intended for some driver / module below us
9718 		 */
9719 		if (q->q_next) {
9720 			putnext(q, mp);
9721 		} else {
9722 			iocp->ioc_error = ENOENT;
9723 			mp->b_datap->db_type = M_IOCNAK;
9724 			iocp->ioc_count = 0;
9725 			qreply(q, mp);
9726 		}
9727 		return;
9728 
9729 	case IP_IOCTL:
9730 		ip_wput_ioctl(q, mp);
9731 		return;
9732 	default:
9733 		cmn_err(CE_PANIC, "should not happen");
9734 	}
9735 nak:
9736 	if (mp->b_cont != NULL) {
9737 		freemsg(mp->b_cont);
9738 		mp->b_cont = NULL;
9739 	}
9740 	iocp->ioc_error = EINVAL;
9741 	mp->b_datap->db_type = M_IOCNAK;
9742 	iocp->ioc_count = 0;
9743 	qreply(q, mp);
9744 }
9745 
9746 /* ip_wput hands off ARP IOCTL responses to us */
9747 void
9748 ip_sioctl_iocack(queue_t *q, mblk_t *mp)
9749 {
9750 	struct arpreq *ar;
9751 	struct xarpreq *xar;
9752 	area_t	*area;
9753 	mblk_t	*area_mp;
9754 	struct iocblk *iocp;
9755 	mblk_t	*orig_ioc_mp, *tmp;
9756 	struct iocblk	*orig_iocp;
9757 	ill_t *ill;
9758 	conn_t *connp = NULL;
9759 	uint_t ioc_id;
9760 	mblk_t *pending_mp;
9761 	int x_arp_ioctl = B_FALSE, ifx_arp_ioctl = B_FALSE;
9762 	int *flagsp;
9763 	char *storage = NULL;
9764 	sin_t *sin;
9765 	ipaddr_t addr;
9766 	int err;
9767 
9768 	ill = q->q_ptr;
9769 	ASSERT(ill != NULL);
9770 
9771 	/*
9772 	 * We should get back from ARP a packet chain that looks like:
9773 	 * M_IOCACK-->ARP_op_MBLK-->ORIG_M_IOCTL-->MI_COPY_MBLK-->[X]ARPREQ_MBLK
9774 	 */
9775 	if (!(area_mp = mp->b_cont) ||
9776 	    (area_mp->b_wptr - area_mp->b_rptr) < sizeof (ip_sock_ar_t) ||
9777 	    !(orig_ioc_mp = area_mp->b_cont) ||
9778 	    !orig_ioc_mp->b_cont || !orig_ioc_mp->b_cont->b_cont) {
9779 		freemsg(mp);
9780 		return;
9781 	}
9782 
9783 	orig_iocp = (struct iocblk *)orig_ioc_mp->b_rptr;
9784 
9785 	tmp = (orig_ioc_mp->b_cont)->b_cont;
9786 	if ((orig_iocp->ioc_cmd == SIOCGXARP) ||
9787 	    (orig_iocp->ioc_cmd == SIOCSXARP) ||
9788 	    (orig_iocp->ioc_cmd == SIOCDXARP)) {
9789 		x_arp_ioctl = B_TRUE;
9790 		xar = (struct xarpreq *)tmp->b_rptr;
9791 		sin = (sin_t *)&xar->xarp_pa;
9792 		flagsp = &xar->xarp_flags;
9793 		storage = xar->xarp_ha.sdl_data;
9794 		if (xar->xarp_ha.sdl_nlen != 0)
9795 			ifx_arp_ioctl = B_TRUE;
9796 	} else {
9797 		ar = (struct arpreq *)tmp->b_rptr;
9798 		sin = (sin_t *)&ar->arp_pa;
9799 		flagsp = &ar->arp_flags;
9800 		storage = ar->arp_ha.sa_data;
9801 	}
9802 
9803 	iocp = (struct iocblk *)mp->b_rptr;
9804 
9805 	/*
9806 	 * Pick out the originating queue based on the ioc_id.
9807 	 */
9808 	ioc_id = iocp->ioc_id;
9809 	pending_mp = ill_pending_mp_get(ill, &connp, ioc_id);
9810 	if (pending_mp == NULL) {
9811 		ASSERT(connp == NULL);
9812 		inet_freemsg(mp);
9813 		return;
9814 	}
9815 	ASSERT(connp != NULL);
9816 	q = CONNP_TO_WQ(connp);
9817 
9818 	/* Uncouple the internally generated IOCTL from the original one */
9819 	area = (area_t *)area_mp->b_rptr;
9820 	area_mp->b_cont = NULL;
9821 
9822 	/*
9823 	 * Restore the b_next and b_prev used by mi code. This is needed
9824 	 * to complete the ioctl using mi* functions. We stored them in
9825 	 * the pending mp prior to sending the request to ARP.
9826 	 */
9827 	orig_ioc_mp->b_cont->b_next = pending_mp->b_cont->b_next;
9828 	orig_ioc_mp->b_cont->b_prev = pending_mp->b_cont->b_prev;
9829 	inet_freemsg(pending_mp);
9830 
9831 	/*
9832 	 * We're done if there was an error or if this is not an SIOCG{X}ARP.
9833 	 * Catch the case where there is an IRE_CACHE but no entry in the
9834 	 * arp table.
9835 	 */
9836 	addr = sin->sin_addr.s_addr;
9837 	if (iocp->ioc_error && iocp->ioc_cmd == AR_ENTRY_SQUERY) {
9838 		ire_t			*ire;
9839 		dl_unitdata_req_t	*dlup;
9840 		mblk_t			*llmp;
9841 		int			addr_len;
9842 		ill_t			*ipsqill = NULL;
9843 
9844 		if (ifx_arp_ioctl) {
9845 			/*
9846 			 * There's no need to look up the ill, since
9847 			 * we've already done that when we started
9848 			 * processing the ioctl and sent the message
9849 			 * to ARP on that ill.  So use the ill that
9850 			 * is stored in q->q_ptr.
9851 			 */
9852 			ipsqill = ill;
9853 			ire = ire_ctable_lookup(addr, 0, IRE_CACHE,
9854 			    ipsqill->ill_ipif, ALL_ZONES,
9855 			    MATCH_IRE_TYPE | MATCH_IRE_ILL);
9856 		} else {
9857 			ire = ire_ctable_lookup(addr, 0, IRE_CACHE,
9858 			    NULL, ALL_ZONES, MATCH_IRE_TYPE);
9859 			if (ire != NULL)
9860 				ipsqill = ire_to_ill(ire);
9861 		}
9862 
9863 		if ((x_arp_ioctl) && (ipsqill != NULL))
9864 			storage += ill_xarp_info(&xar->xarp_ha, ipsqill);
9865 
9866 		if (ire != NULL) {
9867 			*flagsp = ATF_INUSE;
9868 			llmp = ire->ire_dlureq_mp;
9869 			if (llmp != NULL && ipsqill != NULL) {
9870 				uchar_t *macaddr;
9871 
9872 				addr_len = ipsqill->ill_phys_addr_length;
9873 				if (x_arp_ioctl && ((addr_len +
9874 				    ipsqill->ill_name_length) >
9875 				    sizeof (xar->xarp_ha.sdl_data))) {
9876 					ire_refrele(ire);
9877 					freemsg(mp);
9878 					ip_ioctl_finish(q, orig_ioc_mp,
9879 					    EINVAL, NO_COPYOUT, NULL, NULL);
9880 					return;
9881 				}
9882 				*flagsp |= ATF_COM;
9883 				dlup = (dl_unitdata_req_t *)llmp->b_rptr;
9884 				if (ipsqill->ill_sap_length < 0)
9885 					macaddr = llmp->b_rptr +
9886 					    dlup->dl_dest_addr_offset;
9887 				else
9888 					macaddr = llmp->b_rptr +
9889 					    dlup->dl_dest_addr_offset +
9890 					    ipsqill->ill_sap_length;
9891 				/*
9892 				 * For SIOCGARP, MAC address length
9893 				 * validation has already been done
9894 				 * before the ioctl was issued to ARP to
9895 				 * allow it to progress only on 6 byte
9896 				 * addressable (ethernet like) media. Thus
9897 				 * the mac address copying can not overwrite
9898 				 * the sa_data area below.
9899 				 */
9900 				bcopy(macaddr, storage, addr_len);
9901 			}
9902 			/* Ditch the internal IOCTL. */
9903 			freemsg(mp);
9904 			ire_refrele(ire);
9905 			ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, NULL, NULL);
9906 			return;
9907 		}
9908 	}
9909 
9910 	/*
9911 	 * Delete the corresponding IRE_CACHE, if any.
9912 	 * Reset the error if there was one (in case there was no entry
9913 	 * in arp.)
9914 	 */
9915 	if (iocp->ioc_cmd == AR_ENTRY_DELETE) {
9916 		ipif_t *ipintf = NULL;
9917 
9918 		if (ifx_arp_ioctl) {
9919 			/*
9920 			 * There's no need to look up the ill, since
9921 			 * we've already done that when we started
9922 			 * processing the ioctl and sent the message
9923 			 * to ARP on that ill.  So use the ill that
9924 			 * is stored in q->q_ptr.
9925 			 */
9926 			ipintf = ill->ill_ipif;
9927 		}
9928 		if (ip_ire_clookup_and_delete(addr, ipintf)) {
9929 			/*
9930 			 * The address in "addr" may be an entry for a
9931 			 * router. If that's true, then any off-net
9932 			 * IRE_CACHE entries that go through the router
9933 			 * with address "addr" must be clobbered. Use
9934 			 * ire_walk to achieve this goal.
9935 			 */
9936 			if (ifx_arp_ioctl)
9937 				ire_walk_ill_v4(MATCH_IRE_ILL, 0,
9938 				    ire_delete_cache_gw, (char *)&addr, ill);
9939 			else
9940 				ire_walk_v4(ire_delete_cache_gw, (char *)&addr,
9941 				    ALL_ZONES);
9942 			iocp->ioc_error = 0;
9943 		}
9944 	}
9945 
9946 	if (iocp->ioc_error || iocp->ioc_cmd != AR_ENTRY_SQUERY) {
9947 		err = iocp->ioc_error;
9948 		freemsg(mp);
9949 		ip_ioctl_finish(q, orig_ioc_mp, err, NO_COPYOUT, NULL, NULL);
9950 		return;
9951 	}
9952 
9953 	/*
9954 	 * Completion of an SIOCG{X}ARP.  Translate the information from
9955 	 * the area_t into the struct {x}arpreq.
9956 	 */
9957 	if (x_arp_ioctl) {
9958 		storage += ill_xarp_info(&xar->xarp_ha, ill);
9959 		if ((ill->ill_phys_addr_length + ill->ill_name_length) >
9960 		    sizeof (xar->xarp_ha.sdl_data)) {
9961 			freemsg(mp);
9962 			ip_ioctl_finish(q, orig_ioc_mp, EINVAL,
9963 			    NO_COPYOUT, NULL, NULL);
9964 			return;
9965 		}
9966 	}
9967 	*flagsp = ATF_INUSE;
9968 	if (area->area_flags & ACE_F_PERMANENT)
9969 		*flagsp |= ATF_PERM;
9970 	if (area->area_flags & ACE_F_PUBLISH)
9971 		*flagsp |= ATF_PUBL;
9972 	if (area->area_hw_addr_length != 0) {
9973 		*flagsp |= ATF_COM;
9974 		/*
9975 		 * For SIOCGARP, MAC address length validation has
9976 		 * already been done before the ioctl was issued to ARP
9977 		 * to allow it to progress only on 6 byte addressable
9978 		 * (ethernet like) media. Thus the mac address copying
9979 		 * can not overwrite the sa_data area below.
9980 		 */
9981 		bcopy((char *)area + area->area_hw_addr_offset,
9982 		    storage, area->area_hw_addr_length);
9983 	}
9984 
9985 	/* Ditch the internal IOCTL. */
9986 	freemsg(mp);
9987 	/* Complete the original. */
9988 	ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, NULL, NULL);
9989 }
9990 
9991 /*
9992  * Create a new logical interface. If ipif_id is zero (i.e. not a logical
9993  * interface) create the next available logical interface for this
9994  * physical interface.
9995  * If ipif is NULL (i.e. the lookup didn't find one) attempt to create an
9996  * ipif with the specified name.
9997  *
9998  * If the address family is not AF_UNSPEC then set the address as well.
9999  *
10000  * If ip_sioctl_addr returns EINPROGRESS then the ioctl (the copyout)
10001  * is completed when the DL_BIND_ACK arrives in ip_rput_dlpi_writer.
10002  *
10003  * Executed as a writer on the ill or ill group.
10004  * So no lock is needed to traverse the ipif chain, or examine the
10005  * phyint flags.
10006  */
10007 /* ARGSUSED */
10008 int
10009 ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
10010     ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq)
10011 {
10012 	mblk_t	*mp1;
10013 	struct lifreq *lifr;
10014 	boolean_t	isv6;
10015 	boolean_t	exists;
10016 	char 	*name;
10017 	char	*endp;
10018 	char	*cp;
10019 	int	namelen;
10020 	ipif_t	*ipif;
10021 	long	id;
10022 	ipsq_t	*ipsq;
10023 	ill_t	*ill;
10024 	sin_t	*sin;
10025 	int	err = 0;
10026 	boolean_t found_sep = B_FALSE;
10027 	conn_t	*connp;
10028 	zoneid_t zoneid;
10029 	int	orig_ifindex = 0;
10030 
10031 	ip1dbg(("ip_sioctl_addif\n"));
10032 	/* Existence of mp1 has been checked in ip_wput_nondata */
10033 	mp1 = mp->b_cont->b_cont;
10034 	/*
10035 	 * Null terminate the string to protect against buffer
10036 	 * overrun. String was generated by user code and may not
10037 	 * be trusted.
10038 	 */
10039 	lifr = (struct lifreq *)mp1->b_rptr;
10040 	lifr->lifr_name[LIFNAMSIZ - 1] = '\0';
10041 	name = lifr->lifr_name;
10042 	ASSERT(CONN_Q(q));
10043 	connp = Q_TO_CONN(q);
10044 	isv6 = connp->conn_af_isv6;
10045 	zoneid = connp->conn_zoneid;
10046 	namelen = mi_strlen(name);
10047 	if (namelen == 0)
10048 		return (EINVAL);
10049 
10050 	exists = B_FALSE;
10051 	if ((namelen + 1 == sizeof (ipif_loopback_name)) &&
10052 	    (mi_strcmp(name, ipif_loopback_name) == 0)) {
10053 		/*
10054 		 * Allow creating lo0 using SIOCLIFADDIF.
10055 		 * There can't be any other writer thread, so we can pass
10056 		 * NULL below for the last four args to ipif_lookup_on_name.
10057 		 */
10058 		ipif = ipif_lookup_on_name(lifr->lifr_name, namelen,
10059 		    B_TRUE, &exists, isv6, zoneid, NULL, NULL, NULL, NULL);
10060 		/* Prevent any further action */
10061 		if (ipif == NULL) {
10062 			return (ENOBUFS);
10063 		} else if (!exists) {
10064 			/* We created the ipif now and as writer */
10065 			ipif_refrele(ipif);
10066 			return (0);
10067 		} else {
10068 			ill = ipif->ipif_ill;
10069 			ill_refhold(ill);
10070 			ipif_refrele(ipif);
10071 		}
10072 	} else {
10073 		/* Look for a colon in the name. */
10074 		endp = &name[namelen];
10075 		for (cp = endp; --cp > name; ) {
10076 			if (*cp == IPIF_SEPARATOR_CHAR) {
10077 				found_sep = B_TRUE;
10078 				/*
10079 				 * Reject any non-decimal aliases for plumbing
10080 				 * of logical interfaces. Aliases with leading
10081 				 * zeroes are also rejected as they introduce
10082 				 * ambiguity in the naming of the interfaces.
10083 				 * Comparing with "0" takes care of all such
10084 				 * cases.
10085 				 */
10086 				if ((strncmp("0", cp+1, 1)) == 0)
10087 					return (EINVAL);
10088 
10089 				if (ddi_strtol(cp+1, &endp, 10, &id) != 0 ||
10090 				    id <= 0 || *endp != '\0') {
10091 					return (EINVAL);
10092 				}
10093 				*cp = '\0';
10094 				break;
10095 			}
10096 		}
10097 		ill = ill_lookup_on_name(name, B_FALSE, isv6,
10098 		    CONNP_TO_WQ(connp), mp, ip_process_ioctl, &err, NULL);
10099 		if (found_sep)
10100 			*cp = IPIF_SEPARATOR_CHAR;
10101 		if (ill == NULL)
10102 			return (err);
10103 	}
10104 
10105 	ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_process_ioctl, NEW_OP,
10106 	    B_TRUE);
10107 
10108 	/*
10109 	 * Release the refhold due to the lookup, now that we are excl
10110 	 * or we are just returning
10111 	 */
10112 	ill_refrele(ill);
10113 
10114 	if (ipsq == NULL)
10115 		return (EINPROGRESS);
10116 
10117 	/*
10118 	 * If the interface is failed, inactive or offlined, look for a working
10119 	 * interface in the ill group and create the ipif there. If we can't
10120 	 * find a good interface, create the ipif anyway so that in.mpathd can
10121 	 * move it to the first repaired interface.
10122 	 */
10123 	if ((ill->ill_phyint->phyint_flags &
10124 	    (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) &&
10125 	    ill->ill_phyint->phyint_groupname_len != 0) {
10126 		phyint_t *phyi;
10127 		char *groupname = ill->ill_phyint->phyint_groupname;
10128 
10129 		/*
10130 		 * We're looking for a working interface, but it doesn't matter
10131 		 * if it's up or down; so instead of following the group lists,
10132 		 * we look at each physical interface and compare the groupname.
10133 		 * We're only interested in interfaces with IPv4 (resp. IPv6)
10134 		 * plumbed when we're adding an IPv4 (resp. IPv6) ipif.
10135 		 * Otherwise we create the ipif on the failed interface.
10136 		 */
10137 		rw_enter(&ill_g_lock, RW_READER);
10138 		phyi = avl_first(&phyint_g_list.phyint_list_avl_by_index);
10139 		for (; phyi != NULL;
10140 		    phyi = avl_walk(&phyint_g_list.phyint_list_avl_by_index,
10141 		    phyi, AVL_AFTER)) {
10142 			if (phyi->phyint_groupname_len == 0)
10143 				continue;
10144 			ASSERT(phyi->phyint_groupname != NULL);
10145 			if (mi_strcmp(groupname, phyi->phyint_groupname) == 0 &&
10146 			    !(phyi->phyint_flags &
10147 			    (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) &&
10148 			    (ill->ill_isv6 ? (phyi->phyint_illv6 != NULL) :
10149 			    (phyi->phyint_illv4 != NULL))) {
10150 				break;
10151 			}
10152 		}
10153 		rw_exit(&ill_g_lock);
10154 
10155 		if (phyi != NULL) {
10156 			orig_ifindex = ill->ill_phyint->phyint_ifindex;
10157 			ill = (ill->ill_isv6 ? phyi->phyint_illv6 :
10158 			    phyi->phyint_illv4);
10159 		}
10160 	}
10161 
10162 	/*
10163 	 * We are now exclusive on the ipsq, so an ill move will be serialized
10164 	 * before or after us.
10165 	 */
10166 	ASSERT(IAM_WRITER_ILL(ill));
10167 	ASSERT(ill->ill_move_in_progress == B_FALSE);
10168 
10169 	if (found_sep && orig_ifindex == 0) {
10170 		/* Now see if there is an IPIF with this unit number. */
10171 		for (ipif = ill->ill_ipif; ipif; ipif = ipif->ipif_next) {
10172 			if (ipif->ipif_id == id) {
10173 				err = EEXIST;
10174 				goto done;
10175 			}
10176 		}
10177 	}
10178 
10179 	/*
10180 	 * We use IRE_LOCAL for lo0:1 etc. for "receive only" use
10181 	 * of lo0. We never come here when we plumb lo0:0. It
10182 	 * happens in ipif_lookup_on_name.
10183 	 * The specified unit number is ignored when we create the ipif on a
10184 	 * different interface. However, we save it in ipif_orig_ipifid below so
10185 	 * that the ipif fails back to the right position.
10186 	 */
10187 	if ((ipif = ipif_allocate(ill, (found_sep && orig_ifindex == 0) ?
10188 	    id : -1, IRE_LOCAL, B_TRUE)) == NULL) {
10189 		err = ENOBUFS;
10190 		goto done;
10191 	}
10192 
10193 	/* Return created name with ioctl */
10194 	(void) sprintf(lifr->lifr_name, "%s%c%d", ill->ill_name,
10195 	    IPIF_SEPARATOR_CHAR, ipif->ipif_id);
10196 	ip1dbg(("created %s\n", lifr->lifr_name));
10197 
10198 	/* Set address */
10199 	sin = (sin_t *)&lifr->lifr_addr;
10200 	if (sin->sin_family != AF_UNSPEC) {
10201 		err = ip_sioctl_addr(ipif, sin, q, mp,
10202 		    &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], lifr);
10203 	}
10204 
10205 	/* Set ifindex and unit number for failback */
10206 	if (err == 0 && orig_ifindex != 0) {
10207 		ipif->ipif_orig_ifindex = orig_ifindex;
10208 		if (found_sep) {
10209 			ipif->ipif_orig_ipifid = id;
10210 		}
10211 	}
10212 
10213 done:
10214 	ipsq_exit(ipsq, B_TRUE, B_TRUE);
10215 	return (err);
10216 }
10217 
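/*
 * A hedged userland sketch of SIOCLIFADDIF as handled above; "bge0" is
 * hypothetical, and lifr_name is rewritten with the created name on
 * success.
 *
 *	struct lifreq lifr;
 *
 *	(void) memset(&lifr, 0, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "bge0", sizeof (lifr.lifr_name));
 *	lifr.lifr_addr.ss_family = AF_UNSPEC;	(no address set yet)
 *	if (ioctl(s, SIOCLIFADDIF, &lifr) == 0)
 *		(void) printf("created %s\n", lifr.lifr_name);	e.g. bge0:1
 */
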
10218 /*
10219  * Remove an existing logical interface. If ipif_id is zero (i.e. not a logical
10220  * interface) delete it based on the IP address (on this physical interface).
10221  * Otherwise delete it based on the ipif_id.
10222  * Also, special handling to allow a removeif of lo0.
10223  */
10224 /* ARGSUSED */
10225 int
10226 ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10227     ip_ioctl_cmd_t *ipip, void *dummy_if_req)
10228 {
10229 	conn_t		*connp;
10230 	ill_t		*ill = ipif->ipif_ill;
10231 	boolean_t	 success;
10232 
10233 	ip1dbg(("ip_sioctl_remove_if(%s:%u %p)\n",
10234 		ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10235 	ASSERT(IAM_WRITER_IPIF(ipif));
10236 
10237 	connp = Q_TO_CONN(q);
10238 	/*
10239 	 * Special case for unplumbing lo0 (the loopback physical interface).
10240 	 * If unplumbing lo0, the incoming address structure has been
10241 	 * initialized to all zeros. When unplumbing lo0, all its logical
10242 	 * interfaces must be removed too.
10243 	 *
10244 	 * Note that this interface may be called to remove a specific
10245 	 * loopback logical interface (eg, lo0:1). But in that case
10246 	 * ipif->ipif_id != 0 so that the code path for that case is the
10247 	 * same as any other interface (meaning it skips the code directly
10248 	 * below).
10249 	 */
10250 	if (ipif->ipif_id == 0 && ipif->ipif_net_type == IRE_LOOPBACK) {
10251 		if (sin->sin_family == AF_UNSPEC &&
10252 		    (IN6_IS_ADDR_UNSPECIFIED(&((sin6_t *)sin)->sin6_addr))) {
10253 			/*
10254 			 * Mark it condemned. No new ref. will be made to ill.
10255 			 */
10256 			mutex_enter(&ill->ill_lock);
10257 			ill->ill_state_flags |= ILL_CONDEMNED;
10258 			for (ipif = ill->ill_ipif; ipif != NULL;
10259 			    ipif = ipif->ipif_next) {
10260 				ipif->ipif_state_flags |= IPIF_CONDEMNED;
10261 			}
10262 			mutex_exit(&ill->ill_lock);
10263 
10264 			ipif = ill->ill_ipif;
10265 			/* unplumb the loopback interface */
10266 			ill_delete(ill);
10267 			mutex_enter(&connp->conn_lock);
10268 			mutex_enter(&ill->ill_lock);
10269 			ASSERT(ill->ill_group == NULL);
10270 
10271 			/* Are any references to this ill active */
10272 			if (ill_is_quiescent(ill)) {
10273 				mutex_exit(&ill->ill_lock);
10274 				mutex_exit(&connp->conn_lock);
10275 				ill_delete_tail(ill);
10276 				mi_free(ill);
10277 				return (0);
10278 			}
10279 			success = ipsq_pending_mp_add(connp, ipif,
10280 			    CONNP_TO_WQ(connp), mp, ILL_FREE);
10281 			mutex_exit(&connp->conn_lock);
10282 			mutex_exit(&ill->ill_lock);
10283 			if (success)
10284 				return (EINPROGRESS);
10285 			else
10286 				return (EINTR);
10287 		}
10288 	}
10289 
10290 	/*
10291 	 * We are exclusive on the ipsq, so an ill move will be serialized
10292 	 * before or after us.
10293 	 */
10294 	ASSERT(ill->ill_move_in_progress == B_FALSE);
10295 
10296 	if (ipif->ipif_id == 0) {
10297 		/* Find based on address */
10298 		if (ipif->ipif_isv6) {
10299 			sin6_t *sin6;
10300 
10301 			if (sin->sin_family != AF_INET6)
10302 				return (EAFNOSUPPORT);
10303 
10304 			sin6 = (sin6_t *)sin;
10305 			/* We are a writer, so we should be able to lookup */
10306 			ipif = ipif_lookup_addr_v6(&sin6->sin6_addr,
10307 			    ill, ALL_ZONES, NULL, NULL, NULL, NULL);
10308 			if (ipif == NULL) {
10309 				/*
10310 				 * Maybe the address is on another interface in
10311 				 * the same IPMP group? We check this below.
10312 				 */
10313 				ipif = ipif_lookup_addr_v6(&sin6->sin6_addr,
10314 				    NULL, ALL_ZONES, NULL, NULL, NULL, NULL);
10315 			}
10316 		} else {
10317 			ipaddr_t addr;
10318 
10319 			if (sin->sin_family != AF_INET)
10320 				return (EAFNOSUPPORT);
10321 
10322 			addr = sin->sin_addr.s_addr;
10323 			/* We are a writer, so we should be able to lookup */
10324 			ipif = ipif_lookup_addr(addr, ill, ALL_ZONES, NULL,
10325 			    NULL, NULL, NULL);
10326 			if (ipif == NULL) {
10327 				/*
10328 				 * Maybe the address is on another interface in
10329 				 * the same IPMP group? We check this below.
10330 				 */
10331 				ipif = ipif_lookup_addr(addr, NULL, ALL_ZONES,
10332 				    NULL, NULL, NULL, NULL);
10333 			}
10334 		}
10335 		if (ipif == NULL) {
10336 			return (EADDRNOTAVAIL);
10337 		}
10338 		/*
10339 		 * When the address to be removed is hosted on a different
10340 		 * interface, we check if the interface is in the same IPMP
10341 		 * group as the specified one; if so we proceed with the
10342 		 * removal.
10343 		 * ill->ill_group is NULL when the ill is down, so we have to
10344 		 * compare the group names instead.
10345 		 */
10346 		if (ipif->ipif_ill != ill &&
10347 		    (ipif->ipif_ill->ill_phyint->phyint_groupname_len == 0 ||
10348 		    ill->ill_phyint->phyint_groupname_len == 0 ||
10349 		    mi_strcmp(ipif->ipif_ill->ill_phyint->phyint_groupname,
10350 		    ill->ill_phyint->phyint_groupname) != 0)) {
10351 			ipif_refrele(ipif);
10352 			return (EADDRNOTAVAIL);
10353 		}
10354 
10355 		/* This is a writer */
10356 		ipif_refrele(ipif);
10357 	}
10358 
10359 	/*
10360 	 * Cannot delete instance zero since it is tied to the ill.
10361 	 */
10362 	if (ipif->ipif_id == 0)
10363 		return (EBUSY);
10364 
10365 	mutex_enter(&ill->ill_lock);
10366 	ipif->ipif_state_flags |= IPIF_CONDEMNED;
10367 	mutex_exit(&ill->ill_lock);
10368 
10369 	ipif_free(ipif);
10370 
10371 	mutex_enter(&connp->conn_lock);
10372 	mutex_enter(&ill->ill_lock);
10373 
10374 	/* Are any references to this ipif active */
10375 	if (ipif->ipif_refcnt == 0 && ipif->ipif_ire_cnt == 0) {
10376 		mutex_exit(&ill->ill_lock);
10377 		mutex_exit(&connp->conn_lock);
10378 		ipif_down_tail(ipif);
10379 		ipif_free_tail(ipif);
10380 		return (0);
10381 	}
10382 	success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), mp,
10383 	    IPIF_FREE);
10384 	mutex_exit(&ill->ill_lock);
10385 	mutex_exit(&connp->conn_lock);
10386 	if (success)
10387 		return (EINPROGRESS);
10388 	else
10389 		return (EINTR);
10390 }
10391 
10392 /*
10393  * Restart the removeif ioctl. The refcnt has gone down to 0.
10394  * The ipif is already condemned, so it can't be found through lookups.
10395  */
10396 /* ARGSUSED */
10397 int
10398 ip_sioctl_removeif_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q,
10399     mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_if_req)
10400 {
10401 	ill_t *ill;
10402 
10403 	ip1dbg(("ip_sioctl_removeif_restart(%s:%u %p)\n",
10404 		ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10405 	if (ipif->ipif_id == 0 && ipif->ipif_net_type == IRE_LOOPBACK) {
10406 		ill = ipif->ipif_ill;
10407 		ASSERT(IAM_WRITER_ILL(ill));
10408 		ASSERT((ipif->ipif_state_flags & IPIF_CONDEMNED) &&
10409 		    (ill->ill_state_flags & ILL_CONDEMNED));
10410 		ill_delete_tail(ill);
10411 		mi_free(ill);
10412 		return (0);
10413 	}
10414 
10415 	ill = ipif->ipif_ill;
10416 	ASSERT(IAM_WRITER_IPIF(ipif));
10417 	ASSERT(ipif->ipif_state_flags & IPIF_CONDEMNED);
10418 
10419 	ipif_down_tail(ipif);
10420 	ipif_free_tail(ipif);
10421 
10422 	ILL_UNMARK_CHANGING(ill);
10423 	return (0);
10424 }
10425 
10426 /*
10427  * Set the local interface address.
10428  * Allow an address of all zero when the interface is down.
10429  */
10430 /* ARGSUSED */
10431 int
10432 ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10433     ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq)
10434 {
10435 	int err = 0;
10436 	in6_addr_t v6addr;
10437 	boolean_t need_up = B_FALSE;
10438 
10439 	ip1dbg(("ip_sioctl_addr(%s:%u %p)\n",
10440 		ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10441 
10442 	ASSERT(IAM_WRITER_IPIF(ipif));
10443 
10444 	if (ipif->ipif_isv6) {
10445 		sin6_t *sin6;
10446 		ill_t *ill;
10447 		phyint_t *phyi;
10448 
10449 		if (sin->sin_family != AF_INET6)
10450 			return (EAFNOSUPPORT);
10451 
10452 		sin6 = (sin6_t *)sin;
10453 		v6addr = sin6->sin6_addr;
10454 		ill = ipif->ipif_ill;
10455 		phyi = ill->ill_phyint;
10456 
10457 		/*
10458 		 * Enforce that true multicast interfaces have a link-local
10459 		 * address for logical unit 0.
10460 		 */
10461 		if (ipif->ipif_id == 0 &&
10462 		    (ill->ill_flags & ILLF_MULTICAST) &&
10463 		    !(ipif->ipif_flags & (IPIF_POINTOPOINT)) &&
10464 		    !(phyi->phyint_flags & (PHYI_LOOPBACK)) &&
10465 		    !IN6_IS_ADDR_LINKLOCAL(&v6addr)) {
10466 			return (EADDRNOTAVAIL);
10467 		}
10468 
10469 		/*
10470 		 * Up interfaces shouldn't have the unspecified address
10471 		 * unless they also have the IPIF_NOLOCAL flag set and
10472 		 * have a subnet assigned.
10473 		 */
10474 		if ((ipif->ipif_flags & IPIF_UP) &&
10475 		    IN6_IS_ADDR_UNSPECIFIED(&v6addr) &&
10476 		    (!(ipif->ipif_flags & IPIF_NOLOCAL) ||
10477 		    IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) {
10478 			return (EADDRNOTAVAIL);
10479 		}
10480 
10481 		if (!ip_local_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask))
10482 			return (EADDRNOTAVAIL);
10483 	} else {
10484 		ipaddr_t addr;
10485 
10486 		if (sin->sin_family != AF_INET)
10487 			return (EAFNOSUPPORT);
10488 
10489 		addr = sin->sin_addr.s_addr;
10490 
10491 		/* Allow 0 as the local address. */
10492 		if (addr != 0 && !ip_addr_ok_v4(addr, ipif->ipif_net_mask))
10493 			return (EADDRNOTAVAIL);
10494 
10495 		IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
10496 	}
10497 
10499 	/*
10500 	 * Even if there is no change we redo things just to rerun
10501 	 * ipif_set_default.
10502 	 */
10503 	if (ipif->ipif_flags & IPIF_UP) {
10504 		/*
10505 		 * Setting a new local address, make sure
10506 		 * we have net and subnet bcast ire's for
10507 		 * the old address if we need them.
10508 		 */
10509 		if (!ipif->ipif_isv6)
10510 			ipif_check_bcast_ires(ipif);
10511 		/*
10512 		 * If the interface is already marked up,
10513 		 * we call ipif_down which will take care
10514 		 * of ditching any IREs that have been set
10515 		 * up based on the old interface address.
10516 		 */
10517 		err = ipif_logical_down(ipif, q, mp);
10518 		if (err == EINPROGRESS)
10519 			return (err);
10520 		ipif_down_tail(ipif);
10521 		need_up = B_TRUE;
10522 	}
10523 
10524 	err = ip_sioctl_addr_tail(ipif, sin, q, mp, need_up);
10525 	return (err);
10526 }
10527 
10528 int
10529 ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10530     boolean_t need_up)
10531 {
10532 	in6_addr_t v6addr;
10533 	ipaddr_t addr;
10534 	sin6_t	*sin6;
10535 	int	err = 0;
10536 
10537 	ip1dbg(("ip_sioctl_addr_tail(%s:%u %p)\n",
10538 		ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10539 	ASSERT(IAM_WRITER_IPIF(ipif));
10540 	if (ipif->ipif_isv6) {
10541 		sin6 = (sin6_t *)sin;
10542 		v6addr = sin6->sin6_addr;
10543 	} else {
10544 		addr = sin->sin_addr.s_addr;
10545 		IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
10546 	}
10547 	mutex_enter(&ipif->ipif_ill->ill_lock);
10548 	ipif->ipif_v6lcl_addr = v6addr;
10549 	if (ipif->ipif_flags & (IPIF_ANYCAST | IPIF_NOLOCAL)) {
10550 		ipif->ipif_v6src_addr = ipv6_all_zeros;
10551 	} else {
10552 		ipif->ipif_v6src_addr = v6addr;
10553 	}
10554 
10555 	if ((ipif->ipif_isv6) && IN6_IS_ADDR_6TO4(&v6addr) &&
10556 		(!ipif->ipif_ill->ill_is_6to4tun)) {
10557 		queue_t *wqp = ipif->ipif_ill->ill_wq;
10558 
10559 		/*
10560 		 * The local address of this interface is a 6to4 address,
10561 		 * check if this interface is in fact a 6to4 tunnel or just
10562 		 * an interface configured with a 6to4 address.  We are only
10563 		 * interested in the former.
10564 		 */
10565 		if (wqp != NULL) {
10566 			while ((wqp->q_next != NULL) &&
10567 			    (wqp->q_next->q_qinfo != NULL) &&
10568 			    (wqp->q_next->q_qinfo->qi_minfo != NULL)) {
10569 
10570 				if (wqp->q_next->q_qinfo->qi_minfo->mi_idnum
10571 				    == TUN6TO4_MODID) {
10572 					/* set for use in IP */
10573 					ipif->ipif_ill->ill_is_6to4tun = 1;
10574 					break;
10575 				}
10576 				wqp = wqp->q_next;
10577 			}
10578 		}
10579 	}
10580 
10581 	ipif_set_default(ipif);
10582 	mutex_exit(&ipif->ipif_ill->ill_lock);
10583 
10584 	if (need_up) {
10585 		/*
10586 		 * Now bring the interface back up.  If this
10587 		 * is the only IPIF for the ILL, ipif_up
10588 		 * will have to re-bind to the device, so
10589 		 * we may get back EINPROGRESS, in which
10590 		 * case, this IOCTL will get completed in
10591 		 * ip_rput_dlpi when we see the DL_BIND_ACK.
10592 		 */
10593 		err = ipif_up(ipif, q, mp);
10594 	} else {
10595 		/*
10596 		 * Update the IPIF list in SCTP here; ipif_up_done() takes
10597 		 * care of it when need_up is B_TRUE.
10598 		 */
10599 		sctp_update_ipif(ipif, SCTP_IPIF_UPDATE);
10600 	}
10601 
10602 	return (err);
10603 }
10604 
10606 /*
10607  * Restart entry point to restart the address set operation after the
10608  * refcounts have dropped to zero.
10609  */
10610 /* ARGSUSED */
10611 int
10612 ip_sioctl_addr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10613     ip_ioctl_cmd_t *ipip, void *ifreq)
10614 {
10615 	ip1dbg(("ip_sioctl_addr_restart(%s:%u %p)\n",
10616 		ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10617 	ASSERT(IAM_WRITER_IPIF(ipif));
10618 	ipif_down_tail(ipif);
10619 	return (ip_sioctl_addr_tail(ipif, sin, q, mp, B_TRUE));
10620 }
10621 
10622 /* ARGSUSED */
10623 int
10624 ip_sioctl_get_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10625     ip_ioctl_cmd_t *ipip, void *if_req)
10626 {
10627 	sin6_t *sin6 = (struct sockaddr_in6 *)sin;
10628 	struct lifreq *lifr = (struct lifreq *)if_req;
10629 
10630 	ip1dbg(("ip_sioctl_get_addr(%s:%u %p)\n",
10631 		ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10632 	/*
10633 	 * The net mask and address can't change since we have a
10634 	 * reference to the ipif. So no lock is necessary.
10635 	 */
10636 	if (ipif->ipif_isv6) {
10637 		*sin6 = sin6_null;
10638 		sin6->sin6_family = AF_INET6;
10639 		sin6->sin6_addr = ipif->ipif_v6lcl_addr;
10640 		ASSERT(ipip->ipi_cmd_type == LIF_CMD);
10641 		lifr->lifr_addrlen =
10642 		    ip_mask_to_plen_v6(&ipif->ipif_v6net_mask);
10643 	} else {
10644 		*sin = sin_null;
10645 		sin->sin_family = AF_INET;
10646 		sin->sin_addr.s_addr = ipif->ipif_lcl_addr;
10647 		if (ipip->ipi_cmd_type == LIF_CMD) {
10648 			lifr->lifr_addrlen =
10649 			    ip_mask_to_plen(ipif->ipif_net_mask);
10650 		}
10651 	}
10652 	return (0);
10653 }
10654 
10655 /*
10656  * Set the destination address for a pt-pt interface.
10657  */
10658 /* ARGSUSED */
10659 int
10660 ip_sioctl_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10661     ip_ioctl_cmd_t *ipip, void *if_req)
10662 {
10663 	int err = 0;
10664 	in6_addr_t v6addr;
10665 	boolean_t need_up = B_FALSE;
10666 
10667 	ip1dbg(("ip_sioctl_dstaddr(%s:%u %p)\n",
10668 		ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10669 	ASSERT(IAM_WRITER_IPIF(ipif));
10670 
10671 	if (ipif->ipif_isv6) {
10672 		sin6_t *sin6;
10673 
10674 		if (sin->sin_family != AF_INET6)
10675 			return (EAFNOSUPPORT);
10676 
10677 		sin6 = (sin6_t *)sin;
10678 		v6addr = sin6->sin6_addr;
10679 
10680 		if (!ip_remote_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask))
10681 			return (EADDRNOTAVAIL);
10682 	} else {
10683 		ipaddr_t addr;
10684 
10685 		if (sin->sin_family != AF_INET)
10686 			return (EAFNOSUPPORT);
10687 
10688 		addr = sin->sin_addr.s_addr;
10689 		if (!ip_addr_ok_v4(addr, ipif->ipif_net_mask))
10690 			return (EADDRNOTAVAIL);
10691 
10692 		IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
10693 	}
10694 
10695 	if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr, &v6addr))
10696 		return (0);	/* No change */
10697 
10698 	if (ipif->ipif_flags & IPIF_UP) {
10699 		/*
10700 		 * If the interface is already marked up,
10701 		 * we call ipif_down which will take care
10702 		 * of ditching any IREs that have been set
10703 		 * up based on the old pp dst address.
10704 		 */
10705 		err = ipif_logical_down(ipif, q, mp);
10706 		if (err == EINPROGRESS)
10707 			return (err);
10708 		ipif_down_tail(ipif);
10709 		need_up = B_TRUE;
10710 	}
10711 	/*
10712 	 * could return EINPROGRESS. If so ioctl will complete in
10713 	 * ip_rput_dlpi_writer
10714 	 */
10715 	err = ip_sioctl_dstaddr_tail(ipif, sin, q, mp, need_up);
10716 	return (err);
10717 }
10718 
10719 static int
10720 ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10721     boolean_t need_up)
10722 {
10723 	in6_addr_t v6addr;
10724 	ill_t	*ill = ipif->ipif_ill;
10725 	int	err = 0;
10726 
10727 	ip1dbg(("ip_sioctl_dstaddr_tail(%s:%u %p)\n",
10728 		ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10729 	if (ipif->ipif_isv6) {
10730 		sin6_t *sin6;
10731 
10732 		sin6 = (sin6_t *)sin;
10733 		v6addr = sin6->sin6_addr;
10734 	} else {
10735 		ipaddr_t addr;
10736 
10737 		addr = sin->sin_addr.s_addr;
10738 		IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
10739 	}
10740 	mutex_enter(&ill->ill_lock);
10741 	/* Set point to point destination address. */
10742 	if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
10743 		/*
10744 		 * Allow this as a means of creating logical
10745 		 * pt-pt interfaces on top of e.g. an Ethernet.
10746 		 * XXX Undocumented HACK for testing.
10747 		 * pt-pt interfaces are created with NUD disabled.
10748 		 */
10749 		ipif->ipif_flags |= IPIF_POINTOPOINT;
10750 		ipif->ipif_flags &= ~IPIF_BROADCAST;
10751 		if (ipif->ipif_isv6)
10752 			ipif->ipif_ill->ill_flags |= ILLF_NONUD;
10753 	}
10754 
10755 	/* Set the new address. */
10756 	ipif->ipif_v6pp_dst_addr = v6addr;
10757 	/* Make sure subnet tracks pp_dst */
10758 	ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
10759 	mutex_exit(&ill->ill_lock);
10760 
10761 	if (need_up) {
10762 		/*
10763 		 * Now bring the interface back up.  If this
10764 		 * is the only IPIF for the ILL, ipif_up
10765 		 * will have to re-bind to the device, so
10766 		 * we may get back EINPROGRESS, in which
10767 		 * case, this IOCTL will get completed in
10768 		 * ip_rput_dlpi when we see the DL_BIND_ACK.
10769 		 */
10770 		err = ipif_up(ipif, q, mp);
10771 	}
10772 	return (err);
10773 }
10774 
10775 /*
10776  * Restart entry point to restart the dstaddress set operation after the
10777  * refcounts have dropped to zero.
10778  */
10779 /* ARGSUSED */
10780 int
10781 ip_sioctl_dstaddr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10782     ip_ioctl_cmd_t *ipip, void *ifreq)
10783 {
10784 	ip1dbg(("ip_sioctl_dstaddr_restart(%s:%u %p)\n",
10785 		ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10786 	ipif_down_tail(ipif);
10787 	return (ip_sioctl_dstaddr_tail(ipif, sin, q, mp, B_TRUE));
10788 }
10789 
10790 /* ARGSUSED */
10791 int
10792 ip_sioctl_get_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10793     ip_ioctl_cmd_t *ipip, void *if_req)
10794 {
10795 	sin6_t	*sin6 = (struct sockaddr_in6 *)sin;
10796 
10797 	ip1dbg(("ip_sioctl_get_dstaddr(%s:%u %p)\n",
10798 		ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10799 	/*
10800 	 * Get point to point destination address. The addresses can't
10801 	 * change since we hold a reference to the ipif.
10802 	 */
10803 	if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0)
10804 		return (EADDRNOTAVAIL);
10805 
10806 	if (ipif->ipif_isv6) {
10807 		ASSERT(ipip->ipi_cmd_type == LIF_CMD);
10808 		*sin6 = sin6_null;
10809 		sin6->sin6_family = AF_INET6;
10810 		sin6->sin6_addr = ipif->ipif_v6pp_dst_addr;
10811 	} else {
10812 		*sin = sin_null;
10813 		sin->sin_family = AF_INET;
10814 		sin->sin_addr.s_addr = ipif->ipif_pp_dst_addr;
10815 	}
10816 	return (0);
10817 }
10818 
10819 /*
10820  * XXX IPMP: make this function return the active/inactive state so that
10821  * the caller can set it once atomically instead of taking the lock repeatedly.
10822  */
10823 /*
10824  * This function either sets or clears the IFF_INACTIVE flag.
10825  *
10826  * As long as there are some addresses or multicast memberships on the
10827  * IPv4 or IPv6 interface of the "phyi" that do not belong here, we
10828  * will consider it to be ACTIVE (clear IFF_INACTIVE), i.e. the interface
10829  * will be used for outbound packets.
10830  *
10831  * Caller needs to verify the validity of setting IFF_INACTIVE.
10832  */
10833 static void
10834 phyint_inactive(phyint_t *phyi)
10835 {
10836 	ill_t *ill_v4;
10837 	ill_t *ill_v6;
10838 	ipif_t *ipif;
10839 	ilm_t *ilm;
10840 
10841 	ill_v4 = phyi->phyint_illv4;
10842 	ill_v6 = phyi->phyint_illv6;
10843 
10844 	/*
10845 	 * No need for a lock while traversing the list since we are
10846 	 * a writer.
10847 	 */
10848 	if (ill_v4 != NULL) {
10849 		ASSERT(IAM_WRITER_ILL(ill_v4));
10850 		for (ipif = ill_v4->ill_ipif; ipif != NULL;
10851 		    ipif = ipif->ipif_next) {
10852 			if (ipif->ipif_orig_ifindex != phyi->phyint_ifindex) {
10853 				mutex_enter(&phyi->phyint_lock);
10854 				phyi->phyint_flags &= ~PHYI_INACTIVE;
10855 				mutex_exit(&phyi->phyint_lock);
10856 				return;
10857 			}
10858 		}
10859 		for (ilm = ill_v4->ill_ilm; ilm != NULL;
10860 		    ilm = ilm->ilm_next) {
10861 			if (ilm->ilm_orig_ifindex != phyi->phyint_ifindex) {
10862 				mutex_enter(&phyi->phyint_lock);
10863 				phyi->phyint_flags &= ~PHYI_INACTIVE;
10864 				mutex_exit(&phyi->phyint_lock);
10865 				return;
10866 			}
10867 		}
10868 	}
10869 	if (ill_v6 != NULL) {
10870 		ASSERT(IAM_WRITER_ILL(ill_v6));
10871 		for (ipif = ill_v6->ill_ipif; ipif != NULL;
10872 		    ipif = ipif->ipif_next) {
10873 			if (ipif->ipif_orig_ifindex != phyi->phyint_ifindex) {
10874 				mutex_enter(&phyi->phyint_lock);
10875 				phyi->phyint_flags &= ~PHYI_INACTIVE;
10876 				mutex_exit(&phyi->phyint_lock);
10877 				return;
10878 			}
10879 		}
10880 		for (ilm = ill_v6->ill_ilm; ilm != NULL;
10881 		    ilm = ilm->ilm_next) {
10882 			if (ilm->ilm_orig_ifindex != phyi->phyint_ifindex) {
10883 				mutex_enter(&phyi->phyint_lock);
10884 				phyi->phyint_flags &= ~PHYI_INACTIVE;
10885 				mutex_exit(&phyi->phyint_lock);
10886 				return;
10887 			}
10888 		}
10889 	}
10890 	mutex_enter(&phyi->phyint_lock);
10891 	phyi->phyint_flags |= PHYI_INACTIVE;
10892 	mutex_exit(&phyi->phyint_lock);
10893 }
10894 
10895 /*
10896  * This function is called only when the phyint flags change. Currently
10897  * called from ip_sioctl_flags. We re-do the broadcast nomination so
10898  * that we can select a good ill.
10899  */
10900 static void
10901 ip_redo_nomination(phyint_t *phyi)
10902 {
10903 	ill_t *ill_v4;
10904 
10905 	ill_v4 = phyi->phyint_illv4;
10906 
10907 	if (ill_v4 != NULL && ill_v4->ill_group != NULL) {
10908 		ASSERT(IAM_WRITER_ILL(ill_v4));
10909 		if (ill_v4->ill_group->illgrp_ill_count > 1)
10910 			ill_nominate_bcast_rcv(ill_v4->ill_group);
10911 	}
10912 }
10913 
10914 /*
10915  * Heuristic to check if ill is INACTIVE.
10916  * Checks if the ill has an ipif with a usable IP address.
10917  *
10918  * Return values:
10919  *	B_TRUE	- ill is INACTIVE; has no usable ipif
10920  *	B_FALSE - ill is not INACTIVE; ill has at least one usable ipif
10921  */
10922 static boolean_t
10923 ill_is_inactive(ill_t *ill)
10924 {
10925 	ipif_t *ipif;
10926 
10927 	/* Check whether it is in an IPMP group */
10928 	if (ill->ill_phyint->phyint_groupname == NULL)
10929 		return (B_FALSE);
10930 
10931 	if (ill->ill_ipif_up_count == 0)
10932 		return (B_TRUE);
10933 
10934 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
10935 		uint64_t flags = ipif->ipif_flags;
10936 
10937 		/*
10938 		 * This ipif is usable if it is IPIF_UP and not a
10939 		 * dedicated test address.  A dedicated test address
10940 		 * is marked IPIF_NOFAILOVER *and* IPIF_DEPRECATED
10941 		 * (note in particular that V6 test addresses are
10942 		 * link-local data addresses and thus are marked
10943 		 * IPIF_NOFAILOVER but not IPIF_DEPRECATED).
10944 		 */
10945 		if ((flags & IPIF_UP) &&
10946 		    ((flags & (IPIF_DEPRECATED|IPIF_NOFAILOVER)) !=
10947 		    (IPIF_DEPRECATED|IPIF_NOFAILOVER)))
10948 			return (B_FALSE);
10949 	}
10950 	return (B_TRUE);
10951 }
10952 
10953 /*
10954  * Set interface flags.
10955  * Need to do special action for IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT,
10956  * IPIF_NOLOCAL, ILLF_NONUD, ILLF_NOARP, IPIF_PRIVATE, IPIF_ANYCAST,
10957  * IPIF_PREFERRED, PHYI_STANDBY, PHYI_FAILED and PHYI_OFFLINE.
10958  *
10959  * NOTE : We really don't enforce that ipif_id zero should be used
10960  *	  for setting any flags other than IFF_LOGINT_FLAGS. This
10961  *	  is because applications generally do a SIOCGLIFFLAGS and
10962  *	  OR in the new flags (those that affect the logical interface)
10963  *	  before doing a SIOCSLIFFLAGS. Thus, "flags" below could contain
10964  *	  bits other than IFF_LOGINT_FLAGS. One could check whether
10965  *	  "turn_on" - the flags that will be turned on - is correct with
10966  *	  respect to ipif_id 0, but for backward compatibility this is not done.
10967  */
10968 /* ARGSUSED */
10969 int
10970 ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10971     ip_ioctl_cmd_t *ipip, void *if_req)
10972 {
10973 	uint64_t turn_on;
10974 	uint64_t turn_off;
10975 	int	err;
10976 	boolean_t need_up = B_FALSE;
10977 	phyint_t *phyi;
10978 	ill_t *ill;
10979 	uint64_t intf_flags;
10980 	boolean_t phyint_flags_modified = B_FALSE;
10981 	uint64_t flags;
10982 	struct ifreq *ifr;
10983 	struct lifreq *lifr;
10984 	boolean_t set_linklocal = B_FALSE;
10985 	boolean_t zero_source = B_FALSE;
10986 
10987 	ip1dbg(("ip_sioctl_flags(%s:%u %p)\n",
10988 		ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10989 
10990 	ASSERT(IAM_WRITER_IPIF(ipif));
10991 
10992 	ill = ipif->ipif_ill;
10993 	phyi = ill->ill_phyint;
10994 
10995 	if (ipip->ipi_cmd_type == IF_CMD) {
10996 		ifr = (struct ifreq *)if_req;
10997 		flags =  (uint64_t)(ifr->ifr_flags & 0x0000ffff);
10998 	} else {
10999 		lifr = (struct lifreq *)if_req;
11000 		flags = lifr->lifr_flags;
11001 	}
11002 
11003 	intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags;
11004 
11005 	/*
11006 	 * Have the flags been set correctly so far?
11007 	 */
11008 	ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0);
11009 	ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0);
11010 	ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0);
11011 	/*
11012 	 * Compare the new flags to the old, and partition
11013 	 * into those coming on and those going off.
11014 	 * For the 16 bit command keep the bits above bit 16 unchanged.
11015 	 */
11016 	if (ipip->ipi_cmd == SIOCSIFFLAGS)
11017 		flags |= intf_flags & ~0xFFFF;
11018 
11019 	/*
11020 	 * First check which bits will change and then which will
11021 	 * go on and off
11022 	 */
11023 	turn_on = (flags ^ intf_flags) & ~IFF_CANTCHANGE;
11024 	if (!turn_on)
11025 		return (0);	/* No change */
11026 
11027 	turn_off = intf_flags & turn_on;
11028 	turn_on ^= turn_off;
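	/*
	 * Worked example of the partition above (hypothetical values,
	 * assuming neither bit is in IFF_CANTCHANGE): if intf_flags has
	 * IFF_UP set and the caller passes flags with IFF_UP clear and
	 * IFF_PRIVATE set, then:
	 *	turn_on  = flags ^ intf_flags	= IFF_UP|IFF_PRIVATE
	 *	turn_off = intf_flags & turn_on	= IFF_UP
	 *	turn_on ^= turn_off		-> IFF_PRIVATE
	 * i.e. IFF_UP is going off and IFF_PRIVATE is coming on.
	 */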
11029 	err = 0;
11030 
11031 	/*
11032 	 * Don't allow any bits belonging to the logical interface
11033 	 * to be set or cleared on the replacement ipif that was
11034 	 * created temporarily during a MOVE.
11035 	 */
11036 	if (ipif->ipif_replace_zero &&
11037 	    ((turn_on|turn_off) & IFF_LOGINT_FLAGS) != 0) {
11038 		return (EINVAL);
11039 	}
11040 
11041 	/*
11042 	 * Only allow the IFF_XRESOLV and IFF_TEMPORARY flags to be set on
11043 	 * IPv6 interfaces.
11044 	 */
11045 	if ((turn_on & (IFF_XRESOLV|IFF_TEMPORARY)) && !(ipif->ipif_isv6))
11046 		return (EINVAL);
11047 
11048 	/*
11049 	 * Don't allow the IFF_ROUTER flag to be set on loopback
11050 	 * interfaces.  It makes no sense in that context.
11051 	 */
11052 	if ((turn_on & IFF_ROUTER) && (phyi->phyint_flags & PHYI_LOOPBACK))
11053 		return (EINVAL);
11054 
11055 	if (flags & (IFF_NOLOCAL|IFF_ANYCAST))
11056 		zero_source = B_TRUE;
11057 
11058 	/*
11059 	 * For IPv6 ipif_id 0, don't allow the interface to be up without
11060 	 * a link local address if IFF_NOLOCAL or IFF_ANYCAST are not set.
11061 	 * If the link local address isn't set, and can be set, it will get
11062 	 * set later on in this function.
11063 	 */
11064 	if (ipif->ipif_id == 0 && ipif->ipif_isv6 &&
11065 	    (flags & IFF_UP) && !zero_source &&
11066 	    IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) {
11067 		if (ipif_cant_setlinklocal(ipif))
11068 			return (EINVAL);
11069 		set_linklocal = B_TRUE;
11070 	}
11071 
11072 	/*
11073 	 * An ILL cannot be part of a usesrc group and an IPMP group at the
11074 	 * same time. No need to grab ill_g_usesrc_lock here, see
11075 	 * synchronization notes in ip.c
11076 	 */
11077 	if (turn_on & PHYI_STANDBY &&
11078 	    ipif->ipif_ill->ill_usesrc_grp_next != NULL) {
11079 		return (EINVAL);
11080 	}
11081 
11082 	/*
11083 	 * If we modify physical interface flags, we'll potentially need to
11084 	 * send up two routing socket messages for the changes (one for the
11085 	 * IPv4 ill, and another for the IPv6 ill).  Note that here.
11086 	 */
11087 	if ((turn_on|turn_off) & IFF_PHYINT_FLAGS)
11088 		phyint_flags_modified = B_TRUE;
11089 
11090 	/*
11091 	 * If we are setting or clearing FAILED or STANDBY or OFFLINE,
11092 	 * we need to flush the IRE_CACHES belonging to this ill.
11093 	 * We handle this case here without doing the DOWN/UP dance
11094 	 * like it is done for other flags. If some other flags are
11095 	 * being turned on/off with FAILED/STANDBY/OFFLINE, the code
11096 	 * below will handle it by bringing it down and then
11097 	 * bringing it UP.
11098 	 */
11099 	if ((turn_on|turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE)) {
11100 		ill_t *ill_v4, *ill_v6;
11101 
11102 		ill_v4 = phyi->phyint_illv4;
11103 		ill_v6 = phyi->phyint_illv6;
11104 
11105 		/*
11106 		 * First set the INACTIVE flag if needed. Then delete the ires.
11107 		 * ire_add will atomically prevent creating new IRE_CACHEs
11108 		 * unless the hidden flag is set.
11109 		 * PHYI_FAILED and PHYI_INACTIVE are mutually exclusive.
11110 		 */
11111 		if ((turn_on & PHYI_FAILED) &&
11112 		    ((intf_flags & PHYI_STANDBY) || !ipmp_enable_failback)) {
11113 			/* Reset PHYI_INACTIVE when PHYI_FAILED is being set */
11114 			phyi->phyint_flags &= ~PHYI_INACTIVE;
11115 		}
11116 		if ((turn_off & PHYI_FAILED) &&
11117 		    ((intf_flags & PHYI_STANDBY) ||
11118 		    (!ipmp_enable_failback && ill_is_inactive(ill)))) {
11119 			phyint_inactive(phyi);
11120 		}
11121 
11122 		if (turn_on & PHYI_STANDBY) {
11123 			/*
11124 			 * We implicitly set INACTIVE only when STANDBY is set.
11125 			 * INACTIVE is also set on a non-STANDBY phyint when the
11126 			 * user disables FAILBACK using the configuration file.
11127 			 * Do not allow STANDBY to be set on such an INACTIVE
11128 			 * phyint.
11129 			 */
11130 			if (phyi->phyint_flags & PHYI_INACTIVE)
11131 				return (EINVAL);
11132 			if (!(phyi->phyint_flags & PHYI_FAILED))
11133 				phyint_inactive(phyi);
11134 		}
11135 		if (turn_off & PHYI_STANDBY) {
11136 			if (ipmp_enable_failback) {
11137 				/*
11138 				 * Reset PHYI_INACTIVE.
11139 				 */
11140 				phyi->phyint_flags &= ~PHYI_INACTIVE;
11141 			} else if (ill_is_inactive(ill) &&
11142 			    !(phyi->phyint_flags & PHYI_FAILED)) {
11143 				/*
11144 				 * Need to set INACTIVE when the user sets
11145 				 * STANDBY on a non-STANDBY phyint and
11146 				 * later resets STANDBY.
11147 				 */
11148 				phyint_inactive(phyi);
11149 			}
11150 		}
11151 		/*
11152 		 * We should always send up a message so that the
11153 		 * daemons are informed of it. Note that the zeroth
11154 		 * interface can be down and the check below for IPIF_UP
11155 		 * will not make sense as we are actually setting
11156 		 * a phyint flag here. We assume that the ipif used
11157 		 * is always the zeroth ipif. (ip_rts_ifmsg does not
11158 		 * send up any message for non-zero ipifs).
11159 		 */
11160 		phyint_flags_modified = B_TRUE;
11161 
11162 		if (ill_v4 != NULL) {
11163 			ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE,
11164 			    IRE_CACHE, ill_stq_cache_delete,
11165 			    (char *)ill_v4, ill_v4);
11166 			illgrp_reset_schednext(ill_v4);
11167 		}
11168 		if (ill_v6 != NULL) {
11169 			ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE,
11170 			    IRE_CACHE, ill_stq_cache_delete,
11171 			    (char *)ill_v6, ill_v6);
11172 			illgrp_reset_schednext(ill_v6);
11173 		}
11174 	}
11175 
11176 	/*
11177 	 * If ILLF_ROUTER changes, we need to change the ip forwarding
11178 	 * status of the interface and, if the interface is part of an IPMP
11179 	 * group, all other interfaces that are part of the same IPMP
11180 	 * group.
11181 	 */
11182 	if ((turn_on | turn_off) & ILLF_ROUTER) {
11183 		(void) ill_forward_set(q, mp, ((turn_on & ILLF_ROUTER) != 0),
11184 		    (caddr_t)ill);
11185 	}
11186 
11187 	/*
11188 	 * If the interface is not UP and we are not going to
11189 	 * bring it UP, record the flags and return. When the
11190 	 * interface comes UP later, the right actions will be
11191 	 * taken.
11192 	 */
11193 	if (!(ipif->ipif_flags & IPIF_UP) &&
11194 	    !(turn_on & IPIF_UP)) {
11195 		/* Record new flags in their respective places. */
11196 		mutex_enter(&ill->ill_lock);
11197 		mutex_enter(&ill->ill_phyint->phyint_lock);
11198 		ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS);
11199 		ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS);
11200 		ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS);
11201 		ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS);
11202 		phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS);
11203 		phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS);
11204 		mutex_exit(&ill->ill_lock);
11205 		mutex_exit(&ill->ill_phyint->phyint_lock);
11206 
11207 		/*
11208 		 * We do the broadcast and nomination here rather
11209 		 * than waiting for a FAILOVER/FAILBACK to happen. In
11210 		 * the case of FAILBACK from INACTIVE standby to the
11211 		 * interface that has been repaired, PHYI_FAILED has not
11212 		 * been cleared yet. If there are only two interfaces in
11213 		 * that group, all we have is a FAILED and INACTIVE
11214 		 * interface. If we do the nomination soon after a failback,
11215 		 * the broadcast nomination code would select the
11216 		 * INACTIVE interface for receiving broadcasts as FAILED is
11217 		 * not yet cleared. As we don't want STANDBY/INACTIVE to
11218 		 * receive broadcast packets, we need to redo nomination
11219 		 * when the FAILED is cleared here. Thus, in general we
11220 		 * always do the nomination here for FAILED, STANDBY
11221 		 * and OFFLINE.
11222 		 */
11223 		if (((turn_on | turn_off) &
11224 		    (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE))) {
11225 			ip_redo_nomination(phyi);
11226 		}
11227 		if (phyint_flags_modified) {
11228 			if (phyi->phyint_illv4 != NULL) {
11229 				ip_rts_ifmsg(phyi->phyint_illv4->
11230 				    ill_ipif);
11231 			}
11232 			if (phyi->phyint_illv6 != NULL) {
11233 				ip_rts_ifmsg(phyi->phyint_illv6->
11234 				    ill_ipif);
11235 			}
11236 		}
11237 		return (0);
11238 	} else if (set_linklocal || zero_source) {
11239 		mutex_enter(&ill->ill_lock);
11240 		if (set_linklocal)
11241 			ipif->ipif_state_flags |= IPIF_SET_LINKLOCAL;
11242 		if (zero_source)
11243 			ipif->ipif_state_flags |= IPIF_ZERO_SOURCE;
11244 		mutex_exit(&ill->ill_lock);
11245 	}
11246 
11247 	/*
11248 	 * Disallow IPv6 interfaces coming up that have the unspecified address,
11249 	 * or point-to-point interfaces with an unspecified destination. We do
11250 	 * allow the address to be unspecified for IPIF_NOLOCAL interfaces that
11251 	 * have a subnet assigned, which is how in.ndpd currently manages its
11252 	 * onlink prefix list when no addresses are configured with those
11253 	 * prefixes.
11254 	 */
11255 	if (ipif->ipif_isv6 &&
11256 	    ((IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) &&
11257 	    ((!(ipif->ipif_flags & IPIF_NOLOCAL) && !(turn_on & IPIF_NOLOCAL)) ||
11258 	    IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) ||
11259 	    ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
11260 	    IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr)))) {
11261 		return (EINVAL);
11262 	}
11263 
11264 	/*
11265 	 * Prevent IPv4 point-to-point interfaces with a 0.0.0.0 destination
11266 	 * from being brought up.
11267 	 */
11268 	if (!ipif->ipif_isv6 &&
11269 	    ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
11270 	    ipif->ipif_pp_dst_addr == INADDR_ANY)) {
11271 		return (EINVAL);
11272 	}
11273 
11274 	/*
11275 	 * The only flag changes that we currently take specific action on
11276  * are IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL,
11277 	 * ILLF_NOARP, ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, and
11278  * IPIF_PREFERRED.  This is done by bringing the ipif down, changing
11279 	 * the flags and bringing it back up again.
11280 	 */
11281 	if ((turn_on|turn_off) &
11282 	    (IPIF_UP|IPIF_DEPRECATED|IPIF_NOXMIT|IPIF_NOLOCAL|ILLF_NOARP|
11283 	    ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED)) {
11284 		/*
11285 		 * Taking this ipif down, make sure we have
11286 		 * valid net and subnet bcast ire's for other
11287 		 * logical interfaces, if we need them.
11288 		 */
11289 		if (!ipif->ipif_isv6)
11290 			ipif_check_bcast_ires(ipif);
11291 
11292 		if (((ipif->ipif_flags | turn_on) & IPIF_UP) &&
11293 		    !(turn_off & IPIF_UP)) {
11294 			need_up = B_TRUE;
11295 			if (ipif->ipif_flags & IPIF_UP)
11296 				ill->ill_logical_down = 1;
11297 			turn_on &= ~IPIF_UP;
11298 		}
11299 		err = ipif_down(ipif, q, mp);
11300 		ip1dbg(("ipif_down returns %d err ", err));
11301 		if (err == EINPROGRESS)
11302 			return (err);
11303 		ipif_down_tail(ipif);
11304 	}
11305 	return (ip_sioctl_flags_tail(ipif, flags, q, mp, need_up));
11306 }
11307 
11308 static int
11309 ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp,
11310     boolean_t need_up)
11311 {
11312 	ill_t	*ill;
11313 	phyint_t *phyi;
11314 	uint64_t turn_on;
11315 	uint64_t turn_off;
11316 	uint64_t intf_flags;
11317 	boolean_t phyint_flags_modified = B_FALSE;
11318 	int	err = 0;
11319 	boolean_t set_linklocal = B_FALSE;
11320 	boolean_t zero_source = B_FALSE;
11321 
11322 	ip1dbg(("ip_sioctl_flags_tail(%s:%u)\n",
11323 		ipif->ipif_ill->ill_name, ipif->ipif_id));
11324 
11325 	ASSERT(IAM_WRITER_IPIF(ipif));
11326 
11327 	ill = ipif->ipif_ill;
11328 	phyi = ill->ill_phyint;
11329 
11330 	intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags;
11331 	turn_on = (flags ^ intf_flags) & ~(IFF_CANTCHANGE | IFF_UP);
11332 
11333 	turn_off = intf_flags & turn_on;
11334 	turn_on ^= turn_off;
11335 
11336 	if ((turn_on|turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE))
11337 		phyint_flags_modified = B_TRUE;
11338 
11339 	/*
11340 	 * Now we change the flags. Track current value of
11341 	 * other flags in their respective places.
11342 	 */
11343 	mutex_enter(&ill->ill_lock);
11344 	mutex_enter(&phyi->phyint_lock);
11345 	ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS);
11346 	ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS);
11347 	ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS);
11348 	ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS);
11349 	phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS);
11350 	phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS);
11351 	if (ipif->ipif_state_flags & IPIF_SET_LINKLOCAL) {
11352 		set_linklocal = B_TRUE;
11353 		ipif->ipif_state_flags &= ~IPIF_SET_LINKLOCAL;
11354 	}
11355 	if (ipif->ipif_state_flags & IPIF_ZERO_SOURCE) {
11356 		zero_source = B_TRUE;
11357 		ipif->ipif_state_flags &= ~IPIF_ZERO_SOURCE;
11358 	}
11359 	mutex_exit(&ill->ill_lock);
11360 	mutex_exit(&phyi->phyint_lock);
11361 
11362 	if (((turn_on | turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE)))
11363 		ip_redo_nomination(phyi);
11364 
11365 	if (set_linklocal)
11366 		(void) ipif_setlinklocal(ipif);
11367 
11368 	if (zero_source)
11369 		ipif->ipif_v6src_addr = ipv6_all_zeros;
11370 	else
11371 		ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr;
11372 
11373 	if (need_up) {
11374 		/*
11375 		 * XXX ipif_up really does not know whether a phyint flag
11376 		 * was modified or not, so it sends up information in
11377 		 * only one routing socket message. As we don't bring up
11378 		 * the interface and also set STANDBY/FAILED simultaneously,
11379 		 * it should be okay.
11380 		 */
11381 		err = ipif_up(ipif, q, mp);
11382 	} else {
11383 		/*
11384 		 * Make sure routing socket sees all changes to the flags.
11385 		 * ipif_up_done* handles this when we use ipif_up.
11386 		 */
11387 		if (phyint_flags_modified) {
11388 			if (phyi->phyint_illv4 != NULL) {
11389 				ip_rts_ifmsg(phyi->phyint_illv4->
11390 				    ill_ipif);
11391 			}
11392 			if (phyi->phyint_illv6 != NULL) {
11393 				ip_rts_ifmsg(phyi->phyint_illv6->
11394 				    ill_ipif);
11395 			}
11396 		} else {
11397 			ip_rts_ifmsg(ipif);
11398 		}
11399 	}
11400 	return (err);
11401 }
11402 
11403 /*
11404  * Restart entry point to restart the flags restart operation after the
11405  * refcounts have dropped to zero.
11406  */
11407 /* ARGSUSED */
11408 int
11409 ip_sioctl_flags_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11410     ip_ioctl_cmd_t *ipip, void *if_req)
11411 {
11412 	int	err;
11413 	struct ifreq *ifr = (struct ifreq *)if_req;
11414 	struct lifreq *lifr = (struct lifreq *)if_req;
11415 
11416 	ip1dbg(("ip_sioctl_flags_restart(%s:%u %p)\n",
11417 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11418 
11419 	ipif_down_tail(ipif);
11420 	if (ipip->ipi_cmd_type == IF_CMD) {
11421 		/*
11422 		 * Since ip_sioctl_flags_tail expects a uint64_t and
11423 		 * ifr_flags is a short, we must mask ifr_flags before
11424 		 * widening it so that sign extension cannot cause bits
11425 		 * to be set that should not be.
11426 		 */
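		/*
		 * For example (hypothetical value): with ifr_flags ==
		 * (short)0x8000, a bare (uint64_t)ifr_flags would sign
		 * extend to 0xffffffffffff8000, spuriously setting every
		 * flag bit above bit 15; masking with 0x0000ffff first
		 * yields the intended 0x8000.
		 */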
11427 		err = ip_sioctl_flags_tail(ipif,
11428 		    (uint64_t)(ifr->ifr_flags & 0x0000ffff),
11429 		    q, mp, B_TRUE);
11430 	} else {
11431 		err = ip_sioctl_flags_tail(ipif, lifr->lifr_flags,
11432 		    q, mp, B_TRUE);
11433 	}
11434 	return (err);
11435 }
11436 
11437 /* ARGSUSED */
11438 int
11439 ip_sioctl_get_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11440     ip_ioctl_cmd_t *ipip, void *if_req)
11441 {
11442 	/*
11443 	 * Have the flags been set correctly so far?
11444 	 */
11445 	ill_t *ill = ipif->ipif_ill;
11446 	phyint_t *phyi = ill->ill_phyint;
11447 
11448 	ip1dbg(("ip_sioctl_get_flags(%s:%u %p)\n",
11449 		ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11450 	ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0);
11451 	ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0);
11452 	ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0);
11453 
11454 	/*
11455 	 * Need a lock since some flags can be set even when there are
11456 	 * references to the ipif.
11457 	 */
11458 	mutex_enter(&ill->ill_lock);
11459 	if (ipip->ipi_cmd_type == IF_CMD) {
11460 		struct ifreq *ifr = (struct ifreq *)if_req;
11461 
11462 		/* Get interface flags (low 16 only). */
11463 		ifr->ifr_flags = ((ipif->ipif_flags |
11464 		    ill->ill_flags | phyi->phyint_flags) & 0xffff);
11465 	} else {
11466 		struct lifreq *lifr = (struct lifreq *)if_req;
11467 
11468 		/* Get interface flags. */
11469 		lifr->lifr_flags = ipif->ipif_flags |
11470 		    ill->ill_flags | phyi->phyint_flags;
11471 	}
11472 	mutex_exit(&ill->ill_lock);
11473 	return (0);
11474 }
11475 
11476 /* ARGSUSED */
11477 int
11478 ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11479     ip_ioctl_cmd_t *ipip, void *if_req)
11480 {
11481 	int mtu;
11482 	int ip_min_mtu;
11483 	struct ifreq	*ifr;
11484 	struct lifreq *lifr;
11485 	ire_t	*ire;
11486 
11487 	ip1dbg(("ip_sioctl_mtu(%s:%u %p)\n", ipif->ipif_ill->ill_name,
11488 	    ipif->ipif_id, (void *)ipif));
11489 	if (ipip->ipi_cmd_type == IF_CMD) {
11490 		ifr = (struct ifreq *)if_req;
11491 		mtu = ifr->ifr_metric;
11492 	} else {
11493 		lifr = (struct lifreq *)if_req;
11494 		mtu = lifr->lifr_mtu;
11495 	}
11496 
11497 	if (ipif->ipif_isv6)
11498 		ip_min_mtu = IPV6_MIN_MTU;
11499 	else
11500 		ip_min_mtu = IP_MIN_MTU;
11501 
11502 	if (mtu > ipif->ipif_ill->ill_max_frag || mtu < ip_min_mtu)
11503 		return (EINVAL);
11504 
11505 	/*
11506 	 * Change the MTU size in all relevant ire's.
11507 	 * MTU change vs. new ire creation - protocol below.
11508 	 * First change ipif_mtu and the ire_max_frag of the
11509 	 * interface ire. Then do an ire walk and change the
11510 	 * ire_max_frag of all affected ires. During ire_add
11511 	 * under the bucket lock, set the ire_max_frag of the
11512 	 * new ire being created from the ipif/ire from which
11513 	 * it is being derived. If an mtu change happens after
11514 	 * the ire is added, the new ire will be cleaned up.
11515 	 * Conversely if the mtu change happens before the ire
11516 	 * is added, ire_add will see the new value of the mtu.
11517 	 */
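	/*
	 * Sketch of the allowed interleavings (restating the protocol
	 * above, not new behavior): if a concurrent ire_add completes
	 * before the store below, the ire walk further down revisits
	 * that ire; if the ire_add runs after the store, it copies the
	 * new mtu under the bucket lock.  Either way the ire ends up
	 * consistent with the new mtu.
	 */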
11518 	ipif->ipif_mtu = mtu;
11519 	ipif->ipif_flags |= IPIF_FIXEDMTU;
11520 
11521 	if (ipif->ipif_isv6)
11522 		ire = ipif_to_ire_v6(ipif);
11523 	else
11524 		ire = ipif_to_ire(ipif);
11525 	if (ire != NULL) {
11526 		ire->ire_max_frag = ipif->ipif_mtu;
11527 		ire_refrele(ire);
11528 	}
11529 	if (ipif->ipif_flags & IPIF_UP) {
11530 		if (ipif->ipif_isv6)
11531 			ire_walk_v6(ipif_mtu_change, (char *)ipif, ALL_ZONES);
11532 		else
11533 			ire_walk_v4(ipif_mtu_change, (char *)ipif, ALL_ZONES);
11534 	}
11535 	/* Update the MTU in SCTP's list */
11536 	sctp_update_ipif(ipif, SCTP_IPIF_UPDATE);
11537 	return (0);
11538 }
11539 
11540 /* Get interface MTU. */
11541 /* ARGSUSED */
11542 int
11543 ip_sioctl_get_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11544 	ip_ioctl_cmd_t *ipip, void *if_req)
11545 {
11546 	struct ifreq	*ifr;
11547 	struct lifreq	*lifr;
11548 
11549 	ip1dbg(("ip_sioctl_get_mtu(%s:%u %p)\n",
11550 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11551 	if (ipip->ipi_cmd_type == IF_CMD) {
11552 		ifr = (struct ifreq *)if_req;
11553 		ifr->ifr_metric = ipif->ipif_mtu;
11554 	} else {
11555 		lifr = (struct lifreq *)if_req;
11556 		lifr->lifr_mtu = ipif->ipif_mtu;
11557 	}
11558 	return (0);
11559 }
11560 
11561 /* Set interface broadcast address. */
11562 /* ARGSUSED2 */
11563 int
11564 ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11565 	ip_ioctl_cmd_t *ipip, void *if_req)
11566 {
11567 	ipaddr_t addr;
11568 	ire_t	*ire;
11569 
11570 	ip1dbg(("ip_sioctl_brdaddr(%s:%u)\n", ipif->ipif_ill->ill_name,
11571 	    ipif->ipif_id));
11572 
11573 	ASSERT(IAM_WRITER_IPIF(ipif));
11574 	if (!(ipif->ipif_flags & IPIF_BROADCAST))
11575 		return (EADDRNOTAVAIL);
11576 
11577 	ASSERT(!(ipif->ipif_isv6));	/* No IPv6 broadcast */
11578 
11579 	if (sin->sin_family != AF_INET)
11580 		return (EAFNOSUPPORT);
11581 
11582 	addr = sin->sin_addr.s_addr;
11583 	if (ipif->ipif_flags & IPIF_UP) {
11584 		/*
11585 		 * If we are already up, make sure the new
11586 		 * broadcast address makes sense.  If it does,
11587 		 * there should be an IRE for it already.
11588 		 * Don't match on ipif, only on the ill
11589 		 * since we are sharing these now. Don't use
11590 		 * MATCH_IRE_ILL_GROUP as we are looking for
11591 		 * the broadcast ire on this ill and each ill
11592 		 * in the group has its own broadcast ire.
11593 		 */
11594 		ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST,
11595 		    ipif, ALL_ZONES, (MATCH_IRE_ILL | MATCH_IRE_TYPE));
11596 		if (ire == NULL) {
11597 			return (EINVAL);
11598 		} else {
11599 			ire_refrele(ire);
11600 		}
11601 	}
11602 	/*
11603 	 * Changing the broadcast addr for this ipif.
11604 	 * Make sure we have valid net and subnet bcast
11605 	 * ire's for other logical interfaces, if needed.
11606 	 */
11607 	if (addr != ipif->ipif_brd_addr)
11608 		ipif_check_bcast_ires(ipif);
11609 	IN6_IPADDR_TO_V4MAPPED(addr, &ipif->ipif_v6brd_addr);
11610 	return (0);
11611 }
11612 
11613 /* Get interface broadcast address. */
11614 /* ARGSUSED */
11615 int
11616 ip_sioctl_get_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11617     ip_ioctl_cmd_t *ipip, void *if_req)
11618 {
11619 	ip1dbg(("ip_sioctl_get_brdaddr(%s:%u %p)\n",
11620 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11621 	if (!(ipif->ipif_flags & IPIF_BROADCAST))
11622 		return (EADDRNOTAVAIL);
11623 
11624 	/* IPIF_BROADCAST not possible with IPv6 */
11625 	ASSERT(!ipif->ipif_isv6);
11626 	*sin = sin_null;
11627 	sin->sin_family = AF_INET;
11628 	sin->sin_addr.s_addr = ipif->ipif_brd_addr;
11629 	return (0);
11630 }
11631 
11632 /*
11633  * This routine is called to handle the SIOCS*IFNETMASK IOCTL.
11634  */
11635 /* ARGSUSED */
11636 int
11637 ip_sioctl_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11638     ip_ioctl_cmd_t *ipip, void *if_req)
11639 {
11640 	int err = 0;
11641 	in6_addr_t v6mask;
11642 
11643 	ip1dbg(("ip_sioctl_netmask(%s:%u %p)\n",
11644 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11645 
11646 	ASSERT(IAM_WRITER_IPIF(ipif));
11647 
11648 	if (ipif->ipif_isv6) {
11649 		sin6_t *sin6;
11650 
11651 		if (sin->sin_family != AF_INET6)
11652 			return (EAFNOSUPPORT);
11653 
11654 		sin6 = (sin6_t *)sin;
11655 		v6mask = sin6->sin6_addr;
11656 	} else {
11657 		ipaddr_t mask;
11658 
11659 		if (sin->sin_family != AF_INET)
11660 			return (EAFNOSUPPORT);
11661 
11662 		mask = sin->sin_addr.s_addr;
11663 		V4MASK_TO_V6(mask, v6mask);
11664 	}
11665 
11666 	/*
11667 	 * No big deal if the interface isn't already up, or the mask
11668 	 * isn't really changing, or this is pt-pt.
11669 	 */
11670 	if (!(ipif->ipif_flags & IPIF_UP) ||
11671 	    IN6_ARE_ADDR_EQUAL(&v6mask, &ipif->ipif_v6net_mask) ||
11672 	    (ipif->ipif_flags & IPIF_POINTOPOINT)) {
11673 		ipif->ipif_v6net_mask = v6mask;
11674 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
11675 			V6_MASK_COPY(ipif->ipif_v6lcl_addr,
11676 			    ipif->ipif_v6net_mask,
11677 			    ipif->ipif_v6subnet);
11678 		}
11679 		return (0);
11680 	}
11681 	/*
11682 	 * Make sure we have valid net and subnet broadcast ire's
11683 	 * for the old netmask, if needed by other logical interfaces.
11684 	 */
11685 	if (!ipif->ipif_isv6)
11686 		ipif_check_bcast_ires(ipif);
11687 
11688 	err = ipif_logical_down(ipif, q, mp);
11689 	if (err == EINPROGRESS)
11690 		return (err);
11691 	ipif_down_tail(ipif);
11692 	err = ip_sioctl_netmask_tail(ipif, sin, q, mp);
11693 	return (err);
11694 }
11695 
11696 static int
11697 ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp)
11698 {
11699 	in6_addr_t v6mask;
11700 	int err = 0;
11701 
11702 	ip1dbg(("ip_sioctl_netmask_tail(%s:%u %p)\n",
11703 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11704 
11705 	if (ipif->ipif_isv6) {
11706 		sin6_t *sin6;
11707 
11708 		sin6 = (sin6_t *)sin;
11709 		v6mask = sin6->sin6_addr;
11710 	} else {
11711 		ipaddr_t mask;
11712 
11713 		mask = sin->sin_addr.s_addr;
11714 		V4MASK_TO_V6(mask, v6mask);
11715 	}
11716 
11717 	ipif->ipif_v6net_mask = v6mask;
11718 	if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
11719 		V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
11720 		    ipif->ipif_v6subnet);
11721 	}
11722 	err = ipif_up(ipif, q, mp);
11723 
11724 	if (err == 0 || err == EINPROGRESS) {
11725 		/*
11726 		 * The interface must be DL_BOUND if this packet has to
11727 		 * go out on the wire. Since we only go through a logical
11728 		 * down and remain bound to the driver during an internal
11729 		 * down/up, that is satisfied.
11730 		 */
11731 		if (!ipif->ipif_isv6 && ipif->ipif_ill->ill_wq != NULL) {
11732 			/* Potentially broadcast an address mask reply. */
11733 			ipif_mask_reply(ipif);
11734 		}
11735 	}
11736 	return (err);
11737 }
11738 
11739 /* ARGSUSED */
11740 int
11741 ip_sioctl_netmask_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11742     ip_ioctl_cmd_t *ipip, void *if_req)
11743 {
11744 	ip1dbg(("ip_sioctl_netmask_restart(%s:%u %p)\n",
11745 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11746 	ipif_down_tail(ipif);
11747 	return (ip_sioctl_netmask_tail(ipif, sin, q, mp));
11748 }
11749 
11750 /* Get interface net mask. */
11751 /* ARGSUSED */
11752 int
11753 ip_sioctl_get_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11754     ip_ioctl_cmd_t *ipip, void *if_req)
11755 {
11756 	struct lifreq *lifr = (struct lifreq *)if_req;
11757 	struct sockaddr_in6 *sin6 = (sin6_t *)sin;
11758 
11759 	ip1dbg(("ip_sioctl_get_netmask(%s:%u %p)\n",
11760 		ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11761 
11762 	/*
11763 	 * The netmask can't change since we have a reference to the ipif.
11764 	 */
11765 	if (ipif->ipif_isv6) {
11766 		ASSERT(ipip->ipi_cmd_type == LIF_CMD);
11767 		*sin6 = sin6_null;
11768 		sin6->sin6_family = AF_INET6;
11769 		sin6->sin6_addr = ipif->ipif_v6net_mask;
11770 		lifr->lifr_addrlen =
11771 		    ip_mask_to_plen_v6(&ipif->ipif_v6net_mask);
11772 	} else {
11773 		*sin = sin_null;
11774 		sin->sin_family = AF_INET;
11775 		sin->sin_addr.s_addr = ipif->ipif_net_mask;
11776 		if (ipip->ipi_cmd_type == LIF_CMD) {
11777 			lifr->lifr_addrlen =
11778 			    ip_mask_to_plen(ipif->ipif_net_mask);
11779 		}
11780 	}
11781 	return (0);
11782 }
11783 
11784 /* ARGSUSED */
11785 int
11786 ip_sioctl_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11787     ip_ioctl_cmd_t *ipip, void *if_req)
11788 {
11789 
11790 	ip1dbg(("ip_sioctl_metric(%s:%u %p)\n",
11791 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11792 	/*
11793 	 * Set interface metric.  We don't use this for
11794 	 * anything but we keep track of it in case it is
11795 	 * important to routing applications or such.
11796 	 */
11797 	if (ipip->ipi_cmd_type == IF_CMD) {
11798 		struct ifreq    *ifr;
11799 
11800 		ifr = (struct ifreq *)if_req;
11801 		ipif->ipif_metric = ifr->ifr_metric;
11802 	} else {
11803 		struct lifreq   *lifr;
11804 
11805 		lifr = (struct lifreq *)if_req;
11806 		ipif->ipif_metric = lifr->lifr_metric;
11807 	}
11808 	return (0);
11809 }
11810 
11812 /* ARGSUSED */
11813 int
11814 ip_sioctl_get_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11815     ip_ioctl_cmd_t *ipip, void *if_req)
11816 {
11817 
11818 	/* Get interface metric. */
11819 	ip1dbg(("ip_sioctl_get_metric(%s:%u %p)\n",
11820 		ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11821 	if (ipip->ipi_cmd_type == IF_CMD) {
11822 		struct ifreq    *ifr;
11823 
11824 		ifr = (struct ifreq *)if_req;
11825 		ifr->ifr_metric = ipif->ipif_metric;
11826 	} else {
11827 		struct lifreq   *lifr;
11828 
11829 		lifr = (struct lifreq *)if_req;
11830 		lifr->lifr_metric = ipif->ipif_metric;
11831 	}
11832 
11833 	return (0);
11834 }
11835 
11836 /* ARGSUSED */
11837 int
11838 ip_sioctl_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11839     ip_ioctl_cmd_t *ipip, void *if_req)
11840 {
11841 
11842 	ip1dbg(("ip_sioctl_muxid(%s:%u %p)\n",
11843 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11844 	/*
11845 	 * Set the muxid returned from I_PLINK.
11846 	 */
11847 	if (ipip->ipi_cmd_type == IF_CMD) {
11848 		struct ifreq *ifr = (struct ifreq *)if_req;
11849 
11850 		ipif->ipif_ill->ill_ip_muxid = ifr->ifr_ip_muxid;
11851 		ipif->ipif_ill->ill_arp_muxid = ifr->ifr_arp_muxid;
11852 	} else {
11853 		struct lifreq *lifr = (struct lifreq *)if_req;
11854 
11855 		ipif->ipif_ill->ill_ip_muxid = lifr->lifr_ip_muxid;
11856 		ipif->ipif_ill->ill_arp_muxid = lifr->lifr_arp_muxid;
11857 	}
11858 	return (0);
11859 }
11860 
11861 /* ARGSUSED */
11862 int
11863 ip_sioctl_get_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11864     ip_ioctl_cmd_t *ipip, void *if_req)
11865 {
11866 
11867 	ip1dbg(("ip_sioctl_get_muxid(%s:%u %p)\n",
11868 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11869 	/*
11870 	 * Get the muxid saved in ill for I_PUNLINK.
11871 	 */
11872 	if (ipip->ipi_cmd_type == IF_CMD) {
11873 		struct ifreq *ifr = (struct ifreq *)if_req;
11874 
11875 		ifr->ifr_ip_muxid = ipif->ipif_ill->ill_ip_muxid;
11876 		ifr->ifr_arp_muxid = ipif->ipif_ill->ill_arp_muxid;
11877 	} else {
11878 		struct lifreq *lifr = (struct lifreq *)if_req;
11879 
11880 		lifr->lifr_ip_muxid = ipif->ipif_ill->ill_ip_muxid;
11881 		lifr->lifr_arp_muxid = ipif->ipif_ill->ill_arp_muxid;
11882 	}
11883 	return (0);
11884 }
11885 
11886 /*
11887  * Set the subnet prefix. Does not modify the broadcast address.
11888  */
11889 /* ARGSUSED */
11890 int
11891 ip_sioctl_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11892     ip_ioctl_cmd_t *ipip, void *if_req)
11893 {
11894 	int err = 0;
11895 	in6_addr_t v6addr;
11896 	in6_addr_t v6mask;
11897 	boolean_t need_up = B_FALSE;
11898 	int addrlen;
11899 
11900 	ip1dbg(("ip_sioctl_subnet(%s:%u %p)\n",
11901 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11902 
11903 	ASSERT(IAM_WRITER_IPIF(ipif));
11904 	addrlen = ((struct lifreq *)if_req)->lifr_addrlen;
11905 
11906 	if (ipif->ipif_isv6) {
11907 		sin6_t *sin6;
11908 
11909 		if (sin->sin_family != AF_INET6)
11910 			return (EAFNOSUPPORT);
11911 
11912 		sin6 = (sin6_t *)sin;
11913 		v6addr = sin6->sin6_addr;
11914 		if (!ip_remote_addr_ok_v6(&v6addr, &ipv6_all_ones))
11915 			return (EADDRNOTAVAIL);
11916 	} else {
11917 		ipaddr_t addr;
11918 
11919 		if (sin->sin_family != AF_INET)
11920 			return (EAFNOSUPPORT);
11921 
11922 		addr = sin->sin_addr.s_addr;
11923 		if (!ip_addr_ok_v4(addr, 0xFFFFFFFF))
11924 			return (EADDRNOTAVAIL);
11925 		IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
11926 		/* Add 96 bits */
11927 		addrlen += IPV6_ABITS - IP_ABITS;
11928 	}
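	/*
	 * Example of the conversion above (hypothetical request): a v4
	 * SIOCSLIFSUBNET with lifr_addrlen 24 becomes 24 + (128 - 32) =
	 * 120 bits, i.e. the same /24 expressed in the V4-mapped
	 * ::ffff:0:0/96 space.
	 */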
11929 
11930 	if (ip_plen_to_mask_v6(addrlen, &v6mask) == NULL)
11931 		return (EINVAL);
11932 
11933 	/* Check if any bits in the address are set past the mask */
11934 	if (!V6_MASK_EQ(v6addr, v6mask, v6addr))
11935 		return (EINVAL);
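	/*
	 * E.g. the V6_MASK_EQ check above rejects a (hypothetical) v4
	 * request for 192.168.1.5/24: masking 192.168.1.5 with
	 * 255.255.255.0 yields 192.168.1.0, which differs from the
	 * supplied address, i.e. host bits were set past the prefix.
	 */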
11936 
11937 	if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6subnet, &v6addr) &&
11938 	    IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6net_mask, &v6mask))
11939 		return (0);	/* No change */
11940 
11941 	if (ipif->ipif_flags & IPIF_UP) {
11942 		/*
11943 		 * If the interface is already marked up,
11944 		 * we call ipif_down which will take care
11945 		 * of ditching any IREs that have been set
11946 		 * up based on the old interface address.
11947 		 */
11948 		err = ipif_logical_down(ipif, q, mp);
11949 		if (err == EINPROGRESS)
11950 			return (err);
11951 		ipif_down_tail(ipif);
11952 		need_up = B_TRUE;
11953 	}
11954 
11955 	err = ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, need_up);
11956 	return (err);
11957 }
11958 
11959 static int
11960 ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t v6addr, in6_addr_t v6mask,
11961     queue_t *q, mblk_t *mp, boolean_t need_up)
11962 {
11963 	ill_t	*ill = ipif->ipif_ill;
11964 	int	err = 0;
11965 
11966 	ip1dbg(("ip_sioctl_subnet_tail(%s:%u %p)\n",
11967 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11968 
11969 	/* Set the new address. */
11970 	mutex_enter(&ill->ill_lock);
11971 	ipif->ipif_v6net_mask = v6mask;
11972 	if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
11973 		V6_MASK_COPY(v6addr, ipif->ipif_v6net_mask,
11974 		    ipif->ipif_v6subnet);
11975 	}
11976 	mutex_exit(&ill->ill_lock);
11977 
11978 	if (need_up) {
11979 		/*
11980 		 * Now bring the interface back up.  If this
11981 		 * is the only IPIF for the ILL, ipif_up
11982 		 * will have to re-bind to the device, so
11983 		 * we may get back EINPROGRESS, in which
11984 		 * case, this IOCTL will get completed in
11985 		 * ip_rput_dlpi when we see the DL_BIND_ACK.
11986 		 */
11987 		err = ipif_up(ipif, q, mp);
11988 		if (err == EINPROGRESS)
11989 			return (err);
11990 	}
11991 	return (err);
11992 }
11993 
11994 /* ARGSUSED */
11995 int
11996 ip_sioctl_subnet_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11997     ip_ioctl_cmd_t *ipip, void *if_req)
11998 {
11999 	int	addrlen;
12000 	in6_addr_t v6addr;
12001 	in6_addr_t v6mask;
12002 	struct lifreq *lifr = (struct lifreq *)if_req;
12003 
12004 	ip1dbg(("ip_sioctl_subnet_restart(%s:%u %p)\n",
12005 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12006 	ipif_down_tail(ipif);
12007 
12008 	addrlen = lifr->lifr_addrlen;
12009 	if (ipif->ipif_isv6) {
12010 		sin6_t *sin6;
12011 
12012 		sin6 = (sin6_t *)sin;
12013 		v6addr = sin6->sin6_addr;
12014 	} else {
12015 		ipaddr_t addr;
12016 
12017 		addr = sin->sin_addr.s_addr;
12018 		IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
12019 		addrlen += IPV6_ABITS - IP_ABITS;
12020 	}
12021 	(void) ip_plen_to_mask_v6(addrlen, &v6mask);
12022 
12023 	return (ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, B_TRUE));
12024 }
12025 
12026 /* ARGSUSED */
12027 int
12028 ip_sioctl_get_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12029     ip_ioctl_cmd_t *ipip, void *if_req)
12030 {
12031 	struct lifreq *lifr = (struct lifreq *)if_req;
12032 	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sin;
12033 
12034 	ip1dbg(("ip_sioctl_get_subnet(%s:%u %p)\n",
12035 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12036 	ASSERT(ipip->ipi_cmd_type == LIF_CMD);
12037 
12038 	if (ipif->ipif_isv6) {
12039 		*sin6 = sin6_null;
12040 		sin6->sin6_family = AF_INET6;
12041 		sin6->sin6_addr = ipif->ipif_v6subnet;
12042 		lifr->lifr_addrlen =
12043 		    ip_mask_to_plen_v6(&ipif->ipif_v6net_mask);
12044 	} else {
12045 		*sin = sin_null;
12046 		sin->sin_family = AF_INET;
12047 		sin->sin_addr.s_addr = ipif->ipif_subnet;
12048 		lifr->lifr_addrlen = ip_mask_to_plen(ipif->ipif_net_mask);
12049 	}
12050 	return (0);
12051 }
12052 
12053 /*
12054  * Set the IPv6 address token.
12055  */
12056 /* ARGSUSED */
12057 int
12058 ip_sioctl_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12059     ip_ioctl_cmd_t *ipi, void *if_req)
12060 {
12061 	ill_t *ill = ipif->ipif_ill;
12062 	int err;
12063 	in6_addr_t v6addr;
12064 	in6_addr_t v6mask;
12065 	boolean_t need_up = B_FALSE;
12066 	int i;
12067 	sin6_t *sin6 = (sin6_t *)sin;
12068 	struct lifreq *lifr = (struct lifreq *)if_req;
12069 	int addrlen;
12070 
12071 	ip1dbg(("ip_sioctl_token(%s:%u %p)\n",
12072 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12073 	ASSERT(IAM_WRITER_IPIF(ipif));
12074 
12075 	addrlen = lifr->lifr_addrlen;
12076 	/* Only allow for logical unit zero, i.e. not on "le0:17" */
12077 	if (ipif->ipif_id != 0)
12078 		return (EINVAL);
12079 
12080 	if (!ipif->ipif_isv6)
12081 		return (EINVAL);
12082 
12083 	if (addrlen > IPV6_ABITS)
12084 		return (EINVAL);
12085 
12086 	v6addr = sin6->sin6_addr;
12087 
12088 	/*
12089 	 * The length of the token is the length from the end.  To get
12090 	 * the proper mask for this, compute the mask of the bits not
12091 	 * in the token, i.e. the prefix, and then xor to get the mask.
12092 	 */
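	/*
	 * Worked example: for addrlen = 64, ip_plen_to_mask_v6(128 - 64)
	 * yields the prefix mask ffff:ffff:ffff:ffff::, and inverting
	 * each 32-bit word below turns it into the token mask
	 * ::ffff:ffff:ffff:ffff, i.e. the low-order 64 bits.
	 */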
12093 	if (ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask) == NULL)
12094 		return (EINVAL);
12095 	for (i = 0; i < 4; i++) {
12096 		v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff;
12097 	}
12098 
12099 	if (V6_MASK_EQ(v6addr, v6mask, ill->ill_token) &&
12100 	    ill->ill_token_length == addrlen)
12101 		return (0);	/* No change */
12102 
12103 	if (ipif->ipif_flags & IPIF_UP) {
12104 		err = ipif_logical_down(ipif, q, mp);
12105 		if (err == EINPROGRESS)
12106 			return (err);
12107 		ipif_down_tail(ipif);
12108 		need_up = B_TRUE;
12109 	}
12110 	err = ip_sioctl_token_tail(ipif, sin6, addrlen, q, mp, need_up);
12111 	return (err);
12112 }
12113 
12114 static int
12115 ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, queue_t *q,
12116     mblk_t *mp, boolean_t need_up)
12117 {
12118 	in6_addr_t v6addr;
12119 	in6_addr_t v6mask;
12120 	ill_t	*ill = ipif->ipif_ill;
12121 	int	i;
12122 	int	err = 0;
12123 
12124 	ip1dbg(("ip_sioctl_token_tail(%s:%u %p)\n",
12125 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12126 	v6addr = sin6->sin6_addr;
12127 	/*
12128 	 * The length of the token is the length from the end.  To get
12129 	 * the proper mask for this, compute the mask of the bits not
	 * in the token; i.e. the prefix, and then xor to get the mask.
12131 	 */
12132 	(void) ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask);
12133 	for (i = 0; i < 4; i++)
12134 		v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff;
12135 
12136 	mutex_enter(&ill->ill_lock);
12137 	V6_MASK_COPY(v6addr, v6mask, ill->ill_token);
12138 	ill->ill_token_length = addrlen;
12139 	mutex_exit(&ill->ill_lock);
12140 
12141 	if (need_up) {
12142 		/*
12143 		 * Now bring the interface back up.  If this
12144 		 * is the only IPIF for the ILL, ipif_up
12145 		 * will have to re-bind to the device, so
12146 		 * we may get back EINPROGRESS, in which
12147 		 * case, this IOCTL will get completed in
12148 		 * ip_rput_dlpi when we see the DL_BIND_ACK.
12149 		 */
12150 		err = ipif_up(ipif, q, mp);
12151 		if (err == EINPROGRESS)
12152 			return (err);
12153 	}
12154 	return (err);
12155 }
12156 
12157 /* ARGSUSED */
12158 int
12159 ip_sioctl_get_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12160     ip_ioctl_cmd_t *ipi, void *if_req)
12161 {
12162 	ill_t *ill;
12163 	sin6_t *sin6 = (sin6_t *)sin;
12164 	struct lifreq *lifr = (struct lifreq *)if_req;
12165 
12166 	ip1dbg(("ip_sioctl_get_token(%s:%u %p)\n",
12167 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12168 	if (ipif->ipif_id != 0)
12169 		return (EINVAL);
12170 
12171 	ill = ipif->ipif_ill;
12172 	if (!ill->ill_isv6)
12173 		return (ENXIO);
12174 
12175 	*sin6 = sin6_null;
12176 	sin6->sin6_family = AF_INET6;
12177 	ASSERT(!IN6_IS_ADDR_V4MAPPED(&ill->ill_token));
12178 	sin6->sin6_addr = ill->ill_token;
12179 	lifr->lifr_addrlen = ill->ill_token_length;
12180 	return (0);
12181 }
12182 
12183 /*
12184  * Set (hardware) link specific information that might override
12185  * what was acquired through the DL_INFO_ACK.
12186  * The logic is as follows.
12187  *
12188  * become exclusive
12189  * set CHANGING flag
12190  * change mtu on affected IREs
12191  * clear CHANGING flag
12192  *
 * An ire add that occurs before the CHANGING flag is set will have its mtu
 * changed by ip_sioctl_lnkinfo's IRE walk below.
 *
 * During the time the CHANGING flag is set, no new ires will be added to the
 * bucket, and an ire add will fail (due to the CHANGING flag).
 *
 * An ire add that occurs after the CHANGING flag is cleared will pick up
 * the right mtu before it is added to the bucket.
12201  *
12202  * Obviously only 1 thread can set the CHANGING flag and we need to become
12203  * exclusive to set the flag.
12204  */
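/*
 * A minimal userland sketch (illustrative only; the interface name is
 * assumed and error handling is omitted) of reaching this handler via
 * the SIOCSLIFLNKINFO ioctl.  Fields left at zero mean "unspecified"
 * and are not changed:
 *
 *	struct lifreq lifr;
 *	int s = socket(AF_INET6, SOCK_DGRAM, 0);
 *
 *	bzero(&lifr, sizeof (lifr));
 *	(void) strncpy(lifr.lifr_name, "hme0", sizeof (lifr.lifr_name));
 *	lifr.lifr_ifinfo.lir_maxmtu = 1400;
 *	(void) ioctl(s, SIOCSLIFLNKINFO, (caddr_t)&lifr);
 */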
12205 /* ARGSUSED */
12206 int
12207 ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12208     ip_ioctl_cmd_t *ipi, void *if_req)
12209 {
12210 	ill_t		*ill = ipif->ipif_ill;
12211 	ipif_t		*nipif;
12212 	int		ip_min_mtu;
12213 	boolean_t	mtu_walk = B_FALSE;
12214 	struct lifreq	*lifr = (struct lifreq *)if_req;
12215 	lif_ifinfo_req_t *lir;
12216 	ire_t		*ire;
12217 
12218 	ip1dbg(("ip_sioctl_lnkinfo(%s:%u %p)\n",
12219 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12220 	lir = &lifr->lifr_ifinfo;
12221 	ASSERT(IAM_WRITER_IPIF(ipif));
12222 
12223 	/* Only allow for logical unit zero i.e. not on "le0:17" */
12224 	if (ipif->ipif_id != 0)
12225 		return (EINVAL);
12226 
12227 	/* Set interface MTU. */
12228 	if (ipif->ipif_isv6)
12229 		ip_min_mtu = IPV6_MIN_MTU;
12230 	else
12231 		ip_min_mtu = IP_MIN_MTU;
12232 
12233 	/*
12234 	 * Verify values before we set anything. Allow zero to
12235 	 * mean unspecified.
12236 	 */
12237 	if (lir->lir_maxmtu != 0 &&
12238 	    (lir->lir_maxmtu > ill->ill_max_frag ||
12239 	    lir->lir_maxmtu < ip_min_mtu))
12240 		return (EINVAL);
12241 	if (lir->lir_reachtime != 0 &&
12242 	    lir->lir_reachtime > ND_MAX_REACHTIME)
12243 		return (EINVAL);
12244 	if (lir->lir_reachretrans != 0 &&
12245 	    lir->lir_reachretrans > ND_MAX_REACHRETRANSTIME)
12246 		return (EINVAL);
12247 
12248 	mutex_enter(&ill->ill_lock);
12249 	ill->ill_state_flags |= ILL_CHANGING;
12250 	for (nipif = ill->ill_ipif; nipif != NULL;
12251 	    nipif = nipif->ipif_next) {
12252 		nipif->ipif_state_flags |= IPIF_CHANGING;
12253 	}
12254 
12255 	mutex_exit(&ill->ill_lock);
12256 
12257 	if (lir->lir_maxmtu != 0) {
12258 		ill->ill_max_mtu = lir->lir_maxmtu;
12259 		ill->ill_mtu_userspecified = 1;
12260 		mtu_walk = B_TRUE;
12261 	}
12262 
12263 	if (lir->lir_reachtime != 0)
12264 		ill->ill_reachable_time = lir->lir_reachtime;
12265 
12266 	if (lir->lir_reachretrans != 0)
12267 		ill->ill_reachable_retrans_time = lir->lir_reachretrans;
12268 
12269 	ill->ill_max_hops = lir->lir_maxhops;
12270 
12271 	ill->ill_max_buf = ND_MAX_Q;
12272 
12273 	if (mtu_walk) {
12274 		/*
12275 		 * Set the MTU on all ipifs associated with this ill except
12276 		 * for those whose MTU was fixed via SIOCSLIFMTU.
12277 		 */
12278 		for (nipif = ill->ill_ipif; nipif != NULL;
12279 		    nipif = nipif->ipif_next) {
12280 			if (nipif->ipif_flags & IPIF_FIXEDMTU)
12281 				continue;
12282 
12283 			nipif->ipif_mtu = ill->ill_max_mtu;
12284 
12285 			if (!(nipif->ipif_flags & IPIF_UP))
12286 				continue;
12287 
12288 			if (nipif->ipif_isv6)
12289 				ire = ipif_to_ire_v6(nipif);
12290 			else
12291 				ire = ipif_to_ire(nipif);
12292 			if (ire != NULL) {
				/* Propagate nipif's newly set MTU */
				ire->ire_max_frag = nipif->ipif_mtu;
12294 				ire_refrele(ire);
12295 			}
12296 			if (ill->ill_isv6) {
12297 				ire_walk_ill_v6(MATCH_IRE_ILL, 0,
12298 				    ipif_mtu_change, (char *)nipif,
12299 				    ill);
12300 			} else {
12301 				ire_walk_ill_v4(MATCH_IRE_ILL, 0,
12302 				    ipif_mtu_change, (char *)nipif,
12303 				    ill);
12304 			}
12305 		}
12306 	}
12307 
12308 	mutex_enter(&ill->ill_lock);
12309 	for (nipif = ill->ill_ipif; nipif != NULL;
12310 	    nipif = nipif->ipif_next) {
12311 		nipif->ipif_state_flags &= ~IPIF_CHANGING;
12312 	}
12313 	ILL_UNMARK_CHANGING(ill);
12314 	mutex_exit(&ill->ill_lock);
12315 
12316 	return (0);
12317 }
12318 
12319 /* ARGSUSED */
12320 int
12321 ip_sioctl_get_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12322     ip_ioctl_cmd_t *ipi, void *if_req)
12323 {
12324 	struct lif_ifinfo_req *lir;
12325 	ill_t *ill = ipif->ipif_ill;
12326 
12327 	ip1dbg(("ip_sioctl_get_lnkinfo(%s:%u %p)\n",
12328 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12329 	if (ipif->ipif_id != 0)
12330 		return (EINVAL);
12331 
12332 	lir = &((struct lifreq *)if_req)->lifr_ifinfo;
12333 	lir->lir_maxhops = ill->ill_max_hops;
12334 	lir->lir_reachtime = ill->ill_reachable_time;
12335 	lir->lir_reachretrans = ill->ill_reachable_retrans_time;
12336 	lir->lir_maxmtu = ill->ill_max_mtu;
12337 
12338 	return (0);
12339 }
12340 
12341 /*
12342  * Return best guess as to the subnet mask for the specified address.
12343  * Based on the subnet masks for all the configured interfaces.
12344  *
12345  * We end up returning a zero mask in the case of default, multicast or
12346  * experimental.
12347  */
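/*
 * Worked example: for addr 10.1.2.3, ip_net_mask() returns the classful
 * guess 255.0.0.0.  If an IPIF_UP interface is configured as 10.1.2.1/24,
 * it matches addr under that classful mask and its 255.255.255.0 net
 * mask is returned instead; with no match, the classful guess stands.
 */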
12348 static ipaddr_t
12349 ip_subnet_mask(ipaddr_t addr, ipif_t **ipifp)
12350 {
12351 	ipaddr_t net_mask;
12352 	ill_t	*ill;
12353 	ipif_t	*ipif;
12354 	ill_walk_context_t ctx;
12355 	ipif_t	*fallback_ipif = NULL;
12356 
12357 	net_mask = ip_net_mask(addr);
12358 	if (net_mask == 0) {
12359 		*ipifp = NULL;
12360 		return (0);
12361 	}
12362 
	/*
	 * Check whether this is perhaps a local subnet route; note that
	 * this function only applies to IPv4 interfaces.
	 */
12365 	rw_enter(&ill_g_lock, RW_READER);
12366 	ill = ILL_START_WALK_V4(&ctx);
12367 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
12368 		mutex_enter(&ill->ill_lock);
12369 		for (ipif = ill->ill_ipif; ipif != NULL;
12370 		    ipif = ipif->ipif_next) {
12371 			if (!IPIF_CAN_LOOKUP(ipif))
12372 				continue;
12373 			if (!(ipif->ipif_flags & IPIF_UP))
12374 				continue;
12375 			if ((ipif->ipif_subnet & net_mask) ==
12376 			    (addr & net_mask)) {
12377 				/*
12378 				 * Don't trust pt-pt interfaces if there are
12379 				 * other interfaces.
12380 				 */
12381 				if (ipif->ipif_flags & IPIF_POINTOPOINT) {
12382 					if (fallback_ipif == NULL) {
12383 						ipif_refhold_locked(ipif);
12384 						fallback_ipif = ipif;
12385 					}
12386 					continue;
12387 				}
12388 
12389 				/*
12390 				 * Fine. Just assume the same net mask as the
12391 				 * directly attached subnet interface is using.
12392 				 */
12393 				ipif_refhold_locked(ipif);
12394 				mutex_exit(&ill->ill_lock);
12395 				rw_exit(&ill_g_lock);
12396 				if (fallback_ipif != NULL)
12397 					ipif_refrele(fallback_ipif);
12398 				*ipifp = ipif;
12399 				return (ipif->ipif_net_mask);
12400 			}
12401 		}
12402 		mutex_exit(&ill->ill_lock);
12403 	}
12404 	rw_exit(&ill_g_lock);
12405 
12406 	*ipifp = fallback_ipif;
12407 	return ((fallback_ipif != NULL) ?
12408 	    fallback_ipif->ipif_net_mask : net_mask);
12409 }
12410 
12411 /*
12412  * ip_sioctl_copyin_setup calls ip_wput_ioctl to process the IP_IOCTL ioctl.
12413  */
12414 static void
12415 ip_wput_ioctl(queue_t *q, mblk_t *mp)
12416 {
12417 	IOCP	iocp;
12418 	ipft_t	*ipft;
12419 	ipllc_t	*ipllc;
12420 	mblk_t	*mp1;
12421 	cred_t	*cr;
12422 	int	error = 0;
12423 	conn_t	*connp;
12424 
12425 	ip1dbg(("ip_wput_ioctl"));
12426 	iocp = (IOCP)mp->b_rptr;
12427 	mp1 = mp->b_cont;
12428 	if (mp1 == NULL) {
12429 		iocp->ioc_error = EINVAL;
12430 		mp->b_datap->db_type = M_IOCNAK;
12431 		iocp->ioc_count = 0;
12432 		qreply(q, mp);
12433 		return;
12434 	}
12435 
12436 	/*
12437 	 * These IOCTLs provide various control capabilities to
12438 	 * upstream agents such as ULPs and processes.	There
12439 	 * are currently two such IOCTLs implemented.  They
12440 	 * are used by TCP to provide update information for
12441 	 * existing IREs and to forcibly delete an IRE for a
12442 	 * host that is not responding, thereby forcing an
12443 	 * attempt at a new route.
12444 	 */
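	/*
	 * By way of illustration: TCP's IRE deletion request arrives as
	 * an M_IOCTL whose payload begins with an ipllc_t carrying a
	 * command such as IP_IOC_IRE_DELETE; the loop below matches
	 * ipllc_cmd against ip_ioctl_ftbl to locate the handler.
	 */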
12445 	iocp->ioc_error = EINVAL;
12446 	if (!pullupmsg(mp1, sizeof (ipllc->ipllc_cmd)))
12447 		goto done;
12448 
12449 	ipllc = (ipllc_t *)mp1->b_rptr;
12450 	for (ipft = ip_ioctl_ftbl; ipft->ipft_pfi; ipft++) {
12451 		if (ipllc->ipllc_cmd == ipft->ipft_cmd)
12452 			break;
12453 	}
12454 	/*
12455 	 * prefer credential from mblk over ioctl;
12456 	 * see ip_sioctl_copyin_setup
12457 	 */
12458 	cr = DB_CREDDEF(mp, iocp->ioc_cr);
12459 
12460 	/*
12461 	 * Refhold the conn in case the request gets queued up in some lookup
12462 	 */
12463 	ASSERT(CONN_Q(q));
12464 	connp = Q_TO_CONN(q);
12465 	CONN_INC_REF(connp);
12466 	if (ipft->ipft_pfi &&
12467 	    ((mp1->b_wptr - mp1->b_rptr) >= ipft->ipft_min_size ||
	    pullupmsg(mp1, ipft->ipft_min_size))) {
12469 		error = (*ipft->ipft_pfi)(q,
12470 		    (ipft->ipft_flags & IPFT_F_SELF_REPLY) ? mp : mp1, cr);
12471 	}
12472 	if (ipft->ipft_flags & IPFT_F_SELF_REPLY) {
12473 		/*
12474 		 * CONN_OPER_PENDING_DONE happens in the function called
12475 		 * through ipft_pfi above.
12476 		 */
12477 		return;
12478 	}
12479 
12480 	CONN_OPER_PENDING_DONE(connp);
12481 	if (ipft->ipft_flags & IPFT_F_NO_REPLY) {
12482 		freemsg(mp);
12483 		return;
12484 	}
12485 	iocp->ioc_error = error;
12486 
12487 done:
12488 	mp->b_datap->db_type = M_IOCACK;
12489 	if (iocp->ioc_error)
12490 		iocp->ioc_count = 0;
12491 	qreply(q, mp);
12492 }
12493 
12494 /*
 * Look up an ipif using the sequence id (ipif_seqid).
12496  */
12497 ipif_t *
12498 ipif_lookup_seqid(ill_t *ill, uint_t seqid)
12499 {
12500 	ipif_t *ipif;
12501 
12502 	ASSERT(MUTEX_HELD(&ill->ill_lock));
12503 
12504 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
12505 		if (ipif->ipif_seqid == seqid && IPIF_CAN_LOOKUP(ipif))
12506 			return (ipif);
12507 	}
12508 	return (NULL);
12509 }
12510 
12511 uint64_t ipif_g_seqid;
12512 
12513 /*
12514  * Assign a unique id for the ipif. This is used later when we send
 * IREs to ARP for resolution, where we initialize ire_ipif_seqid
 * to the value pointed to by ire_ipif->ipif_seqid. Later, when the
12517  * IRE is added, we verify that ipif has not disappeared.
12518  */
12520 static void
12521 ipif_assign_seqid(ipif_t *ipif)
12522 {
12523 	ipif->ipif_seqid = atomic_add_64_nv(&ipif_g_seqid, 1);
12524 }
12525 
12526 /*
12527  * Insert the ipif, so that the list of ipifs on the ill will be sorted
12528  * with respect to ipif_id. Note that an ipif with an ipif_id of -1 will
12529  * be inserted into the first space available in the list. The value of
12530  * ipif_id will then be set to the appropriate value for its position.
12531  */
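/*
 * For example, if an ill currently has ipifs with ids 0, 1 and 3,
 * inserting an ipif with ipif_id == -1 fills the gap and assigns it
 * id 2; inserting another ipif with ipif_id == -1 would then get id 4.
 */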
12532 static int
12533 ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock)
12534 {
12535 	ill_t *ill;
12536 	ipif_t *tipif;
12537 	ipif_t **tipifp;
12538 	int id;
12539 
12540 	ASSERT(ipif->ipif_ill->ill_net_type == IRE_LOOPBACK ||
12541 	    IAM_WRITER_IPIF(ipif));
12542 
12543 	ill = ipif->ipif_ill;
12544 	ASSERT(ill != NULL);
12545 
12546 	/*
12547 	 * In the case of lo0:0 we already hold the ill_g_lock.
12548 	 * ill_lookup_on_name (acquires ill_g_lock) -> ipif_allocate ->
12549 	 * ipif_insert. Another such caller is ipif_move.
12550 	 */
12551 	if (acquire_g_lock)
12552 		rw_enter(&ill_g_lock, RW_WRITER);
12553 	if (acquire_ill_lock)
12554 		mutex_enter(&ill->ill_lock);
12555 	id = ipif->ipif_id;
12556 	tipifp = &(ill->ill_ipif);
12557 	if (id == -1) {	/* need to find a real id */
12558 		id = 0;
12559 		while ((tipif = *tipifp) != NULL) {
12560 			ASSERT(tipif->ipif_id >= id);
12561 			if (tipif->ipif_id != id)
12562 				break; /* non-consecutive id */
12563 			id++;
12564 			tipifp = &(tipif->ipif_next);
12565 		}
12566 		/* limit number of logical interfaces */
12567 		if (id >= ip_addrs_per_if) {
12568 			if (acquire_ill_lock)
12569 				mutex_exit(&ill->ill_lock);
12570 			if (acquire_g_lock)
12571 				rw_exit(&ill_g_lock);
12572 			return (-1);
12573 		}
12574 		ipif->ipif_id = id; /* assign new id */
12575 	} else if (id < ip_addrs_per_if) {
12576 		/* we have a real id; insert ipif in the right place */
12577 		while ((tipif = *tipifp) != NULL) {
12578 			ASSERT(tipif->ipif_id != id);
12579 			if (tipif->ipif_id > id)
12580 				break; /* found correct location */
12581 			tipifp = &(tipif->ipif_next);
12582 		}
12583 	} else {
12584 		if (acquire_ill_lock)
12585 			mutex_exit(&ill->ill_lock);
12586 		if (acquire_g_lock)
12587 			rw_exit(&ill_g_lock);
12588 		return (-1);
12589 	}
12590 
12591 	ASSERT(tipifp != &(ill->ill_ipif) || id == 0);
12592 
12593 	ipif->ipif_next = tipif;
12594 	*tipifp = ipif;
12595 	if (acquire_ill_lock)
12596 		mutex_exit(&ill->ill_lock);
12597 	if (acquire_g_lock)
12598 		rw_exit(&ill_g_lock);
12599 	return (0);
12600 }
12601 
12602 /*
12603  * Allocate and initialize a new interface control structure.  (Always
12604  * called as writer.)
12605  * When ipif_allocate() is called from ip_ll_subnet_defaults, the ill
12606  * is not part of the global linked list of ills. ipif_seqid is unique
12607  * in the system and to preserve the uniqueness, it is assigned only
12608  * when ill becomes part of the global list. At that point ill will
12609  * have a name. If it doesn't get assigned here, it will get assigned
12610  * in ipif_set_values() as part of SIOCSLIFNAME processing.
 * Additionally, if we come here from ip_ll_subnet_defaults, we don't set
12612  * the interface flags or any other information from the DL_INFO_ACK for
12613  * DL_STYLE2 drivers (initialize == B_FALSE), since we won't have them at
12614  * this point. The flags etc. will be set in ip_ll_subnet_defaults when the
12615  * second DL_INFO_ACK comes in from the driver.
12616  */
12617 static ipif_t *
12618 ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize)
12619 {
12620 	ipif_t	*ipif;
12621 	phyint_t *phyi;
12622 
12623 	ip1dbg(("ipif_allocate(%s:%d ill %p)\n",
12624 	    ill->ill_name, id, (void *)ill));
12625 	ASSERT(ire_type == IRE_LOOPBACK || IAM_WRITER_ILL(ill));
12626 
12627 	if ((ipif = (ipif_t *)mi_alloc(sizeof (ipif_t), BPRI_MED)) == NULL)
12628 		return (NULL);
12629 	*ipif = ipif_zero;	/* start clean */
12630 
12631 	ipif->ipif_ill = ill;
12632 	ipif->ipif_id = id;	/* could be -1 */
12633 	ipif->ipif_zoneid = GLOBAL_ZONEID;
12634 
12635 	mutex_init(&ipif->ipif_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL);
12636 
12637 	ipif->ipif_refcnt = 0;
12638 	ipif->ipif_saved_ire_cnt = 0;
12639 
12640 	if (ipif_insert(ipif, ire_type != IRE_LOOPBACK, B_TRUE)) {
12641 		mi_free(ipif);
12642 		return (NULL);
12643 	}
12644 	/* -1 id should have been replaced by real id */
12645 	id = ipif->ipif_id;
12646 	ASSERT(id >= 0);
12647 
12648 	if (ill->ill_name[0] != '\0') {
12649 		ipif_assign_seqid(ipif);
12650 		if (ill->ill_phyint->phyint_ifindex != 0)
12651 			sctp_update_ipif(ipif, SCTP_IPIF_INSERT);
12652 	}
12653 	/*
12654 	 * Keep a copy of original id in ipif_orig_ipifid.  Failback
12655 	 * will attempt to restore the original id.  The SIOCSLIFOINDEX
12656 	 * ioctl sets ipif_orig_ipifid to zero.
12657 	 */
12658 	ipif->ipif_orig_ipifid = id;
12659 
12660 	/*
12661 	 * We grab the ill_lock and phyint_lock to protect the flag changes.
12662 	 * The ipif is still not up and can't be looked up until the
12663 	 * ioctl completes and the IPIF_CHANGING flag is cleared.
12664 	 */
12665 	mutex_enter(&ill->ill_lock);
12666 	mutex_enter(&ill->ill_phyint->phyint_lock);
12667 	/*
12668 	 * Set the running flag when logical interface zero is created.
12669 	 * For subsequent logical interfaces, a DLPI link down
12670 	 * notification message may have cleared the running flag to
12671 	 * indicate the link is down, so we shouldn't just blindly set it.
12672 	 */
12673 	if (id == 0)
12674 		ill->ill_phyint->phyint_flags |= PHYI_RUNNING;
12675 	ipif->ipif_ire_type = ire_type;
12676 	phyi = ill->ill_phyint;
12677 	ipif->ipif_orig_ifindex = phyi->phyint_ifindex;
12678 
12679 	if (ipif->ipif_isv6) {
12680 		ill->ill_flags |= ILLF_IPV6;
12681 	} else {
12682 		ipaddr_t inaddr_any = INADDR_ANY;
12683 
12684 		ill->ill_flags |= ILLF_IPV4;
12685 
12686 		/* Keep the IN6_IS_ADDR_V4MAPPED assertions happy */
12687 		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
12688 		    &ipif->ipif_v6lcl_addr);
12689 		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
12690 		    &ipif->ipif_v6src_addr);
12691 		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
12692 		    &ipif->ipif_v6subnet);
12693 		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
12694 		    &ipif->ipif_v6net_mask);
12695 		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
12696 		    &ipif->ipif_v6brd_addr);
12697 		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
12698 		    &ipif->ipif_v6pp_dst_addr);
12699 	}
12700 
12701 	/*
12702 	 * Don't set the interface flags etc. now, will do it in
12703 	 * ip_ll_subnet_defaults.
12704 	 */
12705 	if (!initialize) {
12706 		mutex_exit(&ill->ill_lock);
12707 		mutex_exit(&ill->ill_phyint->phyint_lock);
12708 		return (ipif);
12709 	}
12710 	ipif->ipif_mtu = ill->ill_max_mtu;
12711 
12712 	if (ill->ill_bcast_addr_length != 0) {
12713 		/*
12714 		 * Later detect lack of DLPI driver multicast
12715 		 * capability by catching DL_ENABMULTI errors in
12716 		 * ip_rput_dlpi.
12717 		 */
12718 		ill->ill_flags |= ILLF_MULTICAST;
12719 		if (!ipif->ipif_isv6)
12720 			ipif->ipif_flags |= IPIF_BROADCAST;
12721 	} else {
12722 		if (ill->ill_net_type != IRE_LOOPBACK) {
12723 			if (ipif->ipif_isv6)
12724 				/*
12725 				 * Note: xresolv interfaces will eventually need
12726 				 * NOARP set here as well, but that will require
12727 				 * those external resolvers to have some
12728 				 * knowledge of that flag and act appropriately.
12729 				 * Not to be changed at present.
12730 				 */
12731 				ill->ill_flags |= ILLF_NONUD;
12732 			else
12733 				ill->ill_flags |= ILLF_NOARP;
12734 		}
12735 		if (ill->ill_phys_addr_length == 0) {
12736 			if (ill->ill_media &&
12737 			    ill->ill_media->ip_m_mac_type == SUNW_DL_VNI) {
12738 				ipif->ipif_flags |= IPIF_NOXMIT;
12739 				phyi->phyint_flags |= PHYI_VIRTUAL;
12740 			} else {
12741 				/* pt-pt supports multicast. */
12742 				ill->ill_flags |= ILLF_MULTICAST;
12743 				if (ill->ill_net_type == IRE_LOOPBACK) {
12744 					phyi->phyint_flags |=
12745 					    (PHYI_LOOPBACK | PHYI_VIRTUAL);
12746 				} else {
12747 					ipif->ipif_flags |= IPIF_POINTOPOINT;
12748 				}
12749 			}
12750 		}
12751 	}
12752 	mutex_exit(&ill->ill_lock);
12753 	mutex_exit(&ill->ill_phyint->phyint_lock);
12754 	return (ipif);
12755 }
12756 
12757 /*
 * If appropriate, send a message up to the resolver to delete the entry
 * for the address of this interface, which is going out of business.
 * (Always called as writer.)
12761  *
12762  * NOTE : We need to check for NULL mps as some of the fields are
12763  *	  initialized only for some interface types. See ipif_resolver_up()
12764  *	  for details.
12765  */
12766 void
12767 ipif_arp_down(ipif_t *ipif)
12768 {
12769 	mblk_t	*mp;
12770 
12771 	ip1dbg(("ipif_arp_down(%s:%u)\n",
12772 	    ipif->ipif_ill->ill_name, ipif->ipif_id));
12773 	ASSERT(IAM_WRITER_IPIF(ipif));
12774 
12775 	/* Delete the mapping for the local address */
12776 	mp = ipif->ipif_arp_del_mp;
12777 	if (mp != NULL) {
12778 		ip1dbg(("ipif_arp_down: %s (%u) for %s:%u\n",
12779 		    dlpi_prim_str(*(int *)mp->b_rptr), *(int *)mp->b_rptr,
12780 		    ipif->ipif_ill->ill_name, ipif->ipif_id));
12781 		putnext(ipif->ipif_ill->ill_rq, mp);
12782 		ipif->ipif_arp_del_mp = NULL;
12783 	}
12784 
12785 	/*
12786 	 * If this is the last ipif that is going down, we need
12787 	 * to clean up ARP completely.
12788 	 */
12789 	if (ipif->ipif_ill->ill_ipif_up_count == 0) {
12790 
12791 		/* Send up AR_INTERFACE_DOWN message */
12792 		mp = ipif->ipif_ill->ill_arp_down_mp;
12793 		if (mp != NULL) {
12794 			ip1dbg(("ipif_arp_down: %s (%u) for %s:%u\n",
12795 			    dlpi_prim_str(*(int *)mp->b_rptr),
12796 			    *(int *)mp->b_rptr, ipif->ipif_ill->ill_name,
12797 			    ipif->ipif_id));
12798 			putnext(ipif->ipif_ill->ill_rq, mp);
12799 			ipif->ipif_ill->ill_arp_down_mp = NULL;
12800 		}
12801 
12802 		/* Tell ARP to delete the multicast mappings */
12803 		mp = ipif->ipif_ill->ill_arp_del_mapping_mp;
12804 		if (mp != NULL) {
12805 			ip1dbg(("ipif_arp_down: %s (%u) for %s:%u\n",
12806 			    dlpi_prim_str(*(int *)mp->b_rptr),
12807 			    *(int *)mp->b_rptr, ipif->ipif_ill->ill_name,
12808 			    ipif->ipif_id));
12809 			putnext(ipif->ipif_ill->ill_rq, mp);
12810 			ipif->ipif_ill->ill_arp_del_mapping_mp = NULL;
12811 		}
12812 	}
12813 }
12814 
12815 /*
12816  * This function sets up the multicast mappings in ARP. When ipif_resolver_up
12817  * calls this function, it passes a non-NULL arp_add_mapping_mp indicating
12818  * that it wants the add_mp allocated in this function to be returned
 * without sending it to ARP. When ip_rput_dlpi_writer calls this to
 * just re-do the multicast, it wants us to send the add_mp to ARP as well.
 * ipif_resolver_up does not want us to do the "add" (i.e. send to ARP),
 * as it does an ipif_arp_down after calling this function - which will
12823  * remove what we add here.
12824  *
 * Returns -1 on failure and 0 on success.
12826  */
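/*
 * A sketch of the two calling modes (illustrative):
 *
 *	mblk_t *add_mp;
 *
 *	With a non-NULL second argument the mapping mblk is handed back
 *	to the caller, as ipif_resolver_up does:
 *		(void) ipif_arp_setup_multicast(ipif, &add_mp);
 *	With NULL the mapping is sent up to ARP directly:
 *		(void) ipif_arp_setup_multicast(ipif, NULL);
 */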
12827 int
12828 ipif_arp_setup_multicast(ipif_t *ipif, mblk_t **arp_add_mapping_mp)
12829 {
12830 	mblk_t	*del_mp = NULL;
12831 	mblk_t *add_mp = NULL;
12832 	mblk_t *mp;
12833 	ill_t	*ill = ipif->ipif_ill;
12834 	phyint_t *phyi = ill->ill_phyint;
12835 	ipaddr_t addr, mask, extract_mask = 0;
12836 	arma_t	*arma;
12837 	uint8_t *maddr, *bphys_addr;
12838 	uint32_t hw_start;
12839 	dl_unitdata_req_t *dlur;
12840 
12841 	ASSERT(IAM_WRITER_IPIF(ipif));
12842 	if (ipif->ipif_flags & IPIF_POINTOPOINT)
12843 		return (0);
12844 
12845 	/*
12846 	 * Delete the existing mapping from ARP. Normally ipif_down
12847 	 * -> ipif_arp_down should send this up to ARP. The only
	 * reason we would find this here is when we are switching from
	 * Multicast to Broadcast and did not do a down.
12850 	 */
12851 	mp = ill->ill_arp_del_mapping_mp;
12852 	if (mp != NULL) {
12853 		ip1dbg(("ipif_arp_down: %s (%u) for %s:%u\n",
12854 		    dlpi_prim_str(*(int *)mp->b_rptr),
12855 		    *(int *)mp->b_rptr, ill->ill_name, ipif->ipif_id));
12856 		putnext(ill->ill_rq, mp);
12857 		ill->ill_arp_del_mapping_mp = NULL;
12858 	}
12859 
12860 	if (arp_add_mapping_mp != NULL)
12861 		*arp_add_mapping_mp = NULL;
12862 
12863 	/*
	 * Check that the address is not too long for the constant
12865 	 * length reserved in the template arma_t.
12866 	 */
12867 	if (ill->ill_phys_addr_length > IP_MAX_HW_LEN)
12868 		return (-1);
12869 
12870 	/* Add mapping mblk */
12871 	addr = (ipaddr_t)htonl(INADDR_UNSPEC_GROUP);
12872 	mask = (ipaddr_t)htonl(IN_CLASSD_NET);
12873 	add_mp = ill_arp_alloc(ill, (uchar_t *)&ip_arma_multi_template,
12874 	    (caddr_t)&addr);
12875 	if (add_mp == NULL)
12876 		return (-1);
12877 	arma = (arma_t *)add_mp->b_rptr;
12878 	maddr = (uint8_t *)arma + arma->arma_hw_addr_offset;
12879 	bcopy(&mask, (char *)arma + arma->arma_proto_mask_offset, IP_ADDR_LEN);
12880 	arma->arma_hw_addr_length = ill->ill_phys_addr_length;
12881 
12882 	/*
12883 	 * Determine the broadcast address.
12884 	 */
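	/*
	 * For example, on Ethernet the driver reports a negative sap
	 * length (the 2-byte SAP follows the 6-byte MAC address in the
	 * DLPI address), so the broadcast address starts right at
	 * dl_dest_addr_offset; with a positive sap length the SAP
	 * precedes the address and must be skipped over, as below.
	 */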
12885 	dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
12886 	if (ill->ill_sap_length < 0)
12887 		bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset;
12888 	else
12889 		bphys_addr = (uchar_t *)dlur +
12890 		    dlur->dl_dest_addr_offset + ill->ill_sap_length;
12891 	/*
12892 	 * Check PHYI_MULTI_BCAST and length of physical
12893 	 * address to determine if we use the mapping or the
12894 	 * broadcast address.
12895 	 */
12896 	if (!(phyi->phyint_flags & PHYI_MULTI_BCAST))
12897 		if (!MEDIA_V4MINFO(ill->ill_media, ill->ill_phys_addr_length,
12898 		    bphys_addr, maddr, &hw_start, &extract_mask))
12899 			phyi->phyint_flags |= PHYI_MULTI_BCAST;
12900 
12901 	if ((phyi->phyint_flags & PHYI_MULTI_BCAST) ||
12902 	    (ill->ill_flags & ILLF_MULTICAST)) {
12903 		/* Make sure this will not match the "exact" entry. */
12904 		addr = (ipaddr_t)htonl(INADDR_ALLHOSTS_GROUP);
12905 		del_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ared_template,
12906 		    (caddr_t)&addr);
12907 		if (del_mp == NULL) {
12908 			freemsg(add_mp);
12909 			return (-1);
12910 		}
12911 		bcopy(&extract_mask, (char *)arma +
12912 		    arma->arma_proto_extract_mask_offset, IP_ADDR_LEN);
12913 		if (phyi->phyint_flags & PHYI_MULTI_BCAST) {
12914 			/* Use link-layer broadcast address for MULTI_BCAST */
12915 			bcopy(bphys_addr, maddr, ill->ill_phys_addr_length);
12916 			ip2dbg(("ipif_arp_setup_multicast: adding"
12917 			    " MULTI_BCAST ARP setup for %s\n", ill->ill_name));
12918 		} else {
12919 			arma->arma_hw_mapping_start = hw_start;
12920 			ip2dbg(("ipif_arp_setup_multicast: adding multicast"
12921 			    " ARP setup for %s\n", ill->ill_name));
12922 		}
12923 	} else {
12924 		freemsg(add_mp);
12925 		ASSERT(del_mp == NULL);
12926 		/* It is neither MULTICAST nor MULTI_BCAST */
12927 		return (0);
12928 	}
12929 	ASSERT(add_mp != NULL && del_mp != NULL);
12930 	ill->ill_arp_del_mapping_mp = del_mp;
12931 	if (arp_add_mapping_mp != NULL) {
12932 		/* The caller just wants the mblks allocated */
12933 		*arp_add_mapping_mp = add_mp;
12934 	} else {
12935 		/* The caller wants us to send it to arp */
12936 		putnext(ill->ill_rq, add_mp);
12937 	}
12938 	return (0);
12939 }
12940 
12941 /*
12942  * Get the resolver set up for a new interface address.
12943  * (Always called as writer.)
12944  * Called both for IPv4 and IPv6 interfaces,
12945  * though it only sets up the resolver for v6
12946  * if it's an xresolv interface (one using an external resolver).
12947  * Honors ILLF_NOARP.
12948  * The boolean value arp_just_publish, if B_TRUE, indicates that
12949  * it only needs to send an AR_ENTRY_ADD message up to ARP for
12950  * IPv4 interfaces. Currently, B_TRUE is only set when this
12951  * function is called by ip_rput_dlpi_writer() to handle
12952  * asynchronous hardware address change notification.
 * Returns error on failure; in particular, EINPROGRESS is returned
 * when the first ipif on the ill is coming up and the ARP bringup
 * completes asynchronously.
12954  */
12955 int
12956 ipif_resolver_up(ipif_t *ipif, boolean_t arp_just_publish)
12957 {
12958 	caddr_t	addr;
12959 	mblk_t	*arp_up_mp = NULL;
12960 	mblk_t	*arp_down_mp = NULL;
12961 	mblk_t	*arp_add_mp = NULL;
12962 	mblk_t	*arp_del_mp = NULL;
12963 	mblk_t	*arp_add_mapping_mp = NULL;
12964 	mblk_t	*arp_del_mapping_mp = NULL;
12965 	ill_t	*ill = ipif->ipif_ill;
12966 	uchar_t	*area_p = NULL;
12967 	uchar_t	*ared_p = NULL;
12968 	int	err = ENOMEM;
12969 
12970 	ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n",
12971 	    ipif->ipif_ill->ill_name, ipif->ipif_id,
12972 	    (uint_t)ipif->ipif_flags));
12973 	ASSERT(IAM_WRITER_IPIF(ipif));
12974 
12975 	if ((ill->ill_net_type != IRE_IF_RESOLVER) ||
12976 	    (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV))) {
12977 		return (0);
12978 	}
12979 
12980 	if (ill->ill_isv6) {
12981 		/*
12982 		 * External resolver for IPv6
12983 		 */
12984 		ASSERT(!arp_just_publish);
12985 		if (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) {
12986 			addr = (caddr_t)&ipif->ipif_v6lcl_addr;
12987 			area_p = (uchar_t *)&ip6_area_template;
12988 			ared_p = (uchar_t *)&ip6_ared_template;
12989 		}
12990 	} else {
12991 		/*
12992 		 * IPv4 arp case. If the ARP stream has already started
12993 		 * closing, fail this request for ARP bringup. Else
12994 		 * record the fact that an ARP bringup is pending.
12995 		 */
12996 		mutex_enter(&ill->ill_lock);
12997 		if (ill->ill_arp_closing) {
12998 			mutex_exit(&ill->ill_lock);
12999 			err = EINVAL;
13000 			goto failed;
13001 		} else {
13002 			if (ill->ill_ipif_up_count == 0)
13003 				ill->ill_arp_bringup_pending = 1;
13004 			mutex_exit(&ill->ill_lock);
13005 		}
13006 		if (ipif->ipif_lcl_addr != INADDR_ANY) {
13007 			addr = (caddr_t)&ipif->ipif_lcl_addr;
13008 			area_p = (uchar_t *)&ip_area_template;
13009 			ared_p = (uchar_t *)&ip_ared_template;
13010 		}
13011 	}
13012 
13013 	/*
13014 	 * Add an entry for the local address in ARP only if it
13015 	 * is not UNNUMBERED and the address is not INADDR_ANY.
13016 	 */
13017 	if (((ipif->ipif_flags & IPIF_UNNUMBERED) == 0) && area_p != NULL) {
13018 		/* Now ask ARP to publish our address. */
13019 		arp_add_mp = ill_arp_alloc(ill, area_p, addr);
13020 		if (arp_add_mp == NULL)
13021 			goto failed;
13022 		if (arp_just_publish) {
13023 			/*
13024 			 * Copy the new hardware address and length into
13025 			 * arp_add_mp to be sent to ARP.
13026 			 */
13027 			area_t *area = (area_t *)arp_add_mp->b_rptr;
13028 			area->area_hw_addr_length =
13029 			    ill->ill_phys_addr_length;
13030 			bcopy((char *)ill->ill_phys_addr,
13031 			    ((char *)area + area->area_hw_addr_offset),
13032 			    area->area_hw_addr_length);
13033 		}
13034 
13035 		((area_t *)arp_add_mp->b_rptr)->area_flags =
13036 		    ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR;
13037 
13038 		if (arp_just_publish)
13039 			goto arp_setup_multicast;
13040 
13041 		/*
13042 		 * Allocate an ARP deletion message so we know we can tell ARP
13043 		 * when the interface goes down.
13044 		 */
13045 		arp_del_mp = ill_arp_alloc(ill, ared_p, addr);
13046 		if (arp_del_mp == NULL)
13047 			goto failed;
13048 
13049 	} else {
13050 		if (arp_just_publish)
13051 			goto done;
13052 	}
13053 	/*
	 * Need to bring up ARP or set up the multicast mapping only
13055 	 * when the first interface is coming UP.
13056 	 */
13057 	if (ill->ill_ipif_up_count != 0)
13058 		goto done;
13059 
13060 	/*
13061 	 * Allocate an ARP down message (to be saved) and an ARP up
13062 	 * message.
13063 	 */
13064 	arp_down_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ard_template, 0);
13065 	if (arp_down_mp == NULL)
13066 		goto failed;
13067 
13068 	arp_up_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aru_template, 0);
13069 	if (arp_up_mp == NULL)
13070 		goto failed;
13071 
13072 	if (ipif->ipif_flags & IPIF_POINTOPOINT)
13073 		goto done;
13074 
13075 arp_setup_multicast:
13076 	/*
	 * Set up the multicast mappings. This function initializes
13078 	 * ill_arp_del_mapping_mp also. This does not need to be done for
13079 	 * IPv6.
13080 	 */
13081 	if (!ill->ill_isv6) {
13082 		err = ipif_arp_setup_multicast(ipif, &arp_add_mapping_mp);
13083 		if (err != 0)
13084 			goto failed;
13085 		ASSERT(ill->ill_arp_del_mapping_mp != NULL);
13086 		ASSERT(arp_add_mapping_mp != NULL);
13087 	}
13088 
13089 done:;
13090 	if (arp_del_mp != NULL) {
13091 		ASSERT(ipif->ipif_arp_del_mp == NULL);
13092 		ipif->ipif_arp_del_mp = arp_del_mp;
13093 	}
13094 	if (arp_down_mp != NULL) {
13095 		ASSERT(ill->ill_arp_down_mp == NULL);
13096 		ill->ill_arp_down_mp = arp_down_mp;
13097 	}
13098 	if (arp_del_mapping_mp != NULL) {
13099 		ASSERT(ill->ill_arp_del_mapping_mp == NULL);
13100 		ill->ill_arp_del_mapping_mp = arp_del_mapping_mp;
13101 	}
13102 	if (arp_up_mp != NULL) {
13103 		ip1dbg(("ipif_resolver_up: ARP_UP for %s:%u\n",
		    ipif->ipif_ill->ill_name, ipif->ipif_id));
13105 		putnext(ill->ill_rq, arp_up_mp);
13106 	}
13107 	if (arp_add_mp != NULL) {
13108 		ip1dbg(("ipif_resolver_up: ARP_ADD for %s:%u\n",
		    ipif->ipif_ill->ill_name, ipif->ipif_id));
13110 		putnext(ill->ill_rq, arp_add_mp);
13111 	}
13112 	if (arp_add_mapping_mp != NULL) {
13113 		ip1dbg(("ipif_resolver_up: MAPPING_ADD for %s:%u\n",
		    ipif->ipif_ill->ill_name, ipif->ipif_id));
13115 		putnext(ill->ill_rq, arp_add_mapping_mp);
13116 	}
13117 	if (arp_just_publish)
13118 		return (0);
13119 
13120 	if (ill->ill_flags & ILLF_NOARP)
13121 		err = ill_arp_off(ill);
13122 	else
13123 		err = ill_arp_on(ill);
13124 	if (err) {
13125 		ip0dbg(("ipif_resolver_up: arp_on/off failed %d\n", err));
13126 		freemsg(ipif->ipif_arp_del_mp);
13127 		if (arp_down_mp != NULL)
13128 			freemsg(ill->ill_arp_down_mp);
13129 		if (ill->ill_arp_del_mapping_mp != NULL)
13130 			freemsg(ill->ill_arp_del_mapping_mp);
13131 		ipif->ipif_arp_del_mp = NULL;
13132 		ill->ill_arp_down_mp = NULL;
13133 		ill->ill_arp_del_mapping_mp = NULL;
13134 		return (err);
13135 	}
13136 	return (ill->ill_ipif_up_count != 0 ? 0 : EINPROGRESS);
13137 
13138 failed:;
13139 	ip1dbg(("ipif_resolver_up: FAILED\n"));
13140 	freemsg(arp_add_mp);
13141 	freemsg(arp_del_mp);
13142 	freemsg(arp_add_mapping_mp);
13143 	freemsg(arp_up_mp);
13144 	freemsg(arp_down_mp);
13145 	ill->ill_arp_bringup_pending = 0;
13146 	return (err);
13147 }
13148 
13149 /*
 * Wake up all the threads that are waiting to enter the ipsq and are
 * sleeping on any of the ills in this ipsq. The ill_lock of the ill
 * must be held so that waiters don't miss wakeups.
13153  */
13154 static void
13155 ill_signal_ipsq_ills(ipsq_t *ipsq, boolean_t caller_holds_lock)
13156 {
13157 	phyint_t *phyint;
13158 
13159 	phyint = ipsq->ipsq_phyint_list;
13160 	while (phyint != NULL) {
13161 		if (phyint->phyint_illv4) {
13162 			if (!caller_holds_lock)
13163 				mutex_enter(&phyint->phyint_illv4->ill_lock);
13164 			ASSERT(MUTEX_HELD(&phyint->phyint_illv4->ill_lock));
13165 			cv_broadcast(&phyint->phyint_illv4->ill_cv);
13166 			if (!caller_holds_lock)
13167 				mutex_exit(&phyint->phyint_illv4->ill_lock);
13168 		}
13169 		if (phyint->phyint_illv6) {
13170 			if (!caller_holds_lock)
13171 				mutex_enter(&phyint->phyint_illv6->ill_lock);
13172 			ASSERT(MUTEX_HELD(&phyint->phyint_illv6->ill_lock));
13173 			cv_broadcast(&phyint->phyint_illv6->ill_cv);
13174 			if (!caller_holds_lock)
13175 				mutex_exit(&phyint->phyint_illv6->ill_lock);
13176 		}
13177 		phyint = phyint->phyint_ipsq_next;
13178 	}
13179 }
13180 
13181 static ipsq_t *
13182 ipsq_create(char *groupname)
13183 {
13184 	ipsq_t	*ipsq;
13185 
13186 	ASSERT(RW_WRITE_HELD(&ill_g_lock));
13187 	ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP);
13188 	if (ipsq == NULL) {
13189 		return (NULL);
13190 	}
13191 
13192 	if (groupname != NULL)
13193 		(void) strcpy(ipsq->ipsq_name, groupname);
13194 	else
13195 		ipsq->ipsq_name[0] = '\0';
13196 
13197 	mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, NULL);
13198 	ipsq->ipsq_flags |= IPSQ_GROUP;
13199 	ipsq->ipsq_next = ipsq_g_head;
13200 	ipsq_g_head = ipsq;
13201 	return (ipsq);
13202 }
13203 
13204 /*
 * Return an ipsq corresponding to the groupname. If 'create' is true,
13206  * allocate a new ipsq if one does not exist. Usually an ipsq is associated
13207  * uniquely with an IPMP group. However during IPMP groupname operations,
13208  * multiple IPMP groups may be associated with a single ipsq. But no
13209  * IPMP group can be associated with more than 1 ipsq at any time.
13210  * For example
13211  *	Interfaces		IPMP grpname	ipsq	ipsq_name      ipsq_refs
13212  * 	hme1, hme2		mpk17-84	ipsq1	mpk17-84	2
13213  *	hme3, hme4		mpk17-85	ipsq2	mpk17-85	2
13214  *
13215  * Now the command ifconfig hme3 group mpk17-84 results in the temporary
13216  * status shown below during the execution of the above command.
13217  * 	hme1, hme2, hme3, hme4	mpk17-84, mpk17-85	ipsq1	mpk17-84  4
13218  *
13219  * After the completion of the above groupname command we return to the stable
13220  * state shown below.
13221  * 	hme1, hme2, hme3	mpk17-84	ipsq1	mpk17-84	3
13222  *	hme4			mpk17-85	ipsq2	mpk17-85	1
13223  *
13224  * Because of the above, we don't search based on the ipsq_name since that
13225  * would miss the correct ipsq during certain windows as shown above.
13226  * The ipsq_name is only used during split of an ipsq to return the ipsq to its
13227  * natural state.
13228  */
13229 static ipsq_t *
13230 ip_ipsq_lookup(char *groupname, boolean_t create, ipsq_t *exclude_ipsq)
13231 {
13232 	ipsq_t	*ipsq;
13233 	int	group_len;
13234 	phyint_t *phyint;
13235 
13236 	ASSERT(RW_LOCK_HELD(&ill_g_lock));
13237 
13238 	group_len = strlen(groupname);
13239 	ASSERT(group_len != 0);
13240 	group_len++;
13241 
13242 	for (ipsq = ipsq_g_head; ipsq != NULL; ipsq = ipsq->ipsq_next) {
13243 		/*
13244 		 * When an ipsq is being split, and ill_split_ipsq
13245 		 * calls this function, we exclude it from being considered.
13246 		 */
13247 		if (ipsq == exclude_ipsq)
13248 			continue;
13249 
13250 		/*
13251 		 * Compare against the ipsq_name. The groupname change happens
13252 		 * in 2 phases. The 1st phase merges the from group into
13253 		 * the to group's ipsq, by calling ill_merge_groups and restarts
 * the ioctl. The 2nd phase then locates the ipsq again through
13255 		 * ipsq_name. At this point the phyint_groupname has not been
13256 		 * updated.
13257 		 */
13258 		if ((group_len == strlen(ipsq->ipsq_name) + 1) &&
13259 		    (bcmp(ipsq->ipsq_name, groupname, group_len) == 0)) {
13260 			/*
13261 			 * Verify that an ipmp groupname is exactly
13262 			 * part of 1 ipsq and is not found in any other
13263 			 * ipsq.
13264 			 */
13265 			ASSERT(ip_ipsq_lookup(groupname, B_FALSE, ipsq) ==
13266 			    NULL);
13267 			return (ipsq);
13268 		}
13269 
13270 		/*
13271 		 * Comparison against ipsq_name alone is not sufficient.
13272 		 * In the case when groups are currently being
		 * merged, the ipsq could hold other IPMP groups temporarily,
		 * so we walk the phyint list and compare against the
13275 		 * phyint_groupname as well.
13276 		 */
13277 		phyint = ipsq->ipsq_phyint_list;
13278 		while (phyint != NULL) {
13279 			if ((group_len == phyint->phyint_groupname_len) &&
13280 			    (bcmp(phyint->phyint_groupname, groupname,
13281 			    group_len) == 0)) {
13282 				/*
13283 				 * Verify that an ipmp groupname is exactly
13284 				 * part of 1 ipsq and is not found in any other
13285 				 * ipsq.
13286 				 */
13287 				ASSERT(ip_ipsq_lookup(groupname, B_FALSE, ipsq)
				    == NULL);
13289 				return (ipsq);
13290 			}
13291 			phyint = phyint->phyint_ipsq_next;
13292 		}
13293 	}
13294 	if (create)
13295 		ipsq = ipsq_create(groupname);
13296 	return (ipsq);
13297 }
13298 
13299 static void
13300 ipsq_delete(ipsq_t *ipsq)
13301 {
13302 	ipsq_t *nipsq;
13303 	ipsq_t *pipsq = NULL;
13304 
13305 	/*
13306 	 * We don't hold the ipsq lock, but we are sure no new
	 * messages can arrive, since ipsq_refs is zero.
13308 	 * i.e. this ipsq is unnamed and no phyint or phyint group
13309 	 * is associated with this ipsq. (Lookups are based on ill_name
13310 	 * or phyint_group_name)
13311 	 */
13312 	ASSERT(ipsq->ipsq_refs == 0);
13313 	ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipsq->ipsq_mphead == NULL);
13314 	ASSERT(ipsq->ipsq_pending_mp == NULL);
13315 	if (!(ipsq->ipsq_flags & IPSQ_GROUP)) {
13316 		/*
13317 		 * This is not the ipsq of an IPMP group.
13318 		 */
13319 		kmem_free(ipsq, sizeof (ipsq_t));
13320 		return;
13321 	}
13322 
13323 	rw_enter(&ill_g_lock, RW_WRITER);
13324 
13325 	/*
	 * Locate the ipsq before we can remove it from
13327 	 * the singly linked list of ipsq's.
13328 	 */
13329 	for (nipsq = ipsq_g_head; nipsq != NULL; nipsq = nipsq->ipsq_next) {
13330 		if (nipsq == ipsq) {
13331 			break;
13332 		}
13333 		pipsq = nipsq;
13334 	}
13335 
13336 	ASSERT(nipsq == ipsq);
13337 
13338 	/* unlink ipsq from the list */
13339 	if (pipsq != NULL)
13340 		pipsq->ipsq_next = ipsq->ipsq_next;
13341 	else
13342 		ipsq_g_head = ipsq->ipsq_next;
13343 	kmem_free(ipsq, sizeof (ipsq_t));
13344 	rw_exit(&ill_g_lock);
13345 }
13346 
13347 static void
13348 ill_move_to_new_ipsq(ipsq_t *old_ipsq, ipsq_t *new_ipsq, mblk_t *current_mp,
13349     queue_t *q)
    queue_t *q)
{
13354 	ASSERT(old_ipsq->ipsq_mphead == NULL && old_ipsq->ipsq_mptail == NULL);
13355 	ASSERT(old_ipsq->ipsq_pending_ipif == NULL);
13356 	ASSERT(old_ipsq->ipsq_pending_mp == NULL);
13357 	ASSERT(current_mp != NULL);
13358 
13359 	ipsq_enq(new_ipsq, q, current_mp, (ipsq_func_t)ip_process_ioctl,
	    NEW_OP, NULL);
13361 
13362 	ASSERT(new_ipsq->ipsq_xopq_mptail != NULL &&
13363 	    new_ipsq->ipsq_xopq_mphead != NULL);
13364 
13365 	/*
13366 	 * move from old ipsq to the new ipsq.
13367 	 */
13368 	new_ipsq->ipsq_xopq_mptail->b_next = old_ipsq->ipsq_xopq_mphead;
13369 	if (old_ipsq->ipsq_xopq_mphead != NULL)
13370 		new_ipsq->ipsq_xopq_mptail = old_ipsq->ipsq_xopq_mptail;
13371 
13372 	old_ipsq->ipsq_xopq_mphead = old_ipsq->ipsq_xopq_mptail = NULL;
13373 }
13374 
13375 void
13376 ill_group_cleanup(ill_t *ill)
13377 {
13378 	ill_t *ill_v4;
13379 	ill_t *ill_v6;
13380 	ipif_t *ipif;
13381 
13382 	ill_v4 = ill->ill_phyint->phyint_illv4;
13383 	ill_v6 = ill->ill_phyint->phyint_illv6;
13384 
13385 	if (ill_v4 != NULL) {
13386 		mutex_enter(&ill_v4->ill_lock);
13387 		for (ipif = ill_v4->ill_ipif; ipif != NULL;
13388 		    ipif = ipif->ipif_next) {
13389 			IPIF_UNMARK_MOVING(ipif);
13390 		}
13391 		ill_v4->ill_up_ipifs = B_FALSE;
13392 		mutex_exit(&ill_v4->ill_lock);
13393 	}
13394 
13395 	if (ill_v6 != NULL) {
13396 		mutex_enter(&ill_v6->ill_lock);
13397 		for (ipif = ill_v6->ill_ipif; ipif != NULL;
13398 		    ipif = ipif->ipif_next) {
13399 			IPIF_UNMARK_MOVING(ipif);
13400 		}
13401 		ill_v6->ill_up_ipifs = B_FALSE;
13402 		mutex_exit(&ill_v6->ill_lock);
13403 	}
}

13405 /*
13406  * This function is called when an ill has had a change in its group status
13407  * to bring up all the ipifs that were up before the change.
13408  */
13409 int
13410 ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp)
13411 {
13412 	ipif_t *ipif;
13413 	ill_t *ill_v4;
13414 	ill_t *ill_v6;
13415 	ill_t *from_ill;
	int err = 0;

13419 	ASSERT(IAM_WRITER_ILL(ill));
13420 
13421 	/*
13422 	 * Except for ipif_state_flags and ill_state_flags the other
13423 	 * fields of the ipif/ill that are modified below are protected
13424 	 * implicitly since we are a writer. We would have tried to down
13425 	 * even an ipif that was already down, in ill_down_ipifs. So we
13426 	 * just blindly clear the IPIF_CHANGING flag here on all ipifs.
13427 	 */
13428 	ill_v4 = ill->ill_phyint->phyint_illv4;
13429 	ill_v6 = ill->ill_phyint->phyint_illv6;
13430 	if (ill_v4 != NULL) {
13431 		ill_v4->ill_up_ipifs = B_TRUE;
13432 		for (ipif = ill_v4->ill_ipif; ipif != NULL;
13433 		    ipif = ipif->ipif_next) {
13434 			mutex_enter(&ill_v4->ill_lock);
13435 			ipif->ipif_state_flags &= ~IPIF_CHANGING;
13436 			IPIF_UNMARK_MOVING(ipif);
13437 			mutex_exit(&ill_v4->ill_lock);
13438 			if (ipif->ipif_was_up) {
13439 				if (!(ipif->ipif_flags & IPIF_UP))
13440 					err = ipif_up(ipif, q, mp);
13441 				ipif->ipif_was_up = B_FALSE;
13442 				if (err != 0) {
13443 					/*
					 * Can there be any other error?
13445 					 */
13446 					ASSERT(err == EINPROGRESS);
13447 					return (err);
13448 				}
13449 			}
13450 		}
13451 		mutex_enter(&ill_v4->ill_lock);
13452 		ill_v4->ill_state_flags &= ~ILL_CHANGING;
13453 		mutex_exit(&ill_v4->ill_lock);
13454 		ill_v4->ill_up_ipifs = B_FALSE;
13455 		if (ill_v4->ill_move_in_progress) {
13456 			ASSERT(ill_v4->ill_move_peer != NULL);
13457 			ill_v4->ill_move_in_progress = B_FALSE;
13458 			from_ill = ill_v4->ill_move_peer;
13459 			from_ill->ill_move_in_progress = B_FALSE;
13460 			from_ill->ill_move_peer = NULL;
13461 			mutex_enter(&from_ill->ill_lock);
13462 			from_ill->ill_state_flags &= ~ILL_CHANGING;
13463 			mutex_exit(&from_ill->ill_lock);
13464 			if (ill_v6 == NULL) {
13465 				if (from_ill->ill_phyint->phyint_flags &
13466 				    PHYI_STANDBY) {
13467 					phyint_inactive(from_ill->ill_phyint);
13468 				}
13469 				if (ill_v4->ill_phyint->phyint_flags &
13470 				    PHYI_STANDBY) {
13471 					phyint_inactive(ill_v4->ill_phyint);
13472 				}
13473 			}
13474 			ill_v4->ill_move_peer = NULL;
13475 		}
13476 	}
13477 
13478 	if (ill_v6 != NULL) {
13479 		ill_v6->ill_up_ipifs = B_TRUE;
13480 		for (ipif = ill_v6->ill_ipif; ipif != NULL;
13481 		    ipif = ipif->ipif_next) {
13482 			mutex_enter(&ill_v6->ill_lock);
13483 			ipif->ipif_state_flags &= ~IPIF_CHANGING;
13484 			IPIF_UNMARK_MOVING(ipif);
13485 			mutex_exit(&ill_v6->ill_lock);
13486 			if (ipif->ipif_was_up) {
13487 				if (!(ipif->ipif_flags & IPIF_UP))
13488 					err = ipif_up(ipif, q, mp);
13489 				ipif->ipif_was_up = B_FALSE;
13490 				if (err != 0) {
13491 					/*
					 * Can there be any other error?
13493 					 */
13494 					ASSERT(err == EINPROGRESS);
13495 					return (err);
13496 				}
13497 			}
13498 		}
13499 		mutex_enter(&ill_v6->ill_lock);
13500 		ill_v6->ill_state_flags &= ~ILL_CHANGING;
13501 		mutex_exit(&ill_v6->ill_lock);
13502 		ill_v6->ill_up_ipifs = B_FALSE;
13503 		if (ill_v6->ill_move_in_progress) {
13504 			ASSERT(ill_v6->ill_move_peer != NULL);
13505 			ill_v6->ill_move_in_progress = B_FALSE;
13506 			from_ill = ill_v6->ill_move_peer;
13507 			from_ill->ill_move_in_progress = B_FALSE;
13508 			from_ill->ill_move_peer = NULL;
13509 			mutex_enter(&from_ill->ill_lock);
13510 			from_ill->ill_state_flags &= ~ILL_CHANGING;
13511 			mutex_exit(&from_ill->ill_lock);
13512 			if (from_ill->ill_phyint->phyint_flags & PHYI_STANDBY) {
13513 				phyint_inactive(from_ill->ill_phyint);
13514 			}
13515 			if (ill_v6->ill_phyint->phyint_flags & PHYI_STANDBY) {
13516 				phyint_inactive(ill_v6->ill_phyint);
13517 			}
13518 			ill_v6->ill_move_peer = NULL;
13519 		}
13520 	}
13521 	return (0);
13522 }
13523 
13524 /*
 * Bring down all the appropriate ipifs.
13526  */
13527 /* ARGSUSED */
13528 static void
13529 ill_down_ipifs(ill_t *ill, mblk_t *mp, int index, boolean_t chk_nofailover)
13530 {
13531 	ipif_t *ipif;
13532 
13533 	ASSERT(IAM_WRITER_ILL(ill));
13534 
13535 	/*
13536 	 * Except for ipif_state_flags the other fields of the ipif/ill that
13537 	 * are modified below are protected implicitly since we are a writer
13538 	 */
13539 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
13540 		if (chk_nofailover && (ipif->ipif_flags & IPIF_NOFAILOVER))
13541 			continue;
13542 		if (index == 0 || index == ipif->ipif_orig_ifindex) {
13543 			/*
13544 			 * We go through the ipif_down logic even if the ipif
13545 			 * is already down, since routes can be added based
13546 			 * on down ipifs. Going through ipif_down once again
13547 			 * will delete any IREs created based on these routes.
13548 			 */
13549 			if (ipif->ipif_flags & IPIF_UP)
13550 				ipif->ipif_was_up = B_TRUE;
13551 			/*
			 * If called with chk_nofailover true, the ipif is moving.
13553 			 */
13554 			mutex_enter(&ill->ill_lock);
13555 			if (chk_nofailover) {
13556 				ipif->ipif_state_flags |=
				    IPIF_MOVING | IPIF_CHANGING;
13558 			} else {
13559 				ipif->ipif_state_flags |= IPIF_CHANGING;
13560 			}
13561 			mutex_exit(&ill->ill_lock);
13562 			/*
13563 			 * Need to re-create net/subnet bcast ires if
13564 			 * they are dependent on ipif.
13565 			 */
13566 			if (!ipif->ipif_isv6)
13567 				ipif_check_bcast_ires(ipif);
13568 			(void) ipif_logical_down(ipif, NULL, NULL);
13569 			ipif_down_tail(ipif);
13570 			/*
13571 			 * We don't do ipif_multicast_down for IPv4 in
13572 			 * ipif_down. We need to set this so that
13573 			 * ipif_multicast_up will join the
13574 			 * ALLHOSTS_GROUP on to_ill.
13575 			 */
13576 			ipif->ipif_multicast_up = B_FALSE;
13577 		}
13578 	}
13579 }
13580 
13581 #define	IPSQ_INC_REF(ipsq)	{			\
13582 	ASSERT(RW_WRITE_HELD(&ill_g_lock));		\
13583 	(ipsq)->ipsq_refs++;				\
13584 }
13585 
13586 #define	IPSQ_DEC_REF(ipsq)	{			\
13587 	ASSERT(RW_WRITE_HELD(&ill_g_lock));		\
13588 	(ipsq)->ipsq_refs--;				\
13589 	if ((ipsq)->ipsq_refs == 0)				\
13590 		(ipsq)->ipsq_name[0] = '\0'; 		\
13591 }
13592 
13593 /*
 * Change the ipsq of all the ills whose current ipsq is 'cur_ipsq' to
13595  * new_ipsq.
13596  */
13597 static void
13598 ill_merge_ipsq(ipsq_t *cur_ipsq, ipsq_t *new_ipsq)
13599 {
13600 	phyint_t *phyint;
13601 	phyint_t *next_phyint;
13602 
13603 	/*
13604 	 * To change the ipsq of an ill, we need to hold the ill_g_lock as
13605 	 * writer and the ill_lock of the ill in question. Also the dest
13606 	 * ipsq can't vanish while we hold the ill_g_lock as writer.
13607 	 */
13608 	ASSERT(RW_WRITE_HELD(&ill_g_lock));
13609 
13610 	phyint = cur_ipsq->ipsq_phyint_list;
13611 	cur_ipsq->ipsq_phyint_list = NULL;
13612 	while (phyint != NULL) {
13613 		next_phyint = phyint->phyint_ipsq_next;
13614 		IPSQ_DEC_REF(cur_ipsq);
13615 		phyint->phyint_ipsq_next = new_ipsq->ipsq_phyint_list;
13616 		new_ipsq->ipsq_phyint_list = phyint;
13617 		IPSQ_INC_REF(new_ipsq);
13618 		phyint->phyint_ipsq = new_ipsq;
13619 		phyint = next_phyint;
13620 	}
13621 }
13622 
13623 #define	SPLIT_SUCCESS		0
13624 #define	SPLIT_NOT_NEEDED	1
13625 #define	SPLIT_FAILED		2
13626 
13627 int
13628 ill_split_to_grp_ipsq(phyint_t *phyint, ipsq_t *cur_ipsq, boolean_t need_retry)
13629 {
13630 	ipsq_t *newipsq = NULL;
13631 
13632 	/*
13633 	 * Assertions denote pre-requisites for changing the ipsq of
13634 	 * a phyint
13635 	 */
13636 	ASSERT(RW_WRITE_HELD(&ill_g_lock));
13637 	/*
13638 	 * <ill-phyint> assocs can't change while ill_g_lock
13639 	 * is held as writer. See ill_phyint_reinit()
13640 	 */
13641 	ASSERT(phyint->phyint_illv4 == NULL ||
13642 	    MUTEX_HELD(&phyint->phyint_illv4->ill_lock));
13643 	ASSERT(phyint->phyint_illv6 == NULL ||
13644 	    MUTEX_HELD(&phyint->phyint_illv6->ill_lock));
13645 
	if (phyint->phyint_groupname_len !=
	    (strlen(cur_ipsq->ipsq_name) + 1) ||
	    bcmp(phyint->phyint_groupname, cur_ipsq->ipsq_name,
	    phyint->phyint_groupname_len) != 0) {
13650 		/*
13651 		 * Once we fail in creating a new ipsq due to memory shortage,
13652 		 * don't attempt to create new ipsq again, based on another
13653 		 * phyint, since we want all phyints belonging to an IPMP group
		 * to be in the same ipsq even if memory allocation fails.
13655 		 */
13656 		newipsq = ip_ipsq_lookup(phyint->phyint_groupname, !need_retry,
13657 		    cur_ipsq);
13658 		if (newipsq == NULL) {
13659 			/* Memory allocation failure */
13660 			return (SPLIT_FAILED);
13661 		} else {
13662 			/* ipsq_refs protected by ill_g_lock (writer) */
13663 			IPSQ_DEC_REF(cur_ipsq);
13664 			phyint->phyint_ipsq = newipsq;
13665 			phyint->phyint_ipsq_next = newipsq->ipsq_phyint_list;
13666 			newipsq->ipsq_phyint_list = phyint;
13667 			IPSQ_INC_REF(newipsq);
13668 			return (SPLIT_SUCCESS);
13669 		}
13670 	}
13671 	return (SPLIT_NOT_NEEDED);
13672 }
13673 
13674 /*
13675  * The ill locks of the phyint and the ill_g_lock (writer) must be held
13676  * to do this split
13677  */
13678 static int
13679 ill_split_to_own_ipsq(phyint_t *phyint, ipsq_t *cur_ipsq)
13680 {
13681 	ipsq_t *newipsq;
13682 
13683 	ASSERT(RW_WRITE_HELD(&ill_g_lock));
13684 	/*
13685 	 * <ill-phyint> assocs can't change while ill_g_lock
13686 	 * is held as writer. See ill_phyint_reinit()
13687 	 */
13688 
13689 	ASSERT(phyint->phyint_illv4 == NULL ||
13690 	    MUTEX_HELD(&phyint->phyint_illv4->ill_lock));
13691 	ASSERT(phyint->phyint_illv6 == NULL ||
13692 	    MUTEX_HELD(&phyint->phyint_illv6->ill_lock));
13693 
13694 	if (!ipsq_init((phyint->phyint_illv4 != NULL) ?
13695 	    phyint->phyint_illv4: phyint->phyint_illv6)) {
13696 		/*
		 * ipsq_init failed due to no memory;
		 * the caller will use the same ipsq.
13699 		 */
13700 		return (SPLIT_FAILED);
13701 	}
13702 
13703 	/* ipsq_ref is protected by ill_g_lock (writer) */
13704 	IPSQ_DEC_REF(cur_ipsq);
13705 
13706 	/*
13707 	 * This is a new ipsq that is unknown to the world.
	 * So we don't need to hold ipsq_lock.
13709 	 */
13710 	newipsq = phyint->phyint_ipsq;
13711 	newipsq->ipsq_writer = NULL;
13712 	newipsq->ipsq_reentry_cnt--;
13713 	ASSERT(newipsq->ipsq_reentry_cnt == 0);
13714 #ifdef ILL_DEBUG
13715 	newipsq->ipsq_depth = 0;
13716 #endif
13717 
13718 	return (SPLIT_SUCCESS);
13719 }
13720 
13721 /*
 * Change the ipsq of all the ills whose current ipsq is 'cur_ipsq' to
 * ipsqs representing their individual groups or themselves. Return
13724  * whether split needs to be retried again later.
13725  */
13726 static boolean_t
13727 ill_split_ipsq(ipsq_t *cur_ipsq)
13728 {
13729 	phyint_t *phyint;
13730 	phyint_t *next_phyint;
13731 	int	error;
13732 	boolean_t need_retry = B_FALSE;
13733 
13734 	phyint = cur_ipsq->ipsq_phyint_list;
13735 	cur_ipsq->ipsq_phyint_list = NULL;
13736 	while (phyint != NULL) {
13737 		next_phyint = phyint->phyint_ipsq_next;
13738 		/*
		 * The return value will tell us whether the callee actually
13740 		 * created an ipsq. Lack of memory may force the callee
13741 		 * to return without creating an ipsq.
13742 		 */
13743 		if (phyint->phyint_groupname == NULL) {
13744 			error = ill_split_to_own_ipsq(phyint, cur_ipsq);
13745 		} else {
13746 			error = ill_split_to_grp_ipsq(phyint, cur_ipsq,
			    need_retry);
13748 		}
13749 
13750 		switch (error) {
13751 		case SPLIT_FAILED:
13752 			need_retry = B_TRUE;
13753 			/* FALLTHRU */
13754 		case SPLIT_NOT_NEEDED:
13755 			/*
13756 			 * Keep it on the list.
13757 			 */
13758 			phyint->phyint_ipsq_next = cur_ipsq->ipsq_phyint_list;
13759 			cur_ipsq->ipsq_phyint_list = phyint;
13760 			break;
13761 		case SPLIT_SUCCESS:
13762 			break;
13763 		default:
13764 			ASSERT(0);
13765 		}
13766 
13767 		phyint = next_phyint;
13768 	}
13769 	return (need_retry);
13770 }
13771 
13772 /*
 * Given an ipsq 'ipsq', lock all the ills associated with it and
 * return them in 'list'. The caller will need this list later on
 * to unlock all the ills.
13776  * The <ill-ipsq> associations could change between the
13777  * lock and unlock. Hence the unlock can't traverse the
13778  * ipsq to get the list of ills.
13779  */
13780 static int
13781 ill_lock_ipsq_ills(ipsq_t *ipsq, ill_t **list, int list_max)
13782 {
13783 	int	cnt = 0;
13784 	phyint_t	*phyint;
13785 
13786 	/*
13787 	 * The caller holds ill_g_lock to ensure that the ill memberships
13788 	 * of the ipsq don't change
13789 	 */
13790 	ASSERT(RW_LOCK_HELD(&ill_g_lock));
13791 
13792 	phyint = ipsq->ipsq_phyint_list;
13793 	while (phyint != NULL) {
13794 		if (phyint->phyint_illv4 != NULL) {
13795 			ASSERT(cnt < list_max);
13796 			list[cnt++] = phyint->phyint_illv4;
13797 		}
13798 		if (phyint->phyint_illv6 != NULL) {
13799 			ASSERT(cnt < list_max);
13800 			list[cnt++] = phyint->phyint_illv6;
13801 		}
13802 		phyint = phyint->phyint_ipsq_next;
13803 	}
13804 	ill_lock_ills(list, cnt);
13805 	return (cnt);
13806 }
13807 
13808 void
13809 ill_lock_ills(ill_t **list, int cnt)
13810 {
13811 	int	i;
13812 
13813 	if (cnt > 1) {
13814 		boolean_t try_again;
13815 		do {
13816 			try_again = B_FALSE;
13817 			for (i = 0; i < cnt - 1; i++) {
13818 				if (list[i] < list[i + 1]) {
13819 					ill_t	*tmp;
13820 
13821 					/* swap the elements */
13822 					tmp = list[i];
13823 					list[i] = list[i + 1];
13824 					list[i + 1] = tmp;
13825 					try_again = B_TRUE;
13826 				}
13827 			}
13828 		} while (try_again);
13829 	}
13830 
13831 	for (i = 0; i < cnt; i++) {
13832 		if (i == 0) {
13833 			if (list[i] != NULL)
13834 				mutex_enter(&list[i]->ill_lock);
13835 			else
13836 				return;
13837 		} else if ((list[i-1] != list[i]) && (list[i] != NULL)) {
13838 			mutex_enter(&list[i]->ill_lock);
13839 		}
13840 	}
13841 }
13842 
13843 void
13844 ill_unlock_ills(ill_t **list, int cnt)
13845 {
13846 	int	i;
13847 
13848 	for (i = 0; i < cnt; i++) {
13849 		if ((i == 0) && (list[i] != NULL)) {
13850 			mutex_exit(&list[i]->ill_lock);
13851 		} else if ((list[i-1] != list[i]) && (list[i] != NULL)) {
13852 			mutex_exit(&list[i]->ill_lock);
13853 		}
13854 	}
13855 }
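
/*
 * Usage sketch for the two functions above (this is how ill_merge_groups()
 * below uses them; shown here for illustration): the list is sized for a
 * v4 and a v6 ill per phyint referencing the ipsq.
 *
 *	cnt = ipsq->ipsq_refs << 1;
 *	list = kmem_zalloc(cnt * sizeof (ill_t *), KM_NOSLEEP);
 *	...
 *	cnt = ill_lock_ipsq_ills(ipsq, list, cnt);
 *	... change the <ill-ipsq> associations ...
 *	ill_unlock_ills(list, cnt);
 *
 * ill_lock_ills() first sorts the list so that the ill_locks are always
 * acquired in a fixed (descending pointer) order, avoiding deadlocks
 * between concurrent lockers; duplicate entries are locked only once.
 */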
13856 
13857 /*
13858  * Merge all the ills from 1 ipsq group into another ipsq group.
13859  * The source ipsq group is specified by the ipsq associated with
13860  * 'from_ill'. The destination ipsq group is specified by the ipsq
13861  * associated with 'to_ill' or 'groupname' respectively.
13862  * Note that ipsq itself does not have a reference count mechanism
13863  * and functions don't look up an ipsq and pass it around. Instead
13864  * functions pass around an ill or groupname, and the ipsq is looked
13865  * up from the ill or groupname and the required operation performed
13866  * atomically with the lookup on the ipsq.
13867  */
13868 static int
13869 ill_merge_groups(ill_t *from_ill, ill_t *to_ill, char *groupname, mblk_t *mp,
13870     queue_t *q)
13871 {
13872 	ipsq_t *old_ipsq;
13873 	ipsq_t *new_ipsq;
13874 	ill_t	**ill_list;
13875 	int	cnt;
13876 	size_t	ill_list_size;
13877 	boolean_t became_writer_on_new_sq = B_FALSE;
13878 
13879 	/* Exactly 1 of 'to_ill' and 'groupname' must be specified. */
13880 	ASSERT((to_ill != NULL) ^ (groupname != NULL));
13881 
13882 	/*
13883 	 * Need to hold ill_g_lock as writer and also the ill_lock to
13884 	 * change the <ill-ipsq> assoc of an ill. Need to hold the
13885 	 * ipsq_lock to prevent new messages from landing on an ipsq.
13886 	 */
13887 	rw_enter(&ill_g_lock, RW_WRITER);
13888 
13889 	old_ipsq = from_ill->ill_phyint->phyint_ipsq;
13890 	if (groupname != NULL) {
13891 		new_ipsq = ip_ipsq_lookup(groupname, B_TRUE, NULL);
13892 	} else {
13893 		new_ipsq = to_ill->ill_phyint->phyint_ipsq;
13894 	}
13895 
13896 	ASSERT(old_ipsq != NULL && new_ipsq != NULL);
13897 
13898 	/*
13899 	 * Both groups are already on the same ipsq; nothing to merge.
13900 	 */
13901 	if (old_ipsq == new_ipsq) {
13902 		rw_exit(&ill_g_lock);
13903 		return (0);
13904 	}
13905 
13906 	cnt = old_ipsq->ipsq_refs << 1;
13907 	ill_list_size = cnt * sizeof (ill_t *);
13908 	ill_list = kmem_zalloc(ill_list_size, KM_NOSLEEP);
13909 	if (ill_list == NULL) {
13910 		rw_exit(&ill_g_lock);
13911 		return (ENOMEM);
13912 	}
13913 	cnt = ill_lock_ipsq_ills(old_ipsq, ill_list, cnt);
13914 
13915 	/* Need ipsq lock to enqueue messages on new ipsq or to become writer */
13916 	mutex_enter(&new_ipsq->ipsq_lock);
13917 	if ((new_ipsq->ipsq_writer == NULL &&
13918 	    new_ipsq->ipsq_current_ipif == NULL) ||
13919 	    (new_ipsq->ipsq_writer == curthread)) {
13920 		new_ipsq->ipsq_writer = curthread;
13921 		new_ipsq->ipsq_reentry_cnt++;
13922 		became_writer_on_new_sq = B_TRUE;
13923 	}
13924 
13925 	/*
13926 	 * We are holding ill_g_lock as writer and all the ill locks of
13927 	 * the old ipsq. So the old_ipsq can't be looked up, and hence no new
13928 	 * message can land on the old ipsq even though we don't hold the
13929 	 * ipsq_lock of the old_ipsq. Now move all messages to the new ipsq.
13930 	 */
13931 	ill_move_to_new_ipsq(old_ipsq, new_ipsq, mp, q);
13932 
13933 	/*
13934 	 * Now change the ipsq of all ills in the 'old_ipsq' to 'new_ipsq'.
13935 	 * 'new_ipsq' has been looked up, and it can't change its <ill-ipsq>
13936 	 * assocs till we release the ill_g_lock, and hence it can't vanish.
13937 	 */
13938 	ill_merge_ipsq(old_ipsq, new_ipsq);
13939 
13940 	/*
13941 	 * Mark the new ipsq as needing a split since it is currently
13942 	 * being shared by more than 1 IPMP group. The split will
13943 	 * occur at the end of ipsq_exit.
13944 	 */
13945 	new_ipsq->ipsq_split = B_TRUE;
13946 
13947 	/* Now release all the locks */
13948 	mutex_exit(&new_ipsq->ipsq_lock);
13949 	ill_unlock_ills(ill_list, cnt);
13950 	rw_exit(&ill_g_lock);
13951 
13952 	kmem_free(ill_list, ill_list_size);
13953 
13954 	/*
13955 	 * If we succeeded in becoming writer on the new ipsq, then
13956 	 * drain the new ipsq and start processing all enqueued messages,
13957 	 * including the current ioctl we are processing, which is either
13958 	 * a set groupname or a failover/failback.
13959 	 */
13960 	if (became_writer_on_new_sq)
13961 		ipsq_exit(new_ipsq, B_TRUE, B_TRUE);
13962 
13963 	/*
13964 	 * The ipsq has been changed and all the messages have been moved.
13965 	 */
13966 	mutex_enter(&old_ipsq->ipsq_lock);
13967 	old_ipsq->ipsq_current_ipif = NULL;
13968 	mutex_exit(&old_ipsq->ipsq_lock);
13969 	return (EINPROGRESS);
13970 }
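
/*
 * Note on the return values of ill_merge_groups() (as used by
 * ip_sioctl_groupname() below): 0 means both groups already shared an
 * ipsq and there was nothing to do; ENOMEM means the merge could not be
 * attempted; EINPROGRESS means the ioctl has been moved to the merged
 * ipsq and will be processed from there, so the caller must not
 * complete it synchronously.
 */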
13971 
13972 /*
13973  * Delete and add the loopback copy and non-loopback copy of
13974  * the BROADCAST ire corresponding to ill and addr. Used to
13975  * group broadcast ires together when ill becomes part of
13976  * a group.
13977  *
13978  * This function is also called when ill is leaving the group
13979  * so that the ires belonging to the group gets re-grouped.
13980  */
13981 static void
13982 ill_bcast_delete_and_add(ill_t *ill, ipaddr_t addr)
13983 {
13984 	ire_t *ire, *nire, *nire_next, *ire_head = NULL;
13985 	ire_t **ire_ptpn = &ire_head;
13986 
13987 	/*
13988 	 * The loopback and non-loopback IREs are inserted in the order in which
13989 	 * they're found, on the basis that they are correctly ordered (loopback
13990 	 * first).
13991 	 */
13992 	for (;;) {
13993 		ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ill->ill_ipif,
13994 		    ALL_ZONES, MATCH_IRE_TYPE | MATCH_IRE_ILL);
13995 		if (ire == NULL)
13996 			break;
13997 
13998 		/*
13999 		 * We are passing in KM_SLEEP because it is not easy to
14000 		 * go back to a sane state in case of memory failure.
14001 		 */
14002 		nire = kmem_cache_alloc(ire_cache, KM_SLEEP);
14003 		ASSERT(nire != NULL);
14004 		bzero(nire, sizeof (ire_t));
14005 		/*
14006 		 * Don't use ire_max_frag directly since we don't
14007 		 * hold on to 'ire' until we add the new ire 'nire' and
14008 		 * we don't want the new ire to have a dangling reference
14009 		 * to 'ire'. The ire_max_frag of a broadcast ire must
14010 		 * be in sync with the ipif_mtu of the associated ipif.
14011 		 * For example, this happens as a result of SIOCSLIFNAME,
14012 		 * SIOCSLIFLNKINFO or a DL_NOTE_SDU_SIZE initiated by
14013 		 * the driver. A change in ire_max_frag triggered
14014 		 * as a result of path MTU discovery, due to an
14015 		 * IP_IOC_IRE_ADVISE_NOREPLY from the transport, or due to a
14016 		 * 'route change -mtu' command does not apply to broadcast ires.
14017 		 *
14018 		 * XXX We need a recovery strategy here if ire_init fails
14019 		 */
14020 		if (ire_init(nire,
14021 		    (uchar_t *)&ire->ire_addr,
14022 		    (uchar_t *)&ire->ire_mask,
14023 		    (uchar_t *)&ire->ire_src_addr,
14024 		    (uchar_t *)&ire->ire_gateway_addr,
14025 		    (uchar_t *)&ire->ire_in_src_addr,
14026 		    ire->ire_stq == NULL ? &ip_loopback_mtu :
14027 			&ire->ire_ipif->ipif_mtu,
14028 		    ire->ire_fp_mp,
14029 		    ire->ire_rfq,
14030 		    ire->ire_stq,
14031 		    ire->ire_type,
14032 		    ire->ire_dlureq_mp,
14033 		    ire->ire_ipif,
14034 		    ire->ire_in_ill,
14035 		    ire->ire_cmask,
14036 		    ire->ire_phandle,
14037 		    ire->ire_ihandle,
14038 		    ire->ire_flags,
14039 		    &ire->ire_uinfo) == NULL) {
14040 			cmn_err(CE_PANIC, "ire_init() failed");
14041 		}
14042 		ire_delete(ire);
14043 		ire_refrele(ire);
14044 
14045 		/*
14046 		 * The newly created IREs are inserted at the tail of the list
14047 		 * starting with ire_head. As we've just allocated them no one
14048 		 * knows about them so it's safe.
14049 		 */
14050 		*ire_ptpn = nire;
14051 		ire_ptpn = &nire->ire_next;
14052 	}
14053 
14054 	for (nire = ire_head; nire != NULL; nire = nire_next) {
14055 		int error;
14056 		ire_t *oire;
14057 		/* unlink the IRE from our list before calling ire_add() */
14058 		nire_next = nire->ire_next;
14059 		nire->ire_next = NULL;
14060 
14061 		/* ire_add adds the ire at the right place in the list */
14062 		oire = nire;
14063 		error = ire_add(&nire, NULL, NULL, NULL);
14064 		ASSERT(error == 0);
14065 		ASSERT(oire == nire);
14066 		ire_refrele(nire);	/* Held in ire_add */
14067 	}
14068 }
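
/*
 * Summary of the two-phase scheme above: phase 1 repeatedly looks up a
 * matching BROADCAST ire, clones it into a freshly allocated 'nire',
 * deletes the original, and collects the clones on a private list;
 * phase 2 re-inserts the clones through ire_add(), which this time
 * groups them with the other ires of the (new) group. The private list
 * preserves the loopback-before-non-loopback lookup order.
 */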
14069 
14070 /*
14071  * This function is usually called when an ill is inserted in
14072  * a group and all the ipifs are already UP. As all the ipifs
14073  * are already UP, the broadcast ires have already been created
14074  * and inserted. But ire_add_v4 would not have grouped them properly.
14075  * We need to re-group for the benefit of ip_wput_ire which
14076  * expects BROADCAST ires to be grouped properly to avoid sending
14077  * more than one copy of the broadcast packet per group.
14078  *
14079  * NOTE : We don't check for ill_ipif_up_count to be non-zero here
14080  *	  because when ipif_up_done ends up calling this, ires have
14081  *	  already been added before illgrp_insert, i.e. before ill_group
14082  *	  has been initialized.
14083  */
14084 static void
14085 ill_group_bcast_for_xmit(ill_t *ill)
14086 {
14087 	ill_group_t *illgrp;
14088 	ipif_t *ipif;
14089 	ipaddr_t addr;
14090 	ipaddr_t net_mask;
14091 	ipaddr_t subnet_netmask;
14092 
14093 	illgrp = ill->ill_group;
14094 
14095 	/*
14096 	 * This function is called even when an ill is deleted from
14097 	 * the group. Hence, illgrp could be null.
14098 	 */
14099 	if (illgrp != NULL && illgrp->illgrp_ill_count == 1)
14100 		return;
14101 
14102 	/*
14103 	 * Delete all the BROADCAST ires matching this ill and add
14104 	 * them back. This time, ire_add_v4 should take care of
14105 	 * grouping them with others because ill is part of the
14106 	 * group.
14107 	 */
14108 	ill_bcast_delete_and_add(ill, 0);
14109 	ill_bcast_delete_and_add(ill, INADDR_BROADCAST);
14110 
14111 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
14112 
14113 		if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
14114 		    !(ipif->ipif_flags & IPIF_NOLOCAL)) {
14115 			net_mask = ip_net_mask(ipif->ipif_lcl_addr);
14116 		} else {
14117 			net_mask = htonl(IN_CLASSA_NET);
14118 		}
14119 		addr = net_mask & ipif->ipif_subnet;
14120 		ill_bcast_delete_and_add(ill, addr);
14121 		ill_bcast_delete_and_add(ill, ~net_mask | addr);
14122 
14123 		subnet_netmask = ipif->ipif_net_mask;
14124 		addr = ipif->ipif_subnet;
14125 		ill_bcast_delete_and_add(ill, addr);
14126 		ill_bcast_delete_and_add(ill, ~subnet_netmask | addr);
14127 	}
14128 }
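
/*
 * Worked example of the addresses recomputed above, assuming a
 * hypothetical ipif with ipif_lcl_addr 10.1.2.3, ipif_net_mask
 * 255.255.255.0 and hence ipif_subnet 10.1.2.0:
 *
 *	net_mask = ip_net_mask(10.1.2.3)	255.0.0.0 (class A)
 *	addr = net_mask & ipif_subnet		10.0.0.0
 *	~net_mask | addr			10.255.255.255
 *	addr = ipif_subnet			10.1.2.0
 *	~subnet_netmask | addr			10.1.2.255
 *
 * together with 0.0.0.0 and 255.255.255.255, which are handled before
 * the loop.
 */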
14129 
14130 /*
14131  * This function is called from illgrp_delete when ill is being deleted
14132  * from the group.
14133  *
14134  * As the ill is no longer in the group, any address belonging
14135  * to this ill should be cleared of IRE_MARK_NORECV.
14136  */
14137 static void
14138 ill_clear_bcast_mark(ill_t *ill, ipaddr_t addr)
14139 {
14140 	ire_t *ire;
14141 	irb_t *irb;
14142 
14143 	ASSERT(ill->ill_group == NULL);
14144 
14145 	ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ill->ill_ipif,
14146 	    ALL_ZONES, MATCH_IRE_TYPE | MATCH_IRE_ILL);
14147 
14148 	if (ire != NULL) {
14149 		/*
14150 		 * IPMP and plumbing operations are serialized on the ipsq, so
14151 		 * no one will insert or delete a broadcast ire under our feet.
14152 		 */
14153 		irb = ire->ire_bucket;
14154 		rw_enter(&irb->irb_lock, RW_READER);
14155 		ire_refrele(ire);
14156 
14157 		for (; ire != NULL; ire = ire->ire_next) {
14158 			if (ire->ire_addr != addr)
14159 				break;
14160 			if (ire_to_ill(ire) != ill)
14161 				continue;
14162 
14163 			ASSERT(!(ire->ire_marks & IRE_MARK_CONDEMNED));
14164 			ire->ire_marks &= ~IRE_MARK_NORECV;
14165 		}
14166 		rw_exit(&irb->irb_lock);
14167 	}
14168 }
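
/*
 * Note on the locking pattern above: the ire returned by
 * ire_ctable_lookup() is refrele'd before the bucket walk, but walking
 * ire_next afterwards is still safe because irb_lock is held as reader
 * across the walk and, as noted above, the ipsq serialization prevents
 * broadcast ires from being inserted or deleted concurrently.
 */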
14169 
14170 /*
14171  * This function must be called only after the broadcast ires
14172  * have been grouped together. For a given address addr, nominate
14173  * only one of the ires whose interface is not FAILED or OFFLINE.
14174  *
14175  * This is also called when an ipif goes down, so that we can nominate
14176  * a different ire with the same address for receiving.
14177  */
14178 static void
14179 ill_mark_bcast(ill_group_t *illgrp, ipaddr_t addr)
14180 {
14181 	irb_t *irb;
14182 	ire_t *ire;
14183 	ire_t *ire1;
14184 	ire_t *save_ire;
14185 	ire_t **irep = NULL;
14186 	boolean_t first = B_TRUE;
14187 	ire_t *clear_ire = NULL;
14188 	ire_t *start_ire = NULL;
14189 	ire_t	*new_lb_ire;
14190 	ire_t	*new_nlb_ire;
14191 	boolean_t new_lb_ire_used = B_FALSE;
14192 	boolean_t new_nlb_ire_used = B_FALSE;
14193 	uint64_t match_flags;
14194 	uint64_t phyi_flags;
14195 	boolean_t fallback = B_FALSE;
14196 
14197 	ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, NULL, ALL_ZONES,
14198 	    MATCH_IRE_TYPE);
14199 	/*
14200 	 * We may not be able to find some ires if a previous
14201 	 * ire_create failed. This happens when an ipif goes
14202 	 * down and we are unable to create BROADCAST ires due
14203 	 * to memory failure. Thus, we have to check for NULL
14204 	 * below. This should handle the case for LOOPBACK,
14205 	 * POINTOPOINT and interfaces with some POINTOPOINT
14206 	 * logicals for which there are no BROADCAST ires.
14207 	 */
14208 	if (ire == NULL)
14209 		return;
14210 	/*
14211 	 * Currently IRE_BROADCAST ires are deleted when an ipif
14212 	 * goes down, which runs exclusively. Thus, setting
14213 	 * IRE_MARK_NORECV should not race with ire_delete marking
14214 	 * IRE_MARK_CONDEMNED. We grab the lock below just to
14215 	 * be consistent with other parts of the code that walk
14216 	 * a given bucket.
14217 	 */
14218 	save_ire = ire;
14219 	irb = ire->ire_bucket;
14220 	new_lb_ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
14221 	if (new_lb_ire == NULL) {
14222 		ire_refrele(ire);
14223 		return;
14224 	}
14225 	new_nlb_ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
14226 	if (new_nlb_ire == NULL) {
14227 		ire_refrele(ire);
14228 		kmem_cache_free(ire_cache, new_lb_ire);
14229 		return;
14230 	}
14231 	IRB_REFHOLD(irb);
14232 	rw_enter(&irb->irb_lock, RW_WRITER);
14233 	/*
14234 	 * Get to the first ire matching the address and the
14235 	 * group. If the address does not match we are done
14236 	 * as we could not find the IRE. If the address matches
14237 	 * we should get to the first one matching the group.
14238 	 */
14239 	while (ire != NULL) {
14240 		if (ire->ire_addr != addr ||
14241 		    ire->ire_ipif->ipif_ill->ill_group == illgrp) {
14242 			break;
14243 		}
14244 		ire = ire->ire_next;
14245 	}
14246 	match_flags = PHYI_FAILED | PHYI_INACTIVE;
14247 	start_ire = ire;
14248 redo:
14249 	while (ire != NULL && ire->ire_addr == addr &&
14250 	    ire->ire_ipif->ipif_ill->ill_group == illgrp) {
14251 		/*
14252 		 * The first ire for any address within a group
14253 		 * should always be the one with IRE_MARK_NORECV cleared
14254 		 * so that ip_wput_ire can avoid searching for one.
14255 		 * Note down the insertion point which will be used
14256 		 * later.
14257 		 */
14258 		if (first && (irep == NULL))
14259 			irep = ire->ire_ptpn;
14260 		/*
14261 		 * PHYI_FAILED is set when the interface fails.
14262 		 * This interface might have become good, but the
14263 		 * daemon has not yet detected it. We should still
14264 		 * not receive on this. PHYI_OFFLINE should never
14265 		 * be picked as this has been offlined and will soon
14266 		 * be removed.
14267 		 */
14268 		phyi_flags = ire->ire_ipif->ipif_ill->ill_phyint->phyint_flags;
14269 		if (phyi_flags & PHYI_OFFLINE) {
14270 			ire->ire_marks |= IRE_MARK_NORECV;
14271 			ire = ire->ire_next;
14272 			continue;
14273 		}
14274 		if (phyi_flags & match_flags) {
14275 			ire->ire_marks |= IRE_MARK_NORECV;
14276 			ire = ire->ire_next;
14277 			if ((phyi_flags & (PHYI_FAILED | PHYI_INACTIVE)) ==
14278 			    PHYI_INACTIVE) {
14279 				fallback = B_TRUE;
14280 			}
14281 			continue;
14282 		}
14283 		if (first) {
14284 			/*
14285 			 * We will move this to the front of the list later
14286 			 * on.
14287 			 */
14288 			clear_ire = ire;
14289 			ire->ire_marks &= ~IRE_MARK_NORECV;
14290 		} else {
14291 			ire->ire_marks |= IRE_MARK_NORECV;
14292 		}
14293 		first = B_FALSE;
14294 		ire = ire->ire_next;
14295 	}
14296 	/*
14297 	 * If we never nominated anybody, try nominating at least
14298 	 * an INACTIVE, if we found one. Do it only once though.
14299 	 */
14300 	if (first && (match_flags == (PHYI_FAILED | PHYI_INACTIVE)) &&
14301 	    fallback) {
14302 		match_flags = PHYI_FAILED;
14303 		ire = start_ire;
14304 		irep = NULL;
14305 		goto redo;
14306 	}
14307 	ire_refrele(save_ire);
14308 
14309 	/*
14310 	 * irep non-NULL indicates that we entered the while loop
14311 	 * above. If clear_ire is at the insertion point, we don't
14312 	 * have to do anything. clear_ire will be NULL if all the
14313 	 * interfaces are failed.
14314 	 *
14315 	 * We cannot unlink and reinsert the ire at the right place
14316 	 * in the list since there can be other walkers of this bucket.
14317 	 * Instead we delete and recreate the ire.
14318 	 */
14319 	if (clear_ire != NULL && irep != NULL && *irep != clear_ire) {
14320 		ire_t *clear_ire_stq = NULL;
14321 		bzero(new_lb_ire, sizeof (ire_t));
14322 		/* XXX We need a recovery strategy here. */
14323 		if (ire_init(new_lb_ire,
14324 		    (uchar_t *)&clear_ire->ire_addr,
14325 		    (uchar_t *)&clear_ire->ire_mask,
14326 		    (uchar_t *)&clear_ire->ire_src_addr,
14327 		    (uchar_t *)&clear_ire->ire_gateway_addr,
14328 		    (uchar_t *)&clear_ire->ire_in_src_addr,
14329 		    &clear_ire->ire_max_frag,
14330 		    clear_ire->ire_fp_mp,
14331 		    clear_ire->ire_rfq,
14332 		    clear_ire->ire_stq,
14333 		    clear_ire->ire_type,
14334 		    clear_ire->ire_dlureq_mp,
14335 		    clear_ire->ire_ipif,
14336 		    clear_ire->ire_in_ill,
14337 		    clear_ire->ire_cmask,
14338 		    clear_ire->ire_phandle,
14339 		    clear_ire->ire_ihandle,
14340 		    clear_ire->ire_flags,
14341 		    &clear_ire->ire_uinfo) == NULL)
14342 			cmn_err(CE_PANIC, "ire_init() failed");
14343 		if (clear_ire->ire_stq == NULL) {
14344 			ire_t *ire_next = clear_ire->ire_next;
14345 			if (ire_next != NULL &&
14346 			    ire_next->ire_stq != NULL &&
14347 			    ire_next->ire_addr == clear_ire->ire_addr &&
14348 			    ire_next->ire_ipif->ipif_ill ==
14349 			    clear_ire->ire_ipif->ipif_ill) {
14350 				clear_ire_stq = ire_next;
14351 
14352 				bzero(new_nlb_ire, sizeof (ire_t));
14353 				/* XXX We need a recovery strategy here. */
14354 				if (ire_init(new_nlb_ire,
14355 				    (uchar_t *)&clear_ire_stq->ire_addr,
14356 				    (uchar_t *)&clear_ire_stq->ire_mask,
14357 				    (uchar_t *)&clear_ire_stq->ire_src_addr,
14358 				    (uchar_t *)&clear_ire_stq->ire_gateway_addr,
14359 				    (uchar_t *)&clear_ire_stq->ire_in_src_addr,
14360 				    &clear_ire_stq->ire_max_frag,
14361 				    clear_ire_stq->ire_fp_mp,
14362 				    clear_ire_stq->ire_rfq,
14363 				    clear_ire_stq->ire_stq,
14364 				    clear_ire_stq->ire_type,
14365 				    clear_ire_stq->ire_dlureq_mp,
14366 				    clear_ire_stq->ire_ipif,
14367 				    clear_ire_stq->ire_in_ill,
14368 				    clear_ire_stq->ire_cmask,
14369 				    clear_ire_stq->ire_phandle,
14370 				    clear_ire_stq->ire_ihandle,
14371 				    clear_ire_stq->ire_flags,
14372 				    &clear_ire_stq->ire_uinfo) == NULL)
14373 					cmn_err(CE_PANIC, "ire_init() failed");
14374 			}
14375 		}
14376 
14377 		/*
14378 		 * Delete the ire. We can't call ire_delete() since
14379 		 * we are holding the bucket lock. We can't release the
14380 		 * bucket lock since we can't allow irep to change. So just
14381 		 * mark it CONDEMNED. The IRB_REFRELE will delete the
14382 		 * ire from the list and do the refrele.
14383 		 */
14384 		clear_ire->ire_marks |= IRE_MARK_CONDEMNED;
14385 		irb->irb_marks |= IRE_MARK_CONDEMNED;
14386 
14387 		if (clear_ire_stq != NULL) {
14388 			ire_fastpath_list_delete(
14389 			    (ill_t *)clear_ire_stq->ire_stq->q_ptr,
14390 			    clear_ire_stq);
14391 			clear_ire_stq->ire_marks |= IRE_MARK_CONDEMNED;
14392 		}
14393 
14394 		/*
14395 		 * Also take care of other fields like the ib/ob pkt
14396 		 * counts etc. Need to dup them; ditto in ill_bcast_delete_and_add.
14397 		 */
14398 
14399 		/* Add the new ires. Insert at *irep */
14400 		new_lb_ire->ire_bucket = clear_ire->ire_bucket;
14401 		ire1 = *irep;
14402 		if (ire1 != NULL)
14403 			ire1->ire_ptpn = &new_lb_ire->ire_next;
14404 		new_lb_ire->ire_next = ire1;
14405 		/* Link the new one in. */
14406 		new_lb_ire->ire_ptpn = irep;
14407 		membar_producer();
14408 		*irep = new_lb_ire;
14409 		new_lb_ire_used = B_TRUE;
14410 		BUMP_IRE_STATS(ire_stats_v4, ire_stats_inserted);
14411 		new_lb_ire->ire_bucket->irb_ire_cnt++;
14412 		new_lb_ire->ire_ipif->ipif_ire_cnt++;
14413 
14414 		if (clear_ire_stq != NULL) {
14415 			new_nlb_ire->ire_bucket = clear_ire->ire_bucket;
14416 			irep = &new_lb_ire->ire_next;
14417 			/* Add the new ire. Insert at *irep */
14418 			ire1 = *irep;
14419 			if (ire1 != NULL)
14420 				ire1->ire_ptpn = &new_nlb_ire->ire_next;
14421 			new_nlb_ire->ire_next = ire1;
14422 			/* Link the new one in. */
14423 			new_nlb_ire->ire_ptpn = irep;
14424 			membar_producer();
14425 			*irep = new_nlb_ire;
14426 			new_nlb_ire_used = B_TRUE;
14427 			BUMP_IRE_STATS(ire_stats_v4, ire_stats_inserted);
14428 			new_nlb_ire->ire_bucket->irb_ire_cnt++;
14429 			new_nlb_ire->ire_ipif->ipif_ire_cnt++;
14430 			((ill_t *)new_nlb_ire->ire_stq->q_ptr)->ill_ire_cnt++;
14431 		}
14432 	}
14433 	rw_exit(&irb->irb_lock);
14434 	if (!new_lb_ire_used)
14435 		kmem_cache_free(ire_cache, new_lb_ire);
14436 	if (!new_nlb_ire_used)
14437 		kmem_cache_free(ire_cache, new_nlb_ire);
14438 	IRB_REFRELE(irb);
14439 }
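
/*
 * Summary of the nomination order implemented above (derived from the
 * match_flags handling, not a separate mechanism):
 *
 *	1st pass: nominate the first ire whose ill is neither OFFLINE,
 *		  FAILED nor INACTIVE.
 *	2nd pass: if nothing was nominated but an INACTIVE ill was seen,
 *		  retry with match_flags = PHYI_FAILED so that an
 *		  INACTIVE (but not FAILED or OFFLINE) ill can be picked.
 */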
14440 
14441 /*
14442  * Whenever an ipif goes down we have to renominate a different
14443  * broadcast ire to receive. Whenever an ipif comes up, we need
14444  * to make sure that we have only one nominated to receive.
14445  */
14446 static void
14447 ipif_renominate_bcast(ipif_t *ipif)
14448 {
14449 	ill_t *ill = ipif->ipif_ill;
14450 	ipaddr_t subnet_addr;
14451 	ipaddr_t net_addr;
14452 	ipaddr_t net_mask = 0;
14453 	ipaddr_t subnet_netmask;
14454 	ipaddr_t addr;
14455 	ill_group_t *illgrp;
14456 
14457 	illgrp = ill->ill_group;
14458 	/*
14459 	 * If this is the last ipif going down, it might take
14460 	 * the ill out of the group. In that case ipif_down ->
14461 	 * illgrp_delete takes care of doing the nomination.
14462 	 * ipif_down does not call this function in that case.
14463 	 */
14464 	ASSERT(illgrp != NULL);
14465 
14466 	/* There could not have been any ires associated with this ipif */
14467 	if (ipif->ipif_subnet == 0)
14468 		return;
14469 
14470 	ill_mark_bcast(illgrp, 0);
14471 	ill_mark_bcast(illgrp, INADDR_BROADCAST);
14472 
14473 	if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
14474 	    !(ipif->ipif_flags & IPIF_NOLOCAL)) {
14475 		net_mask = ip_net_mask(ipif->ipif_lcl_addr);
14476 	} else {
14477 		net_mask = htonl(IN_CLASSA_NET);
14478 	}
14479 	addr = net_mask & ipif->ipif_subnet;
14480 	ill_mark_bcast(illgrp, addr);
14481 
14482 	net_addr = ~net_mask | addr;
14483 	ill_mark_bcast(illgrp, net_addr);
14484 
14485 	subnet_netmask = ipif->ipif_net_mask;
14486 	addr = ipif->ipif_subnet;
14487 	ill_mark_bcast(illgrp, addr);
14488 
14489 	subnet_addr = ~subnet_netmask | addr;
14490 	ill_mark_bcast(illgrp, subnet_addr);
14491 }
14492 
14493 /*
14494  * Whenever we form or delete ill groups, we need to nominate one set of
14495  * BROADCAST ires for receiving in the group.
14496  *
14497  * 1) When ipif_up_done -> illgrp_insert calls this function, BROADCAST ires
14498  *    have been added, but ill_ipif_up_count is 0. Thus, we don't assert
14499  *    for ill_ipif_up_count to be non-zero. This is the only case where
14500  *    ill_ipif_up_count is zero and we would still find the ires.
14501  *
14502  * 2) When ip_sioctl_groupname/illgrp_insert calls this function, at least
14503  *    one ipif is UP and we just have to do the nomination.
14504  *
14505  * 3) When ill_handoff_responsibility calls us, some ill has been removed
14506  *    from the group. So, we have to do the nomination.
14507  *
14508  * Because of (3), there could be just one ill in the group. But we still
14509  * have to nominate, as IRE_MARK_NORECV may have been marked on this.
14510  * Thus, this function does not optimize when there is only one ill as
14511  * it is not correct for (3).
14512  */
14513 static void
14514 ill_nominate_bcast_rcv(ill_group_t *illgrp)
14515 {
14516 	ill_t *ill;
14517 	ipif_t *ipif;
14518 	ipaddr_t subnet_addr;
14519 	ipaddr_t prev_subnet_addr = 0;
14520 	ipaddr_t net_addr;
14521 	ipaddr_t prev_net_addr = 0;
14522 	ipaddr_t net_mask = 0;
14523 	ipaddr_t subnet_netmask;
14524 	ipaddr_t addr;
14525 
14526 	/*
14527 	 * When the last member is leaving, there is nothing to
14528 	 * nominate.
14529 	 */
14530 	if (illgrp->illgrp_ill_count == 0) {
14531 		ASSERT(illgrp->illgrp_ill == NULL);
14532 		return;
14533 	}
14534 
14535 	ill = illgrp->illgrp_ill;
14536 	ASSERT(!ill->ill_isv6);
14537 	/*
14538 	 * We assume that ires with the same address, belonging to the
14539 	 * same group, have been grouped together. Nominating a *single*
14540 	 * ill in the group for sending and receiving broadcast is done
14541 	 * by making sure that the first BROADCAST ire (which will be
14542 	 * the one returned by ire_ctable_lookup for ip_rput and the
14543 	 * one that will be used in ip_wput_ire) will be the one that
14544 	 * will not have IRE_MARK_NORECV set.
14545 	 *
14546 	 * 1) ip_rput checks and discards packets received on ires marked
14547 	 *    with IRE_MARK_NORECV. Thus, we don't send up duplicate
14548 	 *    broadcast packets. We need to clear IRE_MARK_NORECV on the
14549 	 *    first ire in the group for every broadcast address in the group.
14550 	 *    ip_rput will accept packets only on the first ire, i.e. only
14551 	 *    one copy, on a single ill of the group.
14552 	 *
14553 	 * 2) ip_wput_ire needs to send out just one copy of the broadcast
14554 	 *    packet for the whole group. It needs to send out on the ill
14555 	 *    whose ire has not been marked with IRE_MARK_NORECV. If it sends
14556 	 *    on the one marked with IRE_MARK_NORECV, ip_rput will accept
14557 	 *    the copy echoed back on the other port where the ire is not marked
14558 	 *    with IRE_MARK_NORECV.
14559 	 *
14560 	 * Note that we just need to have the first IRE either loopback or
14561 	 * non-loopback (either of them may not exist if ire_create failed
14562 	 * during ipif_down) with IRE_MARK_NORECV not set. ip_rput will
14563 	 * always hit the first one and hence will always accept one copy.
14564 	 *
14565 	 * We have a broadcast ire per ill for all the unique prefixes
14566 	 * hosted on that ill. As we don't have a way of knowing the
14567 	 * unique prefixes on a given ill and hence in the whole group,
14568 	 * we just call ill_mark_bcast on all the prefixes that exist
14569 	 * in the group. For the common case of one prefix, the code
14570 	 * below optimizes by remembering the last address used for
14571 	 * marking. In the case of multiple prefixes, this will still
14572 	 * optimize depending on the order of the prefixes.
14573 	 *
14574 	 * The only addresses unique across the whole group are 0.0.0.0 and
14575 	 * 255.255.255.255; thus we mark each only once. ill_mark_bcast enables
14576 	 * the first ire in the bucket for receiving and disables the
14577 	 * others.
14578 	 */
14579 	ill_mark_bcast(illgrp, 0);
14580 	ill_mark_bcast(illgrp, INADDR_BROADCAST);
14581 	for (; ill != NULL; ill = ill->ill_group_next) {
14582 
14583 		for (ipif = ill->ill_ipif; ipif != NULL;
14584 		    ipif = ipif->ipif_next) {
14585 
14586 			if (!(ipif->ipif_flags & IPIF_UP) ||
14587 			    ipif->ipif_subnet == 0) {
14588 				continue;
14589 			}
14590 			if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
14591 			    !(ipif->ipif_flags & IPIF_NOLOCAL)) {
14592 				net_mask = ip_net_mask(ipif->ipif_lcl_addr);
14593 			} else {
14594 				net_mask = htonl(IN_CLASSA_NET);
14595 			}
14596 			addr = net_mask & ipif->ipif_subnet;
14597 			if (prev_net_addr == 0 || prev_net_addr != addr) {
14598 				ill_mark_bcast(illgrp, addr);
14599 				net_addr = ~net_mask | addr;
14600 				ill_mark_bcast(illgrp, net_addr);
14601 			}
14602 			prev_net_addr = addr;
14603 
14604 			subnet_netmask = ipif->ipif_net_mask;
14605 			addr = ipif->ipif_subnet;
14606 			if (prev_subnet_addr == 0 ||
14607 			    prev_subnet_addr != addr) {
14608 				ill_mark_bcast(illgrp, addr);
14609 				subnet_addr = ~subnet_netmask | addr;
14610 				ill_mark_bcast(illgrp, subnet_addr);
14611 			}
14612 			prev_subnet_addr = addr;
14613 		}
14614 	}
14615 }
14616 
14617 /*
14618  * This function is called while forming ill groups.
14619  *
14620  * Currently, we handle only allmulti groups. We want to join
14621  * allmulti on only one of the ills in the groups. In future,
14622  * when we have link aggregation, we may have to join normal
14623  * multicast groups on multiple ills as switch does inbound load
14624  * balancing. Following are the functions that calls this
14625  * function :
14626  *
14627  * 1) ill_recover_multicast : Interface is coming back UP.
14628  *    When the first ipif comes back UP, ipif_up_done/ipif_up_done_v6
14629  *    will call ill_recover_multicast to recover all the multicast
14630  *    groups. We need to make sure that only one member is joined
14631  *    in the ill group.
14632  *
14633  * 2) ip_addmulti/ip_addmulti_v6 : ill groups have already been formed.
14634  *    Somebody is joining allmulti. We need to make sure that only one
14635  *    member is joined in the group.
14636  *
14637  * 3) illgrp_insert : If allmulti has already joined, we need to make
14638  *    sure that only one member is joined in the group.
14639  *
14640  * 4) ip_delmulti/ip_delmulti_v6 : Somebody in the group is leaving
14641  *    allmulti whom we have nominated. We need to pick some other ill.
14642  *
14643  * 5) illgrp_delete : The ill we nominated is leaving the group,
14644  *    we need to pick a new ill to join the group.
14645  *
14646  * For (1), (2), (5) - we just have to check whether there is
14647  * a good ill joined in the group. If we could not find any ill
14648  * joined in the group, we should join on one.
14649  *
14650  * For (4), the one that was nominated to receive left the group.
14651  * There could be nobody joined in the group when this function is
14652  * called.
14653  *
14654  * For (3) - we need to explicitly check whether there are multiple
14655  * ills joined in the group.
14656  *
14657  * For simplicity, we don't differentiate any of the above cases. We
14658  * just leave the group if it is joined on any of them and join on
14659  * the first good ill.
14660  */
14661 int
14662 ill_nominate_mcast_rcv(ill_group_t *illgrp)
14663 {
14664 	ilm_t *ilm;
14665 	ill_t *ill;
14666 	ill_t *fallback_inactive_ill = NULL;
14667 	ill_t *fallback_failed_ill = NULL;
14668 	int ret = 0;
14669 
14670 	/*
14671 	 * Leave the allmulti on all the ills and start fresh.
14672 	 */
14673 	for (ill = illgrp->illgrp_ill; ill != NULL;
14674 	    ill = ill->ill_group_next) {
14675 		if (ill->ill_join_allmulti)
14676 			(void) ip_leave_allmulti(ill->ill_ipif);
14677 	}
14678 
14679 	/*
14680 	 * Choose a good ill. Fall back to an INACTIVE or FAILED one if
14681 	 * none is available. We need to fall back to FAILED in the
14682 	 * case where we have 2 interfaces in a group - where
14683 	 * one of them is failed and the other is a good one and
14684 	 * the good one (not marked inactive) is leaving the group.
14685 	 */
14686 	ret = 0;
14687 	for (ill = illgrp->illgrp_ill; ill != NULL;
14688 	    ill = ill->ill_group_next) {
14689 		/* Never pick an offline interface */
14690 		if (ill->ill_phyint->phyint_flags & PHYI_OFFLINE)
14691 			continue;
14692 
14693 		if (ill->ill_phyint->phyint_flags & PHYI_FAILED) {
14694 			fallback_failed_ill = ill;
14695 			continue;
14696 		}
14697 		if (ill->ill_phyint->phyint_flags & PHYI_INACTIVE) {
14698 			fallback_inactive_ill = ill;
14699 			continue;
14700 		}
14701 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
14702 			if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
14703 				ret = ip_join_allmulti(ill->ill_ipif);
14704 				/*
14705 				 * ip_join_allmulti can fail because of memory
14706 				 * failures. So, make sure we join at least
14707 				 * on one ill.
14708 				 */
14709 				if (ill->ill_join_allmulti)
14710 					return (0);
14711 			}
14712 		}
14713 	}
14714 	if (ret != 0) {
14715 		/*
14716 		 * If we tried nominating above and failed to do so,
14717 		 * return the error. We might have tried multiple times,
14718 		 * in which case we return the latest error.
14719 		 */
14720 		return (ret);
14721 	}
14722 	if ((ill = fallback_inactive_ill) != NULL) {
14723 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
14724 			if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
14725 				ret = ip_join_allmulti(ill->ill_ipif);
14726 				return (ret);
14727 			}
14728 		}
14729 	} else if ((ill = fallback_failed_ill) != NULL) {
14730 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
14731 			if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
14732 				ret = ip_join_allmulti(ill->ill_ipif);
14733 				return (ret);
14734 			}
14735 		}
14736 	}
14737 	return (0);
14738 }
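
/*
 * For reference, the selection order implemented above: a healthy ill
 * (neither OFFLINE, FAILED nor INACTIVE) carrying an unspecified-address
 * ilm is preferred; failing that, an INACTIVE ill; failing that, a
 * FAILED ill. OFFLINE ills are never picked.
 */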
14739 
14740 /*
14741  * This function is called from illgrp_delete after the ill has been
14742  * deleted from the group, to reschedule its responsibilities
14743  * to a different ill.
14744  */
14745 static void
14746 ill_handoff_responsibility(ill_t *ill, ill_group_t *illgrp)
14747 {
14748 	ilm_t	*ilm;
14749 	ipif_t	*ipif;
14750 	ipaddr_t subnet_addr;
14751 	ipaddr_t net_addr;
14752 	ipaddr_t net_mask = 0;
14753 	ipaddr_t subnet_netmask;
14754 	ipaddr_t addr;
14755 
14756 	ASSERT(ill->ill_group == NULL);
14757 	/*
14758 	 * Broadcast Responsibility:
14759 	 *
14760 	 * 1. If this ill has been nominated for receiving broadcast
14761 	 * packets, we need to find a new one. Before we find a new
14762 	 * one, we need to re-group the ires that are part of this new
14763 	 * group (assumed by ill_nominate_bcast_rcv). We do this by
14764 	 * calling ill_group_bcast_for_xmit(ill) which will do the right
14765 	 * thing for us.
14766 	 *
14767 	 * 2. If this ill was not nominated for receiving broadcast
14768 	 * packets, we need to clear the IRE_MARK_NORECV flag
14769 	 * so that we continue to send up broadcast packets.
14770 	 */
14771 	if (!ill->ill_isv6) {
14772 		/*
14773 		 * Case 1 above : No optimization here. Just redo the
14774 		 * nomination.
14775 		 */
14776 		ill_group_bcast_for_xmit(ill);
14777 		ill_nominate_bcast_rcv(illgrp);
14778 
14779 		/*
14780 		 * Case 2 above : Lookup and clear IRE_MARK_NORECV.
14781 		 */
14782 		ill_clear_bcast_mark(ill, 0);
14783 		ill_clear_bcast_mark(ill, INADDR_BROADCAST);
14784 
14785 		for (ipif = ill->ill_ipif; ipif != NULL;
14786 		    ipif = ipif->ipif_next) {
14787 
14788 			if (!(ipif->ipif_flags & IPIF_UP) ||
14789 			    ipif->ipif_subnet == 0) {
14790 				continue;
14791 			}
14792 			if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
14793 			    !(ipif->ipif_flags & IPIF_NOLOCAL)) {
14794 				net_mask = ip_net_mask(ipif->ipif_lcl_addr);
14795 			} else {
14796 				net_mask = htonl(IN_CLASSA_NET);
14797 			}
14798 			addr = net_mask & ipif->ipif_subnet;
14799 			ill_clear_bcast_mark(ill, addr);
14800 
14801 			net_addr = ~net_mask | addr;
14802 			ill_clear_bcast_mark(ill, net_addr);
14803 
14804 			subnet_netmask = ipif->ipif_net_mask;
14805 			addr = ipif->ipif_subnet;
14806 			ill_clear_bcast_mark(ill, addr);
14807 
14808 			subnet_addr = ~subnet_netmask | addr;
14809 			ill_clear_bcast_mark(ill, subnet_addr);
14810 		}
14811 	}
14812 
14813 	/*
14814 	 * Multicast Responsibility.
14815 	 *
14816 	 * If we have joined allmulti on this one, find a new member
14817 	 * in the group to join allmulti. As this ill is already part
14818 	 * of allmulti, we don't have to join on this one.
14819 	 *
14820 	 * If we have not joined allmulti on this one, there is no
14821 	 * responsibility to hand off. But we need to take on new
14822 	 * responsibility, i.e., join allmulti on this one if we need
14823 	 * to.
14824 	 */
14825 	if (ill->ill_join_allmulti) {
14826 		(void) ill_nominate_mcast_rcv(illgrp);
14827 	} else {
14828 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
14829 			if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
14830 				(void) ip_join_allmulti(ill->ill_ipif);
14831 				break;
14832 			}
14833 		}
14834 	}
14835 
14836 	/*
14837 	 * We intentionally flush IRE_CACHEs matching only on the ill
14838 	 * and not on groups. Note that we have already been deleted
14839 	 * from the group.
14840 	 *
14841 	 * This will make sure that all IRE_CACHEs whose stq points
14842 	 * at ill_wq or whose ire_ipif->ipif_ill points at this ill get
14843 	 * deleted, and IRE_CACHEs that are not pointing at this ill are
14844 	 * left alone.
14845 	 */
14846 	if (ill->ill_isv6) {
14847 		ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE,
14848 		    IRE_CACHE, illgrp_cache_delete, (char *)ill, ill);
14849 	} else {
14850 		ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE,
14851 		    IRE_CACHE, illgrp_cache_delete, (char *)ill, ill);
14852 	}
14853 
14854 	/*
14855 	 * Some conn may have cached one of the IREs deleted above. By removing
14856 	 * the ire reference, we clean up the extra reference to the ill held in
14857 	 * ire->ire_stq.
14858 	 */
14859 	ipcl_walk(conn_cleanup_stale_ire, NULL);
14860 
14861 	/*
14862 	 * Re-do source address selection for all the members in the
14863 	 * group, if they borrowed source address from one of the ipifs
14864 	 * in this ill.
14865 	 */
14866 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
14867 		if (ill->ill_isv6) {
14868 			ipif_update_other_ipifs_v6(ipif, illgrp);
14869 		} else {
14870 			ipif_update_other_ipifs(ipif, illgrp);
14871 		}
14872 	}
14873 }
14874 
14875 /*
14876  * Delete the ill from the group. The caller makes sure that the ill is
14877  * in a group and that it is okay to delete it from the group. So, we always
14878  * delete here.
14879  */
14880 static void
14881 illgrp_delete(ill_t *ill)
14882 {
14883 	ill_group_t *illgrp;
14884 	ill_group_t *tmpg;
14885 	ill_t *tmp_ill;
14886 
14887 	/*
14888 	 * Reset illgrp_ill_schednext if it was pointing at us.
14889 	 * We need to do this before we set ill_group to NULL.
14890 	 */
14891 	rw_enter(&ill_g_lock, RW_WRITER);
14892 	mutex_enter(&ill->ill_lock);
14893 
14894 	illgrp_reset_schednext(ill);
14895 
14896 	illgrp = ill->ill_group;
14897 
14898 	/* Delete the ill from illgrp. */
14899 	if (illgrp->illgrp_ill == ill) {
14900 		illgrp->illgrp_ill = ill->ill_group_next;
14901 	} else {
14902 		tmp_ill = illgrp->illgrp_ill;
14903 		while (tmp_ill->ill_group_next != ill) {
14904 			tmp_ill = tmp_ill->ill_group_next;
14905 			ASSERT(tmp_ill != NULL);
14906 		}
14907 		tmp_ill->ill_group_next = ill->ill_group_next;
14908 	}
14909 	ill->ill_group = NULL;
14910 	ill->ill_group_next = NULL;
14911 
14912 	illgrp->illgrp_ill_count--;
14913 	mutex_exit(&ill->ill_lock);
14914 	rw_exit(&ill_g_lock);
14915 
14916 	/*
14917 	 * As this ill is leaving the group, we need to hand off
14918 	 * the responsibilities to the other ills in the group, if
14919 	 * this ill had some responsibilities.
14920 	 */
14921 
14922 	ill_handoff_responsibility(ill, illgrp);
14923 
14924 	rw_enter(&ill_g_lock, RW_WRITER);
14925 
14926 	if (illgrp->illgrp_ill_count == 0) {
14927 
14928 		ASSERT(illgrp->illgrp_ill == NULL);
14929 		if (ill->ill_isv6) {
14930 			if (illgrp == illgrp_head_v6) {
14931 				illgrp_head_v6 = illgrp->illgrp_next;
14932 			} else {
14933 				tmpg = illgrp_head_v6;
14934 				while (tmpg->illgrp_next != illgrp) {
14935 					tmpg = tmpg->illgrp_next;
14936 					ASSERT(tmpg != NULL);
14937 				}
14938 				tmpg->illgrp_next = illgrp->illgrp_next;
14939 			}
14940 		} else {
14941 			if (illgrp == illgrp_head_v4) {
14942 				illgrp_head_v4 = illgrp->illgrp_next;
14943 			} else {
14944 				tmpg = illgrp_head_v4;
14945 				while (tmpg->illgrp_next != illgrp) {
14946 					tmpg = tmpg->illgrp_next;
14947 					ASSERT(tmpg != NULL);
14948 				}
14949 				tmpg->illgrp_next = illgrp->illgrp_next;
14950 			}
14951 		}
14952 		mutex_destroy(&illgrp->illgrp_lock);
14953 		mi_free(illgrp);
14954 	}
14955 	rw_exit(&ill_g_lock);
14956 
14957 	/*
14958 	 * Even though the ill is out of the group, it's not necessary
14959 	 * to set ipsq_split to B_TRUE, as the ipifs could be down temporarily.
14960 	 * We will split the ipsq when phyint_groupname is set to NULL.
14961 	 */
14962 
14963 	/*
14964 	 * Send a routing sockets message if we are deleting from
14965 	 * groups with names.
14966 	 */
14967 	if (ill->ill_phyint->phyint_groupname_len != 0)
14968 		ip_rts_ifmsg(ill->ill_ipif);
14969 }
14970 
14971 /*
14972  * Re-do source address selection. This is normally called when
14973  * an ill joins the group or when a non-NOLOCAL/DEPRECATED/ANYCAST
14974  * ipif comes up.
14975  */
14976 void
14977 ill_update_source_selection(ill_t *ill)
14978 {
14979 	ipif_t *ipif;
14980 
14981 	ASSERT(IAM_WRITER_ILL(ill));
14982 
14983 	if (ill->ill_group != NULL)
14984 		ill = ill->ill_group->illgrp_ill;
14985 
14986 	for (; ill != NULL; ill = ill->ill_group_next) {
14987 		for (ipif = ill->ill_ipif; ipif != NULL;
14988 		    ipif = ipif->ipif_next) {
14989 			if (ill->ill_isv6)
14990 				ipif_recreate_interface_routes_v6(NULL, ipif);
14991 			else
14992 				ipif_recreate_interface_routes(NULL, ipif);
14993 		}
14994 	}
14995 }
14996 
14997 /*
14998  * Insert the ill in a group headed by illgrp_head. The caller can either
14999  * pass a groupname, in which case we search for a group with the
15000  * same name to insert in, or pass a group to insert in. This function
15001  * only searches groups with names.
15002  *
15003  * NOTE : The caller should make sure that there is at least one ipif
15004  *	  UP on this ill so that illgrp_scheduler can pick this ill
15005  *	  for outbound packets. If ill_ipif_up_count is zero, we have
15006  *	  already sent a DL_UNBIND to the driver and we don't want to
15007  *	  send any more packets. We don't assert for ipif_up_count
15008  *	  to be greater than zero, because ipif_up_done wants to call
15009  *	  this function before bumping up the ipif_up_count. See
15010  *	  ipif_up_done() for details.
15011  */
15012 int
15013 illgrp_insert(ill_group_t **illgrp_head, ill_t *ill, char *groupname,
15014     ill_group_t *grp_to_insert, boolean_t ipif_is_coming_up)
15015 {
15016 	ill_group_t *illgrp;
15017 	ill_t *prev_ill;
15018 	phyint_t *phyi;
15019 
15020 	ASSERT(ill->ill_group == NULL);
15021 
15022 	rw_enter(&ill_g_lock, RW_WRITER);
15023 	mutex_enter(&ill->ill_lock);
15024 
15025 	if (groupname != NULL) {
15026 		/*
15027 		 * Look for a group with a matching groupname to insert.
15028 		 */
15029 		for (illgrp = *illgrp_head; illgrp != NULL;
15030 		    illgrp = illgrp->illgrp_next) {
15031 
15032 			ill_t *tmp_ill;
15033 
15034 			/*
15035 			 * If we have an ill_group_t in the list which has
15036 			 * no ill_t assigned then we must be in the process of
15037 			 * removing this group. We skip this as illgrp_delete()
15038 			 * will remove it from the list.
15039 			 */
15040 			if ((tmp_ill = illgrp->illgrp_ill) == NULL) {
15041 				ASSERT(illgrp->illgrp_ill_count == 0);
15042 				continue;
15043 			}
15044 
15045 			ASSERT(tmp_ill->ill_phyint != NULL);
15046 			phyi = tmp_ill->ill_phyint;
15047 			/*
15048 			 * Only look at groups which have names.
15049 			 */
15050 			if (phyi->phyint_groupname_len == 0)
15051 				continue;
15052 			/*
15053 			 * Names are stored in the phyint common to both
15054 			 * IPv4 and IPv6.
15055 			 */
15056 			if (mi_strcmp(phyi->phyint_groupname,
15057 			    groupname) == 0) {
15058 				break;
15059 			}
15060 		}
15061 	} else {
15062 		/*
15063 		 * If the caller passes in a NULL "grp_to_insert", we
15064 		 * allocate a new group below and insert the ill as a singleton.
15065 		 */
15066 		illgrp = grp_to_insert;
15067 	}
15068 
15069 	ill->ill_group_next = NULL;
15070 
15071 	if (illgrp == NULL) {
15072 		illgrp = (ill_group_t *)mi_zalloc(sizeof (ill_group_t));
15073 		if (illgrp == NULL) {
			/* Don't leak the locks taken above on failure. */
			mutex_exit(&ill->ill_lock);
			rw_exit(&ill_g_lock);
15074 			return (ENOMEM);
15075 		}
15076 		illgrp->illgrp_next = *illgrp_head;
15077 		*illgrp_head = illgrp;
15078 		illgrp->illgrp_ill = ill;
15079 		illgrp->illgrp_ill_count = 1;
15080 		ill->ill_group = illgrp;
15081 		/*
15082 		 * Used in illgrp_scheduler to serialize multiple threads
15083 		 * traversing the list.
15084 		 */
15085 		mutex_init(&illgrp->illgrp_lock, NULL, MUTEX_DEFAULT, 0);
15086 	} else {
15087 		ASSERT(ill->ill_net_type ==
15088 		    illgrp->illgrp_ill->ill_net_type);
15089 		ASSERT(ill->ill_type == illgrp->illgrp_ill->ill_type);
15090 
15091 		/* Insert ill at tail of this group */
15092 		prev_ill = illgrp->illgrp_ill;
15093 		while (prev_ill->ill_group_next != NULL)
15094 			prev_ill = prev_ill->ill_group_next;
15095 		prev_ill->ill_group_next = ill;
15096 		ill->ill_group = illgrp;
15097 		illgrp->illgrp_ill_count++;
15098 		/*
15099 		 * Inherit group properties. Currently forwarding is the
15100 		 * only property we try to keep the same across all the
15101 		 * ills. When there are more, we will abstract this into
15102 		 * a function.
15103 		 */
15104 		ill->ill_flags &= ~ILLF_ROUTER;
15105 		ill->ill_flags |= (illgrp->illgrp_ill->ill_flags & ILLF_ROUTER);
15106 	}
15107 	mutex_exit(&ill->ill_lock);
15108 	rw_exit(&ill_g_lock);
15109 
15110 	/*
15111 	 * 1) When ipif_up_done() calls this function, ipif_up_count
15112 	 *    may be zero as it has not yet been bumped. But the ires
15113 	 *    have already been added. So, we do the nomination here.
15114 	 *    However, when ip_sioctl_groupname calls this, it checks
15115 	 *    for ill_ipif_up_count != 0. Thus we don't check for
15116 	 *    ill_ipif_up_count here while nominating broadcast ires for
15117 	 *    receive.
15118 	 *
15119 	 * 2) Similarly, we need to call ill_group_bcast_for_xmit here
15120 	 *    to group them properly as ire_add() has already happened
15121 	 *    in the ipif_up_done() case. For the ip_sioctl_groupname/
15122 	 *    illgrp_insert case, we need to do it here anyway.
15123 	 */
15124 	if (!ill->ill_isv6) {
15125 		ill_group_bcast_for_xmit(ill);
15126 		ill_nominate_bcast_rcv(illgrp);
15127 	}
15128 
15129 	if (!ipif_is_coming_up) {
15130 		/*
15131 		 * When ipif_up_done() calls this function, the multicast
15132 		 * groups have not been joined yet. So, there is no point in
15133 		 * nomination. ip_join_allmulti will handle groups when
15134 		 * ill_recover_multicast is called from ipif_up_done() later.
15135 		 */
15136 		(void) ill_nominate_mcast_rcv(illgrp);
15137 		/*
15138 		 * ipif_up_done calls ill_update_source_selection
15139 		 * anyway. Moreover, we don't want to re-create
15140 		 * interface routes while ipif_up_done() still has a reference
15141 		 * to them. Refer to ipif_up_done() for more details.
15142 		 */
15143 		ill_update_source_selection(ill);
15144 	}
15145 
15146 	/*
15147 	 * Send a routing sockets message if we are inserting into
15148 	 * groups with names.
15149 	 */
15150 	if (groupname != NULL)
15151 		ip_rts_ifmsg(ill->ill_ipif);
15152 	return (0);
15153 }
15154 
15155 /*
15156  * Return the first phyint matching the groupname. There could
15157  * be more than one when there are ill groups.
15158  *
15159  * Needs work: called only from ip_sioctl_groupname
15160  */
15161 static phyint_t *
15162 phyint_lookup_group(char *groupname)
15163 {
15164 	phyint_t *phyi;
15165 
15166 	ASSERT(RW_LOCK_HELD(&ill_g_lock));
15167 	/*
15168 	 * Group names are stored in the phyint - a common structure
15169 	 * to both IPv4 and IPv6.
15170 	 */
15171 	phyi = avl_first(&phyint_g_list.phyint_list_avl_by_index);
15172 	for (; phyi != NULL;
15173 	    phyi = avl_walk(&phyint_g_list.phyint_list_avl_by_index,
15174 	    phyi, AVL_AFTER)) {
15175 		if (phyi->phyint_groupname_len == 0)
15176 			continue;
15177 		ASSERT(phyi->phyint_groupname != NULL);
15178 		if (mi_strcmp(groupname, phyi->phyint_groupname) == 0)
15179 			return (phyi);
15180 	}
15181 	return (NULL);
15182 }
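
/*
 * Usage sketch (hypothetical caller, for illustration): the returned
 * phyint is not reference-held, so the caller must hold ill_g_lock
 * across both the lookup and any use of the result.
 *
 *	rw_enter(&ill_g_lock, RW_READER);
 *	phyi = phyint_lookup_group("mpk17-84");
 *	if (phyi != NULL)
 *		... use phyi while ill_g_lock is held ...
 *	rw_exit(&ill_g_lock);
 */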
15183 
15186 /*
15187  * MT notes on creation and deletion of IPMP groups
15188  *
15189  * Creation and deletion of IPMP groups introduce the need to merge or
15190  * split the associated serialization objects i.e the ipsq's. Normally all
15191  * the ills in an IPMP group would map to a single ipsq. If IPMP is not enabled
15192  * an ill-pair(v4, v6) i.e. phyint would map to a single ipsq. However during
15193  * the execution of the SIOCSLIFGROUPNAME command the picture changes. There
15194  * is a need to change the <ill-ipsq> association and we have to operate on both
15195  * the source and destination IPMP groups. For eg. attempting to set the
15196  * groupname of hme0 to mpk17-85 when it already belongs to mpk17-84 has to
15197  * handle 2 IPMP groups and 2 ipsqs. All the ills belonging to either of the
15198  * source or destination IPMP group are mapped to a single ipsq for executing
15199  * the SIOCSLIFGROUPNAME command. This is termed as a merge of the ipsq's.
15200  * The <ill-ipsq> mapping is restored back to normal at a later point. This is
15201  * termed as a split of the ipsq. The converse of the merge, i.e. a split of the
15202  * ipsq happens while unwinding from ipsq_exit. If at least 1 set groupname
15203  * occurred on the ipsq, then the ipsq_split flag is set. This indicates the
15204  * ipsq has to be examined for redoing the <ill-ipsq> associations.
15205  *
15206  * In the above example the ioctl handling code locates the current ipsq of hme0
15207  * which is ipsq(mpk17-84). It then enters the above ipsq immediately or
15208  * eventually (after queueing the ioctl in ipsq(mpk17-84)). Then it locates
15209  * the destination ipsq which is ipsq(mpk17-85) and merges the source ipsq into
15210  * the destination ipsq. If the destination ipsq is not busy, it also enters
15211  * the destination ipsq exclusively. Now the actual groupname setting operation
15212  * can proceed. If the destination ipsq is busy, the operation is enqueued
15213  * on the destination (merged) ipsq and will be handled in the unwind from
15214  * ipsq_exit.
15215  *
15216  * To prevent other threads from accessing the ill while the group name change
15217  * is in progress, we bring down the ipifs, which also removes the ill from the
15218  * group. The group is changed in phyint and when the first ipif on the ill
15219  * is brought up, the ill is inserted into the right IPMP group by
15220  * illgrp_insert.
15221  */
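
/*
 * Userland sketch (hypothetical, minimal error handling) of driving the
 * handler below via SIOCSLIFGROUPNAME; assumes <sys/sockio.h>,
 * <net/if.h>, <string.h> and <unistd.h>:
 *
 *	struct lifreq lifr;
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	bzero(&lifr, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "hme0", sizeof (lifr.lifr_name));
 *	(void) strlcpy(lifr.lifr_groupname, "mpk17-85",
 *	    sizeof (lifr.lifr_groupname));
 *	if (ioctl(s, SIOCSLIFGROUPNAME, (caddr_t)&lifr) < 0)
 *		perror("SIOCSLIFGROUPNAME");
 */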
15222 /* ARGSUSED */
15223 int
15224 ip_sioctl_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
15225     ip_ioctl_cmd_t *ipip, void *ifreq)
15226 {
15227 	int i;
15228 	char *tmp;
15229 	int namelen;
15230 	ill_t *ill = ipif->ipif_ill;
15231 	ill_t *ill_v4, *ill_v6;
15232 	int err = 0;
15233 	phyint_t *phyi;
15234 	phyint_t *phyi_tmp;
15235 	struct lifreq *lifr;
15236 	mblk_t	*mp1;
15237 	char *groupname;
15238 	ipsq_t *ipsq;
15239 
15240 	ASSERT(IAM_WRITER_IPIF(ipif));
15241 
15242 	/* Existence verified in ip_wput_nondata */
15243 	mp1 = mp->b_cont->b_cont;
15244 	lifr = (struct lifreq *)mp1->b_rptr;
15245 	groupname = lifr->lifr_groupname;
15246 
15247 	if (ipif->ipif_id != 0)
15248 		return (EINVAL);
15249 
15250 	phyi = ill->ill_phyint;
15251 	ASSERT(phyi != NULL);
15252 
15253 	if (phyi->phyint_flags & PHYI_VIRTUAL)
15254 		return (EINVAL);
15255 
15256 	tmp = groupname;
15257 	for (i = 0; i < LIFNAMSIZ && *tmp != '\0'; tmp++, i++)
15258 		;
15259 
15260 	if (i == LIFNAMSIZ) {
15261 		/* no null termination */
15262 		return (EINVAL);
15263 	}
15264 
15265 	/*
15266 	 * Calculate the namelen exclusive of the null
15267 	 * termination character.
15268 	 */
15269 	namelen = tmp - groupname;
15270 
15271 	ill_v4 = phyi->phyint_illv4;
15272 	ill_v6 = phyi->phyint_illv6;
15273 
15274 	/*
15275 	 * An ill cannot be part of a usesrc group and an IPMP group at the
15276 	 * same time. No need to grab the ill_g_usesrc_lock here, see
15277 	 * synchronization notes in ip.c
15278 	 */
15279 	if (ipif->ipif_ill->ill_usesrc_grp_next != NULL) {
15280 		return (EINVAL);
15281 	}
15282 
15283 	/*
15284 	 * Mark the ill as changing.
15285 	 * This should queue all new requests on the syncq.
15286 	 */
15287 	GRAB_ILL_LOCKS(ill_v4, ill_v6);
15288 
15289 	if (ill_v4 != NULL)
15290 		ill_v4->ill_state_flags |= ILL_CHANGING;
15291 	if (ill_v6 != NULL)
15292 		ill_v6->ill_state_flags |= ILL_CHANGING;
15293 	RELEASE_ILL_LOCKS(ill_v4, ill_v6);
15294 
15295 	if (namelen == 0) {
15296 		/*
15297 		 * Null string means remove this interface from the
15298 		 * existing group.
15299 		 */
15300 		if (phyi->phyint_groupname_len == 0) {
15301 			/*
15302 			 * Never was in a group.
15303 			 */
15304 			err = 0;
15305 			goto done;
15306 		}
15307 
15308 		/*
15309 		 * IPv4 or IPv6 may be temporarily out of the group when all
15310 		 * the ipifs are down. Thus, we need to check for ill_group to
15311 		 * be non-NULL.
15312 		 */
15313 		if (ill_v4 != NULL && ill_v4->ill_group != NULL) {
15314 			ill_down_ipifs(ill_v4, mp, 0, B_FALSE);
15315 			mutex_enter(&ill_v4->ill_lock);
15316 			if (!ill_is_quiescent(ill_v4)) {
15317 				/*
15318 				 * ipsq_pending_mp_add will not fail since
15319 				 * connp is NULL
15320 				 */
15321 				(void) ipsq_pending_mp_add(NULL,
15322 				    ill_v4->ill_ipif, q, mp, ILL_DOWN);
15323 				mutex_exit(&ill_v4->ill_lock);
15324 				err = EINPROGRESS;
15325 				goto done;
15326 			}
15327 			mutex_exit(&ill_v4->ill_lock);
15328 		}
15329 
15330 		if (ill_v6 != NULL && ill_v6->ill_group != NULL) {
15331 			ill_down_ipifs(ill_v6, mp, 0, B_FALSE);
15332 			mutex_enter(&ill_v6->ill_lock);
15333 			if (!ill_is_quiescent(ill_v6)) {
15334 				(void) ipsq_pending_mp_add(NULL,
15335 				    ill_v6->ill_ipif, q, mp, ILL_DOWN);
15336 				mutex_exit(&ill_v6->ill_lock);
15337 				err = EINPROGRESS;
15338 				goto done;
15339 			}
15340 			mutex_exit(&ill_v6->ill_lock);
15341 		}
15342 
15343 		rw_enter(&ill_g_lock, RW_WRITER);
15344 		GRAB_ILL_LOCKS(ill_v4, ill_v6);
15345 		mutex_enter(&phyi->phyint_lock);
15346 		ASSERT(phyi->phyint_groupname != NULL);
15347 		mi_free(phyi->phyint_groupname);
15348 		phyi->phyint_groupname = NULL;
15349 		phyi->phyint_groupname_len = 0;
15350 		mutex_exit(&phyi->phyint_lock);
15351 		RELEASE_ILL_LOCKS(ill_v4, ill_v6);
15352 		rw_exit(&ill_g_lock);
15353 		err = ill_up_ipifs(ill, q, mp);
15354 
15355 		/*
15356 		 * Set the split flag so that the ipsq can be split.
15357 		 */
15358 		mutex_enter(&phyi->phyint_ipsq->ipsq_lock);
15359 		phyi->phyint_ipsq->ipsq_split = B_TRUE;
15360 		mutex_exit(&phyi->phyint_ipsq->ipsq_lock);
15361 
15362 	} else {
15363 		if (phyi->phyint_groupname_len != 0) {
15364 			ASSERT(phyi->phyint_groupname != NULL);
15365 			/* Are we inserting in the same group ? */
15366 			if (mi_strcmp(groupname,
15367 			    phyi->phyint_groupname) == 0) {
15368 				err = 0;
15369 				goto done;
15370 			}
15371 		}
15372 
15373 		rw_enter(&ill_g_lock, RW_READER);
15374 		/*
15375 		 * Merge the ipsqs for the groups.
15376 		 * This check is here as multiple groups/ills might be
15377 		 * sharing the same ipsq.
15378 		 * If we have to merge, then the operation is restarted
15379 		 * on the new ipsq.
15380 		 */
15381 		ipsq = ip_ipsq_lookup(groupname, B_FALSE, NULL);
15382 		if (phyi->phyint_ipsq != ipsq) {
15383 			rw_exit(&ill_g_lock);
15384 			err = ill_merge_groups(ill, NULL, groupname, mp, q);
15385 			goto done;
15386 		}
15387 		/*
15388 		 * Running exclusive on new ipsq.
15389 		 */
15390 
15391 		ASSERT(ipsq != NULL);
15392 		ASSERT(ipsq->ipsq_writer == curthread);
15393 
15394 		/*
15395 		 * Check whether the ill_type and ill_net_type match before
15396 		 * we allocate any memory so that the cleanup is easier.
15397 		 *
15398 		 * We can't group dissimilar ones as we can't load spread
15399 		 * packets across the group because of potential link-level
15400 		 * header differences.
15401 		 */
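		/*
		 * For illustration: grouping, say, an Ethernet ill with a
		 * Token Ring ill would fail the checks below with EINVAL,
		 * since their link-level headers differ; both the v4 and
		 * the v6 sides of the two phyints are compared.
		 */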
15402 		phyi_tmp = phyint_lookup_group(groupname);
15403 		if (phyi_tmp != NULL) {
15404 			if ((ill_v4 != NULL &&
15405 			    phyi_tmp->phyint_illv4 != NULL) &&
15406 			    ((ill_v4->ill_net_type !=
15407 			    phyi_tmp->phyint_illv4->ill_net_type) ||
15408 			    (ill_v4->ill_type !=
15409 			    phyi_tmp->phyint_illv4->ill_type))) {
15410 				mutex_enter(&phyi->phyint_ipsq->ipsq_lock);
15411 				phyi->phyint_ipsq->ipsq_split = B_TRUE;
15412 				mutex_exit(&phyi->phyint_ipsq->ipsq_lock);
15413 				rw_exit(&ill_g_lock);
15414 				return (EINVAL);
15415 			}
15416 			if ((ill_v6 != NULL &&
15417 			    phyi_tmp->phyint_illv6 != NULL) &&
15418 			    ((ill_v6->ill_net_type !=
15419 			    phyi_tmp->phyint_illv6->ill_net_type) ||
15420 			    (ill_v6->ill_type !=
15421 			    phyi_tmp->phyint_illv6->ill_type))) {
15422 				mutex_enter(&phyi->phyint_ipsq->ipsq_lock);
15423 				phyi->phyint_ipsq->ipsq_split = B_TRUE;
15424 				mutex_exit(&phyi->phyint_ipsq->ipsq_lock);
15425 				rw_exit(&ill_g_lock);
15426 				return (EINVAL);
15427 			}
15428 		}
15429 
15430 		rw_exit(&ill_g_lock);
15431 
15432 		/*
15433 		 * bring down all v4 ipifs.
15434 		 */
15435 		if (ill_v4 != NULL) {
15436 			ill_down_ipifs(ill_v4, mp, 0, B_FALSE);
15437 		}
15438 
15439 		/*
15440 		 * bring down all v6 ipifs.
15441 		 */
15442 		if (ill_v6 != NULL) {
15443 			ill_down_ipifs(ill_v6, mp, 0, B_FALSE);
15444 		}
15445 
15446 		/*
15447 		 * make sure all ipifs are down and there are no active
15448 		 * references. Call to ipsq_pending_mp_add will not fail
15449 		 * since connp is NULL.
15450 		 */
15451 		if (ill_v4 != NULL) {
15452 			mutex_enter(&ill_v4->ill_lock);
15453 			if (!ill_is_quiescent(ill_v4)) {
15454 				(void) ipsq_pending_mp_add(NULL,
15455 				    ill_v4->ill_ipif, q, mp, ILL_DOWN);
15456 				mutex_exit(&ill_v4->ill_lock);
15457 				err = EINPROGRESS;
15458 				goto done;
15459 			}
15460 			mutex_exit(&ill_v4->ill_lock);
15461 		}
15462 
15463 		if (ill_v6 != NULL) {
15464 			mutex_enter(&ill_v6->ill_lock);
15465 			if (!ill_is_quiescent(ill_v6)) {
15466 				(void) ipsq_pending_mp_add(NULL,
15467 				    ill_v6->ill_ipif, q, mp, ILL_DOWN);
15468 				mutex_exit(&ill_v6->ill_lock);
15469 				err = EINPROGRESS;
15470 				goto done;
15471 			}
15472 			mutex_exit(&ill_v6->ill_lock);
15473 		}
15474 
15475 		/*
15476 		 * Allocate space, including the null terminator,
15477 		 * before we insert.
15478 		 */
15479 		tmp = (char *)mi_alloc(namelen + 1, BPRI_MED);
15480 		if (tmp == NULL)
15481 			return (ENOMEM);
15482 
15483 		rw_enter(&ill_g_lock, RW_WRITER);
15484 		GRAB_ILL_LOCKS(ill_v4, ill_v6);
15485 		mutex_enter(&phyi->phyint_lock);
15486 		if (phyi->phyint_groupname_len != 0) {
15487 			ASSERT(phyi->phyint_groupname != NULL);
15488 			mi_free(phyi->phyint_groupname);
15489 		}
15490 
15491 		/*
15492 		 * setup the new group name.
15493 		 */
15494 		phyi->phyint_groupname = tmp;
15495 		bcopy(groupname, phyi->phyint_groupname, namelen + 1);
15496 		phyi->phyint_groupname_len = namelen + 1;
15497 		mutex_exit(&phyi->phyint_lock);
15498 		RELEASE_ILL_LOCKS(ill_v4, ill_v6);
15499 		rw_exit(&ill_g_lock);
15500 
15501 		err = ill_up_ipifs(ill, q, mp);
15502 	}
15503 
15504 done:
15505 	/*
15506 	 * Normally ILL_CHANGING is cleared in ill_up_ipifs.
15507 	 */
15508 	if (err != EINPROGRESS) {
15509 		GRAB_ILL_LOCKS(ill_v4, ill_v6);
15510 		if (ill_v4 != NULL)
15511 			ill_v4->ill_state_flags &= ~ILL_CHANGING;
15512 		if (ill_v6 != NULL)
15513 			ill_v6->ill_state_flags &= ~ILL_CHANGING;
15514 		RELEASE_ILL_LOCKS(ill_v4, ill_v6);
15515 	}
15516 	return (err);
15517 }
15518 
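/*
 * Report the IPMP group name via the SIOCGLIFGROUPNAME ioctl (this is,
 * for example, how ifconfig learns the groupname it prints). The name
 * lives on the phyint, so it is returned even when all the ipifs are
 * down and ill_group is NULL.
 */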
15519 /* ARGSUSED */
15520 int
15521 ip_sioctl_get_groupname(ipif_t *ipif, sin_t *dummy_sin, queue_t *q,
15522     mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
15523 {
15524 	ill_t *ill;
15525 	phyint_t *phyi;
15526 	struct lifreq *lifr;
15527 	mblk_t	*mp1;
15528 
15529 	/* Existence verified in ip_wput_nondata */
15530 	mp1 = mp->b_cont->b_cont;
15531 	lifr = (struct lifreq *)mp1->b_rptr;
15532 	ill = ipif->ipif_ill;
15533 	phyi = ill->ill_phyint;
15534 
15535 	lifr->lifr_groupname[0] = '\0';
15536 	/*
15537 	 * ill_group may be null if all the interfaces
15538 	 * are down. But still, the phyint should always
15539 	 * hold the name.
15540 	 */
15541 	if (phyi->phyint_groupname_len != 0) {
15542 		bcopy(phyi->phyint_groupname, lifr->lifr_groupname,
15543 		    phyi->phyint_groupname_len);
15544 	}
15545 
15546 	return (0);
15547 }
15548 
15549 
15550 typedef struct conn_move_s {
15551 	ill_t	*cm_from_ill;
15552 	ill_t	*cm_to_ill;
15553 	int	cm_ifindex;
15554 } conn_move_t;
15555 
15556 /*
15557  * ipcl_walk function for moving conn_multicast_ill for a given ill.
15558  */
15559 static void
15560 conn_move(conn_t *connp, caddr_t arg)
15561 {
15562 	conn_move_t *connm;
15563 	int ifindex;
15564 	int i;
15565 	ill_t *from_ill;
15566 	ill_t *to_ill;
15567 	ilg_t *ilg;
15568 	ilm_t *ret_ilm;
15569 
15570 	connm = (conn_move_t *)arg;
15571 	ifindex = connm->cm_ifindex;
15572 	from_ill = connm->cm_from_ill;
15573 	to_ill = connm->cm_to_ill;
15574 
15575 	/* Change IP_BOUND_IF/IPV6_BOUND_IF associations. */
15576 
15577 	/* All multicast fields protected by conn_lock */
15578 	mutex_enter(&connp->conn_lock);
15579 	ASSERT(connp->conn_outgoing_ill == connp->conn_incoming_ill);
15580 	if ((connp->conn_outgoing_ill == from_ill) &&
15581 	    (ifindex == 0 || connp->conn_orig_bound_ifindex == ifindex)) {
15582 		connp->conn_outgoing_ill = to_ill;
15583 		connp->conn_incoming_ill = to_ill;
15584 	}
15585 
15586 	/* Change IP_MULTICAST_IF/IPV6_MULTICAST_IF associations */
15587 
15588 	if ((connp->conn_multicast_ill == from_ill) &&
15589 	    (ifindex == 0 || connp->conn_orig_multicast_ifindex == ifindex)) {
15590 		connp->conn_multicast_ill = connm->cm_to_ill;
15591 	}
15592 
15593 	/* Change IP_XMIT_IF associations */
15594 	if ((connp->conn_xmit_if_ill == from_ill) &&
15595 	    (ifindex == 0 || connp->conn_orig_xmit_ifindex == ifindex)) {
15596 		connp->conn_xmit_if_ill = to_ill;
15597 	}
15598 	/*
15599 	 * Change the ilg_ill to point to the new one. This assumes
15600 	 * ilm_move_v6 has moved the ilms to new_ill and the driver
15601 	 * has been told to receive packets on this interface.
15602 	 * ilm_move_v6 always FAILBACKS all the ilms successfully.
15603 	 * But when doing a FAILOVER, it might fail with ENOMEM and so
15604 	 * some ilms may not have moved. We check to see whether
15605 	 * the ilms have moved to to_ill. We can't check on from_ill
15606 	 * as in the process of moving, we could have split an ilm
15607 	 * into two - which have the same orig_ifindex and v6group.
15608 	 *
15609 	 * For IPv4, ilg_ipif moves implicitly. The code below really
15610 	 * does not do anything for IPv4 as ilg_ill is NULL for IPv4.
15611 	 */
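	/*
	 * In short: on FAILBACK (ifindex != 0) the ilg is re-pointed
	 * unconditionally; on FAILOVER it is re-pointed only if the
	 * matching ilm is found on to_ill (it may be missing if
	 * ilm_move_v6 hit ENOMEM for that group).
	 */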
15612 	for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) {
15613 		ilg = &connp->conn_ilg[i];
15614 		if ((ilg->ilg_ill == from_ill) &&
15615 		    (ifindex == 0 || ilg->ilg_orig_ifindex == ifindex)) {
15616 			/* ifindex != 0 indicates failback */
15617 			if (ifindex != 0) {
15618 				connp->conn_ilg[i].ilg_ill = to_ill;
15619 				continue;
15620 			}
15621 
15622 			ret_ilm = ilm_lookup_ill_index_v6(to_ill,
15623 			    &ilg->ilg_v6group, ilg->ilg_orig_ifindex,
15624 			    connp->conn_zoneid);
15625 
15626 			if (ret_ilm != NULL)
15627 				connp->conn_ilg[i].ilg_ill = to_ill;
15628 		}
15629 	}
15630 	mutex_exit(&connp->conn_lock);
15631 }
15632 
15633 static void
15634 conn_move_ill(ill_t *from_ill, ill_t *to_ill, int ifindex)
15635 {
15636 	conn_move_t connm;
15637 
15638 	connm.cm_from_ill = from_ill;
15639 	connm.cm_to_ill = to_ill;
15640 	connm.cm_ifindex = ifindex;
15641 
15642 	ipcl_walk(conn_move, (caddr_t)&connm);
15643 }
15644 
15645 /*
15646  * ilm has been moved from from_ill to to_ill.
15647  * Send DL_DISABMULTI_REQ on from_ill and DL_ENABMULTI_REQ on to_ill,
15648  * as appropriate.
15649  *
15650  * NOTE : We can't reuse the code in ip_ll_addmulti/delmulti because
15651  *	  the code there de-references ipif_ill to get the ill to
15652  *	  send multicast requests. That does not work here, as the
15653  *	  ipif is mid-move and has already moved when this function is
15654  *	  called. Thus, we need to use from_ill and to_ill to send down
15655  *	  the multicast requests.
15656  */
15657 static void
15658 ilm_send_multicast_reqs(ill_t *from_ill, ill_t *to_ill)
15659 {
15660 	ipif_t *ipif;
15661 	ilm_t *ilm;
15662 
15663 	/*
15664 	 * See whether we need to send down DL_ENABMULTI_REQ on
15665 	 * to_ill as ilm has just been added.
15666 	 */
15667 	ASSERT(IAM_WRITER_ILL(to_ill));
15668 	ASSERT(IAM_WRITER_ILL(from_ill));
15669 
15670 	ILM_WALKER_HOLD(to_ill);
15671 	for (ilm = to_ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
15672 
15673 		if (!ilm->ilm_is_new || (ilm->ilm_flags & ILM_DELETED))
15674 			continue;
15675 		/*
15676 		 * no locks held; ill/ipif cannot disappear as long
15677 		 * as we are writer.
15678 		 */
15679 		ipif = to_ill->ill_ipif;
15680 		/*
15681 		 * No need to hold any lock as we are the writer and this
15682 		 * can only be changed by a writer.
15683 		 */
15684 		ilm->ilm_is_new = B_FALSE;
15685 
15686 		if (to_ill->ill_net_type != IRE_IF_RESOLVER ||
15687 		    ipif->ipif_flags & IPIF_POINTOPOINT) {
15688 			ip1dbg(("ilm_send_multicast_reqs: to_ill not "
15689 			    "resolver\n"));
15690 			continue;		/* Must be IRE_IF_NORESOLVER */
15691 		}
15692 
15694 		if (to_ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) {
15695 			ip1dbg(("ilm_send_multicast_reqs: "
15696 			    "to_ill MULTI_BCAST\n"));
15697 			goto from;
15698 		}
15699 
15700 		if (to_ill->ill_isv6)
15701 			mld_joingroup(ilm);
15702 		else
15703 			igmp_joingroup(ilm);
15704 
15705 		if (to_ill->ill_ipif_up_count == 0) {
15706 			/*
15707 			 * Nobody there. All multicast addresses will be
15708 			 * re-joined when we get the DL_BIND_ACK bringing the
15709 			 * interface up.
15710 			 */
15711 			ilm->ilm_notify_driver = B_FALSE;
15712 			ip1dbg(("ilm_send_multicast_reqs: to_ill nobody up\n"));
15713 			goto from;
15714 		}
15715 
15716 		/*
15717 		 * For allmulti address, we want to join on only one interface.
15718 		 * Checking for ilm_numentries_v6 is not correct as you may
15719 		 * find an ilm with zero address on to_ill, but we may not
15720 		 * have nominated to_ill for receiving. Thus, if we have
15721 		 * nominated from_ill (ill_join_allmulti is set), nominate
15722 		 * to_ill only if it is not already nominated. (to_ill normally
15723 		 * should not have been nominated if "from_ill" has already
15724 		 * been nominated; but as we don't prevent failovers from
15725 		 * happening across groups, we don't assert.)
15726 		 */
15727 		if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
15728 			/*
15729 			 * There is no need to hold ill locks as we are
15730 			 * writer on both ills and when ill_join_allmulti
15731 			 * is changed the thread is always a writer.
15732 			 */
15733 			if (from_ill->ill_join_allmulti &&
15734 			    !to_ill->ill_join_allmulti) {
15735 				(void) ip_join_allmulti(to_ill->ill_ipif);
15736 			}
15737 		} else if (ilm->ilm_notify_driver) {
15738 
15739 			/*
15740 			 * This is a newly moved ilm so we need to tell the
15741 			 * driver about the new group. There can be more than
15742 			 * one ilm for the same group in the list, each with a
15743 			 * different orig_ifindex. We have to inform the driver
15744 			 * once. In ilm_move_v[4,6] we only set the flag
15745 			 * ilm_notify_driver for the first ilm.
15746 			 */
15747 
15748 			(void) ip_ll_send_enabmulti_req(to_ill,
15749 			    &ilm->ilm_v6addr);
15750 		}
15751 
15752 		ilm->ilm_notify_driver = B_FALSE;
15753 
15754 		/*
15755 		 * See whether we need to send down DL_DISABMULTI_REQ on
15756 		 * from_ill as ilm has just been removed.
15757 		 */
15758 from:
15759 		ipif = from_ill->ill_ipif;
15760 		if (from_ill->ill_net_type != IRE_IF_RESOLVER ||
15761 		    ipif->ipif_flags & IPIF_POINTOPOINT) {
15762 			ip1dbg(("ilm_send_multicast_reqs: "
15763 			    "from_ill not resolver\n"));
15764 			continue;		/* Must be IRE_IF_NORESOLVER */
15765 		}
15766 
15767 		if (from_ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) {
15768 			ip1dbg(("ilm_send_multicast_reqs: "
15769 			    "from_ill MULTI_BCAST\n"));
15770 			continue;
15771 		}
15772 
15773 		if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
15774 			if (from_ill->ill_join_allmulti)
15775 				(void) ip_leave_allmulti(from_ill->ill_ipif);
15776 		} else if (ilm_numentries_v6(from_ill, &ilm->ilm_v6addr) == 0) {
15777 			(void) ip_ll_send_disabmulti_req(from_ill,
15778 			    &ilm->ilm_v6addr);
15779 		}
15780 	}
15781 	ILM_WALKER_RELE(to_ill);
15782 }
15783 
15784 /*
15785  * This function is called when all multicast memberships need
15786  * to be moved from "from_ill" to "to_ill" for IPv6. Unlike the IPv4
15787  * counterpart, where it is called after every logical interface is
15788  * moved, this function is called only once. The reason is that
15789  * multicast memberships are joined using an interface address in IPv4,
15790  * while in IPv6 the interface index is used.
15791  */
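/*
 * A sketch of the cases handled in the loop below: (1) FAILBACK with a
 * matching ilm_orig_ifindex - move (or merge) the ilm back to to_ill;
 * (2) FAILBACK without a match - leave the ilm alone; (3) FAILOVER
 * (ifindex == 0) - move everything, with special accounting for the
 * solicited-node and all-nodes (ff02::1) multicast groups that are
 * joined within the kernel.
 */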
15792 static void
15793 ilm_move_v6(ill_t *from_ill, ill_t *to_ill, int ifindex)
15794 {
15795 	ilm_t	*ilm;
15796 	ilm_t	*ilm_next;
15797 	ilm_t	*new_ilm;
15798 	ilm_t	**ilmp;
15799 	int	count;
15800 	char buf[INET6_ADDRSTRLEN];
15801 	in6_addr_t ipv6_snm = ipv6_solicited_node_mcast;
15802 
15803 	ASSERT(MUTEX_HELD(&to_ill->ill_lock));
15804 	ASSERT(MUTEX_HELD(&from_ill->ill_lock));
15805 	ASSERT(RW_WRITE_HELD(&ill_g_lock));
15806 
15807 	if (ifindex == 0) {
15808 		/*
15809 		 * Form the solicited node mcast address which is used later.
15810 		 */
15811 		ipif_t *ipif;
15812 
15813 		ipif = from_ill->ill_ipif;
15814 		ASSERT(ipif->ipif_id == 0);
15815 
15816 		ipv6_snm.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3];
15817 	}
15818 
15819 	ilmp = &from_ill->ill_ilm;
15820 	for (ilm = from_ill->ill_ilm; ilm != NULL; ilm = ilm_next) {
15821 		ilm_next = ilm->ilm_next;
15822 
15823 		if (ilm->ilm_flags & ILM_DELETED) {
15824 			ilmp = &ilm->ilm_next;
15825 			continue;
15826 		}
15827 
15828 		new_ilm = ilm_lookup_ill_index_v6(to_ill, &ilm->ilm_v6addr,
15829 		    ilm->ilm_orig_ifindex, ilm->ilm_zoneid);
15830 		ASSERT(ilm->ilm_orig_ifindex != 0);
15831 		if (ilm->ilm_orig_ifindex == ifindex) {
15832 			/*
15833 			 * We are failing back multicast memberships.
15834 			 * If the same ilm exists in to_ill, it means somebody
15835 			 * has joined the same group there e.g. ff02::1
15836 			 * is joined within the kernel when the interfaces
15837 			 * came UP.
15838 			 */
15839 			ASSERT(ilm->ilm_ipif == NULL);
15840 			if (new_ilm != NULL) {
15841 				new_ilm->ilm_refcnt += ilm->ilm_refcnt;
15842 				if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE ||
15843 				    !SLIST_IS_EMPTY(new_ilm->ilm_filter)) {
15844 					new_ilm->ilm_is_new = B_TRUE;
15845 				}
15846 			} else {
15847 				/*
15848 				 * check if we can just move the ilm
15849 				 */
15850 				if (from_ill->ill_ilm_walker_cnt != 0) {
15851 					/*
15852 					 * We have walkers, so we cannot move
15853 					 * the ilm. Allocate a new ilm instead;
15854 					 * this (old) ilm will be marked
15855 					 * ILM_DELETED at the end of the loop
15856 					 * and will be freed when the
15857 					 * last walker exits.
15858 					 */
15859 					new_ilm = (ilm_t *)mi_zalloc
15860 					    (sizeof (ilm_t));
15861 					if (new_ilm == NULL) {
15862 						ip0dbg(("ilm_move_v6: "
15863 						    "FAILBACK of IPv6"
15864 						    " multicast address %s : "
15865 						    "from %s to"
15866 						    " %s failed : ENOMEM \n",
15867 						    inet_ntop(AF_INET6,
15868 						    &ilm->ilm_v6addr, buf,
15869 						    sizeof (buf)),
15870 						    from_ill->ill_name,
15871 						    to_ill->ill_name));
15872 
15873 						ilmp = &ilm->ilm_next;
15874 						continue;
15875 					}
15876 					*new_ilm = *ilm;
15877 					/*
15878 					 * we don't want new_ilm linked to
15879 					 * ilm's filter list.
15880 					 */
15881 					new_ilm->ilm_filter = NULL;
15882 				} else {
15883 					/*
15884 					 * No walkers, so we can move the ilm;
15885 					 * let's take it out of the list.
15886 					 */
15887 					*ilmp = ilm->ilm_next;
15888 					ilm->ilm_next = NULL;
15889 					new_ilm = ilm;
15890 				}
15891 
15892 				/*
15893 				 * If this is the first ilm for the group,
15894 				 * set ilm_notify_driver so that we notify the
15895 				 * driver in ilm_send_multicast_reqs.
15896 				 */
15897 				if (ilm_lookup_ill_v6(to_ill,
15898 				    &new_ilm->ilm_v6addr, ALL_ZONES) == NULL)
15899 					new_ilm->ilm_notify_driver = B_TRUE;
15900 
15901 				new_ilm->ilm_ill = to_ill;
15902 				/* Add to the to_ill's list */
15903 				new_ilm->ilm_next = to_ill->ill_ilm;
15904 				to_ill->ill_ilm = new_ilm;
15905 				/*
15906 				 * set the flag so that mld_joingroup is
15907 				 * called in ilm_send_multicast_reqs().
15908 				 */
15909 				new_ilm->ilm_is_new = B_TRUE;
15910 			}
15911 			goto bottom;
15912 		} else if (ifindex != 0) {
15913 			/*
15914 			 * If this is FAILBACK (ifindex != 0) and the ifindex
15915 			 * has not matched above, look at the next ilm.
15916 			 */
15917 			ilmp = &ilm->ilm_next;
15918 			continue;
15919 		}
15920 		/*
15921 		 * If we are here, it means ifindex is 0. Failover
15922 		 * everything.
15923 		 *
15924 		 * We need to handle solicited node mcast address
15925 		 * and all_nodes mcast address differently as they
15926 		 * are joined within the kernel (ipif_multicast_up)
15927 		 * and potentially from the userland. We are called
15928 		 * after the ipifs of from_ill has been moved.
15929 		 * If we still find ilms on ill with solicited node
15930 		 * mcast address or all_nodes mcast address, it must
15931 		 * belong to the UP interface that has not moved e.g.
15932 		 * ipif_id 0 with the link local prefix does not move.
15933 		 * We join this on the new ill accounting for all the
15934 		 * userland memberships so that applications don't
15935 		 * see any failure.
15936 		 *
15937 		 * We need to make sure that we account only for the
15938 		 * solicited node and all_nodes multicast addresses
15939 		 * that were brought UP on these ills. In the case of
15940 		 * a failover from A to B, we might have ilms belonging
15941 		 * to A (ilm_orig_ifindex pointing at A) on B accounting
15942 		 * for the membership from the userland. If we are failing
15943 		 * over from B to C now, we will find the ones belonging
15944 		 * to A on B. These don't account for the ill_ipif_up_count.
15945 		 * They just move from B to C. The check below on
15946 		 * ilm_orig_ifindex ensures that.
15947 		 */
15948 		if ((ilm->ilm_orig_ifindex ==
15949 		    from_ill->ill_phyint->phyint_ifindex) &&
15950 		    (IN6_ARE_ADDR_EQUAL(&ipv6_snm, &ilm->ilm_v6addr) ||
15951 		    IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast,
15952 		    &ilm->ilm_v6addr))) {
15953 			ASSERT(ilm->ilm_refcnt > 0);
15954 			count = ilm->ilm_refcnt - from_ill->ill_ipif_up_count;
15955 			/*
15956 			 * For indentation reasons, we are not using an
15957 			 * "else" here.
15958 			 */
15959 			if (count == 0) {
15960 				ilmp = &ilm->ilm_next;
15961 				continue;
15962 			}
15963 			ilm->ilm_refcnt -= count;
15964 			if (new_ilm != NULL) {
15965 				/*
15966 				 * We can find one with the same
15967 				 * ilm_orig_ifindex, if we are failing
15968 				 * over to a STANDBY. This happens
15969 				 * when somebody wants to join a group
15970 				 * on a STANDBY interface and we
15971 				 * internally join on a different one.
15972 				 * If we had joined on from_ill, then a
15973 				 * failover now will find a new ilm
15974 				 * with this index.
15975 				 */
15976 				ip1dbg(("ilm_move_v6: FAILOVER, found"
15977 				    " new ilm on %s, group address %s\n",
15978 				    to_ill->ill_name,
15979 				    inet_ntop(AF_INET6,
15980 				    &ilm->ilm_v6addr, buf,
15981 				    sizeof (buf))));
15982 				new_ilm->ilm_refcnt += count;
15983 				if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE ||
15984 				    !SLIST_IS_EMPTY(new_ilm->ilm_filter)) {
15985 					new_ilm->ilm_is_new = B_TRUE;
15986 				}
15987 			} else {
15988 				new_ilm = (ilm_t *)mi_zalloc(sizeof (ilm_t));
15989 				if (new_ilm == NULL) {
15990 					ip0dbg(("ilm_move_v6: FAILOVER of IPv6"
15991 					    " multicast address %s : from %s to"
15992 					    " %s failed : ENOMEM \n",
15993 					    inet_ntop(AF_INET6,
15994 					    &ilm->ilm_v6addr, buf,
15995 					    sizeof (buf)), from_ill->ill_name,
15996 					    to_ill->ill_name));
15997 					ilmp = &ilm->ilm_next;
15998 					continue;
15999 				}
16000 				*new_ilm = *ilm;
16001 				new_ilm->ilm_filter = NULL;
16002 				new_ilm->ilm_refcnt = count;
16003 				new_ilm->ilm_timer = INFINITY;
16004 				new_ilm->ilm_rtx.rtx_timer = INFINITY;
16005 				new_ilm->ilm_is_new = B_TRUE;
16006 				/*
16007 				 * If the to_ill has not joined this
16008 				 * group we need to tell the driver in
16009 				 * ilm_send_multicast_reqs.
16010 				 */
16011 				if (ilm_lookup_ill_v6(to_ill,
16012 				    &new_ilm->ilm_v6addr, ALL_ZONES) == NULL)
16013 					new_ilm->ilm_notify_driver = B_TRUE;
16014 
16015 				new_ilm->ilm_ill = to_ill;
16016 				/* Add to the to_ill's list */
16017 				new_ilm->ilm_next = to_ill->ill_ilm;
16018 				to_ill->ill_ilm = new_ilm;
16019 				ASSERT(new_ilm->ilm_ipif == NULL);
16020 			}
16021 			if (ilm->ilm_refcnt == 0) {
16022 				goto bottom;
16023 			} else {
16024 				new_ilm->ilm_fmode = MODE_IS_EXCLUDE;
16025 				CLEAR_SLIST(new_ilm->ilm_filter);
16026 				ilmp = &ilm->ilm_next;
16027 			}
16028 			continue;
16029 		} else {
16030 			/*
16031 			 * ifindex = 0 means move everything pointing at
16032 			 * from_ill. We are doing this because the ill has
16033 			 * either FAILED or become INACTIVE.
16034 			 *
16035 			 * As we would like to move things later back to
16036 			 * from_ill, we want to retain the identity of this
16037 			 * ilm. Thus, we don't blindly increment the reference
16038 			 * count on the ilms matching the address alone. We
16039 			 * need to match on ilm_orig_ifindex also. new_ilm
16040 			 * was obtained by matching ilm_orig_ifindex also.
16041 			 */
16042 			if (new_ilm != NULL) {
16043 				/*
16044 				 * This is possible only if a previous restore
16045 				 * was incomplete, i.e. a restore to
16046 				 * ilm_orig_ifindex left some ilms because
16047 				 * of some failures. Thus when we are failing
16048 				 * again, we might find our old friends there.
16049 				 */
16050 				ip1dbg(("ilm_move_v6: FAILOVER, found new ilm"
16051 				    " on %s, group address %s\n",
16052 				    to_ill->ill_name,
16053 				    inet_ntop(AF_INET6,
16054 				    &ilm->ilm_v6addr, buf,
16055 				    sizeof (buf))));
16056 				new_ilm->ilm_refcnt += ilm->ilm_refcnt;
16057 				if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE ||
16058 				    !SLIST_IS_EMPTY(new_ilm->ilm_filter)) {
16059 					new_ilm->ilm_is_new = B_TRUE;
16060 				}
16061 			} else {
16062 				if (from_ill->ill_ilm_walker_cnt != 0) {
16063 					new_ilm = (ilm_t *)
16064 					    mi_zalloc(sizeof (ilm_t));
16065 					if (new_ilm == NULL) {
16066 						ip0dbg(("ilm_move_v6: "
16067 						    "FAILOVER of IPv6"
16068 						    " multicast address %s : "
16069 						    "from %s to"
16070 						    " %s failed : ENOMEM \n",
16071 						    inet_ntop(AF_INET6,
16072 						    &ilm->ilm_v6addr, buf,
16073 						    sizeof (buf)),
16074 						    from_ill->ill_name,
16075 						    to_ill->ill_name));
16076 
16077 						ilmp = &ilm->ilm_next;
16078 						continue;
16079 					}
16080 					*new_ilm = *ilm;
16081 					new_ilm->ilm_filter = NULL;
16082 				} else {
16083 					*ilmp = ilm->ilm_next;
16084 					new_ilm = ilm;
16085 				}
16086 				/*
16087 				 * If the to_ill has not joined this
16088 				 * group we need to tell the driver in
16089 				 * ilm_send_multicast_reqs.
16090 				 */
16091 				if (ilm_lookup_ill_v6(to_ill,
16092 				    &new_ilm->ilm_v6addr, ALL_ZONES) == NULL)
16093 					new_ilm->ilm_notify_driver = B_TRUE;
16094 
16095 				/* Add to the to_ill's list */
16096 				new_ilm->ilm_next = to_ill->ill_ilm;
16097 				to_ill->ill_ilm = new_ilm;
16098 				ASSERT(ilm->ilm_ipif == NULL);
16099 				new_ilm->ilm_ill = to_ill;
16100 				new_ilm->ilm_is_new = B_TRUE;
16101 			}
16103 		}
16104 
16105 bottom:
16106 		/*
16107 		 * Revert multicast filter state to (EXCLUDE, NULL).
16108 		 * new_ilm->ilm_is_new should already be set if needed.
16109 		 */
16110 		new_ilm->ilm_fmode = MODE_IS_EXCLUDE;
16111 		CLEAR_SLIST(new_ilm->ilm_filter);
16112 		/*
16113 		 * We allocated/got a new ilm, free the old one.
16114 		 */
16115 		if (new_ilm != ilm) {
16116 			if (from_ill->ill_ilm_walker_cnt == 0) {
16117 				*ilmp = ilm->ilm_next;
16118 				ilm->ilm_next = NULL;
16119 				FREE_SLIST(ilm->ilm_filter);
16120 				FREE_SLIST(ilm->ilm_pendsrcs);
16121 				FREE_SLIST(ilm->ilm_rtx.rtx_allow);
16122 				FREE_SLIST(ilm->ilm_rtx.rtx_block);
16123 				mi_free((char *)ilm);
16124 			} else {
16125 				ilm->ilm_flags |= ILM_DELETED;
16126 				from_ill->ill_ilm_cleanup_reqd = 1;
16127 				ilmp = &ilm->ilm_next;
16128 			}
16129 		}
16130 	}
16131 }
16132 
16133 /*
16134  * Move all the multicast memberships to to_ill. Called when
16135  * an ipif moves from "from_ill" to "to_ill". This function is slightly
16136  * different from the IPv6 counterpart, as multicast memberships are
16137  * associated with ills in IPv6. This function is called after every ipif
16138  * is moved, unlike in IPv6, where it is called only once.
16139  */
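/*
 * A sketch of the loop below: the INADDR_ALLHOSTS_GROUP membership that
 * was joined in ipif_multicast_up is dropped here (it is re-joined when
 * the ipif comes up on the new ill); every other ilm on 'ipif' is either
 * moved outright or, when walkers hold from_ill's ilm list, copied into a
 * freshly allocated ilm and the original marked ILM_DELETED.
 */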
16140 static void
16141 ilm_move_v4(ill_t *from_ill, ill_t *to_ill, ipif_t *ipif)
16142 {
16143 	ilm_t	*ilm;
16144 	ilm_t	*ilm_next;
16145 	ilm_t	*new_ilm;
16146 	ilm_t	**ilmp;
16147 
16148 	ASSERT(MUTEX_HELD(&to_ill->ill_lock));
16149 	ASSERT(MUTEX_HELD(&from_ill->ill_lock));
16150 	ASSERT(RW_WRITE_HELD(&ill_g_lock));
16151 
16152 	ilmp = &from_ill->ill_ilm;
16153 	for (ilm = from_ill->ill_ilm; ilm != NULL; ilm = ilm_next) {
16154 		ilm_next = ilm->ilm_next;
16155 
16156 		if (ilm->ilm_flags & ILM_DELETED) {
16157 			ilmp = &ilm->ilm_next;
16158 			continue;
16159 		}
16160 
16161 		ASSERT(ilm->ilm_ipif != NULL);
16162 
16163 		if (ilm->ilm_ipif != ipif) {
16164 			ilmp = &ilm->ilm_next;
16165 			continue;
16166 		}
16167 
16168 		if (V4_PART_OF_V6(ilm->ilm_v6addr) ==
16169 		    htonl(INADDR_ALLHOSTS_GROUP)) {
16170 			/*
16171 			 * We joined this in ipif_multicast_up
16172 			 * and we never did an ipif_multicast_down
16173 			 * for IPv4. If nobody else from the userland
16174 			 * has a reference, we free the ilm, and later
16175 			 * when this ipif comes up on the new ill,
16176 			 * we will join this again.
16177 			 */
16178 			if (--ilm->ilm_refcnt == 0)
16179 				goto delete_ilm;
16180 
16181 			new_ilm = ilm_lookup_ipif(ipif,
16182 			    V4_PART_OF_V6(ilm->ilm_v6addr));
16183 			if (new_ilm != NULL) {
16184 				new_ilm->ilm_refcnt += ilm->ilm_refcnt;
16185 				/*
16186 				 * We still need to deal with the from_ill.
16187 				 */
16188 				new_ilm->ilm_is_new = B_TRUE;
16189 				new_ilm->ilm_fmode = MODE_IS_EXCLUDE;
16190 				CLEAR_SLIST(new_ilm->ilm_filter);
16191 				goto delete_ilm;
16192 			}
16193 			/*
16194 			 * If we could not find one, e.g. the ipif is
16195 			 * still down on to_ill, we add this ilm
16196 			 * on the new ill to preserve the reference
16197 			 * count.
16198 			 */
16199 		}
16200 		/*
16201 		 * When ipifs move, ilms always move with them
16202 		 * to the NEW ill. Thus we should never be
16203 		 * able to find the ilm until we really move it here.
16204 		 */
16205 		ASSERT(ilm_lookup_ipif(ipif,
16206 		    V4_PART_OF_V6(ilm->ilm_v6addr)) == NULL);
16207 
16208 		if (from_ill->ill_ilm_walker_cnt != 0) {
16209 			new_ilm = (ilm_t *)mi_zalloc(sizeof (ilm_t));
16210 			if (new_ilm == NULL) {
16211 				char buf[INET6_ADDRSTRLEN];
16212 				ip0dbg(("ilm_move_v4: FAILBACK of IPv4"
16213 				    " multicast address %s : "
16214 				    "from %s to"
16215 				    " %s failed : ENOMEM \n",
16216 				    inet_ntop(AF_INET,
16217 				    &ilm->ilm_v6addr, buf,
16218 				    sizeof (buf)),
16219 				    from_ill->ill_name,
16220 				    to_ill->ill_name));
16221 
16222 				ilmp = &ilm->ilm_next;
16223 				continue;
16224 			}
16225 			*new_ilm = *ilm;
16226 			/* We don't want new_ilm linked to ilm's filter list */
16227 			new_ilm->ilm_filter = NULL;
16228 		} else {
16229 			/* Remove from the list */
16230 			*ilmp = ilm->ilm_next;
16231 			new_ilm = ilm;
16232 		}
16233 
16234 		/*
16235 		 * If we have never joined this group on the to_ill
16236 		 * make sure we tell the driver.
16237 		 */
16238 		if (ilm_lookup_ill_v6(to_ill, &new_ilm->ilm_v6addr,
16239 		    ALL_ZONES) == NULL)
16240 			new_ilm->ilm_notify_driver = B_TRUE;
16241 
16242 		/* Add to the to_ill's list */
16243 		new_ilm->ilm_next = to_ill->ill_ilm;
16244 		to_ill->ill_ilm = new_ilm;
16245 		new_ilm->ilm_is_new = B_TRUE;
16246 
16247 		/*
16248 		 * Revert multicast filter state to (EXCLUDE, NULL)
16249 		 */
16250 		new_ilm->ilm_fmode = MODE_IS_EXCLUDE;
16251 		CLEAR_SLIST(new_ilm->ilm_filter);
16252 
16253 		/*
16254 		 * Delete only if we have allocated a new ilm.
16255 		 */
16256 		if (new_ilm != ilm) {
16257 delete_ilm:
16258 			if (from_ill->ill_ilm_walker_cnt == 0) {
16259 				/* Remove from the list */
16260 				*ilmp = ilm->ilm_next;
16261 				ilm->ilm_next = NULL;
16262 				FREE_SLIST(ilm->ilm_filter);
16263 				FREE_SLIST(ilm->ilm_pendsrcs);
16264 				FREE_SLIST(ilm->ilm_rtx.rtx_allow);
16265 				FREE_SLIST(ilm->ilm_rtx.rtx_block);
16266 				mi_free((char *)ilm);
16267 			} else {
16268 				ilm->ilm_flags |= ILM_DELETED;
16269 				from_ill->ill_ilm_cleanup_reqd = 1;
16270 				ilmp = &ilm->ilm_next;
16271 			}
16272 		}
16273 	}
16274 }
16275 
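/*
 * Find a logical interface id to use on 'ill'. If 'id' is non-zero (the
 * failback case) and is not already in use, return it so that the
 * original configuration is restored; otherwise fall through and return
 * the smallest unused id. For example, with ipifs 0, 1 and 3 plumbed and
 * a zero hint, this returns 2.
 */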
16276 static uint_t
16277 ipif_get_id(ill_t *ill, uint_t id)
16278 {
16279 	uint_t	unit;
16280 	ipif_t	*tipif;
16281 	boolean_t found = B_FALSE;
16282 
16283 	/*
16284 	 * During failback, we want to go back to the same id
16285 	 * instead of the smallest id so that the original
16286 	 * configuration is maintained. id is non-zero in that
16287 	 * case.
16288 	 */
16289 	if (id != 0) {
16290 		/*
16291 		 * While failing back, if we still have an ipif with
16292 		 * MAX_ADDRS_PER_IF, it means this will be replaced
16293 		 * as soon as we return from this function. It was
16294 		 * set to MAX_ADDRS_PER_IF by the caller so that
16295 		 * we can choose the smallest id. Thus we return zero
16296 		 * in that case ignoring the hint.
16297 		 */
16298 		if (ill->ill_ipif->ipif_id == MAX_ADDRS_PER_IF)
16299 			return (0);
16300 		for (tipif = ill->ill_ipif; tipif != NULL;
16301 		    tipif = tipif->ipif_next) {
16302 			if (tipif->ipif_id == id) {
16303 				found = B_TRUE;
16304 				break;
16305 			}
16306 		}
16307 		/*
16308 		 * If nobody already plumbed another logical
16309 		 * with the same id, the id is free to reuse.
16310 		 */
16311 		if (!found)
16312 			return (id);
16313 	}
16314 	for (unit = 0; unit <= ip_addrs_per_if; unit++) {
16315 		found = B_FALSE;
16316 		for (tipif = ill->ill_ipif; tipif != NULL;
16317 		    tipif = tipif->ipif_next) {
16318 			if (tipif->ipif_id == unit) {
16319 				found = B_TRUE;
16320 				break;
16321 			}
16322 		}
16323 		if (!found)
16324 			break;
16325 	}
16326 	return (unit);
16327 }
16328 
16329 /* ARGSUSED */
16330 static int
16331 ipif_move(ipif_t *ipif, ill_t *to_ill, queue_t *q, mblk_t *mp,
16332     ipif_t **rep_ipif_ptr)
16333 {
16334 	ill_t	*from_ill;
16335 	ipif_t	*rep_ipif;
16336 	ipif_t	**ipifp;
16337 	uint_t	unit;
16338 	int err = 0;
16339 	ipif_t	*to_ipif;
16340 	struct iocblk	*iocp;
16341 	boolean_t failback_cmd;
16342 	boolean_t remove_ipif;
16343 	int	rc;
16344 
16345 	ASSERT(IAM_WRITER_ILL(to_ill));
16346 	ASSERT(IAM_WRITER_IPIF(ipif));
16347 
16348 	iocp = (struct iocblk *)mp->b_rptr;
16349 	failback_cmd = (iocp->ioc_cmd == SIOCLIFFAILBACK);
16350 	remove_ipif = B_FALSE;
16351 
16352 	from_ill = ipif->ipif_ill;
16353 
16354 	ASSERT(MUTEX_HELD(&to_ill->ill_lock));
16355 	ASSERT(MUTEX_HELD(&from_ill->ill_lock));
16356 	ASSERT(RW_WRITE_HELD(&ill_g_lock));
16357 
16358 	/*
16359 	 * Don't move LINK LOCAL addresses as they are tied to
16360 	 * physical interface.
16361 	 * the physical interface.
16362 	if (from_ill->ill_isv6 &&
16363 	    IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr)) {
16364 		ipif->ipif_was_up = B_FALSE;
16365 		IPIF_UNMARK_MOVING(ipif);
16366 		return (0);
16367 	}
16368 
16369 	/*
16370 	 * We set the ipif_id to maximum so that the search for
16371 	 * ipif_id will pick the lowest number, i.e. 0, in the
16372 	 * following 2 cases :
16373 	 *
16374 	 * 1) We have a replacement ipif at the head of to_ill.
16375 	 *    We can't remove it yet as we can exceed ip_addrs_per_if
16376 	 *    on to_ill and hence the MOVE might fail. We want to
16377 	 *    remove it only if we could move the ipif. Thus, by
16378 	 *    setting it to the MAX value, we make the search in
16379 	 *    ipif_get_id return the zeroth id.
16380 	 *
16381 	 * 2) When DR pulls out the NIC and re-plumbs the interface,
16382 	 *    we might just have a zero address plumbed on the ipif
16383 	 *    with zero id in the case of IPv4. We remove that while
16384 	 *    doing the failback. We want to remove it only if we
16385 	 *    could move the ipif. Thus, by setting it to the MAX
16386 	 *    value, we make the search in ipif_get_id return the
16387 	 *    zeroth id.
16388 	 *
16389 	 * Both (1) and (2) are done only when we are moving
16390 	 * an ipif (either due to failover/failback) which originally
16391 	 * belonged to this interface i.e the ipif_orig_ifindex is
16392 	 * the same as to_ill's ifindex. This is needed so that
16393 	 * FAILOVER from A -> B ( A failed) followed by FAILOVER
16394 	 * from B -> A (B is being removed from the group) and
16395 	 * FAILBACK from A -> B restores the original configuration.
16396 	 * Without the check for orig_ifindex, the second FAILOVER
16397 	 * could make the ipif belonging to B replace the A's zeroth
16398 	 * ipif and the subsequent failback re-creating the replacement
16399 	 * ipif again.
16400 	 *
16401 	 * NOTE : We created the replacement ipif when we did a
16402 	 * FAILOVER (See below). We could check for FAILBACK and
16403 	 * then look for replacement ipif to be removed. But we don't
16404 	 * want to do that because we want to allow the possibility
16405 	 * of a FAILOVER from A -> B (which creates the replacement ipif),
16406 	 * followed by a *FAILOVER* from B -> A instead of a FAILBACK
16407 	 * from B -> A.
16408 	 */
16409 	to_ipif = to_ill->ill_ipif;
16410 	if ((to_ill->ill_phyint->phyint_ifindex ==
16411 	    ipif->ipif_orig_ifindex) &&
16412 	    IPIF_REPL_CHECK(to_ipif, failback_cmd)) {
16413 		ASSERT(to_ipif->ipif_id == 0);
16414 		remove_ipif = B_TRUE;
16415 		to_ipif->ipif_id = MAX_ADDRS_PER_IF;
16416 	}
16417 	/*
16418 	 * Find the lowest logical unit number on the to_ill.
16419 	 * If we are failing back, try to get the original id
16420 	 * rather than the lowest one so that the original
16421 	 * configuration is maintained.
16422 	 *
16423 	 * XXX need a better scheme for this.
16424 	 */
16425 	if (failback_cmd) {
16426 		unit = ipif_get_id(to_ill, ipif->ipif_orig_ipifid);
16427 	} else {
16428 		unit = ipif_get_id(to_ill, 0);
16429 	}
16430 
16431 	/* Reset back to zero in case we fail below */
16432 	if (to_ipif->ipif_id == MAX_ADDRS_PER_IF)
16433 		to_ipif->ipif_id = 0;
16434 
16435 	if (unit == ip_addrs_per_if) {
16436 		ipif->ipif_was_up = B_FALSE;
16437 		IPIF_UNMARK_MOVING(ipif);
16438 		return (EINVAL);
16439 	}
16440 
16441 	/*
16442 	 * ipif is ready to move from "from_ill" to "to_ill".
16443 	 *
16444 	 * 1) If we are moving ipif with id zero, create a
16445 	 *    replacement ipif for this ipif on from_ill. If this fails
16446 	 *    fail the MOVE operation.
16447 	 *
16448 	 * 2) Remove the replacement ipif on to_ill if any.
16449 	 *    We could remove the replacement ipif when we are moving
16450 	 *    the ipif with id zero. But what if somebody already
16451 	 *    unplumbed it ? Thus we always remove it if it is present.
16452 	 *    We want to do it only if we are sure we are going to
16453 	 *    move the ipif to to_ill which is why there are no
16454 	 *    returns due to error till ipif is linked to to_ill.
16455 	 *    Note that the first ipif that we failback will always
16456 	 *    be zero if it is present.
16457 	 */
16458 	if (ipif->ipif_id == 0) {
16459 		ipaddr_t inaddr_any = INADDR_ANY;
16460 
16461 		rep_ipif = (ipif_t *)mi_alloc(sizeof (ipif_t), BPRI_MED);
16462 		if (rep_ipif == NULL) {
16463 			ipif->ipif_was_up = B_FALSE;
16464 			IPIF_UNMARK_MOVING(ipif);
16465 			return (ENOMEM);
16466 		}
16467 		*rep_ipif = ipif_zero;
16468 		/*
16469 		 * Before we put the ipif on the list, store the addresses
16470 		 * as mapped addresses, as some of the ioctls, e.g. SIOCGIFADDR,
16471 		 * assume so. This logic is no different from what
16472 		 * ipif_allocate does.
16473 		 */
16474 		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
16475 		    &rep_ipif->ipif_v6lcl_addr);
16476 		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
16477 		    &rep_ipif->ipif_v6src_addr);
16478 		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
16479 		    &rep_ipif->ipif_v6subnet);
16480 		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
16481 		    &rep_ipif->ipif_v6net_mask);
16482 		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
16483 		    &rep_ipif->ipif_v6brd_addr);
16484 		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
16485 		    &rep_ipif->ipif_v6pp_dst_addr);
16486 		/*
16487 		 * We mark IPIF_NOFAILOVER so that this can never
16488 		 * move.
16489 		 */
16490 		rep_ipif->ipif_flags = ipif->ipif_flags | IPIF_NOFAILOVER;
16491 		rep_ipif->ipif_flags &= ~IPIF_UP;
16492 		rep_ipif->ipif_replace_zero = B_TRUE;
16493 		mutex_init(&rep_ipif->ipif_saved_ire_lock, NULL,
16494 		    MUTEX_DEFAULT, NULL);
16495 		rep_ipif->ipif_id = 0;
16496 		rep_ipif->ipif_ire_type = ipif->ipif_ire_type;
16497 		rep_ipif->ipif_ill = from_ill;
16498 		rep_ipif->ipif_orig_ifindex =
16499 		    from_ill->ill_phyint->phyint_ifindex;
16500 		/* Insert at head */
16501 		rep_ipif->ipif_next = from_ill->ill_ipif;
16502 		from_ill->ill_ipif = rep_ipif;
16503 		/*
16504 		 * We don't really care to let apps know about
16505 		 * this interface.
16506 		 */
16507 	}
16508 
16509 	if (remove_ipif) {
16510 		/*
16511 		 * We set to a max value above for this case to get
16512 		 * id zero. ASSERT that we did get one.
16513 		 */
16514 		ASSERT((to_ipif->ipif_id == 0) && (unit == 0));
16515 		rep_ipif = to_ipif;
16516 		to_ill->ill_ipif = rep_ipif->ipif_next;
16517 		rep_ipif->ipif_next = NULL;
16518 		/*
16519 		 * If some apps scanned and found this interface,
16520 		 * it is time to let them know, so that they can
16521 		 * delete it.
16522 		 */
16523 
16524 		*rep_ipif_ptr = rep_ipif;
16525 	}
16526 
16527 	/* Get it out of the ILL interface list. */
16528 	ipifp = &ipif->ipif_ill->ill_ipif;
16529 	for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) {
16530 		if (*ipifp == ipif) {
16531 			*ipifp = ipif->ipif_next;
16532 			break;
16533 		}
16534 	}
16535 
16536 	/* Assign the new ill */
16537 	ipif->ipif_ill = to_ill;
16538 	ipif->ipif_id = unit;
16539 	/* id has already been checked */
16540 	rc = ipif_insert(ipif, B_FALSE, B_FALSE);
16541 	ASSERT(rc == 0);
16542 	/* Let SCTP update its list */
16543 	sctp_move_ipif(ipif, from_ill, to_ill);
16544 	/*
16545 	 * Handle the failover and failback of ipif_t between
16546 	 * ill_t that have differing maximum mtu values.
16547 	 */
16548 	if (ipif->ipif_mtu > to_ill->ill_max_mtu) {
16549 		if (ipif->ipif_saved_mtu == 0) {
16550 			/*
16551 			 * As this ipif_t is moving to an ill_t
16552 			 * that has a lower ill_max_mtu, its
16553 			 * ipif_mtu needs to be saved so it can
16554 			 * be restored during failback or during
16555 			 * failover to an ill_t which has a
16556 			 * higher ill_max_mtu.
16557 			 */
16558 			ipif->ipif_saved_mtu = ipif->ipif_mtu;
16559 			ipif->ipif_mtu = to_ill->ill_max_mtu;
16560 		} else {
16561 			/*
16562 			 * The ipif_t is, once again, moving to
16563 			 * an ill_t that has a lower maximum mtu
16564 			 * value.
16565 			 */
16566 			ipif->ipif_mtu = to_ill->ill_max_mtu;
16567 		}
16568 	} else if (ipif->ipif_mtu < to_ill->ill_max_mtu &&
16569 	    ipif->ipif_saved_mtu != 0) {
16570 		/*
16571 		 * The mtu of this ipif_t had to be reduced
16572 		 * during an earlier failover; this is an
16573 		 * opportunity for it to be increased (either as
16574 		 * part of another failover or a failback).
16575 		 */
16576 		if (ipif->ipif_saved_mtu <= to_ill->ill_max_mtu) {
16577 			ipif->ipif_mtu = ipif->ipif_saved_mtu;
16578 			ipif->ipif_saved_mtu = 0;
16579 		} else {
16580 			ipif->ipif_mtu = to_ill->ill_max_mtu;
16581 		}
16582 	}
16583 
16584 	/*
16585 	 * We preserve all the other fields of the ipif including
16586 	 * ipif_saved_ire_mp. The routes that are saved here will
16587 	 * be recreated on the new interface and back on the old
16588 	 * interface when we move back.
16589 	 */
16590 	ASSERT(ipif->ipif_arp_del_mp == NULL);
16591 
16592 	return (err);
16593 }
16594 
16595 static int
16596 ipif_move_all(ill_t *from_ill, ill_t *to_ill, queue_t *q, mblk_t *mp,
16597     int ifindex, ipif_t **rep_ipif_ptr)
16598 {
16599 	ipif_t *mipif;
16600 	ipif_t *ipif_next;
16601 	int err;
16602 
16603 	/*
16604 	 * We don't really try to MOVE back things if some of the
16605 	 * operations fail. The daemon will take care of moving again
16606 	 * later on.
16607 	 */
16608 	for (mipif = from_ill->ill_ipif; mipif != NULL; mipif = ipif_next) {
16609 		ipif_next = mipif->ipif_next;
16610 		if (!(mipif->ipif_flags & IPIF_NOFAILOVER) &&
16611 		    (ifindex == 0 || ifindex == mipif->ipif_orig_ifindex)) {
16612 
16613 			err = ipif_move(mipif, to_ill, q, mp, rep_ipif_ptr);
16614 
16615 			/*
16616 			 * When the MOVE fails, it is the job of the
16617 			 * application to take care of this properly
16618 			 * i.e. try again if it is ENOMEM.
16619 			 */
16620 			if (mipif->ipif_ill != from_ill) {
16621 				/*
16622 				 * ipif has moved.
16623 				 *
16624 				 * Move the multicast memberships associated
16625 				 * with this ipif to the new ill. For IPv6, we
16626 				 * do it once after all the ipifs are moved
16627 				 * (in ill_move) as they are not associated
16628 				 * with ipifs.
16629 				 *
16630 				 * We need to move the ilms as the ipif has
16631 				 * already been moved to a new ill even
16632 				 * in the case of errors. If we don't move
16633 				 * them now, neither ilm_free(ipif) will find
16634 				 * the ilm when somebody unplumbs this ipif,
16635 				 * nor will ilm_delete(ilm) be able to find
16636 				 * the ilm.
16637 				 */
16638 				if (!from_ill->ill_isv6)
16639 					ilm_move_v4(from_ill, to_ill, mipif);
16640 			}
16641 
16642 			if (err != 0)
16643 				return (err);
16644 		}
16645 	}
16646 	return (0);
16647 }
16648 
16649 static int
16650 ill_move(ill_t *from_ill, ill_t *to_ill, queue_t *q, mblk_t *mp)
16651 {
16652 	int ifindex;
16653 	int err;
16654 	struct iocblk	*iocp;
16655 	ipif_t	*ipif;
16656 	ipif_t *rep_ipif_ptr = NULL;
16657 	ipif_t	*from_ipif = NULL;
16658 	boolean_t check_rep_if = B_FALSE;
16659 
16660 	iocp = (struct iocblk *)mp->b_rptr;
16661 	if (iocp->ioc_cmd == SIOCLIFFAILOVER) {
16662 		/*
16663 		 * Move everything pointing at from_ill to to_ill.
16664 		 * We achieve this by passing in 0 as ifindex.
16665 		 */
16666 		ifindex = 0;
16667 	} else {
16668 		/*
16669 		 * Move everything pointing at from_ill whose original
16670 		 * ifindex (of the connp, ipif or ilm) points at to_ill's ifindex.
16671 		 * We achieve this by passing in ifindex rather than 0.
16672 		 * Multicast vifs, ilgs move implicitly because ipifs move.
16673 		 */
16674 		ASSERT(iocp->ioc_cmd == SIOCLIFFAILBACK);
16675 		ifindex = to_ill->ill_phyint->phyint_ifindex;
16676 	}
16677 
16678 	/*
16679 	 * Determine if there is at least one ipif that would move from
16680 	 * 'from_ill' to 'to_ill'. If so, it is possible that the replacement
16681 	 * ipif (if it exists) on the to_ill would be consumed as a result of
16682 	 * the move, in which case we need to quiesce the replacement ipif also.
16683 	 */
16684 	for (from_ipif = from_ill->ill_ipif; from_ipif != NULL;
16685 	    from_ipif = from_ipif->ipif_next) {
16686 		if (((ifindex == 0) ||
16687 		    (ifindex == from_ipif->ipif_orig_ifindex)) &&
16688 		    !(from_ipif->ipif_flags & IPIF_NOFAILOVER)) {
16689 			check_rep_if = B_TRUE;
16690 			break;
16691 		}
16692 	}
16693 
16694 
16696 
16697 	GRAB_ILL_LOCKS(from_ill, to_ill);
16698 	if ((ipif = ill_quiescent_to_move(from_ill)) != NULL) {
16699 		(void) ipsq_pending_mp_add(NULL, ipif, q,
16700 		    mp, ILL_MOVE_OK);
16701 		RELEASE_ILL_LOCKS(from_ill, to_ill);
16702 		return (EINPROGRESS);
16703 	}
16704 
16705 	/* Check if the replacement ipif is quiescent to delete */
16706 	if (check_rep_if && IPIF_REPL_CHECK(to_ill->ill_ipif,
16707 	    (iocp->ioc_cmd == SIOCLIFFAILBACK))) {
16708 		to_ill->ill_ipif->ipif_state_flags |=
16709 		    IPIF_MOVING | IPIF_CHANGING;
16710 		if ((ipif = ill_quiescent_to_move(to_ill)) != NULL) {
16711 			(void) ipsq_pending_mp_add(NULL, ipif, q,
16712 			    mp, ILL_MOVE_OK);
16713 			RELEASE_ILL_LOCKS(from_ill, to_ill);
16714 			return (EINPROGRESS);
16715 		}
16716 	}
16717 	RELEASE_ILL_LOCKS(from_ill, to_ill);
16718 
16719 	ASSERT(!MUTEX_HELD(&to_ill->ill_lock));
16720 	rw_enter(&ill_g_lock, RW_WRITER);
16721 	GRAB_ILL_LOCKS(from_ill, to_ill);
16722 	err = ipif_move_all(from_ill, to_ill, q, mp, ifindex, &rep_ipif_ptr);
16723 
16724 	/* ilm_move is done inside ipif_move for IPv4 */
16725 	if (err == 0 && from_ill->ill_isv6)
16726 		ilm_move_v6(from_ill, to_ill, ifindex);
16727 
16728 	RELEASE_ILL_LOCKS(from_ill, to_ill);
16729 	rw_exit(&ill_g_lock);
16730 
16731 	/*
16732 	 * Send routing socket (rts) and multicast messages.
16733 	 */
16734 	if (rep_ipif_ptr != NULL) {
16735 		ip_rts_ifmsg(rep_ipif_ptr);
16736 		ip_rts_newaddrmsg(RTM_DELETE, 0, rep_ipif_ptr);
16737 		IPIF_TRACE_CLEANUP(rep_ipif_ptr);
16738 		mi_free(rep_ipif_ptr);
16739 	}
16740 
16741 	ilm_send_multicast_reqs(from_ill, to_ill);
16742 
16743 	conn_move_ill(from_ill, to_ill, ifindex);
16744 
16745 	return (err);
16746 }
16747 
16748 /*
16749  * Used to extract arguments for FAILOVER/FAILBACK ioctls.
16750  * Also checks for the validity of the arguments.
16751  * Note: We are already exclusive inside the from group.
16752  * It is up to the caller to release the refcnt on the to_ills.
16753  */
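/*
 * For illustration (a sketch of the expected request, not the only
 * consumer): a FAILOVER/FAILBACK ioctl is built in userland roughly as
 *
 *	(void) strlcpy(lifr.lifr_name, "hme0", sizeof (lifr.lifr_name));
 *	lifr.lifr_addr.ss_family = AF_UNSPEC;
 *	lifr.lifr_movetoindex = if_nametoindex("hme1");
 *	(void) ioctl(s, SIOCLIFFAILOVER, &lifr);
 *
 * where "hme0"/"hme1" are hypothetical interface names. The code below
 * parses exactly these fields: lifr_name picks the source ill, the
 * AF_UNSPEC family says the MOVE applies to both IPv4 and IPv6, and
 * lifr_movetoindex names the destination ill.
 */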
16754 static int
16755 ip_extract_move_args(queue_t *q, mblk_t *mp, ill_t **ill_from_v4,
16756     ill_t **ill_from_v6, ill_t **ill_to_v4, ill_t **ill_to_v6)
16757 {
16758 	int dst_index;
16759 	ipif_t *ipif_v4, *ipif_v6;
16760 	struct lifreq *lifr;
16761 	mblk_t *mp1;
16762 	boolean_t exists;
16763 	sin_t	*sin;
16764 	int	err = 0;
16765 
16766 	if ((mp1 = mp->b_cont) == NULL)
16767 		return (EPROTO);
16768 
16769 	if ((mp1 = mp1->b_cont) == NULL)
16770 		return (EPROTO);
16771 
16772 	lifr = (struct lifreq *)mp1->b_rptr;
16773 	sin = (sin_t *)&lifr->lifr_addr;
16774 
16775 	/*
16776 	 * We operate on both IPv4 and IPv6. Thus, we don't allow IPv4/IPv6
16777 	 * specific operations.
16778 	 */
16779 	if (sin->sin_family != AF_UNSPEC)
16780 		return (EINVAL);
16781 
16782 	/*
16783 	 * Get ipif with id 0. We are writer on the from ill. So we can pass
16784 	 * NULLs for the last 4 args and we know the lookup won't fail
16785 	 * with EINPROGRESS.
16786 	 */
16787 	ipif_v4 = ipif_lookup_on_name(lifr->lifr_name,
16788 	    mi_strlen(lifr->lifr_name), B_FALSE, &exists, B_FALSE,
16789 	    ALL_ZONES, NULL, NULL, NULL, NULL);
16790 	ipif_v6 = ipif_lookup_on_name(lifr->lifr_name,
16791 	    mi_strlen(lifr->lifr_name), B_FALSE, &exists, B_TRUE,
16792 	    ALL_ZONES, NULL, NULL, NULL, NULL);
16793 
16794 	if (ipif_v4 == NULL && ipif_v6 == NULL)
16795 		return (ENXIO);
16796 
16797 	if (ipif_v4 != NULL) {
16798 		ASSERT(ipif_v4->ipif_refcnt != 0);
16799 		if (ipif_v4->ipif_id != 0) {
16800 			err = EINVAL;
16801 			goto done;
16802 		}
16803 
16804 		ASSERT(IAM_WRITER_IPIF(ipif_v4));
16805 		*ill_from_v4 = ipif_v4->ipif_ill;
16806 	}
16807 
16808 	if (ipif_v6 != NULL) {
16809 		ASSERT(ipif_v6->ipif_refcnt != 0);
16810 		if (ipif_v6->ipif_id != 0) {
16811 			err = EINVAL;
16812 			goto done;
16813 		}
16814 
16815 		ASSERT(IAM_WRITER_IPIF(ipif_v6));
16816 		*ill_from_v6 = ipif_v6->ipif_ill;
16817 	}
16818 
16819 	err = 0;
16820 	dst_index = lifr->lifr_movetoindex;
16821 	*ill_to_v4 = ill_lookup_on_ifindex(dst_index, B_FALSE,
16822 	    q, mp, ip_process_ioctl, &err);
16823 	if (err != 0) {
16824 		/*
16825 		 * There could be only a v6 ill.
16826 		 */
16827 		if (err != ENXIO)
16828 			goto done;
16829 		err = 0;
16830 	}
16831 
16832 	*ill_to_v6 = ill_lookup_on_ifindex(dst_index, B_TRUE,
16833 	    q, mp, ip_process_ioctl, &err);
16834 	if (err != 0) {
16835 		if (err != ENXIO)
16836 			goto done;
16837 		if (*ill_to_v4 == NULL) {
16838 			err = ENXIO;
16839 			goto done;
16840 		}
16841 		err = 0;
16842 	}
16843 
16844 	/*
16845  * If we have something to MOVE, i.e. "from" is not NULL,
16846 	 * "to" should be non-NULL.
16847 	 */
16848 	if ((*ill_from_v4 != NULL && *ill_to_v4 == NULL) ||
16849 	    (*ill_from_v6 != NULL && *ill_to_v6 == NULL)) {
16850 		err = EINVAL;
16851 	}
16852 
16853 done:
16854 	if (ipif_v4 != NULL)
16855 		ipif_refrele(ipif_v4);
16856 	if (ipif_v6 != NULL)
16857 		ipif_refrele(ipif_v6);
16858 	return (err);
16859 }
16860 
16861 /*
16862  * FAILOVER and FAILBACK are modelled as MOVE operations.
16863  *
16864  * We don't check whether the MOVE is within the same group or
16865  * not, because this ioctl can be used as a generic mechanism
16866  * to failover from interface A to B, though things will function
16867  * only if they are really part of the same group. Moreover,
16868  * all ipifs may be down and hence temporarily out of the group.
16869  *
16870  * ipif's that need to be moved are first brought down; V4 ipifs are brought
16871  * down first and then V6.  For each we wait for the ipif's to become quiescent.
16872  * Bringing down the ipifs ensures that all ires pointing to these ipifs
16873  * have been deleted and there are no active references. Once quiescent the
16874  * ipif's are moved and brought up on the new ill.
16875  *
16876  * Normally the source ill and destination ill belong to the same IPMP group
16877  * and hence the same ipsq_t. In the event they don't belong to the
16878  * same group, the two ipsq's are first merged into one ipsq - that of the
16879  * to_ill. The multicast memberships on the source and destination ill cannot
16880  * change during the move operation since multicast joins/leaves also have to
16881  * execute on the same ipsq and are hence serialized.
16882  */
16883 /* ARGSUSED */
16884 int
16885 ip_sioctl_move(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
16886     ip_ioctl_cmd_t *ipip, void *ifreq)
16887 {
16888 	ill_t *ill_to_v4 = NULL;
16889 	ill_t *ill_to_v6 = NULL;
16890 	ill_t *ill_from_v4 = NULL;
16891 	ill_t *ill_from_v6 = NULL;
16892 	int err = 0;
16893 
16894 	/*
16895 	 * Set up the from and to ills; we can get EINPROGRESS only for
16896 	 * the to_ills.
16897 	 */
16898 	err = ip_extract_move_args(q, mp, &ill_from_v4, &ill_from_v6,
16899 	    &ill_to_v4, &ill_to_v6);
16900 
16901 	if (err != 0) {
16902 		ip0dbg(("ip_sioctl_move: extract args failed\n"));
16903 		goto done;
16904 	}
16905 
16906 	/*
16907 	 * nothing to do.
16908 	 */
16909 	if ((ill_from_v4 != NULL) && (ill_from_v4 == ill_to_v4)) {
16910 		goto done;
16911 	}
16912 
16913 	/*
16914 	 * nothing to do.
16915 	 */
16916 	if ((ill_from_v6 != NULL) && (ill_from_v6 == ill_to_v6)) {
16917 		goto done;
16918 	}
16919 
16920 	/*
16921 	 * Mark the ills as changing.
16922 	 * The ILL_CHANGING flag is cleared when the ipifs are brought up
16923 	 * in ill_up_ipifs; in case of error it is cleared below.
16924 	 */
16925 
16926 	GRAB_ILL_LOCKS(ill_from_v4, ill_from_v6);
16927 	if (ill_from_v4 != NULL)
16928 		ill_from_v4->ill_state_flags |= ILL_CHANGING;
16929 	if (ill_from_v6 != NULL)
16930 		ill_from_v6->ill_state_flags |= ILL_CHANGING;
16931 	RELEASE_ILL_LOCKS(ill_from_v4, ill_from_v6);
16932 
16933 	/*
16934 	 * Make sure that both src and dst are
16935 	 * in the same syncq group. If not, make it happen.
16936 	 * We are not holding any locks because we are the writer
16937 	 * on the from_ipsq and we will hold locks in ill_merge_groups
16938 	 * to protect to_ipsq against changing.
16939 	 */
16940 	if (ill_from_v4 != NULL) {
16941 		if (ill_from_v4->ill_phyint->phyint_ipsq !=
16942 		    ill_to_v4->ill_phyint->phyint_ipsq) {
16943 			err = ill_merge_groups(ill_from_v4, ill_to_v4,
16944 			    NULL, mp, q);
16945 			goto err_ret;
16946 
16947 		}
16948 		ASSERT(!MUTEX_HELD(&ill_to_v4->ill_lock));
16949 	} else {
16950 
16951 		if (ill_from_v6->ill_phyint->phyint_ipsq !=
16952 		    ill_to_v6->ill_phyint->phyint_ipsq) {
16953 			err = ill_merge_groups(ill_from_v6, ill_to_v6,
16954 			    NULL, mp, q);
16955 			goto err_ret;
16956 
16957 		}
16958 		ASSERT(!MUTEX_HELD(&ill_to_v6->ill_lock));
16959 	}
16960 
16961 	/*
16962 	 * Now that the ipsq's have been merged and we are the writer
16963 	 * let's mark the to_ills as changing as well.
16964 	 */
16965 
16966 	GRAB_ILL_LOCKS(ill_to_v4, ill_to_v6);
16967 	if (ill_to_v4 != NULL)
16968 		ill_to_v4->ill_state_flags |= ILL_CHANGING;
16969 	if (ill_to_v6 != NULL)
16970 		ill_to_v6->ill_state_flags |= ILL_CHANGING;
16971 	RELEASE_ILL_LOCKS(ill_to_v4, ill_to_v6);
16972 
16973 	/*
16974 	 * It's ok for us to proceed with the move even if
16975 	 * ill_pending_mp is non-null on one of the from ills, as the reply
16976 	 * should not be looking at the ipif, it should only care about the
16977 	 * ill itself.
16978 	 */
16979 
16980 	/*
16981 	 * Let's move IPv4 first.
16982 	 */
16983 	if (ill_from_v4 != NULL) {
16984 		ASSERT(IAM_WRITER_ILL(ill_to_v4));
16985 		ill_from_v4->ill_move_in_progress = B_TRUE;
16986 		ill_to_v4->ill_move_in_progress = B_TRUE;
16987 		ill_to_v4->ill_move_peer = ill_from_v4;
16988 		ill_from_v4->ill_move_peer = ill_to_v4;
16989 		err = ill_move(ill_from_v4, ill_to_v4, q, mp);
16990 	}
16991 
16992 	/*
16993 	 * Now let's move IPv6.
16994 	 */
16995 	if (err == 0 && ill_from_v6 != NULL) {
16996 		ASSERT(IAM_WRITER_ILL(ill_to_v6));
16997 		ill_from_v6->ill_move_in_progress = B_TRUE;
16998 		ill_to_v6->ill_move_in_progress = B_TRUE;
16999 		ill_to_v6->ill_move_peer = ill_from_v6;
17000 		ill_from_v6->ill_move_peer = ill_to_v6;
17001 		err = ill_move(ill_from_v6, ill_to_v6, q, mp);
17002 	}
17003 
17004 err_ret:
17005 	/*
17006 	 * EINPROGRESS means we are waiting for the ipif's that need to be
17007 	 * moved to become quiescent.
17008 	 */
17009 	if (err == EINPROGRESS) {
17010 		goto done;
17011 	}
17012 
17013 	/*
17014 	 * If err is set, ill_up_ipifs will not be called,
17015 	 * so let's clear the flags here.
17016 	 */
17017 
17018 	GRAB_ILL_LOCKS(ill_to_v4, ill_to_v6);
17019 	GRAB_ILL_LOCKS(ill_from_v4, ill_from_v6);
17020 	/*
17021 	 * Some of the clearing may be redundant. But it is simpler
17022 	 * not to make any extra checks.
17023 	 */
17024 	if (ill_from_v6 != NULL) {
17025 		ill_from_v6->ill_move_in_progress = B_FALSE;
17026 		ill_from_v6->ill_move_peer = NULL;
17027 		ill_from_v6->ill_state_flags &= ~ILL_CHANGING;
17028 	}
17029 	if (ill_from_v4 != NULL) {
17030 		ill_from_v4->ill_move_in_progress = B_FALSE;
17031 		ill_from_v4->ill_move_peer = NULL;
17032 		ill_from_v4->ill_state_flags &= ~ILL_CHANGING;
17033 	}
17034 	if (ill_to_v6 != NULL) {
17035 		ill_to_v6->ill_move_in_progress = B_FALSE;
17036 		ill_to_v6->ill_move_peer = NULL;
17037 		ill_to_v6->ill_state_flags &= ~ILL_CHANGING;
17038 	}
17039 	if (ill_to_v4 != NULL) {
17040 		ill_to_v4->ill_move_in_progress = B_FALSE;
17041 		ill_to_v4->ill_move_peer = NULL;
17042 		ill_to_v4->ill_state_flags &= ~ILL_CHANGING;
17043 	}
17044 
17045 	/*
17046 	 * Check for setting INACTIVE if STANDBY is set and FAILED is not set.
17047 	 * Always do this to maintain proper state, i.e. even in case of errors.
17048 	 * As phyint_inactive looks at both the v4 and v6 interfaces,
17049 	 * we need not call it on both the v4 and v6 interfaces.
17050 	 */
17051 	if (ill_from_v4 != NULL) {
17052 		if ((ill_from_v4->ill_phyint->phyint_flags &
17053 		    (PHYI_STANDBY | PHYI_FAILED)) == PHYI_STANDBY) {
17054 			phyint_inactive(ill_from_v4->ill_phyint);
17055 		}
17056 	} else if (ill_from_v6 != NULL) {
17057 		if ((ill_from_v6->ill_phyint->phyint_flags &
17058 		    (PHYI_STANDBY | PHYI_FAILED)) == PHYI_STANDBY) {
17059 			phyint_inactive(ill_from_v6->ill_phyint);
17060 		}
17061 	}
17062 
17063 	if (ill_to_v4 != NULL) {
17064 		if (ill_to_v4->ill_phyint->phyint_flags & PHYI_INACTIVE) {
17065 			ill_to_v4->ill_phyint->phyint_flags &= ~PHYI_INACTIVE;
17066 		}
17067 	} else if (ill_to_v6 != NULL) {
17068 		if (ill_to_v6->ill_phyint->phyint_flags & PHYI_INACTIVE) {
17069 			ill_to_v6->ill_phyint->phyint_flags &= ~PHYI_INACTIVE;
17070 		}
17071 	}
17072 
17073 	RELEASE_ILL_LOCKS(ill_to_v4, ill_to_v6);
17074 	RELEASE_ILL_LOCKS(ill_from_v4, ill_from_v6);
17075 
17076 no_err:
17077 	/*
17078 	 * Let's bring the interfaces up on the to_ill.
17079 	 */
17080 	if (err == 0) {
17081 		err = ill_up_ipifs(ill_to_v4 == NULL ? ill_to_v6 : ill_to_v4,
17082 		    q, mp);
17083 	}
17084 done:
17085 
17086 	if (ill_to_v4 != NULL) {
17087 		ill_refrele(ill_to_v4);
17088 	}
17089 	if (ill_to_v6 != NULL) {
17090 		ill_refrele(ill_to_v6);
17091 	}
17092 
17093 	return (err);
17094 }
17095 
17096 static void
17097 ill_dl_down(ill_t *ill)
17098 {
17099 	/*
17100 	 * The ill is down; unbind but stay attached since we're still
17101 	 * associated with a PPA.
17102 	 */
17103 	mblk_t	*mp = ill->ill_unbind_mp;
17104 
17105 	ill->ill_unbind_mp = NULL;
17106 	ip1dbg(("ill_dl_down(%s)\n", ill->ill_name));
17107 	if (mp != NULL) {
17108 		ip1dbg(("ill_dl_down: %s (%u) for %s\n",
17109 		    dlpi_prim_str(*(int *)mp->b_rptr), *(int *)mp->b_rptr,
17110 		    ill->ill_name));
17111 		ill_dlpi_send(ill, mp);
17112 	}
17113 
17114 	/*
17115 	 * Toss all of our multicast memberships.  We could keep them, but
17116 	 * then we'd have to do bookkeeping of any joins and leaves performed
17117 	 * by the application while the interface is down (we can't just
17118 	 * issue them because arp cannot currently process AR_ENTRY_SQUERY's
17119 	 * on a downed interface).
17120 	 */
17121 	ill_leave_multicast(ill);
17122 
17123 	mutex_enter(&ill->ill_lock);
17124 	ill->ill_dl_up = 0;
17125 	mutex_exit(&ill->ill_lock);
17126 }
17127 
17128 void
17129 ill_dlpi_dispatch(ill_t *ill, mblk_t *mp)
17130 {
17131 	union DL_primitives *dlp;
17132 	t_uscalar_t prim;
17133 
17134 	ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
17135 
17136 	dlp = (union DL_primitives *)mp->b_rptr;
17137 	prim = dlp->dl_primitive;
17138 
17139 	ip1dbg(("ill_dlpi_dispatch: sending %s (%u) to %s\n",
17140 	    dlpi_prim_str(prim), prim, ill->ill_name));
17141 
17142 	switch (prim) {
17143 	case DL_PHYS_ADDR_REQ:
17144 	{
17145 		dl_phys_addr_req_t *dlpap = (dl_phys_addr_req_t *)mp->b_rptr;
17146 		ill->ill_phys_addr_pend = dlpap->dl_addr_type;
17147 		break;
17148 	}
17149 	case DL_BIND_REQ:
17150 		mutex_enter(&ill->ill_lock);
17151 		ill->ill_state_flags &= ~ILL_DL_UNBIND_DONE;
17152 		mutex_exit(&ill->ill_lock);
17153 		break;
17154 	}
17155 
17156 	ill->ill_dlpi_pending = prim;
17157 
17158 	/*
17159 	 * Some drivers send M_FLUSH up to IP as part of unbind
17160 	 * request.  When this M_FLUSH is sent back to the driver,
17161 	 * it can arrive after we send the detach request if the
17162 	 * M_FLUSH ends up in IP's syncq. To avoid that, we reply
17163 	 * to the M_FLUSH in ip_rput and locally generate another
17164 	 * M_FLUSH for correctness.  This will get freed in
17165 	 * ip_wput_nondata.
17166 	 */
17167 	if (prim == DL_UNBIND_REQ)
17168 		(void) putnextctl1(ill->ill_rq, M_FLUSH, FLUSHRW);
17169 
17170 	putnext(ill->ill_wq, mp);
17171 }
17172 
17173 /*
17174  * Send a DLPI control message to the driver but make sure there
17175  * is only one outstanding message. Uses ill_dlpi_pending to tell
17176  * when it must queue. ip_rput_dlpi_writer calls ill_dlpi_done()
17177  * when an ACK or a NAK is received to process the next queued message.
17178  *
17179  * We don't protect ill_dlpi_pending with any lock. This is okay as
17180  * every place where it's accessed, ip is exclusive while accessing
17181  * ill_dlpi_pending, except when this function is called from ill_init().
17182  */
17183 void
17184 ill_dlpi_send(ill_t *ill, mblk_t *mp)
17185 {
17186 	mblk_t **mpp;
17187 
17188 	ASSERT(IAM_WRITER_ILL(ill));
17189 	ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
17190 
17191 	if (ill->ill_dlpi_pending != DL_PRIM_INVAL) {
17192 		/* Must queue message. Tail insertion */
17193 		mpp = &ill->ill_dlpi_deferred;
17194 		while (*mpp != NULL)
17195 			mpp = &((*mpp)->b_next);
17196 
17197 		ip1dbg(("ill_dlpi_send: deferring request for %s\n",
17198 		    ill->ill_name));
17199 
17200 		*mpp = mp;
17201 		return;
17202 	}
17203 
17204 	ill_dlpi_dispatch(ill, mp);
17205 }
17206 
17207 /*
17208  * Called when a DLPI control message has been acked or nacked to
17209  * send down the next queued message (if any).
17210  */
17211 void
17212 ill_dlpi_done(ill_t *ill, t_uscalar_t prim)
17213 {
17214 	mblk_t *mp;
17215 
17216 	ASSERT(IAM_WRITER_ILL(ill));
17217 
17218 	ASSERT(prim != DL_PRIM_INVAL);
17219 	if (ill->ill_dlpi_pending != prim) {
17220 		if (ill->ill_dlpi_pending == DL_PRIM_INVAL) {
17221 			(void) mi_strlog(ill->ill_rq, 1,
17222 			    SL_CONSOLE|SL_ERROR|SL_TRACE,
17223 			    "ill_dlpi_done: unsolicited ack for %s from %s\n",
17224 			    dlpi_prim_str(prim), ill->ill_name);
17225 		} else {
17226 			(void) mi_strlog(ill->ill_rq, 1,
17227 			    SL_CONSOLE|SL_ERROR|SL_TRACE,
17228 			    "ill_dlpi_done: unexpected ack for %s from %s "
17229 			    "(expecting ack for %s)\n",
17230 			    dlpi_prim_str(prim), ill->ill_name,
17231 			    dlpi_prim_str(ill->ill_dlpi_pending));
17232 		}
17233 		return;
17234 	}
17235 
17236 	ip1dbg(("ill_dlpi_done: %s has completed %s (%u)\n", ill->ill_name,
17237 	    dlpi_prim_str(ill->ill_dlpi_pending), ill->ill_dlpi_pending));
17238 
17239 	if ((mp = ill->ill_dlpi_deferred) == NULL) {
17240 		ill->ill_dlpi_pending = DL_PRIM_INVAL;
17241 		return;
17242 	}
17243 
17244 	ill->ill_dlpi_deferred = mp->b_next;
17245 	mp->b_next = NULL;
17246 
17247 	ill_dlpi_dispatch(ill, mp);
17248 }
17249 
17250 void
17251 conn_delete_ire(conn_t *connp, caddr_t arg)
17252 {
17253 	ipif_t	*ipif = (ipif_t *)arg;
17254 	ire_t	*ire;
17255 
17256 	/*
17257 	 * Look at the cached ires on conns, which have pointers to ipifs.
17258 	 * We just release the reference, which clears up the reference
17259 	 * to the ire. Called when a conn closes. Also called from ipif_free
17260 	 * to clean up indirect references to the stale ipif via the cached ire.
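	 *
	 * A hedged usage sketch (ipcl_walk is real; the call site shown is
	 * illustrative): ipcl_walk(conn_delete_ire, (caddr_t)ipif) flushes
	 * the cached ire referencing that ipif on every conn, while passing
	 * a NULL arg flushes the cached ire on every conn unconditionally.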
17261 	 */
17262 	mutex_enter(&connp->conn_lock);
17263 	ire = connp->conn_ire_cache;
17264 	if (ire != NULL && (ipif == NULL || ire->ire_ipif == ipif)) {
17265 		connp->conn_ire_cache = NULL;
17266 		mutex_exit(&connp->conn_lock);
17267 		IRE_REFRELE_NOTR(ire);
17268 		return;
17269 	}
17270 	mutex_exit(&connp->conn_lock);
17271 
17272 }
17273 
17274 /*
17275  * Some operations (illgrp_delete(), ipif_down()) conditionally delete a number
17276  * of IREs. Those IREs may have been previously cached in the conn structure.
17277  * This ipcl_walk() walker function releases all references to such IREs based
17278  * on the condemned flag.
17279  */
17280 /* ARGSUSED */
17281 void
17282 conn_cleanup_stale_ire(conn_t *connp, caddr_t arg)
17283 {
17284 	ire_t	*ire;
17285 
17286 	mutex_enter(&connp->conn_lock);
17287 	ire = connp->conn_ire_cache;
17288 	if (ire != NULL && (ire->ire_marks & IRE_MARK_CONDEMNED)) {
17289 		connp->conn_ire_cache = NULL;
17290 		mutex_exit(&connp->conn_lock);
17291 		IRE_REFRELE_NOTR(ire);
17292 		return;
17293 	}
17294 	mutex_exit(&connp->conn_lock);
17295 }
17296 
17297 /*
17298  * Take down a specific interface, but don't lose any information about it.
17299  * Also delete the interface from its interface group (ifgrp).
17300  * (Always called as writer.)
17301  * This function goes through the down sequence even if the interface is
17302  * already down. There are 2 reasons.
17303  * a. Currently we permit interface routes that depend on down interfaces
17304  *    to be added. This behaviour itself is questionable. However, it appears
17305  *    that both Solaris and 4.3 BSD have exhibited this behaviour for a long
17306  *    time. We go thru the cleanup in order to remove these routes.
17307  * b. The bringup of the interface could fail in ill_dl_up, i.e. we get a
17308  *    DL_ERROR_ACK in response to the DL_BIND request. The interface is
17309  *    down, but we need to clean up, i.e. do ill_dl_down and
17310  *    ip_rput_dlpi_writer (DL_ERROR_ACK) -> ipif_down.
17311  *
17312  * IP-MT notes:
17313  *
17314  * Model of reference to interfaces.
17315  *
17316  * The following members in ipif_t track references to the ipif.
17317  *	int     ipif_refcnt;    Active reference count
17318  *	uint_t  ipif_ire_cnt;   Number of ire's referencing this ipif
17319  * The following members in ill_t track references to the ill.
17320  *	int             ill_refcnt;     active refcnt
17321  *	uint_t          ill_ire_cnt;	Number of ires referencing ill
17322  *	uint_t          ill_nce_cnt;	Number of nces referencing ill
17323  *
17324  * Reference to an ipif or ill can be obtained in any of the following ways.
17325  *
17326  * Through the lookup functions ipif_lookup_* / ill_lookup_* functions
17327  * Pointers to ipif / ill from other data structures viz ire and conn.
17328  * Implicit reference to the ipif / ill by holding a reference to the ire.
17329  *
17330  * The ipif/ill lookup functions return a reference held ipif / ill.
17331  * ipif_refcnt and ill_refcnt track the reference counts respectively.
17332  * This is a purely dynamic reference count associated with threads holding
17333  * references to the ipif / ill. Pointers from other structures do not
17334  * count towards this reference count.
17335  *
17336  * ipif_ire_cnt/ill_ire_cnt is the number of ire's associated with the
17337  * ipif/ill. This is incremented whenever a new ire is created referencing the
17338  * ipif/ill. This is done atomically inside ire_add_v[46] where the ire is
17339  * actually added to the ire hash table. The count is decremented in
17340  * ire_inactive where the ire is destroyed.
17341  *
17342  * nce's reference ill's thru nce_ill and the count of nce's associated with
17343  * an ill is recorded in ill_nce_cnt. This is incremented atomically in
17344  * ndp_add() where the nce is actually added to the table. Similarly it is
17345  * decremented in ndp_inactive where the nce is destroyed.
17346  *
17347  * Flow of ioctls involving interface down/up
17348  *
17349  * The following is the sequence of an attempt to set some critical flags on an
17350  * up interface.
17351  * ip_sioctl_flags
17352  * ipif_down
17353  * wait for ipif to be quiescent
17354  * ipif_down_tail
17355  * ip_sioctl_flags_tail
17356  *
17357  * All set ioctls that involve a down/up sequence have a skeleton similar
17358  * to the above. All the *tail functions are called after the refcounts have
17359  * dropped to the appropriate values.
17360  *
17361  * The mechanism to quiesce an ipif is as follows.
17362  *
17363  * Mark the ipif as IPIF_CHANGING. No more lookups will be allowed
17364  * on the ipif. Callers either pass a flag requesting wait or the lookup
17365  * functions will return NULL.
17366  *
17367  * Delete all ires referencing this ipif
17368  *
17369  * Any thread attempting to do an ipif_refhold on an ipif that has been
17370  * obtained thru a cached pointer will first make sure that
17371  * the ipif can be refheld using the macro IPIF_CAN_LOOKUP and only then
17372  * increment the refcount.
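 *
 * For illustration, a hedged sketch of that refhold pattern (the macro and
 * functions are real; the surrounding code is schematic):
 *
 *	mutex_enter(&ill->ill_lock);
 *	if (IPIF_CAN_LOOKUP(ipif)) {
 *		ipif_refhold_locked(ipif);
 *		mutex_exit(&ill->ill_lock);
 *		... use the ipif, then ipif_refrele(ipif) ...
 *	} else {
 *		mutex_exit(&ill->ill_lock);
 *		... bail out, or enqueue on the ipsq and wait ...
 *	}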
17373  *
17374  * The above guarantees that the ipif refcount will eventually come down to
17375  * zero and the ipif will quiesce, once all threads that currently hold a
17376  * reference to the ipif refrelease the ipif. The ipif is quiescent after the
17377  * ipif_refcount has dropped to zero and all ire's associated with this ipif
17378  * have also been ire_inactive'd. i.e. when ipif_ire_cnt and ipif_refcnt both
17379  * drop to zero.
17380  *
17381  * Lookups during the IPIF_CHANGING/ILL_CHANGING interval.
17382  *
17383  * Threads trying to lookup an ipif or ill can pass a flag requesting
17384  * wait and restart if the ipif / ill cannot be looked up currently.
17385  * For example, bind and route operations (e.g. route add/delete) cannot return
17386  * failure if the ipif is currently undergoing an exclusive operation, and
17387  * hence pass the flag. The mblk is then enqueued in the ipsq and the operation
17388  * is restarted by ipsq_exit() when the currently exclusive ioctl completes.
17389  * The lookup and enqueue is atomic using the ill_lock and ipsq_lock. The
17390  * lookup is done holding the ill_lock. Hence the ill/ipif state flags can't
17391  * change while the ill_lock is held. Before dropping the ill_lock we acquire
17392  * the ipsq_lock and call ipsq_enq. This ensures that ipsq_exit can't finish
17393  * until we release the ipsq_lock, even though the ill/ipif state flags
17394  * can change after we drop the ill_lock.
17395  *
17396  * An attempt to send out a packet using an ipif that is currently
17397  * IPIF_CHANGING will fail. No attempt is made in this case to enqueue this
17398  * operation and restart it later when the exclusive condition on the ipif ends.
17399  * This is an example of not passing the wait flag to the lookup functions. For
17400  * example an attempt to refhold and use conn->conn_multicast_ipif and send
17401  * out a multicast packet on that ipif will fail while the ipif is
17402  * IPIF_CHANGING. An attempt to create an IRE_CACHE using an ipif that is
17403  * currently IPIF_CHANGING will also fail.
17404  */
17405 int
17406 ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
17407 {
17408 	ill_t		*ill = ipif->ipif_ill;
17409 	phyint_t	*phyi;
17410 	conn_t		*connp;
17411 	boolean_t	success;
17412 	boolean_t	ipif_was_up = B_FALSE;
17413 
17414 	ASSERT(IAM_WRITER_IPIF(ipif));
17415 
17416 	ip1dbg(("ipif_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));
17417 
17418 	if (ipif->ipif_flags & IPIF_UP) {
17419 		mutex_enter(&ill->ill_lock);
17420 		ipif->ipif_flags &= ~IPIF_UP;
17421 		ASSERT(ill->ill_ipif_up_count > 0);
17422 		--ill->ill_ipif_up_count;
17423 		mutex_exit(&ill->ill_lock);
17424 		ipif_was_up = B_TRUE;
17425 		/* Update status in SCTP's list */
17426 		sctp_update_ipif(ipif, SCTP_IPIF_DOWN);
17427 	}
17428 
17429 	/*
17430 	 * Blow away v6 memberships we established in ipif_multicast_up(); the
17431 	 * v4 ones are left alone (as is the ipif_multicast_up flag, so we
17432 	 * know not to rejoin when the interface is brought back up).
17433 	 */
17434 	if (ipif->ipif_isv6)
17435 		ipif_multicast_down(ipif);
17436 	/*
17437 	 * Remove from the mapping for __sin6_src_id. We insert only
17438 	 * when the address is not INADDR_ANY. As IPv4 addresses are
17439 	 * stored as mapped addresses, we need to check for mapped
17440 	 * INADDR_ANY also.
17441 	 */
17442 	if (ipif_was_up && !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) &&
17443 	    !IN6_IS_ADDR_V4MAPPED_ANY(&ipif->ipif_v6lcl_addr) &&
17444 	    !(ipif->ipif_flags & IPIF_NOLOCAL)) {
17445 		int err;
17446 
17447 		err = ip_srcid_remove(&ipif->ipif_v6lcl_addr,
17448 		    ipif->ipif_zoneid);
17449 		if (err != 0) {
17450 			ip0dbg(("ipif_down: srcid_remove %d\n", err));
17451 		}
17452 	}
17453 
17454 	/*
17455 	 * Before we delete the ill from the group (if any), we need
17456 	 * to make sure that we delete all the routes dependent on
17457 	 * this and also any ipifs dependent on this ipif for
17458 	 * source address. We need to do this before we delete the ill
17459 	 * from the group because
17460 	 *
17461 	 * 1) ipif_down_delete_ire de-references ill->ill_group.
17462 	 *
17463 	 * 2) ipif_update_other_ipifs needs to walk the whole group
17464 	 *    for re-doing source address selection. Note that
17465 	 *    ipif_select_source[_v6] called from
17466 	 *    ipif_update_other_ipifs[_v6] will not pick this ipif
17467 	 *    because we have already marked it down here, i.e. cleared
17468 	 *    IPIF_UP.
17469 	 */
17470 	if (ipif->ipif_isv6)
17471 		ire_walk_v6(ipif_down_delete_ire, (char *)ipif, ALL_ZONES);
17472 	else
17473 		ire_walk_v4(ipif_down_delete_ire, (char *)ipif, ALL_ZONES);
17474 
17475 	/*
17476 	 * These also need to be saved and restored when the
17477 	 * ipif is brought down and up.
17478 	 */
17479 	mutex_enter(&ire_mrtun_lock);
17480 	if (ire_mrtun_count != 0) {
17481 		mutex_exit(&ire_mrtun_lock);
17482 		ire_walk_ill_mrtun(0, 0, ipif_down_delete_ire,
17483 		    (char *)ipif, NULL);
17484 	} else {
17485 		mutex_exit(&ire_mrtun_lock);
17486 	}
17487 
17488 	mutex_enter(&ire_srcif_table_lock);
17489 	if (ire_srcif_table_count > 0) {
17490 		mutex_exit(&ire_srcif_table_lock);
17491 		ire_walk_srcif_table_v4(ipif_down_delete_ire, (char *)ipif);
17492 	} else {
17493 		mutex_exit(&ire_srcif_table_lock);
17494 	}
17495 
17496 	/*
17497 	 * Cleaning up the conn_ire_cache of conns must be done only after the
17498 	 * ires have been deleted above. Otherwise a thread could end up
17499 	 * caching an ire in a conn after we have finished the cleanup of the
17500 	 * conn. The caching is done after making sure that the ire is not yet
17501 	 * condemned. Also documented in the block comment above ip_output
17502 	 */
17503 	ipcl_walk(conn_cleanup_stale_ire, NULL);
17504 	/* Also, delete the ires cached in SCTP */
17505 	sctp_ire_cache_flush(ipif);
17506 
17507 	/* Resolve any IPsec/IKE NAT-T instances that depend on this ipif. */
17508 	nattymod_clean_ipif(ipif);
17509 
17510 	/*
17511 	 * Update any other ipifs which have used "our" local address as
17512 	 * a source address. This entails removing and recreating IRE_INTERFACE
17513 	 * entries for such ipifs.
17514 	 */
17515 	if (ipif->ipif_isv6)
17516 		ipif_update_other_ipifs_v6(ipif, ill->ill_group);
17517 	else
17518 		ipif_update_other_ipifs(ipif, ill->ill_group);
17519 
17520 	if (ipif_was_up) {
17521 		/*
17522 		 * Check whether this is the last ipif to leave this group.
17523 		 * If this is the last ipif to leave, we should remove
17524 		 * this ill from the group as ipif_select_source will not
17525 		 * be able to find any useful ipifs if this ill is selected
17526 		 * for load balancing.
17527 		 *
17528 		 * For nameless groups, we should call illgrp_delete if this
17529 		 * belongs to some group. As this ipif is going down, we may
17530 		 * need to reconstruct groups.
17531 		 */
17532 		phyi = ill->ill_phyint;
17533 		/*
17534 		 * If the phyint_groupname_len is 0, it may or may not
17535 		 * be in the nameless group. If the phyint_groupname_len is
17536 		 * not 0, then this ill should be part of some group.
17537 		 * As we always insert this ill in the group if
17538 		 * phyint_groupname_len is not zero when the first ipif
17539 		 * comes up (in ipif_up_done), it should be in a group
17540 		 * when the namelen is not 0.
17541 		 *
17542 		 * NOTE: When we delete the ill from the group, it will
17543 		 * blow away all the IRE_CACHES pointing either at this ipif or
17544 		 * ill_wq (illgrp_cache_delete does this). Thus, no IREs
17545 		 * should be pointing at this ill.
17546 		 */
17547 		ASSERT(phyi->phyint_groupname_len == 0 ||
17548 		    (phyi->phyint_groupname != NULL && ill->ill_group != NULL));
17549 
17550 		if (phyi->phyint_groupname_len != 0) {
17551 			if (ill->ill_ipif_up_count == 0)
17552 				illgrp_delete(ill);
17553 		}
17554 
17555 		/*
17556 		 * If we have deleted some of the broadcast ires associated
17557 		 * with this ipif, we need to re-nominate somebody else if
17558 		 * the ires that we deleted were the nominated ones.
17559 		 */
17560 		if (ill->ill_group != NULL && !ill->ill_isv6)
17561 			ipif_renominate_bcast(ipif);
17562 	}
17563 
17564 	if (ipif->ipif_isv6)
17565 		ipif_ndp_down(ipif);
17566 
17567 	/*
17568 	 * If mp is NULL the caller will wait for the appropriate refcnt.
17569 	 * E.g. ip_sioctl_removeif -> ipif_free -> ipif_down
17570 	 * and ill_delete -> ipif_free -> ipif_down
17571 	 */
17572 	if (mp == NULL) {
17573 		ASSERT(q == NULL);
17574 		return (0);
17575 	}
17576 
17577 	if (CONN_Q(q)) {
17578 		connp = Q_TO_CONN(q);
17579 		mutex_enter(&connp->conn_lock);
17580 	} else {
17581 		connp = NULL;
17582 	}
17583 	mutex_enter(&ill->ill_lock);
17584 	/*
17585 	 * Are there any ire's pointing to this ipif that are still active?
17586 	 * If this is the last ipif going down, are there any ire's pointing
17587 	 * to this ill that are still active?
17588 	 */
17589 	if (ipif_is_quiescent(ipif)) {
17590 		mutex_exit(&ill->ill_lock);
17591 		if (connp != NULL)
17592 			mutex_exit(&connp->conn_lock);
17593 		return (0);
17594 	}
17595 
17596 	ip1dbg(("ipif_down: need to wait, adding pending mp %s ill %p",
17597 	    ill->ill_name, (void *)ill));
17598 	/*
17599 	 * Enqueue the mp atomically in ipsq_pending_mp. When the refcount
17600 	 * drops down, the operation will be restarted by ipif_ill_refrele_tail
17601 	 * which in turn is called by the last refrele on the ipif/ill/ire.
17602 	 */
17603 	success = ipsq_pending_mp_add(connp, ipif, q, mp, IPIF_DOWN);
17604 	if (!success) {
17605 		/* The conn is closing. So just return */
17606 		ASSERT(connp != NULL);
17607 		mutex_exit(&ill->ill_lock);
17608 		mutex_exit(&connp->conn_lock);
17609 		return (EINTR);
17610 	}
17611 
17612 	mutex_exit(&ill->ill_lock);
17613 	if (connp != NULL)
17614 		mutex_exit(&connp->conn_lock);
17615 	return (EINPROGRESS);
17616 }
17617 
17618 static void
17619 ipif_down_tail(ipif_t *ipif)
17620 {
17621 	ill_t	*ill = ipif->ipif_ill;
17622 
17623 	/*
17624 	 * Skip any loopback interface (null wq).
17625 	 * If this is the last logical interface on the ill,
17626 	 * have ill_dl_down tell the driver we are gone (unbind).
17627 	 * Note that lun 0 can ipif_down even though
17628 	 * there are other logical units that are up.
17629 	 * This occurs e.g. when we change a "significant" IFF_ flag.
17630 	 */
17631 	if (ipif->ipif_ill->ill_wq != NULL) {
17632 		if (!ill->ill_logical_down && (ill->ill_ipif_up_count == 0) &&
17633 		    ill->ill_dl_up) {
17634 			ill_dl_down(ill);
17635 		}
17636 	}
17637 	ill->ill_logical_down = 0;
17638 
17639 	/*
17640 	 * This has to come after removing the routes in ipif_down_delete_ire.
17641 	 */
17642 	if (ipif->ipif_isv6) {
17643 		if (ipif->ipif_ill->ill_flags & ILLF_XRESOLV)
17644 			ipif_arp_down(ipif);
17645 	} else {
17646 		ipif_arp_down(ipif);
17647 	}
17648 
17649 	ip_rts_ifmsg(ipif);
17650 	ip_rts_newaddrmsg(RTM_DELETE, 0, ipif);
17651 }
17652 
17653 /*
17654  * Bring interface logically down without bringing the physical interface
17655  * down, e.g. when the netmask is changed. This avoids long-lasting link
17656  * negotiations between an ethernet interface and certain switches.
17657  */
17658 static int
17659 ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
17660 {
17661 	/*
17662 	 * The ill_logical_down flag is a transient flag. It is set here
17663 	 * and is cleared once the down has completed in ipif_down_tail.
17664 	 * This flag does not indicate whether the ill stream is in the
17665 	 * DL_BOUND state with the driver. Instead this flag is used by
17666 	 * ipif_down_tail to determine whether to DL_UNBIND the stream with
17667 	 * the driver. The state of the ill stream i.e. whether it is
17668 	 * DL_BOUND with the driver or not is indicated by the ill_dl_up flag.
17669 	 */
17670 	ipif->ipif_ill->ill_logical_down = 1;
17671 	return (ipif_down(ipif, q, mp));
17672 }
17673 
17674 /*
17675  * This is called when the SIOCSLIFUSESRC ioctl is processed in IP.
17676  * Whether or not the usesrc client ILL is already part of a usesrc group,
17677  * an ire_stq with the matching usesrc client ILL will
17678  * locate the IRE's that need to be deleted. We want IREs to be created
17679  * with the new source address.
17680  */
17681 static void
17682 ipif_delete_cache_ire(ire_t *ire, char *ill_arg)
17683 {
17684 	ill_t	*ucill = (ill_t *)ill_arg;
17685 
17686 	ASSERT(IAM_WRITER_ILL(ucill));
17687 
17688 	if (ire->ire_stq == NULL)
17689 		return;
17690 
17691 	if ((ire->ire_type == IRE_CACHE) &&
17692 	    ((ill_t *)ire->ire_stq->q_ptr == ucill))
17693 		ire_delete(ire);
17694 }
17695 
17696 /*
17697  * ire_walk routine to delete every IRE dependent on the interface
17698  * address that is going down.	(Always called as writer.)
17699  * Works for both v4 and v6.
17700  * In addition to checking for ire_ipif matches, it also checks for
17701  * IRE_CACHE entries which have the same source address as the
17702  * disappearing ipif since ipif_select_source might have picked
17703  * that source. Note that ipif_down/ipif_update_other_ipifs takes
17704  * care of any IRE_INTERFACE with the disappearing source address.
17705  */
17706 static void
17707 ipif_down_delete_ire(ire_t *ire, char *ipif_arg)
17708 {
17709 	ipif_t	*ipif = (ipif_t *)ipif_arg;
17710 	ill_t *ire_ill;
17711 	ill_t *ipif_ill;
17712 
17713 	ASSERT(IAM_WRITER_IPIF(ipif));
17714 	if (ire->ire_ipif == NULL)
17715 		return;
17716 
17717 	/*
17718 	 * For IPv4, we derive source addresses for an IRE from ipif's
17719 	 * belonging to the same IPMP group as the IRE's outgoing
17720 	 * interface.  If an IRE's outgoing interface isn't in the
17721 	 * same IPMP group as a particular ipif, then that ipif
17722 	 * couldn't have been used as a source address for this IRE.
17723 	 *
17724 	 * For IPv6, source addresses are only restricted to the IPMP group
17725 	 * if the IRE is for a link-local address or a multicast address.
17726 	 * Otherwise, source addresses for an IRE can be chosen from
17727 	 * interfaces other than the outgoing interface for that IRE.
17728 	 *
17729 	 * For source address selection details, see ipif_select_source()
17730 	 * and ipif_select_source_v6().
17731 	 */
17732 	if (ire->ire_ipversion == IPV4_VERSION ||
17733 	    IN6_IS_ADDR_LINKLOCAL(&ire->ire_addr_v6) ||
17734 	    IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6)) {
17735 		ire_ill = ire->ire_ipif->ipif_ill;
17736 		ipif_ill = ipif->ipif_ill;
17737 
17738 		if (ire_ill->ill_group != ipif_ill->ill_group) {
17739 			return;
17740 		}
17741 	}
17742 
17744 	if (ire->ire_ipif != ipif) {
17745 		/*
17746 		 * Look for a matching source address.
17747 		 */
17748 		if (ire->ire_type != IRE_CACHE)
17749 			return;
17750 		if (ipif->ipif_flags & IPIF_NOLOCAL)
17751 			return;
17752 
17753 		if (ire->ire_ipversion == IPV4_VERSION) {
17754 			if (ire->ire_src_addr != ipif->ipif_src_addr)
17755 				return;
17756 		} else {
17757 			if (!IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6,
17758 			    &ipif->ipif_v6lcl_addr))
17759 				return;
17760 		}
17761 		ire_delete(ire);
17762 		return;
17763 	}
17764 	/*
17765 	 * ire_delete() will do an ire_flush_cache which will delete
17766 	 * all ire_ipif matches
17767 	 */
17768 	ire_delete(ire);
17769 }
17770 
17771 /*
17772  * ire_walk_ill function for deleting all IRE_CACHE entries for an ill when
17773  * 1) an ipif (on that ill) changes the IPIF_DEPRECATED flags, or
17774  * 2) when an interface is brought up or down (on that ill).
17775  * This ensures that the IRE_CACHE entries don't retain stale source
17776  * address selection results.
17777  */
17778 void
17779 ill_ipif_cache_delete(ire_t *ire, char *ill_arg)
17780 {
17781 	ill_t	*ill = (ill_t *)ill_arg;
17782 	ill_t	*ipif_ill;
17783 
17784 	ASSERT(IAM_WRITER_ILL(ill));
17785 	/*
17786 	 * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4.
17787 	 * Hence this should be IRE_CACHE.
17788 	 */
17789 	ASSERT(ire->ire_type == IRE_CACHE);
17790 
17791 	/*
17792 	 * We are called for IRE_CACHES whose ire_ipif matches ill.
17793 	 * We are only interested in IRE_CACHES that have borrowed
17794 	 * the source address from ill_arg, e.g. ipif_up_done[_v6],
17795 	 * for which we need to look at ire_ipif->ipif_ill match
17796 	 * with ill.
17797 	 */
17798 	ASSERT(ire->ire_ipif != NULL);
17799 	ipif_ill = ire->ire_ipif->ipif_ill;
17800 	if (ipif_ill == ill || (ill->ill_group != NULL &&
17801 	    ipif_ill->ill_group == ill->ill_group)) {
17802 		ire_delete(ire);
17803 	}
17804 }
17805 
17806 /*
17807  * Delete all the ires whose stq references ill_arg.
17808  */
17809 static void
17810 ill_stq_cache_delete(ire_t *ire, char *ill_arg)
17811 {
17812 	ill_t	*ill = (ill_t *)ill_arg;
17813 	ill_t	*ire_ill;
17814 
17815 	ASSERT(IAM_WRITER_ILL(ill));
17816 	/*
17817 	 * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4.
17818 	 * Hence this should be IRE_CACHE.
17819 	 */
17820 	ASSERT(ire->ire_type == IRE_CACHE);
17821 
17822 	/*
17823 	 * We are called for IRE_CACHES whose ire_stq and ire_ipif
17824 	 * matches ill. We are only interested in IRE_CACHES that
17825 	 * has ire_stq->q_ptr pointing at ill_arg. Thus we do the
17826 	 * filtering here.
17827 	 */
17828 	ire_ill = (ill_t *)ire->ire_stq->q_ptr;
17829 
17830 	if (ire_ill == ill)
17831 		ire_delete(ire);
17832 }
17833 
17834 /*
17835  * This is called when an ill leaves the group. We want to delete
17836  * all IRE_CACHES whose stq is pointing at ill_wq or ire_ipif is
17837  * pointing at ill.
17838  */
17839 static void
17840 illgrp_cache_delete(ire_t *ire, char *ill_arg)
17841 {
17842 	ill_t	*ill = (ill_t *)ill_arg;
17843 
17844 	ASSERT(IAM_WRITER_ILL(ill));
17845 	ASSERT(ill->ill_group == NULL);
17846 	/*
17847 	 * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4.
17848 	 * Hence this should be IRE_CACHE.
17849 	 */
17850 	ASSERT(ire->ire_type == IRE_CACHE);
17851 	/*
17852 	 * We are called for IRE_CACHES whose ire_stq and ire_ipif
17853 	 * matches ill. We are interested in both.
17854 	 */
17855 	ASSERT((ill == (ill_t *)ire->ire_stq->q_ptr) ||
17856 	    (ire->ire_ipif->ipif_ill == ill));
17857 
17858 	ire_delete(ire);
17859 }
17860 
17861 /*
17862  * Initiate deallocate of an IPIF. Always called as writer. Called by
17863  * ill_delete or ip_sioctl_removeif.
17864  */
17865 static void
17866 ipif_free(ipif_t *ipif)
17867 {
17868 	ASSERT(IAM_WRITER_IPIF(ipif));
17869 
17870 	/* Remove conn references */
17871 	reset_conn_ipif(ipif);
17872 
17873 	/*
17874 	 * Make sure we have valid net and subnet broadcast ire's for the
17875 	 * other ipif's which share them with this ipif.
17876 	 */
17877 	if (!ipif->ipif_isv6)
17878 		ipif_check_bcast_ires(ipif);
17879 
17880 	/*
17881 	 * Take down the interface. We can be called either from ill_delete
17882 	 * or from ip_sioctl_removeif.
17883 	 */
17884 	(void) ipif_down(ipif, NULL, NULL);
17885 
17886 	rw_enter(&ill_g_lock, RW_WRITER);
17887 	/* Remove pointers to this ill in the multicast routing tables */
17888 	reset_mrt_vif_ipif(ipif);
17889 	rw_exit(&ill_g_lock);
17890 }
17891 
17892 static void
17893 ipif_free_tail(ipif_t *ipif)
17894 {
17895 	mblk_t	*mp;
17896 	ipif_t	**ipifp;
17897 
17898 	/*
17899 	 * Free state for additional IRE_IF_[NO]RESOLVER ire's.
17900 	 */
17901 	mutex_enter(&ipif->ipif_saved_ire_lock);
17902 	mp = ipif->ipif_saved_ire_mp;
17903 	ipif->ipif_saved_ire_mp = NULL;
17904 	mutex_exit(&ipif->ipif_saved_ire_lock);
17905 	freemsg(mp);
17906 
17907 	/*
17908 	 * Need to hold both ill_g_lock and ill_lock while
17909 	 * inserting or removing an ipif from the linked list
17910 	 * of ipifs hanging off the ill.
17911 	 */
17912 	rw_enter(&ill_g_lock, RW_WRITER);
17913 	/*
17914 	 * Remove all multicast memberships on the interface now.
17915 	 * This removes IPv4 multicast memberships joined within
17916 	 * the kernel as ipif_down does not do ipif_multicast_down
17917 	 * for IPv4. IPv6 is not handled here as the multicast memberships
17918 	 * are based on ill and not on ipif.
17919 	 */
17920 	ilm_free(ipif);
17921 
17922 	/*
17923 	 * Since we held the ill_g_lock while doing the ilm_free above,
17924 	 * we can assert the ilms were really deleted and not just marked
17925 	 * ILM_DELETED.
17926 	 */
17927 	ASSERT(ilm_walk_ipif(ipif) == 0);
17928 
17930 	IPIF_TRACE_CLEANUP(ipif);
17931 
17932 	/* Ask SCTP to take it out of its list */
17933 	sctp_update_ipif(ipif, SCTP_IPIF_REMOVE);
17934 
17935 	mutex_enter(&ipif->ipif_ill->ill_lock);
17936 	/* Get it out of the ILL interface list. */
17937 	ipifp = &ipif->ipif_ill->ill_ipif;
17938 	for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) {
17939 		if (*ipifp == ipif) {
17940 			*ipifp = ipif->ipif_next;
17941 			break;
17942 		}
17943 	}
17944 
17945 	mutex_exit(&ipif->ipif_ill->ill_lock);
17946 	rw_exit(&ill_g_lock);
17947 
17948 	mutex_destroy(&ipif->ipif_saved_ire_lock);
17949 	/* Free the memory. */
17950 	mi_free((char *)ipif);
17951 }
17952 
17953 /*
17954  * Returns an ipif name in the form "ill_name/unit" if ipif_id is not zero,
17955  * "ill_name" otherwise.
17956  */
17957 char *
17958 ipif_get_name(ipif_t *ipif, char *buf, int len)
17959 {
17960 	char	lbuf[32];
17961 	char	*name;
17962 	size_t	name_len;
17963 
17964 	buf[0] = '\0';
17965 	if (!ipif)
17966 		return (buf);
17967 	name = ipif->ipif_ill->ill_name;
17968 	name_len = ipif->ipif_ill->ill_name_length;
17969 	if (ipif->ipif_id != 0) {
17970 		(void) sprintf(lbuf, "%s%c%d", name, IPIF_SEPARATOR_CHAR,
17971 		    ipif->ipif_id);
17972 		name = lbuf;
17973 		name_len = mi_strlen(name) + 1;
17974 	}
17975 	len -= 1;
17976 	buf[len] = '\0';
17977 	len = MIN(len, name_len);
17978 	bcopy(name, buf, len);
17979 	return (buf);
17980 }
17981 
17982 /*
17983  * Find an IPIF based on the name passed in.  Names can be of the
17984  * form <phys> (e.g., le0), <phys>:<#> (e.g., le0:1),
17985  * The <phys> string can have forms like <dev><#> (e.g., le0),
17986  * <dev><#>.<module> (e.g. le0.foo), or <dev>.<module><#> (e.g. ip.tun3).
17987  * When there is no colon, the implied unit id is zero. <phys> must
17988  * correspond to the name of an ILL.  (May be called as writer.)
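 *
 * A hedged example of the parsing below: a lookup on "ip.tun3:2" splits at
 * the colon, resolves ill "ip.tun3" through ill_lookup_on_name, and scans
 * that ill's ipif list for ipif_id 2, while "ip.tun3:02" is rejected
 * because of its leading zero.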
17989  */
17990 static ipif_t *
17991 ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc,
17992     boolean_t *exists, boolean_t isv6, zoneid_t zoneid, queue_t *q,
17993     mblk_t *mp, ipsq_func_t func, int *error)
17994 {
17995 	char	*cp;
17996 	char	*endp;
17997 	long	id;
17998 	ill_t	*ill;
17999 	ipif_t	*ipif;
18000 	uint_t	ire_type;
18001 	boolean_t did_alloc = B_FALSE;
18002 	ipsq_t	*ipsq;
18003 
18004 	if (error != NULL)
18005 		*error = 0;
18006 
18007 	/*
18008 	 * If the caller wants us to create the ipif, make sure we have a
18009 	 * valid zoneid.
18010 	 */
18011 	ASSERT(!do_alloc || zoneid != ALL_ZONES);
18012 
18013 	if (namelen == 0) {
18014 		if (error != NULL)
18015 			*error = ENXIO;
18016 		return (NULL);
18017 	}
18018 
18019 	*exists = B_FALSE;
18020 	/* Look for a colon in the name. */
18021 	endp = &name[namelen];
18022 	for (cp = endp; --cp > name; ) {
18023 		if (*cp == IPIF_SEPARATOR_CHAR)
18024 			break;
18025 	}
18026 
18027 	if (*cp == IPIF_SEPARATOR_CHAR) {
18028 		/*
18029 		 * Reject any non-decimal aliases for logical
18030 		 * interfaces. Aliases with leading zeroes
18031 		 * are also rejected as they introduce ambiguity
18032 		 * in the naming of the interfaces.
18033 		 * In order to conform to existing semantics,
18034 		 * and to not break any programs/scripts relying
18035 		 * on that behaviour, if<0>:0 is considered to be
18036 		 * a valid interface.
18037 		 *
18038 		 * If alias has two or more digits and the first
18039 		 * is zero, fail.
18040 		 */
18041 		if (&cp[2] < endp && cp[1] == '0')
18042 			return (NULL);
18043 	}
18044 
18045 	if (cp <= name) {
18046 		cp = endp;
18047 	} else {
18048 		*cp = '\0';
18049 	}
18050 
18051 	/*
18052 	 * Look up the ILL, based on the portion of the name
18053 	 * before the colon. ill_lookup_on_name returns a held ill.
18054 	 * did_alloc is a temporary used to tell whether the ill already
18055 	 * existed or had to be allocated by ill_lookup_on_name.
18056 	 */
18057 	ill = ill_lookup_on_name(name, do_alloc, isv6,
18058 	    q, mp, func, error, &did_alloc);
18059 	if (cp != endp)
18060 		*cp = IPIF_SEPARATOR_CHAR;
18061 	if (ill == NULL)
18062 		return (NULL);
18063 
18064 	/* Establish the unit number in the name. */
18065 	id = 0;
18066 	if (cp < endp && *endp == '\0') {
18067 		/* If there was a colon, the unit number follows. */
18068 		cp++;
18069 		if (ddi_strtol(cp, NULL, 0, &id) != 0) {
18070 			ill_refrele(ill);
18071 			if (error != NULL)
18072 				*error = ENXIO;
18073 			return (NULL);
18074 		}
18075 	}
18076 
18077 	GRAB_CONN_LOCK(q);
18078 	mutex_enter(&ill->ill_lock);
18079 	/* Now see if there is an IPIF with this unit number. */
18080 	for (ipif = ill->ill_ipif; ipif; ipif = ipif->ipif_next) {
18081 		if (ipif->ipif_id == id) {
18082 			if (zoneid != ALL_ZONES &&
18083 			    zoneid != ipif->ipif_zoneid) {
18084 				mutex_exit(&ill->ill_lock);
18085 				RELEASE_CONN_LOCK(q);
18086 				ill_refrele(ill);
18087 				if (error != NULL)
18088 					*error = ENXIO;
18089 				return (NULL);
18090 			}
18091 			/*
18092 			 * The block comment at the start of ipif_down
18093 			 * explains the use of the macros used below
18094 			 */
18095 			if (IPIF_CAN_LOOKUP(ipif)) {
18096 				ipif_refhold_locked(ipif);
18097 				mutex_exit(&ill->ill_lock);
18098 				if (!did_alloc)
18099 					*exists = B_TRUE;
18100 				/*
18101 				 * Drop locks before calling ill_refrele
18102 				 * since it can potentially call into
18103 				 * ipif_ill_refrele_tail, which can end up
18104 				 * trying to acquire any lock.
18105 				 */
18106 				RELEASE_CONN_LOCK(q);
18107 				ill_refrele(ill);
18108 				return (ipif);
18109 			} else if (IPIF_CAN_WAIT(ipif, q)) {
18110 				ipsq = ill->ill_phyint->phyint_ipsq;
18111 				mutex_enter(&ipsq->ipsq_lock);
18112 				mutex_exit(&ill->ill_lock);
18113 				ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
18114 				mutex_exit(&ipsq->ipsq_lock);
18115 				RELEASE_CONN_LOCK(q);
18116 				ill_refrele(ill);
18117 				*error = EINPROGRESS;
18118 				return (NULL);
18119 			}
18120 		}
18121 	}
18122 	RELEASE_CONN_LOCK(q);
18123 
18124 	if (!do_alloc) {
18125 		mutex_exit(&ill->ill_lock);
18126 		ill_refrele(ill);
18127 		if (error != NULL)
18128 			*error = ENXIO;
18129 		return (NULL);
18130 	}
18131 
18132 	/*
18133 	 * If none found, atomically allocate and return a new one.
18134 	 * Historically, we used IRE_LOOPBACK only for lun 0, and IRE_LOCAL
18135 	 * to support "receive only" use of lo0:1 etc. as is still done
18136 	 * below as an initial guess.
18137 	 * However, this is now likely to be overridden later in ipif_up_done()
18138 	 * when we know for sure what address has been configured on the
18139 	 * interface, since we might have more than one loopback interface
18140 	 * with a loopback address, e.g. in the case of zones, and all the
18141 	 * interfaces with loopback addresses need to be marked IRE_LOOPBACK.
18142 	 */
18143 	if (ill->ill_net_type == IRE_LOOPBACK && id == 0)
18144 		ire_type = IRE_LOOPBACK;
18145 	else
18146 		ire_type = IRE_LOCAL;
18147 	ipif = ipif_allocate(ill, id, ire_type, B_TRUE);
18148 	if (ipif != NULL)
18149 		ipif_refhold_locked(ipif);
18150 	else if (error != NULL)
18151 		*error = ENOMEM;
18152 	mutex_exit(&ill->ill_lock);
18153 	ill_refrele(ill);
18154 	return (ipif);
18155 }
18156 
18157 /*
18158  * This routine is called whenever a new address comes up on an ipif.  If
18159  * we are configured to respond to address mask requests, then we are supposed
18160  * to broadcast an address mask reply at this time.  This routine is also
18161  * called if we are already up, but a netmask change is made.  This is legal
18162  * but might not make the system manager very popular.	(May be called
18163  * as writer.)
18164  */
18165 static void
18166 ipif_mask_reply(ipif_t *ipif)
18167 {
18168 	icmph_t	*icmph;
18169 	ipha_t	*ipha;
18170 	mblk_t	*mp;
18171 
18172 #define	REPLY_LEN	(sizeof (icmp_ipha) + sizeof (icmph_t) + IP_ADDR_LEN)
18173 
18174 	if (!ip_respond_to_address_mask_broadcast)
18175 		return;
18176 
18177 	/* ICMP mask reply is IPv4 only */
18178 	ASSERT(!ipif->ipif_isv6);
18179 	/* ICMP mask reply is not for a loopback interface */
18180 	ASSERT(ipif->ipif_ill->ill_wq != NULL);
18181 
18182 	mp = allocb(REPLY_LEN, BPRI_HI);
18183 	if (mp == NULL)
18184 		return;
18185 	mp->b_wptr = mp->b_rptr + REPLY_LEN;
18186 
18187 	ipha = (ipha_t *)mp->b_rptr;
18188 	bzero(ipha, REPLY_LEN);
18189 	*ipha = icmp_ipha;
18190 	ipha->ipha_ttl = ip_broadcast_ttl;
18191 	ipha->ipha_src = ipif->ipif_src_addr;
18192 	ipha->ipha_dst = ipif->ipif_brd_addr;
18193 	ipha->ipha_length = htons(REPLY_LEN);
18194 	ipha->ipha_ident = 0;
18195 
18196 	icmph = (icmph_t *)&ipha[1];
18197 	icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY;
18198 	bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN);
18199 	icmph->icmph_checksum = IP_CSUM(mp, sizeof (ipha_t), 0);
18200 	if (icmph->icmph_checksum == 0)
18201 		icmph->icmph_checksum = 0xffff;
18202 
18203 	put(ipif->ipif_wq, mp);
18204 
18205 #undef	REPLY_LEN
18206 }
18207 
18208 /*
18209  * When the mtu in the ipif changes, we call this routine through ire_walk
18210  * to update all the relevant IREs.
18211  * Skip IRE_LOCAL and "loopback" IRE_BROADCAST by checking ire_stq.
18212  */
18213 static void
18214 ipif_mtu_change(ire_t *ire, char *ipif_arg)
18215 {
18216 	ipif_t *ipif = (ipif_t *)ipif_arg;
18217 
18218 	if (ire->ire_stq == NULL || ire->ire_ipif != ipif)
18219 		return;
18220 	ire->ire_max_frag = MIN(ipif->ipif_mtu, IP_MAXPACKET);
18221 }
18222 
18223 /*
18224  * When the mtu in the ill changes, we call this routine through ire_walk
18225  * to update all the relevant IREs.
18226  * Skip IRE_LOCAL and "loopback" IRE_BROADCAST by checking ire_stq.
18227  */
18228 void
18229 ill_mtu_change(ire_t *ire, char *ill_arg)
18230 {
18231 	ill_t	*ill = (ill_t *)ill_arg;
18232 
18233 	if (ire->ire_stq == NULL || ire->ire_ipif->ipif_ill != ill)
18234 		return;
18235 	ire->ire_max_frag = ire->ire_ipif->ipif_mtu;
18236 }
18237 
18238 /*
18239  * Join the ipif specific multicast groups.
18240  * Must be called after a mapping has been set up in the resolver.  (Always
18241  * called as writer.)
18242  */
18243 void
18244 ipif_multicast_up(ipif_t *ipif)
18245 {
18246 	int err, index;
18247 	ill_t *ill;
18248 
18249 	ASSERT(IAM_WRITER_IPIF(ipif));
18250 
18251 	ill = ipif->ipif_ill;
18252 	index = ill->ill_phyint->phyint_ifindex;
18253 
18254 	ip1dbg(("ipif_multicast_up\n"));
18255 	if (!(ill->ill_flags & ILLF_MULTICAST) || ipif->ipif_multicast_up)
18256 		return;
18257 
18258 	if (ipif->ipif_isv6) {
18259 		if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr))
18260 			return;
18261 
18262 		/* Join the all hosts multicast address */
18263 		ip1dbg(("ipif_multicast_up - addmulti\n"));
18264 		/*
18265 		 * Passing B_TRUE means we have to join the multicast
18266 		 * membership on this interface even though the ill is
18267 		 * FAILED. If we join on a different one in the group,
18268 		 * we will not be able to delete the membership later,
18269 		 * as we currently don't track where we join when we
18270 		 * join within the kernel, unlike applications where
18271 		 * we have ilg/ilg_orig_index. See ip_addmulti_v6
18272 		 * for more on this.
18273 		 */
18274 		err = ip_addmulti_v6(&ipv6_all_hosts_mcast, ill, index,
18275 		    ipif->ipif_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL);
18276 		if (err != 0) {
18277 			ip0dbg(("ipif_multicast_up: "
18278 			    "all_hosts_mcast failed %d\n",
18279 			    err));
18280 			return;
18281 		}
18282 		/*
18283 		 * Enable multicast for the solicited node multicast address
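		 * (a hedged worked example: ORing the low 32 bits of the
		 * local address into ff02::1:ff00:0 keeps the 0xff byte and
		 * takes the low 24 bits of the address, so a local address
		 * of fe80::a00:20ff:fe9c:1234 joins ff02::1:ff9c:1234)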
18284 		 */
18285 		if (!(ipif->ipif_flags & IPIF_NOLOCAL)) {
18286 			in6_addr_t ipv6_multi = ipv6_solicited_node_mcast;
18287 
18288 			ipv6_multi.s6_addr32[3] |=
18289 			    ipif->ipif_v6lcl_addr.s6_addr32[3];
18290 
18291 			err = ip_addmulti_v6(&ipv6_multi, ill, index,
18292 			    ipif->ipif_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE,
18293 			    NULL);
18294 			if (err != 0) {
18295 				ip0dbg(("ipif_multicast_up: solicited MC"
18296 				    " failed %d\n", err));
18297 				(void) ip_delmulti_v6(&ipv6_all_hosts_mcast,
18298 				    ill, ill->ill_phyint->phyint_ifindex,
18299 				    ipif->ipif_zoneid, B_TRUE, B_TRUE);
18300 				return;
18301 			}
18302 		}
18303 	} else {
18304 		if (ipif->ipif_lcl_addr == INADDR_ANY)
18305 			return;
18306 
18307 		/* Join the all hosts multicast address */
18308 		ip1dbg(("ipif_multicast_up - addmulti\n"));
18309 		err = ip_addmulti(htonl(INADDR_ALLHOSTS_GROUP), ipif,
18310 		    ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL);
18311 		if (err) {
18312 			ip0dbg(("ipif_multicast_up: failed %d\n", err));
18313 			return;
18314 		}
18315 	}
18316 	ipif->ipif_multicast_up = 1;
18317 }
18318 
18319 /*
18320  * Blow away any IPv6 multicast groups that we joined in ipif_multicast_up();
18321  * any explicit memberships are blown away in ill_leave_multicast() when the
18322  * ill is brought down.
18323  */
18324 static void
18325 ipif_multicast_down(ipif_t *ipif)
18326 {
18327 	int err;
18328 
18329 	ASSERT(IAM_WRITER_IPIF(ipif));
18330 
18331 	ip1dbg(("ipif_multicast_down\n"));
18332 	if (!ipif->ipif_multicast_up)
18333 		return;
18334 
18335 	ASSERT(ipif->ipif_isv6);
18336 
18337 	ip1dbg(("ipif_multicast_down - delmulti\n"));
18338 
18339 	/*
18340 	 * Leave the all hosts multicast address. Similar to ip_addmulti_v6,
18341 	 * we should look for ilms on this ill rather than the ones that have
18342 	 * been failed over here.  They are here temporarily. As
18343 	 * ipif_multicast_up has joined on this ill, we should delete only
18344 	 * from this ill.
18345 	 */
18346 	err = ip_delmulti_v6(&ipv6_all_hosts_mcast, ipif->ipif_ill,
18347 	    ipif->ipif_ill->ill_phyint->phyint_ifindex, ipif->ipif_zoneid,
18348 	    B_TRUE, B_TRUE);
18349 	if (err != 0) {
18350 		ip0dbg(("ipif_multicast_down: all_hosts_mcast failed %d\n",
18351 		    err));
18352 	}
18353 	/*
18354 	 * Disable multicast for the solicited node multicast address
18355 	 */
18356 	if (!(ipif->ipif_flags & IPIF_NOLOCAL)) {
18357 		in6_addr_t ipv6_multi = ipv6_solicited_node_mcast;
18358 
18359 		ipv6_multi.s6_addr32[3] |=
18360 		    ipif->ipif_v6lcl_addr.s6_addr32[3];
18361 
18362 		err = ip_delmulti_v6(&ipv6_multi, ipif->ipif_ill,
18363 		    ipif->ipif_ill->ill_phyint->phyint_ifindex,
18364 		    ipif->ipif_zoneid, B_TRUE, B_TRUE);
18365 
18366 		if (err != 0) {
18367 			ip0dbg(("ipif_multicast_down: sol MC failed %d\n",
18368 			    err));
18369 		}
18370 	}
18371 
18372 	ipif->ipif_multicast_up = 0;
18373 }
18374 
18375 /*
18376  * Used when an interface comes up to recreate any extra routes on this
18377  * interface.
18378  */
18379 static ire_t **
18380 ipif_recover_ire(ipif_t *ipif)
18381 {
18382 	mblk_t	*mp;
18383 	ire_t	**ipif_saved_irep;
18384 	ire_t	**irep;
18385 
18386 	ip1dbg(("ipif_recover_ire(%s:%u)", ipif->ipif_ill->ill_name,
18387 	    ipif->ipif_id));
18388 
18389 	mutex_enter(&ipif->ipif_saved_ire_lock);
18390 	ipif_saved_irep = (ire_t **)kmem_zalloc(sizeof (ire_t *) *
18391 	    ipif->ipif_saved_ire_cnt, KM_NOSLEEP);
18392 	if (ipif_saved_irep == NULL) {
18393 		mutex_exit(&ipif->ipif_saved_ire_lock);
18394 		return (NULL);
18395 	}
18396 
18397 	irep = ipif_saved_irep;
18398 	for (mp = ipif->ipif_saved_ire_mp; mp != NULL; mp = mp->b_cont) {
18399 		ire_t		*ire;
18400 		queue_t		*rfq;
18401 		queue_t		*stq;
18402 		ifrt_t		*ifrt;
18403 		uchar_t		*src_addr;
18404 		uchar_t		*gateway_addr;
18405 		mblk_t		*resolver_mp;
18406 		ushort_t	type;
18407 
18408 		/*
18409 		 * When the ire was initially created and then added in
18410 		 * ip_rt_add(), it was created either using ipif->ipif_net_type
18411 		 * in the case of a traditional interface route, or as one of
18412 		 * the IRE_OFFSUBNET types (with the exception of
18413 		 * IRE_HOST_REDIRECT which is created by icmp_redirect() and
18414 		 * which we don't need to save or recover).  In the case where
18415 		 * ipif->ipif_net_type was IRE_LOOPBACK, ip_rt_add() will update
18416 		 * the ire_type to IRE_IF_NORESOLVER before calling ire_add()
18417 		 * to satisfy software like GateD and Sun Cluster which creates
18418 		 * routes using the loopback interface's address as a
18419 		 * gateway.
18420 		 *
18421 		 * As ifrt->ifrt_type reflects the already updated ire_type and
18422 		 * since ire_create() expects that IRE_IF_NORESOLVER will have
18423 		 * a valid ire_dlureq_mp field (which doesn't make sense for a
18424 		 * IRE_LOOPBACK), ire_create() will be called in the same way
18425 		 * here as in ip_rt_add(), namely using ipif->ipif_net_type when
18426 		 * the route looks like a traditional interface route (where
18427 		 * ifrt->ifrt_type & IRE_INTERFACE is true) and otherwise using
18428 		 * the saved ifrt->ifrt_type.  This means that in the case where
18429 		 * ipif->ipif_net_type is IRE_LOOPBACK, the ire created by
18430 		 * ire_create() will be an IRE_LOOPBACK, it will then be turned
18431 		 * into an IRE_IF_NORESOLVER and then added by ire_add().
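		 *
		 * (The saved routes being recovered live on the chain at
		 * ipif_saved_ire_mp, one ifrt_t per mblk linked through
		 * b_cont, which is what this loop walks.)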
18432 		 */
18433 		ifrt = (ifrt_t *)mp->b_rptr;
18434 		if (ifrt->ifrt_type & IRE_INTERFACE) {
18435 			rfq = NULL;
18436 			stq = (ipif->ipif_net_type == IRE_IF_RESOLVER)
18437 			    ? ipif->ipif_rq : ipif->ipif_wq;
18438 			src_addr = (ifrt->ifrt_flags & RTF_SETSRC)
18439 			    ? (uint8_t *)&ifrt->ifrt_src_addr
18440 			    : (uint8_t *)&ipif->ipif_src_addr;
18441 			gateway_addr = NULL;
18442 			resolver_mp = ipif->ipif_resolver_mp;
18443 			type = ipif->ipif_net_type;
18444 		} else if (ifrt->ifrt_type & IRE_BROADCAST) {
18445 			/* Recover multiroute broadcast IRE. */
18446 			rfq = ipif->ipif_rq;
18447 			stq = ipif->ipif_wq;
18448 			src_addr = (ifrt->ifrt_flags & RTF_SETSRC)
18449 			    ? (uint8_t *)&ifrt->ifrt_src_addr
18450 			    : (uint8_t *)&ipif->ipif_src_addr;
18451 			gateway_addr = (uint8_t *)&ifrt->ifrt_gateway_addr;
18452 			resolver_mp = ipif->ipif_bcast_mp;
18453 			type = ifrt->ifrt_type;
18454 		} else {
18455 			rfq = NULL;
18456 			stq = NULL;
18457 			src_addr = (ifrt->ifrt_flags & RTF_SETSRC)
18458 			    ? (uint8_t *)&ifrt->ifrt_src_addr : NULL;
18459 			gateway_addr = (uint8_t *)&ifrt->ifrt_gateway_addr;
18460 			resolver_mp = NULL;
18461 			type = ifrt->ifrt_type;
18462 		}
18463 
18464 		/*
18465 		 * Create a copy of the IRE with the saved address and netmask.
18466 		 */
18467 		ip1dbg(("ipif_recover_ire: creating IRE %s (%d) for "
18468 		    "0x%x/0x%x\n",
18469 		    ip_nv_lookup(ire_nv_tbl, ifrt->ifrt_type), ifrt->ifrt_type,
18470 		    ntohl(ifrt->ifrt_addr),
18471 		    ntohl(ifrt->ifrt_mask)));
18472 		ire = ire_create(
18473 		    (uint8_t *)&ifrt->ifrt_addr,
18474 		    (uint8_t *)&ifrt->ifrt_mask,
18475 		    src_addr,
18476 		    gateway_addr,
18477 		    NULL,
18478 		    &ifrt->ifrt_max_frag,
18479 		    NULL,
18480 		    rfq,
18481 		    stq,
18482 		    type,
18483 		    resolver_mp,
18484 		    ipif,
18485 		    NULL,
18486 		    0,
18487 		    0,
18488 		    0,
18489 		    ifrt->ifrt_flags,
18490 		    &ifrt->ifrt_iulp_info);
18491 
18492 		if (ire == NULL) {
18493 			mutex_exit(&ipif->ipif_saved_ire_lock);
18494 			kmem_free(ipif_saved_irep,
18495 			    ipif->ipif_saved_ire_cnt * sizeof (ire_t *));
18496 			return (NULL);
18497 		}
18498 
18499 		/*
18500 		 * Some software (for example, GateD and Sun Cluster) attempts
18501 		 * to create (what amount to) IRE_PREFIX routes with the
18502 		 * loopback address as the gateway.  This is primarily done to
18503 		 * set up prefixes with the RTF_REJECT flag set (for example,
18504 		 * when generating aggregate routes.)
18505 		 *
18506 		 * If the IRE type (as defined by ipif->ipif_net_type) is
18507 		 * IRE_LOOPBACK, then we map the request into a
18508 		 * IRE_IF_NORESOLVER.
18509 		 */
18510 		if (ipif->ipif_net_type == IRE_LOOPBACK)
18511 			ire->ire_type = IRE_IF_NORESOLVER;
18512 		/*
18513 		 * The ire is held by ire_add; it will be refrele'd towards
18514 		 * the end of ipif_up_done.
18515 		 */
18516 		(void) ire_add(&ire, NULL, NULL, NULL);
18517 		*irep = ire;
18518 		irep++;
18519 		ip1dbg(("ipif_recover_ire: added ire %p\n", (void *)ire));
18520 	}
18521 	mutex_exit(&ipif->ipif_saved_ire_lock);
18522 	return (ipif_saved_irep);
18523 }
18524 
18525 /*
18526  * Used to set the netmask and broadcast address to default values when the
18527  * interface is brought up.  (Always called as writer.)
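 *
 * A hedged worked example: for an IPv4 address of 192.168.5.9 with no
 * netmask configured, ip_net_mask supplies the natural classful mask
 * 255.255.255.0, and the default broadcast address is then computed as
 * subnet | ~mask = 192.168.5.255.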
18528  */
18529 static void
18530 ipif_set_default(ipif_t *ipif)
18531 {
18532 	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
18533 
18534 	if (!ipif->ipif_isv6) {
18535 		/*
18536 		 * Interface holds an IPv4 address. Default
18537 		 * mask is the natural netmask.
18538 		 */
18539 		if (!ipif->ipif_net_mask) {
18540 			ipaddr_t	v4mask;
18541 
18542 			v4mask = ip_net_mask(ipif->ipif_lcl_addr);
18543 			V4MASK_TO_V6(v4mask, ipif->ipif_v6net_mask);
18544 		}
18545 		if (ipif->ipif_flags & IPIF_POINTOPOINT) {
18546 			/* ipif_subnet is ipif_pp_dst_addr for pt-pt */
18547 			ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
18548 		} else {
18549 			V6_MASK_COPY(ipif->ipif_v6lcl_addr,
18550 			    ipif->ipif_v6net_mask, ipif->ipif_v6subnet);
18551 		}
18552 		/*
18553 		 * NOTE: SunOS 4.X does this even if the broadcast address
18554 		 * has been already set thus we do the same here.
18555 		 */
18556 		if (ipif->ipif_flags & IPIF_BROADCAST) {
18557 			ipaddr_t	v4addr;
18558 
18559 			v4addr = ipif->ipif_subnet | ~ipif->ipif_net_mask;
18560 			IN6_IPADDR_TO_V4MAPPED(v4addr, &ipif->ipif_v6brd_addr);
18561 		}
18562 	} else {
18563 		/*
18564 		 * Interface holds an IPv6-only address.  Default
18565 		 * mask is all-ones.
18566 		 */
18567 		if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask))
18568 			ipif->ipif_v6net_mask = ipv6_all_ones;
18569 		if (ipif->ipif_flags & IPIF_POINTOPOINT) {
18570 			/* ipif_subnet is ipif_pp_dst_addr for pt-pt */
18571 			ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
18572 		} else {
18573 			V6_MASK_COPY(ipif->ipif_v6lcl_addr,
18574 			    ipif->ipif_v6net_mask, ipif->ipif_v6subnet);
18575 		}
18576 	}
18577 }
18578 
18579 /*
18580  * Return 0 if this address can be used as local address without causing
18581  * duplicate address problems. Otherwise, return EADDRNOTAVAIL if the address
18582  * is already up on a different ill, and EADDRINUSE if it's up on the same ill.
18583  * Special checks are needed to allow the same IPv6 link-local address
18584  * on different ills.
18585  * TODO: allowing the same site-local address on different ill's.
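 *
 * For example, the same IPv6 link-local address is tolerated on two
 * different ills (the link-local case below), and a duplicate address on
 * a loopback phyint is tolerated across distinct zones.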
18586  */
18587 int
18588 ip_addr_availability_check(ipif_t *new_ipif)
18589 {
18590 	in6_addr_t our_v6addr;
18591 	ill_t *ill;
18592 	ipif_t *ipif;
18593 	ill_walk_context_t ctx;
18594 
18595 	ASSERT(IAM_WRITER_IPIF(new_ipif));
18596 	ASSERT(MUTEX_HELD(&ip_addr_avail_lock));
18597 	ASSERT(RW_READ_HELD(&ill_g_lock));
18598 
18599 	new_ipif->ipif_flags &= ~IPIF_UNNUMBERED;
18600 	if (IN6_IS_ADDR_UNSPECIFIED(&new_ipif->ipif_v6lcl_addr) ||
18601 	    IN6_IS_ADDR_V4MAPPED_ANY(&new_ipif->ipif_v6lcl_addr))
18602 		return (0);
18603 
18604 	our_v6addr = new_ipif->ipif_v6lcl_addr;
18605 
18606 	if (new_ipif->ipif_isv6)
18607 		ill = ILL_START_WALK_V6(&ctx);
18608 	else
18609 		ill = ILL_START_WALK_V4(&ctx);
18610 
18611 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
18612 		for (ipif = ill->ill_ipif; ipif != NULL;
18613 		    ipif = ipif->ipif_next) {
18614 			if ((ipif == new_ipif) ||
18615 			    !(ipif->ipif_flags & IPIF_UP) ||
18616 			    (ipif->ipif_flags & IPIF_UNNUMBERED))
18617 				continue;
18618 			if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
18619 			    &our_v6addr)) {
18620 				if (new_ipif->ipif_flags & IPIF_POINTOPOINT)
18621 				    new_ipif->ipif_flags |= IPIF_UNNUMBERED;
18622 				else if (ipif->ipif_flags & IPIF_POINTOPOINT)
18623 				    ipif->ipif_flags |= IPIF_UNNUMBERED;
18624 				else if (IN6_IS_ADDR_LINKLOCAL(&our_v6addr) &&
18625 				    new_ipif->ipif_ill != ill)
18626 					continue;
18627 				else if (IN6_IS_ADDR_SITELOCAL(&our_v6addr) &&
18628 				    new_ipif->ipif_ill != ill)
18629 					continue;
18630 				else if (new_ipif->ipif_zoneid !=
18631 				    ipif->ipif_zoneid &&
18632 				    (ill->ill_phyint->phyint_flags &
18633 				    PHYI_LOOPBACK))
18634 					continue;
18635 				else if (new_ipif->ipif_ill == ill)
18636 					return (EADDRINUSE);
18637 				else
18638 					return (EADDRNOTAVAIL);
18639 			}
18640 		}
18641 	}
18642 
18643 	return (0);
18644 }
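
/*
 * Illustrative sketch: the verdict the walk above reaches once an ipif
 * with a matching address is found, reduced to a pure function.  The
 * name and parameters are hypothetical; "scope_exempt" stands for the
 * link-local/site-local/loopback-zone cases tolerated across ills.
 */
#if 0
static int
sketch_dup_addr_verdict(boolean_t same_ill, boolean_t scope_exempt)
{
	if (!same_ill && scope_exempt)
		return (0);		/* allowed duplicate */
	return (same_ill ? EADDRINUSE : EADDRNOTAVAIL);
}
#endif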
18645 
18646 /*
18647  * Bring up an ipif: bring up arp/ndp, bring up the DLPI stream, and add
18648  * IREs for the ipif.
18649  * When the routine returns EINPROGRESS then mp has been consumed and
18650  * the ioctl will be acked from ip_rput_dlpi.
18651  */
18652 static int
18653 ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp)
18654 {
18655 	ill_t	*ill = ipif->ipif_ill;
18656 	boolean_t isv6 = ipif->ipif_isv6;
18657 	int	err = 0;
18658 	boolean_t success;
18659 
18660 	ASSERT(IAM_WRITER_IPIF(ipif));
18661 
18662 	ip1dbg(("ipif_up(%s:%u)\n", ill->ill_name, ipif->ipif_id));
18663 
18664 	/* Shouldn't get here if it is already up. */
18665 	if (ipif->ipif_flags & IPIF_UP)
18666 		return (EALREADY);
18667 
18668 	/* Skip arp/ndp for any loopback interface. */
18669 	if (ill->ill_wq != NULL) {
18670 		conn_t *connp = Q_TO_CONN(q);
18671 		ipsq_t	*ipsq = ill->ill_phyint->phyint_ipsq;
18672 
18673 		if (!ill->ill_dl_up) {
18674 			/*
18675 			 * ill_dl_up is not yet set. i.e. we are yet to
18676 			 * DL_BIND with the driver and this is the first
18677 			 * logical interface on the ill to become "up".
18678 			 * Tell the driver to get going (via DL_BIND_REQ).
18679 			 * Note that changing "significant" IFF_ flags
18680 			 * address/netmask etc cause a down/up dance, but
18681 			 * does not cause an unbind (DL_UNBIND) with the driver
18682 			 */
18683 			return (ill_dl_up(ill, ipif, mp, q));
18684 		}
18685 
18686 		/*
18687 		 * ipif_resolver_up may end up sending an
18688 		 * AR_INTERFACE_UP message to ARP, which would, in
18689 		 * turn send a DLPI message to the driver. ioctls are
18690 		 * serialized and so we cannot send more than one
18691 		 * interface up message at a time. If ipif_resolver_up
18692 		 * does send an interface up message to ARP, we get
18693 		 * EINPROGRESS and we will complete in ip_arp_done.
18694 		 */
18695 
18696 		ASSERT(connp != NULL);
18697 		ASSERT(ipsq->ipsq_pending_mp == NULL);
18698 		mutex_enter(&connp->conn_lock);
18699 		mutex_enter(&ill->ill_lock);
18700 		success = ipsq_pending_mp_add(connp, ipif, q, mp, 0);
18701 		mutex_exit(&ill->ill_lock);
18702 		mutex_exit(&connp->conn_lock);
18703 		if (!success)
18704 			return (EINTR);
18705 
18706 		/*
18707 		 * Crank up IPv6 neighbor discovery
18708 		 * Unlike ARP, this should complete when
18709 		 * ipif_ndp_up returns. However, for
18710 		 * ILLF_XRESOLV interfaces we also send an
18711 		 * AR_INTERFACE_UP to the external resolver.
18712 		 * That ioctl will complete in ip_rput.
18713 		 */
18714 		if (isv6) {
18715 			err = ipif_ndp_up(ipif, &ipif->ipif_v6lcl_addr,
18716 			    B_FALSE);
18717 			if (err != 0) {
18718 				mp = ipsq_pending_mp_get(ipsq, &connp);
18719 				return (err);
18720 			}
18721 		}
18722 		/* Now, ARP */
18723 		if ((err = ipif_resolver_up(ipif, B_FALSE)) ==
18724 		    EINPROGRESS) {
18725 			/* We will complete it in ip_arp_done */
18726 			return (err);
18727 		}
18728 		mp = ipsq_pending_mp_get(ipsq, &connp);
18729 		ASSERT(mp != NULL);
18730 		if (err != 0)
18731 			return (err);
18732 	}
18733 	return (isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif));
18734 }
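
/*
 * Illustrative sketch: the serialization contract used above.  The
 * ioctl mblk is parked on the ipsq before anything that may complete
 * asynchronously is sent; on synchronous completion it is taken back.
 * All names below are hypothetical.
 */
#if 0
static int
sketch_async_ioctl(sketch_ipsq_t *sq, sketch_msg_t *mp)
{
	if (!sketch_park(sq, mp))
		return (EINTR);		/* could not queue; caller keeps mp */
	if (sketch_kick_resolver() == EINPROGRESS)
		return (EINPROGRESS);	/* ack path completes the ioctl */
	(void) sketch_unpark(sq);	/* finished synchronously */
	return (0);
}
#endif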
18735 
18736 /*
18737  * Perform a bind for the physical device.
18738  * When the routine returns EINPROGRESS then mp has been consumed and
18739  * the ioctl will be acked from ip_rput_dlpi.
18740  * Allocate an unbind message and save it until ipif_down.
18741  */
18742 static int
18743 ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
18744 {
18745 	mblk_t	*areq_mp = NULL;
18746 	mblk_t	*bind_mp = NULL;
18747 	mblk_t	*unbind_mp = NULL;
18748 	conn_t	*connp;
18749 	boolean_t success;
18750 
18751 	ip1dbg(("ill_dl_up(%s)\n", ill->ill_name));
18752 	ASSERT(IAM_WRITER_ILL(ill));
18753 
18754 	ASSERT(mp != NULL);
18755 
18756 	/* Create a resolver cookie for ARP */
18757 	if (!ill->ill_isv6 && ill->ill_net_type == IRE_IF_RESOLVER) {
18758 		areq_t		*areq;
18759 		uint16_t	sap_addr;
18760 
18761 		areq_mp = ill_arp_alloc(ill,
18762 			(uchar_t *)&ip_areq_template, 0);
18763 		if (areq_mp == NULL) {
18764 			return (ENOMEM);
18765 		}
18766 		freemsg(ill->ill_resolver_mp);
18767 		ill->ill_resolver_mp = areq_mp;
18768 		areq = (areq_t *)areq_mp->b_rptr;
18769 		sap_addr = ill->ill_sap;
18770 		bcopy(&sap_addr, areq->areq_sap, sizeof (sap_addr));
18771 		/*
18772 		 * Wait till we call ipsq_pending_mp_add to determine
18773 		 * the success before we free the ill_resolver_mp and
18774 		 * attach areq_mp in its place.
18775 		 */
18776 	}
18777 	bind_mp = ip_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long),
18778 	    DL_BIND_REQ);
18779 	if (bind_mp == NULL)
18780 		goto bad;
18781 	((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ill->ill_sap;
18782 	((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS;
18783 
18784 	unbind_mp = ip_dlpi_alloc(sizeof (dl_unbind_req_t), DL_UNBIND_REQ);
18785 	if (unbind_mp == NULL)
18786 		goto bad;
18787 
18788 	/*
18789 	 * Record state needed to complete this operation when the
18790 	 * DL_BIND_ACK shows up.  Also remember the pre-allocated mblks.
18791 	 */
18792 	if (WR(q)->q_next == NULL) {
18793 		connp = Q_TO_CONN(q);
18794 		mutex_enter(&connp->conn_lock);
18795 	} else {
18796 		connp = NULL;
18797 	}
18798 	mutex_enter(&ipif->ipif_ill->ill_lock);
18799 	success = ipsq_pending_mp_add(connp, ipif, q, mp, 0);
18800 	mutex_exit(&ipif->ipif_ill->ill_lock);
18801 	if (connp != NULL)
18802 		mutex_exit(&connp->conn_lock);
18803 	if (!success)
18804 		goto bad;
18805 
18806 	/*
18807 	 * Save the unbind message for ill_dl_down(); it will be consumed when
18808 	 * the interface goes down.
18809 	 */
18810 	ASSERT(ill->ill_unbind_mp == NULL);
18811 	ill->ill_unbind_mp = unbind_mp;
18812 
18813 	ill_dlpi_send(ill, bind_mp);
18814 	/* Send down link-layer capabilities probe if not already done. */
18815 	ill_capability_probe(ill);
18816 
18817 	/*
18818 	 * Sysid used to rely on the fact that netboots set domainname
18819 	 * and the like. Now that miniroot boots aren't strictly netboots
18820 	 * and miniroot network configuration is driven from userland,
18821 	 * these things still need to be set. This situation can be detected
18822 	 * by comparing the interface being configured here to the one
18823 	 * dhcack was set to reference by the boot loader. Once sysid is
18824 	 * converted to use dhcp_ipc_getinfo() this call can go away.
18825 	 */
18826 	if ((ipif->ipif_flags & IPIF_DHCPRUNNING) && (dhcack != NULL) &&
18827 	    (strcmp(ill->ill_name, dhcack) == 0) &&
18828 	    (strlen(srpc_domain) == 0)) {
18829 		if (dhcpinit() != 0)
18830 			cmn_err(CE_WARN, "no cached dhcp response");
18831 	}
18832 
18833 	/*
18834 	 * This operation will complete in ip_rput_dlpi with either
18835 	 * a DL_BIND_ACK or DL_ERROR_ACK.
18836 	 */
18837 	return (EINPROGRESS);
18838 bad:
18839 	ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name));
18840 	/*
18841 	 * We don't have to check for possible removal from illgrp
18842 	 * as we have not yet inserted in illgrp. For groups
18843 	 * without names, this ipif is still not UP and hence
18844 	 * this could not have possibly had any influence in forming
18845 	 * groups.
18846 	 */
18847 
18848 	if (bind_mp != NULL)
18849 		freemsg(bind_mp);
18850 	if (unbind_mp != NULL)
18851 		freemsg(unbind_mp);
18852 	return (ENOMEM);
18853 }
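
/*
 * Illustrative sketch: the shape of the DL_BIND_REQ payload filled in
 * above, using the structures from <sys/dlpi.h>.  The helper name is
 * hypothetical.
 */
#if 0
static void
sketch_fill_bind(dl_bind_req_t *br, t_uscalar_t sap)
{
	br->dl_sap = sap;		/* SAP the stream binds to */
	br->dl_service_mode = DL_CLDLS;	/* connectionless data link */
}
#endif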
18854 
18855 uint_t ip_loopback_mtuplus = IP_LOOPBACK_MTU + IP_SIMPLE_HDR_LENGTH + 20;
18856 
18857 /*
18858  * DLPI and ARP are up.
18859  * Create all the IREs associated with an interface, bring up multicast.
18860  * Set the interface flag and finish other initialization
18861  * that potentially had to be deferred until after DL_BIND_ACK.
18862  */
18863 int
18864 ipif_up_done(ipif_t *ipif)
18865 {
18866 	ire_t	*ire_array[20];
18867 	ire_t	**irep = ire_array;
18868 	ire_t	**irep1;
18869 	ipaddr_t net_mask = 0;
18870 	ipaddr_t subnet_mask, route_mask;
18871 	ill_t	*ill = ipif->ipif_ill;
18872 	queue_t	*stq;
18873 	ipif_t	 *src_ipif;
18874 	ipif_t   *tmp_ipif;
18875 	boolean_t	flush_ire_cache = B_TRUE;
18876 	int	err = 0;
18877 	phyint_t *phyi;
18878 	ire_t	**ipif_saved_irep = NULL;
18879 	int ipif_saved_ire_cnt;
18880 	int	cnt;
18881 	boolean_t	src_ipif_held = B_FALSE;
18882 	boolean_t	ire_added = B_FALSE;
18883 	boolean_t	loopback = B_FALSE;
18884 
18885 	ip1dbg(("ipif_up_done(%s:%u)\n",
18886 		ipif->ipif_ill->ill_name, ipif->ipif_id));
18887 	/* Check if this is a loopback interface */
18888 	if (ipif->ipif_ill->ill_wq == NULL)
18889 		loopback = B_TRUE;
18890 
18891 	ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
18892 	/*
18893 	 * If all other interfaces for this ill are down or DEPRECATED,
18894 	 * or otherwise unsuitable for source address selection, remove
18895 	 * any IRE_CACHE entries for this ill to make sure source
18896 	 * address selection gets to take this new ipif into account.
18897 	 * No need to hold ill_lock while traversing the ipif list since
18898 	 * we are the writer.
18899 	 */
18900 	for (tmp_ipif = ill->ill_ipif; tmp_ipif;
18901 		tmp_ipif = tmp_ipif->ipif_next) {
18902 		if (((tmp_ipif->ipif_flags &
18903 		    (IPIF_NOXMIT|IPIF_ANYCAST|IPIF_NOLOCAL|IPIF_DEPRECATED)) ||
18904 		    !(tmp_ipif->ipif_flags & IPIF_UP)) ||
18905 		    (tmp_ipif == ipif))
18906 			continue;
18907 		/* first usable pre-existing interface */
18908 		flush_ire_cache = B_FALSE;
18909 		break;
18910 	}
18911 	if (flush_ire_cache)
18912 		ire_walk_ill_v4(MATCH_IRE_ILL_GROUP | MATCH_IRE_TYPE,
18913 		    IRE_CACHE, ill_ipif_cache_delete, (char *)ill, ill);
18914 
18915 	/*
18916 	 * Figure out which way the send-to queue should go.  Only
18917 	 * IRE_IF_RESOLVER or IRE_IF_NORESOLVER or IRE_LOOPBACK
18918 	 * should show up here.
18919 	 */
18920 	switch (ill->ill_net_type) {
18921 	case IRE_IF_RESOLVER:
18922 		stq = ill->ill_rq;
18923 		break;
18924 	case IRE_IF_NORESOLVER:
18925 	case IRE_LOOPBACK:
18926 		stq = ill->ill_wq;
18927 		break;
18928 	default:
18929 		return (EINVAL);
18930 	}
18931 
18932 	if (ill->ill_phyint->phyint_flags & PHYI_LOOPBACK) {
18933 		/*
18934 		 * lo0:1 and subsequent ipifs were marked IRE_LOCAL in
18935 		 * ipif_lookup_on_name(), but in the case of zones we can have
18936 		 * several loopback addresses on lo0. So all the interfaces with
18937 		 * loopback addresses need to be marked IRE_LOOPBACK.
18938 		 */
18939 		if (V4_PART_OF_V6(ipif->ipif_v6lcl_addr) ==
18940 		    htonl(INADDR_LOOPBACK))
18941 			ipif->ipif_ire_type = IRE_LOOPBACK;
18942 		else
18943 			ipif->ipif_ire_type = IRE_LOCAL;
18944 	}
18945 
18946 	if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED)) {
18947 		/*
18948 		 * Can't use our source address. Select a different
18949 		 * source address for the IRE_INTERFACE and IRE_LOCAL
18950 		 */
18951 		src_ipif = ipif_select_source(ipif->ipif_ill,
18952 		    ipif->ipif_subnet, ipif->ipif_zoneid);
18953 		if (src_ipif == NULL)
18954 			src_ipif = ipif;	/* Last resort */
18955 		else
18956 			src_ipif_held = B_TRUE;
18957 	} else {
18958 		src_ipif = ipif;
18959 	}
18960 
18961 	/* Create all the IREs associated with this interface */
18962 	if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
18963 	    !(ipif->ipif_flags & IPIF_NOLOCAL)) {
18964 		/* Register the source address for __sin6_src_id */
18965 		err = ip_srcid_insert(&ipif->ipif_v6lcl_addr,
18966 		    ipif->ipif_zoneid);
18967 		if (err != 0) {
18968 			ip0dbg(("ipif_up_done: srcid_insert %d\n", err));
18969 			return (err);
18970 		}
18971 		/* If the interface address is set, create the local IRE. */
18972 		ip1dbg(("ipif_up_done: 0x%p creating IRE 0x%x for 0x%x\n",
18973 			(void *)ipif,
18974 			ipif->ipif_ire_type,
18975 			ntohl(ipif->ipif_lcl_addr)));
18976 		*irep++ = ire_create(
18977 		    (uchar_t *)&ipif->ipif_lcl_addr,	/* dest address */
18978 		    (uchar_t *)&ip_g_all_ones,		/* mask */
18979 		    (uchar_t *)&src_ipif->ipif_src_addr, /* source address */
18980 		    NULL,				/* no gateway */
18981 		    NULL,
18982 		    &ip_loopback_mtuplus,		/* max frag size */
18983 		    NULL,
18984 		    ipif->ipif_rq,			/* recv-from queue */
18985 		    NULL,				/* no send-to queue */
18986 		    ipif->ipif_ire_type,		/* LOCAL or LOOPBACK */
18987 		    NULL,
18988 		    ipif,
18989 		    NULL,
18990 		    0,
18991 		    0,
18992 		    0,
18993 		    (ipif->ipif_flags & IPIF_PRIVATE) ?
18994 		    RTF_PRIVATE : 0,
18995 		    &ire_uinfo_null);
18996 	} else {
18997 		ip1dbg((
18998 		    "ipif_up_done: not creating IRE %d for 0x%x: flags 0x%x\n",
18999 		    ipif->ipif_ire_type,
19000 		    ntohl(ipif->ipif_lcl_addr),
19001 		    (uint_t)ipif->ipif_flags));
19002 	}
19003 	if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
19004 	    !(ipif->ipif_flags & IPIF_NOLOCAL)) {
19005 		net_mask = ip_net_mask(ipif->ipif_lcl_addr);
19006 	} else {
19007 		net_mask = htonl(IN_CLASSA_NET);	/* fallback */
19008 	}
19009 
19010 	subnet_mask = ipif->ipif_net_mask;
19011 
19012 	/*
19013 	 * If mask was not specified, use natural netmask of
19014 	 * interface address. Also, store this mask back into the
19015 	 * ipif struct.
19016 	 */
19017 	if (subnet_mask == 0) {
19018 		subnet_mask = net_mask;
19019 		V4MASK_TO_V6(subnet_mask, ipif->ipif_v6net_mask);
19020 		V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
19021 		    ipif->ipif_v6subnet);
19022 	}
19023 
19024 	/* Set up the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. */
19025 	if (stq != NULL && !(ipif->ipif_flags & IPIF_NOXMIT) &&
19026 	    ipif->ipif_subnet != INADDR_ANY) {
19027 		/* ipif_subnet is ipif_pp_dst_addr for pt-pt */
19028 
19029 		if (ipif->ipif_flags & IPIF_POINTOPOINT) {
19030 			route_mask = IP_HOST_MASK;
19031 		} else {
19032 			route_mask = subnet_mask;
19033 		}
19034 
19035 		ip1dbg(("ipif_up_done: ipif 0x%p ill 0x%p "
19036 		    "creating if IRE ill_net_type 0x%x for 0x%x\n",
19037 			(void *)ipif, (void *)ill,
19038 			ill->ill_net_type,
19039 			ntohl(ipif->ipif_subnet)));
19040 		*irep++ = ire_create(
19041 		    (uchar_t *)&ipif->ipif_subnet,	/* dest address */
19042 		    (uchar_t *)&route_mask,		/* mask */
19043 		    (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */
19044 		    NULL,				/* no gateway */
19045 		    NULL,
19046 		    &ipif->ipif_mtu,			/* max frag */
19047 		    NULL,
19048 		    NULL,				/* no recv queue */
19049 		    stq,				/* send-to queue */
19050 		    ill->ill_net_type,			/* IF_[NO]RESOLVER */
19051 		    ill->ill_resolver_mp,		/* xmit header */
19052 		    ipif,
19053 		    NULL,
19054 		    0,
19055 		    0,
19056 		    0,
19057 		    (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE: 0,
19058 		    &ire_uinfo_null);
19059 	}
19060 
19061 	/*
19062 	 * If the interface address is set, create the broadcast IREs.
19063 	 *
19064 	 * ire_create_bcast checks if the proposed new IRE matches
19065 	 * any existing IRE's with the same physical interface (ILL).
19066 	 * This should get rid of duplicates.
19067 	 * ire_create_bcast also checks IPIF_NOXMIT and does not create
19068 	 * any broadcast ires.
19069 	 */
19070 	if ((ipif->ipif_subnet != INADDR_ANY) &&
19071 	    (ipif->ipif_flags & IPIF_BROADCAST)) {
19072 		ipaddr_t addr;
19073 
19074 		ip1dbg(("ipif_up_done: creating broadcast IRE\n"));
19075 		irep = ire_check_and_create_bcast(ipif, 0, irep,
19076 		    (MATCH_IRE_TYPE | MATCH_IRE_ILL));
19077 		irep = ire_check_and_create_bcast(ipif, INADDR_BROADCAST, irep,
19078 		    (MATCH_IRE_TYPE | MATCH_IRE_ILL));
19079 
19080 		/*
19081 		 * For backward compatibility, we need to create net
19082 		 * broadcast ire's based on the old "IP address class
19083 		 * system."  The reason is that some old machines only
19084 		 * respond to these class-derived net broadcasts.
19085 		 *
19086 		 * But we should not create these net broadcast ire's if
19087 		 * the subnet_mask is shorter than the IP address class based
19088 		 * derived netmask.  Otherwise, we may create a net
19089 		 * broadcast address which is the same as an IP address
19090 		 * on the subnet.  Then TCP will refuse to talk to that
19091 		 * address.
19092 		 *
19093 		 * Nor do we need IRE_BROADCAST ires for an interface
19094 		 * with a netmask of 0xFFFFFFFF, as the IRE_LOCAL for that
19095 		 * interface is already created.  Creating these broadcast
19096 		 * ires would only create confusion, as the "addr" would be
19097 		 * the same as the IP address of the interface.
19098 		 */
19099 		if (net_mask < subnet_mask) {
19100 			addr = net_mask & ipif->ipif_subnet;
19101 			irep = ire_check_and_create_bcast(ipif, addr, irep,
19102 			    (MATCH_IRE_TYPE | MATCH_IRE_ILL));
19103 			irep = ire_check_and_create_bcast(ipif,
19104 			    ~net_mask | addr, irep,
19105 			    (MATCH_IRE_TYPE | MATCH_IRE_ILL));
19106 		}
19107 
19108 		if (subnet_mask != 0xFFFFFFFF) {
19109 			addr = ipif->ipif_subnet;
19110 			irep = ire_check_and_create_bcast(ipif, addr, irep,
19111 			    (MATCH_IRE_TYPE | MATCH_IRE_ILL));
19112 			irep = ire_check_and_create_bcast(ipif,
19113 			    ~subnet_mask|addr, irep,
19114 			    (MATCH_IRE_TYPE | MATCH_IRE_ILL));
19115 		}
19116 	}
19117 
19118 	ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
19119 
19120 	/* If an earlier ire_create failed, get out now */
19121 	for (irep1 = irep; irep1 > ire_array; ) {
19122 		irep1--;
19123 		if (*irep1 == NULL) {
19124 			ip1dbg(("ipif_up_done: NULL ire found in ire_array\n"));
19125 			err = ENOMEM;
19126 			goto bad;
19127 		}
19128 	}
19129 
19130 	/*
19131 	 * Need to atomically run ip_addr_availability_check
19132 	 * under ip_addr_avail_lock; if it fails, go to bad and also
19133 	 * remove from the group. The ill_g_lock is grabbed as reader
19134 	 * just to make sure no new ills or new ipifs are being added
19135 	 * to the system while we are checking the uniqueness of addresses.
19136 	 */
19137 	rw_enter(&ill_g_lock, RW_READER);
19138 	mutex_enter(&ip_addr_avail_lock);
19139 	/* Mark it up, and increment counters. */
19140 	ill->ill_ipif_up_count++;
19141 	ipif->ipif_flags |= IPIF_UP;
19142 	err = ip_addr_availability_check(ipif);
19143 	mutex_exit(&ip_addr_avail_lock);
19144 	rw_exit(&ill_g_lock);
19145 
19146 	if (err != 0) {
19147 		/*
19148 		 * Our address may already be up on the same ill. In this case,
19149 		 * the ARP entry for our ipif replaced the one for the other
19150 		 * ipif. So we don't want to delete it (otherwise the other ipif
19151 		 * would be unable to send packets).
19152 		 * ip_addr_availability_check() identifies this case for us and
19153 		 * returns EADDRINUSE; we need to turn it into EADDRNOTAVAIL
19154 		 * which is the expected error code.
19155 		 */
19156 		if (err == EADDRINUSE) {
19157 			freemsg(ipif->ipif_arp_del_mp);
19158 			ipif->ipif_arp_del_mp = NULL;
19159 			err = EADDRNOTAVAIL;
19160 		}
19161 		ill->ill_ipif_up_count--;
19162 		ipif->ipif_flags &= ~IPIF_UP;
19163 		goto bad;
19164 	}
19165 
19166 	/*
19167 	 * Add in all newly created IREs.  ire_create_bcast() has
19168 	 * already checked for duplicates of the IRE_BROADCAST type.
19169 	 * We want to add before we call ifgrp_insert which wants
19170 	 * to know whether IRE_IF_RESOLVER exists or not.
19171 	 *
19172 	 * NOTE : We refrele the ire though we may branch to "bad"
19173 	 *	  later on where we do ire_delete. This is okay
19174 	 *	  because nobody can delete it as we are running
19175 	 *	  exclusively.
19176 	 */
19177 	for (irep1 = irep; irep1 > ire_array; ) {
19178 		irep1--;
19179 		ASSERT(!MUTEX_HELD(&((*irep1)->ire_ipif->ipif_ill->ill_lock)));
19180 		/*
19181 		 * refheld by ire_add; refrele towards the end of the func.
19182 		 */
19183 		(void) ire_add(irep1, NULL, NULL, NULL);
19184 	}
19185 	ire_added = B_TRUE;
19186 	/*
19187 	 * Form groups if possible.
19188 	 *
19189 	 * If we are supposed to be in a ill_group with a name, insert it
19190 	 * now as we know that at least one ipif is UP. Otherwise form
19191 	 * nameless groups.
19192 	 *
19193 	 * If ip_enable_group_ifs is set and ipif address is not 0, insert
19194 	 * this ipif into the appropriate interface group, or create a
19195 	 * new one. If this is already in a nameless group, we try to form
19196 	 * a bigger group looking at other ills potentially sharing this
19197 	 * ipif's prefix.
19198 	 */
19199 	phyi = ill->ill_phyint;
19200 	if (phyi->phyint_groupname_len != 0) {
19201 		ASSERT(phyi->phyint_groupname != NULL);
19202 		if (ill->ill_ipif_up_count == 1) {
19203 			ASSERT(ill->ill_group == NULL);
19204 			err = illgrp_insert(&illgrp_head_v4, ill,
19205 			    phyi->phyint_groupname, NULL, B_TRUE);
19206 			if (err != 0) {
19207 				ip1dbg(("ipif_up_done: illgrp allocation "
19208 				    "failed, error %d\n", err));
19209 				goto bad;
19210 			}
19211 		}
19212 		ASSERT(ill->ill_group != NULL);
19213 	}
19214 
19215 	/*
19216 	 * When this is part of group, we need to make sure that
19217 	 * any broadcast ires created because of this ipif coming
19218 	 * UP gets marked/cleared with IRE_MARK_NORECV appropriately
19219 	 * so that we don't receive duplicate broadcast packets.
19220 	 */
19221 	if (ill->ill_group != NULL && ill->ill_ipif_up_count != 0)
19222 		ipif_renominate_bcast(ipif);
19223 
19224 	/* Recover any additional IRE_IF_[NO]RESOLVER entries for this ipif */
19225 	ipif_saved_ire_cnt = ipif->ipif_saved_ire_cnt;
19226 	ipif_saved_irep = ipif_recover_ire(ipif);
19227 
19228 	if (!loopback) {
19229 		/*
19230 		 * If the broadcast address has been set, make sure it makes
19231 		 * sense based on the interface address.
19232 		 * Only match on ill since we are sharing broadcast addresses.
19233 		 */
19234 		if ((ipif->ipif_brd_addr != INADDR_ANY) &&
19235 		    (ipif->ipif_flags & IPIF_BROADCAST)) {
19236 			ire_t	*ire;
19237 
19238 			ire = ire_ctable_lookup(ipif->ipif_brd_addr, 0,
19239 			    IRE_BROADCAST, ipif, ALL_ZONES,
19240 			    (MATCH_IRE_TYPE | MATCH_IRE_ILL));
19241 
19242 			if (ire == NULL) {
19243 				/*
19244 				 * If there isn't a matching broadcast IRE,
19245 				 * revert to the default for this netmask.
19246 				 */
19247 				ipif->ipif_v6brd_addr = ipv6_all_zeros;
19248 				mutex_enter(&ipif->ipif_ill->ill_lock);
19249 				ipif_set_default(ipif);
19250 				mutex_exit(&ipif->ipif_ill->ill_lock);
19251 			} else {
19252 				ire_refrele(ire);
19253 			}
19254 		}
19255 
19256 	}
19257 
19258 
19259 	/* This is the first interface on this ill */
19260 	if (ill->ill_ipif_up_count == 1 && !loopback) {
19261 		/*
19262 		 * Need to recover all multicast memberships in the driver.
19263 		 * This had to be deferred until we had attached.
19264 		 */
19265 		ill_recover_multicast(ill);
19266 	}
19267 	/* Join the allhosts multicast address */
19268 	ipif_multicast_up(ipif);
19269 
19270 	if (!loopback) {
19271 		/*
19272 		 * See whether anybody else would benefit from the
19273 		 * new ipif that we added. We always call this, rather
19274 		 * than only when adding a non-IPIF_NOLOCAL/DEPRECATED/ANYCAST
19275 		 * ipif, for the benefit of illgrp_insert (done above),
19276 		 * which does not do source address selection itself as it
19277 		 * does not want to re-create the interface routes that we
19278 		 * hold references to here.
19279 		 */
19280 		ill_update_source_selection(ill);
19281 	}
19282 
19283 	for (irep1 = irep; irep1 > ire_array; ) {
19284 		irep1--;
19285 		if (*irep1 != NULL) {
19286 			/* was held in ire_add */
19287 			ire_refrele(*irep1);
19288 		}
19289 	}
19290 
19291 	cnt = ipif_saved_ire_cnt;
19292 	for (irep1 = ipif_saved_irep; cnt > 0; irep1++, cnt--) {
19293 		if (*irep1 != NULL) {
19294 			/* was held in ire_add */
19295 			ire_refrele(*irep1);
19296 		}
19297 	}
19298 
19299 	/*
19300 	 * This had to be deferred until we had bound.
19301 	 * Tell routing sockets that this interface is up.
19302 	 */
19303 	ip_rts_ifmsg(ipif);
19304 	ip_rts_newaddrmsg(RTM_ADD, 0, ipif);
19305 
19306 	if (!loopback) {
19307 		/* Broadcast an address mask reply. */
19308 		ipif_mask_reply(ipif);
19309 	}
19310 	if (ipif_saved_irep != NULL) {
19311 		kmem_free(ipif_saved_irep,
19312 		    ipif_saved_ire_cnt * sizeof (ire_t *));
19313 	}
19314 	if (src_ipif_held)
19315 		ipif_refrele(src_ipif);
19316 	/* Let SCTP update the status for this ipif */
19317 	sctp_update_ipif(ipif, SCTP_IPIF_UP);
19318 	return (0);
19319 
19320 bad:
19321 	ip1dbg(("ipif_up_done: FAILED\n"));
19322 	/*
19323 	 * We don't have to bother removing from ill groups because
19324 	 *
19325 	 * 1) For groups with names, we insert only when the first ipif
19326 	 *    comes up. In that case if it fails, it will not be in any
19327 	 *    group. So, we need not try to remove for that case.
19328 	 *
19329 	 * 2) For groups without names, either we tried to insert ipif_ill
19330 	 *    in a group as singleton or found some other group to become
19331 	 *    a bigger group. For the former, if it fails we don't have
19332 	 *    anything to do as ipif_ill is not in the group and for the
19333 	 *    latter, there are no failures in illgrp_insert/illgrp_delete
19334 	 *    (ENOMEM can't occur for this. Check illgrp_insert).
19335 	 */
19336 	while (irep > ire_array) {
19337 		irep--;
19338 		if (*irep != NULL) {
19339 			ire_delete(*irep);
19340 			if (ire_added)
19341 				ire_refrele(*irep);
19342 		}
19343 	}
19344 	(void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid);
19345 
19346 	if (ipif_saved_irep != NULL) {
19347 		kmem_free(ipif_saved_irep,
19348 		    ipif_saved_ire_cnt * sizeof (ire_t *));
19349 	}
19350 	if (src_ipif_held)
19351 		ipif_refrele(src_ipif);
19352 
19353 	ipif_arp_down(ipif);
19354 	return (err);
19355 }
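
/*
 * Illustrative sketch: the four extra broadcast destinations considered
 * above, beyond 0.0.0.0 and 255.255.255.255.  The struct and field
 * names are hypothetical.
 */
#if 0
typedef struct sketch_bcasts {
	ipaddr_t net_zero;	/* net_mask & subnet */
	ipaddr_t net_ones;	/* ~net_mask | (net_mask & subnet) */
	ipaddr_t sub_zero;	/* subnet itself */
	ipaddr_t sub_ones;	/* ~subnet_mask | subnet */
} sketch_bcasts_t;
#endif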
19356 
19357 /*
19358  * Turn off ARP, in response to the ILLF_NOARP flag being set.
19359  */
19360 static int
19361 ill_arp_off(ill_t *ill)
19362 {
19363 	mblk_t	*arp_off_mp = NULL;
19364 	mblk_t	*arp_on_mp = NULL;
19365 
19366 	ip1dbg(("ill_arp_off(%s)\n", ill->ill_name));
19367 
19368 	ASSERT(IAM_WRITER_ILL(ill));
19369 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
19370 
19371 	/*
19372 	 * If the on message is still around we've already done
19373 	 * an arp_off without doing an arp_on thus there is no
19374 	 * work needed.
19375 	 */
19376 	if (ill->ill_arp_on_mp != NULL)
19377 		return (0);
19378 
19379 	/*
19380 	 * Allocate an ARP on message (to be saved) and an ARP off message
19381 	 */
19382 	arp_off_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aroff_template, 0);
19383 	if (!arp_off_mp)
19384 		return (ENOMEM);
19385 
19386 	arp_on_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aron_template, 0);
19387 	if (!arp_on_mp)
19388 		goto failed;
19389 
19390 	ASSERT(ill->ill_arp_on_mp == NULL);
19391 	ill->ill_arp_on_mp = arp_on_mp;
19392 
19393 	/* Send an AR_INTERFACE_OFF request */
19394 	putnext(ill->ill_rq, arp_off_mp);
19395 	return (0);
19396 failed:
19397 
19398 	if (arp_off_mp)
19399 		freemsg(arp_off_mp);
19400 	return (ENOMEM);
19401 }
19402 
19403 /*
19404  * Turn on ARP by turning off the ILLF_NOARP flag.
19405  */
19406 static int
19407 ill_arp_on(ill_t *ill)
19408 {
19409 	mblk_t	*mp;
19410 
19411 	ip1dbg(("ill_arp_on(%s)\n", ill->ill_name));
19412 
19413 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
19414 
19415 	ASSERT(IAM_WRITER_ILL(ill));
19416 	/*
19417 	 * Send an AR_INTERFACE_ON request if we have already done
19418 	 * an arp_off (which allocated the message).
19419 	 */
19420 	if (ill->ill_arp_on_mp != NULL) {
19421 		mp = ill->ill_arp_on_mp;
19422 		ill->ill_arp_on_mp = NULL;
19423 		putnext(ill->ill_rq, mp);
19424 	}
19425 	return (0);
19426 }
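
/*
 * Illustrative sketch: the toggle idiom used by the two functions
 * above.  The "on" message is allocated while turning ARP off and
 * cached, so turning ARP back on can never fail for lack of memory.
 * All names below are hypothetical.
 */
#if 0
static int
sketch_arp_off(sketch_ill_t *s)
{
	if (s->saved_on_mp != NULL)
		return (0);		/* already off; nothing to do */
	if ((s->saved_on_mp = sketch_alloc_on_mp()) == NULL)
		return (ENOMEM);	/* fail before changing any state */
	sketch_send_off_mp(s);
	return (0);
}
#endif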
19427 
19428 /*
19429  * Called after either deleting ill from the group or when setting
19430  * FAILED or STANDBY on the interface.
19431  */
19432 static void
19433 illgrp_reset_schednext(ill_t *ill)
19434 {
19435 	ill_group_t *illgrp;
19436 	ill_t *save_ill;
19437 
19438 	ASSERT(IAM_WRITER_ILL(ill));
19439 	/*
19440 	 * When called from illgrp_delete, ill_group will be non-NULL.
19441 	 * But when called from ip_sioctl_flags, it could be NULL if
19442 	 * somebody is setting FAILED/INACTIVE on some interface which
19443 	 * is not part of a group.
19444 	 */
19445 	illgrp = ill->ill_group;
19446 	if (illgrp == NULL)
19447 		return;
19448 	if (illgrp->illgrp_ill_schednext != ill)
19449 		return;
19450 
19451 	illgrp->illgrp_ill_schednext = NULL;
19452 	save_ill = ill;
19453 	/*
19454 	 * Choose a good ill to be the next one for
19455 	 * outbound traffic. As the FAILED/STANDBY flags are
19456 	 * not yet set when called from ip_sioctl_flags,
19457 	 * we check the ill separately.
19458 	 */
19459 	for (ill = illgrp->illgrp_ill; ill != NULL;
19460 	    ill = ill->ill_group_next) {
19461 		if ((ill != save_ill) &&
19462 		    !(ill->ill_phyint->phyint_flags &
19463 		    (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE))) {
19464 			illgrp->illgrp_ill_schednext = ill;
19465 			return;
19466 		}
19467 	}
19468 }
19469 
19470 /*
19471  * Given an ill, find the next ill in the group to be scheduled.
19472  * (This should be called by ip_newroute() before ire_create().)
19473  * The passed in ill may be pulled out of the group, after we have picked
19474  * up a different outgoing ill from the same group. However, ire_add will
19475  * atomically check this.
19476  */
19477 ill_t *
19478 illgrp_scheduler(ill_t *ill)
19479 {
19480 	ill_t *retill;
19481 	ill_group_t *illgrp;
19482 	int illcnt;
19483 	int i;
19484 	uint64_t flags;
19485 
19486 	/*
19487 	 * We don't use a lock to check for the ill_group. If this ill
19488 	 * is currently being inserted we may end up just returning this
19489 	 * ill itself. That is ok.
19490 	 */
19491 	if (ill->ill_group == NULL) {
19492 		ill_refhold(ill);
19493 		return (ill);
19494 	}
19495 
19496 	/*
19497 	 * Grab the ill_g_lock as reader to make sure we are dealing with
19498 	 * a set of stable ills. No ill can be added or deleted or change
19499 	 * group while we hold the reader lock.
19500 	 */
19501 	rw_enter(&ill_g_lock, RW_READER);
19502 	if ((illgrp = ill->ill_group) == NULL) {
19503 		rw_exit(&ill_g_lock);
19504 		ill_refhold(ill);
19505 		return (ill);
19506 	}
19507 
19508 	illcnt = illgrp->illgrp_ill_count;
19509 	mutex_enter(&illgrp->illgrp_lock);
19510 	retill = illgrp->illgrp_ill_schednext;
19511 
19512 	if (retill == NULL)
19513 		retill = illgrp->illgrp_ill;
19514 
19515 	/*
19516 	 * We do a circular search beginning at illgrp_ill_schednext
19517 	 * or illgrp_ill. We don't check the flags under the ill lock
19518 	 * since they can change at any time. The ire creation will be atomic
19519 	 * and will fail if the ill is FAILED or OFFLINE.
19520 	 */
19521 	for (i = 0; i < illcnt; i++) {
19522 		flags = retill->ill_phyint->phyint_flags;
19523 
19524 		if (!(flags & (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) &&
19525 		    ILL_CAN_LOOKUP(retill)) {
19526 			illgrp->illgrp_ill_schednext = retill->ill_group_next;
19527 			ill_refhold(retill);
19528 			break;
19529 		}
19530 		retill = retill->ill_group_next;
19531 		if (retill == NULL)
19532 			retill = illgrp->illgrp_ill;
19533 	}
19534 	mutex_exit(&illgrp->illgrp_lock);
19535 	rw_exit(&ill_g_lock);
19536 
19537 	return (i == illcnt ? NULL : retill);
19538 }
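
/*
 * Illustrative sketch: the circular scan above, over a plain array
 * instead of the ill_group_next list.  Names are hypothetical.
 */
#if 0
static int
sketch_next_usable(int start, int n, const boolean_t *usable)
{
	int i;

	for (i = 0; i < n; i++) {
		int idx = (start + i) % n;

		if (usable[idx])
			return (idx);	/* becomes the new schednext */
	}
	return (-1);			/* all FAILED/INACTIVE/OFFLINE */
}
#endif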
19539 
19540 /*
19541  * Checks for availability of a usable source address (if there is one) when the
19542  * destination ILL has the ill_usesrc_ifindex pointing to another ILL. Note
19543  * this selection is done regardless of the destination.
19544  */
19545 boolean_t
19546 ipif_usesrc_avail(ill_t *ill, zoneid_t zoneid)
19547 {
19548 	uint_t	ifindex;
19549 	ipif_t	*ipif = NULL;
19550 	ill_t	*uill;
19551 	boolean_t isv6;
19552 
19553 	ASSERT(ill != NULL);
19554 
19555 	isv6 = ill->ill_isv6;
19556 	ifindex = ill->ill_usesrc_ifindex;
19557 	if (ifindex != 0) {
19558 		uill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL,
19559 		    NULL);
19560 		if (uill == NULL)
19561 			return (B_FALSE);
19562 		mutex_enter(&uill->ill_lock);
19563 		for (ipif = uill->ill_ipif; ipif != NULL;
19564 		    ipif = ipif->ipif_next) {
19565 			if (!IPIF_CAN_LOOKUP(ipif))
19566 				continue;
19567 			if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
19568 				continue;
19569 			if (!(ipif->ipif_flags & IPIF_UP))
19570 				continue;
19571 			if (ipif->ipif_zoneid != zoneid)
19572 				continue;
19573 			if ((isv6 &&
19574 			    IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) ||
19575 			    (ipif->ipif_lcl_addr == INADDR_ANY))
19576 				continue;
19577 			mutex_exit(&uill->ill_lock);
19578 			ill_refrele(uill);
19579 			return (B_TRUE);
19580 		}
19581 		mutex_exit(&uill->ill_lock);
19582 		ill_refrele(uill);
19583 	}
19584 	return (B_FALSE);
19585 }
19586 
19587 /*
19588  * Determine the best source address given a destination address and an ill.
19589  * Prefers non-deprecated over deprecated but will return a deprecated
19590  * address if there is no other choice. If there is a usable source address
19591  * on the interface pointed to by ill_usesrc_ifindex then that is given
19592  * first preference.
19593  *
19594  * Returns NULL if there is no suitable source address for the ill.
19595  * This only occurs when there is no valid source address for the ill.
19596  */
19597 ipif_t *
19598 ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid)
19599 {
19600 	ipif_t *ipif;
19601 	ipif_t *ipif_dep = NULL;	/* Fallback to deprecated */
19602 	ipif_t *ipif_arr[MAX_IPIF_SELECT_SOURCE];
19603 	int index = 0;
19604 	boolean_t wrapped = B_FALSE;
19605 	boolean_t same_subnet_only = B_FALSE;
19606 	boolean_t ipif_same_found, ipif_other_found;
19607 	ill_t	*till, *usill = NULL;
19608 
19609 	if (ill->ill_usesrc_ifindex != 0) {
19610 		usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex, B_FALSE,
19611 		    NULL, NULL, NULL, NULL);
19612 		if (usill != NULL)
19613 			ill = usill;	/* Select source from usesrc ILL */
19614 		else
19615 			return (NULL);
19616 	}
19617 
19618 	/*
19619 	 * Holds the ill_g_lock as reader. This makes sure that no ipif/ill
19620 	 * can be deleted. But an ipif/ill can get CONDEMNED any time.
19621 	 * After selecting the right ipif, under ill_lock make sure ipif is
19622 	 * not condemned, and increment refcnt. If ipif is CONDEMNED,
19623 	 * we retry. Inside the loop we still need to check for CONDEMNED,
19624 	 * but not under a lock.
19625 	 */
19626 	rw_enter(&ill_g_lock, RW_READER);
19627 
19628 retry:
19629 	till = ill;
19630 	ipif_arr[0] = NULL;
19631 
19632 	if (till->ill_group != NULL)
19633 		till = till->ill_group->illgrp_ill;
19634 
19635 	/*
19636 	 * Choose one good source address from each ill across the group.
19637 	 * If possible choose a source address in the same subnet as
19638 	 * the destination address.
19639 	 *
19640 	 * We don't check for PHYI_FAILED or PHYI_INACTIVE or PHYI_OFFLINE
19641 	 * This is okay because of the following.
19642 	 *
19643 	 *    If PHYI_FAILED is set and we still have non-deprecated
19644 	 *    addresses, it means the addresses have not yet been
19645 	 *    failed over to a different interface. We potentially
19646 	 *    select them to create IRE_CACHES, which will be later
19647 	 *    flushed when the addresses move over.
19648 	 *
19649 	 *    If PHYI_INACTIVE is set and we still have non-deprecated
19650 	 *    addresses, it means either the user has configured them
19651 	 *    or PHYI_INACTIVE has not been cleared after the addresses
19652 	 *    have been moved over. For the former, in.mpathd does a failover
19653 	 *    when the interface becomes INACTIVE and hence we should
19654 	 *    not find them. Once INACTIVE is set, we don't allow them
19655 	 *    to create logical interfaces anymore. For the latter, a
19656 	 *    flush will happen when INACTIVE is cleared which will
19657 	 *    flush the IRE_CACHES.
19658 	 *
19659 	 *    If PHYI_OFFLINE is set, all the addresses will be failed
19660 	 *    over soon. We potentially select them to create IRE_CACHEs,
19661 	 *    which will be later flushed when the addresses move over.
19662 	 *
19663 	 * NOTE : As ipif_select_source is called to borrow source address
19664 	 * for an ipif that is part of a group, source address selection
19665 	 * will be re-done whenever the group changes, i.e. on any
19666 	 * insertion into or deletion from the group.
19667 	 *
19668 	 * Fill ipif_arr[] with source addresses, using these rules:
19669 	 *
19670 	 *	1. At most one source address from a given ill ends up
19671 	 *	   in ipif_arr[] -- that is, at most one of the ipif's
19672 	 *	   associated with a given ill ends up in ipif_arr[].
19673 	 *
19674 	 *	2. If there is at least one non-deprecated ipif in the
19675 	 *	   IPMP group with a source address on the same subnet as
19676 	 *	   our destination, then fill ipif_arr[] only with
19677 	 *	   source addresses on the same subnet as our destination.
19678 	 *	   Note that because of (1), only the first
19679 	 *	   non-deprecated ipif found with a source address
19680 	 *	   matching the destination ends up in ipif_arr[].
19681 	 *
19682 	 *	3. Otherwise, fill ipif_arr[] with non-deprecated source
19683 	 *	   addresses not in the same subnet as our destination.
19684 	 *	   Again, because of (1), only the first off-subnet source
19685 	 *	   address will be chosen.
19686 	 *
19687 	 *	4. If there are no non-deprecated ipifs, then just use
19688 	 *	   the source address associated with the last deprecated
19689 	 *	   one we find that happens to be on the same subnet,
19690 	 *	   otherwise the first one not in the same subnet.
19691 	 */
19692 	for (; till != NULL; till = till->ill_group_next) {
19693 		ipif_same_found = B_FALSE;
19694 		ipif_other_found = B_FALSE;
19695 		for (ipif = till->ill_ipif; ipif != NULL;
19696 		    ipif = ipif->ipif_next) {
19697 			if (!IPIF_CAN_LOOKUP(ipif))
19698 				continue;
19699 			/* Always skip NOLOCAL and ANYCAST interfaces */
19700 			if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
19701 				continue;
19702 			if (!(ipif->ipif_flags & IPIF_UP))
19703 				continue;
19704 			if (ipif->ipif_zoneid != zoneid)
19705 				continue;
19706 			/*
19707 			 * Interfaces with 0.0.0.0 address are allowed to be UP,
19708 			 * but are not valid as source addresses.
19709 			 */
19710 			if (ipif->ipif_lcl_addr == INADDR_ANY)
19711 				continue;
19712 			if (ipif->ipif_flags & IPIF_DEPRECATED) {
19713 				if (ipif_dep == NULL ||
19714 				    (ipif->ipif_net_mask & dst) ==
19715 				    ipif->ipif_subnet)
19716 					ipif_dep = ipif;
19717 				continue;
19718 			}
19719 			if ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet) {
19720 				/* found a source address in the same subnet */
19721 				if (same_subnet_only == B_FALSE) {
19722 					same_subnet_only = B_TRUE;
19723 					index = 0;
19724 				}
19725 				ipif_same_found = B_TRUE;
19726 			} else {
19727 				if (same_subnet_only == B_TRUE ||
19728 				    ipif_other_found == B_TRUE)
19729 					continue;
19730 				ipif_other_found = B_TRUE;
19731 			}
19732 			ipif_arr[index++] = ipif;
19733 			if (index == MAX_IPIF_SELECT_SOURCE) {
19734 				wrapped = B_TRUE;
19735 				index = 0;
19736 			}
19737 			if (ipif_same_found == B_TRUE)
19738 				break;
19739 		}
19740 	}
19741 
19742 	if (ipif_arr[0] == NULL) {
19743 		ipif = ipif_dep;
19744 	} else {
19745 		if (wrapped)
19746 			index = MAX_IPIF_SELECT_SOURCE;
19747 		ipif = ipif_arr[ipif_rand() % index];
19748 		ASSERT(ipif != NULL);
19749 	}
19750 
19751 	if (ipif != NULL) {
19752 		mutex_enter(&ipif->ipif_ill->ill_lock);
19753 		if (!IPIF_CAN_LOOKUP(ipif)) {
19754 			mutex_exit(&ipif->ipif_ill->ill_lock);
19755 			goto retry;
19756 		}
19757 		ipif_refhold_locked(ipif);
19758 		mutex_exit(&ipif->ipif_ill->ill_lock);
19759 	}
19760 
19761 	rw_exit(&ill_g_lock);
19762 	if (usill != NULL)
19763 		ill_refrele(usill);
19764 
19765 #ifdef DEBUG
19766 	if (ipif == NULL) {
19767 		char buf1[INET6_ADDRSTRLEN];
19768 
19769 		ip1dbg(("ipif_select_source(%s, %s) -> NULL\n",
19770 		    ill->ill_name,
19771 		    inet_ntop(AF_INET, &dst, buf1, sizeof (buf1))));
19772 	} else {
19773 		char buf1[INET6_ADDRSTRLEN];
19774 		char buf2[INET6_ADDRSTRLEN];
19775 
19776 		ip1dbg(("ipif_select_source(%s, %s) -> %s\n",
19777 		    ipif->ipif_ill->ill_name,
19778 		    inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)),
19779 		    inet_ntop(AF_INET, &ipif->ipif_lcl_addr,
19780 		    buf2, sizeof (buf2))));
19781 	}
19782 #endif /* DEBUG */
19783 	return (ipif);
19784 }
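
/*
 * Illustrative sketch: the final pick above is a uniform choice over
 * the candidates collected in ipif_arr[]; when the array wrapped, all
 * MAX_IPIF_SELECT_SOURCE slots are live.  Written as a hypothetical
 * helper around the file's own ipif_rand().
 */
#if 0
static ipif_t *
sketch_pick_source(ipif_t **arr, int filled, boolean_t wrapped)
{
	int n = wrapped ? MAX_IPIF_SELECT_SOURCE : filled;

	return (n == 0 ? NULL : arr[ipif_rand() % n]);
}
#endif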
19785 
19786 
19787 /*
19788  * If old_ipif is not NULL, see if ipif was derived from old
19789  * ipif and if so, recreate the interface route by re-doing
19790  * source address selection. This happens when ipif_down ->
19791  * ipif_update_other_ipifs calls us.
19792  *
19793  * If old_ipif is NULL, just redo the source address selection
19794  * if needed. This happens when illgrp_insert or ipif_up_done
19795  * calls us.
19796  */
19797 static void
19798 ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif)
19799 {
19800 	ire_t *ire;
19801 	ire_t *ipif_ire;
19802 	queue_t *stq;
19803 	ipif_t *nipif;
19804 	ill_t *ill;
19805 	boolean_t need_rele = B_FALSE;
19806 
19807 	ASSERT(old_ipif == NULL || IAM_WRITER_IPIF(old_ipif));
19808 	ASSERT(IAM_WRITER_IPIF(ipif));
19809 
19810 	ill = ipif->ipif_ill;
19811 	if (!(ipif->ipif_flags &
19812 	    (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) {
19813 		/*
19814 		 * Can't possibly have borrowed the source
19815 		 * from old_ipif.
19816 		 */
19817 		return;
19818 	}
19819 
19820 	/*
19821 	 * Is there any work to be done? No work if the address
19822 	 * is INADDR_ANY, loopback or NOLOCAL or ANYCAST (
19823 	 * ipif_select_source() does not borrow addresses from
19824 	 * NOLOCAL and ANYCAST interfaces).
19825 	 */
19826 	if ((old_ipif != NULL) &&
19827 	    ((old_ipif->ipif_lcl_addr == INADDR_ANY) ||
19828 	    (old_ipif->ipif_ill->ill_wq == NULL) ||
19829 	    (old_ipif->ipif_flags &
19830 	    (IPIF_NOLOCAL|IPIF_ANYCAST)))) {
19831 		return;
19832 	}
19833 
19834 	/*
19835 	 * Perform the same checks as when creating the
19836 	 * IRE_INTERFACE in ipif_up_done.
19837 	 */
19838 	if (!(ipif->ipif_flags & IPIF_UP))
19839 		return;
19840 
19841 	if ((ipif->ipif_flags & IPIF_NOXMIT) ||
19842 	    (ipif->ipif_subnet == INADDR_ANY))
19843 		return;
19844 
19845 	ipif_ire = ipif_to_ire(ipif);
19846 	if (ipif_ire == NULL)
19847 		return;
19848 
19849 	/*
19850 	 * We know that ipif uses some other source for its
19851 	 * IRE_INTERFACE. Is it using the source of this
19852 	 * old_ipif?
19853 	 */
19854 	if (old_ipif != NULL &&
19855 	    old_ipif->ipif_lcl_addr != ipif_ire->ire_src_addr) {
19856 		ire_refrele(ipif_ire);
19857 		return;
19858 	}
19859 	if (ip_debug > 2) {
19860 		/* ip1dbg */
19861 		pr_addr_dbg("ipif_recreate_interface_routes: deleting IRE for"
19862 		    " src %s\n", AF_INET, &ipif_ire->ire_src_addr);
19863 	}
19864 
19865 	stq = ipif_ire->ire_stq;
19866 
19867 	/*
19868 	 * Can't use our source address. Select a different
19869 	 * source address for the IRE_INTERFACE.
19870 	 */
19871 	nipif = ipif_select_source(ill, ipif->ipif_subnet, ipif->ipif_zoneid);
19872 	if (nipif == NULL) {
19873 		/* Last resort - all ipif's have IPIF_NOLOCAL */
19874 		nipif = ipif;
19875 	} else {
19876 		need_rele = B_TRUE;
19877 	}
19878 
19879 	ire = ire_create(
19880 	    (uchar_t *)&ipif->ipif_subnet,	/* dest pref */
19881 	    (uchar_t *)&ipif->ipif_net_mask,	/* mask */
19882 	    (uchar_t *)&nipif->ipif_src_addr,	/* src addr */
19883 	    NULL,				/* no gateway */
19884 	    NULL,
19885 	    &ipif->ipif_mtu,			/* max frag */
19886 	    NULL,				/* fast path header */
19887 	    NULL,				/* no recv from queue */
19888 	    stq,				/* send-to queue */
19889 	    ill->ill_net_type,			/* IF_[NO]RESOLVER */
19890 	    ill->ill_resolver_mp,		/* xmit header */
19891 	    ipif,
19892 	    NULL,
19893 	    0,
19894 	    0,
19895 	    0,
19896 	    0,
19897 	    &ire_uinfo_null);
19898 
19899 	if (ire != NULL) {
19900 		ire_t *ret_ire;
19901 		int error;
19902 
19903 		/*
19904 		 * We don't need ipif_ire anymore. We need to delete
19905 		 * before we add so that ire_add does not detect
19906 		 * duplicates.
19907 		 */
19908 		ire_delete(ipif_ire);
19909 		ret_ire = ire;
19910 		error = ire_add(&ret_ire, NULL, NULL, NULL);
19911 		ASSERT(error == 0);
19912 		ASSERT(ire == ret_ire);
19913 		/* Held in ire_add */
19914 		ire_refrele(ret_ire);
19915 	}
19916 	/*
19917 	 * Either we are falling through from above or could not
19918 	 * allocate a replacement.
19919 	 */
19920 	ire_refrele(ipif_ire);
19921 	if (need_rele)
19922 		ipif_refrele(nipif);
19923 }
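
/*
 * Illustrative sketch: the replace idiom above.  The old interface
 * route must be deleted before the replacement is added, or ire_add()
 * would reject the new entry as a duplicate.  All names below are
 * hypothetical.
 */
#if 0
static int
sketch_replace_route(sketch_route_t *oldr, sketch_route_t *newr)
{
	sketch_route_delete(oldr);	 /* remove before inserting */
	return (sketch_route_add(newr)); /* same prefix, new source addr */
}
#endif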
19924 
19925 /*
19926  * This old_ipif is going away.
19927  *
19928  * Determine if any other ipifs are using our address as the source
19929  * address of their IRE_INTERFACE (due to those being IPIF_NOLOCAL,
19930  * IPIF_ANYCAST, or IPIF_DEPRECATED).
19931  * Find the IRE_INTERFACE for such ipifs and recreate them
19932  * to use a different source address, following the rules in
19933  * ipif_up_done.
19934  *
19935  * This function takes an illgrp as an argument so that illgrp_delete
19936  * can call this to update source address even after deleting the
19937  * old_ipif->ipif_ill from the ill group.
19938  */
19939 static void
19940 ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp)
19941 {
19942 	ipif_t *ipif;
19943 	ill_t *ill;
19944 	char	buf[INET6_ADDRSTRLEN];
19945 
19946 	ASSERT(IAM_WRITER_IPIF(old_ipif));
19947 	ASSERT(illgrp == NULL || IAM_WRITER_IPIF(old_ipif));
19948 
19949 	ill = old_ipif->ipif_ill;
19950 
19951 	ip1dbg(("ipif_update_other_ipifs(%s, %s)\n",
19952 	    ill->ill_name,
19953 	    inet_ntop(AF_INET, &old_ipif->ipif_lcl_addr,
19954 	    buf, sizeof (buf))));
19955 	/*
19956 	 * If this is part of a group, look at all ills, as ipif_select_source
19957 	 * borrows source address across all the ills in the group.
19958 	 */
19959 	if (illgrp != NULL)
19960 		ill = illgrp->illgrp_ill;
19961 
19962 	for (; ill != NULL; ill = ill->ill_group_next) {
19963 		for (ipif = ill->ill_ipif; ipif != NULL;
19964 		    ipif = ipif->ipif_next) {
19965 
19966 			if (ipif == old_ipif)
19967 				continue;
19968 
19969 			ipif_recreate_interface_routes(old_ipif, ipif);
19970 		}
19971 	}
19972 }
19973 
19974 /* ARGSUSED */
19975 int
19976 if_unitsel_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
19977 	ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
19978 {
19979 	/*
19980 	 * ill_phyint_reinit merged the v4 and v6 into a single
19981 	 * ipsq. Could also have become part of an IPMP group in the
19982 	 * process, and we might not have been able to complete the
19983 	 * operation in ipif_set_values, if we could not become
19984 	 * exclusive.  If so, restart it here.
19985 	 */
19986 	return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q));
19987 }
19988 
19989 
19990 /* ARGSUSED */
19991 int
19992 if_unitsel(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
19993     ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
19994 {
19995 	queue_t		*q1 = q;
19996 	char 		*cp;
19997 	char		interf_name[LIFNAMSIZ];
19998 	uint_t		ppa = *(uint_t *)mp->b_cont->b_cont->b_rptr;
19999 
20000 	if (!q->q_next) {
20001 		ip1dbg((
20002 		    "if_unitsel: IF_UNITSEL: no q_next\n"));
20003 		return (EINVAL);
20004 	}
20005 
20006 	if (((ill_t *)(q->q_ptr))->ill_name[0] != '\0')
20007 		return (EALREADY);
20008 
20009 	do {
20010 		q1 = q1->q_next;
20011 	} while (q1->q_next);
20012 	cp = q1->q_qinfo->qi_minfo->mi_idname;
20013 	(void) sprintf(interf_name, "%s%d", cp, ppa);
20014 
20015 	/*
20016 	 * Here we are not going to delay the ioack until after
20017 	 * ACKs from DL_ATTACH_REQ/DL_BIND_REQ. So no need to save the
20018 	 * original ioctl message before sending the requests.
20019 	 */
20020 	return (ipif_set_values(q, mp, interf_name, &ppa));
20021 }
20022 
20023 /* ARGSUSED */
20024 int
20025 ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
20026     ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
20027 {
20028 	return (ENXIO);
20029 }
20030 
20031 /*
20032  * Net and subnet broadcast ire's are now specific to the particular
20033  * physical interface (ill) and not to any one logical interface (ipif).
20034  * However, if a particular logical interface is being taken down, its
20035  * associated ires will be taken down as well.  Hence, when we go to
20036  * take down or change the local address, broadcast address or netmask
20037  * of a specific logical interface, we must check to make sure that we
20038  * have valid net and subnet broadcast ire's for the other logical
20039  * interfaces which may have been shared with the logical interface
20040  * being brought down or changed.
20041  *
20042  * There is one set of 0.0.0.0 and 255.255.255.255 per ill. Usually it
20043  * is tied to the first interface coming UP. If that ipif is going down,
20044  * we need to recreate them on the next valid ipif.
20045  *
20046  * Note: assume that the ipif passed in is still up so that its IRE
20047  * entries are still valid.
20048  */
20049 static void
20050 ipif_check_bcast_ires(ipif_t *test_ipif)
20051 {
20052 	ipif_t	*ipif;
20053 	ire_t	*test_subnet_ire, *test_net_ire;
20054 	ire_t	*test_allzero_ire, *test_allone_ire;
20055 	ire_t	*ire_array[12];
20056 	ire_t	**irep = &ire_array[0];
20057 	ire_t	**irep1;
20058 
20059 	ipaddr_t net_addr, subnet_addr, net_mask, subnet_mask;
20060 	ipaddr_t test_net_addr, test_subnet_addr;
20061 	ipaddr_t test_net_mask, test_subnet_mask;
20062 	boolean_t need_net_bcast_ire = B_FALSE;
20063 	boolean_t need_subnet_bcast_ire = B_FALSE;
20064 	boolean_t allzero_bcast_ire_created = B_FALSE;
20065 	boolean_t allone_bcast_ire_created = B_FALSE;
20066 	boolean_t net_bcast_ire_created = B_FALSE;
20067 	boolean_t subnet_bcast_ire_created = B_FALSE;
20068 
20069 	ipif_t  *backup_ipif_net = (ipif_t *)NULL;
20070 	ipif_t  *backup_ipif_subnet = (ipif_t *)NULL;
20071 	ipif_t  *backup_ipif_allzeros = (ipif_t *)NULL;
20072 	ipif_t  *backup_ipif_allones = (ipif_t *)NULL;
20073 	uint64_t check_flags = IPIF_DEPRECATED | IPIF_NOLOCAL | IPIF_ANYCAST;
20074 
20075 	ASSERT(!test_ipif->ipif_isv6);
20076 	ASSERT(IAM_WRITER_IPIF(test_ipif));
20077 
20078 	/*
20079 	 * No broadcast IREs for the LOOPBACK interface
20080 	 * or others such as point to point and IPIF_NOXMIT.
20081 	 */
20082 	if (!(test_ipif->ipif_flags & IPIF_BROADCAST) ||
20083 	    (test_ipif->ipif_flags & IPIF_NOXMIT))
20084 		return;
20085 
20086 	test_allzero_ire = ire_ctable_lookup(0, 0, IRE_BROADCAST,
20087 	    test_ipif, ALL_ZONES, (MATCH_IRE_TYPE | MATCH_IRE_IPIF));
20088 
20089 	test_allone_ire = ire_ctable_lookup(INADDR_BROADCAST, 0, IRE_BROADCAST,
20090 	    test_ipif, ALL_ZONES, (MATCH_IRE_TYPE | MATCH_IRE_IPIF));
20091 
20092 	test_net_mask = ip_net_mask(test_ipif->ipif_subnet);
20093 	test_subnet_mask = test_ipif->ipif_net_mask;
20094 
20095 	/*
20096 	 * If no net mask set, assume the default based on net class.
20097 	 */
20098 	if (test_subnet_mask == 0)
20099 		test_subnet_mask = test_net_mask;
20100 
20101 	/*
20102 	 * Check if there is a network broadcast ire associated with this ipif
20103 	 */
20104 	test_net_addr = test_net_mask  & test_ipif->ipif_subnet;
20105 	test_net_ire = ire_ctable_lookup(test_net_addr, 0, IRE_BROADCAST,
20106 	    test_ipif, ALL_ZONES, (MATCH_IRE_TYPE | MATCH_IRE_IPIF));
20107 
20108 	/*
20109 	 * Check if there is a subnet broadcast IRE associated with this ipif
20110 	 */
20111 	test_subnet_addr = test_subnet_mask  & test_ipif->ipif_subnet;
20112 	test_subnet_ire = ire_ctable_lookup(test_subnet_addr, 0, IRE_BROADCAST,
20113 	    test_ipif, ALL_ZONES, (MATCH_IRE_TYPE | MATCH_IRE_IPIF));
20114 
20115 	/*
20116 	 * No broadcast ire's associated with this ipif.
20117 	 */
20118 	if ((test_subnet_ire == NULL) && (test_net_ire == NULL) &&
20119 	    (test_allzero_ire == NULL) && (test_allone_ire == NULL)) {
20120 		return;
20121 	}
20122 
20123 	/*
20124 	 * We have established which bcast ires have to be replaced.
20125 	 * Next we try to locate ipifs that match those ires.
20126 	 * The rules are simple: If we find an ipif that matches on the subnet
20127 	 * address it will also match on the net address, the allzeros and
20128 	 * allones address. Any ipif that matches only on the net address will
20129 	 * also match the allzeros and allones addresses.
20130 	 * The other criterion is the ipif_flags. We look for non-deprecated
20131 	 * (and non-anycast and non-nolocal) ipifs as the best choice.
20132 	 * ipifs with check_flags matching (deprecated, etc) are used only
20133 	 * if good ipifs are not available. While looping, we save existing
20134 	 * deprecated ipifs as backup_ipif.
20135 	 * We loop through all the ipifs for this ill looking for ipifs
20136 	 * whose broadcast addr match the ipif passed in, but do not have
20137 	 * their own broadcast ires. For creating 0.0.0.0 and
20138 	 * 255.255.255.255 we just need any ipif on this ill.
20139 	 */
20140 	for (ipif = test_ipif->ipif_ill->ill_ipif; ipif != NULL;
20141 	    ipif = ipif->ipif_next) {
20142 
20143 		ASSERT(!ipif->ipif_isv6);
20144 		/*
20145 		 * Already checked the ipif passed in.
20146 		 */
20147 		if (ipif == test_ipif) {
20148 			continue;
20149 		}
20150 
20151 		/*
20152 		 * We only need to recreate broadcast ires if another ipif in
20153 		 * the same zone uses them. The new ires must be created in the
20154 		 * same zone.
20155 		 */
20156 		if (ipif->ipif_zoneid != test_ipif->ipif_zoneid) {
20157 			continue;
20158 		}
20159 
20160 		/*
20161 		 * Only interested in logical interfaces with valid local
20162 		 * addresses or with the ability to broadcast.
20163 		 */
20164 		if ((ipif->ipif_subnet == 0) ||
20165 		    !(ipif->ipif_flags & IPIF_BROADCAST) ||
20166 		    (ipif->ipif_flags & IPIF_NOXMIT) ||
20167 		    !(ipif->ipif_flags & IPIF_UP)) {
20168 			continue;
20169 		}
20170 		/*
20171 		 * Check if there is a net broadcast ire for this
20172 		 * net address.  If it turns out that the ipif we are
20173 		 * about to take down owns this ire, we must make a
20174 		 * new one because it is potentially going away.
20175 		 */
20176 		if (test_net_ire && (!net_bcast_ire_created)) {
20177 			net_mask = ip_net_mask(ipif->ipif_subnet);
20178 			net_addr = net_mask & ipif->ipif_subnet;
20179 			if (net_addr == test_net_addr) {
20180 				need_net_bcast_ire = B_TRUE;
20181 				/*
20182 				 * Use DEPRECATED ipif only if no good
20183 				 * ires are available. subnet_addr is
20184 				 * a better match than net_addr.
20185 				 */
20186 				if ((ipif->ipif_flags & check_flags) &&
20187 				    (backup_ipif_net == NULL)) {
20188 					backup_ipif_net = ipif;
20189 				}
20190 			}
20191 		}
20192 		/*
20193 		 * Check if there is a subnet broadcast ire for this
20194 		 * net address.  If it turns out that the ipif we are
20195 		 * about to take down owns this ire, we must make a
20196 		 * new one because it is potentially going away.
20197 		 */
20198 		if (test_subnet_ire && (!subnet_bcast_ire_created)) {
20199 			subnet_mask = ipif->ipif_net_mask;
20200 			subnet_addr = ipif->ipif_subnet;
20201 			if (subnet_addr == test_subnet_addr) {
20202 				need_subnet_bcast_ire = B_TRUE;
20203 				if ((ipif->ipif_flags & check_flags) &&
20204 				    (backup_ipif_subnet == NULL)) {
20205 					backup_ipif_subnet = ipif;
20206 				}
20207 			}
20208 		}
20209 
20210 
20211 		/* Short circuit here if this ipif is deprecated */
20212 		if (ipif->ipif_flags & check_flags) {
20213 			if ((test_allzero_ire != NULL) &&
20214 			    (!allzero_bcast_ire_created) &&
20215 			    (backup_ipif_allzeros == NULL)) {
20216 				backup_ipif_allzeros = ipif;
20217 			}
20218 			if ((test_allone_ire != NULL) &&
20219 			    (!allone_bcast_ire_created) &&
20220 			    (backup_ipif_allones == NULL)) {
20221 				backup_ipif_allones = ipif;
20222 			}
20223 			continue;
20224 		}
20225 
20226 		/*
20227 		 * Found an ipif which has the same broadcast ire as the
20228 		 * ipif passed in and the ipif passed in "owns" the ire.
20229 		 * Create new broadcast ire's for this broadcast addr.
20230 		 */
20231 		if (need_net_bcast_ire && !net_bcast_ire_created) {
20232 			irep = ire_create_bcast(ipif, net_addr, irep);
20233 			irep = ire_create_bcast(ipif,
20234 			    ~net_mask | net_addr, irep);
20235 			net_bcast_ire_created = B_TRUE;
20236 		}
20237 		if (need_subnet_bcast_ire && !subnet_bcast_ire_created) {
20238 			irep = ire_create_bcast(ipif, subnet_addr, irep);
20239 			irep = ire_create_bcast(ipif,
20240 			    ~subnet_mask | subnet_addr, irep);
20241 			subnet_bcast_ire_created = B_TRUE;
20242 		}
20243 		if (test_allzero_ire != NULL && !allzero_bcast_ire_created) {
20244 			irep = ire_create_bcast(ipif, 0, irep);
20245 			allzero_bcast_ire_created = B_TRUE;
20246 		}
20247 		if (test_allone_ire != NULL && !allone_bcast_ire_created) {
20248 			irep = ire_create_bcast(ipif, INADDR_BROADCAST, irep);
20249 			allone_bcast_ire_created = B_TRUE;
20250 		}
20251 		/*
20252 		 * Once we have created all the appropriate ires, we
20253 		 * just break out of this loop to add what we have created.
		 * This has been indented similarly to ire_match_args for
20255 		 * readability.
20256 		 */
20257 		if (((test_net_ire == NULL) ||
20258 			(net_bcast_ire_created)) &&
20259 		    ((test_subnet_ire == NULL) ||
20260 			(subnet_bcast_ire_created)) &&
20261 		    ((test_allzero_ire == NULL) ||
20262 			(allzero_bcast_ire_created)) &&
20263 		    ((test_allone_ire == NULL) ||
20264 			(allone_bcast_ire_created))) {
20265 			break;
20266 		}
20267 	}
20268 
20269 	/*
20270 	 * Create bcast ires on deprecated ipifs if no non-deprecated ipifs
20271 	 * exist. 6 pairs of bcast ires are needed.
20272 	 * Note - the old ires are deleted in ipif_down.
20273 	 */
20274 	if (need_net_bcast_ire && !net_bcast_ire_created && backup_ipif_net) {
20275 		ipif = backup_ipif_net;
20276 		irep = ire_create_bcast(ipif, net_addr, irep);
20277 		irep = ire_create_bcast(ipif, ~net_mask | net_addr, irep);
20278 		net_bcast_ire_created = B_TRUE;
20279 	}
20280 	if (need_subnet_bcast_ire && !subnet_bcast_ire_created &&
20281 	    backup_ipif_subnet) {
20282 		ipif = backup_ipif_subnet;
20283 		irep = ire_create_bcast(ipif, subnet_addr, irep);
20284 		irep = ire_create_bcast(ipif,
20285 		    ~subnet_mask | subnet_addr, irep);
20286 		subnet_bcast_ire_created = B_TRUE;
20287 	}
20288 	if (test_allzero_ire != NULL && !allzero_bcast_ire_created &&
20289 	    backup_ipif_allzeros) {
20290 		irep = ire_create_bcast(backup_ipif_allzeros, 0, irep);
20291 		allzero_bcast_ire_created = B_TRUE;
20292 	}
20293 	if (test_allone_ire != NULL && !allone_bcast_ire_created &&
20294 	    backup_ipif_allones) {
20295 		irep = ire_create_bcast(backup_ipif_allones,
20296 		    INADDR_BROADCAST, irep);
20297 		allone_bcast_ire_created = B_TRUE;
20298 	}
20299 
20300 	/*
20301 	 * If we can't create all of them, don't add any of them.
20302 	 * Code in ip_wput_ire and ire_to_ill assumes that we
20303 	 * always have a non-loopback copy and loopback copy
20304 	 * for a given address.
20305 	 */
20306 	for (irep1 = irep; irep1 > ire_array; ) {
20307 		irep1--;
20308 		if (*irep1 == NULL) {
20309 			ip0dbg(("ipif_check_bcast_ires: can't create "
20310 			    "IRE_BROADCAST, memory allocation failure\n"));
20311 			while (irep > ire_array) {
20312 				irep--;
20313 				if (*irep != NULL)
20314 					ire_delete(*irep);
20315 			}
20316 			goto bad;
20317 		}
20318 	}
20319 	for (irep1 = irep; irep1 > ire_array; ) {
20320 		int error;
20321 
20322 		irep1--;
20323 		error = ire_add(irep1, NULL, NULL, NULL);
20324 		if (error == 0) {
20325 			ire_refrele(*irep1);		/* Held in ire_add */
20326 		}
20327 	}
20328 bad:
20329 	if (test_allzero_ire != NULL)
20330 		ire_refrele(test_allzero_ire);
20331 	if (test_allone_ire != NULL)
20332 		ire_refrele(test_allone_ire);
20333 	if (test_net_ire != NULL)
20334 		ire_refrele(test_net_ire);
20335 	if (test_subnet_ire != NULL)
20336 		ire_refrele(test_subnet_ire);
20337 }
20338 
20339 /*
 * Extract both the flags (including IFF_CANTCHANGE flags such as IFF_IPV*)
 * from lifr_flags and the name from lifr_name.
20342  * Set IFF_IPV* and ill_isv6 prior to doing the lookup
20343  * since ipif_lookup_on_name uses the _isv6 flags when matching.
20344  * Returns EINPROGRESS when mp has been consumed by queueing it on
20345  * ill_pending_mp and the ioctl will complete in ip_rput.
20346  */
20347 /* ARGSUSED */
20348 int
20349 ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
20350     ip_ioctl_cmd_t *ipip, void *if_req)
20351 {
20352 	int	err;
20353 	ill_t	*ill;
20354 	struct lifreq *lifr = (struct lifreq *)if_req;
20355 
20356 	ASSERT(ipif != NULL);
20357 	ip1dbg(("ip_sioctl_slifname %s\n", lifr->lifr_name));
20358 	ASSERT(q->q_next != NULL);
20359 
20360 	ill = (ill_t *)q->q_ptr;
20361 	/*
20362 	 * If we are not writer on 'q' then this interface exists already
20363 	 * and previous lookups (ipif_extract_lifreq_cmn) found this ipif.
20364 	 * So return EALREADY
20365 	 */
20366 	if (ill != ipif->ipif_ill)
20367 		return (EALREADY);
20368 
20369 	if (ill->ill_name[0] != '\0')
20370 		return (EALREADY);
20371 
20372 	/*
20373 	 * Set all the flags. Allows all kinds of override. Provide some
20374 	 * sanity checking by not allowing IFF_BROADCAST and IFF_MULTICAST
20375 	 * unless there is either multicast/broadcast support in the driver
20376 	 * or it is a pt-pt link.
20377 	 */
20378 	if (lifr->lifr_flags & (IFF_PROMISC|IFF_ALLMULTI)) {
20379 		/* Meaningless to IP thus don't allow them to be set. */
20380 		ip1dbg(("ip_setname: EINVAL 1\n"));
20381 		return (EINVAL);
20382 	}
20383 	/*
20384 	 * For a DL_STYLE2 driver (ill_needs_attach), we would not have the
20385 	 * ill_bcast_addr_length info.
20386 	 */
20387 	if (!ill->ill_needs_attach &&
20388 	    ((lifr->lifr_flags & IFF_MULTICAST) &&
20389 	    !(lifr->lifr_flags & IFF_POINTOPOINT) &&
20390 	    ill->ill_bcast_addr_length == 0)) {
20391 		/* Link not broadcast/pt-pt capable i.e. no multicast */
20392 		ip1dbg(("ip_setname: EINVAL 2\n"));
20393 		return (EINVAL);
20394 	}
20395 	if ((lifr->lifr_flags & IFF_BROADCAST) &&
20396 	    ((lifr->lifr_flags & IFF_IPV6) ||
20397 	    (!ill->ill_needs_attach && ill->ill_bcast_addr_length == 0))) {
20398 		/* Link not broadcast capable or IPv6 i.e. no broadcast */
20399 		ip1dbg(("ip_setname: EINVAL 3\n"));
20400 		return (EINVAL);
20401 	}
20402 	if (lifr->lifr_flags & IFF_UP) {
20403 		/* Can only be set with SIOCSLIFFLAGS */
20404 		ip1dbg(("ip_setname: EINVAL 4\n"));
20405 		return (EINVAL);
20406 	}
20407 	if ((lifr->lifr_flags & (IFF_IPV6|IFF_IPV4)) != IFF_IPV6 &&
20408 	    (lifr->lifr_flags & (IFF_IPV6|IFF_IPV4)) != IFF_IPV4) {
20409 		ip1dbg(("ip_setname: EINVAL 5\n"));
20410 		return (EINVAL);
20411 	}
20412 	/*
20413 	 * Only allow the IFF_XRESOLV flag to be set on IPv6 interfaces.
20414 	 */
20415 	if ((lifr->lifr_flags & IFF_XRESOLV) &&
20416 	    !(lifr->lifr_flags & IFF_IPV6) &&
20417 	    !(ipif->ipif_isv6)) {
20418 		ip1dbg(("ip_setname: EINVAL 6\n"));
20419 		return (EINVAL);
20420 	}
20421 
20422 	/*
	 * The user has done SIOCGLIFFLAGS prior to this ioctl and hence
	 * we have all the flags here.  So we assign rather than OR.
	 * We can't OR the flags here because we don't want to set
	 * both IFF_IPV4 and IFF_IPV6.  We start off as IFF_IPV4 in
	 * ipif_allocate and become IFF_IPV4 or IFF_IPV6 here depending
	 * on the lifr_flags value.
20429 	 */
20430 	/*
20431 	 * This ill has not been inserted into the global list.
20432 	 * So we are still single threaded and don't need any lock
20433 	 */
20434 	ipif->ipif_flags = lifr->lifr_flags & IFF_LOGINT_FLAGS;
20435 	ill->ill_flags = lifr->lifr_flags & IFF_PHYINTINST_FLAGS;
20436 	ill->ill_phyint->phyint_flags = lifr->lifr_flags & IFF_PHYINT_FLAGS;
20437 
20438 	/* We started off as V4. */
20439 	if (ill->ill_flags & ILLF_IPV6) {
20440 		ill->ill_phyint->phyint_illv6 = ill;
20441 		ill->ill_phyint->phyint_illv4 = NULL;
20442 	}
20443 	err = ipif_set_values(q, mp, lifr->lifr_name, &lifr->lifr_ppa);
20444 	return (err);
20445 }
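
/*
 * Illustrative sketch (not part of the original file): roughly how the
 * SIOCSLIFNAME request handled above is issued from userland.  ifconfig
 * sends it as an I_STR ioctl down the IP stream during plumbing; the
 * DLPI open and I_PLINK steps that precede it are omitted here, and
 * "ip_fd" and the interface name are hypothetical.
 *
 *	struct lifreq lifr;
 *	struct strioctl ioc;
 *
 *	(void) memset(&lifr, 0, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "hme0", sizeof (lifr.lifr_name));
 *	lifr.lifr_ppa = 0;		(or UINT_MAX to let IP choose)
 *	lifr.lifr_flags = IFF_IPV4;	(exactly one of IFF_IPV4/IFF_IPV6)
 *
 *	ioc.ic_cmd = SIOCSLIFNAME;
 *	ioc.ic_timout = 0;
 *	ioc.ic_len = sizeof (lifr);
 *	ioc.ic_dp = (char *)&lifr;
 *	if (ioctl(ip_fd, I_STR, &ioc) == -1)
 *		perror("SIOCSLIFNAME");
 */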
20446 
20447 /* ARGSUSED */
20448 int
20449 ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
20450     ip_ioctl_cmd_t *ipip, void *if_req)
20451 {
20452 	/*
	 * ill_phyint_reinit merged the v4 and v6 ills into a single
	 * ipsq.  The ill could also have become part of an IPMP group
	 * in the process, and we might not have been able to complete
	 * the slifname in ipif_set_values if we could not become
	 * exclusive.  If so, restart it here.
20458 	 */
20459 	return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q));
20460 }
20461 
20462 /*
20463  * Return a pointer to the ipif which matches the index, IP version type and
20464  * zoneid.
20465  */
20466 ipif_t *
20467 ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid,
20468     queue_t *q, mblk_t *mp, ipsq_func_t func, int *err)
20469 {
20470 	ill_t	*ill;
20471 	ipsq_t  *ipsq;
20472 	phyint_t *phyi;
20473 	ipif_t	*ipif;
20474 
20475 	ASSERT((q == NULL && mp == NULL && func == NULL && err == NULL) ||
20476 	    (q != NULL && mp != NULL && func != NULL && err != NULL));
20477 
20478 	if (err != NULL)
20479 		*err = 0;
20480 
20481 	/*
20482 	 * Indexes are stored in the phyint - a common structure
20483 	 * to both IPv4 and IPv6.
20484 	 */
20485 
20486 	rw_enter(&ill_g_lock, RW_READER);
20487 	phyi = avl_find(&phyint_g_list.phyint_list_avl_by_index,
20488 	    (void *) &index, NULL);
20489 	if (phyi != NULL) {
20490 		ill = isv6 ? phyi->phyint_illv6 : phyi->phyint_illv4;
20491 		if (ill == NULL) {
20492 			rw_exit(&ill_g_lock);
20493 			if (err != NULL)
20494 				*err = ENXIO;
20495 			return (NULL);
20496 		}
20497 		GRAB_CONN_LOCK(q);
20498 		mutex_enter(&ill->ill_lock);
20499 		if (ILL_CAN_LOOKUP(ill)) {
20500 			for (ipif = ill->ill_ipif; ipif != NULL;
20501 			    ipif = ipif->ipif_next) {
20502 				if (IPIF_CAN_LOOKUP(ipif) &&
20503 				    (zoneid == ALL_ZONES ||
20504 				    zoneid == ipif->ipif_zoneid)) {
20505 					ipif_refhold_locked(ipif);
20506 					mutex_exit(&ill->ill_lock);
20507 					RELEASE_CONN_LOCK(q);
20508 					rw_exit(&ill_g_lock);
20509 					return (ipif);
20510 				}
20511 			}
20512 		} else if (ILL_CAN_WAIT(ill, q)) {
20513 			ipsq = ill->ill_phyint->phyint_ipsq;
20514 			mutex_enter(&ipsq->ipsq_lock);
20515 			rw_exit(&ill_g_lock);
20516 			mutex_exit(&ill->ill_lock);
20517 			ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
20518 			mutex_exit(&ipsq->ipsq_lock);
20519 			RELEASE_CONN_LOCK(q);
20520 			*err = EINPROGRESS;
20521 			return (NULL);
20522 		}
20523 		mutex_exit(&ill->ill_lock);
20524 		RELEASE_CONN_LOCK(q);
20525 	}
20526 	rw_exit(&ill_g_lock);
20527 	if (err != NULL)
20528 		*err = ENXIO;
20529 	return (NULL);
20530 }
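
/*
 * Usage note (added for clarity): callers pass either all of q, mp,
 * func and err, or none of them, per the ASSERT at the top of the
 * function above.  With the NULL form the lookup can never return
 * EINPROGRESS, so a NULL result simply means "no such ipif".  A
 * minimal sketch, with hypothetical ifindex/zoneid values:
 *
 *	ipif_t *ipif;
 *
 *	ipif = ipif_lookup_on_ifindex(ifindex, B_FALSE, zoneid,
 *	    NULL, NULL, NULL, NULL);
 *	if (ipif != NULL) {
 *		... use ipif ...
 *		ipif_refrele(ipif);	(drop the hold from the lookup)
 *	}
 */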
20531 
20532 typedef struct conn_change_s {
20533 	uint_t cc_old_ifindex;
20534 	uint_t cc_new_ifindex;
20535 } conn_change_t;
20536 
20537 /*
20538  * ipcl_walk function for changing interface index.
20539  */
20540 static void
20541 conn_change_ifindex(conn_t *connp, caddr_t arg)
20542 {
20543 	conn_change_t *connc;
20544 	uint_t old_ifindex;
20545 	uint_t new_ifindex;
20546 	int i;
20547 	ilg_t *ilg;
20548 
20549 	connc = (conn_change_t *)arg;
20550 	old_ifindex = connc->cc_old_ifindex;
20551 	new_ifindex = connc->cc_new_ifindex;
20552 
20553 	if (connp->conn_orig_bound_ifindex == old_ifindex)
20554 		connp->conn_orig_bound_ifindex = new_ifindex;
20555 
20556 	if (connp->conn_orig_multicast_ifindex == old_ifindex)
20557 		connp->conn_orig_multicast_ifindex = new_ifindex;
20558 
20559 	if (connp->conn_orig_xmit_ifindex == old_ifindex)
20560 		connp->conn_orig_xmit_ifindex = new_ifindex;
20561 
20562 	for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) {
20563 		ilg = &connp->conn_ilg[i];
20564 		if (ilg->ilg_orig_ifindex == old_ifindex)
20565 			ilg->ilg_orig_ifindex = new_ifindex;
20566 	}
20567 }
20568 
20569 /*
20570  * Walk all the ipifs and ilms on this ill and change the orig_ifindex
20571  * to new_index if it matches the old_index.
20572  *
20573  * Failovers typically happen within a group of ills. But somebody
 * Failovers typically happen within a group of ills.  But somebody
 * can remove an ill from the group after a failover has happened.  If
 * we are setting the ifindex after this, we potentially need to
 * look at all the ills rather than just the ones in the group.
 * We cut down the work by only looking at ills with matching
 * ill_net_types and ill_types, since ills that differ in these
 * could never have been grouped together.
20580 static void
20581 ip_change_ifindex(ill_t *ill_orig, conn_change_t *connc)
20582 {
20583 	ill_t *ill;
20584 	ipif_t *ipif;
20585 	uint_t old_ifindex;
20586 	uint_t new_ifindex;
20587 	ilm_t *ilm;
20588 	ill_walk_context_t ctx;
20589 
20590 	old_ifindex = connc->cc_old_ifindex;
20591 	new_ifindex = connc->cc_new_ifindex;
20592 
20593 	rw_enter(&ill_g_lock, RW_READER);
20594 	ill = ILL_START_WALK_ALL(&ctx);
20595 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
20596 		if ((ill_orig->ill_net_type != ill->ill_net_type) ||
20597 			(ill_orig->ill_type != ill->ill_type)) {
20598 			continue;
20599 		}
20600 		for (ipif = ill->ill_ipif; ipif != NULL;
20601 				ipif = ipif->ipif_next) {
20602 			if (ipif->ipif_orig_ifindex == old_ifindex)
20603 				ipif->ipif_orig_ifindex = new_ifindex;
20604 		}
20605 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
20606 			if (ilm->ilm_orig_ifindex == old_ifindex)
20607 				ilm->ilm_orig_ifindex = new_ifindex;
20608 		}
20609 	}
20610 	rw_exit(&ill_g_lock);
20611 }
20612 
20613 /*
20614  * We first need to ensure that the new index is unique, and
20615  * then carry the change across both v4 and v6 ill representation
20616  * of the physical interface.
20617  */
20618 /* ARGSUSED */
20619 int
20620 ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
20621     ip_ioctl_cmd_t *ipip, void *ifreq)
20622 {
20623 	ill_t		*ill;
20624 	ill_t		*ill_other;
20625 	phyint_t	*phyi;
20626 	int		old_index;
20627 	conn_change_t	connc;
20628 	struct ifreq	*ifr = (struct ifreq *)ifreq;
20629 	struct lifreq	*lifr = (struct lifreq *)ifreq;
20630 	uint_t	index;
20631 	ill_t	*ill_v4;
20632 	ill_t	*ill_v6;
20633 
20634 	if (ipip->ipi_cmd_type == IF_CMD)
20635 		index = ifr->ifr_index;
20636 	else
20637 		index = lifr->lifr_index;
20638 
20639 	/*
20640 	 * Only allow on physical interface. Also, index zero is illegal.
20641 	 *
20642 	 * Need to check for PHYI_FAILED and PHYI_INACTIVE
20643 	 *
20644 	 * 1) If PHYI_FAILED is set, a failover could have happened which
20645 	 *    implies a possible failback might have to happen. As failback
20646 	 *    depends on the old index, we should fail setting the index.
20647 	 *
20648 	 * 2) If PHYI_INACTIVE is set, in.mpathd does a failover so that
20649 	 *    any addresses or multicast memberships are failed over to
20650 	 *    a non-STANDBY interface. As failback depends on the old
20651 	 *    index, we should fail setting the index for this case also.
20652 	 *
20653 	 * 3) If PHYI_OFFLINE is set, a possible failover has happened.
20654 	 *    Be consistent with PHYI_FAILED and fail the ioctl.
20655 	 */
20656 	ill = ipif->ipif_ill;
20657 	phyi = ill->ill_phyint;
20658 	if ((phyi->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) ||
20659 	    ipif->ipif_id != 0 || index == 0) {
20660 		return (EINVAL);
20661 	}
20662 	old_index = phyi->phyint_ifindex;
20663 
20664 	/* If the index is not changing, no work to do */
20665 	if (old_index == index)
20666 		return (0);
20667 
20668 	/*
20669 	 * Use ill_lookup_on_ifindex to determine if the
20670 	 * new index is unused and if so allow the change.
20671 	 */
20672 	ill_v6 = ill_lookup_on_ifindex(index, B_TRUE, NULL, NULL, NULL, NULL);
20673 	ill_v4 = ill_lookup_on_ifindex(index, B_FALSE, NULL, NULL, NULL, NULL);
20674 	if (ill_v6 != NULL || ill_v4 != NULL) {
20675 		if (ill_v4 != NULL)
20676 			ill_refrele(ill_v4);
20677 		if (ill_v6 != NULL)
20678 			ill_refrele(ill_v6);
20679 		return (EBUSY);
20680 	}
20681 
20682 	/*
20683 	 * The new index is unused. Set it in the phyint.
20684 	 * Locate the other ill so that we can send a routing
20685 	 * sockets message.
20686 	 */
20687 	if (ill->ill_isv6) {
20688 		ill_other = phyi->phyint_illv4;
20689 	} else {
20690 		ill_other = phyi->phyint_illv6;
20691 	}
20692 
20693 	phyi->phyint_ifindex = index;
20694 
20695 	connc.cc_old_ifindex = old_index;
20696 	connc.cc_new_ifindex = index;
20697 	ip_change_ifindex(ill, &connc);
20698 	ipcl_walk(conn_change_ifindex, (caddr_t)&connc);
20699 
20700 	/* Send the routing sockets message */
20701 	ip_rts_ifmsg(ipif);
20702 	if (ill_other != NULL)
20703 		ip_rts_ifmsg(ill_other->ill_ipif);
20704 
20705 	return (0);
20706 }
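
/*
 * Illustrative userland sketch (not part of the original file): setting
 * the interface index with SIOCSLIFINDEX on an ordinary UDP socket.
 * The interface name and index value are hypothetical.
 *
 *	struct lifreq lifr;
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	(void) memset(&lifr, 0, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "hme0", sizeof (lifr.lifr_name));
 *	lifr.lifr_index = 5;	(must be non-zero; EBUSY if already in use)
 *	if (ioctl(s, SIOCSLIFINDEX, &lifr) == -1)
 *		perror("SIOCSLIFINDEX");
 */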
20707 
20708 /* ARGSUSED */
20709 int
20710 ip_sioctl_get_lifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
20711     ip_ioctl_cmd_t *ipip, void *ifreq)
20712 {
20713 	struct ifreq	*ifr = (struct ifreq *)ifreq;
20714 	struct lifreq	*lifr = (struct lifreq *)ifreq;
20715 
20716 	ip1dbg(("ip_sioctl_get_lifindex(%s:%u %p)\n",
20717 		ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
20718 	/* Get the interface index */
20719 	if (ipip->ipi_cmd_type == IF_CMD) {
20720 		ifr->ifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex;
20721 	} else {
20722 		lifr->lifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex;
20723 	}
20724 	return (0);
20725 }
20726 
20727 /* ARGSUSED */
20728 int
20729 ip_sioctl_get_lifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
20730     ip_ioctl_cmd_t *ipip, void *ifreq)
20731 {
20732 	struct lifreq	*lifr = (struct lifreq *)ifreq;
20733 
20734 	ip1dbg(("ip_sioctl_get_lifzone(%s:%u %p)\n",
20735 		ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
20736 	/* Get the interface zone */
20737 	ASSERT(ipip->ipi_cmd_type == LIF_CMD);
20738 	lifr->lifr_zoneid = ipif->ipif_zoneid;
20739 	return (0);
20740 }
20741 
20742 /*
20743  * Set the zoneid of an interface.
20744  */
20745 /* ARGSUSED */
20746 int
20747 ip_sioctl_slifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
20748     ip_ioctl_cmd_t *ipip, void *ifreq)
20749 {
20750 	struct lifreq	*lifr = (struct lifreq *)ifreq;
20751 	int err = 0;
20752 	boolean_t need_up = B_FALSE;
20753 	zone_t *zptr;
20754 	zone_status_t status;
20755 	zoneid_t zoneid;
20756 
20757 	/* cannot assign instance zero to a non-global zone */
20758 	if (ipif->ipif_id == 0)
20759 		return (ENOTSUP);
20760 
20761 	ASSERT(ipip->ipi_cmd_type == LIF_CMD);
20762 	zoneid = lifr->lifr_zoneid;
20763 
20764 	/*
20765 	 * Cannot assign to a zone that doesn't exist or is shutting down.  In
20766 	 * the event of a race with the zone shutdown processing, since IP
20767 	 * serializes this ioctl and SIOCGLIFCONF/SIOCLIFREMOVEIF, we know the
20768 	 * interface will be cleaned up even if the zone is shut down
20769 	 * immediately after the status check. If the interface can't be brought
20770 	 * down right away, and the zone is shut down before the restart
20771 	 * function is called, we resolve the possible races by rechecking the
20772 	 * zone status in the restart function.
20773 	 */
20774 	if ((zptr = zone_find_by_id(zoneid)) == NULL)
20775 		return (EINVAL);
20776 	status = zone_status_get(zptr);
20777 	zone_rele(zptr);
20778 
20779 	if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING)
20780 		return (EINVAL);
20781 
20782 	if (ipif->ipif_flags & IPIF_UP) {
20783 		/*
20784 		 * If the interface is already marked up,
20785 		 * we call ipif_down which will take care
20786 		 * of ditching any IREs that have been set
20787 		 * up based on the old interface address.
20788 		 */
20789 		err = ipif_logical_down(ipif, q, mp);
20790 		if (err == EINPROGRESS)
20791 			return (err);
20792 		ipif_down_tail(ipif);
20793 		need_up = B_TRUE;
20794 	}
20795 
20796 	err = ip_sioctl_slifzone_tail(ipif, zoneid, q, mp, need_up);
20797 	return (err);
20798 }
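
/*
 * Illustrative userland sketch (not part of the original file): this is
 * roughly the request that "ifconfig hme0:1 zone myzone" generates.
 * The interface and zone names are hypothetical.
 *
 *	struct lifreq lifr;
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	(void) memset(&lifr, 0, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "hme0:1", sizeof (lifr.lifr_name));
 *	lifr.lifr_zoneid = getzoneidbyname("myzone");
 *	if (ioctl(s, SIOCSLIFZONE, &lifr) == -1)
 *		perror("SIOCSLIFZONE");
 *
 * ENOTSUP is returned for logical unit 0 (ipif_id == 0); EINVAL if the
 * zone is not ready or running.
 */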
20799 
20800 static int
20801 ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
20802     queue_t *q, mblk_t *mp, boolean_t need_up)
20803 {
20804 	int	err = 0;
20805 
20806 	ip1dbg(("ip_sioctl_zoneid_tail(%s:%u %p)\n",
20807 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
20808 
20809 	/* Set the new zone id. */
20810 	ipif->ipif_zoneid = zoneid;
20811 
20812 	/* Update sctp list */
20813 	sctp_update_ipif(ipif, SCTP_IPIF_UPDATE);
20814 
20815 	if (need_up) {
20816 		/*
20817 		 * Now bring the interface back up.  If this
20818 		 * is the only IPIF for the ILL, ipif_up
20819 		 * will have to re-bind to the device, so
20820 		 * we may get back EINPROGRESS, in which
20821 		 * case, this IOCTL will get completed in
20822 		 * ip_rput_dlpi when we see the DL_BIND_ACK.
20823 		 */
20824 		err = ipif_up(ipif, q, mp);
20825 	}
20826 	return (err);
20827 }
20828 
20829 /* ARGSUSED */
20830 int
20831 ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
20832     ip_ioctl_cmd_t *ipip, void *if_req)
20833 {
20834 	struct lifreq *lifr = (struct lifreq *)if_req;
20835 	zoneid_t zoneid;
20836 	zone_t *zptr;
20837 	zone_status_t status;
20838 
20839 	ASSERT(ipif->ipif_id != 0);
20840 	ASSERT(ipip->ipi_cmd_type == LIF_CMD);
20841 	zoneid = lifr->lifr_zoneid;
20842 
20843 	ip1dbg(("ip_sioctl_slifzone_restart(%s:%u %p)\n",
20844 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
20845 
20846 	/*
20847 	 * We recheck the zone status to resolve the following race condition:
20848 	 * 1) process sends SIOCSLIFZONE to put hme0:1 in zone "myzone";
20849 	 * 2) hme0:1 is up and can't be brought down right away;
20850 	 * ip_sioctl_slifzone() returns EINPROGRESS and the request is queued;
20851 	 * 3) zone "myzone" is halted; the zone status switches to
20852 	 * 'shutting_down' and the zones framework sends SIOCGLIFCONF to list
20853 	 * the interfaces to remove - hme0:1 is not returned because it's not
20854 	 * yet in "myzone", so it won't be removed;
20855 	 * 4) the restart function for SIOCSLIFZONE is called; without the
20856 	 * status check here, we would have hme0:1 in "myzone" after it's been
20857 	 * destroyed.
20858 	 * Note that if the status check fails, we need to bring the interface
20859 	 * back to its state prior to ip_sioctl_slifzone(), hence the call to
20860 	 * ipif_up_done[_v6]().
20861 	 */
20862 	status = ZONE_IS_UNINITIALIZED;
20863 	if ((zptr = zone_find_by_id(zoneid)) != NULL) {
20864 		status = zone_status_get(zptr);
20865 		zone_rele(zptr);
20866 	}
20867 	if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) {
20868 		if (ipif->ipif_isv6) {
20869 			(void) ipif_up_done_v6(ipif);
20870 		} else {
20871 			(void) ipif_up_done(ipif);
20872 		}
20873 		return (EINVAL);
20874 	}
20875 
20876 	ipif_down_tail(ipif);
20877 
20878 	return (ip_sioctl_slifzone_tail(ipif, zoneid, q, mp, B_TRUE));
20879 }
20880 
20881 /* ARGSUSED */
20882 int
20883 ip_sioctl_get_lifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
20884 	ip_ioctl_cmd_t *ipip, void *ifreq)
20885 {
20886 	struct lifreq	*lifr = ifreq;
20887 
20888 	ASSERT(q->q_next == NULL);
20889 	ASSERT(CONN_Q(q));
20890 
20891 	ip1dbg(("ip_sioctl_get_lifusesrc(%s:%u %p)\n",
20892 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
20893 	lifr->lifr_index = ipif->ipif_ill->ill_usesrc_ifindex;
20894 	ip1dbg(("ip_sioctl_get_lifusesrc:lifr_index = %d\n", lifr->lifr_index));
20895 
20896 	return (0);
20897 }
20898 
20899 
20900 /* Find the previous ILL in this usesrc group */
20901 static ill_t *
20902 ill_prev_usesrc(ill_t *uill)
20903 {
20904 	ill_t *ill;
20905 
20906 	for (ill = uill->ill_usesrc_grp_next;
20907 	    ASSERT(ill), ill->ill_usesrc_grp_next != uill;
20908 	    ill = ill->ill_usesrc_grp_next)
20909 		/* do nothing */;
20910 	return (ill);
20911 }
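
/*
 * Note (added for clarity): ill_usesrc_grp_next links form a single
 * circular list per usesrc group.  The head (the ill whose source
 * addresses are used) has ill_usesrc_ifindex == 0, while each client
 * carries the head's ifindex, e.g.:
 *
 *	uill (head) -> cli_ill_1 -> cli_ill_2 -> uill
 *
 * so ill_prev_usesrc() just walks forward until the next pointer wraps
 * back around to its argument.
 */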
20912 
20913 /*
20914  * Release all members of the usesrc group. This routine is called
20915  * from ill_delete when the interface being unplumbed is the
20916  * group head.
20917  */
20918 static void
20919 ill_disband_usesrc_group(ill_t *uill)
20920 {
20921 	ill_t *next_ill, *tmp_ill;
20922 	ASSERT(RW_WRITE_HELD(&ill_g_usesrc_lock));
20923 	next_ill = uill->ill_usesrc_grp_next;
20924 
20925 	do {
20926 		ASSERT(next_ill != NULL);
20927 		tmp_ill = next_ill->ill_usesrc_grp_next;
20928 		ASSERT(tmp_ill != NULL);
20929 		next_ill->ill_usesrc_grp_next = NULL;
20930 		next_ill->ill_usesrc_ifindex = 0;
20931 		next_ill = tmp_ill;
20932 	} while (next_ill->ill_usesrc_ifindex != 0);
20933 	uill->ill_usesrc_grp_next = NULL;
20934 }
20935 
20936 /*
20937  * Remove the client usesrc ILL from the list and relink to a new list
20938  */
20939 int
20940 ill_relink_usesrc_ills(ill_t *ucill, ill_t *uill, uint_t ifindex)
20941 {
20942 	ill_t *ill, *tmp_ill;
20943 
20944 	ASSERT((ucill != NULL) && (ucill->ill_usesrc_grp_next != NULL) &&
20945 	    (uill != NULL) && RW_WRITE_HELD(&ill_g_usesrc_lock));
20946 
20947 	/*
	 * Fail if the usesrc client ILL passed in is not currently
	 * linked as a usesrc client (i.e. its ill_usesrc_ifindex is 0),
	 * or if the usesrc ILL passed in is itself already in use as a
	 * usesrc client ILL.
20952 	 */
20953 	if ((ucill->ill_usesrc_ifindex == 0) ||
20954 	    (uill->ill_usesrc_ifindex != 0)) {
20955 		return (-1);
20956 	}
20957 
20958 	ill = ill_prev_usesrc(ucill);
20959 	ASSERT(ill->ill_usesrc_grp_next != NULL);
20960 
20961 	/* Remove from the current list */
20962 	if (ill->ill_usesrc_grp_next->ill_usesrc_grp_next == ill) {
20963 		/* Only two elements in the list */
20964 		ASSERT(ill->ill_usesrc_ifindex == 0);
20965 		ill->ill_usesrc_grp_next = NULL;
20966 	} else {
20967 		ill->ill_usesrc_grp_next = ucill->ill_usesrc_grp_next;
20968 	}
20969 
20970 	if (ifindex == 0) {
20971 		ucill->ill_usesrc_ifindex = 0;
20972 		ucill->ill_usesrc_grp_next = NULL;
20973 		return (0);
20974 	}
20975 
20976 	ucill->ill_usesrc_ifindex = ifindex;
20977 	tmp_ill = uill->ill_usesrc_grp_next;
20978 	uill->ill_usesrc_grp_next = ucill;
20979 	ucill->ill_usesrc_grp_next =
20980 	    (tmp_ill != NULL) ? tmp_ill : uill;
20981 	return (0);
20982 }
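
/*
 * Illustrative before/after sketch for a relink of client C from the
 * group headed by H1 into the group headed by H2 (hypothetical names):
 *
 *	before:	H1 -> C -> X -> H1	H2 -> Y -> H2
 *	after:	H1 -> X -> H1		H2 -> C -> Y -> H2
 *
 * With ifindex == 0 the client is simply unlinked from its old group
 * and its ill_usesrc_ifindex reset, dissolving its membership.
 */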
20983 
20984 /*
20985  * Set the ill_usesrc and ill_usesrc_head fields. See synchronization notes in
20986  * ip.c for locking details.
20987  */
20988 /* ARGSUSED */
20989 int
20990 ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
20991     ip_ioctl_cmd_t *ipip, void *ifreq)
20992 {
20993 	struct lifreq *lifr = (struct lifreq *)ifreq;
20994 	boolean_t isv6 = B_FALSE, reset_flg = B_FALSE,
20995 	    ill_flag_changed = B_FALSE;
20996 	ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill;
20997 	int err = 0, ret;
20998 	uint_t ifindex;
20999 	phyint_t *us_phyint, *us_cli_phyint;
21000 	ipsq_t *ipsq = NULL;
21001 
21002 	ASSERT(IAM_WRITER_IPIF(ipif));
21003 	ASSERT(q->q_next == NULL);
21004 	ASSERT(CONN_Q(q));
21005 
21006 	isv6 = (Q_TO_CONN(q))->conn_af_isv6;
21007 	us_cli_phyint = usesrc_cli_ill->ill_phyint;
21008 
21009 	ASSERT(us_cli_phyint != NULL);
21010 
21011 	/*
21012 	 * If the client ILL is being used for IPMP, abort.
21013 	 * Note, this can be done before ipsq_try_enter since we are already
21014 	 * exclusive on this ILL
21015 	 */
21016 	if ((us_cli_phyint->phyint_groupname != NULL) ||
21017 	    (us_cli_phyint->phyint_flags & PHYI_STANDBY)) {
21018 		return (EINVAL);
21019 	}
21020 
21021 	ifindex = lifr->lifr_index;
21022 	if (ifindex == 0) {
21023 		if (usesrc_cli_ill->ill_usesrc_grp_next == NULL) {
21024 			/* non usesrc group interface, nothing to reset */
21025 			return (0);
21026 		}
21027 		ifindex = usesrc_cli_ill->ill_usesrc_ifindex;
21028 		/* valid reset request */
21029 		reset_flg = B_TRUE;
21030 	}
21031 
21032 	usesrc_ill = ill_lookup_on_ifindex(ifindex, isv6, q, mp,
21033 	    ip_process_ioctl, &err);
21034 
21035 	if (usesrc_ill == NULL) {
21036 		return (err);
21037 	}
21038 
21039 	/*
	 * Neither the usesrc_cli_ill nor the usesrc_ill can be part of
	 * an IPMP group, nor can either of the interfaces be used for
	 * standby.  So to guarantee mutual exclusion with ip_sioctl_flags
	 * (which sets PHYI_STANDBY) and ip_sioctl_groupname (which sets
	 * the groupname), we need to be exclusive on the ipsq belonging
	 * to the usesrc_ill.  We are already exclusive on the ipsq
	 * corresponding to the usesrc_cli_ill.
21047 	 */
21048 	ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl,
21049 	    NEW_OP, B_TRUE);
21050 	if (ipsq == NULL) {
21051 		err = EINPROGRESS;
21052 		/* Operation enqueued on the ipsq of the usesrc ILL */
21053 		goto done;
21054 	}
21055 
21056 	/* Check if the usesrc_ill is used for IPMP */
21057 	us_phyint = usesrc_ill->ill_phyint;
21058 	if ((us_phyint->phyint_groupname != NULL) ||
21059 	    (us_phyint->phyint_flags & PHYI_STANDBY)) {
21060 		err = EINVAL;
21061 		goto done;
21062 	}
21063 
21064 	/*
21065 	 * If the client is already in use as a usesrc_ill or a usesrc_ill is
21066 	 * already a client then return EINVAL
21067 	 */
21068 	if (IS_USESRC_ILL(usesrc_cli_ill) || IS_USESRC_CLI_ILL(usesrc_ill)) {
21069 		err = EINVAL;
21070 		goto done;
21071 	}
21072 
21073 	/*
21074 	 * If the ill_usesrc_ifindex field is already set to what it needs to
21075 	 * be then this is a duplicate operation.
21076 	 */
21077 	if (!reset_flg && usesrc_cli_ill->ill_usesrc_ifindex == ifindex) {
21078 		err = 0;
21079 		goto done;
21080 	}
21081 
21082 	ip1dbg(("ip_sioctl_slifusesrc: usesrc_cli_ill %s, usesrc_ill %s,"
21083 	    " v6 = %d", usesrc_cli_ill->ill_name, usesrc_ill->ill_name,
21084 	    usesrc_ill->ill_isv6));
21085 
21086 	/*
	 * The next step ensures that no new ires will be created referencing
	 * the client ill until the ILL_CHANGING flag is cleared.  Then
	 * we go through an ire walk deleting all ire caches that reference
	 * the client ill.  New ires referencing the client ill that were
	 * added to the ire table before the ILL_CHANGING flag was set will
	 * be cleaned up by the ire walk below.  Attempts to add new ires
	 * referencing the client ill while the ILL_CHANGING flag is set
	 * fail during the ire_add in ire_atomic_start.  ire_atomic_start
	 * atomically checks (under the ill_g_usesrc_lock) that the ire
	 * being added is not stale, i.e. that the ire_stq and ire_ipif
	 * are consistent and belong to the same usesrc group.
21098 	 */
21099 	mutex_enter(&usesrc_cli_ill->ill_lock);
21100 	usesrc_cli_ill->ill_state_flags |= ILL_CHANGING;
21101 	mutex_exit(&usesrc_cli_ill->ill_lock);
21102 	ill_flag_changed = B_TRUE;
21103 
21104 	if (ipif->ipif_isv6)
21105 		ire_walk_v6(ipif_delete_cache_ire, (char *)usesrc_cli_ill,
21106 		    ALL_ZONES);
21107 	else
21108 		ire_walk_v4(ipif_delete_cache_ire, (char *)usesrc_cli_ill,
21109 		    ALL_ZONES);
21110 
21111 	/*
21112 	 * ill_g_usesrc_lock global lock protects the ill_usesrc_grp_next
21113 	 * and the ill_usesrc_ifindex fields
21114 	 */
21115 	rw_enter(&ill_g_usesrc_lock, RW_WRITER);
21116 
21117 	if (reset_flg) {
21118 		ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 0);
21119 		if (ret != 0) {
21120 			err = EINVAL;
21121 		}
21122 		rw_exit(&ill_g_usesrc_lock);
21123 		goto done;
21124 	}
21125 
21126 	/*
21127 	 * Four possibilities to consider:
21128 	 * 1. Both usesrc_ill and usesrc_cli_ill are not part of any usesrc grp
21129 	 * 2. usesrc_ill is part of a group but usesrc_cli_ill isn't
21130 	 * 3. usesrc_cli_ill is part of a group but usesrc_ill isn't
21131 	 * 4. Both are part of their respective usesrc groups
21132 	 */
21133 	if ((usesrc_ill->ill_usesrc_grp_next == NULL) &&
21134 	    (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) {
21135 		ASSERT(usesrc_ill->ill_usesrc_ifindex == 0);
21136 		usesrc_cli_ill->ill_usesrc_ifindex = ifindex;
21137 		usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill;
21138 		usesrc_cli_ill->ill_usesrc_grp_next = usesrc_ill;
21139 	} else if ((usesrc_ill->ill_usesrc_grp_next != NULL) &&
21140 	    (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) {
21141 		usesrc_cli_ill->ill_usesrc_ifindex = ifindex;
21142 		/* Insert at head of list */
21143 		usesrc_cli_ill->ill_usesrc_grp_next =
21144 		    usesrc_ill->ill_usesrc_grp_next;
21145 		usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill;
21146 	} else {
21147 		ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill,
21148 		    ifindex);
21149 		if (ret != 0)
21150 			err = EINVAL;
21151 	}
21152 	rw_exit(&ill_g_usesrc_lock);
21153 
21154 done:
21155 	if (ill_flag_changed) {
21156 		mutex_enter(&usesrc_cli_ill->ill_lock);
21157 		usesrc_cli_ill->ill_state_flags &= ~ILL_CHANGING;
21158 		mutex_exit(&usesrc_cli_ill->ill_lock);
21159 	}
21160 	if (ipsq != NULL)
21161 		ipsq_exit(ipsq, B_TRUE, B_TRUE);
21162 	/* The refrele on the lifr_name ipif is done by ip_process_ioctl */
21163 	ill_refrele(usesrc_ill);
21164 	return (err);
21165 }
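
/*
 * Illustrative userland sketch (not part of the original file):
 * "ifconfig hme0 usesrc vni0" becomes a SIOCSLIFUSESRC on hme0
 * carrying the ifindex of vni0; lifr_index == 0 resets the
 * association.  The interface names are hypothetical.
 *
 *	struct lifreq lifr;
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	(void) memset(&lifr, 0, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "hme0", sizeof (lifr.lifr_name));
 *	lifr.lifr_index = if_nametoindex("vni0");
 *	if (ioctl(s, SIOCSLIFUSESRC, &lifr) == -1)
 *		perror("SIOCSLIFUSESRC");
 */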
21166 
21167 /*
21168  * comparison function used by avl.
21169  */
21170 static int
21171 ill_phyint_compare_index(const void *index_ptr, const void *phyip)
21172 {
21173 
21174 	uint_t index;
21175 
21176 	ASSERT(phyip != NULL && index_ptr != NULL);
21177 
21178 	index = *((uint_t *)index_ptr);
21179 	/*
21180 	 * let the phyint with the lowest index be on top.
21181 	 */
21182 	if (((phyint_t *)phyip)->phyint_ifindex < index)
21183 		return (1);
21184 	if (((phyint_t *)phyip)->phyint_ifindex > index)
21185 		return (-1);
21186 	return (0);
21187 }
21188 
21189 /*
21190  * comparison function used by avl.
21191  */
21192 static int
21193 ill_phyint_compare_name(const void *name_ptr, const void *phyip)
21194 {
21195 	ill_t *ill;
21196 	int res = 0;
21197 
21198 	ASSERT(phyip != NULL && name_ptr != NULL);
21199 
21200 	if (((phyint_t *)phyip)->phyint_illv4)
21201 		ill = ((phyint_t *)phyip)->phyint_illv4;
21202 	else
21203 		ill = ((phyint_t *)phyip)->phyint_illv6;
21204 	ASSERT(ill != NULL);
21205 
21206 	res = strcmp(ill->ill_name, (char *)name_ptr);
21207 	if (res > 0)
21208 		return (1);
21209 	else if (res < 0)
21210 		return (-1);
21211 	return (0);
21212 }
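
/*
 * Note on the two comparators above (added for clarity): their first
 * argument is a search key (an ifindex or a name string), not a tree
 * node.  This matches how the trees are used in this file: lookups go
 * through avl_find() with a key, and insertions use avl_insert() with
 * the "where" cookie from a preceding avl_find() (see
 * ill_phyint_reinit()), never avl_add(), which would pass a node as
 * the first argument.
 */
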
21213 /*
21214  * This function is called from ill_delete when the ill is being
21215  * unplumbed. We remove the reference from the phyint and we also
21216  * free the phyint when there are no more references to it.
21217  */
21218 static void
21219 ill_phyint_free(ill_t *ill)
21220 {
21221 	phyint_t *phyi;
21222 	phyint_t *next_phyint;
21223 	ipsq_t *cur_ipsq;
21224 
21225 	ASSERT(ill->ill_phyint != NULL);
21226 
21227 	ASSERT(RW_WRITE_HELD(&ill_g_lock));
21228 	phyi = ill->ill_phyint;
21229 	ill->ill_phyint = NULL;
21230 	/*
	 * ill_init always allocates a phyint to store the copy of the
	 * flags relevant to the phyint.  At that point in time, we could
	 * not assign the name and hence phyint_illv4/v6 could not be
	 * initialized.  Later in ipif_set_values, we assign the name to
	 * the ill, at which point we also assign phyint_illv4/v6.
	 * Thus we can't rely on phyint_illv4/v6 always being initialized.
21237 	 */
21238 	if (ill->ill_flags & ILLF_IPV6) {
21239 		phyi->phyint_illv6 = NULL;
21240 	} else {
21241 		phyi->phyint_illv4 = NULL;
21242 	}
21243 	/*
21244 	 * ipif_down removes it from the group when the last ipif goes
21245 	 * down.
21246 	 */
21247 	ASSERT(ill->ill_group == NULL);
21248 
21249 	if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL)
21250 		return;
21251 
21252 	/*
21253 	 * Make sure this phyint was put in the list.
21254 	 */
21255 	if (phyi->phyint_ifindex > 0) {
21256 		avl_remove(&phyint_g_list.phyint_list_avl_by_index,
21257 		    phyi);
21258 		avl_remove(&phyint_g_list.phyint_list_avl_by_name,
21259 		    phyi);
21260 	}
21261 	/*
21262 	 * remove phyint from the ipsq list.
21263 	 */
21264 	cur_ipsq = phyi->phyint_ipsq;
21265 	if (phyi == cur_ipsq->ipsq_phyint_list) {
21266 		cur_ipsq->ipsq_phyint_list = phyi->phyint_ipsq_next;
21267 	} else {
21268 		next_phyint = cur_ipsq->ipsq_phyint_list;
21269 		while (next_phyint != NULL) {
21270 			if (next_phyint->phyint_ipsq_next == phyi) {
21271 				next_phyint->phyint_ipsq_next =
21272 					phyi->phyint_ipsq_next;
21273 				break;
21274 			}
21275 			next_phyint = next_phyint->phyint_ipsq_next;
21276 		}
21277 		ASSERT(next_phyint != NULL);
21278 	}
21279 	IPSQ_DEC_REF(cur_ipsq);
21280 
21281 	if (phyi->phyint_groupname_len != 0) {
21282 		ASSERT(phyi->phyint_groupname != NULL);
21283 		mi_free(phyi->phyint_groupname);
21284 	}
21285 	mi_free(phyi);
21286 }
21287 
21288 /*
21289  * Attach the ill to the phyint structure which can be shared by both
21290  * IPv4 and IPv6 ill. ill_init allocates a phyint to just hold flags. This
21291  * function is called from ipif_set_values and ill_lookup_on_name (for
21292  * loopback) where we know the name of the ill. We lookup the ill and if
21293  * there is one present already with the name use that phyint. Otherwise
21294  * reuse the one allocated by ill_init.
21295  */
21296 static void
21297 ill_phyint_reinit(ill_t *ill)
21298 {
21299 	boolean_t isv6 = ill->ill_isv6;
21300 	phyint_t *phyi_old;
21301 	phyint_t *phyi;
21302 	avl_index_t where = 0;
21303 	ill_t	*ill_other = NULL;
21304 	ipsq_t	*ipsq;
21305 
21306 	ASSERT(RW_WRITE_HELD(&ill_g_lock));
21307 
21308 	phyi_old = ill->ill_phyint;
21309 	ASSERT(isv6 || (phyi_old->phyint_illv4 == ill &&
21310 	    phyi_old->phyint_illv6 == NULL));
21311 	ASSERT(!isv6 || (phyi_old->phyint_illv6 == ill &&
21312 	    phyi_old->phyint_illv4 == NULL));
21313 	ASSERT(phyi_old->phyint_ifindex == 0);
21314 
21315 	phyi = avl_find(&phyint_g_list.phyint_list_avl_by_name,
21316 	    ill->ill_name, &where);
21317 
21318 	/*
21319 	 * 1. We grabbed the ill_g_lock before inserting this ill into
21320 	 *    the global list of ills. So no other thread could have located
21321 	 *    this ill and hence the ipsq of this ill is guaranteed to be empty.
21322 	 * 2. Now locate the other protocol instance of this ill.
21323 	 * 3. Now grab both ill locks in the right order, and the phyint lock of
21324 	 *    the new ipsq. Holding ill locks + ill_g_lock ensures that the ipsq
21325 	 *    of neither ill can change.
21326 	 * 4. Merge the phyint and thus the ipsq as well of this ill onto the
21327 	 *    other ill.
21328 	 * 5. Release all locks.
21329 	 */
21330 
21331 	/*
21332 	 * Look for IPv4 if we are initializing IPv6 or look for IPv6 if
21333 	 * we are initializing IPv4.
21334 	 */
21335 	if (phyi != NULL) {
21336 		ill_other = (isv6) ? phyi->phyint_illv4 :
21337 		    phyi->phyint_illv6;
21338 		ASSERT(ill_other->ill_phyint != NULL);
21339 		ASSERT((isv6 && !ill_other->ill_isv6) ||
21340 		    (!isv6 && ill_other->ill_isv6));
21341 		GRAB_ILL_LOCKS(ill, ill_other);
21342 		/*
21343 		 * We are potentially throwing away phyint_flags which
21344 		 * could be different from the one that we obtain from
21345 		 * ill_other->ill_phyint. But it is okay as we are assuming
21346 		 * that the state maintained within IP is correct.
21347 		 */
21348 		mutex_enter(&phyi->phyint_lock);
21349 		if (isv6) {
21350 			ASSERT(phyi->phyint_illv6 == NULL);
21351 			phyi->phyint_illv6 = ill;
21352 		} else {
21353 			ASSERT(phyi->phyint_illv4 == NULL);
21354 			phyi->phyint_illv4 = ill;
21355 		}
21356 		/*
		 * This is a new ill, currently undergoing SLIFNAME.
		 * So we could not have joined an IPMP group yet.
21359 		 */
21360 		ASSERT(phyi_old->phyint_ipsq_next == NULL &&
21361 		    phyi_old->phyint_groupname == NULL);
21362 
21363 		/*
21364 		 * This phyi_old is going away. Decref ipsq_refs and
21365 		 * assert it is zero. The ipsq itself will be freed in
21366 		 * ipsq_exit
21367 		 */
21368 		ipsq = phyi_old->phyint_ipsq;
21369 		IPSQ_DEC_REF(ipsq);
21370 		ASSERT(ipsq->ipsq_refs == 0);
21371 		/* Get the singleton phyint out of the ipsq list */
21372 		ASSERT(phyi_old->phyint_ipsq_next == NULL);
21373 		ipsq->ipsq_phyint_list = NULL;
21374 		phyi_old->phyint_illv4 = NULL;
21375 		phyi_old->phyint_illv6 = NULL;
21376 		mi_free(phyi_old);
21377 	} else {
21378 		mutex_enter(&ill->ill_lock);
21379 		/*
21380 		 * We don't need to acquire any lock, since
21381 		 * the ill is not yet visible globally  and we
21382 		 * have not yet released the ill_g_lock.
21383 		 */
21384 		phyi = phyi_old;
21385 		mutex_enter(&phyi->phyint_lock);
21386 		/* XXX We need a recovery strategy here. */
21387 		if (!phyint_assign_ifindex(phyi))
21388 			cmn_err(CE_PANIC, "phyint_assign_ifindex() failed");
21389 
21390 		avl_insert(&phyint_g_list.phyint_list_avl_by_name,
21391 		    (void *)phyi, where);
21392 
21393 		(void) avl_find(&phyint_g_list.phyint_list_avl_by_index,
21394 		    &phyi->phyint_ifindex, &where);
21395 		avl_insert(&phyint_g_list.phyint_list_avl_by_index,
21396 		    (void *)phyi, where);
21397 	}
21398 
21399 	/*
21400 	 * Reassigning ill_phyint automatically reassigns the ipsq also.
	 * The pending mp is not affected because that is kept per ill.
21402 	 */
21403 	ill->ill_phyint = phyi;
21404 
21405 	/*
	 * Keep the index in ipif_orig_ifindex to be used by FAILOVER.
21407 	 * We do this here as when the first ipif was allocated,
21408 	 * ipif_allocate does not know the right interface index.
21409 	 */
21410 
21411 	ill->ill_ipif->ipif_orig_ifindex = ill->ill_phyint->phyint_ifindex;
21412 	/*
	 * Now that the phyint's ifindex has been assigned, complete the
	 * remaining per-interface MIB initialization.
21415 	 */
21416 	if (ill->ill_isv6) {
21417 		ill->ill_ip6_mib->ipv6IfIndex =
21418 		    ill->ill_phyint->phyint_ifindex;
21419 		ill->ill_icmp6_mib->ipv6IfIcmpIfIndex =
21420 		    ill->ill_phyint->phyint_ifindex;
21421 	}
21422 
21423 	RELEASE_ILL_LOCKS(ill, ill_other);
21424 	mutex_exit(&phyi->phyint_lock);
21425 }
21426 
21427 /*
21428  * Notify any downstream modules of the name of this interface.
21429  * An M_IOCTL is used even though we don't expect a successful reply.
21430  * Any reply message from the driver (presumably an M_IOCNAK) will
21431  * eventually get discarded somewhere upstream.  The message format is
21432  * simply an SIOCSLIFNAME ioctl just as might be sent from ifconfig
21433  * to IP.
21434  */
21435 static void
21436 ip_ifname_notify(ill_t *ill, queue_t *q)
21437 {
21438 	mblk_t *mp1, *mp2;
21439 	struct iocblk *iocp;
21440 	struct lifreq *lifr;
21441 
21442 	mp1 = mkiocb(SIOCSLIFNAME);
21443 	if (mp1 == NULL)
21444 		return;
21445 	mp2 = allocb(sizeof (struct lifreq), BPRI_HI);
21446 	if (mp2 == NULL) {
21447 		freeb(mp1);
21448 		return;
21449 	}
21450 
21451 	mp1->b_cont = mp2;
21452 	iocp = (struct iocblk *)mp1->b_rptr;
21453 	iocp->ioc_count = sizeof (struct lifreq);
21454 
21455 	lifr = (struct lifreq *)mp2->b_rptr;
21456 	mp2->b_wptr += sizeof (struct lifreq);
21457 	bzero(lifr, sizeof (struct lifreq));
21458 
21459 	(void) strncpy(lifr->lifr_name, ill->ill_name, LIFNAMSIZ);
21460 	lifr->lifr_ppa = ill->ill_ppa;
21461 	lifr->lifr_flags = (ill->ill_flags & (ILLF_IPV4|ILLF_IPV6));
21462 
21463 	putnext(q, mp1);
21464 }
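
/*
 * Illustrative sketch (assumptions flagged, not from the original
 * file): what a downstream module's put routine might do with the
 * message built above.  The handler name is hypothetical; the layout
 * (an M_IOCTL whose b_cont holds a struct lifreq) is as constructed
 * in ip_ifname_notify().
 *
 *	static void
 *	xx_ioctl(queue_t *q, mblk_t *mp)
 *	{
 *		struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
 *		struct lifreq *lifr;
 *
 *		if (iocp->ioc_cmd == SIOCSLIFNAME && mp->b_cont != NULL) {
 *			lifr = (struct lifreq *)mp->b_cont->b_rptr;
 *			(record lifr->lifr_name and lifr->lifr_ppa)
 *		}
 *		miocnak(q, mp, 0, EINVAL);	(IP discards the reply)
 *	}
 */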
21465 
21466 static boolean_t ip_trash_timer_started = B_FALSE;
21467 
21468 static int
21469 ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
21470 {
21471 	int err;
21472 
21473 	/* Set the obsolete NDD per-interface forwarding name. */
21474 	err = ill_set_ndd_name(ill);
21475 	if (err != 0) {
21476 		cmn_err(CE_WARN, "ipif_set_values: ill_set_ndd_name (%d)\n",
21477 		    err);
21478 	}
21479 
21480 	/* Tell downstream modules where they are. */
21481 	ip_ifname_notify(ill, q);
21482 
21483 	/*
21484 	 * ill_dl_phys returns EINPROGRESS in the usual case.
21485 	 * Error cases are ENOMEM ...
21486 	 */
21487 	err = ill_dl_phys(ill, ipif, mp, q);
21488 
21489 	/*
21490 	 * If there is no IRE expiration timer running, get one started.
	 * igmp and mld timers will be triggered by the first multicast join.
21492 	 */
21493 	if (!ip_trash_timer_started) {
21494 		/*
21495 		 * acquire the lock and check again.
21496 		 */
21497 		mutex_enter(&ip_trash_timer_lock);
21498 		if (!ip_trash_timer_started) {
21499 			ip_ire_expire_id = timeout(ip_trash_timer_expire, NULL,
21500 			    MSEC_TO_TICK(ip_timer_interval));
21501 			ip_trash_timer_started = B_TRUE;
21502 		}
21503 		mutex_exit(&ip_trash_timer_lock);
21504 	}
21505 
21506 	if (ill->ill_isv6) {
21507 		mutex_enter(&mld_slowtimeout_lock);
21508 		if (mld_slowtimeout_id == 0) {
21509 			mld_slowtimeout_id = timeout(mld_slowtimo, NULL,
21510 			    MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
21511 		}
21512 		mutex_exit(&mld_slowtimeout_lock);
21513 	} else {
21514 		mutex_enter(&igmp_slowtimeout_lock);
21515 		if (igmp_slowtimeout_id == 0) {
21516 			igmp_slowtimeout_id = timeout(igmp_slowtimo, NULL,
21517 				MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
21518 		}
21519 		mutex_exit(&igmp_slowtimeout_lock);
21520 	}
21521 
21522 	return (err);
21523 }
21524 
21525 /*
21526  * Common routine for ppa and ifname setting. Should be called exclusive.
21527  *
21528  * Returns EINPROGRESS when mp has been consumed by queueing it on
21529  * ill_pending_mp and the ioctl will complete in ip_rput.
21530  *
 * NOTE: If ppa is UINT_MAX, we assign the next valid ppa and return
 * the new name and new ppa in lifr_name and lifr_ppa respectively.
 * For SLIFNAME, we pass these values back to userland.
21534  */
21535 static int
21536 ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr)
21537 {
21538 	ill_t	*ill;
21539 	ipif_t	*ipif;
21540 	ipsq_t	*ipsq;
21541 	char	*ppa_ptr;
21542 	char	*old_ptr;
21543 	char	old_char;
21544 	int	error;
21545 
21546 	ip1dbg(("ipif_set_values: interface %s\n", interf_name));
21547 	ASSERT(q->q_next != NULL);
21548 	ASSERT(interf_name != NULL);
21549 
21550 	ill = (ill_t *)q->q_ptr;
21551 
21552 	ASSERT(ill->ill_name[0] == '\0');
21553 	ASSERT(IAM_WRITER_ILL(ill));
21554 	ASSERT((mi_strlen(interf_name) + 1) <= LIFNAMSIZ);
21555 	ASSERT(ill->ill_ppa == UINT_MAX);
21556 
21557 	/* The ppa is sent down by ifconfig or is chosen */
21558 	if ((ppa_ptr = ill_get_ppa_ptr(interf_name)) == NULL) {
21559 		return (EINVAL);
21560 	}
21561 
21562 	/*
	 * Make sure the ppa passed in is the same as the ppa in the name.
	 * This check is not made when ppa == UINT_MAX; in that case the
	 * ppa in the name could be anything.  The system will choose a
	 * ppa and update new_ppa_ptr and interf_name to contain the
	 * chosen ppa.
21567 	 */
21568 	if (*new_ppa_ptr != UINT_MAX) {
21569 		/* stoi changes the pointer */
21570 		old_ptr = ppa_ptr;
21571 		/*
21572 		 * ifconfig passed in 0 for the ppa for DLPI 1 style devices
21573 		 * (they don't have an externally visible ppa).  We assign one
21574 		 * here so that we can manage the interface.  Note that in
21575 		 * the past this value was always 0 for DLPI 1 drivers.
21576 		 */
21577 		if (*new_ppa_ptr == 0)
21578 			*new_ppa_ptr = stoi(&old_ptr);
21579 		else if (*new_ppa_ptr != (uint_t)stoi(&old_ptr))
21580 			return (EINVAL);
21581 	}
21582 	/*
	 * Terminate the string before the ppa, saving the character
	 * at that location so that it can be restored later.
21585 	 */
21586 	old_char = ppa_ptr[0];
21587 	ppa_ptr[0] = '\0';
21588 
21589 	ill->ill_ppa = *new_ppa_ptr;
21590 	/*
21591 	 * Finish as much work now as possible before calling ill_glist_insert
21592 	 * which makes the ill globally visible and also merges it with the
21593 	 * other protocol instance of this phyint. The remaining work is
21594 	 * done after entering the ipsq which may happen sometime later.
21595 	 * ill_set_ndd_name occurs after the ill has been made globally visible.
21596 	 */
21597 	ipif = ill->ill_ipif;
21598 
21599 	/* We didn't do this when we allocated ipif in ip_ll_subnet_defaults */
21600 	ipif_assign_seqid(ipif);
21601 
21602 	if (!(ill->ill_flags & (ILLF_IPV4|ILLF_IPV6)))
21603 		ill->ill_flags |= ILLF_IPV4;
21604 
21605 	ASSERT(ipif->ipif_next == NULL);	/* Only one ipif on ill */
21606 	ASSERT((ipif->ipif_flags & IPIF_UP) == 0);
21607 
21608 	if (ill->ill_flags & ILLF_IPV6) {
21609 
21610 		ill->ill_isv6 = B_TRUE;
21611 		if (ill->ill_rq != NULL) {
21612 			ill->ill_rq->q_qinfo = &rinit_ipv6;
21613 			ill->ill_wq->q_qinfo = &winit_ipv6;
21614 		}
21615 
21616 		/* Keep the !IN6_IS_ADDR_V4MAPPED assertions happy */
21617 		ipif->ipif_v6lcl_addr = ipv6_all_zeros;
21618 		ipif->ipif_v6src_addr = ipv6_all_zeros;
21619 		ipif->ipif_v6subnet = ipv6_all_zeros;
21620 		ipif->ipif_v6net_mask = ipv6_all_zeros;
21621 		ipif->ipif_v6brd_addr = ipv6_all_zeros;
21622 		ipif->ipif_v6pp_dst_addr = ipv6_all_zeros;
21623 		/*
		 * Point-to-point or non-multicast-capable
		 * interfaces won't do NUD unless explicitly
21626 		 * configured to do so.
21627 		 */
21628 		if (ipif->ipif_flags & IPIF_POINTOPOINT ||
21629 		    !(ill->ill_flags & ILLF_MULTICAST)) {
21630 			ill->ill_flags |= ILLF_NONUD;
21631 		}
21632 		/* Make sure IPv4 specific flag is not set on IPv6 if */
21633 		if (ill->ill_flags & ILLF_NOARP) {
21634 			/*
21635 			 * Note: xresolv interfaces will eventually need
21636 			 * NOARP set here as well, but that will require
21637 			 * those external resolvers to have some
21638 			 * knowledge of that flag and act appropriately.
21639 			 * Not to be changed at present.
21640 			 */
21641 			ill->ill_flags &= ~ILLF_NOARP;
21642 		}
21643 		/*
21644 		 * Set the ILLF_ROUTER flag according to the global
21645 		 * IPv6 forwarding policy.
21646 		 */
21647 		if (ipv6_forward != 0)
21648 			ill->ill_flags |= ILLF_ROUTER;
21649 	} else if (ill->ill_flags & ILLF_IPV4) {
21650 		ill->ill_isv6 = B_FALSE;
21651 		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6lcl_addr);
21652 		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6src_addr);
21653 		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6subnet);
21654 		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6net_mask);
21655 		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6brd_addr);
21656 		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6pp_dst_addr);
21657 		/*
21658 		 * Set the ILLF_ROUTER flag according to the global
21659 		 * IPv4 forwarding policy.
21660 		 */
21661 		if (ip_g_forward != 0)
21662 			ill->ill_flags |= ILLF_ROUTER;
21663 	}
21664 
21665 	ASSERT(ill->ill_phyint != NULL);
21666 
21667 	/*
	 * The ipv6IfIndex and ipv6IfIcmpIfIndex assignments will
21669 	 * be completed in ill_glist_insert -> ill_phyint_reinit
21670 	 */
21671 	if (ill->ill_isv6) {
21672 		/* allocate v6 mib */
21673 		if (!ill_allocate_mibs(ill))
21674 			return (ENOMEM);
21675 	}
21676 
21677 	/*
21678 	 * Pick a default sap until we get the DL_INFO_ACK back from
21679 	 * the driver.
21680 	 */
21681 	if (ill->ill_sap == 0) {
21682 		if (ill->ill_isv6)
21683 			ill->ill_sap  = IP6_DL_SAP;
21684 		else
21685 			ill->ill_sap  = IP_DL_SAP;
21686 	}
21687 
21688 	ill->ill_ifname_pending = 1;
21689 	ill->ill_ifname_pending_err = 0;
21690 
21691 	ill_refhold(ill);
21692 	rw_enter(&ill_g_lock, RW_WRITER);
21693 	if ((error = ill_glist_insert(ill, interf_name,
21694 	    (ill->ill_flags & ILLF_IPV6) == ILLF_IPV6)) > 0) {
21695 		ill->ill_ppa = UINT_MAX;
21696 		ill->ill_name[0] = '\0';
21697 		/*
21698 		 * undo null termination done above.
21699 		 */
21700 		ppa_ptr[0] = old_char;
21701 		rw_exit(&ill_g_lock);
21702 		ill_refrele(ill);
21703 		return (error);
21704 	}
21705 
21706 	ASSERT(ill->ill_name_length <= LIFNAMSIZ);
21707 
21708 	/*
	 * When we return, the buffer pointed to by interf_name should
	 * contain the same name as in ill_name.
	 * If a ppa was chosen by the system (the ppa passed in was
	 * UINT_MAX), the buffer pointed to by new_ppa_ptr would not
	 * contain the right ppa, so copy the full name and update the
	 * ppa pointer.
	 * When the ppa passed in != UINT_MAX, all values are correct;
	 * just undo the null termination.  This saves a bcopy.
21716 	 */
21717 	if (*new_ppa_ptr == UINT_MAX) {
21718 		bcopy(ill->ill_name, interf_name, ill->ill_name_length);
21719 		*new_ppa_ptr = ill->ill_ppa;
21720 	} else {
21721 		/*
21722 		 * undo null termination done above.
21723 		 */
21724 		ppa_ptr[0] = old_char;
21725 	}
21726 
21727 	/* Let SCTP know about this ILL */
21728 	sctp_update_ill(ill, SCTP_ILL_INSERT);
21729 
21730 	/* and also about the first ipif */
21731 	sctp_update_ipif(ipif, SCTP_IPIF_INSERT);
21732 
21733 	ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_reprocess_ioctl, NEW_OP,
21734 	    B_TRUE);
21735 
21736 	rw_exit(&ill_g_lock);
21737 	ill_refrele(ill);
21738 	if (ipsq == NULL)
21739 		return (EINPROGRESS);
21740 
21741 	/*
21742 	 * Need to set the ipsq_current_ipif now, if we have changed ipsq
21743 	 * due to the phyint merge in ill_phyint_reinit.
21744 	 */
21745 	ASSERT(ipsq->ipsq_current_ipif == NULL ||
21746 		ipsq->ipsq_current_ipif == ipif);
21747 	ipsq->ipsq_current_ipif = ipif;
21748 	ipsq->ipsq_last_cmd = SIOCSLIFNAME;
21749 	error = ipif_set_values_tail(ill, ipif, mp, q);
21750 	ipsq_exit(ipsq, B_TRUE, B_TRUE);
21751 	if (error != 0 && error != EINPROGRESS) {
21752 		/*
21753 		 * restore previous values
21754 		 */
21755 		ill->ill_isv6 = B_FALSE;
21756 	}
21757 	return (error);
21758 }
21759 
21760 
21761 extern void (*ip_cleanup_func)(void);
21762 
21763 void
21764 ipif_init(void)
21765 {
21766 	hrtime_t hrt;
21767 	int i;
21768 
21769 	/*
21770 	 * Can't call drv_getparm here as it is too early in the boot.
	 * As we use ipif_src_random just for picking a different
	 * source address every time, this need not be truly random.
21773 	 */
21774 	hrt = gethrtime();
21775 	ipif_src_random = ((hrt >> 32) & 0xffffffff) * (hrt & 0xffffffff);
21776 
21777 	for (i = 0; i < MAX_G_HEADS; i++) {
21778 		ill_g_heads[i].ill_g_list_head = (ill_if_t *)&ill_g_heads[i];
21779 		ill_g_heads[i].ill_g_list_tail = (ill_if_t *)&ill_g_heads[i];
21780 	}
21781 
21782 	avl_create(&phyint_g_list.phyint_list_avl_by_index,
21783 	    ill_phyint_compare_index,
21784 	    sizeof (phyint_t),
21785 	    offsetof(struct phyint, phyint_avl_by_index));
21786 	avl_create(&phyint_g_list.phyint_list_avl_by_name,
21787 	    ill_phyint_compare_name,
21788 	    sizeof (phyint_t),
21789 	    offsetof(struct phyint, phyint_avl_by_name));
21790 
21791 	ip_cleanup_func = ip_thread_exit;
21792 }
21793 
21794 /*
21795  * This is called by ip_rt_add when src_addr value is other than zero.
21796  * src_addr signifies the source address of the incoming packet. For
21797  * reverse tunnel route we need to create a source addr based routing
21798  * table. This routine creates ip_mrtun_table if it's empty and then
21799  * it adds the route entry hashed by source address. It verifies that
21800  * the outgoing interface is always a non-resolver interface (tunnel).
21801  */
21802 int
21803 ip_mrtun_rt_add(ipaddr_t in_src_addr, int flags, ipif_t *ipif_arg,
21804     ipif_t *src_ipif, ire_t **ire_arg, queue_t *q, mblk_t *mp, ipsq_func_t func)
21805 {
21806 	ire_t   *ire;
21807 	ire_t	*save_ire;
21808 	ipif_t  *ipif;
21809 	ill_t   *in_ill = NULL;
21810 	ill_t	*out_ill;
21811 	queue_t	*stq;
21812 	mblk_t	*dlureq_mp;
21813 	int	error;
21814 
21815 	if (ire_arg != NULL)
21816 		*ire_arg = NULL;
21817 	ASSERT(in_src_addr != INADDR_ANY);
21818 
21819 	ipif = ipif_arg;
21820 	if (ipif != NULL) {
21821 		out_ill = ipif->ipif_ill;
21822 	} else {
21823 		ip1dbg(("ip_mrtun_rt_add: ipif is NULL\n"));
21824 		return (EINVAL);
21825 	}
21826 
21827 	if (src_ipif == NULL) {
21828 		ip1dbg(("ip_mrtun_rt_add: src_ipif is NULL\n"));
21829 		return (EINVAL);
21830 	}
21831 	in_ill = src_ipif->ipif_ill;
21832 
21833 	/*
21834 	 * Check for duplicates. We don't need to
21835 	 * match out_ill, because the uniqueness of
21836 	 * a route is only dependent on src_addr and
21837 	 * in_ill.
21838 	 */
21839 	ire = ire_mrtun_lookup(in_src_addr, in_ill);
21840 	if (ire != NULL) {
21841 		ire_refrele(ire);
21842 		return (EEXIST);
21843 	}
21844 	if (ipif->ipif_net_type != IRE_IF_NORESOLVER) {
21845 		ip2dbg(("ip_mrtun_rt_add: outgoing interface is type %d\n",
21846 		    ipif->ipif_net_type));
21847 		return (EINVAL);
21848 	}
21849 
21850 	stq = ipif->ipif_wq;
21851 	ASSERT(stq != NULL);
21852 
21853 	/*
	 * The outgoing interface must be a non-resolver
	 * interface.
21856 	 */
21857 	dlureq_mp = ill_dlur_gen(NULL,
21858 	    out_ill->ill_phys_addr_length, out_ill->ill_sap,
21859 	    out_ill->ill_sap_length);
21860 
21861 	if (dlureq_mp == NULL) {
		ip1dbg(("ip_mrtun_rt_add: dlureq_mp NULL\n"));
21863 		return (ENOMEM);
21864 	}
21865 
21866 	/* Create the IRE. */
21867 
21868 	ire = ire_create(
21869 	    NULL,				/* Zero dst addr */
21870 	    NULL,				/* Zero mask */
21871 	    NULL,				/* Zero gateway addr */
21872 	    NULL,				/* Zero ipif_src addr */
21873 	    (uint8_t *)&in_src_addr,		/* in_src-addr */
21874 	    &ipif->ipif_mtu,
21875 	    NULL,
21876 	    NULL,				/* rfq */
21877 	    stq,
21878 	    IRE_MIPRTUN,
21879 	    dlureq_mp,
21880 	    ipif,
21881 	    in_ill,
21882 	    0,
21883 	    0,
21884 	    0,
21885 	    flags,
21886 	    &ire_uinfo_null);
21887 
21888 	if (ire == NULL)
21889 		return (ENOMEM);
21890 	ip2dbg(("ip_mrtun_rt_add: mrtun route is created with type %d\n",
21891 	    ire->ire_type));
21892 	save_ire = ire;
21893 	ASSERT(save_ire != NULL);
21894 	error = ire_add_mrtun(&ire, q, mp, func);
21895 	/*
21896 	 * If ire_add_mrtun() failed, the ire passed in was freed
21897 	 * so there is no need to do so here.
21898 	 */
21899 	if (error != 0) {
21900 		return (error);
21901 	}
21902 
21903 	/* Duplicate check */
21904 	if (ire != save_ire) {
21905 		/* route already exists by now */
21906 		ire_refrele(ire);
21907 		return (EEXIST);
21908 	}
21909 
21910 	if (ire_arg != NULL) {
21911 		/*
		 * Store the ire that was just added.  The caller,
		 * ip_rts_request, is responsible for doing ire_refrele()
		 * on it.
21915 		 */
21916 		*ire_arg = ire;
21917 	} else {
21918 		ire_refrele(ire);	/* held in ire_add_mrtun */
21919 	}
21920 
21921 	return (0);
21922 }
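
/*
 * Illustrative sketch (not from the original source) of how a caller
 * such as ip_rts_request() might use this interface; the flag, ipif and
 * queue arguments below are placeholders:
 *
 *	ire_t	*ire;
 *	int	err;
 *
 *	err = ip_mrtun_rt_add(in_src_addr, RTF_UP, ipif, src_ipif,
 *	    &ire, q, mp, func);
 *	if (err == 0)
 *		ire_refrele(ire);	release the ire held for us
 */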
21923 
21924 /*
21925  * Called by ip_rt_delete() only when the mipagent requests deletion of
21926  * a reverse tunnel route that was previously added by ip_mrtun_rt_add().
21927  */
21929 int
21930 ip_mrtun_rt_delete(ipaddr_t in_src_addr, ipif_t *src_ipif)
21931 {
21932 	ire_t   *ire = NULL;
21933 
21934 	if (in_src_addr == INADDR_ANY)
21935 		return (EINVAL);
21936 	if (src_ipif == NULL)
21937 		return (EINVAL);
21938 
21939 	/* search if this route exists in the ip_mrtun_table */
21940 	ire = ire_mrtun_lookup(in_src_addr, src_ipif->ipif_ill);
21941 	if (ire == NULL) {
21942 		ip2dbg(("ip_mrtun_rt_delete: ire not found\n"));
21943 		return (ESRCH);
21944 	}
21945 	ire_delete(ire);
21946 	ire_refrele(ire);
21947 	return (0);
21948 }
21949 
21950 /*
21951  * Lookup the ipif corresponding to the onlink destination address. For
21952  * point-to-point interfaces, it matches the remote endpoint (destination)
21953  * address. For point-to-multipoint interfaces it only tries to match the
21954  * destination against the interface's subnet address. The longest, most
21955  * specific match wins, handling rare configurations such as
21956  * le0: 129.146.1.1/16 and le1: 129.146.2.2/24, where a destination of
21957  * 129.146.2.5 matches both subnets but le1's longer mask prevails.
21958  * It is used only by SO_DONTROUTE at the moment.
21959  */
21960 ipif_t *
21961 ipif_lookup_onlink_addr(ipaddr_t addr, zoneid_t zoneid)
21962 {
21963 	ipif_t	*ipif, *best_ipif;
21964 	ill_t	*ill;
21965 	ill_walk_context_t ctx;
21966 
21967 	ASSERT(zoneid != ALL_ZONES);
21968 	best_ipif = NULL;
21969 
21970 	rw_enter(&ill_g_lock, RW_READER);
21971 	ill = ILL_START_WALK_V4(&ctx);
21972 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
21973 		mutex_enter(&ill->ill_lock);
21974 		for (ipif = ill->ill_ipif; ipif != NULL;
21975 		    ipif = ipif->ipif_next) {
21976 			if (!IPIF_CAN_LOOKUP(ipif))
21977 				continue;
21978 			if (ipif->ipif_zoneid != zoneid)
21979 				continue;
21980 			/*
21981 			 * Point-to-point case. Look for exact match with
21982 			 * destination address.
21983 			 */
21984 			if (ipif->ipif_flags & IPIF_POINTOPOINT) {
21985 				if (ipif->ipif_pp_dst_addr == addr) {
21986 					ipif_refhold_locked(ipif);
21987 					mutex_exit(&ill->ill_lock);
21988 					rw_exit(&ill_g_lock);
21989 					if (best_ipif != NULL)
21990 						ipif_refrele(best_ipif);
21991 					return (ipif);
21992 				}
21993 			} else if (ipif->ipif_subnet == (addr &
21994 			    ipif->ipif_net_mask)) {
21995 				/*
21996 				 * Point-to-multipoint case. Loop through to
21997 				 * find the most specific match. If there are
21998 				 * multiple best-match ipifs, prefer those
21999 				 * that are UP. If the only best-match ipif
22000 				 * is DOWN, we must still return it.
22001 				 */
22002 				if ((best_ipif == NULL) ||
22003 				    (ipif->ipif_net_mask >
22004 				    best_ipif->ipif_net_mask) ||
22005 				    ((ipif->ipif_net_mask ==
22006 				    best_ipif->ipif_net_mask) &&
22007 				    ((ipif->ipif_flags & IPIF_UP) &&
22008 				    (!(best_ipif->ipif_flags & IPIF_UP))))) {
22009 					ipif_refhold_locked(ipif);
22010 					mutex_exit(&ill->ill_lock);
22011 					rw_exit(&ill_g_lock);
22012 					if (best_ipif != NULL)
22013 						ipif_refrele(best_ipif);
22014 					best_ipif = ipif;
22015 					rw_enter(&ill_g_lock, RW_READER);
22016 					mutex_enter(&ill->ill_lock);
22017 				}
22018 			}
22019 		}
22020 		mutex_exit(&ill->ill_lock);
22021 	}
22022 	rw_exit(&ill_g_lock);
22023 	return (best_ipif);
22024 }
22025 
22026 
22027 /*
22028  * Save enough information so that we can recreate the IRE if
22029  * the interface goes down and then up.
22030  */
22031 static void
22032 ipif_save_ire(ipif_t *ipif, ire_t *ire)
22033 {
22034 	mblk_t	*save_mp;
22035 
22036 	save_mp = allocb(sizeof (ifrt_t), BPRI_MED);
22037 	if (save_mp != NULL) {
22038 		ifrt_t	*ifrt;
22039 
22040 		save_mp->b_wptr += sizeof (ifrt_t);
22041 		ifrt = (ifrt_t *)save_mp->b_rptr;
22042 		bzero(ifrt, sizeof (ifrt_t));
22043 		ifrt->ifrt_type = ire->ire_type;
22044 		ifrt->ifrt_addr = ire->ire_addr;
22045 		ifrt->ifrt_gateway_addr = ire->ire_gateway_addr;
22046 		ifrt->ifrt_src_addr = ire->ire_src_addr;
22047 		ifrt->ifrt_mask = ire->ire_mask;
22048 		ifrt->ifrt_flags = ire->ire_flags;
22049 		ifrt->ifrt_max_frag = ire->ire_max_frag;
22050 		mutex_enter(&ipif->ipif_saved_ire_lock);
22051 		save_mp->b_cont = ipif->ipif_saved_ire_mp;
22052 		ipif->ipif_saved_ire_mp = save_mp;
22053 		ipif->ipif_saved_ire_cnt++;
22054 		mutex_exit(&ipif->ipif_saved_ire_lock);
22055 	}
22056 }
22057 
22058 
22059 static void
22060 ipif_remove_ire(ipif_t *ipif, ire_t *ire)
22061 {
22062 	mblk_t	**mpp;
22063 	mblk_t	*mp;
22064 	ifrt_t	*ifrt;
22065 
22066 	/* Remove from ipif_saved_ire_mp list if it is there */
22067 	mutex_enter(&ipif->ipif_saved_ire_lock);
22068 	for (mpp = &ipif->ipif_saved_ire_mp; *mpp != NULL;
22069 	    mpp = &(*mpp)->b_cont) {
22070 		/*
22071 		 * On a given ipif, the triple of address, gateway and
22072 		 * mask is unique for each saved IRE (in the case of
22073 		 * ordinary interface routes, the gateway address is
22074 		 * all-zeroes).
22075 		 */
22076 		mp = *mpp;
22077 		ifrt = (ifrt_t *)mp->b_rptr;
22078 		if (ifrt->ifrt_addr == ire->ire_addr &&
22079 		    ifrt->ifrt_gateway_addr == ire->ire_gateway_addr &&
22080 		    ifrt->ifrt_mask == ire->ire_mask) {
22081 			*mpp = mp->b_cont;
22082 			ipif->ipif_saved_ire_cnt--;
22083 			freeb(mp);
22084 			break;
22085 		}
22086 	}
22087 	mutex_exit(&ipif->ipif_saved_ire_lock);
22088 }
22089 
22090 
22091 /*
22092  * IP multirouting broadcast routes handling
22093  * Append CGTP broadcast IREs to regular ones created
22094  * at ifconfig time.
22095  */
22096 static void
22097 ip_cgtp_bcast_add(ire_t *ire, ire_t *ire_dst)
22098 {
22099 	ire_t *ire_prim;
22100 
22101 	ASSERT(ire != NULL);
22102 	ASSERT(ire_dst != NULL);
22103 
22104 	ire_prim = ire_ctable_lookup(ire->ire_gateway_addr, 0,
22105 	    IRE_BROADCAST, NULL, NULL, MATCH_IRE_TYPE);
22106 	if (ire_prim != NULL) {
22107 		/*
22108 		 * We are in the special case of broadcasts for
22109 		 * CGTP. We add an IRE_BROADCAST that holds
22110 		 * the RTF_MULTIRT flag, the destination
22111 		 * address of ire_dst and the low level
22112 		 * info of ire_prim. In other words, CGTP
22113 		 * broadcast is added to the redundant ipif.
22114 		 */
22115 		ipif_t *ipif_prim;
22116 		ire_t  *bcast_ire;
22117 
22118 		ipif_prim = ire_prim->ire_ipif;
22119 
22120 		ip2dbg(("ip_cgtp_bcast_add: "
22121 		    "ire_dst %p, ire_prim %p, ipif_prim %p\n",
22122 		    (void *)ire_dst, (void *)ire_prim,
22123 		    (void *)ipif_prim));
22124 
22125 		bcast_ire = ire_create(
22126 		    (uchar_t *)&ire->ire_addr,
22127 		    (uchar_t *)&ip_g_all_ones,
22128 		    (uchar_t *)&ire_dst->ire_src_addr,
22129 		    (uchar_t *)&ire->ire_gateway_addr,
22130 		    NULL,
22131 		    &ipif_prim->ipif_mtu,
22132 		    NULL,
22133 		    ipif_prim->ipif_rq,
22134 		    ipif_prim->ipif_wq,
22135 		    IRE_BROADCAST,
22136 		    ipif_prim->ipif_bcast_mp,
22137 		    ipif_prim,
22138 		    NULL,
22139 		    0,
22140 		    0,
22141 		    0,
22142 		    ire->ire_flags,
22143 		    &ire_uinfo_null);
22144 
22145 		if (bcast_ire != NULL) {
22146 
22147 			if (ire_add(&bcast_ire, NULL, NULL, NULL) == 0) {
22148 				ip2dbg(("ip_cgtp_bcast_add: "
22149 				    "added bcast_ire %p\n",
22150 				    (void *)bcast_ire));
22151 
22152 				ipif_save_ire(bcast_ire->ire_ipif,
22153 				    bcast_ire);
22154 				ire_refrele(bcast_ire);
22155 			}
22156 		}
22157 		ire_refrele(ire_prim);
22158 	}
22159 }
22160 
22161 
22162 /*
22163  * IP multirouting broadcast routes handling
22164  * Remove the broadcast ire
22165  */
22166 static void
22167 ip_cgtp_bcast_delete(ire_t *ire)
22168 {
22169 	ire_t *ire_dst;
22170 
22171 	ASSERT(ire != NULL);
22172 	ire_dst = ire_ctable_lookup(ire->ire_addr, 0, IRE_BROADCAST,
22173 	    NULL, NULL, MATCH_IRE_TYPE);
22174 	if (ire_dst != NULL) {
22175 		ire_t *ire_prim;
22176 
22177 		ire_prim = ire_ctable_lookup(ire->ire_gateway_addr, 0,
22178 		    IRE_BROADCAST, NULL, NULL, MATCH_IRE_TYPE);
22179 		if (ire_prim != NULL) {
22180 			ipif_t *ipif_prim;
22181 			ire_t  *bcast_ire;
22182 
22183 			ipif_prim = ire_prim->ire_ipif;
22184 
22185 			ip2dbg(("ip_cgtp_bcast_delete: "
22186 			    "ire_dst %p, ire_prim %p, ipif_prim %p\n",
22187 			    (void *)ire_dst, (void *)ire_prim,
22188 			    (void *)ipif_prim));
22189 
22190 			bcast_ire = ire_ctable_lookup(ire->ire_addr,
22191 			    ire->ire_gateway_addr,
22192 			    IRE_BROADCAST,
22193 			    ipif_prim,
22194 			    NULL,
22195 			    MATCH_IRE_TYPE | MATCH_IRE_GW | MATCH_IRE_IPIF |
22196 			    MATCH_IRE_MASK);
22197 
22198 			if (bcast_ire != NULL) {
22199 				ip2dbg(("ip_cgtp_bcast_delete: "
22200 				    "looked up bcast_ire %p\n",
22201 				    (void *)bcast_ire));
22202 				ipif_remove_ire(bcast_ire->ire_ipif,
22203 				    bcast_ire);
22204 				ire_delete(bcast_ire);
22205 			}
22206 			ire_refrele(ire_prim);
22207 		}
22208 		ire_refrele(ire_dst);
22209 	}
22210 }
22211 
22212 /*
22213  * IPsec hardware acceleration capabilities related functions.
22214  */
22215 
22216 /*
22217  * Free a per-ill IPsec capabilities structure.
22218  */
22219 static void
22220 ill_ipsec_capab_free(ill_ipsec_capab_t *capab)
22221 {
22222 	if (capab->auth_hw_algs != NULL)
22223 		kmem_free(capab->auth_hw_algs, capab->algs_size);
22224 	if (capab->encr_hw_algs != NULL)
22225 		kmem_free(capab->encr_hw_algs, capab->algs_size);
22226 	if (capab->encr_algparm != NULL)
22227 		kmem_free(capab->encr_algparm, capab->encr_algparm_size);
22228 	kmem_free(capab, sizeof (ill_ipsec_capab_t));
22229 }
22230 
22231 /*
22232  * Allocate a new per-ill IPsec capabilities structure. This structure
22233  * is specific to an IPsec protocol (AH or ESP). It is implemented as
22234  * a bit array which records, for each algorithm, whether that
22235  * algorithm is supported by the ill.
22236  */
22237 static ill_ipsec_capab_t *
22238 ill_ipsec_capab_alloc(void)
22239 {
22240 	ill_ipsec_capab_t *capab;
22241 	uint_t nelems;
22242 
22243 	capab = kmem_zalloc(sizeof (ill_ipsec_capab_t), KM_NOSLEEP);
22244 	if (capab == NULL)
22245 		return (NULL);
22246 
22247 	/* we need one bit per algorithm */
22248 	nelems = MAX_IPSEC_ALGS / BITS(ipsec_capab_elem_t);
22249 	capab->algs_size = nelems * sizeof (ipsec_capab_elem_t);
22250 
22251 	/* allocate memory to store algorithm flags */
22252 	capab->encr_hw_algs = kmem_zalloc(capab->algs_size, KM_NOSLEEP);
22253 	if (capab->encr_hw_algs == NULL)
22254 		goto nomem;
22255 	capab->auth_hw_algs = kmem_zalloc(capab->algs_size, KM_NOSLEEP);
22256 	if (capab->auth_hw_algs == NULL)
22257 		goto nomem;
22258 	/*
22259 	 * Leave encr_algparm NULL for now since we won't need it half
22260 	 * the time
22261 	 */
22262 	return (capab);
22263 
22264 nomem:
22265 	ill_ipsec_capab_free(capab);
22266 	return (NULL);
22267 }
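
/*
 * Illustrative sketch (an assumption about the bitmap layout, not the
 * actual macro definition): with one bit per algorithm, enabling
 * algorithm `algid' would look like
 *
 *	capab->encr_hw_algs[algid / BITS(ipsec_capab_elem_t)] |=
 *	    1 << (algid % BITS(ipsec_capab_elem_t));
 *
 * and IPSEC_ALG_IS_ENABLED() (used in ipsec_capab_match() below) tests
 * such a bit.
 */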
22268 
22269 /*
22270  * Resize capability array.  Since we're exclusive, this is OK.
22271  */
22272 static boolean_t
22273 ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *capab, int algid)
22274 {
22275 	ipsec_capab_algparm_t *nalp, *oalp;
22276 	uint32_t olen, nlen;
22277 
22278 	oalp = capab->encr_algparm;
22279 	olen = capab->encr_algparm_size;
22280 
22281 	if (oalp != NULL) {
22282 		if (algid < capab->encr_algparm_end)
22283 			return (B_TRUE);
22284 	}
22285 
22286 	nlen = (algid + 1) * sizeof (*nalp);
22287 	nalp = kmem_zalloc(nlen, KM_NOSLEEP);
22288 	if (nalp == NULL)
22289 		return (B_FALSE);
22290 
22291 	if (oalp != NULL) {
22292 		bcopy(oalp, nalp, olen);
22293 		kmem_free(oalp, olen);
22294 	}
22295 	capab->encr_algparm = nalp;
22296 	capab->encr_algparm_size = nlen;
22297 	capab->encr_algparm_end = algid + 1;
22298 
22299 	return (B_TRUE);
22300 }
22301 
22302 /*
22303  * Compare the capabilities of the specified ill with the protocol
22304  * and algorithms specified by the SA passed as argument.
22305  * Returns B_TRUE if they match, B_FALSE otherwise.
22306  *
22307  * The ill can be passed as a pointer to it, or by specifying its index
22308  * and whether it is an IPv6 ill (ill_index and ill_isv6 arguments).
22309  *
22310  * Called by ipsec_out_is_accelerated() to decide whether an outbound
22311  * packet is eligible for hardware acceleration, and by
22312  * ill_ipsec_capab_send_all() to decide whether an SA must be sent down
22313  * to a particular ill.
22314  */
22315 boolean_t
22316 ipsec_capab_match(ill_t *ill, uint_t ill_index, boolean_t ill_isv6,
22317     ipsa_t *sa)
22318 {
22319 	boolean_t sa_isv6;
22320 	uint_t algid;
22321 	struct ill_ipsec_capab_s *cpp;
22322 	boolean_t need_refrele = B_FALSE;
22323 
22324 	if (ill == NULL) {
22325 		ill = ill_lookup_on_ifindex(ill_index, ill_isv6, NULL,
22326 		    NULL, NULL, NULL);
22327 		if (ill == NULL) {
22328 			ip0dbg(("ipsec_capab_match: ill doesn't exist\n"));
22329 			return (B_FALSE);
22330 		}
22331 		need_refrele = B_TRUE;
22332 	}
22333 
22334 	/*
22335 	 * Use the address length specified by the SA to determine
22336 	 * if it corresponds to an IPv6 address, and fail the matching
22337 	 * if the isv6 flag passed as argument does not match.
22338 	 * Note: this check is used for SADB capability checking before
22339 	 * sending SA information to an ill.
22340 	 */
22341 	sa_isv6 = (sa->ipsa_addrfam == AF_INET6);
22342 	if (sa_isv6 != ill_isv6)
22343 		/* protocol mismatch */
22344 		goto done;
22345 
22346 	/*
22347 	 * Check if the ill supports the protocol, algorithm(s) and
22348 	 * key size(s) specified by the SA, and get the pointers to
22349 	 * the algorithms supported by the ill.
22350 	 */
22351 	switch (sa->ipsa_type) {
22352 
22353 	case SADB_SATYPE_ESP:
22354 		if (!(ill->ill_capabilities & ILL_CAPAB_ESP))
22355 			/* ill does not support ESP acceleration */
22356 			goto done;
22357 		cpp = ill->ill_ipsec_capab_esp;
22358 		algid = sa->ipsa_auth_alg;
22359 		if (!IPSEC_ALG_IS_ENABLED(algid, cpp->auth_hw_algs))
22360 			goto done;
22361 		algid = sa->ipsa_encr_alg;
22362 		if (!IPSEC_ALG_IS_ENABLED(algid, cpp->encr_hw_algs))
22363 			goto done;
22364 		if (algid < cpp->encr_algparm_end) {
22365 			ipsec_capab_algparm_t *alp = &cpp->encr_algparm[algid];
22366 			if (sa->ipsa_encrkeybits < alp->minkeylen)
22367 				goto done;
22368 			if (sa->ipsa_encrkeybits > alp->maxkeylen)
22369 				goto done;
22370 		}
22371 		break;
22372 
22373 	case SADB_SATYPE_AH:
22374 		if (!(ill->ill_capabilities & ILL_CAPAB_AH))
22375 			/* ill does not support AH acceleration */
22376 			goto done;
22377 		if (!IPSEC_ALG_IS_ENABLED(sa->ipsa_auth_alg,
22378 		    ill->ill_ipsec_capab_ah->auth_hw_algs))
22379 			goto done;
22380 		break;
22381 	}
22382 
22383 	if (need_refrele)
22384 		ill_refrele(ill);
22385 	return (B_TRUE);
22386 done:
22387 	if (need_refrele)
22388 		ill_refrele(ill);
22389 	return (B_FALSE);
22390 }
22391 
22392 
22393 /*
22394  * Add a new ill to the list of IPsec capable ills.
22395  * Called from ill_capability_ipsec_ack() when an ACK was received
22396  * indicating that IPsec hardware processing was enabled for an ill.
22397  *
22398  * ill must point to the ill for which acceleration was enabled.
22399  * dl_cap must be set to DL_CAPAB_IPSEC_AH or DL_CAPAB_IPSEC_ESP.
22400  */
22401 static void
22402 ill_ipsec_capab_add(ill_t *ill, uint_t dl_cap, boolean_t sadb_resync)
22403 {
22404 	ipsec_capab_ill_t **ills, *cur_ill, *new_ill;
22405 	uint_t sa_type;
22406 	uint_t ipproto;
22407 
22408 	ASSERT((dl_cap == DL_CAPAB_IPSEC_AH) ||
22409 	    (dl_cap == DL_CAPAB_IPSEC_ESP));
22410 
22411 	switch (dl_cap) {
22412 	case DL_CAPAB_IPSEC_AH:
22413 		sa_type = SADB_SATYPE_AH;
22414 		ills = &ipsec_capab_ills_ah;
22415 		ipproto = IPPROTO_AH;
22416 		break;
22417 	case DL_CAPAB_IPSEC_ESP:
22418 		sa_type = SADB_SATYPE_ESP;
22419 		ills = &ipsec_capab_ills_esp;
22420 		ipproto = IPPROTO_ESP;
22421 		break;
22422 	}
22423 
22424 	rw_enter(&ipsec_capab_ills_lock, RW_WRITER);
22425 
22426 	/*
22427 	 * Add ill index to list of hardware accelerators. If
22428 	 * already in list, do nothing.
22429 	 */
22430 	for (cur_ill = *ills; cur_ill != NULL &&
22431 	    (cur_ill->ill_index != ill->ill_phyint->phyint_ifindex ||
22432 	    cur_ill->ill_isv6 != ill->ill_isv6); cur_ill = cur_ill->next)
22433 		;
22434 
22435 	if (cur_ill == NULL) {
22436 		/* if this is a new entry for this ill */
22437 		new_ill = kmem_zalloc(sizeof (ipsec_capab_ill_t), KM_NOSLEEP);
22438 		if (new_ill == NULL) {
22439 			rw_exit(&ipsec_capab_ills_lock);
22440 			return;
22441 		}
22442 
22443 		new_ill->ill_index = ill->ill_phyint->phyint_ifindex;
22444 		new_ill->ill_isv6 = ill->ill_isv6;
22445 		new_ill->next = *ills;
22446 		*ills = new_ill;
22447 	} else if (!sadb_resync) {
22448 		/* not resync'ing SADB and an entry exists for this ill */
22449 		rw_exit(&ipsec_capab_ills_lock);
22450 		return;
22451 	}
22452 
22453 	rw_exit(&ipsec_capab_ills_lock);
22454 
22455 	if (ipcl_proto_fanout_v6[ipproto].connf_head != NULL)
22456 		/*
22457 		 * IPsec module for protocol loaded, initiate dump
22458 		 * of the SADB to this ill.
22459 		 */
22460 		sadb_ill_download(ill, sa_type);
22461 }
22462 
22463 /*
22464  * Remove an ill from the list of IPsec capable ills.
22465  */
22466 static void
22467 ill_ipsec_capab_delete(ill_t *ill, uint_t dl_cap)
22468 {
22469 	ipsec_capab_ill_t **ills, *cur_ill, *prev_ill;
22470 
22471 	ASSERT(dl_cap == DL_CAPAB_IPSEC_AH ||
22472 	    dl_cap == DL_CAPAB_IPSEC_ESP);
22473 
22474 	ills = (dl_cap == DL_CAPAB_IPSEC_AH) ? &ipsec_capab_ills_ah :
22475 	    &ipsec_capab_ills_esp;
22476 
22477 	rw_enter(&ipsec_capab_ills_lock, RW_WRITER);
22478 
22479 	prev_ill = NULL;
22480 	for (cur_ill = *ills; cur_ill != NULL && (cur_ill->ill_index !=
22481 	    ill->ill_phyint->phyint_ifindex || cur_ill->ill_isv6 !=
22482 	    ill->ill_isv6); prev_ill = cur_ill, cur_ill = cur_ill->next)
22483 		;
22484 	if (cur_ill == NULL) {
22485 		/* entry not found */
22486 		rw_exit(&ipsec_capab_ills_lock);
22487 		return;
22488 	}
22489 	if (prev_ill == NULL) {
22490 		/* entry at front of list */
22491 		*ills = cur_ill->next;
22492 	} else {
22493 		prev_ill->next = cur_ill->next;
22494 	}
22495 	kmem_free(cur_ill, sizeof (ipsec_capab_ill_t));
22496 	rw_exit(&ipsec_capab_ills_lock);
22497 }
22498 
22499 
22500 /*
22501  * Handle DL_CONTROL_REQ messages that must be sent down to
22502  * an ill while we have exclusive access to it.
22503  */
22504 /* ARGSUSED */
22505 static void
22506 ill_ipsec_capab_send_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
22507 {
22508 	ill_t *ill = (ill_t *)q->q_ptr;
22509 
22510 	ill_dlpi_send(ill, mp);
22511 }
22512 
22513 
22514 /*
22515  * Called by SADB to send a DL_CONTROL_REQ message to every ill
22516  * supporting the specified IPsec protocol acceleration.
22517  * sa_type must be SADB_SATYPE_AH or SADB_SATYPE_ESP.
22518  * We free the mblk and, if sa is non-null, release the held referece.
22519  * We free the mblk and, if sa is non-null, release the held reference.
22520 void
22521 ill_ipsec_capab_send_all(uint_t sa_type, mblk_t *mp, ipsa_t *sa)
22522 {
22523 	ipsec_capab_ill_t *ici, *cur_ici;
22524 	ill_t *ill;
22525 	mblk_t *nmp, *mp_ship_list = NULL, *next_mp;
22526 
22527 	ici = (sa_type == SADB_SATYPE_AH) ? ipsec_capab_ills_ah :
22528 	    ipsec_capab_ills_esp;
22529 
22530 	rw_enter(&ipsec_capab_ills_lock, RW_READER);
22531 
22532 	for (cur_ici = ici; cur_ici != NULL; cur_ici = cur_ici->next) {
22533 		ill = ill_lookup_on_ifindex(cur_ici->ill_index,
22534 		    cur_ici->ill_isv6, NULL, NULL, NULL, NULL);
22535 
22536 		/*
22537 		 * Handle the case where the ill goes away while the SADB is
22538 		 * attempting to send messages.  If it's going away, it's
22539 		 * nuking its shadow SADB, so we don't care..
22540 		 * nuking its shadow SADB, so we don't care.
22541 
22542 		if (ill == NULL)
22543 			continue;
22544 
22545 		if (sa != NULL) {
22546 			/*
22547 			 * Make sure capabilities match before
22548 			 * sending SA to ill.
22549 			 */
22550 			if (!ipsec_capab_match(ill, cur_ici->ill_index,
22551 			    cur_ici->ill_isv6, sa)) {
22552 				ill_refrele(ill);
22553 				continue;
22554 			}
22555 
22556 			mutex_enter(&sa->ipsa_lock);
22557 			sa->ipsa_flags |= IPSA_F_HW;
22558 			mutex_exit(&sa->ipsa_lock);
22559 		}
22560 
22561 		/*
22562 		 * Copy template message, and add it to the front
22563 		 * of the mblk ship list. We want to avoid holding
22564 		 * the ipsec_capab_ills_lock while sending the
22565 		 * message to the ills.
22566 		 *
22567 		 * The b_next and b_prev are temporarily used
22568 		 * to build a list of mblks to be sent down, and to
22569 		 * save the ill to which they must be sent.
22570 		 */
22571 		nmp = copymsg(mp);
22572 		if (nmp == NULL) {
22573 			ill_refrele(ill);
22574 			continue;
22575 		}
22576 		ASSERT(nmp->b_next == NULL && nmp->b_prev == NULL);
22577 		nmp->b_next = mp_ship_list;
22578 		mp_ship_list = nmp;
22579 		nmp->b_prev = (mblk_t *)ill;
22580 	}
22581 
22582 	rw_exit(&ipsec_capab_ills_lock);
22583 
22584 	nmp = mp_ship_list;
22585 	while (nmp != NULL) {
22586 		/* restore the mblk to a sane state */
22587 		next_mp = nmp->b_next;
22588 		nmp->b_next = NULL;
22589 		ill = (ill_t *)nmp->b_prev;
22590 		nmp->b_prev = NULL;
22591 
22592 		/*
22593 		 * Ship the mblk to the ill; this must be done exclusively.
22594 		 * Don't refrele the ill here; qwriter_ip() does the ill_refrele().
22595 		 */
22596 		(void) qwriter_ip(NULL, ill, ill->ill_wq, nmp,
22597 		    ill_ipsec_capab_send_writer, NEW_OP, B_TRUE);
22598 
22599 		nmp = next_mp;
22600 	}
22601 
22602 	if (sa != NULL)
22603 		IPSA_REFRELE(sa);
22604 	freemsg(mp);
22605 }
22606 
22607 
22608 /*
22609  * Derive an interface id from the link layer address.
22610  * Knows about IEEE 802 and IEEE EUI-64 mappings.
22611  */
22612 static boolean_t
22613 ip_ether_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr)
22614 {
22615 	char		*addr;
22616 
22617 	if (phys_length != ETHERADDRL)
22618 		return (B_FALSE);
22619 
22620 	/* Form EUI-64 like address */
22621 	addr = (char *)&v6addr->s6_addr32[2];
22622 	bcopy((char *)phys_addr, addr, 3);
22623 	addr[0] ^= 0x2;		/* Toggle Universal/Local bit */
22624 	addr[3] = (char)0xff;
22625 	addr[4] = (char)0xfe;
22626 	bcopy((char *)phys_addr + 3, addr + 5, 3);
22627 	return (B_TRUE);
22628 }
22629 
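
/*
 * Worked example (illustrative): for a MAC address of 00:0a:95:9d:68:16,
 * the first three octets are copied, the universal/local bit is toggled
 * (00 -> 02), ff:fe is inserted, and the last three octets are appended,
 * yielding the interface id 020a:95ff:fe9d:6816.
 */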
22630 /* ARGSUSED */
22631 static boolean_t
22632 ip_nodef_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr)
22633 {
22634 	return (B_FALSE);
22635 }
22636 
22637 /* ARGSUSED */
22638 static boolean_t
22639 ip_ether_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr,
22640     uint32_t *hw_start, in6_addr_t *v6_extract_mask)
22641 {
22642 	/*
22643 	 * Multicast address mappings used over Ethernet/802.X.
22644 	 * This address is used as a base for mappings.
22645 	 */
22646 	static uint8_t ipv6_g_phys_multi_addr[] = {0x33, 0x33, 0x00,
22647 	    0x00, 0x00, 0x00};
22648 
22649 	/*
22650 	 * Extract low order 32 bits from IPv6 multicast address.
22651 	 * Or that into the link layer address, starting from the
22652 	 * second byte.
22653 	 */
22654 	*hw_start = 2;
22655 	v6_extract_mask->s6_addr32[0] = 0;
22656 	v6_extract_mask->s6_addr32[1] = 0;
22657 	v6_extract_mask->s6_addr32[2] = 0;
22658 	v6_extract_mask->s6_addr32[3] = 0xffffffffU;
22659 	bcopy(ipv6_g_phys_multi_addr, maddr, lla_length);
22660 	return (B_TRUE);
22661 }
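
/*
 * Worked example (illustrative): with the extract mask above, the low
 * 32 bits of a group address such as ff02::1:ff00:1 (0xff000001) are
 * OR'ed into the base 33:33:00:00:00:00 starting at byte 2, giving the
 * link address 33:33:ff:00:00:01.
 */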
22662 
22663 /*
22664  * Indicate by return value whether multicast is supported. If not,
22665  * this code should not touch/change any parameters.
22666  */
22667 /* ARGSUSED */
22668 static boolean_t
22669 ip_ether_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr,
22670     uint32_t *hw_start, ipaddr_t *extract_mask)
22671 {
22672 	/*
22673 	 * Multicast address mappings used over Ethernet/802.X.
22674 	 * This address is used as a base for mappings.
22675 	 */
22676 	static uint8_t ip_g_phys_multi_addr[] = { 0x01, 0x00, 0x5e,
22677 	    0x00, 0x00, 0x00 };
22678 
22679 	if (phys_length != ETHERADDRL)
22680 		return (B_FALSE);
22681 
22682 	*extract_mask = htonl(0x007fffff);
22683 	*hw_start = 2;
22684 	bcopy(ip_g_phys_multi_addr, maddr, ETHERADDRL);
22685 	return (B_TRUE);
22686 }
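
/*
 * Worked example (illustrative): for a group address of 224.1.2.3
 * (0xe0010203), the low 23 bits (0x010203) are OR'ed into the base
 * 01:00:5e:00:00:00 starting at byte 2, giving 01:00:5e:01:02:03.
 */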
22687 
22688 /*
22689  * Derive IPoIB interface id from the link layer address.
22690  */
22691 static boolean_t
22692 ip_ib_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr)
22693 {
22694 	char		*addr;
22695 
22696 	if (phys_length != 20)
22697 		return (B_FALSE);
22698 	addr = (char *)&v6addr->s6_addr32[2];
22699 	bcopy(phys_addr + 12, addr, 8);
22700 	/*
22701 	 * In IBA 1.1 timeframe, some vendors erroneously set the u/l bit
22702 	 * in the globally assigned EUI-64 GUID to 1, in violation of IEEE
22703 	 * rules. In these cases, the IBA considers these GUIDs to be in
22704 	 * "Modified EUI-64" format, and thus toggling the u/l bit is not
22705 	 * required; vendors are required not to assign global EUI-64's
22706 	 * that differ only in u/l bit values, thus guaranteeing uniqueness
22707 	 * of the interface identifier. Whether the GUID is in modified
22708 	 * or proper EUI-64 format, the ipv6 identifier must have the u/l
22709 	 * bit set to 1.
22710 	 */
22711 	addr[0] |= 2;			/* Set Universal/Local bit to 1 */
22712 	return (B_TRUE);
22713 }
22714 
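
/*
 * Worked example (illustrative): if bytes 12-19 of the link layer
 * address carry the port GUID 00:02:c9:02:00:21:6e:31, setting the
 * u/l bit yields the interface id 0202:c902:0021:6e31.
 */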
22715 /*
22716  * Note on mapping from multicast IP addresses to IPoIB multicast link
22717  * addresses. IPoIB multicast link addresses are based on IBA link addresses.
22718  * The format of an IPoIB multicast address is:
22719  *
22720  *  4 byte QPN      Scope Sign.  Pkey
22721  * +--------------------------------------------+
22722  * | 00FFFFFF | FF | 1X | X01B | Pkey | GroupID |
22723  * +--------------------------------------------+
22724  *
22725  * The Scope and Pkey components are properties of the IBA port and
22726  * network interface. They can be ascertained from the broadcast address.
22727  * The Sign. part is the signature, and is 401B for IPv4 and 601B for IPv6.
22728  */
22729 
22730 static boolean_t
22731 ip_ib_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr,
22732     uint32_t *hw_start, in6_addr_t *v6_extract_mask)
22733 {
22734 	/*
22735 	 * Base IPoIB IPv6 multicast address used for mappings.
22736 	 * Does not contain the IBA scope/Pkey values.
22737 	 */
22738 	static uint8_t ipv6_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff,
22739 	    0xff, 0x10, 0x60, 0x1b, 0x00, 0x00, 0x00, 0x00,
22740 	    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
22741 
22742 	/*
22743 	 * Extract low order 80 bits from IPv6 multicast address.
22744 	 * Or that into the link layer address, starting from the
22745 	 * sixth byte.
22746 	 */
22747 	*hw_start = 6;
22748 	bcopy(ipv6_g_phys_ibmulti_addr, maddr, lla_length);
22749 
22750 	/*
22751 	 * Now fill in the IBA scope/Pkey values from the broadcast address.
22752 	 */
22753 	*(maddr + 5) = *(bphys_addr + 5);
22754 	*(maddr + 8) = *(bphys_addr + 8);
22755 	*(maddr + 9) = *(bphys_addr + 9);
22756 
22757 	v6_extract_mask->s6_addr32[0] = 0;
22758 	v6_extract_mask->s6_addr32[1] = htonl(0x0000ffff);
22759 	v6_extract_mask->s6_addr32[2] = 0xffffffffU;
22760 	v6_extract_mask->s6_addr32[3] = 0xffffffffU;
22761 	return (B_TRUE);
22762 }
22763 
22764 static boolean_t
22765 ip_ib_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr,
22766     uint32_t *hw_start, ipaddr_t *extract_mask)
22767 {
22768 	/*
22769 	 * Base IPoIB IPv4 multicast address used for mappings.
22770 	 * Does not contain the IBA scope/Pkey values.
22771 	 */
22772 	static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff,
22773 	    0xff, 0x10, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
22774 	    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
22775 
22776 	if (phys_length != sizeof (ipv4_g_phys_ibmulti_addr))
22777 		return (B_FALSE);
22778 
22779 	/*
22780 	 * Extract low order 28 bits from IPv4 multicast address.
22781 	 * Or that into the link layer address, starting from the
22782 	 * sixteenth byte.
22783 	 */
22784 	*extract_mask = htonl(0x0fffffff);
22785 	*hw_start = 16;
22786 	bcopy(ipv4_g_phys_ibmulti_addr, maddr, phys_length);
22787 
22788 	/*
22789 	 * Now fill in the IBA scope/Pkey values from the broadcast address.
22790 	 */
22791 	*(maddr + 5) = *(bphys_addr + 5);
22792 	*(maddr + 8) = *(bphys_addr + 8);
22793 	*(maddr + 9) = *(bphys_addr + 9);
22794 	return (B_TRUE);
22795 }
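
/*
 * Worked example (illustrative): for 224.1.2.3 (0xe0010203), the low
 * 28 bits (0x00010203) are OR'ed in starting at byte 16, so the last
 * four bytes of the 20-byte link address become 00:01:02:03, while
 * bytes 5, 8 and 9 carry the scope and Pkey from the broadcast address.
 */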
22796 
22797 /*
22798  * Returns B_TRUE if an ipif is present in the given zone, matching some flags
22799  * (typically IPIF_UP). If ipifp is non-null, the held ipif is returned there.
22800  * This works for both IPv4 and IPv6; if the passed-in ill is v6, the ipif with
22801  * the link-local address is preferred.
22802  */
22803 boolean_t
22804 ipif_lookup_zoneid(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp)
22805 {
22806 	ipif_t	*ipif;
22807 	ipif_t	*maybe_ipif = NULL;
22808 
22809 	mutex_enter(&ill->ill_lock);
22810 	if (ill->ill_state_flags & ILL_CONDEMNED) {
22811 		mutex_exit(&ill->ill_lock);
22812 		if (ipifp != NULL)
22813 			*ipifp = NULL;
22814 		return (B_FALSE);
22815 	}
22816 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
22817 		if (!IPIF_CAN_LOOKUP(ipif))
22818 			continue;
22819 		if (zoneid != ALL_ZONES && ipif->ipif_zoneid != zoneid)
22820 			continue;
22821 		if ((ipif->ipif_flags & flags) != flags)
22822 			continue;
22823 
22824 		if (ipifp == NULL) {
22825 			mutex_exit(&ill->ill_lock);
22826 			ASSERT(maybe_ipif == NULL);
22827 			return (B_TRUE);
22828 		}
22829 		if (!ill->ill_isv6 ||
22830 		    IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6src_addr)) {
22831 			ipif_refhold_locked(ipif);
22832 			mutex_exit(&ill->ill_lock);
22833 			*ipifp = ipif;
22834 			return (B_TRUE);
22835 		}
22836 		if (maybe_ipif == NULL)
22837 			maybe_ipif = ipif;
22838 	}
22839 	if (ipifp != NULL) {
22840 		if (maybe_ipif != NULL)
22841 			ipif_refhold_locked(maybe_ipif);
22842 		*ipifp = maybe_ipif;
22843 	}
22844 	mutex_exit(&ill->ill_lock);
22845 	return (maybe_ipif != NULL);
22846 }
22847 
22848 /*
22849  * Same as ipif_lookup_zoneid() but looks at all the ills in the same group.
22850  */
22851 boolean_t
22852 ipif_lookup_zoneid_group(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp)
22853 {
22854 	ill_t *illg;
22855 
22856 	/*
22857 	 * We look at the passed-in ill first without grabbing ill_g_lock.
22858 	 */
22859 	if (ipif_lookup_zoneid(ill, zoneid, flags, ipifp)) {
22860 		return (B_TRUE);
22861 	}
22862 	rw_enter(&ill_g_lock, RW_READER);
22863 	if (ill->ill_group == NULL) {
22864 		/* ill not in a group */
22865 		rw_exit(&ill_g_lock);
22866 		return (B_FALSE);
22867 	}
22868 
22869 	/*
22870 	 * There's no ipif in the zone on ill; however, ill is part of an IPMP
22871 	 * group. We need to look for an ipif in the zone on all the ills in the
22872 	 * group.
22873 	 */
22874 	illg = ill->ill_group->illgrp_ill;
22875 	do {
22876 		/*
22877 		 * We don't call ipif_lookup_zoneid() on ill as we already know
22878 		 * that it's not there.
22879 		 */
22880 		if (illg != ill &&
22881 		    ipif_lookup_zoneid(illg, zoneid, flags, ipifp)) {
22882 			break;
22883 		}
22884 	} while ((illg = illg->ill_group_next) != NULL);
22885 	rw_exit(&ill_g_lock);
22886 	return (illg != NULL);
22887 }
22888 
22889 /*
22890  * Check if this ill is only being used to send ICMP probes for IPMP.
22891  */
22892 boolean_t
22893 ill_is_probeonly(ill_t *ill)
22894 {
22895 	/*
22896 	 * Check if the interface is FAILED, or INACTIVE
22897 	 * Check if the interface is FAILED or INACTIVE.
22898 	if (ill->ill_phyint->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE))
22899 		return (B_TRUE);
22900 
22901 	return (B_FALSE);
22902 }
22903