/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/* Copyright (c) 1990 Mentat Inc. */

/*
 * This file contains the interface control functions for IP.
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/dlpi.h>
#include <sys/stropts.h>
#include <sys/strsun.h>
#include <sys/sysmacros.h>
#include <sys/strsubr.h>
#include <sys/strlog.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/debug.h>
#include <sys/zone.h>
#include <sys/sunldi.h>
#include <sys/file.h>
#include <sys/bitmap.h>
#include <sys/cpuvar.h>
#include <sys/time.h>
#include <sys/ctype.h>
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/isa_defs.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_types.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <sys/sockio.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet/igmp_var.h>
#include <sys/policy.h>
#include <sys/ethernet.h>
#include <sys/callb.h>
#include <sys/md5.h>

#include <inet/common.h>	/* for various inet/mi.h and inet/nd.h needs */
#include <inet/mi.h>
#include <inet/nd.h>
#include <inet/arp.h>
#include <inet/ip_arp.h>
#include <inet/mib2.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/tcp.h>
#include <inet/ip_multi.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ip_if.h>
#include <inet/ip_impl.h>
#include <inet/sctp_ip.h>
#include <inet/ip_netinfo.h>
#include <inet/ilb_ip.h>

#include <netinet/igmp.h>
#include <inet/ip_listutils.h>
#include <inet/ipclassifier.h>
#include <sys/mac_client.h>
#include <sys/dld.h>

#include <sys/systeminfo.h>
#include <sys/bootconf.h>

#include <sys/tsol/tndb.h>
#include <sys/tsol/tnet.h>

/* The character which tells where the ill_name ends */
#define	IPIF_SEPARATOR_CHAR	':'

/* IP ioctl function table entry */
typedef struct ipft_s {
	int	ipft_cmd;
	pfi_t	ipft_pfi;
	int	ipft_min_size;
	int	ipft_flags;
} ipft_t;
#define	IPFT_F_NO_REPLY		0x1	/* IP ioctl does not expect any reply */
#define	IPFT_F_SELF_REPLY	0x2	/* ioctl callee does the ioctl reply */

static int	nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int	nd_ill_forward_set(queue_t *q, mblk_t *mp,
    char *value, caddr_t cp, cred_t *ioc_cr);

static boolean_t ill_is_quiescent(ill_t *);
static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask);
static ip_m_t	*ip_m_lookup(t_uscalar_t mac_type);
static int	ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q,
    mblk_t *mp);
static int	ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp);
static int	ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp,
    int ioccmd, struct linkblk *li);
static ipaddr_t	ip_subnet_mask(ipaddr_t addr, ipif_t **, ip_stack_t *);
static void	ip_wput_ioctl(queue_t *q, mblk_t *mp);
static void	ipsq_flush(ill_t *ill);

static int	ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static void	ipsq_delete(ipsq_t *);

static ipif_t	*ipif_allocate(ill_t *ill, int id, uint_t ire_type,
    boolean_t initialize, boolean_t insert, int *errorp);
static ire_t	**ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep);
static void	ipif_delete_bcast_ires(ipif_t *ipif);
static int	ipif_add_ires_v4(ipif_t *, boolean_t);
static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif,
    boolean_t isv6);
static int	ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp);
static void	ipif_free(ipif_t *ipif);
static void	ipif_free_tail(ipif_t *ipif);
static void	ipif_set_default(ipif_t *ipif);
static int	ipif_set_values(queue_t *q, mblk_t *mp,
    char *interf_name, uint_t *ppa);
static int	ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp,
    queue_t *q);
static ipif_t	*ipif_lookup_on_name(char *name, size_t namelen,
    boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid,
    ip_stack_t *);

static int	ill_alloc_ppa(ill_if_t *, ill_t *);
static void	ill_delete_interface_type(ill_if_t *);
static int	ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q);
static void	ill_dl_down(ill_t *ill);
static void	ill_down(ill_t *ill);
static void	ill_down_ipifs(ill_t *, boolean_t);
static void	ill_free_mib(ill_t *ill);
static void	ill_glist_delete(ill_t *);
static void	ill_phyint_reinit(ill_t *ill);
static void	ill_set_nce_router_flags(ill_t *, boolean_t);
static void	ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *);
static void	ill_replumb_tail(ipsq_t *, queue_t *, mblk_t *, void *);

static ip_v6intfid_func_t ip_ether_v6intfid, ip_ib_v6intfid;
static ip_v6intfid_func_t ip_ipv4_v6intfid, ip_ipv6_v6intfid;
static ip_v6intfid_func_t ip_ipmp_v6intfid, ip_nodef_v6intfid;
static ip_v6intfid_func_t ip_ipv4_v6destintfid, ip_ipv6_v6destintfid;
static ip_v4mapinfo_func_t ip_ether_v4_mapping;
static ip_v6mapinfo_func_t ip_ether_v6_mapping;
static ip_v4mapinfo_func_t ip_ib_v4_mapping;
static ip_v6mapinfo_func_t ip_ib_v6_mapping;
static ip_v4mapinfo_func_t ip_mbcast_mapping;
static void	ip_cgtp_bcast_add(ire_t *, ip_stack_t *);
static void	ip_cgtp_bcast_delete(ire_t *, ip_stack_t *);
static void	phyint_free(phyint_t *);

static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_vrrp_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_hcksum_reset_fill(ill_t *, mblk_t *);
static void ill_capability_zerocopy_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void ill_capability_zerocopy_reset_fill(ill_t *, mblk_t *);
static void ill_capability_dld_reset_fill(ill_t *, mblk_t *);
static void ill_capability_dld_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void ill_capability_dld_enable(ill_t *);
static void ill_capability_ack_thr(void *);
static void ill_capability_lso_enable(ill_t *);

static ill_t	*ill_prev_usesrc(ill_t *);
static int	ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t);
static void	ill_disband_usesrc_group(ill_t *);
static void	ip_sioctl_garp_reply(mblk_t *, ill_t *, void *, int);

#ifdef DEBUG
static void	ill_trace_cleanup(const ill_t *);
static void	ipif_trace_cleanup(const ipif_t *);
#endif

/*
 * if we go over the memory footprint limit more than once in this msec
 * interval, we'll start pruning aggressively.
 */
int ip_min_frag_prune_time = 0;

static ipft_t	ip_ioctl_ftbl[] = {
	{ IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 },
	{ IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t),
	    IPFT_F_NO_REPLY },
	{ IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY },
	{ 0 }
};

/* Simple ICMP IP Header Template */
static ipha_t icmp_ipha = {
	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
};

static uchar_t	ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };

static ip_m_t	ip_m_tbl[] = {
	{ DL_ETHER, IFT_ETHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_CSMACD, IFT_ISO88023, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_TPB, IFT_ISO88024, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_TPR, IFT_ISO88025, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_FDDI, IFT_FDDI, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_IB, IFT_IB, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ib_v4_mapping, ip_ib_v6_mapping, ip_ib_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_IPV4, IFT_IPV4, IPPROTO_ENCAP, IPPROTO_IPV6,
	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid,
	    ip_ipv4_v6destintfid },
	{ DL_IPV6, IFT_IPV6, IPPROTO_ENCAP, IPPROTO_IPV6,
	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv6_v6intfid,
	    ip_ipv6_v6destintfid },
	{ DL_6TO4, IFT_6TO4, IPPROTO_ENCAP, IPPROTO_IPV6,
	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid,
	    ip_nodef_v6intfid },
	{ SUNW_DL_VNI, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    NULL, NULL, ip_nodef_v6intfid, ip_nodef_v6intfid },
	{ SUNW_DL_IPMP, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    NULL, NULL, ip_ipmp_v6intfid, ip_nodef_v6intfid },
	{ DL_OTHER, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid }
};

static ill_t	ill_null;		/* Empty ILL for init. */
char	ipif_loopback_name[] = "lo0";
static char *ipv4_forward_suffix = ":ip_forwarding";
static char *ipv6_forward_suffix = ":ip6_forwarding";
static sin6_t	sin6_null;	/* Zero address for quick clears */
static sin_t	sin_null;	/* Zero address for quick clears */

/* When set search for unused ipif_seqid */
static ipif_t	ipif_zero;

/*
 * ppa arena is created after these many
 * interfaces have been plumbed.
 */
uint_t	ill_no_arena = 12;	/* Settable in /etc/system */

/*
 * Allocate per-interface mibs.
 * Returns true if ok. False otherwise.
 * ipsq may not yet be allocated (loopback case).
 */
static boolean_t
ill_allocate_mibs(ill_t *ill)
{
	/* Already allocated? */
	if (ill->ill_ip_mib != NULL) {
		if (ill->ill_isv6)
			ASSERT(ill->ill_icmp6_mib != NULL);
		return (B_TRUE);
	}

	ill->ill_ip_mib = kmem_zalloc(sizeof (*ill->ill_ip_mib),
	    KM_NOSLEEP);
	if (ill->ill_ip_mib == NULL) {
		return (B_FALSE);
	}

	/* Setup static information */
	SET_MIB(ill->ill_ip_mib->ipIfStatsEntrySize,
	    sizeof (mib2_ipIfStatsEntry_t));
	if (ill->ill_isv6) {
		ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6;
		SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
		    sizeof (mib2_ipv6AddrEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
		    sizeof (mib2_ipv6RouteEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
		    sizeof (mib2_ipv6NetToMediaEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
		    sizeof (ipv6_member_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
		    sizeof (ipv6_grpsrc_t));
	} else {
		ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4;
		SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
		    sizeof (mib2_ipAddrEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
		    sizeof (mib2_ipRouteEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
		    sizeof (mib2_ipNetToMediaEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
		    sizeof (ip_member_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
		    sizeof (ip_grpsrc_t));

		/*
		 * For a v4 ill, we are done at this point, because per-ill
		 * icmp mibs are only used for v6.
		 */
		return (B_TRUE);
	}

	ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib),
	    KM_NOSLEEP);
	if (ill->ill_icmp6_mib == NULL) {
		kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
		ill->ill_ip_mib = NULL;
		return (B_FALSE);
	}
	/* static icmp info */
	ill->ill_icmp6_mib->ipv6IfIcmpEntrySize =
	    sizeof (mib2_ipv6IfIcmpEntry_t);
	/*
	 * The ipIfStatsIfindex and ipv6IfIcmpIndex will be assigned later
	 * after the phyint merge occurs in ipif_set_values -> ill_glist_insert
	 * -> ill_phyint_reinit
	 */
	return (B_TRUE);
}

/*
 * Completely vaporize a lower level tap and all associated interfaces.
 * ill_delete is called only out of ip_close when the device control
 * stream is being closed.
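 * ill_delete() only begins the teardown; ill_delete_tail(), below,
 * completes it once all references to the closing ill have drained.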
 */
void
ill_delete(ill_t *ill)
{
	ipif_t	*ipif;
	ill_t	*prev_ill;
	ip_stack_t	*ipst = ill->ill_ipst;

	/*
	 * ill_delete may be forcibly entering the ipsq. The previous
	 * ioctl may not have completed and may need to be aborted.
	 * ipsq_flush takes care of it. If we don't need to enter the
	 * ipsq forcibly, the 2nd invocation of ipsq_flush in
	 * ill_delete_tail is sufficient.
	 */
	ipsq_flush(ill);

	/*
	 * Nuke all interfaces. ipif_free will take down the interface,
	 * remove it from the list, and free the data structure.
	 * Walk down the ipif list and remove the logical interfaces
	 * first before removing the main ipif. We can't unplumb
	 * zeroth interface first in the case of IPv6 as update_conn_ill
	 * -> ip_ll_multireq de-references ill_ipif for checking
	 * POINTOPOINT.
	 *
	 * If ill_ipif was not properly initialized (i.e., low on memory),
	 * then there are no interfaces to clean up. In this case just clean
	 * up the ill.
	 */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		ipif_free(ipif);

	/*
	 * Clean out all the nce_t entries that depend on this
	 * ill for the ill_phys_addr.
	 */
	nce_flush(ill, B_TRUE);

	/* Clean up msgs on pending upcalls for mrouted */
	reset_mrt_ill(ill);

	update_conn_ill(ill, ipst);

	/*
	 * Remove multicast references added as a result of calls to
	 * ip_join_allmulti().
	 */
	ip_purge_allmulti(ill);

	/*
	 * If the ill being deleted is under IPMP, boot it out of the illgrp.
	 */
	if (IS_UNDER_IPMP(ill))
		ipmp_ill_leave_illgrp(ill);

	/*
	 * ill_down will arrange to blow off any IRE's dependent on this
	 * ILL, and shut down fragmentation reassembly.
	 */
	ill_down(ill);

	/* Let SCTP know, so that it can remove this from its list. */
	sctp_update_ill(ill, SCTP_ILL_REMOVE);

	/*
	 * Walk all CONNs that can have a reference on an ire or nce for this
	 * ill (we actually walk all that now have stale references).
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst);

	/* With IPv6 we have dce_ifindex. Cleanup for neatness */
	if (ill->ill_isv6)
		dce_cleanup(ill->ill_phyint->phyint_ifindex, ipst);

	/*
	 * If an address on this ILL is being used as a source address then
	 * clear out the pointers in other ILLs that point to this ILL.
	 */
	rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
	if (ill->ill_usesrc_grp_next != NULL) {
		if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? */
			ill_disband_usesrc_group(ill);
		} else {	/* consumer of the usesrc ILL */
			prev_ill = ill_prev_usesrc(ill);
			prev_ill->ill_usesrc_grp_next =
			    ill->ill_usesrc_grp_next;
		}
	}
	rw_exit(&ipst->ips_ill_g_usesrc_lock);
}

static void
ipif_non_duplicate(ipif_t *ipif)
{
	ill_t *ill = ipif->ipif_ill;

	mutex_enter(&ill->ill_lock);
	if (ipif->ipif_flags & IPIF_DUPLICATE) {
		ipif->ipif_flags &= ~IPIF_DUPLICATE;
		ASSERT(ill->ill_ipif_dup_count > 0);
		ill->ill_ipif_dup_count--;
	}
	mutex_exit(&ill->ill_lock);
}

/*
 * ill_delete_tail is called from ip_modclose after all references
 * to the closing ill are gone.
 * The wait is done in ip_modclose.
 */
void
ill_delete_tail(ill_t *ill)
{
	mblk_t	**mpp;
	ipif_t	*ipif;
	ip_stack_t *ipst = ill->ill_ipst;

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		ipif_non_duplicate(ipif);
		(void) ipif_down_tail(ipif);
	}

	ASSERT(ill->ill_ipif_dup_count == 0);

	/*
	 * If polling capability is enabled (which signifies direct
	 * upcall into IP and driver has ill saved as a handle),
	 * we need to make sure that unbind has completed before we
	 * let the ill disappear and driver no longer has any reference
	 * to this ill.
	 */
	mutex_enter(&ill->ill_lock);
	while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS)
		cv_wait(&ill->ill_cv, &ill->ill_lock);
	mutex_exit(&ill->ill_lock);
	ASSERT(!(ill->ill_capabilities &
	    (ILL_CAPAB_DLD | ILL_CAPAB_DLD_POLL | ILL_CAPAB_DLD_DIRECT)));

	if (ill->ill_net_type != IRE_LOOPBACK)
		qprocsoff(ill->ill_rq);

	/*
	 * We do an ipsq_flush once again now. New messages could have
	 * landed up from below (M_ERROR or M_HANGUP). Similarly ioctls
	 * could also have landed up if an ioctl thread had looked up
	 * the ill before we set the ILL_CONDEMNED flag, but not yet
	 * enqueued the ioctl when we did the ipsq_flush last time.
	 */
	ipsq_flush(ill);

	/*
	 * Free capabilities.
	 */
	if (ill->ill_hcksum_capab != NULL) {
		kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t));
		ill->ill_hcksum_capab = NULL;
	}

	if (ill->ill_zerocopy_capab != NULL) {
		kmem_free(ill->ill_zerocopy_capab,
		    sizeof (ill_zerocopy_capab_t));
		ill->ill_zerocopy_capab = NULL;
	}

	if (ill->ill_lso_capab != NULL) {
		kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t));
		ill->ill_lso_capab = NULL;
	}

	if (ill->ill_dld_capab != NULL) {
		kmem_free(ill->ill_dld_capab, sizeof (ill_dld_capab_t));
		ill->ill_dld_capab = NULL;
	}

	while (ill->ill_ipif != NULL)
		ipif_free_tail(ill->ill_ipif);

	/*
	 * We have removed all references to ilm from conn and the ones joined
	 * within the kernel.
	 *
	 * We don't walk conns, mrts and ires because
	 *
	 * 1) update_conn_ill and reset_mrt_ill clean up conns and mrts.
	 * 2) ill_down -> ill_downi walks all the ires and cleans up
	 *    ill references.
	 */

	/*
	 * If this ill is an IPMP meta-interface, blow away the illgrp. This
	 * is safe to do because the illgrp has already been unlinked from the
	 * group by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find it.
	 */
	if (IS_IPMP(ill)) {
		ipmp_illgrp_destroy(ill->ill_grp);
		ill->ill_grp = NULL;
	}

	/*
	 * Take us out of the list of ILLs. ill_glist_delete -> phyint_free
	 * could free the phyint. No more reference to the phyint after this
	 * point.
	 */
	(void) ill_glist_delete(ill);

	rw_enter(&ipst->ips_ip_g_nd_lock, RW_WRITER);
	if (ill->ill_ndd_name != NULL)
		nd_unload(&ipst->ips_ip_g_nd, ill->ill_ndd_name);
	rw_exit(&ipst->ips_ip_g_nd_lock);

	if (ill->ill_frag_ptr != NULL) {
		uint_t count;

		for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
			mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock);
		}
		mi_free(ill->ill_frag_ptr);
		ill->ill_frag_ptr = NULL;
		ill->ill_frag_hash_tbl = NULL;
	}

	freemsg(ill->ill_nd_lla_mp);
	/*
	 * Free all retained control messages.
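	 * ill_first_mp_to_free through ill_last_mp_to_free are taken to
	 * bound a contiguous run of mblk list-head pointers within the
	 * ill_t (an assumption about the ill_t layout implied by the loop
	 * below), so a single pointer can walk them all.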
	 */
	mpp = &ill->ill_first_mp_to_free;
	do {
		while (mpp[0]) {
			mblk_t	*mp;
			mblk_t	*mp1;

			mp = mpp[0];
			mpp[0] = mp->b_next;
			for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) {
				mp1->b_next = NULL;
				mp1->b_prev = NULL;
			}
			freemsg(mp);
		}
	} while (mpp++ != &ill->ill_last_mp_to_free);

	ill_free_mib(ill);

#ifdef DEBUG
	ill_trace_cleanup(ill);
#endif

	/* The default multicast interface might have changed */
	ire_increment_multicast_generation(ipst, ill->ill_isv6);

	/* Drop refcnt here */
	netstack_rele(ill->ill_ipst->ips_netstack);
	ill->ill_ipst = NULL;
}

static void
ill_free_mib(ill_t *ill)
{
	ip_stack_t *ipst = ill->ill_ipst;

	/*
	 * MIB statistics must not be lost, so when an interface
	 * goes away the counter values will be added to the global
	 * MIBs.
	 */
	if (ill->ill_ip_mib != NULL) {
		if (ill->ill_isv6) {
			ip_mib2_add_ip_stats(&ipst->ips_ip6_mib,
			    ill->ill_ip_mib);
		} else {
			ip_mib2_add_ip_stats(&ipst->ips_ip_mib,
			    ill->ill_ip_mib);
		}

		kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
		ill->ill_ip_mib = NULL;
	}
	if (ill->ill_icmp6_mib != NULL) {
		ip_mib2_add_icmp6_stats(&ipst->ips_icmp6_mib,
		    ill->ill_icmp6_mib);
		kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib));
		ill->ill_icmp6_mib = NULL;
	}
}

/*
 * Concatenate together a physical address and a sap.
 *
 * Sap_lengths are interpreted as follows:
 *   sap_length == 0	==>	no sap
 *   sap_length > 0	==>	sap is at the head of the dlpi address
 *   sap_length < 0	==>	sap is at the tail of the dlpi address
 */
static void
ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length,
    t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst)
{
	uint16_t sap_addr = (uint16_t)sap_src;

	if (sap_length == 0) {
		if (phys_src == NULL)
			bzero(dst, phys_length);
		else
			bcopy(phys_src, dst, phys_length);
	} else if (sap_length < 0) {
		if (phys_src == NULL)
			bzero(dst, phys_length);
		else
			bcopy(phys_src, dst, phys_length);
		bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr));
	} else {
		bcopy(&sap_addr, dst, sizeof (sap_addr));
		if (phys_src == NULL)
			bzero((char *)dst + sap_length, phys_length);
		else
			bcopy(phys_src, (char *)dst + sap_length, phys_length);
	}
}

/*
 * Generate a dl_unitdata_req mblk for the device and address given.
 * addr_length is the length of the physical portion of the address.
 * If addr is NULL include an all zero address of the specified length.
 * TRUE? In any case, addr_length is taken to be the entire length of the
 * dlpi address, including the absolute value of sap_length.
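 *
 * For illustration (an example, not taken from this file): a driver with
 * a 6-byte physical address that reports a sap_length of -2 would yield a
 * dlpi address laid out as <6-byte phys addr><2-byte sap>, while a
 * sap_length of 2 would yield <2-byte sap><6-byte phys addr>. The actual
 * lengths come from the driver's DL_INFO_ACK.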
 */
mblk_t *
ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap,
    t_scalar_t sap_length)
{
	dl_unitdata_req_t *dlur;
	mblk_t	*mp;
	t_scalar_t	abs_sap_length;		/* absolute value */

	abs_sap_length = ABS(sap_length);
	mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length,
	    DL_UNITDATA_REQ);
	if (mp == NULL)
		return (NULL);
	dlur = (dl_unitdata_req_t *)mp->b_rptr;
	/* HACK: accommodate incompatible DLPI drivers */
	if (addr_length == 8)
		addr_length = 6;
	dlur->dl_dest_addr_length = addr_length + abs_sap_length;
	dlur->dl_dest_addr_offset = sizeof (*dlur);
	dlur->dl_priority.dl_min = 0;
	dlur->dl_priority.dl_max = 0;
	ill_dlur_copy_address(addr, addr_length, sap, sap_length,
	    (uchar_t *)&dlur[1]);
	return (mp);
}

/*
 * Add the pending mp to the list. There can be only 1 pending mp
 * in the list. Any exclusive ioctl that needs to wait for a response
 * from another module or driver needs to use this function to set
 * the ipx_pending_mp to the ioctl mblk and wait for the response from
 * the other module/driver. This is also used while waiting for the
 * ipif/ill/ire refcnts to drop to zero in bringing down an ipif.
 */
boolean_t
ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp,
    int waitfor)
{
	ipxop_t	*ipx = ipif->ipif_ill->ill_phyint->phyint_ipsq->ipsq_xop;

	ASSERT(IAM_WRITER_IPIF(ipif));
	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
	ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
	ASSERT(ipx->ipx_pending_mp == NULL);
	/*
	 * The caller may be using a different ipif than the one passed into
	 * ipsq_current_start() (e.g., suppose an ioctl that came in on the V4
	 * ill needs to wait for the V6 ill to quiesce). So we can't ASSERT
	 * that `ipx_current_ipif == ipif'.
	 */
	ASSERT(ipx->ipx_current_ipif != NULL);

	/*
	 * M_IOCDATA from ioctls, M_ERROR/M_HANGUP/M_PROTO/M_PCPROTO from the
	 * driver.
	 */
	ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_ERROR) ||
	    (DB_TYPE(add_mp) == M_HANGUP) || (DB_TYPE(add_mp) == M_PROTO) ||
	    (DB_TYPE(add_mp) == M_PCPROTO));

	if (connp != NULL) {
		ASSERT(MUTEX_HELD(&connp->conn_lock));
		/*
		 * Return error if the conn has started closing. The conn
		 * could have finished cleaning up the pending mp list.
		 * If so we should not add another mp to the list negating
		 * the cleanup.
		 */
		if (connp->conn_state_flags & CONN_CLOSING)
			return (B_FALSE);
	}
	mutex_enter(&ipx->ipx_lock);
	ipx->ipx_pending_ipif = ipif;
	/*
	 * Note down the queue in b_queue. This will be returned by
	 * ipsq_pending_mp_get. The caller will then use these values to
	 * restart the processing.
	 */
	add_mp->b_next = NULL;
	add_mp->b_queue = q;
	ipx->ipx_pending_mp = add_mp;
	ipx->ipx_waitfor = waitfor;
	mutex_exit(&ipx->ipx_lock);

	if (connp != NULL)
		connp->conn_oper_pending_ill = ipif->ipif_ill;

	return (B_TRUE);
}

/*
 * Retrieve the ipx_pending_mp and return it. There can be only 1 mp
 * queued in the list.
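 * The conn that originated the ioctl, if any, is returned through *connpp;
 * it is recovered from the queue that ipsq_pending_mp_add() stashed in
 * b_queue, so the caller can resume ioctl processing on that stream.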
 */
mblk_t *
ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp)
{
	mblk_t	*curr = NULL;
	ipxop_t	*ipx = ipsq->ipsq_xop;

	*connpp = NULL;
	mutex_enter(&ipx->ipx_lock);
	if (ipx->ipx_pending_mp == NULL) {
		mutex_exit(&ipx->ipx_lock);
		return (NULL);
	}

	/* There can be only 1 such excl message */
	curr = ipx->ipx_pending_mp;
	ASSERT(curr->b_next == NULL);
	ipx->ipx_pending_ipif = NULL;
	ipx->ipx_pending_mp = NULL;
	ipx->ipx_waitfor = 0;
	mutex_exit(&ipx->ipx_lock);

	if (CONN_Q(curr->b_queue)) {
		/*
		 * This mp did a refhold on the conn at the start of the ioctl.
		 * So we can safely return a pointer to the conn to the caller.
		 */
		*connpp = Q_TO_CONN(curr->b_queue);
	} else {
		*connpp = NULL;
	}
	curr->b_next = NULL;
	curr->b_prev = NULL;
	return (curr);
}

/*
 * Cleanup the ioctl mp queued in ipx_pending_mp
 * - Called in the ill_delete path
 * - Called in the M_ERROR or M_HANGUP path on the ill.
 * - Called in the conn close path.
 */
boolean_t
ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp)
{
	mblk_t	*mp;
	ipxop_t	*ipx;
	queue_t	*q;
	ipif_t	*ipif;
	int	cmd;

	ASSERT(IAM_WRITER_ILL(ill));
	ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop;

	/*
	 * If connp is null, unconditionally clean up the ipx_pending_mp.
	 * This happens in M_ERROR/M_HANGUP. We need to abort the current ioctl
	 * even if it is meant for another ill, since we have to enqueue
	 * a new mp now in ipx_pending_mp to complete the ipif_down.
	 * If connp is non-null we are called from the conn close path.
	 */
	mutex_enter(&ipx->ipx_lock);
	mp = ipx->ipx_pending_mp;
	if (mp == NULL || (connp != NULL &&
	    mp->b_queue != CONNP_TO_WQ(connp))) {
		mutex_exit(&ipx->ipx_lock);
		return (B_FALSE);
	}
	/* Now remove from the ipx_pending_mp */
	ipx->ipx_pending_mp = NULL;
	q = mp->b_queue;
	mp->b_next = NULL;
	mp->b_prev = NULL;
	mp->b_queue = NULL;

	ipif = ipx->ipx_pending_ipif;
	ipx->ipx_pending_ipif = NULL;
	ipx->ipx_waitfor = 0;
	ipx->ipx_current_ipif = NULL;
	cmd = ipx->ipx_current_ioctl;
	ipx->ipx_current_ioctl = 0;
	ipx->ipx_current_done = B_TRUE;
	mutex_exit(&ipx->ipx_lock);

	if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) {
		DTRACE_PROBE4(ipif__ioctl,
		    char *, "ipsq_pending_mp_cleanup",
		    int, cmd, ill_t *, ipif == NULL ? NULL : ipif->ipif_ill,
		    ipif_t *, ipif);
		if (connp == NULL) {
			ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL);
		} else {
			ip_ioctl_finish(q, mp, ENXIO, CONN_CLOSE, NULL);
			mutex_enter(&ipif->ipif_ill->ill_lock);
			ipif->ipif_state_flags &= ~IPIF_CHANGING;
			mutex_exit(&ipif->ipif_ill->ill_lock);
		}
	} else {
		/*
		 * IP-MT XXX In the case of TLI/XTI bind / optmgmt this can't
		 * be just inet_freemsg. We have to restart it;
		 * otherwise the thread will be stuck.
		 */
		inet_freemsg(mp);
	}
	return (B_TRUE);
}

/*
 * Called in the conn close path and ill delete path
 */
static void
ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp)
{
	ipsq_t	*ipsq;
	mblk_t	*prev;
	mblk_t	*curr;
	mblk_t	*next;
	queue_t	*q;
	mblk_t	*tmp_list = NULL;

	ASSERT(IAM_WRITER_ILL(ill));
	if (connp != NULL)
		q = CONNP_TO_WQ(connp);
	else
		q = ill->ill_wq;

	ipsq = ill->ill_phyint->phyint_ipsq;
	/*
	 * Cleanup the ioctl mps queued in ipsq_xopq_pending_mp if any.
	 * In the case of ioctl from a conn, there can be only 1 mp
	 * queued on the ipsq. If an ill is being unplumbed, only messages
	 * related to this ill are flushed, like M_ERROR or M_HANGUP message.
	 * ioctls meant for this ill from conns are not flushed. They will
	 * be processed during ipsq_exit and will not find the ill and will
	 * return error.
	 */
	mutex_enter(&ipsq->ipsq_lock);
	for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL;
	    curr = next) {
		next = curr->b_next;
		if (curr->b_queue == q || curr->b_queue == RD(q)) {
			/* Unlink the mblk from the pending mp list */
			if (prev != NULL) {
				prev->b_next = curr->b_next;
			} else {
				ASSERT(ipsq->ipsq_xopq_mphead == curr);
				ipsq->ipsq_xopq_mphead = curr->b_next;
			}
			if (ipsq->ipsq_xopq_mptail == curr)
				ipsq->ipsq_xopq_mptail = prev;
			/*
			 * Create a temporary list and release the ipsq lock.
			 * New elements are added to the head of the tmp_list.
			 */
			curr->b_next = tmp_list;
			tmp_list = curr;
		} else {
			prev = curr;
		}
	}
	mutex_exit(&ipsq->ipsq_lock);

	while (tmp_list != NULL) {
		curr = tmp_list;
		tmp_list = curr->b_next;
		curr->b_next = NULL;
		curr->b_prev = NULL;
		curr->b_queue = NULL;
		if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) {
			DTRACE_PROBE4(ipif__ioctl,
			    char *, "ipsq_xopq_mp_cleanup",
			    int, 0, ill_t *, NULL, ipif_t *, NULL);
			ip_ioctl_finish(q, curr, ENXIO, connp != NULL ?
			    CONN_CLOSE : NO_COPYOUT, NULL);
		} else {
			/*
			 * IP-MT XXX In the case of TLI/XTI bind / optmgmt
			 * this can't be just inet_freemsg. We have to
			 * restart it; otherwise the thread will be stuck.
			 */
			inet_freemsg(curr);
		}
	}
}

/*
 * This conn has started closing. Cleanup any pending ioctl from this conn.
 * STREAMS ensures that there can be at most 1 ioctl pending on a stream.
 */
void
conn_ioctl_cleanup(conn_t *connp)
{
	ipsq_t	*ipsq;
	ill_t	*ill;
	boolean_t refheld;

	/*
	 * Is any exclusive ioctl pending? If so, clean it up. If the
	 * ioctl has not yet started, the mp is pending in the list headed by
	 * ipsq_xopq_head. If the ioctl has started, the mp could be present in
	 * ipx_pending_mp. If the ioctl timed out in the streamhead but
	 * is currently executing now, the mp is not queued anywhere, but
	 * conn_oper_pending_ill is null. The conn close will wait
	 * till the conn_ref drops to zero.
	 */
	mutex_enter(&connp->conn_lock);
	ill = connp->conn_oper_pending_ill;
	if (ill == NULL) {
		mutex_exit(&connp->conn_lock);
		return;
	}

	/*
	 * We may not be able to refhold the ill if the ill/ipif
	 * is changing. But we need to make sure that the ill will
	 * not vanish. So we just bump up the ill_waiter count.
	 */
	refheld = ill_waiter_inc(ill);
	mutex_exit(&connp->conn_lock);
	if (refheld) {
		if (ipsq_enter(ill, B_TRUE, NEW_OP)) {
			ill_waiter_dcr(ill);
			/*
			 * Check whether this ioctl has started and is
			 * pending. If it is not found there then check
			 * whether this ioctl has not even started and is in
			 * the ipsq_xopq list.
			 */
			if (!ipsq_pending_mp_cleanup(ill, connp))
				ipsq_xopq_mp_cleanup(ill, connp);
			ipsq = ill->ill_phyint->phyint_ipsq;
			ipsq_exit(ipsq);
			return;
		}
	}

	/*
	 * The ill is also closing and we could not bump up the
	 * ill_waiter_count or we could not enter the ipsq. Leave
	 * the cleanup to ill_delete.
	 */
	mutex_enter(&connp->conn_lock);
	while (connp->conn_oper_pending_ill != NULL)
		cv_wait(&connp->conn_refcv, &connp->conn_lock);
	mutex_exit(&connp->conn_lock);
	if (refheld)
		ill_waiter_dcr(ill);
}

/*
 * ipcl_walk function for cleaning up conn_*_ill fields.
 * Note that we leave ixa_multicast_ifindex, conn_incoming_ifindex, and
 * conn_bound_if in place. We prefer dropping
 * packets instead of sending them out the wrong interface, or accepting
 * packets from the wrong ifindex.
 */
static void
conn_cleanup_ill(conn_t *connp, caddr_t arg)
{
	ill_t	*ill = (ill_t *)arg;

	mutex_enter(&connp->conn_lock);
	if (connp->conn_dhcpinit_ill == ill) {
		connp->conn_dhcpinit_ill = NULL;
		ASSERT(ill->ill_dhcpinit != 0);
		atomic_dec_32(&ill->ill_dhcpinit);
		ill_set_inputfn(ill);
	}
	mutex_exit(&connp->conn_lock);
}

static int
ill_down_ipifs_tail(ill_t *ill)
{
	ipif_t	*ipif;
	int	err;

	ASSERT(IAM_WRITER_ILL(ill));
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		ipif_non_duplicate(ipif);
		/*
		 * ipif_down_tail will call arp_ll_down on the last ipif
		 * and typically return EINPROGRESS when the DL_UNBIND is sent.
		 */
		if ((err = ipif_down_tail(ipif)) != 0)
			return (err);
	}
	return (0);
}

/* ARGSUSED */
void
ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
	ASSERT(IAM_WRITER_IPSQ(ipsq));
	(void) ill_down_ipifs_tail(q->q_ptr);
	freemsg(mp);
	ipsq_current_finish(ipsq);
}

/*
 * ill_down_start is called when we want to down this ill and bring it up
 * again. It is called when we receive an M_ERROR / M_HANGUP. In this case
 * we shut down all interfaces, but don't tear down any plumbing.
 */
boolean_t
ill_down_start(queue_t *q, mblk_t *mp)
{
	ill_t	*ill = q->q_ptr;
	ipif_t	*ipif;

	ASSERT(IAM_WRITER_ILL(ill));
	mutex_enter(&ill->ill_lock);
	ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS;
	/* no more nce addition allowed */
	mutex_exit(&ill->ill_lock);

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		(void) ipif_down(ipif, NULL, NULL);

	ill_down(ill);

	/*
	 * Walk all CONNs that can have a reference on an ire or nce for this
	 * ill (we actually walk all that now have stale references).
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ill->ill_ipst);

	/* With IPv6 we have dce_ifindex. Cleanup for neatness */
	if (ill->ill_isv6)
		dce_cleanup(ill->ill_phyint->phyint_ifindex, ill->ill_ipst);

	(void) ipsq_pending_mp_cleanup(ill, NULL);

	ipsq_current_start(ill->ill_phyint->phyint_ipsq, ill->ill_ipif, 0);

	/*
	 * Atomically test and add the pending mp if references are active.
	 */
	mutex_enter(&ill->ill_lock);
	if (!ill_is_quiescent(ill)) {
		/* call cannot fail since `conn_t *' argument is NULL */
		(void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
		    mp, ILL_DOWN);
		mutex_exit(&ill->ill_lock);
		return (B_FALSE);
	}
	mutex_exit(&ill->ill_lock);
	return (B_TRUE);
}

static void
ill_down(ill_t *ill)
{
	mblk_t	*mp;
	ip_stack_t	*ipst = ill->ill_ipst;

	/*
	 * Blow off any IREs dependent on this ILL.
	 * The caller needs to handle conn_ixa_cleanup
	 */
	ill_delete_ires(ill);

	ire_walk_ill(0, 0, ill_downi, ill, ill);

	/* Remove any conn_*_ill depending on this ill */
	ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst);

	/*
	 * Free state for additional IREs.
	 */
	mutex_enter(&ill->ill_saved_ire_lock);
	mp = ill->ill_saved_ire_mp;
	ill->ill_saved_ire_mp = NULL;
	ill->ill_saved_ire_cnt = 0;
	mutex_exit(&ill->ill_saved_ire_lock);
	freemsg(mp);
}

/*
 * ire_walk routine used to delete every IRE that depends on
 * 'ill'. (Always called as writer.)
 *
 * Note: since the routes added by the kernel are deleted separately,
 * this will only be 1) IRE_IF_CLONE and 2) manually added IRE_INTERFACE.
 *
 * We also remove references on ire_nce_cache entries that refer to the ill.
 */
void
ill_downi(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;
	nce_t	*nce;

	mutex_enter(&ire->ire_lock);
	nce = ire->ire_nce_cache;
	if (nce != NULL && nce->nce_ill == ill)
		ire->ire_nce_cache = NULL;
	else
		nce = NULL;
	mutex_exit(&ire->ire_lock);
	if (nce != NULL)
		nce_refrele(nce);
	if (ire->ire_ill == ill)
		ire_delete(ire);
}

/* Remove IRE_IF_CLONE on this ill */
void
ill_downi_if_clone(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;

	ASSERT(ire->ire_type & IRE_IF_CLONE);
	if (ire->ire_ill == ill)
		ire_delete(ire);
}

/* Consume an M_IOCACK of the fastpath probe. */
void
ill_fastpath_ack(ill_t *ill, mblk_t *mp)
{
	mblk_t	*mp1 = mp;

	/*
	 * If this was the first attempt turn on the fastpath probing.
	 */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS)
		ill->ill_dlpi_fastpath_state = IDS_OK;
	mutex_exit(&ill->ill_lock);

	/* Free the M_IOCACK mblk, hold on to the data */
	mp = mp->b_cont;
	freeb(mp1);
	if (mp == NULL)
		return;
	if (mp->b_cont != NULL)
		nce_fastpath_update(ill, mp);
	else
		ip0dbg(("ill_fastpath_ack: no b_cont\n"));
	freemsg(mp);
}

/*
 * Throw an M_IOCTL message downstream asking "do you know fastpath?"
 * The data portion of the request is a dl_unitdata_req_t template for
 * what we would send downstream in the absence of a fastpath confirmation.
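 *
 * Roughly (an illustration of the exchange, not a format contract): the
 * probe is an M_IOCTL of DL_IOC_HDR_INFO whose b_cont is a copy of that
 * template. A driver that supports fastpath M_IOCACKs it with the
 * ready-made link-layer header appended in a further mblk, which
 * ill_fastpath_ack() above hands to nce_fastpath_update().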
 */
int
ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp)
{
	struct iocblk	*ioc;
	mblk_t	*mp;

	if (dlur_mp == NULL)
		return (EINVAL);

	mutex_enter(&ill->ill_lock);
	switch (ill->ill_dlpi_fastpath_state) {
	case IDS_FAILED:
		/*
		 * Driver NAKed the first fastpath ioctl - assume it doesn't
		 * support it.
		 */
		mutex_exit(&ill->ill_lock);
		return (ENOTSUP);
	case IDS_UNKNOWN:
		/* This is the first probe */
		ill->ill_dlpi_fastpath_state = IDS_INPROGRESS;
		break;
	default:
		break;
	}
	mutex_exit(&ill->ill_lock);

	if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL)
		return (EAGAIN);

	mp->b_cont = copyb(dlur_mp);
	if (mp->b_cont == NULL) {
		freeb(mp);
		return (EAGAIN);
	}

	ioc = (struct iocblk *)mp->b_rptr;
	ioc->ioc_count = msgdsize(mp->b_cont);

	DTRACE_PROBE3(ill__dlpi, char *, "ill_fastpath_probe",
	    char *, "DL_IOC_HDR_INFO", ill_t *, ill);
	putnext(ill->ill_wq, mp);
	return (0);
}

void
ill_capability_probe(ill_t *ill)
{
	mblk_t	*mp;

	ASSERT(IAM_WRITER_ILL(ill));

	if (ill->ill_dlpi_capab_state != IDCS_UNKNOWN &&
	    ill->ill_dlpi_capab_state != IDCS_FAILED)
		return;

	/*
	 * We are starting a new cycle of capability negotiation.
	 * Free up the capab reset messages of any previous incarnation.
	 * We will do a fresh allocation when we get the response to our probe.
	 */
	if (ill->ill_capab_reset_mp != NULL) {
		freemsg(ill->ill_capab_reset_mp);
		ill->ill_capab_reset_mp = NULL;
	}

	ip1dbg(("ill_capability_probe: starting capability negotiation\n"));

	mp = ip_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ);
	if (mp == NULL)
		return;

	ill_capability_send(ill, mp);
	ill->ill_dlpi_capab_state = IDCS_PROBE_SENT;
}

void
ill_capability_reset(ill_t *ill, boolean_t reneg)
{
	ASSERT(IAM_WRITER_ILL(ill));

	if (ill->ill_dlpi_capab_state != IDCS_OK)
		return;

	ill->ill_dlpi_capab_state = reneg ? IDCS_RENEG : IDCS_RESET_SENT;

	ill_capability_send(ill, ill->ill_capab_reset_mp);
	ill->ill_capab_reset_mp = NULL;
	/*
	 * We turn off all capabilities except those pertaining to
	 * direct function call capabilities viz. ILL_CAPAB_DLD*
	 * which will be turned off by the corresponding reset functions.
	 */
	ill->ill_capabilities &= ~(ILL_CAPAB_HCKSUM | ILL_CAPAB_ZEROCOPY);
}

static void
ill_capability_reset_alloc(ill_t *ill)
{
	mblk_t *mp;
	size_t	size = 0;
	int	err;
	dl_capability_req_t	*capb;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(ill->ill_capab_reset_mp == NULL);

	if (ILL_HCKSUM_CAPABLE(ill)) {
		size += sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_hcksum_t);
	}

	if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) {
		size += sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_zerocopy_t);
	}

	if (ill->ill_capabilities & ILL_CAPAB_DLD) {
		size += sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_dld_t);
	}

	mp = allocb_wait(size + sizeof (dl_capability_req_t), BPRI_MED,
	    STR_NOSIG, &err);

	mp->b_datap->db_type = M_PROTO;
	bzero(mp->b_rptr, size + sizeof (dl_capability_req_t));

	capb = (dl_capability_req_t *)mp->b_rptr;
	capb->dl_primitive = DL_CAPABILITY_REQ;
	capb->dl_sub_offset = sizeof (dl_capability_req_t);
	capb->dl_sub_length = size;

	mp->b_wptr += sizeof (dl_capability_req_t);

	/*
	 * Each handler fills in the corresponding dl_capability_sub_t
	 * inside the mblk.
	 */
	ill_capability_hcksum_reset_fill(ill, mp);
	ill_capability_zerocopy_reset_fill(ill, mp);
	ill_capability_dld_reset_fill(ill, mp);

	ill->ill_capab_reset_mp = mp;
}

static void
ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers)
{
	dl_capab_id_t	*id_ic;
	uint_t		sub_dl_cap = outers->dl_cap;
	dl_capability_sub_t *inners;
	uint8_t		*capend;

	ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER);

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */

	capend = (uint8_t *)(outers + 1) + outers->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_id_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}

	id_ic = (dl_capab_id_t *)(outers + 1);

	if (outers->dl_length < sizeof (*id_ic) ||
	    (inners = &id_ic->id_subcap,
	    inners->dl_length > (outers->dl_length - sizeof (*inners)))) {
		cmn_err(CE_WARN, "ill_capability_id_ack: malformed "
		    "encapsulated capab type %d too long for mblk",
		    inners->dl_cap);
		return;
	}

	if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) {
		ip1dbg(("ill_capability_id_ack: mid token for capab type %d "
		    "isn't as expected; pass-thru module(s) detected, "
		    "discarding capability\n", inners->dl_cap));
		return;
	}

	/* Process the encapsulated sub-capability */
	ill_capability_dispatch(ill, mp, inners);
}

static void
ill_capability_dld_reset_fill(ill_t *ill, mblk_t *mp)
{
	dl_capability_sub_t *dl_subcap;

	if (!(ill->ill_capabilities & ILL_CAPAB_DLD))
		return;

	/*
	 * The dl_capab_dld_t that follows the dl_capability_sub_t is not
	 * initialized below since it is not used by DLD.
	 */
	dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
	dl_subcap->dl_cap = DL_CAPAB_DLD;
	dl_subcap->dl_length = sizeof (dl_capab_dld_t);

	mp->b_wptr += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t);
}

static void
ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp)
{
	/*
	 * If no ipif was brought up over this ill, this DL_CAPABILITY_REQ/ACK
	 * is only to get the VRRP capability.
	 *
	 * Note that we cannot check ill_ipif_up_count here since
	 * ill_ipif_up_count is only incremented when the resolver is setup.
	 * That is done asynchronously, and can race with this function.
	 */
	if (!ill->ill_dl_up) {
		if (subp->dl_cap == DL_CAPAB_VRRP)
			ill_capability_vrrp_ack(ill, mp, subp);
		return;
	}

	switch (subp->dl_cap) {
	case DL_CAPAB_HCKSUM:
		ill_capability_hcksum_ack(ill, mp, subp);
		break;
	case DL_CAPAB_ZEROCOPY:
		ill_capability_zerocopy_ack(ill, mp, subp);
		break;
	case DL_CAPAB_DLD:
		ill_capability_dld_ack(ill, mp, subp);
		break;
	case DL_CAPAB_VRRP:
		break;
	default:
		ip1dbg(("ill_capability_dispatch: unknown capab type %d\n",
		    subp->dl_cap));
	}
}

/*
 * Process the vrrp capability received from a DLS Provider. isub must point
 * to the sub-capability (DL_CAPAB_VRRP) of a DL_CAPABILITY_ACK message.
 */
static void
ill_capability_vrrp_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
	dl_capab_vrrp_t	*vrrp;
	uint_t		sub_dl_cap = isub->dl_cap;
	uint8_t		*capend;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(sub_dl_cap == DL_CAPAB_VRRP);

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */
	capend = (uint8_t *)(isub + 1) + isub->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_vrrp_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}
	vrrp = (dl_capab_vrrp_t *)(isub + 1);

	/*
	 * Compare the IP address family and set ILLF_VRRP for the right ill.
	 */
	if ((vrrp->vrrp_af == AF_INET6 && ill->ill_isv6) ||
	    (vrrp->vrrp_af == AF_INET && !ill->ill_isv6)) {
		ill->ill_flags |= ILLF_VRRP;
	}
}

/*
 * Process a hardware checksum offload capability negotiation ack received
 * from a DLS Provider. isub must point to the sub-capability
 * (DL_CAPAB_HCKSUM) of a DL_CAPABILITY_ACK message.
 */
static void
ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
	dl_capability_req_t	*ocap;
	dl_capab_hcksum_t	*ihck, *ohck;
	ill_hcksum_capab_t	**ill_hcksum;
	mblk_t			*nmp = NULL;
	uint_t			sub_dl_cap = isub->dl_cap;
	uint8_t			*capend;

	ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM);

	ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab;

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */
	capend = (uint8_t *)(isub + 1) + isub->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}

	/*
	 * There are two types of acks we process here:
	 * 1. acks in reply to a (first form) generic capability req
	 *    (no ENABLE flag set)
	 * 2. acks in reply to an ENABLE capability req.
	 *    (ENABLE flag set)
	 */
	ihck = (dl_capab_hcksum_t *)(isub + 1);

	if (ihck->hcksum_version != HCKSUM_VERSION_1) {
		cmn_err(CE_CONT, "ill_capability_hcksum_ack: "
		    "unsupported hardware checksum "
		    "sub-capability (version %d, expected %d)",
		    ihck->hcksum_version, HCKSUM_VERSION_1);
		return;
	}

	if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) {
		ip1dbg(("ill_capability_hcksum_ack: mid token for hardware "
		    "checksum capability isn't as expected; pass-thru "
		    "module(s) detected, discarding capability\n"));
		return;
	}

#define	CURR_HCKSUM_CAPAB				\
	(HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 |	\
	HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM)

	if ((ihck->hcksum_txflags & HCKSUM_ENABLE) &&
	    (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) {
		/* do ENABLE processing */
		if (*ill_hcksum == NULL) {
			*ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t),
			    KM_NOSLEEP);

			if (*ill_hcksum == NULL) {
				cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
				    "could not enable hcksum version %d "
				    "for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION,
				    ill->ill_name);
				return;
			}
		}

		(*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version;
		(*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags;
		ill->ill_capabilities |= ILL_CAPAB_HCKSUM;
		ip1dbg(("ill_capability_hcksum_ack: interface %s "
		    "has enabled hardware checksumming\n ",
		    ill->ill_name));
	} else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) {
		/*
		 * Enabling hardware checksum offload
		 * Currently IP supports {TCP,UDP}/IPv4
		 * partial and full cksum offload and
		 * IPv4 header checksum offload.
		 * Allocate new mblk which will
		 * contain a new capability request
		 * to enable hardware checksum offload.
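		 *
		 * As a sketch of what the code below constructs (not a
		 * separate format): a dl_capability_req_t followed by a
		 * copy of the received dl_capability_sub_t and
		 * dl_capab_hcksum_t, with HCKSUM_ENABLE set in
		 * hcksum_txflags.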
		 */
		uint_t	size;
		uchar_t	*rptr;

		size = sizeof (dl_capability_req_t) +
		    sizeof (dl_capability_sub_t) + isub->dl_length;

		if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
			cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
			    "could not enable hardware cksum for %s (ENOMEM)\n",
			    ill->ill_name);
			return;
		}

		rptr = nmp->b_rptr;
		/* initialize dl_capability_req_t */
		ocap = (dl_capability_req_t *)nmp->b_rptr;
		ocap->dl_sub_offset =
		    sizeof (dl_capability_req_t);
		ocap->dl_sub_length =
		    sizeof (dl_capability_sub_t) +
		    isub->dl_length;
		nmp->b_rptr += sizeof (dl_capability_req_t);

		/* initialize dl_capability_sub_t */
		bcopy(isub, nmp->b_rptr, sizeof (*isub));
		nmp->b_rptr += sizeof (*isub);

		/* initialize dl_capab_hcksum_t */
		ohck = (dl_capab_hcksum_t *)nmp->b_rptr;
		bcopy(ihck, ohck, sizeof (*ihck));

		nmp->b_rptr = rptr;
		ASSERT(nmp->b_wptr == (nmp->b_rptr + size));

		/* Set ENABLE flag */
		ohck->hcksum_txflags &= CURR_HCKSUM_CAPAB;
		ohck->hcksum_txflags |= HCKSUM_ENABLE;

		/*
		 * nmp points to a DL_CAPABILITY_REQ message to enable
		 * hardware checksum acceleration.
		 */
		ill_capability_send(ill, nmp);
	} else {
		ip1dbg(("ill_capability_hcksum_ack: interface %s has "
		    "advertised %x hardware checksum capability flags\n",
		    ill->ill_name, ihck->hcksum_txflags));
	}
}

static void
ill_capability_hcksum_reset_fill(ill_t *ill, mblk_t *mp)
{
	dl_capab_hcksum_t *hck_subcap;
	dl_capability_sub_t *dl_subcap;

	if (!ILL_HCKSUM_CAPABLE(ill))
		return;

	ASSERT(ill->ill_hcksum_capab != NULL);

	dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
	dl_subcap->dl_cap = DL_CAPAB_HCKSUM;
	dl_subcap->dl_length = sizeof (*hck_subcap);

	hck_subcap = (dl_capab_hcksum_t *)(dl_subcap + 1);
	hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version;
	hck_subcap->hcksum_txflags = 0;

	mp->b_wptr += sizeof (*dl_subcap) + sizeof (*hck_subcap);
}

static void
ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
	mblk_t *nmp = NULL;
	dl_capability_req_t *oc;
	dl_capab_zerocopy_t *zc_ic, *zc_oc;
	ill_zerocopy_capab_t **ill_zerocopy_capab;
	uint_t sub_dl_cap = isub->dl_cap;
	uint8_t *capend;

	ASSERT(sub_dl_cap == DL_CAPAB_ZEROCOPY);

	ill_zerocopy_capab = (ill_zerocopy_capab_t **)&ill->ill_zerocopy_capab;

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */
	capend = (uint8_t *)(isub + 1) + isub->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}

	zc_ic = (dl_capab_zerocopy_t *)(isub + 1);
	if (zc_ic->zerocopy_version != ZEROCOPY_VERSION_1) {
		cmn_err(CE_CONT, "ill_capability_zerocopy_ack: "
		    "unsupported ZEROCOPY sub-capability (version %d, "
		    "expected %d)", zc_ic->zerocopy_version,
		    ZEROCOPY_VERSION_1);
		return;
	}

	if (!dlcapabcheckqid(&zc_ic->zerocopy_mid, ill->ill_lmod_rq)) {
		ip1dbg(("ill_capability_zerocopy_ack: mid token for zerocopy "
		    "capability isn't as expected; pass-thru module(s) "
		    "detected, discarding capability\n"));
		return;
	}

	if ((zc_ic->zerocopy_flags & DL_CAPAB_VMSAFE_MEM) != 0) {
		if (*ill_zerocopy_capab == NULL) {
			*ill_zerocopy_capab =
			    kmem_zalloc(sizeof (ill_zerocopy_capab_t),
			    KM_NOSLEEP);

			if (*ill_zerocopy_capab == NULL) {
				cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
				    "could not enable Zero-copy version %d "
				    "for %s (ENOMEM)\n", ZEROCOPY_VERSION_1,
				    ill->ill_name);
				return;
			}
		}

		ip1dbg(("ill_capability_zerocopy_ack: interface %s "
		    "supports Zero-copy version %d\n", ill->ill_name,
		    ZEROCOPY_VERSION_1));

		(*ill_zerocopy_capab)->ill_zerocopy_version =
		    zc_ic->zerocopy_version;
		(*ill_zerocopy_capab)->ill_zerocopy_flags =
		    zc_ic->zerocopy_flags;

		ill->ill_capabilities |= ILL_CAPAB_ZEROCOPY;
	} else {
		uint_t size;
		uchar_t *rptr;

		size = sizeof (dl_capability_req_t) +
		    sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_zerocopy_t);

		if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
			cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
			    "could not enable zerocopy for %s (ENOMEM)\n",
			    ill->ill_name);
			return;
		}

		rptr = nmp->b_rptr;
		/* initialize dl_capability_req_t */
		oc = (dl_capability_req_t *)rptr;
		oc->dl_sub_offset = sizeof (dl_capability_req_t);
		oc->dl_sub_length = sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_zerocopy_t);
		rptr += sizeof (dl_capability_req_t);

		/* initialize dl_capability_sub_t */
		bcopy(isub, rptr, sizeof (*isub));
		rptr += sizeof (*isub);

		/* initialize dl_capab_zerocopy_t */
		zc_oc = (dl_capab_zerocopy_t *)rptr;
		*zc_oc = *zc_ic;

		ip1dbg(("ill_capability_zerocopy_ack: asking interface %s "
		    "to enable zero-copy version %d\n", ill->ill_name,
		    ZEROCOPY_VERSION_1));

		/* set VMSAFE_MEM flag */
		zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM;

		/* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */
		ill_capability_send(ill, nmp);
	}
}

static void
ill_capability_zerocopy_reset_fill(ill_t *ill, mblk_t *mp)
{
	dl_capab_zerocopy_t *zerocopy_subcap;
	dl_capability_sub_t *dl_subcap;

	if (!(ill->ill_capabilities & ILL_CAPAB_ZEROCOPY))
		return;

	ASSERT(ill->ill_zerocopy_capab != NULL);

	dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
	dl_subcap->dl_cap = DL_CAPAB_ZEROCOPY;
	dl_subcap->dl_length = sizeof (*zerocopy_subcap);

	zerocopy_subcap = (dl_capab_zerocopy_t *)(dl_subcap + 1);
	zerocopy_subcap->zerocopy_version =
    ill->ill_zerocopy_capab->ill_zerocopy_version;
	zerocopy_subcap->zerocopy_flags = 0;

	mp->b_wptr += sizeof (*dl_subcap) + sizeof (*zerocopy_subcap);
}

/*
 * DLD capability
 * Refer to dld.h for more information regarding the purpose and usage
 * of this capability.
 */
static void
ill_capability_dld_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
	dl_capab_dld_t	*dld_ic, dld;
	uint_t		sub_dl_cap = isub->dl_cap;
	uint8_t		*capend;
	ill_dld_capab_t	*idc;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(sub_dl_cap == DL_CAPAB_DLD);

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */
	capend = (uint8_t *)(isub + 1) + isub->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_dld_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}
	dld_ic = (dl_capab_dld_t *)(isub + 1);
	if (dld_ic->dld_version != DLD_CURRENT_VERSION) {
		cmn_err(CE_CONT, "ill_capability_dld_ack: "
		    "unsupported DLD sub-capability (version %d, "
		    "expected %d)", dld_ic->dld_version,
		    DLD_CURRENT_VERSION);
		return;
	}
	if (!dlcapabcheckqid(&dld_ic->dld_mid, ill->ill_lmod_rq)) {
		ip1dbg(("ill_capability_dld_ack: mid token for dld "
		    "capability isn't as expected; pass-thru module(s) "
		    "detected, discarding capability\n"));
		return;
	}

	/*
	 * Copy locally to ensure alignment.
	 */
	bcopy(dld_ic, &dld, sizeof (dl_capab_dld_t));

	if ((idc = ill->ill_dld_capab) == NULL) {
		idc = kmem_zalloc(sizeof (ill_dld_capab_t), KM_NOSLEEP);
		if (idc == NULL) {
			cmn_err(CE_WARN, "ill_capability_dld_ack: "
			    "could not enable DLD version %d "
			    "for %s (ENOMEM)\n", DLD_CURRENT_VERSION,
			    ill->ill_name);
			return;
		}
		ill->ill_dld_capab = idc;
	}
	idc->idc_capab_df = (ip_capab_func_t)dld.dld_capab;
	idc->idc_capab_dh = (void *)dld.dld_capab_handle;
	ip1dbg(("ill_capability_dld_ack: interface %s "
	    "supports DLD version %d\n", ill->ill_name, DLD_CURRENT_VERSION));

	ill_capability_dld_enable(ill);
}

/*
 * Typically capability negotiation between IP and the driver happens via
 * DLPI message exchange. However, GLD also offers a direct function call
 * mechanism to exchange the DLD_DIRECT_CAPAB and DLD_POLL_CAPAB capabilities.
 * But arbitrary function calls into IP or GLD are not permitted, since both
 * of them are protected by their own perimeter mechanism. The perimeter can
 * be viewed as a coarse lock or serialization mechanism. The hierarchy of
 * these perimeters is IP -> MAC. Thus, for example, to enable squeue
 * polling, IP needs to enter its perimeter, then call ill_mac_perim_enter
 * to enter the mac perimeter and then do the direct function calls into
 * GLD to enable squeue polling. The ring related callbacks from the mac into
 * the stack to add, bind, quiesce, restart or cleanup a ring are all
 * protected by the mac perimeter.
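 *
 * As a concrete ordering sketch (this is what ill_capability_dld_enable()
 * below does, once IP's own perimeter is already held as writer):
 *
 *	ill_mac_perim_enter(ill, &mph);		enter the mac perimeter
 *	ill_capability_direct_enable(ill);	direct function calls
 *	ill_capability_poll_enable(ill);	    into GLD
 *	ill_capability_lso_enable(ill);
 *	ill_mac_perim_exit(ill, mph);		leave the mac perimeter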
1901 */ 1902 static void 1903 ill_mac_perim_enter(ill_t *ill, mac_perim_handle_t *mphp) 1904 { 1905 ill_dld_capab_t *idc = ill->ill_dld_capab; 1906 int err; 1907 1908 err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mphp, 1909 DLD_ENABLE); 1910 ASSERT(err == 0); 1911 } 1912 1913 static void 1914 ill_mac_perim_exit(ill_t *ill, mac_perim_handle_t mph) 1915 { 1916 ill_dld_capab_t *idc = ill->ill_dld_capab; 1917 int err; 1918 1919 err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mph, 1920 DLD_DISABLE); 1921 ASSERT(err == 0); 1922 } 1923 1924 boolean_t 1925 ill_mac_perim_held(ill_t *ill) 1926 { 1927 ill_dld_capab_t *idc = ill->ill_dld_capab; 1928 1929 return (idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, NULL, 1930 DLD_QUERY)); 1931 } 1932 1933 static void 1934 ill_capability_direct_enable(ill_t *ill) 1935 { 1936 ill_dld_capab_t *idc = ill->ill_dld_capab; 1937 ill_dld_direct_t *idd = &idc->idc_direct; 1938 dld_capab_direct_t direct; 1939 int rc; 1940 1941 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); 1942 1943 bzero(&direct, sizeof (direct)); 1944 direct.di_rx_cf = (uintptr_t)ip_input; 1945 direct.di_rx_ch = ill; 1946 1947 rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, &direct, 1948 DLD_ENABLE); 1949 if (rc == 0) { 1950 idd->idd_tx_df = (ip_dld_tx_t)direct.di_tx_df; 1951 idd->idd_tx_dh = direct.di_tx_dh; 1952 idd->idd_tx_cb_df = (ip_dld_callb_t)direct.di_tx_cb_df; 1953 idd->idd_tx_cb_dh = direct.di_tx_cb_dh; 1954 idd->idd_tx_fctl_df = (ip_dld_fctl_t)direct.di_tx_fctl_df; 1955 idd->idd_tx_fctl_dh = direct.di_tx_fctl_dh; 1956 ASSERT(idd->idd_tx_cb_df != NULL); 1957 ASSERT(idd->idd_tx_fctl_df != NULL); 1958 ASSERT(idd->idd_tx_df != NULL); 1959 /* 1960 * One time registration of flow enable callback function 1961 */ 1962 ill->ill_flownotify_mh = idd->idd_tx_cb_df(idd->idd_tx_cb_dh, 1963 ill_flow_enable, ill); 1964 ill->ill_capabilities |= ILL_CAPAB_DLD_DIRECT; 1965 DTRACE_PROBE1(direct_on, (ill_t *), ill); 1966 } else { 1967 cmn_err(CE_WARN, "warning: could not enable DIRECT " 1968 "capability, rc = %d\n", rc); 1969 DTRACE_PROBE2(direct_off, (ill_t *), ill, (int), rc); 1970 } 1971 } 1972 1973 static void 1974 ill_capability_poll_enable(ill_t *ill) 1975 { 1976 ill_dld_capab_t *idc = ill->ill_dld_capab; 1977 dld_capab_poll_t poll; 1978 int rc; 1979 1980 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); 1981 1982 bzero(&poll, sizeof (poll)); 1983 poll.poll_ring_add_cf = (uintptr_t)ip_squeue_add_ring; 1984 poll.poll_ring_remove_cf = (uintptr_t)ip_squeue_clean_ring; 1985 poll.poll_ring_quiesce_cf = (uintptr_t)ip_squeue_quiesce_ring; 1986 poll.poll_ring_restart_cf = (uintptr_t)ip_squeue_restart_ring; 1987 poll.poll_ring_bind_cf = (uintptr_t)ip_squeue_bind_ring; 1988 poll.poll_ring_ch = ill; 1989 rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, &poll, 1990 DLD_ENABLE); 1991 if (rc == 0) { 1992 ill->ill_capabilities |= ILL_CAPAB_DLD_POLL; 1993 DTRACE_PROBE1(poll_on, (ill_t *), ill); 1994 } else { 1995 ip1dbg(("warning: could not enable POLL " 1996 "capability, rc = %d\n", rc)); 1997 DTRACE_PROBE2(poll_off, (ill_t *), ill, (int), rc); 1998 } 1999 } 2000 2001 /* 2002 * Enable the LSO capability. 
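 * (Background note: LSO, large send offload, lets IP hand the driver a
 * TCP payload larger than the link MTU and have the hardware perform the
 * segmentation. The lso_flags and lso_max values copied below record what
 * the driver advertised; lso_max is its limit on such a payload.)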
2003 */ 2004 static void 2005 ill_capability_lso_enable(ill_t *ill) 2006 { 2007 ill_dld_capab_t *idc = ill->ill_dld_capab; 2008 dld_capab_lso_t lso; 2009 int rc; 2010 2011 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); 2012 2013 if (ill->ill_lso_capab == NULL) { 2014 ill->ill_lso_capab = kmem_zalloc(sizeof (ill_lso_capab_t), 2015 KM_NOSLEEP); 2016 if (ill->ill_lso_capab == NULL) { 2017 cmn_err(CE_WARN, "ill_capability_lso_enable: " 2018 "could not enable LSO for %s (ENOMEM)\n", 2019 ill->ill_name); 2020 return; 2021 } 2022 } 2023 2024 bzero(&lso, sizeof (lso)); 2025 if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, &lso, 2026 DLD_ENABLE)) == 0) { 2027 ill->ill_lso_capab->ill_lso_flags = lso.lso_flags; 2028 ill->ill_lso_capab->ill_lso_max = lso.lso_max; 2029 ill->ill_capabilities |= ILL_CAPAB_LSO; 2030 ip1dbg(("ill_capability_lso_enable: interface %s " 2031 "has enabled LSO\n ", ill->ill_name)); 2032 } else { 2033 kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t)); 2034 ill->ill_lso_capab = NULL; 2035 DTRACE_PROBE2(lso_off, (ill_t *), ill, (int), rc); 2036 } 2037 } 2038 2039 static void 2040 ill_capability_dld_enable(ill_t *ill) 2041 { 2042 mac_perim_handle_t mph; 2043 2044 ASSERT(IAM_WRITER_ILL(ill)); 2045 2046 if (ill->ill_isv6) 2047 return; 2048 2049 ill_mac_perim_enter(ill, &mph); 2050 if (!ill->ill_isv6) { 2051 ill_capability_direct_enable(ill); 2052 ill_capability_poll_enable(ill); 2053 ill_capability_lso_enable(ill); 2054 } 2055 ill->ill_capabilities |= ILL_CAPAB_DLD; 2056 ill_mac_perim_exit(ill, mph); 2057 } 2058 2059 static void 2060 ill_capability_dld_disable(ill_t *ill) 2061 { 2062 ill_dld_capab_t *idc; 2063 ill_dld_direct_t *idd; 2064 mac_perim_handle_t mph; 2065 2066 ASSERT(IAM_WRITER_ILL(ill)); 2067 2068 if (!(ill->ill_capabilities & ILL_CAPAB_DLD)) 2069 return; 2070 2071 ill_mac_perim_enter(ill, &mph); 2072 2073 idc = ill->ill_dld_capab; 2074 if ((ill->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0) { 2075 /* 2076 * For performance we avoid locks in the transmit data path 2077 * and don't maintain a count of the number of threads using 2078 * direct calls. Thus some threads could be using direct 2079 * transmit calls to GLD, even after the capability mechanism 2080 * turns it off. This is still safe since the handles used in 2081 * the direct calls continue to be valid until the unplumb is 2082 * completed. Remove the callback that was added (1-time) at 2083 * capab enable time. 2084 */ 2085 mutex_enter(&ill->ill_lock); 2086 ill->ill_capabilities &= ~ILL_CAPAB_DLD_DIRECT; 2087 mutex_exit(&ill->ill_lock); 2088 if (ill->ill_flownotify_mh != NULL) { 2089 idd = &idc->idc_direct; 2090 idd->idd_tx_cb_df(idd->idd_tx_cb_dh, NULL, 2091 ill->ill_flownotify_mh); 2092 ill->ill_flownotify_mh = NULL; 2093 } 2094 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, 2095 NULL, DLD_DISABLE); 2096 } 2097 2098 if ((ill->ill_capabilities & ILL_CAPAB_DLD_POLL) != 0) { 2099 ill->ill_capabilities &= ~ILL_CAPAB_DLD_POLL; 2100 ip_squeue_clean_all(ill); 2101 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, 2102 NULL, DLD_DISABLE); 2103 } 2104 2105 if ((ill->ill_capabilities & ILL_CAPAB_LSO) != 0) { 2106 ASSERT(ill->ill_lso_capab != NULL); 2107 /* 2108 * Clear the capability flag for LSO but retain the 2109 * ill_lso_capab structure since it's possible that another 2110 * thread is still referring to it. The structure only gets 2111 * deallocated when we destroy the ill. 
		 */

		ill->ill_capabilities &= ~ILL_CAPAB_LSO;
		(void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO,
		    NULL, DLD_DISABLE);
	}

	ill->ill_capabilities &= ~ILL_CAPAB_DLD;
	ill_mac_perim_exit(ill, mph);
}

/*
 * Capability Negotiation protocol
 *
 * We don't wait for DLPI capability operations to finish during interface
 * bringup or teardown. Doing so would introduce more asynchrony and the
 * interface up/down operations would need multiple returns and restarts.
 * Instead the 'ipsq_current_ipif' of the ipsq is not cleared as long as
 * the 'ill_dlpi_deferred' chain is non-empty. This ensures that the next
 * exclusive operation won't start until the DLPI operations of the previous
 * exclusive operation complete.
 *
 * The capability state machine is shown below.
 *
 * state		next state		event, action
 *
 * IDCS_UNKNOWN		IDCS_PROBE_SENT		ill_capability_probe
 * IDCS_PROBE_SENT	IDCS_OK			ill_capability_ack
 * IDCS_PROBE_SENT	IDCS_FAILED		ip_rput_dlpi_writer (nack)
 * IDCS_OK		IDCS_RENEG		Receipt of DL_NOTE_CAPAB_RENEG
 * IDCS_OK		IDCS_RESET_SENT		ill_capability_reset
 * IDCS_RESET_SENT	IDCS_UNKNOWN		ill_capability_ack_thr
 * IDCS_RENEG		IDCS_PROBE_SENT		ill_capability_ack_thr ->
 *						    ill_capability_probe.
 */

/*
 * Dedicated thread started from ip_stack_init that handles capability
 * disable. This thread ensures the taskq dispatch does not fail by waiting
 * for resources using TQ_SLEEP. The taskq mechanism is used to ensure
 * that direct calls to DLD are done in a cv_waitable context.
 */
void
ill_taskq_dispatch(ip_stack_t *ipst)
{
	callb_cpr_t	cprinfo;
	char		name[64];
	mblk_t		*mp;

	(void) snprintf(name, sizeof (name), "ill_taskq_dispatch_%d",
	    ipst->ips_netstack->netstack_stackid);
	CALLB_CPR_INIT(&cprinfo, &ipst->ips_capab_taskq_lock, callb_generic_cpr,
	    name);
	mutex_enter(&ipst->ips_capab_taskq_lock);

	for (;;) {
		mp = ipst->ips_capab_taskq_head;
		while (mp != NULL) {
			ipst->ips_capab_taskq_head = mp->b_next;
			if (ipst->ips_capab_taskq_head == NULL)
				ipst->ips_capab_taskq_tail = NULL;
			mutex_exit(&ipst->ips_capab_taskq_lock);
			mp->b_next = NULL;

			VERIFY(taskq_dispatch(system_taskq,
			    ill_capability_ack_thr, mp, TQ_SLEEP) != 0);
			mutex_enter(&ipst->ips_capab_taskq_lock);
			mp = ipst->ips_capab_taskq_head;
		}

		if (ipst->ips_capab_taskq_quit)
			break;
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&ipst->ips_capab_taskq_cv, &ipst->ips_capab_taskq_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &ipst->ips_capab_taskq_lock);
	}
	VERIFY(ipst->ips_capab_taskq_head == NULL);
	VERIFY(ipst->ips_capab_taskq_tail == NULL);
	CALLB_CPR_EXIT(&cprinfo);
	thread_exit();
}

/*
 * Consume a new-style hardware capabilities negotiation ack.
 * Called via taskq on receipt of DL_CAPABILITY_ACK.
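 * (The taskq detour exists so that this runs in a context that may
 * cv_wait(): the reset path below calls ill_capability_dld_disable(),
 * which makes direct function calls into DLD.)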
 */
static void
ill_capability_ack_thr(void *arg)
{
	mblk_t *mp = arg;
	dl_capability_ack_t *capp;
	dl_capability_sub_t *subp, *endp;
	ill_t *ill;
	boolean_t reneg;

	ill = (ill_t *)mp->b_prev;
	mp->b_prev = NULL;

	VERIFY(ipsq_enter(ill, B_FALSE, CUR_OP) == B_TRUE);

	if (ill->ill_dlpi_capab_state == IDCS_RESET_SENT ||
	    ill->ill_dlpi_capab_state == IDCS_RENEG) {
		/*
		 * We have received the ack for our DL_CAPAB reset request.
		 * There isn't anything in the message that needs processing.
		 * All message based capabilities have been disabled; now
		 * do the function call based capability disable.
		 */
		reneg = ill->ill_dlpi_capab_state == IDCS_RENEG;
		ill_capability_dld_disable(ill);
		ill->ill_dlpi_capab_state = IDCS_UNKNOWN;
		if (reneg)
			ill_capability_probe(ill);
		goto done;
	}

	if (ill->ill_dlpi_capab_state == IDCS_PROBE_SENT)
		ill->ill_dlpi_capab_state = IDCS_OK;

	capp = (dl_capability_ack_t *)mp->b_rptr;

	if (capp->dl_sub_length == 0) {
		/* no new-style capabilities */
		goto done;
	}

	/* make sure the driver supplied correct dl_sub_length */
	if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) {
		ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, "
		    "invalid dl_sub_length (%d)\n", capp->dl_sub_length));
		goto done;
	}

#define	SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset))
	/*
	 * There are sub-capabilities. Process the ones we know about.
	 * Loop until we don't have room for another sub-cap header.
	 */
	for (subp = SC(capp, capp->dl_sub_offset),
	    endp = SC(subp, capp->dl_sub_length - sizeof (*subp));
	    subp <= endp;
	    subp = SC(subp, sizeof (dl_capability_sub_t) + subp->dl_length)) {

		switch (subp->dl_cap) {
		case DL_CAPAB_ID_WRAPPER:
			ill_capability_id_ack(ill, mp, subp);
			break;
		default:
			ill_capability_dispatch(ill, mp, subp);
			break;
		}
	}
#undef SC
done:
	inet_freemsg(mp);
	ill_capability_done(ill);
	ipsq_exit(ill->ill_phyint->phyint_ipsq);
}

/*
 * This needs to be started in a taskq thread to provide a cv_waitable
 * context.
 */
void
ill_capability_ack(ill_t *ill, mblk_t *mp)
{
	ip_stack_t *ipst = ill->ill_ipst;

	mp->b_prev = (mblk_t *)ill;
	ASSERT(mp->b_next == NULL);

	if (taskq_dispatch(system_taskq, ill_capability_ack_thr, mp,
	    TQ_NOSLEEP) != 0)
		return;

	/*
	 * The taskq dispatch failed. Signal the ill_taskq_dispatch thread
	 * which will do the dispatch using TQ_SLEEP to guarantee success.
	 */
	mutex_enter(&ipst->ips_capab_taskq_lock);
	if (ipst->ips_capab_taskq_head == NULL) {
		ASSERT(ipst->ips_capab_taskq_tail == NULL);
		ipst->ips_capab_taskq_head = mp;
	} else {
		ipst->ips_capab_taskq_tail->b_next = mp;
	}
	ipst->ips_capab_taskq_tail = mp;

	cv_signal(&ipst->ips_capab_taskq_cv);
	mutex_exit(&ipst->ips_capab_taskq_lock);
}

/*
 * This routine is called to scan the fragmentation reassembly table for
 * the specified ILL for any packets that are starting to smell.
 * dead_interval is the maximum time in seconds that will be tolerated. It
 * will either be the value specified in ip_g_frag_timeout, or zero if the
 * ILL is shutting down and it is time to blow everything off.
 *
 * It returns the number of seconds (as a time_t) that the next frag timer
 * should be scheduled for, 0 meaning that the timer doesn't need to be
 * re-started. Note that the method of calculating next_timeout isn't
 * entirely accurate since time will flow between the time we grab
 * current_time and the time we schedule the next timeout. This isn't a
 * big problem since this is the timer for sending ICMP reassembly time
 * exceeded messages, and it doesn't have to be exactly accurate.
 *
 * This function is sometimes called as writer, although this is not
 * required.
 */
time_t
ill_frag_timeout(ill_t *ill, time_t dead_interval)
{
	ipfb_t	*ipfb;
	ipfb_t	*endp;
	ipf_t	*ipf;
	ipf_t	*ipfnext;
	mblk_t	*mp;
	time_t	current_time = gethrestime_sec();
	time_t	next_timeout = 0;
	uint32_t	hdr_length;
	mblk_t	*send_icmp_head;
	mblk_t	*send_icmp_head_v6;
	ip_stack_t *ipst = ill->ill_ipst;
	ip_recv_attr_t	iras;

	bzero(&iras, sizeof (iras));
	iras.ira_flags = 0;
	iras.ira_ill = iras.ira_rill = ill;
	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
	iras.ira_rifindex = iras.ira_ruifindex;

	ipfb = ill->ill_frag_hash_tbl;
	if (ipfb == NULL)
		return (0);	/* no table; nothing to time out */
	endp = &ipfb[ILL_FRAG_HASH_TBL_COUNT];
	/* Walk the frag hash table. */
	for (; ipfb < endp; ipfb++) {
		send_icmp_head = NULL;
		send_icmp_head_v6 = NULL;
		mutex_enter(&ipfb->ipfb_lock);
		while ((ipf = ipfb->ipfb_ipf) != NULL) {
			time_t frag_time = current_time - ipf->ipf_timestamp;
			time_t frag_timeout;

			if (frag_time < dead_interval) {
				/*
				 * There are some outstanding fragments
				 * that will timeout later. Make note of
				 * the time so that we can reschedule the
				 * next timeout appropriately.
				 */
				frag_timeout = dead_interval - frag_time;
				if (next_timeout == 0 ||
				    frag_timeout < next_timeout) {
					next_timeout = frag_timeout;
				}
				break;
			}
			/* Time's up. Get it out of here. */
			hdr_length = ipf->ipf_nf_hdr_len;
			ipfnext = ipf->ipf_hash_next;
			if (ipfnext != NULL)
				ipfnext->ipf_ptphn = ipf->ipf_ptphn;
			*ipf->ipf_ptphn = ipfnext;
			mp = ipf->ipf_mp->b_cont;
			for (; mp; mp = mp->b_cont) {
				/* Extra points for neatness. */
				IP_REASS_SET_START(mp, 0);
				IP_REASS_SET_END(mp, 0);
			}
			mp = ipf->ipf_mp->b_cont;
			atomic_add_32(&ill->ill_frag_count, -ipf->ipf_count);
			ASSERT(ipfb->ipfb_count >= ipf->ipf_count);
			ipfb->ipfb_count -= ipf->ipf_count;
			ASSERT(ipfb->ipfb_frag_pkts > 0);
			ipfb->ipfb_frag_pkts--;
			/*
			 * We do not send any icmp message from here because
			 * we currently are holding the ipfb_lock for this
			 * hash chain. If we try and send any icmp messages
			 * from here we may end up via a put back into ip
			 * trying to get the same lock, causing a recursive
			 * mutex panic. Instead we build a list and send all
			 * the icmp messages after we have dropped the lock.
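			 * (The lists are drained immediately after the
			 * mutex_exit() below, one ICMP error per queued
			 * packet.)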
			 */
			if (ill->ill_isv6) {
				if (hdr_length != 0) {
					mp->b_next = send_icmp_head_v6;
					send_icmp_head_v6 = mp;
				} else {
					freemsg(mp);
				}
			} else {
				if (hdr_length != 0) {
					mp->b_next = send_icmp_head;
					send_icmp_head = mp;
				} else {
					freemsg(mp);
				}
			}
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails);
			ip_drop_input("ipIfStatsReasmFails", ipf->ipf_mp, ill);
			freeb(ipf->ipf_mp);
		}
		mutex_exit(&ipfb->ipfb_lock);
		/*
		 * Now send any icmp messages that we delayed from above.
		 */
		while (send_icmp_head_v6 != NULL) {
			ip6_t *ip6h;

			mp = send_icmp_head_v6;
			send_icmp_head_v6 = send_icmp_head_v6->b_next;
			mp->b_next = NULL;
			ip6h = (ip6_t *)mp->b_rptr;
			iras.ira_flags = 0;
			/*
			 * This will result in an incorrect ALL_ZONES zoneid
			 * for multicast packets, but we
			 * don't send ICMP errors for those in any case.
			 */
			iras.ira_zoneid =
			    ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst,
			    ill, ipst);
			ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill);
			icmp_time_exceeded_v6(mp,
			    ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE,
			    &iras);
			ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
		}
		while (send_icmp_head != NULL) {
			ipaddr_t dst;

			mp = send_icmp_head;
			send_icmp_head = send_icmp_head->b_next;
			mp->b_next = NULL;

			dst = ((ipha_t *)mp->b_rptr)->ipha_dst;

			iras.ira_flags = IRAF_IS_IPV4;
			/*
			 * This will result in an incorrect ALL_ZONES zoneid
			 * for broadcast and multicast packets, but we
			 * don't send ICMP errors for those in any case.
			 */
			iras.ira_zoneid = ipif_lookup_addr_zoneid(dst,
			    ill, ipst);
			ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill);
			icmp_time_exceeded(mp,
			    ICMP_REASSEMBLY_TIME_EXCEEDED, &iras);
			ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
		}
	}
	/*
	 * A non-dying ILL will use the return value to decide whether to
	 * restart the frag timer, and for how long.
	 */
	return (next_timeout);
}

/*
 * This routine is called when the approximate count of mblk memory used
 * for the specified ILL has exceeded max_count.
 */
void
ill_frag_prune(ill_t *ill, uint_t max_count)
{
	ipfb_t	*ipfb;
	ipf_t	*ipf;
	size_t	count;
	clock_t now;

	/*
	 * If we are here within ip_min_frag_prune_time msecs of the last
	 * call, increment ill_frag_free_num_pkts so that the loop below
	 * frees progressively more of the oldest packets from each bucket;
	 * otherwise reset the count to zero.
	 */
	mutex_enter(&ill->ill_lock);
	now = ddi_get_lbolt();
	if (TICK_TO_MSEC(now - ill->ill_last_frag_clean_time) <=
	    (ip_min_frag_prune_time != 0 ?
	    ip_min_frag_prune_time : msec_per_tick)) {

		ill->ill_frag_free_num_pkts++;

	} else {
		ill->ill_frag_free_num_pkts = 0;
	}
	ill->ill_last_frag_clean_time = now;
	mutex_exit(&ill->ill_lock);

	/*
	 * free ill_frag_free_num_pkts oldest packets from each bucket.
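	 * (For example, if memory pressure has driven us here three times
	 * within the window, up to three of the oldest packets are freed
	 * from each bucket on this pass.)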
2507 */ 2508 if (ill->ill_frag_free_num_pkts != 0) { 2509 int ix; 2510 2511 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 2512 ipfb = &ill->ill_frag_hash_tbl[ix]; 2513 mutex_enter(&ipfb->ipfb_lock); 2514 if (ipfb->ipfb_ipf != NULL) { 2515 ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 2516 ill->ill_frag_free_num_pkts); 2517 } 2518 mutex_exit(&ipfb->ipfb_lock); 2519 } 2520 } 2521 /* 2522 * While the reassembly list for this ILL is too big, prune a fragment 2523 * queue by age, oldest first. 2524 */ 2525 while (ill->ill_frag_count > max_count) { 2526 int ix; 2527 ipfb_t *oipfb = NULL; 2528 uint_t oldest = UINT_MAX; 2529 2530 count = 0; 2531 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 2532 ipfb = &ill->ill_frag_hash_tbl[ix]; 2533 mutex_enter(&ipfb->ipfb_lock); 2534 ipf = ipfb->ipfb_ipf; 2535 if (ipf != NULL && ipf->ipf_gen < oldest) { 2536 oldest = ipf->ipf_gen; 2537 oipfb = ipfb; 2538 } 2539 count += ipfb->ipfb_count; 2540 mutex_exit(&ipfb->ipfb_lock); 2541 } 2542 if (oipfb == NULL) 2543 break; 2544 2545 if (count <= max_count) 2546 return; /* Somebody beat us to it, nothing to do */ 2547 mutex_enter(&oipfb->ipfb_lock); 2548 ipf = oipfb->ipfb_ipf; 2549 if (ipf != NULL) { 2550 ill_frag_free_pkts(ill, oipfb, ipf, 1); 2551 } 2552 mutex_exit(&oipfb->ipfb_lock); 2553 } 2554 } 2555 2556 /* 2557 * free 'free_cnt' fragmented packets starting at ipf. 2558 */ 2559 void 2560 ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt) 2561 { 2562 size_t count; 2563 mblk_t *mp; 2564 mblk_t *tmp; 2565 ipf_t **ipfp = ipf->ipf_ptphn; 2566 2567 ASSERT(MUTEX_HELD(&ipfb->ipfb_lock)); 2568 ASSERT(ipfp != NULL); 2569 ASSERT(ipf != NULL); 2570 2571 while (ipf != NULL && free_cnt-- > 0) { 2572 count = ipf->ipf_count; 2573 mp = ipf->ipf_mp; 2574 ipf = ipf->ipf_hash_next; 2575 for (tmp = mp; tmp; tmp = tmp->b_cont) { 2576 IP_REASS_SET_START(tmp, 0); 2577 IP_REASS_SET_END(tmp, 0); 2578 } 2579 atomic_add_32(&ill->ill_frag_count, -count); 2580 ASSERT(ipfb->ipfb_count >= count); 2581 ipfb->ipfb_count -= count; 2582 ASSERT(ipfb->ipfb_frag_pkts > 0); 2583 ipfb->ipfb_frag_pkts--; 2584 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); 2585 ip_drop_input("ipIfStatsReasmFails", mp, ill); 2586 freemsg(mp); 2587 } 2588 2589 if (ipf) 2590 ipf->ipf_ptphn = ipfp; 2591 ipfp[0] = ipf; 2592 } 2593 2594 #define ND_FORWARD_WARNING "The <if>:ip*_forwarding ndd variables are " \ 2595 "obsolete and may be removed in a future release of Solaris. Use " \ 2596 "ifconfig(1M) to manipulate the forwarding status of an interface." 2597 2598 /* 2599 * For obsolete per-interface forwarding configuration; 2600 * called in response to ND_GET. 2601 */ 2602 /* ARGSUSED */ 2603 static int 2604 nd_ill_forward_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr) 2605 { 2606 ill_t *ill = (ill_t *)cp; 2607 2608 cmn_err(CE_WARN, ND_FORWARD_WARNING); 2609 2610 (void) mi_mpprintf(mp, "%d", (ill->ill_flags & ILLF_ROUTER) != 0); 2611 return (0); 2612 } 2613 2614 /* 2615 * For obsolete per-interface forwarding configuration; 2616 * called in response to ND_SET. 
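 * (Historically reached via, e.g., "ndd -set /dev/ip hme0:ip_forwarding 1";
 * the interface name is illustrative. The supported equivalent is the
 * router/-router option of ifconfig(1M), which toggles ILLF_ROUTER.)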
2617 */ 2618 /* ARGSUSED */ 2619 static int 2620 nd_ill_forward_set(queue_t *q, mblk_t *mp, char *valuestr, caddr_t cp, 2621 cred_t *ioc_cr) 2622 { 2623 long value; 2624 int retval; 2625 ip_stack_t *ipst = CONNQ_TO_IPST(q); 2626 2627 cmn_err(CE_WARN, ND_FORWARD_WARNING); 2628 2629 if (ddi_strtol(valuestr, NULL, 10, &value) != 0 || 2630 value < 0 || value > 1) { 2631 return (EINVAL); 2632 } 2633 2634 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 2635 retval = ill_forward_set((ill_t *)cp, (value != 0)); 2636 rw_exit(&ipst->ips_ill_g_lock); 2637 return (retval); 2638 } 2639 2640 /* 2641 * Helper function for ill_forward_set(). 2642 */ 2643 static void 2644 ill_forward_set_on_ill(ill_t *ill, boolean_t enable) 2645 { 2646 ip_stack_t *ipst = ill->ill_ipst; 2647 2648 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock)); 2649 2650 ip1dbg(("ill_forward_set: %s %s forwarding on %s", 2651 (enable ? "Enabling" : "Disabling"), 2652 (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name)); 2653 mutex_enter(&ill->ill_lock); 2654 if (enable) 2655 ill->ill_flags |= ILLF_ROUTER; 2656 else 2657 ill->ill_flags &= ~ILLF_ROUTER; 2658 mutex_exit(&ill->ill_lock); 2659 if (ill->ill_isv6) 2660 ill_set_nce_router_flags(ill, enable); 2661 /* Notify routing socket listeners of this change. */ 2662 if (ill->ill_ipif != NULL) 2663 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); 2664 } 2665 2666 /* 2667 * Set an ill's ILLF_ROUTER flag appropriately. Send up RTS_IFINFO routing 2668 * socket messages for each interface whose flags we change. 2669 */ 2670 int 2671 ill_forward_set(ill_t *ill, boolean_t enable) 2672 { 2673 ipmp_illgrp_t *illg; 2674 ip_stack_t *ipst = ill->ill_ipst; 2675 2676 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock)); 2677 2678 if ((enable && (ill->ill_flags & ILLF_ROUTER)) || 2679 (!enable && !(ill->ill_flags & ILLF_ROUTER))) 2680 return (0); 2681 2682 if (IS_LOOPBACK(ill)) 2683 return (EINVAL); 2684 2685 if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) { 2686 /* 2687 * Update all of the interfaces in the group. 2688 */ 2689 illg = ill->ill_grp; 2690 ill = list_head(&illg->ig_if); 2691 for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) 2692 ill_forward_set_on_ill(ill, enable); 2693 2694 /* 2695 * Update the IPMP meta-interface. 2696 */ 2697 ill_forward_set_on_ill(ipmp_illgrp_ipmp_ill(illg), enable); 2698 return (0); 2699 } 2700 2701 ill_forward_set_on_ill(ill, enable); 2702 return (0); 2703 } 2704 2705 /* 2706 * Based on the ILLF_ROUTER flag of an ill, make sure all local nce's for 2707 * addresses assigned to the ill have the NCE_F_ISROUTER flag appropriately 2708 * set or clear. 2709 */ 2710 static void 2711 ill_set_nce_router_flags(ill_t *ill, boolean_t enable) 2712 { 2713 ipif_t *ipif; 2714 ncec_t *ncec; 2715 nce_t *nce; 2716 2717 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 2718 /* 2719 * NOTE: we match across the illgrp because nce's for 2720 * addresses on IPMP interfaces have an nce_ill that points to 2721 * the bound underlying ill. 2722 */ 2723 nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr); 2724 if (nce != NULL) { 2725 ncec = nce->nce_common; 2726 mutex_enter(&ncec->ncec_lock); 2727 if (enable) 2728 ncec->ncec_flags |= NCE_F_ISROUTER; 2729 else 2730 ncec->ncec_flags &= ~NCE_F_ISROUTER; 2731 mutex_exit(&ncec->ncec_lock); 2732 nce_refrele(nce); 2733 } 2734 } 2735 } 2736 2737 /* 2738 * Given an ill with a _valid_ name, add the ip_forwarding ndd variable 2739 * for this ill. Make sure the v6/v4 question has been answered about this 2740 * ill. 
The creation of this ndd variable is only for backwards compatibility.
 * The preferred way to control per-interface IP forwarding is through the
 * ILLF_ROUTER interface flag.
 */
static int
ill_set_ndd_name(ill_t *ill)
{
	char		*suffix;
	ip_stack_t	*ipst = ill->ill_ipst;

	ASSERT(IAM_WRITER_ILL(ill));

	if (ill->ill_isv6)
		suffix = ipv6_forward_suffix;
	else
		suffix = ipv4_forward_suffix;

	ill->ill_ndd_name = ill->ill_name + ill->ill_name_length;
	bcopy(ill->ill_name, ill->ill_ndd_name, ill->ill_name_length - 1);
	/*
	 * Copies over the '\0'.
	 * Note that strlen(suffix) is always bounded.
	 */
	bcopy(suffix, ill->ill_ndd_name + ill->ill_name_length - 1,
	    strlen(suffix) + 1);

	/*
	 * Use of the nd table requires holding the reader lock.
	 * Modifying the nd table thru nd_load/nd_unload requires
	 * the writer lock.
	 */
	rw_enter(&ipst->ips_ip_g_nd_lock, RW_WRITER);
	if (!nd_load(&ipst->ips_ip_g_nd, ill->ill_ndd_name, nd_ill_forward_get,
	    nd_ill_forward_set, (caddr_t)ill)) {
		/*
		 * If the nd_load failed, it only meant that it could not
		 * allocate a new bunch of room for further NDD expansion.
		 * Because of that, the ill_ndd_name will be set to NULL, and
		 * this interface is at the mercy of the global ip_forwarding
		 * variable.
		 */
		rw_exit(&ipst->ips_ip_g_nd_lock);
		ill->ill_ndd_name = NULL;
		return (ENOMEM);
	}
	rw_exit(&ipst->ips_ip_g_nd_lock);
	return (0);
}

/*
 * Initializes the context structure and returns the first ill in the list.
 * Currently start_list and end_list can have the following values:
 * MAX_G_HEADS		Traverse both IPV4 and IPV6 lists.
 * IP_V4_G_HEAD		Traverse IPV4 list only.
 * IP_V6_G_HEAD		Traverse IPV6 list only.
 */

/*
 * We don't check for CONDEMNED ills here. Caller must do that if
 * necessary under the ill lock.
 */
ill_t *
ill_first(int start_list, int end_list, ill_walk_context_t *ctx,
    ip_stack_t *ipst)
{
	ill_if_t *ifp;
	ill_t *ill;
	avl_tree_t *avl_tree;

	ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
	ASSERT(end_list <= MAX_G_HEADS && start_list >= 0);

	/*
	 * setup the lists to search
	 */
	if (end_list != MAX_G_HEADS) {
		ctx->ctx_current_list = start_list;
		ctx->ctx_last_list = end_list;
	} else {
		ctx->ctx_last_list = MAX_G_HEADS - 1;
		ctx->ctx_current_list = 0;
	}

	while (ctx->ctx_current_list <= ctx->ctx_last_list) {
		ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst);
		if (ifp != (ill_if_t *)
		    &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) {
			avl_tree = &ifp->illif_avl_by_ppa;
			ill = avl_first(avl_tree);
			/*
			 * ill is guaranteed to be non-NULL, otherwise ifp
			 * would not have existed.
			 */
			ASSERT(ill != NULL);
			return (ill);
		}
		ctx->ctx_current_list++;
	}

	return (NULL);
}

/*
 * Returns the next ill in the list. ill_first() must have been called
 * before calling ill_next() or bad things will happen.
 */

/*
 * We don't check for CONDEMNED ills here. Caller must do that if
 * necessary under the ill lock.
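 *
 * A typical walk looks like this (a sketch; ILL_START_WALK_ALL is the
 * usual ill_first() wrapper, and the caller holds ill_g_lock as reader):
 *
 *	ill_walk_context_t ctx;
 *	ill_t *ill;
 *
 *	for (ill = ILL_START_WALK_ALL(&ctx, ipst); ill != NULL;
 *	    ill = ill_next(&ctx, ill)) {
 *		... per-ill work ...
 *	}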
 */
ill_t *
ill_next(ill_walk_context_t *ctx, ill_t *lastill)
{
	ill_if_t *ifp;
	ill_t *ill;
	ip_stack_t	*ipst = lastill->ill_ipst;

	ASSERT(lastill->ill_ifptr != (ill_if_t *)
	    &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst));
	if ((ill = avl_walk(&lastill->ill_ifptr->illif_avl_by_ppa, lastill,
	    AVL_AFTER)) != NULL) {
		return (ill);
	}

	/* go to the next ill_ifp in the list. */
	ifp = lastill->ill_ifptr->illif_next;

	/* make sure not at end of circular list */
	while (ifp ==
	    (ill_if_t *)&IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) {
		if (++ctx->ctx_current_list > ctx->ctx_last_list)
			return (NULL);
		ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst);
	}

	return (avl_first(&ifp->illif_avl_by_ppa));
}

/*
 * Check interface name for correct format: [a-zA-Z]+[a-zA-Z0-9._]*[0-9]+
 * The final number (PPA) must not have any leading zeros. Upon success, a
 * pointer to the start of the PPA is returned; otherwise NULL is returned.
 */
static char *
ill_get_ppa_ptr(char *name)
{
	int namelen = strlen(name);
	int end_ndx = namelen - 1;
	int ppa_ndx, i;

	/*
	 * Check that the first character is [a-zA-Z], and that the last
	 * character is [0-9].
	 */
	if (namelen == 0 || !isalpha(name[0]) || !isdigit(name[end_ndx]))
		return (NULL);

	/*
	 * Set `ppa_ndx' to the PPA start, and check for leading zeroes.
	 */
	for (ppa_ndx = end_ndx; ppa_ndx > 0; ppa_ndx--)
		if (!isdigit(name[ppa_ndx - 1]))
			break;

	if (name[ppa_ndx] == '0' && ppa_ndx < end_ndx)
		return (NULL);

	/*
	 * Check that the intermediate characters are [a-zA-Z0-9._]
	 */
	for (i = 1; i < ppa_ndx; i++) {
		if (!isalpha(name[i]) && !isdigit(name[i]) &&
		    name[i] != '.' && name[i] != '_') {
			return (NULL);
		}
	}

	return (name + ppa_ndx);
}

/*
 * Use the avl tree to locate the ill.
 */
static ill_t *
ill_find_by_name(char *name, boolean_t isv6, ip_stack_t *ipst)
{
	char *ppa_ptr = NULL;
	int len;
	uint_t ppa;
	ill_t *ill = NULL;
	ill_if_t *ifp;
	int list;

	/*
	 * get ppa ptr
	 */
	if (isv6)
		list = IP_V6_G_HEAD;
	else
		list = IP_V4_G_HEAD;

	if ((ppa_ptr = ill_get_ppa_ptr(name)) == NULL) {
		return (NULL);
	}

	len = ppa_ptr - name + 1;

	ppa = stoi(&ppa_ptr);

	ifp = IP_VX_ILL_G_LIST(list, ipst);

	while (ifp != (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) {
		/*
		 * The match is done on len - 1 because name is not
		 * null-terminated at that point; it contains the ppa
		 * in addition to the interface name.
		 */
		if ((ifp->illif_name_len == len) &&
		    bcmp(ifp->illif_name, name, len - 1) == 0) {
			break;
		} else {
			ifp = ifp->illif_next;
		}
	}

	if (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) {
		/*
		 * Even the interface type does not exist.
		 */
		return (NULL);
	}

	ill = avl_find(&ifp->illif_avl_by_ppa, (void *) &ppa, NULL);
	if (ill != NULL) {
		mutex_enter(&ill->ill_lock);
		if (ILL_CAN_LOOKUP(ill)) {
			ill_refhold_locked(ill);
			mutex_exit(&ill->ill_lock);
			return (ill);
		}
		mutex_exit(&ill->ill_lock);
	}
	return (NULL);
}

/*
 * Comparison function for use with avl.
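 * (This is the ordering routine handed to avl_create() for
 * illif_avl_by_ppa in ill_glist_insert() below; the first argument is a
 * bare uint_t ppa used as the search key, the second an ill_t.)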
 */
static int
ill_compare_ppa(const void *ppa_ptr, const void *ill_ptr)
{
	uint_t ppa;
	uint_t ill_ppa;

	ASSERT(ppa_ptr != NULL && ill_ptr != NULL);

	ppa = *((uint_t *)ppa_ptr);
	ill_ppa = ((const ill_t *)ill_ptr)->ill_ppa;
	/*
	 * We want the ill with the lowest ppa to be on the top.
	 */
	if (ill_ppa < ppa)
		return (1);
	if (ill_ppa > ppa)
		return (-1);
	return (0);
}

/*
 * remove an interface type from the global list.
 */
static void
ill_delete_interface_type(ill_if_t *interface)
{
	ASSERT(interface != NULL);
	ASSERT(avl_numnodes(&interface->illif_avl_by_ppa) == 0);

	avl_destroy(&interface->illif_avl_by_ppa);
	if (interface->illif_ppa_arena != NULL)
		vmem_destroy(interface->illif_ppa_arena);

	remque(interface);

	mi_free(interface);
}

/*
 * remove ill from the global list.
 */
static void
ill_glist_delete(ill_t *ill)
{
	ip_stack_t	*ipst;
	phyint_t	*phyi;

	if (ill == NULL)
		return;
	ipst = ill->ill_ipst;
	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);

	/*
	 * If the ill was never inserted into the AVL tree
	 * we skip the if branch.
	 */
	if (ill->ill_ifptr != NULL) {
		/*
		 * remove from AVL tree and free ppa number
		 */
		avl_remove(&ill->ill_ifptr->illif_avl_by_ppa, ill);

		if (ill->ill_ifptr->illif_ppa_arena != NULL) {
			vmem_free(ill->ill_ifptr->illif_ppa_arena,
			    (void *)(uintptr_t)(ill->ill_ppa+1), 1);
		}
		if (avl_numnodes(&ill->ill_ifptr->illif_avl_by_ppa) == 0) {
			ill_delete_interface_type(ill->ill_ifptr);
		}

		/*
		 * Indicate ill is no longer in the list.
		 */
		ill->ill_ifptr = NULL;
		ill->ill_name_length = 0;
		ill->ill_name[0] = '\0';
		ill->ill_ppa = UINT_MAX;
	}

	/* Generate one last event for this ill. */
	ill_nic_event_dispatch(ill, 0, NE_UNPLUMB, ill->ill_name,
	    ill->ill_name_length);

	ASSERT(ill->ill_phyint != NULL);
	phyi = ill->ill_phyint;
	ill->ill_phyint = NULL;

	/*
	 * ill_init always allocates a phyint to store the copy
	 * of flags relevant to the phyint. At that point in time, we could
	 * not assign the name and hence phyint_illv4/v6 could not be
	 * initialized. Later in ipif_set_values, we assign the name to
	 * the ill, at which point in time we assign phyint_illv4/v6.
	 * Thus we don't rely on phyint_illv6 to be initialized always.
	 */
	if (ill->ill_flags & ILLF_IPV6)
		phyi->phyint_illv6 = NULL;
	else
		phyi->phyint_illv4 = NULL;

	if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) {
		rw_exit(&ipst->ips_ill_g_lock);
		return;
	}

	/*
	 * There are no ills left on this phyint; pull it out of the phyint
	 * avl trees, and free it.
	 */
	if (phyi->phyint_ifindex > 0) {
		avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
		    phyi);
		avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
		    phyi);
	}
	rw_exit(&ipst->ips_ill_g_lock);

	phyint_free(phyi);
}

/*
 * Allocate a ppa. If the number of plumbed interfaces of this type is
 * less than ill_no_arena, do a linear search to find an unused ppa.
 * When the number goes beyond ill_no_arena, switch to using an arena.
 * Note: a ppa value of zero cannot be allocated from the vmem arena as it
 * is the return value for an error condition, so allocation starts at one
 * and the stored value is decremented by one.
 */
static int
ill_alloc_ppa(ill_if_t *ifp, ill_t *ill)
{
	ill_t *tmp_ill;
	uint_t start, end;
	int ppa;

	if (ifp->illif_ppa_arena == NULL &&
	    (avl_numnodes(&ifp->illif_avl_by_ppa) + 1 > ill_no_arena)) {
		/*
		 * Create an arena.
		 */
		ifp->illif_ppa_arena = vmem_create(ifp->illif_name,
		    (void *)1, UINT_MAX - 1, 1, NULL, NULL,
		    NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
		/* allocate what has already been assigned */
		for (tmp_ill = avl_first(&ifp->illif_avl_by_ppa);
		    tmp_ill != NULL; tmp_ill = avl_walk(&ifp->illif_avl_by_ppa,
		    tmp_ill, AVL_AFTER)) {
			ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena,
			    1,		/* size */
			    1,		/* align/quantum */
			    0,		/* phase */
			    0,		/* nocross */
			    /* minaddr */
			    (void *)((uintptr_t)tmp_ill->ill_ppa + 1),
			    /* maxaddr */
			    (void *)((uintptr_t)tmp_ill->ill_ppa + 2),
			    VM_NOSLEEP|VM_FIRSTFIT);
			if (ppa == 0) {
				ip1dbg(("ill_alloc_ppa: ppa allocation"
				    " failed while switching"));
				vmem_destroy(ifp->illif_ppa_arena);
				ifp->illif_ppa_arena = NULL;
				break;
			}
		}
	}

	if (ifp->illif_ppa_arena != NULL) {
		if (ill->ill_ppa == UINT_MAX) {
			ppa = (int)(uintptr_t)vmem_alloc(ifp->illif_ppa_arena,
			    1, VM_NOSLEEP|VM_FIRSTFIT);
			if (ppa == 0)
				return (EAGAIN);
			ill->ill_ppa = --ppa;
		} else {
			ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena,
			    1,		/* size */
			    1,		/* align/quantum */
			    0,		/* phase */
			    0,		/* nocross */
			    (void *)(uintptr_t)(ill->ill_ppa + 1), /* minaddr */
			    (void *)(uintptr_t)(ill->ill_ppa + 2), /* maxaddr */
			    VM_NOSLEEP|VM_FIRSTFIT);
			/*
			 * Most likely the allocation failed because
			 * the requested ppa was in use.
			 */
			if (ppa == 0)
				return (EEXIST);
		}
		return (0);
	}

	/*
	 * No arena is in use and not enough (>ill_no_arena) interfaces have
	 * been plumbed to create one. Do a linear search to get an unused
	 * ppa.
	 */
	if (ill->ill_ppa == UINT_MAX) {
		end = UINT_MAX - 1;
		start = 0;
	} else {
		end = start = ill->ill_ppa;
	}

	tmp_ill = avl_find(&ifp->illif_avl_by_ppa, (void *)&start, NULL);
	while (tmp_ill != NULL && tmp_ill->ill_ppa == start) {
		if (start++ >= end) {
			if (ill->ill_ppa == UINT_MAX)
				return (EAGAIN);
			else
				return (EEXIST);
		}
		tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, tmp_ill, AVL_AFTER);
	}
	ill->ill_ppa = start;
	return (0);
}

/*
 * Insert ill into the list of configured ill's. Once this function
 * completes, the ill is globally visible and is available through lookups.
 * More precisely this happens after the caller drops the ill_g_lock.
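 * (The name passed in is the bare interface type, e.g. "lo"; the full
 * ill_name such as "lo0" is composed below from that name and the ppa
 * allocated by ill_alloc_ppa().)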
 */
static int
ill_glist_insert(ill_t *ill, char *name, boolean_t isv6)
{
	ill_if_t *ill_interface;
	avl_index_t where = 0;
	int error;
	int name_length;
	int index;
	boolean_t check_length = B_FALSE;
	ip_stack_t	*ipst = ill->ill_ipst;

	ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));

	name_length = mi_strlen(name) + 1;

	if (isv6)
		index = IP_V6_G_HEAD;
	else
		index = IP_V4_G_HEAD;

	ill_interface = IP_VX_ILL_G_LIST(index, ipst);
	/*
	 * Search for interface type based on name
	 */
	while (ill_interface != (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) {
		if ((ill_interface->illif_name_len == name_length) &&
		    (strcmp(ill_interface->illif_name, name) == 0)) {
			break;
		}
		ill_interface = ill_interface->illif_next;
	}

	/*
	 * Interface type not found, create one.
	 */
	if (ill_interface == (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) {
		ill_g_head_t ghead;

		/*
		 * allocate ill_if_t structure
		 */
		ill_interface = (ill_if_t *)mi_zalloc(sizeof (ill_if_t));
		if (ill_interface == NULL) {
			return (ENOMEM);
		}

		(void) strcpy(ill_interface->illif_name, name);
		ill_interface->illif_name_len = name_length;

		avl_create(&ill_interface->illif_avl_by_ppa,
		    ill_compare_ppa, sizeof (ill_t),
		    offsetof(struct ill_s, ill_avl_byppa));

		/*
		 * link the structure in the back to maintain order
		 * of configuration for ifconfig output.
		 */
		ghead = ipst->ips_ill_g_heads[index];
		insque(ill_interface, ghead.ill_g_list_tail);
	}

	if (ill->ill_ppa == UINT_MAX)
		check_length = B_TRUE;

	error = ill_alloc_ppa(ill_interface, ill);
	if (error != 0) {
		if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0)
			ill_delete_interface_type(ill->ill_ifptr);
		return (error);
	}

	/*
	 * When the ppa is chosen by the system, check that there is
	 * enough space to insert the ppa. If a specific ppa was passed
	 * in, this check is not required as the interface name passed
	 * in will have the right ppa in it.
	 */
	if (check_length) {
		/*
		 * UINT_MAX - 1 should fit in 10 chars, alloc 12 chars.
		 */
		char buf[sizeof (uint_t) * 3];

		/*
		 * convert ppa to string to calculate the amount of space
		 * required for it in the name.
		 */
		numtos(ill->ill_ppa, buf);

		/* Do we have enough space to insert ppa ?
*/ 3303 3304 if ((mi_strlen(name) + mi_strlen(buf) + 1) > LIFNAMSIZ) { 3305 /* Free ppa and interface type struct */ 3306 if (ill_interface->illif_ppa_arena != NULL) { 3307 vmem_free(ill_interface->illif_ppa_arena, 3308 (void *)(uintptr_t)(ill->ill_ppa+1), 1); 3309 } 3310 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0) 3311 ill_delete_interface_type(ill->ill_ifptr); 3312 3313 return (EINVAL); 3314 } 3315 } 3316 3317 (void) sprintf(ill->ill_name, "%s%u", name, ill->ill_ppa); 3318 ill->ill_name_length = mi_strlen(ill->ill_name) + 1; 3319 3320 (void) avl_find(&ill_interface->illif_avl_by_ppa, &ill->ill_ppa, 3321 &where); 3322 ill->ill_ifptr = ill_interface; 3323 avl_insert(&ill_interface->illif_avl_by_ppa, ill, where); 3324 3325 ill_phyint_reinit(ill); 3326 return (0); 3327 } 3328 3329 /* Initialize the per phyint ipsq used for serialization */ 3330 static boolean_t 3331 ipsq_init(ill_t *ill, boolean_t enter) 3332 { 3333 ipsq_t *ipsq; 3334 ipxop_t *ipx; 3335 3336 if ((ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP)) == NULL) 3337 return (B_FALSE); 3338 3339 ill->ill_phyint->phyint_ipsq = ipsq; 3340 ipx = ipsq->ipsq_xop = &ipsq->ipsq_ownxop; 3341 ipx->ipx_ipsq = ipsq; 3342 ipsq->ipsq_next = ipsq; 3343 ipsq->ipsq_phyint = ill->ill_phyint; 3344 mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, 0); 3345 mutex_init(&ipx->ipx_lock, NULL, MUTEX_DEFAULT, 0); 3346 ipsq->ipsq_ipst = ill->ill_ipst; /* No netstack_hold */ 3347 if (enter) { 3348 ipx->ipx_writer = curthread; 3349 ipx->ipx_forced = B_FALSE; 3350 ipx->ipx_reentry_cnt = 1; 3351 #ifdef DEBUG 3352 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); 3353 #endif 3354 } 3355 return (B_TRUE); 3356 } 3357 3358 /* 3359 * ill_init is called by ip_open when a device control stream is opened. 3360 * It does a few initializations, and shoots a DL_INFO_REQ message down 3361 * to the driver. The response is later picked up in ip_rput_dlpi and 3362 * used to set up default mechanisms for talking to the driver. (Always 3363 * called as writer.) 3364 * 3365 * If this function returns error, ip_open will call ip_close which in 3366 * turn will call ill_delete to clean up any memory allocated here that 3367 * is not yet freed. 3368 */ 3369 int 3370 ill_init(queue_t *q, ill_t *ill) 3371 { 3372 int count; 3373 dl_info_req_t *dlir; 3374 mblk_t *info_mp; 3375 uchar_t *frag_ptr; 3376 3377 /* 3378 * The ill is initialized to zero by mi_alloc*(). In addition 3379 * some fields already contain valid values, initialized in 3380 * ip_open(), before we reach here. 3381 */ 3382 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, 0); 3383 mutex_init(&ill->ill_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL); 3384 ill->ill_saved_ire_cnt = 0; 3385 3386 ill->ill_rq = q; 3387 ill->ill_wq = WR(q); 3388 3389 info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)), 3390 BPRI_HI); 3391 if (info_mp == NULL) 3392 return (ENOMEM); 3393 3394 /* 3395 * Allocate sufficient space to contain our fragment hash table and 3396 * the device name. 
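	 * The single allocation below is carved up as follows (a sketch of
	 * the assignments that follow it):
	 *
	 *	offset 0:			ill_frag_hash_tbl
	 *					(ILL_FRAG_HASH_TBL_SIZE bytes)
	 *	offset ILL_FRAG_HASH_TBL_SIZE:	ill_name, with room left over
	 *					for the ndd forwarding variable
	 *					name built by ill_set_ndd_name().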
	 */
	frag_ptr = (uchar_t *)mi_zalloc(ILL_FRAG_HASH_TBL_SIZE +
	    2 * LIFNAMSIZ + strlen(ipv6_forward_suffix));
	if (frag_ptr == NULL) {
		freemsg(info_mp);
		return (ENOMEM);
	}
	ill->ill_frag_ptr = frag_ptr;
	ill->ill_frag_free_num_pkts = 0;
	ill->ill_last_frag_clean_time = 0;
	ill->ill_frag_hash_tbl = (ipfb_t *)frag_ptr;
	ill->ill_name = (char *)(frag_ptr + ILL_FRAG_HASH_TBL_SIZE);
	for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
		mutex_init(&ill->ill_frag_hash_tbl[count].ipfb_lock,
		    NULL, MUTEX_DEFAULT, NULL);
	}

	ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t));
	if (ill->ill_phyint == NULL) {
		freemsg(info_mp);
		mi_free(frag_ptr);
		return (ENOMEM);
	}

	mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0);
	/*
	 * For now pretend this is a v4 ill. We need to set phyint_ill*
	 * at this point because of the following reason. If we can't
	 * enter the ipsq at some point and cv_wait, the writer that
	 * wakes us up tries to locate us using the list of all phyints
	 * in an ipsq and the ills from the phyint thru the phyint_ill*.
	 * If we don't set it now, we risk a missed wakeup.
	 */
	ill->ill_phyint->phyint_illv4 = ill;
	ill->ill_ppa = UINT_MAX;
	list_create(&ill->ill_nce, sizeof (nce_t), offsetof(nce_t, nce_node));

	ill_set_inputfn(ill);

	if (!ipsq_init(ill, B_TRUE)) {
		freemsg(info_mp);
		mi_free(frag_ptr);
		mi_free(ill->ill_phyint);
		return (ENOMEM);
	}

	ill->ill_state_flags |= ILL_LL_SUBNET_PENDING;

	/* Frag queue limit stuff */
	ill->ill_frag_count = 0;
	ill->ill_ipf_gen = 0;

	rw_init(&ill->ill_mcast_lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&ill->ill_mcast_serializer, NULL, MUTEX_DEFAULT, NULL);
	ill->ill_global_timer = INFINITY;
	ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0;
	ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0;
	ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
	ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL;

	/*
	 * Initialize IPv6 configuration variables. The IP module is always
	 * opened as an IPv4 module. Instead of tracking down the cases where
	 * it switches to do IPv6, we'll just initialize the IPv6 configuration
	 * here for convenience; this has no effect until the ill is set to do
	 * IPv6.
	 */
	ill->ill_reachable_time = ND_REACHABLE_TIME;
	ill->ill_xmit_count = ND_MAX_MULTICAST_SOLICIT;
	ill->ill_max_buf = ND_MAX_Q;
	ill->ill_refcnt = 0;

	/* Send down the Info Request to the driver. */
	info_mp->b_datap->db_type = M_PCPROTO;
	dlir = (dl_info_req_t *)info_mp->b_rptr;
	info_mp->b_wptr = (uchar_t *)&dlir[1];
	dlir->dl_primitive = DL_INFO_REQ;

	ill->ill_dlpi_pending = DL_PRIM_INVAL;

	qprocson(q);
	ill_dlpi_send(ill, info_mp);

	return (0);
}

/*
 * ill_dls_info
 * creates datalink socket info from the device.
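 * The resulting sockaddr_dl uses the conventional AF_LINK layout: the
 * interface name occupies sdl_data[0..sdl_nlen) with no terminating NUL,
 * immediately followed by sdl_alen bytes of link-layer address.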
 */
int
ill_dls_info(struct sockaddr_dl *sdl, const ill_t *ill)
{
	size_t	len;

	sdl->sdl_family = AF_LINK;
	sdl->sdl_index = ill_get_upper_ifindex(ill);
	sdl->sdl_type = ill->ill_type;
	ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data));
	len = strlen(sdl->sdl_data);
	ASSERT(len < 256);
	sdl->sdl_nlen = (uchar_t)len;
	sdl->sdl_alen = ill->ill_phys_addr_length;
	sdl->sdl_slen = 0;
	if (ill->ill_phys_addr_length != 0 && ill->ill_phys_addr != NULL)
		bcopy(ill->ill_phys_addr, &sdl->sdl_data[len], sdl->sdl_alen);

	return (sizeof (struct sockaddr_dl));
}

/*
 * ill_xarp_info
 * creates xarp info from the device.
 */
static int
ill_xarp_info(struct sockaddr_dl *sdl, ill_t *ill)
{
	sdl->sdl_family = AF_LINK;
	sdl->sdl_index = ill->ill_phyint->phyint_ifindex;
	sdl->sdl_type = ill->ill_type;
	ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data));
	sdl->sdl_nlen = (uchar_t)mi_strlen(sdl->sdl_data);
	sdl->sdl_alen = ill->ill_phys_addr_length;
	sdl->sdl_slen = 0;
	return (sdl->sdl_nlen);
}

static int
loopback_kstat_update(kstat_t *ksp, int rw)
{
	kstat_named_t	*kn;
	netstackid_t	stackid;
	netstack_t	*ns;
	ip_stack_t	*ipst;

	if (ksp == NULL || ksp->ks_data == NULL)
		return (EIO);

	if (rw == KSTAT_WRITE)
		return (EACCES);

	kn = KSTAT_NAMED_PTR(ksp);
	stackid = (netstackid_t)(uintptr_t)ksp->ks_private;

	ns = netstack_find_by_stackid(stackid);
	if (ns == NULL)
		return (-1);

	ipst = ns->netstack_ip;
	if (ipst == NULL) {
		netstack_rele(ns);
		return (-1);
	}
	kn[0].value.ui32 = ipst->ips_loopback_packets;
	kn[1].value.ui32 = ipst->ips_loopback_packets;
	netstack_rele(ns);
	return (0);
}

/*
 * Has ifindex been plumbed already?
 */
static boolean_t
phyint_exists(uint_t index, ip_stack_t *ipst)
{
	ASSERT(index != 0);
	ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));

	return (avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
	    &index, NULL) != NULL);
}

/* Pick a unique ifindex */
boolean_t
ip_assign_ifindex(uint_t *indexp, ip_stack_t *ipst)
{
	uint_t starting_index;

	if (!ipst->ips_ill_index_wrap) {
		*indexp = ipst->ips_ill_index++;
		if (ipst->ips_ill_index == 0) {
			/* Reached the uint_t limit; wrap next time. */
			ipst->ips_ill_index_wrap = B_TRUE;
		}
		return (B_TRUE);
	}

	/*
	 * Start reusing unused indexes. Note that we hold the ill_g_lock
	 * at this point and don't want to call any function that attempts
	 * to get the lock again.
	 */
	starting_index = ipst->ips_ill_index++;
	for (; ipst->ips_ill_index != starting_index; ipst->ips_ill_index++) {
		if (ipst->ips_ill_index != 0 &&
		    !phyint_exists(ipst->ips_ill_index, ipst)) {
			/* found unused index - use it */
			*indexp = ipst->ips_ill_index;
			return (B_TRUE);
		}
	}

	/*
	 * All interface indices are in use.
	 */
	return (B_FALSE);
}

/*
 * Assign a unique interface index for the phyint.
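 * The index lives in the phyint so that the IPv4 and IPv6 ills of one
 * physical interface report the same value (this is the value utilities
 * see via, e.g., if_nametoindex(3SOCKET)).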
3607 */ 3608 static boolean_t 3609 phyint_assign_ifindex(phyint_t *phyi, ip_stack_t *ipst) 3610 { 3611 ASSERT(phyi->phyint_ifindex == 0); 3612 return (ip_assign_ifindex(&phyi->phyint_ifindex, ipst)); 3613 } 3614 3615 /* 3616 * Initialize the flags on `phyi' as per the provided mactype. 3617 */ 3618 static void 3619 phyint_flags_init(phyint_t *phyi, t_uscalar_t mactype) 3620 { 3621 uint64_t flags = 0; 3622 3623 /* 3624 * Initialize PHYI_RUNNING and PHYI_FAILED. For non-IPMP interfaces, 3625 * we always presume the underlying hardware is working and set 3626 * PHYI_RUNNING (if it's not, the driver will subsequently send a 3627 * DL_NOTE_LINK_DOWN message). For IPMP interfaces, at initialization 3628 * there are no active interfaces in the group so we set PHYI_FAILED. 3629 */ 3630 if (mactype == SUNW_DL_IPMP) 3631 flags |= PHYI_FAILED; 3632 else 3633 flags |= PHYI_RUNNING; 3634 3635 switch (mactype) { 3636 case SUNW_DL_VNI: 3637 flags |= PHYI_VIRTUAL; 3638 break; 3639 case SUNW_DL_IPMP: 3640 flags |= PHYI_IPMP; 3641 break; 3642 case DL_LOOP: 3643 flags |= (PHYI_LOOPBACK | PHYI_VIRTUAL); 3644 break; 3645 } 3646 3647 mutex_enter(&phyi->phyint_lock); 3648 phyi->phyint_flags |= flags; 3649 mutex_exit(&phyi->phyint_lock); 3650 } 3651 3652 /* 3653 * Return a pointer to the ill which matches the supplied name. Note that 3654 * the ill name length includes the null termination character. (May be 3655 * called as writer.) 3656 * If do_alloc and the interface is "lo0" it will be automatically created. 3657 * Cannot bump up reference on condemned ills. So dup detect can't be done 3658 * using this func. 3659 */ 3660 ill_t * 3661 ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, 3662 boolean_t *did_alloc, ip_stack_t *ipst) 3663 { 3664 ill_t *ill; 3665 ipif_t *ipif; 3666 ipsq_t *ipsq; 3667 kstat_named_t *kn; 3668 boolean_t isloopback; 3669 in6_addr_t ov6addr; 3670 3671 isloopback = mi_strcmp(name, ipif_loopback_name) == 0; 3672 3673 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3674 ill = ill_find_by_name(name, isv6, ipst); 3675 rw_exit(&ipst->ips_ill_g_lock); 3676 if (ill != NULL) 3677 return (ill); 3678 3679 /* 3680 * Couldn't find it. Does this happen to be a lookup for the 3681 * loopback device and are we allowed to allocate it? 3682 */ 3683 if (!isloopback || !do_alloc) 3684 return (NULL); 3685 3686 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 3687 ill = ill_find_by_name(name, isv6, ipst); 3688 if (ill != NULL) { 3689 rw_exit(&ipst->ips_ill_g_lock); 3690 return (ill); 3691 } 3692 3693 /* Create the loopback device on demand */ 3694 ill = (ill_t *)(mi_alloc(sizeof (ill_t) + 3695 sizeof (ipif_loopback_name), BPRI_MED)); 3696 if (ill == NULL) 3697 goto done; 3698 3699 *ill = ill_null; 3700 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, NULL); 3701 ill->ill_ipst = ipst; 3702 list_create(&ill->ill_nce, sizeof (nce_t), offsetof(nce_t, nce_node)); 3703 netstack_hold(ipst->ips_netstack); 3704 /* 3705 * For exclusive stacks we set the zoneid to zero 3706 * to make IP operate as if in the global zone. 
3707 */
3708 ill->ill_zoneid = GLOBAL_ZONEID;
3709
3710 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t));
3711 if (ill->ill_phyint == NULL)
3712 goto done;
3713
3714 if (isv6)
3715 ill->ill_phyint->phyint_illv6 = ill;
3716 else
3717 ill->ill_phyint->phyint_illv4 = ill;
3718 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0);
3719 phyint_flags_init(ill->ill_phyint, DL_LOOP);
3720
3721 if (isv6) {
3722 ill->ill_isv6 = B_TRUE;
3723 ill->ill_max_frag = ip_loopback_mtu_v6plus;
3724 } else {
3725 ill->ill_max_frag = ip_loopback_mtuplus;
3726 }
3727 if (!ill_allocate_mibs(ill))
3728 goto done;
3729 ill->ill_current_frag = ill->ill_max_frag;
3730 ill->ill_mtu = ill->ill_max_frag; /* Initial value */
3731 /*
3732 * ipif_loopback_name can't be pointed at directly because it's used
3733 * by both the ipv4 and ipv6 interfaces. When the ill is removed
3734 * from the glist, ill_glist_delete() sets the first character of
3735 * ill_name to '\0'.
3736 */
3737 ill->ill_name = (char *)ill + sizeof (*ill);
3738 (void) strcpy(ill->ill_name, ipif_loopback_name);
3739 ill->ill_name_length = sizeof (ipif_loopback_name);
3740 /* Set ill_dlpi_pending for ipsq_current_finish() to work properly */
3741 ill->ill_dlpi_pending = DL_PRIM_INVAL;
3742
3743 rw_init(&ill->ill_mcast_lock, NULL, RW_DEFAULT, NULL);
3744 mutex_init(&ill->ill_mcast_serializer, NULL, MUTEX_DEFAULT, NULL);
3745 ill->ill_global_timer = INFINITY;
3746 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0;
3747 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0;
3748 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
3749 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL;
3750
3751 /* No resolver here. */
3752 ill->ill_net_type = IRE_LOOPBACK;
3753
3754 /* Initialize the ipsq */
3755 if (!ipsq_init(ill, B_FALSE))
3756 goto done;
3757
3758 ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE, B_TRUE, NULL);
3759 if (ipif == NULL)
3760 goto done;
3761
3762 ill->ill_flags = ILLF_MULTICAST;
3763
3764 ov6addr = ipif->ipif_v6lcl_addr;
3765 /* Set up default loopback address and mask. */
3766 if (!isv6) {
3767 ipaddr_t inaddr_loopback = htonl(INADDR_LOOPBACK);
3768
3769 IN6_IPADDR_TO_V4MAPPED(inaddr_loopback, &ipif->ipif_v6lcl_addr);
3770 V4MASK_TO_V6(htonl(IN_CLASSA_NET), ipif->ipif_v6net_mask);
3771 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
3772 ipif->ipif_v6subnet);
3773 ill->ill_flags |= ILLF_IPV4;
3774 } else {
3775 ipif->ipif_v6lcl_addr = ipv6_loopback;
3776 ipif->ipif_v6net_mask = ipv6_all_ones;
3777 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
3778 ipif->ipif_v6subnet);
3779 ill->ill_flags |= ILLF_IPV6;
3780 }
3781
3782 /*
3783 * Chain us in at the end of the ill list. Hold the ill
3784 * before we make it globally visible. 1 for the lookup.
3785 */
3786 ill->ill_refcnt = 0;
3787 ill_refhold(ill);
3788
3789 ill->ill_frag_count = 0;
3790 ill->ill_frag_free_num_pkts = 0;
3791 ill->ill_last_frag_clean_time = 0;
3792
3793 ipsq = ill->ill_phyint->phyint_ipsq;
3794
3795 ill_set_inputfn(ill);
3796
3797 if (ill_glist_insert(ill, "lo", isv6) != 0)
3798 cmn_err(CE_PANIC, "cannot insert loopback interface");
3799
3800 /* Let SCTP know so that it can add this to its list */
3801 sctp_update_ill(ill, SCTP_ILL_INSERT);
3802
3803 /*
3804 * We have already assigned ipif_v6lcl_addr above, but we need to
3805 * call sctp_update_ipif_addr() after SCTP_ILL_INSERT, which
3806 * must happen after ill_glist_insert() since we need the
3807 * ill_index set. Pass on ipv6_loopback as the old address.
3808 */ 3809 sctp_update_ipif_addr(ipif, ov6addr); 3810 3811 ip_rts_newaddrmsg(RTM_CHGADDR, 0, ipif, RTSQ_DEFAULT); 3812 3813 /* 3814 * ill_glist_insert() -> ill_phyint_reinit() may have merged IPSQs. 3815 * If so, free our original one. 3816 */ 3817 if (ipsq != ill->ill_phyint->phyint_ipsq) 3818 ipsq_delete(ipsq); 3819 3820 if (ipst->ips_loopback_ksp == NULL) { 3821 /* Export loopback interface statistics */ 3822 ipst->ips_loopback_ksp = kstat_create_netstack("lo", 0, 3823 ipif_loopback_name, "net", 3824 KSTAT_TYPE_NAMED, 2, 0, 3825 ipst->ips_netstack->netstack_stackid); 3826 if (ipst->ips_loopback_ksp != NULL) { 3827 ipst->ips_loopback_ksp->ks_update = 3828 loopback_kstat_update; 3829 kn = KSTAT_NAMED_PTR(ipst->ips_loopback_ksp); 3830 kstat_named_init(&kn[0], "ipackets", KSTAT_DATA_UINT32); 3831 kstat_named_init(&kn[1], "opackets", KSTAT_DATA_UINT32); 3832 ipst->ips_loopback_ksp->ks_private = 3833 (void *)(uintptr_t)ipst->ips_netstack-> 3834 netstack_stackid; 3835 kstat_install(ipst->ips_loopback_ksp); 3836 } 3837 } 3838 3839 *did_alloc = B_TRUE; 3840 rw_exit(&ipst->ips_ill_g_lock); 3841 ill_nic_event_dispatch(ill, MAP_IPIF_ID(ill->ill_ipif->ipif_id), 3842 NE_PLUMB, ill->ill_name, ill->ill_name_length); 3843 return (ill); 3844 done: 3845 if (ill != NULL) { 3846 if (ill->ill_phyint != NULL) { 3847 ipsq = ill->ill_phyint->phyint_ipsq; 3848 if (ipsq != NULL) { 3849 ipsq->ipsq_phyint = NULL; 3850 ipsq_delete(ipsq); 3851 } 3852 mi_free(ill->ill_phyint); 3853 } 3854 ill_free_mib(ill); 3855 if (ill->ill_ipst != NULL) 3856 netstack_rele(ill->ill_ipst->ips_netstack); 3857 mi_free(ill); 3858 } 3859 rw_exit(&ipst->ips_ill_g_lock); 3860 return (NULL); 3861 } 3862 3863 /* 3864 * For IPP calls - use the ip_stack_t for global stack. 3865 */ 3866 ill_t * 3867 ill_lookup_on_ifindex_global_instance(uint_t index, boolean_t isv6) 3868 { 3869 ip_stack_t *ipst; 3870 ill_t *ill; 3871 3872 ipst = netstack_find_by_stackid(GLOBAL_NETSTACKID)->netstack_ip; 3873 if (ipst == NULL) { 3874 cmn_err(CE_WARN, "No ip_stack_t for zoneid zero!\n"); 3875 return (NULL); 3876 } 3877 3878 ill = ill_lookup_on_ifindex(index, isv6, ipst); 3879 netstack_rele(ipst->ips_netstack); 3880 return (ill); 3881 } 3882 3883 /* 3884 * Return a pointer to the ill which matches the index and IP version type. 3885 */ 3886 ill_t * 3887 ill_lookup_on_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst) 3888 { 3889 ill_t *ill; 3890 phyint_t *phyi; 3891 3892 /* 3893 * Indexes are stored in the phyint - a common structure 3894 * to both IPv4 and IPv6. 3895 */ 3896 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3897 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 3898 (void *) &index, NULL); 3899 if (phyi != NULL) { 3900 ill = isv6 ? phyi->phyint_illv6: phyi->phyint_illv4; 3901 if (ill != NULL) { 3902 mutex_enter(&ill->ill_lock); 3903 if (!ILL_IS_CONDEMNED(ill)) { 3904 ill_refhold_locked(ill); 3905 mutex_exit(&ill->ill_lock); 3906 rw_exit(&ipst->ips_ill_g_lock); 3907 return (ill); 3908 } 3909 mutex_exit(&ill->ill_lock); 3910 } 3911 } 3912 rw_exit(&ipst->ips_ill_g_lock); 3913 return (NULL); 3914 } 3915 3916 /* 3917 * Verify whether or not an interface index is valid. 3918 * It can be zero (meaning "reset") or an interface index assigned 3919 * to a non-VNI interface. (We don't use VNI interface to send packets.) 
3920 */ 3921 boolean_t 3922 ip_ifindex_valid(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst) 3923 { 3924 ill_t *ill; 3925 3926 if (ifindex == 0) 3927 return (B_TRUE); 3928 3929 ill = ill_lookup_on_ifindex(ifindex, isv6, ipst); 3930 if (ill == NULL) 3931 return (B_FALSE); 3932 if (IS_VNI(ill)) { 3933 ill_refrele(ill); 3934 return (B_FALSE); 3935 } 3936 ill_refrele(ill); 3937 return (B_TRUE); 3938 } 3939 3940 /* 3941 * Return the ifindex next in sequence after the passed in ifindex. 3942 * If there is no next ifindex for the given protocol, return 0. 3943 */ 3944 uint_t 3945 ill_get_next_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst) 3946 { 3947 phyint_t *phyi; 3948 phyint_t *phyi_initial; 3949 uint_t ifindex; 3950 3951 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3952 3953 if (index == 0) { 3954 phyi = avl_first( 3955 &ipst->ips_phyint_g_list->phyint_list_avl_by_index); 3956 } else { 3957 phyi = phyi_initial = avl_find( 3958 &ipst->ips_phyint_g_list->phyint_list_avl_by_index, 3959 (void *) &index, NULL); 3960 } 3961 3962 for (; phyi != NULL; 3963 phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 3964 phyi, AVL_AFTER)) { 3965 /* 3966 * If we're not returning the first interface in the tree 3967 * and we still haven't moved past the phyint_t that 3968 * corresponds to index, avl_walk needs to be called again 3969 */ 3970 if (!((index != 0) && (phyi == phyi_initial))) { 3971 if (isv6) { 3972 if ((phyi->phyint_illv6) && 3973 ILL_CAN_LOOKUP(phyi->phyint_illv6) && 3974 (phyi->phyint_illv6->ill_isv6 == 1)) 3975 break; 3976 } else { 3977 if ((phyi->phyint_illv4) && 3978 ILL_CAN_LOOKUP(phyi->phyint_illv4) && 3979 (phyi->phyint_illv4->ill_isv6 == 0)) 3980 break; 3981 } 3982 } 3983 } 3984 3985 rw_exit(&ipst->ips_ill_g_lock); 3986 3987 if (phyi != NULL) 3988 ifindex = phyi->phyint_ifindex; 3989 else 3990 ifindex = 0; 3991 3992 return (ifindex); 3993 } 3994 3995 /* 3996 * Return the ifindex for the named interface. 3997 * If there is no next ifindex for the interface, return 0. 3998 */ 3999 uint_t 4000 ill_get_ifindex_by_name(char *name, ip_stack_t *ipst) 4001 { 4002 phyint_t *phyi; 4003 avl_index_t where = 0; 4004 uint_t ifindex; 4005 4006 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4007 4008 if ((phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 4009 name, &where)) == NULL) { 4010 rw_exit(&ipst->ips_ill_g_lock); 4011 return (0); 4012 } 4013 4014 ifindex = phyi->phyint_ifindex; 4015 4016 rw_exit(&ipst->ips_ill_g_lock); 4017 4018 return (ifindex); 4019 } 4020 4021 /* 4022 * Return the ifindex to be used by upper layer protocols for instance 4023 * for IPV6_RECVPKTINFO. If IPMP this is the one for the upper ill. 4024 */ 4025 uint_t 4026 ill_get_upper_ifindex(const ill_t *ill) 4027 { 4028 if (IS_UNDER_IPMP(ill)) 4029 return (ipmp_ill_get_ipmp_ifindex(ill)); 4030 else 4031 return (ill->ill_phyint->phyint_ifindex); 4032 } 4033 4034 4035 /* 4036 * Obtain a reference to the ill. The ill_refcnt is a dynamic refcnt 4037 * that gives a running thread a reference to the ill. This reference must be 4038 * released by the thread when it is done accessing the ill and related 4039 * objects. ill_refcnt can not be used to account for static references 4040 * such as other structures pointing to an ill. Callers must generally 4041 * check whether an ill can be refheld by using ILL_CAN_LOOKUP macros 4042 * or be sure that the ill is not being deleted or changing state before 4043 * calling the refhold functions. 
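* For example, a minimal hold/use/release pattern (illustrative only):
*
*	ill = ill_lookup_on_ifindex(ifindex, isv6, ipst);
*	if (ill != NULL) {
*		... use the ill ...
*		ill_refrele(ill);
*	}
*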
A non-zero ill_refcnt ensures that the
4044 * ill won't change any of its critical state such as address, netmask etc.
4045 */
4046 void
4047 ill_refhold(ill_t *ill)
4048 {
4049 mutex_enter(&ill->ill_lock);
4050 ill->ill_refcnt++;
4051 ILL_TRACE_REF(ill);
4052 mutex_exit(&ill->ill_lock);
4053 }
4054
4055 void
4056 ill_refhold_locked(ill_t *ill)
4057 {
4058 ASSERT(MUTEX_HELD(&ill->ill_lock));
4059 ill->ill_refcnt++;
4060 ILL_TRACE_REF(ill);
4061 }
4062
4063 /* Returns true if we managed to get a refhold */
4064 boolean_t
4065 ill_check_and_refhold(ill_t *ill)
4066 {
4067 mutex_enter(&ill->ill_lock);
4068 if (!ILL_IS_CONDEMNED(ill)) {
4069 ill_refhold_locked(ill);
4070 mutex_exit(&ill->ill_lock);
4071 return (B_TRUE);
4072 }
4073 mutex_exit(&ill->ill_lock);
4074 return (B_FALSE);
4075 }
4076
4077 /*
4078 * Must not be called while holding any locks. Otherwise if this is
4079 * the last reference to be released, there is a chance of recursive mutex
4080 * panic due to ill_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying
4081 * to restart an ioctl.
4082 */
4083 void
4084 ill_refrele(ill_t *ill)
4085 {
4086 mutex_enter(&ill->ill_lock);
4087 ASSERT(ill->ill_refcnt != 0);
4088 ill->ill_refcnt--;
4089 ILL_UNTRACE_REF(ill);
4090 if (ill->ill_refcnt != 0) {
4091 /* Every ire pointing to the ill adds 1 to ill_refcnt */
4092 mutex_exit(&ill->ill_lock);
4093 return;
4094 }
4095
4096 /* Drops the ill_lock */
4097 ipif_ill_refrele_tail(ill);
4098 }
4099
4100 /*
4101 * Obtain a weak reference count on the ill. This reference ensures the
4102 * ill won't be freed, but the ill may change any of its critical state
4103 * such as netmask, address etc. Returns an error if the ill has started
4104 * closing.
4105 */
4106 boolean_t
4107 ill_waiter_inc(ill_t *ill)
4108 {
4109 mutex_enter(&ill->ill_lock);
4110 if (ill->ill_state_flags & ILL_CONDEMNED) {
4111 mutex_exit(&ill->ill_lock);
4112 return (B_FALSE);
4113 }
4114 ill->ill_waiters++;
4115 mutex_exit(&ill->ill_lock);
4116 return (B_TRUE);
4117 }
4118
4119 void
4120 ill_waiter_dcr(ill_t *ill)
4121 {
4122 mutex_enter(&ill->ill_lock);
4123 ill->ill_waiters--;
4124 if (ill->ill_waiters == 0)
4125 cv_broadcast(&ill->ill_cv);
4126 mutex_exit(&ill->ill_lock);
4127 }
4128
4129 /*
4130 * ip_ll_subnet_defaults is called when we get the DL_INFO_ACK back from the
4131 * driver. We construct best guess defaults for lower level information that
4132 * we need. If an interface is brought up without injection of any overriding
4133 * information from outside, we have to be ready to go with these defaults.
4134 * When we get the first DL_INFO_ACK (from ip_open() sending a DL_INFO_REQ)
4135 * we primarily want the dl_provider_style.
4136 * The subsequent DL_INFO_ACK is received after doing a DL_ATTACH and DL_BIND
4137 * at which point we assume the other part of the information is valid.
4138 */
4139 void
4140 ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp)
4141 {
4142 uchar_t *brdcst_addr;
4143 uint_t brdcst_addr_length, phys_addr_length;
4144 t_scalar_t sap_length;
4145 dl_info_ack_t *dlia;
4146 ip_m_t *ipm;
4147 dl_qos_cl_sel1_t *sel1;
4148 int min_mtu;
4149
4150 ASSERT(IAM_WRITER_ILL(ill));
4151
4152 /*
4153 * Till the ill is fully up the ill is not globally visible.
4154 * So no need for a lock.
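* (The caller is the exclusive writer on the ill at this point; see
* the IAM_WRITER_ILL() assertion above.)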
4155 */
4156 dlia = (dl_info_ack_t *)mp->b_rptr;
4157 ill->ill_mactype = dlia->dl_mac_type;
4158
4159 ipm = ip_m_lookup(dlia->dl_mac_type);
4160 if (ipm == NULL) {
4161 ipm = ip_m_lookup(DL_OTHER);
4162 ASSERT(ipm != NULL);
4163 }
4164 ill->ill_media = ipm;
4165
4166 /*
4167 * When the new DLPI stuff is ready we'll pull lengths
4168 * from dlia.
4169 */
4170 if (dlia->dl_version == DL_VERSION_2) {
4171 brdcst_addr_length = dlia->dl_brdcst_addr_length;
4172 brdcst_addr = mi_offset_param(mp, dlia->dl_brdcst_addr_offset,
4173 brdcst_addr_length);
4174 if (brdcst_addr == NULL) {
4175 brdcst_addr_length = 0;
4176 }
4177 sap_length = dlia->dl_sap_length;
4178 phys_addr_length = dlia->dl_addr_length - ABS(sap_length);
4179 ip1dbg(("ip: bcast_len %d, sap_len %d, phys_len %d\n",
4180 brdcst_addr_length, sap_length, phys_addr_length));
4181 } else {
4182 brdcst_addr_length = 6;
4183 brdcst_addr = ip_six_byte_all_ones;
4184 sap_length = -2;
4185 phys_addr_length = brdcst_addr_length;
4186 }
4187
4188 ill->ill_bcast_addr_length = brdcst_addr_length;
4189 ill->ill_phys_addr_length = phys_addr_length;
4190 ill->ill_sap_length = sap_length;
4191
4192 /*
4193 * Synthetic DLPI types such as SUNW_DL_IPMP specify a zero SDU,
4194 * but we must ensure a minimum IP MTU is used since other bits of
4195 * IP will fly apart otherwise.
4196 */
4197 min_mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU;
4198 ill->ill_max_frag = MAX(min_mtu, dlia->dl_max_sdu);
4199 ill->ill_current_frag = ill->ill_max_frag;
4200 ill->ill_mtu = ill->ill_max_frag;
4201
4202 ill->ill_type = ipm->ip_m_type;
4203
4204 if (!ill->ill_dlpi_style_set) {
4205 if (dlia->dl_provider_style == DL_STYLE2)
4206 ill->ill_needs_attach = 1;
4207
4208 phyint_flags_init(ill->ill_phyint, ill->ill_mactype);
4209
4210 /*
4211 * Allocate the first ipif on this ill. We don't delay it
4212 * further as ioctl handling assumes at least one ipif exists.
4213 *
4214 * At this point we don't know whether the ill is v4 or v6.
4215 * We will know this when the SIOCSLIFNAME happens and
4216 * the correct value for ill_isv6 will be assigned in
4217 * ipif_set_values(). We need to hold the ill lock and
4218 * clear the ILL_LL_SUBNET_PENDING flag and atomically do
4219 * the wakeup.
4220 */
4221 (void) ipif_allocate(ill, 0, IRE_LOCAL,
4222 dlia->dl_provider_style != DL_STYLE2, B_TRUE, NULL);
4223 mutex_enter(&ill->ill_lock);
4224 ASSERT(ill->ill_dlpi_style_set == 0);
4225 ill->ill_dlpi_style_set = 1;
4226 ill->ill_state_flags &= ~ILL_LL_SUBNET_PENDING;
4227 cv_broadcast(&ill->ill_cv);
4228 mutex_exit(&ill->ill_lock);
4229 freemsg(mp);
4230 return;
4231 }
4232 ASSERT(ill->ill_ipif != NULL);
4233 /*
4234 * We know whether it is IPv4 or IPv6 now, as this is the
4235 * second DL_INFO_ACK we are receiving in response to the
4236 * DL_INFO_REQ sent in ipif_set_values.
4237 */
4238 ill->ill_sap = (ill->ill_isv6) ? ipm->ip_m_ipv6sap : ipm->ip_m_ipv4sap;
4239 /*
4240 * Clear all the flags that were set based on ill_bcast_addr_length
4241 * and ill_phys_addr_length (in ipif_set_values) as these could have
4242 * changed now and we need to re-evaluate.
4243 */
4244 ill->ill_flags &= ~(ILLF_MULTICAST | ILLF_NONUD | ILLF_NOARP);
4245 ill->ill_ipif->ipif_flags &= ~(IPIF_BROADCAST | IPIF_POINTOPOINT);
4246
4247 /*
4248 * Free ill_bcast_mp as things could have changed now.
4249 *
4250 * NOTE: The IPMP meta-interface is special-cased because it starts
4251 * with no underlying interfaces (and thus an unknown broadcast
4252 * address length), but we enforce that an interface is broadcast-
4253 * capable as part of allowing it to join a group.
4254 */
4255 if (ill->ill_bcast_addr_length == 0 && !IS_IPMP(ill)) {
4256 if (ill->ill_bcast_mp != NULL)
4257 freemsg(ill->ill_bcast_mp);
4258 ill->ill_net_type = IRE_IF_NORESOLVER;
4259
4260 ill->ill_bcast_mp = ill_dlur_gen(NULL,
4261 ill->ill_phys_addr_length,
4262 ill->ill_sap,
4263 ill->ill_sap_length);
4264
4265 if (ill->ill_isv6)
4266 /*
4267 * Note: xresolv interfaces will eventually need NOARP
4268 * set here as well, but that will require those
4269 * external resolvers to have some knowledge of
4270 * that flag and act appropriately. Not to be changed
4271 * at present.
4272 */
4273 ill->ill_flags |= ILLF_NONUD;
4274 else
4275 ill->ill_flags |= ILLF_NOARP;
4276
4277 if (ill->ill_mactype == SUNW_DL_VNI) {
4278 ill->ill_ipif->ipif_flags |= IPIF_NOXMIT;
4279 } else if (ill->ill_phys_addr_length == 0 ||
4280 ill->ill_mactype == DL_IPV4 ||
4281 ill->ill_mactype == DL_IPV6) {
4282 /*
4283 * The underlying link is point-to-point, so mark the
4284 * interface as such. We can do IP multicast over
4285 * such a link since it transmits all network-layer
4286 * packets to the remote side the same way.
4287 */
4288 ill->ill_flags |= ILLF_MULTICAST;
4289 ill->ill_ipif->ipif_flags |= IPIF_POINTOPOINT;
4290 }
4291 } else {
4292 ill->ill_net_type = IRE_IF_RESOLVER;
4293 if (ill->ill_bcast_mp != NULL)
4294 freemsg(ill->ill_bcast_mp);
4295 ill->ill_bcast_mp = ill_dlur_gen(brdcst_addr,
4296 ill->ill_bcast_addr_length, ill->ill_sap,
4297 ill->ill_sap_length);
4298 /*
4299 * Later detect lack of DLPI driver multicast
4300 * capability by catching DL_ENABMULTI errors in
4301 * ip_rput_dlpi.
4302 */
4303 ill->ill_flags |= ILLF_MULTICAST;
4304 if (!ill->ill_isv6)
4305 ill->ill_ipif->ipif_flags |= IPIF_BROADCAST;
4306 }
4307
4308 /* For IPMP, PHYI_IPMP should already be set by phyint_flags_init() */
4309 if (ill->ill_mactype == SUNW_DL_IPMP)
4310 ASSERT(ill->ill_phyint->phyint_flags & PHYI_IPMP);
4311
4312 /* By default an interface does not support any CoS marking */
4313 ill->ill_flags &= ~ILLF_COS_ENABLED;
4314
4315 /*
4316 * If we get QoS information in DL_INFO_ACK, the device supports
4317 * some form of CoS marking, so set ILLF_COS_ENABLED.
4318 */
4319 sel1 = (dl_qos_cl_sel1_t *)mi_offset_param(mp, dlia->dl_qos_offset,
4320 dlia->dl_qos_length);
4321 if ((sel1 != NULL) && (sel1->dl_qos_type == DL_QOS_CL_SEL1)) {
4322 ill->ill_flags |= ILLF_COS_ENABLED;
4323 }
4324
4325 /* Clear any previous error indication. */
4326 ill->ill_error = 0;
4327 freemsg(mp);
4328 }
4329
4330 /*
4331 * Perform various checks to verify that an address would make sense as a
4332 * local, remote, or subnet interface address.
4333 */
4334 static boolean_t
4335 ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask)
4336 {
4337 ipaddr_t net_mask;
4338
4339 /*
4340 * Don't allow all zeroes, or all ones, but allow
4341 * all ones netmask.
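* For example, with a 255.255.255.0 netmask both 192.0.2.0 (host part
* all zeroes) and 192.0.2.255 (host part all ones) are rejected, while
* an all-ones netmask exempts an address from these two checks.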
4342 */
4343 if ((net_mask = ip_net_mask(addr)) == 0)
4344 return (B_FALSE);
4345 /* A given netmask overrides the "guess" netmask */
4346 if (subnet_mask != 0)
4347 net_mask = subnet_mask;
4348 if ((net_mask != ~(ipaddr_t)0) && ((addr == (addr & net_mask)) ||
4349 (addr == (addr | ~net_mask)))) {
4350 return (B_FALSE);
4351 }
4352
4353 /*
4354 * Even if the netmask is all ones, we do not allow address to be
4355 * 255.255.255.255
4356 */
4357 if (addr == INADDR_BROADCAST)
4358 return (B_FALSE);
4359
4360 if (CLASSD(addr))
4361 return (B_FALSE);
4362
4363 return (B_TRUE);
4364 }
4365
4366 #define V6_IPIF_LINKLOCAL(p) \
4367 IN6_IS_ADDR_LINKLOCAL(&(p)->ipif_v6lcl_addr)
4368
4369 /*
4370 * Compare two given ipifs and check if the second one is better than
4371 * the first one using the order of preference (not taking deprecated
4372 * into account) specified in ipif_lookup_multicast().
4373 */
4374 static boolean_t
4375 ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, boolean_t isv6)
4376 {
4377 /* Check the least preferred first. */
4378 if (IS_LOOPBACK(old_ipif->ipif_ill)) {
4379 /* If both ipifs are the same, use the first one. */
4380 if (IS_LOOPBACK(new_ipif->ipif_ill))
4381 return (B_FALSE);
4382 else
4383 return (B_TRUE);
4384 }
4385
4386 /* For IPv6, check for link local address. */
4387 if (isv6 && V6_IPIF_LINKLOCAL(old_ipif)) {
4388 if (IS_LOOPBACK(new_ipif->ipif_ill) ||
4389 V6_IPIF_LINKLOCAL(new_ipif)) {
4390 /* The second one is equal or less preferred. */
4391 return (B_FALSE);
4392 } else {
4393 return (B_TRUE);
4394 }
4395 }
4396
4397 /* Then check for point to point interface. */
4398 if (old_ipif->ipif_flags & IPIF_POINTOPOINT) {
4399 if (IS_LOOPBACK(new_ipif->ipif_ill) ||
4400 (isv6 && V6_IPIF_LINKLOCAL(new_ipif)) ||
4401 (new_ipif->ipif_flags & IPIF_POINTOPOINT)) {
4402 return (B_FALSE);
4403 } else {
4404 return (B_TRUE);
4405 }
4406 }
4407
4408 /* old_ipif is a normal interface, so no need to use the new one. */
4409 return (B_FALSE);
4410 }
4411
4412 /*
4413 * Find a multicast-capable ipif given an IP instance and zoneid.
4414 * The ipif must be up, and its ill must be multicast-capable, not
4415 * condemned, not an underlying interface in an IPMP group, and
4416 * not a VNI interface. Order of preference:
4417 *
4418 * 1a. normal
4419 * 1b. normal, but deprecated
4420 * 2a. point to point
4421 * 2b. point to point, but deprecated
4422 * 3a. link local
4423 * 3b. link local, but deprecated
4424 * 4. loopback.
4425 */
4426 static ipif_t *
4427 ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6)
4428 {
4429 ill_t *ill;
4430 ill_walk_context_t ctx;
4431 ipif_t *ipif;
4432 ipif_t *saved_ipif = NULL;
4433 ipif_t *dep_ipif = NULL;
4434
4435 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
4436 if (isv6)
4437 ill = ILL_START_WALK_V6(&ctx, ipst);
4438 else
4439 ill = ILL_START_WALK_V4(&ctx, ipst);
4440
4441 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
4442 mutex_enter(&ill->ill_lock);
4443 if (IS_VNI(ill) || IS_UNDER_IPMP(ill) ||
4444 ILL_IS_CONDEMNED(ill) ||
4445 !(ill->ill_flags & ILLF_MULTICAST)) {
4446 mutex_exit(&ill->ill_lock);
4447 continue;
4448 }
4449 for (ipif = ill->ill_ipif; ipif != NULL;
4450 ipif = ipif->ipif_next) {
4451 if (zoneid != ipif->ipif_zoneid &&
4452 zoneid != ALL_ZONES &&
4453 ipif->ipif_zoneid != ALL_ZONES) {
4454 continue;
4455 }
4456 if (!(ipif->ipif_flags & IPIF_UP) ||
4457 IPIF_IS_CONDEMNED(ipif)) {
4458 continue;
4459 }
4460
4461 /*
4462 * Found one candidate. If it is deprecated,
4463 * remember it in dep_ipif.
If it is not deprecated, 4464 * remember it in saved_ipif. 4465 */ 4466 if (ipif->ipif_flags & IPIF_DEPRECATED) { 4467 if (dep_ipif == NULL) { 4468 dep_ipif = ipif; 4469 } else if (ipif_comp_multi(dep_ipif, ipif, 4470 isv6)) { 4471 /* 4472 * If the previous dep_ipif does not 4473 * belong to the same ill, we've done 4474 * a ipif_refhold() on it. So we need 4475 * to release it. 4476 */ 4477 if (dep_ipif->ipif_ill != ill) 4478 ipif_refrele(dep_ipif); 4479 dep_ipif = ipif; 4480 } 4481 continue; 4482 } 4483 if (saved_ipif == NULL) { 4484 saved_ipif = ipif; 4485 } else { 4486 if (ipif_comp_multi(saved_ipif, ipif, isv6)) { 4487 if (saved_ipif->ipif_ill != ill) 4488 ipif_refrele(saved_ipif); 4489 saved_ipif = ipif; 4490 } 4491 } 4492 } 4493 /* 4494 * Before going to the next ill, do a ipif_refhold() on the 4495 * saved ones. 4496 */ 4497 if (saved_ipif != NULL && saved_ipif->ipif_ill == ill) 4498 ipif_refhold_locked(saved_ipif); 4499 if (dep_ipif != NULL && dep_ipif->ipif_ill == ill) 4500 ipif_refhold_locked(dep_ipif); 4501 mutex_exit(&ill->ill_lock); 4502 } 4503 rw_exit(&ipst->ips_ill_g_lock); 4504 4505 /* 4506 * If we have only the saved_ipif, return it. But if we have both 4507 * saved_ipif and dep_ipif, check to see which one is better. 4508 */ 4509 if (saved_ipif != NULL) { 4510 if (dep_ipif != NULL) { 4511 if (ipif_comp_multi(saved_ipif, dep_ipif, isv6)) { 4512 ipif_refrele(saved_ipif); 4513 return (dep_ipif); 4514 } else { 4515 ipif_refrele(dep_ipif); 4516 return (saved_ipif); 4517 } 4518 } 4519 return (saved_ipif); 4520 } else { 4521 return (dep_ipif); 4522 } 4523 } 4524 4525 ill_t * 4526 ill_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) 4527 { 4528 ipif_t *ipif; 4529 ill_t *ill; 4530 4531 ipif = ipif_lookup_multicast(ipst, zoneid, isv6); 4532 if (ipif == NULL) 4533 return (NULL); 4534 4535 ill = ipif->ipif_ill; 4536 ill_refhold(ill); 4537 ipif_refrele(ipif); 4538 return (ill); 4539 } 4540 4541 /* 4542 * This function is called when an application does not specify an interface 4543 * to be used for multicast traffic (joining a group/sending data). It 4544 * calls ire_lookup_multi() to look for an interface route for the 4545 * specified multicast group. Doing this allows the administrator to add 4546 * prefix routes for multicast to indicate which interface to be used for 4547 * multicast traffic in the above scenario. The route could be for all 4548 * multicast (224.0/4), for a single multicast group (a /32 route) or 4549 * anything in between. If there is no such multicast route, we just find 4550 * any multicast capable interface and return it. The returned ipif 4551 * is refhold'ed. 4552 * 4553 * We support MULTIRT and RTF_SETSRC on the multicast routes added to the 4554 * unicast table. This is used by CGTP. 4555 */ 4556 ill_t * 4557 ill_lookup_group_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst, 4558 boolean_t *multirtp, ipaddr_t *setsrcp) 4559 { 4560 ill_t *ill; 4561 4562 ill = ire_lookup_multi_ill_v4(group, zoneid, ipst, multirtp, setsrcp); 4563 if (ill != NULL) 4564 return (ill); 4565 4566 return (ill_lookup_multicast(ipst, zoneid, B_FALSE)); 4567 } 4568 4569 /* 4570 * Look for an ipif with the specified interface address and destination. 4571 * The destination address is used only for matching point-to-point interfaces. 
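* Point-to-point matches are tried first; if none is found, we fall
* back to a plain local-address lookup via ipif_lookup_addr().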
4572 */ 4573 ipif_t * 4574 ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, ip_stack_t *ipst) 4575 { 4576 ipif_t *ipif; 4577 ill_t *ill; 4578 ill_walk_context_t ctx; 4579 4580 /* 4581 * First match all the point-to-point interfaces 4582 * before looking at non-point-to-point interfaces. 4583 * This is done to avoid returning non-point-to-point 4584 * ipif instead of unnumbered point-to-point ipif. 4585 */ 4586 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4587 ill = ILL_START_WALK_V4(&ctx, ipst); 4588 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 4589 mutex_enter(&ill->ill_lock); 4590 for (ipif = ill->ill_ipif; ipif != NULL; 4591 ipif = ipif->ipif_next) { 4592 /* Allow the ipif to be down */ 4593 if ((ipif->ipif_flags & IPIF_POINTOPOINT) && 4594 (ipif->ipif_lcl_addr == if_addr) && 4595 (ipif->ipif_pp_dst_addr == dst)) { 4596 if (!IPIF_IS_CONDEMNED(ipif)) { 4597 ipif_refhold_locked(ipif); 4598 mutex_exit(&ill->ill_lock); 4599 rw_exit(&ipst->ips_ill_g_lock); 4600 return (ipif); 4601 } 4602 } 4603 } 4604 mutex_exit(&ill->ill_lock); 4605 } 4606 rw_exit(&ipst->ips_ill_g_lock); 4607 4608 /* lookup the ipif based on interface address */ 4609 ipif = ipif_lookup_addr(if_addr, NULL, ALL_ZONES, ipst); 4610 ASSERT(ipif == NULL || !ipif->ipif_isv6); 4611 return (ipif); 4612 } 4613 4614 /* 4615 * Common function for ipif_lookup_addr() and ipif_lookup_addr_exact(). 4616 */ 4617 static ipif_t * 4618 ipif_lookup_addr_common(ipaddr_t addr, ill_t *match_ill, uint32_t match_flags, 4619 zoneid_t zoneid, ip_stack_t *ipst) 4620 { 4621 ipif_t *ipif; 4622 ill_t *ill; 4623 boolean_t ptp = B_FALSE; 4624 ill_walk_context_t ctx; 4625 boolean_t match_illgrp = (match_flags & IPIF_MATCH_ILLGRP); 4626 boolean_t no_duplicate = (match_flags & IPIF_MATCH_NONDUP); 4627 4628 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4629 /* 4630 * Repeat twice, first based on local addresses and 4631 * next time for pointopoint. 4632 */ 4633 repeat: 4634 ill = ILL_START_WALK_V4(&ctx, ipst); 4635 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 4636 if (match_ill != NULL && ill != match_ill && 4637 (!match_illgrp || !IS_IN_SAME_ILLGRP(ill, match_ill))) { 4638 continue; 4639 } 4640 mutex_enter(&ill->ill_lock); 4641 for (ipif = ill->ill_ipif; ipif != NULL; 4642 ipif = ipif->ipif_next) { 4643 if (zoneid != ALL_ZONES && 4644 zoneid != ipif->ipif_zoneid && 4645 ipif->ipif_zoneid != ALL_ZONES) 4646 continue; 4647 4648 if (no_duplicate && !(ipif->ipif_flags & IPIF_UP)) 4649 continue; 4650 4651 /* Allow the ipif to be down */ 4652 if ((!ptp && (ipif->ipif_lcl_addr == addr) && 4653 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || 4654 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) && 4655 (ipif->ipif_pp_dst_addr == addr))) { 4656 if (!IPIF_IS_CONDEMNED(ipif)) { 4657 ipif_refhold_locked(ipif); 4658 mutex_exit(&ill->ill_lock); 4659 rw_exit(&ipst->ips_ill_g_lock); 4660 return (ipif); 4661 } 4662 } 4663 } 4664 mutex_exit(&ill->ill_lock); 4665 } 4666 4667 /* If we already did the ptp case, then we are done */ 4668 if (ptp) { 4669 rw_exit(&ipst->ips_ill_g_lock); 4670 return (NULL); 4671 } 4672 ptp = B_TRUE; 4673 goto repeat; 4674 } 4675 4676 /* 4677 * Lookup an ipif with the specified address. For point-to-point links we 4678 * look for matches on either the destination address or the local address, 4679 * but we skip the local address check if IPIF_UNNUMBERED is set. If the 4680 * `match_ill' argument is non-NULL, the lookup is restricted to that ill 4681 * (or illgrp if `match_ill' is in an IPMP group). 
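* A typical call looks like this (illustrative only):
*
*	ipif = ipif_lookup_addr(addr, NULL, ALL_ZONES, ipst);
*	if (ipif != NULL) {
*		... use the ipif ...
*		ipif_refrele(ipif);
*	}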
4682 */
4683 ipif_t *
4684 ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid,
4685 ip_stack_t *ipst)
4686 {
4687 return (ipif_lookup_addr_common(addr, match_ill, IPIF_MATCH_ILLGRP,
4688 zoneid, ipst));
4689 }
4690
4691 /*
4692 * Lookup an ipif with the specified address. Similar to ipif_lookup_addr,
4693 * except that we will only return an address if it is not marked as
4694 * IPIF_DUPLICATE.
4695 */
4696 ipif_t *
4697 ipif_lookup_addr_nondup(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid,
4698 ip_stack_t *ipst)
4699 {
4700 return (ipif_lookup_addr_common(addr, match_ill,
4701 (IPIF_MATCH_ILLGRP | IPIF_MATCH_NONDUP),
4702 zoneid, ipst));
4703 }
4704
4705 /*
4706 * Special abbreviated version of ipif_lookup_addr() that doesn't match
4707 * `match_ill' across the IPMP group. This function is only needed in some
4708 * corner-cases; almost everything should use ipif_lookup_addr().
4709 */
4710 ipif_t *
4711 ipif_lookup_addr_exact(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst)
4712 {
4713 ASSERT(match_ill != NULL);
4714 return (ipif_lookup_addr_common(addr, match_ill, 0, ALL_ZONES,
4715 ipst));
4716 }
4717
4718 /*
4719 * Look for an ipif with the specified address. For point-to-point links
4720 * we look for matches on either the destination address or the local
4721 * address, but we ignore the check on the local address if IPIF_UNNUMBERED
4722 * is set.
4723 * If the `match_ill' argument is non-NULL, the lookup is restricted to that
4724 * ill (or illgrp if `match_ill' is in an IPMP group).
4725 * Return the zoneid for the ipif which matches. ALL_ZONES if no match.
4726 */
4727 zoneid_t
4728 ipif_lookup_addr_zoneid(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst)
4729 {
4730 zoneid_t zoneid;
4731 ipif_t *ipif;
4732 ill_t *ill;
4733 boolean_t ptp = B_FALSE;
4734 ill_walk_context_t ctx;
4735
4736 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
4737 /*
4738 * Repeat twice, first based on local addresses and
4739 * next time for pointopoint.
4740 */
4741 repeat:
4742 ill = ILL_START_WALK_V4(&ctx, ipst);
4743 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
4744 if (match_ill != NULL && ill != match_ill &&
4745 !IS_IN_SAME_ILLGRP(ill, match_ill)) {
4746 continue;
4747 }
4748 mutex_enter(&ill->ill_lock);
4749 for (ipif = ill->ill_ipif; ipif != NULL;
4750 ipif = ipif->ipif_next) {
4751 /* Allow the ipif to be down */
4752 if ((!ptp && (ipif->ipif_lcl_addr == addr) &&
4753 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) ||
4754 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) &&
4755 (ipif->ipif_pp_dst_addr == addr)) &&
4756 !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
4757 zoneid = ipif->ipif_zoneid;
4758 mutex_exit(&ill->ill_lock);
4759 rw_exit(&ipst->ips_ill_g_lock);
4760 /*
4761 * If ipif_zoneid was ALL_ZONES then we have
4762 * a trusted extensions shared IP address.
4763 * In that case GLOBAL_ZONEID works to send.
4764 */
4765 if (zoneid == ALL_ZONES)
4766 zoneid = GLOBAL_ZONEID;
4767 return (zoneid);
4768 }
4769 }
4770 mutex_exit(&ill->ill_lock);
4771 }
4772
4773 /* If we already did the ptp case, then we are done */
4774 if (ptp) {
4775 rw_exit(&ipst->ips_ill_g_lock);
4776 return (ALL_ZONES);
4777 }
4778 ptp = B_TRUE;
4779 goto repeat;
4780 }
4781
4782 /*
4783 * Look for an ipif that matches the specified remote address, i.e. the
4784 * ipif that would receive the specified packet.
4785 * First look for directly connected interfaces and then do a recursive
4786 * IRE lookup and pick the first ipif corresponding to the source address in the
4787 * ire.
4788 * Returns: held ipif
4789 *
4790 * This is only used for ICMP_ADDRESS_MASK_REQUESTs
4791 */
4792 ipif_t *
4793 ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid)
4794 {
4795 ipif_t *ipif;
4796
4797 ASSERT(!ill->ill_isv6);
4798
4799 /*
4800 * Someone could be changing this ipif currently or change it
4801 * after we return this. Thus a few packets could use the old
4802 * values. However, structure updates/creates (ire, ilg, ilm, etc.)
4803 * will atomically be updated or cleaned up with the new value.
4804 * Thus we don't need a lock to check the flags or other attrs below.
4805 */
4806 mutex_enter(&ill->ill_lock);
4807 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
4808 if (IPIF_IS_CONDEMNED(ipif))
4809 continue;
4810 if (zoneid != ALL_ZONES && zoneid != ipif->ipif_zoneid &&
4811 ipif->ipif_zoneid != ALL_ZONES)
4812 continue;
4813 /* Allow the ipif to be down */
4814 if (ipif->ipif_flags & IPIF_POINTOPOINT) {
4815 if ((ipif->ipif_pp_dst_addr == addr) ||
4816 (!(ipif->ipif_flags & IPIF_UNNUMBERED) &&
4817 ipif->ipif_lcl_addr == addr)) {
4818 ipif_refhold_locked(ipif);
4819 mutex_exit(&ill->ill_lock);
4820 return (ipif);
4821 }
4822 } else if (ipif->ipif_subnet == (addr & ipif->ipif_net_mask)) {
4823 ipif_refhold_locked(ipif);
4824 mutex_exit(&ill->ill_lock);
4825 return (ipif);
4826 }
4827 }
4828 mutex_exit(&ill->ill_lock);
4829 /*
4830 * For a remote destination it isn't possible to nail down a particular
4831 * ipif.
4832 */
4833
4834 /* Pick the first interface */
4835 ipif = ipif_get_next_ipif(NULL, ill);
4836 return (ipif);
4837 }
4838
4839 /*
4840 * This func does not prevent refcnt from increasing. But if
4841 * the caller has taken steps to that effect, then this func
4842 * can be used to determine whether the ill has become quiescent.
4843 */
4844 static boolean_t
4845 ill_is_quiescent(ill_t *ill)
4846 {
4847 ipif_t *ipif;
4848
4849 ASSERT(MUTEX_HELD(&ill->ill_lock));
4850
4851 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
4852 if (ipif->ipif_refcnt != 0)
4853 return (B_FALSE);
4854 }
4855 if (!ILL_DOWN_OK(ill) || ill->ill_refcnt != 0) {
4856 return (B_FALSE);
4857 }
4858 return (B_TRUE);
4859 }
4860
4861 boolean_t
4862 ill_is_freeable(ill_t *ill)
4863 {
4864 ipif_t *ipif;
4865
4866 ASSERT(MUTEX_HELD(&ill->ill_lock));
4867
4868 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
4869 if (ipif->ipif_refcnt != 0) {
4870 return (B_FALSE);
4871 }
4872 }
4873 if (!ILL_FREE_OK(ill) || ill->ill_refcnt != 0) {
4874 return (B_FALSE);
4875 }
4876 return (B_TRUE);
4877 }
4878
4879 /*
4880 * This func does not prevent refcnt from increasing. But if
4881 * the caller has taken steps to that effect, then this func
4882 * can be used to determine whether the ipif has become quiescent.
4883 */
4884 static boolean_t
4885 ipif_is_quiescent(ipif_t *ipif)
4886 {
4887 ill_t *ill;
4888
4889 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
4890
4891 if (ipif->ipif_refcnt != 0)
4892 return (B_FALSE);
4893
4894 ill = ipif->ipif_ill;
4895 if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 ||
4896 ill->ill_logical_down) {
4897 return (B_TRUE);
4898 }
4899
4900 /* This is the last ipif going down or being deleted on this ill */
4901 if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0) {
4902 return (B_FALSE);
4903 }
4904
4905 return (B_TRUE);
4906 }
4907
4908 /*
4909 * Return true if the ipif can be destroyed: the ipif has to be quiescent
4910 * with zero references from ire/ilm to it.
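* In contrast to ipif_is_quiescent(), ill-wide state is not examined
* here; only the ipif's own reference count must have dropped to zero.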
4911 */
4912 static boolean_t
4913 ipif_is_freeable(ipif_t *ipif)
4914 {
4915 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
4916 ASSERT(ipif->ipif_id != 0);
4917 return (ipif->ipif_refcnt == 0);
4918 }
4919
4920 /*
4921 * The ipif/ill/ire has been refreled. Do the tail processing.
4922 * Determine if the ipif or ill in question has become quiescent and if so
4923 * wake up the close and/or restart any queued pending ioctl that is waiting
4924 * for the ipif_down (or ill_down).
4925 */
4926 void
4927 ipif_ill_refrele_tail(ill_t *ill)
4928 {
4929 mblk_t *mp;
4930 conn_t *connp;
4931 ipsq_t *ipsq;
4932 ipxop_t *ipx;
4933 ipif_t *ipif;
4934 dl_notify_ind_t *dlindp;
4935
4936 ASSERT(MUTEX_HELD(&ill->ill_lock));
4937
4938 if ((ill->ill_state_flags & ILL_CONDEMNED) && ill_is_freeable(ill)) {
4939 /* ip_modclose() may be waiting */
4940 cv_broadcast(&ill->ill_cv);
4941 }
4942
4943 ipsq = ill->ill_phyint->phyint_ipsq;
4944 mutex_enter(&ipsq->ipsq_lock);
4945 ipx = ipsq->ipsq_xop;
4946 mutex_enter(&ipx->ipx_lock);
4947 if (ipx->ipx_waitfor == 0) /* no one's waiting; bail */
4948 goto unlock;
4949
4950 ASSERT(ipx->ipx_pending_mp != NULL && ipx->ipx_pending_ipif != NULL);
4951
4952 ipif = ipx->ipx_pending_ipif;
4953 if (ipif->ipif_ill != ill) /* wait is for another ill; bail */
4954 goto unlock;
4955
4956 switch (ipx->ipx_waitfor) {
4957 case IPIF_DOWN:
4958 if (!ipif_is_quiescent(ipif))
4959 goto unlock;
4960 break;
4961 case IPIF_FREE:
4962 if (!ipif_is_freeable(ipif))
4963 goto unlock;
4964 break;
4965 case ILL_DOWN:
4966 if (!ill_is_quiescent(ill))
4967 goto unlock;
4968 break;
4969 case ILL_FREE:
4970 /*
4971 * ILL_FREE is only for loopback; normal ill teardown waits
4972 * synchronously in ip_modclose() without using ipx_waitfor,
4973 * handled by the cv_broadcast() at the top of this function.
4974 */
4975 if (!ill_is_freeable(ill))
4976 goto unlock;
4977 break;
4978 default:
4979 cmn_err(CE_PANIC, "ipsq: %p unknown ipx_waitfor %d\n",
4980 (void *)ipsq, ipx->ipx_waitfor);
4981 }
4982
4983 ill_refhold_locked(ill); /* for qwriter_ip() call below */
4984 mutex_exit(&ipx->ipx_lock);
4985 mp = ipsq_pending_mp_get(ipsq, &connp);
4986 mutex_exit(&ipsq->ipsq_lock);
4987 mutex_exit(&ill->ill_lock);
4988
4989 ASSERT(mp != NULL);
4990 /*
4991 * NOTE: all of the qwriter_ip() calls below use CUR_OP since
4992 * we can only get here when the current operation decides it
4993 * needs to quiesce via ipsq_pending_mp_add().
4994 */
4995 switch (mp->b_datap->db_type) {
4996 case M_PCPROTO:
4997 case M_PROTO:
4998 /*
4999 * For now, only DL_NOTIFY_IND messages can use this facility.
5000 */
5001 dlindp = (dl_notify_ind_t *)mp->b_rptr;
5002 ASSERT(dlindp->dl_primitive == DL_NOTIFY_IND);
5003
5004 switch (dlindp->dl_notification) {
5005 case DL_NOTE_PHYS_ADDR:
5006 qwriter_ip(ill, ill->ill_rq, mp,
5007 ill_set_phys_addr_tail, CUR_OP, B_TRUE);
5008 return;
5009 case DL_NOTE_REPLUMB:
5010 qwriter_ip(ill, ill->ill_rq, mp,
5011 ill_replumb_tail, CUR_OP, B_TRUE);
5012 return;
5013 default:
5014 ASSERT(0);
5015 ill_refrele(ill);
5016 }
5017 break;
5018
5019 case M_ERROR:
5020 case M_HANGUP:
5021 qwriter_ip(ill, ill->ill_rq, mp, ipif_all_down_tail, CUR_OP,
5022 B_TRUE);
5023 return;
5024
5025 case M_IOCTL:
5026 case M_IOCDATA:
5027 qwriter_ip(ill, (connp != NULL ?
CONNP_TO_WQ(connp) : 5028 ill->ill_wq), mp, ip_reprocess_ioctl, CUR_OP, B_TRUE); 5029 return; 5030 5031 default: 5032 cmn_err(CE_PANIC, "ipif_ill_refrele_tail mp %p " 5033 "db_type %d\n", (void *)mp, mp->b_datap->db_type); 5034 } 5035 return; 5036 unlock: 5037 mutex_exit(&ipsq->ipsq_lock); 5038 mutex_exit(&ipx->ipx_lock); 5039 mutex_exit(&ill->ill_lock); 5040 } 5041 5042 #ifdef DEBUG 5043 /* Reuse trace buffer from beginning (if reached the end) and record trace */ 5044 static void 5045 th_trace_rrecord(th_trace_t *th_trace) 5046 { 5047 tr_buf_t *tr_buf; 5048 uint_t lastref; 5049 5050 lastref = th_trace->th_trace_lastref; 5051 lastref++; 5052 if (lastref == TR_BUF_MAX) 5053 lastref = 0; 5054 th_trace->th_trace_lastref = lastref; 5055 tr_buf = &th_trace->th_trbuf[lastref]; 5056 tr_buf->tr_time = ddi_get_lbolt(); 5057 tr_buf->tr_depth = getpcstack(tr_buf->tr_stack, TR_STACK_DEPTH); 5058 } 5059 5060 static void 5061 th_trace_free(void *value) 5062 { 5063 th_trace_t *th_trace = value; 5064 5065 ASSERT(th_trace->th_refcnt == 0); 5066 kmem_free(th_trace, sizeof (*th_trace)); 5067 } 5068 5069 /* 5070 * Find or create the per-thread hash table used to track object references. 5071 * The ipst argument is NULL if we shouldn't allocate. 5072 * 5073 * Accesses per-thread data, so there's no need to lock here. 5074 */ 5075 static mod_hash_t * 5076 th_trace_gethash(ip_stack_t *ipst) 5077 { 5078 th_hash_t *thh; 5079 5080 if ((thh = tsd_get(ip_thread_data)) == NULL && ipst != NULL) { 5081 mod_hash_t *mh; 5082 char name[256]; 5083 size_t objsize, rshift; 5084 int retv; 5085 5086 if ((thh = kmem_alloc(sizeof (*thh), KM_NOSLEEP)) == NULL) 5087 return (NULL); 5088 (void) snprintf(name, sizeof (name), "th_trace_%p", 5089 (void *)curthread); 5090 5091 /* 5092 * We use mod_hash_create_extended here rather than the more 5093 * obvious mod_hash_create_ptrhash because the latter has a 5094 * hard-coded KM_SLEEP, and we'd prefer to fail rather than 5095 * block. 5096 */ 5097 objsize = MAX(MAX(sizeof (ill_t), sizeof (ipif_t)), 5098 MAX(sizeof (ire_t), sizeof (ncec_t))); 5099 rshift = highbit(objsize); 5100 mh = mod_hash_create_extended(name, 64, mod_hash_null_keydtor, 5101 th_trace_free, mod_hash_byptr, (void *)rshift, 5102 mod_hash_ptrkey_cmp, KM_NOSLEEP); 5103 if (mh == NULL) { 5104 kmem_free(thh, sizeof (*thh)); 5105 return (NULL); 5106 } 5107 thh->thh_hash = mh; 5108 thh->thh_ipst = ipst; 5109 /* 5110 * We trace ills, ipifs, ires, and nces. All of these are 5111 * per-IP-stack, so the lock on the thread list is as well. 5112 */ 5113 rw_enter(&ip_thread_rwlock, RW_WRITER); 5114 list_insert_tail(&ip_thread_list, thh); 5115 rw_exit(&ip_thread_rwlock); 5116 retv = tsd_set(ip_thread_data, thh); 5117 ASSERT(retv == 0); 5118 } 5119 return (thh != NULL ? thh->thh_hash : NULL); 5120 } 5121 5122 boolean_t 5123 th_trace_ref(const void *obj, ip_stack_t *ipst) 5124 { 5125 th_trace_t *th_trace; 5126 mod_hash_t *mh; 5127 mod_hash_val_t val; 5128 5129 if ((mh = th_trace_gethash(ipst)) == NULL) 5130 return (B_FALSE); 5131 5132 /* 5133 * Attempt to locate the trace buffer for this obj and thread. 5134 * If it does not exist, then allocate a new trace buffer and 5135 * insert into the hash. 
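* The allocation is KM_NOSLEEP, so it can fail under memory pressure;
* callers react by disabling tracing for the object (see
* ipif_trace_ref() and ill_trace_ref() below).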
5136 */ 5137 if (mod_hash_find(mh, (mod_hash_key_t)obj, &val) == MH_ERR_NOTFOUND) { 5138 th_trace = kmem_zalloc(sizeof (th_trace_t), KM_NOSLEEP); 5139 if (th_trace == NULL) 5140 return (B_FALSE); 5141 5142 th_trace->th_id = curthread; 5143 if (mod_hash_insert(mh, (mod_hash_key_t)obj, 5144 (mod_hash_val_t)th_trace) != 0) { 5145 kmem_free(th_trace, sizeof (th_trace_t)); 5146 return (B_FALSE); 5147 } 5148 } else { 5149 th_trace = (th_trace_t *)val; 5150 } 5151 5152 ASSERT(th_trace->th_refcnt >= 0 && 5153 th_trace->th_refcnt < TR_BUF_MAX - 1); 5154 5155 th_trace->th_refcnt++; 5156 th_trace_rrecord(th_trace); 5157 return (B_TRUE); 5158 } 5159 5160 /* 5161 * For the purpose of tracing a reference release, we assume that global 5162 * tracing is always on and that the same thread initiated the reference hold 5163 * is releasing. 5164 */ 5165 void 5166 th_trace_unref(const void *obj) 5167 { 5168 int retv; 5169 mod_hash_t *mh; 5170 th_trace_t *th_trace; 5171 mod_hash_val_t val; 5172 5173 mh = th_trace_gethash(NULL); 5174 retv = mod_hash_find(mh, (mod_hash_key_t)obj, &val); 5175 ASSERT(retv == 0); 5176 th_trace = (th_trace_t *)val; 5177 5178 ASSERT(th_trace->th_refcnt > 0); 5179 th_trace->th_refcnt--; 5180 th_trace_rrecord(th_trace); 5181 } 5182 5183 /* 5184 * If tracing has been disabled, then we assume that the reference counts are 5185 * now useless, and we clear them out before destroying the entries. 5186 */ 5187 void 5188 th_trace_cleanup(const void *obj, boolean_t trace_disable) 5189 { 5190 th_hash_t *thh; 5191 mod_hash_t *mh; 5192 mod_hash_val_t val; 5193 th_trace_t *th_trace; 5194 int retv; 5195 5196 rw_enter(&ip_thread_rwlock, RW_READER); 5197 for (thh = list_head(&ip_thread_list); thh != NULL; 5198 thh = list_next(&ip_thread_list, thh)) { 5199 if (mod_hash_find(mh = thh->thh_hash, (mod_hash_key_t)obj, 5200 &val) == 0) { 5201 th_trace = (th_trace_t *)val; 5202 if (trace_disable) 5203 th_trace->th_refcnt = 0; 5204 retv = mod_hash_destroy(mh, (mod_hash_key_t)obj); 5205 ASSERT(retv == 0); 5206 } 5207 } 5208 rw_exit(&ip_thread_rwlock); 5209 } 5210 5211 void 5212 ipif_trace_ref(ipif_t *ipif) 5213 { 5214 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 5215 5216 if (ipif->ipif_trace_disable) 5217 return; 5218 5219 if (!th_trace_ref(ipif, ipif->ipif_ill->ill_ipst)) { 5220 ipif->ipif_trace_disable = B_TRUE; 5221 ipif_trace_cleanup(ipif); 5222 } 5223 } 5224 5225 void 5226 ipif_untrace_ref(ipif_t *ipif) 5227 { 5228 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 5229 5230 if (!ipif->ipif_trace_disable) 5231 th_trace_unref(ipif); 5232 } 5233 5234 void 5235 ill_trace_ref(ill_t *ill) 5236 { 5237 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5238 5239 if (ill->ill_trace_disable) 5240 return; 5241 5242 if (!th_trace_ref(ill, ill->ill_ipst)) { 5243 ill->ill_trace_disable = B_TRUE; 5244 ill_trace_cleanup(ill); 5245 } 5246 } 5247 5248 void 5249 ill_untrace_ref(ill_t *ill) 5250 { 5251 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5252 5253 if (!ill->ill_trace_disable) 5254 th_trace_unref(ill); 5255 } 5256 5257 /* 5258 * Called when ipif is unplumbed or when memory alloc fails. Note that on 5259 * failure, ipif_trace_disable is set. 5260 */ 5261 static void 5262 ipif_trace_cleanup(const ipif_t *ipif) 5263 { 5264 th_trace_cleanup(ipif, ipif->ipif_trace_disable); 5265 } 5266 5267 /* 5268 * Called when ill is unplumbed or when memory alloc fails. Note that on 5269 * failure, ill_trace_disable is set. 
5270 */ 5271 static void 5272 ill_trace_cleanup(const ill_t *ill) 5273 { 5274 th_trace_cleanup(ill, ill->ill_trace_disable); 5275 } 5276 #endif /* DEBUG */ 5277 5278 void 5279 ipif_refhold_locked(ipif_t *ipif) 5280 { 5281 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 5282 ipif->ipif_refcnt++; 5283 IPIF_TRACE_REF(ipif); 5284 } 5285 5286 void 5287 ipif_refhold(ipif_t *ipif) 5288 { 5289 ill_t *ill; 5290 5291 ill = ipif->ipif_ill; 5292 mutex_enter(&ill->ill_lock); 5293 ipif->ipif_refcnt++; 5294 IPIF_TRACE_REF(ipif); 5295 mutex_exit(&ill->ill_lock); 5296 } 5297 5298 /* 5299 * Must not be called while holding any locks. Otherwise if this is 5300 * the last reference to be released there is a chance of recursive mutex 5301 * panic due to ipif_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 5302 * to restart an ioctl. 5303 */ 5304 void 5305 ipif_refrele(ipif_t *ipif) 5306 { 5307 ill_t *ill; 5308 5309 ill = ipif->ipif_ill; 5310 5311 mutex_enter(&ill->ill_lock); 5312 ASSERT(ipif->ipif_refcnt != 0); 5313 ipif->ipif_refcnt--; 5314 IPIF_UNTRACE_REF(ipif); 5315 if (ipif->ipif_refcnt != 0) { 5316 mutex_exit(&ill->ill_lock); 5317 return; 5318 } 5319 5320 /* Drops the ill_lock */ 5321 ipif_ill_refrele_tail(ill); 5322 } 5323 5324 ipif_t * 5325 ipif_get_next_ipif(ipif_t *curr, ill_t *ill) 5326 { 5327 ipif_t *ipif; 5328 5329 mutex_enter(&ill->ill_lock); 5330 for (ipif = (curr == NULL ? ill->ill_ipif : curr->ipif_next); 5331 ipif != NULL; ipif = ipif->ipif_next) { 5332 if (IPIF_IS_CONDEMNED(ipif)) 5333 continue; 5334 ipif_refhold_locked(ipif); 5335 mutex_exit(&ill->ill_lock); 5336 return (ipif); 5337 } 5338 mutex_exit(&ill->ill_lock); 5339 return (NULL); 5340 } 5341 5342 /* 5343 * TODO: make this table extendible at run time 5344 * Return a pointer to the mac type info for 'mac_type' 5345 */ 5346 static ip_m_t * 5347 ip_m_lookup(t_uscalar_t mac_type) 5348 { 5349 ip_m_t *ipm; 5350 5351 for (ipm = ip_m_tbl; ipm < A_END(ip_m_tbl); ipm++) 5352 if (ipm->ip_m_mac_type == mac_type) 5353 return (ipm); 5354 return (NULL); 5355 } 5356 5357 /* 5358 * Make a link layer address from the multicast IP address *addr. 5359 * To form the link layer address, invoke the ip_m_v*mapping function 5360 * associated with the link-layer type. 5361 */ 5362 void 5363 ip_mcast_mapping(ill_t *ill, uchar_t *addr, uchar_t *hwaddr) 5364 { 5365 ip_m_t *ipm; 5366 5367 if (ill->ill_net_type == IRE_IF_NORESOLVER) 5368 return; 5369 5370 ASSERT(addr != NULL); 5371 5372 ipm = ip_m_lookup(ill->ill_mactype); 5373 if (ipm == NULL || 5374 (ill->ill_isv6 && ipm->ip_m_v6mapping == NULL) || 5375 (!ill->ill_isv6 && ipm->ip_m_v4mapping == NULL)) { 5376 ip0dbg(("no mapping for ill %s mactype 0x%x\n", 5377 ill->ill_name, ill->ill_mactype)); 5378 return; 5379 } 5380 if (ill->ill_isv6) 5381 (*ipm->ip_m_v6mapping)(ill, addr, hwaddr); 5382 else 5383 (*ipm->ip_m_v4mapping)(ill, addr, hwaddr); 5384 } 5385 5386 /* 5387 * ip_rt_add is called to add an IPv4 route to the forwarding table. 5388 * ill is passed in to associate it with the correct interface. 5389 * If ire_arg is set, then we return the held IRE in that location. 
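* Depending on the netmask, the route is added as an IRE_HOST
* (all-ones mask, implied by RTF_HOST), IRE_DEFAULT (zero mask) or
* IRE_PREFIX entry; interface routes (RTF_GATEWAY clear) instead use
* the ill's IRE_IF_* type.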
5390 */
5391 int
5392 ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
5393 ipaddr_t src_addr, int flags, ill_t *ill, ire_t **ire_arg,
5394 boolean_t ioctl_msg, struct rtsa_s *sp, ip_stack_t *ipst, zoneid_t zoneid)
5395 {
5396 ire_t *ire, *nire;
5397 ire_t *gw_ire = NULL;
5398 ipif_t *ipif = NULL;
5399 uint_t type;
5400 int match_flags = MATCH_IRE_TYPE;
5401 tsol_gc_t *gc = NULL;
5402 tsol_gcgrp_t *gcgrp = NULL;
5403 boolean_t gcgrp_xtraref = B_FALSE;
5404 boolean_t cgtp_broadcast;
5405
5406 ip1dbg(("ip_rt_add:"));
5407
5408 if (ire_arg != NULL)
5409 *ire_arg = NULL;
5410
5411 /*
5412 * If this is the case of RTF_HOST being set, then we set the netmask
5413 * to all ones (regardless of whether one was supplied).
5414 */
5415 if (flags & RTF_HOST)
5416 mask = IP_HOST_MASK;
5417
5418 /*
5419 * Prevent routes with a zero gateway from being created (since
5420 * interfaces can currently be plumbed and brought up with no assigned
5421 * address).
5422 */
5423 if (gw_addr == 0)
5424 return (ENETUNREACH);
5425 /*
5426 * Get the ipif, if any, corresponding to the gw_addr.
5427 * If -ifp was specified we restrict ourselves to the ill, otherwise
5428 * we match on the gateway and destination to handle unnumbered pt-pt
5429 * interfaces.
5430 */
5431 if (ill != NULL)
5432 ipif = ipif_lookup_addr(gw_addr, ill, ALL_ZONES, ipst);
5433 else
5434 ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst);
5435 if (ipif != NULL) {
5436 if (IS_VNI(ipif->ipif_ill)) {
5437 ipif_refrele(ipif);
5438 return (EINVAL);
5439 }
5440 }
5441
5442 /*
5443 * GateD will attempt to create routes with a loopback interface
5444 * address as the gateway and with RTF_GATEWAY set. We allow
5445 * these routes to be added, but create them as interface routes
5446 * since the gateway is an interface address.
5447 */
5448 if ((ipif != NULL) && (ipif->ipif_ire_type == IRE_LOOPBACK)) {
5449 flags &= ~RTF_GATEWAY;
5450 if (gw_addr == INADDR_LOOPBACK && dst_addr == INADDR_LOOPBACK &&
5451 mask == IP_HOST_MASK) {
5452 ire = ire_ftable_lookup_v4(dst_addr, 0, 0, IRE_LOOPBACK,
5453 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst,
5454 NULL);
5455 if (ire != NULL) {
5456 ire_refrele(ire);
5457 ipif_refrele(ipif);
5458 return (EEXIST);
5459 }
5460 ip1dbg(("ip_rt_add: 0x%p creating IRE 0x%x"
5461 "for 0x%x\n", (void *)ipif,
5462 ipif->ipif_ire_type,
5463 ntohl(ipif->ipif_lcl_addr)));
5464 ire = ire_create(
5465 (uchar_t *)&dst_addr, /* dest address */
5466 (uchar_t *)&mask, /* mask */
5467 NULL, /* no gateway */
5468 ipif->ipif_ire_type, /* LOOPBACK */
5469 ipif->ipif_ill,
5470 zoneid,
5471 (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE : 0,
5472 NULL,
5473 ipst);
5474
5475 if (ire == NULL) {
5476 ipif_refrele(ipif);
5477 return (ENOMEM);
5478 }
5479 /* src address assigned by the caller? */
5480 if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC))
5481 ire->ire_setsrc_addr = src_addr;
5482
5483 nire = ire_add(ire);
5484 if (nire == NULL) {
5485 /*
5486 * In the event of failure, ire_add() will have
5487 * already deleted the ire in question, so there
5488 * is no need to do that here.
5489 */
5490 ipif_refrele(ipif);
5491 return (ENOMEM);
5492 }
5493 /*
5494 * Check if it was a duplicate entry.
This handles 5495 * the case of two racing route adds for the same route 5496 */ 5497 if (nire != ire) { 5498 ASSERT(nire->ire_identical_ref > 1); 5499 ire_delete(nire); 5500 ire_refrele(nire); 5501 ipif_refrele(ipif); 5502 return (EEXIST); 5503 } 5504 ire = nire; 5505 goto save_ire; 5506 } 5507 } 5508 5509 /* 5510 * The routes for multicast with CGTP are quite special in that 5511 * the gateway is the local interface address, yet RTF_GATEWAY 5512 * is set. We turn off RTF_GATEWAY to provide compatibility with 5513 * this undocumented and unusual use of multicast routes. 5514 */ 5515 if ((flags & RTF_MULTIRT) && ipif != NULL) 5516 flags &= ~RTF_GATEWAY; 5517 5518 /* 5519 * Traditionally, interface routes are ones where RTF_GATEWAY isn't set 5520 * and the gateway address provided is one of the system's interface 5521 * addresses. By using the routing socket interface and supplying an 5522 * RTA_IFP sockaddr with an interface index, an alternate method of 5523 * specifying an interface route to be created is available which uses 5524 * the interface index that specifies the outgoing interface rather than 5525 * the address of an outgoing interface (which may not be able to 5526 * uniquely identify an interface). When coupled with the RTF_GATEWAY 5527 * flag, routes can be specified which not only specify the next-hop to 5528 * be used when routing to a certain prefix, but also which outgoing 5529 * interface should be used. 5530 * 5531 * Previously, interfaces would have unique addresses assigned to them 5532 * and so the address assigned to a particular interface could be used 5533 * to identify a particular interface. One exception to this was the 5534 * case of an unnumbered interface (where IPIF_UNNUMBERED was set). 5535 * 5536 * With the advent of IPv6 and its link-local addresses, this 5537 * restriction was relaxed and interfaces could share addresses between 5538 * themselves. In fact, typically all of the link-local interfaces on 5539 * an IPv6 node or router will have the same link-local address. In 5540 * order to differentiate between these interfaces, the use of an 5541 * interface index is necessary and this index can be carried inside a 5542 * RTA_IFP sockaddr (which is actually a sockaddr_dl). One restriction 5543 * of using the interface index, however, is that all of the ipif's that 5544 * are part of an ill have the same index and so the RTA_IFP sockaddr 5545 * cannot be used to differentiate between ipif's (or logical 5546 * interfaces) that belong to the same ill (physical interface). 5547 * 5548 * For example, in the following case involving IPv4 interfaces and 5549 * logical interfaces 5550 * 5551 * 192.0.2.32 255.255.255.224 192.0.2.33 U if0 5552 * 192.0.2.32 255.255.255.224 192.0.2.34 U if0 5553 * 192.0.2.32 255.255.255.224 192.0.2.35 U if0 5554 * 5555 * the ipif's corresponding to each of these interface routes can be 5556 * uniquely identified by the "gateway" (actually interface address). 
5557 * 5558 * In this case involving multiple IPv6 default routes to a particular 5559 * link-local gateway, the use of RTA_IFP is necessary to specify which 5560 * default route is of interest: 5561 * 5562 * default fe80::123:4567:89ab:cdef U if0 5563 * default fe80::123:4567:89ab:cdef U if1 5564 */ 5565 5566 /* RTF_GATEWAY not set */ 5567 if (!(flags & RTF_GATEWAY)) { 5568 if (sp != NULL) { 5569 ip2dbg(("ip_rt_add: gateway security attributes " 5570 "cannot be set with interface route\n")); 5571 if (ipif != NULL) 5572 ipif_refrele(ipif); 5573 return (EINVAL); 5574 } 5575 5576 /* 5577 * Whether or not ill (RTA_IFP) is set, we require that 5578 * the gateway is one of our local addresses. 5579 */ 5580 if (ipif == NULL) 5581 return (ENETUNREACH); 5582 5583 /* 5584 * We use MATCH_IRE_ILL here. If the caller specified an 5585 * interface (from the RTA_IFP sockaddr) we use it, otherwise 5586 * we use the ill derived from the gateway address. 5587 * We can always match the gateway address since we record it 5588 * in ire_gateway_addr. 5589 * We don't allow RTA_IFP to specify a different ill than the 5590 * one matching the ipif to make sure we can delete the route. 5591 */ 5592 match_flags |= MATCH_IRE_GW | MATCH_IRE_ILL; 5593 if (ill == NULL) { 5594 ill = ipif->ipif_ill; 5595 } else if (ill != ipif->ipif_ill) { 5596 ipif_refrele(ipif); 5597 return (EINVAL); 5598 } 5599 5600 /* 5601 * We check for an existing entry at this point. 5602 * 5603 * Since a netmask isn't passed in via the ioctl interface 5604 * (SIOCADDRT), we don't check for a matching netmask in that 5605 * case. 5606 */ 5607 if (!ioctl_msg) 5608 match_flags |= MATCH_IRE_MASK; 5609 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, 5610 IRE_INTERFACE, ill, ALL_ZONES, NULL, match_flags, 0, ipst, 5611 NULL); 5612 if (ire != NULL) { 5613 ire_refrele(ire); 5614 ipif_refrele(ipif); 5615 return (EEXIST); 5616 } 5617 5618 /* 5619 * Create a copy of the IRE_LOOPBACK, IRE_IF_NORESOLVER or 5620 * IRE_IF_RESOLVER with the modified address, netmask, and 5621 * gateway. 5622 */ 5623 ire = ire_create( 5624 (uchar_t *)&dst_addr, 5625 (uint8_t *)&mask, 5626 (uint8_t *)&gw_addr, 5627 ill->ill_net_type, 5628 ill, 5629 zoneid, 5630 flags, 5631 NULL, 5632 ipst); 5633 if (ire == NULL) { 5634 ipif_refrele(ipif); 5635 return (ENOMEM); 5636 } 5637 5638 /* 5639 * Some software (for example, GateD and Sun Cluster) attempts 5640 * to create (what amount to) IRE_PREFIX routes with the 5641 * loopback address as the gateway. This is primarily done to 5642 * set up prefixes with the RTF_REJECT flag set (for example, 5643 * when generating aggregate routes.) 5644 * 5645 * If the IRE type (as defined by ill->ill_net_type) is 5646 * IRE_LOOPBACK, then we map the request into a 5647 * IRE_IF_NORESOLVER. We also OR in the RTF_BLACKHOLE flag as 5648 * these interface routes, by definition, can only be that. 5649 * 5650 * Needless to say, the real IRE_LOOPBACK is NOT created by this 5651 * routine, but rather using ire_create() directly. 5652 * 5653 */ 5654 if (ill->ill_net_type == IRE_LOOPBACK) { 5655 ire->ire_type = IRE_IF_NORESOLVER; 5656 ire->ire_flags |= RTF_BLACKHOLE; 5657 } 5658 5659 /* src address assigned by the caller? */ 5660 if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC)) 5661 ire->ire_setsrc_addr = src_addr; 5662 5663 nire = ire_add(ire); 5664 if (nire == NULL) { 5665 /* 5666 * In the result of failure, ire_add() will have 5667 * already deleted the ire in question, so there 5668 * is no need to do that here. 
5669 */ 5670 ipif_refrele(ipif); 5671 return (ENOMEM); 5672 } 5673 /* 5674 * Check if it was a duplicate entry. This handles 5675 * the case of two racing route adds for the same route 5676 */ 5677 if (nire != ire) { 5678 ire_delete(nire); 5679 ire_refrele(nire); 5680 ipif_refrele(ipif); 5681 return (EEXIST); 5682 } 5683 ire = nire; 5684 goto save_ire; 5685 } 5686 5687 /* 5688 * Get an interface IRE for the specified gateway. 5689 * If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the 5690 * gateway, it is currently unreachable and we fail the request 5691 * accordingly. 5692 * If RTA_IFP was specified we look on that particular ill. 5693 */ 5694 if (ill != NULL) 5695 match_flags |= MATCH_IRE_ILL; 5696 5697 /* Check whether the gateway is reachable. */ 5698 again: 5699 type = IRE_INTERFACE; 5700 if (flags & RTF_INDIRECT) 5701 type |= IRE_OFFLINK; 5702 5703 gw_ire = ire_ftable_lookup_v4(gw_addr, 0, 0, type, ill, 5704 ALL_ZONES, NULL, match_flags, 0, ipst, NULL); 5705 if (gw_ire == NULL) { 5706 /* 5707 * With IPMP, we allow host routes to influence in.mpathd's 5708 * target selection. However, if the test addresses are on 5709 * their own network, the above lookup will fail since the 5710 * underlying IRE_INTERFACEs are marked hidden. So allow 5711 * hidden test IREs to be found and try again. 5712 */ 5713 if (!(match_flags & MATCH_IRE_TESTHIDDEN)) { 5714 match_flags |= MATCH_IRE_TESTHIDDEN; 5715 goto again; 5716 } 5717 5718 if (ipif != NULL) 5719 ipif_refrele(ipif); 5720 return (ENETUNREACH); 5721 } 5722 5723 /* 5724 * We create one of three types of IREs as a result of this request 5725 * based on the netmask. A netmask of all ones (which is automatically 5726 * assumed when RTF_HOST is set) results in an IRE_HOST being created. 5727 * An all zeroes netmask implies a default route so an IRE_DEFAULT is 5728 * created. Otherwise, an IRE_PREFIX route is created for the 5729 * destination prefix. 5730 */ 5731 if (mask == IP_HOST_MASK) 5732 type = IRE_HOST; 5733 else if (mask == 0) 5734 type = IRE_DEFAULT; 5735 else 5736 type = IRE_PREFIX; 5737 5738 /* check for a duplicate entry */ 5739 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill, 5740 ALL_ZONES, NULL, match_flags | MATCH_IRE_MASK | MATCH_IRE_GW, 5741 0, ipst, NULL); 5742 if (ire != NULL) { 5743 if (ipif != NULL) 5744 ipif_refrele(ipif); 5745 ire_refrele(gw_ire); 5746 ire_refrele(ire); 5747 return (EEXIST); 5748 } 5749 5750 /* Security attribute exists */ 5751 if (sp != NULL) { 5752 tsol_gcgrp_addr_t ga; 5753 5754 /* find or create the gateway credentials group */ 5755 ga.ga_af = AF_INET; 5756 IN6_IPADDR_TO_V4MAPPED(gw_addr, &ga.ga_addr); 5757 5758 /* we hold reference to it upon success */ 5759 gcgrp = gcgrp_lookup(&ga, B_TRUE); 5760 if (gcgrp == NULL) { 5761 if (ipif != NULL) 5762 ipif_refrele(ipif); 5763 ire_refrele(gw_ire); 5764 return (ENOMEM); 5765 } 5766 5767 /* 5768 * Create and add the security attribute to the group; a 5769 * reference to the group is made upon allocating a new 5770 * entry successfully. If it finds an already-existing 5771 * entry for the security attribute in the group, it simply 5772 * returns it and no new reference is made to the group. 5773 */ 5774 gc = gc_create(sp, gcgrp, &gcgrp_xtraref); 5775 if (gc == NULL) { 5776 if (ipif != NULL) 5777 ipif_refrele(ipif); 5778 /* release reference held by gcgrp_lookup */ 5779 GCGRP_REFRELE(gcgrp); 5780 ire_refrele(gw_ire); 5781 return (ENOMEM); 5782 } 5783 } 5784 5785 /* Create the IRE. 
*/ 5786 ire = ire_create( 5787 (uchar_t *)&dst_addr, /* dest address */ 5788 (uchar_t *)&mask, /* mask */ 5789 (uchar_t *)&gw_addr, /* gateway address */ 5790 (ushort_t)type, /* IRE type */ 5791 ill, 5792 zoneid, 5793 flags, 5794 gc, /* security attribute */ 5795 ipst); 5796 5797 /* 5798 * The ire holds a reference to the 'gc' and the 'gc' holds a 5799 * reference to the 'gcgrp'. We can now release the extra reference 5800 * the 'gcgrp' acquired in the gcgrp_lookup, if it was not used. 5801 */ 5802 if (gcgrp_xtraref) 5803 GCGRP_REFRELE(gcgrp); 5804 if (ire == NULL) { 5805 if (gc != NULL) 5806 GC_REFRELE(gc); 5807 if (ipif != NULL) 5808 ipif_refrele(ipif); 5809 ire_refrele(gw_ire); 5810 return (ENOMEM); 5811 } 5812 5813 /* Before we add, check if an extra CGTP broadcast is needed */ 5814 cgtp_broadcast = ((flags & RTF_MULTIRT) && 5815 ip_type_v4(ire->ire_addr, ipst) == IRE_BROADCAST); 5816 5817 /* src address assigned by the caller? */ 5818 if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC)) 5819 ire->ire_setsrc_addr = src_addr; 5820 5821 /* 5822 * POLICY: should we allow an RTF_HOST with address INADDR_ANY? 5823 * SUN/OS socket stuff does but do we really want to allow 0.0.0.0? 5824 */ 5825 5826 /* Add the new IRE. */ 5827 nire = ire_add(ire); 5828 if (nire == NULL) { 5829 /* 5830 * In the event of failure, ire_add() will have 5831 * already deleted the ire in question, so there 5832 * is no need to do that here. 5833 */ 5834 if (ipif != NULL) 5835 ipif_refrele(ipif); 5836 ire_refrele(gw_ire); 5837 return (ENOMEM); 5838 } 5839 /* 5840 * Check if it was a duplicate entry. This handles 5841 * the case of two racing route adds for the same route 5842 */ 5843 if (nire != ire) { 5844 ire_delete(nire); 5845 ire_refrele(nire); 5846 if (ipif != NULL) 5847 ipif_refrele(ipif); 5848 ire_refrele(gw_ire); 5849 return (EEXIST); 5850 } 5851 ire = nire; 5852 5853 if (flags & RTF_MULTIRT) { 5854 /* 5855 * Invoke the CGTP (multirouting) filtering module 5856 * to add the dst address in the filtering database. 5857 * Replicated inbound packets coming from that address 5858 * will be filtered to discard the duplicates. 5859 * It is not necessary to call the CGTP filter hook 5860 * when the dst address is a broadcast or multicast, 5861 * because an IP source address cannot be a broadcast 5862 * or a multicast. 5863 */ 5864 if (cgtp_broadcast) { 5865 ip_cgtp_bcast_add(ire, ipst); 5866 goto save_ire; 5867 } 5868 if (ipst->ips_ip_cgtp_filter_ops != NULL && 5869 !CLASSD(ire->ire_addr)) { 5870 int res; 5871 ipif_t *src_ipif; 5872 5873 /* Find the source address corresponding to gw_ire */ 5874 src_ipif = ipif_lookup_addr(gw_ire->ire_gateway_addr, 5875 NULL, zoneid, ipst); 5876 if (src_ipif != NULL) { 5877 res = ipst->ips_ip_cgtp_filter_ops-> 5878 cfo_add_dest_v4( 5879 ipst->ips_netstack->netstack_stackid, 5880 ire->ire_addr, 5881 ire->ire_gateway_addr, 5882 ire->ire_setsrc_addr, 5883 src_ipif->ipif_lcl_addr); 5884 ipif_refrele(src_ipif); 5885 } else { 5886 res = EADDRNOTAVAIL; 5887 } 5888 if (res != 0) { 5889 if (ipif != NULL) 5890 ipif_refrele(ipif); 5891 ire_refrele(gw_ire); 5892 ire_delete(ire); 5893 ire_refrele(ire); /* Held in ire_add */ 5894 return (res); 5895 } 5896 } 5897 } 5898 5899 save_ire: 5900 if (gw_ire != NULL) { 5901 ire_refrele(gw_ire); 5902 gw_ire = NULL; 5903 } 5904 if (ill != NULL) { 5905 /* 5906 * Save enough information so that we can recreate the IRE if 5907 * the interface goes down and then up.
The metrics associated 5908 * with the route will be saved as well when rts_setmetrics() is 5909 * called after the IRE has been created. In the case where 5910 * memory cannot be allocated, none of this information will be 5911 * saved. 5912 */ 5913 ill_save_ire(ill, ire); 5914 } 5915 if (ioctl_msg) 5916 ip_rts_rtmsg(RTM_OLDADD, ire, 0, ipst); 5917 if (ire_arg != NULL) { 5918 /* 5919 * Store the ire that was successfully added into where ire_arg 5920 * points to so that callers don't have to look it up 5921 * themselves (but they are responsible for ire_refrele()ing 5922 * the ire when they are finished with it). 5923 */ 5924 *ire_arg = ire; 5925 } else { 5926 ire_refrele(ire); /* Held in ire_add */ 5927 } 5928 if (ipif != NULL) 5929 ipif_refrele(ipif); 5930 return (0); 5931 } 5932 5933 /* 5934 * ip_rt_delete is called to delete an IPv4 route. 5935 * ill is passed in to associate it with the correct interface. 5936 */ 5937 /* ARGSUSED4 */ 5938 int 5939 ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, 5940 uint_t rtm_addrs, int flags, ill_t *ill, boolean_t ioctl_msg, 5941 ip_stack_t *ipst, zoneid_t zoneid) 5942 { 5943 ire_t *ire = NULL; 5944 ipif_t *ipif; 5945 uint_t type; 5946 uint_t match_flags = MATCH_IRE_TYPE; 5947 int err = 0; 5948 5949 ip1dbg(("ip_rt_delete:")); 5950 /* 5951 * If RTF_HOST is set, we set the netmask to all ones. Otherwise, 5952 * we use the netmask if one was supplied. 5953 */ 5954 if (flags & RTF_HOST) { 5955 mask = IP_HOST_MASK; 5956 match_flags |= MATCH_IRE_MASK; 5957 } else if (rtm_addrs & RTA_NETMASK) { 5958 match_flags |= MATCH_IRE_MASK; 5959 } 5960 5961 /* 5962 * Note that RTF_GATEWAY is never set on a delete, therefore 5963 * we check if the gateway address is one of our interfaces first, 5964 * and fall back on RTF_GATEWAY routes. 5965 * 5966 * This makes it possible to delete an original 5967 * IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1. 5968 * However, we have RTF_KERNEL set on the ones created by ipif_up 5969 * and those cannot be deleted here. 5970 * 5971 * We use MATCH_IRE_ILL if we know the interface. If the caller 5972 * specified an interface (from the RTA_IFP sockaddr) we use it, 5973 * otherwise we use the ill derived from the gateway address. 5974 * We can always match the gateway address since we record it 5975 * in ire_gateway_addr. 5976 * 5977 * For more detail on specifying routes by gateway address and by 5978 * interface index, see the comments in ip_rt_add().
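 *
 * For example, for a delete request (dst, mask, gw) where `gw' is one
 * of our own interface addresses, we first look for an interface route
 * (IRE_LOOPBACK or IRE_INTERFACE) on gw's ill; only if no such entry
 * exists (or the one found is a kernel-created RTF_KERNEL entry) do we
 * fall back to the gateway-route lookup keyed on ire_gateway_addr
 * below.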
5979 */ 5980 ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst); 5981 if (ipif != NULL) { 5982 ill_t *ill_match; 5983 5984 if (ill != NULL) 5985 ill_match = ill; 5986 else 5987 ill_match = ipif->ipif_ill; 5988 5989 match_flags |= MATCH_IRE_ILL; 5990 if (ipif->ipif_ire_type == IRE_LOOPBACK) { 5991 ire = ire_ftable_lookup_v4(dst_addr, 0, 0, IRE_LOOPBACK, 5992 ill_match, ALL_ZONES, NULL, match_flags, 0, ipst, 5993 NULL); 5994 } 5995 if (ire == NULL) { 5996 match_flags |= MATCH_IRE_GW; 5997 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, 5998 IRE_INTERFACE, ill_match, ALL_ZONES, NULL, 5999 match_flags, 0, ipst, NULL); 6000 } 6001 /* Avoid deleting routes created by kernel from an ipif */ 6002 if (ire != NULL && (ire->ire_flags & RTF_KERNEL)) { 6003 ire_refrele(ire); 6004 ire = NULL; 6005 } 6006 6007 /* Restore in case we didn't find a match */ 6008 match_flags &= ~(MATCH_IRE_GW|MATCH_IRE_ILL); 6009 } 6010 6011 if (ire == NULL) { 6012 /* 6013 * At this point, the gateway address is not one of our own 6014 * addresses or a matching interface route was not found. We 6015 * set the IRE type to lookup based on whether 6016 * this is a host route, a default route or just a prefix. 6017 * 6018 * If an ill was passed in, then the lookup is based on an 6019 * interface index so MATCH_IRE_ILL is added to match_flags. 6020 */ 6021 match_flags |= MATCH_IRE_GW; 6022 if (ill != NULL) 6023 match_flags |= MATCH_IRE_ILL; 6024 if (mask == IP_HOST_MASK) 6025 type = IRE_HOST; 6026 else if (mask == 0) 6027 type = IRE_DEFAULT; 6028 else 6029 type = IRE_PREFIX; 6030 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill, 6031 ALL_ZONES, NULL, match_flags, 0, ipst, NULL); 6032 } 6033 6034 if (ipif != NULL) { 6035 ipif_refrele(ipif); 6036 ipif = NULL; 6037 } 6038 6039 if (ire == NULL) 6040 return (ESRCH); 6041 6042 if (ire->ire_flags & RTF_MULTIRT) { 6043 /* 6044 * Invoke the CGTP (multirouting) filtering module 6045 * to remove the dst address from the filtering database. 6046 * Packets coming from that address will no longer be 6047 * filtered to remove duplicates. 6048 */ 6049 if (ipst->ips_ip_cgtp_filter_ops != NULL) { 6050 err = ipst->ips_ip_cgtp_filter_ops->cfo_del_dest_v4( 6051 ipst->ips_netstack->netstack_stackid, 6052 ire->ire_addr, ire->ire_gateway_addr); 6053 } 6054 ip_cgtp_bcast_delete(ire, ipst); 6055 } 6056 6057 ill = ire->ire_ill; 6058 if (ill != NULL) 6059 ill_remove_saved_ire(ill, ire); 6060 if (ioctl_msg) 6061 ip_rts_rtmsg(RTM_OLDDEL, ire, 0, ipst); 6062 ire_delete(ire); 6063 ire_refrele(ire); 6064 return (err); 6065 } 6066 6067 /* 6068 * ip_siocaddrt is called to complete processing of an SIOCADDRT IOCTL. 6069 */ 6070 /* ARGSUSED */ 6071 int 6072 ip_siocaddrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 6073 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 6074 { 6075 ipaddr_t dst_addr; 6076 ipaddr_t gw_addr; 6077 ipaddr_t mask; 6078 int error = 0; 6079 mblk_t *mp1; 6080 struct rtentry *rt; 6081 ipif_t *ipif = NULL; 6082 ip_stack_t *ipst; 6083 6084 ASSERT(q->q_next == NULL); 6085 ipst = CONNQ_TO_IPST(q); 6086 6087 ip1dbg(("ip_siocaddrt:")); 6088 /* Existence of mp1 verified in ip_wput_nondata */ 6089 mp1 = mp->b_cont->b_cont; 6090 rt = (struct rtentry *)mp1->b_rptr; 6091 6092 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 6093 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 6094 6095 /* 6096 * If the RTF_HOST flag is on, this is a request to assign a gateway 6097 * to a particular host address. 
In this case, we set the netmask to 6098 * all ones for the particular destination address. Otherwise, 6099 * determine the netmask to be used based on dst_addr and the interfaces 6100 * in use. 6101 */ 6102 if (rt->rt_flags & RTF_HOST) { 6103 mask = IP_HOST_MASK; 6104 } else { 6105 /* 6106 * Note that ip_subnet_mask returns a zero mask in the case of 6107 * default (an all-zeroes address). 6108 */ 6109 mask = ip_subnet_mask(dst_addr, &ipif, ipst); 6110 } 6111 6112 error = ip_rt_add(dst_addr, mask, gw_addr, 0, rt->rt_flags, NULL, NULL, 6113 B_TRUE, NULL, ipst, ALL_ZONES); 6114 if (ipif != NULL) 6115 ipif_refrele(ipif); 6116 return (error); 6117 } 6118 6119 /* 6120 * ip_siocdelrt is called to complete processing of an SIOCDELRT IOCTL. 6121 */ 6122 /* ARGSUSED */ 6123 int 6124 ip_siocdelrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 6125 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 6126 { 6127 ipaddr_t dst_addr; 6128 ipaddr_t gw_addr; 6129 ipaddr_t mask; 6130 int error; 6131 mblk_t *mp1; 6132 struct rtentry *rt; 6133 ipif_t *ipif = NULL; 6134 ip_stack_t *ipst; 6135 6136 ASSERT(q->q_next == NULL); 6137 ipst = CONNQ_TO_IPST(q); 6138 6139 ip1dbg(("ip_siocdelrt:")); 6140 /* Existence of mp1 verified in ip_wput_nondata */ 6141 mp1 = mp->b_cont->b_cont; 6142 rt = (struct rtentry *)mp1->b_rptr; 6143 6144 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 6145 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 6146 6147 /* 6148 * If the RTF_HOST flag is on, this is a request to delete a gateway 6149 * to a particular host address. In this case, we set the netmask to 6150 * all ones for the particular destination address. Otherwise, 6151 * determine the netmask to be used based on dst_addr and the interfaces 6152 * in use. 6153 */ 6154 if (rt->rt_flags & RTF_HOST) { 6155 mask = IP_HOST_MASK; 6156 } else { 6157 /* 6158 * Note that ip_subnet_mask returns a zero mask in the case of 6159 * default (an all-zeroes address). 6160 */ 6161 mask = ip_subnet_mask(dst_addr, &ipif, ipst); 6162 } 6163 6164 error = ip_rt_delete(dst_addr, mask, gw_addr, 6165 RTA_DST | RTA_GATEWAY | RTA_NETMASK, rt->rt_flags, NULL, B_TRUE, 6166 ipst, ALL_ZONES); 6167 if (ipif != NULL) 6168 ipif_refrele(ipif); 6169 return (error); 6170 } 6171 6172 /* 6173 * Enqueue the mp onto the ipsq, chained by b_next. 6174 * b_prev stores the function to be executed later, and b_queue the queue 6175 * where this mp originated. 
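 *
 * For instance, after two messages are enqueued with NEW_OP while an
 * operation is in progress, the pending-operation list looks like:
 *
 *	ipsq_xopq_mphead -> mp1 -> mp2 (== ipsq_xopq_mptail)
 *
 * with each mp's b_prev holding its callback function and b_queue its
 * originating queue, so the message can later be re-dispatched via
 * its saved callback.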
6176 */ 6177 void 6178 ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, 6179 ill_t *pending_ill) 6180 { 6181 conn_t *connp; 6182 ipxop_t *ipx = ipsq->ipsq_xop; 6183 6184 ASSERT(MUTEX_HELD(&ipsq->ipsq_lock)); 6185 ASSERT(MUTEX_HELD(&ipx->ipx_lock)); 6186 ASSERT(func != NULL); 6187 6188 mp->b_queue = q; 6189 mp->b_prev = (void *)func; 6190 mp->b_next = NULL; 6191 6192 switch (type) { 6193 case CUR_OP: 6194 if (ipx->ipx_mptail != NULL) { 6195 ASSERT(ipx->ipx_mphead != NULL); 6196 ipx->ipx_mptail->b_next = mp; 6197 } else { 6198 ASSERT(ipx->ipx_mphead == NULL); 6199 ipx->ipx_mphead = mp; 6200 } 6201 ipx->ipx_mptail = mp; 6202 break; 6203 6204 case NEW_OP: 6205 if (ipsq->ipsq_xopq_mptail != NULL) { 6206 ASSERT(ipsq->ipsq_xopq_mphead != NULL); 6207 ipsq->ipsq_xopq_mptail->b_next = mp; 6208 } else { 6209 ASSERT(ipsq->ipsq_xopq_mphead == NULL); 6210 ipsq->ipsq_xopq_mphead = mp; 6211 } 6212 ipsq->ipsq_xopq_mptail = mp; 6213 ipx->ipx_ipsq_queued = B_TRUE; 6214 break; 6215 6216 case SWITCH_OP: 6217 ASSERT(ipsq->ipsq_swxop != NULL); 6218 /* only one switch operation is currently allowed */ 6219 ASSERT(ipsq->ipsq_switch_mp == NULL); 6220 ipsq->ipsq_switch_mp = mp; 6221 ipx->ipx_ipsq_queued = B_TRUE; 6222 break; 6223 default: 6224 cmn_err(CE_PANIC, "ipsq_enq %d type \n", type); 6225 } 6226 6227 if (CONN_Q(q) && pending_ill != NULL) { 6228 connp = Q_TO_CONN(q); 6229 ASSERT(MUTEX_HELD(&connp->conn_lock)); 6230 connp->conn_oper_pending_ill = pending_ill; 6231 } 6232 } 6233 6234 /* 6235 * Dequeue the next message that requested exclusive access to this IPSQ's 6236 * xop. Specifically: 6237 * 6238 * 1. If we're still processing the current operation on `ipsq', then 6239 * dequeue the next message for the operation (from ipx_mphead), or 6240 * return NULL if there are no queued messages for the operation. 6241 * These messages are queued via CUR_OP to qwriter_ip() and friends. 6242 * 6243 * 2. If the current operation on `ipsq' has completed (ipx_current_ipif is 6244 * not set) see if the ipsq has requested an xop switch. If so, switch 6245 * `ipsq' to a different xop. Xop switches only happen when joining or 6246 * leaving IPMP groups and require a careful dance -- see the comments 6247 * in-line below for details. If we're leaving a group xop or if we're 6248 * joining a group xop and become writer on it, then we proceed to (3). 6249 * Otherwise, we return NULL and exit the xop. 6250 * 6251 * 3. For each IPSQ in the xop, return any switch operation stored on 6252 * ipsq_switch_mp (set via SWITCH_OP); these must be processed before 6253 * any other messages queued on the IPSQ. Otherwise, dequeue the next 6254 * exclusive operation (queued via NEW_OP) stored on ipsq_xopq_mphead. 6255 * Note that if the phyint tied to `ipsq' is not using IPMP there will 6256 * only be one IPSQ in the xop. Otherwise, there will be one IPSQ for 6257 * each phyint in the group, including the IPMP meta-interface phyint. 6258 */ 6259 static mblk_t * 6260 ipsq_dq(ipsq_t *ipsq) 6261 { 6262 ill_t *illv4, *illv6; 6263 mblk_t *mp; 6264 ipsq_t *xopipsq; 6265 ipsq_t *leftipsq = NULL; 6266 ipxop_t *ipx; 6267 phyint_t *phyi = ipsq->ipsq_phyint; 6268 ip_stack_t *ipst = ipsq->ipsq_ipst; 6269 boolean_t emptied = B_FALSE; 6270 6271 /* 6272 * Grab all the locks we need in the defined order (ill_g_lock -> 6273 * ipsq_lock -> ipx_lock); ill_g_lock is needed to use ipsq_next. 6274 */ 6275 rw_enter(&ipst->ips_ill_g_lock, 6276 ipsq->ipsq_swxop != NULL ? 
RW_WRITER : RW_READER); 6277 mutex_enter(&ipsq->ipsq_lock); 6278 ipx = ipsq->ipsq_xop; 6279 mutex_enter(&ipx->ipx_lock); 6280 6281 /* 6282 * Dequeue the next message associated with the current exclusive 6283 * operation, if any. 6284 */ 6285 if ((mp = ipx->ipx_mphead) != NULL) { 6286 ipx->ipx_mphead = mp->b_next; 6287 if (ipx->ipx_mphead == NULL) 6288 ipx->ipx_mptail = NULL; 6289 mp->b_next = (void *)ipsq; 6290 goto out; 6291 } 6292 6293 if (ipx->ipx_current_ipif != NULL) 6294 goto empty; 6295 6296 if (ipsq->ipsq_swxop != NULL) { 6297 /* 6298 * The exclusive operation that is now being completed has 6299 * requested a switch to a different xop. This happens 6300 * when an interface joins or leaves an IPMP group. Joins 6301 * happen through SIOCSLIFGROUPNAME (ip_sioctl_groupname()). 6302 * Leaves happen via SIOCSLIFGROUPNAME, interface unplumb 6303 * (phyint_free()), or interface plumb for an ill type 6304 * not in the IPMP group (ip_rput_dlpi_writer()). 6305 * 6306 * Xop switches are not allowed on the IPMP meta-interface. 6307 */ 6308 ASSERT(phyi == NULL || !(phyi->phyint_flags & PHYI_IPMP)); 6309 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 6310 DTRACE_PROBE1(ipsq__switch, (ipsq_t *), ipsq); 6311 6312 if (ipsq->ipsq_swxop == &ipsq->ipsq_ownxop) { 6313 /* 6314 * We're switching back to our own xop, so we have two 6315 * xop's to drain/exit: our own, and the group xop 6316 * that we are leaving. 6317 * 6318 * First, pull ourselves out of the group ipsq list. 6319 * This is safe since we're writer on ill_g_lock. 6320 */ 6321 ASSERT(ipsq->ipsq_xop != &ipsq->ipsq_ownxop); 6322 6323 xopipsq = ipx->ipx_ipsq; 6324 while (xopipsq->ipsq_next != ipsq) 6325 xopipsq = xopipsq->ipsq_next; 6326 6327 xopipsq->ipsq_next = ipsq->ipsq_next; 6328 ipsq->ipsq_next = ipsq; 6329 ipsq->ipsq_xop = ipsq->ipsq_swxop; 6330 ipsq->ipsq_swxop = NULL; 6331 6332 /* 6333 * Second, prepare to exit the group xop. The actual 6334 * ipsq_exit() is done at the end of this function 6335 * since we cannot hold any locks across ipsq_exit(). 6336 * Note that although we drop the group's ipx_lock, no 6337 * threads can proceed since we're still ipx_writer. 6338 */ 6339 leftipsq = xopipsq; 6340 mutex_exit(&ipx->ipx_lock); 6341 6342 /* 6343 * Third, set ipx to point to our own xop (which was 6344 * inactive and therefore can be entered). 6345 */ 6346 ipx = ipsq->ipsq_xop; 6347 mutex_enter(&ipx->ipx_lock); 6348 ASSERT(ipx->ipx_writer == NULL); 6349 ASSERT(ipx->ipx_current_ipif == NULL); 6350 } else { 6351 /* 6352 * We're switching from our own xop to a group xop. 6353 * The requestor of the switch must ensure that the 6354 * group xop cannot go away (e.g. by ensuring the 6355 * phyint associated with the xop cannot go away). 6356 * 6357 * If we can become writer on our new xop, then we'll 6358 * do the drain. Otherwise, the current writer of our 6359 * new xop will do the drain when it exits. 6360 * 6361 * First, splice ourselves into the group IPSQ list. 6362 * This is safe since we're writer on ill_g_lock. 6363 */ 6364 ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop); 6365 6366 xopipsq = ipsq->ipsq_swxop->ipx_ipsq; 6367 while (xopipsq->ipsq_next != ipsq->ipsq_swxop->ipx_ipsq) 6368 xopipsq = xopipsq->ipsq_next; 6369 6370 xopipsq->ipsq_next = ipsq; 6371 ipsq->ipsq_next = ipsq->ipsq_swxop->ipx_ipsq; 6372 ipsq->ipsq_xop = ipsq->ipsq_swxop; 6373 ipsq->ipsq_swxop = NULL; 6374 6375 /* 6376 * Second, exit our own xop, since it's now unused. 6377 * This is safe since we've got the only reference. 
6378 */ 6379 ASSERT(ipx->ipx_writer == curthread); 6380 ipx->ipx_writer = NULL; 6381 VERIFY(--ipx->ipx_reentry_cnt == 0); 6382 ipx->ipx_ipsq_queued = B_FALSE; 6383 mutex_exit(&ipx->ipx_lock); 6384 6385 /* 6386 * Third, set ipx to point to our new xop, and check 6387 * if we can become writer on it. If we cannot, then 6388 * the current writer will drain the IPSQ group when 6389 * it exits. Our ipsq_xop is guaranteed to be stable 6390 * because we're still holding ipsq_lock. 6391 */ 6392 ipx = ipsq->ipsq_xop; 6393 mutex_enter(&ipx->ipx_lock); 6394 if (ipx->ipx_writer != NULL || 6395 ipx->ipx_current_ipif != NULL) { 6396 goto out; 6397 } 6398 } 6399 6400 /* 6401 * Fourth, become writer on our new ipx before we continue 6402 * with the drain. Note that we never dropped ipsq_lock 6403 * above, so no other thread could've raced with us to 6404 * become writer first. Also, we're holding ipx_lock, so 6405 * no other thread can examine the ipx right now. 6406 */ 6407 ASSERT(ipx->ipx_current_ipif == NULL); 6408 ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL); 6409 VERIFY(ipx->ipx_reentry_cnt++ == 0); 6410 ipx->ipx_writer = curthread; 6411 ipx->ipx_forced = B_FALSE; 6412 #ifdef DEBUG 6413 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); 6414 #endif 6415 } 6416 6417 xopipsq = ipsq; 6418 do { 6419 /* 6420 * So that other operations operate on a consistent and 6421 * complete phyint, a switch message on an IPSQ must be 6422 * handled prior to any other operations on that IPSQ. 6423 */ 6424 if ((mp = xopipsq->ipsq_switch_mp) != NULL) { 6425 xopipsq->ipsq_switch_mp = NULL; 6426 ASSERT(mp->b_next == NULL); 6427 mp->b_next = (void *)xopipsq; 6428 goto out; 6429 } 6430 6431 if ((mp = xopipsq->ipsq_xopq_mphead) != NULL) { 6432 xopipsq->ipsq_xopq_mphead = mp->b_next; 6433 if (xopipsq->ipsq_xopq_mphead == NULL) 6434 xopipsq->ipsq_xopq_mptail = NULL; 6435 mp->b_next = (void *)xopipsq; 6436 goto out; 6437 } 6438 } while ((xopipsq = xopipsq->ipsq_next) != ipsq); 6439 empty: 6440 /* 6441 * There are no messages. Further, we are holding ipx_lock, hence no 6442 * new messages can end up on any IPSQ in the xop. 6443 */ 6444 ipx->ipx_writer = NULL; 6445 ipx->ipx_forced = B_FALSE; 6446 VERIFY(--ipx->ipx_reentry_cnt == 0); 6447 ipx->ipx_ipsq_queued = B_FALSE; 6448 emptied = B_TRUE; 6449 #ifdef DEBUG 6450 ipx->ipx_depth = 0; 6451 #endif 6452 out: 6453 mutex_exit(&ipx->ipx_lock); 6454 mutex_exit(&ipsq->ipsq_lock); 6455 6456 /* 6457 * If we completely emptied the xop, then wake up any threads waiting 6458 * to enter any of the IPSQ's associated with it. 6459 */ 6460 if (emptied) { 6461 xopipsq = ipsq; 6462 do { 6463 if ((phyi = xopipsq->ipsq_phyint) == NULL) 6464 continue; 6465 6466 illv4 = phyi->phyint_illv4; 6467 illv6 = phyi->phyint_illv6; 6468 6469 GRAB_ILL_LOCKS(illv4, illv6); 6470 if (illv4 != NULL) 6471 cv_broadcast(&illv4->ill_cv); 6472 if (illv6 != NULL) 6473 cv_broadcast(&illv6->ill_cv); 6474 RELEASE_ILL_LOCKS(illv4, illv6); 6475 } while ((xopipsq = xopipsq->ipsq_next) != ipsq); 6476 } 6477 rw_exit(&ipst->ips_ill_g_lock); 6478 6479 /* 6480 * Now that all locks are dropped, exit the IPSQ we left. 6481 */ 6482 if (leftipsq != NULL) 6483 ipsq_exit(leftipsq); 6484 6485 return (mp); 6486 } 6487 6488 /* 6489 * Return completion status of previously initiated DLPI operations on 6490 * ills in the purview of an ipsq. 
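 * Concretely, per the checks below: B_TRUE is returned only if every
 * ill (v4 and v6) hanging off each phyint in the ipsq has
 * ill_dlpi_pending == DL_PRIM_INVAL (and, for v4, no ARP DLPI request
 * outstanding either); otherwise B_FALSE.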
6491 */ 6492 static boolean_t 6493 ipsq_dlpi_done(ipsq_t *ipsq) 6494 { 6495 ipsq_t *ipsq_start; 6496 phyint_t *phyi; 6497 ill_t *ill; 6498 6499 ASSERT(RW_LOCK_HELD(&ipsq->ipsq_ipst->ips_ill_g_lock)); 6500 ipsq_start = ipsq; 6501 6502 do { 6503 /* 6504 * The only current users of this function are ipsq_try_enter 6505 * and ipsq_enter, which have made sure that ipsq_writer is 6506 * NULL before we reach here. ill_dlpi_pending is modified 6507 * only by an ipsq writer. 6508 */ 6509 ASSERT(ipsq->ipsq_xop->ipx_writer == NULL); 6510 phyi = ipsq->ipsq_phyint; 6511 /* 6512 * phyi could be NULL if a phyint that is part of an 6513 * IPMP group is being unplumbed. A more detailed 6514 * comment is in ipmp_grp_update_kstats(). 6515 */ 6516 if (phyi != NULL) { 6517 ill = phyi->phyint_illv4; 6518 if (ill != NULL && 6519 (ill->ill_dlpi_pending != DL_PRIM_INVAL || 6520 ill->ill_arl_dlpi_pending)) 6521 return (B_FALSE); 6522 6523 ill = phyi->phyint_illv6; 6524 if (ill != NULL && 6525 ill->ill_dlpi_pending != DL_PRIM_INVAL) 6526 return (B_FALSE); 6527 } 6528 6529 } while ((ipsq = ipsq->ipsq_next) != ipsq_start); 6530 6531 return (B_TRUE); 6532 } 6533 6534 /* 6535 * Enter the ipsq corresponding to ill, by waiting synchronously till 6536 * we can enter the ipsq exclusively. Unless 'force' is used, the ipsq 6537 * will have to drain completely before ipsq_enter returns success. 6538 * ipx_current_ipif will be set if some exclusive op is in progress, 6539 * and the ipsq_exit logic will start the next enqueued op after 6540 * completion of the current op. If 'force' is used, we don't wait 6541 * for the enqueued ops. This is needed when a conn_close wants to 6542 * enter the ipsq and abort an ioctl that is somehow stuck. Unplumb 6543 * of an ill can also use this option. But we don't use it currently. 6544 */ 6545 #define ENTER_SQ_WAIT_TICKS 100 6546 boolean_t 6547 ipsq_enter(ill_t *ill, boolean_t force, int type) 6548 { 6549 ipsq_t *ipsq; 6550 ipxop_t *ipx; 6551 boolean_t waited_enough = B_FALSE; 6552 ip_stack_t *ipst = ill->ill_ipst; 6553 6554 /* 6555 * Note that the relationship between ill and ipsq is fixed as long as 6556 * the ill is not ILL_CONDEMNED. Holding ipsq_lock ensures the 6557 * relationship between the IPSQ and xop cannot change. However, 6558 * since we cannot hold ipsq_lock across the cv_wait(), it may change 6559 * while we're waiting. We wait on ill_cv and rely on ipsq_exit() 6560 * waking up all ills in the xop when it becomes available.
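 *
 * Thus the loop below re-derives the ipsq and xop from the ill on each
 * iteration, succeeds once there is no writer and either this is a
 * CUR_OP request or no operation is in progress and all DLPI responses
 * are in (ipsq_dlpi_done()), and otherwise waits on ill_cv, using a
 * bounded wait once `force' is set.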
6561 */ 6562 for (;;) { 6563 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 6564 mutex_enter(&ill->ill_lock); 6565 if (ill->ill_state_flags & ILL_CONDEMNED) { 6566 mutex_exit(&ill->ill_lock); 6567 rw_exit(&ipst->ips_ill_g_lock); 6568 return (B_FALSE); 6569 } 6570 6571 ipsq = ill->ill_phyint->phyint_ipsq; 6572 mutex_enter(&ipsq->ipsq_lock); 6573 ipx = ipsq->ipsq_xop; 6574 mutex_enter(&ipx->ipx_lock); 6575 6576 if (ipx->ipx_writer == NULL && (type == CUR_OP || 6577 (ipx->ipx_current_ipif == NULL && ipsq_dlpi_done(ipsq)) || 6578 waited_enough)) 6579 break; 6580 6581 rw_exit(&ipst->ips_ill_g_lock); 6582 6583 if (!force || ipx->ipx_writer != NULL) { 6584 mutex_exit(&ipx->ipx_lock); 6585 mutex_exit(&ipsq->ipsq_lock); 6586 cv_wait(&ill->ill_cv, &ill->ill_lock); 6587 } else { 6588 mutex_exit(&ipx->ipx_lock); 6589 mutex_exit(&ipsq->ipsq_lock); 6590 (void) cv_reltimedwait(&ill->ill_cv, 6591 &ill->ill_lock, ENTER_SQ_WAIT_TICKS, TR_CLOCK_TICK); 6592 waited_enough = B_TRUE; 6593 } 6594 mutex_exit(&ill->ill_lock); 6595 } 6596 6597 ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL); 6598 ASSERT(ipx->ipx_reentry_cnt == 0); 6599 ipx->ipx_writer = curthread; 6600 ipx->ipx_forced = (ipx->ipx_current_ipif != NULL); 6601 ipx->ipx_reentry_cnt++; 6602 #ifdef DEBUG 6603 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); 6604 #endif 6605 mutex_exit(&ipx->ipx_lock); 6606 mutex_exit(&ipsq->ipsq_lock); 6607 mutex_exit(&ill->ill_lock); 6608 rw_exit(&ipst->ips_ill_g_lock); 6609 6610 return (B_TRUE); 6611 } 6612 6613 /* 6614 * ipif_set_values() has a constraint that it cannot drop the ips_ill_g_lock 6615 * across the call to the core interface ipsq_try_enter() and hence calls this 6616 * function directly. This is explained more fully in ipif_set_values(). 6617 * In order to support the above constraint, ipsq_try_enter is implemented as 6618 * a wrapper that grabs the ips_ill_g_lock and then calls this function. 6619 */ 6620 static ipsq_t * 6621 ipsq_try_enter_internal(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, 6622 int type, boolean_t reentry_ok) 6623 { 6624 ipsq_t *ipsq; 6625 ipxop_t *ipx; 6626 ip_stack_t *ipst = ill->ill_ipst; 6627 6628 /* 6629 * lock ordering: 6630 * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock. 6631 * 6632 * ipx of an ipsq can't change when ipsq_lock is held. 6633 */ 6634 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 6635 GRAB_CONN_LOCK(q); 6636 mutex_enter(&ill->ill_lock); 6637 ipsq = ill->ill_phyint->phyint_ipsq; 6638 mutex_enter(&ipsq->ipsq_lock); 6639 ipx = ipsq->ipsq_xop; 6640 mutex_enter(&ipx->ipx_lock); 6641 6642 /* 6643 * 1. Enter the ipsq if we are already writer and reentry is ok. 6644 * (Note: If the caller does not specify reentry_ok then neither 6645 * 'func' nor any of its callees must ever attempt to enter the ipsq 6646 * again. Otherwise it can lead to an infinite loop.) 6647 * 2. Enter the ipsq if there is no current writer and this attempted 6648 * entry is part of the current operation. 6649 * 3. Enter the ipsq if there is no current writer and this is a new 6650 * operation and the operation queue is empty and there is no 6651 * operation currently in progress and all previously initiated 6652 * DLPI operations have completed. 6653 */ 6654 if ((ipx->ipx_writer == curthread && reentry_ok) || 6655 (ipx->ipx_writer == NULL && (type == CUR_OP || (type == NEW_OP && 6656 !ipx->ipx_ipsq_queued && ipx->ipx_current_ipif == NULL && 6657 ipsq_dlpi_done(ipsq))))) { 6658 /* Success.
*/ 6659 ipx->ipx_reentry_cnt++; 6660 ipx->ipx_writer = curthread; 6661 ipx->ipx_forced = B_FALSE; 6662 mutex_exit(&ipx->ipx_lock); 6663 mutex_exit(&ipsq->ipsq_lock); 6664 mutex_exit(&ill->ill_lock); 6665 RELEASE_CONN_LOCK(q); 6666 #ifdef DEBUG 6667 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); 6668 #endif 6669 return (ipsq); 6670 } 6671 6672 if (func != NULL) 6673 ipsq_enq(ipsq, q, mp, func, type, ill); 6674 6675 mutex_exit(&ipx->ipx_lock); 6676 mutex_exit(&ipsq->ipsq_lock); 6677 mutex_exit(&ill->ill_lock); 6678 RELEASE_CONN_LOCK(q); 6679 return (NULL); 6680 } 6681 6682 /* 6683 * The ipsq_t (ipsq) is the synchronization data structure used to serialize 6684 * certain critical operations such as plumbing (i.e., most set ioctls). 6685 * There is one ipsq per phyint. The ipsq 6686 * serializes exclusive ioctls issued by applications on a per ipsq basis in 6687 * ipsq_xopq_mphead. It also protects against multiple threads executing in 6688 * the ipsq. Responses from the driver pertain to the current ioctl (say a 6689 * DL_BIND_ACK in response to a DL_BIND_REQ initiated as part of bringing 6690 * up the interface) and are enqueued in ipx_mphead. 6691 * 6692 * If a thread does not want to reenter the ipsq when it is already writer, 6693 * it must ensure that neither the specified reentry point (to be called 6694 * later when the ipsq is empty) nor any code path starting from that 6695 * reentry point ever tries to enter the ipsq again. Otherwise it can lead 6696 * to an infinite loop. The reentry point ip_rput_dlpi_writer is an example. 6697 * When the thread that is currently exclusive finishes, it (ipsq_exit) 6698 * dequeues the requests waiting in ipx_mphead and calls 6699 * the reentry point. When the list at ipx_mphead becomes empty, ipsq_exit 6700 * proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next 6701 * ioctl if the current ioctl has completed. If the current ioctl is still 6702 * in progress it simply returns. The current ioctl could be waiting for 6703 * a response from another module (the driver) or could be waiting for 6704 * the ipif/ill/ire refcnts to drop to zero. In such a case the ipx_pending_mp 6705 * and ipx_pending_ipif are set. ipx_current_ipif is set throughout the 6706 * execution of the ioctl and ipsq_exit does not start the next ioctl unless 6707 * ipx_current_ipif is NULL, which happens only once the ioctl is complete and 6708 * all associated DLPI operations have completed. 6709 */ 6710 6711 /* 6712 * Try to enter the IPSQ corresponding to `ipif' or `ill' exclusively (`ipif' 6713 * and `ill' cannot both be specified). Returns a pointer to the entered IPSQ 6714 * on success, or NULL on failure. The caller ensures ipif/ill is valid by 6715 * refholding it as necessary. If the IPSQ cannot be entered and `func' is 6716 * non-NULL, then `func' will be called back with `q' and `mp' once the IPSQ 6717 * can be entered. If `func' is NULL, then `q' and `mp' are ignored.
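 *
 * A typical caller therefore follows this pattern (an illustrative
 * sketch; qwriter_ip() below is the canonical wrapper):
 *
 *	ipsq = ipsq_try_enter(ipif, NULL, q, mp, ip_process_ioctl,
 *	    NEW_OP, B_TRUE);
 *	if (ipsq == NULL)
 *		return;	(ip_process_ioctl is called back later)
 *	(perform the exclusive operation)
 *	ipsq_exit(ipsq);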
6718 */ 6719 ipsq_t * 6720 ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp, 6721 ipsq_func_t func, int type, boolean_t reentry_ok) 6722 { 6723 ip_stack_t *ipst; 6724 ipsq_t *ipsq; 6725 6726 /* Only 1 of ipif or ill can be specified */ 6727 ASSERT((ipif != NULL) ^ (ill != NULL)); 6728 6729 if (ipif != NULL) 6730 ill = ipif->ipif_ill; 6731 ipst = ill->ill_ipst; 6732 6733 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 6734 ipsq = ipsq_try_enter_internal(ill, q, mp, func, type, reentry_ok); 6735 rw_exit(&ipst->ips_ill_g_lock); 6736 6737 return (ipsq); 6738 } 6739 6740 /* 6741 * Try to enter the IPSQ corresponding to `ill' as writer. The caller ensures 6742 * ill is valid by refholding it if necessary; we will refrele. If the IPSQ 6743 * cannot be entered, the mp is queued for completion. 6744 */ 6745 void 6746 qwriter_ip(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, 6747 boolean_t reentry_ok) 6748 { 6749 ipsq_t *ipsq; 6750 6751 ipsq = ipsq_try_enter(NULL, ill, q, mp, func, type, reentry_ok); 6752 6753 /* 6754 * Drop the caller's refhold on the ill. This is safe since we either 6755 * entered the IPSQ (and thus are exclusive), or failed to enter the 6756 * IPSQ, in which case we return without accessing ill anymore. This 6757 * is needed because func needs to see the correct refcount; 6758 * e.g., removeif can work only then. 6759 */ 6760 ill_refrele(ill); 6761 if (ipsq != NULL) { 6762 (*func)(ipsq, q, mp, NULL); 6763 ipsq_exit(ipsq); 6764 } 6765 } 6766 6767 /* 6768 * Exit the specified IPSQ. If this is the final exit on it then drain it 6769 * prior to exiting. Caller must be writer on the specified IPSQ. 6770 */ 6771 void 6772 ipsq_exit(ipsq_t *ipsq) 6773 { 6774 mblk_t *mp; 6775 ipsq_t *mp_ipsq; 6776 queue_t *q; 6777 phyint_t *phyi; 6778 ipsq_func_t func; 6779 6780 ASSERT(IAM_WRITER_IPSQ(ipsq)); 6781 6782 ASSERT(ipsq->ipsq_xop->ipx_reentry_cnt >= 1); 6783 if (ipsq->ipsq_xop->ipx_reentry_cnt != 1) { 6784 ipsq->ipsq_xop->ipx_reentry_cnt--; 6785 return; 6786 } 6787 6788 for (;;) { 6789 phyi = ipsq->ipsq_phyint; 6790 mp = ipsq_dq(ipsq); 6791 mp_ipsq = (mp == NULL) ? NULL : (ipsq_t *)mp->b_next; 6792 6793 /* 6794 * If we've changed to a new IPSQ, and the phyint associated 6795 * with the old one has gone away, free the old IPSQ. Note 6796 * that this cannot happen while the IPSQ is in a group. 6797 */ 6798 if (mp_ipsq != ipsq && phyi == NULL) { 6799 ASSERT(ipsq->ipsq_next == ipsq); 6800 ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop); 6801 ipsq_delete(ipsq); 6802 } 6803 6804 if (mp == NULL) 6805 break; 6806 6807 q = mp->b_queue; 6808 func = (ipsq_func_t)mp->b_prev; 6809 ipsq = mp_ipsq; 6810 mp->b_next = mp->b_prev = NULL; 6811 mp->b_queue = NULL; 6812 6813 /* 6814 * If 'q' is a conn queue, it is valid, since we did a 6815 * refhold on the conn at the start of the ioctl. 6816 * If 'q' is an ill queue, it is valid, since close of an 6817 * ill will clean up its IPSQ. 6818 */ 6819 (*func)(ipsq, q, mp, NULL); 6820 } 6821 } 6822 6823 /* 6824 * Used to start any igmp or mld timers that could not be started 6825 * while holding ill_mcast_lock. The timers can't be started while holding 6826 * the lock, since mld/igmp_start_timers may need to call untimeout(), 6827 * which can't be done while holding a lock that the timeout handler 6828 * itself acquires: 6829 * there would be a deadlock, since the timeout handlers 6830 * mld_timeout_handler_per_ill/igmp_timeout_handler_per_ill also acquire 6831 * ill_mcast_lock.
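 *
 * The intended calling pattern is therefore (a sketch):
 *
 *	mutex_enter(&ill->ill_mcast_lock);
 *	(igmp/mld state changes record any timer they need by updating
 *	    ips_igmp_deferred_next or ips_mld_deferred_next)
 *	mutex_exit(&ill->ill_mcast_lock);
 *	ill_mcast_timer_start(ill->ill_ipst);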
6832 */ 6833 void 6834 ill_mcast_timer_start(ip_stack_t *ipst) 6835 { 6836 int next; 6837 6838 mutex_enter(&ipst->ips_igmp_timer_lock); 6839 next = ipst->ips_igmp_deferred_next; 6840 ipst->ips_igmp_deferred_next = INFINITY; 6841 mutex_exit(&ipst->ips_igmp_timer_lock); 6842 6843 if (next != INFINITY) 6844 igmp_start_timers(next, ipst); 6845 6846 mutex_enter(&ipst->ips_mld_timer_lock); 6847 next = ipst->ips_mld_deferred_next; 6848 ipst->ips_mld_deferred_next = INFINITY; 6849 mutex_exit(&ipst->ips_mld_timer_lock); 6850 6851 if (next != INFINITY) 6852 mld_start_timers(next, ipst); 6853 } 6854 6855 /* 6856 * Start the current exclusive operation on `ipsq'; associate it with `ipif' 6857 * and `ioccmd'. 6858 */ 6859 void 6860 ipsq_current_start(ipsq_t *ipsq, ipif_t *ipif, int ioccmd) 6861 { 6862 ill_t *ill = ipif->ipif_ill; 6863 ipxop_t *ipx = ipsq->ipsq_xop; 6864 6865 ASSERT(IAM_WRITER_IPSQ(ipsq)); 6866 ASSERT(ipx->ipx_current_ipif == NULL); 6867 ASSERT(ipx->ipx_current_ioctl == 0); 6868 6869 ipx->ipx_current_done = B_FALSE; 6870 ipx->ipx_current_ioctl = ioccmd; 6871 mutex_enter(&ipx->ipx_lock); 6872 ipx->ipx_current_ipif = ipif; 6873 mutex_exit(&ipx->ipx_lock); 6874 6875 /* 6876 * Set IPIF_CHANGING on one or more ipifs associated with the 6877 * current exclusive operation. IPIF_CHANGING prevents any new 6878 * references to the ipif (so that the references will eventually 6879 * drop to zero) and also prevents any "get" operations (e.g., 6880 * SIOCGLIFFLAGS) from being able to access the ipif until the 6881 * operation has completed and the ipif is again in a stable state. 6882 * 6883 * For ioctls, IPIF_CHANGING is set on the ipif associated with the 6884 * ioctl. For internal operations (where ioccmd is zero), all ipifs 6885 * on the ill are marked with IPIF_CHANGING since it's unclear which 6886 * ipifs will be affected. 6887 * 6888 * Note that SIOCLIFREMOVEIF is a special case as it sets 6889 * IPIF_CONDEMNED internally after identifying the right ipif to 6890 * operate on. 6891 */ 6892 switch (ioccmd) { 6893 case SIOCLIFREMOVEIF: 6894 break; 6895 case 0: 6896 mutex_enter(&ill->ill_lock); 6897 ipif = ipif->ipif_ill->ill_ipif; 6898 for (; ipif != NULL; ipif = ipif->ipif_next) 6899 ipif->ipif_state_flags |= IPIF_CHANGING; 6900 mutex_exit(&ill->ill_lock); 6901 break; 6902 default: 6903 mutex_enter(&ill->ill_lock); 6904 ipif->ipif_state_flags |= IPIF_CHANGING; 6905 mutex_exit(&ill->ill_lock); 6906 } 6907 } 6908 6909 /* 6910 * Finish the current exclusive operation on `ipsq'. Usually, this will allow 6911 * the next exclusive operation to begin once we ipsq_exit(). However, if 6912 * pending DLPI operations remain, then we will wait for the queue to drain 6913 * before allowing the next exclusive operation to begin. This ensures that 6914 * DLPI operations from one exclusive operation are never improperly processed 6915 * as part of a subsequent exclusive operation. 6916 */ 6917 void 6918 ipsq_current_finish(ipsq_t *ipsq) 6919 { 6920 ipxop_t *ipx = ipsq->ipsq_xop; 6921 t_uscalar_t dlpi_pending = DL_PRIM_INVAL; 6922 ipif_t *ipif = ipx->ipx_current_ipif; 6923 6924 ASSERT(IAM_WRITER_IPSQ(ipsq)); 6925 6926 /* 6927 * For SIOCLIFREMOVEIF, the ipif has already been blown away 6928 * (but in that case, IPIF_CHANGING will already be clear and no 6929 * pending DLPI messages can remain).
6930 */ 6931 if (ipx->ipx_current_ioctl != SIOCLIFREMOVEIF) { 6932 ill_t *ill = ipif->ipif_ill; 6933 6934 mutex_enter(&ill->ill_lock); 6935 dlpi_pending = ill->ill_dlpi_pending; 6936 if (ipx->ipx_current_ioctl == 0) { 6937 ipif = ill->ill_ipif; 6938 for (; ipif != NULL; ipif = ipif->ipif_next) 6939 ipif->ipif_state_flags &= ~IPIF_CHANGING; 6940 } else { 6941 ipif->ipif_state_flags &= ~IPIF_CHANGING; 6942 } 6943 mutex_exit(&ill->ill_lock); 6944 } 6945 6946 ASSERT(!ipx->ipx_current_done); 6947 ipx->ipx_current_done = B_TRUE; 6948 ipx->ipx_current_ioctl = 0; 6949 if (dlpi_pending == DL_PRIM_INVAL) { 6950 mutex_enter(&ipx->ipx_lock); 6951 ipx->ipx_current_ipif = NULL; 6952 mutex_exit(&ipx->ipx_lock); 6953 } 6954 } 6955 6956 /* 6957 * The ill is closing. Flush all messages on the ipsq that originated 6958 * from this ill. Usually there won't be any messages on the ipsq_xopq_mphead 6959 * for this ill since ipsq_enter could not have entered until then. 6960 * New messages can't be queued since the CONDEMNED flag is set. 6961 */ 6962 static void 6963 ipsq_flush(ill_t *ill) 6964 { 6965 queue_t *q; 6966 mblk_t *prev; 6967 mblk_t *mp; 6968 mblk_t *mp_next; 6969 ipxop_t *ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop; 6970 6971 ASSERT(IAM_WRITER_ILL(ill)); 6972 6973 /* 6974 * Flush any messages sent up by the driver. 6975 */ 6976 mutex_enter(&ipx->ipx_lock); 6977 for (prev = NULL, mp = ipx->ipx_mphead; mp != NULL; mp = mp_next) { 6978 mp_next = mp->b_next; 6979 q = mp->b_queue; 6980 if (q == ill->ill_rq || q == ill->ill_wq) { 6981 /* dequeue mp */ 6982 if (prev == NULL) 6983 ipx->ipx_mphead = mp->b_next; 6984 else 6985 prev->b_next = mp->b_next; 6986 if (ipx->ipx_mptail == mp) { 6987 ASSERT(mp_next == NULL); 6988 ipx->ipx_mptail = prev; 6989 } 6990 inet_freemsg(mp); 6991 } else { 6992 prev = mp; 6993 } 6994 } 6995 mutex_exit(&ipx->ipx_lock); 6996 (void) ipsq_pending_mp_cleanup(ill, NULL); 6997 ipsq_xopq_mp_cleanup(ill, NULL); 6998 } 6999 7000 /* 7001 * Parse an ifreq or lifreq struct coming down in an ioctl, then refhold 7002 * and return the associated ipif. 7003 * Return value: 7004 * Nonzero: An error has occurred; ci may not be filled out. 7005 * Zero: ci is filled out with the ioctl cmd in ci.ci_name, and 7006 * a held ipif in ci.ci_ipif. 7007 */ 7008 int 7009 ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, 7010 cmd_info_t *ci) 7011 { 7012 char *name; 7013 struct ifreq *ifr; 7014 struct lifreq *lifr; 7015 ipif_t *ipif = NULL; 7016 ill_t *ill; 7017 conn_t *connp; 7018 boolean_t isv6; 7019 boolean_t exists; 7020 mblk_t *mp1; 7021 zoneid_t zoneid; 7022 ip_stack_t *ipst; 7023 7024 if (q->q_next != NULL) { 7025 ill = (ill_t *)q->q_ptr; 7026 isv6 = ill->ill_isv6; 7027 connp = NULL; 7028 zoneid = ALL_ZONES; 7029 ipst = ill->ill_ipst; 7030 } else { 7031 ill = NULL; 7032 connp = Q_TO_CONN(q); 7033 isv6 = (connp->conn_family == AF_INET6); 7034 zoneid = connp->conn_zoneid; 7035 if (zoneid == GLOBAL_ZONEID) { 7036 /* global zone can access ipifs in all zones */ 7037 zoneid = ALL_ZONES; 7038 } 7039 ipst = connp->conn_netstack->netstack_ip; 7040 } 7041 7042 /* Has been checked in ip_wput_nondata */ 7043 mp1 = mp->b_cont->b_cont; 7044 7045 if (ipip->ipi_cmd_type == IF_CMD) { 7046 /* This is an old-style SIOC[GS]IF* command */ 7047 ifr = (struct ifreq *)mp1->b_rptr; 7048 /* 7049 * Null terminate the string to protect against buffer 7050 * overrun. String was generated by user code and may not 7051 * be trusted.
7052 */ 7053 ifr->ifr_name[IFNAMSIZ - 1] = '\0'; 7054 name = ifr->ifr_name; 7055 ci->ci_sin = (sin_t *)&ifr->ifr_addr; 7056 ci->ci_sin6 = NULL; 7057 ci->ci_lifr = (struct lifreq *)ifr; 7058 } else { 7059 /* This is a new-style SIOC[GS]LIF* command */ 7060 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 7061 lifr = (struct lifreq *)mp1->b_rptr; 7062 /* 7063 * Null terminate the string to protect against buffer 7064 * overrun. String was generated by user code and may not 7065 * be trusted. 7066 */ 7067 lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; 7068 name = lifr->lifr_name; 7069 ci->ci_sin = (sin_t *)&lifr->lifr_addr; 7070 ci->ci_sin6 = (sin6_t *)&lifr->lifr_addr; 7071 ci->ci_lifr = lifr; 7072 } 7073 7074 if (ipip->ipi_cmd == SIOCSLIFNAME) { 7075 /* 7076 * The ioctl will fail if it comes down 7077 * a conn stream 7078 */ 7079 if (ill == NULL) { 7080 /* 7081 * Not an ill queue, so fail the 7082 * ioctl with ENXIO. 7083 */ 7084 return (ENXIO); 7085 } 7086 ipif = ill->ill_ipif; 7087 ipif_refhold(ipif); 7088 } else { 7089 ipif = ipif_lookup_on_name(name, mi_strlen(name), B_FALSE, 7090 &exists, isv6, zoneid, ipst); 7091 7092 /* 7093 * Ensure that get ioctls don't see any internal state changes 7094 * caused by set ioctls by deferring them if IPIF_CHANGING is 7095 * set. 7096 */ 7097 if (ipif != NULL && !(ipip->ipi_flags & IPI_WR) && 7098 !IAM_WRITER_IPIF(ipif)) { 7099 ipsq_t *ipsq; 7100 7101 if (connp != NULL) 7102 mutex_enter(&connp->conn_lock); 7103 mutex_enter(&ipif->ipif_ill->ill_lock); 7104 if (IPIF_IS_CHANGING(ipif) && 7105 !IPIF_IS_CONDEMNED(ipif)) { 7106 ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq; 7107 mutex_enter(&ipsq->ipsq_lock); 7108 mutex_enter(&ipsq->ipsq_xop->ipx_lock); 7109 mutex_exit(&ipif->ipif_ill->ill_lock); 7110 ipsq_enq(ipsq, q, mp, ip_process_ioctl, 7111 NEW_OP, ipif->ipif_ill); 7112 mutex_exit(&ipsq->ipsq_xop->ipx_lock); 7113 mutex_exit(&ipsq->ipsq_lock); 7114 if (connp != NULL) 7115 mutex_exit(&connp->conn_lock); 7116 ipif_refrele(ipif); 7117 return (EINPROGRESS); 7118 } 7119 mutex_exit(&ipif->ipif_ill->ill_lock); 7120 if (connp != NULL) 7121 mutex_exit(&connp->conn_lock); 7122 } 7123 } 7124 7125 /* 7126 * Old-style SIOC[GS]IF* commands do not admit an IPv6 ipif 7127 */ 7128 if (ipif != NULL && ipif->ipif_isv6 && ipip->ipi_cmd_type == IF_CMD) { 7129 ipif_refrele(ipif); 7130 return (ENXIO); 7131 } 7132 7133 if (ipif == NULL && ill != NULL && ill->ill_ipif != NULL && 7134 name[0] == '\0') { 7135 /* 7136 * Handle an SIOC?IF* ioctl with a null name 7137 * during plumb (on the ill queue before the I_PLINK). 7138 */ 7139 ipif = ill->ill_ipif; 7140 ipif_refhold(ipif); 7141 } 7142 7143 if (ipif == NULL) 7144 return (ENXIO); 7145 7146 DTRACE_PROBE4(ipif__ioctl, char *, "ip_extract_lifreq", 7147 int, ipip->ipi_cmd, ill_t *, ipif->ipif_ill, ipif_t *, ipif); 7148 7149 ci->ci_ipif = ipif; 7150 return (0); 7151 } 7152 7153 /* 7154 * Return the total number of IPv4 ipifs visible in `zoneid', skipping ills that are underlying interfaces in an IPMP group.
7155 */ 7156 static uint_t 7157 ip_get_numifs(zoneid_t zoneid, ip_stack_t *ipst) 7158 { 7159 uint_t numifs = 0; 7160 ill_t *ill; 7161 ill_walk_context_t ctx; 7162 ipif_t *ipif; 7163 7164 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7165 ill = ILL_START_WALK_V4(&ctx, ipst); 7166 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 7167 if (IS_UNDER_IPMP(ill)) 7168 continue; 7169 for (ipif = ill->ill_ipif; ipif != NULL; 7170 ipif = ipif->ipif_next) { 7171 if (ipif->ipif_zoneid == zoneid || 7172 ipif->ipif_zoneid == ALL_ZONES) 7173 numifs++; 7174 } 7175 } 7176 rw_exit(&ipst->ips_ill_g_lock); 7177 return (numifs); 7178 } 7179 7180 /* 7181 * Return the number of logical interfaces that match the given family, lifn_flags, and zoneid criteria. 7182 */ 7183 static uint_t 7184 ip_get_numlifs(int family, int lifn_flags, zoneid_t zoneid, ip_stack_t *ipst) 7185 { 7186 uint_t numifs = 0; 7187 ill_t *ill; 7188 ipif_t *ipif; 7189 ill_walk_context_t ctx; 7190 7191 ip1dbg(("ip_get_numlifs(%d %u %d)\n", family, lifn_flags, (int)zoneid)); 7192 7193 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7194 if (family == AF_INET) 7195 ill = ILL_START_WALK_V4(&ctx, ipst); 7196 else if (family == AF_INET6) 7197 ill = ILL_START_WALK_V6(&ctx, ipst); 7198 else 7199 ill = ILL_START_WALK_ALL(&ctx, ipst); 7200 7201 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 7202 if (IS_UNDER_IPMP(ill) && !(lifn_flags & LIFC_UNDER_IPMP)) 7203 continue; 7204 7205 for (ipif = ill->ill_ipif; ipif != NULL; 7206 ipif = ipif->ipif_next) { 7207 if ((ipif->ipif_flags & IPIF_NOXMIT) && 7208 !(lifn_flags & LIFC_NOXMIT)) 7209 continue; 7210 if ((ipif->ipif_flags & IPIF_TEMPORARY) && 7211 !(lifn_flags & LIFC_TEMPORARY)) 7212 continue; 7213 if (((ipif->ipif_flags & 7214 (IPIF_NOXMIT|IPIF_NOLOCAL| 7215 IPIF_DEPRECATED)) || 7216 IS_LOOPBACK(ill) || 7217 !(ipif->ipif_flags & IPIF_UP)) && 7218 (lifn_flags & LIFC_EXTERNAL_SOURCE)) 7219 continue; 7220 7221 if (zoneid != ipif->ipif_zoneid && 7222 ipif->ipif_zoneid != ALL_ZONES && 7223 (zoneid != GLOBAL_ZONEID || 7224 !(lifn_flags & LIFC_ALLZONES))) 7225 continue; 7226 7227 numifs++; 7228 } 7229 } 7230 rw_exit(&ipst->ips_ill_g_lock); 7231 return (numifs); 7232 } 7233 7234 uint_t 7235 ip_get_lifsrcofnum(ill_t *ill) 7236 { 7237 uint_t numifs = 0; 7238 ill_t *ill_head = ill; 7239 ip_stack_t *ipst = ill->ill_ipst; 7240 7241 /* 7242 * ill_g_usesrc_lock protects ill_usesrc_grp_next; without it, some 7243 * other thread could be relinking the ILLs in this usesrc group 7244 * and adjusting the ill_usesrc_grp_next pointers. 7245 */ 7246 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER); 7247 if ((ill->ill_usesrc_ifindex == 0) && 7248 (ill->ill_usesrc_grp_next != NULL)) { 7249 for (; (ill != NULL) && (ill->ill_usesrc_grp_next != ill_head); 7250 ill = ill->ill_usesrc_grp_next) 7251 numifs++; 7252 } 7253 rw_exit(&ipst->ips_ill_g_usesrc_lock); 7254 7255 return (numifs); 7256 } 7257 7258 /* Null values are passed in for ipif, sin, and ifreq */ 7259 /* ARGSUSED */ 7260 int 7261 ip_sioctl_get_ifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 7262 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7263 { 7264 int *nump; 7265 conn_t *connp = Q_TO_CONN(q); 7266 7267 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 7268 7269 /* Existence of b_cont->b_cont checked in ip_wput_nondata */ 7270 nump = (int *)mp->b_cont->b_cont->b_rptr; 7271 7272 *nump = ip_get_numifs(connp->conn_zoneid, 7273 connp->conn_netstack->netstack_ip); 7274 ip1dbg(("ip_sioctl_get_ifnum numifs %d", *nump)); 7275 return (0); 7276 } 7277 7278 /* Null values are passed in for ipif, sin, and ifreq */ 7279 /*
ARGSUSED */ 7280 int 7281 ip_sioctl_get_lifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, 7282 queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7283 { 7284 struct lifnum *lifn; 7285 mblk_t *mp1; 7286 conn_t *connp = Q_TO_CONN(q); 7287 7288 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 7289 7290 /* Existence checked in ip_wput_nondata */ 7291 mp1 = mp->b_cont->b_cont; 7292 7293 lifn = (struct lifnum *)mp1->b_rptr; 7294 switch (lifn->lifn_family) { 7295 case AF_UNSPEC: 7296 case AF_INET: 7297 case AF_INET6: 7298 break; 7299 default: 7300 return (EAFNOSUPPORT); 7301 } 7302 7303 lifn->lifn_count = ip_get_numlifs(lifn->lifn_family, lifn->lifn_flags, 7304 connp->conn_zoneid, connp->conn_netstack->netstack_ip); 7305 ip1dbg(("ip_sioctl_get_lifnum numifs %d", lifn->lifn_count)); 7306 return (0); 7307 } 7308 7309 /* ARGSUSED */ 7310 int 7311 ip_sioctl_get_ifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 7312 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7313 { 7314 STRUCT_HANDLE(ifconf, ifc); 7315 mblk_t *mp1; 7316 struct iocblk *iocp; 7317 struct ifreq *ifr; 7318 ill_walk_context_t ctx; 7319 ill_t *ill; 7320 ipif_t *ipif; 7321 struct sockaddr_in *sin; 7322 int32_t ifclen; 7323 zoneid_t zoneid; 7324 ip_stack_t *ipst = CONNQ_TO_IPST(q); 7325 7326 ASSERT(q->q_next == NULL); /* not valid ioctls for ip as a module */ 7327 7328 ip1dbg(("ip_sioctl_get_ifconf")); 7329 /* Existence verified in ip_wput_nondata */ 7330 mp1 = mp->b_cont->b_cont; 7331 iocp = (struct iocblk *)mp->b_rptr; 7332 zoneid = Q_TO_CONN(q)->conn_zoneid; 7333 7334 /* 7335 * The original SIOCGIFCONF passed in a struct ifconf which specified 7336 * the user buffer address and length into which the list of struct 7337 * ifreqs was to be copied. Since AT&T Streams does not seem to 7338 * allow M_COPYOUT to be used in conjunction with I_STR IOCTLS, 7339 * the SIOCGIFCONF operation was redefined to simply provide 7340 * a large output buffer into which we are supposed to jam the ifreq 7341 * array. The same ioctl command code was used, despite the fact that 7342 * both the applications and the kernel code had to change, thus making 7343 * it impossible to support both interfaces. 7344 * 7345 * For reasons not good enough to try to explain, the following 7346 * algorithm is used for deciding what to do with one of these: 7347 * If the IOCTL comes in as an I_STR, it is assumed to be of the new 7348 * form with the output buffer coming down as the continuation message. 7349 * If it arrives as a TRANSPARENT IOCTL, it is assumed to be old style, 7350 * and we have to copy in the ifconf structure to find out how big the 7351 * output buffer is and where to copy out to. Sure no problem... 7352 * 7353 */ 7354 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, NULL); 7355 if ((mp1->b_wptr - mp1->b_rptr) == STRUCT_SIZE(ifc)) { 7356 int numifs = 0; 7357 size_t ifc_bufsize; 7358 7359 /* 7360 * Must be (better be!) continuation of a TRANSPARENT 7361 * IOCTL. We just copied in the ifconf structure. 7362 */ 7363 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, 7364 (struct ifconf *)mp1->b_rptr); 7365 7366 /* 7367 * Allocate a buffer to hold requested information. 7368 * 7369 * If ifc_len is larger than what is needed, we only 7370 * allocate what we will use. 7371 * 7372 * If ifc_len is smaller than what is needed, return 7373 * EINVAL. 7374 * 7375 * XXX: the ill_t structure can have 2 counters, for 7376 * v4 and v6 (not just ill_ipif_up_count) to store the 7377 * number of interfaces for a device, so we don't need 7378 * to count them here...
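 *
 * For reference, a minimal user-level caller of the transparent form
 * would look like this (an illustrative sketch, not part of this
 * file):
 *
 *	struct ifconf ifc;
 *	char buf[8192];
 *	ifc.ifc_len = sizeof (buf);
 *	ifc.ifc_buf = buf;
 *	if (ioctl(s, SIOCGIFCONF, (char *)&ifc) >= 0)
 *		(ifc.ifc_len is now the number of bytes filled in)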
7379 */ 7380 numifs = ip_get_numifs(zoneid, ipst); 7381 7382 ifclen = STRUCT_FGET(ifc, ifc_len); 7383 ifc_bufsize = numifs * sizeof (struct ifreq); 7384 if (ifc_bufsize > ifclen) { 7385 if (iocp->ioc_cmd == O_SIOCGIFCONF) { 7386 /* old behaviour */ 7387 return (EINVAL); 7388 } else { 7389 ifc_bufsize = ifclen; 7390 } 7391 } 7392 7393 mp1 = mi_copyout_alloc(q, mp, 7394 STRUCT_FGETP(ifc, ifc_buf), ifc_bufsize, B_FALSE); 7395 if (mp1 == NULL) 7396 return (ENOMEM); 7397 7398 mp1->b_wptr = mp1->b_rptr + ifc_bufsize; 7399 } 7400 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr); 7401 /* 7402 * the SIOCGIFCONF ioctl only knows about 7403 * IPv4 addresses, so don't try to tell 7404 * it about interfaces with IPv6-only 7405 * addresses. (Last parm 'isv6' is B_FALSE) 7406 */ 7407 7408 ifr = (struct ifreq *)mp1->b_rptr; 7409 7410 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7411 ill = ILL_START_WALK_V4(&ctx, ipst); 7412 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 7413 if (IS_UNDER_IPMP(ill)) 7414 continue; 7415 for (ipif = ill->ill_ipif; ipif != NULL; 7416 ipif = ipif->ipif_next) { 7417 if (zoneid != ipif->ipif_zoneid && 7418 ipif->ipif_zoneid != ALL_ZONES) 7419 continue; 7420 if ((uchar_t *)&ifr[1] > mp1->b_wptr) { 7421 if (iocp->ioc_cmd == O_SIOCGIFCONF) { 7422 /* old behaviour */ 7423 rw_exit(&ipst->ips_ill_g_lock); 7424 return (EINVAL); 7425 } else { 7426 goto if_copydone; 7427 } 7428 } 7429 ipif_get_name(ipif, ifr->ifr_name, 7430 sizeof (ifr->ifr_name)); 7431 sin = (sin_t *)&ifr->ifr_addr; 7432 *sin = sin_null; 7433 sin->sin_family = AF_INET; 7434 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 7435 ifr++; 7436 } 7437 } 7438 if_copydone: 7439 rw_exit(&ipst->ips_ill_g_lock); 7440 mp1->b_wptr = (uchar_t *)ifr; 7441 7442 if (STRUCT_BUF(ifc) != NULL) { 7443 STRUCT_FSET(ifc, ifc_len, 7444 (int)((uchar_t *)ifr - mp1->b_rptr)); 7445 } 7446 return (0); 7447 } 7448 7449 /* 7450 * Get the interfaces that use the address hosted on the interface passed 7451 * in as a source address. 7452 */ 7453 /* ARGSUSED */ 7454 int 7455 ip_sioctl_get_lifsrcof(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 7456 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7457 { 7458 mblk_t *mp1; 7459 ill_t *ill, *ill_head; 7460 ipif_t *ipif, *orig_ipif; 7461 int numlifs = 0; 7462 size_t lifs_bufsize, lifsmaxlen; 7463 struct lifreq *lifr; 7464 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 7465 uint_t ifindex; 7466 zoneid_t zoneid; 7467 boolean_t isv6 = B_FALSE; 7468 struct sockaddr_in *sin; 7469 struct sockaddr_in6 *sin6; 7470 STRUCT_HANDLE(lifsrcof, lifs); 7471 ip_stack_t *ipst; 7472 7473 ipst = CONNQ_TO_IPST(q); 7474 7475 ASSERT(q->q_next == NULL); 7476 7477 zoneid = Q_TO_CONN(q)->conn_zoneid; 7478 7479 /* Existence verified in ip_wput_nondata */ 7480 mp1 = mp->b_cont->b_cont; 7481 7482 /* 7483 * Must be (better be!) continuation of a TRANSPARENT 7484 * IOCTL. We just copied in the lifsrcof structure.
7485 */ 7486 STRUCT_SET_HANDLE(lifs, iocp->ioc_flag, 7487 (struct lifsrcof *)mp1->b_rptr); 7488 7489 if (MBLKL(mp1) != STRUCT_SIZE(lifs)) 7490 return (EINVAL); 7491 7492 ifindex = STRUCT_FGET(lifs, lifs_ifindex); 7493 isv6 = (Q_TO_CONN(q))->conn_family == AF_INET6; 7494 ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid, ipst); 7495 if (ipif == NULL) { 7496 ip1dbg(("ip_sioctl_get_lifsrcof: no ipif for ifindex %d\n", 7497 ifindex)); 7498 return (ENXIO); 7499 } 7500 7501 /* Allocate a buffer to hold requested information */ 7502 numlifs = ip_get_lifsrcofnum(ipif->ipif_ill); 7503 lifs_bufsize = numlifs * sizeof (struct lifreq); 7504 lifsmaxlen = STRUCT_FGET(lifs, lifs_maxlen); 7505 /* The actual size needed is always returned in lifs_len */ 7506 STRUCT_FSET(lifs, lifs_len, lifs_bufsize); 7507 7508 /* If the amount we need is more than what is passed in, abort */ 7509 if (lifs_bufsize > lifsmaxlen || lifs_bufsize == 0) { 7510 ipif_refrele(ipif); 7511 return (0); 7512 } 7513 7514 mp1 = mi_copyout_alloc(q, mp, 7515 STRUCT_FGETP(lifs, lifs_buf), lifs_bufsize, B_FALSE); 7516 if (mp1 == NULL) { 7517 ipif_refrele(ipif); 7518 return (ENOMEM); 7519 } 7520 7521 mp1->b_wptr = mp1->b_rptr + lifs_bufsize; 7522 bzero(mp1->b_rptr, lifs_bufsize); 7523 7524 lifr = (struct lifreq *)mp1->b_rptr; 7525 7526 ill = ill_head = ipif->ipif_ill; 7527 orig_ipif = ipif; 7528 7529 /* ill_g_usesrc_lock protects ill_usesrc_grp_next */ 7530 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER); 7531 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7532 7533 ill = ill->ill_usesrc_grp_next; /* start from next ill */ 7534 for (; (ill != NULL) && (ill != ill_head); 7535 ill = ill->ill_usesrc_grp_next) { 7536 7537 if ((uchar_t *)&lifr[1] > mp1->b_wptr) 7538 break; 7539 7540 ipif = ill->ill_ipif; 7541 ipif_get_name(ipif, lifr->lifr_name, sizeof (lifr->lifr_name)); 7542 if (ipif->ipif_isv6) { 7543 sin6 = (sin6_t *)&lifr->lifr_addr; 7544 *sin6 = sin6_null; 7545 sin6->sin6_family = AF_INET6; 7546 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 7547 lifr->lifr_addrlen = ip_mask_to_plen_v6( 7548 &ipif->ipif_v6net_mask); 7549 } else { 7550 sin = (sin_t *)&lifr->lifr_addr; 7551 *sin = sin_null; 7552 sin->sin_family = AF_INET; 7553 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 7554 lifr->lifr_addrlen = ip_mask_to_plen( 7555 ipif->ipif_net_mask); 7556 } 7557 lifr++; 7558 } 7559 rw_exit(&ipst->ips_ill_g_usesrc_lock); 7560 rw_exit(&ipst->ips_ill_g_lock); 7561 ipif_refrele(orig_ipif); 7562 mp1->b_wptr = (uchar_t *)lifr; 7563 STRUCT_FSET(lifs, lifs_len, (int)((uchar_t *)lifr - mp1->b_rptr)); 7564 7565 return (0); 7566 } 7567 7568 /* ARGSUSED */ 7569 int 7570 ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 7571 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7572 { 7573 mblk_t *mp1; 7574 int list; 7575 ill_t *ill; 7576 ipif_t *ipif; 7577 int flags; 7578 int numlifs = 0; 7579 size_t lifc_bufsize; 7580 struct lifreq *lifr; 7581 sa_family_t family; 7582 struct sockaddr_in *sin; 7583 struct sockaddr_in6 *sin6; 7584 ill_walk_context_t ctx; 7585 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 7586 int32_t lifclen; 7587 zoneid_t zoneid; 7588 STRUCT_HANDLE(lifconf, lifc); 7589 ip_stack_t *ipst = CONNQ_TO_IPST(q); 7590 7591 ip1dbg(("ip_sioctl_get_lifconf")); 7592 7593 ASSERT(q->q_next == NULL); 7594 7595 zoneid = Q_TO_CONN(q)->conn_zoneid; 7596 7597 /* Existence verified in ip_wput_nondata */ 7598 mp1 = mp->b_cont->b_cont; 7599 7600 /* 7601 * An extended version of SIOCGIFCONF that takes an 7602 * additional address family and flags field. 
AF_UNSPEC retrieves both IPv4 and IPv6. 7604 * Unless LIFC_NOXMIT is specified the IPIF_NOXMIT 7605 * interfaces are omitted. 7606 * Similarly, IPIF_TEMPORARY interfaces are omitted 7607 * unless LIFC_TEMPORARY is specified. 7608 * If LIFC_EXTERNAL_SOURCE is specified, IPIF_NOXMIT, 7609 * IPIF_NOLOCAL, PHYI_LOOPBACK, IPIF_DEPRECATED and 7610 * not IPIF_UP interfaces are omitted. LIFC_EXTERNAL_SOURCE 7611 * has priority over LIFC_NOXMIT. 7612 */ 7613 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, NULL); 7614 7615 if ((mp1->b_wptr - mp1->b_rptr) != STRUCT_SIZE(lifc)) 7616 return (EINVAL); 7617 7618 /* 7619 * Must be (better be!) continuation of a TRANSPARENT 7620 * IOCTL. We just copied in the lifconf structure. 7621 */ 7622 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, (struct lifconf *)mp1->b_rptr); 7623 7624 family = STRUCT_FGET(lifc, lifc_family); 7625 flags = STRUCT_FGET(lifc, lifc_flags); 7626 7627 switch (family) { 7628 case AF_UNSPEC: 7629 /* 7630 * walk all ILL's. 7631 */ 7632 list = MAX_G_HEADS; 7633 break; 7634 case AF_INET: 7635 /* 7636 * walk only IPV4 ILL's. 7637 */ 7638 list = IP_V4_G_HEAD; 7639 break; 7640 case AF_INET6: 7641 /* 7642 * walk only IPV6 ILL's. 7643 */ 7644 list = IP_V6_G_HEAD; 7645 break; 7646 default: 7647 return (EAFNOSUPPORT); 7648 } 7649 7650 /* 7651 * Allocate a buffer to hold requested information. 7652 * 7653 * If lifc_len is larger than what is needed, we only 7654 * allocate what we will use. 7655 * 7656 * If lifc_len is smaller than what is needed, return 7657 * EINVAL. 7658 */ 7659 numlifs = ip_get_numlifs(family, flags, zoneid, ipst); 7660 lifc_bufsize = numlifs * sizeof (struct lifreq); 7661 lifclen = STRUCT_FGET(lifc, lifc_len); 7662 if (lifc_bufsize > lifclen) { 7663 if (iocp->ioc_cmd == O_SIOCGLIFCONF) 7664 return (EINVAL); 7665 else 7666 lifc_bufsize = lifclen; 7667 } 7668 7669 mp1 = mi_copyout_alloc(q, mp, 7670 STRUCT_FGETP(lifc, lifc_buf), lifc_bufsize, B_FALSE); 7671 if (mp1 == NULL) 7672 return (ENOMEM); 7673 7674 mp1->b_wptr = mp1->b_rptr + lifc_bufsize; 7675 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr); 7676 7677 lifr = (struct lifreq *)mp1->b_rptr; 7678 7679 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7680 ill = ill_first(list, list, &ctx, ipst); 7681 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 7682 if (IS_UNDER_IPMP(ill) && !(flags & LIFC_UNDER_IPMP)) 7683 continue; 7684 7685 for (ipif = ill->ill_ipif; ipif != NULL; 7686 ipif = ipif->ipif_next) { 7687 if ((ipif->ipif_flags & IPIF_NOXMIT) && 7688 !(flags & LIFC_NOXMIT)) 7689 continue; 7690 7691 if ((ipif->ipif_flags & IPIF_TEMPORARY) && 7692 !(flags & LIFC_TEMPORARY)) 7693 continue; 7694 7695 if (((ipif->ipif_flags & 7696 (IPIF_NOXMIT|IPIF_NOLOCAL| 7697 IPIF_DEPRECATED)) || 7698 IS_LOOPBACK(ill) || 7699 !(ipif->ipif_flags & IPIF_UP)) && 7700 (flags & LIFC_EXTERNAL_SOURCE)) 7701 continue; 7702 7703 if (zoneid != ipif->ipif_zoneid && 7704 ipif->ipif_zoneid != ALL_ZONES && 7705 (zoneid != GLOBAL_ZONEID || 7706 !(flags & LIFC_ALLZONES))) 7707 continue; 7708 7709 if ((uchar_t *)&lifr[1] > mp1->b_wptr) { 7710 if (iocp->ioc_cmd == O_SIOCGLIFCONF) { 7711 rw_exit(&ipst->ips_ill_g_lock); 7712 return (EINVAL); 7713 } else { 7714 goto lif_copydone; 7715 } 7716 } 7717 7718 ipif_get_name(ipif, lifr->lifr_name, 7719 sizeof (lifr->lifr_name)); 7720 lifr->lifr_type = ill->ill_type; 7721 if (ipif->ipif_isv6) { 7722 sin6 = (sin6_t *)&lifr->lifr_addr; 7723 *sin6 = sin6_null; 7724 sin6->sin6_family = AF_INET6; 7725 sin6->sin6_addr = 7726 ipif->ipif_v6lcl_addr; 7727 lifr->lifr_addrlen = 7728 ip_mask_to_plen_v6(
7729 &ipif->ipif_v6net_mask); 7730 } else { 7731 sin = (sin_t *)&lifr->lifr_addr; 7732 *sin = sin_null; 7733 sin->sin_family = AF_INET; 7734 sin->sin_addr.s_addr = 7735 ipif->ipif_lcl_addr; 7736 lifr->lifr_addrlen = 7737 ip_mask_to_plen( 7738 ipif->ipif_net_mask); 7739 } 7740 lifr++; 7741 } 7742 } 7743 lif_copydone: 7744 rw_exit(&ipst->ips_ill_g_lock); 7745 7746 mp1->b_wptr = (uchar_t *)lifr; 7747 if (STRUCT_BUF(lifc) != NULL) { 7748 STRUCT_FSET(lifc, lifc_len, 7749 (int)((uchar_t *)lifr - mp1->b_rptr)); 7750 } 7751 return (0); 7752 } 7753 7754 static void 7755 ip_sioctl_ip6addrpolicy(queue_t *q, mblk_t *mp) 7756 { 7757 ip6_asp_t *table; 7758 size_t table_size; 7759 mblk_t *data_mp; 7760 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 7761 ip_stack_t *ipst; 7762 7763 if (q->q_next == NULL) 7764 ipst = CONNQ_TO_IPST(q); 7765 else 7766 ipst = ILLQ_TO_IPST(q); 7767 7768 /* These two ioctls are I_STR only */ 7769 if (iocp->ioc_count == TRANSPARENT) { 7770 miocnak(q, mp, 0, EINVAL); 7771 return; 7772 } 7773 7774 data_mp = mp->b_cont; 7775 if (data_mp == NULL) { 7776 /* The user passed us a NULL argument */ 7777 table = NULL; 7778 table_size = iocp->ioc_count; 7779 } else { 7780 /* 7781 * The user provided a table. The stream head 7782 * may have copied in the user data in chunks, 7783 * so make sure everything is pulled up 7784 * properly. 7785 */ 7786 if (MBLKL(data_mp) < iocp->ioc_count) { 7787 mblk_t *new_data_mp; 7788 if ((new_data_mp = msgpullup(data_mp, -1)) == 7789 NULL) { 7790 miocnak(q, mp, 0, ENOMEM); 7791 return; 7792 } 7793 freemsg(data_mp); 7794 data_mp = new_data_mp; 7795 mp->b_cont = data_mp; 7796 } 7797 table = (ip6_asp_t *)data_mp->b_rptr; 7798 table_size = iocp->ioc_count; 7799 } 7800 7801 switch (iocp->ioc_cmd) { 7802 case SIOCGIP6ADDRPOLICY: 7803 iocp->ioc_rval = ip6_asp_get(table, table_size, ipst); 7804 if (iocp->ioc_rval == -1) 7805 iocp->ioc_error = EINVAL; 7806 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4 7807 else if (table != NULL && 7808 (iocp->ioc_flag & IOC_MODELS) == IOC_ILP32) { 7809 ip6_asp_t *src = table; 7810 ip6_asp32_t *dst = (void *)table; 7811 int count = table_size / sizeof (ip6_asp_t); 7812 int i; 7813 7814 /* 7815 * We need to do an in-place shrink of the array 7816 * to match the alignment attributes of the 7817 * 32-bit ABI looking at it. 7818 */ 7819 /* LINTED: logical expression always true: op "||" */ 7820 ASSERT(sizeof (*src) > sizeof (*dst)); 7821 for (i = 1; i < count; i++) 7822 bcopy(src + i, dst + i, sizeof (*dst)); 7823 } 7824 #endif 7825 break; 7826 7827 case SIOCSIP6ADDRPOLICY: 7828 ASSERT(mp->b_prev == NULL); 7829 mp->b_prev = (void *)q; 7830 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4 7831 /* 7832 * We pass in the datamodel here so that the ip6_asp_replace() 7833 * routine can handle converting from 32-bit to native formats 7834 * where necessary. 7835 * 7836 * A better way to handle this might be to convert the inbound 7837 * data structure here, and hang it off a new 'mp'; thus the 7838 * ip6_asp_replace() logic would always be dealing with native 7839 * format data structures.. 7840 * 7841 * (An even simpler way to handle these ioctls is to just 7842 * add a 32-bit trailing 'pad' field to the ip6_asp_t structure 7843 * and just recompile everything that depends on it.) 7844 */ 7845 #endif 7846 ip6_asp_replace(mp, table, table_size, B_FALSE, ipst, 7847 iocp->ioc_flag & IOC_MODELS); 7848 return; 7849 } 7850 7851 DB_TYPE(mp) = (iocp->ioc_error == 0) ? 
M_IOCACK : M_IOCNAK; 7852 qreply(q, mp); 7853 } 7854 7855 static void 7856 ip_sioctl_dstinfo(queue_t *q, mblk_t *mp) 7857 { 7858 mblk_t *data_mp; 7859 struct dstinforeq *dir; 7860 uint8_t *end, *cur; 7861 in6_addr_t *daddr, *saddr; 7862 ipaddr_t v4daddr; 7863 ire_t *ire; 7864 ipaddr_t v4setsrc; 7865 in6_addr_t v6setsrc; 7866 char *slabel, *dlabel; 7867 boolean_t isipv4; 7868 int match_ire; 7869 ill_t *dst_ill; 7870 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 7871 conn_t *connp = Q_TO_CONN(q); 7872 zoneid_t zoneid = IPCL_ZONEID(connp); 7873 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 7874 uint64_t ipif_flags; 7875 7876 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 7877 7878 /* 7879 * This ioctl is I_STR only, and must have a 7880 * data mblk following the M_IOCTL mblk. 7881 */ 7882 data_mp = mp->b_cont; 7883 if (iocp->ioc_count == TRANSPARENT || data_mp == NULL) { 7884 miocnak(q, mp, 0, EINVAL); 7885 return; 7886 } 7887 7888 if (MBLKL(data_mp) < iocp->ioc_count) { 7889 mblk_t *new_data_mp; 7890 7891 if ((new_data_mp = msgpullup(data_mp, -1)) == NULL) { 7892 miocnak(q, mp, 0, ENOMEM); 7893 return; 7894 } 7895 freemsg(data_mp); 7896 data_mp = new_data_mp; 7897 mp->b_cont = data_mp; 7898 } 7899 match_ire = MATCH_IRE_DSTONLY; 7900 7901 for (cur = data_mp->b_rptr, end = data_mp->b_wptr; 7902 end - cur >= sizeof (struct dstinforeq); 7903 cur += sizeof (struct dstinforeq)) { 7904 dir = (struct dstinforeq *)cur; 7905 daddr = &dir->dir_daddr; 7906 saddr = &dir->dir_saddr; 7907 7908 /* 7909 * ip_addr_scope_v6() and ip6_asp_lookup() handle 7910 * v4 mapped addresses; ire_ftable_lookup_v6() 7911 * and ip_select_source_v6() do not. 7912 */ 7913 dir->dir_dscope = ip_addr_scope_v6(daddr); 7914 dlabel = ip6_asp_lookup(daddr, &dir->dir_precedence, ipst); 7915 7916 isipv4 = IN6_IS_ADDR_V4MAPPED(daddr); 7917 if (isipv4) { 7918 IN6_V4MAPPED_TO_IPADDR(daddr, v4daddr); 7919 v4setsrc = INADDR_ANY; 7920 ire = ire_route_recursive_v4(v4daddr, 0, NULL, zoneid, 7921 NULL, match_ire, B_TRUE, 0, ipst, &v4setsrc, NULL, 7922 NULL); 7923 } else { 7924 v6setsrc = ipv6_all_zeros; 7925 ire = ire_route_recursive_v6(daddr, 0, NULL, zoneid, 7926 NULL, match_ire, B_TRUE, 0, ipst, &v6setsrc, NULL, 7927 NULL); 7928 } 7929 ASSERT(ire != NULL); 7930 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 7931 ire_refrele(ire); 7932 dir->dir_dreachable = 0; 7933 7934 /* move on to next dst addr */ 7935 continue; 7936 } 7937 dir->dir_dreachable = 1; 7938 7939 dst_ill = ire_nexthop_ill(ire); 7940 if (dst_ill == NULL) { 7941 ire_refrele(ire); 7942 continue; 7943 } 7944 7945 /* With ipmp we most likely look at the ipmp ill here */ 7946 dir->dir_dmactype = dst_ill->ill_mactype; 7947 7948 if (isipv4) { 7949 ipaddr_t v4saddr; 7950 7951 if (ip_select_source_v4(dst_ill, v4setsrc, v4daddr, 7952 connp->conn_ixa->ixa_multicast_ifaddr, zoneid, ipst, 7953 &v4saddr, NULL, &ipif_flags) != 0) { 7954 v4saddr = INADDR_ANY; 7955 ipif_flags = 0; 7956 } 7957 IN6_IPADDR_TO_V4MAPPED(v4saddr, saddr); 7958 } else { 7959 if (ip_select_source_v6(dst_ill, &v6setsrc, daddr, 7960 zoneid, ipst, B_FALSE, IPV6_PREFER_SRC_DEFAULT, 7961 saddr, NULL, &ipif_flags) != 0) { 7962 *saddr = ipv6_all_zeros; 7963 ipif_flags = 0; 7964 } 7965 } 7966 7967 dir->dir_sscope = ip_addr_scope_v6(saddr); 7968 slabel = ip6_asp_lookup(saddr, NULL, ipst); 7969 dir->dir_labelmatch = ip6_asp_labelcmp(dlabel, slabel); 7970 dir->dir_sdeprecated = (ipif_flags & IPIF_DEPRECATED) ? 
1 : 0; 7971 ire_refrele(ire); 7972 ill_refrele(dst_ill); 7973 } 7974 miocack(q, mp, iocp->ioc_count, 0); 7975 } 7976 7977 /* 7978 * Check if this is an address assigned to this machine. 7979 * Skips interfaces that are down by using ire checks. 7980 * Translates mapped addresses to v4 addresses and then 7981 * treats them as such, returning true if the v4 address 7982 * associated with this mapped address is configured. 7983 * Note: Applications will have to be careful what they do 7984 * with the response; use of mapped addresses limits 7985 * what can be done with the socket, especially with 7986 * respect to socket options and ioctls - neither IPv4 7987 * options nor IPv6 sticky options/ancillary data options 7988 * may be used. 7989 */ 7990 /* ARGSUSED */ 7991 int 7992 ip_sioctl_tmyaddr(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 7993 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 7994 { 7995 struct sioc_addrreq *sia; 7996 sin_t *sin; 7997 ire_t *ire; 7998 mblk_t *mp1; 7999 zoneid_t zoneid; 8000 ip_stack_t *ipst; 8001 8002 ip1dbg(("ip_sioctl_tmyaddr")); 8003 8004 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 8005 zoneid = Q_TO_CONN(q)->conn_zoneid; 8006 ipst = CONNQ_TO_IPST(q); 8007 8008 /* Existence verified in ip_wput_nondata */ 8009 mp1 = mp->b_cont->b_cont; 8010 sia = (struct sioc_addrreq *)mp1->b_rptr; 8011 sin = (sin_t *)&sia->sa_addr; 8012 switch (sin->sin_family) { 8013 case AF_INET6: { 8014 sin6_t *sin6 = (sin6_t *)sin; 8015 8016 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 8017 ipaddr_t v4_addr; 8018 8019 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, 8020 v4_addr); 8021 ire = ire_ftable_lookup_v4(v4_addr, 0, 0, 8022 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, NULL, 8023 MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL); 8024 } else { 8025 in6_addr_t v6addr; 8026 8027 v6addr = sin6->sin6_addr; 8028 ire = ire_ftable_lookup_v6(&v6addr, 0, 0, 8029 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, NULL, 8030 MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL); 8031 } 8032 break; 8033 } 8034 case AF_INET: { 8035 ipaddr_t v4addr; 8036 8037 v4addr = sin->sin_addr.s_addr; 8038 ire = ire_ftable_lookup_v4(v4addr, 0, 0, 8039 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 8040 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL); 8041 break; 8042 } 8043 default: 8044 return (EAFNOSUPPORT); 8045 } 8046 if (ire != NULL) { 8047 sia->sa_res = 1; 8048 ire_refrele(ire); 8049 } else { 8050 sia->sa_res = 0; 8051 } 8052 return (0); 8053 } 8054 8055 /* 8056 * Check if this is an address assigned on-link, i.e., a neighbor, 8057 * and make sure it's reachable from the current zone. 8058 * Returns true for my addresses as well. 8059 * Translates mapped addresses to v4 addresses and then 8060 * treats them as such, returning true if the v4 address 8061 * associated with this mapped address is configured. 8062 * Note: Applications will have to be careful what they do 8063 * with the response; use of mapped addresses limits 8064 * what can be done with the socket, especially with 8065 * respect to socket options and ioctls - neither IPv4 8066 * options nor IPv6 sticky options/ancillary data options 8067 * may be used.
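 *
 * For illustration only (a hypothetical userland sketch; `s' and
 * `target' are placeholders, not names defined here):
 *	struct sioc_addrreq sar;
 *	struct sockaddr_in *sp = (struct sockaddr_in *)&sar.sa_addr;
 *
 *	sp->sin_family = AF_INET;
 *	sp->sin_addr.s_addr = target;
 *	if (ioctl(s, SIOCTONLINK, &sar) >= 0 && sar.sa_res != 0)
 *		;	/* target is on-link */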
8068 */ 8069 /* ARGSUSED */ 8070 int 8071 ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 8072 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 8073 { 8074 struct sioc_addrreq *sia; 8075 sin_t *sin; 8076 mblk_t *mp1; 8077 ire_t *ire = NULL; 8078 zoneid_t zoneid; 8079 ip_stack_t *ipst; 8080 8081 ip1dbg(("ip_sioctl_tonlink")); 8082 8083 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 8084 zoneid = Q_TO_CONN(q)->conn_zoneid; 8085 ipst = CONNQ_TO_IPST(q); 8086 8087 /* Existence verified in ip_wput_nondata */ 8088 mp1 = mp->b_cont->b_cont; 8089 sia = (struct sioc_addrreq *)mp1->b_rptr; 8090 sin = (sin_t *)&sia->sa_addr; 8091 8092 /* 8093 * We check for IRE_ONLINK and exclude IRE_BROADCAST|IRE_MULTICAST 8094 * to make sure we only look at on-link unicast address. 8095 */ 8096 switch (sin->sin_family) { 8097 case AF_INET6: { 8098 sin6_t *sin6 = (sin6_t *)sin; 8099 8100 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 8101 ipaddr_t v4_addr; 8102 8103 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, 8104 v4_addr); 8105 if (!CLASSD(v4_addr)) { 8106 ire = ire_ftable_lookup_v4(v4_addr, 0, 0, 0, 8107 NULL, zoneid, NULL, MATCH_IRE_DSTONLY, 8108 0, ipst, NULL); 8109 } 8110 } else { 8111 in6_addr_t v6addr; 8112 8113 v6addr = sin6->sin6_addr; 8114 if (!IN6_IS_ADDR_MULTICAST(&v6addr)) { 8115 ire = ire_ftable_lookup_v6(&v6addr, 0, 0, 0, 8116 NULL, zoneid, NULL, MATCH_IRE_DSTONLY, 0, 8117 ipst, NULL); 8118 } 8119 } 8120 break; 8121 } 8122 case AF_INET: { 8123 ipaddr_t v4addr; 8124 8125 v4addr = sin->sin_addr.s_addr; 8126 if (!CLASSD(v4addr)) { 8127 ire = ire_ftable_lookup_v4(v4addr, 0, 0, 0, NULL, 8128 zoneid, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL); 8129 } 8130 break; 8131 } 8132 default: 8133 return (EAFNOSUPPORT); 8134 } 8135 sia->sa_res = 0; 8136 if (ire != NULL) { 8137 ASSERT(!(ire->ire_type & IRE_MULTICAST)); 8138 8139 if ((ire->ire_type & IRE_ONLINK) && 8140 !(ire->ire_type & IRE_BROADCAST)) 8141 sia->sa_res = 1; 8142 ire_refrele(ire); 8143 } 8144 return (0); 8145 } 8146 8147 /* 8148 * TBD: implement when kernel maintains a list of site prefixes. 8149 */ 8150 /* ARGSUSED */ 8151 int 8152 ip_sioctl_tmysite(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 8153 ip_ioctl_cmd_t *ipip, void *ifreq) 8154 { 8155 return (ENXIO); 8156 } 8157 8158 /* ARP IOCTLs. */ 8159 /* ARGSUSED */ 8160 int 8161 ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 8162 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 8163 { 8164 int err; 8165 ipaddr_t ipaddr; 8166 struct iocblk *iocp; 8167 conn_t *connp; 8168 struct arpreq *ar; 8169 struct xarpreq *xar; 8170 int arp_flags, flags, alength; 8171 uchar_t *lladdr; 8172 ip_stack_t *ipst; 8173 ill_t *ill = ipif->ipif_ill; 8174 ill_t *proxy_ill = NULL; 8175 ipmp_arpent_t *entp = NULL; 8176 boolean_t proxyarp = B_FALSE; 8177 boolean_t if_arp_ioctl = B_FALSE; 8178 ncec_t *ncec = NULL; 8179 nce_t *nce; 8180 8181 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 8182 connp = Q_TO_CONN(q); 8183 ipst = connp->conn_netstack->netstack_ip; 8184 iocp = (struct iocblk *)mp->b_rptr; 8185 8186 if (ipip->ipi_cmd_type == XARP_CMD) { 8187 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->XARPREQ_MBLK */ 8188 xar = (struct xarpreq *)mp->b_cont->b_cont->b_rptr; 8189 ar = NULL; 8190 8191 arp_flags = xar->xarp_flags; 8192 lladdr = (uchar_t *)LLADDR(&xar->xarp_ha); 8193 if_arp_ioctl = (xar->xarp_ha.sdl_nlen != 0); 8194 /* 8195 * Validate against user's link layer address length 8196 * input and name and addr length limits.
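 * (In a sockaddr_dl, sdl_data carries the interface name in its
 * first sdl_nlen bytes followed by the sdl_alen-byte link layer
 * address, which is why the combined length is checked against
 * sizeof (sdl_data) below.)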
8197 */ 8198 alength = ill->ill_phys_addr_length; 8199 if (ipip->ipi_cmd == SIOCSXARP) { 8200 if (alength != xar->xarp_ha.sdl_alen || 8201 (alength + xar->xarp_ha.sdl_nlen > 8202 sizeof (xar->xarp_ha.sdl_data))) 8203 return (EINVAL); 8204 } 8205 } else { 8206 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->ARPREQ_MBLK */ 8207 ar = (struct arpreq *)mp->b_cont->b_cont->b_rptr; 8208 xar = NULL; 8209 8210 arp_flags = ar->arp_flags; 8211 lladdr = (uchar_t *)ar->arp_ha.sa_data; 8212 /* 8213 * Theoretically, the sa_family could tell us what link 8214 * layer type this operation is trying to deal with. By 8215 * common usage AF_UNSPEC means ethernet. We'll assume 8216 * any attempt to use the SIOC?ARP ioctls is for ethernet, 8217 * for now. Our new SIOC*XARP ioctls can be used more 8218 * generally. 8219 * 8220 * If the underlying media happens to have a non 6 byte 8221 * address, arp module will fail set/get, but the del 8222 * operation will succeed. 8223 */ 8224 alength = 6; 8225 if ((ipip->ipi_cmd != SIOCDARP) && 8226 (alength != ill->ill_phys_addr_length)) { 8227 return (EINVAL); 8228 } 8229 } 8230 8231 /* Translate ATF* flags to NCE* flags */ 8232 flags = 0; 8233 if (arp_flags & ATF_AUTHORITY) 8234 flags |= NCE_F_AUTHORITY; 8235 if (arp_flags & ATF_PERM) 8236 flags |= NCE_F_NONUD; /* not subject to aging */ 8237 if (arp_flags & ATF_PUBL) 8238 flags |= NCE_F_PUBLISH; 8239 8240 /* 8241 * IPMP ARP special handling: 8242 * 8243 * 1. Since ARP mappings must appear consistent across the group, 8244 * prohibit changing ARP mappings on the underlying interfaces. 8245 * 8246 * 2. Since ARP mappings for IPMP data addresses are maintained by 8247 * IP itself, prohibit changing them. 8248 * 8249 * 3. For proxy ARP, use a functioning hardware address in the group, 8250 * provided one exists. If one doesn't, just add the entry as-is; 8251 * ipmp_illgrp_refresh_arpent() will refresh it if things change. 8252 */ 8253 if (IS_UNDER_IPMP(ill)) { 8254 if (ipip->ipi_cmd != SIOCGARP && ipip->ipi_cmd != SIOCGXARP) 8255 return (EPERM); 8256 } 8257 if (IS_IPMP(ill)) { 8258 ipmp_illgrp_t *illg = ill->ill_grp; 8259 8260 switch (ipip->ipi_cmd) { 8261 case SIOCSARP: 8262 case SIOCSXARP: 8263 proxy_ill = ipmp_illgrp_find_ill(illg, lladdr, alength); 8264 if (proxy_ill != NULL) { 8265 proxyarp = B_TRUE; 8266 if (!ipmp_ill_is_active(proxy_ill)) 8267 proxy_ill = ipmp_illgrp_next_ill(illg); 8268 if (proxy_ill != NULL) 8269 lladdr = proxy_ill->ill_phys_addr; 8270 } 8271 /* FALLTHRU */ 8272 } 8273 } 8274 8275 ipaddr = sin->sin_addr.s_addr; 8276 /* 8277 * don't match across illgrp per case (1) and (2). 8278 * XXX use IS_IPMP(ill) like ndp_sioc_update? 8279 */ 8280 nce = nce_lookup_v4(ill, &ipaddr); 8281 if (nce != NULL) 8282 ncec = nce->nce_common; 8283 8284 switch (iocp->ioc_cmd) { 8285 case SIOCDARP: 8286 case SIOCDXARP: { 8287 /* 8288 * Delete the NCE if any. 8289 */ 8290 if (ncec == NULL) { 8291 iocp->ioc_error = ENXIO; 8292 break; 8293 } 8294 /* Don't allow changes to arp mappings of local addresses. */ 8295 if (NCE_MYADDR(ncec)) { 8296 nce_refrele(nce); 8297 return (ENOTSUP); 8298 } 8299 iocp->ioc_error = 0; 8300 8301 /* 8302 * Delete the nce_common which has ncec_ill set to ipmp_ill. 8303 * This will delete all the nce entries on the under_ills. 8304 */ 8305 ncec_delete(ncec); 8306 /* 8307 * Once the NCE has been deleted, then the ire_dep* consistency 8308 * mechanism will find any IRE which depended on the now 8309 * condemned NCE (as part of sending packets). 
8310 * That mechanism handles redirects by deleting redirects 8311 * that refer to UNREACHABLE nces. 8312 */ 8313 break; 8314 } 8315 case SIOCGARP: 8316 case SIOCGXARP: 8317 if (ncec != NULL) { 8318 lladdr = ncec->ncec_lladdr; 8319 flags = ncec->ncec_flags; 8320 iocp->ioc_error = 0; 8321 ip_sioctl_garp_reply(mp, ncec->ncec_ill, lladdr, flags); 8322 } else { 8323 iocp->ioc_error = ENXIO; 8324 } 8325 break; 8326 case SIOCSARP: 8327 case SIOCSXARP: 8328 /* Don't allow changes to arp mappings of local addresses. */ 8329 if (ncec != NULL && NCE_MYADDR(ncec)) { 8330 nce_refrele(nce); 8331 return (ENOTSUP); 8332 } 8333 8334 /* static arp entries will undergo NUD if ATF_PERM is not set */ 8335 flags |= NCE_F_STATIC; 8336 if (!if_arp_ioctl) { 8337 ip_nce_lookup_and_update(&ipaddr, NULL, ipst, 8338 lladdr, alength, flags); 8339 } else { 8340 ipif_t *ipif = ipif_get_next_ipif(NULL, ill); 8341 if (ipif != NULL) { 8342 ip_nce_lookup_and_update(&ipaddr, ipif, ipst, 8343 lladdr, alength, flags); 8344 ipif_refrele(ipif); 8345 } 8346 } 8347 if (nce != NULL) { 8348 nce_refrele(nce); 8349 nce = NULL; 8350 } 8351 /* 8352 * NCE_F_STATIC entries will be added in state ND_REACHABLE 8353 * by nce_add_common() 8354 */ 8355 err = nce_lookup_then_add_v4(ill, lladdr, 8356 ill->ill_phys_addr_length, &ipaddr, flags, ND_UNCHANGED, 8357 &nce); 8358 if (err == EEXIST) { 8359 ncec = nce->nce_common; 8360 mutex_enter(&ncec->ncec_lock); 8361 ncec->ncec_state = ND_REACHABLE; 8362 ncec->ncec_flags = flags; 8363 nce_update(ncec, ND_UNCHANGED, lladdr); 8364 mutex_exit(&ncec->ncec_lock); 8365 err = 0; 8366 } 8367 if (nce != NULL) { 8368 nce_refrele(nce); 8369 nce = NULL; 8370 } 8371 if (IS_IPMP(ill) && err == 0) { 8372 entp = ipmp_illgrp_create_arpent(ill->ill_grp, 8373 proxyarp, ipaddr, lladdr, ill->ill_phys_addr_length, 8374 flags); 8375 if (entp == NULL || (proxyarp && proxy_ill == NULL)) { 8376 iocp->ioc_error = (entp == NULL ? ENOMEM : 0); 8377 break; 8378 } 8379 } 8380 iocp->ioc_error = err; 8381 } 8382 8383 if (nce != NULL) { 8384 nce_refrele(nce); 8385 } 8386 8387 /* 8388 * If we created an IPMP ARP entry, mark that we've notified ARP. 8389 */ 8390 if (entp != NULL) 8391 ipmp_illgrp_mark_arpent(ill->ill_grp, entp); 8392 8393 return (iocp->ioc_error); 8394 } 8395 8396 /* 8397 * Parse an [x]arpreq structure coming down SIOC[GSD][X]ARP ioctls, identify 8398 * the associated sin and refhold and return the associated ipif via `ci'. 
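 * (On success, ci->ci_sin points into the copied-in [x]arpreq and
 * ci->ci_ipif holds a reference that the caller must eventually
 * release via ipif_refrele().)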
8399 */ 8400 int 8401 ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, 8402 cmd_info_t *ci) 8403 { 8404 mblk_t *mp1; 8405 sin_t *sin; 8406 conn_t *connp; 8407 ipif_t *ipif; 8408 ire_t *ire = NULL; 8409 ill_t *ill = NULL; 8410 boolean_t exists; 8411 ip_stack_t *ipst; 8412 struct arpreq *ar; 8413 struct xarpreq *xar; 8414 struct sockaddr_dl *sdl; 8415 8416 /* ioctl comes down on a conn */ 8417 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 8418 connp = Q_TO_CONN(q); 8419 if (connp->conn_family == AF_INET6) 8420 return (ENXIO); 8421 8422 ipst = connp->conn_netstack->netstack_ip; 8423 8424 /* Verified in ip_wput_nondata */ 8425 mp1 = mp->b_cont->b_cont; 8426 8427 if (ipip->ipi_cmd_type == XARP_CMD) { 8428 ASSERT(MBLKL(mp1) >= sizeof (struct xarpreq)); 8429 xar = (struct xarpreq *)mp1->b_rptr; 8430 sin = (sin_t *)&xar->xarp_pa; 8431 sdl = &xar->xarp_ha; 8432 8433 if (sdl->sdl_family != AF_LINK || sin->sin_family != AF_INET) 8434 return (ENXIO); 8435 if (sdl->sdl_nlen >= LIFNAMSIZ) 8436 return (EINVAL); 8437 } else { 8438 ASSERT(ipip->ipi_cmd_type == ARP_CMD); 8439 ASSERT(MBLKL(mp1) >= sizeof (struct arpreq)); 8440 ar = (struct arpreq *)mp1->b_rptr; 8441 sin = (sin_t *)&ar->arp_pa; 8442 } 8443 8444 if (ipip->ipi_cmd_type == XARP_CMD && sdl->sdl_nlen != 0) { 8445 ipif = ipif_lookup_on_name(sdl->sdl_data, sdl->sdl_nlen, 8446 B_FALSE, &exists, B_FALSE, ALL_ZONES, ipst); 8447 if (ipif == NULL) 8448 return (ENXIO); 8449 if (ipif->ipif_id != 0) { 8450 ipif_refrele(ipif); 8451 return (ENXIO); 8452 } 8453 } else { 8454 /* 8455 * Either an SIOC[DGS]ARP or an SIOC[DGS]XARP with an sdl_nlen 8456 * of 0: use the IP address to find the ipif. If the IP 8457 * address is an IPMP test address, ire_ftable_lookup() will 8458 * find the wrong ill, so we first do an ipif_lookup_addr(). 8459 */ 8460 ipif = ipif_lookup_addr(sin->sin_addr.s_addr, NULL, ALL_ZONES, 8461 ipst); 8462 if (ipif == NULL) { 8463 ire = ire_ftable_lookup_v4(sin->sin_addr.s_addr, 8464 0, 0, IRE_IF_RESOLVER, NULL, ALL_ZONES, 8465 NULL, MATCH_IRE_TYPE, 0, ipst, NULL); 8466 if (ire == NULL || ((ill = ire->ire_ill) == NULL)) { 8467 if (ire != NULL) 8468 ire_refrele(ire); 8469 return (ENXIO); 8470 } 8471 ASSERT(ire != NULL && ill != NULL); 8472 ipif = ill->ill_ipif; 8473 ipif_refhold(ipif); 8474 ire_refrele(ire); 8475 } 8476 } 8477 8478 if (ipif->ipif_ill->ill_net_type != IRE_IF_RESOLVER) { 8479 ipif_refrele(ipif); 8480 return (ENXIO); 8481 } 8482 8483 ci->ci_sin = sin; 8484 ci->ci_ipif = ipif; 8485 return (0); 8486 } 8487 8488 /* 8489 * Link or unlink the illgrp on IPMP meta-interface `ill' depending on the 8490 * value of `ioccmd'. While an illgrp is linked to an ipmp_grp_t, it is 8491 * accessible from that ipmp_grp_t, which means SIOCSLIFGROUPNAME can look it 8492 * up and thus an ill can join that illgrp. 8493 * 8494 * We use I_PLINK/I_PUNLINK to do the link/unlink operations rather than 8495 * open()/close() primarily because close() is not allowed to fail or block 8496 * forever. On the other hand, I_PUNLINK *can* fail, and there's no reason 8497 * why anyone should ever need to I_PUNLINK an in-use IPMP stream. To ensure 8498 * symmetric behavior (e.g., doing an I_PLINK after an I_PUNLINK undoes the 8499 * I_PUNLINK) we defer linking to I_PLINK. Separately, we also fail attempts 8500 * to I_LINK since I_UNLINK is optional and we'd end up in an inconsistent 8501 * state if I_UNLINK didn't occur.
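 * (In other words, only the persistent I_PLINK/I_PUNLINK pair is
 * supported for IPMP streams; I_LINK is rejected with ENOTSUP below.)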
8502 * 8503 * Note that for each plumb/unplumb operation, we may end up here more than 8504 * once because of the way ifconfig works. However, it's OK to link the same 8505 * illgrp more than once, or unlink an illgrp that's already unlinked. 8506 */ 8507 static int 8508 ip_sioctl_plink_ipmp(ill_t *ill, int ioccmd) 8509 { 8510 int err; 8511 ip_stack_t *ipst = ill->ill_ipst; 8512 8513 ASSERT(IS_IPMP(ill)); 8514 ASSERT(IAM_WRITER_ILL(ill)); 8515 8516 switch (ioccmd) { 8517 case I_LINK: 8518 return (ENOTSUP); 8519 8520 case I_PLINK: 8521 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 8522 ipmp_illgrp_link_grp(ill->ill_grp, ill->ill_phyint->phyint_grp); 8523 rw_exit(&ipst->ips_ipmp_lock); 8524 break; 8525 8526 case I_PUNLINK: 8527 /* 8528 * Require all UP ipifs be brought down prior to unlinking the 8529 * illgrp so any associated IREs (and other state) is torched. 8530 */ 8531 if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0) 8532 return (EBUSY); 8533 8534 /* 8535 * NOTE: We hold ipmp_lock across the unlink to prevent a race 8536 * with an SIOCSLIFGROUPNAME request from an ill trying to 8537 * join this group. Specifically: ills trying to join grab 8538 * ipmp_lock and bump a "pending join" counter checked by 8539 * ipmp_illgrp_unlink_grp(). During the unlink no new pending 8540 * joins can occur (since we have ipmp_lock). Once we drop 8541 * ipmp_lock, subsequent SIOCSLIFGROUPNAME requests will not 8542 * find the illgrp (since we unlinked it) and will return 8543 * EAFNOSUPPORT. This will then take them back through the 8544 * IPMP meta-interface plumbing logic in ifconfig, and thus 8545 * back through I_PLINK above. 8546 */ 8547 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 8548 err = ipmp_illgrp_unlink_grp(ill->ill_grp); 8549 rw_exit(&ipst->ips_ipmp_lock); 8550 return (err); 8551 default: 8552 break; 8553 } 8554 return (0); 8555 } 8556 8557 /* 8558 * Do I_PLINK/I_LINK or I_PUNLINK/I_UNLINK with consistency checks and also 8559 * atomically set/clear the muxids. Also complete the ioctl by acking or 8560 * naking it. Note that the code is structured such that the link type, 8561 * whether it's persistent or not, is treated equally. ifconfig(1M) and 8562 * its clones use the persistent link, while pppd(1M) and perhaps many 8563 * other daemons may use non-persistent link. When combined with some 8564 * ill_t states, linking and unlinking lower streams may be used as 8565 * indicators of dynamic re-plumbing events [see PSARC/1999/348]. 8566 */ 8567 /* ARGSUSED */ 8568 void 8569 ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 8570 { 8571 mblk_t *mp1; 8572 struct linkblk *li; 8573 int ioccmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd; 8574 int err = 0; 8575 8576 ASSERT(ioccmd == I_PLINK || ioccmd == I_PUNLINK || 8577 ioccmd == I_LINK || ioccmd == I_UNLINK); 8578 8579 mp1 = mp->b_cont; /* This is the linkblk info */ 8580 li = (struct linkblk *)mp1->b_rptr; 8581 8582 err = ip_sioctl_plink_ipmod(ipsq, q, mp, ioccmd, li); 8583 if (err == EINPROGRESS) 8584 return; 8585 done: 8586 if (err == 0) 8587 miocack(q, mp, 0, 0); 8588 else 8589 miocnak(q, mp, 0, err); 8590 8591 /* Conn was refheld in ip_sioctl_copyin_setup */ 8592 if (CONN_Q(q)) 8593 CONN_OPER_PENDING_DONE(Q_TO_CONN(q)); 8594 } 8595 8596 /* 8597 * Process I_{P}LINK and I_{P}UNLINK requests named by `ioccmd' and pointed to 8598 * by `mp' and `li' for the IP module stream (if li->l_qbot is in fact an IP 8599 * module stream).
Also do the extended consistency 8600 * checks requested by ifconfig(1M) and (atomically) set ill_muxid here. 8601 * Returns zero on success, EINPROGRESS if the operation is still pending, or 8602 * an error code on failure. 8603 */ 8604 static int 8605 ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, 8606 struct linkblk *li) 8607 { 8608 int err = 0; 8609 ill_t *ill; 8610 queue_t *ipwq, *dwq; 8611 const char *name; 8612 struct qinit *qinfo; 8613 boolean_t islink = (ioccmd == I_PLINK || ioccmd == I_LINK); 8614 boolean_t entered_ipsq = B_FALSE; 8615 boolean_t is_ip = B_FALSE; 8616 arl_t *arl; 8617 8618 /* 8619 * Walk the lower stream to verify it's the IP module stream. 8620 * The IP module is identified by its name, wput function, 8621 * and non-NULL q_next. STREAMS ensures that the lower stream 8622 * (li->l_qbot) will not vanish until this ioctl completes. 8623 */ 8624 for (ipwq = li->l_qbot; ipwq != NULL; ipwq = ipwq->q_next) { 8625 qinfo = ipwq->q_qinfo; 8626 name = qinfo->qi_minfo->mi_idname; 8627 if (name != NULL && strcmp(name, ip_mod_info.mi_idname) == 0 && 8628 qinfo->qi_putp != (pfi_t)ip_lwput && ipwq->q_next != NULL) { 8629 is_ip = B_TRUE; 8630 break; 8631 } 8632 if (name != NULL && strcmp(name, arp_mod_info.mi_idname) == 0 && 8633 qinfo->qi_putp != (pfi_t)ip_lwput && ipwq->q_next != NULL) { 8634 break; 8635 } 8636 } 8637 8638 /* 8639 * If this isn't an IP module stream, bail. 8640 */ 8641 if (ipwq == NULL) 8642 return (0); 8643 8644 if (!is_ip) { 8645 arl = (arl_t *)ipwq->q_ptr; 8646 ill = arl_to_ill(arl); 8647 if (ill == NULL) 8648 return (0); 8649 } else { 8650 ill = ipwq->q_ptr; 8651 } 8652 ASSERT(ill != NULL); 8653 8654 if (ipsq == NULL) { 8655 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink, 8656 NEW_OP, B_FALSE); 8657 if (ipsq == NULL) { 8658 if (!is_ip) 8659 ill_refrele(ill); 8660 return (EINPROGRESS); 8661 } 8662 entered_ipsq = B_TRUE; 8663 } 8664 ASSERT(IAM_WRITER_ILL(ill)); 8665 mutex_enter(&ill->ill_lock); 8666 if (!is_ip) { 8667 if (islink && ill->ill_muxid == 0) { 8668 /* 8669 * Plumbing has to be done with IP plumbed first, arp 8670 * second, but here we have arp being plumbed first. 8671 */ 8672 mutex_exit(&ill->ill_lock); 8673 ipsq_exit(ipsq); 8674 ill_refrele(ill); 8675 return (EINVAL); 8676 } 8677 } 8678 mutex_exit(&ill->ill_lock); 8679 if (!is_ip) { 8680 arl->arl_muxid = islink ? li->l_index : 0; 8681 ill_refrele(ill); 8682 goto done; 8683 } 8684 8685 if (IS_IPMP(ill) && (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0) 8686 goto done; 8687 8688 /* 8689 * As part of I_{P}LINKing, stash the number of downstream modules and 8690 * the read queue of the module immediately below IP in the ill. 8691 * These are used during the capability negotiation below. 8692 */ 8693 ill->ill_lmod_rq = NULL; 8694 ill->ill_lmod_cnt = 0; 8695 if (islink && ((dwq = ipwq->q_next) != NULL)) { 8696 ill->ill_lmod_rq = RD(dwq); 8697 for (; dwq != NULL; dwq = dwq->q_next) 8698 ill->ill_lmod_cnt++; 8699 } 8700 8701 ill->ill_muxid = islink ? li->l_index : 0; 8702 8703 /* 8704 * Mark the ipsq busy until the capability operations initiated below 8705 * complete. The PLINK/UNLINK ioctl itself completes when our caller 8706 * returns, but the capability operation may complete asynchronously 8707 * much later. 8708 */ 8709 ipsq_current_start(ipsq, ill->ill_ipif, ioccmd); 8710 /* 8711 * If there's at least one up ipif on this ill, then we're bound to 8712 * the underlying driver via DLPI.
In that case, renegotiate 8713 * capabilities to account for any possible change in modules 8714 * interposed between IP and the driver. 8715 */ 8716 if (ill->ill_ipif_up_count > 0) { 8717 if (islink) 8718 ill_capability_probe(ill); 8719 else 8720 ill_capability_reset(ill, B_FALSE); 8721 } 8722 ipsq_current_finish(ipsq); 8723 done: 8724 if (entered_ipsq) 8725 ipsq_exit(ipsq); 8726 8727 return (err); 8728 } 8729 8730 /* 8731 * Search the ioctl command in the ioctl tables and return a pointer 8732 * to the ioctl command information. The ioctl command tables are 8733 * static and fully populated at compile time. 8734 */ 8735 ip_ioctl_cmd_t * 8736 ip_sioctl_lookup(int ioc_cmd) 8737 { 8738 int index; 8739 ip_ioctl_cmd_t *ipip; 8740 ip_ioctl_cmd_t *ipip_end; 8741 8742 if (ioc_cmd == IPI_DONTCARE) 8743 return (NULL); 8744 8745 /* 8746 * Do a 2 step search. First search the indexed table 8747 * based on the least significant byte of the ioctl cmd. 8748 * If we don't find a match, then search the misc table 8749 * serially. 8750 */ 8751 index = ioc_cmd & 0xFF; 8752 if (index < ip_ndx_ioctl_count) { 8753 ipip = &ip_ndx_ioctl_table[index]; 8754 if (ipip->ipi_cmd == ioc_cmd) { 8755 /* Found a match in the ndx table */ 8756 return (ipip); 8757 } 8758 } 8759 8760 /* Search the misc table */ 8761 ipip_end = &ip_misc_ioctl_table[ip_misc_ioctl_count]; 8762 for (ipip = ip_misc_ioctl_table; ipip < ipip_end; ipip++) { 8763 if (ipip->ipi_cmd == ioc_cmd) 8764 /* Found a match in the misc table */ 8765 return (ipip); 8766 } 8767 8768 return (NULL); 8769 } 8770 8771 /* 8772 * Wrapper function for resuming deferred ioctl processing 8773 * Used for SIOCGDSTINFO, SIOCGIP6ADDRPOLICY, SIOCGMSFILTER, 8774 * SIOCSMSFILTER, SIOCGIPMSFILTER, and SIOCSIPMSFILTER currently. 8775 */ 8776 /* ARGSUSED */ 8777 void 8778 ip_sioctl_copyin_resume(ipsq_t *dummy_ipsq, queue_t *q, mblk_t *mp, 8779 void *dummy_arg) 8780 { 8781 ip_sioctl_copyin_setup(q, mp); 8782 } 8783 8784 /* 8785 * ip_sioctl_copyin_setup is called by ip_wput_nondata with any M_IOCTL message 8786 * that arrives. Most of the IOCTLs are "socket" IOCTLs which we handle 8787 * in either I_STR or TRANSPARENT form, using the mi_copy facility. 8788 * We establish here the size of the block to be copied in. mi_copyin 8789 * arranges for this to happen, and processing continues in ip_wput_nondata 8790 * with an M_IOCDATA message. 8791 */ 8792 void 8793 ip_sioctl_copyin_setup(queue_t *q, mblk_t *mp) 8794 { 8795 int copyin_size; 8796 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8797 ip_ioctl_cmd_t *ipip; 8798 cred_t *cr; 8799 ip_stack_t *ipst; 8800 8801 if (CONN_Q(q)) 8802 ipst = CONNQ_TO_IPST(q); 8803 else 8804 ipst = ILLQ_TO_IPST(q); 8805 8806 ipip = ip_sioctl_lookup(iocp->ioc_cmd); 8807 if (ipip == NULL) { 8808 /* 8809 * The ioctl is not one we understand or own. 8810 * Pass it along to be processed down stream, 8811 * if this is a module instance of IP, else nak 8812 * the ioctl. 8813 */ 8814 if (q->q_next == NULL) { 8815 goto nak; 8816 } else { 8817 putnext(q, mp); 8818 return; 8819 } 8820 } 8821 8822 /* 8823 * If this is deferred, then we will do all the checks when we 8824 * come back. 8825 */ 8826 if ((iocp->ioc_cmd == SIOCGDSTINFO || 8827 iocp->ioc_cmd == SIOCGIP6ADDRPOLICY) && !ip6_asp_can_lookup(ipst)) { 8828 ip6_asp_pending_op(q, mp, ip_sioctl_copyin_resume); 8829 return; 8830 } 8831 8832 /* 8833 * Only allow a very small subset of IP ioctls on this stream if 8834 * IP is a module and not a driver.
Allowing ioctls to be processed 8835 * in this case may cause assert failures or data corruption. 8836 * Typically G[L]IFFLAGS, SLIFNAME/IF_UNITSEL are the only few 8837 * ioctls allowed on an IP module stream, after which this stream 8838 * normally becomes a multiplexor (at which time the stream head 8839 * will fail all ioctls). 8840 */ 8841 if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) { 8842 goto nak; 8843 } 8844 8845 /* Make sure we have ioctl data to process. */ 8846 if (mp->b_cont == NULL && !(ipip->ipi_flags & IPI_NULL_BCONT)) 8847 goto nak; 8848 8849 /* 8850 * Prefer dblk credential over ioctl credential; some synthesized 8851 * ioctls have kcred set because there's no way to crhold() 8852 * a credential in some contexts. (ioc_cr is not crfree() by 8853 * the framework; the caller of ioctl needs to hold the reference 8854 * for the duration of the call). 8855 */ 8856 cr = msg_getcred(mp, NULL); 8857 if (cr == NULL) 8858 cr = iocp->ioc_cr; 8859 8860 /* Make sure normal users don't send down privileged ioctls */ 8861 if ((ipip->ipi_flags & IPI_PRIV) && 8862 (cr != NULL) && secpolicy_ip_config(cr, B_TRUE) != 0) { 8863 /* We checked the privilege earlier but log it here */ 8864 miocnak(q, mp, 0, secpolicy_ip_config(cr, B_FALSE)); 8865 return; 8866 } 8867 8868 /* 8869 * The ioctl command tables can only encode fixed length 8870 * ioctl data. If the length is variable, the table will 8871 * encode the length as zero. Such special cases are handled 8872 * below in the switch. 8873 */ 8874 if (ipip->ipi_copyin_size != 0) { 8875 mi_copyin(q, mp, NULL, ipip->ipi_copyin_size); 8876 return; 8877 } 8878 8879 switch (iocp->ioc_cmd) { 8880 case O_SIOCGIFCONF: 8881 case SIOCGIFCONF: 8882 /* 8883 * This IOCTL is hilarious. See comments in 8884 * ip_sioctl_get_ifconf for the story. 8885 */ 8886 if (iocp->ioc_count == TRANSPARENT) 8887 copyin_size = SIZEOF_STRUCT(ifconf, 8888 iocp->ioc_flag); 8889 else 8890 copyin_size = iocp->ioc_count; 8891 mi_copyin(q, mp, NULL, copyin_size); 8892 return; 8893 8894 case O_SIOCGLIFCONF: 8895 case SIOCGLIFCONF: 8896 copyin_size = SIZEOF_STRUCT(lifconf, iocp->ioc_flag); 8897 mi_copyin(q, mp, NULL, copyin_size); 8898 return; 8899 8900 case SIOCGLIFSRCOF: 8901 copyin_size = SIZEOF_STRUCT(lifsrcof, iocp->ioc_flag); 8902 mi_copyin(q, mp, NULL, copyin_size); 8903 return; 8904 case SIOCGIP6ADDRPOLICY: 8905 ip_sioctl_ip6addrpolicy(q, mp); 8906 ip6_asp_table_refrele(ipst); 8907 return; 8908 8909 case SIOCSIP6ADDRPOLICY: 8910 ip_sioctl_ip6addrpolicy(q, mp); 8911 return; 8912 8913 case SIOCGDSTINFO: 8914 ip_sioctl_dstinfo(q, mp); 8915 ip6_asp_table_refrele(ipst); 8916 return; 8917 8918 case I_PLINK: 8919 case I_PUNLINK: 8920 case I_LINK: 8921 case I_UNLINK: 8922 /* 8923 * We treat non-persistent link similarly as the persistent 8924 * link case, in terms of plumbing/unplumbing, as well as 8925 * dynamic re-plumbing events indicator. See comments 8926 * in ip_sioctl_plink() for more. 8927 * 8928 * Request can be enqueued in the 'ipsq' while waiting 8929 * to become exclusive. So bump up the conn ref. 8930 */ 8931 if (CONN_Q(q)) 8932 CONN_INC_REF(Q_TO_CONN(q)); 8933 ip_sioctl_plink(NULL, q, mp, NULL); 8934 return; 8935 8936 case ND_GET: 8937 case ND_SET: 8938 /* 8939 * Use of the nd table requires holding the reader lock. 8940 * Modifying the nd table thru nd_load/nd_unload requires 8941 * the writer lock. 
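 * (These service ndd(1M)-style requests such as
 *	ndd -get /dev/ip ip_forwarding
 * by resolving the named parameter through the nd table.)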
8942 */ 8943 rw_enter(&ipst->ips_ip_g_nd_lock, RW_READER); 8944 if (nd_getset(q, ipst->ips_ip_g_nd, mp)) { 8945 rw_exit(&ipst->ips_ip_g_nd_lock); 8946 8947 if (iocp->ioc_error) 8948 iocp->ioc_count = 0; 8949 mp->b_datap->db_type = M_IOCACK; 8950 qreply(q, mp); 8951 return; 8952 } 8953 rw_exit(&ipst->ips_ip_g_nd_lock); 8954 /* 8955 * We don't understand this subioctl of ND_GET / ND_SET. 8956 * Maybe intended for some driver / module below us 8957 */ 8958 if (q->q_next) { 8959 putnext(q, mp); 8960 } else { 8961 iocp->ioc_error = ENOENT; 8962 mp->b_datap->db_type = M_IOCNAK; 8963 iocp->ioc_count = 0; 8964 qreply(q, mp); 8965 } 8966 return; 8967 8968 case IP_IOCTL: 8969 ip_wput_ioctl(q, mp); 8970 return; 8971 8972 case SIOCILB: 8973 /* The ioctl length varies depending on the ILB command. */ 8974 copyin_size = iocp->ioc_count; 8975 if (copyin_size < sizeof (ilb_cmd_t)) 8976 goto nak; 8977 mi_copyin(q, mp, NULL, copyin_size); 8978 return; 8979 8980 default: 8981 cmn_err(CE_PANIC, "should not happen "); 8982 } 8983 nak: 8984 if (mp->b_cont != NULL) { 8985 freemsg(mp->b_cont); 8986 mp->b_cont = NULL; 8987 } 8988 iocp->ioc_error = EINVAL; 8989 mp->b_datap->db_type = M_IOCNAK; 8990 iocp->ioc_count = 0; 8991 qreply(q, mp); 8992 } 8993 8994 static void 8995 ip_sioctl_garp_reply(mblk_t *mp, ill_t *ill, void *hwaddr, int flags) 8996 { 8997 struct arpreq *ar; 8998 struct xarpreq *xar; 8999 mblk_t *tmp; 9000 struct iocblk *iocp; 9001 int x_arp_ioctl = B_FALSE; 9002 int *flagsp; 9003 char *storage = NULL; 9004 9005 ASSERT(ill != NULL); 9006 9007 iocp = (struct iocblk *)mp->b_rptr; 9008 ASSERT(iocp->ioc_cmd == SIOCGXARP || iocp->ioc_cmd == SIOCGARP); 9009 9010 tmp = (mp->b_cont)->b_cont; /* xarpreq/arpreq */ 9011 if ((iocp->ioc_cmd == SIOCGXARP) || 9012 (iocp->ioc_cmd == SIOCSXARP)) { 9013 x_arp_ioctl = B_TRUE; 9014 xar = (struct xarpreq *)tmp->b_rptr; 9015 flagsp = &xar->xarp_flags; 9016 storage = xar->xarp_ha.sdl_data; 9017 } else { 9018 ar = (struct arpreq *)tmp->b_rptr; 9019 flagsp = &ar->arp_flags; 9020 storage = ar->arp_ha.sa_data; 9021 } 9022 9023 /* 9024 * We're done if this is not an SIOCG{X}ARP 9025 */ 9026 if (x_arp_ioctl) { 9027 storage += ill_xarp_info(&xar->xarp_ha, ill); 9028 if ((ill->ill_phys_addr_length + ill->ill_name_length) > 9029 sizeof (xar->xarp_ha.sdl_data)) { 9030 iocp->ioc_error = EINVAL; 9031 return; 9032 } 9033 } 9034 *flagsp = ATF_INUSE; 9035 /* 9036 * If /sbin/arp told us we are the authority using the "permanent" 9037 * flag, or if this is one of my addresses print "permanent" 9038 * in the /sbin/arp output. 9039 */ 9040 if ((flags & NCE_F_MYADDR) || (flags & NCE_F_AUTHORITY)) 9041 *flagsp |= ATF_AUTHORITY; 9042 if (flags & NCE_F_NONUD) 9043 *flagsp |= ATF_PERM; /* not subject to aging */ 9044 if (flags & NCE_F_PUBLISH) 9045 *flagsp |= ATF_PUBL; 9046 if (hwaddr != NULL) { 9047 *flagsp |= ATF_COM; 9048 bcopy((char *)hwaddr, storage, ill->ill_phys_addr_length); 9049 } 9050 } 9051 9052 /* 9053 * Create a new logical interface. If ipif_id is zero (i.e. not a logical 9054 * interface) create the next available logical interface for this 9055 * physical interface. 9056 * If ipif is NULL (i.e. the lookup didn't find one) attempt to create an 9057 * ipif with the specified name. 9058 * 9059 * If the address family is not AF_UNSPEC then set the address as well. 9060 * 9061 * If ip_sioctl_addr returns EINPROGRESS then the ioctl (the copyout) 9062 * is completed when the DL_BIND_ACK arrive in ip_rput_dlpi_writer. 9063 * 9064 * Executed as a writer on the ill. 
9065 * So no lock is needed to traverse the ipif chain, or examine the 9066 * phyint flags. 9067 */ 9068 /* ARGSUSED */ 9069 int 9070 ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9071 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 9072 { 9073 mblk_t *mp1; 9074 struct lifreq *lifr; 9075 boolean_t isv6; 9076 boolean_t exists; 9077 char *name; 9078 char *endp; 9079 char *cp; 9080 int namelen; 9081 ipif_t *ipif; 9082 long id; 9083 ipsq_t *ipsq; 9084 ill_t *ill; 9085 sin_t *sin; 9086 int err = 0; 9087 boolean_t found_sep = B_FALSE; 9088 conn_t *connp; 9089 zoneid_t zoneid; 9090 ip_stack_t *ipst = CONNQ_TO_IPST(q); 9091 9092 ASSERT(q->q_next == NULL); 9093 ip1dbg(("ip_sioctl_addif\n")); 9094 /* Existence of mp1 has been checked in ip_wput_nondata */ 9095 mp1 = mp->b_cont->b_cont; 9096 /* 9097 * Null terminate the string to protect against buffer 9098 * overrun. String was generated by user code and may not 9099 * be trusted. 9100 */ 9101 lifr = (struct lifreq *)mp1->b_rptr; 9102 lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; 9103 name = lifr->lifr_name; 9104 ASSERT(CONN_Q(q)); 9105 connp = Q_TO_CONN(q); 9106 isv6 = (connp->conn_family == AF_INET6); 9107 zoneid = connp->conn_zoneid; 9108 namelen = mi_strlen(name); 9109 if (namelen == 0) 9110 return (EINVAL); 9111 9112 exists = B_FALSE; 9113 if ((namelen + 1 == sizeof (ipif_loopback_name)) && 9114 (mi_strcmp(name, ipif_loopback_name) == 0)) { 9115 /* 9116 * Allow creating lo0 using SIOCLIFADDIF. There 9117 * can't be any other writer thread, so the lookup (and 9118 * possible creation) via ipif_lookup_on_name below is safe. 9119 */ 9120 ipif = ipif_lookup_on_name(lifr->lifr_name, namelen, B_TRUE, 9121 &exists, isv6, zoneid, ipst); 9122 /* Prevent any further action */ 9123 if (ipif == NULL) { 9124 return (ENOBUFS); 9125 } else if (!exists) { 9126 /* We created the ipif now and as writer */ 9127 ipif_refrele(ipif); 9128 return (0); 9129 } else { 9130 ill = ipif->ipif_ill; 9131 ill_refhold(ill); 9132 ipif_refrele(ipif); 9133 } 9134 } else { 9135 /* Look for a colon in the name. */ 9136 endp = &name[namelen]; 9137 for (cp = endp; --cp > name; ) { 9138 if (*cp == IPIF_SEPARATOR_CHAR) { 9139 found_sep = B_TRUE; 9140 /* 9141 * Reject any non-decimal aliases for plumbing 9142 * of logical interfaces. Aliases with leading 9143 * zeroes are also rejected as they introduce 9144 * ambiguity in the naming of the interfaces. 9145 * Comparing with "0" takes care of all such 9146 * cases. 9147 */ 9148 if ((strncmp("0", cp+1, 1)) == 0) 9149 return (EINVAL); 9150 9151 if (ddi_strtol(cp+1, &endp, 10, &id) != 0 || 9152 id <= 0 || *endp != '\0') { 9153 return (EINVAL); 9154 } 9155 *cp = '\0'; 9156 break; 9157 } 9158 } 9159 ill = ill_lookup_on_name(name, B_FALSE, isv6, NULL, ipst); 9160 if (found_sep) 9161 *cp = IPIF_SEPARATOR_CHAR; 9162 if (ill == NULL) 9163 return (ENXIO); 9164 } 9165 9166 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_process_ioctl, NEW_OP, 9167 B_TRUE); 9168 9169 /* 9170 * Release the refhold due to the lookup, now that we are excl 9171 * or we are just returning 9172 */ 9173 ill_refrele(ill); 9174 9175 if (ipsq == NULL) 9176 return (EINPROGRESS); 9177 9178 /* We are now exclusive on the IPSQ */ 9179 ASSERT(IAM_WRITER_ILL(ill)); 9180 9181 if (found_sep) { 9182 /* Now see if there is an IPIF with this unit number. */ 9183 for (ipif = ill->ill_ipif; ipif != NULL; 9184 ipif = ipif->ipif_next) { 9185 if (ipif->ipif_id == id) { 9186 err = EEXIST; 9187 goto done; 9188 } 9189 } 9190 } 9191 9192 /* 9193 * We use IRE_LOCAL for lo0:1 etc.
for "receive only" use 9194 * of lo0. Plumbing for lo0:0 happens in ipif_lookup_on_name() 9195 * instead. 9196 */ 9197 if ((ipif = ipif_allocate(ill, found_sep ? id : -1, IRE_LOCAL, 9198 B_TRUE, B_TRUE, &err)) == NULL) { 9199 goto done; 9200 } 9201 9202 /* Return created name with ioctl */ 9203 (void) sprintf(lifr->lifr_name, "%s%c%d", ill->ill_name, 9204 IPIF_SEPARATOR_CHAR, ipif->ipif_id); 9205 ip1dbg(("created %s\n", lifr->lifr_name)); 9206 9207 /* Set address */ 9208 sin = (sin_t *)&lifr->lifr_addr; 9209 if (sin->sin_family != AF_UNSPEC) { 9210 err = ip_sioctl_addr(ipif, sin, q, mp, 9211 &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], lifr); 9212 } 9213 9214 done: 9215 ipsq_exit(ipsq); 9216 return (err); 9217 } 9218 9219 /* 9220 * Remove an existing logical interface. If ipif_id is zero (i.e. not a logical 9221 * interface) delete it based on the IP address (on this physical interface). 9222 * Otherwise delete it based on the ipif_id. 9223 * Also, special handling to allow a removeif of lo0. 9224 */ 9225 /* ARGSUSED */ 9226 int 9227 ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9228 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 9229 { 9230 conn_t *connp; 9231 ill_t *ill = ipif->ipif_ill; 9232 boolean_t success; 9233 ip_stack_t *ipst; 9234 9235 ipst = CONNQ_TO_IPST(q); 9236 9237 ASSERT(q->q_next == NULL); 9238 ip1dbg(("ip_sioctl_remove_if(%s:%u %p)\n", 9239 ill->ill_name, ipif->ipif_id, (void *)ipif)); 9240 ASSERT(IAM_WRITER_IPIF(ipif)); 9241 9242 connp = Q_TO_CONN(q); 9243 /* 9244 * Special case for unplumbing lo0 (the loopback physical interface). 9245 * If unplumbing lo0, the incoming address structure has been 9246 * initialized to all zeros. When unplumbing lo0, all its logical 9247 * interfaces must be removed too. 9248 * 9249 * Note that this interface may be called to remove a specific 9250 * loopback logical interface (eg, lo0:1). But in that case 9251 * ipif->ipif_id != 0 so that the code path for that case is the 9252 * same as any other interface (meaning it skips the code directly 9253 * below). 9254 */ 9255 if (ipif->ipif_id == 0 && ill->ill_net_type == IRE_LOOPBACK) { 9256 if (sin->sin_family == AF_UNSPEC && 9257 (IN6_IS_ADDR_UNSPECIFIED(&((sin6_t *)sin)->sin6_addr))) { 9258 /* 9259 * Mark it condemned. No new ref. will be made to ill. 
9260 */ 9261 mutex_enter(&ill->ill_lock); 9262 ill->ill_state_flags |= ILL_CONDEMNED; 9263 for (ipif = ill->ill_ipif; ipif != NULL; 9264 ipif = ipif->ipif_next) { 9265 ipif->ipif_state_flags |= IPIF_CONDEMNED; 9266 } 9267 mutex_exit(&ill->ill_lock); 9268 9269 ipif = ill->ill_ipif; 9270 /* unplumb the loopback interface */ 9271 ill_delete(ill); 9272 mutex_enter(&connp->conn_lock); 9273 mutex_enter(&ill->ill_lock); 9274 9275 /* Are any references to this ill active */ 9276 if (ill_is_freeable(ill)) { 9277 mutex_exit(&ill->ill_lock); 9278 mutex_exit(&connp->conn_lock); 9279 ill_delete_tail(ill); 9280 mi_free(ill); 9281 return (0); 9282 } 9283 success = ipsq_pending_mp_add(connp, ipif, 9284 CONNP_TO_WQ(connp), mp, ILL_FREE); 9285 mutex_exit(&connp->conn_lock); 9286 mutex_exit(&ill->ill_lock); 9287 if (success) 9288 return (EINPROGRESS); 9289 else 9290 return (EINTR); 9291 } 9292 } 9293 9294 if (ipif->ipif_id == 0) { 9295 ipsq_t *ipsq; 9296 9297 /* Find based on address */ 9298 if (ipif->ipif_isv6) { 9299 sin6_t *sin6; 9300 9301 if (sin->sin_family != AF_INET6) 9302 return (EAFNOSUPPORT); 9303 9304 sin6 = (sin6_t *)sin; 9305 /* We are a writer, so we should be able to lookup */ 9306 ipif = ipif_lookup_addr_exact_v6(&sin6->sin6_addr, ill, 9307 ipst); 9308 } else { 9309 if (sin->sin_family != AF_INET) 9310 return (EAFNOSUPPORT); 9311 9312 /* We are a writer, so we should be able to lookup */ 9313 ipif = ipif_lookup_addr_exact(sin->sin_addr.s_addr, ill, 9314 ipst); 9315 } 9316 if (ipif == NULL) { 9317 return (EADDRNOTAVAIL); 9318 } 9319 9320 /* 9321 * It is possible for a user to send an SIOCLIFREMOVEIF with 9322 * lifr_name of the physical interface but with an ip address 9323 * lifr_addr of a logical interface plumbed over it. 9324 * So update ipx_current_ipif now that ipif points to the 9325 * correct one. 9326 */ 9327 ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq; 9328 ipsq->ipsq_xop->ipx_current_ipif = ipif; 9329 9330 /* This is a writer */ 9331 ipif_refrele(ipif); 9332 } 9333 9334 /* 9335 * Can not delete instance zero since it is tied to the ill. 9336 */ 9337 if (ipif->ipif_id == 0) 9338 return (EBUSY); 9339 9340 mutex_enter(&ill->ill_lock); 9341 ipif->ipif_state_flags |= IPIF_CONDEMNED; 9342 mutex_exit(&ill->ill_lock); 9343 9344 ipif_free(ipif); 9345 9346 mutex_enter(&connp->conn_lock); 9347 mutex_enter(&ill->ill_lock); 9348 9349 /* Are any references to this ipif active */ 9350 if (ipif_is_freeable(ipif)) { 9351 mutex_exit(&ill->ill_lock); 9352 mutex_exit(&connp->conn_lock); 9353 ipif_non_duplicate(ipif); 9354 (void) ipif_down_tail(ipif); 9355 ipif_free_tail(ipif); /* frees ipif */ 9356 return (0); 9357 } 9358 success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), mp, 9359 IPIF_FREE); 9360 mutex_exit(&ill->ill_lock); 9361 mutex_exit(&connp->conn_lock); 9362 if (success) 9363 return (EINPROGRESS); 9364 else 9365 return (EINTR); 9366 } 9367 9368 /* 9369 * Restart the removeif ioctl. The refcnt has gone down to 0. 9370 * The ipif is already condemned. So can't find it thru lookups. 
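 * (The ipif was recorded along with the pending mp when the original
 * ioctl returned EINPROGRESS, so it is handed to us directly here.)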
9371 */ 9372 /* ARGSUSED */ 9373 int 9374 ip_sioctl_removeif_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, 9375 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_if_req) 9376 { 9377 ill_t *ill = ipif->ipif_ill; 9378 9379 ASSERT(IAM_WRITER_IPIF(ipif)); 9380 ASSERT(ipif->ipif_state_flags & IPIF_CONDEMNED); 9381 9382 ip1dbg(("ip_sioctl_removeif_restart(%s:%u %p)\n", 9383 ill->ill_name, ipif->ipif_id, (void *)ipif)); 9384 9385 if (ipif->ipif_id == 0 && ill->ill_net_type == IRE_LOOPBACK) { 9386 ASSERT(ill->ill_state_flags & ILL_CONDEMNED); 9387 ill_delete_tail(ill); 9388 mi_free(ill); 9389 return (0); 9390 } 9391 9392 ipif_non_duplicate(ipif); 9393 (void) ipif_down_tail(ipif); 9394 ipif_free_tail(ipif); 9395 9396 return (0); 9397 } 9398 9399 /* 9400 * Set the local interface address. 9401 * Allow an address of all zero when the interface is down. 9402 */ 9403 /* ARGSUSED */ 9404 int 9405 ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9406 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 9407 { 9408 int err = 0; 9409 in6_addr_t v6addr; 9410 boolean_t need_up = B_FALSE; 9411 9412 ip1dbg(("ip_sioctl_addr(%s:%u %p)\n", 9413 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9414 9415 ASSERT(IAM_WRITER_IPIF(ipif)); 9416 9417 if (ipif->ipif_isv6) { 9418 sin6_t *sin6; 9419 ill_t *ill; 9420 phyint_t *phyi; 9421 9422 if (sin->sin_family != AF_INET6) 9423 return (EAFNOSUPPORT); 9424 9425 sin6 = (sin6_t *)sin; 9426 v6addr = sin6->sin6_addr; 9427 ill = ipif->ipif_ill; 9428 phyi = ill->ill_phyint; 9429 9430 /* 9431 * Enforce that true multicast interfaces have a link-local 9432 * address for logical unit 0. 9433 */ 9434 if (ipif->ipif_id == 0 && 9435 (ill->ill_flags & ILLF_MULTICAST) && 9436 !(ipif->ipif_flags & (IPIF_POINTOPOINT)) && 9437 !(phyi->phyint_flags & (PHYI_LOOPBACK)) && 9438 !IN6_IS_ADDR_LINKLOCAL(&v6addr)) { 9439 return (EADDRNOTAVAIL); 9440 } 9441 9442 /* 9443 * up interfaces shouldn't have the unspecified address 9444 * unless they also have the IPIF_NOLOCAL flags set and 9445 * have a subnet assigned. 9446 */ 9447 if ((ipif->ipif_flags & IPIF_UP) && 9448 IN6_IS_ADDR_UNSPECIFIED(&v6addr) && 9449 (!(ipif->ipif_flags & IPIF_NOLOCAL) || 9450 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) { 9451 return (EADDRNOTAVAIL); 9452 } 9453 9454 if (!ip_local_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask)) 9455 return (EADDRNOTAVAIL); 9456 } else { 9457 ipaddr_t addr; 9458 9459 if (sin->sin_family != AF_INET) 9460 return (EAFNOSUPPORT); 9461 9462 addr = sin->sin_addr.s_addr; 9463 9464 /* Allow 0 as the local address. */ 9465 if (addr != 0 && !ip_addr_ok_v4(addr, ipif->ipif_net_mask)) 9466 return (EADDRNOTAVAIL); 9467 9468 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 9469 } 9470 9471 /* 9472 * Even if there is no change we redo things just to rerun 9473 * ipif_set_default. 9474 */ 9475 if (ipif->ipif_flags & IPIF_UP) { 9476 /* 9477 * Setting a new local address, make sure 9478 * we have net and subnet bcast ire's for 9479 * the old address if we need them. 9480 */ 9481 /* 9482 * If the interface is already marked up, 9483 * we call ipif_down which will take care 9484 * of ditching any IREs that have been set 9485 * up based on the old interface address. 
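 * (ipif_logical_down() may itself return EINPROGRESS, in which case
 * this ioctl resumes in ip_sioctl_addr_restart() once the remaining
 * references are gone.)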
9486 */ 9487 err = ipif_logical_down(ipif, q, mp); 9488 if (err == EINPROGRESS) 9489 return (err); 9490 (void) ipif_down_tail(ipif); 9491 need_up = 1; 9492 } 9493 9494 err = ip_sioctl_addr_tail(ipif, sin, q, mp, need_up); 9495 return (err); 9496 } 9497 9498 int 9499 ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9500 boolean_t need_up) 9501 { 9502 in6_addr_t v6addr; 9503 in6_addr_t ov6addr; 9504 ipaddr_t addr; 9505 sin6_t *sin6; 9506 int sinlen; 9507 int err = 0; 9508 ill_t *ill = ipif->ipif_ill; 9509 boolean_t need_dl_down; 9510 boolean_t need_arp_down; 9511 struct iocblk *iocp; 9512 9513 iocp = (mp != NULL) ? (struct iocblk *)mp->b_rptr : NULL; 9514 9515 ip1dbg(("ip_sioctl_addr_tail(%s:%u %p)\n", 9516 ill->ill_name, ipif->ipif_id, (void *)ipif)); 9517 ASSERT(IAM_WRITER_IPIF(ipif)); 9518 9519 /* Must cancel any pending timer before taking the ill_lock */ 9520 if (ipif->ipif_recovery_id != 0) 9521 (void) untimeout(ipif->ipif_recovery_id); 9522 ipif->ipif_recovery_id = 0; 9523 9524 if (ipif->ipif_isv6) { 9525 sin6 = (sin6_t *)sin; 9526 v6addr = sin6->sin6_addr; 9527 sinlen = sizeof (struct sockaddr_in6); 9528 } else { 9529 addr = sin->sin_addr.s_addr; 9530 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 9531 sinlen = sizeof (struct sockaddr_in); 9532 } 9533 mutex_enter(&ill->ill_lock); 9534 ov6addr = ipif->ipif_v6lcl_addr; 9535 ipif->ipif_v6lcl_addr = v6addr; 9536 sctp_update_ipif_addr(ipif, ov6addr); 9537 ipif->ipif_addr_ready = 0; 9538 9539 ip_rts_newaddrmsg(RTM_CHGADDR, 0, ipif, RTSQ_DEFAULT); 9540 9541 /* 9542 * If the interface was previously marked as a duplicate, then since 9543 * we've now got a "new" address, it should no longer be considered a 9544 * duplicate -- even if the "new" address is the same as the old one. 9545 * Note that if all ipifs are down, we may have a pending ARP down 9546 * event to handle. This is because we want to recover from duplicates 9547 * and thus delay tearing down ARP until the duplicates have been 9548 * removed or disabled. 9549 */ 9550 need_dl_down = need_arp_down = B_FALSE; 9551 if (ipif->ipif_flags & IPIF_DUPLICATE) { 9552 need_arp_down = !need_up; 9553 ipif->ipif_flags &= ~IPIF_DUPLICATE; 9554 if (--ill->ill_ipif_dup_count == 0 && !need_up && 9555 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) { 9556 need_dl_down = B_TRUE; 9557 } 9558 } 9559 9560 ipif_set_default(ipif); 9561 9562 /* 9563 * If we've just manually set the IPv6 link-local address (0th ipif), 9564 * tag the ill so that future updates to the interface ID don't result 9565 * in this address getting automatically reconfigured from under the 9566 * administrator. 9567 */ 9568 if (ipif->ipif_isv6 && ipif->ipif_id == 0) 9569 ill->ill_manual_linklocal = 1; 9570 9571 /* 9572 * When publishing an interface address change event, we only notify 9573 * the event listeners of the new address. It is assumed that if they 9574 * actively care about the addresses assigned that they will have 9575 * already discovered the previous address assigned (if there was one.) 9576 * 9577 * Don't attach nic event message for SIOCLIFADDIF ioctl. 9578 */ 9579 if (iocp != NULL && iocp->ioc_cmd != SIOCLIFADDIF) { 9580 ill_nic_event_dispatch(ill, MAP_IPIF_ID(ipif->ipif_id), 9581 NE_ADDRESS_CHANGE, sin, sinlen); 9582 } 9583 9584 mutex_exit(&ill->ill_lock); 9585 9586 if (need_up) { 9587 /* 9588 * Now bring the interface back up. 
If this 9589 * is the only IPIF for the ILL, ipif_up 9590 * will have to re-bind to the device, so 9591 * we may get back EINPROGRESS, in which 9592 * case, this IOCTL will get completed in 9593 * ip_rput_dlpi when we see the DL_BIND_ACK. 9594 */ 9595 err = ipif_up(ipif, q, mp); 9596 } else { 9597 /* Perhaps ilgs should use this ill */ 9598 update_conn_ill(NULL, ill->ill_ipst); 9599 } 9600 9601 if (need_dl_down) 9602 ill_dl_down(ill); 9603 9604 if (need_arp_down && !ill->ill_isv6) 9605 (void) ipif_arp_down(ipif); 9606 9607 /* 9608 * The default multicast interface might have changed (for 9609 * instance if the IPv6 scope of the address changed) 9610 */ 9611 ire_increment_multicast_generation(ill->ill_ipst, ill->ill_isv6); 9612 9613 return (err); 9614 } 9615 9616 /* 9617 * Restart entry point to restart the address set operation after the 9618 * refcounts have dropped to zero. 9619 */ 9620 /* ARGSUSED */ 9621 int 9622 ip_sioctl_addr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9623 ip_ioctl_cmd_t *ipip, void *ifreq) 9624 { 9625 ip1dbg(("ip_sioctl_addr_restart(%s:%u %p)\n", 9626 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9627 ASSERT(IAM_WRITER_IPIF(ipif)); 9628 (void) ipif_down_tail(ipif); 9629 return (ip_sioctl_addr_tail(ipif, sin, q, mp, B_TRUE)); 9630 } 9631 9632 /* ARGSUSED */ 9633 int 9634 ip_sioctl_get_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9635 ip_ioctl_cmd_t *ipip, void *if_req) 9636 { 9637 sin6_t *sin6 = (struct sockaddr_in6 *)sin; 9638 struct lifreq *lifr = (struct lifreq *)if_req; 9639 9640 ip1dbg(("ip_sioctl_get_addr(%s:%u %p)\n", 9641 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9642 /* 9643 * The net mask and address can't change since we have a 9644 * reference to the ipif. So no lock is necessary. 9645 */ 9646 if (ipif->ipif_isv6) { 9647 *sin6 = sin6_null; 9648 sin6->sin6_family = AF_INET6; 9649 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 9650 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 9651 lifr->lifr_addrlen = 9652 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 9653 } else { 9654 *sin = sin_null; 9655 sin->sin_family = AF_INET; 9656 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 9657 if (ipip->ipi_cmd_type == LIF_CMD) { 9658 lifr->lifr_addrlen = 9659 ip_mask_to_plen(ipif->ipif_net_mask); 9660 } 9661 } 9662 return (0); 9663 } 9664 9665 /* 9666 * Set the destination address for a pt-pt interface. 
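 *
 * For reference, a minimal userland sketch of reaching this entry point
 * (illustrative only -- the interface name, the AF_INET socket "s" and
 * the chosen address are assumptions, not anything defined in this file):
 *
 *	struct lifreq lifr;
 *	struct sockaddr_in *sin;
 *
 *	(void) memset(&lifr, 0, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "ip.tun0", sizeof (lifr.lifr_name));
 *	sin = (struct sockaddr_in *)&lifr.lifr_dstaddr;
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("10.0.0.2");
 *	if (ioctl(s, SIOCSLIFDSTADDR, &lifr) < 0)
 *		perror("SIOCSLIFDSTADDR");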
9667 */ 9668 /* ARGSUSED */ 9669 int 9670 ip_sioctl_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9671 ip_ioctl_cmd_t *ipip, void *if_req) 9672 { 9673 int err = 0; 9674 in6_addr_t v6addr; 9675 boolean_t need_up = B_FALSE; 9676 9677 ip1dbg(("ip_sioctl_dstaddr(%s:%u %p)\n", 9678 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9679 ASSERT(IAM_WRITER_IPIF(ipif)); 9680 9681 if (ipif->ipif_isv6) { 9682 sin6_t *sin6; 9683 9684 if (sin->sin_family != AF_INET6) 9685 return (EAFNOSUPPORT); 9686 9687 sin6 = (sin6_t *)sin; 9688 v6addr = sin6->sin6_addr; 9689 9690 if (!ip_remote_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask)) 9691 return (EADDRNOTAVAIL); 9692 } else { 9693 ipaddr_t addr; 9694 9695 if (sin->sin_family != AF_INET) 9696 return (EAFNOSUPPORT); 9697 9698 addr = sin->sin_addr.s_addr; 9699 if (!ip_addr_ok_v4(addr, ipif->ipif_net_mask)) 9700 return (EADDRNOTAVAIL); 9701 9702 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 9703 } 9704 9705 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr, &v6addr)) 9706 return (0); /* No change */ 9707 9708 if (ipif->ipif_flags & IPIF_UP) { 9709 /* 9710 * If the interface is already marked up, 9711 * we call ipif_down which will take care 9712 * of ditching any IREs that have been set 9713 * up based on the old pp dst address. 9714 */ 9715 err = ipif_logical_down(ipif, q, mp); 9716 if (err == EINPROGRESS) 9717 return (err); 9718 (void) ipif_down_tail(ipif); 9719 need_up = B_TRUE; 9720 } 9721 /* 9722 * could return EINPROGRESS. If so ioctl will complete in 9723 * ip_rput_dlpi_writer 9724 */ 9725 err = ip_sioctl_dstaddr_tail(ipif, sin, q, mp, need_up); 9726 return (err); 9727 } 9728 9729 static int 9730 ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9731 boolean_t need_up) 9732 { 9733 in6_addr_t v6addr; 9734 ill_t *ill = ipif->ipif_ill; 9735 int err = 0; 9736 boolean_t need_dl_down; 9737 boolean_t need_arp_down; 9738 9739 ip1dbg(("ip_sioctl_dstaddr_tail(%s:%u %p)\n", ill->ill_name, 9740 ipif->ipif_id, (void *)ipif)); 9741 9742 /* Must cancel any pending timer before taking the ill_lock */ 9743 if (ipif->ipif_recovery_id != 0) 9744 (void) untimeout(ipif->ipif_recovery_id); 9745 ipif->ipif_recovery_id = 0; 9746 9747 if (ipif->ipif_isv6) { 9748 sin6_t *sin6; 9749 9750 sin6 = (sin6_t *)sin; 9751 v6addr = sin6->sin6_addr; 9752 } else { 9753 ipaddr_t addr; 9754 9755 addr = sin->sin_addr.s_addr; 9756 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 9757 } 9758 mutex_enter(&ill->ill_lock); 9759 /* Set point to point destination address. */ 9760 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 9761 /* 9762 * Allow this as a means of creating logical 9763 * pt-pt interfaces on top of e.g. an Ethernet. 9764 * XXX Undocumented HACK for testing. 9765 * pt-pt interfaces are created with NUD disabled. 9766 */ 9767 ipif->ipif_flags |= IPIF_POINTOPOINT; 9768 ipif->ipif_flags &= ~IPIF_BROADCAST; 9769 if (ipif->ipif_isv6) 9770 ill->ill_flags |= ILLF_NONUD; 9771 } 9772 9773 /* 9774 * If the interface was previously marked as a duplicate, then since 9775 * we've now got a "new" address, it should no longer be considered a 9776 * duplicate -- even if the "new" address is the same as the old one. 9777 * Note that if all ipifs are down, we may have a pending ARP down 9778 * event to handle. 
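 * This mirrors the duplicate handling in ip_sioctl_addr_tail() above;
 * see the longer comment there for why the ARP teardown is deferred
 * until the duplicates have been removed or disabled.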
9779 */ 9780 need_dl_down = need_arp_down = B_FALSE; 9781 if (ipif->ipif_flags & IPIF_DUPLICATE) { 9782 need_arp_down = !need_up; 9783 ipif->ipif_flags &= ~IPIF_DUPLICATE; 9784 if (--ill->ill_ipif_dup_count == 0 && !need_up && 9785 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) { 9786 need_dl_down = B_TRUE; 9787 } 9788 } 9789 9790 /* 9791 * If we've just manually set the IPv6 destination link-local address 9792 * (0th ipif), tag the ill so that future updates to the destination 9793 * interface ID (as can happen with interfaces over IP tunnels) don't 9794 * result in this address getting automatically reconfigured from 9795 * under the administrator. 9796 */ 9797 if (ipif->ipif_isv6 && ipif->ipif_id == 0) 9798 ill->ill_manual_dst_linklocal = 1; 9799 9800 /* Set the new address. */ 9801 ipif->ipif_v6pp_dst_addr = v6addr; 9802 /* Make sure subnet tracks pp_dst */ 9803 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 9804 mutex_exit(&ill->ill_lock); 9805 9806 if (need_up) { 9807 /* 9808 * Now bring the interface back up. If this 9809 * is the only IPIF for the ILL, ipif_up 9810 * will have to re-bind to the device, so 9811 * we may get back EINPROGRESS, in which 9812 * case, this IOCTL will get completed in 9813 * ip_rput_dlpi when we see the DL_BIND_ACK. 9814 */ 9815 err = ipif_up(ipif, q, mp); 9816 } 9817 9818 if (need_dl_down) 9819 ill_dl_down(ill); 9820 if (need_arp_down && !ipif->ipif_isv6) 9821 (void) ipif_arp_down(ipif); 9822 9823 return (err); 9824 } 9825 9826 /* 9827 * Restart entry point to restart the destination address set operation 9828 * after the refcounts have dropped to zero. 9829 */ 9830 /* ARGSUSED */ 9831 int 9832 ip_sioctl_dstaddr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9833 ip_ioctl_cmd_t *ipip, void *ifreq) 9834 { 9835 ip1dbg(("ip_sioctl_dstaddr_restart(%s:%u %p)\n", 9836 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9837 (void) ipif_down_tail(ipif); 9838 return (ip_sioctl_dstaddr_tail(ipif, sin, q, mp, B_TRUE)); 9839 } 9840 9841 /* ARGSUSED */ 9842 int 9843 ip_sioctl_get_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9844 ip_ioctl_cmd_t *ipip, void *if_req) 9845 { 9846 sin6_t *sin6 = (struct sockaddr_in6 *)sin; 9847 9848 ip1dbg(("ip_sioctl_get_dstaddr(%s:%u %p)\n", 9849 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9850 /* 9851 * Get point to point destination address. The addresses can't 9852 * change since we hold a reference to the ipif. 9853 */ 9854 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) 9855 return (EADDRNOTAVAIL); 9856 9857 if (ipif->ipif_isv6) { 9858 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 9859 *sin6 = sin6_null; 9860 sin6->sin6_family = AF_INET6; 9861 sin6->sin6_addr = ipif->ipif_v6pp_dst_addr; 9862 } else { 9863 *sin = sin_null; 9864 sin->sin_family = AF_INET; 9865 sin->sin_addr.s_addr = ipif->ipif_pp_dst_addr; 9866 } 9867 return (0); 9868 } 9869 9870 /* 9871 * Check which flags will change when the given flags are set, and 9872 * silently ignore any flags which userland is not allowed to control. 9873 * (Because these flags may change between SIOCGLIFFLAGS and 9874 * SIOCSLIFFLAGS, and that's outside of userland's control, 9875 * we need to silently ignore them rather than fail.)
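 *
 * A worked example of the partition done below (the flag combination is
 * illustrative): with intf_flags = IFF_UP | IFF_RUNNING and
 * flags = IFF_RUNNING | IFF_PRIVATE, the XOR yields IFF_UP | IFF_PRIVATE.
 * IFF_UP is currently set and therefore moves to turn_off, while
 * IFF_PRIVATE is currently clear and remains in turn_on.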
9876 */ 9877 static void 9878 ip_sioctl_flags_onoff(ipif_t *ipif, uint64_t flags, uint64_t *onp, 9879 uint64_t *offp) 9880 { 9881 ill_t *ill = ipif->ipif_ill; 9882 phyint_t *phyi = ill->ill_phyint; 9883 uint64_t cantchange_flags, intf_flags; 9884 uint64_t turn_on, turn_off; 9885 9886 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags; 9887 cantchange_flags = IFF_CANTCHANGE; 9888 if (IS_IPMP(ill)) 9889 cantchange_flags |= IFF_IPMP_CANTCHANGE; 9890 turn_on = (flags ^ intf_flags) & ~cantchange_flags; 9891 turn_off = intf_flags & turn_on; 9892 turn_on ^= turn_off; 9893 *onp = turn_on; 9894 *offp = turn_off; 9895 } 9896 9897 /* 9898 * Set interface flags. Many flags require special handling (e.g., 9899 * bringing the interface down); see below for details. 9900 * 9901 * NOTE : We really don't enforce that ipif_id zero should be used 9902 * for setting any flags other than IFF_LOGINT_FLAGS. This 9903 * is because applications generally do SIOCGLIFFLAGS, OR in 9904 * the new flags (including those that only affect the logical 9905 * interface) and then do a SIOCSLIFFLAGS. Thus, "flags" below 9906 * could contain bits other than IFF_LOGINT_FLAGS. One could 9907 * check whether "turn_on" -- the flags that will be turned on -- 9908 * is valid with respect to ipif_id 0, but for backward compatibility reasons this is not done. 9909 */ 9910 /* ARGSUSED */ 9911 int 9912 ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9913 ip_ioctl_cmd_t *ipip, void *if_req) 9914 { 9915 uint64_t turn_on; 9916 uint64_t turn_off; 9917 int err = 0; 9918 phyint_t *phyi; 9919 ill_t *ill; 9920 conn_t *connp; 9921 uint64_t intf_flags; 9922 boolean_t phyint_flags_modified = B_FALSE; 9923 uint64_t flags; 9924 struct ifreq *ifr; 9925 struct lifreq *lifr; 9926 boolean_t set_linklocal = B_FALSE; 9927 9928 ip1dbg(("ip_sioctl_flags(%s:%u %p)\n", 9929 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9930 9931 ASSERT(IAM_WRITER_IPIF(ipif)); 9932 9933 ill = ipif->ipif_ill; 9934 phyi = ill->ill_phyint; 9935 9936 if (ipip->ipi_cmd_type == IF_CMD) { 9937 ifr = (struct ifreq *)if_req; 9938 flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff); 9939 } else { 9940 lifr = (struct lifreq *)if_req; 9941 flags = lifr->lifr_flags; 9942 } 9943 9944 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags; 9945 9946 /* 9947 * Have the flags been set correctly until now? 9948 */ 9949 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0); 9950 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0); 9951 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0); 9952 /* 9953 * Compare the new flags to the old, and partition 9954 * into those coming on and those going off. 9955 * For the 16-bit command, keep the bits above the low 16 unchanged. 9956 */ 9957 if (ipip->ipi_cmd == SIOCSIFFLAGS) 9958 flags |= intf_flags & ~0xFFFF; 9959 9960 /* 9961 * Explicitly fail attempts to change flags that are always invalid on 9962 * an IPMP meta-interface. 9963 */ 9964 if (IS_IPMP(ill) && ((flags ^ intf_flags) & IFF_IPMP_INVALID)) 9965 return (EINVAL); 9966 9967 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off); 9968 if ((turn_on|turn_off) == 0) 9969 return (0); /* No change */ 9970 9971 /* 9972 * All test addresses must be IFF_DEPRECATED (to ensure source address 9973 * selection avoids them) -- so force IFF_DEPRECATED on, and do not 9974 * allow it to be turned off.
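 * (Hence the first check below: clearing IFF_DEPRECATED without also
 * clearing IFF_NOFAILOVER fails with EINVAL when IFF_NOFAILOVER is
 * set or is being set.)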
9975 */ 9976 if ((turn_off & (IFF_DEPRECATED|IFF_NOFAILOVER)) == IFF_DEPRECATED && 9977 (turn_on|intf_flags) & IFF_NOFAILOVER) 9978 return (EINVAL); 9979 9980 if ((connp = Q_TO_CONN(q)) == NULL) 9981 return (EINVAL); 9982 9983 /* 9984 * Only vrrp control socket is allowed to change IFF_UP and 9985 * IFF_NOACCEPT flags when IFF_VRRP is set. 9986 */ 9987 if ((intf_flags & IFF_VRRP) && ((turn_off | turn_on) & IFF_UP)) { 9988 if (!connp->conn_isvrrp) 9989 return (EINVAL); 9990 } 9991 9992 /* 9993 * The IFF_NOACCEPT flag can only be set on an IFF_VRRP IP address by 9994 * VRRP control socket. 9995 */ 9996 if ((turn_off | turn_on) & IFF_NOACCEPT) { 9997 if (!connp->conn_isvrrp || !(intf_flags & IFF_VRRP)) 9998 return (EINVAL); 9999 } 10000 10001 if (turn_on & IFF_NOFAILOVER) { 10002 turn_on |= IFF_DEPRECATED; 10003 flags |= IFF_DEPRECATED; 10004 } 10005 10006 /* 10007 * On underlying interfaces, only allow applications to manage test 10008 * addresses -- otherwise, they may get confused when the address 10009 * moves as part of being brought up. Likewise, prevent an 10010 * application-managed test address from being converted to a data 10011 * address. To prevent migration of administratively up addresses in 10012 * the kernel, we don't allow them to be converted either. 10013 */ 10014 if (IS_UNDER_IPMP(ill)) { 10015 const uint64_t appflags = IFF_DHCPRUNNING | IFF_ADDRCONF; 10016 10017 if ((turn_on & appflags) && !(flags & IFF_NOFAILOVER)) 10018 return (EINVAL); 10019 10020 if ((turn_off & IFF_NOFAILOVER) && 10021 (flags & (appflags | IFF_UP | IFF_DUPLICATE))) 10022 return (EINVAL); 10023 } 10024 10025 /* 10026 * Only allow IFF_TEMPORARY flag to be set on 10027 * IPv6 interfaces. 10028 */ 10029 if ((turn_on & IFF_TEMPORARY) && !(ipif->ipif_isv6)) 10030 return (EINVAL); 10031 10032 /* 10033 * cannot turn off IFF_NOXMIT on VNI interfaces. 10034 */ 10035 if ((turn_off & IFF_NOXMIT) && IS_VNI(ipif->ipif_ill)) 10036 return (EINVAL); 10037 10038 /* 10039 * Don't allow the IFF_ROUTER flag to be turned on on loopback 10040 * interfaces. It makes no sense in that context. 10041 */ 10042 if ((turn_on & IFF_ROUTER) && (phyi->phyint_flags & PHYI_LOOPBACK)) 10043 return (EINVAL); 10044 10045 /* 10046 * For IPv6 ipif_id 0, don't allow the interface to be up without 10047 * a link local address if IFF_NOLOCAL or IFF_ANYCAST are not set. 10048 * If the link local address isn't set, and can be set, it will get 10049 * set later on in this function. 10050 */ 10051 if (ipif->ipif_id == 0 && ipif->ipif_isv6 && 10052 (flags & IFF_UP) && !(flags & (IFF_NOLOCAL|IFF_ANYCAST)) && 10053 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) { 10054 if (ipif_cant_setlinklocal(ipif)) 10055 return (EINVAL); 10056 set_linklocal = B_TRUE; 10057 } 10058 10059 /* 10060 * If we modify physical interface flags, we'll potentially need to 10061 * send up two routing socket messages for the changes (one for the 10062 * IPv4 ill, and another for the IPv6 ill). Note that here. 10063 */ 10064 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS) 10065 phyint_flags_modified = B_TRUE; 10066 10067 /* 10068 * All functioning PHYI_STANDBY interfaces start life PHYI_INACTIVE 10069 * (otherwise, we'd immediately use them, defeating standby). Also, 10070 * since PHYI_INACTIVE has a separate meaning when PHYI_STANDBY is not 10071 * set, don't allow PHYI_STANDBY to be set if PHYI_INACTIVE is already 10072 * set, and clear PHYI_INACTIVE if PHYI_STANDBY is being cleared. 
We 10073 * also don't allow PHYI_STANDBY if VNI is enabled since its semantics 10074 * will not be honored. 10075 */ 10076 if (turn_on & PHYI_STANDBY) { 10077 /* 10078 * No need to grab ill_g_usesrc_lock here; see the 10079 * synchronization notes in ip.c. 10080 */ 10081 if (ill->ill_usesrc_grp_next != NULL || 10082 intf_flags & PHYI_INACTIVE) 10083 return (EINVAL); 10084 if (!(flags & PHYI_FAILED)) { 10085 flags |= PHYI_INACTIVE; 10086 turn_on |= PHYI_INACTIVE; 10087 } 10088 } 10089 10090 if (turn_off & PHYI_STANDBY) { 10091 flags &= ~PHYI_INACTIVE; 10092 turn_off |= PHYI_INACTIVE; 10093 } 10094 10095 /* 10096 * PHYI_FAILED and PHYI_INACTIVE are mutually exclusive; fail if both 10097 * would end up on. 10098 */ 10099 if ((flags & (PHYI_FAILED | PHYI_INACTIVE)) == 10100 (PHYI_FAILED | PHYI_INACTIVE)) 10101 return (EINVAL); 10102 10103 /* 10104 * If ILLF_ROUTER changes, we need to change the ip forwarding 10105 * status of the interface. 10106 */ 10107 if ((turn_on | turn_off) & ILLF_ROUTER) 10108 (void) ill_forward_set(ill, ((turn_on & ILLF_ROUTER) != 0)); 10109 10110 /* 10111 * If the interface is not UP and we are not going to 10112 * bring it UP, record the flags and return. When the 10113 * interface comes UP later, the right actions will be 10114 * taken. 10115 */ 10116 if (!(ipif->ipif_flags & IPIF_UP) && 10117 !(turn_on & IPIF_UP)) { 10118 /* Record new flags in their respective places. */ 10119 mutex_enter(&ill->ill_lock); 10120 mutex_enter(&ill->ill_phyint->phyint_lock); 10121 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS); 10122 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS); 10123 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS); 10124 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS); 10125 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS); 10126 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS); 10127 mutex_exit(&ill->ill_lock); 10128 mutex_exit(&ill->ill_phyint->phyint_lock); 10129 10130 /* 10131 * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the 10132 * same to the kernel: if any of them has been set by 10133 * userland, the interface cannot be used for data traffic. 10134 */ 10135 if ((turn_on|turn_off) & 10136 (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) { 10137 ASSERT(!IS_IPMP(ill)); 10138 /* 10139 * It's possible the ill is part of an "anonymous" 10140 * IPMP group rather than a real group. In that case, 10141 * there are no other interfaces in the group and thus 10142 * no need to call ipmp_phyint_refresh_active(). 10143 */ 10144 if (IS_UNDER_IPMP(ill)) 10145 ipmp_phyint_refresh_active(phyi); 10146 } 10147 10148 if (phyint_flags_modified) { 10149 if (phyi->phyint_illv4 != NULL) { 10150 ip_rts_ifmsg(phyi->phyint_illv4-> 10151 ill_ipif, RTSQ_DEFAULT); 10152 } 10153 if (phyi->phyint_illv6 != NULL) { 10154 ip_rts_ifmsg(phyi->phyint_illv6-> 10155 ill_ipif, RTSQ_DEFAULT); 10156 } 10157 } 10158 /* The default multicast interface might have changed */ 10159 ire_increment_multicast_generation(ill->ill_ipst, 10160 ill->ill_isv6); 10161 10162 return (0); 10163 } else if (set_linklocal) { 10164 mutex_enter(&ill->ill_lock); 10165 if (set_linklocal) 10166 ipif->ipif_state_flags |= IPIF_SET_LINKLOCAL; 10167 mutex_exit(&ill->ill_lock); 10168 } 10169 10170 /* 10171 * Disallow IPv6 interfaces coming up that have the unspecified address, 10172 * or point-to-point interfaces with an unspecified destination. 
We do 10173 * allow the address to be unspecified for IPIF_NOLOCAL interfaces that 10174 * have a subnet assigned, which is how in.ndpd currently manages its 10175 * onlink prefix list when no addresses are configured with those 10176 * prefixes. 10177 */ 10178 if (ipif->ipif_isv6 && 10179 ((IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) && 10180 (!(ipif->ipif_flags & IPIF_NOLOCAL) && !(turn_on & IPIF_NOLOCAL) || 10181 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) || 10182 ((ipif->ipif_flags & IPIF_POINTOPOINT) && 10183 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr)))) { 10184 return (EINVAL); 10185 } 10186 10187 /* 10188 * Prevent IPv4 point-to-point interfaces with a 0.0.0.0 destination 10189 * from being brought up. 10190 */ 10191 if (!ipif->ipif_isv6 && 10192 ((ipif->ipif_flags & IPIF_POINTOPOINT) && 10193 ipif->ipif_pp_dst_addr == INADDR_ANY)) { 10194 return (EINVAL); 10195 } 10196 10197 /* 10198 * If we are going to change one or more of the flags that are 10199 * IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL, ILLF_NOARP, 10200 * ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, IPIF_PREFERRED, and 10201 * IPIF_NOFAILOVER, we will take special action. This is 10202 * done by bringing the ipif down, changing the flags and bringing 10203 * it back up again. For IPIF_NOFAILOVER, the act of bringing it 10204 * back up will trigger the address to be moved. 10205 * 10206 * If we are going to change IFF_NOACCEPT, we need to bring 10207 * all the ipifs down and then bring them up again. The act of 10208 * bringing all the ipifs back up will trigger the local 10209 * ires being recreated with "no_accept" set/cleared. 10210 * 10211 * Note that ILLF_NOACCEPT is always set separately from the 10212 * other flags. 10213 */ 10214 if ((turn_on|turn_off) & 10215 (IPIF_UP|IPIF_DEPRECATED|IPIF_NOXMIT|IPIF_NOLOCAL|ILLF_NOARP| 10216 ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED| 10217 IPIF_NOFAILOVER)) { 10218 /* 10219 * ipif_down() will ire_delete the bcast ires for the subnet, 10220 * while the ire_identical_ref tracks the case of IRE_BROADCAST 10221 * entries shared between multiple ipifs on the same subnet. 10222 */ 10223 if (((ipif->ipif_flags | turn_on) & IPIF_UP) && 10224 !(turn_off & IPIF_UP)) { 10225 if (ipif->ipif_flags & IPIF_UP) 10226 ill->ill_logical_down = 1; 10227 turn_on &= ~IPIF_UP; 10228 } 10229 err = ipif_down(ipif, q, mp); 10230 ip1dbg(("ipif_down returns %d err ", err)); 10231 if (err == EINPROGRESS) 10232 return (err); 10233 (void) ipif_down_tail(ipif); 10234 } else if ((turn_on|turn_off) & ILLF_NOACCEPT) { 10235 /* 10236 * If we can quiesce the ill, then continue. If not, then 10237 * ip_sioctl_flags_tail() will be called from 10238 * ipif_ill_refrele_tail(). 10239 */ 10240 ill_down_ipifs(ill, B_TRUE); 10241 10242 mutex_enter(&connp->conn_lock); 10243 mutex_enter(&ill->ill_lock); 10244 if (!ill_is_quiescent(ill)) { 10245 boolean_t success; 10246 10247 success = ipsq_pending_mp_add(connp, ill->ill_ipif, 10248 q, mp, ILL_DOWN); 10249 mutex_exit(&ill->ill_lock); 10250 mutex_exit(&connp->conn_lock); 10251 return (success ?
EINPROGRESS : EINTR); 10252 } 10253 mutex_exit(&ill->ill_lock); 10254 mutex_exit(&connp->conn_lock); 10255 } 10256 return (ip_sioctl_flags_tail(ipif, flags, q, mp)); 10257 } 10258 10259 static int 10260 ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp) 10261 { 10262 ill_t *ill; 10263 phyint_t *phyi; 10264 uint64_t turn_on, turn_off; 10265 boolean_t phyint_flags_modified = B_FALSE; 10266 int err = 0; 10267 boolean_t set_linklocal = B_FALSE; 10268 10269 ip1dbg(("ip_sioctl_flags_tail(%s:%u)\n", 10270 ipif->ipif_ill->ill_name, ipif->ipif_id)); 10271 10272 ASSERT(IAM_WRITER_IPIF(ipif)); 10273 10274 ill = ipif->ipif_ill; 10275 phyi = ill->ill_phyint; 10276 10277 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off); 10278 10279 /* 10280 * IFF_UP is handled separately. 10281 */ 10282 turn_on &= ~IFF_UP; 10283 turn_off &= ~IFF_UP; 10284 10285 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS) 10286 phyint_flags_modified = B_TRUE; 10287 10288 /* 10289 * Now we change the flags. Track current value of 10290 * other flags in their respective places. 10291 */ 10292 mutex_enter(&ill->ill_lock); 10293 mutex_enter(&phyi->phyint_lock); 10294 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS); 10295 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS); 10296 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS); 10297 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS); 10298 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS); 10299 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS); 10300 if (ipif->ipif_state_flags & IPIF_SET_LINKLOCAL) { 10301 set_linklocal = B_TRUE; 10302 ipif->ipif_state_flags &= ~IPIF_SET_LINKLOCAL; 10303 } 10304 10305 mutex_exit(&ill->ill_lock); 10306 mutex_exit(&phyi->phyint_lock); 10307 10308 if (set_linklocal) 10309 (void) ipif_setlinklocal(ipif); 10310 10311 /* 10312 * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the same to 10313 * the kernel: if any of them has been set by userland, the interface 10314 * cannot be used for data traffic. 10315 */ 10316 if ((turn_on|turn_off) & (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) { 10317 ASSERT(!IS_IPMP(ill)); 10318 /* 10319 * It's possible the ill is part of an "anonymous" IPMP group 10320 * rather than a real group. In that case, there are no other 10321 * interfaces in the group and thus no need for us to call 10322 * ipmp_phyint_refresh_active(). 10323 */ 10324 if (IS_UNDER_IPMP(ill)) 10325 ipmp_phyint_refresh_active(phyi); 10326 } 10327 10328 if ((turn_on|turn_off) & ILLF_NOACCEPT) { 10329 /* 10330 * If the ILLF_NOACCEPT flag is changed, bring up all the 10331 * ipifs that were brought down. 10332 * 10333 * The routing socket messages are sent as a result of 10334 * ill_up_ipifs(); SCTP's IPIF list is updated there 10335 * as well. 10336 */ 10337 err = ill_up_ipifs(ill, q, mp); 10338 } else if ((flags & IFF_UP) && !(ipif->ipif_flags & IPIF_UP)) { 10339 /* 10340 * XXX ipif_up really does not know whether any phyint flags 10341 * were modified or not, so it sends up information in 10342 * only one routing sockets message. Since we don't bring up 10343 * the interface and also set PHYI_ flags simultaneously, 10344 * this should be okay. 10345 */ 10346 err = ipif_up(ipif, q, mp); 10347 } else { 10348 /* 10349 * Make sure routing socket sees all changes to the flags. 10350 * ipif_up_done* handles this when we use ipif_up.
10351 */ 10352 if (phyint_flags_modified) { 10353 if (phyi->phyint_illv4 != NULL) { 10354 ip_rts_ifmsg(phyi->phyint_illv4-> 10355 ill_ipif, RTSQ_DEFAULT); 10356 } 10357 if (phyi->phyint_illv6 != NULL) { 10358 ip_rts_ifmsg(phyi->phyint_illv6-> 10359 ill_ipif, RTSQ_DEFAULT); 10360 } 10361 } else { 10362 ip_rts_ifmsg(ipif, RTSQ_DEFAULT); 10363 } 10364 /* 10365 * Update the flags in SCTP's IPIF list; ipif_up() will do 10366 * this in the need_up case. 10367 */ 10368 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 10369 } 10370 10371 /* The default multicast interface might have changed */ 10372 ire_increment_multicast_generation(ill->ill_ipst, ill->ill_isv6); 10373 return (err); 10374 } 10375 10376 /* 10377 * Restart the flags operation now that the refcounts have dropped to zero. 10378 */ 10379 /* ARGSUSED */ 10380 int 10381 ip_sioctl_flags_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10382 ip_ioctl_cmd_t *ipip, void *if_req) 10383 { 10384 uint64_t flags; 10385 struct ifreq *ifr = if_req; 10386 struct lifreq *lifr = if_req; 10387 uint64_t turn_on, turn_off; 10388 10389 ip1dbg(("ip_sioctl_flags_restart(%s:%u %p)\n", 10390 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10391 10392 if (ipip->ipi_cmd_type == IF_CMD) { 10393 /* cast to uint16_t prevents unwanted sign extension */ 10394 flags = (uint16_t)ifr->ifr_flags; 10395 } else { 10396 flags = lifr->lifr_flags; 10397 } 10398 10399 /* 10400 * If this function call is a result of the ILLF_NOACCEPT flag 10401 * change, do not call ipif_down_tail(). See ip_sioctl_flags(). 10402 */ 10403 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off); 10404 if (!((turn_on|turn_off) & ILLF_NOACCEPT)) 10405 (void) ipif_down_tail(ipif); 10406 10407 return (ip_sioctl_flags_tail(ipif, flags, q, mp)); 10408 } 10409 10410 /* 10411 * Can operate on either a module or a driver queue. 10412 */ 10413 /* ARGSUSED */ 10414 int 10415 ip_sioctl_get_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10416 ip_ioctl_cmd_t *ipip, void *if_req) 10417 { 10418 /* 10419 * Have the flags been set correctly until now? 10420 */ 10421 ill_t *ill = ipif->ipif_ill; 10422 phyint_t *phyi = ill->ill_phyint; 10423 10424 ip1dbg(("ip_sioctl_get_flags(%s:%u %p)\n", 10425 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10426 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0); 10427 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0); 10428 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0); 10429 10430 /* 10431 * Need a lock since some flags can be set even when there are 10432 * references to the ipif. 10433 */ 10434 mutex_enter(&ill->ill_lock); 10435 if (ipip->ipi_cmd_type == IF_CMD) { 10436 struct ifreq *ifr = (struct ifreq *)if_req; 10437 10438 /* Get interface flags (low 16 only). */ 10439 ifr->ifr_flags = ((ipif->ipif_flags | 10440 ill->ill_flags | phyi->phyint_flags) & 0xffff); 10441 } else { 10442 struct lifreq *lifr = (struct lifreq *)if_req; 10443 10444 /* Get interface flags. */ 10445 lifr->lifr_flags = ipif->ipif_flags | 10446 ill->ill_flags | phyi->phyint_flags; 10447 } 10448 mutex_exit(&ill->ill_lock); 10449 return (0); 10450 } 10451 10452 /* 10453 * We allow the MTU to be set on an ILL, but not to be different 10454 * for different IPIFs, since we don't actually send packets on IPIFs.
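 *
 * For reference, a minimal userland sketch of setting the MTU
 * (illustrative only -- "net0" and the AF_INET socket "s" are
 * assumptions, not anything defined in this file):
 *
 *	struct lifreq lifr;
 *
 *	(void) memset(&lifr, 0, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "net0", sizeof (lifr.lifr_name));
 *	lifr.lifr_mtu = 1400;
 *	if (ioctl(s, SIOCSLIFMTU, &lifr) < 0)
 *		perror("SIOCSLIFMTU");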
10455 */ 10456 /* ARGSUSED */ 10457 int 10458 ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10459 ip_ioctl_cmd_t *ipip, void *if_req) 10460 { 10461 int mtu; 10462 int ip_min_mtu; 10463 struct ifreq *ifr; 10464 struct lifreq *lifr; 10465 ill_t *ill; 10466 10467 ip1dbg(("ip_sioctl_mtu(%s:%u %p)\n", ipif->ipif_ill->ill_name, 10468 ipif->ipif_id, (void *)ipif)); 10469 if (ipip->ipi_cmd_type == IF_CMD) { 10470 ifr = (struct ifreq *)if_req; 10471 mtu = ifr->ifr_metric; 10472 } else { 10473 lifr = (struct lifreq *)if_req; 10474 mtu = lifr->lifr_mtu; 10475 } 10476 /* Only allow for logical unit zero i.e. not on "bge0:17" */ 10477 if (ipif->ipif_id != 0) 10478 return (EINVAL); 10479 10480 ill = ipif->ipif_ill; 10481 if (ipif->ipif_isv6) 10482 ip_min_mtu = IPV6_MIN_MTU; 10483 else 10484 ip_min_mtu = IP_MIN_MTU; 10485 10486 mutex_enter(&ill->ill_lock); 10487 if (mtu > ill->ill_max_frag || mtu < ip_min_mtu) { 10488 mutex_exit(&ill->ill_lock); 10489 return (EINVAL); 10490 } 10491 /* 10492 * The dce and fragmentation code can handle changes to ill_mtu 10493 * concurrent with sending/fragmenting packets. 10494 */ 10495 ill->ill_mtu = mtu; 10496 ill->ill_flags |= ILLF_FIXEDMTU; 10497 mutex_exit(&ill->ill_lock); 10498 10499 /* 10500 * Make sure all dce_generation checks find out 10501 * that ill_mtu has changed. 10502 */ 10503 dce_increment_all_generations(ill->ill_isv6, ill->ill_ipst); 10504 10505 /* Update the MTU in SCTP's list */ 10506 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 10507 return (0); 10508 } 10509 10510 /* Get interface MTU. */ 10511 /* ARGSUSED */ 10512 int 10513 ip_sioctl_get_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10514 ip_ioctl_cmd_t *ipip, void *if_req) 10515 { 10516 struct ifreq *ifr; 10517 struct lifreq *lifr; 10518 10519 ip1dbg(("ip_sioctl_get_mtu(%s:%u %p)\n", 10520 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10521 10522 /* 10523 * We allow a get on any logical interface even though the set 10524 * can only be done on logical unit 0. 10525 */ 10526 if (ipip->ipi_cmd_type == IF_CMD) { 10527 ifr = (struct ifreq *)if_req; 10528 ifr->ifr_metric = ipif->ipif_ill->ill_mtu; 10529 } else { 10530 lifr = (struct lifreq *)if_req; 10531 lifr->lifr_mtu = ipif->ipif_ill->ill_mtu; 10532 } 10533 return (0); 10534 } 10535 10536 /* Set interface broadcast address. */ 10537 /* ARGSUSED2 */ 10538 int 10539 ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10540 ip_ioctl_cmd_t *ipip, void *if_req) 10541 { 10542 ipaddr_t addr; 10543 ire_t *ire; 10544 ill_t *ill = ipif->ipif_ill; 10545 ip_stack_t *ipst = ill->ill_ipst; 10546 10547 ip1dbg(("ip_sioctl_brdaddr(%s:%u)\n", ill->ill_name, 10548 ipif->ipif_id)); 10549 10550 ASSERT(IAM_WRITER_IPIF(ipif)); 10551 if (!(ipif->ipif_flags & IPIF_BROADCAST)) 10552 return (EADDRNOTAVAIL); 10553 10554 ASSERT(!(ipif->ipif_isv6)); /* No IPv6 broadcast */ 10555 10556 if (sin->sin_family != AF_INET) 10557 return (EAFNOSUPPORT); 10558 10559 addr = sin->sin_addr.s_addr; 10560 if (ipif->ipif_flags & IPIF_UP) { 10561 /* 10562 * If we are already up, make sure the new 10563 * broadcast address makes sense. If it does, 10564 * there should be an IRE for it already. 10565 */ 10566 ire = ire_ftable_lookup_v4(addr, 0, 0, IRE_BROADCAST, 10567 ill, ipif->ipif_zoneid, NULL, 10568 (MATCH_IRE_ILL | MATCH_IRE_TYPE), 0, ipst, NULL); 10569 if (ire == NULL) { 10570 return (EINVAL); 10571 } else { 10572 ire_refrele(ire); 10573 } 10574 } 10575 /* 10576 * Changing the broadcast addr for this ipif. 
Since the IRE_BROADCAST 10577 * needs to already exist we never need to change the set of 10578 * IRE_BROADCASTs when we are UP. 10579 */ 10580 if (addr != ipif->ipif_brd_addr) 10581 IN6_IPADDR_TO_V4MAPPED(addr, &ipif->ipif_v6brd_addr); 10582 10583 return (0); 10584 } 10585 10586 /* Get interface broadcast address. */ 10587 /* ARGSUSED */ 10588 int 10589 ip_sioctl_get_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10590 ip_ioctl_cmd_t *ipip, void *if_req) 10591 { 10592 ip1dbg(("ip_sioctl_get_brdaddr(%s:%u %p)\n", 10593 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10594 if (!(ipif->ipif_flags & IPIF_BROADCAST)) 10595 return (EADDRNOTAVAIL); 10596 10597 /* IPIF_BROADCAST not possible with IPv6 */ 10598 ASSERT(!ipif->ipif_isv6); 10599 *sin = sin_null; 10600 sin->sin_family = AF_INET; 10601 sin->sin_addr.s_addr = ipif->ipif_brd_addr; 10602 return (0); 10603 } 10604 10605 /* 10606 * This routine is called to handle the SIOCS*IFNETMASK IOCTL. 10607 */ 10608 /* ARGSUSED */ 10609 int 10610 ip_sioctl_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10611 ip_ioctl_cmd_t *ipip, void *if_req) 10612 { 10613 int err = 0; 10614 in6_addr_t v6mask; 10615 10616 ip1dbg(("ip_sioctl_netmask(%s:%u %p)\n", 10617 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10618 10619 ASSERT(IAM_WRITER_IPIF(ipif)); 10620 10621 if (ipif->ipif_isv6) { 10622 sin6_t *sin6; 10623 10624 if (sin->sin_family != AF_INET6) 10625 return (EAFNOSUPPORT); 10626 10627 sin6 = (sin6_t *)sin; 10628 v6mask = sin6->sin6_addr; 10629 } else { 10630 ipaddr_t mask; 10631 10632 if (sin->sin_family != AF_INET) 10633 return (EAFNOSUPPORT); 10634 10635 mask = sin->sin_addr.s_addr; 10636 V4MASK_TO_V6(mask, v6mask); 10637 } 10638 10639 /* 10640 * No big deal if the interface isn't already up, or the mask 10641 * isn't really changing, or this is pt-pt. 10642 */ 10643 if (!(ipif->ipif_flags & IPIF_UP) || 10644 IN6_ARE_ADDR_EQUAL(&v6mask, &ipif->ipif_v6net_mask) || 10645 (ipif->ipif_flags & IPIF_POINTOPOINT)) { 10646 ipif->ipif_v6net_mask = v6mask; 10647 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 10648 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 10649 ipif->ipif_v6net_mask, 10650 ipif->ipif_v6subnet); 10651 } 10652 return (0); 10653 } 10654 /* 10655 * Make sure we have valid net and subnet broadcast ire's 10656 * for the old netmask, if needed by other logical interfaces. 10657 */ 10658 err = ipif_logical_down(ipif, q, mp); 10659 if (err == EINPROGRESS) 10660 return (err); 10661 (void) ipif_down_tail(ipif); 10662 err = ip_sioctl_netmask_tail(ipif, sin, q, mp); 10663 return (err); 10664 } 10665 10666 static int 10667 ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp) 10668 { 10669 in6_addr_t v6mask; 10670 int err = 0; 10671 10672 ip1dbg(("ip_sioctl_netmask_tail(%s:%u %p)\n", 10673 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10674 10675 if (ipif->ipif_isv6) { 10676 sin6_t *sin6; 10677 10678 sin6 = (sin6_t *)sin; 10679 v6mask = sin6->sin6_addr; 10680 } else { 10681 ipaddr_t mask; 10682 10683 mask = sin->sin_addr.s_addr; 10684 V4MASK_TO_V6(mask, v6mask); 10685 } 10686 10687 ipif->ipif_v6net_mask = v6mask; 10688 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 10689 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 10690 ipif->ipif_v6subnet); 10691 } 10692 err = ipif_up(ipif, q, mp); 10693 10694 if (err == 0 || err == EINPROGRESS) { 10695 /* 10696 * The interface must be DL_BOUND if this packet has to 10697 * go out on the wire. 
Since we only go through a logical 10698 * down and are bound with the driver during an internal 10699 * down/up that is satisfied. 10700 */ 10701 if (!ipif->ipif_isv6 && ipif->ipif_ill->ill_wq != NULL) { 10702 /* Potentially broadcast an address mask reply. */ 10703 ipif_mask_reply(ipif); 10704 } 10705 } 10706 return (err); 10707 } 10708 10709 /* ARGSUSED */ 10710 int 10711 ip_sioctl_netmask_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10712 ip_ioctl_cmd_t *ipip, void *if_req) 10713 { 10714 ip1dbg(("ip_sioctl_netmask_restart(%s:%u %p)\n", 10715 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10716 (void) ipif_down_tail(ipif); 10717 return (ip_sioctl_netmask_tail(ipif, sin, q, mp)); 10718 } 10719 10720 /* Get interface net mask. */ 10721 /* ARGSUSED */ 10722 int 10723 ip_sioctl_get_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10724 ip_ioctl_cmd_t *ipip, void *if_req) 10725 { 10726 struct lifreq *lifr = (struct lifreq *)if_req; 10727 struct sockaddr_in6 *sin6 = (sin6_t *)sin; 10728 10729 ip1dbg(("ip_sioctl_get_netmask(%s:%u %p)\n", 10730 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10731 10732 /* 10733 * net mask can't change since we have a reference to the ipif. 10734 */ 10735 if (ipif->ipif_isv6) { 10736 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 10737 *sin6 = sin6_null; 10738 sin6->sin6_family = AF_INET6; 10739 sin6->sin6_addr = ipif->ipif_v6net_mask; 10740 lifr->lifr_addrlen = 10741 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 10742 } else { 10743 *sin = sin_null; 10744 sin->sin_family = AF_INET; 10745 sin->sin_addr.s_addr = ipif->ipif_net_mask; 10746 if (ipip->ipi_cmd_type == LIF_CMD) { 10747 lifr->lifr_addrlen = 10748 ip_mask_to_plen(ipif->ipif_net_mask); 10749 } 10750 } 10751 return (0); 10752 } 10753 10754 /* ARGSUSED */ 10755 int 10756 ip_sioctl_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10757 ip_ioctl_cmd_t *ipip, void *if_req) 10758 { 10759 ip1dbg(("ip_sioctl_metric(%s:%u %p)\n", 10760 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10761 10762 /* 10763 * Since no applications should ever be setting metrics on underlying 10764 * interfaces, we explicitly fail to smoke 'em out. 10765 */ 10766 if (IS_UNDER_IPMP(ipif->ipif_ill)) 10767 return (EINVAL); 10768 10769 /* 10770 * Set interface metric. We don't use this for 10771 * anything but we keep track of it in case it is 10772 * important to routing applications or such. 10773 */ 10774 if (ipip->ipi_cmd_type == IF_CMD) { 10775 struct ifreq *ifr; 10776 10777 ifr = (struct ifreq *)if_req; 10778 ipif->ipif_metric = ifr->ifr_metric; 10779 } else { 10780 struct lifreq *lifr; 10781 10782 lifr = (struct lifreq *)if_req; 10783 ipif->ipif_metric = lifr->lifr_metric; 10784 } 10785 return (0); 10786 } 10787 10788 /* ARGSUSED */ 10789 int 10790 ip_sioctl_get_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10791 ip_ioctl_cmd_t *ipip, void *if_req) 10792 { 10793 /* Get interface metric. 
*/ 10794 ip1dbg(("ip_sioctl_get_metric(%s:%u %p)\n", 10795 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10796 10797 if (ipip->ipi_cmd_type == IF_CMD) { 10798 struct ifreq *ifr; 10799 10800 ifr = (struct ifreq *)if_req; 10801 ifr->ifr_metric = ipif->ipif_metric; 10802 } else { 10803 struct lifreq *lifr; 10804 10805 lifr = (struct lifreq *)if_req; 10806 lifr->lifr_metric = ipif->ipif_metric; 10807 } 10808 10809 return (0); 10810 } 10811 10812 /* ARGSUSED */ 10813 int 10814 ip_sioctl_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10815 ip_ioctl_cmd_t *ipip, void *if_req) 10816 { 10817 int arp_muxid; 10818 10819 ip1dbg(("ip_sioctl_muxid(%s:%u %p)\n", 10820 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10821 /* 10822 * Set the muxid returned from I_PLINK. 10823 */ 10824 if (ipip->ipi_cmd_type == IF_CMD) { 10825 struct ifreq *ifr = (struct ifreq *)if_req; 10826 10827 ipif->ipif_ill->ill_muxid = ifr->ifr_ip_muxid; 10828 arp_muxid = ifr->ifr_arp_muxid; 10829 } else { 10830 struct lifreq *lifr = (struct lifreq *)if_req; 10831 10832 ipif->ipif_ill->ill_muxid = lifr->lifr_ip_muxid; 10833 arp_muxid = lifr->lifr_arp_muxid; 10834 } 10835 arl_set_muxid(ipif->ipif_ill, arp_muxid); 10836 return (0); 10837 } 10838 10839 /* ARGSUSED */ 10840 int 10841 ip_sioctl_get_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10842 ip_ioctl_cmd_t *ipip, void *if_req) 10843 { 10844 int arp_muxid = 0; 10845 10846 ip1dbg(("ip_sioctl_get_muxid(%s:%u %p)\n", 10847 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10848 /* 10849 * Get the muxid saved in ill for I_PUNLINK. 10850 */ 10851 arp_muxid = arl_get_muxid(ipif->ipif_ill); 10852 if (ipip->ipi_cmd_type == IF_CMD) { 10853 struct ifreq *ifr = (struct ifreq *)if_req; 10854 10855 ifr->ifr_ip_muxid = ipif->ipif_ill->ill_muxid; 10856 ifr->ifr_arp_muxid = arp_muxid; 10857 } else { 10858 struct lifreq *lifr = (struct lifreq *)if_req; 10859 10860 lifr->lifr_ip_muxid = ipif->ipif_ill->ill_muxid; 10861 lifr->lifr_arp_muxid = arp_muxid; 10862 } 10863 return (0); 10864 } 10865 10866 /* 10867 * Set the subnet prefix. Does not modify the broadcast address. 
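 * Note for the IPv4 arm below: lifr_addrlen arrives as an IPv4 prefix
 * length and is converted to v4-mapped IPv6 form by adding
 * IPV6_ABITS - IP_ABITS (i.e. 96) bits, so that a single IPv6 mask
 * computation serves both address families.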
10868 */ 10869 /* ARGSUSED */ 10870 int 10871 ip_sioctl_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10872 ip_ioctl_cmd_t *ipip, void *if_req) 10873 { 10874 int err = 0; 10875 in6_addr_t v6addr; 10876 in6_addr_t v6mask; 10877 boolean_t need_up = B_FALSE; 10878 int addrlen; 10879 10880 ip1dbg(("ip_sioctl_subnet(%s:%u %p)\n", 10881 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10882 10883 ASSERT(IAM_WRITER_IPIF(ipif)); 10884 addrlen = ((struct lifreq *)if_req)->lifr_addrlen; 10885 10886 if (ipif->ipif_isv6) { 10887 sin6_t *sin6; 10888 10889 if (sin->sin_family != AF_INET6) 10890 return (EAFNOSUPPORT); 10891 10892 sin6 = (sin6_t *)sin; 10893 v6addr = sin6->sin6_addr; 10894 if (!ip_remote_addr_ok_v6(&v6addr, &ipv6_all_ones)) 10895 return (EADDRNOTAVAIL); 10896 } else { 10897 ipaddr_t addr; 10898 10899 if (sin->sin_family != AF_INET) 10900 return (EAFNOSUPPORT); 10901 10902 addr = sin->sin_addr.s_addr; 10903 if (!ip_addr_ok_v4(addr, 0xFFFFFFFF)) 10904 return (EADDRNOTAVAIL); 10905 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 10906 /* Add 96 bits */ 10907 addrlen += IPV6_ABITS - IP_ABITS; 10908 } 10909 10910 if (ip_plen_to_mask_v6(addrlen, &v6mask) == NULL) 10911 return (EINVAL); 10912 10913 /* Check that no bits in the address are set past the mask */ 10914 if (!V6_MASK_EQ(v6addr, v6mask, v6addr)) 10915 return (EINVAL); 10916 10917 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6subnet, &v6addr) && 10918 IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6net_mask, &v6mask)) 10919 return (0); /* No change */ 10920 10921 if (ipif->ipif_flags & IPIF_UP) { 10922 /* 10923 * If the interface is already marked up, 10924 * we call ipif_down which will take care 10925 * of ditching any IREs that have been set 10926 * up based on the old interface address. 10927 */ 10928 err = ipif_logical_down(ipif, q, mp); 10929 if (err == EINPROGRESS) 10930 return (err); 10931 (void) ipif_down_tail(ipif); 10932 need_up = B_TRUE; 10933 } 10934 10935 err = ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, need_up); 10936 return (err); 10937 } 10938 10939 static int 10940 ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t v6addr, in6_addr_t v6mask, 10941 queue_t *q, mblk_t *mp, boolean_t need_up) 10942 { 10943 ill_t *ill = ipif->ipif_ill; 10944 int err = 0; 10945 10946 ip1dbg(("ip_sioctl_subnet_tail(%s:%u %p)\n", 10947 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10948 10949 /* Set the new address. */ 10950 mutex_enter(&ill->ill_lock); 10951 ipif->ipif_v6net_mask = v6mask; 10952 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 10953 V6_MASK_COPY(v6addr, ipif->ipif_v6net_mask, 10954 ipif->ipif_v6subnet); 10955 } 10956 mutex_exit(&ill->ill_lock); 10957 10958 if (need_up) { 10959 /* 10960 * Now bring the interface back up. If this 10961 * is the only IPIF for the ILL, ipif_up 10962 * will have to re-bind to the device, so 10963 * we may get back EINPROGRESS, in which 10964 * case, this IOCTL will get completed in 10965 * ip_rput_dlpi when we see the DL_BIND_ACK.
10966 */ 10967 err = ipif_up(ipif, q, mp); 10968 if (err == EINPROGRESS) 10969 return (err); 10970 } 10971 return (err); 10972 } 10973 10974 /* ARGSUSED */ 10975 int 10976 ip_sioctl_subnet_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10977 ip_ioctl_cmd_t *ipip, void *if_req) 10978 { 10979 int addrlen; 10980 in6_addr_t v6addr; 10981 in6_addr_t v6mask; 10982 struct lifreq *lifr = (struct lifreq *)if_req; 10983 10984 ip1dbg(("ip_sioctl_subnet_restart(%s:%u %p)\n", 10985 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10986 (void) ipif_down_tail(ipif); 10987 10988 addrlen = lifr->lifr_addrlen; 10989 if (ipif->ipif_isv6) { 10990 sin6_t *sin6; 10991 10992 sin6 = (sin6_t *)sin; 10993 v6addr = sin6->sin6_addr; 10994 } else { 10995 ipaddr_t addr; 10996 10997 addr = sin->sin_addr.s_addr; 10998 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 10999 addrlen += IPV6_ABITS - IP_ABITS; 11000 } 11001 (void) ip_plen_to_mask_v6(addrlen, &v6mask); 11002 11003 return (ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, B_TRUE)); 11004 } 11005 11006 /* ARGSUSED */ 11007 int 11008 ip_sioctl_get_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11009 ip_ioctl_cmd_t *ipip, void *if_req) 11010 { 11011 struct lifreq *lifr = (struct lifreq *)if_req; 11012 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sin; 11013 11014 ip1dbg(("ip_sioctl_get_subnet(%s:%u %p)\n", 11015 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11016 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 11017 11018 if (ipif->ipif_isv6) { 11019 *sin6 = sin6_null; 11020 sin6->sin6_family = AF_INET6; 11021 sin6->sin6_addr = ipif->ipif_v6subnet; 11022 lifr->lifr_addrlen = 11023 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 11024 } else { 11025 *sin = sin_null; 11026 sin->sin_family = AF_INET; 11027 sin->sin_addr.s_addr = ipif->ipif_subnet; 11028 lifr->lifr_addrlen = ip_mask_to_plen(ipif->ipif_net_mask); 11029 } 11030 return (0); 11031 } 11032 11033 /* 11034 * Set the IPv6 address token. 11035 */ 11036 /* ARGSUSED */ 11037 int 11038 ip_sioctl_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11039 ip_ioctl_cmd_t *ipi, void *if_req) 11040 { 11041 ill_t *ill = ipif->ipif_ill; 11042 int err; 11043 in6_addr_t v6addr; 11044 in6_addr_t v6mask; 11045 boolean_t need_up = B_FALSE; 11046 int i; 11047 sin6_t *sin6 = (sin6_t *)sin; 11048 struct lifreq *lifr = (struct lifreq *)if_req; 11049 int addrlen; 11050 11051 ip1dbg(("ip_sioctl_token(%s:%u %p)\n", 11052 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11053 ASSERT(IAM_WRITER_IPIF(ipif)); 11054 11055 addrlen = lifr->lifr_addrlen; 11056 /* Only allow for logical unit zero i.e. not on "le0:17" */ 11057 if (ipif->ipif_id != 0) 11058 return (EINVAL); 11059 11060 if (!ipif->ipif_isv6) 11061 return (EINVAL); 11062 11063 if (addrlen > IPV6_ABITS) 11064 return (EINVAL); 11065 11066 v6addr = sin6->sin6_addr; 11067 11068 /* 11069 * The length of the token is the length from the end. To get 11070 * the proper mask for this, compute the mask of the bits not 11071 * in the token; ie. the prefix, and then xor to get the mask. 
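 *
 * Worked example (illustrative): with addrlen = 64,
 * ip_plen_to_mask_v6(128 - 64, ...) yields the prefix mask
 * ffff:ffff:ffff:ffff::; inverting each 32-bit word below turns that
 * into ::ffff:ffff:ffff:ffff, which selects exactly the low 64
 * (token) bits of the address.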
11072 */ 11073 if (ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask) == NULL) 11074 return (EINVAL); 11075 for (i = 0; i < 4; i++) { 11076 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 11077 } 11078 11079 if (V6_MASK_EQ(v6addr, v6mask, ill->ill_token) && 11080 ill->ill_token_length == addrlen) 11081 return (0); /* No change */ 11082 11083 if (ipif->ipif_flags & IPIF_UP) { 11084 err = ipif_logical_down(ipif, q, mp); 11085 if (err == EINPROGRESS) 11086 return (err); 11087 (void) ipif_down_tail(ipif); 11088 need_up = B_TRUE; 11089 } 11090 err = ip_sioctl_token_tail(ipif, sin6, addrlen, q, mp, need_up); 11091 return (err); 11092 } 11093 11094 static int 11095 ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, queue_t *q, 11096 mblk_t *mp, boolean_t need_up) 11097 { 11098 in6_addr_t v6addr; 11099 in6_addr_t v6mask; 11100 ill_t *ill = ipif->ipif_ill; 11101 int i; 11102 int err = 0; 11103 11104 ip1dbg(("ip_sioctl_token_tail(%s:%u %p)\n", 11105 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11106 v6addr = sin6->sin6_addr; 11107 /* 11108 * The length of the token is the length from the end. To get 11109 * the proper mask for this, compute the mask of the bits not 11110 * in the token; ie. the prefix, and then xor to get the mask. 11111 */ 11112 (void) ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask); 11113 for (i = 0; i < 4; i++) 11114 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 11115 11116 mutex_enter(&ill->ill_lock); 11117 V6_MASK_COPY(v6addr, v6mask, ill->ill_token); 11118 ill->ill_token_length = addrlen; 11119 ill->ill_manual_token = 1; 11120 11121 /* Reconfigure the link-local address based on this new token */ 11122 ipif_setlinklocal(ill->ill_ipif); 11123 11124 mutex_exit(&ill->ill_lock); 11125 11126 if (need_up) { 11127 /* 11128 * Now bring the interface back up. If this 11129 * is the only IPIF for the ILL, ipif_up 11130 * will have to re-bind to the device, so 11131 * we may get back EINPROGRESS, in which 11132 * case, this IOCTL will get completed in 11133 * ip_rput_dlpi when we see the DL_BIND_ACK. 11134 */ 11135 err = ipif_up(ipif, q, mp); 11136 if (err == EINPROGRESS) 11137 return (err); 11138 } 11139 return (err); 11140 } 11141 11142 /* ARGSUSED */ 11143 int 11144 ip_sioctl_get_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11145 ip_ioctl_cmd_t *ipi, void *if_req) 11146 { 11147 ill_t *ill; 11148 sin6_t *sin6 = (sin6_t *)sin; 11149 struct lifreq *lifr = (struct lifreq *)if_req; 11150 11151 ip1dbg(("ip_sioctl_get_token(%s:%u %p)\n", 11152 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11153 if (ipif->ipif_id != 0) 11154 return (EINVAL); 11155 11156 ill = ipif->ipif_ill; 11157 if (!ill->ill_isv6) 11158 return (ENXIO); 11159 11160 *sin6 = sin6_null; 11161 sin6->sin6_family = AF_INET6; 11162 ASSERT(!IN6_IS_ADDR_V4MAPPED(&ill->ill_token)); 11163 sin6->sin6_addr = ill->ill_token; 11164 lifr->lifr_addrlen = ill->ill_token_length; 11165 return (0); 11166 } 11167 11168 /* 11169 * Set (hardware) link specific information that might override 11170 * what was acquired through the DL_INFO_ACK. 
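 *
 * A minimal userland sketch (illustrative only -- "net0", the AF_INET
 * socket "s" and the chosen value are assumptions; for the MTU and
 * reachability fields a value of zero means "leave unchanged"):
 *
 *	struct lifreq lifr;
 *
 *	(void) memset(&lifr, 0, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "net0", sizeof (lifr.lifr_name));
 *	lifr.lifr_ifinfo.lir_maxmtu = 1280;
 *	if (ioctl(s, SIOCSLIFLNKINFO, &lifr) < 0)
 *		perror("SIOCSLIFLNKINFO");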
11171 */ 11172 /* ARGSUSED */ 11173 int 11174 ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11175 ip_ioctl_cmd_t *ipi, void *if_req) 11176 { 11177 ill_t *ill = ipif->ipif_ill; 11178 int ip_min_mtu; 11179 struct lifreq *lifr = (struct lifreq *)if_req; 11180 lif_ifinfo_req_t *lir; 11181 11182 ip1dbg(("ip_sioctl_lnkinfo(%s:%u %p)\n", 11183 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11184 lir = &lifr->lifr_ifinfo; 11185 ASSERT(IAM_WRITER_IPIF(ipif)); 11186 11187 /* Only allow for logical unit zero i.e. not on "bge0:17" */ 11188 if (ipif->ipif_id != 0) 11189 return (EINVAL); 11190 11191 /* Set interface MTU. */ 11192 if (ipif->ipif_isv6) 11193 ip_min_mtu = IPV6_MIN_MTU; 11194 else 11195 ip_min_mtu = IP_MIN_MTU; 11196 11197 /* 11198 * Verify values before we set anything. Allow zero to 11199 * mean unspecified. 11200 * 11201 * XXX We should be able to set the user-defined lir_mtu to some value 11202 * that is greater than ill_current_frag but less than ill_max_frag- the 11203 * ill_max_frag value tells us the max MTU that can be handled by the 11204 * datalink, whereas the ill_current_frag is dynamically computed for 11205 * some link-types like tunnels, based on the tunnel PMTU. However, 11206 * since there is currently no way of distinguishing between 11207 * administratively fixed link mtu values (e.g., those set via 11208 * /sbin/dladm) and dynamically discovered MTUs (e.g., those discovered 11209 * for tunnels) we conservatively choose the ill_current_frag as the 11210 * upper-bound. 11211 */ 11212 if (lir->lir_maxmtu != 0 && 11213 (lir->lir_maxmtu > ill->ill_current_frag || 11214 lir->lir_maxmtu < ip_min_mtu)) 11215 return (EINVAL); 11216 if (lir->lir_reachtime != 0 && 11217 lir->lir_reachtime > ND_MAX_REACHTIME) 11218 return (EINVAL); 11219 if (lir->lir_reachretrans != 0 && 11220 lir->lir_reachretrans > ND_MAX_REACHRETRANSTIME) 11221 return (EINVAL); 11222 11223 mutex_enter(&ill->ill_lock); 11224 /* 11225 * The dce and fragmentation code can handle changes to ill_mtu 11226 * concurrent with sending/fragmenting packets. 11227 */ 11228 if (lir->lir_maxmtu != 0) 11229 ill->ill_user_mtu = lir->lir_maxmtu; 11230 11231 if (lir->lir_reachtime != 0) 11232 ill->ill_reachable_time = lir->lir_reachtime; 11233 11234 if (lir->lir_reachretrans != 0) 11235 ill->ill_reachable_retrans_time = lir->lir_reachretrans; 11236 11237 ill->ill_max_hops = lir->lir_maxhops; 11238 ill->ill_max_buf = ND_MAX_Q; 11239 if (!(ill->ill_flags & ILLF_FIXEDMTU) && ill->ill_user_mtu != 0) { 11240 /* 11241 * ill_mtu is the actual interface MTU, obtained as the min 11242 * of user-configured mtu and the value announced by the 11243 * driver (via DL_NOTE_SDU_SIZE/DL_INFO_ACK). Note that since 11244 * we have already made the choice of requiring 11245 * ill_user_mtu < ill_current_frag by the time we get here, 11246 * the ill_mtu effectively gets assigned to the ill_user_mtu 11247 * here. 11248 */ 11249 ill->ill_mtu = MIN(ill->ill_current_frag, ill->ill_user_mtu); 11250 } 11251 mutex_exit(&ill->ill_lock); 11252 11253 /* 11254 * Make sure all dce_generation checks find out 11255 * that ill_mtu has changed. 11256 */ 11257 if (!(ill->ill_flags & ILLF_FIXEDMTU) && (lir->lir_maxmtu != 0)) 11258 dce_increment_all_generations(ill->ill_isv6, ill->ill_ipst); 11259 11260 /* 11261 * Refresh IPMP meta-interface MTU if necessary. 
11262 */ 11263 if (IS_UNDER_IPMP(ill)) 11264 ipmp_illgrp_refresh_mtu(ill->ill_grp); 11265 11266 return (0); 11267 } 11268 11269 /* ARGSUSED */ 11270 int 11271 ip_sioctl_get_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11272 ip_ioctl_cmd_t *ipi, void *if_req) 11273 { 11274 struct lif_ifinfo_req *lir; 11275 ill_t *ill = ipif->ipif_ill; 11276 11277 ip1dbg(("ip_sioctl_get_lnkinfo(%s:%u %p)\n", 11278 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11279 if (ipif->ipif_id != 0) 11280 return (EINVAL); 11281 11282 lir = &((struct lifreq *)if_req)->lifr_ifinfo; 11283 lir->lir_maxhops = ill->ill_max_hops; 11284 lir->lir_reachtime = ill->ill_reachable_time; 11285 lir->lir_reachretrans = ill->ill_reachable_retrans_time; 11286 lir->lir_maxmtu = ill->ill_mtu; 11287 11288 return (0); 11289 } 11290 11291 /* 11292 * Return best guess as to the subnet mask for the specified address. 11293 * Based on the subnet masks for all the configured interfaces. 11294 * 11295 * We end up returning a zero mask in the case of default, multicast or 11296 * experimental. 11297 */ 11298 static ipaddr_t 11299 ip_subnet_mask(ipaddr_t addr, ipif_t **ipifp, ip_stack_t *ipst) 11300 { 11301 ipaddr_t net_mask; 11302 ill_t *ill; 11303 ipif_t *ipif; 11304 ill_walk_context_t ctx; 11305 ipif_t *fallback_ipif = NULL; 11306 11307 net_mask = ip_net_mask(addr); 11308 if (net_mask == 0) { 11309 *ipifp = NULL; 11310 return (0); 11311 } 11312 11313 /* Let's check to see if this is maybe a local subnet route. */ 11314 /* this function only applies to IPv4 interfaces */ 11315 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 11316 ill = ILL_START_WALK_V4(&ctx, ipst); 11317 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 11318 mutex_enter(&ill->ill_lock); 11319 for (ipif = ill->ill_ipif; ipif != NULL; 11320 ipif = ipif->ipif_next) { 11321 if (IPIF_IS_CONDEMNED(ipif)) 11322 continue; 11323 if (!(ipif->ipif_flags & IPIF_UP)) 11324 continue; 11325 if ((ipif->ipif_subnet & net_mask) == 11326 (addr & net_mask)) { 11327 /* 11328 * Don't trust pt-pt interfaces if there are 11329 * other interfaces. 11330 */ 11331 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 11332 if (fallback_ipif == NULL) { 11333 ipif_refhold_locked(ipif); 11334 fallback_ipif = ipif; 11335 } 11336 continue; 11337 } 11338 11339 /* 11340 * Fine. Just assume the same net mask as the 11341 * directly attached subnet interface is using. 11342 */ 11343 ipif_refhold_locked(ipif); 11344 mutex_exit(&ill->ill_lock); 11345 rw_exit(&ipst->ips_ill_g_lock); 11346 if (fallback_ipif != NULL) 11347 ipif_refrele(fallback_ipif); 11348 *ipifp = ipif; 11349 return (ipif->ipif_net_mask); 11350 } 11351 } 11352 mutex_exit(&ill->ill_lock); 11353 } 11354 rw_exit(&ipst->ips_ill_g_lock); 11355 11356 *ipifp = fallback_ipif; 11357 return ((fallback_ipif != NULL) ? 11358 fallback_ipif->ipif_net_mask : net_mask); 11359 } 11360 11361 /* 11362 * ip_sioctl_copyin_setup calls ip_wput_ioctl to process the IP_IOCTL ioctl. 
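 * The ioctl payload (mp->b_cont) begins with an ipllc_t whose
 * ipllc_cmd field selects the matching entry in ip_ioctl_ftbl; the
 * entry's ipft_pfi function is then invoked once at least
 * ipft_min_size bytes of the message have been pulled up.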
11363 */ 11364 static void 11365 ip_wput_ioctl(queue_t *q, mblk_t *mp) 11366 { 11367 IOCP iocp; 11368 ipft_t *ipft; 11369 ipllc_t *ipllc; 11370 mblk_t *mp1; 11371 cred_t *cr; 11372 int error = 0; 11373 conn_t *connp; 11374 11375 ip1dbg(("ip_wput_ioctl")); 11376 iocp = (IOCP)mp->b_rptr; 11377 mp1 = mp->b_cont; 11378 if (mp1 == NULL) { 11379 iocp->ioc_error = EINVAL; 11380 mp->b_datap->db_type = M_IOCNAK; 11381 iocp->ioc_count = 0; 11382 qreply(q, mp); 11383 return; 11384 } 11385 11386 /* 11387 * These IOCTLs provide various control capabilities to 11388 * upstream agents such as ULPs and processes. There 11389 * are currently two such IOCTLs implemented. They 11390 * are used by TCP to provide update information for 11391 * existing IREs and to forcibly delete an IRE for a 11392 * host that is not responding, thereby forcing an 11393 * attempt at a new route. 11394 */ 11395 iocp->ioc_error = EINVAL; 11396 if (!pullupmsg(mp1, sizeof (ipllc->ipllc_cmd))) 11397 goto done; 11398 11399 ipllc = (ipllc_t *)mp1->b_rptr; 11400 for (ipft = ip_ioctl_ftbl; ipft->ipft_pfi; ipft++) { 11401 if (ipllc->ipllc_cmd == ipft->ipft_cmd) 11402 break; 11403 } 11404 /* 11405 * prefer credential from mblk over ioctl; 11406 * see ip_sioctl_copyin_setup 11407 */ 11408 cr = msg_getcred(mp, NULL); 11409 if (cr == NULL) 11410 cr = iocp->ioc_cr; 11411 11412 /* 11413 * Refhold the conn in case the request gets queued up in some lookup 11414 */ 11415 ASSERT(CONN_Q(q)); 11416 connp = Q_TO_CONN(q); 11417 CONN_INC_REF(connp); 11418 if (ipft->ipft_pfi && 11419 ((mp1->b_wptr - mp1->b_rptr) >= ipft->ipft_min_size || 11420 pullupmsg(mp1, ipft->ipft_min_size))) { 11421 error = (*ipft->ipft_pfi)(q, 11422 (ipft->ipft_flags & IPFT_F_SELF_REPLY) ? mp : mp1, cr); 11423 } 11424 if (ipft->ipft_flags & IPFT_F_SELF_REPLY) { 11425 /* 11426 * CONN_OPER_PENDING_DONE happens in the function called 11427 * through ipft_pfi above. 11428 */ 11429 return; 11430 } 11431 11432 CONN_OPER_PENDING_DONE(connp); 11433 if (ipft->ipft_flags & IPFT_F_NO_REPLY) { 11434 freemsg(mp); 11435 return; 11436 } 11437 iocp->ioc_error = error; 11438 11439 done: 11440 mp->b_datap->db_type = M_IOCACK; 11441 if (iocp->ioc_error) 11442 iocp->ioc_count = 0; 11443 qreply(q, mp); 11444 } 11445 11446 /* 11447 * Assign a unique id for the ipif. This is used by sctp_addr.c 11448 * Note: remove if sctp_addr.c is redone to not shadow ill/ipif data structures. 11449 */ 11450 static void 11451 ipif_assign_seqid(ipif_t *ipif) 11452 { 11453 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 11454 11455 ipif->ipif_seqid = atomic_add_64_nv(&ipst->ips_ipif_g_seqid, 1); 11456 } 11457 11458 /* 11459 * Clone the contents of `sipif' to `dipif'. Requires that both ipifs are 11460 * administratively down (i.e., no DAD), of the same type, and locked. Note 11461 * that the clone is complete -- including the seqid -- and the expectation is 11462 * that the caller will either free or overwrite `sipif' before it's unlocked. 
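 *
 * In sketch form, this is exactly the sequence ipif_transfer() below uses:
 *
 *	GRAB_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill);
 *	ipif_clone(sipif, dipif);
 *	... free or overwrite sipif ...
 *	RELEASE_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill);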
11463 */ 11464 static void 11465 ipif_clone(const ipif_t *sipif, ipif_t *dipif) 11466 { 11467 ASSERT(MUTEX_HELD(&sipif->ipif_ill->ill_lock)); 11468 ASSERT(MUTEX_HELD(&dipif->ipif_ill->ill_lock)); 11469 ASSERT(!(sipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE))); 11470 ASSERT(!(dipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE))); 11471 ASSERT(sipif->ipif_ire_type == dipif->ipif_ire_type); 11472 11473 dipif->ipif_flags = sipif->ipif_flags; 11474 dipif->ipif_metric = sipif->ipif_metric; 11475 dipif->ipif_zoneid = sipif->ipif_zoneid; 11476 dipif->ipif_v6subnet = sipif->ipif_v6subnet; 11477 dipif->ipif_v6lcl_addr = sipif->ipif_v6lcl_addr; 11478 dipif->ipif_v6net_mask = sipif->ipif_v6net_mask; 11479 dipif->ipif_v6brd_addr = sipif->ipif_v6brd_addr; 11480 dipif->ipif_v6pp_dst_addr = sipif->ipif_v6pp_dst_addr; 11481 11482 /* 11483 * As per the comment atop the function, we assume that these sipif 11484 * fields will be changed before sipif is unlocked. 11485 */ 11486 dipif->ipif_seqid = sipif->ipif_seqid; 11487 dipif->ipif_state_flags = sipif->ipif_state_flags; 11488 } 11489 11490 /* 11491 * Transfer the contents of `sipif' to `dipif', and then free (if `virgipif' 11492 * is NULL) or overwrite `sipif' with `virgipif', which must be a virgin 11493 * (unreferenced) ipif. Also, if `sipif' is used by the current xop, then 11494 * transfer the xop to `dipif'. Requires that all ipifs are administratively 11495 * down (i.e., no DAD), of the same type, and unlocked. 11496 */ 11497 static void 11498 ipif_transfer(ipif_t *sipif, ipif_t *dipif, ipif_t *virgipif) 11499 { 11500 ipsq_t *ipsq = sipif->ipif_ill->ill_phyint->phyint_ipsq; 11501 ipxop_t *ipx = ipsq->ipsq_xop; 11502 11503 ASSERT(sipif != dipif); 11504 ASSERT(sipif != virgipif); 11505 11506 /* 11507 * Grab all of the locks that protect the ipif in a defined order. 11508 */ 11509 GRAB_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill); 11510 11511 ipif_clone(sipif, dipif); 11512 if (virgipif != NULL) { 11513 ipif_clone(virgipif, sipif); 11514 mi_free(virgipif); 11515 } 11516 11517 RELEASE_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill); 11518 11519 /* 11520 * Transfer ownership of the current xop, if necessary. 11521 */ 11522 if (ipx->ipx_current_ipif == sipif) { 11523 ASSERT(ipx->ipx_pending_ipif == NULL); 11524 mutex_enter(&ipx->ipx_lock); 11525 ipx->ipx_current_ipif = dipif; 11526 mutex_exit(&ipx->ipx_lock); 11527 } 11528 11529 if (virgipif == NULL) 11530 mi_free(sipif); 11531 } 11532 11533 /* 11534 * checks if: 11535 * - <ill_name>:<ipif_id> is at most LIFNAMSIZ - 1 and 11536 * - logical interface is within the allowed range 11537 */ 11538 static int 11539 is_lifname_valid(ill_t *ill, unsigned int ipif_id) 11540 { 11541 if (snprintf(NULL, 0, "%s:%d", ill->ill_name, ipif_id) >= LIFNAMSIZ) 11542 return (ENAMETOOLONG); 11543 11544 if (ipif_id >= ill->ill_ipst->ips_ip_addrs_per_if) 11545 return (ERANGE); 11546 return (0); 11547 } 11548 11549 /* 11550 * Insert the ipif, so that the list of ipifs on the ill will be sorted 11551 * with respect to ipif_id. Note that an ipif with an ipif_id of -1 will 11552 * be inserted into the first space available in the list. The value of 11553 * ipif_id will then be set to the appropriate value for its position. 
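 *
 * For example, if the ill already has ipifs with ids 0, 1 and 3, an ipif
 * inserted with an ipif_id of -1 fills the first hole and is assigned id 2:
 *
 *	before:  0 -> 1 -> 3
 *	after:   0 -> 1 -> 2 -> 3	(the new ipif got id 2)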
11554 */
11555 static int
11556 ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock)
11557 {
11558 ill_t *ill;
11559 ipif_t *tipif;
11560 ipif_t **tipifp;
11561 int id, err;
11562 ip_stack_t *ipst;
11563
11564 ASSERT(ipif->ipif_ill->ill_net_type == IRE_LOOPBACK ||
11565 IAM_WRITER_IPIF(ipif));
11566
11567 ill = ipif->ipif_ill;
11568 ASSERT(ill != NULL);
11569 ipst = ill->ill_ipst;
11570
11571 /*
11572 * In the case of lo0:0 we already hold the ill_g_lock.
11573 * ill_lookup_on_name (acquires ill_g_lock) -> ipif_allocate ->
11574 * ipif_insert.
11575 */
11576 if (acquire_g_lock)
11577 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
11578 mutex_enter(&ill->ill_lock);
11579 id = ipif->ipif_id;
11580 tipifp = &(ill->ill_ipif);
11581 if (id == -1) { /* need to find a real id */
11582 id = 0;
11583 while ((tipif = *tipifp) != NULL) {
11584 ASSERT(tipif->ipif_id >= id);
11585 if (tipif->ipif_id != id)
11586 break; /* non-consecutive id */
11587 id++;
11588 tipifp = &(tipif->ipif_next);
11589 }
11590 if ((err = is_lifname_valid(ill, id)) != 0) {
11591 mutex_exit(&ill->ill_lock);
11592 if (acquire_g_lock)
11593 rw_exit(&ipst->ips_ill_g_lock);
11594 return (err);
11595 }
11596 ipif->ipif_id = id; /* assign new id */
11597 } else if ((err = is_lifname_valid(ill, id)) == 0) {
11598 /* we have a real id; insert ipif in the right place */
11599 while ((tipif = *tipifp) != NULL) {
11600 ASSERT(tipif->ipif_id != id);
11601 if (tipif->ipif_id > id)
11602 break; /* found correct location */
11603 tipifp = &(tipif->ipif_next);
11604 }
11605 } else {
11606 mutex_exit(&ill->ill_lock);
11607 if (acquire_g_lock)
11608 rw_exit(&ipst->ips_ill_g_lock);
11609 return (err);
11610 }
11611
11612 ASSERT(tipifp != &(ill->ill_ipif) || id == 0);
11613
11614 ipif->ipif_next = tipif;
11615 *tipifp = ipif;
11616 mutex_exit(&ill->ill_lock);
11617 if (acquire_g_lock)
11618 rw_exit(&ipst->ips_ill_g_lock);
11619
11620 return (0);
11621 }
11622
11623 static void
11624 ipif_remove(ipif_t *ipif)
11625 {
11626 ipif_t **ipifp;
11627 ill_t *ill = ipif->ipif_ill;
11628
11629 ASSERT(RW_WRITE_HELD(&ill->ill_ipst->ips_ill_g_lock));
11630
11631 mutex_enter(&ill->ill_lock);
11632 ipifp = &ill->ill_ipif;
11633 for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) {
11634 if (*ipifp == ipif) {
11635 *ipifp = ipif->ipif_next;
11636 break;
11637 }
11638 }
11639 mutex_exit(&ill->ill_lock);
11640 }
11641
11642 /*
11643 * Allocate and initialize a new interface control structure. (Always
11644 * called as writer.)
11645 * When ipif_allocate() is called from ip_ll_subnet_defaults, the ill
11646 * is not part of the global linked list of ills. ipif_seqid is unique
11647 * in the system and to preserve the uniqueness, it is assigned only
11648 * when ill becomes part of the global list. At that point ill will
11649 * have a name. If it doesn't get assigned here, it will get assigned
11650 * in ipif_set_values() as part of SIOCSLIFNAME processing.
11651 * Additionally, if we come here from ip_ll_subnet_defaults, we don't set
11652 * the interface flags or any other information from the DL_INFO_ACK for
11653 * DL_STYLE2 drivers (initialize == B_FALSE), since we won't have them at
11654 * this point. The flags etc. will be set in ip_ll_subnet_defaults when the
11655 * second DL_INFO_ACK comes in from the driver.
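 *
 * A minimal caller sketch (the surrounding context here is hypothetical;
 * the error values come from mi_alloc and ipif_insert/is_lifname_valid):
 *
 *	int err;
 *	ipif_t *ipif;
 *
 *	ipif = ipif_allocate(ill, -1, IRE_LOCAL, B_TRUE, B_TRUE, &err);
 *	if (ipif == NULL)
 *		return (err);	... ENOMEM, ENAMETOOLONG or ERANGE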
11656 */
11657 static ipif_t *
11658 ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize,
11659 boolean_t insert, int *errorp)
11660 {
11661 int err;
11662 ipif_t *ipif;
11663 ip_stack_t *ipst = ill->ill_ipst;
11664
11665 ip1dbg(("ipif_allocate(%s:%d ill %p)\n",
11666 ill->ill_name, id, (void *)ill));
11667 ASSERT(ire_type == IRE_LOOPBACK || IAM_WRITER_ILL(ill));
11668
11669 if (errorp != NULL)
11670 *errorp = 0;
11671
11672 if ((ipif = mi_alloc(sizeof (ipif_t), BPRI_MED)) == NULL) {
11673 if (errorp != NULL)
11674 *errorp = ENOMEM;
11675 return (NULL);
11676 }
11677 *ipif = ipif_zero; /* start clean */
11678
11679 ipif->ipif_ill = ill;
11680 ipif->ipif_id = id; /* could be -1 */
11681 /*
11682 * Inherit the zoneid from the ill; for the shared stack instance
11683 * this is always the global zone
11684 */
11685 ipif->ipif_zoneid = ill->ill_zoneid;
11686
11687 ipif->ipif_refcnt = 0;
11688
11689 if (insert) {
11690 if ((err = ipif_insert(ipif, ire_type != IRE_LOOPBACK)) != 0) {
11691 mi_free(ipif);
11692 if (errorp != NULL)
11693 *errorp = err;
11694 return (NULL);
11695 }
11696 /* -1 id should have been replaced by real id */
11697 id = ipif->ipif_id;
11698 ASSERT(id >= 0);
11699 }
11700
11701 if (ill->ill_name[0] != '\0')
11702 ipif_assign_seqid(ipif);
11703
11704 /*
11705 * If this is the zeroth ipif on the IPMP ill, create the illgrp
11706 * (which must not exist yet because the zeroth ipif is created once
11707 * per ill). However, do not link it to the ipmp_grp_t until
11708 * I_PLINK is called; see ip_sioctl_plink_ipmp() for details.
11709 */
11710 if (id == 0 && IS_IPMP(ill)) {
11711 if (ipmp_illgrp_create(ill) == NULL) {
11712 if (insert) {
11713 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
11714 ipif_remove(ipif);
11715 rw_exit(&ipst->ips_ill_g_lock);
11716 }
11717 mi_free(ipif);
11718 if (errorp != NULL)
11719 *errorp = ENOMEM;
11720 return (NULL);
11721 }
11722 }
11723
11724 /*
11725 * We grab ill_lock to protect the flag changes. The ipif is still
11726 * not up and can't be looked up until the ioctl completes and the
11727 * IPIF_CHANGING flag is cleared.
11728 */
11729 mutex_enter(&ill->ill_lock);
11730
11731 ipif->ipif_ire_type = ire_type;
11732
11733 if (ipif->ipif_isv6) {
11734 ill->ill_flags |= ILLF_IPV6;
11735 } else {
11736 ipaddr_t inaddr_any = INADDR_ANY;
11737
11738 ill->ill_flags |= ILLF_IPV4;
11739
11740 /* Keep the IN6_IS_ADDR_V4MAPPED assertions happy */
11741 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
11742 &ipif->ipif_v6lcl_addr);
11743 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
11744 &ipif->ipif_v6subnet);
11745 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
11746 &ipif->ipif_v6net_mask);
11747 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
11748 &ipif->ipif_v6brd_addr);
11749 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
11750 &ipif->ipif_v6pp_dst_addr);
11751 }
11752
11753 /*
11754 * Don't set the interface flags etc. now, will do it in
11755 * ip_ll_subnet_defaults.
11756 */
11757 if (!initialize)
11758 goto out;
11759
11760 /*
11761 * NOTE: The IPMP meta-interface is special-cased because it starts
11762 * with no underlying interfaces (and thus an unknown broadcast
11763 * address length), but all interfaces that can be placed into an IPMP
11764 * group are required to be broadcast-capable.
11765 */
11766 if (ill->ill_bcast_addr_length != 0 || IS_IPMP(ill)) {
11767 /*
11768 * Later detect lack of DLPI driver multicast capability by
11769 * catching DL_ENABMULTI_REQ errors in ip_rput_dlpi().
11770 */ 11771 ill->ill_flags |= ILLF_MULTICAST; 11772 if (!ipif->ipif_isv6) 11773 ipif->ipif_flags |= IPIF_BROADCAST; 11774 } else { 11775 if (ill->ill_net_type != IRE_LOOPBACK) { 11776 if (ipif->ipif_isv6) 11777 /* 11778 * Note: xresolv interfaces will eventually need 11779 * NOARP set here as well, but that will require 11780 * those external resolvers to have some 11781 * knowledge of that flag and act appropriately. 11782 * Not to be changed at present. 11783 */ 11784 ill->ill_flags |= ILLF_NONUD; 11785 else 11786 ill->ill_flags |= ILLF_NOARP; 11787 } 11788 if (ill->ill_phys_addr_length == 0) { 11789 if (IS_VNI(ill)) { 11790 ipif->ipif_flags |= IPIF_NOXMIT; 11791 } else { 11792 /* pt-pt supports multicast. */ 11793 ill->ill_flags |= ILLF_MULTICAST; 11794 if (ill->ill_net_type != IRE_LOOPBACK) 11795 ipif->ipif_flags |= IPIF_POINTOPOINT; 11796 } 11797 } 11798 } 11799 out: 11800 mutex_exit(&ill->ill_lock); 11801 return (ipif); 11802 } 11803 11804 /* 11805 * Remove the neighbor cache entries associated with this logical 11806 * interface. 11807 */ 11808 int 11809 ipif_arp_down(ipif_t *ipif) 11810 { 11811 ill_t *ill = ipif->ipif_ill; 11812 int err = 0; 11813 11814 ip1dbg(("ipif_arp_down(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 11815 ASSERT(IAM_WRITER_IPIF(ipif)); 11816 11817 DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_down", 11818 ill_t *, ill, ipif_t *, ipif); 11819 ipif_nce_down(ipif); 11820 11821 /* 11822 * If this is the last ipif that is going down and there are no 11823 * duplicate addresses we may yet attempt to re-probe, then we need to 11824 * clean up ARP completely. 11825 */ 11826 if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 && 11827 !ill->ill_logical_down && ill->ill_net_type == IRE_IF_RESOLVER) { 11828 /* 11829 * If this was the last ipif on an IPMP interface, purge any 11830 * static ARP entries associated with it. 11831 */ 11832 if (IS_IPMP(ill)) 11833 ipmp_illgrp_refresh_arpent(ill->ill_grp); 11834 11835 /* UNBIND, DETACH */ 11836 err = arp_ll_down(ill); 11837 } 11838 11839 return (err); 11840 } 11841 11842 /* 11843 * Get the resolver set up for a new IP address. (Always called as writer.) 11844 * Called both for IPv4 and IPv6 interfaces, though it only does some 11845 * basic DAD related initialization for IPv6. Honors ILLF_NOARP. 11846 * 11847 * The enumerated value res_act tunes the behavior: 11848 * * Res_act_initial: set up all the resolver structures for a new 11849 * IP address. 11850 * * Res_act_defend: tell ARP that it needs to send a single gratuitous 11851 * ARP message in defense of the address. 11852 * * Res_act_rebind: tell ARP to change the hardware address for an IP 11853 * address (and issue gratuitous ARPs). Used by ipmp_ill_bind_ipif(). 11854 * 11855 * Returns zero on success, or an errno upon failure. 11856 */ 11857 int 11858 ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) 11859 { 11860 ill_t *ill = ipif->ipif_ill; 11861 int err; 11862 boolean_t was_dup; 11863 11864 ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n", 11865 ill->ill_name, ipif->ipif_id, (uint_t)ipif->ipif_flags)); 11866 ASSERT(IAM_WRITER_IPIF(ipif)); 11867 11868 was_dup = B_FALSE; 11869 if (res_act == Res_act_initial) { 11870 ipif->ipif_addr_ready = 0; 11871 /* 11872 * We're bringing an interface up here. There's no way that we 11873 * should need to shut down ARP now. 
11874 */ 11875 mutex_enter(&ill->ill_lock); 11876 if (ipif->ipif_flags & IPIF_DUPLICATE) { 11877 ipif->ipif_flags &= ~IPIF_DUPLICATE; 11878 ill->ill_ipif_dup_count--; 11879 was_dup = B_TRUE; 11880 } 11881 mutex_exit(&ill->ill_lock); 11882 } 11883 if (ipif->ipif_recovery_id != 0) 11884 (void) untimeout(ipif->ipif_recovery_id); 11885 ipif->ipif_recovery_id = 0; 11886 if (ill->ill_net_type != IRE_IF_RESOLVER) { 11887 ipif->ipif_addr_ready = 1; 11888 return (0); 11889 } 11890 /* NDP will set the ipif_addr_ready flag when it's ready */ 11891 if (ill->ill_isv6) 11892 return (0); 11893 11894 err = ipif_arp_up(ipif, res_act, was_dup); 11895 return (err); 11896 } 11897 11898 /* 11899 * This routine restarts IPv4/IPv6 duplicate address detection (DAD) 11900 * when a link has just gone back up. 11901 */ 11902 static void 11903 ipif_nce_start_dad(ipif_t *ipif) 11904 { 11905 ncec_t *ncec; 11906 ill_t *ill = ipif->ipif_ill; 11907 boolean_t isv6 = ill->ill_isv6; 11908 11909 if (isv6) { 11910 ncec = ncec_lookup_illgrp_v6(ipif->ipif_ill, 11911 &ipif->ipif_v6lcl_addr); 11912 } else { 11913 ipaddr_t v4addr; 11914 11915 if (ill->ill_net_type != IRE_IF_RESOLVER || 11916 (ipif->ipif_flags & IPIF_UNNUMBERED) || 11917 ipif->ipif_lcl_addr == INADDR_ANY) { 11918 /* 11919 * If we can't contact ARP for some reason, 11920 * that's not really a problem. Just send 11921 * out the routing socket notification that 11922 * DAD completion would have done, and continue. 11923 */ 11924 ipif_mask_reply(ipif); 11925 ipif_up_notify(ipif); 11926 ipif->ipif_addr_ready = 1; 11927 return; 11928 } 11929 11930 IN6_V4MAPPED_TO_IPADDR(&ipif->ipif_v6lcl_addr, v4addr); 11931 ncec = ncec_lookup_illgrp_v4(ipif->ipif_ill, &v4addr); 11932 } 11933 11934 if (ncec == NULL) { 11935 ip1dbg(("couldn't find ncec for ipif %p leaving !ready\n", 11936 (void *)ipif)); 11937 return; 11938 } 11939 if (!nce_restart_dad(ncec)) { 11940 /* 11941 * If we can't restart DAD for some reason, that's not really a 11942 * problem. Just send out the routing socket notification that 11943 * DAD completion would have done, and continue. 11944 */ 11945 ipif_up_notify(ipif); 11946 ipif->ipif_addr_ready = 1; 11947 } 11948 ncec_refrele(ncec); 11949 } 11950 11951 /* 11952 * Restart duplicate address detection on all interfaces on the given ill. 11953 * 11954 * This is called when an interface transitions from down to up 11955 * (DL_NOTE_LINK_UP) or up to down (DL_NOTE_LINK_DOWN). 11956 * 11957 * Note that since the underlying physical link has transitioned, we must cause 11958 * at least one routing socket message to be sent here, either via DAD 11959 * completion or just by default on the first ipif. (If we don't do this, then 11960 * in.mpathd will see long delays when doing link-based failure recovery.) 11961 */ 11962 void 11963 ill_restart_dad(ill_t *ill, boolean_t went_up) 11964 { 11965 ipif_t *ipif; 11966 11967 if (ill == NULL) 11968 return; 11969 11970 /* 11971 * If layer two doesn't support duplicate address detection, then just 11972 * send the routing socket message now and be done with it. 11973 */ 11974 if (!ill->ill_isv6 && arp_no_defense) { 11975 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); 11976 return; 11977 } 11978 11979 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 11980 if (went_up) { 11981 11982 if (ipif->ipif_flags & IPIF_UP) { 11983 ipif_nce_start_dad(ipif); 11984 } else if (ipif->ipif_flags & IPIF_DUPLICATE) { 11985 /* 11986 * kick off the bring-up process now. 
11987 */ 11988 ipif_do_recovery(ipif); 11989 } else { 11990 /* 11991 * Unfortunately, the first ipif is "special" 11992 * and represents the underlying ill in the 11993 * routing socket messages. Thus, when this 11994 * one ipif is down, we must still notify so 11995 * that the user knows the IFF_RUNNING status 11996 * change. (If the first ipif is up, then 11997 * we'll handle eventual routing socket 11998 * notification via DAD completion.) 11999 */ 12000 if (ipif == ill->ill_ipif) { 12001 ip_rts_ifmsg(ill->ill_ipif, 12002 RTSQ_DEFAULT); 12003 } 12004 } 12005 } else { 12006 /* 12007 * After link down, we'll need to send a new routing 12008 * message when the link comes back, so clear 12009 * ipif_addr_ready. 12010 */ 12011 ipif->ipif_addr_ready = 0; 12012 } 12013 } 12014 12015 /* 12016 * If we've torn down links, then notify the user right away. 12017 */ 12018 if (!went_up) 12019 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); 12020 } 12021 12022 static void 12023 ipsq_delete(ipsq_t *ipsq) 12024 { 12025 ipxop_t *ipx = ipsq->ipsq_xop; 12026 12027 ipsq->ipsq_ipst = NULL; 12028 ASSERT(ipsq->ipsq_phyint == NULL); 12029 ASSERT(ipsq->ipsq_xop != NULL); 12030 ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipx->ipx_mphead == NULL); 12031 ASSERT(ipx->ipx_pending_mp == NULL); 12032 kmem_free(ipsq, sizeof (ipsq_t)); 12033 } 12034 12035 static int 12036 ill_up_ipifs_on_ill(ill_t *ill, queue_t *q, mblk_t *mp) 12037 { 12038 int err = 0; 12039 ipif_t *ipif; 12040 12041 if (ill == NULL) 12042 return (0); 12043 12044 ASSERT(IAM_WRITER_ILL(ill)); 12045 ill->ill_up_ipifs = B_TRUE; 12046 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 12047 if (ipif->ipif_was_up) { 12048 if (!(ipif->ipif_flags & IPIF_UP)) 12049 err = ipif_up(ipif, q, mp); 12050 ipif->ipif_was_up = B_FALSE; 12051 if (err != 0) { 12052 ASSERT(err == EINPROGRESS); 12053 return (err); 12054 } 12055 } 12056 } 12057 ill->ill_up_ipifs = B_FALSE; 12058 return (0); 12059 } 12060 12061 /* 12062 * This function is called to bring up all the ipifs that were up before 12063 * bringing the ill down via ill_down_ipifs(). 12064 */ 12065 int 12066 ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp) 12067 { 12068 int err; 12069 12070 ASSERT(IAM_WRITER_ILL(ill)); 12071 12072 if (ill->ill_replumbing) { 12073 ill->ill_replumbing = 0; 12074 /* 12075 * Send down REPLUMB_DONE notification followed by the 12076 * BIND_REQ on the arp stream. 12077 */ 12078 if (!ill->ill_isv6) 12079 arp_send_replumb_conf(ill); 12080 } 12081 err = ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv4, q, mp); 12082 if (err != 0) 12083 return (err); 12084 12085 return (ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv6, q, mp)); 12086 } 12087 12088 /* 12089 * Bring down any IPIF_UP ipifs on ill. If "logical" is B_TRUE, we bring 12090 * down the ipifs without sending DL_UNBIND_REQ to the driver. 12091 */ 12092 static void 12093 ill_down_ipifs(ill_t *ill, boolean_t logical) 12094 { 12095 ipif_t *ipif; 12096 12097 ASSERT(IAM_WRITER_ILL(ill)); 12098 12099 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 12100 /* 12101 * We go through the ipif_down logic even if the ipif 12102 * is already down, since routes can be added based 12103 * on down ipifs. Going through ipif_down once again 12104 * will delete any IREs created based on these routes. 
12105 */ 12106 if (ipif->ipif_flags & IPIF_UP) 12107 ipif->ipif_was_up = B_TRUE; 12108 12109 if (logical) { 12110 (void) ipif_logical_down(ipif, NULL, NULL); 12111 ipif_non_duplicate(ipif); 12112 (void) ipif_down_tail(ipif); 12113 } else { 12114 (void) ipif_down(ipif, NULL, NULL); 12115 } 12116 } 12117 } 12118 12119 /* 12120 * Redo source address selection. This makes IXAF_VERIFY_SOURCE take 12121 * a look again at valid source addresses. 12122 * This should be called each time after the set of source addresses has been 12123 * changed. 12124 */ 12125 void 12126 ip_update_source_selection(ip_stack_t *ipst) 12127 { 12128 /* We skip past SRC_GENERATION_VERIFY */ 12129 if (atomic_add_32_nv(&ipst->ips_src_generation, 1) == 12130 SRC_GENERATION_VERIFY) 12131 atomic_add_32(&ipst->ips_src_generation, 1); 12132 } 12133 12134 /* 12135 * Finish the group join started in ip_sioctl_groupname(). 12136 */ 12137 /* ARGSUSED */ 12138 static void 12139 ip_join_illgrps(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy) 12140 { 12141 ill_t *ill = q->q_ptr; 12142 phyint_t *phyi = ill->ill_phyint; 12143 ipmp_grp_t *grp = phyi->phyint_grp; 12144 ip_stack_t *ipst = ill->ill_ipst; 12145 12146 /* IS_UNDER_IPMP() won't work until ipmp_ill_join_illgrp() is called */ 12147 ASSERT(!IS_IPMP(ill) && grp != NULL); 12148 ASSERT(IAM_WRITER_IPSQ(ipsq)); 12149 12150 if (phyi->phyint_illv4 != NULL) { 12151 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 12152 VERIFY(grp->gr_pendv4-- > 0); 12153 rw_exit(&ipst->ips_ipmp_lock); 12154 ipmp_ill_join_illgrp(phyi->phyint_illv4, grp->gr_v4); 12155 } 12156 if (phyi->phyint_illv6 != NULL) { 12157 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 12158 VERIFY(grp->gr_pendv6-- > 0); 12159 rw_exit(&ipst->ips_ipmp_lock); 12160 ipmp_ill_join_illgrp(phyi->phyint_illv6, grp->gr_v6); 12161 } 12162 freemsg(mp); 12163 } 12164 12165 /* 12166 * Process an SIOCSLIFGROUPNAME request. 12167 */ 12168 /* ARGSUSED */ 12169 int 12170 ip_sioctl_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12171 ip_ioctl_cmd_t *ipip, void *ifreq) 12172 { 12173 struct lifreq *lifr = ifreq; 12174 ill_t *ill = ipif->ipif_ill; 12175 ip_stack_t *ipst = ill->ill_ipst; 12176 phyint_t *phyi = ill->ill_phyint; 12177 ipmp_grp_t *grp = phyi->phyint_grp; 12178 mblk_t *ipsq_mp; 12179 int err = 0; 12180 12181 /* 12182 * Note that phyint_grp can only change here, where we're exclusive. 12183 */ 12184 ASSERT(IAM_WRITER_ILL(ill)); 12185 12186 if (ipif->ipif_id != 0 || ill->ill_usesrc_grp_next != NULL || 12187 (phyi->phyint_flags & PHYI_VIRTUAL)) 12188 return (EINVAL); 12189 12190 lifr->lifr_groupname[LIFGRNAMSIZ - 1] = '\0'; 12191 12192 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 12193 12194 /* 12195 * If the name hasn't changed, there's nothing to do. 12196 */ 12197 if (grp != NULL && strcmp(grp->gr_name, lifr->lifr_groupname) == 0) 12198 goto unlock; 12199 12200 /* 12201 * Handle requests to rename an IPMP meta-interface. 12202 * 12203 * Note that creation of the IPMP meta-interface is handled in 12204 * userland through the standard plumbing sequence. As part of the 12205 * plumbing the IPMP meta-interface, its initial groupname is set to 12206 * the name of the interface (see ipif_set_values_tail()). 12207 */ 12208 if (IS_IPMP(ill)) { 12209 err = ipmp_grp_rename(grp, lifr->lifr_groupname); 12210 goto unlock; 12211 } 12212 12213 /* 12214 * Handle requests to add or remove an IP interface from a group. 
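 *
 * From userland this is driven by, e.g. (illustrative ifconfig usage):
 *
 *	ifconfig bge0 group ipmp0	add bge0 to IPMP group ipmp0
 *	ifconfig bge0 group ""		remove bge0 from its current group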
12215 */ 12216 if (lifr->lifr_groupname[0] != '\0') { /* add */ 12217 /* 12218 * Moves are handled by first removing the interface from 12219 * its existing group, and then adding it to another group. 12220 * So, fail if it's already in a group. 12221 */ 12222 if (IS_UNDER_IPMP(ill)) { 12223 err = EALREADY; 12224 goto unlock; 12225 } 12226 12227 grp = ipmp_grp_lookup(lifr->lifr_groupname, ipst); 12228 if (grp == NULL) { 12229 err = ENOENT; 12230 goto unlock; 12231 } 12232 12233 /* 12234 * Check if the phyint and its ills are suitable for 12235 * inclusion into the group. 12236 */ 12237 if ((err = ipmp_grp_vet_phyint(grp, phyi)) != 0) 12238 goto unlock; 12239 12240 /* 12241 * Checks pass; join the group, and enqueue the remaining 12242 * illgrp joins for when we've become part of the group xop 12243 * and are exclusive across its IPSQs. Since qwriter_ip() 12244 * requires an mblk_t to scribble on, and since `mp' will be 12245 * freed as part of completing the ioctl, allocate another. 12246 */ 12247 if ((ipsq_mp = allocb(0, BPRI_MED)) == NULL) { 12248 err = ENOMEM; 12249 goto unlock; 12250 } 12251 12252 /* 12253 * Before we drop ipmp_lock, bump gr_pend* to ensure that the 12254 * IPMP meta-interface ills needed by `phyi' cannot go away 12255 * before ip_join_illgrps() is called back. See the comments 12256 * in ip_sioctl_plink_ipmp() for more. 12257 */ 12258 if (phyi->phyint_illv4 != NULL) 12259 grp->gr_pendv4++; 12260 if (phyi->phyint_illv6 != NULL) 12261 grp->gr_pendv6++; 12262 12263 rw_exit(&ipst->ips_ipmp_lock); 12264 12265 ipmp_phyint_join_grp(phyi, grp); 12266 ill_refhold(ill); 12267 qwriter_ip(ill, ill->ill_rq, ipsq_mp, ip_join_illgrps, 12268 SWITCH_OP, B_FALSE); 12269 return (0); 12270 } else { 12271 /* 12272 * Request to remove the interface from a group. If the 12273 * interface is not in a group, this trivially succeeds. 12274 */ 12275 rw_exit(&ipst->ips_ipmp_lock); 12276 if (IS_UNDER_IPMP(ill)) 12277 ipmp_phyint_leave_grp(phyi); 12278 return (0); 12279 } 12280 unlock: 12281 rw_exit(&ipst->ips_ipmp_lock); 12282 return (err); 12283 } 12284 12285 /* 12286 * Process an SIOCGLIFBINDING request. 12287 */ 12288 /* ARGSUSED */ 12289 int 12290 ip_sioctl_get_binding(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12291 ip_ioctl_cmd_t *ipip, void *ifreq) 12292 { 12293 ill_t *ill; 12294 struct lifreq *lifr = ifreq; 12295 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 12296 12297 if (!IS_IPMP(ipif->ipif_ill)) 12298 return (EINVAL); 12299 12300 rw_enter(&ipst->ips_ipmp_lock, RW_READER); 12301 if ((ill = ipif->ipif_bound_ill) == NULL) 12302 lifr->lifr_binding[0] = '\0'; 12303 else 12304 (void) strlcpy(lifr->lifr_binding, ill->ill_name, LIFNAMSIZ); 12305 rw_exit(&ipst->ips_ipmp_lock); 12306 return (0); 12307 } 12308 12309 /* 12310 * Process an SIOCGLIFGROUPNAME request. 12311 */ 12312 /* ARGSUSED */ 12313 int 12314 ip_sioctl_get_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12315 ip_ioctl_cmd_t *ipip, void *ifreq) 12316 { 12317 ipmp_grp_t *grp; 12318 struct lifreq *lifr = ifreq; 12319 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 12320 12321 rw_enter(&ipst->ips_ipmp_lock, RW_READER); 12322 if ((grp = ipif->ipif_ill->ill_phyint->phyint_grp) == NULL) 12323 lifr->lifr_groupname[0] = '\0'; 12324 else 12325 (void) strlcpy(lifr->lifr_groupname, grp->gr_name, LIFGRNAMSIZ); 12326 rw_exit(&ipst->ips_ipmp_lock); 12327 return (0); 12328 } 12329 12330 /* 12331 * Process an SIOCGLIFGROUPINFO request. 
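 *
 * A hedged userland sketch; only gi_grname is set on input, and the rest
 * of the lifgroupinfo_t is filled in by this handler via ipmp_grp_info():
 *
 *	lifgroupinfo_t lifgr;
 *
 *	(void) strlcpy(lifgr.gi_grname, "ipmp0", LIFGRNAMSIZ);
 *	if (ioctl(s, SIOCGLIFGROUPINFO, &lifgr) == 0)
 *		... examine the returned group state ...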
12332 */
12333 /* ARGSUSED */
12334 int
12335 ip_sioctl_groupinfo(ipif_t *dummy_ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12336 ip_ioctl_cmd_t *ipip, void *dummy)
12337 {
12338 ipmp_grp_t *grp;
12339 lifgroupinfo_t *lifgr;
12340 ip_stack_t *ipst = CONNQ_TO_IPST(q);
12341
12342 /* ip_wput_nondata() verified mp->b_cont->b_cont */
12343 lifgr = (lifgroupinfo_t *)mp->b_cont->b_cont->b_rptr;
12344 lifgr->gi_grname[LIFGRNAMSIZ - 1] = '\0';
12345
12346 rw_enter(&ipst->ips_ipmp_lock, RW_READER);
12347 if ((grp = ipmp_grp_lookup(lifgr->gi_grname, ipst)) == NULL) {
12348 rw_exit(&ipst->ips_ipmp_lock);
12349 return (ENOENT);
12350 }
12351 ipmp_grp_info(grp, lifgr);
12352 rw_exit(&ipst->ips_ipmp_lock);
12353 return (0);
12354 }
12355
12356 static void
12357 ill_dl_down(ill_t *ill)
12358 {
12359 DTRACE_PROBE2(ill__downup, char *, "ill_dl_down", ill_t *, ill);
12360
12361 /*
12362 * The ill is down; unbind but stay attached since we're still
12363 * associated with a PPA. If we have negotiated DLPI capabilities
12364 * with the data link service provider (IDCS_OK) then reset them.
12365 * The interval between unbinding and rebinding is potentially
12366 * unbounded, hence we cannot assume things will be the same.
12367 * The DLPI capabilities will be probed again when the data link
12368 * is brought up.
12369 */
12370 mblk_t *mp = ill->ill_unbind_mp;
12371
12372 ip1dbg(("ill_dl_down(%s)\n", ill->ill_name));
12373
12374 if (!ill->ill_replumbing) {
12375 /* Free all ilms for this ill */
12376 update_conn_ill(ill, ill->ill_ipst);
12377 } else {
12378 ill_leave_multicast(ill);
12379 }
12380
12381 ill->ill_unbind_mp = NULL;
12382 if (mp != NULL) {
12383 ip1dbg(("ill_dl_down: %s (%u) for %s\n",
12384 dl_primstr(*(int *)mp->b_rptr), *(int *)mp->b_rptr,
12385 ill->ill_name));
12386 mutex_enter(&ill->ill_lock);
12387 ill->ill_state_flags |= ILL_DL_UNBIND_IN_PROGRESS;
12388 mutex_exit(&ill->ill_lock);
12389 /*
12390 * ip_rput does not pass up normal (M_PROTO) DLPI messages
12391 * after ILL_CONDEMNED is set. So in the unplumb case, we call
12392 * ill_capability_dld_disable right away. If this is not
12393 * an unplumb operation then the disable happens on receipt of
12394 * the capab ack via ip_rput_dlpi_writer ->
12395 * ill_capability_ack_thr. In both cases the order of
12396 * the operations seen by DLD is capability disable followed
12397 * by DL_UNBIND. Also the DLD capability disable needs a
12398 * cv_wait'able context.
12399 */ 12400 if (ill->ill_state_flags & ILL_CONDEMNED) 12401 ill_capability_dld_disable(ill); 12402 ill_capability_reset(ill, B_FALSE); 12403 ill_dlpi_send(ill, mp); 12404 } 12405 mutex_enter(&ill->ill_lock); 12406 ill->ill_dl_up = 0; 12407 ill_nic_event_dispatch(ill, 0, NE_DOWN, NULL, 0); 12408 mutex_exit(&ill->ill_lock); 12409 } 12410 12411 void 12412 ill_dlpi_dispatch(ill_t *ill, mblk_t *mp) 12413 { 12414 union DL_primitives *dlp; 12415 t_uscalar_t prim; 12416 boolean_t waitack = B_FALSE; 12417 12418 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); 12419 12420 dlp = (union DL_primitives *)mp->b_rptr; 12421 prim = dlp->dl_primitive; 12422 12423 ip1dbg(("ill_dlpi_dispatch: sending %s (%u) to %s\n", 12424 dl_primstr(prim), prim, ill->ill_name)); 12425 12426 switch (prim) { 12427 case DL_PHYS_ADDR_REQ: 12428 { 12429 dl_phys_addr_req_t *dlpap = (dl_phys_addr_req_t *)mp->b_rptr; 12430 ill->ill_phys_addr_pend = dlpap->dl_addr_type; 12431 break; 12432 } 12433 case DL_BIND_REQ: 12434 mutex_enter(&ill->ill_lock); 12435 ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS; 12436 mutex_exit(&ill->ill_lock); 12437 break; 12438 } 12439 12440 /* 12441 * Except for the ACKs for the M_PCPROTO messages, all other ACKs 12442 * are dropped by ip_rput() if ILL_CONDEMNED is set. Therefore 12443 * we only wait for the ACK of the DL_UNBIND_REQ. 12444 */ 12445 mutex_enter(&ill->ill_lock); 12446 if (!(ill->ill_state_flags & ILL_CONDEMNED) || 12447 (prim == DL_UNBIND_REQ)) { 12448 ill->ill_dlpi_pending = prim; 12449 waitack = B_TRUE; 12450 } 12451 12452 mutex_exit(&ill->ill_lock); 12453 DTRACE_PROBE3(ill__dlpi, char *, "ill_dlpi_dispatch", 12454 char *, dl_primstr(prim), ill_t *, ill); 12455 putnext(ill->ill_wq, mp); 12456 12457 /* 12458 * There is no ack for DL_NOTIFY_CONF messages 12459 */ 12460 if (waitack && prim == DL_NOTIFY_CONF) 12461 ill_dlpi_done(ill, prim); 12462 } 12463 12464 /* 12465 * Helper function for ill_dlpi_send(). 12466 */ 12467 /* ARGSUSED */ 12468 static void 12469 ill_dlpi_send_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) 12470 { 12471 ill_dlpi_send(q->q_ptr, mp); 12472 } 12473 12474 /* 12475 * Send a DLPI control message to the driver but make sure there 12476 * is only one outstanding message. Uses ill_dlpi_pending to tell 12477 * when it must queue. ip_rput_dlpi_writer calls ill_dlpi_done() 12478 * when an ACK or a NAK is received to process the next queued message. 12479 */ 12480 void 12481 ill_dlpi_send(ill_t *ill, mblk_t *mp) 12482 { 12483 mblk_t **mpp; 12484 12485 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); 12486 12487 /* 12488 * To ensure that any DLPI requests for current exclusive operation 12489 * are always completely sent before any DLPI messages for other 12490 * operations, require writer access before enqueuing. 12491 */ 12492 if (!IAM_WRITER_ILL(ill)) { 12493 ill_refhold(ill); 12494 /* qwriter_ip() does the ill_refrele() */ 12495 qwriter_ip(ill, ill->ill_wq, mp, ill_dlpi_send_writer, 12496 NEW_OP, B_TRUE); 12497 return; 12498 } 12499 12500 mutex_enter(&ill->ill_lock); 12501 if (ill->ill_dlpi_pending != DL_PRIM_INVAL) { 12502 /* Must queue message. 
Tail insertion */ 12503 mpp = &ill->ill_dlpi_deferred; 12504 while (*mpp != NULL) 12505 mpp = &((*mpp)->b_next); 12506 12507 ip1dbg(("ill_dlpi_send: deferring request for %s " 12508 "while %s pending\n", ill->ill_name, 12509 dl_primstr(ill->ill_dlpi_pending))); 12510 12511 *mpp = mp; 12512 mutex_exit(&ill->ill_lock); 12513 return; 12514 } 12515 mutex_exit(&ill->ill_lock); 12516 ill_dlpi_dispatch(ill, mp); 12517 } 12518 12519 void 12520 ill_capability_send(ill_t *ill, mblk_t *mp) 12521 { 12522 ill->ill_capab_pending_cnt++; 12523 ill_dlpi_send(ill, mp); 12524 } 12525 12526 void 12527 ill_capability_done(ill_t *ill) 12528 { 12529 ASSERT(ill->ill_capab_pending_cnt != 0); 12530 12531 ill_dlpi_done(ill, DL_CAPABILITY_REQ); 12532 12533 ill->ill_capab_pending_cnt--; 12534 if (ill->ill_capab_pending_cnt == 0 && 12535 ill->ill_dlpi_capab_state == IDCS_OK) 12536 ill_capability_reset_alloc(ill); 12537 } 12538 12539 /* 12540 * Send all deferred DLPI messages without waiting for their ACKs. 12541 */ 12542 void 12543 ill_dlpi_send_deferred(ill_t *ill) 12544 { 12545 mblk_t *mp, *nextmp; 12546 12547 /* 12548 * Clear ill_dlpi_pending so that the message is not queued in 12549 * ill_dlpi_send(). 12550 */ 12551 mutex_enter(&ill->ill_lock); 12552 ill->ill_dlpi_pending = DL_PRIM_INVAL; 12553 mp = ill->ill_dlpi_deferred; 12554 ill->ill_dlpi_deferred = NULL; 12555 mutex_exit(&ill->ill_lock); 12556 12557 for (; mp != NULL; mp = nextmp) { 12558 nextmp = mp->b_next; 12559 mp->b_next = NULL; 12560 ill_dlpi_send(ill, mp); 12561 } 12562 } 12563 12564 /* 12565 * Check if the DLPI primitive `prim' is pending; print a warning if not. 12566 */ 12567 boolean_t 12568 ill_dlpi_pending(ill_t *ill, t_uscalar_t prim) 12569 { 12570 t_uscalar_t pending; 12571 12572 mutex_enter(&ill->ill_lock); 12573 if (ill->ill_dlpi_pending == prim) { 12574 mutex_exit(&ill->ill_lock); 12575 return (B_TRUE); 12576 } 12577 12578 /* 12579 * During teardown, ill_dlpi_dispatch() will send DLPI requests 12580 * without waiting, so don't print any warnings in that case. 12581 */ 12582 if (ill->ill_state_flags & ILL_CONDEMNED) { 12583 mutex_exit(&ill->ill_lock); 12584 return (B_FALSE); 12585 } 12586 pending = ill->ill_dlpi_pending; 12587 mutex_exit(&ill->ill_lock); 12588 12589 if (pending == DL_PRIM_INVAL) { 12590 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE, 12591 "received unsolicited ack for %s on %s\n", 12592 dl_primstr(prim), ill->ill_name); 12593 } else { 12594 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE, 12595 "received unexpected ack for %s on %s (expecting %s)\n", 12596 dl_primstr(prim), ill->ill_name, dl_primstr(pending)); 12597 } 12598 return (B_FALSE); 12599 } 12600 12601 /* 12602 * Complete the current DLPI operation associated with `prim' on `ill' and 12603 * start the next queued DLPI operation (if any). If there are no queued DLPI 12604 * operations and the ill's current exclusive IPSQ operation has finished 12605 * (i.e., ipsq_current_finish() was called), then clear ipsq_current_ipif to 12606 * allow the next exclusive IPSQ operation to begin upon ipsq_exit(). See 12607 * the comments above ipsq_current_finish() for details. 
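 *
 * In sketch form, the pairing with ill_dlpi_send() is:
 *
 *	ill_dlpi_send(ill, bind_mp);		DL_BIND_REQ goes down
 *	...					driver answers with DL_BIND_ACK
 *	ill_dlpi_done(ill, DL_BIND_REQ);	done; the next deferred DLPI
 *						message, if any, is dispatched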
12608 */
12609 void
12610 ill_dlpi_done(ill_t *ill, t_uscalar_t prim)
12611 {
12612 mblk_t *mp;
12613 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
12614 ipxop_t *ipx = ipsq->ipsq_xop;
12615
12616 ASSERT(IAM_WRITER_IPSQ(ipsq));
12617 mutex_enter(&ill->ill_lock);
12618
12619 ASSERT(prim != DL_PRIM_INVAL);
12620 ASSERT(ill->ill_dlpi_pending == prim);
12621
12622 ip1dbg(("ill_dlpi_done: %s has completed %s (%u)\n", ill->ill_name,
12623 dl_primstr(ill->ill_dlpi_pending), ill->ill_dlpi_pending));
12624
12625 if ((mp = ill->ill_dlpi_deferred) == NULL) {
12626 ill->ill_dlpi_pending = DL_PRIM_INVAL;
12627 if (ipx->ipx_current_done) {
12628 mutex_enter(&ipx->ipx_lock);
12629 ipx->ipx_current_ipif = NULL;
12630 mutex_exit(&ipx->ipx_lock);
12631 }
12632 cv_signal(&ill->ill_cv);
12633 mutex_exit(&ill->ill_lock);
12634 return;
12635 }
12636
12637 ill->ill_dlpi_deferred = mp->b_next;
12638 mp->b_next = NULL;
12639 mutex_exit(&ill->ill_lock);
12640
12641 ill_dlpi_dispatch(ill, mp);
12642 }
12643
12644 /*
12645 * Queue a (multicast) DLPI control message to be sent to the driver by
12646 * later calling ill_dlpi_send_queued.
12647 * We queue them while holding a lock (ill_mcast_lock) to ensure that they
12648 * are sent in order, i.e., to prevent a DL_DISABMULTI_REQ and a
12649 * DL_ENABMULTI_REQ for the same group from racing.
12650 * We send DLPI control messages in order using ill_lock.
12651 * For IPMP we should be called on the cast_ill.
12652 */
12653 void
12654 ill_dlpi_queue(ill_t *ill, mblk_t *mp)
12655 {
12656 mblk_t **mpp;
12657
12658 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
12659
12660 mutex_enter(&ill->ill_lock);
12661 /* Must queue message. Tail insertion */
12662 mpp = &ill->ill_dlpi_deferred;
12663 while (*mpp != NULL)
12664 mpp = &((*mpp)->b_next);
12665
12666 *mpp = mp;
12667 mutex_exit(&ill->ill_lock);
12668 }
12669
12670 /*
12671 * Send the messages that were queued. Make sure there is only
12672 * one outstanding message. ip_rput_dlpi_writer calls ill_dlpi_done()
12673 * when an ACK or a NAK is received to process the next queued message.
12674 * For IPMP we are called on the upper ill, but we send what is queued
12675 * on the cast_ill.
12676 */
12677 void
12678 ill_dlpi_send_queued(ill_t *ill)
12679 {
12680 mblk_t *mp;
12681 union DL_primitives *dlp;
12682 t_uscalar_t prim;
12683 ill_t *release_ill = NULL;
12684
12685 if (IS_IPMP(ill)) {
12686 /* On the upper IPMP ill. */
12687 release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
12688 if (release_ill == NULL) {
12689 /* Avoid ever sending anything down to the ipmpstub */
12690 return;
12691 }
12692 ill = release_ill;
12693 }
12694 mutex_enter(&ill->ill_lock);
12695 while ((mp = ill->ill_dlpi_deferred) != NULL) {
12696 if (ill->ill_dlpi_pending != DL_PRIM_INVAL) {
12697 /* Can't send. Somebody else will send it */
12698 mutex_exit(&ill->ill_lock);
12699 goto done;
12700 }
12701 ill->ill_dlpi_deferred = mp->b_next;
12702 mp->b_next = NULL;
12703 if (!ill->ill_dl_up) {
12704 /*
12705 * Nobody there. All multicast addresses will be
12706 * re-joined when we get the DL_BIND_ACK bringing the
12707 * interface up.
12708 */
12709 freemsg(mp);
12710 continue;
12711 }
12712 dlp = (union DL_primitives *)mp->b_rptr;
12713 prim = dlp->dl_primitive;
12714
12715 if (!(ill->ill_state_flags & ILL_CONDEMNED) ||
12716 (prim == DL_UNBIND_REQ)) {
12717 ill->ill_dlpi_pending = prim;
12718 }
12719 mutex_exit(&ill->ill_lock);
12720
12721 DTRACE_PROBE3(ill__dlpi, char *, "ill_dlpi_send_queued",
12722 char *, dl_primstr(prim), ill_t *, ill);
12723 putnext(ill->ill_wq, mp);
12724 mutex_enter(&ill->ill_lock);
12725 }
12726 mutex_exit(&ill->ill_lock);
12727 done:
12728 if (release_ill != NULL)
12729 ill_refrele(release_ill);
12730 }
12731
12732 /*
12733 * Queue an IP (IGMP/MLD) message to be sent by IP from
12734 * ill_mcast_send_queued.
12735 * We queue them while holding a lock (ill_mcast_lock) to ensure that they
12736 * are sent in order, i.e., to prevent an IGMP leave and an IGMP join for
12737 * the same group from racing.
12738 * We send them in order using ill_lock.
12739 * For IPMP we are called on the upper ill, but we queue on the cast_ill.
12740 */
12741 void
12742 ill_mcast_queue(ill_t *ill, mblk_t *mp)
12743 {
12744 mblk_t **mpp;
12745 ill_t *release_ill = NULL;
12746
12747 ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));
12748
12749 if (IS_IPMP(ill)) {
12750 /* On the upper IPMP ill. */
12751 release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
12752 if (release_ill == NULL) {
12753 /* Discard instead of queuing for the ipmp interface */
12754 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
12755 ip_drop_output("ipIfStatsOutDiscards - no cast_ill",
12756 mp, ill);
12757 freemsg(mp);
12758 return;
12759 }
12760 ill = release_ill;
12761 }
12762
12763 mutex_enter(&ill->ill_lock);
12764 /* Must queue message. Tail insertion */
12765 mpp = &ill->ill_mcast_deferred;
12766 while (*mpp != NULL)
12767 mpp = &((*mpp)->b_next);
12768
12769 *mpp = mp;
12770 mutex_exit(&ill->ill_lock);
12771 if (release_ill != NULL)
12772 ill_refrele(release_ill);
12773 }
12774
12775 /*
12776 * Send the IP packets that were queued by ill_mcast_queue.
12777 * These are IGMP/MLD packets.
12778 *
12779 * For IPMP we are called on the upper ill, but we send what is queued
12780 * on the cast_ill.
12781 *
12782 * Request loopback of the report if we are acting as a multicast
12783 * router, so that the process-level routing daemon can hear it.
12784 * This will run multiple times for the same group if there are members
12785 * on the same group for multiple ipif's on the same ill. The
12786 * igmp_input/mld_input code will suppress the duplicates due to the
12787 * loopback; thus we always loop back the membership report.
12788 *
12789 * We also need to make sure that this does not get load balanced
12790 * by IPMP. We do this by passing an ill to ip_output_simple.
12791 */
12792 void
12793 ill_mcast_send_queued(ill_t *ill)
12794 {
12795 mblk_t *mp;
12796 ip_xmit_attr_t ixas;
12797 ill_t *release_ill = NULL;
12798
12799 if (IS_IPMP(ill)) {
12800 /* On the upper IPMP ill. */
12801 release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
12802 if (release_ill == NULL) {
12803 /*
12804 * We should have no messages on the ipmp interface,
12805 * and there is no point in trying to send them anyway.
12806 */
12807 return;
12808 }
12809 ill = release_ill;
12810 }
12811 bzero(&ixas, sizeof (ixas));
12812 ixas.ixa_zoneid = ALL_ZONES;
12813 ixas.ixa_cred = kcred;
12814 ixas.ixa_cpid = NOPID;
12815 ixas.ixa_tsl = NULL;
12816 /*
12817 * Here we set ixa_ifindex. For IPMP it will be the lower ill, which
12818 * makes ip_select_route pick the IRE_MULTICAST for the cast_ill.
12819 * That is necessary to handle IGMP/MLD snooping switches. 12820 */ 12821 ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex; 12822 ixas.ixa_ipst = ill->ill_ipst; 12823 12824 mutex_enter(&ill->ill_lock); 12825 while ((mp = ill->ill_mcast_deferred) != NULL) { 12826 ill->ill_mcast_deferred = mp->b_next; 12827 mp->b_next = NULL; 12828 if (!ill->ill_dl_up) { 12829 /* 12830 * Nobody there. Just drop the ip packets. 12831 * IGMP/MLD will resend later, if this is a replumb. 12832 */ 12833 freemsg(mp); 12834 continue; 12835 } 12836 mutex_enter(&ill->ill_phyint->phyint_lock); 12837 if (IS_UNDER_IPMP(ill) && !ipmp_ill_is_active(ill)) { 12838 /* 12839 * When the ill is getting deactivated, we only want to 12840 * send the DLPI messages, so drop IGMP/MLD packets. 12841 * DLPI messages are handled by ill_dlpi_send_queued() 12842 */ 12843 mutex_exit(&ill->ill_phyint->phyint_lock); 12844 freemsg(mp); 12845 continue; 12846 } 12847 mutex_exit(&ill->ill_phyint->phyint_lock); 12848 mutex_exit(&ill->ill_lock); 12849 12850 /* Check whether we are sending IPv4 or IPv6. */ 12851 if (ill->ill_isv6) { 12852 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 12853 12854 ixas.ixa_multicast_ttl = ip6h->ip6_hops; 12855 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6; 12856 } else { 12857 ipha_t *ipha = (ipha_t *)mp->b_rptr; 12858 12859 ixas.ixa_multicast_ttl = ipha->ipha_ttl; 12860 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; 12861 ixas.ixa_flags &= ~IXAF_SET_ULP_CKSUM; 12862 } 12863 12864 ixas.ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_SOURCE; 12865 (void) ip_output_simple(mp, &ixas); 12866 ixa_cleanup(&ixas); 12867 12868 mutex_enter(&ill->ill_lock); 12869 } 12870 mutex_exit(&ill->ill_lock); 12871 12872 done: 12873 if (release_ill != NULL) 12874 ill_refrele(release_ill); 12875 } 12876 12877 /* 12878 * Take down a specific interface, but don't lose any information about it. 12879 * (Always called as writer.) 12880 * This function goes through the down sequence even if the interface is 12881 * already down. There are 2 reasons. 12882 * a. Currently we permit interface routes that depend on down interfaces 12883 * to be added. This behaviour itself is questionable. However it appears 12884 * that both Solaris and 4.3 BSD have exhibited this behaviour for a long 12885 * time. We go thru the cleanup in order to remove these routes. 12886 * b. The bringup of the interface could fail in ill_dl_up i.e. we get 12887 * DL_ERROR_ACK in response to the DL_BIND request. The interface is 12888 * down, but we need to cleanup i.e. do ill_dl_down and 12889 * ip_rput_dlpi_writer (DL_ERROR_ACK) -> ipif_down. 12890 * 12891 * IP-MT notes: 12892 * 12893 * Model of reference to interfaces. 12894 * 12895 * The following members in ipif_t track references to the ipif. 12896 * int ipif_refcnt; Active reference count 12897 * 12898 * The following members in ill_t track references to the ill. 12899 * int ill_refcnt; active refcnt 12900 * uint_t ill_ire_cnt; Number of ires referencing ill 12901 * uint_t ill_ncec_cnt; Number of ncecs referencing ill 12902 * uint_t ill_nce_cnt; Number of nces referencing ill 12903 * uint_t ill_ilm_cnt; Number of ilms referencing ill 12904 * 12905 * Reference to an ipif or ill can be obtained in any of the following ways. 12906 * 12907 * Through the lookup functions ipif_lookup_* / ill_lookup_* functions 12908 * Pointers to ipif / ill from other data structures viz ire and conn. 12909 * Implicit reference to the ipif / ill by holding a reference to the ire. 12910 * 12911 * The ipif/ill lookup functions return a reference held ipif / ill. 
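 * In sketch form, the canonical lookup pattern is:
 *
 *	ipif = ipif_lookup_<...>(...);		returns a held ipif
 *	if (ipif != NULL) {
 *		... use the ipif ...
 *		ipif_refrele(ipif);		drop the reference
 *	}
 *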
12912 * ipif_refcnt and ill_refcnt track the reference counts respectively.
12913 * This is a purely dynamic reference count associated with threads holding
12914 * references to the ipif / ill. Pointers from other structures do not
12915 * count towards this reference count.
12916 *
12917 * ill_ire_cnt is the number of ire's associated with the
12918 * ill. This is incremented whenever a new ire is created referencing the
12919 * ill. This is done atomically inside ire_add_v[46] where the ire is
12920 * actually added to the ire hash table. The count is decremented in
12921 * ire_inactive where the ire is destroyed.
12922 *
12923 * ill_ncec_cnt is the number of ncec's referencing the ill thru ncec_ill.
12924 * This is incremented atomically in
12925 * ndp_add_v4()/ndp_add_v6() where the nce is actually added to the
12926 * table. Similarly it is decremented in ncec_inactive() where the ncec
12927 * is destroyed.
12928 *
12929 * ill_nce_cnt is the number of nce's referencing the ill thru nce_ill. This is
12930 * incremented atomically in nce_add() where the nce is actually added to the
12931 * ill_nce. Similarly it is decremented in nce_inactive() where the nce
12932 * is destroyed.
12933 *
12934 * ill_ilm_cnt is the ilm's reference to the ill. It is incremented in
12935 * ilm_add() and decremented before the ilm is freed in ilm_delete().
12936 *
12937 * Flow of ioctls involving interface down/up
12938 *
12939 * The following is the sequence of an attempt to set some critical flags on an
12940 * up interface.
12941 * ip_sioctl_flags
12942 * ipif_down
12943 * wait for ipif to be quiescent
12944 * ipif_down_tail
12945 * ip_sioctl_flags_tail
12946 *
12947 * All set ioctls that involve down/up sequence would have a skeleton similar
12948 * to the above. All the *tail functions are called after the refcounts have
12949 * dropped to the appropriate values.
12950 *
12951 * SIOC ioctls during the IPIF_CHANGING interval.
12952 *
12953 * Threads handling SIOC set ioctls serialize on the squeue, but this
12954 * is not done for SIOC get ioctls. Since a set ioctl can cause several
12955 * steps of internal changes to the state, some of which are visible in
12956 * ipif_flags (such as IFF_UP being cleared and later set), and we want
12957 * the set ioctl to be atomic with respect to the get ioctls, the SIOC get
12958 * code will wait and restart ioctls if IPIF_CHANGING is set. The mblk is then
12959 * enqueued in the ipsq and the operation is restarted by ipsq_exit() when
12960 * the current exclusive operation completes. The IPIF_CHANGING check
12961 * and enqueue is atomic using the ill_lock and ipsq_lock. The
12962 * lookup is done holding the ill_lock. Hence the ill/ipif state flags can't
12963 * change while the ill_lock is held. Before dropping the ill_lock we acquire
12964 * the ipsq_lock and call ipsq_enq. This ensures that ipsq_exit can't finish
12965 * until we release the ipsq_lock, even though the ill/ipif state flags
12966 * can change after we drop the ill_lock.
12967 */
12968 int
12969 ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
12970 {
12971 ill_t *ill = ipif->ipif_ill;
12972 conn_t *connp;
12973 boolean_t success;
12974 boolean_t ipif_was_up = B_FALSE;
12975 ip_stack_t *ipst = ill->ill_ipst;
12976
12977 ASSERT(IAM_WRITER_IPIF(ipif));
12978
12979 ip1dbg(("ipif_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));
12980
12981 DTRACE_PROBE3(ipif__downup, char *, "ipif_down",
12982 ill_t *, ill, ipif_t *, ipif);
12983
12984 if (ipif->ipif_flags & IPIF_UP) {
12985 mutex_enter(&ill->ill_lock);
12986 ipif->ipif_flags &= ~IPIF_UP;
12987 ASSERT(ill->ill_ipif_up_count > 0);
12988 --ill->ill_ipif_up_count;
12989 mutex_exit(&ill->ill_lock);
12990 ipif_was_up = B_TRUE;
12991 /* Update status in SCTP's list */
12992 sctp_update_ipif(ipif, SCTP_IPIF_DOWN);
12993 ill_nic_event_dispatch(ipif->ipif_ill,
12994 MAP_IPIF_ID(ipif->ipif_id), NE_LIF_DOWN, NULL, 0);
12995 }
12996
12997 /*
12998 * Blow away memberships we established in ipif_multicast_up().
12999 */
13000 ipif_multicast_down(ipif);
13001
13002 /*
13003 * Remove from the mapping for __sin6_src_id. We insert only
13004 * when the address is not INADDR_ANY. As IPv4 addresses are
13005 * stored as mapped addresses, we need to check for mapped
13006 * INADDR_ANY also.
13007 */
13008 if (ipif_was_up && !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) &&
13009 !IN6_IS_ADDR_V4MAPPED_ANY(&ipif->ipif_v6lcl_addr) &&
13010 !(ipif->ipif_flags & IPIF_NOLOCAL)) {
13011 int err;
13012
13013 err = ip_srcid_remove(&ipif->ipif_v6lcl_addr,
13014 ipif->ipif_zoneid, ipst);
13015 if (err != 0) {
13016 ip0dbg(("ipif_down: srcid_remove %d\n", err));
13017 }
13018 }
13019
13020 if (ipif_was_up) {
13021 /* only delete if we'd added ire's before */
13022 if (ipif->ipif_isv6)
13023 ipif_delete_ires_v6(ipif);
13024 else
13025 ipif_delete_ires_v4(ipif);
13026 }
13027
13028 if (ipif_was_up && ill->ill_ipif_up_count == 0) {
13029 /*
13030 * Since the interface is now down, it may have just become
13031 * inactive. Note that this needs to be done even for an
13032 * ipif_logical_down(), or ARP entries will not get correctly
13033 * restored when the interface comes back up.
13034 */
13035 if (IS_UNDER_IPMP(ill))
13036 ipmp_ill_refresh_active(ill);
13037 }
13038
13039 /*
13040 * Delete the neighbor-discovery or arp entries for this interface.
13041 * The ipif has to be quiesced, so we walk all the nce's and delete
13042 * those that point at the ipif->ipif_ill. At the same time, we also
13043 * update IPMP so that ipifs for data addresses are unbound. We don't
13044 * call ipif_arp_down to DL_UNBIND the arp stream itself here, but defer
13045 * that to ipif_down_tail().
13046 */
13047 ipif_nce_down(ipif);
13048
13049 /*
13050 * If this is the last ipif on the ill, we also need to remove
13051 * any IREs with ire_ill set. Otherwise ipif_is_quiescent() will
13052 * never succeed.
13053 */
13054 if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0)
13055 ire_walk_ill(0, 0, ill_downi, ill, ill);
13056
13057 /*
13058 * Walk all CONNs that can have a reference on an ire for this
13059 * ipif (we actually walk all that now have stale references).
13060 */
13061 ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst);
13062
13063 /*
13064 * If mp is NULL the caller will wait for the appropriate refcnt.
13065 * Eg.
ip_sioctl_removeif -> ipif_free -> ipif_down
13066 * and ill_delete -> ipif_free -> ipif_down
13067 */
13068 if (mp == NULL) {
13069 ASSERT(q == NULL);
13070 return (0);
13071 }
13072
13073 if (CONN_Q(q)) {
13074 connp = Q_TO_CONN(q);
13075 mutex_enter(&connp->conn_lock);
13076 } else {
13077 connp = NULL;
13078 }
13079 mutex_enter(&ill->ill_lock);
13080 /*
13081 * Are there any ire's pointing to this ipif that are still active?
13082 * If this is the last ipif going down, are there any ire's pointing
13083 * to this ill that are still active?
13084 */
13085 if (ipif_is_quiescent(ipif)) {
13086 mutex_exit(&ill->ill_lock);
13087 if (connp != NULL)
13088 mutex_exit(&connp->conn_lock);
13089 return (0);
13090 }
13091
13092 ip1dbg(("ipif_down: need to wait, adding pending mp %s ill %p",
13093 ill->ill_name, (void *)ill));
13094 /*
13095 * Enqueue the mp atomically in ipsq_pending_mp. When the refcount
13096 * drops down, the operation will be restarted by ipif_ill_refrele_tail
13097 * which in turn is called by the last refrele on the ipif/ill/ire.
13098 */
13099 success = ipsq_pending_mp_add(connp, ipif, q, mp, IPIF_DOWN);
13100 if (!success) {
13101 /* The conn is closing. So just return */
13102 ASSERT(connp != NULL);
13103 mutex_exit(&ill->ill_lock);
13104 mutex_exit(&connp->conn_lock);
13105 return (EINTR);
13106 }
13107
13108 mutex_exit(&ill->ill_lock);
13109 if (connp != NULL)
13110 mutex_exit(&connp->conn_lock);
13111 return (EINPROGRESS);
13112 }
13113
13114 int
13115 ipif_down_tail(ipif_t *ipif)
13116 {
13117 ill_t *ill = ipif->ipif_ill;
13118 int err = 0;
13119
13120 DTRACE_PROBE3(ipif__downup, char *, "ipif_down_tail",
13121 ill_t *, ill, ipif_t *, ipif);
13122
13123 /*
13124 * Skip any loopback interface (null wq).
13125 * If this is the last logical interface on the ill,
13126 * have ill_dl_down tell the driver we are gone (unbind).
13127 * Note that lun 0 can go through ipif_down even though
13128 * there are other logical units that are up.
13129 * This occurs e.g. when we change a "significant" IFF_ flag.
13130 */
13131 if (ill->ill_wq != NULL && !ill->ill_logical_down &&
13132 ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 &&
13133 ill->ill_dl_up) {
13134 ill_dl_down(ill);
13135 }
13136 if (!ipif->ipif_isv6)
13137 err = ipif_arp_down(ipif);
13138
13139 ill->ill_logical_down = 0;
13140
13141 ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
13142 ip_rts_newaddrmsg(RTM_DELETE, 0, ipif, RTSQ_DEFAULT);
13143 return (err);
13144 }
13145
13146 /*
13147 * Bring the interface logically down without bringing the physical interface
13148 * down, e.g. when the netmask is changed. This avoids long-lasting link
13149 * negotiations between an Ethernet interface and certain switches.
13150 */
13151 static int
13152 ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
13153 {
13154 DTRACE_PROBE3(ipif__downup, char *, "ipif_logical_down",
13155 ill_t *, ipif->ipif_ill, ipif_t *, ipif);
13156
13157 /*
13158 * The ill_logical_down flag is a transient flag. It is set here
13159 * and is cleared once the down has completed in ipif_down_tail.
13160 * This flag does not indicate whether the ill stream is in the
13161 * DL_BOUND state with the driver. Instead this flag is used by
13162 * ipif_down_tail to determine whether to DL_UNBIND the stream with
13163 * the driver. The state of the ill stream i.e. whether it is
13164 * DL_BOUND with the driver or not is indicated by the ill_dl_up flag.
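 *
 * In sketch form (see ipif_down_tail() above):
 *
 *	ill_logical_down == 1:	ipif_down_tail() skips ill_dl_down(), so
 *				the stream stays DL_BOUND (ill_dl_up == 1)
 *	ill_logical_down == 0:	a full down; once the last ipif is down,
 *				ill_dl_down() unbinds and clears ill_dl_up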
13165 */ 13166 ipif->ipif_ill->ill_logical_down = 1; 13167 return (ipif_down(ipif, q, mp)); 13168 } 13169 13170 /* 13171 * Initiate deallocation of an IPIF. Always called as writer. Called by 13172 * ill_delete or ip_sioctl_removeif. 13173 */ 13174 static void 13175 ipif_free(ipif_t *ipif) 13176 { 13177 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 13178 13179 ASSERT(IAM_WRITER_IPIF(ipif)); 13180 13181 if (ipif->ipif_recovery_id != 0) 13182 (void) untimeout(ipif->ipif_recovery_id); 13183 ipif->ipif_recovery_id = 0; 13184 13185 /* 13186 * Take down the interface. We can be called either from ill_delete 13187 * or from ip_sioctl_removeif. 13188 */ 13189 (void) ipif_down(ipif, NULL, NULL); 13190 13191 /* 13192 * Now that the interface is down, there's no chance it can still 13193 * become a duplicate. Cancel any timer that may have been set while 13194 * tearing down. 13195 */ 13196 if (ipif->ipif_recovery_id != 0) 13197 (void) untimeout(ipif->ipif_recovery_id); 13198 ipif->ipif_recovery_id = 0; 13199 13200 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 13201 /* Remove pointers to this ill in the multicast routing tables */ 13202 reset_mrt_vif_ipif(ipif); 13203 /* If necessary, clear the cached source ipif rotor. */ 13204 if (ipif->ipif_ill->ill_src_ipif == ipif) 13205 ipif->ipif_ill->ill_src_ipif = NULL; 13206 rw_exit(&ipst->ips_ill_g_lock); 13207 } 13208 13209 static void 13210 ipif_free_tail(ipif_t *ipif) 13211 { 13212 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 13213 13214 /* 13215 * Need to hold both ill_g_lock and ill_lock while 13216 * inserting or removing an ipif from the linked list 13217 * of ipifs hanging off the ill. 13218 */ 13219 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 13220 13221 #ifdef DEBUG 13222 ipif_trace_cleanup(ipif); 13223 #endif 13224 13225 /* Ask SCTP to take it out of its list */ 13226 sctp_update_ipif(ipif, SCTP_IPIF_REMOVE); 13227 ip_rts_newaddrmsg(RTM_FREEADDR, 0, ipif, RTSQ_DEFAULT); 13228 13229 /* Get it out of the ILL interface list. */ 13230 ipif_remove(ipif); 13231 rw_exit(&ipst->ips_ill_g_lock); 13232 13233 ASSERT(!(ipif->ipif_flags & (IPIF_UP | IPIF_DUPLICATE))); 13234 ASSERT(ipif->ipif_recovery_id == 0); 13235 ASSERT(ipif->ipif_ire_local == NULL); 13236 ASSERT(ipif->ipif_ire_if == NULL); 13237 13238 /* Free the memory. */ 13239 mi_free(ipif); 13240 } 13241 13242 /* 13243 * Sets `buf' to an ipif name of the form "ill_name:id", or "ill_name" if "id" 13244 * is zero. 13245 */ 13246 void 13247 ipif_get_name(const ipif_t *ipif, char *buf, int len) 13248 { 13249 char lbuf[LIFNAMSIZ]; 13250 char *name; 13251 size_t name_len; 13252 13253 buf[0] = '\0'; 13254 name = ipif->ipif_ill->ill_name; 13255 name_len = ipif->ipif_ill->ill_name_length; 13256 if (ipif->ipif_id != 0) { 13257 (void) sprintf(lbuf, "%s%c%d", name, IPIF_SEPARATOR_CHAR, 13258 ipif->ipif_id); 13259 name = lbuf; 13260 name_len = mi_strlen(name) + 1; 13261 } 13262 len -= 1; 13263 buf[len] = '\0'; 13264 len = MIN(len, name_len); 13265 bcopy(name, buf, len); 13266 } 13267 13268 /* 13269 * Sets `buf' to an ill name. 13270 */ 13271 void 13272 ill_get_name(const ill_t *ill, char *buf, int len) 13273 { 13274 char *name; 13275 size_t name_len; 13276 13277 name = ill->ill_name; 13278 name_len = ill->ill_name_length; 13279 len -= 1; 13280 buf[len] = '\0'; 13281 len = MIN(len, name_len); 13282 bcopy(name, buf, len); 13283 } 13284 13285 /* 13286 * Find an IPIF based on the name passed in. Names can be of the form <phys> 13287 * (e.g., le0) or <phys>:<#> (e.g., le0:1).
When there is no colon, the 13288 * implied unit id is zero. <phys> must correspond to the name of an ILL. 13289 * (May be called as writer.) 13290 */ 13291 static ipif_t * 13292 ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, 13293 boolean_t *exists, boolean_t isv6, zoneid_t zoneid, ip_stack_t *ipst) 13294 { 13295 char *cp; 13296 char *endp; 13297 long id; 13298 ill_t *ill; 13299 ipif_t *ipif; 13300 uint_t ire_type; 13301 boolean_t did_alloc = B_FALSE; 13302 13303 /* 13304 * If the caller wants us to create the ipif, make sure we have a 13305 * valid zoneid. 13306 */ 13307 ASSERT(!do_alloc || zoneid != ALL_ZONES); 13308 13309 if (namelen == 0) { 13310 return (NULL); 13311 } 13312 13313 *exists = B_FALSE; 13314 /* Look for a colon in the name. */ 13315 endp = &name[namelen]; 13316 for (cp = endp; --cp > name; ) { 13317 if (*cp == IPIF_SEPARATOR_CHAR) 13318 break; 13319 } 13320 13321 if (*cp == IPIF_SEPARATOR_CHAR) { 13322 /* 13323 * Reject any non-decimal aliases for logical 13324 * interfaces. Aliases with leading zeroes 13325 * are also rejected as they introduce ambiguity 13326 * in the naming of the interfaces. 13327 * In order to conform to existing semantics, 13328 * and to not break any programs/scripts relying 13329 * on that behaviour, if<0>:0 is considered to be 13330 * a valid interface. 13331 * 13332 * If the alias has two or more digits and the first 13333 * is zero, fail. 13334 */ 13335 if (&cp[2] < endp && cp[1] == '0') { 13336 return (NULL); 13337 } 13338 } 13339 13340 if (cp <= name) { 13341 cp = endp; 13342 } else { 13343 *cp = '\0'; 13344 } 13345 13346 /* 13347 * Look up the ILL, based on the portion of the name 13348 * before the colon. ill_lookup_on_name returns a held ill. 13349 * Temporary to check whether ill exists already. If so, 13350 * ill_lookup_on_name will clear it. 13351 */ 13352 ill = ill_lookup_on_name(name, do_alloc, isv6, 13353 &did_alloc, ipst); 13354 if (cp != endp) 13355 *cp = IPIF_SEPARATOR_CHAR; 13356 if (ill == NULL) 13357 return (NULL); 13358 13359 /* Establish the unit number in the name. */ 13360 id = 0; 13361 if (cp < endp && *endp == '\0') { 13362 /* If there was a colon, the unit number follows. */ 13363 cp++; 13364 if (ddi_strtol(cp, NULL, 0, &id) != 0) { 13365 ill_refrele(ill); 13366 return (NULL); 13367 } 13368 } 13369 13370 mutex_enter(&ill->ill_lock); 13371 /* Now see if there is an IPIF with this unit number. */ 13372 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 13373 if (ipif->ipif_id == id) { 13374 if (zoneid != ALL_ZONES && 13375 zoneid != ipif->ipif_zoneid && 13376 ipif->ipif_zoneid != ALL_ZONES) { 13377 mutex_exit(&ill->ill_lock); 13378 ill_refrele(ill); 13379 return (NULL); 13380 } 13381 if (IPIF_CAN_LOOKUP(ipif)) { 13382 ipif_refhold_locked(ipif); 13383 mutex_exit(&ill->ill_lock); 13384 if (!did_alloc) 13385 *exists = B_TRUE; 13386 /* 13387 * Drop locks before calling ill_refrele 13388 * since it can potentially call into 13389 * ipif_ill_refrele_tail which can end up 13390 * in trying to acquire any lock. 13391 */ 13392 ill_refrele(ill); 13393 return (ipif); 13394 } 13395 } 13396 } 13397 13398 if (!do_alloc) { 13399 mutex_exit(&ill->ill_lock); 13400 ill_refrele(ill); 13401 return (NULL); 13402 } 13403 13404 /* 13405 * If none found, atomically allocate and return a new one. 13406 * Historically, we used IRE_LOOPBACK only for lun 0, and IRE_LOCAL 13407 * to support "receive only" use of lo0:1 etc. as is still done 13408 * below as an initial guess.
13409 * However, this is now likely to be overridden later in ipif_up_done() 13410 * when we know for sure what address has been configured on the 13411 * interface, since we might have more than one loopback interface 13412 * with a loopback address, e.g. in the case of zones, and all the 13413 * interfaces with loopback addresses need to be marked IRE_LOOPBACK. 13414 */ 13415 if (ill->ill_net_type == IRE_LOOPBACK && id == 0) 13416 ire_type = IRE_LOOPBACK; 13417 else 13418 ire_type = IRE_LOCAL; 13419 ipif = ipif_allocate(ill, id, ire_type, B_TRUE, B_TRUE, NULL); 13420 if (ipif != NULL) 13421 ipif_refhold_locked(ipif); 13422 mutex_exit(&ill->ill_lock); 13423 ill_refrele(ill); 13424 return (ipif); 13425 } 13426 13427 /* 13428 * This routine is called whenever a new address comes up on an ipif. If 13429 * we are configured to respond to address mask requests, then we are supposed 13430 * to broadcast an address mask reply at this time. This routine is also 13431 * called if we are already up, but a netmask change is made. This is legal 13432 * but might not make the system manager very popular. (May be called 13433 * as writer.) 13434 */ 13435 void 13436 ipif_mask_reply(ipif_t *ipif) 13437 { 13438 icmph_t *icmph; 13439 ipha_t *ipha; 13440 mblk_t *mp; 13441 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 13442 ip_xmit_attr_t ixas; 13443 13444 #define REPLY_LEN (sizeof (icmp_ipha) + sizeof (icmph_t) + IP_ADDR_LEN) 13445 13446 if (!ipst->ips_ip_respond_to_address_mask_broadcast) 13447 return; 13448 13449 /* ICMP mask reply is IPv4 only */ 13450 ASSERT(!ipif->ipif_isv6); 13451 /* ICMP mask reply is not for a loopback interface */ 13452 ASSERT(ipif->ipif_ill->ill_wq != NULL); 13453 13454 if (ipif->ipif_lcl_addr == INADDR_ANY) 13455 return; 13456 13457 mp = allocb(REPLY_LEN, BPRI_HI); 13458 if (mp == NULL) 13459 return; 13460 mp->b_wptr = mp->b_rptr + REPLY_LEN; 13461 13462 ipha = (ipha_t *)mp->b_rptr; 13463 bzero(ipha, REPLY_LEN); 13464 *ipha = icmp_ipha; 13465 ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl; 13466 ipha->ipha_src = ipif->ipif_lcl_addr; 13467 ipha->ipha_dst = ipif->ipif_brd_addr; 13468 ipha->ipha_length = htons(REPLY_LEN); 13469 ipha->ipha_ident = 0; 13470 13471 icmph = (icmph_t *)&ipha[1]; 13472 icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY; 13473 bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN); 13474 icmph->icmph_checksum = IP_CSUM(mp, sizeof (ipha_t), 0); 13475 13476 bzero(&ixas, sizeof (ixas)); 13477 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; 13478 ixas.ixa_flags |= IXAF_SET_SOURCE; 13479 ixas.ixa_zoneid = ALL_ZONES; 13480 ixas.ixa_ifindex = 0; 13481 ixas.ixa_ipst = ipst; 13482 ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; 13483 (void) ip_output_simple(mp, &ixas); 13484 ixa_cleanup(&ixas); 13485 #undef REPLY_LEN 13486 } 13487 13488 /* 13489 * Join the ipif-specific multicast groups. 13490 * Must be called after a mapping has been set up in the resolver. (Always 13491 * called as writer.)
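 * For IPv4 this joins the all-hosts group (224.0.0.1); for IPv6 it joins * the all-hosts group (ff02::1) and the solicited-node multicast group * derived from the low-order bits of the local address.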
13492 */ 13493 void 13494 ipif_multicast_up(ipif_t *ipif) 13495 { 13496 int err; 13497 ill_t *ill; 13498 ilm_t *ilm; 13499 13500 ASSERT(IAM_WRITER_IPIF(ipif)); 13501 13502 ill = ipif->ipif_ill; 13503 13504 ip1dbg(("ipif_multicast_up\n")); 13505 if (!(ill->ill_flags & ILLF_MULTICAST) || 13506 ipif->ipif_allhosts_ilm != NULL) 13507 return; 13508 13509 if (ipif->ipif_isv6) { 13510 in6_addr_t v6allmc = ipv6_all_hosts_mcast; 13511 in6_addr_t v6solmc = ipv6_solicited_node_mcast; 13512 13513 v6solmc.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3]; 13514 13515 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) 13516 return; 13517 13518 ip1dbg(("ipif_multicast_up - addmulti\n")); 13519 13520 /* 13521 * Join the all hosts multicast address. We skip this for 13522 * underlying IPMP interfaces since they should be invisible. 13523 */ 13524 if (!IS_UNDER_IPMP(ill)) { 13525 ilm = ip_addmulti(&v6allmc, ill, ipif->ipif_zoneid, 13526 &err); 13527 if (ilm == NULL) { 13528 ASSERT(err != 0); 13529 ip0dbg(("ipif_multicast_up: " 13530 "all_hosts_mcast failed %d\n", err)); 13531 return; 13532 } 13533 ipif->ipif_allhosts_ilm = ilm; 13534 } 13535 13536 /* 13537 * Enable multicast for the solicited node multicast address. 13538 * If IPMP we need to put the membership on the upper ill. 13539 */ 13540 if (!(ipif->ipif_flags & IPIF_NOLOCAL)) { 13541 ill_t *mcast_ill = NULL; 13542 boolean_t need_refrele; 13543 13544 if (IS_UNDER_IPMP(ill) && 13545 (mcast_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) { 13546 need_refrele = B_TRUE; 13547 } else { 13548 mcast_ill = ill; 13549 need_refrele = B_FALSE; 13550 } 13551 13552 ilm = ip_addmulti(&v6solmc, mcast_ill, 13553 ipif->ipif_zoneid, &err); 13554 if (need_refrele) 13555 ill_refrele(mcast_ill); 13556 13557 if (ilm == NULL) { 13558 ASSERT(err != 0); 13559 ip0dbg(("ipif_multicast_up: solicited MC" 13560 " failed %d\n", err)); 13561 if ((ilm = ipif->ipif_allhosts_ilm) != NULL) { 13562 ipif->ipif_allhosts_ilm = NULL; 13563 (void) ip_delmulti(ilm); 13564 } 13565 return; 13566 } 13567 ipif->ipif_solmulti_ilm = ilm; 13568 } 13569 } else { 13570 in6_addr_t v6group; 13571 13572 if (ipif->ipif_lcl_addr == INADDR_ANY || IS_UNDER_IPMP(ill)) 13573 return; 13574 13575 /* Join the all hosts multicast address */ 13576 ip1dbg(("ipif_multicast_up - addmulti\n")); 13577 IN6_IPADDR_TO_V4MAPPED(htonl(INADDR_ALLHOSTS_GROUP), &v6group); 13578 13579 ilm = ip_addmulti(&v6group, ill, ipif->ipif_zoneid, &err); 13580 if (ilm == NULL) { 13581 ASSERT(err != 0); 13582 ip0dbg(("ipif_multicast_up: failed %d\n", err)); 13583 return; 13584 } 13585 ipif->ipif_allhosts_ilm = ilm; 13586 } 13587 } 13588 13589 /* 13590 * Blow away any multicast groups that we joined in ipif_multicast_up(). 13591 * (ilms from explicit memberships are handled in conn_update_ill.) 13592 */ 13593 void 13594 ipif_multicast_down(ipif_t *ipif) 13595 { 13596 ASSERT(IAM_WRITER_IPIF(ipif)); 13597 13598 ip1dbg(("ipif_multicast_down\n")); 13599 13600 if (ipif->ipif_allhosts_ilm != NULL) { 13601 (void) ip_delmulti(ipif->ipif_allhosts_ilm); 13602 ipif->ipif_allhosts_ilm = NULL; 13603 } 13604 if (ipif->ipif_solmulti_ilm != NULL) { 13605 (void) ip_delmulti(ipif->ipif_solmulti_ilm); 13606 ipif->ipif_solmulti_ilm = NULL; 13607 } 13608 } 13609 13610 /* 13611 * Used when an interface comes up to recreate any extra routes on this 13612 * interface. 
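 * The routes to recreate are stashed on ill_saved_ire_mp as a chain of * mblks, one ifrt_t per mblk; each entry carries the address, mask, * gateway, type, zoneid and flags needed to re-create the ire.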
13613 */ 13614 int 13615 ill_recover_saved_ire(ill_t *ill) 13616 { 13617 mblk_t *mp; 13618 ip_stack_t *ipst = ill->ill_ipst; 13619 13620 ip1dbg(("ill_recover_saved_ire(%s)", ill->ill_name)); 13621 13622 mutex_enter(&ill->ill_saved_ire_lock); 13623 for (mp = ill->ill_saved_ire_mp; mp != NULL; mp = mp->b_cont) { 13624 ire_t *ire, *nire; 13625 ifrt_t *ifrt; 13626 13627 ifrt = (ifrt_t *)mp->b_rptr; 13628 /* 13629 * Create a copy of the IRE with the saved address and netmask. 13630 */ 13631 if (ill->ill_isv6) { 13632 ire = ire_create_v6( 13633 &ifrt->ifrt_v6addr, 13634 &ifrt->ifrt_v6mask, 13635 &ifrt->ifrt_v6gateway_addr, 13636 ifrt->ifrt_type, 13637 ill, 13638 ifrt->ifrt_zoneid, 13639 ifrt->ifrt_flags, 13640 NULL, 13641 ipst); 13642 } else { 13643 ire = ire_create( 13644 (uint8_t *)&ifrt->ifrt_addr, 13645 (uint8_t *)&ifrt->ifrt_mask, 13646 (uint8_t *)&ifrt->ifrt_gateway_addr, 13647 ifrt->ifrt_type, 13648 ill, 13649 ifrt->ifrt_zoneid, 13650 ifrt->ifrt_flags, 13651 NULL, 13652 ipst); 13653 } 13654 if (ire == NULL) { 13655 mutex_exit(&ill->ill_saved_ire_lock); 13656 return (ENOMEM); 13657 } 13658 13659 if (ifrt->ifrt_flags & RTF_SETSRC) { 13660 if (ill->ill_isv6) { 13661 ire->ire_setsrc_addr_v6 = 13662 ifrt->ifrt_v6setsrc_addr; 13663 } else { 13664 ire->ire_setsrc_addr = ifrt->ifrt_setsrc_addr; 13665 } 13666 } 13667 13668 /* 13669 * Some software (for example, GateD and Sun Cluster) attempts 13670 * to create (what amount to) IRE_PREFIX routes with the 13671 * loopback address as the gateway. This is primarily done to 13672 * set up prefixes with the RTF_REJECT flag set (for example, 13673 * when generating aggregate routes.) 13674 * 13675 * If the IRE type (as defined by ill->ill_net_type) is 13676 * IRE_LOOPBACK, then we map the request into a 13677 * IRE_IF_NORESOLVER. 13678 */ 13679 if (ill->ill_net_type == IRE_LOOPBACK) 13680 ire->ire_type = IRE_IF_NORESOLVER; 13681 13682 /* 13683 * The ire is held by ire_add; it will be refrele'd 13684 * towards the end of ipif_up_done. 13685 */ 13686 nire = ire_add(ire); 13687 /* 13688 * Check if it was a duplicate entry. This handles 13689 * the case of two racing route adds for the same route. 13690 */ 13691 if (nire == NULL) { 13692 ip1dbg(("ill_recover_saved_ire: FAILED\n")); 13693 } else if (nire != ire) { 13694 ip1dbg(("ill_recover_saved_ire: duplicate ire %p\n", 13695 (void *)nire)); 13696 ire_delete(nire); 13697 } else { 13698 ip1dbg(("ill_recover_saved_ire: added ire %p\n", 13699 (void *)nire)); 13700 } 13701 if (nire != NULL) 13702 ire_refrele(nire); 13703 } 13704 mutex_exit(&ill->ill_saved_ire_lock); 13705 return (0); 13706 } 13707 13708 /* 13709 * Used to set the netmask and broadcast address to default values when the 13710 * interface is brought up. (Always called as writer.) 13711 */ 13712 static void 13713 ipif_set_default(ipif_t *ipif) 13714 { 13715 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 13716 13717 if (!ipif->ipif_isv6) { 13718 /* 13719 * Interface holds an IPv4 address. Default 13720 * mask is the natural netmask.
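 * (i.e., the classful mask implied by the address, as computed by * ip_net_mask() below).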
13721 */ 13722 if (!ipif->ipif_net_mask) { 13723 ipaddr_t v4mask; 13724 13725 v4mask = ip_net_mask(ipif->ipif_lcl_addr); 13726 V4MASK_TO_V6(v4mask, ipif->ipif_v6net_mask); 13727 } 13728 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 13729 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 13730 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 13731 } else { 13732 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 13733 ipif->ipif_v6net_mask, ipif->ipif_v6subnet); 13734 } 13735 /* 13736 * NOTE: SunOS 4.X does this even if the broadcast address 13737 * has already been set, thus we do the same here. 13738 */ 13739 if (ipif->ipif_flags & IPIF_BROADCAST) { 13740 ipaddr_t v4addr; 13741 13742 v4addr = ipif->ipif_subnet | ~ipif->ipif_net_mask; 13743 IN6_IPADDR_TO_V4MAPPED(v4addr, &ipif->ipif_v6brd_addr); 13744 } 13745 } else { 13746 /* 13747 * Interface holds an IPv6-only address. Default 13748 * mask is all-ones. 13749 */ 13750 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask)) 13751 ipif->ipif_v6net_mask = ipv6_all_ones; 13752 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 13753 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 13754 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 13755 } else { 13756 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 13757 ipif->ipif_v6net_mask, ipif->ipif_v6subnet); 13758 } 13759 } 13760 } 13761 13762 /* 13763 * Return 0 if this address can be used as a local address without causing 13764 * duplicate address problems. Otherwise, return EADDRNOTAVAIL if the address 13765 * is already up on a different ill, and EADDRINUSE if it's up on the same ill. 13766 * Note that the same IPv6 link-local address is allowed as long as the ills 13767 * are not on the same link. 13768 */ 13769 int 13770 ip_addr_availability_check(ipif_t *new_ipif) 13771 { 13772 in6_addr_t our_v6addr; 13773 ill_t *ill; 13774 ipif_t *ipif; 13775 ill_walk_context_t ctx; 13776 ip_stack_t *ipst = new_ipif->ipif_ill->ill_ipst; 13777 13778 ASSERT(IAM_WRITER_IPIF(new_ipif)); 13779 ASSERT(MUTEX_HELD(&ipst->ips_ip_addr_avail_lock)); 13780 ASSERT(RW_READ_HELD(&ipst->ips_ill_g_lock)); 13781 13782 new_ipif->ipif_flags &= ~IPIF_UNNUMBERED; 13783 if (IN6_IS_ADDR_UNSPECIFIED(&new_ipif->ipif_v6lcl_addr) || 13784 IN6_IS_ADDR_V4MAPPED_ANY(&new_ipif->ipif_v6lcl_addr)) 13785 return (0); 13786 13787 our_v6addr = new_ipif->ipif_v6lcl_addr; 13788 13789 if (new_ipif->ipif_isv6) 13790 ill = ILL_START_WALK_V6(&ctx, ipst); 13791 else 13792 ill = ILL_START_WALK_V4(&ctx, ipst); 13793 13794 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 13795 for (ipif = ill->ill_ipif; ipif != NULL; 13796 ipif = ipif->ipif_next) { 13797 if ((ipif == new_ipif) || 13798 !(ipif->ipif_flags & IPIF_UP) || 13799 (ipif->ipif_flags & IPIF_UNNUMBERED) || 13800 !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, 13801 &our_v6addr)) 13802 continue; 13803 13804 if (new_ipif->ipif_flags & IPIF_POINTOPOINT) 13805 new_ipif->ipif_flags |= IPIF_UNNUMBERED; 13806 else if (ipif->ipif_flags & IPIF_POINTOPOINT) 13807 ipif->ipif_flags |= IPIF_UNNUMBERED; 13808 else if ((IN6_IS_ADDR_LINKLOCAL(&our_v6addr) || 13809 IN6_IS_ADDR_SITELOCAL(&our_v6addr)) && 13810 !IS_ON_SAME_LAN(ill, new_ipif->ipif_ill)) 13811 continue; 13812 else if (new_ipif->ipif_zoneid != ipif->ipif_zoneid && 13813 ipif->ipif_zoneid != ALL_ZONES && IS_LOOPBACK(ill)) 13814 continue; 13815 else if (new_ipif->ipif_ill == ill) 13816 return (EADDRINUSE); 13817 else 13818 return (EADDRNOTAVAIL); 13819 } 13820 } 13821 13822 return (0); 13823 } 13824 13825 /* 13826 * Bring up an ipif: bring up arp/ndp, bring up the DLPI stream, and add 13827 * IREs for
the ipif. 13828 * When the routine returns EINPROGRESS then mp has been consumed and 13829 * the ioctl will be acked from ip_rput_dlpi. 13830 */ 13831 int 13832 ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) 13833 { 13834 ill_t *ill = ipif->ipif_ill; 13835 boolean_t isv6 = ipif->ipif_isv6; 13836 int err = 0; 13837 boolean_t success; 13838 uint_t ipif_orig_id; 13839 ip_stack_t *ipst = ill->ill_ipst; 13840 13841 ASSERT(IAM_WRITER_IPIF(ipif)); 13842 13843 ip1dbg(("ipif_up(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 13844 DTRACE_PROBE3(ipif__downup, char *, "ipif_up", 13845 ill_t *, ill, ipif_t *, ipif); 13846 13847 /* Shouldn't get here if it is already up. */ 13848 if (ipif->ipif_flags & IPIF_UP) 13849 return (EALREADY); 13850 13851 /* 13852 * If this is a request to bring up a data address on an interface 13853 * under IPMP, then move the address to its IPMP meta-interface and 13854 * try to bring it up. One complication is that the zeroth ipif for 13855 * an ill is special, in that every ill always has one, and that code 13856 * throughout IP dereferences ill->ill_ipif without holding any locks. 13857 */ 13858 if (IS_UNDER_IPMP(ill) && ipmp_ipif_is_dataaddr(ipif) && 13859 (!ipif->ipif_isv6 || !V6_IPIF_LINKLOCAL(ipif))) { 13860 ipif_t *stubipif = NULL, *moveipif = NULL; 13861 ill_t *ipmp_ill = ipmp_illgrp_ipmp_ill(ill->ill_grp); 13862 13863 /* 13864 * The ipif being brought up should be quiesced. If it's not, 13865 * something has gone amiss and we need to bail out. (If it's 13866 * quiesced, we know it will remain so via IPIF_CONDEMNED.) 13867 */ 13868 mutex_enter(&ill->ill_lock); 13869 if (!ipif_is_quiescent(ipif)) { 13870 mutex_exit(&ill->ill_lock); 13871 return (EINVAL); 13872 } 13873 mutex_exit(&ill->ill_lock); 13874 13875 /* 13876 * If we're going to need to allocate ipifs, do it prior 13877 * to starting the move (and grabbing locks). 13878 */ 13879 if (ipif->ipif_id == 0) { 13880 if ((moveipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE, 13881 B_FALSE, &err)) == NULL) { 13882 return (err); 13883 } 13884 if ((stubipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE, 13885 B_FALSE, &err)) == NULL) { 13886 mi_free(moveipif); 13887 return (err); 13888 } 13889 } 13890 13891 /* 13892 * Grab or transfer the ipif to move. During the move, keep 13893 * ill_g_lock held to prevent any ill walker threads from 13894 * seeing things in an inconsistent state. 13895 */ 13896 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 13897 if (ipif->ipif_id != 0) { 13898 ipif_remove(ipif); 13899 } else { 13900 ipif_transfer(ipif, moveipif, stubipif); 13901 ipif = moveipif; 13902 } 13903 13904 /* 13905 * Place the ipif on the IPMP ill. If the zeroth ipif on 13906 * the IPMP ill is a stub (0.0.0.0 down address) then we 13907 * replace that one. Otherwise, pick the next available slot. 13908 */ 13909 ipif->ipif_ill = ipmp_ill; 13910 ipif_orig_id = ipif->ipif_id; 13911 13912 if (ipmp_ipif_is_stubaddr(ipmp_ill->ill_ipif)) { 13913 ipif_transfer(ipif, ipmp_ill->ill_ipif, NULL); 13914 ipif = ipmp_ill->ill_ipif; 13915 } else { 13916 ipif->ipif_id = -1; 13917 if ((err = ipif_insert(ipif, B_FALSE)) != 0) { 13918 /* 13919 * No more available ipif_id's -- put it back 13920 * on the original ill and fail the operation. 13921 * Since we're writer on the ill, we can be 13922 * sure our old slot is still available.
13923 */ 13924 ipif->ipif_id = ipif_orig_id; 13925 ipif->ipif_ill = ill; 13926 if (ipif_orig_id == 0) { 13927 ipif_transfer(ipif, ill->ill_ipif, 13928 NULL); 13929 } else { 13930 VERIFY(ipif_insert(ipif, B_FALSE) == 0); 13931 } 13932 rw_exit(&ipst->ips_ill_g_lock); 13933 return (err); 13934 } 13935 } 13936 rw_exit(&ipst->ips_ill_g_lock); 13937 13938 /* 13939 * Tell SCTP that the ipif has moved. Note that even if we 13940 * had to allocate a new ipif, the original sequence id was 13941 * preserved and therefore SCTP won't know. 13942 */ 13943 sctp_move_ipif(ipif, ill, ipmp_ill); 13944 13945 /* 13946 * If the ipif being brought up was on slot zero, then we 13947 * first need to bring up the placeholder we stuck there. In 13948 * ip_rput_dlpi_writer(), arp_bringup_done(), or the recursive 13949 * call to ipif_up() itself, if we successfully bring up the 13950 * placeholder, we'll check ill_move_ipif and bring it up too. 13951 */ 13952 if (ipif_orig_id == 0) { 13953 ASSERT(ill->ill_move_ipif == NULL); 13954 ill->ill_move_ipif = ipif; 13955 if ((err = ipif_up(ill->ill_ipif, q, mp)) == 0) 13956 ASSERT(ill->ill_move_ipif == NULL); 13957 if (err != EINPROGRESS) 13958 ill->ill_move_ipif = NULL; 13959 return (err); 13960 } 13961 13962 /* 13963 * Bring it up on the IPMP ill. 13964 */ 13965 return (ipif_up(ipif, q, mp)); 13966 } 13967 13968 /* Skip arp/ndp for any loopback interface. */ 13969 if (ill->ill_wq != NULL) { 13970 conn_t *connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL; 13971 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 13972 13973 if (!ill->ill_dl_up) { 13974 /* 13975 * ill_dl_up is not yet set, i.e. we have yet to 13976 * DL_BIND with the driver and this is the first 13977 * logical interface on the ill to become "up". 13978 * Tell the driver to get going (via DL_BIND_REQ). 13979 * Note that changing "significant" IFF_ flags, 13980 * address/netmask etc. causes a down/up dance, but 13981 * does not cause an unbind (DL_UNBIND) with the driver. 13982 */ 13983 return (ill_dl_up(ill, ipif, mp, q)); 13984 } 13985 13986 /* 13987 * ipif_resolver_up may end up needing to bind/attach 13988 * the ARP stream, which in turn necessitates a 13989 * DLPI message exchange with the driver. ioctls are 13990 * serialized and so we cannot send more than one 13991 * interface up message at a time. If ipif_resolver_up 13992 * does need to wait for the DLPI handshake for the ARP stream, 13993 * we get EINPROGRESS and we will complete in arp_bringup_done. 13994 */ 13995 13996 ASSERT(connp != NULL || !CONN_Q(q)); 13997 if (connp != NULL) 13998 mutex_enter(&connp->conn_lock); 13999 mutex_enter(&ill->ill_lock); 14000 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0); 14001 mutex_exit(&ill->ill_lock); 14002 if (connp != NULL) 14003 mutex_exit(&connp->conn_lock); 14004 if (!success) 14005 return (EINTR); 14006 14007 /* 14008 * Crank up IPv6 neighbor discovery. Unlike ARP, this should 14009 * complete when ipif_ndp_up returns. 14010 */ 14011 err = ipif_resolver_up(ipif, Res_act_initial); 14012 if (err == EINPROGRESS) { 14013 /* We will complete it in arp_bringup_done() */ 14014 return (err); 14015 } 14016 14017 if (isv6 && err == 0) 14018 err = ipif_ndp_up(ipif, B_TRUE); 14019 14020 ASSERT(err != EINPROGRESS); 14021 mp = ipsq_pending_mp_get(ipsq, &connp); 14022 ASSERT(mp != NULL); 14023 if (err != 0) 14024 return (err); 14025 } else { 14026 /* 14027 * Interfaces without underlying hardware don't do duplicate 14028 * address detection.
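 * (Loopback has no link layer to probe on, so the address is marked * ready immediately.)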
14029 */ 14030 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); 14031 ipif->ipif_addr_ready = 1; 14032 err = ill_add_ires(ill); 14033 /* allocation failure? */ 14034 if (err != 0) 14035 return (err); 14036 } 14037 14038 err = (isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif)); 14039 if (err == 0 && ill->ill_move_ipif != NULL) { 14040 ipif = ill->ill_move_ipif; 14041 ill->ill_move_ipif = NULL; 14042 return (ipif_up(ipif, q, mp)); 14043 } 14044 return (err); 14045 } 14046 14047 /* 14048 * Add any IREs tied to the ill. For now this is just an IRE_MULTICAST. 14049 * The identical set of IREs need to be removed in ill_delete_ires(). 14050 */ 14051 int 14052 ill_add_ires(ill_t *ill) 14053 { 14054 ire_t *ire; 14055 in6_addr_t dummy6 = {(uint32_t)V6_MCAST, 0, 0, 1}; 14056 in_addr_t dummy4 = htonl(INADDR_ALLHOSTS_GROUP); 14057 14058 if (ill->ill_ire_multicast != NULL) 14059 return (0); 14060 14061 /* 14062 * provide some dummy ire_addr for creating the ire. 14063 */ 14064 if (ill->ill_isv6) { 14065 ire = ire_create_v6(&dummy6, 0, 0, IRE_MULTICAST, ill, 14066 ALL_ZONES, RTF_UP, NULL, ill->ill_ipst); 14067 } else { 14068 ire = ire_create((uchar_t *)&dummy4, 0, 0, IRE_MULTICAST, ill, 14069 ALL_ZONES, RTF_UP, NULL, ill->ill_ipst); 14070 } 14071 if (ire == NULL) 14072 return (ENOMEM); 14073 14074 ill->ill_ire_multicast = ire; 14075 return (0); 14076 } 14077 14078 void 14079 ill_delete_ires(ill_t *ill) 14080 { 14081 if (ill->ill_ire_multicast != NULL) { 14082 /* 14083 * BIND/ATTACH completed; Release the ref for ill_ire_multicast 14084 * which was taken without any th_tracing enabled. 14085 * We also mark it as condemned (note that it was never added) 14086 * so that caching conn's can move off of it. 14087 */ 14088 ire_make_condemned(ill->ill_ire_multicast); 14089 ire_refrele_notr(ill->ill_ire_multicast); 14090 ill->ill_ire_multicast = NULL; 14091 } 14092 } 14093 14094 /* 14095 * Perform a bind for the physical device. 14096 * When the routine returns EINPROGRESS then mp has been consumed and 14097 * the ioctl will be acked from ip_rput_dlpi. 14098 * Allocate an unbind message and save it until ipif_down. 14099 */ 14100 static int 14101 ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) 14102 { 14103 mblk_t *bind_mp = NULL; 14104 mblk_t *unbind_mp = NULL; 14105 conn_t *connp; 14106 boolean_t success; 14107 int err; 14108 14109 DTRACE_PROBE2(ill__downup, char *, "ill_dl_up", ill_t *, ill); 14110 14111 ip1dbg(("ill_dl_up(%s)\n", ill->ill_name)); 14112 ASSERT(IAM_WRITER_ILL(ill)); 14113 ASSERT(mp != NULL); 14114 14115 /* 14116 * Make sure we have an IRE_MULTICAST in case we immediately 14117 * start receiving packets. 14118 */ 14119 err = ill_add_ires(ill); 14120 if (err != 0) 14121 goto bad; 14122 14123 bind_mp = ip_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long), 14124 DL_BIND_REQ); 14125 if (bind_mp == NULL) 14126 goto bad; 14127 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ill->ill_sap; 14128 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS; 14129 14130 unbind_mp = ip_dlpi_alloc(sizeof (dl_unbind_req_t), DL_UNBIND_REQ); 14131 if (unbind_mp == NULL) 14132 goto bad; 14133 14134 /* 14135 * Record state needed to complete this operation when the 14136 * DL_BIND_ACK shows up. Also remember the pre-allocated mblks. 14137 */ 14138 connp = CONN_Q(q) ? 
Q_TO_CONN(q) : NULL; 14139 ASSERT(connp != NULL || !CONN_Q(q)); 14140 GRAB_CONN_LOCK(q); 14141 mutex_enter(&ipif->ipif_ill->ill_lock); 14142 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0); 14143 mutex_exit(&ipif->ipif_ill->ill_lock); 14144 RELEASE_CONN_LOCK(q); 14145 if (!success) 14146 goto bad; 14147 14148 /* 14149 * Save the unbind message for ill_dl_down(); it will be consumed when 14150 * the interface goes down. 14151 */ 14152 ASSERT(ill->ill_unbind_mp == NULL); 14153 ill->ill_unbind_mp = unbind_mp; 14154 14155 ill_dlpi_send(ill, bind_mp); 14156 /* Send down link-layer capabilities probe if not already done. */ 14157 ill_capability_probe(ill); 14158 14159 /* 14160 * Sysid used to rely on the fact that netboots set domainname 14161 * and the like. Now that miniroot boots aren't strictly netboots 14162 * and miniroot network configuration is driven from userland, 14163 * these things still need to be set. This situation can be detected 14164 * by comparing the interface being configured here to the one 14165 * dhcifname was set to reference by the boot loader. Once sysid is 14166 * converted to use dhcp_ipc_getinfo() this call can go away. 14167 */ 14168 if ((ipif->ipif_flags & IPIF_DHCPRUNNING) && 14169 (strcmp(ill->ill_name, dhcifname) == 0) && 14170 (strlen(srpc_domain) == 0)) { 14171 if (dhcpinit() != 0) 14172 cmn_err(CE_WARN, "no cached dhcp response"); 14173 } 14174 14175 /* 14176 * This operation will complete in ip_rput_dlpi with either 14177 * a DL_BIND_ACK or DL_ERROR_ACK. 14178 */ 14179 return (EINPROGRESS); 14180 bad: 14181 ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name)); 14182 14183 freemsg(bind_mp); 14184 freemsg(unbind_mp); 14185 return (ENOMEM); 14186 } 14187 14188 /* Add room for tcp+ip headers */ 14189 uint_t ip_loopback_mtuplus = IP_LOOPBACK_MTU + IP_SIMPLE_HDR_LENGTH + 20; 14190 14191 /* 14192 * DLPI and ARP are up. 14193 * Create all the IREs associated with an interface. Bring up multicast. 14194 * Set the interface flag and finish other initialization 14195 * that potentially had to be deferred to after DL_BIND_ACK. 14196 */ 14197 int 14198 ipif_up_done(ipif_t *ipif) 14199 { 14200 ill_t *ill = ipif->ipif_ill; 14201 int err = 0; 14202 boolean_t loopback = B_FALSE; 14203 boolean_t update_src_selection = B_TRUE; 14204 ipif_t *tmp_ipif; 14205 14206 ip1dbg(("ipif_up_done(%s:%u)\n", 14207 ipif->ipif_ill->ill_name, ipif->ipif_id)); 14208 DTRACE_PROBE3(ipif__downup, char *, "ipif_up_done", 14209 ill_t *, ill, ipif_t *, ipif); 14210 14211 /* Check if this is a loopback interface */ 14212 if (ipif->ipif_ill->ill_wq == NULL) 14213 loopback = B_TRUE; 14214 14215 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 14216 14217 /* 14218 * If all other interfaces for this ill are down or DEPRECATED, 14219 * or otherwise unsuitable for source address selection, 14220 * reset the src generation numbers to make sure source 14221 * address selection gets to take this new ipif into account.
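 * (ip_update_source_selection() bumps ips_src_generation, which callers * that cache a source address, cf. ip_select_source_v4(), use to notice * that their cached choice may be stale.)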
14222 * No need to hold ill_lock while traversing the ipif list since 14223 * we are writer. 14224 */ 14225 for (tmp_ipif = ill->ill_ipif; tmp_ipif; 14226 tmp_ipif = tmp_ipif->ipif_next) { 14227 if (((tmp_ipif->ipif_flags & 14228 (IPIF_NOXMIT|IPIF_ANYCAST|IPIF_NOLOCAL|IPIF_DEPRECATED)) || 14229 !(tmp_ipif->ipif_flags & IPIF_UP)) || 14230 (tmp_ipif == ipif)) 14231 continue; 14232 /* first usable pre-existing interface */ 14233 update_src_selection = B_FALSE; 14234 break; 14235 } 14236 if (update_src_selection) 14237 ip_update_source_selection(ill->ill_ipst); 14238 14239 if (IS_LOOPBACK(ill) || ill->ill_net_type == IRE_IF_NORESOLVER) { 14240 nce_t *loop_nce = NULL; 14241 uint16_t flags = (NCE_F_MYADDR | NCE_F_AUTHORITY | NCE_F_NONUD); 14242 14243 /* 14244 * lo0:1 and subsequent ipifs were marked IRE_LOCAL in 14245 * ipif_lookup_on_name(), but in the case of zones we can have 14246 * several loopback addresses on lo0. So all the interfaces with 14247 * loopback addresses need to be marked IRE_LOOPBACK. 14248 */ 14249 if (V4_PART_OF_V6(ipif->ipif_v6lcl_addr) == 14250 htonl(INADDR_LOOPBACK)) 14251 ipif->ipif_ire_type = IRE_LOOPBACK; 14252 else 14253 ipif->ipif_ire_type = IRE_LOCAL; 14254 if (ill->ill_net_type != IRE_LOOPBACK) 14255 flags |= NCE_F_PUBLISH; 14256 14257 /* add unicast nce for the local addr */ 14258 err = nce_lookup_then_add_v4(ill, NULL, 14259 ill->ill_phys_addr_length, &ipif->ipif_lcl_addr, flags, 14260 ND_REACHABLE, &loop_nce); 14261 /* A shared-IP zone sees EEXIST for lo0:N */ 14262 if (err == 0 || err == EEXIST) { 14263 ipif->ipif_added_nce = 1; 14264 loop_nce->nce_ipif_cnt++; 14265 nce_refrele(loop_nce); 14266 err = 0; 14267 } else { 14268 ASSERT(loop_nce == NULL); 14269 return (err); 14270 } 14271 } 14272 14273 /* Create all the IREs associated with this interface */ 14274 err = ipif_add_ires_v4(ipif, loopback); 14275 if (err != 0) { 14276 /* 14277 * see comments about return value from 14278 * ip_addr_availability_check() in ipif_add_ires_v4(). 14279 */ 14280 if (err != EADDRINUSE) { 14281 (void) ipif_arp_down(ipif); 14282 } else { 14283 /* 14284 * Make IPMP aware of the deleted ipif so that 14285 * the needed ipmp cleanup (e.g., of ipif_bound_ill) 14286 * can be completed. Note that we do not want to 14287 * destroy the nce that was created on the ipmp_ill 14288 * for the active copy of the duplicate address in 14289 * use. 14290 */ 14291 if (IS_IPMP(ill)) 14292 ipmp_illgrp_del_ipif(ill->ill_grp, ipif); 14293 err = EADDRNOTAVAIL; 14294 } 14295 return (err); 14296 } 14297 14298 if (ill->ill_ipif_up_count == 1 && !loopback) { 14299 /* Recover any additional IRE entries for this ill */ 14300 (void) ill_recover_saved_ire(ill); 14301 } 14302 14303 if (ill->ill_need_recover_multicast) { 14304 /* 14305 * Need to recover all multicast memberships in the driver. 14306 * This had to be deferred until we had attached. The same 14307 * code exists in ipif_up_done_v6() to recover IPv6 14308 * memberships. 14309 * 14310 * Note that it would be preferable to unconditionally do the 14311 * ill_recover_multicast() in ill_dl_up(), but we cannot do 14312 * that since ill_join_allmulti() depends on ill_dl_up being 14313 * set, and it is not set until we receive a DL_BIND_ACK after 14314 * having called ill_dl_up(). 14315 */ 14316 ill_recover_multicast(ill); 14317 } 14318 14319 if (ill->ill_ipif_up_count == 1) { 14320 /* 14321 * Since the interface is now up, it may now be active.
14322 */ 14323 if (IS_UNDER_IPMP(ill)) 14324 ipmp_ill_refresh_active(ill); 14325 14326 /* 14327 * If this is an IPMP interface, we may now be able to 14328 * establish ARP entries. 14329 */ 14330 if (IS_IPMP(ill)) 14331 ipmp_illgrp_refresh_arpent(ill->ill_grp); 14332 } 14333 14334 /* Join the allhosts multicast address */ 14335 ipif_multicast_up(ipif); 14336 14337 if (!loopback && !update_src_selection && 14338 !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) 14339 ip_update_source_selection(ill->ill_ipst); 14340 14341 if (!loopback && ipif->ipif_addr_ready) { 14342 /* Broadcast an address mask reply. */ 14343 ipif_mask_reply(ipif); 14344 } 14345 /* Perhaps ilgs should use this ill */ 14346 update_conn_ill(NULL, ill->ill_ipst); 14347 14348 /* 14349 * This had to be deferred until we had bound. Tell routing sockets and 14350 * others that this interface is up if it looks like the address has 14351 * been validated. Otherwise, if it isn't ready yet, wait for 14352 * duplicate address detection to do its thing. 14353 */ 14354 if (ipif->ipif_addr_ready) 14355 ipif_up_notify(ipif); 14356 return (0); 14357 } 14358 14359 /* 14360 * Add the IREs associated with the ipif. 14361 * Those MUST be explicitly removed in ipif_delete_ires_v4. 14362 */ 14363 static int 14364 ipif_add_ires_v4(ipif_t *ipif, boolean_t loopback) 14365 { 14366 ill_t *ill = ipif->ipif_ill; 14367 ip_stack_t *ipst = ill->ill_ipst; 14368 ire_t *ire_array[20]; 14369 ire_t **irep = ire_array; 14370 ire_t **irep1; 14371 ipaddr_t net_mask = 0; 14372 ipaddr_t subnet_mask, route_mask; 14373 int err; 14374 ire_t *ire_local = NULL; /* LOCAL or LOOPBACK */ 14375 ire_t *ire_if = NULL; 14376 14377 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 14378 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 14379 /* 14380 * If we're on a labeled system then make sure that zone- 14381 * private addresses have proper remote host database entries. 14382 */ 14383 if (is_system_labeled() && 14384 ipif->ipif_ire_type != IRE_LOOPBACK && 14385 !tsol_check_interface_address(ipif)) 14386 return (EINVAL); 14387 14388 /* Register the source address for __sin6_src_id */ 14389 err = ip_srcid_insert(&ipif->ipif_v6lcl_addr, 14390 ipif->ipif_zoneid, ipst); 14391 if (err != 0) { 14392 ip0dbg(("ipif_add_ires: srcid_insert %d\n", err)); 14393 return (err); 14394 } 14395 14396 /* If the interface address is set, create the local IRE. */ 14397 ire_local = ire_create( 14398 (uchar_t *)&ipif->ipif_lcl_addr, /* dest address */ 14399 (uchar_t *)&ip_g_all_ones, /* mask */ 14400 NULL, /* no gateway */ 14401 ipif->ipif_ire_type, /* LOCAL or LOOPBACK */ 14402 ipif->ipif_ill, 14403 ipif->ipif_zoneid, 14404 ((ipif->ipif_flags & IPIF_PRIVATE) ? 
14405 RTF_PRIVATE : 0) | RTF_KERNEL, 14406 NULL, 14407 ipst); 14408 ip1dbg(("ipif_add_ires: 0x%p creating IRE %p type 0x%x" 14409 " for 0x%x\n", (void *)ipif, (void *)ire_local, 14410 ipif->ipif_ire_type, 14411 ntohl(ipif->ipif_lcl_addr))); 14412 if (ire_local == NULL) { 14413 ip1dbg(("ipif_up_done: NULL ire_local\n")); 14414 err = ENOMEM; 14415 goto bad; 14416 } 14417 } else { 14418 ip1dbg(( 14419 "ipif_add_ires: not creating IRE %d for 0x%x: flags 0x%x\n", 14420 ipif->ipif_ire_type, 14421 ntohl(ipif->ipif_lcl_addr), 14422 (uint_t)ipif->ipif_flags)); 14423 } 14424 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 14425 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 14426 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 14427 } else { 14428 net_mask = htonl(IN_CLASSA_NET); /* fallback */ 14429 } 14430 14431 subnet_mask = ipif->ipif_net_mask; 14432 14433 /* 14434 * If mask was not specified, use natural netmask of 14435 * interface address. Also, store this mask back into the 14436 * ipif struct. 14437 */ 14438 if (subnet_mask == 0) { 14439 subnet_mask = net_mask; 14440 V4MASK_TO_V6(subnet_mask, ipif->ipif_v6net_mask); 14441 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 14442 ipif->ipif_v6subnet); 14443 } 14444 14445 /* Set up the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. */ 14446 if (!loopback && !(ipif->ipif_flags & IPIF_NOXMIT) && 14447 ipif->ipif_subnet != INADDR_ANY) { 14448 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 14449 14450 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 14451 route_mask = IP_HOST_MASK; 14452 } else { 14453 route_mask = subnet_mask; 14454 } 14455 14456 ip1dbg(("ipif_add_ires: ipif 0x%p ill 0x%p " 14457 "creating if IRE ill_net_type 0x%x for 0x%x\n", 14458 (void *)ipif, (void *)ill, ill->ill_net_type, 14459 ntohl(ipif->ipif_subnet))); 14460 ire_if = ire_create( 14461 (uchar_t *)&ipif->ipif_subnet, 14462 (uchar_t *)&route_mask, 14463 (uchar_t *)&ipif->ipif_lcl_addr, 14464 ill->ill_net_type, 14465 ill, 14466 ipif->ipif_zoneid, 14467 ((ipif->ipif_flags & IPIF_PRIVATE) ? 14468 RTF_PRIVATE: 0) | RTF_KERNEL, 14469 NULL, 14470 ipst); 14471 if (ire_if == NULL) { 14472 ip1dbg(("ipif_up_done: NULL ire_if\n")); 14473 err = ENOMEM; 14474 goto bad; 14475 } 14476 } 14477 14478 /* 14479 * Create any necessary broadcast IREs. 14480 */ 14481 if ((ipif->ipif_flags & IPIF_BROADCAST) && 14482 !(ipif->ipif_flags & IPIF_NOXMIT)) 14483 irep = ipif_create_bcast_ires(ipif, irep); 14484 14485 /* If an earlier ire_create failed, get out now */ 14486 for (irep1 = irep; irep1 > ire_array; ) { 14487 irep1--; 14488 if (*irep1 == NULL) { 14489 ip1dbg(("ipif_up_done: NULL ire found in ire_array\n")); 14490 err = ENOMEM; 14491 goto bad; 14492 } 14493 } 14494 14495 /* 14496 * Need to atomically check for IP address availability under 14497 * ip_addr_avail_lock. ill_g_lock is held as reader to ensure no new 14498 * ills or new ipifs can be added while we are checking availability. 14499 */ 14500 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 14501 mutex_enter(&ipst->ips_ip_addr_avail_lock); 14502 /* Mark it up, and increment counters. */ 14503 ipif->ipif_flags |= IPIF_UP; 14504 ill->ill_ipif_up_count++; 14505 err = ip_addr_availability_check(ipif); 14506 mutex_exit(&ipst->ips_ip_addr_avail_lock); 14507 rw_exit(&ipst->ips_ill_g_lock); 14508 14509 if (err != 0) { 14510 /* 14511 * Our address may already be up on the same ill. In this case, 14512 * the ARP entry for our ipif replaced the one for the other 14513 * ipif. 
So we don't want to delete it (otherwise the other ipif 14514 * would be unable to send packets). 14515 * ip_addr_availability_check() identifies this case for us and 14516 * returns EADDRINUSE; Caller should turn it into EADDRNOTAVAIL 14517 * which is the expected error code. 14518 */ 14519 ill->ill_ipif_up_count--; 14520 ipif->ipif_flags &= ~IPIF_UP; 14521 goto bad; 14522 } 14523 14524 /* 14525 * Add in all newly created IREs. ire_create_bcast() has 14526 * already checked for duplicates of the IRE_BROADCAST type. 14527 * We add the IRE_INTERFACE before the IRE_LOCAL to ensure 14528 * that lookups find the IRE_LOCAL even if the IRE_INTERFACE is 14529 * a /32 route. 14530 */ 14531 if (ire_if != NULL) { 14532 ire_if = ire_add(ire_if); 14533 if (ire_if == NULL) { 14534 err = ENOMEM; 14535 goto bad2; 14536 } 14537 #ifdef DEBUG 14538 ire_refhold_notr(ire_if); 14539 ire_refrele(ire_if); 14540 #endif 14541 } 14542 if (ire_local != NULL) { 14543 ire_local = ire_add(ire_local); 14544 if (ire_local == NULL) { 14545 err = ENOMEM; 14546 goto bad2; 14547 } 14548 #ifdef DEBUG 14549 ire_refhold_notr(ire_local); 14550 ire_refrele(ire_local); 14551 #endif 14552 } 14553 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 14554 if (ire_local != NULL) 14555 ipif->ipif_ire_local = ire_local; 14556 if (ire_if != NULL) 14557 ipif->ipif_ire_if = ire_if; 14558 rw_exit(&ipst->ips_ill_g_lock); 14559 ire_local = NULL; 14560 ire_if = NULL; 14561 14562 /* 14563 * We first add all of them, and if that succeeds we refrele the 14564 * bunch. That enables us to delete all of them should any of the 14565 * ire_adds fail. 14566 */ 14567 for (irep1 = irep; irep1 > ire_array; ) { 14568 irep1--; 14569 ASSERT(!MUTEX_HELD(&((*irep1)->ire_ill->ill_lock))); 14570 *irep1 = ire_add(*irep1); 14571 if (*irep1 == NULL) { 14572 err = ENOMEM; 14573 goto bad2; 14574 } 14575 } 14576 14577 for (irep1 = irep; irep1 > ire_array; ) { 14578 irep1--; 14579 /* refheld by ire_add. */ 14580 if (*irep1 != NULL) { 14581 ire_refrele(*irep1); 14582 *irep1 = NULL; 14583 } 14584 } 14585 14586 if (!loopback) { 14587 /* 14588 * If the broadcast address has been set, make sure it makes 14589 * sense based on the interface address. 14590 * Only match on ill since we are sharing broadcast addresses. 14591 */ 14592 if ((ipif->ipif_brd_addr != INADDR_ANY) && 14593 (ipif->ipif_flags & IPIF_BROADCAST)) { 14594 ire_t *ire; 14595 14596 ire = ire_ftable_lookup_v4(ipif->ipif_brd_addr, 0, 0, 14597 IRE_BROADCAST, ipif->ipif_ill, ALL_ZONES, NULL, 14598 (MATCH_IRE_TYPE | MATCH_IRE_ILL), 0, ipst, NULL); 14599 14600 if (ire == NULL) { 14601 /* 14602 * If there isn't a matching broadcast IRE, 14603 * revert to the default for this netmask. 
14604 */ 14605 ipif->ipif_v6brd_addr = ipv6_all_zeros; 14606 mutex_enter(&ipif->ipif_ill->ill_lock); 14607 ipif_set_default(ipif); 14608 mutex_exit(&ipif->ipif_ill->ill_lock); 14609 } else { 14610 ire_refrele(ire); 14611 } 14612 } 14613 14614 } 14615 return (0); 14616 14617 bad2: 14618 ill->ill_ipif_up_count--; 14619 ipif->ipif_flags &= ~IPIF_UP; 14620 14621 bad: 14622 ip1dbg(("ipif_add_ires: FAILED \n")); 14623 if (ire_local != NULL) 14624 ire_delete(ire_local); 14625 if (ire_if != NULL) 14626 ire_delete(ire_if); 14627 14628 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 14629 ire_local = ipif->ipif_ire_local; 14630 ipif->ipif_ire_local = NULL; 14631 ire_if = ipif->ipif_ire_if; 14632 ipif->ipif_ire_if = NULL; 14633 rw_exit(&ipst->ips_ill_g_lock); 14634 if (ire_local != NULL) { 14635 ire_delete(ire_local); 14636 ire_refrele_notr(ire_local); 14637 } 14638 if (ire_if != NULL) { 14639 ire_delete(ire_if); 14640 ire_refrele_notr(ire_if); 14641 } 14642 14643 while (irep > ire_array) { 14644 irep--; 14645 if (*irep != NULL) { 14646 ire_delete(*irep); 14647 } 14648 } 14649 (void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst); 14650 14651 return (err); 14652 } 14653 14654 /* Remove all the IREs created by ipif_add_ires_v4 */ 14655 void 14656 ipif_delete_ires_v4(ipif_t *ipif) 14657 { 14658 ill_t *ill = ipif->ipif_ill; 14659 ip_stack_t *ipst = ill->ill_ipst; 14660 ire_t *ire; 14661 14662 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 14663 ire = ipif->ipif_ire_local; 14664 ipif->ipif_ire_local = NULL; 14665 rw_exit(&ipst->ips_ill_g_lock); 14666 if (ire != NULL) { 14667 /* 14668 * Move count to ipif so we don't lose the count due to 14669 * a down/up dance. 14670 */ 14671 atomic_add_32(&ipif->ipif_ib_pkt_count, ire->ire_ib_pkt_count); 14672 14673 ire_delete(ire); 14674 ire_refrele_notr(ire); 14675 } 14676 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 14677 ire = ipif->ipif_ire_if; 14678 ipif->ipif_ire_if = NULL; 14679 rw_exit(&ipst->ips_ill_g_lock); 14680 if (ire != NULL) { 14681 ire_delete(ire); 14682 ire_refrele_notr(ire); 14683 } 14684 14685 /* 14686 * Delete the broadcast IREs. 14687 */ 14688 if ((ipif->ipif_flags & IPIF_BROADCAST) && 14689 !(ipif->ipif_flags & IPIF_NOXMIT)) 14690 ipif_delete_bcast_ires(ipif); 14691 } 14692 14693 /* 14694 * Checks for availability of a usable source address (if there is one) when 14695 * the destination ILL has the ill_usesrc_ifindex pointing to another ILL. 14696 * Note this selection is done regardless of the destination. 14697 */ 14698 boolean_t 14699 ipif_zone_avail(uint_t ifindex, boolean_t isv6, zoneid_t zoneid, 14700 ip_stack_t *ipst) 14701 { 14702 ipif_t *ipif = NULL; 14703 ill_t *uill; 14704 14705 ASSERT(ifindex != 0); 14706 14707 uill = ill_lookup_on_ifindex(ifindex, isv6, ipst); 14708 if (uill == NULL) 14709 return (B_FALSE); 14710 14711 mutex_enter(&uill->ill_lock); 14712 for (ipif = uill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 14713 if (IPIF_IS_CONDEMNED(ipif)) 14714 continue; 14715 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 14716 continue; 14717 if (!(ipif->ipif_flags & IPIF_UP)) 14718 continue; 14719 if (ipif->ipif_zoneid != zoneid) 14720 continue; 14721 if (isv6 ?
IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) : 14722 ipif->ipif_lcl_addr == INADDR_ANY) 14723 continue; 14724 mutex_exit(&uill->ill_lock); 14725 ill_refrele(uill); 14726 return (B_TRUE); 14727 } 14728 mutex_exit(&uill->ill_lock); 14729 ill_refrele(uill); 14730 return (B_FALSE); 14731 } 14732 14733 /* 14734 * Find an ipif with a good local address on the ill+zoneid. 14735 */ 14736 ipif_t * 14737 ipif_good_addr(ill_t *ill, zoneid_t zoneid) 14738 { 14739 ipif_t *ipif; 14740 14741 mutex_enter(&ill->ill_lock); 14742 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 14743 if (IPIF_IS_CONDEMNED(ipif)) 14744 continue; 14745 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 14746 continue; 14747 if (!(ipif->ipif_flags & IPIF_UP)) 14748 continue; 14749 if (ipif->ipif_zoneid != zoneid && 14750 ipif->ipif_zoneid != ALL_ZONES && zoneid != ALL_ZONES) 14751 continue; 14752 if (ill->ill_isv6 ? 14753 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) : 14754 ipif->ipif_lcl_addr == INADDR_ANY) 14755 continue; 14756 ipif_refhold_locked(ipif); 14757 mutex_exit(&ill->ill_lock); 14758 return (ipif); 14759 } 14760 mutex_exit(&ill->ill_lock); 14761 return (NULL); 14762 } 14763 14764 /* 14765 * IP source address type, sorted from worst to best. For a given type, 14766 * always prefer IP addresses on the same subnet. All-zones addresses are 14767 * suboptimal because they pose problems with unlabeled destinations. 14768 */ 14769 typedef enum { 14770 IPIF_NONE, 14771 IPIF_DIFFNET_DEPRECATED, /* deprecated and different subnet */ 14772 IPIF_SAMENET_DEPRECATED, /* deprecated and same subnet */ 14773 IPIF_DIFFNET_ALLZONES, /* allzones and different subnet */ 14774 IPIF_SAMENET_ALLZONES, /* allzones and same subnet */ 14775 IPIF_DIFFNET, /* normal and different subnet */ 14776 IPIF_SAMENET, /* normal and same subnet */ 14777 IPIF_LOCALADDR /* local loopback */ 14778 } ipif_type_t; 14779 14780 /* 14781 * Pick the optimal ipif on `ill' for sending to destination `dst' from zone 14782 * `zoneid'. We rate usable ipifs from low -> high as per the ipif_type_t 14783 * enumeration, and return the highest-rated ipif. If there's a tie, we pick 14784 * the first one, unless IPMP is used in which case we round-robin among them; 14785 * see below for more. 14786 * 14787 * Returns NULL if there is no suitable source address for the ill. 14788 * This only occurs when there is no valid source address for the ill. 14789 */ 14790 ipif_t * 14791 ipif_select_source_v4(ill_t *ill, ipaddr_t dst, zoneid_t zoneid, 14792 boolean_t allow_usesrc, boolean_t *notreadyp) 14793 { 14794 ill_t *usill = NULL; 14795 ill_t *ipmp_ill = NULL; 14796 ipif_t *start_ipif, *next_ipif, *ipif, *best_ipif; 14797 ipif_type_t type, best_type; 14798 tsol_tpc_t *src_rhtp, *dst_rhtp; 14799 ip_stack_t *ipst = ill->ill_ipst; 14800 boolean_t samenet; 14801 14802 if (ill->ill_usesrc_ifindex != 0 && allow_usesrc) { 14803 usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex, 14804 B_FALSE, ipst); 14805 if (usill != NULL) 14806 ill = usill; /* Select source from usesrc ILL */ 14807 else 14808 return (NULL); 14809 } 14810 14811 /* 14812 * Test addresses should never be used for source address selection, 14813 * so if we were passed one, switch to the IPMP meta-interface. 
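 * (Under IPMP, usable data addresses are hosted on the IPMP meta-interface; * what remains on an underlying ill is a test address, which must not be * chosen as a source.)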
14814 */ 14815 if (IS_UNDER_IPMP(ill)) { 14816 if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) 14817 ill = ipmp_ill; /* Select source from IPMP ill */ 14818 else 14819 return (NULL); 14820 } 14821 14822 /* 14823 * If we're dealing with an unlabeled destination on a labeled system, 14824 * make sure that we ignore source addresses that are incompatible with 14825 * the destination's default label. That destination's default label 14826 * must dominate the minimum label on the source address. 14827 */ 14828 dst_rhtp = NULL; 14829 if (is_system_labeled()) { 14830 dst_rhtp = find_tpc(&dst, IPV4_VERSION, B_FALSE); 14831 if (dst_rhtp == NULL) 14832 return (NULL); 14833 if (dst_rhtp->tpc_tp.host_type != UNLABELED) { 14834 TPC_RELE(dst_rhtp); 14835 dst_rhtp = NULL; 14836 } 14837 } 14838 14839 /* 14840 * Hold the ill_g_lock as reader. This makes sure that no ipif/ill 14841 * can be deleted. But an ipif/ill can get CONDEMNED any time. 14842 * After selecting the right ipif, under ill_lock make sure ipif is 14843 * not condemned, and increment refcnt. If ipif is CONDEMNED, 14844 * we retry. Inside the loop we still need to check for CONDEMNED, 14845 * but not under a lock. 14846 */ 14847 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 14848 retry: 14849 /* 14850 * For source address selection, we treat the ipif list as circular 14851 * and continue until we get back to where we started. This allows 14852 * IPMP to vary source address selection (which improves inbound load 14853 * spreading) by caching its last ending point and starting from 14854 * there. NOTE: we don't have to worry about ill_src_ipif changing 14855 * ills since that can't happen on the IPMP ill. 14856 */ 14857 start_ipif = ill->ill_ipif; 14858 if (IS_IPMP(ill) && ill->ill_src_ipif != NULL) 14859 start_ipif = ill->ill_src_ipif; 14860 14861 ipif = start_ipif; 14862 best_ipif = NULL; 14863 best_type = IPIF_NONE; 14864 do { 14865 if ((next_ipif = ipif->ipif_next) == NULL) 14866 next_ipif = ill->ill_ipif; 14867 14868 if (IPIF_IS_CONDEMNED(ipif)) 14869 continue; 14870 /* Always skip NOLOCAL and ANYCAST interfaces */ 14871 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 14872 continue; 14873 /* Always skip NOACCEPT interfaces */ 14874 if (ipif->ipif_ill->ill_flags & ILLF_NOACCEPT) 14875 continue; 14876 if (!(ipif->ipif_flags & IPIF_UP)) 14877 continue; 14878 14879 if (!ipif->ipif_addr_ready) { 14880 if (notreadyp != NULL) 14881 *notreadyp = B_TRUE; 14882 continue; 14883 } 14884 14885 if (zoneid != ALL_ZONES && 14886 ipif->ipif_zoneid != zoneid && 14887 ipif->ipif_zoneid != ALL_ZONES) 14888 continue; 14889 14890 /* 14891 * Interfaces with 0.0.0.0 address are allowed to be UP, but 14892 * are not valid as source addresses. 14893 */ 14894 if (ipif->ipif_lcl_addr == INADDR_ANY) 14895 continue; 14896 14897 /* 14898 * Check compatibility of local address for destination's 14899 * default label if we're on a labeled system. Incompatible 14900 * addresses can't be used at all. 
14901 */ 14902 if (dst_rhtp != NULL) { 14903 boolean_t incompat; 14904 14905 src_rhtp = find_tpc(&ipif->ipif_lcl_addr, 14906 IPV4_VERSION, B_FALSE); 14907 if (src_rhtp == NULL) 14908 continue; 14909 incompat = src_rhtp->tpc_tp.host_type != SUN_CIPSO || 14910 src_rhtp->tpc_tp.tp_doi != 14911 dst_rhtp->tpc_tp.tp_doi || 14912 (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label, 14913 &src_rhtp->tpc_tp.tp_sl_range_cipso) && 14914 !blinlset(&dst_rhtp->tpc_tp.tp_def_label, 14915 src_rhtp->tpc_tp.tp_sl_set_cipso)); 14916 TPC_RELE(src_rhtp); 14917 if (incompat) 14918 continue; 14919 } 14920 14921 samenet = ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet); 14922 14923 if (ipif->ipif_lcl_addr == dst) { 14924 type = IPIF_LOCALADDR; 14925 } else if (ipif->ipif_flags & IPIF_DEPRECATED) { 14926 type = samenet ? IPIF_SAMENET_DEPRECATED : 14927 IPIF_DIFFNET_DEPRECATED; 14928 } else if (ipif->ipif_zoneid == ALL_ZONES) { 14929 type = samenet ? IPIF_SAMENET_ALLZONES : 14930 IPIF_DIFFNET_ALLZONES; 14931 } else { 14932 type = samenet ? IPIF_SAMENET : IPIF_DIFFNET; 14933 } 14934 14935 if (type > best_type) { 14936 best_type = type; 14937 best_ipif = ipif; 14938 if (best_type == IPIF_LOCALADDR) 14939 break; /* can't get better */ 14940 } 14941 } while ((ipif = next_ipif) != start_ipif); 14942 14943 if ((ipif = best_ipif) != NULL) { 14944 mutex_enter(&ipif->ipif_ill->ill_lock); 14945 if (IPIF_IS_CONDEMNED(ipif)) { 14946 mutex_exit(&ipif->ipif_ill->ill_lock); 14947 goto retry; 14948 } 14949 ipif_refhold_locked(ipif); 14950 14951 /* 14952 * For IPMP, update the source ipif rotor to the next ipif, 14953 * provided we can look it up. (We must not use it if it's 14954 * IPIF_CONDEMNED since we may have grabbed ill_g_lock after 14955 * ipif_free() checked ill_src_ipif.) 14956 */ 14957 if (IS_IPMP(ill) && ipif != NULL) { 14958 next_ipif = ipif->ipif_next; 14959 if (next_ipif != NULL && !IPIF_IS_CONDEMNED(next_ipif)) 14960 ill->ill_src_ipif = next_ipif; 14961 else 14962 ill->ill_src_ipif = NULL; 14963 } 14964 mutex_exit(&ipif->ipif_ill->ill_lock); 14965 } 14966 14967 rw_exit(&ipst->ips_ill_g_lock); 14968 if (usill != NULL) 14969 ill_refrele(usill); 14970 if (ipmp_ill != NULL) 14971 ill_refrele(ipmp_ill); 14972 if (dst_rhtp != NULL) 14973 TPC_RELE(dst_rhtp); 14974 14975 #ifdef DEBUG 14976 if (ipif == NULL) { 14977 char buf1[INET6_ADDRSTRLEN]; 14978 14979 ip1dbg(("ipif_select_source_v4(%s, %s) -> NULL\n", 14980 ill->ill_name, 14981 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)))); 14982 } else { 14983 char buf1[INET6_ADDRSTRLEN]; 14984 char buf2[INET6_ADDRSTRLEN]; 14985 14986 ip1dbg(("ipif_select_source_v4(%s, %s) -> %s\n", 14987 ipif->ipif_ill->ill_name, 14988 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)), 14989 inet_ntop(AF_INET, &ipif->ipif_lcl_addr, 14990 buf2, sizeof (buf2)))); 14991 } 14992 #endif /* DEBUG */ 14993 return (ipif); 14994 } 14995 14996 /* 14997 * Pick a source address based on the destination ill and an optional setsrc 14998 * address. 14999 * The result is stored in srcp. If generation is set, then put the source 15000 * generation number there before we look for the source address (to avoid 15001 * missing changes in the set of source addresses). 15002 * If flagsp is set, then use it to pass back ipif_flags.
 *
 * If the caller wants to cache the returned source address and detect when
 * that might be stale, the caller should pass in a generation argument,
 * which the caller can later compare against ips_src_generation.
 *
 * The precedence order for selecting an IPv4 source address is:
 * - RTF_SETSRC on the offlink ire always wins.
 * - If usesrc is set, swap the ill to be the usesrc one.
 * - If IPMP is used on the ill, select a random address from the most
 *   preferred ones below:
 * 1. If onlink destination, same subnet and not deprecated, not ALL_ZONES
 * 2. Not deprecated, not ALL_ZONES
 * 3. If onlink destination, same subnet and not deprecated, ALL_ZONES
 * 4. Not deprecated, ALL_ZONES
 * 5. If onlink destination, same subnet and deprecated
 * 6. Deprecated.
 *
 * We have lower preference for ALL_ZONES IP addresses,
 * as they pose problems with unlabeled destinations.
 *
 * Note that when multiple IP addresses match, e.g., #1, we pick
 * the first one if IPMP is not in use. With IPMP we randomize.
 */
int
ip_select_source_v4(ill_t *ill, ipaddr_t setsrc, ipaddr_t dst,
    ipaddr_t multicast_ifaddr,
    zoneid_t zoneid, ip_stack_t *ipst, ipaddr_t *srcp,
    uint32_t *generation, uint64_t *flagsp)
{
	ipif_t *ipif;
	boolean_t notready = B_FALSE;	/* Set if !ipif_addr_ready found */

	if (flagsp != NULL)
		*flagsp = 0;

	/*
	 * Need to grab the generation number before we check to
	 * avoid a race with a change to the set of local addresses.
	 * No lock needed since the thread which updates the set of local
	 * addresses uses ipif/ill locks and exits those (hence a store memory
	 * barrier) before doing the atomic increase of ips_src_generation.
	 */
	if (generation != NULL) {
		*generation = ipst->ips_src_generation;
	}

	if (CLASSD(dst) && multicast_ifaddr != INADDR_ANY) {
		*srcp = multicast_ifaddr;
		return (0);
	}

	/* Was RTF_SETSRC set on the first IRE in the recursive lookup? */
	if (setsrc != INADDR_ANY) {
		*srcp = setsrc;
		return (0);
	}
	ipif = ipif_select_source_v4(ill, dst, zoneid, B_TRUE, &notready);
	if (ipif == NULL) {
		if (notready)
			return (ENETDOWN);
		else
			return (EADDRNOTAVAIL);
	}
	*srcp = ipif->ipif_lcl_addr;
	if (flagsp != NULL)
		*flagsp = ipif->ipif_flags;
	ipif_refrele(ipif);
	return (0);
}

/* ARGSUSED */
int
if_unitsel_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
{
	/*
	 * ill_phyint_reinit merged the v4 and v6 into a single
	 * ipsq. We might not have been able to complete the
	 * operation in ipif_set_values, if we could not become
	 * exclusive. If so restart it here.
	 */
	return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q));
}

/*
 * Can operate on either a module or a driver queue.
 * Returns an error if not a module queue.
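 *
 * For example (hypothetical values): if the lowest module's mi_idname is
 * "le" and the caller passed ppa 0, the interface name constructed below
 * is "le0".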
 */
/* ARGSUSED */
int
if_unitsel(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
{
	queue_t *q1 = q;
	char *cp;
	char interf_name[LIFNAMSIZ];
	uint_t ppa = *(uint_t *)mp->b_cont->b_cont->b_rptr;

	if (q->q_next == NULL) {
		ip1dbg(("if_unitsel: IF_UNITSEL: no q_next\n"));
		return (EINVAL);
	}

	if (((ill_t *)(q->q_ptr))->ill_name[0] != '\0')
		return (EALREADY);

	do {
		q1 = q1->q_next;
	} while (q1->q_next);
	cp = q1->q_qinfo->qi_minfo->mi_idname;
	(void) sprintf(interf_name, "%s%d", cp, ppa);

	/*
	 * Here we are not going to delay the ioack until after
	 * ACKs from DL_ATTACH_REQ/DL_BIND_REQ. So no need to save the
	 * original ioctl message before sending the requests.
	 */
	return (ipif_set_values(q, mp, interf_name, &ppa));
}

/* ARGSUSED */
int
ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
{
	return (ENXIO);
}

/*
 * Create any IRE_BROADCAST entries for `ipif', and store those entries in
 * `irep'. Returns a pointer to the next free `irep' entry.
 * A mirror exists in ipif_delete_bcast_ires().
 *
 * The management of any "extra" or seemingly duplicate IRE_BROADCASTs is
 * done in ire_add.
 */
static ire_t **
ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep)
{
	ipaddr_t addr;
	ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr);
	ipaddr_t subnetmask = ipif->ipif_net_mask;
	ill_t *ill = ipif->ipif_ill;
	zoneid_t zoneid = ipif->ipif_zoneid;

	ip1dbg(("ipif_create_bcast_ires: creating broadcast IREs\n"));

	ASSERT(ipif->ipif_flags & IPIF_BROADCAST);
	ASSERT(!(ipif->ipif_flags & IPIF_NOXMIT));

	if (ipif->ipif_lcl_addr == INADDR_ANY ||
	    (ipif->ipif_flags & IPIF_NOLOCAL))
		netmask = htonl(IN_CLASSA_NET);		/* fallback */

	irep = ire_create_bcast(ill, 0, zoneid, irep);
	irep = ire_create_bcast(ill, INADDR_BROADCAST, zoneid, irep);

	/*
	 * For backward compatibility, we create net broadcast IREs based on
	 * the old "IP address class system", since some old machines only
	 * respond to these class-derived net broadcasts. However, we must not
	 * create these net broadcast IREs if the subnetmask is shorter than
	 * the IP address class based derived netmask. Otherwise, we may
	 * create a net broadcast address which is the same as an IP address
	 * on the subnet -- and then TCP will refuse to talk to that address.
	 */
	if (netmask < subnetmask) {
		addr = netmask & ipif->ipif_subnet;
		irep = ire_create_bcast(ill, addr, zoneid, irep);
		irep = ire_create_bcast(ill, ~netmask | addr, zoneid, irep);
	}

	/*
	 * Don't create IRE_BROADCAST IREs for the interface if the subnetmask
	 * is 0xFFFFFFFF, as an IRE_LOCAL for that interface is already
	 * created. Creating these broadcast IREs will only create confusion
	 * as `addr' will be the same as the IP address.
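	 *
	 * As a worked example (hypothetical configuration): for an address
	 * of 10.1.2.3 with subnetmask 255.255.255.0, the class A netmask
	 * (255.0.0.0) is shorter than the subnetmask, so the block above
	 * also created the net broadcasts 10.0.0.0 and 10.255.255.255 in
	 * addition to 0.0.0.0 and 255.255.255.255; the check below then
	 * adds the subnet broadcasts 10.1.2.0 and 10.1.2.255.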
	 */
	if (subnetmask != 0xFFFFFFFF) {
		addr = ipif->ipif_subnet;
		irep = ire_create_bcast(ill, addr, zoneid, irep);
		irep = ire_create_bcast(ill, ~subnetmask | addr, zoneid, irep);
	}

	return (irep);
}

/*
 * Mirror of ipif_create_bcast_ires()
 */
static void
ipif_delete_bcast_ires(ipif_t *ipif)
{
	ipaddr_t addr;
	ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr);
	ipaddr_t subnetmask = ipif->ipif_net_mask;
	ill_t *ill = ipif->ipif_ill;
	zoneid_t zoneid = ipif->ipif_zoneid;
	ire_t *ire;

	ASSERT(ipif->ipif_flags & IPIF_BROADCAST);
	ASSERT(!(ipif->ipif_flags & IPIF_NOXMIT));

	if (ipif->ipif_lcl_addr == INADDR_ANY ||
	    (ipif->ipif_flags & IPIF_NOLOCAL))
		netmask = htonl(IN_CLASSA_NET);		/* fallback */

	ire = ire_lookup_bcast(ill, 0, zoneid);
	ASSERT(ire != NULL);
	ire_delete(ire); ire_refrele(ire);
	ire = ire_lookup_bcast(ill, INADDR_BROADCAST, zoneid);
	ASSERT(ire != NULL);
	ire_delete(ire); ire_refrele(ire);

	/*
	 * For backward compatibility, ipif_create_bcast_ires() creates net
	 * broadcast IREs based on the old "IP address class system", but
	 * only when the class-derived netmask is shorter than the subnetmask
	 * (see the comment there). Delete them under exactly the same
	 * conditions here.
	 */
	if (netmask < subnetmask) {
		addr = netmask & ipif->ipif_subnet;
		ire = ire_lookup_bcast(ill, addr, zoneid);
		ASSERT(ire != NULL);
		ire_delete(ire); ire_refrele(ire);
		ire = ire_lookup_bcast(ill, ~netmask | addr, zoneid);
		ASSERT(ire != NULL);
		ire_delete(ire); ire_refrele(ire);
	}

	/*
	 * ipif_create_bcast_ires() does not create subnet broadcast IREs
	 * when the subnetmask is 0xFFFFFFFF (an IRE_LOCAL already covers
	 * that address), so there is nothing to delete in that case either.
	 */
	if (subnetmask != 0xFFFFFFFF) {
		addr = ipif->ipif_subnet;
		ire = ire_lookup_bcast(ill, addr, zoneid);
		ASSERT(ire != NULL);
		ire_delete(ire); ire_refrele(ire);
		ire = ire_lookup_bcast(ill, ~subnetmask | addr, zoneid);
		ASSERT(ire != NULL);
		ire_delete(ire); ire_refrele(ire);
	}
}

/*
 * Extract the flags (including IFF_CANTCHANGE ones such as IFF_IPV*)
 * from lifr_flags and the name from lifr_name.
 * Set IFF_IPV* and ill_isv6 prior to doing the lookup
 * since ipif_lookup_on_name uses the _isv6 flags when matching.
 * Returns EINPROGRESS when mp has been consumed by queueing it on
 * ipx_pending_mp and the ioctl will complete in ip_rput.
 *
 * Can operate on either a module or a driver queue.
 * Returns an error if not a module queue.
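 *
 * For example (hypothetical values): ifconfig plumbing a v4 interface on a
 * broadcast-capable link passes lifr_flags containing IFF_IPV4 and
 * IFF_BROADCAST; passing both IFF_IPV4 and IFF_IPV6, or IFF_BROADCAST
 * together with IFF_IPV6, is rejected with EINVAL below.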
 */
/* ARGSUSED */
int
ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	ill_t *ill = q->q_ptr;
	phyint_t *phyi;
	ip_stack_t *ipst;
	struct lifreq *lifr = if_req;
	uint64_t new_flags;

	ASSERT(ipif != NULL);
	ip1dbg(("ip_sioctl_slifname %s\n", lifr->lifr_name));

	if (q->q_next == NULL) {
		ip1dbg(("ip_sioctl_slifname: SIOCSLIFNAME: no q_next\n"));
		return (EINVAL);
	}

	/*
	 * If we are not writer on 'q' then this interface exists already
	 * and previous lookups (ip_extract_lifreq()) found this ipif --
	 * so return EALREADY.
	 */
	if (ill != ipif->ipif_ill)
		return (EALREADY);

	if (ill->ill_name[0] != '\0')
		return (EALREADY);

	/*
	 * If there's another ill already with the requested name, ensure
	 * that it's of the same type. Otherwise, ill_phyint_reinit() will
	 * fuse together two unrelated ills, which will cause chaos.
	 */
	ipst = ill->ill_ipst;
	phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
	    lifr->lifr_name, NULL);
	if (phyi != NULL) {
		ill_t *ill_mate = phyi->phyint_illv4;

		if (ill_mate == NULL)
			ill_mate = phyi->phyint_illv6;
		ASSERT(ill_mate != NULL);

		if (ill_mate->ill_media->ip_m_mac_type !=
		    ill->ill_media->ip_m_mac_type) {
			ip1dbg(("ip_sioctl_slifname: SIOCSLIFNAME: attempt to "
			    "use the same ill name on differing media\n"));
			return (EINVAL);
		}
	}

	/*
	 * We start off as IFF_IPV4 in ipif_allocate and become
	 * IFF_IPV4 or IFF_IPV6 here depending on lifr_flags value.
	 * The only flags that we read from user space are IFF_IPV4,
	 * IFF_IPV6, and IFF_BROADCAST.
	 *
	 * This ill has not been inserted into the global list.
	 * So we are still single threaded and don't need any lock.
	 *
	 * Sanity check the flags.
	 */

	if ((lifr->lifr_flags & IFF_BROADCAST) &&
	    ((lifr->lifr_flags & IFF_IPV6) ||
	    (!ill->ill_needs_attach && ill->ill_bcast_addr_length == 0))) {
		ip1dbg(("ip_sioctl_slifname: link not broadcast capable "
		    "or IPv6 i.e., no broadcast\n"));
		return (EINVAL);
	}

	new_flags =
	    lifr->lifr_flags & (IFF_IPV6|IFF_IPV4|IFF_BROADCAST);

	if ((new_flags ^ (IFF_IPV6|IFF_IPV4)) == 0) {
		ip1dbg(("ip_sioctl_slifname: flags must be exactly one of "
		    "IFF_IPV4 or IFF_IPV6\n"));
		return (EINVAL);
	}

	/*
	 * We always start off as IPv4, so only need to check for IPv6.
	 */
	if ((new_flags & IFF_IPV6) != 0) {
		ill->ill_flags |= ILLF_IPV6;
		ill->ill_flags &= ~ILLF_IPV4;
	}

	if ((new_flags & IFF_BROADCAST) != 0)
		ipif->ipif_flags |= IPIF_BROADCAST;
	else
		ipif->ipif_flags &= ~IPIF_BROADCAST;

	/* We started off as V4. */
	if (ill->ill_flags & ILLF_IPV6) {
		ill->ill_phyint->phyint_illv6 = ill;
		ill->ill_phyint->phyint_illv4 = NULL;
	}

	return (ipif_set_values(q, mp, lifr->lifr_name, &lifr->lifr_ppa));
}

/* ARGSUSED */
int
ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	/*
	 * ill_phyint_reinit merged the v4 and v6 into a single
	 * ipsq.
	 * We might not have been able to complete the
	 * slifname in ipif_set_values, if we could not become
	 * exclusive. If so restart it here.
	 */
	return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q));
}

/*
 * Return a pointer to the ipif which matches the index, IP version type and
 * zoneid.
 */
ipif_t *
ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	ill_t *ill;
	ipif_t *ipif = NULL;

	ill = ill_lookup_on_ifindex(index, isv6, ipst);
	if (ill != NULL) {
		mutex_enter(&ill->ill_lock);
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if (!IPIF_IS_CONDEMNED(ipif) && (zoneid == ALL_ZONES ||
			    zoneid == ipif->ipif_zoneid ||
			    ipif->ipif_zoneid == ALL_ZONES)) {
				ipif_refhold_locked(ipif);
				break;
			}
		}
		mutex_exit(&ill->ill_lock);
		ill_refrele(ill);
	}
	return (ipif);
}

/*
 * Change an existing physical interface's index. If the new index
 * is acceptable we update the index and the phyint_list_avl_by_index tree.
 * Finally, we update other systems which may have a dependence on the
 * index value.
 */
/* ARGSUSED */
int
ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
	ill_t *ill;
	phyint_t *phyi;
	struct ifreq *ifr = (struct ifreq *)ifreq;
	struct lifreq *lifr = (struct lifreq *)ifreq;
	uint_t old_index, index;
	ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
	avl_index_t where;

	if (ipip->ipi_cmd_type == IF_CMD)
		index = ifr->ifr_index;
	else
		index = lifr->lifr_index;

	/*
	 * Only allow on the physical interface. Also, index zero is illegal.
	 */
	ill = ipif->ipif_ill;
	phyi = ill->ill_phyint;
	if (ipif->ipif_id != 0 || index == 0) {
		return (EINVAL);
	}

	/* If the index is not changing, no work to do */
	if (phyi->phyint_ifindex == index)
		return (0);

	/*
	 * Use phyint_exists() to determine if the new interface index
	 * is already in use. If the index is unused then we need to
	 * change the phyint's position in the phyint_list_avl_by_index
	 * tree. If we do not do this, subsequent lookups (using the new
	 * index value) will not find the phyint.
	 */
	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
	if (phyint_exists(index, ipst)) {
		rw_exit(&ipst->ips_ill_g_lock);
		return (EEXIST);
	}

	/*
	 * The new index is unused. Set it in the phyint. However we must not
	 * forget to trigger the NE_IFINDEX_CHANGE event before the ifindex
	 * changes, since the event must be bound to the old ifindex value.
15466 */ 15467 ill_nic_event_dispatch(ill, 0, NE_IFINDEX_CHANGE, 15468 &index, sizeof (index)); 15469 15470 old_index = phyi->phyint_ifindex; 15471 phyi->phyint_ifindex = index; 15472 15473 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, phyi); 15474 (void) avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 15475 &index, &where); 15476 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 15477 phyi, where); 15478 rw_exit(&ipst->ips_ill_g_lock); 15479 15480 /* Update SCTP's ILL list */ 15481 sctp_ill_reindex(ill, old_index); 15482 15483 /* Send the routing sockets message */ 15484 ip_rts_ifmsg(ipif, RTSQ_DEFAULT); 15485 if (ILL_OTHER(ill)) 15486 ip_rts_ifmsg(ILL_OTHER(ill)->ill_ipif, RTSQ_DEFAULT); 15487 15488 /* Perhaps ilgs should use this ill */ 15489 update_conn_ill(NULL, ill->ill_ipst); 15490 return (0); 15491 } 15492 15493 /* ARGSUSED */ 15494 int 15495 ip_sioctl_get_lifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15496 ip_ioctl_cmd_t *ipip, void *ifreq) 15497 { 15498 struct ifreq *ifr = (struct ifreq *)ifreq; 15499 struct lifreq *lifr = (struct lifreq *)ifreq; 15500 15501 ip1dbg(("ip_sioctl_get_lifindex(%s:%u %p)\n", 15502 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 15503 /* Get the interface index */ 15504 if (ipip->ipi_cmd_type == IF_CMD) { 15505 ifr->ifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 15506 } else { 15507 lifr->lifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 15508 } 15509 return (0); 15510 } 15511 15512 /* ARGSUSED */ 15513 int 15514 ip_sioctl_get_lifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15515 ip_ioctl_cmd_t *ipip, void *ifreq) 15516 { 15517 struct lifreq *lifr = (struct lifreq *)ifreq; 15518 15519 ip1dbg(("ip_sioctl_get_lifzone(%s:%u %p)\n", 15520 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 15521 /* Get the interface zone */ 15522 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 15523 lifr->lifr_zoneid = ipif->ipif_zoneid; 15524 return (0); 15525 } 15526 15527 /* 15528 * Set the zoneid of an interface. 15529 */ 15530 /* ARGSUSED */ 15531 int 15532 ip_sioctl_slifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15533 ip_ioctl_cmd_t *ipip, void *ifreq) 15534 { 15535 struct lifreq *lifr = (struct lifreq *)ifreq; 15536 int err = 0; 15537 boolean_t need_up = B_FALSE; 15538 zone_t *zptr; 15539 zone_status_t status; 15540 zoneid_t zoneid; 15541 15542 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 15543 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) { 15544 if (!is_system_labeled()) 15545 return (ENOTSUP); 15546 zoneid = GLOBAL_ZONEID; 15547 } 15548 15549 /* cannot assign instance zero to a non-global zone */ 15550 if (ipif->ipif_id == 0 && zoneid != GLOBAL_ZONEID) 15551 return (ENOTSUP); 15552 15553 /* 15554 * Cannot assign to a zone that doesn't exist or is shutting down. In 15555 * the event of a race with the zone shutdown processing, since IP 15556 * serializes this ioctl and SIOCGLIFCONF/SIOCLIFREMOVEIF, we know the 15557 * interface will be cleaned up even if the zone is shut down 15558 * immediately after the status check. If the interface can't be brought 15559 * down right away, and the zone is shut down before the restart 15560 * function is called, we resolve the possible races by rechecking the 15561 * zone status in the restart function. 
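	 *
	 * For example (hypothetical usage): `ifconfig hme0:1 zone myzone'
	 * run from the global zone issues this SIOCSLIFZONE ioctl with
	 * lifr_zoneid set to myzone's zone ID.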
	 */
	if ((zptr = zone_find_by_id(zoneid)) == NULL)
		return (EINVAL);
	status = zone_status_get(zptr);
	zone_rele(zptr);

	if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING)
		return (EINVAL);

	if (ipif->ipif_flags & IPIF_UP) {
		/*
		 * If the interface is already marked up,
		 * we call ipif_down which will take care
		 * of ditching any IREs that have been set
		 * up based on the old interface address.
		 */
		err = ipif_logical_down(ipif, q, mp);
		if (err == EINPROGRESS)
			return (err);
		(void) ipif_down_tail(ipif);
		need_up = B_TRUE;
	}

	err = ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, need_up);
	return (err);
}

static int
ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
    queue_t *q, mblk_t *mp, boolean_t need_up)
{
	int err = 0;
	ip_stack_t *ipst;

	ip1dbg(("ip_sioctl_slifzone_tail(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));

	if (CONN_Q(q))
		ipst = CONNQ_TO_IPST(q);
	else
		ipst = ILLQ_TO_IPST(q);

	/*
	 * For exclusive stacks we don't allow a zoneid other than the
	 * global zone's.
	 */
	if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID &&
	    zoneid != GLOBAL_ZONEID)
		return (EINVAL);

	/* Set the new zone id. */
	ipif->ipif_zoneid = zoneid;

	/* Update the sctp list */
	sctp_update_ipif(ipif, SCTP_IPIF_UPDATE);

	/* The default multicast interface might have changed */
	ire_increment_multicast_generation(ipst, ipif->ipif_ill->ill_isv6);

	if (need_up) {
		/*
		 * Now bring the interface back up. If this
		 * is the only IPIF for the ILL, ipif_up
		 * will have to re-bind to the device, so
		 * we may get back EINPROGRESS, in which
		 * case, this IOCTL will get completed in
		 * ip_rput_dlpi when we see the DL_BIND_ACK.
		 */
		err = ipif_up(ipif, q, mp);
	}
	return (err);
}

/* ARGSUSED */
int
ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	struct lifreq *lifr = (struct lifreq *)if_req;
	zoneid_t zoneid;
	zone_t *zptr;
	zone_status_t status;

	ASSERT(ipip->ipi_cmd_type == LIF_CMD);
	if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES)
		zoneid = GLOBAL_ZONEID;

	ip1dbg(("ip_sioctl_slifzone_restart(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));

	/*
	 * We recheck the zone status to resolve the following race condition:
	 * 1) process sends SIOCSLIFZONE to put hme0:1 in zone "myzone";
	 * 2) hme0:1 is up and can't be brought down right away;
	 *    ip_sioctl_slifzone() returns EINPROGRESS and the request is
	 *    queued;
	 * 3) zone "myzone" is halted; the zone status switches to
	 *    'shutting_down' and the zones framework sends SIOCGLIFCONF to
	 *    list the interfaces to remove - hme0:1 is not returned because
	 *    it's not yet in "myzone", so it won't be removed;
	 * 4) the restart function for SIOCSLIFZONE is called; without the
	 *    status check here, we would have hme0:1 in "myzone" after it's
	 *    been destroyed.
	 * Note that if the status check fails, we need to bring the interface
	 * back to its state prior to ip_sioctl_slifzone(), hence the call to
	 * ipif_up_done[_v6]().
	 */
	status = ZONE_IS_UNINITIALIZED;
	if ((zptr = zone_find_by_id(zoneid)) != NULL) {
		status = zone_status_get(zptr);
		zone_rele(zptr);
	}
	if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) {
		if (ipif->ipif_isv6) {
			(void) ipif_up_done_v6(ipif);
		} else {
			(void) ipif_up_done(ipif);
		}
		return (EINVAL);
	}

	(void) ipif_down_tail(ipif);

	return (ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp,
	    B_TRUE));
}

/*
 * Return the number of addresses on `ill' with one or more of the values
 * in `set' set and all of the values in `clear' clear.
 */
static uint_t
ill_flagaddr_cnt(const ill_t *ill, uint64_t set, uint64_t clear)
{
	ipif_t *ipif;
	uint_t cnt = 0;

	ASSERT(IAM_WRITER_ILL(ill));

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		if ((ipif->ipif_flags & set) && !(ipif->ipif_flags & clear))
			cnt++;

	return (cnt);
}

/*
 * Return the number of migratable addresses on `ill' that are under
 * application control.
 */
uint_t
ill_appaddr_cnt(const ill_t *ill)
{
	return (ill_flagaddr_cnt(ill, IPIF_DHCPRUNNING | IPIF_ADDRCONF,
	    IPIF_NOFAILOVER));
}

/*
 * Return the number of point-to-point addresses on `ill'.
 */
uint_t
ill_ptpaddr_cnt(const ill_t *ill)
{
	return (ill_flagaddr_cnt(ill, IPIF_POINTOPOINT, 0));
}

/* ARGSUSED */
int
ip_sioctl_get_lifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
	struct lifreq *lifr = ifreq;

	ASSERT(q->q_next == NULL);
	ASSERT(CONN_Q(q));

	ip1dbg(("ip_sioctl_get_lifusesrc(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	lifr->lifr_index = ipif->ipif_ill->ill_usesrc_ifindex;
	ip1dbg(("ip_sioctl_get_lifusesrc:lifr_index = %d\n", lifr->lifr_index));

	return (0);
}

/* Find the previous ILL in this usesrc group */
static ill_t *
ill_prev_usesrc(ill_t *uill)
{
	ill_t *ill;

	for (ill = uill->ill_usesrc_grp_next;
	    ASSERT(ill), ill->ill_usesrc_grp_next != uill;
	    ill = ill->ill_usesrc_grp_next)
		/* do nothing */;
	return (ill);
}

/*
 * Release all members of the usesrc group. This routine is called
 * from ill_delete when the interface being unplumbed is the
 * group head.
 *
 * This silently clears the usesrc that ifconfig set up.
 * An alternative would be to keep that ifindex, and drop packets on the floor
 * since no source address can be selected.
 * Even if we keep the current semantics, we don't need a lock and a linked
 * list: we could walk all the ills, checking whether they have an
 * ill_usesrc_ifindex matching the one that is being removed. The issue is
 * how we return the usesrc users (SIOCGLIFSRCOF): we want to be able to find
 * the ills which have an ill_usesrc_ifindex matching a target ill. We could
 * also do that with an ill walk, but the walker would need to insert into
 * the ioctl response.
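 *
 * A sketch of that alternative (illustrative only, not implemented here):
 *
 *	for (each ill in an ill walk)
 *		if (ill->ill_usesrc_ifindex == removed_ifindex)
 *			ill->ill_usesrc_ifindex = 0;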
 */
static void
ill_disband_usesrc_group(ill_t *uill)
{
	ill_t *next_ill, *tmp_ill;
	ip_stack_t *ipst = uill->ill_ipst;

	ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock));
	next_ill = uill->ill_usesrc_grp_next;

	do {
		ASSERT(next_ill != NULL);
		tmp_ill = next_ill->ill_usesrc_grp_next;
		ASSERT(tmp_ill != NULL);
		next_ill->ill_usesrc_grp_next = NULL;
		next_ill->ill_usesrc_ifindex = 0;
		next_ill = tmp_ill;
	} while (next_ill->ill_usesrc_ifindex != 0);
	uill->ill_usesrc_grp_next = NULL;
}

/*
 * Remove the client usesrc ILL from its current list and relink it into a
 * new list.
 */
int
ill_relink_usesrc_ills(ill_t *ucill, ill_t *uill, uint_t ifindex)
{
	ill_t *ill, *tmp_ill;
	ip_stack_t *ipst = ucill->ill_ipst;

	ASSERT((ucill != NULL) && (ucill->ill_usesrc_grp_next != NULL) &&
	    (uill != NULL) && RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock));

	/*
	 * Sanity check the arguments: the usesrc client ILL passed in must
	 * already be a usesrc client (ill_usesrc_ifindex != 0), and the
	 * usesrc ILL must not itself be a usesrc client.
	 */
	if ((ucill->ill_usesrc_ifindex == 0) ||
	    (uill->ill_usesrc_ifindex != 0)) {
		return (-1);
	}

	ill = ill_prev_usesrc(ucill);
	ASSERT(ill->ill_usesrc_grp_next != NULL);

	/* Remove from the current list */
	if (ill->ill_usesrc_grp_next->ill_usesrc_grp_next == ill) {
		/* Only two elements in the list */
		ASSERT(ill->ill_usesrc_ifindex == 0);
		ill->ill_usesrc_grp_next = NULL;
	} else {
		ill->ill_usesrc_grp_next = ucill->ill_usesrc_grp_next;
	}

	if (ifindex == 0) {
		ucill->ill_usesrc_ifindex = 0;
		ucill->ill_usesrc_grp_next = NULL;
		return (0);
	}

	ucill->ill_usesrc_ifindex = ifindex;
	tmp_ill = uill->ill_usesrc_grp_next;
	uill->ill_usesrc_grp_next = ucill;
	ucill->ill_usesrc_grp_next =
	    (tmp_ill != NULL) ? tmp_ill : uill;
	return (0);
}

/*
 * Set the ill_usesrc_ifindex and ill_usesrc_grp_next fields. See
 * synchronization notes in ip.c for locking details.
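 *
 * A hypothetical userland caller might look like (illustrative sketch
 * only):
 *
 *	struct lifreq lifr;
 *
 *	(void) strlcpy(lifr.lifr_name, "hme1", sizeof (lifr.lifr_name));
 *	lifr.lifr_index = if_nametoindex("hme0");
 *	if (ioctl(s, SIOCSLIFUSESRC, (caddr_t)&lifr) < 0)
 *		perror("SIOCSLIFUSESRC");
 *
 * after which hme1 (the usesrc client) selects its source addresses from
 * hme0.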
15845 */ 15846 /* ARGSUSED */ 15847 int 15848 ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15849 ip_ioctl_cmd_t *ipip, void *ifreq) 15850 { 15851 struct lifreq *lifr = (struct lifreq *)ifreq; 15852 boolean_t isv6 = B_FALSE, reset_flg = B_FALSE; 15853 ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill; 15854 int err = 0, ret; 15855 uint_t ifindex; 15856 ipsq_t *ipsq = NULL; 15857 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 15858 15859 ASSERT(IAM_WRITER_IPIF(ipif)); 15860 ASSERT(q->q_next == NULL); 15861 ASSERT(CONN_Q(q)); 15862 15863 isv6 = (Q_TO_CONN(q))->conn_family == AF_INET6; 15864 15865 ifindex = lifr->lifr_index; 15866 if (ifindex == 0) { 15867 if (usesrc_cli_ill->ill_usesrc_grp_next == NULL) { 15868 /* non usesrc group interface, nothing to reset */ 15869 return (0); 15870 } 15871 ifindex = usesrc_cli_ill->ill_usesrc_ifindex; 15872 /* valid reset request */ 15873 reset_flg = B_TRUE; 15874 } 15875 15876 usesrc_ill = ill_lookup_on_ifindex(ifindex, isv6, ipst); 15877 if (usesrc_ill == NULL) { 15878 return (ENXIO); 15879 } 15880 15881 ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl, 15882 NEW_OP, B_TRUE); 15883 if (ipsq == NULL) { 15884 err = EINPROGRESS; 15885 /* Operation enqueued on the ipsq of the usesrc ILL */ 15886 goto done; 15887 } 15888 15889 /* USESRC isn't currently supported with IPMP */ 15890 if (IS_IPMP(usesrc_ill) || IS_UNDER_IPMP(usesrc_ill)) { 15891 err = ENOTSUP; 15892 goto done; 15893 } 15894 15895 /* 15896 * USESRC isn't compatible with the STANDBY flag. (STANDBY is only 15897 * used by IPMP underlying interfaces, but someone might think it's 15898 * more general and try to use it independently with VNI.) 15899 */ 15900 if (usesrc_ill->ill_phyint->phyint_flags & PHYI_STANDBY) { 15901 err = ENOTSUP; 15902 goto done; 15903 } 15904 15905 /* 15906 * If the client is already in use as a usesrc_ill or a usesrc_ill is 15907 * already a client then return EINVAL 15908 */ 15909 if (IS_USESRC_ILL(usesrc_cli_ill) || IS_USESRC_CLI_ILL(usesrc_ill)) { 15910 err = EINVAL; 15911 goto done; 15912 } 15913 15914 /* 15915 * If the ill_usesrc_ifindex field is already set to what it needs to 15916 * be then this is a duplicate operation. 15917 */ 15918 if (!reset_flg && usesrc_cli_ill->ill_usesrc_ifindex == ifindex) { 15919 err = 0; 15920 goto done; 15921 } 15922 15923 ip1dbg(("ip_sioctl_slifusesrc: usesrc_cli_ill %s, usesrc_ill %s," 15924 " v6 = %d", usesrc_cli_ill->ill_name, usesrc_ill->ill_name, 15925 usesrc_ill->ill_isv6)); 15926 15927 /* 15928 * ill_g_usesrc_lock global lock protects the ill_usesrc_grp_next 15929 * and the ill_usesrc_ifindex fields 15930 */ 15931 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER); 15932 15933 if (reset_flg) { 15934 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 0); 15935 if (ret != 0) { 15936 err = EINVAL; 15937 } 15938 rw_exit(&ipst->ips_ill_g_usesrc_lock); 15939 goto done; 15940 } 15941 15942 /* 15943 * Four possibilities to consider: 15944 * 1. Both usesrc_ill and usesrc_cli_ill are not part of any usesrc grp 15945 * 2. usesrc_ill is part of a group but usesrc_cli_ill isn't 15946 * 3. usesrc_cli_ill is part of a group but usesrc_ill isn't 15947 * 4. 
Both are part of their respective usesrc groups 15948 */ 15949 if ((usesrc_ill->ill_usesrc_grp_next == NULL) && 15950 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) { 15951 ASSERT(usesrc_ill->ill_usesrc_ifindex == 0); 15952 usesrc_cli_ill->ill_usesrc_ifindex = ifindex; 15953 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill; 15954 usesrc_cli_ill->ill_usesrc_grp_next = usesrc_ill; 15955 } else if ((usesrc_ill->ill_usesrc_grp_next != NULL) && 15956 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) { 15957 usesrc_cli_ill->ill_usesrc_ifindex = ifindex; 15958 /* Insert at head of list */ 15959 usesrc_cli_ill->ill_usesrc_grp_next = 15960 usesrc_ill->ill_usesrc_grp_next; 15961 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill; 15962 } else { 15963 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 15964 ifindex); 15965 if (ret != 0) 15966 err = EINVAL; 15967 } 15968 rw_exit(&ipst->ips_ill_g_usesrc_lock); 15969 15970 done: 15971 if (ipsq != NULL) 15972 ipsq_exit(ipsq); 15973 /* The refrele on the lifr_name ipif is done by ip_process_ioctl */ 15974 ill_refrele(usesrc_ill); 15975 15976 /* Let conn_ixa caching know that source address selection changed */ 15977 ip_update_source_selection(ipst); 15978 15979 return (err); 15980 } 15981 15982 /* 15983 * comparison function used by avl. 15984 */ 15985 static int 15986 ill_phyint_compare_index(const void *index_ptr, const void *phyip) 15987 { 15988 15989 uint_t index; 15990 15991 ASSERT(phyip != NULL && index_ptr != NULL); 15992 15993 index = *((uint_t *)index_ptr); 15994 /* 15995 * let the phyint with the lowest index be on top. 15996 */ 15997 if (((phyint_t *)phyip)->phyint_ifindex < index) 15998 return (1); 15999 if (((phyint_t *)phyip)->phyint_ifindex > index) 16000 return (-1); 16001 return (0); 16002 } 16003 16004 /* 16005 * comparison function used by avl. 16006 */ 16007 static int 16008 ill_phyint_compare_name(const void *name_ptr, const void *phyip) 16009 { 16010 ill_t *ill; 16011 int res = 0; 16012 16013 ASSERT(phyip != NULL && name_ptr != NULL); 16014 16015 if (((phyint_t *)phyip)->phyint_illv4) 16016 ill = ((phyint_t *)phyip)->phyint_illv4; 16017 else 16018 ill = ((phyint_t *)phyip)->phyint_illv6; 16019 ASSERT(ill != NULL); 16020 16021 res = strcmp(ill->ill_name, (char *)name_ptr); 16022 if (res > 0) 16023 return (1); 16024 else if (res < 0) 16025 return (-1); 16026 return (0); 16027 } 16028 16029 /* 16030 * This function is called on the unplumb path via ill_glist_delete() when 16031 * there are no ills left on the phyint and thus the phyint can be freed. 16032 */ 16033 static void 16034 phyint_free(phyint_t *phyi) 16035 { 16036 ip_stack_t *ipst = PHYINT_TO_IPST(phyi); 16037 16038 ASSERT(phyi->phyint_illv4 == NULL && phyi->phyint_illv6 == NULL); 16039 16040 /* 16041 * If this phyint was an IPMP meta-interface, blow away the group. 16042 * This is safe to do because all of the illgrps have already been 16043 * removed by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find us. 16044 * If we're cleaning up as a result of failed initialization, 16045 * phyint_grp may be NULL. 16046 */ 16047 if ((phyi->phyint_flags & PHYI_IPMP) && (phyi->phyint_grp != NULL)) { 16048 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 16049 ipmp_grp_destroy(phyi->phyint_grp); 16050 phyi->phyint_grp = NULL; 16051 rw_exit(&ipst->ips_ipmp_lock); 16052 } 16053 16054 /* 16055 * If this interface was under IPMP, take it out of the group. 
 */
	if (phyi->phyint_grp != NULL)
		ipmp_phyint_leave_grp(phyi);

	/*
	 * Delete the phyint and disassociate its ipsq. The ipsq itself
	 * will be freed in ipsq_exit().
	 */
	phyi->phyint_ipsq->ipsq_phyint = NULL;
	phyi->phyint_name[0] = '\0';

	mi_free(phyi);
}

/*
 * Attach the ill to the phyint structure, which can be shared by both
 * IPv4 and IPv6 ills. ill_init allocates a phyint to just hold flags. This
 * function is called from ipif_set_values and ill_lookup_on_name (for
 * loopback) where we know the name of the ill. We look up the ill and if
 * there is one present already with the name, use that phyint. Otherwise
 * reuse the one allocated by ill_init.
 */
static void
ill_phyint_reinit(ill_t *ill)
{
	boolean_t isv6 = ill->ill_isv6;
	phyint_t *phyi_old;
	phyint_t *phyi;
	avl_index_t where = 0;
	ill_t *ill_other = NULL;
	ip_stack_t *ipst = ill->ill_ipst;

	ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));

	phyi_old = ill->ill_phyint;
	ASSERT(isv6 || (phyi_old->phyint_illv4 == ill &&
	    phyi_old->phyint_illv6 == NULL));
	ASSERT(!isv6 || (phyi_old->phyint_illv6 == ill &&
	    phyi_old->phyint_illv4 == NULL));
	ASSERT(phyi_old->phyint_ifindex == 0);

	/*
	 * Now that our ill has a name, set it in the phyint.
	 */
	(void) strlcpy(ill->ill_phyint->phyint_name, ill->ill_name, LIFNAMSIZ);

	phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
	    ill->ill_name, &where);

	/*
	 * 1. We grabbed the ill_g_lock before inserting this ill into
	 *    the global list of ills. So no other thread could have located
	 *    this ill and hence the ipsq of this ill is guaranteed to be
	 *    empty.
	 * 2. Now locate the other protocol instance of this ill.
	 * 3. Now grab both ill locks in the right order, and the phyint lock
	 *    of the new ipsq. Holding ill locks + ill_g_lock ensures that the
	 *    ipsq of neither ill can change.
	 * 4. Merge this ill's phyint (and thus its ipsq as well) onto the
	 *    other ill.
	 * 5. Release all locks.
	 */

	/*
	 * Look for IPv4 if we are initializing IPv6 or look for IPv6 if
	 * we are initializing IPv4.
	 */
	if (phyi != NULL) {
		ill_other = (isv6) ? phyi->phyint_illv4 : phyi->phyint_illv6;
		ASSERT(ill_other->ill_phyint != NULL);
		ASSERT((isv6 && !ill_other->ill_isv6) ||
		    (!isv6 && ill_other->ill_isv6));
		GRAB_ILL_LOCKS(ill, ill_other);
		/*
		 * We are potentially throwing away phyint_flags which
		 * could be different from the one that we obtain from
		 * ill_other->ill_phyint. But it is okay as we are assuming
		 * that the state maintained within IP is correct.
		 */
		mutex_enter(&phyi->phyint_lock);
		if (isv6) {
			ASSERT(phyi->phyint_illv6 == NULL);
			phyi->phyint_illv6 = ill;
		} else {
			ASSERT(phyi->phyint_illv4 == NULL);
			phyi->phyint_illv4 = ill;
		}

		/*
		 * Delete the old phyint and make its ipsq eligible
		 * to be freed in ipsq_exit().
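		 *
		 * (This is the case hit when, for example, "ifconfig hme0
		 * inet6 plumb" follows an existing IPv4 plumb of hme0: the
		 * new v6 ill finds the phyint named "hme0" by name, attaches
		 * itself as phyint_illv6, and the phyint allocated by
		 * ill_init is discarded.)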
		 */
		phyi_old->phyint_illv4 = NULL;
		phyi_old->phyint_illv6 = NULL;
		phyi_old->phyint_ipsq->ipsq_phyint = NULL;
		phyi_old->phyint_name[0] = '\0';
		mi_free(phyi_old);
	} else {
		mutex_enter(&ill->ill_lock);
		/*
		 * We don't need to acquire any lock, since
		 * the ill is not yet visible globally and we
		 * have not yet released the ill_g_lock.
		 */
		phyi = phyi_old;
		mutex_enter(&phyi->phyint_lock);
		/* XXX We need a recovery strategy here. */
		if (!phyint_assign_ifindex(phyi, ipst))
			cmn_err(CE_PANIC, "phyint_assign_ifindex() failed");

		avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
		    (void *)phyi, where);

		(void) avl_find(&ipst->ips_phyint_g_list->
		    phyint_list_avl_by_index,
		    &phyi->phyint_ifindex, &where);
		avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
		    (void *)phyi, where);
	}

	/*
	 * Reassigning ill_phyint automatically reassigns the ipsq also.
	 * The pending mp is not affected because that is kept on a per-ill
	 * basis.
	 */
	ill->ill_phyint = phyi;

	/*
	 * Now that the phyint's ifindex has been assigned, complete the
	 * remaining per-ill initialization that depends on it.
	 */
	ill->ill_ip_mib->ipIfStatsIfIndex = ill->ill_phyint->phyint_ifindex;
	if (ill->ill_isv6) {
		ill->ill_icmp6_mib->ipv6IfIcmpIfIndex =
		    ill->ill_phyint->phyint_ifindex;
		ill->ill_mcast_type = ipst->ips_mld_max_version;
	} else {
		ill->ill_mcast_type = ipst->ips_igmp_max_version;
	}

	/*
	 * Generate an event within the hooks framework to indicate that
	 * a new interface has just been added to IP. For this event to
	 * be generated, the network interface must, at least, have an
	 * ifindex assigned to it. (We don't generate the event for
	 * loopback since ill_lookup_on_name() has its own NE_PLUMB event.)
	 *
	 * This needs to be run inside the ill_g_lock perimeter to ensure
	 * that the ordering of delivered events to listeners matches the
	 * order of them in the kernel.
	 */
	if (!IS_LOOPBACK(ill)) {
		ill_nic_event_dispatch(ill, 0, NE_PLUMB, ill->ill_name,
		    ill->ill_name_length);
	}
	RELEASE_ILL_LOCKS(ill, ill_other);
	mutex_exit(&phyi->phyint_lock);
}

/*
 * Notify any downstream modules of the name of this interface.
 * An M_IOCTL is used even though we don't expect a successful reply.
 * Any reply message from the driver (presumably an M_IOCNAK) will
 * eventually get discarded somewhere upstream. The message format is
 * simply an SIOCSLIFNAME ioctl just as might be sent from ifconfig
 * to IP.
 */
static void
ip_ifname_notify(ill_t *ill, queue_t *q)
{
	mblk_t *mp1, *mp2;
	struct iocblk *iocp;
	struct lifreq *lifr;

	mp1 = mkiocb(SIOCSLIFNAME);
	if (mp1 == NULL)
		return;
	mp2 = allocb(sizeof (struct lifreq), BPRI_HI);
	if (mp2 == NULL) {
		freeb(mp1);
		return;
	}

	mp1->b_cont = mp2;
	iocp = (struct iocblk *)mp1->b_rptr;
	iocp->ioc_count = sizeof (struct lifreq);

	lifr = (struct lifreq *)mp2->b_rptr;
	mp2->b_wptr += sizeof (struct lifreq);
	bzero(lifr, sizeof (struct lifreq));

	(void) strncpy(lifr->lifr_name, ill->ill_name, LIFNAMSIZ);
	lifr->lifr_ppa = ill->ill_ppa;
	lifr->lifr_flags = (ill->ill_flags & (ILLF_IPV4|ILLF_IPV6));

	DTRACE_PROBE3(ill__dlpi, char *, "ip_ifname_notify",
	    char *, "SIOCSLIFNAME", ill_t *, ill);
	putnext(q, mp1);
}

static int
ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
{
	int err;
	ip_stack_t *ipst = ill->ill_ipst;
	phyint_t *phyi = ill->ill_phyint;

	/* Set the obsolete NDD per-interface forwarding name. */
	err = ill_set_ndd_name(ill);
	if (err != 0) {
		cmn_err(CE_WARN, "ipif_set_values: ill_set_ndd_name (%d)\n",
		    err);
	}

	/*
	 * Now that ill_name is set, the configuration for the IPMP
	 * meta-interface can be performed.
	 */
	if (IS_IPMP(ill)) {
		rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
		/*
		 * If phyi->phyint_grp is NULL, then this is the first IPMP
		 * meta-interface and we need to create the IPMP group.
		 */
		if (phyi->phyint_grp == NULL) {
			/*
			 * If someone has renamed another IPMP group to have
			 * the same name as our interface, bail.
			 */
			if (ipmp_grp_lookup(ill->ill_name, ipst) != NULL) {
				rw_exit(&ipst->ips_ipmp_lock);
				return (EEXIST);
			}
			phyi->phyint_grp = ipmp_grp_create(ill->ill_name,
			    phyi);
			if (phyi->phyint_grp == NULL) {
				rw_exit(&ipst->ips_ipmp_lock);
				return (ENOMEM);
			}
		}
		rw_exit(&ipst->ips_ipmp_lock);
	}

	/* Tell downstream modules where they are. */
	ip_ifname_notify(ill, q);

	/*
	 * ill_dl_phys returns EINPROGRESS in the usual case.
	 * Error cases are ENOMEM ...
	 */
	err = ill_dl_phys(ill, ipif, mp, q);

	if (ill->ill_isv6) {
		mutex_enter(&ipst->ips_mld_slowtimeout_lock);
		if (ipst->ips_mld_slowtimeout_id == 0) {
			ipst->ips_mld_slowtimeout_id = timeout(mld_slowtimo,
			    (void *)ipst,
			    MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
		}
		mutex_exit(&ipst->ips_mld_slowtimeout_lock);
	} else {
		mutex_enter(&ipst->ips_igmp_slowtimeout_lock);
		if (ipst->ips_igmp_slowtimeout_id == 0) {
			ipst->ips_igmp_slowtimeout_id = timeout(igmp_slowtimo,
			    (void *)ipst,
			    MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
		}
		mutex_exit(&ipst->ips_igmp_slowtimeout_lock);
	}

	return (err);
}

/*
 * Common routine for ppa and ifname setting. Should be called exclusive.
 *
 * Returns EINPROGRESS when mp has been consumed by queueing it on
 * ipx_pending_mp and the ioctl will complete in ip_rput.
 *
 * NOTE: If ppa is UINT_MAX, we assign the next valid ppa and return
 * the new name and new ppa in lifr_name and lifr_ppa respectively.
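 * For example (hypothetical values): if SIOCSLIFNAME passes lifr_name
 * "eri0" with lifr_ppa == UINT_MAX, the system picks the next free ppa for
 * the "eri" interface type, and the chosen name and ppa are copied back.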
 * For SLIFNAME, we pass these values back to userland.
 */
static int
ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr)
{
	ill_t *ill;
	ipif_t *ipif;
	ipsq_t *ipsq;
	char *ppa_ptr;
	char *old_ptr;
	char old_char;
	int error;
	ip_stack_t *ipst;

	ip1dbg(("ipif_set_values: interface %s\n", interf_name));
	ASSERT(q->q_next != NULL);
	ASSERT(interf_name != NULL);

	ill = (ill_t *)q->q_ptr;
	ipst = ill->ill_ipst;

	ASSERT(ill->ill_ipst != NULL);
	ASSERT(ill->ill_name[0] == '\0');
	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT((mi_strlen(interf_name) + 1) <= LIFNAMSIZ);
	ASSERT(ill->ill_ppa == UINT_MAX);

	ill->ill_defend_start = ill->ill_defend_count = 0;
	/* The ppa is sent down by ifconfig or is chosen */
	if ((ppa_ptr = ill_get_ppa_ptr(interf_name)) == NULL) {
		return (EINVAL);
	}

	/*
	 * Make sure the ppa passed in is the same as the ppa in the name.
	 * This check is not made when ppa == UINT_MAX; in that case the ppa
	 * in the name could be anything. The system will choose a ppa and
	 * update new_ppa_ptr and interf_name to contain the chosen ppa.
	 */
	if (*new_ppa_ptr != UINT_MAX) {
		/* stoi changes the pointer */
		old_ptr = ppa_ptr;
		/*
		 * ifconfig passed in 0 for the ppa for DLPI 1 style devices
		 * (they don't have an externally visible ppa). We assign one
		 * here so that we can manage the interface. Note that in
		 * the past this value was always 0 for DLPI 1 drivers.
		 */
		if (*new_ppa_ptr == 0)
			*new_ppa_ptr = stoi(&old_ptr);
		else if (*new_ppa_ptr != (uint_t)stoi(&old_ptr))
			return (EINVAL);
	}
	/*
	 * Terminate the string before the ppa and save the char at that
	 * location.
	 */
	old_char = ppa_ptr[0];
	ppa_ptr[0] = '\0';

	ill->ill_ppa = *new_ppa_ptr;
	/*
	 * Finish as much work now as possible before calling ill_glist_insert
	 * which makes the ill globally visible and also merges it with the
	 * other protocol instance of this phyint. The remaining work is
	 * done after entering the ipsq which may happen sometime later.
	 * ill_set_ndd_name occurs after the ill has been made globally
	 * visible.
	 */
	ipif = ill->ill_ipif;

	/* We didn't do this when we allocated ipif in ip_ll_subnet_defaults */
	ipif_assign_seqid(ipif);

	if (!(ill->ill_flags & (ILLF_IPV4|ILLF_IPV6)))
		ill->ill_flags |= ILLF_IPV4;

	ASSERT(ipif->ipif_next == NULL);	/* Only one ipif on ill */
	ASSERT((ipif->ipif_flags & IPIF_UP) == 0);

	if (ill->ill_flags & ILLF_IPV6) {

		ill->ill_isv6 = B_TRUE;
		ill_set_inputfn(ill);
		if (ill->ill_rq != NULL) {
			ill->ill_rq->q_qinfo = &iprinitv6;
		}

		/* Keep the !IN6_IS_ADDR_V4MAPPED assertions happy */
		ipif->ipif_v6lcl_addr = ipv6_all_zeros;
		ipif->ipif_v6subnet = ipv6_all_zeros;
		ipif->ipif_v6net_mask = ipv6_all_zeros;
		ipif->ipif_v6brd_addr = ipv6_all_zeros;
		ipif->ipif_v6pp_dst_addr = ipv6_all_zeros;
		ill->ill_reachable_retrans_time = ND_RETRANS_TIMER;
		/*
		 * Point-to-point or non-multicast capable
		 * interfaces won't do NUD unless explicitly
		 * configured to do so.
		 */
		if (ipif->ipif_flags & IPIF_POINTOPOINT ||
		    !(ill->ill_flags & ILLF_MULTICAST)) {
			ill->ill_flags |= ILLF_NONUD;
		}
		/*
		 * Make sure the IPv4-specific flag is not set on an IPv6
		 * interface.
		 */
		if (ill->ill_flags & ILLF_NOARP) {
			/*
			 * Note: xresolv interfaces will eventually need
			 * NOARP set here as well, but that will require
			 * those external resolvers to have some
			 * knowledge of that flag and act appropriately.
			 * Not to be changed at present.
			 */
			ill->ill_flags &= ~ILLF_NOARP;
		}
		/*
		 * Set the ILLF_ROUTER flag according to the global
		 * IPv6 forwarding policy.
		 */
		if (ipst->ips_ipv6_forward != 0)
			ill->ill_flags |= ILLF_ROUTER;
	} else if (ill->ill_flags & ILLF_IPV4) {
		ill->ill_isv6 = B_FALSE;
		ill_set_inputfn(ill);
		ill->ill_reachable_retrans_time = ARP_RETRANS_TIMER;
		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6lcl_addr);
		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6subnet);
		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6net_mask);
		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6brd_addr);
		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6pp_dst_addr);
		/*
		 * Set the ILLF_ROUTER flag according to the global
		 * IPv4 forwarding policy.
		 */
		if (ipst->ips_ip_g_forward != 0)
			ill->ill_flags |= ILLF_ROUTER;
	}

	ASSERT(ill->ill_phyint != NULL);

	/*
	 * The ipIfStatsIfindex and ipv6IfIcmpIfIndex assignments will
	 * be completed in ill_glist_insert -> ill_phyint_reinit
	 */
	if (!ill_allocate_mibs(ill))
		return (ENOMEM);

	/*
	 * Pick a default sap until we get the DL_INFO_ACK back from
	 * the driver.
	 */
	ill->ill_sap = (ill->ill_isv6) ? ill->ill_media->ip_m_ipv6sap :
	    ill->ill_media->ip_m_ipv4sap;

	ill->ill_ifname_pending = 1;
	ill->ill_ifname_pending_err = 0;

	/*
	 * When the first ipif comes up in ipif_up_done(), multicast groups
	 * that were joined while this ill was not bound to the DLPI link need
	 * to be recovered by ill_recover_multicast().
	 */
	ill->ill_need_recover_multicast = 1;

	ill_refhold(ill);
	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
	if ((error = ill_glist_insert(ill, interf_name,
	    (ill->ill_flags & ILLF_IPV6) == ILLF_IPV6)) > 0) {
		ill->ill_ppa = UINT_MAX;
		ill->ill_name[0] = '\0';
		/*
		 * Undo the null termination done above.
		 */
		ppa_ptr[0] = old_char;
		rw_exit(&ipst->ips_ill_g_lock);
		ill_refrele(ill);
		return (error);
	}

	ASSERT(ill->ill_name_length <= LIFNAMSIZ);

	/*
	 * When we return, the buffer pointed to by interf_name should contain
	 * the same name as in ill_name.
	 * If a ppa was chosen by the system (the ppa passed in was UINT_MAX),
	 * the buffer pointed to by new_ppa_ptr would not contain the right
	 * ppa, so copy the full name and update the ppa ptr.
	 * When the ppa passed in != UINT_MAX, all values are correct; just
	 * undo the null termination, which saves a bcopy.
	 */
	if (*new_ppa_ptr == UINT_MAX) {
		bcopy(ill->ill_name, interf_name, ill->ill_name_length);
		*new_ppa_ptr = ill->ill_ppa;
	} else {
		/*
		 * Undo the null termination done above.
16529 */ 16530 ppa_ptr[0] = old_char; 16531 } 16532 16533 /* Let SCTP know about this ILL */ 16534 sctp_update_ill(ill, SCTP_ILL_INSERT); 16535 16536 /* 16537 * ill_glist_insert has made the ill visible globally, and 16538 * ill_phyint_reinit could have changed the ipsq. At this point, 16539 * we need to hold the ips_ill_g_lock across the call to enter the 16540 * ipsq to enforce atomicity and prevent reordering. In the event 16541 * the ipsq has changed, and if the new ipsq is currently busy, 16542 * we need to make sure that this half-completed ioctl is ahead of 16543 * any subsequent ioctl. We achieve this by not dropping the 16544 * ips_ill_g_lock which prevents any ill lookup itself thereby 16545 * ensuring that new ioctls can't start. 16546 */ 16547 ipsq = ipsq_try_enter_internal(ill, q, mp, ip_reprocess_ioctl, NEW_OP, 16548 B_TRUE); 16549 16550 rw_exit(&ipst->ips_ill_g_lock); 16551 ill_refrele(ill); 16552 if (ipsq == NULL) 16553 return (EINPROGRESS); 16554 16555 /* 16556 * If ill_phyint_reinit() changed our ipsq, then start on the new ipsq. 16557 */ 16558 if (ipsq->ipsq_xop->ipx_current_ipif == NULL) 16559 ipsq_current_start(ipsq, ipif, SIOCSLIFNAME); 16560 else 16561 ASSERT(ipsq->ipsq_xop->ipx_current_ipif == ipif); 16562 16563 error = ipif_set_values_tail(ill, ipif, mp, q); 16564 ipsq_exit(ipsq); 16565 if (error != 0 && error != EINPROGRESS) { 16566 /* 16567 * restore previous values 16568 */ 16569 ill->ill_isv6 = B_FALSE; 16570 ill_set_inputfn(ill); 16571 } 16572 return (error); 16573 } 16574 16575 void 16576 ipif_init(ip_stack_t *ipst) 16577 { 16578 int i; 16579 16580 for (i = 0; i < MAX_G_HEADS; i++) { 16581 ipst->ips_ill_g_heads[i].ill_g_list_head = 16582 (ill_if_t *)&ipst->ips_ill_g_heads[i]; 16583 ipst->ips_ill_g_heads[i].ill_g_list_tail = 16584 (ill_if_t *)&ipst->ips_ill_g_heads[i]; 16585 } 16586 16587 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 16588 ill_phyint_compare_index, 16589 sizeof (phyint_t), 16590 offsetof(struct phyint, phyint_avl_by_index)); 16591 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 16592 ill_phyint_compare_name, 16593 sizeof (phyint_t), 16594 offsetof(struct phyint, phyint_avl_by_name)); 16595 } 16596 16597 /* 16598 * Save enough information so that we can recreate the IRE if 16599 * the interface goes down and then up. 
16600 */ 16601 void 16602 ill_save_ire(ill_t *ill, ire_t *ire) 16603 { 16604 mblk_t *save_mp; 16605 16606 save_mp = allocb(sizeof (ifrt_t), BPRI_MED); 16607 if (save_mp != NULL) { 16608 ifrt_t *ifrt; 16609 16610 save_mp->b_wptr += sizeof (ifrt_t); 16611 ifrt = (ifrt_t *)save_mp->b_rptr; 16612 bzero(ifrt, sizeof (ifrt_t)); 16613 ifrt->ifrt_type = ire->ire_type; 16614 if (ire->ire_ipversion == IPV4_VERSION) { 16615 ASSERT(!ill->ill_isv6); 16616 ifrt->ifrt_addr = ire->ire_addr; 16617 ifrt->ifrt_gateway_addr = ire->ire_gateway_addr; 16618 ifrt->ifrt_setsrc_addr = ire->ire_setsrc_addr; 16619 ifrt->ifrt_mask = ire->ire_mask; 16620 } else { 16621 ASSERT(ill->ill_isv6); 16622 ifrt->ifrt_v6addr = ire->ire_addr_v6; 16623 /* ire_gateway_addr_v6 can change due to RTM_CHANGE */ 16624 mutex_enter(&ire->ire_lock); 16625 ifrt->ifrt_v6gateway_addr = ire->ire_gateway_addr_v6; 16626 mutex_exit(&ire->ire_lock); 16627 ifrt->ifrt_v6setsrc_addr = ire->ire_setsrc_addr_v6; 16628 ifrt->ifrt_v6mask = ire->ire_mask_v6; 16629 } 16630 ifrt->ifrt_flags = ire->ire_flags; 16631 ifrt->ifrt_zoneid = ire->ire_zoneid; 16632 mutex_enter(&ill->ill_saved_ire_lock); 16633 save_mp->b_cont = ill->ill_saved_ire_mp; 16634 ill->ill_saved_ire_mp = save_mp; 16635 ill->ill_saved_ire_cnt++; 16636 mutex_exit(&ill->ill_saved_ire_lock); 16637 } 16638 } 16639 16640 /* 16641 * Remove one entry from ill_saved_ire_mp. 16642 */ 16643 void 16644 ill_remove_saved_ire(ill_t *ill, ire_t *ire) 16645 { 16646 mblk_t **mpp; 16647 mblk_t *mp; 16648 ifrt_t *ifrt; 16649 16650 /* Remove from ill_saved_ire_mp list if it is there */ 16651 mutex_enter(&ill->ill_saved_ire_lock); 16652 for (mpp = &ill->ill_saved_ire_mp; *mpp != NULL; 16653 mpp = &(*mpp)->b_cont) { 16654 in6_addr_t gw_addr_v6; 16655 16656 /* 16657 * On a given ill, the tuple of address, gateway, mask, 16658 * ire_type, and zoneid is unique for each saved IRE. 16659 */ 16660 mp = *mpp; 16661 ifrt = (ifrt_t *)mp->b_rptr; 16662 /* ire_gateway_addr_v6 can change - need lock */ 16663 mutex_enter(&ire->ire_lock); 16664 gw_addr_v6 = ire->ire_gateway_addr_v6; 16665 mutex_exit(&ire->ire_lock); 16666 16667 if (ifrt->ifrt_zoneid != ire->ire_zoneid || 16668 ifrt->ifrt_type != ire->ire_type) 16669 continue; 16670 16671 if (ill->ill_isv6 ? 16672 (IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6addr, 16673 &ire->ire_addr_v6) && 16674 IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6gateway_addr, 16675 &gw_addr_v6) && 16676 IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6mask, 16677 &ire->ire_mask_v6)) : 16678 (ifrt->ifrt_addr == ire->ire_addr && 16679 ifrt->ifrt_gateway_addr == ire->ire_gateway_addr && 16680 ifrt->ifrt_mask == ire->ire_mask)) { 16681 *mpp = mp->b_cont; 16682 ill->ill_saved_ire_cnt--; 16683 freeb(mp); 16684 break; 16685 } 16686 } 16687 mutex_exit(&ill->ill_saved_ire_lock); 16688 } 16689 16690 /* 16691 * IP multirouting broadcast routes handling 16692 * Append CGTP broadcast IREs to regular ones created 16693 * at ifconfig time. 16694 * The usage is a route add <cgtp_bc> <nic_bc> -multirt i.e., both 16695 * the destination and the gateway are broadcast addresses. 16696 * The caller has verified that the destination is an IRE_BROADCAST and that 16697 * RTF_MULTIRT was set. Here if the gateway is a broadcast address, then 16698 * we create a MULTIRT IRE_BROADCAST. 16699 * Note that the IRE_HOST created by ire_rt_add doesn't get found by anything 16700 * since the IRE_BROADCAST takes precedence; ire_add_v4 does head insertion. 
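 *
 * For example (hypothetical addresses), with
 *
 *	route add 10.1.1.255 10.2.2.255 -multirt
 *
 * where both 10.1.1.255 and 10.2.2.255 are subnet broadcast addresses, an
 * IRE_BROADCAST for 10.1.1.255 carrying RTF_MULTIRT is created on the ill
 * that owns 10.2.2.255.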
16701 */ 16702 static void 16703 ip_cgtp_bcast_add(ire_t *ire, ip_stack_t *ipst) 16704 { 16705 ire_t *ire_prim; 16706 16707 ASSERT(ire != NULL); 16708 16709 ire_prim = ire_ftable_lookup_v4(ire->ire_gateway_addr, 0, 0, 16710 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst, 16711 NULL); 16712 if (ire_prim != NULL) { 16713 /* 16714 * We are in the special case of broadcasts for 16715 * CGTP. We add an IRE_BROADCAST that holds 16716 * the RTF_MULTIRT flag, the destination 16717 * address and the low level 16718 * info of ire_prim. In other words, CGTP 16719 * broadcast is added to the redundant ipif. 16720 */ 16721 ill_t *ill_prim; 16722 ire_t *bcast_ire; 16723 16724 ill_prim = ire_prim->ire_ill; 16725 16726 ip2dbg(("ip_cgtp_filter_bcast_add: ire_prim %p, ill_prim %p\n", 16727 (void *)ire_prim, (void *)ill_prim)); 16728 16729 bcast_ire = ire_create( 16730 (uchar_t *)&ire->ire_addr, 16731 (uchar_t *)&ip_g_all_ones, 16732 (uchar_t *)&ire->ire_gateway_addr, 16733 IRE_BROADCAST, 16734 ill_prim, 16735 GLOBAL_ZONEID, /* CGTP is only for the global zone */ 16736 ire->ire_flags | RTF_KERNEL, 16737 NULL, 16738 ipst); 16739 16740 /* 16741 * Here we assume that ire_add does head insertion so that 16742 * the added IRE_BROADCAST comes before the existing IRE_HOST. 16743 */ 16744 if (bcast_ire != NULL) { 16745 if (ire->ire_flags & RTF_SETSRC) { 16746 bcast_ire->ire_setsrc_addr = 16747 ire->ire_setsrc_addr; 16748 } 16749 bcast_ire = ire_add(bcast_ire); 16750 if (bcast_ire != NULL) { 16751 ip2dbg(("ip_cgtp_filter_bcast_add: " 16752 "added bcast_ire %p\n", 16753 (void *)bcast_ire)); 16754 16755 ill_save_ire(ill_prim, bcast_ire); 16756 ire_refrele(bcast_ire); 16757 } 16758 } 16759 ire_refrele(ire_prim); 16760 } 16761 } 16762 16763 /* 16764 * IP multirouting broadcast routes handling 16765 * Remove the broadcast ire. 16766 * The usage is a route delete <cgtp_bc> <nic_bc> -multirt i.e., both 16767 * the destination and the gateway are broadcast addresses. 16768 * The caller has only verified that RTF_MULTIRT was set. We check 16769 * that the destination is broadcast and that the gateway is a broadcast 16770 * address, and if so delete the IRE added by ip_cgtp_bcast_add(). 16771 */ 16772 static void 16773 ip_cgtp_bcast_delete(ire_t *ire, ip_stack_t *ipst) 16774 { 16775 ASSERT(ire != NULL); 16776 16777 if (ip_type_v4(ire->ire_addr, ipst) == IRE_BROADCAST) { 16778 ire_t *ire_prim; 16779 16780 ire_prim = ire_ftable_lookup_v4(ire->ire_gateway_addr, 0, 0, 16781 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, 16782 ipst, NULL); 16783 if (ire_prim != NULL) { 16784 ill_t *ill_prim; 16785 ire_t *bcast_ire; 16786 16787 ill_prim = ire_prim->ire_ill; 16788 16789 ip2dbg(("ip_cgtp_filter_bcast_delete: " 16790 "ire_prim %p, ill_prim %p\n", 16791 (void *)ire_prim, (void *)ill_prim)); 16792 16793 bcast_ire = ire_ftable_lookup_v4(ire->ire_addr, 0, 16794 ire->ire_gateway_addr, IRE_BROADCAST, 16795 ill_prim, ALL_ZONES, NULL, 16796 MATCH_IRE_TYPE | MATCH_IRE_GW | MATCH_IRE_ILL | 16797 MATCH_IRE_MASK, 0, ipst, NULL); 16798 16799 if (bcast_ire != NULL) { 16800 ip2dbg(("ip_cgtp_filter_bcast_delete: " 16801 "looked up bcast_ire %p\n", 16802 (void *)bcast_ire)); 16803 ill_remove_saved_ire(bcast_ire->ire_ill, 16804 bcast_ire); 16805 ire_delete(bcast_ire); 16806 ire_refrele(bcast_ire); 16807 } 16808 ire_refrele(ire_prim); 16809 } 16810 } 16811 } 16812 16813 /* 16814 * Derive an interface id from the link layer address. 16815 * Knows about IEEE 802 and IEEE EUI-64 mappings. 
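 *
 * For illustration, a (made-up) MAC address 00:1b:21:0c:3d:4e yields the
 * interface id 02:1b:21:ff:fe:0c:3d:4e below: the bytes ff:fe are
 * inserted in the middle and the universal/local bit is toggled, per the
 * modified EUI-64 rules of RFC 4291 Appendix A.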
16816 */ 16817 static void 16818 ip_ether_v6intfid(ill_t *ill, in6_addr_t *v6addr) 16819 { 16820 char *addr; 16821 16822 /* 16823 * Note that some IPv6 interfaces get plumbed over links that claim to 16824 * be DL_ETHER, but don't actually have Ethernet MAC addresses (e.g. 16825 * PPP links). The ETHERADDRL check here ensures that we only set the 16826 * interface ID on IPv6 interfaces above links that actually have real 16827 * Ethernet addresses. 16828 */ 16829 if (ill->ill_phys_addr_length == ETHERADDRL) { 16830 /* Form EUI-64 like address */ 16831 addr = (char *)&v6addr->s6_addr32[2]; 16832 bcopy(ill->ill_phys_addr, addr, 3); 16833 addr[0] ^= 0x2; /* Toggle Universal/Local bit */ 16834 addr[3] = (char)0xff; 16835 addr[4] = (char)0xfe; 16836 bcopy(ill->ill_phys_addr + 3, addr + 5, 3); 16837 } 16838 } 16839 16840 /* ARGSUSED */ 16841 static void 16842 ip_nodef_v6intfid(ill_t *ill, in6_addr_t *v6addr) 16843 { 16844 } 16845 16846 typedef struct ipmp_ifcookie { 16847 uint32_t ic_hostid; 16848 char ic_ifname[LIFNAMSIZ]; 16849 char ic_zonename[ZONENAME_MAX]; 16850 } ipmp_ifcookie_t; 16851 16852 /* 16853 * Construct a pseudo-random interface ID for the IPMP interface that's both 16854 * predictable and (almost) guaranteed to be unique. 16855 */ 16856 static void 16857 ip_ipmp_v6intfid(ill_t *ill, in6_addr_t *v6addr) 16858 { 16859 zone_t *zp; 16860 uint8_t *addr; 16861 uchar_t hash[16]; 16862 ulong_t hostid; 16863 MD5_CTX ctx; 16864 ipmp_ifcookie_t ic = { 0 }; 16865 16866 ASSERT(IS_IPMP(ill)); 16867 16868 (void) ddi_strtoul(hw_serial, NULL, 10, &hostid); 16869 ic.ic_hostid = htonl((uint32_t)hostid); 16870 16871 (void) strlcpy(ic.ic_ifname, ill->ill_name, LIFNAMSIZ); 16872 16873 if ((zp = zone_find_by_id(ill->ill_zoneid)) != NULL) { 16874 (void) strlcpy(ic.ic_zonename, zp->zone_name, ZONENAME_MAX); 16875 zone_rele(zp); 16876 } 16877 16878 MD5Init(&ctx); 16879 MD5Update(&ctx, &ic, sizeof (ic)); 16880 MD5Final(hash, &ctx); 16881 16882 /* 16883 * Map the hash to an interface ID per the basic approach in RFC3041. 16884 */ 16885 addr = &v6addr->s6_addr8[8]; 16886 bcopy(hash + 8, addr, sizeof (uint64_t)); 16887 addr[0] &= ~0x2; /* set local bit */ 16888 } 16889 16890 /* 16891 * Map the multicast in6_addr_t in m_ip6addr to the physaddr for ethernet. 16892 */ 16893 static void 16894 ip_ether_v6_mapping(ill_t *ill, uchar_t *m_ip6addr, uchar_t *m_physaddr) 16895 { 16896 phyint_t *phyi = ill->ill_phyint; 16897 16898 /* 16899 * Check PHYI_MULTI_BCAST and length of physical 16900 * address to determine if we use the mapping or the 16901 * broadcast address. 16902 */ 16903 if ((phyi->phyint_flags & PHYI_MULTI_BCAST) != 0 || 16904 ill->ill_phys_addr_length != ETHERADDRL) { 16905 ip_mbcast_mapping(ill, m_ip6addr, m_physaddr); 16906 return; 16907 } 16908 m_physaddr[0] = 0x33; 16909 m_physaddr[1] = 0x33; 16910 m_physaddr[2] = m_ip6addr[12]; 16911 m_physaddr[3] = m_ip6addr[13]; 16912 m_physaddr[4] = m_ip6addr[14]; 16913 m_physaddr[5] = m_ip6addr[15]; 16914 } 16915 16916 /* 16917 * Map the multicast ipaddr_t in m_ipaddr to the physaddr for ethernet. 16918 */ 16919 static void 16920 ip_ether_v4_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr) 16921 { 16922 phyint_t *phyi = ill->ill_phyint; 16923 16924 /* 16925 * Check PHYI_MULTI_BCAST and length of physical 16926 * address to determine if we use the mapping or the 16927 * broadcast address. 
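 *
 * For example, the (hypothetical) group 224.1.2.3 maps below to
 * 01:00:5e:01:02:03: the fixed 01:00:5e prefix followed by the low-order
 * 23 bits of the group address, per RFC 1112 section 6.4.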
16928 */ 16929 if ((phyi->phyint_flags & PHYI_MULTI_BCAST) != 0 || 16930 ill->ill_phys_addr_length != ETHERADDRL) { 16931 ip_mbcast_mapping(ill, m_ipaddr, m_physaddr); 16932 return; 16933 } 16934 m_physaddr[0] = 0x01; 16935 m_physaddr[1] = 0x00; 16936 m_physaddr[2] = 0x5e; 16937 m_physaddr[3] = m_ipaddr[1] & 0x7f; 16938 m_physaddr[4] = m_ipaddr[2]; 16939 m_physaddr[5] = m_ipaddr[3]; 16940 } 16941 16942 /* ARGSUSED */ 16943 static void 16944 ip_mbcast_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr) 16945 { 16946 /* 16947 * for the MULTI_BCAST case and other cases when we want to 16948 * use the link-layer broadcast address for multicast. 16949 */ 16950 uint8_t *bphys_addr; 16951 dl_unitdata_req_t *dlur; 16952 16953 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr; 16954 if (ill->ill_sap_length < 0) { 16955 bphys_addr = (uchar_t *)dlur + 16956 dlur->dl_dest_addr_offset; 16957 } else { 16958 bphys_addr = (uchar_t *)dlur + 16959 dlur->dl_dest_addr_offset + ill->ill_sap_length; 16960 } 16961 16962 bcopy(bphys_addr, m_physaddr, ill->ill_phys_addr_length); 16963 } 16964 16965 /* 16966 * Derive IPoIB interface id from the link layer address. 16967 */ 16968 static void 16969 ip_ib_v6intfid(ill_t *ill, in6_addr_t *v6addr) 16970 { 16971 char *addr; 16972 16973 ASSERT(ill->ill_phys_addr_length == 20); 16974 addr = (char *)&v6addr->s6_addr32[2]; 16975 bcopy(ill->ill_phys_addr + 12, addr, 8); 16976 /* 16977 * In IBA 1.1 timeframe, some vendors erroneously set the u/l bit 16978 * in the globally assigned EUI-64 GUID to 1, in violation of IEEE 16979 * rules. In these cases, the IBA considers these GUIDs to be in 16980 * "Modified EUI-64" format, and thus toggling the u/l bit is not 16981 * required; vendors are required not to assign global EUI-64's 16982 * that differ only in u/l bit values, thus guaranteeing uniqueness 16983 * of the interface identifier. Whether the GUID is in modified 16984 * or proper EUI-64 format, the ipv6 identifier must have the u/l 16985 * bit set to 1. 16986 */ 16987 addr[0] |= 2; /* Set Universal/Local bit to 1 */ 16988 } 16989 16990 /* 16991 * Map the multicast ipaddr_t in m_ipaddr to the physaddr for InfiniBand. 16992 * Note on mapping from multicast IP addresses to IPoIB multicast link 16993 * addresses. IPoIB multicast link addresses are based on IBA link addresses. 16994 * The format of an IPoIB multicast address is: 16995 * 16996 * 4 byte QPN Scope Sign. Pkey 16997 * +--------------------------------------------+ 16998 * | 00FFFFFF | FF | 1X | X01B | Pkey | GroupID | 16999 * +--------------------------------------------+ 17000 * 17001 * The Scope and Pkey components are properties of the IBA port and 17002 * network interface. They can be ascertained from the broadcast address. 17003 * The Sign. part is the signature, and is 401B for IPv4 and 601B for IPv6. 17004 */ 17005 static void 17006 ip_ib_v4_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr) 17007 { 17008 static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff, 17009 0xff, 0x10, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, 17010 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; 17011 uint8_t *bphys_addr; 17012 dl_unitdata_req_t *dlur; 17013 17014 bcopy(ipv4_g_phys_ibmulti_addr, m_physaddr, ill->ill_phys_addr_length); 17015 17016 /* 17017 * RFC 4391: IPv4 MGID is 28-bit long. 
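 * Only the low-order 28 bits of the group address survive; e.g. the
 * (illustrative) group 239.1.2.3 contributes the GroupID bytes
 * 0f:01:02:03 below.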
17018 */
17019 m_physaddr[16] = m_ipaddr[0] & 0x0f;
17020 m_physaddr[17] = m_ipaddr[1];
17021 m_physaddr[18] = m_ipaddr[2];
17022 m_physaddr[19] = m_ipaddr[3];
17023
17024
17025 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
17026 if (ill->ill_sap_length < 0) {
17027 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset;
17028 } else {
17029 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset +
17030 ill->ill_sap_length;
17031 }
17032 /*
17033 * Now fill in the IBA scope/Pkey values from the broadcast address.
17034 */
17035 m_physaddr[5] = bphys_addr[5];
17036 m_physaddr[8] = bphys_addr[8];
17037 m_physaddr[9] = bphys_addr[9];
17038 }
17039
17040 static void
17041 ip_ib_v6_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr)
17042 {
17043 static uint8_t ipv6_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff,
17044 0xff, 0x10, 0x60, 0x1b, 0x00, 0x00, 0x00, 0x00,
17045 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
17046 uint8_t *bphys_addr;
17047 dl_unitdata_req_t *dlur;
17048
17049 bcopy(ipv6_g_phys_ibmulti_addr, m_physaddr, ill->ill_phys_addr_length);
17050
17051 /*
17052 * RFC 4391: IPv6 MGID is 80-bit long.
17053 */
17054 bcopy(&m_ipaddr[6], &m_physaddr[10], 10);
17055
17056 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
17057 if (ill->ill_sap_length < 0) {
17058 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset;
17059 } else {
17060 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset +
17061 ill->ill_sap_length;
17062 }
17063 /*
17064 * Now fill in the IBA scope/Pkey values from the broadcast address.
17065 */
17066 m_physaddr[5] = bphys_addr[5];
17067 m_physaddr[8] = bphys_addr[8];
17068 m_physaddr[9] = bphys_addr[9];
17069 }
17070
17071 /*
17072 * Derive IPv6 interface id from an IPv4 link-layer address (e.g. from an IPv4
17073 * tunnel). The IPv4 address simply gets placed in the lower 4 bytes of the
17074 * IPv6 interface id. This is a suggested mechanism described in section 3.7
17075 * of RFC4213.
17076 */
17077 static void
17078 ip_ipv4_genv6intfid(ill_t *ill, uint8_t *physaddr, in6_addr_t *v6addr)
17079 {
17080 ASSERT(ill->ill_phys_addr_length == sizeof (ipaddr_t));
17081 v6addr->s6_addr32[2] = 0;
17082 bcopy(physaddr, &v6addr->s6_addr32[3], sizeof (ipaddr_t));
17083 }
17084
17085 /*
17086 * Derive IPv6 interface id from an IPv6 link-layer address (e.g. from an IPv6
17087 * tunnel). The lower 8 bytes of the IPv6 address simply become the interface
17088 * id.
17089 */
17090 static void
17091 ip_ipv6_genv6intfid(ill_t *ill, uint8_t *physaddr, in6_addr_t *v6addr)
17092 {
17093 in6_addr_t *v6lladdr = (in6_addr_t *)physaddr;
17094
17095 ASSERT(ill->ill_phys_addr_length == sizeof (in6_addr_t));
17096 bcopy(&v6lladdr->s6_addr32[2], &v6addr->s6_addr32[2], 8);
17097 }
17098
17099 static void
17100 ip_ipv6_v6intfid(ill_t *ill, in6_addr_t *v6addr)
17101 {
17102 ip_ipv6_genv6intfid(ill, ill->ill_phys_addr, v6addr);
17103 }
17104
17105 static void
17106 ip_ipv6_v6destintfid(ill_t *ill, in6_addr_t *v6addr)
17107 {
17108 ip_ipv6_genv6intfid(ill, ill->ill_dest_addr, v6addr);
17109 }
17110
17111 static void
17112 ip_ipv4_v6intfid(ill_t *ill, in6_addr_t *v6addr)
17113 {
17114 ip_ipv4_genv6intfid(ill, ill->ill_phys_addr, v6addr);
17115 }
17116
17117 static void
17118 ip_ipv4_v6destintfid(ill_t *ill, in6_addr_t *v6addr)
17119 {
17120 ip_ipv4_genv6intfid(ill, ill->ill_dest_addr, v6addr);
17121 }
17122
17123 /*
17124 * Lookup an ill and verify that the zoneid has an ipif on that ill.
17125 * Returns a held ill, or NULL.
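 *
 * A minimal caller sketch (hypothetical, error handling elided):
 *
 *	ill = ill_lookup_on_ifindex_zoneid(ifindex, zoneid, B_FALSE, ipst);
 *	if (ill != NULL) {
 *		... use ill ...
 *		ill_refrele(ill);
 *	}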
17126 */
17127 ill_t *
17128 ill_lookup_on_ifindex_zoneid(uint_t index, zoneid_t zoneid, boolean_t isv6,
17129 ip_stack_t *ipst)
17130 {
17131 ill_t *ill;
17132 ipif_t *ipif;
17133
17134 ill = ill_lookup_on_ifindex(index, isv6, ipst);
17135 if (ill == NULL)
17136 return (NULL);
17137
17138 mutex_enter(&ill->ill_lock);
17139 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
17140 if (IPIF_IS_CONDEMNED(ipif))
17141 continue;
17142 if (zoneid != ALL_ZONES && ipif->ipif_zoneid != zoneid &&
17143 ipif->ipif_zoneid != ALL_ZONES)
17144 continue;
17145
17146 mutex_exit(&ill->ill_lock);
17147 return (ill);
17148 }
17149 mutex_exit(&ill->ill_lock);
17150 ill_refrele(ill);
17151 return (NULL);
17152 }
17153
17154 /*
17155 * Return a pointer to an ipif_t given a combination of (ill_idx, ipif_id).
17156 * If a pointer to an ipif_t is returned then the caller will need to do
17157 * an ipif_refrele().
17158 */
17159 ipif_t *
17160 ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6,
17161 ip_stack_t *ipst)
17162 {
17163 ipif_t *ipif;
17164 ill_t *ill;
17165
17166 ill = ill_lookup_on_ifindex(ifindex, isv6, ipst);
17167 if (ill == NULL)
17168 return (NULL);
17169
17170 mutex_enter(&ill->ill_lock);
17171 if (ill->ill_state_flags & ILL_CONDEMNED) {
17172 mutex_exit(&ill->ill_lock);
17173 ill_refrele(ill);
17174 return (NULL);
17175 }
17176
17177 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
17178 if (!IPIF_CAN_LOOKUP(ipif))
17179 continue;
17180 if (lifidx == ipif->ipif_id) {
17181 ipif_refhold_locked(ipif);
17182 break;
17183 }
17184 }
17185
17186 mutex_exit(&ill->ill_lock);
17187 ill_refrele(ill);
17188 return (ipif);
17189 }
17190
17191 /*
17192 * Set ill_inputfn based on the currently known state.
17193 * This needs to be called when any of the factors taken into
17194 * account changes.
17195 */
17196 void
17197 ill_set_inputfn(ill_t *ill)
17198 {
17199 ip_stack_t *ipst = ill->ill_ipst;
17200
17201 if (ill->ill_isv6) {
17202 if (is_system_labeled())
17203 ill->ill_inputfn = ill_input_full_v6;
17204 else
17205 ill->ill_inputfn = ill_input_short_v6;
17206 } else {
17207 if (is_system_labeled())
17208 ill->ill_inputfn = ill_input_full_v4;
17209 else if (ill->ill_dhcpinit != 0)
17210 ill->ill_inputfn = ill_input_full_v4;
17211 else if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head
17212 != NULL)
17213 ill->ill_inputfn = ill_input_full_v4;
17214 else if (ipst->ips_ip_cgtp_filter &&
17215 ipst->ips_ip_cgtp_filter_ops != NULL)
17216 ill->ill_inputfn = ill_input_full_v4;
17217 else
17218 ill->ill_inputfn = ill_input_short_v4;
17219 }
17220 }
17221
17222 /*
17223 * Re-evaluate ill_inputfn for all the IPv4 ills.
17224 * Used when RSVP and CGTP come and go.
17225 */
17226 void
17227 ill_set_inputfn_all(ip_stack_t *ipst)
17228 {
17229 ill_walk_context_t ctx;
17230 ill_t *ill;
17231
17232 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
17233 ill = ILL_START_WALK_V4(&ctx, ipst);
17234 for (; ill != NULL; ill = ill_next(&ctx, ill))
17235 ill_set_inputfn(ill);
17236
17237 rw_exit(&ipst->ips_ill_g_lock);
17238 }
17239
17240 /*
17241 * Set the physical address information for `ill' to the contents of the
17242 * dl_notify_ind_t pointed to by `mp'. Must be called as writer, and will be
17243 * asynchronous if `ill' cannot immediately be quiesced -- in which case
17244 * EINPROGRESS will be returned.
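 *
 * A caller sketch (hypothetical):
 *
 *	if ((err = ill_set_phys_addr(ill, mp)) == EINPROGRESS)
 *		return;		(completion resumes via the ipsq)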
17245 */ 17246 int 17247 ill_set_phys_addr(ill_t *ill, mblk_t *mp) 17248 { 17249 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 17250 dl_notify_ind_t *dlindp = (dl_notify_ind_t *)mp->b_rptr; 17251 17252 ASSERT(IAM_WRITER_IPSQ(ipsq)); 17253 17254 if (dlindp->dl_data != DL_IPV6_LINK_LAYER_ADDR && 17255 dlindp->dl_data != DL_CURR_DEST_ADDR && 17256 dlindp->dl_data != DL_CURR_PHYS_ADDR) { 17257 /* Changing DL_IPV6_TOKEN is not yet supported */ 17258 return (0); 17259 } 17260 17261 /* 17262 * We need to store up to two copies of `mp' in `ill'. Due to the 17263 * design of ipsq_pending_mp_add(), we can't pass them as separate 17264 * arguments to ill_set_phys_addr_tail(). Instead, chain them 17265 * together here, then pull 'em apart in ill_set_phys_addr_tail(). 17266 */ 17267 if ((mp = copyb(mp)) == NULL || (mp->b_cont = copyb(mp)) == NULL) { 17268 freemsg(mp); 17269 return (ENOMEM); 17270 } 17271 17272 ipsq_current_start(ipsq, ill->ill_ipif, 0); 17273 mutex_enter(&ill->ill_lock); 17274 ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS; 17275 /* no more nce addition allowed */ 17276 mutex_exit(&ill->ill_lock); 17277 17278 /* 17279 * If we can quiesce the ill, then set the address. If not, then 17280 * ill_set_phys_addr_tail() will be called from ipif_ill_refrele_tail(). 17281 */ 17282 ill_down_ipifs(ill, B_TRUE); 17283 mutex_enter(&ill->ill_lock); 17284 if (!ill_is_quiescent(ill)) { 17285 /* call cannot fail since `conn_t *' argument is NULL */ 17286 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq, 17287 mp, ILL_DOWN); 17288 mutex_exit(&ill->ill_lock); 17289 return (EINPROGRESS); 17290 } 17291 mutex_exit(&ill->ill_lock); 17292 17293 ill_set_phys_addr_tail(ipsq, ill->ill_rq, mp, NULL); 17294 return (0); 17295 } 17296 17297 /* 17298 * Once the ill associated with `q' has quiesced, set its physical address 17299 * information to the values in `addrmp'. Note that two copies of `addrmp' 17300 * are passed (linked by b_cont), since we sometimes need to save two distinct 17301 * copies in the ill_t, and our context doesn't permit sleeping or allocation 17302 * failure (we'll free the other copy if it's not needed). Since the ill_t 17303 * is quiesced, we know any stale nce's with the old address information have 17304 * already been removed, so we don't need to call nce_flush(). 
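 *
 * (Schematically: `addrmp' -b_cont-> `addrmp2'; the unlinkb() below
 * splits the pair so that each case can consume or free its copy
 * independently.)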
17305 */ 17306 /* ARGSUSED */ 17307 static void 17308 ill_set_phys_addr_tail(ipsq_t *ipsq, queue_t *q, mblk_t *addrmp, void *dummy) 17309 { 17310 ill_t *ill = q->q_ptr; 17311 mblk_t *addrmp2 = unlinkb(addrmp); 17312 dl_notify_ind_t *dlindp = (dl_notify_ind_t *)addrmp->b_rptr; 17313 uint_t addrlen, addroff; 17314 int status; 17315 17316 ASSERT(IAM_WRITER_IPSQ(ipsq)); 17317 17318 addroff = dlindp->dl_addr_offset; 17319 addrlen = dlindp->dl_addr_length - ABS(ill->ill_sap_length); 17320 17321 switch (dlindp->dl_data) { 17322 case DL_IPV6_LINK_LAYER_ADDR: 17323 ill_set_ndmp(ill, addrmp, addroff, addrlen); 17324 freemsg(addrmp2); 17325 break; 17326 17327 case DL_CURR_DEST_ADDR: 17328 freemsg(ill->ill_dest_addr_mp); 17329 ill->ill_dest_addr = addrmp->b_rptr + addroff; 17330 ill->ill_dest_addr_mp = addrmp; 17331 if (ill->ill_isv6) { 17332 ill_setdesttoken(ill); 17333 ipif_setdestlinklocal(ill->ill_ipif); 17334 } 17335 freemsg(addrmp2); 17336 break; 17337 17338 case DL_CURR_PHYS_ADDR: 17339 freemsg(ill->ill_phys_addr_mp); 17340 ill->ill_phys_addr = addrmp->b_rptr + addroff; 17341 ill->ill_phys_addr_mp = addrmp; 17342 ill->ill_phys_addr_length = addrlen; 17343 if (ill->ill_isv6) 17344 ill_set_ndmp(ill, addrmp2, addroff, addrlen); 17345 else 17346 freemsg(addrmp2); 17347 if (ill->ill_isv6) { 17348 ill_setdefaulttoken(ill); 17349 ipif_setlinklocal(ill->ill_ipif); 17350 } 17351 break; 17352 default: 17353 ASSERT(0); 17354 } 17355 17356 /* 17357 * If there are ipifs to bring up, ill_up_ipifs() will return 17358 * EINPROGRESS, and ipsq_current_finish() will be called by 17359 * ip_rput_dlpi_writer() or arp_bringup_done() when the last ipif is 17360 * brought up. 17361 */ 17362 status = ill_up_ipifs(ill, q, addrmp); 17363 mutex_enter(&ill->ill_lock); 17364 if (ill->ill_dl_up) 17365 ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS; 17366 mutex_exit(&ill->ill_lock); 17367 if (status != EINPROGRESS) 17368 ipsq_current_finish(ipsq); 17369 } 17370 17371 /* 17372 * Helper routine for setting the ill_nd_lla fields. 17373 */ 17374 void 17375 ill_set_ndmp(ill_t *ill, mblk_t *ndmp, uint_t addroff, uint_t addrlen) 17376 { 17377 freemsg(ill->ill_nd_lla_mp); 17378 ill->ill_nd_lla = ndmp->b_rptr + addroff; 17379 ill->ill_nd_lla_mp = ndmp; 17380 ill->ill_nd_lla_len = addrlen; 17381 } 17382 17383 /* 17384 * Replumb the ill. 17385 */ 17386 int 17387 ill_replumb(ill_t *ill, mblk_t *mp) 17388 { 17389 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 17390 17391 ASSERT(IAM_WRITER_IPSQ(ipsq)); 17392 17393 ipsq_current_start(ipsq, ill->ill_ipif, 0); 17394 17395 mutex_enter(&ill->ill_lock); 17396 ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS; 17397 /* no more nce addition allowed */ 17398 mutex_exit(&ill->ill_lock); 17399 17400 /* 17401 * If we can quiesce the ill, then continue. If not, then 17402 * ill_replumb_tail() will be called from ipif_ill_refrele_tail(). 
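 * (As with ill_set_phys_addr(), a return of EINPROGRESS signals
 * deferred completion to the caller.)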
17403 */ 17404 ill_down_ipifs(ill, B_FALSE); 17405 17406 mutex_enter(&ill->ill_lock); 17407 if (!ill_is_quiescent(ill)) { 17408 /* call cannot fail since `conn_t *' argument is NULL */ 17409 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq, 17410 mp, ILL_DOWN); 17411 mutex_exit(&ill->ill_lock); 17412 return (EINPROGRESS); 17413 } 17414 mutex_exit(&ill->ill_lock); 17415 17416 ill_replumb_tail(ipsq, ill->ill_rq, mp, NULL); 17417 return (0); 17418 } 17419 17420 /* ARGSUSED */ 17421 static void 17422 ill_replumb_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy) 17423 { 17424 ill_t *ill = q->q_ptr; 17425 int err; 17426 conn_t *connp = NULL; 17427 17428 ASSERT(IAM_WRITER_IPSQ(ipsq)); 17429 freemsg(ill->ill_replumb_mp); 17430 ill->ill_replumb_mp = copyb(mp); 17431 17432 if (ill->ill_replumb_mp == NULL) { 17433 /* out of memory */ 17434 ipsq_current_finish(ipsq); 17435 return; 17436 } 17437 17438 mutex_enter(&ill->ill_lock); 17439 ill->ill_up_ipifs = ipsq_pending_mp_add(NULL, ill->ill_ipif, 17440 ill->ill_rq, ill->ill_replumb_mp, 0); 17441 mutex_exit(&ill->ill_lock); 17442 17443 if (!ill->ill_up_ipifs) { 17444 /* already closing */ 17445 ipsq_current_finish(ipsq); 17446 return; 17447 } 17448 ill->ill_replumbing = 1; 17449 err = ill_down_ipifs_tail(ill); 17450 17451 /* 17452 * Successfully quiesced and brought down the interface, now we send 17453 * the DL_NOTE_REPLUMB_DONE message down to the driver. Reuse the 17454 * DL_NOTE_REPLUMB message. 17455 */ 17456 mp = mexchange(NULL, mp, sizeof (dl_notify_conf_t), M_PROTO, 17457 DL_NOTIFY_CONF); 17458 ASSERT(mp != NULL); 17459 ((dl_notify_conf_t *)mp->b_rptr)->dl_notification = 17460 DL_NOTE_REPLUMB_DONE; 17461 ill_dlpi_send(ill, mp); 17462 17463 /* 17464 * For IPv4, we would usually get EINPROGRESS because the ETHERTYPE_ARP 17465 * streams have to be unbound. When all the DLPI exchanges are done, 17466 * ipsq_current_finish() will be called by arp_bringup_done(). The 17467 * remainder of ipif bringup via ill_up_ipifs() will also be done in 17468 * arp_bringup_done(). 17469 */ 17470 ASSERT(ill->ill_replumb_mp != NULL); 17471 if (err == EINPROGRESS) 17472 return; 17473 else 17474 ill->ill_replumb_mp = ipsq_pending_mp_get(ipsq, &connp); 17475 ASSERT(connp == NULL); 17476 if (err == 0 && ill->ill_replumb_mp != NULL && 17477 ill_up_ipifs(ill, q, ill->ill_replumb_mp) == EINPROGRESS) { 17478 return; 17479 } 17480 ipsq_current_finish(ipsq); 17481 } 17482 17483 /* 17484 * Issue ioctl `cmd' on `lh'; caller provides the initial payload in `buf' 17485 * which is `bufsize' bytes. On success, zero is returned and `buf' updated 17486 * as per the ioctl. On failure, an errno is returned. 17487 */ 17488 static int 17489 ip_ioctl(ldi_handle_t lh, int cmd, void *buf, uint_t bufsize, cred_t *cr) 17490 { 17491 int rval; 17492 struct strioctl iocb; 17493 17494 iocb.ic_cmd = cmd; 17495 iocb.ic_timout = 15; 17496 iocb.ic_len = bufsize; 17497 iocb.ic_dp = buf; 17498 17499 return (ldi_ioctl(lh, I_STR, (intptr_t)&iocb, FKIOCTL, cr, &rval)); 17500 } 17501 17502 /* 17503 * Issue an SIOCGLIFCONF for address family `af' and store the result into a 17504 * dynamically-allocated `lifcp' that will be `bufsizep' bytes on success. 
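 * The caller is responsible for freeing the buffer, e.g. (sketch):
 *
 *	if (ip_lifconf_ioctl(lh, AF_INET, &lifc, &bufsize, cr) == 0) {
 *		... walk lifc.lifc_req ...
 *		kmem_free(lifc.lifc_buf, bufsize);
 *	}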
17505 */ 17506 static int 17507 ip_lifconf_ioctl(ldi_handle_t lh, int af, struct lifconf *lifcp, 17508 uint_t *bufsizep, cred_t *cr) 17509 { 17510 int err; 17511 struct lifnum lifn; 17512 17513 bzero(&lifn, sizeof (lifn)); 17514 lifn.lifn_family = af; 17515 lifn.lifn_flags = LIFC_UNDER_IPMP; 17516 17517 if ((err = ip_ioctl(lh, SIOCGLIFNUM, &lifn, sizeof (lifn), cr)) != 0) 17518 return (err); 17519 17520 /* 17521 * Pad the interface count to account for additional interfaces that 17522 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF. 17523 */ 17524 lifn.lifn_count += 4; 17525 bzero(lifcp, sizeof (*lifcp)); 17526 lifcp->lifc_flags = LIFC_UNDER_IPMP; 17527 lifcp->lifc_family = af; 17528 lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq); 17529 lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP); 17530 17531 err = ip_ioctl(lh, SIOCGLIFCONF, lifcp, sizeof (*lifcp), cr); 17532 if (err != 0) { 17533 kmem_free(lifcp->lifc_buf, *bufsizep); 17534 return (err); 17535 } 17536 17537 return (0); 17538 } 17539 17540 /* 17541 * Helper for ip_interface_cleanup() that removes the loopback interface. 17542 */ 17543 static void 17544 ip_loopback_removeif(ldi_handle_t lh, boolean_t isv6, cred_t *cr) 17545 { 17546 int err; 17547 struct lifreq lifr; 17548 17549 bzero(&lifr, sizeof (lifr)); 17550 (void) strcpy(lifr.lifr_name, ipif_loopback_name); 17551 17552 err = ip_ioctl(lh, SIOCLIFREMOVEIF, &lifr, sizeof (lifr), cr); 17553 if (err != 0) { 17554 ip0dbg(("ip_loopback_removeif: IP%s SIOCLIFREMOVEIF failed: " 17555 "error %d\n", isv6 ? "v6" : "v4", err)); 17556 } 17557 } 17558 17559 /* 17560 * Helper for ip_interface_cleanup() that ensures no IP interfaces are in IPMP 17561 * groups and that IPMP data addresses are down. These conditions must be met 17562 * so that IPMP interfaces can be I_PUNLINK'd, as per ip_sioctl_plink_ipmp(). 17563 */ 17564 static void 17565 ip_ipmp_cleanup(ldi_handle_t lh, boolean_t isv6, cred_t *cr) 17566 { 17567 int af = isv6 ? AF_INET6 : AF_INET; 17568 int i, nifs; 17569 int err; 17570 uint_t bufsize; 17571 uint_t lifrsize = sizeof (struct lifreq); 17572 struct lifconf lifc; 17573 struct lifreq *lifrp; 17574 17575 if ((err = ip_lifconf_ioctl(lh, af, &lifc, &bufsize, cr)) != 0) { 17576 cmn_err(CE_WARN, "ip_ipmp_cleanup: cannot get interface list " 17577 "(error %d); any IPMP interfaces cannot be shutdown", err); 17578 return; 17579 } 17580 17581 nifs = lifc.lifc_len / lifrsize; 17582 for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) { 17583 err = ip_ioctl(lh, SIOCGLIFFLAGS, lifrp, lifrsize, cr); 17584 if (err != 0) { 17585 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot get " 17586 "flags: error %d", lifrp->lifr_name, err); 17587 continue; 17588 } 17589 17590 if (lifrp->lifr_flags & IFF_IPMP) { 17591 if ((lifrp->lifr_flags & (IFF_UP|IFF_DUPLICATE)) == 0) 17592 continue; 17593 17594 lifrp->lifr_flags &= ~IFF_UP; 17595 err = ip_ioctl(lh, SIOCSLIFFLAGS, lifrp, lifrsize, cr); 17596 if (err != 0) { 17597 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot " 17598 "bring down (error %d); IPMP interface may " 17599 "not be shutdown", lifrp->lifr_name, err); 17600 } 17601 17602 /* 17603 * Check if IFF_DUPLICATE is still set -- and if so, 17604 * reset the address to clear it. 
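 * (Re-getting the address with SIOCGLIFADDR and setting it back with
 * SIOCSLIFADDR below restarts duplicate address detection on the ipif.)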
17605 */
17606 err = ip_ioctl(lh, SIOCGLIFFLAGS, lifrp, lifrsize, cr);
17607 if (err != 0 || !(lifrp->lifr_flags & IFF_DUPLICATE))
17608 continue;
17609
17610 err = ip_ioctl(lh, SIOCGLIFADDR, lifrp, lifrsize, cr);
17611 if (err != 0 || (err = ip_ioctl(lh, SIOCSLIFADDR,
17612 lifrp, lifrsize, cr)) != 0) {
17613 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot "
17614 "reset DAD (error %d); IPMP interface may "
17615 "not be shutdown", lifrp->lifr_name, err);
17616 }
17617 continue;
17618 }
17619
17620 lifrp->lifr_groupname[0] = '\0';
17621 err = ip_ioctl(lh, SIOCSLIFGROUPNAME, lifrp, lifrsize, cr);
17622 if (err != 0) {
17623 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot leave "
17624 "IPMP group (error %d); associated IPMP interface "
17625 "may not be shutdown", lifrp->lifr_name, err);
17626 continue;
17627 }
17628 }
17629
17630 kmem_free(lifc.lifc_buf, bufsize);
17631 }
17632
17633 #define UDPDEV "/devices/pseudo/udp@0:udp"
17634 #define UDP6DEV "/devices/pseudo/udp6@0:udp6"
17635
17636 /*
17637 * Remove the loopback interfaces and prep the IPMP interfaces to be torn down.
17638 * Non-loopback interfaces are either I_LINK'd or I_PLINK'd; the former go away
17639 * when the user-level processes in the zone are killed and the latter are
17640 * cleaned up by str_stack_shutdown().
17641 */
17642 void
17643 ip_interface_cleanup(ip_stack_t *ipst)
17644 {
17645 ldi_handle_t lh;
17646 ldi_ident_t li;
17647 cred_t *cr;
17648 int err;
17649 int i;
17650 char *devs[] = { UDP6DEV, UDPDEV };
17651 netstackid_t stackid = ipst->ips_netstack->netstack_stackid;
17652
17653 if ((err = ldi_ident_from_major(ddi_name_to_major("ip"), &li)) != 0) {
17654 cmn_err(CE_WARN, "ip_interface_cleanup: cannot get ldi ident:"
17655 " error %d", err);
17656 return;
17657 }
17658
17659 cr = zone_get_kcred(netstackid_to_zoneid(stackid));
17660 ASSERT(cr != NULL);
17661
17662 /*
17663 * NOTE: loop executes exactly twice and is hardcoded to know that the
17664 * first iteration is IPv6. (Unrolling yields repetitious code, hence
17665 * the loop.)
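 * (devs[0] is UDP6DEV, so `i == 0' is passed below as the `isv6'
 * argument.)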
17666 */
17667 for (i = 0; i < 2; i++) {
17668 err = ldi_open_by_name(devs[i], FREAD|FWRITE, cr, &lh, li);
17669 if (err != 0) {
17670 cmn_err(CE_WARN, "ip_interface_cleanup: cannot open %s:"
17671 " error %d", devs[i], err);
17672 continue;
17673 }
17674
17675 ip_loopback_removeif(lh, i == 0, cr);
17676 ip_ipmp_cleanup(lh, i == 0, cr);
17677
17678 (void) ldi_close(lh, FREAD|FWRITE, cr);
17679 }
17680
17681 ldi_ident_release(li);
17682 crfree(cr);
17683 }
17684
17685 /*
17686 * This needs to be kept in sync with the nic_event_t definition.
17687 */
17688 static const char *
17689 ill_hook_event2str(nic_event_t event)
17690 {
17691 switch (event) {
17692 case NE_PLUMB:
17693 return ("PLUMB");
17694 case NE_UNPLUMB:
17695 return ("UNPLUMB");
17696 case NE_UP:
17697 return ("UP");
17698 case NE_DOWN:
17699 return ("DOWN");
17700 case NE_ADDRESS_CHANGE:
17701 return ("ADDRESS_CHANGE");
17702 case NE_LIF_UP:
17703 return ("LIF_UP");
17704 case NE_LIF_DOWN:
17705 return ("LIF_DOWN");
17706 case NE_IFINDEX_CHANGE:
17707 return ("IFINDEX_CHANGE");
17708 default:
17709 return ("UNKNOWN");
17710 }
17711 }
17712
17713 void
17714 ill_nic_event_dispatch(ill_t *ill, lif_if_t lif, nic_event_t event,
17715 nic_event_data_t data, size_t datalen)
17716 {
17717 ip_stack_t *ipst = ill->ill_ipst;
17718 hook_nic_event_int_t *info;
17719 const char *str = NULL;
17720
17721 /* create a new nic event info */
17722 if ((info = kmem_alloc(sizeof (*info), KM_NOSLEEP)) == NULL)
17723 goto fail;
17724
17725 info->hnei_event.hne_nic = ill->ill_phyint->phyint_ifindex;
17726 info->hnei_event.hne_lif = lif;
17727 info->hnei_event.hne_event = event;
17728 info->hnei_event.hne_protocol = ill->ill_isv6 ?
17729 ipst->ips_ipv6_net_data : ipst->ips_ipv4_net_data;
17730 info->hnei_event.hne_data = NULL;
17731 info->hnei_event.hne_datalen = 0;
17732 info->hnei_stackid = ipst->ips_netstack->netstack_stackid;
17733
17734 if (data != NULL && datalen != 0) {
17735 info->hnei_event.hne_data = kmem_alloc(datalen, KM_NOSLEEP);
17736 if (info->hnei_event.hne_data == NULL)
17737 goto fail;
17738 bcopy(data, info->hnei_event.hne_data, datalen);
17739 info->hnei_event.hne_datalen = datalen;
17740 }
17741
17742 if (ddi_taskq_dispatch(eventq_queue_nic, ip_ne_queue_func, info,
17743 DDI_NOSLEEP) == DDI_SUCCESS)
17744 return;
17745
17746 fail:
17747 if (info != NULL) {
17748 if (info->hnei_event.hne_data != NULL) {
17749 kmem_free(info->hnei_event.hne_data,
17750 info->hnei_event.hne_datalen);
17751 }
17752 kmem_free(info, sizeof (*info)); /* matches the kmem_alloc() size */
17753 }
17754 str = ill_hook_event2str(event);
17755 ip2dbg(("ill_nic_event_dispatch: could not dispatch %s nic event "
17756 "information for %s (ENOMEM)\n", str, ill->ill_name));
17757 }
17758
17759 static int
17760 ipif_arp_up_done_tail(ipif_t *ipif, enum ip_resolver_action res_act)
17761 {
17762 int err = 0;
17763 const in_addr_t *addr = NULL;
17764 nce_t *nce = NULL;
17765 ill_t *ill = ipif->ipif_ill;
17766 ill_t *bound_ill;
17767 boolean_t added_ipif = B_FALSE;
17768 uint16_t state;
17769 uint16_t flags;
17770
17771 DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_up_done_tail",
17772 ill_t *, ill, ipif_t *, ipif);
17773 if (ipif->ipif_lcl_addr != INADDR_ANY) {
17774 addr = &ipif->ipif_lcl_addr;
17775 }
17776
17777 if ((ipif->ipif_flags & IPIF_UNNUMBERED) || addr == NULL) {
17778 if (res_act != Res_act_initial)
17779 return (EINVAL);
17780 }
17781
17782 if (addr != NULL) {
17783 ipmp_illgrp_t *illg = ill->ill_grp;
17784
17785 /* add unicast nce for the local addr */
17786
17787 if (IS_IPMP(ill)) {
17788 /*
17789 * If we're here via ipif_up(), then the ipif 17790 * won't be bound yet -- add it to the group, 17791 * which will bind it if possible. (We would 17792 * add it in ipif_up(), but deleting on failure 17793 * there is gruesome.) If we're here via 17794 * ipmp_ill_bind_ipif(), then the ipif has 17795 * already been added to the group and we 17796 * just need to use the binding. 17797 */ 17798 if ((bound_ill = ipmp_ipif_bound_ill(ipif)) == NULL) { 17799 bound_ill = ipmp_illgrp_add_ipif(illg, ipif); 17800 if (bound_ill == NULL) { 17801 /* 17802 * We couldn't bind the ipif to an ill 17803 * yet, so we have nothing to publish. 17804 * Mark the address as ready and return. 17805 */ 17806 ipif->ipif_addr_ready = 1; 17807 return (0); 17808 } 17809 added_ipif = B_TRUE; 17810 } 17811 } else { 17812 bound_ill = ill; 17813 } 17814 17815 flags = (NCE_F_MYADDR | NCE_F_PUBLISH | NCE_F_AUTHORITY | 17816 NCE_F_NONUD); 17817 /* 17818 * If this is an initial bring-up (or the ipif was never 17819 * completely brought up), do DAD. Otherwise, we're here 17820 * because IPMP has rebound an address to this ill: send 17821 * unsolicited advertisements (ARP announcements) to 17822 * inform others. 17823 */ 17824 if (res_act == Res_act_initial || !ipif->ipif_addr_ready) { 17825 state = ND_UNCHANGED; /* compute in nce_add_common() */ 17826 } else { 17827 state = ND_REACHABLE; 17828 flags |= NCE_F_UNSOL_ADV; 17829 } 17830 17831 retry: 17832 err = nce_lookup_then_add_v4(ill, 17833 bound_ill->ill_phys_addr, bound_ill->ill_phys_addr_length, 17834 addr, flags, state, &nce); 17835 17836 /* 17837 * note that we may encounter EEXIST if we are moving 17838 * the nce as a result of a rebind operation. 17839 */ 17840 switch (err) { 17841 case 0: 17842 ipif->ipif_added_nce = 1; 17843 nce->nce_ipif_cnt++; 17844 break; 17845 case EEXIST: 17846 ip1dbg(("ipif_arp_up: NCE already exists for %s\n", 17847 ill->ill_name)); 17848 if (!NCE_MYADDR(nce->nce_common)) { 17849 /* 17850 * A leftover nce from before this address 17851 * existed 17852 */ 17853 ncec_delete(nce->nce_common); 17854 nce_refrele(nce); 17855 nce = NULL; 17856 goto retry; 17857 } 17858 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 17859 nce_refrele(nce); 17860 nce = NULL; 17861 ip1dbg(("ipif_arp_up: NCE already exists " 17862 "for %s:%u\n", ill->ill_name, 17863 ipif->ipif_id)); 17864 goto arp_up_done; 17865 } 17866 /* 17867 * Duplicate local addresses are permissible for 17868 * IPIF_POINTOPOINT interfaces which will get marked 17869 * IPIF_UNNUMBERED later in 17870 * ip_addr_availability_check(). 17871 * 17872 * The nce_ipif_cnt field tracks the number of 17873 * ipifs that have nce_addr as their local address. 17874 */ 17875 ipif->ipif_addr_ready = 1; 17876 ipif->ipif_added_nce = 1; 17877 nce->nce_ipif_cnt++; 17878 err = 0; 17879 break; 17880 default: 17881 ASSERT(nce == NULL); 17882 goto arp_up_done; 17883 } 17884 if (arp_no_defense) { 17885 if ((ipif->ipif_flags & IPIF_UP) && 17886 !ipif->ipif_addr_ready) 17887 ipif_up_notify(ipif); 17888 ipif->ipif_addr_ready = 1; 17889 } 17890 } else { 17891 /* zero address. 
nothing to publish */
17892 ipif->ipif_addr_ready = 1;
17893 }
17894 if (nce != NULL)
17895 nce_refrele(nce);
17896 arp_up_done:
17897 if (added_ipif && err != 0)
17898 ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
17899 return (err);
17900 }
17901
17902 int
17903 ipif_arp_up(ipif_t *ipif, enum ip_resolver_action res_act, boolean_t was_dup)
17904 {
17905 int err = 0;
17906 ill_t *ill = ipif->ipif_ill;
17907 boolean_t first_interface, wait_for_dlpi = B_FALSE;
17908
17909 DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_up",
17910 ill_t *, ill, ipif_t *, ipif);
17911
17912 /*
17913 * need to bring up ARP or set up mcast mapping only
17914 * when the first interface is coming UP.
17915 */
17916 first_interface = (ill->ill_ipif_up_count == 0 &&
17917 ill->ill_ipif_dup_count == 0 && !was_dup);
17918
17919 if (res_act == Res_act_initial && first_interface) {
17920 /*
17921 * Send ATTACH + BIND
17922 */
17923 err = arp_ll_up(ill);
17924 if (err != EINPROGRESS && err != 0)
17925 return (err);
17926
17927 /*
17928 * Add NCE for local address. Start DAD.
17929 * We'll wait to hear that DAD has finished
17930 * before using the interface.
17931 */
17932 if (err == EINPROGRESS)
17933 wait_for_dlpi = B_TRUE;
17934 }
17935
17936 if (!wait_for_dlpi)
17937 (void) ipif_arp_up_done_tail(ipif, res_act);
17938
17939 return (!wait_for_dlpi ? 0 : EINPROGRESS);
17940 }
17941
17942 /*
17943 * Finish processing of "arp_up" after all the DLPI message
17944 * exchanges have completed between arp and the driver.
17945 */
17946 void
17947 arp_bringup_done(ill_t *ill, int err)
17948 {
17949 mblk_t *mp1;
17950 ipif_t *ipif;
17951 conn_t *connp = NULL;
17952 ipsq_t *ipsq;
17953 queue_t *q;
17954
17955 ip1dbg(("arp_bringup_done(%s)\n", ill->ill_name));
17956
17957 ASSERT(IAM_WRITER_ILL(ill));
17958
17959 ipsq = ill->ill_phyint->phyint_ipsq;
17960 ipif = ipsq->ipsq_xop->ipx_pending_ipif;
17961 mp1 = ipsq_pending_mp_get(ipsq, &connp);
17962 ASSERT(!((mp1 != NULL) ^ (ipif != NULL)));
17963 if (mp1 == NULL) /* bringup was aborted by the user */
17964 return;
17965
17966 /*
17967 * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we
17968 * must have an associated conn_t. Otherwise, we're bringing this
17969 * interface back up as part of handling an asynchronous event (e.g.,
17970 * physical address change).
17971 */
17972 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
17973 ASSERT(connp != NULL);
17974 q = CONNP_TO_WQ(connp);
17975 } else {
17976 ASSERT(connp == NULL);
17977 q = ill->ill_rq;
17978 }
17979 if (err == 0) {
17980 if (ipif->ipif_isv6) {
17981 if ((err = ipif_up_done_v6(ipif)) != 0)
17982 ip0dbg(("arp_bringup_done: init failed\n"));
17983 } else {
17984 err = ipif_arp_up_done_tail(ipif, Res_act_initial);
17985 if (err != 0 ||
17986 (err = ipif_up_done(ipif)) != 0) {
17987 ip0dbg(("arp_bringup_done: "
17988 "init failed err %x\n", err));
17989 (void) ipif_arp_down(ipif);
17990 }
17991
17992 }
17993 } else {
17994 ip0dbg(("arp_bringup_done: DL_BIND_REQ failed\n"));
17995 }
17996
17997 if ((err == 0) && (ill->ill_up_ipifs)) {
17998 err = ill_up_ipifs(ill, q, mp1);
17999 if (err == EINPROGRESS)
18000 return;
18001 }
18002
18003 /*
18004 * If we have a moved ipif to bring up, and everything has succeeded
18005 * to this point, bring it up on the IPMP ill. Otherwise, leave it
18006 * down -- the admin can try to bring it up by hand if need be.
18007 */
18008 if (ill->ill_move_ipif != NULL) {
18009 ipif = ill->ill_move_ipif;
18010 ip1dbg(("bringing up ipif %p on ill %s\n", (void *)ipif,
18011 ipif->ipif_ill->ill_name));
18012 ill->ill_move_ipif = NULL;
18013 if (err == 0) {
18014 err = ipif_up(ipif, q, mp1);
18015 if (err == EINPROGRESS)
18016 return;
18017 }
18018 }
18019
18020 /*
18021 * The operation must complete without EINPROGRESS since
18022 * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp.
18023 * Otherwise, the operation will be stuck forever in the ipsq.
18024 */
18025 ASSERT(err != EINPROGRESS);
18026 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18027 DTRACE_PROBE4(ipif__ioctl, char *, "arp_bringup_done finish",
18028 int, ipsq->ipsq_xop->ipx_current_ioctl,
18029 ill_t *, ill, ipif_t *, ipif);
18030 ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
18031 } else {
18032 ipsq_current_finish(ipsq);
18033 }
18034 }
18035
18036 /*
18037 * Finish processing of arp replumb after all the DLPI message
18038 * exchanges have completed between arp and the driver.
18039 */
18040 void
18041 arp_replumb_done(ill_t *ill, int err)
18042 {
18043 mblk_t *mp1;
18044 ipif_t *ipif;
18045 conn_t *connp = NULL;
18046 ipsq_t *ipsq;
18047 queue_t *q;
18048
18049 ASSERT(IAM_WRITER_ILL(ill));
18050
18051 ipsq = ill->ill_phyint->phyint_ipsq;
18052 ipif = ipsq->ipsq_xop->ipx_pending_ipif;
18053 mp1 = ipsq_pending_mp_get(ipsq, &connp);
18054 ASSERT(!((mp1 != NULL) ^ (ipif != NULL)));
18055 if (mp1 == NULL) {
18056 ip0dbg(("arp_replumb_done: bringup aborted ioctl %x\n",
18057 ipsq->ipsq_xop->ipx_current_ioctl));
18058 /* bringup was aborted by the user */
18059 return;
18060 }
18061 /*
18062 * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we
18063 * must have an associated conn_t. Otherwise, we're bringing this
18064 * interface back up as part of handling an asynchronous event (e.g.,
18065 * physical address change).
18066 */
18067 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18068 ASSERT(connp != NULL);
18069 q = CONNP_TO_WQ(connp);
18070 } else {
18071 ASSERT(connp == NULL);
18072 q = ill->ill_rq;
18073 }
18074 if ((err == 0) && (ill->ill_up_ipifs)) {
18075 err = ill_up_ipifs(ill, q, mp1);
18076 if (err == EINPROGRESS)
18077 return;
18078 }
18079 /*
18080 * The operation must complete without EINPROGRESS since
18081 * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp.
18082 * Otherwise, the operation will be stuck forever in the ipsq.
18083 */
18084 ASSERT(err != EINPROGRESS);
18085 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18086 DTRACE_PROBE4(ipif__ioctl, char *,
18087 "arp_replumb_done finish",
18088 int, ipsq->ipsq_xop->ipx_current_ioctl,
18089 ill_t *, ill, ipif_t *, ipif);
18090 ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
18091 } else {
18092 ipsq_current_finish(ipsq);
18093 }
18094 }
18095
18096 void
18097 ipif_up_notify(ipif_t *ipif)
18098 {
18099 ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
18100 ip_rts_newaddrmsg(RTM_ADD, 0, ipif, RTSQ_DEFAULT);
18101 sctp_update_ipif(ipif, SCTP_IPIF_UP);
18102 ill_nic_event_dispatch(ipif->ipif_ill, MAP_IPIF_ID(ipif->ipif_id),
18103 NE_LIF_UP, NULL, 0);
18104 }
18105
18106 /*
18107 * ILB ioctl uses cv_wait (such as deleting a rule or adding a server) and
18108 * this assumes the context is cv_wait'able. Hence it shouldn't be used on
18109 * TPI end points with STREAMS modules pushed above. This is assured by not
18110 * having the IPI_MODOK flag for the ioctl. And IP ensures the ILB ioctl
18111 * never ends up on an ipsq, otherwise we may end up processing the ioctl
18112 * while unwinding from the ipsq and that could be a thread from the bottom.
18113 */
18114 /* ARGSUSED */
18115 int
18116 ip_sioctl_ilb_cmd(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
18117 ip_ioctl_cmd_t *ipip, void *arg)
18118 {
18119 mblk_t *cmd_mp = mp->b_cont->b_cont;
18120 ilb_cmd_t command = *((ilb_cmd_t *)cmd_mp->b_rptr);
18121 int ret = 0;
18122 int i;
18123 size_t size;
18124 ip_stack_t *ipst;
18125 zoneid_t zoneid;
18126 ilb_stack_t *ilbs;
18127
18128 ipst = CONNQ_TO_IPST(q);
18129 ilbs = ipst->ips_netstack->netstack_ilb;
18130 zoneid = Q_TO_CONN(q)->conn_zoneid;
18131
18132 switch (command) {
18133 case ILB_CREATE_RULE: {
18134 ilb_rule_cmd_t *cmd = (ilb_rule_cmd_t *)cmd_mp->b_rptr;
18135
18136 if (MBLKL(cmd_mp) != sizeof (ilb_rule_cmd_t)) {
18137 ret = EINVAL;
18138 break;
18139 }
18140
18141 ret = ilb_rule_add(ilbs, zoneid, cmd);
18142 break;
18143 }
18144 case ILB_DESTROY_RULE:
18145 case ILB_ENABLE_RULE:
18146 case ILB_DISABLE_RULE: {
18147 ilb_name_cmd_t *cmd = (ilb_name_cmd_t *)cmd_mp->b_rptr;
18148
18149 if (MBLKL(cmd_mp) != sizeof (ilb_name_cmd_t)) {
18150 ret = EINVAL;
18151 break;
18152 }
18153
18154 if (cmd->flags & ILB_RULE_ALLRULES) {
18155 if (command == ILB_DESTROY_RULE) {
18156 ilb_rule_del_all(ilbs, zoneid);
18157 break;
18158 } else if (command == ILB_ENABLE_RULE) {
18159 ilb_rule_enable_all(ilbs, zoneid);
18160 break;
18161 } else if (command == ILB_DISABLE_RULE) {
18162 ilb_rule_disable_all(ilbs, zoneid);
18163 break;
18164 }
18165 } else {
18166 if (command == ILB_DESTROY_RULE) {
18167 ret = ilb_rule_del(ilbs, zoneid, cmd->name);
18168 } else if (command == ILB_ENABLE_RULE) {
18169 ret = ilb_rule_enable(ilbs, zoneid, cmd->name,
18170 NULL);
18171 } else if (command == ILB_DISABLE_RULE) {
18172 ret = ilb_rule_disable(ilbs, zoneid, cmd->name,
18173 NULL);
18174 }
18175 }
18176 break;
18177 }
18178 case ILB_NUM_RULES: {
18179 ilb_num_rules_cmd_t *cmd;
18180
18181 if (MBLKL(cmd_mp) != sizeof (ilb_num_rules_cmd_t)) {
18182 ret = EINVAL;
18183 break;
18184 }
18185 cmd = (ilb_num_rules_cmd_t *)cmd_mp->b_rptr;
18186 ilb_get_num_rules(ilbs, zoneid, &(cmd->num));
18187 break;
18188 }
18189 case ILB_RULE_NAMES: {
18190 ilb_rule_names_cmd_t *cmd;
18191
18192 cmd = (ilb_rule_names_cmd_t *)cmd_mp->b_rptr;
18193 if (MBLKL(cmd_mp) < sizeof (ilb_rule_names_cmd_t) ||
18194 cmd->num_names == 0) {
18195 ret = EINVAL;
18196 break;
18197 }
18198 size = cmd->num_names * ILB_RULE_NAMESZ;
18199 if (cmd_mp->b_rptr + offsetof(ilb_rule_names_cmd_t, buf) +
18200 size != cmd_mp->b_wptr) {
18201 ret = EINVAL;
18202 break;
18203 }
18204 ilb_get_rulenames(ilbs, zoneid, &cmd->num_names, cmd->buf);
18205 break;
18206 }
18207 case ILB_NUM_SERVERS: {
18208 ilb_num_servers_cmd_t *cmd;
18209
18210 if (MBLKL(cmd_mp) != sizeof (ilb_num_servers_cmd_t)) {
18211 ret = EINVAL;
18212 break;
18213 }
18214 cmd = (ilb_num_servers_cmd_t *)cmd_mp->b_rptr;
18215 ret = ilb_get_num_servers(ilbs, zoneid, cmd->name,
18216 &(cmd->num));
18217 break;
18218 }
18219 case ILB_LIST_RULE: {
18220 ilb_rule_cmd_t *cmd = (ilb_rule_cmd_t *)cmd_mp->b_rptr;
18221
18222 if (MBLKL(cmd_mp) != sizeof (ilb_rule_cmd_t)) {
18223 ret = EINVAL;
18224 break;
18225 }
18226 ret = ilb_rule_list(ilbs, zoneid, cmd);
18227 break;
18228 }
18229 case ILB_LIST_SERVERS: {
18230 ilb_servers_info_cmd_t *cmd;
18231
18232 cmd = (ilb_servers_info_cmd_t *)cmd_mp->b_rptr;
18233 if (MBLKL(cmd_mp) < sizeof (ilb_servers_info_cmd_t) ||
18234 cmd->num_servers == 0) { 18235 ret = EINVAL; 18236 break; 18237 } 18238 size = cmd->num_servers * sizeof (ilb_server_info_t); 18239 if (cmd_mp->b_rptr + offsetof(ilb_servers_info_cmd_t, servers) + 18240 size != cmd_mp->b_wptr) { 18241 ret = EINVAL; 18242 break; 18243 } 18244 18245 ret = ilb_get_servers(ilbs, zoneid, cmd->name, cmd->servers, 18246 &cmd->num_servers); 18247 break; 18248 } 18249 case ILB_ADD_SERVERS: { 18250 ilb_servers_info_cmd_t *cmd; 18251 ilb_rule_t *rule; 18252 18253 cmd = (ilb_servers_info_cmd_t *)cmd_mp->b_rptr; 18254 if (MBLKL(cmd_mp) < sizeof (ilb_servers_info_cmd_t)) { 18255 ret = EINVAL; 18256 break; 18257 } 18258 size = cmd->num_servers * sizeof (ilb_server_info_t); 18259 if (cmd_mp->b_rptr + offsetof(ilb_servers_info_cmd_t, servers) + 18260 size != cmd_mp->b_wptr) { 18261 ret = EINVAL; 18262 break; 18263 } 18264 rule = ilb_find_rule(ilbs, zoneid, cmd->name, &ret); 18265 if (rule == NULL) { 18266 ASSERT(ret != 0); 18267 break; 18268 } 18269 for (i = 0; i < cmd->num_servers; i++) { 18270 ilb_server_info_t *s; 18271 18272 s = &cmd->servers[i]; 18273 s->err = ilb_server_add(ilbs, rule, s); 18274 } 18275 ILB_RULE_REFRELE(rule); 18276 break; 18277 } 18278 case ILB_DEL_SERVERS: 18279 case ILB_ENABLE_SERVERS: 18280 case ILB_DISABLE_SERVERS: { 18281 ilb_servers_cmd_t *cmd; 18282 ilb_rule_t *rule; 18283 int (*f)(); 18284 18285 cmd = (ilb_servers_cmd_t *)cmd_mp->b_rptr; 18286 if (MBLKL(cmd_mp) < sizeof (ilb_servers_cmd_t)) { 18287 ret = EINVAL; 18288 break; 18289 } 18290 size = cmd->num_servers * sizeof (ilb_server_arg_t); 18291 if (cmd_mp->b_rptr + offsetof(ilb_servers_cmd_t, servers) + 18292 size != cmd_mp->b_wptr) { 18293 ret = EINVAL; 18294 break; 18295 } 18296 18297 if (command == ILB_DEL_SERVERS) 18298 f = ilb_server_del; 18299 else if (command == ILB_ENABLE_SERVERS) 18300 f = ilb_server_enable; 18301 else if (command == ILB_DISABLE_SERVERS) 18302 f = ilb_server_disable; 18303 18304 rule = ilb_find_rule(ilbs, zoneid, cmd->name, &ret); 18305 if (rule == NULL) { 18306 ASSERT(ret != 0); 18307 break; 18308 } 18309 18310 for (i = 0; i < cmd->num_servers; i++) { 18311 ilb_server_arg_t *s; 18312 18313 s = &cmd->servers[i]; 18314 s->err = f(ilbs, zoneid, NULL, rule, &s->addr); 18315 } 18316 ILB_RULE_REFRELE(rule); 18317 break; 18318 } 18319 case ILB_LIST_NAT_TABLE: { 18320 ilb_list_nat_cmd_t *cmd; 18321 18322 cmd = (ilb_list_nat_cmd_t *)cmd_mp->b_rptr; 18323 if (MBLKL(cmd_mp) < sizeof (ilb_list_nat_cmd_t)) { 18324 ret = EINVAL; 18325 break; 18326 } 18327 size = cmd->num_nat * sizeof (ilb_nat_entry_t); 18328 if (cmd_mp->b_rptr + offsetof(ilb_list_nat_cmd_t, entries) + 18329 size != cmd_mp->b_wptr) { 18330 ret = EINVAL; 18331 break; 18332 } 18333 18334 ret = ilb_list_nat(ilbs, zoneid, cmd->entries, &cmd->num_nat, 18335 &cmd->flags); 18336 break; 18337 } 18338 case ILB_LIST_STICKY_TABLE: { 18339 ilb_list_sticky_cmd_t *cmd; 18340 18341 cmd = (ilb_list_sticky_cmd_t *)cmd_mp->b_rptr; 18342 if (MBLKL(cmd_mp) < sizeof (ilb_list_sticky_cmd_t)) { 18343 ret = EINVAL; 18344 break; 18345 } 18346 size = cmd->num_sticky * sizeof (ilb_sticky_entry_t); 18347 if (cmd_mp->b_rptr + offsetof(ilb_list_sticky_cmd_t, entries) + 18348 size != cmd_mp->b_wptr) { 18349 ret = EINVAL; 18350 break; 18351 } 18352 18353 ret = ilb_list_sticky(ilbs, zoneid, cmd->entries, 18354 &cmd->num_sticky, &cmd->flags); 18355 break; 18356 } 18357 default: 18358 ret = EINVAL; 18359 break; 18360 } 18361 done: 18362 return (ret); 18363 } 18364 18365 /* Remove all cache entries for this logical interface */ 18366 void 
18367 ipif_nce_down(ipif_t *ipif) 18368 { 18369 ill_t *ill = ipif->ipif_ill; 18370 nce_t *nce; 18371 18372 DTRACE_PROBE3(ipif__downup, char *, "ipif_nce_down", 18373 ill_t *, ill, ipif_t *, ipif); 18374 if (ipif->ipif_added_nce) { 18375 if (ipif->ipif_isv6) 18376 nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr); 18377 else 18378 nce = nce_lookup_v4(ill, &ipif->ipif_lcl_addr); 18379 if (nce != NULL) { 18380 if (--nce->nce_ipif_cnt == 0) 18381 ncec_delete(nce->nce_common); 18382 ipif->ipif_added_nce = 0; 18383 nce_refrele(nce); 18384 } else { 18385 /* 18386 * nce may already be NULL because it was already 18387 * flushed, e.g., due to a call to nce_flush 18388 */ 18389 ipif->ipif_added_nce = 0; 18390 } 18391 } 18392 /* 18393 * Make IPMP aware of the deleted data address. 18394 */ 18395 if (IS_IPMP(ill)) 18396 ipmp_illgrp_del_ipif(ill->ill_grp, ipif); 18397 18398 /* 18399 * Remove all other nces dependent on this ill when the last ipif 18400 * is going away. 18401 */ 18402 if (ill->ill_ipif_up_count == 0) { 18403 ncec_walk(ill, (pfi_t)ncec_delete_per_ill, 18404 (uchar_t *)ill, ill->ill_ipst); 18405 if (IS_UNDER_IPMP(ill)) 18406 nce_flush(ill, B_TRUE); 18407 } 18408 } 18409