/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 1990 Mentat Inc.
 */

/*
 * This file contains the interface control functions for IP.
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/dlpi.h>
#include <sys/stropts.h>
#include <sys/strsun.h>
#include <sys/sysmacros.h>
#include <sys/strsubr.h>
#include <sys/strlog.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/debug.h>
#include <sys/zone.h>
#include <sys/sunldi.h>
#include <sys/file.h>
#include <sys/bitmap.h>
#include <sys/cpuvar.h>
#include <sys/time.h>
#include <sys/ctype.h>
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/isa_defs.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_types.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <sys/sockio.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet/igmp_var.h>
#include <sys/policy.h>
#include <sys/ethernet.h>
#include <sys/callb.h>
#include <sys/md5.h>

#include <inet/common.h>   /* for various inet/mi.h and inet/nd.h needs */
#include <inet/mi.h>
#include <inet/nd.h>
#include <inet/tunables.h>
#include <inet/arp.h>
#include <inet/ip_arp.h>
#include <inet/mib2.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/tcp.h>
#include <inet/ip_multi.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ip_if.h>
#include <inet/ip_impl.h>
#include <inet/sctp_ip.h>
#include <inet/ip_netinfo.h>
#include <inet/ilb_ip.h>

#include <netinet/igmp.h>
#include <inet/ip_listutils.h>
#include <inet/ipclassifier.h>
#include <sys/mac_client.h>
#include <sys/dld.h>

#include <sys/systeminfo.h>
#include <sys/bootconf.h>

#include <sys/tsol/tndb.h>
#include <sys/tsol/tnet.h>

#include <inet/rawip_impl.h>   /* needed for icmp_stack_t */
#include <inet/udp_impl.h>     /* needed for udp_stack_t */

/* The character which tells where the ill_name ends */
#define IPIF_SEPARATOR_CHAR ':'

/* IP ioctl function table entry */
typedef struct ipft_s {
    int     ipft_cmd;
    pfi_t   ipft_pfi;
    int     ipft_min_size;
    int     ipft_flags;
} ipft_t;
#define IPFT_F_NO_REPLY     0x1 /* IP ioctl does not expect any reply */
#define IPFT_F_SELF_REPLY   0x2 /* ioctl callee does the ioctl reply */
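/*
 * Rough sketch (for illustration only; the real lookup lives in
 * ip_wput_ioctl() later in this file and may differ in detail) of how
 * the ip_ioctl_ftbl entries below are consumed:
 *
 *	for (ipft = ip_ioctl_ftbl; ipft->ipft_pfi != NULL; ipft++)
 *		if (ipft->ipft_cmd == iocp->ioc_cmd)
 *			break;
 *
 * IPFT_F_NO_REPLY suppresses the M_IOCACK reply, while IPFT_F_SELF_REPLY
 * means the callee (e.g. ip_rts_request) sends the reply itself.
 */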
static int nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int nd_ill_forward_set(queue_t *q, mblk_t *mp,
    char *value, caddr_t cp, cred_t *ioc_cr);

static boolean_t ill_is_quiescent(ill_t *);
static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask);
static ip_m_t *ip_m_lookup(t_uscalar_t mac_type);
static int ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static int ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q,
    mblk_t *mp);
static int ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp);
static int ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static int ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp,
    int ioccmd, struct linkblk *li);
static ipaddr_t ip_subnet_mask(ipaddr_t addr, ipif_t **, ip_stack_t *);
static void ip_wput_ioctl(queue_t *q, mblk_t *mp);
static void ipsq_flush(ill_t *ill);

static int ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static void ipsq_delete(ipsq_t *);

static ipif_t *ipif_allocate(ill_t *ill, int id, uint_t ire_type,
    boolean_t initialize, boolean_t insert, int *errorp);
static ire_t **ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep);
static void ipif_delete_bcast_ires(ipif_t *ipif);
static int ipif_add_ires_v4(ipif_t *, boolean_t);
static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif,
    boolean_t isv6);
static int ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp);
static void ipif_free(ipif_t *ipif);
static void ipif_free_tail(ipif_t *ipif);
static void ipif_set_default(ipif_t *ipif);
static int ipif_set_values(queue_t *q, mblk_t *mp,
    char *interf_name, uint_t *ppa);
static int ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp,
    queue_t *q);
static ipif_t *ipif_lookup_on_name(char *name, size_t namelen,
    boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid,
    ip_stack_t *);
static ipif_t *ipif_lookup_on_name_async(char *name, size_t namelen,
    boolean_t isv6, zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func,
    int *error, ip_stack_t *);

static int ill_alloc_ppa(ill_if_t *, ill_t *);
static void ill_delete_interface_type(ill_if_t *);
static int ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q);
static void ill_dl_down(ill_t *ill);
static void ill_down(ill_t *ill);
static void ill_down_ipifs(ill_t *, boolean_t);
static void ill_free_mib(ill_t *ill);
static void ill_glist_delete(ill_t *);
static void ill_phyint_reinit(ill_t *ill);
static void ill_set_nce_router_flags(ill_t *, boolean_t);
static void ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *);
static void ill_replumb_tail(ipsq_t *, queue_t *, mblk_t *, void *);

static ip_v6intfid_func_t ip_ether_v6intfid, ip_ib_v6intfid;
static ip_v6intfid_func_t ip_ipv4_v6intfid, ip_ipv6_v6intfid;
static ip_v6intfid_func_t ip_ipmp_v6intfid, ip_nodef_v6intfid;
static ip_v6intfid_func_t ip_ipv4_v6destintfid, ip_ipv6_v6destintfid;
static ip_v4mapinfo_func_t ip_ether_v4_mapping;
static ip_v6mapinfo_func_t ip_ether_v6_mapping;
static ip_v4mapinfo_func_t ip_ib_v4_mapping;
static ip_v6mapinfo_func_t ip_ib_v6_mapping;
static ip_v4mapinfo_func_t ip_mbcast_mapping;
static void ip_cgtp_bcast_add(ire_t *, ip_stack_t *);
static void ip_cgtp_bcast_delete(ire_t *, ip_stack_t *);
static void phyint_free(phyint_t *);

static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_vrrp_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_hcksum_reset_fill(ill_t *, mblk_t *);
static void ill_capability_zerocopy_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void ill_capability_zerocopy_reset_fill(ill_t *, mblk_t *);
static void ill_capability_dld_reset_fill(ill_t *, mblk_t *);
static void ill_capability_dld_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void ill_capability_dld_enable(ill_t *);
static void ill_capability_ack_thr(void *);
static void ill_capability_lso_enable(ill_t *);

static ill_t *ill_prev_usesrc(ill_t *);
static int ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t);
static void ill_disband_usesrc_group(ill_t *);
static void ip_sioctl_garp_reply(mblk_t *, ill_t *, void *, int);

#ifdef DEBUG
static void ill_trace_cleanup(const ill_t *);
static void ipif_trace_cleanup(const ipif_t *);
#endif

static void ill_dlpi_clear_deferred(ill_t *ill);
/*
 * If we go over the memory footprint limit more than once in this msec
 * interval, we'll start pruning aggressively.
 */
int ip_min_frag_prune_time = 0;

static ipft_t ip_ioctl_ftbl[] = {
    { IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 },
    { IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t),
        IPFT_F_NO_REPLY },
    { IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY },
    { 0 }
};

/* Simple ICMP IP Header Template */
static ipha_t icmp_ipha = {
    IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
};

static uchar_t ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };

static ip_m_t ip_m_tbl[] = {
    { DL_ETHER, IFT_ETHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
        ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid,
        ip_nodef_v6intfid },
    { DL_CSMACD, IFT_ISO88023, ETHERTYPE_IP, ETHERTYPE_IPV6,
        ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
        ip_nodef_v6intfid },
    { DL_TPB, IFT_ISO88024, ETHERTYPE_IP, ETHERTYPE_IPV6,
        ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
        ip_nodef_v6intfid },
    { DL_TPR, IFT_ISO88025, ETHERTYPE_IP, ETHERTYPE_IPV6,
        ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
        ip_nodef_v6intfid },
    { DL_FDDI, IFT_FDDI, ETHERTYPE_IP, ETHERTYPE_IPV6,
        ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid,
        ip_nodef_v6intfid },
    { DL_IB, IFT_IB, ETHERTYPE_IP, ETHERTYPE_IPV6,
        ip_ib_v4_mapping, ip_ib_v6_mapping, ip_ib_v6intfid,
        ip_nodef_v6intfid },
    { DL_IPV4, IFT_IPV4, IPPROTO_ENCAP, IPPROTO_IPV6,
        ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid,
        ip_ipv4_v6destintfid },
    { DL_IPV6, IFT_IPV6, IPPROTO_ENCAP, IPPROTO_IPV6,
        ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv6_v6intfid,
        ip_ipv6_v6destintfid },
    { DL_6TO4, IFT_6TO4, IPPROTO_ENCAP, IPPROTO_IPV6,
        ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid,
        ip_nodef_v6intfid },
    { SUNW_DL_VNI, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
        NULL, NULL, ip_nodef_v6intfid, ip_nodef_v6intfid },
    { SUNW_DL_IPMP, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
        NULL, NULL, ip_ipmp_v6intfid, ip_nodef_v6intfid },
    { DL_OTHER, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
        ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
        ip_nodef_v6intfid }
};
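/*
 * ip_m_lookup() (defined later in this file) scans ip_m_tbl for an entry
 * matching the DLPI mac type reported by the driver; the trailing
 * DL_OTHER entry is intended as the catch-all mapping for media types
 * that have no dedicated row.
 */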
static ill_t ill_null;              /* Empty ILL for init. */
char ipif_loopback_name[] = "lo0";

/* These are used by all IP network modules. */
sin6_t sin6_null;   /* Zero address for quick clears */
sin_t sin_null;     /* Zero address for quick clears */

/* When set, search for unused ipif_seqid */
static ipif_t ipif_zero;

/*
 * ppa arena is created after these many
 * interfaces have been plumbed.
 */
uint_t ill_no_arena = 12;   /* Settable in /etc/system */

/*
 * Allocate per-interface mibs.
 * Returns true if ok, false otherwise.
 * ipsq may not yet be allocated (loopback case).
 */
static boolean_t
ill_allocate_mibs(ill_t *ill)
{
    /* Already allocated? */
    if (ill->ill_ip_mib != NULL) {
        if (ill->ill_isv6)
            ASSERT(ill->ill_icmp6_mib != NULL);
        return (B_TRUE);
    }

    ill->ill_ip_mib = kmem_zalloc(sizeof (*ill->ill_ip_mib),
        KM_NOSLEEP);
    if (ill->ill_ip_mib == NULL) {
        return (B_FALSE);
    }

    /* Setup static information */
    SET_MIB(ill->ill_ip_mib->ipIfStatsEntrySize,
        sizeof (mib2_ipIfStatsEntry_t));
    if (ill->ill_isv6) {
        ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6;
        SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
            sizeof (mib2_ipv6AddrEntry_t));
        SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
            sizeof (mib2_ipv6RouteEntry_t));
        SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
            sizeof (mib2_ipv6NetToMediaEntry_t));
        SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
            sizeof (ipv6_member_t));
        SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
            sizeof (ipv6_grpsrc_t));
    } else {
        ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4;
        SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
            sizeof (mib2_ipAddrEntry_t));
        SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
            sizeof (mib2_ipRouteEntry_t));
        SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
            sizeof (mib2_ipNetToMediaEntry_t));
        SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
            sizeof (ip_member_t));
        SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
            sizeof (ip_grpsrc_t));

        /*
         * For a v4 ill, we are done at this point, because per-ill
         * icmp mibs are only used for v6.
         */
        return (B_TRUE);
    }

    ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib),
        KM_NOSLEEP);
    if (ill->ill_icmp6_mib == NULL) {
        kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
        ill->ill_ip_mib = NULL;
        return (B_FALSE);
    }
    /* static icmp info */
    ill->ill_icmp6_mib->ipv6IfIcmpEntrySize =
        sizeof (mib2_ipv6IfIcmpEntry_t);
    /*
     * The ipIfStatsIfindex and ipv6IfIcmpIndex will be assigned later
     * after the phyint merge occurs in ipif_set_values -> ill_glist_insert
     * -> ill_phyint_reinit
     */
    return (B_TRUE);
}
/*
 * Completely vaporize a lower level tap and all associated interfaces.
 * ill_delete is called only out of ip_close when the device control
 * stream is being closed.
 */
void
ill_delete(ill_t *ill)
{
    ipif_t *ipif;
    ill_t *prev_ill;
    ip_stack_t *ipst = ill->ill_ipst;

    /*
     * ill_delete may be forcibly entering the ipsq. The previous
     * ioctl may not have completed and may need to be aborted.
     * ipsq_flush takes care of it. If we don't need to enter the
     * ipsq forcibly, the 2nd invocation of ipsq_flush in
     * ill_delete_tail is sufficient.
     */
    ipsq_flush(ill);

    /*
     * Nuke all interfaces. ipif_free will take down the interface,
     * remove it from the list, and free the data structure.
     * Walk down the ipif list and remove the logical interfaces
     * first before removing the main ipif. We can't unplumb
     * the zeroth interface first in the case of IPv6, as update_conn_ill
     * -> ip_ll_multireq dereferences ill_ipif when checking for
     * POINTOPOINT.
     *
     * If ill_ipif was not properly initialized (i.e., low on memory),
     * then there are no interfaces to clean up. In this case just clean
     * up the ill.
     */
    for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
        ipif_free(ipif);

    /*
     * clean out all the nce_t entries that depend on this
     * ill for the ill_phys_addr.
     */
    nce_flush(ill, B_TRUE);

    /* Clean up msgs on pending upcalls for mrouted */
    reset_mrt_ill(ill);

    update_conn_ill(ill, ipst);

    /*
     * Remove multicast references added as a result of calls to
     * ip_join_allmulti().
     */
    ip_purge_allmulti(ill);

    /*
     * If the ill being deleted is under IPMP, boot it out of the illgrp.
     */
    if (IS_UNDER_IPMP(ill))
        ipmp_ill_leave_illgrp(ill);

    /*
     * ill_down will arrange to blow off any IRE's dependent on this
     * ILL, and shut down fragmentation reassembly.
     */
    ill_down(ill);

    /* Let SCTP know, so that it can remove this from its list. */
    sctp_update_ill(ill, SCTP_ILL_REMOVE);

    /*
     * Walk all CONNs that can have a reference on an ire or nce for this
     * ill (we actually walk all that now have stale references).
     */
    ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst);

    /* With IPv6 we have dce_ifindex. Cleanup for neatness */
    if (ill->ill_isv6)
        dce_cleanup(ill->ill_phyint->phyint_ifindex, ipst);

    /*
     * If an address on this ILL is being used as a source address then
     * clear out the pointers in other ILLs that point to this ILL.
     */
    rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
    if (ill->ill_usesrc_grp_next != NULL) {
        if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? */
            ill_disband_usesrc_group(ill);
        } else {    /* consumer of the usesrc ILL */
            prev_ill = ill_prev_usesrc(ill);
            prev_ill->ill_usesrc_grp_next =
                ill->ill_usesrc_grp_next;
        }
    }
    rw_exit(&ipst->ips_ill_g_usesrc_lock);
}
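/*
 * For reference, the teardown sequence above is, in order: abort any
 * pending exclusive operation (ipsq_flush), free the ipifs, flush the
 * nce entries keyed on ill_phys_addr, drop mrouted and conn references,
 * drop multicast memberships, leave the IPMP illgrp if applicable, blow
 * off dependent IREs (ill_down), notify SCTP, and finally unhook any
 * usesrc relationships involving this ill.
 */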
static void
ipif_non_duplicate(ipif_t *ipif)
{
    ill_t *ill = ipif->ipif_ill;

    mutex_enter(&ill->ill_lock);
    if (ipif->ipif_flags & IPIF_DUPLICATE) {
        ipif->ipif_flags &= ~IPIF_DUPLICATE;
        ASSERT(ill->ill_ipif_dup_count > 0);
        ill->ill_ipif_dup_count--;
    }
    mutex_exit(&ill->ill_lock);
}

/*
 * ill_delete_tail is called from ip_modclose after all references
 * to the closing ill are gone. The wait is done in ip_modclose.
 */
void
ill_delete_tail(ill_t *ill)
{
    mblk_t **mpp;
    ipif_t *ipif;
    ip_stack_t *ipst = ill->ill_ipst;

    for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
        ipif_non_duplicate(ipif);
        (void) ipif_down_tail(ipif);
    }

    ASSERT(ill->ill_ipif_dup_count == 0);

    /*
     * If polling capability is enabled (which signifies a direct
     * upcall into IP with the driver holding the ill as a handle),
     * we need to make sure that the unbind has completed before we
     * let the ill disappear, so that the driver no longer has any
     * reference to this ill.
     */
    mutex_enter(&ill->ill_lock);
    while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS)
        cv_wait(&ill->ill_cv, &ill->ill_lock);
    mutex_exit(&ill->ill_lock);
    ASSERT(!(ill->ill_capabilities &
        (ILL_CAPAB_DLD | ILL_CAPAB_DLD_POLL | ILL_CAPAB_DLD_DIRECT)));

    if (ill->ill_net_type != IRE_LOOPBACK)
        qprocsoff(ill->ill_rq);

    /*
     * We do an ipsq_flush once again now. New messages could have
     * arrived from below (M_ERROR or M_HANGUP). Similarly, ioctls
     * could also have arrived if an ioctl thread had looked up
     * the ill before we set the ILL_CONDEMNED flag, but had not yet
     * enqueued the ioctl when we did the ipsq_flush last time.
     */
    ipsq_flush(ill);

    /*
     * Free capabilities.
     */
    if (ill->ill_hcksum_capab != NULL) {
        kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t));
        ill->ill_hcksum_capab = NULL;
    }

    if (ill->ill_zerocopy_capab != NULL) {
        kmem_free(ill->ill_zerocopy_capab,
            sizeof (ill_zerocopy_capab_t));
        ill->ill_zerocopy_capab = NULL;
    }

    if (ill->ill_lso_capab != NULL) {
        kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t));
        ill->ill_lso_capab = NULL;
    }

    if (ill->ill_dld_capab != NULL) {
        kmem_free(ill->ill_dld_capab, sizeof (ill_dld_capab_t));
        ill->ill_dld_capab = NULL;
    }

    while (ill->ill_ipif != NULL)
        ipif_free_tail(ill->ill_ipif);

    /*
     * We have removed all references to ilm from conn and the ones joined
     * within the kernel.
     *
     * We don't walk conns, mrts and ires because
     *
     * 1) update_conn_ill and reset_mrt_ill cleans up conns and mrts.
     * 2) ill_down -> ill_downi walks all the ires and cleans up
     *    ill references.
     */

    /*
     * If this ill is an IPMP meta-interface, blow away the illgrp. This
     * is safe to do because the illgrp has already been unlinked from the
     * group by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find it.
     */
    if (IS_IPMP(ill)) {
        ipmp_illgrp_destroy(ill->ill_grp);
        ill->ill_grp = NULL;
    }

    /*
     * Take us out of the list of ILLs. ill_glist_delete -> phyint_free
     * could free the phyint. No more reference to the phyint after this
     * point.
     */
    (void) ill_glist_delete(ill);

    if (ill->ill_frag_ptr != NULL) {
        uint_t count;

        for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
            mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock);
        }
        mi_free(ill->ill_frag_ptr);
        ill->ill_frag_ptr = NULL;
        ill->ill_frag_hash_tbl = NULL;
    }

    freemsg(ill->ill_nd_lla_mp);
    /* Free all retained control messages. */
    mpp = &ill->ill_first_mp_to_free;
    do {
        while (mpp[0]) {
            mblk_t *mp;
            mblk_t *mp1;

            mp = mpp[0];
            mpp[0] = mp->b_next;
            for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) {
                mp1->b_next = NULL;
                mp1->b_prev = NULL;
            }
            freemsg(mp);
        }
    } while (mpp++ != &ill->ill_last_mp_to_free);

    ill_free_mib(ill);

#ifdef DEBUG
    ill_trace_cleanup(ill);
#endif

    /* The default multicast interface might have changed */
    ire_increment_multicast_generation(ipst, ill->ill_isv6);

    /* Drop refcnt here */
    netstack_rele(ill->ill_ipst->ips_netstack);
    ill->ill_ipst = NULL;
}

static void
ill_free_mib(ill_t *ill)
{
    ip_stack_t *ipst = ill->ill_ipst;

    /*
     * MIB statistics must not be lost, so when an interface
     * goes away the counter values will be added to the global
     * MIBs.
     */
    if (ill->ill_ip_mib != NULL) {
        if (ill->ill_isv6) {
            ip_mib2_add_ip_stats(&ipst->ips_ip6_mib,
                ill->ill_ip_mib);
        } else {
            ip_mib2_add_ip_stats(&ipst->ips_ip_mib,
                ill->ill_ip_mib);
        }

        kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
        ill->ill_ip_mib = NULL;
    }
    if (ill->ill_icmp6_mib != NULL) {
        ip_mib2_add_icmp6_stats(&ipst->ips_icmp6_mib,
            ill->ill_icmp6_mib);
        kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib));
        ill->ill_icmp6_mib = NULL;
    }
}

/*
 * Concatenate together a physical address and a sap.
 *
 * Sap_lengths are interpreted as follows:
 *   sap_length == 0	==> no sap
 *   sap_length > 0	==> sap is at the head of the dlpi address
 *   sap_length < 0	==> sap is at the tail of the dlpi address
 */
static void
ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length,
    t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst)
{
    uint16_t sap_addr = (uint16_t)sap_src;

    if (sap_length == 0) {
        if (phys_src == NULL)
            bzero(dst, phys_length);
        else
            bcopy(phys_src, dst, phys_length);
    } else if (sap_length < 0) {
        if (phys_src == NULL)
            bzero(dst, phys_length);
        else
            bcopy(phys_src, dst, phys_length);
        bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr));
    } else {
        bcopy(&sap_addr, dst, sizeof (sap_addr));
        if (phys_src == NULL)
            bzero((char *)dst + sap_length, phys_length);
        else
            bcopy(phys_src, (char *)dst + sap_length, phys_length);
    }
}

/*
 * Generate a dl_unitdata_req mblk for the device and address given.
 * addr_length is the length of the physical portion of the address.
 * If addr is NULL, include an all-zero address of the specified length.
 * In any case, addr_length is taken to be the entire length of the
 * dlpi address, including the absolute value of sap_length.
 */
mblk_t *
ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap,
    t_scalar_t sap_length)
{
    dl_unitdata_req_t *dlur;
    mblk_t *mp;
    t_scalar_t abs_sap_length;  /* absolute value */

    abs_sap_length = ABS(sap_length);
    mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length,
        DL_UNITDATA_REQ);
    if (mp == NULL)
        return (NULL);
    dlur = (dl_unitdata_req_t *)mp->b_rptr;
    /* HACK: accommodate incompatible DLPI drivers */
    if (addr_length == 8)
        addr_length = 6;
    dlur->dl_dest_addr_length = addr_length + abs_sap_length;
    dlur->dl_dest_addr_offset = sizeof (*dlur);
    dlur->dl_priority.dl_min = 0;
    dlur->dl_priority.dl_max = 0;
    ill_dlur_copy_address(addr, addr_length, sap, sap_length,
        (uchar_t *)&dlur[1]);
    return (mp);
}
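/*
 * For illustration, the three sap_length cases above produce the
 * following DLPI address layouts:
 *
 *	sap_length == 0:   | phys (phys_length bytes)        |
 *	sap_length > 0:    | sap  | phys (phys_length bytes) |
 *	sap_length < 0:    | phys (phys_length bytes) | sap  |
 *
 * An Ethernet driver, for example, typically reports a 6-byte physical
 * address and sap_length == -2, so a dl_unitdata_req built by
 * ill_dlur_gen() carries an 8-byte destination address: the MAC address
 * followed by the 2-byte sap.
 */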
/*
 * Add the pending mp to the list. There can be only 1 pending mp
 * in the list. Any exclusive ioctl that needs to wait for a response
 * from another module or driver needs to use this function to set
 * the ipx_pending_mp to the ioctl mblk and wait for the response from
 * the other module/driver. This is also used while waiting for the
 * ipif/ill/ire refcnts to drop to zero in bringing down an ipif.
 */
boolean_t
ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp,
    int waitfor)
{
    ipxop_t *ipx = ipif->ipif_ill->ill_phyint->phyint_ipsq->ipsq_xop;

    ASSERT(IAM_WRITER_IPIF(ipif));
    ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
    ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
    ASSERT(ipx->ipx_pending_mp == NULL);
    /*
     * The caller may be using a different ipif than the one passed into
     * ipsq_current_start() (e.g., suppose an ioctl that came in on the V4
     * ill needs to wait for the V6 ill to quiesce). So we can't ASSERT
     * that `ipx_current_ipif == ipif'.
     */
    ASSERT(ipx->ipx_current_ipif != NULL);

    /*
     * M_IOCDATA from ioctls, M_ERROR/M_HANGUP/M_PROTO/M_PCPROTO from the
     * driver.
     */
    ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_ERROR) ||
        (DB_TYPE(add_mp) == M_HANGUP) || (DB_TYPE(add_mp) == M_PROTO) ||
        (DB_TYPE(add_mp) == M_PCPROTO));

    if (connp != NULL) {
        ASSERT(MUTEX_HELD(&connp->conn_lock));
        /*
         * Return error if the conn has started closing. The conn
         * could have finished cleaning up the pending mp list;
         * if so, we should not add another mp to the list, negating
         * the cleanup.
         */
        if (connp->conn_state_flags & CONN_CLOSING)
            return (B_FALSE);
    }
    mutex_enter(&ipx->ipx_lock);
    ipx->ipx_pending_ipif = ipif;
    /*
     * Note down the queue in b_queue. This will be returned by
     * ipsq_pending_mp_get. The caller will then use these values to
     * restart the processing.
     */
    add_mp->b_next = NULL;
    add_mp->b_queue = q;
    ipx->ipx_pending_mp = add_mp;
    ipx->ipx_waitfor = waitfor;
    mutex_exit(&ipx->ipx_lock);

    if (connp != NULL)
        connp->conn_oper_pending_ill = ipif->ipif_ill;

    return (B_TRUE);
}

/*
 * Retrieve the ipx_pending_mp and return it. There can be only 1 mp
 * queued in the list.
 */
mblk_t *
ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp)
{
    mblk_t *curr = NULL;
    ipxop_t *ipx = ipsq->ipsq_xop;

    *connpp = NULL;
    mutex_enter(&ipx->ipx_lock);
    if (ipx->ipx_pending_mp == NULL) {
        mutex_exit(&ipx->ipx_lock);
        return (NULL);
    }

    /* There can be only 1 such excl message */
    curr = ipx->ipx_pending_mp;
    ASSERT(curr->b_next == NULL);
    ipx->ipx_pending_ipif = NULL;
    ipx->ipx_pending_mp = NULL;
    ipx->ipx_waitfor = 0;
    mutex_exit(&ipx->ipx_lock);

    if (CONN_Q(curr->b_queue)) {
        /*
         * This mp did a refhold on the conn, at the start of the ioctl.
         * So we can safely return a pointer to the conn to the caller.
         */
        *connpp = Q_TO_CONN(curr->b_queue);
    } else {
        *connpp = NULL;
    }
    curr->b_next = NULL;
    curr->b_prev = NULL;
    return (curr);
}
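/*
 * The two functions above form a pair: an exclusive operation that must
 * wait for a response from a driver, or for refcnts to drain, parks its
 * mblk with ipsq_pending_mp_add() and returns; when the wait condition
 * clears, the mblk is picked up again with ipsq_pending_mp_get() and
 * processing restarts on the queue that was stashed in b_queue. At most
 * one such mblk can be outstanding per ipsq at any time.
 */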
/*
 * Cleanup the ioctl mp queued in ipx_pending_mp
 * - Called in the ill_delete path
 * - Called in the M_ERROR or M_HANGUP path on the ill.
 * - Called in the conn close path.
 *
 * Returns success on finding the pending mblk associated with the ioctl or
 * exclusive operation in progress, failure otherwise.
 */
boolean_t
ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp)
{
    mblk_t *mp;
    ipxop_t *ipx;
    queue_t *q;
    ipif_t *ipif;
    int cmd;

    ASSERT(IAM_WRITER_ILL(ill));
    ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop;

    mutex_enter(&ipx->ipx_lock);
    mp = ipx->ipx_pending_mp;
    if (connp != NULL) {
        if (mp == NULL || mp->b_queue != CONNP_TO_WQ(connp)) {
            /*
             * Nothing to clean since the conn that is closing
             * does not have a matching pending mblk in
             * ipx_pending_mp.
             */
            mutex_exit(&ipx->ipx_lock);
            return (B_FALSE);
        }
    } else {
        /*
         * A non-zero ill_error signifies we are called in the
         * M_ERROR or M_HANGUP path and we need to unconditionally
         * abort any current ioctl and do the corresponding cleanup.
         * A zero ill_error means we are in the ill_delete path and
         * we do the cleanup only if there is a pending mp.
         */
        if (mp == NULL && ill->ill_error == 0) {
            mutex_exit(&ipx->ipx_lock);
            return (B_FALSE);
        }
    }

    /* Now remove from the ipx_pending_mp */
    ipx->ipx_pending_mp = NULL;
    ipif = ipx->ipx_pending_ipif;
    ipx->ipx_pending_ipif = NULL;
    ipx->ipx_waitfor = 0;
    ipx->ipx_current_ipif = NULL;
    cmd = ipx->ipx_current_ioctl;
    ipx->ipx_current_ioctl = 0;
    ipx->ipx_current_done = B_TRUE;
    mutex_exit(&ipx->ipx_lock);

    if (mp == NULL)
        return (B_FALSE);

    q = mp->b_queue;
    mp->b_next = NULL;
    mp->b_prev = NULL;
    mp->b_queue = NULL;

    if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) {
        DTRACE_PROBE4(ipif__ioctl,
            char *, "ipsq_pending_mp_cleanup",
            int, cmd, ill_t *, ipif == NULL ? NULL : ipif->ipif_ill,
            ipif_t *, ipif);
        if (connp == NULL) {
            ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL);
        } else {
            ip_ioctl_finish(q, mp, ENXIO, CONN_CLOSE, NULL);
            mutex_enter(&ipif->ipif_ill->ill_lock);
            ipif->ipif_state_flags &= ~IPIF_CHANGING;
            mutex_exit(&ipif->ipif_ill->ill_lock);
        }
    } else {
        inet_freemsg(mp);
    }
    return (B_TRUE);
}

/*
 * Called in the conn close path and ill delete path
 */
static void
ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp)
{
    ipsq_t *ipsq;
    mblk_t *prev;
    mblk_t *curr;
    mblk_t *next;
    queue_t *wq, *rq = NULL;
    mblk_t *tmp_list = NULL;

    ASSERT(IAM_WRITER_ILL(ill));
    if (connp != NULL)
        wq = CONNP_TO_WQ(connp);
    else
        wq = ill->ill_wq;

    /*
     * In the case of lo0 being unplumbed, ill_wq will be NULL. Guard
     * against this here.
     */
    if (wq != NULL)
        rq = RD(wq);

    ipsq = ill->ill_phyint->phyint_ipsq;
    /*
     * Clean up any ioctl mp's queued in ipsq_xopq_pending_mp.
     * In the case of an ioctl from a conn, there can be only 1 mp
     * queued on the ipsq. If an ill is being unplumbed, flush all
     * the messages.
     */
    mutex_enter(&ipsq->ipsq_lock);
    for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL;
        curr = next) {
        next = curr->b_next;
        if (connp == NULL ||
            (curr->b_queue == wq || curr->b_queue == rq)) {
            /* Unlink the mblk from the pending mp list */
            if (prev != NULL) {
                prev->b_next = curr->b_next;
            } else {
                ASSERT(ipsq->ipsq_xopq_mphead == curr);
                ipsq->ipsq_xopq_mphead = curr->b_next;
            }
            if (ipsq->ipsq_xopq_mptail == curr)
                ipsq->ipsq_xopq_mptail = prev;
            /*
             * Create a temporary list and release the ipsq lock.
             * New elements are added to the head of the tmp_list.
             */
            curr->b_next = tmp_list;
            tmp_list = curr;
        } else {
            prev = curr;
        }
    }
    mutex_exit(&ipsq->ipsq_lock);

    while (tmp_list != NULL) {
        curr = tmp_list;
        tmp_list = curr->b_next;
        curr->b_next = NULL;
        curr->b_prev = NULL;
        wq = curr->b_queue;
        curr->b_queue = NULL;
        if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) {
            DTRACE_PROBE4(ipif__ioctl,
                char *, "ipsq_xopq_mp_cleanup",
                int, 0, ill_t *, NULL, ipif_t *, NULL);
            ip_ioctl_finish(wq, curr, ENXIO, connp != NULL ?
                CONN_CLOSE : NO_COPYOUT, NULL);
        } else {
            /*
             * IP-MT XXX In the case of TLI/XTI bind / optmgmt
             * this can't be just inet_freemsg; we have to
             * restart it, otherwise the thread will be stuck.
             */
            inet_freemsg(curr);
        }
    }
}

/*
 * This conn has started closing. Cleanup any pending ioctl from this conn.
 * STREAMS ensures that there can be at most 1 active ioctl on a stream.
 */
void
conn_ioctl_cleanup(conn_t *connp)
{
    ipsq_t *ipsq;
    ill_t *ill;
    boolean_t refheld;

    /*
     * Check for a queued ioctl. If the ioctl has not yet started, the mp
     * is pending in the list headed by ipsq_xopq_mphead. If the ioctl has
     * started, the mp could be present in ipx_pending_mp. Note that if
     * conn_oper_pending_ill is NULL, the ioctl may still be in flight and
     * not yet queued anywhere. In this case, the conn close code will wait
     * until the conn_ref is dropped. If the stream was a tcp stream, then
     * tcp_close will wait first until all ioctls have completed for this
     * conn.
     */
    mutex_enter(&connp->conn_lock);
    ill = connp->conn_oper_pending_ill;
    if (ill == NULL) {
        mutex_exit(&connp->conn_lock);
        return;
    }

    /*
     * We may not be able to refhold the ill if the ill/ipif
     * is changing. But we need to make sure that the ill will
     * not vanish. So we just bump up the ill_waiter count.
     */
    refheld = ill_waiter_inc(ill);
    mutex_exit(&connp->conn_lock);
    if (refheld) {
        if (ipsq_enter(ill, B_TRUE, NEW_OP)) {
            ill_waiter_dcr(ill);
            /*
             * Check whether this ioctl has started and is
             * pending. If it is not found there then check
             * whether this ioctl has not even started and is in
             * the ipsq_xopq list.
             */
            if (!ipsq_pending_mp_cleanup(ill, connp))
                ipsq_xopq_mp_cleanup(ill, connp);
            ipsq = ill->ill_phyint->phyint_ipsq;
            ipsq_exit(ipsq);
            return;
        }
    }
    /*
     * The ill is also closing and we could not bump up the
     * ill_waiter_count or we could not enter the ipsq. Leave
     * the cleanup to ill_delete.
     */
    mutex_enter(&connp->conn_lock);
    while (connp->conn_oper_pending_ill != NULL)
        cv_wait(&connp->conn_refcv, &connp->conn_lock);
    mutex_exit(&connp->conn_lock);
    if (refheld)
        ill_waiter_dcr(ill);
}

/*
 * ipcl_walk function for cleaning up conn_*_ill fields.
 * Note that we leave ixa_multicast_ifindex, conn_incoming_ifindex, and
 * conn_bound_if in place. We prefer dropping
 * packets instead of sending them out the wrong interface, or accepting
 * packets from the wrong ifindex.
 */
static void
conn_cleanup_ill(conn_t *connp, caddr_t arg)
{
    ill_t *ill = (ill_t *)arg;

    mutex_enter(&connp->conn_lock);
    if (connp->conn_dhcpinit_ill == ill) {
        connp->conn_dhcpinit_ill = NULL;
        ASSERT(ill->ill_dhcpinit != 0);
        atomic_dec_32(&ill->ill_dhcpinit);
        ill_set_inputfn(ill);
    }
    mutex_exit(&connp->conn_lock);
}

static int
ill_down_ipifs_tail(ill_t *ill)
{
    ipif_t *ipif;
    int err;

    ASSERT(IAM_WRITER_ILL(ill));
    for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
        ipif_non_duplicate(ipif);
        /*
         * ipif_down_tail will call arp_ll_down on the last ipif
         * and typically return EINPROGRESS when the DL_UNBIND is sent.
         */
        if ((err = ipif_down_tail(ipif)) != 0)
            return (err);
    }
    return (0);
}

/* ARGSUSED */
void
ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
    ASSERT(IAM_WRITER_IPSQ(ipsq));
    (void) ill_down_ipifs_tail(q->q_ptr);
    freemsg(mp);
    ipsq_current_finish(ipsq);
}

/*
 * ill_down_start is called when we want to down this ill and bring it up
 * again. It is called when we receive an M_ERROR / M_HANGUP. In this case
 * we shut down all interfaces, but don't tear down any plumbing.
 */
boolean_t
ill_down_start(queue_t *q, mblk_t *mp)
{
    ill_t *ill = q->q_ptr;
    ipif_t *ipif;

    ASSERT(IAM_WRITER_ILL(ill));
    mutex_enter(&ill->ill_lock);
    ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS;
    /* no more nce addition allowed */
    mutex_exit(&ill->ill_lock);

    /*
     * It is possible that some ioctl is already in progress while we
     * received the M_ERROR / M_HANGUP, in which case we need to abort
     * the ioctl. ill_down_start() is being processed as CUR_OP rather
     * than as NEW_OP since the cause of the M_ERROR / M_HANGUP may prevent
     * the in-progress ioctl from ever completing.
     *
     * The thread that started the ioctl (if any) must have returned,
     * since we are now executing as writer. After the 2 calls below,
     * the state of the ipsq and the ill would reflect no trace of any
     * pending operation. Subsequently if there is any response to the
     * original ioctl from the driver, it would be discarded as an
     * unsolicited message from the driver.
     */
    (void) ipsq_pending_mp_cleanup(ill, NULL);
    ill_dlpi_clear_deferred(ill);

    for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
        (void) ipif_down(ipif, NULL, NULL);

    ill_down(ill);

    /*
     * Walk all CONNs that can have a reference on an ire or nce for this
     * ill (we actually walk all that now have stale references).
     */
    ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ill->ill_ipst);

    /* With IPv6 we have dce_ifindex. Cleanup for neatness */
    if (ill->ill_isv6)
        dce_cleanup(ill->ill_phyint->phyint_ifindex, ill->ill_ipst);

    ipsq_current_start(ill->ill_phyint->phyint_ipsq, ill->ill_ipif, 0);

    /*
     * Atomically test and add the pending mp if references are active.
     */
    mutex_enter(&ill->ill_lock);
    if (!ill_is_quiescent(ill)) {
        /* call cannot fail since `conn_t *' argument is NULL */
        (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
            mp, ILL_DOWN);
        mutex_exit(&ill->ill_lock);
        return (B_FALSE);
    }
    mutex_exit(&ill->ill_lock);
    return (B_TRUE);
}
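/*
 * The tail of ill_down_start() above is the canonical quiesce pattern
 * used in this file: test ill_is_quiescent() under ill_lock and, if
 * references are still active, park the message via
 * ipsq_pending_mp_add(..., ILL_DOWN) so that the operation is resumed
 * (through ipsq_pending_mp_get()) once the last reference is dropped.
 */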
static void
ill_down(ill_t *ill)
{
    mblk_t *mp;
    ip_stack_t *ipst = ill->ill_ipst;

    /*
     * Blow off any IREs dependent on this ILL.
     * The caller needs to handle conn_ixa_cleanup.
     */
    ill_delete_ires(ill);

    ire_walk_ill(0, 0, ill_downi, ill, ill);

    /* Remove any conn_*_ill depending on this ill */
    ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst);

    /*
     * Free state for additional IREs.
     */
    mutex_enter(&ill->ill_saved_ire_lock);
    mp = ill->ill_saved_ire_mp;
    ill->ill_saved_ire_mp = NULL;
    ill->ill_saved_ire_cnt = 0;
    mutex_exit(&ill->ill_saved_ire_lock);
    freemsg(mp);
}

/*
 * ire_walk routine used to delete every IRE that depends on
 * 'ill'. (Always called as writer, and may only be called from ire_walk.)
 *
 * Note: since the routes added by the kernel are deleted separately,
 * this will only be 1) IRE_IF_CLONE and 2) manually added IRE_INTERFACE.
 *
 * We also remove references on ire_nce_cache entries that refer to the ill.
 */
void
ill_downi(ire_t *ire, char *ill_arg)
{
    ill_t *ill = (ill_t *)ill_arg;
    nce_t *nce;

    mutex_enter(&ire->ire_lock);
    nce = ire->ire_nce_cache;
    if (nce != NULL && nce->nce_ill == ill)
        ire->ire_nce_cache = NULL;
    else
        nce = NULL;
    mutex_exit(&ire->ire_lock);
    if (nce != NULL)
        nce_refrele(nce);
    if (ire->ire_ill == ill) {
        /*
         * The existing interface binding for ire must be
         * deleted before trying to bind the route to another
         * interface. However, since we are using the contents of the
         * ire after ire_delete, the caller has to ensure that
         * CONDEMNED (deleted) ire's are not removed from the list
         * when ire_delete() returns. Currently ill_downi() is
         * only called as part of ire_walk*() routines, so that
         * the irb_refhold() done by ire_walk*() will ensure that
         * ire_delete() does not lead to ire_inactive().
         */
        ASSERT(ire->ire_bucket->irb_refcnt > 0);
        ire_delete(ire);
        if (ire->ire_unbound)
            ire_rebind(ire);
    }
}

/* Remove IRE_IF_CLONE on this ill */
void
ill_downi_if_clone(ire_t *ire, char *ill_arg)
{
    ill_t *ill = (ill_t *)ill_arg;

    ASSERT(ire->ire_type & IRE_IF_CLONE);
    if (ire->ire_ill == ill)
        ire_delete(ire);
}
/* Consume an M_IOCACK of the fastpath probe. */
void
ill_fastpath_ack(ill_t *ill, mblk_t *mp)
{
    mblk_t *mp1 = mp;

    /*
     * If this was the first attempt, turn on fastpath probing.
     */
    mutex_enter(&ill->ill_lock);
    if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS)
        ill->ill_dlpi_fastpath_state = IDS_OK;
    mutex_exit(&ill->ill_lock);

    /* Free the M_IOCACK mblk, hold on to the data */
    mp = mp->b_cont;
    freeb(mp1);
    if (mp == NULL)
        return;
    if (mp->b_cont != NULL)
        nce_fastpath_update(ill, mp);
    else
        ip0dbg(("ill_fastpath_ack: no b_cont\n"));
    freemsg(mp);
}

/*
 * Throw an M_IOCTL message downstream asking "do you know fastpath?"
 * The data portion of the request is a dl_unitdata_req_t template for
 * what we would send downstream in the absence of a fastpath confirmation.
 */
int
ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp)
{
    struct iocblk *ioc;
    mblk_t *mp;

    if (dlur_mp == NULL)
        return (EINVAL);

    mutex_enter(&ill->ill_lock);
    switch (ill->ill_dlpi_fastpath_state) {
    case IDS_FAILED:
        /*
         * Driver NAKed the first fastpath ioctl - assume it doesn't
         * support it.
         */
        mutex_exit(&ill->ill_lock);
        return (ENOTSUP);
    case IDS_UNKNOWN:
        /* This is the first probe */
        ill->ill_dlpi_fastpath_state = IDS_INPROGRESS;
        break;
    default:
        break;
    }
    mutex_exit(&ill->ill_lock);

    if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL)
        return (EAGAIN);

    mp->b_cont = copyb(dlur_mp);
    if (mp->b_cont == NULL) {
        freeb(mp);
        return (EAGAIN);
    }

    ioc = (struct iocblk *)mp->b_rptr;
    ioc->ioc_count = msgdsize(mp->b_cont);

    DTRACE_PROBE3(ill__dlpi, char *, "ill_fastpath_probe",
        char *, "DL_IOC_HDR_INFO", ill_t *, ill);
    putnext(ill->ill_wq, mp);
    return (0);
}

void
ill_capability_probe(ill_t *ill)
{
    mblk_t *mp;

    ASSERT(IAM_WRITER_ILL(ill));

    if (ill->ill_dlpi_capab_state != IDCS_UNKNOWN &&
        ill->ill_dlpi_capab_state != IDCS_FAILED)
        return;

    /*
     * We are starting a new cycle of capability negotiation.
     * Free up the capab reset messages of any previous incarnation.
     * We will do a fresh allocation when we get the response to our probe.
     */
    if (ill->ill_capab_reset_mp != NULL) {
        freemsg(ill->ill_capab_reset_mp);
        ill->ill_capab_reset_mp = NULL;
    }

    ip1dbg(("ill_capability_probe: starting capability negotiation\n"));

    mp = ip_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ);
    if (mp == NULL)
        return;

    ill_capability_send(ill, mp);
    ill->ill_dlpi_capab_state = IDCS_PROBE_SENT;
}
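/*
 * Taken together, ill_capability_probe() above and ill_capability_reset()
 * below drive the state machine in ill_dlpi_capab_state: IDCS_UNKNOWN or
 * IDCS_FAILED -> IDCS_PROBE_SENT when a probe goes out, IDCS_OK once the
 * DL_CAPABILITY_ACK has been processed (elsewhere in this file), and
 * IDCS_RESET_SENT or IDCS_RENEG when capabilities are torn down, with
 * renegotiation kicking off a fresh probe cycle.
 */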
1379 */ 1380 ill->ill_capabilities &= ~(ILL_CAPAB_HCKSUM | ILL_CAPAB_ZEROCOPY); 1381 } 1382 1383 static void 1384 ill_capability_reset_alloc(ill_t *ill) 1385 { 1386 mblk_t *mp; 1387 size_t size = 0; 1388 int err; 1389 dl_capability_req_t *capb; 1390 1391 ASSERT(IAM_WRITER_ILL(ill)); 1392 ASSERT(ill->ill_capab_reset_mp == NULL); 1393 1394 if (ILL_HCKSUM_CAPABLE(ill)) { 1395 size += sizeof (dl_capability_sub_t) + 1396 sizeof (dl_capab_hcksum_t); 1397 } 1398 1399 if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) { 1400 size += sizeof (dl_capability_sub_t) + 1401 sizeof (dl_capab_zerocopy_t); 1402 } 1403 1404 if (ill->ill_capabilities & ILL_CAPAB_DLD) { 1405 size += sizeof (dl_capability_sub_t) + 1406 sizeof (dl_capab_dld_t); 1407 } 1408 1409 mp = allocb_wait(size + sizeof (dl_capability_req_t), BPRI_MED, 1410 STR_NOSIG, &err); 1411 1412 mp->b_datap->db_type = M_PROTO; 1413 bzero(mp->b_rptr, size + sizeof (dl_capability_req_t)); 1414 1415 capb = (dl_capability_req_t *)mp->b_rptr; 1416 capb->dl_primitive = DL_CAPABILITY_REQ; 1417 capb->dl_sub_offset = sizeof (dl_capability_req_t); 1418 capb->dl_sub_length = size; 1419 1420 mp->b_wptr += sizeof (dl_capability_req_t); 1421 1422 /* 1423 * Each handler fills in the corresponding dl_capability_sub_t 1424 * inside the mblk, 1425 */ 1426 ill_capability_hcksum_reset_fill(ill, mp); 1427 ill_capability_zerocopy_reset_fill(ill, mp); 1428 ill_capability_dld_reset_fill(ill, mp); 1429 1430 ill->ill_capab_reset_mp = mp; 1431 } 1432 1433 static void 1434 ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers) 1435 { 1436 dl_capab_id_t *id_ic; 1437 uint_t sub_dl_cap = outers->dl_cap; 1438 dl_capability_sub_t *inners; 1439 uint8_t *capend; 1440 1441 ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER); 1442 1443 /* 1444 * Note: range checks here are not absolutely sufficient to 1445 * make us robust against malformed messages sent by drivers; 1446 * this is in keeping with the rest of IP's dlpi handling. 1447 * (Remember, it's coming from something else in the kernel 1448 * address space) 1449 */ 1450 1451 capend = (uint8_t *)(outers + 1) + outers->dl_length; 1452 if (capend > mp->b_wptr) { 1453 cmn_err(CE_WARN, "ill_capability_id_ack: " 1454 "malformed sub-capability too long for mblk"); 1455 return; 1456 } 1457 1458 id_ic = (dl_capab_id_t *)(outers + 1); 1459 1460 if (outers->dl_length < sizeof (*id_ic) || 1461 (inners = &id_ic->id_subcap, 1462 inners->dl_length > (outers->dl_length - sizeof (*inners)))) { 1463 cmn_err(CE_WARN, "ill_capability_id_ack: malformed " 1464 "encapsulated capab type %d too long for mblk", 1465 inners->dl_cap); 1466 return; 1467 } 1468 1469 if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) { 1470 ip1dbg(("ill_capability_id_ack: mid token for capab type %d " 1471 "isn't as expected; pass-thru module(s) detected, " 1472 "discarding capability\n", inners->dl_cap)); 1473 return; 1474 } 1475 1476 /* Process the encapsulated sub-capability */ 1477 ill_capability_dispatch(ill, mp, inners); 1478 } 1479 1480 static void 1481 ill_capability_dld_reset_fill(ill_t *ill, mblk_t *mp) 1482 { 1483 dl_capability_sub_t *dl_subcap; 1484 1485 if (!(ill->ill_capabilities & ILL_CAPAB_DLD)) 1486 return; 1487 1488 /* 1489 * The dl_capab_dld_t that follows the dl_capability_sub_t is not 1490 * initialized below since it is not used by DLD. 
static void
ill_capability_dld_reset_fill(ill_t *ill, mblk_t *mp)
{
    dl_capability_sub_t *dl_subcap;

    if (!(ill->ill_capabilities & ILL_CAPAB_DLD))
        return;

    /*
     * The dl_capab_dld_t that follows the dl_capability_sub_t is not
     * initialized below since it is not used by DLD.
     */
    dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
    dl_subcap->dl_cap = DL_CAPAB_DLD;
    dl_subcap->dl_length = sizeof (dl_capab_dld_t);

    mp->b_wptr += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t);
}

static void
ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp)
{
    /*
     * If no ipif was brought up over this ill, this DL_CAPABILITY_REQ/ACK
     * is only to get the VRRP capability.
     *
     * Note that we cannot check ill_ipif_up_count here since
     * ill_ipif_up_count is only incremented when the resolver is setup.
     * That is done asynchronously, and can race with this function.
     */
    if (!ill->ill_dl_up) {
        if (subp->dl_cap == DL_CAPAB_VRRP)
            ill_capability_vrrp_ack(ill, mp, subp);
        return;
    }

    switch (subp->dl_cap) {
    case DL_CAPAB_HCKSUM:
        ill_capability_hcksum_ack(ill, mp, subp);
        break;
    case DL_CAPAB_ZEROCOPY:
        ill_capability_zerocopy_ack(ill, mp, subp);
        break;
    case DL_CAPAB_DLD:
        ill_capability_dld_ack(ill, mp, subp);
        break;
    case DL_CAPAB_VRRP:
        break;
    default:
        ip1dbg(("ill_capability_dispatch: unknown capab type %d\n",
            subp->dl_cap));
    }
}

/*
 * Process the vrrp capability received from a DLS Provider. isub must point
 * to the sub-capability (DL_CAPAB_VRRP) of a DL_CAPABILITY_ACK message.
 */
static void
ill_capability_vrrp_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
    dl_capab_vrrp_t *vrrp;
    uint_t sub_dl_cap = isub->dl_cap;
    uint8_t *capend;

    ASSERT(IAM_WRITER_ILL(ill));
    ASSERT(sub_dl_cap == DL_CAPAB_VRRP);

    /*
     * Note: range checks here are not absolutely sufficient to
     * make us robust against malformed messages sent by drivers;
     * this is in keeping with the rest of IP's dlpi handling.
     * (Remember, it's coming from something else in the kernel
     * address space)
     */
    capend = (uint8_t *)(isub + 1) + isub->dl_length;
    if (capend > mp->b_wptr) {
        cmn_err(CE_WARN, "ill_capability_vrrp_ack: "
            "malformed sub-capability too long for mblk");
        return;
    }
    vrrp = (dl_capab_vrrp_t *)(isub + 1);

    /*
     * Compare the IP address family and set ILLF_VRRP for the right ill.
     */
    if ((vrrp->vrrp_af == AF_INET6 && ill->ill_isv6) ||
        (vrrp->vrrp_af == AF_INET && !ill->ill_isv6)) {
        ill->ill_flags |= ILLF_VRRP;
    }
}
/*
 * Process a hardware checksum offload capability negotiation ack received
 * from a DLS Provider. isub must point to the sub-capability
 * (DL_CAPAB_HCKSUM) of a DL_CAPABILITY_ACK message.
 */
static void
ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
    dl_capability_req_t *ocap;
    dl_capab_hcksum_t *ihck, *ohck;
    ill_hcksum_capab_t **ill_hcksum;
    mblk_t *nmp = NULL;
    uint_t sub_dl_cap = isub->dl_cap;
    uint8_t *capend;

    ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM);

    ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab;

    /*
     * Note: range checks here are not absolutely sufficient to
     * make us robust against malformed messages sent by drivers;
     * this is in keeping with the rest of IP's dlpi handling.
     * (Remember, it's coming from something else in the kernel
     * address space)
     */
    capend = (uint8_t *)(isub + 1) + isub->dl_length;
    if (capend > mp->b_wptr) {
        cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
            "malformed sub-capability too long for mblk");
        return;
    }

    /*
     * There are two types of acks we process here:
     * 1. acks in reply to a (first form) generic capability req
     *    (no ENABLE flag set)
     * 2. acks in reply to an ENABLE capability req.
     *    (ENABLE flag set)
     */
    ihck = (dl_capab_hcksum_t *)(isub + 1);

    if (ihck->hcksum_version != HCKSUM_VERSION_1) {
        cmn_err(CE_CONT, "ill_capability_hcksum_ack: "
            "unsupported hardware checksum "
            "sub-capability (version %d, expected %d)",
            ihck->hcksum_version, HCKSUM_VERSION_1);
        return;
    }

    if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) {
        ip1dbg(("ill_capability_hcksum_ack: mid token for hardware "
            "checksum capability isn't as expected; pass-thru "
            "module(s) detected, discarding capability\n"));
        return;
    }

#define CURR_HCKSUM_CAPAB                               \
    (HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 |        \
    HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM)
    if ((ihck->hcksum_txflags & HCKSUM_ENABLE) &&
        (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) {
        /* do ENABLE processing */
        if (*ill_hcksum == NULL) {
            *ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t),
                KM_NOSLEEP);

            if (*ill_hcksum == NULL) {
                cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
                    "could not enable hcksum version %d "
                    "for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION,
                    ill->ill_name);
                return;
            }
        }

        (*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version;
        (*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags;
        ill->ill_capabilities |= ILL_CAPAB_HCKSUM;
        ip1dbg(("ill_capability_hcksum_ack: interface %s "
            "has enabled hardware checksumming\n ",
            ill->ill_name));
    } else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) {
        /*
         * Enabling hardware checksum offload.
         * Currently IP supports {TCP,UDP}/IPv4
         * partial and full cksum offload and
         * IPv4 header checksum offload.
         * Allocate a new mblk which will
         * contain a new capability request
         * to enable hardware checksum offload.
         */
        uint_t size;
        uchar_t *rptr;

        size = sizeof (dl_capability_req_t) +
            sizeof (dl_capability_sub_t) + isub->dl_length;

        if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
            cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
                "could not enable hardware cksum for %s (ENOMEM)\n",
                ill->ill_name);
            return;
        }

        rptr = nmp->b_rptr;
        /* initialize dl_capability_req_t */
        ocap = (dl_capability_req_t *)nmp->b_rptr;
        ocap->dl_sub_offset =
            sizeof (dl_capability_req_t);
        ocap->dl_sub_length =
            sizeof (dl_capability_sub_t) +
            isub->dl_length;
        nmp->b_rptr += sizeof (dl_capability_req_t);

        /* initialize dl_capability_sub_t */
        bcopy(isub, nmp->b_rptr, sizeof (*isub));
        nmp->b_rptr += sizeof (*isub);

        /* initialize dl_capab_hcksum_t */
        ohck = (dl_capab_hcksum_t *)nmp->b_rptr;
        bcopy(ihck, ohck, sizeof (*ihck));

        nmp->b_rptr = rptr;
        ASSERT(nmp->b_wptr == (nmp->b_rptr + size));

        /* Set ENABLE flag */
        ohck->hcksum_txflags &= CURR_HCKSUM_CAPAB;
        ohck->hcksum_txflags |= HCKSUM_ENABLE;

        /*
         * nmp points to a DL_CAPABILITY_REQ message to enable
         * hardware checksum acceleration.
         */
        ill_capability_send(ill, nmp);
    } else {
        ip1dbg(("ill_capability_hcksum_ack: interface %s has "
            "advertised %x hardware checksum capability flags\n",
            ill->ill_name, ihck->hcksum_txflags));
    }
}
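/*
 * For reference, a successful checksum negotiation is a two round-trip
 * exchange, roughly:
 *
 *	IP -> driver:  DL_CAPABILITY_REQ                     (probe)
 *	driver -> IP:  DL_CAPABILITY_ACK, DL_CAPAB_HCKSUM, no HCKSUM_ENABLE
 *	IP -> driver:  DL_CAPABILITY_REQ with HCKSUM_ENABLE set
 *	driver -> IP:  DL_CAPABILITY_ACK with HCKSUM_ENABLE set
 *
 * which is why ill_capability_hcksum_ack() above distinguishes acks with
 * and without the ENABLE flag.
 */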
static void
ill_capability_hcksum_reset_fill(ill_t *ill, mblk_t *mp)
{
    dl_capab_hcksum_t *hck_subcap;
    dl_capability_sub_t *dl_subcap;

    if (!ILL_HCKSUM_CAPABLE(ill))
        return;

    ASSERT(ill->ill_hcksum_capab != NULL);

    dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
    dl_subcap->dl_cap = DL_CAPAB_HCKSUM;
    dl_subcap->dl_length = sizeof (*hck_subcap);

    hck_subcap = (dl_capab_hcksum_t *)(dl_subcap + 1);
    hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version;
    hck_subcap->hcksum_txflags = 0;

    mp->b_wptr += sizeof (*dl_subcap) + sizeof (*hck_subcap);
}

static void
ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
    mblk_t *nmp = NULL;
    dl_capability_req_t *oc;
    dl_capab_zerocopy_t *zc_ic, *zc_oc;
    ill_zerocopy_capab_t **ill_zerocopy_capab;
    uint_t sub_dl_cap = isub->dl_cap;
    uint8_t *capend;

    ASSERT(sub_dl_cap == DL_CAPAB_ZEROCOPY);

    ill_zerocopy_capab = (ill_zerocopy_capab_t **)&ill->ill_zerocopy_capab;

    /*
     * Note: range checks here are not absolutely sufficient to
     * make us robust against malformed messages sent by drivers;
     * this is in keeping with the rest of IP's dlpi handling.
     * (Remember, it's coming from something else in the kernel
     * address space)
     */
    capend = (uint8_t *)(isub + 1) + isub->dl_length;
    if (capend > mp->b_wptr) {
        cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
            "malformed sub-capability too long for mblk");
        return;
    }

    zc_ic = (dl_capab_zerocopy_t *)(isub + 1);
    if (zc_ic->zerocopy_version != ZEROCOPY_VERSION_1) {
        cmn_err(CE_CONT, "ill_capability_zerocopy_ack: "
            "unsupported ZEROCOPY sub-capability (version %d, "
            "expected %d)", zc_ic->zerocopy_version,
            ZEROCOPY_VERSION_1);
        return;
    }

    if (!dlcapabcheckqid(&zc_ic->zerocopy_mid, ill->ill_lmod_rq)) {
        ip1dbg(("ill_capability_zerocopy_ack: mid token for zerocopy "
            "capability isn't as expected; pass-thru module(s) "
            "detected, discarding capability\n"));
        return;
    }

    if ((zc_ic->zerocopy_flags & DL_CAPAB_VMSAFE_MEM) != 0) {
        if (*ill_zerocopy_capab == NULL) {
            *ill_zerocopy_capab =
                kmem_zalloc(sizeof (ill_zerocopy_capab_t),
                KM_NOSLEEP);

            if (*ill_zerocopy_capab == NULL) {
                cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
                    "could not enable Zero-copy version %d "
                    "for %s (ENOMEM)\n", ZEROCOPY_VERSION_1,
                    ill->ill_name);
                return;
            }
        }

        ip1dbg(("ill_capability_zerocopy_ack: interface %s "
            "supports Zero-copy version %d\n", ill->ill_name,
            ZEROCOPY_VERSION_1));

        (*ill_zerocopy_capab)->ill_zerocopy_version =
            zc_ic->zerocopy_version;
        (*ill_zerocopy_capab)->ill_zerocopy_flags =
            zc_ic->zerocopy_flags;

        ill->ill_capabilities |= ILL_CAPAB_ZEROCOPY;
    } else {
        uint_t size;
        uchar_t *rptr;

        size = sizeof (dl_capability_req_t) +
            sizeof (dl_capability_sub_t) +
            sizeof (dl_capab_zerocopy_t);

        if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
            cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
                "could not enable zerocopy for %s (ENOMEM)\n",
                ill->ill_name);
            return;
        }

        rptr = nmp->b_rptr;
        /* initialize dl_capability_req_t */
        oc = (dl_capability_req_t *)rptr;
        oc->dl_sub_offset = sizeof (dl_capability_req_t);
        oc->dl_sub_length = sizeof (dl_capability_sub_t) +
            sizeof (dl_capab_zerocopy_t);
        rptr += sizeof (dl_capability_req_t);

        /* initialize dl_capability_sub_t */
        bcopy(isub, rptr, sizeof (*isub));
        rptr += sizeof (*isub);

        /* initialize dl_capab_zerocopy_t */
        zc_oc = (dl_capab_zerocopy_t *)rptr;
        *zc_oc = *zc_ic;

        ip1dbg(("ill_capability_zerocopy_ack: asking interface %s "
            "to enable zero-copy version %d\n", ill->ill_name,
            ZEROCOPY_VERSION_1));

        /* set VMSAFE_MEM flag */
        zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM;

        /* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */
        ill_capability_send(ill, nmp);
    }
}
ill->ill_zerocopy_capab->ill_zerocopy_version;
1867 	zerocopy_subcap->zerocopy_flags = 0;
1868 
1869 	mp->b_wptr += sizeof (*dl_subcap) + sizeof (*zerocopy_subcap);
1870 }
1871 
1872 /*
1873  * DLD capability
1874  * Refer to dld.h for more information regarding the purpose and usage
1875  * of this capability.
1876  */
1877 static void
1878 ill_capability_dld_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
1879 {
1880 	dl_capab_dld_t	*dld_ic, dld;
1881 	uint_t		sub_dl_cap = isub->dl_cap;
1882 	uint8_t		*capend;
1883 	ill_dld_capab_t	*idc;
1884 
1885 	ASSERT(IAM_WRITER_ILL(ill));
1886 	ASSERT(sub_dl_cap == DL_CAPAB_DLD);
1887 
1888 	/*
1889 	 * Note: range checks here are not absolutely sufficient to
1890 	 * make us robust against malformed messages sent by drivers;
1891 	 * this is in keeping with the rest of IP's dlpi handling.
1892 	 * (Remember, it's coming from something else in the kernel
1893 	 * address space)
1894 	 */
1895 	capend = (uint8_t *)(isub + 1) + isub->dl_length;
1896 	if (capend > mp->b_wptr) {
1897 		cmn_err(CE_WARN, "ill_capability_dld_ack: "
1898 		    "malformed sub-capability too long for mblk");
1899 		return;
1900 	}
1901 	dld_ic = (dl_capab_dld_t *)(isub + 1);
1902 	if (dld_ic->dld_version != DLD_CURRENT_VERSION) {
1903 		cmn_err(CE_CONT, "ill_capability_dld_ack: "
1904 		    "unsupported DLD sub-capability (version %d, "
1905 		    "expected %d)", dld_ic->dld_version,
1906 		    DLD_CURRENT_VERSION);
1907 		return;
1908 	}
1909 	if (!dlcapabcheckqid(&dld_ic->dld_mid, ill->ill_lmod_rq)) {
1910 		ip1dbg(("ill_capability_dld_ack: mid token for dld "
1911 		    "capability isn't as expected; pass-thru module(s) "
1912 		    "detected, discarding capability\n"));
1913 		return;
1914 	}
1915 
1916 	/*
1917 	 * Copy locally to ensure alignment.
1918 	 */
1919 	bcopy(dld_ic, &dld, sizeof (dl_capab_dld_t));
1920 
1921 	if ((idc = ill->ill_dld_capab) == NULL) {
1922 		idc = kmem_zalloc(sizeof (ill_dld_capab_t), KM_NOSLEEP);
1923 		if (idc == NULL) {
1924 			cmn_err(CE_WARN, "ill_capability_dld_ack: "
1925 			    "could not enable DLD version %d "
1926 			    "for %s (ENOMEM)\n", DLD_CURRENT_VERSION,
1927 			    ill->ill_name);
1928 			return;
1929 		}
1930 		ill->ill_dld_capab = idc;
1931 	}
1932 	idc->idc_capab_df = (ip_capab_func_t)dld.dld_capab;
1933 	idc->idc_capab_dh = (void *)dld.dld_capab_handle;
1934 	ip1dbg(("ill_capability_dld_ack: interface %s "
1935 	    "supports DLD version %d\n", ill->ill_name, DLD_CURRENT_VERSION));
1936 
1937 	ill_capability_dld_enable(ill);
1938 }
1939 
1940 /*
1941  * Typically capability negotiation between IP and the driver happens via
1942  * DLPI message exchange. However GLD also offers a direct function call
1943  * mechanism to exchange the DLD_DIRECT_CAPAB and DLD_POLL_CAPAB capabilities,
1944  * but arbitrary function calls into IP or GLD are not permitted, since both
1945  * of them are protected by their own perimeter mechanism. The perimeter can
1946  * be viewed as a coarse lock or serialization mechanism. The hierarchy of
1947  * these perimeters is IP -> MAC. Thus, for example, to enable squeue
1948  * polling, IP needs to enter its perimeter, then call ill_mac_perim_enter
1949  * to enter the mac perimeter and then do the direct function calls into
1950  * GLD to enable squeue polling. The ring-related callbacks from the mac into
1951  * the stack to add, bind, quiesce, restart or clean up a ring are all
1952  * protected by the mac perimeter.
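 *
 * As a concrete illustration, the enable path implemented by
 * ill_capability_dld_enable() below is, in outline:
 *
 *	ill_mac_perim_enter(ill, &mph);		(enter the mac perimeter)
 *	ill_capability_direct_enable(ill);	(direct function calls into GLD)
 *	ill_capability_poll_enable(ill);
 *	ill_capability_lso_enable(ill);
 *	ill_mac_perim_exit(ill, mph);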
1953 */ 1954 static void 1955 ill_mac_perim_enter(ill_t *ill, mac_perim_handle_t *mphp) 1956 { 1957 ill_dld_capab_t *idc = ill->ill_dld_capab; 1958 int err; 1959 1960 err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mphp, 1961 DLD_ENABLE); 1962 ASSERT(err == 0); 1963 } 1964 1965 static void 1966 ill_mac_perim_exit(ill_t *ill, mac_perim_handle_t mph) 1967 { 1968 ill_dld_capab_t *idc = ill->ill_dld_capab; 1969 int err; 1970 1971 err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mph, 1972 DLD_DISABLE); 1973 ASSERT(err == 0); 1974 } 1975 1976 boolean_t 1977 ill_mac_perim_held(ill_t *ill) 1978 { 1979 ill_dld_capab_t *idc = ill->ill_dld_capab; 1980 1981 return (idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, NULL, 1982 DLD_QUERY)); 1983 } 1984 1985 static void 1986 ill_capability_direct_enable(ill_t *ill) 1987 { 1988 ill_dld_capab_t *idc = ill->ill_dld_capab; 1989 ill_dld_direct_t *idd = &idc->idc_direct; 1990 dld_capab_direct_t direct; 1991 int rc; 1992 1993 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); 1994 1995 bzero(&direct, sizeof (direct)); 1996 direct.di_rx_cf = (uintptr_t)ip_input; 1997 direct.di_rx_ch = ill; 1998 1999 rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, &direct, 2000 DLD_ENABLE); 2001 if (rc == 0) { 2002 idd->idd_tx_df = (ip_dld_tx_t)direct.di_tx_df; 2003 idd->idd_tx_dh = direct.di_tx_dh; 2004 idd->idd_tx_cb_df = (ip_dld_callb_t)direct.di_tx_cb_df; 2005 idd->idd_tx_cb_dh = direct.di_tx_cb_dh; 2006 idd->idd_tx_fctl_df = (ip_dld_fctl_t)direct.di_tx_fctl_df; 2007 idd->idd_tx_fctl_dh = direct.di_tx_fctl_dh; 2008 ASSERT(idd->idd_tx_cb_df != NULL); 2009 ASSERT(idd->idd_tx_fctl_df != NULL); 2010 ASSERT(idd->idd_tx_df != NULL); 2011 /* 2012 * One time registration of flow enable callback function 2013 */ 2014 ill->ill_flownotify_mh = idd->idd_tx_cb_df(idd->idd_tx_cb_dh, 2015 ill_flow_enable, ill); 2016 ill->ill_capabilities |= ILL_CAPAB_DLD_DIRECT; 2017 DTRACE_PROBE1(direct_on, (ill_t *), ill); 2018 } else { 2019 cmn_err(CE_WARN, "warning: could not enable DIRECT " 2020 "capability, rc = %d\n", rc); 2021 DTRACE_PROBE2(direct_off, (ill_t *), ill, (int), rc); 2022 } 2023 } 2024 2025 static void 2026 ill_capability_poll_enable(ill_t *ill) 2027 { 2028 ill_dld_capab_t *idc = ill->ill_dld_capab; 2029 dld_capab_poll_t poll; 2030 int rc; 2031 2032 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); 2033 2034 bzero(&poll, sizeof (poll)); 2035 poll.poll_ring_add_cf = (uintptr_t)ip_squeue_add_ring; 2036 poll.poll_ring_remove_cf = (uintptr_t)ip_squeue_clean_ring; 2037 poll.poll_ring_quiesce_cf = (uintptr_t)ip_squeue_quiesce_ring; 2038 poll.poll_ring_restart_cf = (uintptr_t)ip_squeue_restart_ring; 2039 poll.poll_ring_bind_cf = (uintptr_t)ip_squeue_bind_ring; 2040 poll.poll_ring_ch = ill; 2041 rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, &poll, 2042 DLD_ENABLE); 2043 if (rc == 0) { 2044 ill->ill_capabilities |= ILL_CAPAB_DLD_POLL; 2045 DTRACE_PROBE1(poll_on, (ill_t *), ill); 2046 } else { 2047 ip1dbg(("warning: could not enable POLL " 2048 "capability, rc = %d\n", rc)); 2049 DTRACE_PROBE2(poll_off, (ill_t *), ill, (int), rc); 2050 } 2051 } 2052 2053 /* 2054 * Enable the LSO capability. 
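 * The request is made through the dld capability function (idc_capab_df)
 * with DLD_CAPAB_LSO and DLD_ENABLE; on success the driver fills in the
 * dld_capab_lso_t, and lso_flags and lso_max are cached in ill_lso_capab.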
2055 */ 2056 static void 2057 ill_capability_lso_enable(ill_t *ill) 2058 { 2059 ill_dld_capab_t *idc = ill->ill_dld_capab; 2060 dld_capab_lso_t lso; 2061 int rc; 2062 2063 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); 2064 2065 if (ill->ill_lso_capab == NULL) { 2066 ill->ill_lso_capab = kmem_zalloc(sizeof (ill_lso_capab_t), 2067 KM_NOSLEEP); 2068 if (ill->ill_lso_capab == NULL) { 2069 cmn_err(CE_WARN, "ill_capability_lso_enable: " 2070 "could not enable LSO for %s (ENOMEM)\n", 2071 ill->ill_name); 2072 return; 2073 } 2074 } 2075 2076 bzero(&lso, sizeof (lso)); 2077 if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, &lso, 2078 DLD_ENABLE)) == 0) { 2079 ill->ill_lso_capab->ill_lso_flags = lso.lso_flags; 2080 ill->ill_lso_capab->ill_lso_max = lso.lso_max; 2081 ill->ill_capabilities |= ILL_CAPAB_LSO; 2082 ip1dbg(("ill_capability_lso_enable: interface %s " 2083 "has enabled LSO\n ", ill->ill_name)); 2084 } else { 2085 kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t)); 2086 ill->ill_lso_capab = NULL; 2087 DTRACE_PROBE2(lso_off, (ill_t *), ill, (int), rc); 2088 } 2089 } 2090 2091 static void 2092 ill_capability_dld_enable(ill_t *ill) 2093 { 2094 mac_perim_handle_t mph; 2095 2096 ASSERT(IAM_WRITER_ILL(ill)); 2097 2098 if (ill->ill_isv6) 2099 return; 2100 2101 ill_mac_perim_enter(ill, &mph); 2102 if (!ill->ill_isv6) { 2103 ill_capability_direct_enable(ill); 2104 ill_capability_poll_enable(ill); 2105 ill_capability_lso_enable(ill); 2106 } 2107 ill->ill_capabilities |= ILL_CAPAB_DLD; 2108 ill_mac_perim_exit(ill, mph); 2109 } 2110 2111 static void 2112 ill_capability_dld_disable(ill_t *ill) 2113 { 2114 ill_dld_capab_t *idc; 2115 ill_dld_direct_t *idd; 2116 mac_perim_handle_t mph; 2117 2118 ASSERT(IAM_WRITER_ILL(ill)); 2119 2120 if (!(ill->ill_capabilities & ILL_CAPAB_DLD)) 2121 return; 2122 2123 ill_mac_perim_enter(ill, &mph); 2124 2125 idc = ill->ill_dld_capab; 2126 if ((ill->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0) { 2127 /* 2128 * For performance we avoid locks in the transmit data path 2129 * and don't maintain a count of the number of threads using 2130 * direct calls. Thus some threads could be using direct 2131 * transmit calls to GLD, even after the capability mechanism 2132 * turns it off. This is still safe since the handles used in 2133 * the direct calls continue to be valid until the unplumb is 2134 * completed. Remove the callback that was added (1-time) at 2135 * capab enable time. 2136 */ 2137 mutex_enter(&ill->ill_lock); 2138 ill->ill_capabilities &= ~ILL_CAPAB_DLD_DIRECT; 2139 mutex_exit(&ill->ill_lock); 2140 if (ill->ill_flownotify_mh != NULL) { 2141 idd = &idc->idc_direct; 2142 idd->idd_tx_cb_df(idd->idd_tx_cb_dh, NULL, 2143 ill->ill_flownotify_mh); 2144 ill->ill_flownotify_mh = NULL; 2145 } 2146 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, 2147 NULL, DLD_DISABLE); 2148 } 2149 2150 if ((ill->ill_capabilities & ILL_CAPAB_DLD_POLL) != 0) { 2151 ill->ill_capabilities &= ~ILL_CAPAB_DLD_POLL; 2152 ip_squeue_clean_all(ill); 2153 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, 2154 NULL, DLD_DISABLE); 2155 } 2156 2157 if ((ill->ill_capabilities & ILL_CAPAB_LSO) != 0) { 2158 ASSERT(ill->ill_lso_capab != NULL); 2159 /* 2160 * Clear the capability flag for LSO but retain the 2161 * ill_lso_capab structure since it's possible that another 2162 * thread is still referring to it. The structure only gets 2163 * deallocated when we destroy the ill. 
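		 * (Clearing the ILL_CAPAB_LSO flag stops new uses of LSO,
		 * while the retained structure keeps any straggling
		 * references safe, mirroring the DLD_CAPAB_DIRECT reasoning
		 * above.)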
2164 */ 2165 2166 ill->ill_capabilities &= ~ILL_CAPAB_LSO; 2167 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, 2168 NULL, DLD_DISABLE); 2169 } 2170 2171 ill->ill_capabilities &= ~ILL_CAPAB_DLD; 2172 ill_mac_perim_exit(ill, mph); 2173 } 2174 2175 /* 2176 * Capability Negotiation protocol 2177 * 2178 * We don't wait for DLPI capability operations to finish during interface 2179 * bringup or teardown. Doing so would introduce more asynchrony and the 2180 * interface up/down operations will need multiple return and restarts. 2181 * Instead the 'ipsq_current_ipif' of the ipsq is not cleared as long as 2182 * the 'ill_dlpi_deferred' chain is non-empty. This ensures that the next 2183 * exclusive operation won't start until the DLPI operations of the previous 2184 * exclusive operation complete. 2185 * 2186 * The capability state machine is shown below. 2187 * 2188 * state next state event, action 2189 * 2190 * IDCS_UNKNOWN IDCS_PROBE_SENT ill_capability_probe 2191 * IDCS_PROBE_SENT IDCS_OK ill_capability_ack 2192 * IDCS_PROBE_SENT IDCS_FAILED ip_rput_dlpi_writer (nack) 2193 * IDCS_OK IDCS_RENEG Receipt of DL_NOTE_CAPAB_RENEG 2194 * IDCS_OK IDCS_RESET_SENT ill_capability_reset 2195 * IDCS_RESET_SENT IDCS_UNKNOWN ill_capability_ack_thr 2196 * IDCS_RENEG IDCS_PROBE_SENT ill_capability_ack_thr -> 2197 * ill_capability_probe. 2198 */ 2199 2200 /* 2201 * Dedicated thread started from ip_stack_init that handles capability 2202 * disable. This thread ensures the taskq dispatch does not fail by waiting 2203 * for resources using TQ_SLEEP. The taskq mechanism is used to ensure 2204 * that direct calls to DLD are done in a cv_waitable context. 2205 */ 2206 void 2207 ill_taskq_dispatch(ip_stack_t *ipst) 2208 { 2209 callb_cpr_t cprinfo; 2210 char name[64]; 2211 mblk_t *mp; 2212 2213 (void) snprintf(name, sizeof (name), "ill_taskq_dispatch_%d", 2214 ipst->ips_netstack->netstack_stackid); 2215 CALLB_CPR_INIT(&cprinfo, &ipst->ips_capab_taskq_lock, callb_generic_cpr, 2216 name); 2217 mutex_enter(&ipst->ips_capab_taskq_lock); 2218 2219 for (;;) { 2220 mp = ipst->ips_capab_taskq_head; 2221 while (mp != NULL) { 2222 ipst->ips_capab_taskq_head = mp->b_next; 2223 if (ipst->ips_capab_taskq_head == NULL) 2224 ipst->ips_capab_taskq_tail = NULL; 2225 mutex_exit(&ipst->ips_capab_taskq_lock); 2226 mp->b_next = NULL; 2227 2228 VERIFY(taskq_dispatch(system_taskq, 2229 ill_capability_ack_thr, mp, TQ_SLEEP) != 0); 2230 mutex_enter(&ipst->ips_capab_taskq_lock); 2231 mp = ipst->ips_capab_taskq_head; 2232 } 2233 2234 if (ipst->ips_capab_taskq_quit) 2235 break; 2236 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2237 cv_wait(&ipst->ips_capab_taskq_cv, &ipst->ips_capab_taskq_lock); 2238 CALLB_CPR_SAFE_END(&cprinfo, &ipst->ips_capab_taskq_lock); 2239 } 2240 VERIFY(ipst->ips_capab_taskq_head == NULL); 2241 VERIFY(ipst->ips_capab_taskq_tail == NULL); 2242 CALLB_CPR_EXIT(&cprinfo); 2243 thread_exit(); 2244 } 2245 2246 /* 2247 * Consume a new-style hardware capabilities negotiation ack. 2248 * Called via taskq on receipt of DL_CAPABILITY_ACK. 
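 *
 * In outline: if a capability reset (or renegotiation) was in progress,
 * this ack completes it by disabling the function-call based capabilities
 * and, for a renegotiation, sending a fresh probe.  Otherwise the
 * sub-capabilities carried in the ack are walked and dispatched to their
 * handlers.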
2249  */
2250 static void
2251 ill_capability_ack_thr(void *arg)
2252 {
2253 	mblk_t	*mp = arg;
2254 	dl_capability_ack_t *capp;
2255 	dl_capability_sub_t *subp, *endp;
2256 	ill_t	*ill;
2257 	boolean_t reneg;
2258 
2259 	ill = (ill_t *)mp->b_prev;
2260 	mp->b_prev = NULL;
2261 
2262 	VERIFY(ipsq_enter(ill, B_FALSE, CUR_OP) == B_TRUE);
2263 
2264 	if (ill->ill_dlpi_capab_state == IDCS_RESET_SENT ||
2265 	    ill->ill_dlpi_capab_state == IDCS_RENEG) {
2266 		/*
2267 		 * We have received the ack for our DL_CAPAB reset request.
2268 		 * There isn't anything in the message that needs processing.
2269 		 * All message-based capabilities have been disabled; now
2270 		 * do the function call based capability disable.
2271 		 */
2272 		reneg = ill->ill_dlpi_capab_state == IDCS_RENEG;
2273 		ill_capability_dld_disable(ill);
2274 		ill->ill_dlpi_capab_state = IDCS_UNKNOWN;
2275 		if (reneg)
2276 			ill_capability_probe(ill);
2277 		goto done;
2278 	}
2279 
2280 	if (ill->ill_dlpi_capab_state == IDCS_PROBE_SENT)
2281 		ill->ill_dlpi_capab_state = IDCS_OK;
2282 
2283 	capp = (dl_capability_ack_t *)mp->b_rptr;
2284 
2285 	if (capp->dl_sub_length == 0) {
2286 		/* no new-style capabilities */
2287 		goto done;
2288 	}
2289 
2290 	/* make sure the driver supplied correct dl_sub_length */
2291 	if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) {
2292 		ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, "
2293 		    "invalid dl_sub_length (%d)\n", capp->dl_sub_length));
2294 		goto done;
2295 	}
2296 
2297 #define	SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset))
2298 	/*
2299 	 * There are sub-capabilities. Process the ones we know about.
2300 	 * Loop until we don't have room for another sub-cap header.
2301 	 */
2302 	for (subp = SC(capp, capp->dl_sub_offset),
2303 	    endp = SC(subp, capp->dl_sub_length - sizeof (*subp));
2304 	    subp <= endp;
2305 	    subp = SC(subp, sizeof (dl_capability_sub_t) + subp->dl_length)) {
2306 
2307 		switch (subp->dl_cap) {
2308 		case DL_CAPAB_ID_WRAPPER:
2309 			ill_capability_id_ack(ill, mp, subp);
2310 			break;
2311 		default:
2312 			ill_capability_dispatch(ill, mp, subp);
2313 			break;
2314 		}
2315 	}
2316 #undef SC
2317 done:
2318 	inet_freemsg(mp);
2319 	ill_capability_done(ill);
2320 	ipsq_exit(ill->ill_phyint->phyint_ipsq);
2321 }
2322 
2323 /*
2324  * This needs to be started in a taskq thread to provide a cv_waitable
2325  * context.
2326  */
2327 void
2328 ill_capability_ack(ill_t *ill, mblk_t *mp)
2329 {
2330 	ip_stack_t	*ipst = ill->ill_ipst;
2331 
2332 	mp->b_prev = (mblk_t *)ill;
2333 	ASSERT(mp->b_next == NULL);
2334 
2335 	if (taskq_dispatch(system_taskq, ill_capability_ack_thr, mp,
2336 	    TQ_NOSLEEP) != 0)
2337 		return;
2338 
2339 	/*
2340 	 * The taskq dispatch failed. Signal the ill_taskq_dispatch thread
2341 	 * which will do the dispatch using TQ_SLEEP to guarantee success.
2342 	 */
2343 	mutex_enter(&ipst->ips_capab_taskq_lock);
2344 	if (ipst->ips_capab_taskq_head == NULL) {
2345 		ASSERT(ipst->ips_capab_taskq_tail == NULL);
2346 		ipst->ips_capab_taskq_head = mp;
2347 	} else {
2348 		ipst->ips_capab_taskq_tail->b_next = mp;
2349 	}
2350 	ipst->ips_capab_taskq_tail = mp;
2351 
2352 	cv_signal(&ipst->ips_capab_taskq_cv);
2353 	mutex_exit(&ipst->ips_capab_taskq_lock);
2354 }
2355 
2356 /*
2357  * This routine is called to scan the fragmentation reassembly table for
2358  * the specified ILL for any packets that are starting to smell.
2359  * dead_interval is the maximum time in seconds that will be tolerated.  It
2360  * will either be the value specified in ip_g_frag_timeout, or zero if the
2361  * ILL is shutting down and it is time to blow everything off.
2362  *
2363  * It returns the number of seconds (as a time_t) that the next frag timer
2364  * should be scheduled for, 0 meaning that the timer doesn't need to be
2365  * re-started. Note that the method of calculating next_timeout isn't
2366  * entirely accurate since time will flow between the time we grab
2367  * current_time and the time we schedule the next timeout. This isn't a
2368  * big problem since this is the timer for sending ICMP reassembly time
2369  * exceeded messages, and it doesn't have to be exactly accurate.
2370  *
2371  * This function is sometimes called as writer, although this is not
2372  * required.
2373  */
2374 time_t
2375 ill_frag_timeout(ill_t *ill, time_t dead_interval)
2376 {
2377 	ipfb_t	*ipfb;
2378 	ipfb_t	*endp;
2379 	ipf_t	*ipf;
2380 	ipf_t	*ipfnext;
2381 	mblk_t	*mp;
2382 	time_t	current_time = gethrestime_sec();
2383 	time_t	next_timeout = 0;
2384 	uint32_t	hdr_length;
2385 	mblk_t	*send_icmp_head;
2386 	mblk_t	*send_icmp_head_v6;
2387 	ip_stack_t *ipst = ill->ill_ipst;
2388 	ip_recv_attr_t iras;
2389 
2390 	bzero(&iras, sizeof (iras));
2391 	iras.ira_flags = 0;
2392 	iras.ira_ill = iras.ira_rill = ill;
2393 	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
2394 	iras.ira_rifindex = iras.ira_ruifindex;
2395 
2396 	ipfb = ill->ill_frag_hash_tbl;
2397 	if (ipfb == NULL)
2398 		return (0);	/* no frag table; no timer needed */
2399 	endp = &ipfb[ILL_FRAG_HASH_TBL_COUNT];
2400 	/* Walk the frag hash table. */
2401 	for (; ipfb < endp; ipfb++) {
2402 		send_icmp_head = NULL;
2403 		send_icmp_head_v6 = NULL;
2404 		mutex_enter(&ipfb->ipfb_lock);
2405 		while ((ipf = ipfb->ipfb_ipf) != NULL) {
2406 			time_t frag_time = current_time - ipf->ipf_timestamp;
2407 			time_t frag_timeout;
2408 
2409 			if (frag_time < dead_interval) {
2410 				/*
2411 				 * There are some outstanding fragments
2412 				 * that will timeout later. Make note of
2413 				 * the time so that we can reschedule the
2414 				 * next timeout appropriately.
2415 				 */
2416 				frag_timeout = dead_interval - frag_time;
2417 				if (next_timeout == 0 ||
2418 				    frag_timeout < next_timeout) {
2419 					next_timeout = frag_timeout;
2420 				}
2421 				break;
2422 			}
2423 			/* Time's up.  Get it out of here. */
2424 			hdr_length = ipf->ipf_nf_hdr_len;
2425 			ipfnext = ipf->ipf_hash_next;
2426 			if (ipfnext)
2427 				ipfnext->ipf_ptphn = ipf->ipf_ptphn;
2428 			*ipf->ipf_ptphn = ipfnext;
2429 			mp = ipf->ipf_mp->b_cont;
2430 			for (; mp; mp = mp->b_cont) {
2431 				/* Extra points for neatness. */
2432 				IP_REASS_SET_START(mp, 0);
2433 				IP_REASS_SET_END(mp, 0);
2434 			}
2435 			mp = ipf->ipf_mp->b_cont;
2436 			atomic_add_32(&ill->ill_frag_count, -ipf->ipf_count);
2437 			ASSERT(ipfb->ipfb_count >= ipf->ipf_count);
2438 			ipfb->ipfb_count -= ipf->ipf_count;
2439 			ASSERT(ipfb->ipfb_frag_pkts > 0);
2440 			ipfb->ipfb_frag_pkts--;
2441 			/*
2442 			 * We do not send any icmp message from here because
2443 			 * we currently are holding the ipfb_lock for this
2444 			 * hash chain. If we try and send any icmp messages
2445 			 * from here we may end up via a put back into ip
2446 			 * trying to get the same lock, causing a recursive
2447 			 * mutex panic. Instead we build a list and send all
2448 			 * the icmp messages after we have dropped the lock.
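			 * (The b_next pointers of the queued mblks are
			 * borrowed to build these temporary lists; they are
			 * reset to NULL before the messages are sent.)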
2449 */ 2450 if (ill->ill_isv6) { 2451 if (hdr_length != 0) { 2452 mp->b_next = send_icmp_head_v6; 2453 send_icmp_head_v6 = mp; 2454 } else { 2455 freemsg(mp); 2456 } 2457 } else { 2458 if (hdr_length != 0) { 2459 mp->b_next = send_icmp_head; 2460 send_icmp_head = mp; 2461 } else { 2462 freemsg(mp); 2463 } 2464 } 2465 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); 2466 ip_drop_input("ipIfStatsReasmFails", ipf->ipf_mp, ill); 2467 freeb(ipf->ipf_mp); 2468 } 2469 mutex_exit(&ipfb->ipfb_lock); 2470 /* 2471 * Now need to send any icmp messages that we delayed from 2472 * above. 2473 */ 2474 while (send_icmp_head_v6 != NULL) { 2475 ip6_t *ip6h; 2476 2477 mp = send_icmp_head_v6; 2478 send_icmp_head_v6 = send_icmp_head_v6->b_next; 2479 mp->b_next = NULL; 2480 ip6h = (ip6_t *)mp->b_rptr; 2481 iras.ira_flags = 0; 2482 /* 2483 * This will result in an incorrect ALL_ZONES zoneid 2484 * for multicast packets, but we 2485 * don't send ICMP errors for those in any case. 2486 */ 2487 iras.ira_zoneid = 2488 ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst, 2489 ill, ipst); 2490 ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill); 2491 icmp_time_exceeded_v6(mp, 2492 ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE, 2493 &iras); 2494 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 2495 } 2496 while (send_icmp_head != NULL) { 2497 ipaddr_t dst; 2498 2499 mp = send_icmp_head; 2500 send_icmp_head = send_icmp_head->b_next; 2501 mp->b_next = NULL; 2502 2503 dst = ((ipha_t *)mp->b_rptr)->ipha_dst; 2504 2505 iras.ira_flags = IRAF_IS_IPV4; 2506 /* 2507 * This will result in an incorrect ALL_ZONES zoneid 2508 * for broadcast and multicast packets, but we 2509 * don't send ICMP errors for those in any case. 2510 */ 2511 iras.ira_zoneid = ipif_lookup_addr_zoneid(dst, 2512 ill, ipst); 2513 ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill); 2514 icmp_time_exceeded(mp, 2515 ICMP_REASSEMBLY_TIME_EXCEEDED, &iras); 2516 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 2517 } 2518 } 2519 /* 2520 * A non-dying ILL will use the return value to decide whether to 2521 * restart the frag timer, and for how long. 2522 */ 2523 return (next_timeout); 2524 } 2525 2526 /* 2527 * This routine is called when the approximate count of mblk memory used 2528 * for the specified ILL has exceeded max_count. 2529 */ 2530 void 2531 ill_frag_prune(ill_t *ill, uint_t max_count) 2532 { 2533 ipfb_t *ipfb; 2534 ipf_t *ipf; 2535 size_t count; 2536 clock_t now; 2537 2538 /* 2539 * If we are here within ip_min_frag_prune_time msecs remove 2540 * ill_frag_free_num_pkts oldest packets from each bucket and increment 2541 * ill_frag_free_num_pkts. 2542 */ 2543 mutex_enter(&ill->ill_lock); 2544 now = ddi_get_lbolt(); 2545 if (TICK_TO_MSEC(now - ill->ill_last_frag_clean_time) <= 2546 (ip_min_frag_prune_time != 0 ? 2547 ip_min_frag_prune_time : msec_per_tick)) { 2548 2549 ill->ill_frag_free_num_pkts++; 2550 2551 } else { 2552 ill->ill_frag_free_num_pkts = 0; 2553 } 2554 ill->ill_last_frag_clean_time = now; 2555 mutex_exit(&ill->ill_lock); 2556 2557 /* 2558 * free ill_frag_free_num_pkts oldest packets from each bucket. 
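	 * Since ill_frag_free_num_pkts keeps growing while prunes arrive
	 * within ip_min_frag_prune_time of each other, a sustained fragment
	 * flood is cut back progressively harder.  ill_frag_free_pkts()
	 * expects the caller to hold the bucket's ipfb_lock.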
2559 */ 2560 if (ill->ill_frag_free_num_pkts != 0) { 2561 int ix; 2562 2563 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 2564 ipfb = &ill->ill_frag_hash_tbl[ix]; 2565 mutex_enter(&ipfb->ipfb_lock); 2566 if (ipfb->ipfb_ipf != NULL) { 2567 ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 2568 ill->ill_frag_free_num_pkts); 2569 } 2570 mutex_exit(&ipfb->ipfb_lock); 2571 } 2572 } 2573 /* 2574 * While the reassembly list for this ILL is too big, prune a fragment 2575 * queue by age, oldest first. 2576 */ 2577 while (ill->ill_frag_count > max_count) { 2578 int ix; 2579 ipfb_t *oipfb = NULL; 2580 uint_t oldest = UINT_MAX; 2581 2582 count = 0; 2583 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 2584 ipfb = &ill->ill_frag_hash_tbl[ix]; 2585 mutex_enter(&ipfb->ipfb_lock); 2586 ipf = ipfb->ipfb_ipf; 2587 if (ipf != NULL && ipf->ipf_gen < oldest) { 2588 oldest = ipf->ipf_gen; 2589 oipfb = ipfb; 2590 } 2591 count += ipfb->ipfb_count; 2592 mutex_exit(&ipfb->ipfb_lock); 2593 } 2594 if (oipfb == NULL) 2595 break; 2596 2597 if (count <= max_count) 2598 return; /* Somebody beat us to it, nothing to do */ 2599 mutex_enter(&oipfb->ipfb_lock); 2600 ipf = oipfb->ipfb_ipf; 2601 if (ipf != NULL) { 2602 ill_frag_free_pkts(ill, oipfb, ipf, 1); 2603 } 2604 mutex_exit(&oipfb->ipfb_lock); 2605 } 2606 } 2607 2608 /* 2609 * free 'free_cnt' fragmented packets starting at ipf. 2610 */ 2611 void 2612 ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt) 2613 { 2614 size_t count; 2615 mblk_t *mp; 2616 mblk_t *tmp; 2617 ipf_t **ipfp = ipf->ipf_ptphn; 2618 2619 ASSERT(MUTEX_HELD(&ipfb->ipfb_lock)); 2620 ASSERT(ipfp != NULL); 2621 ASSERT(ipf != NULL); 2622 2623 while (ipf != NULL && free_cnt-- > 0) { 2624 count = ipf->ipf_count; 2625 mp = ipf->ipf_mp; 2626 ipf = ipf->ipf_hash_next; 2627 for (tmp = mp; tmp; tmp = tmp->b_cont) { 2628 IP_REASS_SET_START(tmp, 0); 2629 IP_REASS_SET_END(tmp, 0); 2630 } 2631 atomic_add_32(&ill->ill_frag_count, -count); 2632 ASSERT(ipfb->ipfb_count >= count); 2633 ipfb->ipfb_count -= count; 2634 ASSERT(ipfb->ipfb_frag_pkts > 0); 2635 ipfb->ipfb_frag_pkts--; 2636 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); 2637 ip_drop_input("ipIfStatsReasmFails", mp, ill); 2638 freemsg(mp); 2639 } 2640 2641 if (ipf) 2642 ipf->ipf_ptphn = ipfp; 2643 ipfp[0] = ipf; 2644 } 2645 2646 /* 2647 * Helper function for ill_forward_set(). 2648 */ 2649 static void 2650 ill_forward_set_on_ill(ill_t *ill, boolean_t enable) 2651 { 2652 ip_stack_t *ipst = ill->ill_ipst; 2653 2654 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock)); 2655 2656 ip1dbg(("ill_forward_set: %s %s forwarding on %s", 2657 (enable ? "Enabling" : "Disabling"), 2658 (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name)); 2659 mutex_enter(&ill->ill_lock); 2660 if (enable) 2661 ill->ill_flags |= ILLF_ROUTER; 2662 else 2663 ill->ill_flags &= ~ILLF_ROUTER; 2664 mutex_exit(&ill->ill_lock); 2665 if (ill->ill_isv6) 2666 ill_set_nce_router_flags(ill, enable); 2667 /* Notify routing socket listeners of this change. */ 2668 if (ill->ill_ipif != NULL) 2669 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); 2670 } 2671 2672 /* 2673 * Set an ill's ILLF_ROUTER flag appropriately. Send up RTS_IFINFO routing 2674 * socket messages for each interface whose flags we change. 
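 * For IPMP, the flag is kept consistent across the whole group: every ill
 * in the illgrp is updated, as is the IPMP meta-interface itself.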
2675  */
2676 int
2677 ill_forward_set(ill_t *ill, boolean_t enable)
2678 {
2679 	ipmp_illgrp_t *illg;
2680 	ip_stack_t *ipst = ill->ill_ipst;
2681 
2682 	ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock));
2683 
2684 	if ((enable && (ill->ill_flags & ILLF_ROUTER)) ||
2685 	    (!enable && !(ill->ill_flags & ILLF_ROUTER)))
2686 		return (0);
2687 
2688 	if (IS_LOOPBACK(ill))
2689 		return (EINVAL);
2690 
2691 	if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) {
2692 		/*
2693 		 * Update all of the interfaces in the group.
2694 		 */
2695 		illg = ill->ill_grp;
2696 		ill = list_head(&illg->ig_if);
2697 		for (; ill != NULL; ill = list_next(&illg->ig_if, ill))
2698 			ill_forward_set_on_ill(ill, enable);
2699 
2700 		/*
2701 		 * Update the IPMP meta-interface.
2702 		 */
2703 		ill_forward_set_on_ill(ipmp_illgrp_ipmp_ill(illg), enable);
2704 		return (0);
2705 	}
2706 
2707 	ill_forward_set_on_ill(ill, enable);
2708 	return (0);
2709 }
2710 
2711 /*
2712  * Based on the ILLF_ROUTER flag of an ill, make sure all local nce's for
2713  * addresses assigned to the ill have the NCE_F_ISROUTER flag appropriately
2714  * set or clear.
2715  */
2716 static void
2717 ill_set_nce_router_flags(ill_t *ill, boolean_t enable)
2718 {
2719 	ipif_t *ipif;
2720 	ncec_t *ncec;
2721 	nce_t *nce;
2722 
2723 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
2724 		/*
2725 		 * NOTE: we match across the illgrp because nce's for
2726 		 * addresses on IPMP interfaces have an nce_ill that points to
2727 		 * the bound underlying ill.
2728 		 */
2729 		nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr);
2730 		if (nce != NULL) {
2731 			ncec = nce->nce_common;
2732 			mutex_enter(&ncec->ncec_lock);
2733 			if (enable)
2734 				ncec->ncec_flags |= NCE_F_ISROUTER;
2735 			else
2736 				ncec->ncec_flags &= ~NCE_F_ISROUTER;
2737 			mutex_exit(&ncec->ncec_lock);
2738 			nce_refrele(nce);
2739 		}
2740 	}
2741 }
2742 
2743 /*
2744  * Initializes the context structure and returns the first ill in the list.
2745  * Currently start_list and end_list can have the following values:
2746  * MAX_G_HEADS		Traverse both IPV4 and IPV6 lists.
2747  * IP_V4_G_HEAD		Traverse IPV4 list only.
2748  * IP_V6_G_HEAD		Traverse IPV6 list only.
2749  */
2750 
2751 /*
2752  * We don't check for CONDEMNED ills here. Caller must do that if
2753  * necessary under the ill lock.
2754  */
2755 ill_t *
2756 ill_first(int start_list, int end_list, ill_walk_context_t *ctx,
2757     ip_stack_t *ipst)
2758 {
2759 	ill_if_t *ifp;
2760 	ill_t *ill;
2761 	avl_tree_t *avl_tree;
2762 
2763 	ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
2764 	ASSERT(end_list <= MAX_G_HEADS && start_list >= 0);
2765 
2766 	/*
2767 	 * Set up the lists to search.
2768 	 */
2769 	if (end_list != MAX_G_HEADS) {
2770 		ctx->ctx_current_list = start_list;
2771 		ctx->ctx_last_list = end_list;
2772 	} else {
2773 		ctx->ctx_last_list = MAX_G_HEADS - 1;
2774 		ctx->ctx_current_list = 0;
2775 	}
2776 
2777 	while (ctx->ctx_current_list <= ctx->ctx_last_list) {
2778 		ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst);
2779 		if (ifp != (ill_if_t *)
2780 		    &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) {
2781 			avl_tree = &ifp->illif_avl_by_ppa;
2782 			ill = avl_first(avl_tree);
2783 			/*
2784 			 * ill is guaranteed to be non-NULL, or else ifp
2785 			 * would not exist.
2786 			 */
2787 			ASSERT(ill != NULL);
2788 			return (ill);
2789 		}
2790 		ctx->ctx_current_list++;
2791 	}
2792 
2793 	return (NULL);
2794 }
2795 
2796 /*
2797  * Returns the next ill in the list. ill_first() must have been called
2798  * before calling ill_next() or bad things will happen.
2799  */
2800 
2801 /*
2802  * We don't check for CONDEMNED ills here. Caller must do that if
2803  * necessary under the ill lock.
2804  */
2805 ill_t *
2806 ill_next(ill_walk_context_t *ctx, ill_t *lastill)
2807 {
2808 	ill_if_t *ifp;
2809 	ill_t *ill;
2810 	ip_stack_t *ipst = lastill->ill_ipst;
2811 
2812 	ASSERT(lastill->ill_ifptr != (ill_if_t *)
2813 	    &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst));
2814 	if ((ill = avl_walk(&lastill->ill_ifptr->illif_avl_by_ppa, lastill,
2815 	    AVL_AFTER)) != NULL) {
2816 		return (ill);
2817 	}
2818 
2819 	/* goto next ill_ifp in the list. */
2820 	ifp = lastill->ill_ifptr->illif_next;
2821 
2822 	/* make sure not at end of circular list */
2823 	while (ifp ==
2824 	    (ill_if_t *)&IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) {
2825 		if (++ctx->ctx_current_list > ctx->ctx_last_list)
2826 			return (NULL);
2827 		ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst);
2828 	}
2829 
2830 	return (avl_first(&ifp->illif_avl_by_ppa));
2831 }
2832 
2833 /*
2834  * Check interface name for correct format: [a-zA-Z]+[a-zA-Z0-9._]*[0-9]+
2835  * The final number (PPA) must not have any leading zeros. Upon success, a
2836  * pointer to the start of the PPA is returned; otherwise NULL is returned.
2837  */
2838 static char *
2839 ill_get_ppa_ptr(char *name)
2840 {
2841 	int namelen = strlen(name);
2842 	int end_ndx = namelen - 1;
2843 	int ppa_ndx, i;
2844 
2845 	/*
2846 	 * Check that the first character is [a-zA-Z], and that the last
2847 	 * character is [0-9].
2848 	 */
2849 	if (namelen == 0 || !isalpha(name[0]) || !isdigit(name[end_ndx]))
2850 		return (NULL);
2851 
2852 	/*
2853 	 * Set `ppa_ndx' to the PPA start, and check for leading zeroes.
2854 	 */
2855 	for (ppa_ndx = end_ndx; ppa_ndx > 0; ppa_ndx--)
2856 		if (!isdigit(name[ppa_ndx - 1]))
2857 			break;
2858 
2859 	if (name[ppa_ndx] == '0' && ppa_ndx < end_ndx)
2860 		return (NULL);
2861 
2862 	/*
2863 	 * Check that the intermediate characters are [a-zA-Z0-9._]
2864 	 */
2865 	for (i = 1; i < ppa_ndx; i++) {
2866 		if (!isalpha(name[i]) && !isdigit(name[i]) &&
2867 		    name[i] != '.' && name[i] != '_') {
2868 			return (NULL);
2869 		}
2870 	}
2871 
2872 	return (name + ppa_ndx);
2873 }
2874 
2875 /*
2876  * Use the avl tree to locate the ill.
2877  */
2878 static ill_t *
2879 ill_find_by_name(char *name, boolean_t isv6, ip_stack_t *ipst)
2880 {
2881 	char *ppa_ptr = NULL;
2882 	int len;
2883 	uint_t ppa;
2884 	ill_t *ill = NULL;
2885 	ill_if_t *ifp;
2886 	int list;
2887 
2888 	/*
2889 	 * get ppa ptr
2890 	 */
2891 	if (isv6)
2892 		list = IP_V6_G_HEAD;
2893 	else
2894 		list = IP_V4_G_HEAD;
2895 
2896 	if ((ppa_ptr = ill_get_ppa_ptr(name)) == NULL) {
2897 		return (NULL);
2898 	}
2899 
2900 	len = ppa_ptr - name + 1;
2901 
2902 	ppa = stoi(&ppa_ptr);
2903 
2904 	ifp = IP_VX_ILL_G_LIST(list, ipst);
2905 
2906 	while (ifp != (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) {
2907 		/*
2908 		 * The match is done on len - 1 because the name is not
2909 		 * null-terminated; it contains the ppa in addition to the
2910 		 * interface name.
2911 		 */
2912 		if ((ifp->illif_name_len == len) &&
2913 		    bcmp(ifp->illif_name, name, len - 1) == 0) {
2914 			break;
2915 		} else {
2916 			ifp = ifp->illif_next;
2917 		}
2918 	}
2919 
2920 	if (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) {
2921 		/*
2922 		 * The interface type itself does not exist.
2923 		 */
2924 		return (NULL);
2925 	}
2926 
2927 	ill = avl_find(&ifp->illif_avl_by_ppa, (void *) &ppa, NULL);
2928 	if (ill != NULL) {
2929 		mutex_enter(&ill->ill_lock);
2930 		if (ILL_CAN_LOOKUP(ill)) {
2931 			ill_refhold_locked(ill);
2932 			mutex_exit(&ill->ill_lock);
2933 			return (ill);
2934 		}
2935 		mutex_exit(&ill->ill_lock);
2936 	}
2937 	return (NULL);
2938 }
2939 
2940 /*
2941  * Comparison function for use with avl.
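 * (The tree is kept in ascending ppa order, so avl_first() returns the
 * ill with the lowest ppa.)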
2942  */
2943 static int
2944 ill_compare_ppa(const void *ppa_ptr, const void *ill_ptr)
2945 {
2946 	uint_t ppa;
2947 	uint_t ill_ppa;
2948 
2949 	ASSERT(ppa_ptr != NULL && ill_ptr != NULL);
2950 
2951 	ppa = *((uint_t *)ppa_ptr);
2952 	ill_ppa = ((const ill_t *)ill_ptr)->ill_ppa;
2953 	/*
2954 	 * We want the ill with the lowest ppa to be on the
2955 	 * top.
2956 	 */
2957 	if (ill_ppa < ppa)
2958 		return (1);
2959 	if (ill_ppa > ppa)
2960 		return (-1);
2961 	return (0);
2962 }
2963 
2964 /*
2965  * Remove an interface type from the global list.
2966  */
2967 static void
2968 ill_delete_interface_type(ill_if_t *interface)
2969 {
2970 	ASSERT(interface != NULL);
2971 	ASSERT(avl_numnodes(&interface->illif_avl_by_ppa) == 0);
2972 
2973 	avl_destroy(&interface->illif_avl_by_ppa);
2974 	if (interface->illif_ppa_arena != NULL)
2975 		vmem_destroy(interface->illif_ppa_arena);
2976 
2977 	remque(interface);
2978 
2979 	mi_free(interface);
2980 }
2981 
2982 /*
2983  * Remove the ill from the global list.
2984  */
2985 static void
2986 ill_glist_delete(ill_t *ill)
2987 {
2988 	ip_stack_t	*ipst;
2989 	phyint_t	*phyi;
2990 
2991 	if (ill == NULL)
2992 		return;
2993 	ipst = ill->ill_ipst;
2994 	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
2995 
2996 	/*
2997 	 * If the ill was never inserted into the AVL tree
2998 	 * we skip the if branch.
2999 	 */
3000 	if (ill->ill_ifptr != NULL) {
3001 		/*
3002 		 * remove from AVL tree and free ppa number
3003 		 */
3004 		avl_remove(&ill->ill_ifptr->illif_avl_by_ppa, ill);
3005 
3006 		if (ill->ill_ifptr->illif_ppa_arena != NULL) {
3007 			vmem_free(ill->ill_ifptr->illif_ppa_arena,
3008 			    (void *)(uintptr_t)(ill->ill_ppa+1), 1);
3009 		}
3010 		if (avl_numnodes(&ill->ill_ifptr->illif_avl_by_ppa) == 0) {
3011 			ill_delete_interface_type(ill->ill_ifptr);
3012 		}
3013 
3014 		/*
3015 		 * Indicate ill is no longer in the list.
3016 		 */
3017 		ill->ill_ifptr = NULL;
3018 		ill->ill_name_length = 0;
3019 		ill->ill_name[0] = '\0';
3020 		ill->ill_ppa = UINT_MAX;
3021 	}
3022 
3023 	/* Generate one last event for this ill. */
3024 	ill_nic_event_dispatch(ill, 0, NE_UNPLUMB, ill->ill_name,
3025 	    ill->ill_name_length);
3026 
3027 	ASSERT(ill->ill_phyint != NULL);
3028 	phyi = ill->ill_phyint;
3029 	ill->ill_phyint = NULL;
3030 
3031 	/*
3032 	 * ill_init always allocates a phyint to store the copy of flags
3033 	 * relevant to the phyint. At that point in time, we could not
3034 	 * assign the name and hence phyint_illv4/v6 could not be
3035 	 * initialized. Later in ipif_set_values, we assign the name to
3036 	 * the ill, at which point in time we assign phyint_illv4/v6.
3037 	 * Thus we can't rely on phyint_illv6 always being initialized.
3038 	 */
3039 	if (ill->ill_flags & ILLF_IPV6)
3040 		phyi->phyint_illv6 = NULL;
3041 	else
3042 		phyi->phyint_illv4 = NULL;
3043 
3044 	if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) {
3045 		rw_exit(&ipst->ips_ill_g_lock);
3046 		return;
3047 	}
3048 
3049 	/*
3050 	 * There are no ills left on this phyint; pull it out of the phyint
3051 	 * avl trees, and free it.
3052 	 */
3053 	if (phyi->phyint_ifindex > 0) {
3054 		avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
3055 		    phyi);
3056 		avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
3057 		    phyi);
3058 	}
3059 	rw_exit(&ipst->ips_ill_g_lock);
3060 
3061 	phyint_free(phyi);
3062 }
3063 
3064 /*
3065  * Allocate a ppa. If the number of plumbed interfaces of this type is
3066  * less than ill_no_arena, do a linear search to find an unused ppa.
3067  * When the number goes beyond ill_no_arena, switch to using an arena.
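 * For example: once more than ill_no_arena ills of a given interface type
 * are plumbed, a vmem arena is created and seeded with every ppa already
 * assigned, after which further ppas are allocated from the arena.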
3068  * Note: ppa value of zero cannot be allocated from vmem_arena as it
3069  * is the return value for an error condition, so allocation starts at one
3070  * and is decremented by one.
3071  */
3072 static int
3073 ill_alloc_ppa(ill_if_t *ifp, ill_t *ill)
3074 {
3075 	ill_t *tmp_ill;
3076 	uint_t start, end;
3077 	int ppa;
3078 
3079 	if (ifp->illif_ppa_arena == NULL &&
3080 	    (avl_numnodes(&ifp->illif_avl_by_ppa) + 1 > ill_no_arena)) {
3081 		/*
3082 		 * Create an arena.
3083 		 */
3084 		ifp->illif_ppa_arena = vmem_create(ifp->illif_name,
3085 		    (void *)1, UINT_MAX - 1, 1, NULL, NULL,
3086 		    NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
3087 		/* allocate what has already been assigned */
3088 		for (tmp_ill = avl_first(&ifp->illif_avl_by_ppa);
3089 		    tmp_ill != NULL; tmp_ill = avl_walk(&ifp->illif_avl_by_ppa,
3090 		    tmp_ill, AVL_AFTER)) {
3091 			ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena,
3092 			    1,		/* size */
3093 			    1,		/* align/quantum */
3094 			    0,		/* phase */
3095 			    0,		/* nocross */
3096 			    /* minaddr */
3097 			    (void *)((uintptr_t)tmp_ill->ill_ppa + 1),
3098 			    /* maxaddr */
3099 			    (void *)((uintptr_t)tmp_ill->ill_ppa + 2),
3100 			    VM_NOSLEEP|VM_FIRSTFIT);
3101 			if (ppa == 0) {
3102 				ip1dbg(("ill_alloc_ppa: ppa allocation"
3103 				    " failed while switching"));
3104 				vmem_destroy(ifp->illif_ppa_arena);
3105 				ifp->illif_ppa_arena = NULL;
3106 				break;
3107 			}
3108 		}
3109 	}
3110 
3111 	if (ifp->illif_ppa_arena != NULL) {
3112 		if (ill->ill_ppa == UINT_MAX) {
3113 			ppa = (int)(uintptr_t)vmem_alloc(ifp->illif_ppa_arena,
3114 			    1, VM_NOSLEEP|VM_FIRSTFIT);
3115 			if (ppa == 0)
3116 				return (EAGAIN);
3117 			ill->ill_ppa = --ppa;
3118 		} else {
3119 			ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena,
3120 			    1,		/* size */
3121 			    1,		/* align/quantum */
3122 			    0,		/* phase */
3123 			    0,		/* nocross */
3124 			    (void *)(uintptr_t)(ill->ill_ppa + 1), /* minaddr */
3125 			    (void *)(uintptr_t)(ill->ill_ppa + 2), /* maxaddr */
3126 			    VM_NOSLEEP|VM_FIRSTFIT);
3127 			/*
3128 			 * Most likely the allocation failed because
3129 			 * the requested ppa was in use.
3130 			 */
3131 			if (ppa == 0)
3132 				return (EEXIST);
3133 		}
3134 		return (0);
3135 	}
3136 
3137 	/*
3138 	 * No arena is in use and not enough (>ill_no_arena) interfaces have
3139 	 * been plumbed to create one. Do a linear search to get an unused ppa.
3140 	 */
3141 	if (ill->ill_ppa == UINT_MAX) {
3142 		end = UINT_MAX - 1;
3143 		start = 0;
3144 	} else {
3145 		end = start = ill->ill_ppa;
3146 	}
3147 
3148 	tmp_ill = avl_find(&ifp->illif_avl_by_ppa, (void *)&start, NULL);
3149 	while (tmp_ill != NULL && tmp_ill->ill_ppa == start) {
3150 		if (start++ >= end) {
3151 			if (ill->ill_ppa == UINT_MAX)
3152 				return (EAGAIN);
3153 			else
3154 				return (EEXIST);
3155 		}
3156 		tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, tmp_ill, AVL_AFTER);
3157 	}
3158 	ill->ill_ppa = start;
3159 	return (0);
3160 }
3161 
3162 /*
3163  * Insert the ill into the list of configured ills. Once this function
3164  * completes, the ill is globally visible and is available through lookups.
3165  * More precisely, this happens after the caller drops the ill_g_lock.
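 * The caller must hold ill_g_lock as writer; this is asserted on entry.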
3166  */
3167 static int
3168 ill_glist_insert(ill_t *ill, char *name, boolean_t isv6)
3169 {
3170 	ill_if_t *ill_interface;
3171 	avl_index_t where = 0;
3172 	int error;
3173 	int name_length;
3174 	int index;
3175 	boolean_t check_length = B_FALSE;
3176 	ip_stack_t	*ipst = ill->ill_ipst;
3177 
3178 	ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
3179 
3180 	name_length = mi_strlen(name) + 1;
3181 
3182 	if (isv6)
3183 		index = IP_V6_G_HEAD;
3184 	else
3185 		index = IP_V4_G_HEAD;
3186 
3187 	ill_interface = IP_VX_ILL_G_LIST(index, ipst);
3188 	/*
3189 	 * Search for interface type based on name
3190 	 */
3191 	while (ill_interface != (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) {
3192 		if ((ill_interface->illif_name_len == name_length) &&
3193 		    (strcmp(ill_interface->illif_name, name) == 0)) {
3194 			break;
3195 		}
3196 		ill_interface = ill_interface->illif_next;
3197 	}
3198 
3199 	/*
3200 	 * Interface type not found, create one.
3201 	 */
3202 	if (ill_interface == (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) {
3203 		ill_g_head_t ghead;
3204 
3205 		/*
3206 		 * allocate ill_if_t structure
3207 		 */
3208 		ill_interface = (ill_if_t *)mi_zalloc(sizeof (ill_if_t));
3209 		if (ill_interface == NULL) {
3210 			return (ENOMEM);
3211 		}
3212 
3213 		(void) strcpy(ill_interface->illif_name, name);
3214 		ill_interface->illif_name_len = name_length;
3215 
3216 		avl_create(&ill_interface->illif_avl_by_ppa,
3217 		    ill_compare_ppa, sizeof (ill_t),
3218 		    offsetof(struct ill_s, ill_avl_byppa));
3219 
3220 		/*
3221 		 * Link the structure at the back to maintain the order of
3222 		 * configuration for ifconfig output.
3223 		 */
3224 		ghead = ipst->ips_ill_g_heads[index];
3225 		insque(ill_interface, ghead.ill_g_list_tail);
3226 	}
3227 
3228 	if (ill->ill_ppa == UINT_MAX)
3229 		check_length = B_TRUE;
3230 
3231 	error = ill_alloc_ppa(ill_interface, ill);
3232 	if (error != 0) {
3233 		if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0)
3234 			ill_delete_interface_type(ill->ill_ifptr);
3235 		return (error);
3236 	}
3237 
3238 	/*
3239 	 * When the ppa is chosen by the system, check that there is
3240 	 * enough space to insert the ppa. If a specific ppa was passed in,
3241 	 * this check is not required as the interface name passed in will
3242 	 * have the right ppa in it.
3243 	 */
3244 	if (check_length) {
3245 		/*
3246 		 * UINT_MAX - 1 should fit in 10 chars, alloc 12 chars.
3247 		 */
3248 		char buf[sizeof (uint_t) * 3];
3249 
3250 		/*
3251 		 * Convert the ppa to a string to calculate the amount of
3252 		 * space required for it in the name.
3253 		 */
3254 		numtos(ill->ill_ppa, buf);
3255 
3256 		/* Do we have enough space to insert the ppa?
*/ 3257 3258 if ((mi_strlen(name) + mi_strlen(buf) + 1) > LIFNAMSIZ) { 3259 /* Free ppa and interface type struct */ 3260 if (ill_interface->illif_ppa_arena != NULL) { 3261 vmem_free(ill_interface->illif_ppa_arena, 3262 (void *)(uintptr_t)(ill->ill_ppa+1), 1); 3263 } 3264 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0) 3265 ill_delete_interface_type(ill->ill_ifptr); 3266 3267 return (EINVAL); 3268 } 3269 } 3270 3271 (void) sprintf(ill->ill_name, "%s%u", name, ill->ill_ppa); 3272 ill->ill_name_length = mi_strlen(ill->ill_name) + 1; 3273 3274 (void) avl_find(&ill_interface->illif_avl_by_ppa, &ill->ill_ppa, 3275 &where); 3276 ill->ill_ifptr = ill_interface; 3277 avl_insert(&ill_interface->illif_avl_by_ppa, ill, where); 3278 3279 ill_phyint_reinit(ill); 3280 return (0); 3281 } 3282 3283 /* Initialize the per phyint ipsq used for serialization */ 3284 static boolean_t 3285 ipsq_init(ill_t *ill, boolean_t enter) 3286 { 3287 ipsq_t *ipsq; 3288 ipxop_t *ipx; 3289 3290 if ((ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP)) == NULL) 3291 return (B_FALSE); 3292 3293 ill->ill_phyint->phyint_ipsq = ipsq; 3294 ipx = ipsq->ipsq_xop = &ipsq->ipsq_ownxop; 3295 ipx->ipx_ipsq = ipsq; 3296 ipsq->ipsq_next = ipsq; 3297 ipsq->ipsq_phyint = ill->ill_phyint; 3298 mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, 0); 3299 mutex_init(&ipx->ipx_lock, NULL, MUTEX_DEFAULT, 0); 3300 ipsq->ipsq_ipst = ill->ill_ipst; /* No netstack_hold */ 3301 if (enter) { 3302 ipx->ipx_writer = curthread; 3303 ipx->ipx_forced = B_FALSE; 3304 ipx->ipx_reentry_cnt = 1; 3305 #ifdef DEBUG 3306 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); 3307 #endif 3308 } 3309 return (B_TRUE); 3310 } 3311 3312 /* 3313 * ill_init is called by ip_open when a device control stream is opened. 3314 * It does a few initializations, and shoots a DL_INFO_REQ message down 3315 * to the driver. The response is later picked up in ip_rput_dlpi and 3316 * used to set up default mechanisms for talking to the driver. (Always 3317 * called as writer.) 3318 * 3319 * If this function returns error, ip_open will call ip_close which in 3320 * turn will call ill_delete to clean up any memory allocated here that 3321 * is not yet freed. 3322 */ 3323 int 3324 ill_init(queue_t *q, ill_t *ill) 3325 { 3326 int count; 3327 dl_info_req_t *dlir; 3328 mblk_t *info_mp; 3329 uchar_t *frag_ptr; 3330 3331 /* 3332 * The ill is initialized to zero by mi_alloc*(). In addition 3333 * some fields already contain valid values, initialized in 3334 * ip_open(), before we reach here. 3335 */ 3336 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, 0); 3337 mutex_init(&ill->ill_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL); 3338 ill->ill_saved_ire_cnt = 0; 3339 3340 ill->ill_rq = q; 3341 ill->ill_wq = WR(q); 3342 3343 info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)), 3344 BPRI_HI); 3345 if (info_mp == NULL) 3346 return (ENOMEM); 3347 3348 /* 3349 * Allocate sufficient space to contain our fragment hash table and 3350 * the device name. 
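	 * A single allocation serves both: the first ILL_FRAG_HASH_TBL_SIZE
	 * bytes become ill_frag_hash_tbl and the bytes after that hold
	 * ill_name.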
3351  */
3352 	frag_ptr = (uchar_t *)mi_zalloc(ILL_FRAG_HASH_TBL_SIZE + 2 * LIFNAMSIZ);
3353 	if (frag_ptr == NULL) {
3354 		freemsg(info_mp);
3355 		return (ENOMEM);
3356 	}
3357 	ill->ill_frag_ptr = frag_ptr;
3358 	ill->ill_frag_free_num_pkts = 0;
3359 	ill->ill_last_frag_clean_time = 0;
3360 	ill->ill_frag_hash_tbl = (ipfb_t *)frag_ptr;
3361 	ill->ill_name = (char *)(frag_ptr + ILL_FRAG_HASH_TBL_SIZE);
3362 	for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
3363 		mutex_init(&ill->ill_frag_hash_tbl[count].ipfb_lock,
3364 		    NULL, MUTEX_DEFAULT, NULL);
3365 	}
3366 
3367 	ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t));
3368 	if (ill->ill_phyint == NULL) {
3369 		freemsg(info_mp);
3370 		mi_free(frag_ptr);
3371 		return (ENOMEM);
3372 	}
3373 
3374 	mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0);
3375 	/*
3376 	 * For now pretend this is a v4 ill. We need to set phyint_ill*
3377 	 * at this point because of the following reason: if we can't
3378 	 * enter the ipsq at some point and cv_wait, the writer that
3379 	 * wakes us up tries to locate us using the list of all phyints
3380 	 * in an ipsq and the ills from the phyint through the phyint_ill*.
3381 	 * If we don't set it now, we risk a missed wakeup.
3382 	 */
3383 	ill->ill_phyint->phyint_illv4 = ill;
3384 	ill->ill_ppa = UINT_MAX;
3385 	list_create(&ill->ill_nce, sizeof (nce_t), offsetof(nce_t, nce_node));
3386 
3387 	ill_set_inputfn(ill);
3388 
3389 	if (!ipsq_init(ill, B_TRUE)) {
3390 		freemsg(info_mp);
3391 		mi_free(frag_ptr);
3392 		mi_free(ill->ill_phyint);
3393 		return (ENOMEM);
3394 	}
3395 
3396 	ill->ill_state_flags |= ILL_LL_SUBNET_PENDING;
3397 
3398 	/* Frag queue limit stuff */
3399 	ill->ill_frag_count = 0;
3400 	ill->ill_ipf_gen = 0;
3401 
3402 	rw_init(&ill->ill_mcast_lock, NULL, RW_DEFAULT, NULL);
3403 	mutex_init(&ill->ill_mcast_serializer, NULL, MUTEX_DEFAULT, NULL);
3404 	ill->ill_global_timer = INFINITY;
3405 	ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0;
3406 	ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0;
3407 	ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
3408 	ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL;
3409 
3410 	/*
3411 	 * Initialize IPv6 configuration variables. The IP module is always
3412 	 * opened as an IPv4 module. Instead of tracking down the cases where
3413 	 * it switches to do IPv6, we'll just initialize the IPv6 configuration
3414 	 * here for convenience; this has no effect until the ill is set to do
3415 	 * IPv6.
3416 	 */
3417 	ill->ill_reachable_time = ND_REACHABLE_TIME;
3418 	ill->ill_xmit_count = ND_MAX_MULTICAST_SOLICIT;
3419 	ill->ill_max_buf = ND_MAX_Q;
3420 	ill->ill_refcnt = 0;
3421 
3422 	/* Send down the Info Request to the driver. */
3423 	info_mp->b_datap->db_type = M_PCPROTO;
3424 	dlir = (dl_info_req_t *)info_mp->b_rptr;
3425 	info_mp->b_wptr = (uchar_t *)&dlir[1];
3426 	dlir->dl_primitive = DL_INFO_REQ;
3427 
3428 	ill->ill_dlpi_pending = DL_PRIM_INVAL;
3429 
3430 	qprocson(q);
3431 	ill_dlpi_send(ill, info_mp);
3432 
3433 	return (0);
3434 }
3435 
3436 /*
3437  * ill_dls_info
3438  * Creates datalink socket info from the device.
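 * The (unterminated) interface name is written at the start of sdl_data,
 * and the physical address, when present, is appended immediately after
 * it, following the usual sockaddr_dl layout.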
3439 */ 3440 int 3441 ill_dls_info(struct sockaddr_dl *sdl, const ill_t *ill) 3442 { 3443 size_t len; 3444 3445 sdl->sdl_family = AF_LINK; 3446 sdl->sdl_index = ill_get_upper_ifindex(ill); 3447 sdl->sdl_type = ill->ill_type; 3448 ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data)); 3449 len = strlen(sdl->sdl_data); 3450 ASSERT(len < 256); 3451 sdl->sdl_nlen = (uchar_t)len; 3452 sdl->sdl_alen = ill->ill_phys_addr_length; 3453 sdl->sdl_slen = 0; 3454 if (ill->ill_phys_addr_length != 0 && ill->ill_phys_addr != NULL) 3455 bcopy(ill->ill_phys_addr, &sdl->sdl_data[len], sdl->sdl_alen); 3456 3457 return (sizeof (struct sockaddr_dl)); 3458 } 3459 3460 /* 3461 * ill_xarp_info 3462 * creates xarp info from the device. 3463 */ 3464 static int 3465 ill_xarp_info(struct sockaddr_dl *sdl, ill_t *ill) 3466 { 3467 sdl->sdl_family = AF_LINK; 3468 sdl->sdl_index = ill->ill_phyint->phyint_ifindex; 3469 sdl->sdl_type = ill->ill_type; 3470 ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data)); 3471 sdl->sdl_nlen = (uchar_t)mi_strlen(sdl->sdl_data); 3472 sdl->sdl_alen = ill->ill_phys_addr_length; 3473 sdl->sdl_slen = 0; 3474 return (sdl->sdl_nlen); 3475 } 3476 3477 static int 3478 loopback_kstat_update(kstat_t *ksp, int rw) 3479 { 3480 kstat_named_t *kn; 3481 netstackid_t stackid; 3482 netstack_t *ns; 3483 ip_stack_t *ipst; 3484 3485 if (ksp == NULL || ksp->ks_data == NULL) 3486 return (EIO); 3487 3488 if (rw == KSTAT_WRITE) 3489 return (EACCES); 3490 3491 kn = KSTAT_NAMED_PTR(ksp); 3492 stackid = (zoneid_t)(uintptr_t)ksp->ks_private; 3493 3494 ns = netstack_find_by_stackid(stackid); 3495 if (ns == NULL) 3496 return (-1); 3497 3498 ipst = ns->netstack_ip; 3499 if (ipst == NULL) { 3500 netstack_rele(ns); 3501 return (-1); 3502 } 3503 kn[0].value.ui32 = ipst->ips_loopback_packets; 3504 kn[1].value.ui32 = ipst->ips_loopback_packets; 3505 netstack_rele(ns); 3506 return (0); 3507 } 3508 3509 /* 3510 * Has ifindex been plumbed already? 3511 */ 3512 static boolean_t 3513 phyint_exists(uint_t index, ip_stack_t *ipst) 3514 { 3515 ASSERT(index != 0); 3516 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 3517 3518 return (avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 3519 &index, NULL) != NULL); 3520 } 3521 3522 /* Pick a unique ifindex */ 3523 boolean_t 3524 ip_assign_ifindex(uint_t *indexp, ip_stack_t *ipst) 3525 { 3526 uint_t starting_index; 3527 3528 if (!ipst->ips_ill_index_wrap) { 3529 *indexp = ipst->ips_ill_index++; 3530 if (ipst->ips_ill_index == 0) { 3531 /* Reached the uint_t limit Next time wrap */ 3532 ipst->ips_ill_index_wrap = B_TRUE; 3533 } 3534 return (B_TRUE); 3535 } 3536 3537 /* 3538 * Start reusing unused indexes. Note that we hold the ill_g_lock 3539 * at this point and don't want to call any function that attempts 3540 * to get the lock again. 3541 */ 3542 starting_index = ipst->ips_ill_index++; 3543 for (; ipst->ips_ill_index != starting_index; ipst->ips_ill_index++) { 3544 if (ipst->ips_ill_index != 0 && 3545 !phyint_exists(ipst->ips_ill_index, ipst)) { 3546 /* found unused index - use it */ 3547 *indexp = ipst->ips_ill_index; 3548 return (B_TRUE); 3549 } 3550 } 3551 3552 /* 3553 * all interface indicies are inuse. 3554 */ 3555 return (B_FALSE); 3556 } 3557 3558 /* 3559 * Assign a unique interface index for the phyint. 
3560 */ 3561 static boolean_t 3562 phyint_assign_ifindex(phyint_t *phyi, ip_stack_t *ipst) 3563 { 3564 ASSERT(phyi->phyint_ifindex == 0); 3565 return (ip_assign_ifindex(&phyi->phyint_ifindex, ipst)); 3566 } 3567 3568 /* 3569 * Initialize the flags on `phyi' as per the provided mactype. 3570 */ 3571 static void 3572 phyint_flags_init(phyint_t *phyi, t_uscalar_t mactype) 3573 { 3574 uint64_t flags = 0; 3575 3576 /* 3577 * Initialize PHYI_RUNNING and PHYI_FAILED. For non-IPMP interfaces, 3578 * we always presume the underlying hardware is working and set 3579 * PHYI_RUNNING (if it's not, the driver will subsequently send a 3580 * DL_NOTE_LINK_DOWN message). For IPMP interfaces, at initialization 3581 * there are no active interfaces in the group so we set PHYI_FAILED. 3582 */ 3583 if (mactype == SUNW_DL_IPMP) 3584 flags |= PHYI_FAILED; 3585 else 3586 flags |= PHYI_RUNNING; 3587 3588 switch (mactype) { 3589 case SUNW_DL_VNI: 3590 flags |= PHYI_VIRTUAL; 3591 break; 3592 case SUNW_DL_IPMP: 3593 flags |= PHYI_IPMP; 3594 break; 3595 case DL_LOOP: 3596 flags |= (PHYI_LOOPBACK | PHYI_VIRTUAL); 3597 break; 3598 } 3599 3600 mutex_enter(&phyi->phyint_lock); 3601 phyi->phyint_flags |= flags; 3602 mutex_exit(&phyi->phyint_lock); 3603 } 3604 3605 /* 3606 * Return a pointer to the ill which matches the supplied name. Note that 3607 * the ill name length includes the null termination character. (May be 3608 * called as writer.) 3609 * If do_alloc and the interface is "lo0" it will be automatically created. 3610 * Cannot bump up reference on condemned ills. So dup detect can't be done 3611 * using this func. 3612 */ 3613 ill_t * 3614 ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, 3615 boolean_t *did_alloc, ip_stack_t *ipst) 3616 { 3617 ill_t *ill; 3618 ipif_t *ipif; 3619 ipsq_t *ipsq; 3620 kstat_named_t *kn; 3621 boolean_t isloopback; 3622 in6_addr_t ov6addr; 3623 3624 isloopback = mi_strcmp(name, ipif_loopback_name) == 0; 3625 3626 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3627 ill = ill_find_by_name(name, isv6, ipst); 3628 rw_exit(&ipst->ips_ill_g_lock); 3629 if (ill != NULL) 3630 return (ill); 3631 3632 /* 3633 * Couldn't find it. Does this happen to be a lookup for the 3634 * loopback device and are we allowed to allocate it? 3635 */ 3636 if (!isloopback || !do_alloc) 3637 return (NULL); 3638 3639 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 3640 ill = ill_find_by_name(name, isv6, ipst); 3641 if (ill != NULL) { 3642 rw_exit(&ipst->ips_ill_g_lock); 3643 return (ill); 3644 } 3645 3646 /* Create the loopback device on demand */ 3647 ill = (ill_t *)(mi_alloc(sizeof (ill_t) + 3648 sizeof (ipif_loopback_name), BPRI_MED)); 3649 if (ill == NULL) 3650 goto done; 3651 3652 *ill = ill_null; 3653 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, NULL); 3654 ill->ill_ipst = ipst; 3655 list_create(&ill->ill_nce, sizeof (nce_t), offsetof(nce_t, nce_node)); 3656 netstack_hold(ipst->ips_netstack); 3657 /* 3658 * For exclusive stacks we set the zoneid to zero 3659 * to make IP operate as if in the global zone. 
3660  */
3661 	ill->ill_zoneid = GLOBAL_ZONEID;
3662 
3663 	ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t));
3664 	if (ill->ill_phyint == NULL)
3665 		goto done;
3666 
3667 	if (isv6)
3668 		ill->ill_phyint->phyint_illv6 = ill;
3669 	else
3670 		ill->ill_phyint->phyint_illv4 = ill;
3671 	mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0);
3672 	phyint_flags_init(ill->ill_phyint, DL_LOOP);
3673 
3674 	if (isv6) {
3675 		ill->ill_isv6 = B_TRUE;
3676 		ill->ill_max_frag = ip_loopback_mtu_v6plus;
3677 	} else {
3678 		ill->ill_max_frag = ip_loopback_mtuplus;
3679 	}
3680 	if (!ill_allocate_mibs(ill))
3681 		goto done;
3682 	ill->ill_current_frag = ill->ill_max_frag;
3683 	ill->ill_mtu = ill->ill_max_frag; /* Initial value */
3684 	/*
3685 	 * ipif_loopback_name can't be pointed at directly because it's used
3686 	 * by both the ipv4 and ipv6 interfaces. When the ill is removed
3687 	 * from the glist, ill_glist_delete() sets the first character of
3688 	 * ill_name to '\0'.
3689 	 */
3690 	ill->ill_name = (char *)ill + sizeof (*ill);
3691 	(void) strcpy(ill->ill_name, ipif_loopback_name);
3692 	ill->ill_name_length = sizeof (ipif_loopback_name);
3693 	/* Set ill_dlpi_pending for ipsq_current_finish() to work properly */
3694 	ill->ill_dlpi_pending = DL_PRIM_INVAL;
3695 
3696 	rw_init(&ill->ill_mcast_lock, NULL, RW_DEFAULT, NULL);
3697 	mutex_init(&ill->ill_mcast_serializer, NULL, MUTEX_DEFAULT, NULL);
3698 	ill->ill_global_timer = INFINITY;
3699 	ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0;
3700 	ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0;
3701 	ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
3702 	ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL;
3703 
3704 	/* No resolver here. */
3705 	ill->ill_net_type = IRE_LOOPBACK;
3706 
3707 	/* Initialize the ipsq */
3708 	if (!ipsq_init(ill, B_FALSE))
3709 		goto done;
3710 
3711 	ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE, B_TRUE, NULL);
3712 	if (ipif == NULL)
3713 		goto done;
3714 
3715 	ill->ill_flags = ILLF_MULTICAST;
3716 
3717 	ov6addr = ipif->ipif_v6lcl_addr;
3718 	/* Set up default loopback address and mask. */
3719 	if (!isv6) {
3720 		ipaddr_t inaddr_loopback = htonl(INADDR_LOOPBACK);
3721 
3722 		IN6_IPADDR_TO_V4MAPPED(inaddr_loopback, &ipif->ipif_v6lcl_addr);
3723 		V4MASK_TO_V6(htonl(IN_CLASSA_NET), ipif->ipif_v6net_mask);
3724 		V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
3725 		    ipif->ipif_v6subnet);
3726 		ill->ill_flags |= ILLF_IPV4;
3727 	} else {
3728 		ipif->ipif_v6lcl_addr = ipv6_loopback;
3729 		ipif->ipif_v6net_mask = ipv6_all_ones;
3730 		V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
3731 		    ipif->ipif_v6subnet);
3732 		ill->ill_flags |= ILLF_IPV6;
3733 	}
3734 
3735 	/*
3736 	 * Chain us in at the end of the ill list. Hold the ill
3737 	 * before we make it globally visible. 1 for the lookup.
3738 	 */
3739 	ill->ill_refcnt = 0;
3740 	ill_refhold(ill);
3741 
3742 	ill->ill_frag_count = 0;
3743 	ill->ill_frag_free_num_pkts = 0;
3744 	ill->ill_last_frag_clean_time = 0;
3745 
3746 	ipsq = ill->ill_phyint->phyint_ipsq;
3747 
3748 	ill_set_inputfn(ill);
3749 
3750 	if (ill_glist_insert(ill, "lo", isv6) != 0)
3751 		cmn_err(CE_PANIC, "cannot insert loopback interface");
3752 
3753 	/* Let SCTP know so that it can add this to its list */
3754 	sctp_update_ill(ill, SCTP_ILL_INSERT);
3755 
3756 	/*
3757 	 * We have already assigned ipif_v6lcl_addr above, but we need to
3758 	 * call sctp_update_ipif_addr() after SCTP_ILL_INSERT, which needs
3759 	 * to be after ill_glist_insert() since we need the ill_index set.
3760 	 * Pass on ipv6_loopback as the old address.
3761 */ 3762 sctp_update_ipif_addr(ipif, ov6addr); 3763 3764 ip_rts_newaddrmsg(RTM_CHGADDR, 0, ipif, RTSQ_DEFAULT); 3765 3766 /* 3767 * ill_glist_insert() -> ill_phyint_reinit() may have merged IPSQs. 3768 * If so, free our original one. 3769 */ 3770 if (ipsq != ill->ill_phyint->phyint_ipsq) 3771 ipsq_delete(ipsq); 3772 3773 if (ipst->ips_loopback_ksp == NULL) { 3774 /* Export loopback interface statistics */ 3775 ipst->ips_loopback_ksp = kstat_create_netstack("lo", 0, 3776 ipif_loopback_name, "net", 3777 KSTAT_TYPE_NAMED, 2, 0, 3778 ipst->ips_netstack->netstack_stackid); 3779 if (ipst->ips_loopback_ksp != NULL) { 3780 ipst->ips_loopback_ksp->ks_update = 3781 loopback_kstat_update; 3782 kn = KSTAT_NAMED_PTR(ipst->ips_loopback_ksp); 3783 kstat_named_init(&kn[0], "ipackets", KSTAT_DATA_UINT32); 3784 kstat_named_init(&kn[1], "opackets", KSTAT_DATA_UINT32); 3785 ipst->ips_loopback_ksp->ks_private = 3786 (void *)(uintptr_t)ipst->ips_netstack-> 3787 netstack_stackid; 3788 kstat_install(ipst->ips_loopback_ksp); 3789 } 3790 } 3791 3792 *did_alloc = B_TRUE; 3793 rw_exit(&ipst->ips_ill_g_lock); 3794 ill_nic_event_dispatch(ill, MAP_IPIF_ID(ill->ill_ipif->ipif_id), 3795 NE_PLUMB, ill->ill_name, ill->ill_name_length); 3796 return (ill); 3797 done: 3798 if (ill != NULL) { 3799 if (ill->ill_phyint != NULL) { 3800 ipsq = ill->ill_phyint->phyint_ipsq; 3801 if (ipsq != NULL) { 3802 ipsq->ipsq_phyint = NULL; 3803 ipsq_delete(ipsq); 3804 } 3805 mi_free(ill->ill_phyint); 3806 } 3807 ill_free_mib(ill); 3808 if (ill->ill_ipst != NULL) 3809 netstack_rele(ill->ill_ipst->ips_netstack); 3810 mi_free(ill); 3811 } 3812 rw_exit(&ipst->ips_ill_g_lock); 3813 return (NULL); 3814 } 3815 3816 /* 3817 * For IPP calls - use the ip_stack_t for global stack. 3818 */ 3819 ill_t * 3820 ill_lookup_on_ifindex_global_instance(uint_t index, boolean_t isv6) 3821 { 3822 ip_stack_t *ipst; 3823 ill_t *ill; 3824 3825 ipst = netstack_find_by_stackid(GLOBAL_NETSTACKID)->netstack_ip; 3826 if (ipst == NULL) { 3827 cmn_err(CE_WARN, "No ip_stack_t for zoneid zero!\n"); 3828 return (NULL); 3829 } 3830 3831 ill = ill_lookup_on_ifindex(index, isv6, ipst); 3832 netstack_rele(ipst->ips_netstack); 3833 return (ill); 3834 } 3835 3836 /* 3837 * Return a pointer to the ill which matches the index and IP version type. 3838 */ 3839 ill_t * 3840 ill_lookup_on_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst) 3841 { 3842 ill_t *ill; 3843 phyint_t *phyi; 3844 3845 /* 3846 * Indexes are stored in the phyint - a common structure 3847 * to both IPv4 and IPv6. 3848 */ 3849 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3850 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 3851 (void *) &index, NULL); 3852 if (phyi != NULL) { 3853 ill = isv6 ? phyi->phyint_illv6: phyi->phyint_illv4; 3854 if (ill != NULL) { 3855 mutex_enter(&ill->ill_lock); 3856 if (!ILL_IS_CONDEMNED(ill)) { 3857 ill_refhold_locked(ill); 3858 mutex_exit(&ill->ill_lock); 3859 rw_exit(&ipst->ips_ill_g_lock); 3860 return (ill); 3861 } 3862 mutex_exit(&ill->ill_lock); 3863 } 3864 } 3865 rw_exit(&ipst->ips_ill_g_lock); 3866 return (NULL); 3867 } 3868 3869 /* 3870 * Verify whether or not an interface index is valid for the specified zoneid 3871 * to transmit packets. 3872 * It can be zero (meaning "reset") or an interface index assigned 3873 * to a non-VNI interface. (We don't use VNI interface to send packets.) 
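 *
 * As an illustrative sketch (not part of the original source), a caller
 * validating a transmit index supplied through a socket option might use
 * this as follows; the ENXIO error choice and the surrounding variables
 * are our assumptions:
 *
 *	if (!ip_xmit_ifindex_valid(ifindex, zoneid, isv6, ipst))
 *		return (ENXIO);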
3874 */ 3875 boolean_t 3876 ip_xmit_ifindex_valid(uint_t ifindex, zoneid_t zoneid, boolean_t isv6, 3877 ip_stack_t *ipst) 3878 { 3879 ill_t *ill; 3880 3881 if (ifindex == 0) 3882 return (B_TRUE); 3883 3884 ill = ill_lookup_on_ifindex_zoneid(ifindex, zoneid, isv6, ipst); 3885 if (ill == NULL) 3886 return (B_FALSE); 3887 if (IS_VNI(ill)) { 3888 ill_refrele(ill); 3889 return (B_FALSE); 3890 } 3891 ill_refrele(ill); 3892 return (B_TRUE); 3893 } 3894 3895 /* 3896 * Return the ifindex next in sequence after the passed-in ifindex. 3897 * If there is no next ifindex for the given protocol, return 0. 3898 */ 3899 uint_t 3900 ill_get_next_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst) 3901 { 3902 phyint_t *phyi; 3903 phyint_t *phyi_initial; 3904 uint_t ifindex; 3905 3906 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3907 3908 if (index == 0) { 3909 phyi = avl_first( 3910 &ipst->ips_phyint_g_list->phyint_list_avl_by_index); 3911 } else { 3912 phyi = phyi_initial = avl_find( 3913 &ipst->ips_phyint_g_list->phyint_list_avl_by_index, 3914 (void *) &index, NULL); 3915 } 3916 3917 for (; phyi != NULL; 3918 phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 3919 phyi, AVL_AFTER)) { 3920 /* 3921 * If we're not returning the first interface in the tree 3922 * and we still haven't moved past the phyint_t that 3923 * corresponds to index, avl_walk needs to be called again. 3924 */ 3925 if (!((index != 0) && (phyi == phyi_initial))) { 3926 if (isv6) { 3927 if ((phyi->phyint_illv6) && 3928 ILL_CAN_LOOKUP(phyi->phyint_illv6) && 3929 (phyi->phyint_illv6->ill_isv6 == 1)) 3930 break; 3931 } else { 3932 if ((phyi->phyint_illv4) && 3933 ILL_CAN_LOOKUP(phyi->phyint_illv4) && 3934 (phyi->phyint_illv4->ill_isv6 == 0)) 3935 break; 3936 } 3937 } 3938 } 3939 3940 rw_exit(&ipst->ips_ill_g_lock); 3941 3942 if (phyi != NULL) 3943 ifindex = phyi->phyint_ifindex; 3944 else 3945 ifindex = 0; 3946 3947 return (ifindex); 3948 } 3949 3950 /* 3951 * Return the ifindex for the named interface. 3952 * If there is no ifindex for the interface, return 0. 3953 */ 3954 uint_t 3955 ill_get_ifindex_by_name(char *name, ip_stack_t *ipst) 3956 { 3957 phyint_t *phyi; 3958 avl_index_t where = 0; 3959 uint_t ifindex; 3960 3961 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3962 3963 if ((phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 3964 name, &where)) == NULL) { 3965 rw_exit(&ipst->ips_ill_g_lock); 3966 return (0); 3967 } 3968 3969 ifindex = phyi->phyint_ifindex; 3970 3971 rw_exit(&ipst->ips_ill_g_lock); 3972 3973 return (ifindex); 3974 } 3975 3976 /* 3977 * Return the ifindex to be used by upper layer protocols, for instance 3978 * for IPV6_RECVPKTINFO. For IPMP, this is the one for the upper ill. 3979 */ 3980 uint_t 3981 ill_get_upper_ifindex(const ill_t *ill) 3982 { 3983 if (IS_UNDER_IPMP(ill)) 3984 return (ipmp_ill_get_ipmp_ifindex(ill)); 3985 else 3986 return (ill->ill_phyint->phyint_ifindex); 3987 } 3988 3989 3990 /* 3991 * Obtain a reference to the ill. The ill_refcnt is a dynamic refcnt 3992 * that gives a running thread a reference to the ill. This reference must be 3993 * released by the thread when it is done accessing the ill and related 3994 * objects. ill_refcnt cannot be used to account for static references 3995 * such as other structures pointing to an ill. Callers must generally 3996 * check whether an ill can be refheld by using ILL_CAN_LOOKUP macros 3997 * or be sure that the ill is not being deleted or changing state before 3998 * calling the refhold functions.
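 *
 * An illustrative hold/release pattern (a sketch, not from the original
 * source): a thread with no other guarantee against teardown can use
 * ill_check_and_refhold() below to refuse condemned ills:
 *
 *	if (ill_check_and_refhold(ill)) {
 *		... access the ill and related objects ...
 *		ill_refrele(ill);
 *	}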
A non-zero ill_refcnt ensures that the 3999 * ill won't change any of its critical state such as address, netmask etc. 4000 */ 4001 void 4002 ill_refhold(ill_t *ill) 4003 { 4004 mutex_enter(&ill->ill_lock); 4005 ill->ill_refcnt++; 4006 ILL_TRACE_REF(ill); 4007 mutex_exit(&ill->ill_lock); 4008 } 4009 4010 void 4011 ill_refhold_locked(ill_t *ill) 4012 { 4013 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4014 ill->ill_refcnt++; 4015 ILL_TRACE_REF(ill); 4016 } 4017 4018 /* Returns true if we managed to get a refhold */ 4019 boolean_t 4020 ill_check_and_refhold(ill_t *ill) 4021 { 4022 mutex_enter(&ill->ill_lock); 4023 if (!ILL_IS_CONDEMNED(ill)) { 4024 ill_refhold_locked(ill); 4025 mutex_exit(&ill->ill_lock); 4026 return (B_TRUE); 4027 } 4028 mutex_exit(&ill->ill_lock); 4029 return (B_FALSE); 4030 } 4031 4032 /* 4033 * Must not be called while holding any locks. Otherwise, if this is 4034 * the last reference to be released, there is a chance of recursive mutex 4035 * panic due to ill_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 4036 * to restart an ioctl. 4037 */ 4038 void 4039 ill_refrele(ill_t *ill) 4040 { 4041 mutex_enter(&ill->ill_lock); 4042 ASSERT(ill->ill_refcnt != 0); 4043 ill->ill_refcnt--; 4044 ILL_UNTRACE_REF(ill); 4045 if (ill->ill_refcnt != 0) { 4046 /* Every ire pointing to the ill adds 1 to ill_refcnt */ 4047 mutex_exit(&ill->ill_lock); 4048 return; 4049 } 4050 4051 /* Drops the ill_lock */ 4052 ipif_ill_refrele_tail(ill); 4053 } 4054 4055 /* 4056 * Obtain a weak reference count on the ill. This reference ensures the 4057 * ill won't be freed, but the ill may change any of its critical state 4058 * such as netmask, address etc. Returns B_FALSE if the ill has started 4059 * closing. 4060 */ 4061 boolean_t 4062 ill_waiter_inc(ill_t *ill) 4063 { 4064 mutex_enter(&ill->ill_lock); 4065 if (ill->ill_state_flags & ILL_CONDEMNED) { 4066 mutex_exit(&ill->ill_lock); 4067 return (B_FALSE); 4068 } 4069 ill->ill_waiters++; 4070 mutex_exit(&ill->ill_lock); 4071 return (B_TRUE); 4072 } 4073 4074 void 4075 ill_waiter_dcr(ill_t *ill) 4076 { 4077 mutex_enter(&ill->ill_lock); 4078 ill->ill_waiters--; 4079 if (ill->ill_waiters == 0) 4080 cv_broadcast(&ill->ill_cv); 4081 mutex_exit(&ill->ill_lock); 4082 } 4083 4084 /* 4085 * ip_ll_subnet_defaults is called when we get the DL_INFO_ACK back from the 4086 * driver. We construct best guess defaults for lower level information that 4087 * we need. If an interface is brought up without injection of any overriding 4088 * information from outside, we have to be ready to go with these defaults. 4089 * When we get the first DL_INFO_ACK (from ip_open() sending a DL_INFO_REQ) 4090 * we primarily want the dl_provider_style. 4091 * The subsequent DL_INFO_ACK is received after doing a DL_ATTACH and DL_BIND 4092 * at which point we assume the other part of the information is valid. 4093 */ 4094 void 4095 ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) 4096 { 4097 uchar_t *brdcst_addr; 4098 uint_t brdcst_addr_length, phys_addr_length; 4099 t_scalar_t sap_length; 4100 dl_info_ack_t *dlia; 4101 ip_m_t *ipm; 4102 dl_qos_cl_sel1_t *sel1; 4103 int min_mtu; 4104 4105 ASSERT(IAM_WRITER_ILL(ill)); 4106 4107 /* 4108 * Till the ill is fully up the ill is not globally visible. 4109 * So no need for a lock.
4110 */ 4111 dlia = (dl_info_ack_t *)mp->b_rptr; 4112 ill->ill_mactype = dlia->dl_mac_type; 4113 4114 ipm = ip_m_lookup(dlia->dl_mac_type); 4115 if (ipm == NULL) { 4116 ipm = ip_m_lookup(DL_OTHER); 4117 ASSERT(ipm != NULL); 4118 } 4119 ill->ill_media = ipm; 4120 4121 /* 4122 * When the new DLPI stuff is ready we'll pull lengths 4123 * from dlia. 4124 */ 4125 if (dlia->dl_version == DL_VERSION_2) { 4126 brdcst_addr_length = dlia->dl_brdcst_addr_length; 4127 brdcst_addr = mi_offset_param(mp, dlia->dl_brdcst_addr_offset, 4128 brdcst_addr_length); 4129 if (brdcst_addr == NULL) { 4130 brdcst_addr_length = 0; 4131 } 4132 sap_length = dlia->dl_sap_length; 4133 phys_addr_length = dlia->dl_addr_length - ABS(sap_length); 4134 ip1dbg(("ip: bcast_len %d, sap_len %d, phys_len %d\n", 4135 brdcst_addr_length, sap_length, phys_addr_length)); 4136 } else { 4137 brdcst_addr_length = 6; 4138 brdcst_addr = ip_six_byte_all_ones; 4139 sap_length = -2; 4140 phys_addr_length = brdcst_addr_length; 4141 } 4142 4143 ill->ill_bcast_addr_length = brdcst_addr_length; 4144 ill->ill_phys_addr_length = phys_addr_length; 4145 ill->ill_sap_length = sap_length; 4146 4147 /* 4148 * Synthetic DLPI types such as SUNW_DL_IPMP specify a zero SDU, 4149 * but we must ensure a minimum IP MTU is used since other bits of 4150 * IP will fly apart otherwise. 4151 */ 4152 min_mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU; 4153 ill->ill_max_frag = MAX(min_mtu, dlia->dl_max_sdu); 4154 ill->ill_current_frag = ill->ill_max_frag; 4155 ill->ill_mtu = ill->ill_max_frag; 4156 4157 ill->ill_type = ipm->ip_m_type; 4158 4159 if (!ill->ill_dlpi_style_set) { 4160 if (dlia->dl_provider_style == DL_STYLE2) 4161 ill->ill_needs_attach = 1; 4162 4163 phyint_flags_init(ill->ill_phyint, ill->ill_mactype); 4164 4165 /* 4166 * Allocate the first ipif on this ill. We don't delay it 4167 * further as ioctl handling assumes at least one ipif exists. 4168 * 4169 * At this point we don't know whether the ill is v4 or v6. 4170 * We will know this when the SIOCSLIFNAME happens and 4171 * the correct value for ill_isv6 will be assigned in 4172 * ipif_set_values(). We need to hold the ill lock and 4173 * clear the ILL_LL_SUBNET_PENDING flag and atomically do 4174 * the wakeup. 4175 */ 4176 (void) ipif_allocate(ill, 0, IRE_LOCAL, 4177 dlia->dl_provider_style != DL_STYLE2, B_TRUE, NULL); 4178 mutex_enter(&ill->ill_lock); 4179 ASSERT(ill->ill_dlpi_style_set == 0); 4180 ill->ill_dlpi_style_set = 1; 4181 ill->ill_state_flags &= ~ILL_LL_SUBNET_PENDING; 4182 cv_broadcast(&ill->ill_cv); 4183 mutex_exit(&ill->ill_lock); 4184 freemsg(mp); 4185 return; 4186 } 4187 ASSERT(ill->ill_ipif != NULL); 4188 /* 4189 * We know whether it is IPv4 or IPv6 now, as this is the 4190 * second DL_INFO_ACK we are receiving in response to the 4191 * DL_INFO_REQ sent in ipif_set_values. 4192 */ 4193 ill->ill_sap = (ill->ill_isv6) ? ipm->ip_m_ipv6sap : ipm->ip_m_ipv4sap; 4194 /* 4195 * Clear all the flags that were set based on ill_bcast_addr_length 4196 * and ill_phys_addr_length (in ipif_set_values) as these could have 4197 * changed now and we need to re-evaluate. 4198 */ 4199 ill->ill_flags &= ~(ILLF_MULTICAST | ILLF_NONUD | ILLF_NOARP); 4200 ill->ill_ipif->ipif_flags &= ~(IPIF_BROADCAST | IPIF_POINTOPOINT); 4201 4202 /* 4203 * Free ill_bcast_mp as things could have changed now.
4204 * 4205 * NOTE: The IPMP meta-interface is special-cased because it starts 4206 * with no underlying interfaces (and thus an unknown broadcast 4207 * address length), but we enforce that an interface is broadcast- 4208 * capable as part of allowing it to join a group. 4209 */ 4210 if (ill->ill_bcast_addr_length == 0 && !IS_IPMP(ill)) { 4211 if (ill->ill_bcast_mp != NULL) 4212 freemsg(ill->ill_bcast_mp); 4213 ill->ill_net_type = IRE_IF_NORESOLVER; 4214 4215 ill->ill_bcast_mp = ill_dlur_gen(NULL, 4216 ill->ill_phys_addr_length, 4217 ill->ill_sap, 4218 ill->ill_sap_length); 4219 4220 if (ill->ill_isv6) 4221 /* 4222 * Note: xresolv interfaces will eventually need NOARP 4223 * set here as well, but that will require those 4224 * external resolvers to have some knowledge of 4225 * that flag and act appropriately. Not to be changed 4226 * at present. 4227 */ 4228 ill->ill_flags |= ILLF_NONUD; 4229 else 4230 ill->ill_flags |= ILLF_NOARP; 4231 4232 if (ill->ill_mactype == SUNW_DL_VNI) { 4233 ill->ill_ipif->ipif_flags |= IPIF_NOXMIT; 4234 } else if (ill->ill_phys_addr_length == 0 || 4235 ill->ill_mactype == DL_IPV4 || 4236 ill->ill_mactype == DL_IPV6) { 4237 /* 4238 * The underlying link is point-to-point, so mark the 4239 * interface as such. We can do IP multicast over 4240 * such a link since it transmits all network-layer 4241 * packets to the remote side the same way. 4242 */ 4243 ill->ill_flags |= ILLF_MULTICAST; 4244 ill->ill_ipif->ipif_flags |= IPIF_POINTOPOINT; 4245 } 4246 } else { 4247 ill->ill_net_type = IRE_IF_RESOLVER; 4248 if (ill->ill_bcast_mp != NULL) 4249 freemsg(ill->ill_bcast_mp); 4250 ill->ill_bcast_mp = ill_dlur_gen(brdcst_addr, 4251 ill->ill_bcast_addr_length, ill->ill_sap, 4252 ill->ill_sap_length); 4253 /* 4254 * Later detect lack of DLPI driver multicast 4255 * capability by catching DL_ENABMULTI errors in 4256 * ip_rput_dlpi. 4257 */ 4258 ill->ill_flags |= ILLF_MULTICAST; 4259 if (!ill->ill_isv6) 4260 ill->ill_ipif->ipif_flags |= IPIF_BROADCAST; 4261 } 4262 4263 /* For IPMP, PHYI_IPMP should already be set by phyint_flags_init() */ 4264 if (ill->ill_mactype == SUNW_DL_IPMP) 4265 ASSERT(ill->ill_phyint->phyint_flags & PHYI_IPMP); 4266 4267 /* By default an interface does not support any CoS marking */ 4268 ill->ill_flags &= ~ILLF_COS_ENABLED; 4269 4270 /* 4271 * If we get QoS information in DL_INFO_ACK, the device supports 4272 * some form of CoS marking; set ILLF_COS_ENABLED. 4273 */ 4274 sel1 = (dl_qos_cl_sel1_t *)mi_offset_param(mp, dlia->dl_qos_offset, 4275 dlia->dl_qos_length); 4276 if ((sel1 != NULL) && (sel1->dl_qos_type == DL_QOS_CL_SEL1)) { 4277 ill->ill_flags |= ILLF_COS_ENABLED; 4278 } 4279 4280 /* Clear any previous error indication. */ 4281 ill->ill_error = 0; 4282 freemsg(mp); 4283 } 4284 4285 /* 4286 * Perform various checks to verify that an address would make sense as a 4287 * local, remote, or subnet interface address. 4288 */ 4289 static boolean_t 4290 ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask) 4291 { 4292 ipaddr_t net_mask; 4293 4294 /* 4295 * Don't allow all zeroes, or all ones, but allow 4296 * an all-ones netmask.
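 *
 * For example (an illustrative walk-through, not from the original
 * source): with subnet_mask 255.255.255.0, 192.0.2.0 fails the
 * addr == (addr & net_mask) test (the subnet address) and 192.0.2.255
 * fails the addr == (addr | ~net_mask) test (the subnet broadcast);
 * with an all-ones netmask (a host route) both tests are skipped, but
 * 255.255.255.255 and class D addresses are still rejected below.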
4297 */ 4298 if ((net_mask = ip_net_mask(addr)) == 0) 4299 return (B_FALSE); 4300 /* A given netmask overrides the "guess" netmask */ 4301 if (subnet_mask != 0) 4302 net_mask = subnet_mask; 4303 if ((net_mask != ~(ipaddr_t)0) && ((addr == (addr & net_mask)) || 4304 (addr == (addr | ~net_mask)))) { 4305 return (B_FALSE); 4306 } 4307 4308 /* 4309 * Even if the netmask is all ones, we do not allow address to be 4310 * 255.255.255.255 4311 */ 4312 if (addr == INADDR_BROADCAST) 4313 return (B_FALSE); 4314 4315 if (CLASSD(addr)) 4316 return (B_FALSE); 4317 4318 return (B_TRUE); 4319 } 4320 4321 #define V6_IPIF_LINKLOCAL(p) \ 4322 IN6_IS_ADDR_LINKLOCAL(&(p)->ipif_v6lcl_addr) 4323 4324 /* 4325 * Compare two given ipifs and check if the second one is better than 4326 * the first one using the order of preference (not taking deprecated 4327 * into account) specified in ipif_lookup_multicast(). 4328 */ 4329 static boolean_t 4330 ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, boolean_t isv6) 4331 { 4332 /* Check the least preferred first. */ 4333 if (IS_LOOPBACK(old_ipif->ipif_ill)) { 4334 /* If both ipifs are loopback, use the first one. */ 4335 if (IS_LOOPBACK(new_ipif->ipif_ill)) 4336 return (B_FALSE); 4337 else 4338 return (B_TRUE); 4339 } 4340 4341 /* For IPv6, check for link local address. */ 4342 if (isv6 && V6_IPIF_LINKLOCAL(old_ipif)) { 4343 if (IS_LOOPBACK(new_ipif->ipif_ill) || 4344 V6_IPIF_LINKLOCAL(new_ipif)) { 4345 /* The second one is equal or less preferred. */ 4346 return (B_FALSE); 4347 } else { 4348 return (B_TRUE); 4349 } 4350 } 4351 4352 /* Then check for point to point interface. */ 4353 if (old_ipif->ipif_flags & IPIF_POINTOPOINT) { 4354 if (IS_LOOPBACK(new_ipif->ipif_ill) || 4355 (isv6 && V6_IPIF_LINKLOCAL(new_ipif)) || 4356 (new_ipif->ipif_flags & IPIF_POINTOPOINT)) { 4357 return (B_FALSE); 4358 } else { 4359 return (B_TRUE); 4360 } 4361 } 4362 4363 /* old_ipif is a normal interface, so no need to use the new one. */ 4364 return (B_FALSE); 4365 } 4366 4367 /* 4368 * Find a multicast-capable ipif given an IP instance and zoneid. 4369 * The ipif must be up, and its ill must be multicast-capable, not 4370 * condemned, not an underlying interface in an IPMP group, and 4371 * not a VNI interface. Order of preference: 4372 * 4373 * 1a. normal 4374 * 1b. normal, but deprecated 4375 * 2a. point to point 4376 * 2b. point to point, but deprecated 4377 * 3a. link local 4378 * 3b. link local, but deprecated 4379 * 4. loopback. 4380 */ 4381 static ipif_t * 4382 ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) 4383 { 4384 ill_t *ill; 4385 ill_walk_context_t ctx; 4386 ipif_t *ipif; 4387 ipif_t *saved_ipif = NULL; 4388 ipif_t *dep_ipif = NULL; 4389 4390 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4391 if (isv6) 4392 ill = ILL_START_WALK_V6(&ctx, ipst); 4393 else 4394 ill = ILL_START_WALK_V4(&ctx, ipst); 4395 4396 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 4397 mutex_enter(&ill->ill_lock); 4398 if (IS_VNI(ill) || IS_UNDER_IPMP(ill) || 4399 ILL_IS_CONDEMNED(ill) || 4400 !(ill->ill_flags & ILLF_MULTICAST)) { 4401 mutex_exit(&ill->ill_lock); 4402 continue; 4403 } 4404 for (ipif = ill->ill_ipif; ipif != NULL; 4405 ipif = ipif->ipif_next) { 4406 if (zoneid != ipif->ipif_zoneid && 4407 zoneid != ALL_ZONES && 4408 ipif->ipif_zoneid != ALL_ZONES) { 4409 continue; 4410 } 4411 if (!(ipif->ipif_flags & IPIF_UP) || 4412 IPIF_IS_CONDEMNED(ipif)) { 4413 continue; 4414 } 4415 4416 /* 4417 * Found one candidate. If it is deprecated, 4418 * remember it in dep_ipif.
If it is not deprecated, 4419 * remember it in saved_ipif. 4420 */ 4421 if (ipif->ipif_flags & IPIF_DEPRECATED) { 4422 if (dep_ipif == NULL) { 4423 dep_ipif = ipif; 4424 } else if (ipif_comp_multi(dep_ipif, ipif, 4425 isv6)) { 4426 /* 4427 * If the previous dep_ipif does not 4428 * belong to the same ill, we've done 4429 * an ipif_refhold() on it. So we need 4430 * to release it. 4431 */ 4432 if (dep_ipif->ipif_ill != ill) 4433 ipif_refrele(dep_ipif); 4434 dep_ipif = ipif; 4435 } 4436 continue; 4437 } 4438 if (saved_ipif == NULL) { 4439 saved_ipif = ipif; 4440 } else { 4441 if (ipif_comp_multi(saved_ipif, ipif, isv6)) { 4442 if (saved_ipif->ipif_ill != ill) 4443 ipif_refrele(saved_ipif); 4444 saved_ipif = ipif; 4445 } 4446 } 4447 } 4448 /* 4449 * Before going to the next ill, do an ipif_refhold() on the 4450 * saved ones. 4451 */ 4452 if (saved_ipif != NULL && saved_ipif->ipif_ill == ill) 4453 ipif_refhold_locked(saved_ipif); 4454 if (dep_ipif != NULL && dep_ipif->ipif_ill == ill) 4455 ipif_refhold_locked(dep_ipif); 4456 mutex_exit(&ill->ill_lock); 4457 } 4458 rw_exit(&ipst->ips_ill_g_lock); 4459 4460 /* 4461 * If we have only the saved_ipif, return it. But if we have both 4462 * saved_ipif and dep_ipif, check to see which one is better. 4463 */ 4464 if (saved_ipif != NULL) { 4465 if (dep_ipif != NULL) { 4466 if (ipif_comp_multi(saved_ipif, dep_ipif, isv6)) { 4467 ipif_refrele(saved_ipif); 4468 return (dep_ipif); 4469 } else { 4470 ipif_refrele(dep_ipif); 4471 return (saved_ipif); 4472 } 4473 } 4474 return (saved_ipif); 4475 } else { 4476 return (dep_ipif); 4477 } 4478 } 4479 4480 ill_t * 4481 ill_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) 4482 { 4483 ipif_t *ipif; 4484 ill_t *ill; 4485 4486 ipif = ipif_lookup_multicast(ipst, zoneid, isv6); 4487 if (ipif == NULL) 4488 return (NULL); 4489 4490 ill = ipif->ipif_ill; 4491 ill_refhold(ill); 4492 ipif_refrele(ipif); 4493 return (ill); 4494 } 4495 4496 /* 4497 * This function is called when an application does not specify an interface 4498 * to be used for multicast traffic (joining a group/sending data). It 4499 * calls ire_lookup_multi() to look for an interface route for the 4500 * specified multicast group. Doing this allows the administrator to add 4501 * prefix routes for multicast to indicate which interface should be used for 4502 * multicast traffic in the above scenario. The route could be for all 4503 * multicast (224.0/4), for a single multicast group (a /32 route) or 4504 * anything in between. If there is no such multicast route, we just find 4505 * any multicast-capable interface and return it. The returned ipif 4506 * is refhold'ed. 4507 * 4508 * We support MULTIRT and RTF_SETSRC on the multicast routes added to the 4509 * unicast table. This is used by CGTP. 4510 */ 4511 ill_t * 4512 ill_lookup_group_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst, 4513 boolean_t *multirtp, ipaddr_t *setsrcp) 4514 { 4515 ill_t *ill; 4516 4517 ill = ire_lookup_multi_ill_v4(group, zoneid, ipst, multirtp, setsrcp); 4518 if (ill != NULL) 4519 return (ill); 4520 4521 return (ill_lookup_multicast(ipst, zoneid, B_FALSE)); 4522 } 4523 4524 /* 4525 * Look for an ipif with the specified interface address and destination. 4526 * The destination address is used only for matching point-to-point interfaces.
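 *
 * Illustrative usage (a sketch, not from the original source; both
 * address variables are assumptions and are in network byte order):
 *
 *	ipif = ipif_lookup_interface(local_addr, peer_dst_addr, ipst);
 *	if (ipif != NULL) {
 *		... examine the ipif ...
 *		ipif_refrele(ipif);
 *	}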
4527 */ 4528 ipif_t * 4529 ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, ip_stack_t *ipst) 4530 { 4531 ipif_t *ipif; 4532 ill_t *ill; 4533 ill_walk_context_t ctx; 4534 4535 /* 4536 * First match all the point-to-point interfaces 4537 * before looking at non-point-to-point interfaces. 4538 * This is done to avoid returning non-point-to-point 4539 * ipif instead of unnumbered point-to-point ipif. 4540 */ 4541 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4542 ill = ILL_START_WALK_V4(&ctx, ipst); 4543 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 4544 mutex_enter(&ill->ill_lock); 4545 for (ipif = ill->ill_ipif; ipif != NULL; 4546 ipif = ipif->ipif_next) { 4547 /* Allow the ipif to be down */ 4548 if ((ipif->ipif_flags & IPIF_POINTOPOINT) && 4549 (ipif->ipif_lcl_addr == if_addr) && 4550 (ipif->ipif_pp_dst_addr == dst)) { 4551 if (!IPIF_IS_CONDEMNED(ipif)) { 4552 ipif_refhold_locked(ipif); 4553 mutex_exit(&ill->ill_lock); 4554 rw_exit(&ipst->ips_ill_g_lock); 4555 return (ipif); 4556 } 4557 } 4558 } 4559 mutex_exit(&ill->ill_lock); 4560 } 4561 rw_exit(&ipst->ips_ill_g_lock); 4562 4563 /* lookup the ipif based on interface address */ 4564 ipif = ipif_lookup_addr(if_addr, NULL, ALL_ZONES, ipst); 4565 ASSERT(ipif == NULL || !ipif->ipif_isv6); 4566 return (ipif); 4567 } 4568 4569 /* 4570 * Common function for ipif_lookup_addr() and ipif_lookup_addr_exact(). 4571 */ 4572 static ipif_t * 4573 ipif_lookup_addr_common(ipaddr_t addr, ill_t *match_ill, uint32_t match_flags, 4574 zoneid_t zoneid, ip_stack_t *ipst) 4575 { 4576 ipif_t *ipif; 4577 ill_t *ill; 4578 boolean_t ptp = B_FALSE; 4579 ill_walk_context_t ctx; 4580 boolean_t match_illgrp = (match_flags & IPIF_MATCH_ILLGRP); 4581 boolean_t no_duplicate = (match_flags & IPIF_MATCH_NONDUP); 4582 4583 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4584 /* 4585 * Repeat twice, first based on local addresses and 4586 * next time for pointopoint. 4587 */ 4588 repeat: 4589 ill = ILL_START_WALK_V4(&ctx, ipst); 4590 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 4591 if (match_ill != NULL && ill != match_ill && 4592 (!match_illgrp || !IS_IN_SAME_ILLGRP(ill, match_ill))) { 4593 continue; 4594 } 4595 mutex_enter(&ill->ill_lock); 4596 for (ipif = ill->ill_ipif; ipif != NULL; 4597 ipif = ipif->ipif_next) { 4598 if (zoneid != ALL_ZONES && 4599 zoneid != ipif->ipif_zoneid && 4600 ipif->ipif_zoneid != ALL_ZONES) 4601 continue; 4602 4603 if (no_duplicate && !(ipif->ipif_flags & IPIF_UP)) 4604 continue; 4605 4606 /* Allow the ipif to be down */ 4607 if ((!ptp && (ipif->ipif_lcl_addr == addr) && 4608 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || 4609 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) && 4610 (ipif->ipif_pp_dst_addr == addr))) { 4611 if (!IPIF_IS_CONDEMNED(ipif)) { 4612 ipif_refhold_locked(ipif); 4613 mutex_exit(&ill->ill_lock); 4614 rw_exit(&ipst->ips_ill_g_lock); 4615 return (ipif); 4616 } 4617 } 4618 } 4619 mutex_exit(&ill->ill_lock); 4620 } 4621 4622 /* If we already did the ptp case, then we are done */ 4623 if (ptp) { 4624 rw_exit(&ipst->ips_ill_g_lock); 4625 return (NULL); 4626 } 4627 ptp = B_TRUE; 4628 goto repeat; 4629 } 4630 4631 /* 4632 * Lookup an ipif with the specified address. For point-to-point links we 4633 * look for matches on either the destination address or the local address, 4634 * but we skip the local address check if IPIF_UNNUMBERED is set. If the 4635 * `match_ill' argument is non-NULL, the lookup is restricted to that ill 4636 * (or illgrp if `match_ill' is in an IPMP group). 
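 *
 * A minimal illustrative call (a sketch, not from the original source;
 * `addr' and `ipst' are assumed to be in scope): look up an address on
 * any ill and in any zone, then drop the hold:
 *
 *	if ((ipif = ipif_lookup_addr(addr, NULL, ALL_ZONES, ipst)) != NULL) {
 *		... use the ipif ...
 *		ipif_refrele(ipif);
 *	}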
4637 */ 4638 ipif_t * 4639 ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, 4640 ip_stack_t *ipst) 4641 { 4642 return (ipif_lookup_addr_common(addr, match_ill, IPIF_MATCH_ILLGRP, 4643 zoneid, ipst)); 4644 } 4645 4646 /* 4647 * Lookup an ipif with the specified address. Similar to ipif_lookup_addr, 4648 * except that we will only return an address if it is not marked as 4649 * IPIF_DUPLICATE. 4650 */ 4651 ipif_t * 4652 ipif_lookup_addr_nondup(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, 4653 ip_stack_t *ipst) 4654 { 4655 return (ipif_lookup_addr_common(addr, match_ill, 4656 (IPIF_MATCH_ILLGRP | IPIF_MATCH_NONDUP), 4657 zoneid, ipst)); 4658 } 4659 4660 /* 4661 * Special abbreviated version of ipif_lookup_addr() that doesn't match 4662 * `match_ill' across the IPMP group. This function is only needed in some 4663 * corner-cases; almost everything should use ipif_lookup_addr(). 4664 */ 4665 ipif_t * 4666 ipif_lookup_addr_exact(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst) 4667 { 4668 ASSERT(match_ill != NULL); 4669 return (ipif_lookup_addr_common(addr, match_ill, 0, ALL_ZONES, 4670 ipst)); 4671 } 4672 4673 /* 4674 * Look for an ipif with the specified address. For point-to-point links 4675 * we look for matches on either the destination address or the local 4676 * address, but we ignore the check on the local address if IPIF_UNNUMBERED 4677 * is set. 4678 * If the `match_ill' argument is non-NULL, the lookup is restricted to that 4679 * ill (or illgrp if `match_ill' is in an IPMP group). 4680 * Return the zoneid for the ipif which matches; ALL_ZONES if no match. 4681 */ 4682 zoneid_t 4683 ipif_lookup_addr_zoneid(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst) 4684 { 4685 zoneid_t zoneid; 4686 ipif_t *ipif; 4687 ill_t *ill; 4688 boolean_t ptp = B_FALSE; 4689 ill_walk_context_t ctx; 4690 4691 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4692 /* 4693 * Repeat twice, first based on local addresses and 4694 * next time for pointopoint. 4695 */ 4696 repeat: 4697 ill = ILL_START_WALK_V4(&ctx, ipst); 4698 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 4699 if (match_ill != NULL && ill != match_ill && 4700 !IS_IN_SAME_ILLGRP(ill, match_ill)) { 4701 continue; 4702 } 4703 mutex_enter(&ill->ill_lock); 4704 for (ipif = ill->ill_ipif; ipif != NULL; 4705 ipif = ipif->ipif_next) { 4706 /* Allow the ipif to be down */ 4707 if ((!ptp && (ipif->ipif_lcl_addr == addr) && 4708 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || 4709 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) && 4710 (ipif->ipif_pp_dst_addr == addr)) && 4711 !(ipif->ipif_state_flags & IPIF_CONDEMNED)) { 4712 zoneid = ipif->ipif_zoneid; 4713 mutex_exit(&ill->ill_lock); 4714 rw_exit(&ipst->ips_ill_g_lock); 4715 /* 4716 * If ipif_zoneid was ALL_ZONES then we have 4717 * a trusted extensions shared IP address. 4718 * In that case GLOBAL_ZONEID works to send. 4719 */ 4720 if (zoneid == ALL_ZONES) 4721 zoneid = GLOBAL_ZONEID; 4722 return (zoneid); 4723 } 4724 } 4725 mutex_exit(&ill->ill_lock); 4726 } 4727 4728 /* If we already did the ptp case, then we are done */ 4729 if (ptp) { 4730 rw_exit(&ipst->ips_ill_g_lock); 4731 return (ALL_ZONES); 4732 } 4733 ptp = B_TRUE; 4734 goto repeat; 4735 } 4736 4737 /* 4738 * Look for an ipif that matches the specified remote address, i.e., the 4739 * ipif that would receive the specified packet. 4740 * First look for directly connected interfaces and then do a recursive 4741 * IRE lookup and pick the first ipif corresponding to the source address in the 4742 * ire.
4743 * Returns: held ipif 4744 * 4745 * This is only used for ICMP_ADDRESS_MASK_REQUESTs 4746 */ 4747 ipif_t * 4748 ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid) 4749 { 4750 ipif_t *ipif; 4751 4752 ASSERT(!ill->ill_isv6); 4753 4754 /* 4755 * Someone could be changing this ipif currently or change it 4756 * after we return this. Thus a few packets could use the 4757 * old values. However, structure updates/creates (ire, ilg, ilm etc.) 4758 * will atomically be updated or cleaned up with the new value. 4759 * Thus we don't need a lock to check the flags or other attrs below. 4760 */ 4761 mutex_enter(&ill->ill_lock); 4762 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 4763 if (IPIF_IS_CONDEMNED(ipif)) 4764 continue; 4765 if (zoneid != ALL_ZONES && zoneid != ipif->ipif_zoneid && 4766 ipif->ipif_zoneid != ALL_ZONES) 4767 continue; 4768 /* Allow the ipif to be down */ 4769 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 4770 if ((ipif->ipif_pp_dst_addr == addr) || 4771 (!(ipif->ipif_flags & IPIF_UNNUMBERED) && 4772 ipif->ipif_lcl_addr == addr)) { 4773 ipif_refhold_locked(ipif); 4774 mutex_exit(&ill->ill_lock); 4775 return (ipif); 4776 } 4777 } else if (ipif->ipif_subnet == (addr & ipif->ipif_net_mask)) { 4778 ipif_refhold_locked(ipif); 4779 mutex_exit(&ill->ill_lock); 4780 return (ipif); 4781 } 4782 } 4783 mutex_exit(&ill->ill_lock); 4784 /* 4785 * For a remote destination it isn't possible to nail down a particular 4786 * ipif. 4787 */ 4788 4789 /* Pick the first interface */ 4790 ipif = ipif_get_next_ipif(NULL, ill); 4791 return (ipif); 4792 } 4793 4794 /* 4795 * This func does not prevent refcnt from increasing. But if 4796 * the caller has taken steps to that effect, then this func 4797 * can be used to determine whether the ill has become quiescent. 4798 */ 4799 static boolean_t 4800 ill_is_quiescent(ill_t *ill) 4801 { 4802 ipif_t *ipif; 4803 4804 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4805 4806 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 4807 if (ipif->ipif_refcnt != 0) 4808 return (B_FALSE); 4809 } 4810 if (!ILL_DOWN_OK(ill) || ill->ill_refcnt != 0) { 4811 return (B_FALSE); 4812 } 4813 return (B_TRUE); 4814 } 4815 4816 boolean_t 4817 ill_is_freeable(ill_t *ill) 4818 { 4819 ipif_t *ipif; 4820 4821 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4822 4823 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 4824 if (ipif->ipif_refcnt != 0) { 4825 return (B_FALSE); 4826 } 4827 } 4828 if (!ILL_FREE_OK(ill) || ill->ill_refcnt != 0) { 4829 return (B_FALSE); 4830 } 4831 return (B_TRUE); 4832 } 4833 4834 /* 4835 * This func does not prevent refcnt from increasing. But if 4836 * the caller has taken steps to that effect, then this func 4837 * can be used to determine whether the ipif has become quiescent. 4838 */ 4839 static boolean_t 4840 ipif_is_quiescent(ipif_t *ipif) 4841 { 4842 ill_t *ill; 4843 4844 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 4845 4846 if (ipif->ipif_refcnt != 0) 4847 return (B_FALSE); 4848 4849 ill = ipif->ipif_ill; 4850 if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 || 4851 ill->ill_logical_down) { 4852 return (B_TRUE); 4853 } 4854 4855 /* This is the last ipif going down or being deleted on this ill */ 4856 if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0) { 4857 return (B_FALSE); 4858 } 4859 4860 return (B_TRUE); 4861 } 4862 4863 /* 4864 * Return B_TRUE if the ipif can be destroyed: the ipif has to be quiescent 4865 * with zero references from ire/ilm to it.
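 *
 * (Editorial cross-reference, not part of the original comment: these
 * quiescence predicates are consumed by ipif_ill_refrele_tail() below,
 * which maps the ipsq's ipx_waitfor state - IPIF_DOWN, IPIF_FREE,
 * ILL_DOWN or ILL_FREE - to the matching check before restarting a
 * queued operation.)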
4866 */ 4867 static boolean_t 4868 ipif_is_freeable(ipif_t *ipif) 4869 { 4870 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 4871 ASSERT(ipif->ipif_id != 0); 4872 return (ipif->ipif_refcnt == 0); 4873 } 4874 4875 /* 4876 * The ipif/ill/ire has been refrele'd. Do the tail processing. 4877 * Determine if the ipif or ill in question has become quiescent and if so 4878 * wake up close and/or restart any queued pending ioctl that is waiting 4879 * for the ipif_down (or ill_down). 4880 */ 4881 void 4882 ipif_ill_refrele_tail(ill_t *ill) 4883 { 4884 mblk_t *mp; 4885 conn_t *connp; 4886 ipsq_t *ipsq; 4887 ipxop_t *ipx; 4888 ipif_t *ipif; 4889 dl_notify_ind_t *dlindp; 4890 4891 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4892 4893 if ((ill->ill_state_flags & ILL_CONDEMNED) && ill_is_freeable(ill)) { 4894 /* ip_modclose() may be waiting */ 4895 cv_broadcast(&ill->ill_cv); 4896 } 4897 4898 ipsq = ill->ill_phyint->phyint_ipsq; 4899 mutex_enter(&ipsq->ipsq_lock); 4900 ipx = ipsq->ipsq_xop; 4901 mutex_enter(&ipx->ipx_lock); 4902 if (ipx->ipx_waitfor == 0) /* no one's waiting; bail */ 4903 goto unlock; 4904 4905 ASSERT(ipx->ipx_pending_mp != NULL && ipx->ipx_pending_ipif != NULL); 4906 4907 ipif = ipx->ipx_pending_ipif; 4908 if (ipif->ipif_ill != ill) /* wait is for another ill; bail */ 4909 goto unlock; 4910 4911 switch (ipx->ipx_waitfor) { 4912 case IPIF_DOWN: 4913 if (!ipif_is_quiescent(ipif)) 4914 goto unlock; 4915 break; 4916 case IPIF_FREE: 4917 if (!ipif_is_freeable(ipif)) 4918 goto unlock; 4919 break; 4920 case ILL_DOWN: 4921 if (!ill_is_quiescent(ill)) 4922 goto unlock; 4923 break; 4924 case ILL_FREE: 4925 /* 4926 * ILL_FREE is only for loopback; normal ill teardown waits 4927 * synchronously in ip_modclose() without using ipx_waitfor, 4928 * handled by the cv_broadcast() at the top of this function. 4929 */ 4930 if (!ill_is_freeable(ill)) 4931 goto unlock; 4932 break; 4933 default: 4934 cmn_err(CE_PANIC, "ipsq: %p unknown ipx_waitfor %d\n", 4935 (void *)ipsq, ipx->ipx_waitfor); 4936 } 4937 4938 ill_refhold_locked(ill); /* for qwriter_ip() call below */ 4939 mutex_exit(&ipx->ipx_lock); 4940 mp = ipsq_pending_mp_get(ipsq, &connp); 4941 mutex_exit(&ipsq->ipsq_lock); 4942 mutex_exit(&ill->ill_lock); 4943 4944 ASSERT(mp != NULL); 4945 /* 4946 * NOTE: all of the qwriter_ip() calls below use CUR_OP since 4947 * we can only get here when the current operation decides 4948 * it needs to quiesce via ipsq_pending_mp_add(). 4949 */ 4950 switch (mp->b_datap->db_type) { 4951 case M_PCPROTO: 4952 case M_PROTO: 4953 /* 4954 * For now, only DL_NOTIFY_IND messages can use this facility. 4955 */ 4956 dlindp = (dl_notify_ind_t *)mp->b_rptr; 4957 ASSERT(dlindp->dl_primitive == DL_NOTIFY_IND); 4958 4959 switch (dlindp->dl_notification) { 4960 case DL_NOTE_PHYS_ADDR: 4961 qwriter_ip(ill, ill->ill_rq, mp, 4962 ill_set_phys_addr_tail, CUR_OP, B_TRUE); 4963 return; 4964 case DL_NOTE_REPLUMB: 4965 qwriter_ip(ill, ill->ill_rq, mp, 4966 ill_replumb_tail, CUR_OP, B_TRUE); 4967 return; 4968 default: 4969 ASSERT(0); 4970 ill_refrele(ill); 4971 } 4972 break; 4973 4974 case M_ERROR: 4975 case M_HANGUP: 4976 qwriter_ip(ill, ill->ill_rq, mp, ipif_all_down_tail, CUR_OP, 4977 B_TRUE); 4978 return; 4979 4980 case M_IOCTL: 4981 case M_IOCDATA: 4982 qwriter_ip(ill, (connp != NULL ?
CONNP_TO_WQ(connp) : 4983 ill->ill_wq), mp, ip_reprocess_ioctl, CUR_OP, B_TRUE); 4984 return; 4985 4986 default: 4987 cmn_err(CE_PANIC, "ipif_ill_refrele_tail mp %p " 4988 "db_type %d\n", (void *)mp, mp->b_datap->db_type); 4989 } 4990 return; 4991 unlock: 4992 mutex_exit(&ipsq->ipsq_lock); 4993 mutex_exit(&ipx->ipx_lock); 4994 mutex_exit(&ill->ill_lock); 4995 } 4996 4997 #ifdef DEBUG 4998 /* Reuse trace buffer from beginning (if reached the end) and record trace */ 4999 static void 5000 th_trace_rrecord(th_trace_t *th_trace) 5001 { 5002 tr_buf_t *tr_buf; 5003 uint_t lastref; 5004 5005 lastref = th_trace->th_trace_lastref; 5006 lastref++; 5007 if (lastref == TR_BUF_MAX) 5008 lastref = 0; 5009 th_trace->th_trace_lastref = lastref; 5010 tr_buf = &th_trace->th_trbuf[lastref]; 5011 tr_buf->tr_time = ddi_get_lbolt(); 5012 tr_buf->tr_depth = getpcstack(tr_buf->tr_stack, TR_STACK_DEPTH); 5013 } 5014 5015 static void 5016 th_trace_free(void *value) 5017 { 5018 th_trace_t *th_trace = value; 5019 5020 ASSERT(th_trace->th_refcnt == 0); 5021 kmem_free(th_trace, sizeof (*th_trace)); 5022 } 5023 5024 /* 5025 * Find or create the per-thread hash table used to track object references. 5026 * The ipst argument is NULL if we shouldn't allocate. 5027 * 5028 * Accesses per-thread data, so there's no need to lock here. 5029 */ 5030 static mod_hash_t * 5031 th_trace_gethash(ip_stack_t *ipst) 5032 { 5033 th_hash_t *thh; 5034 5035 if ((thh = tsd_get(ip_thread_data)) == NULL && ipst != NULL) { 5036 mod_hash_t *mh; 5037 char name[256]; 5038 size_t objsize, rshift; 5039 int retv; 5040 5041 if ((thh = kmem_alloc(sizeof (*thh), KM_NOSLEEP)) == NULL) 5042 return (NULL); 5043 (void) snprintf(name, sizeof (name), "th_trace_%p", 5044 (void *)curthread); 5045 5046 /* 5047 * We use mod_hash_create_extended here rather than the more 5048 * obvious mod_hash_create_ptrhash because the latter has a 5049 * hard-coded KM_SLEEP, and we'd prefer to fail rather than 5050 * block. 5051 */ 5052 objsize = MAX(MAX(sizeof (ill_t), sizeof (ipif_t)), 5053 MAX(sizeof (ire_t), sizeof (ncec_t))); 5054 rshift = highbit(objsize); 5055 mh = mod_hash_create_extended(name, 64, mod_hash_null_keydtor, 5056 th_trace_free, mod_hash_byptr, (void *)rshift, 5057 mod_hash_ptrkey_cmp, KM_NOSLEEP); 5058 if (mh == NULL) { 5059 kmem_free(thh, sizeof (*thh)); 5060 return (NULL); 5061 } 5062 thh->thh_hash = mh; 5063 thh->thh_ipst = ipst; 5064 /* 5065 * We trace ills, ipifs, ires, and nces. All of these are 5066 * per-IP-stack, so the lock on the thread list is as well. 5067 */ 5068 rw_enter(&ip_thread_rwlock, RW_WRITER); 5069 list_insert_tail(&ip_thread_list, thh); 5070 rw_exit(&ip_thread_rwlock); 5071 retv = tsd_set(ip_thread_data, thh); 5072 ASSERT(retv == 0); 5073 } 5074 return (thh != NULL ? thh->thh_hash : NULL); 5075 } 5076 5077 boolean_t 5078 th_trace_ref(const void *obj, ip_stack_t *ipst) 5079 { 5080 th_trace_t *th_trace; 5081 mod_hash_t *mh; 5082 mod_hash_val_t val; 5083 5084 if ((mh = th_trace_gethash(ipst)) == NULL) 5085 return (B_FALSE); 5086 5087 /* 5088 * Attempt to locate the trace buffer for this obj and thread. 5089 * If it does not exist, then allocate a new trace buffer and 5090 * insert into the hash. 
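 *
 * (Illustrative note, not from the original comment: the hash is keyed
 * by the traced object's address, so each thread ends up with at most
 * one th_trace_t per ill/ipif/ire/ncec it currently holds.)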
5091 */ 5092 if (mod_hash_find(mh, (mod_hash_key_t)obj, &val) == MH_ERR_NOTFOUND) { 5093 th_trace = kmem_zalloc(sizeof (th_trace_t), KM_NOSLEEP); 5094 if (th_trace == NULL) 5095 return (B_FALSE); 5096 5097 th_trace->th_id = curthread; 5098 if (mod_hash_insert(mh, (mod_hash_key_t)obj, 5099 (mod_hash_val_t)th_trace) != 0) { 5100 kmem_free(th_trace, sizeof (th_trace_t)); 5101 return (B_FALSE); 5102 } 5103 } else { 5104 th_trace = (th_trace_t *)val; 5105 } 5106 5107 ASSERT(th_trace->th_refcnt >= 0 && 5108 th_trace->th_refcnt < TR_BUF_MAX - 1); 5109 5110 th_trace->th_refcnt++; 5111 th_trace_rrecord(th_trace); 5112 return (B_TRUE); 5113 } 5114 5115 /* 5116 * For the purpose of tracing a reference release, we assume that global 5117 * tracing is always on and that the same thread that initiated the reference 5118 * hold is releasing it. 5119 */ 5120 void 5121 th_trace_unref(const void *obj) 5122 { 5123 int retv; 5124 mod_hash_t *mh; 5125 th_trace_t *th_trace; 5126 mod_hash_val_t val; 5127 5128 mh = th_trace_gethash(NULL); 5129 retv = mod_hash_find(mh, (mod_hash_key_t)obj, &val); 5130 ASSERT(retv == 0); 5131 th_trace = (th_trace_t *)val; 5132 5133 ASSERT(th_trace->th_refcnt > 0); 5134 th_trace->th_refcnt--; 5135 th_trace_rrecord(th_trace); 5136 } 5137 5138 /* 5139 * If tracing has been disabled, then we assume that the reference counts are 5140 * now useless, and we clear them out before destroying the entries. 5141 */ 5142 void 5143 th_trace_cleanup(const void *obj, boolean_t trace_disable) 5144 { 5145 th_hash_t *thh; 5146 mod_hash_t *mh; 5147 mod_hash_val_t val; 5148 th_trace_t *th_trace; 5149 int retv; 5150 5151 rw_enter(&ip_thread_rwlock, RW_READER); 5152 for (thh = list_head(&ip_thread_list); thh != NULL; 5153 thh = list_next(&ip_thread_list, thh)) { 5154 if (mod_hash_find(mh = thh->thh_hash, (mod_hash_key_t)obj, 5155 &val) == 0) { 5156 th_trace = (th_trace_t *)val; 5157 if (trace_disable) 5158 th_trace->th_refcnt = 0; 5159 retv = mod_hash_destroy(mh, (mod_hash_key_t)obj); 5160 ASSERT(retv == 0); 5161 } 5162 } 5163 rw_exit(&ip_thread_rwlock); 5164 } 5165 5166 void 5167 ipif_trace_ref(ipif_t *ipif) 5168 { 5169 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 5170 5171 if (ipif->ipif_trace_disable) 5172 return; 5173 5174 if (!th_trace_ref(ipif, ipif->ipif_ill->ill_ipst)) { 5175 ipif->ipif_trace_disable = B_TRUE; 5176 ipif_trace_cleanup(ipif); 5177 } 5178 } 5179 5180 void 5181 ipif_untrace_ref(ipif_t *ipif) 5182 { 5183 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 5184 5185 if (!ipif->ipif_trace_disable) 5186 th_trace_unref(ipif); 5187 } 5188 5189 void 5190 ill_trace_ref(ill_t *ill) 5191 { 5192 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5193 5194 if (ill->ill_trace_disable) 5195 return; 5196 5197 if (!th_trace_ref(ill, ill->ill_ipst)) { 5198 ill->ill_trace_disable = B_TRUE; 5199 ill_trace_cleanup(ill); 5200 } 5201 } 5202 5203 void 5204 ill_untrace_ref(ill_t *ill) 5205 { 5206 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5207 5208 if (!ill->ill_trace_disable) 5209 th_trace_unref(ill); 5210 } 5211 5212 /* 5213 * Called when ipif is unplumbed or when memory alloc fails. Note that on 5214 * failure, ipif_trace_disable is set. 5215 */ 5216 static void 5217 ipif_trace_cleanup(const ipif_t *ipif) 5218 { 5219 th_trace_cleanup(ipif, ipif->ipif_trace_disable); 5220 } 5221 5222 /* 5223 * Called when ill is unplumbed or when memory alloc fails. Note that on 5224 * failure, ill_trace_disable is set.
5225 */ 5226 static void 5227 ill_trace_cleanup(const ill_t *ill) 5228 { 5229 th_trace_cleanup(ill, ill->ill_trace_disable); 5230 } 5231 #endif /* DEBUG */ 5232 5233 void 5234 ipif_refhold_locked(ipif_t *ipif) 5235 { 5236 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 5237 ipif->ipif_refcnt++; 5238 IPIF_TRACE_REF(ipif); 5239 } 5240 5241 void 5242 ipif_refhold(ipif_t *ipif) 5243 { 5244 ill_t *ill; 5245 5246 ill = ipif->ipif_ill; 5247 mutex_enter(&ill->ill_lock); 5248 ipif->ipif_refcnt++; 5249 IPIF_TRACE_REF(ipif); 5250 mutex_exit(&ill->ill_lock); 5251 } 5252 5253 /* 5254 * Must not be called while holding any locks. Otherwise, if this is 5255 * the last reference to be released, there is a chance of recursive mutex 5256 * panic due to ipif_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 5257 * to restart an ioctl. 5258 */ 5259 void 5260 ipif_refrele(ipif_t *ipif) 5261 { 5262 ill_t *ill; 5263 5264 ill = ipif->ipif_ill; 5265 5266 mutex_enter(&ill->ill_lock); 5267 ASSERT(ipif->ipif_refcnt != 0); 5268 ipif->ipif_refcnt--; 5269 IPIF_UNTRACE_REF(ipif); 5270 if (ipif->ipif_refcnt != 0) { 5271 mutex_exit(&ill->ill_lock); 5272 return; 5273 } 5274 5275 /* Drops the ill_lock */ 5276 ipif_ill_refrele_tail(ill); 5277 } 5278 5279 ipif_t * 5280 ipif_get_next_ipif(ipif_t *curr, ill_t *ill) 5281 { 5282 ipif_t *ipif; 5283 5284 mutex_enter(&ill->ill_lock); 5285 for (ipif = (curr == NULL ? ill->ill_ipif : curr->ipif_next); 5286 ipif != NULL; ipif = ipif->ipif_next) { 5287 if (IPIF_IS_CONDEMNED(ipif)) 5288 continue; 5289 ipif_refhold_locked(ipif); 5290 mutex_exit(&ill->ill_lock); 5291 return (ipif); 5292 } 5293 mutex_exit(&ill->ill_lock); 5294 return (NULL); 5295 } 5296 5297 /* 5298 * TODO: make this table extendible at run time 5299 * Return a pointer to the mac type info for 'mac_type' 5300 */ 5301 static ip_m_t * 5302 ip_m_lookup(t_uscalar_t mac_type) 5303 { 5304 ip_m_t *ipm; 5305 5306 for (ipm = ip_m_tbl; ipm < A_END(ip_m_tbl); ipm++) 5307 if (ipm->ip_m_mac_type == mac_type) 5308 return (ipm); 5309 return (NULL); 5310 } 5311 5312 /* 5313 * Make a link layer address from the multicast IP address *addr. 5314 * To form the link layer address, invoke the ip_m_v*mapping function 5315 * associated with the link-layer type. 5316 */ 5317 void 5318 ip_mcast_mapping(ill_t *ill, uchar_t *addr, uchar_t *hwaddr) 5319 { 5320 ip_m_t *ipm; 5321 5322 if (ill->ill_net_type == IRE_IF_NORESOLVER) 5323 return; 5324 5325 ASSERT(addr != NULL); 5326 5327 ipm = ip_m_lookup(ill->ill_mactype); 5328 if (ipm == NULL || 5329 (ill->ill_isv6 && ipm->ip_m_v6mapping == NULL) || 5330 (!ill->ill_isv6 && ipm->ip_m_v4mapping == NULL)) { 5331 ip0dbg(("no mapping for ill %s mactype 0x%x\n", 5332 ill->ill_name, ill->ill_mactype)); 5333 return; 5334 } 5335 if (ill->ill_isv6) 5336 (*ipm->ip_m_v6mapping)(ill, addr, hwaddr); 5337 else 5338 (*ipm->ip_m_v4mapping)(ill, addr, hwaddr); 5339 } 5340 5341 /* 5342 * Returns B_FALSE if the IPv4 netmask `mask' is non-contiguous. 5343 * Otherwise returns B_TRUE. 5344 * 5345 * The netmask can be verified to be contiguous with 32 shift and OR 5346 * operations. Take the contiguous mask (in host byte order) and compute 5347 * mask | mask << 1 | mask << 2 | ... | mask << 31 5348 * the result will be the same as 'mask' for a contiguous mask.
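 *
 * For example (an illustrative check, not from the original source):
 * folding 0xffffff00 (a /24 mask) over all 31 left shifts leaves it
 * unchanged, so it is accepted; the non-contiguous 0xff00ff00 folds to
 * 0xffffff00, which differs from the original value, so it is rejected.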
5349 */ 5350 static boolean_t 5351 ip_contiguous_mask(uint32_t mask) 5352 { 5353 uint32_t m = mask; 5354 int i; 5355 5356 for (i = 1; i < 32; i++) 5357 m |= (mask << i); 5358 5359 return (m == mask); 5360 } 5361 5362 /* 5363 * ip_rt_add is called to add an IPv4 route to the forwarding table. 5364 * ill is passed in to associate it with the correct interface. 5365 * If ire_arg is set, then we return the held IRE in that location. 5366 */ 5367 int 5368 ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, 5369 ipaddr_t src_addr, int flags, ill_t *ill, ire_t **ire_arg, 5370 boolean_t ioctl_msg, struct rtsa_s *sp, ip_stack_t *ipst, zoneid_t zoneid) 5371 { 5372 ire_t *ire, *nire; 5373 ire_t *gw_ire = NULL; 5374 ipif_t *ipif = NULL; 5375 uint_t type; 5376 int match_flags = MATCH_IRE_TYPE; 5377 tsol_gc_t *gc = NULL; 5378 tsol_gcgrp_t *gcgrp = NULL; 5379 boolean_t gcgrp_xtraref = B_FALSE; 5380 boolean_t cgtp_broadcast; 5381 boolean_t unbound = B_FALSE; 5382 5383 ip1dbg(("ip_rt_add:")); 5384 5385 if (ire_arg != NULL) 5386 *ire_arg = NULL; 5387 5388 /* disallow non-contiguous netmasks */ 5389 if (!ip_contiguous_mask(ntohl(mask))) 5390 return (ENOTSUP); 5391 5392 /* 5393 * If this is the case of RTF_HOST being set, then we set the netmask 5394 * to all ones (regardless of whether one was supplied). 5395 */ 5396 if (flags & RTF_HOST) 5397 mask = IP_HOST_MASK; 5398 5399 /* 5400 * Prevent routes with a zero gateway from being created (since 5401 * interfaces can currently be plumbed and brought up with no assigned 5402 * address). 5403 */ 5404 if (gw_addr == 0) 5405 return (ENETUNREACH); 5406 /* 5407 * Get the ipif, if any, corresponding to the gw_addr. 5408 * If -ifp was specified we restrict ourselves to the ill, otherwise 5409 * we match on the gateway and destination to handle unnumbered pt-pt 5410 * interfaces. 5411 */ 5412 if (ill != NULL) 5413 ipif = ipif_lookup_addr(gw_addr, ill, ALL_ZONES, ipst); 5414 else 5415 ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst); 5416 if (ipif != NULL) { 5417 if (IS_VNI(ipif->ipif_ill)) { 5418 ipif_refrele(ipif); 5419 return (EINVAL); 5420 } 5421 } 5422 5423 /* 5424 * GateD will attempt to create routes with a loopback interface 5425 * address as the gateway and with RTF_GATEWAY set. We allow 5426 * these routes to be added, but create them as interface routes 5427 * since the gateway is an interface address. 5428 */ 5429 if ((ipif != NULL) && (ipif->ipif_ire_type == IRE_LOOPBACK)) { 5430 flags &= ~RTF_GATEWAY; 5431 if (gw_addr == INADDR_LOOPBACK && dst_addr == INADDR_LOOPBACK && 5432 mask == IP_HOST_MASK) { 5433 ire = ire_ftable_lookup_v4(dst_addr, 0, 0, IRE_LOOPBACK, 5434 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst, 5435 NULL); 5436 if (ire != NULL) { 5437 ire_refrele(ire); 5438 ipif_refrele(ipif); 5439 return (EEXIST); 5440 } 5441 ip1dbg(("ip_rt_add: 0x%p creating IRE 0x%x" 5442 "for 0x%x\n", (void *)ipif, 5443 ipif->ipif_ire_type, 5444 ntohl(ipif->ipif_lcl_addr))); 5445 ire = ire_create( 5446 (uchar_t *)&dst_addr, /* dest address */ 5447 (uchar_t *)&mask, /* mask */ 5448 NULL, /* no gateway */ 5449 ipif->ipif_ire_type, /* LOOPBACK */ 5450 ipif->ipif_ill, 5451 zoneid, 5452 (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE : 0, 5453 NULL, 5454 ipst); 5455 5456 if (ire == NULL) { 5457 ipif_refrele(ipif); 5458 return (ENOMEM); 5459 } 5460 /* src address assigned by the caller?
*/ 5461 if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC)) 5462 ire->ire_setsrc_addr = src_addr; 5463 5464 nire = ire_add(ire); 5465 if (nire == NULL) { 5466 /* 5467 * In the event of failure, ire_add() will have 5468 * already deleted the ire in question, so there 5469 * is no need to do that here. 5470 */ 5471 ipif_refrele(ipif); 5472 return (ENOMEM); 5473 } 5474 /* 5475 * Check if it was a duplicate entry. This handles 5476 * the case of two racing route adds for the same route. 5477 */ 5478 if (nire != ire) { 5479 ASSERT(nire->ire_identical_ref > 1); 5480 ire_delete(nire); 5481 ire_refrele(nire); 5482 ipif_refrele(ipif); 5483 return (EEXIST); 5484 } 5485 ire = nire; 5486 goto save_ire; 5487 } 5488 } 5489 5490 /* 5491 * The routes for multicast with CGTP are quite special in that 5492 * the gateway is the local interface address, yet RTF_GATEWAY 5493 * is set. We turn off RTF_GATEWAY to provide compatibility with 5494 * this undocumented and unusual use of multicast routes. 5495 */ 5496 if ((flags & RTF_MULTIRT) && ipif != NULL) 5497 flags &= ~RTF_GATEWAY; 5498 5499 /* 5500 * Traditionally, interface routes are ones where RTF_GATEWAY isn't set 5501 * and the gateway address provided is one of the system's interface 5502 * addresses. By using the routing socket interface and supplying an 5503 * RTA_IFP sockaddr with an interface index, an alternate method of 5504 * specifying an interface route to be created is available which uses 5505 * the interface index that specifies the outgoing interface rather than 5506 * the address of an outgoing interface (which may not be able to 5507 * uniquely identify an interface). When coupled with the RTF_GATEWAY 5508 * flag, routes can be specified which not only specify the next-hop to 5509 * be used when routing to a certain prefix, but also which outgoing 5510 * interface should be used. 5511 * 5512 * Previously, interfaces would have unique addresses assigned to them 5513 * and so the address assigned to a particular interface could be used 5514 * to identify a particular interface. One exception to this was the 5515 * case of an unnumbered interface (where IPIF_UNNUMBERED was set). 5516 * 5517 * With the advent of IPv6 and its link-local addresses, this 5518 * restriction was relaxed and interfaces could share addresses between 5519 * themselves. In fact, typically all of the link-local interfaces on 5520 * an IPv6 node or router will have the same link-local address. In 5521 * order to differentiate between these interfaces, the use of an 5522 * interface index is necessary and this index can be carried inside a 5523 * RTA_IFP sockaddr (which is actually a sockaddr_dl). One restriction 5524 * of using the interface index, however, is that all of the ipif's that 5525 * are part of an ill have the same index and so the RTA_IFP sockaddr 5526 * cannot be used to differentiate between ipif's (or logical 5527 * interfaces) that belong to the same ill (physical interface). 5528 * 5529 * For example, in the following case involving IPv4 interfaces and 5530 * logical interfaces 5531 * 5532 * 192.0.2.32 255.255.255.224 192.0.2.33 U if0 5533 * 192.0.2.32 255.255.255.224 192.0.2.34 U if0 5534 * 192.0.2.32 255.255.255.224 192.0.2.35 U if0 5535 * 5536 * the ipif's corresponding to each of these interface routes can be 5537 * uniquely identified by the "gateway" (actually interface address).
5538 * 5539 * In this case involving multiple IPv6 default routes to a particular 5540 * link-local gateway, the use of RTA_IFP is necessary to specify which 5541 * default route is of interest: 5542 * 5543 * default fe80::123:4567:89ab:cdef U if0 5544 * default fe80::123:4567:89ab:cdef U if1 5545 */ 5546 5547 /* RTF_GATEWAY not set */ 5548 if (!(flags & RTF_GATEWAY)) { 5549 if (sp != NULL) { 5550 ip2dbg(("ip_rt_add: gateway security attributes " 5551 "cannot be set with interface route\n")); 5552 if (ipif != NULL) 5553 ipif_refrele(ipif); 5554 return (EINVAL); 5555 } 5556 5557 /* 5558 * Whether or not ill (RTA_IFP) is set, we require that 5559 * the gateway is one of our local addresses. 5560 */ 5561 if (ipif == NULL) 5562 return (ENETUNREACH); 5563 5564 /* 5565 * We use MATCH_IRE_ILL here. If the caller specified an 5566 * interface (from the RTA_IFP sockaddr) we use it, otherwise 5567 * we use the ill derived from the gateway address. 5568 * We can always match the gateway address since we record it 5569 * in ire_gateway_addr. 5570 * We don't allow RTA_IFP to specify a different ill than the 5571 * one matching the ipif to make sure we can delete the route. 5572 */ 5573 match_flags |= MATCH_IRE_GW | MATCH_IRE_ILL; 5574 if (ill == NULL) { 5575 ill = ipif->ipif_ill; 5576 } else if (ill != ipif->ipif_ill) { 5577 ipif_refrele(ipif); 5578 return (EINVAL); 5579 } 5580 5581 /* 5582 * We check for an existing entry at this point. 5583 * 5584 * Since a netmask isn't passed in via the ioctl interface 5585 * (SIOCADDRT), we don't check for a matching netmask in that 5586 * case. 5587 */ 5588 if (!ioctl_msg) 5589 match_flags |= MATCH_IRE_MASK; 5590 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, 5591 IRE_INTERFACE, ill, ALL_ZONES, NULL, match_flags, 0, ipst, 5592 NULL); 5593 if (ire != NULL) { 5594 ire_refrele(ire); 5595 ipif_refrele(ipif); 5596 return (EEXIST); 5597 } 5598 5599 /* 5600 * Some software (for example, GateD and Sun Cluster) attempts 5601 * to create (what amount to) IRE_PREFIX routes with the 5602 * loopback address as the gateway. This is primarily done to 5603 * set up prefixes with the RTF_REJECT flag set (for example, 5604 * when generating aggregate routes.) 5605 * 5606 * If the IRE type (as defined by ill->ill_net_type) would be 5607 * IRE_LOOPBACK, then we map the request into an 5608 * IRE_IF_NORESOLVER. We also OR in the RTF_BLACKHOLE flag as 5609 * these interface routes, by definition, can only be that. 5610 * 5611 * Needless to say, the real IRE_LOOPBACK is NOT created by this 5612 * routine, but rather using ire_create() directly. 5613 * 5614 */ 5615 type = ill->ill_net_type; 5616 if (type == IRE_LOOPBACK) { 5617 type = IRE_IF_NORESOLVER; 5618 flags |= RTF_BLACKHOLE; 5619 } 5620 5621 /* 5622 * Create a copy of the IRE_IF_NORESOLVER or 5623 * IRE_IF_RESOLVER with the modified address, netmask, and 5624 * gateway. 5625 */ 5626 ire = ire_create( 5627 (uchar_t *)&dst_addr, 5628 (uint8_t *)&mask, 5629 (uint8_t *)&gw_addr, 5630 type, 5631 ill, 5632 zoneid, 5633 flags, 5634 NULL, 5635 ipst); 5636 if (ire == NULL) { 5637 ipif_refrele(ipif); 5638 return (ENOMEM); 5639 } 5640 5641 /* src address assigned by the caller? */ 5642 if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC)) 5643 ire->ire_setsrc_addr = src_addr; 5644 5645 nire = ire_add(ire); 5646 if (nire == NULL) { 5647 /* 5648 * In the event of failure, ire_add() will have 5649 * already deleted the ire in question, so there 5650 * is no need to do that here.
5651 */ 5652 ipif_refrele(ipif); 5653 return (ENOMEM); 5654 } 5655 /* 5656 * Check if it was a duplicate entry. This handles 5657 * the case of two racing route adds for the same route 5658 */ 5659 if (nire != ire) { 5660 ire_delete(nire); 5661 ire_refrele(nire); 5662 ipif_refrele(ipif); 5663 return (EEXIST); 5664 } 5665 ire = nire; 5666 goto save_ire; 5667 } 5668 5669 /* 5670 * Get an interface IRE for the specified gateway. 5671 * If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the 5672 * gateway, it is currently unreachable and we fail the request 5673 * accordingly. We reject any RTF_GATEWAY routes where the gateway 5674 * is an IRE_LOCAL or IRE_LOOPBACK. 5675 * If RTA_IFP was specified we look on that particular ill. 5676 */ 5677 if (ill != NULL) 5678 match_flags |= MATCH_IRE_ILL; 5679 5680 /* Check whether the gateway is reachable. */ 5681 again: 5682 type = IRE_INTERFACE | IRE_LOCAL | IRE_LOOPBACK; 5683 if (flags & RTF_INDIRECT) 5684 type |= IRE_OFFLINK; 5685 5686 gw_ire = ire_ftable_lookup_v4(gw_addr, 0, 0, type, ill, 5687 ALL_ZONES, NULL, match_flags, 0, ipst, NULL); 5688 if (gw_ire == NULL) { 5689 /* 5690 * With IPMP, we allow host routes to influence in.mpathd's 5691 * target selection. However, if the test addresses are on 5692 * their own network, the above lookup will fail since the 5693 * underlying IRE_INTERFACEs are marked hidden. So allow 5694 * hidden test IREs to be found and try again. 5695 */ 5696 if (!(match_flags & MATCH_IRE_TESTHIDDEN)) { 5697 match_flags |= MATCH_IRE_TESTHIDDEN; 5698 goto again; 5699 } 5700 if (ipif != NULL) 5701 ipif_refrele(ipif); 5702 return (ENETUNREACH); 5703 } 5704 if (gw_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) { 5705 ire_refrele(gw_ire); 5706 if (ipif != NULL) 5707 ipif_refrele(ipif); 5708 return (ENETUNREACH); 5709 } 5710 5711 if (ill == NULL && !(flags & RTF_INDIRECT)) { 5712 unbound = B_TRUE; 5713 if (ipst->ips_ip_strict_src_multihoming > 0) 5714 ill = gw_ire->ire_ill; 5715 } 5716 5717 /* 5718 * We create one of three types of IREs as a result of this request 5719 * based on the netmask. A netmask of all ones (which is automatically 5720 * assumed when RTF_HOST is set) results in an IRE_HOST being created. 5721 * An all zeroes netmask implies a default route so an IRE_DEFAULT is 5722 * created. Otherwise, an IRE_PREFIX route is created for the 5723 * destination prefix. 5724 */ 5725 if (mask == IP_HOST_MASK) 5726 type = IRE_HOST; 5727 else if (mask == 0) 5728 type = IRE_DEFAULT; 5729 else 5730 type = IRE_PREFIX; 5731 5732 /* check for a duplicate entry */ 5733 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill, 5734 ALL_ZONES, NULL, match_flags | MATCH_IRE_MASK | MATCH_IRE_GW, 5735 0, ipst, NULL); 5736 if (ire != NULL) { 5737 if (ipif != NULL) 5738 ipif_refrele(ipif); 5739 ire_refrele(gw_ire); 5740 ire_refrele(ire); 5741 return (EEXIST); 5742 } 5743 5744 /* Security attribute exists */ 5745 if (sp != NULL) { 5746 tsol_gcgrp_addr_t ga; 5747 5748 /* find or create the gateway credentials group */ 5749 ga.ga_af = AF_INET; 5750 IN6_IPADDR_TO_V4MAPPED(gw_addr, &ga.ga_addr); 5751 5752 /* we hold reference to it upon success */ 5753 gcgrp = gcgrp_lookup(&ga, B_TRUE); 5754 if (gcgrp == NULL) { 5755 if (ipif != NULL) 5756 ipif_refrele(ipif); 5757 ire_refrele(gw_ire); 5758 return (ENOMEM); 5759 } 5760 5761 /* 5762 * Create and add the security attribute to the group; a 5763 * reference to the group is made upon allocating a new 5764 * entry successfully. 
If it finds an already-existing
5765		 * entry for the security attribute in the group, it simply
5766		 * returns it and no new reference is made to the group.
5767		 */
5768		gc = gc_create(sp, gcgrp, &gcgrp_xtraref);
5769		if (gc == NULL) {
5770			if (ipif != NULL)
5771				ipif_refrele(ipif);
5772			/* release reference held by gcgrp_lookup */
5773			GCGRP_REFRELE(gcgrp);
5774			ire_refrele(gw_ire);
5775			return (ENOMEM);
5776		}
5777	}
5778
5779	/* Create the IRE. */
5780	ire = ire_create(
5781	    (uchar_t *)&dst_addr,		/* dest address */
5782	    (uchar_t *)&mask,			/* mask */
5783	    (uchar_t *)&gw_addr,		/* gateway address */
5784	    (ushort_t)type,			/* IRE type */
5785	    ill,
5786	    zoneid,
5787	    flags,
5788	    gc,					/* security attribute */
5789	    ipst);
5790
5791	/*
5792	 * The ire holds a reference to the 'gc' and the 'gc' holds a
5793	 * reference to the 'gcgrp'. We can now release the extra reference
5794	 * the 'gcgrp' acquired in the gcgrp_lookup, if it was not used.
5795	 */
5796	if (gcgrp_xtraref)
5797		GCGRP_REFRELE(gcgrp);
5798	if (ire == NULL) {
5799		if (gc != NULL)
5800			GC_REFRELE(gc);
5801		if (ipif != NULL)
5802			ipif_refrele(ipif);
5803		ire_refrele(gw_ire);
5804		return (ENOMEM);
5805	}
5806
5807	/* Before we add, check if an extra CGTP broadcast is needed */
5808	cgtp_broadcast = ((flags & RTF_MULTIRT) &&
5809	    ip_type_v4(ire->ire_addr, ipst) == IRE_BROADCAST);
5810
5811	/* src address assigned by the caller? */
5812	if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC))
5813		ire->ire_setsrc_addr = src_addr;
5814
5815	ire->ire_unbound = unbound;
5816
5817	/*
5818	 * POLICY: should we allow an RTF_HOST with address INADDR_ANY?
5819	 * SUN/OS socket stuff does but do we really want to allow 0.0.0.0?
5820	 */
5821
5822	/* Add the new IRE. */
5823	nire = ire_add(ire);
5824	if (nire == NULL) {
5825		/*
5826		 * In the case of failure, ire_add() will have
5827		 * already deleted the ire in question, so there
5828		 * is no need to do that here.
5829		 */
5830		if (ipif != NULL)
5831			ipif_refrele(ipif);
5832		ire_refrele(gw_ire);
5833		return (ENOMEM);
5834	}
5835	/*
5836	 * Check if it was a duplicate entry. This handles
5837	 * the case of two racing route adds for the same route.
5838	 */
5839	if (nire != ire) {
5840		ire_delete(nire);
5841		ire_refrele(nire);
5842		if (ipif != NULL)
5843			ipif_refrele(ipif);
5844		ire_refrele(gw_ire);
5845		return (EEXIST);
5846	}
5847	ire = nire;
5848
5849	if (flags & RTF_MULTIRT) {
5850		/*
5851		 * Invoke the CGTP (multirouting) filtering module
5852		 * to add the dst address in the filtering database.
5853		 * Replicated inbound packets coming from that address
5854		 * will be filtered to discard the duplicates.
5855		 * It is not necessary to call the CGTP filter hook
5856		 * when the dst address is a broadcast or multicast,
5857		 * because an IP source address cannot be a broadcast
5858		 * or a multicast.
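		 *
		 * (Purely for illustration: a CGTP configuration typically
		 * installs the same destination twice, once through each
		 * redundant gateway, e.g. via two RTM_ADD requests that both
		 * carry RTF_MULTIRT; the filter hooked in below then discards
		 * the duplicate copies arriving from that source.)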
5859 */ 5860 if (cgtp_broadcast) { 5861 ip_cgtp_bcast_add(ire, ipst); 5862 goto save_ire; 5863 } 5864 if (ipst->ips_ip_cgtp_filter_ops != NULL && 5865 !CLASSD(ire->ire_addr)) { 5866 int res; 5867 ipif_t *src_ipif; 5868 5869 /* Find the source address corresponding to gw_ire */ 5870 src_ipif = ipif_lookup_addr(gw_ire->ire_gateway_addr, 5871 NULL, zoneid, ipst); 5872 if (src_ipif != NULL) { 5873 res = ipst->ips_ip_cgtp_filter_ops-> 5874 cfo_add_dest_v4( 5875 ipst->ips_netstack->netstack_stackid, 5876 ire->ire_addr, 5877 ire->ire_gateway_addr, 5878 ire->ire_setsrc_addr, 5879 src_ipif->ipif_lcl_addr); 5880 ipif_refrele(src_ipif); 5881 } else { 5882 res = EADDRNOTAVAIL; 5883 } 5884 if (res != 0) { 5885 if (ipif != NULL) 5886 ipif_refrele(ipif); 5887 ire_refrele(gw_ire); 5888 ire_delete(ire); 5889 ire_refrele(ire); /* Held in ire_add */ 5890 return (res); 5891 } 5892 } 5893 } 5894 5895 save_ire: 5896 if (gw_ire != NULL) { 5897 ire_refrele(gw_ire); 5898 gw_ire = NULL; 5899 } 5900 if (ill != NULL) { 5901 /* 5902 * Save enough information so that we can recreate the IRE if 5903 * the interface goes down and then up. The metrics associated 5904 * with the route will be saved as well when rts_setmetrics() is 5905 * called after the IRE has been created. In the case where 5906 * memory cannot be allocated, none of this information will be 5907 * saved. 5908 */ 5909 ill_save_ire(ill, ire); 5910 } 5911 if (ioctl_msg) 5912 ip_rts_rtmsg(RTM_OLDADD, ire, 0, ipst); 5913 if (ire_arg != NULL) { 5914 /* 5915 * Store the ire that was successfully added into where ire_arg 5916 * points to so that callers don't have to look it up 5917 * themselves (but they are responsible for ire_refrele()ing 5918 * the ire when they are finished with it). 5919 */ 5920 *ire_arg = ire; 5921 } else { 5922 ire_refrele(ire); /* Held in ire_add */ 5923 } 5924 if (ipif != NULL) 5925 ipif_refrele(ipif); 5926 return (0); 5927 } 5928 5929 /* 5930 * ip_rt_delete is called to delete an IPv4 route. 5931 * ill is passed in to associate it with the correct interface. 5932 */ 5933 /* ARGSUSED4 */ 5934 int 5935 ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, 5936 uint_t rtm_addrs, int flags, ill_t *ill, boolean_t ioctl_msg, 5937 ip_stack_t *ipst, zoneid_t zoneid) 5938 { 5939 ire_t *ire = NULL; 5940 ipif_t *ipif; 5941 uint_t type; 5942 uint_t match_flags = MATCH_IRE_TYPE; 5943 int err = 0; 5944 5945 ip1dbg(("ip_rt_delete:")); 5946 /* 5947 * If this is the case of RTF_HOST being set, then we set the netmask 5948 * to all ones. Otherwise, we use the netmask if one was supplied. 5949 */ 5950 if (flags & RTF_HOST) { 5951 mask = IP_HOST_MASK; 5952 match_flags |= MATCH_IRE_MASK; 5953 } else if (rtm_addrs & RTA_NETMASK) { 5954 match_flags |= MATCH_IRE_MASK; 5955 } 5956 5957 /* 5958 * Note that RTF_GATEWAY is never set on a delete, therefore 5959 * we check if the gateway address is one of our interfaces first, 5960 * and fall back on RTF_GATEWAY routes. 5961 * 5962 * This makes it possible to delete an original 5963 * IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1. 5964 * However, we have RTF_KERNEL set on the ones created by ipif_up 5965 * and those can not be deleted here. 5966 * 5967 * We use MATCH_IRE_ILL if we know the interface. If the caller 5968 * specified an interface (from the RTA_IFP sockaddr) we use it, 5969 * otherwise we use the ill derived from the gateway address. 5970 * We can always match the gateway address since we record it 5971 * in ire_gateway_addr. 
5972 * 5973 * For more detail on specifying routes by gateway address and by 5974 * interface index, see the comments in ip_rt_add(). 5975 */ 5976 ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst); 5977 if (ipif != NULL) { 5978 ill_t *ill_match; 5979 5980 if (ill != NULL) 5981 ill_match = ill; 5982 else 5983 ill_match = ipif->ipif_ill; 5984 5985 match_flags |= MATCH_IRE_ILL; 5986 if (ipif->ipif_ire_type == IRE_LOOPBACK) { 5987 ire = ire_ftable_lookup_v4(dst_addr, mask, 0, 5988 IRE_LOOPBACK, ill_match, ALL_ZONES, NULL, 5989 match_flags, 0, ipst, NULL); 5990 } 5991 if (ire == NULL) { 5992 match_flags |= MATCH_IRE_GW; 5993 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, 5994 IRE_INTERFACE, ill_match, ALL_ZONES, NULL, 5995 match_flags, 0, ipst, NULL); 5996 } 5997 /* Avoid deleting routes created by kernel from an ipif */ 5998 if (ire != NULL && (ire->ire_flags & RTF_KERNEL)) { 5999 ire_refrele(ire); 6000 ire = NULL; 6001 } 6002 6003 /* Restore in case we didn't find a match */ 6004 match_flags &= ~(MATCH_IRE_GW|MATCH_IRE_ILL); 6005 } 6006 6007 if (ire == NULL) { 6008 /* 6009 * At this point, the gateway address is not one of our own 6010 * addresses or a matching interface route was not found. We 6011 * set the IRE type to lookup based on whether 6012 * this is a host route, a default route or just a prefix. 6013 * 6014 * If an ill was passed in, then the lookup is based on an 6015 * interface index so MATCH_IRE_ILL is added to match_flags. 6016 */ 6017 match_flags |= MATCH_IRE_GW; 6018 if (ill != NULL) 6019 match_flags |= MATCH_IRE_ILL; 6020 if (mask == IP_HOST_MASK) 6021 type = IRE_HOST; 6022 else if (mask == 0) 6023 type = IRE_DEFAULT; 6024 else 6025 type = IRE_PREFIX; 6026 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill, 6027 ALL_ZONES, NULL, match_flags, 0, ipst, NULL); 6028 } 6029 6030 if (ipif != NULL) { 6031 ipif_refrele(ipif); 6032 ipif = NULL; 6033 } 6034 6035 if (ire == NULL) 6036 return (ESRCH); 6037 6038 if (ire->ire_flags & RTF_MULTIRT) { 6039 /* 6040 * Invoke the CGTP (multirouting) filtering module 6041 * to remove the dst address from the filtering database. 6042 * Packets coming from that address will no longer be 6043 * filtered to remove duplicates. 6044 */ 6045 if (ipst->ips_ip_cgtp_filter_ops != NULL) { 6046 err = ipst->ips_ip_cgtp_filter_ops->cfo_del_dest_v4( 6047 ipst->ips_netstack->netstack_stackid, 6048 ire->ire_addr, ire->ire_gateway_addr); 6049 } 6050 ip_cgtp_bcast_delete(ire, ipst); 6051 } 6052 6053 ill = ire->ire_ill; 6054 if (ill != NULL) 6055 ill_remove_saved_ire(ill, ire); 6056 if (ioctl_msg) 6057 ip_rts_rtmsg(RTM_OLDDEL, ire, 0, ipst); 6058 ire_delete(ire); 6059 ire_refrele(ire); 6060 return (err); 6061 } 6062 6063 /* 6064 * ip_siocaddrt is called to complete processing of an SIOCADDRT IOCTL. 
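 *
 * (Illustrative user-level usage, not part of the original source; `s'
 * is a hypothetical AF_INET socket and dst/gw are placeholder
 * addresses:
 *
 *	struct rtentry rt;
 *
 *	bzero(&rt, sizeof (rt));
 *	((struct sockaddr_in *)&rt.rt_dst)->sin_family = AF_INET;
 *	((struct sockaddr_in *)&rt.rt_dst)->sin_addr.s_addr = dst;
 *	((struct sockaddr_in *)&rt.rt_gateway)->sin_family = AF_INET;
 *	((struct sockaddr_in *)&rt.rt_gateway)->sin_addr.s_addr = gw;
 *	rt.rt_flags = RTF_UP | RTF_GATEWAY;
 *	(void) ioctl(s, SIOCADDRT, &rt);
 *
 * No netmask is passed in; as the code below shows, it is taken to be
 * all ones for RTF_HOST and is otherwise derived via ip_subnet_mask().)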
6065 */ 6066 /* ARGSUSED */ 6067 int 6068 ip_siocaddrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 6069 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 6070 { 6071 ipaddr_t dst_addr; 6072 ipaddr_t gw_addr; 6073 ipaddr_t mask; 6074 int error = 0; 6075 mblk_t *mp1; 6076 struct rtentry *rt; 6077 ipif_t *ipif = NULL; 6078 ip_stack_t *ipst; 6079 6080 ASSERT(q->q_next == NULL); 6081 ipst = CONNQ_TO_IPST(q); 6082 6083 ip1dbg(("ip_siocaddrt:")); 6084 /* Existence of mp1 verified in ip_wput_nondata */ 6085 mp1 = mp->b_cont->b_cont; 6086 rt = (struct rtentry *)mp1->b_rptr; 6087 6088 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 6089 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 6090 6091 /* 6092 * If the RTF_HOST flag is on, this is a request to assign a gateway 6093 * to a particular host address. In this case, we set the netmask to 6094 * all ones for the particular destination address. Otherwise, 6095 * determine the netmask to be used based on dst_addr and the interfaces 6096 * in use. 6097 */ 6098 if (rt->rt_flags & RTF_HOST) { 6099 mask = IP_HOST_MASK; 6100 } else { 6101 /* 6102 * Note that ip_subnet_mask returns a zero mask in the case of 6103 * default (an all-zeroes address). 6104 */ 6105 mask = ip_subnet_mask(dst_addr, &ipif, ipst); 6106 } 6107 6108 error = ip_rt_add(dst_addr, mask, gw_addr, 0, rt->rt_flags, NULL, NULL, 6109 B_TRUE, NULL, ipst, ALL_ZONES); 6110 if (ipif != NULL) 6111 ipif_refrele(ipif); 6112 return (error); 6113 } 6114 6115 /* 6116 * ip_siocdelrt is called to complete processing of an SIOCDELRT IOCTL. 6117 */ 6118 /* ARGSUSED */ 6119 int 6120 ip_siocdelrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 6121 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 6122 { 6123 ipaddr_t dst_addr; 6124 ipaddr_t gw_addr; 6125 ipaddr_t mask; 6126 int error; 6127 mblk_t *mp1; 6128 struct rtentry *rt; 6129 ipif_t *ipif = NULL; 6130 ip_stack_t *ipst; 6131 6132 ASSERT(q->q_next == NULL); 6133 ipst = CONNQ_TO_IPST(q); 6134 6135 ip1dbg(("ip_siocdelrt:")); 6136 /* Existence of mp1 verified in ip_wput_nondata */ 6137 mp1 = mp->b_cont->b_cont; 6138 rt = (struct rtentry *)mp1->b_rptr; 6139 6140 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 6141 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 6142 6143 /* 6144 * If the RTF_HOST flag is on, this is a request to delete a gateway 6145 * to a particular host address. In this case, we set the netmask to 6146 * all ones for the particular destination address. Otherwise, 6147 * determine the netmask to be used based on dst_addr and the interfaces 6148 * in use. 6149 */ 6150 if (rt->rt_flags & RTF_HOST) { 6151 mask = IP_HOST_MASK; 6152 } else { 6153 /* 6154 * Note that ip_subnet_mask returns a zero mask in the case of 6155 * default (an all-zeroes address). 6156 */ 6157 mask = ip_subnet_mask(dst_addr, &ipif, ipst); 6158 } 6159 6160 error = ip_rt_delete(dst_addr, mask, gw_addr, 6161 RTA_DST | RTA_GATEWAY | RTA_NETMASK, rt->rt_flags, NULL, B_TRUE, 6162 ipst, ALL_ZONES); 6163 if (ipif != NULL) 6164 ipif_refrele(ipif); 6165 return (error); 6166 } 6167 6168 /* 6169 * Enqueue the mp onto the ipsq, chained by b_next. 6170 * b_prev stores the function to be executed later, and b_queue the queue 6171 * where this mp originated. 
6172 */ 6173 void 6174 ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, 6175 ill_t *pending_ill) 6176 { 6177 conn_t *connp; 6178 ipxop_t *ipx = ipsq->ipsq_xop; 6179 6180 ASSERT(MUTEX_HELD(&ipsq->ipsq_lock)); 6181 ASSERT(MUTEX_HELD(&ipx->ipx_lock)); 6182 ASSERT(func != NULL); 6183 6184 mp->b_queue = q; 6185 mp->b_prev = (void *)func; 6186 mp->b_next = NULL; 6187 6188 switch (type) { 6189 case CUR_OP: 6190 if (ipx->ipx_mptail != NULL) { 6191 ASSERT(ipx->ipx_mphead != NULL); 6192 ipx->ipx_mptail->b_next = mp; 6193 } else { 6194 ASSERT(ipx->ipx_mphead == NULL); 6195 ipx->ipx_mphead = mp; 6196 } 6197 ipx->ipx_mptail = mp; 6198 break; 6199 6200 case NEW_OP: 6201 if (ipsq->ipsq_xopq_mptail != NULL) { 6202 ASSERT(ipsq->ipsq_xopq_mphead != NULL); 6203 ipsq->ipsq_xopq_mptail->b_next = mp; 6204 } else { 6205 ASSERT(ipsq->ipsq_xopq_mphead == NULL); 6206 ipsq->ipsq_xopq_mphead = mp; 6207 } 6208 ipsq->ipsq_xopq_mptail = mp; 6209 ipx->ipx_ipsq_queued = B_TRUE; 6210 break; 6211 6212 case SWITCH_OP: 6213 ASSERT(ipsq->ipsq_swxop != NULL); 6214 /* only one switch operation is currently allowed */ 6215 ASSERT(ipsq->ipsq_switch_mp == NULL); 6216 ipsq->ipsq_switch_mp = mp; 6217 ipx->ipx_ipsq_queued = B_TRUE; 6218 break; 6219 default: 6220 cmn_err(CE_PANIC, "ipsq_enq %d type \n", type); 6221 } 6222 6223 if (CONN_Q(q) && pending_ill != NULL) { 6224 connp = Q_TO_CONN(q); 6225 ASSERT(MUTEX_HELD(&connp->conn_lock)); 6226 connp->conn_oper_pending_ill = pending_ill; 6227 } 6228 } 6229 6230 /* 6231 * Dequeue the next message that requested exclusive access to this IPSQ's 6232 * xop. Specifically: 6233 * 6234 * 1. If we're still processing the current operation on `ipsq', then 6235 * dequeue the next message for the operation (from ipx_mphead), or 6236 * return NULL if there are no queued messages for the operation. 6237 * These messages are queued via CUR_OP to qwriter_ip() and friends. 6238 * 6239 * 2. If the current operation on `ipsq' has completed (ipx_current_ipif is 6240 * not set) see if the ipsq has requested an xop switch. If so, switch 6241 * `ipsq' to a different xop. Xop switches only happen when joining or 6242 * leaving IPMP groups and require a careful dance -- see the comments 6243 * in-line below for details. If we're leaving a group xop or if we're 6244 * joining a group xop and become writer on it, then we proceed to (3). 6245 * Otherwise, we return NULL and exit the xop. 6246 * 6247 * 3. For each IPSQ in the xop, return any switch operation stored on 6248 * ipsq_switch_mp (set via SWITCH_OP); these must be processed before 6249 * any other messages queued on the IPSQ. Otherwise, dequeue the next 6250 * exclusive operation (queued via NEW_OP) stored on ipsq_xopq_mphead. 6251 * Note that if the phyint tied to `ipsq' is not using IPMP there will 6252 * only be one IPSQ in the xop. Otherwise, there will be one IPSQ for 6253 * each phyint in the group, including the IPMP meta-interface phyint. 6254 */ 6255 static mblk_t * 6256 ipsq_dq(ipsq_t *ipsq) 6257 { 6258 ill_t *illv4, *illv6; 6259 mblk_t *mp; 6260 ipsq_t *xopipsq; 6261 ipsq_t *leftipsq = NULL; 6262 ipxop_t *ipx; 6263 phyint_t *phyi = ipsq->ipsq_phyint; 6264 ip_stack_t *ipst = ipsq->ipsq_ipst; 6265 boolean_t emptied = B_FALSE; 6266 6267 /* 6268 * Grab all the locks we need in the defined order (ill_g_lock -> 6269 * ipsq_lock -> ipx_lock); ill_g_lock is needed to use ipsq_next. 6270 */ 6271 rw_enter(&ipst->ips_ill_g_lock, 6272 ipsq->ipsq_swxop != NULL ? 
RW_WRITER : RW_READER); 6273 mutex_enter(&ipsq->ipsq_lock); 6274 ipx = ipsq->ipsq_xop; 6275 mutex_enter(&ipx->ipx_lock); 6276 6277 /* 6278 * Dequeue the next message associated with the current exclusive 6279 * operation, if any. 6280 */ 6281 if ((mp = ipx->ipx_mphead) != NULL) { 6282 ipx->ipx_mphead = mp->b_next; 6283 if (ipx->ipx_mphead == NULL) 6284 ipx->ipx_mptail = NULL; 6285 mp->b_next = (void *)ipsq; 6286 goto out; 6287 } 6288 6289 if (ipx->ipx_current_ipif != NULL) 6290 goto empty; 6291 6292 if (ipsq->ipsq_swxop != NULL) { 6293 /* 6294 * The exclusive operation that is now being completed has 6295 * requested a switch to a different xop. This happens 6296 * when an interface joins or leaves an IPMP group. Joins 6297 * happen through SIOCSLIFGROUPNAME (ip_sioctl_groupname()). 6298 * Leaves happen via SIOCSLIFGROUPNAME, interface unplumb 6299 * (phyint_free()), or interface plumb for an ill type 6300 * not in the IPMP group (ip_rput_dlpi_writer()). 6301 * 6302 * Xop switches are not allowed on the IPMP meta-interface. 6303 */ 6304 ASSERT(phyi == NULL || !(phyi->phyint_flags & PHYI_IPMP)); 6305 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 6306 DTRACE_PROBE1(ipsq__switch, (ipsq_t *), ipsq); 6307 6308 if (ipsq->ipsq_swxop == &ipsq->ipsq_ownxop) { 6309 /* 6310 * We're switching back to our own xop, so we have two 6311 * xop's to drain/exit: our own, and the group xop 6312 * that we are leaving. 6313 * 6314 * First, pull ourselves out of the group ipsq list. 6315 * This is safe since we're writer on ill_g_lock. 6316 */ 6317 ASSERT(ipsq->ipsq_xop != &ipsq->ipsq_ownxop); 6318 6319 xopipsq = ipx->ipx_ipsq; 6320 while (xopipsq->ipsq_next != ipsq) 6321 xopipsq = xopipsq->ipsq_next; 6322 6323 xopipsq->ipsq_next = ipsq->ipsq_next; 6324 ipsq->ipsq_next = ipsq; 6325 ipsq->ipsq_xop = ipsq->ipsq_swxop; 6326 ipsq->ipsq_swxop = NULL; 6327 6328 /* 6329 * Second, prepare to exit the group xop. The actual 6330 * ipsq_exit() is done at the end of this function 6331 * since we cannot hold any locks across ipsq_exit(). 6332 * Note that although we drop the group's ipx_lock, no 6333 * threads can proceed since we're still ipx_writer. 6334 */ 6335 leftipsq = xopipsq; 6336 mutex_exit(&ipx->ipx_lock); 6337 6338 /* 6339 * Third, set ipx to point to our own xop (which was 6340 * inactive and therefore can be entered). 6341 */ 6342 ipx = ipsq->ipsq_xop; 6343 mutex_enter(&ipx->ipx_lock); 6344 ASSERT(ipx->ipx_writer == NULL); 6345 ASSERT(ipx->ipx_current_ipif == NULL); 6346 } else { 6347 /* 6348 * We're switching from our own xop to a group xop. 6349 * The requestor of the switch must ensure that the 6350 * group xop cannot go away (e.g. by ensuring the 6351 * phyint associated with the xop cannot go away). 6352 * 6353 * If we can become writer on our new xop, then we'll 6354 * do the drain. Otherwise, the current writer of our 6355 * new xop will do the drain when it exits. 6356 * 6357 * First, splice ourselves into the group IPSQ list. 6358 * This is safe since we're writer on ill_g_lock. 6359 */ 6360 ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop); 6361 6362 xopipsq = ipsq->ipsq_swxop->ipx_ipsq; 6363 while (xopipsq->ipsq_next != ipsq->ipsq_swxop->ipx_ipsq) 6364 xopipsq = xopipsq->ipsq_next; 6365 6366 xopipsq->ipsq_next = ipsq; 6367 ipsq->ipsq_next = ipsq->ipsq_swxop->ipx_ipsq; 6368 ipsq->ipsq_xop = ipsq->ipsq_swxop; 6369 ipsq->ipsq_swxop = NULL; 6370 6371 /* 6372 * Second, exit our own xop, since it's now unused. 6373 * This is safe since we've got the only reference. 
6374 */ 6375 ASSERT(ipx->ipx_writer == curthread); 6376 ipx->ipx_writer = NULL; 6377 VERIFY(--ipx->ipx_reentry_cnt == 0); 6378 ipx->ipx_ipsq_queued = B_FALSE; 6379 mutex_exit(&ipx->ipx_lock); 6380 6381 /* 6382 * Third, set ipx to point to our new xop, and check 6383 * if we can become writer on it. If we cannot, then 6384 * the current writer will drain the IPSQ group when 6385 * it exits. Our ipsq_xop is guaranteed to be stable 6386 * because we're still holding ipsq_lock. 6387 */ 6388 ipx = ipsq->ipsq_xop; 6389 mutex_enter(&ipx->ipx_lock); 6390 if (ipx->ipx_writer != NULL || 6391 ipx->ipx_current_ipif != NULL) { 6392 goto out; 6393 } 6394 } 6395 6396 /* 6397 * Fourth, become writer on our new ipx before we continue 6398 * with the drain. Note that we never dropped ipsq_lock 6399 * above, so no other thread could've raced with us to 6400 * become writer first. Also, we're holding ipx_lock, so 6401 * no other thread can examine the ipx right now. 6402 */ 6403 ASSERT(ipx->ipx_current_ipif == NULL); 6404 ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL); 6405 VERIFY(ipx->ipx_reentry_cnt++ == 0); 6406 ipx->ipx_writer = curthread; 6407 ipx->ipx_forced = B_FALSE; 6408 #ifdef DEBUG 6409 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); 6410 #endif 6411 } 6412 6413 xopipsq = ipsq; 6414 do { 6415 /* 6416 * So that other operations operate on a consistent and 6417 * complete phyint, a switch message on an IPSQ must be 6418 * handled prior to any other operations on that IPSQ. 6419 */ 6420 if ((mp = xopipsq->ipsq_switch_mp) != NULL) { 6421 xopipsq->ipsq_switch_mp = NULL; 6422 ASSERT(mp->b_next == NULL); 6423 mp->b_next = (void *)xopipsq; 6424 goto out; 6425 } 6426 6427 if ((mp = xopipsq->ipsq_xopq_mphead) != NULL) { 6428 xopipsq->ipsq_xopq_mphead = mp->b_next; 6429 if (xopipsq->ipsq_xopq_mphead == NULL) 6430 xopipsq->ipsq_xopq_mptail = NULL; 6431 mp->b_next = (void *)xopipsq; 6432 goto out; 6433 } 6434 } while ((xopipsq = xopipsq->ipsq_next) != ipsq); 6435 empty: 6436 /* 6437 * There are no messages. Further, we are holding ipx_lock, hence no 6438 * new messages can end up on any IPSQ in the xop. 6439 */ 6440 ipx->ipx_writer = NULL; 6441 ipx->ipx_forced = B_FALSE; 6442 VERIFY(--ipx->ipx_reentry_cnt == 0); 6443 ipx->ipx_ipsq_queued = B_FALSE; 6444 emptied = B_TRUE; 6445 #ifdef DEBUG 6446 ipx->ipx_depth = 0; 6447 #endif 6448 out: 6449 mutex_exit(&ipx->ipx_lock); 6450 mutex_exit(&ipsq->ipsq_lock); 6451 6452 /* 6453 * If we completely emptied the xop, then wake up any threads waiting 6454 * to enter any of the IPSQ's associated with it. 6455 */ 6456 if (emptied) { 6457 xopipsq = ipsq; 6458 do { 6459 if ((phyi = xopipsq->ipsq_phyint) == NULL) 6460 continue; 6461 6462 illv4 = phyi->phyint_illv4; 6463 illv6 = phyi->phyint_illv6; 6464 6465 GRAB_ILL_LOCKS(illv4, illv6); 6466 if (illv4 != NULL) 6467 cv_broadcast(&illv4->ill_cv); 6468 if (illv6 != NULL) 6469 cv_broadcast(&illv6->ill_cv); 6470 RELEASE_ILL_LOCKS(illv4, illv6); 6471 } while ((xopipsq = xopipsq->ipsq_next) != ipsq); 6472 } 6473 rw_exit(&ipst->ips_ill_g_lock); 6474 6475 /* 6476 * Now that all locks are dropped, exit the IPSQ we left. 6477 */ 6478 if (leftipsq != NULL) 6479 ipsq_exit(leftipsq); 6480 6481 return (mp); 6482 } 6483 6484 /* 6485 * Return completion status of previously initiated DLPI operations on 6486 * ills in the purview of an ipsq. 
6487 */ 6488 static boolean_t 6489 ipsq_dlpi_done(ipsq_t *ipsq) 6490 { 6491 ipsq_t *ipsq_start; 6492 phyint_t *phyi; 6493 ill_t *ill; 6494 6495 ASSERT(RW_LOCK_HELD(&ipsq->ipsq_ipst->ips_ill_g_lock)); 6496 ipsq_start = ipsq; 6497 6498 do { 6499 /* 6500 * The only current users of this function are ipsq_try_enter 6501 * and ipsq_enter which have made sure that ipsq_writer is 6502 * NULL before we reach here. ill_dlpi_pending is modified 6503 * only by an ipsq writer 6504 */ 6505 ASSERT(ipsq->ipsq_xop->ipx_writer == NULL); 6506 phyi = ipsq->ipsq_phyint; 6507 /* 6508 * phyi could be NULL if a phyint that is part of an 6509 * IPMP group is being unplumbed. A more detailed 6510 * comment is in ipmp_grp_update_kstats() 6511 */ 6512 if (phyi != NULL) { 6513 ill = phyi->phyint_illv4; 6514 if (ill != NULL && 6515 (ill->ill_dlpi_pending != DL_PRIM_INVAL || 6516 ill->ill_arl_dlpi_pending)) 6517 return (B_FALSE); 6518 6519 ill = phyi->phyint_illv6; 6520 if (ill != NULL && 6521 ill->ill_dlpi_pending != DL_PRIM_INVAL) 6522 return (B_FALSE); 6523 } 6524 6525 } while ((ipsq = ipsq->ipsq_next) != ipsq_start); 6526 6527 return (B_TRUE); 6528 } 6529 6530 /* 6531 * Enter the ipsq corresponding to ill, by waiting synchronously till 6532 * we can enter the ipsq exclusively. Unless 'force' is used, the ipsq 6533 * will have to drain completely before ipsq_enter returns success. 6534 * ipx_current_ipif will be set if some exclusive op is in progress, 6535 * and the ipsq_exit logic will start the next enqueued op after 6536 * completion of the current op. If 'force' is used, we don't wait 6537 * for the enqueued ops. This is needed when a conn_close wants to 6538 * enter the ipsq and abort an ioctl that is somehow stuck. Unplumb 6539 * of an ill can also use this option. But we dont' use it currently. 6540 */ 6541 #define ENTER_SQ_WAIT_TICKS 100 6542 boolean_t 6543 ipsq_enter(ill_t *ill, boolean_t force, int type) 6544 { 6545 ipsq_t *ipsq; 6546 ipxop_t *ipx; 6547 boolean_t waited_enough = B_FALSE; 6548 ip_stack_t *ipst = ill->ill_ipst; 6549 6550 /* 6551 * Note that the relationship between ill and ipsq is fixed as long as 6552 * the ill is not ILL_CONDEMNED. Holding ipsq_lock ensures the 6553 * relationship between the IPSQ and xop cannot change. However, 6554 * since we cannot hold ipsq_lock across the cv_wait(), it may change 6555 * while we're waiting. We wait on ill_cv and rely on ipsq_exit() 6556 * waking up all ills in the xop when it becomes available. 
6557 */ 6558 for (;;) { 6559 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 6560 mutex_enter(&ill->ill_lock); 6561 if (ill->ill_state_flags & ILL_CONDEMNED) { 6562 mutex_exit(&ill->ill_lock); 6563 rw_exit(&ipst->ips_ill_g_lock); 6564 return (B_FALSE); 6565 } 6566 6567 ipsq = ill->ill_phyint->phyint_ipsq; 6568 mutex_enter(&ipsq->ipsq_lock); 6569 ipx = ipsq->ipsq_xop; 6570 mutex_enter(&ipx->ipx_lock); 6571 6572 if (ipx->ipx_writer == NULL && (type == CUR_OP || 6573 (ipx->ipx_current_ipif == NULL && ipsq_dlpi_done(ipsq)) || 6574 waited_enough)) 6575 break; 6576 6577 rw_exit(&ipst->ips_ill_g_lock); 6578 6579 if (!force || ipx->ipx_writer != NULL) { 6580 mutex_exit(&ipx->ipx_lock); 6581 mutex_exit(&ipsq->ipsq_lock); 6582 cv_wait(&ill->ill_cv, &ill->ill_lock); 6583 } else { 6584 mutex_exit(&ipx->ipx_lock); 6585 mutex_exit(&ipsq->ipsq_lock); 6586 (void) cv_reltimedwait(&ill->ill_cv, 6587 &ill->ill_lock, ENTER_SQ_WAIT_TICKS, TR_CLOCK_TICK); 6588 waited_enough = B_TRUE; 6589 } 6590 mutex_exit(&ill->ill_lock); 6591 } 6592 6593 ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL); 6594 ASSERT(ipx->ipx_reentry_cnt == 0); 6595 ipx->ipx_writer = curthread; 6596 ipx->ipx_forced = (ipx->ipx_current_ipif != NULL); 6597 ipx->ipx_reentry_cnt++; 6598 #ifdef DEBUG 6599 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); 6600 #endif 6601 mutex_exit(&ipx->ipx_lock); 6602 mutex_exit(&ipsq->ipsq_lock); 6603 mutex_exit(&ill->ill_lock); 6604 rw_exit(&ipst->ips_ill_g_lock); 6605 6606 return (B_TRUE); 6607 } 6608 6609 /* 6610 * ipif_set_values() has a constraint that it cannot drop the ips_ill_g_lock 6611 * across the call to the core interface ipsq_try_enter() and hence calls this 6612 * function directly. This is explained more fully in ipif_set_values(). 6613 * In order to support the above constraint, ipsq_try_enter is implemented as 6614 * a wrapper that grabs the ips_ill_g_lock and calls this function subsequently 6615 */ 6616 static ipsq_t * 6617 ipsq_try_enter_internal(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, 6618 int type, boolean_t reentry_ok) 6619 { 6620 ipsq_t *ipsq; 6621 ipxop_t *ipx; 6622 ip_stack_t *ipst = ill->ill_ipst; 6623 6624 /* 6625 * lock ordering: 6626 * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock. 6627 * 6628 * ipx of an ipsq can't change when ipsq_lock is held. 6629 */ 6630 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 6631 GRAB_CONN_LOCK(q); 6632 mutex_enter(&ill->ill_lock); 6633 ipsq = ill->ill_phyint->phyint_ipsq; 6634 mutex_enter(&ipsq->ipsq_lock); 6635 ipx = ipsq->ipsq_xop; 6636 mutex_enter(&ipx->ipx_lock); 6637 6638 /* 6639 * 1. Enter the ipsq if we are already writer and reentry is ok. 6640 * (Note: If the caller does not specify reentry_ok then neither 6641 * 'func' nor any of its callees must ever attempt to enter the ipsq 6642 * again. Otherwise it can lead to an infinite loop 6643 * 2. Enter the ipsq if there is no current writer and this attempted 6644 * entry is part of the current operation 6645 * 3. Enter the ipsq if there is no current writer and this is a new 6646 * operation and the operation queue is empty and there is no 6647 * operation currently in progress and if all previously initiated 6648 * DLPI operations have completed. 6649 */ 6650 if ((ipx->ipx_writer == curthread && reentry_ok) || 6651 (ipx->ipx_writer == NULL && (type == CUR_OP || (type == NEW_OP && 6652 !ipx->ipx_ipsq_queued && ipx->ipx_current_ipif == NULL && 6653 ipsq_dlpi_done(ipsq))))) { 6654 /* Success. 
 */
6655		ipx->ipx_reentry_cnt++;
6656		ipx->ipx_writer = curthread;
6657		ipx->ipx_forced = B_FALSE;
6658		mutex_exit(&ipx->ipx_lock);
6659		mutex_exit(&ipsq->ipsq_lock);
6660		mutex_exit(&ill->ill_lock);
6661		RELEASE_CONN_LOCK(q);
6662 #ifdef DEBUG
6663		ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
6664 #endif
6665		return (ipsq);
6666	}
6667
6668	if (func != NULL)
6669		ipsq_enq(ipsq, q, mp, func, type, ill);
6670
6671	mutex_exit(&ipx->ipx_lock);
6672	mutex_exit(&ipsq->ipsq_lock);
6673	mutex_exit(&ill->ill_lock);
6674	RELEASE_CONN_LOCK(q);
6675	return (NULL);
6676 }
6677
6678 /*
6679  * The ipsq_t (ipsq) is the synchronization data structure used to serialize
6680  * certain critical operations like plumbing (i.e. most set ioctls), etc.
6681  * There is one ipsq per phyint. The ipsq
6682  * serializes exclusive ioctls issued by applications on a per ipsq basis in
6683  * ipsq_xopq_mphead. It also protects against multiple threads executing in
6684  * the ipsq. Responses from the driver pertain to the current ioctl (say a
6685  * DL_BIND_ACK in response to a DL_BIND_REQ initiated as part of bringing
6686  * up the interface) and are enqueued in ipx_mphead.
6687  *
6688  * If a thread does not want to reenter the ipsq when it is already writer,
6689  * it must make sure that neither the specified reentry point (to be called
6690  * later when the ipsq is empty) nor any code path starting from that
6691  * reentry point ever tries to enter the ipsq again. Otherwise it can lead
6692  * to an infinite loop. The reentry point ip_rput_dlpi_writer is an example.
6693  * When the thread that is currently exclusive finishes, it (ipsq_exit)
6694  * dequeues the requests waiting to become exclusive in ipx_mphead and calls
6695  * the reentry point. When the list at ipx_mphead becomes empty, ipsq_exit
6696  * proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next
6697  * ioctl if the current ioctl has completed. If the current ioctl is still
6698  * in progress it simply returns. The current ioctl could be waiting for
6699  * a response from another module (e.g. the driver), or could be waiting for
6700  * the ipif/ill/ire refcnts to drop to zero. In such a case the ipx_pending_mp
6701  * and ipx_pending_ipif are set. ipx_current_ipif is set throughout the
6702  * execution of the ioctl and ipsq_exit does not start the next ioctl unless
6703  * ipx_current_ipif is NULL, which happens only once the ioctl is complete and
6704  * all associated DLPI operations have completed.
6705  */
6706
6707 /*
6708  * Try to enter the IPSQ corresponding to `ipif' or `ill' exclusively (`ipif'
6709  * and `ill' cannot both be specified). Returns a pointer to the entered IPSQ
6710  * on success, or NULL on failure. The caller ensures ipif/ill is valid by
6711  * refholding it as necessary. If the IPSQ cannot be entered and `func' is
6712  * non-NULL, then `func' will be called back with `q' and `mp' once the IPSQ
6713  * can be entered. If `func' is NULL, then `q' and `mp' are ignored.
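 *
 * (An illustrative calling pattern, not part of the original source;
 * ip_process_ioctl stands in for whatever retry callback the caller
 * uses:
 *
 *	ipsq = ipsq_try_enter(ipif, NULL, q, mp, ip_process_ioctl,
 *	    NEW_OP, B_TRUE);
 *	if (ipsq == NULL)
 *		return;		... queued; redone later via `func' ...
 *	... perform the exclusive operation ...
 *	ipsq_exit(ipsq);
 *
 * When NULL is returned, the message has been queued and `func' will
 * rerun the operation once the IPSQ drains.)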
6714 */ 6715 ipsq_t * 6716 ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp, 6717 ipsq_func_t func, int type, boolean_t reentry_ok) 6718 { 6719 ip_stack_t *ipst; 6720 ipsq_t *ipsq; 6721 6722 /* Only 1 of ipif or ill can be specified */ 6723 ASSERT((ipif != NULL) ^ (ill != NULL)); 6724 6725 if (ipif != NULL) 6726 ill = ipif->ipif_ill; 6727 ipst = ill->ill_ipst; 6728 6729 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 6730 ipsq = ipsq_try_enter_internal(ill, q, mp, func, type, reentry_ok); 6731 rw_exit(&ipst->ips_ill_g_lock); 6732 6733 return (ipsq); 6734 } 6735 6736 /* 6737 * Try to enter the IPSQ corresponding to `ill' as writer. The caller ensures 6738 * ill is valid by refholding it if necessary; we will refrele. If the IPSQ 6739 * cannot be entered, the mp is queued for completion. 6740 */ 6741 void 6742 qwriter_ip(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, 6743 boolean_t reentry_ok) 6744 { 6745 ipsq_t *ipsq; 6746 6747 ipsq = ipsq_try_enter(NULL, ill, q, mp, func, type, reentry_ok); 6748 6749 /* 6750 * Drop the caller's refhold on the ill. This is safe since we either 6751 * entered the IPSQ (and thus are exclusive), or failed to enter the 6752 * IPSQ, in which case we return without accessing ill anymore. This 6753 * is needed because func needs to see the correct refcount. 6754 * e.g. removeif can work only then. 6755 */ 6756 ill_refrele(ill); 6757 if (ipsq != NULL) { 6758 (*func)(ipsq, q, mp, NULL); 6759 ipsq_exit(ipsq); 6760 } 6761 } 6762 6763 /* 6764 * Exit the specified IPSQ. If this is the final exit on it then drain it 6765 * prior to exiting. Caller must be writer on the specified IPSQ. 6766 */ 6767 void 6768 ipsq_exit(ipsq_t *ipsq) 6769 { 6770 mblk_t *mp; 6771 ipsq_t *mp_ipsq; 6772 queue_t *q; 6773 phyint_t *phyi; 6774 ipsq_func_t func; 6775 6776 ASSERT(IAM_WRITER_IPSQ(ipsq)); 6777 6778 ASSERT(ipsq->ipsq_xop->ipx_reentry_cnt >= 1); 6779 if (ipsq->ipsq_xop->ipx_reentry_cnt != 1) { 6780 ipsq->ipsq_xop->ipx_reentry_cnt--; 6781 return; 6782 } 6783 6784 for (;;) { 6785 phyi = ipsq->ipsq_phyint; 6786 mp = ipsq_dq(ipsq); 6787 mp_ipsq = (mp == NULL) ? NULL : (ipsq_t *)mp->b_next; 6788 6789 /* 6790 * If we've changed to a new IPSQ, and the phyint associated 6791 * with the old one has gone away, free the old IPSQ. Note 6792 * that this cannot happen while the IPSQ is in a group. 6793 */ 6794 if (mp_ipsq != ipsq && phyi == NULL) { 6795 ASSERT(ipsq->ipsq_next == ipsq); 6796 ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop); 6797 ipsq_delete(ipsq); 6798 } 6799 6800 if (mp == NULL) 6801 break; 6802 6803 q = mp->b_queue; 6804 func = (ipsq_func_t)mp->b_prev; 6805 ipsq = mp_ipsq; 6806 mp->b_next = mp->b_prev = NULL; 6807 mp->b_queue = NULL; 6808 6809 /* 6810 * If 'q' is an conn queue, it is valid, since we did a 6811 * a refhold on the conn at the start of the ioctl. 6812 * If 'q' is an ill queue, it is valid, since close of an 6813 * ill will clean up its IPSQ. 6814 */ 6815 (*func)(ipsq, q, mp, NULL); 6816 } 6817 } 6818 6819 /* 6820 * Used to start any igmp or mld timers that could not be started 6821 * while holding ill_mcast_lock. The timers can't be started while holding 6822 * the lock, since mld/igmp_start_timers may need to call untimeout() 6823 * which can't be done while holding the lock which the timeout handler 6824 * acquires. Otherwise 6825 * there could be a deadlock since the timeout handlers 6826 * mld_timeout_handler_per_ill/igmp_timeout_handler_per_ill also acquire 6827 * ill_mcast_lock. 
6828 */ 6829 void 6830 ill_mcast_timer_start(ip_stack_t *ipst) 6831 { 6832 int next; 6833 6834 mutex_enter(&ipst->ips_igmp_timer_lock); 6835 next = ipst->ips_igmp_deferred_next; 6836 ipst->ips_igmp_deferred_next = INFINITY; 6837 mutex_exit(&ipst->ips_igmp_timer_lock); 6838 6839 if (next != INFINITY) 6840 igmp_start_timers(next, ipst); 6841 6842 mutex_enter(&ipst->ips_mld_timer_lock); 6843 next = ipst->ips_mld_deferred_next; 6844 ipst->ips_mld_deferred_next = INFINITY; 6845 mutex_exit(&ipst->ips_mld_timer_lock); 6846 6847 if (next != INFINITY) 6848 mld_start_timers(next, ipst); 6849 } 6850 6851 /* 6852 * Start the current exclusive operation on `ipsq'; associate it with `ipif' 6853 * and `ioccmd'. 6854 */ 6855 void 6856 ipsq_current_start(ipsq_t *ipsq, ipif_t *ipif, int ioccmd) 6857 { 6858 ill_t *ill = ipif->ipif_ill; 6859 ipxop_t *ipx = ipsq->ipsq_xop; 6860 6861 ASSERT(IAM_WRITER_IPSQ(ipsq)); 6862 ASSERT(ipx->ipx_current_ipif == NULL); 6863 ASSERT(ipx->ipx_current_ioctl == 0); 6864 6865 ipx->ipx_current_done = B_FALSE; 6866 ipx->ipx_current_ioctl = ioccmd; 6867 mutex_enter(&ipx->ipx_lock); 6868 ipx->ipx_current_ipif = ipif; 6869 mutex_exit(&ipx->ipx_lock); 6870 6871 /* 6872 * Set IPIF_CHANGING on one or more ipifs associated with the 6873 * current exclusive operation. IPIF_CHANGING prevents any new 6874 * references to the ipif (so that the references will eventually 6875 * drop to zero) and also prevents any "get" operations (e.g., 6876 * SIOCGLIFFLAGS) from being able to access the ipif until the 6877 * operation has completed and the ipif is again in a stable state. 6878 * 6879 * For ioctls, IPIF_CHANGING is set on the ipif associated with the 6880 * ioctl. For internal operations (where ioccmd is zero), all ipifs 6881 * on the ill are marked with IPIF_CHANGING since it's unclear which 6882 * ipifs will be affected. 6883 * 6884 * Note that SIOCLIFREMOVEIF is a special case as it sets 6885 * IPIF_CONDEMNED internally after identifying the right ipif to 6886 * operate on. 6887 */ 6888 switch (ioccmd) { 6889 case SIOCLIFREMOVEIF: 6890 break; 6891 case 0: 6892 mutex_enter(&ill->ill_lock); 6893 ipif = ipif->ipif_ill->ill_ipif; 6894 for (; ipif != NULL; ipif = ipif->ipif_next) 6895 ipif->ipif_state_flags |= IPIF_CHANGING; 6896 mutex_exit(&ill->ill_lock); 6897 break; 6898 default: 6899 mutex_enter(&ill->ill_lock); 6900 ipif->ipif_state_flags |= IPIF_CHANGING; 6901 mutex_exit(&ill->ill_lock); 6902 } 6903 } 6904 6905 /* 6906 * Finish the current exclusive operation on `ipsq'. Usually, this will allow 6907 * the next exclusive operation to begin once we ipsq_exit(). However, if 6908 * pending DLPI operations remain, then we will wait for the queue to drain 6909 * before allowing the next exclusive operation to begin. This ensures that 6910 * DLPI operations from one exclusive operation are never improperly processed 6911 * as part of a subsequent exclusive operation. 6912 */ 6913 void 6914 ipsq_current_finish(ipsq_t *ipsq) 6915 { 6916 ipxop_t *ipx = ipsq->ipsq_xop; 6917 t_uscalar_t dlpi_pending = DL_PRIM_INVAL; 6918 ipif_t *ipif = ipx->ipx_current_ipif; 6919 6920 ASSERT(IAM_WRITER_IPSQ(ipsq)); 6921 6922 /* 6923 * For SIOCLIFREMOVEIF, the ipif has been already been blown away 6924 * (but in that case, IPIF_CHANGING will already be clear and no 6925 * pending DLPI messages can remain). 
6926 */ 6927 if (ipx->ipx_current_ioctl != SIOCLIFREMOVEIF) { 6928 ill_t *ill = ipif->ipif_ill; 6929 6930 mutex_enter(&ill->ill_lock); 6931 dlpi_pending = ill->ill_dlpi_pending; 6932 if (ipx->ipx_current_ioctl == 0) { 6933 ipif = ill->ill_ipif; 6934 for (; ipif != NULL; ipif = ipif->ipif_next) 6935 ipif->ipif_state_flags &= ~IPIF_CHANGING; 6936 } else { 6937 ipif->ipif_state_flags &= ~IPIF_CHANGING; 6938 } 6939 mutex_exit(&ill->ill_lock); 6940 } 6941 6942 ASSERT(!ipx->ipx_current_done); 6943 ipx->ipx_current_done = B_TRUE; 6944 ipx->ipx_current_ioctl = 0; 6945 if (dlpi_pending == DL_PRIM_INVAL) { 6946 mutex_enter(&ipx->ipx_lock); 6947 ipx->ipx_current_ipif = NULL; 6948 mutex_exit(&ipx->ipx_lock); 6949 } 6950 } 6951 6952 /* 6953 * The ill is closing. Flush all messages on the ipsq that originated 6954 * from this ill. Usually there wont' be any messages on the ipsq_xopq_mphead 6955 * for this ill since ipsq_enter could not have entered until then. 6956 * New messages can't be queued since the CONDEMNED flag is set. 6957 */ 6958 static void 6959 ipsq_flush(ill_t *ill) 6960 { 6961 queue_t *q; 6962 mblk_t *prev; 6963 mblk_t *mp; 6964 mblk_t *mp_next; 6965 ipxop_t *ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop; 6966 6967 ASSERT(IAM_WRITER_ILL(ill)); 6968 6969 /* 6970 * Flush any messages sent up by the driver. 6971 */ 6972 mutex_enter(&ipx->ipx_lock); 6973 for (prev = NULL, mp = ipx->ipx_mphead; mp != NULL; mp = mp_next) { 6974 mp_next = mp->b_next; 6975 q = mp->b_queue; 6976 if (q == ill->ill_rq || q == ill->ill_wq) { 6977 /* dequeue mp */ 6978 if (prev == NULL) 6979 ipx->ipx_mphead = mp->b_next; 6980 else 6981 prev->b_next = mp->b_next; 6982 if (ipx->ipx_mptail == mp) { 6983 ASSERT(mp_next == NULL); 6984 ipx->ipx_mptail = prev; 6985 } 6986 inet_freemsg(mp); 6987 } else { 6988 prev = mp; 6989 } 6990 } 6991 mutex_exit(&ipx->ipx_lock); 6992 (void) ipsq_pending_mp_cleanup(ill, NULL); 6993 ipsq_xopq_mp_cleanup(ill, NULL); 6994 } 6995 6996 /* 6997 * Parse an ifreq or lifreq struct coming down ioctls and refhold 6998 * and return the associated ipif. 6999 * Return value: 7000 * Non zero: An error has occurred. ci may not be filled out. 7001 * zero : ci is filled out with the ioctl cmd in ci.ci_name, and 7002 * a held ipif in ci.ci_ipif. 7003 */ 7004 int 7005 ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, 7006 cmd_info_t *ci) 7007 { 7008 char *name; 7009 struct ifreq *ifr; 7010 struct lifreq *lifr; 7011 ipif_t *ipif = NULL; 7012 ill_t *ill; 7013 conn_t *connp; 7014 boolean_t isv6; 7015 int err; 7016 mblk_t *mp1; 7017 zoneid_t zoneid; 7018 ip_stack_t *ipst; 7019 7020 if (q->q_next != NULL) { 7021 ill = (ill_t *)q->q_ptr; 7022 isv6 = ill->ill_isv6; 7023 connp = NULL; 7024 zoneid = ALL_ZONES; 7025 ipst = ill->ill_ipst; 7026 } else { 7027 ill = NULL; 7028 connp = Q_TO_CONN(q); 7029 isv6 = (connp->conn_family == AF_INET6); 7030 zoneid = connp->conn_zoneid; 7031 if (zoneid == GLOBAL_ZONEID) { 7032 /* global zone can access ipifs in all zones */ 7033 zoneid = ALL_ZONES; 7034 } 7035 ipst = connp->conn_netstack->netstack_ip; 7036 } 7037 7038 /* Has been checked in ip_wput_nondata */ 7039 mp1 = mp->b_cont->b_cont; 7040 7041 if (ipip->ipi_cmd_type == IF_CMD) { 7042 /* This a old style SIOC[GS]IF* command */ 7043 ifr = (struct ifreq *)mp1->b_rptr; 7044 /* 7045 * Null terminate the string to protect against buffer 7046 * overrun. String was generated by user code and may not 7047 * be trusted. 
7048 */ 7049 ifr->ifr_name[IFNAMSIZ - 1] = '\0'; 7050 name = ifr->ifr_name; 7051 ci->ci_sin = (sin_t *)&ifr->ifr_addr; 7052 ci->ci_sin6 = NULL; 7053 ci->ci_lifr = (struct lifreq *)ifr; 7054 } else { 7055 /* This a new style SIOC[GS]LIF* command */ 7056 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 7057 lifr = (struct lifreq *)mp1->b_rptr; 7058 /* 7059 * Null terminate the string to protect against buffer 7060 * overrun. String was generated by user code and may not 7061 * be trusted. 7062 */ 7063 lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; 7064 name = lifr->lifr_name; 7065 ci->ci_sin = (sin_t *)&lifr->lifr_addr; 7066 ci->ci_sin6 = (sin6_t *)&lifr->lifr_addr; 7067 ci->ci_lifr = lifr; 7068 } 7069 7070 if (ipip->ipi_cmd == SIOCSLIFNAME) { 7071 /* 7072 * The ioctl will be failed if the ioctl comes down 7073 * an conn stream 7074 */ 7075 if (ill == NULL) { 7076 /* 7077 * Not an ill queue, return EINVAL same as the 7078 * old error code. 7079 */ 7080 return (ENXIO); 7081 } 7082 ipif = ill->ill_ipif; 7083 ipif_refhold(ipif); 7084 } else { 7085 /* 7086 * Ensure that ioctls don't see any internal state changes 7087 * caused by set ioctls by deferring them if IPIF_CHANGING is 7088 * set. 7089 */ 7090 ipif = ipif_lookup_on_name_async(name, mi_strlen(name), 7091 isv6, zoneid, q, mp, ip_process_ioctl, &err, ipst); 7092 if (ipif == NULL) { 7093 if (err == EINPROGRESS) 7094 return (err); 7095 err = 0; /* Ensure we don't use it below */ 7096 } 7097 } 7098 7099 /* 7100 * Old style [GS]IFCMD does not admit IPv6 ipif 7101 */ 7102 if (ipif != NULL && ipif->ipif_isv6 && ipip->ipi_cmd_type == IF_CMD) { 7103 ipif_refrele(ipif); 7104 return (ENXIO); 7105 } 7106 7107 if (ipif == NULL && ill != NULL && ill->ill_ipif != NULL && 7108 name[0] == '\0') { 7109 /* 7110 * Handle a or a SIOC?IF* with a null name 7111 * during plumb (on the ill queue before the I_PLINK). 7112 */ 7113 ipif = ill->ill_ipif; 7114 ipif_refhold(ipif); 7115 } 7116 7117 if (ipif == NULL) 7118 return (ENXIO); 7119 7120 DTRACE_PROBE4(ipif__ioctl, char *, "ip_extract_lifreq", 7121 int, ipip->ipi_cmd, ill_t *, ipif->ipif_ill, ipif_t *, ipif); 7122 7123 ci->ci_ipif = ipif; 7124 return (0); 7125 } 7126 7127 /* 7128 * Return the total number of ipifs. 7129 */ 7130 static uint_t 7131 ip_get_numifs(zoneid_t zoneid, ip_stack_t *ipst) 7132 { 7133 uint_t numifs = 0; 7134 ill_t *ill; 7135 ill_walk_context_t ctx; 7136 ipif_t *ipif; 7137 7138 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7139 ill = ILL_START_WALK_V4(&ctx, ipst); 7140 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 7141 if (IS_UNDER_IPMP(ill)) 7142 continue; 7143 for (ipif = ill->ill_ipif; ipif != NULL; 7144 ipif = ipif->ipif_next) { 7145 if (ipif->ipif_zoneid == zoneid || 7146 ipif->ipif_zoneid == ALL_ZONES) 7147 numifs++; 7148 } 7149 } 7150 rw_exit(&ipst->ips_ill_g_lock); 7151 return (numifs); 7152 } 7153 7154 /* 7155 * Return the total number of ipifs. 
7156 */ 7157 static uint_t 7158 ip_get_numlifs(int family, int lifn_flags, zoneid_t zoneid, ip_stack_t *ipst) 7159 { 7160 uint_t numifs = 0; 7161 ill_t *ill; 7162 ipif_t *ipif; 7163 ill_walk_context_t ctx; 7164 7165 ip1dbg(("ip_get_numlifs(%d %u %d)\n", family, lifn_flags, (int)zoneid)); 7166 7167 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7168 if (family == AF_INET) 7169 ill = ILL_START_WALK_V4(&ctx, ipst); 7170 else if (family == AF_INET6) 7171 ill = ILL_START_WALK_V6(&ctx, ipst); 7172 else 7173 ill = ILL_START_WALK_ALL(&ctx, ipst); 7174 7175 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 7176 if (IS_UNDER_IPMP(ill) && !(lifn_flags & LIFC_UNDER_IPMP)) 7177 continue; 7178 7179 for (ipif = ill->ill_ipif; ipif != NULL; 7180 ipif = ipif->ipif_next) { 7181 if ((ipif->ipif_flags & IPIF_NOXMIT) && 7182 !(lifn_flags & LIFC_NOXMIT)) 7183 continue; 7184 if ((ipif->ipif_flags & IPIF_TEMPORARY) && 7185 !(lifn_flags & LIFC_TEMPORARY)) 7186 continue; 7187 if (((ipif->ipif_flags & 7188 (IPIF_NOXMIT|IPIF_NOLOCAL| 7189 IPIF_DEPRECATED)) || 7190 IS_LOOPBACK(ill) || 7191 !(ipif->ipif_flags & IPIF_UP)) && 7192 (lifn_flags & LIFC_EXTERNAL_SOURCE)) 7193 continue; 7194 7195 if (zoneid != ipif->ipif_zoneid && 7196 ipif->ipif_zoneid != ALL_ZONES && 7197 (zoneid != GLOBAL_ZONEID || 7198 !(lifn_flags & LIFC_ALLZONES))) 7199 continue; 7200 7201 numifs++; 7202 } 7203 } 7204 rw_exit(&ipst->ips_ill_g_lock); 7205 return (numifs); 7206 } 7207 7208 uint_t 7209 ip_get_lifsrcofnum(ill_t *ill) 7210 { 7211 uint_t numifs = 0; 7212 ill_t *ill_head = ill; 7213 ip_stack_t *ipst = ill->ill_ipst; 7214 7215 /* 7216 * ill_g_usesrc_lock protects ill_usesrc_grp_next, for example, some 7217 * other thread may be trying to relink the ILLs in this usesrc group 7218 * and adjusting the ill_usesrc_grp_next pointers 7219 */ 7220 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER); 7221 if ((ill->ill_usesrc_ifindex == 0) && 7222 (ill->ill_usesrc_grp_next != NULL)) { 7223 for (; (ill != NULL) && (ill->ill_usesrc_grp_next != ill_head); 7224 ill = ill->ill_usesrc_grp_next) 7225 numifs++; 7226 } 7227 rw_exit(&ipst->ips_ill_g_usesrc_lock); 7228 7229 return (numifs); 7230 } 7231 7232 /* Null values are passed in for ipif, sin, and ifreq */ 7233 /* ARGSUSED */ 7234 int 7235 ip_sioctl_get_ifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 7236 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7237 { 7238 int *nump; 7239 conn_t *connp = Q_TO_CONN(q); 7240 7241 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 7242 7243 /* Existence of b_cont->b_cont checked in ip_wput_nondata */ 7244 nump = (int *)mp->b_cont->b_cont->b_rptr; 7245 7246 *nump = ip_get_numifs(connp->conn_zoneid, 7247 connp->conn_netstack->netstack_ip); 7248 ip1dbg(("ip_sioctl_get_ifnum numifs %d", *nump)); 7249 return (0); 7250 } 7251 7252 /* Null values are passed in for ipif, sin, and ifreq */ 7253 /* ARGSUSED */ 7254 int 7255 ip_sioctl_get_lifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, 7256 queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7257 { 7258 struct lifnum *lifn; 7259 mblk_t *mp1; 7260 conn_t *connp = Q_TO_CONN(q); 7261 7262 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 7263 7264 /* Existence checked in ip_wput_nondata */ 7265 mp1 = mp->b_cont->b_cont; 7266 7267 lifn = (struct lifnum *)mp1->b_rptr; 7268 switch (lifn->lifn_family) { 7269 case AF_UNSPEC: 7270 case AF_INET: 7271 case AF_INET6: 7272 break; 7273 default: 7274 return (EAFNOSUPPORT); 7275 } 7276 7277 lifn->lifn_count = ip_get_numlifs(lifn->lifn_family, 
lifn->lifn_flags,
7278	    connp->conn_zoneid, connp->conn_netstack->netstack_ip);
7279	ip1dbg(("ip_sioctl_get_lifnum numifs %d", lifn->lifn_count));
7280	return (0);
7281 }
7282
7283 /* ARGSUSED */
7284 int
7285 ip_sioctl_get_ifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
7286     mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
7287 {
7288	STRUCT_HANDLE(ifconf, ifc);
7289	mblk_t *mp1;
7290	struct iocblk *iocp;
7291	struct ifreq *ifr;
7292	ill_walk_context_t ctx;
7293	ill_t	*ill;
7294	ipif_t	*ipif;
7295	struct sockaddr_in *sin;
7296	int32_t	ifclen;
7297	zoneid_t zoneid;
7298	ip_stack_t *ipst = CONNQ_TO_IPST(q);
7299
7300	ASSERT(q->q_next == NULL); /* not valid ioctls for ip as a module */
7301
7302	ip1dbg(("ip_sioctl_get_ifconf"));
7303	/* Existence verified in ip_wput_nondata */
7304	mp1 = mp->b_cont->b_cont;
7305	iocp = (struct iocblk *)mp->b_rptr;
7306	zoneid = Q_TO_CONN(q)->conn_zoneid;
7307
7308	/*
7309	 * The original SIOCGIFCONF passed in a struct ifconf which specified
7310	 * the user buffer address and length into which the list of struct
7311	 * ifreqs was to be copied. Since AT&T Streams does not seem to
7312	 * allow M_COPYOUT to be used in conjunction with I_STR IOCTLS,
7313	 * the SIOCGIFCONF operation was redefined to simply provide
7314	 * a large output buffer into which we are supposed to jam the ifreq
7315	 * array. The same ioctl command code was used, despite the fact that
7316	 * both the applications and the kernel code had to change, thus making
7317	 * it impossible to support both interfaces.
7318	 *
7319	 * For reasons not good enough to try to explain, the following
7320	 * algorithm is used for deciding what to do with one of these:
7321	 * If the IOCTL comes in as an I_STR, it is assumed to be of the new
7322	 * form with the output buffer coming down as the continuation message.
7323	 * If it arrives as a TRANSPARENT IOCTL, it is assumed to be old style,
7324	 * and we have to copy in the ifconf structure to find out how big the
7325	 * output buffer is and where to copy out to. Sure no problem...
7326	 *
7327	 */
7328	STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, NULL);
7329	if ((mp1->b_wptr - mp1->b_rptr) == STRUCT_SIZE(ifc)) {
7330		int numifs = 0;
7331		size_t ifc_bufsize;
7332
7333		/*
7334		 * Must be (better be!) continuation of a TRANSPARENT
7335		 * IOCTL. We just copied in the ifconf structure.
7336		 */
7337		STRUCT_SET_HANDLE(ifc, iocp->ioc_flag,
7338		    (struct ifconf *)mp1->b_rptr);
7339
7340		/*
7341		 * Allocate a buffer to hold requested information.
7342		 *
7343		 * If ifc_len is larger than what is needed, we only
7344		 * allocate what we will use.
7345		 *
7346		 * If ifc_len is smaller than what is needed, return
7347		 * EINVAL.
7348		 *
7349		 * XXX: the ill_t structure can have 2 counters, for
7350		 * v4 and v6 (not just ill_ipif_up_count) to store the
7351		 * number of interfaces for a device, so we don't need
7352		 * to count them here...
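		 *
		 * (Illustrative user-level usage, not part of the original
		 * source; `s' is a hypothetical AF_INET socket:
		 *
		 *	struct ifconf ifc;
		 *	struct ifreq reqs[32];
		 *
		 *	ifc.ifc_len = sizeof (reqs);
		 *	ifc.ifc_buf = (caddr_t)reqs;
		 *	(void) ioctl(s, SIOCGIFCONF, &ifc);
		 *
		 * On return, ifc_len holds the number of bytes actually
		 * filled in, as set at the end of this function.)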
7353 */ 7354 numifs = ip_get_numifs(zoneid, ipst); 7355 7356 ifclen = STRUCT_FGET(ifc, ifc_len); 7357 ifc_bufsize = numifs * sizeof (struct ifreq); 7358 if (ifc_bufsize > ifclen) { 7359 if (iocp->ioc_cmd == O_SIOCGIFCONF) { 7360 /* old behaviour */ 7361 return (EINVAL); 7362 } else { 7363 ifc_bufsize = ifclen; 7364 } 7365 } 7366 7367 mp1 = mi_copyout_alloc(q, mp, 7368 STRUCT_FGETP(ifc, ifc_buf), ifc_bufsize, B_FALSE); 7369 if (mp1 == NULL) 7370 return (ENOMEM); 7371 7372 mp1->b_wptr = mp1->b_rptr + ifc_bufsize; 7373 } 7374 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr); 7375 /* 7376 * the SIOCGIFCONF ioctl only knows about 7377 * IPv4 addresses, so don't try to tell 7378 * it about interfaces with IPv6-only 7379 * addresses. (Last parm 'isv6' is B_FALSE) 7380 */ 7381 7382 ifr = (struct ifreq *)mp1->b_rptr; 7383 7384 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7385 ill = ILL_START_WALK_V4(&ctx, ipst); 7386 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 7387 if (IS_UNDER_IPMP(ill)) 7388 continue; 7389 for (ipif = ill->ill_ipif; ipif != NULL; 7390 ipif = ipif->ipif_next) { 7391 if (zoneid != ipif->ipif_zoneid && 7392 ipif->ipif_zoneid != ALL_ZONES) 7393 continue; 7394 if ((uchar_t *)&ifr[1] > mp1->b_wptr) { 7395 if (iocp->ioc_cmd == O_SIOCGIFCONF) { 7396 /* old behaviour */ 7397 rw_exit(&ipst->ips_ill_g_lock); 7398 return (EINVAL); 7399 } else { 7400 goto if_copydone; 7401 } 7402 } 7403 ipif_get_name(ipif, ifr->ifr_name, 7404 sizeof (ifr->ifr_name)); 7405 sin = (sin_t *)&ifr->ifr_addr; 7406 *sin = sin_null; 7407 sin->sin_family = AF_INET; 7408 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 7409 ifr++; 7410 } 7411 } 7412 if_copydone: 7413 rw_exit(&ipst->ips_ill_g_lock); 7414 mp1->b_wptr = (uchar_t *)ifr; 7415 7416 if (STRUCT_BUF(ifc) != NULL) { 7417 STRUCT_FSET(ifc, ifc_len, 7418 (int)((uchar_t *)ifr - mp1->b_rptr)); 7419 } 7420 return (0); 7421 } 7422 7423 /* 7424 * Get the interfaces using the address hosted on the interface passed in, 7425 * as a source adddress 7426 */ 7427 /* ARGSUSED */ 7428 int 7429 ip_sioctl_get_lifsrcof(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 7430 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7431 { 7432 mblk_t *mp1; 7433 ill_t *ill, *ill_head; 7434 ipif_t *ipif, *orig_ipif; 7435 int numlifs = 0; 7436 size_t lifs_bufsize, lifsmaxlen; 7437 struct lifreq *lifr; 7438 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 7439 uint_t ifindex; 7440 zoneid_t zoneid; 7441 boolean_t isv6 = B_FALSE; 7442 struct sockaddr_in *sin; 7443 struct sockaddr_in6 *sin6; 7444 STRUCT_HANDLE(lifsrcof, lifs); 7445 ip_stack_t *ipst; 7446 7447 ipst = CONNQ_TO_IPST(q); 7448 7449 ASSERT(q->q_next == NULL); 7450 7451 zoneid = Q_TO_CONN(q)->conn_zoneid; 7452 7453 /* Existence verified in ip_wput_nondata */ 7454 mp1 = mp->b_cont->b_cont; 7455 7456 /* 7457 * Must be (better be!) continuation of a TRANSPARENT 7458 * IOCTL. We just copied in the lifsrcof structure. 
7459 */ 7460 STRUCT_SET_HANDLE(lifs, iocp->ioc_flag, 7461 (struct lifsrcof *)mp1->b_rptr); 7462 7463 if (MBLKL(mp1) != STRUCT_SIZE(lifs)) 7464 return (EINVAL); 7465 7466 ifindex = STRUCT_FGET(lifs, lifs_ifindex); 7467 isv6 = (Q_TO_CONN(q))->conn_family == AF_INET6; 7468 ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid, ipst); 7469 if (ipif == NULL) { 7470 ip1dbg(("ip_sioctl_get_lifsrcof: no ipif for ifindex %d\n", 7471 ifindex)); 7472 return (ENXIO); 7473 } 7474 7475 /* Allocate a buffer to hold requested information */ 7476 numlifs = ip_get_lifsrcofnum(ipif->ipif_ill); 7477 lifs_bufsize = numlifs * sizeof (struct lifreq); 7478 lifsmaxlen = STRUCT_FGET(lifs, lifs_maxlen); 7479 /* The actual size needed is always returned in lifs_len */ 7480 STRUCT_FSET(lifs, lifs_len, lifs_bufsize); 7481 7482 /* If the amount we need is more than what is passed in, abort */ 7483 if (lifs_bufsize > lifsmaxlen || lifs_bufsize == 0) { 7484 ipif_refrele(ipif); 7485 return (0); 7486 } 7487 7488 mp1 = mi_copyout_alloc(q, mp, 7489 STRUCT_FGETP(lifs, lifs_buf), lifs_bufsize, B_FALSE); 7490 if (mp1 == NULL) { 7491 ipif_refrele(ipif); 7492 return (ENOMEM); 7493 } 7494 7495 mp1->b_wptr = mp1->b_rptr + lifs_bufsize; 7496 bzero(mp1->b_rptr, lifs_bufsize); 7497 7498 lifr = (struct lifreq *)mp1->b_rptr; 7499 7500 ill = ill_head = ipif->ipif_ill; 7501 orig_ipif = ipif; 7502 7503 /* ill_g_usesrc_lock protects ill_usesrc_grp_next */ 7504 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER); 7505 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7506 7507 ill = ill->ill_usesrc_grp_next; /* start from next ill */ 7508 for (; (ill != NULL) && (ill != ill_head); 7509 ill = ill->ill_usesrc_grp_next) { 7510 7511 if ((uchar_t *)&lifr[1] > mp1->b_wptr) 7512 break; 7513 7514 ipif = ill->ill_ipif; 7515 ipif_get_name(ipif, lifr->lifr_name, sizeof (lifr->lifr_name)); 7516 if (ipif->ipif_isv6) { 7517 sin6 = (sin6_t *)&lifr->lifr_addr; 7518 *sin6 = sin6_null; 7519 sin6->sin6_family = AF_INET6; 7520 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 7521 lifr->lifr_addrlen = ip_mask_to_plen_v6( 7522 &ipif->ipif_v6net_mask); 7523 } else { 7524 sin = (sin_t *)&lifr->lifr_addr; 7525 *sin = sin_null; 7526 sin->sin_family = AF_INET; 7527 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 7528 lifr->lifr_addrlen = ip_mask_to_plen( 7529 ipif->ipif_net_mask); 7530 } 7531 lifr++; 7532 } 7533 rw_exit(&ipst->ips_ill_g_lock); 7534 rw_exit(&ipst->ips_ill_g_usesrc_lock); 7535 ipif_refrele(orig_ipif); 7536 mp1->b_wptr = (uchar_t *)lifr; 7537 STRUCT_FSET(lifs, lifs_len, (int)((uchar_t *)lifr - mp1->b_rptr)); 7538 7539 return (0); 7540 } 7541 7542 /* ARGSUSED */ 7543 int 7544 ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 7545 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7546 { 7547 mblk_t *mp1; 7548 int list; 7549 ill_t *ill; 7550 ipif_t *ipif; 7551 int flags; 7552 int numlifs = 0; 7553 size_t lifc_bufsize; 7554 struct lifreq *lifr; 7555 sa_family_t family; 7556 struct sockaddr_in *sin; 7557 struct sockaddr_in6 *sin6; 7558 ill_walk_context_t ctx; 7559 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 7560 int32_t lifclen; 7561 zoneid_t zoneid; 7562 STRUCT_HANDLE(lifconf, lifc); 7563 ip_stack_t *ipst = CONNQ_TO_IPST(q); 7564 7565 ip1dbg(("ip_sioctl_get_lifconf")); 7566 7567 ASSERT(q->q_next == NULL); 7568 7569 zoneid = Q_TO_CONN(q)->conn_zoneid; 7570 7571 /* Existence verified in ip_wput_nondata */ 7572 mp1 = mp->b_cont->b_cont; 7573 7574 /* 7575 * An extended version of SIOCGIFCONF that takes an 7576 * additional address family and flags field. 
7577 * AF_UNSPEC retrieve both IPv4 and IPv6. 7578 * Unless LIFC_NOXMIT is specified the IPIF_NOXMIT 7579 * interfaces are omitted. 7580 * Similarly, IPIF_TEMPORARY interfaces are omitted 7581 * unless LIFC_TEMPORARY is specified. 7582 * If LIFC_EXTERNAL_SOURCE is specified, IPIF_NOXMIT, 7583 * IPIF_NOLOCAL, PHYI_LOOPBACK, IPIF_DEPRECATED and 7584 * not IPIF_UP interfaces are omitted. LIFC_EXTERNAL_SOURCE 7585 * has priority over LIFC_NOXMIT. 7586 */ 7587 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, NULL); 7588 7589 if ((mp1->b_wptr - mp1->b_rptr) != STRUCT_SIZE(lifc)) 7590 return (EINVAL); 7591 7592 /* 7593 * Must be (better be!) continuation of a TRANSPARENT 7594 * IOCTL. We just copied in the lifconf structure. 7595 */ 7596 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, (struct lifconf *)mp1->b_rptr); 7597 7598 family = STRUCT_FGET(lifc, lifc_family); 7599 flags = STRUCT_FGET(lifc, lifc_flags); 7600 7601 switch (family) { 7602 case AF_UNSPEC: 7603 /* 7604 * walk all ILL's. 7605 */ 7606 list = MAX_G_HEADS; 7607 break; 7608 case AF_INET: 7609 /* 7610 * walk only IPV4 ILL's. 7611 */ 7612 list = IP_V4_G_HEAD; 7613 break; 7614 case AF_INET6: 7615 /* 7616 * walk only IPV6 ILL's. 7617 */ 7618 list = IP_V6_G_HEAD; 7619 break; 7620 default: 7621 return (EAFNOSUPPORT); 7622 } 7623 7624 /* 7625 * Allocate a buffer to hold requested information. 7626 * 7627 * If lifc_len is larger than what is needed, we only 7628 * allocate what we will use. 7629 * 7630 * If lifc_len is smaller than what is needed, return 7631 * EINVAL. 7632 */ 7633 numlifs = ip_get_numlifs(family, flags, zoneid, ipst); 7634 lifc_bufsize = numlifs * sizeof (struct lifreq); 7635 lifclen = STRUCT_FGET(lifc, lifc_len); 7636 if (lifc_bufsize > lifclen) { 7637 if (iocp->ioc_cmd == O_SIOCGLIFCONF) 7638 return (EINVAL); 7639 else 7640 lifc_bufsize = lifclen; 7641 } 7642 7643 mp1 = mi_copyout_alloc(q, mp, 7644 STRUCT_FGETP(lifc, lifc_buf), lifc_bufsize, B_FALSE); 7645 if (mp1 == NULL) 7646 return (ENOMEM); 7647 7648 mp1->b_wptr = mp1->b_rptr + lifc_bufsize; 7649 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr); 7650 7651 lifr = (struct lifreq *)mp1->b_rptr; 7652 7653 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7654 ill = ill_first(list, list, &ctx, ipst); 7655 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 7656 if (IS_UNDER_IPMP(ill) && !(flags & LIFC_UNDER_IPMP)) 7657 continue; 7658 7659 for (ipif = ill->ill_ipif; ipif != NULL; 7660 ipif = ipif->ipif_next) { 7661 if ((ipif->ipif_flags & IPIF_NOXMIT) && 7662 !(flags & LIFC_NOXMIT)) 7663 continue; 7664 7665 if ((ipif->ipif_flags & IPIF_TEMPORARY) && 7666 !(flags & LIFC_TEMPORARY)) 7667 continue; 7668 7669 if (((ipif->ipif_flags & 7670 (IPIF_NOXMIT|IPIF_NOLOCAL| 7671 IPIF_DEPRECATED)) || 7672 IS_LOOPBACK(ill) || 7673 !(ipif->ipif_flags & IPIF_UP)) && 7674 (flags & LIFC_EXTERNAL_SOURCE)) 7675 continue; 7676 7677 if (zoneid != ipif->ipif_zoneid && 7678 ipif->ipif_zoneid != ALL_ZONES && 7679 (zoneid != GLOBAL_ZONEID || 7680 !(flags & LIFC_ALLZONES))) 7681 continue; 7682 7683 if ((uchar_t *)&lifr[1] > mp1->b_wptr) { 7684 if (iocp->ioc_cmd == O_SIOCGLIFCONF) { 7685 rw_exit(&ipst->ips_ill_g_lock); 7686 return (EINVAL); 7687 } else { 7688 goto lif_copydone; 7689 } 7690 } 7691 7692 ipif_get_name(ipif, lifr->lifr_name, 7693 sizeof (lifr->lifr_name)); 7694 lifr->lifr_type = ill->ill_type; 7695 if (ipif->ipif_isv6) { 7696 sin6 = (sin6_t *)&lifr->lifr_addr; 7697 *sin6 = sin6_null; 7698 sin6->sin6_family = AF_INET6; 7699 sin6->sin6_addr = 7700 ipif->ipif_v6lcl_addr; 7701 lifr->lifr_addrlen = 7702 ip_mask_to_plen_v6( 
7703 &ipif->ipif_v6net_mask); 7704 } else { 7705 sin = (sin_t *)&lifr->lifr_addr; 7706 *sin = sin_null; 7707 sin->sin_family = AF_INET; 7708 sin->sin_addr.s_addr = 7709 ipif->ipif_lcl_addr; 7710 lifr->lifr_addrlen = 7711 ip_mask_to_plen( 7712 ipif->ipif_net_mask); 7713 } 7714 lifr++; 7715 } 7716 } 7717 lif_copydone: 7718 rw_exit(&ipst->ips_ill_g_lock); 7719 7720 mp1->b_wptr = (uchar_t *)lifr; 7721 if (STRUCT_BUF(lifc) != NULL) { 7722 STRUCT_FSET(lifc, lifc_len, 7723 (int)((uchar_t *)lifr - mp1->b_rptr)); 7724 } 7725 return (0); 7726 } 7727 7728 static void 7729 ip_sioctl_ip6addrpolicy(queue_t *q, mblk_t *mp) 7730 { 7731 ip6_asp_t *table; 7732 size_t table_size; 7733 mblk_t *data_mp; 7734 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 7735 ip_stack_t *ipst; 7736 7737 if (q->q_next == NULL) 7738 ipst = CONNQ_TO_IPST(q); 7739 else 7740 ipst = ILLQ_TO_IPST(q); 7741 7742 /* These two ioctls are I_STR only */ 7743 if (iocp->ioc_count == TRANSPARENT) { 7744 miocnak(q, mp, 0, EINVAL); 7745 return; 7746 } 7747 7748 data_mp = mp->b_cont; 7749 if (data_mp == NULL) { 7750 /* The user passed us a NULL argument */ 7751 table = NULL; 7752 table_size = iocp->ioc_count; 7753 } else { 7754 /* 7755 * The user provided a table. The stream head 7756 * may have copied in the user data in chunks, 7757 * so make sure everything is pulled up 7758 * properly. 7759 */ 7760 if (MBLKL(data_mp) < iocp->ioc_count) { 7761 mblk_t *new_data_mp; 7762 if ((new_data_mp = msgpullup(data_mp, -1)) == 7763 NULL) { 7764 miocnak(q, mp, 0, ENOMEM); 7765 return; 7766 } 7767 freemsg(data_mp); 7768 data_mp = new_data_mp; 7769 mp->b_cont = data_mp; 7770 } 7771 table = (ip6_asp_t *)data_mp->b_rptr; 7772 table_size = iocp->ioc_count; 7773 } 7774 7775 switch (iocp->ioc_cmd) { 7776 case SIOCGIP6ADDRPOLICY: 7777 iocp->ioc_rval = ip6_asp_get(table, table_size, ipst); 7778 if (iocp->ioc_rval == -1) 7779 iocp->ioc_error = EINVAL; 7780 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4 7781 else if (table != NULL && 7782 (iocp->ioc_flag & IOC_MODELS) == IOC_ILP32) { 7783 ip6_asp_t *src = table; 7784 ip6_asp32_t *dst = (void *)table; 7785 int count = table_size / sizeof (ip6_asp_t); 7786 int i; 7787 7788 /* 7789 * We need to do an in-place shrink of the array 7790 * to match the alignment attributes of the 7791 * 32-bit ABI looking at it. 7792 */ 7793 /* LINTED: logical expression always true: op "||" */ 7794 ASSERT(sizeof (*src) > sizeof (*dst)); 7795 for (i = 1; i < count; i++) 7796 bcopy(src + i, dst + i, sizeof (*dst)); 7797 } 7798 #endif 7799 break; 7800 7801 case SIOCSIP6ADDRPOLICY: 7802 ASSERT(mp->b_prev == NULL); 7803 mp->b_prev = (void *)q; 7804 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4 7805 /* 7806 * We pass in the datamodel here so that the ip6_asp_replace() 7807 * routine can handle converting from 32-bit to native formats 7808 * where necessary. 7809 * 7810 * A better way to handle this might be to convert the inbound 7811 * data structure here, and hang it off a new 'mp'; thus the 7812 * ip6_asp_replace() logic would always be dealing with native 7813 * format data structures.. 7814 * 7815 * (An even simpler way to handle these ioctls is to just 7816 * add a 32-bit trailing 'pad' field to the ip6_asp_t structure 7817 * and just recompile everything that depends on it.) 7818 */ 7819 #endif 7820 ip6_asp_replace(mp, table, table_size, B_FALSE, ipst, 7821 iocp->ioc_flag & IOC_MODELS); 7822 return; 7823 } 7824 7825 DB_TYPE(mp) = (iocp->ioc_error == 0) ? 
M_IOCACK : M_IOCNAK; 7826 qreply(q, mp); 7827 } 7828 7829 static void 7830 ip_sioctl_dstinfo(queue_t *q, mblk_t *mp) 7831 { 7832 mblk_t *data_mp; 7833 struct dstinforeq *dir; 7834 uint8_t *end, *cur; 7835 in6_addr_t *daddr, *saddr; 7836 ipaddr_t v4daddr; 7837 ire_t *ire; 7838 ipaddr_t v4setsrc; 7839 in6_addr_t v6setsrc; 7840 char *slabel, *dlabel; 7841 boolean_t isipv4; 7842 int match_ire; 7843 ill_t *dst_ill; 7844 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 7845 conn_t *connp = Q_TO_CONN(q); 7846 zoneid_t zoneid = IPCL_ZONEID(connp); 7847 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 7848 uint64_t ipif_flags; 7849 7850 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 7851 7852 /* 7853 * This ioctl is I_STR only, and must have a 7854 * data mblk following the M_IOCTL mblk. 7855 */ 7856 data_mp = mp->b_cont; 7857 if (iocp->ioc_count == TRANSPARENT || data_mp == NULL) { 7858 miocnak(q, mp, 0, EINVAL); 7859 return; 7860 } 7861 7862 if (MBLKL(data_mp) < iocp->ioc_count) { 7863 mblk_t *new_data_mp; 7864 7865 if ((new_data_mp = msgpullup(data_mp, -1)) == NULL) { 7866 miocnak(q, mp, 0, ENOMEM); 7867 return; 7868 } 7869 freemsg(data_mp); 7870 data_mp = new_data_mp; 7871 mp->b_cont = data_mp; 7872 } 7873 match_ire = MATCH_IRE_DSTONLY; 7874 7875 for (cur = data_mp->b_rptr, end = data_mp->b_wptr; 7876 end - cur >= sizeof (struct dstinforeq); 7877 cur += sizeof (struct dstinforeq)) { 7878 dir = (struct dstinforeq *)cur; 7879 daddr = &dir->dir_daddr; 7880 saddr = &dir->dir_saddr; 7881 7882 /* 7883 * ip_addr_scope_v6() and ip6_asp_lookup() handle 7884 * v4 mapped addresses; ire_ftable_lookup_v6() 7885 * and ip_select_source_v6() do not. 7886 */ 7887 dir->dir_dscope = ip_addr_scope_v6(daddr); 7888 dlabel = ip6_asp_lookup(daddr, &dir->dir_precedence, ipst); 7889 7890 isipv4 = IN6_IS_ADDR_V4MAPPED(daddr); 7891 if (isipv4) { 7892 IN6_V4MAPPED_TO_IPADDR(daddr, v4daddr); 7893 v4setsrc = INADDR_ANY; 7894 ire = ire_route_recursive_v4(v4daddr, 0, NULL, zoneid, 7895 NULL, match_ire, IRR_ALLOCATE, 0, ipst, &v4setsrc, 7896 NULL, NULL); 7897 } else { 7898 v6setsrc = ipv6_all_zeros; 7899 ire = ire_route_recursive_v6(daddr, 0, NULL, zoneid, 7900 NULL, match_ire, IRR_ALLOCATE, 0, ipst, &v6setsrc, 7901 NULL, NULL); 7902 } 7903 ASSERT(ire != NULL); 7904 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 7905 ire_refrele(ire); 7906 dir->dir_dreachable = 0; 7907 7908 /* move on to next dst addr */ 7909 continue; 7910 } 7911 dir->dir_dreachable = 1; 7912 7913 dst_ill = ire_nexthop_ill(ire); 7914 if (dst_ill == NULL) { 7915 ire_refrele(ire); 7916 continue; 7917 } 7918 7919 /* With ipmp we most likely look at the ipmp ill here */ 7920 dir->dir_dmactype = dst_ill->ill_mactype; 7921 7922 if (isipv4) { 7923 ipaddr_t v4saddr; 7924 7925 if (ip_select_source_v4(dst_ill, v4setsrc, v4daddr, 7926 connp->conn_ixa->ixa_multicast_ifaddr, zoneid, ipst, 7927 &v4saddr, NULL, &ipif_flags) != 0) { 7928 v4saddr = INADDR_ANY; 7929 ipif_flags = 0; 7930 } 7931 IN6_IPADDR_TO_V4MAPPED(v4saddr, saddr); 7932 } else { 7933 if (ip_select_source_v6(dst_ill, &v6setsrc, daddr, 7934 zoneid, ipst, B_FALSE, IPV6_PREFER_SRC_DEFAULT, 7935 saddr, NULL, &ipif_flags) != 0) { 7936 *saddr = ipv6_all_zeros; 7937 ipif_flags = 0; 7938 } 7939 } 7940 7941 dir->dir_sscope = ip_addr_scope_v6(saddr); 7942 slabel = ip6_asp_lookup(saddr, NULL, ipst); 7943 dir->dir_labelmatch = ip6_asp_labelcmp(dlabel, slabel); 7944 dir->dir_sdeprecated = (ipif_flags & IPIF_DEPRECATED) ? 
1 : 0; 7945 ire_refrele(ire); 7946 ill_refrele(dst_ill); 7947 } 7948 miocack(q, mp, iocp->ioc_count, 0); 7949 } 7950 7951 /* 7952 * Check if this is an address assigned to this machine. 7953 * Skips interfaces that are down by using ire checks. 7954 * Translates mapped addresses to v4 addresses and then 7955 * treats them as such, returning true if the v4 address 7956 * associated with this mapped address is configured. 7957 * Note: Applications will have to be careful what they do 7958 * with the response; use of mapped addresses limits 7959 * what can be done with the socket, especially with 7960 * respect to socket options and ioctls - neither IPv4 7961 * options nor IPv6 sticky options/ancillary data options 7962 * may be used. 7963 */ 7964 /* ARGSUSED */ 7965 int 7966 ip_sioctl_tmyaddr(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 7967 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 7968 { 7969 struct sioc_addrreq *sia; 7970 sin_t *sin; 7971 ire_t *ire; 7972 mblk_t *mp1; 7973 zoneid_t zoneid; 7974 ip_stack_t *ipst; 7975 7976 ip1dbg(("ip_sioctl_tmyaddr")); 7977 7978 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 7979 zoneid = Q_TO_CONN(q)->conn_zoneid; 7980 ipst = CONNQ_TO_IPST(q); 7981 7982 /* Existence verified in ip_wput_nondata */ 7983 mp1 = mp->b_cont->b_cont; 7984 sia = (struct sioc_addrreq *)mp1->b_rptr; 7985 sin = (sin_t *)&sia->sa_addr; 7986 switch (sin->sin_family) { 7987 case AF_INET6: { 7988 sin6_t *sin6 = (sin6_t *)sin; 7989 7990 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 7991 ipaddr_t v4_addr; 7992 7993 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, 7994 v4_addr); 7995 ire = ire_ftable_lookup_v4(v4_addr, 0, 0, 7996 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, NULL, 7997 MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL); 7998 } else { 7999 in6_addr_t v6addr; 8000 8001 v6addr = sin6->sin6_addr; 8002 ire = ire_ftable_lookup_v6(&v6addr, 0, 0, 8003 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, NULL, 8004 MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL); 8005 } 8006 break; 8007 } 8008 case AF_INET: { 8009 ipaddr_t v4addr; 8010 8011 v4addr = sin->sin_addr.s_addr; 8012 ire = ire_ftable_lookup_v4(v4addr, 0, 0, 8013 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 8014 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL); 8015 break; 8016 } 8017 default: 8018 return (EAFNOSUPPORT); 8019 } 8020 if (ire != NULL) { 8021 sia->sa_res = 1; 8022 ire_refrele(ire); 8023 } else { 8024 sia->sa_res = 0; 8025 } 8026 return (0); 8027 } 8028 8029 /* 8030 * Check if this is an address assigned on-link i.e. neighbor, 8031 * and makes sure it's reachable from the current zone. 8032 * Returns true for my addresses as well. 8033 * Translates mapped addresses to v4 addresses and then 8034 * treats them as such, returning true if the v4 address 8035 * associated with this mapped address is configured. 8036 * Note: Applications will have to be careful what they do 8037 * with the response; use of mapped addresses limits 8038 * what can be done with the socket, especially with 8039 * respect to socket options and ioctls - neither IPv4 8040 * options nor IPv6 sticky options/ancillary data options 8041 * may be used. 
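 *
 * For illustration (editor's sketch, not part of this file; is_onlink()
 * is a hypothetical name), a user-land caller exercises this test with a
 * plain ioctl on an AF_INET socket and reads the verdict from sa_res:
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <netinet/in.h>
#include <stropts.h>
#include <string.h>
#include <unistd.h>

/* Returns 1 if a4 is on-link (a neighbor or ours), 0 if not, -1 on error. */
static int
is_onlink(int s, struct in_addr a4)
{
	struct sioc_addrreq ar;
	struct sockaddr_in *sin = (struct sockaddr_in *)&ar.sa_addr;

	(void) memset(&ar, 0, sizeof (ar));
	sin->sin_family = AF_INET;
	sin->sin_addr = a4;
	if (ioctl(s, SIOCTONLINK, &ar) < 0)
		return (-1);
	return (ar.sa_res != 0);
}
#endif
/*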
 */
/* ARGSUSED */
int
ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
{
	struct sioc_addrreq *sia;
	sin_t *sin;
	mblk_t *mp1;
	ire_t *ire = NULL;
	zoneid_t zoneid;
	ip_stack_t *ipst;

	ip1dbg(("ip_sioctl_tonlink"));

	ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */
	zoneid = Q_TO_CONN(q)->conn_zoneid;
	ipst = CONNQ_TO_IPST(q);

	/* Existence verified in ip_wput_nondata */
	mp1 = mp->b_cont->b_cont;
	sia = (struct sioc_addrreq *)mp1->b_rptr;
	sin = (sin_t *)&sia->sa_addr;

	/*
	 * We check for IRE_ONLINK and exclude IRE_BROADCAST|IRE_MULTICAST
	 * to make sure we only look at on-link unicast addresses.
	 */
	switch (sin->sin_family) {
	case AF_INET6: {
		sin6_t *sin6 = (sin6_t *)sin;

		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
			ipaddr_t v4_addr;

			IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr,
			    v4_addr);
			if (!CLASSD(v4_addr)) {
				ire = ire_ftable_lookup_v4(v4_addr, 0, 0, 0,
				    NULL, zoneid, NULL, MATCH_IRE_DSTONLY,
				    0, ipst, NULL);
			}
		} else {
			in6_addr_t v6addr;

			v6addr = sin6->sin6_addr;
			if (!IN6_IS_ADDR_MULTICAST(&v6addr)) {
				ire = ire_ftable_lookup_v6(&v6addr, 0, 0, 0,
				    NULL, zoneid, NULL, MATCH_IRE_DSTONLY, 0,
				    ipst, NULL);
			}
		}
		break;
	}
	case AF_INET: {
		ipaddr_t v4addr;

		v4addr = sin->sin_addr.s_addr;
		if (!CLASSD(v4addr)) {
			ire = ire_ftable_lookup_v4(v4addr, 0, 0, 0, NULL,
			    zoneid, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL);
		}
		break;
	}
	default:
		return (EAFNOSUPPORT);
	}
	sia->sa_res = 0;
	if (ire != NULL) {
		ASSERT(!(ire->ire_type & IRE_MULTICAST));

		if ((ire->ire_type & IRE_ONLINK) &&
		    !(ire->ire_type & IRE_BROADCAST))
			sia->sa_res = 1;
		ire_refrele(ire);
	}
	return (0);
}

/*
 * TBD: implement when the kernel maintains a list of site prefixes.
 */
/* ARGSUSED */
int
ip_sioctl_tmysite(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
	return (ENXIO);
}

/* ARP IOCTLs. */
/* ARGSUSED */
int
ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
{
	int err;
	ipaddr_t ipaddr;
	struct iocblk *iocp;
	conn_t *connp;
	struct arpreq *ar;
	struct xarpreq *xar;
	int arp_flags, flags, alength;
	uchar_t *lladdr;
	ip_stack_t *ipst;
	ill_t *ill = ipif->ipif_ill;
	ill_t *proxy_ill = NULL;
	ipmp_arpent_t *entp = NULL;
	boolean_t proxyarp = B_FALSE;
	boolean_t if_arp_ioctl = B_FALSE;
	ncec_t *ncec = NULL;
	nce_t *nce;

	ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
	connp = Q_TO_CONN(q);
	ipst = connp->conn_netstack->netstack_ip;
	iocp = (struct iocblk *)mp->b_rptr;

	if (ipip->ipi_cmd_type == XARP_CMD) {
		/* We have a chain - M_IOCTL-->MI_COPY_MBLK-->XARPREQ_MBLK */
		xar = (struct xarpreq *)mp->b_cont->b_cont->b_rptr;
		ar = NULL;

		arp_flags = xar->xarp_flags;
		lladdr = (uchar_t *)LLADDR(&xar->xarp_ha);
		if_arp_ioctl = (xar->xarp_ha.sdl_nlen != 0);
		/*
		 * Validate against the user's link layer address length
		 * input and the name and addr length limits.
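		 *
		 * For reference, the user-land shape of an SIOCGXARP request
		 * is sketched here (editor's illustration, not part of this
		 * file; get_xarp() is a hypothetical name).  Note that
		 * sdl_family must be AF_LINK, and the cached hardware
		 * address comes back in xarp_ha:
		 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <net/if_arp.h>
#include <net/if_dl.h>
#include <netinet/in.h>
#include <stropts.h>
#include <string.h>
#include <unistd.h>

/* Look up the link-layer address that IP has cached for a4. */
static int
get_xarp(int s, struct in_addr a4, struct xarpreq *xar)
{
	struct sockaddr_in *sin = (struct sockaddr_in *)&xar->xarp_pa;

	(void) memset(xar, 0, sizeof (*xar));
	sin->sin_family = AF_INET;
	sin->sin_addr = a4;
	xar->xarp_ha.sdl_family = AF_LINK;
	return (ioctl(s, SIOCGXARP, xar));
}
#endif
		/*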
8171 */ 8172 alength = ill->ill_phys_addr_length; 8173 if (ipip->ipi_cmd == SIOCSXARP) { 8174 if (alength != xar->xarp_ha.sdl_alen || 8175 (alength + xar->xarp_ha.sdl_nlen > 8176 sizeof (xar->xarp_ha.sdl_data))) 8177 return (EINVAL); 8178 } 8179 } else { 8180 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->ARPREQ_MBLK */ 8181 ar = (struct arpreq *)mp->b_cont->b_cont->b_rptr; 8182 xar = NULL; 8183 8184 arp_flags = ar->arp_flags; 8185 lladdr = (uchar_t *)ar->arp_ha.sa_data; 8186 /* 8187 * Theoretically, the sa_family could tell us what link 8188 * layer type this operation is trying to deal with. By 8189 * common usage AF_UNSPEC means ethernet. We'll assume 8190 * any attempt to use the SIOC?ARP ioctls is for ethernet, 8191 * for now. Our new SIOC*XARP ioctls can be used more 8192 * generally. 8193 * 8194 * If the underlying media happens to have a non 6 byte 8195 * address, arp module will fail set/get, but the del 8196 * operation will succeed. 8197 */ 8198 alength = 6; 8199 if ((ipip->ipi_cmd != SIOCDARP) && 8200 (alength != ill->ill_phys_addr_length)) { 8201 return (EINVAL); 8202 } 8203 } 8204 8205 /* Translate ATF* flags to NCE* flags */ 8206 flags = 0; 8207 if (arp_flags & ATF_AUTHORITY) 8208 flags |= NCE_F_AUTHORITY; 8209 if (arp_flags & ATF_PERM) 8210 flags |= NCE_F_NONUD; /* not subject to aging */ 8211 if (arp_flags & ATF_PUBL) 8212 flags |= NCE_F_PUBLISH; 8213 8214 /* 8215 * IPMP ARP special handling: 8216 * 8217 * 1. Since ARP mappings must appear consistent across the group, 8218 * prohibit changing ARP mappings on the underlying interfaces. 8219 * 8220 * 2. Since ARP mappings for IPMP data addresses are maintained by 8221 * IP itself, prohibit changing them. 8222 * 8223 * 3. For proxy ARP, use a functioning hardware address in the group, 8224 * provided one exists. If one doesn't, just add the entry as-is; 8225 * ipmp_illgrp_refresh_arpent() will refresh it if things change. 8226 */ 8227 if (IS_UNDER_IPMP(ill)) { 8228 if (ipip->ipi_cmd != SIOCGARP && ipip->ipi_cmd != SIOCGXARP) 8229 return (EPERM); 8230 } 8231 if (IS_IPMP(ill)) { 8232 ipmp_illgrp_t *illg = ill->ill_grp; 8233 8234 switch (ipip->ipi_cmd) { 8235 case SIOCSARP: 8236 case SIOCSXARP: 8237 proxy_ill = ipmp_illgrp_find_ill(illg, lladdr, alength); 8238 if (proxy_ill != NULL) { 8239 proxyarp = B_TRUE; 8240 if (!ipmp_ill_is_active(proxy_ill)) 8241 proxy_ill = ipmp_illgrp_next_ill(illg); 8242 if (proxy_ill != NULL) 8243 lladdr = proxy_ill->ill_phys_addr; 8244 } 8245 /* FALLTHRU */ 8246 } 8247 } 8248 8249 ipaddr = sin->sin_addr.s_addr; 8250 /* 8251 * don't match across illgrp per case (1) and (2). 8252 * XXX use IS_IPMP(ill) like ndp_sioc_update? 8253 */ 8254 nce = nce_lookup_v4(ill, &ipaddr); 8255 if (nce != NULL) 8256 ncec = nce->nce_common; 8257 8258 switch (iocp->ioc_cmd) { 8259 case SIOCDARP: 8260 case SIOCDXARP: { 8261 /* 8262 * Delete the NCE if any. 8263 */ 8264 if (ncec == NULL) { 8265 iocp->ioc_error = ENXIO; 8266 break; 8267 } 8268 /* Don't allow changes to arp mappings of local addresses. */ 8269 if (NCE_MYADDR(ncec)) { 8270 nce_refrele(nce); 8271 return (ENOTSUP); 8272 } 8273 iocp->ioc_error = 0; 8274 8275 /* 8276 * Delete the nce_common which has ncec_ill set to ipmp_ill. 8277 * This will delete all the nce entries on the under_ills. 8278 */ 8279 ncec_delete(ncec); 8280 /* 8281 * Once the NCE has been deleted, then the ire_dep* consistency 8282 * mechanism will find any IRE which depended on the now 8283 * condemned NCE (as part of sending packets). 
8284 * That mechanism handles redirects by deleting redirects 8285 * that refer to UNREACHABLE nces. 8286 */ 8287 break; 8288 } 8289 case SIOCGARP: 8290 case SIOCGXARP: 8291 if (ncec != NULL) { 8292 lladdr = ncec->ncec_lladdr; 8293 flags = ncec->ncec_flags; 8294 iocp->ioc_error = 0; 8295 ip_sioctl_garp_reply(mp, ncec->ncec_ill, lladdr, flags); 8296 } else { 8297 iocp->ioc_error = ENXIO; 8298 } 8299 break; 8300 case SIOCSARP: 8301 case SIOCSXARP: 8302 /* Don't allow changes to arp mappings of local addresses. */ 8303 if (ncec != NULL && NCE_MYADDR(ncec)) { 8304 nce_refrele(nce); 8305 return (ENOTSUP); 8306 } 8307 8308 /* static arp entries will undergo NUD if ATF_PERM is not set */ 8309 flags |= NCE_F_STATIC; 8310 if (!if_arp_ioctl) { 8311 ip_nce_lookup_and_update(&ipaddr, NULL, ipst, 8312 lladdr, alength, flags); 8313 } else { 8314 ipif_t *ipif = ipif_get_next_ipif(NULL, ill); 8315 if (ipif != NULL) { 8316 ip_nce_lookup_and_update(&ipaddr, ipif, ipst, 8317 lladdr, alength, flags); 8318 ipif_refrele(ipif); 8319 } 8320 } 8321 if (nce != NULL) { 8322 nce_refrele(nce); 8323 nce = NULL; 8324 } 8325 /* 8326 * NCE_F_STATIC entries will be added in state ND_REACHABLE 8327 * by nce_add_common() 8328 */ 8329 err = nce_lookup_then_add_v4(ill, lladdr, 8330 ill->ill_phys_addr_length, &ipaddr, flags, ND_UNCHANGED, 8331 &nce); 8332 if (err == EEXIST) { 8333 ncec = nce->nce_common; 8334 mutex_enter(&ncec->ncec_lock); 8335 ncec->ncec_state = ND_REACHABLE; 8336 ncec->ncec_flags = flags; 8337 nce_update(ncec, ND_UNCHANGED, lladdr); 8338 mutex_exit(&ncec->ncec_lock); 8339 err = 0; 8340 } 8341 if (nce != NULL) { 8342 nce_refrele(nce); 8343 nce = NULL; 8344 } 8345 if (IS_IPMP(ill) && err == 0) { 8346 entp = ipmp_illgrp_create_arpent(ill->ill_grp, 8347 proxyarp, ipaddr, lladdr, ill->ill_phys_addr_length, 8348 flags); 8349 if (entp == NULL || (proxyarp && proxy_ill == NULL)) { 8350 iocp->ioc_error = (entp == NULL ? ENOMEM : 0); 8351 break; 8352 } 8353 } 8354 iocp->ioc_error = err; 8355 } 8356 8357 if (nce != NULL) { 8358 nce_refrele(nce); 8359 } 8360 8361 /* 8362 * If we created an IPMP ARP entry, mark that we've notified ARP. 8363 */ 8364 if (entp != NULL) 8365 ipmp_illgrp_mark_arpent(ill->ill_grp, entp); 8366 8367 return (iocp->ioc_error); 8368 } 8369 8370 /* 8371 * Parse an [x]arpreq structure coming down SIOC[GSD][X]ARP ioctls, identify 8372 * the associated sin and refhold and return the associated ipif via `ci'. 
 */
int
ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
    cmd_info_t *ci)
{
	mblk_t *mp1;
	sin_t *sin;
	conn_t *connp;
	ipif_t *ipif;
	ire_t *ire = NULL;
	ill_t *ill = NULL;
	boolean_t exists;
	ip_stack_t *ipst;
	struct arpreq *ar;
	struct xarpreq *xar;
	struct sockaddr_dl *sdl;

	/* ioctl comes down on a conn */
	ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
	connp = Q_TO_CONN(q);
	if (connp->conn_family == AF_INET6)
		return (ENXIO);

	ipst = connp->conn_netstack->netstack_ip;

	/* Verified in ip_wput_nondata */
	mp1 = mp->b_cont->b_cont;

	if (ipip->ipi_cmd_type == XARP_CMD) {
		ASSERT(MBLKL(mp1) >= sizeof (struct xarpreq));
		xar = (struct xarpreq *)mp1->b_rptr;
		sin = (sin_t *)&xar->xarp_pa;
		sdl = &xar->xarp_ha;

		if (sdl->sdl_family != AF_LINK || sin->sin_family != AF_INET)
			return (ENXIO);
		if (sdl->sdl_nlen >= LIFNAMSIZ)
			return (EINVAL);
	} else {
		ASSERT(ipip->ipi_cmd_type == ARP_CMD);
		ASSERT(MBLKL(mp1) >= sizeof (struct arpreq));
		ar = (struct arpreq *)mp1->b_rptr;
		sin = (sin_t *)&ar->arp_pa;
	}

	if (ipip->ipi_cmd_type == XARP_CMD && sdl->sdl_nlen != 0) {
		ipif = ipif_lookup_on_name(sdl->sdl_data, sdl->sdl_nlen,
		    B_FALSE, &exists, B_FALSE, ALL_ZONES, ipst);
		if (ipif == NULL)
			return (ENXIO);
		if (ipif->ipif_id != 0) {
			ipif_refrele(ipif);
			return (ENXIO);
		}
	} else {
		/*
		 * Either an SIOC[DGS]ARP or an SIOC[DGS]XARP with an sdl_nlen
		 * of 0: use the IP address to find the ipif.  If the IP
		 * address is an IPMP test address, ire_ftable_lookup() will
		 * find the wrong ill, so we first do an ipif_lookup_addr().
		 */
		ipif = ipif_lookup_addr(sin->sin_addr.s_addr, NULL, ALL_ZONES,
		    ipst);
		if (ipif == NULL) {
			ire = ire_ftable_lookup_v4(sin->sin_addr.s_addr,
			    0, 0, IRE_IF_RESOLVER, NULL, ALL_ZONES,
			    NULL, MATCH_IRE_TYPE, 0, ipst, NULL);
			if (ire == NULL || ((ill = ire->ire_ill) == NULL)) {
				if (ire != NULL)
					ire_refrele(ire);
				return (ENXIO);
			}
			ASSERT(ire != NULL && ill != NULL);
			ipif = ill->ill_ipif;
			ipif_refhold(ipif);
			ire_refrele(ire);
		}
	}

	if (ipif->ipif_ill->ill_net_type != IRE_IF_RESOLVER) {
		ipif_refrele(ipif);
		return (ENXIO);
	}

	ci->ci_sin = sin;
	ci->ci_ipif = ipif;
	return (0);
}

/*
 * Link or unlink the illgrp on the IPMP meta-interface `ill' depending on
 * the value of `ioccmd'.  While an illgrp is linked to an ipmp_grp_t, it is
 * accessible from that ipmp_grp_t, which means SIOCSLIFGROUPNAME can look it
 * up and thus an ill can join that illgrp.
 *
 * We use I_PLINK/I_PUNLINK to do the link/unlink operations rather than
 * open()/close() primarily because close() is not allowed to fail or block
 * forever.  On the other hand, I_PUNLINK *can* fail, and there's no reason
 * why anyone should ever need to I_PUNLINK an in-use IPMP stream.  To ensure
 * symmetric behavior (e.g., doing an I_PLINK after an I_PUNLINK undoes the
 * I_PUNLINK) we defer linking to I_PLINK.  Separately, we also fail attempts
 * to I_LINK since I_UNLINK is optional and we'd end up in an inconsistent
 * state if I_UNLINK didn't occur.
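 *
 * To ground this, the user-land half of a persistent link looks roughly
 * like the sketch below (editor's illustration only; plink_under_ip() is
 * a hypothetical name, and the real sequence in ifconfig(1M) also does
 * DLPI attach/bind and SIOCSLIFNAME before linking):
 */
#if 0
#include <sys/types.h>
#include <stropts.h>
#include <fcntl.h>
#include <unistd.h>

/* Persistently link an opened device stream beneath /dev/ip. */
static int
plink_under_ip(const char *devpath)
{
	int ip_fd, dev_fd, muxid;

	if ((ip_fd = open("/dev/ip", O_RDWR)) < 0)
		return (-1);
	if ((dev_fd = open(devpath, O_RDWR)) < 0) {
		(void) close(ip_fd);
		return (-1);
	}
	muxid = ioctl(ip_fd, I_PLINK, dev_fd);
	(void) close(dev_fd);	/* the persistent link outlives this close */
	(void) close(ip_fd);
	return (muxid);		/* needed later for I_PUNLINK */
}
#endif
/*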
8476 * 8477 * Note that for each plumb/unplumb operation, we may end up here more than 8478 * once because of the way ifconfig works. However, it's OK to link the same 8479 * illgrp more than once, or unlink an illgrp that's already unlinked. 8480 */ 8481 static int 8482 ip_sioctl_plink_ipmp(ill_t *ill, int ioccmd) 8483 { 8484 int err; 8485 ip_stack_t *ipst = ill->ill_ipst; 8486 8487 ASSERT(IS_IPMP(ill)); 8488 ASSERT(IAM_WRITER_ILL(ill)); 8489 8490 switch (ioccmd) { 8491 case I_LINK: 8492 return (ENOTSUP); 8493 8494 case I_PLINK: 8495 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 8496 ipmp_illgrp_link_grp(ill->ill_grp, ill->ill_phyint->phyint_grp); 8497 rw_exit(&ipst->ips_ipmp_lock); 8498 break; 8499 8500 case I_PUNLINK: 8501 /* 8502 * Require all UP ipifs be brought down prior to unlinking the 8503 * illgrp so any associated IREs (and other state) is torched. 8504 */ 8505 if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0) 8506 return (EBUSY); 8507 8508 /* 8509 * NOTE: We hold ipmp_lock across the unlink to prevent a race 8510 * with an SIOCSLIFGROUPNAME request from an ill trying to 8511 * join this group. Specifically: ills trying to join grab 8512 * ipmp_lock and bump a "pending join" counter checked by 8513 * ipmp_illgrp_unlink_grp(). During the unlink no new pending 8514 * joins can occur (since we have ipmp_lock). Once we drop 8515 * ipmp_lock, subsequent SIOCSLIFGROUPNAME requests will not 8516 * find the illgrp (since we unlinked it) and will return 8517 * EAFNOSUPPORT. This will then take them back through the 8518 * IPMP meta-interface plumbing logic in ifconfig, and thus 8519 * back through I_PLINK above. 8520 */ 8521 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 8522 err = ipmp_illgrp_unlink_grp(ill->ill_grp); 8523 rw_exit(&ipst->ips_ipmp_lock); 8524 return (err); 8525 default: 8526 break; 8527 } 8528 return (0); 8529 } 8530 8531 /* 8532 * Do I_PLINK/I_LINK or I_PUNLINK/I_UNLINK with consistency checks and also 8533 * atomically set/clear the muxids. Also complete the ioctl by acking or 8534 * naking it. Note that the code is structured such that the link type, 8535 * whether it's persistent or not, is treated equally. ifconfig(1M) and 8536 * its clones use the persistent link, while pppd(1M) and perhaps many 8537 * other daemons may use non-persistent link. When combined with some 8538 * ill_t states, linking and unlinking lower streams may be used as 8539 * indicators of dynamic re-plumbing events [see PSARC/1999/348]. 8540 */ 8541 /* ARGSUSED */ 8542 void 8543 ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 8544 { 8545 mblk_t *mp1; 8546 struct linkblk *li; 8547 int ioccmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd; 8548 int err = 0; 8549 8550 ASSERT(ioccmd == I_PLINK || ioccmd == I_PUNLINK || 8551 ioccmd == I_LINK || ioccmd == I_UNLINK); 8552 8553 mp1 = mp->b_cont; /* This is the linkblk info */ 8554 li = (struct linkblk *)mp1->b_rptr; 8555 8556 err = ip_sioctl_plink_ipmod(ipsq, q, mp, ioccmd, li); 8557 if (err == EINPROGRESS) 8558 return; 8559 done: 8560 if (err == 0) 8561 miocack(q, mp, 0, 0); 8562 else 8563 miocnak(q, mp, 0, err); 8564 8565 /* Conn was refheld in ip_sioctl_copyin_setup */ 8566 if (CONN_Q(q)) 8567 CONN_OPER_PENDING_DONE(Q_TO_CONN(q)); 8568 } 8569 8570 /* 8571 * Process I_{P}LINK and I_{P}UNLINK requests named by `ioccmd' and pointed to 8572 * by `mp' and `li' for the IP module stream (if li->q_bot is in fact an IP 8573 * module stream). 
The extended consistency checks requested by ifconfig(1M) are performed
 * here, and ill_muxid is (atomically) set.  Returns zero on success,
 * EINPROGRESS if the operation is still pending, or an error code on
 * failure.
 */
static int
ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd,
    struct linkblk *li)
{
	int err = 0;
	ill_t *ill;
	queue_t *ipwq, *dwq;
	const char *name;
	struct qinit *qinfo;
	boolean_t islink = (ioccmd == I_PLINK || ioccmd == I_LINK);
	boolean_t entered_ipsq = B_FALSE;
	boolean_t is_ip = B_FALSE;
	arl_t *arl;

	/*
	 * Walk the lower stream to verify that it is the IP module stream.
	 * The IP module is identified by its name, wput function,
	 * and non-NULL q_next.  STREAMS ensures that the lower stream
	 * (li->l_qbot) will not vanish until this ioctl completes.
	 */
	for (ipwq = li->l_qbot; ipwq != NULL; ipwq = ipwq->q_next) {
		qinfo = ipwq->q_qinfo;
		name = qinfo->qi_minfo->mi_idname;
		if (name != NULL && strcmp(name, ip_mod_info.mi_idname) == 0 &&
		    qinfo->qi_putp != (pfi_t)ip_lwput && ipwq->q_next != NULL) {
			is_ip = B_TRUE;
			break;
		}
		if (name != NULL && strcmp(name, arp_mod_info.mi_idname) == 0 &&
		    qinfo->qi_putp != (pfi_t)ip_lwput && ipwq->q_next != NULL) {
			break;
		}
	}

	/*
	 * If this isn't an IP module stream, bail.
	 */
	if (ipwq == NULL)
		return (0);

	if (!is_ip) {
		arl = (arl_t *)ipwq->q_ptr;
		ill = arl_to_ill(arl);
		if (ill == NULL)
			return (0);
	} else {
		ill = ipwq->q_ptr;
	}
	ASSERT(ill != NULL);

	if (ipsq == NULL) {
		ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink,
		    NEW_OP, B_FALSE);
		if (ipsq == NULL) {
			if (!is_ip)
				ill_refrele(ill);
			return (EINPROGRESS);
		}
		entered_ipsq = B_TRUE;
	}
	ASSERT(IAM_WRITER_ILL(ill));
	mutex_enter(&ill->ill_lock);
	if (!is_ip) {
		if (islink && ill->ill_muxid == 0) {
			/*
			 * Plumbing has to be done with IP plumbed first, arp
			 * second, but here we have arp being plumbed first.
			 */
			mutex_exit(&ill->ill_lock);
			ipsq_exit(ipsq);
			ill_refrele(ill);
			return (EINVAL);
		}
	}
	mutex_exit(&ill->ill_lock);
	if (!is_ip) {
		arl->arl_muxid = islink ? li->l_index : 0;
		ill_refrele(ill);
		goto done;
	}

	if (IS_IPMP(ill) && (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0)
		goto done;

	/*
	 * As part of I_{P}LINKing, stash the number of downstream modules and
	 * the read queue of the module immediately below IP in the ill.
	 * These are used during the capability negotiation below.
	 */
	ill->ill_lmod_rq = NULL;
	ill->ill_lmod_cnt = 0;
	if (islink && ((dwq = ipwq->q_next) != NULL)) {
		ill->ill_lmod_rq = RD(dwq);
		for (; dwq != NULL; dwq = dwq->q_next)
			ill->ill_lmod_cnt++;
	}

	ill->ill_muxid = islink ? li->l_index : 0;

	/*
	 * Mark the ipsq busy until the capability operations initiated below
	 * complete.  The PLINK/UNLINK ioctl itself completes when our caller
	 * returns, but the capability operation may complete asynchronously
	 * much later.
	 */
	ipsq_current_start(ipsq, ill->ill_ipif, ioccmd);
	/*
	 * If there's at least one up ipif on this ill, then we're bound to
	 * the underlying driver via DLPI.  In that case, renegotiate
	 * capabilities to account for any possible change in modules
	 * interposed between IP and the driver.
	 */
	if (ill->ill_ipif_up_count > 0) {
		if (islink)
			ill_capability_probe(ill);
		else
			ill_capability_reset(ill, B_FALSE);
	}
	ipsq_current_finish(ipsq);
done:
	if (entered_ipsq)
		ipsq_exit(ipsq);

	return (err);
}

/*
 * Search for the ioctl command in the ioctl tables and return a pointer
 * to the ioctl command information.  The ioctl command tables are
 * static and fully populated at compile time.
 */
ip_ioctl_cmd_t *
ip_sioctl_lookup(int ioc_cmd)
{
	int index;
	ip_ioctl_cmd_t *ipip;
	ip_ioctl_cmd_t *ipip_end;

	if (ioc_cmd == IPI_DONTCARE)
		return (NULL);

	/*
	 * Do a two-step search.  First search the indexed table
	 * based on the least significant byte of the ioctl cmd.
	 * If we don't find a match, then search the misc table
	 * serially.
	 */
	index = ioc_cmd & 0xFF;
	if (index < ip_ndx_ioctl_count) {
		ipip = &ip_ndx_ioctl_table[index];
		if (ipip->ipi_cmd == ioc_cmd) {
			/* Found a match in the ndx table */
			return (ipip);
		}
	}

	/* Search the misc table */
	ipip_end = &ip_misc_ioctl_table[ip_misc_ioctl_count];
	for (ipip = ip_misc_ioctl_table; ipip < ipip_end; ipip++) {
		if (ipip->ipi_cmd == ioc_cmd)
			/* Found a match in the misc table */
			return (ipip);
	}

	return (NULL);
}

/*
 * Helper function for ip_sioctl_getsetprop(); performs some sanity checks.
 */
static boolean_t
getset_ioctl_checks(mblk_t *mp)
{
	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
	mblk_t *mp1 = mp->b_cont;
	mod_ioc_prop_t *pioc;
	uint_t flags;
	uint_t pioc_size;

	/* do sanity checks on various arguments */
	if (mp1 == NULL || iocp->ioc_count == 0 ||
	    iocp->ioc_count == TRANSPARENT) {
		return (B_FALSE);
	}
	if (msgdsize(mp1) < iocp->ioc_count) {
		if (!pullupmsg(mp1, iocp->ioc_count))
			return (B_FALSE);
	}

	pioc = (mod_ioc_prop_t *)mp1->b_rptr;

	/* sanity checks on mpr_valsize */
	pioc_size = sizeof (mod_ioc_prop_t);
	if (pioc->mpr_valsize != 0)
		pioc_size += pioc->mpr_valsize - 1;

	if (iocp->ioc_count != pioc_size)
		return (B_FALSE);

	flags = pioc->mpr_flags;
	if (iocp->ioc_cmd == SIOCSETPROP) {
		/*
		 * One can either reset the value to its default value,
		 * change the current value, or append/remove the value
		 * from a multi-valued property.
		 */
		if ((flags & MOD_PROP_DEFAULT) != MOD_PROP_DEFAULT &&
		    flags != MOD_PROP_ACTIVE &&
		    flags != (MOD_PROP_ACTIVE|MOD_PROP_APPEND) &&
		    flags != (MOD_PROP_ACTIVE|MOD_PROP_REMOVE))
			return (B_FALSE);
	} else {
		ASSERT(iocp->ioc_cmd == SIOCGETPROP);

		/*
		 * One can retrieve only one kind of property information
		 * at a time.
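		 *
		 * A caller with access to the private <inet/tunables.h>
		 * definitions reads a property via I_STR as in the editor's
		 * sketch below (illustration only; get_prop() is a
		 * hypothetical name).  Note how ic_len must equal
		 * sizeof (mod_ioc_prop_t) + mpr_valsize - 1 to satisfy the
		 * pioc_size check above:
		 */
#if 0
#include <sys/types.h>
#include <sys/sockio.h>
#include <stropts.h>
#include <inet/tunables.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* Fetch the active value of an IPv4-level property, e.g. "forwarding". */
static int
get_prop(int ip_fd, const char *name, char *val, uint_t valsize)
{
	struct strioctl si;
	mod_ioc_prop_t *pioc;
	uint_t iocsize = sizeof (mod_ioc_prop_t) + valsize - 1;
	int ret;

	if ((pioc = calloc(1, iocsize)) == NULL)
		return (-1);
	pioc->mpr_flags = MOD_PROP_ACTIVE;
	pioc->mpr_proto = MOD_PROTO_IPV4;
	(void) strlcpy(pioc->mpr_name, name, sizeof (pioc->mpr_name));
	pioc->mpr_valsize = valsize;

	si.ic_cmd = SIOCGETPROP;
	si.ic_timout = 0;
	si.ic_len = iocsize;
	si.ic_dp = (char *)pioc;
	ret = ioctl(ip_fd, I_STR, &si);
	if (ret == 0)
		(void) strlcpy(val, pioc->mpr_val, valsize);
	free(pioc);
	return (ret);
}
#endif
		/*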
8795 */ 8796 if ((flags & MOD_PROP_ACTIVE) != MOD_PROP_ACTIVE && 8797 (flags & MOD_PROP_DEFAULT) != MOD_PROP_DEFAULT && 8798 (flags & MOD_PROP_POSSIBLE) != MOD_PROP_POSSIBLE && 8799 (flags & MOD_PROP_PERM) != MOD_PROP_PERM) 8800 return (B_FALSE); 8801 } 8802 8803 return (B_TRUE); 8804 } 8805 8806 /* 8807 * process the SIOC{SET|GET}PROP ioctl's 8808 */ 8809 /* ARGSUSED */ 8810 static void 8811 ip_sioctl_getsetprop(queue_t *q, mblk_t *mp) 8812 { 8813 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8814 mblk_t *mp1 = mp->b_cont; 8815 mod_ioc_prop_t *pioc; 8816 mod_prop_info_t *ptbl = NULL, *pinfo = NULL; 8817 ip_stack_t *ipst; 8818 icmp_stack_t *is; 8819 tcp_stack_t *tcps; 8820 sctp_stack_t *sctps; 8821 udp_stack_t *us; 8822 netstack_t *stack; 8823 void *cbarg; 8824 cred_t *cr; 8825 boolean_t set; 8826 int err; 8827 8828 ASSERT(q->q_next == NULL); 8829 ASSERT(CONN_Q(q)); 8830 8831 if (!getset_ioctl_checks(mp)) { 8832 miocnak(q, mp, 0, EINVAL); 8833 return; 8834 } 8835 ipst = CONNQ_TO_IPST(q); 8836 stack = ipst->ips_netstack; 8837 pioc = (mod_ioc_prop_t *)mp1->b_rptr; 8838 8839 switch (pioc->mpr_proto) { 8840 case MOD_PROTO_IP: 8841 case MOD_PROTO_IPV4: 8842 case MOD_PROTO_IPV6: 8843 ptbl = ipst->ips_propinfo_tbl; 8844 cbarg = ipst; 8845 break; 8846 case MOD_PROTO_RAWIP: 8847 is = stack->netstack_icmp; 8848 ptbl = is->is_propinfo_tbl; 8849 cbarg = is; 8850 break; 8851 case MOD_PROTO_TCP: 8852 tcps = stack->netstack_tcp; 8853 ptbl = tcps->tcps_propinfo_tbl; 8854 cbarg = tcps; 8855 break; 8856 case MOD_PROTO_UDP: 8857 us = stack->netstack_udp; 8858 ptbl = us->us_propinfo_tbl; 8859 cbarg = us; 8860 break; 8861 case MOD_PROTO_SCTP: 8862 sctps = stack->netstack_sctp; 8863 ptbl = sctps->sctps_propinfo_tbl; 8864 cbarg = sctps; 8865 break; 8866 default: 8867 miocnak(q, mp, 0, EINVAL); 8868 return; 8869 } 8870 8871 /* search for given property in respective protocol propinfo table */ 8872 for (pinfo = ptbl; pinfo->mpi_name != NULL; pinfo++) { 8873 if (strcmp(pinfo->mpi_name, pioc->mpr_name) == 0 && 8874 pinfo->mpi_proto == pioc->mpr_proto) 8875 break; 8876 } 8877 if (pinfo->mpi_name == NULL) { 8878 miocnak(q, mp, 0, ENOENT); 8879 return; 8880 } 8881 8882 set = (iocp->ioc_cmd == SIOCSETPROP) ? B_TRUE : B_FALSE; 8883 if (set && pinfo->mpi_setf != NULL) { 8884 cr = msg_getcred(mp, NULL); 8885 if (cr == NULL) 8886 cr = iocp->ioc_cr; 8887 err = pinfo->mpi_setf(cbarg, cr, pinfo, pioc->mpr_ifname, 8888 pioc->mpr_val, pioc->mpr_flags); 8889 } else if (!set && pinfo->mpi_getf != NULL) { 8890 err = pinfo->mpi_getf(cbarg, pinfo, pioc->mpr_ifname, 8891 pioc->mpr_val, pioc->mpr_valsize, pioc->mpr_flags); 8892 } else { 8893 err = EPERM; 8894 } 8895 8896 if (err != 0) { 8897 miocnak(q, mp, 0, err); 8898 } else { 8899 if (set) 8900 miocack(q, mp, 0, 0); 8901 else /* For get, we need to return back the data */ 8902 miocack(q, mp, iocp->ioc_count, 0); 8903 } 8904 } 8905 8906 /* 8907 * process the legacy ND_GET, ND_SET ioctl just for {ip|ip6}_forwarding 8908 * as several routing daemons have unfortunately used this 'unpublished' 8909 * but well-known ioctls. 
 */
/* ARGSUSED */
static void
ip_process_legacy_nddprop(queue_t *q, mblk_t *mp)
{
	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
	mblk_t *mp1 = mp->b_cont;
	char *pname, *pval, *buf;
	uint_t bufsize, proto;
	mod_prop_info_t *ptbl = NULL, *pinfo = NULL;
	ip_stack_t *ipst;
	int err = 0;

	ASSERT(CONN_Q(q));
	ipst = CONNQ_TO_IPST(q);

	if (iocp->ioc_count == 0 || mp1 == NULL) {
		miocnak(q, mp, 0, EINVAL);
		return;
	}

	mp1->b_datap->db_lim[-1] = '\0';	/* Force null termination */
	pval = buf = pname = (char *)mp1->b_rptr;
	bufsize = MBLKL(mp1);

	if (strcmp(pname, "ip_forwarding") == 0) {
		pname = "forwarding";
		proto = MOD_PROTO_IPV4;
	} else if (strcmp(pname, "ip6_forwarding") == 0) {
		pname = "forwarding";
		proto = MOD_PROTO_IPV6;
	} else {
		miocnak(q, mp, 0, EINVAL);
		return;
	}

	ptbl = ipst->ips_propinfo_tbl;
	for (pinfo = ptbl; pinfo->mpi_name != NULL; pinfo++) {
		if (strcmp(pinfo->mpi_name, pname) == 0 &&
		    pinfo->mpi_proto == proto)
			break;
	}

	ASSERT(pinfo->mpi_name != NULL);

	switch (iocp->ioc_cmd) {
	case ND_GET:
		if ((err = pinfo->mpi_getf(ipst, pinfo, NULL, buf, bufsize,
		    0)) == 0) {
			miocack(q, mp, iocp->ioc_count, 0);
			return;
		}
		break;
	case ND_SET:
		/*
		 * The buffer holds the property name and value in the
		 * following format: <property name>'\0'<property value>'\0'.
		 * Extract them.
		 */
		while (*pval++)
			noop;

		if (!*pval || pval >= (char *)mp1->b_wptr) {
			err = EINVAL;
		} else if ((err = pinfo->mpi_setf(ipst, NULL, pinfo, NULL,
		    pval, 0)) == 0) {
			miocack(q, mp, 0, 0);
			return;
		}
		break;
	default:
		err = EINVAL;
		break;
	}
	miocnak(q, mp, 0, err);
}

/*
 * Wrapper function for resuming deferred ioctl processing.
 * Used for SIOCGDSTINFO, SIOCGIP6ADDRPOLICY, SIOCGMSFILTER,
 * SIOCSMSFILTER, SIOCGIPMSFILTER, and SIOCSIPMSFILTER currently.
 */
/* ARGSUSED */
void
ip_sioctl_copyin_resume(ipsq_t *dummy_ipsq, queue_t *q, mblk_t *mp,
    void *dummy_arg)
{
	ip_sioctl_copyin_setup(q, mp);
}

/*
 * ip_sioctl_copyin_setup is called by ip_wput_nondata with any M_IOCTL
 * message that arrives.  Most of the IOCTLs are "socket" IOCTLs which we
 * handle in either I_STR or TRANSPARENT form, using the mi_copy facility.
 * We establish here the size of the block to be copied in.  mi_copyin
 * arranges for this to happen, and processing continues in ip_wput_nondata
 * with an M_IOCDATA message.
 */
void
ip_sioctl_copyin_setup(queue_t *q, mblk_t *mp)
{
	int copyin_size;
	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
	ip_ioctl_cmd_t *ipip;
	cred_t *cr;
	ip_stack_t *ipst;

	if (CONN_Q(q))
		ipst = CONNQ_TO_IPST(q);
	else
		ipst = ILLQ_TO_IPST(q);

	ipip = ip_sioctl_lookup(iocp->ioc_cmd);
	if (ipip == NULL) {
		/*
		 * The ioctl is not one we understand or own.
		 * Pass it along to be processed downstream,
		 * if this is a module instance of IP, else nak
		 * the ioctl.
		 */
		if (q->q_next == NULL) {
			goto nak;
		} else {
			putnext(q, mp);
			return;
		}
	}

	/*
	 * If this is deferred, then we will do all the checks when we
	 * come back.
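	 *
	 * (As an aside: the legacy ND_GET path dispatched below is what
	 * ndd(1M)-style tools still use.  The editor's sketch that follows
	 * shows that user-land shape, for illustration only;
	 * nd_get_ip_forwarding() is a hypothetical name:)
	 */
#if 0
#include <sys/types.h>
#include <stropts.h>
#include <inet/nd.h>
#include <string.h>
#include <unistd.h>

/* Read "ip_forwarding" the way ndd(1M) does, via ND_GET and I_STR. */
static int
nd_get_ip_forwarding(int ip_fd, char *buf, int bufsize)
{
	struct strioctl si;

	/* The buffer carries the name in; the value comes back in place. */
	(void) strlcpy(buf, "ip_forwarding", bufsize);
	si.ic_cmd = ND_GET;
	si.ic_timout = 0;
	si.ic_len = bufsize;
	si.ic_dp = buf;
	return (ioctl(ip_fd, I_STR, &si));
}
#endif
	/*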
9041 */ 9042 if ((iocp->ioc_cmd == SIOCGDSTINFO || 9043 iocp->ioc_cmd == SIOCGIP6ADDRPOLICY) && !ip6_asp_can_lookup(ipst)) { 9044 ip6_asp_pending_op(q, mp, ip_sioctl_copyin_resume); 9045 return; 9046 } 9047 9048 /* 9049 * Only allow a very small subset of IP ioctls on this stream if 9050 * IP is a module and not a driver. Allowing ioctls to be processed 9051 * in this case may cause assert failures or data corruption. 9052 * Typically G[L]IFFLAGS, SLIFNAME/IF_UNITSEL are the only few 9053 * ioctls allowed on an IP module stream, after which this stream 9054 * normally becomes a multiplexor (at which time the stream head 9055 * will fail all ioctls). 9056 */ 9057 if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) { 9058 goto nak; 9059 } 9060 9061 /* Make sure we have ioctl data to process. */ 9062 if (mp->b_cont == NULL && !(ipip->ipi_flags & IPI_NULL_BCONT)) 9063 goto nak; 9064 9065 /* 9066 * Prefer dblk credential over ioctl credential; some synthesized 9067 * ioctls have kcred set because there's no way to crhold() 9068 * a credential in some contexts. (ioc_cr is not crfree() by 9069 * the framework; the caller of ioctl needs to hold the reference 9070 * for the duration of the call). 9071 */ 9072 cr = msg_getcred(mp, NULL); 9073 if (cr == NULL) 9074 cr = iocp->ioc_cr; 9075 9076 /* Make sure normal users don't send down privileged ioctls */ 9077 if ((ipip->ipi_flags & IPI_PRIV) && 9078 (cr != NULL) && secpolicy_ip_config(cr, B_TRUE) != 0) { 9079 /* We checked the privilege earlier but log it here */ 9080 miocnak(q, mp, 0, secpolicy_ip_config(cr, B_FALSE)); 9081 return; 9082 } 9083 9084 /* 9085 * The ioctl command tables can only encode fixed length 9086 * ioctl data. If the length is variable, the table will 9087 * encode the length as zero. Such special cases are handled 9088 * below in the switch. 9089 */ 9090 if (ipip->ipi_copyin_size != 0) { 9091 mi_copyin(q, mp, NULL, ipip->ipi_copyin_size); 9092 return; 9093 } 9094 9095 switch (iocp->ioc_cmd) { 9096 case O_SIOCGIFCONF: 9097 case SIOCGIFCONF: 9098 /* 9099 * This IOCTL is hilarious. See comments in 9100 * ip_sioctl_get_ifconf for the story. 9101 */ 9102 if (iocp->ioc_count == TRANSPARENT) 9103 copyin_size = SIZEOF_STRUCT(ifconf, 9104 iocp->ioc_flag); 9105 else 9106 copyin_size = iocp->ioc_count; 9107 mi_copyin(q, mp, NULL, copyin_size); 9108 return; 9109 9110 case O_SIOCGLIFCONF: 9111 case SIOCGLIFCONF: 9112 copyin_size = SIZEOF_STRUCT(lifconf, iocp->ioc_flag); 9113 mi_copyin(q, mp, NULL, copyin_size); 9114 return; 9115 9116 case SIOCGLIFSRCOF: 9117 copyin_size = SIZEOF_STRUCT(lifsrcof, iocp->ioc_flag); 9118 mi_copyin(q, mp, NULL, copyin_size); 9119 return; 9120 9121 case SIOCGIP6ADDRPOLICY: 9122 ip_sioctl_ip6addrpolicy(q, mp); 9123 ip6_asp_table_refrele(ipst); 9124 return; 9125 9126 case SIOCSIP6ADDRPOLICY: 9127 ip_sioctl_ip6addrpolicy(q, mp); 9128 return; 9129 9130 case SIOCGDSTINFO: 9131 ip_sioctl_dstinfo(q, mp); 9132 ip6_asp_table_refrele(ipst); 9133 return; 9134 9135 case ND_SET: 9136 case ND_GET: 9137 ip_process_legacy_nddprop(q, mp); 9138 return; 9139 9140 case SIOCSETPROP: 9141 case SIOCGETPROP: 9142 ip_sioctl_getsetprop(q, mp); 9143 return; 9144 9145 case I_PLINK: 9146 case I_PUNLINK: 9147 case I_LINK: 9148 case I_UNLINK: 9149 /* 9150 * We treat non-persistent link similarly as the persistent 9151 * link case, in terms of plumbing/unplumbing, as well as 9152 * dynamic re-plumbing events indicator. See comments 9153 * in ip_sioctl_plink() for more. 
9154 * 9155 * Request can be enqueued in the 'ipsq' while waiting 9156 * to become exclusive. So bump up the conn ref. 9157 */ 9158 if (CONN_Q(q)) 9159 CONN_INC_REF(Q_TO_CONN(q)); 9160 ip_sioctl_plink(NULL, q, mp, NULL); 9161 return; 9162 9163 case IP_IOCTL: 9164 ip_wput_ioctl(q, mp); 9165 return; 9166 9167 case SIOCILB: 9168 /* The ioctl length varies depending on the ILB command. */ 9169 copyin_size = iocp->ioc_count; 9170 if (copyin_size < sizeof (ilb_cmd_t)) 9171 goto nak; 9172 mi_copyin(q, mp, NULL, copyin_size); 9173 return; 9174 9175 default: 9176 cmn_err(CE_PANIC, "should not happen "); 9177 } 9178 nak: 9179 if (mp->b_cont != NULL) { 9180 freemsg(mp->b_cont); 9181 mp->b_cont = NULL; 9182 } 9183 iocp->ioc_error = EINVAL; 9184 mp->b_datap->db_type = M_IOCNAK; 9185 iocp->ioc_count = 0; 9186 qreply(q, mp); 9187 } 9188 9189 static void 9190 ip_sioctl_garp_reply(mblk_t *mp, ill_t *ill, void *hwaddr, int flags) 9191 { 9192 struct arpreq *ar; 9193 struct xarpreq *xar; 9194 mblk_t *tmp; 9195 struct iocblk *iocp; 9196 int x_arp_ioctl = B_FALSE; 9197 int *flagsp; 9198 char *storage = NULL; 9199 9200 ASSERT(ill != NULL); 9201 9202 iocp = (struct iocblk *)mp->b_rptr; 9203 ASSERT(iocp->ioc_cmd == SIOCGXARP || iocp->ioc_cmd == SIOCGARP); 9204 9205 tmp = (mp->b_cont)->b_cont; /* xarpreq/arpreq */ 9206 if ((iocp->ioc_cmd == SIOCGXARP) || 9207 (iocp->ioc_cmd == SIOCSXARP)) { 9208 x_arp_ioctl = B_TRUE; 9209 xar = (struct xarpreq *)tmp->b_rptr; 9210 flagsp = &xar->xarp_flags; 9211 storage = xar->xarp_ha.sdl_data; 9212 } else { 9213 ar = (struct arpreq *)tmp->b_rptr; 9214 flagsp = &ar->arp_flags; 9215 storage = ar->arp_ha.sa_data; 9216 } 9217 9218 /* 9219 * We're done if this is not an SIOCG{X}ARP 9220 */ 9221 if (x_arp_ioctl) { 9222 storage += ill_xarp_info(&xar->xarp_ha, ill); 9223 if ((ill->ill_phys_addr_length + ill->ill_name_length) > 9224 sizeof (xar->xarp_ha.sdl_data)) { 9225 iocp->ioc_error = EINVAL; 9226 return; 9227 } 9228 } 9229 *flagsp = ATF_INUSE; 9230 /* 9231 * If /sbin/arp told us we are the authority using the "permanent" 9232 * flag, or if this is one of my addresses print "permanent" 9233 * in the /sbin/arp output. 9234 */ 9235 if ((flags & NCE_F_MYADDR) || (flags & NCE_F_AUTHORITY)) 9236 *flagsp |= ATF_AUTHORITY; 9237 if (flags & NCE_F_NONUD) 9238 *flagsp |= ATF_PERM; /* not subject to aging */ 9239 if (flags & NCE_F_PUBLISH) 9240 *flagsp |= ATF_PUBL; 9241 if (hwaddr != NULL) { 9242 *flagsp |= ATF_COM; 9243 bcopy((char *)hwaddr, storage, ill->ill_phys_addr_length); 9244 } 9245 } 9246 9247 /* 9248 * Create a new logical interface. If ipif_id is zero (i.e. not a logical 9249 * interface) create the next available logical interface for this 9250 * physical interface. 9251 * If ipif is NULL (i.e. the lookup didn't find one) attempt to create an 9252 * ipif with the specified name. 9253 * 9254 * If the address family is not AF_UNSPEC then set the address as well. 9255 * 9256 * If ip_sioctl_addr returns EINPROGRESS then the ioctl (the copyout) 9257 * is completed when the DL_BIND_ACK arrive in ip_rput_dlpi_writer. 9258 * 9259 * Executed as a writer on the ill. 9260 * So no lock is needed to traverse the ipif chain, or examine the 9261 * phyint flags. 
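 *
 * From user land this is SIOCLIFADDIF; the editor's sketch below shows
 * the usual call shape (illustration only, not part of this file;
 * addif() is a hypothetical name).  The kernel writes the name it chose,
 * e.g. "hme0:2", back into lifr_name:
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <net/if.h>
#include <stropts.h>
#include <string.h>
#include <unistd.h>

/* Create the next free logical interface under ifname, with no address. */
static int
addif(int s, const char *ifname, struct lifreq *lifr)
{
	(void) memset(lifr, 0, sizeof (*lifr));
	(void) strlcpy(lifr->lifr_name, ifname, sizeof (lifr->lifr_name));
	lifr->lifr_addr.ss_family = AF_UNSPEC;	/* don't set an address */
	return (ioctl(s, SIOCLIFADDIF, lifr));
}
#endif
/*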
9262 */ 9263 /* ARGSUSED */ 9264 int 9265 ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9266 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 9267 { 9268 mblk_t *mp1; 9269 struct lifreq *lifr; 9270 boolean_t isv6; 9271 boolean_t exists; 9272 char *name; 9273 char *endp; 9274 char *cp; 9275 int namelen; 9276 ipif_t *ipif; 9277 long id; 9278 ipsq_t *ipsq; 9279 ill_t *ill; 9280 sin_t *sin; 9281 int err = 0; 9282 boolean_t found_sep = B_FALSE; 9283 conn_t *connp; 9284 zoneid_t zoneid; 9285 ip_stack_t *ipst = CONNQ_TO_IPST(q); 9286 9287 ASSERT(q->q_next == NULL); 9288 ip1dbg(("ip_sioctl_addif\n")); 9289 /* Existence of mp1 has been checked in ip_wput_nondata */ 9290 mp1 = mp->b_cont->b_cont; 9291 /* 9292 * Null terminate the string to protect against buffer 9293 * overrun. String was generated by user code and may not 9294 * be trusted. 9295 */ 9296 lifr = (struct lifreq *)mp1->b_rptr; 9297 lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; 9298 name = lifr->lifr_name; 9299 ASSERT(CONN_Q(q)); 9300 connp = Q_TO_CONN(q); 9301 isv6 = (connp->conn_family == AF_INET6); 9302 zoneid = connp->conn_zoneid; 9303 namelen = mi_strlen(name); 9304 if (namelen == 0) 9305 return (EINVAL); 9306 9307 exists = B_FALSE; 9308 if ((namelen + 1 == sizeof (ipif_loopback_name)) && 9309 (mi_strcmp(name, ipif_loopback_name) == 0)) { 9310 /* 9311 * Allow creating lo0 using SIOCLIFADDIF. 9312 * can't be any other writer thread. So can pass null below 9313 * for the last 4 args to ipif_lookup_name. 9314 */ 9315 ipif = ipif_lookup_on_name(lifr->lifr_name, namelen, B_TRUE, 9316 &exists, isv6, zoneid, ipst); 9317 /* Prevent any further action */ 9318 if (ipif == NULL) { 9319 return (ENOBUFS); 9320 } else if (!exists) { 9321 /* We created the ipif now and as writer */ 9322 ipif_refrele(ipif); 9323 return (0); 9324 } else { 9325 ill = ipif->ipif_ill; 9326 ill_refhold(ill); 9327 ipif_refrele(ipif); 9328 } 9329 } else { 9330 /* Look for a colon in the name. */ 9331 endp = &name[namelen]; 9332 for (cp = endp; --cp > name; ) { 9333 if (*cp == IPIF_SEPARATOR_CHAR) { 9334 found_sep = B_TRUE; 9335 /* 9336 * Reject any non-decimal aliases for plumbing 9337 * of logical interfaces. Aliases with leading 9338 * zeroes are also rejected as they introduce 9339 * ambiguity in the naming of the interfaces. 9340 * Comparing with "0" takes care of all such 9341 * cases. 9342 */ 9343 if ((strncmp("0", cp+1, 1)) == 0) 9344 return (EINVAL); 9345 9346 if (ddi_strtol(cp+1, &endp, 10, &id) != 0 || 9347 id <= 0 || *endp != '\0') { 9348 return (EINVAL); 9349 } 9350 *cp = '\0'; 9351 break; 9352 } 9353 } 9354 ill = ill_lookup_on_name(name, B_FALSE, isv6, NULL, ipst); 9355 if (found_sep) 9356 *cp = IPIF_SEPARATOR_CHAR; 9357 if (ill == NULL) 9358 return (ENXIO); 9359 } 9360 9361 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_process_ioctl, NEW_OP, 9362 B_TRUE); 9363 9364 /* 9365 * Release the refhold due to the lookup, now that we are excl 9366 * or we are just returning 9367 */ 9368 ill_refrele(ill); 9369 9370 if (ipsq == NULL) 9371 return (EINPROGRESS); 9372 9373 /* We are now exclusive on the IPSQ */ 9374 ASSERT(IAM_WRITER_ILL(ill)); 9375 9376 if (found_sep) { 9377 /* Now see if there is an IPIF with this unit number. */ 9378 for (ipif = ill->ill_ipif; ipif != NULL; 9379 ipif = ipif->ipif_next) { 9380 if (ipif->ipif_id == id) { 9381 err = EEXIST; 9382 goto done; 9383 } 9384 } 9385 } 9386 9387 /* 9388 * We use IRE_LOCAL for lo0:1 etc. for "receive only" use 9389 * of lo0. Plumbing for lo0:0 happens in ipif_lookup_on_name() 9390 * instead. 
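	 *
	 * For clarity, the logical-unit parse rule applied earlier in this
	 * function behaves per the editor's sketch below (kernel-style
	 * illustration only; parse_lun() is a hypothetical name):
	 *
	 *	"hme0"		-> no separator; next free unit is chosen
	 *	"hme0:3"	-> unit 3
	 *	"hme0:0"	-> EINVAL (unit 0 is the interface itself)
	 *	"hme0:007"	-> EINVAL (leading zeroes are ambiguous)
	 */
#if 0
static int
parse_lun(char *name, long *idp)
{
	char *cp = strchr(name, IPIF_SEPARATOR_CHAR);
	char *endp;

	if (cp == NULL)
		return (-1);		/* no logical unit present */
	if (strncmp("0", cp + 1, 1) == 0)
		return (EINVAL);	/* rejects ":0" and leading zeroes */
	if (ddi_strtol(cp + 1, &endp, 10, idp) != 0 ||
	    *idp <= 0 || *endp != '\0')
		return (EINVAL);
	return (0);
}
#endif
	/*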
9391 */ 9392 if ((ipif = ipif_allocate(ill, found_sep ? id : -1, IRE_LOCAL, 9393 B_TRUE, B_TRUE, &err)) == NULL) { 9394 goto done; 9395 } 9396 9397 /* Return created name with ioctl */ 9398 (void) sprintf(lifr->lifr_name, "%s%c%d", ill->ill_name, 9399 IPIF_SEPARATOR_CHAR, ipif->ipif_id); 9400 ip1dbg(("created %s\n", lifr->lifr_name)); 9401 9402 /* Set address */ 9403 sin = (sin_t *)&lifr->lifr_addr; 9404 if (sin->sin_family != AF_UNSPEC) { 9405 err = ip_sioctl_addr(ipif, sin, q, mp, 9406 &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], lifr); 9407 } 9408 9409 done: 9410 ipsq_exit(ipsq); 9411 return (err); 9412 } 9413 9414 /* 9415 * Remove an existing logical interface. If ipif_id is zero (i.e. not a logical 9416 * interface) delete it based on the IP address (on this physical interface). 9417 * Otherwise delete it based on the ipif_id. 9418 * Also, special handling to allow a removeif of lo0. 9419 */ 9420 /* ARGSUSED */ 9421 int 9422 ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9423 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 9424 { 9425 conn_t *connp; 9426 ill_t *ill = ipif->ipif_ill; 9427 boolean_t success; 9428 ip_stack_t *ipst; 9429 9430 ipst = CONNQ_TO_IPST(q); 9431 9432 ASSERT(q->q_next == NULL); 9433 ip1dbg(("ip_sioctl_remove_if(%s:%u %p)\n", 9434 ill->ill_name, ipif->ipif_id, (void *)ipif)); 9435 ASSERT(IAM_WRITER_IPIF(ipif)); 9436 9437 connp = Q_TO_CONN(q); 9438 /* 9439 * Special case for unplumbing lo0 (the loopback physical interface). 9440 * If unplumbing lo0, the incoming address structure has been 9441 * initialized to all zeros. When unplumbing lo0, all its logical 9442 * interfaces must be removed too. 9443 * 9444 * Note that this interface may be called to remove a specific 9445 * loopback logical interface (eg, lo0:1). But in that case 9446 * ipif->ipif_id != 0 so that the code path for that case is the 9447 * same as any other interface (meaning it skips the code directly 9448 * below). 9449 */ 9450 if (ipif->ipif_id == 0 && ill->ill_net_type == IRE_LOOPBACK) { 9451 if (sin->sin_family == AF_UNSPEC && 9452 (IN6_IS_ADDR_UNSPECIFIED(&((sin6_t *)sin)->sin6_addr))) { 9453 /* 9454 * Mark it condemned. No new ref. will be made to ill. 
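 * Once ILL_CONDEMNED and IPIF_CONDEMNED are set, the lookup routines
 * skip this ill and its ipifs, so the existing references can only
 * drain away.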
9455 */ 9456 mutex_enter(&ill->ill_lock); 9457 ill->ill_state_flags |= ILL_CONDEMNED; 9458 for (ipif = ill->ill_ipif; ipif != NULL; 9459 ipif = ipif->ipif_next) { 9460 ipif->ipif_state_flags |= IPIF_CONDEMNED; 9461 } 9462 mutex_exit(&ill->ill_lock); 9463 9464 ipif = ill->ill_ipif; 9465 /* unplumb the loopback interface */ 9466 ill_delete(ill); 9467 mutex_enter(&connp->conn_lock); 9468 mutex_enter(&ill->ill_lock); 9469 9470 /* Are any references to this ill active */ 9471 if (ill_is_freeable(ill)) { 9472 mutex_exit(&ill->ill_lock); 9473 mutex_exit(&connp->conn_lock); 9474 ill_delete_tail(ill); 9475 mi_free(ill); 9476 return (0); 9477 } 9478 success = ipsq_pending_mp_add(connp, ipif, 9479 CONNP_TO_WQ(connp), mp, ILL_FREE); 9480 mutex_exit(&connp->conn_lock); 9481 mutex_exit(&ill->ill_lock); 9482 if (success) 9483 return (EINPROGRESS); 9484 else 9485 return (EINTR); 9486 } 9487 } 9488 9489 if (ipif->ipif_id == 0) { 9490 ipsq_t *ipsq; 9491 9492 /* Find based on address */ 9493 if (ipif->ipif_isv6) { 9494 sin6_t *sin6; 9495 9496 if (sin->sin_family != AF_INET6) 9497 return (EAFNOSUPPORT); 9498 9499 sin6 = (sin6_t *)sin; 9500 /* We are a writer, so we should be able to lookup */ 9501 ipif = ipif_lookup_addr_exact_v6(&sin6->sin6_addr, ill, 9502 ipst); 9503 } else { 9504 if (sin->sin_family != AF_INET) 9505 return (EAFNOSUPPORT); 9506 9507 /* We are a writer, so we should be able to lookup */ 9508 ipif = ipif_lookup_addr_exact(sin->sin_addr.s_addr, ill, 9509 ipst); 9510 } 9511 if (ipif == NULL) { 9512 return (EADDRNOTAVAIL); 9513 } 9514 9515 /* 9516 * It is possible for a user to send an SIOCLIFREMOVEIF with 9517 * lifr_name of the physical interface but with an ip address 9518 * lifr_addr of a logical interface plumbed over it. 9519 * So update ipx_current_ipif now that ipif points to the 9520 * correct one. 9521 */ 9522 ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq; 9523 ipsq->ipsq_xop->ipx_current_ipif = ipif; 9524 9525 /* This is a writer */ 9526 ipif_refrele(ipif); 9527 } 9528 9529 /* 9530 * Can not delete instance zero since it is tied to the ill. 9531 */ 9532 if (ipif->ipif_id == 0) 9533 return (EBUSY); 9534 9535 mutex_enter(&ill->ill_lock); 9536 ipif->ipif_state_flags |= IPIF_CONDEMNED; 9537 mutex_exit(&ill->ill_lock); 9538 9539 ipif_free(ipif); 9540 9541 mutex_enter(&connp->conn_lock); 9542 mutex_enter(&ill->ill_lock); 9543 9544 /* Are any references to this ipif active */ 9545 if (ipif_is_freeable(ipif)) { 9546 mutex_exit(&ill->ill_lock); 9547 mutex_exit(&connp->conn_lock); 9548 ipif_non_duplicate(ipif); 9549 (void) ipif_down_tail(ipif); 9550 ipif_free_tail(ipif); /* frees ipif */ 9551 return (0); 9552 } 9553 success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), mp, 9554 IPIF_FREE); 9555 mutex_exit(&ill->ill_lock); 9556 mutex_exit(&connp->conn_lock); 9557 if (success) 9558 return (EINPROGRESS); 9559 else 9560 return (EINTR); 9561 } 9562 9563 /* 9564 * Restart the removeif ioctl. The refcnt has gone down to 0. 9565 * The ipif is already condemned. So can't find it thru lookups. 
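 * Instead, the already-located (and condemned) ipif is passed in and
 * the deletion is completed directly via the tail routines below.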
9566 */ 9567 /* ARGSUSED */ 9568 int 9569 ip_sioctl_removeif_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, 9570 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_if_req) 9571 { 9572 ill_t *ill = ipif->ipif_ill; 9573 9574 ASSERT(IAM_WRITER_IPIF(ipif)); 9575 ASSERT(ipif->ipif_state_flags & IPIF_CONDEMNED); 9576 9577 ip1dbg(("ip_sioctl_removeif_restart(%s:%u %p)\n", 9578 ill->ill_name, ipif->ipif_id, (void *)ipif)); 9579 9580 if (ipif->ipif_id == 0 && ill->ill_net_type == IRE_LOOPBACK) { 9581 ASSERT(ill->ill_state_flags & ILL_CONDEMNED); 9582 ill_delete_tail(ill); 9583 mi_free(ill); 9584 return (0); 9585 } 9586 9587 ipif_non_duplicate(ipif); 9588 (void) ipif_down_tail(ipif); 9589 ipif_free_tail(ipif); 9590 9591 return (0); 9592 } 9593 9594 /* 9595 * Set the local interface address using the given prefix and ill_token. 9596 */ 9597 /* ARGSUSED */ 9598 int 9599 ip_sioctl_prefix(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9600 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 9601 { 9602 int err; 9603 in6_addr_t v6addr; 9604 sin6_t *sin6; 9605 ill_t *ill; 9606 int i; 9607 9608 ip1dbg(("ip_sioctl_prefix(%s:%u %p)\n", 9609 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9610 9611 ASSERT(IAM_WRITER_IPIF(ipif)); 9612 9613 if (!ipif->ipif_isv6) 9614 return (EINVAL); 9615 9616 if (sin->sin_family != AF_INET6) 9617 return (EAFNOSUPPORT); 9618 9619 sin6 = (sin6_t *)sin; 9620 v6addr = sin6->sin6_addr; 9621 ill = ipif->ipif_ill; 9622 9623 if (IN6_IS_ADDR_UNSPECIFIED(&v6addr) || 9624 IN6_IS_ADDR_UNSPECIFIED(&ill->ill_token)) 9625 return (EADDRNOTAVAIL); 9626 9627 for (i = 0; i < 4; i++) 9628 sin6->sin6_addr.s6_addr32[i] |= ill->ill_token.s6_addr32[i]; 9629 9630 err = ip_sioctl_addr(ipif, sin, q, mp, 9631 &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], dummy_ifreq); 9632 return (err); 9633 } 9634 9635 /* 9636 * Restart entry point to restart the address set operation after the 9637 * refcounts have dropped to zero. 9638 */ 9639 /* ARGSUSED */ 9640 int 9641 ip_sioctl_prefix_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9642 ip_ioctl_cmd_t *ipip, void *ifreq) 9643 { 9644 ip1dbg(("ip_sioctl_prefix_restart(%s:%u %p)\n", 9645 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9646 return (ip_sioctl_addr_restart(ipif, sin, q, mp, ipip, ifreq)); 9647 } 9648 9649 /* 9650 * Set the local interface address. 9651 * Allow an address of all zero when the interface is down. 9652 */ 9653 /* ARGSUSED */ 9654 int 9655 ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9656 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 9657 { 9658 int err = 0; 9659 in6_addr_t v6addr; 9660 boolean_t need_up = B_FALSE; 9661 9662 ip1dbg(("ip_sioctl_addr(%s:%u %p)\n", 9663 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9664 9665 ASSERT(IAM_WRITER_IPIF(ipif)); 9666 9667 if (ipif->ipif_isv6) { 9668 sin6_t *sin6; 9669 ill_t *ill; 9670 phyint_t *phyi; 9671 9672 if (sin->sin_family != AF_INET6) 9673 return (EAFNOSUPPORT); 9674 9675 sin6 = (sin6_t *)sin; 9676 v6addr = sin6->sin6_addr; 9677 ill = ipif->ipif_ill; 9678 phyi = ill->ill_phyint; 9679 9680 /* 9681 * Enforce that true multicast interfaces have a link-local 9682 * address for logical unit 0. 9683 * 9684 * However for those ipif's for which link-local address was 9685 * not created by default, also allow setting :: as the address. 9686 * This scenario would arise, when we delete an address on ipif 9687 * with logical unit 0, we would want to set :: as the address. 
9688 */
9689 if (ipif->ipif_id == 0 &&
9690 (ill->ill_flags & ILLF_MULTICAST) &&
9691 !(ipif->ipif_flags & (IPIF_POINTOPOINT)) &&
9692 !(phyi->phyint_flags & (PHYI_LOOPBACK)) &&
9693 !IN6_IS_ADDR_LINKLOCAL(&v6addr)) {
9694
9695 /*
9696 * If the default link-local was not created by the kernel
9697 * for this ill, allow setting :: as the address on ipif:0.
9698 */
9699 if (ill->ill_flags & ILLF_NOLINKLOCAL) {
9700 if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr))
9701 return (EADDRNOTAVAIL);
9702 } else {
9703 return (EADDRNOTAVAIL);
9704 }
9705 }
9706
9707 /*
9708 * Up interfaces shouldn't have the unspecified address
9709 * unless they also have the IPIF_NOLOCAL flag set and
9710 * have a subnet assigned.
9711 */
9712 if ((ipif->ipif_flags & IPIF_UP) &&
9713 IN6_IS_ADDR_UNSPECIFIED(&v6addr) &&
9714 (!(ipif->ipif_flags & IPIF_NOLOCAL) ||
9715 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) {
9716 return (EADDRNOTAVAIL);
9717 }
9718
9719 if (!ip_local_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask))
9720 return (EADDRNOTAVAIL);
9721 } else {
9722 ipaddr_t addr;
9723
9724 if (sin->sin_family != AF_INET)
9725 return (EAFNOSUPPORT);
9726
9727 addr = sin->sin_addr.s_addr;
9728
9729 /* Allow INADDR_ANY as the local address. */
9730 if (addr != INADDR_ANY &&
9731 !ip_addr_ok_v4(addr, ipif->ipif_net_mask))
9732 return (EADDRNOTAVAIL);
9733
9734 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
9735 }
9736
9737 /*
9738 * Even if there is no change we redo things just to rerun
9739 * ipif_set_default.
9740 */
9741 if (ipif->ipif_flags & IPIF_UP) {
9742 /*
9743 * We are setting a new local address; make sure we have
9744 * net and subnet bcast ire's for the old address, if we
9745 * still need them.
9746 */
9747 /*
9748 * If the interface is already marked up,
9749 * we call ipif_down which will take care
9750 * of ditching any IREs that have been set
9751 * up based on the old interface address.
9752 */
9753 err = ipif_logical_down(ipif, q, mp);
9754 if (err == EINPROGRESS)
9755 return (err);
9756 (void) ipif_down_tail(ipif);
9757 need_up = B_TRUE;
9758 }
9759
9760 err = ip_sioctl_addr_tail(ipif, sin, q, mp, need_up);
9761 return (err);
9762 }
9763
9764 int
9765 ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9766 boolean_t need_up)
9767 {
9768 in6_addr_t v6addr;
9769 in6_addr_t ov6addr;
9770 ipaddr_t addr;
9771 sin6_t *sin6;
9772 int sinlen;
9773 int err = 0;
9774 ill_t *ill = ipif->ipif_ill;
9775 boolean_t need_dl_down;
9776 boolean_t need_arp_down;
9777 struct iocblk *iocp;
9778
9779 iocp = (mp != NULL) ?
(struct iocblk *)mp->b_rptr : NULL; 9780 9781 ip1dbg(("ip_sioctl_addr_tail(%s:%u %p)\n", 9782 ill->ill_name, ipif->ipif_id, (void *)ipif)); 9783 ASSERT(IAM_WRITER_IPIF(ipif)); 9784 9785 /* Must cancel any pending timer before taking the ill_lock */ 9786 if (ipif->ipif_recovery_id != 0) 9787 (void) untimeout(ipif->ipif_recovery_id); 9788 ipif->ipif_recovery_id = 0; 9789 9790 if (ipif->ipif_isv6) { 9791 sin6 = (sin6_t *)sin; 9792 v6addr = sin6->sin6_addr; 9793 sinlen = sizeof (struct sockaddr_in6); 9794 } else { 9795 addr = sin->sin_addr.s_addr; 9796 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 9797 sinlen = sizeof (struct sockaddr_in); 9798 } 9799 mutex_enter(&ill->ill_lock); 9800 ov6addr = ipif->ipif_v6lcl_addr; 9801 ipif->ipif_v6lcl_addr = v6addr; 9802 sctp_update_ipif_addr(ipif, ov6addr); 9803 ipif->ipif_addr_ready = 0; 9804 9805 ip_rts_newaddrmsg(RTM_CHGADDR, 0, ipif, RTSQ_DEFAULT); 9806 9807 /* 9808 * If the interface was previously marked as a duplicate, then since 9809 * we've now got a "new" address, it should no longer be considered a 9810 * duplicate -- even if the "new" address is the same as the old one. 9811 * Note that if all ipifs are down, we may have a pending ARP down 9812 * event to handle. This is because we want to recover from duplicates 9813 * and thus delay tearing down ARP until the duplicates have been 9814 * removed or disabled. 9815 */ 9816 need_dl_down = need_arp_down = B_FALSE; 9817 if (ipif->ipif_flags & IPIF_DUPLICATE) { 9818 need_arp_down = !need_up; 9819 ipif->ipif_flags &= ~IPIF_DUPLICATE; 9820 if (--ill->ill_ipif_dup_count == 0 && !need_up && 9821 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) { 9822 need_dl_down = B_TRUE; 9823 } 9824 } 9825 9826 ipif_set_default(ipif); 9827 9828 /* 9829 * If we've just manually set the IPv6 link-local address (0th ipif), 9830 * tag the ill so that future updates to the interface ID don't result 9831 * in this address getting automatically reconfigured from under the 9832 * administrator. 9833 */ 9834 if (ipif->ipif_isv6 && ipif->ipif_id == 0) { 9835 if (iocp == NULL || (iocp->ioc_cmd == SIOCSLIFADDR && 9836 !IN6_IS_ADDR_UNSPECIFIED(&v6addr))) 9837 ill->ill_manual_linklocal = 1; 9838 } 9839 9840 /* 9841 * When publishing an interface address change event, we only notify 9842 * the event listeners of the new address. It is assumed that if they 9843 * actively care about the addresses assigned that they will have 9844 * already discovered the previous address assigned (if there was one.) 9845 * 9846 * Don't attach nic event message for SIOCLIFADDIF ioctl. 9847 */ 9848 if (iocp != NULL && iocp->ioc_cmd != SIOCLIFADDIF) { 9849 ill_nic_event_dispatch(ill, MAP_IPIF_ID(ipif->ipif_id), 9850 NE_ADDRESS_CHANGE, sin, sinlen); 9851 } 9852 9853 mutex_exit(&ill->ill_lock); 9854 9855 if (need_up) { 9856 /* 9857 * Now bring the interface back up. If this 9858 * is the only IPIF for the ILL, ipif_up 9859 * will have to re-bind to the device, so 9860 * we may get back EINPROGRESS, in which 9861 * case, this IOCTL will get completed in 9862 * ip_rput_dlpi when we see the DL_BIND_ACK. 
9863 */ 9864 err = ipif_up(ipif, q, mp); 9865 } else { 9866 /* Perhaps ilgs should use this ill */ 9867 update_conn_ill(NULL, ill->ill_ipst); 9868 } 9869 9870 if (need_dl_down) 9871 ill_dl_down(ill); 9872 9873 if (need_arp_down && !ill->ill_isv6) 9874 (void) ipif_arp_down(ipif); 9875 9876 /* 9877 * The default multicast interface might have changed (for 9878 * instance if the IPv6 scope of the address changed) 9879 */ 9880 ire_increment_multicast_generation(ill->ill_ipst, ill->ill_isv6); 9881 9882 return (err); 9883 } 9884 9885 /* 9886 * Restart entry point to restart the address set operation after the 9887 * refcounts have dropped to zero. 9888 */ 9889 /* ARGSUSED */ 9890 int 9891 ip_sioctl_addr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9892 ip_ioctl_cmd_t *ipip, void *ifreq) 9893 { 9894 ip1dbg(("ip_sioctl_addr_restart(%s:%u %p)\n", 9895 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9896 ASSERT(IAM_WRITER_IPIF(ipif)); 9897 (void) ipif_down_tail(ipif); 9898 return (ip_sioctl_addr_tail(ipif, sin, q, mp, B_TRUE)); 9899 } 9900 9901 /* ARGSUSED */ 9902 int 9903 ip_sioctl_get_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9904 ip_ioctl_cmd_t *ipip, void *if_req) 9905 { 9906 sin6_t *sin6 = (struct sockaddr_in6 *)sin; 9907 struct lifreq *lifr = (struct lifreq *)if_req; 9908 9909 ip1dbg(("ip_sioctl_get_addr(%s:%u %p)\n", 9910 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9911 /* 9912 * The net mask and address can't change since we have a 9913 * reference to the ipif. So no lock is necessary. 9914 */ 9915 if (ipif->ipif_isv6) { 9916 *sin6 = sin6_null; 9917 sin6->sin6_family = AF_INET6; 9918 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 9919 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 9920 lifr->lifr_addrlen = 9921 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 9922 } else { 9923 *sin = sin_null; 9924 sin->sin_family = AF_INET; 9925 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 9926 if (ipip->ipi_cmd_type == LIF_CMD) { 9927 lifr->lifr_addrlen = 9928 ip_mask_to_plen(ipif->ipif_net_mask); 9929 } 9930 } 9931 return (0); 9932 } 9933 9934 /* 9935 * Set the destination address for a pt-pt interface. 9936 */ 9937 /* ARGSUSED */ 9938 int 9939 ip_sioctl_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9940 ip_ioctl_cmd_t *ipip, void *if_req) 9941 { 9942 int err = 0; 9943 in6_addr_t v6addr; 9944 boolean_t need_up = B_FALSE; 9945 9946 ip1dbg(("ip_sioctl_dstaddr(%s:%u %p)\n", 9947 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9948 ASSERT(IAM_WRITER_IPIF(ipif)); 9949 9950 if (ipif->ipif_isv6) { 9951 sin6_t *sin6; 9952 9953 if (sin->sin_family != AF_INET6) 9954 return (EAFNOSUPPORT); 9955 9956 sin6 = (sin6_t *)sin; 9957 v6addr = sin6->sin6_addr; 9958 9959 if (!ip_remote_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask)) 9960 return (EADDRNOTAVAIL); 9961 } else { 9962 ipaddr_t addr; 9963 9964 if (sin->sin_family != AF_INET) 9965 return (EAFNOSUPPORT); 9966 9967 addr = sin->sin_addr.s_addr; 9968 if (addr != INADDR_ANY && 9969 !ip_addr_ok_v4(addr, ipif->ipif_net_mask)) { 9970 return (EADDRNOTAVAIL); 9971 } 9972 9973 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 9974 } 9975 9976 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr, &v6addr)) 9977 return (0); /* No change */ 9978 9979 if (ipif->ipif_flags & IPIF_UP) { 9980 /* 9981 * If the interface is already marked up, 9982 * we call ipif_down which will take care 9983 * of ditching any IREs that have been set 9984 * up based on the old pp dst address. 
9985 */ 9986 err = ipif_logical_down(ipif, q, mp); 9987 if (err == EINPROGRESS) 9988 return (err); 9989 (void) ipif_down_tail(ipif); 9990 need_up = B_TRUE; 9991 } 9992 /* 9993 * could return EINPROGRESS. If so ioctl will complete in 9994 * ip_rput_dlpi_writer 9995 */ 9996 err = ip_sioctl_dstaddr_tail(ipif, sin, q, mp, need_up); 9997 return (err); 9998 } 9999 10000 static int 10001 ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10002 boolean_t need_up) 10003 { 10004 in6_addr_t v6addr; 10005 ill_t *ill = ipif->ipif_ill; 10006 int err = 0; 10007 boolean_t need_dl_down; 10008 boolean_t need_arp_down; 10009 10010 ip1dbg(("ip_sioctl_dstaddr_tail(%s:%u %p)\n", ill->ill_name, 10011 ipif->ipif_id, (void *)ipif)); 10012 10013 /* Must cancel any pending timer before taking the ill_lock */ 10014 if (ipif->ipif_recovery_id != 0) 10015 (void) untimeout(ipif->ipif_recovery_id); 10016 ipif->ipif_recovery_id = 0; 10017 10018 if (ipif->ipif_isv6) { 10019 sin6_t *sin6; 10020 10021 sin6 = (sin6_t *)sin; 10022 v6addr = sin6->sin6_addr; 10023 } else { 10024 ipaddr_t addr; 10025 10026 addr = sin->sin_addr.s_addr; 10027 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 10028 } 10029 mutex_enter(&ill->ill_lock); 10030 /* Set point to point destination address. */ 10031 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 10032 /* 10033 * Allow this as a means of creating logical 10034 * pt-pt interfaces on top of e.g. an Ethernet. 10035 * XXX Undocumented HACK for testing. 10036 * pt-pt interfaces are created with NUD disabled. 10037 */ 10038 ipif->ipif_flags |= IPIF_POINTOPOINT; 10039 ipif->ipif_flags &= ~IPIF_BROADCAST; 10040 if (ipif->ipif_isv6) 10041 ill->ill_flags |= ILLF_NONUD; 10042 } 10043 10044 /* 10045 * If the interface was previously marked as a duplicate, then since 10046 * we've now got a "new" address, it should no longer be considered a 10047 * duplicate -- even if the "new" address is the same as the old one. 10048 * Note that if all ipifs are down, we may have a pending ARP down 10049 * event to handle. 10050 */ 10051 need_dl_down = need_arp_down = B_FALSE; 10052 if (ipif->ipif_flags & IPIF_DUPLICATE) { 10053 need_arp_down = !need_up; 10054 ipif->ipif_flags &= ~IPIF_DUPLICATE; 10055 if (--ill->ill_ipif_dup_count == 0 && !need_up && 10056 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) { 10057 need_dl_down = B_TRUE; 10058 } 10059 } 10060 10061 /* 10062 * If we've just manually set the IPv6 destination link-local address 10063 * (0th ipif), tag the ill so that future updates to the destination 10064 * interface ID (as can happen with interfaces over IP tunnels) don't 10065 * result in this address getting automatically reconfigured from 10066 * under the administrator. 10067 */ 10068 if (ipif->ipif_isv6 && ipif->ipif_id == 0) 10069 ill->ill_manual_dst_linklocal = 1; 10070 10071 /* Set the new address. */ 10072 ipif->ipif_v6pp_dst_addr = v6addr; 10073 /* Make sure subnet tracks pp_dst */ 10074 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 10075 mutex_exit(&ill->ill_lock); 10076 10077 if (need_up) { 10078 /* 10079 * Now bring the interface back up. If this 10080 * is the only IPIF for the ILL, ipif_up 10081 * will have to re-bind to the device, so 10082 * we may get back EINPROGRESS, in which 10083 * case, this IOCTL will get completed in 10084 * ip_rput_dlpi when we see the DL_BIND_ACK. 
10085 */
10086 err = ipif_up(ipif, q, mp);
10087 }
10088
10089 if (need_dl_down)
10090 ill_dl_down(ill);
10091 if (need_arp_down && !ipif->ipif_isv6)
10092 (void) ipif_arp_down(ipif);
10093
10094 return (err);
10095 }
10096
10097 /*
10098 * Restart entry point to restart the destination address set operation
10099 * after the refcounts have dropped to zero.
10100 */
10101 /* ARGSUSED */
10102 int
10103 ip_sioctl_dstaddr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10104 ip_ioctl_cmd_t *ipip, void *ifreq)
10105 {
10106 ip1dbg(("ip_sioctl_dstaddr_restart(%s:%u %p)\n",
10107 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10108 (void) ipif_down_tail(ipif);
10109 return (ip_sioctl_dstaddr_tail(ipif, sin, q, mp, B_TRUE));
10110 }
10111
10112 /* ARGSUSED */
10113 int
10114 ip_sioctl_get_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10115 ip_ioctl_cmd_t *ipip, void *if_req)
10116 {
10117 sin6_t *sin6 = (struct sockaddr_in6 *)sin;
10118
10119 ip1dbg(("ip_sioctl_get_dstaddr(%s:%u %p)\n",
10120 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10121 /*
10122 * Get the point-to-point destination address. The addresses can't
10123 * change since we hold a reference to the ipif.
10124 */
10125 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0)
10126 return (EADDRNOTAVAIL);
10127
10128 if (ipif->ipif_isv6) {
10129 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
10130 *sin6 = sin6_null;
10131 sin6->sin6_family = AF_INET6;
10132 sin6->sin6_addr = ipif->ipif_v6pp_dst_addr;
10133 } else {
10134 *sin = sin_null;
10135 sin->sin_family = AF_INET;
10136 sin->sin_addr.s_addr = ipif->ipif_pp_dst_addr;
10137 }
10138 return (0);
10139 }
10140
10141 /*
10142 * Check which flags will change when the given flags are set;
10143 * silently ignore flags which userland is not allowed to control.
10144 * (Because these flags may change between SIOCGLIFFLAGS and
10145 * SIOCSLIFFLAGS, and that's outside of userland's control,
10146 * we need to silently ignore them rather than fail.)
10147 */
10148 static void
10149 ip_sioctl_flags_onoff(ipif_t *ipif, uint64_t flags, uint64_t *onp,
10150 uint64_t *offp)
10151 {
10152 ill_t *ill = ipif->ipif_ill;
10153 phyint_t *phyi = ill->ill_phyint;
10154 uint64_t cantchange_flags, intf_flags;
10155 uint64_t turn_on, turn_off;
10156
10157 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags;
10158 cantchange_flags = IFF_CANTCHANGE;
10159 if (IS_IPMP(ill))
10160 cantchange_flags |= IFF_IPMP_CANTCHANGE;
10161 turn_on = (flags ^ intf_flags) & ~cantchange_flags;
10162 turn_off = intf_flags & turn_on;
10163 turn_on ^= turn_off;
10164 *onp = turn_on;
10165 *offp = turn_off;
10166 }
10167
10168 /*
10169 * Set interface flags. Many flags require special handling (e.g.,
10170 * bringing the interface down); see below for details.
10171 *
10172 * NOTE: We don't really enforce that ipif_id zero should be used
10173 * for setting any flags other than IFF_LOGINT_FLAGS. This
10174 * is because applications generally do a SIOCGLIFFLAGS,
10175 * OR in the new flags (those that affect the logical interface)
10176 * and do a SIOCSLIFFLAGS. Thus, "flags" below could contain bits
10177 * other than IFF_LOGINT_FLAGS. One could check whether "turn_on" -
10178 * the flags that will be turned on - is correct with respect to
10179 * ipif_id 0. For backward compatibility reasons, this is not done.
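 *
 * As an illustration, that userland pattern is roughly (a
 * hypothetical sketch, not part of this file):
 *
 *	(void) ioctl(s, SIOCGLIFFLAGS, &lifr);
 *	lifr.lifr_flags |= IFF_PRIVATE;
 *	(void) ioctl(s, SIOCSLIFFLAGS, &lifr);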
10180 */ 10181 /* ARGSUSED */ 10182 int 10183 ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10184 ip_ioctl_cmd_t *ipip, void *if_req) 10185 { 10186 uint64_t turn_on; 10187 uint64_t turn_off; 10188 int err = 0; 10189 phyint_t *phyi; 10190 ill_t *ill; 10191 conn_t *connp; 10192 uint64_t intf_flags; 10193 boolean_t phyint_flags_modified = B_FALSE; 10194 uint64_t flags; 10195 struct ifreq *ifr; 10196 struct lifreq *lifr; 10197 boolean_t set_linklocal = B_FALSE; 10198 10199 ip1dbg(("ip_sioctl_flags(%s:%u %p)\n", 10200 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10201 10202 ASSERT(IAM_WRITER_IPIF(ipif)); 10203 10204 ill = ipif->ipif_ill; 10205 phyi = ill->ill_phyint; 10206 10207 if (ipip->ipi_cmd_type == IF_CMD) { 10208 ifr = (struct ifreq *)if_req; 10209 flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff); 10210 } else { 10211 lifr = (struct lifreq *)if_req; 10212 flags = lifr->lifr_flags; 10213 } 10214 10215 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags; 10216 10217 /* 10218 * Have the flags been set correctly until now? 10219 */ 10220 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0); 10221 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0); 10222 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0); 10223 /* 10224 * Compare the new flags to the old, and partition 10225 * into those coming on and those going off. 10226 * For the 16 bit command keep the bits above bit 16 unchanged. 10227 */ 10228 if (ipip->ipi_cmd == SIOCSIFFLAGS) 10229 flags |= intf_flags & ~0xFFFF; 10230 10231 /* 10232 * Explicitly fail attempts to change flags that are always invalid on 10233 * an IPMP meta-interface. 10234 */ 10235 if (IS_IPMP(ill) && ((flags ^ intf_flags) & IFF_IPMP_INVALID)) 10236 return (EINVAL); 10237 10238 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off); 10239 if ((turn_on|turn_off) == 0) 10240 return (0); /* No change */ 10241 10242 /* 10243 * All test addresses must be IFF_DEPRECATED (to ensure source address 10244 * selection avoids them) -- so force IFF_DEPRECATED on, and do not 10245 * allow it to be turned off. 10246 */ 10247 if ((turn_off & (IFF_DEPRECATED|IFF_NOFAILOVER)) == IFF_DEPRECATED && 10248 (turn_on|intf_flags) & IFF_NOFAILOVER) 10249 return (EINVAL); 10250 10251 if ((connp = Q_TO_CONN(q)) == NULL) 10252 return (EINVAL); 10253 10254 /* 10255 * Only vrrp control socket is allowed to change IFF_UP and 10256 * IFF_NOACCEPT flags when IFF_VRRP is set. 10257 */ 10258 if ((intf_flags & IFF_VRRP) && ((turn_off | turn_on) & IFF_UP)) { 10259 if (!connp->conn_isvrrp) 10260 return (EINVAL); 10261 } 10262 10263 /* 10264 * The IFF_NOACCEPT flag can only be set on an IFF_VRRP IP address by 10265 * VRRP control socket. 10266 */ 10267 if ((turn_off | turn_on) & IFF_NOACCEPT) { 10268 if (!connp->conn_isvrrp || !(intf_flags & IFF_VRRP)) 10269 return (EINVAL); 10270 } 10271 10272 if (turn_on & IFF_NOFAILOVER) { 10273 turn_on |= IFF_DEPRECATED; 10274 flags |= IFF_DEPRECATED; 10275 } 10276 10277 /* 10278 * On underlying interfaces, only allow applications to manage test 10279 * addresses -- otherwise, they may get confused when the address 10280 * moves as part of being brought up. Likewise, prevent an 10281 * application-managed test address from being converted to a data 10282 * address. To prevent migration of administratively up addresses in 10283 * the kernel, we don't allow them to be converted either. 
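 *
 * Concretely: on an underlying ill, IFF_DHCPRUNNING or IFF_ADDRCONF
 * may only be turned on if IFF_NOFAILOVER is (or is being) set, and
 * IFF_NOFAILOVER may not be turned off while any of those flags (or
 * IFF_UP/IFF_DUPLICATE) remain set.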
10284 */ 10285 if (IS_UNDER_IPMP(ill)) { 10286 const uint64_t appflags = IFF_DHCPRUNNING | IFF_ADDRCONF; 10287 10288 if ((turn_on & appflags) && !(flags & IFF_NOFAILOVER)) 10289 return (EINVAL); 10290 10291 if ((turn_off & IFF_NOFAILOVER) && 10292 (flags & (appflags | IFF_UP | IFF_DUPLICATE))) 10293 return (EINVAL); 10294 } 10295 10296 /* 10297 * Only allow IFF_TEMPORARY flag to be set on 10298 * IPv6 interfaces. 10299 */ 10300 if ((turn_on & IFF_TEMPORARY) && !(ipif->ipif_isv6)) 10301 return (EINVAL); 10302 10303 /* 10304 * cannot turn off IFF_NOXMIT on VNI interfaces. 10305 */ 10306 if ((turn_off & IFF_NOXMIT) && IS_VNI(ipif->ipif_ill)) 10307 return (EINVAL); 10308 10309 /* 10310 * Don't allow the IFF_ROUTER flag to be turned on on loopback 10311 * interfaces. It makes no sense in that context. 10312 */ 10313 if ((turn_on & IFF_ROUTER) && (phyi->phyint_flags & PHYI_LOOPBACK)) 10314 return (EINVAL); 10315 10316 /* 10317 * For IPv6 ipif_id 0, don't allow the interface to be up without 10318 * a link local address if IFF_NOLOCAL or IFF_ANYCAST are not set. 10319 * If the link local address isn't set, and can be set, it will get 10320 * set later on in this function. 10321 */ 10322 if (ipif->ipif_id == 0 && ipif->ipif_isv6 && 10323 (flags & IFF_UP) && !(flags & (IFF_NOLOCAL|IFF_ANYCAST)) && 10324 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) { 10325 if (ipif_cant_setlinklocal(ipif)) 10326 return (EINVAL); 10327 set_linklocal = B_TRUE; 10328 } 10329 10330 /* 10331 * If we modify physical interface flags, we'll potentially need to 10332 * send up two routing socket messages for the changes (one for the 10333 * IPv4 ill, and another for the IPv6 ill). Note that here. 10334 */ 10335 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS) 10336 phyint_flags_modified = B_TRUE; 10337 10338 /* 10339 * All functioning PHYI_STANDBY interfaces start life PHYI_INACTIVE 10340 * (otherwise, we'd immediately use them, defeating standby). Also, 10341 * since PHYI_INACTIVE has a separate meaning when PHYI_STANDBY is not 10342 * set, don't allow PHYI_STANDBY to be set if PHYI_INACTIVE is already 10343 * set, and clear PHYI_INACTIVE if PHYI_STANDBY is being cleared. We 10344 * also don't allow PHYI_STANDBY if VNI is enabled since its semantics 10345 * will not be honored. 10346 */ 10347 if (turn_on & PHYI_STANDBY) { 10348 /* 10349 * No need to grab ill_g_usesrc_lock here; see the 10350 * synchronization notes in ip.c. 10351 */ 10352 if (ill->ill_usesrc_grp_next != NULL || 10353 intf_flags & PHYI_INACTIVE) 10354 return (EINVAL); 10355 if (!(flags & PHYI_FAILED)) { 10356 flags |= PHYI_INACTIVE; 10357 turn_on |= PHYI_INACTIVE; 10358 } 10359 } 10360 10361 if (turn_off & PHYI_STANDBY) { 10362 flags &= ~PHYI_INACTIVE; 10363 turn_off |= PHYI_INACTIVE; 10364 } 10365 10366 /* 10367 * PHYI_FAILED and PHYI_INACTIVE are mutually exclusive; fail if both 10368 * would end up on. 10369 */ 10370 if ((flags & (PHYI_FAILED | PHYI_INACTIVE)) == 10371 (PHYI_FAILED | PHYI_INACTIVE)) 10372 return (EINVAL); 10373 10374 /* 10375 * If ILLF_ROUTER changes, we need to change the ip forwarding 10376 * status of the interface. 10377 */ 10378 if ((turn_on | turn_off) & ILLF_ROUTER) 10379 (void) ill_forward_set(ill, ((turn_on & ILLF_ROUTER) != 0)); 10380 10381 /* 10382 * If the interface is not UP and we are not going to 10383 * bring it UP, record the flags and return. When the 10384 * interface comes UP later, the right actions will be 10385 * taken. 
10386 */
10387 if (!(ipif->ipif_flags & IPIF_UP) &&
10388 !(turn_on & IPIF_UP)) {
10389 /* Record new flags in their respective places. */
10390 mutex_enter(&ill->ill_lock);
10391 mutex_enter(&ill->ill_phyint->phyint_lock);
10392 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS);
10393 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS);
10394 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS);
10395 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS);
10396 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS);
10397 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS);
10398 mutex_exit(&ill->ill_lock);
10399 mutex_exit(&ill->ill_phyint->phyint_lock);
10400
10401 /*
10402 * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the
10403 * same to the kernel: if any of them has been set by
10404 * userland, the interface cannot be used for data traffic.
10405 */
10406 if ((turn_on|turn_off) &
10407 (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) {
10408 ASSERT(!IS_IPMP(ill));
10409 /*
10410 * It's possible the ill is part of an "anonymous"
10411 * IPMP group rather than a real group. In that case,
10412 * there are no other interfaces in the group and thus
10413 * no need to call ipmp_phyint_refresh_active().
10414 */
10415 if (IS_UNDER_IPMP(ill))
10416 ipmp_phyint_refresh_active(phyi);
10417 }
10418
10419 if (phyint_flags_modified) {
10420 if (phyi->phyint_illv4 != NULL) {
10421 ip_rts_ifmsg(phyi->phyint_illv4->
10422 ill_ipif, RTSQ_DEFAULT);
10423 }
10424 if (phyi->phyint_illv6 != NULL) {
10425 ip_rts_ifmsg(phyi->phyint_illv6->
10426 ill_ipif, RTSQ_DEFAULT);
10427 }
10428 }
10429 /* The default multicast interface might have changed */
10430 ire_increment_multicast_generation(ill->ill_ipst,
10431 ill->ill_isv6);
10432
10433 return (0);
10434 } else if (set_linklocal) {
10435 mutex_enter(&ill->ill_lock);
10436 /* Note for the tail code that a link-local must be set. */
10437 ipif->ipif_state_flags |= IPIF_SET_LINKLOCAL;
10438 mutex_exit(&ill->ill_lock);
10439 }
10440
10441 /*
10442 * Disallow IPv6 interfaces coming up that have the unspecified address,
10443 * or point-to-point interfaces with an unspecified destination. We do
10444 * allow the address to be unspecified for IPIF_NOLOCAL interfaces that
10445 * have a subnet assigned, which is how in.ndpd currently manages its
10446 * onlink prefix list when no addresses are configured with those
10447 * prefixes.
10448 */
10449 if (ipif->ipif_isv6 &&
10450 ((IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) &&
10451 ((!(ipif->ipif_flags & IPIF_NOLOCAL) && !(turn_on & IPIF_NOLOCAL)) ||
10452 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) ||
10453 ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
10454 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr)))) {
10455 return (EINVAL);
10456 }
10457
10458 /*
10459 * Prevent IPv4 point-to-point interfaces with a 0.0.0.0 destination
10460 * from being brought up.
10461 */
10462 if (!ipif->ipif_isv6 &&
10463 ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
10464 ipif->ipif_pp_dst_addr == INADDR_ANY)) {
10465 return (EINVAL);
10466 }
10467
10468 /*
10469 * If we are going to change one or more of the flags that are
10470 * IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL, ILLF_NOARP,
10471 * ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, IPIF_PREFERRED, and
10472 * IPIF_NOFAILOVER, we will take special action. This is
10473 * done by bringing the ipif down, changing the flags and bringing
10474 * it back up again. For IPIF_NOFAILOVER, the act of bringing it
10475 * back up will trigger the address to be moved.
10476 * 10477 * If we are going to change IFF_NOACCEPT, we need to bring 10478 * all the ipifs down then bring them up again. The act of 10479 * bringing all the ipifs back up will trigger the local 10480 * ires being recreated with "no_accept" set/cleared. 10481 * 10482 * Note that ILLF_NOACCEPT is always set separately from the 10483 * other flags. 10484 */ 10485 if ((turn_on|turn_off) & 10486 (IPIF_UP|IPIF_DEPRECATED|IPIF_NOXMIT|IPIF_NOLOCAL|ILLF_NOARP| 10487 ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED| 10488 IPIF_NOFAILOVER)) { 10489 /* 10490 * ipif_down() will ire_delete bcast ire's for the subnet, 10491 * while the ire_identical_ref tracks the case of IRE_BROADCAST 10492 * entries shared between multiple ipifs on the same subnet. 10493 */ 10494 if (((ipif->ipif_flags | turn_on) & IPIF_UP) && 10495 !(turn_off & IPIF_UP)) { 10496 if (ipif->ipif_flags & IPIF_UP) 10497 ill->ill_logical_down = 1; 10498 turn_on &= ~IPIF_UP; 10499 } 10500 err = ipif_down(ipif, q, mp); 10501 ip1dbg(("ipif_down returns %d err ", err)); 10502 if (err == EINPROGRESS) 10503 return (err); 10504 (void) ipif_down_tail(ipif); 10505 } else if ((turn_on|turn_off) & ILLF_NOACCEPT) { 10506 /* 10507 * If we can quiesce the ill, then continue. If not, then 10508 * ip_sioctl_flags_tail() will be called from 10509 * ipif_ill_refrele_tail(). 10510 */ 10511 ill_down_ipifs(ill, B_TRUE); 10512 10513 mutex_enter(&connp->conn_lock); 10514 mutex_enter(&ill->ill_lock); 10515 if (!ill_is_quiescent(ill)) { 10516 boolean_t success; 10517 10518 success = ipsq_pending_mp_add(connp, ill->ill_ipif, 10519 q, mp, ILL_DOWN); 10520 mutex_exit(&ill->ill_lock); 10521 mutex_exit(&connp->conn_lock); 10522 return (success ? EINPROGRESS : EINTR); 10523 } 10524 mutex_exit(&ill->ill_lock); 10525 mutex_exit(&connp->conn_lock); 10526 } 10527 return (ip_sioctl_flags_tail(ipif, flags, q, mp)); 10528 } 10529 10530 static int 10531 ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp) 10532 { 10533 ill_t *ill; 10534 phyint_t *phyi; 10535 uint64_t turn_on, turn_off; 10536 boolean_t phyint_flags_modified = B_FALSE; 10537 int err = 0; 10538 boolean_t set_linklocal = B_FALSE; 10539 10540 ip1dbg(("ip_sioctl_flags_tail(%s:%u)\n", 10541 ipif->ipif_ill->ill_name, ipif->ipif_id)); 10542 10543 ASSERT(IAM_WRITER_IPIF(ipif)); 10544 10545 ill = ipif->ipif_ill; 10546 phyi = ill->ill_phyint; 10547 10548 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off); 10549 10550 /* 10551 * IFF_UP is handled separately. 10552 */ 10553 turn_on &= ~IFF_UP; 10554 turn_off &= ~IFF_UP; 10555 10556 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS) 10557 phyint_flags_modified = B_TRUE; 10558 10559 /* 10560 * Now we change the flags. Track current value of 10561 * other flags in their respective places. 
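 * Both ill_lock and phyint_lock are held across the update so that
 * readers always observe a consistent set of ipif, ill and phyint
 * flags.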
10562 */ 10563 mutex_enter(&ill->ill_lock); 10564 mutex_enter(&phyi->phyint_lock); 10565 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS); 10566 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS); 10567 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS); 10568 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS); 10569 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS); 10570 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS); 10571 if (ipif->ipif_state_flags & IPIF_SET_LINKLOCAL) { 10572 set_linklocal = B_TRUE; 10573 ipif->ipif_state_flags &= ~IPIF_SET_LINKLOCAL; 10574 } 10575 10576 mutex_exit(&ill->ill_lock); 10577 mutex_exit(&phyi->phyint_lock); 10578 10579 if (set_linklocal) 10580 (void) ipif_setlinklocal(ipif); 10581 10582 /* 10583 * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the same to 10584 * the kernel: if any of them has been set by userland, the interface 10585 * cannot be used for data traffic. 10586 */ 10587 if ((turn_on|turn_off) & (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) { 10588 ASSERT(!IS_IPMP(ill)); 10589 /* 10590 * It's possible the ill is part of an "anonymous" IPMP group 10591 * rather than a real group. In that case, there are no other 10592 * interfaces in the group and thus no need for us to call 10593 * ipmp_phyint_refresh_active(). 10594 */ 10595 if (IS_UNDER_IPMP(ill)) 10596 ipmp_phyint_refresh_active(phyi); 10597 } 10598 10599 if ((turn_on|turn_off) & ILLF_NOACCEPT) { 10600 /* 10601 * If the ILLF_NOACCEPT flag is changed, bring up all the 10602 * ipifs that were brought down. 10603 * 10604 * The routing sockets messages are sent as the result 10605 * of ill_up_ipifs(), further, SCTP's IPIF list was updated 10606 * as well. 10607 */ 10608 err = ill_up_ipifs(ill, q, mp); 10609 } else if ((flags & IFF_UP) && !(ipif->ipif_flags & IPIF_UP)) { 10610 /* 10611 * XXX ipif_up really does not know whether a phyint flags 10612 * was modified or not. So, it sends up information on 10613 * only one routing sockets message. As we don't bring up 10614 * the interface and also set PHYI_ flags simultaneously 10615 * it should be okay. 10616 */ 10617 err = ipif_up(ipif, q, mp); 10618 } else { 10619 /* 10620 * Make sure routing socket sees all changes to the flags. 10621 * ipif_up_done* handles this when we use ipif_up. 10622 */ 10623 if (phyint_flags_modified) { 10624 if (phyi->phyint_illv4 != NULL) { 10625 ip_rts_ifmsg(phyi->phyint_illv4-> 10626 ill_ipif, RTSQ_DEFAULT); 10627 } 10628 if (phyi->phyint_illv6 != NULL) { 10629 ip_rts_ifmsg(phyi->phyint_illv6-> 10630 ill_ipif, RTSQ_DEFAULT); 10631 } 10632 } else { 10633 ip_rts_ifmsg(ipif, RTSQ_DEFAULT); 10634 } 10635 /* 10636 * Update the flags in SCTP's IPIF list, ipif_up() will do 10637 * this in need_up case. 10638 */ 10639 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 10640 } 10641 10642 /* The default multicast interface might have changed */ 10643 ire_increment_multicast_generation(ill->ill_ipst, ill->ill_isv6); 10644 return (err); 10645 } 10646 10647 /* 10648 * Restart the flags operation now that the refcounts have dropped to zero. 
10649 */ 10650 /* ARGSUSED */ 10651 int 10652 ip_sioctl_flags_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10653 ip_ioctl_cmd_t *ipip, void *if_req) 10654 { 10655 uint64_t flags; 10656 struct ifreq *ifr = if_req; 10657 struct lifreq *lifr = if_req; 10658 uint64_t turn_on, turn_off; 10659 10660 ip1dbg(("ip_sioctl_flags_restart(%s:%u %p)\n", 10661 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10662 10663 if (ipip->ipi_cmd_type == IF_CMD) { 10664 /* cast to uint16_t prevents unwanted sign extension */ 10665 flags = (uint16_t)ifr->ifr_flags; 10666 } else { 10667 flags = lifr->lifr_flags; 10668 } 10669 10670 /* 10671 * If this function call is a result of the ILLF_NOACCEPT flag 10672 * change, do not call ipif_down_tail(). See ip_sioctl_flags(). 10673 */ 10674 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off); 10675 if (!((turn_on|turn_off) & ILLF_NOACCEPT)) 10676 (void) ipif_down_tail(ipif); 10677 10678 return (ip_sioctl_flags_tail(ipif, flags, q, mp)); 10679 } 10680 10681 /* 10682 * Can operate on either a module or a driver queue. 10683 */ 10684 /* ARGSUSED */ 10685 int 10686 ip_sioctl_get_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10687 ip_ioctl_cmd_t *ipip, void *if_req) 10688 { 10689 /* 10690 * Has the flags been set correctly till now ? 10691 */ 10692 ill_t *ill = ipif->ipif_ill; 10693 phyint_t *phyi = ill->ill_phyint; 10694 10695 ip1dbg(("ip_sioctl_get_flags(%s:%u %p)\n", 10696 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10697 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0); 10698 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0); 10699 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0); 10700 10701 /* 10702 * Need a lock since some flags can be set even when there are 10703 * references to the ipif. 10704 */ 10705 mutex_enter(&ill->ill_lock); 10706 if (ipip->ipi_cmd_type == IF_CMD) { 10707 struct ifreq *ifr = (struct ifreq *)if_req; 10708 10709 /* Get interface flags (low 16 only). */ 10710 ifr->ifr_flags = ((ipif->ipif_flags | 10711 ill->ill_flags | phyi->phyint_flags) & 0xffff); 10712 } else { 10713 struct lifreq *lifr = (struct lifreq *)if_req; 10714 10715 /* Get interface flags. */ 10716 lifr->lifr_flags = ipif->ipif_flags | 10717 ill->ill_flags | phyi->phyint_flags; 10718 } 10719 mutex_exit(&ill->ill_lock); 10720 return (0); 10721 } 10722 10723 /* 10724 * We allow the MTU to be set on an ILL, but not have it be different 10725 * for different IPIFs since we don't actually send packets on IPIFs. 10726 */ 10727 /* ARGSUSED */ 10728 int 10729 ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10730 ip_ioctl_cmd_t *ipip, void *if_req) 10731 { 10732 int mtu; 10733 int ip_min_mtu; 10734 struct ifreq *ifr; 10735 struct lifreq *lifr; 10736 ill_t *ill; 10737 10738 ip1dbg(("ip_sioctl_mtu(%s:%u %p)\n", ipif->ipif_ill->ill_name, 10739 ipif->ipif_id, (void *)ipif)); 10740 if (ipip->ipi_cmd_type == IF_CMD) { 10741 ifr = (struct ifreq *)if_req; 10742 mtu = ifr->ifr_metric; 10743 } else { 10744 lifr = (struct lifreq *)if_req; 10745 mtu = lifr->lifr_mtu; 10746 } 10747 /* Only allow for logical unit zero i.e. 
not on "bge0:17" */ 10748 if (ipif->ipif_id != 0) 10749 return (EINVAL); 10750 10751 ill = ipif->ipif_ill; 10752 if (ipif->ipif_isv6) 10753 ip_min_mtu = IPV6_MIN_MTU; 10754 else 10755 ip_min_mtu = IP_MIN_MTU; 10756 10757 mutex_enter(&ill->ill_lock); 10758 if (mtu > ill->ill_max_frag || mtu < ip_min_mtu) { 10759 mutex_exit(&ill->ill_lock); 10760 return (EINVAL); 10761 } 10762 /* 10763 * The dce and fragmentation code can handle changes to ill_mtu 10764 * concurrent with sending/fragmenting packets. 10765 */ 10766 ill->ill_mtu = mtu; 10767 ill->ill_flags |= ILLF_FIXEDMTU; 10768 mutex_exit(&ill->ill_lock); 10769 10770 /* 10771 * Make sure all dce_generation checks find out 10772 * that ill_mtu has changed. 10773 */ 10774 dce_increment_all_generations(ill->ill_isv6, ill->ill_ipst); 10775 10776 /* 10777 * Refresh IPMP meta-interface MTU if necessary. 10778 */ 10779 if (IS_UNDER_IPMP(ill)) 10780 ipmp_illgrp_refresh_mtu(ill->ill_grp); 10781 10782 /* Update the MTU in SCTP's list */ 10783 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 10784 return (0); 10785 } 10786 10787 /* Get interface MTU. */ 10788 /* ARGSUSED */ 10789 int 10790 ip_sioctl_get_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10791 ip_ioctl_cmd_t *ipip, void *if_req) 10792 { 10793 struct ifreq *ifr; 10794 struct lifreq *lifr; 10795 10796 ip1dbg(("ip_sioctl_get_mtu(%s:%u %p)\n", 10797 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10798 10799 /* 10800 * We allow a get on any logical interface even though the set 10801 * can only be done on logical unit 0. 10802 */ 10803 if (ipip->ipi_cmd_type == IF_CMD) { 10804 ifr = (struct ifreq *)if_req; 10805 ifr->ifr_metric = ipif->ipif_ill->ill_mtu; 10806 } else { 10807 lifr = (struct lifreq *)if_req; 10808 lifr->lifr_mtu = ipif->ipif_ill->ill_mtu; 10809 } 10810 return (0); 10811 } 10812 10813 /* Set interface broadcast address. */ 10814 /* ARGSUSED2 */ 10815 int 10816 ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10817 ip_ioctl_cmd_t *ipip, void *if_req) 10818 { 10819 ipaddr_t addr; 10820 ire_t *ire; 10821 ill_t *ill = ipif->ipif_ill; 10822 ip_stack_t *ipst = ill->ill_ipst; 10823 10824 ip1dbg(("ip_sioctl_brdaddr(%s:%u)\n", ill->ill_name, 10825 ipif->ipif_id)); 10826 10827 ASSERT(IAM_WRITER_IPIF(ipif)); 10828 if (!(ipif->ipif_flags & IPIF_BROADCAST)) 10829 return (EADDRNOTAVAIL); 10830 10831 ASSERT(!(ipif->ipif_isv6)); /* No IPv6 broadcast */ 10832 10833 if (sin->sin_family != AF_INET) 10834 return (EAFNOSUPPORT); 10835 10836 addr = sin->sin_addr.s_addr; 10837 10838 if (ipif->ipif_flags & IPIF_UP) { 10839 /* 10840 * If we are already up, make sure the new 10841 * broadcast address makes sense. If it does, 10842 * there should be an IRE for it already. 10843 */ 10844 ire = ire_ftable_lookup_v4(addr, 0, 0, IRE_BROADCAST, 10845 ill, ipif->ipif_zoneid, NULL, 10846 (MATCH_IRE_ILL | MATCH_IRE_TYPE), 0, ipst, NULL); 10847 if (ire == NULL) { 10848 return (EINVAL); 10849 } else { 10850 ire_refrele(ire); 10851 } 10852 } 10853 /* 10854 * Changing the broadcast addr for this ipif. Since the IRE_BROADCAST 10855 * needs to already exist we never need to change the set of 10856 * IRE_BROADCASTs when we are UP. 10857 */ 10858 if (addr != ipif->ipif_brd_addr) 10859 IN6_IPADDR_TO_V4MAPPED(addr, &ipif->ipif_v6brd_addr); 10860 10861 return (0); 10862 } 10863 10864 /* Get interface broadcast address. 
*/ 10865 /* ARGSUSED */ 10866 int 10867 ip_sioctl_get_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10868 ip_ioctl_cmd_t *ipip, void *if_req) 10869 { 10870 ip1dbg(("ip_sioctl_get_brdaddr(%s:%u %p)\n", 10871 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10872 if (!(ipif->ipif_flags & IPIF_BROADCAST)) 10873 return (EADDRNOTAVAIL); 10874 10875 /* IPIF_BROADCAST not possible with IPv6 */ 10876 ASSERT(!ipif->ipif_isv6); 10877 *sin = sin_null; 10878 sin->sin_family = AF_INET; 10879 sin->sin_addr.s_addr = ipif->ipif_brd_addr; 10880 return (0); 10881 } 10882 10883 /* 10884 * This routine is called to handle the SIOCS*IFNETMASK IOCTL. 10885 */ 10886 /* ARGSUSED */ 10887 int 10888 ip_sioctl_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10889 ip_ioctl_cmd_t *ipip, void *if_req) 10890 { 10891 int err = 0; 10892 in6_addr_t v6mask; 10893 10894 ip1dbg(("ip_sioctl_netmask(%s:%u %p)\n", 10895 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10896 10897 ASSERT(IAM_WRITER_IPIF(ipif)); 10898 10899 if (ipif->ipif_isv6) { 10900 sin6_t *sin6; 10901 10902 if (sin->sin_family != AF_INET6) 10903 return (EAFNOSUPPORT); 10904 10905 sin6 = (sin6_t *)sin; 10906 v6mask = sin6->sin6_addr; 10907 } else { 10908 ipaddr_t mask; 10909 10910 if (sin->sin_family != AF_INET) 10911 return (EAFNOSUPPORT); 10912 10913 mask = sin->sin_addr.s_addr; 10914 if (!ip_contiguous_mask(ntohl(mask))) 10915 return (ENOTSUP); 10916 V4MASK_TO_V6(mask, v6mask); 10917 } 10918 10919 /* 10920 * No big deal if the interface isn't already up, or the mask 10921 * isn't really changing, or this is pt-pt. 10922 */ 10923 if (!(ipif->ipif_flags & IPIF_UP) || 10924 IN6_ARE_ADDR_EQUAL(&v6mask, &ipif->ipif_v6net_mask) || 10925 (ipif->ipif_flags & IPIF_POINTOPOINT)) { 10926 ipif->ipif_v6net_mask = v6mask; 10927 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 10928 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 10929 ipif->ipif_v6net_mask, 10930 ipif->ipif_v6subnet); 10931 } 10932 return (0); 10933 } 10934 /* 10935 * Make sure we have valid net and subnet broadcast ire's 10936 * for the old netmask, if needed by other logical interfaces. 10937 */ 10938 err = ipif_logical_down(ipif, q, mp); 10939 if (err == EINPROGRESS) 10940 return (err); 10941 (void) ipif_down_tail(ipif); 10942 err = ip_sioctl_netmask_tail(ipif, sin, q, mp); 10943 return (err); 10944 } 10945 10946 static int 10947 ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp) 10948 { 10949 in6_addr_t v6mask; 10950 int err = 0; 10951 10952 ip1dbg(("ip_sioctl_netmask_tail(%s:%u %p)\n", 10953 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10954 10955 if (ipif->ipif_isv6) { 10956 sin6_t *sin6; 10957 10958 sin6 = (sin6_t *)sin; 10959 v6mask = sin6->sin6_addr; 10960 } else { 10961 ipaddr_t mask; 10962 10963 mask = sin->sin_addr.s_addr; 10964 V4MASK_TO_V6(mask, v6mask); 10965 } 10966 10967 ipif->ipif_v6net_mask = v6mask; 10968 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 10969 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 10970 ipif->ipif_v6subnet); 10971 } 10972 err = ipif_up(ipif, q, mp); 10973 10974 if (err == 0 || err == EINPROGRESS) { 10975 /* 10976 * The interface must be DL_BOUND if this packet has to 10977 * go out on the wire. Since we only go through a logical 10978 * down and are bound with the driver during an internal 10979 * down/up that is satisfied. 10980 */ 10981 if (!ipif->ipif_isv6 && ipif->ipif_ill->ill_wq != NULL) { 10982 /* Potentially broadcast an address mask reply. 
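 * (an ICMP Address Mask Reply, as defined by RFC 950)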
*/ 10983 ipif_mask_reply(ipif); 10984 } 10985 } 10986 return (err); 10987 } 10988 10989 /* ARGSUSED */ 10990 int 10991 ip_sioctl_netmask_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10992 ip_ioctl_cmd_t *ipip, void *if_req) 10993 { 10994 ip1dbg(("ip_sioctl_netmask_restart(%s:%u %p)\n", 10995 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10996 (void) ipif_down_tail(ipif); 10997 return (ip_sioctl_netmask_tail(ipif, sin, q, mp)); 10998 } 10999 11000 /* Get interface net mask. */ 11001 /* ARGSUSED */ 11002 int 11003 ip_sioctl_get_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11004 ip_ioctl_cmd_t *ipip, void *if_req) 11005 { 11006 struct lifreq *lifr = (struct lifreq *)if_req; 11007 struct sockaddr_in6 *sin6 = (sin6_t *)sin; 11008 11009 ip1dbg(("ip_sioctl_get_netmask(%s:%u %p)\n", 11010 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11011 11012 /* 11013 * net mask can't change since we have a reference to the ipif. 11014 */ 11015 if (ipif->ipif_isv6) { 11016 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 11017 *sin6 = sin6_null; 11018 sin6->sin6_family = AF_INET6; 11019 sin6->sin6_addr = ipif->ipif_v6net_mask; 11020 lifr->lifr_addrlen = 11021 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 11022 } else { 11023 *sin = sin_null; 11024 sin->sin_family = AF_INET; 11025 sin->sin_addr.s_addr = ipif->ipif_net_mask; 11026 if (ipip->ipi_cmd_type == LIF_CMD) { 11027 lifr->lifr_addrlen = 11028 ip_mask_to_plen(ipif->ipif_net_mask); 11029 } 11030 } 11031 return (0); 11032 } 11033 11034 /* ARGSUSED */ 11035 int 11036 ip_sioctl_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11037 ip_ioctl_cmd_t *ipip, void *if_req) 11038 { 11039 ip1dbg(("ip_sioctl_metric(%s:%u %p)\n", 11040 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11041 11042 /* 11043 * Since no applications should ever be setting metrics on underlying 11044 * interfaces, we explicitly fail to smoke 'em out. 11045 */ 11046 if (IS_UNDER_IPMP(ipif->ipif_ill)) 11047 return (EINVAL); 11048 11049 /* 11050 * Set interface metric. We don't use this for 11051 * anything but we keep track of it in case it is 11052 * important to routing applications or such. 11053 */ 11054 if (ipip->ipi_cmd_type == IF_CMD) { 11055 struct ifreq *ifr; 11056 11057 ifr = (struct ifreq *)if_req; 11058 ipif->ipif_ill->ill_metric = ifr->ifr_metric; 11059 } else { 11060 struct lifreq *lifr; 11061 11062 lifr = (struct lifreq *)if_req; 11063 ipif->ipif_ill->ill_metric = lifr->lifr_metric; 11064 } 11065 return (0); 11066 } 11067 11068 /* ARGSUSED */ 11069 int 11070 ip_sioctl_get_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11071 ip_ioctl_cmd_t *ipip, void *if_req) 11072 { 11073 /* Get interface metric. */ 11074 ip1dbg(("ip_sioctl_get_metric(%s:%u %p)\n", 11075 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11076 11077 if (ipip->ipi_cmd_type == IF_CMD) { 11078 struct ifreq *ifr; 11079 11080 ifr = (struct ifreq *)if_req; 11081 ifr->ifr_metric = ipif->ipif_ill->ill_metric; 11082 } else { 11083 struct lifreq *lifr; 11084 11085 lifr = (struct lifreq *)if_req; 11086 lifr->lifr_metric = ipif->ipif_ill->ill_metric; 11087 } 11088 11089 return (0); 11090 } 11091 11092 /* ARGSUSED */ 11093 int 11094 ip_sioctl_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11095 ip_ioctl_cmd_t *ipip, void *if_req) 11096 { 11097 int arp_muxid; 11098 11099 ip1dbg(("ip_sioctl_muxid(%s:%u %p)\n", 11100 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11101 /* 11102 * Set the muxid returned from I_PLINK. 
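 * ifconfig passes down the ids it got from I_PLINK; they are saved
 * on the ill so the IP and ARP streams can be located again at
 * I_PUNLINK (unplumb) time.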
11103 */ 11104 if (ipip->ipi_cmd_type == IF_CMD) { 11105 struct ifreq *ifr = (struct ifreq *)if_req; 11106 11107 ipif->ipif_ill->ill_muxid = ifr->ifr_ip_muxid; 11108 arp_muxid = ifr->ifr_arp_muxid; 11109 } else { 11110 struct lifreq *lifr = (struct lifreq *)if_req; 11111 11112 ipif->ipif_ill->ill_muxid = lifr->lifr_ip_muxid; 11113 arp_muxid = lifr->lifr_arp_muxid; 11114 } 11115 arl_set_muxid(ipif->ipif_ill, arp_muxid); 11116 return (0); 11117 } 11118 11119 /* ARGSUSED */ 11120 int 11121 ip_sioctl_get_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11122 ip_ioctl_cmd_t *ipip, void *if_req) 11123 { 11124 int arp_muxid = 0; 11125 11126 ip1dbg(("ip_sioctl_get_muxid(%s:%u %p)\n", 11127 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11128 /* 11129 * Get the muxid saved in ill for I_PUNLINK. 11130 */ 11131 arp_muxid = arl_get_muxid(ipif->ipif_ill); 11132 if (ipip->ipi_cmd_type == IF_CMD) { 11133 struct ifreq *ifr = (struct ifreq *)if_req; 11134 11135 ifr->ifr_ip_muxid = ipif->ipif_ill->ill_muxid; 11136 ifr->ifr_arp_muxid = arp_muxid; 11137 } else { 11138 struct lifreq *lifr = (struct lifreq *)if_req; 11139 11140 lifr->lifr_ip_muxid = ipif->ipif_ill->ill_muxid; 11141 lifr->lifr_arp_muxid = arp_muxid; 11142 } 11143 return (0); 11144 } 11145 11146 /* 11147 * Set the subnet prefix. Does not modify the broadcast address. 11148 */ 11149 /* ARGSUSED */ 11150 int 11151 ip_sioctl_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11152 ip_ioctl_cmd_t *ipip, void *if_req) 11153 { 11154 int err = 0; 11155 in6_addr_t v6addr; 11156 in6_addr_t v6mask; 11157 boolean_t need_up = B_FALSE; 11158 int addrlen; 11159 11160 ip1dbg(("ip_sioctl_subnet(%s:%u %p)\n", 11161 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11162 11163 ASSERT(IAM_WRITER_IPIF(ipif)); 11164 addrlen = ((struct lifreq *)if_req)->lifr_addrlen; 11165 11166 if (ipif->ipif_isv6) { 11167 sin6_t *sin6; 11168 11169 if (sin->sin_family != AF_INET6) 11170 return (EAFNOSUPPORT); 11171 11172 sin6 = (sin6_t *)sin; 11173 v6addr = sin6->sin6_addr; 11174 if (!ip_remote_addr_ok_v6(&v6addr, &ipv6_all_ones)) 11175 return (EADDRNOTAVAIL); 11176 } else { 11177 ipaddr_t addr; 11178 11179 if (sin->sin_family != AF_INET) 11180 return (EAFNOSUPPORT); 11181 11182 addr = sin->sin_addr.s_addr; 11183 if (!ip_addr_ok_v4(addr, 0xFFFFFFFF)) 11184 return (EADDRNOTAVAIL); 11185 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 11186 /* Add 96 bits */ 11187 addrlen += IPV6_ABITS - IP_ABITS; 11188 } 11189 11190 if (ip_plen_to_mask_v6(addrlen, &v6mask) == NULL) 11191 return (EINVAL); 11192 11193 /* Check if bits in the address is set past the mask */ 11194 if (!V6_MASK_EQ(v6addr, v6mask, v6addr)) 11195 return (EINVAL); 11196 11197 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6subnet, &v6addr) && 11198 IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6net_mask, &v6mask)) 11199 return (0); /* No change */ 11200 11201 if (ipif->ipif_flags & IPIF_UP) { 11202 /* 11203 * If the interface is already marked up, 11204 * we call ipif_down which will take care 11205 * of ditching any IREs that have been set 11206 * up based on the old interface address. 
11207 */ 11208 err = ipif_logical_down(ipif, q, mp); 11209 if (err == EINPROGRESS) 11210 return (err); 11211 (void) ipif_down_tail(ipif); 11212 need_up = B_TRUE; 11213 } 11214 11215 err = ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, need_up); 11216 return (err); 11217 } 11218 11219 static int 11220 ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t v6addr, in6_addr_t v6mask, 11221 queue_t *q, mblk_t *mp, boolean_t need_up) 11222 { 11223 ill_t *ill = ipif->ipif_ill; 11224 int err = 0; 11225 11226 ip1dbg(("ip_sioctl_subnet_tail(%s:%u %p)\n", 11227 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11228 11229 /* Set the new address. */ 11230 mutex_enter(&ill->ill_lock); 11231 ipif->ipif_v6net_mask = v6mask; 11232 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 11233 V6_MASK_COPY(v6addr, ipif->ipif_v6net_mask, 11234 ipif->ipif_v6subnet); 11235 } 11236 mutex_exit(&ill->ill_lock); 11237 11238 if (need_up) { 11239 /* 11240 * Now bring the interface back up. If this 11241 * is the only IPIF for the ILL, ipif_up 11242 * will have to re-bind to the device, so 11243 * we may get back EINPROGRESS, in which 11244 * case, this IOCTL will get completed in 11245 * ip_rput_dlpi when we see the DL_BIND_ACK. 11246 */ 11247 err = ipif_up(ipif, q, mp); 11248 if (err == EINPROGRESS) 11249 return (err); 11250 } 11251 return (err); 11252 } 11253 11254 /* ARGSUSED */ 11255 int 11256 ip_sioctl_subnet_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11257 ip_ioctl_cmd_t *ipip, void *if_req) 11258 { 11259 int addrlen; 11260 in6_addr_t v6addr; 11261 in6_addr_t v6mask; 11262 struct lifreq *lifr = (struct lifreq *)if_req; 11263 11264 ip1dbg(("ip_sioctl_subnet_restart(%s:%u %p)\n", 11265 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11266 (void) ipif_down_tail(ipif); 11267 11268 addrlen = lifr->lifr_addrlen; 11269 if (ipif->ipif_isv6) { 11270 sin6_t *sin6; 11271 11272 sin6 = (sin6_t *)sin; 11273 v6addr = sin6->sin6_addr; 11274 } else { 11275 ipaddr_t addr; 11276 11277 addr = sin->sin_addr.s_addr; 11278 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 11279 addrlen += IPV6_ABITS - IP_ABITS; 11280 } 11281 (void) ip_plen_to_mask_v6(addrlen, &v6mask); 11282 11283 return (ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, B_TRUE)); 11284 } 11285 11286 /* ARGSUSED */ 11287 int 11288 ip_sioctl_get_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11289 ip_ioctl_cmd_t *ipip, void *if_req) 11290 { 11291 struct lifreq *lifr = (struct lifreq *)if_req; 11292 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sin; 11293 11294 ip1dbg(("ip_sioctl_get_subnet(%s:%u %p)\n", 11295 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11296 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 11297 11298 if (ipif->ipif_isv6) { 11299 *sin6 = sin6_null; 11300 sin6->sin6_family = AF_INET6; 11301 sin6->sin6_addr = ipif->ipif_v6subnet; 11302 lifr->lifr_addrlen = 11303 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 11304 } else { 11305 *sin = sin_null; 11306 sin->sin_family = AF_INET; 11307 sin->sin_addr.s_addr = ipif->ipif_subnet; 11308 lifr->lifr_addrlen = ip_mask_to_plen(ipif->ipif_net_mask); 11309 } 11310 return (0); 11311 } 11312 11313 /* 11314 * Set the IPv6 address token. 
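 * The token is the interface-identifier (low-order) portion of an
 * IPv6 address; it is combined with advertised prefixes to form
 * addresses. For example (illustrative values only), a 64-bit token
 * ::8:20ff:fe12:3456 combined with the advertised prefix
 * 2001:db8:1:2::/64 yields the address 2001:db8:1:2:8:20ff:fe12:3456.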
11315 */ 11316 /* ARGSUSED */ 11317 int 11318 ip_sioctl_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11319 ip_ioctl_cmd_t *ipi, void *if_req) 11320 { 11321 ill_t *ill = ipif->ipif_ill; 11322 int err; 11323 in6_addr_t v6addr; 11324 in6_addr_t v6mask; 11325 boolean_t need_up = B_FALSE; 11326 int i; 11327 sin6_t *sin6 = (sin6_t *)sin; 11328 struct lifreq *lifr = (struct lifreq *)if_req; 11329 int addrlen; 11330 11331 ip1dbg(("ip_sioctl_token(%s:%u %p)\n", 11332 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11333 ASSERT(IAM_WRITER_IPIF(ipif)); 11334 11335 addrlen = lifr->lifr_addrlen; 11336 /* Only allow for logical unit zero i.e. not on "le0:17" */ 11337 if (ipif->ipif_id != 0) 11338 return (EINVAL); 11339 11340 if (!ipif->ipif_isv6) 11341 return (EINVAL); 11342 11343 if (addrlen > IPV6_ABITS) 11344 return (EINVAL); 11345 11346 v6addr = sin6->sin6_addr; 11347 11348 /* 11349 * The length of the token is the length from the end. To get 11350 * the proper mask for this, compute the mask of the bits not 11351 * in the token; ie. the prefix, and then xor to get the mask. 11352 */ 11353 if (ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask) == NULL) 11354 return (EINVAL); 11355 for (i = 0; i < 4; i++) { 11356 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 11357 } 11358 11359 if (V6_MASK_EQ(v6addr, v6mask, ill->ill_token) && 11360 ill->ill_token_length == addrlen) 11361 return (0); /* No change */ 11362 11363 if (ipif->ipif_flags & IPIF_UP) { 11364 err = ipif_logical_down(ipif, q, mp); 11365 if (err == EINPROGRESS) 11366 return (err); 11367 (void) ipif_down_tail(ipif); 11368 need_up = B_TRUE; 11369 } 11370 err = ip_sioctl_token_tail(ipif, sin6, addrlen, q, mp, need_up); 11371 return (err); 11372 } 11373 11374 static int 11375 ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, queue_t *q, 11376 mblk_t *mp, boolean_t need_up) 11377 { 11378 in6_addr_t v6addr; 11379 in6_addr_t v6mask; 11380 ill_t *ill = ipif->ipif_ill; 11381 int i; 11382 int err = 0; 11383 11384 ip1dbg(("ip_sioctl_token_tail(%s:%u %p)\n", 11385 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11386 v6addr = sin6->sin6_addr; 11387 /* 11388 * The length of the token is the length from the end. To get 11389 * the proper mask for this, compute the mask of the bits not 11390 * in the token; ie. the prefix, and then xor to get the mask. 11391 */ 11392 (void) ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask); 11393 for (i = 0; i < 4; i++) 11394 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 11395 11396 mutex_enter(&ill->ill_lock); 11397 V6_MASK_COPY(v6addr, v6mask, ill->ill_token); 11398 ill->ill_token_length = addrlen; 11399 ill->ill_manual_token = 1; 11400 11401 /* Reconfigure the link-local address based on this new token */ 11402 ipif_setlinklocal(ill->ill_ipif); 11403 11404 mutex_exit(&ill->ill_lock); 11405 11406 if (need_up) { 11407 /* 11408 * Now bring the interface back up. If this 11409 * is the only IPIF for the ILL, ipif_up 11410 * will have to re-bind to the device, so 11411 * we may get back EINPROGRESS, in which 11412 * case, this IOCTL will get completed in 11413 * ip_rput_dlpi when we see the DL_BIND_ACK. 
11414 */ 11415 err = ipif_up(ipif, q, mp); 11416 if (err == EINPROGRESS) 11417 return (err); 11418 } 11419 return (err); 11420 } 11421 11422 /* ARGSUSED */ 11423 int 11424 ip_sioctl_get_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11425 ip_ioctl_cmd_t *ipi, void *if_req) 11426 { 11427 ill_t *ill; 11428 sin6_t *sin6 = (sin6_t *)sin; 11429 struct lifreq *lifr = (struct lifreq *)if_req; 11430 11431 ip1dbg(("ip_sioctl_get_token(%s:%u %p)\n", 11432 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11433 if (ipif->ipif_id != 0) 11434 return (EINVAL); 11435 11436 ill = ipif->ipif_ill; 11437 if (!ill->ill_isv6) 11438 return (ENXIO); 11439 11440 *sin6 = sin6_null; 11441 sin6->sin6_family = AF_INET6; 11442 ASSERT(!IN6_IS_ADDR_V4MAPPED(&ill->ill_token)); 11443 sin6->sin6_addr = ill->ill_token; 11444 lifr->lifr_addrlen = ill->ill_token_length; 11445 return (0); 11446 } 11447 11448 /* 11449 * Set (hardware) link specific information that might override 11450 * what was acquired through the DL_INFO_ACK. 11451 */ 11452 /* ARGSUSED */ 11453 int 11454 ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11455 ip_ioctl_cmd_t *ipi, void *if_req) 11456 { 11457 ill_t *ill = ipif->ipif_ill; 11458 int ip_min_mtu; 11459 struct lifreq *lifr = (struct lifreq *)if_req; 11460 lif_ifinfo_req_t *lir; 11461 11462 ip1dbg(("ip_sioctl_lnkinfo(%s:%u %p)\n", 11463 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11464 lir = &lifr->lifr_ifinfo; 11465 ASSERT(IAM_WRITER_IPIF(ipif)); 11466 11467 /* Only allow for logical unit zero i.e. not on "bge0:17" */ 11468 if (ipif->ipif_id != 0) 11469 return (EINVAL); 11470 11471 /* Set interface MTU. */ 11472 if (ipif->ipif_isv6) 11473 ip_min_mtu = IPV6_MIN_MTU; 11474 else 11475 ip_min_mtu = IP_MIN_MTU; 11476 11477 /* 11478 * Verify values before we set anything. Allow zero to 11479 * mean unspecified. 11480 * 11481 * XXX We should be able to set the user-defined lir_mtu to some value 11482 * that is greater than ill_current_frag but less than ill_max_frag- the 11483 * ill_max_frag value tells us the max MTU that can be handled by the 11484 * datalink, whereas the ill_current_frag is dynamically computed for 11485 * some link-types like tunnels, based on the tunnel PMTU. However, 11486 * since there is currently no way of distinguishing between 11487 * administratively fixed link mtu values (e.g., those set via 11488 * /sbin/dladm) and dynamically discovered MTUs (e.g., those discovered 11489 * for tunnels) we conservatively choose the ill_current_frag as the 11490 * upper-bound. 11491 */ 11492 if (lir->lir_maxmtu != 0 && 11493 (lir->lir_maxmtu > ill->ill_current_frag || 11494 lir->lir_maxmtu < ip_min_mtu)) 11495 return (EINVAL); 11496 if (lir->lir_reachtime != 0 && 11497 lir->lir_reachtime > ND_MAX_REACHTIME) 11498 return (EINVAL); 11499 if (lir->lir_reachretrans != 0 && 11500 lir->lir_reachretrans > ND_MAX_REACHRETRANSTIME) 11501 return (EINVAL); 11502 11503 mutex_enter(&ill->ill_lock); 11504 /* 11505 * The dce and fragmentation code can handle changes to ill_mtu 11506 * concurrent with sending/fragmenting packets. 
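 * Hence the updates below are made while holding only ill_lock,
 * without quiescing outbound traffic first.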
11507 */ 11508 if (lir->lir_maxmtu != 0) 11509 ill->ill_user_mtu = lir->lir_maxmtu; 11510 11511 if (lir->lir_reachtime != 0) 11512 ill->ill_reachable_time = lir->lir_reachtime; 11513 11514 if (lir->lir_reachretrans != 0) 11515 ill->ill_reachable_retrans_time = lir->lir_reachretrans; 11516 11517 ill->ill_max_hops = lir->lir_maxhops; 11518 ill->ill_max_buf = ND_MAX_Q; 11519 if (!(ill->ill_flags & ILLF_FIXEDMTU) && ill->ill_user_mtu != 0) { 11520 /* 11521 * ill_mtu is the actual interface MTU, obtained as the min 11522 * of user-configured mtu and the value announced by the 11523 * driver (via DL_NOTE_SDU_SIZE/DL_INFO_ACK). Note that since 11524 * we have already made the choice of requiring 11525 * ill_user_mtu < ill_current_frag by the time we get here, 11526 * the ill_mtu effectively gets assigned to the ill_user_mtu 11527 * here. 11528 */ 11529 ill->ill_mtu = MIN(ill->ill_current_frag, ill->ill_user_mtu); 11530 } 11531 mutex_exit(&ill->ill_lock); 11532 11533 /* 11534 * Make sure all dce_generation checks find out 11535 * that ill_mtu has changed. 11536 */ 11537 if (!(ill->ill_flags & ILLF_FIXEDMTU) && (lir->lir_maxmtu != 0)) 11538 dce_increment_all_generations(ill->ill_isv6, ill->ill_ipst); 11539 11540 /* 11541 * Refresh IPMP meta-interface MTU if necessary. 11542 */ 11543 if (IS_UNDER_IPMP(ill)) 11544 ipmp_illgrp_refresh_mtu(ill->ill_grp); 11545 11546 return (0); 11547 } 11548 11549 /* ARGSUSED */ 11550 int 11551 ip_sioctl_get_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11552 ip_ioctl_cmd_t *ipi, void *if_req) 11553 { 11554 struct lif_ifinfo_req *lir; 11555 ill_t *ill = ipif->ipif_ill; 11556 11557 ip1dbg(("ip_sioctl_get_lnkinfo(%s:%u %p)\n", 11558 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11559 if (ipif->ipif_id != 0) 11560 return (EINVAL); 11561 11562 lir = &((struct lifreq *)if_req)->lifr_ifinfo; 11563 lir->lir_maxhops = ill->ill_max_hops; 11564 lir->lir_reachtime = ill->ill_reachable_time; 11565 lir->lir_reachretrans = ill->ill_reachable_retrans_time; 11566 lir->lir_maxmtu = ill->ill_mtu; 11567 11568 return (0); 11569 } 11570 11571 /* 11572 * Return best guess as to the subnet mask for the specified address. 11573 * Based on the subnet masks for all the configured interfaces. 11574 * 11575 * We end up returning a zero mask in the case of default, multicast or 11576 * experimental. 11577 */ 11578 static ipaddr_t 11579 ip_subnet_mask(ipaddr_t addr, ipif_t **ipifp, ip_stack_t *ipst) 11580 { 11581 ipaddr_t net_mask; 11582 ill_t *ill; 11583 ipif_t *ipif; 11584 ill_walk_context_t ctx; 11585 ipif_t *fallback_ipif = NULL; 11586 11587 net_mask = ip_net_mask(addr); 11588 if (net_mask == 0) { 11589 *ipifp = NULL; 11590 return (0); 11591 } 11592 11593 /* Let's check to see if this is maybe a local subnet route. */ 11594 /* this function only applies to IPv4 interfaces */ 11595 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 11596 ill = ILL_START_WALK_V4(&ctx, ipst); 11597 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 11598 mutex_enter(&ill->ill_lock); 11599 for (ipif = ill->ill_ipif; ipif != NULL; 11600 ipif = ipif->ipif_next) { 11601 if (IPIF_IS_CONDEMNED(ipif)) 11602 continue; 11603 if (!(ipif->ipif_flags & IPIF_UP)) 11604 continue; 11605 if ((ipif->ipif_subnet & net_mask) == 11606 (addr & net_mask)) { 11607 /* 11608 * Don't trust pt-pt interfaces if there are 11609 * other interfaces. 
11610 */ 11611 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 11612 if (fallback_ipif == NULL) { 11613 ipif_refhold_locked(ipif); 11614 fallback_ipif = ipif; 11615 } 11616 continue; 11617 } 11618 11619 /* 11620 * Fine. Just assume the same net mask as the 11621 * directly attached subnet interface is using. 11622 */ 11623 ipif_refhold_locked(ipif); 11624 mutex_exit(&ill->ill_lock); 11625 rw_exit(&ipst->ips_ill_g_lock); 11626 if (fallback_ipif != NULL) 11627 ipif_refrele(fallback_ipif); 11628 *ipifp = ipif; 11629 return (ipif->ipif_net_mask); 11630 } 11631 } 11632 mutex_exit(&ill->ill_lock); 11633 } 11634 rw_exit(&ipst->ips_ill_g_lock); 11635 11636 *ipifp = fallback_ipif; 11637 return ((fallback_ipif != NULL) ? 11638 fallback_ipif->ipif_net_mask : net_mask); 11639 } 11640 11641 /* 11642 * ip_sioctl_copyin_setup calls ip_wput_ioctl to process the IP_IOCTL ioctl. 11643 */ 11644 static void 11645 ip_wput_ioctl(queue_t *q, mblk_t *mp) 11646 { 11647 IOCP iocp; 11648 ipft_t *ipft; 11649 ipllc_t *ipllc; 11650 mblk_t *mp1; 11651 cred_t *cr; 11652 int error = 0; 11653 conn_t *connp; 11654 11655 ip1dbg(("ip_wput_ioctl")); 11656 iocp = (IOCP)mp->b_rptr; 11657 mp1 = mp->b_cont; 11658 if (mp1 == NULL) { 11659 iocp->ioc_error = EINVAL; 11660 mp->b_datap->db_type = M_IOCNAK; 11661 iocp->ioc_count = 0; 11662 qreply(q, mp); 11663 return; 11664 } 11665 11666 /* 11667 * These IOCTLs provide various control capabilities to 11668 * upstream agents such as ULPs and processes. There 11669 * are currently two such IOCTLs implemented. They 11670 * are used by TCP to provide update information for 11671 * existing IREs and to forcibly delete an IRE for a 11672 * host that is not responding, thereby forcing an 11673 * attempt at a new route. 11674 */ 11675 iocp->ioc_error = EINVAL; 11676 if (!pullupmsg(mp1, sizeof (ipllc->ipllc_cmd))) 11677 goto done; 11678 11679 ipllc = (ipllc_t *)mp1->b_rptr; 11680 for (ipft = ip_ioctl_ftbl; ipft->ipft_pfi; ipft++) { 11681 if (ipllc->ipllc_cmd == ipft->ipft_cmd) 11682 break; 11683 } 11684 /* 11685 * prefer credential from mblk over ioctl; 11686 * see ip_sioctl_copyin_setup 11687 */ 11688 cr = msg_getcred(mp, NULL); 11689 if (cr == NULL) 11690 cr = iocp->ioc_cr; 11691 11692 /* 11693 * Refhold the conn in case the request gets queued up in some lookup 11694 */ 11695 ASSERT(CONN_Q(q)); 11696 connp = Q_TO_CONN(q); 11697 CONN_INC_REF(connp); 11698 if (ipft->ipft_pfi && 11699 ((mp1->b_wptr - mp1->b_rptr) >= ipft->ipft_min_size || 11700 pullupmsg(mp1, ipft->ipft_min_size))) { 11701 error = (*ipft->ipft_pfi)(q, 11702 (ipft->ipft_flags & IPFT_F_SELF_REPLY) ? mp : mp1, cr); 11703 } 11704 if (ipft->ipft_flags & IPFT_F_SELF_REPLY) { 11705 /* 11706 * CONN_OPER_PENDING_DONE happens in the function called 11707 * through ipft_pfi above. 11708 */ 11709 return; 11710 } 11711 11712 CONN_OPER_PENDING_DONE(connp); 11713 if (ipft->ipft_flags & IPFT_F_NO_REPLY) { 11714 freemsg(mp); 11715 return; 11716 } 11717 iocp->ioc_error = error; 11718 11719 done: 11720 mp->b_datap->db_type = M_IOCACK; 11721 if (iocp->ioc_error) 11722 iocp->ioc_count = 0; 11723 qreply(q, mp); 11724 } 11725 11726 /* 11727 * Assign a unique id for the ipif. This is used by sctp_addr.c 11728 * Note: remove if sctp_addr.c is redone to not shadow ill/ipif data structures. 11729 */ 11730 static void 11731 ipif_assign_seqid(ipif_t *ipif) 11732 { 11733 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 11734 11735 ipif->ipif_seqid = atomic_add_64_nv(&ipst->ips_ipif_g_seqid, 1); 11736 } 11737 11738 /* 11739 * Clone the contents of `sipif' to `dipif'. 
Requires that both ipifs are 11740 * administratively down (i.e., no DAD), of the same type, and locked. Note 11741 * that the clone is complete -- including the seqid -- and the expectation is 11742 * that the caller will either free or overwrite `sipif' before it's unlocked. 11743 */ 11744 static void 11745 ipif_clone(const ipif_t *sipif, ipif_t *dipif) 11746 { 11747 ASSERT(MUTEX_HELD(&sipif->ipif_ill->ill_lock)); 11748 ASSERT(MUTEX_HELD(&dipif->ipif_ill->ill_lock)); 11749 ASSERT(!(sipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE))); 11750 ASSERT(!(dipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE))); 11751 ASSERT(sipif->ipif_ire_type == dipif->ipif_ire_type); 11752 11753 dipif->ipif_flags = sipif->ipif_flags; 11754 dipif->ipif_zoneid = sipif->ipif_zoneid; 11755 dipif->ipif_v6subnet = sipif->ipif_v6subnet; 11756 dipif->ipif_v6lcl_addr = sipif->ipif_v6lcl_addr; 11757 dipif->ipif_v6net_mask = sipif->ipif_v6net_mask; 11758 dipif->ipif_v6brd_addr = sipif->ipif_v6brd_addr; 11759 dipif->ipif_v6pp_dst_addr = sipif->ipif_v6pp_dst_addr; 11760 11761 /* 11762 * As per the comment atop the function, we assume that these sipif 11763 * fields will be changed before sipif is unlocked. 11764 */ 11765 dipif->ipif_seqid = sipif->ipif_seqid; 11766 dipif->ipif_state_flags = sipif->ipif_state_flags; 11767 } 11768 11769 /* 11770 * Transfer the contents of `sipif' to `dipif', and then free (if `virgipif' 11771 * is NULL) or overwrite `sipif' with `virgipif', which must be a virgin 11772 * (unreferenced) ipif. Also, if `sipif' is used by the current xop, then 11773 * transfer the xop to `dipif'. Requires that all ipifs are administratively 11774 * down (i.e., no DAD), of the same type, and unlocked. 11775 */ 11776 static void 11777 ipif_transfer(ipif_t *sipif, ipif_t *dipif, ipif_t *virgipif) 11778 { 11779 ipsq_t *ipsq = sipif->ipif_ill->ill_phyint->phyint_ipsq; 11780 ipxop_t *ipx = ipsq->ipsq_xop; 11781 11782 ASSERT(sipif != dipif); 11783 ASSERT(sipif != virgipif); 11784 11785 /* 11786 * Grab all of the locks that protect the ipif in a defined order. 11787 */ 11788 GRAB_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill); 11789 11790 ipif_clone(sipif, dipif); 11791 if (virgipif != NULL) { 11792 ipif_clone(virgipif, sipif); 11793 mi_free(virgipif); 11794 } 11795 11796 RELEASE_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill); 11797 11798 /* 11799 * Transfer ownership of the current xop, if necessary. 11800 */ 11801 if (ipx->ipx_current_ipif == sipif) { 11802 ASSERT(ipx->ipx_pending_ipif == NULL); 11803 mutex_enter(&ipx->ipx_lock); 11804 ipx->ipx_current_ipif = dipif; 11805 mutex_exit(&ipx->ipx_lock); 11806 } 11807 11808 if (virgipif == NULL) 11809 mi_free(sipif); 11810 } 11811 11812 /* 11813 * checks if: 11814 * - <ill_name>:<ipif_id> is at most LIFNAMSIZ - 1 and 11815 * - logical interface is within the allowed range 11816 */ 11817 static int 11818 is_lifname_valid(ill_t *ill, unsigned int ipif_id) 11819 { 11820 if (snprintf(NULL, 0, "%s:%d", ill->ill_name, ipif_id) >= LIFNAMSIZ) 11821 return (ENAMETOOLONG); 11822 11823 if (ipif_id >= ill->ill_ipst->ips_ip_addrs_per_if) 11824 return (ERANGE); 11825 return (0); 11826 } 11827 11828 /* 11829 * Insert the ipif, so that the list of ipifs on the ill will be sorted 11830 * with respect to ipif_id. Note that an ipif with an ipif_id of -1 will 11831 * be inserted into the first space available in the list. The value of 11832 * ipif_id will then be set to the appropriate value for its position. 
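 * For example (illustrative), if the ill already has ipifs with ids
 * 0, 1 and 3, an ipif passed in with an ipif_id of -1 is inserted
 * between 1 and 3 and assigned ipif_id 2.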
11833 */
11834 static int
11835 ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock)
11836 {
11837 ill_t *ill;
11838 ipif_t *tipif;
11839 ipif_t **tipifp;
11840 int id, err;
11841 ip_stack_t *ipst;
11842
11843 ASSERT(ipif->ipif_ill->ill_net_type == IRE_LOOPBACK ||
11844 IAM_WRITER_IPIF(ipif));
11845
11846 ill = ipif->ipif_ill;
11847 ASSERT(ill != NULL);
11848 ipst = ill->ill_ipst;
11849
11850 /*
11851 * In the case of lo0:0 we already hold the ill_g_lock.
11852 * ill_lookup_on_name (acquires ill_g_lock) -> ipif_allocate ->
11853 * ipif_insert.
11854 */
11855 if (acquire_g_lock)
11856 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
11857 mutex_enter(&ill->ill_lock);
11858 id = ipif->ipif_id;
11859 tipifp = &(ill->ill_ipif);
11860 if (id == -1) { /* need to find a real id */
11861 id = 0;
11862 while ((tipif = *tipifp) != NULL) {
11863 ASSERT(tipif->ipif_id >= id);
11864 if (tipif->ipif_id != id)
11865 break; /* non-consecutive id */
11866 id++;
11867 tipifp = &(tipif->ipif_next);
11868 }
11869 if ((err = is_lifname_valid(ill, id)) != 0) {
11870 mutex_exit(&ill->ill_lock);
11871 if (acquire_g_lock)
11872 rw_exit(&ipst->ips_ill_g_lock);
11873 return (err);
11874 }
11875 ipif->ipif_id = id; /* assign new id */
11876 } else if ((err = is_lifname_valid(ill, id)) == 0) {
11877 /* we have a real id; insert ipif in the right place */
11878 while ((tipif = *tipifp) != NULL) {
11879 ASSERT(tipif->ipif_id != id);
11880 if (tipif->ipif_id > id)
11881 break; /* found correct location */
11882 tipifp = &(tipif->ipif_next);
11883 }
11884 } else {
11885 mutex_exit(&ill->ill_lock);
11886 if (acquire_g_lock)
11887 rw_exit(&ipst->ips_ill_g_lock);
11888 return (err);
11889 }
11890
11891 ASSERT(tipifp != &(ill->ill_ipif) || id == 0);
11892
11893 ipif->ipif_next = tipif;
11894 *tipifp = ipif;
11895 mutex_exit(&ill->ill_lock);
11896 if (acquire_g_lock)
11897 rw_exit(&ipst->ips_ill_g_lock);
11898
11899 return (0);
11900 }
11901
11902 static void
11903 ipif_remove(ipif_t *ipif)
11904 {
11905 ipif_t **ipifp;
11906 ill_t *ill = ipif->ipif_ill;
11907
11908 ASSERT(RW_WRITE_HELD(&ill->ill_ipst->ips_ill_g_lock));
11909
11910 mutex_enter(&ill->ill_lock);
11911 ipifp = &ill->ill_ipif;
11912 for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) {
11913 if (*ipifp == ipif) {
11914 *ipifp = ipif->ipif_next;
11915 break;
11916 }
11917 }
11918 mutex_exit(&ill->ill_lock);
11919 }
11920
11921 /*
11922 * Allocate and initialize a new interface control structure. (Always
11923 * called as writer.)
11924 * When ipif_allocate() is called from ip_ll_subnet_defaults, the ill
11925 * is not part of the global linked list of ills. ipif_seqid is unique
11926 * in the system and to preserve the uniqueness, it is assigned only
11927 * when ill becomes part of the global list. At that point ill will
11928 * have a name. If it doesn't get assigned here, it will get assigned
11929 * in ipif_set_values() as part of SIOCSLIFNAME processing.
11930 * Additionally, if we come here from ip_ll_subnet_defaults, we don't set
11931 * the interface flags or any other information from the DL_INFO_ACK for
11932 * DL_STYLE2 drivers (initialize == B_FALSE), since we won't have them at
11933 * this point. The flags etc. will be set in ip_ll_subnet_defaults when the
11934 * second DL_INFO_ACK comes in from the driver.
11935 */
11936 static ipif_t *
11937 ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize,
11938 boolean_t insert, int *errorp)
11939 {
11940 int err;
11941 ipif_t *ipif;
11942 ip_stack_t *ipst = ill->ill_ipst;
11943
11944 ip1dbg(("ipif_allocate(%s:%d ill %p)\n",
11945 ill->ill_name, id, (void *)ill));
11946 ASSERT(ire_type == IRE_LOOPBACK || IAM_WRITER_ILL(ill));
11947
11948 if (errorp != NULL)
11949 *errorp = 0;
11950
11951 if ((ipif = mi_alloc(sizeof (ipif_t), BPRI_MED)) == NULL) {
11952 if (errorp != NULL)
11953 *errorp = ENOMEM;
11954 return (NULL);
11955 }
11956 *ipif = ipif_zero; /* start clean */
11957
11958 ipif->ipif_ill = ill;
11959 ipif->ipif_id = id; /* could be -1 */
11960 /*
11961 * Inherit the zoneid from the ill; for the shared stack instance
11962 * this is always the global zone
11963 */
11964 ipif->ipif_zoneid = ill->ill_zoneid;
11965
11966 ipif->ipif_refcnt = 0;
11967
11968 if (insert) {
11969 if ((err = ipif_insert(ipif, ire_type != IRE_LOOPBACK)) != 0) {
11970 mi_free(ipif);
11971 if (errorp != NULL)
11972 *errorp = err;
11973 return (NULL);
11974 }
11975 /* -1 id should have been replaced by real id */
11976 id = ipif->ipif_id;
11977 ASSERT(id >= 0);
11978 }
11979
11980 if (ill->ill_name[0] != '\0')
11981 ipif_assign_seqid(ipif);
11982
11983 /*
11984 * If this is the zeroth ipif on the IPMP ill, create the illgrp
11985 * (which must not exist yet because the zeroth ipif is created once
11986 * per ill). However, do not link it to the ipmp_grp_t until
11987 * I_PLINK is called; see ip_sioctl_plink_ipmp() for details.
11988 */
11989 if (id == 0 && IS_IPMP(ill)) {
11990 if (ipmp_illgrp_create(ill) == NULL) {
11991 if (insert) {
11992 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
11993 ipif_remove(ipif);
11994 rw_exit(&ipst->ips_ill_g_lock);
11995 }
11996 mi_free(ipif);
11997 if (errorp != NULL)
11998 *errorp = ENOMEM;
11999 return (NULL);
12000 }
12001 }
12002
12003 /*
12004 * We grab ill_lock to protect the flag changes. The ipif is still
12005 * not up and can't be looked up until the ioctl completes and the
12006 * IPIF_CHANGING flag is cleared.
12007 */
12008 mutex_enter(&ill->ill_lock);
12009
12010 ipif->ipif_ire_type = ire_type;
12011
12012 if (ipif->ipif_isv6) {
12013 ill->ill_flags |= ILLF_IPV6;
12014 } else {
12015 ipaddr_t inaddr_any = INADDR_ANY;
12016
12017 ill->ill_flags |= ILLF_IPV4;
12018
12019 /* Keep the IN6_IS_ADDR_V4MAPPED assertions happy */
12020 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
12021 &ipif->ipif_v6lcl_addr);
12022 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
12023 &ipif->ipif_v6subnet);
12024 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
12025 &ipif->ipif_v6net_mask);
12026 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
12027 &ipif->ipif_v6brd_addr);
12028 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
12029 &ipif->ipif_v6pp_dst_addr);
12030 }
12031
12032 /*
12033 * Don't set the interface flags etc. now, will do it in
12034 * ip_ll_subnet_defaults.
12035 */
12036 if (!initialize)
12037 goto out;
12038
12039 /*
12040 * NOTE: The IPMP meta-interface is special-cased because it starts
12041 * with no underlying interfaces (and thus an unknown broadcast
12042 * address length), but all interfaces that can be placed into an IPMP
12043 * group are required to be broadcast-capable.
12044 */
12045 if (ill->ill_bcast_addr_length != 0 || IS_IPMP(ill)) {
12046 /*
12047 * Later detect lack of DLPI driver multicast capability by
12048 * catching DL_ENABMULTI_REQ errors in ip_rput_dlpi().
12049 */ 12050 ill->ill_flags |= ILLF_MULTICAST; 12051 if (!ipif->ipif_isv6) 12052 ipif->ipif_flags |= IPIF_BROADCAST; 12053 } else { 12054 if (ill->ill_net_type != IRE_LOOPBACK) { 12055 if (ipif->ipif_isv6) 12056 /* 12057 * Note: xresolv interfaces will eventually need 12058 * NOARP set here as well, but that will require 12059 * those external resolvers to have some 12060 * knowledge of that flag and act appropriately. 12061 * Not to be changed at present. 12062 */ 12063 ill->ill_flags |= ILLF_NONUD; 12064 else 12065 ill->ill_flags |= ILLF_NOARP; 12066 } 12067 if (ill->ill_phys_addr_length == 0) { 12068 if (IS_VNI(ill)) { 12069 ipif->ipif_flags |= IPIF_NOXMIT; 12070 } else { 12071 /* pt-pt supports multicast. */ 12072 ill->ill_flags |= ILLF_MULTICAST; 12073 if (ill->ill_net_type != IRE_LOOPBACK) 12074 ipif->ipif_flags |= IPIF_POINTOPOINT; 12075 } 12076 } 12077 } 12078 out: 12079 mutex_exit(&ill->ill_lock); 12080 return (ipif); 12081 } 12082 12083 /* 12084 * Remove the neighbor cache entries associated with this logical 12085 * interface. 12086 */ 12087 int 12088 ipif_arp_down(ipif_t *ipif) 12089 { 12090 ill_t *ill = ipif->ipif_ill; 12091 int err = 0; 12092 12093 ip1dbg(("ipif_arp_down(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 12094 ASSERT(IAM_WRITER_IPIF(ipif)); 12095 12096 DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_down", 12097 ill_t *, ill, ipif_t *, ipif); 12098 ipif_nce_down(ipif); 12099 12100 /* 12101 * If this is the last ipif that is going down and there are no 12102 * duplicate addresses we may yet attempt to re-probe, then we need to 12103 * clean up ARP completely. 12104 */ 12105 if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 && 12106 !ill->ill_logical_down && ill->ill_net_type == IRE_IF_RESOLVER) { 12107 /* 12108 * If this was the last ipif on an IPMP interface, purge any 12109 * static ARP entries associated with it. 12110 */ 12111 if (IS_IPMP(ill)) 12112 ipmp_illgrp_refresh_arpent(ill->ill_grp); 12113 12114 /* UNBIND, DETACH */ 12115 err = arp_ll_down(ill); 12116 } 12117 12118 return (err); 12119 } 12120 12121 /* 12122 * Get the resolver set up for a new IP address. (Always called as writer.) 12123 * Called both for IPv4 and IPv6 interfaces, though it only does some 12124 * basic DAD related initialization for IPv6. Honors ILLF_NOARP. 12125 * 12126 * The enumerated value res_act tunes the behavior: 12127 * * Res_act_initial: set up all the resolver structures for a new 12128 * IP address. 12129 * * Res_act_defend: tell ARP that it needs to send a single gratuitous 12130 * ARP message in defense of the address. 12131 * * Res_act_rebind: tell ARP to change the hardware address for an IP 12132 * address (and issue gratuitous ARPs). Used by ipmp_ill_bind_ipif(). 12133 * 12134 * Returns zero on success, or an errno upon failure. 12135 */ 12136 int 12137 ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) 12138 { 12139 ill_t *ill = ipif->ipif_ill; 12140 int err; 12141 boolean_t was_dup; 12142 12143 ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n", 12144 ill->ill_name, ipif->ipif_id, (uint_t)ipif->ipif_flags)); 12145 ASSERT(IAM_WRITER_IPIF(ipif)); 12146 12147 was_dup = B_FALSE; 12148 if (res_act == Res_act_initial) { 12149 ipif->ipif_addr_ready = 0; 12150 /* 12151 * We're bringing an interface up here. There's no way that we 12152 * should need to shut down ARP now. 
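 * Instead, clear any stale IPIF_DUPLICATE state below so that the
 * address can be probed afresh.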
12153 */ 12154 mutex_enter(&ill->ill_lock); 12155 if (ipif->ipif_flags & IPIF_DUPLICATE) { 12156 ipif->ipif_flags &= ~IPIF_DUPLICATE; 12157 ill->ill_ipif_dup_count--; 12158 was_dup = B_TRUE; 12159 } 12160 mutex_exit(&ill->ill_lock); 12161 } 12162 if (ipif->ipif_recovery_id != 0) 12163 (void) untimeout(ipif->ipif_recovery_id); 12164 ipif->ipif_recovery_id = 0; 12165 if (ill->ill_net_type != IRE_IF_RESOLVER) { 12166 ipif->ipif_addr_ready = 1; 12167 return (0); 12168 } 12169 /* NDP will set the ipif_addr_ready flag when it's ready */ 12170 if (ill->ill_isv6) 12171 return (0); 12172 12173 err = ipif_arp_up(ipif, res_act, was_dup); 12174 return (err); 12175 } 12176 12177 /* 12178 * This routine restarts IPv4/IPv6 duplicate address detection (DAD) 12179 * when a link has just gone back up. 12180 */ 12181 static void 12182 ipif_nce_start_dad(ipif_t *ipif) 12183 { 12184 ncec_t *ncec; 12185 ill_t *ill = ipif->ipif_ill; 12186 boolean_t isv6 = ill->ill_isv6; 12187 12188 if (isv6) { 12189 ncec = ncec_lookup_illgrp_v6(ipif->ipif_ill, 12190 &ipif->ipif_v6lcl_addr); 12191 } else { 12192 ipaddr_t v4addr; 12193 12194 if (ill->ill_net_type != IRE_IF_RESOLVER || 12195 (ipif->ipif_flags & IPIF_UNNUMBERED) || 12196 ipif->ipif_lcl_addr == INADDR_ANY) { 12197 /* 12198 * If we can't contact ARP for some reason, 12199 * that's not really a problem. Just send 12200 * out the routing socket notification that 12201 * DAD completion would have done, and continue. 12202 */ 12203 ipif_mask_reply(ipif); 12204 ipif_up_notify(ipif); 12205 ipif->ipif_addr_ready = 1; 12206 return; 12207 } 12208 12209 IN6_V4MAPPED_TO_IPADDR(&ipif->ipif_v6lcl_addr, v4addr); 12210 ncec = ncec_lookup_illgrp_v4(ipif->ipif_ill, &v4addr); 12211 } 12212 12213 if (ncec == NULL) { 12214 ip1dbg(("couldn't find ncec for ipif %p leaving !ready\n", 12215 (void *)ipif)); 12216 return; 12217 } 12218 if (!nce_restart_dad(ncec)) { 12219 /* 12220 * If we can't restart DAD for some reason, that's not really a 12221 * problem. Just send out the routing socket notification that 12222 * DAD completion would have done, and continue. 12223 */ 12224 ipif_up_notify(ipif); 12225 ipif->ipif_addr_ready = 1; 12226 } 12227 ncec_refrele(ncec); 12228 } 12229 12230 /* 12231 * Restart duplicate address detection on all interfaces on the given ill. 12232 * 12233 * This is called when an interface transitions from down to up 12234 * (DL_NOTE_LINK_UP) or up to down (DL_NOTE_LINK_DOWN). 12235 * 12236 * Note that since the underlying physical link has transitioned, we must cause 12237 * at least one routing socket message to be sent here, either via DAD 12238 * completion or just by default on the first ipif. (If we don't do this, then 12239 * in.mpathd will see long delays when doing link-based failure recovery.) 12240 */ 12241 void 12242 ill_restart_dad(ill_t *ill, boolean_t went_up) 12243 { 12244 ipif_t *ipif; 12245 12246 if (ill == NULL) 12247 return; 12248 12249 /* 12250 * If layer two doesn't support duplicate address detection, then just 12251 * send the routing socket message now and be done with it. 12252 */ 12253 if (!ill->ill_isv6 && arp_no_defense) { 12254 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); 12255 return; 12256 } 12257 12258 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 12259 if (went_up) { 12260 12261 if (ipif->ipif_flags & IPIF_UP) { 12262 ipif_nce_start_dad(ipif); 12263 } else if (ipif->ipif_flags & IPIF_DUPLICATE) { 12264 /* 12265 * kick off the bring-up process now. 
12266 */ 12267 ipif_do_recovery(ipif); 12268 } else { 12269 /* 12270 * Unfortunately, the first ipif is "special" 12271 * and represents the underlying ill in the 12272 * routing socket messages. Thus, when this 12273 * one ipif is down, we must still notify so 12274 * that the user knows the IFF_RUNNING status 12275 * change. (If the first ipif is up, then 12276 * we'll handle eventual routing socket 12277 * notification via DAD completion.) 12278 */ 12279 if (ipif == ill->ill_ipif) { 12280 ip_rts_ifmsg(ill->ill_ipif, 12281 RTSQ_DEFAULT); 12282 } 12283 } 12284 } else { 12285 /* 12286 * After link down, we'll need to send a new routing 12287 * message when the link comes back, so clear 12288 * ipif_addr_ready. 12289 */ 12290 ipif->ipif_addr_ready = 0; 12291 } 12292 } 12293 12294 /* 12295 * If we've torn down links, then notify the user right away. 12296 */ 12297 if (!went_up) 12298 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); 12299 } 12300 12301 static void 12302 ipsq_delete(ipsq_t *ipsq) 12303 { 12304 ipxop_t *ipx = ipsq->ipsq_xop; 12305 12306 ipsq->ipsq_ipst = NULL; 12307 ASSERT(ipsq->ipsq_phyint == NULL); 12308 ASSERT(ipsq->ipsq_xop != NULL); 12309 ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipx->ipx_mphead == NULL); 12310 ASSERT(ipx->ipx_pending_mp == NULL); 12311 kmem_free(ipsq, sizeof (ipsq_t)); 12312 } 12313 12314 static int 12315 ill_up_ipifs_on_ill(ill_t *ill, queue_t *q, mblk_t *mp) 12316 { 12317 int err = 0; 12318 ipif_t *ipif; 12319 12320 if (ill == NULL) 12321 return (0); 12322 12323 ASSERT(IAM_WRITER_ILL(ill)); 12324 ill->ill_up_ipifs = B_TRUE; 12325 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 12326 if (ipif->ipif_was_up) { 12327 if (!(ipif->ipif_flags & IPIF_UP)) 12328 err = ipif_up(ipif, q, mp); 12329 ipif->ipif_was_up = B_FALSE; 12330 if (err != 0) { 12331 ASSERT(err == EINPROGRESS); 12332 return (err); 12333 } 12334 } 12335 } 12336 ill->ill_up_ipifs = B_FALSE; 12337 return (0); 12338 } 12339 12340 /* 12341 * This function is called to bring up all the ipifs that were up before 12342 * bringing the ill down via ill_down_ipifs(). 12343 */ 12344 int 12345 ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp) 12346 { 12347 int err; 12348 12349 ASSERT(IAM_WRITER_ILL(ill)); 12350 12351 if (ill->ill_replumbing) { 12352 ill->ill_replumbing = 0; 12353 /* 12354 * Send down REPLUMB_DONE notification followed by the 12355 * BIND_REQ on the arp stream. 12356 */ 12357 if (!ill->ill_isv6) 12358 arp_send_replumb_conf(ill); 12359 } 12360 err = ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv4, q, mp); 12361 if (err != 0) 12362 return (err); 12363 12364 return (ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv6, q, mp)); 12365 } 12366 12367 /* 12368 * Bring down any IPIF_UP ipifs on ill. If "logical" is B_TRUE, we bring 12369 * down the ipifs without sending DL_UNBIND_REQ to the driver. 12370 */ 12371 static void 12372 ill_down_ipifs(ill_t *ill, boolean_t logical) 12373 { 12374 ipif_t *ipif; 12375 12376 ASSERT(IAM_WRITER_ILL(ill)); 12377 12378 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 12379 /* 12380 * We go through the ipif_down logic even if the ipif 12381 * is already down, since routes can be added based 12382 * on down ipifs. Going through ipif_down once again 12383 * will delete any IREs created based on these routes. 
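 * (For example, an interface route added while the ipif was down
 * would otherwise leave stale IREs behind.)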
12384 */ 12385 if (ipif->ipif_flags & IPIF_UP) 12386 ipif->ipif_was_up = B_TRUE; 12387 12388 if (logical) { 12389 (void) ipif_logical_down(ipif, NULL, NULL); 12390 ipif_non_duplicate(ipif); 12391 (void) ipif_down_tail(ipif); 12392 } else { 12393 (void) ipif_down(ipif, NULL, NULL); 12394 } 12395 } 12396 } 12397 12398 /* 12399 * Redo source address selection. This makes IXAF_VERIFY_SOURCE take 12400 * a look again at valid source addresses. 12401 * This should be called each time after the set of source addresses has been 12402 * changed. 12403 */ 12404 void 12405 ip_update_source_selection(ip_stack_t *ipst) 12406 { 12407 /* We skip past SRC_GENERATION_VERIFY */ 12408 if (atomic_add_32_nv(&ipst->ips_src_generation, 1) == 12409 SRC_GENERATION_VERIFY) 12410 atomic_add_32(&ipst->ips_src_generation, 1); 12411 } 12412 12413 /* 12414 * Finish the group join started in ip_sioctl_groupname(). 12415 */ 12416 /* ARGSUSED */ 12417 static void 12418 ip_join_illgrps(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy) 12419 { 12420 ill_t *ill = q->q_ptr; 12421 phyint_t *phyi = ill->ill_phyint; 12422 ipmp_grp_t *grp = phyi->phyint_grp; 12423 ip_stack_t *ipst = ill->ill_ipst; 12424 12425 /* IS_UNDER_IPMP() won't work until ipmp_ill_join_illgrp() is called */ 12426 ASSERT(!IS_IPMP(ill) && grp != NULL); 12427 ASSERT(IAM_WRITER_IPSQ(ipsq)); 12428 12429 if (phyi->phyint_illv4 != NULL) { 12430 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 12431 VERIFY(grp->gr_pendv4-- > 0); 12432 rw_exit(&ipst->ips_ipmp_lock); 12433 ipmp_ill_join_illgrp(phyi->phyint_illv4, grp->gr_v4); 12434 } 12435 if (phyi->phyint_illv6 != NULL) { 12436 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 12437 VERIFY(grp->gr_pendv6-- > 0); 12438 rw_exit(&ipst->ips_ipmp_lock); 12439 ipmp_ill_join_illgrp(phyi->phyint_illv6, grp->gr_v6); 12440 } 12441 freemsg(mp); 12442 } 12443 12444 /* 12445 * Process an SIOCSLIFGROUPNAME request. 12446 */ 12447 /* ARGSUSED */ 12448 int 12449 ip_sioctl_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12450 ip_ioctl_cmd_t *ipip, void *ifreq) 12451 { 12452 struct lifreq *lifr = ifreq; 12453 ill_t *ill = ipif->ipif_ill; 12454 ip_stack_t *ipst = ill->ill_ipst; 12455 phyint_t *phyi = ill->ill_phyint; 12456 ipmp_grp_t *grp = phyi->phyint_grp; 12457 mblk_t *ipsq_mp; 12458 int err = 0; 12459 12460 /* 12461 * Note that phyint_grp can only change here, where we're exclusive. 12462 */ 12463 ASSERT(IAM_WRITER_ILL(ill)); 12464 12465 if (ipif->ipif_id != 0 || ill->ill_usesrc_grp_next != NULL || 12466 (phyi->phyint_flags & PHYI_VIRTUAL)) 12467 return (EINVAL); 12468 12469 lifr->lifr_groupname[LIFGRNAMSIZ - 1] = '\0'; 12470 12471 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 12472 12473 /* 12474 * If the name hasn't changed, there's nothing to do. 12475 */ 12476 if (grp != NULL && strcmp(grp->gr_name, lifr->lifr_groupname) == 0) 12477 goto unlock; 12478 12479 /* 12480 * Handle requests to rename an IPMP meta-interface. 12481 * 12482 * Note that creation of the IPMP meta-interface is handled in 12483 * userland through the standard plumbing sequence. As part of the 12484 * plumbing the IPMP meta-interface, its initial groupname is set to 12485 * the name of the interface (see ipif_set_values_tail()). 12486 */ 12487 if (IS_IPMP(ill)) { 12488 err = ipmp_grp_rename(grp, lifr->lifr_groupname); 12489 goto unlock; 12490 } 12491 12492 /* 12493 * Handle requests to add or remove an IP interface from a group. 
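 * For example (illustrative), `ifconfig ce0 group ipmp0' arrives here
 * with lifr_groupname set to "ipmp0", while `ifconfig ce0 group ""'
 * arrives with an empty lifr_groupname and requests removal.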
12494 */ 12495 if (lifr->lifr_groupname[0] != '\0') { /* add */ 12496 /* 12497 * Moves are handled by first removing the interface from 12498 * its existing group, and then adding it to another group. 12499 * So, fail if it's already in a group. 12500 */ 12501 if (IS_UNDER_IPMP(ill)) { 12502 err = EALREADY; 12503 goto unlock; 12504 } 12505 12506 grp = ipmp_grp_lookup(lifr->lifr_groupname, ipst); 12507 if (grp == NULL) { 12508 err = ENOENT; 12509 goto unlock; 12510 } 12511 12512 /* 12513 * Check if the phyint and its ills are suitable for 12514 * inclusion into the group. 12515 */ 12516 if ((err = ipmp_grp_vet_phyint(grp, phyi)) != 0) 12517 goto unlock; 12518 12519 /* 12520 * Checks pass; join the group, and enqueue the remaining 12521 * illgrp joins for when we've become part of the group xop 12522 * and are exclusive across its IPSQs. Since qwriter_ip() 12523 * requires an mblk_t to scribble on, and since `mp' will be 12524 * freed as part of completing the ioctl, allocate another. 12525 */ 12526 if ((ipsq_mp = allocb(0, BPRI_MED)) == NULL) { 12527 err = ENOMEM; 12528 goto unlock; 12529 } 12530 12531 /* 12532 * Before we drop ipmp_lock, bump gr_pend* to ensure that the 12533 * IPMP meta-interface ills needed by `phyi' cannot go away 12534 * before ip_join_illgrps() is called back. See the comments 12535 * in ip_sioctl_plink_ipmp() for more. 12536 */ 12537 if (phyi->phyint_illv4 != NULL) 12538 grp->gr_pendv4++; 12539 if (phyi->phyint_illv6 != NULL) 12540 grp->gr_pendv6++; 12541 12542 rw_exit(&ipst->ips_ipmp_lock); 12543 12544 ipmp_phyint_join_grp(phyi, grp); 12545 ill_refhold(ill); 12546 qwriter_ip(ill, ill->ill_rq, ipsq_mp, ip_join_illgrps, 12547 SWITCH_OP, B_FALSE); 12548 return (0); 12549 } else { 12550 /* 12551 * Request to remove the interface from a group. If the 12552 * interface is not in a group, this trivially succeeds. 12553 */ 12554 rw_exit(&ipst->ips_ipmp_lock); 12555 if (IS_UNDER_IPMP(ill)) 12556 ipmp_phyint_leave_grp(phyi); 12557 return (0); 12558 } 12559 unlock: 12560 rw_exit(&ipst->ips_ipmp_lock); 12561 return (err); 12562 } 12563 12564 /* 12565 * Process an SIOCGLIFBINDING request. 12566 */ 12567 /* ARGSUSED */ 12568 int 12569 ip_sioctl_get_binding(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12570 ip_ioctl_cmd_t *ipip, void *ifreq) 12571 { 12572 ill_t *ill; 12573 struct lifreq *lifr = ifreq; 12574 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 12575 12576 if (!IS_IPMP(ipif->ipif_ill)) 12577 return (EINVAL); 12578 12579 rw_enter(&ipst->ips_ipmp_lock, RW_READER); 12580 if ((ill = ipif->ipif_bound_ill) == NULL) 12581 lifr->lifr_binding[0] = '\0'; 12582 else 12583 (void) strlcpy(lifr->lifr_binding, ill->ill_name, LIFNAMSIZ); 12584 rw_exit(&ipst->ips_ipmp_lock); 12585 return (0); 12586 } 12587 12588 /* 12589 * Process an SIOCGLIFGROUPNAME request. 12590 */ 12591 /* ARGSUSED */ 12592 int 12593 ip_sioctl_get_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12594 ip_ioctl_cmd_t *ipip, void *ifreq) 12595 { 12596 ipmp_grp_t *grp; 12597 struct lifreq *lifr = ifreq; 12598 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 12599 12600 rw_enter(&ipst->ips_ipmp_lock, RW_READER); 12601 if ((grp = ipif->ipif_ill->ill_phyint->phyint_grp) == NULL) 12602 lifr->lifr_groupname[0] = '\0'; 12603 else 12604 (void) strlcpy(lifr->lifr_groupname, grp->gr_name, LIFGRNAMSIZ); 12605 rw_exit(&ipst->ips_ipmp_lock); 12606 return (0); 12607 } 12608 12609 /* 12610 * Process an SIOCGLIFGROUPINFO request. 
12611 */
12612 /* ARGSUSED */
12613 int
12614 ip_sioctl_groupinfo(ipif_t *dummy_ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12615 ip_ioctl_cmd_t *ipip, void *dummy)
12616 {
12617 ipmp_grp_t *grp;
12618 lifgroupinfo_t *lifgr;
12619 ip_stack_t *ipst = CONNQ_TO_IPST(q);
12620
12621 /* ip_wput_nondata() verified mp->b_cont->b_cont */
12622 lifgr = (lifgroupinfo_t *)mp->b_cont->b_cont->b_rptr;
12623 lifgr->gi_grname[LIFGRNAMSIZ - 1] = '\0';
12624
12625 rw_enter(&ipst->ips_ipmp_lock, RW_READER);
12626 if ((grp = ipmp_grp_lookup(lifgr->gi_grname, ipst)) == NULL) {
12627 rw_exit(&ipst->ips_ipmp_lock);
12628 return (ENOENT);
12629 }
12630 ipmp_grp_info(grp, lifgr);
12631 rw_exit(&ipst->ips_ipmp_lock);
12632 return (0);
12633 }
12634
12635 static void
12636 ill_dl_down(ill_t *ill)
12637 {
12638 DTRACE_PROBE2(ill__downup, char *, "ill_dl_down", ill_t *, ill);
12639
12640 /*
12641 * The ill is down; unbind but stay attached since we're still
12642 * associated with a PPA. If we have negotiated DLPI capabilities
12643 * with the data link service provider (IDS_OK) then reset them.
12644 * The interval between unbinding and rebinding is potentially
12645 * unbounded hence we cannot assume things will be the same.
12646 * The DLPI capabilities will be probed again when the data link
12647 * is brought up.
12648 */
12649 mblk_t *mp = ill->ill_unbind_mp;
12650
12651 ip1dbg(("ill_dl_down(%s)\n", ill->ill_name));
12652
12653 if (!ill->ill_replumbing) {
12654 /* Free all ilms for this ill */
12655 update_conn_ill(ill, ill->ill_ipst);
12656 } else {
12657 ill_leave_multicast(ill);
12658 }
12659
12660 ill->ill_unbind_mp = NULL;
12661 if (mp != NULL) {
12662 ip1dbg(("ill_dl_down: %s (%u) for %s\n",
12663 dl_primstr(*(int *)mp->b_rptr), *(int *)mp->b_rptr,
12664 ill->ill_name));
12665 mutex_enter(&ill->ill_lock);
12666 ill->ill_state_flags |= ILL_DL_UNBIND_IN_PROGRESS;
12667 mutex_exit(&ill->ill_lock);
12668 /*
12669 * ip_rput does not pass up normal (M_PROTO) DLPI messages
12670 * after ILL_CONDEMNED is set. So in the unplumb case, we call
12671 * ill_capability_dld_disable right away. If this is not
12672 * an unplumb operation then the disable happens on receipt of
12673 * the capab ack via ip_rput_dlpi_writer ->
12674 * ill_capability_ack_thr. In both cases the order of
12675 * the operations seen by DLD is capability disable followed
12676 * by DL_UNBIND. Also the DLD capability disable needs a
12677 * cv_wait'able context.
12678 */ 12679 if (ill->ill_state_flags & ILL_CONDEMNED) 12680 ill_capability_dld_disable(ill); 12681 ill_capability_reset(ill, B_FALSE); 12682 ill_dlpi_send(ill, mp); 12683 } 12684 mutex_enter(&ill->ill_lock); 12685 ill->ill_dl_up = 0; 12686 ill_nic_event_dispatch(ill, 0, NE_DOWN, NULL, 0); 12687 mutex_exit(&ill->ill_lock); 12688 } 12689 12690 void 12691 ill_dlpi_dispatch(ill_t *ill, mblk_t *mp) 12692 { 12693 union DL_primitives *dlp; 12694 t_uscalar_t prim; 12695 boolean_t waitack = B_FALSE; 12696 12697 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); 12698 12699 dlp = (union DL_primitives *)mp->b_rptr; 12700 prim = dlp->dl_primitive; 12701 12702 ip1dbg(("ill_dlpi_dispatch: sending %s (%u) to %s\n", 12703 dl_primstr(prim), prim, ill->ill_name)); 12704 12705 switch (prim) { 12706 case DL_PHYS_ADDR_REQ: 12707 { 12708 dl_phys_addr_req_t *dlpap = (dl_phys_addr_req_t *)mp->b_rptr; 12709 ill->ill_phys_addr_pend = dlpap->dl_addr_type; 12710 break; 12711 } 12712 case DL_BIND_REQ: 12713 mutex_enter(&ill->ill_lock); 12714 ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS; 12715 mutex_exit(&ill->ill_lock); 12716 break; 12717 } 12718 12719 /* 12720 * Except for the ACKs for the M_PCPROTO messages, all other ACKs 12721 * are dropped by ip_rput() if ILL_CONDEMNED is set. Therefore 12722 * we only wait for the ACK of the DL_UNBIND_REQ. 12723 */ 12724 mutex_enter(&ill->ill_lock); 12725 if (!(ill->ill_state_flags & ILL_CONDEMNED) || 12726 (prim == DL_UNBIND_REQ)) { 12727 ill->ill_dlpi_pending = prim; 12728 waitack = B_TRUE; 12729 } 12730 12731 mutex_exit(&ill->ill_lock); 12732 DTRACE_PROBE3(ill__dlpi, char *, "ill_dlpi_dispatch", 12733 char *, dl_primstr(prim), ill_t *, ill); 12734 putnext(ill->ill_wq, mp); 12735 12736 /* 12737 * There is no ack for DL_NOTIFY_CONF messages 12738 */ 12739 if (waitack && prim == DL_NOTIFY_CONF) 12740 ill_dlpi_done(ill, prim); 12741 } 12742 12743 /* 12744 * Helper function for ill_dlpi_send(). 12745 */ 12746 /* ARGSUSED */ 12747 static void 12748 ill_dlpi_send_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) 12749 { 12750 ill_dlpi_send(q->q_ptr, mp); 12751 } 12752 12753 /* 12754 * Send a DLPI control message to the driver but make sure there 12755 * is only one outstanding message. Uses ill_dlpi_pending to tell 12756 * when it must queue. ip_rput_dlpi_writer calls ill_dlpi_done() 12757 * when an ACK or a NAK is received to process the next queued message. 12758 */ 12759 void 12760 ill_dlpi_send(ill_t *ill, mblk_t *mp) 12761 { 12762 mblk_t **mpp; 12763 12764 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); 12765 12766 /* 12767 * To ensure that any DLPI requests for current exclusive operation 12768 * are always completely sent before any DLPI messages for other 12769 * operations, require writer access before enqueuing. 12770 */ 12771 if (!IAM_WRITER_ILL(ill)) { 12772 ill_refhold(ill); 12773 /* qwriter_ip() does the ill_refrele() */ 12774 qwriter_ip(ill, ill->ill_wq, mp, ill_dlpi_send_writer, 12775 NEW_OP, B_TRUE); 12776 return; 12777 } 12778 12779 mutex_enter(&ill->ill_lock); 12780 if (ill->ill_dlpi_pending != DL_PRIM_INVAL) { 12781 /* Must queue message. 
Tail insertion */ 12782 mpp = &ill->ill_dlpi_deferred; 12783 while (*mpp != NULL) 12784 mpp = &((*mpp)->b_next); 12785 12786 ip1dbg(("ill_dlpi_send: deferring request for %s " 12787 "while %s pending\n", ill->ill_name, 12788 dl_primstr(ill->ill_dlpi_pending))); 12789 12790 *mpp = mp; 12791 mutex_exit(&ill->ill_lock); 12792 return; 12793 } 12794 mutex_exit(&ill->ill_lock); 12795 ill_dlpi_dispatch(ill, mp); 12796 } 12797 12798 void 12799 ill_capability_send(ill_t *ill, mblk_t *mp) 12800 { 12801 ill->ill_capab_pending_cnt++; 12802 ill_dlpi_send(ill, mp); 12803 } 12804 12805 void 12806 ill_capability_done(ill_t *ill) 12807 { 12808 ASSERT(ill->ill_capab_pending_cnt != 0); 12809 12810 ill_dlpi_done(ill, DL_CAPABILITY_REQ); 12811 12812 ill->ill_capab_pending_cnt--; 12813 if (ill->ill_capab_pending_cnt == 0 && 12814 ill->ill_dlpi_capab_state == IDCS_OK) 12815 ill_capability_reset_alloc(ill); 12816 } 12817 12818 /* 12819 * Send all deferred DLPI messages without waiting for their ACKs. 12820 */ 12821 void 12822 ill_dlpi_send_deferred(ill_t *ill) 12823 { 12824 mblk_t *mp, *nextmp; 12825 12826 /* 12827 * Clear ill_dlpi_pending so that the message is not queued in 12828 * ill_dlpi_send(). 12829 */ 12830 mutex_enter(&ill->ill_lock); 12831 ill->ill_dlpi_pending = DL_PRIM_INVAL; 12832 mp = ill->ill_dlpi_deferred; 12833 ill->ill_dlpi_deferred = NULL; 12834 mutex_exit(&ill->ill_lock); 12835 12836 for (; mp != NULL; mp = nextmp) { 12837 nextmp = mp->b_next; 12838 mp->b_next = NULL; 12839 ill_dlpi_send(ill, mp); 12840 } 12841 } 12842 12843 /* 12844 * Clear all the deferred DLPI messages. Called on receiving an M_ERROR 12845 * or M_HANGUP 12846 */ 12847 static void 12848 ill_dlpi_clear_deferred(ill_t *ill) 12849 { 12850 mblk_t *mp, *nextmp; 12851 12852 mutex_enter(&ill->ill_lock); 12853 ill->ill_dlpi_pending = DL_PRIM_INVAL; 12854 mp = ill->ill_dlpi_deferred; 12855 ill->ill_dlpi_deferred = NULL; 12856 mutex_exit(&ill->ill_lock); 12857 12858 for (; mp != NULL; mp = nextmp) { 12859 nextmp = mp->b_next; 12860 inet_freemsg(mp); 12861 } 12862 } 12863 12864 /* 12865 * Check if the DLPI primitive `prim' is pending; print a warning if not. 12866 */ 12867 boolean_t 12868 ill_dlpi_pending(ill_t *ill, t_uscalar_t prim) 12869 { 12870 t_uscalar_t pending; 12871 12872 mutex_enter(&ill->ill_lock); 12873 if (ill->ill_dlpi_pending == prim) { 12874 mutex_exit(&ill->ill_lock); 12875 return (B_TRUE); 12876 } 12877 12878 /* 12879 * During teardown, ill_dlpi_dispatch() will send DLPI requests 12880 * without waiting, so don't print any warnings in that case. 12881 */ 12882 if (ill->ill_state_flags & ILL_CONDEMNED) { 12883 mutex_exit(&ill->ill_lock); 12884 return (B_FALSE); 12885 } 12886 pending = ill->ill_dlpi_pending; 12887 mutex_exit(&ill->ill_lock); 12888 12889 if (pending == DL_PRIM_INVAL) { 12890 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE, 12891 "received unsolicited ack for %s on %s\n", 12892 dl_primstr(prim), ill->ill_name); 12893 } else { 12894 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE, 12895 "received unexpected ack for %s on %s (expecting %s)\n", 12896 dl_primstr(prim), ill->ill_name, dl_primstr(pending)); 12897 } 12898 return (B_FALSE); 12899 } 12900 12901 /* 12902 * Complete the current DLPI operation associated with `prim' on `ill' and 12903 * start the next queued DLPI operation (if any). 
If there are no queued DLPI
12904 * operations and the ill's current exclusive IPSQ operation has finished
12905 * (i.e., ipsq_current_finish() was called), then clear ipsq_current_ipif to
12906 * allow the next exclusive IPSQ operation to begin upon ipsq_exit(). See
12907 * the comments above ipsq_current_finish() for details.
12908 */
12909 void
12910 ill_dlpi_done(ill_t *ill, t_uscalar_t prim)
12911 {
12912 mblk_t *mp;
12913 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
12914 ipxop_t *ipx = ipsq->ipsq_xop;
12915
12916 ASSERT(IAM_WRITER_IPSQ(ipsq));
12917 mutex_enter(&ill->ill_lock);
12918
12919 ASSERT(prim != DL_PRIM_INVAL);
12920 ASSERT(ill->ill_dlpi_pending == prim);
12921
12922 ip1dbg(("ill_dlpi_done: %s has completed %s (%u)\n", ill->ill_name,
12923 dl_primstr(ill->ill_dlpi_pending), ill->ill_dlpi_pending));
12924
12925 if ((mp = ill->ill_dlpi_deferred) == NULL) {
12926 ill->ill_dlpi_pending = DL_PRIM_INVAL;
12927 if (ipx->ipx_current_done) {
12928 mutex_enter(&ipx->ipx_lock);
12929 ipx->ipx_current_ipif = NULL;
12930 mutex_exit(&ipx->ipx_lock);
12931 }
12932 cv_signal(&ill->ill_cv);
12933 mutex_exit(&ill->ill_lock);
12934 return;
12935 }
12936
12937 ill->ill_dlpi_deferred = mp->b_next;
12938 mp->b_next = NULL;
12939 mutex_exit(&ill->ill_lock);
12940
12941 ill_dlpi_dispatch(ill, mp);
12942 }
12943
12944 /*
12945 * Queue a (multicast) DLPI control message to be sent to the driver by
12946 * later calling ill_dlpi_send_queued.
12947 * We queue them while holding a lock (ill_mcast_lock) to ensure that they
12948 * are sent in order i.e., prevent a DL_DISABMULTI_REQ and DL_ENABMULTI_REQ
12949 * for the same group from racing.
12950 * We send DLPI control messages in order using ill_lock.
12951 * For IPMP we should be called on the cast_ill.
12952 */
12953 void
12954 ill_dlpi_queue(ill_t *ill, mblk_t *mp)
12955 {
12956 mblk_t **mpp;
12957
12958 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
12959
12960 mutex_enter(&ill->ill_lock);
12961 /* Must queue message. Tail insertion */
12962 mpp = &ill->ill_dlpi_deferred;
12963 while (*mpp != NULL)
12964 mpp = &((*mpp)->b_next);
12965
12966 *mpp = mp;
12967 mutex_exit(&ill->ill_lock);
12968 }
12969
12970 /*
12971 * Send the messages that were queued. Make sure there is only
12972 * one outstanding message. ip_rput_dlpi_writer calls ill_dlpi_done()
12973 * when an ACK or a NAK is received to process the next queued message.
12974 * For IPMP we are called on the upper ill, but we send what is queued
12975 * on the cast_ill.
12976 */
12977 void
12978 ill_dlpi_send_queued(ill_t *ill)
12979 {
12980 mblk_t *mp;
12981 union DL_primitives *dlp;
12982 t_uscalar_t prim;
12983 ill_t *release_ill = NULL;
12984
12985 if (IS_IPMP(ill)) {
12986 /* On the upper IPMP ill. */
12987 release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
12988 if (release_ill == NULL) {
12989 /* Avoid ever sending anything down to the ipmpstub */
12990 return;
12991 }
12992 ill = release_ill;
12993 }
12994 mutex_enter(&ill->ill_lock);
12995 while ((mp = ill->ill_dlpi_deferred) != NULL) {
12996 if (ill->ill_dlpi_pending != DL_PRIM_INVAL) {
12997 /* Can't send. Somebody else will send it */
12998 mutex_exit(&ill->ill_lock);
12999 goto done;
13000 }
13001 ill->ill_dlpi_deferred = mp->b_next;
13002 mp->b_next = NULL;
13003 if (!ill->ill_dl_up) {
13004 /*
13005 * Nobody there. All multicast addresses will be
13006 * re-joined when we get the DL_BIND_ACK bringing the
13007 * interface up.
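 * Hence it is safe to simply free the queued requests here.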
13008 */
13009 freemsg(mp);
13010 continue;
13011 }
13012 dlp = (union DL_primitives *)mp->b_rptr;
13013 prim = dlp->dl_primitive;
13014
13015 if (!(ill->ill_state_flags & ILL_CONDEMNED) ||
13016 (prim == DL_UNBIND_REQ)) {
13017 ill->ill_dlpi_pending = prim;
13018 }
13019 mutex_exit(&ill->ill_lock);
13020
13021 DTRACE_PROBE3(ill__dlpi, char *, "ill_dlpi_send_queued",
13022 char *, dl_primstr(prim), ill_t *, ill);
13023 putnext(ill->ill_wq, mp);
13024 mutex_enter(&ill->ill_lock);
13025 }
13026 mutex_exit(&ill->ill_lock);
13027 done:
13028 if (release_ill != NULL)
13029 ill_refrele(release_ill);
13030 }
13031
13032 /*
13033 * Queue an IP (IGMP/MLD) message to be sent by IP from
13034 * ill_mcast_send_queued.
13035 * We queue them while holding a lock (ill_mcast_lock) to ensure that they
13036 * are sent in order i.e., prevent an IGMP leave and IGMP join for the same
13037 * group from racing.
13038 * We send them in order using ill_lock.
13039 * For IPMP we are called on the upper ill, but we queue on the cast_ill.
13040 */
13041 void
13042 ill_mcast_queue(ill_t *ill, mblk_t *mp)
13043 {
13044 mblk_t **mpp;
13045 ill_t *release_ill = NULL;
13046
13047 ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));
13048
13049 if (IS_IPMP(ill)) {
13050 /* On the upper IPMP ill. */
13051 release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
13052 if (release_ill == NULL) {
13053 /* Discard instead of queuing for the ipmp interface */
13054 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
13055 ip_drop_output("ipIfStatsOutDiscards - no cast_ill",
13056 mp, ill);
13057 freemsg(mp);
13058 return;
13059 }
13060 ill = release_ill;
13061 }
13062
13063 mutex_enter(&ill->ill_lock);
13064 /* Must queue message. Tail insertion */
13065 mpp = &ill->ill_mcast_deferred;
13066 while (*mpp != NULL)
13067 mpp = &((*mpp)->b_next);
13068
13069 *mpp = mp;
13070 mutex_exit(&ill->ill_lock);
13071 if (release_ill != NULL)
13072 ill_refrele(release_ill);
13073 }
13074
13075 /*
13076 * Send the IP packets that were queued by ill_mcast_queue.
13077 * These are IGMP/MLD packets.
13078 *
13079 * For IPMP we are called on the upper ill, but we send what is queued
13080 * on the cast_ill.
13081 *
13082 * Request loopback of the report if we are acting as a multicast
13083 * router, so that the process-level routing daemon can hear it.
13084 * This will run multiple times for the same group if there are members
13085 * on the same group for multiple ipif's on the same ill. The
13086 * igmp_input/mld_input code will suppress this due to the loopback; thus we
13087 * always loop back the membership report.
13088 *
13089 * We also need to make sure that this does not get load balanced
13090 * by IPMP. We do this by passing an ill to ip_output_simple.
13091 */
13092 void
13093 ill_mcast_send_queued(ill_t *ill)
13094 {
13095 mblk_t *mp;
13096 ip_xmit_attr_t ixas;
13097 ill_t *release_ill = NULL;
13098
13099 if (IS_IPMP(ill)) {
13100 /* On the upper IPMP ill. */
13101 release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
13102 if (release_ill == NULL) {
13103 /*
13104 * We should have no messages on the ipmp interface;
13105 * in any case there is no point in trying to send them.
13106 */
13107 return;
13108 }
13109 ill = release_ill;
13110 }
13111 bzero(&ixas, sizeof (ixas));
13112 ixas.ixa_zoneid = ALL_ZONES;
13113 ixas.ixa_cred = kcred;
13114 ixas.ixa_cpid = NOPID;
13115 ixas.ixa_tsl = NULL;
13116 /*
13117 * Here we set ixa_ifindex. If IPMP it will be the lower ill which
13118 * makes ip_select_route pick the IRE_MULTICAST for the cast_ill.
13119 * That is necessary to handle IGMP/MLD snooping switches. 13120 */ 13121 ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex; 13122 ixas.ixa_ipst = ill->ill_ipst; 13123 13124 mutex_enter(&ill->ill_lock); 13125 while ((mp = ill->ill_mcast_deferred) != NULL) { 13126 ill->ill_mcast_deferred = mp->b_next; 13127 mp->b_next = NULL; 13128 if (!ill->ill_dl_up) { 13129 /* 13130 * Nobody there. Just drop the ip packets. 13131 * IGMP/MLD will resend later, if this is a replumb. 13132 */ 13133 freemsg(mp); 13134 continue; 13135 } 13136 mutex_enter(&ill->ill_phyint->phyint_lock); 13137 if (IS_UNDER_IPMP(ill) && !ipmp_ill_is_active(ill)) { 13138 /* 13139 * When the ill is getting deactivated, we only want to 13140 * send the DLPI messages, so drop IGMP/MLD packets. 13141 * DLPI messages are handled by ill_dlpi_send_queued() 13142 */ 13143 mutex_exit(&ill->ill_phyint->phyint_lock); 13144 freemsg(mp); 13145 continue; 13146 } 13147 mutex_exit(&ill->ill_phyint->phyint_lock); 13148 mutex_exit(&ill->ill_lock); 13149 13150 /* Check whether we are sending IPv4 or IPv6. */ 13151 if (ill->ill_isv6) { 13152 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 13153 13154 ixas.ixa_multicast_ttl = ip6h->ip6_hops; 13155 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6; 13156 } else { 13157 ipha_t *ipha = (ipha_t *)mp->b_rptr; 13158 13159 ixas.ixa_multicast_ttl = ipha->ipha_ttl; 13160 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; 13161 ixas.ixa_flags &= ~IXAF_SET_ULP_CKSUM; 13162 } 13163 ixas.ixa_flags &= ~IXAF_VERIFY_SOURCE; 13164 ixas.ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_SOURCE; 13165 (void) ip_output_simple(mp, &ixas); 13166 ixa_cleanup(&ixas); 13167 13168 mutex_enter(&ill->ill_lock); 13169 } 13170 mutex_exit(&ill->ill_lock); 13171 13172 done: 13173 if (release_ill != NULL) 13174 ill_refrele(release_ill); 13175 } 13176 13177 /* 13178 * Take down a specific interface, but don't lose any information about it. 13179 * (Always called as writer.) 13180 * This function goes through the down sequence even if the interface is 13181 * already down. There are 2 reasons. 13182 * a. Currently we permit interface routes that depend on down interfaces 13183 * to be added. This behaviour itself is questionable. However it appears 13184 * that both Solaris and 4.3 BSD have exhibited this behaviour for a long 13185 * time. We go thru the cleanup in order to remove these routes. 13186 * b. The bringup of the interface could fail in ill_dl_up i.e. we get 13187 * DL_ERROR_ACK in response to the DL_BIND request. The interface is 13188 * down, but we need to cleanup i.e. do ill_dl_down and 13189 * ip_rput_dlpi_writer (DL_ERROR_ACK) -> ipif_down. 13190 * 13191 * IP-MT notes: 13192 * 13193 * Model of reference to interfaces. 13194 * 13195 * The following members in ipif_t track references to the ipif. 13196 * int ipif_refcnt; Active reference count 13197 * 13198 * The following members in ill_t track references to the ill. 13199 * int ill_refcnt; active refcnt 13200 * uint_t ill_ire_cnt; Number of ires referencing ill 13201 * uint_t ill_ncec_cnt; Number of ncecs referencing ill 13202 * uint_t ill_nce_cnt; Number of nces referencing ill 13203 * uint_t ill_ilm_cnt; Number of ilms referencing ill 13204 * 13205 * Reference to an ipif or ill can be obtained in any of the following ways. 13206 * 13207 * Through the lookup functions ipif_lookup_* / ill_lookup_* functions 13208 * Pointers to ipif / ill from other data structures viz ire and conn. 13209 * Implicit reference to the ipif / ill by holding a reference to the ire. 
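 *
 * As a minimal sketch of the first way (illustrative only, assuming the
 * usual ipif_lookup_addr() signature; not a complete code path):
 *
 *	ipif = ipif_lookup_addr(addr, NULL, zoneid, ipst);
 *	if (ipif != NULL) {
 *		... use ipif; ipif_refcnt keeps it from going away ...
 *		ipif_refrele(ipif);
 *	}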
13210 * 13211 * The ipif/ill lookup functions return a reference held ipif / ill. 13212 * ipif_refcnt and ill_refcnt track the reference counts respectively. 13213 * This is a purely dynamic reference count associated with threads holding 13214 * references to the ipif / ill. Pointers from other structures do not 13215 * count towards this reference count. 13216 * 13217 * ill_ire_cnt is the number of ire's associated with the 13218 * ill. This is incremented whenever a new ire is created referencing the 13219 * ill. This is done atomically inside ire_add_v[46] where the ire is 13220 * actually added to the ire hash table. The count is decremented in 13221 * ire_inactive where the ire is destroyed. 13222 * 13223 * ill_ncec_cnt is the number of ncec's referencing the ill thru ncec_ill. 13224 * This is incremented atomically in 13225 * ndp_add_v4()/ndp_add_v6() where the nce is actually added to the 13226 * table. Similarly it is decremented in ncec_inactive() where the ncec 13227 * is destroyed. 13228 * 13229 * ill_nce_cnt is the number of nce's referencing the ill thru nce_ill. This is 13230 * incremented atomically in nce_add() where the nce is actually added to the 13231 * ill_nce. Similarly it is decremented in nce_inactive() where the nce 13232 * is destroyed. 13233 * 13234 * ill_ilm_cnt is the ilm's reference to the ill. It is incremented in 13235 * ilm_add() and decremented before the ilm is freed in ilm_delete(). 13236 * 13237 * Flow of ioctls involving interface down/up 13238 * 13239 * The following is the sequence of an attempt to set some critical flags on an 13240 * up interface. 13241 * ip_sioctl_flags 13242 * ipif_down 13243 * wait for ipif to be quiescent 13244 * ipif_down_tail 13245 * ip_sioctl_flags_tail 13246 * 13247 * All set ioctls that involve down/up sequence would have a skeleton similar 13248 * to the above. All the *tail functions are called after the refcounts have 13249 * dropped to the appropriate values. 13250 * 13251 * SIOC ioctls during the IPIF_CHANGING interval. 13252 * 13253 * Threads handling SIOC set ioctls serialize on the squeue, but this 13254 * is not done for SIOC get ioctls. Since a set ioctl can cause several 13255 * steps of internal changes to the state, some of which are visible in 13256 * ipif_flags (such as IFF_UP being cleared and later set), and we want 13257 * the set ioctl to be atomic related to the get ioctls, the SIOC get code 13258 * will wait and restart ioctls if IPIF_CHANGING is set. The mblk is then 13259 * enqueued in the ipsq and the operation is restarted by ipsq_exit() when 13260 * the current exclusive operation completes. The IPIF_CHANGING check 13261 * and enqueue is atomic using the ill_lock and ipsq_lock. The 13262 * lookup is done holding the ill_lock. Hence the ill/ipif state flags can't 13263 * change while the ill_lock is held. Before dropping the ill_lock we acquire 13264 * the ipsq_lock and call ipsq_enq. This ensures that ipsq_exit can't finish 13265 * until we release the ipsq_lock, even though the ill/ipif state flags 13266 * can change after we drop the ill_lock. 
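 *
 * A minimal sketch of that check-and-enqueue pattern (illustrative only;
 * the lookup paths below add zoneid and condemnation checks):
 *
 *	mutex_enter(&ill->ill_lock);
 *	if (IPIF_IS_CHANGING(ipif)) {
 *		ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
 *
 *		mutex_enter(&ipsq->ipsq_lock);
 *		mutex_enter(&ipsq->ipsq_xop->ipx_lock);
 *		mutex_exit(&ill->ill_lock);
 *		ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
 *		mutex_exit(&ipsq->ipsq_xop->ipx_lock);
 *		mutex_exit(&ipsq->ipsq_lock);
 *		return;		(the ioctl is restarted from ipsq_exit())
 *	}
 *	mutex_exit(&ill->ill_lock);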
13267  */
13268 int
13269 ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
13270 {
13271 	ill_t	*ill = ipif->ipif_ill;
13272 	conn_t	*connp;
13273 	boolean_t success;
13274 	boolean_t ipif_was_up = B_FALSE;
13275 	ip_stack_t	*ipst = ill->ill_ipst;
13276 
13277 	ASSERT(IAM_WRITER_IPIF(ipif));
13278 
13279 	ip1dbg(("ipif_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));
13280 
13281 	DTRACE_PROBE3(ipif__downup, char *, "ipif_down",
13282 	    ill_t *, ill, ipif_t *, ipif);
13283 
13284 	if (ipif->ipif_flags & IPIF_UP) {
13285 		mutex_enter(&ill->ill_lock);
13286 		ipif->ipif_flags &= ~IPIF_UP;
13287 		ASSERT(ill->ill_ipif_up_count > 0);
13288 		--ill->ill_ipif_up_count;
13289 		mutex_exit(&ill->ill_lock);
13290 		ipif_was_up = B_TRUE;
13291 		/* Update status in SCTP's list */
13292 		sctp_update_ipif(ipif, SCTP_IPIF_DOWN);
13293 		ill_nic_event_dispatch(ipif->ipif_ill,
13294 		    MAP_IPIF_ID(ipif->ipif_id), NE_LIF_DOWN, NULL, 0);
13295 	}
13296 
13297 	/*
13298 	 * Blow away memberships we established in ipif_multicast_up().
13299 	 */
13300 	ipif_multicast_down(ipif);
13301 
13302 	/*
13303 	 * Remove from the mapping for __sin6_src_id. We insert only
13304 	 * when the address is not INADDR_ANY. As IPv4 addresses are
13305 	 * stored as mapped addresses, we need to check for mapped
13306 	 * INADDR_ANY also.
13307 	 */
13308 	if (ipif_was_up && !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) &&
13309 	    !IN6_IS_ADDR_V4MAPPED_ANY(&ipif->ipif_v6lcl_addr) &&
13310 	    !(ipif->ipif_flags & IPIF_NOLOCAL)) {
13311 		int err;
13312 
13313 		err = ip_srcid_remove(&ipif->ipif_v6lcl_addr,
13314 		    ipif->ipif_zoneid, ipst);
13315 		if (err != 0) {
13316 			ip0dbg(("ipif_down: srcid_remove %d\n", err));
13317 		}
13318 	}
13319 
13320 	if (ipif_was_up) {
13321 		/* only delete if we'd added ire's before */
13322 		if (ipif->ipif_isv6)
13323 			ipif_delete_ires_v6(ipif);
13324 		else
13325 			ipif_delete_ires_v4(ipif);
13326 	}
13327 
13328 	if (ipif_was_up && ill->ill_ipif_up_count == 0) {
13329 		/*
13330 		 * Since the interface is now down, it may have just become
13331 		 * inactive. Note that this needs to be done even for an
13332 		 * ipif_logical_down(), or ARP entries will not get correctly
13333 		 * restored when the interface comes back up.
13334 		 */
13335 		if (IS_UNDER_IPMP(ill))
13336 			ipmp_ill_refresh_active(ill);
13337 	}
13338 
13339 	/*
13340 	 * Delete the neighbor-discovery or arp entries for this interface.
13341 	 * The ipif has to be quiesced, so we walk all the nce's and delete
13342 	 * those that point at the ipif->ipif_ill. At the same time, we also
13343 	 * update IPMP so that ipifs for data addresses are unbound. We don't
13344 	 * call ipif_arp_down to DL_UNBIND the arp stream itself here, but
13345 	 * defer that to ipif_down_tail().
13346 	 */
13347 	ipif_nce_down(ipif);
13348 
13349 	/*
13350 	 * If this is the last ipif on the ill, we also need to remove
13351 	 * any IREs with ire_ill set. Otherwise ipif_is_quiescent() will
13352 	 * never succeed.
13353 	 */
13354 	if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0)
13355 		ire_walk_ill(0, 0, ill_downi, ill, ill);
13356 
13357 	/*
13358 	 * Walk all CONNs that can have a reference on an ire for this
13359 	 * ipif (we actually walk all that now have stale references).
13360 	 */
13361 	ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst);
13362 
13363 	/*
13364 	 * If mp is NULL the caller will wait for the appropriate refcnt.
13365 	 * E.g. ip_sioctl_removeif -> ipif_free -> ipif_down
13366 	 * and ill_delete -> ipif_free -> ipif_down
13367 	 */
13368 	if (mp == NULL) {
13369 		ASSERT(q == NULL);
13370 		return (0);
13371 	}
13372 
13373 	if (CONN_Q(q)) {
13374 		connp = Q_TO_CONN(q);
13375 		mutex_enter(&connp->conn_lock);
13376 	} else {
13377 		connp = NULL;
13378 	}
13379 	mutex_enter(&ill->ill_lock);
13380 	/*
13381 	 * Are there any ire's pointing to this ipif that are still active ?
13382 	 * If this is the last ipif going down, are there any ire's pointing
13383 	 * to this ill that are still active ?
13384 	 */
13385 	if (ipif_is_quiescent(ipif)) {
13386 		mutex_exit(&ill->ill_lock);
13387 		if (connp != NULL)
13388 			mutex_exit(&connp->conn_lock);
13389 		return (0);
13390 	}
13391 
13392 	ip1dbg(("ipif_down: need to wait, adding pending mp %s ill %p\n",
13393 	    ill->ill_name, (void *)ill));
13394 	/*
13395 	 * Enqueue the mp atomically in ipsq_pending_mp. When the refcount
13396 	 * drops down, the operation will be restarted by ipif_ill_refrele_tail
13397 	 * which in turn is called by the last refrele on the ipif/ill/ire.
13398 	 */
13399 	success = ipsq_pending_mp_add(connp, ipif, q, mp, IPIF_DOWN);
13400 	if (!success) {
13401 		/* The conn is closing. So just return */
13402 		ASSERT(connp != NULL);
13403 		mutex_exit(&ill->ill_lock);
13404 		mutex_exit(&connp->conn_lock);
13405 		return (EINTR);
13406 	}
13407 
13408 	mutex_exit(&ill->ill_lock);
13409 	if (connp != NULL)
13410 		mutex_exit(&connp->conn_lock);
13411 	return (EINPROGRESS);
13412 }
13413 
13414 int
13415 ipif_down_tail(ipif_t *ipif)
13416 {
13417 	ill_t	*ill = ipif->ipif_ill;
13418 	int	err = 0;
13419 
13420 	DTRACE_PROBE3(ipif__downup, char *, "ipif_down_tail",
13421 	    ill_t *, ill, ipif_t *, ipif);
13422 
13423 	/*
13424 	 * Skip any loopback interface (null wq).
13425 	 * If this is the last logical interface on the ill,
13426 	 * have ill_dl_down tell the driver we are gone (unbind).
13427 	 * Note that lun 0 can ipif_down even though
13428 	 * there are other logical units that are up.
13429 	 * This occurs e.g. when we change a "significant" IFF_ flag.
13430 	 */
13431 	if (ill->ill_wq != NULL && !ill->ill_logical_down &&
13432 	    ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 &&
13433 	    ill->ill_dl_up) {
13434 		ill_dl_down(ill);
13435 	}
13436 	if (!ipif->ipif_isv6)
13437 		err = ipif_arp_down(ipif);
13438 
13439 	ill->ill_logical_down = 0;
13440 
13441 	ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
13442 	ip_rts_newaddrmsg(RTM_DELETE, 0, ipif, RTSQ_DEFAULT);
13443 	return (err);
13444 }
13445 
13446 /*
13447  * Bring the interface logically down without bringing the physical interface
13448  * down, e.g. when the netmask is changed. This avoids long-lasting link
13449  * negotiations between an ethernet interface and certain switches.
13450  */
13451 static int
13452 ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
13453 {
13454 	DTRACE_PROBE3(ipif__downup, char *, "ipif_logical_down",
13455 	    ill_t *, ipif->ipif_ill, ipif_t *, ipif);
13456 
13457 	/*
13458 	 * The ill_logical_down flag is a transient flag. It is set here
13459 	 * and is cleared once the down has completed in ipif_down_tail.
13460 	 * This flag does not indicate whether the ill stream is in the
13461 	 * DL_BOUND state with the driver. Instead this flag is used by
13462 	 * ipif_down_tail to determine whether to DL_UNBIND the stream with
13463 	 * the driver. The state of the ill stream, i.e. whether it is
13464 	 * DL_BOUND with the driver or not, is indicated by the ill_dl_up flag.
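	 *
	 * A sketch of the resulting flow (illustrative; e.g. for a netmask
	 * change):
	 *
	 *	ipif_logical_down()	sets ill_logical_down, calls ipif_down()
	 *	ipif_down_tail()	sees ill_logical_down, skips
	 *				ill_dl_down(), then clears the flag
	 *	ipif_up()		recreates the IREs; no new DL_BIND is
	 *				needed since ill_dl_up stayed set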
13465 */ 13466 ipif->ipif_ill->ill_logical_down = 1; 13467 return (ipif_down(ipif, q, mp)); 13468 } 13469 13470 /* 13471 * Initiate deallocate of an IPIF. Always called as writer. Called by 13472 * ill_delete or ip_sioctl_removeif. 13473 */ 13474 static void 13475 ipif_free(ipif_t *ipif) 13476 { 13477 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 13478 13479 ASSERT(IAM_WRITER_IPIF(ipif)); 13480 13481 if (ipif->ipif_recovery_id != 0) 13482 (void) untimeout(ipif->ipif_recovery_id); 13483 ipif->ipif_recovery_id = 0; 13484 13485 /* 13486 * Take down the interface. We can be called either from ill_delete 13487 * or from ip_sioctl_removeif. 13488 */ 13489 (void) ipif_down(ipif, NULL, NULL); 13490 13491 /* 13492 * Now that the interface is down, there's no chance it can still 13493 * become a duplicate. Cancel any timer that may have been set while 13494 * tearing down. 13495 */ 13496 if (ipif->ipif_recovery_id != 0) 13497 (void) untimeout(ipif->ipif_recovery_id); 13498 ipif->ipif_recovery_id = 0; 13499 13500 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 13501 /* Remove pointers to this ill in the multicast routing tables */ 13502 reset_mrt_vif_ipif(ipif); 13503 /* If necessary, clear the cached source ipif rotor. */ 13504 if (ipif->ipif_ill->ill_src_ipif == ipif) 13505 ipif->ipif_ill->ill_src_ipif = NULL; 13506 rw_exit(&ipst->ips_ill_g_lock); 13507 } 13508 13509 static void 13510 ipif_free_tail(ipif_t *ipif) 13511 { 13512 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 13513 13514 /* 13515 * Need to hold both ill_g_lock and ill_lock while 13516 * inserting or removing an ipif from the linked list 13517 * of ipifs hanging off the ill. 13518 */ 13519 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 13520 13521 #ifdef DEBUG 13522 ipif_trace_cleanup(ipif); 13523 #endif 13524 13525 /* Ask SCTP to take it out of it list */ 13526 sctp_update_ipif(ipif, SCTP_IPIF_REMOVE); 13527 ip_rts_newaddrmsg(RTM_FREEADDR, 0, ipif, RTSQ_DEFAULT); 13528 13529 /* Get it out of the ILL interface list. */ 13530 ipif_remove(ipif); 13531 rw_exit(&ipst->ips_ill_g_lock); 13532 13533 ASSERT(!(ipif->ipif_flags & (IPIF_UP | IPIF_DUPLICATE))); 13534 ASSERT(ipif->ipif_recovery_id == 0); 13535 ASSERT(ipif->ipif_ire_local == NULL); 13536 ASSERT(ipif->ipif_ire_if == NULL); 13537 13538 /* Free the memory. */ 13539 mi_free(ipif); 13540 } 13541 13542 /* 13543 * Sets `buf' to an ipif name of the form "ill_name:id", or "ill_name" if "id" 13544 * is zero. 13545 */ 13546 void 13547 ipif_get_name(const ipif_t *ipif, char *buf, int len) 13548 { 13549 char lbuf[LIFNAMSIZ]; 13550 char *name; 13551 size_t name_len; 13552 13553 buf[0] = '\0'; 13554 name = ipif->ipif_ill->ill_name; 13555 name_len = ipif->ipif_ill->ill_name_length; 13556 if (ipif->ipif_id != 0) { 13557 (void) sprintf(lbuf, "%s%c%d", name, IPIF_SEPARATOR_CHAR, 13558 ipif->ipif_id); 13559 name = lbuf; 13560 name_len = mi_strlen(name) + 1; 13561 } 13562 len -= 1; 13563 buf[len] = '\0'; 13564 len = MIN(len, name_len); 13565 bcopy(name, buf, len); 13566 } 13567 13568 /* 13569 * Sets `buf' to an ill name. 13570 */ 13571 void 13572 ill_get_name(const ill_t *ill, char *buf, int len) 13573 { 13574 char *name; 13575 size_t name_len; 13576 13577 name = ill->ill_name; 13578 name_len = ill->ill_name_length; 13579 len -= 1; 13580 buf[len] = '\0'; 13581 len = MIN(len, name_len); 13582 bcopy(name, buf, len); 13583 } 13584 13585 /* 13586 * Find an IPIF based on the name passed in. Names can be of the form <phys> 13587 * (e.g., le0) or <phys>:<#> (e.g., le0:1). 
When there is no colon, the
13588  * implied unit id is zero. <phys> must correspond to the name of an ILL.
13589  * (May be called as writer.)
13590  */
13591 static ipif_t *
13592 ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc,
13593     boolean_t *exists, boolean_t isv6, zoneid_t zoneid, ip_stack_t *ipst)
13594 {
13595 	char	*cp;
13596 	char	*endp;
13597 	long	id;
13598 	ill_t	*ill;
13599 	ipif_t	*ipif;
13600 	uint_t	ire_type;
13601 	boolean_t did_alloc = B_FALSE;
13602 
13603 	/*
13604 	 * If the caller wants us to create the ipif, make sure we have a
13605 	 * valid zoneid.
13606 	 */
13607 	ASSERT(!do_alloc || zoneid != ALL_ZONES);
13608 
13609 	if (namelen == 0) {
13610 		return (NULL);
13611 	}
13612 
13613 	*exists = B_FALSE;
13614 	/* Look for a colon in the name. */
13615 	endp = &name[namelen];
13616 	for (cp = endp; --cp > name; ) {
13617 		if (*cp == IPIF_SEPARATOR_CHAR)
13618 			break;
13619 	}
13620 
13621 	if (*cp == IPIF_SEPARATOR_CHAR) {
13622 		/*
13623 		 * Reject any non-decimal aliases for logical
13624 		 * interfaces. Aliases with leading zeroes
13625 		 * are also rejected as they introduce ambiguity
13626 		 * in the naming of the interfaces.
13627 		 * In order to conform to existing semantics,
13628 		 * and to not break any programs/scripts relying
13629 		 * on that behaviour, if<0>:0 is considered to be
13630 		 * a valid interface.
13631 		 *
13632 		 * If the alias has two or more digits and the first
13633 		 * is zero, fail.
13634 		 */
13635 		if (&cp[2] < endp && cp[1] == '0') {
13636 			return (NULL);
13637 		}
13638 	}
13639 
13640 	if (cp <= name) {
13641 		cp = endp;
13642 	} else {
13643 		*cp = '\0';
13644 	}
13645 
13646 	/*
13647 	 * Look up the ILL, based on the portion of the name
13648 	 * before the colon. ill_lookup_on_name returns a held ill.
13649 	 * did_alloc tells us whether the lookup had to allocate the
13650 	 * ill; if the ill already existed it is left B_FALSE.
13651 	 */
13652 	ill = ill_lookup_on_name(name, do_alloc, isv6,
13653 	    &did_alloc, ipst);
13654 	if (cp != endp)
13655 		*cp = IPIF_SEPARATOR_CHAR;
13656 	if (ill == NULL)
13657 		return (NULL);
13658 
13659 	/* Establish the unit number in the name. */
13660 	id = 0;
13661 	if (cp < endp && *endp == '\0') {
13662 		/* If there was a colon, the unit number follows. */
13663 		cp++;
13664 		if (ddi_strtol(cp, NULL, 0, &id) != 0) {
13665 			ill_refrele(ill);
13666 			return (NULL);
13667 		}
13668 	}
13669 
13670 	mutex_enter(&ill->ill_lock);
13671 	/* Now see if there is an IPIF with this unit number. */
13672 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
13673 		if (ipif->ipif_id == id) {
13674 			if (zoneid != ALL_ZONES &&
13675 			    zoneid != ipif->ipif_zoneid &&
13676 			    ipif->ipif_zoneid != ALL_ZONES) {
13677 				mutex_exit(&ill->ill_lock);
13678 				ill_refrele(ill);
13679 				return (NULL);
13680 			}
13681 			if (IPIF_CAN_LOOKUP(ipif)) {
13682 				ipif_refhold_locked(ipif);
13683 				mutex_exit(&ill->ill_lock);
13684 				if (!did_alloc)
13685 					*exists = B_TRUE;
13686 				/*
13687 				 * Drop locks before calling ill_refrele
13688 				 * since it can potentially call into
13689 				 * ipif_ill_refrele_tail which can end up
13690 				 * in trying to acquire any lock.
13691 				 */
13692 				ill_refrele(ill);
13693 				return (ipif);
13694 			}
13695 		}
13696 	}
13697 
13698 	if (!do_alloc) {
13699 		mutex_exit(&ill->ill_lock);
13700 		ill_refrele(ill);
13701 		return (NULL);
13702 	}
13703 
13704 	/*
13705 	 * If none found, atomically allocate and return a new one.
13706 	 * Historically, we used IRE_LOOPBACK only for lun 0, and IRE_LOCAL
13707 	 * to support "receive only" use of lo0:1 etc. as is still done
13708 	 * below as an initial guess.
13709 	 * However, this is now likely to be overridden later in
13710 	 * ipif_up_done() when we know for sure what address has been
13711 	 * configured on the interface, since we might have more than one
13712 	 * loopback interface with a loopback address, e.g. in the case of
13713 	 * zones, and all the interfaces with loopback addresses need to be
13714 	 * marked IRE_LOOPBACK.
13714 	 */
13715 	if (ill->ill_net_type == IRE_LOOPBACK && id == 0)
13716 		ire_type = IRE_LOOPBACK;
13717 	else
13718 		ire_type = IRE_LOCAL;
13719 	ipif = ipif_allocate(ill, id, ire_type, B_TRUE, B_TRUE, NULL);
13720 	if (ipif != NULL)
13721 		ipif_refhold_locked(ipif);
13722 	mutex_exit(&ill->ill_lock);
13723 	ill_refrele(ill);
13724 	return (ipif);
13725 }
13726 
13727 /*
13728  * Variant of the above that queues the request on the ipsq when
13729  * IPIF_CHANGING is set.
13730  */
13731 static ipif_t *
13732 ipif_lookup_on_name_async(char *name, size_t namelen, boolean_t isv6,
13733     zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func, int *error,
13734     ip_stack_t *ipst)
13735 {
13736 	char	*cp;
13737 	char	*endp;
13738 	long	id;
13739 	ill_t	*ill;
13740 	ipif_t	*ipif;
13741 	boolean_t did_alloc = B_FALSE;
13742 	ipsq_t	*ipsq;
13743 
13744 	if (error != NULL)
13745 		*error = 0;
13746 
13747 	if (namelen == 0) {
13748 		if (error != NULL)
13749 			*error = ENXIO;
13750 		return (NULL);
13751 	}
13752 
13753 	/* Look for a colon in the name. */
13754 	endp = &name[namelen];
13755 	for (cp = endp; --cp > name; ) {
13756 		if (*cp == IPIF_SEPARATOR_CHAR)
13757 			break;
13758 	}
13759 
13760 	if (*cp == IPIF_SEPARATOR_CHAR) {
13761 		/*
13762 		 * Reject any non-decimal aliases for logical
13763 		 * interfaces. Aliases with leading zeroes
13764 		 * are also rejected as they introduce ambiguity
13765 		 * in the naming of the interfaces.
13766 		 * In order to conform to existing semantics,
13767 		 * and to not break any programs/scripts relying
13768 		 * on that behaviour, if<0>:0 is considered to be
13769 		 * a valid interface.
13770 		 *
13771 		 * If the alias has two or more digits and the first
13772 		 * is zero, fail.
13773 		 */
13774 		if (&cp[2] < endp && cp[1] == '0') {
13775 			if (error != NULL)
13776 				*error = EINVAL;
13777 			return (NULL);
13778 		}
13779 	}
13780 
13781 	if (cp <= name) {
13782 		cp = endp;
13783 	} else {
13784 		*cp = '\0';
13785 	}
13786 
13787 	/*
13788 	 * Look up the ILL, based on the portion of the name
13789 	 * before the colon. ill_lookup_on_name returns a held ill.
13790 	 * did_alloc tells us whether the lookup had to allocate the
13791 	 * ill; if the ill already existed it is left B_FALSE.
13792 	 */
13793 	ill = ill_lookup_on_name(name, B_FALSE, isv6, &did_alloc, ipst);
13794 	if (cp != endp)
13795 		*cp = IPIF_SEPARATOR_CHAR;
13796 	if (ill == NULL)
13797 		return (NULL);
13798 
13799 	/* Establish the unit number in the name. */
13800 	id = 0;
13801 	if (cp < endp && *endp == '\0') {
13802 		/* If there was a colon, the unit number follows. */
13803 		cp++;
13804 		if (ddi_strtol(cp, NULL, 0, &id) != 0) {
13805 			ill_refrele(ill);
13806 			if (error != NULL)
13807 				*error = ENXIO;
13808 			return (NULL);
13809 		}
13810 	}
13811 
13812 	GRAB_CONN_LOCK(q);
13813 	mutex_enter(&ill->ill_lock);
13814 	/* Now see if there is an IPIF with this unit number.
*/ 13815 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 13816 if (ipif->ipif_id == id) { 13817 if (zoneid != ALL_ZONES && 13818 zoneid != ipif->ipif_zoneid && 13819 ipif->ipif_zoneid != ALL_ZONES) { 13820 mutex_exit(&ill->ill_lock); 13821 RELEASE_CONN_LOCK(q); 13822 ill_refrele(ill); 13823 if (error != NULL) 13824 *error = ENXIO; 13825 return (NULL); 13826 } 13827 13828 if (!(IPIF_IS_CHANGING(ipif) || 13829 IPIF_IS_CONDEMNED(ipif)) || 13830 IAM_WRITER_IPIF(ipif)) { 13831 ipif_refhold_locked(ipif); 13832 mutex_exit(&ill->ill_lock); 13833 /* 13834 * Drop locks before calling ill_refrele 13835 * since it can potentially call into 13836 * ipif_ill_refrele_tail which can end up 13837 * in trying to acquire any lock. 13838 */ 13839 RELEASE_CONN_LOCK(q); 13840 ill_refrele(ill); 13841 return (ipif); 13842 } else if (q != NULL && !IPIF_IS_CONDEMNED(ipif)) { 13843 ipsq = ill->ill_phyint->phyint_ipsq; 13844 mutex_enter(&ipsq->ipsq_lock); 13845 mutex_enter(&ipsq->ipsq_xop->ipx_lock); 13846 mutex_exit(&ill->ill_lock); 13847 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 13848 mutex_exit(&ipsq->ipsq_xop->ipx_lock); 13849 mutex_exit(&ipsq->ipsq_lock); 13850 RELEASE_CONN_LOCK(q); 13851 ill_refrele(ill); 13852 if (error != NULL) 13853 *error = EINPROGRESS; 13854 return (NULL); 13855 } 13856 } 13857 } 13858 RELEASE_CONN_LOCK(q); 13859 mutex_exit(&ill->ill_lock); 13860 ill_refrele(ill); 13861 if (error != NULL) 13862 *error = ENXIO; 13863 return (NULL); 13864 } 13865 13866 /* 13867 * This routine is called whenever a new address comes up on an ipif. If 13868 * we are configured to respond to address mask requests, then we are supposed 13869 * to broadcast an address mask reply at this time. This routine is also 13870 * called if we are already up, but a netmask change is made. This is legal 13871 * but might not make the system manager very popular. (May be called 13872 * as writer.) 
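 *
 * The reply built below is a single mblk laid out as (a sketch of the
 * code that follows):
 *
 *	[ ipha_t | icmph_t (ICMP_ADDRESS_MASK_REPLY) | 4-byte net mask ]
 *
 * which is why REPLY_LEN is sizeof (icmp_ipha) + sizeof (icmph_t) +
 * IP_ADDR_LEN.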
13873 */ 13874 void 13875 ipif_mask_reply(ipif_t *ipif) 13876 { 13877 icmph_t *icmph; 13878 ipha_t *ipha; 13879 mblk_t *mp; 13880 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 13881 ip_xmit_attr_t ixas; 13882 13883 #define REPLY_LEN (sizeof (icmp_ipha) + sizeof (icmph_t) + IP_ADDR_LEN) 13884 13885 if (!ipst->ips_ip_respond_to_address_mask_broadcast) 13886 return; 13887 13888 /* ICMP mask reply is IPv4 only */ 13889 ASSERT(!ipif->ipif_isv6); 13890 /* ICMP mask reply is not for a loopback interface */ 13891 ASSERT(ipif->ipif_ill->ill_wq != NULL); 13892 13893 if (ipif->ipif_lcl_addr == INADDR_ANY) 13894 return; 13895 13896 mp = allocb(REPLY_LEN, BPRI_HI); 13897 if (mp == NULL) 13898 return; 13899 mp->b_wptr = mp->b_rptr + REPLY_LEN; 13900 13901 ipha = (ipha_t *)mp->b_rptr; 13902 bzero(ipha, REPLY_LEN); 13903 *ipha = icmp_ipha; 13904 ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl; 13905 ipha->ipha_src = ipif->ipif_lcl_addr; 13906 ipha->ipha_dst = ipif->ipif_brd_addr; 13907 ipha->ipha_length = htons(REPLY_LEN); 13908 ipha->ipha_ident = 0; 13909 13910 icmph = (icmph_t *)&ipha[1]; 13911 icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY; 13912 bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN); 13913 icmph->icmph_checksum = IP_CSUM(mp, sizeof (ipha_t), 0); 13914 13915 bzero(&ixas, sizeof (ixas)); 13916 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; 13917 ixas.ixa_zoneid = ALL_ZONES; 13918 ixas.ixa_ifindex = 0; 13919 ixas.ixa_ipst = ipst; 13920 ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; 13921 (void) ip_output_simple(mp, &ixas); 13922 ixa_cleanup(&ixas); 13923 #undef REPLY_LEN 13924 } 13925 13926 /* 13927 * Join the ipif specific multicast groups. 13928 * Must be called after a mapping has been set up in the resolver. (Always 13929 * called as writer.) 13930 */ 13931 void 13932 ipif_multicast_up(ipif_t *ipif) 13933 { 13934 int err; 13935 ill_t *ill; 13936 ilm_t *ilm; 13937 13938 ASSERT(IAM_WRITER_IPIF(ipif)); 13939 13940 ill = ipif->ipif_ill; 13941 13942 ip1dbg(("ipif_multicast_up\n")); 13943 if (!(ill->ill_flags & ILLF_MULTICAST) || 13944 ipif->ipif_allhosts_ilm != NULL) 13945 return; 13946 13947 if (ipif->ipif_isv6) { 13948 in6_addr_t v6allmc = ipv6_all_hosts_mcast; 13949 in6_addr_t v6solmc = ipv6_solicited_node_mcast; 13950 13951 v6solmc.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3]; 13952 13953 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) 13954 return; 13955 13956 ip1dbg(("ipif_multicast_up - addmulti\n")); 13957 13958 /* 13959 * Join the all hosts multicast address. We skip this for 13960 * underlying IPMP interfaces since they should be invisible. 13961 */ 13962 if (!IS_UNDER_IPMP(ill)) { 13963 ilm = ip_addmulti(&v6allmc, ill, ipif->ipif_zoneid, 13964 &err); 13965 if (ilm == NULL) { 13966 ASSERT(err != 0); 13967 ip0dbg(("ipif_multicast_up: " 13968 "all_hosts_mcast failed %d\n", err)); 13969 return; 13970 } 13971 ipif->ipif_allhosts_ilm = ilm; 13972 } 13973 13974 /* 13975 * Enable multicast for the solicited node multicast address. 13976 * If IPMP we need to put the membership on the upper ill. 
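		 *
		 * The group is formed above by folding the low-order bits of
		 * our address into ipv6_solicited_node_mcast (ff02::1:ff00:0);
		 * e.g. an illustrative address fe80::1:2:3:4 would join
		 * ff02::1:ff03:4.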
13977 */ 13978 if (!(ipif->ipif_flags & IPIF_NOLOCAL)) { 13979 ill_t *mcast_ill = NULL; 13980 boolean_t need_refrele; 13981 13982 if (IS_UNDER_IPMP(ill) && 13983 (mcast_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) { 13984 need_refrele = B_TRUE; 13985 } else { 13986 mcast_ill = ill; 13987 need_refrele = B_FALSE; 13988 } 13989 13990 ilm = ip_addmulti(&v6solmc, mcast_ill, 13991 ipif->ipif_zoneid, &err); 13992 if (need_refrele) 13993 ill_refrele(mcast_ill); 13994 13995 if (ilm == NULL) { 13996 ASSERT(err != 0); 13997 ip0dbg(("ipif_multicast_up: solicited MC" 13998 " failed %d\n", err)); 13999 if ((ilm = ipif->ipif_allhosts_ilm) != NULL) { 14000 ipif->ipif_allhosts_ilm = NULL; 14001 (void) ip_delmulti(ilm); 14002 } 14003 return; 14004 } 14005 ipif->ipif_solmulti_ilm = ilm; 14006 } 14007 } else { 14008 in6_addr_t v6group; 14009 14010 if (ipif->ipif_lcl_addr == INADDR_ANY || IS_UNDER_IPMP(ill)) 14011 return; 14012 14013 /* Join the all hosts multicast address */ 14014 ip1dbg(("ipif_multicast_up - addmulti\n")); 14015 IN6_IPADDR_TO_V4MAPPED(htonl(INADDR_ALLHOSTS_GROUP), &v6group); 14016 14017 ilm = ip_addmulti(&v6group, ill, ipif->ipif_zoneid, &err); 14018 if (ilm == NULL) { 14019 ASSERT(err != 0); 14020 ip0dbg(("ipif_multicast_up: failed %d\n", err)); 14021 return; 14022 } 14023 ipif->ipif_allhosts_ilm = ilm; 14024 } 14025 } 14026 14027 /* 14028 * Blow away any multicast groups that we joined in ipif_multicast_up(). 14029 * (ilms from explicit memberships are handled in conn_update_ill.) 14030 */ 14031 void 14032 ipif_multicast_down(ipif_t *ipif) 14033 { 14034 ASSERT(IAM_WRITER_IPIF(ipif)); 14035 14036 ip1dbg(("ipif_multicast_down\n")); 14037 14038 if (ipif->ipif_allhosts_ilm != NULL) { 14039 (void) ip_delmulti(ipif->ipif_allhosts_ilm); 14040 ipif->ipif_allhosts_ilm = NULL; 14041 } 14042 if (ipif->ipif_solmulti_ilm != NULL) { 14043 (void) ip_delmulti(ipif->ipif_solmulti_ilm); 14044 ipif->ipif_solmulti_ilm = NULL; 14045 } 14046 } 14047 14048 /* 14049 * Used when an interface comes up to recreate any extra routes on this 14050 * interface. 14051 */ 14052 int 14053 ill_recover_saved_ire(ill_t *ill) 14054 { 14055 mblk_t *mp; 14056 ip_stack_t *ipst = ill->ill_ipst; 14057 14058 ip1dbg(("ill_recover_saved_ire(%s)", ill->ill_name)); 14059 14060 mutex_enter(&ill->ill_saved_ire_lock); 14061 for (mp = ill->ill_saved_ire_mp; mp != NULL; mp = mp->b_cont) { 14062 ire_t *ire, *nire; 14063 ifrt_t *ifrt; 14064 14065 ifrt = (ifrt_t *)mp->b_rptr; 14066 /* 14067 * Create a copy of the IRE with the saved address and netmask. 
14068 		 */
14069 		if (ill->ill_isv6) {
14070 			ire = ire_create_v6(
14071 			    &ifrt->ifrt_v6addr,
14072 			    &ifrt->ifrt_v6mask,
14073 			    &ifrt->ifrt_v6gateway_addr,
14074 			    ifrt->ifrt_type,
14075 			    ill,
14076 			    ifrt->ifrt_zoneid,
14077 			    ifrt->ifrt_flags,
14078 			    NULL,
14079 			    ipst);
14080 		} else {
14081 			ire = ire_create(
14082 			    (uint8_t *)&ifrt->ifrt_addr,
14083 			    (uint8_t *)&ifrt->ifrt_mask,
14084 			    (uint8_t *)&ifrt->ifrt_gateway_addr,
14085 			    ifrt->ifrt_type,
14086 			    ill,
14087 			    ifrt->ifrt_zoneid,
14088 			    ifrt->ifrt_flags,
14089 			    NULL,
14090 			    ipst);
14091 		}
14092 		if (ire == NULL) {
14093 			mutex_exit(&ill->ill_saved_ire_lock);
14094 			return (ENOMEM);
14095 		}
14096 
14097 		if (ifrt->ifrt_flags & RTF_SETSRC) {
14098 			if (ill->ill_isv6) {
14099 				ire->ire_setsrc_addr_v6 =
14100 				    ifrt->ifrt_v6setsrc_addr;
14101 			} else {
14102 				ire->ire_setsrc_addr = ifrt->ifrt_setsrc_addr;
14103 			}
14104 		}
14105 
14106 		/*
14107 		 * Some software (for example, GateD and Sun Cluster) attempts
14108 		 * to create (what amount to) IRE_PREFIX routes with the
14109 		 * loopback address as the gateway. This is primarily done to
14110 		 * set up prefixes with the RTF_REJECT flag set (for example,
14111 		 * when generating aggregate routes.)
14112 		 *
14113 		 * If the IRE type (as defined by ill->ill_net_type) is
14114 		 * IRE_LOOPBACK, then we map the request into an
14115 		 * IRE_IF_NORESOLVER.
14116 		 */
14117 		if (ill->ill_net_type == IRE_LOOPBACK)
14118 			ire->ire_type = IRE_IF_NORESOLVER;
14119 
14120 		/*
14121 		 * The ire is held by ire_add and will be refrele'd
14122 		 * towards the end of ipif_up_done.
14123 		 */
14124 		nire = ire_add(ire);
14125 		/*
14126 		 * Check if it was a duplicate entry. This handles
14127 		 * the case of two racing route adds for the same route.
14128 		 */
14129 		if (nire == NULL) {
14130 			ip1dbg(("ill_recover_saved_ire: FAILED\n"));
14131 		} else if (nire != ire) {
14132 			ip1dbg(("ill_recover_saved_ire: duplicate ire %p\n",
14133 			    (void *)nire));
14134 			ire_delete(nire);
14135 		} else {
14136 			ip1dbg(("ill_recover_saved_ire: added ire %p\n",
14137 			    (void *)nire));
14138 		}
14139 		if (nire != NULL)
14140 			ire_refrele(nire);
14141 	}
14142 	mutex_exit(&ill->ill_saved_ire_lock);
14143 	return (0);
14144 }
14145 
14146 /*
14147  * Used to set the netmask and broadcast address to default values when the
14148  * interface is brought up. (Always called as writer.)
14149  */
14150 static void
14151 ipif_set_default(ipif_t *ipif)
14152 {
14153 	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
14154 
14155 	if (!ipif->ipif_isv6) {
14156 		/*
14157 		 * Interface holds an IPv4 address. Default
14158 		 * mask is the natural netmask.
14159 		 */
14160 		if (!ipif->ipif_net_mask) {
14161 			ipaddr_t	v4mask;
14162 
14163 			v4mask = ip_net_mask(ipif->ipif_lcl_addr);
14164 			V4MASK_TO_V6(v4mask, ipif->ipif_v6net_mask);
14165 		}
14166 		if (ipif->ipif_flags & IPIF_POINTOPOINT) {
14167 			/* ipif_subnet is ipif_pp_dst_addr for pt-pt */
14168 			ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
14169 		} else {
14170 			V6_MASK_COPY(ipif->ipif_v6lcl_addr,
14171 			    ipif->ipif_v6net_mask, ipif->ipif_v6subnet);
14172 		}
14173 		/*
14174 		 * NOTE: SunOS 4.X does this even if the broadcast address
14175 		 * has already been set, thus we do the same here.
14176 		 */
14177 		if (ipif->ipif_flags & IPIF_BROADCAST) {
14178 			ipaddr_t	v4addr;
14179 
14180 			v4addr = ipif->ipif_subnet | ~ipif->ipif_net_mask;
14181 			IN6_IPADDR_TO_V4MAPPED(v4addr, &ipif->ipif_v6brd_addr);
14182 		}
14183 	} else {
14184 		/*
14185 		 * Interface holds an IPv6-only address. Default
14186 		 * mask is all-ones.
14187 */ 14188 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask)) 14189 ipif->ipif_v6net_mask = ipv6_all_ones; 14190 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 14191 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 14192 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 14193 } else { 14194 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 14195 ipif->ipif_v6net_mask, ipif->ipif_v6subnet); 14196 } 14197 } 14198 } 14199 14200 /* 14201 * Return 0 if this address can be used as local address without causing 14202 * duplicate address problems. Otherwise, return EADDRNOTAVAIL if the address 14203 * is already up on a different ill, and EADDRINUSE if it's up on the same ill. 14204 * Note that the same IPv6 link-local address is allowed as long as the ills 14205 * are not on the same link. 14206 */ 14207 int 14208 ip_addr_availability_check(ipif_t *new_ipif) 14209 { 14210 in6_addr_t our_v6addr; 14211 ill_t *ill; 14212 ipif_t *ipif; 14213 ill_walk_context_t ctx; 14214 ip_stack_t *ipst = new_ipif->ipif_ill->ill_ipst; 14215 14216 ASSERT(IAM_WRITER_IPIF(new_ipif)); 14217 ASSERT(MUTEX_HELD(&ipst->ips_ip_addr_avail_lock)); 14218 ASSERT(RW_READ_HELD(&ipst->ips_ill_g_lock)); 14219 14220 new_ipif->ipif_flags &= ~IPIF_UNNUMBERED; 14221 if (IN6_IS_ADDR_UNSPECIFIED(&new_ipif->ipif_v6lcl_addr) || 14222 IN6_IS_ADDR_V4MAPPED_ANY(&new_ipif->ipif_v6lcl_addr)) 14223 return (0); 14224 14225 our_v6addr = new_ipif->ipif_v6lcl_addr; 14226 14227 if (new_ipif->ipif_isv6) 14228 ill = ILL_START_WALK_V6(&ctx, ipst); 14229 else 14230 ill = ILL_START_WALK_V4(&ctx, ipst); 14231 14232 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 14233 for (ipif = ill->ill_ipif; ipif != NULL; 14234 ipif = ipif->ipif_next) { 14235 if ((ipif == new_ipif) || 14236 !(ipif->ipif_flags & IPIF_UP) || 14237 (ipif->ipif_flags & IPIF_UNNUMBERED) || 14238 !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, 14239 &our_v6addr)) 14240 continue; 14241 14242 if (new_ipif->ipif_flags & IPIF_POINTOPOINT) 14243 new_ipif->ipif_flags |= IPIF_UNNUMBERED; 14244 else if (ipif->ipif_flags & IPIF_POINTOPOINT) 14245 ipif->ipif_flags |= IPIF_UNNUMBERED; 14246 else if ((IN6_IS_ADDR_LINKLOCAL(&our_v6addr) || 14247 IN6_IS_ADDR_SITELOCAL(&our_v6addr)) && 14248 !IS_ON_SAME_LAN(ill, new_ipif->ipif_ill)) 14249 continue; 14250 else if (new_ipif->ipif_zoneid != ipif->ipif_zoneid && 14251 ipif->ipif_zoneid != ALL_ZONES && IS_LOOPBACK(ill)) 14252 continue; 14253 else if (new_ipif->ipif_ill == ill) 14254 return (EADDRINUSE); 14255 else 14256 return (EADDRNOTAVAIL); 14257 } 14258 } 14259 14260 return (0); 14261 } 14262 14263 /* 14264 * Bring up an ipif: bring up arp/ndp, bring up the DLPI stream, and add 14265 * IREs for the ipif. 14266 * When the routine returns EINPROGRESS then mp has been consumed and 14267 * the ioctl will be acked from ip_rput_dlpi. 14268 */ 14269 int 14270 ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) 14271 { 14272 ill_t *ill = ipif->ipif_ill; 14273 boolean_t isv6 = ipif->ipif_isv6; 14274 int err = 0; 14275 boolean_t success; 14276 uint_t ipif_orig_id; 14277 ip_stack_t *ipst = ill->ill_ipst; 14278 14279 ASSERT(IAM_WRITER_IPIF(ipif)); 14280 14281 ip1dbg(("ipif_up(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 14282 DTRACE_PROBE3(ipif__downup, char *, "ipif_up", 14283 ill_t *, ill, ipif_t *, ipif); 14284 14285 /* Shouldn't get here if it is already up. 
 */
14286 	if (ipif->ipif_flags & IPIF_UP)
14287 		return (EALREADY);
14288 
14289 	/*
14290 	 * If this is a request to bring up a data address on an interface
14291 	 * under IPMP, then move the address to its IPMP meta-interface and
14292 	 * try to bring it up. One complication is that the zeroth ipif for
14293 	 * an ill is special, in that every ill always has one, and that code
14294 	 * throughout IP dereferences ill->ill_ipif without holding any locks.
14295 	 */
14296 	if (IS_UNDER_IPMP(ill) && ipmp_ipif_is_dataaddr(ipif) &&
14297 	    (!ipif->ipif_isv6 || !V6_IPIF_LINKLOCAL(ipif))) {
14298 		ipif_t	*stubipif = NULL, *moveipif = NULL;
14299 		ill_t	*ipmp_ill = ipmp_illgrp_ipmp_ill(ill->ill_grp);
14300 
14301 		/*
14302 		 * The ipif being brought up should be quiesced. If it's not,
14303 		 * something has gone amiss and we need to bail out. (If it's
14304 		 * quiesced, we know it will remain so via IPIF_CONDEMNED.)
14305 		 */
14306 		mutex_enter(&ill->ill_lock);
14307 		if (!ipif_is_quiescent(ipif)) {
14308 			mutex_exit(&ill->ill_lock);
14309 			return (EINVAL);
14310 		}
14311 		mutex_exit(&ill->ill_lock);
14312 
14313 		/*
14314 		 * If we're going to need to allocate ipifs, do it prior
14315 		 * to starting the move (and grabbing locks).
14316 		 */
14317 		if (ipif->ipif_id == 0) {
14318 			if ((moveipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE,
14319 			    B_FALSE, &err)) == NULL) {
14320 				return (err);
14321 			}
14322 			if ((stubipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE,
14323 			    B_FALSE, &err)) == NULL) {
14324 				mi_free(moveipif);
14325 				return (err);
14326 			}
14327 		}
14328 
14329 		/*
14330 		 * Grab or transfer the ipif to move. During the move, keep
14331 		 * ill_g_lock held to prevent any ill walker threads from
14332 		 * seeing things in an inconsistent state.
14333 		 */
14334 		rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
14335 		if (ipif->ipif_id != 0) {
14336 			ipif_remove(ipif);
14337 		} else {
14338 			ipif_transfer(ipif, moveipif, stubipif);
14339 			ipif = moveipif;
14340 		}
14341 
14342 		/*
14343 		 * Place the ipif on the IPMP ill. If the zeroth ipif on
14344 		 * the IPMP ill is a stub (0.0.0.0 down address) then we
14345 		 * replace that one. Otherwise, pick the next available slot.
14346 		 */
14347 		ipif->ipif_ill = ipmp_ill;
14348 		ipif_orig_id = ipif->ipif_id;
14349 
14350 		if (ipmp_ipif_is_stubaddr(ipmp_ill->ill_ipif)) {
14351 			ipif_transfer(ipif, ipmp_ill->ill_ipif, NULL);
14352 			ipif = ipmp_ill->ill_ipif;
14353 		} else {
14354 			ipif->ipif_id = -1;
14355 			if ((err = ipif_insert(ipif, B_FALSE)) != 0) {
14356 				/*
14357 				 * No more available ipif_id's -- put it back
14358 				 * on the original ill and fail the operation.
14359 				 * Since we're writer on the ill, we can be
14360 				 * sure our old slot is still available.
14361 				 */
14362 				ipif->ipif_id = ipif_orig_id;
14363 				ipif->ipif_ill = ill;
14364 				if (ipif_orig_id == 0) {
14365 					ipif_transfer(ipif, ill->ill_ipif,
14366 					    NULL);
14367 				} else {
14368 					VERIFY(ipif_insert(ipif, B_FALSE) == 0);
14369 				}
14370 				rw_exit(&ipst->ips_ill_g_lock);
14371 				return (err);
14372 			}
14373 		}
14374 		rw_exit(&ipst->ips_ill_g_lock);
14375 
14376 		/*
14377 		 * Tell SCTP that the ipif has moved. Note that even if we
14378 		 * had to allocate a new ipif, the original sequence id was
14379 		 * preserved and therefore SCTP won't know.
14380 		 */
14381 		sctp_move_ipif(ipif, ill, ipmp_ill);
14382 
14383 		/*
14384 		 * If the ipif being brought up was on slot zero, then we
14385 		 * first need to bring up the placeholder we stuck there. In
14386 		 * ip_rput_dlpi_writer(), arp_bringup_done(), or the recursive
14387 		 * call to ipif_up() itself, if we successfully bring up the
14388 		 * placeholder, we'll check ill_move_ipif and bring it up too.
14389 		 */
14390 		if (ipif_orig_id == 0) {
14391 			ASSERT(ill->ill_move_ipif == NULL);
14392 			ill->ill_move_ipif = ipif;
14393 			if ((err = ipif_up(ill->ill_ipif, q, mp)) == 0)
14394 				ASSERT(ill->ill_move_ipif == NULL);
14395 			if (err != EINPROGRESS)
14396 				ill->ill_move_ipif = NULL;
14397 			return (err);
14398 		}
14399 
14400 		/*
14401 		 * Bring it up on the IPMP ill.
14402 		 */
14403 		return (ipif_up(ipif, q, mp));
14404 	}
14405 
14406 	/* Skip arp/ndp for any loopback interface. */
14407 	if (ill->ill_wq != NULL) {
14408 		conn_t *connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL;
14409 		ipsq_t	*ipsq = ill->ill_phyint->phyint_ipsq;
14410 
14411 		if (!ill->ill_dl_up) {
14412 			/*
14413 			 * ill_dl_up is not yet set, i.e. we have yet to
14414 			 * DL_BIND with the driver and this is the first
14415 			 * logical interface on the ill to become "up".
14416 			 * Tell the driver to get going (via DL_BIND_REQ).
14417 			 * Note that changing "significant" IFF_ flags
14418 			 * (address/netmask etc.) causes a down/up dance, but
14419 			 * does not cause an unbind (DL_UNBIND) with the driver.
14420 			 */
14421 			return (ill_dl_up(ill, ipif, mp, q));
14422 		}
14423 
14424 		/*
14425 		 * ipif_resolver_up may end up needing to bind/attach
14426 		 * the ARP stream, which in turn necessitates a
14427 		 * DLPI message exchange with the driver. ioctls are
14428 		 * serialized and so we cannot send more than one
14429 		 * interface up message at a time. If ipif_resolver_up
14430 		 * does need to wait for the DLPI handshake for the ARP stream,
14431 		 * we get EINPROGRESS and we will complete in arp_bringup_done.
14432 		 */
14433 
14434 		ASSERT(connp != NULL || !CONN_Q(q));
14435 		if (connp != NULL)
14436 			mutex_enter(&connp->conn_lock);
14437 		mutex_enter(&ill->ill_lock);
14438 		success = ipsq_pending_mp_add(connp, ipif, q, mp, 0);
14439 		mutex_exit(&ill->ill_lock);
14440 		if (connp != NULL)
14441 			mutex_exit(&connp->conn_lock);
14442 		if (!success)
14443 			return (EINTR);
14444 
14445 		/*
14446 		 * Crank up IPv6 neighbor discovery. Unlike ARP, this should
14447 		 * complete when ipif_ndp_up returns.
14448 		 */
14449 		err = ipif_resolver_up(ipif, Res_act_initial);
14450 		if (err == EINPROGRESS) {
14451 			/* We will complete it in arp_bringup_done() */
14452 			return (err);
14453 		}
14454 
14455 		if (isv6 && err == 0)
14456 			err = ipif_ndp_up(ipif, B_TRUE);
14457 
14458 		ASSERT(err != EINPROGRESS);
14459 		mp = ipsq_pending_mp_get(ipsq, &connp);
14460 		ASSERT(mp != NULL);
14461 		if (err != 0)
14462 			return (err);
14463 	} else {
14464 		/*
14465 		 * Interfaces without underlying hardware don't do duplicate
14466 		 * address detection.
14467 		 */
14468 		ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
14469 		ipif->ipif_addr_ready = 1;
14470 		err = ill_add_ires(ill);
14471 		/* allocation failure? */
14472 		if (err != 0)
14473 			return (err);
14474 	}
14475 
14476 	err = (isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif));
14477 	if (err == 0 && ill->ill_move_ipif != NULL) {
14478 		ipif = ill->ill_move_ipif;
14479 		ill->ill_move_ipif = NULL;
14480 		return (ipif_up(ipif, q, mp));
14481 	}
14482 	return (err);
14483 }
14484 
14485 /*
14486  * Add any IREs tied to the ill. For now this is just an IRE_MULTICAST.
14487  * The identical set of IREs needs to be removed in ill_delete_ires().
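 *
 * A sketch of the intended pairing (illustrative):
 *
 *	if ((err = ill_add_ires(ill)) != 0)
 *		return (err);		ill_ire_multicast is now set
 *	...
 *	ill_delete_ires(ill);		condemns and releases it again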
14488 */ 14489 int 14490 ill_add_ires(ill_t *ill) 14491 { 14492 ire_t *ire; 14493 in6_addr_t dummy6 = {(uint32_t)V6_MCAST, 0, 0, 1}; 14494 in_addr_t dummy4 = htonl(INADDR_ALLHOSTS_GROUP); 14495 14496 if (ill->ill_ire_multicast != NULL) 14497 return (0); 14498 14499 /* 14500 * provide some dummy ire_addr for creating the ire. 14501 */ 14502 if (ill->ill_isv6) { 14503 ire = ire_create_v6(&dummy6, 0, 0, IRE_MULTICAST, ill, 14504 ALL_ZONES, RTF_UP, NULL, ill->ill_ipst); 14505 } else { 14506 ire = ire_create((uchar_t *)&dummy4, 0, 0, IRE_MULTICAST, ill, 14507 ALL_ZONES, RTF_UP, NULL, ill->ill_ipst); 14508 } 14509 if (ire == NULL) 14510 return (ENOMEM); 14511 14512 ill->ill_ire_multicast = ire; 14513 return (0); 14514 } 14515 14516 void 14517 ill_delete_ires(ill_t *ill) 14518 { 14519 if (ill->ill_ire_multicast != NULL) { 14520 /* 14521 * BIND/ATTACH completed; Release the ref for ill_ire_multicast 14522 * which was taken without any th_tracing enabled. 14523 * We also mark it as condemned (note that it was never added) 14524 * so that caching conn's can move off of it. 14525 */ 14526 ire_make_condemned(ill->ill_ire_multicast); 14527 ire_refrele_notr(ill->ill_ire_multicast); 14528 ill->ill_ire_multicast = NULL; 14529 } 14530 } 14531 14532 /* 14533 * Perform a bind for the physical device. 14534 * When the routine returns EINPROGRESS then mp has been consumed and 14535 * the ioctl will be acked from ip_rput_dlpi. 14536 * Allocate an unbind message and save it until ipif_down. 14537 */ 14538 static int 14539 ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) 14540 { 14541 mblk_t *bind_mp = NULL; 14542 mblk_t *unbind_mp = NULL; 14543 conn_t *connp; 14544 boolean_t success; 14545 int err; 14546 14547 DTRACE_PROBE2(ill__downup, char *, "ill_dl_up", ill_t *, ill); 14548 14549 ip1dbg(("ill_dl_up(%s)\n", ill->ill_name)); 14550 ASSERT(IAM_WRITER_ILL(ill)); 14551 ASSERT(mp != NULL); 14552 14553 /* 14554 * Make sure we have an IRE_MULTICAST in case we immediately 14555 * start receiving packets. 14556 */ 14557 err = ill_add_ires(ill); 14558 if (err != 0) 14559 goto bad; 14560 14561 bind_mp = ip_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long), 14562 DL_BIND_REQ); 14563 if (bind_mp == NULL) 14564 goto bad; 14565 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ill->ill_sap; 14566 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS; 14567 14568 unbind_mp = ip_dlpi_alloc(sizeof (dl_unbind_req_t), DL_UNBIND_REQ); 14569 if (unbind_mp == NULL) 14570 goto bad; 14571 14572 /* 14573 * Record state needed to complete this operation when the 14574 * DL_BIND_ACK shows up. Also remember the pre-allocated mblks. 14575 */ 14576 connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL; 14577 ASSERT(connp != NULL || !CONN_Q(q)); 14578 GRAB_CONN_LOCK(q); 14579 mutex_enter(&ipif->ipif_ill->ill_lock); 14580 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0); 14581 mutex_exit(&ipif->ipif_ill->ill_lock); 14582 RELEASE_CONN_LOCK(q); 14583 if (!success) 14584 goto bad; 14585 14586 /* 14587 * Save the unbind message for ill_dl_down(); it will be consumed when 14588 * the interface goes down. 14589 */ 14590 ASSERT(ill->ill_unbind_mp == NULL); 14591 ill->ill_unbind_mp = unbind_mp; 14592 14593 ill_dlpi_send(ill, bind_mp); 14594 /* Send down link-layer capabilities probe if not already done. */ 14595 ill_capability_probe(ill); 14596 14597 /* 14598 * Sysid used to rely on the fact that netboots set domainname 14599 * and the like. 
Now that miniroot boots aren't strictly netboots
14600  * and miniroot network configuration is driven from userland, these
14601  * things still need to be set. This situation can be detected
14602  * by comparing the interface being configured here to the one
14603  * dhcifname was set to reference by the boot loader. Once sysid is
14604  * converted to use dhcp_ipc_getinfo() this call can go away.
14605  */
14606 	if ((ipif->ipif_flags & IPIF_DHCPRUNNING) &&
14607 	    (strcmp(ill->ill_name, dhcifname) == 0) &&
14608 	    (strlen(srpc_domain) == 0)) {
14609 		if (dhcpinit() != 0)
14610 			cmn_err(CE_WARN, "no cached dhcp response");
14611 	}
14612 
14613 	/*
14614 	 * This operation will complete in ip_rput_dlpi with either
14615 	 * a DL_BIND_ACK or DL_ERROR_ACK.
14616 	 */
14617 	return (EINPROGRESS);
14618 bad:
14619 	ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name));
14620 
14621 	freemsg(bind_mp);
14622 	freemsg(unbind_mp);
14623 	return (ENOMEM);
14624 }
14625 
14626 /* Add room for tcp+ip headers */
14627 uint_t ip_loopback_mtuplus = IP_LOOPBACK_MTU + IP_SIMPLE_HDR_LENGTH + 20;
14628 
14629 /*
14630  * DLPI and ARP are up.
14631  * Create all the IREs associated with an interface. Bring up multicast.
14632  * Set the interface flag and finish other initialization
14633  * that potentially had to be deferred to after DL_BIND_ACK.
14634  */
14635 int
14636 ipif_up_done(ipif_t *ipif)
14637 {
14638 	ill_t	*ill = ipif->ipif_ill;
14639 	int	err = 0;
14640 	boolean_t loopback = B_FALSE;
14641 	boolean_t update_src_selection = B_TRUE;
14642 	ipif_t	*tmp_ipif;
14643 
14644 	ip1dbg(("ipif_up_done(%s:%u)\n",
14645 	    ipif->ipif_ill->ill_name, ipif->ipif_id));
14646 	DTRACE_PROBE3(ipif__downup, char *, "ipif_up_done",
14647 	    ill_t *, ill, ipif_t *, ipif);
14648 
14649 	/* Check if this is a loopback interface */
14650 	if (ipif->ipif_ill->ill_wq == NULL)
14651 		loopback = B_TRUE;
14652 
14653 	ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
14654 
14655 	/*
14656 	 * If all other interfaces for this ill are down or DEPRECATED,
14657 	 * or otherwise unsuitable for source address selection,
14658 	 * reset the src generation numbers to make sure source
14659 	 * address selection gets to take this new ipif into account.
14660 	 * No need to hold ill_lock while traversing the ipif list since
14661 	 * we are writer.
14662 	 */
14663 	for (tmp_ipif = ill->ill_ipif; tmp_ipif;
14664 	    tmp_ipif = tmp_ipif->ipif_next) {
14665 		if (((tmp_ipif->ipif_flags &
14666 		    (IPIF_NOXMIT|IPIF_ANYCAST|IPIF_NOLOCAL|IPIF_DEPRECATED)) ||
14667 		    !(tmp_ipif->ipif_flags & IPIF_UP)) ||
14668 		    (tmp_ipif == ipif))
14669 			continue;
14670 		/* first usable pre-existing interface */
14671 		update_src_selection = B_FALSE;
14672 		break;
14673 	}
14674 	if (update_src_selection)
14675 		ip_update_source_selection(ill->ill_ipst);
14676 
14677 	if (IS_LOOPBACK(ill) || ill->ill_net_type == IRE_IF_NORESOLVER) {
14678 		nce_t *loop_nce = NULL;
14679 		uint16_t flags = (NCE_F_MYADDR | NCE_F_AUTHORITY | NCE_F_NONUD);
14680 
14681 		/*
14682 		 * lo0:1 and subsequent ipifs were marked IRE_LOCAL in
14683 		 * ipif_lookup_on_name(), but in the case of zones we can have
14684 		 * several loopback addresses on lo0. So all the interfaces with
14685 		 * loopback addresses need to be marked IRE_LOOPBACK.
14686 */ 14687 if (V4_PART_OF_V6(ipif->ipif_v6lcl_addr) == 14688 htonl(INADDR_LOOPBACK)) 14689 ipif->ipif_ire_type = IRE_LOOPBACK; 14690 else 14691 ipif->ipif_ire_type = IRE_LOCAL; 14692 if (ill->ill_net_type != IRE_LOOPBACK) 14693 flags |= NCE_F_PUBLISH; 14694 14695 /* add unicast nce for the local addr */ 14696 err = nce_lookup_then_add_v4(ill, NULL, 14697 ill->ill_phys_addr_length, &ipif->ipif_lcl_addr, flags, 14698 ND_REACHABLE, &loop_nce); 14699 /* A shared-IP zone sees EEXIST for lo0:N */ 14700 if (err == 0 || err == EEXIST) { 14701 ipif->ipif_added_nce = 1; 14702 loop_nce->nce_ipif_cnt++; 14703 nce_refrele(loop_nce); 14704 err = 0; 14705 } else { 14706 ASSERT(loop_nce == NULL); 14707 return (err); 14708 } 14709 } 14710 14711 /* Create all the IREs associated with this interface */ 14712 err = ipif_add_ires_v4(ipif, loopback); 14713 if (err != 0) { 14714 /* 14715 * see comments about return value from 14716 * ip_addr_availability_check() in ipif_add_ires_v4(). 14717 */ 14718 if (err != EADDRINUSE) { 14719 (void) ipif_arp_down(ipif); 14720 } else { 14721 /* 14722 * Make IPMP aware of the deleted ipif so that 14723 * the needed ipmp cleanup (e.g., of ipif_bound_ill) 14724 * can be completed. Note that we do not want to 14725 * destroy the nce that was created on the ipmp_ill 14726 * for the active copy of the duplicate address in 14727 * use. 14728 */ 14729 if (IS_IPMP(ill)) 14730 ipmp_illgrp_del_ipif(ill->ill_grp, ipif); 14731 err = EADDRNOTAVAIL; 14732 } 14733 return (err); 14734 } 14735 14736 if (ill->ill_ipif_up_count == 1 && !loopback) { 14737 /* Recover any additional IREs entries for this ill */ 14738 (void) ill_recover_saved_ire(ill); 14739 } 14740 14741 if (ill->ill_need_recover_multicast) { 14742 /* 14743 * Need to recover all multicast memberships in the driver. 14744 * This had to be deferred until we had attached. The same 14745 * code exists in ipif_up_done_v6() to recover IPv6 14746 * memberships. 14747 * 14748 * Note that it would be preferable to unconditionally do the 14749 * ill_recover_multicast() in ill_dl_up(), but we cannot do 14750 * that since ill_join_allmulti() depends on ill_dl_up being 14751 * set, and it is not set until we receive a DL_BIND_ACK after 14752 * having called ill_dl_up(). 14753 */ 14754 ill_recover_multicast(ill); 14755 } 14756 14757 if (ill->ill_ipif_up_count == 1) { 14758 /* 14759 * Since the interface is now up, it may now be active. 14760 */ 14761 if (IS_UNDER_IPMP(ill)) 14762 ipmp_ill_refresh_active(ill); 14763 14764 /* 14765 * If this is an IPMP interface, we may now be able to 14766 * establish ARP entries. 14767 */ 14768 if (IS_IPMP(ill)) 14769 ipmp_illgrp_refresh_arpent(ill->ill_grp); 14770 } 14771 14772 /* Join the allhosts multicast address */ 14773 ipif_multicast_up(ipif); 14774 14775 if (!loopback && !update_src_selection && 14776 !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) 14777 ip_update_source_selection(ill->ill_ipst); 14778 14779 if (!loopback && ipif->ipif_addr_ready) { 14780 /* Broadcast an address mask reply. */ 14781 ipif_mask_reply(ipif); 14782 } 14783 /* Perhaps ilgs should use this ill */ 14784 update_conn_ill(NULL, ill->ill_ipst); 14785 14786 /* 14787 * This had to be deferred until we had bound. Tell routing sockets and 14788 * others that this interface is up if it looks like the address has 14789 * been validated. Otherwise, if it isn't ready yet, wait for 14790 * duplicate address detection to do its thing. 
14791 */ 14792 if (ipif->ipif_addr_ready) 14793 ipif_up_notify(ipif); 14794 return (0); 14795 } 14796 14797 /* 14798 * Add the IREs associated with the ipif. 14799 * Those MUST be explicitly removed in ipif_delete_ires_v4. 14800 */ 14801 static int 14802 ipif_add_ires_v4(ipif_t *ipif, boolean_t loopback) 14803 { 14804 ill_t *ill = ipif->ipif_ill; 14805 ip_stack_t *ipst = ill->ill_ipst; 14806 ire_t *ire_array[20]; 14807 ire_t **irep = ire_array; 14808 ire_t **irep1; 14809 ipaddr_t net_mask = 0; 14810 ipaddr_t subnet_mask, route_mask; 14811 int err; 14812 ire_t *ire_local = NULL; /* LOCAL or LOOPBACK */ 14813 ire_t *ire_if = NULL; 14814 uchar_t *gw; 14815 14816 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 14817 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 14818 /* 14819 * If we're on a labeled system then make sure that zone- 14820 * private addresses have proper remote host database entries. 14821 */ 14822 if (is_system_labeled() && 14823 ipif->ipif_ire_type != IRE_LOOPBACK && 14824 !tsol_check_interface_address(ipif)) 14825 return (EINVAL); 14826 14827 /* Register the source address for __sin6_src_id */ 14828 err = ip_srcid_insert(&ipif->ipif_v6lcl_addr, 14829 ipif->ipif_zoneid, ipst); 14830 if (err != 0) { 14831 ip0dbg(("ipif_add_ires: srcid_insert %d\n", err)); 14832 return (err); 14833 } 14834 14835 if (loopback) 14836 gw = (uchar_t *)&ipif->ipif_lcl_addr; 14837 else 14838 gw = NULL; 14839 14840 /* If the interface address is set, create the local IRE. */ 14841 ire_local = ire_create( 14842 (uchar_t *)&ipif->ipif_lcl_addr, /* dest address */ 14843 (uchar_t *)&ip_g_all_ones, /* mask */ 14844 gw, /* gateway */ 14845 ipif->ipif_ire_type, /* LOCAL or LOOPBACK */ 14846 ipif->ipif_ill, 14847 ipif->ipif_zoneid, 14848 ((ipif->ipif_flags & IPIF_PRIVATE) ? 14849 RTF_PRIVATE : 0) | RTF_KERNEL, 14850 NULL, 14851 ipst); 14852 ip1dbg(("ipif_add_ires: 0x%p creating IRE %p type 0x%x" 14853 " for 0x%x\n", (void *)ipif, (void *)ire_local, 14854 ipif->ipif_ire_type, 14855 ntohl(ipif->ipif_lcl_addr))); 14856 if (ire_local == NULL) { 14857 ip1dbg(("ipif_up_done: NULL ire_local\n")); 14858 err = ENOMEM; 14859 goto bad; 14860 } 14861 } else { 14862 ip1dbg(( 14863 "ipif_add_ires: not creating IRE %d for 0x%x: flags 0x%x\n", 14864 ipif->ipif_ire_type, 14865 ntohl(ipif->ipif_lcl_addr), 14866 (uint_t)ipif->ipif_flags)); 14867 } 14868 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 14869 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 14870 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 14871 } else { 14872 net_mask = htonl(IN_CLASSA_NET); /* fallback */ 14873 } 14874 14875 subnet_mask = ipif->ipif_net_mask; 14876 14877 /* 14878 * If mask was not specified, use natural netmask of 14879 * interface address. Also, store this mask back into the 14880 * ipif struct. 14881 */ 14882 if (subnet_mask == 0) { 14883 subnet_mask = net_mask; 14884 V4MASK_TO_V6(subnet_mask, ipif->ipif_v6net_mask); 14885 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 14886 ipif->ipif_v6subnet); 14887 } 14888 14889 /* Set up the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. 
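	 * For example (illustrative): a broadcast ipif at 10.0.0.5/24 gets a
	 * 10.0.0.0/24 interface route, while a point-to-point ipif gets a
	 * host route (route_mask == IP_HOST_MASK) to its ipif_pp_dst_addr.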
*/ 14890 if (!loopback && !(ipif->ipif_flags & IPIF_NOXMIT) && 14891 ipif->ipif_subnet != INADDR_ANY) { 14892 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 14893 14894 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 14895 route_mask = IP_HOST_MASK; 14896 } else { 14897 route_mask = subnet_mask; 14898 } 14899 14900 ip1dbg(("ipif_add_ires: ipif 0x%p ill 0x%p " 14901 "creating if IRE ill_net_type 0x%x for 0x%x\n", 14902 (void *)ipif, (void *)ill, ill->ill_net_type, 14903 ntohl(ipif->ipif_subnet))); 14904 ire_if = ire_create( 14905 (uchar_t *)&ipif->ipif_subnet, 14906 (uchar_t *)&route_mask, 14907 (uchar_t *)&ipif->ipif_lcl_addr, 14908 ill->ill_net_type, 14909 ill, 14910 ipif->ipif_zoneid, 14911 ((ipif->ipif_flags & IPIF_PRIVATE) ? 14912 RTF_PRIVATE: 0) | RTF_KERNEL, 14913 NULL, 14914 ipst); 14915 if (ire_if == NULL) { 14916 ip1dbg(("ipif_up_done: NULL ire_if\n")); 14917 err = ENOMEM; 14918 goto bad; 14919 } 14920 } 14921 14922 /* 14923 * Create any necessary broadcast IREs. 14924 */ 14925 if ((ipif->ipif_flags & IPIF_BROADCAST) && 14926 !(ipif->ipif_flags & IPIF_NOXMIT)) 14927 irep = ipif_create_bcast_ires(ipif, irep); 14928 14929 /* If an earlier ire_create failed, get out now */ 14930 for (irep1 = irep; irep1 > ire_array; ) { 14931 irep1--; 14932 if (*irep1 == NULL) { 14933 ip1dbg(("ipif_up_done: NULL ire found in ire_array\n")); 14934 err = ENOMEM; 14935 goto bad; 14936 } 14937 } 14938 14939 /* 14940 * Need to atomically check for IP address availability under 14941 * ip_addr_avail_lock. ill_g_lock is held as reader to ensure no new 14942 * ills or new ipifs can be added while we are checking availability. 14943 */ 14944 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 14945 mutex_enter(&ipst->ips_ip_addr_avail_lock); 14946 /* Mark it up, and increment counters. */ 14947 ipif->ipif_flags |= IPIF_UP; 14948 ill->ill_ipif_up_count++; 14949 err = ip_addr_availability_check(ipif); 14950 mutex_exit(&ipst->ips_ip_addr_avail_lock); 14951 rw_exit(&ipst->ips_ill_g_lock); 14952 14953 if (err != 0) { 14954 /* 14955 * Our address may already be up on the same ill. In this case, 14956 * the ARP entry for our ipif replaced the one for the other 14957 * ipif. So we don't want to delete it (otherwise the other ipif 14958 * would be unable to send packets). 14959 * ip_addr_availability_check() identifies this case for us and 14960 * returns EADDRINUSE; Caller should turn it into EADDRNOTAVAIL 14961 * which is the expected error code. 14962 */ 14963 ill->ill_ipif_up_count--; 14964 ipif->ipif_flags &= ~IPIF_UP; 14965 goto bad; 14966 } 14967 14968 /* 14969 * Add in all newly created IREs. ire_create_bcast() has 14970 * already checked for duplicates of the IRE_BROADCAST type. 14971 * We add the IRE_INTERFACE before the IRE_LOCAL to ensure 14972 * that lookups find the IRE_LOCAL even if the IRE_INTERFACE is 14973 * a /32 route. 
14974 */ 14975 if (ire_if != NULL) { 14976 ire_if = ire_add(ire_if); 14977 if (ire_if == NULL) { 14978 err = ENOMEM; 14979 goto bad2; 14980 } 14981 #ifdef DEBUG 14982 ire_refhold_notr(ire_if); 14983 ire_refrele(ire_if); 14984 #endif 14985 } 14986 if (ire_local != NULL) { 14987 ire_local = ire_add(ire_local); 14988 if (ire_local == NULL) { 14989 err = ENOMEM; 14990 goto bad2; 14991 } 14992 #ifdef DEBUG 14993 ire_refhold_notr(ire_local); 14994 ire_refrele(ire_local); 14995 #endif 14996 } 14997 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 14998 if (ire_local != NULL) 14999 ipif->ipif_ire_local = ire_local; 15000 if (ire_if != NULL) 15001 ipif->ipif_ire_if = ire_if; 15002 rw_exit(&ipst->ips_ill_g_lock); 15003 ire_local = NULL; 15004 ire_if = NULL; 15005 15006 /* 15007 * We first add all of them, and if that succeeds we refrele the 15008 * bunch. That enables us to delete all of them should any of the 15009 * ire_adds fail. 15010 */ 15011 for (irep1 = irep; irep1 > ire_array; ) { 15012 irep1--; 15013 ASSERT(!MUTEX_HELD(&((*irep1)->ire_ill->ill_lock))); 15014 *irep1 = ire_add(*irep1); 15015 if (*irep1 == NULL) { 15016 err = ENOMEM; 15017 goto bad2; 15018 } 15019 } 15020 15021 for (irep1 = irep; irep1 > ire_array; ) { 15022 irep1--; 15023 /* refheld by ire_add. */ 15024 if (*irep1 != NULL) { 15025 ire_refrele(*irep1); 15026 *irep1 = NULL; 15027 } 15028 } 15029 15030 if (!loopback) { 15031 /* 15032 * If the broadcast address has been set, make sure it makes 15033 * sense based on the interface address. 15034 * Only match on ill since we are sharing broadcast addresses. 15035 */ 15036 if ((ipif->ipif_brd_addr != INADDR_ANY) && 15037 (ipif->ipif_flags & IPIF_BROADCAST)) { 15038 ire_t *ire; 15039 15040 ire = ire_ftable_lookup_v4(ipif->ipif_brd_addr, 0, 0, 15041 IRE_BROADCAST, ipif->ipif_ill, ALL_ZONES, NULL, 15042 (MATCH_IRE_TYPE | MATCH_IRE_ILL), 0, ipst, NULL); 15043 15044 if (ire == NULL) { 15045 /* 15046 * If there isn't a matching broadcast IRE, 15047 * revert to the default for this netmask. 
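                 * (Editorial note: ipif_set_default() is expected to
                 * rederive the interface defaults, including the
                 * broadcast address, from the current address and
                 * netmask.)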
15048                  */
15049                 ipif->ipif_v6brd_addr = ipv6_all_zeros;
15050                 mutex_enter(&ipif->ipif_ill->ill_lock);
15051                 ipif_set_default(ipif);
15052                 mutex_exit(&ipif->ipif_ill->ill_lock);
15053             } else {
15054                 ire_refrele(ire);
15055             }
15056         }
15057 
15058     }
15059     return (0);
15060 
15061 bad2:
15062     ill->ill_ipif_up_count--;
15063     ipif->ipif_flags &= ~IPIF_UP;
15064 
15065 bad:
15066     ip1dbg(("ipif_add_ires: FAILED \n"));
15067     if (ire_local != NULL)
15068         ire_delete(ire_local);
15069     if (ire_if != NULL)
15070         ire_delete(ire_if);
15071 
15072     rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
15073     ire_local = ipif->ipif_ire_local;
15074     ipif->ipif_ire_local = NULL;
15075     ire_if = ipif->ipif_ire_if;
15076     ipif->ipif_ire_if = NULL;
15077     rw_exit(&ipst->ips_ill_g_lock);
15078     if (ire_local != NULL) {
15079         ire_delete(ire_local);
15080         ire_refrele_notr(ire_local);
15081     }
15082     if (ire_if != NULL) {
15083         ire_delete(ire_if);
15084         ire_refrele_notr(ire_if);
15085     }
15086 
15087     while (irep > ire_array) {
15088         irep--;
15089         if (*irep != NULL) {
15090             ire_delete(*irep);
15091         }
15092     }
15093     (void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst);
15094 
15095     return (err);
15096 }
15097 
15098 /* Remove all the IREs created by ipif_add_ires_v4 */
15099 void
15100 ipif_delete_ires_v4(ipif_t *ipif)
15101 {
15102     ill_t       *ill = ipif->ipif_ill;
15103     ip_stack_t  *ipst = ill->ill_ipst;
15104     ire_t       *ire;
15105 
15106     rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
15107     ire = ipif->ipif_ire_local;
15108     ipif->ipif_ire_local = NULL;
15109     rw_exit(&ipst->ips_ill_g_lock);
15110     if (ire != NULL) {
15111         /*
15112          * Move count to ipif so we don't lose the count due to
15113          * a down/up dance.
15114          */
15115         atomic_add_32(&ipif->ipif_ib_pkt_count, ire->ire_ib_pkt_count);
15116 
15117         ire_delete(ire);
15118         ire_refrele_notr(ire);
15119     }
15120     rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
15121     ire = ipif->ipif_ire_if;
15122     ipif->ipif_ire_if = NULL;
15123     rw_exit(&ipst->ips_ill_g_lock);
15124     if (ire != NULL) {
15125         ire_delete(ire);
15126         ire_refrele_notr(ire);
15127     }
15128 
15129     /*
15130      * Delete the broadcast IREs.
15131      */
15132     if ((ipif->ipif_flags & IPIF_BROADCAST) &&
15133         !(ipif->ipif_flags & IPIF_NOXMIT))
15134         ipif_delete_bcast_ires(ipif);
15135 }
15136 
15137 /*
15138  * Checks for availability of a usable source address (if there is one) when
15139  * the destination ILL has the ill_usesrc_ifindex pointing to another ILL.
15140  * Note this selection is done regardless of the destination.
15141  */
15142 boolean_t
15143 ipif_zone_avail(uint_t ifindex, boolean_t isv6, zoneid_t zoneid,
15144     ip_stack_t *ipst)
15145 {
15146     ipif_t  *ipif = NULL;
15147     ill_t   *uill;
15148 
15149     ASSERT(ifindex != 0);
15150 
15151     uill = ill_lookup_on_ifindex(ifindex, isv6, ipst);
15152     if (uill == NULL)
15153         return (B_FALSE);
15154 
15155     mutex_enter(&uill->ill_lock);
15156     for (ipif = uill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
15157         if (IPIF_IS_CONDEMNED(ipif))
15158             continue;
15159         if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
15160             continue;
15161         if (!(ipif->ipif_flags & IPIF_UP))
15162             continue;
15163         if (ipif->ipif_zoneid != zoneid)
15164             continue;
15165         if (isv6 ?
IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) : 15166 ipif->ipif_lcl_addr == INADDR_ANY) 15167 continue; 15168 mutex_exit(&uill->ill_lock); 15169 ill_refrele(uill); 15170 return (B_TRUE); 15171 } 15172 mutex_exit(&uill->ill_lock); 15173 ill_refrele(uill); 15174 return (B_FALSE); 15175 } 15176 15177 /* 15178 * Find an ipif with a good local address on the ill+zoneid. 15179 */ 15180 ipif_t * 15181 ipif_good_addr(ill_t *ill, zoneid_t zoneid) 15182 { 15183 ipif_t *ipif; 15184 15185 mutex_enter(&ill->ill_lock); 15186 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 15187 if (IPIF_IS_CONDEMNED(ipif)) 15188 continue; 15189 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 15190 continue; 15191 if (!(ipif->ipif_flags & IPIF_UP)) 15192 continue; 15193 if (ipif->ipif_zoneid != zoneid && 15194 ipif->ipif_zoneid != ALL_ZONES && zoneid != ALL_ZONES) 15195 continue; 15196 if (ill->ill_isv6 ? 15197 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) : 15198 ipif->ipif_lcl_addr == INADDR_ANY) 15199 continue; 15200 ipif_refhold_locked(ipif); 15201 mutex_exit(&ill->ill_lock); 15202 return (ipif); 15203 } 15204 mutex_exit(&ill->ill_lock); 15205 return (NULL); 15206 } 15207 15208 /* 15209 * IP source address type, sorted from worst to best. For a given type, 15210 * always prefer IP addresses on the same subnet. All-zones addresses are 15211 * suboptimal because they pose problems with unlabeled destinations. 15212 */ 15213 typedef enum { 15214 IPIF_NONE, 15215 IPIF_DIFFNET_DEPRECATED, /* deprecated and different subnet */ 15216 IPIF_SAMENET_DEPRECATED, /* deprecated and same subnet */ 15217 IPIF_DIFFNET_ALLZONES, /* allzones and different subnet */ 15218 IPIF_SAMENET_ALLZONES, /* allzones and same subnet */ 15219 IPIF_DIFFNET, /* normal and different subnet */ 15220 IPIF_SAMENET, /* normal and same subnet */ 15221 IPIF_LOCALADDR /* local loopback */ 15222 } ipif_type_t; 15223 15224 /* 15225 * Pick the optimal ipif on `ill' for sending to destination `dst' from zone 15226 * `zoneid'. We rate usable ipifs from low -> high as per the ipif_type_t 15227 * enumeration, and return the highest-rated ipif. If there's a tie, we pick 15228 * the first one, unless IPMP is used in which case we round-robin among them; 15229 * see below for more. 15230 * 15231 * Returns NULL if there is no suitable source address for the ill. 15232 * This only occurs when there is no valid source address for the ill. 15233 */ 15234 ipif_t * 15235 ipif_select_source_v4(ill_t *ill, ipaddr_t dst, zoneid_t zoneid, 15236 boolean_t allow_usesrc, boolean_t *notreadyp) 15237 { 15238 ill_t *usill = NULL; 15239 ill_t *ipmp_ill = NULL; 15240 ipif_t *start_ipif, *next_ipif, *ipif, *best_ipif; 15241 ipif_type_t type, best_type; 15242 tsol_tpc_t *src_rhtp, *dst_rhtp; 15243 ip_stack_t *ipst = ill->ill_ipst; 15244 boolean_t samenet; 15245 15246 if (ill->ill_usesrc_ifindex != 0 && allow_usesrc) { 15247 usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex, 15248 B_FALSE, ipst); 15249 if (usill != NULL) 15250 ill = usill; /* Select source from usesrc ILL */ 15251 else 15252 return (NULL); 15253 } 15254 15255 /* 15256 * Test addresses should never be used for source address selection, 15257 * so if we were passed one, switch to the IPMP meta-interface. 
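     * (Editorial note: an ill that IS_UNDER_IPMP() is an underlying
     * interface of an IPMP group; its addresses are test addresses, while
     * data addresses are hosted on the IPMP meta-interface.)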
15258 */ 15259 if (IS_UNDER_IPMP(ill)) { 15260 if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) 15261 ill = ipmp_ill; /* Select source from IPMP ill */ 15262 else 15263 return (NULL); 15264 } 15265 15266 /* 15267 * If we're dealing with an unlabeled destination on a labeled system, 15268 * make sure that we ignore source addresses that are incompatible with 15269 * the destination's default label. That destination's default label 15270 * must dominate the minimum label on the source address. 15271 */ 15272 dst_rhtp = NULL; 15273 if (is_system_labeled()) { 15274 dst_rhtp = find_tpc(&dst, IPV4_VERSION, B_FALSE); 15275 if (dst_rhtp == NULL) 15276 return (NULL); 15277 if (dst_rhtp->tpc_tp.host_type != UNLABELED) { 15278 TPC_RELE(dst_rhtp); 15279 dst_rhtp = NULL; 15280 } 15281 } 15282 15283 /* 15284 * Hold the ill_g_lock as reader. This makes sure that no ipif/ill 15285 * can be deleted. But an ipif/ill can get CONDEMNED any time. 15286 * After selecting the right ipif, under ill_lock make sure ipif is 15287 * not condemned, and increment refcnt. If ipif is CONDEMNED, 15288 * we retry. Inside the loop we still need to check for CONDEMNED, 15289 * but not under a lock. 15290 */ 15291 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 15292 retry: 15293 /* 15294 * For source address selection, we treat the ipif list as circular 15295 * and continue until we get back to where we started. This allows 15296 * IPMP to vary source address selection (which improves inbound load 15297 * spreading) by caching its last ending point and starting from 15298 * there. NOTE: we don't have to worry about ill_src_ipif changing 15299 * ills since that can't happen on the IPMP ill. 15300 */ 15301 start_ipif = ill->ill_ipif; 15302 if (IS_IPMP(ill) && ill->ill_src_ipif != NULL) 15303 start_ipif = ill->ill_src_ipif; 15304 15305 ipif = start_ipif; 15306 best_ipif = NULL; 15307 best_type = IPIF_NONE; 15308 do { 15309 if ((next_ipif = ipif->ipif_next) == NULL) 15310 next_ipif = ill->ill_ipif; 15311 15312 if (IPIF_IS_CONDEMNED(ipif)) 15313 continue; 15314 /* Always skip NOLOCAL and ANYCAST interfaces */ 15315 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 15316 continue; 15317 /* Always skip NOACCEPT interfaces */ 15318 if (ipif->ipif_ill->ill_flags & ILLF_NOACCEPT) 15319 continue; 15320 if (!(ipif->ipif_flags & IPIF_UP)) 15321 continue; 15322 15323 if (!ipif->ipif_addr_ready) { 15324 if (notreadyp != NULL) 15325 *notreadyp = B_TRUE; 15326 continue; 15327 } 15328 15329 if (zoneid != ALL_ZONES && 15330 ipif->ipif_zoneid != zoneid && 15331 ipif->ipif_zoneid != ALL_ZONES) 15332 continue; 15333 15334 /* 15335 * Interfaces with 0.0.0.0 address are allowed to be UP, but 15336 * are not valid as source addresses. 15337 */ 15338 if (ipif->ipif_lcl_addr == INADDR_ANY) 15339 continue; 15340 15341 /* 15342 * Check compatibility of local address for destination's 15343 * default label if we're on a labeled system. Incompatible 15344 * addresses can't be used at all. 
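         * (Editorial note, restating the checks below: the source address
         * must be SUN_CIPSO with the same DOI, and the destination's
         * default label must fall within the source's label range or its
         * explicit label set.)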
15345          */
15346         if (dst_rhtp != NULL) {
15347             boolean_t incompat;
15348 
15349             src_rhtp = find_tpc(&ipif->ipif_lcl_addr,
15350                 IPV4_VERSION, B_FALSE);
15351             if (src_rhtp == NULL)
15352                 continue;
15353             incompat = src_rhtp->tpc_tp.host_type != SUN_CIPSO ||
15354                 src_rhtp->tpc_tp.tp_doi !=
15355                 dst_rhtp->tpc_tp.tp_doi ||
15356                 (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label,
15357                 &src_rhtp->tpc_tp.tp_sl_range_cipso) &&
15358                 !blinlset(&dst_rhtp->tpc_tp.tp_def_label,
15359                 src_rhtp->tpc_tp.tp_sl_set_cipso));
15360             TPC_RELE(src_rhtp);
15361             if (incompat)
15362                 continue;
15363         }
15364 
15365         samenet = ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet);
15366 
15367         if (ipif->ipif_lcl_addr == dst) {
15368             type = IPIF_LOCALADDR;
15369         } else if (ipif->ipif_flags & IPIF_DEPRECATED) {
15370             type = samenet ? IPIF_SAMENET_DEPRECATED :
15371                 IPIF_DIFFNET_DEPRECATED;
15372         } else if (ipif->ipif_zoneid == ALL_ZONES) {
15373             type = samenet ? IPIF_SAMENET_ALLZONES :
15374                 IPIF_DIFFNET_ALLZONES;
15375         } else {
15376             type = samenet ? IPIF_SAMENET : IPIF_DIFFNET;
15377         }
15378 
15379         if (type > best_type) {
15380             best_type = type;
15381             best_ipif = ipif;
15382             if (best_type == IPIF_LOCALADDR)
15383                 break; /* can't get better */
15384         }
15385     } while ((ipif = next_ipif) != start_ipif);
15386 
15387     if ((ipif = best_ipif) != NULL) {
15388         mutex_enter(&ipif->ipif_ill->ill_lock);
15389         if (IPIF_IS_CONDEMNED(ipif)) {
15390             mutex_exit(&ipif->ipif_ill->ill_lock);
15391             goto retry;
15392         }
15393         ipif_refhold_locked(ipif);
15394 
15395         /*
15396          * For IPMP, update the source ipif rotor to the next ipif,
15397          * provided we can look it up. (We must not use it if it's
15398          * IPIF_CONDEMNED since we may have grabbed ill_g_lock after
15399          * ipif_free() checked ill_src_ipif.)
15400          */
15401         if (IS_IPMP(ill) && ipif != NULL) {
15402             next_ipif = ipif->ipif_next;
15403             if (next_ipif != NULL && !IPIF_IS_CONDEMNED(next_ipif))
15404                 ill->ill_src_ipif = next_ipif;
15405             else
15406                 ill->ill_src_ipif = NULL;
15407         }
15408         mutex_exit(&ipif->ipif_ill->ill_lock);
15409     }
15410 
15411     rw_exit(&ipst->ips_ill_g_lock);
15412     if (usill != NULL)
15413         ill_refrele(usill);
15414     if (ipmp_ill != NULL)
15415         ill_refrele(ipmp_ill);
15416     if (dst_rhtp != NULL)
15417         TPC_RELE(dst_rhtp);
15418 
15419 #ifdef DEBUG
15420     if (ipif == NULL) {
15421         char buf1[INET6_ADDRSTRLEN];
15422 
15423         ip1dbg(("ipif_select_source_v4(%s, %s) -> NULL\n",
15424             ill->ill_name,
15425             inet_ntop(AF_INET, &dst, buf1, sizeof (buf1))));
15426     } else {
15427         char buf1[INET6_ADDRSTRLEN];
15428         char buf2[INET6_ADDRSTRLEN];
15429 
15430         ip1dbg(("ipif_select_source_v4(%s, %s) -> %s\n",
15431             ipif->ipif_ill->ill_name,
15432             inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)),
15433             inet_ntop(AF_INET, &ipif->ipif_lcl_addr,
15434             buf2, sizeof (buf2))));
15435     }
15436 #endif /* DEBUG */
15437     return (ipif);
15438 }
15439 
15440 /*
15441  * Pick a source address based on the destination ill and an optional setsrc
15442  * address.
15443  * The result is stored in srcp. If generation is set, then put the source
15444  * generation number there before we look for the source address (to avoid
15445  * missing changes in the set of source addresses).
15446  * If flagsp is set, then use it to pass back ipif_flags.
15447  *
15448  * If the caller wants to cache the returned source address and detect when
15449  * that might be stale, the caller should pass in a generation argument,
15450  * which the caller can later compare against ips_src_generation.
15451  *
15452  * The precedence order for selecting an IPv4 source address is:
15453  * - RTF_SETSRC on the offlink ire always wins.
15454  * - If usesrc is set, swap the ill to be the usesrc one.
15455  * - If IPMP is used on the ill, select a random address from the most
15456  *   preferred ones below:
15457  * 1. If onlink destination, same subnet and not deprecated, not ALL_ZONES
15458  * 2. Not deprecated, not ALL_ZONES
15459  * 3. If onlink destination, same subnet and not deprecated, ALL_ZONES
15460  * 4. Not deprecated, ALL_ZONES
15461  * 5. If onlink destination, same subnet and deprecated
15462  * 6. Deprecated.
15463  *
15464  * We have lower preference for ALL_ZONES IP addresses,
15465  * as they pose problems with unlabeled destinations.
15466  *
15467  * Note that when multiple IP addresses match e.g., #1 we pick
15468  * the first one if IPMP is not in use. With IPMP we randomize.
15469  */
15470 int
15471 ip_select_source_v4(ill_t *ill, ipaddr_t setsrc, ipaddr_t dst,
15472     ipaddr_t multicast_ifaddr,
15473     zoneid_t zoneid, ip_stack_t *ipst, ipaddr_t *srcp,
15474     uint32_t *generation, uint64_t *flagsp)
15475 {
15476     ipif_t *ipif;
15477     boolean_t notready = B_FALSE;  /* Set if !ipif_addr_ready found */
15478 
15479     if (flagsp != NULL)
15480         *flagsp = 0;
15481 
15482     /*
15483      * Need to grab the generation number before we check to
15484      * avoid a race with a change to the set of local addresses.
15485      * No lock needed since the thread which updates the set of local
15486      * addresses uses ipif/ill locks and exits those (hence a store memory
15487      * barrier) before doing the atomic increase of ips_src_generation.
15488      */
15489     if (generation != NULL) {
15490         *generation = ipst->ips_src_generation;
15491     }
15492 
15493     if (CLASSD(dst) && multicast_ifaddr != INADDR_ANY) {
15494         *srcp = multicast_ifaddr;
15495         return (0);
15496     }
15497 
15498     /* Was RTF_SETSRC set on the first IRE in the recursive lookup? */
15499     if (setsrc != INADDR_ANY) {
15500         *srcp = setsrc;
15501         return (0);
15502     }
15503     ipif = ipif_select_source_v4(ill, dst, zoneid, B_TRUE, &notready);
15504     if (ipif == NULL) {
15505         if (notready)
15506             return (ENETDOWN);
15507         else
15508             return (EADDRNOTAVAIL);
15509     }
15510     *srcp = ipif->ipif_lcl_addr;
15511     if (flagsp != NULL)
15512         *flagsp = ipif->ipif_flags;
15513     ipif_refrele(ipif);
15514     return (0);
15515 }
15516 
15517 /* ARGSUSED */
15518 int
15519 if_unitsel_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
15520     ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
15521 {
15522     /*
15523      * ill_phyint_reinit merged the v4 and v6 into a single
15524      * ipsq. We might not have been able to complete the
15525      * operation in ipif_set_values, if we could not become
15526      * exclusive. If so restart it here.
15527      */
15528     return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q));
15529 }
15530 
15531 /*
15532  * Can operate on either a module or a driver queue.
15533  * Returns an error if not a module queue.
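 * (Editorial note: IF_UNITSEL names the stream below IP, so it is only
 * meaningful when IP is pushed as a module over a driver; hence the
 * q_next check below.)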
15534 */ 15535 /* ARGSUSED */ 15536 int 15537 if_unitsel(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 15538 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 15539 { 15540 queue_t *q1 = q; 15541 char *cp; 15542 char interf_name[LIFNAMSIZ]; 15543 uint_t ppa = *(uint_t *)mp->b_cont->b_cont->b_rptr; 15544 15545 if (q->q_next == NULL) { 15546 ip1dbg(( 15547 "if_unitsel: IF_UNITSEL: no q_next\n")); 15548 return (EINVAL); 15549 } 15550 15551 if (((ill_t *)(q->q_ptr))->ill_name[0] != '\0') 15552 return (EALREADY); 15553 15554 do { 15555 q1 = q1->q_next; 15556 } while (q1->q_next); 15557 cp = q1->q_qinfo->qi_minfo->mi_idname; 15558 (void) sprintf(interf_name, "%s%d", cp, ppa); 15559 15560 /* 15561 * Here we are not going to delay the ioack until after 15562 * ACKs from DL_ATTACH_REQ/DL_BIND_REQ. So no need to save the 15563 * original ioctl message before sending the requests. 15564 */ 15565 return (ipif_set_values(q, mp, interf_name, &ppa)); 15566 } 15567 15568 /* ARGSUSED */ 15569 int 15570 ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 15571 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 15572 { 15573 return (ENXIO); 15574 } 15575 15576 /* 15577 * Create any IRE_BROADCAST entries for `ipif', and store those entries in 15578 * `irep'. Returns a pointer to the next free `irep' entry 15579 * A mirror exists in ipif_delete_bcast_ires(). 15580 * 15581 * The management of any "extra" or seemingly duplicate IRE_BROADCASTs is 15582 * done in ire_add. 15583 */ 15584 static ire_t ** 15585 ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep) 15586 { 15587 ipaddr_t addr; 15588 ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr); 15589 ipaddr_t subnetmask = ipif->ipif_net_mask; 15590 ill_t *ill = ipif->ipif_ill; 15591 zoneid_t zoneid = ipif->ipif_zoneid; 15592 15593 ip1dbg(("ipif_create_bcast_ires: creating broadcast IREs\n")); 15594 15595 ASSERT(ipif->ipif_flags & IPIF_BROADCAST); 15596 ASSERT(!(ipif->ipif_flags & IPIF_NOXMIT)); 15597 15598 if (ipif->ipif_lcl_addr == INADDR_ANY || 15599 (ipif->ipif_flags & IPIF_NOLOCAL)) 15600 netmask = htonl(IN_CLASSA_NET); /* fallback */ 15601 15602 irep = ire_create_bcast(ill, 0, zoneid, irep); 15603 irep = ire_create_bcast(ill, INADDR_BROADCAST, zoneid, irep); 15604 15605 /* 15606 * For backward compatibility, we create net broadcast IREs based on 15607 * the old "IP address class system", since some old machines only 15608 * respond to these class derived net broadcast. However, we must not 15609 * create these net broadcast IREs if the subnetmask is shorter than 15610 * the IP address class based derived netmask. Otherwise, we may 15611 * create a net broadcast address which is the same as an IP address 15612 * on the subnet -- and then TCP will refuse to talk to that address. 15613 */ 15614 if (netmask < subnetmask) { 15615 addr = netmask & ipif->ipif_subnet; 15616 irep = ire_create_bcast(ill, addr, zoneid, irep); 15617 irep = ire_create_bcast(ill, ~netmask | addr, zoneid, irep); 15618 } 15619 15620 /* 15621 * Don't create IRE_BROADCAST IREs for the interface if the subnetmask 15622 * is 0xFFFFFFFF, as an IRE_LOCAL for that interface is already 15623 * created. Creating these broadcast IREs will only create confusion 15624 * as `addr' will be the same as the IP address. 
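     * For illustration (editorial example, not original source): a
     * 10.1.2.3 address with subnetmask 255.255.255.0 ends up with
     * broadcast IREs for 0.0.0.0 and 255.255.255.255, the class A net
     * broadcasts 10.0.0.0 and 10.255.255.255 (255.0.0.0 being shorter
     * than the subnetmask), and the subnet broadcasts 10.1.2.0 and
     * 10.1.2.255 created below.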
15625 */ 15626 if (subnetmask != 0xFFFFFFFF) { 15627 addr = ipif->ipif_subnet; 15628 irep = ire_create_bcast(ill, addr, zoneid, irep); 15629 irep = ire_create_bcast(ill, ~subnetmask | addr, zoneid, irep); 15630 } 15631 15632 return (irep); 15633 } 15634 15635 /* 15636 * Mirror of ipif_create_bcast_ires() 15637 */ 15638 static void 15639 ipif_delete_bcast_ires(ipif_t *ipif) 15640 { 15641 ipaddr_t addr; 15642 ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr); 15643 ipaddr_t subnetmask = ipif->ipif_net_mask; 15644 ill_t *ill = ipif->ipif_ill; 15645 zoneid_t zoneid = ipif->ipif_zoneid; 15646 ire_t *ire; 15647 15648 ASSERT(ipif->ipif_flags & IPIF_BROADCAST); 15649 ASSERT(!(ipif->ipif_flags & IPIF_NOXMIT)); 15650 15651 if (ipif->ipif_lcl_addr == INADDR_ANY || 15652 (ipif->ipif_flags & IPIF_NOLOCAL)) 15653 netmask = htonl(IN_CLASSA_NET); /* fallback */ 15654 15655 ire = ire_lookup_bcast(ill, 0, zoneid); 15656 ASSERT(ire != NULL); 15657 ire_delete(ire); ire_refrele(ire); 15658 ire = ire_lookup_bcast(ill, INADDR_BROADCAST, zoneid); 15659 ASSERT(ire != NULL); 15660 ire_delete(ire); ire_refrele(ire); 15661 15662 /* 15663 * For backward compatibility, we create net broadcast IREs based on 15664 * the old "IP address class system", since some old machines only 15665 * respond to these class derived net broadcast. However, we must not 15666 * create these net broadcast IREs if the subnetmask is shorter than 15667 * the IP address class based derived netmask. Otherwise, we may 15668 * create a net broadcast address which is the same as an IP address 15669 * on the subnet -- and then TCP will refuse to talk to that address. 15670 */ 15671 if (netmask < subnetmask) { 15672 addr = netmask & ipif->ipif_subnet; 15673 ire = ire_lookup_bcast(ill, addr, zoneid); 15674 ASSERT(ire != NULL); 15675 ire_delete(ire); ire_refrele(ire); 15676 ire = ire_lookup_bcast(ill, ~netmask | addr, zoneid); 15677 ASSERT(ire != NULL); 15678 ire_delete(ire); ire_refrele(ire); 15679 } 15680 15681 /* 15682 * Don't create IRE_BROADCAST IREs for the interface if the subnetmask 15683 * is 0xFFFFFFFF, as an IRE_LOCAL for that interface is already 15684 * created. Creating these broadcast IREs will only create confusion 15685 * as `addr' will be the same as the IP address. 15686 */ 15687 if (subnetmask != 0xFFFFFFFF) { 15688 addr = ipif->ipif_subnet; 15689 ire = ire_lookup_bcast(ill, addr, zoneid); 15690 ASSERT(ire != NULL); 15691 ire_delete(ire); ire_refrele(ire); 15692 ire = ire_lookup_bcast(ill, ~subnetmask | addr, zoneid); 15693 ASSERT(ire != NULL); 15694 ire_delete(ire); ire_refrele(ire); 15695 } 15696 } 15697 15698 /* 15699 * Extract both the flags (including IFF_CANTCHANGE) such as IFF_IPV* 15700 * from lifr_flags and the name from lifr_name. 15701 * Set IFF_IPV* and ill_isv6 prior to doing the lookup 15702 * since ipif_lookup_on_name uses the _isv6 flags when matching. 15703 * Returns EINPROGRESS when mp has been consumed by queueing it on 15704 * ipx_pending_mp and the ioctl will complete in ip_rput. 15705 * 15706 * Can operate on either a module or a driver queue. 15707 * Returns an error if not a module queue. 
15708  */
15709 /* ARGSUSED */
15710 int
15711 ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
15712     ip_ioctl_cmd_t *ipip, void *if_req)
15713 {
15714     ill_t       *ill = q->q_ptr;
15715     phyint_t    *phyi;
15716     ip_stack_t  *ipst;
15717     struct lifreq   *lifr = if_req;
15718     uint64_t    new_flags;
15719 
15720     ASSERT(ipif != NULL);
15721     ip1dbg(("ip_sioctl_slifname %s\n", lifr->lifr_name));
15722 
15723     if (q->q_next == NULL) {
15724         ip1dbg(("if_sioctl_slifname: SIOCSLIFNAME: no q_next\n"));
15725         return (EINVAL);
15726     }
15727 
15728     /*
15729      * If we are not writer on 'q' then this interface exists already
15730      * and previous lookups (ip_extract_lifreq()) found this ipif --
15731      * so return EALREADY.
15732      */
15733     if (ill != ipif->ipif_ill)
15734         return (EALREADY);
15735 
15736     if (ill->ill_name[0] != '\0')
15737         return (EALREADY);
15738 
15739     /*
15740      * If there's another ill already with the requested name, ensure
15741      * that it's of the same type. Otherwise, ill_phyint_reinit() will
15742      * fuse together two unrelated ills, which will cause chaos.
15743      */
15744     ipst = ill->ill_ipst;
15745     phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
15746         lifr->lifr_name, NULL);
15747     if (phyi != NULL) {
15748         ill_t *ill_mate = phyi->phyint_illv4;
15749 
15750         if (ill_mate == NULL)
15751             ill_mate = phyi->phyint_illv6;
15752         ASSERT(ill_mate != NULL);
15753 
15754         if (ill_mate->ill_media->ip_m_mac_type !=
15755             ill->ill_media->ip_m_mac_type) {
15756             ip1dbg(("if_sioctl_slifname: SIOCSLIFNAME: attempt to "
15757                 "use the same ill name on differing media\n"));
15758             return (EINVAL);
15759         }
15760     }
15761 
15762     /*
15763      * We start off as IFF_IPV4 in ipif_allocate and become
15764      * IFF_IPV4 or IFF_IPV6 here depending on lifr_flags value.
15765      * The only flags that we read from user space are IFF_IPV4,
15766      * IFF_IPV6, and IFF_BROADCAST.
15767      *
15768      * This ill has not been inserted into the global list,
15769      * so we are still single threaded and don't need any lock.
15770      *
15771      * Sanity check the flags.
15772      */
15773 
15774     if ((lifr->lifr_flags & IFF_BROADCAST) &&
15775         ((lifr->lifr_flags & IFF_IPV6) ||
15776         (!ill->ill_needs_attach && ill->ill_bcast_addr_length == 0))) {
15777         ip1dbg(("ip_sioctl_slifname: link not broadcast capable "
15778             "or IPv6 i.e., no broadcast \n"));
15779         return (EINVAL);
15780     }
15781 
15782     new_flags =
15783         lifr->lifr_flags & (IFF_IPV6|IFF_IPV4|IFF_BROADCAST);
15784 
15785     if ((new_flags ^ (IFF_IPV6|IFF_IPV4)) == 0) {
15786         ip1dbg(("ip_sioctl_slifname: flags must be exactly one of "
15787             "IFF_IPV4 or IFF_IPV6\n"));
15788         return (EINVAL);
15789     }
15790 
15791     /*
15792      * We always start off as IPv4, so only need to check for IPv6.
15793      */
15794     if ((new_flags & IFF_IPV6) != 0) {
15795         ill->ill_flags |= ILLF_IPV6;
15796         ill->ill_flags &= ~ILLF_IPV4;
15797 
15798         if (lifr->lifr_flags & IFF_NOLINKLOCAL)
15799             ill->ill_flags |= ILLF_NOLINKLOCAL;
15800     }
15801 
15802     if ((new_flags & IFF_BROADCAST) != 0)
15803         ipif->ipif_flags |= IPIF_BROADCAST;
15804     else
15805         ipif->ipif_flags &= ~IPIF_BROADCAST;
15806 
15807     /* We started off as V4.
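     * If ILLF_IPV6 was selected above, rehome the ill from the phyint's
     * v4 slot to its v6 slot.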
*/ 15808 if (ill->ill_flags & ILLF_IPV6) { 15809 ill->ill_phyint->phyint_illv6 = ill; 15810 ill->ill_phyint->phyint_illv4 = NULL; 15811 } 15812 15813 return (ipif_set_values(q, mp, lifr->lifr_name, &lifr->lifr_ppa)); 15814 } 15815 15816 /* ARGSUSED */ 15817 int 15818 ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15819 ip_ioctl_cmd_t *ipip, void *if_req) 15820 { 15821 /* 15822 * ill_phyint_reinit merged the v4 and v6 into a single 15823 * ipsq. We might not have been able to complete the 15824 * slifname in ipif_set_values, if we could not become 15825 * exclusive. If so restart it here 15826 */ 15827 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q)); 15828 } 15829 15830 /* 15831 * Return a pointer to the ipif which matches the index, IP version type and 15832 * zoneid. 15833 */ 15834 ipif_t * 15835 ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid, 15836 ip_stack_t *ipst) 15837 { 15838 ill_t *ill; 15839 ipif_t *ipif = NULL; 15840 15841 ill = ill_lookup_on_ifindex(index, isv6, ipst); 15842 if (ill != NULL) { 15843 mutex_enter(&ill->ill_lock); 15844 for (ipif = ill->ill_ipif; ipif != NULL; 15845 ipif = ipif->ipif_next) { 15846 if (!IPIF_IS_CONDEMNED(ipif) && (zoneid == ALL_ZONES || 15847 zoneid == ipif->ipif_zoneid || 15848 ipif->ipif_zoneid == ALL_ZONES)) { 15849 ipif_refhold_locked(ipif); 15850 break; 15851 } 15852 } 15853 mutex_exit(&ill->ill_lock); 15854 ill_refrele(ill); 15855 } 15856 return (ipif); 15857 } 15858 15859 /* 15860 * Change an existing physical interface's index. If the new index 15861 * is acceptable we update the index and the phyint_list_avl_by_index tree. 15862 * Finally, we update other systems which may have a dependence on the 15863 * index value. 15864 */ 15865 /* ARGSUSED */ 15866 int 15867 ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15868 ip_ioctl_cmd_t *ipip, void *ifreq) 15869 { 15870 ill_t *ill; 15871 phyint_t *phyi; 15872 struct ifreq *ifr = (struct ifreq *)ifreq; 15873 struct lifreq *lifr = (struct lifreq *)ifreq; 15874 uint_t old_index, index; 15875 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 15876 avl_index_t where; 15877 15878 if (ipip->ipi_cmd_type == IF_CMD) 15879 index = ifr->ifr_index; 15880 else 15881 index = lifr->lifr_index; 15882 15883 /* 15884 * Only allow on physical interface. Also, index zero is illegal. 15885 */ 15886 ill = ipif->ipif_ill; 15887 phyi = ill->ill_phyint; 15888 if (ipif->ipif_id != 0 || index == 0) { 15889 return (EINVAL); 15890 } 15891 15892 /* If the index is not changing, no work to do */ 15893 if (phyi->phyint_ifindex == index) 15894 return (0); 15895 15896 /* 15897 * Use phyint_exists() to determine if the new interface index 15898 * is already in use. If the index is unused then we need to 15899 * change the phyint's position in the phyint_list_avl_by_index 15900 * tree. If we do not do this, subsequent lookups (using the new 15901 * index value) will not find the phyint. 15902 */ 15903 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 15904 if (phyint_exists(index, ipst)) { 15905 rw_exit(&ipst->ips_ill_g_lock); 15906 return (EEXIST); 15907 } 15908 15909 /* 15910 * The new index is unused. Set it in the phyint. However we must not 15911 * forget to trigger NE_IFINDEX_CHANGE event before the ifindex 15912 * changes. The event must be bound to old ifindex value. 
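     * (Editorial note: the dispatch below happens while phyint_ifindex
     * still holds the old value; the new index travels in the event
     * payload.)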
15913 */ 15914 ill_nic_event_dispatch(ill, 0, NE_IFINDEX_CHANGE, 15915 &index, sizeof (index)); 15916 15917 old_index = phyi->phyint_ifindex; 15918 phyi->phyint_ifindex = index; 15919 15920 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, phyi); 15921 (void) avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 15922 &index, &where); 15923 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 15924 phyi, where); 15925 rw_exit(&ipst->ips_ill_g_lock); 15926 15927 /* Update SCTP's ILL list */ 15928 sctp_ill_reindex(ill, old_index); 15929 15930 /* Send the routing sockets message */ 15931 ip_rts_ifmsg(ipif, RTSQ_DEFAULT); 15932 if (ILL_OTHER(ill)) 15933 ip_rts_ifmsg(ILL_OTHER(ill)->ill_ipif, RTSQ_DEFAULT); 15934 15935 /* Perhaps ilgs should use this ill */ 15936 update_conn_ill(NULL, ill->ill_ipst); 15937 return (0); 15938 } 15939 15940 /* ARGSUSED */ 15941 int 15942 ip_sioctl_get_lifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15943 ip_ioctl_cmd_t *ipip, void *ifreq) 15944 { 15945 struct ifreq *ifr = (struct ifreq *)ifreq; 15946 struct lifreq *lifr = (struct lifreq *)ifreq; 15947 15948 ip1dbg(("ip_sioctl_get_lifindex(%s:%u %p)\n", 15949 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 15950 /* Get the interface index */ 15951 if (ipip->ipi_cmd_type == IF_CMD) { 15952 ifr->ifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 15953 } else { 15954 lifr->lifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 15955 } 15956 return (0); 15957 } 15958 15959 /* ARGSUSED */ 15960 int 15961 ip_sioctl_get_lifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15962 ip_ioctl_cmd_t *ipip, void *ifreq) 15963 { 15964 struct lifreq *lifr = (struct lifreq *)ifreq; 15965 15966 ip1dbg(("ip_sioctl_get_lifzone(%s:%u %p)\n", 15967 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 15968 /* Get the interface zone */ 15969 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 15970 lifr->lifr_zoneid = ipif->ipif_zoneid; 15971 return (0); 15972 } 15973 15974 /* 15975 * Set the zoneid of an interface. 15976 */ 15977 /* ARGSUSED */ 15978 int 15979 ip_sioctl_slifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15980 ip_ioctl_cmd_t *ipip, void *ifreq) 15981 { 15982 struct lifreq *lifr = (struct lifreq *)ifreq; 15983 int err = 0; 15984 boolean_t need_up = B_FALSE; 15985 zone_t *zptr; 15986 zone_status_t status; 15987 zoneid_t zoneid; 15988 15989 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 15990 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) { 15991 if (!is_system_labeled()) 15992 return (ENOTSUP); 15993 zoneid = GLOBAL_ZONEID; 15994 } 15995 15996 /* cannot assign instance zero to a non-global zone */ 15997 if (ipif->ipif_id == 0 && zoneid != GLOBAL_ZONEID) 15998 return (ENOTSUP); 15999 16000 /* 16001 * Cannot assign to a zone that doesn't exist or is shutting down. In 16002 * the event of a race with the zone shutdown processing, since IP 16003 * serializes this ioctl and SIOCGLIFCONF/SIOCLIFREMOVEIF, we know the 16004 * interface will be cleaned up even if the zone is shut down 16005 * immediately after the status check. If the interface can't be brought 16006 * down right away, and the zone is shut down before the restart 16007 * function is called, we resolve the possible races by rechecking the 16008 * zone status in the restart function. 
16009 */ 16010 if ((zptr = zone_find_by_id(zoneid)) == NULL) 16011 return (EINVAL); 16012 status = zone_status_get(zptr); 16013 zone_rele(zptr); 16014 16015 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) 16016 return (EINVAL); 16017 16018 if (ipif->ipif_flags & IPIF_UP) { 16019 /* 16020 * If the interface is already marked up, 16021 * we call ipif_down which will take care 16022 * of ditching any IREs that have been set 16023 * up based on the old interface address. 16024 */ 16025 err = ipif_logical_down(ipif, q, mp); 16026 if (err == EINPROGRESS) 16027 return (err); 16028 (void) ipif_down_tail(ipif); 16029 need_up = B_TRUE; 16030 } 16031 16032 err = ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, need_up); 16033 return (err); 16034 } 16035 16036 static int 16037 ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid, 16038 queue_t *q, mblk_t *mp, boolean_t need_up) 16039 { 16040 int err = 0; 16041 ip_stack_t *ipst; 16042 16043 ip1dbg(("ip_sioctl_zoneid_tail(%s:%u %p)\n", 16044 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 16045 16046 if (CONN_Q(q)) 16047 ipst = CONNQ_TO_IPST(q); 16048 else 16049 ipst = ILLQ_TO_IPST(q); 16050 16051 /* 16052 * For exclusive stacks we don't allow a different zoneid than 16053 * global. 16054 */ 16055 if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID && 16056 zoneid != GLOBAL_ZONEID) 16057 return (EINVAL); 16058 16059 /* Set the new zone id. */ 16060 ipif->ipif_zoneid = zoneid; 16061 16062 /* Update sctp list */ 16063 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 16064 16065 /* The default multicast interface might have changed */ 16066 ire_increment_multicast_generation(ipst, ipif->ipif_ill->ill_isv6); 16067 16068 if (need_up) { 16069 /* 16070 * Now bring the interface back up. If this 16071 * is the only IPIF for the ILL, ipif_up 16072 * will have to re-bind to the device, so 16073 * we may get back EINPROGRESS, in which 16074 * case, this IOCTL will get completed in 16075 * ip_rput_dlpi when we see the DL_BIND_ACK. 16076 */ 16077 err = ipif_up(ipif, q, mp); 16078 } 16079 return (err); 16080 } 16081 16082 /* ARGSUSED */ 16083 int 16084 ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 16085 ip_ioctl_cmd_t *ipip, void *if_req) 16086 { 16087 struct lifreq *lifr = (struct lifreq *)if_req; 16088 zoneid_t zoneid; 16089 zone_t *zptr; 16090 zone_status_t status; 16091 16092 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 16093 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) 16094 zoneid = GLOBAL_ZONEID; 16095 16096 ip1dbg(("ip_sioctl_slifzone_restart(%s:%u %p)\n", 16097 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 16098 16099 /* 16100 * We recheck the zone status to resolve the following race condition: 16101 * 1) process sends SIOCSLIFZONE to put hme0:1 in zone "myzone"; 16102 * 2) hme0:1 is up and can't be brought down right away; 16103 * ip_sioctl_slifzone() returns EINPROGRESS and the request is queued; 16104 * 3) zone "myzone" is halted; the zone status switches to 16105 * 'shutting_down' and the zones framework sends SIOCGLIFCONF to list 16106 * the interfaces to remove - hme0:1 is not returned because it's not 16107 * yet in "myzone", so it won't be removed; 16108 * 4) the restart function for SIOCSLIFZONE is called; without the 16109 * status check here, we would have hme0:1 in "myzone" after it's been 16110 * destroyed. 
16111  * Note that if the status check fails, we need to bring the interface
16112  * back to its state prior to ip_sioctl_slifzone(), hence the call to
16113  * ipif_up_done[_v6]().
16114  */
16115     status = ZONE_IS_UNINITIALIZED;
16116     if ((zptr = zone_find_by_id(zoneid)) != NULL) {
16117         status = zone_status_get(zptr);
16118         zone_rele(zptr);
16119     }
16120     if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) {
16121         if (ipif->ipif_isv6) {
16122             (void) ipif_up_done_v6(ipif);
16123         } else {
16124             (void) ipif_up_done(ipif);
16125         }
16126         return (EINVAL);
16127     }
16128 
16129     (void) ipif_down_tail(ipif);
16130 
16131     return (ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp,
16132         B_TRUE));
16133 }
16134 
16135 /*
16136  * Return the number of addresses on `ill' with one or more of the values
16137  * in `set' set and all of the values in `clear' clear.
16138  */
16139 static uint_t
16140 ill_flagaddr_cnt(const ill_t *ill, uint64_t set, uint64_t clear)
16141 {
16142     ipif_t  *ipif;
16143     uint_t  cnt = 0;
16144 
16145     ASSERT(IAM_WRITER_ILL(ill));
16146 
16147     for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
16148         if ((ipif->ipif_flags & set) && !(ipif->ipif_flags & clear))
16149             cnt++;
16150 
16151     return (cnt);
16152 }
16153 
16154 /*
16155  * Return the number of migratable addresses on `ill' that are under
16156  * application control.
16157  */
16158 uint_t
16159 ill_appaddr_cnt(const ill_t *ill)
16160 {
16161     return (ill_flagaddr_cnt(ill, IPIF_DHCPRUNNING | IPIF_ADDRCONF,
16162         IPIF_NOFAILOVER));
16163 }
16164 
16165 /*
16166  * Return the number of point-to-point addresses on `ill'.
16167  */
16168 uint_t
16169 ill_ptpaddr_cnt(const ill_t *ill)
16170 {
16171     return (ill_flagaddr_cnt(ill, IPIF_POINTOPOINT, 0));
16172 }
16173 
16174 /* ARGSUSED */
16175 int
16176 ip_sioctl_get_lifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
16177     ip_ioctl_cmd_t *ipip, void *ifreq)
16178 {
16179     struct lifreq   *lifr = ifreq;
16180 
16181     ASSERT(q->q_next == NULL);
16182     ASSERT(CONN_Q(q));
16183 
16184     ip1dbg(("ip_sioctl_get_lifusesrc(%s:%u %p)\n",
16185         ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
16186     lifr->lifr_index = ipif->ipif_ill->ill_usesrc_ifindex;
16187     ip1dbg(("ip_sioctl_get_lifusesrc:lifr_index = %d\n", lifr->lifr_index));
16188 
16189     return (0);
16190 }
16191 
16192 /* Find the previous ILL in this usesrc group */
16193 static ill_t *
16194 ill_prev_usesrc(ill_t *uill)
16195 {
16196     ill_t *ill;
16197 
16198     for (ill = uill->ill_usesrc_grp_next;
16199         ASSERT(ill), ill->ill_usesrc_grp_next != uill;
16200         ill = ill->ill_usesrc_grp_next)
16201         /* do nothing */;
16202     return (ill);
16203 }
16204 
16205 /*
16206  * Release all members of the usesrc group. This routine is called
16207  * from ill_delete when the interface being unplumbed is the
16208  * group head.
16209  *
16210  * This silently clears the usesrc that ifconfig set up.
16211  * An alternative would be to keep that ifindex, and drop packets on the floor
16212  * since no source address can be selected.
16213  * Even if we keep the current semantics, we don't need a lock and a linked
16214  * list: we can walk all the ills, checking if they have an ill_usesrc_ifindex
16215  * matching the one that is being removed. The issue is how we would return
16216  * the usesrc users (SIOCGLIFSRCOF): we want to be able to find the ills which
16217  * have an ill_usesrc_ifindex matching a target ill. We could also do that
16218  * with an ill walk, but the walker would need to insert in the ioctl response.
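 * (Editorial note: the group is a circular singly-linked list threaded
 * through ill_usesrc_grp_next, with the usesrc ill itself -- the one
 * whose ill_usesrc_ifindex is 0 -- serving as the head.)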
16219 */ 16220 static void 16221 ill_disband_usesrc_group(ill_t *uill) 16222 { 16223 ill_t *next_ill, *tmp_ill; 16224 ip_stack_t *ipst = uill->ill_ipst; 16225 16226 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock)); 16227 next_ill = uill->ill_usesrc_grp_next; 16228 16229 do { 16230 ASSERT(next_ill != NULL); 16231 tmp_ill = next_ill->ill_usesrc_grp_next; 16232 ASSERT(tmp_ill != NULL); 16233 next_ill->ill_usesrc_grp_next = NULL; 16234 next_ill->ill_usesrc_ifindex = 0; 16235 next_ill = tmp_ill; 16236 } while (next_ill->ill_usesrc_ifindex != 0); 16237 uill->ill_usesrc_grp_next = NULL; 16238 } 16239 16240 /* 16241 * Remove the client usesrc ILL from the list and relink to a new list 16242 */ 16243 int 16244 ill_relink_usesrc_ills(ill_t *ucill, ill_t *uill, uint_t ifindex) 16245 { 16246 ill_t *ill, *tmp_ill; 16247 ip_stack_t *ipst = ucill->ill_ipst; 16248 16249 ASSERT((ucill != NULL) && (ucill->ill_usesrc_grp_next != NULL) && 16250 (uill != NULL) && RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock)); 16251 16252 /* 16253 * Check if the usesrc client ILL passed in is not already 16254 * in use as a usesrc ILL i.e one whose source address is 16255 * in use OR a usesrc ILL is not already in use as a usesrc 16256 * client ILL 16257 */ 16258 if ((ucill->ill_usesrc_ifindex == 0) || 16259 (uill->ill_usesrc_ifindex != 0)) { 16260 return (-1); 16261 } 16262 16263 ill = ill_prev_usesrc(ucill); 16264 ASSERT(ill->ill_usesrc_grp_next != NULL); 16265 16266 /* Remove from the current list */ 16267 if (ill->ill_usesrc_grp_next->ill_usesrc_grp_next == ill) { 16268 /* Only two elements in the list */ 16269 ASSERT(ill->ill_usesrc_ifindex == 0); 16270 ill->ill_usesrc_grp_next = NULL; 16271 } else { 16272 ill->ill_usesrc_grp_next = ucill->ill_usesrc_grp_next; 16273 } 16274 16275 if (ifindex == 0) { 16276 ucill->ill_usesrc_ifindex = 0; 16277 ucill->ill_usesrc_grp_next = NULL; 16278 return (0); 16279 } 16280 16281 ucill->ill_usesrc_ifindex = ifindex; 16282 tmp_ill = uill->ill_usesrc_grp_next; 16283 uill->ill_usesrc_grp_next = ucill; 16284 ucill->ill_usesrc_grp_next = 16285 (tmp_ill != NULL) ? tmp_ill : uill; 16286 return (0); 16287 } 16288 16289 /* 16290 * Set the ill_usesrc and ill_usesrc_head fields. See synchronization notes in 16291 * ip.c for locking details. 
16292 */ 16293 /* ARGSUSED */ 16294 int 16295 ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 16296 ip_ioctl_cmd_t *ipip, void *ifreq) 16297 { 16298 struct lifreq *lifr = (struct lifreq *)ifreq; 16299 boolean_t isv6 = B_FALSE, reset_flg = B_FALSE; 16300 ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill; 16301 int err = 0, ret; 16302 uint_t ifindex; 16303 ipsq_t *ipsq = NULL; 16304 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 16305 16306 ASSERT(IAM_WRITER_IPIF(ipif)); 16307 ASSERT(q->q_next == NULL); 16308 ASSERT(CONN_Q(q)); 16309 16310 isv6 = (Q_TO_CONN(q))->conn_family == AF_INET6; 16311 16312 ifindex = lifr->lifr_index; 16313 if (ifindex == 0) { 16314 if (usesrc_cli_ill->ill_usesrc_grp_next == NULL) { 16315 /* non usesrc group interface, nothing to reset */ 16316 return (0); 16317 } 16318 ifindex = usesrc_cli_ill->ill_usesrc_ifindex; 16319 /* valid reset request */ 16320 reset_flg = B_TRUE; 16321 } 16322 16323 usesrc_ill = ill_lookup_on_ifindex(ifindex, isv6, ipst); 16324 if (usesrc_ill == NULL) 16325 return (ENXIO); 16326 if (usesrc_ill == ipif->ipif_ill) { 16327 ill_refrele(usesrc_ill); 16328 return (EINVAL); 16329 } 16330 16331 ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl, 16332 NEW_OP, B_TRUE); 16333 if (ipsq == NULL) { 16334 err = EINPROGRESS; 16335 /* Operation enqueued on the ipsq of the usesrc ILL */ 16336 goto done; 16337 } 16338 16339 /* USESRC isn't currently supported with IPMP */ 16340 if (IS_IPMP(usesrc_ill) || IS_UNDER_IPMP(usesrc_ill)) { 16341 err = ENOTSUP; 16342 goto done; 16343 } 16344 16345 /* 16346 * USESRC isn't compatible with the STANDBY flag. (STANDBY is only 16347 * used by IPMP underlying interfaces, but someone might think it's 16348 * more general and try to use it independently with VNI.) 16349 */ 16350 if (usesrc_ill->ill_phyint->phyint_flags & PHYI_STANDBY) { 16351 err = ENOTSUP; 16352 goto done; 16353 } 16354 16355 /* 16356 * If the client is already in use as a usesrc_ill or a usesrc_ill is 16357 * already a client then return EINVAL 16358 */ 16359 if (IS_USESRC_ILL(usesrc_cli_ill) || IS_USESRC_CLI_ILL(usesrc_ill)) { 16360 err = EINVAL; 16361 goto done; 16362 } 16363 16364 /* 16365 * If the ill_usesrc_ifindex field is already set to what it needs to 16366 * be then this is a duplicate operation. 16367 */ 16368 if (!reset_flg && usesrc_cli_ill->ill_usesrc_ifindex == ifindex) { 16369 err = 0; 16370 goto done; 16371 } 16372 16373 ip1dbg(("ip_sioctl_slifusesrc: usesrc_cli_ill %s, usesrc_ill %s," 16374 " v6 = %d", usesrc_cli_ill->ill_name, usesrc_ill->ill_name, 16375 usesrc_ill->ill_isv6)); 16376 16377 /* 16378 * ill_g_usesrc_lock global lock protects the ill_usesrc_grp_next 16379 * and the ill_usesrc_ifindex fields 16380 */ 16381 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER); 16382 16383 if (reset_flg) { 16384 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 0); 16385 if (ret != 0) { 16386 err = EINVAL; 16387 } 16388 rw_exit(&ipst->ips_ill_g_usesrc_lock); 16389 goto done; 16390 } 16391 16392 /* 16393 * Four possibilities to consider: 16394 * 1. Both usesrc_ill and usesrc_cli_ill are not part of any usesrc grp 16395 * 2. usesrc_ill is part of a group but usesrc_cli_ill isn't 16396 * 3. usesrc_cli_ill is part of a group but usesrc_ill isn't 16397 * 4. 
Both are part of their respective usesrc groups 16398 */ 16399 if ((usesrc_ill->ill_usesrc_grp_next == NULL) && 16400 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) { 16401 ASSERT(usesrc_ill->ill_usesrc_ifindex == 0); 16402 usesrc_cli_ill->ill_usesrc_ifindex = ifindex; 16403 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill; 16404 usesrc_cli_ill->ill_usesrc_grp_next = usesrc_ill; 16405 } else if ((usesrc_ill->ill_usesrc_grp_next != NULL) && 16406 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) { 16407 usesrc_cli_ill->ill_usesrc_ifindex = ifindex; 16408 /* Insert at head of list */ 16409 usesrc_cli_ill->ill_usesrc_grp_next = 16410 usesrc_ill->ill_usesrc_grp_next; 16411 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill; 16412 } else { 16413 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 16414 ifindex); 16415 if (ret != 0) 16416 err = EINVAL; 16417 } 16418 rw_exit(&ipst->ips_ill_g_usesrc_lock); 16419 16420 done: 16421 if (ipsq != NULL) 16422 ipsq_exit(ipsq); 16423 /* The refrele on the lifr_name ipif is done by ip_process_ioctl */ 16424 ill_refrele(usesrc_ill); 16425 16426 /* Let conn_ixa caching know that source address selection changed */ 16427 ip_update_source_selection(ipst); 16428 16429 return (err); 16430 } 16431 16432 /* ARGSUSED */ 16433 int 16434 ip_sioctl_get_dadstate(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 16435 ip_ioctl_cmd_t *ipip, void *if_req) 16436 { 16437 struct lifreq *lifr = (struct lifreq *)if_req; 16438 ill_t *ill = ipif->ipif_ill; 16439 16440 /* 16441 * Need a lock since IFF_UP can be set even when there are 16442 * references to the ipif. 16443 */ 16444 mutex_enter(&ill->ill_lock); 16445 if ((ipif->ipif_flags & IPIF_UP) && ipif->ipif_addr_ready == 0) 16446 lifr->lifr_dadstate = DAD_IN_PROGRESS; 16447 else 16448 lifr->lifr_dadstate = DAD_DONE; 16449 mutex_exit(&ill->ill_lock); 16450 return (0); 16451 } 16452 16453 /* 16454 * comparison function used by avl. 16455 */ 16456 static int 16457 ill_phyint_compare_index(const void *index_ptr, const void *phyip) 16458 { 16459 16460 uint_t index; 16461 16462 ASSERT(phyip != NULL && index_ptr != NULL); 16463 16464 index = *((uint_t *)index_ptr); 16465 /* 16466 * let the phyint with the lowest index be on top. 16467 */ 16468 if (((phyint_t *)phyip)->phyint_ifindex < index) 16469 return (1); 16470 if (((phyint_t *)phyip)->phyint_ifindex > index) 16471 return (-1); 16472 return (0); 16473 } 16474 16475 /* 16476 * comparison function used by avl. 16477 */ 16478 static int 16479 ill_phyint_compare_name(const void *name_ptr, const void *phyip) 16480 { 16481 ill_t *ill; 16482 int res = 0; 16483 16484 ASSERT(phyip != NULL && name_ptr != NULL); 16485 16486 if (((phyint_t *)phyip)->phyint_illv4) 16487 ill = ((phyint_t *)phyip)->phyint_illv4; 16488 else 16489 ill = ((phyint_t *)phyip)->phyint_illv6; 16490 ASSERT(ill != NULL); 16491 16492 res = strcmp(ill->ill_name, (char *)name_ptr); 16493 if (res > 0) 16494 return (1); 16495 else if (res < 0) 16496 return (-1); 16497 return (0); 16498 } 16499 16500 /* 16501 * This function is called on the unplumb path via ill_glist_delete() when 16502 * there are no ills left on the phyint and thus the phyint can be freed. 16503 */ 16504 static void 16505 phyint_free(phyint_t *phyi) 16506 { 16507 ip_stack_t *ipst = PHYINT_TO_IPST(phyi); 16508 16509 ASSERT(phyi->phyint_illv4 == NULL && phyi->phyint_illv6 == NULL); 16510 16511 /* 16512 * If this phyint was an IPMP meta-interface, blow away the group. 
16513 * This is safe to do because all of the illgrps have already been 16514 * removed by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find us. 16515 * If we're cleaning up as a result of failed initialization, 16516 * phyint_grp may be NULL. 16517 */ 16518 if ((phyi->phyint_flags & PHYI_IPMP) && (phyi->phyint_grp != NULL)) { 16519 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 16520 ipmp_grp_destroy(phyi->phyint_grp); 16521 phyi->phyint_grp = NULL; 16522 rw_exit(&ipst->ips_ipmp_lock); 16523 } 16524 16525 /* 16526 * If this interface was under IPMP, take it out of the group. 16527 */ 16528 if (phyi->phyint_grp != NULL) 16529 ipmp_phyint_leave_grp(phyi); 16530 16531 /* 16532 * Delete the phyint and disassociate its ipsq. The ipsq itself 16533 * will be freed in ipsq_exit(). 16534 */ 16535 phyi->phyint_ipsq->ipsq_phyint = NULL; 16536 phyi->phyint_name[0] = '\0'; 16537 16538 mi_free(phyi); 16539 } 16540 16541 /* 16542 * Attach the ill to the phyint structure which can be shared by both 16543 * IPv4 and IPv6 ill. ill_init allocates a phyint to just hold flags. This 16544 * function is called from ipif_set_values and ill_lookup_on_name (for 16545 * loopback) where we know the name of the ill. We lookup the ill and if 16546 * there is one present already with the name use that phyint. Otherwise 16547 * reuse the one allocated by ill_init. 16548 */ 16549 static void 16550 ill_phyint_reinit(ill_t *ill) 16551 { 16552 boolean_t isv6 = ill->ill_isv6; 16553 phyint_t *phyi_old; 16554 phyint_t *phyi; 16555 avl_index_t where = 0; 16556 ill_t *ill_other = NULL; 16557 ip_stack_t *ipst = ill->ill_ipst; 16558 16559 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 16560 16561 phyi_old = ill->ill_phyint; 16562 ASSERT(isv6 || (phyi_old->phyint_illv4 == ill && 16563 phyi_old->phyint_illv6 == NULL)); 16564 ASSERT(!isv6 || (phyi_old->phyint_illv6 == ill && 16565 phyi_old->phyint_illv4 == NULL)); 16566 ASSERT(phyi_old->phyint_ifindex == 0); 16567 16568 /* 16569 * Now that our ill has a name, set it in the phyint. 16570 */ 16571 (void) strlcpy(ill->ill_phyint->phyint_name, ill->ill_name, LIFNAMSIZ); 16572 16573 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 16574 ill->ill_name, &where); 16575 16576 /* 16577 * 1. We grabbed the ill_g_lock before inserting this ill into 16578 * the global list of ills. So no other thread could have located 16579 * this ill and hence the ipsq of this ill is guaranteed to be empty. 16580 * 2. Now locate the other protocol instance of this ill. 16581 * 3. Now grab both ill locks in the right order, and the phyint lock of 16582 * the new ipsq. Holding ill locks + ill_g_lock ensures that the ipsq 16583 * of neither ill can change. 16584 * 4. Merge the phyint and thus the ipsq as well of this ill onto the 16585 * other ill. 16586 * 5. Release all locks. 16587 */ 16588 16589 /* 16590 * Look for IPv4 if we are initializing IPv6 or look for IPv6 if 16591 * we are initializing IPv4. 16592 */ 16593 if (phyi != NULL) { 16594 ill_other = (isv6) ? phyi->phyint_illv4 : phyi->phyint_illv6; 16595 ASSERT(ill_other->ill_phyint != NULL); 16596 ASSERT((isv6 && !ill_other->ill_isv6) || 16597 (!isv6 && ill_other->ill_isv6)); 16598 GRAB_ILL_LOCKS(ill, ill_other); 16599 /* 16600 * We are potentially throwing away phyint_flags which 16601 * could be different from the one that we obtain from 16602 * ill_other->ill_phyint. But it is okay as we are assuming 16603 * that the state maintained within IP is correct. 
16604          */
16605         mutex_enter(&phyi->phyint_lock);
16606         if (isv6) {
16607             ASSERT(phyi->phyint_illv6 == NULL);
16608             phyi->phyint_illv6 = ill;
16609         } else {
16610             ASSERT(phyi->phyint_illv4 == NULL);
16611             phyi->phyint_illv4 = ill;
16612         }
16613 
16614         /*
16615          * Delete the old phyint and make its ipsq eligible
16616          * to be freed in ipsq_exit().
16617          */
16618         phyi_old->phyint_illv4 = NULL;
16619         phyi_old->phyint_illv6 = NULL;
16620         phyi_old->phyint_ipsq->ipsq_phyint = NULL;
16621         phyi_old->phyint_name[0] = '\0';
16622         mi_free(phyi_old);
16623     } else {
16624         mutex_enter(&ill->ill_lock);
16625         /*
16626          * We don't need to acquire any lock, since
16627          * the ill is not yet visible globally and we
16628          * have not yet released the ill_g_lock.
16629          */
16630         phyi = phyi_old;
16631         mutex_enter(&phyi->phyint_lock);
16632         /* XXX We need a recovery strategy here. */
16633         if (!phyint_assign_ifindex(phyi, ipst))
16634             cmn_err(CE_PANIC, "phyint_assign_ifindex() failed");
16635 
16636         avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
16637             (void *)phyi, where);
16638 
16639         (void) avl_find(&ipst->ips_phyint_g_list->
16640             phyint_list_avl_by_index,
16641             &phyi->phyint_ifindex, &where);
16642         avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
16643             (void *)phyi, where);
16644     }
16645 
16646     /*
16647      * Reassigning ill_phyint automatically reassigns the ipsq also.
16648      * The pending mp is not affected because that is kept on a per-ill
16649      * basis.
16650      */
16650     ill->ill_phyint = phyi;
16651 
16652     /*
16653      * Now that the phyint's ifindex has been assigned, complete the
16654      * remaining initialization that depends on it.
16655      */
16656     ill->ill_ip_mib->ipIfStatsIfIndex = ill->ill_phyint->phyint_ifindex;
16657     if (ill->ill_isv6) {
16658         ill->ill_icmp6_mib->ipv6IfIcmpIfIndex =
16659             ill->ill_phyint->phyint_ifindex;
16660         ill->ill_mcast_type = ipst->ips_mld_max_version;
16661     } else {
16662         ill->ill_mcast_type = ipst->ips_igmp_max_version;
16663     }
16664 
16665     /*
16666      * Generate an event within the hooks framework to indicate that
16667      * a new interface has just been added to IP. For this event to
16668      * be generated, the network interface must, at least, have an
16669      * ifindex assigned to it. (We don't generate the event for
16670      * loopback since ill_lookup_on_name() has its own NE_PLUMB event.)
16671      *
16672      * This needs to be run inside the ill_g_lock perimeter to ensure
16673      * that the ordering of delivered events to listeners matches the
16674      * order of them in the kernel.
16675      */
16676     if (!IS_LOOPBACK(ill)) {
16677         ill_nic_event_dispatch(ill, 0, NE_PLUMB, ill->ill_name,
16678             ill->ill_name_length);
16679     }
16680     RELEASE_ILL_LOCKS(ill, ill_other);
16681     mutex_exit(&phyi->phyint_lock);
16682 }
16683 
16684 /*
16685  * Notify any downstream modules of the name of this interface.
16686  * An M_IOCTL is used even though we don't expect a successful reply.
16687  * Any reply message from the driver (presumably an M_IOCNAK) will
16688  * eventually get discarded somewhere upstream. The message format is
16689  * simply an SIOCSLIFNAME ioctl just as might be sent from ifconfig
16690  * to IP.
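 * (Editorial note: the message is built with mkiocb() and sent with
 * putnext(), so only modules below IP on this stream will see it.)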
16691 */
16692 static void
16693 ip_ifname_notify(ill_t *ill, queue_t *q)
16694 {
16695 mblk_t *mp1, *mp2;
16696 struct iocblk *iocp;
16697 struct lifreq *lifr;
16698 
16699 mp1 = mkiocb(SIOCSLIFNAME);
16700 if (mp1 == NULL)
16701 return;
16702 mp2 = allocb(sizeof (struct lifreq), BPRI_HI);
16703 if (mp2 == NULL) {
16704 freeb(mp1);
16705 return;
16706 }
16707 
16708 mp1->b_cont = mp2;
16709 iocp = (struct iocblk *)mp1->b_rptr;
16710 iocp->ioc_count = sizeof (struct lifreq);
16711 
16712 lifr = (struct lifreq *)mp2->b_rptr;
16713 mp2->b_wptr += sizeof (struct lifreq);
16714 bzero(lifr, sizeof (struct lifreq));
16715 
16716 (void) strncpy(lifr->lifr_name, ill->ill_name, LIFNAMSIZ);
16717 lifr->lifr_ppa = ill->ill_ppa;
16718 lifr->lifr_flags = (ill->ill_flags & (ILLF_IPV4|ILLF_IPV6));
16719 
16720 DTRACE_PROBE3(ill__dlpi, char *, "ip_ifname_notify",
16721 char *, "SIOCSLIFNAME", ill_t *, ill);
16722 putnext(q, mp1);
16723 }
16724 
16725 static int
16726 ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
16727 {
16728 int err;
16729 ip_stack_t *ipst = ill->ill_ipst;
16730 phyint_t *phyi = ill->ill_phyint;
16731 
16732 /*
16733 * Now that ill_name is set, the configuration for the IPMP
16734 * meta-interface can be performed.
16735 */
16736 if (IS_IPMP(ill)) {
16737 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
16738 /*
16739 * If phyi->phyint_grp is NULL, then this is the first IPMP
16740 * meta-interface and we need to create the IPMP group.
16741 */
16742 if (phyi->phyint_grp == NULL) {
16743 /*
16744 * If someone has renamed another IPMP group to have
16745 * the same name as our interface, bail.
16746 */
16747 if (ipmp_grp_lookup(ill->ill_name, ipst) != NULL) {
16748 rw_exit(&ipst->ips_ipmp_lock);
16749 return (EEXIST);
16750 }
16751 phyi->phyint_grp = ipmp_grp_create(ill->ill_name, phyi);
16752 if (phyi->phyint_grp == NULL) {
16753 rw_exit(&ipst->ips_ipmp_lock);
16754 return (ENOMEM);
16755 }
16756 }
16757 rw_exit(&ipst->ips_ipmp_lock);
16758 }
16759 
16760 /* Tell downstream modules where they are. */
16761 ip_ifname_notify(ill, q);
16762 
16763 /*
16764 * ill_dl_phys returns EINPROGRESS in the usual case.
16765 * Error cases are ENOMEM ...
16766 */
16767 err = ill_dl_phys(ill, ipif, mp, q);
16768 
16769 if (ill->ill_isv6) {
16770 mutex_enter(&ipst->ips_mld_slowtimeout_lock);
16771 if (ipst->ips_mld_slowtimeout_id == 0) {
16772 ipst->ips_mld_slowtimeout_id = timeout(mld_slowtimo,
16773 (void *)ipst,
16774 MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
16775 }
16776 mutex_exit(&ipst->ips_mld_slowtimeout_lock);
16777 } else {
16778 mutex_enter(&ipst->ips_igmp_slowtimeout_lock);
16779 if (ipst->ips_igmp_slowtimeout_id == 0) {
16780 ipst->ips_igmp_slowtimeout_id = timeout(igmp_slowtimo,
16781 (void *)ipst,
16782 MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
16783 }
16784 mutex_exit(&ipst->ips_igmp_slowtimeout_lock);
16785 }
16786 
16787 return (err);
16788 }
16789 
16790 /*
16791 * Common routine for ppa and ifname setting. Should be called exclusively.
16792 *
16793 * Returns EINPROGRESS when mp has been consumed by queueing it on
16794 * ipx_pending_mp and the ioctl will complete in ip_rput.
16795 *
16796 * NOTE : If ppa is UINT_MAX, we assign the next valid ppa and return
16797 * the new name and new ppa in lifr_name and lifr_ppa respectively.
16798 * For SIOCSLIFNAME, we pass these values back to userland.
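*
* For example (illustrative): plumbing "bge2" with *new_ppa_ptr == 2 must
* match the 2 embedded in the name or EINVAL is returned, whereas plumbing
* with *new_ppa_ptr == UINT_MAX lets the system assign the next valid ppa
* and copy it back through interf_name and new_ppa_ptr.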
16799 */
16800 static int
16801 ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr)
16802 {
16803 ill_t *ill;
16804 ipif_t *ipif;
16805 ipsq_t *ipsq;
16806 char *ppa_ptr;
16807 char *old_ptr;
16808 char old_char;
16809 int error;
16810 ip_stack_t *ipst;
16811 
16812 ip1dbg(("ipif_set_values: interface %s\n", interf_name));
16813 ASSERT(q->q_next != NULL);
16814 ASSERT(interf_name != NULL);
16815 
16816 ill = (ill_t *)q->q_ptr;
16817 ipst = ill->ill_ipst;
16818 
16819 ASSERT(ill->ill_ipst != NULL);
16820 ASSERT(ill->ill_name[0] == '\0');
16821 ASSERT(IAM_WRITER_ILL(ill));
16822 ASSERT((mi_strlen(interf_name) + 1) <= LIFNAMSIZ);
16823 ASSERT(ill->ill_ppa == UINT_MAX);
16824 
16825 ill->ill_defend_start = ill->ill_defend_count = 0;
16826 /* The ppa is sent down by ifconfig or is chosen */
16827 if ((ppa_ptr = ill_get_ppa_ptr(interf_name)) == NULL) {
16828 return (EINVAL);
16829 }
16830 
16831 /*
16832 * Make sure the ppa passed in is the same as the ppa in the name.
16833 * This check is not made when ppa == UINT_MAX; in that case the ppa
16834 * in the name could be anything. The system will choose a ppa and
16835 * update new_ppa_ptr and interf_name to contain the chosen ppa.
16836 */
16837 if (*new_ppa_ptr != UINT_MAX) {
16838 /* stoi changes the pointer */
16839 old_ptr = ppa_ptr;
16840 /*
16841 * ifconfig passed in 0 for the ppa for DLPI 1 style devices
16842 * (they don't have an externally visible ppa). We assign one
16843 * here so that we can manage the interface. Note that in
16844 * the past this value was always 0 for DLPI 1 drivers.
16845 */
16846 if (*new_ppa_ptr == 0)
16847 *new_ppa_ptr = stoi(&old_ptr);
16848 else if (*new_ppa_ptr != (uint_t)stoi(&old_ptr))
16849 return (EINVAL);
16850 }
16851 /*
16852 * Terminate the string before the ppa and
16853 * save the character at that location.
16854 */
16855 old_char = ppa_ptr[0];
16856 ppa_ptr[0] = '\0';
16857 
16858 ill->ill_ppa = *new_ppa_ptr;
16859 /*
16860 * Finish as much work now as possible before calling ill_glist_insert
16861 * which makes the ill globally visible and also merges it with the
16862 * other protocol instance of this phyint. The remaining work is
16863 * done after entering the ipsq which may happen sometime later.
16864 */
16865 ipif = ill->ill_ipif;
16866 
16867 /* We didn't do this when we allocated ipif in ip_ll_subnet_defaults */
16868 ipif_assign_seqid(ipif);
16869 
16870 if (!(ill->ill_flags & (ILLF_IPV4|ILLF_IPV6)))
16871 ill->ill_flags |= ILLF_IPV4;
16872 
16873 ASSERT(ipif->ipif_next == NULL); /* Only one ipif on ill */
16874 ASSERT((ipif->ipif_flags & IPIF_UP) == 0);
16875 
16876 if (ill->ill_flags & ILLF_IPV6) {
16877 
16878 ill->ill_isv6 = B_TRUE;
16879 ill_set_inputfn(ill);
16880 if (ill->ill_rq != NULL) {
16881 ill->ill_rq->q_qinfo = &iprinitv6;
16882 }
16883 
16884 /* Keep the !IN6_IS_ADDR_V4MAPPED assertions happy */
16885 ipif->ipif_v6lcl_addr = ipv6_all_zeros;
16886 ipif->ipif_v6subnet = ipv6_all_zeros;
16887 ipif->ipif_v6net_mask = ipv6_all_zeros;
16888 ipif->ipif_v6brd_addr = ipv6_all_zeros;
16889 ipif->ipif_v6pp_dst_addr = ipv6_all_zeros;
16890 ill->ill_reachable_retrans_time = ND_RETRANS_TIMER;
16891 /*
16892 * Point-to-point or non-multicast capable
16893 * interfaces won't do NUD unless explicitly
16894 * configured to do so.
16895 */
16896 if (ipif->ipif_flags & IPIF_POINTOPOINT ||
16897 !(ill->ill_flags & ILLF_MULTICAST)) {
16898 ill->ill_flags |= ILLF_NONUD;
16899 }
16900 /* Make sure IPv4 specific flag is not set on IPv6 if */
16901 if (ill->ill_flags & ILLF_NOARP) {
16902 /*
16903 * Note: xresolv interfaces will eventually need
16904 * NOARP set here as well, but that will require
16905 * those external resolvers to have some
16906 * knowledge of that flag and act appropriately.
16907 * Not to be changed at present.
16908 */
16909 ill->ill_flags &= ~ILLF_NOARP;
16910 }
16911 /*
16912 * Set the ILLF_ROUTER flag according to the global
16913 * IPv6 forwarding policy.
16914 */
16915 if (ipst->ips_ipv6_forwarding != 0)
16916 ill->ill_flags |= ILLF_ROUTER;
16917 } else if (ill->ill_flags & ILLF_IPV4) {
16918 ill->ill_isv6 = B_FALSE;
16919 ill_set_inputfn(ill);
16920 ill->ill_reachable_retrans_time = ARP_RETRANS_TIMER;
16921 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6lcl_addr);
16922 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6subnet);
16923 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6net_mask);
16924 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6brd_addr);
16925 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6pp_dst_addr);
16926 /*
16927 * Set the ILLF_ROUTER flag according to the global
16928 * IPv4 forwarding policy.
16929 */
16930 if (ipst->ips_ip_forwarding != 0)
16931 ill->ill_flags |= ILLF_ROUTER;
16932 }
16933 
16934 ASSERT(ill->ill_phyint != NULL);
16935 
16936 /*
16937 * The ipIfStatsIfindex and ipv6IfIcmpIfIndex assignments will
16938 * be completed in ill_glist_insert -> ill_phyint_reinit
16939 */
16940 if (!ill_allocate_mibs(ill))
16941 return (ENOMEM);
16942 
16943 /*
16944 * Pick a default sap until we get the DL_INFO_ACK back from
16945 * the driver.
16946 */
16947 ill->ill_sap = (ill->ill_isv6) ? ill->ill_media->ip_m_ipv6sap :
16948 ill->ill_media->ip_m_ipv4sap;
16949 
16950 ill->ill_ifname_pending = 1;
16951 ill->ill_ifname_pending_err = 0;
16952 
16953 /*
16954 * When the first ipif comes up in ipif_up_done(), multicast groups
16955 * that were joined while this ill was not bound to the DLPI link need
16956 * to be recovered by ill_recover_multicast().
16957 */
16958 ill->ill_need_recover_multicast = 1;
16959 
16960 ill_refhold(ill);
16961 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
16962 if ((error = ill_glist_insert(ill, interf_name,
16963 (ill->ill_flags & ILLF_IPV6) == ILLF_IPV6)) > 0) {
16964 ill->ill_ppa = UINT_MAX;
16965 ill->ill_name[0] = '\0';
16966 /*
16967 * Undo the null termination done above.
16968 */
16969 ppa_ptr[0] = old_char;
16970 rw_exit(&ipst->ips_ill_g_lock);
16971 ill_refrele(ill);
16972 return (error);
16973 }
16974 
16975 ASSERT(ill->ill_name_length <= LIFNAMSIZ);
16976 
16977 /*
16978 * When we return, the buffer pointed to by interf_name should contain
16979 * the same name as in ill_name.
16980 * If a ppa was chosen by the system (the ppa passed in was UINT_MAX),
16981 * the buffer pointed to by new_ppa_ptr would not contain the right ppa,
16982 * so copy the full name and update the ppa ptr.
16983 * When the ppa passed in != UINT_MAX, all values are correct; just undo
16984 * the null termination. This saves a bcopy.
16985 */
16986 if (*new_ppa_ptr == UINT_MAX) {
16987 bcopy(ill->ill_name, interf_name, ill->ill_name_length);
16988 *new_ppa_ptr = ill->ill_ppa;
16989 } else {
16990 /*
16991 * Undo the null termination done above.
16992 */ 16993 ppa_ptr[0] = old_char; 16994 } 16995 16996 /* Let SCTP know about this ILL */ 16997 sctp_update_ill(ill, SCTP_ILL_INSERT); 16998 16999 /* 17000 * ill_glist_insert has made the ill visible globally, and 17001 * ill_phyint_reinit could have changed the ipsq. At this point, 17002 * we need to hold the ips_ill_g_lock across the call to enter the 17003 * ipsq to enforce atomicity and prevent reordering. In the event 17004 * the ipsq has changed, and if the new ipsq is currently busy, 17005 * we need to make sure that this half-completed ioctl is ahead of 17006 * any subsequent ioctl. We achieve this by not dropping the 17007 * ips_ill_g_lock which prevents any ill lookup itself thereby 17008 * ensuring that new ioctls can't start. 17009 */ 17010 ipsq = ipsq_try_enter_internal(ill, q, mp, ip_reprocess_ioctl, NEW_OP, 17011 B_TRUE); 17012 17013 rw_exit(&ipst->ips_ill_g_lock); 17014 ill_refrele(ill); 17015 if (ipsq == NULL) 17016 return (EINPROGRESS); 17017 17018 /* 17019 * If ill_phyint_reinit() changed our ipsq, then start on the new ipsq. 17020 */ 17021 if (ipsq->ipsq_xop->ipx_current_ipif == NULL) 17022 ipsq_current_start(ipsq, ipif, SIOCSLIFNAME); 17023 else 17024 ASSERT(ipsq->ipsq_xop->ipx_current_ipif == ipif); 17025 17026 error = ipif_set_values_tail(ill, ipif, mp, q); 17027 ipsq_exit(ipsq); 17028 if (error != 0 && error != EINPROGRESS) { 17029 /* 17030 * restore previous values 17031 */ 17032 ill->ill_isv6 = B_FALSE; 17033 ill_set_inputfn(ill); 17034 } 17035 return (error); 17036 } 17037 17038 void 17039 ipif_init(ip_stack_t *ipst) 17040 { 17041 int i; 17042 17043 for (i = 0; i < MAX_G_HEADS; i++) { 17044 ipst->ips_ill_g_heads[i].ill_g_list_head = 17045 (ill_if_t *)&ipst->ips_ill_g_heads[i]; 17046 ipst->ips_ill_g_heads[i].ill_g_list_tail = 17047 (ill_if_t *)&ipst->ips_ill_g_heads[i]; 17048 } 17049 17050 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 17051 ill_phyint_compare_index, 17052 sizeof (phyint_t), 17053 offsetof(struct phyint, phyint_avl_by_index)); 17054 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 17055 ill_phyint_compare_name, 17056 sizeof (phyint_t), 17057 offsetof(struct phyint, phyint_avl_by_name)); 17058 } 17059 17060 /* 17061 * Save enough information so that we can recreate the IRE if 17062 * the interface goes down and then up. 
17063 */ 17064 void 17065 ill_save_ire(ill_t *ill, ire_t *ire) 17066 { 17067 mblk_t *save_mp; 17068 17069 save_mp = allocb(sizeof (ifrt_t), BPRI_MED); 17070 if (save_mp != NULL) { 17071 ifrt_t *ifrt; 17072 17073 save_mp->b_wptr += sizeof (ifrt_t); 17074 ifrt = (ifrt_t *)save_mp->b_rptr; 17075 bzero(ifrt, sizeof (ifrt_t)); 17076 ifrt->ifrt_type = ire->ire_type; 17077 if (ire->ire_ipversion == IPV4_VERSION) { 17078 ASSERT(!ill->ill_isv6); 17079 ifrt->ifrt_addr = ire->ire_addr; 17080 ifrt->ifrt_gateway_addr = ire->ire_gateway_addr; 17081 ifrt->ifrt_setsrc_addr = ire->ire_setsrc_addr; 17082 ifrt->ifrt_mask = ire->ire_mask; 17083 } else { 17084 ASSERT(ill->ill_isv6); 17085 ifrt->ifrt_v6addr = ire->ire_addr_v6; 17086 /* ire_gateway_addr_v6 can change due to RTM_CHANGE */ 17087 mutex_enter(&ire->ire_lock); 17088 ifrt->ifrt_v6gateway_addr = ire->ire_gateway_addr_v6; 17089 mutex_exit(&ire->ire_lock); 17090 ifrt->ifrt_v6setsrc_addr = ire->ire_setsrc_addr_v6; 17091 ifrt->ifrt_v6mask = ire->ire_mask_v6; 17092 } 17093 ifrt->ifrt_flags = ire->ire_flags; 17094 ifrt->ifrt_zoneid = ire->ire_zoneid; 17095 mutex_enter(&ill->ill_saved_ire_lock); 17096 save_mp->b_cont = ill->ill_saved_ire_mp; 17097 ill->ill_saved_ire_mp = save_mp; 17098 ill->ill_saved_ire_cnt++; 17099 mutex_exit(&ill->ill_saved_ire_lock); 17100 } 17101 } 17102 17103 /* 17104 * Remove one entry from ill_saved_ire_mp. 17105 */ 17106 void 17107 ill_remove_saved_ire(ill_t *ill, ire_t *ire) 17108 { 17109 mblk_t **mpp; 17110 mblk_t *mp; 17111 ifrt_t *ifrt; 17112 17113 /* Remove from ill_saved_ire_mp list if it is there */ 17114 mutex_enter(&ill->ill_saved_ire_lock); 17115 for (mpp = &ill->ill_saved_ire_mp; *mpp != NULL; 17116 mpp = &(*mpp)->b_cont) { 17117 in6_addr_t gw_addr_v6; 17118 17119 /* 17120 * On a given ill, the tuple of address, gateway, mask, 17121 * ire_type, and zoneid is unique for each saved IRE. 17122 */ 17123 mp = *mpp; 17124 ifrt = (ifrt_t *)mp->b_rptr; 17125 /* ire_gateway_addr_v6 can change - need lock */ 17126 mutex_enter(&ire->ire_lock); 17127 gw_addr_v6 = ire->ire_gateway_addr_v6; 17128 mutex_exit(&ire->ire_lock); 17129 17130 if (ifrt->ifrt_zoneid != ire->ire_zoneid || 17131 ifrt->ifrt_type != ire->ire_type) 17132 continue; 17133 17134 if (ill->ill_isv6 ? 17135 (IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6addr, 17136 &ire->ire_addr_v6) && 17137 IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6gateway_addr, 17138 &gw_addr_v6) && 17139 IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6mask, 17140 &ire->ire_mask_v6)) : 17141 (ifrt->ifrt_addr == ire->ire_addr && 17142 ifrt->ifrt_gateway_addr == ire->ire_gateway_addr && 17143 ifrt->ifrt_mask == ire->ire_mask)) { 17144 *mpp = mp->b_cont; 17145 ill->ill_saved_ire_cnt--; 17146 freeb(mp); 17147 break; 17148 } 17149 } 17150 mutex_exit(&ill->ill_saved_ire_lock); 17151 } 17152 17153 /* 17154 * IP multirouting broadcast routes handling 17155 * Append CGTP broadcast IREs to regular ones created 17156 * at ifconfig time. 17157 * The usage is a route add <cgtp_bc> <nic_bc> -multirt i.e., both 17158 * the destination and the gateway are broadcast addresses. 17159 * The caller has verified that the destination is an IRE_BROADCAST and that 17160 * RTF_MULTIRT was set. Here if the gateway is a broadcast address, then 17161 * we create a MULTIRT IRE_BROADCAST. 17162 * Note that the IRE_HOST created by ire_rt_add doesn't get found by anything 17163 * since the IRE_BROADCAST takes precedence; ire_add_v4 does head insertion. 
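*
* For example (hypothetical addresses), such a route might be added with:
*
*	# route add 10.1.1.255 10.1.2.255 -multirt
*
* where both the destination (the CGTP broadcast address) and the gateway
* (the NIC's broadcast address) are broadcast addresses.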
17164 */ 17165 static void 17166 ip_cgtp_bcast_add(ire_t *ire, ip_stack_t *ipst) 17167 { 17168 ire_t *ire_prim; 17169 17170 ASSERT(ire != NULL); 17171 17172 ire_prim = ire_ftable_lookup_v4(ire->ire_gateway_addr, 0, 0, 17173 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst, 17174 NULL); 17175 if (ire_prim != NULL) { 17176 /* 17177 * We are in the special case of broadcasts for 17178 * CGTP. We add an IRE_BROADCAST that holds 17179 * the RTF_MULTIRT flag, the destination 17180 * address and the low level 17181 * info of ire_prim. In other words, CGTP 17182 * broadcast is added to the redundant ipif. 17183 */ 17184 ill_t *ill_prim; 17185 ire_t *bcast_ire; 17186 17187 ill_prim = ire_prim->ire_ill; 17188 17189 ip2dbg(("ip_cgtp_filter_bcast_add: ire_prim %p, ill_prim %p\n", 17190 (void *)ire_prim, (void *)ill_prim)); 17191 17192 bcast_ire = ire_create( 17193 (uchar_t *)&ire->ire_addr, 17194 (uchar_t *)&ip_g_all_ones, 17195 (uchar_t *)&ire->ire_gateway_addr, 17196 IRE_BROADCAST, 17197 ill_prim, 17198 GLOBAL_ZONEID, /* CGTP is only for the global zone */ 17199 ire->ire_flags | RTF_KERNEL, 17200 NULL, 17201 ipst); 17202 17203 /* 17204 * Here we assume that ire_add does head insertion so that 17205 * the added IRE_BROADCAST comes before the existing IRE_HOST. 17206 */ 17207 if (bcast_ire != NULL) { 17208 if (ire->ire_flags & RTF_SETSRC) { 17209 bcast_ire->ire_setsrc_addr = 17210 ire->ire_setsrc_addr; 17211 } 17212 bcast_ire = ire_add(bcast_ire); 17213 if (bcast_ire != NULL) { 17214 ip2dbg(("ip_cgtp_filter_bcast_add: " 17215 "added bcast_ire %p\n", 17216 (void *)bcast_ire)); 17217 17218 ill_save_ire(ill_prim, bcast_ire); 17219 ire_refrele(bcast_ire); 17220 } 17221 } 17222 ire_refrele(ire_prim); 17223 } 17224 } 17225 17226 /* 17227 * IP multirouting broadcast routes handling 17228 * Remove the broadcast ire. 17229 * The usage is a route delete <cgtp_bc> <nic_bc> -multirt i.e., both 17230 * the destination and the gateway are broadcast addresses. 17231 * The caller has only verified that RTF_MULTIRT was set. We check 17232 * that the destination is broadcast and that the gateway is a broadcast 17233 * address, and if so delete the IRE added by ip_cgtp_bcast_add(). 17234 */ 17235 static void 17236 ip_cgtp_bcast_delete(ire_t *ire, ip_stack_t *ipst) 17237 { 17238 ASSERT(ire != NULL); 17239 17240 if (ip_type_v4(ire->ire_addr, ipst) == IRE_BROADCAST) { 17241 ire_t *ire_prim; 17242 17243 ire_prim = ire_ftable_lookup_v4(ire->ire_gateway_addr, 0, 0, 17244 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, 17245 ipst, NULL); 17246 if (ire_prim != NULL) { 17247 ill_t *ill_prim; 17248 ire_t *bcast_ire; 17249 17250 ill_prim = ire_prim->ire_ill; 17251 17252 ip2dbg(("ip_cgtp_filter_bcast_delete: " 17253 "ire_prim %p, ill_prim %p\n", 17254 (void *)ire_prim, (void *)ill_prim)); 17255 17256 bcast_ire = ire_ftable_lookup_v4(ire->ire_addr, 0, 17257 ire->ire_gateway_addr, IRE_BROADCAST, 17258 ill_prim, ALL_ZONES, NULL, 17259 MATCH_IRE_TYPE | MATCH_IRE_GW | MATCH_IRE_ILL | 17260 MATCH_IRE_MASK, 0, ipst, NULL); 17261 17262 if (bcast_ire != NULL) { 17263 ip2dbg(("ip_cgtp_filter_bcast_delete: " 17264 "looked up bcast_ire %p\n", 17265 (void *)bcast_ire)); 17266 ill_remove_saved_ire(bcast_ire->ire_ill, 17267 bcast_ire); 17268 ire_delete(bcast_ire); 17269 ire_refrele(bcast_ire); 17270 } 17271 ire_refrele(ire_prim); 17272 } 17273 } 17274 } 17275 17276 /* 17277 * Derive an interface id from the link layer address. 17278 * Knows about IEEE 802 and IEEE EUI-64 mappings. 
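*
* For example (illustrative MAC address), 00:11:22:33:44:55 yields the
* modified EUI-64 interface id 02:11:22:ff:fe:33:44:55 (i.e. the low
* 64 bits 0211:22ff:fe33:4455): the universal/local bit of the first
* octet is toggled, and 0xff 0xfe is inserted between the OUI and the
* serial portion of the address.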
17279 */ 17280 static void 17281 ip_ether_v6intfid(ill_t *ill, in6_addr_t *v6addr) 17282 { 17283 char *addr; 17284 17285 /* 17286 * Note that some IPv6 interfaces get plumbed over links that claim to 17287 * be DL_ETHER, but don't actually have Ethernet MAC addresses (e.g. 17288 * PPP links). The ETHERADDRL check here ensures that we only set the 17289 * interface ID on IPv6 interfaces above links that actually have real 17290 * Ethernet addresses. 17291 */ 17292 if (ill->ill_phys_addr_length == ETHERADDRL) { 17293 /* Form EUI-64 like address */ 17294 addr = (char *)&v6addr->s6_addr32[2]; 17295 bcopy(ill->ill_phys_addr, addr, 3); 17296 addr[0] ^= 0x2; /* Toggle Universal/Local bit */ 17297 addr[3] = (char)0xff; 17298 addr[4] = (char)0xfe; 17299 bcopy(ill->ill_phys_addr + 3, addr + 5, 3); 17300 } 17301 } 17302 17303 /* ARGSUSED */ 17304 static void 17305 ip_nodef_v6intfid(ill_t *ill, in6_addr_t *v6addr) 17306 { 17307 } 17308 17309 typedef struct ipmp_ifcookie { 17310 uint32_t ic_hostid; 17311 char ic_ifname[LIFNAMSIZ]; 17312 char ic_zonename[ZONENAME_MAX]; 17313 } ipmp_ifcookie_t; 17314 17315 /* 17316 * Construct a pseudo-random interface ID for the IPMP interface that's both 17317 * predictable and (almost) guaranteed to be unique. 17318 */ 17319 static void 17320 ip_ipmp_v6intfid(ill_t *ill, in6_addr_t *v6addr) 17321 { 17322 zone_t *zp; 17323 uint8_t *addr; 17324 uchar_t hash[16]; 17325 ulong_t hostid; 17326 MD5_CTX ctx; 17327 ipmp_ifcookie_t ic = { 0 }; 17328 17329 ASSERT(IS_IPMP(ill)); 17330 17331 (void) ddi_strtoul(hw_serial, NULL, 10, &hostid); 17332 ic.ic_hostid = htonl((uint32_t)hostid); 17333 17334 (void) strlcpy(ic.ic_ifname, ill->ill_name, LIFNAMSIZ); 17335 17336 if ((zp = zone_find_by_id(ill->ill_zoneid)) != NULL) { 17337 (void) strlcpy(ic.ic_zonename, zp->zone_name, ZONENAME_MAX); 17338 zone_rele(zp); 17339 } 17340 17341 MD5Init(&ctx); 17342 MD5Update(&ctx, &ic, sizeof (ic)); 17343 MD5Final(hash, &ctx); 17344 17345 /* 17346 * Map the hash to an interface ID per the basic approach in RFC3041. 17347 */ 17348 addr = &v6addr->s6_addr8[8]; 17349 bcopy(hash + 8, addr, sizeof (uint64_t)); 17350 addr[0] &= ~0x2; /* set local bit */ 17351 } 17352 17353 /* 17354 * Map the multicast in6_addr_t in m_ip6addr to the physaddr for ethernet. 17355 */ 17356 static void 17357 ip_ether_v6_mapping(ill_t *ill, uchar_t *m_ip6addr, uchar_t *m_physaddr) 17358 { 17359 phyint_t *phyi = ill->ill_phyint; 17360 17361 /* 17362 * Check PHYI_MULTI_BCAST and length of physical 17363 * address to determine if we use the mapping or the 17364 * broadcast address. 17365 */ 17366 if ((phyi->phyint_flags & PHYI_MULTI_BCAST) != 0 || 17367 ill->ill_phys_addr_length != ETHERADDRL) { 17368 ip_mbcast_mapping(ill, m_ip6addr, m_physaddr); 17369 return; 17370 } 17371 m_physaddr[0] = 0x33; 17372 m_physaddr[1] = 0x33; 17373 m_physaddr[2] = m_ip6addr[12]; 17374 m_physaddr[3] = m_ip6addr[13]; 17375 m_physaddr[4] = m_ip6addr[14]; 17376 m_physaddr[5] = m_ip6addr[15]; 17377 } 17378 17379 /* 17380 * Map the multicast ipaddr_t in m_ipaddr to the physaddr for ethernet. 17381 */ 17382 static void 17383 ip_ether_v4_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr) 17384 { 17385 phyint_t *phyi = ill->ill_phyint; 17386 17387 /* 17388 * Check PHYI_MULTI_BCAST and length of physical 17389 * address to determine if we use the mapping or the 17390 * broadcast address. 
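*
* Otherwise the standard RFC 1112 mapping computed below applies: the
* low 23 bits of the group address are folded into 01:00:5e:00:00:00,
* so the (illustrative) group 224.1.2.3 maps to 01:00:5e:01:02:03.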
17391 */ 17392 if ((phyi->phyint_flags & PHYI_MULTI_BCAST) != 0 || 17393 ill->ill_phys_addr_length != ETHERADDRL) { 17394 ip_mbcast_mapping(ill, m_ipaddr, m_physaddr); 17395 return; 17396 } 17397 m_physaddr[0] = 0x01; 17398 m_physaddr[1] = 0x00; 17399 m_physaddr[2] = 0x5e; 17400 m_physaddr[3] = m_ipaddr[1] & 0x7f; 17401 m_physaddr[4] = m_ipaddr[2]; 17402 m_physaddr[5] = m_ipaddr[3]; 17403 } 17404 17405 /* ARGSUSED */ 17406 static void 17407 ip_mbcast_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr) 17408 { 17409 /* 17410 * for the MULTI_BCAST case and other cases when we want to 17411 * use the link-layer broadcast address for multicast. 17412 */ 17413 uint8_t *bphys_addr; 17414 dl_unitdata_req_t *dlur; 17415 17416 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr; 17417 if (ill->ill_sap_length < 0) { 17418 bphys_addr = (uchar_t *)dlur + 17419 dlur->dl_dest_addr_offset; 17420 } else { 17421 bphys_addr = (uchar_t *)dlur + 17422 dlur->dl_dest_addr_offset + ill->ill_sap_length; 17423 } 17424 17425 bcopy(bphys_addr, m_physaddr, ill->ill_phys_addr_length); 17426 } 17427 17428 /* 17429 * Derive IPoIB interface id from the link layer address. 17430 */ 17431 static void 17432 ip_ib_v6intfid(ill_t *ill, in6_addr_t *v6addr) 17433 { 17434 char *addr; 17435 17436 ASSERT(ill->ill_phys_addr_length == 20); 17437 addr = (char *)&v6addr->s6_addr32[2]; 17438 bcopy(ill->ill_phys_addr + 12, addr, 8); 17439 /* 17440 * In IBA 1.1 timeframe, some vendors erroneously set the u/l bit 17441 * in the globally assigned EUI-64 GUID to 1, in violation of IEEE 17442 * rules. In these cases, the IBA considers these GUIDs to be in 17443 * "Modified EUI-64" format, and thus toggling the u/l bit is not 17444 * required; vendors are required not to assign global EUI-64's 17445 * that differ only in u/l bit values, thus guaranteeing uniqueness 17446 * of the interface identifier. Whether the GUID is in modified 17447 * or proper EUI-64 format, the ipv6 identifier must have the u/l 17448 * bit set to 1. 17449 */ 17450 addr[0] |= 2; /* Set Universal/Local bit to 1 */ 17451 } 17452 17453 /* 17454 * Map the multicast ipaddr_t in m_ipaddr to the physaddr for InfiniBand. 17455 * Note on mapping from multicast IP addresses to IPoIB multicast link 17456 * addresses. IPoIB multicast link addresses are based on IBA link addresses. 17457 * The format of an IPoIB multicast address is: 17458 * 17459 * 4 byte QPN Scope Sign. Pkey 17460 * +--------------------------------------------+ 17461 * | 00FFFFFF | FF | 1X | X01B | Pkey | GroupID | 17462 * +--------------------------------------------+ 17463 * 17464 * The Scope and Pkey components are properties of the IBA port and 17465 * network interface. They can be ascertained from the broadcast address. 17466 * The Sign. part is the signature, and is 401B for IPv4 and 601B for IPv6. 17467 */ 17468 static void 17469 ip_ib_v4_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr) 17470 { 17471 static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff, 17472 0xff, 0x10, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, 17473 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; 17474 uint8_t *bphys_addr; 17475 dl_unitdata_req_t *dlur; 17476 17477 bcopy(ipv4_g_phys_ibmulti_addr, m_physaddr, ill->ill_phys_addr_length); 17478 17479 /* 17480 * RFC 4391: IPv4 MGID is 28-bit long. 
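*
* That is, only the low 28 bits of the group address survive; e.g. the
* (illustrative) group 224.1.2.3 contributes 0x00, 0x01, 0x02, 0x03 to
* m_physaddr[16..19] below.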
17481 */
17482 m_physaddr[16] = m_ipaddr[0] & 0x0f;
17483 m_physaddr[17] = m_ipaddr[1];
17484 m_physaddr[18] = m_ipaddr[2];
17485 m_physaddr[19] = m_ipaddr[3];
17486 
17487 
17488 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
17489 if (ill->ill_sap_length < 0) {
17490 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset;
17491 } else {
17492 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset +
17493 ill->ill_sap_length;
17494 }
17495 /*
17496 * Now fill in the IBA scope/Pkey values from the broadcast address.
17497 */
17498 m_physaddr[5] = bphys_addr[5];
17499 m_physaddr[8] = bphys_addr[8];
17500 m_physaddr[9] = bphys_addr[9];
17501 }
17502 
17503 static void
17504 ip_ib_v6_mapping(ill_t *ill, uchar_t *m_ip6addr, uchar_t *m_physaddr)
17505 {
17506 static uint8_t ipv6_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff,
17507 0xff, 0x10, 0x60, 0x1b, 0x00, 0x00, 0x00, 0x00,
17508 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
17509 uint8_t *bphys_addr;
17510 dl_unitdata_req_t *dlur;
17511 
17512 bcopy(ipv6_g_phys_ibmulti_addr, m_physaddr, ill->ill_phys_addr_length);
17513 
17514 /*
17515 * RFC 4391: IPv6 MGID is 80-bit long.
17516 */
17517 bcopy(&m_ip6addr[6], &m_physaddr[10], 10);
17518 
17519 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
17520 if (ill->ill_sap_length < 0) {
17521 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset;
17522 } else {
17523 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset +
17524 ill->ill_sap_length;
17525 }
17526 /*
17527 * Now fill in the IBA scope/Pkey values from the broadcast address.
17528 */
17529 m_physaddr[5] = bphys_addr[5];
17530 m_physaddr[8] = bphys_addr[8];
17531 m_physaddr[9] = bphys_addr[9];
17532 }
17533 
17534 /*
17535 * Derive the IPv6 interface id from an IPv4 link-layer address (e.g. from
17536 * an IPv4 tunnel). The IPv4 address simply gets placed in the lower 4
17537 * bytes of the IPv6 interface id. This is a suggested mechanism described
17538 * in section 3.7 of RFC4213.
17539 */
17540 static void
17541 ip_ipv4_genv6intfid(ill_t *ill, uint8_t *physaddr, in6_addr_t *v6addr)
17542 {
17543 ASSERT(ill->ill_phys_addr_length == sizeof (ipaddr_t));
17544 v6addr->s6_addr32[2] = 0;
17545 bcopy(physaddr, &v6addr->s6_addr32[3], sizeof (ipaddr_t));
17546 }
17547 
17548 /*
17549 * Derive the IPv6 interface id from an IPv6 link-layer address (e.g. from
17550 * an IPv6 tunnel). The lower 8 bytes of the IPv6 address simply become
17551 * the interface id.
17552 */
17553 static void
17554 ip_ipv6_genv6intfid(ill_t *ill, uint8_t *physaddr, in6_addr_t *v6addr)
17555 {
17556 in6_addr_t *v6lladdr = (in6_addr_t *)physaddr;
17557 
17558 ASSERT(ill->ill_phys_addr_length == sizeof (in6_addr_t));
17559 bcopy(&v6lladdr->s6_addr32[2], &v6addr->s6_addr32[2], 8);
17560 }
17561 
17562 static void
17563 ip_ipv6_v6intfid(ill_t *ill, in6_addr_t *v6addr)
17564 {
17565 ip_ipv6_genv6intfid(ill, ill->ill_phys_addr, v6addr);
17566 }
17567 
17568 static void
17569 ip_ipv6_v6destintfid(ill_t *ill, in6_addr_t *v6addr)
17570 {
17571 ip_ipv6_genv6intfid(ill, ill->ill_dest_addr, v6addr);
17572 }
17573 
17574 static void
17575 ip_ipv4_v6intfid(ill_t *ill, in6_addr_t *v6addr)
17576 {
17577 ip_ipv4_genv6intfid(ill, ill->ill_phys_addr, v6addr);
17578 }
17579 
17580 static void
17581 ip_ipv4_v6destintfid(ill_t *ill, in6_addr_t *v6addr)
17582 {
17583 ip_ipv4_genv6intfid(ill, ill->ill_dest_addr, v6addr);
17584 }
17585 
17586 /*
17587 * Look up an ill and verify that the zoneid has an ipif on that ill.
17588 * Returns a held ill, or NULL.
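*
* A sketch of the calling convention (hypothetical caller, for
* illustration only):
*
*	if ((ill = ill_lookup_on_ifindex_zoneid(ifindex, zoneid,
*	    B_FALSE, ipst)) != NULL) {
*		... use ill ...
*		ill_refrele(ill);
*	}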
17589 */
17590 ill_t *
17591 ill_lookup_on_ifindex_zoneid(uint_t index, zoneid_t zoneid, boolean_t isv6,
17592 ip_stack_t *ipst)
17593 {
17594 ill_t *ill;
17595 ipif_t *ipif;
17596 
17597 ill = ill_lookup_on_ifindex(index, isv6, ipst);
17598 if (ill == NULL)
17599 return (NULL);
17600 
17601 mutex_enter(&ill->ill_lock);
17602 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
17603 if (IPIF_IS_CONDEMNED(ipif))
17604 continue;
17605 if (zoneid != ALL_ZONES && ipif->ipif_zoneid != zoneid &&
17606 ipif->ipif_zoneid != ALL_ZONES)
17607 continue;
17608 
17609 mutex_exit(&ill->ill_lock);
17610 return (ill);
17611 }
17612 mutex_exit(&ill->ill_lock);
17613 ill_refrele(ill);
17614 return (NULL);
17615 }
17616 
17617 /*
17618 * Return a pointer to an ipif_t given a combination of (ill_idx, ipif_id).
17619 * If a pointer to an ipif_t is returned then the caller will need to do
17620 * an ipif_refrele().
17621 */
17622 ipif_t *
17623 ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6,
17624 ip_stack_t *ipst)
17625 {
17626 ipif_t *ipif;
17627 ill_t *ill;
17628 
17629 ill = ill_lookup_on_ifindex(ifindex, isv6, ipst);
17630 if (ill == NULL)
17631 return (NULL);
17632 
17633 mutex_enter(&ill->ill_lock);
17634 if (ill->ill_state_flags & ILL_CONDEMNED) {
17635 mutex_exit(&ill->ill_lock);
17636 ill_refrele(ill);
17637 return (NULL);
17638 }
17639 
17640 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
17641 if (!IPIF_CAN_LOOKUP(ipif))
17642 continue;
17643 if (lifidx == ipif->ipif_id) {
17644 ipif_refhold_locked(ipif);
17645 break;
17646 }
17647 }
17648 
17649 mutex_exit(&ill->ill_lock);
17650 ill_refrele(ill);
17651 return (ipif);
17652 }
17653 
17654 /*
17655 * Set ill_inputfn based on the current known state.
17656 * This needs to be called when any of the factors taken into
17657 * account changes.
17658 */
17659 void
17660 ill_set_inputfn(ill_t *ill)
17661 {
17662 ip_stack_t *ipst = ill->ill_ipst;
17663 
17664 if (ill->ill_isv6) {
17665 if (is_system_labeled())
17666 ill->ill_inputfn = ill_input_full_v6;
17667 else
17668 ill->ill_inputfn = ill_input_short_v6;
17669 } else {
17670 if (is_system_labeled())
17671 ill->ill_inputfn = ill_input_full_v4;
17672 else if (ill->ill_dhcpinit != 0)
17673 ill->ill_inputfn = ill_input_full_v4;
17674 else if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head
17675 != NULL)
17676 ill->ill_inputfn = ill_input_full_v4;
17677 else if (ipst->ips_ip_cgtp_filter &&
17678 ipst->ips_ip_cgtp_filter_ops != NULL)
17679 ill->ill_inputfn = ill_input_full_v4;
17680 else
17681 ill->ill_inputfn = ill_input_short_v4;
17682 }
17683 }
17684 
17685 /*
17686 * Re-evaluate ill_inputfn for all the IPv4 ills.
17687 * Used when RSVP and CGTP come and go.
17688 */
17689 void
17690 ill_set_inputfn_all(ip_stack_t *ipst)
17691 {
17692 ill_walk_context_t ctx;
17693 ill_t *ill;
17694 
17695 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
17696 ill = ILL_START_WALK_V4(&ctx, ipst);
17697 for (; ill != NULL; ill = ill_next(&ctx, ill))
17698 ill_set_inputfn(ill);
17699 
17700 rw_exit(&ipst->ips_ill_g_lock);
17701 }
17702 
17703 /*
17704 * Set the physical address information for `ill' to the contents of the
17705 * dl_notify_ind_t pointed to by `mp'. Must be called as writer, and will be
17706 * asynchronous if `ill' cannot immediately be quiesced -- in which case
17707 * EINPROGRESS will be returned.
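*
* A (hypothetical) caller should therefore treat EINPROGRESS as success
* in flight, e.g.:
*
*	err = ill_set_phys_addr(ill, mp);
*	if (err != 0 && err != EINPROGRESS)
*		... only allocation failures land here ...
*
* since the EINPROGRESS case completes later via ill_set_phys_addr_tail().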
17708 */ 17709 int 17710 ill_set_phys_addr(ill_t *ill, mblk_t *mp) 17711 { 17712 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 17713 dl_notify_ind_t *dlindp = (dl_notify_ind_t *)mp->b_rptr; 17714 17715 ASSERT(IAM_WRITER_IPSQ(ipsq)); 17716 17717 if (dlindp->dl_data != DL_IPV6_LINK_LAYER_ADDR && 17718 dlindp->dl_data != DL_CURR_DEST_ADDR && 17719 dlindp->dl_data != DL_CURR_PHYS_ADDR) { 17720 /* Changing DL_IPV6_TOKEN is not yet supported */ 17721 return (0); 17722 } 17723 17724 /* 17725 * We need to store up to two copies of `mp' in `ill'. Due to the 17726 * design of ipsq_pending_mp_add(), we can't pass them as separate 17727 * arguments to ill_set_phys_addr_tail(). Instead, chain them 17728 * together here, then pull 'em apart in ill_set_phys_addr_tail(). 17729 */ 17730 if ((mp = copyb(mp)) == NULL || (mp->b_cont = copyb(mp)) == NULL) { 17731 freemsg(mp); 17732 return (ENOMEM); 17733 } 17734 17735 ipsq_current_start(ipsq, ill->ill_ipif, 0); 17736 mutex_enter(&ill->ill_lock); 17737 ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS; 17738 /* no more nce addition allowed */ 17739 mutex_exit(&ill->ill_lock); 17740 17741 /* 17742 * If we can quiesce the ill, then set the address. If not, then 17743 * ill_set_phys_addr_tail() will be called from ipif_ill_refrele_tail(). 17744 */ 17745 ill_down_ipifs(ill, B_TRUE); 17746 mutex_enter(&ill->ill_lock); 17747 if (!ill_is_quiescent(ill)) { 17748 /* call cannot fail since `conn_t *' argument is NULL */ 17749 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq, 17750 mp, ILL_DOWN); 17751 mutex_exit(&ill->ill_lock); 17752 return (EINPROGRESS); 17753 } 17754 mutex_exit(&ill->ill_lock); 17755 17756 ill_set_phys_addr_tail(ipsq, ill->ill_rq, mp, NULL); 17757 return (0); 17758 } 17759 17760 /* 17761 * Once the ill associated with `q' has quiesced, set its physical address 17762 * information to the values in `addrmp'. Note that two copies of `addrmp' 17763 * are passed (linked by b_cont), since we sometimes need to save two distinct 17764 * copies in the ill_t, and our context doesn't permit sleeping or allocation 17765 * failure (we'll free the other copy if it's not needed). Since the ill_t 17766 * is quiesced, we know any stale nce's with the old address information have 17767 * already been removed, so we don't need to call nce_flush(). 
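*
* Concretely, ill_set_phys_addr() built the chain as:
*
*	if ((mp = copyb(mp)) == NULL || (mp->b_cont = copyb(mp)) == NULL)
*		... fail with ENOMEM ...
*
* and the unlinkb() below splits the two copies apart again.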
17768 */ 17769 /* ARGSUSED */ 17770 static void 17771 ill_set_phys_addr_tail(ipsq_t *ipsq, queue_t *q, mblk_t *addrmp, void *dummy) 17772 { 17773 ill_t *ill = q->q_ptr; 17774 mblk_t *addrmp2 = unlinkb(addrmp); 17775 dl_notify_ind_t *dlindp = (dl_notify_ind_t *)addrmp->b_rptr; 17776 uint_t addrlen, addroff; 17777 int status; 17778 17779 ASSERT(IAM_WRITER_IPSQ(ipsq)); 17780 17781 addroff = dlindp->dl_addr_offset; 17782 addrlen = dlindp->dl_addr_length - ABS(ill->ill_sap_length); 17783 17784 switch (dlindp->dl_data) { 17785 case DL_IPV6_LINK_LAYER_ADDR: 17786 ill_set_ndmp(ill, addrmp, addroff, addrlen); 17787 freemsg(addrmp2); 17788 break; 17789 17790 case DL_CURR_DEST_ADDR: 17791 freemsg(ill->ill_dest_addr_mp); 17792 ill->ill_dest_addr = addrmp->b_rptr + addroff; 17793 ill->ill_dest_addr_mp = addrmp; 17794 if (ill->ill_isv6) { 17795 ill_setdesttoken(ill); 17796 ipif_setdestlinklocal(ill->ill_ipif); 17797 } 17798 freemsg(addrmp2); 17799 break; 17800 17801 case DL_CURR_PHYS_ADDR: 17802 freemsg(ill->ill_phys_addr_mp); 17803 ill->ill_phys_addr = addrmp->b_rptr + addroff; 17804 ill->ill_phys_addr_mp = addrmp; 17805 ill->ill_phys_addr_length = addrlen; 17806 if (ill->ill_isv6) 17807 ill_set_ndmp(ill, addrmp2, addroff, addrlen); 17808 else 17809 freemsg(addrmp2); 17810 if (ill->ill_isv6) { 17811 ill_setdefaulttoken(ill); 17812 ipif_setlinklocal(ill->ill_ipif); 17813 } 17814 break; 17815 default: 17816 ASSERT(0); 17817 } 17818 17819 /* 17820 * If there are ipifs to bring up, ill_up_ipifs() will return 17821 * EINPROGRESS, and ipsq_current_finish() will be called by 17822 * ip_rput_dlpi_writer() or arp_bringup_done() when the last ipif is 17823 * brought up. 17824 */ 17825 status = ill_up_ipifs(ill, q, addrmp); 17826 mutex_enter(&ill->ill_lock); 17827 if (ill->ill_dl_up) 17828 ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS; 17829 mutex_exit(&ill->ill_lock); 17830 if (status != EINPROGRESS) 17831 ipsq_current_finish(ipsq); 17832 } 17833 17834 /* 17835 * Helper routine for setting the ill_nd_lla fields. 17836 */ 17837 void 17838 ill_set_ndmp(ill_t *ill, mblk_t *ndmp, uint_t addroff, uint_t addrlen) 17839 { 17840 freemsg(ill->ill_nd_lla_mp); 17841 ill->ill_nd_lla = ndmp->b_rptr + addroff; 17842 ill->ill_nd_lla_mp = ndmp; 17843 ill->ill_nd_lla_len = addrlen; 17844 } 17845 17846 /* 17847 * Replumb the ill. 17848 */ 17849 int 17850 ill_replumb(ill_t *ill, mblk_t *mp) 17851 { 17852 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 17853 17854 ASSERT(IAM_WRITER_IPSQ(ipsq)); 17855 17856 ipsq_current_start(ipsq, ill->ill_ipif, 0); 17857 17858 mutex_enter(&ill->ill_lock); 17859 ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS; 17860 /* no more nce addition allowed */ 17861 mutex_exit(&ill->ill_lock); 17862 17863 /* 17864 * If we can quiesce the ill, then continue. If not, then 17865 * ill_replumb_tail() will be called from ipif_ill_refrele_tail(). 
17866 */ 17867 ill_down_ipifs(ill, B_FALSE); 17868 17869 mutex_enter(&ill->ill_lock); 17870 if (!ill_is_quiescent(ill)) { 17871 /* call cannot fail since `conn_t *' argument is NULL */ 17872 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq, 17873 mp, ILL_DOWN); 17874 mutex_exit(&ill->ill_lock); 17875 return (EINPROGRESS); 17876 } 17877 mutex_exit(&ill->ill_lock); 17878 17879 ill_replumb_tail(ipsq, ill->ill_rq, mp, NULL); 17880 return (0); 17881 } 17882 17883 /* ARGSUSED */ 17884 static void 17885 ill_replumb_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy) 17886 { 17887 ill_t *ill = q->q_ptr; 17888 int err; 17889 conn_t *connp = NULL; 17890 17891 ASSERT(IAM_WRITER_IPSQ(ipsq)); 17892 freemsg(ill->ill_replumb_mp); 17893 ill->ill_replumb_mp = copyb(mp); 17894 17895 if (ill->ill_replumb_mp == NULL) { 17896 /* out of memory */ 17897 ipsq_current_finish(ipsq); 17898 return; 17899 } 17900 17901 mutex_enter(&ill->ill_lock); 17902 ill->ill_up_ipifs = ipsq_pending_mp_add(NULL, ill->ill_ipif, 17903 ill->ill_rq, ill->ill_replumb_mp, 0); 17904 mutex_exit(&ill->ill_lock); 17905 17906 if (!ill->ill_up_ipifs) { 17907 /* already closing */ 17908 ipsq_current_finish(ipsq); 17909 return; 17910 } 17911 ill->ill_replumbing = 1; 17912 err = ill_down_ipifs_tail(ill); 17913 17914 /* 17915 * Successfully quiesced and brought down the interface, now we send 17916 * the DL_NOTE_REPLUMB_DONE message down to the driver. Reuse the 17917 * DL_NOTE_REPLUMB message. 17918 */ 17919 mp = mexchange(NULL, mp, sizeof (dl_notify_conf_t), M_PROTO, 17920 DL_NOTIFY_CONF); 17921 ASSERT(mp != NULL); 17922 ((dl_notify_conf_t *)mp->b_rptr)->dl_notification = 17923 DL_NOTE_REPLUMB_DONE; 17924 ill_dlpi_send(ill, mp); 17925 17926 /* 17927 * For IPv4, we would usually get EINPROGRESS because the ETHERTYPE_ARP 17928 * streams have to be unbound. When all the DLPI exchanges are done, 17929 * ipsq_current_finish() will be called by arp_bringup_done(). The 17930 * remainder of ipif bringup via ill_up_ipifs() will also be done in 17931 * arp_bringup_done(). 17932 */ 17933 ASSERT(ill->ill_replumb_mp != NULL); 17934 if (err == EINPROGRESS) 17935 return; 17936 else 17937 ill->ill_replumb_mp = ipsq_pending_mp_get(ipsq, &connp); 17938 ASSERT(connp == NULL); 17939 if (err == 0 && ill->ill_replumb_mp != NULL && 17940 ill_up_ipifs(ill, q, ill->ill_replumb_mp) == EINPROGRESS) { 17941 return; 17942 } 17943 ipsq_current_finish(ipsq); 17944 } 17945 17946 /* 17947 * Issue ioctl `cmd' on `lh'; caller provides the initial payload in `buf' 17948 * which is `bufsize' bytes. On success, zero is returned and `buf' updated 17949 * as per the ioctl. On failure, an errno is returned. 17950 */ 17951 static int 17952 ip_ioctl(ldi_handle_t lh, int cmd, void *buf, uint_t bufsize, cred_t *cr) 17953 { 17954 int rval; 17955 struct strioctl iocb; 17956 17957 iocb.ic_cmd = cmd; 17958 iocb.ic_timout = 15; 17959 iocb.ic_len = bufsize; 17960 iocb.ic_dp = buf; 17961 17962 return (ldi_ioctl(lh, I_STR, (intptr_t)&iocb, FKIOCTL, cr, &rval)); 17963 } 17964 17965 /* 17966 * Issue an SIOCGLIFCONF for address family `af' and store the result into a 17967 * dynamically-allocated `lifcp' that will be `bufsizep' bytes on success. 
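*
* On success the caller owns lifcp->lifc_buf and must free it, e.g.
* (illustrative):
*
*	if (ip_lifconf_ioctl(lh, AF_INET, &lifc, &bufsize, cr) == 0) {
*		... walk lifc.lifc_req ...
*		kmem_free(lifc.lifc_buf, bufsize);
*	}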
17968 */ 17969 static int 17970 ip_lifconf_ioctl(ldi_handle_t lh, int af, struct lifconf *lifcp, 17971 uint_t *bufsizep, cred_t *cr) 17972 { 17973 int err; 17974 struct lifnum lifn; 17975 17976 bzero(&lifn, sizeof (lifn)); 17977 lifn.lifn_family = af; 17978 lifn.lifn_flags = LIFC_UNDER_IPMP; 17979 17980 if ((err = ip_ioctl(lh, SIOCGLIFNUM, &lifn, sizeof (lifn), cr)) != 0) 17981 return (err); 17982 17983 /* 17984 * Pad the interface count to account for additional interfaces that 17985 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF. 17986 */ 17987 lifn.lifn_count += 4; 17988 bzero(lifcp, sizeof (*lifcp)); 17989 lifcp->lifc_flags = LIFC_UNDER_IPMP; 17990 lifcp->lifc_family = af; 17991 lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq); 17992 lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP); 17993 17994 err = ip_ioctl(lh, SIOCGLIFCONF, lifcp, sizeof (*lifcp), cr); 17995 if (err != 0) { 17996 kmem_free(lifcp->lifc_buf, *bufsizep); 17997 return (err); 17998 } 17999 18000 return (0); 18001 } 18002 18003 /* 18004 * Helper for ip_interface_cleanup() that removes the loopback interface. 18005 */ 18006 static void 18007 ip_loopback_removeif(ldi_handle_t lh, boolean_t isv6, cred_t *cr) 18008 { 18009 int err; 18010 struct lifreq lifr; 18011 18012 bzero(&lifr, sizeof (lifr)); 18013 (void) strcpy(lifr.lifr_name, ipif_loopback_name); 18014 18015 /* 18016 * Attempt to remove the interface. It may legitimately not exist 18017 * (e.g. the zone administrator unplumbed it), so ignore ENXIO. 18018 */ 18019 err = ip_ioctl(lh, SIOCLIFREMOVEIF, &lifr, sizeof (lifr), cr); 18020 if (err != 0 && err != ENXIO) { 18021 ip0dbg(("ip_loopback_removeif: IP%s SIOCLIFREMOVEIF failed: " 18022 "error %d\n", isv6 ? "v6" : "v4", err)); 18023 } 18024 } 18025 18026 /* 18027 * Helper for ip_interface_cleanup() that ensures no IP interfaces are in IPMP 18028 * groups and that IPMP data addresses are down. These conditions must be met 18029 * so that IPMP interfaces can be I_PUNLINK'd, as per ip_sioctl_plink_ipmp(). 18030 */ 18031 static void 18032 ip_ipmp_cleanup(ldi_handle_t lh, boolean_t isv6, cred_t *cr) 18033 { 18034 int af = isv6 ? AF_INET6 : AF_INET; 18035 int i, nifs; 18036 int err; 18037 uint_t bufsize; 18038 uint_t lifrsize = sizeof (struct lifreq); 18039 struct lifconf lifc; 18040 struct lifreq *lifrp; 18041 18042 if ((err = ip_lifconf_ioctl(lh, af, &lifc, &bufsize, cr)) != 0) { 18043 cmn_err(CE_WARN, "ip_ipmp_cleanup: cannot get interface list " 18044 "(error %d); any IPMP interfaces cannot be shutdown", err); 18045 return; 18046 } 18047 18048 nifs = lifc.lifc_len / lifrsize; 18049 for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) { 18050 err = ip_ioctl(lh, SIOCGLIFFLAGS, lifrp, lifrsize, cr); 18051 if (err != 0) { 18052 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot get " 18053 "flags: error %d", lifrp->lifr_name, err); 18054 continue; 18055 } 18056 18057 if (lifrp->lifr_flags & IFF_IPMP) { 18058 if ((lifrp->lifr_flags & (IFF_UP|IFF_DUPLICATE)) == 0) 18059 continue; 18060 18061 lifrp->lifr_flags &= ~IFF_UP; 18062 err = ip_ioctl(lh, SIOCSLIFFLAGS, lifrp, lifrsize, cr); 18063 if (err != 0) { 18064 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot " 18065 "bring down (error %d); IPMP interface may " 18066 "not be shutdown", lifrp->lifr_name, err); 18067 } 18068 18069 /* 18070 * Check if IFF_DUPLICATE is still set -- and if so, 18071 * reset the address to clear it. 
18072 */
18073 err = ip_ioctl(lh, SIOCGLIFFLAGS, lifrp, lifrsize, cr);
18074 if (err != 0 || !(lifrp->lifr_flags & IFF_DUPLICATE))
18075 continue;
18076 
18077 err = ip_ioctl(lh, SIOCGLIFADDR, lifrp, lifrsize, cr);
18078 if (err != 0 || (err = ip_ioctl(lh, SIOCSLIFADDR,
18079 lifrp, lifrsize, cr)) != 0) {
18080 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot "
18081 "reset DAD (error %d); IPMP interface may "
18082 "not be shutdown", lifrp->lifr_name, err);
18083 }
18084 continue;
18085 }
18086 
18087 if (strchr(lifrp->lifr_name, IPIF_SEPARATOR_CHAR) == 0) {
18088 lifrp->lifr_groupname[0] = '\0';
18089 if ((err = ip_ioctl(lh, SIOCSLIFGROUPNAME, lifrp,
18090 lifrsize, cr)) != 0) {
18091 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot "
18092 "leave IPMP group (error %d); associated "
18093 "IPMP interface may not be shutdown",
18094 lifrp->lifr_name, err);
18095 continue;
18096 }
18097 }
18098 }
18099 
18100 kmem_free(lifc.lifc_buf, bufsize);
18101 }
18102 
18103 #define UDPDEV "/devices/pseudo/udp@0:udp"
18104 #define UDP6DEV "/devices/pseudo/udp6@0:udp6"
18105 
18106 /*
18107 * Remove the loopback interfaces and prep the IPMP interfaces to be torn down.
18108 * Non-loopback interfaces are either I_LINK'd or I_PLINK'd; the former go away
18109 * when the user-level processes in the zone are killed and the latter are
18110 * cleaned up by str_stack_shutdown().
18111 */
18112 void
18113 ip_interface_cleanup(ip_stack_t *ipst)
18114 {
18115 ldi_handle_t lh;
18116 ldi_ident_t li;
18117 cred_t *cr;
18118 int err;
18119 int i;
18120 char *devs[] = { UDP6DEV, UDPDEV };
18121 netstackid_t stackid = ipst->ips_netstack->netstack_stackid;
18122 
18123 if ((err = ldi_ident_from_major(ddi_name_to_major("ip"), &li)) != 0) {
18124 cmn_err(CE_WARN, "ip_interface_cleanup: cannot get ldi ident:"
18125 " error %d", err);
18126 return;
18127 }
18128 
18129 cr = zone_get_kcred(netstackid_to_zoneid(stackid));
18130 ASSERT(cr != NULL);
18131 
18132 /*
18133 * NOTE: loop executes exactly twice and is hardcoded to know that the
18134 * first iteration is IPv6. (Unrolling yields repetitious code, hence
18135 * the loop.)
18136 */
18137 for (i = 0; i < 2; i++) {
18138 err = ldi_open_by_name(devs[i], FREAD|FWRITE, cr, &lh, li);
18139 if (err != 0) {
18140 cmn_err(CE_WARN, "ip_interface_cleanup: cannot open %s:"
18141 " error %d", devs[i], err);
18142 continue;
18143 }
18144 
18145 ip_loopback_removeif(lh, i == 0, cr);
18146 ip_ipmp_cleanup(lh, i == 0, cr);
18147 
18148 (void) ldi_close(lh, FREAD|FWRITE, cr);
18149 }
18150 
18151 ldi_ident_release(li);
18152 crfree(cr);
18153 }
18154 
18155 /*
18156 * This needs to be kept in sync with the nic_event_t definition.
18157 */
18158 static const char *
18159 ill_hook_event2str(nic_event_t event)
18160 {
18161 switch (event) {
18162 case NE_PLUMB:
18163 return ("PLUMB");
18164 case NE_UNPLUMB:
18165 return ("UNPLUMB");
18166 case NE_UP:
18167 return ("UP");
18168 case NE_DOWN:
18169 return ("DOWN");
18170 case NE_ADDRESS_CHANGE:
18171 return ("ADDRESS_CHANGE");
18172 case NE_LIF_UP:
18173 return ("LIF_UP");
18174 case NE_LIF_DOWN:
18175 return ("LIF_DOWN");
18176 case NE_IFINDEX_CHANGE:
18177 return ("IFINDEX_CHANGE");
18178 default:
18179 return ("UNKNOWN");
18180 }
18181 }
18182 
18183 void
18184 ill_nic_event_dispatch(ill_t *ill, lif_if_t lif, nic_event_t event,
18185 nic_event_data_t data, size_t datalen)
18186 {
18187 ip_stack_t *ipst = ill->ill_ipst;
18188 hook_nic_event_int_t *info;
18189 const char *str = NULL;
18190 
18191 /* create a new nic event info */
18192 if ((info = kmem_alloc(sizeof (*info), KM_NOSLEEP)) == NULL)
18193 goto fail;
18194 
18195 info->hnei_event.hne_nic = ill->ill_phyint->phyint_ifindex;
18196 info->hnei_event.hne_lif = lif;
18197 info->hnei_event.hne_event = event;
18198 info->hnei_event.hne_protocol = ill->ill_isv6 ?
18199 ipst->ips_ipv6_net_data : ipst->ips_ipv4_net_data;
18200 info->hnei_event.hne_data = NULL;
18201 info->hnei_event.hne_datalen = 0;
18202 info->hnei_stackid = ipst->ips_netstack->netstack_stackid;
18203 
18204 if (data != NULL && datalen != 0) {
18205 info->hnei_event.hne_data = kmem_alloc(datalen, KM_NOSLEEP);
18206 if (info->hnei_event.hne_data == NULL)
18207 goto fail;
18208 bcopy(data, info->hnei_event.hne_data, datalen);
18209 info->hnei_event.hne_datalen = datalen;
18210 }
18211 
18212 if (ddi_taskq_dispatch(eventq_queue_nic, ip_ne_queue_func, info,
18213 DDI_NOSLEEP) == DDI_SUCCESS)
18214 return;
18215 
18216 fail:
18217 if (info != NULL) {
18218 if (info->hnei_event.hne_data != NULL) {
18219 kmem_free(info->hnei_event.hne_data,
18220 info->hnei_event.hne_datalen);
18221 }
18222 kmem_free(info, sizeof (*info));
18223 }
18224 str = ill_hook_event2str(event);
18225 ip2dbg(("ill_nic_event_dispatch: could not dispatch %s nic event "
18226 "information for %s (ENOMEM)\n", str, ill->ill_name));
18227 }
18228 
18229 static int
18230 ipif_arp_up_done_tail(ipif_t *ipif, enum ip_resolver_action res_act)
18231 {
18232 int err = 0;
18233 const in_addr_t *addr = NULL;
18234 nce_t *nce = NULL;
18235 ill_t *ill = ipif->ipif_ill;
18236 ill_t *bound_ill;
18237 boolean_t added_ipif = B_FALSE;
18238 uint16_t state;
18239 uint16_t flags;
18240 
18241 DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_up_done_tail",
18242 ill_t *, ill, ipif_t *, ipif);
18243 if (ipif->ipif_lcl_addr != INADDR_ANY) {
18244 addr = &ipif->ipif_lcl_addr;
18245 }
18246 
18247 if ((ipif->ipif_flags & IPIF_UNNUMBERED) || addr == NULL) {
18248 if (res_act != Res_act_initial)
18249 return (EINVAL);
18250 }
18251 
18252 if (addr != NULL) {
18253 ipmp_illgrp_t *illg = ill->ill_grp;
18254 
18255 /* add unicast nce for the local addr */
18256 
18257 if (IS_IPMP(ill)) {
18258 /*
18259 * If we're here via ipif_up(), then the ipif 18260 * won't be bound yet -- add it to the group, 18261 * which will bind it if possible. (We would 18262 * add it in ipif_up(), but deleting on failure 18263 * there is gruesome.) If we're here via 18264 * ipmp_ill_bind_ipif(), then the ipif has 18265 * already been added to the group and we 18266 * just need to use the binding. 18267 */ 18268 if ((bound_ill = ipmp_ipif_bound_ill(ipif)) == NULL) { 18269 bound_ill = ipmp_illgrp_add_ipif(illg, ipif); 18270 if (bound_ill == NULL) { 18271 /* 18272 * We couldn't bind the ipif to an ill 18273 * yet, so we have nothing to publish. 18274 * Mark the address as ready and return. 18275 */ 18276 ipif->ipif_addr_ready = 1; 18277 return (0); 18278 } 18279 added_ipif = B_TRUE; 18280 } 18281 } else { 18282 bound_ill = ill; 18283 } 18284 18285 flags = (NCE_F_MYADDR | NCE_F_PUBLISH | NCE_F_AUTHORITY | 18286 NCE_F_NONUD); 18287 /* 18288 * If this is an initial bring-up (or the ipif was never 18289 * completely brought up), do DAD. Otherwise, we're here 18290 * because IPMP has rebound an address to this ill: send 18291 * unsolicited advertisements (ARP announcements) to 18292 * inform others. 18293 */ 18294 if (res_act == Res_act_initial || !ipif->ipif_addr_ready) { 18295 state = ND_UNCHANGED; /* compute in nce_add_common() */ 18296 } else { 18297 state = ND_REACHABLE; 18298 flags |= NCE_F_UNSOL_ADV; 18299 } 18300 18301 retry: 18302 err = nce_lookup_then_add_v4(ill, 18303 bound_ill->ill_phys_addr, bound_ill->ill_phys_addr_length, 18304 addr, flags, state, &nce); 18305 18306 /* 18307 * note that we may encounter EEXIST if we are moving 18308 * the nce as a result of a rebind operation. 18309 */ 18310 switch (err) { 18311 case 0: 18312 ipif->ipif_added_nce = 1; 18313 nce->nce_ipif_cnt++; 18314 break; 18315 case EEXIST: 18316 ip1dbg(("ipif_arp_up: NCE already exists for %s\n", 18317 ill->ill_name)); 18318 if (!NCE_MYADDR(nce->nce_common)) { 18319 /* 18320 * A leftover nce from before this address 18321 * existed 18322 */ 18323 ncec_delete(nce->nce_common); 18324 nce_refrele(nce); 18325 nce = NULL; 18326 goto retry; 18327 } 18328 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 18329 nce_refrele(nce); 18330 nce = NULL; 18331 ip1dbg(("ipif_arp_up: NCE already exists " 18332 "for %s:%u\n", ill->ill_name, 18333 ipif->ipif_id)); 18334 goto arp_up_done; 18335 } 18336 /* 18337 * Duplicate local addresses are permissible for 18338 * IPIF_POINTOPOINT interfaces which will get marked 18339 * IPIF_UNNUMBERED later in 18340 * ip_addr_availability_check(). 18341 * 18342 * The nce_ipif_cnt field tracks the number of 18343 * ipifs that have nce_addr as their local address. 18344 */ 18345 ipif->ipif_addr_ready = 1; 18346 ipif->ipif_added_nce = 1; 18347 nce->nce_ipif_cnt++; 18348 err = 0; 18349 break; 18350 default: 18351 ASSERT(nce == NULL); 18352 goto arp_up_done; 18353 } 18354 if (arp_no_defense) { 18355 if ((ipif->ipif_flags & IPIF_UP) && 18356 !ipif->ipif_addr_ready) 18357 ipif_up_notify(ipif); 18358 ipif->ipif_addr_ready = 1; 18359 } 18360 } else { 18361 /* zero address. 
nothing to publish */ 18362 ipif->ipif_addr_ready = 1; 18363 } 18364 if (nce != NULL) 18365 nce_refrele(nce); 18366 arp_up_done: 18367 if (added_ipif && err != 0) 18368 ipmp_illgrp_del_ipif(ill->ill_grp, ipif); 18369 return (err); 18370 } 18371 18372 int 18373 ipif_arp_up(ipif_t *ipif, enum ip_resolver_action res_act, boolean_t was_dup) 18374 { 18375 int err = 0; 18376 ill_t *ill = ipif->ipif_ill; 18377 boolean_t first_interface, wait_for_dlpi = B_FALSE; 18378 18379 DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_up", 18380 ill_t *, ill, ipif_t *, ipif); 18381 18382 /* 18383 * need to bring up ARP or setup mcast mapping only 18384 * when the first interface is coming UP. 18385 */ 18386 first_interface = (ill->ill_ipif_up_count == 0 && 18387 ill->ill_ipif_dup_count == 0 && !was_dup); 18388 18389 if (res_act == Res_act_initial && first_interface) { 18390 /* 18391 * Send ATTACH + BIND 18392 */ 18393 err = arp_ll_up(ill); 18394 if (err != EINPROGRESS && err != 0) 18395 return (err); 18396 18397 /* 18398 * Add NCE for local address. Start DAD. 18399 * we'll wait to hear that DAD has finished 18400 * before using the interface. 18401 */ 18402 if (err == EINPROGRESS) 18403 wait_for_dlpi = B_TRUE; 18404 } 18405 18406 if (!wait_for_dlpi) 18407 (void) ipif_arp_up_done_tail(ipif, res_act); 18408 18409 return (!wait_for_dlpi ? 0 : EINPROGRESS); 18410 } 18411 18412 /* 18413 * Finish processing of "arp_up" after all the DLPI message 18414 * exchanges have completed between arp and the driver. 18415 */ 18416 void 18417 arp_bringup_done(ill_t *ill, int err) 18418 { 18419 mblk_t *mp1; 18420 ipif_t *ipif; 18421 conn_t *connp = NULL; 18422 ipsq_t *ipsq; 18423 queue_t *q; 18424 18425 ip1dbg(("arp_bringup_done(%s)\n", ill->ill_name)); 18426 18427 ASSERT(IAM_WRITER_ILL(ill)); 18428 18429 ipsq = ill->ill_phyint->phyint_ipsq; 18430 ipif = ipsq->ipsq_xop->ipx_pending_ipif; 18431 mp1 = ipsq_pending_mp_get(ipsq, &connp); 18432 ASSERT(!((mp1 != NULL) ^ (ipif != NULL))); 18433 if (mp1 == NULL) /* bringup was aborted by the user */ 18434 return; 18435 18436 /* 18437 * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we 18438 * must have an associated conn_t. Otherwise, we're bringing this 18439 * interface back up as part of handling an asynchronous event (e.g., 18440 * physical address change). 18441 */ 18442 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) { 18443 ASSERT(connp != NULL); 18444 q = CONNP_TO_WQ(connp); 18445 } else { 18446 ASSERT(connp == NULL); 18447 q = ill->ill_rq; 18448 } 18449 if (err == 0) { 18450 if (ipif->ipif_isv6) { 18451 if ((err = ipif_up_done_v6(ipif)) != 0) 18452 ip0dbg(("arp_bringup_done: init failed\n")); 18453 } else { 18454 err = ipif_arp_up_done_tail(ipif, Res_act_initial); 18455 if (err != 0 || 18456 (err = ipif_up_done(ipif)) != 0) { 18457 ip0dbg(("arp_bringup_done: " 18458 "init failed err %x\n", err)); 18459 (void) ipif_arp_down(ipif); 18460 } 18461 18462 } 18463 } else { 18464 ip0dbg(("arp_bringup_done: DL_BIND_REQ failed\n")); 18465 } 18466 18467 if ((err == 0) && (ill->ill_up_ipifs)) { 18468 err = ill_up_ipifs(ill, q, mp1); 18469 if (err == EINPROGRESS) 18470 return; 18471 } 18472 18473 /* 18474 * If we have a moved ipif to bring up, and everything has succeeded 18475 * to this point, bring it up on the IPMP ill. Otherwise, leave it 18476 * down -- the admin can try to bring it up by hand if need be. 

/*
 * Finish processing of "arp_replumb" after all the DLPI message
 * exchanges have completed between arp and the driver.
 */
void
arp_replumb_done(ill_t *ill, int err)
{
	mblk_t	*mp1;
	ipif_t	*ipif;
	conn_t	*connp = NULL;
	ipsq_t	*ipsq;
	queue_t	*q;

	ASSERT(IAM_WRITER_ILL(ill));

	ipsq = ill->ill_phyint->phyint_ipsq;
	ipif = ipsq->ipsq_xop->ipx_pending_ipif;
	mp1 = ipsq_pending_mp_get(ipsq, &connp);
	ASSERT(!((mp1 != NULL) ^ (ipif != NULL)));
	if (mp1 == NULL) {
		ip0dbg(("arp_replumb_done: bringup aborted ioctl %x\n",
		    ipsq->ipsq_xop->ipx_current_ioctl));
		/* bringup was aborted by the user */
		return;
	}
	/*
	 * If an ioctl is waiting on this (ipx_current_ioctl != 0), then we
	 * must have an associated conn_t.  Otherwise, we're bringing this
	 * interface back up as part of handling an asynchronous event (e.g.,
	 * a physical address change).
	 */
	if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
		ASSERT(connp != NULL);
		q = CONNP_TO_WQ(connp);
	} else {
		ASSERT(connp == NULL);
		q = ill->ill_rq;
	}
	if ((err == 0) && (ill->ill_up_ipifs)) {
		err = ill_up_ipifs(ill, q, mp1);
		if (err == EINPROGRESS)
			return;
	}
	/*
	 * The operation must complete without EINPROGRESS since
	 * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp.
	 * Otherwise, the operation will be stuck forever in the ipsq.
	 */
	ASSERT(err != EINPROGRESS);
	if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
		DTRACE_PROBE4(ipif__ioctl, char *,
		    "arp_replumb_done finish",
		    int, ipsq->ipsq_xop->ipx_current_ioctl,
		    ill_t *, ill, ipif_t *, ipif);
		ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
	} else {
		ipsq_current_finish(ipsq);
	}
}

void
ipif_up_notify(ipif_t *ipif)
{
	ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
	ip_rts_newaddrmsg(RTM_ADD, 0, ipif, RTSQ_DEFAULT);
	sctp_update_ipif(ipif, SCTP_IPIF_UP);
	ill_nic_event_dispatch(ipif->ipif_ill, MAP_IPIF_ID(ipif->ipif_id),
	    NE_LIF_UP, NULL, 0);
}
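
/*
 * ipif_up_notify() above fans one logical-interface-up event out to every
 * interested subsystem: routing sockets, SCTP's ipif cache, and NIC event
 * listeners.  As a hypothetical illustration (sketch only; error handling
 * and message parsing omitted), a userland daemon watching a routing
 * socket would observe the new address roughly like this:
 *
 *	int fd = socket(PF_ROUTE, SOCK_RAW, AF_UNSPEC);
 *	char buf[2048];
 *	ssize_t n = read(fd, buf, sizeof (buf));
 *	... process the RTM_NEWADDR / interface messages in buf ...
 */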

/*
 * The ILB ioctls use cv_wait (e.g., when deleting a rule or adding a
 * server), so they assume a cv_wait'able context.  Hence they shouldn't
 * be used on TPI endpoints with STREAMS modules pushed above IP.  This
 * is assured by not having the IPI_MODOK flag set for these ioctls.
 * IP also ensures that an ILB ioctl never ends up on an ipsq; otherwise
 * we might end up processing the ioctl while unwinding from the ipsq,
 * and that could be a thread coming in from the bottom.
 */
/* ARGSUSED */
int
ip_sioctl_ilb_cmd(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *arg)
{
	mblk_t *cmd_mp = mp->b_cont->b_cont;
	ilb_cmd_t command = *((ilb_cmd_t *)cmd_mp->b_rptr);
	int ret = 0;
	int i;
	size_t size;
	ip_stack_t *ipst;
	zoneid_t zoneid;
	ilb_stack_t *ilbs;

	ipst = CONNQ_TO_IPST(q);
	ilbs = ipst->ips_netstack->netstack_ilb;
	zoneid = Q_TO_CONN(q)->conn_zoneid;

	switch (command) {
	case ILB_CREATE_RULE: {
		ilb_rule_cmd_t *cmd = (ilb_rule_cmd_t *)cmd_mp->b_rptr;

		if (MBLKL(cmd_mp) != sizeof (ilb_rule_cmd_t)) {
			ret = EINVAL;
			break;
		}

		ret = ilb_rule_add(ilbs, zoneid, cmd);
		break;
	}
	case ILB_DESTROY_RULE:
	case ILB_ENABLE_RULE:
	case ILB_DISABLE_RULE: {
		ilb_name_cmd_t *cmd = (ilb_name_cmd_t *)cmd_mp->b_rptr;

		if (MBLKL(cmd_mp) != sizeof (ilb_name_cmd_t)) {
			ret = EINVAL;
			break;
		}

		if (cmd->flags & ILB_RULE_ALLRULES) {
			if (command == ILB_DESTROY_RULE) {
				ilb_rule_del_all(ilbs, zoneid);
				break;
			} else if (command == ILB_ENABLE_RULE) {
				ilb_rule_enable_all(ilbs, zoneid);
				break;
			} else if (command == ILB_DISABLE_RULE) {
				ilb_rule_disable_all(ilbs, zoneid);
				break;
			}
		} else {
			if (command == ILB_DESTROY_RULE) {
				ret = ilb_rule_del(ilbs, zoneid, cmd->name);
			} else if (command == ILB_ENABLE_RULE) {
				ret = ilb_rule_enable(ilbs, zoneid, cmd->name,
				    NULL);
			} else if (command == ILB_DISABLE_RULE) {
				ret = ilb_rule_disable(ilbs, zoneid, cmd->name,
				    NULL);
			}
		}
		break;
	}
	case ILB_NUM_RULES: {
		ilb_num_rules_cmd_t *cmd;

		if (MBLKL(cmd_mp) != sizeof (ilb_num_rules_cmd_t)) {
			ret = EINVAL;
			break;
		}
		cmd = (ilb_num_rules_cmd_t *)cmd_mp->b_rptr;
		ilb_get_num_rules(ilbs, zoneid, &(cmd->num));
		break;
	}
	case ILB_RULE_NAMES: {
		ilb_rule_names_cmd_t *cmd;

		cmd = (ilb_rule_names_cmd_t *)cmd_mp->b_rptr;
		if (MBLKL(cmd_mp) < sizeof (ilb_rule_names_cmd_t) ||
		    cmd->num_names == 0) {
			ret = EINVAL;
			break;
		}
		size = cmd->num_names * ILB_RULE_NAMESZ;
		if (cmd_mp->b_rptr + offsetof(ilb_rule_names_cmd_t, buf) +
		    size != cmd_mp->b_wptr) {
			ret = EINVAL;
			break;
		}
		ilb_get_rulenames(ilbs, zoneid, &cmd->num_names, cmd->buf);
		break;
	}
	case ILB_NUM_SERVERS: {
		ilb_num_servers_cmd_t *cmd;

		if (MBLKL(cmd_mp) != sizeof (ilb_num_servers_cmd_t)) {
			ret = EINVAL;
			break;
		}
		cmd = (ilb_num_servers_cmd_t *)cmd_mp->b_rptr;
		ret = ilb_get_num_servers(ilbs, zoneid, cmd->name,
		    &(cmd->num));
		break;
	}
	case ILB_LIST_RULE: {
		ilb_rule_cmd_t *cmd = (ilb_rule_cmd_t *)cmd_mp->b_rptr;

		if (MBLKL(cmd_mp) != sizeof (ilb_rule_cmd_t)) {
			ret = EINVAL;
			break;
		}
		ret = ilb_rule_list(ilbs, zoneid, cmd);
		break;
	}
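	/*
	 * The remaining commands carry variable-length payloads: a fixed
	 * header followed by an array whose element count the header
	 * declares.  Each case below validates the declared length against
	 * the mblk before touching the array; the pattern, in effect
	 * (illustrative names -- cmd_t, array, num_elems, elem_t):
	 *
	 *	size = cmd->num_elems * sizeof (elem_t);
	 *	if (cmd_mp->b_rptr + offsetof(cmd_t, array) + size !=
	 *	    cmd_mp->b_wptr)
	 *		return EINVAL to the caller;
	 *
	 * so a malformed request from userland cannot cause reads or
	 * writes past the end of the message.
	 */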
	case ILB_LIST_SERVERS: {
		ilb_servers_info_cmd_t *cmd;

		cmd = (ilb_servers_info_cmd_t *)cmd_mp->b_rptr;
		if (MBLKL(cmd_mp) < sizeof (ilb_servers_info_cmd_t) ||
		    cmd->num_servers == 0) {
			ret = EINVAL;
			break;
		}
		size = cmd->num_servers * sizeof (ilb_server_info_t);
		if (cmd_mp->b_rptr + offsetof(ilb_servers_info_cmd_t, servers) +
		    size != cmd_mp->b_wptr) {
			ret = EINVAL;
			break;
		}

		ret = ilb_get_servers(ilbs, zoneid, cmd->name, cmd->servers,
		    &cmd->num_servers);
		break;
	}
	case ILB_ADD_SERVERS: {
		ilb_servers_info_cmd_t *cmd;
		ilb_rule_t *rule;

		cmd = (ilb_servers_info_cmd_t *)cmd_mp->b_rptr;
		if (MBLKL(cmd_mp) < sizeof (ilb_servers_info_cmd_t)) {
			ret = EINVAL;
			break;
		}
		size = cmd->num_servers * sizeof (ilb_server_info_t);
		if (cmd_mp->b_rptr + offsetof(ilb_servers_info_cmd_t, servers) +
		    size != cmd_mp->b_wptr) {
			ret = EINVAL;
			break;
		}
		rule = ilb_find_rule(ilbs, zoneid, cmd->name, &ret);
		if (rule == NULL) {
			ASSERT(ret != 0);
			break;
		}
		for (i = 0; i < cmd->num_servers; i++) {
			ilb_server_info_t *s;

			s = &cmd->servers[i];
			s->err = ilb_server_add(ilbs, rule, s);
		}
		ILB_RULE_REFRELE(rule);
		break;
	}
	case ILB_DEL_SERVERS:
	case ILB_ENABLE_SERVERS:
	case ILB_DISABLE_SERVERS: {
		ilb_servers_cmd_t *cmd;
		ilb_rule_t *rule;
		int (*f)();

		cmd = (ilb_servers_cmd_t *)cmd_mp->b_rptr;
		if (MBLKL(cmd_mp) < sizeof (ilb_servers_cmd_t)) {
			ret = EINVAL;
			break;
		}
		size = cmd->num_servers * sizeof (ilb_server_arg_t);
		if (cmd_mp->b_rptr + offsetof(ilb_servers_cmd_t, servers) +
		    size != cmd_mp->b_wptr) {
			ret = EINVAL;
			break;
		}

		if (command == ILB_DEL_SERVERS)
			f = ilb_server_del;
		else if (command == ILB_ENABLE_SERVERS)
			f = ilb_server_enable;
		else if (command == ILB_DISABLE_SERVERS)
			f = ilb_server_disable;

		rule = ilb_find_rule(ilbs, zoneid, cmd->name, &ret);
		if (rule == NULL) {
			ASSERT(ret != 0);
			break;
		}

		for (i = 0; i < cmd->num_servers; i++) {
			ilb_server_arg_t *s;

			s = &cmd->servers[i];
			s->err = f(ilbs, zoneid, NULL, rule, &s->addr);
		}
		ILB_RULE_REFRELE(rule);
		break;
	}
	case ILB_LIST_NAT_TABLE: {
		ilb_list_nat_cmd_t *cmd;

		cmd = (ilb_list_nat_cmd_t *)cmd_mp->b_rptr;
		if (MBLKL(cmd_mp) < sizeof (ilb_list_nat_cmd_t)) {
			ret = EINVAL;
			break;
		}
		size = cmd->num_nat * sizeof (ilb_nat_entry_t);
		if (cmd_mp->b_rptr + offsetof(ilb_list_nat_cmd_t, entries) +
		    size != cmd_mp->b_wptr) {
			ret = EINVAL;
			break;
		}

		ret = ilb_list_nat(ilbs, zoneid, cmd->entries, &cmd->num_nat,
		    &cmd->flags);
		break;
	}
	case ILB_LIST_STICKY_TABLE: {
		ilb_list_sticky_cmd_t *cmd;

		cmd = (ilb_list_sticky_cmd_t *)cmd_mp->b_rptr;
		if (MBLKL(cmd_mp) < sizeof (ilb_list_sticky_cmd_t)) {
			ret = EINVAL;
			break;
		}
		size = cmd->num_sticky * sizeof (ilb_sticky_entry_t);
		if (cmd_mp->b_rptr + offsetof(ilb_list_sticky_cmd_t, entries) +
		    size != cmd_mp->b_wptr) {
			ret = EINVAL;
			break;
		}

		ret = ilb_list_sticky(ilbs, zoneid, cmd->entries,
		    &cmd->num_sticky, &cmd->flags);
		break;
	}
	default:
		ret = EINVAL;
		break;
	}
	return (ret);
}
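
/*
 * Note on the message layout assumed by ip_sioctl_ilb_cmd() above: the
 * ioctl payload sits two mblk links down (mp->b_cont->b_cont), its first
 * word is the ilb_cmd_t discriminator, and the rest is the matching
 * ilb_*_cmd_t structure for that command.  Schematically:
 *
 *	mp (ioctl) ---> b_cont ---> b_cont == cmd_mp
 *	cmd_mp->b_rptr: [ ilb_cmd_t command | command-specific fields ... ]
 */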

/* Remove all cache entries for this logical interface. */
void
ipif_nce_down(ipif_t *ipif)
{
	ill_t *ill = ipif->ipif_ill;
	nce_t *nce;

	DTRACE_PROBE3(ipif__downup, char *, "ipif_nce_down",
	    ill_t *, ill, ipif_t *, ipif);
	if (ipif->ipif_added_nce) {
		if (ipif->ipif_isv6)
			nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr);
		else
			nce = nce_lookup_v4(ill, &ipif->ipif_lcl_addr);
		if (nce != NULL) {
			if (--nce->nce_ipif_cnt == 0)
				ncec_delete(nce->nce_common);
			ipif->ipif_added_nce = 0;
			nce_refrele(nce);
		} else {
			/*
			 * The nce may already be gone, e.g., because it
			 * was flushed by an earlier call to nce_flush().
			 */
			ipif->ipif_added_nce = 0;
		}
	}

	/*
	 * Make IPMP aware of the deleted data address.
	 */
	if (IS_IPMP(ill))
		ipmp_illgrp_del_ipif(ill->ill_grp, ipif);

	/*
	 * Remove all other nces dependent on this ill when the last ipif
	 * is going away.
	 */
	if (ill->ill_ipif_up_count == 0) {
		ncec_walk(ill, (pfi_t)ncec_delete_per_ill,
		    (uchar_t *)ill, ill->ill_ipst);
		if (IS_UNDER_IPMP(ill))
			nce_flush(ill, B_TRUE);
	}
}

/*
 * Find the first interface that uses usill for its source address.
 */
ill_t *
ill_lookup_usesrc(ill_t *usill)
{
	ip_stack_t *ipst;
	ill_t *ill;

	ASSERT(usill != NULL);
	ipst = usill->ill_ipst;

	/* ill_g_usesrc_lock protects ill_usesrc_grp_next */
	rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	for (ill = usill->ill_usesrc_grp_next; ill != NULL && ill != usill;
	    ill = ill->ill_usesrc_grp_next) {
		if (!IS_UNDER_IPMP(ill) && (ill->ill_flags & ILLF_MULTICAST) &&
		    !ILL_IS_CONDEMNED(ill)) {
			ill_refhold(ill);
			break;
		}
	}
	rw_exit(&ipst->ips_ill_g_lock);
	rw_exit(&ipst->ips_ill_g_usesrc_lock);
	return (ill);
}
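
/*
 * Usage note for ill_lookup_usesrc() above (illustrative): usesrc groups
 * are configured from userland, e.g. "ifconfig vni0 usesrc hme0", which
 * links the ills together via ill_usesrc_grp_next.  Given the physical
 * ill (hme0 here), the function returns the first usable interface in
 * that group -- multicast-capable, not condemned, and not an IPMP
 * underlying ill -- with a reference held via ill_refhold(), or NULL if
 * no such interface exists.
 */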