1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 /* Copyright (c) 1990 Mentat Inc. */ 26 27 /* 28 * This file contains the interface control functions for IP. 
29 */ 30 31 #include <sys/types.h> 32 #include <sys/stream.h> 33 #include <sys/dlpi.h> 34 #include <sys/stropts.h> 35 #include <sys/strsun.h> 36 #include <sys/sysmacros.h> 37 #include <sys/strsubr.h> 38 #include <sys/strlog.h> 39 #include <sys/ddi.h> 40 #include <sys/sunddi.h> 41 #include <sys/cmn_err.h> 42 #include <sys/kstat.h> 43 #include <sys/debug.h> 44 #include <sys/zone.h> 45 #include <sys/sunldi.h> 46 #include <sys/file.h> 47 #include <sys/bitmap.h> 48 #include <sys/cpuvar.h> 49 #include <sys/time.h> 50 #include <sys/ctype.h> 51 #include <sys/kmem.h> 52 #include <sys/systm.h> 53 #include <sys/param.h> 54 #include <sys/socket.h> 55 #include <sys/isa_defs.h> 56 #include <net/if.h> 57 #include <net/if_arp.h> 58 #include <net/if_types.h> 59 #include <net/if_dl.h> 60 #include <net/route.h> 61 #include <sys/sockio.h> 62 #include <netinet/in.h> 63 #include <netinet/ip6.h> 64 #include <netinet/icmp6.h> 65 #include <netinet/igmp_var.h> 66 #include <sys/policy.h> 67 #include <sys/ethernet.h> 68 #include <sys/callb.h> 69 #include <sys/md5.h> 70 71 #include <inet/common.h> /* for various inet/mi.h and inet/nd.h needs */ 72 #include <inet/mi.h> 73 #include <inet/nd.h> 74 #include <inet/arp.h> 75 #include <inet/ip_arp.h> 76 #include <inet/mib2.h> 77 #include <inet/ip.h> 78 #include <inet/ip6.h> 79 #include <inet/ip6_asp.h> 80 #include <inet/tcp.h> 81 #include <inet/ip_multi.h> 82 #include <inet/ip_ire.h> 83 #include <inet/ip_ftable.h> 84 #include <inet/ip_rts.h> 85 #include <inet/ip_ndp.h> 86 #include <inet/ip_if.h> 87 #include <inet/ip_impl.h> 88 #include <inet/sctp_ip.h> 89 #include <inet/ip_netinfo.h> 90 #include <inet/ilb_ip.h> 91 92 #include <netinet/igmp.h> 93 #include <inet/ip_listutils.h> 94 #include <inet/ipclassifier.h> 95 #include <sys/mac_client.h> 96 #include <sys/dld.h> 97 98 #include <sys/systeminfo.h> 99 #include <sys/bootconf.h> 100 101 #include <sys/tsol/tndb.h> 102 #include <sys/tsol/tnet.h> 103 104 /* The character which tells where the ill_name ends 
 */
#define	IPIF_SEPARATOR_CHAR	':'

/*
 * IP ioctl function table entry.
 *
 * Entries of this type populate ip_ioctl_ftbl[] later in this file, where
 * each initializer supplies, in order: a command code, the handler function,
 * a size, and a set of IPFT_F_* flags.
 */
typedef struct ipft_s {
	int	ipft_cmd;	/* ioctl command code (IP_IOC_*) */
	pfi_t	ipft_pfi;	/* handler invoked for ipft_cmd */
	int	ipft_min_size;	/* presumably the minimum argument size */
	int	ipft_flags;	/* IPFT_F_* bits, see below */
} ipft_t;
#define	IPFT_F_NO_REPLY		0x1	/* IP ioctl does not expect any reply */
#define	IPFT_F_SELF_REPLY	0x2	/* ioctl callee does the ioctl reply */

/*
 * Forward declarations for functions that are private to this file.
 */

/* ndd get/set handlers for the per-ill forwarding setting */
static int	nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int	nd_ill_forward_set(queue_t *q, mblk_t *mp,
		    char *value, caddr_t cp, cred_t *ioc_cr);

/* ioctl helpers ("_tail" routines and friends) and ipsq maintenance */
static boolean_t ill_is_quiescent(ill_t *);
static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask);
static ip_m_t	*ip_m_lookup(t_uscalar_t mac_type);
static int	ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
		    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
		    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
		    queue_t *q, mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q,
		    mblk_t *mp);
static int	ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
		    mblk_t *mp);
static int	ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t,
		    queue_t *q, mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp,
		    int ioccmd, struct linkblk *li);
static ipaddr_t	ip_subnet_mask(ipaddr_t addr, ipif_t **, ip_stack_t *);
static void	ip_wput_ioctl(queue_t *q, mblk_t *mp);
static void	ipsq_flush(ill_t *ill);

static int	ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen,
		    queue_t *q, mblk_t *mp, boolean_t need_up);
static void	ipsq_delete(ipsq_t *);

/* ipif (logical interface) management */
static ipif_t	*ipif_allocate(ill_t *ill, int id, uint_t ire_type,
		    boolean_t initialize, boolean_t insert, int *errorp);
static ire_t	**ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep);
static void	ipif_delete_bcast_ires(ipif_t *ipif);
static int	ipif_add_ires_v4(ipif_t *, boolean_t);
static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif,
		    boolean_t isv6);
static int	ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp);
static void	ipif_free(ipif_t *ipif);
static void	ipif_free_tail(ipif_t *ipif);
static void	ipif_set_default(ipif_t *ipif);
static int	ipif_set_values(queue_t *q, mblk_t *mp,
		    char *interf_name, uint_t *ppa);
static int	ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp,
		    queue_t *q);
static ipif_t	*ipif_lookup_on_name(char *name, size_t namelen,
		    boolean_t do_alloc, boolean_t *exists, boolean_t isv6,
		    zoneid_t zoneid, ip_stack_t *);

/* ill (lower-level interface) management */
static int	ill_alloc_ppa(ill_if_t *, ill_t *);
static void	ill_delete_interface_type(ill_if_t *);
static int	ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q);
static void	ill_dl_down(ill_t *ill);
static void	ill_down(ill_t *ill);
static void	ill_down_ipifs(ill_t *, boolean_t);
static void	ill_free_mib(ill_t *ill);
static void	ill_glist_delete(ill_t *);
static void	ill_phyint_reinit(ill_t *ill);
static void	ill_set_nce_router_flags(ill_t *, boolean_t);
static void	ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *);
static void	ill_replumb_tail(ipsq_t *, queue_t *, mblk_t *, void *);

/*
 * Per-media callbacks referenced from ip_m_tbl[]: IPv6 interface-id
 * generators and v4/v6 mapping routines.
 */
static ip_v6intfid_func_t ip_ether_v6intfid, ip_ib_v6intfid;
static ip_v6intfid_func_t ip_ipv4_v6intfid, ip_ipv6_v6intfid;
static ip_v6intfid_func_t ip_ipmp_v6intfid, ip_nodef_v6intfid;
static ip_v6intfid_func_t ip_ipv4_v6destintfid, ip_ipv6_v6destintfid;
static ip_v4mapinfo_func_t ip_ether_v4_mapping;
static ip_v6mapinfo_func_t ip_ether_v6_mapping;
static ip_v4mapinfo_func_t ip_ib_v4_mapping;
static ip_v6mapinfo_func_t ip_ib_v6_mapping;
static ip_v4mapinfo_func_t ip_mbcast_mapping;
static void	ip_cgtp_bcast_add(ire_t *, ip_stack_t *);
static void	ip_cgtp_bcast_delete(ire_t *, ip_stack_t *);
static void	phyint_free(phyint_t *);

/* DLPI capability negotiation */
static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_vrrp_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_hcksum_reset_fill(ill_t *, mblk_t *);
static void ill_capability_zerocopy_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void ill_capability_zerocopy_reset_fill(ill_t *, mblk_t *);
static void ill_capability_dld_reset_fill(ill_t *, mblk_t *);
static void	ill_capability_dld_ack(ill_t *, mblk_t *,
		    dl_capability_sub_t *);
static void	ill_capability_dld_enable(ill_t *);
static void	ill_capability_ack_thr(void *);
static void	ill_capability_lso_enable(ill_t *);

/* usesrc group handling and misc. */
static ill_t	*ill_prev_usesrc(ill_t *);
static int	ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t);
static void	ill_disband_usesrc_group(ill_t *);
static void	ip_sioctl_garp_reply(mblk_t *, ill_t *, void *, int);

#ifdef DEBUG
/* Trace-record cleanup, only built into DEBUG kernels */
static void	ill_trace_cleanup(const ill_t *);
static void	ipif_trace_cleanup(const ipif_t *);
#endif

/*
 * if we go over the memory footprint limit more than once in this msec
 * interval, we'll start pruning aggressively.
219 */ 220 int ip_min_frag_prune_time = 0; 221 222 static ipft_t ip_ioctl_ftbl[] = { 223 { IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 }, 224 { IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t), 225 IPFT_F_NO_REPLY }, 226 { IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY }, 227 { 0 } 228 }; 229 230 /* Simple ICMP IP Header Template */ 231 static ipha_t icmp_ipha = { 232 IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP 233 }; 234 235 static uchar_t ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; 236 237 static ip_m_t ip_m_tbl[] = { 238 { DL_ETHER, IFT_ETHER, ETHERTYPE_IP, ETHERTYPE_IPV6, 239 ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid, 240 ip_nodef_v6intfid }, 241 { DL_CSMACD, IFT_ISO88023, ETHERTYPE_IP, ETHERTYPE_IPV6, 242 ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid, 243 ip_nodef_v6intfid }, 244 { DL_TPB, IFT_ISO88024, ETHERTYPE_IP, ETHERTYPE_IPV6, 245 ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid, 246 ip_nodef_v6intfid }, 247 { DL_TPR, IFT_ISO88025, ETHERTYPE_IP, ETHERTYPE_IPV6, 248 ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid, 249 ip_nodef_v6intfid }, 250 { DL_FDDI, IFT_FDDI, ETHERTYPE_IP, ETHERTYPE_IPV6, 251 ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid, 252 ip_nodef_v6intfid }, 253 { DL_IB, IFT_IB, ETHERTYPE_IP, ETHERTYPE_IPV6, 254 ip_ib_v4_mapping, ip_ib_v6_mapping, ip_ib_v6intfid, 255 ip_nodef_v6intfid }, 256 { DL_IPV4, IFT_IPV4, IPPROTO_ENCAP, IPPROTO_IPV6, 257 ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid, 258 ip_ipv4_v6destintfid }, 259 { DL_IPV6, IFT_IPV6, IPPROTO_ENCAP, IPPROTO_IPV6, 260 ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv6_v6intfid, 261 ip_ipv6_v6destintfid }, 262 { DL_6TO4, IFT_6TO4, IPPROTO_ENCAP, IPPROTO_IPV6, 263 ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid, 264 ip_nodef_v6intfid }, 265 { SUNW_DL_VNI, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6, 266 NULL, NULL, ip_nodef_v6intfid, ip_nodef_v6intfid }, 267 { 
SUNW_DL_IPMP, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6, 268 NULL, NULL, ip_ipmp_v6intfid, ip_nodef_v6intfid }, 269 { DL_OTHER, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6, 270 ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid, 271 ip_nodef_v6intfid } 272 }; 273 274 static ill_t ill_null; /* Empty ILL for init. */ 275 char ipif_loopback_name[] = "lo0"; 276 static char *ipv4_forward_suffix = ":ip_forwarding"; 277 static char *ipv6_forward_suffix = ":ip6_forwarding"; 278 static sin6_t sin6_null; /* Zero address for quick clears */ 279 static sin_t sin_null; /* Zero address for quick clears */ 280 281 /* When set search for unused ipif_seqid */ 282 static ipif_t ipif_zero; 283 284 /* 285 * ppa arena is created after these many 286 * interfaces have been plumbed. 287 */ 288 uint_t ill_no_arena = 12; /* Setable in /etc/system */ 289 290 /* 291 * Allocate per-interface mibs. 292 * Returns true if ok. False otherwise. 293 * ipsq may not yet be allocated (loopback case ). 294 */ 295 static boolean_t 296 ill_allocate_mibs(ill_t *ill) 297 { 298 /* Already allocated? 
*/ 299 if (ill->ill_ip_mib != NULL) { 300 if (ill->ill_isv6) 301 ASSERT(ill->ill_icmp6_mib != NULL); 302 return (B_TRUE); 303 } 304 305 ill->ill_ip_mib = kmem_zalloc(sizeof (*ill->ill_ip_mib), 306 KM_NOSLEEP); 307 if (ill->ill_ip_mib == NULL) { 308 return (B_FALSE); 309 } 310 311 /* Setup static information */ 312 SET_MIB(ill->ill_ip_mib->ipIfStatsEntrySize, 313 sizeof (mib2_ipIfStatsEntry_t)); 314 if (ill->ill_isv6) { 315 ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6; 316 SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize, 317 sizeof (mib2_ipv6AddrEntry_t)); 318 SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize, 319 sizeof (mib2_ipv6RouteEntry_t)); 320 SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize, 321 sizeof (mib2_ipv6NetToMediaEntry_t)); 322 SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize, 323 sizeof (ipv6_member_t)); 324 SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize, 325 sizeof (ipv6_grpsrc_t)); 326 } else { 327 ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4; 328 SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize, 329 sizeof (mib2_ipAddrEntry_t)); 330 SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize, 331 sizeof (mib2_ipRouteEntry_t)); 332 SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize, 333 sizeof (mib2_ipNetToMediaEntry_t)); 334 SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize, 335 sizeof (ip_member_t)); 336 SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize, 337 sizeof (ip_grpsrc_t)); 338 339 /* 340 * For a v4 ill, we are done at this point, because per ill 341 * icmp mibs are only used for v6. 
342 */ 343 return (B_TRUE); 344 } 345 346 ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib), 347 KM_NOSLEEP); 348 if (ill->ill_icmp6_mib == NULL) { 349 kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib)); 350 ill->ill_ip_mib = NULL; 351 return (B_FALSE); 352 } 353 /* static icmp info */ 354 ill->ill_icmp6_mib->ipv6IfIcmpEntrySize = 355 sizeof (mib2_ipv6IfIcmpEntry_t); 356 /* 357 * The ipIfStatsIfindex and ipv6IfIcmpIndex will be assigned later 358 * after the phyint merge occurs in ipif_set_values -> ill_glist_insert 359 * -> ill_phyint_reinit 360 */ 361 return (B_TRUE); 362 } 363 364 /* 365 * Completely vaporize a lower level tap and all associated interfaces. 366 * ill_delete is called only out of ip_close when the device control 367 * stream is being closed. 368 */ 369 void 370 ill_delete(ill_t *ill) 371 { 372 ipif_t *ipif; 373 ill_t *prev_ill; 374 ip_stack_t *ipst = ill->ill_ipst; 375 376 /* 377 * ill_delete may be forcibly entering the ipsq. The previous 378 * ioctl may not have completed and may need to be aborted. 379 * ipsq_flush takes care of it. If we don't need to enter the 380 * the ipsq forcibly, the 2nd invocation of ipsq_flush in 381 * ill_delete_tail is sufficient. 382 */ 383 ipsq_flush(ill); 384 385 /* 386 * Nuke all interfaces. ipif_free will take down the interface, 387 * remove it from the list, and free the data structure. 388 * Walk down the ipif list and remove the logical interfaces 389 * first before removing the main ipif. We can't unplumb 390 * zeroth interface first in the case of IPv6 as update_conn_ill 391 * -> ip_ll_multireq de-references ill_ipif for checking 392 * POINTOPOINT. 393 * 394 * If ill_ipif was not properly initialized (i.e low on memory), 395 * then no interfaces to clean up. In this case just clean up the 396 * ill. 
397 */ 398 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) 399 ipif_free(ipif); 400 401 /* 402 * clean out all the nce_t entries that depend on this 403 * ill for the ill_phys_addr. 404 */ 405 nce_flush(ill, B_TRUE); 406 407 /* Clean up msgs on pending upcalls for mrouted */ 408 reset_mrt_ill(ill); 409 410 update_conn_ill(ill, ipst); 411 412 /* 413 * Remove multicast references added as a result of calls to 414 * ip_join_allmulti(). 415 */ 416 ip_purge_allmulti(ill); 417 418 /* 419 * If the ill being deleted is under IPMP, boot it out of the illgrp. 420 */ 421 if (IS_UNDER_IPMP(ill)) 422 ipmp_ill_leave_illgrp(ill); 423 424 /* 425 * ill_down will arrange to blow off any IRE's dependent on this 426 * ILL, and shut down fragmentation reassembly. 427 */ 428 ill_down(ill); 429 430 /* Let SCTP know, so that it can remove this from its list. */ 431 sctp_update_ill(ill, SCTP_ILL_REMOVE); 432 433 /* 434 * Walk all CONNs that can have a reference on an ire or nce for this 435 * ill (we actually walk all that now have stale references). 436 */ 437 ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst); 438 439 /* With IPv6 we have dce_ifindex. Cleanup for neatness */ 440 if (ill->ill_isv6) 441 dce_cleanup(ill->ill_phyint->phyint_ifindex, ipst); 442 443 /* 444 * If an address on this ILL is being used as a source address then 445 * clear out the pointers in other ILLs that point to this ILL. 446 */ 447 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER); 448 if (ill->ill_usesrc_grp_next != NULL) { 449 if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? 
*/ 450 ill_disband_usesrc_group(ill); 451 } else { /* consumer of the usesrc ILL */ 452 prev_ill = ill_prev_usesrc(ill); 453 prev_ill->ill_usesrc_grp_next = 454 ill->ill_usesrc_grp_next; 455 } 456 } 457 rw_exit(&ipst->ips_ill_g_usesrc_lock); 458 } 459 460 static void 461 ipif_non_duplicate(ipif_t *ipif) 462 { 463 ill_t *ill = ipif->ipif_ill; 464 mutex_enter(&ill->ill_lock); 465 if (ipif->ipif_flags & IPIF_DUPLICATE) { 466 ipif->ipif_flags &= ~IPIF_DUPLICATE; 467 ASSERT(ill->ill_ipif_dup_count > 0); 468 ill->ill_ipif_dup_count--; 469 } 470 mutex_exit(&ill->ill_lock); 471 } 472 473 /* 474 * ill_delete_tail is called from ip_modclose after all references 475 * to the closing ill are gone. The wait is done in ip_modclose 476 */ 477 void 478 ill_delete_tail(ill_t *ill) 479 { 480 mblk_t **mpp; 481 ipif_t *ipif; 482 ip_stack_t *ipst = ill->ill_ipst; 483 484 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 485 ipif_non_duplicate(ipif); 486 (void) ipif_down_tail(ipif); 487 } 488 489 ASSERT(ill->ill_ipif_dup_count == 0); 490 491 /* 492 * If polling capability is enabled (which signifies direct 493 * upcall into IP and driver has ill saved as a handle), 494 * we need to make sure that unbind has completed before we 495 * let the ill disappear and driver no longer has any reference 496 * to this ill. 497 */ 498 mutex_enter(&ill->ill_lock); 499 while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS) 500 cv_wait(&ill->ill_cv, &ill->ill_lock); 501 mutex_exit(&ill->ill_lock); 502 ASSERT(!(ill->ill_capabilities & 503 (ILL_CAPAB_DLD | ILL_CAPAB_DLD_POLL | ILL_CAPAB_DLD_DIRECT))); 504 505 if (ill->ill_net_type != IRE_LOOPBACK) 506 qprocsoff(ill->ill_rq); 507 508 /* 509 * We do an ipsq_flush once again now. New messages could have 510 * landed up from below (M_ERROR or M_HANGUP). 
Similarly ioctls 511 * could also have landed up if an ioctl thread had looked up 512 * the ill before we set the ILL_CONDEMNED flag, but not yet 513 * enqueued the ioctl when we did the ipsq_flush last time. 514 */ 515 ipsq_flush(ill); 516 517 /* 518 * Free capabilities. 519 */ 520 if (ill->ill_hcksum_capab != NULL) { 521 kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t)); 522 ill->ill_hcksum_capab = NULL; 523 } 524 525 if (ill->ill_zerocopy_capab != NULL) { 526 kmem_free(ill->ill_zerocopy_capab, 527 sizeof (ill_zerocopy_capab_t)); 528 ill->ill_zerocopy_capab = NULL; 529 } 530 531 if (ill->ill_lso_capab != NULL) { 532 kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t)); 533 ill->ill_lso_capab = NULL; 534 } 535 536 if (ill->ill_dld_capab != NULL) { 537 kmem_free(ill->ill_dld_capab, sizeof (ill_dld_capab_t)); 538 ill->ill_dld_capab = NULL; 539 } 540 541 while (ill->ill_ipif != NULL) 542 ipif_free_tail(ill->ill_ipif); 543 544 /* 545 * We have removed all references to ilm from conn and the ones joined 546 * within the kernel. 547 * 548 * We don't walk conns, mrts and ires because 549 * 550 * 1) update_conn_ill and reset_mrt_ill cleans up conns and mrts. 551 * 2) ill_down ->ill_downi walks all the ires and cleans up 552 * ill references. 553 */ 554 555 /* 556 * If this ill is an IPMP meta-interface, blow away the illgrp. This 557 * is safe to do because the illgrp has already been unlinked from the 558 * group by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find it. 559 */ 560 if (IS_IPMP(ill)) { 561 ipmp_illgrp_destroy(ill->ill_grp); 562 ill->ill_grp = NULL; 563 } 564 565 /* 566 * Take us out of the list of ILLs. ill_glist_delete -> phyint_free 567 * could free the phyint. No more reference to the phyint after this 568 * point. 
569 */ 570 (void) ill_glist_delete(ill); 571 572 rw_enter(&ipst->ips_ip_g_nd_lock, RW_WRITER); 573 if (ill->ill_ndd_name != NULL) 574 nd_unload(&ipst->ips_ip_g_nd, ill->ill_ndd_name); 575 rw_exit(&ipst->ips_ip_g_nd_lock); 576 577 if (ill->ill_frag_ptr != NULL) { 578 uint_t count; 579 580 for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) { 581 mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock); 582 } 583 mi_free(ill->ill_frag_ptr); 584 ill->ill_frag_ptr = NULL; 585 ill->ill_frag_hash_tbl = NULL; 586 } 587 588 freemsg(ill->ill_nd_lla_mp); 589 /* Free all retained control messages. */ 590 mpp = &ill->ill_first_mp_to_free; 591 do { 592 while (mpp[0]) { 593 mblk_t *mp; 594 mblk_t *mp1; 595 596 mp = mpp[0]; 597 mpp[0] = mp->b_next; 598 for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) { 599 mp1->b_next = NULL; 600 mp1->b_prev = NULL; 601 } 602 freemsg(mp); 603 } 604 } while (mpp++ != &ill->ill_last_mp_to_free); 605 606 ill_free_mib(ill); 607 608 #ifdef DEBUG 609 ill_trace_cleanup(ill); 610 #endif 611 612 /* The default multicast interface might have changed */ 613 ire_increment_multicast_generation(ipst, ill->ill_isv6); 614 615 /* Drop refcnt here */ 616 netstack_rele(ill->ill_ipst->ips_netstack); 617 ill->ill_ipst = NULL; 618 } 619 620 static void 621 ill_free_mib(ill_t *ill) 622 { 623 ip_stack_t *ipst = ill->ill_ipst; 624 625 /* 626 * MIB statistics must not be lost, so when an interface 627 * goes away the counter values will be added to the global 628 * MIBs. 
629 */ 630 if (ill->ill_ip_mib != NULL) { 631 if (ill->ill_isv6) { 632 ip_mib2_add_ip_stats(&ipst->ips_ip6_mib, 633 ill->ill_ip_mib); 634 } else { 635 ip_mib2_add_ip_stats(&ipst->ips_ip_mib, 636 ill->ill_ip_mib); 637 } 638 639 kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib)); 640 ill->ill_ip_mib = NULL; 641 } 642 if (ill->ill_icmp6_mib != NULL) { 643 ip_mib2_add_icmp6_stats(&ipst->ips_icmp6_mib, 644 ill->ill_icmp6_mib); 645 kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib)); 646 ill->ill_icmp6_mib = NULL; 647 } 648 } 649 650 /* 651 * Concatenate together a physical address and a sap. 652 * 653 * Sap_lengths are interpreted as follows: 654 * sap_length == 0 ==> no sap 655 * sap_length > 0 ==> sap is at the head of the dlpi address 656 * sap_length < 0 ==> sap is at the tail of the dlpi address 657 */ 658 static void 659 ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length, 660 t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst) 661 { 662 uint16_t sap_addr = (uint16_t)sap_src; 663 664 if (sap_length == 0) { 665 if (phys_src == NULL) 666 bzero(dst, phys_length); 667 else 668 bcopy(phys_src, dst, phys_length); 669 } else if (sap_length < 0) { 670 if (phys_src == NULL) 671 bzero(dst, phys_length); 672 else 673 bcopy(phys_src, dst, phys_length); 674 bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr)); 675 } else { 676 bcopy(&sap_addr, dst, sizeof (sap_addr)); 677 if (phys_src == NULL) 678 bzero((char *)dst + sap_length, phys_length); 679 else 680 bcopy(phys_src, (char *)dst + sap_length, phys_length); 681 } 682 } 683 684 /* 685 * Generate a dl_unitdata_req mblk for the device and address given. 686 * addr_length is the length of the physical portion of the address. 687 * If addr is NULL include an all zero address of the specified length. 688 * TRUE? In any case, addr_length is taken to be the entire length of the 689 * dlpi address, including the absolute value of sap_length. 
690 */ 691 mblk_t * 692 ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap, 693 t_scalar_t sap_length) 694 { 695 dl_unitdata_req_t *dlur; 696 mblk_t *mp; 697 t_scalar_t abs_sap_length; /* absolute value */ 698 699 abs_sap_length = ABS(sap_length); 700 mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length, 701 DL_UNITDATA_REQ); 702 if (mp == NULL) 703 return (NULL); 704 dlur = (dl_unitdata_req_t *)mp->b_rptr; 705 /* HACK: accomodate incompatible DLPI drivers */ 706 if (addr_length == 8) 707 addr_length = 6; 708 dlur->dl_dest_addr_length = addr_length + abs_sap_length; 709 dlur->dl_dest_addr_offset = sizeof (*dlur); 710 dlur->dl_priority.dl_min = 0; 711 dlur->dl_priority.dl_max = 0; 712 ill_dlur_copy_address(addr, addr_length, sap, sap_length, 713 (uchar_t *)&dlur[1]); 714 return (mp); 715 } 716 717 /* 718 * Add the pending mp to the list. There can be only 1 pending mp 719 * in the list. Any exclusive ioctl that needs to wait for a response 720 * from another module or driver needs to use this function to set 721 * the ipx_pending_mp to the ioctl mblk and wait for the response from 722 * the other module/driver. This is also used while waiting for the 723 * ipif/ill/ire refcnts to drop to zero in bringing down an ipif. 724 */ 725 boolean_t 726 ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp, 727 int waitfor) 728 { 729 ipxop_t *ipx = ipif->ipif_ill->ill_phyint->phyint_ipsq->ipsq_xop; 730 731 ASSERT(IAM_WRITER_IPIF(ipif)); 732 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 733 ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL)); 734 ASSERT(ipx->ipx_pending_mp == NULL); 735 /* 736 * The caller may be using a different ipif than the one passed into 737 * ipsq_current_start() (e.g., suppose an ioctl that came in on the V4 738 * ill needs to wait for the V6 ill to quiesce). So we can't ASSERT 739 * that `ipx_current_ipif == ipif'. 
740 */ 741 ASSERT(ipx->ipx_current_ipif != NULL); 742 743 /* 744 * M_IOCDATA from ioctls, M_ERROR/M_HANGUP/M_PROTO/M_PCPROTO from the 745 * driver. 746 */ 747 ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_ERROR) || 748 (DB_TYPE(add_mp) == M_HANGUP) || (DB_TYPE(add_mp) == M_PROTO) || 749 (DB_TYPE(add_mp) == M_PCPROTO)); 750 751 if (connp != NULL) { 752 ASSERT(MUTEX_HELD(&connp->conn_lock)); 753 /* 754 * Return error if the conn has started closing. The conn 755 * could have finished cleaning up the pending mp list, 756 * If so we should not add another mp to the list negating 757 * the cleanup. 758 */ 759 if (connp->conn_state_flags & CONN_CLOSING) 760 return (B_FALSE); 761 } 762 mutex_enter(&ipx->ipx_lock); 763 ipx->ipx_pending_ipif = ipif; 764 /* 765 * Note down the queue in b_queue. This will be returned by 766 * ipsq_pending_mp_get. Caller will then use these values to restart 767 * the processing 768 */ 769 add_mp->b_next = NULL; 770 add_mp->b_queue = q; 771 ipx->ipx_pending_mp = add_mp; 772 ipx->ipx_waitfor = waitfor; 773 mutex_exit(&ipx->ipx_lock); 774 775 if (connp != NULL) 776 connp->conn_oper_pending_ill = ipif->ipif_ill; 777 778 return (B_TRUE); 779 } 780 781 /* 782 * Retrieve the ipx_pending_mp and return it. There can be only 1 mp 783 * queued in the list. 784 */ 785 mblk_t * 786 ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp) 787 { 788 mblk_t *curr = NULL; 789 ipxop_t *ipx = ipsq->ipsq_xop; 790 791 *connpp = NULL; 792 mutex_enter(&ipx->ipx_lock); 793 if (ipx->ipx_pending_mp == NULL) { 794 mutex_exit(&ipx->ipx_lock); 795 return (NULL); 796 } 797 798 /* There can be only 1 such excl message */ 799 curr = ipx->ipx_pending_mp; 800 ASSERT(curr->b_next == NULL); 801 ipx->ipx_pending_ipif = NULL; 802 ipx->ipx_pending_mp = NULL; 803 ipx->ipx_waitfor = 0; 804 mutex_exit(&ipx->ipx_lock); 805 806 if (CONN_Q(curr->b_queue)) { 807 /* 808 * This mp did a refhold on the conn, at the start of the ioctl. 
809 * So we can safely return a pointer to the conn to the caller. 810 */ 811 *connpp = Q_TO_CONN(curr->b_queue); 812 } else { 813 *connpp = NULL; 814 } 815 curr->b_next = NULL; 816 curr->b_prev = NULL; 817 return (curr); 818 } 819 820 /* 821 * Cleanup the ioctl mp queued in ipx_pending_mp 822 * - Called in the ill_delete path 823 * - Called in the M_ERROR or M_HANGUP path on the ill. 824 * - Called in the conn close path. 825 */ 826 boolean_t 827 ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp) 828 { 829 mblk_t *mp; 830 ipxop_t *ipx; 831 queue_t *q; 832 ipif_t *ipif; 833 int cmd; 834 835 ASSERT(IAM_WRITER_ILL(ill)); 836 ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop; 837 838 /* 839 * If connp is null, unconditionally clean up the ipx_pending_mp. 840 * This happens in M_ERROR/M_HANGUP. We need to abort the current ioctl 841 * even if it is meant for another ill, since we have to enqueue 842 * a new mp now in ipx_pending_mp to complete the ipif_down. 843 * If connp is non-null we are called from the conn close path. 844 */ 845 mutex_enter(&ipx->ipx_lock); 846 mp = ipx->ipx_pending_mp; 847 if (mp == NULL || (connp != NULL && 848 mp->b_queue != CONNP_TO_WQ(connp))) { 849 mutex_exit(&ipx->ipx_lock); 850 return (B_FALSE); 851 } 852 /* Now remove from the ipx_pending_mp */ 853 ipx->ipx_pending_mp = NULL; 854 q = mp->b_queue; 855 mp->b_next = NULL; 856 mp->b_prev = NULL; 857 mp->b_queue = NULL; 858 859 ipif = ipx->ipx_pending_ipif; 860 ipx->ipx_pending_ipif = NULL; 861 ipx->ipx_waitfor = 0; 862 ipx->ipx_current_ipif = NULL; 863 cmd = ipx->ipx_current_ioctl; 864 ipx->ipx_current_ioctl = 0; 865 ipx->ipx_current_done = B_TRUE; 866 mutex_exit(&ipx->ipx_lock); 867 868 if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) { 869 DTRACE_PROBE4(ipif__ioctl, 870 char *, "ipsq_pending_mp_cleanup", 871 int, cmd, ill_t *, ipif == NULL ? 
NULL : ipif->ipif_ill, 872 ipif_t *, ipif); 873 if (connp == NULL) { 874 ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL); 875 } else { 876 ip_ioctl_finish(q, mp, ENXIO, CONN_CLOSE, NULL); 877 mutex_enter(&ipif->ipif_ill->ill_lock); 878 ipif->ipif_state_flags &= ~IPIF_CHANGING; 879 mutex_exit(&ipif->ipif_ill->ill_lock); 880 } 881 } else { 882 /* 883 * IP-MT XXX In the case of TLI/XTI bind / optmgmt this can't 884 * be just inet_freemsg. we have to restart it 885 * otherwise the thread will be stuck. 886 */ 887 inet_freemsg(mp); 888 } 889 return (B_TRUE); 890 } 891 892 /* 893 * Called in the conn close path and ill delete path 894 */ 895 static void 896 ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp) 897 { 898 ipsq_t *ipsq; 899 mblk_t *prev; 900 mblk_t *curr; 901 mblk_t *next; 902 queue_t *rq, *wq; 903 mblk_t *tmp_list = NULL; 904 905 ASSERT(IAM_WRITER_ILL(ill)); 906 if (connp != NULL) 907 wq = CONNP_TO_WQ(connp); 908 else 909 wq = ill->ill_wq; 910 rq = RD(wq); 911 912 ipsq = ill->ill_phyint->phyint_ipsq; 913 /* 914 * Cleanup the ioctl mp's queued in ipsq_xopq_pending_mp if any. 915 * In the case of ioctl from a conn, there can be only 1 mp 916 * queued on the ipsq. If an ill is being unplumbed, only messages 917 * related to this ill are flushed, like M_ERROR or M_HANGUP message. 918 * ioctls meant for this ill form conn's are not flushed. They will 919 * be processed during ipsq_exit and will not find the ill and will 920 * return error. 
921 */ 922 mutex_enter(&ipsq->ipsq_lock); 923 for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL; 924 curr = next) { 925 next = curr->b_next; 926 if (curr->b_queue == wq || curr->b_queue == rq) { 927 /* Unlink the mblk from the pending mp list */ 928 if (prev != NULL) { 929 prev->b_next = curr->b_next; 930 } else { 931 ASSERT(ipsq->ipsq_xopq_mphead == curr); 932 ipsq->ipsq_xopq_mphead = curr->b_next; 933 } 934 if (ipsq->ipsq_xopq_mptail == curr) 935 ipsq->ipsq_xopq_mptail = prev; 936 /* 937 * Create a temporary list and release the ipsq lock 938 * New elements are added to the head of the tmp_list 939 */ 940 curr->b_next = tmp_list; 941 tmp_list = curr; 942 } else { 943 prev = curr; 944 } 945 } 946 mutex_exit(&ipsq->ipsq_lock); 947 948 while (tmp_list != NULL) { 949 curr = tmp_list; 950 tmp_list = curr->b_next; 951 curr->b_next = NULL; 952 curr->b_prev = NULL; 953 curr->b_queue = NULL; 954 if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) { 955 DTRACE_PROBE4(ipif__ioctl, 956 char *, "ipsq_xopq_mp_cleanup", 957 int, 0, ill_t *, NULL, ipif_t *, NULL); 958 ip_ioctl_finish(wq, curr, ENXIO, connp != NULL ? 959 CONN_CLOSE : NO_COPYOUT, NULL); 960 } else { 961 /* 962 * IP-MT XXX In the case of TLI/XTI bind / optmgmt 963 * this can't be just inet_freemsg. we have to 964 * restart it otherwise the thread will be stuck. 965 */ 966 inet_freemsg(curr); 967 } 968 } 969 } 970 971 /* 972 * This conn has started closing. Cleanup any pending ioctl from this conn. 973 * STREAMS ensures that there can be at most 1 active ioctl on a stream. 974 */ 975 void 976 conn_ioctl_cleanup(conn_t *connp) 977 { 978 ipsq_t *ipsq; 979 ill_t *ill; 980 boolean_t refheld; 981 982 /* 983 * Check for a queued ioctl. If the ioctl has not yet started, the mp 984 * is pending in the list headed by ipsq_xopq_head. If the ioctl has 985 * started the mp could be present in ipx_pending_mp. 
Note that if 986 * conn_oper_pending_ill is NULL, the ioctl may still be in flight and 987 * not yet queued anywhere. In this case, the conn close code will wait 988 * until the conn_ref is dropped. If the stream was a tcp stream, then 989 * tcp_close will wait first until all ioctls have completed for this 990 * conn. 991 */ 992 mutex_enter(&connp->conn_lock); 993 ill = connp->conn_oper_pending_ill; 994 if (ill == NULL) { 995 mutex_exit(&connp->conn_lock); 996 return; 997 } 998 999 /* 1000 * We may not be able to refhold the ill if the ill/ipif 1001 * is changing. But we need to make sure that the ill will 1002 * not vanish. So we just bump up the ill_waiter count. 1003 */ 1004 refheld = ill_waiter_inc(ill); 1005 mutex_exit(&connp->conn_lock); 1006 if (refheld) { 1007 if (ipsq_enter(ill, B_TRUE, NEW_OP)) { 1008 ill_waiter_dcr(ill); 1009 /* 1010 * Check whether this ioctl has started and is 1011 * pending. If it is not found there then check 1012 * whether this ioctl has not even started and is in 1013 * the ipsq_xopq list. 1014 */ 1015 if (!ipsq_pending_mp_cleanup(ill, connp)) 1016 ipsq_xopq_mp_cleanup(ill, connp); 1017 ipsq = ill->ill_phyint->phyint_ipsq; 1018 ipsq_exit(ipsq); 1019 return; 1020 } 1021 } 1022 1023 /* 1024 * The ill is also closing and we could not bump up the 1025 * ill_waiter_count or we could not enter the ipsq. Leave 1026 * the cleanup to ill_delete 1027 */ 1028 mutex_enter(&connp->conn_lock); 1029 while (connp->conn_oper_pending_ill != NULL) 1030 cv_wait(&connp->conn_refcv, &connp->conn_lock); 1031 mutex_exit(&connp->conn_lock); 1032 if (refheld) 1033 ill_waiter_dcr(ill); 1034 } 1035 1036 /* 1037 * ipcl_walk function for cleaning up conn_*_ill fields. 1038 * Note that we leave ixa_multicast_ifindex, conn_incoming_ifindex, and 1039 * conn_bound_if in place. We prefer dropping 1040 * packets instead of sending them out the wrong interface, or accepting 1041 * packets from the wrong ifindex. 
1042 */ 1043 static void 1044 conn_cleanup_ill(conn_t *connp, caddr_t arg) 1045 { 1046 ill_t *ill = (ill_t *)arg; 1047 1048 mutex_enter(&connp->conn_lock); 1049 if (connp->conn_dhcpinit_ill == ill) { 1050 connp->conn_dhcpinit_ill = NULL; 1051 ASSERT(ill->ill_dhcpinit != 0); 1052 atomic_dec_32(&ill->ill_dhcpinit); 1053 ill_set_inputfn(ill); 1054 } 1055 mutex_exit(&connp->conn_lock); 1056 } 1057 1058 static int 1059 ill_down_ipifs_tail(ill_t *ill) 1060 { 1061 ipif_t *ipif; 1062 int err; 1063 1064 ASSERT(IAM_WRITER_ILL(ill)); 1065 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 1066 ipif_non_duplicate(ipif); 1067 /* 1068 * ipif_down_tail will call arp_ll_down on the last ipif 1069 * and typically return EINPROGRESS when the DL_UNBIND is sent. 1070 */ 1071 if ((err = ipif_down_tail(ipif)) != 0) 1072 return (err); 1073 } 1074 return (0); 1075 } 1076 1077 /* ARGSUSED */ 1078 void 1079 ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 1080 { 1081 ASSERT(IAM_WRITER_IPSQ(ipsq)); 1082 (void) ill_down_ipifs_tail(q->q_ptr); 1083 freemsg(mp); 1084 ipsq_current_finish(ipsq); 1085 } 1086 1087 /* 1088 * ill_down_start is called when we want to down this ill and bring it up again 1089 * It is called when we receive an M_ERROR / M_HANGUP. In this case we shut down 1090 * all interfaces, but don't tear down any plumbing. 1091 */ 1092 boolean_t 1093 ill_down_start(queue_t *q, mblk_t *mp) 1094 { 1095 ill_t *ill = q->q_ptr; 1096 ipif_t *ipif; 1097 1098 ASSERT(IAM_WRITER_ILL(ill)); 1099 mutex_enter(&ill->ill_lock); 1100 ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS; 1101 /* no more nce addition allowed */ 1102 mutex_exit(&ill->ill_lock); 1103 1104 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) 1105 (void) ipif_down(ipif, NULL, NULL); 1106 1107 ill_down(ill); 1108 1109 /* 1110 * Walk all CONNs that can have a reference on an ire or nce for this 1111 * ill (we actually walk all that now have stale references). 
1112 */ 1113 ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ill->ill_ipst); 1114 1115 /* With IPv6 we have dce_ifindex. Cleanup for neatness */ 1116 if (ill->ill_isv6) 1117 dce_cleanup(ill->ill_phyint->phyint_ifindex, ill->ill_ipst); 1118 1119 1120 (void) ipsq_pending_mp_cleanup(ill, NULL); 1121 1122 ipsq_current_start(ill->ill_phyint->phyint_ipsq, ill->ill_ipif, 0); 1123 1124 /* 1125 * Atomically test and add the pending mp if references are active. 1126 */ 1127 mutex_enter(&ill->ill_lock); 1128 if (!ill_is_quiescent(ill)) { 1129 /* call cannot fail since `conn_t *' argument is NULL */ 1130 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq, 1131 mp, ILL_DOWN); 1132 mutex_exit(&ill->ill_lock); 1133 return (B_FALSE); 1134 } 1135 mutex_exit(&ill->ill_lock); 1136 return (B_TRUE); 1137 } 1138 1139 static void 1140 ill_down(ill_t *ill) 1141 { 1142 mblk_t *mp; 1143 ip_stack_t *ipst = ill->ill_ipst; 1144 1145 /* 1146 * Blow off any IREs dependent on this ILL. 1147 * The caller needs to handle conn_ixa_cleanup 1148 */ 1149 ill_delete_ires(ill); 1150 1151 ire_walk_ill(0, 0, ill_downi, ill, ill); 1152 1153 /* Remove any conn_*_ill depending on this ill */ 1154 ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst); 1155 1156 /* 1157 * Free state for additional IREs. 1158 */ 1159 mutex_enter(&ill->ill_saved_ire_lock); 1160 mp = ill->ill_saved_ire_mp; 1161 ill->ill_saved_ire_mp = NULL; 1162 ill->ill_saved_ire_cnt = 0; 1163 mutex_exit(&ill->ill_saved_ire_lock); 1164 freemsg(mp); 1165 } 1166 1167 /* 1168 * ire_walk routine used to delete every IRE that depends on 1169 * 'ill'. (Always called as writer.) 1170 * 1171 * Note: since the routes added by the kernel are deleted separately, 1172 * this will only be 1) IRE_IF_CLONE and 2) manually added IRE_INTERFACE. 1173 * 1174 * We also remove references on ire_nce_cache entries that refer to the ill. 
1175 */ 1176 void 1177 ill_downi(ire_t *ire, char *ill_arg) 1178 { 1179 ill_t *ill = (ill_t *)ill_arg; 1180 nce_t *nce; 1181 1182 mutex_enter(&ire->ire_lock); 1183 nce = ire->ire_nce_cache; 1184 if (nce != NULL && nce->nce_ill == ill) 1185 ire->ire_nce_cache = NULL; 1186 else 1187 nce = NULL; 1188 mutex_exit(&ire->ire_lock); 1189 if (nce != NULL) 1190 nce_refrele(nce); 1191 if (ire->ire_ill == ill) 1192 ire_delete(ire); 1193 } 1194 1195 /* Remove IRE_IF_CLONE on this ill */ 1196 void 1197 ill_downi_if_clone(ire_t *ire, char *ill_arg) 1198 { 1199 ill_t *ill = (ill_t *)ill_arg; 1200 1201 ASSERT(ire->ire_type & IRE_IF_CLONE); 1202 if (ire->ire_ill == ill) 1203 ire_delete(ire); 1204 } 1205 1206 /* Consume an M_IOCACK of the fastpath probe. */ 1207 void 1208 ill_fastpath_ack(ill_t *ill, mblk_t *mp) 1209 { 1210 mblk_t *mp1 = mp; 1211 1212 /* 1213 * If this was the first attempt turn on the fastpath probing. 1214 */ 1215 mutex_enter(&ill->ill_lock); 1216 if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS) 1217 ill->ill_dlpi_fastpath_state = IDS_OK; 1218 mutex_exit(&ill->ill_lock); 1219 1220 /* Free the M_IOCACK mblk, hold on to the data */ 1221 mp = mp->b_cont; 1222 freeb(mp1); 1223 if (mp == NULL) 1224 return; 1225 if (mp->b_cont != NULL) 1226 nce_fastpath_update(ill, mp); 1227 else 1228 ip0dbg(("ill_fastpath_ack: no b_cont\n")); 1229 freemsg(mp); 1230 } 1231 1232 /* 1233 * Throw an M_IOCTL message downstream asking "do you know fastpath?" 1234 * The data portion of the request is a dl_unitdata_req_t template for 1235 * what we would send downstream in the absence of a fastpath confirmation. 1236 */ 1237 int 1238 ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp) 1239 { 1240 struct iocblk *ioc; 1241 mblk_t *mp; 1242 1243 if (dlur_mp == NULL) 1244 return (EINVAL); 1245 1246 mutex_enter(&ill->ill_lock); 1247 switch (ill->ill_dlpi_fastpath_state) { 1248 case IDS_FAILED: 1249 /* 1250 * Driver NAKed the first fastpath ioctl - assume it doesn't 1251 * support it. 
1252 */ 1253 mutex_exit(&ill->ill_lock); 1254 return (ENOTSUP); 1255 case IDS_UNKNOWN: 1256 /* This is the first probe */ 1257 ill->ill_dlpi_fastpath_state = IDS_INPROGRESS; 1258 break; 1259 default: 1260 break; 1261 } 1262 mutex_exit(&ill->ill_lock); 1263 1264 if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL) 1265 return (EAGAIN); 1266 1267 mp->b_cont = copyb(dlur_mp); 1268 if (mp->b_cont == NULL) { 1269 freeb(mp); 1270 return (EAGAIN); 1271 } 1272 1273 ioc = (struct iocblk *)mp->b_rptr; 1274 ioc->ioc_count = msgdsize(mp->b_cont); 1275 1276 DTRACE_PROBE3(ill__dlpi, char *, "ill_fastpath_probe", 1277 char *, "DL_IOC_HDR_INFO", ill_t *, ill); 1278 putnext(ill->ill_wq, mp); 1279 return (0); 1280 } 1281 1282 void 1283 ill_capability_probe(ill_t *ill) 1284 { 1285 mblk_t *mp; 1286 1287 ASSERT(IAM_WRITER_ILL(ill)); 1288 1289 if (ill->ill_dlpi_capab_state != IDCS_UNKNOWN && 1290 ill->ill_dlpi_capab_state != IDCS_FAILED) 1291 return; 1292 1293 /* 1294 * We are starting a new cycle of capability negotiation. 1295 * Free up the capab reset messages of any previous incarnation. 1296 * We will do a fresh allocation when we get the response to our probe 1297 */ 1298 if (ill->ill_capab_reset_mp != NULL) { 1299 freemsg(ill->ill_capab_reset_mp); 1300 ill->ill_capab_reset_mp = NULL; 1301 } 1302 1303 ip1dbg(("ill_capability_probe: starting capability negotiation\n")); 1304 1305 mp = ip_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ); 1306 if (mp == NULL) 1307 return; 1308 1309 ill_capability_send(ill, mp); 1310 ill->ill_dlpi_capab_state = IDCS_PROBE_SENT; 1311 } 1312 1313 void 1314 ill_capability_reset(ill_t *ill, boolean_t reneg) 1315 { 1316 ASSERT(IAM_WRITER_ILL(ill)); 1317 1318 if (ill->ill_dlpi_capab_state != IDCS_OK) 1319 return; 1320 1321 ill->ill_dlpi_capab_state = reneg ? 
IDCS_RENEG : IDCS_RESET_SENT; 1322 1323 ill_capability_send(ill, ill->ill_capab_reset_mp); 1324 ill->ill_capab_reset_mp = NULL; 1325 /* 1326 * We turn off all capabilities except those pertaining to 1327 * direct function call capabilities viz. ILL_CAPAB_DLD* 1328 * which will be turned off by the corresponding reset functions. 1329 */ 1330 ill->ill_capabilities &= ~(ILL_CAPAB_HCKSUM | ILL_CAPAB_ZEROCOPY); 1331 } 1332 1333 static void 1334 ill_capability_reset_alloc(ill_t *ill) 1335 { 1336 mblk_t *mp; 1337 size_t size = 0; 1338 int err; 1339 dl_capability_req_t *capb; 1340 1341 ASSERT(IAM_WRITER_ILL(ill)); 1342 ASSERT(ill->ill_capab_reset_mp == NULL); 1343 1344 if (ILL_HCKSUM_CAPABLE(ill)) { 1345 size += sizeof (dl_capability_sub_t) + 1346 sizeof (dl_capab_hcksum_t); 1347 } 1348 1349 if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) { 1350 size += sizeof (dl_capability_sub_t) + 1351 sizeof (dl_capab_zerocopy_t); 1352 } 1353 1354 if (ill->ill_capabilities & ILL_CAPAB_DLD) { 1355 size += sizeof (dl_capability_sub_t) + 1356 sizeof (dl_capab_dld_t); 1357 } 1358 1359 mp = allocb_wait(size + sizeof (dl_capability_req_t), BPRI_MED, 1360 STR_NOSIG, &err); 1361 1362 mp->b_datap->db_type = M_PROTO; 1363 bzero(mp->b_rptr, size + sizeof (dl_capability_req_t)); 1364 1365 capb = (dl_capability_req_t *)mp->b_rptr; 1366 capb->dl_primitive = DL_CAPABILITY_REQ; 1367 capb->dl_sub_offset = sizeof (dl_capability_req_t); 1368 capb->dl_sub_length = size; 1369 1370 mp->b_wptr += sizeof (dl_capability_req_t); 1371 1372 /* 1373 * Each handler fills in the corresponding dl_capability_sub_t 1374 * inside the mblk, 1375 */ 1376 ill_capability_hcksum_reset_fill(ill, mp); 1377 ill_capability_zerocopy_reset_fill(ill, mp); 1378 ill_capability_dld_reset_fill(ill, mp); 1379 1380 ill->ill_capab_reset_mp = mp; 1381 } 1382 1383 static void 1384 ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers) 1385 { 1386 dl_capab_id_t *id_ic; 1387 uint_t sub_dl_cap = outers->dl_cap; 1388 
dl_capability_sub_t *inners; 1389 uint8_t *capend; 1390 1391 ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER); 1392 1393 /* 1394 * Note: range checks here are not absolutely sufficient to 1395 * make us robust against malformed messages sent by drivers; 1396 * this is in keeping with the rest of IP's dlpi handling. 1397 * (Remember, it's coming from something else in the kernel 1398 * address space) 1399 */ 1400 1401 capend = (uint8_t *)(outers + 1) + outers->dl_length; 1402 if (capend > mp->b_wptr) { 1403 cmn_err(CE_WARN, "ill_capability_id_ack: " 1404 "malformed sub-capability too long for mblk"); 1405 return; 1406 } 1407 1408 id_ic = (dl_capab_id_t *)(outers + 1); 1409 1410 if (outers->dl_length < sizeof (*id_ic) || 1411 (inners = &id_ic->id_subcap, 1412 inners->dl_length > (outers->dl_length - sizeof (*inners)))) { 1413 cmn_err(CE_WARN, "ill_capability_id_ack: malformed " 1414 "encapsulated capab type %d too long for mblk", 1415 inners->dl_cap); 1416 return; 1417 } 1418 1419 if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) { 1420 ip1dbg(("ill_capability_id_ack: mid token for capab type %d " 1421 "isn't as expected; pass-thru module(s) detected, " 1422 "discarding capability\n", inners->dl_cap)); 1423 return; 1424 } 1425 1426 /* Process the encapsulated sub-capability */ 1427 ill_capability_dispatch(ill, mp, inners); 1428 } 1429 1430 static void 1431 ill_capability_dld_reset_fill(ill_t *ill, mblk_t *mp) 1432 { 1433 dl_capability_sub_t *dl_subcap; 1434 1435 if (!(ill->ill_capabilities & ILL_CAPAB_DLD)) 1436 return; 1437 1438 /* 1439 * The dl_capab_dld_t that follows the dl_capability_sub_t is not 1440 * initialized below since it is not used by DLD. 
1441 */ 1442 dl_subcap = (dl_capability_sub_t *)mp->b_wptr; 1443 dl_subcap->dl_cap = DL_CAPAB_DLD; 1444 dl_subcap->dl_length = sizeof (dl_capab_dld_t); 1445 1446 mp->b_wptr += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t); 1447 } 1448 1449 static void 1450 ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp) 1451 { 1452 /* 1453 * If no ipif was brought up over this ill, this DL_CAPABILITY_REQ/ACK 1454 * is only to get the VRRP capability. 1455 * 1456 * Note that we cannot check ill_ipif_up_count here since 1457 * ill_ipif_up_count is only incremented when the resolver is setup. 1458 * That is done asynchronously, and can race with this function. 1459 */ 1460 if (!ill->ill_dl_up) { 1461 if (subp->dl_cap == DL_CAPAB_VRRP) 1462 ill_capability_vrrp_ack(ill, mp, subp); 1463 return; 1464 } 1465 1466 switch (subp->dl_cap) { 1467 case DL_CAPAB_HCKSUM: 1468 ill_capability_hcksum_ack(ill, mp, subp); 1469 break; 1470 case DL_CAPAB_ZEROCOPY: 1471 ill_capability_zerocopy_ack(ill, mp, subp); 1472 break; 1473 case DL_CAPAB_DLD: 1474 ill_capability_dld_ack(ill, mp, subp); 1475 break; 1476 case DL_CAPAB_VRRP: 1477 break; 1478 default: 1479 ip1dbg(("ill_capability_dispatch: unknown capab type %d\n", 1480 subp->dl_cap)); 1481 } 1482 } 1483 1484 /* 1485 * Process the vrrp capability received from a DLS Provider. isub must point 1486 * to the sub-capability (DL_CAPAB_VRRP) of a DL_CAPABILITY_ACK message. 1487 */ 1488 static void 1489 ill_capability_vrrp_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 1490 { 1491 dl_capab_vrrp_t *vrrp; 1492 uint_t sub_dl_cap = isub->dl_cap; 1493 uint8_t *capend; 1494 1495 ASSERT(IAM_WRITER_ILL(ill)); 1496 ASSERT(sub_dl_cap == DL_CAPAB_VRRP); 1497 1498 /* 1499 * Note: range checks here are not absolutely sufficient to 1500 * make us robust against malformed messages sent by drivers; 1501 * this is in keeping with the rest of IP's dlpi handling. 
1502 * (Remember, it's coming from something else in the kernel 1503 * address space) 1504 */ 1505 capend = (uint8_t *)(isub + 1) + isub->dl_length; 1506 if (capend > mp->b_wptr) { 1507 cmn_err(CE_WARN, "ill_capability_vrrp_ack: " 1508 "malformed sub-capability too long for mblk"); 1509 return; 1510 } 1511 vrrp = (dl_capab_vrrp_t *)(isub + 1); 1512 1513 /* 1514 * Compare the IP address family and set ILLF_VRRP for the right ill. 1515 */ 1516 if ((vrrp->vrrp_af == AF_INET6 && ill->ill_isv6) || 1517 (vrrp->vrrp_af == AF_INET && !ill->ill_isv6)) { 1518 ill->ill_flags |= ILLF_VRRP; 1519 } 1520 } 1521 1522 /* 1523 * Process a hardware checksum offload capability negotiation ack received 1524 * from a DLS Provider.isub must point to the sub-capability (DL_CAPAB_HCKSUM) 1525 * of a DL_CAPABILITY_ACK message. 1526 */ 1527 static void 1528 ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 1529 { 1530 dl_capability_req_t *ocap; 1531 dl_capab_hcksum_t *ihck, *ohck; 1532 ill_hcksum_capab_t **ill_hcksum; 1533 mblk_t *nmp = NULL; 1534 uint_t sub_dl_cap = isub->dl_cap; 1535 uint8_t *capend; 1536 1537 ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM); 1538 1539 ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab; 1540 1541 /* 1542 * Note: range checks here are not absolutely sufficient to 1543 * make us robust against malformed messages sent by drivers; 1544 * this is in keeping with the rest of IP's dlpi handling. 1545 * (Remember, it's coming from something else in the kernel 1546 * address space) 1547 */ 1548 capend = (uint8_t *)(isub + 1) + isub->dl_length; 1549 if (capend > mp->b_wptr) { 1550 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 1551 "malformed sub-capability too long for mblk"); 1552 return; 1553 } 1554 1555 /* 1556 * There are two types of acks we process here: 1557 * 1. acks in reply to a (first form) generic capability req 1558 * (no ENABLE flag set) 1559 * 2. acks in reply to a ENABLE capability req. 
1560 * (ENABLE flag set) 1561 */ 1562 ihck = (dl_capab_hcksum_t *)(isub + 1); 1563 1564 if (ihck->hcksum_version != HCKSUM_VERSION_1) { 1565 cmn_err(CE_CONT, "ill_capability_hcksum_ack: " 1566 "unsupported hardware checksum " 1567 "sub-capability (version %d, expected %d)", 1568 ihck->hcksum_version, HCKSUM_VERSION_1); 1569 return; 1570 } 1571 1572 if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) { 1573 ip1dbg(("ill_capability_hcksum_ack: mid token for hardware " 1574 "checksum capability isn't as expected; pass-thru " 1575 "module(s) detected, discarding capability\n")); 1576 return; 1577 } 1578 1579 #define CURR_HCKSUM_CAPAB \ 1580 (HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 | \ 1581 HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM) 1582 1583 if ((ihck->hcksum_txflags & HCKSUM_ENABLE) && 1584 (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) { 1585 /* do ENABLE processing */ 1586 if (*ill_hcksum == NULL) { 1587 *ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t), 1588 KM_NOSLEEP); 1589 1590 if (*ill_hcksum == NULL) { 1591 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 1592 "could not enable hcksum version %d " 1593 "for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION, 1594 ill->ill_name); 1595 return; 1596 } 1597 } 1598 1599 (*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version; 1600 (*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags; 1601 ill->ill_capabilities |= ILL_CAPAB_HCKSUM; 1602 ip1dbg(("ill_capability_hcksum_ack: interface %s " 1603 "has enabled hardware checksumming\n ", 1604 ill->ill_name)); 1605 } else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) { 1606 /* 1607 * Enabling hardware checksum offload 1608 * Currently IP supports {TCP,UDP}/IPv4 1609 * partial and full cksum offload and 1610 * IPv4 header checksum offload. 1611 * Allocate new mblk which will 1612 * contain a new capability request 1613 * to enable hardware checksum offload. 
1614 */ 1615 uint_t size; 1616 uchar_t *rptr; 1617 1618 size = sizeof (dl_capability_req_t) + 1619 sizeof (dl_capability_sub_t) + isub->dl_length; 1620 1621 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 1622 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 1623 "could not enable hardware cksum for %s (ENOMEM)\n", 1624 ill->ill_name); 1625 return; 1626 } 1627 1628 rptr = nmp->b_rptr; 1629 /* initialize dl_capability_req_t */ 1630 ocap = (dl_capability_req_t *)nmp->b_rptr; 1631 ocap->dl_sub_offset = 1632 sizeof (dl_capability_req_t); 1633 ocap->dl_sub_length = 1634 sizeof (dl_capability_sub_t) + 1635 isub->dl_length; 1636 nmp->b_rptr += sizeof (dl_capability_req_t); 1637 1638 /* initialize dl_capability_sub_t */ 1639 bcopy(isub, nmp->b_rptr, sizeof (*isub)); 1640 nmp->b_rptr += sizeof (*isub); 1641 1642 /* initialize dl_capab_hcksum_t */ 1643 ohck = (dl_capab_hcksum_t *)nmp->b_rptr; 1644 bcopy(ihck, ohck, sizeof (*ihck)); 1645 1646 nmp->b_rptr = rptr; 1647 ASSERT(nmp->b_wptr == (nmp->b_rptr + size)); 1648 1649 /* Set ENABLE flag */ 1650 ohck->hcksum_txflags &= CURR_HCKSUM_CAPAB; 1651 ohck->hcksum_txflags |= HCKSUM_ENABLE; 1652 1653 /* 1654 * nmp points to a DL_CAPABILITY_REQ message to enable 1655 * hardware checksum acceleration. 
1656 */ 1657 ill_capability_send(ill, nmp); 1658 } else { 1659 ip1dbg(("ill_capability_hcksum_ack: interface %s has " 1660 "advertised %x hardware checksum capability flags\n", 1661 ill->ill_name, ihck->hcksum_txflags)); 1662 } 1663 } 1664 1665 static void 1666 ill_capability_hcksum_reset_fill(ill_t *ill, mblk_t *mp) 1667 { 1668 dl_capab_hcksum_t *hck_subcap; 1669 dl_capability_sub_t *dl_subcap; 1670 1671 if (!ILL_HCKSUM_CAPABLE(ill)) 1672 return; 1673 1674 ASSERT(ill->ill_hcksum_capab != NULL); 1675 1676 dl_subcap = (dl_capability_sub_t *)mp->b_wptr; 1677 dl_subcap->dl_cap = DL_CAPAB_HCKSUM; 1678 dl_subcap->dl_length = sizeof (*hck_subcap); 1679 1680 hck_subcap = (dl_capab_hcksum_t *)(dl_subcap + 1); 1681 hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version; 1682 hck_subcap->hcksum_txflags = 0; 1683 1684 mp->b_wptr += sizeof (*dl_subcap) + sizeof (*hck_subcap); 1685 } 1686 1687 static void 1688 ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 1689 { 1690 mblk_t *nmp = NULL; 1691 dl_capability_req_t *oc; 1692 dl_capab_zerocopy_t *zc_ic, *zc_oc; 1693 ill_zerocopy_capab_t **ill_zerocopy_capab; 1694 uint_t sub_dl_cap = isub->dl_cap; 1695 uint8_t *capend; 1696 1697 ASSERT(sub_dl_cap == DL_CAPAB_ZEROCOPY); 1698 1699 ill_zerocopy_capab = (ill_zerocopy_capab_t **)&ill->ill_zerocopy_capab; 1700 1701 /* 1702 * Note: range checks here are not absolutely sufficient to 1703 * make us robust against malformed messages sent by drivers; 1704 * this is in keeping with the rest of IP's dlpi handling. 
1705 * (Remember, it's coming from something else in the kernel 1706 * address space) 1707 */ 1708 capend = (uint8_t *)(isub + 1) + isub->dl_length; 1709 if (capend > mp->b_wptr) { 1710 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 1711 "malformed sub-capability too long for mblk"); 1712 return; 1713 } 1714 1715 zc_ic = (dl_capab_zerocopy_t *)(isub + 1); 1716 if (zc_ic->zerocopy_version != ZEROCOPY_VERSION_1) { 1717 cmn_err(CE_CONT, "ill_capability_zerocopy_ack: " 1718 "unsupported ZEROCOPY sub-capability (version %d, " 1719 "expected %d)", zc_ic->zerocopy_version, 1720 ZEROCOPY_VERSION_1); 1721 return; 1722 } 1723 1724 if (!dlcapabcheckqid(&zc_ic->zerocopy_mid, ill->ill_lmod_rq)) { 1725 ip1dbg(("ill_capability_zerocopy_ack: mid token for zerocopy " 1726 "capability isn't as expected; pass-thru module(s) " 1727 "detected, discarding capability\n")); 1728 return; 1729 } 1730 1731 if ((zc_ic->zerocopy_flags & DL_CAPAB_VMSAFE_MEM) != 0) { 1732 if (*ill_zerocopy_capab == NULL) { 1733 *ill_zerocopy_capab = 1734 kmem_zalloc(sizeof (ill_zerocopy_capab_t), 1735 KM_NOSLEEP); 1736 1737 if (*ill_zerocopy_capab == NULL) { 1738 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 1739 "could not enable Zero-copy version %d " 1740 "for %s (ENOMEM)\n", ZEROCOPY_VERSION_1, 1741 ill->ill_name); 1742 return; 1743 } 1744 } 1745 1746 ip1dbg(("ill_capability_zerocopy_ack: interface %s " 1747 "supports Zero-copy version %d\n", ill->ill_name, 1748 ZEROCOPY_VERSION_1)); 1749 1750 (*ill_zerocopy_capab)->ill_zerocopy_version = 1751 zc_ic->zerocopy_version; 1752 (*ill_zerocopy_capab)->ill_zerocopy_flags = 1753 zc_ic->zerocopy_flags; 1754 1755 ill->ill_capabilities |= ILL_CAPAB_ZEROCOPY; 1756 } else { 1757 uint_t size; 1758 uchar_t *rptr; 1759 1760 size = sizeof (dl_capability_req_t) + 1761 sizeof (dl_capability_sub_t) + 1762 sizeof (dl_capab_zerocopy_t); 1763 1764 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 1765 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 1766 "could 
not enable zerocopy for %s (ENOMEM)\n", 1767 ill->ill_name); 1768 return; 1769 } 1770 1771 rptr = nmp->b_rptr; 1772 /* initialize dl_capability_req_t */ 1773 oc = (dl_capability_req_t *)rptr; 1774 oc->dl_sub_offset = sizeof (dl_capability_req_t); 1775 oc->dl_sub_length = sizeof (dl_capability_sub_t) + 1776 sizeof (dl_capab_zerocopy_t); 1777 rptr += sizeof (dl_capability_req_t); 1778 1779 /* initialize dl_capability_sub_t */ 1780 bcopy(isub, rptr, sizeof (*isub)); 1781 rptr += sizeof (*isub); 1782 1783 /* initialize dl_capab_zerocopy_t */ 1784 zc_oc = (dl_capab_zerocopy_t *)rptr; 1785 *zc_oc = *zc_ic; 1786 1787 ip1dbg(("ill_capability_zerocopy_ack: asking interface %s " 1788 "to enable zero-copy version %d\n", ill->ill_name, 1789 ZEROCOPY_VERSION_1)); 1790 1791 /* set VMSAFE_MEM flag */ 1792 zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM; 1793 1794 /* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */ 1795 ill_capability_send(ill, nmp); 1796 } 1797 } 1798 1799 static void 1800 ill_capability_zerocopy_reset_fill(ill_t *ill, mblk_t *mp) 1801 { 1802 dl_capab_zerocopy_t *zerocopy_subcap; 1803 dl_capability_sub_t *dl_subcap; 1804 1805 if (!(ill->ill_capabilities & ILL_CAPAB_ZEROCOPY)) 1806 return; 1807 1808 ASSERT(ill->ill_zerocopy_capab != NULL); 1809 1810 dl_subcap = (dl_capability_sub_t *)mp->b_wptr; 1811 dl_subcap->dl_cap = DL_CAPAB_ZEROCOPY; 1812 dl_subcap->dl_length = sizeof (*zerocopy_subcap); 1813 1814 zerocopy_subcap = (dl_capab_zerocopy_t *)(dl_subcap + 1); 1815 zerocopy_subcap->zerocopy_version = 1816 ill->ill_zerocopy_capab->ill_zerocopy_version; 1817 zerocopy_subcap->zerocopy_flags = 0; 1818 1819 mp->b_wptr += sizeof (*dl_subcap) + sizeof (*zerocopy_subcap); 1820 } 1821 1822 /* 1823 * DLD capability 1824 * Refer to dld.h for more information regarding the purpose and usage 1825 * of this capability. 
1826 */ 1827 static void 1828 ill_capability_dld_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 1829 { 1830 dl_capab_dld_t *dld_ic, dld; 1831 uint_t sub_dl_cap = isub->dl_cap; 1832 uint8_t *capend; 1833 ill_dld_capab_t *idc; 1834 1835 ASSERT(IAM_WRITER_ILL(ill)); 1836 ASSERT(sub_dl_cap == DL_CAPAB_DLD); 1837 1838 /* 1839 * Note: range checks here are not absolutely sufficient to 1840 * make us robust against malformed messages sent by drivers; 1841 * this is in keeping with the rest of IP's dlpi handling. 1842 * (Remember, it's coming from something else in the kernel 1843 * address space) 1844 */ 1845 capend = (uint8_t *)(isub + 1) + isub->dl_length; 1846 if (capend > mp->b_wptr) { 1847 cmn_err(CE_WARN, "ill_capability_dld_ack: " 1848 "malformed sub-capability too long for mblk"); 1849 return; 1850 } 1851 dld_ic = (dl_capab_dld_t *)(isub + 1); 1852 if (dld_ic->dld_version != DLD_CURRENT_VERSION) { 1853 cmn_err(CE_CONT, "ill_capability_dld_ack: " 1854 "unsupported DLD sub-capability (version %d, " 1855 "expected %d)", dld_ic->dld_version, 1856 DLD_CURRENT_VERSION); 1857 return; 1858 } 1859 if (!dlcapabcheckqid(&dld_ic->dld_mid, ill->ill_lmod_rq)) { 1860 ip1dbg(("ill_capability_dld_ack: mid token for dld " 1861 "capability isn't as expected; pass-thru module(s) " 1862 "detected, discarding capability\n")); 1863 return; 1864 } 1865 1866 /* 1867 * Copy locally to ensure alignment. 
1868 */ 1869 bcopy(dld_ic, &dld, sizeof (dl_capab_dld_t)); 1870 1871 if ((idc = ill->ill_dld_capab) == NULL) { 1872 idc = kmem_zalloc(sizeof (ill_dld_capab_t), KM_NOSLEEP); 1873 if (idc == NULL) { 1874 cmn_err(CE_WARN, "ill_capability_dld_ack: " 1875 "could not enable DLD version %d " 1876 "for %s (ENOMEM)\n", DLD_CURRENT_VERSION, 1877 ill->ill_name); 1878 return; 1879 } 1880 ill->ill_dld_capab = idc; 1881 } 1882 idc->idc_capab_df = (ip_capab_func_t)dld.dld_capab; 1883 idc->idc_capab_dh = (void *)dld.dld_capab_handle; 1884 ip1dbg(("ill_capability_dld_ack: interface %s " 1885 "supports DLD version %d\n", ill->ill_name, DLD_CURRENT_VERSION)); 1886 1887 ill_capability_dld_enable(ill); 1888 } 1889 1890 /* 1891 * Typically capability negotiation between IP and the driver happens via 1892 * DLPI message exchange. However GLD also offers a direct function call 1893 * mechanism to exchange the DLD_DIRECT_CAPAB and DLD_POLL_CAPAB capabilities, 1894 * But arbitrary function calls into IP or GLD are not permitted, since both 1895 * of them are protected by their own perimeter mechanism. The perimeter can 1896 * be viewed as a coarse lock or serialization mechanism. The hierarchy of 1897 * these perimeters is IP -> MAC. Thus for example to enable the squeue 1898 * polling, IP needs to enter its perimeter, then call ill_mac_perim_enter 1899 * to enter the mac perimeter and then do the direct function calls into 1900 * GLD to enable squeue polling. The ring related callbacks from the mac into 1901 * the stack to add, bind, quiesce, restart or cleanup a ring are all 1902 * protected by the mac perimeter. 
1903 */ 1904 static void 1905 ill_mac_perim_enter(ill_t *ill, mac_perim_handle_t *mphp) 1906 { 1907 ill_dld_capab_t *idc = ill->ill_dld_capab; 1908 int err; 1909 1910 err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mphp, 1911 DLD_ENABLE); 1912 ASSERT(err == 0); 1913 } 1914 1915 static void 1916 ill_mac_perim_exit(ill_t *ill, mac_perim_handle_t mph) 1917 { 1918 ill_dld_capab_t *idc = ill->ill_dld_capab; 1919 int err; 1920 1921 err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mph, 1922 DLD_DISABLE); 1923 ASSERT(err == 0); 1924 } 1925 1926 boolean_t 1927 ill_mac_perim_held(ill_t *ill) 1928 { 1929 ill_dld_capab_t *idc = ill->ill_dld_capab; 1930 1931 return (idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, NULL, 1932 DLD_QUERY)); 1933 } 1934 1935 static void 1936 ill_capability_direct_enable(ill_t *ill) 1937 { 1938 ill_dld_capab_t *idc = ill->ill_dld_capab; 1939 ill_dld_direct_t *idd = &idc->idc_direct; 1940 dld_capab_direct_t direct; 1941 int rc; 1942 1943 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); 1944 1945 bzero(&direct, sizeof (direct)); 1946 direct.di_rx_cf = (uintptr_t)ip_input; 1947 direct.di_rx_ch = ill; 1948 1949 rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, &direct, 1950 DLD_ENABLE); 1951 if (rc == 0) { 1952 idd->idd_tx_df = (ip_dld_tx_t)direct.di_tx_df; 1953 idd->idd_tx_dh = direct.di_tx_dh; 1954 idd->idd_tx_cb_df = (ip_dld_callb_t)direct.di_tx_cb_df; 1955 idd->idd_tx_cb_dh = direct.di_tx_cb_dh; 1956 idd->idd_tx_fctl_df = (ip_dld_fctl_t)direct.di_tx_fctl_df; 1957 idd->idd_tx_fctl_dh = direct.di_tx_fctl_dh; 1958 ASSERT(idd->idd_tx_cb_df != NULL); 1959 ASSERT(idd->idd_tx_fctl_df != NULL); 1960 ASSERT(idd->idd_tx_df != NULL); 1961 /* 1962 * One time registration of flow enable callback function 1963 */ 1964 ill->ill_flownotify_mh = idd->idd_tx_cb_df(idd->idd_tx_cb_dh, 1965 ill_flow_enable, ill); 1966 ill->ill_capabilities |= ILL_CAPAB_DLD_DIRECT; 1967 DTRACE_PROBE1(direct_on, (ill_t *), ill); 1968 } else { 1969 
cmn_err(CE_WARN, "warning: could not enable DIRECT " 1970 "capability, rc = %d\n", rc); 1971 DTRACE_PROBE2(direct_off, (ill_t *), ill, (int), rc); 1972 } 1973 } 1974 1975 static void 1976 ill_capability_poll_enable(ill_t *ill) 1977 { 1978 ill_dld_capab_t *idc = ill->ill_dld_capab; 1979 dld_capab_poll_t poll; 1980 int rc; 1981 1982 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); 1983 1984 bzero(&poll, sizeof (poll)); 1985 poll.poll_ring_add_cf = (uintptr_t)ip_squeue_add_ring; 1986 poll.poll_ring_remove_cf = (uintptr_t)ip_squeue_clean_ring; 1987 poll.poll_ring_quiesce_cf = (uintptr_t)ip_squeue_quiesce_ring; 1988 poll.poll_ring_restart_cf = (uintptr_t)ip_squeue_restart_ring; 1989 poll.poll_ring_bind_cf = (uintptr_t)ip_squeue_bind_ring; 1990 poll.poll_ring_ch = ill; 1991 rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, &poll, 1992 DLD_ENABLE); 1993 if (rc == 0) { 1994 ill->ill_capabilities |= ILL_CAPAB_DLD_POLL; 1995 DTRACE_PROBE1(poll_on, (ill_t *), ill); 1996 } else { 1997 ip1dbg(("warning: could not enable POLL " 1998 "capability, rc = %d\n", rc)); 1999 DTRACE_PROBE2(poll_off, (ill_t *), ill, (int), rc); 2000 } 2001 } 2002 2003 /* 2004 * Enable the LSO capability. 
2005 */ 2006 static void 2007 ill_capability_lso_enable(ill_t *ill) 2008 { 2009 ill_dld_capab_t *idc = ill->ill_dld_capab; 2010 dld_capab_lso_t lso; 2011 int rc; 2012 2013 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); 2014 2015 if (ill->ill_lso_capab == NULL) { 2016 ill->ill_lso_capab = kmem_zalloc(sizeof (ill_lso_capab_t), 2017 KM_NOSLEEP); 2018 if (ill->ill_lso_capab == NULL) { 2019 cmn_err(CE_WARN, "ill_capability_lso_enable: " 2020 "could not enable LSO for %s (ENOMEM)\n", 2021 ill->ill_name); 2022 return; 2023 } 2024 } 2025 2026 bzero(&lso, sizeof (lso)); 2027 if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, &lso, 2028 DLD_ENABLE)) == 0) { 2029 ill->ill_lso_capab->ill_lso_flags = lso.lso_flags; 2030 ill->ill_lso_capab->ill_lso_max = lso.lso_max; 2031 ill->ill_capabilities |= ILL_CAPAB_LSO; 2032 ip1dbg(("ill_capability_lso_enable: interface %s " 2033 "has enabled LSO\n ", ill->ill_name)); 2034 } else { 2035 kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t)); 2036 ill->ill_lso_capab = NULL; 2037 DTRACE_PROBE2(lso_off, (ill_t *), ill, (int), rc); 2038 } 2039 } 2040 2041 static void 2042 ill_capability_dld_enable(ill_t *ill) 2043 { 2044 mac_perim_handle_t mph; 2045 2046 ASSERT(IAM_WRITER_ILL(ill)); 2047 2048 if (ill->ill_isv6) 2049 return; 2050 2051 ill_mac_perim_enter(ill, &mph); 2052 if (!ill->ill_isv6) { 2053 ill_capability_direct_enable(ill); 2054 ill_capability_poll_enable(ill); 2055 ill_capability_lso_enable(ill); 2056 } 2057 ill->ill_capabilities |= ILL_CAPAB_DLD; 2058 ill_mac_perim_exit(ill, mph); 2059 } 2060 2061 static void 2062 ill_capability_dld_disable(ill_t *ill) 2063 { 2064 ill_dld_capab_t *idc; 2065 ill_dld_direct_t *idd; 2066 mac_perim_handle_t mph; 2067 2068 ASSERT(IAM_WRITER_ILL(ill)); 2069 2070 if (!(ill->ill_capabilities & ILL_CAPAB_DLD)) 2071 return; 2072 2073 ill_mac_perim_enter(ill, &mph); 2074 2075 idc = ill->ill_dld_capab; 2076 if ((ill->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0) { 2077 /* 2078 * For performance we 
avoid locks in the transmit data path 2079 * and don't maintain a count of the number of threads using 2080 * direct calls. Thus some threads could be using direct 2081 * transmit calls to GLD, even after the capability mechanism 2082 * turns it off. This is still safe since the handles used in 2083 * the direct calls continue to be valid until the unplumb is 2084 * completed. Remove the callback that was added (1-time) at 2085 * capab enable time. 2086 */ 2087 mutex_enter(&ill->ill_lock); 2088 ill->ill_capabilities &= ~ILL_CAPAB_DLD_DIRECT; 2089 mutex_exit(&ill->ill_lock); 2090 if (ill->ill_flownotify_mh != NULL) { 2091 idd = &idc->idc_direct; 2092 idd->idd_tx_cb_df(idd->idd_tx_cb_dh, NULL, 2093 ill->ill_flownotify_mh); 2094 ill->ill_flownotify_mh = NULL; 2095 } 2096 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, 2097 NULL, DLD_DISABLE); 2098 } 2099 2100 if ((ill->ill_capabilities & ILL_CAPAB_DLD_POLL) != 0) { 2101 ill->ill_capabilities &= ~ILL_CAPAB_DLD_POLL; 2102 ip_squeue_clean_all(ill); 2103 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, 2104 NULL, DLD_DISABLE); 2105 } 2106 2107 if ((ill->ill_capabilities & ILL_CAPAB_LSO) != 0) { 2108 ASSERT(ill->ill_lso_capab != NULL); 2109 /* 2110 * Clear the capability flag for LSO but retain the 2111 * ill_lso_capab structure since it's possible that another 2112 * thread is still referring to it. The structure only gets 2113 * deallocated when we destroy the ill. 2114 */ 2115 2116 ill->ill_capabilities &= ~ILL_CAPAB_LSO; 2117 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, 2118 NULL, DLD_DISABLE); 2119 } 2120 2121 ill->ill_capabilities &= ~ILL_CAPAB_DLD; 2122 ill_mac_perim_exit(ill, mph); 2123 } 2124 2125 /* 2126 * Capability Negotiation protocol 2127 * 2128 * We don't wait for DLPI capability operations to finish during interface 2129 * bringup or teardown. Doing so would introduce more asynchrony and the 2130 * interface up/down operations will need multiple return and restarts. 
2131 * Instead the 'ipsq_current_ipif' of the ipsq is not cleared as long as 2132 * the 'ill_dlpi_deferred' chain is non-empty. This ensures that the next 2133 * exclusive operation won't start until the DLPI operations of the previous 2134 * exclusive operation complete. 2135 * 2136 * The capability state machine is shown below. 2137 * 2138 * state next state event, action 2139 * 2140 * IDCS_UNKNOWN IDCS_PROBE_SENT ill_capability_probe 2141 * IDCS_PROBE_SENT IDCS_OK ill_capability_ack 2142 * IDCS_PROBE_SENT IDCS_FAILED ip_rput_dlpi_writer (nack) 2143 * IDCS_OK IDCS_RENEG Receipt of DL_NOTE_CAPAB_RENEG 2144 * IDCS_OK IDCS_RESET_SENT ill_capability_reset 2145 * IDCS_RESET_SENT IDCS_UNKNOWN ill_capability_ack_thr 2146 * IDCS_RENEG IDCS_PROBE_SENT ill_capability_ack_thr -> 2147 * ill_capability_probe. 2148 */ 2149 2150 /* 2151 * Dedicated thread started from ip_stack_init that handles capability 2152 * disable. This thread ensures the taskq dispatch does not fail by waiting 2153 * for resources using TQ_SLEEP. The taskq mechanism is used to ensure 2154 * that direct calls to DLD are done in a cv_waitable context. 
2155 */ 2156 void 2157 ill_taskq_dispatch(ip_stack_t *ipst) 2158 { 2159 callb_cpr_t cprinfo; 2160 char name[64]; 2161 mblk_t *mp; 2162 2163 (void) snprintf(name, sizeof (name), "ill_taskq_dispatch_%d", 2164 ipst->ips_netstack->netstack_stackid); 2165 CALLB_CPR_INIT(&cprinfo, &ipst->ips_capab_taskq_lock, callb_generic_cpr, 2166 name); 2167 mutex_enter(&ipst->ips_capab_taskq_lock); 2168 2169 for (;;) { 2170 mp = ipst->ips_capab_taskq_head; 2171 while (mp != NULL) { 2172 ipst->ips_capab_taskq_head = mp->b_next; 2173 if (ipst->ips_capab_taskq_head == NULL) 2174 ipst->ips_capab_taskq_tail = NULL; 2175 mutex_exit(&ipst->ips_capab_taskq_lock); 2176 mp->b_next = NULL; 2177 2178 VERIFY(taskq_dispatch(system_taskq, 2179 ill_capability_ack_thr, mp, TQ_SLEEP) != 0); 2180 mutex_enter(&ipst->ips_capab_taskq_lock); 2181 mp = ipst->ips_capab_taskq_head; 2182 } 2183 2184 if (ipst->ips_capab_taskq_quit) 2185 break; 2186 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2187 cv_wait(&ipst->ips_capab_taskq_cv, &ipst->ips_capab_taskq_lock); 2188 CALLB_CPR_SAFE_END(&cprinfo, &ipst->ips_capab_taskq_lock); 2189 } 2190 VERIFY(ipst->ips_capab_taskq_head == NULL); 2191 VERIFY(ipst->ips_capab_taskq_tail == NULL); 2192 CALLB_CPR_EXIT(&cprinfo); 2193 thread_exit(); 2194 } 2195 2196 /* 2197 * Consume a new-style hardware capabilities negotiation ack. 2198 * Called via taskq on receipt of DL_CAPABILITY_ACK. 2199 */ 2200 static void 2201 ill_capability_ack_thr(void *arg) 2202 { 2203 mblk_t *mp = arg; 2204 dl_capability_ack_t *capp; 2205 dl_capability_sub_t *subp, *endp; 2206 ill_t *ill; 2207 boolean_t reneg; 2208 2209 ill = (ill_t *)mp->b_prev; 2210 mp->b_prev = NULL; 2211 2212 VERIFY(ipsq_enter(ill, B_FALSE, CUR_OP) == B_TRUE); 2213 2214 if (ill->ill_dlpi_capab_state == IDCS_RESET_SENT || 2215 ill->ill_dlpi_capab_state == IDCS_RENEG) { 2216 /* 2217 * We have received the ack for our DL_CAPAB reset request. 2218 * There isnt' anything in the message that needs processing. 
2219 * All message based capabilities have been disabled, now 2220 * do the function call based capability disable. 2221 */ 2222 reneg = ill->ill_dlpi_capab_state == IDCS_RENEG; 2223 ill_capability_dld_disable(ill); 2224 ill->ill_dlpi_capab_state = IDCS_UNKNOWN; 2225 if (reneg) 2226 ill_capability_probe(ill); 2227 goto done; 2228 } 2229 2230 if (ill->ill_dlpi_capab_state == IDCS_PROBE_SENT) 2231 ill->ill_dlpi_capab_state = IDCS_OK; 2232 2233 capp = (dl_capability_ack_t *)mp->b_rptr; 2234 2235 if (capp->dl_sub_length == 0) { 2236 /* no new-style capabilities */ 2237 goto done; 2238 } 2239 2240 /* make sure the driver supplied correct dl_sub_length */ 2241 if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) { 2242 ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, " 2243 "invalid dl_sub_length (%d)\n", capp->dl_sub_length)); 2244 goto done; 2245 } 2246 2247 #define SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset)) 2248 /* 2249 * There are sub-capabilities. Process the ones we know about. 2250 * Loop until we don't have room for another sub-cap header.. 2251 */ 2252 for (subp = SC(capp, capp->dl_sub_offset), 2253 endp = SC(subp, capp->dl_sub_length - sizeof (*subp)); 2254 subp <= endp; 2255 subp = SC(subp, sizeof (dl_capability_sub_t) + subp->dl_length)) { 2256 2257 switch (subp->dl_cap) { 2258 case DL_CAPAB_ID_WRAPPER: 2259 ill_capability_id_ack(ill, mp, subp); 2260 break; 2261 default: 2262 ill_capability_dispatch(ill, mp, subp); 2263 break; 2264 } 2265 } 2266 #undef SC 2267 done: 2268 inet_freemsg(mp); 2269 ill_capability_done(ill); 2270 ipsq_exit(ill->ill_phyint->phyint_ipsq); 2271 } 2272 2273 /* 2274 * This needs to be started in a taskq thread to provide a cv_waitable 2275 * context. 
2276 */ 2277 void 2278 ill_capability_ack(ill_t *ill, mblk_t *mp) 2279 { 2280 ip_stack_t *ipst = ill->ill_ipst; 2281 2282 mp->b_prev = (mblk_t *)ill; 2283 ASSERT(mp->b_next == NULL); 2284 2285 if (taskq_dispatch(system_taskq, ill_capability_ack_thr, mp, 2286 TQ_NOSLEEP) != 0) 2287 return; 2288 2289 /* 2290 * The taskq dispatch failed. Signal the ill_taskq_dispatch thread 2291 * which will do the dispatch using TQ_SLEEP to guarantee success. 2292 */ 2293 mutex_enter(&ipst->ips_capab_taskq_lock); 2294 if (ipst->ips_capab_taskq_head == NULL) { 2295 ASSERT(ipst->ips_capab_taskq_tail == NULL); 2296 ipst->ips_capab_taskq_head = mp; 2297 } else { 2298 ipst->ips_capab_taskq_tail->b_next = mp; 2299 } 2300 ipst->ips_capab_taskq_tail = mp; 2301 2302 cv_signal(&ipst->ips_capab_taskq_cv); 2303 mutex_exit(&ipst->ips_capab_taskq_lock); 2304 } 2305 2306 /* 2307 * This routine is called to scan the fragmentation reassembly table for 2308 * the specified ILL for any packets that are starting to smell. 2309 * dead_interval is the maximum time in seconds that will be tolerated. It 2310 * will either be the value specified in ip_g_frag_timeout, or zero if the 2311 * ILL is shutting down and it is time to blow everything off. 2312 * 2313 * It returns the number of seconds (as a time_t) that the next frag timer 2314 * should be scheduled for, 0 meaning that the timer doesn't need to be 2315 * re-started. Note that the method of calculating next_timeout isn't 2316 * entirely accurate since time will flow between the time we grab 2317 * current_time and the time we schedule the next timeout. This isn't a 2318 * big problem since this is the timer for sending an ICMP reassembly time 2319 * exceeded messages, and it doesn't have to be exactly accurate. 2320 * 2321 * This function is 2322 * sometimes called as writer, although this is not required. 
2323 */ 2324 time_t 2325 ill_frag_timeout(ill_t *ill, time_t dead_interval) 2326 { 2327 ipfb_t *ipfb; 2328 ipfb_t *endp; 2329 ipf_t *ipf; 2330 ipf_t *ipfnext; 2331 mblk_t *mp; 2332 time_t current_time = gethrestime_sec(); 2333 time_t next_timeout = 0; 2334 uint32_t hdr_length; 2335 mblk_t *send_icmp_head; 2336 mblk_t *send_icmp_head_v6; 2337 ip_stack_t *ipst = ill->ill_ipst; 2338 ip_recv_attr_t iras; 2339 2340 bzero(&iras, sizeof (iras)); 2341 iras.ira_flags = 0; 2342 iras.ira_ill = iras.ira_rill = ill; 2343 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; 2344 iras.ira_rifindex = iras.ira_ruifindex; 2345 2346 ipfb = ill->ill_frag_hash_tbl; 2347 if (ipfb == NULL) 2348 return (B_FALSE); 2349 endp = &ipfb[ILL_FRAG_HASH_TBL_COUNT]; 2350 /* Walk the frag hash table. */ 2351 for (; ipfb < endp; ipfb++) { 2352 send_icmp_head = NULL; 2353 send_icmp_head_v6 = NULL; 2354 mutex_enter(&ipfb->ipfb_lock); 2355 while ((ipf = ipfb->ipfb_ipf) != 0) { 2356 time_t frag_time = current_time - ipf->ipf_timestamp; 2357 time_t frag_timeout; 2358 2359 if (frag_time < dead_interval) { 2360 /* 2361 * There are some outstanding fragments 2362 * that will timeout later. Make note of 2363 * the time so that we can reschedule the 2364 * next timeout appropriately. 2365 */ 2366 frag_timeout = dead_interval - frag_time; 2367 if (next_timeout == 0 || 2368 frag_timeout < next_timeout) { 2369 next_timeout = frag_timeout; 2370 } 2371 break; 2372 } 2373 /* Time's up. Get it out of here. */ 2374 hdr_length = ipf->ipf_nf_hdr_len; 2375 ipfnext = ipf->ipf_hash_next; 2376 if (ipfnext) 2377 ipfnext->ipf_ptphn = ipf->ipf_ptphn; 2378 *ipf->ipf_ptphn = ipfnext; 2379 mp = ipf->ipf_mp->b_cont; 2380 for (; mp; mp = mp->b_cont) { 2381 /* Extra points for neatness. 
*/ 2382 IP_REASS_SET_START(mp, 0); 2383 IP_REASS_SET_END(mp, 0); 2384 } 2385 mp = ipf->ipf_mp->b_cont; 2386 atomic_add_32(&ill->ill_frag_count, -ipf->ipf_count); 2387 ASSERT(ipfb->ipfb_count >= ipf->ipf_count); 2388 ipfb->ipfb_count -= ipf->ipf_count; 2389 ASSERT(ipfb->ipfb_frag_pkts > 0); 2390 ipfb->ipfb_frag_pkts--; 2391 /* 2392 * We do not send any icmp message from here because 2393 * we currently are holding the ipfb_lock for this 2394 * hash chain. If we try and send any icmp messages 2395 * from here we may end up via a put back into ip 2396 * trying to get the same lock, causing a recursive 2397 * mutex panic. Instead we build a list and send all 2398 * the icmp messages after we have dropped the lock. 2399 */ 2400 if (ill->ill_isv6) { 2401 if (hdr_length != 0) { 2402 mp->b_next = send_icmp_head_v6; 2403 send_icmp_head_v6 = mp; 2404 } else { 2405 freemsg(mp); 2406 } 2407 } else { 2408 if (hdr_length != 0) { 2409 mp->b_next = send_icmp_head; 2410 send_icmp_head = mp; 2411 } else { 2412 freemsg(mp); 2413 } 2414 } 2415 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); 2416 ip_drop_input("ipIfStatsReasmFails", ipf->ipf_mp, ill); 2417 freeb(ipf->ipf_mp); 2418 } 2419 mutex_exit(&ipfb->ipfb_lock); 2420 /* 2421 * Now need to send any icmp messages that we delayed from 2422 * above. 2423 */ 2424 while (send_icmp_head_v6 != NULL) { 2425 ip6_t *ip6h; 2426 2427 mp = send_icmp_head_v6; 2428 send_icmp_head_v6 = send_icmp_head_v6->b_next; 2429 mp->b_next = NULL; 2430 ip6h = (ip6_t *)mp->b_rptr; 2431 iras.ira_flags = 0; 2432 /* 2433 * This will result in an incorrect ALL_ZONES zoneid 2434 * for multicast packets, but we 2435 * don't send ICMP errors for those in any case. 
2436 */ 2437 iras.ira_zoneid = 2438 ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst, 2439 ill, ipst); 2440 ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill); 2441 icmp_time_exceeded_v6(mp, 2442 ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE, 2443 &iras); 2444 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 2445 } 2446 while (send_icmp_head != NULL) { 2447 ipaddr_t dst; 2448 2449 mp = send_icmp_head; 2450 send_icmp_head = send_icmp_head->b_next; 2451 mp->b_next = NULL; 2452 2453 dst = ((ipha_t *)mp->b_rptr)->ipha_dst; 2454 2455 iras.ira_flags = IRAF_IS_IPV4; 2456 /* 2457 * This will result in an incorrect ALL_ZONES zoneid 2458 * for broadcast and multicast packets, but we 2459 * don't send ICMP errors for those in any case. 2460 */ 2461 iras.ira_zoneid = ipif_lookup_addr_zoneid(dst, 2462 ill, ipst); 2463 ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill); 2464 icmp_time_exceeded(mp, 2465 ICMP_REASSEMBLY_TIME_EXCEEDED, &iras); 2466 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 2467 } 2468 } 2469 /* 2470 * A non-dying ILL will use the return value to decide whether to 2471 * restart the frag timer, and for how long. 2472 */ 2473 return (next_timeout); 2474 } 2475 2476 /* 2477 * This routine is called when the approximate count of mblk memory used 2478 * for the specified ILL has exceeded max_count. 2479 */ 2480 void 2481 ill_frag_prune(ill_t *ill, uint_t max_count) 2482 { 2483 ipfb_t *ipfb; 2484 ipf_t *ipf; 2485 size_t count; 2486 clock_t now; 2487 2488 /* 2489 * If we are here within ip_min_frag_prune_time msecs remove 2490 * ill_frag_free_num_pkts oldest packets from each bucket and increment 2491 * ill_frag_free_num_pkts. 2492 */ 2493 mutex_enter(&ill->ill_lock); 2494 now = ddi_get_lbolt(); 2495 if (TICK_TO_MSEC(now - ill->ill_last_frag_clean_time) <= 2496 (ip_min_frag_prune_time != 0 ? 
2497 ip_min_frag_prune_time : msec_per_tick)) { 2498 2499 ill->ill_frag_free_num_pkts++; 2500 2501 } else { 2502 ill->ill_frag_free_num_pkts = 0; 2503 } 2504 ill->ill_last_frag_clean_time = now; 2505 mutex_exit(&ill->ill_lock); 2506 2507 /* 2508 * free ill_frag_free_num_pkts oldest packets from each bucket. 2509 */ 2510 if (ill->ill_frag_free_num_pkts != 0) { 2511 int ix; 2512 2513 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 2514 ipfb = &ill->ill_frag_hash_tbl[ix]; 2515 mutex_enter(&ipfb->ipfb_lock); 2516 if (ipfb->ipfb_ipf != NULL) { 2517 ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 2518 ill->ill_frag_free_num_pkts); 2519 } 2520 mutex_exit(&ipfb->ipfb_lock); 2521 } 2522 } 2523 /* 2524 * While the reassembly list for this ILL is too big, prune a fragment 2525 * queue by age, oldest first. 2526 */ 2527 while (ill->ill_frag_count > max_count) { 2528 int ix; 2529 ipfb_t *oipfb = NULL; 2530 uint_t oldest = UINT_MAX; 2531 2532 count = 0; 2533 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 2534 ipfb = &ill->ill_frag_hash_tbl[ix]; 2535 mutex_enter(&ipfb->ipfb_lock); 2536 ipf = ipfb->ipfb_ipf; 2537 if (ipf != NULL && ipf->ipf_gen < oldest) { 2538 oldest = ipf->ipf_gen; 2539 oipfb = ipfb; 2540 } 2541 count += ipfb->ipfb_count; 2542 mutex_exit(&ipfb->ipfb_lock); 2543 } 2544 if (oipfb == NULL) 2545 break; 2546 2547 if (count <= max_count) 2548 return; /* Somebody beat us to it, nothing to do */ 2549 mutex_enter(&oipfb->ipfb_lock); 2550 ipf = oipfb->ipfb_ipf; 2551 if (ipf != NULL) { 2552 ill_frag_free_pkts(ill, oipfb, ipf, 1); 2553 } 2554 mutex_exit(&oipfb->ipfb_lock); 2555 } 2556 } 2557 2558 /* 2559 * free 'free_cnt' fragmented packets starting at ipf. 
2560 */ 2561 void 2562 ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt) 2563 { 2564 size_t count; 2565 mblk_t *mp; 2566 mblk_t *tmp; 2567 ipf_t **ipfp = ipf->ipf_ptphn; 2568 2569 ASSERT(MUTEX_HELD(&ipfb->ipfb_lock)); 2570 ASSERT(ipfp != NULL); 2571 ASSERT(ipf != NULL); 2572 2573 while (ipf != NULL && free_cnt-- > 0) { 2574 count = ipf->ipf_count; 2575 mp = ipf->ipf_mp; 2576 ipf = ipf->ipf_hash_next; 2577 for (tmp = mp; tmp; tmp = tmp->b_cont) { 2578 IP_REASS_SET_START(tmp, 0); 2579 IP_REASS_SET_END(tmp, 0); 2580 } 2581 atomic_add_32(&ill->ill_frag_count, -count); 2582 ASSERT(ipfb->ipfb_count >= count); 2583 ipfb->ipfb_count -= count; 2584 ASSERT(ipfb->ipfb_frag_pkts > 0); 2585 ipfb->ipfb_frag_pkts--; 2586 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); 2587 ip_drop_input("ipIfStatsReasmFails", mp, ill); 2588 freemsg(mp); 2589 } 2590 2591 if (ipf) 2592 ipf->ipf_ptphn = ipfp; 2593 ipfp[0] = ipf; 2594 } 2595 2596 #define ND_FORWARD_WARNING "The <if>:ip*_forwarding ndd variables are " \ 2597 "obsolete and may be removed in a future release of Solaris. Use " \ 2598 "ifconfig(1M) to manipulate the forwarding status of an interface." 2599 2600 /* 2601 * For obsolete per-interface forwarding configuration; 2602 * called in response to ND_GET. 2603 */ 2604 /* ARGSUSED */ 2605 static int 2606 nd_ill_forward_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr) 2607 { 2608 ill_t *ill = (ill_t *)cp; 2609 2610 cmn_err(CE_WARN, ND_FORWARD_WARNING); 2611 2612 (void) mi_mpprintf(mp, "%d", (ill->ill_flags & ILLF_ROUTER) != 0); 2613 return (0); 2614 } 2615 2616 /* 2617 * For obsolete per-interface forwarding configuration; 2618 * called in response to ND_SET. 
2619 */ 2620 /* ARGSUSED */ 2621 static int 2622 nd_ill_forward_set(queue_t *q, mblk_t *mp, char *valuestr, caddr_t cp, 2623 cred_t *ioc_cr) 2624 { 2625 long value; 2626 int retval; 2627 ip_stack_t *ipst = CONNQ_TO_IPST(q); 2628 2629 cmn_err(CE_WARN, ND_FORWARD_WARNING); 2630 2631 if (ddi_strtol(valuestr, NULL, 10, &value) != 0 || 2632 value < 0 || value > 1) { 2633 return (EINVAL); 2634 } 2635 2636 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 2637 retval = ill_forward_set((ill_t *)cp, (value != 0)); 2638 rw_exit(&ipst->ips_ill_g_lock); 2639 return (retval); 2640 } 2641 2642 /* 2643 * Helper function for ill_forward_set(). 2644 */ 2645 static void 2646 ill_forward_set_on_ill(ill_t *ill, boolean_t enable) 2647 { 2648 ip_stack_t *ipst = ill->ill_ipst; 2649 2650 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock)); 2651 2652 ip1dbg(("ill_forward_set: %s %s forwarding on %s", 2653 (enable ? "Enabling" : "Disabling"), 2654 (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name)); 2655 mutex_enter(&ill->ill_lock); 2656 if (enable) 2657 ill->ill_flags |= ILLF_ROUTER; 2658 else 2659 ill->ill_flags &= ~ILLF_ROUTER; 2660 mutex_exit(&ill->ill_lock); 2661 if (ill->ill_isv6) 2662 ill_set_nce_router_flags(ill, enable); 2663 /* Notify routing socket listeners of this change. */ 2664 if (ill->ill_ipif != NULL) 2665 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); 2666 } 2667 2668 /* 2669 * Set an ill's ILLF_ROUTER flag appropriately. Send up RTS_IFINFO routing 2670 * socket messages for each interface whose flags we change. 
2671 */ 2672 int 2673 ill_forward_set(ill_t *ill, boolean_t enable) 2674 { 2675 ipmp_illgrp_t *illg; 2676 ip_stack_t *ipst = ill->ill_ipst; 2677 2678 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock)); 2679 2680 if ((enable && (ill->ill_flags & ILLF_ROUTER)) || 2681 (!enable && !(ill->ill_flags & ILLF_ROUTER))) 2682 return (0); 2683 2684 if (IS_LOOPBACK(ill)) 2685 return (EINVAL); 2686 2687 if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) { 2688 /* 2689 * Update all of the interfaces in the group. 2690 */ 2691 illg = ill->ill_grp; 2692 ill = list_head(&illg->ig_if); 2693 for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) 2694 ill_forward_set_on_ill(ill, enable); 2695 2696 /* 2697 * Update the IPMP meta-interface. 2698 */ 2699 ill_forward_set_on_ill(ipmp_illgrp_ipmp_ill(illg), enable); 2700 return (0); 2701 } 2702 2703 ill_forward_set_on_ill(ill, enable); 2704 return (0); 2705 } 2706 2707 /* 2708 * Based on the ILLF_ROUTER flag of an ill, make sure all local nce's for 2709 * addresses assigned to the ill have the NCE_F_ISROUTER flag appropriately 2710 * set or clear. 2711 */ 2712 static void 2713 ill_set_nce_router_flags(ill_t *ill, boolean_t enable) 2714 { 2715 ipif_t *ipif; 2716 ncec_t *ncec; 2717 nce_t *nce; 2718 2719 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 2720 /* 2721 * NOTE: we match across the illgrp because nce's for 2722 * addresses on IPMP interfaces have an nce_ill that points to 2723 * the bound underlying ill. 2724 */ 2725 nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr); 2726 if (nce != NULL) { 2727 ncec = nce->nce_common; 2728 mutex_enter(&ncec->ncec_lock); 2729 if (enable) 2730 ncec->ncec_flags |= NCE_F_ISROUTER; 2731 else 2732 ncec->ncec_flags &= ~NCE_F_ISROUTER; 2733 mutex_exit(&ncec->ncec_lock); 2734 nce_refrele(nce); 2735 } 2736 } 2737 } 2738 2739 /* 2740 * Given an ill with a _valid_ name, add the ip_forwarding ndd variable 2741 * for this ill. 
Make sure the v6/v4 question has been answered about this 2742 * ill. The creation of this ndd variable is only for backwards compatibility. 2743 * The preferred way to control per-interface IP forwarding is through the 2744 * ILLF_ROUTER interface flag. 2745 */ 2746 static int 2747 ill_set_ndd_name(ill_t *ill) 2748 { 2749 char *suffix; 2750 ip_stack_t *ipst = ill->ill_ipst; 2751 2752 ASSERT(IAM_WRITER_ILL(ill)); 2753 2754 if (ill->ill_isv6) 2755 suffix = ipv6_forward_suffix; 2756 else 2757 suffix = ipv4_forward_suffix; 2758 2759 ill->ill_ndd_name = ill->ill_name + ill->ill_name_length; 2760 bcopy(ill->ill_name, ill->ill_ndd_name, ill->ill_name_length - 1); 2761 /* 2762 * Copies over the '\0'. 2763 * Note that strlen(suffix) is always bounded. 2764 */ 2765 bcopy(suffix, ill->ill_ndd_name + ill->ill_name_length - 1, 2766 strlen(suffix) + 1); 2767 2768 /* 2769 * Use of the nd table requires holding the reader lock. 2770 * Modifying the nd table thru nd_load/nd_unload requires 2771 * the writer lock. 2772 */ 2773 rw_enter(&ipst->ips_ip_g_nd_lock, RW_WRITER); 2774 if (!nd_load(&ipst->ips_ip_g_nd, ill->ill_ndd_name, nd_ill_forward_get, 2775 nd_ill_forward_set, (caddr_t)ill)) { 2776 /* 2777 * If the nd_load failed, it only meant that it could not 2778 * allocate a new bunch of room for further NDD expansion. 2779 * Because of that, the ill_ndd_name will be set to 0, and 2780 * this interface is at the mercy of the global ip_forwarding 2781 * variable. 2782 */ 2783 rw_exit(&ipst->ips_ip_g_nd_lock); 2784 ill->ill_ndd_name = NULL; 2785 return (ENOMEM); 2786 } 2787 rw_exit(&ipst->ips_ip_g_nd_lock); 2788 return (0); 2789 } 2790 2791 /* 2792 * Intializes the context structure and returns the first ill in the list 2793 * cuurently start_list and end_list can have values: 2794 * MAX_G_HEADS Traverse both IPV4 and IPV6 lists. 2795 * IP_V4_G_HEAD Traverse IPV4 list only. 2796 * IP_V6_G_HEAD Traverse IPV6 list only. 
2797 */ 2798 2799 /* 2800 * We don't check for CONDEMNED ills here. Caller must do that if 2801 * necessary under the ill lock. 2802 */ 2803 ill_t * 2804 ill_first(int start_list, int end_list, ill_walk_context_t *ctx, 2805 ip_stack_t *ipst) 2806 { 2807 ill_if_t *ifp; 2808 ill_t *ill; 2809 avl_tree_t *avl_tree; 2810 2811 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 2812 ASSERT(end_list <= MAX_G_HEADS && start_list >= 0); 2813 2814 /* 2815 * setup the lists to search 2816 */ 2817 if (end_list != MAX_G_HEADS) { 2818 ctx->ctx_current_list = start_list; 2819 ctx->ctx_last_list = end_list; 2820 } else { 2821 ctx->ctx_last_list = MAX_G_HEADS - 1; 2822 ctx->ctx_current_list = 0; 2823 } 2824 2825 while (ctx->ctx_current_list <= ctx->ctx_last_list) { 2826 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst); 2827 if (ifp != (ill_if_t *) 2828 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) { 2829 avl_tree = &ifp->illif_avl_by_ppa; 2830 ill = avl_first(avl_tree); 2831 /* 2832 * ill is guaranteed to be non NULL or ifp should have 2833 * not existed. 2834 */ 2835 ASSERT(ill != NULL); 2836 return (ill); 2837 } 2838 ctx->ctx_current_list++; 2839 } 2840 2841 return (NULL); 2842 } 2843 2844 /* 2845 * returns the next ill in the list. ill_first() must have been called 2846 * before calling ill_next() or bad things will happen. 2847 */ 2848 2849 /* 2850 * We don't check for CONDEMNED ills here. Caller must do that if 2851 * necessary under the ill lock. 2852 */ 2853 ill_t * 2854 ill_next(ill_walk_context_t *ctx, ill_t *lastill) 2855 { 2856 ill_if_t *ifp; 2857 ill_t *ill; 2858 ip_stack_t *ipst = lastill->ill_ipst; 2859 2860 ASSERT(lastill->ill_ifptr != (ill_if_t *) 2861 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)); 2862 if ((ill = avl_walk(&lastill->ill_ifptr->illif_avl_by_ppa, lastill, 2863 AVL_AFTER)) != NULL) { 2864 return (ill); 2865 } 2866 2867 /* goto next ill_ifp in the list. 
*/ 2868 ifp = lastill->ill_ifptr->illif_next; 2869 2870 /* make sure not at end of circular list */ 2871 while (ifp == 2872 (ill_if_t *)&IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) { 2873 if (++ctx->ctx_current_list > ctx->ctx_last_list) 2874 return (NULL); 2875 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst); 2876 } 2877 2878 return (avl_first(&ifp->illif_avl_by_ppa)); 2879 } 2880 2881 /* 2882 * Check interface name for correct format: [a-zA-Z]+[a-zA-Z0-9._]*[0-9]+ 2883 * The final number (PPA) must not have any leading zeros. Upon success, a 2884 * pointer to the start of the PPA is returned; otherwise NULL is returned. 2885 */ 2886 static char * 2887 ill_get_ppa_ptr(char *name) 2888 { 2889 int namelen = strlen(name); 2890 int end_ndx = namelen - 1; 2891 int ppa_ndx, i; 2892 2893 /* 2894 * Check that the first character is [a-zA-Z], and that the last 2895 * character is [0-9]. 2896 */ 2897 if (namelen == 0 || !isalpha(name[0]) || !isdigit(name[end_ndx])) 2898 return (NULL); 2899 2900 /* 2901 * Set `ppa_ndx' to the PPA start, and check for leading zeroes. 2902 */ 2903 for (ppa_ndx = end_ndx; ppa_ndx > 0; ppa_ndx--) 2904 if (!isdigit(name[ppa_ndx - 1])) 2905 break; 2906 2907 if (name[ppa_ndx] == '0' && ppa_ndx < end_ndx) 2908 return (NULL); 2909 2910 /* 2911 * Check that the intermediate characters are [a-z0-9.] 2912 */ 2913 for (i = 1; i < ppa_ndx; i++) { 2914 if (!isalpha(name[i]) && !isdigit(name[i]) && 2915 name[i] != '.' && name[i] != '_') { 2916 return (NULL); 2917 } 2918 } 2919 2920 return (name + ppa_ndx); 2921 } 2922 2923 /* 2924 * use avl tree to locate the ill. 
2925 */ 2926 static ill_t * 2927 ill_find_by_name(char *name, boolean_t isv6, ip_stack_t *ipst) 2928 { 2929 char *ppa_ptr = NULL; 2930 int len; 2931 uint_t ppa; 2932 ill_t *ill = NULL; 2933 ill_if_t *ifp; 2934 int list; 2935 2936 /* 2937 * get ppa ptr 2938 */ 2939 if (isv6) 2940 list = IP_V6_G_HEAD; 2941 else 2942 list = IP_V4_G_HEAD; 2943 2944 if ((ppa_ptr = ill_get_ppa_ptr(name)) == NULL) { 2945 return (NULL); 2946 } 2947 2948 len = ppa_ptr - name + 1; 2949 2950 ppa = stoi(&ppa_ptr); 2951 2952 ifp = IP_VX_ILL_G_LIST(list, ipst); 2953 2954 while (ifp != (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) { 2955 /* 2956 * match is done on len - 1 as the name is not null 2957 * terminated it contains ppa in addition to the interface 2958 * name. 2959 */ 2960 if ((ifp->illif_name_len == len) && 2961 bcmp(ifp->illif_name, name, len - 1) == 0) { 2962 break; 2963 } else { 2964 ifp = ifp->illif_next; 2965 } 2966 } 2967 2968 if (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) { 2969 /* 2970 * Even the interface type does not exist. 2971 */ 2972 return (NULL); 2973 } 2974 2975 ill = avl_find(&ifp->illif_avl_by_ppa, (void *) &ppa, NULL); 2976 if (ill != NULL) { 2977 mutex_enter(&ill->ill_lock); 2978 if (ILL_CAN_LOOKUP(ill)) { 2979 ill_refhold_locked(ill); 2980 mutex_exit(&ill->ill_lock); 2981 return (ill); 2982 } 2983 mutex_exit(&ill->ill_lock); 2984 } 2985 return (NULL); 2986 } 2987 2988 /* 2989 * comparison function for use with avl. 2990 */ 2991 static int 2992 ill_compare_ppa(const void *ppa_ptr, const void *ill_ptr) 2993 { 2994 uint_t ppa; 2995 uint_t ill_ppa; 2996 2997 ASSERT(ppa_ptr != NULL && ill_ptr != NULL); 2998 2999 ppa = *((uint_t *)ppa_ptr); 3000 ill_ppa = ((const ill_t *)ill_ptr)->ill_ppa; 3001 /* 3002 * We want the ill with the lowest ppa to be on the 3003 * top. 3004 */ 3005 if (ill_ppa < ppa) 3006 return (1); 3007 if (ill_ppa > ppa) 3008 return (-1); 3009 return (0); 3010 } 3011 3012 /* 3013 * remove an interface type from the global list. 
3014 */ 3015 static void 3016 ill_delete_interface_type(ill_if_t *interface) 3017 { 3018 ASSERT(interface != NULL); 3019 ASSERT(avl_numnodes(&interface->illif_avl_by_ppa) == 0); 3020 3021 avl_destroy(&interface->illif_avl_by_ppa); 3022 if (interface->illif_ppa_arena != NULL) 3023 vmem_destroy(interface->illif_ppa_arena); 3024 3025 remque(interface); 3026 3027 mi_free(interface); 3028 } 3029 3030 /* 3031 * remove ill from the global list. 3032 */ 3033 static void 3034 ill_glist_delete(ill_t *ill) 3035 { 3036 ip_stack_t *ipst; 3037 phyint_t *phyi; 3038 3039 if (ill == NULL) 3040 return; 3041 ipst = ill->ill_ipst; 3042 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 3043 3044 /* 3045 * If the ill was never inserted into the AVL tree 3046 * we skip the if branch. 3047 */ 3048 if (ill->ill_ifptr != NULL) { 3049 /* 3050 * remove from AVL tree and free ppa number 3051 */ 3052 avl_remove(&ill->ill_ifptr->illif_avl_by_ppa, ill); 3053 3054 if (ill->ill_ifptr->illif_ppa_arena != NULL) { 3055 vmem_free(ill->ill_ifptr->illif_ppa_arena, 3056 (void *)(uintptr_t)(ill->ill_ppa+1), 1); 3057 } 3058 if (avl_numnodes(&ill->ill_ifptr->illif_avl_by_ppa) == 0) { 3059 ill_delete_interface_type(ill->ill_ifptr); 3060 } 3061 3062 /* 3063 * Indicate ill is no longer in the list. 3064 */ 3065 ill->ill_ifptr = NULL; 3066 ill->ill_name_length = 0; 3067 ill->ill_name[0] = '\0'; 3068 ill->ill_ppa = UINT_MAX; 3069 } 3070 3071 /* Generate one last event for this ill. */ 3072 ill_nic_event_dispatch(ill, 0, NE_UNPLUMB, ill->ill_name, 3073 ill->ill_name_length); 3074 3075 ASSERT(ill->ill_phyint != NULL); 3076 phyi = ill->ill_phyint; 3077 ill->ill_phyint = NULL; 3078 3079 /* 3080 * ill_init allocates a phyint always to store the copy 3081 * of flags relevant to phyint. At that point in time, we could 3082 * not assign the name and hence phyint_illv4/v6 could not be 3083 * initialized. Later in ipif_set_values, we assign the name to 3084 * the ill, at which point in time we assign phyint_illv4/v6. 
3085 * Thus we don't rely on phyint_illv6 to be initialized always. 3086 */ 3087 if (ill->ill_flags & ILLF_IPV6) 3088 phyi->phyint_illv6 = NULL; 3089 else 3090 phyi->phyint_illv4 = NULL; 3091 3092 if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) { 3093 rw_exit(&ipst->ips_ill_g_lock); 3094 return; 3095 } 3096 3097 /* 3098 * There are no ills left on this phyint; pull it out of the phyint 3099 * avl trees, and free it. 3100 */ 3101 if (phyi->phyint_ifindex > 0) { 3102 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 3103 phyi); 3104 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 3105 phyi); 3106 } 3107 rw_exit(&ipst->ips_ill_g_lock); 3108 3109 phyint_free(phyi); 3110 } 3111 3112 /* 3113 * allocate a ppa, if the number of plumbed interfaces of this type are 3114 * less than ill_no_arena do a linear search to find a unused ppa. 3115 * When the number goes beyond ill_no_arena switch to using an arena. 3116 * Note: ppa value of zero cannot be allocated from vmem_arena as it 3117 * is the return value for an error condition, so allocation starts at one 3118 * and is decremented by one. 3119 */ 3120 static int 3121 ill_alloc_ppa(ill_if_t *ifp, ill_t *ill) 3122 { 3123 ill_t *tmp_ill; 3124 uint_t start, end; 3125 int ppa; 3126 3127 if (ifp->illif_ppa_arena == NULL && 3128 (avl_numnodes(&ifp->illif_avl_by_ppa) + 1 > ill_no_arena)) { 3129 /* 3130 * Create an arena. 
3131 */ 3132 ifp->illif_ppa_arena = vmem_create(ifp->illif_name, 3133 (void *)1, UINT_MAX - 1, 1, NULL, NULL, 3134 NULL, 0, VM_SLEEP | VMC_IDENTIFIER); 3135 /* allocate what has already been assigned */ 3136 for (tmp_ill = avl_first(&ifp->illif_avl_by_ppa); 3137 tmp_ill != NULL; tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, 3138 tmp_ill, AVL_AFTER)) { 3139 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena, 3140 1, /* size */ 3141 1, /* align/quantum */ 3142 0, /* phase */ 3143 0, /* nocross */ 3144 /* minaddr */ 3145 (void *)((uintptr_t)tmp_ill->ill_ppa + 1), 3146 /* maxaddr */ 3147 (void *)((uintptr_t)tmp_ill->ill_ppa + 2), 3148 VM_NOSLEEP|VM_FIRSTFIT); 3149 if (ppa == 0) { 3150 ip1dbg(("ill_alloc_ppa: ppa allocation" 3151 " failed while switching")); 3152 vmem_destroy(ifp->illif_ppa_arena); 3153 ifp->illif_ppa_arena = NULL; 3154 break; 3155 } 3156 } 3157 } 3158 3159 if (ifp->illif_ppa_arena != NULL) { 3160 if (ill->ill_ppa == UINT_MAX) { 3161 ppa = (int)(uintptr_t)vmem_alloc(ifp->illif_ppa_arena, 3162 1, VM_NOSLEEP|VM_FIRSTFIT); 3163 if (ppa == 0) 3164 return (EAGAIN); 3165 ill->ill_ppa = --ppa; 3166 } else { 3167 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena, 3168 1, /* size */ 3169 1, /* align/quantum */ 3170 0, /* phase */ 3171 0, /* nocross */ 3172 (void *)(uintptr_t)(ill->ill_ppa + 1), /* minaddr */ 3173 (void *)(uintptr_t)(ill->ill_ppa + 2), /* maxaddr */ 3174 VM_NOSLEEP|VM_FIRSTFIT); 3175 /* 3176 * Most likely the allocation failed because 3177 * the requested ppa was in use. 3178 */ 3179 if (ppa == 0) 3180 return (EEXIST); 3181 } 3182 return (0); 3183 } 3184 3185 /* 3186 * No arena is in use and not enough (>ill_no_arena) interfaces have 3187 * been plumbed to create one. Do a linear search to get a unused ppa. 
3188 */ 3189 if (ill->ill_ppa == UINT_MAX) { 3190 end = UINT_MAX - 1; 3191 start = 0; 3192 } else { 3193 end = start = ill->ill_ppa; 3194 } 3195 3196 tmp_ill = avl_find(&ifp->illif_avl_by_ppa, (void *)&start, NULL); 3197 while (tmp_ill != NULL && tmp_ill->ill_ppa == start) { 3198 if (start++ >= end) { 3199 if (ill->ill_ppa == UINT_MAX) 3200 return (EAGAIN); 3201 else 3202 return (EEXIST); 3203 } 3204 tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, tmp_ill, AVL_AFTER); 3205 } 3206 ill->ill_ppa = start; 3207 return (0); 3208 } 3209 3210 /* 3211 * Insert ill into the list of configured ill's. Once this function completes, 3212 * the ill is globally visible and is available through lookups. More precisely 3213 * this happens after the caller drops the ill_g_lock. 3214 */ 3215 static int 3216 ill_glist_insert(ill_t *ill, char *name, boolean_t isv6) 3217 { 3218 ill_if_t *ill_interface; 3219 avl_index_t where = 0; 3220 int error; 3221 int name_length; 3222 int index; 3223 boolean_t check_length = B_FALSE; 3224 ip_stack_t *ipst = ill->ill_ipst; 3225 3226 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 3227 3228 name_length = mi_strlen(name) + 1; 3229 3230 if (isv6) 3231 index = IP_V6_G_HEAD; 3232 else 3233 index = IP_V4_G_HEAD; 3234 3235 ill_interface = IP_VX_ILL_G_LIST(index, ipst); 3236 /* 3237 * Search for interface type based on name 3238 */ 3239 while (ill_interface != (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) { 3240 if ((ill_interface->illif_name_len == name_length) && 3241 (strcmp(ill_interface->illif_name, name) == 0)) { 3242 break; 3243 } 3244 ill_interface = ill_interface->illif_next; 3245 } 3246 3247 /* 3248 * Interface type not found, create one. 
3249 */ 3250 if (ill_interface == (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) { 3251 ill_g_head_t ghead; 3252 3253 /* 3254 * allocate ill_if_t structure 3255 */ 3256 ill_interface = (ill_if_t *)mi_zalloc(sizeof (ill_if_t)); 3257 if (ill_interface == NULL) { 3258 return (ENOMEM); 3259 } 3260 3261 (void) strcpy(ill_interface->illif_name, name); 3262 ill_interface->illif_name_len = name_length; 3263 3264 avl_create(&ill_interface->illif_avl_by_ppa, 3265 ill_compare_ppa, sizeof (ill_t), 3266 offsetof(struct ill_s, ill_avl_byppa)); 3267 3268 /* 3269 * link the structure in the back to maintain order 3270 * of configuration for ifconfig output. 3271 */ 3272 ghead = ipst->ips_ill_g_heads[index]; 3273 insque(ill_interface, ghead.ill_g_list_tail); 3274 } 3275 3276 if (ill->ill_ppa == UINT_MAX) 3277 check_length = B_TRUE; 3278 3279 error = ill_alloc_ppa(ill_interface, ill); 3280 if (error != 0) { 3281 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0) 3282 ill_delete_interface_type(ill->ill_ifptr); 3283 return (error); 3284 } 3285 3286 /* 3287 * When the ppa is choosen by the system, check that there is 3288 * enough space to insert ppa. if a specific ppa was passed in this 3289 * check is not required as the interface name passed in will have 3290 * the right ppa in it. 3291 */ 3292 if (check_length) { 3293 /* 3294 * UINT_MAX - 1 should fit in 10 chars, alloc 12 chars. 3295 */ 3296 char buf[sizeof (uint_t) * 3]; 3297 3298 /* 3299 * convert ppa to string to calculate the amount of space 3300 * required for it in the name. 3301 */ 3302 numtos(ill->ill_ppa, buf); 3303 3304 /* Do we have enough space to insert ppa ? 
*/ 3305 3306 if ((mi_strlen(name) + mi_strlen(buf) + 1) > LIFNAMSIZ) { 3307 /* Free ppa and interface type struct */ 3308 if (ill_interface->illif_ppa_arena != NULL) { 3309 vmem_free(ill_interface->illif_ppa_arena, 3310 (void *)(uintptr_t)(ill->ill_ppa+1), 1); 3311 } 3312 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0) 3313 ill_delete_interface_type(ill->ill_ifptr); 3314 3315 return (EINVAL); 3316 } 3317 } 3318 3319 (void) sprintf(ill->ill_name, "%s%u", name, ill->ill_ppa); 3320 ill->ill_name_length = mi_strlen(ill->ill_name) + 1; 3321 3322 (void) avl_find(&ill_interface->illif_avl_by_ppa, &ill->ill_ppa, 3323 &where); 3324 ill->ill_ifptr = ill_interface; 3325 avl_insert(&ill_interface->illif_avl_by_ppa, ill, where); 3326 3327 ill_phyint_reinit(ill); 3328 return (0); 3329 } 3330 3331 /* Initialize the per phyint ipsq used for serialization */ 3332 static boolean_t 3333 ipsq_init(ill_t *ill, boolean_t enter) 3334 { 3335 ipsq_t *ipsq; 3336 ipxop_t *ipx; 3337 3338 if ((ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP)) == NULL) 3339 return (B_FALSE); 3340 3341 ill->ill_phyint->phyint_ipsq = ipsq; 3342 ipx = ipsq->ipsq_xop = &ipsq->ipsq_ownxop; 3343 ipx->ipx_ipsq = ipsq; 3344 ipsq->ipsq_next = ipsq; 3345 ipsq->ipsq_phyint = ill->ill_phyint; 3346 mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, 0); 3347 mutex_init(&ipx->ipx_lock, NULL, MUTEX_DEFAULT, 0); 3348 ipsq->ipsq_ipst = ill->ill_ipst; /* No netstack_hold */ 3349 if (enter) { 3350 ipx->ipx_writer = curthread; 3351 ipx->ipx_forced = B_FALSE; 3352 ipx->ipx_reentry_cnt = 1; 3353 #ifdef DEBUG 3354 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); 3355 #endif 3356 } 3357 return (B_TRUE); 3358 } 3359 3360 /* 3361 * ill_init is called by ip_open when a device control stream is opened. 3362 * It does a few initializations, and shoots a DL_INFO_REQ message down 3363 * to the driver. The response is later picked up in ip_rput_dlpi and 3364 * used to set up default mechanisms for talking to the driver. 
(Always 3365 * called as writer.) 3366 * 3367 * If this function returns error, ip_open will call ip_close which in 3368 * turn will call ill_delete to clean up any memory allocated here that 3369 * is not yet freed. 3370 */ 3371 int 3372 ill_init(queue_t *q, ill_t *ill) 3373 { 3374 int count; 3375 dl_info_req_t *dlir; 3376 mblk_t *info_mp; 3377 uchar_t *frag_ptr; 3378 3379 /* 3380 * The ill is initialized to zero by mi_alloc*(). In addition 3381 * some fields already contain valid values, initialized in 3382 * ip_open(), before we reach here. 3383 */ 3384 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, 0); 3385 mutex_init(&ill->ill_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL); 3386 ill->ill_saved_ire_cnt = 0; 3387 3388 ill->ill_rq = q; 3389 ill->ill_wq = WR(q); 3390 3391 info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)), 3392 BPRI_HI); 3393 if (info_mp == NULL) 3394 return (ENOMEM); 3395 3396 /* 3397 * Allocate sufficient space to contain our fragment hash table and 3398 * the device name. 3399 */ 3400 frag_ptr = (uchar_t *)mi_zalloc(ILL_FRAG_HASH_TBL_SIZE + 3401 2 * LIFNAMSIZ + strlen(ipv6_forward_suffix)); 3402 if (frag_ptr == NULL) { 3403 freemsg(info_mp); 3404 return (ENOMEM); 3405 } 3406 ill->ill_frag_ptr = frag_ptr; 3407 ill->ill_frag_free_num_pkts = 0; 3408 ill->ill_last_frag_clean_time = 0; 3409 ill->ill_frag_hash_tbl = (ipfb_t *)frag_ptr; 3410 ill->ill_name = (char *)(frag_ptr + ILL_FRAG_HASH_TBL_SIZE); 3411 for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) { 3412 mutex_init(&ill->ill_frag_hash_tbl[count].ipfb_lock, 3413 NULL, MUTEX_DEFAULT, NULL); 3414 } 3415 3416 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t)); 3417 if (ill->ill_phyint == NULL) { 3418 freemsg(info_mp); 3419 mi_free(frag_ptr); 3420 return (ENOMEM); 3421 } 3422 3423 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0); 3424 /* 3425 * For now pretend this is a v4 ill. 
We need to set phyint_ill* 3426 * at this point because of the following reason. If we can't 3427 * enter the ipsq at some point and cv_wait, the writer that 3428 * wakes us up tries to locate us using the list of all phyints 3429 * in an ipsq and the ills from the phyint thru the phyint_ill*. 3430 * If we don't set it now, we risk a missed wakeup. 3431 */ 3432 ill->ill_phyint->phyint_illv4 = ill; 3433 ill->ill_ppa = UINT_MAX; 3434 list_create(&ill->ill_nce, sizeof (nce_t), offsetof(nce_t, nce_node)); 3435 3436 ill_set_inputfn(ill); 3437 3438 if (!ipsq_init(ill, B_TRUE)) { 3439 freemsg(info_mp); 3440 mi_free(frag_ptr); 3441 mi_free(ill->ill_phyint); 3442 return (ENOMEM); 3443 } 3444 3445 ill->ill_state_flags |= ILL_LL_SUBNET_PENDING; 3446 3447 /* Frag queue limit stuff */ 3448 ill->ill_frag_count = 0; 3449 ill->ill_ipf_gen = 0; 3450 3451 rw_init(&ill->ill_mcast_lock, NULL, RW_DEFAULT, NULL); 3452 mutex_init(&ill->ill_mcast_serializer, NULL, MUTEX_DEFAULT, NULL); 3453 ill->ill_global_timer = INFINITY; 3454 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0; 3455 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0; 3456 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS; 3457 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL; 3458 3459 /* 3460 * Initialize IPv6 configuration variables. The IP module is always 3461 * opened as an IPv4 module. Instead tracking down the cases where 3462 * it switches to do ipv6, we'll just initialize the IPv6 configuration 3463 * here for convenience, this has no effect until the ill is set to do 3464 * IPv6. 3465 */ 3466 ill->ill_reachable_time = ND_REACHABLE_TIME; 3467 ill->ill_xmit_count = ND_MAX_MULTICAST_SOLICIT; 3468 ill->ill_max_buf = ND_MAX_Q; 3469 ill->ill_refcnt = 0; 3470 3471 /* Send down the Info Request to the driver. 
*/ 3472 info_mp->b_datap->db_type = M_PCPROTO; 3473 dlir = (dl_info_req_t *)info_mp->b_rptr; 3474 info_mp->b_wptr = (uchar_t *)&dlir[1]; 3475 dlir->dl_primitive = DL_INFO_REQ; 3476 3477 ill->ill_dlpi_pending = DL_PRIM_INVAL; 3478 3479 qprocson(q); 3480 ill_dlpi_send(ill, info_mp); 3481 3482 return (0); 3483 } 3484 3485 /* 3486 * ill_dls_info 3487 * creates datalink socket info from the device. 3488 */ 3489 int 3490 ill_dls_info(struct sockaddr_dl *sdl, const ill_t *ill) 3491 { 3492 size_t len; 3493 3494 sdl->sdl_family = AF_LINK; 3495 sdl->sdl_index = ill_get_upper_ifindex(ill); 3496 sdl->sdl_type = ill->ill_type; 3497 ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data)); 3498 len = strlen(sdl->sdl_data); 3499 ASSERT(len < 256); 3500 sdl->sdl_nlen = (uchar_t)len; 3501 sdl->sdl_alen = ill->ill_phys_addr_length; 3502 sdl->sdl_slen = 0; 3503 if (ill->ill_phys_addr_length != 0 && ill->ill_phys_addr != NULL) 3504 bcopy(ill->ill_phys_addr, &sdl->sdl_data[len], sdl->sdl_alen); 3505 3506 return (sizeof (struct sockaddr_dl)); 3507 } 3508 3509 /* 3510 * ill_xarp_info 3511 * creates xarp info from the device. 
3512 */ 3513 static int 3514 ill_xarp_info(struct sockaddr_dl *sdl, ill_t *ill) 3515 { 3516 sdl->sdl_family = AF_LINK; 3517 sdl->sdl_index = ill->ill_phyint->phyint_ifindex; 3518 sdl->sdl_type = ill->ill_type; 3519 ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data)); 3520 sdl->sdl_nlen = (uchar_t)mi_strlen(sdl->sdl_data); 3521 sdl->sdl_alen = ill->ill_phys_addr_length; 3522 sdl->sdl_slen = 0; 3523 return (sdl->sdl_nlen); 3524 } 3525 3526 static int 3527 loopback_kstat_update(kstat_t *ksp, int rw) 3528 { 3529 kstat_named_t *kn; 3530 netstackid_t stackid; 3531 netstack_t *ns; 3532 ip_stack_t *ipst; 3533 3534 if (ksp == NULL || ksp->ks_data == NULL) 3535 return (EIO); 3536 3537 if (rw == KSTAT_WRITE) 3538 return (EACCES); 3539 3540 kn = KSTAT_NAMED_PTR(ksp); 3541 stackid = (zoneid_t)(uintptr_t)ksp->ks_private; 3542 3543 ns = netstack_find_by_stackid(stackid); 3544 if (ns == NULL) 3545 return (-1); 3546 3547 ipst = ns->netstack_ip; 3548 if (ipst == NULL) { 3549 netstack_rele(ns); 3550 return (-1); 3551 } 3552 kn[0].value.ui32 = ipst->ips_loopback_packets; 3553 kn[1].value.ui32 = ipst->ips_loopback_packets; 3554 netstack_rele(ns); 3555 return (0); 3556 } 3557 3558 /* 3559 * Has ifindex been plumbed already? 3560 */ 3561 static boolean_t 3562 phyint_exists(uint_t index, ip_stack_t *ipst) 3563 { 3564 ASSERT(index != 0); 3565 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 3566 3567 return (avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 3568 &index, NULL) != NULL); 3569 } 3570 3571 /* Pick a unique ifindex */ 3572 boolean_t 3573 ip_assign_ifindex(uint_t *indexp, ip_stack_t *ipst) 3574 { 3575 uint_t starting_index; 3576 3577 if (!ipst->ips_ill_index_wrap) { 3578 *indexp = ipst->ips_ill_index++; 3579 if (ipst->ips_ill_index == 0) { 3580 /* Reached the uint_t limit Next time wrap */ 3581 ipst->ips_ill_index_wrap = B_TRUE; 3582 } 3583 return (B_TRUE); 3584 } 3585 3586 /* 3587 * Start reusing unused indexes. 
Note that we hold the ill_g_lock 3588 * at this point and don't want to call any function that attempts 3589 * to get the lock again. 3590 */ 3591 starting_index = ipst->ips_ill_index++; 3592 for (; ipst->ips_ill_index != starting_index; ipst->ips_ill_index++) { 3593 if (ipst->ips_ill_index != 0 && 3594 !phyint_exists(ipst->ips_ill_index, ipst)) { 3595 /* found unused index - use it */ 3596 *indexp = ipst->ips_ill_index; 3597 return (B_TRUE); 3598 } 3599 } 3600 3601 /* 3602 * all interface indicies are inuse. 3603 */ 3604 return (B_FALSE); 3605 } 3606 3607 /* 3608 * Assign a unique interface index for the phyint. 3609 */ 3610 static boolean_t 3611 phyint_assign_ifindex(phyint_t *phyi, ip_stack_t *ipst) 3612 { 3613 ASSERT(phyi->phyint_ifindex == 0); 3614 return (ip_assign_ifindex(&phyi->phyint_ifindex, ipst)); 3615 } 3616 3617 /* 3618 * Initialize the flags on `phyi' as per the provided mactype. 3619 */ 3620 static void 3621 phyint_flags_init(phyint_t *phyi, t_uscalar_t mactype) 3622 { 3623 uint64_t flags = 0; 3624 3625 /* 3626 * Initialize PHYI_RUNNING and PHYI_FAILED. For non-IPMP interfaces, 3627 * we always presume the underlying hardware is working and set 3628 * PHYI_RUNNING (if it's not, the driver will subsequently send a 3629 * DL_NOTE_LINK_DOWN message). For IPMP interfaces, at initialization 3630 * there are no active interfaces in the group so we set PHYI_FAILED. 3631 */ 3632 if (mactype == SUNW_DL_IPMP) 3633 flags |= PHYI_FAILED; 3634 else 3635 flags |= PHYI_RUNNING; 3636 3637 switch (mactype) { 3638 case SUNW_DL_VNI: 3639 flags |= PHYI_VIRTUAL; 3640 break; 3641 case SUNW_DL_IPMP: 3642 flags |= PHYI_IPMP; 3643 break; 3644 case DL_LOOP: 3645 flags |= (PHYI_LOOPBACK | PHYI_VIRTUAL); 3646 break; 3647 } 3648 3649 mutex_enter(&phyi->phyint_lock); 3650 phyi->phyint_flags |= flags; 3651 mutex_exit(&phyi->phyint_lock); 3652 } 3653 3654 /* 3655 * Return a pointer to the ill which matches the supplied name. 
Note that 3656 * the ill name length includes the null termination character. (May be 3657 * called as writer.) 3658 * If do_alloc and the interface is "lo0" it will be automatically created. 3659 * Cannot bump up reference on condemned ills. So dup detect can't be done 3660 * using this func. 3661 */ 3662 ill_t * 3663 ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, 3664 boolean_t *did_alloc, ip_stack_t *ipst) 3665 { 3666 ill_t *ill; 3667 ipif_t *ipif; 3668 ipsq_t *ipsq; 3669 kstat_named_t *kn; 3670 boolean_t isloopback; 3671 in6_addr_t ov6addr; 3672 3673 isloopback = mi_strcmp(name, ipif_loopback_name) == 0; 3674 3675 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3676 ill = ill_find_by_name(name, isv6, ipst); 3677 rw_exit(&ipst->ips_ill_g_lock); 3678 if (ill != NULL) 3679 return (ill); 3680 3681 /* 3682 * Couldn't find it. Does this happen to be a lookup for the 3683 * loopback device and are we allowed to allocate it? 3684 */ 3685 if (!isloopback || !do_alloc) 3686 return (NULL); 3687 3688 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 3689 ill = ill_find_by_name(name, isv6, ipst); 3690 if (ill != NULL) { 3691 rw_exit(&ipst->ips_ill_g_lock); 3692 return (ill); 3693 } 3694 3695 /* Create the loopback device on demand */ 3696 ill = (ill_t *)(mi_alloc(sizeof (ill_t) + 3697 sizeof (ipif_loopback_name), BPRI_MED)); 3698 if (ill == NULL) 3699 goto done; 3700 3701 *ill = ill_null; 3702 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, NULL); 3703 ill->ill_ipst = ipst; 3704 list_create(&ill->ill_nce, sizeof (nce_t), offsetof(nce_t, nce_node)); 3705 netstack_hold(ipst->ips_netstack); 3706 /* 3707 * For exclusive stacks we set the zoneid to zero 3708 * to make IP operate as if in the global zone. 
3709 */ 3710 ill->ill_zoneid = GLOBAL_ZONEID; 3711 3712 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t)); 3713 if (ill->ill_phyint == NULL) 3714 goto done; 3715 3716 if (isv6) 3717 ill->ill_phyint->phyint_illv6 = ill; 3718 else 3719 ill->ill_phyint->phyint_illv4 = ill; 3720 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0); 3721 phyint_flags_init(ill->ill_phyint, DL_LOOP); 3722 3723 if (isv6) { 3724 ill->ill_isv6 = B_TRUE; 3725 ill->ill_max_frag = ip_loopback_mtu_v6plus; 3726 } else { 3727 ill->ill_max_frag = ip_loopback_mtuplus; 3728 } 3729 if (!ill_allocate_mibs(ill)) 3730 goto done; 3731 ill->ill_current_frag = ill->ill_max_frag; 3732 ill->ill_mtu = ill->ill_max_frag; /* Initial value */ 3733 /* 3734 * ipif_loopback_name can't be pointed at directly because its used 3735 * by both the ipv4 and ipv6 interfaces. When the ill is removed 3736 * from the glist, ill_glist_delete() sets the first character of 3737 * ill_name to '\0'. 3738 */ 3739 ill->ill_name = (char *)ill + sizeof (*ill); 3740 (void) strcpy(ill->ill_name, ipif_loopback_name); 3741 ill->ill_name_length = sizeof (ipif_loopback_name); 3742 /* Set ill_dlpi_pending for ipsq_current_finish() to work properly */ 3743 ill->ill_dlpi_pending = DL_PRIM_INVAL; 3744 3745 rw_init(&ill->ill_mcast_lock, NULL, RW_DEFAULT, NULL); 3746 mutex_init(&ill->ill_mcast_serializer, NULL, MUTEX_DEFAULT, NULL); 3747 ill->ill_global_timer = INFINITY; 3748 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0; 3749 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0; 3750 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS; 3751 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL; 3752 3753 /* No resolver here. 
*/ 3754 ill->ill_net_type = IRE_LOOPBACK; 3755 3756 /* Initialize the ipsq */ 3757 if (!ipsq_init(ill, B_FALSE)) 3758 goto done; 3759 3760 ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE, B_TRUE, NULL); 3761 if (ipif == NULL) 3762 goto done; 3763 3764 ill->ill_flags = ILLF_MULTICAST; 3765 3766 ov6addr = ipif->ipif_v6lcl_addr; 3767 /* Set up default loopback address and mask. */ 3768 if (!isv6) { 3769 ipaddr_t inaddr_loopback = htonl(INADDR_LOOPBACK); 3770 3771 IN6_IPADDR_TO_V4MAPPED(inaddr_loopback, &ipif->ipif_v6lcl_addr); 3772 V4MASK_TO_V6(htonl(IN_CLASSA_NET), ipif->ipif_v6net_mask); 3773 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 3774 ipif->ipif_v6subnet); 3775 ill->ill_flags |= ILLF_IPV4; 3776 } else { 3777 ipif->ipif_v6lcl_addr = ipv6_loopback; 3778 ipif->ipif_v6net_mask = ipv6_all_ones; 3779 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 3780 ipif->ipif_v6subnet); 3781 ill->ill_flags |= ILLF_IPV6; 3782 } 3783 3784 /* 3785 * Chain us in at the end of the ill list. hold the ill 3786 * before we make it globally visible. 1 for the lookup. 3787 */ 3788 ill->ill_refcnt = 0; 3789 ill_refhold(ill); 3790 3791 ill->ill_frag_count = 0; 3792 ill->ill_frag_free_num_pkts = 0; 3793 ill->ill_last_frag_clean_time = 0; 3794 3795 ipsq = ill->ill_phyint->phyint_ipsq; 3796 3797 ill_set_inputfn(ill); 3798 3799 if (ill_glist_insert(ill, "lo", isv6) != 0) 3800 cmn_err(CE_PANIC, "cannot insert loopback interface"); 3801 3802 /* Let SCTP know so that it can add this to its list */ 3803 sctp_update_ill(ill, SCTP_ILL_INSERT); 3804 3805 /* 3806 * We have already assigned ipif_v6lcl_addr above, but we need to 3807 * call sctp_update_ipif_addr() after SCTP_ILL_INSERT, which 3808 * requires to be after ill_glist_insert() since we need the 3809 * ill_index set. Pass on ipv6_loopback as the old address. 
3810 */ 3811 sctp_update_ipif_addr(ipif, ov6addr); 3812 3813 ip_rts_newaddrmsg(RTM_CHGADDR, 0, ipif, RTSQ_DEFAULT); 3814 3815 /* 3816 * ill_glist_insert() -> ill_phyint_reinit() may have merged IPSQs. 3817 * If so, free our original one. 3818 */ 3819 if (ipsq != ill->ill_phyint->phyint_ipsq) 3820 ipsq_delete(ipsq); 3821 3822 if (ipst->ips_loopback_ksp == NULL) { 3823 /* Export loopback interface statistics */ 3824 ipst->ips_loopback_ksp = kstat_create_netstack("lo", 0, 3825 ipif_loopback_name, "net", 3826 KSTAT_TYPE_NAMED, 2, 0, 3827 ipst->ips_netstack->netstack_stackid); 3828 if (ipst->ips_loopback_ksp != NULL) { 3829 ipst->ips_loopback_ksp->ks_update = 3830 loopback_kstat_update; 3831 kn = KSTAT_NAMED_PTR(ipst->ips_loopback_ksp); 3832 kstat_named_init(&kn[0], "ipackets", KSTAT_DATA_UINT32); 3833 kstat_named_init(&kn[1], "opackets", KSTAT_DATA_UINT32); 3834 ipst->ips_loopback_ksp->ks_private = 3835 (void *)(uintptr_t)ipst->ips_netstack-> 3836 netstack_stackid; 3837 kstat_install(ipst->ips_loopback_ksp); 3838 } 3839 } 3840 3841 *did_alloc = B_TRUE; 3842 rw_exit(&ipst->ips_ill_g_lock); 3843 ill_nic_event_dispatch(ill, MAP_IPIF_ID(ill->ill_ipif->ipif_id), 3844 NE_PLUMB, ill->ill_name, ill->ill_name_length); 3845 return (ill); 3846 done: 3847 if (ill != NULL) { 3848 if (ill->ill_phyint != NULL) { 3849 ipsq = ill->ill_phyint->phyint_ipsq; 3850 if (ipsq != NULL) { 3851 ipsq->ipsq_phyint = NULL; 3852 ipsq_delete(ipsq); 3853 } 3854 mi_free(ill->ill_phyint); 3855 } 3856 ill_free_mib(ill); 3857 if (ill->ill_ipst != NULL) 3858 netstack_rele(ill->ill_ipst->ips_netstack); 3859 mi_free(ill); 3860 } 3861 rw_exit(&ipst->ips_ill_g_lock); 3862 return (NULL); 3863 } 3864 3865 /* 3866 * For IPP calls - use the ip_stack_t for global stack. 
3867 */ 3868 ill_t * 3869 ill_lookup_on_ifindex_global_instance(uint_t index, boolean_t isv6) 3870 { 3871 ip_stack_t *ipst; 3872 ill_t *ill; 3873 3874 ipst = netstack_find_by_stackid(GLOBAL_NETSTACKID)->netstack_ip; 3875 if (ipst == NULL) { 3876 cmn_err(CE_WARN, "No ip_stack_t for zoneid zero!\n"); 3877 return (NULL); 3878 } 3879 3880 ill = ill_lookup_on_ifindex(index, isv6, ipst); 3881 netstack_rele(ipst->ips_netstack); 3882 return (ill); 3883 } 3884 3885 /* 3886 * Return a pointer to the ill which matches the index and IP version type. 3887 */ 3888 ill_t * 3889 ill_lookup_on_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst) 3890 { 3891 ill_t *ill; 3892 phyint_t *phyi; 3893 3894 /* 3895 * Indexes are stored in the phyint - a common structure 3896 * to both IPv4 and IPv6. 3897 */ 3898 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3899 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 3900 (void *) &index, NULL); 3901 if (phyi != NULL) { 3902 ill = isv6 ? phyi->phyint_illv6: phyi->phyint_illv4; 3903 if (ill != NULL) { 3904 mutex_enter(&ill->ill_lock); 3905 if (!ILL_IS_CONDEMNED(ill)) { 3906 ill_refhold_locked(ill); 3907 mutex_exit(&ill->ill_lock); 3908 rw_exit(&ipst->ips_ill_g_lock); 3909 return (ill); 3910 } 3911 mutex_exit(&ill->ill_lock); 3912 } 3913 } 3914 rw_exit(&ipst->ips_ill_g_lock); 3915 return (NULL); 3916 } 3917 3918 /* 3919 * Verify whether or not an interface index is valid for the specified zoneid 3920 * to transmit packets. 3921 * It can be zero (meaning "reset") or an interface index assigned 3922 * to a non-VNI interface. (We don't use VNI interface to send packets.) 
3923 */ 3924 boolean_t 3925 ip_xmit_ifindex_valid(uint_t ifindex, zoneid_t zoneid, boolean_t isv6, 3926 ip_stack_t *ipst) 3927 { 3928 ill_t *ill; 3929 3930 if (ifindex == 0) 3931 return (B_TRUE); 3932 3933 ill = ill_lookup_on_ifindex_zoneid(ifindex, zoneid, isv6, ipst); 3934 if (ill == NULL) 3935 return (B_FALSE); 3936 if (IS_VNI(ill)) { 3937 ill_refrele(ill); 3938 return (B_FALSE); 3939 } 3940 ill_refrele(ill); 3941 return (B_TRUE); 3942 } 3943 3944 /* 3945 * Return the ifindex next in sequence after the passed in ifindex. 3946 * If there is no next ifindex for the given protocol, return 0. 3947 */ 3948 uint_t 3949 ill_get_next_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst) 3950 { 3951 phyint_t *phyi; 3952 phyint_t *phyi_initial; 3953 uint_t ifindex; 3954 3955 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3956 3957 if (index == 0) { 3958 phyi = avl_first( 3959 &ipst->ips_phyint_g_list->phyint_list_avl_by_index); 3960 } else { 3961 phyi = phyi_initial = avl_find( 3962 &ipst->ips_phyint_g_list->phyint_list_avl_by_index, 3963 (void *) &index, NULL); 3964 } 3965 3966 for (; phyi != NULL; 3967 phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 3968 phyi, AVL_AFTER)) { 3969 /* 3970 * If we're not returning the first interface in the tree 3971 * and we still haven't moved past the phyint_t that 3972 * corresponds to index, avl_walk needs to be called again 3973 */ 3974 if (!((index != 0) && (phyi == phyi_initial))) { 3975 if (isv6) { 3976 if ((phyi->phyint_illv6) && 3977 ILL_CAN_LOOKUP(phyi->phyint_illv6) && 3978 (phyi->phyint_illv6->ill_isv6 == 1)) 3979 break; 3980 } else { 3981 if ((phyi->phyint_illv4) && 3982 ILL_CAN_LOOKUP(phyi->phyint_illv4) && 3983 (phyi->phyint_illv4->ill_isv6 == 0)) 3984 break; 3985 } 3986 } 3987 } 3988 3989 rw_exit(&ipst->ips_ill_g_lock); 3990 3991 if (phyi != NULL) 3992 ifindex = phyi->phyint_ifindex; 3993 else 3994 ifindex = 0; 3995 3996 return (ifindex); 3997 } 3998 3999 /* 4000 * Return the ifindex for the named 
interface. 4001 * If there is no next ifindex for the interface, return 0. 4002 */ 4003 uint_t 4004 ill_get_ifindex_by_name(char *name, ip_stack_t *ipst) 4005 { 4006 phyint_t *phyi; 4007 avl_index_t where = 0; 4008 uint_t ifindex; 4009 4010 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4011 4012 if ((phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 4013 name, &where)) == NULL) { 4014 rw_exit(&ipst->ips_ill_g_lock); 4015 return (0); 4016 } 4017 4018 ifindex = phyi->phyint_ifindex; 4019 4020 rw_exit(&ipst->ips_ill_g_lock); 4021 4022 return (ifindex); 4023 } 4024 4025 /* 4026 * Return the ifindex to be used by upper layer protocols for instance 4027 * for IPV6_RECVPKTINFO. If IPMP this is the one for the upper ill. 4028 */ 4029 uint_t 4030 ill_get_upper_ifindex(const ill_t *ill) 4031 { 4032 if (IS_UNDER_IPMP(ill)) 4033 return (ipmp_ill_get_ipmp_ifindex(ill)); 4034 else 4035 return (ill->ill_phyint->phyint_ifindex); 4036 } 4037 4038 4039 /* 4040 * Obtain a reference to the ill. The ill_refcnt is a dynamic refcnt 4041 * that gives a running thread a reference to the ill. This reference must be 4042 * released by the thread when it is done accessing the ill and related 4043 * objects. ill_refcnt can not be used to account for static references 4044 * such as other structures pointing to an ill. Callers must generally 4045 * check whether an ill can be refheld by using ILL_CAN_LOOKUP macros 4046 * or be sure that the ill is not being deleted or changing state before 4047 * calling the refhold functions. A non-zero ill_refcnt ensures that the 4048 * ill won't change any of its critical state such as address, netmask etc. 
4049 */ 4050 void 4051 ill_refhold(ill_t *ill) 4052 { 4053 mutex_enter(&ill->ill_lock); 4054 ill->ill_refcnt++; 4055 ILL_TRACE_REF(ill); 4056 mutex_exit(&ill->ill_lock); 4057 } 4058 4059 void 4060 ill_refhold_locked(ill_t *ill) 4061 { 4062 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4063 ill->ill_refcnt++; 4064 ILL_TRACE_REF(ill); 4065 } 4066 4067 /* Returns true if we managed to get a refhold */ 4068 boolean_t 4069 ill_check_and_refhold(ill_t *ill) 4070 { 4071 mutex_enter(&ill->ill_lock); 4072 if (!ILL_IS_CONDEMNED(ill)) { 4073 ill_refhold_locked(ill); 4074 mutex_exit(&ill->ill_lock); 4075 return (B_TRUE); 4076 } 4077 mutex_exit(&ill->ill_lock); 4078 return (B_FALSE); 4079 } 4080 4081 /* 4082 * Must not be called while holding any locks. Otherwise if this is 4083 * the last reference to be released, there is a chance of recursive mutex 4084 * panic due to ill_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 4085 * to restart an ioctl. 4086 */ 4087 void 4088 ill_refrele(ill_t *ill) 4089 { 4090 mutex_enter(&ill->ill_lock); 4091 ASSERT(ill->ill_refcnt != 0); 4092 ill->ill_refcnt--; 4093 ILL_UNTRACE_REF(ill); 4094 if (ill->ill_refcnt != 0) { 4095 /* Every ire pointing to the ill adds 1 to ill_refcnt */ 4096 mutex_exit(&ill->ill_lock); 4097 return; 4098 } 4099 4100 /* Drops the ill_lock */ 4101 ipif_ill_refrele_tail(ill); 4102 } 4103 4104 /* 4105 * Obtain a weak reference count on the ill. This reference ensures the 4106 * ill won't be freed, but the ill may change any of its critical state 4107 * such as netmask, address etc. Returns an error if the ill has started 4108 * closing. 
4109 */ 4110 boolean_t 4111 ill_waiter_inc(ill_t *ill) 4112 { 4113 mutex_enter(&ill->ill_lock); 4114 if (ill->ill_state_flags & ILL_CONDEMNED) { 4115 mutex_exit(&ill->ill_lock); 4116 return (B_FALSE); 4117 } 4118 ill->ill_waiters++; 4119 mutex_exit(&ill->ill_lock); 4120 return (B_TRUE); 4121 } 4122 4123 void 4124 ill_waiter_dcr(ill_t *ill) 4125 { 4126 mutex_enter(&ill->ill_lock); 4127 ill->ill_waiters--; 4128 if (ill->ill_waiters == 0) 4129 cv_broadcast(&ill->ill_cv); 4130 mutex_exit(&ill->ill_lock); 4131 } 4132 4133 /* 4134 * ip_ll_subnet_defaults is called when we get the DL_INFO_ACK back from the 4135 * driver. We construct best guess defaults for lower level information that 4136 * we need. If an interface is brought up without injection of any overriding 4137 * information from outside, we have to be ready to go with these defaults. 4138 * When we get the first DL_INFO_ACK (from ip_open() sending a DL_INFO_REQ) 4139 * we primarely want the dl_provider_style. 4140 * The subsequent DL_INFO_ACK is received after doing a DL_ATTACH and DL_BIND 4141 * at which point we assume the other part of the information is valid. 4142 */ 4143 void 4144 ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) 4145 { 4146 uchar_t *brdcst_addr; 4147 uint_t brdcst_addr_length, phys_addr_length; 4148 t_scalar_t sap_length; 4149 dl_info_ack_t *dlia; 4150 ip_m_t *ipm; 4151 dl_qos_cl_sel1_t *sel1; 4152 int min_mtu; 4153 4154 ASSERT(IAM_WRITER_ILL(ill)); 4155 4156 /* 4157 * Till the ill is fully up the ill is not globally visible. 4158 * So no need for a lock. 4159 */ 4160 dlia = (dl_info_ack_t *)mp->b_rptr; 4161 ill->ill_mactype = dlia->dl_mac_type; 4162 4163 ipm = ip_m_lookup(dlia->dl_mac_type); 4164 if (ipm == NULL) { 4165 ipm = ip_m_lookup(DL_OTHER); 4166 ASSERT(ipm != NULL); 4167 } 4168 ill->ill_media = ipm; 4169 4170 /* 4171 * When the new DLPI stuff is ready we'll pull lengths 4172 * from dlia. 
4173 */ 4174 if (dlia->dl_version == DL_VERSION_2) { 4175 brdcst_addr_length = dlia->dl_brdcst_addr_length; 4176 brdcst_addr = mi_offset_param(mp, dlia->dl_brdcst_addr_offset, 4177 brdcst_addr_length); 4178 if (brdcst_addr == NULL) { 4179 brdcst_addr_length = 0; 4180 } 4181 sap_length = dlia->dl_sap_length; 4182 phys_addr_length = dlia->dl_addr_length - ABS(sap_length); 4183 ip1dbg(("ip: bcast_len %d, sap_len %d, phys_len %d\n", 4184 brdcst_addr_length, sap_length, phys_addr_length)); 4185 } else { 4186 brdcst_addr_length = 6; 4187 brdcst_addr = ip_six_byte_all_ones; 4188 sap_length = -2; 4189 phys_addr_length = brdcst_addr_length; 4190 } 4191 4192 ill->ill_bcast_addr_length = brdcst_addr_length; 4193 ill->ill_phys_addr_length = phys_addr_length; 4194 ill->ill_sap_length = sap_length; 4195 4196 /* 4197 * Synthetic DLPI types such as SUNW_DL_IPMP specify a zero SDU, 4198 * but we must ensure a minimum IP MTU is used since other bits of 4199 * IP will fly apart otherwise. 4200 */ 4201 min_mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU; 4202 ill->ill_max_frag = MAX(min_mtu, dlia->dl_max_sdu); 4203 ill->ill_current_frag = ill->ill_max_frag; 4204 ill->ill_mtu = ill->ill_max_frag; 4205 4206 ill->ill_type = ipm->ip_m_type; 4207 4208 if (!ill->ill_dlpi_style_set) { 4209 if (dlia->dl_provider_style == DL_STYLE2) 4210 ill->ill_needs_attach = 1; 4211 4212 phyint_flags_init(ill->ill_phyint, ill->ill_mactype); 4213 4214 /* 4215 * Allocate the first ipif on this ill. We don't delay it 4216 * further as ioctl handling assumes at least one ipif exists. 4217 * 4218 * At this point we don't know whether the ill is v4 or v6. 4219 * We will know this whan the SIOCSLIFNAME happens and 4220 * the correct value for ill_isv6 will be assigned in 4221 * ipif_set_values(). We need to hold the ill lock and 4222 * clear the ILL_LL_SUBNET_PENDING flag and atomically do 4223 * the wakeup. 
4224 */ 4225 (void) ipif_allocate(ill, 0, IRE_LOCAL, 4226 dlia->dl_provider_style != DL_STYLE2, B_TRUE, NULL); 4227 mutex_enter(&ill->ill_lock); 4228 ASSERT(ill->ill_dlpi_style_set == 0); 4229 ill->ill_dlpi_style_set = 1; 4230 ill->ill_state_flags &= ~ILL_LL_SUBNET_PENDING; 4231 cv_broadcast(&ill->ill_cv); 4232 mutex_exit(&ill->ill_lock); 4233 freemsg(mp); 4234 return; 4235 } 4236 ASSERT(ill->ill_ipif != NULL); 4237 /* 4238 * We know whether it is IPv4 or IPv6 now, as this is the 4239 * second DL_INFO_ACK we are recieving in response to the 4240 * DL_INFO_REQ sent in ipif_set_values. 4241 */ 4242 ill->ill_sap = (ill->ill_isv6) ? ipm->ip_m_ipv6sap : ipm->ip_m_ipv4sap; 4243 /* 4244 * Clear all the flags that were set based on ill_bcast_addr_length 4245 * and ill_phys_addr_length (in ipif_set_values) as these could have 4246 * changed now and we need to re-evaluate. 4247 */ 4248 ill->ill_flags &= ~(ILLF_MULTICAST | ILLF_NONUD | ILLF_NOARP); 4249 ill->ill_ipif->ipif_flags &= ~(IPIF_BROADCAST | IPIF_POINTOPOINT); 4250 4251 /* 4252 * Free ill_bcast_mp as things could have changed now. 4253 * 4254 * NOTE: The IPMP meta-interface is special-cased because it starts 4255 * with no underlying interfaces (and thus an unknown broadcast 4256 * address length), but we enforce that an interface is broadcast- 4257 * capable as part of allowing it to join a group. 4258 */ 4259 if (ill->ill_bcast_addr_length == 0 && !IS_IPMP(ill)) { 4260 if (ill->ill_bcast_mp != NULL) 4261 freemsg(ill->ill_bcast_mp); 4262 ill->ill_net_type = IRE_IF_NORESOLVER; 4263 4264 ill->ill_bcast_mp = ill_dlur_gen(NULL, 4265 ill->ill_phys_addr_length, 4266 ill->ill_sap, 4267 ill->ill_sap_length); 4268 4269 if (ill->ill_isv6) 4270 /* 4271 * Note: xresolv interfaces will eventually need NOARP 4272 * set here as well, but that will require those 4273 * external resolvers to have some knowledge of 4274 * that flag and act appropriately. Not to be changed 4275 * at present. 
4276 */ 4277 ill->ill_flags |= ILLF_NONUD; 4278 else 4279 ill->ill_flags |= ILLF_NOARP; 4280 4281 if (ill->ill_mactype == SUNW_DL_VNI) { 4282 ill->ill_ipif->ipif_flags |= IPIF_NOXMIT; 4283 } else if (ill->ill_phys_addr_length == 0 || 4284 ill->ill_mactype == DL_IPV4 || 4285 ill->ill_mactype == DL_IPV6) { 4286 /* 4287 * The underying link is point-to-point, so mark the 4288 * interface as such. We can do IP multicast over 4289 * such a link since it transmits all network-layer 4290 * packets to the remote side the same way. 4291 */ 4292 ill->ill_flags |= ILLF_MULTICAST; 4293 ill->ill_ipif->ipif_flags |= IPIF_POINTOPOINT; 4294 } 4295 } else { 4296 ill->ill_net_type = IRE_IF_RESOLVER; 4297 if (ill->ill_bcast_mp != NULL) 4298 freemsg(ill->ill_bcast_mp); 4299 ill->ill_bcast_mp = ill_dlur_gen(brdcst_addr, 4300 ill->ill_bcast_addr_length, ill->ill_sap, 4301 ill->ill_sap_length); 4302 /* 4303 * Later detect lack of DLPI driver multicast 4304 * capability by catching DL_ENABMULTI errors in 4305 * ip_rput_dlpi. 4306 */ 4307 ill->ill_flags |= ILLF_MULTICAST; 4308 if (!ill->ill_isv6) 4309 ill->ill_ipif->ipif_flags |= IPIF_BROADCAST; 4310 } 4311 4312 /* For IPMP, PHYI_IPMP should already be set by phyint_flags_init() */ 4313 if (ill->ill_mactype == SUNW_DL_IPMP) 4314 ASSERT(ill->ill_phyint->phyint_flags & PHYI_IPMP); 4315 4316 /* By default an interface does not support any CoS marking */ 4317 ill->ill_flags &= ~ILLF_COS_ENABLED; 4318 4319 /* 4320 * If we get QoS information in DL_INFO_ACK, the device supports 4321 * some form of CoS marking, set ILLF_COS_ENABLED. 4322 */ 4323 sel1 = (dl_qos_cl_sel1_t *)mi_offset_param(mp, dlia->dl_qos_offset, 4324 dlia->dl_qos_length); 4325 if ((sel1 != NULL) && (sel1->dl_qos_type == DL_QOS_CL_SEL1)) { 4326 ill->ill_flags |= ILLF_COS_ENABLED; 4327 } 4328 4329 /* Clear any previous error indication. 
*/ 4330 ill->ill_error = 0; 4331 freemsg(mp); 4332 } 4333 4334 /* 4335 * Perform various checks to verify that an address would make sense as a 4336 * local, remote, or subnet interface address. 4337 */ 4338 static boolean_t 4339 ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask) 4340 { 4341 ipaddr_t net_mask; 4342 4343 /* 4344 * Don't allow all zeroes, or all ones, but allow 4345 * all ones netmask. 4346 */ 4347 if ((net_mask = ip_net_mask(addr)) == 0) 4348 return (B_FALSE); 4349 /* A given netmask overrides the "guess" netmask */ 4350 if (subnet_mask != 0) 4351 net_mask = subnet_mask; 4352 if ((net_mask != ~(ipaddr_t)0) && ((addr == (addr & net_mask)) || 4353 (addr == (addr | ~net_mask)))) { 4354 return (B_FALSE); 4355 } 4356 4357 /* 4358 * Even if the netmask is all ones, we do not allow address to be 4359 * 255.255.255.255 4360 */ 4361 if (addr == INADDR_BROADCAST) 4362 return (B_FALSE); 4363 4364 if (CLASSD(addr)) 4365 return (B_FALSE); 4366 4367 return (B_TRUE); 4368 } 4369 4370 #define V6_IPIF_LINKLOCAL(p) \ 4371 IN6_IS_ADDR_LINKLOCAL(&(p)->ipif_v6lcl_addr) 4372 4373 /* 4374 * Compare two given ipifs and check if the second one is better than 4375 * the first one using the order of preference (not taking deprecated 4376 * into acount) specified in ipif_lookup_multicast(). 4377 */ 4378 static boolean_t 4379 ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, boolean_t isv6) 4380 { 4381 /* Check the least preferred first. */ 4382 if (IS_LOOPBACK(old_ipif->ipif_ill)) { 4383 /* If both ipifs are the same, use the first one. */ 4384 if (IS_LOOPBACK(new_ipif->ipif_ill)) 4385 return (B_FALSE); 4386 else 4387 return (B_TRUE); 4388 } 4389 4390 /* For IPv6, check for link local address. */ 4391 if (isv6 && V6_IPIF_LINKLOCAL(old_ipif)) { 4392 if (IS_LOOPBACK(new_ipif->ipif_ill) || 4393 V6_IPIF_LINKLOCAL(new_ipif)) { 4394 /* The second one is equal or less preferred. 
*/ 4395 return (B_FALSE); 4396 } else { 4397 return (B_TRUE); 4398 } 4399 } 4400 4401 /* Then check for point to point interface. */ 4402 if (old_ipif->ipif_flags & IPIF_POINTOPOINT) { 4403 if (IS_LOOPBACK(new_ipif->ipif_ill) || 4404 (isv6 && V6_IPIF_LINKLOCAL(new_ipif)) || 4405 (new_ipif->ipif_flags & IPIF_POINTOPOINT)) { 4406 return (B_FALSE); 4407 } else { 4408 return (B_TRUE); 4409 } 4410 } 4411 4412 /* old_ipif is a normal interface, so no need to use the new one. */ 4413 return (B_FALSE); 4414 } 4415 4416 /* 4417 * Find a mulitcast-capable ipif given an IP instance and zoneid. 4418 * The ipif must be up, and its ill must multicast-capable, not 4419 * condemned, not an underlying interface in an IPMP group, and 4420 * not a VNI interface. Order of preference: 4421 * 4422 * 1a. normal 4423 * 1b. normal, but deprecated 4424 * 2a. point to point 4425 * 2b. point to point, but deprecated 4426 * 3a. link local 4427 * 3b. link local, but deprecated 4428 * 4. loopback. 4429 */ 4430 static ipif_t * 4431 ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) 4432 { 4433 ill_t *ill; 4434 ill_walk_context_t ctx; 4435 ipif_t *ipif; 4436 ipif_t *saved_ipif = NULL; 4437 ipif_t *dep_ipif = NULL; 4438 4439 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4440 if (isv6) 4441 ill = ILL_START_WALK_V6(&ctx, ipst); 4442 else 4443 ill = ILL_START_WALK_V4(&ctx, ipst); 4444 4445 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 4446 mutex_enter(&ill->ill_lock); 4447 if (IS_VNI(ill) || IS_UNDER_IPMP(ill) || 4448 ILL_IS_CONDEMNED(ill) || 4449 !(ill->ill_flags & ILLF_MULTICAST)) { 4450 mutex_exit(&ill->ill_lock); 4451 continue; 4452 } 4453 for (ipif = ill->ill_ipif; ipif != NULL; 4454 ipif = ipif->ipif_next) { 4455 if (zoneid != ipif->ipif_zoneid && 4456 zoneid != ALL_ZONES && 4457 ipif->ipif_zoneid != ALL_ZONES) { 4458 continue; 4459 } 4460 if (!(ipif->ipif_flags & IPIF_UP) || 4461 IPIF_IS_CONDEMNED(ipif)) { 4462 continue; 4463 } 4464 4465 /* 4466 * Found one candidate. 
If it is deprecated, 4467 * remember it in dep_ipif. If it is not deprecated, 4468 * remember it in saved_ipif. 4469 */ 4470 if (ipif->ipif_flags & IPIF_DEPRECATED) { 4471 if (dep_ipif == NULL) { 4472 dep_ipif = ipif; 4473 } else if (ipif_comp_multi(dep_ipif, ipif, 4474 isv6)) { 4475 /* 4476 * If the previous dep_ipif does not 4477 * belong to the same ill, we've done 4478 * a ipif_refhold() on it. So we need 4479 * to release it. 4480 */ 4481 if (dep_ipif->ipif_ill != ill) 4482 ipif_refrele(dep_ipif); 4483 dep_ipif = ipif; 4484 } 4485 continue; 4486 } 4487 if (saved_ipif == NULL) { 4488 saved_ipif = ipif; 4489 } else { 4490 if (ipif_comp_multi(saved_ipif, ipif, isv6)) { 4491 if (saved_ipif->ipif_ill != ill) 4492 ipif_refrele(saved_ipif); 4493 saved_ipif = ipif; 4494 } 4495 } 4496 } 4497 /* 4498 * Before going to the next ill, do a ipif_refhold() on the 4499 * saved ones. 4500 */ 4501 if (saved_ipif != NULL && saved_ipif->ipif_ill == ill) 4502 ipif_refhold_locked(saved_ipif); 4503 if (dep_ipif != NULL && dep_ipif->ipif_ill == ill) 4504 ipif_refhold_locked(dep_ipif); 4505 mutex_exit(&ill->ill_lock); 4506 } 4507 rw_exit(&ipst->ips_ill_g_lock); 4508 4509 /* 4510 * If we have only the saved_ipif, return it. But if we have both 4511 * saved_ipif and dep_ipif, check to see which one is better. 
4512 */ 4513 if (saved_ipif != NULL) { 4514 if (dep_ipif != NULL) { 4515 if (ipif_comp_multi(saved_ipif, dep_ipif, isv6)) { 4516 ipif_refrele(saved_ipif); 4517 return (dep_ipif); 4518 } else { 4519 ipif_refrele(dep_ipif); 4520 return (saved_ipif); 4521 } 4522 } 4523 return (saved_ipif); 4524 } else { 4525 return (dep_ipif); 4526 } 4527 } 4528 4529 ill_t * 4530 ill_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) 4531 { 4532 ipif_t *ipif; 4533 ill_t *ill; 4534 4535 ipif = ipif_lookup_multicast(ipst, zoneid, isv6); 4536 if (ipif == NULL) 4537 return (NULL); 4538 4539 ill = ipif->ipif_ill; 4540 ill_refhold(ill); 4541 ipif_refrele(ipif); 4542 return (ill); 4543 } 4544 4545 /* 4546 * This function is called when an application does not specify an interface 4547 * to be used for multicast traffic (joining a group/sending data). It 4548 * calls ire_lookup_multi() to look for an interface route for the 4549 * specified multicast group. Doing this allows the administrator to add 4550 * prefix routes for multicast to indicate which interface to be used for 4551 * multicast traffic in the above scenario. The route could be for all 4552 * multicast (224.0/4), for a single multicast group (a /32 route) or 4553 * anything in between. If there is no such multicast route, we just find 4554 * any multicast capable interface and return it. The returned ipif 4555 * is refhold'ed. 4556 * 4557 * We support MULTIRT and RTF_SETSRC on the multicast routes added to the 4558 * unicast table. This is used by CGTP. 4559 */ 4560 ill_t * 4561 ill_lookup_group_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst, 4562 boolean_t *multirtp, ipaddr_t *setsrcp) 4563 { 4564 ill_t *ill; 4565 4566 ill = ire_lookup_multi_ill_v4(group, zoneid, ipst, multirtp, setsrcp); 4567 if (ill != NULL) 4568 return (ill); 4569 4570 return (ill_lookup_multicast(ipst, zoneid, B_FALSE)); 4571 } 4572 4573 /* 4574 * Look for an ipif with the specified interface address and destination. 
4575 * The destination address is used only for matching point-to-point interfaces. 4576 */ 4577 ipif_t * 4578 ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, ip_stack_t *ipst) 4579 { 4580 ipif_t *ipif; 4581 ill_t *ill; 4582 ill_walk_context_t ctx; 4583 4584 /* 4585 * First match all the point-to-point interfaces 4586 * before looking at non-point-to-point interfaces. 4587 * This is done to avoid returning non-point-to-point 4588 * ipif instead of unnumbered point-to-point ipif. 4589 */ 4590 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4591 ill = ILL_START_WALK_V4(&ctx, ipst); 4592 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 4593 mutex_enter(&ill->ill_lock); 4594 for (ipif = ill->ill_ipif; ipif != NULL; 4595 ipif = ipif->ipif_next) { 4596 /* Allow the ipif to be down */ 4597 if ((ipif->ipif_flags & IPIF_POINTOPOINT) && 4598 (ipif->ipif_lcl_addr == if_addr) && 4599 (ipif->ipif_pp_dst_addr == dst)) { 4600 if (!IPIF_IS_CONDEMNED(ipif)) { 4601 ipif_refhold_locked(ipif); 4602 mutex_exit(&ill->ill_lock); 4603 rw_exit(&ipst->ips_ill_g_lock); 4604 return (ipif); 4605 } 4606 } 4607 } 4608 mutex_exit(&ill->ill_lock); 4609 } 4610 rw_exit(&ipst->ips_ill_g_lock); 4611 4612 /* lookup the ipif based on interface address */ 4613 ipif = ipif_lookup_addr(if_addr, NULL, ALL_ZONES, ipst); 4614 ASSERT(ipif == NULL || !ipif->ipif_isv6); 4615 return (ipif); 4616 } 4617 4618 /* 4619 * Common function for ipif_lookup_addr() and ipif_lookup_addr_exact(). 4620 */ 4621 static ipif_t * 4622 ipif_lookup_addr_common(ipaddr_t addr, ill_t *match_ill, uint32_t match_flags, 4623 zoneid_t zoneid, ip_stack_t *ipst) 4624 { 4625 ipif_t *ipif; 4626 ill_t *ill; 4627 boolean_t ptp = B_FALSE; 4628 ill_walk_context_t ctx; 4629 boolean_t match_illgrp = (match_flags & IPIF_MATCH_ILLGRP); 4630 boolean_t no_duplicate = (match_flags & IPIF_MATCH_NONDUP); 4631 4632 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4633 /* 4634 * Repeat twice, first based on local addresses and 4635 * next time for pointopoint. 
4636 */ 4637 repeat: 4638 ill = ILL_START_WALK_V4(&ctx, ipst); 4639 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 4640 if (match_ill != NULL && ill != match_ill && 4641 (!match_illgrp || !IS_IN_SAME_ILLGRP(ill, match_ill))) { 4642 continue; 4643 } 4644 mutex_enter(&ill->ill_lock); 4645 for (ipif = ill->ill_ipif; ipif != NULL; 4646 ipif = ipif->ipif_next) { 4647 if (zoneid != ALL_ZONES && 4648 zoneid != ipif->ipif_zoneid && 4649 ipif->ipif_zoneid != ALL_ZONES) 4650 continue; 4651 4652 if (no_duplicate && !(ipif->ipif_flags & IPIF_UP)) 4653 continue; 4654 4655 /* Allow the ipif to be down */ 4656 if ((!ptp && (ipif->ipif_lcl_addr == addr) && 4657 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || 4658 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) && 4659 (ipif->ipif_pp_dst_addr == addr))) { 4660 if (!IPIF_IS_CONDEMNED(ipif)) { 4661 ipif_refhold_locked(ipif); 4662 mutex_exit(&ill->ill_lock); 4663 rw_exit(&ipst->ips_ill_g_lock); 4664 return (ipif); 4665 } 4666 } 4667 } 4668 mutex_exit(&ill->ill_lock); 4669 } 4670 4671 /* If we already did the ptp case, then we are done */ 4672 if (ptp) { 4673 rw_exit(&ipst->ips_ill_g_lock); 4674 return (NULL); 4675 } 4676 ptp = B_TRUE; 4677 goto repeat; 4678 } 4679 4680 /* 4681 * Lookup an ipif with the specified address. For point-to-point links we 4682 * look for matches on either the destination address or the local address, 4683 * but we skip the local address check if IPIF_UNNUMBERED is set. If the 4684 * `match_ill' argument is non-NULL, the lookup is restricted to that ill 4685 * (or illgrp if `match_ill' is in an IPMP group). 4686 */ 4687 ipif_t * 4688 ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, 4689 ip_stack_t *ipst) 4690 { 4691 return (ipif_lookup_addr_common(addr, match_ill, IPIF_MATCH_ILLGRP, 4692 zoneid, ipst)); 4693 } 4694 4695 /* 4696 * Lookup an ipif with the specified address. 
Similar to ipif_lookup_addr, 4697 * except that we will only return an address if it is not marked as 4698 * IPIF_DUPLICATE 4699 */ 4700 ipif_t * 4701 ipif_lookup_addr_nondup(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, 4702 ip_stack_t *ipst) 4703 { 4704 return (ipif_lookup_addr_common(addr, match_ill, 4705 (IPIF_MATCH_ILLGRP | IPIF_MATCH_NONDUP), 4706 zoneid, ipst)); 4707 } 4708 4709 /* 4710 * Special abbreviated version of ipif_lookup_addr() that doesn't match 4711 * `match_ill' across the IPMP group. This function is only needed in some 4712 * corner-cases; almost everything should use ipif_lookup_addr(). 4713 */ 4714 ipif_t * 4715 ipif_lookup_addr_exact(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst) 4716 { 4717 ASSERT(match_ill != NULL); 4718 return (ipif_lookup_addr_common(addr, match_ill, 0, ALL_ZONES, 4719 ipst)); 4720 } 4721 4722 /* 4723 * Look for an ipif with the specified address. For point-point links 4724 * we look for matches on either the destination address and the local 4725 * address, but we ignore the check on the local address if IPIF_UNNUMBERED 4726 * is set. 4727 * If the `match_ill' argument is non-NULL, the lookup is restricted to that 4728 * ill (or illgrp if `match_ill' is in an IPMP group). 4729 * Return the zoneid for the ipif which matches. ALL_ZONES if no match. 4730 */ 4731 zoneid_t 4732 ipif_lookup_addr_zoneid(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst) 4733 { 4734 zoneid_t zoneid; 4735 ipif_t *ipif; 4736 ill_t *ill; 4737 boolean_t ptp = B_FALSE; 4738 ill_walk_context_t ctx; 4739 4740 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4741 /* 4742 * Repeat twice, first based on local addresses and 4743 * next time for pointopoint. 
4744 */ 4745 repeat: 4746 ill = ILL_START_WALK_V4(&ctx, ipst); 4747 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 4748 if (match_ill != NULL && ill != match_ill && 4749 !IS_IN_SAME_ILLGRP(ill, match_ill)) { 4750 continue; 4751 } 4752 mutex_enter(&ill->ill_lock); 4753 for (ipif = ill->ill_ipif; ipif != NULL; 4754 ipif = ipif->ipif_next) { 4755 /* Allow the ipif to be down */ 4756 if ((!ptp && (ipif->ipif_lcl_addr == addr) && 4757 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || 4758 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) && 4759 (ipif->ipif_pp_dst_addr == addr)) && 4760 !(ipif->ipif_state_flags & IPIF_CONDEMNED)) { 4761 zoneid = ipif->ipif_zoneid; 4762 mutex_exit(&ill->ill_lock); 4763 rw_exit(&ipst->ips_ill_g_lock); 4764 /* 4765 * If ipif_zoneid was ALL_ZONES then we have 4766 * a trusted extensions shared IP address. 4767 * In that case GLOBAL_ZONEID works to send. 4768 */ 4769 if (zoneid == ALL_ZONES) 4770 zoneid = GLOBAL_ZONEID; 4771 return (zoneid); 4772 } 4773 } 4774 mutex_exit(&ill->ill_lock); 4775 } 4776 4777 /* If we already did the ptp case, then we are done */ 4778 if (ptp) { 4779 rw_exit(&ipst->ips_ill_g_lock); 4780 return (ALL_ZONES); 4781 } 4782 ptp = B_TRUE; 4783 goto repeat; 4784 } 4785 4786 /* 4787 * Look for an ipif that matches the specified remote address i.e. the 4788 * ipif that would receive the specified packet. 4789 * First look for directly connected interfaces and then do a recursive 4790 * IRE lookup and pick the first ipif corresponding to the source address in the 4791 * ire. 4792 * Returns: held ipif 4793 * 4794 * This is only used for ICMP_ADDRESS_MASK_REQUESTs 4795 */ 4796 ipif_t * 4797 ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid) 4798 { 4799 ipif_t *ipif; 4800 4801 ASSERT(!ill->ill_isv6); 4802 4803 /* 4804 * Someone could be changing this ipif currently or change it 4805 * after we return this. Thus a few packets could use the old 4806 * old values. 
However structure updates/creates (ire, ilg, ilm etc) 4807 * will atomically be updated or cleaned up with the new value 4808 * Thus we don't need a lock to check the flags or other attrs below. 4809 */ 4810 mutex_enter(&ill->ill_lock); 4811 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 4812 if (IPIF_IS_CONDEMNED(ipif)) 4813 continue; 4814 if (zoneid != ALL_ZONES && zoneid != ipif->ipif_zoneid && 4815 ipif->ipif_zoneid != ALL_ZONES) 4816 continue; 4817 /* Allow the ipif to be down */ 4818 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 4819 if ((ipif->ipif_pp_dst_addr == addr) || 4820 (!(ipif->ipif_flags & IPIF_UNNUMBERED) && 4821 ipif->ipif_lcl_addr == addr)) { 4822 ipif_refhold_locked(ipif); 4823 mutex_exit(&ill->ill_lock); 4824 return (ipif); 4825 } 4826 } else if (ipif->ipif_subnet == (addr & ipif->ipif_net_mask)) { 4827 ipif_refhold_locked(ipif); 4828 mutex_exit(&ill->ill_lock); 4829 return (ipif); 4830 } 4831 } 4832 mutex_exit(&ill->ill_lock); 4833 /* 4834 * For a remote destination it isn't possible to nail down a particular 4835 * ipif. 4836 */ 4837 4838 /* Pick the first interface */ 4839 ipif = ipif_get_next_ipif(NULL, ill); 4840 return (ipif); 4841 } 4842 4843 /* 4844 * This func does not prevent refcnt from increasing. 
But if 4845 * the caller has taken steps to that effect, then this func 4846 * can be used to determine whether the ill has become quiescent 4847 */ 4848 static boolean_t 4849 ill_is_quiescent(ill_t *ill) 4850 { 4851 ipif_t *ipif; 4852 4853 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4854 4855 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 4856 if (ipif->ipif_refcnt != 0) 4857 return (B_FALSE); 4858 } 4859 if (!ILL_DOWN_OK(ill) || ill->ill_refcnt != 0) { 4860 return (B_FALSE); 4861 } 4862 return (B_TRUE); 4863 } 4864 4865 boolean_t 4866 ill_is_freeable(ill_t *ill) 4867 { 4868 ipif_t *ipif; 4869 4870 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4871 4872 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 4873 if (ipif->ipif_refcnt != 0) { 4874 return (B_FALSE); 4875 } 4876 } 4877 if (!ILL_FREE_OK(ill) || ill->ill_refcnt != 0) { 4878 return (B_FALSE); 4879 } 4880 return (B_TRUE); 4881 } 4882 4883 /* 4884 * This func does not prevent refcnt from increasing. But if 4885 * the caller has taken steps to that effect, then this func 4886 * can be used to determine whether the ipif has become quiescent 4887 */ 4888 static boolean_t 4889 ipif_is_quiescent(ipif_t *ipif) 4890 { 4891 ill_t *ill; 4892 4893 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 4894 4895 if (ipif->ipif_refcnt != 0) 4896 return (B_FALSE); 4897 4898 ill = ipif->ipif_ill; 4899 if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 || 4900 ill->ill_logical_down) { 4901 return (B_TRUE); 4902 } 4903 4904 /* This is the last ipif going down or being deleted on this ill */ 4905 if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0) { 4906 return (B_FALSE); 4907 } 4908 4909 return (B_TRUE); 4910 } 4911 4912 /* 4913 * return true if the ipif can be destroyed: the ipif has to be quiescent 4914 * with zero references from ire/ilm to it. 
4915 */ 4916 static boolean_t 4917 ipif_is_freeable(ipif_t *ipif) 4918 { 4919 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 4920 ASSERT(ipif->ipif_id != 0); 4921 return (ipif->ipif_refcnt == 0); 4922 } 4923 4924 /* 4925 * The ipif/ill/ire has been refreled. Do the tail processing. 4926 * Determine if the ipif or ill in question has become quiescent and if so 4927 * wakeup close and/or restart any queued pending ioctl that is waiting 4928 * for the ipif_down (or ill_down) 4929 */ 4930 void 4931 ipif_ill_refrele_tail(ill_t *ill) 4932 { 4933 mblk_t *mp; 4934 conn_t *connp; 4935 ipsq_t *ipsq; 4936 ipxop_t *ipx; 4937 ipif_t *ipif; 4938 dl_notify_ind_t *dlindp; 4939 4940 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4941 4942 if ((ill->ill_state_flags & ILL_CONDEMNED) && ill_is_freeable(ill)) { 4943 /* ip_modclose() may be waiting */ 4944 cv_broadcast(&ill->ill_cv); 4945 } 4946 4947 ipsq = ill->ill_phyint->phyint_ipsq; 4948 mutex_enter(&ipsq->ipsq_lock); 4949 ipx = ipsq->ipsq_xop; 4950 mutex_enter(&ipx->ipx_lock); 4951 if (ipx->ipx_waitfor == 0) /* no one's waiting; bail */ 4952 goto unlock; 4953 4954 ASSERT(ipx->ipx_pending_mp != NULL && ipx->ipx_pending_ipif != NULL); 4955 4956 ipif = ipx->ipx_pending_ipif; 4957 if (ipif->ipif_ill != ill) /* wait is for another ill; bail */ 4958 goto unlock; 4959 4960 switch (ipx->ipx_waitfor) { 4961 case IPIF_DOWN: 4962 if (!ipif_is_quiescent(ipif)) 4963 goto unlock; 4964 break; 4965 case IPIF_FREE: 4966 if (!ipif_is_freeable(ipif)) 4967 goto unlock; 4968 break; 4969 case ILL_DOWN: 4970 if (!ill_is_quiescent(ill)) 4971 goto unlock; 4972 break; 4973 case ILL_FREE: 4974 /* 4975 * ILL_FREE is only for loopback; normal ill teardown waits 4976 * synchronously in ip_modclose() without using ipx_waitfor, 4977 * handled by the cv_broadcast() at the top of this function. 
4978 */ 4979 if (!ill_is_freeable(ill)) 4980 goto unlock; 4981 break; 4982 default: 4983 cmn_err(CE_PANIC, "ipsq: %p unknown ipx_waitfor %d\n", 4984 (void *)ipsq, ipx->ipx_waitfor); 4985 } 4986 4987 ill_refhold_locked(ill); /* for qwriter_ip() call below */ 4988 mutex_exit(&ipx->ipx_lock); 4989 mp = ipsq_pending_mp_get(ipsq, &connp); 4990 mutex_exit(&ipsq->ipsq_lock); 4991 mutex_exit(&ill->ill_lock); 4992 4993 ASSERT(mp != NULL); 4994 /* 4995 * NOTE: all of the qwriter_ip() calls below use CUR_OP since 4996 * we can only get here when the current operation decides it 4997 * it needs to quiesce via ipsq_pending_mp_add(). 4998 */ 4999 switch (mp->b_datap->db_type) { 5000 case M_PCPROTO: 5001 case M_PROTO: 5002 /* 5003 * For now, only DL_NOTIFY_IND messages can use this facility. 5004 */ 5005 dlindp = (dl_notify_ind_t *)mp->b_rptr; 5006 ASSERT(dlindp->dl_primitive == DL_NOTIFY_IND); 5007 5008 switch (dlindp->dl_notification) { 5009 case DL_NOTE_PHYS_ADDR: 5010 qwriter_ip(ill, ill->ill_rq, mp, 5011 ill_set_phys_addr_tail, CUR_OP, B_TRUE); 5012 return; 5013 case DL_NOTE_REPLUMB: 5014 qwriter_ip(ill, ill->ill_rq, mp, 5015 ill_replumb_tail, CUR_OP, B_TRUE); 5016 return; 5017 default: 5018 ASSERT(0); 5019 ill_refrele(ill); 5020 } 5021 break; 5022 5023 case M_ERROR: 5024 case M_HANGUP: 5025 qwriter_ip(ill, ill->ill_rq, mp, ipif_all_down_tail, CUR_OP, 5026 B_TRUE); 5027 return; 5028 5029 case M_IOCTL: 5030 case M_IOCDATA: 5031 qwriter_ip(ill, (connp != NULL ? 
CONNP_TO_WQ(connp) : 5032 ill->ill_wq), mp, ip_reprocess_ioctl, CUR_OP, B_TRUE); 5033 return; 5034 5035 default: 5036 cmn_err(CE_PANIC, "ipif_ill_refrele_tail mp %p " 5037 "db_type %d\n", (void *)mp, mp->b_datap->db_type); 5038 } 5039 return; 5040 unlock: 5041 mutex_exit(&ipsq->ipsq_lock); 5042 mutex_exit(&ipx->ipx_lock); 5043 mutex_exit(&ill->ill_lock); 5044 } 5045 5046 #ifdef DEBUG 5047 /* Reuse trace buffer from beginning (if reached the end) and record trace */ 5048 static void 5049 th_trace_rrecord(th_trace_t *th_trace) 5050 { 5051 tr_buf_t *tr_buf; 5052 uint_t lastref; 5053 5054 lastref = th_trace->th_trace_lastref; 5055 lastref++; 5056 if (lastref == TR_BUF_MAX) 5057 lastref = 0; 5058 th_trace->th_trace_lastref = lastref; 5059 tr_buf = &th_trace->th_trbuf[lastref]; 5060 tr_buf->tr_time = ddi_get_lbolt(); 5061 tr_buf->tr_depth = getpcstack(tr_buf->tr_stack, TR_STACK_DEPTH); 5062 } 5063 5064 static void 5065 th_trace_free(void *value) 5066 { 5067 th_trace_t *th_trace = value; 5068 5069 ASSERT(th_trace->th_refcnt == 0); 5070 kmem_free(th_trace, sizeof (*th_trace)); 5071 } 5072 5073 /* 5074 * Find or create the per-thread hash table used to track object references. 5075 * The ipst argument is NULL if we shouldn't allocate. 5076 * 5077 * Accesses per-thread data, so there's no need to lock here. 5078 */ 5079 static mod_hash_t * 5080 th_trace_gethash(ip_stack_t *ipst) 5081 { 5082 th_hash_t *thh; 5083 5084 if ((thh = tsd_get(ip_thread_data)) == NULL && ipst != NULL) { 5085 mod_hash_t *mh; 5086 char name[256]; 5087 size_t objsize, rshift; 5088 int retv; 5089 5090 if ((thh = kmem_alloc(sizeof (*thh), KM_NOSLEEP)) == NULL) 5091 return (NULL); 5092 (void) snprintf(name, sizeof (name), "th_trace_%p", 5093 (void *)curthread); 5094 5095 /* 5096 * We use mod_hash_create_extended here rather than the more 5097 * obvious mod_hash_create_ptrhash because the latter has a 5098 * hard-coded KM_SLEEP, and we'd prefer to fail rather than 5099 * block. 
5100 */ 5101 objsize = MAX(MAX(sizeof (ill_t), sizeof (ipif_t)), 5102 MAX(sizeof (ire_t), sizeof (ncec_t))); 5103 rshift = highbit(objsize); 5104 mh = mod_hash_create_extended(name, 64, mod_hash_null_keydtor, 5105 th_trace_free, mod_hash_byptr, (void *)rshift, 5106 mod_hash_ptrkey_cmp, KM_NOSLEEP); 5107 if (mh == NULL) { 5108 kmem_free(thh, sizeof (*thh)); 5109 return (NULL); 5110 } 5111 thh->thh_hash = mh; 5112 thh->thh_ipst = ipst; 5113 /* 5114 * We trace ills, ipifs, ires, and nces. All of these are 5115 * per-IP-stack, so the lock on the thread list is as well. 5116 */ 5117 rw_enter(&ip_thread_rwlock, RW_WRITER); 5118 list_insert_tail(&ip_thread_list, thh); 5119 rw_exit(&ip_thread_rwlock); 5120 retv = tsd_set(ip_thread_data, thh); 5121 ASSERT(retv == 0); 5122 } 5123 return (thh != NULL ? thh->thh_hash : NULL); 5124 } 5125 5126 boolean_t 5127 th_trace_ref(const void *obj, ip_stack_t *ipst) 5128 { 5129 th_trace_t *th_trace; 5130 mod_hash_t *mh; 5131 mod_hash_val_t val; 5132 5133 if ((mh = th_trace_gethash(ipst)) == NULL) 5134 return (B_FALSE); 5135 5136 /* 5137 * Attempt to locate the trace buffer for this obj and thread. 5138 * If it does not exist, then allocate a new trace buffer and 5139 * insert into the hash. 
5140 */ 5141 if (mod_hash_find(mh, (mod_hash_key_t)obj, &val) == MH_ERR_NOTFOUND) { 5142 th_trace = kmem_zalloc(sizeof (th_trace_t), KM_NOSLEEP); 5143 if (th_trace == NULL) 5144 return (B_FALSE); 5145 5146 th_trace->th_id = curthread; 5147 if (mod_hash_insert(mh, (mod_hash_key_t)obj, 5148 (mod_hash_val_t)th_trace) != 0) { 5149 kmem_free(th_trace, sizeof (th_trace_t)); 5150 return (B_FALSE); 5151 } 5152 } else { 5153 th_trace = (th_trace_t *)val; 5154 } 5155 5156 ASSERT(th_trace->th_refcnt >= 0 && 5157 th_trace->th_refcnt < TR_BUF_MAX - 1); 5158 5159 th_trace->th_refcnt++; 5160 th_trace_rrecord(th_trace); 5161 return (B_TRUE); 5162 } 5163 5164 /* 5165 * For the purpose of tracing a reference release, we assume that global 5166 * tracing is always on and that the same thread initiated the reference hold 5167 * is releasing. 5168 */ 5169 void 5170 th_trace_unref(const void *obj) 5171 { 5172 int retv; 5173 mod_hash_t *mh; 5174 th_trace_t *th_trace; 5175 mod_hash_val_t val; 5176 5177 mh = th_trace_gethash(NULL); 5178 retv = mod_hash_find(mh, (mod_hash_key_t)obj, &val); 5179 ASSERT(retv == 0); 5180 th_trace = (th_trace_t *)val; 5181 5182 ASSERT(th_trace->th_refcnt > 0); 5183 th_trace->th_refcnt--; 5184 th_trace_rrecord(th_trace); 5185 } 5186 5187 /* 5188 * If tracing has been disabled, then we assume that the reference counts are 5189 * now useless, and we clear them out before destroying the entries. 
5190 */ 5191 void 5192 th_trace_cleanup(const void *obj, boolean_t trace_disable) 5193 { 5194 th_hash_t *thh; 5195 mod_hash_t *mh; 5196 mod_hash_val_t val; 5197 th_trace_t *th_trace; 5198 int retv; 5199 5200 rw_enter(&ip_thread_rwlock, RW_READER); 5201 for (thh = list_head(&ip_thread_list); thh != NULL; 5202 thh = list_next(&ip_thread_list, thh)) { 5203 if (mod_hash_find(mh = thh->thh_hash, (mod_hash_key_t)obj, 5204 &val) == 0) { 5205 th_trace = (th_trace_t *)val; 5206 if (trace_disable) 5207 th_trace->th_refcnt = 0; 5208 retv = mod_hash_destroy(mh, (mod_hash_key_t)obj); 5209 ASSERT(retv == 0); 5210 } 5211 } 5212 rw_exit(&ip_thread_rwlock); 5213 } 5214 5215 void 5216 ipif_trace_ref(ipif_t *ipif) 5217 { 5218 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 5219 5220 if (ipif->ipif_trace_disable) 5221 return; 5222 5223 if (!th_trace_ref(ipif, ipif->ipif_ill->ill_ipst)) { 5224 ipif->ipif_trace_disable = B_TRUE; 5225 ipif_trace_cleanup(ipif); 5226 } 5227 } 5228 5229 void 5230 ipif_untrace_ref(ipif_t *ipif) 5231 { 5232 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 5233 5234 if (!ipif->ipif_trace_disable) 5235 th_trace_unref(ipif); 5236 } 5237 5238 void 5239 ill_trace_ref(ill_t *ill) 5240 { 5241 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5242 5243 if (ill->ill_trace_disable) 5244 return; 5245 5246 if (!th_trace_ref(ill, ill->ill_ipst)) { 5247 ill->ill_trace_disable = B_TRUE; 5248 ill_trace_cleanup(ill); 5249 } 5250 } 5251 5252 void 5253 ill_untrace_ref(ill_t *ill) 5254 { 5255 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5256 5257 if (!ill->ill_trace_disable) 5258 th_trace_unref(ill); 5259 } 5260 5261 /* 5262 * Called when ipif is unplumbed or when memory alloc fails. Note that on 5263 * failure, ipif_trace_disable is set. 5264 */ 5265 static void 5266 ipif_trace_cleanup(const ipif_t *ipif) 5267 { 5268 th_trace_cleanup(ipif, ipif->ipif_trace_disable); 5269 } 5270 5271 /* 5272 * Called when ill is unplumbed or when memory alloc fails. Note that on 5273 * failure, ill_trace_disable is set. 
5274 */ 5275 static void 5276 ill_trace_cleanup(const ill_t *ill) 5277 { 5278 th_trace_cleanup(ill, ill->ill_trace_disable); 5279 } 5280 #endif /* DEBUG */ 5281 5282 void 5283 ipif_refhold_locked(ipif_t *ipif) 5284 { 5285 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 5286 ipif->ipif_refcnt++; 5287 IPIF_TRACE_REF(ipif); 5288 } 5289 5290 void 5291 ipif_refhold(ipif_t *ipif) 5292 { 5293 ill_t *ill; 5294 5295 ill = ipif->ipif_ill; 5296 mutex_enter(&ill->ill_lock); 5297 ipif->ipif_refcnt++; 5298 IPIF_TRACE_REF(ipif); 5299 mutex_exit(&ill->ill_lock); 5300 } 5301 5302 /* 5303 * Must not be called while holding any locks. Otherwise if this is 5304 * the last reference to be released there is a chance of recursive mutex 5305 * panic due to ipif_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 5306 * to restart an ioctl. 5307 */ 5308 void 5309 ipif_refrele(ipif_t *ipif) 5310 { 5311 ill_t *ill; 5312 5313 ill = ipif->ipif_ill; 5314 5315 mutex_enter(&ill->ill_lock); 5316 ASSERT(ipif->ipif_refcnt != 0); 5317 ipif->ipif_refcnt--; 5318 IPIF_UNTRACE_REF(ipif); 5319 if (ipif->ipif_refcnt != 0) { 5320 mutex_exit(&ill->ill_lock); 5321 return; 5322 } 5323 5324 /* Drops the ill_lock */ 5325 ipif_ill_refrele_tail(ill); 5326 } 5327 5328 ipif_t * 5329 ipif_get_next_ipif(ipif_t *curr, ill_t *ill) 5330 { 5331 ipif_t *ipif; 5332 5333 mutex_enter(&ill->ill_lock); 5334 for (ipif = (curr == NULL ? 
ill->ill_ipif : curr->ipif_next); 5335 ipif != NULL; ipif = ipif->ipif_next) { 5336 if (IPIF_IS_CONDEMNED(ipif)) 5337 continue; 5338 ipif_refhold_locked(ipif); 5339 mutex_exit(&ill->ill_lock); 5340 return (ipif); 5341 } 5342 mutex_exit(&ill->ill_lock); 5343 return (NULL); 5344 } 5345 5346 /* 5347 * TODO: make this table extendible at run time 5348 * Return a pointer to the mac type info for 'mac_type' 5349 */ 5350 static ip_m_t * 5351 ip_m_lookup(t_uscalar_t mac_type) 5352 { 5353 ip_m_t *ipm; 5354 5355 for (ipm = ip_m_tbl; ipm < A_END(ip_m_tbl); ipm++) 5356 if (ipm->ip_m_mac_type == mac_type) 5357 return (ipm); 5358 return (NULL); 5359 } 5360 5361 /* 5362 * Make a link layer address from the multicast IP address *addr. 5363 * To form the link layer address, invoke the ip_m_v*mapping function 5364 * associated with the link-layer type. 5365 */ 5366 void 5367 ip_mcast_mapping(ill_t *ill, uchar_t *addr, uchar_t *hwaddr) 5368 { 5369 ip_m_t *ipm; 5370 5371 if (ill->ill_net_type == IRE_IF_NORESOLVER) 5372 return; 5373 5374 ASSERT(addr != NULL); 5375 5376 ipm = ip_m_lookup(ill->ill_mactype); 5377 if (ipm == NULL || 5378 (ill->ill_isv6 && ipm->ip_m_v6mapping == NULL) || 5379 (!ill->ill_isv6 && ipm->ip_m_v4mapping == NULL)) { 5380 ip0dbg(("no mapping for ill %s mactype 0x%x\n", 5381 ill->ill_name, ill->ill_mactype)); 5382 return; 5383 } 5384 if (ill->ill_isv6) 5385 (*ipm->ip_m_v6mapping)(ill, addr, hwaddr); 5386 else 5387 (*ipm->ip_m_v4mapping)(ill, addr, hwaddr); 5388 } 5389 5390 /* 5391 * ip_rt_add is called to add an IPv4 route to the forwarding table. 5392 * ill is passed in to associate it with the correct interface. 5393 * If ire_arg is set, then we return the held IRE in that location. 
5394 */ 5395 int 5396 ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, 5397 ipaddr_t src_addr, int flags, ill_t *ill, ire_t **ire_arg, 5398 boolean_t ioctl_msg, struct rtsa_s *sp, ip_stack_t *ipst, zoneid_t zoneid) 5399 { 5400 ire_t *ire, *nire; 5401 ire_t *gw_ire = NULL; 5402 ipif_t *ipif = NULL; 5403 uint_t type; 5404 int match_flags = MATCH_IRE_TYPE; 5405 tsol_gc_t *gc = NULL; 5406 tsol_gcgrp_t *gcgrp = NULL; 5407 boolean_t gcgrp_xtraref = B_FALSE; 5408 boolean_t cgtp_broadcast; 5409 5410 ip1dbg(("ip_rt_add:")); 5411 5412 if (ire_arg != NULL) 5413 *ire_arg = NULL; 5414 5415 /* 5416 * If this is the case of RTF_HOST being set, then we set the netmask 5417 * to all ones (regardless if one was supplied). 5418 */ 5419 if (flags & RTF_HOST) 5420 mask = IP_HOST_MASK; 5421 5422 /* 5423 * Prevent routes with a zero gateway from being created (since 5424 * interfaces can currently be plumbed and brought up no assigned 5425 * address). 5426 */ 5427 if (gw_addr == 0) 5428 return (ENETUNREACH); 5429 /* 5430 * Get the ipif, if any, corresponding to the gw_addr 5431 * If -ifp was specified we restrict ourselves to the ill, otherwise 5432 * we match on the gatway and destination to handle unnumbered pt-pt 5433 * interfaces. 5434 */ 5435 if (ill != NULL) 5436 ipif = ipif_lookup_addr(gw_addr, ill, ALL_ZONES, ipst); 5437 else 5438 ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst); 5439 if (ipif != NULL) { 5440 if (IS_VNI(ipif->ipif_ill)) { 5441 ipif_refrele(ipif); 5442 return (EINVAL); 5443 } 5444 } 5445 5446 /* 5447 * GateD will attempt to create routes with a loopback interface 5448 * address as the gateway and with RTF_GATEWAY set. We allow 5449 * these routes to be added, but create them as interface routes 5450 * since the gateway is an interface address. 
5451 */ 5452 if ((ipif != NULL) && (ipif->ipif_ire_type == IRE_LOOPBACK)) { 5453 flags &= ~RTF_GATEWAY; 5454 if (gw_addr == INADDR_LOOPBACK && dst_addr == INADDR_LOOPBACK && 5455 mask == IP_HOST_MASK) { 5456 ire = ire_ftable_lookup_v4(dst_addr, 0, 0, IRE_LOOPBACK, 5457 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst, 5458 NULL); 5459 if (ire != NULL) { 5460 ire_refrele(ire); 5461 ipif_refrele(ipif); 5462 return (EEXIST); 5463 } 5464 ip1dbg(("ip_rt_add: 0x%p creating IRE 0x%x" 5465 "for 0x%x\n", (void *)ipif, 5466 ipif->ipif_ire_type, 5467 ntohl(ipif->ipif_lcl_addr))); 5468 ire = ire_create( 5469 (uchar_t *)&dst_addr, /* dest address */ 5470 (uchar_t *)&mask, /* mask */ 5471 NULL, /* no gateway */ 5472 ipif->ipif_ire_type, /* LOOPBACK */ 5473 ipif->ipif_ill, 5474 zoneid, 5475 (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE : 0, 5476 NULL, 5477 ipst); 5478 5479 if (ire == NULL) { 5480 ipif_refrele(ipif); 5481 return (ENOMEM); 5482 } 5483 /* src address assigned by the caller? */ 5484 if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC)) 5485 ire->ire_setsrc_addr = src_addr; 5486 5487 nire = ire_add(ire); 5488 if (nire == NULL) { 5489 /* 5490 * In the result of failure, ire_add() will have 5491 * already deleted the ire in question, so there 5492 * is no need to do that here. 5493 */ 5494 ipif_refrele(ipif); 5495 return (ENOMEM); 5496 } 5497 /* 5498 * Check if it was a duplicate entry. This handles 5499 * the case of two racing route adds for the same route 5500 */ 5501 if (nire != ire) { 5502 ASSERT(nire->ire_identical_ref > 1); 5503 ire_delete(nire); 5504 ire_refrele(nire); 5505 ipif_refrele(ipif); 5506 return (EEXIST); 5507 } 5508 ire = nire; 5509 goto save_ire; 5510 } 5511 } 5512 5513 /* 5514 * The routes for multicast with CGTP are quite special in that 5515 * the gateway is the local interface address, yet RTF_GATEWAY 5516 * is set. We turn off RTF_GATEWAY to provide compatibility with 5517 * this undocumented and unusual use of multicast routes. 
5518 */ 5519 if ((flags & RTF_MULTIRT) && ipif != NULL) 5520 flags &= ~RTF_GATEWAY; 5521 5522 /* 5523 * Traditionally, interface routes are ones where RTF_GATEWAY isn't set 5524 * and the gateway address provided is one of the system's interface 5525 * addresses. By using the routing socket interface and supplying an 5526 * RTA_IFP sockaddr with an interface index, an alternate method of 5527 * specifying an interface route to be created is available which uses 5528 * the interface index that specifies the outgoing interface rather than 5529 * the address of an outgoing interface (which may not be able to 5530 * uniquely identify an interface). When coupled with the RTF_GATEWAY 5531 * flag, routes can be specified which not only specify the next-hop to 5532 * be used when routing to a certain prefix, but also which outgoing 5533 * interface should be used. 5534 * 5535 * Previously, interfaces would have unique addresses assigned to them 5536 * and so the address assigned to a particular interface could be used 5537 * to identify a particular interface. One exception to this was the 5538 * case of an unnumbered interface (where IPIF_UNNUMBERED was set). 5539 * 5540 * With the advent of IPv6 and its link-local addresses, this 5541 * restriction was relaxed and interfaces could share addresses between 5542 * themselves. In fact, typically all of the link-local interfaces on 5543 * an IPv6 node or router will have the same link-local address. In 5544 * order to differentiate between these interfaces, the use of an 5545 * interface index is necessary and this index can be carried inside a 5546 * RTA_IFP sockaddr (which is actually a sockaddr_dl). One restriction 5547 * of using the interface index, however, is that all of the ipif's that 5548 * are part of an ill have the same index and so the RTA_IFP sockaddr 5549 * cannot be used to differentiate between ipif's (or logical 5550 * interfaces) that belong to the same ill (physical interface). 
5551 * 5552 * For example, in the following case involving IPv4 interfaces and 5553 * logical interfaces 5554 * 5555 * 192.0.2.32 255.255.255.224 192.0.2.33 U if0 5556 * 192.0.2.32 255.255.255.224 192.0.2.34 U if0 5557 * 192.0.2.32 255.255.255.224 192.0.2.35 U if0 5558 * 5559 * the ipif's corresponding to each of these interface routes can be 5560 * uniquely identified by the "gateway" (actually interface address). 5561 * 5562 * In this case involving multiple IPv6 default routes to a particular 5563 * link-local gateway, the use of RTA_IFP is necessary to specify which 5564 * default route is of interest: 5565 * 5566 * default fe80::123:4567:89ab:cdef U if0 5567 * default fe80::123:4567:89ab:cdef U if1 5568 */ 5569 5570 /* RTF_GATEWAY not set */ 5571 if (!(flags & RTF_GATEWAY)) { 5572 if (sp != NULL) { 5573 ip2dbg(("ip_rt_add: gateway security attributes " 5574 "cannot be set with interface route\n")); 5575 if (ipif != NULL) 5576 ipif_refrele(ipif); 5577 return (EINVAL); 5578 } 5579 5580 /* 5581 * Whether or not ill (RTA_IFP) is set, we require that 5582 * the gateway is one of our local addresses. 5583 */ 5584 if (ipif == NULL) 5585 return (ENETUNREACH); 5586 5587 /* 5588 * We use MATCH_IRE_ILL here. If the caller specified an 5589 * interface (from the RTA_IFP sockaddr) we use it, otherwise 5590 * we use the ill derived from the gateway address. 5591 * We can always match the gateway address since we record it 5592 * in ire_gateway_addr. 5593 * We don't allow RTA_IFP to specify a different ill than the 5594 * one matching the ipif to make sure we can delete the route. 5595 */ 5596 match_flags |= MATCH_IRE_GW | MATCH_IRE_ILL; 5597 if (ill == NULL) { 5598 ill = ipif->ipif_ill; 5599 } else if (ill != ipif->ipif_ill) { 5600 ipif_refrele(ipif); 5601 return (EINVAL); 5602 } 5603 5604 /* 5605 * We check for an existing entry at this point. 
5606 * 5607 * Since a netmask isn't passed in via the ioctl interface 5608 * (SIOCADDRT), we don't check for a matching netmask in that 5609 * case. 5610 */ 5611 if (!ioctl_msg) 5612 match_flags |= MATCH_IRE_MASK; 5613 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, 5614 IRE_INTERFACE, ill, ALL_ZONES, NULL, match_flags, 0, ipst, 5615 NULL); 5616 if (ire != NULL) { 5617 ire_refrele(ire); 5618 ipif_refrele(ipif); 5619 return (EEXIST); 5620 } 5621 5622 /* 5623 * Create a copy of the IRE_LOOPBACK, IRE_IF_NORESOLVER or 5624 * IRE_IF_RESOLVER with the modified address, netmask, and 5625 * gateway. 5626 */ 5627 ire = ire_create( 5628 (uchar_t *)&dst_addr, 5629 (uint8_t *)&mask, 5630 (uint8_t *)&gw_addr, 5631 ill->ill_net_type, 5632 ill, 5633 zoneid, 5634 flags, 5635 NULL, 5636 ipst); 5637 if (ire == NULL) { 5638 ipif_refrele(ipif); 5639 return (ENOMEM); 5640 } 5641 5642 /* 5643 * Some software (for example, GateD and Sun Cluster) attempts 5644 * to create (what amount to) IRE_PREFIX routes with the 5645 * loopback address as the gateway. This is primarily done to 5646 * set up prefixes with the RTF_REJECT flag set (for example, 5647 * when generating aggregate routes.) 5648 * 5649 * If the IRE type (as defined by ill->ill_net_type) is 5650 * IRE_LOOPBACK, then we map the request into a 5651 * IRE_IF_NORESOLVER. We also OR in the RTF_BLACKHOLE flag as 5652 * these interface routes, by definition, can only be that. 5653 * 5654 * Needless to say, the real IRE_LOOPBACK is NOT created by this 5655 * routine, but rather using ire_create() directly. 5656 * 5657 */ 5658 if (ill->ill_net_type == IRE_LOOPBACK) { 5659 ire->ire_type = IRE_IF_NORESOLVER; 5660 ire->ire_flags |= RTF_BLACKHOLE; 5661 } 5662 5663 /* src address assigned by the caller? 
*/ 5664 if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC)) 5665 ire->ire_setsrc_addr = src_addr; 5666 5667 nire = ire_add(ire); 5668 if (nire == NULL) { 5669 /* 5670 * In the result of failure, ire_add() will have 5671 * already deleted the ire in question, so there 5672 * is no need to do that here. 5673 */ 5674 ipif_refrele(ipif); 5675 return (ENOMEM); 5676 } 5677 /* 5678 * Check if it was a duplicate entry. This handles 5679 * the case of two racing route adds for the same route 5680 */ 5681 if (nire != ire) { 5682 ire_delete(nire); 5683 ire_refrele(nire); 5684 ipif_refrele(ipif); 5685 return (EEXIST); 5686 } 5687 ire = nire; 5688 goto save_ire; 5689 } 5690 5691 /* 5692 * Get an interface IRE for the specified gateway. 5693 * If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the 5694 * gateway, it is currently unreachable and we fail the request 5695 * accordingly. We reject any RTF_GATEWAY routes where the gateway 5696 * is an IRE_LOCAL or IRE_LOOPBACK. 5697 * If RTA_IFP was specified we look on that particular ill. 5698 */ 5699 if (ill != NULL) 5700 match_flags |= MATCH_IRE_ILL; 5701 5702 /* Check whether the gateway is reachable. */ 5703 again: 5704 type = IRE_INTERFACE | IRE_LOCAL | IRE_LOOPBACK; 5705 if (flags & RTF_INDIRECT) 5706 type |= IRE_OFFLINK; 5707 5708 gw_ire = ire_ftable_lookup_v4(gw_addr, 0, 0, type, ill, 5709 ALL_ZONES, NULL, match_flags, 0, ipst, NULL); 5710 if (gw_ire == NULL) { 5711 /* 5712 * With IPMP, we allow host routes to influence in.mpathd's 5713 * target selection. However, if the test addresses are on 5714 * their own network, the above lookup will fail since the 5715 * underlying IRE_INTERFACEs are marked hidden. So allow 5716 * hidden test IREs to be found and try again. 
5717 */ 5718 if (!(match_flags & MATCH_IRE_TESTHIDDEN)) { 5719 match_flags |= MATCH_IRE_TESTHIDDEN; 5720 goto again; 5721 } 5722 if (ipif != NULL) 5723 ipif_refrele(ipif); 5724 return (ENETUNREACH); 5725 } 5726 if (gw_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) { 5727 ire_refrele(gw_ire); 5728 if (ipif != NULL) 5729 ipif_refrele(ipif); 5730 return (ENETUNREACH); 5731 } 5732 5733 /* 5734 * We create one of three types of IREs as a result of this request 5735 * based on the netmask. A netmask of all ones (which is automatically 5736 * assumed when RTF_HOST is set) results in an IRE_HOST being created. 5737 * An all zeroes netmask implies a default route so an IRE_DEFAULT is 5738 * created. Otherwise, an IRE_PREFIX route is created for the 5739 * destination prefix. 5740 */ 5741 if (mask == IP_HOST_MASK) 5742 type = IRE_HOST; 5743 else if (mask == 0) 5744 type = IRE_DEFAULT; 5745 else 5746 type = IRE_PREFIX; 5747 5748 /* check for a duplicate entry */ 5749 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill, 5750 ALL_ZONES, NULL, match_flags | MATCH_IRE_MASK | MATCH_IRE_GW, 5751 0, ipst, NULL); 5752 if (ire != NULL) { 5753 if (ipif != NULL) 5754 ipif_refrele(ipif); 5755 ire_refrele(gw_ire); 5756 ire_refrele(ire); 5757 return (EEXIST); 5758 } 5759 5760 /* Security attribute exists */ 5761 if (sp != NULL) { 5762 tsol_gcgrp_addr_t ga; 5763 5764 /* find or create the gateway credentials group */ 5765 ga.ga_af = AF_INET; 5766 IN6_IPADDR_TO_V4MAPPED(gw_addr, &ga.ga_addr); 5767 5768 /* we hold reference to it upon success */ 5769 gcgrp = gcgrp_lookup(&ga, B_TRUE); 5770 if (gcgrp == NULL) { 5771 if (ipif != NULL) 5772 ipif_refrele(ipif); 5773 ire_refrele(gw_ire); 5774 return (ENOMEM); 5775 } 5776 5777 /* 5778 * Create and add the security attribute to the group; a 5779 * reference to the group is made upon allocating a new 5780 * entry successfully. 
If it finds an already-existing 5781 * entry for the security attribute in the group, it simply 5782 * returns it and no new reference is made to the group. 5783 */ 5784 gc = gc_create(sp, gcgrp, &gcgrp_xtraref); 5785 if (gc == NULL) { 5786 if (ipif != NULL) 5787 ipif_refrele(ipif); 5788 /* release reference held by gcgrp_lookup */ 5789 GCGRP_REFRELE(gcgrp); 5790 ire_refrele(gw_ire); 5791 return (ENOMEM); 5792 } 5793 } 5794 5795 /* Create the IRE. */ 5796 ire = ire_create( 5797 (uchar_t *)&dst_addr, /* dest address */ 5798 (uchar_t *)&mask, /* mask */ 5799 (uchar_t *)&gw_addr, /* gateway address */ 5800 (ushort_t)type, /* IRE type */ 5801 ill, 5802 zoneid, 5803 flags, 5804 gc, /* security attribute */ 5805 ipst); 5806 5807 /* 5808 * The ire holds a reference to the 'gc' and the 'gc' holds a 5809 * reference to the 'gcgrp'. We can now release the extra reference 5810 * the 'gcgrp' acquired in the gcgrp_lookup, if it was not used. 5811 */ 5812 if (gcgrp_xtraref) 5813 GCGRP_REFRELE(gcgrp); 5814 if (ire == NULL) { 5815 if (gc != NULL) 5816 GC_REFRELE(gc); 5817 if (ipif != NULL) 5818 ipif_refrele(ipif); 5819 ire_refrele(gw_ire); 5820 return (ENOMEM); 5821 } 5822 5823 /* Before we add, check if an extra CGTP broadcast is needed */ 5824 cgtp_broadcast = ((flags & RTF_MULTIRT) && 5825 ip_type_v4(ire->ire_addr, ipst) == IRE_BROADCAST); 5826 5827 /* src address assigned by the caller? */ 5828 if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC)) 5829 ire->ire_setsrc_addr = src_addr; 5830 5831 /* 5832 * POLICY: should we allow an RTF_HOST with address INADDR_ANY? 5833 * SUN/OS socket stuff does but do we really want to allow 0.0.0.0? 5834 */ 5835 5836 /* Add the new IRE. */ 5837 nire = ire_add(ire); 5838 if (nire == NULL) { 5839 /* 5840 * In the result of failure, ire_add() will have 5841 * already deleted the ire in question, so there 5842 * is no need to do that here. 
5843 */ 5844 if (ipif != NULL) 5845 ipif_refrele(ipif); 5846 ire_refrele(gw_ire); 5847 return (ENOMEM); 5848 } 5849 /* 5850 * Check if it was a duplicate entry. This handles 5851 * the case of two racing route adds for the same route 5852 */ 5853 if (nire != ire) { 5854 ire_delete(nire); 5855 ire_refrele(nire); 5856 if (ipif != NULL) 5857 ipif_refrele(ipif); 5858 ire_refrele(gw_ire); 5859 return (EEXIST); 5860 } 5861 ire = nire; 5862 5863 if (flags & RTF_MULTIRT) { 5864 /* 5865 * Invoke the CGTP (multirouting) filtering module 5866 * to add the dst address in the filtering database. 5867 * Replicated inbound packets coming from that address 5868 * will be filtered to discard the duplicates. 5869 * It is not necessary to call the CGTP filter hook 5870 * when the dst address is a broadcast or multicast, 5871 * because an IP source address cannot be a broadcast 5872 * or a multicast. 5873 */ 5874 if (cgtp_broadcast) { 5875 ip_cgtp_bcast_add(ire, ipst); 5876 goto save_ire; 5877 } 5878 if (ipst->ips_ip_cgtp_filter_ops != NULL && 5879 !CLASSD(ire->ire_addr)) { 5880 int res; 5881 ipif_t *src_ipif; 5882 5883 /* Find the source address corresponding to gw_ire */ 5884 src_ipif = ipif_lookup_addr(gw_ire->ire_gateway_addr, 5885 NULL, zoneid, ipst); 5886 if (src_ipif != NULL) { 5887 res = ipst->ips_ip_cgtp_filter_ops-> 5888 cfo_add_dest_v4( 5889 ipst->ips_netstack->netstack_stackid, 5890 ire->ire_addr, 5891 ire->ire_gateway_addr, 5892 ire->ire_setsrc_addr, 5893 src_ipif->ipif_lcl_addr); 5894 ipif_refrele(src_ipif); 5895 } else { 5896 res = EADDRNOTAVAIL; 5897 } 5898 if (res != 0) { 5899 if (ipif != NULL) 5900 ipif_refrele(ipif); 5901 ire_refrele(gw_ire); 5902 ire_delete(ire); 5903 ire_refrele(ire); /* Held in ire_add */ 5904 return (res); 5905 } 5906 } 5907 } 5908 5909 save_ire: 5910 if (gw_ire != NULL) { 5911 ire_refrele(gw_ire); 5912 gw_ire = NULL; 5913 } 5914 if (ill != NULL) { 5915 /* 5916 * Save enough information so that we can recreate the IRE if 5917 * the interface 
goes down and then up. The metrics associated 5918 * with the route will be saved as well when rts_setmetrics() is 5919 * called after the IRE has been created. In the case where 5920 * memory cannot be allocated, none of this information will be 5921 * saved. 5922 */ 5923 ill_save_ire(ill, ire); 5924 } 5925 if (ioctl_msg) 5926 ip_rts_rtmsg(RTM_OLDADD, ire, 0, ipst); 5927 if (ire_arg != NULL) { 5928 /* 5929 * Store the ire that was successfully added into where ire_arg 5930 * points to so that callers don't have to look it up 5931 * themselves (but they are responsible for ire_refrele()ing 5932 * the ire when they are finished with it). 5933 */ 5934 *ire_arg = ire; 5935 } else { 5936 ire_refrele(ire); /* Held in ire_add */ 5937 } 5938 if (ipif != NULL) 5939 ipif_refrele(ipif); 5940 return (0); 5941 } 5942 5943 /* 5944 * ip_rt_delete is called to delete an IPv4 route. 5945 * ill is passed in to associate it with the correct interface. 5946 */ 5947 /* ARGSUSED4 */ 5948 int 5949 ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, 5950 uint_t rtm_addrs, int flags, ill_t *ill, boolean_t ioctl_msg, 5951 ip_stack_t *ipst, zoneid_t zoneid) 5952 { 5953 ire_t *ire = NULL; 5954 ipif_t *ipif; 5955 uint_t type; 5956 uint_t match_flags = MATCH_IRE_TYPE; 5957 int err = 0; 5958 5959 ip1dbg(("ip_rt_delete:")); 5960 /* 5961 * If this is the case of RTF_HOST being set, then we set the netmask 5962 * to all ones. Otherwise, we use the netmask if one was supplied. 5963 */ 5964 if (flags & RTF_HOST) { 5965 mask = IP_HOST_MASK; 5966 match_flags |= MATCH_IRE_MASK; 5967 } else if (rtm_addrs & RTA_NETMASK) { 5968 match_flags |= MATCH_IRE_MASK; 5969 } 5970 5971 /* 5972 * Note that RTF_GATEWAY is never set on a delete, therefore 5973 * we check if the gateway address is one of our interfaces first, 5974 * and fall back on RTF_GATEWAY routes. 5975 * 5976 * This makes it possible to delete an original 5977 * IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1. 
5978 * However, we have RTF_KERNEL set on the ones created by ipif_up 5979 * and those can not be deleted here. 5980 * 5981 * We use MATCH_IRE_ILL if we know the interface. If the caller 5982 * specified an interface (from the RTA_IFP sockaddr) we use it, 5983 * otherwise we use the ill derived from the gateway address. 5984 * We can always match the gateway address since we record it 5985 * in ire_gateway_addr. 5986 * 5987 * For more detail on specifying routes by gateway address and by 5988 * interface index, see the comments in ip_rt_add(). 5989 */ 5990 ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst); 5991 if (ipif != NULL) { 5992 ill_t *ill_match; 5993 5994 if (ill != NULL) 5995 ill_match = ill; 5996 else 5997 ill_match = ipif->ipif_ill; 5998 5999 match_flags |= MATCH_IRE_ILL; 6000 if (ipif->ipif_ire_type == IRE_LOOPBACK) { 6001 ire = ire_ftable_lookup_v4(dst_addr, 0, 0, IRE_LOOPBACK, 6002 ill_match, ALL_ZONES, NULL, match_flags, 0, ipst, 6003 NULL); 6004 } 6005 if (ire == NULL) { 6006 match_flags |= MATCH_IRE_GW; 6007 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, 6008 IRE_INTERFACE, ill_match, ALL_ZONES, NULL, 6009 match_flags, 0, ipst, NULL); 6010 } 6011 /* Avoid deleting routes created by kernel from an ipif */ 6012 if (ire != NULL && (ire->ire_flags & RTF_KERNEL)) { 6013 ire_refrele(ire); 6014 ire = NULL; 6015 } 6016 6017 /* Restore in case we didn't find a match */ 6018 match_flags &= ~(MATCH_IRE_GW|MATCH_IRE_ILL); 6019 } 6020 6021 if (ire == NULL) { 6022 /* 6023 * At this point, the gateway address is not one of our own 6024 * addresses or a matching interface route was not found. We 6025 * set the IRE type to lookup based on whether 6026 * this is a host route, a default route or just a prefix. 6027 * 6028 * If an ill was passed in, then the lookup is based on an 6029 * interface index so MATCH_IRE_ILL is added to match_flags. 
6030 */ 6031 match_flags |= MATCH_IRE_GW; 6032 if (ill != NULL) 6033 match_flags |= MATCH_IRE_ILL; 6034 if (mask == IP_HOST_MASK) 6035 type = IRE_HOST; 6036 else if (mask == 0) 6037 type = IRE_DEFAULT; 6038 else 6039 type = IRE_PREFIX; 6040 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill, 6041 ALL_ZONES, NULL, match_flags, 0, ipst, NULL); 6042 } 6043 6044 if (ipif != NULL) { 6045 ipif_refrele(ipif); 6046 ipif = NULL; 6047 } 6048 6049 if (ire == NULL) 6050 return (ESRCH); 6051 6052 if (ire->ire_flags & RTF_MULTIRT) { 6053 /* 6054 * Invoke the CGTP (multirouting) filtering module 6055 * to remove the dst address from the filtering database. 6056 * Packets coming from that address will no longer be 6057 * filtered to remove duplicates. 6058 */ 6059 if (ipst->ips_ip_cgtp_filter_ops != NULL) { 6060 err = ipst->ips_ip_cgtp_filter_ops->cfo_del_dest_v4( 6061 ipst->ips_netstack->netstack_stackid, 6062 ire->ire_addr, ire->ire_gateway_addr); 6063 } 6064 ip_cgtp_bcast_delete(ire, ipst); 6065 } 6066 6067 ill = ire->ire_ill; 6068 if (ill != NULL) 6069 ill_remove_saved_ire(ill, ire); 6070 if (ioctl_msg) 6071 ip_rts_rtmsg(RTM_OLDDEL, ire, 0, ipst); 6072 ire_delete(ire); 6073 ire_refrele(ire); 6074 return (err); 6075 } 6076 6077 /* 6078 * ip_siocaddrt is called to complete processing of an SIOCADDRT IOCTL. 
6079 */ 6080 /* ARGSUSED */ 6081 int 6082 ip_siocaddrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 6083 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 6084 { 6085 ipaddr_t dst_addr; 6086 ipaddr_t gw_addr; 6087 ipaddr_t mask; 6088 int error = 0; 6089 mblk_t *mp1; 6090 struct rtentry *rt; 6091 ipif_t *ipif = NULL; 6092 ip_stack_t *ipst; 6093 6094 ASSERT(q->q_next == NULL); 6095 ipst = CONNQ_TO_IPST(q); 6096 6097 ip1dbg(("ip_siocaddrt:")); 6098 /* Existence of mp1 verified in ip_wput_nondata */ 6099 mp1 = mp->b_cont->b_cont; 6100 rt = (struct rtentry *)mp1->b_rptr; 6101 6102 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 6103 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 6104 6105 /* 6106 * If the RTF_HOST flag is on, this is a request to assign a gateway 6107 * to a particular host address. In this case, we set the netmask to 6108 * all ones for the particular destination address. Otherwise, 6109 * determine the netmask to be used based on dst_addr and the interfaces 6110 * in use. 6111 */ 6112 if (rt->rt_flags & RTF_HOST) { 6113 mask = IP_HOST_MASK; 6114 } else { 6115 /* 6116 * Note that ip_subnet_mask returns a zero mask in the case of 6117 * default (an all-zeroes address). 6118 */ 6119 mask = ip_subnet_mask(dst_addr, &ipif, ipst); 6120 } 6121 6122 error = ip_rt_add(dst_addr, mask, gw_addr, 0, rt->rt_flags, NULL, NULL, 6123 B_TRUE, NULL, ipst, ALL_ZONES); 6124 if (ipif != NULL) 6125 ipif_refrele(ipif); 6126 return (error); 6127 } 6128 6129 /* 6130 * ip_siocdelrt is called to complete processing of an SIOCDELRT IOCTL. 
6131 */ 6132 /* ARGSUSED */ 6133 int 6134 ip_siocdelrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 6135 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 6136 { 6137 ipaddr_t dst_addr; 6138 ipaddr_t gw_addr; 6139 ipaddr_t mask; 6140 int error; 6141 mblk_t *mp1; 6142 struct rtentry *rt; 6143 ipif_t *ipif = NULL; 6144 ip_stack_t *ipst; 6145 6146 ASSERT(q->q_next == NULL); 6147 ipst = CONNQ_TO_IPST(q); 6148 6149 ip1dbg(("ip_siocdelrt:")); 6150 /* Existence of mp1 verified in ip_wput_nondata */ 6151 mp1 = mp->b_cont->b_cont; 6152 rt = (struct rtentry *)mp1->b_rptr; 6153 6154 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 6155 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 6156 6157 /* 6158 * If the RTF_HOST flag is on, this is a request to delete a gateway 6159 * to a particular host address. In this case, we set the netmask to 6160 * all ones for the particular destination address. Otherwise, 6161 * determine the netmask to be used based on dst_addr and the interfaces 6162 * in use. 6163 */ 6164 if (rt->rt_flags & RTF_HOST) { 6165 mask = IP_HOST_MASK; 6166 } else { 6167 /* 6168 * Note that ip_subnet_mask returns a zero mask in the case of 6169 * default (an all-zeroes address). 6170 */ 6171 mask = ip_subnet_mask(dst_addr, &ipif, ipst); 6172 } 6173 6174 error = ip_rt_delete(dst_addr, mask, gw_addr, 6175 RTA_DST | RTA_GATEWAY | RTA_NETMASK, rt->rt_flags, NULL, B_TRUE, 6176 ipst, ALL_ZONES); 6177 if (ipif != NULL) 6178 ipif_refrele(ipif); 6179 return (error); 6180 } 6181 6182 /* 6183 * Enqueue the mp onto the ipsq, chained by b_next. 6184 * b_prev stores the function to be executed later, and b_queue the queue 6185 * where this mp originated. 
6186 */ 6187 void 6188 ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, 6189 ill_t *pending_ill) 6190 { 6191 conn_t *connp; 6192 ipxop_t *ipx = ipsq->ipsq_xop; 6193 6194 ASSERT(MUTEX_HELD(&ipsq->ipsq_lock)); 6195 ASSERT(MUTEX_HELD(&ipx->ipx_lock)); 6196 ASSERT(func != NULL); 6197 6198 mp->b_queue = q; 6199 mp->b_prev = (void *)func; 6200 mp->b_next = NULL; 6201 6202 switch (type) { 6203 case CUR_OP: 6204 if (ipx->ipx_mptail != NULL) { 6205 ASSERT(ipx->ipx_mphead != NULL); 6206 ipx->ipx_mptail->b_next = mp; 6207 } else { 6208 ASSERT(ipx->ipx_mphead == NULL); 6209 ipx->ipx_mphead = mp; 6210 } 6211 ipx->ipx_mptail = mp; 6212 break; 6213 6214 case NEW_OP: 6215 if (ipsq->ipsq_xopq_mptail != NULL) { 6216 ASSERT(ipsq->ipsq_xopq_mphead != NULL); 6217 ipsq->ipsq_xopq_mptail->b_next = mp; 6218 } else { 6219 ASSERT(ipsq->ipsq_xopq_mphead == NULL); 6220 ipsq->ipsq_xopq_mphead = mp; 6221 } 6222 ipsq->ipsq_xopq_mptail = mp; 6223 ipx->ipx_ipsq_queued = B_TRUE; 6224 break; 6225 6226 case SWITCH_OP: 6227 ASSERT(ipsq->ipsq_swxop != NULL); 6228 /* only one switch operation is currently allowed */ 6229 ASSERT(ipsq->ipsq_switch_mp == NULL); 6230 ipsq->ipsq_switch_mp = mp; 6231 ipx->ipx_ipsq_queued = B_TRUE; 6232 break; 6233 default: 6234 cmn_err(CE_PANIC, "ipsq_enq %d type \n", type); 6235 } 6236 6237 if (CONN_Q(q) && pending_ill != NULL) { 6238 connp = Q_TO_CONN(q); 6239 ASSERT(MUTEX_HELD(&connp->conn_lock)); 6240 connp->conn_oper_pending_ill = pending_ill; 6241 } 6242 } 6243 6244 /* 6245 * Dequeue the next message that requested exclusive access to this IPSQ's 6246 * xop. Specifically: 6247 * 6248 * 1. If we're still processing the current operation on `ipsq', then 6249 * dequeue the next message for the operation (from ipx_mphead), or 6250 * return NULL if there are no queued messages for the operation. 6251 * These messages are queued via CUR_OP to qwriter_ip() and friends. 6252 * 6253 * 2. 
 *     If the current operation on `ipsq' has completed (ipx_current_ipif is
 *     not set) see if the ipsq has requested an xop switch.  If so, switch
 *     `ipsq' to a different xop.  Xop switches only happen when joining or
 *     leaving IPMP groups and require a careful dance -- see the comments
 *     in-line below for details.  If we're leaving a group xop or if we're
 *     joining a group xop and become writer on it, then we proceed to (3).
 *     Otherwise, we return NULL and exit the xop.
 *
 *  3. For each IPSQ in the xop, return any switch operation stored on
 *     ipsq_switch_mp (set via SWITCH_OP); these must be processed before
 *     any other messages queued on the IPSQ.  Otherwise, dequeue the next
 *     exclusive operation (queued via NEW_OP) stored on ipsq_xopq_mphead.
 *     Note that if the phyint tied to `ipsq' is not using IPMP there will
 *     only be one IPSQ in the xop.  Otherwise, there will be one IPSQ for
 *     each phyint in the group, including the IPMP meta-interface phyint.
 *
 * Returns the dequeued message, or NULL if nothing was dequeued.  A returned
 * message has its b_next field overwritten with a pointer to the IPSQ it
 * was taken from (presumably so the caller can tell which IPSQ the message
 * belongs to -- confirm against the callers).  No locks are held on return.
 */
static mblk_t *
ipsq_dq(ipsq_t *ipsq)
{
	ill_t		*illv4, *illv6;
	mblk_t		*mp;
	ipsq_t		*xopipsq;
	ipsq_t		*leftipsq = NULL;
	ipxop_t		*ipx;
	phyint_t	*phyi = ipsq->ipsq_phyint;
	ip_stack_t	*ipst = ipsq->ipsq_ipst;
	boolean_t	emptied = B_FALSE;

	/*
	 * Grab all the locks we need in the defined order (ill_g_lock ->
	 * ipsq_lock -> ipx_lock); ill_g_lock is needed to use ipsq_next.
	 * It is taken as writer when a switch is pending, since the switch
	 * code below rewrites the ipsq_next linkage.
	 */
	rw_enter(&ipst->ips_ill_g_lock,
	    ipsq->ipsq_swxop != NULL ? RW_WRITER : RW_READER);
	mutex_enter(&ipsq->ipsq_lock);
	ipx = ipsq->ipsq_xop;
	mutex_enter(&ipx->ipx_lock);

	/*
	 * Dequeue the next message associated with the current exclusive
	 * operation, if any.
	 */
	if ((mp = ipx->ipx_mphead) != NULL) {
		ipx->ipx_mphead = mp->b_next;
		if (ipx->ipx_mphead == NULL)
			ipx->ipx_mptail = NULL;
		mp->b_next = (void *)ipsq;
		goto out;
	}

	/*
	 * The current operation is still in progress but has nothing
	 * queued; mp is NULL here, so the xop is exited via `empty'.
	 */
	if (ipx->ipx_current_ipif != NULL)
		goto empty;

	if (ipsq->ipsq_swxop != NULL) {
		/*
		 * The exclusive operation that is now being completed has
		 * requested a switch to a different xop.  This happens
		 * when an interface joins or leaves an IPMP group.  Joins
		 * happen through SIOCSLIFGROUPNAME (ip_sioctl_groupname()).
		 * Leaves happen via SIOCSLIFGROUPNAME, interface unplumb
		 * (phyint_free()), or interface plumb for an ill type
		 * not in the IPMP group (ip_rput_dlpi_writer()).
		 *
		 * Xop switches are not allowed on the IPMP meta-interface.
		 */
		ASSERT(phyi == NULL || !(phyi->phyint_flags & PHYI_IPMP));
		ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
		DTRACE_PROBE1(ipsq__switch, (ipsq_t *), ipsq);

		if (ipsq->ipsq_swxop == &ipsq->ipsq_ownxop) {
			/*
			 * We're switching back to our own xop, so we have two
			 * xop's to drain/exit: our own, and the group xop
			 * that we are leaving.
			 *
			 * First, pull ourselves out of the group ipsq list.
			 * This is safe since we're writer on ill_g_lock.
			 */
			ASSERT(ipsq->ipsq_xop != &ipsq->ipsq_ownxop);

			/* Find our predecessor on the circular ipsq_next list. */
			xopipsq = ipx->ipx_ipsq;
			while (xopipsq->ipsq_next != ipsq)
				xopipsq = xopipsq->ipsq_next;

			xopipsq->ipsq_next = ipsq->ipsq_next;
			ipsq->ipsq_next = ipsq;
			ipsq->ipsq_xop = ipsq->ipsq_swxop;
			ipsq->ipsq_swxop = NULL;

			/*
			 * Second, prepare to exit the group xop.  The actual
			 * ipsq_exit() is done at the end of this function
			 * since we cannot hold any locks across ipsq_exit().
			 * Note that although we drop the group's ipx_lock, no
			 * threads can proceed since we're still ipx_writer.
			 */
			leftipsq = xopipsq;
			mutex_exit(&ipx->ipx_lock);

			/*
			 * Third, set ipx to point to our own xop (which was
			 * inactive and therefore can be entered).
			 */
			ipx = ipsq->ipsq_xop;
			mutex_enter(&ipx->ipx_lock);
			ASSERT(ipx->ipx_writer == NULL);
			ASSERT(ipx->ipx_current_ipif == NULL);
		} else {
			/*
			 * We're switching from our own xop to a group xop.
			 * The requestor of the switch must ensure that the
			 * group xop cannot go away (e.g. by ensuring the
			 * phyint associated with the xop cannot go away).
			 *
			 * If we can become writer on our new xop, then we'll
			 * do the drain.  Otherwise, the current writer of our
			 * new xop will do the drain when it exits.
			 *
			 * First, splice ourselves into the group IPSQ list.
			 * This is safe since we're writer on ill_g_lock.
			 */
			ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop);

			/* Walk to the last IPSQ on the group's circular list. */
			xopipsq = ipsq->ipsq_swxop->ipx_ipsq;
			while (xopipsq->ipsq_next != ipsq->ipsq_swxop->ipx_ipsq)
				xopipsq = xopipsq->ipsq_next;

			xopipsq->ipsq_next = ipsq;
			ipsq->ipsq_next = ipsq->ipsq_swxop->ipx_ipsq;
			ipsq->ipsq_xop = ipsq->ipsq_swxop;
			ipsq->ipsq_swxop = NULL;

			/*
			 * Second, exit our own xop, since it's now unused.
			 * This is safe since we've got the only reference.
			 */
			ASSERT(ipx->ipx_writer == curthread);
			ipx->ipx_writer = NULL;
			VERIFY(--ipx->ipx_reentry_cnt == 0);
			ipx->ipx_ipsq_queued = B_FALSE;
			mutex_exit(&ipx->ipx_lock);

			/*
			 * Third, set ipx to point to our new xop, and check
			 * if we can become writer on it.  If we cannot, then
			 * the current writer will drain the IPSQ group when
			 * it exits.  Our ipsq_xop is guaranteed to be stable
			 * because we're still holding ipsq_lock.
			 */
			ipx = ipsq->ipsq_xop;
			mutex_enter(&ipx->ipx_lock);
			if (ipx->ipx_writer != NULL ||
			    ipx->ipx_current_ipif != NULL) {
				/* mp is still NULL, so NULL is returned. */
				goto out;
			}
		}

		/*
		 * Fourth, become writer on our new ipx before we continue
		 * with the drain.  Note that we never dropped ipsq_lock
		 * above, so no other thread could've raced with us to
		 * become writer first.  Also, we're holding ipx_lock, so
		 * no other thread can examine the ipx right now.
		 */
		ASSERT(ipx->ipx_current_ipif == NULL);
		ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL);
		VERIFY(ipx->ipx_reentry_cnt++ == 0);
		ipx->ipx_writer = curthread;
		ipx->ipx_forced = B_FALSE;
#ifdef DEBUG
		ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
#endif
	}

	/* Scan every IPSQ in the xop, starting with our own. */
	xopipsq = ipsq;
	do {
		/*
		 * So that other operations operate on a consistent and
		 * complete phyint, a switch message on an IPSQ must be
		 * handled prior to any other operations on that IPSQ.
		 */
		if ((mp = xopipsq->ipsq_switch_mp) != NULL) {
			xopipsq->ipsq_switch_mp = NULL;
			ASSERT(mp->b_next == NULL);
			mp->b_next = (void *)xopipsq;
			goto out;
		}

		if ((mp = xopipsq->ipsq_xopq_mphead) != NULL) {
			xopipsq->ipsq_xopq_mphead = mp->b_next;
			if (xopipsq->ipsq_xopq_mphead == NULL)
				xopipsq->ipsq_xopq_mptail = NULL;
			mp->b_next = (void *)xopipsq;
			goto out;
		}
	} while ((xopipsq = xopipsq->ipsq_next) != ipsq);
empty:
	/*
	 * There are no messages.  Further, we are holding ipx_lock, hence no
	 * new messages can end up on any IPSQ in the xop.
	 * Give up writer status on the xop; mp is NULL on every path here.
	 */
	ipx->ipx_writer = NULL;
	ipx->ipx_forced = B_FALSE;
	VERIFY(--ipx->ipx_reentry_cnt == 0);
	ipx->ipx_ipsq_queued = B_FALSE;
	emptied = B_TRUE;
#ifdef DEBUG
	ipx->ipx_depth = 0;
#endif
out:
	mutex_exit(&ipx->ipx_lock);
	mutex_exit(&ipsq->ipsq_lock);

	/*
	 * If we completely emptied the xop, then wake up any threads waiting
	 * to enter any of the IPSQ's associated with it.
	 */
	if (emptied) {
		xopipsq = ipsq;
		do {
			/*
			 * `continue' in a do-while jumps to the loop
			 * condition, so the walk still advances.
			 */
			if ((phyi = xopipsq->ipsq_phyint) == NULL)
				continue;

			illv4 = phyi->phyint_illv4;
			illv6 = phyi->phyint_illv6;

			GRAB_ILL_LOCKS(illv4, illv6);
			if (illv4 != NULL)
				cv_broadcast(&illv4->ill_cv);
			if (illv6 != NULL)
				cv_broadcast(&illv6->ill_cv);
			RELEASE_ILL_LOCKS(illv4, illv6);
		} while ((xopipsq = xopipsq->ipsq_next) != ipsq);
	}
	rw_exit(&ipst->ips_ill_g_lock);

	/*
	 * Now that all locks are dropped, exit the IPSQ we left.
	 */
	if (leftipsq != NULL)
		ipsq_exit(leftipsq);

	return (mp);
}

/*
 * Return completion status of previously initiated DLPI operations on
 * ills in the purview of an ipsq.
 */
static boolean_t
ipsq_dlpi_done(ipsq_t *ipsq)
{
	ipsq_t		*ipsq_start;
	phyint_t	*phyi;
	ill_t		*ill;

	ASSERT(RW_LOCK_HELD(&ipsq->ipsq_ipst->ips_ill_g_lock));
	ipsq_start = ipsq;

	do {
		/*
		 * The only current users of this function are ipsq_try_enter
		 * and ipsq_enter which have made sure that ipsq_writer is
		 * NULL before we reach here. ill_dlpi_pending is modified
		 * only by an ipsq writer
		 */
		ASSERT(ipsq->ipsq_xop->ipx_writer == NULL);
		phyi = ipsq->ipsq_phyint;
		/*
		 * phyi could be NULL if a phyint that is part of an
		 * IPMP group is being unplumbed.
A more detailed 6524 * comment is in ipmp_grp_update_kstats() 6525 */ 6526 if (phyi != NULL) { 6527 ill = phyi->phyint_illv4; 6528 if (ill != NULL && 6529 (ill->ill_dlpi_pending != DL_PRIM_INVAL || 6530 ill->ill_arl_dlpi_pending)) 6531 return (B_FALSE); 6532 6533 ill = phyi->phyint_illv6; 6534 if (ill != NULL && 6535 ill->ill_dlpi_pending != DL_PRIM_INVAL) 6536 return (B_FALSE); 6537 } 6538 6539 } while ((ipsq = ipsq->ipsq_next) != ipsq_start); 6540 6541 return (B_TRUE); 6542 } 6543 6544 /* 6545 * Enter the ipsq corresponding to ill, by waiting synchronously till 6546 * we can enter the ipsq exclusively. Unless 'force' is used, the ipsq 6547 * will have to drain completely before ipsq_enter returns success. 6548 * ipx_current_ipif will be set if some exclusive op is in progress, 6549 * and the ipsq_exit logic will start the next enqueued op after 6550 * completion of the current op. If 'force' is used, we don't wait 6551 * for the enqueued ops. This is needed when a conn_close wants to 6552 * enter the ipsq and abort an ioctl that is somehow stuck. Unplumb 6553 * of an ill can also use this option. But we dont' use it currently. 6554 */ 6555 #define ENTER_SQ_WAIT_TICKS 100 6556 boolean_t 6557 ipsq_enter(ill_t *ill, boolean_t force, int type) 6558 { 6559 ipsq_t *ipsq; 6560 ipxop_t *ipx; 6561 boolean_t waited_enough = B_FALSE; 6562 ip_stack_t *ipst = ill->ill_ipst; 6563 6564 /* 6565 * Note that the relationship between ill and ipsq is fixed as long as 6566 * the ill is not ILL_CONDEMNED. Holding ipsq_lock ensures the 6567 * relationship between the IPSQ and xop cannot change. However, 6568 * since we cannot hold ipsq_lock across the cv_wait(), it may change 6569 * while we're waiting. We wait on ill_cv and rely on ipsq_exit() 6570 * waking up all ills in the xop when it becomes available. 
6571 */ 6572 for (;;) { 6573 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 6574 mutex_enter(&ill->ill_lock); 6575 if (ill->ill_state_flags & ILL_CONDEMNED) { 6576 mutex_exit(&ill->ill_lock); 6577 rw_exit(&ipst->ips_ill_g_lock); 6578 return (B_FALSE); 6579 } 6580 6581 ipsq = ill->ill_phyint->phyint_ipsq; 6582 mutex_enter(&ipsq->ipsq_lock); 6583 ipx = ipsq->ipsq_xop; 6584 mutex_enter(&ipx->ipx_lock); 6585 6586 if (ipx->ipx_writer == NULL && (type == CUR_OP || 6587 (ipx->ipx_current_ipif == NULL && ipsq_dlpi_done(ipsq)) || 6588 waited_enough)) 6589 break; 6590 6591 rw_exit(&ipst->ips_ill_g_lock); 6592 6593 if (!force || ipx->ipx_writer != NULL) { 6594 mutex_exit(&ipx->ipx_lock); 6595 mutex_exit(&ipsq->ipsq_lock); 6596 cv_wait(&ill->ill_cv, &ill->ill_lock); 6597 } else { 6598 mutex_exit(&ipx->ipx_lock); 6599 mutex_exit(&ipsq->ipsq_lock); 6600 (void) cv_reltimedwait(&ill->ill_cv, 6601 &ill->ill_lock, ENTER_SQ_WAIT_TICKS, TR_CLOCK_TICK); 6602 waited_enough = B_TRUE; 6603 } 6604 mutex_exit(&ill->ill_lock); 6605 } 6606 6607 ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL); 6608 ASSERT(ipx->ipx_reentry_cnt == 0); 6609 ipx->ipx_writer = curthread; 6610 ipx->ipx_forced = (ipx->ipx_current_ipif != NULL); 6611 ipx->ipx_reentry_cnt++; 6612 #ifdef DEBUG 6613 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); 6614 #endif 6615 mutex_exit(&ipx->ipx_lock); 6616 mutex_exit(&ipsq->ipsq_lock); 6617 mutex_exit(&ill->ill_lock); 6618 rw_exit(&ipst->ips_ill_g_lock); 6619 6620 return (B_TRUE); 6621 } 6622 6623 /* 6624 * ipif_set_values() has a constraint that it cannot drop the ips_ill_g_lock 6625 * across the call to the core interface ipsq_try_enter() and hence calls this 6626 * function directly. This is explained more fully in ipif_set_values(). 
6627 * In order to support the above constraint, ipsq_try_enter is implemented as 6628 * a wrapper that grabs the ips_ill_g_lock and calls this function subsequently 6629 */ 6630 static ipsq_t * 6631 ipsq_try_enter_internal(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, 6632 int type, boolean_t reentry_ok) 6633 { 6634 ipsq_t *ipsq; 6635 ipxop_t *ipx; 6636 ip_stack_t *ipst = ill->ill_ipst; 6637 6638 /* 6639 * lock ordering: 6640 * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock. 6641 * 6642 * ipx of an ipsq can't change when ipsq_lock is held. 6643 */ 6644 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 6645 GRAB_CONN_LOCK(q); 6646 mutex_enter(&ill->ill_lock); 6647 ipsq = ill->ill_phyint->phyint_ipsq; 6648 mutex_enter(&ipsq->ipsq_lock); 6649 ipx = ipsq->ipsq_xop; 6650 mutex_enter(&ipx->ipx_lock); 6651 6652 /* 6653 * 1. Enter the ipsq if we are already writer and reentry is ok. 6654 * (Note: If the caller does not specify reentry_ok then neither 6655 * 'func' nor any of its callees must ever attempt to enter the ipsq 6656 * again. Otherwise it can lead to an infinite loop 6657 * 2. Enter the ipsq if there is no current writer and this attempted 6658 * entry is part of the current operation 6659 * 3. Enter the ipsq if there is no current writer and this is a new 6660 * operation and the operation queue is empty and there is no 6661 * operation currently in progress and if all previously initiated 6662 * DLPI operations have completed. 6663 */ 6664 if ((ipx->ipx_writer == curthread && reentry_ok) || 6665 (ipx->ipx_writer == NULL && (type == CUR_OP || (type == NEW_OP && 6666 !ipx->ipx_ipsq_queued && ipx->ipx_current_ipif == NULL && 6667 ipsq_dlpi_done(ipsq))))) { 6668 /* Success. 
*/ 6669 ipx->ipx_reentry_cnt++; 6670 ipx->ipx_writer = curthread; 6671 ipx->ipx_forced = B_FALSE; 6672 mutex_exit(&ipx->ipx_lock); 6673 mutex_exit(&ipsq->ipsq_lock); 6674 mutex_exit(&ill->ill_lock); 6675 RELEASE_CONN_LOCK(q); 6676 #ifdef DEBUG 6677 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); 6678 #endif 6679 return (ipsq); 6680 } 6681 6682 if (func != NULL) 6683 ipsq_enq(ipsq, q, mp, func, type, ill); 6684 6685 mutex_exit(&ipx->ipx_lock); 6686 mutex_exit(&ipsq->ipsq_lock); 6687 mutex_exit(&ill->ill_lock); 6688 RELEASE_CONN_LOCK(q); 6689 return (NULL); 6690 } 6691 6692 /* 6693 * The ipsq_t (ipsq) is the synchronization data structure used to serialize 6694 * certain critical operations like plumbing (i.e. most set ioctls), etc. 6695 * There is one ipsq per phyint. The ipsq 6696 * serializes exclusive ioctls issued by applications on a per ipsq basis in 6697 * ipsq_xopq_mphead. It also protects against multiple threads executing in 6698 * the ipsq. Responses from the driver pertain to the current ioctl (say a 6699 * DL_BIND_ACK in response to a DL_BIND_REQ initiated as part of bringing 6700 * up the interface) and are enqueued in ipx_mphead. 6701 * 6702 * If a thread does not want to reenter the ipsq when it is already writer, 6703 * it must make sure that the specified reentry point to be called later 6704 * when the ipsq is empty, nor any code path starting from the specified reentry 6705 * point must never ever try to enter the ipsq again. Otherwise it can lead 6706 * to an infinite loop. The reentry point ip_rput_dlpi_writer is an example. 6707 * When the thread that is currently exclusive finishes, it (ipsq_exit) 6708 * dequeues the requests waiting to become exclusive in ipx_mphead and calls 6709 * the reentry point. When the list at ipx_mphead becomes empty ipsq_exit 6710 * proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next 6711 * ioctl if the current ioctl has completed. 
If the current ioctl is still 6712 * in progress it simply returns. The current ioctl could be waiting for 6713 * a response from another module (the driver or could be waiting for 6714 * the ipif/ill/ire refcnts to drop to zero. In such a case the ipx_pending_mp 6715 * and ipx_pending_ipif are set. ipx_current_ipif is set throughout the 6716 * execution of the ioctl and ipsq_exit does not start the next ioctl unless 6717 * ipx_current_ipif is NULL which happens only once the ioctl is complete and 6718 * all associated DLPI operations have completed. 6719 */ 6720 6721 /* 6722 * Try to enter the IPSQ corresponding to `ipif' or `ill' exclusively (`ipif' 6723 * and `ill' cannot both be specified). Returns a pointer to the entered IPSQ 6724 * on success, or NULL on failure. The caller ensures ipif/ill is valid by 6725 * refholding it as necessary. If the IPSQ cannot be entered and `func' is 6726 * non-NULL, then `func' will be called back with `q' and `mp' once the IPSQ 6727 * can be entered. If `func' is NULL, then `q' and `mp' are ignored. 6728 */ 6729 ipsq_t * 6730 ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp, 6731 ipsq_func_t func, int type, boolean_t reentry_ok) 6732 { 6733 ip_stack_t *ipst; 6734 ipsq_t *ipsq; 6735 6736 /* Only 1 of ipif or ill can be specified */ 6737 ASSERT((ipif != NULL) ^ (ill != NULL)); 6738 6739 if (ipif != NULL) 6740 ill = ipif->ipif_ill; 6741 ipst = ill->ill_ipst; 6742 6743 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 6744 ipsq = ipsq_try_enter_internal(ill, q, mp, func, type, reentry_ok); 6745 rw_exit(&ipst->ips_ill_g_lock); 6746 6747 return (ipsq); 6748 } 6749 6750 /* 6751 * Try to enter the IPSQ corresponding to `ill' as writer. The caller ensures 6752 * ill is valid by refholding it if necessary; we will refrele. If the IPSQ 6753 * cannot be entered, the mp is queued for completion. 
6754 */ 6755 void 6756 qwriter_ip(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, 6757 boolean_t reentry_ok) 6758 { 6759 ipsq_t *ipsq; 6760 6761 ipsq = ipsq_try_enter(NULL, ill, q, mp, func, type, reentry_ok); 6762 6763 /* 6764 * Drop the caller's refhold on the ill. This is safe since we either 6765 * entered the IPSQ (and thus are exclusive), or failed to enter the 6766 * IPSQ, in which case we return without accessing ill anymore. This 6767 * is needed because func needs to see the correct refcount. 6768 * e.g. removeif can work only then. 6769 */ 6770 ill_refrele(ill); 6771 if (ipsq != NULL) { 6772 (*func)(ipsq, q, mp, NULL); 6773 ipsq_exit(ipsq); 6774 } 6775 } 6776 6777 /* 6778 * Exit the specified IPSQ. If this is the final exit on it then drain it 6779 * prior to exiting. Caller must be writer on the specified IPSQ. 6780 */ 6781 void 6782 ipsq_exit(ipsq_t *ipsq) 6783 { 6784 mblk_t *mp; 6785 ipsq_t *mp_ipsq; 6786 queue_t *q; 6787 phyint_t *phyi; 6788 ipsq_func_t func; 6789 6790 ASSERT(IAM_WRITER_IPSQ(ipsq)); 6791 6792 ASSERT(ipsq->ipsq_xop->ipx_reentry_cnt >= 1); 6793 if (ipsq->ipsq_xop->ipx_reentry_cnt != 1) { 6794 ipsq->ipsq_xop->ipx_reentry_cnt--; 6795 return; 6796 } 6797 6798 for (;;) { 6799 phyi = ipsq->ipsq_phyint; 6800 mp = ipsq_dq(ipsq); 6801 mp_ipsq = (mp == NULL) ? NULL : (ipsq_t *)mp->b_next; 6802 6803 /* 6804 * If we've changed to a new IPSQ, and the phyint associated 6805 * with the old one has gone away, free the old IPSQ. Note 6806 * that this cannot happen while the IPSQ is in a group. 
6807 */ 6808 if (mp_ipsq != ipsq && phyi == NULL) { 6809 ASSERT(ipsq->ipsq_next == ipsq); 6810 ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop); 6811 ipsq_delete(ipsq); 6812 } 6813 6814 if (mp == NULL) 6815 break; 6816 6817 q = mp->b_queue; 6818 func = (ipsq_func_t)mp->b_prev; 6819 ipsq = mp_ipsq; 6820 mp->b_next = mp->b_prev = NULL; 6821 mp->b_queue = NULL; 6822 6823 /* 6824 * If 'q' is an conn queue, it is valid, since we did a 6825 * a refhold on the conn at the start of the ioctl. 6826 * If 'q' is an ill queue, it is valid, since close of an 6827 * ill will clean up its IPSQ. 6828 */ 6829 (*func)(ipsq, q, mp, NULL); 6830 } 6831 } 6832 6833 /* 6834 * Used to start any igmp or mld timers that could not be started 6835 * while holding ill_mcast_lock. The timers can't be started while holding 6836 * the lock, since mld/igmp_start_timers may need to call untimeout() 6837 * which can't be done while holding the lock which the timeout handler 6838 * acquires. Otherwise 6839 * there could be a deadlock since the timeout handlers 6840 * mld_timeout_handler_per_ill/igmp_timeout_handler_per_ill also acquire 6841 * ill_mcast_lock. 6842 */ 6843 void 6844 ill_mcast_timer_start(ip_stack_t *ipst) 6845 { 6846 int next; 6847 6848 mutex_enter(&ipst->ips_igmp_timer_lock); 6849 next = ipst->ips_igmp_deferred_next; 6850 ipst->ips_igmp_deferred_next = INFINITY; 6851 mutex_exit(&ipst->ips_igmp_timer_lock); 6852 6853 if (next != INFINITY) 6854 igmp_start_timers(next, ipst); 6855 6856 mutex_enter(&ipst->ips_mld_timer_lock); 6857 next = ipst->ips_mld_deferred_next; 6858 ipst->ips_mld_deferred_next = INFINITY; 6859 mutex_exit(&ipst->ips_mld_timer_lock); 6860 6861 if (next != INFINITY) 6862 mld_start_timers(next, ipst); 6863 } 6864 6865 /* 6866 * Start the current exclusive operation on `ipsq'; associate it with `ipif' 6867 * and `ioccmd'. 
6868 */ 6869 void 6870 ipsq_current_start(ipsq_t *ipsq, ipif_t *ipif, int ioccmd) 6871 { 6872 ill_t *ill = ipif->ipif_ill; 6873 ipxop_t *ipx = ipsq->ipsq_xop; 6874 6875 ASSERT(IAM_WRITER_IPSQ(ipsq)); 6876 ASSERT(ipx->ipx_current_ipif == NULL); 6877 ASSERT(ipx->ipx_current_ioctl == 0); 6878 6879 ipx->ipx_current_done = B_FALSE; 6880 ipx->ipx_current_ioctl = ioccmd; 6881 mutex_enter(&ipx->ipx_lock); 6882 ipx->ipx_current_ipif = ipif; 6883 mutex_exit(&ipx->ipx_lock); 6884 6885 /* 6886 * Set IPIF_CHANGING on one or more ipifs associated with the 6887 * current exclusive operation. IPIF_CHANGING prevents any new 6888 * references to the ipif (so that the references will eventually 6889 * drop to zero) and also prevents any "get" operations (e.g., 6890 * SIOCGLIFFLAGS) from being able to access the ipif until the 6891 * operation has completed and the ipif is again in a stable state. 6892 * 6893 * For ioctls, IPIF_CHANGING is set on the ipif associated with the 6894 * ioctl. For internal operations (where ioccmd is zero), all ipifs 6895 * on the ill are marked with IPIF_CHANGING since it's unclear which 6896 * ipifs will be affected. 6897 * 6898 * Note that SIOCLIFREMOVEIF is a special case as it sets 6899 * IPIF_CONDEMNED internally after identifying the right ipif to 6900 * operate on. 6901 */ 6902 switch (ioccmd) { 6903 case SIOCLIFREMOVEIF: 6904 break; 6905 case 0: 6906 mutex_enter(&ill->ill_lock); 6907 ipif = ipif->ipif_ill->ill_ipif; 6908 for (; ipif != NULL; ipif = ipif->ipif_next) 6909 ipif->ipif_state_flags |= IPIF_CHANGING; 6910 mutex_exit(&ill->ill_lock); 6911 break; 6912 default: 6913 mutex_enter(&ill->ill_lock); 6914 ipif->ipif_state_flags |= IPIF_CHANGING; 6915 mutex_exit(&ill->ill_lock); 6916 } 6917 } 6918 6919 /* 6920 * Finish the current exclusive operation on `ipsq'. Usually, this will allow 6921 * the next exclusive operation to begin once we ipsq_exit(). 
* However, if
 * pending DLPI operations remain, then we will wait for the queue to drain
 * before allowing the next exclusive operation to begin.  This ensures that
 * DLPI operations from one exclusive operation are never improperly processed
 * as part of a subsequent exclusive operation.
 */
void
ipsq_current_finish(ipsq_t *ipsq)
{
	ipxop_t		*ipx = ipsq->ipsq_xop;
	t_uscalar_t	pending = DL_PRIM_INVAL;
	ipif_t		*cur_ipif = ipx->ipx_current_ipif;

	ASSERT(IAM_WRITER_IPSQ(ipsq));

	/*
	 * For SIOCLIFREMOVEIF, the ipif has already been blown away
	 * (but in that case, IPIF_CHANGING will already be clear and no
	 * pending DLPI messages can remain).
	 */
	if (ipx->ipx_current_ioctl != SIOCLIFREMOVEIF) {
		ill_t	*ill = cur_ipif->ipif_ill;
		ipif_t	*walk;

		mutex_enter(&ill->ill_lock);
		pending = ill->ill_dlpi_pending;
		if (ipx->ipx_current_ioctl != 0) {
			/* An ioctl marked only its own ipif. */
			cur_ipif->ipif_state_flags &= ~IPIF_CHANGING;
		} else {
			/* Internal operations marked every ipif on the ill. */
			for (walk = ill->ill_ipif; walk != NULL;
			    walk = walk->ipif_next)
				walk->ipif_state_flags &= ~IPIF_CHANGING;
		}
		mutex_exit(&ill->ill_lock);
	}

	ASSERT(!ipx->ipx_current_done);
	ipx->ipx_current_done = B_TRUE;
	ipx->ipx_current_ioctl = 0;

	/*
	 * ipx_current_ipif is cleared here only when no DLPI request was
	 * pending on the ill; otherwise it is left set.
	 */
	if (pending == DL_PRIM_INVAL) {
		mutex_enter(&ipx->ipx_lock);
		ipx->ipx_current_ipif = NULL;
		mutex_exit(&ipx->ipx_lock);
	}
}

/*
 * The ill is closing.  Flush all messages on the ipsq that originated
 * from this ill.  Usually there won't be any messages on the ipsq_xopq_mphead
 * for this ill since ipsq_enter could not have entered until then.
 * New messages can't be queued since the CONDEMNED flag is set.
6971 */ 6972 static void 6973 ipsq_flush(ill_t *ill) 6974 { 6975 queue_t *q; 6976 mblk_t *prev; 6977 mblk_t *mp; 6978 mblk_t *mp_next; 6979 ipxop_t *ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop; 6980 6981 ASSERT(IAM_WRITER_ILL(ill)); 6982 6983 /* 6984 * Flush any messages sent up by the driver. 6985 */ 6986 mutex_enter(&ipx->ipx_lock); 6987 for (prev = NULL, mp = ipx->ipx_mphead; mp != NULL; mp = mp_next) { 6988 mp_next = mp->b_next; 6989 q = mp->b_queue; 6990 if (q == ill->ill_rq || q == ill->ill_wq) { 6991 /* dequeue mp */ 6992 if (prev == NULL) 6993 ipx->ipx_mphead = mp->b_next; 6994 else 6995 prev->b_next = mp->b_next; 6996 if (ipx->ipx_mptail == mp) { 6997 ASSERT(mp_next == NULL); 6998 ipx->ipx_mptail = prev; 6999 } 7000 inet_freemsg(mp); 7001 } else { 7002 prev = mp; 7003 } 7004 } 7005 mutex_exit(&ipx->ipx_lock); 7006 (void) ipsq_pending_mp_cleanup(ill, NULL); 7007 ipsq_xopq_mp_cleanup(ill, NULL); 7008 } 7009 7010 /* 7011 * Parse an ifreq or lifreq struct coming down ioctls and refhold 7012 * and return the associated ipif. 7013 * Return value: 7014 * Non zero: An error has occurred. ci may not be filled out. 7015 * zero : ci is filled out with the ioctl cmd in ci.ci_name, and 7016 * a held ipif in ci.ci_ipif. 
7017 */ 7018 int 7019 ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, 7020 cmd_info_t *ci) 7021 { 7022 char *name; 7023 struct ifreq *ifr; 7024 struct lifreq *lifr; 7025 ipif_t *ipif = NULL; 7026 ill_t *ill; 7027 conn_t *connp; 7028 boolean_t isv6; 7029 boolean_t exists; 7030 mblk_t *mp1; 7031 zoneid_t zoneid; 7032 ip_stack_t *ipst; 7033 7034 if (q->q_next != NULL) { 7035 ill = (ill_t *)q->q_ptr; 7036 isv6 = ill->ill_isv6; 7037 connp = NULL; 7038 zoneid = ALL_ZONES; 7039 ipst = ill->ill_ipst; 7040 } else { 7041 ill = NULL; 7042 connp = Q_TO_CONN(q); 7043 isv6 = (connp->conn_family == AF_INET6); 7044 zoneid = connp->conn_zoneid; 7045 if (zoneid == GLOBAL_ZONEID) { 7046 /* global zone can access ipifs in all zones */ 7047 zoneid = ALL_ZONES; 7048 } 7049 ipst = connp->conn_netstack->netstack_ip; 7050 } 7051 7052 /* Has been checked in ip_wput_nondata */ 7053 mp1 = mp->b_cont->b_cont; 7054 7055 if (ipip->ipi_cmd_type == IF_CMD) { 7056 /* This a old style SIOC[GS]IF* command */ 7057 ifr = (struct ifreq *)mp1->b_rptr; 7058 /* 7059 * Null terminate the string to protect against buffer 7060 * overrun. String was generated by user code and may not 7061 * be trusted. 7062 */ 7063 ifr->ifr_name[IFNAMSIZ - 1] = '\0'; 7064 name = ifr->ifr_name; 7065 ci->ci_sin = (sin_t *)&ifr->ifr_addr; 7066 ci->ci_sin6 = NULL; 7067 ci->ci_lifr = (struct lifreq *)ifr; 7068 } else { 7069 /* This a new style SIOC[GS]LIF* command */ 7070 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 7071 lifr = (struct lifreq *)mp1->b_rptr; 7072 /* 7073 * Null terminate the string to protect against buffer 7074 * overrun. String was generated by user code and may not 7075 * be trusted. 
7076 */ 7077 lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; 7078 name = lifr->lifr_name; 7079 ci->ci_sin = (sin_t *)&lifr->lifr_addr; 7080 ci->ci_sin6 = (sin6_t *)&lifr->lifr_addr; 7081 ci->ci_lifr = lifr; 7082 } 7083 7084 if (ipip->ipi_cmd == SIOCSLIFNAME) { 7085 /* 7086 * The ioctl will be failed if the ioctl comes down 7087 * an conn stream 7088 */ 7089 if (ill == NULL) { 7090 /* 7091 * Not an ill queue, return EINVAL same as the 7092 * old error code. 7093 */ 7094 return (ENXIO); 7095 } 7096 ipif = ill->ill_ipif; 7097 ipif_refhold(ipif); 7098 } else { 7099 ipif = ipif_lookup_on_name(name, mi_strlen(name), B_FALSE, 7100 &exists, isv6, zoneid, ipst); 7101 7102 /* 7103 * Ensure that get ioctls don't see any internal state changes 7104 * caused by set ioctls by deferring them if IPIF_CHANGING is 7105 * set. 7106 */ 7107 if (ipif != NULL && !(ipip->ipi_flags & IPI_WR) && 7108 !IAM_WRITER_IPIF(ipif)) { 7109 ipsq_t *ipsq; 7110 7111 if (connp != NULL) 7112 mutex_enter(&connp->conn_lock); 7113 mutex_enter(&ipif->ipif_ill->ill_lock); 7114 if (IPIF_IS_CHANGING(ipif) && 7115 !IPIF_IS_CONDEMNED(ipif)) { 7116 ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq; 7117 mutex_enter(&ipsq->ipsq_lock); 7118 mutex_enter(&ipsq->ipsq_xop->ipx_lock); 7119 mutex_exit(&ipif->ipif_ill->ill_lock); 7120 ipsq_enq(ipsq, q, mp, ip_process_ioctl, 7121 NEW_OP, ipif->ipif_ill); 7122 mutex_exit(&ipsq->ipsq_xop->ipx_lock); 7123 mutex_exit(&ipsq->ipsq_lock); 7124 if (connp != NULL) 7125 mutex_exit(&connp->conn_lock); 7126 ipif_refrele(ipif); 7127 return (EINPROGRESS); 7128 } 7129 mutex_exit(&ipif->ipif_ill->ill_lock); 7130 if (connp != NULL) 7131 mutex_exit(&connp->conn_lock); 7132 } 7133 } 7134 7135 /* 7136 * Old style [GS]IFCMD does not admit IPv6 ipif 7137 */ 7138 if (ipif != NULL && ipif->ipif_isv6 && ipip->ipi_cmd_type == IF_CMD) { 7139 ipif_refrele(ipif); 7140 return (ENXIO); 7141 } 7142 7143 if (ipif == NULL && ill != NULL && ill->ill_ipif != NULL && 7144 name[0] == '\0') { 7145 /* 7146 * Handle a or a 
SIOC?IF* with a null name 7147 * during plumb (on the ill queue before the I_PLINK). 7148 */ 7149 ipif = ill->ill_ipif; 7150 ipif_refhold(ipif); 7151 } 7152 7153 if (ipif == NULL) 7154 return (ENXIO); 7155 7156 DTRACE_PROBE4(ipif__ioctl, char *, "ip_extract_lifreq", 7157 int, ipip->ipi_cmd, ill_t *, ipif->ipif_ill, ipif_t *, ipif); 7158 7159 ci->ci_ipif = ipif; 7160 return (0); 7161 } 7162 7163 /* 7164 * Return the total number of ipifs. 7165 */ 7166 static uint_t 7167 ip_get_numifs(zoneid_t zoneid, ip_stack_t *ipst) 7168 { 7169 uint_t numifs = 0; 7170 ill_t *ill; 7171 ill_walk_context_t ctx; 7172 ipif_t *ipif; 7173 7174 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7175 ill = ILL_START_WALK_V4(&ctx, ipst); 7176 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 7177 if (IS_UNDER_IPMP(ill)) 7178 continue; 7179 for (ipif = ill->ill_ipif; ipif != NULL; 7180 ipif = ipif->ipif_next) { 7181 if (ipif->ipif_zoneid == zoneid || 7182 ipif->ipif_zoneid == ALL_ZONES) 7183 numifs++; 7184 } 7185 } 7186 rw_exit(&ipst->ips_ill_g_lock); 7187 return (numifs); 7188 } 7189 7190 /* 7191 * Return the total number of ipifs. 
7192 */ 7193 static uint_t 7194 ip_get_numlifs(int family, int lifn_flags, zoneid_t zoneid, ip_stack_t *ipst) 7195 { 7196 uint_t numifs = 0; 7197 ill_t *ill; 7198 ipif_t *ipif; 7199 ill_walk_context_t ctx; 7200 7201 ip1dbg(("ip_get_numlifs(%d %u %d)\n", family, lifn_flags, (int)zoneid)); 7202 7203 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7204 if (family == AF_INET) 7205 ill = ILL_START_WALK_V4(&ctx, ipst); 7206 else if (family == AF_INET6) 7207 ill = ILL_START_WALK_V6(&ctx, ipst); 7208 else 7209 ill = ILL_START_WALK_ALL(&ctx, ipst); 7210 7211 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 7212 if (IS_UNDER_IPMP(ill) && !(lifn_flags & LIFC_UNDER_IPMP)) 7213 continue; 7214 7215 for (ipif = ill->ill_ipif; ipif != NULL; 7216 ipif = ipif->ipif_next) { 7217 if ((ipif->ipif_flags & IPIF_NOXMIT) && 7218 !(lifn_flags & LIFC_NOXMIT)) 7219 continue; 7220 if ((ipif->ipif_flags & IPIF_TEMPORARY) && 7221 !(lifn_flags & LIFC_TEMPORARY)) 7222 continue; 7223 if (((ipif->ipif_flags & 7224 (IPIF_NOXMIT|IPIF_NOLOCAL| 7225 IPIF_DEPRECATED)) || 7226 IS_LOOPBACK(ill) || 7227 !(ipif->ipif_flags & IPIF_UP)) && 7228 (lifn_flags & LIFC_EXTERNAL_SOURCE)) 7229 continue; 7230 7231 if (zoneid != ipif->ipif_zoneid && 7232 ipif->ipif_zoneid != ALL_ZONES && 7233 (zoneid != GLOBAL_ZONEID || 7234 !(lifn_flags & LIFC_ALLZONES))) 7235 continue; 7236 7237 numifs++; 7238 } 7239 } 7240 rw_exit(&ipst->ips_ill_g_lock); 7241 return (numifs); 7242 } 7243 7244 uint_t 7245 ip_get_lifsrcofnum(ill_t *ill) 7246 { 7247 uint_t numifs = 0; 7248 ill_t *ill_head = ill; 7249 ip_stack_t *ipst = ill->ill_ipst; 7250 7251 /* 7252 * ill_g_usesrc_lock protects ill_usesrc_grp_next, for example, some 7253 * other thread may be trying to relink the ILLs in this usesrc group 7254 * and adjusting the ill_usesrc_grp_next pointers 7255 */ 7256 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER); 7257 if ((ill->ill_usesrc_ifindex == 0) && 7258 (ill->ill_usesrc_grp_next != NULL)) { 7259 for (; (ill != NULL) && 
(ill->ill_usesrc_grp_next != ill_head); 7260 ill = ill->ill_usesrc_grp_next) 7261 numifs++; 7262 } 7263 rw_exit(&ipst->ips_ill_g_usesrc_lock); 7264 7265 return (numifs); 7266 } 7267 7268 /* Null values are passed in for ipif, sin, and ifreq */ 7269 /* ARGSUSED */ 7270 int 7271 ip_sioctl_get_ifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 7272 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7273 { 7274 int *nump; 7275 conn_t *connp = Q_TO_CONN(q); 7276 7277 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 7278 7279 /* Existence of b_cont->b_cont checked in ip_wput_nondata */ 7280 nump = (int *)mp->b_cont->b_cont->b_rptr; 7281 7282 *nump = ip_get_numifs(connp->conn_zoneid, 7283 connp->conn_netstack->netstack_ip); 7284 ip1dbg(("ip_sioctl_get_ifnum numifs %d", *nump)); 7285 return (0); 7286 } 7287 7288 /* Null values are passed in for ipif, sin, and ifreq */ 7289 /* ARGSUSED */ 7290 int 7291 ip_sioctl_get_lifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, 7292 queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7293 { 7294 struct lifnum *lifn; 7295 mblk_t *mp1; 7296 conn_t *connp = Q_TO_CONN(q); 7297 7298 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 7299 7300 /* Existence checked in ip_wput_nondata */ 7301 mp1 = mp->b_cont->b_cont; 7302 7303 lifn = (struct lifnum *)mp1->b_rptr; 7304 switch (lifn->lifn_family) { 7305 case AF_UNSPEC: 7306 case AF_INET: 7307 case AF_INET6: 7308 break; 7309 default: 7310 return (EAFNOSUPPORT); 7311 } 7312 7313 lifn->lifn_count = ip_get_numlifs(lifn->lifn_family, lifn->lifn_flags, 7314 connp->conn_zoneid, connp->conn_netstack->netstack_ip); 7315 ip1dbg(("ip_sioctl_get_lifnum numifs %d", lifn->lifn_count)); 7316 return (0); 7317 } 7318 7319 /* ARGSUSED */ 7320 int 7321 ip_sioctl_get_ifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 7322 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7323 { 7324 STRUCT_HANDLE(ifconf, ifc); 7325 mblk_t *mp1; 7326 struct iocblk *iocp; 7327 struct 
ifreq *ifr; 7328 ill_walk_context_t ctx; 7329 ill_t *ill; 7330 ipif_t *ipif; 7331 struct sockaddr_in *sin; 7332 int32_t ifclen; 7333 zoneid_t zoneid; 7334 ip_stack_t *ipst = CONNQ_TO_IPST(q); 7335 7336 ASSERT(q->q_next == NULL); /* not valid ioctls for ip as a module */ 7337 7338 ip1dbg(("ip_sioctl_get_ifconf")); 7339 /* Existence verified in ip_wput_nondata */ 7340 mp1 = mp->b_cont->b_cont; 7341 iocp = (struct iocblk *)mp->b_rptr; 7342 zoneid = Q_TO_CONN(q)->conn_zoneid; 7343 7344 /* 7345 * The original SIOCGIFCONF passed in a struct ifconf which specified 7346 * the user buffer address and length into which the list of struct 7347 * ifreqs was to be copied. Since AT&T Streams does not seem to 7348 * allow M_COPYOUT to be used in conjunction with I_STR IOCTLS, 7349 * the SIOCGIFCONF operation was redefined to simply provide 7350 * a large output buffer into which we are supposed to jam the ifreq 7351 * array. The same ioctl command code was used, despite the fact that 7352 * both the applications and the kernel code had to change, thus making 7353 * it impossible to support both interfaces. 7354 * 7355 * For reasons not good enough to try to explain, the following 7356 * algorithm is used for deciding what to do with one of these: 7357 * If the IOCTL comes in as an I_STR, it is assumed to be of the new 7358 * form with the output buffer coming down as the continuation message. 7359 * If it arrives as a TRANSPARENT IOCTL, it is assumed to be old style, 7360 * and we have to copy in the ifconf structure to find out how big the 7361 * output buffer is and where to copy out to. Sure no problem... 7362 * 7363 */ 7364 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, NULL); 7365 if ((mp1->b_wptr - mp1->b_rptr) == STRUCT_SIZE(ifc)) { 7366 int numifs = 0; 7367 size_t ifc_bufsize; 7368 7369 /* 7370 * Must be (better be!) continuation of a TRANSPARENT 7371 * IOCTL. We just copied in the ifconf structure. 
7372 */ 7373 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, 7374 (struct ifconf *)mp1->b_rptr); 7375 7376 /* 7377 * Allocate a buffer to hold requested information. 7378 * 7379 * If ifc_len is larger than what is needed, we only 7380 * allocate what we will use. 7381 * 7382 * If ifc_len is smaller than what is needed, return 7383 * EINVAL. 7384 * 7385 * XXX: the ill_t structure can hava 2 counters, for 7386 * v4 and v6 (not just ill_ipif_up_count) to store the 7387 * number of interfaces for a device, so we don't need 7388 * to count them here... 7389 */ 7390 numifs = ip_get_numifs(zoneid, ipst); 7391 7392 ifclen = STRUCT_FGET(ifc, ifc_len); 7393 ifc_bufsize = numifs * sizeof (struct ifreq); 7394 if (ifc_bufsize > ifclen) { 7395 if (iocp->ioc_cmd == O_SIOCGIFCONF) { 7396 /* old behaviour */ 7397 return (EINVAL); 7398 } else { 7399 ifc_bufsize = ifclen; 7400 } 7401 } 7402 7403 mp1 = mi_copyout_alloc(q, mp, 7404 STRUCT_FGETP(ifc, ifc_buf), ifc_bufsize, B_FALSE); 7405 if (mp1 == NULL) 7406 return (ENOMEM); 7407 7408 mp1->b_wptr = mp1->b_rptr + ifc_bufsize; 7409 } 7410 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr); 7411 /* 7412 * the SIOCGIFCONF ioctl only knows about 7413 * IPv4 addresses, so don't try to tell 7414 * it about interfaces with IPv6-only 7415 * addresses. 
(Last parm 'isv6' is B_FALSE) 7416 */ 7417 7418 ifr = (struct ifreq *)mp1->b_rptr; 7419 7420 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7421 ill = ILL_START_WALK_V4(&ctx, ipst); 7422 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 7423 if (IS_UNDER_IPMP(ill)) 7424 continue; 7425 for (ipif = ill->ill_ipif; ipif != NULL; 7426 ipif = ipif->ipif_next) { 7427 if (zoneid != ipif->ipif_zoneid && 7428 ipif->ipif_zoneid != ALL_ZONES) 7429 continue; 7430 if ((uchar_t *)&ifr[1] > mp1->b_wptr) { 7431 if (iocp->ioc_cmd == O_SIOCGIFCONF) { 7432 /* old behaviour */ 7433 rw_exit(&ipst->ips_ill_g_lock); 7434 return (EINVAL); 7435 } else { 7436 goto if_copydone; 7437 } 7438 } 7439 ipif_get_name(ipif, ifr->ifr_name, 7440 sizeof (ifr->ifr_name)); 7441 sin = (sin_t *)&ifr->ifr_addr; 7442 *sin = sin_null; 7443 sin->sin_family = AF_INET; 7444 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 7445 ifr++; 7446 } 7447 } 7448 if_copydone: 7449 rw_exit(&ipst->ips_ill_g_lock); 7450 mp1->b_wptr = (uchar_t *)ifr; 7451 7452 if (STRUCT_BUF(ifc) != NULL) { 7453 STRUCT_FSET(ifc, ifc_len, 7454 (int)((uchar_t *)ifr - mp1->b_rptr)); 7455 } 7456 return (0); 7457 } 7458 7459 /* 7460 * Get the interfaces using the address hosted on the interface passed in, 7461 * as a source adddress 7462 */ 7463 /* ARGSUSED */ 7464 int 7465 ip_sioctl_get_lifsrcof(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 7466 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7467 { 7468 mblk_t *mp1; 7469 ill_t *ill, *ill_head; 7470 ipif_t *ipif, *orig_ipif; 7471 int numlifs = 0; 7472 size_t lifs_bufsize, lifsmaxlen; 7473 struct lifreq *lifr; 7474 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 7475 uint_t ifindex; 7476 zoneid_t zoneid; 7477 boolean_t isv6 = B_FALSE; 7478 struct sockaddr_in *sin; 7479 struct sockaddr_in6 *sin6; 7480 STRUCT_HANDLE(lifsrcof, lifs); 7481 ip_stack_t *ipst; 7482 7483 ipst = CONNQ_TO_IPST(q); 7484 7485 ASSERT(q->q_next == NULL); 7486 7487 zoneid = Q_TO_CONN(q)->conn_zoneid; 7488 7489 /* Existence verified 
in ip_wput_nondata */ 7490 mp1 = mp->b_cont->b_cont; 7491 7492 /* 7493 * Must be (better be!) continuation of a TRANSPARENT 7494 * IOCTL. We just copied in the lifsrcof structure. 7495 */ 7496 STRUCT_SET_HANDLE(lifs, iocp->ioc_flag, 7497 (struct lifsrcof *)mp1->b_rptr); 7498 7499 if (MBLKL(mp1) != STRUCT_SIZE(lifs)) 7500 return (EINVAL); 7501 7502 ifindex = STRUCT_FGET(lifs, lifs_ifindex); 7503 isv6 = (Q_TO_CONN(q))->conn_family == AF_INET6; 7504 ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid, ipst); 7505 if (ipif == NULL) { 7506 ip1dbg(("ip_sioctl_get_lifsrcof: no ipif for ifindex %d\n", 7507 ifindex)); 7508 return (ENXIO); 7509 } 7510 7511 /* Allocate a buffer to hold requested information */ 7512 numlifs = ip_get_lifsrcofnum(ipif->ipif_ill); 7513 lifs_bufsize = numlifs * sizeof (struct lifreq); 7514 lifsmaxlen = STRUCT_FGET(lifs, lifs_maxlen); 7515 /* The actual size needed is always returned in lifs_len */ 7516 STRUCT_FSET(lifs, lifs_len, lifs_bufsize); 7517 7518 /* If the amount we need is more than what is passed in, abort */ 7519 if (lifs_bufsize > lifsmaxlen || lifs_bufsize == 0) { 7520 ipif_refrele(ipif); 7521 return (0); 7522 } 7523 7524 mp1 = mi_copyout_alloc(q, mp, 7525 STRUCT_FGETP(lifs, lifs_buf), lifs_bufsize, B_FALSE); 7526 if (mp1 == NULL) { 7527 ipif_refrele(ipif); 7528 return (ENOMEM); 7529 } 7530 7531 mp1->b_wptr = mp1->b_rptr + lifs_bufsize; 7532 bzero(mp1->b_rptr, lifs_bufsize); 7533 7534 lifr = (struct lifreq *)mp1->b_rptr; 7535 7536 ill = ill_head = ipif->ipif_ill; 7537 orig_ipif = ipif; 7538 7539 /* ill_g_usesrc_lock protects ill_usesrc_grp_next */ 7540 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER); 7541 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7542 7543 ill = ill->ill_usesrc_grp_next; /* start from next ill */ 7544 for (; (ill != NULL) && (ill != ill_head); 7545 ill = ill->ill_usesrc_grp_next) { 7546 7547 if ((uchar_t *)&lifr[1] > mp1->b_wptr) 7548 break; 7549 7550 ipif = ill->ill_ipif; 7551 ipif_get_name(ipif, lifr->lifr_name, 
sizeof (lifr->lifr_name)); 7552 if (ipif->ipif_isv6) { 7553 sin6 = (sin6_t *)&lifr->lifr_addr; 7554 *sin6 = sin6_null; 7555 sin6->sin6_family = AF_INET6; 7556 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 7557 lifr->lifr_addrlen = ip_mask_to_plen_v6( 7558 &ipif->ipif_v6net_mask); 7559 } else { 7560 sin = (sin_t *)&lifr->lifr_addr; 7561 *sin = sin_null; 7562 sin->sin_family = AF_INET; 7563 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 7564 lifr->lifr_addrlen = ip_mask_to_plen( 7565 ipif->ipif_net_mask); 7566 } 7567 lifr++; 7568 } 7569 rw_exit(&ipst->ips_ill_g_usesrc_lock); 7570 rw_exit(&ipst->ips_ill_g_lock); 7571 ipif_refrele(orig_ipif); 7572 mp1->b_wptr = (uchar_t *)lifr; 7573 STRUCT_FSET(lifs, lifs_len, (int)((uchar_t *)lifr - mp1->b_rptr)); 7574 7575 return (0); 7576 } 7577 7578 /* ARGSUSED */ 7579 int 7580 ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 7581 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7582 { 7583 mblk_t *mp1; 7584 int list; 7585 ill_t *ill; 7586 ipif_t *ipif; 7587 int flags; 7588 int numlifs = 0; 7589 size_t lifc_bufsize; 7590 struct lifreq *lifr; 7591 sa_family_t family; 7592 struct sockaddr_in *sin; 7593 struct sockaddr_in6 *sin6; 7594 ill_walk_context_t ctx; 7595 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 7596 int32_t lifclen; 7597 zoneid_t zoneid; 7598 STRUCT_HANDLE(lifconf, lifc); 7599 ip_stack_t *ipst = CONNQ_TO_IPST(q); 7600 7601 ip1dbg(("ip_sioctl_get_lifconf")); 7602 7603 ASSERT(q->q_next == NULL); 7604 7605 zoneid = Q_TO_CONN(q)->conn_zoneid; 7606 7607 /* Existence verified in ip_wput_nondata */ 7608 mp1 = mp->b_cont->b_cont; 7609 7610 /* 7611 * An extended version of SIOCGIFCONF that takes an 7612 * additional address family and flags field. 7613 * AF_UNSPEC retrieve both IPv4 and IPv6. 7614 * Unless LIFC_NOXMIT is specified the IPIF_NOXMIT 7615 * interfaces are omitted. 7616 * Similarly, IPIF_TEMPORARY interfaces are omitted 7617 * unless LIFC_TEMPORARY is specified. 
7618 * If LIFC_EXTERNAL_SOURCE is specified, IPIF_NOXMIT, 7619 * IPIF_NOLOCAL, PHYI_LOOPBACK, IPIF_DEPRECATED and 7620 * not IPIF_UP interfaces are omitted. LIFC_EXTERNAL_SOURCE 7621 * has priority over LIFC_NOXMIT. 7622 */ 7623 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, NULL); 7624 7625 if ((mp1->b_wptr - mp1->b_rptr) != STRUCT_SIZE(lifc)) 7626 return (EINVAL); 7627 7628 /* 7629 * Must be (better be!) continuation of a TRANSPARENT 7630 * IOCTL. We just copied in the lifconf structure. 7631 */ 7632 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, (struct lifconf *)mp1->b_rptr); 7633 7634 family = STRUCT_FGET(lifc, lifc_family); 7635 flags = STRUCT_FGET(lifc, lifc_flags); 7636 7637 switch (family) { 7638 case AF_UNSPEC: 7639 /* 7640 * walk all ILL's. 7641 */ 7642 list = MAX_G_HEADS; 7643 break; 7644 case AF_INET: 7645 /* 7646 * walk only IPV4 ILL's. 7647 */ 7648 list = IP_V4_G_HEAD; 7649 break; 7650 case AF_INET6: 7651 /* 7652 * walk only IPV6 ILL's. 7653 */ 7654 list = IP_V6_G_HEAD; 7655 break; 7656 default: 7657 return (EAFNOSUPPORT); 7658 } 7659 7660 /* 7661 * Allocate a buffer to hold requested information. 7662 * 7663 * If lifc_len is larger than what is needed, we only 7664 * allocate what we will use. 7665 * 7666 * If lifc_len is smaller than what is needed, return 7667 * EINVAL. 
7668 */ 7669 numlifs = ip_get_numlifs(family, flags, zoneid, ipst); 7670 lifc_bufsize = numlifs * sizeof (struct lifreq); 7671 lifclen = STRUCT_FGET(lifc, lifc_len); 7672 if (lifc_bufsize > lifclen) { 7673 if (iocp->ioc_cmd == O_SIOCGLIFCONF) 7674 return (EINVAL); 7675 else 7676 lifc_bufsize = lifclen; 7677 } 7678 7679 mp1 = mi_copyout_alloc(q, mp, 7680 STRUCT_FGETP(lifc, lifc_buf), lifc_bufsize, B_FALSE); 7681 if (mp1 == NULL) 7682 return (ENOMEM); 7683 7684 mp1->b_wptr = mp1->b_rptr + lifc_bufsize; 7685 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr); 7686 7687 lifr = (struct lifreq *)mp1->b_rptr; 7688 7689 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7690 ill = ill_first(list, list, &ctx, ipst); 7691 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 7692 if (IS_UNDER_IPMP(ill) && !(flags & LIFC_UNDER_IPMP)) 7693 continue; 7694 7695 for (ipif = ill->ill_ipif; ipif != NULL; 7696 ipif = ipif->ipif_next) { 7697 if ((ipif->ipif_flags & IPIF_NOXMIT) && 7698 !(flags & LIFC_NOXMIT)) 7699 continue; 7700 7701 if ((ipif->ipif_flags & IPIF_TEMPORARY) && 7702 !(flags & LIFC_TEMPORARY)) 7703 continue; 7704 7705 if (((ipif->ipif_flags & 7706 (IPIF_NOXMIT|IPIF_NOLOCAL| 7707 IPIF_DEPRECATED)) || 7708 IS_LOOPBACK(ill) || 7709 !(ipif->ipif_flags & IPIF_UP)) && 7710 (flags & LIFC_EXTERNAL_SOURCE)) 7711 continue; 7712 7713 if (zoneid != ipif->ipif_zoneid && 7714 ipif->ipif_zoneid != ALL_ZONES && 7715 (zoneid != GLOBAL_ZONEID || 7716 !(flags & LIFC_ALLZONES))) 7717 continue; 7718 7719 if ((uchar_t *)&lifr[1] > mp1->b_wptr) { 7720 if (iocp->ioc_cmd == O_SIOCGLIFCONF) { 7721 rw_exit(&ipst->ips_ill_g_lock); 7722 return (EINVAL); 7723 } else { 7724 goto lif_copydone; 7725 } 7726 } 7727 7728 ipif_get_name(ipif, lifr->lifr_name, 7729 sizeof (lifr->lifr_name)); 7730 lifr->lifr_type = ill->ill_type; 7731 if (ipif->ipif_isv6) { 7732 sin6 = (sin6_t *)&lifr->lifr_addr; 7733 *sin6 = sin6_null; 7734 sin6->sin6_family = AF_INET6; 7735 sin6->sin6_addr = 7736 ipif->ipif_v6lcl_addr; 7737 
lifr->lifr_addrlen = 7738 ip_mask_to_plen_v6( 7739 &ipif->ipif_v6net_mask); 7740 } else { 7741 sin = (sin_t *)&lifr->lifr_addr; 7742 *sin = sin_null; 7743 sin->sin_family = AF_INET; 7744 sin->sin_addr.s_addr = 7745 ipif->ipif_lcl_addr; 7746 lifr->lifr_addrlen = 7747 ip_mask_to_plen( 7748 ipif->ipif_net_mask); 7749 } 7750 lifr++; 7751 } 7752 } 7753 lif_copydone: 7754 rw_exit(&ipst->ips_ill_g_lock); 7755 7756 mp1->b_wptr = (uchar_t *)lifr; 7757 if (STRUCT_BUF(lifc) != NULL) { 7758 STRUCT_FSET(lifc, lifc_len, 7759 (int)((uchar_t *)lifr - mp1->b_rptr)); 7760 } 7761 return (0); 7762 } 7763 7764 static void 7765 ip_sioctl_ip6addrpolicy(queue_t *q, mblk_t *mp) 7766 { 7767 ip6_asp_t *table; 7768 size_t table_size; 7769 mblk_t *data_mp; 7770 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 7771 ip_stack_t *ipst; 7772 7773 if (q->q_next == NULL) 7774 ipst = CONNQ_TO_IPST(q); 7775 else 7776 ipst = ILLQ_TO_IPST(q); 7777 7778 /* These two ioctls are I_STR only */ 7779 if (iocp->ioc_count == TRANSPARENT) { 7780 miocnak(q, mp, 0, EINVAL); 7781 return; 7782 } 7783 7784 data_mp = mp->b_cont; 7785 if (data_mp == NULL) { 7786 /* The user passed us a NULL argument */ 7787 table = NULL; 7788 table_size = iocp->ioc_count; 7789 } else { 7790 /* 7791 * The user provided a table. The stream head 7792 * may have copied in the user data in chunks, 7793 * so make sure everything is pulled up 7794 * properly. 
7795 */ 7796 if (MBLKL(data_mp) < iocp->ioc_count) { 7797 mblk_t *new_data_mp; 7798 if ((new_data_mp = msgpullup(data_mp, -1)) == 7799 NULL) { 7800 miocnak(q, mp, 0, ENOMEM); 7801 return; 7802 } 7803 freemsg(data_mp); 7804 data_mp = new_data_mp; 7805 mp->b_cont = data_mp; 7806 } 7807 table = (ip6_asp_t *)data_mp->b_rptr; 7808 table_size = iocp->ioc_count; 7809 } 7810 7811 switch (iocp->ioc_cmd) { 7812 case SIOCGIP6ADDRPOLICY: 7813 iocp->ioc_rval = ip6_asp_get(table, table_size, ipst); 7814 if (iocp->ioc_rval == -1) 7815 iocp->ioc_error = EINVAL; 7816 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4 7817 else if (table != NULL && 7818 (iocp->ioc_flag & IOC_MODELS) == IOC_ILP32) { 7819 ip6_asp_t *src = table; 7820 ip6_asp32_t *dst = (void *)table; 7821 int count = table_size / sizeof (ip6_asp_t); 7822 int i; 7823 7824 /* 7825 * We need to do an in-place shrink of the array 7826 * to match the alignment attributes of the 7827 * 32-bit ABI looking at it. 7828 */ 7829 /* LINTED: logical expression always true: op "||" */ 7830 ASSERT(sizeof (*src) > sizeof (*dst)); 7831 for (i = 1; i < count; i++) 7832 bcopy(src + i, dst + i, sizeof (*dst)); 7833 } 7834 #endif 7835 break; 7836 7837 case SIOCSIP6ADDRPOLICY: 7838 ASSERT(mp->b_prev == NULL); 7839 mp->b_prev = (void *)q; 7840 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4 7841 /* 7842 * We pass in the datamodel here so that the ip6_asp_replace() 7843 * routine can handle converting from 32-bit to native formats 7844 * where necessary. 7845 * 7846 * A better way to handle this might be to convert the inbound 7847 * data structure here, and hang it off a new 'mp'; thus the 7848 * ip6_asp_replace() logic would always be dealing with native 7849 * format data structures.. 7850 * 7851 * (An even simpler way to handle these ioctls is to just 7852 * add a 32-bit trailing 'pad' field to the ip6_asp_t structure 7853 * and just recompile everything that depends on it.) 
7854 */ 7855 #endif 7856 ip6_asp_replace(mp, table, table_size, B_FALSE, ipst, 7857 iocp->ioc_flag & IOC_MODELS); 7858 return; 7859 } 7860 7861 DB_TYPE(mp) = (iocp->ioc_error == 0) ? M_IOCACK : M_IOCNAK; 7862 qreply(q, mp); 7863 } 7864 7865 static void 7866 ip_sioctl_dstinfo(queue_t *q, mblk_t *mp) 7867 { 7868 mblk_t *data_mp; 7869 struct dstinforeq *dir; 7870 uint8_t *end, *cur; 7871 in6_addr_t *daddr, *saddr; 7872 ipaddr_t v4daddr; 7873 ire_t *ire; 7874 ipaddr_t v4setsrc; 7875 in6_addr_t v6setsrc; 7876 char *slabel, *dlabel; 7877 boolean_t isipv4; 7878 int match_ire; 7879 ill_t *dst_ill; 7880 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 7881 conn_t *connp = Q_TO_CONN(q); 7882 zoneid_t zoneid = IPCL_ZONEID(connp); 7883 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 7884 uint64_t ipif_flags; 7885 7886 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 7887 7888 /* 7889 * This ioctl is I_STR only, and must have a 7890 * data mblk following the M_IOCTL mblk. 7891 */ 7892 data_mp = mp->b_cont; 7893 if (iocp->ioc_count == TRANSPARENT || data_mp == NULL) { 7894 miocnak(q, mp, 0, EINVAL); 7895 return; 7896 } 7897 7898 if (MBLKL(data_mp) < iocp->ioc_count) { 7899 mblk_t *new_data_mp; 7900 7901 if ((new_data_mp = msgpullup(data_mp, -1)) == NULL) { 7902 miocnak(q, mp, 0, ENOMEM); 7903 return; 7904 } 7905 freemsg(data_mp); 7906 data_mp = new_data_mp; 7907 mp->b_cont = data_mp; 7908 } 7909 match_ire = MATCH_IRE_DSTONLY; 7910 7911 for (cur = data_mp->b_rptr, end = data_mp->b_wptr; 7912 end - cur >= sizeof (struct dstinforeq); 7913 cur += sizeof (struct dstinforeq)) { 7914 dir = (struct dstinforeq *)cur; 7915 daddr = &dir->dir_daddr; 7916 saddr = &dir->dir_saddr; 7917 7918 /* 7919 * ip_addr_scope_v6() and ip6_asp_lookup() handle 7920 * v4 mapped addresses; ire_ftable_lookup_v6() 7921 * and ip_select_source_v6() do not. 
7922 */ 7923 dir->dir_dscope = ip_addr_scope_v6(daddr); 7924 dlabel = ip6_asp_lookup(daddr, &dir->dir_precedence, ipst); 7925 7926 isipv4 = IN6_IS_ADDR_V4MAPPED(daddr); 7927 if (isipv4) { 7928 IN6_V4MAPPED_TO_IPADDR(daddr, v4daddr); 7929 v4setsrc = INADDR_ANY; 7930 ire = ire_route_recursive_v4(v4daddr, 0, NULL, zoneid, 7931 NULL, match_ire, IRR_ALLOCATE, 0, ipst, &v4setsrc, 7932 NULL, NULL); 7933 } else { 7934 v6setsrc = ipv6_all_zeros; 7935 ire = ire_route_recursive_v6(daddr, 0, NULL, zoneid, 7936 NULL, match_ire, IRR_ALLOCATE, 0, ipst, &v6setsrc, 7937 NULL, NULL); 7938 } 7939 ASSERT(ire != NULL); 7940 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 7941 ire_refrele(ire); 7942 dir->dir_dreachable = 0; 7943 7944 /* move on to next dst addr */ 7945 continue; 7946 } 7947 dir->dir_dreachable = 1; 7948 7949 dst_ill = ire_nexthop_ill(ire); 7950 if (dst_ill == NULL) { 7951 ire_refrele(ire); 7952 continue; 7953 } 7954 7955 /* With ipmp we most likely look at the ipmp ill here */ 7956 dir->dir_dmactype = dst_ill->ill_mactype; 7957 7958 if (isipv4) { 7959 ipaddr_t v4saddr; 7960 7961 if (ip_select_source_v4(dst_ill, v4setsrc, v4daddr, 7962 connp->conn_ixa->ixa_multicast_ifaddr, zoneid, ipst, 7963 &v4saddr, NULL, &ipif_flags) != 0) { 7964 v4saddr = INADDR_ANY; 7965 ipif_flags = 0; 7966 } 7967 IN6_IPADDR_TO_V4MAPPED(v4saddr, saddr); 7968 } else { 7969 if (ip_select_source_v6(dst_ill, &v6setsrc, daddr, 7970 zoneid, ipst, B_FALSE, IPV6_PREFER_SRC_DEFAULT, 7971 saddr, NULL, &ipif_flags) != 0) { 7972 *saddr = ipv6_all_zeros; 7973 ipif_flags = 0; 7974 } 7975 } 7976 7977 dir->dir_sscope = ip_addr_scope_v6(saddr); 7978 slabel = ip6_asp_lookup(saddr, NULL, ipst); 7979 dir->dir_labelmatch = ip6_asp_labelcmp(dlabel, slabel); 7980 dir->dir_sdeprecated = (ipif_flags & IPIF_DEPRECATED) ? 1 : 0; 7981 ire_refrele(ire); 7982 ill_refrele(dst_ill); 7983 } 7984 miocack(q, mp, iocp->ioc_count, 0); 7985 } 7986 7987 /* 7988 * Check if this is an address assigned to this machine. 
7989 * Skips interfaces that are down by using ire checks. 7990 * Translates mapped addresses to v4 addresses and then 7991 * treats them as such, returning true if the v4 address 7992 * associated with this mapped address is configured. 7993 * Note: Applications will have to be careful what they do 7994 * with the response; use of mapped addresses limits 7995 * what can be done with the socket, especially with 7996 * respect to socket options and ioctls - neither IPv4 7997 * options nor IPv6 sticky options/ancillary data options 7998 * may be used. 7999 */ 8000 /* ARGSUSED */ 8001 int 8002 ip_sioctl_tmyaddr(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 8003 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 8004 { 8005 struct sioc_addrreq *sia; 8006 sin_t *sin; 8007 ire_t *ire; 8008 mblk_t *mp1; 8009 zoneid_t zoneid; 8010 ip_stack_t *ipst; 8011 8012 ip1dbg(("ip_sioctl_tmyaddr")); 8013 8014 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 8015 zoneid = Q_TO_CONN(q)->conn_zoneid; 8016 ipst = CONNQ_TO_IPST(q); 8017 8018 /* Existence verified in ip_wput_nondata */ 8019 mp1 = mp->b_cont->b_cont; 8020 sia = (struct sioc_addrreq *)mp1->b_rptr; 8021 sin = (sin_t *)&sia->sa_addr; 8022 switch (sin->sin_family) { 8023 case AF_INET6: { 8024 sin6_t *sin6 = (sin6_t *)sin; 8025 8026 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 8027 ipaddr_t v4_addr; 8028 8029 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, 8030 v4_addr); 8031 ire = ire_ftable_lookup_v4(v4_addr, 0, 0, 8032 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, NULL, 8033 MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL); 8034 } else { 8035 in6_addr_t v6addr; 8036 8037 v6addr = sin6->sin6_addr; 8038 ire = ire_ftable_lookup_v6(&v6addr, 0, 0, 8039 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, NULL, 8040 MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL); 8041 } 8042 break; 8043 } 8044 case AF_INET: { 8045 ipaddr_t v4addr; 8046 8047 v4addr = sin->sin_addr.s_addr; 8048 ire = ire_ftable_lookup_v4(v4addr, 0, 0, 8049 
IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 8050 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL); 8051 break; 8052 } 8053 default: 8054 return (EAFNOSUPPORT); 8055 } 8056 if (ire != NULL) { 8057 sia->sa_res = 1; 8058 ire_refrele(ire); 8059 } else { 8060 sia->sa_res = 0; 8061 } 8062 return (0); 8063 } 8064 8065 /* 8066 * Check if this is an address assigned on-link i.e. neighbor, 8067 * and makes sure it's reachable from the current zone. 8068 * Returns true for my addresses as well. 8069 * Translates mapped addresses to v4 addresses and then 8070 * treats them as such, returning true if the v4 address 8071 * associated with this mapped address is configured. 8072 * Note: Applications will have to be careful what they do 8073 * with the response; use of mapped addresses limits 8074 * what can be done with the socket, especially with 8075 * respect to socket options and ioctls - neither IPv4 8076 * options nor IPv6 sticky options/ancillary data options 8077 * may be used. 8078 */ 8079 /* ARGSUSED */ 8080 int 8081 ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 8082 ip_ioctl_cmd_t *ipip, void *duymmy_ifreq) 8083 { 8084 struct sioc_addrreq *sia; 8085 sin_t *sin; 8086 mblk_t *mp1; 8087 ire_t *ire = NULL; 8088 zoneid_t zoneid; 8089 ip_stack_t *ipst; 8090 8091 ip1dbg(("ip_sioctl_tonlink")); 8092 8093 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 8094 zoneid = Q_TO_CONN(q)->conn_zoneid; 8095 ipst = CONNQ_TO_IPST(q); 8096 8097 /* Existence verified in ip_wput_nondata */ 8098 mp1 = mp->b_cont->b_cont; 8099 sia = (struct sioc_addrreq *)mp1->b_rptr; 8100 sin = (sin_t *)&sia->sa_addr; 8101 8102 /* 8103 * We check for IRE_ONLINK and exclude IRE_BROADCAST|IRE_MULTICAST 8104 * to make sure we only look at on-link unicast address. 
8105 */ 8106 switch (sin->sin_family) { 8107 case AF_INET6: { 8108 sin6_t *sin6 = (sin6_t *)sin; 8109 8110 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 8111 ipaddr_t v4_addr; 8112 8113 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, 8114 v4_addr); 8115 if (!CLASSD(v4_addr)) { 8116 ire = ire_ftable_lookup_v4(v4_addr, 0, 0, 0, 8117 NULL, zoneid, NULL, MATCH_IRE_DSTONLY, 8118 0, ipst, NULL); 8119 } 8120 } else { 8121 in6_addr_t v6addr; 8122 8123 v6addr = sin6->sin6_addr; 8124 if (!IN6_IS_ADDR_MULTICAST(&v6addr)) { 8125 ire = ire_ftable_lookup_v6(&v6addr, 0, 0, 0, 8126 NULL, zoneid, NULL, MATCH_IRE_DSTONLY, 0, 8127 ipst, NULL); 8128 } 8129 } 8130 break; 8131 } 8132 case AF_INET: { 8133 ipaddr_t v4addr; 8134 8135 v4addr = sin->sin_addr.s_addr; 8136 if (!CLASSD(v4addr)) { 8137 ire = ire_ftable_lookup_v4(v4addr, 0, 0, 0, NULL, 8138 zoneid, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL); 8139 } 8140 break; 8141 } 8142 default: 8143 return (EAFNOSUPPORT); 8144 } 8145 sia->sa_res = 0; 8146 if (ire != NULL) { 8147 ASSERT(!(ire->ire_type & IRE_MULTICAST)); 8148 8149 if ((ire->ire_type & IRE_ONLINK) && 8150 !(ire->ire_type & IRE_BROADCAST)) 8151 sia->sa_res = 1; 8152 ire_refrele(ire); 8153 } 8154 return (0); 8155 } 8156 8157 /* 8158 * TBD: implement when kernel maintaines a list of site prefixes. 8159 */ 8160 /* ARGSUSED */ 8161 int 8162 ip_sioctl_tmysite(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 8163 ip_ioctl_cmd_t *ipip, void *ifreq) 8164 { 8165 return (ENXIO); 8166 } 8167 8168 /* ARP IOCTLs. 
*/
/*
 * Handle the SIOC[GSD]ARP and SIOC[GSD]XARP ioctls for the IPv4 address in
 * `sin' on the ill of `ipif'.  Get requests are answered through
 * ip_sioctl_garp_reply(), delete requests remove the matching ncec, and set
 * requests install a static entry (plus an IPMP ARP entry when the ill is an
 * IPMP meta-interface).  Returns 0 or an errno; for the cases handled in the
 * final switch the value returned is the one stored in ioc_error.
 */
/* ARGSUSED */
int
ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
{
	ill_t		*ill = ipif->ipif_ill;
	ill_t		*proxy_ill = NULL;
	ipmp_arpent_t	*entp = NULL;
	ncec_t		*ncec = NULL;
	nce_t		*nce;
	boolean_t	proxyarp = B_FALSE;
	boolean_t	if_arp_ioctl = B_FALSE;
	struct iocblk	*iocp;
	conn_t		*connp;
	ip_stack_t	*ipst;
	ipaddr_t	ipaddr;
	uchar_t		*lladdr;
	int		arp_flags, flags, alength;
	int		err;

	ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
	connp = Q_TO_CONN(q);
	ipst = connp->conn_netstack->netstack_ip;
	iocp = (struct iocblk *)mp->b_rptr;

	if (ipip->ipi_cmd_type == XARP_CMD) {
		/* We have a chain - M_IOCTL-->MI_COPY_MBLK-->XARPREQ_MBLK */
		struct xarpreq *xar =
		    (struct xarpreq *)mp->b_cont->b_cont->b_rptr;

		arp_flags = xar->xarp_flags;
		lladdr = (uchar_t *)LLADDR(&xar->xarp_ha);
		if_arp_ioctl = (xar->xarp_ha.sdl_nlen != 0);
		/*
		 * Validate against user's link layer address length
		 * input and name and addr length limits.
		 */
		alength = ill->ill_phys_addr_length;
		if (ipip->ipi_cmd == SIOCSXARP &&
		    (alength != xar->xarp_ha.sdl_alen ||
		    (alength + xar->xarp_ha.sdl_nlen >
		    sizeof (xar->xarp_ha.sdl_data))))
			return (EINVAL);
	} else {
		/* We have a chain - M_IOCTL-->MI_COPY_MBLK-->ARPREQ_MBLK */
		struct arpreq *ar =
		    (struct arpreq *)mp->b_cont->b_cont->b_rptr;

		arp_flags = ar->arp_flags;
		lladdr = (uchar_t *)ar->arp_ha.sa_data;
		/*
		 * Theoretically, the sa_family could tell us what link
		 * layer type this operation is trying to deal with. By
		 * common usage AF_UNSPEC means ethernet. We'll assume
		 * any attempt to use the SIOC?ARP ioctls is for ethernet,
		 * for now. Our new SIOC*XARP ioctls can be used more
		 * generally.
		 *
		 * If the underlying media happens to have a non 6 byte
		 * address, arp module will fail set/get, but the del
		 * operation will succeed.
		 */
		alength = 6;
		if (ipip->ipi_cmd != SIOCDARP &&
		    alength != ill->ill_phys_addr_length)
			return (EINVAL);
	}

	/* Translate ATF* flags to NCE* flags */
	flags = 0;
	if (arp_flags & ATF_AUTHORITY)
		flags |= NCE_F_AUTHORITY;
	if (arp_flags & ATF_PERM)
		flags |= NCE_F_NONUD;	/* not subject to aging */
	if (arp_flags & ATF_PUBL)
		flags |= NCE_F_PUBLISH;

	/*
	 * IPMP ARP special handling:
	 *
	 * 1. Since ARP mappings must appear consistent across the group,
	 *    prohibit changing ARP mappings on the underlying interfaces.
	 *
	 * 2. Since ARP mappings for IPMP data addresses are maintained by
	 *    IP itself, prohibit changing them.
	 *
	 * 3. For proxy ARP, use a functioning hardware address in the group,
	 *    provided one exists.  If one doesn't, just add the entry as-is;
	 *    ipmp_illgrp_refresh_arpent() will refresh it if things change.
	 */
	if (IS_UNDER_IPMP(ill) &&
	    ipip->ipi_cmd != SIOCGARP && ipip->ipi_cmd != SIOCGXARP)
		return (EPERM);

	if (IS_IPMP(ill) &&
	    (ipip->ipi_cmd == SIOCSARP || ipip->ipi_cmd == SIOCSXARP)) {
		ipmp_illgrp_t *illg = ill->ill_grp;

		proxy_ill = ipmp_illgrp_find_ill(illg, lladdr, alength);
		if (proxy_ill != NULL) {
			proxyarp = B_TRUE;
			if (!ipmp_ill_is_active(proxy_ill))
				proxy_ill = ipmp_illgrp_next_ill(illg);
			if (proxy_ill != NULL)
				lladdr = proxy_ill->ill_phys_addr;
		}
	}

	ipaddr = sin->sin_addr.s_addr;
	/*
	 * don't match across illgrp per case (1) and (2).
	 * XXX use IS_IPMP(ill) like ndp_sioc_update?
	 */
	nce = nce_lookup_v4(ill, &ipaddr);
	if (nce != NULL)
		ncec = nce->nce_common;

	switch (iocp->ioc_cmd) {
	case SIOCDARP:
	case SIOCDXARP:
		/*
		 * Delete the NCE if any.
		 */
		if (ncec == NULL) {
			iocp->ioc_error = ENXIO;
			break;
		}
		/* Don't allow changes to arp mappings of local addresses. */
		if (NCE_MYADDR(ncec)) {
			nce_refrele(nce);
			return (ENOTSUP);
		}
		iocp->ioc_error = 0;

		/*
		 * Delete the nce_common which has ncec_ill set to ipmp_ill.
		 * This will delete all the nce entries on the under_ills.
		 */
		ncec_delete(ncec);
		/*
		 * Once the NCE has been deleted, then the ire_dep* consistency
		 * mechanism will find any IRE which depended on the now
		 * condemned NCE (as part of sending packets).
		 * That mechanism handles redirects by deleting redirects
		 * that refer to UNREACHABLE nces.
		 */
		break;

	case SIOCGARP:
	case SIOCGXARP:
		if (ncec == NULL) {
			iocp->ioc_error = ENXIO;
			break;
		}
		lladdr = ncec->ncec_lladdr;
		flags = ncec->ncec_flags;
		iocp->ioc_error = 0;
		ip_sioctl_garp_reply(mp, ncec->ncec_ill, lladdr, flags);
		break;

	case SIOCSARP:
	case SIOCSXARP:
		/* Don't allow changes to arp mappings of local addresses. */
		if (ncec != NULL && NCE_MYADDR(ncec)) {
			nce_refrele(nce);
			return (ENOTSUP);
		}

		/* static arp entries will undergo NUD if ATF_PERM is not set */
		flags |= NCE_F_STATIC;
		if (if_arp_ioctl) {
			/* A name was supplied: pass an ipif of this ill */
			ipif_t *named_ipif = ipif_get_next_ipif(NULL, ill);

			if (named_ipif != NULL) {
				ip_nce_lookup_and_update(&ipaddr, named_ipif,
				    ipst, lladdr, alength, flags);
				ipif_refrele(named_ipif);
			}
		} else {
			ip_nce_lookup_and_update(&ipaddr, NULL, ipst,
			    lladdr, alength, flags);
		}
		/* Drop the hold from the lookup above before re-adding */
		if (nce != NULL) {
			nce_refrele(nce);
			nce = NULL;
		}
		/*
		 * NCE_F_STATIC entries will be added in state ND_REACHABLE
		 * by nce_add_common()
		 */
		err = nce_lookup_then_add_v4(ill, lladdr,
		    ill->ill_phys_addr_length, &ipaddr, flags, ND_UNCHANGED,
		    &nce);
		if (err == EEXIST) {
			/* Entry already present: overwrite it in place */
			ncec = nce->nce_common;
			mutex_enter(&ncec->ncec_lock);
			ncec->ncec_state = ND_REACHABLE;
			ncec->ncec_flags = flags;
			nce_update(ncec, ND_UNCHANGED, lladdr);
			mutex_exit(&ncec->ncec_lock);
			err = 0;
		}
		if (nce != NULL) {
			nce_refrele(nce);
			nce = NULL;
		}
		if (IS_IPMP(ill) && err == 0) {
			entp = ipmp_illgrp_create_arpent(ill->ill_grp,
			    proxyarp, ipaddr, lladdr, ill->ill_phys_addr_length,
			    flags);
			if (entp == NULL || (proxyarp && proxy_ill == NULL)) {
				iocp->ioc_error = (entp == NULL ? ENOMEM : 0);
				break;
			}
		}
		iocp->ioc_error = err;
	}

	if (nce != NULL)
		nce_refrele(nce);

	/*
	 * If we created an IPMP ARP entry, mark that we've notified ARP.
	 */
	if (entp != NULL)
		ipmp_illgrp_mark_arpent(ill->ill_grp, entp);

	return (iocp->ioc_error);
}

/*
 * Parse an [x]arpreq structure coming down SIOC[GSD][X]ARP ioctls, identify
 * the associated sin and refhold and return the associated ipif via `ci'.
8409 */ 8410 int 8411 ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, 8412 cmd_info_t *ci) 8413 { 8414 mblk_t *mp1; 8415 sin_t *sin; 8416 conn_t *connp; 8417 ipif_t *ipif; 8418 ire_t *ire = NULL; 8419 ill_t *ill = NULL; 8420 boolean_t exists; 8421 ip_stack_t *ipst; 8422 struct arpreq *ar; 8423 struct xarpreq *xar; 8424 struct sockaddr_dl *sdl; 8425 8426 /* ioctl comes down on a conn */ 8427 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 8428 connp = Q_TO_CONN(q); 8429 if (connp->conn_family == AF_INET6) 8430 return (ENXIO); 8431 8432 ipst = connp->conn_netstack->netstack_ip; 8433 8434 /* Verified in ip_wput_nondata */ 8435 mp1 = mp->b_cont->b_cont; 8436 8437 if (ipip->ipi_cmd_type == XARP_CMD) { 8438 ASSERT(MBLKL(mp1) >= sizeof (struct xarpreq)); 8439 xar = (struct xarpreq *)mp1->b_rptr; 8440 sin = (sin_t *)&xar->xarp_pa; 8441 sdl = &xar->xarp_ha; 8442 8443 if (sdl->sdl_family != AF_LINK || sin->sin_family != AF_INET) 8444 return (ENXIO); 8445 if (sdl->sdl_nlen >= LIFNAMSIZ) 8446 return (EINVAL); 8447 } else { 8448 ASSERT(ipip->ipi_cmd_type == ARP_CMD); 8449 ASSERT(MBLKL(mp1) >= sizeof (struct arpreq)); 8450 ar = (struct arpreq *)mp1->b_rptr; 8451 sin = (sin_t *)&ar->arp_pa; 8452 } 8453 8454 if (ipip->ipi_cmd_type == XARP_CMD && sdl->sdl_nlen != 0) { 8455 ipif = ipif_lookup_on_name(sdl->sdl_data, sdl->sdl_nlen, 8456 B_FALSE, &exists, B_FALSE, ALL_ZONES, ipst); 8457 if (ipif == NULL) 8458 return (ENXIO); 8459 if (ipif->ipif_id != 0) { 8460 ipif_refrele(ipif); 8461 return (ENXIO); 8462 } 8463 } else { 8464 /* 8465 * Either an SIOC[DGS]ARP or an SIOC[DGS]XARP with an sdl_nlen 8466 * of 0: use the IP address to find the ipif. If the IP 8467 * address is an IPMP test address, ire_ftable_lookup() will 8468 * find the wrong ill, so we first do an ipif_lookup_addr(). 
8469 */ 8470 ipif = ipif_lookup_addr(sin->sin_addr.s_addr, NULL, ALL_ZONES, 8471 ipst); 8472 if (ipif == NULL) { 8473 ire = ire_ftable_lookup_v4(sin->sin_addr.s_addr, 8474 0, 0, IRE_IF_RESOLVER, NULL, ALL_ZONES, 8475 NULL, MATCH_IRE_TYPE, 0, ipst, NULL); 8476 if (ire == NULL || ((ill = ire->ire_ill) == NULL)) { 8477 if (ire != NULL) 8478 ire_refrele(ire); 8479 return (ENXIO); 8480 } 8481 ASSERT(ire != NULL && ill != NULL); 8482 ipif = ill->ill_ipif; 8483 ipif_refhold(ipif); 8484 ire_refrele(ire); 8485 } 8486 } 8487 8488 if (ipif->ipif_ill->ill_net_type != IRE_IF_RESOLVER) { 8489 ipif_refrele(ipif); 8490 return (ENXIO); 8491 } 8492 8493 ci->ci_sin = sin; 8494 ci->ci_ipif = ipif; 8495 return (0); 8496 } 8497 8498 /* 8499 * Link or unlink the illgrp on IPMP meta-interface `ill' depending on the 8500 * value of `ioccmd'. While an illgrp is linked to an ipmp_grp_t, it is 8501 * accessible from that ipmp_grp_t, which means SIOCSLIFGROUPNAME can look it 8502 * up and thus an ill can join that illgrp. 8503 * 8504 * We use I_PLINK/I_PUNLINK to do the link/unlink operations rather than 8505 * open()/close() primarily because close() is not allowed to fail or block 8506 * forever. On the other hand, I_PUNLINK *can* fail, and there's no reason 8507 * why anyone should ever need to I_PUNLINK an in-use IPMP stream. To ensure 8508 * symmetric behavior (e.g., doing an I_PLINK after and I_PUNLINK undoes the 8509 * I_PUNLINK) we defer linking to I_PLINK. Separately, we also fail attempts 8510 * to I_LINK since I_UNLINK is optional and we'd end up in an inconsistent 8511 * state if I_UNLINK didn't occur. 8512 * 8513 * Note that for each plumb/unplumb operation, we may end up here more than 8514 * once because of the way ifconfig works. However, it's OK to link the same 8515 * illgrp more than once, or unlink an illgrp that's already unlinked. 
8516 */ 8517 static int 8518 ip_sioctl_plink_ipmp(ill_t *ill, int ioccmd) 8519 { 8520 int err; 8521 ip_stack_t *ipst = ill->ill_ipst; 8522 8523 ASSERT(IS_IPMP(ill)); 8524 ASSERT(IAM_WRITER_ILL(ill)); 8525 8526 switch (ioccmd) { 8527 case I_LINK: 8528 return (ENOTSUP); 8529 8530 case I_PLINK: 8531 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 8532 ipmp_illgrp_link_grp(ill->ill_grp, ill->ill_phyint->phyint_grp); 8533 rw_exit(&ipst->ips_ipmp_lock); 8534 break; 8535 8536 case I_PUNLINK: 8537 /* 8538 * Require all UP ipifs be brought down prior to unlinking the 8539 * illgrp so any associated IREs (and other state) is torched. 8540 */ 8541 if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0) 8542 return (EBUSY); 8543 8544 /* 8545 * NOTE: We hold ipmp_lock across the unlink to prevent a race 8546 * with an SIOCSLIFGROUPNAME request from an ill trying to 8547 * join this group. Specifically: ills trying to join grab 8548 * ipmp_lock and bump a "pending join" counter checked by 8549 * ipmp_illgrp_unlink_grp(). During the unlink no new pending 8550 * joins can occur (since we have ipmp_lock). Once we drop 8551 * ipmp_lock, subsequent SIOCSLIFGROUPNAME requests will not 8552 * find the illgrp (since we unlinked it) and will return 8553 * EAFNOSUPPORT. This will then take them back through the 8554 * IPMP meta-interface plumbing logic in ifconfig, and thus 8555 * back through I_PLINK above. 8556 */ 8557 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 8558 err = ipmp_illgrp_unlink_grp(ill->ill_grp); 8559 rw_exit(&ipst->ips_ipmp_lock); 8560 return (err); 8561 default: 8562 break; 8563 } 8564 return (0); 8565 } 8566 8567 /* 8568 * Do I_PLINK/I_LINK or I_PUNLINK/I_UNLINK with consistency checks and also 8569 * atomically set/clear the muxids. Also complete the ioctl by acking or 8570 * naking it. Note that the code is structured such that the link type, 8571 * whether it's persistent or not, is treated equally. 
ifconfig(1M) and
 * its clones use the persistent link, while pppd(1M) and perhaps many
 * other daemons may use non-persistent link. When combined with some
 * ill_t states, linking and unlinking lower streams may be used as
 * indicators of dynamic re-plumbing events [see PSARC/1999/348].
 */
/* ARGSUSED */
void
ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
	struct iocblk	*iocp = (struct iocblk *)mp->b_rptr;
	int		ioccmd = iocp->ioc_cmd;
	struct linkblk	*li;
	int		err;

	ASSERT(ioccmd == I_PLINK || ioccmd == I_PUNLINK ||
	    ioccmd == I_LINK || ioccmd == I_UNLINK);

	/* The linkblk info is carried in the continuation block. */
	li = (struct linkblk *)mp->b_cont->b_rptr;

	err = ip_sioctl_plink_ipmod(ipsq, q, mp, ioccmd, li);
	if (err == EINPROGRESS)
		return;		/* not finished yet; nothing to reply */

	if (err != 0)
		miocnak(q, mp, 0, err);
	else
		miocack(q, mp, 0, 0);

	/* Conn was refheld in ip_sioctl_copyin_setup */
	if (CONN_Q(q))
		CONN_OPER_PENDING_DONE(Q_TO_CONN(q));
}

/*
 * Process I_{P}LINK and I_{P}UNLINK requests named by `ioccmd' and pointed to
 * by `mp' and `li' for the IP module stream (if li->l_qbot is in fact an IP
 * module stream). Do the extended consistency checks requested by
 * ifconfig(1M) and (atomically) set ill_muxid here.
 * Returns zero on success, EINPROGRESS if the operation is still pending, or
 * an error code on failure.
8613 */ 8614 static int 8615 ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, 8616 struct linkblk *li) 8617 { 8618 int err = 0; 8619 ill_t *ill; 8620 queue_t *ipwq, *dwq; 8621 const char *name; 8622 struct qinit *qinfo; 8623 boolean_t islink = (ioccmd == I_PLINK || ioccmd == I_LINK); 8624 boolean_t entered_ipsq = B_FALSE; 8625 boolean_t is_ip = B_FALSE; 8626 arl_t *arl; 8627 8628 /* 8629 * Walk the lower stream to verify it's the IP module stream. 8630 * The IP module is identified by its name, wput function, 8631 * and non-NULL q_next. STREAMS ensures that the lower stream 8632 * (li->l_qbot) will not vanish until this ioctl completes. 8633 */ 8634 for (ipwq = li->l_qbot; ipwq != NULL; ipwq = ipwq->q_next) { 8635 qinfo = ipwq->q_qinfo; 8636 name = qinfo->qi_minfo->mi_idname; 8637 if (name != NULL && strcmp(name, ip_mod_info.mi_idname) == 0 && 8638 qinfo->qi_putp != (pfi_t)ip_lwput && ipwq->q_next != NULL) { 8639 is_ip = B_TRUE; 8640 break; 8641 } 8642 if (name != NULL && strcmp(name, arp_mod_info.mi_idname) == 0 && 8643 qinfo->qi_putp != (pfi_t)ip_lwput && ipwq->q_next != NULL) { 8644 break; 8645 } 8646 } 8647 8648 /* 8649 * If this isn't an IP module stream, bail. 8650 */ 8651 if (ipwq == NULL) 8652 return (0); 8653 8654 if (!is_ip) { 8655 arl = (arl_t *)ipwq->q_ptr; 8656 ill = arl_to_ill(arl); 8657 if (ill == NULL) 8658 return (0); 8659 } else { 8660 ill = ipwq->q_ptr; 8661 } 8662 ASSERT(ill != NULL); 8663 8664 if (ipsq == NULL) { 8665 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink, 8666 NEW_OP, B_FALSE); 8667 if (ipsq == NULL) { 8668 if (!is_ip) 8669 ill_refrele(ill); 8670 return (EINPROGRESS); 8671 } 8672 entered_ipsq = B_TRUE; 8673 } 8674 ASSERT(IAM_WRITER_ILL(ill)); 8675 mutex_enter(&ill->ill_lock); 8676 if (!is_ip) { 8677 if (islink && ill->ill_muxid == 0) { 8678 /* 8679 * Plumbing has to be done with IP plumbed first, arp 8680 * second, but here we have arp being plumbed first. 
8681 */ 8682 mutex_exit(&ill->ill_lock); 8683 ipsq_exit(ipsq); 8684 ill_refrele(ill); 8685 return (EINVAL); 8686 } 8687 } 8688 mutex_exit(&ill->ill_lock); 8689 if (!is_ip) { 8690 arl->arl_muxid = islink ? li->l_index : 0; 8691 ill_refrele(ill); 8692 goto done; 8693 } 8694 8695 if (IS_IPMP(ill) && (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0) 8696 goto done; 8697 8698 /* 8699 * As part of I_{P}LINKing, stash the number of downstream modules and 8700 * the read queue of the module immediately below IP in the ill. 8701 * These are used during the capability negotiation below. 8702 */ 8703 ill->ill_lmod_rq = NULL; 8704 ill->ill_lmod_cnt = 0; 8705 if (islink && ((dwq = ipwq->q_next) != NULL)) { 8706 ill->ill_lmod_rq = RD(dwq); 8707 for (; dwq != NULL; dwq = dwq->q_next) 8708 ill->ill_lmod_cnt++; 8709 } 8710 8711 ill->ill_muxid = islink ? li->l_index : 0; 8712 8713 /* 8714 * Mark the ipsq busy until the capability operations initiated below 8715 * complete. The PLINK/UNLINK ioctl itself completes when our caller 8716 * returns, but the capability operation may complete asynchronously 8717 * much later. 8718 */ 8719 ipsq_current_start(ipsq, ill->ill_ipif, ioccmd); 8720 /* 8721 * If there's at least one up ipif on this ill, then we're bound to 8722 * the underlying driver via DLPI. In that case, renegotiate 8723 * capabilities to account for any possible change in modules 8724 * interposed between IP and the driver. 8725 */ 8726 if (ill->ill_ipif_up_count > 0) { 8727 if (islink) 8728 ill_capability_probe(ill); 8729 else 8730 ill_capability_reset(ill, B_FALSE); 8731 } 8732 ipsq_current_finish(ipsq); 8733 done: 8734 if (entered_ipsq) 8735 ipsq_exit(ipsq); 8736 8737 return (err); 8738 } 8739 8740 /* 8741 * Search the ioctl command in the ioctl tables and return a pointer 8742 * to the ioctl command information. The ioctl command tables are 8743 * static and fully populated at compile time. 
8744 */ 8745 ip_ioctl_cmd_t * 8746 ip_sioctl_lookup(int ioc_cmd) 8747 { 8748 int index; 8749 ip_ioctl_cmd_t *ipip; 8750 ip_ioctl_cmd_t *ipip_end; 8751 8752 if (ioc_cmd == IPI_DONTCARE) 8753 return (NULL); 8754 8755 /* 8756 * Do a 2 step search. First search the indexed table 8757 * based on the least significant byte of the ioctl cmd. 8758 * If we don't find a match, then search the misc table 8759 * serially. 8760 */ 8761 index = ioc_cmd & 0xFF; 8762 if (index < ip_ndx_ioctl_count) { 8763 ipip = &ip_ndx_ioctl_table[index]; 8764 if (ipip->ipi_cmd == ioc_cmd) { 8765 /* Found a match in the ndx table */ 8766 return (ipip); 8767 } 8768 } 8769 8770 /* Search the misc table */ 8771 ipip_end = &ip_misc_ioctl_table[ip_misc_ioctl_count]; 8772 for (ipip = ip_misc_ioctl_table; ipip < ipip_end; ipip++) { 8773 if (ipip->ipi_cmd == ioc_cmd) 8774 /* Found a match in the misc table */ 8775 return (ipip); 8776 } 8777 8778 return (NULL); 8779 } 8780 8781 /* 8782 * Wrapper function for resuming deferred ioctl processing 8783 * Used for SIOCGDSTINFO, SIOCGIP6ADDRPOLICY, SIOCGMSFILTER, 8784 * SIOCSMSFILTER, SIOCGIPMSFILTER, and SIOCSIPMSFILTER currently. 8785 */ 8786 /* ARGSUSED */ 8787 void 8788 ip_sioctl_copyin_resume(ipsq_t *dummy_ipsq, queue_t *q, mblk_t *mp, 8789 void *dummy_arg) 8790 { 8791 ip_sioctl_copyin_setup(q, mp); 8792 } 8793 8794 /* 8795 * ip_sioctl_copyin_setup is called by ip_wput_nondata with any M_IOCTL message 8796 * that arrives. Most of the IOCTLs are "socket" IOCTLs which we handle 8797 * in either I_STR or TRANSPARENT form, using the mi_copy facility. 8798 * We establish here the size of the block to be copied in. mi_copyin 8799 * arranges for this to happen, an processing continues in ip_wput_nondata with 8800 * an M_IOCDATA message. 
8801 */ 8802 void 8803 ip_sioctl_copyin_setup(queue_t *q, mblk_t *mp) 8804 { 8805 int copyin_size; 8806 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8807 ip_ioctl_cmd_t *ipip; 8808 cred_t *cr; 8809 ip_stack_t *ipst; 8810 8811 if (CONN_Q(q)) 8812 ipst = CONNQ_TO_IPST(q); 8813 else 8814 ipst = ILLQ_TO_IPST(q); 8815 8816 ipip = ip_sioctl_lookup(iocp->ioc_cmd); 8817 if (ipip == NULL) { 8818 /* 8819 * The ioctl is not one we understand or own. 8820 * Pass it along to be processed down stream, 8821 * if this is a module instance of IP, else nak 8822 * the ioctl. 8823 */ 8824 if (q->q_next == NULL) { 8825 goto nak; 8826 } else { 8827 putnext(q, mp); 8828 return; 8829 } 8830 } 8831 8832 /* 8833 * If this is deferred, then we will do all the checks when we 8834 * come back. 8835 */ 8836 if ((iocp->ioc_cmd == SIOCGDSTINFO || 8837 iocp->ioc_cmd == SIOCGIP6ADDRPOLICY) && !ip6_asp_can_lookup(ipst)) { 8838 ip6_asp_pending_op(q, mp, ip_sioctl_copyin_resume); 8839 return; 8840 } 8841 8842 /* 8843 * Only allow a very small subset of IP ioctls on this stream if 8844 * IP is a module and not a driver. Allowing ioctls to be processed 8845 * in this case may cause assert failures or data corruption. 8846 * Typically G[L]IFFLAGS, SLIFNAME/IF_UNITSEL are the only few 8847 * ioctls allowed on an IP module stream, after which this stream 8848 * normally becomes a multiplexor (at which time the stream head 8849 * will fail all ioctls). 8850 */ 8851 if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) { 8852 goto nak; 8853 } 8854 8855 /* Make sure we have ioctl data to process. */ 8856 if (mp->b_cont == NULL && !(ipip->ipi_flags & IPI_NULL_BCONT)) 8857 goto nak; 8858 8859 /* 8860 * Prefer dblk credential over ioctl credential; some synthesized 8861 * ioctls have kcred set because there's no way to crhold() 8862 * a credential in some contexts. (ioc_cr is not crfree() by 8863 * the framework; the caller of ioctl needs to hold the reference 8864 * for the duration of the call). 
8865 */ 8866 cr = msg_getcred(mp, NULL); 8867 if (cr == NULL) 8868 cr = iocp->ioc_cr; 8869 8870 /* Make sure normal users don't send down privileged ioctls */ 8871 if ((ipip->ipi_flags & IPI_PRIV) && 8872 (cr != NULL) && secpolicy_ip_config(cr, B_TRUE) != 0) { 8873 /* We checked the privilege earlier but log it here */ 8874 miocnak(q, mp, 0, secpolicy_ip_config(cr, B_FALSE)); 8875 return; 8876 } 8877 8878 /* 8879 * The ioctl command tables can only encode fixed length 8880 * ioctl data. If the length is variable, the table will 8881 * encode the length as zero. Such special cases are handled 8882 * below in the switch. 8883 */ 8884 if (ipip->ipi_copyin_size != 0) { 8885 mi_copyin(q, mp, NULL, ipip->ipi_copyin_size); 8886 return; 8887 } 8888 8889 switch (iocp->ioc_cmd) { 8890 case O_SIOCGIFCONF: 8891 case SIOCGIFCONF: 8892 /* 8893 * This IOCTL is hilarious. See comments in 8894 * ip_sioctl_get_ifconf for the story. 8895 */ 8896 if (iocp->ioc_count == TRANSPARENT) 8897 copyin_size = SIZEOF_STRUCT(ifconf, 8898 iocp->ioc_flag); 8899 else 8900 copyin_size = iocp->ioc_count; 8901 mi_copyin(q, mp, NULL, copyin_size); 8902 return; 8903 8904 case O_SIOCGLIFCONF: 8905 case SIOCGLIFCONF: 8906 copyin_size = SIZEOF_STRUCT(lifconf, iocp->ioc_flag); 8907 mi_copyin(q, mp, NULL, copyin_size); 8908 return; 8909 8910 case SIOCGLIFSRCOF: 8911 copyin_size = SIZEOF_STRUCT(lifsrcof, iocp->ioc_flag); 8912 mi_copyin(q, mp, NULL, copyin_size); 8913 return; 8914 case SIOCGIP6ADDRPOLICY: 8915 ip_sioctl_ip6addrpolicy(q, mp); 8916 ip6_asp_table_refrele(ipst); 8917 return; 8918 8919 case SIOCSIP6ADDRPOLICY: 8920 ip_sioctl_ip6addrpolicy(q, mp); 8921 return; 8922 8923 case SIOCGDSTINFO: 8924 ip_sioctl_dstinfo(q, mp); 8925 ip6_asp_table_refrele(ipst); 8926 return; 8927 8928 case I_PLINK: 8929 case I_PUNLINK: 8930 case I_LINK: 8931 case I_UNLINK: 8932 /* 8933 * We treat non-persistent link similarly as the persistent 8934 * link case, in terms of plumbing/unplumbing, as well as 8935 * dynamic 
re-plumbing events indicator. See comments 8936 * in ip_sioctl_plink() for more. 8937 * 8938 * Request can be enqueued in the 'ipsq' while waiting 8939 * to become exclusive. So bump up the conn ref. 8940 */ 8941 if (CONN_Q(q)) 8942 CONN_INC_REF(Q_TO_CONN(q)); 8943 ip_sioctl_plink(NULL, q, mp, NULL); 8944 return; 8945 8946 case ND_GET: 8947 case ND_SET: 8948 /* 8949 * Use of the nd table requires holding the reader lock. 8950 * Modifying the nd table thru nd_load/nd_unload requires 8951 * the writer lock. 8952 */ 8953 rw_enter(&ipst->ips_ip_g_nd_lock, RW_READER); 8954 if (nd_getset(q, ipst->ips_ip_g_nd, mp)) { 8955 rw_exit(&ipst->ips_ip_g_nd_lock); 8956 8957 if (iocp->ioc_error) 8958 iocp->ioc_count = 0; 8959 mp->b_datap->db_type = M_IOCACK; 8960 qreply(q, mp); 8961 return; 8962 } 8963 rw_exit(&ipst->ips_ip_g_nd_lock); 8964 /* 8965 * We don't understand this subioctl of ND_GET / ND_SET. 8966 * Maybe intended for some driver / module below us 8967 */ 8968 if (q->q_next) { 8969 putnext(q, mp); 8970 } else { 8971 iocp->ioc_error = ENOENT; 8972 mp->b_datap->db_type = M_IOCNAK; 8973 iocp->ioc_count = 0; 8974 qreply(q, mp); 8975 } 8976 return; 8977 8978 case IP_IOCTL: 8979 ip_wput_ioctl(q, mp); 8980 return; 8981 8982 case SIOCILB: 8983 /* The ioctl length varies depending on the ILB command. 
*/ 8984 copyin_size = iocp->ioc_count; 8985 if (copyin_size < sizeof (ilb_cmd_t)) 8986 goto nak; 8987 mi_copyin(q, mp, NULL, copyin_size); 8988 return; 8989 8990 default: 8991 cmn_err(CE_PANIC, "should not happen "); 8992 } 8993 nak: 8994 if (mp->b_cont != NULL) { 8995 freemsg(mp->b_cont); 8996 mp->b_cont = NULL; 8997 } 8998 iocp->ioc_error = EINVAL; 8999 mp->b_datap->db_type = M_IOCNAK; 9000 iocp->ioc_count = 0; 9001 qreply(q, mp); 9002 } 9003 9004 static void 9005 ip_sioctl_garp_reply(mblk_t *mp, ill_t *ill, void *hwaddr, int flags) 9006 { 9007 struct arpreq *ar; 9008 struct xarpreq *xar; 9009 mblk_t *tmp; 9010 struct iocblk *iocp; 9011 int x_arp_ioctl = B_FALSE; 9012 int *flagsp; 9013 char *storage = NULL; 9014 9015 ASSERT(ill != NULL); 9016 9017 iocp = (struct iocblk *)mp->b_rptr; 9018 ASSERT(iocp->ioc_cmd == SIOCGXARP || iocp->ioc_cmd == SIOCGARP); 9019 9020 tmp = (mp->b_cont)->b_cont; /* xarpreq/arpreq */ 9021 if ((iocp->ioc_cmd == SIOCGXARP) || 9022 (iocp->ioc_cmd == SIOCSXARP)) { 9023 x_arp_ioctl = B_TRUE; 9024 xar = (struct xarpreq *)tmp->b_rptr; 9025 flagsp = &xar->xarp_flags; 9026 storage = xar->xarp_ha.sdl_data; 9027 } else { 9028 ar = (struct arpreq *)tmp->b_rptr; 9029 flagsp = &ar->arp_flags; 9030 storage = ar->arp_ha.sa_data; 9031 } 9032 9033 /* 9034 * We're done if this is not an SIOCG{X}ARP 9035 */ 9036 if (x_arp_ioctl) { 9037 storage += ill_xarp_info(&xar->xarp_ha, ill); 9038 if ((ill->ill_phys_addr_length + ill->ill_name_length) > 9039 sizeof (xar->xarp_ha.sdl_data)) { 9040 iocp->ioc_error = EINVAL; 9041 return; 9042 } 9043 } 9044 *flagsp = ATF_INUSE; 9045 /* 9046 * If /sbin/arp told us we are the authority using the "permanent" 9047 * flag, or if this is one of my addresses print "permanent" 9048 * in the /sbin/arp output. 
9049 */ 9050 if ((flags & NCE_F_MYADDR) || (flags & NCE_F_AUTHORITY)) 9051 *flagsp |= ATF_AUTHORITY; 9052 if (flags & NCE_F_NONUD) 9053 *flagsp |= ATF_PERM; /* not subject to aging */ 9054 if (flags & NCE_F_PUBLISH) 9055 *flagsp |= ATF_PUBL; 9056 if (hwaddr != NULL) { 9057 *flagsp |= ATF_COM; 9058 bcopy((char *)hwaddr, storage, ill->ill_phys_addr_length); 9059 } 9060 } 9061 9062 /* 9063 * Create a new logical interface. If ipif_id is zero (i.e. not a logical 9064 * interface) create the next available logical interface for this 9065 * physical interface. 9066 * If ipif is NULL (i.e. the lookup didn't find one) attempt to create an 9067 * ipif with the specified name. 9068 * 9069 * If the address family is not AF_UNSPEC then set the address as well. 9070 * 9071 * If ip_sioctl_addr returns EINPROGRESS then the ioctl (the copyout) 9072 * is completed when the DL_BIND_ACK arrive in ip_rput_dlpi_writer. 9073 * 9074 * Executed as a writer on the ill. 9075 * So no lock is needed to traverse the ipif chain, or examine the 9076 * phyint flags. 9077 */ 9078 /* ARGSUSED */ 9079 int 9080 ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9081 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 9082 { 9083 mblk_t *mp1; 9084 struct lifreq *lifr; 9085 boolean_t isv6; 9086 boolean_t exists; 9087 char *name; 9088 char *endp; 9089 char *cp; 9090 int namelen; 9091 ipif_t *ipif; 9092 long id; 9093 ipsq_t *ipsq; 9094 ill_t *ill; 9095 sin_t *sin; 9096 int err = 0; 9097 boolean_t found_sep = B_FALSE; 9098 conn_t *connp; 9099 zoneid_t zoneid; 9100 ip_stack_t *ipst = CONNQ_TO_IPST(q); 9101 9102 ASSERT(q->q_next == NULL); 9103 ip1dbg(("ip_sioctl_addif\n")); 9104 /* Existence of mp1 has been checked in ip_wput_nondata */ 9105 mp1 = mp->b_cont->b_cont; 9106 /* 9107 * Null terminate the string to protect against buffer 9108 * overrun. String was generated by user code and may not 9109 * be trusted. 
9110 */ 9111 lifr = (struct lifreq *)mp1->b_rptr; 9112 lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; 9113 name = lifr->lifr_name; 9114 ASSERT(CONN_Q(q)); 9115 connp = Q_TO_CONN(q); 9116 isv6 = (connp->conn_family == AF_INET6); 9117 zoneid = connp->conn_zoneid; 9118 namelen = mi_strlen(name); 9119 if (namelen == 0) 9120 return (EINVAL); 9121 9122 exists = B_FALSE; 9123 if ((namelen + 1 == sizeof (ipif_loopback_name)) && 9124 (mi_strcmp(name, ipif_loopback_name) == 0)) { 9125 /* 9126 * Allow creating lo0 using SIOCLIFADDIF. 9127 * can't be any other writer thread. So can pass null below 9128 * for the last 4 args to ipif_lookup_name. 9129 */ 9130 ipif = ipif_lookup_on_name(lifr->lifr_name, namelen, B_TRUE, 9131 &exists, isv6, zoneid, ipst); 9132 /* Prevent any further action */ 9133 if (ipif == NULL) { 9134 return (ENOBUFS); 9135 } else if (!exists) { 9136 /* We created the ipif now and as writer */ 9137 ipif_refrele(ipif); 9138 return (0); 9139 } else { 9140 ill = ipif->ipif_ill; 9141 ill_refhold(ill); 9142 ipif_refrele(ipif); 9143 } 9144 } else { 9145 /* Look for a colon in the name. */ 9146 endp = &name[namelen]; 9147 for (cp = endp; --cp > name; ) { 9148 if (*cp == IPIF_SEPARATOR_CHAR) { 9149 found_sep = B_TRUE; 9150 /* 9151 * Reject any non-decimal aliases for plumbing 9152 * of logical interfaces. Aliases with leading 9153 * zeroes are also rejected as they introduce 9154 * ambiguity in the naming of the interfaces. 9155 * Comparing with "0" takes care of all such 9156 * cases. 
9157 */ 9158 if ((strncmp("0", cp+1, 1)) == 0) 9159 return (EINVAL); 9160 9161 if (ddi_strtol(cp+1, &endp, 10, &id) != 0 || 9162 id <= 0 || *endp != '\0') { 9163 return (EINVAL); 9164 } 9165 *cp = '\0'; 9166 break; 9167 } 9168 } 9169 ill = ill_lookup_on_name(name, B_FALSE, isv6, NULL, ipst); 9170 if (found_sep) 9171 *cp = IPIF_SEPARATOR_CHAR; 9172 if (ill == NULL) 9173 return (ENXIO); 9174 } 9175 9176 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_process_ioctl, NEW_OP, 9177 B_TRUE); 9178 9179 /* 9180 * Release the refhold due to the lookup, now that we are excl 9181 * or we are just returning 9182 */ 9183 ill_refrele(ill); 9184 9185 if (ipsq == NULL) 9186 return (EINPROGRESS); 9187 9188 /* We are now exclusive on the IPSQ */ 9189 ASSERT(IAM_WRITER_ILL(ill)); 9190 9191 if (found_sep) { 9192 /* Now see if there is an IPIF with this unit number. */ 9193 for (ipif = ill->ill_ipif; ipif != NULL; 9194 ipif = ipif->ipif_next) { 9195 if (ipif->ipif_id == id) { 9196 err = EEXIST; 9197 goto done; 9198 } 9199 } 9200 } 9201 9202 /* 9203 * We use IRE_LOCAL for lo0:1 etc. for "receive only" use 9204 * of lo0. Plumbing for lo0:0 happens in ipif_lookup_on_name() 9205 * instead. 9206 */ 9207 if ((ipif = ipif_allocate(ill, found_sep ? id : -1, IRE_LOCAL, 9208 B_TRUE, B_TRUE, &err)) == NULL) { 9209 goto done; 9210 } 9211 9212 /* Return created name with ioctl */ 9213 (void) sprintf(lifr->lifr_name, "%s%c%d", ill->ill_name, 9214 IPIF_SEPARATOR_CHAR, ipif->ipif_id); 9215 ip1dbg(("created %s\n", lifr->lifr_name)); 9216 9217 /* Set address */ 9218 sin = (sin_t *)&lifr->lifr_addr; 9219 if (sin->sin_family != AF_UNSPEC) { 9220 err = ip_sioctl_addr(ipif, sin, q, mp, 9221 &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], lifr); 9222 } 9223 9224 done: 9225 ipsq_exit(ipsq); 9226 return (err); 9227 } 9228 9229 /* 9230 * Remove an existing logical interface. If ipif_id is zero (i.e. not a logical 9231 * interface) delete it based on the IP address (on this physical interface). 
9232 * Otherwise delete it based on the ipif_id. 9233 * Also, special handling to allow a removeif of lo0. 9234 */ 9235 /* ARGSUSED */ 9236 int 9237 ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9238 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 9239 { 9240 conn_t *connp; 9241 ill_t *ill = ipif->ipif_ill; 9242 boolean_t success; 9243 ip_stack_t *ipst; 9244 9245 ipst = CONNQ_TO_IPST(q); 9246 9247 ASSERT(q->q_next == NULL); 9248 ip1dbg(("ip_sioctl_remove_if(%s:%u %p)\n", 9249 ill->ill_name, ipif->ipif_id, (void *)ipif)); 9250 ASSERT(IAM_WRITER_IPIF(ipif)); 9251 9252 connp = Q_TO_CONN(q); 9253 /* 9254 * Special case for unplumbing lo0 (the loopback physical interface). 9255 * If unplumbing lo0, the incoming address structure has been 9256 * initialized to all zeros. When unplumbing lo0, all its logical 9257 * interfaces must be removed too. 9258 * 9259 * Note that this interface may be called to remove a specific 9260 * loopback logical interface (eg, lo0:1). But in that case 9261 * ipif->ipif_id != 0 so that the code path for that case is the 9262 * same as any other interface (meaning it skips the code directly 9263 * below). 9264 */ 9265 if (ipif->ipif_id == 0 && ill->ill_net_type == IRE_LOOPBACK) { 9266 if (sin->sin_family == AF_UNSPEC && 9267 (IN6_IS_ADDR_UNSPECIFIED(&((sin6_t *)sin)->sin6_addr))) { 9268 /* 9269 * Mark it condemned. No new ref. will be made to ill. 
9270 */ 9271 mutex_enter(&ill->ill_lock); 9272 ill->ill_state_flags |= ILL_CONDEMNED; 9273 for (ipif = ill->ill_ipif; ipif != NULL; 9274 ipif = ipif->ipif_next) { 9275 ipif->ipif_state_flags |= IPIF_CONDEMNED; 9276 } 9277 mutex_exit(&ill->ill_lock); 9278 9279 ipif = ill->ill_ipif; 9280 /* unplumb the loopback interface */ 9281 ill_delete(ill); 9282 mutex_enter(&connp->conn_lock); 9283 mutex_enter(&ill->ill_lock); 9284 9285 /* Are any references to this ill active */ 9286 if (ill_is_freeable(ill)) { 9287 mutex_exit(&ill->ill_lock); 9288 mutex_exit(&connp->conn_lock); 9289 ill_delete_tail(ill); 9290 mi_free(ill); 9291 return (0); 9292 } 9293 success = ipsq_pending_mp_add(connp, ipif, 9294 CONNP_TO_WQ(connp), mp, ILL_FREE); 9295 mutex_exit(&connp->conn_lock); 9296 mutex_exit(&ill->ill_lock); 9297 if (success) 9298 return (EINPROGRESS); 9299 else 9300 return (EINTR); 9301 } 9302 } 9303 9304 if (ipif->ipif_id == 0) { 9305 ipsq_t *ipsq; 9306 9307 /* Find based on address */ 9308 if (ipif->ipif_isv6) { 9309 sin6_t *sin6; 9310 9311 if (sin->sin_family != AF_INET6) 9312 return (EAFNOSUPPORT); 9313 9314 sin6 = (sin6_t *)sin; 9315 /* We are a writer, so we should be able to lookup */ 9316 ipif = ipif_lookup_addr_exact_v6(&sin6->sin6_addr, ill, 9317 ipst); 9318 } else { 9319 if (sin->sin_family != AF_INET) 9320 return (EAFNOSUPPORT); 9321 9322 /* We are a writer, so we should be able to lookup */ 9323 ipif = ipif_lookup_addr_exact(sin->sin_addr.s_addr, ill, 9324 ipst); 9325 } 9326 if (ipif == NULL) { 9327 return (EADDRNOTAVAIL); 9328 } 9329 9330 /* 9331 * It is possible for a user to send an SIOCLIFREMOVEIF with 9332 * lifr_name of the physical interface but with an ip address 9333 * lifr_addr of a logical interface plumbed over it. 9334 * So update ipx_current_ipif now that ipif points to the 9335 * correct one. 
9336 */ 9337 ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq; 9338 ipsq->ipsq_xop->ipx_current_ipif = ipif; 9339 9340 /* This is a writer */ 9341 ipif_refrele(ipif); 9342 } 9343 9344 /* 9345 * Can not delete instance zero since it is tied to the ill. 9346 */ 9347 if (ipif->ipif_id == 0) 9348 return (EBUSY); 9349 9350 mutex_enter(&ill->ill_lock); 9351 ipif->ipif_state_flags |= IPIF_CONDEMNED; 9352 mutex_exit(&ill->ill_lock); 9353 9354 ipif_free(ipif); 9355 9356 mutex_enter(&connp->conn_lock); 9357 mutex_enter(&ill->ill_lock); 9358 9359 /* Are any references to this ipif active */ 9360 if (ipif_is_freeable(ipif)) { 9361 mutex_exit(&ill->ill_lock); 9362 mutex_exit(&connp->conn_lock); 9363 ipif_non_duplicate(ipif); 9364 (void) ipif_down_tail(ipif); 9365 ipif_free_tail(ipif); /* frees ipif */ 9366 return (0); 9367 } 9368 success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), mp, 9369 IPIF_FREE); 9370 mutex_exit(&ill->ill_lock); 9371 mutex_exit(&connp->conn_lock); 9372 if (success) 9373 return (EINPROGRESS); 9374 else 9375 return (EINTR); 9376 } 9377 9378 /* 9379 * Restart the removeif ioctl. The refcnt has gone down to 0. 9380 * The ipif is already condemned. So can't find it thru lookups. 
9381 */ 9382 /* ARGSUSED */ 9383 int 9384 ip_sioctl_removeif_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, 9385 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_if_req) 9386 { 9387 ill_t *ill = ipif->ipif_ill; 9388 9389 ASSERT(IAM_WRITER_IPIF(ipif)); 9390 ASSERT(ipif->ipif_state_flags & IPIF_CONDEMNED); 9391 9392 ip1dbg(("ip_sioctl_removeif_restart(%s:%u %p)\n", 9393 ill->ill_name, ipif->ipif_id, (void *)ipif)); 9394 9395 if (ipif->ipif_id == 0 && ill->ill_net_type == IRE_LOOPBACK) { 9396 ASSERT(ill->ill_state_flags & ILL_CONDEMNED); 9397 ill_delete_tail(ill); 9398 mi_free(ill); 9399 return (0); 9400 } 9401 9402 ipif_non_duplicate(ipif); 9403 (void) ipif_down_tail(ipif); 9404 ipif_free_tail(ipif); 9405 9406 return (0); 9407 } 9408 9409 /* 9410 * Set the local interface address. 9411 * Allow an address of all zero when the interface is down. 9412 */ 9413 /* ARGSUSED */ 9414 int 9415 ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9416 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 9417 { 9418 int err = 0; 9419 in6_addr_t v6addr; 9420 boolean_t need_up = B_FALSE; 9421 9422 ip1dbg(("ip_sioctl_addr(%s:%u %p)\n", 9423 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9424 9425 ASSERT(IAM_WRITER_IPIF(ipif)); 9426 9427 if (ipif->ipif_isv6) { 9428 sin6_t *sin6; 9429 ill_t *ill; 9430 phyint_t *phyi; 9431 9432 if (sin->sin_family != AF_INET6) 9433 return (EAFNOSUPPORT); 9434 9435 sin6 = (sin6_t *)sin; 9436 v6addr = sin6->sin6_addr; 9437 ill = ipif->ipif_ill; 9438 phyi = ill->ill_phyint; 9439 9440 /* 9441 * Enforce that true multicast interfaces have a link-local 9442 * address for logical unit 0. 
9443 */ 9444 if (ipif->ipif_id == 0 && 9445 (ill->ill_flags & ILLF_MULTICAST) && 9446 !(ipif->ipif_flags & (IPIF_POINTOPOINT)) && 9447 !(phyi->phyint_flags & (PHYI_LOOPBACK)) && 9448 !IN6_IS_ADDR_LINKLOCAL(&v6addr)) { 9449 return (EADDRNOTAVAIL); 9450 } 9451 9452 /* 9453 * up interfaces shouldn't have the unspecified address 9454 * unless they also have the IPIF_NOLOCAL flags set and 9455 * have a subnet assigned. 9456 */ 9457 if ((ipif->ipif_flags & IPIF_UP) && 9458 IN6_IS_ADDR_UNSPECIFIED(&v6addr) && 9459 (!(ipif->ipif_flags & IPIF_NOLOCAL) || 9460 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) { 9461 return (EADDRNOTAVAIL); 9462 } 9463 9464 if (!ip_local_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask)) 9465 return (EADDRNOTAVAIL); 9466 } else { 9467 ipaddr_t addr; 9468 9469 if (sin->sin_family != AF_INET) 9470 return (EAFNOSUPPORT); 9471 9472 addr = sin->sin_addr.s_addr; 9473 9474 /* Allow 0 as the local address. */ 9475 if (addr != 0 && !ip_addr_ok_v4(addr, ipif->ipif_net_mask)) 9476 return (EADDRNOTAVAIL); 9477 9478 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 9479 } 9480 9481 /* 9482 * Even if there is no change we redo things just to rerun 9483 * ipif_set_default. 9484 */ 9485 if (ipif->ipif_flags & IPIF_UP) { 9486 /* 9487 * Setting a new local address, make sure 9488 * we have net and subnet bcast ire's for 9489 * the old address if we need them. 9490 */ 9491 /* 9492 * If the interface is already marked up, 9493 * we call ipif_down which will take care 9494 * of ditching any IREs that have been set 9495 * up based on the old interface address. 
9496 */ 9497 err = ipif_logical_down(ipif, q, mp); 9498 if (err == EINPROGRESS) 9499 return (err); 9500 (void) ipif_down_tail(ipif); 9501 need_up = 1; 9502 } 9503 9504 err = ip_sioctl_addr_tail(ipif, sin, q, mp, need_up); 9505 return (err); 9506 } 9507 9508 int 9509 ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9510 boolean_t need_up) 9511 { 9512 in6_addr_t v6addr; 9513 in6_addr_t ov6addr; 9514 ipaddr_t addr; 9515 sin6_t *sin6; 9516 int sinlen; 9517 int err = 0; 9518 ill_t *ill = ipif->ipif_ill; 9519 boolean_t need_dl_down; 9520 boolean_t need_arp_down; 9521 struct iocblk *iocp; 9522 9523 iocp = (mp != NULL) ? (struct iocblk *)mp->b_rptr : NULL; 9524 9525 ip1dbg(("ip_sioctl_addr_tail(%s:%u %p)\n", 9526 ill->ill_name, ipif->ipif_id, (void *)ipif)); 9527 ASSERT(IAM_WRITER_IPIF(ipif)); 9528 9529 /* Must cancel any pending timer before taking the ill_lock */ 9530 if (ipif->ipif_recovery_id != 0) 9531 (void) untimeout(ipif->ipif_recovery_id); 9532 ipif->ipif_recovery_id = 0; 9533 9534 if (ipif->ipif_isv6) { 9535 sin6 = (sin6_t *)sin; 9536 v6addr = sin6->sin6_addr; 9537 sinlen = sizeof (struct sockaddr_in6); 9538 } else { 9539 addr = sin->sin_addr.s_addr; 9540 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 9541 sinlen = sizeof (struct sockaddr_in); 9542 } 9543 mutex_enter(&ill->ill_lock); 9544 ov6addr = ipif->ipif_v6lcl_addr; 9545 ipif->ipif_v6lcl_addr = v6addr; 9546 sctp_update_ipif_addr(ipif, ov6addr); 9547 ipif->ipif_addr_ready = 0; 9548 9549 ip_rts_newaddrmsg(RTM_CHGADDR, 0, ipif, RTSQ_DEFAULT); 9550 9551 /* 9552 * If the interface was previously marked as a duplicate, then since 9553 * we've now got a "new" address, it should no longer be considered a 9554 * duplicate -- even if the "new" address is the same as the old one. 9555 * Note that if all ipifs are down, we may have a pending ARP down 9556 * event to handle. 
This is because we want to recover from duplicates 9557 * and thus delay tearing down ARP until the duplicates have been 9558 * removed or disabled. 9559 */ 9560 need_dl_down = need_arp_down = B_FALSE; 9561 if (ipif->ipif_flags & IPIF_DUPLICATE) { 9562 need_arp_down = !need_up; 9563 ipif->ipif_flags &= ~IPIF_DUPLICATE; 9564 if (--ill->ill_ipif_dup_count == 0 && !need_up && 9565 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) { 9566 need_dl_down = B_TRUE; 9567 } 9568 } 9569 9570 ipif_set_default(ipif); 9571 9572 /* 9573 * If we've just manually set the IPv6 link-local address (0th ipif), 9574 * tag the ill so that future updates to the interface ID don't result 9575 * in this address getting automatically reconfigured from under the 9576 * administrator. 9577 */ 9578 if (ipif->ipif_isv6 && ipif->ipif_id == 0) 9579 ill->ill_manual_linklocal = 1; 9580 9581 /* 9582 * When publishing an interface address change event, we only notify 9583 * the event listeners of the new address. It is assumed that if they 9584 * actively care about the addresses assigned that they will have 9585 * already discovered the previous address assigned (if there was one.) 9586 * 9587 * Don't attach nic event message for SIOCLIFADDIF ioctl. 9588 */ 9589 if (iocp != NULL && iocp->ioc_cmd != SIOCLIFADDIF) { 9590 ill_nic_event_dispatch(ill, MAP_IPIF_ID(ipif->ipif_id), 9591 NE_ADDRESS_CHANGE, sin, sinlen); 9592 } 9593 9594 mutex_exit(&ill->ill_lock); 9595 9596 if (need_up) { 9597 /* 9598 * Now bring the interface back up. If this 9599 * is the only IPIF for the ILL, ipif_up 9600 * will have to re-bind to the device, so 9601 * we may get back EINPROGRESS, in which 9602 * case, this IOCTL will get completed in 9603 * ip_rput_dlpi when we see the DL_BIND_ACK. 
9604 */ 9605 err = ipif_up(ipif, q, mp); 9606 } else { 9607 /* Perhaps ilgs should use this ill */ 9608 update_conn_ill(NULL, ill->ill_ipst); 9609 } 9610 9611 if (need_dl_down) 9612 ill_dl_down(ill); 9613 9614 if (need_arp_down && !ill->ill_isv6) 9615 (void) ipif_arp_down(ipif); 9616 9617 /* 9618 * The default multicast interface might have changed (for 9619 * instance if the IPv6 scope of the address changed) 9620 */ 9621 ire_increment_multicast_generation(ill->ill_ipst, ill->ill_isv6); 9622 9623 return (err); 9624 } 9625 9626 /* 9627 * Restart entry point to restart the address set operation after the 9628 * refcounts have dropped to zero. 9629 */ 9630 /* ARGSUSED */ 9631 int 9632 ip_sioctl_addr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9633 ip_ioctl_cmd_t *ipip, void *ifreq) 9634 { 9635 ip1dbg(("ip_sioctl_addr_restart(%s:%u %p)\n", 9636 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9637 ASSERT(IAM_WRITER_IPIF(ipif)); 9638 (void) ipif_down_tail(ipif); 9639 return (ip_sioctl_addr_tail(ipif, sin, q, mp, B_TRUE)); 9640 } 9641 9642 /* ARGSUSED */ 9643 int 9644 ip_sioctl_get_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9645 ip_ioctl_cmd_t *ipip, void *if_req) 9646 { 9647 sin6_t *sin6 = (struct sockaddr_in6 *)sin; 9648 struct lifreq *lifr = (struct lifreq *)if_req; 9649 9650 ip1dbg(("ip_sioctl_get_addr(%s:%u %p)\n", 9651 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9652 /* 9653 * The net mask and address can't change since we have a 9654 * reference to the ipif. So no lock is necessary. 
9655 */ 9656 if (ipif->ipif_isv6) { 9657 *sin6 = sin6_null; 9658 sin6->sin6_family = AF_INET6; 9659 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 9660 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 9661 lifr->lifr_addrlen = 9662 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 9663 } else { 9664 *sin = sin_null; 9665 sin->sin_family = AF_INET; 9666 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 9667 if (ipip->ipi_cmd_type == LIF_CMD) { 9668 lifr->lifr_addrlen = 9669 ip_mask_to_plen(ipif->ipif_net_mask); 9670 } 9671 } 9672 return (0); 9673 } 9674 9675 /* 9676 * Set the destination address for a pt-pt interface. 9677 */ 9678 /* ARGSUSED */ 9679 int 9680 ip_sioctl_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9681 ip_ioctl_cmd_t *ipip, void *if_req) 9682 { 9683 int err = 0; 9684 in6_addr_t v6addr; 9685 boolean_t need_up = B_FALSE; 9686 9687 ip1dbg(("ip_sioctl_dstaddr(%s:%u %p)\n", 9688 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9689 ASSERT(IAM_WRITER_IPIF(ipif)); 9690 9691 if (ipif->ipif_isv6) { 9692 sin6_t *sin6; 9693 9694 if (sin->sin_family != AF_INET6) 9695 return (EAFNOSUPPORT); 9696 9697 sin6 = (sin6_t *)sin; 9698 v6addr = sin6->sin6_addr; 9699 9700 if (!ip_remote_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask)) 9701 return (EADDRNOTAVAIL); 9702 } else { 9703 ipaddr_t addr; 9704 9705 if (sin->sin_family != AF_INET) 9706 return (EAFNOSUPPORT); 9707 9708 addr = sin->sin_addr.s_addr; 9709 if (!ip_addr_ok_v4(addr, ipif->ipif_net_mask)) 9710 return (EADDRNOTAVAIL); 9711 9712 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 9713 } 9714 9715 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr, &v6addr)) 9716 return (0); /* No change */ 9717 9718 if (ipif->ipif_flags & IPIF_UP) { 9719 /* 9720 * If the interface is already marked up, 9721 * we call ipif_down which will take care 9722 * of ditching any IREs that have been set 9723 * up based on the old pp dst address. 
9724 */ 9725 err = ipif_logical_down(ipif, q, mp); 9726 if (err == EINPROGRESS) 9727 return (err); 9728 (void) ipif_down_tail(ipif); 9729 need_up = B_TRUE; 9730 } 9731 /* 9732 * could return EINPROGRESS. If so ioctl will complete in 9733 * ip_rput_dlpi_writer 9734 */ 9735 err = ip_sioctl_dstaddr_tail(ipif, sin, q, mp, need_up); 9736 return (err); 9737 } 9738 9739 static int 9740 ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9741 boolean_t need_up) 9742 { 9743 in6_addr_t v6addr; 9744 ill_t *ill = ipif->ipif_ill; 9745 int err = 0; 9746 boolean_t need_dl_down; 9747 boolean_t need_arp_down; 9748 9749 ip1dbg(("ip_sioctl_dstaddr_tail(%s:%u %p)\n", ill->ill_name, 9750 ipif->ipif_id, (void *)ipif)); 9751 9752 /* Must cancel any pending timer before taking the ill_lock */ 9753 if (ipif->ipif_recovery_id != 0) 9754 (void) untimeout(ipif->ipif_recovery_id); 9755 ipif->ipif_recovery_id = 0; 9756 9757 if (ipif->ipif_isv6) { 9758 sin6_t *sin6; 9759 9760 sin6 = (sin6_t *)sin; 9761 v6addr = sin6->sin6_addr; 9762 } else { 9763 ipaddr_t addr; 9764 9765 addr = sin->sin_addr.s_addr; 9766 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 9767 } 9768 mutex_enter(&ill->ill_lock); 9769 /* Set point to point destination address. */ 9770 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 9771 /* 9772 * Allow this as a means of creating logical 9773 * pt-pt interfaces on top of e.g. an Ethernet. 9774 * XXX Undocumented HACK for testing. 9775 * pt-pt interfaces are created with NUD disabled. 9776 */ 9777 ipif->ipif_flags |= IPIF_POINTOPOINT; 9778 ipif->ipif_flags &= ~IPIF_BROADCAST; 9779 if (ipif->ipif_isv6) 9780 ill->ill_flags |= ILLF_NONUD; 9781 } 9782 9783 /* 9784 * If the interface was previously marked as a duplicate, then since 9785 * we've now got a "new" address, it should no longer be considered a 9786 * duplicate -- even if the "new" address is the same as the old one. 9787 * Note that if all ipifs are down, we may have a pending ARP down 9788 * event to handle. 
9789 */ 9790 need_dl_down = need_arp_down = B_FALSE; 9791 if (ipif->ipif_flags & IPIF_DUPLICATE) { 9792 need_arp_down = !need_up; 9793 ipif->ipif_flags &= ~IPIF_DUPLICATE; 9794 if (--ill->ill_ipif_dup_count == 0 && !need_up && 9795 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) { 9796 need_dl_down = B_TRUE; 9797 } 9798 } 9799 9800 /* 9801 * If we've just manually set the IPv6 destination link-local address 9802 * (0th ipif), tag the ill so that future updates to the destination 9803 * interface ID (as can happen with interfaces over IP tunnels) don't 9804 * result in this address getting automatically reconfigured from 9805 * under the administrator. 9806 */ 9807 if (ipif->ipif_isv6 && ipif->ipif_id == 0) 9808 ill->ill_manual_dst_linklocal = 1; 9809 9810 /* Set the new address. */ 9811 ipif->ipif_v6pp_dst_addr = v6addr; 9812 /* Make sure subnet tracks pp_dst */ 9813 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 9814 mutex_exit(&ill->ill_lock); 9815 9816 if (need_up) { 9817 /* 9818 * Now bring the interface back up. If this 9819 * is the only IPIF for the ILL, ipif_up 9820 * will have to re-bind to the device, so 9821 * we may get back EINPROGRESS, in which 9822 * case, this IOCTL will get completed in 9823 * ip_rput_dlpi when we see the DL_BIND_ACK. 9824 */ 9825 err = ipif_up(ipif, q, mp); 9826 } 9827 9828 if (need_dl_down) 9829 ill_dl_down(ill); 9830 if (need_arp_down && !ipif->ipif_isv6) 9831 (void) ipif_arp_down(ipif); 9832 9833 return (err); 9834 } 9835 9836 /* 9837 * Restart entry point to restart the dstaddress set operation after the 9838 * refcounts have dropped to zero. 
9839 */ 9840 /* ARGSUSED */ 9841 int 9842 ip_sioctl_dstaddr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9843 ip_ioctl_cmd_t *ipip, void *ifreq) 9844 { 9845 ip1dbg(("ip_sioctl_dstaddr_restart(%s:%u %p)\n", 9846 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9847 (void) ipif_down_tail(ipif); 9848 return (ip_sioctl_dstaddr_tail(ipif, sin, q, mp, B_TRUE)); 9849 } 9850 9851 /* ARGSUSED */ 9852 int 9853 ip_sioctl_get_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9854 ip_ioctl_cmd_t *ipip, void *if_req) 9855 { 9856 sin6_t *sin6 = (struct sockaddr_in6 *)sin; 9857 9858 ip1dbg(("ip_sioctl_get_dstaddr(%s:%u %p)\n", 9859 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9860 /* 9861 * Get point to point destination address. The addresses can't 9862 * change since we hold a reference to the ipif. 9863 */ 9864 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) 9865 return (EADDRNOTAVAIL); 9866 9867 if (ipif->ipif_isv6) { 9868 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 9869 *sin6 = sin6_null; 9870 sin6->sin6_family = AF_INET6; 9871 sin6->sin6_addr = ipif->ipif_v6pp_dst_addr; 9872 } else { 9873 *sin = sin_null; 9874 sin->sin_family = AF_INET; 9875 sin->sin_addr.s_addr = ipif->ipif_pp_dst_addr; 9876 } 9877 return (0); 9878 } 9879 9880 /* 9881 * Check which flags will change by the given flags being set 9882 * silently ignore flags which userland is not allowed to control. 9883 * (Because these flags may change between SIOCGLIFFLAGS and 9884 * SIOCSLIFFLAGS, and that's outside of userland's control, 9885 * we need to silently ignore them rather than fail.) 
9886 */ 9887 static void 9888 ip_sioctl_flags_onoff(ipif_t *ipif, uint64_t flags, uint64_t *onp, 9889 uint64_t *offp) 9890 { 9891 ill_t *ill = ipif->ipif_ill; 9892 phyint_t *phyi = ill->ill_phyint; 9893 uint64_t cantchange_flags, intf_flags; 9894 uint64_t turn_on, turn_off; 9895 9896 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags; 9897 cantchange_flags = IFF_CANTCHANGE; 9898 if (IS_IPMP(ill)) 9899 cantchange_flags |= IFF_IPMP_CANTCHANGE; 9900 turn_on = (flags ^ intf_flags) & ~cantchange_flags; 9901 turn_off = intf_flags & turn_on; 9902 turn_on ^= turn_off; 9903 *onp = turn_on; 9904 *offp = turn_off; 9905 } 9906 9907 /* 9908 * Set interface flags. Many flags require special handling (e.g., 9909 * bringing the interface down); see below for details. 9910 * 9911 * NOTE : We really don't enforce that ipif_id zero should be used 9912 * for setting any flags other than IFF_LOGINT_FLAGS. This 9913 * is because applications generally does SICGLIFFLAGS and 9914 * ORs in the new flags (that affects the logical) and does a 9915 * SIOCSLIFFLAGS. Thus, "flags" below could contain bits other 9916 * than IFF_LOGINT_FLAGS. One could check whether "turn_on" - the 9917 * flags that will be turned on is correct with respect to 9918 * ipif_id 0. For backward compatibility reasons, it is not done. 
9919 */ 9920 /* ARGSUSED */ 9921 int 9922 ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9923 ip_ioctl_cmd_t *ipip, void *if_req) 9924 { 9925 uint64_t turn_on; 9926 uint64_t turn_off; 9927 int err = 0; 9928 phyint_t *phyi; 9929 ill_t *ill; 9930 conn_t *connp; 9931 uint64_t intf_flags; 9932 boolean_t phyint_flags_modified = B_FALSE; 9933 uint64_t flags; 9934 struct ifreq *ifr; 9935 struct lifreq *lifr; 9936 boolean_t set_linklocal = B_FALSE; 9937 9938 ip1dbg(("ip_sioctl_flags(%s:%u %p)\n", 9939 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9940 9941 ASSERT(IAM_WRITER_IPIF(ipif)); 9942 9943 ill = ipif->ipif_ill; 9944 phyi = ill->ill_phyint; 9945 9946 if (ipip->ipi_cmd_type == IF_CMD) { 9947 ifr = (struct ifreq *)if_req; 9948 flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff); 9949 } else { 9950 lifr = (struct lifreq *)if_req; 9951 flags = lifr->lifr_flags; 9952 } 9953 9954 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags; 9955 9956 /* 9957 * Have the flags been set correctly until now? 9958 */ 9959 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0); 9960 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0); 9961 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0); 9962 /* 9963 * Compare the new flags to the old, and partition 9964 * into those coming on and those going off. 9965 * For the 16 bit command keep the bits above bit 16 unchanged. 9966 */ 9967 if (ipip->ipi_cmd == SIOCSIFFLAGS) 9968 flags |= intf_flags & ~0xFFFF; 9969 9970 /* 9971 * Explicitly fail attempts to change flags that are always invalid on 9972 * an IPMP meta-interface. 
9973 */ 9974 if (IS_IPMP(ill) && ((flags ^ intf_flags) & IFF_IPMP_INVALID)) 9975 return (EINVAL); 9976 9977 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off); 9978 if ((turn_on|turn_off) == 0) 9979 return (0); /* No change */ 9980 9981 /* 9982 * All test addresses must be IFF_DEPRECATED (to ensure source address 9983 * selection avoids them) -- so force IFF_DEPRECATED on, and do not 9984 * allow it to be turned off. 9985 */ 9986 if ((turn_off & (IFF_DEPRECATED|IFF_NOFAILOVER)) == IFF_DEPRECATED && 9987 (turn_on|intf_flags) & IFF_NOFAILOVER) 9988 return (EINVAL); 9989 9990 if ((connp = Q_TO_CONN(q)) == NULL) 9991 return (EINVAL); 9992 9993 /* 9994 * Only vrrp control socket is allowed to change IFF_UP and 9995 * IFF_NOACCEPT flags when IFF_VRRP is set. 9996 */ 9997 if ((intf_flags & IFF_VRRP) && ((turn_off | turn_on) & IFF_UP)) { 9998 if (!connp->conn_isvrrp) 9999 return (EINVAL); 10000 } 10001 10002 /* 10003 * The IFF_NOACCEPT flag can only be set on an IFF_VRRP IP address by 10004 * VRRP control socket. 10005 */ 10006 if ((turn_off | turn_on) & IFF_NOACCEPT) { 10007 if (!connp->conn_isvrrp || !(intf_flags & IFF_VRRP)) 10008 return (EINVAL); 10009 } 10010 10011 if (turn_on & IFF_NOFAILOVER) { 10012 turn_on |= IFF_DEPRECATED; 10013 flags |= IFF_DEPRECATED; 10014 } 10015 10016 /* 10017 * On underlying interfaces, only allow applications to manage test 10018 * addresses -- otherwise, they may get confused when the address 10019 * moves as part of being brought up. Likewise, prevent an 10020 * application-managed test address from being converted to a data 10021 * address. To prevent migration of administratively up addresses in 10022 * the kernel, we don't allow them to be converted either. 
10023 */ 10024 if (IS_UNDER_IPMP(ill)) { 10025 const uint64_t appflags = IFF_DHCPRUNNING | IFF_ADDRCONF; 10026 10027 if ((turn_on & appflags) && !(flags & IFF_NOFAILOVER)) 10028 return (EINVAL); 10029 10030 if ((turn_off & IFF_NOFAILOVER) && 10031 (flags & (appflags | IFF_UP | IFF_DUPLICATE))) 10032 return (EINVAL); 10033 } 10034 10035 /* 10036 * Only allow IFF_TEMPORARY flag to be set on 10037 * IPv6 interfaces. 10038 */ 10039 if ((turn_on & IFF_TEMPORARY) && !(ipif->ipif_isv6)) 10040 return (EINVAL); 10041 10042 /* 10043 * cannot turn off IFF_NOXMIT on VNI interfaces. 10044 */ 10045 if ((turn_off & IFF_NOXMIT) && IS_VNI(ipif->ipif_ill)) 10046 return (EINVAL); 10047 10048 /* 10049 * Don't allow the IFF_ROUTER flag to be turned on on loopback 10050 * interfaces. It makes no sense in that context. 10051 */ 10052 if ((turn_on & IFF_ROUTER) && (phyi->phyint_flags & PHYI_LOOPBACK)) 10053 return (EINVAL); 10054 10055 /* 10056 * For IPv6 ipif_id 0, don't allow the interface to be up without 10057 * a link local address if IFF_NOLOCAL or IFF_ANYCAST are not set. 10058 * If the link local address isn't set, and can be set, it will get 10059 * set later on in this function. 10060 */ 10061 if (ipif->ipif_id == 0 && ipif->ipif_isv6 && 10062 (flags & IFF_UP) && !(flags & (IFF_NOLOCAL|IFF_ANYCAST)) && 10063 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) { 10064 if (ipif_cant_setlinklocal(ipif)) 10065 return (EINVAL); 10066 set_linklocal = B_TRUE; 10067 } 10068 10069 /* 10070 * If we modify physical interface flags, we'll potentially need to 10071 * send up two routing socket messages for the changes (one for the 10072 * IPv4 ill, and another for the IPv6 ill). Note that here. 10073 */ 10074 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS) 10075 phyint_flags_modified = B_TRUE; 10076 10077 /* 10078 * All functioning PHYI_STANDBY interfaces start life PHYI_INACTIVE 10079 * (otherwise, we'd immediately use them, defeating standby). 
Also, 10080 * since PHYI_INACTIVE has a separate meaning when PHYI_STANDBY is not 10081 * set, don't allow PHYI_STANDBY to be set if PHYI_INACTIVE is already 10082 * set, and clear PHYI_INACTIVE if PHYI_STANDBY is being cleared. We 10083 * also don't allow PHYI_STANDBY if VNI is enabled since its semantics 10084 * will not be honored. 10085 */ 10086 if (turn_on & PHYI_STANDBY) { 10087 /* 10088 * No need to grab ill_g_usesrc_lock here; see the 10089 * synchronization notes in ip.c. 10090 */ 10091 if (ill->ill_usesrc_grp_next != NULL || 10092 intf_flags & PHYI_INACTIVE) 10093 return (EINVAL); 10094 if (!(flags & PHYI_FAILED)) { 10095 flags |= PHYI_INACTIVE; 10096 turn_on |= PHYI_INACTIVE; 10097 } 10098 } 10099 10100 if (turn_off & PHYI_STANDBY) { 10101 flags &= ~PHYI_INACTIVE; 10102 turn_off |= PHYI_INACTIVE; 10103 } 10104 10105 /* 10106 * PHYI_FAILED and PHYI_INACTIVE are mutually exclusive; fail if both 10107 * would end up on. 10108 */ 10109 if ((flags & (PHYI_FAILED | PHYI_INACTIVE)) == 10110 (PHYI_FAILED | PHYI_INACTIVE)) 10111 return (EINVAL); 10112 10113 /* 10114 * If ILLF_ROUTER changes, we need to change the ip forwarding 10115 * status of the interface. 10116 */ 10117 if ((turn_on | turn_off) & ILLF_ROUTER) 10118 (void) ill_forward_set(ill, ((turn_on & ILLF_ROUTER) != 0)); 10119 10120 /* 10121 * If the interface is not UP and we are not going to 10122 * bring it UP, record the flags and return. When the 10123 * interface comes UP later, the right actions will be 10124 * taken. 10125 */ 10126 if (!(ipif->ipif_flags & IPIF_UP) && 10127 !(turn_on & IPIF_UP)) { 10128 /* Record new flags in their respective places. 
*/ 10129 mutex_enter(&ill->ill_lock); 10130 mutex_enter(&ill->ill_phyint->phyint_lock); 10131 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS); 10132 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS); 10133 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS); 10134 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS); 10135 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS); 10136 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS); 10137 mutex_exit(&ill->ill_lock); 10138 mutex_exit(&ill->ill_phyint->phyint_lock); 10139 10140 /* 10141 * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the 10142 * same to the kernel: if any of them has been set by 10143 * userland, the interface cannot be used for data traffic. 10144 */ 10145 if ((turn_on|turn_off) & 10146 (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) { 10147 ASSERT(!IS_IPMP(ill)); 10148 /* 10149 * It's possible the ill is part of an "anonymous" 10150 * IPMP group rather than a real group. In that case, 10151 * there are no other interfaces in the group and thus 10152 * no need to call ipmp_phyint_refresh_active(). 10153 */ 10154 if (IS_UNDER_IPMP(ill)) 10155 ipmp_phyint_refresh_active(phyi); 10156 } 10157 10158 if (phyint_flags_modified) { 10159 if (phyi->phyint_illv4 != NULL) { 10160 ip_rts_ifmsg(phyi->phyint_illv4-> 10161 ill_ipif, RTSQ_DEFAULT); 10162 } 10163 if (phyi->phyint_illv6 != NULL) { 10164 ip_rts_ifmsg(phyi->phyint_illv6-> 10165 ill_ipif, RTSQ_DEFAULT); 10166 } 10167 } 10168 /* The default multicast interface might have changed */ 10169 ire_increment_multicast_generation(ill->ill_ipst, 10170 ill->ill_isv6); 10171 10172 return (0); 10173 } else if (set_linklocal) { 10174 mutex_enter(&ill->ill_lock); 10175 if (set_linklocal) 10176 ipif->ipif_state_flags |= IPIF_SET_LINKLOCAL; 10177 mutex_exit(&ill->ill_lock); 10178 } 10179 10180 /* 10181 * Disallow IPv6 interfaces coming up that have the unspecified address, 10182 * or point-to-point interfaces with an unspecified destination. 
We do 10183 * allow the address to be unspecified for IPIF_NOLOCAL interfaces that 10184 * have a subnet assigned, which is how in.ndpd currently manages its 10185 * onlink prefix list when no addresses are configured with those 10186 * prefixes. 10187 */ 10188 if (ipif->ipif_isv6 && 10189 ((IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) && 10190 (!(ipif->ipif_flags & IPIF_NOLOCAL) && !(turn_on & IPIF_NOLOCAL) || 10191 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) || 10192 ((ipif->ipif_flags & IPIF_POINTOPOINT) && 10193 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr)))) { 10194 return (EINVAL); 10195 } 10196 10197 /* 10198 * Prevent IPv4 point-to-point interfaces with a 0.0.0.0 destination 10199 * from being brought up. 10200 */ 10201 if (!ipif->ipif_isv6 && 10202 ((ipif->ipif_flags & IPIF_POINTOPOINT) && 10203 ipif->ipif_pp_dst_addr == INADDR_ANY)) { 10204 return (EINVAL); 10205 } 10206 10207 /* 10208 * If we are going to change one or more of the flags that are 10209 * IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL, ILLF_NOARP, 10210 * ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, IPIF_PREFERRED, and 10211 * IPIF_NOFAILOVER, we will take special action. This is 10212 * done by bring the ipif down, changing the flags and bringing 10213 * it back up again. For IPIF_NOFAILOVER, the act of bringing it 10214 * back up will trigger the address to be moved. 10215 * 10216 * If we are going to change IFF_NOACCEPT, we need to bring 10217 * all the ipifs down then bring them up again. The act of 10218 * bringing all the ipifs back up will trigger the local 10219 * ires being recreated with "no_accept" set/cleared. 10220 * 10221 * Note that ILLF_NOACCEPT is always set separately from the 10222 * other flags. 
10223 */ 10224 if ((turn_on|turn_off) & 10225 (IPIF_UP|IPIF_DEPRECATED|IPIF_NOXMIT|IPIF_NOLOCAL|ILLF_NOARP| 10226 ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED| 10227 IPIF_NOFAILOVER)) { 10228 /* 10229 * ipif_down() will ire_delete bcast ire's for the subnet, 10230 * while the ire_identical_ref tracks the case of IRE_BROADCAST 10231 * entries shared between multiple ipifs on the same subnet. 10232 */ 10233 if (((ipif->ipif_flags | turn_on) & IPIF_UP) && 10234 !(turn_off & IPIF_UP)) { 10235 if (ipif->ipif_flags & IPIF_UP) 10236 ill->ill_logical_down = 1; 10237 turn_on &= ~IPIF_UP; 10238 } 10239 err = ipif_down(ipif, q, mp); 10240 ip1dbg(("ipif_down returns %d err ", err)); 10241 if (err == EINPROGRESS) 10242 return (err); 10243 (void) ipif_down_tail(ipif); 10244 } else if ((turn_on|turn_off) & ILLF_NOACCEPT) { 10245 /* 10246 * If we can quiesce the ill, then continue. If not, then 10247 * ip_sioctl_flags_tail() will be called from 10248 * ipif_ill_refrele_tail(). 10249 */ 10250 ill_down_ipifs(ill, B_TRUE); 10251 10252 mutex_enter(&connp->conn_lock); 10253 mutex_enter(&ill->ill_lock); 10254 if (!ill_is_quiescent(ill)) { 10255 boolean_t success; 10256 10257 success = ipsq_pending_mp_add(connp, ill->ill_ipif, 10258 q, mp, ILL_DOWN); 10259 mutex_exit(&ill->ill_lock); 10260 mutex_exit(&connp->conn_lock); 10261 return (success ? 
EINPROGRESS : EINTR); 10262 } 10263 mutex_exit(&ill->ill_lock); 10264 mutex_exit(&connp->conn_lock); 10265 } 10266 return (ip_sioctl_flags_tail(ipif, flags, q, mp)); 10267 } 10268 10269 static int 10270 ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp) 10271 { 10272 ill_t *ill; 10273 phyint_t *phyi; 10274 uint64_t turn_on, turn_off; 10275 boolean_t phyint_flags_modified = B_FALSE; 10276 int err = 0; 10277 boolean_t set_linklocal = B_FALSE; 10278 10279 ip1dbg(("ip_sioctl_flags_tail(%s:%u)\n", 10280 ipif->ipif_ill->ill_name, ipif->ipif_id)); 10281 10282 ASSERT(IAM_WRITER_IPIF(ipif)); 10283 10284 ill = ipif->ipif_ill; 10285 phyi = ill->ill_phyint; 10286 10287 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off); 10288 10289 /* 10290 * IFF_UP is handled separately. 10291 */ 10292 turn_on &= ~IFF_UP; 10293 turn_off &= ~IFF_UP; 10294 10295 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS) 10296 phyint_flags_modified = B_TRUE; 10297 10298 /* 10299 * Now we change the flags. Track current value of 10300 * other flags in their respective places. 
10301 */ 10302 mutex_enter(&ill->ill_lock); 10303 mutex_enter(&phyi->phyint_lock); 10304 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS); 10305 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS); 10306 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS); 10307 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS); 10308 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS); 10309 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS); 10310 if (ipif->ipif_state_flags & IPIF_SET_LINKLOCAL) { 10311 set_linklocal = B_TRUE; 10312 ipif->ipif_state_flags &= ~IPIF_SET_LINKLOCAL; 10313 } 10314 10315 mutex_exit(&ill->ill_lock); 10316 mutex_exit(&phyi->phyint_lock); 10317 10318 if (set_linklocal) 10319 (void) ipif_setlinklocal(ipif); 10320 10321 /* 10322 * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the same to 10323 * the kernel: if any of them has been set by userland, the interface 10324 * cannot be used for data traffic. 10325 */ 10326 if ((turn_on|turn_off) & (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) { 10327 ASSERT(!IS_IPMP(ill)); 10328 /* 10329 * It's possible the ill is part of an "anonymous" IPMP group 10330 * rather than a real group. In that case, there are no other 10331 * interfaces in the group and thus no need for us to call 10332 * ipmp_phyint_refresh_active(). 10333 */ 10334 if (IS_UNDER_IPMP(ill)) 10335 ipmp_phyint_refresh_active(phyi); 10336 } 10337 10338 if ((turn_on|turn_off) & ILLF_NOACCEPT) { 10339 /* 10340 * If the ILLF_NOACCEPT flag is changed, bring up all the 10341 * ipifs that were brought down. 10342 * 10343 * The routing sockets messages are sent as the result 10344 * of ill_up_ipifs(), further, SCTP's IPIF list was updated 10345 * as well. 10346 */ 10347 err = ill_up_ipifs(ill, q, mp); 10348 } else if ((flags & IFF_UP) && !(ipif->ipif_flags & IPIF_UP)) { 10349 /* 10350 * XXX ipif_up really does not know whether a phyint flags 10351 * was modified or not. So, it sends up information on 10352 * only one routing sockets message. 
As we don't bring up 10353 * the interface and also set PHYI_ flags simultaneously 10354 * it should be okay. 10355 */ 10356 err = ipif_up(ipif, q, mp); 10357 } else { 10358 /* 10359 * Make sure routing socket sees all changes to the flags. 10360 * ipif_up_done* handles this when we use ipif_up. 10361 */ 10362 if (phyint_flags_modified) { 10363 if (phyi->phyint_illv4 != NULL) { 10364 ip_rts_ifmsg(phyi->phyint_illv4-> 10365 ill_ipif, RTSQ_DEFAULT); 10366 } 10367 if (phyi->phyint_illv6 != NULL) { 10368 ip_rts_ifmsg(phyi->phyint_illv6-> 10369 ill_ipif, RTSQ_DEFAULT); 10370 } 10371 } else { 10372 ip_rts_ifmsg(ipif, RTSQ_DEFAULT); 10373 } 10374 /* 10375 * Update the flags in SCTP's IPIF list, ipif_up() will do 10376 * this in need_up case. 10377 */ 10378 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 10379 } 10380 10381 /* The default multicast interface might have changed */ 10382 ire_increment_multicast_generation(ill->ill_ipst, ill->ill_isv6); 10383 return (err); 10384 } 10385 10386 /* 10387 * Restart the flags operation now that the refcounts have dropped to zero. 10388 */ 10389 /* ARGSUSED */ 10390 int 10391 ip_sioctl_flags_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10392 ip_ioctl_cmd_t *ipip, void *if_req) 10393 { 10394 uint64_t flags; 10395 struct ifreq *ifr = if_req; 10396 struct lifreq *lifr = if_req; 10397 uint64_t turn_on, turn_off; 10398 10399 ip1dbg(("ip_sioctl_flags_restart(%s:%u %p)\n", 10400 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10401 10402 if (ipip->ipi_cmd_type == IF_CMD) { 10403 /* cast to uint16_t prevents unwanted sign extension */ 10404 flags = (uint16_t)ifr->ifr_flags; 10405 } else { 10406 flags = lifr->lifr_flags; 10407 } 10408 10409 /* 10410 * If this function call is a result of the ILLF_NOACCEPT flag 10411 * change, do not call ipif_down_tail(). See ip_sioctl_flags(). 
10412 */ 10413 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off); 10414 if (!((turn_on|turn_off) & ILLF_NOACCEPT)) 10415 (void) ipif_down_tail(ipif); 10416 10417 return (ip_sioctl_flags_tail(ipif, flags, q, mp)); 10418 } 10419 10420 /* 10421 * Can operate on either a module or a driver queue. 10422 */ 10423 /* ARGSUSED */ 10424 int 10425 ip_sioctl_get_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10426 ip_ioctl_cmd_t *ipip, void *if_req) 10427 { 10428 /* 10429 * Has the flags been set correctly till now ? 10430 */ 10431 ill_t *ill = ipif->ipif_ill; 10432 phyint_t *phyi = ill->ill_phyint; 10433 10434 ip1dbg(("ip_sioctl_get_flags(%s:%u %p)\n", 10435 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10436 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0); 10437 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0); 10438 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0); 10439 10440 /* 10441 * Need a lock since some flags can be set even when there are 10442 * references to the ipif. 10443 */ 10444 mutex_enter(&ill->ill_lock); 10445 if (ipip->ipi_cmd_type == IF_CMD) { 10446 struct ifreq *ifr = (struct ifreq *)if_req; 10447 10448 /* Get interface flags (low 16 only). */ 10449 ifr->ifr_flags = ((ipif->ipif_flags | 10450 ill->ill_flags | phyi->phyint_flags) & 0xffff); 10451 } else { 10452 struct lifreq *lifr = (struct lifreq *)if_req; 10453 10454 /* Get interface flags. */ 10455 lifr->lifr_flags = ipif->ipif_flags | 10456 ill->ill_flags | phyi->phyint_flags; 10457 } 10458 mutex_exit(&ill->ill_lock); 10459 return (0); 10460 } 10461 10462 /* 10463 * We allow the MTU to be set on an ILL, but not have it be different 10464 * for different IPIFs since we don't actually send packets on IPIFs. 
10465 */ 10466 /* ARGSUSED */ 10467 int 10468 ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10469 ip_ioctl_cmd_t *ipip, void *if_req) 10470 { 10471 int mtu; 10472 int ip_min_mtu; 10473 struct ifreq *ifr; 10474 struct lifreq *lifr; 10475 ill_t *ill; 10476 10477 ip1dbg(("ip_sioctl_mtu(%s:%u %p)\n", ipif->ipif_ill->ill_name, 10478 ipif->ipif_id, (void *)ipif)); 10479 if (ipip->ipi_cmd_type == IF_CMD) { 10480 ifr = (struct ifreq *)if_req; 10481 mtu = ifr->ifr_metric; 10482 } else { 10483 lifr = (struct lifreq *)if_req; 10484 mtu = lifr->lifr_mtu; 10485 } 10486 /* Only allow for logical unit zero i.e. not on "bge0:17" */ 10487 if (ipif->ipif_id != 0) 10488 return (EINVAL); 10489 10490 ill = ipif->ipif_ill; 10491 if (ipif->ipif_isv6) 10492 ip_min_mtu = IPV6_MIN_MTU; 10493 else 10494 ip_min_mtu = IP_MIN_MTU; 10495 10496 mutex_enter(&ill->ill_lock); 10497 if (mtu > ill->ill_max_frag || mtu < ip_min_mtu) { 10498 mutex_exit(&ill->ill_lock); 10499 return (EINVAL); 10500 } 10501 /* 10502 * The dce and fragmentation code can handle changes to ill_mtu 10503 * concurrent with sending/fragmenting packets. 10504 */ 10505 ill->ill_mtu = mtu; 10506 ill->ill_flags |= ILLF_FIXEDMTU; 10507 mutex_exit(&ill->ill_lock); 10508 10509 /* 10510 * Make sure all dce_generation checks find out 10511 * that ill_mtu has changed. 10512 */ 10513 dce_increment_all_generations(ill->ill_isv6, ill->ill_ipst); 10514 10515 /* Update the MTU in SCTP's list */ 10516 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 10517 return (0); 10518 } 10519 10520 /* Get interface MTU. 
*/ 10521 /* ARGSUSED */ 10522 int 10523 ip_sioctl_get_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10524 ip_ioctl_cmd_t *ipip, void *if_req) 10525 { 10526 struct ifreq *ifr; 10527 struct lifreq *lifr; 10528 10529 ip1dbg(("ip_sioctl_get_mtu(%s:%u %p)\n", 10530 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10531 10532 /* 10533 * We allow a get on any logical interface even though the set 10534 * can only be done on logical unit 0. 10535 */ 10536 if (ipip->ipi_cmd_type == IF_CMD) { 10537 ifr = (struct ifreq *)if_req; 10538 ifr->ifr_metric = ipif->ipif_ill->ill_mtu; 10539 } else { 10540 lifr = (struct lifreq *)if_req; 10541 lifr->lifr_mtu = ipif->ipif_ill->ill_mtu; 10542 } 10543 return (0); 10544 } 10545 10546 /* Set interface broadcast address. */ 10547 /* ARGSUSED2 */ 10548 int 10549 ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10550 ip_ioctl_cmd_t *ipip, void *if_req) 10551 { 10552 ipaddr_t addr; 10553 ire_t *ire; 10554 ill_t *ill = ipif->ipif_ill; 10555 ip_stack_t *ipst = ill->ill_ipst; 10556 10557 ip1dbg(("ip_sioctl_brdaddr(%s:%u)\n", ill->ill_name, 10558 ipif->ipif_id)); 10559 10560 ASSERT(IAM_WRITER_IPIF(ipif)); 10561 if (!(ipif->ipif_flags & IPIF_BROADCAST)) 10562 return (EADDRNOTAVAIL); 10563 10564 ASSERT(!(ipif->ipif_isv6)); /* No IPv6 broadcast */ 10565 10566 if (sin->sin_family != AF_INET) 10567 return (EAFNOSUPPORT); 10568 10569 addr = sin->sin_addr.s_addr; 10570 if (ipif->ipif_flags & IPIF_UP) { 10571 /* 10572 * If we are already up, make sure the new 10573 * broadcast address makes sense. If it does, 10574 * there should be an IRE for it already. 10575 */ 10576 ire = ire_ftable_lookup_v4(addr, 0, 0, IRE_BROADCAST, 10577 ill, ipif->ipif_zoneid, NULL, 10578 (MATCH_IRE_ILL | MATCH_IRE_TYPE), 0, ipst, NULL); 10579 if (ire == NULL) { 10580 return (EINVAL); 10581 } else { 10582 ire_refrele(ire); 10583 } 10584 } 10585 /* 10586 * Changing the broadcast addr for this ipif. 
Since the IRE_BROADCAST 10587 * needs to already exist we never need to change the set of 10588 * IRE_BROADCASTs when we are UP. 10589 */ 10590 if (addr != ipif->ipif_brd_addr) 10591 IN6_IPADDR_TO_V4MAPPED(addr, &ipif->ipif_v6brd_addr); 10592 10593 return (0); 10594 } 10595 10596 /* Get interface broadcast address. */ 10597 /* ARGSUSED */ 10598 int 10599 ip_sioctl_get_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10600 ip_ioctl_cmd_t *ipip, void *if_req) 10601 { 10602 ip1dbg(("ip_sioctl_get_brdaddr(%s:%u %p)\n", 10603 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10604 if (!(ipif->ipif_flags & IPIF_BROADCAST)) 10605 return (EADDRNOTAVAIL); 10606 10607 /* IPIF_BROADCAST not possible with IPv6 */ 10608 ASSERT(!ipif->ipif_isv6); 10609 *sin = sin_null; 10610 sin->sin_family = AF_INET; 10611 sin->sin_addr.s_addr = ipif->ipif_brd_addr; 10612 return (0); 10613 } 10614 10615 /* 10616 * This routine is called to handle the SIOCS*IFNETMASK IOCTL. 10617 */ 10618 /* ARGSUSED */ 10619 int 10620 ip_sioctl_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10621 ip_ioctl_cmd_t *ipip, void *if_req) 10622 { 10623 int err = 0; 10624 in6_addr_t v6mask; 10625 10626 ip1dbg(("ip_sioctl_netmask(%s:%u %p)\n", 10627 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10628 10629 ASSERT(IAM_WRITER_IPIF(ipif)); 10630 10631 if (ipif->ipif_isv6) { 10632 sin6_t *sin6; 10633 10634 if (sin->sin_family != AF_INET6) 10635 return (EAFNOSUPPORT); 10636 10637 sin6 = (sin6_t *)sin; 10638 v6mask = sin6->sin6_addr; 10639 } else { 10640 ipaddr_t mask; 10641 10642 if (sin->sin_family != AF_INET) 10643 return (EAFNOSUPPORT); 10644 10645 mask = sin->sin_addr.s_addr; 10646 V4MASK_TO_V6(mask, v6mask); 10647 } 10648 10649 /* 10650 * No big deal if the interface isn't already up, or the mask 10651 * isn't really changing, or this is pt-pt. 
10652 */ 10653 if (!(ipif->ipif_flags & IPIF_UP) || 10654 IN6_ARE_ADDR_EQUAL(&v6mask, &ipif->ipif_v6net_mask) || 10655 (ipif->ipif_flags & IPIF_POINTOPOINT)) { 10656 ipif->ipif_v6net_mask = v6mask; 10657 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 10658 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 10659 ipif->ipif_v6net_mask, 10660 ipif->ipif_v6subnet); 10661 } 10662 return (0); 10663 } 10664 /* 10665 * Make sure we have valid net and subnet broadcast ire's 10666 * for the old netmask, if needed by other logical interfaces. 10667 */ 10668 err = ipif_logical_down(ipif, q, mp); 10669 if (err == EINPROGRESS) 10670 return (err); 10671 (void) ipif_down_tail(ipif); 10672 err = ip_sioctl_netmask_tail(ipif, sin, q, mp); 10673 return (err); 10674 } 10675 10676 static int 10677 ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp) 10678 { 10679 in6_addr_t v6mask; 10680 int err = 0; 10681 10682 ip1dbg(("ip_sioctl_netmask_tail(%s:%u %p)\n", 10683 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10684 10685 if (ipif->ipif_isv6) { 10686 sin6_t *sin6; 10687 10688 sin6 = (sin6_t *)sin; 10689 v6mask = sin6->sin6_addr; 10690 } else { 10691 ipaddr_t mask; 10692 10693 mask = sin->sin_addr.s_addr; 10694 V4MASK_TO_V6(mask, v6mask); 10695 } 10696 10697 ipif->ipif_v6net_mask = v6mask; 10698 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 10699 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 10700 ipif->ipif_v6subnet); 10701 } 10702 err = ipif_up(ipif, q, mp); 10703 10704 if (err == 0 || err == EINPROGRESS) { 10705 /* 10706 * The interface must be DL_BOUND if this packet has to 10707 * go out on the wire. Since we only go through a logical 10708 * down and are bound with the driver during an internal 10709 * down/up that is satisfied. 10710 */ 10711 if (!ipif->ipif_isv6 && ipif->ipif_ill->ill_wq != NULL) { 10712 /* Potentially broadcast an address mask reply. 
*/ 10713 ipif_mask_reply(ipif); 10714 } 10715 } 10716 return (err); 10717 } 10718 10719 /* ARGSUSED */ 10720 int 10721 ip_sioctl_netmask_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10722 ip_ioctl_cmd_t *ipip, void *if_req) 10723 { 10724 ip1dbg(("ip_sioctl_netmask_restart(%s:%u %p)\n", 10725 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10726 (void) ipif_down_tail(ipif); 10727 return (ip_sioctl_netmask_tail(ipif, sin, q, mp)); 10728 } 10729 10730 /* Get interface net mask. */ 10731 /* ARGSUSED */ 10732 int 10733 ip_sioctl_get_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10734 ip_ioctl_cmd_t *ipip, void *if_req) 10735 { 10736 struct lifreq *lifr = (struct lifreq *)if_req; 10737 struct sockaddr_in6 *sin6 = (sin6_t *)sin; 10738 10739 ip1dbg(("ip_sioctl_get_netmask(%s:%u %p)\n", 10740 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10741 10742 /* 10743 * net mask can't change since we have a reference to the ipif. 10744 */ 10745 if (ipif->ipif_isv6) { 10746 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 10747 *sin6 = sin6_null; 10748 sin6->sin6_family = AF_INET6; 10749 sin6->sin6_addr = ipif->ipif_v6net_mask; 10750 lifr->lifr_addrlen = 10751 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 10752 } else { 10753 *sin = sin_null; 10754 sin->sin_family = AF_INET; 10755 sin->sin_addr.s_addr = ipif->ipif_net_mask; 10756 if (ipip->ipi_cmd_type == LIF_CMD) { 10757 lifr->lifr_addrlen = 10758 ip_mask_to_plen(ipif->ipif_net_mask); 10759 } 10760 } 10761 return (0); 10762 } 10763 10764 /* ARGSUSED */ 10765 int 10766 ip_sioctl_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10767 ip_ioctl_cmd_t *ipip, void *if_req) 10768 { 10769 ip1dbg(("ip_sioctl_metric(%s:%u %p)\n", 10770 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10771 10772 /* 10773 * Since no applications should ever be setting metrics on underlying 10774 * interfaces, we explicitly fail to smoke 'em out. 
10775 */ 10776 if (IS_UNDER_IPMP(ipif->ipif_ill)) 10777 return (EINVAL); 10778 10779 /* 10780 * Set interface metric. We don't use this for 10781 * anything but we keep track of it in case it is 10782 * important to routing applications or such. 10783 */ 10784 if (ipip->ipi_cmd_type == IF_CMD) { 10785 struct ifreq *ifr; 10786 10787 ifr = (struct ifreq *)if_req; 10788 ipif->ipif_metric = ifr->ifr_metric; 10789 } else { 10790 struct lifreq *lifr; 10791 10792 lifr = (struct lifreq *)if_req; 10793 ipif->ipif_metric = lifr->lifr_metric; 10794 } 10795 return (0); 10796 } 10797 10798 /* ARGSUSED */ 10799 int 10800 ip_sioctl_get_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10801 ip_ioctl_cmd_t *ipip, void *if_req) 10802 { 10803 /* Get interface metric. */ 10804 ip1dbg(("ip_sioctl_get_metric(%s:%u %p)\n", 10805 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10806 10807 if (ipip->ipi_cmd_type == IF_CMD) { 10808 struct ifreq *ifr; 10809 10810 ifr = (struct ifreq *)if_req; 10811 ifr->ifr_metric = ipif->ipif_metric; 10812 } else { 10813 struct lifreq *lifr; 10814 10815 lifr = (struct lifreq *)if_req; 10816 lifr->lifr_metric = ipif->ipif_metric; 10817 } 10818 10819 return (0); 10820 } 10821 10822 /* ARGSUSED */ 10823 int 10824 ip_sioctl_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10825 ip_ioctl_cmd_t *ipip, void *if_req) 10826 { 10827 int arp_muxid; 10828 10829 ip1dbg(("ip_sioctl_muxid(%s:%u %p)\n", 10830 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10831 /* 10832 * Set the muxid returned from I_PLINK. 
10833 */ 10834 if (ipip->ipi_cmd_type == IF_CMD) { 10835 struct ifreq *ifr = (struct ifreq *)if_req; 10836 10837 ipif->ipif_ill->ill_muxid = ifr->ifr_ip_muxid; 10838 arp_muxid = ifr->ifr_arp_muxid; 10839 } else { 10840 struct lifreq *lifr = (struct lifreq *)if_req; 10841 10842 ipif->ipif_ill->ill_muxid = lifr->lifr_ip_muxid; 10843 arp_muxid = lifr->lifr_arp_muxid; 10844 } 10845 arl_set_muxid(ipif->ipif_ill, arp_muxid); 10846 return (0); 10847 } 10848 10849 /* ARGSUSED */ 10850 int 10851 ip_sioctl_get_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10852 ip_ioctl_cmd_t *ipip, void *if_req) 10853 { 10854 int arp_muxid = 0; 10855 10856 ip1dbg(("ip_sioctl_get_muxid(%s:%u %p)\n", 10857 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10858 /* 10859 * Get the muxid saved in ill for I_PUNLINK. 10860 */ 10861 arp_muxid = arl_get_muxid(ipif->ipif_ill); 10862 if (ipip->ipi_cmd_type == IF_CMD) { 10863 struct ifreq *ifr = (struct ifreq *)if_req; 10864 10865 ifr->ifr_ip_muxid = ipif->ipif_ill->ill_muxid; 10866 ifr->ifr_arp_muxid = arp_muxid; 10867 } else { 10868 struct lifreq *lifr = (struct lifreq *)if_req; 10869 10870 lifr->lifr_ip_muxid = ipif->ipif_ill->ill_muxid; 10871 lifr->lifr_arp_muxid = arp_muxid; 10872 } 10873 return (0); 10874 } 10875 10876 /* 10877 * Set the subnet prefix. Does not modify the broadcast address. 
10878 */ 10879 /* ARGSUSED */ 10880 int 10881 ip_sioctl_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10882 ip_ioctl_cmd_t *ipip, void *if_req) 10883 { 10884 int err = 0; 10885 in6_addr_t v6addr; 10886 in6_addr_t v6mask; 10887 boolean_t need_up = B_FALSE; 10888 int addrlen; 10889 10890 ip1dbg(("ip_sioctl_subnet(%s:%u %p)\n", 10891 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10892 10893 ASSERT(IAM_WRITER_IPIF(ipif)); 10894 addrlen = ((struct lifreq *)if_req)->lifr_addrlen; 10895 10896 if (ipif->ipif_isv6) { 10897 sin6_t *sin6; 10898 10899 if (sin->sin_family != AF_INET6) 10900 return (EAFNOSUPPORT); 10901 10902 sin6 = (sin6_t *)sin; 10903 v6addr = sin6->sin6_addr; 10904 if (!ip_remote_addr_ok_v6(&v6addr, &ipv6_all_ones)) 10905 return (EADDRNOTAVAIL); 10906 } else { 10907 ipaddr_t addr; 10908 10909 if (sin->sin_family != AF_INET) 10910 return (EAFNOSUPPORT); 10911 10912 addr = sin->sin_addr.s_addr; 10913 if (!ip_addr_ok_v4(addr, 0xFFFFFFFF)) 10914 return (EADDRNOTAVAIL); 10915 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 10916 /* Add 96 bits */ 10917 addrlen += IPV6_ABITS - IP_ABITS; 10918 } 10919 10920 if (ip_plen_to_mask_v6(addrlen, &v6mask) == NULL) 10921 return (EINVAL); 10922 10923 /* Check if bits in the address is set past the mask */ 10924 if (!V6_MASK_EQ(v6addr, v6mask, v6addr)) 10925 return (EINVAL); 10926 10927 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6subnet, &v6addr) && 10928 IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6net_mask, &v6mask)) 10929 return (0); /* No change */ 10930 10931 if (ipif->ipif_flags & IPIF_UP) { 10932 /* 10933 * If the interface is already marked up, 10934 * we call ipif_down which will take care 10935 * of ditching any IREs that have been set 10936 * up based on the old interface address. 
10937 */ 10938 err = ipif_logical_down(ipif, q, mp); 10939 if (err == EINPROGRESS) 10940 return (err); 10941 (void) ipif_down_tail(ipif); 10942 need_up = B_TRUE; 10943 } 10944 10945 err = ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, need_up); 10946 return (err); 10947 } 10948 10949 static int 10950 ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t v6addr, in6_addr_t v6mask, 10951 queue_t *q, mblk_t *mp, boolean_t need_up) 10952 { 10953 ill_t *ill = ipif->ipif_ill; 10954 int err = 0; 10955 10956 ip1dbg(("ip_sioctl_subnet_tail(%s:%u %p)\n", 10957 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10958 10959 /* Set the new address. */ 10960 mutex_enter(&ill->ill_lock); 10961 ipif->ipif_v6net_mask = v6mask; 10962 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 10963 V6_MASK_COPY(v6addr, ipif->ipif_v6net_mask, 10964 ipif->ipif_v6subnet); 10965 } 10966 mutex_exit(&ill->ill_lock); 10967 10968 if (need_up) { 10969 /* 10970 * Now bring the interface back up. If this 10971 * is the only IPIF for the ILL, ipif_up 10972 * will have to re-bind to the device, so 10973 * we may get back EINPROGRESS, in which 10974 * case, this IOCTL will get completed in 10975 * ip_rput_dlpi when we see the DL_BIND_ACK. 
10976 */ 10977 err = ipif_up(ipif, q, mp); 10978 if (err == EINPROGRESS) 10979 return (err); 10980 } 10981 return (err); 10982 } 10983 10984 /* ARGSUSED */ 10985 int 10986 ip_sioctl_subnet_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10987 ip_ioctl_cmd_t *ipip, void *if_req) 10988 { 10989 int addrlen; 10990 in6_addr_t v6addr; 10991 in6_addr_t v6mask; 10992 struct lifreq *lifr = (struct lifreq *)if_req; 10993 10994 ip1dbg(("ip_sioctl_subnet_restart(%s:%u %p)\n", 10995 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10996 (void) ipif_down_tail(ipif); 10997 10998 addrlen = lifr->lifr_addrlen; 10999 if (ipif->ipif_isv6) { 11000 sin6_t *sin6; 11001 11002 sin6 = (sin6_t *)sin; 11003 v6addr = sin6->sin6_addr; 11004 } else { 11005 ipaddr_t addr; 11006 11007 addr = sin->sin_addr.s_addr; 11008 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 11009 addrlen += IPV6_ABITS - IP_ABITS; 11010 } 11011 (void) ip_plen_to_mask_v6(addrlen, &v6mask); 11012 11013 return (ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, B_TRUE)); 11014 } 11015 11016 /* ARGSUSED */ 11017 int 11018 ip_sioctl_get_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11019 ip_ioctl_cmd_t *ipip, void *if_req) 11020 { 11021 struct lifreq *lifr = (struct lifreq *)if_req; 11022 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sin; 11023 11024 ip1dbg(("ip_sioctl_get_subnet(%s:%u %p)\n", 11025 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11026 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 11027 11028 if (ipif->ipif_isv6) { 11029 *sin6 = sin6_null; 11030 sin6->sin6_family = AF_INET6; 11031 sin6->sin6_addr = ipif->ipif_v6subnet; 11032 lifr->lifr_addrlen = 11033 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 11034 } else { 11035 *sin = sin_null; 11036 sin->sin_family = AF_INET; 11037 sin->sin_addr.s_addr = ipif->ipif_subnet; 11038 lifr->lifr_addrlen = ip_mask_to_plen(ipif->ipif_net_mask); 11039 } 11040 return (0); 11041 } 11042 11043 /* 11044 * Set the IPv6 address token. 
11045 */ 11046 /* ARGSUSED */ 11047 int 11048 ip_sioctl_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11049 ip_ioctl_cmd_t *ipi, void *if_req) 11050 { 11051 ill_t *ill = ipif->ipif_ill; 11052 int err; 11053 in6_addr_t v6addr; 11054 in6_addr_t v6mask; 11055 boolean_t need_up = B_FALSE; 11056 int i; 11057 sin6_t *sin6 = (sin6_t *)sin; 11058 struct lifreq *lifr = (struct lifreq *)if_req; 11059 int addrlen; 11060 11061 ip1dbg(("ip_sioctl_token(%s:%u %p)\n", 11062 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11063 ASSERT(IAM_WRITER_IPIF(ipif)); 11064 11065 addrlen = lifr->lifr_addrlen; 11066 /* Only allow for logical unit zero i.e. not on "le0:17" */ 11067 if (ipif->ipif_id != 0) 11068 return (EINVAL); 11069 11070 if (!ipif->ipif_isv6) 11071 return (EINVAL); 11072 11073 if (addrlen > IPV6_ABITS) 11074 return (EINVAL); 11075 11076 v6addr = sin6->sin6_addr; 11077 11078 /* 11079 * The length of the token is the length from the end. To get 11080 * the proper mask for this, compute the mask of the bits not 11081 * in the token; ie. the prefix, and then xor to get the mask. 
11082 */ 11083 if (ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask) == NULL) 11084 return (EINVAL); 11085 for (i = 0; i < 4; i++) { 11086 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 11087 } 11088 11089 if (V6_MASK_EQ(v6addr, v6mask, ill->ill_token) && 11090 ill->ill_token_length == addrlen) 11091 return (0); /* No change */ 11092 11093 if (ipif->ipif_flags & IPIF_UP) { 11094 err = ipif_logical_down(ipif, q, mp); 11095 if (err == EINPROGRESS) 11096 return (err); 11097 (void) ipif_down_tail(ipif); 11098 need_up = B_TRUE; 11099 } 11100 err = ip_sioctl_token_tail(ipif, sin6, addrlen, q, mp, need_up); 11101 return (err); 11102 } 11103 11104 static int 11105 ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, queue_t *q, 11106 mblk_t *mp, boolean_t need_up) 11107 { 11108 in6_addr_t v6addr; 11109 in6_addr_t v6mask; 11110 ill_t *ill = ipif->ipif_ill; 11111 int i; 11112 int err = 0; 11113 11114 ip1dbg(("ip_sioctl_token_tail(%s:%u %p)\n", 11115 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11116 v6addr = sin6->sin6_addr; 11117 /* 11118 * The length of the token is the length from the end. To get 11119 * the proper mask for this, compute the mask of the bits not 11120 * in the token; ie. the prefix, and then xor to get the mask. 11121 */ 11122 (void) ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask); 11123 for (i = 0; i < 4; i++) 11124 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 11125 11126 mutex_enter(&ill->ill_lock); 11127 V6_MASK_COPY(v6addr, v6mask, ill->ill_token); 11128 ill->ill_token_length = addrlen; 11129 ill->ill_manual_token = 1; 11130 11131 /* Reconfigure the link-local address based on this new token */ 11132 ipif_setlinklocal(ill->ill_ipif); 11133 11134 mutex_exit(&ill->ill_lock); 11135 11136 if (need_up) { 11137 /* 11138 * Now bring the interface back up. 
If this 11139 * is the only IPIF for the ILL, ipif_up 11140 * will have to re-bind to the device, so 11141 * we may get back EINPROGRESS, in which 11142 * case, this IOCTL will get completed in 11143 * ip_rput_dlpi when we see the DL_BIND_ACK. 11144 */ 11145 err = ipif_up(ipif, q, mp); 11146 if (err == EINPROGRESS) 11147 return (err); 11148 } 11149 return (err); 11150 } 11151 11152 /* ARGSUSED */ 11153 int 11154 ip_sioctl_get_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11155 ip_ioctl_cmd_t *ipi, void *if_req) 11156 { 11157 ill_t *ill; 11158 sin6_t *sin6 = (sin6_t *)sin; 11159 struct lifreq *lifr = (struct lifreq *)if_req; 11160 11161 ip1dbg(("ip_sioctl_get_token(%s:%u %p)\n", 11162 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11163 if (ipif->ipif_id != 0) 11164 return (EINVAL); 11165 11166 ill = ipif->ipif_ill; 11167 if (!ill->ill_isv6) 11168 return (ENXIO); 11169 11170 *sin6 = sin6_null; 11171 sin6->sin6_family = AF_INET6; 11172 ASSERT(!IN6_IS_ADDR_V4MAPPED(&ill->ill_token)); 11173 sin6->sin6_addr = ill->ill_token; 11174 lifr->lifr_addrlen = ill->ill_token_length; 11175 return (0); 11176 } 11177 11178 /* 11179 * Set (hardware) link specific information that might override 11180 * what was acquired through the DL_INFO_ACK. 11181 */ 11182 /* ARGSUSED */ 11183 int 11184 ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11185 ip_ioctl_cmd_t *ipi, void *if_req) 11186 { 11187 ill_t *ill = ipif->ipif_ill; 11188 int ip_min_mtu; 11189 struct lifreq *lifr = (struct lifreq *)if_req; 11190 lif_ifinfo_req_t *lir; 11191 11192 ip1dbg(("ip_sioctl_lnkinfo(%s:%u %p)\n", 11193 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11194 lir = &lifr->lifr_ifinfo; 11195 ASSERT(IAM_WRITER_IPIF(ipif)); 11196 11197 /* Only allow for logical unit zero i.e. not on "bge0:17" */ 11198 if (ipif->ipif_id != 0) 11199 return (EINVAL); 11200 11201 /* Set interface MTU. 
*/ 11202 if (ipif->ipif_isv6) 11203 ip_min_mtu = IPV6_MIN_MTU; 11204 else 11205 ip_min_mtu = IP_MIN_MTU; 11206 11207 /* 11208 * Verify values before we set anything. Allow zero to 11209 * mean unspecified. 11210 * 11211 * XXX We should be able to set the user-defined lir_mtu to some value 11212 * that is greater than ill_current_frag but less than ill_max_frag- the 11213 * ill_max_frag value tells us the max MTU that can be handled by the 11214 * datalink, whereas the ill_current_frag is dynamically computed for 11215 * some link-types like tunnels, based on the tunnel PMTU. However, 11216 * since there is currently no way of distinguishing between 11217 * administratively fixed link mtu values (e.g., those set via 11218 * /sbin/dladm) and dynamically discovered MTUs (e.g., those discovered 11219 * for tunnels) we conservatively choose the ill_current_frag as the 11220 * upper-bound. 11221 */ 11222 if (lir->lir_maxmtu != 0 && 11223 (lir->lir_maxmtu > ill->ill_current_frag || 11224 lir->lir_maxmtu < ip_min_mtu)) 11225 return (EINVAL); 11226 if (lir->lir_reachtime != 0 && 11227 lir->lir_reachtime > ND_MAX_REACHTIME) 11228 return (EINVAL); 11229 if (lir->lir_reachretrans != 0 && 11230 lir->lir_reachretrans > ND_MAX_REACHRETRANSTIME) 11231 return (EINVAL); 11232 11233 mutex_enter(&ill->ill_lock); 11234 /* 11235 * The dce and fragmentation code can handle changes to ill_mtu 11236 * concurrent with sending/fragmenting packets. 
11237 */ 11238 if (lir->lir_maxmtu != 0) 11239 ill->ill_user_mtu = lir->lir_maxmtu; 11240 11241 if (lir->lir_reachtime != 0) 11242 ill->ill_reachable_time = lir->lir_reachtime; 11243 11244 if (lir->lir_reachretrans != 0) 11245 ill->ill_reachable_retrans_time = lir->lir_reachretrans; 11246 11247 ill->ill_max_hops = lir->lir_maxhops; 11248 ill->ill_max_buf = ND_MAX_Q; 11249 if (!(ill->ill_flags & ILLF_FIXEDMTU) && ill->ill_user_mtu != 0) { 11250 /* 11251 * ill_mtu is the actual interface MTU, obtained as the min 11252 * of user-configured mtu and the value announced by the 11253 * driver (via DL_NOTE_SDU_SIZE/DL_INFO_ACK). Note that since 11254 * we have already made the choice of requiring 11255 * ill_user_mtu < ill_current_frag by the time we get here, 11256 * the ill_mtu effectively gets assigned to the ill_user_mtu 11257 * here. 11258 */ 11259 ill->ill_mtu = MIN(ill->ill_current_frag, ill->ill_user_mtu); 11260 } 11261 mutex_exit(&ill->ill_lock); 11262 11263 /* 11264 * Make sure all dce_generation checks find out 11265 * that ill_mtu has changed. 11266 */ 11267 if (!(ill->ill_flags & ILLF_FIXEDMTU) && (lir->lir_maxmtu != 0)) 11268 dce_increment_all_generations(ill->ill_isv6, ill->ill_ipst); 11269 11270 /* 11271 * Refresh IPMP meta-interface MTU if necessary. 
11272 */ 11273 if (IS_UNDER_IPMP(ill)) 11274 ipmp_illgrp_refresh_mtu(ill->ill_grp); 11275 11276 return (0); 11277 } 11278 11279 /* ARGSUSED */ 11280 int 11281 ip_sioctl_get_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11282 ip_ioctl_cmd_t *ipi, void *if_req) 11283 { 11284 struct lif_ifinfo_req *lir; 11285 ill_t *ill = ipif->ipif_ill; 11286 11287 ip1dbg(("ip_sioctl_get_lnkinfo(%s:%u %p)\n", 11288 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11289 if (ipif->ipif_id != 0) 11290 return (EINVAL); 11291 11292 lir = &((struct lifreq *)if_req)->lifr_ifinfo; 11293 lir->lir_maxhops = ill->ill_max_hops; 11294 lir->lir_reachtime = ill->ill_reachable_time; 11295 lir->lir_reachretrans = ill->ill_reachable_retrans_time; 11296 lir->lir_maxmtu = ill->ill_mtu; 11297 11298 return (0); 11299 } 11300 11301 /* 11302 * Return best guess as to the subnet mask for the specified address. 11303 * Based on the subnet masks for all the configured interfaces. 11304 * 11305 * We end up returning a zero mask in the case of default, multicast or 11306 * experimental. 11307 */ 11308 static ipaddr_t 11309 ip_subnet_mask(ipaddr_t addr, ipif_t **ipifp, ip_stack_t *ipst) 11310 { 11311 ipaddr_t net_mask; 11312 ill_t *ill; 11313 ipif_t *ipif; 11314 ill_walk_context_t ctx; 11315 ipif_t *fallback_ipif = NULL; 11316 11317 net_mask = ip_net_mask(addr); 11318 if (net_mask == 0) { 11319 *ipifp = NULL; 11320 return (0); 11321 } 11322 11323 /* Let's check to see if this is maybe a local subnet route. 
*/ 11324 /* this function only applies to IPv4 interfaces */ 11325 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 11326 ill = ILL_START_WALK_V4(&ctx, ipst); 11327 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 11328 mutex_enter(&ill->ill_lock); 11329 for (ipif = ill->ill_ipif; ipif != NULL; 11330 ipif = ipif->ipif_next) { 11331 if (IPIF_IS_CONDEMNED(ipif)) 11332 continue; 11333 if (!(ipif->ipif_flags & IPIF_UP)) 11334 continue; 11335 if ((ipif->ipif_subnet & net_mask) == 11336 (addr & net_mask)) { 11337 /* 11338 * Don't trust pt-pt interfaces if there are 11339 * other interfaces. 11340 */ 11341 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 11342 if (fallback_ipif == NULL) { 11343 ipif_refhold_locked(ipif); 11344 fallback_ipif = ipif; 11345 } 11346 continue; 11347 } 11348 11349 /* 11350 * Fine. Just assume the same net mask as the 11351 * directly attached subnet interface is using. 11352 */ 11353 ipif_refhold_locked(ipif); 11354 mutex_exit(&ill->ill_lock); 11355 rw_exit(&ipst->ips_ill_g_lock); 11356 if (fallback_ipif != NULL) 11357 ipif_refrele(fallback_ipif); 11358 *ipifp = ipif; 11359 return (ipif->ipif_net_mask); 11360 } 11361 } 11362 mutex_exit(&ill->ill_lock); 11363 } 11364 rw_exit(&ipst->ips_ill_g_lock); 11365 11366 *ipifp = fallback_ipif; 11367 return ((fallback_ipif != NULL) ? 11368 fallback_ipif->ipif_net_mask : net_mask); 11369 } 11370 11371 /* 11372 * ip_sioctl_copyin_setup calls ip_wput_ioctl to process the IP_IOCTL ioctl. 
11373 */ 11374 static void 11375 ip_wput_ioctl(queue_t *q, mblk_t *mp) 11376 { 11377 IOCP iocp; 11378 ipft_t *ipft; 11379 ipllc_t *ipllc; 11380 mblk_t *mp1; 11381 cred_t *cr; 11382 int error = 0; 11383 conn_t *connp; 11384 11385 ip1dbg(("ip_wput_ioctl")); 11386 iocp = (IOCP)mp->b_rptr; 11387 mp1 = mp->b_cont; 11388 if (mp1 == NULL) { 11389 iocp->ioc_error = EINVAL; 11390 mp->b_datap->db_type = M_IOCNAK; 11391 iocp->ioc_count = 0; 11392 qreply(q, mp); 11393 return; 11394 } 11395 11396 /* 11397 * These IOCTLs provide various control capabilities to 11398 * upstream agents such as ULPs and processes. There 11399 * are currently two such IOCTLs implemented. They 11400 * are used by TCP to provide update information for 11401 * existing IREs and to forcibly delete an IRE for a 11402 * host that is not responding, thereby forcing an 11403 * attempt at a new route. 11404 */ 11405 iocp->ioc_error = EINVAL; 11406 if (!pullupmsg(mp1, sizeof (ipllc->ipllc_cmd))) 11407 goto done; 11408 11409 ipllc = (ipllc_t *)mp1->b_rptr; 11410 for (ipft = ip_ioctl_ftbl; ipft->ipft_pfi; ipft++) { 11411 if (ipllc->ipllc_cmd == ipft->ipft_cmd) 11412 break; 11413 } 11414 /* 11415 * prefer credential from mblk over ioctl; 11416 * see ip_sioctl_copyin_setup 11417 */ 11418 cr = msg_getcred(mp, NULL); 11419 if (cr == NULL) 11420 cr = iocp->ioc_cr; 11421 11422 /* 11423 * Refhold the conn in case the request gets queued up in some lookup 11424 */ 11425 ASSERT(CONN_Q(q)); 11426 connp = Q_TO_CONN(q); 11427 CONN_INC_REF(connp); 11428 if (ipft->ipft_pfi && 11429 ((mp1->b_wptr - mp1->b_rptr) >= ipft->ipft_min_size || 11430 pullupmsg(mp1, ipft->ipft_min_size))) { 11431 error = (*ipft->ipft_pfi)(q, 11432 (ipft->ipft_flags & IPFT_F_SELF_REPLY) ? mp : mp1, cr); 11433 } 11434 if (ipft->ipft_flags & IPFT_F_SELF_REPLY) { 11435 /* 11436 * CONN_OPER_PENDING_DONE happens in the function called 11437 * through ipft_pfi above. 
11438 */ 11439 return; 11440 } 11441 11442 CONN_OPER_PENDING_DONE(connp); 11443 if (ipft->ipft_flags & IPFT_F_NO_REPLY) { 11444 freemsg(mp); 11445 return; 11446 } 11447 iocp->ioc_error = error; 11448 11449 done: 11450 mp->b_datap->db_type = M_IOCACK; 11451 if (iocp->ioc_error) 11452 iocp->ioc_count = 0; 11453 qreply(q, mp); 11454 } 11455 11456 /* 11457 * Assign a unique id for the ipif. This is used by sctp_addr.c 11458 * Note: remove if sctp_addr.c is redone to not shadow ill/ipif data structures. 11459 */ 11460 static void 11461 ipif_assign_seqid(ipif_t *ipif) 11462 { 11463 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 11464 11465 ipif->ipif_seqid = atomic_add_64_nv(&ipst->ips_ipif_g_seqid, 1); 11466 } 11467 11468 /* 11469 * Clone the contents of `sipif' to `dipif'. Requires that both ipifs are 11470 * administratively down (i.e., no DAD), of the same type, and locked. Note 11471 * that the clone is complete -- including the seqid -- and the expectation is 11472 * that the caller will either free or overwrite `sipif' before it's unlocked. 
11473 */ 11474 static void 11475 ipif_clone(const ipif_t *sipif, ipif_t *dipif) 11476 { 11477 ASSERT(MUTEX_HELD(&sipif->ipif_ill->ill_lock)); 11478 ASSERT(MUTEX_HELD(&dipif->ipif_ill->ill_lock)); 11479 ASSERT(!(sipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE))); 11480 ASSERT(!(dipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE))); 11481 ASSERT(sipif->ipif_ire_type == dipif->ipif_ire_type); 11482 11483 dipif->ipif_flags = sipif->ipif_flags; 11484 dipif->ipif_metric = sipif->ipif_metric; 11485 dipif->ipif_zoneid = sipif->ipif_zoneid; 11486 dipif->ipif_v6subnet = sipif->ipif_v6subnet; 11487 dipif->ipif_v6lcl_addr = sipif->ipif_v6lcl_addr; 11488 dipif->ipif_v6net_mask = sipif->ipif_v6net_mask; 11489 dipif->ipif_v6brd_addr = sipif->ipif_v6brd_addr; 11490 dipif->ipif_v6pp_dst_addr = sipif->ipif_v6pp_dst_addr; 11491 11492 /* 11493 * As per the comment atop the function, we assume that these sipif 11494 * fields will be changed before sipif is unlocked. 11495 */ 11496 dipif->ipif_seqid = sipif->ipif_seqid; 11497 dipif->ipif_state_flags = sipif->ipif_state_flags; 11498 } 11499 11500 /* 11501 * Transfer the contents of `sipif' to `dipif', and then free (if `virgipif' 11502 * is NULL) or overwrite `sipif' with `virgipif', which must be a virgin 11503 * (unreferenced) ipif. Also, if `sipif' is used by the current xop, then 11504 * transfer the xop to `dipif'. Requires that all ipifs are administratively 11505 * down (i.e., no DAD), of the same type, and unlocked. 11506 */ 11507 static void 11508 ipif_transfer(ipif_t *sipif, ipif_t *dipif, ipif_t *virgipif) 11509 { 11510 ipsq_t *ipsq = sipif->ipif_ill->ill_phyint->phyint_ipsq; 11511 ipxop_t *ipx = ipsq->ipsq_xop; 11512 11513 ASSERT(sipif != dipif); 11514 ASSERT(sipif != virgipif); 11515 11516 /* 11517 * Grab all of the locks that protect the ipif in a defined order. 
11518 */ 11519 GRAB_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill); 11520 11521 ipif_clone(sipif, dipif); 11522 if (virgipif != NULL) { 11523 ipif_clone(virgipif, sipif); 11524 mi_free(virgipif); 11525 } 11526 11527 RELEASE_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill); 11528 11529 /* 11530 * Transfer ownership of the current xop, if necessary. 11531 */ 11532 if (ipx->ipx_current_ipif == sipif) { 11533 ASSERT(ipx->ipx_pending_ipif == NULL); 11534 mutex_enter(&ipx->ipx_lock); 11535 ipx->ipx_current_ipif = dipif; 11536 mutex_exit(&ipx->ipx_lock); 11537 } 11538 11539 if (virgipif == NULL) 11540 mi_free(sipif); 11541 } 11542 11543 /* 11544 * checks if: 11545 * - <ill_name>:<ipif_id> is at most LIFNAMSIZ - 1 and 11546 * - logical interface is within the allowed range 11547 */ 11548 static int 11549 is_lifname_valid(ill_t *ill, unsigned int ipif_id) 11550 { 11551 if (snprintf(NULL, 0, "%s:%d", ill->ill_name, ipif_id) >= LIFNAMSIZ) 11552 return (ENAMETOOLONG); 11553 11554 if (ipif_id >= ill->ill_ipst->ips_ip_addrs_per_if) 11555 return (ERANGE); 11556 return (0); 11557 } 11558 11559 /* 11560 * Insert the ipif, so that the list of ipifs on the ill will be sorted 11561 * with respect to ipif_id. Note that an ipif with an ipif_id of -1 will 11562 * be inserted into the first space available in the list. The value of 11563 * ipif_id will then be set to the appropriate value for its position. 11564 */ 11565 static int 11566 ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock) 11567 { 11568 ill_t *ill; 11569 ipif_t *tipif; 11570 ipif_t **tipifp; 11571 int id, err; 11572 ip_stack_t *ipst; 11573 11574 ASSERT(ipif->ipif_ill->ill_net_type == IRE_LOOPBACK || 11575 IAM_WRITER_IPIF(ipif)); 11576 11577 ill = ipif->ipif_ill; 11578 ASSERT(ill != NULL); 11579 ipst = ill->ill_ipst; 11580 11581 /* 11582 * In the case of lo0:0 we already hold the ill_g_lock. 11583 * ill_lookup_on_name (acquires ill_g_lock) -> ipif_allocate -> 11584 * ipif_insert. 
11585 */ 11586 if (acquire_g_lock) 11587 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 11588 mutex_enter(&ill->ill_lock); 11589 id = ipif->ipif_id; 11590 tipifp = &(ill->ill_ipif); 11591 if (id == -1) { /* need to find a real id */ 11592 id = 0; 11593 while ((tipif = *tipifp) != NULL) { 11594 ASSERT(tipif->ipif_id >= id); 11595 if (tipif->ipif_id != id) 11596 break; /* non-consecutive id */ 11597 id++; 11598 tipifp = &(tipif->ipif_next); 11599 } 11600 if ((err = is_lifname_valid(ill, id)) != 0) { 11601 mutex_exit(&ill->ill_lock); 11602 if (acquire_g_lock) 11603 rw_exit(&ipst->ips_ill_g_lock); 11604 return (err); 11605 } 11606 ipif->ipif_id = id; /* assign new id */ 11607 } else if ((err = is_lifname_valid(ill, id)) == 0) { 11608 /* we have a real id; insert ipif in the right place */ 11609 while ((tipif = *tipifp) != NULL) { 11610 ASSERT(tipif->ipif_id != id); 11611 if (tipif->ipif_id > id) 11612 break; /* found correct location */ 11613 tipifp = &(tipif->ipif_next); 11614 } 11615 } else { 11616 mutex_exit(&ill->ill_lock); 11617 if (acquire_g_lock) 11618 rw_exit(&ipst->ips_ill_g_lock); 11619 return (err); 11620 } 11621 11622 ASSERT(tipifp != &(ill->ill_ipif) || id == 0); 11623 11624 ipif->ipif_next = tipif; 11625 *tipifp = ipif; 11626 mutex_exit(&ill->ill_lock); 11627 if (acquire_g_lock) 11628 rw_exit(&ipst->ips_ill_g_lock); 11629 11630 return (0); 11631 } 11632 11633 static void 11634 ipif_remove(ipif_t *ipif) 11635 { 11636 ipif_t **ipifp; 11637 ill_t *ill = ipif->ipif_ill; 11638 11639 ASSERT(RW_WRITE_HELD(&ill->ill_ipst->ips_ill_g_lock)); 11640 11641 mutex_enter(&ill->ill_lock); 11642 ipifp = &ill->ill_ipif; 11643 for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) { 11644 if (*ipifp == ipif) { 11645 *ipifp = ipif->ipif_next; 11646 break; 11647 } 11648 } 11649 mutex_exit(&ill->ill_lock); 11650 } 11651 11652 /* 11653 * Allocate and initialize a new interface control structure. (Always 11654 * called as writer.) 
11655 * When ipif_allocate() is called from ip_ll_subnet_defaults, the ill 11656 * is not part of the global linked list of ills. ipif_seqid is unique 11657 * in the system and to preserve the uniqueness, it is assigned only 11658 * when ill becomes part of the global list. At that point ill will 11659 * have a name. If it doesn't get assigned here, it will get assigned 11660 * in ipif_set_values() as part of SIOCSLIFNAME processing. 11661 * Aditionally, if we come here from ip_ll_subnet_defaults, we don't set 11662 * the interface flags or any other information from the DL_INFO_ACK for 11663 * DL_STYLE2 drivers (initialize == B_FALSE), since we won't have them at 11664 * this point. The flags etc. will be set in ip_ll_subnet_defaults when the 11665 * second DL_INFO_ACK comes in from the driver. 11666 */ 11667 static ipif_t * 11668 ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize, 11669 boolean_t insert, int *errorp) 11670 { 11671 int err; 11672 ipif_t *ipif; 11673 ip_stack_t *ipst = ill->ill_ipst; 11674 11675 ip1dbg(("ipif_allocate(%s:%d ill %p)\n", 11676 ill->ill_name, id, (void *)ill)); 11677 ASSERT(ire_type == IRE_LOOPBACK || IAM_WRITER_ILL(ill)); 11678 11679 if (errorp != NULL) 11680 *errorp = 0; 11681 11682 if ((ipif = mi_alloc(sizeof (ipif_t), BPRI_MED)) == NULL) { 11683 if (errorp != NULL) 11684 *errorp = ENOMEM; 11685 return (NULL); 11686 } 11687 *ipif = ipif_zero; /* start clean */ 11688 11689 ipif->ipif_ill = ill; 11690 ipif->ipif_id = id; /* could be -1 */ 11691 /* 11692 * Inherit the zoneid from the ill; for the shared stack instance 11693 * this is always the global zone 11694 */ 11695 ipif->ipif_zoneid = ill->ill_zoneid; 11696 11697 ipif->ipif_refcnt = 0; 11698 11699 if (insert) { 11700 if ((err = ipif_insert(ipif, ire_type != IRE_LOOPBACK)) != 0) { 11701 mi_free(ipif); 11702 if (errorp != NULL) 11703 *errorp = err; 11704 return (NULL); 11705 } 11706 /* -1 id should have been replaced by real id */ 11707 id = ipif->ipif_id; 
11708 ASSERT(id >= 0); 11709 } 11710 11711 if (ill->ill_name[0] != '\0') 11712 ipif_assign_seqid(ipif); 11713 11714 /* 11715 * If this is the zeroth ipif on the IPMP ill, create the illgrp 11716 * (which must not exist yet because the zeroth ipif is created once 11717 * per ill). However, do not not link it to the ipmp_grp_t until 11718 * I_PLINK is called; see ip_sioctl_plink_ipmp() for details. 11719 */ 11720 if (id == 0 && IS_IPMP(ill)) { 11721 if (ipmp_illgrp_create(ill) == NULL) { 11722 if (insert) { 11723 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 11724 ipif_remove(ipif); 11725 rw_exit(&ipst->ips_ill_g_lock); 11726 } 11727 mi_free(ipif); 11728 if (errorp != NULL) 11729 *errorp = ENOMEM; 11730 return (NULL); 11731 } 11732 } 11733 11734 /* 11735 * We grab ill_lock to protect the flag changes. The ipif is still 11736 * not up and can't be looked up until the ioctl completes and the 11737 * IPIF_CHANGING flag is cleared. 11738 */ 11739 mutex_enter(&ill->ill_lock); 11740 11741 ipif->ipif_ire_type = ire_type; 11742 11743 if (ipif->ipif_isv6) { 11744 ill->ill_flags |= ILLF_IPV6; 11745 } else { 11746 ipaddr_t inaddr_any = INADDR_ANY; 11747 11748 ill->ill_flags |= ILLF_IPV4; 11749 11750 /* Keep the IN6_IS_ADDR_V4MAPPED assertions happy */ 11751 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 11752 &ipif->ipif_v6lcl_addr); 11753 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 11754 &ipif->ipif_v6subnet); 11755 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 11756 &ipif->ipif_v6net_mask); 11757 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 11758 &ipif->ipif_v6brd_addr); 11759 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 11760 &ipif->ipif_v6pp_dst_addr); 11761 } 11762 11763 /* 11764 * Don't set the interface flags etc. now, will do it in 11765 * ip_ll_subnet_defaults. 
11766 */ 11767 if (!initialize) 11768 goto out; 11769 11770 /* 11771 * NOTE: The IPMP meta-interface is special-cased because it starts 11772 * with no underlying interfaces (and thus an unknown broadcast 11773 * address length), but all interfaces that can be placed into an IPMP 11774 * group are required to be broadcast-capable. 11775 */ 11776 if (ill->ill_bcast_addr_length != 0 || IS_IPMP(ill)) { 11777 /* 11778 * Later detect lack of DLPI driver multicast capability by 11779 * catching DL_ENABMULTI_REQ errors in ip_rput_dlpi(). 11780 */ 11781 ill->ill_flags |= ILLF_MULTICAST; 11782 if (!ipif->ipif_isv6) 11783 ipif->ipif_flags |= IPIF_BROADCAST; 11784 } else { 11785 if (ill->ill_net_type != IRE_LOOPBACK) { 11786 if (ipif->ipif_isv6) 11787 /* 11788 * Note: xresolv interfaces will eventually need 11789 * NOARP set here as well, but that will require 11790 * those external resolvers to have some 11791 * knowledge of that flag and act appropriately. 11792 * Not to be changed at present. 11793 */ 11794 ill->ill_flags |= ILLF_NONUD; 11795 else 11796 ill->ill_flags |= ILLF_NOARP; 11797 } 11798 if (ill->ill_phys_addr_length == 0) { 11799 if (IS_VNI(ill)) { 11800 ipif->ipif_flags |= IPIF_NOXMIT; 11801 } else { 11802 /* pt-pt supports multicast. */ 11803 ill->ill_flags |= ILLF_MULTICAST; 11804 if (ill->ill_net_type != IRE_LOOPBACK) 11805 ipif->ipif_flags |= IPIF_POINTOPOINT; 11806 } 11807 } 11808 } 11809 out: 11810 mutex_exit(&ill->ill_lock); 11811 return (ipif); 11812 } 11813 11814 /* 11815 * Remove the neighbor cache entries associated with this logical 11816 * interface. 
11817 */ 11818 int 11819 ipif_arp_down(ipif_t *ipif) 11820 { 11821 ill_t *ill = ipif->ipif_ill; 11822 int err = 0; 11823 11824 ip1dbg(("ipif_arp_down(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 11825 ASSERT(IAM_WRITER_IPIF(ipif)); 11826 11827 DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_down", 11828 ill_t *, ill, ipif_t *, ipif); 11829 ipif_nce_down(ipif); 11830 11831 /* 11832 * If this is the last ipif that is going down and there are no 11833 * duplicate addresses we may yet attempt to re-probe, then we need to 11834 * clean up ARP completely. 11835 */ 11836 if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 && 11837 !ill->ill_logical_down && ill->ill_net_type == IRE_IF_RESOLVER) { 11838 /* 11839 * If this was the last ipif on an IPMP interface, purge any 11840 * static ARP entries associated with it. 11841 */ 11842 if (IS_IPMP(ill)) 11843 ipmp_illgrp_refresh_arpent(ill->ill_grp); 11844 11845 /* UNBIND, DETACH */ 11846 err = arp_ll_down(ill); 11847 } 11848 11849 return (err); 11850 } 11851 11852 /* 11853 * Get the resolver set up for a new IP address. (Always called as writer.) 11854 * Called both for IPv4 and IPv6 interfaces, though it only does some 11855 * basic DAD related initialization for IPv6. Honors ILLF_NOARP. 11856 * 11857 * The enumerated value res_act tunes the behavior: 11858 * * Res_act_initial: set up all the resolver structures for a new 11859 * IP address. 11860 * * Res_act_defend: tell ARP that it needs to send a single gratuitous 11861 * ARP message in defense of the address. 11862 * * Res_act_rebind: tell ARP to change the hardware address for an IP 11863 * address (and issue gratuitous ARPs). Used by ipmp_ill_bind_ipif(). 11864 * 11865 * Returns zero on success, or an errno upon failure. 
11866 */ 11867 int 11868 ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) 11869 { 11870 ill_t *ill = ipif->ipif_ill; 11871 int err; 11872 boolean_t was_dup; 11873 11874 ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n", 11875 ill->ill_name, ipif->ipif_id, (uint_t)ipif->ipif_flags)); 11876 ASSERT(IAM_WRITER_IPIF(ipif)); 11877 11878 was_dup = B_FALSE; 11879 if (res_act == Res_act_initial) { 11880 ipif->ipif_addr_ready = 0; 11881 /* 11882 * We're bringing an interface up here. There's no way that we 11883 * should need to shut down ARP now. 11884 */ 11885 mutex_enter(&ill->ill_lock); 11886 if (ipif->ipif_flags & IPIF_DUPLICATE) { 11887 ipif->ipif_flags &= ~IPIF_DUPLICATE; 11888 ill->ill_ipif_dup_count--; 11889 was_dup = B_TRUE; 11890 } 11891 mutex_exit(&ill->ill_lock); 11892 } 11893 if (ipif->ipif_recovery_id != 0) 11894 (void) untimeout(ipif->ipif_recovery_id); 11895 ipif->ipif_recovery_id = 0; 11896 if (ill->ill_net_type != IRE_IF_RESOLVER) { 11897 ipif->ipif_addr_ready = 1; 11898 return (0); 11899 } 11900 /* NDP will set the ipif_addr_ready flag when it's ready */ 11901 if (ill->ill_isv6) 11902 return (0); 11903 11904 err = ipif_arp_up(ipif, res_act, was_dup); 11905 return (err); 11906 } 11907 11908 /* 11909 * This routine restarts IPv4/IPv6 duplicate address detection (DAD) 11910 * when a link has just gone back up. 11911 */ 11912 static void 11913 ipif_nce_start_dad(ipif_t *ipif) 11914 { 11915 ncec_t *ncec; 11916 ill_t *ill = ipif->ipif_ill; 11917 boolean_t isv6 = ill->ill_isv6; 11918 11919 if (isv6) { 11920 ncec = ncec_lookup_illgrp_v6(ipif->ipif_ill, 11921 &ipif->ipif_v6lcl_addr); 11922 } else { 11923 ipaddr_t v4addr; 11924 11925 if (ill->ill_net_type != IRE_IF_RESOLVER || 11926 (ipif->ipif_flags & IPIF_UNNUMBERED) || 11927 ipif->ipif_lcl_addr == INADDR_ANY) { 11928 /* 11929 * If we can't contact ARP for some reason, 11930 * that's not really a problem. 
Just send 11931 * out the routing socket notification that 11932 * DAD completion would have done, and continue. 11933 */ 11934 ipif_mask_reply(ipif); 11935 ipif_up_notify(ipif); 11936 ipif->ipif_addr_ready = 1; 11937 return; 11938 } 11939 11940 IN6_V4MAPPED_TO_IPADDR(&ipif->ipif_v6lcl_addr, v4addr); 11941 ncec = ncec_lookup_illgrp_v4(ipif->ipif_ill, &v4addr); 11942 } 11943 11944 if (ncec == NULL) { 11945 ip1dbg(("couldn't find ncec for ipif %p leaving !ready\n", 11946 (void *)ipif)); 11947 return; 11948 } 11949 if (!nce_restart_dad(ncec)) { 11950 /* 11951 * If we can't restart DAD for some reason, that's not really a 11952 * problem. Just send out the routing socket notification that 11953 * DAD completion would have done, and continue. 11954 */ 11955 ipif_up_notify(ipif); 11956 ipif->ipif_addr_ready = 1; 11957 } 11958 ncec_refrele(ncec); 11959 } 11960 11961 /* 11962 * Restart duplicate address detection on all interfaces on the given ill. 11963 * 11964 * This is called when an interface transitions from down to up 11965 * (DL_NOTE_LINK_UP) or up to down (DL_NOTE_LINK_DOWN). 11966 * 11967 * Note that since the underlying physical link has transitioned, we must cause 11968 * at least one routing socket message to be sent here, either via DAD 11969 * completion or just by default on the first ipif. (If we don't do this, then 11970 * in.mpathd will see long delays when doing link-based failure recovery.) 11971 */ 11972 void 11973 ill_restart_dad(ill_t *ill, boolean_t went_up) 11974 { 11975 ipif_t *ipif; 11976 11977 if (ill == NULL) 11978 return; 11979 11980 /* 11981 * If layer two doesn't support duplicate address detection, then just 11982 * send the routing socket message now and be done with it. 
11983 */ 11984 if (!ill->ill_isv6 && arp_no_defense) { 11985 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); 11986 return; 11987 } 11988 11989 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 11990 if (went_up) { 11991 11992 if (ipif->ipif_flags & IPIF_UP) { 11993 ipif_nce_start_dad(ipif); 11994 } else if (ipif->ipif_flags & IPIF_DUPLICATE) { 11995 /* 11996 * kick off the bring-up process now. 11997 */ 11998 ipif_do_recovery(ipif); 11999 } else { 12000 /* 12001 * Unfortunately, the first ipif is "special" 12002 * and represents the underlying ill in the 12003 * routing socket messages. Thus, when this 12004 * one ipif is down, we must still notify so 12005 * that the user knows the IFF_RUNNING status 12006 * change. (If the first ipif is up, then 12007 * we'll handle eventual routing socket 12008 * notification via DAD completion.) 12009 */ 12010 if (ipif == ill->ill_ipif) { 12011 ip_rts_ifmsg(ill->ill_ipif, 12012 RTSQ_DEFAULT); 12013 } 12014 } 12015 } else { 12016 /* 12017 * After link down, we'll need to send a new routing 12018 * message when the link comes back, so clear 12019 * ipif_addr_ready. 12020 */ 12021 ipif->ipif_addr_ready = 0; 12022 } 12023 } 12024 12025 /* 12026 * If we've torn down links, then notify the user right away. 
12027 */ 12028 if (!went_up) 12029 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); 12030 } 12031 12032 static void 12033 ipsq_delete(ipsq_t *ipsq) 12034 { 12035 ipxop_t *ipx = ipsq->ipsq_xop; 12036 12037 ipsq->ipsq_ipst = NULL; 12038 ASSERT(ipsq->ipsq_phyint == NULL); 12039 ASSERT(ipsq->ipsq_xop != NULL); 12040 ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipx->ipx_mphead == NULL); 12041 ASSERT(ipx->ipx_pending_mp == NULL); 12042 kmem_free(ipsq, sizeof (ipsq_t)); 12043 } 12044 12045 static int 12046 ill_up_ipifs_on_ill(ill_t *ill, queue_t *q, mblk_t *mp) 12047 { 12048 int err = 0; 12049 ipif_t *ipif; 12050 12051 if (ill == NULL) 12052 return (0); 12053 12054 ASSERT(IAM_WRITER_ILL(ill)); 12055 ill->ill_up_ipifs = B_TRUE; 12056 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 12057 if (ipif->ipif_was_up) { 12058 if (!(ipif->ipif_flags & IPIF_UP)) 12059 err = ipif_up(ipif, q, mp); 12060 ipif->ipif_was_up = B_FALSE; 12061 if (err != 0) { 12062 ASSERT(err == EINPROGRESS); 12063 return (err); 12064 } 12065 } 12066 } 12067 ill->ill_up_ipifs = B_FALSE; 12068 return (0); 12069 } 12070 12071 /* 12072 * This function is called to bring up all the ipifs that were up before 12073 * bringing the ill down via ill_down_ipifs(). 12074 */ 12075 int 12076 ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp) 12077 { 12078 int err; 12079 12080 ASSERT(IAM_WRITER_ILL(ill)); 12081 12082 if (ill->ill_replumbing) { 12083 ill->ill_replumbing = 0; 12084 /* 12085 * Send down REPLUMB_DONE notification followed by the 12086 * BIND_REQ on the arp stream. 12087 */ 12088 if (!ill->ill_isv6) 12089 arp_send_replumb_conf(ill); 12090 } 12091 err = ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv4, q, mp); 12092 if (err != 0) 12093 return (err); 12094 12095 return (ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv6, q, mp)); 12096 } 12097 12098 /* 12099 * Bring down any IPIF_UP ipifs on ill. If "logical" is B_TRUE, we bring 12100 * down the ipifs without sending DL_UNBIND_REQ to the driver. 
12101 */ 12102 static void 12103 ill_down_ipifs(ill_t *ill, boolean_t logical) 12104 { 12105 ipif_t *ipif; 12106 12107 ASSERT(IAM_WRITER_ILL(ill)); 12108 12109 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 12110 /* 12111 * We go through the ipif_down logic even if the ipif 12112 * is already down, since routes can be added based 12113 * on down ipifs. Going through ipif_down once again 12114 * will delete any IREs created based on these routes. 12115 */ 12116 if (ipif->ipif_flags & IPIF_UP) 12117 ipif->ipif_was_up = B_TRUE; 12118 12119 if (logical) { 12120 (void) ipif_logical_down(ipif, NULL, NULL); 12121 ipif_non_duplicate(ipif); 12122 (void) ipif_down_tail(ipif); 12123 } else { 12124 (void) ipif_down(ipif, NULL, NULL); 12125 } 12126 } 12127 } 12128 12129 /* 12130 * Redo source address selection. This makes IXAF_VERIFY_SOURCE take 12131 * a look again at valid source addresses. 12132 * This should be called each time after the set of source addresses has been 12133 * changed. 12134 */ 12135 void 12136 ip_update_source_selection(ip_stack_t *ipst) 12137 { 12138 /* We skip past SRC_GENERATION_VERIFY */ 12139 if (atomic_add_32_nv(&ipst->ips_src_generation, 1) == 12140 SRC_GENERATION_VERIFY) 12141 atomic_add_32(&ipst->ips_src_generation, 1); 12142 } 12143 12144 /* 12145 * Finish the group join started in ip_sioctl_groupname(). 
12146 */ 12147 /* ARGSUSED */ 12148 static void 12149 ip_join_illgrps(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy) 12150 { 12151 ill_t *ill = q->q_ptr; 12152 phyint_t *phyi = ill->ill_phyint; 12153 ipmp_grp_t *grp = phyi->phyint_grp; 12154 ip_stack_t *ipst = ill->ill_ipst; 12155 12156 /* IS_UNDER_IPMP() won't work until ipmp_ill_join_illgrp() is called */ 12157 ASSERT(!IS_IPMP(ill) && grp != NULL); 12158 ASSERT(IAM_WRITER_IPSQ(ipsq)); 12159 12160 if (phyi->phyint_illv4 != NULL) { 12161 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 12162 VERIFY(grp->gr_pendv4-- > 0); 12163 rw_exit(&ipst->ips_ipmp_lock); 12164 ipmp_ill_join_illgrp(phyi->phyint_illv4, grp->gr_v4); 12165 } 12166 if (phyi->phyint_illv6 != NULL) { 12167 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 12168 VERIFY(grp->gr_pendv6-- > 0); 12169 rw_exit(&ipst->ips_ipmp_lock); 12170 ipmp_ill_join_illgrp(phyi->phyint_illv6, grp->gr_v6); 12171 } 12172 freemsg(mp); 12173 } 12174 12175 /* 12176 * Process an SIOCSLIFGROUPNAME request. 12177 */ 12178 /* ARGSUSED */ 12179 int 12180 ip_sioctl_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12181 ip_ioctl_cmd_t *ipip, void *ifreq) 12182 { 12183 struct lifreq *lifr = ifreq; 12184 ill_t *ill = ipif->ipif_ill; 12185 ip_stack_t *ipst = ill->ill_ipst; 12186 phyint_t *phyi = ill->ill_phyint; 12187 ipmp_grp_t *grp = phyi->phyint_grp; 12188 mblk_t *ipsq_mp; 12189 int err = 0; 12190 12191 /* 12192 * Note that phyint_grp can only change here, where we're exclusive. 12193 */ 12194 ASSERT(IAM_WRITER_ILL(ill)); 12195 12196 if (ipif->ipif_id != 0 || ill->ill_usesrc_grp_next != NULL || 12197 (phyi->phyint_flags & PHYI_VIRTUAL)) 12198 return (EINVAL); 12199 12200 lifr->lifr_groupname[LIFGRNAMSIZ - 1] = '\0'; 12201 12202 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 12203 12204 /* 12205 * If the name hasn't changed, there's nothing to do. 
12206 */ 12207 if (grp != NULL && strcmp(grp->gr_name, lifr->lifr_groupname) == 0) 12208 goto unlock; 12209 12210 /* 12211 * Handle requests to rename an IPMP meta-interface. 12212 * 12213 * Note that creation of the IPMP meta-interface is handled in 12214 * userland through the standard plumbing sequence. As part of the 12215 * plumbing the IPMP meta-interface, its initial groupname is set to 12216 * the name of the interface (see ipif_set_values_tail()). 12217 */ 12218 if (IS_IPMP(ill)) { 12219 err = ipmp_grp_rename(grp, lifr->lifr_groupname); 12220 goto unlock; 12221 } 12222 12223 /* 12224 * Handle requests to add or remove an IP interface from a group. 12225 */ 12226 if (lifr->lifr_groupname[0] != '\0') { /* add */ 12227 /* 12228 * Moves are handled by first removing the interface from 12229 * its existing group, and then adding it to another group. 12230 * So, fail if it's already in a group. 12231 */ 12232 if (IS_UNDER_IPMP(ill)) { 12233 err = EALREADY; 12234 goto unlock; 12235 } 12236 12237 grp = ipmp_grp_lookup(lifr->lifr_groupname, ipst); 12238 if (grp == NULL) { 12239 err = ENOENT; 12240 goto unlock; 12241 } 12242 12243 /* 12244 * Check if the phyint and its ills are suitable for 12245 * inclusion into the group. 12246 */ 12247 if ((err = ipmp_grp_vet_phyint(grp, phyi)) != 0) 12248 goto unlock; 12249 12250 /* 12251 * Checks pass; join the group, and enqueue the remaining 12252 * illgrp joins for when we've become part of the group xop 12253 * and are exclusive across its IPSQs. Since qwriter_ip() 12254 * requires an mblk_t to scribble on, and since `mp' will be 12255 * freed as part of completing the ioctl, allocate another. 12256 */ 12257 if ((ipsq_mp = allocb(0, BPRI_MED)) == NULL) { 12258 err = ENOMEM; 12259 goto unlock; 12260 } 12261 12262 /* 12263 * Before we drop ipmp_lock, bump gr_pend* to ensure that the 12264 * IPMP meta-interface ills needed by `phyi' cannot go away 12265 * before ip_join_illgrps() is called back. 
See the comments 12266 * in ip_sioctl_plink_ipmp() for more. 12267 */ 12268 if (phyi->phyint_illv4 != NULL) 12269 grp->gr_pendv4++; 12270 if (phyi->phyint_illv6 != NULL) 12271 grp->gr_pendv6++; 12272 12273 rw_exit(&ipst->ips_ipmp_lock); 12274 12275 ipmp_phyint_join_grp(phyi, grp); 12276 ill_refhold(ill); 12277 qwriter_ip(ill, ill->ill_rq, ipsq_mp, ip_join_illgrps, 12278 SWITCH_OP, B_FALSE); 12279 return (0); 12280 } else { 12281 /* 12282 * Request to remove the interface from a group. If the 12283 * interface is not in a group, this trivially succeeds. 12284 */ 12285 rw_exit(&ipst->ips_ipmp_lock); 12286 if (IS_UNDER_IPMP(ill)) 12287 ipmp_phyint_leave_grp(phyi); 12288 return (0); 12289 } 12290 unlock: 12291 rw_exit(&ipst->ips_ipmp_lock); 12292 return (err); 12293 } 12294 12295 /* 12296 * Process an SIOCGLIFBINDING request. 12297 */ 12298 /* ARGSUSED */ 12299 int 12300 ip_sioctl_get_binding(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12301 ip_ioctl_cmd_t *ipip, void *ifreq) 12302 { 12303 ill_t *ill; 12304 struct lifreq *lifr = ifreq; 12305 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 12306 12307 if (!IS_IPMP(ipif->ipif_ill)) 12308 return (EINVAL); 12309 12310 rw_enter(&ipst->ips_ipmp_lock, RW_READER); 12311 if ((ill = ipif->ipif_bound_ill) == NULL) 12312 lifr->lifr_binding[0] = '\0'; 12313 else 12314 (void) strlcpy(lifr->lifr_binding, ill->ill_name, LIFNAMSIZ); 12315 rw_exit(&ipst->ips_ipmp_lock); 12316 return (0); 12317 } 12318 12319 /* 12320 * Process an SIOCGLIFGROUPNAME request. 
12321 */ 12322 /* ARGSUSED */ 12323 int 12324 ip_sioctl_get_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12325 ip_ioctl_cmd_t *ipip, void *ifreq) 12326 { 12327 ipmp_grp_t *grp; 12328 struct lifreq *lifr = ifreq; 12329 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 12330 12331 rw_enter(&ipst->ips_ipmp_lock, RW_READER); 12332 if ((grp = ipif->ipif_ill->ill_phyint->phyint_grp) == NULL) 12333 lifr->lifr_groupname[0] = '\0'; 12334 else 12335 (void) strlcpy(lifr->lifr_groupname, grp->gr_name, LIFGRNAMSIZ); 12336 rw_exit(&ipst->ips_ipmp_lock); 12337 return (0); 12338 } 12339 12340 /* 12341 * Process an SIOCGLIFGROUPINFO request. 12342 */ 12343 /* ARGSUSED */ 12344 int 12345 ip_sioctl_groupinfo(ipif_t *dummy_ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12346 ip_ioctl_cmd_t *ipip, void *dummy) 12347 { 12348 ipmp_grp_t *grp; 12349 lifgroupinfo_t *lifgr; 12350 ip_stack_t *ipst = CONNQ_TO_IPST(q); 12351 12352 /* ip_wput_nondata() verified mp->b_cont->b_cont */ 12353 lifgr = (lifgroupinfo_t *)mp->b_cont->b_cont->b_rptr; 12354 lifgr->gi_grname[LIFGRNAMSIZ - 1] = '\0'; 12355 12356 rw_enter(&ipst->ips_ipmp_lock, RW_READER); 12357 if ((grp = ipmp_grp_lookup(lifgr->gi_grname, ipst)) == NULL) { 12358 rw_exit(&ipst->ips_ipmp_lock); 12359 return (ENOENT); 12360 } 12361 ipmp_grp_info(grp, lifgr); 12362 rw_exit(&ipst->ips_ipmp_lock); 12363 return (0); 12364 } 12365 12366 static void 12367 ill_dl_down(ill_t *ill) 12368 { 12369 DTRACE_PROBE2(ill__downup, char *, "ill_dl_down", ill_t *, ill); 12370 12371 /* 12372 * The ill is down; unbind but stay attached since we're still 12373 * associated with a PPA. If we have negotiated DLPI capabilites 12374 * with the data link service provider (IDS_OK) then reset them. 12375 * The interval between unbinding and rebinding is potentially 12376 * unbounded hence we cannot assume things will be the same. 12377 * The DLPI capabilities will be probed again when the data link 12378 * is brought up. 
12379 */ 12380 mblk_t *mp = ill->ill_unbind_mp; 12381 12382 ip1dbg(("ill_dl_down(%s)\n", ill->ill_name)); 12383 12384 if (!ill->ill_replumbing) { 12385 /* Free all ilms for this ill */ 12386 update_conn_ill(ill, ill->ill_ipst); 12387 } else { 12388 ill_leave_multicast(ill); 12389 } 12390 12391 ill->ill_unbind_mp = NULL; 12392 if (mp != NULL) { 12393 ip1dbg(("ill_dl_down: %s (%u) for %s\n", 12394 dl_primstr(*(int *)mp->b_rptr), *(int *)mp->b_rptr, 12395 ill->ill_name)); 12396 mutex_enter(&ill->ill_lock); 12397 ill->ill_state_flags |= ILL_DL_UNBIND_IN_PROGRESS; 12398 mutex_exit(&ill->ill_lock); 12399 /* 12400 * ip_rput does not pass up normal (M_PROTO) DLPI messages 12401 * after ILL_CONDEMNED is set. So in the unplumb case, we call 12402 * ill_capability_dld_disable disable rightaway. If this is not 12403 * an unplumb operation then the disable happens on receipt of 12404 * the capab ack via ip_rput_dlpi_writer -> 12405 * ill_capability_ack_thr. In both cases the order of 12406 * the operations seen by DLD is capability disable followed 12407 * by DL_UNBIND. Also the DLD capability disable needs a 12408 * cv_wait'able context. 
12409 */ 12410 if (ill->ill_state_flags & ILL_CONDEMNED) 12411 ill_capability_dld_disable(ill); 12412 ill_capability_reset(ill, B_FALSE); 12413 ill_dlpi_send(ill, mp); 12414 } 12415 mutex_enter(&ill->ill_lock); 12416 ill->ill_dl_up = 0; 12417 ill_nic_event_dispatch(ill, 0, NE_DOWN, NULL, 0); 12418 mutex_exit(&ill->ill_lock); 12419 } 12420 12421 void 12422 ill_dlpi_dispatch(ill_t *ill, mblk_t *mp) 12423 { 12424 union DL_primitives *dlp; 12425 t_uscalar_t prim; 12426 boolean_t waitack = B_FALSE; 12427 12428 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); 12429 12430 dlp = (union DL_primitives *)mp->b_rptr; 12431 prim = dlp->dl_primitive; 12432 12433 ip1dbg(("ill_dlpi_dispatch: sending %s (%u) to %s\n", 12434 dl_primstr(prim), prim, ill->ill_name)); 12435 12436 switch (prim) { 12437 case DL_PHYS_ADDR_REQ: 12438 { 12439 dl_phys_addr_req_t *dlpap = (dl_phys_addr_req_t *)mp->b_rptr; 12440 ill->ill_phys_addr_pend = dlpap->dl_addr_type; 12441 break; 12442 } 12443 case DL_BIND_REQ: 12444 mutex_enter(&ill->ill_lock); 12445 ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS; 12446 mutex_exit(&ill->ill_lock); 12447 break; 12448 } 12449 12450 /* 12451 * Except for the ACKs for the M_PCPROTO messages, all other ACKs 12452 * are dropped by ip_rput() if ILL_CONDEMNED is set. Therefore 12453 * we only wait for the ACK of the DL_UNBIND_REQ. 12454 */ 12455 mutex_enter(&ill->ill_lock); 12456 if (!(ill->ill_state_flags & ILL_CONDEMNED) || 12457 (prim == DL_UNBIND_REQ)) { 12458 ill->ill_dlpi_pending = prim; 12459 waitack = B_TRUE; 12460 } 12461 12462 mutex_exit(&ill->ill_lock); 12463 DTRACE_PROBE3(ill__dlpi, char *, "ill_dlpi_dispatch", 12464 char *, dl_primstr(prim), ill_t *, ill); 12465 putnext(ill->ill_wq, mp); 12466 12467 /* 12468 * There is no ack for DL_NOTIFY_CONF messages 12469 */ 12470 if (waitack && prim == DL_NOTIFY_CONF) 12471 ill_dlpi_done(ill, prim); 12472 } 12473 12474 /* 12475 * Helper function for ill_dlpi_send(). 
12476 */ 12477 /* ARGSUSED */ 12478 static void 12479 ill_dlpi_send_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) 12480 { 12481 ill_dlpi_send(q->q_ptr, mp); 12482 } 12483 12484 /* 12485 * Send a DLPI control message to the driver but make sure there 12486 * is only one outstanding message. Uses ill_dlpi_pending to tell 12487 * when it must queue. ip_rput_dlpi_writer calls ill_dlpi_done() 12488 * when an ACK or a NAK is received to process the next queued message. 12489 */ 12490 void 12491 ill_dlpi_send(ill_t *ill, mblk_t *mp) 12492 { 12493 mblk_t **mpp; 12494 12495 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); 12496 12497 /* 12498 * To ensure that any DLPI requests for current exclusive operation 12499 * are always completely sent before any DLPI messages for other 12500 * operations, require writer access before enqueuing. 12501 */ 12502 if (!IAM_WRITER_ILL(ill)) { 12503 ill_refhold(ill); 12504 /* qwriter_ip() does the ill_refrele() */ 12505 qwriter_ip(ill, ill->ill_wq, mp, ill_dlpi_send_writer, 12506 NEW_OP, B_TRUE); 12507 return; 12508 } 12509 12510 mutex_enter(&ill->ill_lock); 12511 if (ill->ill_dlpi_pending != DL_PRIM_INVAL) { 12512 /* Must queue message. 
Tail insertion */ 12513 mpp = &ill->ill_dlpi_deferred; 12514 while (*mpp != NULL) 12515 mpp = &((*mpp)->b_next); 12516 12517 ip1dbg(("ill_dlpi_send: deferring request for %s " 12518 "while %s pending\n", ill->ill_name, 12519 dl_primstr(ill->ill_dlpi_pending))); 12520 12521 *mpp = mp; 12522 mutex_exit(&ill->ill_lock); 12523 return; 12524 } 12525 mutex_exit(&ill->ill_lock); 12526 ill_dlpi_dispatch(ill, mp); 12527 } 12528 12529 void 12530 ill_capability_send(ill_t *ill, mblk_t *mp) 12531 { 12532 ill->ill_capab_pending_cnt++; 12533 ill_dlpi_send(ill, mp); 12534 } 12535 12536 void 12537 ill_capability_done(ill_t *ill) 12538 { 12539 ASSERT(ill->ill_capab_pending_cnt != 0); 12540 12541 ill_dlpi_done(ill, DL_CAPABILITY_REQ); 12542 12543 ill->ill_capab_pending_cnt--; 12544 if (ill->ill_capab_pending_cnt == 0 && 12545 ill->ill_dlpi_capab_state == IDCS_OK) 12546 ill_capability_reset_alloc(ill); 12547 } 12548 12549 /* 12550 * Send all deferred DLPI messages without waiting for their ACKs. 12551 */ 12552 void 12553 ill_dlpi_send_deferred(ill_t *ill) 12554 { 12555 mblk_t *mp, *nextmp; 12556 12557 /* 12558 * Clear ill_dlpi_pending so that the message is not queued in 12559 * ill_dlpi_send(). 12560 */ 12561 mutex_enter(&ill->ill_lock); 12562 ill->ill_dlpi_pending = DL_PRIM_INVAL; 12563 mp = ill->ill_dlpi_deferred; 12564 ill->ill_dlpi_deferred = NULL; 12565 mutex_exit(&ill->ill_lock); 12566 12567 for (; mp != NULL; mp = nextmp) { 12568 nextmp = mp->b_next; 12569 mp->b_next = NULL; 12570 ill_dlpi_send(ill, mp); 12571 } 12572 } 12573 12574 /* 12575 * Check if the DLPI primitive `prim' is pending; print a warning if not. 
12576 */ 12577 boolean_t 12578 ill_dlpi_pending(ill_t *ill, t_uscalar_t prim) 12579 { 12580 t_uscalar_t pending; 12581 12582 mutex_enter(&ill->ill_lock); 12583 if (ill->ill_dlpi_pending == prim) { 12584 mutex_exit(&ill->ill_lock); 12585 return (B_TRUE); 12586 } 12587 12588 /* 12589 * During teardown, ill_dlpi_dispatch() will send DLPI requests 12590 * without waiting, so don't print any warnings in that case. 12591 */ 12592 if (ill->ill_state_flags & ILL_CONDEMNED) { 12593 mutex_exit(&ill->ill_lock); 12594 return (B_FALSE); 12595 } 12596 pending = ill->ill_dlpi_pending; 12597 mutex_exit(&ill->ill_lock); 12598 12599 if (pending == DL_PRIM_INVAL) { 12600 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE, 12601 "received unsolicited ack for %s on %s\n", 12602 dl_primstr(prim), ill->ill_name); 12603 } else { 12604 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE, 12605 "received unexpected ack for %s on %s (expecting %s)\n", 12606 dl_primstr(prim), ill->ill_name, dl_primstr(pending)); 12607 } 12608 return (B_FALSE); 12609 } 12610 12611 /* 12612 * Complete the current DLPI operation associated with `prim' on `ill' and 12613 * start the next queued DLPI operation (if any). If there are no queued DLPI 12614 * operations and the ill's current exclusive IPSQ operation has finished 12615 * (i.e., ipsq_current_finish() was called), then clear ipsq_current_ipif to 12616 * allow the next exclusive IPSQ operation to begin upon ipsq_exit(). See 12617 * the comments above ipsq_current_finish() for details. 
12618 */ 12619 void 12620 ill_dlpi_done(ill_t *ill, t_uscalar_t prim) 12621 { 12622 mblk_t *mp; 12623 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 12624 ipxop_t *ipx = ipsq->ipsq_xop; 12625 12626 ASSERT(IAM_WRITER_IPSQ(ipsq)); 12627 mutex_enter(&ill->ill_lock); 12628 12629 ASSERT(prim != DL_PRIM_INVAL); 12630 ASSERT(ill->ill_dlpi_pending == prim); 12631 12632 ip1dbg(("ill_dlpi_done: %s has completed %s (%u)\n", ill->ill_name, 12633 dl_primstr(ill->ill_dlpi_pending), ill->ill_dlpi_pending)); 12634 12635 if ((mp = ill->ill_dlpi_deferred) == NULL) { 12636 ill->ill_dlpi_pending = DL_PRIM_INVAL; 12637 if (ipx->ipx_current_done) { 12638 mutex_enter(&ipx->ipx_lock); 12639 ipx->ipx_current_ipif = NULL; 12640 mutex_exit(&ipx->ipx_lock); 12641 } 12642 cv_signal(&ill->ill_cv); 12643 mutex_exit(&ill->ill_lock); 12644 return; 12645 } 12646 12647 ill->ill_dlpi_deferred = mp->b_next; 12648 mp->b_next = NULL; 12649 mutex_exit(&ill->ill_lock); 12650 12651 ill_dlpi_dispatch(ill, mp); 12652 } 12653 12654 /* 12655 * Queue a (multicast) DLPI control message to be sent to the driver by 12656 * later calling ill_dlpi_send_queued. 12657 * We queue them while holding a lock (ill_mcast_lock) to ensure that they 12658 * are sent in order i.e., prevent a DL_DISABMULTI_REQ and DL_ENABMULTI_REQ 12659 * for the same group to race. 12660 * We send DLPI control messages in order using ill_lock. 12661 * For IPMP we should be called on the cast_ill. 12662 */ 12663 void 12664 ill_dlpi_queue(ill_t *ill, mblk_t *mp) 12665 { 12666 mblk_t **mpp; 12667 12668 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); 12669 12670 mutex_enter(&ill->ill_lock); 12671 /* Must queue message. Tail insertion */ 12672 mpp = &ill->ill_dlpi_deferred; 12673 while (*mpp != NULL) 12674 mpp = &((*mpp)->b_next); 12675 12676 *mpp = mp; 12677 mutex_exit(&ill->ill_lock); 12678 } 12679 12680 /* 12681 * Send the messages that were queued. Make sure there is only 12682 * one outstanding message. 
ip_rput_dlpi_writer calls ill_dlpi_done() 12683 * when an ACK or a NAK is received to process the next queued message. 12684 * For IPMP we are called on the upper ill, but when send what is queued 12685 * on the cast_ill. 12686 */ 12687 void 12688 ill_dlpi_send_queued(ill_t *ill) 12689 { 12690 mblk_t *mp; 12691 union DL_primitives *dlp; 12692 t_uscalar_t prim; 12693 ill_t *release_ill = NULL; 12694 12695 if (IS_IPMP(ill)) { 12696 /* On the upper IPMP ill. */ 12697 release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp); 12698 if (release_ill == NULL) { 12699 /* Avoid ever sending anything down to the ipmpstub */ 12700 return; 12701 } 12702 ill = release_ill; 12703 } 12704 mutex_enter(&ill->ill_lock); 12705 while ((mp = ill->ill_dlpi_deferred) != NULL) { 12706 if (ill->ill_dlpi_pending != DL_PRIM_INVAL) { 12707 /* Can't send. Somebody else will send it */ 12708 mutex_exit(&ill->ill_lock); 12709 goto done; 12710 } 12711 ill->ill_dlpi_deferred = mp->b_next; 12712 mp->b_next = NULL; 12713 if (!ill->ill_dl_up) { 12714 /* 12715 * Nobody there. All multicast addresses will be 12716 * re-joined when we get the DL_BIND_ACK bringing the 12717 * interface up. 
12718 */ 12719 freemsg(mp); 12720 continue; 12721 } 12722 dlp = (union DL_primitives *)mp->b_rptr; 12723 prim = dlp->dl_primitive; 12724 12725 if (!(ill->ill_state_flags & ILL_CONDEMNED) || 12726 (prim == DL_UNBIND_REQ)) { 12727 ill->ill_dlpi_pending = prim; 12728 } 12729 mutex_exit(&ill->ill_lock); 12730 12731 DTRACE_PROBE3(ill__dlpi, char *, "ill_dlpi_send_queued", 12732 char *, dl_primstr(prim), ill_t *, ill); 12733 putnext(ill->ill_wq, mp); 12734 mutex_enter(&ill->ill_lock); 12735 } 12736 mutex_exit(&ill->ill_lock); 12737 done: 12738 if (release_ill != NULL) 12739 ill_refrele(release_ill); 12740 } 12741 12742 /* 12743 * Queue an IP (IGMP/MLD) message to be sent by IP from 12744 * ill_mcast_send_queued 12745 * We queue them while holding a lock (ill_mcast_lock) to ensure that they 12746 * are sent in order i.e., prevent a IGMP leave and IGMP join for the same 12747 * group to race. 12748 * We send them in order using ill_lock. 12749 * For IPMP we are called on the upper ill, but we queue on the cast_ill. 12750 */ 12751 void 12752 ill_mcast_queue(ill_t *ill, mblk_t *mp) 12753 { 12754 mblk_t **mpp; 12755 ill_t *release_ill = NULL; 12756 12757 ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock)); 12758 12759 if (IS_IPMP(ill)) { 12760 /* On the upper IPMP ill. */ 12761 release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp); 12762 if (release_ill == NULL) { 12763 /* Discard instead of queuing for the ipmp interface */ 12764 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 12765 ip_drop_output("ipIfStatsOutDiscards - no cast_ill", 12766 mp, ill); 12767 freemsg(mp); 12768 return; 12769 } 12770 ill = release_ill; 12771 } 12772 12773 mutex_enter(&ill->ill_lock); 12774 /* Must queue message. 
Tail insertion */ 12775 mpp = &ill->ill_mcast_deferred; 12776 while (*mpp != NULL) 12777 mpp = &((*mpp)->b_next); 12778 12779 *mpp = mp; 12780 mutex_exit(&ill->ill_lock); 12781 if (release_ill != NULL) 12782 ill_refrele(release_ill); 12783 } 12784 12785 /* 12786 * Send the IP packets that were queued by ill_mcast_queue. 12787 * These are IGMP/MLD packets. 12788 * 12789 * For IPMP we are called on the upper ill, but when send what is queued 12790 * on the cast_ill. 12791 * 12792 * Request loopback of the report if we are acting as a multicast 12793 * router, so that the process-level routing demon can hear it. 12794 * This will run multiple times for the same group if there are members 12795 * on the same group for multiple ipif's on the same ill. The 12796 * igmp_input/mld_input code will suppress this due to the loopback thus we 12797 * always loopback membership report. 12798 * 12799 * We also need to make sure that this does not get load balanced 12800 * by IPMP. We do this by passing an ill to ip_output_simple. 12801 */ 12802 void 12803 ill_mcast_send_queued(ill_t *ill) 12804 { 12805 mblk_t *mp; 12806 ip_xmit_attr_t ixas; 12807 ill_t *release_ill = NULL; 12808 12809 if (IS_IPMP(ill)) { 12810 /* On the upper IPMP ill. */ 12811 release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp); 12812 if (release_ill == NULL) { 12813 /* 12814 * We should have no messages on the ipmp interface 12815 * but no point in trying to send them. 12816 */ 12817 return; 12818 } 12819 ill = release_ill; 12820 } 12821 bzero(&ixas, sizeof (ixas)); 12822 ixas.ixa_zoneid = ALL_ZONES; 12823 ixas.ixa_cred = kcred; 12824 ixas.ixa_cpid = NOPID; 12825 ixas.ixa_tsl = NULL; 12826 /* 12827 * Here we set ixa_ifindex. If IPMP it will be the lower ill which 12828 * makes ip_select_route pick the IRE_MULTICAST for the cast_ill. 12829 * That is necessary to handle IGMP/MLD snooping switches. 
12830 */ 12831 ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex; 12832 ixas.ixa_ipst = ill->ill_ipst; 12833 12834 mutex_enter(&ill->ill_lock); 12835 while ((mp = ill->ill_mcast_deferred) != NULL) { 12836 ill->ill_mcast_deferred = mp->b_next; 12837 mp->b_next = NULL; 12838 if (!ill->ill_dl_up) { 12839 /* 12840 * Nobody there. Just drop the ip packets. 12841 * IGMP/MLD will resend later, if this is a replumb. 12842 */ 12843 freemsg(mp); 12844 continue; 12845 } 12846 mutex_enter(&ill->ill_phyint->phyint_lock); 12847 if (IS_UNDER_IPMP(ill) && !ipmp_ill_is_active(ill)) { 12848 /* 12849 * When the ill is getting deactivated, we only want to 12850 * send the DLPI messages, so drop IGMP/MLD packets. 12851 * DLPI messages are handled by ill_dlpi_send_queued() 12852 */ 12853 mutex_exit(&ill->ill_phyint->phyint_lock); 12854 freemsg(mp); 12855 continue; 12856 } 12857 mutex_exit(&ill->ill_phyint->phyint_lock); 12858 mutex_exit(&ill->ill_lock); 12859 12860 /* Check whether we are sending IPv4 or IPv6. */ 12861 if (ill->ill_isv6) { 12862 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 12863 12864 ixas.ixa_multicast_ttl = ip6h->ip6_hops; 12865 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6; 12866 } else { 12867 ipha_t *ipha = (ipha_t *)mp->b_rptr; 12868 12869 ixas.ixa_multicast_ttl = ipha->ipha_ttl; 12870 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; 12871 ixas.ixa_flags &= ~IXAF_SET_ULP_CKSUM; 12872 } 12873 12874 ixas.ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_SOURCE; 12875 (void) ip_output_simple(mp, &ixas); 12876 ixa_cleanup(&ixas); 12877 12878 mutex_enter(&ill->ill_lock); 12879 } 12880 mutex_exit(&ill->ill_lock); 12881 12882 done: 12883 if (release_ill != NULL) 12884 ill_refrele(release_ill); 12885 } 12886 12887 /* 12888 * Take down a specific interface, but don't lose any information about it. 12889 * (Always called as writer.) 12890 * This function goes through the down sequence even if the interface is 12891 * already down. There are 2 reasons. 12892 * a. 
Currently we permit interface routes that depend on down interfaces 12893 * to be added. This behaviour itself is questionable. However it appears 12894 * that both Solaris and 4.3 BSD have exhibited this behaviour for a long 12895 * time. We go thru the cleanup in order to remove these routes. 12896 * b. The bringup of the interface could fail in ill_dl_up i.e. we get 12897 * DL_ERROR_ACK in response to the DL_BIND request. The interface is 12898 * down, but we need to cleanup i.e. do ill_dl_down and 12899 * ip_rput_dlpi_writer (DL_ERROR_ACK) -> ipif_down. 12900 * 12901 * IP-MT notes: 12902 * 12903 * Model of reference to interfaces. 12904 * 12905 * The following members in ipif_t track references to the ipif. 12906 * int ipif_refcnt; Active reference count 12907 * 12908 * The following members in ill_t track references to the ill. 12909 * int ill_refcnt; active refcnt 12910 * uint_t ill_ire_cnt; Number of ires referencing ill 12911 * uint_t ill_ncec_cnt; Number of ncecs referencing ill 12912 * uint_t ill_nce_cnt; Number of nces referencing ill 12913 * uint_t ill_ilm_cnt; Number of ilms referencing ill 12914 * 12915 * Reference to an ipif or ill can be obtained in any of the following ways. 12916 * 12917 * Through the lookup functions ipif_lookup_* / ill_lookup_* functions 12918 * Pointers to ipif / ill from other data structures viz ire and conn. 12919 * Implicit reference to the ipif / ill by holding a reference to the ire. 12920 * 12921 * The ipif/ill lookup functions return a reference held ipif / ill. 12922 * ipif_refcnt and ill_refcnt track the reference counts respectively. 12923 * This is a purely dynamic reference count associated with threads holding 12924 * references to the ipif / ill. Pointers from other structures do not 12925 * count towards this reference count. 12926 * 12927 * ill_ire_cnt is the number of ire's associated with the 12928 * ill. This is incremented whenever a new ire is created referencing the 12929 * ill. 
This is done atomically inside ire_add_v[46] where the ire is 12930 * actually added to the ire hash table. The count is decremented in 12931 * ire_inactive where the ire is destroyed. 12932 * 12933 * ill_ncec_cnt is the number of ncec's referencing the ill thru ncec_ill. 12934 * This is incremented atomically in 12935 * ndp_add_v4()/ndp_add_v6() where the nce is actually added to the 12936 * table. Similarly it is decremented in ncec_inactive() where the ncec 12937 * is destroyed. 12938 * 12939 * ill_nce_cnt is the number of nce's referencing the ill thru nce_ill. This is 12940 * incremented atomically in nce_add() where the nce is actually added to the 12941 * ill_nce. Similarly it is decremented in nce_inactive() where the nce 12942 * is destroyed. 12943 * 12944 * ill_ilm_cnt is the ilm's reference to the ill. It is incremented in 12945 * ilm_add() and decremented before the ilm is freed in ilm_delete(). 12946 * 12947 * Flow of ioctls involving interface down/up 12948 * 12949 * The following is the sequence of an attempt to set some critical flags on an 12950 * up interface. 12951 * ip_sioctl_flags 12952 * ipif_down 12953 * wait for ipif to be quiescent 12954 * ipif_down_tail 12955 * ip_sioctl_flags_tail 12956 * 12957 * All set ioctls that involve down/up sequence would have a skeleton similar 12958 * to the above. All the *tail functions are called after the refcounts have 12959 * dropped to the appropriate values. 12960 * 12961 * SIOC ioctls during the IPIF_CHANGING interval. 12962 * 12963 * Threads handling SIOC set ioctls serialize on the squeue, but this 12964 * is not done for SIOC get ioctls. Since a set ioctl can cause several 12965 * steps of internal changes to the state, some of which are visible in 12966 * ipif_flags (such as IFF_UP being cleared and later set), and we want 12967 * the set ioctl to be atomic related to the get ioctls, the SIOC get code 12968 * will wait and restart ioctls if IPIF_CHANGING is set. 
The mblk is then 12969 * enqueued in the ipsq and the operation is restarted by ipsq_exit() when 12970 * the current exclusive operation completes. The IPIF_CHANGING check 12971 * and enqueue is atomic using the ill_lock and ipsq_lock. The 12972 * lookup is done holding the ill_lock. Hence the ill/ipif state flags can't 12973 * change while the ill_lock is held. Before dropping the ill_lock we acquire 12974 * the ipsq_lock and call ipsq_enq. This ensures that ipsq_exit can't finish 12975 * until we release the ipsq_lock, even though the ill/ipif state flags 12976 * can change after we drop the ill_lock. 12977 */ 12978 int 12979 ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) 12980 { 12981 ill_t *ill = ipif->ipif_ill; 12982 conn_t *connp; 12983 boolean_t success; 12984 boolean_t ipif_was_up = B_FALSE; 12985 ip_stack_t *ipst = ill->ill_ipst; 12986 12987 ASSERT(IAM_WRITER_IPIF(ipif)); 12988 12989 ip1dbg(("ipif_down(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 12990 12991 DTRACE_PROBE3(ipif__downup, char *, "ipif_down", 12992 ill_t *, ill, ipif_t *, ipif); 12993 12994 if (ipif->ipif_flags & IPIF_UP) { 12995 mutex_enter(&ill->ill_lock); 12996 ipif->ipif_flags &= ~IPIF_UP; 12997 ASSERT(ill->ill_ipif_up_count > 0); 12998 --ill->ill_ipif_up_count; 12999 mutex_exit(&ill->ill_lock); 13000 ipif_was_up = B_TRUE; 13001 /* Update status in SCTP's list */ 13002 sctp_update_ipif(ipif, SCTP_IPIF_DOWN); 13003 ill_nic_event_dispatch(ipif->ipif_ill, 13004 MAP_IPIF_ID(ipif->ipif_id), NE_LIF_DOWN, NULL, 0); 13005 } 13006 13007 /* 13008 * Blow away memberships we established in ipif_multicast_up(). 13009 */ 13010 ipif_multicast_down(ipif); 13011 13012 /* 13013 * Remove from the mapping for __sin6_src_id. We insert only 13014 * when the address is not INADDR_ANY. As IPv4 addresses are 13015 * stored as mapped addresses, we need to check for mapped 13016 * INADDR_ANY also. 
13017 */ 13018 if (ipif_was_up && !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) && 13019 !IN6_IS_ADDR_V4MAPPED_ANY(&ipif->ipif_v6lcl_addr) && 13020 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 13021 int err; 13022 13023 err = ip_srcid_remove(&ipif->ipif_v6lcl_addr, 13024 ipif->ipif_zoneid, ipst); 13025 if (err != 0) { 13026 ip0dbg(("ipif_down: srcid_remove %d\n", err)); 13027 } 13028 } 13029 13030 if (ipif_was_up) { 13031 /* only delete if we'd added ire's before */ 13032 if (ipif->ipif_isv6) 13033 ipif_delete_ires_v6(ipif); 13034 else 13035 ipif_delete_ires_v4(ipif); 13036 } 13037 13038 if (ipif_was_up && ill->ill_ipif_up_count == 0) { 13039 /* 13040 * Since the interface is now down, it may have just become 13041 * inactive. Note that this needs to be done even for a 13042 * lll_logical_down(), or ARP entries will not get correctly 13043 * restored when the interface comes back up. 13044 */ 13045 if (IS_UNDER_IPMP(ill)) 13046 ipmp_ill_refresh_active(ill); 13047 } 13048 13049 /* 13050 * neighbor-discovery or arp entries for this interface. The ipif 13051 * has to be quiesced, so we walk all the nce's and delete those 13052 * that point at the ipif->ipif_ill. At the same time, we also 13053 * update IPMP so that ipifs for data addresses are unbound. We dont 13054 * call ipif_arp_down to DL_UNBIND the arp stream itself here, but defer 13055 * that for ipif_down_tail() 13056 */ 13057 ipif_nce_down(ipif); 13058 13059 /* 13060 * If this is the last ipif on the ill, we also need to remove 13061 * any IREs with ire_ill set. Otherwise ipif_is_quiescent() will 13062 * never succeed. 13063 */ 13064 if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0) 13065 ire_walk_ill(0, 0, ill_downi, ill, ill); 13066 13067 /* 13068 * Walk all CONNs that can have a reference on an ire for this 13069 * ipif (we actually walk all that now have stale references). 
13070 */ 13071 ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst); 13072 13073 /* 13074 * If mp is NULL the caller will wait for the appropriate refcnt. 13075 * Eg. ip_sioctl_removeif -> ipif_free -> ipif_down 13076 * and ill_delete -> ipif_free -> ipif_down 13077 */ 13078 if (mp == NULL) { 13079 ASSERT(q == NULL); 13080 return (0); 13081 } 13082 13083 if (CONN_Q(q)) { 13084 connp = Q_TO_CONN(q); 13085 mutex_enter(&connp->conn_lock); 13086 } else { 13087 connp = NULL; 13088 } 13089 mutex_enter(&ill->ill_lock); 13090 /* 13091 * Are there any ire's pointing to this ipif that are still active ? 13092 * If this is the last ipif going down, are there any ire's pointing 13093 * to this ill that are still active ? 13094 */ 13095 if (ipif_is_quiescent(ipif)) { 13096 mutex_exit(&ill->ill_lock); 13097 if (connp != NULL) 13098 mutex_exit(&connp->conn_lock); 13099 return (0); 13100 } 13101 13102 ip1dbg(("ipif_down: need to wait, adding pending mp %s ill %p", 13103 ill->ill_name, (void *)ill)); 13104 /* 13105 * Enqueue the mp atomically in ipsq_pending_mp. When the refcount 13106 * drops down, the operation will be restarted by ipif_ill_refrele_tail 13107 * which in turn is called by the last refrele on the ipif/ill/ire. 13108 */ 13109 success = ipsq_pending_mp_add(connp, ipif, q, mp, IPIF_DOWN); 13110 if (!success) { 13111 /* The conn is closing. So just return */ 13112 ASSERT(connp != NULL); 13113 mutex_exit(&ill->ill_lock); 13114 mutex_exit(&connp->conn_lock); 13115 return (EINTR); 13116 } 13117 13118 mutex_exit(&ill->ill_lock); 13119 if (connp != NULL) 13120 mutex_exit(&connp->conn_lock); 13121 return (EINPROGRESS); 13122 } 13123 13124 int 13125 ipif_down_tail(ipif_t *ipif) 13126 { 13127 ill_t *ill = ipif->ipif_ill; 13128 int err = 0; 13129 13130 DTRACE_PROBE3(ipif__downup, char *, "ipif_down_tail", 13131 ill_t *, ill, ipif_t *, ipif); 13132 13133 /* 13134 * Skip any loopback interface (null wq). 
13135 * If this is the last logical interface on the ill 13136 * have ill_dl_down tell the driver we are gone (unbind) 13137 * Note that lun 0 can ipif_down even though 13138 * there are other logical units that are up. 13139 * This occurs e.g. when we change a "significant" IFF_ flag. 13140 */ 13141 if (ill->ill_wq != NULL && !ill->ill_logical_down && 13142 ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 && 13143 ill->ill_dl_up) { 13144 ill_dl_down(ill); 13145 } 13146 if (!ipif->ipif_isv6) 13147 err = ipif_arp_down(ipif); 13148 13149 ill->ill_logical_down = 0; 13150 13151 ip_rts_ifmsg(ipif, RTSQ_DEFAULT); 13152 ip_rts_newaddrmsg(RTM_DELETE, 0, ipif, RTSQ_DEFAULT); 13153 return (err); 13154 } 13155 13156 /* 13157 * Bring interface logically down without bringing the physical interface 13158 * down e.g. when the netmask is changed. This avoids long lasting link 13159 * negotiations between an ethernet interface and a certain switches. 13160 */ 13161 static int 13162 ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp) 13163 { 13164 DTRACE_PROBE3(ipif__downup, char *, "ipif_logical_down", 13165 ill_t *, ipif->ipif_ill, ipif_t *, ipif); 13166 13167 /* 13168 * The ill_logical_down flag is a transient flag. It is set here 13169 * and is cleared once the down has completed in ipif_down_tail. 13170 * This flag does not indicate whether the ill stream is in the 13171 * DL_BOUND state with the driver. Instead this flag is used by 13172 * ipif_down_tail to determine whether to DL_UNBIND the stream with 13173 * the driver. The state of the ill stream i.e. whether it is 13174 * DL_BOUND with the driver or not is indicated by the ill_dl_up flag. 13175 */ 13176 ipif->ipif_ill->ill_logical_down = 1; 13177 return (ipif_down(ipif, q, mp)); 13178 } 13179 13180 /* 13181 * Initiate deallocate of an IPIF. Always called as writer. Called by 13182 * ill_delete or ip_sioctl_removeif. 
13183 */ 13184 static void 13185 ipif_free(ipif_t *ipif) 13186 { 13187 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 13188 13189 ASSERT(IAM_WRITER_IPIF(ipif)); 13190 13191 if (ipif->ipif_recovery_id != 0) 13192 (void) untimeout(ipif->ipif_recovery_id); 13193 ipif->ipif_recovery_id = 0; 13194 13195 /* 13196 * Take down the interface. We can be called either from ill_delete 13197 * or from ip_sioctl_removeif. 13198 */ 13199 (void) ipif_down(ipif, NULL, NULL); 13200 13201 /* 13202 * Now that the interface is down, there's no chance it can still 13203 * become a duplicate. Cancel any timer that may have been set while 13204 * tearing down. 13205 */ 13206 if (ipif->ipif_recovery_id != 0) 13207 (void) untimeout(ipif->ipif_recovery_id); 13208 ipif->ipif_recovery_id = 0; 13209 13210 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 13211 /* Remove pointers to this ill in the multicast routing tables */ 13212 reset_mrt_vif_ipif(ipif); 13213 /* If necessary, clear the cached source ipif rotor. */ 13214 if (ipif->ipif_ill->ill_src_ipif == ipif) 13215 ipif->ipif_ill->ill_src_ipif = NULL; 13216 rw_exit(&ipst->ips_ill_g_lock); 13217 } 13218 13219 static void 13220 ipif_free_tail(ipif_t *ipif) 13221 { 13222 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 13223 13224 /* 13225 * Need to hold both ill_g_lock and ill_lock while 13226 * inserting or removing an ipif from the linked list 13227 * of ipifs hanging off the ill. 13228 */ 13229 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 13230 13231 #ifdef DEBUG 13232 ipif_trace_cleanup(ipif); 13233 #endif 13234 13235 /* Ask SCTP to take it out of it list */ 13236 sctp_update_ipif(ipif, SCTP_IPIF_REMOVE); 13237 ip_rts_newaddrmsg(RTM_FREEADDR, 0, ipif, RTSQ_DEFAULT); 13238 13239 /* Get it out of the ILL interface list. 
*/ 13240 ipif_remove(ipif); 13241 rw_exit(&ipst->ips_ill_g_lock); 13242 13243 ASSERT(!(ipif->ipif_flags & (IPIF_UP | IPIF_DUPLICATE))); 13244 ASSERT(ipif->ipif_recovery_id == 0); 13245 ASSERT(ipif->ipif_ire_local == NULL); 13246 ASSERT(ipif->ipif_ire_if == NULL); 13247 13248 /* Free the memory. */ 13249 mi_free(ipif); 13250 } 13251 13252 /* 13253 * Sets `buf' to an ipif name of the form "ill_name:id", or "ill_name" if "id" 13254 * is zero. 13255 */ 13256 void 13257 ipif_get_name(const ipif_t *ipif, char *buf, int len) 13258 { 13259 char lbuf[LIFNAMSIZ]; 13260 char *name; 13261 size_t name_len; 13262 13263 buf[0] = '\0'; 13264 name = ipif->ipif_ill->ill_name; 13265 name_len = ipif->ipif_ill->ill_name_length; 13266 if (ipif->ipif_id != 0) { 13267 (void) sprintf(lbuf, "%s%c%d", name, IPIF_SEPARATOR_CHAR, 13268 ipif->ipif_id); 13269 name = lbuf; 13270 name_len = mi_strlen(name) + 1; 13271 } 13272 len -= 1; 13273 buf[len] = '\0'; 13274 len = MIN(len, name_len); 13275 bcopy(name, buf, len); 13276 } 13277 13278 /* 13279 * Sets `buf' to an ill name. 13280 */ 13281 void 13282 ill_get_name(const ill_t *ill, char *buf, int len) 13283 { 13284 char *name; 13285 size_t name_len; 13286 13287 name = ill->ill_name; 13288 name_len = ill->ill_name_length; 13289 len -= 1; 13290 buf[len] = '\0'; 13291 len = MIN(len, name_len); 13292 bcopy(name, buf, len); 13293 } 13294 13295 /* 13296 * Find an IPIF based on the name passed in. Names can be of the form <phys> 13297 * (e.g., le0) or <phys>:<#> (e.g., le0:1). When there is no colon, the 13298 * implied unit id is zero. <phys> must correspond to the name of an ILL. 13299 * (May be called as writer.) 
13300 */ 13301 static ipif_t * 13302 ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, 13303 boolean_t *exists, boolean_t isv6, zoneid_t zoneid, ip_stack_t *ipst) 13304 { 13305 char *cp; 13306 char *endp; 13307 long id; 13308 ill_t *ill; 13309 ipif_t *ipif; 13310 uint_t ire_type; 13311 boolean_t did_alloc = B_FALSE; 13312 13313 /* 13314 * If the caller wants to us to create the ipif, make sure we have a 13315 * valid zoneid 13316 */ 13317 ASSERT(!do_alloc || zoneid != ALL_ZONES); 13318 13319 if (namelen == 0) { 13320 return (NULL); 13321 } 13322 13323 *exists = B_FALSE; 13324 /* Look for a colon in the name. */ 13325 endp = &name[namelen]; 13326 for (cp = endp; --cp > name; ) { 13327 if (*cp == IPIF_SEPARATOR_CHAR) 13328 break; 13329 } 13330 13331 if (*cp == IPIF_SEPARATOR_CHAR) { 13332 /* 13333 * Reject any non-decimal aliases for logical 13334 * interfaces. Aliases with leading zeroes 13335 * are also rejected as they introduce ambiguity 13336 * in the naming of the interfaces. 13337 * In order to confirm with existing semantics, 13338 * and to not break any programs/script relying 13339 * on that behaviour, if<0>:0 is considered to be 13340 * a valid interface. 13341 * 13342 * If alias has two or more digits and the first 13343 * is zero, fail. 13344 */ 13345 if (&cp[2] < endp && cp[1] == '0') { 13346 return (NULL); 13347 } 13348 } 13349 13350 if (cp <= name) { 13351 cp = endp; 13352 } else { 13353 *cp = '\0'; 13354 } 13355 13356 /* 13357 * Look up the ILL, based on the portion of the name 13358 * before the slash. ill_lookup_on_name returns a held ill. 13359 * Temporary to check whether ill exists already. If so 13360 * ill_lookup_on_name will clear it. 13361 */ 13362 ill = ill_lookup_on_name(name, do_alloc, isv6, 13363 &did_alloc, ipst); 13364 if (cp != endp) 13365 *cp = IPIF_SEPARATOR_CHAR; 13366 if (ill == NULL) 13367 return (NULL); 13368 13369 /* Establish the unit number in the name. 
*/ 13370 id = 0; 13371 if (cp < endp && *endp == '\0') { 13372 /* If there was a colon, the unit number follows. */ 13373 cp++; 13374 if (ddi_strtol(cp, NULL, 0, &id) != 0) { 13375 ill_refrele(ill); 13376 return (NULL); 13377 } 13378 } 13379 13380 mutex_enter(&ill->ill_lock); 13381 /* Now see if there is an IPIF with this unit number. */ 13382 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 13383 if (ipif->ipif_id == id) { 13384 if (zoneid != ALL_ZONES && 13385 zoneid != ipif->ipif_zoneid && 13386 ipif->ipif_zoneid != ALL_ZONES) { 13387 mutex_exit(&ill->ill_lock); 13388 ill_refrele(ill); 13389 return (NULL); 13390 } 13391 if (IPIF_CAN_LOOKUP(ipif)) { 13392 ipif_refhold_locked(ipif); 13393 mutex_exit(&ill->ill_lock); 13394 if (!did_alloc) 13395 *exists = B_TRUE; 13396 /* 13397 * Drop locks before calling ill_refrele 13398 * since it can potentially call into 13399 * ipif_ill_refrele_tail which can end up 13400 * in trying to acquire any lock. 13401 */ 13402 ill_refrele(ill); 13403 return (ipif); 13404 } 13405 } 13406 } 13407 13408 if (!do_alloc) { 13409 mutex_exit(&ill->ill_lock); 13410 ill_refrele(ill); 13411 return (NULL); 13412 } 13413 13414 /* 13415 * If none found, atomically allocate and return a new one. 13416 * Historically, we used IRE_LOOPBACK only for lun 0, and IRE_LOCAL 13417 * to support "receive only" use of lo0:1 etc. as is still done 13418 * below as an initial guess. 13419 * However, this is now likely to be overriden later in ipif_up_done() 13420 * when we know for sure what address has been configured on the 13421 * interface, since we might have more than one loopback interface 13422 * with a loopback address, e.g. in the case of zones, and all the 13423 * interfaces with loopback addresses need to be marked IRE_LOOPBACK. 
13424 */ 13425 if (ill->ill_net_type == IRE_LOOPBACK && id == 0) 13426 ire_type = IRE_LOOPBACK; 13427 else 13428 ire_type = IRE_LOCAL; 13429 ipif = ipif_allocate(ill, id, ire_type, B_TRUE, B_TRUE, NULL); 13430 if (ipif != NULL) 13431 ipif_refhold_locked(ipif); 13432 mutex_exit(&ill->ill_lock); 13433 ill_refrele(ill); 13434 return (ipif); 13435 } 13436 13437 /* 13438 * This routine is called whenever a new address comes up on an ipif. If 13439 * we are configured to respond to address mask requests, then we are supposed 13440 * to broadcast an address mask reply at this time. This routine is also 13441 * called if we are already up, but a netmask change is made. This is legal 13442 * but might not make the system manager very popular. (May be called 13443 * as writer.) 13444 */ 13445 void 13446 ipif_mask_reply(ipif_t *ipif) 13447 { 13448 icmph_t *icmph; 13449 ipha_t *ipha; 13450 mblk_t *mp; 13451 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 13452 ip_xmit_attr_t ixas; 13453 13454 #define REPLY_LEN (sizeof (icmp_ipha) + sizeof (icmph_t) + IP_ADDR_LEN) 13455 13456 if (!ipst->ips_ip_respond_to_address_mask_broadcast) 13457 return; 13458 13459 /* ICMP mask reply is IPv4 only */ 13460 ASSERT(!ipif->ipif_isv6); 13461 /* ICMP mask reply is not for a loopback interface */ 13462 ASSERT(ipif->ipif_ill->ill_wq != NULL); 13463 13464 if (ipif->ipif_lcl_addr == INADDR_ANY) 13465 return; 13466 13467 mp = allocb(REPLY_LEN, BPRI_HI); 13468 if (mp == NULL) 13469 return; 13470 mp->b_wptr = mp->b_rptr + REPLY_LEN; 13471 13472 ipha = (ipha_t *)mp->b_rptr; 13473 bzero(ipha, REPLY_LEN); 13474 *ipha = icmp_ipha; 13475 ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl; 13476 ipha->ipha_src = ipif->ipif_lcl_addr; 13477 ipha->ipha_dst = ipif->ipif_brd_addr; 13478 ipha->ipha_length = htons(REPLY_LEN); 13479 ipha->ipha_ident = 0; 13480 13481 icmph = (icmph_t *)&ipha[1]; 13482 icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY; 13483 bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN); 13484 
icmph->icmph_checksum = IP_CSUM(mp, sizeof (ipha_t), 0); 13485 13486 bzero(&ixas, sizeof (ixas)); 13487 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; 13488 ixas.ixa_flags |= IXAF_SET_SOURCE; 13489 ixas.ixa_zoneid = ALL_ZONES; 13490 ixas.ixa_ifindex = 0; 13491 ixas.ixa_ipst = ipst; 13492 ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; 13493 (void) ip_output_simple(mp, &ixas); 13494 ixa_cleanup(&ixas); 13495 #undef REPLY_LEN 13496 } 13497 13498 /* 13499 * Join the ipif specific multicast groups. 13500 * Must be called after a mapping has been set up in the resolver. (Always 13501 * called as writer.) 13502 */ 13503 void 13504 ipif_multicast_up(ipif_t *ipif) 13505 { 13506 int err; 13507 ill_t *ill; 13508 ilm_t *ilm; 13509 13510 ASSERT(IAM_WRITER_IPIF(ipif)); 13511 13512 ill = ipif->ipif_ill; 13513 13514 ip1dbg(("ipif_multicast_up\n")); 13515 if (!(ill->ill_flags & ILLF_MULTICAST) || 13516 ipif->ipif_allhosts_ilm != NULL) 13517 return; 13518 13519 if (ipif->ipif_isv6) { 13520 in6_addr_t v6allmc = ipv6_all_hosts_mcast; 13521 in6_addr_t v6solmc = ipv6_solicited_node_mcast; 13522 13523 v6solmc.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3]; 13524 13525 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) 13526 return; 13527 13528 ip1dbg(("ipif_multicast_up - addmulti\n")); 13529 13530 /* 13531 * Join the all hosts multicast address. We skip this for 13532 * underlying IPMP interfaces since they should be invisible. 13533 */ 13534 if (!IS_UNDER_IPMP(ill)) { 13535 ilm = ip_addmulti(&v6allmc, ill, ipif->ipif_zoneid, 13536 &err); 13537 if (ilm == NULL) { 13538 ASSERT(err != 0); 13539 ip0dbg(("ipif_multicast_up: " 13540 "all_hosts_mcast failed %d\n", err)); 13541 return; 13542 } 13543 ipif->ipif_allhosts_ilm = ilm; 13544 } 13545 13546 /* 13547 * Enable multicast for the solicited node multicast address. 13548 * If IPMP we need to put the membership on the upper ill. 
13549 */ 13550 if (!(ipif->ipif_flags & IPIF_NOLOCAL)) { 13551 ill_t *mcast_ill = NULL; 13552 boolean_t need_refrele; 13553 13554 if (IS_UNDER_IPMP(ill) && 13555 (mcast_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) { 13556 need_refrele = B_TRUE; 13557 } else { 13558 mcast_ill = ill; 13559 need_refrele = B_FALSE; 13560 } 13561 13562 ilm = ip_addmulti(&v6solmc, mcast_ill, 13563 ipif->ipif_zoneid, &err); 13564 if (need_refrele) 13565 ill_refrele(mcast_ill); 13566 13567 if (ilm == NULL) { 13568 ASSERT(err != 0); 13569 ip0dbg(("ipif_multicast_up: solicited MC" 13570 " failed %d\n", err)); 13571 if ((ilm = ipif->ipif_allhosts_ilm) != NULL) { 13572 ipif->ipif_allhosts_ilm = NULL; 13573 (void) ip_delmulti(ilm); 13574 } 13575 return; 13576 } 13577 ipif->ipif_solmulti_ilm = ilm; 13578 } 13579 } else { 13580 in6_addr_t v6group; 13581 13582 if (ipif->ipif_lcl_addr == INADDR_ANY || IS_UNDER_IPMP(ill)) 13583 return; 13584 13585 /* Join the all hosts multicast address */ 13586 ip1dbg(("ipif_multicast_up - addmulti\n")); 13587 IN6_IPADDR_TO_V4MAPPED(htonl(INADDR_ALLHOSTS_GROUP), &v6group); 13588 13589 ilm = ip_addmulti(&v6group, ill, ipif->ipif_zoneid, &err); 13590 if (ilm == NULL) { 13591 ASSERT(err != 0); 13592 ip0dbg(("ipif_multicast_up: failed %d\n", err)); 13593 return; 13594 } 13595 ipif->ipif_allhosts_ilm = ilm; 13596 } 13597 } 13598 13599 /* 13600 * Blow away any multicast groups that we joined in ipif_multicast_up(). 13601 * (ilms from explicit memberships are handled in conn_update_ill.) 
13602 */ 13603 void 13604 ipif_multicast_down(ipif_t *ipif) 13605 { 13606 ASSERT(IAM_WRITER_IPIF(ipif)); 13607 13608 ip1dbg(("ipif_multicast_down\n")); 13609 13610 if (ipif->ipif_allhosts_ilm != NULL) { 13611 (void) ip_delmulti(ipif->ipif_allhosts_ilm); 13612 ipif->ipif_allhosts_ilm = NULL; 13613 } 13614 if (ipif->ipif_solmulti_ilm != NULL) { 13615 (void) ip_delmulti(ipif->ipif_solmulti_ilm); 13616 ipif->ipif_solmulti_ilm = NULL; 13617 } 13618 } 13619 13620 /* 13621 * Used when an interface comes up to recreate any extra routes on this 13622 * interface. 13623 */ 13624 int 13625 ill_recover_saved_ire(ill_t *ill) 13626 { 13627 mblk_t *mp; 13628 ip_stack_t *ipst = ill->ill_ipst; 13629 13630 ip1dbg(("ill_recover_saved_ire(%s)", ill->ill_name)); 13631 13632 mutex_enter(&ill->ill_saved_ire_lock); 13633 for (mp = ill->ill_saved_ire_mp; mp != NULL; mp = mp->b_cont) { 13634 ire_t *ire, *nire; 13635 ifrt_t *ifrt; 13636 13637 ifrt = (ifrt_t *)mp->b_rptr; 13638 /* 13639 * Create a copy of the IRE with the saved address and netmask. 
13640 */ 13641 if (ill->ill_isv6) { 13642 ire = ire_create_v6( 13643 &ifrt->ifrt_v6addr, 13644 &ifrt->ifrt_v6mask, 13645 &ifrt->ifrt_v6gateway_addr, 13646 ifrt->ifrt_type, 13647 ill, 13648 ifrt->ifrt_zoneid, 13649 ifrt->ifrt_flags, 13650 NULL, 13651 ipst); 13652 } else { 13653 ire = ire_create( 13654 (uint8_t *)&ifrt->ifrt_addr, 13655 (uint8_t *)&ifrt->ifrt_mask, 13656 (uint8_t *)&ifrt->ifrt_gateway_addr, 13657 ifrt->ifrt_type, 13658 ill, 13659 ifrt->ifrt_zoneid, 13660 ifrt->ifrt_flags, 13661 NULL, 13662 ipst); 13663 } 13664 if (ire == NULL) { 13665 mutex_exit(&ill->ill_saved_ire_lock); 13666 return (ENOMEM); 13667 } 13668 13669 if (ifrt->ifrt_flags & RTF_SETSRC) { 13670 if (ill->ill_isv6) { 13671 ire->ire_setsrc_addr_v6 = 13672 ifrt->ifrt_v6setsrc_addr; 13673 } else { 13674 ire->ire_setsrc_addr = ifrt->ifrt_setsrc_addr; 13675 } 13676 } 13677 13678 /* 13679 * Some software (for example, GateD and Sun Cluster) attempts 13680 * to create (what amount to) IRE_PREFIX routes with the 13681 * loopback address as the gateway. This is primarily done to 13682 * set up prefixes with the RTF_REJECT flag set (for example, 13683 * when generating aggregate routes.) 13684 * 13685 * If the IRE type (as defined by ill->ill_net_type) is 13686 * IRE_LOOPBACK, then we map the request into a 13687 * IRE_IF_NORESOLVER. 13688 */ 13689 if (ill->ill_net_type == IRE_LOOPBACK) 13690 ire->ire_type = IRE_IF_NORESOLVER; 13691 13692 /* 13693 * ire held by ire_add, will be refreled' towards the 13694 * the end of ipif_up_done 13695 */ 13696 nire = ire_add(ire); 13697 /* 13698 * Check if it was a duplicate entry. 
This handles 13699 * the case of two racing route adds for the same route 13700 */ 13701 if (nire == NULL) { 13702 ip1dbg(("ill_recover_saved_ire: FAILED\n")); 13703 } else if (nire != ire) { 13704 ip1dbg(("ill_recover_saved_ire: duplicate ire %p\n", 13705 (void *)nire)); 13706 ire_delete(nire); 13707 } else { 13708 ip1dbg(("ill_recover_saved_ire: added ire %p\n", 13709 (void *)nire)); 13710 } 13711 if (nire != NULL) 13712 ire_refrele(nire); 13713 } 13714 mutex_exit(&ill->ill_saved_ire_lock); 13715 return (0); 13716 } 13717 13718 /* 13719 * Used to set the netmask and broadcast address to default values when the 13720 * interface is brought up. (Always called as writer.) 13721 */ 13722 static void 13723 ipif_set_default(ipif_t *ipif) 13724 { 13725 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 13726 13727 if (!ipif->ipif_isv6) { 13728 /* 13729 * Interface holds an IPv4 address. Default 13730 * mask is the natural netmask. 13731 */ 13732 if (!ipif->ipif_net_mask) { 13733 ipaddr_t v4mask; 13734 13735 v4mask = ip_net_mask(ipif->ipif_lcl_addr); 13736 V4MASK_TO_V6(v4mask, ipif->ipif_v6net_mask); 13737 } 13738 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 13739 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 13740 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 13741 } else { 13742 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 13743 ipif->ipif_v6net_mask, ipif->ipif_v6subnet); 13744 } 13745 /* 13746 * NOTE: SunOS 4.X does this even if the broadcast address 13747 * has been already set thus we do the same here. 13748 */ 13749 if (ipif->ipif_flags & IPIF_BROADCAST) { 13750 ipaddr_t v4addr; 13751 13752 v4addr = ipif->ipif_subnet | ~ipif->ipif_net_mask; 13753 IN6_IPADDR_TO_V4MAPPED(v4addr, &ipif->ipif_v6brd_addr); 13754 } 13755 } else { 13756 /* 13757 * Interface holds an IPv6-only address. Default 13758 * mask is all-ones. 
13759 */ 13760 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask)) 13761 ipif->ipif_v6net_mask = ipv6_all_ones; 13762 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 13763 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 13764 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 13765 } else { 13766 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 13767 ipif->ipif_v6net_mask, ipif->ipif_v6subnet); 13768 } 13769 } 13770 } 13771 13772 /* 13773 * Return 0 if this address can be used as local address without causing 13774 * duplicate address problems. Otherwise, return EADDRNOTAVAIL if the address 13775 * is already up on a different ill, and EADDRINUSE if it's up on the same ill. 13776 * Note that the same IPv6 link-local address is allowed as long as the ills 13777 * are not on the same link. 13778 */ 13779 int 13780 ip_addr_availability_check(ipif_t *new_ipif) 13781 { 13782 in6_addr_t our_v6addr; 13783 ill_t *ill; 13784 ipif_t *ipif; 13785 ill_walk_context_t ctx; 13786 ip_stack_t *ipst = new_ipif->ipif_ill->ill_ipst; 13787 13788 ASSERT(IAM_WRITER_IPIF(new_ipif)); 13789 ASSERT(MUTEX_HELD(&ipst->ips_ip_addr_avail_lock)); 13790 ASSERT(RW_READ_HELD(&ipst->ips_ill_g_lock)); 13791 13792 new_ipif->ipif_flags &= ~IPIF_UNNUMBERED; 13793 if (IN6_IS_ADDR_UNSPECIFIED(&new_ipif->ipif_v6lcl_addr) || 13794 IN6_IS_ADDR_V4MAPPED_ANY(&new_ipif->ipif_v6lcl_addr)) 13795 return (0); 13796 13797 our_v6addr = new_ipif->ipif_v6lcl_addr; 13798 13799 if (new_ipif->ipif_isv6) 13800 ill = ILL_START_WALK_V6(&ctx, ipst); 13801 else 13802 ill = ILL_START_WALK_V4(&ctx, ipst); 13803 13804 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 13805 for (ipif = ill->ill_ipif; ipif != NULL; 13806 ipif = ipif->ipif_next) { 13807 if ((ipif == new_ipif) || 13808 !(ipif->ipif_flags & IPIF_UP) || 13809 (ipif->ipif_flags & IPIF_UNNUMBERED) || 13810 !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, 13811 &our_v6addr)) 13812 continue; 13813 13814 if (new_ipif->ipif_flags & IPIF_POINTOPOINT) 13815 new_ipif->ipif_flags |= IPIF_UNNUMBERED; 13816 
else if (ipif->ipif_flags & IPIF_POINTOPOINT) 13817 ipif->ipif_flags |= IPIF_UNNUMBERED; 13818 else if ((IN6_IS_ADDR_LINKLOCAL(&our_v6addr) || 13819 IN6_IS_ADDR_SITELOCAL(&our_v6addr)) && 13820 !IS_ON_SAME_LAN(ill, new_ipif->ipif_ill)) 13821 continue; 13822 else if (new_ipif->ipif_zoneid != ipif->ipif_zoneid && 13823 ipif->ipif_zoneid != ALL_ZONES && IS_LOOPBACK(ill)) 13824 continue; 13825 else if (new_ipif->ipif_ill == ill) 13826 return (EADDRINUSE); 13827 else 13828 return (EADDRNOTAVAIL); 13829 } 13830 } 13831 13832 return (0); 13833 } 13834 13835 /* 13836 * Bring up an ipif: bring up arp/ndp, bring up the DLPI stream, and add 13837 * IREs for the ipif. 13838 * When the routine returns EINPROGRESS then mp has been consumed and 13839 * the ioctl will be acked from ip_rput_dlpi. 13840 */ 13841 int 13842 ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) 13843 { 13844 ill_t *ill = ipif->ipif_ill; 13845 boolean_t isv6 = ipif->ipif_isv6; 13846 int err = 0; 13847 boolean_t success; 13848 uint_t ipif_orig_id; 13849 ip_stack_t *ipst = ill->ill_ipst; 13850 13851 ASSERT(IAM_WRITER_IPIF(ipif)); 13852 13853 ip1dbg(("ipif_up(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 13854 DTRACE_PROBE3(ipif__downup, char *, "ipif_up", 13855 ill_t *, ill, ipif_t *, ipif); 13856 13857 /* Shouldn't get here if it is already up. */ 13858 if (ipif->ipif_flags & IPIF_UP) 13859 return (EALREADY); 13860 13861 /* 13862 * If this is a request to bring up a data address on an interface 13863 * under IPMP, then move the address to its IPMP meta-interface and 13864 * try to bring it up. One complication is that the zeroth ipif for 13865 * an ill is special, in that every ill always has one, and that code 13866 * throughout IP deferences ill->ill_ipif without holding any locks. 
13867 */ 13868 if (IS_UNDER_IPMP(ill) && ipmp_ipif_is_dataaddr(ipif) && 13869 (!ipif->ipif_isv6 || !V6_IPIF_LINKLOCAL(ipif))) { 13870 ipif_t *stubipif = NULL, *moveipif = NULL; 13871 ill_t *ipmp_ill = ipmp_illgrp_ipmp_ill(ill->ill_grp); 13872 13873 /* 13874 * The ipif being brought up should be quiesced. If it's not, 13875 * something has gone amiss and we need to bail out. (If it's 13876 * quiesced, we know it will remain so via IPIF_CONDEMNED.) 13877 */ 13878 mutex_enter(&ill->ill_lock); 13879 if (!ipif_is_quiescent(ipif)) { 13880 mutex_exit(&ill->ill_lock); 13881 return (EINVAL); 13882 } 13883 mutex_exit(&ill->ill_lock); 13884 13885 /* 13886 * If we're going to need to allocate ipifs, do it prior 13887 * to starting the move (and grabbing locks). 13888 */ 13889 if (ipif->ipif_id == 0) { 13890 if ((moveipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE, 13891 B_FALSE, &err)) == NULL) { 13892 return (err); 13893 } 13894 if ((stubipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE, 13895 B_FALSE, &err)) == NULL) { 13896 mi_free(moveipif); 13897 return (err); 13898 } 13899 } 13900 13901 /* 13902 * Grab or transfer the ipif to move. During the move, keep 13903 * ill_g_lock held to prevent any ill walker threads from 13904 * seeing things in an inconsistent state. 13905 */ 13906 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 13907 if (ipif->ipif_id != 0) { 13908 ipif_remove(ipif); 13909 } else { 13910 ipif_transfer(ipif, moveipif, stubipif); 13911 ipif = moveipif; 13912 } 13913 13914 /* 13915 * Place the ipif on the IPMP ill. If the zeroth ipif on 13916 * the IPMP ill is a stub (0.0.0.0 down address) then we 13917 * replace that one. Otherwise, pick the next available slot. 
13918 */ 13919 ipif->ipif_ill = ipmp_ill; 13920 ipif_orig_id = ipif->ipif_id; 13921 13922 if (ipmp_ipif_is_stubaddr(ipmp_ill->ill_ipif)) { 13923 ipif_transfer(ipif, ipmp_ill->ill_ipif, NULL); 13924 ipif = ipmp_ill->ill_ipif; 13925 } else { 13926 ipif->ipif_id = -1; 13927 if ((err = ipif_insert(ipif, B_FALSE)) != 0) { 13928 /* 13929 * No more available ipif_id's -- put it back 13930 * on the original ill and fail the operation. 13931 * Since we're writer on the ill, we can be 13932 * sure our old slot is still available. 13933 */ 13934 ipif->ipif_id = ipif_orig_id; 13935 ipif->ipif_ill = ill; 13936 if (ipif_orig_id == 0) { 13937 ipif_transfer(ipif, ill->ill_ipif, 13938 NULL); 13939 } else { 13940 VERIFY(ipif_insert(ipif, B_FALSE) == 0); 13941 } 13942 rw_exit(&ipst->ips_ill_g_lock); 13943 return (err); 13944 } 13945 } 13946 rw_exit(&ipst->ips_ill_g_lock); 13947 13948 /* 13949 * Tell SCTP that the ipif has moved. Note that even if we 13950 * had to allocate a new ipif, the original sequence id was 13951 * preserved and therefore SCTP won't know. 13952 */ 13953 sctp_move_ipif(ipif, ill, ipmp_ill); 13954 13955 /* 13956 * If the ipif being brought up was on slot zero, then we 13957 * first need to bring up the placeholder we stuck there. In 13958 * ip_rput_dlpi_writer(), arp_bringup_done(), or the recursive 13959 * call to ipif_up() itself, if we successfully bring up the 13960 * placeholder, we'll check ill_move_ipif and bring it up too. 13961 */ 13962 if (ipif_orig_id == 0) { 13963 ASSERT(ill->ill_move_ipif == NULL); 13964 ill->ill_move_ipif = ipif; 13965 if ((err = ipif_up(ill->ill_ipif, q, mp)) == 0) 13966 ASSERT(ill->ill_move_ipif == NULL); 13967 if (err != EINPROGRESS) 13968 ill->ill_move_ipif = NULL; 13969 return (err); 13970 } 13971 13972 /* 13973 * Bring it up on the IPMP ill. 13974 */ 13975 return (ipif_up(ipif, q, mp)); 13976 } 13977 13978 /* Skip arp/ndp for any loopback interface. */ 13979 if (ill->ill_wq != NULL) { 13980 conn_t *connp = CONN_Q(q) ? 
Q_TO_CONN(q) : NULL; 13981 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 13982 13983 if (!ill->ill_dl_up) { 13984 /* 13985 * ill_dl_up is not yet set. i.e. we are yet to 13986 * DL_BIND with the driver and this is the first 13987 * logical interface on the ill to become "up". 13988 * Tell the driver to get going (via DL_BIND_REQ). 13989 * Note that changing "significant" IFF_ flags 13990 * address/netmask etc cause a down/up dance, but 13991 * does not cause an unbind (DL_UNBIND) with the driver 13992 */ 13993 return (ill_dl_up(ill, ipif, mp, q)); 13994 } 13995 13996 /* 13997 * ipif_resolver_up may end up needeing to bind/attach 13998 * the ARP stream, which in turn necessitates a 13999 * DLPI message exchange with the driver. ioctls are 14000 * serialized and so we cannot send more than one 14001 * interface up message at a time. If ipif_resolver_up 14002 * does need to wait for the DLPI handshake for the ARP stream, 14003 * we get EINPROGRESS and we will complete in arp_bringup_done. 14004 */ 14005 14006 ASSERT(connp != NULL || !CONN_Q(q)); 14007 if (connp != NULL) 14008 mutex_enter(&connp->conn_lock); 14009 mutex_enter(&ill->ill_lock); 14010 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0); 14011 mutex_exit(&ill->ill_lock); 14012 if (connp != NULL) 14013 mutex_exit(&connp->conn_lock); 14014 if (!success) 14015 return (EINTR); 14016 14017 /* 14018 * Crank up IPv6 neighbor discovery. Unlike ARP, this should 14019 * complete when ipif_ndp_up returns. 14020 */ 14021 err = ipif_resolver_up(ipif, Res_act_initial); 14022 if (err == EINPROGRESS) { 14023 /* We will complete it in arp_bringup_done() */ 14024 return (err); 14025 } 14026 14027 if (isv6 && err == 0) 14028 err = ipif_ndp_up(ipif, B_TRUE); 14029 14030 ASSERT(err != EINPROGRESS); 14031 mp = ipsq_pending_mp_get(ipsq, &connp); 14032 ASSERT(mp != NULL); 14033 if (err != 0) 14034 return (err); 14035 } else { 14036 /* 14037 * Interfaces without underlying hardware don't do duplicate 14038 * address detection. 
14039 */ 14040 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); 14041 ipif->ipif_addr_ready = 1; 14042 err = ill_add_ires(ill); 14043 /* allocation failure? */ 14044 if (err != 0) 14045 return (err); 14046 } 14047 14048 err = (isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif)); 14049 if (err == 0 && ill->ill_move_ipif != NULL) { 14050 ipif = ill->ill_move_ipif; 14051 ill->ill_move_ipif = NULL; 14052 return (ipif_up(ipif, q, mp)); 14053 } 14054 return (err); 14055 } 14056 14057 /* 14058 * Add any IREs tied to the ill. For now this is just an IRE_MULTICAST. 14059 * The identical set of IREs need to be removed in ill_delete_ires(). 14060 */ 14061 int 14062 ill_add_ires(ill_t *ill) 14063 { 14064 ire_t *ire; 14065 in6_addr_t dummy6 = {(uint32_t)V6_MCAST, 0, 0, 1}; 14066 in_addr_t dummy4 = htonl(INADDR_ALLHOSTS_GROUP); 14067 14068 if (ill->ill_ire_multicast != NULL) 14069 return (0); 14070 14071 /* 14072 * provide some dummy ire_addr for creating the ire. 14073 */ 14074 if (ill->ill_isv6) { 14075 ire = ire_create_v6(&dummy6, 0, 0, IRE_MULTICAST, ill, 14076 ALL_ZONES, RTF_UP, NULL, ill->ill_ipst); 14077 } else { 14078 ire = ire_create((uchar_t *)&dummy4, 0, 0, IRE_MULTICAST, ill, 14079 ALL_ZONES, RTF_UP, NULL, ill->ill_ipst); 14080 } 14081 if (ire == NULL) 14082 return (ENOMEM); 14083 14084 ill->ill_ire_multicast = ire; 14085 return (0); 14086 } 14087 14088 void 14089 ill_delete_ires(ill_t *ill) 14090 { 14091 if (ill->ill_ire_multicast != NULL) { 14092 /* 14093 * BIND/ATTACH completed; Release the ref for ill_ire_multicast 14094 * which was taken without any th_tracing enabled. 14095 * We also mark it as condemned (note that it was never added) 14096 * so that caching conn's can move off of it. 14097 */ 14098 ire_make_condemned(ill->ill_ire_multicast); 14099 ire_refrele_notr(ill->ill_ire_multicast); 14100 ill->ill_ire_multicast = NULL; 14101 } 14102 } 14103 14104 /* 14105 * Perform a bind for the physical device. 
14106 * When the routine returns EINPROGRESS then mp has been consumed and 14107 * the ioctl will be acked from ip_rput_dlpi. 14108 * Allocate an unbind message and save it until ipif_down. 14109 */ 14110 static int 14111 ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) 14112 { 14113 mblk_t *bind_mp = NULL; 14114 mblk_t *unbind_mp = NULL; 14115 conn_t *connp; 14116 boolean_t success; 14117 int err; 14118 14119 DTRACE_PROBE2(ill__downup, char *, "ill_dl_up", ill_t *, ill); 14120 14121 ip1dbg(("ill_dl_up(%s)\n", ill->ill_name)); 14122 ASSERT(IAM_WRITER_ILL(ill)); 14123 ASSERT(mp != NULL); 14124 14125 /* 14126 * Make sure we have an IRE_MULTICAST in case we immediately 14127 * start receiving packets. 14128 */ 14129 err = ill_add_ires(ill); 14130 if (err != 0) 14131 goto bad; 14132 14133 bind_mp = ip_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long), 14134 DL_BIND_REQ); 14135 if (bind_mp == NULL) 14136 goto bad; 14137 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ill->ill_sap; 14138 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS; 14139 14140 unbind_mp = ip_dlpi_alloc(sizeof (dl_unbind_req_t), DL_UNBIND_REQ); 14141 if (unbind_mp == NULL) 14142 goto bad; 14143 14144 /* 14145 * Record state needed to complete this operation when the 14146 * DL_BIND_ACK shows up. Also remember the pre-allocated mblks. 14147 */ 14148 connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL; 14149 ASSERT(connp != NULL || !CONN_Q(q)); 14150 GRAB_CONN_LOCK(q); 14151 mutex_enter(&ipif->ipif_ill->ill_lock); 14152 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0); 14153 mutex_exit(&ipif->ipif_ill->ill_lock); 14154 RELEASE_CONN_LOCK(q); 14155 if (!success) 14156 goto bad; 14157 14158 /* 14159 * Save the unbind message for ill_dl_down(); it will be consumed when 14160 * the interface goes down. 
14161 */ 14162 ASSERT(ill->ill_unbind_mp == NULL); 14163 ill->ill_unbind_mp = unbind_mp; 14164 14165 ill_dlpi_send(ill, bind_mp); 14166 /* Send down link-layer capabilities probe if not already done. */ 14167 ill_capability_probe(ill); 14168 14169 /* 14170 * Sysid used to rely on the fact that netboots set domainname 14171 * and the like. Now that miniroot boots aren't strictly netboots 14172 * and miniroot network configuration is driven from userland 14173 * these things still need to be set. This situation can be detected 14174 * by comparing the interface being configured here to the one 14175 * dhcifname was set to reference by the boot loader. Once sysid is 14176 * converted to use dhcp_ipc_getinfo() this call can go away. 14177 */ 14178 if ((ipif->ipif_flags & IPIF_DHCPRUNNING) && 14179 (strcmp(ill->ill_name, dhcifname) == 0) && 14180 (strlen(srpc_domain) == 0)) { 14181 if (dhcpinit() != 0) 14182 cmn_err(CE_WARN, "no cached dhcp response"); 14183 } 14184 14185 /* 14186 * This operation will complete in ip_rput_dlpi with either 14187 * a DL_BIND_ACK or DL_ERROR_ACK. 14188 */ 14189 return (EINPROGRESS); 14190 bad: 14191 ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name)); 14192 14193 freemsg(bind_mp); 14194 freemsg(unbind_mp); 14195 return (ENOMEM); 14196 } 14197 14198 /* Add room for tcp+ip headers */ 14199 uint_t ip_loopback_mtuplus = IP_LOOPBACK_MTU + IP_SIMPLE_HDR_LENGTH + 20; 14200 14201 /* 14202 * DLPI and ARP is up. 14203 * Create all the IREs associated with an interface. Bring up multicast. 14204 * Set the interface flag and finish other initialization 14205 * that potentially had to be deferred to after DL_BIND_ACK. 
14206 */ 14207 int 14208 ipif_up_done(ipif_t *ipif) 14209 { 14210 ill_t *ill = ipif->ipif_ill; 14211 int err = 0; 14212 boolean_t loopback = B_FALSE; 14213 boolean_t update_src_selection = B_TRUE; 14214 ipif_t *tmp_ipif; 14215 14216 ip1dbg(("ipif_up_done(%s:%u)\n", 14217 ipif->ipif_ill->ill_name, ipif->ipif_id)); 14218 DTRACE_PROBE3(ipif__downup, char *, "ipif_up_done", 14219 ill_t *, ill, ipif_t *, ipif); 14220 14221 /* Check if this is a loopback interface */ 14222 if (ipif->ipif_ill->ill_wq == NULL) 14223 loopback = B_TRUE; 14224 14225 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 14226 14227 /* 14228 * If all other interfaces for this ill are down or DEPRECATED, 14229 * or otherwise unsuitable for source address selection, 14230 * reset the src generation numbers to make sure source 14231 * address selection gets to take this new ipif into account. 14232 * No need to hold ill_lock while traversing the ipif list since 14233 * we are writer 14234 */ 14235 for (tmp_ipif = ill->ill_ipif; tmp_ipif; 14236 tmp_ipif = tmp_ipif->ipif_next) { 14237 if (((tmp_ipif->ipif_flags & 14238 (IPIF_NOXMIT|IPIF_ANYCAST|IPIF_NOLOCAL|IPIF_DEPRECATED)) || 14239 !(tmp_ipif->ipif_flags & IPIF_UP)) || 14240 (tmp_ipif == ipif)) 14241 continue; 14242 /* first useable pre-existing interface */ 14243 update_src_selection = B_FALSE; 14244 break; 14245 } 14246 if (update_src_selection) 14247 ip_update_source_selection(ill->ill_ipst); 14248 14249 if (IS_LOOPBACK(ill) || ill->ill_net_type == IRE_IF_NORESOLVER) { 14250 nce_t *loop_nce = NULL; 14251 uint16_t flags = (NCE_F_MYADDR | NCE_F_AUTHORITY | NCE_F_NONUD); 14252 14253 /* 14254 * lo0:1 and subsequent ipifs were marked IRE_LOCAL in 14255 * ipif_lookup_on_name(), but in the case of zones we can have 14256 * several loopback addresses on lo0. So all the interfaces with 14257 * loopback addresses need to be marked IRE_LOOPBACK. 
14258 */ 14259 if (V4_PART_OF_V6(ipif->ipif_v6lcl_addr) == 14260 htonl(INADDR_LOOPBACK)) 14261 ipif->ipif_ire_type = IRE_LOOPBACK; 14262 else 14263 ipif->ipif_ire_type = IRE_LOCAL; 14264 if (ill->ill_net_type != IRE_LOOPBACK) 14265 flags |= NCE_F_PUBLISH; 14266 14267 /* add unicast nce for the local addr */ 14268 err = nce_lookup_then_add_v4(ill, NULL, 14269 ill->ill_phys_addr_length, &ipif->ipif_lcl_addr, flags, 14270 ND_REACHABLE, &loop_nce); 14271 /* A shared-IP zone sees EEXIST for lo0:N */ 14272 if (err == 0 || err == EEXIST) { 14273 ipif->ipif_added_nce = 1; 14274 loop_nce->nce_ipif_cnt++; 14275 nce_refrele(loop_nce); 14276 err = 0; 14277 } else { 14278 ASSERT(loop_nce == NULL); 14279 return (err); 14280 } 14281 } 14282 14283 /* Create all the IREs associated with this interface */ 14284 err = ipif_add_ires_v4(ipif, loopback); 14285 if (err != 0) { 14286 /* 14287 * see comments about return value from 14288 * ip_addr_availability_check() in ipif_add_ires_v4(). 14289 */ 14290 if (err != EADDRINUSE) { 14291 (void) ipif_arp_down(ipif); 14292 } else { 14293 /* 14294 * Make IPMP aware of the deleted ipif so that 14295 * the needed ipmp cleanup (e.g., of ipif_bound_ill) 14296 * can be completed. Note that we do not want to 14297 * destroy the nce that was created on the ipmp_ill 14298 * for the active copy of the duplicate address in 14299 * use. 14300 */ 14301 if (IS_IPMP(ill)) 14302 ipmp_illgrp_del_ipif(ill->ill_grp, ipif); 14303 err = EADDRNOTAVAIL; 14304 } 14305 return (err); 14306 } 14307 14308 if (ill->ill_ipif_up_count == 1 && !loopback) { 14309 /* Recover any additional IREs entries for this ill */ 14310 (void) ill_recover_saved_ire(ill); 14311 } 14312 14313 if (ill->ill_need_recover_multicast) { 14314 /* 14315 * Need to recover all multicast memberships in the driver. 14316 * This had to be deferred until we had attached. The same 14317 * code exists in ipif_up_done_v6() to recover IPv6 14318 * memberships. 
14319 * 14320 * Note that it would be preferable to unconditionally do the 14321 * ill_recover_multicast() in ill_dl_up(), but we cannot do 14322 * that since ill_join_allmulti() depends on ill_dl_up being 14323 * set, and it is not set until we receive a DL_BIND_ACK after 14324 * having called ill_dl_up(). 14325 */ 14326 ill_recover_multicast(ill); 14327 } 14328 14329 if (ill->ill_ipif_up_count == 1) { 14330 /* 14331 * Since the interface is now up, it may now be active. 14332 */ 14333 if (IS_UNDER_IPMP(ill)) 14334 ipmp_ill_refresh_active(ill); 14335 14336 /* 14337 * If this is an IPMP interface, we may now be able to 14338 * establish ARP entries. 14339 */ 14340 if (IS_IPMP(ill)) 14341 ipmp_illgrp_refresh_arpent(ill->ill_grp); 14342 } 14343 14344 /* Join the allhosts multicast address */ 14345 ipif_multicast_up(ipif); 14346 14347 if (!loopback && !update_src_selection && 14348 !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) 14349 ip_update_source_selection(ill->ill_ipst); 14350 14351 if (!loopback && ipif->ipif_addr_ready) { 14352 /* Broadcast an address mask reply. */ 14353 ipif_mask_reply(ipif); 14354 } 14355 /* Perhaps ilgs should use this ill */ 14356 update_conn_ill(NULL, ill->ill_ipst); 14357 14358 /* 14359 * This had to be deferred until we had bound. Tell routing sockets and 14360 * others that this interface is up if it looks like the address has 14361 * been validated. Otherwise, if it isn't ready yet, wait for 14362 * duplicate address detection to do its thing. 14363 */ 14364 if (ipif->ipif_addr_ready) 14365 ipif_up_notify(ipif); 14366 return (0); 14367 } 14368 14369 /* 14370 * Add the IREs associated with the ipif. 14371 * Those MUST be explicitly removed in ipif_delete_ires_v4. 
14372 */ 14373 static int 14374 ipif_add_ires_v4(ipif_t *ipif, boolean_t loopback) 14375 { 14376 ill_t *ill = ipif->ipif_ill; 14377 ip_stack_t *ipst = ill->ill_ipst; 14378 ire_t *ire_array[20]; 14379 ire_t **irep = ire_array; 14380 ire_t **irep1; 14381 ipaddr_t net_mask = 0; 14382 ipaddr_t subnet_mask, route_mask; 14383 int err; 14384 ire_t *ire_local = NULL; /* LOCAL or LOOPBACK */ 14385 ire_t *ire_if = NULL; 14386 14387 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 14388 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 14389 /* 14390 * If we're on a labeled system then make sure that zone- 14391 * private addresses have proper remote host database entries. 14392 */ 14393 if (is_system_labeled() && 14394 ipif->ipif_ire_type != IRE_LOOPBACK && 14395 !tsol_check_interface_address(ipif)) 14396 return (EINVAL); 14397 14398 /* Register the source address for __sin6_src_id */ 14399 err = ip_srcid_insert(&ipif->ipif_v6lcl_addr, 14400 ipif->ipif_zoneid, ipst); 14401 if (err != 0) { 14402 ip0dbg(("ipif_add_ires: srcid_insert %d\n", err)); 14403 return (err); 14404 } 14405 14406 /* If the interface address is set, create the local IRE. */ 14407 ire_local = ire_create( 14408 (uchar_t *)&ipif->ipif_lcl_addr, /* dest address */ 14409 (uchar_t *)&ip_g_all_ones, /* mask */ 14410 NULL, /* no gateway */ 14411 ipif->ipif_ire_type, /* LOCAL or LOOPBACK */ 14412 ipif->ipif_ill, 14413 ipif->ipif_zoneid, 14414 ((ipif->ipif_flags & IPIF_PRIVATE) ? 
14415 RTF_PRIVATE : 0) | RTF_KERNEL, 14416 NULL, 14417 ipst); 14418 ip1dbg(("ipif_add_ires: 0x%p creating IRE %p type 0x%x" 14419 " for 0x%x\n", (void *)ipif, (void *)ire_local, 14420 ipif->ipif_ire_type, 14421 ntohl(ipif->ipif_lcl_addr))); 14422 if (ire_local == NULL) { 14423 ip1dbg(("ipif_up_done: NULL ire_local\n")); 14424 err = ENOMEM; 14425 goto bad; 14426 } 14427 } else { 14428 ip1dbg(( 14429 "ipif_add_ires: not creating IRE %d for 0x%x: flags 0x%x\n", 14430 ipif->ipif_ire_type, 14431 ntohl(ipif->ipif_lcl_addr), 14432 (uint_t)ipif->ipif_flags)); 14433 } 14434 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 14435 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 14436 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 14437 } else { 14438 net_mask = htonl(IN_CLASSA_NET); /* fallback */ 14439 } 14440 14441 subnet_mask = ipif->ipif_net_mask; 14442 14443 /* 14444 * If mask was not specified, use natural netmask of 14445 * interface address. Also, store this mask back into the 14446 * ipif struct. 14447 */ 14448 if (subnet_mask == 0) { 14449 subnet_mask = net_mask; 14450 V4MASK_TO_V6(subnet_mask, ipif->ipif_v6net_mask); 14451 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 14452 ipif->ipif_v6subnet); 14453 } 14454 14455 /* Set up the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. 
*/ 14456 if (!loopback && !(ipif->ipif_flags & IPIF_NOXMIT) && 14457 ipif->ipif_subnet != INADDR_ANY) { 14458 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 14459 14460 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 14461 route_mask = IP_HOST_MASK; 14462 } else { 14463 route_mask = subnet_mask; 14464 } 14465 14466 ip1dbg(("ipif_add_ires: ipif 0x%p ill 0x%p " 14467 "creating if IRE ill_net_type 0x%x for 0x%x\n", 14468 (void *)ipif, (void *)ill, ill->ill_net_type, 14469 ntohl(ipif->ipif_subnet))); 14470 ire_if = ire_create( 14471 (uchar_t *)&ipif->ipif_subnet, 14472 (uchar_t *)&route_mask, 14473 (uchar_t *)&ipif->ipif_lcl_addr, 14474 ill->ill_net_type, 14475 ill, 14476 ipif->ipif_zoneid, 14477 ((ipif->ipif_flags & IPIF_PRIVATE) ? 14478 RTF_PRIVATE: 0) | RTF_KERNEL, 14479 NULL, 14480 ipst); 14481 if (ire_if == NULL) { 14482 ip1dbg(("ipif_up_done: NULL ire_if\n")); 14483 err = ENOMEM; 14484 goto bad; 14485 } 14486 } 14487 14488 /* 14489 * Create any necessary broadcast IREs. 14490 */ 14491 if ((ipif->ipif_flags & IPIF_BROADCAST) && 14492 !(ipif->ipif_flags & IPIF_NOXMIT)) 14493 irep = ipif_create_bcast_ires(ipif, irep); 14494 14495 /* If an earlier ire_create failed, get out now */ 14496 for (irep1 = irep; irep1 > ire_array; ) { 14497 irep1--; 14498 if (*irep1 == NULL) { 14499 ip1dbg(("ipif_up_done: NULL ire found in ire_array\n")); 14500 err = ENOMEM; 14501 goto bad; 14502 } 14503 } 14504 14505 /* 14506 * Need to atomically check for IP address availability under 14507 * ip_addr_avail_lock. ill_g_lock is held as reader to ensure no new 14508 * ills or new ipifs can be added while we are checking availability. 14509 */ 14510 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 14511 mutex_enter(&ipst->ips_ip_addr_avail_lock); 14512 /* Mark it up, and increment counters. 
*/ 14513 ipif->ipif_flags |= IPIF_UP; 14514 ill->ill_ipif_up_count++; 14515 err = ip_addr_availability_check(ipif); 14516 mutex_exit(&ipst->ips_ip_addr_avail_lock); 14517 rw_exit(&ipst->ips_ill_g_lock); 14518 14519 if (err != 0) { 14520 /* 14521 * Our address may already be up on the same ill. In this case, 14522 * the ARP entry for our ipif replaced the one for the other 14523 * ipif. So we don't want to delete it (otherwise the other ipif 14524 * would be unable to send packets). 14525 * ip_addr_availability_check() identifies this case for us and 14526 * returns EADDRINUSE; Caller should turn it into EADDRNOTAVAIL 14527 * which is the expected error code. 14528 */ 14529 ill->ill_ipif_up_count--; 14530 ipif->ipif_flags &= ~IPIF_UP; 14531 goto bad; 14532 } 14533 14534 /* 14535 * Add in all newly created IREs. ire_create_bcast() has 14536 * already checked for duplicates of the IRE_BROADCAST type. 14537 * We add the IRE_INTERFACE before the IRE_LOCAL to ensure 14538 * that lookups find the IRE_LOCAL even if the IRE_INTERFACE is 14539 * a /32 route. 14540 */ 14541 if (ire_if != NULL) { 14542 ire_if = ire_add(ire_if); 14543 if (ire_if == NULL) { 14544 err = ENOMEM; 14545 goto bad2; 14546 } 14547 #ifdef DEBUG 14548 ire_refhold_notr(ire_if); 14549 ire_refrele(ire_if); 14550 #endif 14551 } 14552 if (ire_local != NULL) { 14553 ire_local = ire_add(ire_local); 14554 if (ire_local == NULL) { 14555 err = ENOMEM; 14556 goto bad2; 14557 } 14558 #ifdef DEBUG 14559 ire_refhold_notr(ire_local); 14560 ire_refrele(ire_local); 14561 #endif 14562 } 14563 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 14564 if (ire_local != NULL) 14565 ipif->ipif_ire_local = ire_local; 14566 if (ire_if != NULL) 14567 ipif->ipif_ire_if = ire_if; 14568 rw_exit(&ipst->ips_ill_g_lock); 14569 ire_local = NULL; 14570 ire_if = NULL; 14571 14572 /* 14573 * We first add all of them, and if that succeeds we refrele the 14574 * bunch. That enables us to delete all of them should any of the 14575 * ire_adds fail. 
14576 */ 14577 for (irep1 = irep; irep1 > ire_array; ) { 14578 irep1--; 14579 ASSERT(!MUTEX_HELD(&((*irep1)->ire_ill->ill_lock))); 14580 *irep1 = ire_add(*irep1); 14581 if (*irep1 == NULL) { 14582 err = ENOMEM; 14583 goto bad2; 14584 } 14585 } 14586 14587 for (irep1 = irep; irep1 > ire_array; ) { 14588 irep1--; 14589 /* refheld by ire_add. */ 14590 if (*irep1 != NULL) { 14591 ire_refrele(*irep1); 14592 *irep1 = NULL; 14593 } 14594 } 14595 14596 if (!loopback) { 14597 /* 14598 * If the broadcast address has been set, make sure it makes 14599 * sense based on the interface address. 14600 * Only match on ill since we are sharing broadcast addresses. 14601 */ 14602 if ((ipif->ipif_brd_addr != INADDR_ANY) && 14603 (ipif->ipif_flags & IPIF_BROADCAST)) { 14604 ire_t *ire; 14605 14606 ire = ire_ftable_lookup_v4(ipif->ipif_brd_addr, 0, 0, 14607 IRE_BROADCAST, ipif->ipif_ill, ALL_ZONES, NULL, 14608 (MATCH_IRE_TYPE | MATCH_IRE_ILL), 0, ipst, NULL); 14609 14610 if (ire == NULL) { 14611 /* 14612 * If there isn't a matching broadcast IRE, 14613 * revert to the default for this netmask. 
14614 */ 14615 ipif->ipif_v6brd_addr = ipv6_all_zeros; 14616 mutex_enter(&ipif->ipif_ill->ill_lock); 14617 ipif_set_default(ipif); 14618 mutex_exit(&ipif->ipif_ill->ill_lock); 14619 } else { 14620 ire_refrele(ire); 14621 } 14622 } 14623 14624 } 14625 return (0); 14626 14627 bad2: 14628 ill->ill_ipif_up_count--; 14629 ipif->ipif_flags &= ~IPIF_UP; 14630 14631 bad: 14632 ip1dbg(("ipif_add_ires: FAILED \n")); 14633 if (ire_local != NULL) 14634 ire_delete(ire_local); 14635 if (ire_if != NULL) 14636 ire_delete(ire_if); 14637 14638 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 14639 ire_local = ipif->ipif_ire_local; 14640 ipif->ipif_ire_local = NULL; 14641 ire_if = ipif->ipif_ire_if; 14642 ipif->ipif_ire_if = NULL; 14643 rw_exit(&ipst->ips_ill_g_lock); 14644 if (ire_local != NULL) { 14645 ire_delete(ire_local); 14646 ire_refrele_notr(ire_local); 14647 } 14648 if (ire_if != NULL) { 14649 ire_delete(ire_if); 14650 ire_refrele_notr(ire_if); 14651 } 14652 14653 while (irep > ire_array) { 14654 irep--; 14655 if (*irep != NULL) { 14656 ire_delete(*irep); 14657 } 14658 } 14659 (void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst); 14660 14661 return (err); 14662 } 14663 14664 /* Remove all the IREs created by ipif_add_ires_v4 */ 14665 void 14666 ipif_delete_ires_v4(ipif_t *ipif) 14667 { 14668 ill_t *ill = ipif->ipif_ill; 14669 ip_stack_t *ipst = ill->ill_ipst; 14670 ire_t *ire; 14671 14672 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 14673 ire = ipif->ipif_ire_local; 14674 ipif->ipif_ire_local = NULL; 14675 rw_exit(&ipst->ips_ill_g_lock); 14676 if (ire != NULL) { 14677 /* 14678 * Move count to ipif so we don't loose the count due to 14679 * a down/up dance. 
14680 */ 14681 atomic_add_32(&ipif->ipif_ib_pkt_count, ire->ire_ib_pkt_count); 14682 14683 ire_delete(ire); 14684 ire_refrele_notr(ire); 14685 } 14686 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 14687 ire = ipif->ipif_ire_if; 14688 ipif->ipif_ire_if = NULL; 14689 rw_exit(&ipst->ips_ill_g_lock); 14690 if (ire != NULL) { 14691 ire_delete(ire); 14692 ire_refrele_notr(ire); 14693 } 14694 14695 /* 14696 * Delete the broadcast IREs. 14697 */ 14698 if ((ipif->ipif_flags & IPIF_BROADCAST) && 14699 !(ipif->ipif_flags & IPIF_NOXMIT)) 14700 ipif_delete_bcast_ires(ipif); 14701 } 14702 14703 /* 14704 * Checks for availbility of a usable source address (if there is one) when the 14705 * destination ILL has the ill_usesrc_ifindex pointing to another ILL. Note 14706 * this selection is done regardless of the destination. 14707 */ 14708 boolean_t 14709 ipif_zone_avail(uint_t ifindex, boolean_t isv6, zoneid_t zoneid, 14710 ip_stack_t *ipst) 14711 { 14712 ipif_t *ipif = NULL; 14713 ill_t *uill; 14714 14715 ASSERT(ifindex != 0); 14716 14717 uill = ill_lookup_on_ifindex(ifindex, isv6, ipst); 14718 if (uill == NULL) 14719 return (B_FALSE); 14720 14721 mutex_enter(&uill->ill_lock); 14722 for (ipif = uill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 14723 if (IPIF_IS_CONDEMNED(ipif)) 14724 continue; 14725 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 14726 continue; 14727 if (!(ipif->ipif_flags & IPIF_UP)) 14728 continue; 14729 if (ipif->ipif_zoneid != zoneid) 14730 continue; 14731 if (isv6 ? IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) : 14732 ipif->ipif_lcl_addr == INADDR_ANY) 14733 continue; 14734 mutex_exit(&uill->ill_lock); 14735 ill_refrele(uill); 14736 return (B_TRUE); 14737 } 14738 mutex_exit(&uill->ill_lock); 14739 ill_refrele(uill); 14740 return (B_FALSE); 14741 } 14742 14743 /* 14744 * Find an ipif with a good local address on the ill+zoneid. 
14745 */ 14746 ipif_t * 14747 ipif_good_addr(ill_t *ill, zoneid_t zoneid) 14748 { 14749 ipif_t *ipif; 14750 14751 mutex_enter(&ill->ill_lock); 14752 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 14753 if (IPIF_IS_CONDEMNED(ipif)) 14754 continue; 14755 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 14756 continue; 14757 if (!(ipif->ipif_flags & IPIF_UP)) 14758 continue; 14759 if (ipif->ipif_zoneid != zoneid && 14760 ipif->ipif_zoneid != ALL_ZONES && zoneid != ALL_ZONES) 14761 continue; 14762 if (ill->ill_isv6 ? 14763 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) : 14764 ipif->ipif_lcl_addr == INADDR_ANY) 14765 continue; 14766 ipif_refhold_locked(ipif); 14767 mutex_exit(&ill->ill_lock); 14768 return (ipif); 14769 } 14770 mutex_exit(&ill->ill_lock); 14771 return (NULL); 14772 } 14773 14774 /* 14775 * IP source address type, sorted from worst to best. For a given type, 14776 * always prefer IP addresses on the same subnet. All-zones addresses are 14777 * suboptimal because they pose problems with unlabeled destinations. 14778 */ 14779 typedef enum { 14780 IPIF_NONE, 14781 IPIF_DIFFNET_DEPRECATED, /* deprecated and different subnet */ 14782 IPIF_SAMENET_DEPRECATED, /* deprecated and same subnet */ 14783 IPIF_DIFFNET_ALLZONES, /* allzones and different subnet */ 14784 IPIF_SAMENET_ALLZONES, /* allzones and same subnet */ 14785 IPIF_DIFFNET, /* normal and different subnet */ 14786 IPIF_SAMENET, /* normal and same subnet */ 14787 IPIF_LOCALADDR /* local loopback */ 14788 } ipif_type_t; 14789 14790 /* 14791 * Pick the optimal ipif on `ill' for sending to destination `dst' from zone 14792 * `zoneid'. We rate usable ipifs from low -> high as per the ipif_type_t 14793 * enumeration, and return the highest-rated ipif. If there's a tie, we pick 14794 * the first one, unless IPMP is used in which case we round-robin among them; 14795 * see below for more. 14796 * 14797 * Returns NULL if there is no suitable source address for the ill. 
14798 * This only occurs when there is no valid source address for the ill. 14799 */ 14800 ipif_t * 14801 ipif_select_source_v4(ill_t *ill, ipaddr_t dst, zoneid_t zoneid, 14802 boolean_t allow_usesrc, boolean_t *notreadyp) 14803 { 14804 ill_t *usill = NULL; 14805 ill_t *ipmp_ill = NULL; 14806 ipif_t *start_ipif, *next_ipif, *ipif, *best_ipif; 14807 ipif_type_t type, best_type; 14808 tsol_tpc_t *src_rhtp, *dst_rhtp; 14809 ip_stack_t *ipst = ill->ill_ipst; 14810 boolean_t samenet; 14811 14812 if (ill->ill_usesrc_ifindex != 0 && allow_usesrc) { 14813 usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex, 14814 B_FALSE, ipst); 14815 if (usill != NULL) 14816 ill = usill; /* Select source from usesrc ILL */ 14817 else 14818 return (NULL); 14819 } 14820 14821 /* 14822 * Test addresses should never be used for source address selection, 14823 * so if we were passed one, switch to the IPMP meta-interface. 14824 */ 14825 if (IS_UNDER_IPMP(ill)) { 14826 if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) 14827 ill = ipmp_ill; /* Select source from IPMP ill */ 14828 else 14829 return (NULL); 14830 } 14831 14832 /* 14833 * If we're dealing with an unlabeled destination on a labeled system, 14834 * make sure that we ignore source addresses that are incompatible with 14835 * the destination's default label. That destination's default label 14836 * must dominate the minimum label on the source address. 14837 */ 14838 dst_rhtp = NULL; 14839 if (is_system_labeled()) { 14840 dst_rhtp = find_tpc(&dst, IPV4_VERSION, B_FALSE); 14841 if (dst_rhtp == NULL) 14842 return (NULL); 14843 if (dst_rhtp->tpc_tp.host_type != UNLABELED) { 14844 TPC_RELE(dst_rhtp); 14845 dst_rhtp = NULL; 14846 } 14847 } 14848 14849 /* 14850 * Hold the ill_g_lock as reader. This makes sure that no ipif/ill 14851 * can be deleted. But an ipif/ill can get CONDEMNED any time. 14852 * After selecting the right ipif, under ill_lock make sure ipif is 14853 * not condemned, and increment refcnt. 
If ipif is CONDEMNED, 14854 * we retry. Inside the loop we still need to check for CONDEMNED, 14855 * but not under a lock. 14856 */ 14857 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 14858 retry: 14859 /* 14860 * For source address selection, we treat the ipif list as circular 14861 * and continue until we get back to where we started. This allows 14862 * IPMP to vary source address selection (which improves inbound load 14863 * spreading) by caching its last ending point and starting from 14864 * there. NOTE: we don't have to worry about ill_src_ipif changing 14865 * ills since that can't happen on the IPMP ill. 14866 */ 14867 start_ipif = ill->ill_ipif; 14868 if (IS_IPMP(ill) && ill->ill_src_ipif != NULL) 14869 start_ipif = ill->ill_src_ipif; 14870 14871 ipif = start_ipif; 14872 best_ipif = NULL; 14873 best_type = IPIF_NONE; 14874 do { 14875 if ((next_ipif = ipif->ipif_next) == NULL) 14876 next_ipif = ill->ill_ipif; 14877 14878 if (IPIF_IS_CONDEMNED(ipif)) 14879 continue; 14880 /* Always skip NOLOCAL and ANYCAST interfaces */ 14881 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 14882 continue; 14883 /* Always skip NOACCEPT interfaces */ 14884 if (ipif->ipif_ill->ill_flags & ILLF_NOACCEPT) 14885 continue; 14886 if (!(ipif->ipif_flags & IPIF_UP)) 14887 continue; 14888 14889 if (!ipif->ipif_addr_ready) { 14890 if (notreadyp != NULL) 14891 *notreadyp = B_TRUE; 14892 continue; 14893 } 14894 14895 if (zoneid != ALL_ZONES && 14896 ipif->ipif_zoneid != zoneid && 14897 ipif->ipif_zoneid != ALL_ZONES) 14898 continue; 14899 14900 /* 14901 * Interfaces with 0.0.0.0 address are allowed to be UP, but 14902 * are not valid as source addresses. 14903 */ 14904 if (ipif->ipif_lcl_addr == INADDR_ANY) 14905 continue; 14906 14907 /* 14908 * Check compatibility of local address for destination's 14909 * default label if we're on a labeled system. Incompatible 14910 * addresses can't be used at all. 
14911 */ 14912 if (dst_rhtp != NULL) { 14913 boolean_t incompat; 14914 14915 src_rhtp = find_tpc(&ipif->ipif_lcl_addr, 14916 IPV4_VERSION, B_FALSE); 14917 if (src_rhtp == NULL) 14918 continue; 14919 incompat = src_rhtp->tpc_tp.host_type != SUN_CIPSO || 14920 src_rhtp->tpc_tp.tp_doi != 14921 dst_rhtp->tpc_tp.tp_doi || 14922 (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label, 14923 &src_rhtp->tpc_tp.tp_sl_range_cipso) && 14924 !blinlset(&dst_rhtp->tpc_tp.tp_def_label, 14925 src_rhtp->tpc_tp.tp_sl_set_cipso)); 14926 TPC_RELE(src_rhtp); 14927 if (incompat) 14928 continue; 14929 } 14930 14931 samenet = ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet); 14932 14933 if (ipif->ipif_lcl_addr == dst) { 14934 type = IPIF_LOCALADDR; 14935 } else if (ipif->ipif_flags & IPIF_DEPRECATED) { 14936 type = samenet ? IPIF_SAMENET_DEPRECATED : 14937 IPIF_DIFFNET_DEPRECATED; 14938 } else if (ipif->ipif_zoneid == ALL_ZONES) { 14939 type = samenet ? IPIF_SAMENET_ALLZONES : 14940 IPIF_DIFFNET_ALLZONES; 14941 } else { 14942 type = samenet ? IPIF_SAMENET : IPIF_DIFFNET; 14943 } 14944 14945 if (type > best_type) { 14946 best_type = type; 14947 best_ipif = ipif; 14948 if (best_type == IPIF_LOCALADDR) 14949 break; /* can't get better */ 14950 } 14951 } while ((ipif = next_ipif) != start_ipif); 14952 14953 if ((ipif = best_ipif) != NULL) { 14954 mutex_enter(&ipif->ipif_ill->ill_lock); 14955 if (IPIF_IS_CONDEMNED(ipif)) { 14956 mutex_exit(&ipif->ipif_ill->ill_lock); 14957 goto retry; 14958 } 14959 ipif_refhold_locked(ipif); 14960 14961 /* 14962 * For IPMP, update the source ipif rotor to the next ipif, 14963 * provided we can look it up. (We must not use it if it's 14964 * IPIF_CONDEMNED since we may have grabbed ill_g_lock after 14965 * ipif_free() checked ill_src_ipif.) 
14966 */ 14967 if (IS_IPMP(ill) && ipif != NULL) { 14968 next_ipif = ipif->ipif_next; 14969 if (next_ipif != NULL && !IPIF_IS_CONDEMNED(next_ipif)) 14970 ill->ill_src_ipif = next_ipif; 14971 else 14972 ill->ill_src_ipif = NULL; 14973 } 14974 mutex_exit(&ipif->ipif_ill->ill_lock); 14975 } 14976 14977 rw_exit(&ipst->ips_ill_g_lock); 14978 if (usill != NULL) 14979 ill_refrele(usill); 14980 if (ipmp_ill != NULL) 14981 ill_refrele(ipmp_ill); 14982 if (dst_rhtp != NULL) 14983 TPC_RELE(dst_rhtp); 14984 14985 #ifdef DEBUG 14986 if (ipif == NULL) { 14987 char buf1[INET6_ADDRSTRLEN]; 14988 14989 ip1dbg(("ipif_select_source_v4(%s, %s) -> NULL\n", 14990 ill->ill_name, 14991 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)))); 14992 } else { 14993 char buf1[INET6_ADDRSTRLEN]; 14994 char buf2[INET6_ADDRSTRLEN]; 14995 14996 ip1dbg(("ipif_select_source_v4(%s, %s) -> %s\n", 14997 ipif->ipif_ill->ill_name, 14998 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)), 14999 inet_ntop(AF_INET, &ipif->ipif_lcl_addr, 15000 buf2, sizeof (buf2)))); 15001 } 15002 #endif /* DEBUG */ 15003 return (ipif); 15004 } 15005 15006 /* 15007 * Pick a source address based on the destination ill and an optional setsrc 15008 * address. 15009 * The result is stored in srcp. If generation is set, then put the source 15010 * generation number there before we look for the source address (to avoid 15011 * missing changes in the set of source addresses. 15012 * If flagsp is set, then us it to pass back ipif_flags. 15013 * 15014 * If the caller wants to cache the returned source address and detect when 15015 * that might be stale, the caller should pass in a generation argument, 15016 * which the caller can later compare against ips_src_generation 15017 * 15018 * The precedence order for selecting an IPv4 source address is: 15019 * - RTF_SETSRC on the offlink ire always wins. 15020 * - If usrsrc is set, swap the ill to be the usesrc one. 
15021 * - If IPMP is used on the ill, select a random address from the most 15022 * preferred ones below: 15023 * 1. If onlink destination, same subnet and not deprecated, not ALL_ZONES 15024 * 2. Not deprecated, not ALL_ZONES 15025 * 3. If onlink destination, same subnet and not deprecated, ALL_ZONES 15026 * 4. Not deprecated, ALL_ZONES 15027 * 5. If onlink destination, same subnet and deprecated 15028 * 6. Deprecated. 15029 * 15030 * We have lower preference for ALL_ZONES IP addresses, 15031 * as they pose problems with unlabeled destinations. 15032 * 15033 * Note that when multiple IP addresses match e.g., #1 we pick 15034 * the first one if IPMP is not in use. With IPMP we randomize. 15035 */ 15036 int 15037 ip_select_source_v4(ill_t *ill, ipaddr_t setsrc, ipaddr_t dst, 15038 ipaddr_t multicast_ifaddr, 15039 zoneid_t zoneid, ip_stack_t *ipst, ipaddr_t *srcp, 15040 uint32_t *generation, uint64_t *flagsp) 15041 { 15042 ipif_t *ipif; 15043 boolean_t notready = B_FALSE; /* Set if !ipif_addr_ready found */ 15044 15045 if (flagsp != NULL) 15046 *flagsp = 0; 15047 15048 /* 15049 * Need to grab the generation number before we check to 15050 * avoid a race with a change to the set of local addresses. 15051 * No lock needed since the thread which updates the set of local 15052 * addresses use ipif/ill locks and exit those (hence a store memory 15053 * barrier) before doing the atomic increase of ips_src_generation. 15054 */ 15055 if (generation != NULL) { 15056 *generation = ipst->ips_src_generation; 15057 } 15058 15059 if (CLASSD(dst) && multicast_ifaddr != INADDR_ANY) { 15060 *srcp = multicast_ifaddr; 15061 return (0); 15062 } 15063 15064 /* Was RTF_SETSRC set on the first IRE in the recursive lookup? 
*/ 15065 if (setsrc != INADDR_ANY) { 15066 *srcp = setsrc; 15067 return (0); 15068 } 15069 ipif = ipif_select_source_v4(ill, dst, zoneid, B_TRUE, ¬ready); 15070 if (ipif == NULL) { 15071 if (notready) 15072 return (ENETDOWN); 15073 else 15074 return (EADDRNOTAVAIL); 15075 } 15076 *srcp = ipif->ipif_lcl_addr; 15077 if (flagsp != NULL) 15078 *flagsp = ipif->ipif_flags; 15079 ipif_refrele(ipif); 15080 return (0); 15081 } 15082 15083 /* ARGSUSED */ 15084 int 15085 if_unitsel_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 15086 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 15087 { 15088 /* 15089 * ill_phyint_reinit merged the v4 and v6 into a single 15090 * ipsq. We might not have been able to complete the 15091 * operation in ipif_set_values, if we could not become 15092 * exclusive. If so restart it here. 15093 */ 15094 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q)); 15095 } 15096 15097 /* 15098 * Can operate on either a module or a driver queue. 15099 * Returns an error if not a module queue. 15100 */ 15101 /* ARGSUSED */ 15102 int 15103 if_unitsel(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 15104 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 15105 { 15106 queue_t *q1 = q; 15107 char *cp; 15108 char interf_name[LIFNAMSIZ]; 15109 uint_t ppa = *(uint_t *)mp->b_cont->b_cont->b_rptr; 15110 15111 if (q->q_next == NULL) { 15112 ip1dbg(( 15113 "if_unitsel: IF_UNITSEL: no q_next\n")); 15114 return (EINVAL); 15115 } 15116 15117 if (((ill_t *)(q->q_ptr))->ill_name[0] != '\0') 15118 return (EALREADY); 15119 15120 do { 15121 q1 = q1->q_next; 15122 } while (q1->q_next); 15123 cp = q1->q_qinfo->qi_minfo->mi_idname; 15124 (void) sprintf(interf_name, "%s%d", cp, ppa); 15125 15126 /* 15127 * Here we are not going to delay the ioack until after 15128 * ACKs from DL_ATTACH_REQ/DL_BIND_REQ. So no need to save the 15129 * original ioctl message before sending the requests. 
15130 */ 15131 return (ipif_set_values(q, mp, interf_name, &ppa)); 15132 } 15133 15134 /* ARGSUSED */ 15135 int 15136 ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 15137 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 15138 { 15139 return (ENXIO); 15140 } 15141 15142 /* 15143 * Create any IRE_BROADCAST entries for `ipif', and store those entries in 15144 * `irep'. Returns a pointer to the next free `irep' entry 15145 * A mirror exists in ipif_delete_bcast_ires(). 15146 * 15147 * The management of any "extra" or seemingly duplicate IRE_BROADCASTs is 15148 * done in ire_add. 15149 */ 15150 static ire_t ** 15151 ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep) 15152 { 15153 ipaddr_t addr; 15154 ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr); 15155 ipaddr_t subnetmask = ipif->ipif_net_mask; 15156 ill_t *ill = ipif->ipif_ill; 15157 zoneid_t zoneid = ipif->ipif_zoneid; 15158 15159 ip1dbg(("ipif_create_bcast_ires: creating broadcast IREs\n")); 15160 15161 ASSERT(ipif->ipif_flags & IPIF_BROADCAST); 15162 ASSERT(!(ipif->ipif_flags & IPIF_NOXMIT)); 15163 15164 if (ipif->ipif_lcl_addr == INADDR_ANY || 15165 (ipif->ipif_flags & IPIF_NOLOCAL)) 15166 netmask = htonl(IN_CLASSA_NET); /* fallback */ 15167 15168 irep = ire_create_bcast(ill, 0, zoneid, irep); 15169 irep = ire_create_bcast(ill, INADDR_BROADCAST, zoneid, irep); 15170 15171 /* 15172 * For backward compatibility, we create net broadcast IREs based on 15173 * the old "IP address class system", since some old machines only 15174 * respond to these class derived net broadcast. However, we must not 15175 * create these net broadcast IREs if the subnetmask is shorter than 15176 * the IP address class based derived netmask. Otherwise, we may 15177 * create a net broadcast address which is the same as an IP address 15178 * on the subnet -- and then TCP will refuse to talk to that address. 
15179 */ 15180 if (netmask < subnetmask) { 15181 addr = netmask & ipif->ipif_subnet; 15182 irep = ire_create_bcast(ill, addr, zoneid, irep); 15183 irep = ire_create_bcast(ill, ~netmask | addr, zoneid, irep); 15184 } 15185 15186 /* 15187 * Don't create IRE_BROADCAST IREs for the interface if the subnetmask 15188 * is 0xFFFFFFFF, as an IRE_LOCAL for that interface is already 15189 * created. Creating these broadcast IREs will only create confusion 15190 * as `addr' will be the same as the IP address. 15191 */ 15192 if (subnetmask != 0xFFFFFFFF) { 15193 addr = ipif->ipif_subnet; 15194 irep = ire_create_bcast(ill, addr, zoneid, irep); 15195 irep = ire_create_bcast(ill, ~subnetmask | addr, zoneid, irep); 15196 } 15197 15198 return (irep); 15199 } 15200 15201 /* 15202 * Mirror of ipif_create_bcast_ires() 15203 */ 15204 static void 15205 ipif_delete_bcast_ires(ipif_t *ipif) 15206 { 15207 ipaddr_t addr; 15208 ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr); 15209 ipaddr_t subnetmask = ipif->ipif_net_mask; 15210 ill_t *ill = ipif->ipif_ill; 15211 zoneid_t zoneid = ipif->ipif_zoneid; 15212 ire_t *ire; 15213 15214 ASSERT(ipif->ipif_flags & IPIF_BROADCAST); 15215 ASSERT(!(ipif->ipif_flags & IPIF_NOXMIT)); 15216 15217 if (ipif->ipif_lcl_addr == INADDR_ANY || 15218 (ipif->ipif_flags & IPIF_NOLOCAL)) 15219 netmask = htonl(IN_CLASSA_NET); /* fallback */ 15220 15221 ire = ire_lookup_bcast(ill, 0, zoneid); 15222 ASSERT(ire != NULL); 15223 ire_delete(ire); ire_refrele(ire); 15224 ire = ire_lookup_bcast(ill, INADDR_BROADCAST, zoneid); 15225 ASSERT(ire != NULL); 15226 ire_delete(ire); ire_refrele(ire); 15227 15228 /* 15229 * For backward compatibility, we create net broadcast IREs based on 15230 * the old "IP address class system", since some old machines only 15231 * respond to these class derived net broadcast. However, we must not 15232 * create these net broadcast IREs if the subnetmask is shorter than 15233 * the IP address class based derived netmask. 
Otherwise, we may 15234 * create a net broadcast address which is the same as an IP address 15235 * on the subnet -- and then TCP will refuse to talk to that address. 15236 */ 15237 if (netmask < subnetmask) { 15238 addr = netmask & ipif->ipif_subnet; 15239 ire = ire_lookup_bcast(ill, addr, zoneid); 15240 ASSERT(ire != NULL); 15241 ire_delete(ire); ire_refrele(ire); 15242 ire = ire_lookup_bcast(ill, ~netmask | addr, zoneid); 15243 ASSERT(ire != NULL); 15244 ire_delete(ire); ire_refrele(ire); 15245 } 15246 15247 /* 15248 * Don't create IRE_BROADCAST IREs for the interface if the subnetmask 15249 * is 0xFFFFFFFF, as an IRE_LOCAL for that interface is already 15250 * created. Creating these broadcast IREs will only create confusion 15251 * as `addr' will be the same as the IP address. 15252 */ 15253 if (subnetmask != 0xFFFFFFFF) { 15254 addr = ipif->ipif_subnet; 15255 ire = ire_lookup_bcast(ill, addr, zoneid); 15256 ASSERT(ire != NULL); 15257 ire_delete(ire); ire_refrele(ire); 15258 ire = ire_lookup_bcast(ill, ~subnetmask | addr, zoneid); 15259 ASSERT(ire != NULL); 15260 ire_delete(ire); ire_refrele(ire); 15261 } 15262 } 15263 15264 /* 15265 * Extract both the flags (including IFF_CANTCHANGE) such as IFF_IPV* 15266 * from lifr_flags and the name from lifr_name. 15267 * Set IFF_IPV* and ill_isv6 prior to doing the lookup 15268 * since ipif_lookup_on_name uses the _isv6 flags when matching. 15269 * Returns EINPROGRESS when mp has been consumed by queueing it on 15270 * ipx_pending_mp and the ioctl will complete in ip_rput. 15271 * 15272 * Can operate on either a module or a driver queue. 15273 * Returns an error if not a module queue. 
15274 */ 15275 /* ARGSUSED */ 15276 int 15277 ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15278 ip_ioctl_cmd_t *ipip, void *if_req) 15279 { 15280 ill_t *ill = q->q_ptr; 15281 phyint_t *phyi; 15282 ip_stack_t *ipst; 15283 struct lifreq *lifr = if_req; 15284 uint64_t new_flags; 15285 15286 ASSERT(ipif != NULL); 15287 ip1dbg(("ip_sioctl_slifname %s\n", lifr->lifr_name)); 15288 15289 if (q->q_next == NULL) { 15290 ip1dbg(("if_sioctl_slifname: SIOCSLIFNAME: no q_next\n")); 15291 return (EINVAL); 15292 } 15293 15294 /* 15295 * If we are not writer on 'q' then this interface exists already 15296 * and previous lookups (ip_extract_lifreq()) found this ipif -- 15297 * so return EALREADY. 15298 */ 15299 if (ill != ipif->ipif_ill) 15300 return (EALREADY); 15301 15302 if (ill->ill_name[0] != '\0') 15303 return (EALREADY); 15304 15305 /* 15306 * If there's another ill already with the requested name, ensure 15307 * that it's of the same type. Otherwise, ill_phyint_reinit() will 15308 * fuse together two unrelated ills, which will cause chaos. 15309 */ 15310 ipst = ill->ill_ipst; 15311 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 15312 lifr->lifr_name, NULL); 15313 if (phyi != NULL) { 15314 ill_t *ill_mate = phyi->phyint_illv4; 15315 15316 if (ill_mate == NULL) 15317 ill_mate = phyi->phyint_illv6; 15318 ASSERT(ill_mate != NULL); 15319 15320 if (ill_mate->ill_media->ip_m_mac_type != 15321 ill->ill_media->ip_m_mac_type) { 15322 ip1dbg(("if_sioctl_slifname: SIOCSLIFNAME: attempt to " 15323 "use the same ill name on differing media\n")); 15324 return (EINVAL); 15325 } 15326 } 15327 15328 /* 15329 * We start off as IFF_IPV4 in ipif_allocate and become 15330 * IFF_IPV4 or IFF_IPV6 here depending on lifr_flags value. 15331 * The only flags that we read from user space are IFF_IPV4, 15332 * IFF_IPV6, and IFF_BROADCAST. 15333 * 15334 * This ill has not been inserted into the global list. 
15335 * So we are still single threaded and don't need any lock 15336 * 15337 * Saniy check the flags. 15338 */ 15339 15340 if ((lifr->lifr_flags & IFF_BROADCAST) && 15341 ((lifr->lifr_flags & IFF_IPV6) || 15342 (!ill->ill_needs_attach && ill->ill_bcast_addr_length == 0))) { 15343 ip1dbg(("ip_sioctl_slifname: link not broadcast capable " 15344 "or IPv6 i.e., no broadcast \n")); 15345 return (EINVAL); 15346 } 15347 15348 new_flags = 15349 lifr->lifr_flags & (IFF_IPV6|IFF_IPV4|IFF_BROADCAST); 15350 15351 if ((new_flags ^ (IFF_IPV6|IFF_IPV4)) == 0) { 15352 ip1dbg(("ip_sioctl_slifname: flags must be exactly one of " 15353 "IFF_IPV4 or IFF_IPV6\n")); 15354 return (EINVAL); 15355 } 15356 15357 /* 15358 * We always start off as IPv4, so only need to check for IPv6. 15359 */ 15360 if ((new_flags & IFF_IPV6) != 0) { 15361 ill->ill_flags |= ILLF_IPV6; 15362 ill->ill_flags &= ~ILLF_IPV4; 15363 } 15364 15365 if ((new_flags & IFF_BROADCAST) != 0) 15366 ipif->ipif_flags |= IPIF_BROADCAST; 15367 else 15368 ipif->ipif_flags &= ~IPIF_BROADCAST; 15369 15370 /* We started off as V4. */ 15371 if (ill->ill_flags & ILLF_IPV6) { 15372 ill->ill_phyint->phyint_illv6 = ill; 15373 ill->ill_phyint->phyint_illv4 = NULL; 15374 } 15375 15376 return (ipif_set_values(q, mp, lifr->lifr_name, &lifr->lifr_ppa)); 15377 } 15378 15379 /* ARGSUSED */ 15380 int 15381 ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15382 ip_ioctl_cmd_t *ipip, void *if_req) 15383 { 15384 /* 15385 * ill_phyint_reinit merged the v4 and v6 into a single 15386 * ipsq. We might not have been able to complete the 15387 * slifname in ipif_set_values, if we could not become 15388 * exclusive. If so restart it here 15389 */ 15390 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q)); 15391 } 15392 15393 /* 15394 * Return a pointer to the ipif which matches the index, IP version type and 15395 * zoneid. 
15396 */ 15397 ipif_t * 15398 ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid, 15399 ip_stack_t *ipst) 15400 { 15401 ill_t *ill; 15402 ipif_t *ipif = NULL; 15403 15404 ill = ill_lookup_on_ifindex(index, isv6, ipst); 15405 if (ill != NULL) { 15406 mutex_enter(&ill->ill_lock); 15407 for (ipif = ill->ill_ipif; ipif != NULL; 15408 ipif = ipif->ipif_next) { 15409 if (!IPIF_IS_CONDEMNED(ipif) && (zoneid == ALL_ZONES || 15410 zoneid == ipif->ipif_zoneid || 15411 ipif->ipif_zoneid == ALL_ZONES)) { 15412 ipif_refhold_locked(ipif); 15413 break; 15414 } 15415 } 15416 mutex_exit(&ill->ill_lock); 15417 ill_refrele(ill); 15418 } 15419 return (ipif); 15420 } 15421 15422 /* 15423 * Change an existing physical interface's index. If the new index 15424 * is acceptable we update the index and the phyint_list_avl_by_index tree. 15425 * Finally, we update other systems which may have a dependence on the 15426 * index value. 15427 */ 15428 /* ARGSUSED */ 15429 int 15430 ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15431 ip_ioctl_cmd_t *ipip, void *ifreq) 15432 { 15433 ill_t *ill; 15434 phyint_t *phyi; 15435 struct ifreq *ifr = (struct ifreq *)ifreq; 15436 struct lifreq *lifr = (struct lifreq *)ifreq; 15437 uint_t old_index, index; 15438 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 15439 avl_index_t where; 15440 15441 if (ipip->ipi_cmd_type == IF_CMD) 15442 index = ifr->ifr_index; 15443 else 15444 index = lifr->lifr_index; 15445 15446 /* 15447 * Only allow on physical interface. Also, index zero is illegal. 15448 */ 15449 ill = ipif->ipif_ill; 15450 phyi = ill->ill_phyint; 15451 if (ipif->ipif_id != 0 || index == 0) { 15452 return (EINVAL); 15453 } 15454 15455 /* If the index is not changing, no work to do */ 15456 if (phyi->phyint_ifindex == index) 15457 return (0); 15458 15459 /* 15460 * Use phyint_exists() to determine if the new interface index 15461 * is already in use. 
If the index is unused then we need to 15462 * change the phyint's position in the phyint_list_avl_by_index 15463 * tree. If we do not do this, subsequent lookups (using the new 15464 * index value) will not find the phyint. 15465 */ 15466 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 15467 if (phyint_exists(index, ipst)) { 15468 rw_exit(&ipst->ips_ill_g_lock); 15469 return (EEXIST); 15470 } 15471 15472 /* 15473 * The new index is unused. Set it in the phyint. However we must not 15474 * forget to trigger NE_IFINDEX_CHANGE event before the ifindex 15475 * changes. The event must be bound to old ifindex value. 15476 */ 15477 ill_nic_event_dispatch(ill, 0, NE_IFINDEX_CHANGE, 15478 &index, sizeof (index)); 15479 15480 old_index = phyi->phyint_ifindex; 15481 phyi->phyint_ifindex = index; 15482 15483 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, phyi); 15484 (void) avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 15485 &index, &where); 15486 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 15487 phyi, where); 15488 rw_exit(&ipst->ips_ill_g_lock); 15489 15490 /* Update SCTP's ILL list */ 15491 sctp_ill_reindex(ill, old_index); 15492 15493 /* Send the routing sockets message */ 15494 ip_rts_ifmsg(ipif, RTSQ_DEFAULT); 15495 if (ILL_OTHER(ill)) 15496 ip_rts_ifmsg(ILL_OTHER(ill)->ill_ipif, RTSQ_DEFAULT); 15497 15498 /* Perhaps ilgs should use this ill */ 15499 update_conn_ill(NULL, ill->ill_ipst); 15500 return (0); 15501 } 15502 15503 /* ARGSUSED */ 15504 int 15505 ip_sioctl_get_lifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15506 ip_ioctl_cmd_t *ipip, void *ifreq) 15507 { 15508 struct ifreq *ifr = (struct ifreq *)ifreq; 15509 struct lifreq *lifr = (struct lifreq *)ifreq; 15510 15511 ip1dbg(("ip_sioctl_get_lifindex(%s:%u %p)\n", 15512 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 15513 /* Get the interface index */ 15514 if (ipip->ipi_cmd_type == IF_CMD) { 15515 ifr->ifr_index = 
ipif->ipif_ill->ill_phyint->phyint_ifindex; 15516 } else { 15517 lifr->lifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 15518 } 15519 return (0); 15520 } 15521 15522 /* ARGSUSED */ 15523 int 15524 ip_sioctl_get_lifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15525 ip_ioctl_cmd_t *ipip, void *ifreq) 15526 { 15527 struct lifreq *lifr = (struct lifreq *)ifreq; 15528 15529 ip1dbg(("ip_sioctl_get_lifzone(%s:%u %p)\n", 15530 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 15531 /* Get the interface zone */ 15532 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 15533 lifr->lifr_zoneid = ipif->ipif_zoneid; 15534 return (0); 15535 } 15536 15537 /* 15538 * Set the zoneid of an interface. 15539 */ 15540 /* ARGSUSED */ 15541 int 15542 ip_sioctl_slifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15543 ip_ioctl_cmd_t *ipip, void *ifreq) 15544 { 15545 struct lifreq *lifr = (struct lifreq *)ifreq; 15546 int err = 0; 15547 boolean_t need_up = B_FALSE; 15548 zone_t *zptr; 15549 zone_status_t status; 15550 zoneid_t zoneid; 15551 15552 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 15553 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) { 15554 if (!is_system_labeled()) 15555 return (ENOTSUP); 15556 zoneid = GLOBAL_ZONEID; 15557 } 15558 15559 /* cannot assign instance zero to a non-global zone */ 15560 if (ipif->ipif_id == 0 && zoneid != GLOBAL_ZONEID) 15561 return (ENOTSUP); 15562 15563 /* 15564 * Cannot assign to a zone that doesn't exist or is shutting down. In 15565 * the event of a race with the zone shutdown processing, since IP 15566 * serializes this ioctl and SIOCGLIFCONF/SIOCLIFREMOVEIF, we know the 15567 * interface will be cleaned up even if the zone is shut down 15568 * immediately after the status check. If the interface can't be brought 15569 * down right away, and the zone is shut down before the restart 15570 * function is called, we resolve the possible races by rechecking the 15571 * zone status in the restart function. 
	 */
	if ((zptr = zone_find_by_id(zoneid)) == NULL)
		return (EINVAL);
	status = zone_status_get(zptr);
	zone_rele(zptr);

	if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING)
		return (EINVAL);

	if (ipif->ipif_flags & IPIF_UP) {
		/*
		 * If the interface is already marked up,
		 * we call ipif_down which will take care
		 * of ditching any IREs that have been set
		 * up based on the old interface address.
		 */
		err = ipif_logical_down(ipif, q, mp);
		if (err == EINPROGRESS)
			return (err);
		(void) ipif_down_tail(ipif);
		need_up = B_TRUE;
	}

	err = ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, need_up);
	return (err);
}

/*
 * Second stage of SIOCSLIFZONE processing; called from the ioctl handler
 * above and from ip_sioctl_slifzone_restart().
 *
 * Stores `zoneid' in the ipif, tells SCTP about the change, bumps the
 * multicast IRE generation for the ill's address family and, when `need_up'
 * is set, brings the ipif back up.
 *
 * Returns EINVAL when the stack is not the global netstack and `zoneid' is
 * not GLOBAL_ZONEID; otherwise 0 or whatever ipif_up() returns (which may be
 * EINPROGRESS).
 *
 * NOTE(review): the EINVAL return is taken before the `need_up' handling, so
 * an ipif that a caller already brought down is not brought back up on that
 * path -- confirm that this is intended.
 */
static int
ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
    queue_t *q, mblk_t *mp, boolean_t need_up)
{
	int		err = 0;
	ip_stack_t	*ipst;

	ip1dbg(("ip_sioctl_zoneid_tail(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));

	/* The stack is derived from the queue, not from the ipif. */
	if (CONN_Q(q))
		ipst = CONNQ_TO_IPST(q);
	else
		ipst = ILLQ_TO_IPST(q);

	/*
	 * For exclusive stacks we don't allow a different zoneid than
	 * global.
	 */
	if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID &&
	    zoneid != GLOBAL_ZONEID)
		return (EINVAL);

	/* Set the new zone id. */
	ipif->ipif_zoneid = zoneid;

	/* Update sctp list */
	sctp_update_ipif(ipif, SCTP_IPIF_UPDATE);

	/* The default multicast interface might have changed */
	ire_increment_multicast_generation(ipst, ipif->ipif_ill->ill_isv6);

	if (need_up) {
		/*
		 * Now bring the interface back up.  If this
		 * is the only IPIF for the ILL, ipif_up
		 * will have to re-bind to the device, so
		 * we may get back EINPROGRESS, in which
		 * case, this IOCTL will get completed in
		 * ip_rput_dlpi when we see the DL_BIND_ACK.
		 */
		err = ipif_up(ipif, q, mp);
	}
	return (err);
}

/*
 * Restart function for SIOCSLIFZONE.  Re-validates the status of the target
 * zone and, if it is still READY or RUNNING, finishes bringing the ipif down
 * and hands off to ip_sioctl_slifzone_tail() with need_up set to B_TRUE.
 * Returns EINVAL if the zone is missing or in any other state.
 */
/* ARGSUSED */
int
ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	struct lifreq *lifr = (struct lifreq *)if_req;
	zoneid_t zoneid;
	zone_t *zptr;
	zone_status_t status;

	ASSERT(ipip->ipi_cmd_type == LIF_CMD);
	/*
	 * ALL_ZONES is mapped to the global zone for the status lookup only;
	 * the unmodified lifr_zoneid is what gets passed to the tail below.
	 */
	if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES)
		zoneid = GLOBAL_ZONEID;

	ip1dbg(("ip_sioctl_slifzone_restart(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));

	/*
	 * We recheck the zone status to resolve the following race condition:
	 * 1) process sends SIOCSLIFZONE to put hme0:1 in zone "myzone";
	 * 2) hme0:1 is up and can't be brought down right away;
	 * ip_sioctl_slifzone() returns EINPROGRESS and the request is queued;
	 * 3) zone "myzone" is halted; the zone status switches to
	 * 'shutting_down' and the zones framework sends SIOCGLIFCONF to list
	 * the interfaces to remove - hme0:1 is not returned because it's not
	 * yet in "myzone", so it won't be removed;
	 * 4) the restart function for SIOCSLIFZONE is called; without the
	 * status check here, we would have hme0:1 in "myzone" after it's been
	 * destroyed.
	 * Note that if the status check fails, we need to bring the interface
	 * back to its state prior to ip_sioctl_slifzone(), hence the call to
	 * ipif_up_done[_v6]().
	 */
	/* A zone that cannot be found is treated as uninitialized. */
	status = ZONE_IS_UNINITIALIZED;
	if ((zptr = zone_find_by_id(zoneid)) != NULL) {
		status = zone_status_get(zptr);
		zone_rele(zptr);
	}
	if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) {
		/* Return values of the up_done routines are ignored here. */
		if (ipif->ipif_isv6) {
			(void) ipif_up_done_v6(ipif);
		} else {
			(void) ipif_up_done(ipif);
		}
		return (EINVAL);
	}

	(void) ipif_down_tail(ipif);

	return (ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp,
	    B_TRUE));
}

/*
 * Return the number of addresses on `ill' with one or more of the values
 * in `set' set and all of the values in `clear' clear.
 * The caller must be the writer on `ill' (asserted).
 */
static uint_t
ill_flagaddr_cnt(const ill_t *ill, uint64_t set, uint64_t clear)
{
	ipif_t	*ipif;
	uint_t	cnt = 0;

	ASSERT(IAM_WRITER_ILL(ill));

	/* Walk every ipif on the ill and test its ipif_flags. */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		if ((ipif->ipif_flags & set) && !(ipif->ipif_flags & clear))
			cnt++;

	return (cnt);
}

/*
 * Return the number of migratable addresses on `ill' that are under
 * application control.
 * These are the ipifs with IPIF_DHCPRUNNING or IPIF_ADDRCONF set and
 * IPIF_NOFAILOVER clear.
 */
uint_t
ill_appaddr_cnt(const ill_t *ill)
{
	return (ill_flagaddr_cnt(ill, IPIF_DHCPRUNNING | IPIF_ADDRCONF,
	    IPIF_NOFAILOVER));
}

/*
 * Return the number of point-to-point addresses on `ill'.
15730 */ 15731 uint_t 15732 ill_ptpaddr_cnt(const ill_t *ill) 15733 { 15734 return (ill_flagaddr_cnt(ill, IPIF_POINTOPOINT, 0)); 15735 } 15736 15737 /* ARGSUSED */ 15738 int 15739 ip_sioctl_get_lifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15740 ip_ioctl_cmd_t *ipip, void *ifreq) 15741 { 15742 struct lifreq *lifr = ifreq; 15743 15744 ASSERT(q->q_next == NULL); 15745 ASSERT(CONN_Q(q)); 15746 15747 ip1dbg(("ip_sioctl_get_lifusesrc(%s:%u %p)\n", 15748 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 15749 lifr->lifr_index = ipif->ipif_ill->ill_usesrc_ifindex; 15750 ip1dbg(("ip_sioctl_get_lifusesrc:lifr_index = %d\n", lifr->lifr_index)); 15751 15752 return (0); 15753 } 15754 15755 /* Find the previous ILL in this usesrc group */ 15756 static ill_t * 15757 ill_prev_usesrc(ill_t *uill) 15758 { 15759 ill_t *ill; 15760 15761 for (ill = uill->ill_usesrc_grp_next; 15762 ASSERT(ill), ill->ill_usesrc_grp_next != uill; 15763 ill = ill->ill_usesrc_grp_next) 15764 /* do nothing */; 15765 return (ill); 15766 } 15767 15768 /* 15769 * Release all members of the usesrc group. This routine is called 15770 * from ill_delete when the interface being unplumbed is the 15771 * group head. 15772 * 15773 * This silently clears the usesrc that ifconfig setup. 15774 * An alternative would be to keep that ifindex, and drop packets on the floor 15775 * since no source address can be selected. 15776 * Even if we keep the current semantics, don't need a lock and a linked list. 15777 * Can walk all the ills checking if they have a ill_usesrc_ifindex matching 15778 * the one that is being removed. Issue is how we return the usesrc users 15779 * (SIOCGLIFSRCOF). We want to be able to find the ills which have an 15780 * ill_usesrc_ifindex matching a target ill. We could also do that with an 15781 * ill walk, but the walker would need to insert in the ioctl response. 
15782 */ 15783 static void 15784 ill_disband_usesrc_group(ill_t *uill) 15785 { 15786 ill_t *next_ill, *tmp_ill; 15787 ip_stack_t *ipst = uill->ill_ipst; 15788 15789 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock)); 15790 next_ill = uill->ill_usesrc_grp_next; 15791 15792 do { 15793 ASSERT(next_ill != NULL); 15794 tmp_ill = next_ill->ill_usesrc_grp_next; 15795 ASSERT(tmp_ill != NULL); 15796 next_ill->ill_usesrc_grp_next = NULL; 15797 next_ill->ill_usesrc_ifindex = 0; 15798 next_ill = tmp_ill; 15799 } while (next_ill->ill_usesrc_ifindex != 0); 15800 uill->ill_usesrc_grp_next = NULL; 15801 } 15802 15803 /* 15804 * Remove the client usesrc ILL from the list and relink to a new list 15805 */ 15806 int 15807 ill_relink_usesrc_ills(ill_t *ucill, ill_t *uill, uint_t ifindex) 15808 { 15809 ill_t *ill, *tmp_ill; 15810 ip_stack_t *ipst = ucill->ill_ipst; 15811 15812 ASSERT((ucill != NULL) && (ucill->ill_usesrc_grp_next != NULL) && 15813 (uill != NULL) && RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock)); 15814 15815 /* 15816 * Check if the usesrc client ILL passed in is not already 15817 * in use as a usesrc ILL i.e one whose source address is 15818 * in use OR a usesrc ILL is not already in use as a usesrc 15819 * client ILL 15820 */ 15821 if ((ucill->ill_usesrc_ifindex == 0) || 15822 (uill->ill_usesrc_ifindex != 0)) { 15823 return (-1); 15824 } 15825 15826 ill = ill_prev_usesrc(ucill); 15827 ASSERT(ill->ill_usesrc_grp_next != NULL); 15828 15829 /* Remove from the current list */ 15830 if (ill->ill_usesrc_grp_next->ill_usesrc_grp_next == ill) { 15831 /* Only two elements in the list */ 15832 ASSERT(ill->ill_usesrc_ifindex == 0); 15833 ill->ill_usesrc_grp_next = NULL; 15834 } else { 15835 ill->ill_usesrc_grp_next = ucill->ill_usesrc_grp_next; 15836 } 15837 15838 if (ifindex == 0) { 15839 ucill->ill_usesrc_ifindex = 0; 15840 ucill->ill_usesrc_grp_next = NULL; 15841 return (0); 15842 } 15843 15844 ucill->ill_usesrc_ifindex = ifindex; 15845 tmp_ill = uill->ill_usesrc_grp_next; 
15846 uill->ill_usesrc_grp_next = ucill; 15847 ucill->ill_usesrc_grp_next = 15848 (tmp_ill != NULL) ? tmp_ill : uill; 15849 return (0); 15850 } 15851 15852 /* 15853 * Set the ill_usesrc and ill_usesrc_head fields. See synchronization notes in 15854 * ip.c for locking details. 15855 */ 15856 /* ARGSUSED */ 15857 int 15858 ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15859 ip_ioctl_cmd_t *ipip, void *ifreq) 15860 { 15861 struct lifreq *lifr = (struct lifreq *)ifreq; 15862 boolean_t isv6 = B_FALSE, reset_flg = B_FALSE; 15863 ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill; 15864 int err = 0, ret; 15865 uint_t ifindex; 15866 ipsq_t *ipsq = NULL; 15867 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 15868 15869 ASSERT(IAM_WRITER_IPIF(ipif)); 15870 ASSERT(q->q_next == NULL); 15871 ASSERT(CONN_Q(q)); 15872 15873 isv6 = (Q_TO_CONN(q))->conn_family == AF_INET6; 15874 15875 ifindex = lifr->lifr_index; 15876 if (ifindex == 0) { 15877 if (usesrc_cli_ill->ill_usesrc_grp_next == NULL) { 15878 /* non usesrc group interface, nothing to reset */ 15879 return (0); 15880 } 15881 ifindex = usesrc_cli_ill->ill_usesrc_ifindex; 15882 /* valid reset request */ 15883 reset_flg = B_TRUE; 15884 } 15885 15886 usesrc_ill = ill_lookup_on_ifindex(ifindex, isv6, ipst); 15887 if (usesrc_ill == NULL) { 15888 return (ENXIO); 15889 } 15890 15891 ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl, 15892 NEW_OP, B_TRUE); 15893 if (ipsq == NULL) { 15894 err = EINPROGRESS; 15895 /* Operation enqueued on the ipsq of the usesrc ILL */ 15896 goto done; 15897 } 15898 15899 /* USESRC isn't currently supported with IPMP */ 15900 if (IS_IPMP(usesrc_ill) || IS_UNDER_IPMP(usesrc_ill)) { 15901 err = ENOTSUP; 15902 goto done; 15903 } 15904 15905 /* 15906 * USESRC isn't compatible with the STANDBY flag. (STANDBY is only 15907 * used by IPMP underlying interfaces, but someone might think it's 15908 * more general and try to use it independently with VNI.) 
15909 */ 15910 if (usesrc_ill->ill_phyint->phyint_flags & PHYI_STANDBY) { 15911 err = ENOTSUP; 15912 goto done; 15913 } 15914 15915 /* 15916 * If the client is already in use as a usesrc_ill or a usesrc_ill is 15917 * already a client then return EINVAL 15918 */ 15919 if (IS_USESRC_ILL(usesrc_cli_ill) || IS_USESRC_CLI_ILL(usesrc_ill)) { 15920 err = EINVAL; 15921 goto done; 15922 } 15923 15924 /* 15925 * If the ill_usesrc_ifindex field is already set to what it needs to 15926 * be then this is a duplicate operation. 15927 */ 15928 if (!reset_flg && usesrc_cli_ill->ill_usesrc_ifindex == ifindex) { 15929 err = 0; 15930 goto done; 15931 } 15932 15933 ip1dbg(("ip_sioctl_slifusesrc: usesrc_cli_ill %s, usesrc_ill %s," 15934 " v6 = %d", usesrc_cli_ill->ill_name, usesrc_ill->ill_name, 15935 usesrc_ill->ill_isv6)); 15936 15937 /* 15938 * ill_g_usesrc_lock global lock protects the ill_usesrc_grp_next 15939 * and the ill_usesrc_ifindex fields 15940 */ 15941 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER); 15942 15943 if (reset_flg) { 15944 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 0); 15945 if (ret != 0) { 15946 err = EINVAL; 15947 } 15948 rw_exit(&ipst->ips_ill_g_usesrc_lock); 15949 goto done; 15950 } 15951 15952 /* 15953 * Four possibilities to consider: 15954 * 1. Both usesrc_ill and usesrc_cli_ill are not part of any usesrc grp 15955 * 2. usesrc_ill is part of a group but usesrc_cli_ill isn't 15956 * 3. usesrc_cli_ill is part of a group but usesrc_ill isn't 15957 * 4. 
Both are part of their respective usesrc groups 15958 */ 15959 if ((usesrc_ill->ill_usesrc_grp_next == NULL) && 15960 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) { 15961 ASSERT(usesrc_ill->ill_usesrc_ifindex == 0); 15962 usesrc_cli_ill->ill_usesrc_ifindex = ifindex; 15963 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill; 15964 usesrc_cli_ill->ill_usesrc_grp_next = usesrc_ill; 15965 } else if ((usesrc_ill->ill_usesrc_grp_next != NULL) && 15966 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) { 15967 usesrc_cli_ill->ill_usesrc_ifindex = ifindex; 15968 /* Insert at head of list */ 15969 usesrc_cli_ill->ill_usesrc_grp_next = 15970 usesrc_ill->ill_usesrc_grp_next; 15971 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill; 15972 } else { 15973 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 15974 ifindex); 15975 if (ret != 0) 15976 err = EINVAL; 15977 } 15978 rw_exit(&ipst->ips_ill_g_usesrc_lock); 15979 15980 done: 15981 if (ipsq != NULL) 15982 ipsq_exit(ipsq); 15983 /* The refrele on the lifr_name ipif is done by ip_process_ioctl */ 15984 ill_refrele(usesrc_ill); 15985 15986 /* Let conn_ixa caching know that source address selection changed */ 15987 ip_update_source_selection(ipst); 15988 15989 return (err); 15990 } 15991 15992 /* 15993 * comparison function used by avl. 15994 */ 15995 static int 15996 ill_phyint_compare_index(const void *index_ptr, const void *phyip) 15997 { 15998 15999 uint_t index; 16000 16001 ASSERT(phyip != NULL && index_ptr != NULL); 16002 16003 index = *((uint_t *)index_ptr); 16004 /* 16005 * let the phyint with the lowest index be on top. 16006 */ 16007 if (((phyint_t *)phyip)->phyint_ifindex < index) 16008 return (1); 16009 if (((phyint_t *)phyip)->phyint_ifindex > index) 16010 return (-1); 16011 return (0); 16012 } 16013 16014 /* 16015 * comparison function used by avl. 
 *
 * `name_ptr' is the interface name being searched for and `phyip' a
 * phyint_t in the tree; the name compared against is the ill_name of the
 * phyint's IPv4 ill if it has one, otherwise of its IPv6 ill.  The result
 * is the sign of strcmp(ill_name, name_ptr).
 *
 * NOTE(review): that makes the result positive when the tree node sorts
 * after the search key, the opposite orientation to
 * ill_phyint_compare_index() -- confirm that this is intentional.
 */
static int
ill_phyint_compare_name(const void *name_ptr, const void *phyip)
{
	ill_t *ill;
	int res = 0;

	ASSERT(phyip != NULL && name_ptr != NULL);

	if (((phyint_t *)phyip)->phyint_illv4)
		ill = ((phyint_t *)phyip)->phyint_illv4;
	else
		ill = ((phyint_t *)phyip)->phyint_illv6;
	ASSERT(ill != NULL);

	/* Normalize strcmp()'s result to exactly -1, 0 or 1. */
	res = strcmp(ill->ill_name, (char *)name_ptr);
	if (res > 0)
		return (1);
	else if (res < 0)
		return (-1);
	return (0);
}

/*
 * This function is called on the unplumb path via ill_glist_delete() when
 * there are no ills left on the phyint and thus the phyint can be freed.
 */
static void
phyint_free(phyint_t *phyi)
{
	ip_stack_t *ipst = PHYINT_TO_IPST(phyi);

	ASSERT(phyi->phyint_illv4 == NULL && phyi->phyint_illv6 == NULL);

	/*
	 * If this phyint was an IPMP meta-interface, blow away the group.
	 * This is safe to do because all of the illgrps have already been
	 * removed by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find us.
	 * If we're cleaning up as a result of failed initialization,
	 * phyint_grp may be NULL.
	 */
	if ((phyi->phyint_flags & PHYI_IPMP) && (phyi->phyint_grp != NULL)) {
		rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
		ipmp_grp_destroy(phyi->phyint_grp);
		phyi->phyint_grp = NULL;
		rw_exit(&ipst->ips_ipmp_lock);
	}

	/*
	 * If this interface was under IPMP, take it out of the group.
	 * (phyint_grp was cleared above in the meta-interface case, so this
	 * only fires for a phyint that did not take that branch.)
	 */
	if (phyi->phyint_grp != NULL)
		ipmp_phyint_leave_grp(phyi);

	/*
	 * Delete the phyint and disassociate its ipsq.  The ipsq itself
	 * will be freed in ipsq_exit().
	 */
	phyi->phyint_ipsq->ipsq_phyint = NULL;
	phyi->phyint_name[0] = '\0';

	mi_free(phyi);
}

/*
 * Attach the ill to the phyint structure which can be shared by both
 * IPv4 and IPv6 ill. ill_init allocates a phyint to just hold flags. This
 * function is called from ipif_set_values and ill_lookup_on_name (for
 * loopback) where we know the name of the ill. We lookup the ill and if
 * there is one present already with the name use that phyint. Otherwise
 * reuse the one allocated by ill_init.
 *
 * The caller must hold ips_ill_g_lock as writer (asserted).
 */
static void
ill_phyint_reinit(ill_t *ill)
{
	boolean_t isv6 = ill->ill_isv6;
	phyint_t *phyi_old;
	phyint_t *phyi;
	avl_index_t where = 0;
	ill_t	*ill_other = NULL;
	ip_stack_t	*ipst = ill->ill_ipst;

	ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));

	/*
	 * The phyint allocated by ill_init must reference only this ill and
	 * must not have an ifindex yet.
	 */
	phyi_old = ill->ill_phyint;
	ASSERT(isv6 || (phyi_old->phyint_illv4 == ill &&
	    phyi_old->phyint_illv6 == NULL));
	ASSERT(!isv6 || (phyi_old->phyint_illv6 == ill &&
	    phyi_old->phyint_illv4 == NULL));
	ASSERT(phyi_old->phyint_ifindex == 0);

	/*
	 * Now that our ill has a name, set it in the phyint.
	 */
	(void) strlcpy(ill->ill_phyint->phyint_name, ill->ill_name, LIFNAMSIZ);

	/* `where' records the insertion point in case the name is new. */
	phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
	    ill->ill_name, &where);

	/*
	 * 1. We grabbed the ill_g_lock before inserting this ill into
	 *    the global list of ills. So no other thread could have located
	 *    this ill and hence the ipsq of this ill is guaranteed to be empty.
	 * 2. Now locate the other protocol instance of this ill.
	 * 3. Now grab both ill locks in the right order, and the phyint lock of
	 *    the new ipsq. Holding ill locks + ill_g_lock ensures that the ipsq
	 *    of neither ill can change.
	 * 4. Merge the phyint and thus the ipsq as well of this ill onto the
	 *    other ill.
	 * 5. Release all locks.
	 */

	/*
	 * Look for IPv4 if we are initializing IPv6 or look for IPv6 if
	 * we are initializing IPv4.
	 */
	if (phyi != NULL) {
		ill_other = (isv6) ? phyi->phyint_illv4 : phyi->phyint_illv6;
		ASSERT(ill_other->ill_phyint != NULL);
		ASSERT((isv6 && !ill_other->ill_isv6) ||
		    (!isv6 && ill_other->ill_isv6));
		GRAB_ILL_LOCKS(ill, ill_other);
		/*
		 * We are potentially throwing away phyint_flags which
		 * could be different from the one that we obtain from
		 * ill_other->ill_phyint. But it is okay as we are assuming
		 * that the state maintained within IP is correct.
		 */
		mutex_enter(&phyi->phyint_lock);
		if (isv6) {
			ASSERT(phyi->phyint_illv6 == NULL);
			phyi->phyint_illv6 = ill;
		} else {
			ASSERT(phyi->phyint_illv4 == NULL);
			phyi->phyint_illv4 = ill;
		}

		/*
		 * Delete the old phyint and make its ipsq eligible
		 * to be freed in ipsq_exit().
		 */
		phyi_old->phyint_illv4 = NULL;
		phyi_old->phyint_illv6 = NULL;
		phyi_old->phyint_ipsq->ipsq_phyint = NULL;
		phyi_old->phyint_name[0] = '\0';
		mi_free(phyi_old);
	} else {
		mutex_enter(&ill->ill_lock);
		/*
		 * We don't need to acquire any lock, since
		 * the ill is not yet visible globally and we
		 * have not yet released the ill_g_lock.
		 *
		 * NOTE(review): ill_lock and phyint_lock are nevertheless
		 * taken here; both are dropped at the end of the function,
		 * on the path shared with the branch above.
		 */
		phyi = phyi_old;
		mutex_enter(&phyi->phyint_lock);
		/* XXX We need a recovery strategy here. */
		if (!phyint_assign_ifindex(phyi, ipst))
			cmn_err(CE_PANIC, "phyint_assign_ifindex() failed");

		/* By name, at the position found by the lookup above. */
		avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
		    (void *)phyi, where);

		/*
		 * By index: this avl_find() is only used to compute `where'
		 * for the freshly assigned ifindex.
		 */
		(void) avl_find(&ipst->ips_phyint_g_list->
		    phyint_list_avl_by_index,
		    &phyi->phyint_ifindex, &where);
		avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
		    (void *)phyi, where);
	}

	/*
	 * Reassigning ill_phyint automatically reassigns the ipsq also.
	 * pending mp is not affected because that is per ill basis.
	 */
	ill->ill_phyint = phyi;

	/*
	 * Now that the phyint's ifindex has been assigned, complete the
	 * remaining per-ill setup that depends on it: the ifindex fields
	 * of the MIBs and the initial multicast protocol version.
	 */
	ill->ill_ip_mib->ipIfStatsIfIndex = ill->ill_phyint->phyint_ifindex;
	if (ill->ill_isv6) {
		ill->ill_icmp6_mib->ipv6IfIcmpIfIndex =
		    ill->ill_phyint->phyint_ifindex;
		ill->ill_mcast_type = ipst->ips_mld_max_version;
	} else {
		ill->ill_mcast_type = ipst->ips_igmp_max_version;
	}

	/*
	 * Generate an event within the hooks framework to indicate that
	 * a new interface has just been added to IP.  For this event to
	 * be generated, the network interface must, at least, have an
	 * ifindex assigned to it.  (We don't generate the event for
	 * loopback since ill_lookup_on_name() has its own NE_PLUMB event.)
	 *
	 * This needs to be run inside the ill_g_lock perimeter to ensure
	 * that the ordering of delivered events to listeners matches the
	 * order of them in the kernel.
	 */
	if (!IS_LOOPBACK(ill)) {
		ill_nic_event_dispatch(ill, 0, NE_PLUMB, ill->ill_name,
		    ill->ill_name_length);
	}
	/* ill_other is NULL here when no existing phyint was found. */
	RELEASE_ILL_LOCKS(ill, ill_other);
	mutex_exit(&phyi->phyint_lock);
}

/*
 * Notify any downstream modules of the name of this interface.
 * An M_IOCTL is used even though we don't expect a successful reply.
16226 * Any reply message from the driver (presumably an M_IOCNAK) will 16227 * eventually get discarded somewhere upstream. The message format is 16228 * simply an SIOCSLIFNAME ioctl just as might be sent from ifconfig 16229 * to IP. 16230 */ 16231 static void 16232 ip_ifname_notify(ill_t *ill, queue_t *q) 16233 { 16234 mblk_t *mp1, *mp2; 16235 struct iocblk *iocp; 16236 struct lifreq *lifr; 16237 16238 mp1 = mkiocb(SIOCSLIFNAME); 16239 if (mp1 == NULL) 16240 return; 16241 mp2 = allocb(sizeof (struct lifreq), BPRI_HI); 16242 if (mp2 == NULL) { 16243 freeb(mp1); 16244 return; 16245 } 16246 16247 mp1->b_cont = mp2; 16248 iocp = (struct iocblk *)mp1->b_rptr; 16249 iocp->ioc_count = sizeof (struct lifreq); 16250 16251 lifr = (struct lifreq *)mp2->b_rptr; 16252 mp2->b_wptr += sizeof (struct lifreq); 16253 bzero(lifr, sizeof (struct lifreq)); 16254 16255 (void) strncpy(lifr->lifr_name, ill->ill_name, LIFNAMSIZ); 16256 lifr->lifr_ppa = ill->ill_ppa; 16257 lifr->lifr_flags = (ill->ill_flags & (ILLF_IPV4|ILLF_IPV6)); 16258 16259 DTRACE_PROBE3(ill__dlpi, char *, "ip_ifname_notify", 16260 char *, "SIOCSLIFNAME", ill_t *, ill); 16261 putnext(q, mp1); 16262 } 16263 16264 static int 16265 ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) 16266 { 16267 int err; 16268 ip_stack_t *ipst = ill->ill_ipst; 16269 phyint_t *phyi = ill->ill_phyint; 16270 16271 /* Set the obsolete NDD per-interface forwarding name. */ 16272 err = ill_set_ndd_name(ill); 16273 if (err != 0) { 16274 cmn_err(CE_WARN, "ipif_set_values: ill_set_ndd_name (%d)\n", 16275 err); 16276 } 16277 16278 /* 16279 * Now that ill_name is set, the configuration for the IPMP 16280 * meta-interface can be performed. 16281 */ 16282 if (IS_IPMP(ill)) { 16283 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 16284 /* 16285 * If phyi->phyint_grp is NULL, then this is the first IPMP 16286 * meta-interface and we need to create the IPMP group. 
16287 */ 16288 if (phyi->phyint_grp == NULL) { 16289 /* 16290 * If someone has renamed another IPMP group to have 16291 * the same name as our interface, bail. 16292 */ 16293 if (ipmp_grp_lookup(ill->ill_name, ipst) != NULL) { 16294 rw_exit(&ipst->ips_ipmp_lock); 16295 return (EEXIST); 16296 } 16297 phyi->phyint_grp = ipmp_grp_create(ill->ill_name, phyi); 16298 if (phyi->phyint_grp == NULL) { 16299 rw_exit(&ipst->ips_ipmp_lock); 16300 return (ENOMEM); 16301 } 16302 } 16303 rw_exit(&ipst->ips_ipmp_lock); 16304 } 16305 16306 /* Tell downstream modules where they are. */ 16307 ip_ifname_notify(ill, q); 16308 16309 /* 16310 * ill_dl_phys returns EINPROGRESS in the usual case. 16311 * Error cases are ENOMEM ... 16312 */ 16313 err = ill_dl_phys(ill, ipif, mp, q); 16314 16315 if (ill->ill_isv6) { 16316 mutex_enter(&ipst->ips_mld_slowtimeout_lock); 16317 if (ipst->ips_mld_slowtimeout_id == 0) { 16318 ipst->ips_mld_slowtimeout_id = timeout(mld_slowtimo, 16319 (void *)ipst, 16320 MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL)); 16321 } 16322 mutex_exit(&ipst->ips_mld_slowtimeout_lock); 16323 } else { 16324 mutex_enter(&ipst->ips_igmp_slowtimeout_lock); 16325 if (ipst->ips_igmp_slowtimeout_id == 0) { 16326 ipst->ips_igmp_slowtimeout_id = timeout(igmp_slowtimo, 16327 (void *)ipst, 16328 MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL)); 16329 } 16330 mutex_exit(&ipst->ips_igmp_slowtimeout_lock); 16331 } 16332 16333 return (err); 16334 } 16335 16336 /* 16337 * Common routine for ppa and ifname setting. Should be called exclusive. 16338 * 16339 * Returns EINPROGRESS when mp has been consumed by queueing it on 16340 * ipx_pending_mp and the ioctl will complete in ip_rput. 16341 * 16342 * NOTE : If ppa is UNIT_MAX, we assign the next valid ppa and return 16343 * the new name and new ppa in lifr_name and lifr_ppa respectively. 16344 * For SLIFNAME, we pass these values back to the userland. 
16345 */ 16346 static int 16347 ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr) 16348 { 16349 ill_t *ill; 16350 ipif_t *ipif; 16351 ipsq_t *ipsq; 16352 char *ppa_ptr; 16353 char *old_ptr; 16354 char old_char; 16355 int error; 16356 ip_stack_t *ipst; 16357 16358 ip1dbg(("ipif_set_values: interface %s\n", interf_name)); 16359 ASSERT(q->q_next != NULL); 16360 ASSERT(interf_name != NULL); 16361 16362 ill = (ill_t *)q->q_ptr; 16363 ipst = ill->ill_ipst; 16364 16365 ASSERT(ill->ill_ipst != NULL); 16366 ASSERT(ill->ill_name[0] == '\0'); 16367 ASSERT(IAM_WRITER_ILL(ill)); 16368 ASSERT((mi_strlen(interf_name) + 1) <= LIFNAMSIZ); 16369 ASSERT(ill->ill_ppa == UINT_MAX); 16370 16371 ill->ill_defend_start = ill->ill_defend_count = 0; 16372 /* The ppa is sent down by ifconfig or is chosen */ 16373 if ((ppa_ptr = ill_get_ppa_ptr(interf_name)) == NULL) { 16374 return (EINVAL); 16375 } 16376 16377 /* 16378 * make sure ppa passed in is same as ppa in the name. 16379 * This check is not made when ppa == UINT_MAX in that case ppa 16380 * in the name could be anything. System will choose a ppa and 16381 * update new_ppa_ptr and inter_name to contain the choosen ppa. 16382 */ 16383 if (*new_ppa_ptr != UINT_MAX) { 16384 /* stoi changes the pointer */ 16385 old_ptr = ppa_ptr; 16386 /* 16387 * ifconfig passed in 0 for the ppa for DLPI 1 style devices 16388 * (they don't have an externally visible ppa). We assign one 16389 * here so that we can manage the interface. Note that in 16390 * the past this value was always 0 for DLPI 1 drivers. 16391 */ 16392 if (*new_ppa_ptr == 0) 16393 *new_ppa_ptr = stoi(&old_ptr); 16394 else if (*new_ppa_ptr != (uint_t)stoi(&old_ptr)) 16395 return (EINVAL); 16396 } 16397 /* 16398 * terminate string before ppa 16399 * save char at that location. 
16400 */ 16401 old_char = ppa_ptr[0]; 16402 ppa_ptr[0] = '\0'; 16403 16404 ill->ill_ppa = *new_ppa_ptr; 16405 /* 16406 * Finish as much work now as possible before calling ill_glist_insert 16407 * which makes the ill globally visible and also merges it with the 16408 * other protocol instance of this phyint. The remaining work is 16409 * done after entering the ipsq which may happen sometime later. 16410 * ill_set_ndd_name occurs after the ill has been made globally visible. 16411 */ 16412 ipif = ill->ill_ipif; 16413 16414 /* We didn't do this when we allocated ipif in ip_ll_subnet_defaults */ 16415 ipif_assign_seqid(ipif); 16416 16417 if (!(ill->ill_flags & (ILLF_IPV4|ILLF_IPV6))) 16418 ill->ill_flags |= ILLF_IPV4; 16419 16420 ASSERT(ipif->ipif_next == NULL); /* Only one ipif on ill */ 16421 ASSERT((ipif->ipif_flags & IPIF_UP) == 0); 16422 16423 if (ill->ill_flags & ILLF_IPV6) { 16424 16425 ill->ill_isv6 = B_TRUE; 16426 ill_set_inputfn(ill); 16427 if (ill->ill_rq != NULL) { 16428 ill->ill_rq->q_qinfo = &iprinitv6; 16429 } 16430 16431 /* Keep the !IN6_IS_ADDR_V4MAPPED assertions happy */ 16432 ipif->ipif_v6lcl_addr = ipv6_all_zeros; 16433 ipif->ipif_v6subnet = ipv6_all_zeros; 16434 ipif->ipif_v6net_mask = ipv6_all_zeros; 16435 ipif->ipif_v6brd_addr = ipv6_all_zeros; 16436 ipif->ipif_v6pp_dst_addr = ipv6_all_zeros; 16437 ill->ill_reachable_retrans_time = ND_RETRANS_TIMER; 16438 /* 16439 * point-to-point or Non-mulicast capable 16440 * interfaces won't do NUD unless explicitly 16441 * configured to do so. 16442 */ 16443 if (ipif->ipif_flags & IPIF_POINTOPOINT || 16444 !(ill->ill_flags & ILLF_MULTICAST)) { 16445 ill->ill_flags |= ILLF_NONUD; 16446 } 16447 /* Make sure IPv4 specific flag is not set on IPv6 if */ 16448 if (ill->ill_flags & ILLF_NOARP) { 16449 /* 16450 * Note: xresolv interfaces will eventually need 16451 * NOARP set here as well, but that will require 16452 * those external resolvers to have some 16453 * knowledge of that flag and act appropriately. 
16454 * Not to be changed at present. 16455 */ 16456 ill->ill_flags &= ~ILLF_NOARP; 16457 } 16458 /* 16459 * Set the ILLF_ROUTER flag according to the global 16460 * IPv6 forwarding policy. 16461 */ 16462 if (ipst->ips_ipv6_forward != 0) 16463 ill->ill_flags |= ILLF_ROUTER; 16464 } else if (ill->ill_flags & ILLF_IPV4) { 16465 ill->ill_isv6 = B_FALSE; 16466 ill_set_inputfn(ill); 16467 ill->ill_reachable_retrans_time = ARP_RETRANS_TIMER; 16468 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6lcl_addr); 16469 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6subnet); 16470 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6net_mask); 16471 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6brd_addr); 16472 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6pp_dst_addr); 16473 /* 16474 * Set the ILLF_ROUTER flag according to the global 16475 * IPv4 forwarding policy. 16476 */ 16477 if (ipst->ips_ip_g_forward != 0) 16478 ill->ill_flags |= ILLF_ROUTER; 16479 } 16480 16481 ASSERT(ill->ill_phyint != NULL); 16482 16483 /* 16484 * The ipIfStatsIfindex and ipv6IfIcmpIfIndex assignments will 16485 * be completed in ill_glist_insert -> ill_phyint_reinit 16486 */ 16487 if (!ill_allocate_mibs(ill)) 16488 return (ENOMEM); 16489 16490 /* 16491 * Pick a default sap until we get the DL_INFO_ACK back from 16492 * the driver. 16493 */ 16494 ill->ill_sap = (ill->ill_isv6) ? ill->ill_media->ip_m_ipv6sap : 16495 ill->ill_media->ip_m_ipv4sap; 16496 16497 ill->ill_ifname_pending = 1; 16498 ill->ill_ifname_pending_err = 0; 16499 16500 /* 16501 * When the first ipif comes up in ipif_up_done(), multicast groups 16502 * that were joined while this ill was not bound to the DLPI link need 16503 * to be recovered by ill_recover_multicast(). 
16504 */ 16505 ill->ill_need_recover_multicast = 1; 16506 16507 ill_refhold(ill); 16508 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 16509 if ((error = ill_glist_insert(ill, interf_name, 16510 (ill->ill_flags & ILLF_IPV6) == ILLF_IPV6)) > 0) { 16511 ill->ill_ppa = UINT_MAX; 16512 ill->ill_name[0] = '\0'; 16513 /* 16514 * undo null termination done above. 16515 */ 16516 ppa_ptr[0] = old_char; 16517 rw_exit(&ipst->ips_ill_g_lock); 16518 ill_refrele(ill); 16519 return (error); 16520 } 16521 16522 ASSERT(ill->ill_name_length <= LIFNAMSIZ); 16523 16524 /* 16525 * When we return the buffer pointed to by interf_name should contain 16526 * the same name as in ill_name. 16527 * If a ppa was choosen by the system (ppa passed in was UINT_MAX) 16528 * the buffer pointed to by new_ppa_ptr would not contain the right ppa 16529 * so copy full name and update the ppa ptr. 16530 * When ppa passed in != UINT_MAX all values are correct just undo 16531 * null termination, this saves a bcopy. 16532 */ 16533 if (*new_ppa_ptr == UINT_MAX) { 16534 bcopy(ill->ill_name, interf_name, ill->ill_name_length); 16535 *new_ppa_ptr = ill->ill_ppa; 16536 } else { 16537 /* 16538 * undo null termination done above. 16539 */ 16540 ppa_ptr[0] = old_char; 16541 } 16542 16543 /* Let SCTP know about this ILL */ 16544 sctp_update_ill(ill, SCTP_ILL_INSERT); 16545 16546 /* 16547 * ill_glist_insert has made the ill visible globally, and 16548 * ill_phyint_reinit could have changed the ipsq. At this point, 16549 * we need to hold the ips_ill_g_lock across the call to enter the 16550 * ipsq to enforce atomicity and prevent reordering. In the event 16551 * the ipsq has changed, and if the new ipsq is currently busy, 16552 * we need to make sure that this half-completed ioctl is ahead of 16553 * any subsequent ioctl. We achieve this by not dropping the 16554 * ips_ill_g_lock which prevents any ill lookup itself thereby 16555 * ensuring that new ioctls can't start. 
16556 */ 16557 ipsq = ipsq_try_enter_internal(ill, q, mp, ip_reprocess_ioctl, NEW_OP, 16558 B_TRUE); 16559 16560 rw_exit(&ipst->ips_ill_g_lock); 16561 ill_refrele(ill); 16562 if (ipsq == NULL) 16563 return (EINPROGRESS); 16564 16565 /* 16566 * If ill_phyint_reinit() changed our ipsq, then start on the new ipsq. 16567 */ 16568 if (ipsq->ipsq_xop->ipx_current_ipif == NULL) 16569 ipsq_current_start(ipsq, ipif, SIOCSLIFNAME); 16570 else 16571 ASSERT(ipsq->ipsq_xop->ipx_current_ipif == ipif); 16572 16573 error = ipif_set_values_tail(ill, ipif, mp, q); 16574 ipsq_exit(ipsq); 16575 if (error != 0 && error != EINPROGRESS) { 16576 /* 16577 * restore previous values 16578 */ 16579 ill->ill_isv6 = B_FALSE; 16580 ill_set_inputfn(ill); 16581 } 16582 return (error); 16583 } 16584 16585 void 16586 ipif_init(ip_stack_t *ipst) 16587 { 16588 int i; 16589 16590 for (i = 0; i < MAX_G_HEADS; i++) { 16591 ipst->ips_ill_g_heads[i].ill_g_list_head = 16592 (ill_if_t *)&ipst->ips_ill_g_heads[i]; 16593 ipst->ips_ill_g_heads[i].ill_g_list_tail = 16594 (ill_if_t *)&ipst->ips_ill_g_heads[i]; 16595 } 16596 16597 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 16598 ill_phyint_compare_index, 16599 sizeof (phyint_t), 16600 offsetof(struct phyint, phyint_avl_by_index)); 16601 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 16602 ill_phyint_compare_name, 16603 sizeof (phyint_t), 16604 offsetof(struct phyint, phyint_avl_by_name)); 16605 } 16606 16607 /* 16608 * Save enough information so that we can recreate the IRE if 16609 * the interface goes down and then up. 
16610 */ 16611 void 16612 ill_save_ire(ill_t *ill, ire_t *ire) 16613 { 16614 mblk_t *save_mp; 16615 16616 save_mp = allocb(sizeof (ifrt_t), BPRI_MED); 16617 if (save_mp != NULL) { 16618 ifrt_t *ifrt; 16619 16620 save_mp->b_wptr += sizeof (ifrt_t); 16621 ifrt = (ifrt_t *)save_mp->b_rptr; 16622 bzero(ifrt, sizeof (ifrt_t)); 16623 ifrt->ifrt_type = ire->ire_type; 16624 if (ire->ire_ipversion == IPV4_VERSION) { 16625 ASSERT(!ill->ill_isv6); 16626 ifrt->ifrt_addr = ire->ire_addr; 16627 ifrt->ifrt_gateway_addr = ire->ire_gateway_addr; 16628 ifrt->ifrt_setsrc_addr = ire->ire_setsrc_addr; 16629 ifrt->ifrt_mask = ire->ire_mask; 16630 } else { 16631 ASSERT(ill->ill_isv6); 16632 ifrt->ifrt_v6addr = ire->ire_addr_v6; 16633 /* ire_gateway_addr_v6 can change due to RTM_CHANGE */ 16634 mutex_enter(&ire->ire_lock); 16635 ifrt->ifrt_v6gateway_addr = ire->ire_gateway_addr_v6; 16636 mutex_exit(&ire->ire_lock); 16637 ifrt->ifrt_v6setsrc_addr = ire->ire_setsrc_addr_v6; 16638 ifrt->ifrt_v6mask = ire->ire_mask_v6; 16639 } 16640 ifrt->ifrt_flags = ire->ire_flags; 16641 ifrt->ifrt_zoneid = ire->ire_zoneid; 16642 mutex_enter(&ill->ill_saved_ire_lock); 16643 save_mp->b_cont = ill->ill_saved_ire_mp; 16644 ill->ill_saved_ire_mp = save_mp; 16645 ill->ill_saved_ire_cnt++; 16646 mutex_exit(&ill->ill_saved_ire_lock); 16647 } 16648 } 16649 16650 /* 16651 * Remove one entry from ill_saved_ire_mp. 16652 */ 16653 void 16654 ill_remove_saved_ire(ill_t *ill, ire_t *ire) 16655 { 16656 mblk_t **mpp; 16657 mblk_t *mp; 16658 ifrt_t *ifrt; 16659 16660 /* Remove from ill_saved_ire_mp list if it is there */ 16661 mutex_enter(&ill->ill_saved_ire_lock); 16662 for (mpp = &ill->ill_saved_ire_mp; *mpp != NULL; 16663 mpp = &(*mpp)->b_cont) { 16664 in6_addr_t gw_addr_v6; 16665 16666 /* 16667 * On a given ill, the tuple of address, gateway, mask, 16668 * ire_type, and zoneid is unique for each saved IRE. 
16669 */ 16670 mp = *mpp; 16671 ifrt = (ifrt_t *)mp->b_rptr; 16672 /* ire_gateway_addr_v6 can change - need lock */ 16673 mutex_enter(&ire->ire_lock); 16674 gw_addr_v6 = ire->ire_gateway_addr_v6; 16675 mutex_exit(&ire->ire_lock); 16676 16677 if (ifrt->ifrt_zoneid != ire->ire_zoneid || 16678 ifrt->ifrt_type != ire->ire_type) 16679 continue; 16680 16681 if (ill->ill_isv6 ? 16682 (IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6addr, 16683 &ire->ire_addr_v6) && 16684 IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6gateway_addr, 16685 &gw_addr_v6) && 16686 IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6mask, 16687 &ire->ire_mask_v6)) : 16688 (ifrt->ifrt_addr == ire->ire_addr && 16689 ifrt->ifrt_gateway_addr == ire->ire_gateway_addr && 16690 ifrt->ifrt_mask == ire->ire_mask)) { 16691 *mpp = mp->b_cont; 16692 ill->ill_saved_ire_cnt--; 16693 freeb(mp); 16694 break; 16695 } 16696 } 16697 mutex_exit(&ill->ill_saved_ire_lock); 16698 } 16699 16700 /* 16701 * IP multirouting broadcast routes handling 16702 * Append CGTP broadcast IREs to regular ones created 16703 * at ifconfig time. 16704 * The usage is a route add <cgtp_bc> <nic_bc> -multirt i.e., both 16705 * the destination and the gateway are broadcast addresses. 16706 * The caller has verified that the destination is an IRE_BROADCAST and that 16707 * RTF_MULTIRT was set. Here if the gateway is a broadcast address, then 16708 * we create a MULTIRT IRE_BROADCAST. 16709 * Note that the IRE_HOST created by ire_rt_add doesn't get found by anything 16710 * since the IRE_BROADCAST takes precedence; ire_add_v4 does head insertion. 16711 */ 16712 static void 16713 ip_cgtp_bcast_add(ire_t *ire, ip_stack_t *ipst) 16714 { 16715 ire_t *ire_prim; 16716 16717 ASSERT(ire != NULL); 16718 16719 ire_prim = ire_ftable_lookup_v4(ire->ire_gateway_addr, 0, 0, 16720 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst, 16721 NULL); 16722 if (ire_prim != NULL) { 16723 /* 16724 * We are in the special case of broadcasts for 16725 * CGTP. 
We add an IRE_BROADCAST that holds 16726 * the RTF_MULTIRT flag, the destination 16727 * address and the low level 16728 * info of ire_prim. In other words, CGTP 16729 * broadcast is added to the redundant ipif. 16730 */ 16731 ill_t *ill_prim; 16732 ire_t *bcast_ire; 16733 16734 ill_prim = ire_prim->ire_ill; 16735 16736 ip2dbg(("ip_cgtp_filter_bcast_add: ire_prim %p, ill_prim %p\n", 16737 (void *)ire_prim, (void *)ill_prim)); 16738 16739 bcast_ire = ire_create( 16740 (uchar_t *)&ire->ire_addr, 16741 (uchar_t *)&ip_g_all_ones, 16742 (uchar_t *)&ire->ire_gateway_addr, 16743 IRE_BROADCAST, 16744 ill_prim, 16745 GLOBAL_ZONEID, /* CGTP is only for the global zone */ 16746 ire->ire_flags | RTF_KERNEL, 16747 NULL, 16748 ipst); 16749 16750 /* 16751 * Here we assume that ire_add does head insertion so that 16752 * the added IRE_BROADCAST comes before the existing IRE_HOST. 16753 */ 16754 if (bcast_ire != NULL) { 16755 if (ire->ire_flags & RTF_SETSRC) { 16756 bcast_ire->ire_setsrc_addr = 16757 ire->ire_setsrc_addr; 16758 } 16759 bcast_ire = ire_add(bcast_ire); 16760 if (bcast_ire != NULL) { 16761 ip2dbg(("ip_cgtp_filter_bcast_add: " 16762 "added bcast_ire %p\n", 16763 (void *)bcast_ire)); 16764 16765 ill_save_ire(ill_prim, bcast_ire); 16766 ire_refrele(bcast_ire); 16767 } 16768 } 16769 ire_refrele(ire_prim); 16770 } 16771 } 16772 16773 /* 16774 * IP multirouting broadcast routes handling 16775 * Remove the broadcast ire. 16776 * The usage is a route delete <cgtp_bc> <nic_bc> -multirt i.e., both 16777 * the destination and the gateway are broadcast addresses. 16778 * The caller has only verified that RTF_MULTIRT was set. We check 16779 * that the destination is broadcast and that the gateway is a broadcast 16780 * address, and if so delete the IRE added by ip_cgtp_bcast_add(). 
16781 */ 16782 static void 16783 ip_cgtp_bcast_delete(ire_t *ire, ip_stack_t *ipst) 16784 { 16785 ASSERT(ire != NULL); 16786 16787 if (ip_type_v4(ire->ire_addr, ipst) == IRE_BROADCAST) { 16788 ire_t *ire_prim; 16789 16790 ire_prim = ire_ftable_lookup_v4(ire->ire_gateway_addr, 0, 0, 16791 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, 16792 ipst, NULL); 16793 if (ire_prim != NULL) { 16794 ill_t *ill_prim; 16795 ire_t *bcast_ire; 16796 16797 ill_prim = ire_prim->ire_ill; 16798 16799 ip2dbg(("ip_cgtp_filter_bcast_delete: " 16800 "ire_prim %p, ill_prim %p\n", 16801 (void *)ire_prim, (void *)ill_prim)); 16802 16803 bcast_ire = ire_ftable_lookup_v4(ire->ire_addr, 0, 16804 ire->ire_gateway_addr, IRE_BROADCAST, 16805 ill_prim, ALL_ZONES, NULL, 16806 MATCH_IRE_TYPE | MATCH_IRE_GW | MATCH_IRE_ILL | 16807 MATCH_IRE_MASK, 0, ipst, NULL); 16808 16809 if (bcast_ire != NULL) { 16810 ip2dbg(("ip_cgtp_filter_bcast_delete: " 16811 "looked up bcast_ire %p\n", 16812 (void *)bcast_ire)); 16813 ill_remove_saved_ire(bcast_ire->ire_ill, 16814 bcast_ire); 16815 ire_delete(bcast_ire); 16816 ire_refrele(bcast_ire); 16817 } 16818 ire_refrele(ire_prim); 16819 } 16820 } 16821 } 16822 16823 /* 16824 * Derive an interface id from the link layer address. 16825 * Knows about IEEE 802 and IEEE EUI-64 mappings. 16826 */ 16827 static void 16828 ip_ether_v6intfid(ill_t *ill, in6_addr_t *v6addr) 16829 { 16830 char *addr; 16831 16832 /* 16833 * Note that some IPv6 interfaces get plumbed over links that claim to 16834 * be DL_ETHER, but don't actually have Ethernet MAC addresses (e.g. 16835 * PPP links). The ETHERADDRL check here ensures that we only set the 16836 * interface ID on IPv6 interfaces above links that actually have real 16837 * Ethernet addresses. 
16838 */ 16839 if (ill->ill_phys_addr_length == ETHERADDRL) { 16840 /* Form EUI-64 like address */ 16841 addr = (char *)&v6addr->s6_addr32[2]; 16842 bcopy(ill->ill_phys_addr, addr, 3); 16843 addr[0] ^= 0x2; /* Toggle Universal/Local bit */ 16844 addr[3] = (char)0xff; 16845 addr[4] = (char)0xfe; 16846 bcopy(ill->ill_phys_addr + 3, addr + 5, 3); 16847 } 16848 } 16849 16850 /* ARGSUSED */ 16851 static void 16852 ip_nodef_v6intfid(ill_t *ill, in6_addr_t *v6addr) 16853 { 16854 } 16855 16856 typedef struct ipmp_ifcookie { 16857 uint32_t ic_hostid; 16858 char ic_ifname[LIFNAMSIZ]; 16859 char ic_zonename[ZONENAME_MAX]; 16860 } ipmp_ifcookie_t; 16861 16862 /* 16863 * Construct a pseudo-random interface ID for the IPMP interface that's both 16864 * predictable and (almost) guaranteed to be unique. 16865 */ 16866 static void 16867 ip_ipmp_v6intfid(ill_t *ill, in6_addr_t *v6addr) 16868 { 16869 zone_t *zp; 16870 uint8_t *addr; 16871 uchar_t hash[16]; 16872 ulong_t hostid; 16873 MD5_CTX ctx; 16874 ipmp_ifcookie_t ic = { 0 }; 16875 16876 ASSERT(IS_IPMP(ill)); 16877 16878 (void) ddi_strtoul(hw_serial, NULL, 10, &hostid); 16879 ic.ic_hostid = htonl((uint32_t)hostid); 16880 16881 (void) strlcpy(ic.ic_ifname, ill->ill_name, LIFNAMSIZ); 16882 16883 if ((zp = zone_find_by_id(ill->ill_zoneid)) != NULL) { 16884 (void) strlcpy(ic.ic_zonename, zp->zone_name, ZONENAME_MAX); 16885 zone_rele(zp); 16886 } 16887 16888 MD5Init(&ctx); 16889 MD5Update(&ctx, &ic, sizeof (ic)); 16890 MD5Final(hash, &ctx); 16891 16892 /* 16893 * Map the hash to an interface ID per the basic approach in RFC3041. 16894 */ 16895 addr = &v6addr->s6_addr8[8]; 16896 bcopy(hash + 8, addr, sizeof (uint64_t)); 16897 addr[0] &= ~0x2; /* set local bit */ 16898 } 16899 16900 /* 16901 * Map the multicast in6_addr_t in m_ip6addr to the physaddr for ethernet. 
16902 */ 16903 static void 16904 ip_ether_v6_mapping(ill_t *ill, uchar_t *m_ip6addr, uchar_t *m_physaddr) 16905 { 16906 phyint_t *phyi = ill->ill_phyint; 16907 16908 /* 16909 * Check PHYI_MULTI_BCAST and length of physical 16910 * address to determine if we use the mapping or the 16911 * broadcast address. 16912 */ 16913 if ((phyi->phyint_flags & PHYI_MULTI_BCAST) != 0 || 16914 ill->ill_phys_addr_length != ETHERADDRL) { 16915 ip_mbcast_mapping(ill, m_ip6addr, m_physaddr); 16916 return; 16917 } 16918 m_physaddr[0] = 0x33; 16919 m_physaddr[1] = 0x33; 16920 m_physaddr[2] = m_ip6addr[12]; 16921 m_physaddr[3] = m_ip6addr[13]; 16922 m_physaddr[4] = m_ip6addr[14]; 16923 m_physaddr[5] = m_ip6addr[15]; 16924 } 16925 16926 /* 16927 * Map the multicast ipaddr_t in m_ipaddr to the physaddr for ethernet. 16928 */ 16929 static void 16930 ip_ether_v4_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr) 16931 { 16932 phyint_t *phyi = ill->ill_phyint; 16933 16934 /* 16935 * Check PHYI_MULTI_BCAST and length of physical 16936 * address to determine if we use the mapping or the 16937 * broadcast address. 16938 */ 16939 if ((phyi->phyint_flags & PHYI_MULTI_BCAST) != 0 || 16940 ill->ill_phys_addr_length != ETHERADDRL) { 16941 ip_mbcast_mapping(ill, m_ipaddr, m_physaddr); 16942 return; 16943 } 16944 m_physaddr[0] = 0x01; 16945 m_physaddr[1] = 0x00; 16946 m_physaddr[2] = 0x5e; 16947 m_physaddr[3] = m_ipaddr[1] & 0x7f; 16948 m_physaddr[4] = m_ipaddr[2]; 16949 m_physaddr[5] = m_ipaddr[3]; 16950 } 16951 16952 /* ARGSUSED */ 16953 static void 16954 ip_mbcast_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr) 16955 { 16956 /* 16957 * for the MULTI_BCAST case and other cases when we want to 16958 * use the link-layer broadcast address for multicast. 
16959 */ 16960 uint8_t *bphys_addr; 16961 dl_unitdata_req_t *dlur; 16962 16963 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr; 16964 if (ill->ill_sap_length < 0) { 16965 bphys_addr = (uchar_t *)dlur + 16966 dlur->dl_dest_addr_offset; 16967 } else { 16968 bphys_addr = (uchar_t *)dlur + 16969 dlur->dl_dest_addr_offset + ill->ill_sap_length; 16970 } 16971 16972 bcopy(bphys_addr, m_physaddr, ill->ill_phys_addr_length); 16973 } 16974 16975 /* 16976 * Derive IPoIB interface id from the link layer address. 16977 */ 16978 static void 16979 ip_ib_v6intfid(ill_t *ill, in6_addr_t *v6addr) 16980 { 16981 char *addr; 16982 16983 ASSERT(ill->ill_phys_addr_length == 20); 16984 addr = (char *)&v6addr->s6_addr32[2]; 16985 bcopy(ill->ill_phys_addr + 12, addr, 8); 16986 /* 16987 * In IBA 1.1 timeframe, some vendors erroneously set the u/l bit 16988 * in the globally assigned EUI-64 GUID to 1, in violation of IEEE 16989 * rules. In these cases, the IBA considers these GUIDs to be in 16990 * "Modified EUI-64" format, and thus toggling the u/l bit is not 16991 * required; vendors are required not to assign global EUI-64's 16992 * that differ only in u/l bit values, thus guaranteeing uniqueness 16993 * of the interface identifier. Whether the GUID is in modified 16994 * or proper EUI-64 format, the ipv6 identifier must have the u/l 16995 * bit set to 1. 16996 */ 16997 addr[0] |= 2; /* Set Universal/Local bit to 1 */ 16998 } 16999 17000 /* 17001 * Map the multicast ipaddr_t in m_ipaddr to the physaddr for InfiniBand. 17002 * Note on mapping from multicast IP addresses to IPoIB multicast link 17003 * addresses. IPoIB multicast link addresses are based on IBA link addresses. 17004 * The format of an IPoIB multicast address is: 17005 * 17006 * 4 byte QPN Scope Sign. 
Pkey 17007 * +--------------------------------------------+ 17008 * | 00FFFFFF | FF | 1X | X01B | Pkey | GroupID | 17009 * +--------------------------------------------+ 17010 * 17011 * The Scope and Pkey components are properties of the IBA port and 17012 * network interface. They can be ascertained from the broadcast address. 17013 * The Sign. part is the signature, and is 401B for IPv4 and 601B for IPv6. 17014 */ 17015 static void 17016 ip_ib_v4_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr) 17017 { 17018 static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff, 17019 0xff, 0x10, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, 17020 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; 17021 uint8_t *bphys_addr; 17022 dl_unitdata_req_t *dlur; 17023 17024 bcopy(ipv4_g_phys_ibmulti_addr, m_physaddr, ill->ill_phys_addr_length); 17025 17026 /* 17027 * RFC 4391: IPv4 MGID is 28-bit long. 17028 */ 17029 m_physaddr[16] = m_ipaddr[0] & 0x0f; 17030 m_physaddr[17] = m_ipaddr[1]; 17031 m_physaddr[18] = m_ipaddr[2]; 17032 m_physaddr[19] = m_ipaddr[3]; 17033 17034 17035 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr; 17036 if (ill->ill_sap_length < 0) { 17037 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset; 17038 } else { 17039 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset + 17040 ill->ill_sap_length; 17041 } 17042 /* 17043 * Now fill in the IBA scope/Pkey values from the broadcast address. 
17044 */ 17045 m_physaddr[5] = bphys_addr[5]; 17046 m_physaddr[8] = bphys_addr[8]; 17047 m_physaddr[9] = bphys_addr[9]; 17048 } 17049 17050 static void 17051 ip_ib_v6_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr) 17052 { 17053 static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff, 17054 0xff, 0x10, 0x60, 0x1b, 0x00, 0x00, 0x00, 0x00, 17055 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; 17056 uint8_t *bphys_addr; 17057 dl_unitdata_req_t *dlur; 17058 17059 bcopy(ipv4_g_phys_ibmulti_addr, m_physaddr, ill->ill_phys_addr_length); 17060 17061 /* 17062 * RFC 4391: IPv4 MGID is 80-bit long. 17063 */ 17064 bcopy(&m_ipaddr[6], &m_physaddr[10], 10); 17065 17066 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr; 17067 if (ill->ill_sap_length < 0) { 17068 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset; 17069 } else { 17070 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset + 17071 ill->ill_sap_length; 17072 } 17073 /* 17074 * Now fill in the IBA scope/Pkey values from the broadcast address. 17075 */ 17076 m_physaddr[5] = bphys_addr[5]; 17077 m_physaddr[8] = bphys_addr[8]; 17078 m_physaddr[9] = bphys_addr[9]; 17079 } 17080 17081 /* 17082 * Derive IPv6 interface id from an IPv4 link-layer address (e.g. from an IPv4 17083 * tunnel). The IPv4 address simply get placed in the lower 4 bytes of the 17084 * IPv6 interface id. This is a suggested mechanism described in section 3.7 17085 * of RFC4213. 17086 */ 17087 static void 17088 ip_ipv4_genv6intfid(ill_t *ill, uint8_t *physaddr, in6_addr_t *v6addr) 17089 { 17090 ASSERT(ill->ill_phys_addr_length == sizeof (ipaddr_t)); 17091 v6addr->s6_addr32[2] = 0; 17092 bcopy(physaddr, &v6addr->s6_addr32[3], sizeof (ipaddr_t)); 17093 } 17094 17095 /* 17096 * Derive IPv6 interface id from an IPv6 link-layer address (e.g. from an IPv6 17097 * tunnel). The lower 8 bytes of the IPv6 address simply become the interface 17098 * id. 
17099 */ 17100 static void 17101 ip_ipv6_genv6intfid(ill_t *ill, uint8_t *physaddr, in6_addr_t *v6addr) 17102 { 17103 in6_addr_t *v6lladdr = (in6_addr_t *)physaddr; 17104 17105 ASSERT(ill->ill_phys_addr_length == sizeof (in6_addr_t)); 17106 bcopy(&v6lladdr->s6_addr32[2], &v6addr->s6_addr32[2], 8); 17107 } 17108 17109 static void 17110 ip_ipv6_v6intfid(ill_t *ill, in6_addr_t *v6addr) 17111 { 17112 ip_ipv6_genv6intfid(ill, ill->ill_phys_addr, v6addr); 17113 } 17114 17115 static void 17116 ip_ipv6_v6destintfid(ill_t *ill, in6_addr_t *v6addr) 17117 { 17118 ip_ipv6_genv6intfid(ill, ill->ill_dest_addr, v6addr); 17119 } 17120 17121 static void 17122 ip_ipv4_v6intfid(ill_t *ill, in6_addr_t *v6addr) 17123 { 17124 ip_ipv4_genv6intfid(ill, ill->ill_phys_addr, v6addr); 17125 } 17126 17127 static void 17128 ip_ipv4_v6destintfid(ill_t *ill, in6_addr_t *v6addr) 17129 { 17130 ip_ipv4_genv6intfid(ill, ill->ill_dest_addr, v6addr); 17131 } 17132 17133 /* 17134 * Lookup an ill and verify that the zoneid has an ipif on that ill. 17135 * Returns an held ill, or NULL. 17136 */ 17137 ill_t * 17138 ill_lookup_on_ifindex_zoneid(uint_t index, zoneid_t zoneid, boolean_t isv6, 17139 ip_stack_t *ipst) 17140 { 17141 ill_t *ill; 17142 ipif_t *ipif; 17143 17144 ill = ill_lookup_on_ifindex(index, isv6, ipst); 17145 if (ill == NULL) 17146 return (NULL); 17147 17148 mutex_enter(&ill->ill_lock); 17149 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 17150 if (IPIF_IS_CONDEMNED(ipif)) 17151 continue; 17152 if (zoneid != ALL_ZONES && ipif->ipif_zoneid != zoneid && 17153 ipif->ipif_zoneid != ALL_ZONES) 17154 continue; 17155 17156 mutex_exit(&ill->ill_lock); 17157 return (ill); 17158 } 17159 mutex_exit(&ill->ill_lock); 17160 ill_refrele(ill); 17161 return (NULL); 17162 } 17163 17164 /* 17165 * Return a pointer to an ipif_t given a combination of (ill_idx,ipif_id) 17166 * If a pointer to an ipif_t is returned then the caller will need to do 17167 * an ill_refrele(). 
17168 */ 17169 ipif_t * 17170 ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6, 17171 ip_stack_t *ipst) 17172 { 17173 ipif_t *ipif; 17174 ill_t *ill; 17175 17176 ill = ill_lookup_on_ifindex(ifindex, isv6, ipst); 17177 if (ill == NULL) 17178 return (NULL); 17179 17180 mutex_enter(&ill->ill_lock); 17181 if (ill->ill_state_flags & ILL_CONDEMNED) { 17182 mutex_exit(&ill->ill_lock); 17183 ill_refrele(ill); 17184 return (NULL); 17185 } 17186 17187 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 17188 if (!IPIF_CAN_LOOKUP(ipif)) 17189 continue; 17190 if (lifidx == ipif->ipif_id) { 17191 ipif_refhold_locked(ipif); 17192 break; 17193 } 17194 } 17195 17196 mutex_exit(&ill->ill_lock); 17197 ill_refrele(ill); 17198 return (ipif); 17199 } 17200 17201 /* 17202 * Set ill_inputfn based on the current know state. 17203 * This needs to be called when any of the factors taken into 17204 * account changes. 17205 */ 17206 void 17207 ill_set_inputfn(ill_t *ill) 17208 { 17209 ip_stack_t *ipst = ill->ill_ipst; 17210 17211 if (ill->ill_isv6) { 17212 if (is_system_labeled()) 17213 ill->ill_inputfn = ill_input_full_v6; 17214 else 17215 ill->ill_inputfn = ill_input_short_v6; 17216 } else { 17217 if (is_system_labeled()) 17218 ill->ill_inputfn = ill_input_full_v4; 17219 else if (ill->ill_dhcpinit != 0) 17220 ill->ill_inputfn = ill_input_full_v4; 17221 else if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head 17222 != NULL) 17223 ill->ill_inputfn = ill_input_full_v4; 17224 else if (ipst->ips_ip_cgtp_filter && 17225 ipst->ips_ip_cgtp_filter_ops != NULL) 17226 ill->ill_inputfn = ill_input_full_v4; 17227 else 17228 ill->ill_inputfn = ill_input_short_v4; 17229 } 17230 } 17231 17232 /* 17233 * Re-evaluate ill_inputfn for all the IPv4 ills. 17234 * Used when RSVP and CGTP comes and goes. 
17235 */ 17236 void 17237 ill_set_inputfn_all(ip_stack_t *ipst) 17238 { 17239 ill_walk_context_t ctx; 17240 ill_t *ill; 17241 17242 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 17243 ill = ILL_START_WALK_V4(&ctx, ipst); 17244 for (; ill != NULL; ill = ill_next(&ctx, ill)) 17245 ill_set_inputfn(ill); 17246 17247 rw_exit(&ipst->ips_ill_g_lock); 17248 } 17249 17250 /* 17251 * Set the physical address information for `ill' to the contents of the 17252 * dl_notify_ind_t pointed to by `mp'. Must be called as writer, and will be 17253 * asynchronous if `ill' cannot immediately be quiesced -- in which case 17254 * EINPROGRESS will be returned. 17255 */ 17256 int 17257 ill_set_phys_addr(ill_t *ill, mblk_t *mp) 17258 { 17259 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 17260 dl_notify_ind_t *dlindp = (dl_notify_ind_t *)mp->b_rptr; 17261 17262 ASSERT(IAM_WRITER_IPSQ(ipsq)); 17263 17264 if (dlindp->dl_data != DL_IPV6_LINK_LAYER_ADDR && 17265 dlindp->dl_data != DL_CURR_DEST_ADDR && 17266 dlindp->dl_data != DL_CURR_PHYS_ADDR) { 17267 /* Changing DL_IPV6_TOKEN is not yet supported */ 17268 return (0); 17269 } 17270 17271 /* 17272 * We need to store up to two copies of `mp' in `ill'. Due to the 17273 * design of ipsq_pending_mp_add(), we can't pass them as separate 17274 * arguments to ill_set_phys_addr_tail(). Instead, chain them 17275 * together here, then pull 'em apart in ill_set_phys_addr_tail(). 17276 */ 17277 if ((mp = copyb(mp)) == NULL || (mp->b_cont = copyb(mp)) == NULL) { 17278 freemsg(mp); 17279 return (ENOMEM); 17280 } 17281 17282 ipsq_current_start(ipsq, ill->ill_ipif, 0); 17283 mutex_enter(&ill->ill_lock); 17284 ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS; 17285 /* no more nce addition allowed */ 17286 mutex_exit(&ill->ill_lock); 17287 17288 /* 17289 * If we can quiesce the ill, then set the address. If not, then 17290 * ill_set_phys_addr_tail() will be called from ipif_ill_refrele_tail(). 
17291 */ 17292 ill_down_ipifs(ill, B_TRUE); 17293 mutex_enter(&ill->ill_lock); 17294 if (!ill_is_quiescent(ill)) { 17295 /* call cannot fail since `conn_t *' argument is NULL */ 17296 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq, 17297 mp, ILL_DOWN); 17298 mutex_exit(&ill->ill_lock); 17299 return (EINPROGRESS); 17300 } 17301 mutex_exit(&ill->ill_lock); 17302 17303 ill_set_phys_addr_tail(ipsq, ill->ill_rq, mp, NULL); 17304 return (0); 17305 } 17306 17307 /* 17308 * Once the ill associated with `q' has quiesced, set its physical address 17309 * information to the values in `addrmp'. Note that two copies of `addrmp' 17310 * are passed (linked by b_cont), since we sometimes need to save two distinct 17311 * copies in the ill_t, and our context doesn't permit sleeping or allocation 17312 * failure (we'll free the other copy if it's not needed). Since the ill_t 17313 * is quiesced, we know any stale nce's with the old address information have 17314 * already been removed, so we don't need to call nce_flush(). 
17315 */ 17316 /* ARGSUSED */ 17317 static void 17318 ill_set_phys_addr_tail(ipsq_t *ipsq, queue_t *q, mblk_t *addrmp, void *dummy) 17319 { 17320 ill_t *ill = q->q_ptr; 17321 mblk_t *addrmp2 = unlinkb(addrmp); 17322 dl_notify_ind_t *dlindp = (dl_notify_ind_t *)addrmp->b_rptr; 17323 uint_t addrlen, addroff; 17324 int status; 17325 17326 ASSERT(IAM_WRITER_IPSQ(ipsq)); 17327 17328 addroff = dlindp->dl_addr_offset; 17329 addrlen = dlindp->dl_addr_length - ABS(ill->ill_sap_length); 17330 17331 switch (dlindp->dl_data) { 17332 case DL_IPV6_LINK_LAYER_ADDR: 17333 ill_set_ndmp(ill, addrmp, addroff, addrlen); 17334 freemsg(addrmp2); 17335 break; 17336 17337 case DL_CURR_DEST_ADDR: 17338 freemsg(ill->ill_dest_addr_mp); 17339 ill->ill_dest_addr = addrmp->b_rptr + addroff; 17340 ill->ill_dest_addr_mp = addrmp; 17341 if (ill->ill_isv6) { 17342 ill_setdesttoken(ill); 17343 ipif_setdestlinklocal(ill->ill_ipif); 17344 } 17345 freemsg(addrmp2); 17346 break; 17347 17348 case DL_CURR_PHYS_ADDR: 17349 freemsg(ill->ill_phys_addr_mp); 17350 ill->ill_phys_addr = addrmp->b_rptr + addroff; 17351 ill->ill_phys_addr_mp = addrmp; 17352 ill->ill_phys_addr_length = addrlen; 17353 if (ill->ill_isv6) 17354 ill_set_ndmp(ill, addrmp2, addroff, addrlen); 17355 else 17356 freemsg(addrmp2); 17357 if (ill->ill_isv6) { 17358 ill_setdefaulttoken(ill); 17359 ipif_setlinklocal(ill->ill_ipif); 17360 } 17361 break; 17362 default: 17363 ASSERT(0); 17364 } 17365 17366 /* 17367 * If there are ipifs to bring up, ill_up_ipifs() will return 17368 * EINPROGRESS, and ipsq_current_finish() will be called by 17369 * ip_rput_dlpi_writer() or arp_bringup_done() when the last ipif is 17370 * brought up. 
17371 */ 17372 status = ill_up_ipifs(ill, q, addrmp); 17373 mutex_enter(&ill->ill_lock); 17374 if (ill->ill_dl_up) 17375 ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS; 17376 mutex_exit(&ill->ill_lock); 17377 if (status != EINPROGRESS) 17378 ipsq_current_finish(ipsq); 17379 } 17380 17381 /* 17382 * Helper routine for setting the ill_nd_lla fields. 17383 */ 17384 void 17385 ill_set_ndmp(ill_t *ill, mblk_t *ndmp, uint_t addroff, uint_t addrlen) 17386 { 17387 freemsg(ill->ill_nd_lla_mp); 17388 ill->ill_nd_lla = ndmp->b_rptr + addroff; 17389 ill->ill_nd_lla_mp = ndmp; 17390 ill->ill_nd_lla_len = addrlen; 17391 } 17392 17393 /* 17394 * Replumb the ill. 17395 */ 17396 int 17397 ill_replumb(ill_t *ill, mblk_t *mp) 17398 { 17399 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 17400 17401 ASSERT(IAM_WRITER_IPSQ(ipsq)); 17402 17403 ipsq_current_start(ipsq, ill->ill_ipif, 0); 17404 17405 mutex_enter(&ill->ill_lock); 17406 ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS; 17407 /* no more nce addition allowed */ 17408 mutex_exit(&ill->ill_lock); 17409 17410 /* 17411 * If we can quiesce the ill, then continue. If not, then 17412 * ill_replumb_tail() will be called from ipif_ill_refrele_tail(). 
17413 */ 17414 ill_down_ipifs(ill, B_FALSE); 17415 17416 mutex_enter(&ill->ill_lock); 17417 if (!ill_is_quiescent(ill)) { 17418 /* call cannot fail since `conn_t *' argument is NULL */ 17419 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq, 17420 mp, ILL_DOWN); 17421 mutex_exit(&ill->ill_lock); 17422 return (EINPROGRESS); 17423 } 17424 mutex_exit(&ill->ill_lock); 17425 17426 ill_replumb_tail(ipsq, ill->ill_rq, mp, NULL); 17427 return (0); 17428 } 17429 17430 /* ARGSUSED */ 17431 static void 17432 ill_replumb_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy) 17433 { 17434 ill_t *ill = q->q_ptr; 17435 int err; 17436 conn_t *connp = NULL; 17437 17438 ASSERT(IAM_WRITER_IPSQ(ipsq)); 17439 freemsg(ill->ill_replumb_mp); 17440 ill->ill_replumb_mp = copyb(mp); 17441 17442 if (ill->ill_replumb_mp == NULL) { 17443 /* out of memory */ 17444 ipsq_current_finish(ipsq); 17445 return; 17446 } 17447 17448 mutex_enter(&ill->ill_lock); 17449 ill->ill_up_ipifs = ipsq_pending_mp_add(NULL, ill->ill_ipif, 17450 ill->ill_rq, ill->ill_replumb_mp, 0); 17451 mutex_exit(&ill->ill_lock); 17452 17453 if (!ill->ill_up_ipifs) { 17454 /* already closing */ 17455 ipsq_current_finish(ipsq); 17456 return; 17457 } 17458 ill->ill_replumbing = 1; 17459 err = ill_down_ipifs_tail(ill); 17460 17461 /* 17462 * Successfully quiesced and brought down the interface, now we send 17463 * the DL_NOTE_REPLUMB_DONE message down to the driver. Reuse the 17464 * DL_NOTE_REPLUMB message. 17465 */ 17466 mp = mexchange(NULL, mp, sizeof (dl_notify_conf_t), M_PROTO, 17467 DL_NOTIFY_CONF); 17468 ASSERT(mp != NULL); 17469 ((dl_notify_conf_t *)mp->b_rptr)->dl_notification = 17470 DL_NOTE_REPLUMB_DONE; 17471 ill_dlpi_send(ill, mp); 17472 17473 /* 17474 * For IPv4, we would usually get EINPROGRESS because the ETHERTYPE_ARP 17475 * streams have to be unbound. When all the DLPI exchanges are done, 17476 * ipsq_current_finish() will be called by arp_bringup_done(). 
The 17477 * remainder of ipif bringup via ill_up_ipifs() will also be done in 17478 * arp_bringup_done(). 17479 */ 17480 ASSERT(ill->ill_replumb_mp != NULL); 17481 if (err == EINPROGRESS) 17482 return; 17483 else 17484 ill->ill_replumb_mp = ipsq_pending_mp_get(ipsq, &connp); 17485 ASSERT(connp == NULL); 17486 if (err == 0 && ill->ill_replumb_mp != NULL && 17487 ill_up_ipifs(ill, q, ill->ill_replumb_mp) == EINPROGRESS) { 17488 return; 17489 } 17490 ipsq_current_finish(ipsq); 17491 } 17492 17493 /* 17494 * Issue ioctl `cmd' on `lh'; caller provides the initial payload in `buf' 17495 * which is `bufsize' bytes. On success, zero is returned and `buf' updated 17496 * as per the ioctl. On failure, an errno is returned. 17497 */ 17498 static int 17499 ip_ioctl(ldi_handle_t lh, int cmd, void *buf, uint_t bufsize, cred_t *cr) 17500 { 17501 int rval; 17502 struct strioctl iocb; 17503 17504 iocb.ic_cmd = cmd; 17505 iocb.ic_timout = 15; 17506 iocb.ic_len = bufsize; 17507 iocb.ic_dp = buf; 17508 17509 return (ldi_ioctl(lh, I_STR, (intptr_t)&iocb, FKIOCTL, cr, &rval)); 17510 } 17511 17512 /* 17513 * Issue an SIOCGLIFCONF for address family `af' and store the result into a 17514 * dynamically-allocated `lifcp' that will be `bufsizep' bytes on success. 17515 */ 17516 static int 17517 ip_lifconf_ioctl(ldi_handle_t lh, int af, struct lifconf *lifcp, 17518 uint_t *bufsizep, cred_t *cr) 17519 { 17520 int err; 17521 struct lifnum lifn; 17522 17523 bzero(&lifn, sizeof (lifn)); 17524 lifn.lifn_family = af; 17525 lifn.lifn_flags = LIFC_UNDER_IPMP; 17526 17527 if ((err = ip_ioctl(lh, SIOCGLIFNUM, &lifn, sizeof (lifn), cr)) != 0) 17528 return (err); 17529 17530 /* 17531 * Pad the interface count to account for additional interfaces that 17532 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF. 
17533 */ 17534 lifn.lifn_count += 4; 17535 bzero(lifcp, sizeof (*lifcp)); 17536 lifcp->lifc_flags = LIFC_UNDER_IPMP; 17537 lifcp->lifc_family = af; 17538 lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq); 17539 lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP); 17540 17541 err = ip_ioctl(lh, SIOCGLIFCONF, lifcp, sizeof (*lifcp), cr); 17542 if (err != 0) { 17543 kmem_free(lifcp->lifc_buf, *bufsizep); 17544 return (err); 17545 } 17546 17547 return (0); 17548 } 17549 17550 /* 17551 * Helper for ip_interface_cleanup() that removes the loopback interface. 17552 */ 17553 static void 17554 ip_loopback_removeif(ldi_handle_t lh, boolean_t isv6, cred_t *cr) 17555 { 17556 int err; 17557 struct lifreq lifr; 17558 17559 bzero(&lifr, sizeof (lifr)); 17560 (void) strcpy(lifr.lifr_name, ipif_loopback_name); 17561 17562 err = ip_ioctl(lh, SIOCLIFREMOVEIF, &lifr, sizeof (lifr), cr); 17563 if (err != 0) { 17564 ip0dbg(("ip_loopback_removeif: IP%s SIOCLIFREMOVEIF failed: " 17565 "error %d\n", isv6 ? "v6" : "v4", err)); 17566 } 17567 } 17568 17569 /* 17570 * Helper for ip_interface_cleanup() that ensures no IP interfaces are in IPMP 17571 * groups and that IPMP data addresses are down. These conditions must be met 17572 * so that IPMP interfaces can be I_PUNLINK'd, as per ip_sioctl_plink_ipmp(). 17573 */ 17574 static void 17575 ip_ipmp_cleanup(ldi_handle_t lh, boolean_t isv6, cred_t *cr) 17576 { 17577 int af = isv6 ? 
AF_INET6 : AF_INET; 17578 int i, nifs; 17579 int err; 17580 uint_t bufsize; 17581 uint_t lifrsize = sizeof (struct lifreq); 17582 struct lifconf lifc; 17583 struct lifreq *lifrp; 17584 17585 if ((err = ip_lifconf_ioctl(lh, af, &lifc, &bufsize, cr)) != 0) { 17586 cmn_err(CE_WARN, "ip_ipmp_cleanup: cannot get interface list " 17587 "(error %d); any IPMP interfaces cannot be shutdown", err); 17588 return; 17589 } 17590 17591 nifs = lifc.lifc_len / lifrsize; 17592 for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) { 17593 err = ip_ioctl(lh, SIOCGLIFFLAGS, lifrp, lifrsize, cr); 17594 if (err != 0) { 17595 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot get " 17596 "flags: error %d", lifrp->lifr_name, err); 17597 continue; 17598 } 17599 17600 if (lifrp->lifr_flags & IFF_IPMP) { 17601 if ((lifrp->lifr_flags & (IFF_UP|IFF_DUPLICATE)) == 0) 17602 continue; 17603 17604 lifrp->lifr_flags &= ~IFF_UP; 17605 err = ip_ioctl(lh, SIOCSLIFFLAGS, lifrp, lifrsize, cr); 17606 if (err != 0) { 17607 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot " 17608 "bring down (error %d); IPMP interface may " 17609 "not be shutdown", lifrp->lifr_name, err); 17610 } 17611 17612 /* 17613 * Check if IFF_DUPLICATE is still set -- and if so, 17614 * reset the address to clear it. 
17615 */ 17616 err = ip_ioctl(lh, SIOCGLIFFLAGS, lifrp, lifrsize, cr); 17617 if (err != 0 || !(lifrp->lifr_flags & IFF_DUPLICATE)) 17618 continue; 17619 17620 err = ip_ioctl(lh, SIOCGLIFADDR, lifrp, lifrsize, cr); 17621 if (err != 0 || (err = ip_ioctl(lh, SIOCGLIFADDR, 17622 lifrp, lifrsize, cr)) != 0) { 17623 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot " 17624 "reset DAD (error %d); IPMP interface may " 17625 "not be shutdown", lifrp->lifr_name, err); 17626 } 17627 continue; 17628 } 17629 17630 lifrp->lifr_groupname[0] = '\0'; 17631 err = ip_ioctl(lh, SIOCSLIFGROUPNAME, lifrp, lifrsize, cr); 17632 if (err != 0) { 17633 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot leave " 17634 "IPMP group (error %d); associated IPMP interface " 17635 "may not be shutdown", lifrp->lifr_name, err); 17636 continue; 17637 } 17638 } 17639 17640 kmem_free(lifc.lifc_buf, bufsize); 17641 } 17642 17643 #define UDPDEV "/devices/pseudo/udp@0:udp" 17644 #define UDP6DEV "/devices/pseudo/udp6@0:udp6" 17645 17646 /* 17647 * Remove the loopback interfaces and prep the IPMP interfaces to be torn down. 17648 * Non-loopback interfaces are either I_LINK'd or I_PLINK'd; the former go away 17649 * when the user-level processes in the zone are killed and the latter are 17650 * cleaned up by str_stack_shutdown(). 17651 */ 17652 void 17653 ip_interface_cleanup(ip_stack_t *ipst) 17654 { 17655 ldi_handle_t lh; 17656 ldi_ident_t li; 17657 cred_t *cr; 17658 int err; 17659 int i; 17660 char *devs[] = { UDP6DEV, UDPDEV }; 17661 netstackid_t stackid = ipst->ips_netstack->netstack_stackid; 17662 17663 if ((err = ldi_ident_from_major(ddi_name_to_major("ip"), &li)) != 0) { 17664 cmn_err(CE_WARN, "ip_interface_cleanup: cannot get ldi ident:" 17665 " error %d", err); 17666 return; 17667 } 17668 17669 cr = zone_get_kcred(netstackid_to_zoneid(stackid)); 17670 ASSERT(cr != NULL); 17671 17672 /* 17673 * NOTE: loop executes exactly twice and is hardcoded to know that the 17674 * first iteration is IPv6. 
(Unrolling yields repetitious code, hence 17675 * the loop.) 17676 */ 17677 for (i = 0; i < 2; i++) { 17678 err = ldi_open_by_name(devs[i], FREAD|FWRITE, cr, &lh, li); 17679 if (err != 0) { 17680 cmn_err(CE_WARN, "ip_interface_cleanup: cannot open %s:" 17681 " error %d", devs[i], err); 17682 continue; 17683 } 17684 17685 ip_loopback_removeif(lh, i == 0, cr); 17686 ip_ipmp_cleanup(lh, i == 0, cr); 17687 17688 (void) ldi_close(lh, FREAD|FWRITE, cr); 17689 } 17690 17691 ldi_ident_release(li); 17692 crfree(cr); 17693 } 17694 17695 /* 17696 * This needs to be in-sync with nic_event_t definition 17697 */ 17698 static const char * 17699 ill_hook_event2str(nic_event_t event) 17700 { 17701 switch (event) { 17702 case NE_PLUMB: 17703 return ("PLUMB"); 17704 case NE_UNPLUMB: 17705 return ("UNPLUMB"); 17706 case NE_UP: 17707 return ("UP"); 17708 case NE_DOWN: 17709 return ("DOWN"); 17710 case NE_ADDRESS_CHANGE: 17711 return ("ADDRESS_CHANGE"); 17712 case NE_LIF_UP: 17713 return ("LIF_UP"); 17714 case NE_LIF_DOWN: 17715 return ("LIF_DOWN"); 17716 case NE_IFINDEX_CHANGE: 17717 return ("IFINDEX_CHANGE"); 17718 default: 17719 return ("UNKNOWN"); 17720 } 17721 } 17722 17723 void 17724 ill_nic_event_dispatch(ill_t *ill, lif_if_t lif, nic_event_t event, 17725 nic_event_data_t data, size_t datalen) 17726 { 17727 ip_stack_t *ipst = ill->ill_ipst; 17728 hook_nic_event_int_t *info; 17729 const char *str = NULL; 17730 17731 /* create a new nic event info */ 17732 if ((info = kmem_alloc(sizeof (*info), KM_NOSLEEP)) == NULL) 17733 goto fail; 17734 17735 info->hnei_event.hne_nic = ill->ill_phyint->phyint_ifindex; 17736 info->hnei_event.hne_lif = lif; 17737 info->hnei_event.hne_event = event; 17738 info->hnei_event.hne_protocol = ill->ill_isv6 ? 
17739 ipst->ips_ipv6_net_data : ipst->ips_ipv4_net_data; 17740 info->hnei_event.hne_data = NULL; 17741 info->hnei_event.hne_datalen = 0; 17742 info->hnei_stackid = ipst->ips_netstack->netstack_stackid; 17743 17744 if (data != NULL && datalen != 0) { 17745 info->hnei_event.hne_data = kmem_alloc(datalen, KM_NOSLEEP); 17746 if (info->hnei_event.hne_data == NULL) 17747 goto fail; 17748 bcopy(data, info->hnei_event.hne_data, datalen); 17749 info->hnei_event.hne_datalen = datalen; 17750 } 17751 17752 if (ddi_taskq_dispatch(eventq_queue_nic, ip_ne_queue_func, info, 17753 DDI_NOSLEEP) == DDI_SUCCESS) 17754 return; 17755 17756 fail: 17757 if (info != NULL) { 17758 if (info->hnei_event.hne_data != NULL) { 17759 kmem_free(info->hnei_event.hne_data, 17760 info->hnei_event.hne_datalen); 17761 } 17762 kmem_free(info, sizeof (hook_nic_event_t)); 17763 } 17764 str = ill_hook_event2str(event); 17765 ip2dbg(("ill_nic_event_dispatch: could not dispatch %s nic event " 17766 "information for %s (ENOMEM)\n", str, ill->ill_name)); 17767 } 17768 17769 static int 17770 ipif_arp_up_done_tail(ipif_t *ipif, enum ip_resolver_action res_act) 17771 { 17772 int err = 0; 17773 const in_addr_t *addr = NULL; 17774 nce_t *nce = NULL; 17775 ill_t *ill = ipif->ipif_ill; 17776 ill_t *bound_ill; 17777 boolean_t added_ipif = B_FALSE; 17778 uint16_t state; 17779 uint16_t flags; 17780 17781 DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_up_done_tail", 17782 ill_t *, ill, ipif_t *, ipif); 17783 if (ipif->ipif_lcl_addr != INADDR_ANY) { 17784 addr = &ipif->ipif_lcl_addr; 17785 } 17786 17787 if ((ipif->ipif_flags & IPIF_UNNUMBERED) || addr == NULL) { 17788 if (res_act != Res_act_initial) 17789 return (EINVAL); 17790 } 17791 17792 if (addr != NULL) { 17793 ipmp_illgrp_t *illg = ill->ill_grp; 17794 17795 /* add unicast nce for the local addr */ 17796 17797 if (IS_IPMP(ill)) { 17798 /* 17799 * If we're here via ipif_up(), then the ipif 17800 * won't be bound yet -- add it to the group, 17801 * which will bind it if 
possible. (We would 17802 * add it in ipif_up(), but deleting on failure 17803 * there is gruesome.) If we're here via 17804 * ipmp_ill_bind_ipif(), then the ipif has 17805 * already been added to the group and we 17806 * just need to use the binding. 17807 */ 17808 if ((bound_ill = ipmp_ipif_bound_ill(ipif)) == NULL) { 17809 bound_ill = ipmp_illgrp_add_ipif(illg, ipif); 17810 if (bound_ill == NULL) { 17811 /* 17812 * We couldn't bind the ipif to an ill 17813 * yet, so we have nothing to publish. 17814 * Mark the address as ready and return. 17815 */ 17816 ipif->ipif_addr_ready = 1; 17817 return (0); 17818 } 17819 added_ipif = B_TRUE; 17820 } 17821 } else { 17822 bound_ill = ill; 17823 } 17824 17825 flags = (NCE_F_MYADDR | NCE_F_PUBLISH | NCE_F_AUTHORITY | 17826 NCE_F_NONUD); 17827 /* 17828 * If this is an initial bring-up (or the ipif was never 17829 * completely brought up), do DAD. Otherwise, we're here 17830 * because IPMP has rebound an address to this ill: send 17831 * unsolicited advertisements (ARP announcements) to 17832 * inform others. 17833 */ 17834 if (res_act == Res_act_initial || !ipif->ipif_addr_ready) { 17835 state = ND_UNCHANGED; /* compute in nce_add_common() */ 17836 } else { 17837 state = ND_REACHABLE; 17838 flags |= NCE_F_UNSOL_ADV; 17839 } 17840 17841 retry: 17842 err = nce_lookup_then_add_v4(ill, 17843 bound_ill->ill_phys_addr, bound_ill->ill_phys_addr_length, 17844 addr, flags, state, &nce); 17845 17846 /* 17847 * note that we may encounter EEXIST if we are moving 17848 * the nce as a result of a rebind operation. 
17849 */ 17850 switch (err) { 17851 case 0: 17852 ipif->ipif_added_nce = 1; 17853 nce->nce_ipif_cnt++; 17854 break; 17855 case EEXIST: 17856 ip1dbg(("ipif_arp_up: NCE already exists for %s\n", 17857 ill->ill_name)); 17858 if (!NCE_MYADDR(nce->nce_common)) { 17859 /* 17860 * A leftover nce from before this address 17861 * existed 17862 */ 17863 ncec_delete(nce->nce_common); 17864 nce_refrele(nce); 17865 nce = NULL; 17866 goto retry; 17867 } 17868 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 17869 nce_refrele(nce); 17870 nce = NULL; 17871 ip1dbg(("ipif_arp_up: NCE already exists " 17872 "for %s:%u\n", ill->ill_name, 17873 ipif->ipif_id)); 17874 goto arp_up_done; 17875 } 17876 /* 17877 * Duplicate local addresses are permissible for 17878 * IPIF_POINTOPOINT interfaces which will get marked 17879 * IPIF_UNNUMBERED later in 17880 * ip_addr_availability_check(). 17881 * 17882 * The nce_ipif_cnt field tracks the number of 17883 * ipifs that have nce_addr as their local address. 17884 */ 17885 ipif->ipif_addr_ready = 1; 17886 ipif->ipif_added_nce = 1; 17887 nce->nce_ipif_cnt++; 17888 err = 0; 17889 break; 17890 default: 17891 ASSERT(nce == NULL); 17892 goto arp_up_done; 17893 } 17894 if (arp_no_defense) { 17895 if ((ipif->ipif_flags & IPIF_UP) && 17896 !ipif->ipif_addr_ready) 17897 ipif_up_notify(ipif); 17898 ipif->ipif_addr_ready = 1; 17899 } 17900 } else { 17901 /* zero address. 
nothing to publish */ 17902 ipif->ipif_addr_ready = 1; 17903 } 17904 if (nce != NULL) 17905 nce_refrele(nce); 17906 arp_up_done: 17907 if (added_ipif && err != 0) 17908 ipmp_illgrp_del_ipif(ill->ill_grp, ipif); 17909 return (err); 17910 } 17911 17912 int 17913 ipif_arp_up(ipif_t *ipif, enum ip_resolver_action res_act, boolean_t was_dup) 17914 { 17915 int err = 0; 17916 ill_t *ill = ipif->ipif_ill; 17917 boolean_t first_interface, wait_for_dlpi = B_FALSE; 17918 17919 DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_up", 17920 ill_t *, ill, ipif_t *, ipif); 17921 17922 /* 17923 * need to bring up ARP or setup mcast mapping only 17924 * when the first interface is coming UP. 17925 */ 17926 first_interface = (ill->ill_ipif_up_count == 0 && 17927 ill->ill_ipif_dup_count == 0 && !was_dup); 17928 17929 if (res_act == Res_act_initial && first_interface) { 17930 /* 17931 * Send ATTACH + BIND 17932 */ 17933 err = arp_ll_up(ill); 17934 if (err != EINPROGRESS && err != 0) 17935 return (err); 17936 17937 /* 17938 * Add NCE for local address. Start DAD. 17939 * we'll wait to hear that DAD has finished 17940 * before using the interface. 17941 */ 17942 if (err == EINPROGRESS) 17943 wait_for_dlpi = B_TRUE; 17944 } 17945 17946 if (!wait_for_dlpi) 17947 (void) ipif_arp_up_done_tail(ipif, res_act); 17948 17949 return (!wait_for_dlpi ? 0 : EINPROGRESS); 17950 } 17951 17952 /* 17953 * Finish processing of "arp_up" after all the DLPI message 17954 * exchanges have completed between arp and the driver. 
17955 */ 17956 void 17957 arp_bringup_done(ill_t *ill, int err) 17958 { 17959 mblk_t *mp1; 17960 ipif_t *ipif; 17961 conn_t *connp = NULL; 17962 ipsq_t *ipsq; 17963 queue_t *q; 17964 17965 ip1dbg(("arp_bringup_done(%s)\n", ill->ill_name)); 17966 17967 ASSERT(IAM_WRITER_ILL(ill)); 17968 17969 ipsq = ill->ill_phyint->phyint_ipsq; 17970 ipif = ipsq->ipsq_xop->ipx_pending_ipif; 17971 mp1 = ipsq_pending_mp_get(ipsq, &connp); 17972 ASSERT(!((mp1 != NULL) ^ (ipif != NULL))); 17973 if (mp1 == NULL) /* bringup was aborted by the user */ 17974 return; 17975 17976 /* 17977 * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we 17978 * must have an associated conn_t. Otherwise, we're bringing this 17979 * interface back up as part of handling an asynchronous event (e.g., 17980 * physical address change). 17981 */ 17982 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) { 17983 ASSERT(connp != NULL); 17984 q = CONNP_TO_WQ(connp); 17985 } else { 17986 ASSERT(connp == NULL); 17987 q = ill->ill_rq; 17988 } 17989 if (err == 0) { 17990 if (ipif->ipif_isv6) { 17991 if ((err = ipif_up_done_v6(ipif)) != 0) 17992 ip0dbg(("arp_bringup_done: init failed\n")); 17993 } else { 17994 err = ipif_arp_up_done_tail(ipif, Res_act_initial); 17995 if (err != 0 || 17996 (err = ipif_up_done(ipif)) != 0) { 17997 ip0dbg(("arp_bringup_done: " 17998 "init failed err %x\n", err)); 17999 (void) ipif_arp_down(ipif); 18000 } 18001 18002 } 18003 } else { 18004 ip0dbg(("arp_bringup_done: DL_BIND_REQ failed\n")); 18005 } 18006 18007 if ((err == 0) && (ill->ill_up_ipifs)) { 18008 err = ill_up_ipifs(ill, q, mp1); 18009 if (err == EINPROGRESS) 18010 return; 18011 } 18012 18013 /* 18014 * If we have a moved ipif to bring up, and everything has succeeded 18015 * to this point, bring it up on the IPMP ill. Otherwise, leave it 18016 * down -- the admin can try to bring it up by hand if need be. 
18017 */ 18018 if (ill->ill_move_ipif != NULL) { 18019 ipif = ill->ill_move_ipif; 18020 ip1dbg(("bringing up ipif %p on ill %s\n", (void *)ipif, 18021 ipif->ipif_ill->ill_name)); 18022 ill->ill_move_ipif = NULL; 18023 if (err == 0) { 18024 err = ipif_up(ipif, q, mp1); 18025 if (err == EINPROGRESS) 18026 return; 18027 } 18028 } 18029 18030 /* 18031 * The operation must complete without EINPROGRESS since 18032 * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp. 18033 * Otherwise, the operation will be stuck forever in the ipsq. 18034 */ 18035 ASSERT(err != EINPROGRESS); 18036 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) { 18037 DTRACE_PROBE4(ipif__ioctl, char *, "arp_bringup_done finish", 18038 int, ipsq->ipsq_xop->ipx_current_ioctl, 18039 ill_t *, ill, ipif_t *, ipif); 18040 ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq); 18041 } else { 18042 ipsq_current_finish(ipsq); 18043 } 18044 } 18045 18046 /* 18047 * Finish processing of arp replumb after all the DLPI message 18048 * exchanges have completed between arp and the driver. 18049 */ 18050 void 18051 arp_replumb_done(ill_t *ill, int err) 18052 { 18053 mblk_t *mp1; 18054 ipif_t *ipif; 18055 conn_t *connp = NULL; 18056 ipsq_t *ipsq; 18057 queue_t *q; 18058 18059 ASSERT(IAM_WRITER_ILL(ill)); 18060 18061 ipsq = ill->ill_phyint->phyint_ipsq; 18062 ipif = ipsq->ipsq_xop->ipx_pending_ipif; 18063 mp1 = ipsq_pending_mp_get(ipsq, &connp); 18064 ASSERT(!((mp1 != NULL) ^ (ipif != NULL))); 18065 if (mp1 == NULL) { 18066 ip0dbg(("arp_replumb_done: bringup aborted ioctl %x\n", 18067 ipsq->ipsq_xop->ipx_current_ioctl)); 18068 /* bringup was aborted by the user */ 18069 return; 18070 } 18071 /* 18072 * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we 18073 * must have an associated conn_t. Otherwise, we're bringing this 18074 * interface back up as part of handling an asynchronous event (e.g., 18075 * physical address change). 
18076 */ 18077 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) { 18078 ASSERT(connp != NULL); 18079 q = CONNP_TO_WQ(connp); 18080 } else { 18081 ASSERT(connp == NULL); 18082 q = ill->ill_rq; 18083 } 18084 if ((err == 0) && (ill->ill_up_ipifs)) { 18085 err = ill_up_ipifs(ill, q, mp1); 18086 if (err == EINPROGRESS) 18087 return; 18088 } 18089 /* 18090 * The operation must complete without EINPROGRESS since 18091 * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp. 18092 * Otherwise, the operation will be stuck forever in the ipsq. 18093 */ 18094 ASSERT(err != EINPROGRESS); 18095 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) { 18096 DTRACE_PROBE4(ipif__ioctl, char *, 18097 "arp_replumb_done finish", 18098 int, ipsq->ipsq_xop->ipx_current_ioctl, 18099 ill_t *, ill, ipif_t *, ipif); 18100 ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq); 18101 } else { 18102 ipsq_current_finish(ipsq); 18103 } 18104 } 18105 18106 void 18107 ipif_up_notify(ipif_t *ipif) 18108 { 18109 ip_rts_ifmsg(ipif, RTSQ_DEFAULT); 18110 ip_rts_newaddrmsg(RTM_ADD, 0, ipif, RTSQ_DEFAULT); 18111 sctp_update_ipif(ipif, SCTP_IPIF_UP); 18112 ill_nic_event_dispatch(ipif->ipif_ill, MAP_IPIF_ID(ipif->ipif_id), 18113 NE_LIF_UP, NULL, 0); 18114 } 18115 18116 /* 18117 * ILB ioctl uses cv_wait (such as deleting a rule or adding a server) and 18118 * this assumes the context is cv_wait'able. Hence it shouldnt' be used on 18119 * TPI end points with STREAMS modules pushed above. This is assured by not 18120 * having the IPI_MODOK flag for the ioctl. And IP ensures the ILB ioctl 18121 * never ends up on an ipsq, otherwise we may end up processing the ioctl 18122 * while unwinding from the ispq and that could be a thread from the bottom. 
18123 */ 18124 /* ARGSUSED */ 18125 int 18126 ip_sioctl_ilb_cmd(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 18127 ip_ioctl_cmd_t *ipip, void *arg) 18128 { 18129 mblk_t *cmd_mp = mp->b_cont->b_cont; 18130 ilb_cmd_t command = *((ilb_cmd_t *)cmd_mp->b_rptr); 18131 int ret = 0; 18132 int i; 18133 size_t size; 18134 ip_stack_t *ipst; 18135 zoneid_t zoneid; 18136 ilb_stack_t *ilbs; 18137 18138 ipst = CONNQ_TO_IPST(q); 18139 ilbs = ipst->ips_netstack->netstack_ilb; 18140 zoneid = Q_TO_CONN(q)->conn_zoneid; 18141 18142 switch (command) { 18143 case ILB_CREATE_RULE: { 18144 ilb_rule_cmd_t *cmd = (ilb_rule_cmd_t *)cmd_mp->b_rptr; 18145 18146 if (MBLKL(cmd_mp) != sizeof (ilb_rule_cmd_t)) { 18147 ret = EINVAL; 18148 break; 18149 } 18150 18151 ret = ilb_rule_add(ilbs, zoneid, cmd); 18152 break; 18153 } 18154 case ILB_DESTROY_RULE: 18155 case ILB_ENABLE_RULE: 18156 case ILB_DISABLE_RULE: { 18157 ilb_name_cmd_t *cmd = (ilb_name_cmd_t *)cmd_mp->b_rptr; 18158 18159 if (MBLKL(cmd_mp) != sizeof (ilb_name_cmd_t)) { 18160 ret = EINVAL; 18161 break; 18162 } 18163 18164 if (cmd->flags & ILB_RULE_ALLRULES) { 18165 if (command == ILB_DESTROY_RULE) { 18166 ilb_rule_del_all(ilbs, zoneid); 18167 break; 18168 } else if (command == ILB_ENABLE_RULE) { 18169 ilb_rule_enable_all(ilbs, zoneid); 18170 break; 18171 } else if (command == ILB_DISABLE_RULE) { 18172 ilb_rule_disable_all(ilbs, zoneid); 18173 break; 18174 } 18175 } else { 18176 if (command == ILB_DESTROY_RULE) { 18177 ret = ilb_rule_del(ilbs, zoneid, cmd->name); 18178 } else if (command == ILB_ENABLE_RULE) { 18179 ret = ilb_rule_enable(ilbs, zoneid, cmd->name, 18180 NULL); 18181 } else if (command == ILB_DISABLE_RULE) { 18182 ret = ilb_rule_disable(ilbs, zoneid, cmd->name, 18183 NULL); 18184 } 18185 } 18186 break; 18187 } 18188 case ILB_NUM_RULES: { 18189 ilb_num_rules_cmd_t *cmd; 18190 18191 if (MBLKL(cmd_mp) != sizeof (ilb_num_rules_cmd_t)) { 18192 ret = EINVAL; 18193 break; 18194 } 18195 cmd = (ilb_num_rules_cmd_t 
*)cmd_mp->b_rptr; 18196 ilb_get_num_rules(ilbs, zoneid, &(cmd->num)); 18197 break; 18198 } 18199 case ILB_RULE_NAMES: { 18200 ilb_rule_names_cmd_t *cmd; 18201 18202 cmd = (ilb_rule_names_cmd_t *)cmd_mp->b_rptr; 18203 if (MBLKL(cmd_mp) < sizeof (ilb_rule_names_cmd_t) || 18204 cmd->num_names == 0) { 18205 ret = EINVAL; 18206 break; 18207 } 18208 size = cmd->num_names * ILB_RULE_NAMESZ; 18209 if (cmd_mp->b_rptr + offsetof(ilb_rule_names_cmd_t, buf) + 18210 size != cmd_mp->b_wptr) { 18211 ret = EINVAL; 18212 break; 18213 } 18214 ilb_get_rulenames(ilbs, zoneid, &cmd->num_names, cmd->buf); 18215 break; 18216 } 18217 case ILB_NUM_SERVERS: { 18218 ilb_num_servers_cmd_t *cmd; 18219 18220 if (MBLKL(cmd_mp) != sizeof (ilb_num_servers_cmd_t)) { 18221 ret = EINVAL; 18222 break; 18223 } 18224 cmd = (ilb_num_servers_cmd_t *)cmd_mp->b_rptr; 18225 ret = ilb_get_num_servers(ilbs, zoneid, cmd->name, 18226 &(cmd->num)); 18227 break; 18228 } 18229 case ILB_LIST_RULE: { 18230 ilb_rule_cmd_t *cmd = (ilb_rule_cmd_t *)cmd_mp->b_rptr; 18231 18232 if (MBLKL(cmd_mp) != sizeof (ilb_rule_cmd_t)) { 18233 ret = EINVAL; 18234 break; 18235 } 18236 ret = ilb_rule_list(ilbs, zoneid, cmd); 18237 break; 18238 } 18239 case ILB_LIST_SERVERS: { 18240 ilb_servers_info_cmd_t *cmd; 18241 18242 cmd = (ilb_servers_info_cmd_t *)cmd_mp->b_rptr; 18243 if (MBLKL(cmd_mp) < sizeof (ilb_servers_info_cmd_t) || 18244 cmd->num_servers == 0) { 18245 ret = EINVAL; 18246 break; 18247 } 18248 size = cmd->num_servers * sizeof (ilb_server_info_t); 18249 if (cmd_mp->b_rptr + offsetof(ilb_servers_info_cmd_t, servers) + 18250 size != cmd_mp->b_wptr) { 18251 ret = EINVAL; 18252 break; 18253 } 18254 18255 ret = ilb_get_servers(ilbs, zoneid, cmd->name, cmd->servers, 18256 &cmd->num_servers); 18257 break; 18258 } 18259 case ILB_ADD_SERVERS: { 18260 ilb_servers_info_cmd_t *cmd; 18261 ilb_rule_t *rule; 18262 18263 cmd = (ilb_servers_info_cmd_t *)cmd_mp->b_rptr; 18264 if (MBLKL(cmd_mp) < sizeof (ilb_servers_info_cmd_t)) { 18265 ret = 
EINVAL; 18266 break; 18267 } 18268 size = cmd->num_servers * sizeof (ilb_server_info_t); 18269 if (cmd_mp->b_rptr + offsetof(ilb_servers_info_cmd_t, servers) + 18270 size != cmd_mp->b_wptr) { 18271 ret = EINVAL; 18272 break; 18273 } 18274 rule = ilb_find_rule(ilbs, zoneid, cmd->name, &ret); 18275 if (rule == NULL) { 18276 ASSERT(ret != 0); 18277 break; 18278 } 18279 for (i = 0; i < cmd->num_servers; i++) { 18280 ilb_server_info_t *s; 18281 18282 s = &cmd->servers[i]; 18283 s->err = ilb_server_add(ilbs, rule, s); 18284 } 18285 ILB_RULE_REFRELE(rule); 18286 break; 18287 } 18288 case ILB_DEL_SERVERS: 18289 case ILB_ENABLE_SERVERS: 18290 case ILB_DISABLE_SERVERS: { 18291 ilb_servers_cmd_t *cmd; 18292 ilb_rule_t *rule; 18293 int (*f)(); 18294 18295 cmd = (ilb_servers_cmd_t *)cmd_mp->b_rptr; 18296 if (MBLKL(cmd_mp) < sizeof (ilb_servers_cmd_t)) { 18297 ret = EINVAL; 18298 break; 18299 } 18300 size = cmd->num_servers * sizeof (ilb_server_arg_t); 18301 if (cmd_mp->b_rptr + offsetof(ilb_servers_cmd_t, servers) + 18302 size != cmd_mp->b_wptr) { 18303 ret = EINVAL; 18304 break; 18305 } 18306 18307 if (command == ILB_DEL_SERVERS) 18308 f = ilb_server_del; 18309 else if (command == ILB_ENABLE_SERVERS) 18310 f = ilb_server_enable; 18311 else if (command == ILB_DISABLE_SERVERS) 18312 f = ilb_server_disable; 18313 18314 rule = ilb_find_rule(ilbs, zoneid, cmd->name, &ret); 18315 if (rule == NULL) { 18316 ASSERT(ret != 0); 18317 break; 18318 } 18319 18320 for (i = 0; i < cmd->num_servers; i++) { 18321 ilb_server_arg_t *s; 18322 18323 s = &cmd->servers[i]; 18324 s->err = f(ilbs, zoneid, NULL, rule, &s->addr); 18325 } 18326 ILB_RULE_REFRELE(rule); 18327 break; 18328 } 18329 case ILB_LIST_NAT_TABLE: { 18330 ilb_list_nat_cmd_t *cmd; 18331 18332 cmd = (ilb_list_nat_cmd_t *)cmd_mp->b_rptr; 18333 if (MBLKL(cmd_mp) < sizeof (ilb_list_nat_cmd_t)) { 18334 ret = EINVAL; 18335 break; 18336 } 18337 size = cmd->num_nat * sizeof (ilb_nat_entry_t); 18338 if (cmd_mp->b_rptr + 
offsetof(ilb_list_nat_cmd_t, entries) + 18339 size != cmd_mp->b_wptr) { 18340 ret = EINVAL; 18341 break; 18342 } 18343 18344 ret = ilb_list_nat(ilbs, zoneid, cmd->entries, &cmd->num_nat, 18345 &cmd->flags); 18346 break; 18347 } 18348 case ILB_LIST_STICKY_TABLE: { 18349 ilb_list_sticky_cmd_t *cmd; 18350 18351 cmd = (ilb_list_sticky_cmd_t *)cmd_mp->b_rptr; 18352 if (MBLKL(cmd_mp) < sizeof (ilb_list_sticky_cmd_t)) { 18353 ret = EINVAL; 18354 break; 18355 } 18356 size = cmd->num_sticky * sizeof (ilb_sticky_entry_t); 18357 if (cmd_mp->b_rptr + offsetof(ilb_list_sticky_cmd_t, entries) + 18358 size != cmd_mp->b_wptr) { 18359 ret = EINVAL; 18360 break; 18361 } 18362 18363 ret = ilb_list_sticky(ilbs, zoneid, cmd->entries, 18364 &cmd->num_sticky, &cmd->flags); 18365 break; 18366 } 18367 default: 18368 ret = EINVAL; 18369 break; 18370 } 18371 done: 18372 return (ret); 18373 } 18374 18375 /* Remove all cache entries for this logical interface */ 18376 void 18377 ipif_nce_down(ipif_t *ipif) 18378 { 18379 ill_t *ill = ipif->ipif_ill; 18380 nce_t *nce; 18381 18382 DTRACE_PROBE3(ipif__downup, char *, "ipif_nce_down", 18383 ill_t *, ill, ipif_t *, ipif); 18384 if (ipif->ipif_added_nce) { 18385 if (ipif->ipif_isv6) 18386 nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr); 18387 else 18388 nce = nce_lookup_v4(ill, &ipif->ipif_lcl_addr); 18389 if (nce != NULL) { 18390 if (--nce->nce_ipif_cnt == 0) 18391 ncec_delete(nce->nce_common); 18392 ipif->ipif_added_nce = 0; 18393 nce_refrele(nce); 18394 } else { 18395 /* 18396 * nce may already be NULL because it was already 18397 * flushed, e.g., due to a call to nce_flush 18398 */ 18399 ipif->ipif_added_nce = 0; 18400 } 18401 } 18402 /* 18403 * Make IPMP aware of the deleted data address. 18404 */ 18405 if (IS_IPMP(ill)) 18406 ipmp_illgrp_del_ipif(ill->ill_grp, ipif); 18407 18408 /* 18409 * Remove all other nces dependent on this ill when the last ipif 18410 * is going away. 
18411 */ 18412 if (ill->ill_ipif_up_count == 0) { 18413 ncec_walk(ill, (pfi_t)ncec_delete_per_ill, 18414 (uchar_t *)ill, ill->ill_ipst); 18415 if (IS_UNDER_IPMP(ill)) 18416 nce_flush(ill, B_TRUE); 18417 } 18418 } 18419