/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/* Copyright (c) 1990 Mentat Inc. */

/*
 * This file contains the interface control functions for IP.
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/dlpi.h>
#include <sys/stropts.h>
#include <sys/strsun.h>
#include <sys/sysmacros.h>
#include <sys/strsubr.h>
#include <sys/strlog.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/debug.h>
#include <sys/zone.h>
#include <sys/sunldi.h>
#include <sys/file.h>
#include <sys/bitmap.h>
#include <sys/cpuvar.h>
#include <sys/time.h>
#include <sys/ctype.h>
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/isa_defs.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_types.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <sys/sockio.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet/igmp_var.h>
#include <sys/policy.h>
#include <sys/ethernet.h>
#include <sys/callb.h>
#include <sys/md5.h>

#include <inet/common.h>	/* for various inet/mi.h and inet/nd.h needs */
#include <inet/mi.h>
#include <inet/nd.h>
#include <inet/arp.h>
#include <inet/ip_arp.h>
#include <inet/mib2.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/tcp.h>
#include <inet/ip_multi.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ip_if.h>
#include <inet/ip_impl.h>
#include <inet/sctp_ip.h>
#include <inet/ip_netinfo.h>
#include <inet/ilb_ip.h>

#include <netinet/igmp.h>
#include <inet/ip_listutils.h>
#include <inet/ipclassifier.h>
#include <sys/mac_client.h>
#include <sys/dld.h>

#include <sys/systeminfo.h>
#include <sys/bootconf.h>

#include <sys/tsol/tndb.h>
#include <sys/tsol/tnet.h>

/* The character which tells where the ill_name ends */
#define	IPIF_SEPARATOR_CHAR	':'

/* IP ioctl function table entry */
typedef struct ipft_s {
	int	ipft_cmd;
	pfi_t	ipft_pfi;
	int	ipft_min_size;
	int	ipft_flags;
} ipft_t;
#define	IPFT_F_NO_REPLY		0x1	/* IP ioctl does not expect any reply */
#define	IPFT_F_SELF_REPLY	0x2	/* ioctl callee does the ioctl reply */

static int	nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int	nd_ill_forward_set(queue_t *q, mblk_t *mp,
    char *value, caddr_t cp, cred_t *ioc_cr);

static boolean_t ill_is_quiescent(ill_t *);
static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask);
static ip_m_t	*ip_m_lookup(t_uscalar_t mac_type);
static int	ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q,
    mblk_t *mp);
static int	ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp);
static int	ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp,
    int ioccmd, struct linkblk *li);
static ipaddr_t	ip_subnet_mask(ipaddr_t addr, ipif_t **, ip_stack_t *);
static void	ip_wput_ioctl(queue_t *q, mblk_t *mp);
static void	ipsq_flush(ill_t *ill);

static int	ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static void	ipsq_delete(ipsq_t *);

static ipif_t	*ipif_allocate(ill_t *ill, int id, uint_t ire_type,
    boolean_t initialize, boolean_t insert, int *errorp);
static ire_t	**ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep);
static void	ipif_delete_bcast_ires(ipif_t *ipif);
static int	ipif_add_ires_v4(ipif_t *, boolean_t);
static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif,
    boolean_t isv6);
static int	ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp);
static void	ipif_free(ipif_t *ipif);
static void	ipif_free_tail(ipif_t *ipif);
static void	ipif_set_default(ipif_t *ipif);
static int	ipif_set_values(queue_t *q, mblk_t *mp,
    char *interf_name, uint_t *ppa);
static int	ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp,
    queue_t *q);
static ipif_t	*ipif_lookup_on_name(char *name, size_t namelen,
    boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid,
    ip_stack_t *);

static int	ill_alloc_ppa(ill_if_t *, ill_t *);
static void	ill_delete_interface_type(ill_if_t *);
static int	ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q);
static void	ill_dl_down(ill_t *ill);
static void	ill_down(ill_t *ill);
static void	ill_down_ipifs(ill_t *, boolean_t);
static void	ill_free_mib(ill_t *ill);
static void	ill_glist_delete(ill_t *);
static void	ill_phyint_reinit(ill_t *ill);
static void	ill_set_nce_router_flags(ill_t *, boolean_t);
static void	ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *);
static void	ill_replumb_tail(ipsq_t *, queue_t *, mblk_t *, void *);

static ip_v6intfid_func_t ip_ether_v6intfid, ip_ib_v6intfid;
static ip_v6intfid_func_t ip_ipv4_v6intfid, ip_ipv6_v6intfid;
static ip_v6intfid_func_t ip_ipmp_v6intfid, ip_nodef_v6intfid;
static ip_v6intfid_func_t ip_ipv4_v6destintfid, ip_ipv6_v6destintfid;
static ip_v4mapinfo_func_t ip_ether_v4_mapping;
static ip_v6mapinfo_func_t ip_ether_v6_mapping;
static ip_v4mapinfo_func_t ip_ib_v4_mapping;
static ip_v6mapinfo_func_t ip_ib_v6_mapping;
static ip_v4mapinfo_func_t ip_mbcast_mapping;
static void	ip_cgtp_bcast_add(ire_t *, ip_stack_t *);
static void	ip_cgtp_bcast_delete(ire_t *, ip_stack_t *);
static void	phyint_free(phyint_t *);

static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_vrrp_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_hcksum_reset_fill(ill_t *, mblk_t *);
static void ill_capability_zerocopy_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void ill_capability_zerocopy_reset_fill(ill_t *, mblk_t *);
static void ill_capability_dld_reset_fill(ill_t *, mblk_t *);
static void ill_capability_dld_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void ill_capability_dld_enable(ill_t *);
static void ill_capability_ack_thr(void *);
static void ill_capability_lso_enable(ill_t *);

static ill_t	*ill_prev_usesrc(ill_t *);
static int	ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t);
static void	ill_disband_usesrc_group(ill_t *);
static void	ip_sioctl_garp_reply(mblk_t *, ill_t *, void *, int);

#ifdef DEBUG
static void	ill_trace_cleanup(const ill_t *);
static void	ipif_trace_cleanup(const ipif_t *);
#endif

/*
 * If we go over the memory footprint limit more than once in this msec
 * interval, we'll start pruning aggressively.
 */
int ip_min_frag_prune_time = 0;

static ipft_t ip_ioctl_ftbl[] = {
	{ IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 },
	{ IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t),
	    IPFT_F_NO_REPLY },
	{ IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY },
	{ 0 }
};
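
/*
 * Illustrative sketch (not from the original source): a dispatcher over
 * ip_ioctl_ftbl typically walks the entries until ipft_cmd matches the
 * ioctl command, checks that the payload is at least ipft_min_size bytes,
 * calls through ipft_pfi, and then consults ipft_flags (IPFT_F_NO_REPLY,
 * IPFT_F_SELF_REPLY) to decide whether it must generate the M_IOCACK itself.
 */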

/* Simple ICMP IP Header Template */
static ipha_t icmp_ipha = {
	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
};

static uchar_t ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };

static ip_m_t	ip_m_tbl[] = {
	{ DL_ETHER, IFT_ETHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_CSMACD, IFT_ISO88023, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_TPB, IFT_ISO88024, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_TPR, IFT_ISO88025, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_FDDI, IFT_FDDI, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_IB, IFT_IB, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ib_v4_mapping, ip_ib_v6_mapping, ip_ib_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_IPV4, IFT_IPV4, IPPROTO_ENCAP, IPPROTO_IPV6,
	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid,
	    ip_ipv4_v6destintfid },
	{ DL_IPV6, IFT_IPV6, IPPROTO_ENCAP, IPPROTO_IPV6,
	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv6_v6intfid,
	    ip_ipv6_v6destintfid },
	{ DL_6TO4, IFT_6TO4, IPPROTO_ENCAP, IPPROTO_IPV6,
	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid,
	    ip_nodef_v6intfid },
	{ SUNW_DL_VNI, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    NULL, NULL, ip_nodef_v6intfid, ip_nodef_v6intfid },
	{ SUNW_DL_IPMP, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    NULL, NULL, ip_ipmp_v6intfid, ip_nodef_v6intfid },
	{ DL_OTHER, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid }
};

static ill_t	ill_null;		/* Empty ILL for init. */
char	ipif_loopback_name[] = "lo0";
static char *ipv4_forward_suffix = ":ip_forwarding";
static char *ipv6_forward_suffix = ":ip6_forwarding";
static sin6_t	sin6_null;	/* Zero address for quick clears */
static sin_t	sin_null;	/* Zero address for quick clears */

/* When set, search for unused ipif_seqid */
static ipif_t	ipif_zero;

/*
 * The ppa arena is created after this many
 * interfaces have been plumbed.
 */
uint_t	ill_no_arena = 12;	/* Settable in /etc/system */

/*
 * Allocate per-interface mibs.
 * Returns B_TRUE if ok, B_FALSE otherwise.
 * ipsq may not yet be allocated (loopback case).
 */
static boolean_t
ill_allocate_mibs(ill_t *ill)
{
	/* Already allocated? */
	if (ill->ill_ip_mib != NULL) {
		if (ill->ill_isv6)
			ASSERT(ill->ill_icmp6_mib != NULL);
		return (B_TRUE);
	}

	ill->ill_ip_mib = kmem_zalloc(sizeof (*ill->ill_ip_mib),
	    KM_NOSLEEP);
	if (ill->ill_ip_mib == NULL) {
		return (B_FALSE);
	}

	/* Setup static information */
	SET_MIB(ill->ill_ip_mib->ipIfStatsEntrySize,
	    sizeof (mib2_ipIfStatsEntry_t));
	if (ill->ill_isv6) {
		ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6;
		SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
		    sizeof (mib2_ipv6AddrEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
		    sizeof (mib2_ipv6RouteEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
		    sizeof (mib2_ipv6NetToMediaEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
		    sizeof (ipv6_member_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
		    sizeof (ipv6_grpsrc_t));
	} else {
		ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4;
		SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
		    sizeof (mib2_ipAddrEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
		    sizeof (mib2_ipRouteEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
		    sizeof (mib2_ipNetToMediaEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
		    sizeof (ip_member_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
		    sizeof (ip_grpsrc_t));

		/*
		 * For a v4 ill, we are done at this point, because per-ill
		 * icmp mibs are only used for v6.
		 */
		return (B_TRUE);
	}

	ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib),
	    KM_NOSLEEP);
	if (ill->ill_icmp6_mib == NULL) {
		kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
		ill->ill_ip_mib = NULL;
		return (B_FALSE);
	}
	/* static icmp info */
	ill->ill_icmp6_mib->ipv6IfIcmpEntrySize =
	    sizeof (mib2_ipv6IfIcmpEntry_t);
	/*
	 * The ipIfStatsIfindex and ipv6IfIcmpIndex will be assigned later
	 * after the phyint merge occurs in ipif_set_values ->
	 * ill_glist_insert -> ill_phyint_reinit
	 */
	return (B_TRUE);
}
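
/*
 * Note on usage (illustrative, not from the original source): callers treat
 * a B_FALSE return from ill_allocate_mibs() as an allocation failure of the
 * plumb operation, e.g.
 *
 *	if (!ill_allocate_mibs(ill))
 *		return (ENOMEM);
 */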

/*
 * Completely vaporize a lower level tap and all associated interfaces.
 * ill_delete is called only out of ip_close when the device control
 * stream is being closed.
 */
void
ill_delete(ill_t *ill)
{
	ipif_t	*ipif;
	ill_t	*prev_ill;
	ip_stack_t	*ipst = ill->ill_ipst;

	/*
	 * ill_delete may be forcibly entering the ipsq. The previous
	 * ioctl may not have completed and may need to be aborted.
	 * ipsq_flush takes care of it. If we don't need to enter the
	 * ipsq forcibly, the 2nd invocation of ipsq_flush in
	 * ill_delete_tail is sufficient.
	 */
	ipsq_flush(ill);

	/*
	 * Nuke all interfaces. ipif_free will take down the interface,
	 * remove it from the list, and free the data structure.
	 * Walk down the ipif list and remove the logical interfaces
	 * first before removing the main ipif. We can't unplumb
	 * the zeroth interface first in the case of IPv6 as update_conn_ill
	 * -> ip_ll_multireq de-references ill_ipif for checking
	 * POINTOPOINT.
	 *
	 * If ill_ipif was not properly initialized (i.e., low on memory),
	 * then there are no interfaces to clean up; in this case just clean
	 * up the ill.
	 */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		ipif_free(ipif);

	/*
	 * clean out all the nce_t entries that depend on this
	 * ill for the ill_phys_addr.
	 */
	nce_flush(ill, B_TRUE);

	/* Clean up msgs on pending upcalls for mrouted */
	reset_mrt_ill(ill);

	update_conn_ill(ill, ipst);

	/*
	 * Remove multicast references added as a result of calls to
	 * ip_join_allmulti().
	 */
	ip_purge_allmulti(ill);

	/*
	 * If the ill being deleted is under IPMP, boot it out of the illgrp.
	 */
	if (IS_UNDER_IPMP(ill))
		ipmp_ill_leave_illgrp(ill);

	/*
	 * ill_down will arrange to blow off any IREs dependent on this
	 * ILL, and shut down fragmentation reassembly.
	 */
	ill_down(ill);

	/* Let SCTP know, so that it can remove this from its list. */
	sctp_update_ill(ill, SCTP_ILL_REMOVE);

	/*
	 * Walk all CONNs that can have a reference on an ire or nce for this
	 * ill (we actually walk all that now have stale references).
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst);

	/* With IPv6 we have dce_ifindex. Cleanup for neatness */
	if (ill->ill_isv6)
		dce_cleanup(ill->ill_phyint->phyint_ifindex, ipst);

	/*
	 * If an address on this ILL is being used as a source address then
	 * clear out the pointers in other ILLs that point to this ILL.
	 */
	rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
	if (ill->ill_usesrc_grp_next != NULL) {
		if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? */
			ill_disband_usesrc_group(ill);
		} else {	/* consumer of the usesrc ILL */
			prev_ill = ill_prev_usesrc(ill);
			prev_ill->ill_usesrc_grp_next =
			    ill->ill_usesrc_grp_next;
		}
	}
	rw_exit(&ipst->ips_ill_g_usesrc_lock);
}

static void
ipif_non_duplicate(ipif_t *ipif)
{
	ill_t *ill = ipif->ipif_ill;
	mutex_enter(&ill->ill_lock);
	if (ipif->ipif_flags & IPIF_DUPLICATE) {
		ipif->ipif_flags &= ~IPIF_DUPLICATE;
		ASSERT(ill->ill_ipif_dup_count > 0);
		ill->ill_ipif_dup_count--;
	}
	mutex_exit(&ill->ill_lock);
}
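
/*
 * Teardown is two-phase: ill_delete() above begins the unplumb from
 * ip_close(), while ill_delete_tail() below completes it from ip_modclose()
 * once all references to the closing ill have been dropped.
 */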

/*
 * ill_delete_tail is called from ip_modclose after all references
 * to the closing ill are gone. The wait is done in ip_modclose.
 */
void
ill_delete_tail(ill_t *ill)
{
	mblk_t	**mpp;
	ipif_t	*ipif;
	ip_stack_t *ipst = ill->ill_ipst;

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		ipif_non_duplicate(ipif);
		(void) ipif_down_tail(ipif);
	}

	ASSERT(ill->ill_ipif_dup_count == 0);

	/*
	 * If polling capability is enabled (which signifies direct
	 * upcall into IP and driver has ill saved as a handle),
	 * we need to make sure that unbind has completed before we
	 * let the ill disappear and driver no longer has any reference
	 * to this ill.
	 */
	mutex_enter(&ill->ill_lock);
	while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS)
		cv_wait(&ill->ill_cv, &ill->ill_lock);
	mutex_exit(&ill->ill_lock);
	ASSERT(!(ill->ill_capabilities &
	    (ILL_CAPAB_DLD | ILL_CAPAB_DLD_POLL | ILL_CAPAB_DLD_DIRECT)));

	if (ill->ill_net_type != IRE_LOOPBACK)
		qprocsoff(ill->ill_rq);

	/*
	 * We do an ipsq_flush once again now. New messages could have
	 * arrived from below (M_ERROR or M_HANGUP). Similarly ioctls
	 * could also have arrived if an ioctl thread had looked up
	 * the ill before we set the ILL_CONDEMNED flag, but not yet
	 * enqueued the ioctl when we did the ipsq_flush last time.
	 */
	ipsq_flush(ill);

	/*
	 * Free capabilities.
	 */
	if (ill->ill_hcksum_capab != NULL) {
		kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t));
		ill->ill_hcksum_capab = NULL;
	}

	if (ill->ill_zerocopy_capab != NULL) {
		kmem_free(ill->ill_zerocopy_capab,
		    sizeof (ill_zerocopy_capab_t));
		ill->ill_zerocopy_capab = NULL;
	}

	if (ill->ill_lso_capab != NULL) {
		kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t));
		ill->ill_lso_capab = NULL;
	}

	if (ill->ill_dld_capab != NULL) {
		kmem_free(ill->ill_dld_capab, sizeof (ill_dld_capab_t));
		ill->ill_dld_capab = NULL;
	}

	while (ill->ill_ipif != NULL)
		ipif_free_tail(ill->ill_ipif);

	/*
	 * We have removed all references to ilm from conn and the ones joined
	 * within the kernel.
	 *
	 * We don't walk conns, mrts and ires because
	 *
	 * 1) update_conn_ill and reset_mrt_ill cleans up conns and mrts.
	 * 2) ill_down -> ill_downi walks all the ires and cleans up
	 *    ill references.
	 */

	/*
	 * If this ill is an IPMP meta-interface, blow away the illgrp. This
	 * is safe to do because the illgrp has already been unlinked from the
	 * group by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find it.
	 */
	if (IS_IPMP(ill)) {
		ipmp_illgrp_destroy(ill->ill_grp);
		ill->ill_grp = NULL;
	}

	/*
	 * Take us out of the list of ILLs. ill_glist_delete -> phyint_free
	 * could free the phyint. No more reference to the phyint after this
	 * point.
	 */
	(void) ill_glist_delete(ill);

	rw_enter(&ipst->ips_ip_g_nd_lock, RW_WRITER);
	if (ill->ill_ndd_name != NULL)
		nd_unload(&ipst->ips_ip_g_nd, ill->ill_ndd_name);
	rw_exit(&ipst->ips_ip_g_nd_lock);

	if (ill->ill_frag_ptr != NULL) {
		uint_t count;

		for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
			mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock);
		}
		mi_free(ill->ill_frag_ptr);
		ill->ill_frag_ptr = NULL;
		ill->ill_frag_hash_tbl = NULL;
	}

	freemsg(ill->ill_nd_lla_mp);
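
	/*
	 * ill_first_mp_to_free through ill_last_mp_to_free are consecutive
	 * mblk_t * members of the ill; the loop below relies on that layout
	 * and steps through them as an array, draining each retained-message
	 * list in turn.
	 */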
	/* Free all retained control messages. */
	mpp = &ill->ill_first_mp_to_free;
	do {
		while (mpp[0]) {
			mblk_t	*mp;
			mblk_t	*mp1;

			mp = mpp[0];
			mpp[0] = mp->b_next;
			for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) {
				mp1->b_next = NULL;
				mp1->b_prev = NULL;
			}
			freemsg(mp);
		}
	} while (mpp++ != &ill->ill_last_mp_to_free);

	ill_free_mib(ill);

#ifdef DEBUG
	ill_trace_cleanup(ill);
#endif

	/* The default multicast interface might have changed */
	ire_increment_multicast_generation(ipst, ill->ill_isv6);

	/* Drop refcnt here */
	netstack_rele(ill->ill_ipst->ips_netstack);
	ill->ill_ipst = NULL;
}

static void
ill_free_mib(ill_t *ill)
{
	ip_stack_t *ipst = ill->ill_ipst;

	/*
	 * MIB statistics must not be lost, so when an interface
	 * goes away the counter values will be added to the global
	 * MIBs.
	 */
	if (ill->ill_ip_mib != NULL) {
		if (ill->ill_isv6) {
			ip_mib2_add_ip_stats(&ipst->ips_ip6_mib,
			    ill->ill_ip_mib);
		} else {
			ip_mib2_add_ip_stats(&ipst->ips_ip_mib,
			    ill->ill_ip_mib);
		}

		kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
		ill->ill_ip_mib = NULL;
	}
	if (ill->ill_icmp6_mib != NULL) {
		ip_mib2_add_icmp6_stats(&ipst->ips_icmp6_mib,
		    ill->ill_icmp6_mib);
		kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib));
		ill->ill_icmp6_mib = NULL;
	}
}

/*
 * Concatenate a physical address and a SAP.
 *
 * Sap_lengths are interpreted as follows:
 *	sap_length == 0	==>	no sap
 *	sap_length > 0	==>	sap is at the head of the dlpi address
 *	sap_length < 0	==>	sap is at the tail of the dlpi address
 */
static void
ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length,
    t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst)
{
	uint16_t sap_addr = (uint16_t)sap_src;

	if (sap_length == 0) {
		if (phys_src == NULL)
			bzero(dst, phys_length);
		else
			bcopy(phys_src, dst, phys_length);
	} else if (sap_length < 0) {
		if (phys_src == NULL)
			bzero(dst, phys_length);
		else
			bcopy(phys_src, dst, phys_length);
		bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr));
	} else {
		bcopy(&sap_addr, dst, sizeof (sap_addr));
		if (phys_src == NULL)
			bzero((char *)dst + sap_length, phys_length);
		else
			bcopy(phys_src, (char *)dst + sap_length, phys_length);
	}
}
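
/*
 * Layout examples (illustrative): with a 6-byte physical address and
 * sap_length == -2 the result is <6 bytes phys><2 bytes sap>; with
 * sap_length == 2 it is <2 bytes sap><6 bytes phys>.
 */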

/*
 * Generate a dl_unitdata_req mblk for the device and address given.
 * addr_length is the length of the physical portion of the address.
 * If addr is NULL, include an all-zero address of the specified length.
 * In any case, addr_length is taken to be the entire length of the dlpi
 * address, including the absolute value of sap_length.
 */
mblk_t *
ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap,
    t_scalar_t sap_length)
{
	dl_unitdata_req_t *dlur;
	mblk_t	*mp;
	t_scalar_t	abs_sap_length;		/* absolute value */

	abs_sap_length = ABS(sap_length);
	mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length,
	    DL_UNITDATA_REQ);
	if (mp == NULL)
		return (NULL);
	dlur = (dl_unitdata_req_t *)mp->b_rptr;
	/* HACK: accommodate incompatible DLPI drivers */
	if (addr_length == 8)
		addr_length = 6;
	dlur->dl_dest_addr_length = addr_length + abs_sap_length;
	dlur->dl_dest_addr_offset = sizeof (*dlur);
	dlur->dl_priority.dl_min = 0;
	dlur->dl_priority.dl_max = 0;
	ill_dlur_copy_address(addr, addr_length, sap, sap_length,
	    (uchar_t *)&dlur[1]);
	return (mp);
}

/*
 * Add the pending mp to the list. There can be only 1 pending mp
 * in the list. Any exclusive ioctl that needs to wait for a response
 * from another module or driver needs to use this function to set
 * the ipx_pending_mp to the ioctl mblk and wait for the response from
 * the other module/driver. This is also used while waiting for the
 * ipif/ill/ire refcnts to drop to zero in bringing down an ipif.
 */
boolean_t
ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp,
    int waitfor)
{
	ipxop_t	*ipx = ipif->ipif_ill->ill_phyint->phyint_ipsq->ipsq_xop;

	ASSERT(IAM_WRITER_IPIF(ipif));
	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
	ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
	ASSERT(ipx->ipx_pending_mp == NULL);
	/*
	 * The caller may be using a different ipif than the one passed into
	 * ipsq_current_start() (e.g., suppose an ioctl that came in on the V4
	 * ill needs to wait for the V6 ill to quiesce). So we can't ASSERT
	 * that `ipx_current_ipif == ipif'.
	 */
	ASSERT(ipx->ipx_current_ipif != NULL);

	/*
	 * M_IOCDATA from ioctls, M_ERROR/M_HANGUP/M_PROTO/M_PCPROTO from the
	 * driver.
	 */
	ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_ERROR) ||
	    (DB_TYPE(add_mp) == M_HANGUP) || (DB_TYPE(add_mp) == M_PROTO) ||
	    (DB_TYPE(add_mp) == M_PCPROTO));

	if (connp != NULL) {
		ASSERT(MUTEX_HELD(&connp->conn_lock));
		/*
		 * Return error if the conn has started closing. The conn
		 * could have finished cleaning up the pending mp list.
		 * If so, we should not add another mp to the list, negating
		 * the cleanup.
		 */
		if (connp->conn_state_flags & CONN_CLOSING)
			return (B_FALSE);
	}
	mutex_enter(&ipx->ipx_lock);
	ipx->ipx_pending_ipif = ipif;
	/*
	 * Note down the queue in b_queue. This will be returned by
	 * ipsq_pending_mp_get. The caller will then use these values to
	 * restart the processing.
	 */
	add_mp->b_next = NULL;
	add_mp->b_queue = q;
	ipx->ipx_pending_mp = add_mp;
	ipx->ipx_waitfor = waitfor;
	mutex_exit(&ipx->ipx_lock);

	if (connp != NULL)
		connp->conn_oper_pending_ill = ipif->ipif_ill;

	return (B_TRUE);
}
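
/*
 * Typical usage (sketch, not from the original source): an exclusive ioctl
 * waiting for a DLPI ack does, while holding ill_lock,
 *
 *	if (!ipsq_pending_mp_add(connp, ipif, q, mp, 0))
 *		(abort; the conn has started closing)
 *
 * and the operation is later resumed with the mblk and queue returned by
 * ipsq_pending_mp_get() once the ack arrives.
 */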

/*
 * Retrieve the ipx_pending_mp and return it. There can be only 1 mp
 * queued in the list.
 */
mblk_t *
ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp)
{
	mblk_t	*curr = NULL;
	ipxop_t	*ipx = ipsq->ipsq_xop;

	*connpp = NULL;
	mutex_enter(&ipx->ipx_lock);
	if (ipx->ipx_pending_mp == NULL) {
		mutex_exit(&ipx->ipx_lock);
		return (NULL);
	}

	/* There can be only 1 such excl message */
	curr = ipx->ipx_pending_mp;
	ASSERT(curr->b_next == NULL);
	ipx->ipx_pending_ipif = NULL;
	ipx->ipx_pending_mp = NULL;
	ipx->ipx_waitfor = 0;
	mutex_exit(&ipx->ipx_lock);

	if (CONN_Q(curr->b_queue)) {
		/*
		 * This mp did a refhold on the conn at the start of the
		 * ioctl. So we can safely return a pointer to the conn
		 * to the caller.
		 */
		*connpp = Q_TO_CONN(curr->b_queue);
	} else {
		*connpp = NULL;
	}
	curr->b_next = NULL;
	curr->b_prev = NULL;
	return (curr);
}

/*
 * Cleanup the ioctl mp queued in ipx_pending_mp
 * - Called in the ill_delete path
 * - Called in the M_ERROR or M_HANGUP path on the ill.
 * - Called in the conn close path.
 */
boolean_t
ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp)
{
	mblk_t	*mp;
	ipxop_t	*ipx;
	queue_t	*q;
	ipif_t	*ipif;
	int	cmd;

	ASSERT(IAM_WRITER_ILL(ill));
	ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop;

	/*
	 * If connp is null, unconditionally clean up the ipx_pending_mp.
	 * This happens in M_ERROR/M_HANGUP. We need to abort the current
	 * ioctl even if it is meant for another ill, since we have to enqueue
	 * a new mp now in ipx_pending_mp to complete the ipif_down.
	 * If connp is non-null we are called from the conn close path.
	 */
	mutex_enter(&ipx->ipx_lock);
	mp = ipx->ipx_pending_mp;
	if (mp == NULL || (connp != NULL &&
	    mp->b_queue != CONNP_TO_WQ(connp))) {
		mutex_exit(&ipx->ipx_lock);
		return (B_FALSE);
	}
	/* Now remove from the ipx_pending_mp */
	ipx->ipx_pending_mp = NULL;
	q = mp->b_queue;
	mp->b_next = NULL;
	mp->b_prev = NULL;
	mp->b_queue = NULL;

	ipif = ipx->ipx_pending_ipif;
	ipx->ipx_pending_ipif = NULL;
	ipx->ipx_waitfor = 0;
	ipx->ipx_current_ipif = NULL;
	cmd = ipx->ipx_current_ioctl;
	ipx->ipx_current_ioctl = 0;
	ipx->ipx_current_done = B_TRUE;
	mutex_exit(&ipx->ipx_lock);

	if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) {
		DTRACE_PROBE4(ipif__ioctl,
		    char *, "ipsq_pending_mp_cleanup",
		    int, cmd, ill_t *, ipif == NULL ? NULL : ipif->ipif_ill,
		    ipif_t *, ipif);
		if (connp == NULL) {
			ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL);
		} else {
			ip_ioctl_finish(q, mp, ENXIO, CONN_CLOSE, NULL);
			mutex_enter(&ipif->ipif_ill->ill_lock);
			ipif->ipif_state_flags &= ~IPIF_CHANGING;
			mutex_exit(&ipif->ipif_ill->ill_lock);
		}
	} else {
		/*
		 * IP-MT XXX In the case of TLI/XTI bind / optmgmt this can't
		 * be just inet_freemsg; we have to restart it, otherwise
		 * the thread will be stuck.
		 */
		inet_freemsg(mp);
	}
	return (B_TRUE);
}

/*
 * Called in the conn close path and ill delete path
 */
static void
ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp)
{
	ipsq_t	*ipsq;
	mblk_t	*prev;
	mblk_t	*curr;
	mblk_t	*next;
	queue_t	*rq, *wq;
	mblk_t	*tmp_list = NULL;

	ASSERT(IAM_WRITER_ILL(ill));
	if (connp != NULL)
		wq = CONNP_TO_WQ(connp);
	else
		wq = ill->ill_wq;
	rq = RD(wq);

	ipsq = ill->ill_phyint->phyint_ipsq;
	/*
	 * Cleanup the ioctl mps queued in ipsq_xopq_pending_mp, if any.
	 * In the case of ioctl from a conn, there can be only 1 mp
	 * queued on the ipsq. If an ill is being unplumbed, only messages
	 * related to this ill are flushed, like M_ERROR or M_HANGUP message.
	 * ioctls meant for this ill from conns are not flushed. They will
	 * be processed during ipsq_exit and will not find the ill and will
	 * return an error.
	 */
	mutex_enter(&ipsq->ipsq_lock);
	for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL;
	    curr = next) {
		next = curr->b_next;
		if (curr->b_queue == wq || curr->b_queue == rq) {
			/* Unlink the mblk from the pending mp list */
			if (prev != NULL) {
				prev->b_next = curr->b_next;
			} else {
				ASSERT(ipsq->ipsq_xopq_mphead == curr);
				ipsq->ipsq_xopq_mphead = curr->b_next;
			}
			if (ipsq->ipsq_xopq_mptail == curr)
				ipsq->ipsq_xopq_mptail = prev;
			/*
			 * Create a temporary list and release the ipsq lock.
			 * New elements are added to the head of the tmp_list.
			 */
			curr->b_next = tmp_list;
			tmp_list = curr;
		} else {
			prev = curr;
		}
	}
	mutex_exit(&ipsq->ipsq_lock);

	while (tmp_list != NULL) {
		curr = tmp_list;
		tmp_list = curr->b_next;
		curr->b_next = NULL;
		curr->b_prev = NULL;
		curr->b_queue = NULL;
		if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) {
			DTRACE_PROBE4(ipif__ioctl,
			    char *, "ipsq_xopq_mp_cleanup",
			    int, 0, ill_t *, NULL, ipif_t *, NULL);
			ip_ioctl_finish(wq, curr, ENXIO, connp != NULL ?
			    CONN_CLOSE : NO_COPYOUT, NULL);
		} else {
			/*
			 * IP-MT XXX In the case of TLI/XTI bind / optmgmt
			 * this can't be just inet_freemsg; we have to
			 * restart it, otherwise the thread will be stuck.
			 */
			inet_freemsg(curr);
		}
	}
}
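
/*
 * Note the pattern above: matching mblks are unlinked onto a private
 * tmp_list while ipsq_lock is held, and each ioctl is completed only after
 * the lock has been dropped, so ip_ioctl_finish() never runs under
 * ipsq_lock.
 */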

/*
 * This conn has started closing. Cleanup any pending ioctl from this conn.
 * STREAMS ensures that there can be at most 1 active ioctl on a stream.
 */
void
conn_ioctl_cleanup(conn_t *connp)
{
	ipsq_t	*ipsq;
	ill_t	*ill;
	boolean_t refheld;

	/*
	 * Check for a queued ioctl. If the ioctl has not yet started, the mp
	 * is pending in the list headed by ipsq_xopq_head. If the ioctl has
	 * started the mp could be present in ipx_pending_mp. Note that if
	 * conn_oper_pending_ill is NULL, the ioctl may still be in flight and
	 * not yet queued anywhere. In this case, the conn close code will wait
	 * until the conn_ref is dropped. If the stream was a tcp stream, then
	 * tcp_close will wait first until all ioctls have completed for this
	 * conn.
	 */
	mutex_enter(&connp->conn_lock);
	ill = connp->conn_oper_pending_ill;
	if (ill == NULL) {
		mutex_exit(&connp->conn_lock);
		return;
	}

	/*
	 * We may not be able to refhold the ill if the ill/ipif
	 * is changing. But we need to make sure that the ill will
	 * not vanish. So we just bump up the ill_waiter count.
	 */
	refheld = ill_waiter_inc(ill);
	mutex_exit(&connp->conn_lock);
	if (refheld) {
		if (ipsq_enter(ill, B_TRUE, NEW_OP)) {
			ill_waiter_dcr(ill);
			/*
			 * Check whether this ioctl has started and is
			 * pending. If it is not found there then check
			 * whether this ioctl has not even started and is in
			 * the ipsq_xopq list.
			 */
			if (!ipsq_pending_mp_cleanup(ill, connp))
				ipsq_xopq_mp_cleanup(ill, connp);
			ipsq = ill->ill_phyint->phyint_ipsq;
			ipsq_exit(ipsq);
			return;
		}
	}

	/*
	 * The ill is also closing and we could not bump up the
	 * ill_waiter_count or we could not enter the ipsq. Leave
	 * the cleanup to ill_delete.
	 */
	mutex_enter(&connp->conn_lock);
	while (connp->conn_oper_pending_ill != NULL)
		cv_wait(&connp->conn_refcv, &connp->conn_lock);
	mutex_exit(&connp->conn_lock);
	if (refheld)
		ill_waiter_dcr(ill);
}

/*
 * ipcl_walk function for cleaning up conn_*_ill fields.
 * Note that we leave ixa_multicast_ifindex, conn_incoming_ifindex, and
 * conn_bound_if in place. We prefer dropping
 * packets instead of sending them out the wrong interface, or accepting
 * packets from the wrong ifindex.
 */
static void
conn_cleanup_ill(conn_t *connp, caddr_t arg)
{
	ill_t	*ill = (ill_t *)arg;

	mutex_enter(&connp->conn_lock);
	if (connp->conn_dhcpinit_ill == ill) {
		connp->conn_dhcpinit_ill = NULL;
		ASSERT(ill->ill_dhcpinit != 0);
		atomic_dec_32(&ill->ill_dhcpinit);
		ill_set_inputfn(ill);
	}
	mutex_exit(&connp->conn_lock);
}
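
/*
 * conn_cleanup_ill() is an ipcl_walk() callback; it is applied to every conn
 * from ill_down() below via ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst).
 */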

static int
ill_down_ipifs_tail(ill_t *ill)
{
	ipif_t	*ipif;
	int err;

	ASSERT(IAM_WRITER_ILL(ill));
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		ipif_non_duplicate(ipif);
		/*
		 * ipif_down_tail will call arp_ll_down on the last ipif
		 * and typically return EINPROGRESS when the DL_UNBIND is sent.
		 */
		if ((err = ipif_down_tail(ipif)) != 0)
			return (err);
	}
	return (0);
}

/* ARGSUSED */
void
ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
	ASSERT(IAM_WRITER_IPSQ(ipsq));
	(void) ill_down_ipifs_tail(q->q_ptr);
	freemsg(mp);
	ipsq_current_finish(ipsq);
}

/*
 * ill_down_start is called when we want to down this ill and bring it up
 * again. It is called when we receive an M_ERROR / M_HANGUP. In this case
 * we shut down all interfaces, but don't tear down any plumbing.
 */
boolean_t
ill_down_start(queue_t *q, mblk_t *mp)
{
	ill_t	*ill = q->q_ptr;
	ipif_t	*ipif;

	ASSERT(IAM_WRITER_ILL(ill));
	mutex_enter(&ill->ill_lock);
	ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS;
	/* no more nce addition allowed */
	mutex_exit(&ill->ill_lock);

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		(void) ipif_down(ipif, NULL, NULL);

	ill_down(ill);

	/*
	 * Walk all CONNs that can have a reference on an ire or nce for this
	 * ill (we actually walk all that now have stale references).
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ill->ill_ipst);

	/* With IPv6 we have dce_ifindex. Cleanup for neatness */
	if (ill->ill_isv6)
		dce_cleanup(ill->ill_phyint->phyint_ifindex, ill->ill_ipst);

	(void) ipsq_pending_mp_cleanup(ill, NULL);

	ipsq_current_start(ill->ill_phyint->phyint_ipsq, ill->ill_ipif, 0);

	/*
	 * Atomically test and add the pending mp if references are active.
	 */
	mutex_enter(&ill->ill_lock);
	if (!ill_is_quiescent(ill)) {
		/* call cannot fail since `conn_t *' argument is NULL */
		(void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
		    mp, ILL_DOWN);
		mutex_exit(&ill->ill_lock);
		return (B_FALSE);
	}
	mutex_exit(&ill->ill_lock);
	return (B_TRUE);
}

static void
ill_down(ill_t *ill)
{
	mblk_t	*mp;
	ip_stack_t	*ipst = ill->ill_ipst;

	/*
	 * Blow off any IREs dependent on this ILL.
	 * The caller needs to handle conn_ixa_cleanup.
	 */
	ill_delete_ires(ill);

	ire_walk_ill(0, 0, ill_downi, ill, ill);

	/* Remove any conn_*_ill depending on this ill */
	ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst);

	/*
	 * Free state for additional IREs.
	 */
	mutex_enter(&ill->ill_saved_ire_lock);
	mp = ill->ill_saved_ire_mp;
	ill->ill_saved_ire_mp = NULL;
	ill->ill_saved_ire_cnt = 0;
	mutex_exit(&ill->ill_saved_ire_lock);
	freemsg(mp);
}

/*
 * ire_walk routine used to delete every IRE that depends on
 * 'ill'. (Always called as writer.)
 *
 * Note: since the routes added by the kernel are deleted separately,
 * this will only be 1) IRE_IF_CLONE and 2) manually added IRE_INTERFACE.
 *
 * We also remove references on ire_nce_cache entries that refer to the ill.
 */
void
ill_downi(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;
	nce_t	*nce;

	mutex_enter(&ire->ire_lock);
	nce = ire->ire_nce_cache;
	if (nce != NULL && nce->nce_ill == ill)
		ire->ire_nce_cache = NULL;
	else
		nce = NULL;
	mutex_exit(&ire->ire_lock);
	if (nce != NULL)
		nce_refrele(nce);
	if (ire->ire_ill == ill)
		ire_delete(ire);
}

/* Remove IRE_IF_CLONE on this ill */
void
ill_downi_if_clone(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;

	ASSERT(ire->ire_type & IRE_IF_CLONE);
	if (ire->ire_ill == ill)
		ire_delete(ire);
}

/* Consume an M_IOCACK of the fastpath probe. */
void
ill_fastpath_ack(ill_t *ill, mblk_t *mp)
{
	mblk_t	*mp1 = mp;

	/*
	 * If this was the first attempt, turn on fastpath probing.
	 */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS)
		ill->ill_dlpi_fastpath_state = IDS_OK;
	mutex_exit(&ill->ill_lock);

	/* Free the M_IOCACK mblk, hold on to the data */
	mp = mp->b_cont;
	freeb(mp1);
	if (mp == NULL)
		return;
	if (mp->b_cont != NULL)
		nce_fastpath_update(ill, mp);
	else
		ip0dbg(("ill_fastpath_ack: no b_cont\n"));
	freemsg(mp);
}
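
/*
 * The fastpath handshake in outline: ill_fastpath_probe() below sends a
 * DL_IOC_HDR_INFO M_IOCTL whose payload is the dl_unitdata_req_t template
 * built by ill_dlur_gen(); a driver that supports fastpath replies with an
 * M_IOCACK carrying the prepended link-layer header, which
 * ill_fastpath_ack() above hands to nce_fastpath_update().
 */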

/*
 * Throw an M_IOCTL message downstream asking "do you know fastpath?"
 * The data portion of the request is a dl_unitdata_req_t template for
 * what we would send downstream in the absence of a fastpath confirmation.
 */
int
ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp)
{
	struct iocblk	*ioc;
	mblk_t	*mp;

	if (dlur_mp == NULL)
		return (EINVAL);

	mutex_enter(&ill->ill_lock);
	switch (ill->ill_dlpi_fastpath_state) {
	case IDS_FAILED:
		/*
		 * Driver NAKed the first fastpath ioctl - assume it doesn't
		 * support it.
		 */
		mutex_exit(&ill->ill_lock);
		return (ENOTSUP);
	case IDS_UNKNOWN:
		/* This is the first probe */
		ill->ill_dlpi_fastpath_state = IDS_INPROGRESS;
		break;
	default:
		break;
	}
	mutex_exit(&ill->ill_lock);

	if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL)
		return (EAGAIN);

	mp->b_cont = copyb(dlur_mp);
	if (mp->b_cont == NULL) {
		freeb(mp);
		return (EAGAIN);
	}

	ioc = (struct iocblk *)mp->b_rptr;
	ioc->ioc_count = msgdsize(mp->b_cont);

	DTRACE_PROBE3(ill__dlpi, char *, "ill_fastpath_probe",
	    char *, "DL_IOC_HDR_INFO", ill_t *, ill);
	putnext(ill->ill_wq, mp);
	return (0);
}

void
ill_capability_probe(ill_t *ill)
{
	mblk_t	*mp;

	ASSERT(IAM_WRITER_ILL(ill));

	if (ill->ill_dlpi_capab_state != IDCS_UNKNOWN &&
	    ill->ill_dlpi_capab_state != IDCS_FAILED)
		return;

	/*
	 * We are starting a new cycle of capability negotiation.
	 * Free up the capab reset messages of any previous incarnation.
	 * We will do a fresh allocation when we get the response to our probe.
	 */
	if (ill->ill_capab_reset_mp != NULL) {
		freemsg(ill->ill_capab_reset_mp);
		ill->ill_capab_reset_mp = NULL;
	}

	ip1dbg(("ill_capability_probe: starting capability negotiation\n"));

	mp = ip_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ);
	if (mp == NULL)
		return;

	ill_capability_send(ill, mp);
	ill->ill_dlpi_capab_state = IDCS_PROBE_SENT;
}
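
/*
 * Negotiation state in outline (as used in this file): ill_dlpi_capab_state
 * moves from IDCS_UNKNOWN to IDCS_PROBE_SENT when the probe above is sent,
 * and is presumably set to IDCS_OK once the DL_CAPABILITY_ACK has been
 * processed; ill_capability_reset() below moves it to IDCS_RENEG or
 * IDCS_RESET_SENT when capabilities are renegotiated or torn down.
 */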

void
ill_capability_reset(ill_t *ill, boolean_t reneg)
{
	ASSERT(IAM_WRITER_ILL(ill));

	if (ill->ill_dlpi_capab_state != IDCS_OK)
		return;

	ill->ill_dlpi_capab_state = reneg ? IDCS_RENEG : IDCS_RESET_SENT;

	ill_capability_send(ill, ill->ill_capab_reset_mp);
	ill->ill_capab_reset_mp = NULL;
	/*
	 * We turn off all capabilities except those pertaining to the direct
	 * function call capabilities (ILL_CAPAB_DLD*), which are turned off
	 * by the corresponding reset functions.
	 */
	ill->ill_capabilities &= ~(ILL_CAPAB_HCKSUM | ILL_CAPAB_ZEROCOPY);
}

static void
ill_capability_reset_alloc(ill_t *ill)
{
	mblk_t *mp;
	size_t	size = 0;
	int	err;
	dl_capability_req_t	*capb;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(ill->ill_capab_reset_mp == NULL);

	if (ILL_HCKSUM_CAPABLE(ill)) {
		size += sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_hcksum_t);
	}

	if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) {
		size += sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_zerocopy_t);
	}

	if (ill->ill_capabilities & ILL_CAPAB_DLD) {
		size += sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_dld_t);
	}

	mp = allocb_wait(size + sizeof (dl_capability_req_t), BPRI_MED,
	    STR_NOSIG, &err);

	mp->b_datap->db_type = M_PROTO;
	bzero(mp->b_rptr, size + sizeof (dl_capability_req_t));

	capb = (dl_capability_req_t *)mp->b_rptr;
	capb->dl_primitive = DL_CAPABILITY_REQ;
	capb->dl_sub_offset = sizeof (dl_capability_req_t);
	capb->dl_sub_length = size;

	mp->b_wptr += sizeof (dl_capability_req_t);

	/*
	 * Each handler fills in the corresponding dl_capability_sub_t
	 * inside the mblk.
	 */
	ill_capability_hcksum_reset_fill(ill, mp);
	ill_capability_zerocopy_reset_fill(ill, mp);
	ill_capability_dld_reset_fill(ill, mp);

	ill->ill_capab_reset_mp = mp;
}

static void
ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers)
{
	dl_capab_id_t *id_ic;
	uint_t sub_dl_cap = outers->dl_cap;
	dl_capability_sub_t *inners;
	uint8_t *capend;

	ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER);

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */

	capend = (uint8_t *)(outers + 1) + outers->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_id_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}

	id_ic = (dl_capab_id_t *)(outers + 1);

	if (outers->dl_length < sizeof (*id_ic) ||
	    (inners = &id_ic->id_subcap,
	    inners->dl_length > (outers->dl_length - sizeof (*inners)))) {
		cmn_err(CE_WARN, "ill_capability_id_ack: malformed "
		    "encapsulated capab type %d too long for mblk",
		    inners->dl_cap);
		return;
	}

	if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) {
		ip1dbg(("ill_capability_id_ack: mid token for capab type %d "
		    "isn't as expected; pass-thru module(s) detected, "
		    "discarding capability\n", inners->dl_cap));
		return;
	}

	/* Process the encapsulated sub-capability */
	ill_capability_dispatch(ill, mp, inners);
}

static void
ill_capability_dld_reset_fill(ill_t *ill, mblk_t *mp)
{
	dl_capability_sub_t *dl_subcap;

	if (!(ill->ill_capabilities & ILL_CAPAB_DLD))
		return;

	/*
	 * The dl_capab_dld_t that follows the dl_capability_sub_t is not
	 * initialized below since it is not used by DLD.
	 */
	dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
	dl_subcap->dl_cap = DL_CAPAB_DLD;
	dl_subcap->dl_length = sizeof (dl_capab_dld_t);

	mp->b_wptr += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t);
}

static void
ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp)
{
	/*
	 * If no ipif was brought up over this ill, this DL_CAPABILITY_REQ/ACK
	 * is only to get the VRRP capability.
	 *
	 * Note that we cannot check ill_ipif_up_count here since
	 * ill_ipif_up_count is only incremented when the resolver is setup.
	 * That is done asynchronously, and can race with this function.
	 */
	if (!ill->ill_dl_up) {
		if (subp->dl_cap == DL_CAPAB_VRRP)
			ill_capability_vrrp_ack(ill, mp, subp);
		return;
	}

	switch (subp->dl_cap) {
	case DL_CAPAB_HCKSUM:
		ill_capability_hcksum_ack(ill, mp, subp);
		break;
	case DL_CAPAB_ZEROCOPY:
		ill_capability_zerocopy_ack(ill, mp, subp);
		break;
	case DL_CAPAB_DLD:
		ill_capability_dld_ack(ill, mp, subp);
		break;
	case DL_CAPAB_VRRP:
		break;
	default:
		ip1dbg(("ill_capability_dispatch: unknown capab type %d\n",
		    subp->dl_cap));
	}
}

/*
 * Process the vrrp capability received from a DLS Provider. isub must point
 * to the sub-capability (DL_CAPAB_VRRP) of a DL_CAPABILITY_ACK message.
 */
static void
ill_capability_vrrp_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
	dl_capab_vrrp_t	*vrrp;
	uint_t		sub_dl_cap = isub->dl_cap;
	uint8_t		*capend;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(sub_dl_cap == DL_CAPAB_VRRP);

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */
	capend = (uint8_t *)(isub + 1) + isub->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_vrrp_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}
	vrrp = (dl_capab_vrrp_t *)(isub + 1);

	/*
	 * Compare the IP address family and set ILLF_VRRP for the right ill.
	 */
	if ((vrrp->vrrp_af == AF_INET6 && ill->ill_isv6) ||
	    (vrrp->vrrp_af == AF_INET && !ill->ill_isv6)) {
		ill->ill_flags |= ILLF_VRRP;
	}
}

/*
 * Process a hardware checksum offload capability negotiation ack received
 * from a DLS Provider. isub must point to the sub-capability
 * (DL_CAPAB_HCKSUM) of a DL_CAPABILITY_ACK message.
 */
static void
ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
	dl_capability_req_t	*ocap;
	dl_capab_hcksum_t	*ihck, *ohck;
	ill_hcksum_capab_t	**ill_hcksum;
	mblk_t			*nmp = NULL;
	uint_t			sub_dl_cap = isub->dl_cap;
	uint8_t			*capend;

	ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM);

	ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab;

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */
	capend = (uint8_t *)(isub + 1) + isub->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}

	/*
	 * There are two types of acks we process here:
	 * 1. acks in reply to a (first form) generic capability req
	 *    (no ENABLE flag set)
	 * 2. acks in reply to an ENABLE capability req.
	 *    (ENABLE flag set)
	 */
	ihck = (dl_capab_hcksum_t *)(isub + 1);

	if (ihck->hcksum_version != HCKSUM_VERSION_1) {
		cmn_err(CE_CONT, "ill_capability_hcksum_ack: "
		    "unsupported hardware checksum "
		    "sub-capability (version %d, expected %d)",
		    ihck->hcksum_version, HCKSUM_VERSION_1);
		return;
	}

	if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) {
		ip1dbg(("ill_capability_hcksum_ack: mid token for hardware "
		    "checksum capability isn't as expected; pass-thru "
		    "module(s) detected, discarding capability\n"));
		return;
	}

#define	CURR_HCKSUM_CAPAB				\
	(HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 |	\
	HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM)
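
	/*
	 * Illustrative flow: the ack to the initial probe has HCKSUM_ENABLE
	 * clear and merely advertises flags such as HCKSUM_INET_FULL_V4 |
	 * HCKSUM_IPHDRCKSUM, taking the "else if" branch below to send an
	 * ENABLE request; the ack to that request has HCKSUM_ENABLE set and
	 * takes the first branch, which records the negotiated flags.
	 */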

	if ((ihck->hcksum_txflags & HCKSUM_ENABLE) &&
	    (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) {
		/* do ENABLE processing */
		if (*ill_hcksum == NULL) {
			*ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t),
			    KM_NOSLEEP);

			if (*ill_hcksum == NULL) {
				cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
				    "could not enable hcksum version %d "
				    "for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION,
				    ill->ill_name);
				return;
			}
		}

		(*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version;
		(*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags;
		ill->ill_capabilities |= ILL_CAPAB_HCKSUM;
		ip1dbg(("ill_capability_hcksum_ack: interface %s "
		    "has enabled hardware checksumming\n ",
		    ill->ill_name));
	} else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) {
		/*
		 * Enabling hardware checksum offload. Currently IP
		 * supports {TCP,UDP}/IPv4 partial and full cksum
		 * offload and IPv4 header checksum offload. Allocate
		 * a new mblk which will contain a new capability
		 * request to enable hardware checksum offload.
		 */
		uint_t	size;
		uchar_t	*rptr;

		size = sizeof (dl_capability_req_t) +
		    sizeof (dl_capability_sub_t) + isub->dl_length;

		if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
			cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
			    "could not enable hardware cksum for %s (ENOMEM)\n",
			    ill->ill_name);
			return;
		}

		rptr = nmp->b_rptr;
		/* initialize dl_capability_req_t */
		ocap = (dl_capability_req_t *)nmp->b_rptr;
		ocap->dl_sub_offset =
		    sizeof (dl_capability_req_t);
		ocap->dl_sub_length =
		    sizeof (dl_capability_sub_t) +
		    isub->dl_length;
		nmp->b_rptr += sizeof (dl_capability_req_t);

		/* initialize dl_capability_sub_t */
		bcopy(isub, nmp->b_rptr, sizeof (*isub));
		nmp->b_rptr += sizeof (*isub);

		/* initialize dl_capab_hcksum_t */
		ohck = (dl_capab_hcksum_t *)nmp->b_rptr;
		bcopy(ihck, ohck, sizeof (*ihck));

		nmp->b_rptr = rptr;
		ASSERT(nmp->b_wptr == (nmp->b_rptr + size));

		/* Set ENABLE flag */
		ohck->hcksum_txflags &= CURR_HCKSUM_CAPAB;
		ohck->hcksum_txflags |= HCKSUM_ENABLE;

		/*
		 * nmp points to a DL_CAPABILITY_REQ message to enable
		 * hardware checksum acceleration.
		 */
		ill_capability_send(ill, nmp);
	} else {
		ip1dbg(("ill_capability_hcksum_ack: interface %s has "
		    "advertised %x hardware checksum capability flags\n",
		    ill->ill_name, ihck->hcksum_txflags));
	}
}

static void
ill_capability_hcksum_reset_fill(ill_t *ill, mblk_t *mp)
{
	dl_capab_hcksum_t *hck_subcap;
	dl_capability_sub_t *dl_subcap;

	if (!ILL_HCKSUM_CAPABLE(ill))
		return;

	ASSERT(ill->ill_hcksum_capab != NULL);

	dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
	dl_subcap->dl_cap = DL_CAPAB_HCKSUM;
	dl_subcap->dl_length = sizeof (*hck_subcap);

	hck_subcap = (dl_capab_hcksum_t *)(dl_subcap + 1);
	hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version;
	hck_subcap->hcksum_txflags = 0;

	mp->b_wptr += sizeof (*dl_subcap) + sizeof (*hck_subcap);
}

static void
ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
	mblk_t *nmp = NULL;
	dl_capability_req_t *oc;
	dl_capab_zerocopy_t *zc_ic, *zc_oc;
	ill_zerocopy_capab_t **ill_zerocopy_capab;
	uint_t sub_dl_cap = isub->dl_cap;
	uint8_t *capend;

	ASSERT(sub_dl_cap == DL_CAPAB_ZEROCOPY);

	ill_zerocopy_capab = (ill_zerocopy_capab_t **)&ill->ill_zerocopy_capab;

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */
	capend = (uint8_t *)(isub + 1) + isub->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}

	zc_ic = (dl_capab_zerocopy_t *)(isub + 1);
	if (zc_ic->zerocopy_version != ZEROCOPY_VERSION_1) {
		cmn_err(CE_CONT, "ill_capability_zerocopy_ack: "
		    "unsupported ZEROCOPY sub-capability (version %d, "
		    "expected %d)", zc_ic->zerocopy_version,
		    ZEROCOPY_VERSION_1);
		return;
	}

	if (!dlcapabcheckqid(&zc_ic->zerocopy_mid, ill->ill_lmod_rq)) {
		ip1dbg(("ill_capability_zerocopy_ack: mid token for zerocopy "
		    "capability isn't as expected; pass-thru module(s) "
		    "detected, discarding capability\n"));
		return;
	}

	if ((zc_ic->zerocopy_flags & DL_CAPAB_VMSAFE_MEM) != 0) {
		if (*ill_zerocopy_capab == NULL) {
			*ill_zerocopy_capab =
			    kmem_zalloc(sizeof (ill_zerocopy_capab_t),
			    KM_NOSLEEP);

			if (*ill_zerocopy_capab == NULL) {
				cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
				    "could not enable Zero-copy version %d "
				    "for %s (ENOMEM)\n", ZEROCOPY_VERSION_1,
				    ill->ill_name);
				return;
			}
		}

		ip1dbg(("ill_capability_zerocopy_ack: interface %s "
		    "supports Zero-copy version %d\n", ill->ill_name,
		    ZEROCOPY_VERSION_1));

		(*ill_zerocopy_capab)->ill_zerocopy_version =
		    zc_ic->zerocopy_version;
		(*ill_zerocopy_capab)->ill_zerocopy_flags =
		    zc_ic->zerocopy_flags;

		ill->ill_capabilities |= ILL_CAPAB_ZEROCOPY;
	} else {
		uint_t size;
		uchar_t *rptr;

		size = sizeof (dl_capability_req_t) +
		    sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_zerocopy_t);

		if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
			cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
			    "could not enable zerocopy for %s (ENOMEM)\n",
			    ill->ill_name);
			return;
		}

		rptr = nmp->b_rptr;
		/* initialize dl_capability_req_t */
		oc = (dl_capability_req_t *)rptr;
		oc->dl_sub_offset = sizeof (dl_capability_req_t);
		oc->dl_sub_length = sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_zerocopy_t);
		rptr += sizeof (dl_capability_req_t);

		/* initialize dl_capability_sub_t */
		bcopy(isub, rptr, sizeof (*isub));
		rptr += sizeof (*isub);

		/* initialize dl_capab_zerocopy_t */
		zc_oc = (dl_capab_zerocopy_t *)rptr;
		*zc_oc = *zc_ic;

		ip1dbg(("ill_capability_zerocopy_ack: asking interface %s "
		    "to enable zero-copy version %d\n", ill->ill_name,
		    ZEROCOPY_VERSION_1));

		/* set VMSAFE_MEM flag */
		zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM;

		/* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */
		ill_capability_send(ill, nmp);
	}
}
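
/*
 * As with ill_capability_hcksum_reset_fill() above, the reset variant below
 * appends a sub-capability with zeroed flags, so the reset request built in
 * ill_capability_reset_alloc() asks the driver to turn the feature off.
 */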
ill->ill_zerocopy_capab->ill_zerocopy_version; 1817 zerocopy_subcap->zerocopy_flags = 0; 1818 1819 mp->b_wptr += sizeof (*dl_subcap) + sizeof (*zerocopy_subcap); 1820 } 1821 1822 /* 1823 * DLD capability 1824 * Refer to dld.h for more information regarding the purpose and usage 1825 * of this capability. 1826 */ 1827 static void 1828 ill_capability_dld_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 1829 { 1830 dl_capab_dld_t *dld_ic, dld; 1831 uint_t sub_dl_cap = isub->dl_cap; 1832 uint8_t *capend; 1833 ill_dld_capab_t *idc; 1834 1835 ASSERT(IAM_WRITER_ILL(ill)); 1836 ASSERT(sub_dl_cap == DL_CAPAB_DLD); 1837 1838 /* 1839 * Note: range checks here are not absolutely sufficient to 1840 * make us robust against malformed messages sent by drivers; 1841 * this is in keeping with the rest of IP's dlpi handling. 1842 * (Remember, it's coming from something else in the kernel 1843 * address space) 1844 */ 1845 capend = (uint8_t *)(isub + 1) + isub->dl_length; 1846 if (capend > mp->b_wptr) { 1847 cmn_err(CE_WARN, "ill_capability_dld_ack: " 1848 "malformed sub-capability too long for mblk"); 1849 return; 1850 } 1851 dld_ic = (dl_capab_dld_t *)(isub + 1); 1852 if (dld_ic->dld_version != DLD_CURRENT_VERSION) { 1853 cmn_err(CE_CONT, "ill_capability_dld_ack: " 1854 "unsupported DLD sub-capability (version %d, " 1855 "expected %d)", dld_ic->dld_version, 1856 DLD_CURRENT_VERSION); 1857 return; 1858 } 1859 if (!dlcapabcheckqid(&dld_ic->dld_mid, ill->ill_lmod_rq)) { 1860 ip1dbg(("ill_capability_dld_ack: mid token for dld " 1861 "capability isn't as expected; pass-thru module(s) " 1862 "detected, discarding capability\n")); 1863 return; 1864 } 1865 1866 /* 1867 * Copy locally to ensure alignment. 1868 */ 1869 bcopy(dld_ic, &dld, sizeof (dl_capab_dld_t)); 1870 1871 if ((idc = ill->ill_dld_capab) == NULL) { 1872 idc = kmem_zalloc(sizeof (ill_dld_capab_t), KM_NOSLEEP); 1873 if (idc == NULL) { 1874 cmn_err(CE_WARN, "ill_capability_dld_ack: " 1875 "could not enable DLD version %d " 1876 "for %s (ENOMEM)\n", DLD_CURRENT_VERSION, 1877 ill->ill_name); 1878 return; 1879 } 1880 ill->ill_dld_capab = idc; 1881 } 1882 idc->idc_capab_df = (ip_capab_func_t)dld.dld_capab; 1883 idc->idc_capab_dh = (void *)dld.dld_capab_handle; 1884 ip1dbg(("ill_capability_dld_ack: interface %s " 1885 "supports DLD version %d\n", ill->ill_name, DLD_CURRENT_VERSION)); 1886 1887 ill_capability_dld_enable(ill); 1888 } 1889 1890 /* 1891 * Typically capability negotiation between IP and the driver happens via 1892 * DLPI message exchange. However GLD also offers a direct function call 1893 * mechanism to exchange the DLD_DIRECT_CAPAB and DLD_POLL_CAPAB capabilities, 1894 * But arbitrary function calls into IP or GLD are not permitted, since both 1895 * of them are protected by their own perimeter mechanism. The perimeter can 1896 * be viewed as a coarse lock or serialization mechanism. The hierarchy of 1897 * these perimeters is IP -> MAC. Thus for example to enable the squeue 1898 * polling, IP needs to enter its perimeter, then call ill_mac_perim_enter 1899 * to enter the mac perimeter and then do the direct function calls into 1900 * GLD to enable squeue polling. The ring related callbacks from the mac into 1901 * the stack to add, bind, quiesce, restart or cleanup a ring are all 1902 * protected by the mac perimeter. 
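 * As an illustrative sketch (not a verbatim call site; ill_capability_dld_enable() below is the real one), the enable path with both perimeters held is therefore: * * ill_mac_perim_enter(ill, &mph); * ill_capability_direct_enable(ill); (direct function calls into GLD) * ill_capability_poll_enable(ill); * ill_mac_perim_exit(ill, mph); * * with the caller already holding the IP perimeter as writer.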
1903 */ 1904 static void 1905 ill_mac_perim_enter(ill_t *ill, mac_perim_handle_t *mphp) 1906 { 1907 ill_dld_capab_t *idc = ill->ill_dld_capab; 1908 int err; 1909 1910 err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mphp, 1911 DLD_ENABLE); 1912 ASSERT(err == 0); 1913 } 1914 1915 static void 1916 ill_mac_perim_exit(ill_t *ill, mac_perim_handle_t mph) 1917 { 1918 ill_dld_capab_t *idc = ill->ill_dld_capab; 1919 int err; 1920 1921 err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mph, 1922 DLD_DISABLE); 1923 ASSERT(err == 0); 1924 } 1925 1926 boolean_t 1927 ill_mac_perim_held(ill_t *ill) 1928 { 1929 ill_dld_capab_t *idc = ill->ill_dld_capab; 1930 1931 return (idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, NULL, 1932 DLD_QUERY)); 1933 } 1934 1935 static void 1936 ill_capability_direct_enable(ill_t *ill) 1937 { 1938 ill_dld_capab_t *idc = ill->ill_dld_capab; 1939 ill_dld_direct_t *idd = &idc->idc_direct; 1940 dld_capab_direct_t direct; 1941 int rc; 1942 1943 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); 1944 1945 bzero(&direct, sizeof (direct)); 1946 direct.di_rx_cf = (uintptr_t)ip_input; 1947 direct.di_rx_ch = ill; 1948 1949 rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, &direct, 1950 DLD_ENABLE); 1951 if (rc == 0) { 1952 idd->idd_tx_df = (ip_dld_tx_t)direct.di_tx_df; 1953 idd->idd_tx_dh = direct.di_tx_dh; 1954 idd->idd_tx_cb_df = (ip_dld_callb_t)direct.di_tx_cb_df; 1955 idd->idd_tx_cb_dh = direct.di_tx_cb_dh; 1956 idd->idd_tx_fctl_df = (ip_dld_fctl_t)direct.di_tx_fctl_df; 1957 idd->idd_tx_fctl_dh = direct.di_tx_fctl_dh; 1958 ASSERT(idd->idd_tx_cb_df != NULL); 1959 ASSERT(idd->idd_tx_fctl_df != NULL); 1960 ASSERT(idd->idd_tx_df != NULL); 1961 /* 1962 * One time registration of flow enable callback function 1963 */ 1964 ill->ill_flownotify_mh = idd->idd_tx_cb_df(idd->idd_tx_cb_dh, 1965 ill_flow_enable, ill); 1966 ill->ill_capabilities |= ILL_CAPAB_DLD_DIRECT; 1967 DTRACE_PROBE1(direct_on, (ill_t *), ill); 1968 } else { 1969 cmn_err(CE_WARN, "warning: could not enable DIRECT " 1970 "capability, rc = %d\n", rc); 1971 DTRACE_PROBE2(direct_off, (ill_t *), ill, (int), rc); 1972 } 1973 } 1974 1975 static void 1976 ill_capability_poll_enable(ill_t *ill) 1977 { 1978 ill_dld_capab_t *idc = ill->ill_dld_capab; 1979 dld_capab_poll_t poll; 1980 int rc; 1981 1982 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); 1983 1984 bzero(&poll, sizeof (poll)); 1985 poll.poll_ring_add_cf = (uintptr_t)ip_squeue_add_ring; 1986 poll.poll_ring_remove_cf = (uintptr_t)ip_squeue_clean_ring; 1987 poll.poll_ring_quiesce_cf = (uintptr_t)ip_squeue_quiesce_ring; 1988 poll.poll_ring_restart_cf = (uintptr_t)ip_squeue_restart_ring; 1989 poll.poll_ring_bind_cf = (uintptr_t)ip_squeue_bind_ring; 1990 poll.poll_ring_ch = ill; 1991 rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, &poll, 1992 DLD_ENABLE); 1993 if (rc == 0) { 1994 ill->ill_capabilities |= ILL_CAPAB_DLD_POLL; 1995 DTRACE_PROBE1(poll_on, (ill_t *), ill); 1996 } else { 1997 ip1dbg(("warning: could not enable POLL " 1998 "capability, rc = %d\n", rc)); 1999 DTRACE_PROBE2(poll_off, (ill_t *), ill, (int), rc); 2000 } 2001 } 2002 2003 /* 2004 * Enable the LSO capability. 
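 * LSO (large send offload) lets the stack hand the driver a segment larger than the link MTU; the hardware then segments it on transmit. lso_flags below describes what the driver can segment and lso_max is the largest payload it accepts; both are cached in ill_lso_capab.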
2005 */ 2006 static void 2007 ill_capability_lso_enable(ill_t *ill) 2008 { 2009 ill_dld_capab_t *idc = ill->ill_dld_capab; 2010 dld_capab_lso_t lso; 2011 int rc; 2012 2013 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); 2014 2015 if (ill->ill_lso_capab == NULL) { 2016 ill->ill_lso_capab = kmem_zalloc(sizeof (ill_lso_capab_t), 2017 KM_NOSLEEP); 2018 if (ill->ill_lso_capab == NULL) { 2019 cmn_err(CE_WARN, "ill_capability_lso_enable: " 2020 "could not enable LSO for %s (ENOMEM)\n", 2021 ill->ill_name); 2022 return; 2023 } 2024 } 2025 2026 bzero(&lso, sizeof (lso)); 2027 if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, &lso, 2028 DLD_ENABLE)) == 0) { 2029 ill->ill_lso_capab->ill_lso_flags = lso.lso_flags; 2030 ill->ill_lso_capab->ill_lso_max = lso.lso_max; 2031 ill->ill_capabilities |= ILL_CAPAB_LSO; 2032 ip1dbg(("ill_capability_lso_enable: interface %s " 2033 "has enabled LSO\n ", ill->ill_name)); 2034 } else { 2035 kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t)); 2036 ill->ill_lso_capab = NULL; 2037 DTRACE_PROBE2(lso_off, (ill_t *), ill, (int), rc); 2038 } 2039 } 2040 2041 static void 2042 ill_capability_dld_enable(ill_t *ill) 2043 { 2044 mac_perim_handle_t mph; 2045 2046 ASSERT(IAM_WRITER_ILL(ill)); 2047 2048 if (ill->ill_isv6) 2049 return; 2050 2051 ill_mac_perim_enter(ill, &mph); 2052 if (!ill->ill_isv6) { 2053 ill_capability_direct_enable(ill); 2054 ill_capability_poll_enable(ill); 2055 ill_capability_lso_enable(ill); 2056 } 2057 ill->ill_capabilities |= ILL_CAPAB_DLD; 2058 ill_mac_perim_exit(ill, mph); 2059 } 2060 2061 static void 2062 ill_capability_dld_disable(ill_t *ill) 2063 { 2064 ill_dld_capab_t *idc; 2065 ill_dld_direct_t *idd; 2066 mac_perim_handle_t mph; 2067 2068 ASSERT(IAM_WRITER_ILL(ill)); 2069 2070 if (!(ill->ill_capabilities & ILL_CAPAB_DLD)) 2071 return; 2072 2073 ill_mac_perim_enter(ill, &mph); 2074 2075 idc = ill->ill_dld_capab; 2076 if ((ill->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0) { 2077 /* 2078 * For performance we avoid locks in the transmit data path 2079 * and don't maintain a count of the number of threads using 2080 * direct calls. Thus some threads could be using direct 2081 * transmit calls to GLD, even after the capability mechanism 2082 * turns it off. This is still safe since the handles used in 2083 * the direct calls continue to be valid until the unplumb is 2084 * completed. Remove the callback that was added (1-time) at 2085 * capab enable time. 2086 */ 2087 mutex_enter(&ill->ill_lock); 2088 ill->ill_capabilities &= ~ILL_CAPAB_DLD_DIRECT; 2089 mutex_exit(&ill->ill_lock); 2090 if (ill->ill_flownotify_mh != NULL) { 2091 idd = &idc->idc_direct; 2092 idd->idd_tx_cb_df(idd->idd_tx_cb_dh, NULL, 2093 ill->ill_flownotify_mh); 2094 ill->ill_flownotify_mh = NULL; 2095 } 2096 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, 2097 NULL, DLD_DISABLE); 2098 } 2099 2100 if ((ill->ill_capabilities & ILL_CAPAB_DLD_POLL) != 0) { 2101 ill->ill_capabilities &= ~ILL_CAPAB_DLD_POLL; 2102 ip_squeue_clean_all(ill); 2103 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, 2104 NULL, DLD_DISABLE); 2105 } 2106 2107 if ((ill->ill_capabilities & ILL_CAPAB_LSO) != 0) { 2108 ASSERT(ill->ill_lso_capab != NULL); 2109 /* 2110 * Clear the capability flag for LSO but retain the 2111 * ill_lso_capab structure since it's possible that another 2112 * thread is still referring to it. The structure only gets 2113 * deallocated when we destroy the ill. 
2114 */ 2115 2116 ill->ill_capabilities &= ~ILL_CAPAB_LSO; 2117 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, 2118 NULL, DLD_DISABLE); 2119 } 2120 2121 ill->ill_capabilities &= ~ILL_CAPAB_DLD; 2122 ill_mac_perim_exit(ill, mph); 2123 } 2124 2125 /* 2126 * Capability Negotiation protocol 2127 * 2128 * We don't wait for DLPI capability operations to finish during interface 2129 * bringup or teardown. Doing so would introduce more asynchrony and the 2130 * interface up/down operations will need multiple return and restarts. 2131 * Instead the 'ipsq_current_ipif' of the ipsq is not cleared as long as 2132 * the 'ill_dlpi_deferred' chain is non-empty. This ensures that the next 2133 * exclusive operation won't start until the DLPI operations of the previous 2134 * exclusive operation complete. 2135 * 2136 * The capability state machine is shown below. 2137 * 2138 * state next state event, action 2139 * 2140 * IDCS_UNKNOWN IDCS_PROBE_SENT ill_capability_probe 2141 * IDCS_PROBE_SENT IDCS_OK ill_capability_ack 2142 * IDCS_PROBE_SENT IDCS_FAILED ip_rput_dlpi_writer (nack) 2143 * IDCS_OK IDCS_RENEG Receipt of DL_NOTE_CAPAB_RENEG 2144 * IDCS_OK IDCS_RESET_SENT ill_capability_reset 2145 * IDCS_RESET_SENT IDCS_UNKNOWN ill_capability_ack_thr 2146 * IDCS_RENEG IDCS_PROBE_SENT ill_capability_ack_thr -> 2147 * ill_capability_probe. 2148 */ 2149 2150 /* 2151 * Dedicated thread started from ip_stack_init that handles capability 2152 * disable. This thread ensures the taskq dispatch does not fail by waiting 2153 * for resources using TQ_SLEEP. The taskq mechanism is used to ensure 2154 * that direct calls to DLD are done in a cv_waitable context. 2155 */ 2156 void 2157 ill_taskq_dispatch(ip_stack_t *ipst) 2158 { 2159 callb_cpr_t cprinfo; 2160 char name[64]; 2161 mblk_t *mp; 2162 2163 (void) snprintf(name, sizeof (name), "ill_taskq_dispatch_%d", 2164 ipst->ips_netstack->netstack_stackid); 2165 CALLB_CPR_INIT(&cprinfo, &ipst->ips_capab_taskq_lock, callb_generic_cpr, 2166 name); 2167 mutex_enter(&ipst->ips_capab_taskq_lock); 2168 2169 for (;;) { 2170 mp = ipst->ips_capab_taskq_head; 2171 while (mp != NULL) { 2172 ipst->ips_capab_taskq_head = mp->b_next; 2173 if (ipst->ips_capab_taskq_head == NULL) 2174 ipst->ips_capab_taskq_tail = NULL; 2175 mutex_exit(&ipst->ips_capab_taskq_lock); 2176 mp->b_next = NULL; 2177 2178 VERIFY(taskq_dispatch(system_taskq, 2179 ill_capability_ack_thr, mp, TQ_SLEEP) != 0); 2180 mutex_enter(&ipst->ips_capab_taskq_lock); 2181 mp = ipst->ips_capab_taskq_head; 2182 } 2183 2184 if (ipst->ips_capab_taskq_quit) 2185 break; 2186 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2187 cv_wait(&ipst->ips_capab_taskq_cv, &ipst->ips_capab_taskq_lock); 2188 CALLB_CPR_SAFE_END(&cprinfo, &ipst->ips_capab_taskq_lock); 2189 } 2190 VERIFY(ipst->ips_capab_taskq_head == NULL); 2191 VERIFY(ipst->ips_capab_taskq_tail == NULL); 2192 CALLB_CPR_EXIT(&cprinfo); 2193 thread_exit(); 2194 } 2195 2196 /* 2197 * Consume a new-style hardware capabilities negotiation ack. 2198 * Called via taskq on receipt of DL_CAPABILITY_ACK. 
2199 */ 2200 static void 2201 ill_capability_ack_thr(void *arg) 2202 { 2203 mblk_t *mp = arg; 2204 dl_capability_ack_t *capp; 2205 dl_capability_sub_t *subp, *endp; 2206 ill_t *ill; 2207 boolean_t reneg; 2208 2209 ill = (ill_t *)mp->b_prev; 2210 mp->b_prev = NULL; 2211 2212 VERIFY(ipsq_enter(ill, B_FALSE, CUR_OP) == B_TRUE); 2213 2214 if (ill->ill_dlpi_capab_state == IDCS_RESET_SENT || 2215 ill->ill_dlpi_capab_state == IDCS_RENEG) { 2216 /* 2217 * We have received the ack for our DL_CAPAB reset request. 2218 * There isn't anything in the message that needs processing. 2219 * All message based capabilities have been disabled; now 2220 * do the function call based capability disable. 2221 */ 2222 reneg = ill->ill_dlpi_capab_state == IDCS_RENEG; 2223 ill_capability_dld_disable(ill); 2224 ill->ill_dlpi_capab_state = IDCS_UNKNOWN; 2225 if (reneg) 2226 ill_capability_probe(ill); 2227 goto done; 2228 } 2229 2230 if (ill->ill_dlpi_capab_state == IDCS_PROBE_SENT) 2231 ill->ill_dlpi_capab_state = IDCS_OK; 2232 2233 capp = (dl_capability_ack_t *)mp->b_rptr; 2234 2235 if (capp->dl_sub_length == 0) { 2236 /* no new-style capabilities */ 2237 goto done; 2238 } 2239 2240 /* make sure the driver supplied a correct dl_sub_length */ 2241 if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) { 2242 ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, " 2243 "invalid dl_sub_length (%d)\n", capp->dl_sub_length)); 2244 goto done; 2245 } 2246 2247 #define SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset)) 2248 /* 2249 * There are sub-capabilities. Process the ones we know about. 2250 * Loop until we don't have room for another sub-cap header. 2251 */ 2252 for (subp = SC(capp, capp->dl_sub_offset), 2253 endp = SC(subp, capp->dl_sub_length - sizeof (*subp)); 2254 subp <= endp; 2255 subp = SC(subp, sizeof (dl_capability_sub_t) + subp->dl_length)) { 2256 2257 switch (subp->dl_cap) { 2258 case DL_CAPAB_ID_WRAPPER: 2259 ill_capability_id_ack(ill, mp, subp); 2260 break; 2261 default: 2262 ill_capability_dispatch(ill, mp, subp); 2263 break; 2264 } 2265 } 2266 #undef SC 2267 done: 2268 inet_freemsg(mp); 2269 ill_capability_done(ill); 2270 ipsq_exit(ill->ill_phyint->phyint_ipsq); 2271 } 2272 2273 /* 2274 * This needs to be started in a taskq thread to provide a cv_waitable 2275 * context. 2276 */ 2277 void 2278 ill_capability_ack(ill_t *ill, mblk_t *mp) 2279 { 2280 ip_stack_t *ipst = ill->ill_ipst; 2281 2282 mp->b_prev = (mblk_t *)ill; 2283 ASSERT(mp->b_next == NULL); 2284 2285 if (taskq_dispatch(system_taskq, ill_capability_ack_thr, mp, 2286 TQ_NOSLEEP) != 0) 2287 return; 2288 2289 /* 2290 * The taskq dispatch failed. Signal the ill_taskq_dispatch thread 2291 * which will do the dispatch using TQ_SLEEP to guarantee success. 2292 */ 2293 mutex_enter(&ipst->ips_capab_taskq_lock); 2294 if (ipst->ips_capab_taskq_head == NULL) { 2295 ASSERT(ipst->ips_capab_taskq_tail == NULL); 2296 ipst->ips_capab_taskq_head = mp; 2297 } else { 2298 ipst->ips_capab_taskq_tail->b_next = mp; 2299 } 2300 ipst->ips_capab_taskq_tail = mp; 2301 2302 cv_signal(&ipst->ips_capab_taskq_cv); 2303 mutex_exit(&ipst->ips_capab_taskq_lock); 2304 } 2305 2306 /* 2307 * This routine is called to scan the fragmentation reassembly table for 2308 * the specified ILL for any packets that are starting to smell. 2309 * dead_interval is the maximum time in seconds that will be tolerated. It 2310 * will either be the value specified in ip_g_frag_timeout, or zero if the 2311 * ILL is shutting down and it is time to blow everything off.
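 * For example, with a dead_interval of 60 seconds, a fragment queue whose oldest fragment arrived 45 seconds ago has 15 seconds left (15 then feeds into next_timeout below), while one whose fragments arrived 75 seconds ago is reaped immediately.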
2312 * 2313 * It returns the number of seconds (as a time_t) that the next frag timer 2314 * should be scheduled for, 0 meaning that the timer doesn't need to be 2315 * re-started. Note that the method of calculating next_timeout isn't 2316 * entirely accurate since time will flow between the time we grab 2317 * current_time and the time we schedule the next timeout. This isn't a 2318 * big problem since this is the timer for sending an ICMP reassembly time 2319 * exceeded messages, and it doesn't have to be exactly accurate. 2320 * 2321 * This function is 2322 * sometimes called as writer, although this is not required. 2323 */ 2324 time_t 2325 ill_frag_timeout(ill_t *ill, time_t dead_interval) 2326 { 2327 ipfb_t *ipfb; 2328 ipfb_t *endp; 2329 ipf_t *ipf; 2330 ipf_t *ipfnext; 2331 mblk_t *mp; 2332 time_t current_time = gethrestime_sec(); 2333 time_t next_timeout = 0; 2334 uint32_t hdr_length; 2335 mblk_t *send_icmp_head; 2336 mblk_t *send_icmp_head_v6; 2337 ip_stack_t *ipst = ill->ill_ipst; 2338 ip_recv_attr_t iras; 2339 2340 bzero(&iras, sizeof (iras)); 2341 iras.ira_flags = 0; 2342 iras.ira_ill = iras.ira_rill = ill; 2343 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; 2344 iras.ira_rifindex = iras.ira_ruifindex; 2345 2346 ipfb = ill->ill_frag_hash_tbl; 2347 if (ipfb == NULL) 2348 return (B_FALSE); 2349 endp = &ipfb[ILL_FRAG_HASH_TBL_COUNT]; 2350 /* Walk the frag hash table. */ 2351 for (; ipfb < endp; ipfb++) { 2352 send_icmp_head = NULL; 2353 send_icmp_head_v6 = NULL; 2354 mutex_enter(&ipfb->ipfb_lock); 2355 while ((ipf = ipfb->ipfb_ipf) != 0) { 2356 time_t frag_time = current_time - ipf->ipf_timestamp; 2357 time_t frag_timeout; 2358 2359 if (frag_time < dead_interval) { 2360 /* 2361 * There are some outstanding fragments 2362 * that will timeout later. Make note of 2363 * the time so that we can reschedule the 2364 * next timeout appropriately. 2365 */ 2366 frag_timeout = dead_interval - frag_time; 2367 if (next_timeout == 0 || 2368 frag_timeout < next_timeout) { 2369 next_timeout = frag_timeout; 2370 } 2371 break; 2372 } 2373 /* Time's up. Get it out of here. */ 2374 hdr_length = ipf->ipf_nf_hdr_len; 2375 ipfnext = ipf->ipf_hash_next; 2376 if (ipfnext) 2377 ipfnext->ipf_ptphn = ipf->ipf_ptphn; 2378 *ipf->ipf_ptphn = ipfnext; 2379 mp = ipf->ipf_mp->b_cont; 2380 for (; mp; mp = mp->b_cont) { 2381 /* Extra points for neatness. */ 2382 IP_REASS_SET_START(mp, 0); 2383 IP_REASS_SET_END(mp, 0); 2384 } 2385 mp = ipf->ipf_mp->b_cont; 2386 atomic_add_32(&ill->ill_frag_count, -ipf->ipf_count); 2387 ASSERT(ipfb->ipfb_count >= ipf->ipf_count); 2388 ipfb->ipfb_count -= ipf->ipf_count; 2389 ASSERT(ipfb->ipfb_frag_pkts > 0); 2390 ipfb->ipfb_frag_pkts--; 2391 /* 2392 * We do not send any icmp message from here because 2393 * we currently are holding the ipfb_lock for this 2394 * hash chain. If we try and send any icmp messages 2395 * from here we may end up via a put back into ip 2396 * trying to get the same lock, causing a recursive 2397 * mutex panic. Instead we build a list and send all 2398 * the icmp messages after we have dropped the lock. 
2399 */ 2400 if (ill->ill_isv6) { 2401 if (hdr_length != 0) { 2402 mp->b_next = send_icmp_head_v6; 2403 send_icmp_head_v6 = mp; 2404 } else { 2405 freemsg(mp); 2406 } 2407 } else { 2408 if (hdr_length != 0) { 2409 mp->b_next = send_icmp_head; 2410 send_icmp_head = mp; 2411 } else { 2412 freemsg(mp); 2413 } 2414 } 2415 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); 2416 ip_drop_input("ipIfStatsReasmFails", ipf->ipf_mp, ill); 2417 freeb(ipf->ipf_mp); 2418 } 2419 mutex_exit(&ipfb->ipfb_lock); 2420 /* 2421 * Now need to send any icmp messages that we delayed from 2422 * above. 2423 */ 2424 while (send_icmp_head_v6 != NULL) { 2425 ip6_t *ip6h; 2426 2427 mp = send_icmp_head_v6; 2428 send_icmp_head_v6 = send_icmp_head_v6->b_next; 2429 mp->b_next = NULL; 2430 ip6h = (ip6_t *)mp->b_rptr; 2431 iras.ira_flags = 0; 2432 /* 2433 * This will result in an incorrect ALL_ZONES zoneid 2434 * for multicast packets, but we 2435 * don't send ICMP errors for those in any case. 2436 */ 2437 iras.ira_zoneid = 2438 ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst, 2439 ill, ipst); 2440 ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill); 2441 icmp_time_exceeded_v6(mp, 2442 ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE, 2443 &iras); 2444 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 2445 } 2446 while (send_icmp_head != NULL) { 2447 ipaddr_t dst; 2448 2449 mp = send_icmp_head; 2450 send_icmp_head = send_icmp_head->b_next; 2451 mp->b_next = NULL; 2452 2453 dst = ((ipha_t *)mp->b_rptr)->ipha_dst; 2454 2455 iras.ira_flags = IRAF_IS_IPV4; 2456 /* 2457 * This will result in an incorrect ALL_ZONES zoneid 2458 * for broadcast and multicast packets, but we 2459 * don't send ICMP errors for those in any case. 2460 */ 2461 iras.ira_zoneid = ipif_lookup_addr_zoneid(dst, 2462 ill, ipst); 2463 ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill); 2464 icmp_time_exceeded(mp, 2465 ICMP_REASSEMBLY_TIME_EXCEEDED, &iras); 2466 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 2467 } 2468 } 2469 /* 2470 * A non-dying ILL will use the return value to decide whether to 2471 * restart the frag timer, and for how long. 2472 */ 2473 return (next_timeout); 2474 } 2475 2476 /* 2477 * This routine is called when the approximate count of mblk memory used 2478 * for the specified ILL has exceeded max_count. 2479 */ 2480 void 2481 ill_frag_prune(ill_t *ill, uint_t max_count) 2482 { 2483 ipfb_t *ipfb; 2484 ipf_t *ipf; 2485 size_t count; 2486 clock_t now; 2487 2488 /* 2489 * If we are here within ip_min_frag_prune_time msecs remove 2490 * ill_frag_free_num_pkts oldest packets from each bucket and increment 2491 * ill_frag_free_num_pkts. 2492 */ 2493 mutex_enter(&ill->ill_lock); 2494 now = ddi_get_lbolt(); 2495 if (TICK_TO_MSEC(now - ill->ill_last_frag_clean_time) <= 2496 (ip_min_frag_prune_time != 0 ? 2497 ip_min_frag_prune_time : msec_per_tick)) { 2498 2499 ill->ill_frag_free_num_pkts++; 2500 2501 } else { 2502 ill->ill_frag_free_num_pkts = 0; 2503 } 2504 ill->ill_last_frag_clean_time = now; 2505 mutex_exit(&ill->ill_lock); 2506 2507 /* 2508 * free ill_frag_free_num_pkts oldest packets from each bucket. 
2509 */ 2510 if (ill->ill_frag_free_num_pkts != 0) { 2511 int ix; 2512 2513 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 2514 ipfb = &ill->ill_frag_hash_tbl[ix]; 2515 mutex_enter(&ipfb->ipfb_lock); 2516 if (ipfb->ipfb_ipf != NULL) { 2517 ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 2518 ill->ill_frag_free_num_pkts); 2519 } 2520 mutex_exit(&ipfb->ipfb_lock); 2521 } 2522 } 2523 /* 2524 * While the reassembly list for this ILL is too big, prune a fragment 2525 * queue by age, oldest first. 2526 */ 2527 while (ill->ill_frag_count > max_count) { 2528 int ix; 2529 ipfb_t *oipfb = NULL; 2530 uint_t oldest = UINT_MAX; 2531 2532 count = 0; 2533 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 2534 ipfb = &ill->ill_frag_hash_tbl[ix]; 2535 mutex_enter(&ipfb->ipfb_lock); 2536 ipf = ipfb->ipfb_ipf; 2537 if (ipf != NULL && ipf->ipf_gen < oldest) { 2538 oldest = ipf->ipf_gen; 2539 oipfb = ipfb; 2540 } 2541 count += ipfb->ipfb_count; 2542 mutex_exit(&ipfb->ipfb_lock); 2543 } 2544 if (oipfb == NULL) 2545 break; 2546 2547 if (count <= max_count) 2548 return; /* Somebody beat us to it, nothing to do */ 2549 mutex_enter(&oipfb->ipfb_lock); 2550 ipf = oipfb->ipfb_ipf; 2551 if (ipf != NULL) { 2552 ill_frag_free_pkts(ill, oipfb, ipf, 1); 2553 } 2554 mutex_exit(&oipfb->ipfb_lock); 2555 } 2556 } 2557 2558 /* 2559 * free 'free_cnt' fragmented packets starting at ipf. 2560 */ 2561 void 2562 ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt) 2563 { 2564 size_t count; 2565 mblk_t *mp; 2566 mblk_t *tmp; 2567 ipf_t **ipfp = ipf->ipf_ptphn; 2568 2569 ASSERT(MUTEX_HELD(&ipfb->ipfb_lock)); 2570 ASSERT(ipfp != NULL); 2571 ASSERT(ipf != NULL); 2572 2573 while (ipf != NULL && free_cnt-- > 0) { 2574 count = ipf->ipf_count; 2575 mp = ipf->ipf_mp; 2576 ipf = ipf->ipf_hash_next; 2577 for (tmp = mp; tmp; tmp = tmp->b_cont) { 2578 IP_REASS_SET_START(tmp, 0); 2579 IP_REASS_SET_END(tmp, 0); 2580 } 2581 atomic_add_32(&ill->ill_frag_count, -count); 2582 ASSERT(ipfb->ipfb_count >= count); 2583 ipfb->ipfb_count -= count; 2584 ASSERT(ipfb->ipfb_frag_pkts > 0); 2585 ipfb->ipfb_frag_pkts--; 2586 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); 2587 ip_drop_input("ipIfStatsReasmFails", mp, ill); 2588 freemsg(mp); 2589 } 2590 2591 if (ipf) 2592 ipf->ipf_ptphn = ipfp; 2593 ipfp[0] = ipf; 2594 } 2595 2596 #define ND_FORWARD_WARNING "The <if>:ip*_forwarding ndd variables are " \ 2597 "obsolete and may be removed in a future release of Solaris. Use " \ 2598 "ifconfig(1M) to manipulate the forwarding status of an interface." 2599 2600 /* 2601 * For obsolete per-interface forwarding configuration; 2602 * called in response to ND_GET. 2603 */ 2604 /* ARGSUSED */ 2605 static int 2606 nd_ill_forward_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr) 2607 { 2608 ill_t *ill = (ill_t *)cp; 2609 2610 cmn_err(CE_WARN, ND_FORWARD_WARNING); 2611 2612 (void) mi_mpprintf(mp, "%d", (ill->ill_flags & ILLF_ROUTER) != 0); 2613 return (0); 2614 } 2615 2616 /* 2617 * For obsolete per-interface forwarding configuration; 2618 * called in response to ND_SET. 
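 * Historically such a request came from something like "ndd -set /dev/ip hme0:ip_forwarding 1" (an illustrative name; the exact variable name is composed in ill_set_ndd_name() below).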
2619 */ 2620 /* ARGSUSED */ 2621 static int 2622 nd_ill_forward_set(queue_t *q, mblk_t *mp, char *valuestr, caddr_t cp, 2623 cred_t *ioc_cr) 2624 { 2625 long value; 2626 int retval; 2627 ip_stack_t *ipst = CONNQ_TO_IPST(q); 2628 2629 cmn_err(CE_WARN, ND_FORWARD_WARNING); 2630 2631 if (ddi_strtol(valuestr, NULL, 10, &value) != 0 || 2632 value < 0 || value > 1) { 2633 return (EINVAL); 2634 } 2635 2636 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 2637 retval = ill_forward_set((ill_t *)cp, (value != 0)); 2638 rw_exit(&ipst->ips_ill_g_lock); 2639 return (retval); 2640 } 2641 2642 /* 2643 * Helper function for ill_forward_set(). 2644 */ 2645 static void 2646 ill_forward_set_on_ill(ill_t *ill, boolean_t enable) 2647 { 2648 ip_stack_t *ipst = ill->ill_ipst; 2649 2650 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock)); 2651 2652 ip1dbg(("ill_forward_set: %s %s forwarding on %s", 2653 (enable ? "Enabling" : "Disabling"), 2654 (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name)); 2655 mutex_enter(&ill->ill_lock); 2656 if (enable) 2657 ill->ill_flags |= ILLF_ROUTER; 2658 else 2659 ill->ill_flags &= ~ILLF_ROUTER; 2660 mutex_exit(&ill->ill_lock); 2661 if (ill->ill_isv6) 2662 ill_set_nce_router_flags(ill, enable); 2663 /* Notify routing socket listeners of this change. */ 2664 if (ill->ill_ipif != NULL) 2665 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); 2666 } 2667 2668 /* 2669 * Set an ill's ILLF_ROUTER flag appropriately. Send up RTS_IFINFO routing 2670 * socket messages for each interface whose flags we change. 2671 */ 2672 int 2673 ill_forward_set(ill_t *ill, boolean_t enable) 2674 { 2675 ipmp_illgrp_t *illg; 2676 ip_stack_t *ipst = ill->ill_ipst; 2677 2678 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock)); 2679 2680 if ((enable && (ill->ill_flags & ILLF_ROUTER)) || 2681 (!enable && !(ill->ill_flags & ILLF_ROUTER))) 2682 return (0); 2683 2684 if (IS_LOOPBACK(ill)) 2685 return (EINVAL); 2686 2687 if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) { 2688 /* 2689 * Update all of the interfaces in the group. 2690 */ 2691 illg = ill->ill_grp; 2692 ill = list_head(&illg->ig_if); 2693 for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) 2694 ill_forward_set_on_ill(ill, enable); 2695 2696 /* 2697 * Update the IPMP meta-interface. 2698 */ 2699 ill_forward_set_on_ill(ipmp_illgrp_ipmp_ill(illg), enable); 2700 return (0); 2701 } 2702 2703 ill_forward_set_on_ill(ill, enable); 2704 return (0); 2705 } 2706 2707 /* 2708 * Based on the ILLF_ROUTER flag of an ill, make sure all local nce's for 2709 * addresses assigned to the ill have the NCE_F_ISROUTER flag appropriately 2710 * set or clear. 2711 */ 2712 static void 2713 ill_set_nce_router_flags(ill_t *ill, boolean_t enable) 2714 { 2715 ipif_t *ipif; 2716 ncec_t *ncec; 2717 nce_t *nce; 2718 2719 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 2720 /* 2721 * NOTE: we match across the illgrp because nce's for 2722 * addresses on IPMP interfaces have an nce_ill that points to 2723 * the bound underlying ill. 2724 */ 2725 nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr); 2726 if (nce != NULL) { 2727 ncec = nce->nce_common; 2728 mutex_enter(&ncec->ncec_lock); 2729 if (enable) 2730 ncec->ncec_flags |= NCE_F_ISROUTER; 2731 else 2732 ncec->ncec_flags &= ~NCE_F_ISROUTER; 2733 mutex_exit(&ncec->ncec_lock); 2734 nce_refrele(nce); 2735 } 2736 } 2737 } 2738 2739 /* 2740 * Given an ill with a _valid_ name, add the ip_forwarding ndd variable 2741 * for this ill. Make sure the v6/v4 question has been answered about this 2742 * ill. 
The creation of this ndd variable is only for backwards compatibility. 2743 * The preferred way to control per-interface IP forwarding is through the 2744 * ILLF_ROUTER interface flag. 2745 */ 2746 static int 2747 ill_set_ndd_name(ill_t *ill) 2748 { 2749 char *suffix; 2750 ip_stack_t *ipst = ill->ill_ipst; 2751 2752 ASSERT(IAM_WRITER_ILL(ill)); 2753 2754 if (ill->ill_isv6) 2755 suffix = ipv6_forward_suffix; 2756 else 2757 suffix = ipv4_forward_suffix; 2758 2759 ill->ill_ndd_name = ill->ill_name + ill->ill_name_length; 2760 bcopy(ill->ill_name, ill->ill_ndd_name, ill->ill_name_length - 1); 2761 /* 2762 * Copies over the '\0'. 2763 * Note that strlen(suffix) is always bounded. 2764 */ 2765 bcopy(suffix, ill->ill_ndd_name + ill->ill_name_length - 1, 2766 strlen(suffix) + 1); 2767 2768 /* 2769 * Use of the nd table requires holding the reader lock. 2770 * Modifying the nd table through nd_load/nd_unload requires 2771 * the writer lock. 2772 */ 2773 rw_enter(&ipst->ips_ip_g_nd_lock, RW_WRITER); 2774 if (!nd_load(&ipst->ips_ip_g_nd, ill->ill_ndd_name, nd_ill_forward_get, 2775 nd_ill_forward_set, (caddr_t)ill)) { 2776 /* 2777 * If the nd_load failed, it only means that it could not 2778 * allocate more room for further NDD expansion. 2779 * Because of that, the ill_ndd_name will be set to 0, and 2780 * this interface is at the mercy of the global ip_forwarding 2781 * variable. 2782 */ 2783 rw_exit(&ipst->ips_ip_g_nd_lock); 2784 ill->ill_ndd_name = NULL; 2785 return (ENOMEM); 2786 } 2787 rw_exit(&ipst->ips_ip_g_nd_lock); 2788 return (0); 2789 } 2790 2791 /* 2792 * Initializes the context structure and returns the first ill in the list. 2793 * Currently start_list and end_list can have the values: 2794 * MAX_G_HEADS Traverse both IPV4 and IPV6 lists. 2795 * IP_V4_G_HEAD Traverse IPV4 list only. 2796 * IP_V6_G_HEAD Traverse IPV6 list only. 2797 */ 2798 2799 /* 2800 * We don't check for CONDEMNED ills here. Caller must do that if 2801 * necessary under the ill lock. 2802 */ 2803 ill_t * 2804 ill_first(int start_list, int end_list, ill_walk_context_t *ctx, 2805 ip_stack_t *ipst) 2806 { 2807 ill_if_t *ifp; 2808 ill_t *ill; 2809 avl_tree_t *avl_tree; 2810 2811 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 2812 ASSERT(end_list <= MAX_G_HEADS && start_list >= 0); 2813 2814 /* 2815 * setup the lists to search 2816 */ 2817 if (end_list != MAX_G_HEADS) { 2818 ctx->ctx_current_list = start_list; 2819 ctx->ctx_last_list = end_list; 2820 } else { 2821 ctx->ctx_last_list = MAX_G_HEADS - 1; 2822 ctx->ctx_current_list = 0; 2823 } 2824 2825 while (ctx->ctx_current_list <= ctx->ctx_last_list) { 2826 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst); 2827 if (ifp != (ill_if_t *) 2828 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) { 2829 avl_tree = &ifp->illif_avl_by_ppa; 2830 ill = avl_first(avl_tree); 2831 /* 2832 * ill is guaranteed to be non-NULL, or ifp would 2833 * not have existed. 2834 */ 2835 ASSERT(ill != NULL); 2836 return (ill); 2837 } 2838 ctx->ctx_current_list++; 2839 } 2840 2841 return (NULL); 2842 } 2843 2844 /* 2845 * returns the next ill in the list. ill_first() must have been called 2846 * before calling ill_next() or bad things will happen. 2847 */ 2848 2849 /* 2850 * We don't check for CONDEMNED ills here. Caller must do that if 2851 * necessary under the ill lock.
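 * A typical walk over the IPv4 ills, sketched (the caller supplies the locking, as asserted in ill_first()): * * rw_enter(&ipst->ips_ill_g_lock, RW_READER); * for (ill = ill_first(IP_V4_G_HEAD, IP_V4_G_HEAD, &ctx, ipst); * ill != NULL; ill = ill_next(&ctx, ill)) * ...; * rw_exit(&ipst->ips_ill_g_lock);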
2852 */ 2853 ill_t * 2854 ill_next(ill_walk_context_t *ctx, ill_t *lastill) 2855 { 2856 ill_if_t *ifp; 2857 ill_t *ill; 2858 ip_stack_t *ipst = lastill->ill_ipst; 2859 2860 ASSERT(lastill->ill_ifptr != (ill_if_t *) 2861 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)); 2862 if ((ill = avl_walk(&lastill->ill_ifptr->illif_avl_by_ppa, lastill, 2863 AVL_AFTER)) != NULL) { 2864 return (ill); 2865 } 2866 2867 /* go to the next ill_ifp in the list. */ 2868 ifp = lastill->ill_ifptr->illif_next; 2869 2870 /* make sure we are not at the end of the circular list */ 2871 while (ifp == 2872 (ill_if_t *)&IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) { 2873 if (++ctx->ctx_current_list > ctx->ctx_last_list) 2874 return (NULL); 2875 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst); 2876 } 2877 2878 return (avl_first(&ifp->illif_avl_by_ppa)); 2879 } 2880 2881 /* 2882 * Check interface name for correct format: [a-zA-Z]+[a-zA-Z0-9._]*[0-9]+ 2883 * The final number (PPA) must not have any leading zeros. Upon success, a 2884 * pointer to the start of the PPA is returned; otherwise NULL is returned. 2885 */ 2886 static char * 2887 ill_get_ppa_ptr(char *name) 2888 { 2889 int namelen = strlen(name); 2890 int end_ndx = namelen - 1; 2891 int ppa_ndx, i; 2892 2893 /* 2894 * Check that the first character is [a-zA-Z], and that the last 2895 * character is [0-9]. 2896 */ 2897 if (namelen == 0 || !isalpha(name[0]) || !isdigit(name[end_ndx])) 2898 return (NULL); 2899 2900 /* 2901 * Set `ppa_ndx' to the PPA start, and check for leading zeroes. 2902 */ 2903 for (ppa_ndx = end_ndx; ppa_ndx > 0; ppa_ndx--) 2904 if (!isdigit(name[ppa_ndx - 1])) 2905 break; 2906 2907 if (name[ppa_ndx] == '0' && ppa_ndx < end_ndx) 2908 return (NULL); 2909 2910 /* 2911 * Check that the intermediate characters are [a-zA-Z0-9._] 2912 */ 2913 for (i = 1; i < ppa_ndx; i++) { 2914 if (!isalpha(name[i]) && !isdigit(name[i]) && 2915 name[i] != '.' && name[i] != '_') { 2916 return (NULL); 2917 } 2918 } 2919 2920 return (name + ppa_ndx); 2921 } 2922 2923 /* 2924 * use the avl tree to locate the ill. 2925 */ 2926 static ill_t * 2927 ill_find_by_name(char *name, boolean_t isv6, ip_stack_t *ipst) 2928 { 2929 char *ppa_ptr = NULL; 2930 int len; 2931 uint_t ppa; 2932 ill_t *ill = NULL; 2933 ill_if_t *ifp; 2934 int list; 2935 2936 /* 2937 * get ppa ptr 2938 */ 2939 if (isv6) 2940 list = IP_V6_G_HEAD; 2941 else 2942 list = IP_V4_G_HEAD; 2943 2944 if ((ppa_ptr = ill_get_ppa_ptr(name)) == NULL) { 2945 return (NULL); 2946 } 2947 2948 len = ppa_ptr - name + 1; 2949 2950 ppa = stoi(&ppa_ptr); 2951 2952 ifp = IP_VX_ILL_G_LIST(list, ipst); 2953 2954 while (ifp != (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) { 2955 /* 2956 * the match is done on len - 1 as the name is not null 2957 * terminated; it contains the ppa in addition to the 2958 * interface name. 2959 */ 2960 if ((ifp->illif_name_len == len) && 2961 bcmp(ifp->illif_name, name, len - 1) == 0) { 2962 break; 2963 } else { 2964 ifp = ifp->illif_next; 2965 } 2966 } 2967 2968 if (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) { 2969 /* 2970 * The interface type does not even exist. 2971 */ 2972 return (NULL); 2973 } 2974 2975 ill = avl_find(&ifp->illif_avl_by_ppa, (void *) &ppa, NULL); 2976 if (ill != NULL) { 2977 mutex_enter(&ill->ill_lock); 2978 if (ILL_CAN_LOOKUP(ill)) { 2979 ill_refhold_locked(ill); 2980 mutex_exit(&ill->ill_lock); 2981 return (ill); 2982 } 2983 mutex_exit(&ill->ill_lock); 2984 } 2985 return (NULL); 2986 } 2987 2988 /* 2989 * comparison function for use with avl.
2990 */ 2991 static int 2992 ill_compare_ppa(const void *ppa_ptr, const void *ill_ptr) 2993 { 2994 uint_t ppa; 2995 uint_t ill_ppa; 2996 2997 ASSERT(ppa_ptr != NULL && ill_ptr != NULL); 2998 2999 ppa = *((uint_t *)ppa_ptr); 3000 ill_ppa = ((const ill_t *)ill_ptr)->ill_ppa; 3001 /* 3002 * We want the ill with the lowest ppa to be on the 3003 * top. 3004 */ 3005 if (ill_ppa < ppa) 3006 return (1); 3007 if (ill_ppa > ppa) 3008 return (-1); 3009 return (0); 3010 } 3011 3012 /* 3013 * remove an interface type from the global list. 3014 */ 3015 static void 3016 ill_delete_interface_type(ill_if_t *interface) 3017 { 3018 ASSERT(interface != NULL); 3019 ASSERT(avl_numnodes(&interface->illif_avl_by_ppa) == 0); 3020 3021 avl_destroy(&interface->illif_avl_by_ppa); 3022 if (interface->illif_ppa_arena != NULL) 3023 vmem_destroy(interface->illif_ppa_arena); 3024 3025 remque(interface); 3026 3027 mi_free(interface); 3028 } 3029 3030 /* 3031 * remove ill from the global list. 3032 */ 3033 static void 3034 ill_glist_delete(ill_t *ill) 3035 { 3036 ip_stack_t *ipst; 3037 phyint_t *phyi; 3038 3039 if (ill == NULL) 3040 return; 3041 ipst = ill->ill_ipst; 3042 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 3043 3044 /* 3045 * If the ill was never inserted into the AVL tree 3046 * we skip the if branch. 3047 */ 3048 if (ill->ill_ifptr != NULL) { 3049 /* 3050 * remove from AVL tree and free ppa number 3051 */ 3052 avl_remove(&ill->ill_ifptr->illif_avl_by_ppa, ill); 3053 3054 if (ill->ill_ifptr->illif_ppa_arena != NULL) { 3055 vmem_free(ill->ill_ifptr->illif_ppa_arena, 3056 (void *)(uintptr_t)(ill->ill_ppa+1), 1); 3057 } 3058 if (avl_numnodes(&ill->ill_ifptr->illif_avl_by_ppa) == 0) { 3059 ill_delete_interface_type(ill->ill_ifptr); 3060 } 3061 3062 /* 3063 * Indicate ill is no longer in the list. 3064 */ 3065 ill->ill_ifptr = NULL; 3066 ill->ill_name_length = 0; 3067 ill->ill_name[0] = '\0'; 3068 ill->ill_ppa = UINT_MAX; 3069 } 3070 3071 /* Generate one last event for this ill. */ 3072 ill_nic_event_dispatch(ill, 0, NE_UNPLUMB, ill->ill_name, 3073 ill->ill_name_length); 3074 3075 ASSERT(ill->ill_phyint != NULL); 3076 phyi = ill->ill_phyint; 3077 ill->ill_phyint = NULL; 3078 3079 /* 3080 * ill_init always allocates a phyint to store the copy 3081 * of flags relevant to phyint. At that point in time, we could 3082 * not assign the name and hence phyint_illv4/v6 could not be 3083 * initialized. Later in ipif_set_values, we assign the name to 3084 * the ill, at which point in time we assign phyint_illv4/v6. 3085 * Thus we don't rely on phyint_illv6 always being initialized. 3086 */ 3087 if (ill->ill_flags & ILLF_IPV6) 3088 phyi->phyint_illv6 = NULL; 3089 else 3090 phyi->phyint_illv4 = NULL; 3091 3092 if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) { 3093 rw_exit(&ipst->ips_ill_g_lock); 3094 return; 3095 } 3096 3097 /* 3098 * There are no ills left on this phyint; pull it out of the phyint 3099 * avl trees, and free it. 3100 */ 3101 if (phyi->phyint_ifindex > 0) { 3102 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 3103 phyi); 3104 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 3105 phyi); 3106 } 3107 rw_exit(&ipst->ips_ill_g_lock); 3108 3109 phyint_free(phyi); 3110 } 3111 3112 /* 3113 * Allocate a ppa. If the number of plumbed interfaces of this type is 3114 * less than ill_no_arena, do a linear search to find an unused ppa. 3115 * When the number goes beyond ill_no_arena, switch to using an arena.
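 * For example, a caller plumbing "hme3" arrives here with ill->ill_ppa already parsed to 3, and we only verify that 3 is free (EEXIST otherwise); when the system is to pick the unit number, ill->ill_ppa is still the UINT_MAX sentinel and the first unused value is chosen.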
3116 * Note: ppa value of zero cannot be allocated from vmem_arena as it 3117 * is the return value for an error condition, so allocation starts at one 3118 * and is decremented by one. 3119 */ 3120 static int 3121 ill_alloc_ppa(ill_if_t *ifp, ill_t *ill) 3122 { 3123 ill_t *tmp_ill; 3124 uint_t start, end; 3125 int ppa; 3126 3127 if (ifp->illif_ppa_arena == NULL && 3128 (avl_numnodes(&ifp->illif_avl_by_ppa) + 1 > ill_no_arena)) { 3129 /* 3130 * Create an arena. 3131 */ 3132 ifp->illif_ppa_arena = vmem_create(ifp->illif_name, 3133 (void *)1, UINT_MAX - 1, 1, NULL, NULL, 3134 NULL, 0, VM_SLEEP | VMC_IDENTIFIER); 3135 /* allocate what has already been assigned */ 3136 for (tmp_ill = avl_first(&ifp->illif_avl_by_ppa); 3137 tmp_ill != NULL; tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, 3138 tmp_ill, AVL_AFTER)) { 3139 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena, 3140 1, /* size */ 3141 1, /* align/quantum */ 3142 0, /* phase */ 3143 0, /* nocross */ 3144 /* minaddr */ 3145 (void *)((uintptr_t)tmp_ill->ill_ppa + 1), 3146 /* maxaddr */ 3147 (void *)((uintptr_t)tmp_ill->ill_ppa + 2), 3148 VM_NOSLEEP|VM_FIRSTFIT); 3149 if (ppa == 0) { 3150 ip1dbg(("ill_alloc_ppa: ppa allocation" 3151 " failed while switching")); 3152 vmem_destroy(ifp->illif_ppa_arena); 3153 ifp->illif_ppa_arena = NULL; 3154 break; 3155 } 3156 } 3157 } 3158 3159 if (ifp->illif_ppa_arena != NULL) { 3160 if (ill->ill_ppa == UINT_MAX) { 3161 ppa = (int)(uintptr_t)vmem_alloc(ifp->illif_ppa_arena, 3162 1, VM_NOSLEEP|VM_FIRSTFIT); 3163 if (ppa == 0) 3164 return (EAGAIN); 3165 ill->ill_ppa = --ppa; 3166 } else { 3167 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena, 3168 1, /* size */ 3169 1, /* align/quantum */ 3170 0, /* phase */ 3171 0, /* nocross */ 3172 (void *)(uintptr_t)(ill->ill_ppa + 1), /* minaddr */ 3173 (void *)(uintptr_t)(ill->ill_ppa + 2), /* maxaddr */ 3174 VM_NOSLEEP|VM_FIRSTFIT); 3175 /* 3176 * Most likely the allocation failed because 3177 * the requested ppa was in use. 3178 */ 3179 if (ppa == 0) 3180 return (EEXIST); 3181 } 3182 return (0); 3183 } 3184 3185 /* 3186 * No arena is in use and not enough (>ill_no_arena) interfaces have 3187 * been plumbed to create one. Do a linear search to get a unused ppa. 3188 */ 3189 if (ill->ill_ppa == UINT_MAX) { 3190 end = UINT_MAX - 1; 3191 start = 0; 3192 } else { 3193 end = start = ill->ill_ppa; 3194 } 3195 3196 tmp_ill = avl_find(&ifp->illif_avl_by_ppa, (void *)&start, NULL); 3197 while (tmp_ill != NULL && tmp_ill->ill_ppa == start) { 3198 if (start++ >= end) { 3199 if (ill->ill_ppa == UINT_MAX) 3200 return (EAGAIN); 3201 else 3202 return (EEXIST); 3203 } 3204 tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, tmp_ill, AVL_AFTER); 3205 } 3206 ill->ill_ppa = start; 3207 return (0); 3208 } 3209 3210 /* 3211 * Insert ill into the list of configured ill's. Once this function completes, 3212 * the ill is globally visible and is available through lookups. More precisely 3213 * this happens after the caller drops the ill_g_lock. 
3214 */ 3215 static int 3216 ill_glist_insert(ill_t *ill, char *name, boolean_t isv6) 3217 { 3218 ill_if_t *ill_interface; 3219 avl_index_t where = 0; 3220 int error; 3221 int name_length; 3222 int index; 3223 boolean_t check_length = B_FALSE; 3224 ip_stack_t *ipst = ill->ill_ipst; 3225 3226 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 3227 3228 name_length = mi_strlen(name) + 1; 3229 3230 if (isv6) 3231 index = IP_V6_G_HEAD; 3232 else 3233 index = IP_V4_G_HEAD; 3234 3235 ill_interface = IP_VX_ILL_G_LIST(index, ipst); 3236 /* 3237 * Search for interface type based on name 3238 */ 3239 while (ill_interface != (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) { 3240 if ((ill_interface->illif_name_len == name_length) && 3241 (strcmp(ill_interface->illif_name, name) == 0)) { 3242 break; 3243 } 3244 ill_interface = ill_interface->illif_next; 3245 } 3246 3247 /* 3248 * Interface type not found, create one. 3249 */ 3250 if (ill_interface == (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) { 3251 ill_g_head_t ghead; 3252 3253 /* 3254 * allocate ill_if_t structure 3255 */ 3256 ill_interface = (ill_if_t *)mi_zalloc(sizeof (ill_if_t)); 3257 if (ill_interface == NULL) { 3258 return (ENOMEM); 3259 } 3260 3261 (void) strcpy(ill_interface->illif_name, name); 3262 ill_interface->illif_name_len = name_length; 3263 3264 avl_create(&ill_interface->illif_avl_by_ppa, 3265 ill_compare_ppa, sizeof (ill_t), 3266 offsetof(struct ill_s, ill_avl_byppa)); 3267 3268 /* 3269 * link the structure in at the back to maintain the order 3270 * of configuration for ifconfig output. 3271 */ 3272 ghead = ipst->ips_ill_g_heads[index]; 3273 insque(ill_interface, ghead.ill_g_list_tail); 3274 } 3275 3276 if (ill->ill_ppa == UINT_MAX) 3277 check_length = B_TRUE; 3278 3279 error = ill_alloc_ppa(ill_interface, ill); 3280 if (error != 0) { 3281 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0) 3282 ill_delete_interface_type(ill->ill_ifptr); 3283 return (error); 3284 } 3285 3286 /* 3287 * When the ppa is chosen by the system, check that there is 3288 * enough space to insert the ppa. If a specific ppa was passed in, 3289 * this check is not required as the interface name passed in will 3290 * have the right ppa in it. 3291 */ 3292 if (check_length) { 3293 /* 3294 * UINT_MAX - 1 should fit in 10 chars, alloc 12 chars. 3295 */ 3296 char buf[sizeof (uint_t) * 3]; 3297 3298 /* 3299 * convert the ppa to a string to calculate the amount of 3300 * space required for it in the name. 3301 */ 3302 numtos(ill->ill_ppa, buf); 3303 3304 /* Do we have enough space to insert the ppa?
*/ 3305 3306 if ((mi_strlen(name) + mi_strlen(buf) + 1) > LIFNAMSIZ) { 3307 /* Free ppa and interface type struct */ 3308 if (ill_interface->illif_ppa_arena != NULL) { 3309 vmem_free(ill_interface->illif_ppa_arena, 3310 (void *)(uintptr_t)(ill->ill_ppa+1), 1); 3311 } 3312 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0) 3313 ill_delete_interface_type(ill->ill_ifptr); 3314 3315 return (EINVAL); 3316 } 3317 } 3318 3319 (void) sprintf(ill->ill_name, "%s%u", name, ill->ill_ppa); 3320 ill->ill_name_length = mi_strlen(ill->ill_name) + 1; 3321 3322 (void) avl_find(&ill_interface->illif_avl_by_ppa, &ill->ill_ppa, 3323 &where); 3324 ill->ill_ifptr = ill_interface; 3325 avl_insert(&ill_interface->illif_avl_by_ppa, ill, where); 3326 3327 ill_phyint_reinit(ill); 3328 return (0); 3329 } 3330 3331 /* Initialize the per phyint ipsq used for serialization */ 3332 static boolean_t 3333 ipsq_init(ill_t *ill, boolean_t enter) 3334 { 3335 ipsq_t *ipsq; 3336 ipxop_t *ipx; 3337 3338 if ((ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP)) == NULL) 3339 return (B_FALSE); 3340 3341 ill->ill_phyint->phyint_ipsq = ipsq; 3342 ipx = ipsq->ipsq_xop = &ipsq->ipsq_ownxop; 3343 ipx->ipx_ipsq = ipsq; 3344 ipsq->ipsq_next = ipsq; 3345 ipsq->ipsq_phyint = ill->ill_phyint; 3346 mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, 0); 3347 mutex_init(&ipx->ipx_lock, NULL, MUTEX_DEFAULT, 0); 3348 ipsq->ipsq_ipst = ill->ill_ipst; /* No netstack_hold */ 3349 if (enter) { 3350 ipx->ipx_writer = curthread; 3351 ipx->ipx_forced = B_FALSE; 3352 ipx->ipx_reentry_cnt = 1; 3353 #ifdef DEBUG 3354 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); 3355 #endif 3356 } 3357 return (B_TRUE); 3358 } 3359 3360 /* 3361 * ill_init is called by ip_open when a device control stream is opened. 3362 * It does a few initializations, and shoots a DL_INFO_REQ message down 3363 * to the driver. The response is later picked up in ip_rput_dlpi and 3364 * used to set up default mechanisms for talking to the driver. (Always 3365 * called as writer.) 3366 * 3367 * If this function returns error, ip_open will call ip_close which in 3368 * turn will call ill_delete to clean up any memory allocated here that 3369 * is not yet freed. 3370 */ 3371 int 3372 ill_init(queue_t *q, ill_t *ill) 3373 { 3374 int count; 3375 dl_info_req_t *dlir; 3376 mblk_t *info_mp; 3377 uchar_t *frag_ptr; 3378 3379 /* 3380 * The ill is initialized to zero by mi_alloc*(). In addition 3381 * some fields already contain valid values, initialized in 3382 * ip_open(), before we reach here. 3383 */ 3384 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, 0); 3385 mutex_init(&ill->ill_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL); 3386 ill->ill_saved_ire_cnt = 0; 3387 3388 ill->ill_rq = q; 3389 ill->ill_wq = WR(q); 3390 3391 info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)), 3392 BPRI_HI); 3393 if (info_mp == NULL) 3394 return (ENOMEM); 3395 3396 /* 3397 * Allocate sufficient space to contain our fragment hash table and 3398 * the device name. 
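 * Roughly, the single allocation is later carved up as [frag hash table (ILL_FRAG_HASH_TBL_SIZE)][ill_name][ndd forwarding name], the name portions together sized at 2 * LIFNAMSIZ plus the forwarding suffix (ipv6_forward_suffix presumably being the longer of the two suffixes); ill_set_ndd_name() appends the suffix after the interface name.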
3399 */ 3400 frag_ptr = (uchar_t *)mi_zalloc(ILL_FRAG_HASH_TBL_SIZE + 3401 2 * LIFNAMSIZ + strlen(ipv6_forward_suffix)); 3402 if (frag_ptr == NULL) { 3403 freemsg(info_mp); 3404 return (ENOMEM); 3405 } 3406 ill->ill_frag_ptr = frag_ptr; 3407 ill->ill_frag_free_num_pkts = 0; 3408 ill->ill_last_frag_clean_time = 0; 3409 ill->ill_frag_hash_tbl = (ipfb_t *)frag_ptr; 3410 ill->ill_name = (char *)(frag_ptr + ILL_FRAG_HASH_TBL_SIZE); 3411 for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) { 3412 mutex_init(&ill->ill_frag_hash_tbl[count].ipfb_lock, 3413 NULL, MUTEX_DEFAULT, NULL); 3414 } 3415 3416 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t)); 3417 if (ill->ill_phyint == NULL) { 3418 freemsg(info_mp); 3419 mi_free(frag_ptr); 3420 return (ENOMEM); 3421 } 3422 3423 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0); 3424 /* 3425 * For now pretend this is a v4 ill. We need to set phyint_ill* 3426 * at this point for the following reason. If we can't 3427 * enter the ipsq at some point and cv_wait, the writer that 3428 * wakes us up tries to locate us using the list of all phyints 3429 * in an ipsq and the ills from the phyint through the phyint_ill*. 3430 * If we don't set it now, we risk a missed wakeup. 3431 */ 3432 ill->ill_phyint->phyint_illv4 = ill; 3433 ill->ill_ppa = UINT_MAX; 3434 list_create(&ill->ill_nce, sizeof (nce_t), offsetof(nce_t, nce_node)); 3435 3436 ill_set_inputfn(ill); 3437 3438 if (!ipsq_init(ill, B_TRUE)) { 3439 freemsg(info_mp); 3440 mi_free(frag_ptr); 3441 mi_free(ill->ill_phyint); 3442 return (ENOMEM); 3443 } 3444 3445 ill->ill_state_flags |= ILL_LL_SUBNET_PENDING; 3446 3447 /* Frag queue limit stuff */ 3448 ill->ill_frag_count = 0; 3449 ill->ill_ipf_gen = 0; 3450 3451 rw_init(&ill->ill_mcast_lock, NULL, RW_DEFAULT, NULL); 3452 mutex_init(&ill->ill_mcast_serializer, NULL, MUTEX_DEFAULT, NULL); 3453 ill->ill_global_timer = INFINITY; 3454 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0; 3455 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0; 3456 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS; 3457 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL; 3458 3459 /* 3460 * Initialize IPv6 configuration variables. The IP module is always 3461 * opened as an IPv4 module. Instead of tracking down the cases where 3462 * it switches to do IPv6, we'll just initialize the IPv6 configuration 3463 * here for convenience; this has no effect until the ill is set to do 3464 * IPv6. 3465 */ 3466 ill->ill_reachable_time = ND_REACHABLE_TIME; 3467 ill->ill_xmit_count = ND_MAX_MULTICAST_SOLICIT; 3468 ill->ill_max_buf = ND_MAX_Q; 3469 ill->ill_refcnt = 0; 3470 3471 /* Send down the Info Request to the driver. */ 3472 info_mp->b_datap->db_type = M_PCPROTO; 3473 dlir = (dl_info_req_t *)info_mp->b_rptr; 3474 info_mp->b_wptr = (uchar_t *)&dlir[1]; 3475 dlir->dl_primitive = DL_INFO_REQ; 3476 3477 ill->ill_dlpi_pending = DL_PRIM_INVAL; 3478 3479 qprocson(q); 3480 ill_dlpi_send(ill, info_mp); 3481 3482 return (0); 3483 } 3484 3485 /* 3486 * ill_dls_info 3487 * creates datalink socket info from the device.
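 * For instance, for "hme0" with a 6 byte MAC address, sdl_nlen is set to 4, sdl_data[0..3] holds the name (not null-terminated, per sockaddr_dl convention) and sdl_data[4..9] holds the address.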
3488 */ 3489 int 3490 ill_dls_info(struct sockaddr_dl *sdl, const ill_t *ill) 3491 { 3492 size_t len; 3493 3494 sdl->sdl_family = AF_LINK; 3495 sdl->sdl_index = ill_get_upper_ifindex(ill); 3496 sdl->sdl_type = ill->ill_type; 3497 ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data)); 3498 len = strlen(sdl->sdl_data); 3499 ASSERT(len < 256); 3500 sdl->sdl_nlen = (uchar_t)len; 3501 sdl->sdl_alen = ill->ill_phys_addr_length; 3502 sdl->sdl_slen = 0; 3503 if (ill->ill_phys_addr_length != 0 && ill->ill_phys_addr != NULL) 3504 bcopy(ill->ill_phys_addr, &sdl->sdl_data[len], sdl->sdl_alen); 3505 3506 return (sizeof (struct sockaddr_dl)); 3507 } 3508 3509 /* 3510 * ill_xarp_info 3511 * creates xarp info from the device. 3512 */ 3513 static int 3514 ill_xarp_info(struct sockaddr_dl *sdl, ill_t *ill) 3515 { 3516 sdl->sdl_family = AF_LINK; 3517 sdl->sdl_index = ill->ill_phyint->phyint_ifindex; 3518 sdl->sdl_type = ill->ill_type; 3519 ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data)); 3520 sdl->sdl_nlen = (uchar_t)mi_strlen(sdl->sdl_data); 3521 sdl->sdl_alen = ill->ill_phys_addr_length; 3522 sdl->sdl_slen = 0; 3523 return (sdl->sdl_nlen); 3524 } 3525 3526 static int 3527 loopback_kstat_update(kstat_t *ksp, int rw) 3528 { 3529 kstat_named_t *kn; 3530 netstackid_t stackid; 3531 netstack_t *ns; 3532 ip_stack_t *ipst; 3533 3534 if (ksp == NULL || ksp->ks_data == NULL) 3535 return (EIO); 3536 3537 if (rw == KSTAT_WRITE) 3538 return (EACCES); 3539 3540 kn = KSTAT_NAMED_PTR(ksp); 3541 stackid = (zoneid_t)(uintptr_t)ksp->ks_private; 3542 3543 ns = netstack_find_by_stackid(stackid); 3544 if (ns == NULL) 3545 return (-1); 3546 3547 ipst = ns->netstack_ip; 3548 if (ipst == NULL) { 3549 netstack_rele(ns); 3550 return (-1); 3551 } 3552 kn[0].value.ui32 = ipst->ips_loopback_packets; 3553 kn[1].value.ui32 = ipst->ips_loopback_packets; 3554 netstack_rele(ns); 3555 return (0); 3556 } 3557 3558 /* 3559 * Has ifindex been plumbed already? 3560 */ 3561 static boolean_t 3562 phyint_exists(uint_t index, ip_stack_t *ipst) 3563 { 3564 ASSERT(index != 0); 3565 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 3566 3567 return (avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 3568 &index, NULL) != NULL); 3569 } 3570 3571 /* Pick a unique ifindex */ 3572 boolean_t 3573 ip_assign_ifindex(uint_t *indexp, ip_stack_t *ipst) 3574 { 3575 uint_t starting_index; 3576 3577 if (!ipst->ips_ill_index_wrap) { 3578 *indexp = ipst->ips_ill_index++; 3579 if (ipst->ips_ill_index == 0) { 3580 /* Reached the uint_t limit; next time, wrap. */ 3581 ipst->ips_ill_index_wrap = B_TRUE; 3582 } 3583 return (B_TRUE); 3584 } 3585 3586 /* 3587 * Start reusing unused indexes. Note that we hold the ill_g_lock 3588 * at this point and don't want to call any function that attempts 3589 * to get the lock again. 3590 */ 3591 starting_index = ipst->ips_ill_index++; 3592 for (; ipst->ips_ill_index != starting_index; ipst->ips_ill_index++) { 3593 if (ipst->ips_ill_index != 0 && 3594 !phyint_exists(ipst->ips_ill_index, ipst)) { 3595 /* found an unused index - use it */ 3596 *indexp = ipst->ips_ill_index; 3597 return (B_TRUE); 3598 } 3599 } 3600 3601 /* 3602 * All interface indices are in use. 3603 */ 3604 return (B_FALSE); 3605 } 3606 3607 /* 3608 * Assign a unique interface index for the phyint.
3609 */ 3610 static boolean_t 3611 phyint_assign_ifindex(phyint_t *phyi, ip_stack_t *ipst) 3612 { 3613 ASSERT(phyi->phyint_ifindex == 0); 3614 return (ip_assign_ifindex(&phyi->phyint_ifindex, ipst)); 3615 } 3616 3617 /* 3618 * Initialize the flags on `phyi' as per the provided mactype. 3619 */ 3620 static void 3621 phyint_flags_init(phyint_t *phyi, t_uscalar_t mactype) 3622 { 3623 uint64_t flags = 0; 3624 3625 /* 3626 * Initialize PHYI_RUNNING and PHYI_FAILED. For non-IPMP interfaces, 3627 * we always presume the underlying hardware is working and set 3628 * PHYI_RUNNING (if it's not, the driver will subsequently send a 3629 * DL_NOTE_LINK_DOWN message). For IPMP interfaces, at initialization 3630 * there are no active interfaces in the group so we set PHYI_FAILED. 3631 */ 3632 if (mactype == SUNW_DL_IPMP) 3633 flags |= PHYI_FAILED; 3634 else 3635 flags |= PHYI_RUNNING; 3636 3637 switch (mactype) { 3638 case SUNW_DL_VNI: 3639 flags |= PHYI_VIRTUAL; 3640 break; 3641 case SUNW_DL_IPMP: 3642 flags |= PHYI_IPMP; 3643 break; 3644 case DL_LOOP: 3645 flags |= (PHYI_LOOPBACK | PHYI_VIRTUAL); 3646 break; 3647 } 3648 3649 mutex_enter(&phyi->phyint_lock); 3650 phyi->phyint_flags |= flags; 3651 mutex_exit(&phyi->phyint_lock); 3652 } 3653 3654 /* 3655 * Return a pointer to the ill which matches the supplied name. Note that 3656 * the ill name length includes the null termination character. (May be 3657 * called as writer.) 3658 * If do_alloc and the interface is "lo0" it will be automatically created. 3659 * Cannot bump up reference on condemned ills. So dup detect can't be done 3660 * using this func. 3661 */ 3662 ill_t * 3663 ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, 3664 boolean_t *did_alloc, ip_stack_t *ipst) 3665 { 3666 ill_t *ill; 3667 ipif_t *ipif; 3668 ipsq_t *ipsq; 3669 kstat_named_t *kn; 3670 boolean_t isloopback; 3671 in6_addr_t ov6addr; 3672 3673 isloopback = mi_strcmp(name, ipif_loopback_name) == 0; 3674 3675 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3676 ill = ill_find_by_name(name, isv6, ipst); 3677 rw_exit(&ipst->ips_ill_g_lock); 3678 if (ill != NULL) 3679 return (ill); 3680 3681 /* 3682 * Couldn't find it. Does this happen to be a lookup for the 3683 * loopback device and are we allowed to allocate it? 3684 */ 3685 if (!isloopback || !do_alloc) 3686 return (NULL); 3687 3688 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 3689 ill = ill_find_by_name(name, isv6, ipst); 3690 if (ill != NULL) { 3691 rw_exit(&ipst->ips_ill_g_lock); 3692 return (ill); 3693 } 3694 3695 /* Create the loopback device on demand */ 3696 ill = (ill_t *)(mi_alloc(sizeof (ill_t) + 3697 sizeof (ipif_loopback_name), BPRI_MED)); 3698 if (ill == NULL) 3699 goto done; 3700 3701 *ill = ill_null; 3702 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, NULL); 3703 ill->ill_ipst = ipst; 3704 list_create(&ill->ill_nce, sizeof (nce_t), offsetof(nce_t, nce_node)); 3705 netstack_hold(ipst->ips_netstack); 3706 /* 3707 * For exclusive stacks we set the zoneid to zero 3708 * to make IP operate as if in the global zone. 
3709 */ 3710 ill->ill_zoneid = GLOBAL_ZONEID; 3711 3712 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t)); 3713 if (ill->ill_phyint == NULL) 3714 goto done; 3715 3716 if (isv6) 3717 ill->ill_phyint->phyint_illv6 = ill; 3718 else 3719 ill->ill_phyint->phyint_illv4 = ill; 3720 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0); 3721 phyint_flags_init(ill->ill_phyint, DL_LOOP); 3722 3723 if (isv6) { 3724 ill->ill_isv6 = B_TRUE; 3725 ill->ill_max_frag = ip_loopback_mtu_v6plus; 3726 } else { 3727 ill->ill_max_frag = ip_loopback_mtuplus; 3728 } 3729 if (!ill_allocate_mibs(ill)) 3730 goto done; 3731 ill->ill_current_frag = ill->ill_max_frag; 3732 ill->ill_mtu = ill->ill_max_frag; /* Initial value */ 3733 /* 3734 * ipif_loopback_name can't be pointed at directly because it's used 3735 * by both the ipv4 and ipv6 interfaces. When the ill is removed 3736 * from the glist, ill_glist_delete() sets the first character of 3737 * ill_name to '\0'. 3738 */ 3739 ill->ill_name = (char *)ill + sizeof (*ill); 3740 (void) strcpy(ill->ill_name, ipif_loopback_name); 3741 ill->ill_name_length = sizeof (ipif_loopback_name); 3742 /* Set ill_dlpi_pending for ipsq_current_finish() to work properly */ 3743 ill->ill_dlpi_pending = DL_PRIM_INVAL; 3744 3745 rw_init(&ill->ill_mcast_lock, NULL, RW_DEFAULT, NULL); 3746 mutex_init(&ill->ill_mcast_serializer, NULL, MUTEX_DEFAULT, NULL); 3747 ill->ill_global_timer = INFINITY; 3748 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0; 3749 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0; 3750 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS; 3751 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL; 3752 3753 /* No resolver here. */ 3754 ill->ill_net_type = IRE_LOOPBACK; 3755 3756 /* Initialize the ipsq */ 3757 if (!ipsq_init(ill, B_FALSE)) 3758 goto done; 3759 3760 ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE, B_TRUE, NULL); 3761 if (ipif == NULL) 3762 goto done; 3763 3764 ill->ill_flags = ILLF_MULTICAST; 3765 3766 ov6addr = ipif->ipif_v6lcl_addr; 3767 /* Set up default loopback address and mask. */ 3768 if (!isv6) { 3769 ipaddr_t inaddr_loopback = htonl(INADDR_LOOPBACK); 3770 3771 IN6_IPADDR_TO_V4MAPPED(inaddr_loopback, &ipif->ipif_v6lcl_addr); 3772 V4MASK_TO_V6(htonl(IN_CLASSA_NET), ipif->ipif_v6net_mask); 3773 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 3774 ipif->ipif_v6subnet); 3775 ill->ill_flags |= ILLF_IPV4; 3776 } else { 3777 ipif->ipif_v6lcl_addr = ipv6_loopback; 3778 ipif->ipif_v6net_mask = ipv6_all_ones; 3779 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 3780 ipif->ipif_v6subnet); 3781 ill->ill_flags |= ILLF_IPV6; 3782 } 3783 3784 /* 3785 * Chain us in at the end of the ill list. Hold the ill 3786 * before we make it globally visible. 1 for the lookup. 3787 */ 3788 ill->ill_refcnt = 0; 3789 ill_refhold(ill); 3790 3791 ill->ill_frag_count = 0; 3792 ill->ill_frag_free_num_pkts = 0; 3793 ill->ill_last_frag_clean_time = 0; 3794 3795 ipsq = ill->ill_phyint->phyint_ipsq; 3796 3797 ill_set_inputfn(ill); 3798 3799 if (ill_glist_insert(ill, "lo", isv6) != 0) 3800 cmn_err(CE_PANIC, "cannot insert loopback interface"); 3801 3802 /* Let SCTP know so that it can add this to its list */ 3803 sctp_update_ill(ill, SCTP_ILL_INSERT); 3804 3805 /* 3806 * We have already assigned ipif_v6lcl_addr above, but we need to 3807 * call sctp_update_ipif_addr() after SCTP_ILL_INSERT, which 3808 * needs to happen after ill_glist_insert() since we need the 3809 * ill_index set. Pass on ipv6_loopback as the old address.
3810 */ 3811 sctp_update_ipif_addr(ipif, ov6addr); 3812 3813 ip_rts_newaddrmsg(RTM_CHGADDR, 0, ipif, RTSQ_DEFAULT); 3814 3815 /* 3816 * ill_glist_insert() -> ill_phyint_reinit() may have merged IPSQs. 3817 * If so, free our original one. 3818 */ 3819 if (ipsq != ill->ill_phyint->phyint_ipsq) 3820 ipsq_delete(ipsq); 3821 3822 if (ipst->ips_loopback_ksp == NULL) { 3823 /* Export loopback interface statistics */ 3824 ipst->ips_loopback_ksp = kstat_create_netstack("lo", 0, 3825 ipif_loopback_name, "net", 3826 KSTAT_TYPE_NAMED, 2, 0, 3827 ipst->ips_netstack->netstack_stackid); 3828 if (ipst->ips_loopback_ksp != NULL) { 3829 ipst->ips_loopback_ksp->ks_update = 3830 loopback_kstat_update; 3831 kn = KSTAT_NAMED_PTR(ipst->ips_loopback_ksp); 3832 kstat_named_init(&kn[0], "ipackets", KSTAT_DATA_UINT32); 3833 kstat_named_init(&kn[1], "opackets", KSTAT_DATA_UINT32); 3834 ipst->ips_loopback_ksp->ks_private = 3835 (void *)(uintptr_t)ipst->ips_netstack-> 3836 netstack_stackid; 3837 kstat_install(ipst->ips_loopback_ksp); 3838 } 3839 } 3840 3841 *did_alloc = B_TRUE; 3842 rw_exit(&ipst->ips_ill_g_lock); 3843 ill_nic_event_dispatch(ill, MAP_IPIF_ID(ill->ill_ipif->ipif_id), 3844 NE_PLUMB, ill->ill_name, ill->ill_name_length); 3845 return (ill); 3846 done: 3847 if (ill != NULL) { 3848 if (ill->ill_phyint != NULL) { 3849 ipsq = ill->ill_phyint->phyint_ipsq; 3850 if (ipsq != NULL) { 3851 ipsq->ipsq_phyint = NULL; 3852 ipsq_delete(ipsq); 3853 } 3854 mi_free(ill->ill_phyint); 3855 } 3856 ill_free_mib(ill); 3857 if (ill->ill_ipst != NULL) 3858 netstack_rele(ill->ill_ipst->ips_netstack); 3859 mi_free(ill); 3860 } 3861 rw_exit(&ipst->ips_ill_g_lock); 3862 return (NULL); 3863 } 3864 3865 /* 3866 * For IPP calls - use the ip_stack_t for global stack. 3867 */ 3868 ill_t * 3869 ill_lookup_on_ifindex_global_instance(uint_t index, boolean_t isv6) 3870 { 3871 ip_stack_t *ipst; 3872 ill_t *ill; 3873 3874 ipst = netstack_find_by_stackid(GLOBAL_NETSTACKID)->netstack_ip; 3875 if (ipst == NULL) { 3876 cmn_err(CE_WARN, "No ip_stack_t for zoneid zero!\n"); 3877 return (NULL); 3878 } 3879 3880 ill = ill_lookup_on_ifindex(index, isv6, ipst); 3881 netstack_rele(ipst->ips_netstack); 3882 return (ill); 3883 } 3884 3885 /* 3886 * Return a pointer to the ill which matches the index and IP version type. 3887 */ 3888 ill_t * 3889 ill_lookup_on_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst) 3890 { 3891 ill_t *ill; 3892 phyint_t *phyi; 3893 3894 /* 3895 * Indexes are stored in the phyint - a common structure 3896 * to both IPv4 and IPv6. 3897 */ 3898 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3899 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 3900 (void *) &index, NULL); 3901 if (phyi != NULL) { 3902 ill = isv6 ? phyi->phyint_illv6: phyi->phyint_illv4; 3903 if (ill != NULL) { 3904 mutex_enter(&ill->ill_lock); 3905 if (!ILL_IS_CONDEMNED(ill)) { 3906 ill_refhold_locked(ill); 3907 mutex_exit(&ill->ill_lock); 3908 rw_exit(&ipst->ips_ill_g_lock); 3909 return (ill); 3910 } 3911 mutex_exit(&ill->ill_lock); 3912 } 3913 } 3914 rw_exit(&ipst->ips_ill_g_lock); 3915 return (NULL); 3916 } 3917 3918 /* 3919 * Verify whether or not an interface index is valid. 3920 * It can be zero (meaning "reset") or an interface index assigned 3921 * to a non-VNI interface. (We don't use VNI interface to send packets.) 
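)
 * (An ifindex of zero is accepted because callers such as the
 * IP_BOUND_IF/IPV6_BOUND_IF socket options use zero to mean "clear the
 * current binding"; that usage is an assumption about the callers, not
 * something verified here.)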
3922 */ 3923 boolean_t 3924 ip_ifindex_valid(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst) 3925 { 3926 ill_t *ill; 3927 3928 if (ifindex == 0) 3929 return (B_TRUE); 3930 3931 ill = ill_lookup_on_ifindex(ifindex, isv6, ipst); 3932 if (ill == NULL) 3933 return (B_FALSE); 3934 if (IS_VNI(ill)) { 3935 ill_refrele(ill); 3936 return (B_FALSE); 3937 } 3938 ill_refrele(ill); 3939 return (B_TRUE); 3940 } 3941 3942 /* 3943 * Return the ifindex next in sequence after the passed in ifindex. 3944 * If there is no next ifindex for the given protocol, return 0. 3945 */ 3946 uint_t 3947 ill_get_next_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst) 3948 { 3949 phyint_t *phyi; 3950 phyint_t *phyi_initial; 3951 uint_t ifindex; 3952 3953 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3954 3955 if (index == 0) { 3956 phyi = avl_first( 3957 &ipst->ips_phyint_g_list->phyint_list_avl_by_index); 3958 } else { 3959 phyi = phyi_initial = avl_find( 3960 &ipst->ips_phyint_g_list->phyint_list_avl_by_index, 3961 (void *) &index, NULL); 3962 } 3963 3964 for (; phyi != NULL; 3965 phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 3966 phyi, AVL_AFTER)) { 3967 /* 3968 * If we're not returning the first interface in the tree 3969 * and we still haven't moved past the phyint_t that 3970 * corresponds to index, avl_walk needs to be called again 3971 */ 3972 if (!((index != 0) && (phyi == phyi_initial))) { 3973 if (isv6) { 3974 if ((phyi->phyint_illv6) && 3975 ILL_CAN_LOOKUP(phyi->phyint_illv6) && 3976 (phyi->phyint_illv6->ill_isv6 == 1)) 3977 break; 3978 } else { 3979 if ((phyi->phyint_illv4) && 3980 ILL_CAN_LOOKUP(phyi->phyint_illv4) && 3981 (phyi->phyint_illv4->ill_isv6 == 0)) 3982 break; 3983 } 3984 } 3985 } 3986 3987 rw_exit(&ipst->ips_ill_g_lock); 3988 3989 if (phyi != NULL) 3990 ifindex = phyi->phyint_ifindex; 3991 else 3992 ifindex = 0; 3993 3994 return (ifindex); 3995 } 3996 3997 /* 3998 * Return the ifindex for the named interface. 3999 * If there is no such interface, return 0. 4000 */ 4001 uint_t 4002 ill_get_ifindex_by_name(char *name, ip_stack_t *ipst) 4003 { 4004 phyint_t *phyi; 4005 avl_index_t where = 0; 4006 uint_t ifindex; 4007 4008 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4009 4010 if ((phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 4011 name, &where)) == NULL) { 4012 rw_exit(&ipst->ips_ill_g_lock); 4013 return (0); 4014 } 4015 4016 ifindex = phyi->phyint_ifindex; 4017 4018 rw_exit(&ipst->ips_ill_g_lock); 4019 4020 return (ifindex); 4021 } 4022 4023 /* 4024 * Return the ifindex to be used by upper layer protocols, for instance 4025 * for IPV6_RECVPKTINFO. For IPMP this is the ifindex of the upper ill. 4026 */ 4027 uint_t 4028 ill_get_upper_ifindex(const ill_t *ill) 4029 { 4030 if (IS_UNDER_IPMP(ill)) 4031 return (ipmp_ill_get_ipmp_ifindex(ill)); 4032 else 4033 return (ill->ill_phyint->phyint_ifindex); 4034 } 4035 4036 4037 /* 4038 * Obtain a reference to the ill. The ill_refcnt is a dynamic refcnt 4039 * that gives a running thread a reference to the ill. This reference must be 4040 * released by the thread when it is done accessing the ill and related 4041 * objects. ill_refcnt cannot be used to account for static references 4042 * such as other structures pointing to an ill. Callers must generally 4043 * check whether an ill can be refheld by using ILL_CAN_LOOKUP macros 4044 * or be sure that the ill is not being deleted or changing state before 4045 * calling the refhold functions.
A non-zero ill_refcnt ensures that the 4046 * ill won't change any of its critical state such as address, netmask etc. 4047 */ 4048 void 4049 ill_refhold(ill_t *ill) 4050 { 4051 mutex_enter(&ill->ill_lock); 4052 ill->ill_refcnt++; 4053 ILL_TRACE_REF(ill); 4054 mutex_exit(&ill->ill_lock); 4055 } 4056 4057 void 4058 ill_refhold_locked(ill_t *ill) 4059 { 4060 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4061 ill->ill_refcnt++; 4062 ILL_TRACE_REF(ill); 4063 } 4064 4065 /* Returns true if we managed to get a refhold */ 4066 boolean_t 4067 ill_check_and_refhold(ill_t *ill) 4068 { 4069 mutex_enter(&ill->ill_lock); 4070 if (!ILL_IS_CONDEMNED(ill)) { 4071 ill_refhold_locked(ill); 4072 mutex_exit(&ill->ill_lock); 4073 return (B_TRUE); 4074 } 4075 mutex_exit(&ill->ill_lock); 4076 return (B_FALSE); 4077 } 4078 4079 /* 4080 * Must not be called while holding any locks. Otherwise if this is 4081 * the last reference to be released, there is a chance of recursive mutex 4082 * panic due to ill_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 4083 * to restart an ioctl. 4084 */ 4085 void 4086 ill_refrele(ill_t *ill) 4087 { 4088 mutex_enter(&ill->ill_lock); 4089 ASSERT(ill->ill_refcnt != 0); 4090 ill->ill_refcnt--; 4091 ILL_UNTRACE_REF(ill); 4092 if (ill->ill_refcnt != 0) { 4093 /* Every ire pointing to the ill adds 1 to ill_refcnt */ 4094 mutex_exit(&ill->ill_lock); 4095 return; 4096 } 4097 4098 /* Drops the ill_lock */ 4099 ipif_ill_refrele_tail(ill); 4100 } 4101 4102 /* 4103 * Obtain a weak reference count on the ill. This reference ensures the 4104 * ill won't be freed, but the ill may change any of its critical state 4105 * such as netmask, address etc. Returns an error if the ill has started 4106 * closing. 4107 */ 4108 boolean_t 4109 ill_waiter_inc(ill_t *ill) 4110 { 4111 mutex_enter(&ill->ill_lock); 4112 if (ill->ill_state_flags & ILL_CONDEMNED) { 4113 mutex_exit(&ill->ill_lock); 4114 return (B_FALSE); 4115 } 4116 ill->ill_waiters++; 4117 mutex_exit(&ill->ill_lock); 4118 return (B_TRUE); 4119 } 4120 4121 void 4122 ill_waiter_dcr(ill_t *ill) 4123 { 4124 mutex_enter(&ill->ill_lock); 4125 ill->ill_waiters--; 4126 if (ill->ill_waiters == 0) 4127 cv_broadcast(&ill->ill_cv); 4128 mutex_exit(&ill->ill_lock); 4129 } 4130 4131 /* 4132 * ip_ll_subnet_defaults is called when we get the DL_INFO_ACK back from the 4133 * driver. We construct best guess defaults for lower level information that 4134 * we need. If an interface is brought up without injection of any overriding 4135 * information from outside, we have to be ready to go with these defaults. 4136 * When we get the first DL_INFO_ACK (from ip_open() sending a DL_INFO_REQ) 4137 * we primarily want the dl_provider_style. 4138 * The subsequent DL_INFO_ACK is received after doing a DL_ATTACH and DL_BIND 4139 * at which point we assume the other part of the information is valid. 4140 */ 4141 void 4142 ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) 4143 { 4144 uchar_t *brdcst_addr; 4145 uint_t brdcst_addr_length, phys_addr_length; 4146 t_scalar_t sap_length; 4147 dl_info_ack_t *dlia; 4148 ip_m_t *ipm; 4149 dl_qos_cl_sel1_t *sel1; 4150 int min_mtu; 4151 4152 ASSERT(IAM_WRITER_ILL(ill)); 4153 4154 /* 4155 * Until the ill is fully up, the ill is not globally visible. 4156 * So no need for a lock.
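 * For pre-DL_VERSION_2 drivers we fall back to Ethernet-style guesses
 * below: a six-byte all-ones broadcast address and a sap_length of -2.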
4157 */ 4158 dlia = (dl_info_ack_t *)mp->b_rptr; 4159 ill->ill_mactype = dlia->dl_mac_type; 4160 4161 ipm = ip_m_lookup(dlia->dl_mac_type); 4162 if (ipm == NULL) { 4163 ipm = ip_m_lookup(DL_OTHER); 4164 ASSERT(ipm != NULL); 4165 } 4166 ill->ill_media = ipm; 4167 4168 /* 4169 * When the new DLPI stuff is ready we'll pull lengths 4170 * from dlia. 4171 */ 4172 if (dlia->dl_version == DL_VERSION_2) { 4173 brdcst_addr_length = dlia->dl_brdcst_addr_length; 4174 brdcst_addr = mi_offset_param(mp, dlia->dl_brdcst_addr_offset, 4175 brdcst_addr_length); 4176 if (brdcst_addr == NULL) { 4177 brdcst_addr_length = 0; 4178 } 4179 sap_length = dlia->dl_sap_length; 4180 phys_addr_length = dlia->dl_addr_length - ABS(sap_length); 4181 ip1dbg(("ip: bcast_len %d, sap_len %d, phys_len %d\n", 4182 brdcst_addr_length, sap_length, phys_addr_length)); 4183 } else { 4184 brdcst_addr_length = 6; 4185 brdcst_addr = ip_six_byte_all_ones; 4186 sap_length = -2; 4187 phys_addr_length = brdcst_addr_length; 4188 } 4189 4190 ill->ill_bcast_addr_length = brdcst_addr_length; 4191 ill->ill_phys_addr_length = phys_addr_length; 4192 ill->ill_sap_length = sap_length; 4193 4194 /* 4195 * Synthetic DLPI types such as SUNW_DL_IPMP specify a zero SDU, 4196 * but we must ensure a minimum IP MTU is used since other bits of 4197 * IP will fly apart otherwise. 4198 */ 4199 min_mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU; 4200 ill->ill_max_frag = MAX(min_mtu, dlia->dl_max_sdu); 4201 ill->ill_current_frag = ill->ill_max_frag; 4202 ill->ill_mtu = ill->ill_max_frag; 4203 4204 ill->ill_type = ipm->ip_m_type; 4205 4206 if (!ill->ill_dlpi_style_set) { 4207 if (dlia->dl_provider_style == DL_STYLE2) 4208 ill->ill_needs_attach = 1; 4209 4210 phyint_flags_init(ill->ill_phyint, ill->ill_mactype); 4211 4212 /* 4213 * Allocate the first ipif on this ill. We don't delay it 4214 * further as ioctl handling assumes at least one ipif exists. 4215 * 4216 * At this point we don't know whether the ill is v4 or v6. 4217 * We will know this when the SIOCSLIFNAME happens and 4218 * the correct value for ill_isv6 will be assigned in 4219 * ipif_set_values(). We need to hold the ill lock and 4220 * clear the ILL_LL_SUBNET_PENDING flag and atomically do 4221 * the wakeup. 4222 */ 4223 (void) ipif_allocate(ill, 0, IRE_LOCAL, 4224 dlia->dl_provider_style != DL_STYLE2, B_TRUE, NULL); 4225 mutex_enter(&ill->ill_lock); 4226 ASSERT(ill->ill_dlpi_style_set == 0); 4227 ill->ill_dlpi_style_set = 1; 4228 ill->ill_state_flags &= ~ILL_LL_SUBNET_PENDING; 4229 cv_broadcast(&ill->ill_cv); 4230 mutex_exit(&ill->ill_lock); 4231 freemsg(mp); 4232 return; 4233 } 4234 ASSERT(ill->ill_ipif != NULL); 4235 /* 4236 * We know whether it is IPv4 or IPv6 now, as this is the 4237 * second DL_INFO_ACK we are receiving in response to the 4238 * DL_INFO_REQ sent in ipif_set_values. 4239 */ 4240 ill->ill_sap = (ill->ill_isv6) ? ipm->ip_m_ipv6sap : ipm->ip_m_ipv4sap; 4241 /* 4242 * Clear all the flags that were set based on ill_bcast_addr_length 4243 * and ill_phys_addr_length (in ipif_set_values) as these could have 4244 * changed now and we need to re-evaluate. 4245 */ 4246 ill->ill_flags &= ~(ILLF_MULTICAST | ILLF_NONUD | ILLF_NOARP); 4247 ill->ill_ipif->ipif_flags &= ~(IPIF_BROADCAST | IPIF_POINTOPOINT); 4248 4249 /* 4250 * Free ill_bcast_mp as things could have changed now.
4251 * 4252 * NOTE: The IPMP meta-interface is special-cased because it starts 4253 * with no underlying interfaces (and thus an unknown broadcast 4254 * address length), but we enforce that an interface is broadcast- 4255 * capable as part of allowing it to join a group. 4256 */ 4257 if (ill->ill_bcast_addr_length == 0 && !IS_IPMP(ill)) { 4258 if (ill->ill_bcast_mp != NULL) 4259 freemsg(ill->ill_bcast_mp); 4260 ill->ill_net_type = IRE_IF_NORESOLVER; 4261 4262 ill->ill_bcast_mp = ill_dlur_gen(NULL, 4263 ill->ill_phys_addr_length, 4264 ill->ill_sap, 4265 ill->ill_sap_length); 4266 4267 if (ill->ill_isv6) 4268 /* 4269 * Note: xresolv interfaces will eventually need NOARP 4270 * set here as well, but that will require those 4271 * external resolvers to have some knowledge of 4272 * that flag and act appropriately. Not to be changed 4273 * at present. 4274 */ 4275 ill->ill_flags |= ILLF_NONUD; 4276 else 4277 ill->ill_flags |= ILLF_NOARP; 4278 4279 if (ill->ill_mactype == SUNW_DL_VNI) { 4280 ill->ill_ipif->ipif_flags |= IPIF_NOXMIT; 4281 } else if (ill->ill_phys_addr_length == 0 || 4282 ill->ill_mactype == DL_IPV4 || 4283 ill->ill_mactype == DL_IPV6) { 4284 /* 4285 * The underlying link is point-to-point, so mark the 4286 * interface as such. We can do IP multicast over 4287 * such a link since it transmits all network-layer 4288 * packets to the remote side the same way. 4289 */ 4290 ill->ill_flags |= ILLF_MULTICAST; 4291 ill->ill_ipif->ipif_flags |= IPIF_POINTOPOINT; 4292 } 4293 } else { 4294 ill->ill_net_type = IRE_IF_RESOLVER; 4295 if (ill->ill_bcast_mp != NULL) 4296 freemsg(ill->ill_bcast_mp); 4297 ill->ill_bcast_mp = ill_dlur_gen(brdcst_addr, 4298 ill->ill_bcast_addr_length, ill->ill_sap, 4299 ill->ill_sap_length); 4300 /* 4301 * Later detect lack of DLPI driver multicast 4302 * capability by catching DL_ENABMULTI errors in 4303 * ip_rput_dlpi. 4304 */ 4305 ill->ill_flags |= ILLF_MULTICAST; 4306 if (!ill->ill_isv6) 4307 ill->ill_ipif->ipif_flags |= IPIF_BROADCAST; 4308 } 4309 4310 /* For IPMP, PHYI_IPMP should already be set by phyint_flags_init() */ 4311 if (ill->ill_mactype == SUNW_DL_IPMP) 4312 ASSERT(ill->ill_phyint->phyint_flags & PHYI_IPMP); 4313 4314 /* By default an interface does not support any CoS marking */ 4315 ill->ill_flags &= ~ILLF_COS_ENABLED; 4316 4317 /* 4318 * If we get QoS information in DL_INFO_ACK, the device supports 4319 * some form of CoS marking, so set ILLF_COS_ENABLED. 4320 */ 4321 sel1 = (dl_qos_cl_sel1_t *)mi_offset_param(mp, dlia->dl_qos_offset, 4322 dlia->dl_qos_length); 4323 if ((sel1 != NULL) && (sel1->dl_qos_type == DL_QOS_CL_SEL1)) { 4324 ill->ill_flags |= ILLF_COS_ENABLED; 4325 } 4326 4327 /* Clear any previous error indication. */ 4328 ill->ill_error = 0; 4329 freemsg(mp); 4330 } 4331 4332 /* 4333 * Perform various checks to verify that an address would make sense as a 4334 * local, remote, or subnet interface address. 4335 */ 4336 static boolean_t 4337 ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask) 4338 { 4339 ipaddr_t net_mask; 4340 4341 /* 4342 * Don't allow all zeroes, or all ones, but allow 4343 * all ones netmask.
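 * For example (illustrative addresses only): with a /24 netmask,
 * 192.0.2.0 (host part all zeroes) and 192.0.2.255 (host part all ones)
 * are both rejected while 192.0.2.1 is accepted; with an all-ones
 * netmask only 255.255.255.255 and class D (multicast) addresses are
 * rejected.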
4344 */ 4345 if ((net_mask = ip_net_mask(addr)) == 0) 4346 return (B_FALSE); 4347 /* A given netmask overrides the "guess" netmask */ 4348 if (subnet_mask != 0) 4349 net_mask = subnet_mask; 4350 if ((net_mask != ~(ipaddr_t)0) && ((addr == (addr & net_mask)) || 4351 (addr == (addr | ~net_mask)))) { 4352 return (B_FALSE); 4353 } 4354 4355 /* 4356 * Even if the netmask is all ones, we do not allow address to be 4357 * 255.255.255.255 4358 */ 4359 if (addr == INADDR_BROADCAST) 4360 return (B_FALSE); 4361 4362 if (CLASSD(addr)) 4363 return (B_FALSE); 4364 4365 return (B_TRUE); 4366 } 4367 4368 #define V6_IPIF_LINKLOCAL(p) \ 4369 IN6_IS_ADDR_LINKLOCAL(&(p)->ipif_v6lcl_addr) 4370 4371 /* 4372 * Compare two given ipifs and check if the second one is better than 4373 * the first one using the order of preference (not taking deprecated 4374 * into account) specified in ipif_lookup_multicast(). 4375 */ 4376 static boolean_t 4377 ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, boolean_t isv6) 4378 { 4379 /* Check the least preferred first. */ 4380 if (IS_LOOPBACK(old_ipif->ipif_ill)) { 4381 /* If both ipifs are the same, use the first one. */ 4382 if (IS_LOOPBACK(new_ipif->ipif_ill)) 4383 return (B_FALSE); 4384 else 4385 return (B_TRUE); 4386 } 4387 4388 /* For IPv6, check for link local address. */ 4389 if (isv6 && V6_IPIF_LINKLOCAL(old_ipif)) { 4390 if (IS_LOOPBACK(new_ipif->ipif_ill) || 4391 V6_IPIF_LINKLOCAL(new_ipif)) { 4392 /* The second one is equal or less preferred. */ 4393 return (B_FALSE); 4394 } else { 4395 return (B_TRUE); 4396 } 4397 } 4398 4399 /* Then check for point to point interface. */ 4400 if (old_ipif->ipif_flags & IPIF_POINTOPOINT) { 4401 if (IS_LOOPBACK(new_ipif->ipif_ill) || 4402 (isv6 && V6_IPIF_LINKLOCAL(new_ipif)) || 4403 (new_ipif->ipif_flags & IPIF_POINTOPOINT)) { 4404 return (B_FALSE); 4405 } else { 4406 return (B_TRUE); 4407 } 4408 } 4409 4410 /* old_ipif is a normal interface, so no need to use the new one. */ 4411 return (B_FALSE); 4412 } 4413 4414 /* 4415 * Find a multicast-capable ipif given an IP instance and zoneid. 4416 * The ipif must be up, and its ill must be multicast-capable, not 4417 * condemned, not an underlying interface in an IPMP group, and 4418 * not a VNI interface. Order of preference: 4419 * 4420 * 1a. normal 4421 * 1b. normal, but deprecated 4422 * 2a. point to point 4423 * 2b. point to point, but deprecated 4424 * 3a. link local 4425 * 3b. link local, but deprecated 4426 * 4. loopback. 4427 */ 4428 static ipif_t * 4429 ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) 4430 { 4431 ill_t *ill; 4432 ill_walk_context_t ctx; 4433 ipif_t *ipif; 4434 ipif_t *saved_ipif = NULL; 4435 ipif_t *dep_ipif = NULL; 4436 4437 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4438 if (isv6) 4439 ill = ILL_START_WALK_V6(&ctx, ipst); 4440 else 4441 ill = ILL_START_WALK_V4(&ctx, ipst); 4442 4443 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 4444 mutex_enter(&ill->ill_lock); 4445 if (IS_VNI(ill) || IS_UNDER_IPMP(ill) || 4446 ILL_IS_CONDEMNED(ill) || 4447 !(ill->ill_flags & ILLF_MULTICAST)) { 4448 mutex_exit(&ill->ill_lock); 4449 continue; 4450 } 4451 for (ipif = ill->ill_ipif; ipif != NULL; 4452 ipif = ipif->ipif_next) { 4453 if (zoneid != ipif->ipif_zoneid && 4454 zoneid != ALL_ZONES && 4455 ipif->ipif_zoneid != ALL_ZONES) { 4456 continue; 4457 } 4458 if (!(ipif->ipif_flags & IPIF_UP) || 4459 IPIF_IS_CONDEMNED(ipif)) { 4460 continue; 4461 } 4462 4463 /* 4464 * Found one candidate. If it is deprecated, 4465 * remember it in dep_ipif.
If it is not deprecated, 4466 * remember it in saved_ipif. 4467 */ 4468 if (ipif->ipif_flags & IPIF_DEPRECATED) { 4469 if (dep_ipif == NULL) { 4470 dep_ipif = ipif; 4471 } else if (ipif_comp_multi(dep_ipif, ipif, 4472 isv6)) { 4473 /* 4474 * If the previous dep_ipif does not 4475 * belong to the same ill, we've done 4476 * an ipif_refhold() on it. So we need 4477 * to release it. 4478 */ 4479 if (dep_ipif->ipif_ill != ill) 4480 ipif_refrele(dep_ipif); 4481 dep_ipif = ipif; 4482 } 4483 continue; 4484 } 4485 if (saved_ipif == NULL) { 4486 saved_ipif = ipif; 4487 } else { 4488 if (ipif_comp_multi(saved_ipif, ipif, isv6)) { 4489 if (saved_ipif->ipif_ill != ill) 4490 ipif_refrele(saved_ipif); 4491 saved_ipif = ipif; 4492 } 4493 } 4494 } 4495 /* 4496 * Before going to the next ill, do an ipif_refhold() on the 4497 * saved ones. 4498 */ 4499 if (saved_ipif != NULL && saved_ipif->ipif_ill == ill) 4500 ipif_refhold_locked(saved_ipif); 4501 if (dep_ipif != NULL && dep_ipif->ipif_ill == ill) 4502 ipif_refhold_locked(dep_ipif); 4503 mutex_exit(&ill->ill_lock); 4504 } 4505 rw_exit(&ipst->ips_ill_g_lock); 4506 4507 /* 4508 * If we have only the saved_ipif, return it. But if we have both 4509 * saved_ipif and dep_ipif, check to see which one is better. 4510 */ 4511 if (saved_ipif != NULL) { 4512 if (dep_ipif != NULL) { 4513 if (ipif_comp_multi(saved_ipif, dep_ipif, isv6)) { 4514 ipif_refrele(saved_ipif); 4515 return (dep_ipif); 4516 } else { 4517 ipif_refrele(dep_ipif); 4518 return (saved_ipif); 4519 } 4520 } 4521 return (saved_ipif); 4522 } else { 4523 return (dep_ipif); 4524 } 4525 } 4526 4527 ill_t * 4528 ill_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) 4529 { 4530 ipif_t *ipif; 4531 ill_t *ill; 4532 4533 ipif = ipif_lookup_multicast(ipst, zoneid, isv6); 4534 if (ipif == NULL) 4535 return (NULL); 4536 4537 ill = ipif->ipif_ill; 4538 ill_refhold(ill); 4539 ipif_refrele(ipif); 4540 return (ill); 4541 } 4542 4543 /* 4544 * This function is called when an application does not specify an interface 4545 * to be used for multicast traffic (joining a group/sending data). It 4546 * calls ire_lookup_multi() to look for an interface route for the 4547 * specified multicast group. Doing this allows the administrator to add 4548 * prefix routes for multicast to indicate which interface to be used for 4549 * multicast traffic in the above scenario. The route could be for all 4550 * multicast (224.0/4), for a single multicast group (a /32 route) or 4551 * anything in between. If there is no such multicast route, we just find 4552 * any multicast capable interface and return it. The returned ipif 4553 * is refhold'ed. 4554 * 4555 * We support MULTIRT and RTF_SETSRC on the multicast routes added to the 4556 * unicast table. This is used by CGTP. 4557 */ 4558 ill_t * 4559 ill_lookup_group_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst, 4560 boolean_t *multirtp, ipaddr_t *setsrcp) 4561 { 4562 ill_t *ill; 4563 4564 ill = ire_lookup_multi_ill_v4(group, zoneid, ipst, multirtp, setsrcp); 4565 if (ill != NULL) 4566 return (ill); 4567 4568 return (ill_lookup_multicast(ipst, zoneid, B_FALSE)); 4569 } 4570 4571 /* 4572 * Look for an ipif with the specified interface address and destination. 4573 * The destination address is used only for matching point-to-point interfaces.
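 * If no point-to-point match is found we fall back to a plain
 * local-address lookup via ipif_lookup_addr() below, in which case `dst'
 * plays no part.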
4574 */ 4575 ipif_t * 4576 ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, ip_stack_t *ipst) 4577 { 4578 ipif_t *ipif; 4579 ill_t *ill; 4580 ill_walk_context_t ctx; 4581 4582 /* 4583 * First match all the point-to-point interfaces 4584 * before looking at non-point-to-point interfaces. 4585 * This is done to avoid returning non-point-to-point 4586 * ipif instead of unnumbered point-to-point ipif. 4587 */ 4588 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4589 ill = ILL_START_WALK_V4(&ctx, ipst); 4590 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 4591 mutex_enter(&ill->ill_lock); 4592 for (ipif = ill->ill_ipif; ipif != NULL; 4593 ipif = ipif->ipif_next) { 4594 /* Allow the ipif to be down */ 4595 if ((ipif->ipif_flags & IPIF_POINTOPOINT) && 4596 (ipif->ipif_lcl_addr == if_addr) && 4597 (ipif->ipif_pp_dst_addr == dst)) { 4598 if (!IPIF_IS_CONDEMNED(ipif)) { 4599 ipif_refhold_locked(ipif); 4600 mutex_exit(&ill->ill_lock); 4601 rw_exit(&ipst->ips_ill_g_lock); 4602 return (ipif); 4603 } 4604 } 4605 } 4606 mutex_exit(&ill->ill_lock); 4607 } 4608 rw_exit(&ipst->ips_ill_g_lock); 4609 4610 /* lookup the ipif based on interface address */ 4611 ipif = ipif_lookup_addr(if_addr, NULL, ALL_ZONES, ipst); 4612 ASSERT(ipif == NULL || !ipif->ipif_isv6); 4613 return (ipif); 4614 } 4615 4616 /* 4617 * Common function for ipif_lookup_addr() and ipif_lookup_addr_exact(). 4618 */ 4619 static ipif_t * 4620 ipif_lookup_addr_common(ipaddr_t addr, ill_t *match_ill, uint32_t match_flags, 4621 zoneid_t zoneid, ip_stack_t *ipst) 4622 { 4623 ipif_t *ipif; 4624 ill_t *ill; 4625 boolean_t ptp = B_FALSE; 4626 ill_walk_context_t ctx; 4627 boolean_t match_illgrp = (match_flags & IPIF_MATCH_ILLGRP); 4628 boolean_t no_duplicate = (match_flags & IPIF_MATCH_NONDUP); 4629 4630 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4631 /* 4632 * Repeat twice, first based on local addresses and 4633 * next time for pointopoint. 4634 */ 4635 repeat: 4636 ill = ILL_START_WALK_V4(&ctx, ipst); 4637 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 4638 if (match_ill != NULL && ill != match_ill && 4639 (!match_illgrp || !IS_IN_SAME_ILLGRP(ill, match_ill))) { 4640 continue; 4641 } 4642 mutex_enter(&ill->ill_lock); 4643 for (ipif = ill->ill_ipif; ipif != NULL; 4644 ipif = ipif->ipif_next) { 4645 if (zoneid != ALL_ZONES && 4646 zoneid != ipif->ipif_zoneid && 4647 ipif->ipif_zoneid != ALL_ZONES) 4648 continue; 4649 4650 if (no_duplicate && !(ipif->ipif_flags & IPIF_UP)) 4651 continue; 4652 4653 /* Allow the ipif to be down */ 4654 if ((!ptp && (ipif->ipif_lcl_addr == addr) && 4655 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || 4656 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) && 4657 (ipif->ipif_pp_dst_addr == addr))) { 4658 if (!IPIF_IS_CONDEMNED(ipif)) { 4659 ipif_refhold_locked(ipif); 4660 mutex_exit(&ill->ill_lock); 4661 rw_exit(&ipst->ips_ill_g_lock); 4662 return (ipif); 4663 } 4664 } 4665 } 4666 mutex_exit(&ill->ill_lock); 4667 } 4668 4669 /* If we already did the ptp case, then we are done */ 4670 if (ptp) { 4671 rw_exit(&ipst->ips_ill_g_lock); 4672 return (NULL); 4673 } 4674 ptp = B_TRUE; 4675 goto repeat; 4676 } 4677 4678 /* 4679 * Lookup an ipif with the specified address. For point-to-point links we 4680 * look for matches on either the destination address or the local address, 4681 * but we skip the local address check if IPIF_UNNUMBERED is set. If the 4682 * `match_ill' argument is non-NULL, the lookup is restricted to that ill 4683 * (or illgrp if `match_ill' is in an IPMP group). 
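 * A minimal usage sketch (hypothetical caller; the returned ipif is
 * refheld for us):
 *
 *	ipif = ipif_lookup_addr(addr, NULL, ALL_ZONES, ipst);
 *	if (ipif != NULL) {
 *		... inspect the ipif ...
 *		ipif_refrele(ipif);
 *	}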
4684 */ 4685 ipif_t * 4686 ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, 4687 ip_stack_t *ipst) 4688 { 4689 return (ipif_lookup_addr_common(addr, match_ill, IPIF_MATCH_ILLGRP, 4690 zoneid, ipst)); 4691 } 4692 4693 /* 4694 * Lookup an ipif with the specified address. Similar to ipif_lookup_addr, 4695 * except that we will only return an address if it is not marked as 4696 * IPIF_DUPLICATE. 4697 */ 4698 ipif_t * 4699 ipif_lookup_addr_nondup(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, 4700 ip_stack_t *ipst) 4701 { 4702 return (ipif_lookup_addr_common(addr, match_ill, 4703 (IPIF_MATCH_ILLGRP | IPIF_MATCH_NONDUP), 4704 zoneid, ipst)); 4705 } 4706 4707 /* 4708 * Special abbreviated version of ipif_lookup_addr() that doesn't match 4709 * `match_ill' across the IPMP group. This function is only needed in some 4710 * corner-cases; almost everything should use ipif_lookup_addr(). 4711 */ 4712 ipif_t * 4713 ipif_lookup_addr_exact(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst) 4714 { 4715 ASSERT(match_ill != NULL); 4716 return (ipif_lookup_addr_common(addr, match_ill, 0, ALL_ZONES, 4717 ipst)); 4718 } 4719 4720 /* 4721 * Look for an ipif with the specified address. For point-to-point links 4722 * we look for matches on either the destination address or the local 4723 * address, but we ignore the check on the local address if IPIF_UNNUMBERED 4724 * is set. 4725 * If the `match_ill' argument is non-NULL, the lookup is restricted to that 4726 * ill (or illgrp if `match_ill' is in an IPMP group). 4727 * Return the zoneid for the ipif which matches. ALL_ZONES if no match. 4728 */ 4729 zoneid_t 4730 ipif_lookup_addr_zoneid(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst) 4731 { 4732 zoneid_t zoneid; 4733 ipif_t *ipif; 4734 ill_t *ill; 4735 boolean_t ptp = B_FALSE; 4736 ill_walk_context_t ctx; 4737 4738 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4739 /* 4740 * Repeat twice, first based on local addresses and 4741 * next time for pointopoint. 4742 */ 4743 repeat: 4744 ill = ILL_START_WALK_V4(&ctx, ipst); 4745 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 4746 if (match_ill != NULL && ill != match_ill && 4747 !IS_IN_SAME_ILLGRP(ill, match_ill)) { 4748 continue; 4749 } 4750 mutex_enter(&ill->ill_lock); 4751 for (ipif = ill->ill_ipif; ipif != NULL; 4752 ipif = ipif->ipif_next) { 4753 /* Allow the ipif to be down */ 4754 if ((!ptp && (ipif->ipif_lcl_addr == addr) && 4755 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || 4756 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) && 4757 (ipif->ipif_pp_dst_addr == addr)) && 4758 !(ipif->ipif_state_flags & IPIF_CONDEMNED)) { 4759 zoneid = ipif->ipif_zoneid; 4760 mutex_exit(&ill->ill_lock); 4761 rw_exit(&ipst->ips_ill_g_lock); 4762 /* 4763 * If ipif_zoneid was ALL_ZONES then we have 4764 * a trusted extensions shared IP address. 4765 * In that case GLOBAL_ZONEID works to send. 4766 */ 4767 if (zoneid == ALL_ZONES) 4768 zoneid = GLOBAL_ZONEID; 4769 return (zoneid); 4770 } 4771 } 4772 mutex_exit(&ill->ill_lock); 4773 } 4774 4775 /* If we already did the ptp case, then we are done */ 4776 if (ptp) { 4777 rw_exit(&ipst->ips_ill_g_lock); 4778 return (ALL_ZONES); 4779 } 4780 ptp = B_TRUE; 4781 goto repeat; 4782 } 4783 4784 /* 4785 * Look for an ipif that matches the specified remote address, i.e. the 4786 * ipif that would receive the specified packet. 4787 * First look for directly connected interfaces and then do a recursive 4788 * IRE lookup and pick the first ipif corresponding to the source address in the 4789 * ire.
4790 * Returns: held ipif 4791 * 4792 * This is only used for ICMP_ADDRESS_MASK_REQUESTs 4793 */ 4794 ipif_t * 4795 ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid) 4796 { 4797 ipif_t *ipif; 4798 4799 ASSERT(!ill->ill_isv6); 4800 4801 /* 4802 * Someone could be changing this ipif currently or change it 4803 * after we return this. Thus a few packets could use the old 4804 * values. However structure updates/creates (ire, ilg, ilm etc) 4805 * will atomically be updated or cleaned up with the new value. 4806 * Thus we don't need a lock to check the flags or other attrs below. 4807 */ 4808 mutex_enter(&ill->ill_lock); 4809 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 4810 if (IPIF_IS_CONDEMNED(ipif)) 4811 continue; 4812 if (zoneid != ALL_ZONES && zoneid != ipif->ipif_zoneid && 4813 ipif->ipif_zoneid != ALL_ZONES) 4814 continue; 4815 /* Allow the ipif to be down */ 4816 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 4817 if ((ipif->ipif_pp_dst_addr == addr) || 4818 (!(ipif->ipif_flags & IPIF_UNNUMBERED) && 4819 ipif->ipif_lcl_addr == addr)) { 4820 ipif_refhold_locked(ipif); 4821 mutex_exit(&ill->ill_lock); 4822 return (ipif); 4823 } 4824 } else if (ipif->ipif_subnet == (addr & ipif->ipif_net_mask)) { 4825 ipif_refhold_locked(ipif); 4826 mutex_exit(&ill->ill_lock); 4827 return (ipif); 4828 } 4829 } 4830 mutex_exit(&ill->ill_lock); 4831 /* 4832 * For a remote destination it isn't possible to nail down a particular 4833 * ipif. 4834 */ 4835 4836 /* Pick the first interface */ 4837 ipif = ipif_get_next_ipif(NULL, ill); 4838 return (ipif); 4839 } 4840 4841 /* 4842 * This func does not prevent refcnt from increasing. But if 4843 * the caller has taken steps to that effect, then this func 4844 * can be used to determine whether the ill has become quiescent. 4845 */ 4846 static boolean_t 4847 ill_is_quiescent(ill_t *ill) 4848 { 4849 ipif_t *ipif; 4850 4851 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4852 4853 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 4854 if (ipif->ipif_refcnt != 0) 4855 return (B_FALSE); 4856 } 4857 if (!ILL_DOWN_OK(ill) || ill->ill_refcnt != 0) { 4858 return (B_FALSE); 4859 } 4860 return (B_TRUE); 4861 } 4862 4863 boolean_t 4864 ill_is_freeable(ill_t *ill) 4865 { 4866 ipif_t *ipif; 4867 4868 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4869 4870 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 4871 if (ipif->ipif_refcnt != 0) { 4872 return (B_FALSE); 4873 } 4874 } 4875 if (!ILL_FREE_OK(ill) || ill->ill_refcnt != 0) { 4876 return (B_FALSE); 4877 } 4878 return (B_TRUE); 4879 } 4880 4881 /* 4882 * This func does not prevent refcnt from increasing. But if 4883 * the caller has taken steps to that effect, then this func 4884 * can be used to determine whether the ipif has become quiescent. 4885 */ 4886 static boolean_t 4887 ipif_is_quiescent(ipif_t *ipif) 4888 { 4889 ill_t *ill; 4890 4891 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 4892 4893 if (ipif->ipif_refcnt != 0) 4894 return (B_FALSE); 4895 4896 ill = ipif->ipif_ill; 4897 if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 || 4898 ill->ill_logical_down) { 4899 return (B_TRUE); 4900 } 4901 4902 /* This is the last ipif going down or being deleted on this ill */ 4903 if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0) { 4904 return (B_FALSE); 4905 } 4906 4907 return (B_TRUE); 4908 } 4909 4910 /* 4911 * Return true if the ipif can be destroyed: the ipif has to be quiescent 4912 * with zero references from ire/ilm to it.
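 * (Unlike ipif_is_quiescent() above, the ill-wide counts are not examined
 * here; ill_is_freeable() covers those once the last ipif is going away.)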
4913 */ 4914 static boolean_t 4915 ipif_is_freeable(ipif_t *ipif) 4916 { 4917 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 4918 ASSERT(ipif->ipif_id != 0); 4919 return (ipif->ipif_refcnt == 0); 4920 } 4921 4922 /* 4923 * The ipif/ill/ire has been refreled. Do the tail processing. 4924 * Determine if the ipif or ill in question has become quiescent and if so 4925 * wake up close and/or restart any queued pending ioctl that is waiting 4926 * for the ipif_down (or ill_down). 4927 */ 4928 void 4929 ipif_ill_refrele_tail(ill_t *ill) 4930 { 4931 mblk_t *mp; 4932 conn_t *connp; 4933 ipsq_t *ipsq; 4934 ipxop_t *ipx; 4935 ipif_t *ipif; 4936 dl_notify_ind_t *dlindp; 4937 4938 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4939 4940 if ((ill->ill_state_flags & ILL_CONDEMNED) && ill_is_freeable(ill)) { 4941 /* ip_modclose() may be waiting */ 4942 cv_broadcast(&ill->ill_cv); 4943 } 4944 4945 ipsq = ill->ill_phyint->phyint_ipsq; 4946 mutex_enter(&ipsq->ipsq_lock); 4947 ipx = ipsq->ipsq_xop; 4948 mutex_enter(&ipx->ipx_lock); 4949 if (ipx->ipx_waitfor == 0) /* no one's waiting; bail */ 4950 goto unlock; 4951 4952 ASSERT(ipx->ipx_pending_mp != NULL && ipx->ipx_pending_ipif != NULL); 4953 4954 ipif = ipx->ipx_pending_ipif; 4955 if (ipif->ipif_ill != ill) /* wait is for another ill; bail */ 4956 goto unlock; 4957 4958 switch (ipx->ipx_waitfor) { 4959 case IPIF_DOWN: 4960 if (!ipif_is_quiescent(ipif)) 4961 goto unlock; 4962 break; 4963 case IPIF_FREE: 4964 if (!ipif_is_freeable(ipif)) 4965 goto unlock; 4966 break; 4967 case ILL_DOWN: 4968 if (!ill_is_quiescent(ill)) 4969 goto unlock; 4970 break; 4971 case ILL_FREE: 4972 /* 4973 * ILL_FREE is only for loopback; normal ill teardown waits 4974 * synchronously in ip_modclose() without using ipx_waitfor, 4975 * handled by the cv_broadcast() at the top of this function. 4976 */ 4977 if (!ill_is_freeable(ill)) 4978 goto unlock; 4979 break; 4980 default: 4981 cmn_err(CE_PANIC, "ipsq: %p unknown ipx_waitfor %d\n", 4982 (void *)ipsq, ipx->ipx_waitfor); 4983 } 4984 4985 ill_refhold_locked(ill); /* for qwriter_ip() call below */ 4986 mutex_exit(&ipx->ipx_lock); 4987 mp = ipsq_pending_mp_get(ipsq, &connp); 4988 mutex_exit(&ipsq->ipsq_lock); 4989 mutex_exit(&ill->ill_lock); 4990 4991 ASSERT(mp != NULL); 4992 /* 4993 * NOTE: all of the qwriter_ip() calls below use CUR_OP since 4994 * we can only get here when the current operation decides it 4995 * needs to quiesce via ipsq_pending_mp_add(). 4996 */ 4997 switch (mp->b_datap->db_type) { 4998 case M_PCPROTO: 4999 case M_PROTO: 5000 /* 5001 * For now, only DL_NOTIFY_IND messages can use this facility. 5002 */ 5003 dlindp = (dl_notify_ind_t *)mp->b_rptr; 5004 ASSERT(dlindp->dl_primitive == DL_NOTIFY_IND); 5005 5006 switch (dlindp->dl_notification) { 5007 case DL_NOTE_PHYS_ADDR: 5008 qwriter_ip(ill, ill->ill_rq, mp, 5009 ill_set_phys_addr_tail, CUR_OP, B_TRUE); 5010 return; 5011 case DL_NOTE_REPLUMB: 5012 qwriter_ip(ill, ill->ill_rq, mp, 5013 ill_replumb_tail, CUR_OP, B_TRUE); 5014 return; 5015 default: 5016 ASSERT(0); 5017 ill_refrele(ill); 5018 } 5019 break; 5020 5021 case M_ERROR: 5022 case M_HANGUP: 5023 qwriter_ip(ill, ill->ill_rq, mp, ipif_all_down_tail, CUR_OP, 5024 B_TRUE); 5025 return; 5026 5027 case M_IOCTL: 5028 case M_IOCDATA: 5029 qwriter_ip(ill, (connp != NULL ?
CONNP_TO_WQ(connp) : 5030 ill->ill_wq), mp, ip_reprocess_ioctl, CUR_OP, B_TRUE); 5031 return; 5032 5033 default: 5034 cmn_err(CE_PANIC, "ipif_ill_refrele_tail mp %p " 5035 "db_type %d\n", (void *)mp, mp->b_datap->db_type); 5036 } 5037 return; 5038 unlock: 5039 mutex_exit(&ipsq->ipsq_lock); 5040 mutex_exit(&ipx->ipx_lock); 5041 mutex_exit(&ill->ill_lock); 5042 } 5043 5044 #ifdef DEBUG 5045 /* Reuse trace buffer from beginning (if reached the end) and record trace */ 5046 static void 5047 th_trace_rrecord(th_trace_t *th_trace) 5048 { 5049 tr_buf_t *tr_buf; 5050 uint_t lastref; 5051 5052 lastref = th_trace->th_trace_lastref; 5053 lastref++; 5054 if (lastref == TR_BUF_MAX) 5055 lastref = 0; 5056 th_trace->th_trace_lastref = lastref; 5057 tr_buf = &th_trace->th_trbuf[lastref]; 5058 tr_buf->tr_time = ddi_get_lbolt(); 5059 tr_buf->tr_depth = getpcstack(tr_buf->tr_stack, TR_STACK_DEPTH); 5060 } 5061 5062 static void 5063 th_trace_free(void *value) 5064 { 5065 th_trace_t *th_trace = value; 5066 5067 ASSERT(th_trace->th_refcnt == 0); 5068 kmem_free(th_trace, sizeof (*th_trace)); 5069 } 5070 5071 /* 5072 * Find or create the per-thread hash table used to track object references. 5073 * The ipst argument is NULL if we shouldn't allocate. 5074 * 5075 * Accesses per-thread data, so there's no need to lock here. 5076 */ 5077 static mod_hash_t * 5078 th_trace_gethash(ip_stack_t *ipst) 5079 { 5080 th_hash_t *thh; 5081 5082 if ((thh = tsd_get(ip_thread_data)) == NULL && ipst != NULL) { 5083 mod_hash_t *mh; 5084 char name[256]; 5085 size_t objsize, rshift; 5086 int retv; 5087 5088 if ((thh = kmem_alloc(sizeof (*thh), KM_NOSLEEP)) == NULL) 5089 return (NULL); 5090 (void) snprintf(name, sizeof (name), "th_trace_%p", 5091 (void *)curthread); 5092 5093 /* 5094 * We use mod_hash_create_extended here rather than the more 5095 * obvious mod_hash_create_ptrhash because the latter has a 5096 * hard-coded KM_SLEEP, and we'd prefer to fail rather than 5097 * block. 5098 */ 5099 objsize = MAX(MAX(sizeof (ill_t), sizeof (ipif_t)), 5100 MAX(sizeof (ire_t), sizeof (ncec_t))); 5101 rshift = highbit(objsize); 5102 mh = mod_hash_create_extended(name, 64, mod_hash_null_keydtor, 5103 th_trace_free, mod_hash_byptr, (void *)rshift, 5104 mod_hash_ptrkey_cmp, KM_NOSLEEP); 5105 if (mh == NULL) { 5106 kmem_free(thh, sizeof (*thh)); 5107 return (NULL); 5108 } 5109 thh->thh_hash = mh; 5110 thh->thh_ipst = ipst; 5111 /* 5112 * We trace ills, ipifs, ires, and nces. All of these are 5113 * per-IP-stack, so the lock on the thread list is as well. 5114 */ 5115 rw_enter(&ip_thread_rwlock, RW_WRITER); 5116 list_insert_tail(&ip_thread_list, thh); 5117 rw_exit(&ip_thread_rwlock); 5118 retv = tsd_set(ip_thread_data, thh); 5119 ASSERT(retv == 0); 5120 } 5121 return (thh != NULL ? thh->thh_hash : NULL); 5122 } 5123 5124 boolean_t 5125 th_trace_ref(const void *obj, ip_stack_t *ipst) 5126 { 5127 th_trace_t *th_trace; 5128 mod_hash_t *mh; 5129 mod_hash_val_t val; 5130 5131 if ((mh = th_trace_gethash(ipst)) == NULL) 5132 return (B_FALSE); 5133 5134 /* 5135 * Attempt to locate the trace buffer for this obj and thread. 5136 * If it does not exist, then allocate a new trace buffer and 5137 * insert into the hash. 
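 * The zalloc below is KM_NOSLEEP, so under memory pressure we return
 * B_FALSE and the callers (ipif_trace_ref()/ill_trace_ref() below)
 * simply disable tracing for that object.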
5138 */ 5139 if (mod_hash_find(mh, (mod_hash_key_t)obj, &val) == MH_ERR_NOTFOUND) { 5140 th_trace = kmem_zalloc(sizeof (th_trace_t), KM_NOSLEEP); 5141 if (th_trace == NULL) 5142 return (B_FALSE); 5143 5144 th_trace->th_id = curthread; 5145 if (mod_hash_insert(mh, (mod_hash_key_t)obj, 5146 (mod_hash_val_t)th_trace) != 0) { 5147 kmem_free(th_trace, sizeof (th_trace_t)); 5148 return (B_FALSE); 5149 } 5150 } else { 5151 th_trace = (th_trace_t *)val; 5152 } 5153 5154 ASSERT(th_trace->th_refcnt >= 0 && 5155 th_trace->th_refcnt < TR_BUF_MAX - 1); 5156 5157 th_trace->th_refcnt++; 5158 th_trace_rrecord(th_trace); 5159 return (B_TRUE); 5160 } 5161 5162 /* 5163 * For the purpose of tracing a reference release, we assume that global 5164 * tracing is always on and that the same thread that initiated the reference 5165 * hold is releasing it. 5166 */ 5167 void 5168 th_trace_unref(const void *obj) 5169 { 5170 int retv; 5171 mod_hash_t *mh; 5172 th_trace_t *th_trace; 5173 mod_hash_val_t val; 5174 5175 mh = th_trace_gethash(NULL); 5176 retv = mod_hash_find(mh, (mod_hash_key_t)obj, &val); 5177 ASSERT(retv == 0); 5178 th_trace = (th_trace_t *)val; 5179 5180 ASSERT(th_trace->th_refcnt > 0); 5181 th_trace->th_refcnt--; 5182 th_trace_rrecord(th_trace); 5183 } 5184 5185 /* 5186 * If tracing has been disabled, then we assume that the reference counts are 5187 * now useless, and we clear them out before destroying the entries. 5188 */ 5189 void 5190 th_trace_cleanup(const void *obj, boolean_t trace_disable) 5191 { 5192 th_hash_t *thh; 5193 mod_hash_t *mh; 5194 mod_hash_val_t val; 5195 th_trace_t *th_trace; 5196 int retv; 5197 5198 rw_enter(&ip_thread_rwlock, RW_READER); 5199 for (thh = list_head(&ip_thread_list); thh != NULL; 5200 thh = list_next(&ip_thread_list, thh)) { 5201 if (mod_hash_find(mh = thh->thh_hash, (mod_hash_key_t)obj, 5202 &val) == 0) { 5203 th_trace = (th_trace_t *)val; 5204 if (trace_disable) 5205 th_trace->th_refcnt = 0; 5206 retv = mod_hash_destroy(mh, (mod_hash_key_t)obj); 5207 ASSERT(retv == 0); 5208 } 5209 } 5210 rw_exit(&ip_thread_rwlock); 5211 } 5212 5213 void 5214 ipif_trace_ref(ipif_t *ipif) 5215 { 5216 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 5217 5218 if (ipif->ipif_trace_disable) 5219 return; 5220 5221 if (!th_trace_ref(ipif, ipif->ipif_ill->ill_ipst)) { 5222 ipif->ipif_trace_disable = B_TRUE; 5223 ipif_trace_cleanup(ipif); 5224 } 5225 } 5226 5227 void 5228 ipif_untrace_ref(ipif_t *ipif) 5229 { 5230 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 5231 5232 if (!ipif->ipif_trace_disable) 5233 th_trace_unref(ipif); 5234 } 5235 5236 void 5237 ill_trace_ref(ill_t *ill) 5238 { 5239 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5240 5241 if (ill->ill_trace_disable) 5242 return; 5243 5244 if (!th_trace_ref(ill, ill->ill_ipst)) { 5245 ill->ill_trace_disable = B_TRUE; 5246 ill_trace_cleanup(ill); 5247 } 5248 } 5249 5250 void 5251 ill_untrace_ref(ill_t *ill) 5252 { 5253 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5254 5255 if (!ill->ill_trace_disable) 5256 th_trace_unref(ill); 5257 } 5258 5259 /* 5260 * Called when ipif is unplumbed or when memory alloc fails. Note that on 5261 * failure, ipif_trace_disable is set. 5262 */ 5263 static void 5264 ipif_trace_cleanup(const ipif_t *ipif) 5265 { 5266 th_trace_cleanup(ipif, ipif->ipif_trace_disable); 5267 } 5268 5269 /* 5270 * Called when ill is unplumbed or when memory alloc fails. Note that on 5271 * failure, ill_trace_disable is set.
5272 */ 5273 static void 5274 ill_trace_cleanup(const ill_t *ill) 5275 { 5276 th_trace_cleanup(ill, ill->ill_trace_disable); 5277 } 5278 #endif /* DEBUG */ 5279 5280 void 5281 ipif_refhold_locked(ipif_t *ipif) 5282 { 5283 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 5284 ipif->ipif_refcnt++; 5285 IPIF_TRACE_REF(ipif); 5286 } 5287 5288 void 5289 ipif_refhold(ipif_t *ipif) 5290 { 5291 ill_t *ill; 5292 5293 ill = ipif->ipif_ill; 5294 mutex_enter(&ill->ill_lock); 5295 ipif->ipif_refcnt++; 5296 IPIF_TRACE_REF(ipif); 5297 mutex_exit(&ill->ill_lock); 5298 } 5299 5300 /* 5301 * Must not be called while holding any locks. Otherwise if this is 5302 * the last reference to be released there is a chance of recursive mutex 5303 * panic due to ipif_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 5304 * to restart an ioctl. 5305 */ 5306 void 5307 ipif_refrele(ipif_t *ipif) 5308 { 5309 ill_t *ill; 5310 5311 ill = ipif->ipif_ill; 5312 5313 mutex_enter(&ill->ill_lock); 5314 ASSERT(ipif->ipif_refcnt != 0); 5315 ipif->ipif_refcnt--; 5316 IPIF_UNTRACE_REF(ipif); 5317 if (ipif->ipif_refcnt != 0) { 5318 mutex_exit(&ill->ill_lock); 5319 return; 5320 } 5321 5322 /* Drops the ill_lock */ 5323 ipif_ill_refrele_tail(ill); 5324 } 5325 5326 ipif_t * 5327 ipif_get_next_ipif(ipif_t *curr, ill_t *ill) 5328 { 5329 ipif_t *ipif; 5330 5331 mutex_enter(&ill->ill_lock); 5332 for (ipif = (curr == NULL ? ill->ill_ipif : curr->ipif_next); 5333 ipif != NULL; ipif = ipif->ipif_next) { 5334 if (IPIF_IS_CONDEMNED(ipif)) 5335 continue; 5336 ipif_refhold_locked(ipif); 5337 mutex_exit(&ill->ill_lock); 5338 return (ipif); 5339 } 5340 mutex_exit(&ill->ill_lock); 5341 return (NULL); 5342 } 5343 5344 /* 5345 * TODO: make this table extendible at run time 5346 * Return a pointer to the mac type info for 'mac_type' 5347 */ 5348 static ip_m_t * 5349 ip_m_lookup(t_uscalar_t mac_type) 5350 { 5351 ip_m_t *ipm; 5352 5353 for (ipm = ip_m_tbl; ipm < A_END(ip_m_tbl); ipm++) 5354 if (ipm->ip_m_mac_type == mac_type) 5355 return (ipm); 5356 return (NULL); 5357 } 5358 5359 /* 5360 * Make a link layer address from the multicast IP address *addr. 5361 * To form the link layer address, invoke the ip_m_v*mapping function 5362 * associated with the link-layer type. 5363 */ 5364 void 5365 ip_mcast_mapping(ill_t *ill, uchar_t *addr, uchar_t *hwaddr) 5366 { 5367 ip_m_t *ipm; 5368 5369 if (ill->ill_net_type == IRE_IF_NORESOLVER) 5370 return; 5371 5372 ASSERT(addr != NULL); 5373 5374 ipm = ip_m_lookup(ill->ill_mactype); 5375 if (ipm == NULL || 5376 (ill->ill_isv6 && ipm->ip_m_v6mapping == NULL) || 5377 (!ill->ill_isv6 && ipm->ip_m_v4mapping == NULL)) { 5378 ip0dbg(("no mapping for ill %s mactype 0x%x\n", 5379 ill->ill_name, ill->ill_mactype)); 5380 return; 5381 } 5382 if (ill->ill_isv6) 5383 (*ipm->ip_m_v6mapping)(ill, addr, hwaddr); 5384 else 5385 (*ipm->ip_m_v4mapping)(ill, addr, hwaddr); 5386 } 5387 5388 /* 5389 * ip_rt_add is called to add an IPv4 route to the forwarding table. 5390 * ill is passed in to associate it with the correct interface. 5391 * If ire_arg is set, then we return the held IRE in that location. 
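 * A minimal usage sketch (hypothetical values; error handling elided)
 * for adding a default route through 192.0.2.1 with no RTA_IFP ill and
 * no security attributes:
 *
 *	error = ip_rt_add(0, 0, htonl(0xc0000201), INADDR_ANY,
 *	    RTF_UP | RTF_GATEWAY, NULL, NULL, B_FALSE, NULL, ipst,
 *	    GLOBAL_ZONEID);
 *
 * A zero dst/mask pair selects IRE_DEFAULT in the netmask-based type
 * selection further down.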
5392 */ 5393 int 5394 ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, 5395 ipaddr_t src_addr, int flags, ill_t *ill, ire_t **ire_arg, 5396 boolean_t ioctl_msg, struct rtsa_s *sp, ip_stack_t *ipst, zoneid_t zoneid) 5397 { 5398 ire_t *ire, *nire; 5399 ire_t *gw_ire = NULL; 5400 ipif_t *ipif = NULL; 5401 uint_t type; 5402 int match_flags = MATCH_IRE_TYPE; 5403 tsol_gc_t *gc = NULL; 5404 tsol_gcgrp_t *gcgrp = NULL; 5405 boolean_t gcgrp_xtraref = B_FALSE; 5406 boolean_t cgtp_broadcast; 5407 5408 ip1dbg(("ip_rt_add:")); 5409 5410 if (ire_arg != NULL) 5411 *ire_arg = NULL; 5412 5413 /* 5414 * If this is the case of RTF_HOST being set, then we set the netmask 5415 * to all ones (regardless if one was supplied). 5416 */ 5417 if (flags & RTF_HOST) 5418 mask = IP_HOST_MASK; 5419 5420 /* 5421 * Prevent routes with a zero gateway from being created (since 5422 * interfaces can currently be plumbed and brought up with no assigned 5423 * address). 5424 */ 5425 if (gw_addr == 0) 5426 return (ENETUNREACH); 5427 /* 5428 * Get the ipif, if any, corresponding to the gw_addr. 5429 * If -ifp was specified we restrict ourselves to the ill, otherwise 5430 * we match on the gateway and destination to handle unnumbered pt-pt 5431 * interfaces. 5432 */ 5433 if (ill != NULL) 5434 ipif = ipif_lookup_addr(gw_addr, ill, ALL_ZONES, ipst); 5435 else 5436 ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst); 5437 if (ipif != NULL) { 5438 if (IS_VNI(ipif->ipif_ill)) { 5439 ipif_refrele(ipif); 5440 return (EINVAL); 5441 } 5442 } 5443 5444 /* 5445 * GateD will attempt to create routes with a loopback interface 5446 * address as the gateway and with RTF_GATEWAY set. We allow 5447 * these routes to be added, but create them as interface routes 5448 * since the gateway is an interface address. 5449 */ 5450 if ((ipif != NULL) && (ipif->ipif_ire_type == IRE_LOOPBACK)) { 5451 flags &= ~RTF_GATEWAY; 5452 if (gw_addr == INADDR_LOOPBACK && dst_addr == INADDR_LOOPBACK && 5453 mask == IP_HOST_MASK) { 5454 ire = ire_ftable_lookup_v4(dst_addr, 0, 0, IRE_LOOPBACK, 5455 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst, 5456 NULL); 5457 if (ire != NULL) { 5458 ire_refrele(ire); 5459 ipif_refrele(ipif); 5460 return (EEXIST); 5461 } 5462 ip1dbg(("ip_rt_add: 0x%p creating IRE 0x%x " 5463 "for 0x%x\n", (void *)ipif, 5464 ipif->ipif_ire_type, 5465 ntohl(ipif->ipif_lcl_addr))); 5466 ire = ire_create( 5467 (uchar_t *)&dst_addr, /* dest address */ 5468 (uchar_t *)&mask, /* mask */ 5469 NULL, /* no gateway */ 5470 ipif->ipif_ire_type, /* LOOPBACK */ 5471 ipif->ipif_ill, 5472 zoneid, 5473 (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE : 0, 5474 NULL, 5475 ipst); 5476 5477 if (ire == NULL) { 5478 ipif_refrele(ipif); 5479 return (ENOMEM); 5480 } 5481 /* src address assigned by the caller? */ 5482 if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC)) 5483 ire->ire_setsrc_addr = src_addr; 5484 5485 nire = ire_add(ire); 5486 if (nire == NULL) { 5487 /* 5488 * In the event of failure, ire_add() will have 5489 * already deleted the ire in question, so there 5490 * is no need to do that here. 5491 */ 5492 ipif_refrele(ipif); 5493 return (ENOMEM); 5494 } 5495 /* 5496 * Check if it was a duplicate entry.
This handles 5497 * the case of two racing route adds for the same route. 5498 */ 5499 if (nire != ire) { 5500 ASSERT(nire->ire_identical_ref > 1); 5501 ire_delete(nire); 5502 ire_refrele(nire); 5503 ipif_refrele(ipif); 5504 return (EEXIST); 5505 } 5506 ire = nire; 5507 goto save_ire; 5508 } 5509 } 5510 5511 /* 5512 * The routes for multicast with CGTP are quite special in that 5513 * the gateway is the local interface address, yet RTF_GATEWAY 5514 * is set. We turn off RTF_GATEWAY to provide compatibility with 5515 * this undocumented and unusual use of multicast routes. 5516 */ 5517 if ((flags & RTF_MULTIRT) && ipif != NULL) 5518 flags &= ~RTF_GATEWAY; 5519 5520 /* 5521 * Traditionally, interface routes are ones where RTF_GATEWAY isn't set 5522 * and the gateway address provided is one of the system's interface 5523 * addresses. By using the routing socket interface and supplying an 5524 * RTA_IFP sockaddr with an interface index, an alternate method of 5525 * specifying an interface route to be created is available which uses 5526 * the interface index that specifies the outgoing interface rather than 5527 * the address of an outgoing interface (which may not be able to 5528 * uniquely identify an interface). When coupled with the RTF_GATEWAY 5529 * flag, routes can be specified which not only specify the next-hop to 5530 * be used when routing to a certain prefix, but also which outgoing 5531 * interface should be used. 5532 * 5533 * Previously, interfaces would have unique addresses assigned to them 5534 * and so the address assigned to a particular interface could be used 5535 * to identify a particular interface. One exception to this was the 5536 * case of an unnumbered interface (where IPIF_UNNUMBERED was set). 5537 * 5538 * With the advent of IPv6 and its link-local addresses, this 5539 * restriction was relaxed and interfaces could share addresses between 5540 * themselves. In fact, typically all of the link-local interfaces on 5541 * an IPv6 node or router will have the same link-local address. In 5542 * order to differentiate between these interfaces, the use of an 5543 * interface index is necessary and this index can be carried inside an 5544 * RTA_IFP sockaddr (which is actually a sockaddr_dl). One restriction 5545 * of using the interface index, however, is that all of the ipif's that 5546 * are part of an ill have the same index and so the RTA_IFP sockaddr 5547 * cannot be used to differentiate between ipif's (or logical 5548 * interfaces) that belong to the same ill (physical interface). 5549 * 5550 * For example, in the following case involving IPv4 interfaces and 5551 * logical interfaces 5552 * 5553 * 192.0.2.32 255.255.255.224 192.0.2.33 U if0 5554 * 192.0.2.32 255.255.255.224 192.0.2.34 U if0 5555 * 192.0.2.32 255.255.255.224 192.0.2.35 U if0 5556 * 5557 * the ipif's corresponding to each of these interface routes can be 5558 * uniquely identified by the "gateway" (actually interface address).
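 * No RTA_IFP sockaddr is needed to tell those three apart.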
5559 * 5560 * In the following case, involving multiple IPv6 default routes to a 5561 * particular link-local gateway, the use of RTA_IFP is necessary to 5562 * specify which default route is of interest: 5563 * 5564 * default fe80::123:4567:89ab:cdef U if0 5565 * default fe80::123:4567:89ab:cdef U if1 5566 */ 5567 5568 /* RTF_GATEWAY not set */ 5569 if (!(flags & RTF_GATEWAY)) { 5570 if (sp != NULL) { 5571 ip2dbg(("ip_rt_add: gateway security attributes " 5572 "cannot be set with interface route\n")); 5573 if (ipif != NULL) 5574 ipif_refrele(ipif); 5575 return (EINVAL); 5576 } 5577 5578 /* 5579 * Whether or not ill (RTA_IFP) is set, we require that 5580 * the gateway is one of our local addresses. 5581 */ 5582 if (ipif == NULL) 5583 return (ENETUNREACH); 5584 5585 /* 5586 * We use MATCH_IRE_ILL here. If the caller specified an 5587 * interface (from the RTA_IFP sockaddr) we use it, otherwise 5588 * we use the ill derived from the gateway address. 5589 * We can always match the gateway address since we record it 5590 * in ire_gateway_addr. 5591 * We don't allow RTA_IFP to specify a different ill than the 5592 * one matching the ipif to make sure we can delete the route. 5593 */ 5594 match_flags |= MATCH_IRE_GW | MATCH_IRE_ILL; 5595 if (ill == NULL) { 5596 ill = ipif->ipif_ill; 5597 } else if (ill != ipif->ipif_ill) { 5598 ipif_refrele(ipif); 5599 return (EINVAL); 5600 } 5601 5602 /* 5603 * We check for an existing entry at this point. 5604 * 5605 * Since a netmask isn't passed in via the ioctl interface 5606 * (SIOCADDRT), we don't check for a matching netmask in that 5607 * case. 5608 */ 5609 if (!ioctl_msg) 5610 match_flags |= MATCH_IRE_MASK; 5611 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, 5612 IRE_INTERFACE, ill, ALL_ZONES, NULL, match_flags, 0, ipst, 5613 NULL); 5614 if (ire != NULL) { 5615 ire_refrele(ire); 5616 ipif_refrele(ipif); 5617 return (EEXIST); 5618 } 5619 5620 /* 5621 * Create a copy of the IRE_LOOPBACK, IRE_IF_NORESOLVER or 5622 * IRE_IF_RESOLVER with the modified address, netmask, and 5623 * gateway. 5624 */ 5625 ire = ire_create( 5626 (uchar_t *)&dst_addr, 5627 (uint8_t *)&mask, 5628 (uint8_t *)&gw_addr, 5629 ill->ill_net_type, 5630 ill, 5631 zoneid, 5632 flags, 5633 NULL, 5634 ipst); 5635 if (ire == NULL) { 5636 ipif_refrele(ipif); 5637 return (ENOMEM); 5638 } 5639 5640 /* 5641 * Some software (for example, GateD and Sun Cluster) attempts 5642 * to create (what amount to) IRE_PREFIX routes with the 5643 * loopback address as the gateway. This is primarily done to 5644 * set up prefixes with the RTF_REJECT flag set (for example, 5645 * when generating aggregate routes.) 5646 * 5647 * If the IRE type (as defined by ill->ill_net_type) is 5648 * IRE_LOOPBACK, then we map the request into an 5649 * IRE_IF_NORESOLVER. We also OR in the RTF_BLACKHOLE flag as 5650 * these interface routes, by definition, can only be that. 5651 * 5652 * Needless to say, the real IRE_LOOPBACK is NOT created by this 5653 * routine, but rather using ire_create() directly. 5654 * 5655 */ 5656 if (ill->ill_net_type == IRE_LOOPBACK) { 5657 ire->ire_type = IRE_IF_NORESOLVER; 5658 ire->ire_flags |= RTF_BLACKHOLE; 5659 } 5660 5661 /* src address assigned by the caller? */ 5662 if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC)) 5663 ire->ire_setsrc_addr = src_addr; 5664 5665 nire = ire_add(ire); 5666 if (nire == NULL) { 5667 /* 5668 * In the event of failure, ire_add() will have 5669 * already deleted the ire in question, so there 5670 * is no need to do that here.
5671 */ 5672 ipif_refrele(ipif); 5673 return (ENOMEM); 5674 } 5675 /* 5676 * Check if it was a duplicate entry. This handles 5677 * the case of two racing route adds for the same route 5678 */ 5679 if (nire != ire) { 5680 ire_delete(nire); 5681 ire_refrele(nire); 5682 ipif_refrele(ipif); 5683 return (EEXIST); 5684 } 5685 ire = nire; 5686 goto save_ire; 5687 } 5688 5689 /* 5690 * Get an interface IRE for the specified gateway. 5691 * If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the 5692 * gateway, it is currently unreachable and we fail the request 5693 * accordingly. 5694 * If RTA_IFP was specified we look on that particular ill. 5695 */ 5696 if (ill != NULL) 5697 match_flags |= MATCH_IRE_ILL; 5698 5699 /* Check whether the gateway is reachable. */ 5700 again: 5701 type = IRE_INTERFACE; 5702 if (flags & RTF_INDIRECT) 5703 type |= IRE_OFFLINK; 5704 5705 gw_ire = ire_ftable_lookup_v4(gw_addr, 0, 0, type, ill, 5706 ALL_ZONES, NULL, match_flags, 0, ipst, NULL); 5707 if (gw_ire == NULL) { 5708 /* 5709 * With IPMP, we allow host routes to influence in.mpathd's 5710 * target selection. However, if the test addresses are on 5711 * their own network, the above lookup will fail since the 5712 * underlying IRE_INTERFACEs are marked hidden. So allow 5713 * hidden test IREs to be found and try again. 5714 */ 5715 if (!(match_flags & MATCH_IRE_TESTHIDDEN)) { 5716 match_flags |= MATCH_IRE_TESTHIDDEN; 5717 goto again; 5718 } 5719 5720 if (ipif != NULL) 5721 ipif_refrele(ipif); 5722 return (ENETUNREACH); 5723 } 5724 5725 /* 5726 * We create one of three types of IREs as a result of this request 5727 * based on the netmask. A netmask of all ones (which is automatically 5728 * assumed when RTF_HOST is set) results in an IRE_HOST being created. 5729 * An all zeroes netmask implies a default route so an IRE_DEFAULT is 5730 * created. Otherwise, an IRE_PREFIX route is created for the 5731 * destination prefix. 5732 */ 5733 if (mask == IP_HOST_MASK) 5734 type = IRE_HOST; 5735 else if (mask == 0) 5736 type = IRE_DEFAULT; 5737 else 5738 type = IRE_PREFIX; 5739 5740 /* check for a duplicate entry */ 5741 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill, 5742 ALL_ZONES, NULL, match_flags | MATCH_IRE_MASK | MATCH_IRE_GW, 5743 0, ipst, NULL); 5744 if (ire != NULL) { 5745 if (ipif != NULL) 5746 ipif_refrele(ipif); 5747 ire_refrele(gw_ire); 5748 ire_refrele(ire); 5749 return (EEXIST); 5750 } 5751 5752 /* Security attribute exists */ 5753 if (sp != NULL) { 5754 tsol_gcgrp_addr_t ga; 5755 5756 /* find or create the gateway credentials group */ 5757 ga.ga_af = AF_INET; 5758 IN6_IPADDR_TO_V4MAPPED(gw_addr, &ga.ga_addr); 5759 5760 /* we hold reference to it upon success */ 5761 gcgrp = gcgrp_lookup(&ga, B_TRUE); 5762 if (gcgrp == NULL) { 5763 if (ipif != NULL) 5764 ipif_refrele(ipif); 5765 ire_refrele(gw_ire); 5766 return (ENOMEM); 5767 } 5768 5769 /* 5770 * Create and add the security attribute to the group; a 5771 * reference to the group is made upon allocating a new 5772 * entry successfully. If it finds an already-existing 5773 * entry for the security attribute in the group, it simply 5774 * returns it and no new reference is made to the group. 5775 */ 5776 gc = gc_create(sp, gcgrp, &gcgrp_xtraref); 5777 if (gc == NULL) { 5778 if (ipif != NULL) 5779 ipif_refrele(ipif); 5780 /* release reference held by gcgrp_lookup */ 5781 GCGRP_REFRELE(gcgrp); 5782 ire_refrele(gw_ire); 5783 return (ENOMEM); 5784 } 5785 } 5786 5787 /* Create the IRE. 
*/ 5788 ire = ire_create( 5789 (uchar_t *)&dst_addr, /* dest address */ 5790 (uchar_t *)&mask, /* mask */ 5791 (uchar_t *)&gw_addr, /* gateway address */ 5792 (ushort_t)type, /* IRE type */ 5793 ill, 5794 zoneid, 5795 flags, 5796 gc, /* security attribute */ 5797 ipst); 5798 5799 /* 5800 * The ire holds a reference to the 'gc' and the 'gc' holds a 5801 * reference to the 'gcgrp'. We can now release the extra reference 5802 * the 'gcgrp' acquired in the gcgrp_lookup, if it was not used. 5803 */ 5804 if (gcgrp_xtraref) 5805 GCGRP_REFRELE(gcgrp); 5806 if (ire == NULL) { 5807 if (gc != NULL) 5808 GC_REFRELE(gc); 5809 if (ipif != NULL) 5810 ipif_refrele(ipif); 5811 ire_refrele(gw_ire); 5812 return (ENOMEM); 5813 } 5814 5815 /* Before we add, check if an extra CGTP broadcast is needed */ 5816 cgtp_broadcast = ((flags & RTF_MULTIRT) && 5817 ip_type_v4(ire->ire_addr, ipst) == IRE_BROADCAST); 5818 5819 /* src address assigned by the caller? */ 5820 if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC)) 5821 ire->ire_setsrc_addr = src_addr; 5822 5823 /* 5824 * POLICY: should we allow an RTF_HOST with address INADDR_ANY? 5825 * SUN/OS socket stuff does but do we really want to allow 0.0.0.0? 5826 */ 5827 5828 /* Add the new IRE. */ 5829 nire = ire_add(ire); 5830 if (nire == NULL) { 5831 /* 5832 * In the case of failure, ire_add() will have 5833 * already deleted the ire in question, so there 5834 * is no need to do that here. 5835 */ 5836 if (ipif != NULL) 5837 ipif_refrele(ipif); 5838 ire_refrele(gw_ire); 5839 return (ENOMEM); 5840 } 5841 /* 5842 * Check if it was a duplicate entry. This handles 5843 * the case of two racing route adds for the same route 5844 */ 5845 if (nire != ire) { 5846 ire_delete(nire); 5847 ire_refrele(nire); 5848 if (ipif != NULL) 5849 ipif_refrele(ipif); 5850 ire_refrele(gw_ire); 5851 return (EEXIST); 5852 } 5853 ire = nire; 5854 5855 if (flags & RTF_MULTIRT) { 5856 /* 5857 * Invoke the CGTP (multirouting) filtering module 5858 * to add the dst address to the filtering database. 5859 * Replicated inbound packets coming from that address 5860 * will be filtered to discard the duplicates. 5861 * It is not necessary to call the CGTP filter hook 5862 * when the dst address is a broadcast or multicast, 5863 * because an IP source address cannot be a broadcast 5864 * or a multicast. 5865 */ 5866 if (cgtp_broadcast) { 5867 ip_cgtp_bcast_add(ire, ipst); 5868 goto save_ire; 5869 } 5870 if (ipst->ips_ip_cgtp_filter_ops != NULL && 5871 !CLASSD(ire->ire_addr)) { 5872 int res; 5873 ipif_t *src_ipif; 5874 5875 /* Find the source address corresponding to gw_ire */ 5876 src_ipif = ipif_lookup_addr(gw_ire->ire_gateway_addr, 5877 NULL, zoneid, ipst); 5878 if (src_ipif != NULL) { 5879 res = ipst->ips_ip_cgtp_filter_ops-> 5880 cfo_add_dest_v4( 5881 ipst->ips_netstack->netstack_stackid, 5882 ire->ire_addr, 5883 ire->ire_gateway_addr, 5884 ire->ire_setsrc_addr, 5885 src_ipif->ipif_lcl_addr); 5886 ipif_refrele(src_ipif); 5887 } else { 5888 res = EADDRNOTAVAIL; 5889 } 5890 if (res != 0) { 5891 if (ipif != NULL) 5892 ipif_refrele(ipif); 5893 ire_refrele(gw_ire); 5894 ire_delete(ire); 5895 ire_refrele(ire); /* Held in ire_add */ 5896 return (res); 5897 } 5898 } 5899 } 5900 5901 save_ire: 5902 if (gw_ire != NULL) { 5903 ire_refrele(gw_ire); 5904 gw_ire = NULL; 5905 } 5906 if (ill != NULL) { 5907 /* 5908 * Save enough information so that we can recreate the IRE if 5909 * the interface goes down and then up.
The metrics associated 5910 * with the route will be saved as well when rts_setmetrics() is 5911 * called after the IRE has been created. In the case where 5912 * memory cannot be allocated, none of this information will be 5913 * saved. 5914 */ 5915 ill_save_ire(ill, ire); 5916 } 5917 if (ioctl_msg) 5918 ip_rts_rtmsg(RTM_OLDADD, ire, 0, ipst); 5919 if (ire_arg != NULL) { 5920 /* 5921 * Store the ire that was successfully added into where ire_arg 5922 * points to so that callers don't have to look it up 5923 * themselves (but they are responsible for ire_refrele()ing 5924 * the ire when they are finished with it). 5925 */ 5926 *ire_arg = ire; 5927 } else { 5928 ire_refrele(ire); /* Held in ire_add */ 5929 } 5930 if (ipif != NULL) 5931 ipif_refrele(ipif); 5932 return (0); 5933 } 5934 5935 /* 5936 * ip_rt_delete is called to delete an IPv4 route. 5937 * ill is passed in to associate it with the correct interface. 5938 */ 5939 /* ARGSUSED4 */ 5940 int 5941 ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, 5942 uint_t rtm_addrs, int flags, ill_t *ill, boolean_t ioctl_msg, 5943 ip_stack_t *ipst, zoneid_t zoneid) 5944 { 5945 ire_t *ire = NULL; 5946 ipif_t *ipif; 5947 uint_t type; 5948 uint_t match_flags = MATCH_IRE_TYPE; 5949 int err = 0; 5950 5951 ip1dbg(("ip_rt_delete:")); 5952 /* 5953 * If this is the case of RTF_HOST being set, then we set the netmask 5954 * to all ones. Otherwise, we use the netmask if one was supplied. 5955 */ 5956 if (flags & RTF_HOST) { 5957 mask = IP_HOST_MASK; 5958 match_flags |= MATCH_IRE_MASK; 5959 } else if (rtm_addrs & RTA_NETMASK) { 5960 match_flags |= MATCH_IRE_MASK; 5961 } 5962 5963 /* 5964 * Note that RTF_GATEWAY is never set on a delete, therefore 5965 * we check if the gateway address is one of our interfaces first, 5966 * and fall back on RTF_GATEWAY routes. 5967 * 5968 * This makes it possible to delete an original 5969 * IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1. 5970 * However, we have RTF_KERNEL set on the ones created by ipif_up 5971 * and those cannot be deleted here. 5972 * 5973 * We use MATCH_IRE_ILL if we know the interface. If the caller 5974 * specified an interface (from the RTA_IFP sockaddr) we use it, 5975 * otherwise we use the ill derived from the gateway address. 5976 * We can always match the gateway address since we record it 5977 * in ire_gateway_addr. 5978 * 5979 * For more detail on specifying routes by gateway address and by 5980 * interface index, see the comments in ip_rt_add().
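 *
 * In outline: first try to treat the gateway as one of our own
 * interface addresses and match an interface (or loopback) route,
 * refusing RTF_KERNEL ones; failing that, fall back to a gateway
 * route whose type (host, default or prefix) is derived from the
 * netmask.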
5981 */ 5982 ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst); 5983 if (ipif != NULL) { 5984 ill_t *ill_match; 5985 5986 if (ill != NULL) 5987 ill_match = ill; 5988 else 5989 ill_match = ipif->ipif_ill; 5990 5991 match_flags |= MATCH_IRE_ILL; 5992 if (ipif->ipif_ire_type == IRE_LOOPBACK) { 5993 ire = ire_ftable_lookup_v4(dst_addr, 0, 0, IRE_LOOPBACK, 5994 ill_match, ALL_ZONES, NULL, match_flags, 0, ipst, 5995 NULL); 5996 } 5997 if (ire == NULL) { 5998 match_flags |= MATCH_IRE_GW; 5999 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, 6000 IRE_INTERFACE, ill_match, ALL_ZONES, NULL, 6001 match_flags, 0, ipst, NULL); 6002 } 6003 /* Avoid deleting routes created by kernel from an ipif */ 6004 if (ire != NULL && (ire->ire_flags & RTF_KERNEL)) { 6005 ire_refrele(ire); 6006 ire = NULL; 6007 } 6008 6009 /* Restore in case we didn't find a match */ 6010 match_flags &= ~(MATCH_IRE_GW|MATCH_IRE_ILL); 6011 } 6012 6013 if (ire == NULL) { 6014 /* 6015 * At this point, the gateway address is not one of our own 6016 * addresses or a matching interface route was not found. We 6017 * set the IRE type to lookup based on whether 6018 * this is a host route, a default route or just a prefix. 6019 * 6020 * If an ill was passed in, then the lookup is based on an 6021 * interface index so MATCH_IRE_ILL is added to match_flags. 6022 */ 6023 match_flags |= MATCH_IRE_GW; 6024 if (ill != NULL) 6025 match_flags |= MATCH_IRE_ILL; 6026 if (mask == IP_HOST_MASK) 6027 type = IRE_HOST; 6028 else if (mask == 0) 6029 type = IRE_DEFAULT; 6030 else 6031 type = IRE_PREFIX; 6032 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill, 6033 ALL_ZONES, NULL, match_flags, 0, ipst, NULL); 6034 } 6035 6036 if (ipif != NULL) { 6037 ipif_refrele(ipif); 6038 ipif = NULL; 6039 } 6040 6041 if (ire == NULL) 6042 return (ESRCH); 6043 6044 if (ire->ire_flags & RTF_MULTIRT) { 6045 /* 6046 * Invoke the CGTP (multirouting) filtering module 6047 * to remove the dst address from the filtering database. 6048 * Packets coming from that address will no longer be 6049 * filtered to remove duplicates. 6050 */ 6051 if (ipst->ips_ip_cgtp_filter_ops != NULL) { 6052 err = ipst->ips_ip_cgtp_filter_ops->cfo_del_dest_v4( 6053 ipst->ips_netstack->netstack_stackid, 6054 ire->ire_addr, ire->ire_gateway_addr); 6055 } 6056 ip_cgtp_bcast_delete(ire, ipst); 6057 } 6058 6059 ill = ire->ire_ill; 6060 if (ill != NULL) 6061 ill_remove_saved_ire(ill, ire); 6062 if (ioctl_msg) 6063 ip_rts_rtmsg(RTM_OLDDEL, ire, 0, ipst); 6064 ire_delete(ire); 6065 ire_refrele(ire); 6066 return (err); 6067 } 6068 6069 /* 6070 * ip_siocaddrt is called to complete processing of an SIOCADDRT IOCTL. 6071 */ 6072 /* ARGSUSED */ 6073 int 6074 ip_siocaddrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 6075 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 6076 { 6077 ipaddr_t dst_addr; 6078 ipaddr_t gw_addr; 6079 ipaddr_t mask; 6080 int error = 0; 6081 mblk_t *mp1; 6082 struct rtentry *rt; 6083 ipif_t *ipif = NULL; 6084 ip_stack_t *ipst; 6085 6086 ASSERT(q->q_next == NULL); 6087 ipst = CONNQ_TO_IPST(q); 6088 6089 ip1dbg(("ip_siocaddrt:")); 6090 /* Existence of mp1 verified in ip_wput_nondata */ 6091 mp1 = mp->b_cont->b_cont; 6092 rt = (struct rtentry *)mp1->b_rptr; 6093 6094 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 6095 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 6096 6097 /* 6098 * If the RTF_HOST flag is on, this is a request to assign a gateway 6099 * to a particular host address. 
In this case, we set the netmask to 6100 * all ones for the particular destination address. Otherwise, 6101 * determine the netmask to be used based on dst_addr and the interfaces 6102 * in use. 6103 */ 6104 if (rt->rt_flags & RTF_HOST) { 6105 mask = IP_HOST_MASK; 6106 } else { 6107 /* 6108 * Note that ip_subnet_mask returns a zero mask in the case of 6109 * default (an all-zeroes address). 6110 */ 6111 mask = ip_subnet_mask(dst_addr, &ipif, ipst); 6112 } 6113 6114 error = ip_rt_add(dst_addr, mask, gw_addr, 0, rt->rt_flags, NULL, NULL, 6115 B_TRUE, NULL, ipst, ALL_ZONES); 6116 if (ipif != NULL) 6117 ipif_refrele(ipif); 6118 return (error); 6119 } 6120 6121 /* 6122 * ip_siocdelrt is called to complete processing of an SIOCDELRT IOCTL. 6123 */ 6124 /* ARGSUSED */ 6125 int 6126 ip_siocdelrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 6127 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 6128 { 6129 ipaddr_t dst_addr; 6130 ipaddr_t gw_addr; 6131 ipaddr_t mask; 6132 int error; 6133 mblk_t *mp1; 6134 struct rtentry *rt; 6135 ipif_t *ipif = NULL; 6136 ip_stack_t *ipst; 6137 6138 ASSERT(q->q_next == NULL); 6139 ipst = CONNQ_TO_IPST(q); 6140 6141 ip1dbg(("ip_siocdelrt:")); 6142 /* Existence of mp1 verified in ip_wput_nondata */ 6143 mp1 = mp->b_cont->b_cont; 6144 rt = (struct rtentry *)mp1->b_rptr; 6145 6146 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 6147 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 6148 6149 /* 6150 * If the RTF_HOST flag is on, this is a request to delete a gateway 6151 * to a particular host address. In this case, we set the netmask to 6152 * all ones for the particular destination address. Otherwise, 6153 * determine the netmask to be used based on dst_addr and the interfaces 6154 * in use. 6155 */ 6156 if (rt->rt_flags & RTF_HOST) { 6157 mask = IP_HOST_MASK; 6158 } else { 6159 /* 6160 * Note that ip_subnet_mask returns a zero mask in the case of 6161 * default (an all-zeroes address). 6162 */ 6163 mask = ip_subnet_mask(dst_addr, &ipif, ipst); 6164 } 6165 6166 error = ip_rt_delete(dst_addr, mask, gw_addr, 6167 RTA_DST | RTA_GATEWAY | RTA_NETMASK, rt->rt_flags, NULL, B_TRUE, 6168 ipst, ALL_ZONES); 6169 if (ipif != NULL) 6170 ipif_refrele(ipif); 6171 return (error); 6172 } 6173 6174 /* 6175 * Enqueue the mp onto the ipsq, chained by b_next. 6176 * b_prev stores the function to be executed later, and b_queue the queue 6177 * where this mp originated. 
6178 */ 6179 void 6180 ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, 6181 ill_t *pending_ill) 6182 { 6183 conn_t *connp; 6184 ipxop_t *ipx = ipsq->ipsq_xop; 6185 6186 ASSERT(MUTEX_HELD(&ipsq->ipsq_lock)); 6187 ASSERT(MUTEX_HELD(&ipx->ipx_lock)); 6188 ASSERT(func != NULL); 6189 6190 mp->b_queue = q; 6191 mp->b_prev = (void *)func; 6192 mp->b_next = NULL; 6193 6194 switch (type) { 6195 case CUR_OP: 6196 if (ipx->ipx_mptail != NULL) { 6197 ASSERT(ipx->ipx_mphead != NULL); 6198 ipx->ipx_mptail->b_next = mp; 6199 } else { 6200 ASSERT(ipx->ipx_mphead == NULL); 6201 ipx->ipx_mphead = mp; 6202 } 6203 ipx->ipx_mptail = mp; 6204 break; 6205 6206 case NEW_OP: 6207 if (ipsq->ipsq_xopq_mptail != NULL) { 6208 ASSERT(ipsq->ipsq_xopq_mphead != NULL); 6209 ipsq->ipsq_xopq_mptail->b_next = mp; 6210 } else { 6211 ASSERT(ipsq->ipsq_xopq_mphead == NULL); 6212 ipsq->ipsq_xopq_mphead = mp; 6213 } 6214 ipsq->ipsq_xopq_mptail = mp; 6215 ipx->ipx_ipsq_queued = B_TRUE; 6216 break; 6217 6218 case SWITCH_OP: 6219 ASSERT(ipsq->ipsq_swxop != NULL); 6220 /* only one switch operation is currently allowed */ 6221 ASSERT(ipsq->ipsq_switch_mp == NULL); 6222 ipsq->ipsq_switch_mp = mp; 6223 ipx->ipx_ipsq_queued = B_TRUE; 6224 break; 6225 default: 6226 cmn_err(CE_PANIC, "ipsq_enq %d type \n", type); 6227 } 6228 6229 if (CONN_Q(q) && pending_ill != NULL) { 6230 connp = Q_TO_CONN(q); 6231 ASSERT(MUTEX_HELD(&connp->conn_lock)); 6232 connp->conn_oper_pending_ill = pending_ill; 6233 } 6234 } 6235 6236 /* 6237 * Dequeue the next message that requested exclusive access to this IPSQ's 6238 * xop. Specifically: 6239 * 6240 * 1. If we're still processing the current operation on `ipsq', then 6241 * dequeue the next message for the operation (from ipx_mphead), or 6242 * return NULL if there are no queued messages for the operation. 6243 * These messages are queued via CUR_OP to qwriter_ip() and friends. 6244 * 6245 * 2. If the current operation on `ipsq' has completed (ipx_current_ipif is 6246 * not set) see if the ipsq has requested an xop switch. If so, switch 6247 * `ipsq' to a different xop. Xop switches only happen when joining or 6248 * leaving IPMP groups and require a careful dance -- see the comments 6249 * in-line below for details. If we're leaving a group xop or if we're 6250 * joining a group xop and become writer on it, then we proceed to (3). 6251 * Otherwise, we return NULL and exit the xop. 6252 * 6253 * 3. For each IPSQ in the xop, return any switch operation stored on 6254 * ipsq_switch_mp (set via SWITCH_OP); these must be processed before 6255 * any other messages queued on the IPSQ. Otherwise, dequeue the next 6256 * exclusive operation (queued via NEW_OP) stored on ipsq_xopq_mphead. 6257 * Note that if the phyint tied to `ipsq' is not using IPMP there will 6258 * only be one IPSQ in the xop. Otherwise, there will be one IPSQ for 6259 * each phyint in the group, including the IPMP meta-interface phyint. 6260 */ 6261 static mblk_t * 6262 ipsq_dq(ipsq_t *ipsq) 6263 { 6264 ill_t *illv4, *illv6; 6265 mblk_t *mp; 6266 ipsq_t *xopipsq; 6267 ipsq_t *leftipsq = NULL; 6268 ipxop_t *ipx; 6269 phyint_t *phyi = ipsq->ipsq_phyint; 6270 ip_stack_t *ipst = ipsq->ipsq_ipst; 6271 boolean_t emptied = B_FALSE; 6272 6273 /* 6274 * Grab all the locks we need in the defined order (ill_g_lock -> 6275 * ipsq_lock -> ipx_lock); ill_g_lock is needed to use ipsq_next. 6276 */ 6277 rw_enter(&ipst->ips_ill_g_lock, 6278 ipsq->ipsq_swxop != NULL ? 
RW_WRITER : RW_READER); 6279 mutex_enter(&ipsq->ipsq_lock); 6280 ipx = ipsq->ipsq_xop; 6281 mutex_enter(&ipx->ipx_lock); 6282 6283 /* 6284 * Dequeue the next message associated with the current exclusive 6285 * operation, if any. 6286 */ 6287 if ((mp = ipx->ipx_mphead) != NULL) { 6288 ipx->ipx_mphead = mp->b_next; 6289 if (ipx->ipx_mphead == NULL) 6290 ipx->ipx_mptail = NULL; 6291 mp->b_next = (void *)ipsq; 6292 goto out; 6293 } 6294 6295 if (ipx->ipx_current_ipif != NULL) 6296 goto empty; 6297 6298 if (ipsq->ipsq_swxop != NULL) { 6299 /* 6300 * The exclusive operation that is now being completed has 6301 * requested a switch to a different xop. This happens 6302 * when an interface joins or leaves an IPMP group. Joins 6303 * happen through SIOCSLIFGROUPNAME (ip_sioctl_groupname()). 6304 * Leaves happen via SIOCSLIFGROUPNAME, interface unplumb 6305 * (phyint_free()), or interface plumb for an ill type 6306 * not in the IPMP group (ip_rput_dlpi_writer()). 6307 * 6308 * Xop switches are not allowed on the IPMP meta-interface. 6309 */ 6310 ASSERT(phyi == NULL || !(phyi->phyint_flags & PHYI_IPMP)); 6311 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 6312 DTRACE_PROBE1(ipsq__switch, (ipsq_t *), ipsq); 6313 6314 if (ipsq->ipsq_swxop == &ipsq->ipsq_ownxop) { 6315 /* 6316 * We're switching back to our own xop, so we have two 6317 * xop's to drain/exit: our own, and the group xop 6318 * that we are leaving. 6319 * 6320 * First, pull ourselves out of the group ipsq list. 6321 * This is safe since we're writer on ill_g_lock. 6322 */ 6323 ASSERT(ipsq->ipsq_xop != &ipsq->ipsq_ownxop); 6324 6325 xopipsq = ipx->ipx_ipsq; 6326 while (xopipsq->ipsq_next != ipsq) 6327 xopipsq = xopipsq->ipsq_next; 6328 6329 xopipsq->ipsq_next = ipsq->ipsq_next; 6330 ipsq->ipsq_next = ipsq; 6331 ipsq->ipsq_xop = ipsq->ipsq_swxop; 6332 ipsq->ipsq_swxop = NULL; 6333 6334 /* 6335 * Second, prepare to exit the group xop. The actual 6336 * ipsq_exit() is done at the end of this function 6337 * since we cannot hold any locks across ipsq_exit(). 6338 * Note that although we drop the group's ipx_lock, no 6339 * threads can proceed since we're still ipx_writer. 6340 */ 6341 leftipsq = xopipsq; 6342 mutex_exit(&ipx->ipx_lock); 6343 6344 /* 6345 * Third, set ipx to point to our own xop (which was 6346 * inactive and therefore can be entered). 6347 */ 6348 ipx = ipsq->ipsq_xop; 6349 mutex_enter(&ipx->ipx_lock); 6350 ASSERT(ipx->ipx_writer == NULL); 6351 ASSERT(ipx->ipx_current_ipif == NULL); 6352 } else { 6353 /* 6354 * We're switching from our own xop to a group xop. 6355 * The requestor of the switch must ensure that the 6356 * group xop cannot go away (e.g. by ensuring the 6357 * phyint associated with the xop cannot go away). 6358 * 6359 * If we can become writer on our new xop, then we'll 6360 * do the drain. Otherwise, the current writer of our 6361 * new xop will do the drain when it exits. 6362 * 6363 * First, splice ourselves into the group IPSQ list. 6364 * This is safe since we're writer on ill_g_lock. 6365 */ 6366 ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop); 6367 6368 xopipsq = ipsq->ipsq_swxop->ipx_ipsq; 6369 while (xopipsq->ipsq_next != ipsq->ipsq_swxop->ipx_ipsq) 6370 xopipsq = xopipsq->ipsq_next; 6371 6372 xopipsq->ipsq_next = ipsq; 6373 ipsq->ipsq_next = ipsq->ipsq_swxop->ipx_ipsq; 6374 ipsq->ipsq_xop = ipsq->ipsq_swxop; 6375 ipsq->ipsq_swxop = NULL; 6376 6377 /* 6378 * Second, exit our own xop, since it's now unused. 6379 * This is safe since we've got the only reference. 
6380 */ 6381 ASSERT(ipx->ipx_writer == curthread); 6382 ipx->ipx_writer = NULL; 6383 VERIFY(--ipx->ipx_reentry_cnt == 0); 6384 ipx->ipx_ipsq_queued = B_FALSE; 6385 mutex_exit(&ipx->ipx_lock); 6386 6387 /* 6388 * Third, set ipx to point to our new xop, and check 6389 * if we can become writer on it. If we cannot, then 6390 * the current writer will drain the IPSQ group when 6391 * it exits. Our ipsq_xop is guaranteed to be stable 6392 * because we're still holding ipsq_lock. 6393 */ 6394 ipx = ipsq->ipsq_xop; 6395 mutex_enter(&ipx->ipx_lock); 6396 if (ipx->ipx_writer != NULL || 6397 ipx->ipx_current_ipif != NULL) { 6398 goto out; 6399 } 6400 } 6401 6402 /* 6403 * Fourth, become writer on our new ipx before we continue 6404 * with the drain. Note that we never dropped ipsq_lock 6405 * above, so no other thread could've raced with us to 6406 * become writer first. Also, we're holding ipx_lock, so 6407 * no other thread can examine the ipx right now. 6408 */ 6409 ASSERT(ipx->ipx_current_ipif == NULL); 6410 ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL); 6411 VERIFY(ipx->ipx_reentry_cnt++ == 0); 6412 ipx->ipx_writer = curthread; 6413 ipx->ipx_forced = B_FALSE; 6414 #ifdef DEBUG 6415 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); 6416 #endif 6417 } 6418 6419 xopipsq = ipsq; 6420 do { 6421 /* 6422 * So that other operations operate on a consistent and 6423 * complete phyint, a switch message on an IPSQ must be 6424 * handled prior to any other operations on that IPSQ. 6425 */ 6426 if ((mp = xopipsq->ipsq_switch_mp) != NULL) { 6427 xopipsq->ipsq_switch_mp = NULL; 6428 ASSERT(mp->b_next == NULL); 6429 mp->b_next = (void *)xopipsq; 6430 goto out; 6431 } 6432 6433 if ((mp = xopipsq->ipsq_xopq_mphead) != NULL) { 6434 xopipsq->ipsq_xopq_mphead = mp->b_next; 6435 if (xopipsq->ipsq_xopq_mphead == NULL) 6436 xopipsq->ipsq_xopq_mptail = NULL; 6437 mp->b_next = (void *)xopipsq; 6438 goto out; 6439 } 6440 } while ((xopipsq = xopipsq->ipsq_next) != ipsq); 6441 empty: 6442 /* 6443 * There are no messages. Further, we are holding ipx_lock, hence no 6444 * new messages can end up on any IPSQ in the xop. 6445 */ 6446 ipx->ipx_writer = NULL; 6447 ipx->ipx_forced = B_FALSE; 6448 VERIFY(--ipx->ipx_reentry_cnt == 0); 6449 ipx->ipx_ipsq_queued = B_FALSE; 6450 emptied = B_TRUE; 6451 #ifdef DEBUG 6452 ipx->ipx_depth = 0; 6453 #endif 6454 out: 6455 mutex_exit(&ipx->ipx_lock); 6456 mutex_exit(&ipsq->ipsq_lock); 6457 6458 /* 6459 * If we completely emptied the xop, then wake up any threads waiting 6460 * to enter any of the IPSQ's associated with it. 6461 */ 6462 if (emptied) { 6463 xopipsq = ipsq; 6464 do { 6465 if ((phyi = xopipsq->ipsq_phyint) == NULL) 6466 continue; 6467 6468 illv4 = phyi->phyint_illv4; 6469 illv6 = phyi->phyint_illv6; 6470 6471 GRAB_ILL_LOCKS(illv4, illv6); 6472 if (illv4 != NULL) 6473 cv_broadcast(&illv4->ill_cv); 6474 if (illv6 != NULL) 6475 cv_broadcast(&illv6->ill_cv); 6476 RELEASE_ILL_LOCKS(illv4, illv6); 6477 } while ((xopipsq = xopipsq->ipsq_next) != ipsq); 6478 } 6479 rw_exit(&ipst->ips_ill_g_lock); 6480 6481 /* 6482 * Now that all locks are dropped, exit the IPSQ we left. 6483 */ 6484 if (leftipsq != NULL) 6485 ipsq_exit(leftipsq); 6486 6487 return (mp); 6488 } 6489 6490 /* 6491 * Return completion status of previously initiated DLPI operations on 6492 * ills in the purview of an ipsq. 
6493 */ 6494 static boolean_t 6495 ipsq_dlpi_done(ipsq_t *ipsq) 6496 { 6497 ipsq_t *ipsq_start; 6498 phyint_t *phyi; 6499 ill_t *ill; 6500 6501 ASSERT(RW_LOCK_HELD(&ipsq->ipsq_ipst->ips_ill_g_lock)); 6502 ipsq_start = ipsq; 6503 6504 do { 6505 /* 6506 * The only current users of this function are ipsq_try_enter 6507 * and ipsq_enter which have made sure that ipsq_writer is 6508 * NULL before we reach here. ill_dlpi_pending is modified 6509 * only by an ipsq writer 6510 */ 6511 ASSERT(ipsq->ipsq_xop->ipx_writer == NULL); 6512 phyi = ipsq->ipsq_phyint; 6513 /* 6514 * phyi could be NULL if a phyint that is part of an 6515 * IPMP group is being unplumbed. A more detailed 6516 * comment is in ipmp_grp_update_kstats() 6517 */ 6518 if (phyi != NULL) { 6519 ill = phyi->phyint_illv4; 6520 if (ill != NULL && 6521 (ill->ill_dlpi_pending != DL_PRIM_INVAL || 6522 ill->ill_arl_dlpi_pending)) 6523 return (B_FALSE); 6524 6525 ill = phyi->phyint_illv6; 6526 if (ill != NULL && 6527 ill->ill_dlpi_pending != DL_PRIM_INVAL) 6528 return (B_FALSE); 6529 } 6530 6531 } while ((ipsq = ipsq->ipsq_next) != ipsq_start); 6532 6533 return (B_TRUE); 6534 } 6535 6536 /* 6537 * Enter the ipsq corresponding to ill, by waiting synchronously till 6538 * we can enter the ipsq exclusively. Unless 'force' is used, the ipsq 6539 * will have to drain completely before ipsq_enter returns success. 6540 * ipx_current_ipif will be set if some exclusive op is in progress, 6541 * and the ipsq_exit logic will start the next enqueued op after 6542 * completion of the current op. If 'force' is used, we don't wait 6543 * for the enqueued ops. This is needed when a conn_close wants to 6544 * enter the ipsq and abort an ioctl that is somehow stuck. Unplumb 6545 * of an ill can also use this option. But we dont' use it currently. 6546 */ 6547 #define ENTER_SQ_WAIT_TICKS 100 6548 boolean_t 6549 ipsq_enter(ill_t *ill, boolean_t force, int type) 6550 { 6551 ipsq_t *ipsq; 6552 ipxop_t *ipx; 6553 boolean_t waited_enough = B_FALSE; 6554 ip_stack_t *ipst = ill->ill_ipst; 6555 6556 /* 6557 * Note that the relationship between ill and ipsq is fixed as long as 6558 * the ill is not ILL_CONDEMNED. Holding ipsq_lock ensures the 6559 * relationship between the IPSQ and xop cannot change. However, 6560 * since we cannot hold ipsq_lock across the cv_wait(), it may change 6561 * while we're waiting. We wait on ill_cv and rely on ipsq_exit() 6562 * waking up all ills in the xop when it becomes available. 
6563 */ 6564 for (;;) { 6565 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 6566 mutex_enter(&ill->ill_lock); 6567 if (ill->ill_state_flags & ILL_CONDEMNED) { 6568 mutex_exit(&ill->ill_lock); 6569 rw_exit(&ipst->ips_ill_g_lock); 6570 return (B_FALSE); 6571 } 6572 6573 ipsq = ill->ill_phyint->phyint_ipsq; 6574 mutex_enter(&ipsq->ipsq_lock); 6575 ipx = ipsq->ipsq_xop; 6576 mutex_enter(&ipx->ipx_lock); 6577 6578 if (ipx->ipx_writer == NULL && (type == CUR_OP || 6579 (ipx->ipx_current_ipif == NULL && ipsq_dlpi_done(ipsq)) || 6580 waited_enough)) 6581 break; 6582 6583 rw_exit(&ipst->ips_ill_g_lock); 6584 6585 if (!force || ipx->ipx_writer != NULL) { 6586 mutex_exit(&ipx->ipx_lock); 6587 mutex_exit(&ipsq->ipsq_lock); 6588 cv_wait(&ill->ill_cv, &ill->ill_lock); 6589 } else { 6590 mutex_exit(&ipx->ipx_lock); 6591 mutex_exit(&ipsq->ipsq_lock); 6592 (void) cv_reltimedwait(&ill->ill_cv, 6593 &ill->ill_lock, ENTER_SQ_WAIT_TICKS, TR_CLOCK_TICK); 6594 waited_enough = B_TRUE; 6595 } 6596 mutex_exit(&ill->ill_lock); 6597 } 6598 6599 ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL); 6600 ASSERT(ipx->ipx_reentry_cnt == 0); 6601 ipx->ipx_writer = curthread; 6602 ipx->ipx_forced = (ipx->ipx_current_ipif != NULL); 6603 ipx->ipx_reentry_cnt++; 6604 #ifdef DEBUG 6605 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); 6606 #endif 6607 mutex_exit(&ipx->ipx_lock); 6608 mutex_exit(&ipsq->ipsq_lock); 6609 mutex_exit(&ill->ill_lock); 6610 rw_exit(&ipst->ips_ill_g_lock); 6611 6612 return (B_TRUE); 6613 } 6614 6615 /* 6616 * ipif_set_values() has a constraint that it cannot drop the ips_ill_g_lock 6617 * across the call to the core interface ipsq_try_enter() and hence calls this 6618 * function directly. This is explained more fully in ipif_set_values(). 6619 * In order to support the above constraint, ipsq_try_enter is implemented as 6620 * a wrapper that grabs the ips_ill_g_lock and calls this function subsequently 6621 */ 6622 static ipsq_t * 6623 ipsq_try_enter_internal(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, 6624 int type, boolean_t reentry_ok) 6625 { 6626 ipsq_t *ipsq; 6627 ipxop_t *ipx; 6628 ip_stack_t *ipst = ill->ill_ipst; 6629 6630 /* 6631 * lock ordering: 6632 * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock. 6633 * 6634 * ipx of an ipsq can't change when ipsq_lock is held. 6635 */ 6636 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 6637 GRAB_CONN_LOCK(q); 6638 mutex_enter(&ill->ill_lock); 6639 ipsq = ill->ill_phyint->phyint_ipsq; 6640 mutex_enter(&ipsq->ipsq_lock); 6641 ipx = ipsq->ipsq_xop; 6642 mutex_enter(&ipx->ipx_lock); 6643 6644 /* 6645 * 1. Enter the ipsq if we are already writer and reentry is ok. 6646 * (Note: If the caller does not specify reentry_ok then neither 6647 * 'func' nor any of its callees must ever attempt to enter the ipsq 6648 * again. Otherwise it can lead to an infinite loop 6649 * 2. Enter the ipsq if there is no current writer and this attempted 6650 * entry is part of the current operation 6651 * 3. Enter the ipsq if there is no current writer and this is a new 6652 * operation and the operation queue is empty and there is no 6653 * operation currently in progress and if all previously initiated 6654 * DLPI operations have completed. 6655 */ 6656 if ((ipx->ipx_writer == curthread && reentry_ok) || 6657 (ipx->ipx_writer == NULL && (type == CUR_OP || (type == NEW_OP && 6658 !ipx->ipx_ipsq_queued && ipx->ipx_current_ipif == NULL && 6659 ipsq_dlpi_done(ipsq))))) { 6660 /* Success. 
*/ 6661 ipx->ipx_reentry_cnt++; 6662 ipx->ipx_writer = curthread; 6663 ipx->ipx_forced = B_FALSE; 6664 mutex_exit(&ipx->ipx_lock); 6665 mutex_exit(&ipsq->ipsq_lock); 6666 mutex_exit(&ill->ill_lock); 6667 RELEASE_CONN_LOCK(q); 6668 #ifdef DEBUG 6669 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); 6670 #endif 6671 return (ipsq); 6672 } 6673 6674 if (func != NULL) 6675 ipsq_enq(ipsq, q, mp, func, type, ill); 6676 6677 mutex_exit(&ipx->ipx_lock); 6678 mutex_exit(&ipsq->ipsq_lock); 6679 mutex_exit(&ill->ill_lock); 6680 RELEASE_CONN_LOCK(q); 6681 return (NULL); 6682 } 6683 6684 /* 6685 * The ipsq_t (ipsq) is the synchronization data structure used to serialize 6686 * certain critical operations such as plumbing (i.e., most set ioctls). 6687 * There is one ipsq per phyint. The ipsq 6688 * serializes exclusive ioctls issued by applications on a per ipsq basis in 6689 * ipsq_xopq_mphead. It also protects against multiple threads executing in 6690 * the ipsq. Responses from the driver pertain to the current ioctl (say a 6691 * DL_BIND_ACK in response to a DL_BIND_REQ initiated as part of bringing 6692 * up the interface) and are enqueued in ipx_mphead. 6693 * 6694 * If a thread does not want to reenter the ipsq when it is already writer, 6695 * it must ensure that neither the specified reentry point (to be called 6696 * later, when the ipsq is empty) nor any code path starting from that 6697 * reentry point ever tries to enter the ipsq again. Otherwise it can lead 6698 * to an infinite loop. The reentry point ip_rput_dlpi_writer is an example. 6699 * When the thread that is currently exclusive finishes, it (ipsq_exit) 6700 * dequeues the requests waiting to become exclusive in ipx_mphead and calls 6701 * the reentry point. When the list at ipx_mphead becomes empty, ipsq_exit 6702 * proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next 6703 * ioctl if the current ioctl has completed. If the current ioctl is still 6704 * in progress it simply returns. The current ioctl could be waiting for 6705 * a response from another module (the driver), or could be waiting for 6706 * the ipif/ill/ire refcnts to drop to zero. In such a case the ipx_pending_mp 6707 * and ipx_pending_ipif are set. ipx_current_ipif is set throughout the 6708 * execution of the ioctl and ipsq_exit does not start the next ioctl unless 6709 * ipx_current_ipif is NULL, which happens only once the ioctl is complete and 6710 * all associated DLPI operations have completed. 6711 */ 6712 6713 /* 6714 * Try to enter the IPSQ corresponding to `ipif' or `ill' exclusively (`ipif' 6715 * and `ill' cannot both be specified). Returns a pointer to the entered IPSQ 6716 * on success, or NULL on failure. The caller ensures ipif/ill is valid by 6717 * refholding it as necessary. If the IPSQ cannot be entered and `func' is 6718 * non-NULL, then `func' will be called back with `q' and `mp' once the IPSQ 6719 * can be entered. If `func' is NULL, then `q' and `mp' are ignored.
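 *
 * A hedged usage sketch from an ioctl path (illustrative only;
 * ip_process_ioctl is the usual callback used in this file):
 *
 *	ipsq = ipsq_try_enter(ipif, NULL, q, mp, ip_process_ioctl,
 *	    NEW_OP, B_TRUE);
 *	if (ipsq == NULL)
 *		return;		(redriven later via the callback)
 *	... perform the exclusive operation ...
 *	ipsq_exit(ipsq);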
6720 */ 6721 ipsq_t * 6722 ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp, 6723 ipsq_func_t func, int type, boolean_t reentry_ok) 6724 { 6725 ip_stack_t *ipst; 6726 ipsq_t *ipsq; 6727 6728 /* Only 1 of ipif or ill can be specified */ 6729 ASSERT((ipif != NULL) ^ (ill != NULL)); 6730 6731 if (ipif != NULL) 6732 ill = ipif->ipif_ill; 6733 ipst = ill->ill_ipst; 6734 6735 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 6736 ipsq = ipsq_try_enter_internal(ill, q, mp, func, type, reentry_ok); 6737 rw_exit(&ipst->ips_ill_g_lock); 6738 6739 return (ipsq); 6740 } 6741 6742 /* 6743 * Try to enter the IPSQ corresponding to `ill' as writer. The caller ensures 6744 * ill is valid by refholding it if necessary; we will refrele. If the IPSQ 6745 * cannot be entered, the mp is queued for completion. 6746 */ 6747 void 6748 qwriter_ip(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, 6749 boolean_t reentry_ok) 6750 { 6751 ipsq_t *ipsq; 6752 6753 ipsq = ipsq_try_enter(NULL, ill, q, mp, func, type, reentry_ok); 6754 6755 /* 6756 * Drop the caller's refhold on the ill. This is safe since we either 6757 * entered the IPSQ (and thus are exclusive), or failed to enter the 6758 * IPSQ, in which case we return without accessing ill anymore. This 6759 * is needed because func needs to see the correct refcount. 6760 * e.g. removeif can work only then. 6761 */ 6762 ill_refrele(ill); 6763 if (ipsq != NULL) { 6764 (*func)(ipsq, q, mp, NULL); 6765 ipsq_exit(ipsq); 6766 } 6767 } 6768 6769 /* 6770 * Exit the specified IPSQ. If this is the final exit on it then drain it 6771 * prior to exiting. Caller must be writer on the specified IPSQ. 6772 */ 6773 void 6774 ipsq_exit(ipsq_t *ipsq) 6775 { 6776 mblk_t *mp; 6777 ipsq_t *mp_ipsq; 6778 queue_t *q; 6779 phyint_t *phyi; 6780 ipsq_func_t func; 6781 6782 ASSERT(IAM_WRITER_IPSQ(ipsq)); 6783 6784 ASSERT(ipsq->ipsq_xop->ipx_reentry_cnt >= 1); 6785 if (ipsq->ipsq_xop->ipx_reentry_cnt != 1) { 6786 ipsq->ipsq_xop->ipx_reentry_cnt--; 6787 return; 6788 } 6789 6790 for (;;) { 6791 phyi = ipsq->ipsq_phyint; 6792 mp = ipsq_dq(ipsq); 6793 mp_ipsq = (mp == NULL) ? NULL : (ipsq_t *)mp->b_next; 6794 6795 /* 6796 * If we've changed to a new IPSQ, and the phyint associated 6797 * with the old one has gone away, free the old IPSQ. Note 6798 * that this cannot happen while the IPSQ is in a group. 6799 */ 6800 if (mp_ipsq != ipsq && phyi == NULL) { 6801 ASSERT(ipsq->ipsq_next == ipsq); 6802 ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop); 6803 ipsq_delete(ipsq); 6804 } 6805 6806 if (mp == NULL) 6807 break; 6808 6809 q = mp->b_queue; 6810 func = (ipsq_func_t)mp->b_prev; 6811 ipsq = mp_ipsq; 6812 mp->b_next = mp->b_prev = NULL; 6813 mp->b_queue = NULL; 6814 6815 /* 6816 * If 'q' is an conn queue, it is valid, since we did a 6817 * a refhold on the conn at the start of the ioctl. 6818 * If 'q' is an ill queue, it is valid, since close of an 6819 * ill will clean up its IPSQ. 6820 */ 6821 (*func)(ipsq, q, mp, NULL); 6822 } 6823 } 6824 6825 /* 6826 * Used to start any igmp or mld timers that could not be started 6827 * while holding ill_mcast_lock. The timers can't be started while holding 6828 * the lock, since mld/igmp_start_timers may need to call untimeout() 6829 * which can't be done while holding the lock which the timeout handler 6830 * acquires. Otherwise 6831 * there could be a deadlock since the timeout handlers 6832 * mld_timeout_handler_per_ill/igmp_timeout_handler_per_ill also acquire 6833 * ill_mcast_lock. 
6834 */ 6835 void 6836 ill_mcast_timer_start(ip_stack_t *ipst) 6837 { 6838 int next; 6839 6840 mutex_enter(&ipst->ips_igmp_timer_lock); 6841 next = ipst->ips_igmp_deferred_next; 6842 ipst->ips_igmp_deferred_next = INFINITY; 6843 mutex_exit(&ipst->ips_igmp_timer_lock); 6844 6845 if (next != INFINITY) 6846 igmp_start_timers(next, ipst); 6847 6848 mutex_enter(&ipst->ips_mld_timer_lock); 6849 next = ipst->ips_mld_deferred_next; 6850 ipst->ips_mld_deferred_next = INFINITY; 6851 mutex_exit(&ipst->ips_mld_timer_lock); 6852 6853 if (next != INFINITY) 6854 mld_start_timers(next, ipst); 6855 } 6856 6857 /* 6858 * Start the current exclusive operation on `ipsq'; associate it with `ipif' 6859 * and `ioccmd'. 6860 */ 6861 void 6862 ipsq_current_start(ipsq_t *ipsq, ipif_t *ipif, int ioccmd) 6863 { 6864 ill_t *ill = ipif->ipif_ill; 6865 ipxop_t *ipx = ipsq->ipsq_xop; 6866 6867 ASSERT(IAM_WRITER_IPSQ(ipsq)); 6868 ASSERT(ipx->ipx_current_ipif == NULL); 6869 ASSERT(ipx->ipx_current_ioctl == 0); 6870 6871 ipx->ipx_current_done = B_FALSE; 6872 ipx->ipx_current_ioctl = ioccmd; 6873 mutex_enter(&ipx->ipx_lock); 6874 ipx->ipx_current_ipif = ipif; 6875 mutex_exit(&ipx->ipx_lock); 6876 6877 /* 6878 * Set IPIF_CHANGING on one or more ipifs associated with the 6879 * current exclusive operation. IPIF_CHANGING prevents any new 6880 * references to the ipif (so that the references will eventually 6881 * drop to zero) and also prevents any "get" operations (e.g., 6882 * SIOCGLIFFLAGS) from being able to access the ipif until the 6883 * operation has completed and the ipif is again in a stable state. 6884 * 6885 * For ioctls, IPIF_CHANGING is set on the ipif associated with the 6886 * ioctl. For internal operations (where ioccmd is zero), all ipifs 6887 * on the ill are marked with IPIF_CHANGING since it's unclear which 6888 * ipifs will be affected. 6889 * 6890 * Note that SIOCLIFREMOVEIF is a special case as it sets 6891 * IPIF_CONDEMNED internally after identifying the right ipif to 6892 * operate on. 6893 */ 6894 switch (ioccmd) { 6895 case SIOCLIFREMOVEIF: 6896 break; 6897 case 0: 6898 mutex_enter(&ill->ill_lock); 6899 ipif = ipif->ipif_ill->ill_ipif; 6900 for (; ipif != NULL; ipif = ipif->ipif_next) 6901 ipif->ipif_state_flags |= IPIF_CHANGING; 6902 mutex_exit(&ill->ill_lock); 6903 break; 6904 default: 6905 mutex_enter(&ill->ill_lock); 6906 ipif->ipif_state_flags |= IPIF_CHANGING; 6907 mutex_exit(&ill->ill_lock); 6908 } 6909 } 6910 6911 /* 6912 * Finish the current exclusive operation on `ipsq'. Usually, this will allow 6913 * the next exclusive operation to begin once we ipsq_exit(). However, if 6914 * pending DLPI operations remain, then we will wait for the queue to drain 6915 * before allowing the next exclusive operation to begin. This ensures that 6916 * DLPI operations from one exclusive operation are never improperly processed 6917 * as part of a subsequent exclusive operation. 6918 */ 6919 void 6920 ipsq_current_finish(ipsq_t *ipsq) 6921 { 6922 ipxop_t *ipx = ipsq->ipsq_xop; 6923 t_uscalar_t dlpi_pending = DL_PRIM_INVAL; 6924 ipif_t *ipif = ipx->ipx_current_ipif; 6925 6926 ASSERT(IAM_WRITER_IPSQ(ipsq)); 6927 6928 /* 6929 * For SIOCLIFREMOVEIF, the ipif has been already been blown away 6930 * (but in that case, IPIF_CHANGING will already be clear and no 6931 * pending DLPI messages can remain). 
6932 */ 6933 if (ipx->ipx_current_ioctl != SIOCLIFREMOVEIF) { 6934 ill_t *ill = ipif->ipif_ill; 6935 6936 mutex_enter(&ill->ill_lock); 6937 dlpi_pending = ill->ill_dlpi_pending; 6938 if (ipx->ipx_current_ioctl == 0) { 6939 ipif = ill->ill_ipif; 6940 for (; ipif != NULL; ipif = ipif->ipif_next) 6941 ipif->ipif_state_flags &= ~IPIF_CHANGING; 6942 } else { 6943 ipif->ipif_state_flags &= ~IPIF_CHANGING; 6944 } 6945 mutex_exit(&ill->ill_lock); 6946 } 6947 6948 ASSERT(!ipx->ipx_current_done); 6949 ipx->ipx_current_done = B_TRUE; 6950 ipx->ipx_current_ioctl = 0; 6951 if (dlpi_pending == DL_PRIM_INVAL) { 6952 mutex_enter(&ipx->ipx_lock); 6953 ipx->ipx_current_ipif = NULL; 6954 mutex_exit(&ipx->ipx_lock); 6955 } 6956 } 6957 6958 /* 6959 * The ill is closing. Flush all messages on the ipsq that originated 6960 * from this ill. Usually there wont' be any messages on the ipsq_xopq_mphead 6961 * for this ill since ipsq_enter could not have entered until then. 6962 * New messages can't be queued since the CONDEMNED flag is set. 6963 */ 6964 static void 6965 ipsq_flush(ill_t *ill) 6966 { 6967 queue_t *q; 6968 mblk_t *prev; 6969 mblk_t *mp; 6970 mblk_t *mp_next; 6971 ipxop_t *ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop; 6972 6973 ASSERT(IAM_WRITER_ILL(ill)); 6974 6975 /* 6976 * Flush any messages sent up by the driver. 6977 */ 6978 mutex_enter(&ipx->ipx_lock); 6979 for (prev = NULL, mp = ipx->ipx_mphead; mp != NULL; mp = mp_next) { 6980 mp_next = mp->b_next; 6981 q = mp->b_queue; 6982 if (q == ill->ill_rq || q == ill->ill_wq) { 6983 /* dequeue mp */ 6984 if (prev == NULL) 6985 ipx->ipx_mphead = mp->b_next; 6986 else 6987 prev->b_next = mp->b_next; 6988 if (ipx->ipx_mptail == mp) { 6989 ASSERT(mp_next == NULL); 6990 ipx->ipx_mptail = prev; 6991 } 6992 inet_freemsg(mp); 6993 } else { 6994 prev = mp; 6995 } 6996 } 6997 mutex_exit(&ipx->ipx_lock); 6998 (void) ipsq_pending_mp_cleanup(ill, NULL); 6999 ipsq_xopq_mp_cleanup(ill, NULL); 7000 } 7001 7002 /* 7003 * Parse an ifreq or lifreq struct coming down ioctls and refhold 7004 * and return the associated ipif. 7005 * Return value: 7006 * Non zero: An error has occurred. ci may not be filled out. 7007 * zero : ci is filled out with the ioctl cmd in ci.ci_name, and 7008 * a held ipif in ci.ci_ipif. 7009 */ 7010 int 7011 ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, 7012 cmd_info_t *ci) 7013 { 7014 char *name; 7015 struct ifreq *ifr; 7016 struct lifreq *lifr; 7017 ipif_t *ipif = NULL; 7018 ill_t *ill; 7019 conn_t *connp; 7020 boolean_t isv6; 7021 boolean_t exists; 7022 mblk_t *mp1; 7023 zoneid_t zoneid; 7024 ip_stack_t *ipst; 7025 7026 if (q->q_next != NULL) { 7027 ill = (ill_t *)q->q_ptr; 7028 isv6 = ill->ill_isv6; 7029 connp = NULL; 7030 zoneid = ALL_ZONES; 7031 ipst = ill->ill_ipst; 7032 } else { 7033 ill = NULL; 7034 connp = Q_TO_CONN(q); 7035 isv6 = (connp->conn_family == AF_INET6); 7036 zoneid = connp->conn_zoneid; 7037 if (zoneid == GLOBAL_ZONEID) { 7038 /* global zone can access ipifs in all zones */ 7039 zoneid = ALL_ZONES; 7040 } 7041 ipst = connp->conn_netstack->netstack_ip; 7042 } 7043 7044 /* Has been checked in ip_wput_nondata */ 7045 mp1 = mp->b_cont->b_cont; 7046 7047 if (ipip->ipi_cmd_type == IF_CMD) { 7048 /* This a old style SIOC[GS]IF* command */ 7049 ifr = (struct ifreq *)mp1->b_rptr; 7050 /* 7051 * Null terminate the string to protect against buffer 7052 * overrun. String was generated by user code and may not 7053 * be trusted. 
7054 */ 7055 ifr->ifr_name[IFNAMSIZ - 1] = '\0'; 7056 name = ifr->ifr_name; 7057 ci->ci_sin = (sin_t *)&ifr->ifr_addr; 7058 ci->ci_sin6 = NULL; 7059 ci->ci_lifr = (struct lifreq *)ifr; 7060 } else { 7061 /* This a new style SIOC[GS]LIF* command */ 7062 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 7063 lifr = (struct lifreq *)mp1->b_rptr; 7064 /* 7065 * Null terminate the string to protect against buffer 7066 * overrun. String was generated by user code and may not 7067 * be trusted. 7068 */ 7069 lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; 7070 name = lifr->lifr_name; 7071 ci->ci_sin = (sin_t *)&lifr->lifr_addr; 7072 ci->ci_sin6 = (sin6_t *)&lifr->lifr_addr; 7073 ci->ci_lifr = lifr; 7074 } 7075 7076 if (ipip->ipi_cmd == SIOCSLIFNAME) { 7077 /* 7078 * The ioctl will be failed if the ioctl comes down 7079 * an conn stream 7080 */ 7081 if (ill == NULL) { 7082 /* 7083 * Not an ill queue, return EINVAL same as the 7084 * old error code. 7085 */ 7086 return (ENXIO); 7087 } 7088 ipif = ill->ill_ipif; 7089 ipif_refhold(ipif); 7090 } else { 7091 ipif = ipif_lookup_on_name(name, mi_strlen(name), B_FALSE, 7092 &exists, isv6, zoneid, ipst); 7093 7094 /* 7095 * Ensure that get ioctls don't see any internal state changes 7096 * caused by set ioctls by deferring them if IPIF_CHANGING is 7097 * set. 7098 */ 7099 if (ipif != NULL && !(ipip->ipi_flags & IPI_WR) && 7100 !IAM_WRITER_IPIF(ipif)) { 7101 ipsq_t *ipsq; 7102 7103 if (connp != NULL) 7104 mutex_enter(&connp->conn_lock); 7105 mutex_enter(&ipif->ipif_ill->ill_lock); 7106 if (IPIF_IS_CHANGING(ipif) && 7107 !IPIF_IS_CONDEMNED(ipif)) { 7108 ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq; 7109 mutex_enter(&ipsq->ipsq_lock); 7110 mutex_enter(&ipsq->ipsq_xop->ipx_lock); 7111 mutex_exit(&ipif->ipif_ill->ill_lock); 7112 ipsq_enq(ipsq, q, mp, ip_process_ioctl, 7113 NEW_OP, ipif->ipif_ill); 7114 mutex_exit(&ipsq->ipsq_xop->ipx_lock); 7115 mutex_exit(&ipsq->ipsq_lock); 7116 if (connp != NULL) 7117 mutex_exit(&connp->conn_lock); 7118 ipif_refrele(ipif); 7119 return (EINPROGRESS); 7120 } 7121 mutex_exit(&ipif->ipif_ill->ill_lock); 7122 if (connp != NULL) 7123 mutex_exit(&connp->conn_lock); 7124 } 7125 } 7126 7127 /* 7128 * Old style [GS]IFCMD does not admit IPv6 ipif 7129 */ 7130 if (ipif != NULL && ipif->ipif_isv6 && ipip->ipi_cmd_type == IF_CMD) { 7131 ipif_refrele(ipif); 7132 return (ENXIO); 7133 } 7134 7135 if (ipif == NULL && ill != NULL && ill->ill_ipif != NULL && 7136 name[0] == '\0') { 7137 /* 7138 * Handle a or a SIOC?IF* with a null name 7139 * during plumb (on the ill queue before the I_PLINK). 7140 */ 7141 ipif = ill->ill_ipif; 7142 ipif_refhold(ipif); 7143 } 7144 7145 if (ipif == NULL) 7146 return (ENXIO); 7147 7148 DTRACE_PROBE4(ipif__ioctl, char *, "ip_extract_lifreq", 7149 int, ipip->ipi_cmd, ill_t *, ipif->ipif_ill, ipif_t *, ipif); 7150 7151 ci->ci_ipif = ipif; 7152 return (0); 7153 } 7154 7155 /* 7156 * Return the total number of ipifs. 
7157 */ 7158 static uint_t 7159 ip_get_numifs(zoneid_t zoneid, ip_stack_t *ipst) 7160 { 7161 uint_t numifs = 0; 7162 ill_t *ill; 7163 ill_walk_context_t ctx; 7164 ipif_t *ipif; 7165 7166 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7167 ill = ILL_START_WALK_V4(&ctx, ipst); 7168 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 7169 if (IS_UNDER_IPMP(ill)) 7170 continue; 7171 for (ipif = ill->ill_ipif; ipif != NULL; 7172 ipif = ipif->ipif_next) { 7173 if (ipif->ipif_zoneid == zoneid || 7174 ipif->ipif_zoneid == ALL_ZONES) 7175 numifs++; 7176 } 7177 } 7178 rw_exit(&ipst->ips_ill_g_lock); 7179 return (numifs); 7180 } 7181 7182 /* 7183 * Return the total number of ipifs. 7184 */ 7185 static uint_t 7186 ip_get_numlifs(int family, int lifn_flags, zoneid_t zoneid, ip_stack_t *ipst) 7187 { 7188 uint_t numifs = 0; 7189 ill_t *ill; 7190 ipif_t *ipif; 7191 ill_walk_context_t ctx; 7192 7193 ip1dbg(("ip_get_numlifs(%d %u %d)\n", family, lifn_flags, (int)zoneid)); 7194 7195 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7196 if (family == AF_INET) 7197 ill = ILL_START_WALK_V4(&ctx, ipst); 7198 else if (family == AF_INET6) 7199 ill = ILL_START_WALK_V6(&ctx, ipst); 7200 else 7201 ill = ILL_START_WALK_ALL(&ctx, ipst); 7202 7203 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 7204 if (IS_UNDER_IPMP(ill) && !(lifn_flags & LIFC_UNDER_IPMP)) 7205 continue; 7206 7207 for (ipif = ill->ill_ipif; ipif != NULL; 7208 ipif = ipif->ipif_next) { 7209 if ((ipif->ipif_flags & IPIF_NOXMIT) && 7210 !(lifn_flags & LIFC_NOXMIT)) 7211 continue; 7212 if ((ipif->ipif_flags & IPIF_TEMPORARY) && 7213 !(lifn_flags & LIFC_TEMPORARY)) 7214 continue; 7215 if (((ipif->ipif_flags & 7216 (IPIF_NOXMIT|IPIF_NOLOCAL| 7217 IPIF_DEPRECATED)) || 7218 IS_LOOPBACK(ill) || 7219 !(ipif->ipif_flags & IPIF_UP)) && 7220 (lifn_flags & LIFC_EXTERNAL_SOURCE)) 7221 continue; 7222 7223 if (zoneid != ipif->ipif_zoneid && 7224 ipif->ipif_zoneid != ALL_ZONES && 7225 (zoneid != GLOBAL_ZONEID || 7226 !(lifn_flags & LIFC_ALLZONES))) 7227 continue; 7228 7229 numifs++; 7230 } 7231 } 7232 rw_exit(&ipst->ips_ill_g_lock); 7233 return (numifs); 7234 } 7235 7236 uint_t 7237 ip_get_lifsrcofnum(ill_t *ill) 7238 { 7239 uint_t numifs = 0; 7240 ill_t *ill_head = ill; 7241 ip_stack_t *ipst = ill->ill_ipst; 7242 7243 /* 7244 * ill_g_usesrc_lock protects ill_usesrc_grp_next, for example, some 7245 * other thread may be trying to relink the ILLs in this usesrc group 7246 * and adjusting the ill_usesrc_grp_next pointers 7247 */ 7248 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER); 7249 if ((ill->ill_usesrc_ifindex == 0) && 7250 (ill->ill_usesrc_grp_next != NULL)) { 7251 for (; (ill != NULL) && (ill->ill_usesrc_grp_next != ill_head); 7252 ill = ill->ill_usesrc_grp_next) 7253 numifs++; 7254 } 7255 rw_exit(&ipst->ips_ill_g_usesrc_lock); 7256 7257 return (numifs); 7258 } 7259 7260 /* Null values are passed in for ipif, sin, and ifreq */ 7261 /* ARGSUSED */ 7262 int 7263 ip_sioctl_get_ifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 7264 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7265 { 7266 int *nump; 7267 conn_t *connp = Q_TO_CONN(q); 7268 7269 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 7270 7271 /* Existence of b_cont->b_cont checked in ip_wput_nondata */ 7272 nump = (int *)mp->b_cont->b_cont->b_rptr; 7273 7274 *nump = ip_get_numifs(connp->conn_zoneid, 7275 connp->conn_netstack->netstack_ip); 7276 ip1dbg(("ip_sioctl_get_ifnum numifs %d", *nump)); 7277 return (0); 7278 } 7279 7280 /* Null values are passed in for ipif, sin, and ifreq */ 7281 /* 
ARGSUSED */ 7283 int 7284 ip_sioctl_get_lifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, 7285 queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7286 { 7287 struct lifnum *lifn; 7288 mblk_t *mp1; 7289 conn_t *connp = Q_TO_CONN(q); 7290 7291 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 7292 7293 /* Existence checked in ip_wput_nondata */ 7294 mp1 = mp->b_cont->b_cont; 7295 7296 lifn = (struct lifnum *)mp1->b_rptr; 7297 switch (lifn->lifn_family) { 7298 case AF_UNSPEC: 7299 case AF_INET: 7300 case AF_INET6: 7301 break; 7302 default: 7303 return (EAFNOSUPPORT); 7304 } 7305 7306 lifn->lifn_count = ip_get_numlifs(lifn->lifn_family, lifn->lifn_flags, 7307 connp->conn_zoneid, connp->conn_netstack->netstack_ip); 7308 ip1dbg(("ip_sioctl_get_lifnum numifs %d", lifn->lifn_count)); 7309 return (0); 7310 } 7311 7312 /* ARGSUSED */ 7313 int 7314 ip_sioctl_get_ifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 7315 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7316 { 7317 STRUCT_HANDLE(ifconf, ifc); 7318 mblk_t *mp1; 7319 struct iocblk *iocp; 7320 struct ifreq *ifr; 7321 ill_walk_context_t ctx; 7322 ill_t *ill; 7323 ipif_t *ipif; 7324 struct sockaddr_in *sin; 7325 int32_t ifclen; 7326 zoneid_t zoneid; 7327 ip_stack_t *ipst = CONNQ_TO_IPST(q); 7328 7329 ASSERT(q->q_next == NULL); /* not valid ioctls for ip as a module */ 7330 7331 ip1dbg(("ip_sioctl_get_ifconf")); 7332 /* Existence verified in ip_wput_nondata */ 7333 mp1 = mp->b_cont->b_cont; 7334 iocp = (struct iocblk *)mp->b_rptr; 7335 zoneid = Q_TO_CONN(q)->conn_zoneid; 7336 7337 /* 7338 * The original SIOCGIFCONF passed in a struct ifconf which specified 7339 * the user buffer address and length into which the list of struct 7340 * ifreqs was to be copied. Since AT&T Streams does not seem to 7341 * allow M_COPYOUT to be used in conjunction with I_STR IOCTLS, 7342 * the SIOCGIFCONF operation was redefined to simply provide 7343 * a large output buffer into which we are supposed to jam the ifreq 7344 * array. The same ioctl command code was used, despite the fact that 7345 * both the applications and the kernel code had to change, thus making 7346 * it impossible to support both interfaces. 7347 * 7348 * For reasons not good enough to try to explain, the following 7349 * algorithm is used for deciding what to do with one of these: 7350 * If the IOCTL comes in as an I_STR, it is assumed to be of the new 7351 * form with the output buffer coming down as the continuation message. 7352 * If it arrives as a TRANSPARENT IOCTL, it is assumed to be old style, 7353 * and we have to copy in the ifconf structure to find out how big the 7354 * output buffer is and where to copy out to. Sure no problem... 7355 * 7356 */ 7357 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, NULL); 7358 if ((mp1->b_wptr - mp1->b_rptr) == STRUCT_SIZE(ifc)) { 7359 int numifs = 0; 7360 size_t ifc_bufsize; 7361 7362 /* 7363 * Must be (better be!) continuation of a TRANSPARENT 7364 * IOCTL. We just copied in the ifconf structure. 7365 */ 7366 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, 7367 (struct ifconf *)mp1->b_rptr); 7368 7369 /* 7370 * Allocate a buffer to hold requested information. 7371 * 7372 * If ifc_len is larger than what is needed, we only 7373 * allocate what we will use. 7374 * 7375 * If ifc_len is smaller than what is needed, return 7376 * EINVAL. 7377 * 7378 * XXX: the ill_t structure can have 2 counters, for 7379 * v4 and v6 (not just ill_ipif_up_count) to store the 7380 * number of interfaces for a device, so we don't need 7381 * to count them here...
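 * (The buffer below is sized as numifs * sizeof (struct ifreq); if
 * the caller's ifc_len is smaller, O_SIOCGIFCONF fails with EINVAL
 * while the newer form silently truncates the result.)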
7381 */ 7382 numifs = ip_get_numifs(zoneid, ipst); 7383 7384 ifclen = STRUCT_FGET(ifc, ifc_len); 7385 ifc_bufsize = numifs * sizeof (struct ifreq); 7386 if (ifc_bufsize > ifclen) { 7387 if (iocp->ioc_cmd == O_SIOCGIFCONF) { 7388 /* old behaviour */ 7389 return (EINVAL); 7390 } else { 7391 ifc_bufsize = ifclen; 7392 } 7393 } 7394 7395 mp1 = mi_copyout_alloc(q, mp, 7396 STRUCT_FGETP(ifc, ifc_buf), ifc_bufsize, B_FALSE); 7397 if (mp1 == NULL) 7398 return (ENOMEM); 7399 7400 mp1->b_wptr = mp1->b_rptr + ifc_bufsize; 7401 } 7402 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr); 7403 /* 7404 * the SIOCGIFCONF ioctl only knows about 7405 * IPv4 addresses, so don't try to tell 7406 * it about interfaces with IPv6-only 7407 * addresses. (Last parm 'isv6' is B_FALSE) 7408 */ 7409 7410 ifr = (struct ifreq *)mp1->b_rptr; 7411 7412 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7413 ill = ILL_START_WALK_V4(&ctx, ipst); 7414 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 7415 if (IS_UNDER_IPMP(ill)) 7416 continue; 7417 for (ipif = ill->ill_ipif; ipif != NULL; 7418 ipif = ipif->ipif_next) { 7419 if (zoneid != ipif->ipif_zoneid && 7420 ipif->ipif_zoneid != ALL_ZONES) 7421 continue; 7422 if ((uchar_t *)&ifr[1] > mp1->b_wptr) { 7423 if (iocp->ioc_cmd == O_SIOCGIFCONF) { 7424 /* old behaviour */ 7425 rw_exit(&ipst->ips_ill_g_lock); 7426 return (EINVAL); 7427 } else { 7428 goto if_copydone; 7429 } 7430 } 7431 ipif_get_name(ipif, ifr->ifr_name, 7432 sizeof (ifr->ifr_name)); 7433 sin = (sin_t *)&ifr->ifr_addr; 7434 *sin = sin_null; 7435 sin->sin_family = AF_INET; 7436 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 7437 ifr++; 7438 } 7439 } 7440 if_copydone: 7441 rw_exit(&ipst->ips_ill_g_lock); 7442 mp1->b_wptr = (uchar_t *)ifr; 7443 7444 if (STRUCT_BUF(ifc) != NULL) { 7445 STRUCT_FSET(ifc, ifc_len, 7446 (int)((uchar_t *)ifr - mp1->b_rptr)); 7447 } 7448 return (0); 7449 } 7450 7451 /* 7452 * Get the interfaces using the address hosted on the interface passed in, 7453 * as a source adddress 7454 */ 7455 /* ARGSUSED */ 7456 int 7457 ip_sioctl_get_lifsrcof(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 7458 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7459 { 7460 mblk_t *mp1; 7461 ill_t *ill, *ill_head; 7462 ipif_t *ipif, *orig_ipif; 7463 int numlifs = 0; 7464 size_t lifs_bufsize, lifsmaxlen; 7465 struct lifreq *lifr; 7466 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 7467 uint_t ifindex; 7468 zoneid_t zoneid; 7469 boolean_t isv6 = B_FALSE; 7470 struct sockaddr_in *sin; 7471 struct sockaddr_in6 *sin6; 7472 STRUCT_HANDLE(lifsrcof, lifs); 7473 ip_stack_t *ipst; 7474 7475 ipst = CONNQ_TO_IPST(q); 7476 7477 ASSERT(q->q_next == NULL); 7478 7479 zoneid = Q_TO_CONN(q)->conn_zoneid; 7480 7481 /* Existence verified in ip_wput_nondata */ 7482 mp1 = mp->b_cont->b_cont; 7483 7484 /* 7485 * Must be (better be!) continuation of a TRANSPARENT 7486 * IOCTL. We just copied in the lifsrcof structure. 
7487 */ 7488 STRUCT_SET_HANDLE(lifs, iocp->ioc_flag, 7489 (struct lifsrcof *)mp1->b_rptr); 7490 7491 if (MBLKL(mp1) != STRUCT_SIZE(lifs)) 7492 return (EINVAL); 7493 7494 ifindex = STRUCT_FGET(lifs, lifs_ifindex); 7495 isv6 = (Q_TO_CONN(q))->conn_family == AF_INET6; 7496 ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid, ipst); 7497 if (ipif == NULL) { 7498 ip1dbg(("ip_sioctl_get_lifsrcof: no ipif for ifindex %d\n", 7499 ifindex)); 7500 return (ENXIO); 7501 } 7502 7503 /* Allocate a buffer to hold requested information */ 7504 numlifs = ip_get_lifsrcofnum(ipif->ipif_ill); 7505 lifs_bufsize = numlifs * sizeof (struct lifreq); 7506 lifsmaxlen = STRUCT_FGET(lifs, lifs_maxlen); 7507 /* The actual size needed is always returned in lifs_len */ 7508 STRUCT_FSET(lifs, lifs_len, lifs_bufsize); 7509 7510 /* If the amount we need is more than what is passed in, abort */ 7511 if (lifs_bufsize > lifsmaxlen || lifs_bufsize == 0) { 7512 ipif_refrele(ipif); 7513 return (0); 7514 } 7515 7516 mp1 = mi_copyout_alloc(q, mp, 7517 STRUCT_FGETP(lifs, lifs_buf), lifs_bufsize, B_FALSE); 7518 if (mp1 == NULL) { 7519 ipif_refrele(ipif); 7520 return (ENOMEM); 7521 } 7522 7523 mp1->b_wptr = mp1->b_rptr + lifs_bufsize; 7524 bzero(mp1->b_rptr, lifs_bufsize); 7525 7526 lifr = (struct lifreq *)mp1->b_rptr; 7527 7528 ill = ill_head = ipif->ipif_ill; 7529 orig_ipif = ipif; 7530 7531 /* ill_g_usesrc_lock protects ill_usesrc_grp_next */ 7532 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER); 7533 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7534 7535 ill = ill->ill_usesrc_grp_next; /* start from next ill */ 7536 for (; (ill != NULL) && (ill != ill_head); 7537 ill = ill->ill_usesrc_grp_next) { 7538 7539 if ((uchar_t *)&lifr[1] > mp1->b_wptr) 7540 break; 7541 7542 ipif = ill->ill_ipif; 7543 ipif_get_name(ipif, lifr->lifr_name, sizeof (lifr->lifr_name)); 7544 if (ipif->ipif_isv6) { 7545 sin6 = (sin6_t *)&lifr->lifr_addr; 7546 *sin6 = sin6_null; 7547 sin6->sin6_family = AF_INET6; 7548 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 7549 lifr->lifr_addrlen = ip_mask_to_plen_v6( 7550 &ipif->ipif_v6net_mask); 7551 } else { 7552 sin = (sin_t *)&lifr->lifr_addr; 7553 *sin = sin_null; 7554 sin->sin_family = AF_INET; 7555 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 7556 lifr->lifr_addrlen = ip_mask_to_plen( 7557 ipif->ipif_net_mask); 7558 } 7559 lifr++; 7560 } 7561 rw_exit(&ipst->ips_ill_g_usesrc_lock); 7562 rw_exit(&ipst->ips_ill_g_lock); 7563 ipif_refrele(orig_ipif); 7564 mp1->b_wptr = (uchar_t *)lifr; 7565 STRUCT_FSET(lifs, lifs_len, (int)((uchar_t *)lifr - mp1->b_rptr)); 7566 7567 return (0); 7568 } 7569 7570 /* ARGSUSED */ 7571 int 7572 ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 7573 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7574 { 7575 mblk_t *mp1; 7576 int list; 7577 ill_t *ill; 7578 ipif_t *ipif; 7579 int flags; 7580 int numlifs = 0; 7581 size_t lifc_bufsize; 7582 struct lifreq *lifr; 7583 sa_family_t family; 7584 struct sockaddr_in *sin; 7585 struct sockaddr_in6 *sin6; 7586 ill_walk_context_t ctx; 7587 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 7588 int32_t lifclen; 7589 zoneid_t zoneid; 7590 STRUCT_HANDLE(lifconf, lifc); 7591 ip_stack_t *ipst = CONNQ_TO_IPST(q); 7592 7593 ip1dbg(("ip_sioctl_get_lifconf")); 7594 7595 ASSERT(q->q_next == NULL); 7596 7597 zoneid = Q_TO_CONN(q)->conn_zoneid; 7598 7599 /* Existence verified in ip_wput_nondata */ 7600 mp1 = mp->b_cont->b_cont; 7601 7602 /* 7603 * An extended version of SIOCGIFCONF that takes an 7604 * additional address family and flags field. 
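 *
 * From userland it is driven just like SIOCGIFCONF, with the extra
 * fields filled in. An illustrative sketch only (assumes an open
 * AF_INET socket `s'; a real caller would size the buffer via
 * SIOCGLIFNUM first):
 *
 *	struct lifconf lifc;
 *	char buf[8192];
 *
 *	lifc.lifc_family = AF_UNSPEC;
 *	lifc.lifc_flags = 0;
 *	lifc.lifc_len = sizeof (buf);
 *	lifc.lifc_buf = buf;
 *	if (ioctl(s, SIOCGLIFCONF, &lifc) >= 0) {
 *		int n = lifc.lifc_len / sizeof (struct lifreq);
 *		- walk the returned struct lifreq array
 *	}
 *
 * The family and flag handling is as follows: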
AF_UNSPEC retrieves both IPv4 and IPv6 interfaces. 7606 * Unless LIFC_NOXMIT is specified, IPIF_NOXMIT 7607 * interfaces are omitted. 7608 * Similarly, IPIF_TEMPORARY interfaces are omitted 7609 * unless LIFC_TEMPORARY is specified. 7610 * If LIFC_EXTERNAL_SOURCE is specified, interfaces that are 7611 * IPIF_NOXMIT, IPIF_NOLOCAL, PHYI_LOOPBACK, IPIF_DEPRECATED or 7612 * not IPIF_UP are omitted. LIFC_EXTERNAL_SOURCE 7613 * has priority over LIFC_NOXMIT. 7614 */ 7615 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, NULL); 7616 7617 if ((mp1->b_wptr - mp1->b_rptr) != STRUCT_SIZE(lifc)) 7618 return (EINVAL); 7619 7620 /* 7621 * Must be (better be!) continuation of a TRANSPARENT 7622 * IOCTL. We just copied in the lifconf structure. 7623 */ 7624 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, (struct lifconf *)mp1->b_rptr); 7625 7626 family = STRUCT_FGET(lifc, lifc_family); 7627 flags = STRUCT_FGET(lifc, lifc_flags); 7628 7629 switch (family) { 7630 case AF_UNSPEC: 7631 /* 7632 * walk all ILL's. 7633 */ 7634 list = MAX_G_HEADS; 7635 break; 7636 case AF_INET: 7637 /* 7638 * walk only IPV4 ILL's. 7639 */ 7640 list = IP_V4_G_HEAD; 7641 break; 7642 case AF_INET6: 7643 /* 7644 * walk only IPV6 ILL's. 7645 */ 7646 list = IP_V6_G_HEAD; 7647 break; 7648 default: 7649 return (EAFNOSUPPORT); 7650 } 7651 7652 /* 7653 * Allocate a buffer to hold requested information. 7654 * 7655 * If lifc_len is larger than what is needed, we only 7656 * allocate what we will use. 7657 * 7658 * If lifc_len is smaller than what is needed, return 7659 * EINVAL. 7660 */ 7661 numlifs = ip_get_numlifs(family, flags, zoneid, ipst); 7662 lifc_bufsize = numlifs * sizeof (struct lifreq); 7663 lifclen = STRUCT_FGET(lifc, lifc_len); 7664 if (lifc_bufsize > lifclen) { 7665 if (iocp->ioc_cmd == O_SIOCGLIFCONF) 7666 return (EINVAL); 7667 else 7668 lifc_bufsize = lifclen; 7669 } 7670 7671 mp1 = mi_copyout_alloc(q, mp, 7672 STRUCT_FGETP(lifc, lifc_buf), lifc_bufsize, B_FALSE); 7673 if (mp1 == NULL) 7674 return (ENOMEM); 7675 7676 mp1->b_wptr = mp1->b_rptr + lifc_bufsize; 7677 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr); 7678 7679 lifr = (struct lifreq *)mp1->b_rptr; 7680 7681 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7682 ill = ill_first(list, list, &ctx, ipst); 7683 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 7684 if (IS_UNDER_IPMP(ill) && !(flags & LIFC_UNDER_IPMP)) 7685 continue; 7686 7687 for (ipif = ill->ill_ipif; ipif != NULL; 7688 ipif = ipif->ipif_next) { 7689 if ((ipif->ipif_flags & IPIF_NOXMIT) && 7690 !(flags & LIFC_NOXMIT)) 7691 continue; 7692 7693 if ((ipif->ipif_flags & IPIF_TEMPORARY) && 7694 !(flags & LIFC_TEMPORARY)) 7695 continue; 7696 7697 if (((ipif->ipif_flags & 7698 (IPIF_NOXMIT|IPIF_NOLOCAL| 7699 IPIF_DEPRECATED)) || 7700 IS_LOOPBACK(ill) || 7701 !(ipif->ipif_flags & IPIF_UP)) && 7702 (flags & LIFC_EXTERNAL_SOURCE)) 7703 continue; 7704 7705 if (zoneid != ipif->ipif_zoneid && 7706 ipif->ipif_zoneid != ALL_ZONES && 7707 (zoneid != GLOBAL_ZONEID || 7708 !(flags & LIFC_ALLZONES))) 7709 continue; 7710 7711 if ((uchar_t *)&lifr[1] > mp1->b_wptr) { 7712 if (iocp->ioc_cmd == O_SIOCGLIFCONF) { 7713 rw_exit(&ipst->ips_ill_g_lock); 7714 return (EINVAL); 7715 } else { 7716 goto lif_copydone; 7717 } 7718 } 7719 7720 ipif_get_name(ipif, lifr->lifr_name, 7721 sizeof (lifr->lifr_name)); 7722 lifr->lifr_type = ill->ill_type; 7723 if (ipif->ipif_isv6) { 7724 sin6 = (sin6_t *)&lifr->lifr_addr; 7725 *sin6 = sin6_null; 7726 sin6->sin6_family = AF_INET6; 7727 sin6->sin6_addr = 7728 ipif->ipif_v6lcl_addr; 7729 lifr->lifr_addrlen = 7730 ip_mask_to_plen_v6(
&ipif->ipif_v6net_mask); 7732 } else { 7733 sin = (sin_t *)&lifr->lifr_addr; 7734 *sin = sin_null; 7735 sin->sin_family = AF_INET; 7736 sin->sin_addr.s_addr = 7737 ipif->ipif_lcl_addr; 7738 lifr->lifr_addrlen = 7739 ip_mask_to_plen( 7740 ipif->ipif_net_mask); 7741 } 7742 lifr++; 7743 } 7744 } 7745 lif_copydone: 7746 rw_exit(&ipst->ips_ill_g_lock); 7747 7748 mp1->b_wptr = (uchar_t *)lifr; 7749 if (STRUCT_BUF(lifc) != NULL) { 7750 STRUCT_FSET(lifc, lifc_len, 7751 (int)((uchar_t *)lifr - mp1->b_rptr)); 7752 } 7753 return (0); 7754 } 7755 7756 static void 7757 ip_sioctl_ip6addrpolicy(queue_t *q, mblk_t *mp) 7758 { 7759 ip6_asp_t *table; 7760 size_t table_size; 7761 mblk_t *data_mp; 7762 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 7763 ip_stack_t *ipst; 7764 7765 if (q->q_next == NULL) 7766 ipst = CONNQ_TO_IPST(q); 7767 else 7768 ipst = ILLQ_TO_IPST(q); 7769 7770 /* These two ioctls are I_STR only */ 7771 if (iocp->ioc_count == TRANSPARENT) { 7772 miocnak(q, mp, 0, EINVAL); 7773 return; 7774 } 7775 7776 data_mp = mp->b_cont; 7777 if (data_mp == NULL) { 7778 /* The user passed us a NULL argument */ 7779 table = NULL; 7780 table_size = iocp->ioc_count; 7781 } else { 7782 /* 7783 * The user provided a table. The stream head 7784 * may have copied in the user data in chunks, 7785 * so make sure everything is pulled up 7786 * properly. 7787 */ 7788 if (MBLKL(data_mp) < iocp->ioc_count) { 7789 mblk_t *new_data_mp; 7790 if ((new_data_mp = msgpullup(data_mp, -1)) == 7791 NULL) { 7792 miocnak(q, mp, 0, ENOMEM); 7793 return; 7794 } 7795 freemsg(data_mp); 7796 data_mp = new_data_mp; 7797 mp->b_cont = data_mp; 7798 } 7799 table = (ip6_asp_t *)data_mp->b_rptr; 7800 table_size = iocp->ioc_count; 7801 } 7802 7803 switch (iocp->ioc_cmd) { 7804 case SIOCGIP6ADDRPOLICY: 7805 iocp->ioc_rval = ip6_asp_get(table, table_size, ipst); 7806 if (iocp->ioc_rval == -1) 7807 iocp->ioc_error = EINVAL; 7808 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4 7809 else if (table != NULL && 7810 (iocp->ioc_flag & IOC_MODELS) == IOC_ILP32) { 7811 ip6_asp_t *src = table; 7812 ip6_asp32_t *dst = (void *)table; 7813 int count = table_size / sizeof (ip6_asp_t); 7814 int i; 7815 7816 /* 7817 * We need to do an in-place shrink of the array 7818 * to match the alignment attributes of the 7819 * 32-bit ABI that will be looking at it. 7820 */ 7821 /* LINTED: logical expression always true: op "||" */ 7822 ASSERT(sizeof (*src) > sizeof (*dst)); 7823 for (i = 1; i < count; i++) 7824 bcopy(src + i, dst + i, sizeof (*dst)); 7825 } 7826 #endif 7827 break; 7828 7829 case SIOCSIP6ADDRPOLICY: 7830 ASSERT(mp->b_prev == NULL); 7831 mp->b_prev = (void *)q; 7832 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4 7833 /* 7834 * We pass in the datamodel here so that the ip6_asp_replace() 7835 * routine can handle converting from 32-bit to native formats 7836 * where necessary. 7837 * 7838 * A better way to handle this might be to convert the inbound 7839 * data structure here, and hang it off a new 'mp'; thus the 7840 * ip6_asp_replace() logic would always be dealing with native 7841 * format data structures. 7842 * 7843 * (An even simpler way to handle these ioctls is to just 7844 * add a 32-bit trailing 'pad' field to the ip6_asp_t structure 7845 * and just recompile everything that depends on it.) 7846 */ 7847 #endif 7848 ip6_asp_replace(mp, table, table_size, B_FALSE, ipst, 7849 iocp->ioc_flag & IOC_MODELS); 7850 return; 7851 } 7852 7853 DB_TYPE(mp) = (iocp->ioc_error == 0) ?
M_IOCACK : M_IOCNAK; 7854 qreply(q, mp); 7855 } 7856 7857 static void 7858 ip_sioctl_dstinfo(queue_t *q, mblk_t *mp) 7859 { 7860 mblk_t *data_mp; 7861 struct dstinforeq *dir; 7862 uint8_t *end, *cur; 7863 in6_addr_t *daddr, *saddr; 7864 ipaddr_t v4daddr; 7865 ire_t *ire; 7866 ipaddr_t v4setsrc; 7867 in6_addr_t v6setsrc; 7868 char *slabel, *dlabel; 7869 boolean_t isipv4; 7870 int match_ire; 7871 ill_t *dst_ill; 7872 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 7873 conn_t *connp = Q_TO_CONN(q); 7874 zoneid_t zoneid = IPCL_ZONEID(connp); 7875 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 7876 uint64_t ipif_flags; 7877 7878 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 7879 7880 /* 7881 * This ioctl is I_STR only, and must have a 7882 * data mblk following the M_IOCTL mblk. 7883 */ 7884 data_mp = mp->b_cont; 7885 if (iocp->ioc_count == TRANSPARENT || data_mp == NULL) { 7886 miocnak(q, mp, 0, EINVAL); 7887 return; 7888 } 7889 7890 if (MBLKL(data_mp) < iocp->ioc_count) { 7891 mblk_t *new_data_mp; 7892 7893 if ((new_data_mp = msgpullup(data_mp, -1)) == NULL) { 7894 miocnak(q, mp, 0, ENOMEM); 7895 return; 7896 } 7897 freemsg(data_mp); 7898 data_mp = new_data_mp; 7899 mp->b_cont = data_mp; 7900 } 7901 match_ire = MATCH_IRE_DSTONLY; 7902 7903 for (cur = data_mp->b_rptr, end = data_mp->b_wptr; 7904 end - cur >= sizeof (struct dstinforeq); 7905 cur += sizeof (struct dstinforeq)) { 7906 dir = (struct dstinforeq *)cur; 7907 daddr = &dir->dir_daddr; 7908 saddr = &dir->dir_saddr; 7909 7910 /* 7911 * ip_addr_scope_v6() and ip6_asp_lookup() handle 7912 * v4 mapped addresses; ire_ftable_lookup_v6() 7913 * and ip_select_source_v6() do not. 7914 */ 7915 dir->dir_dscope = ip_addr_scope_v6(daddr); 7916 dlabel = ip6_asp_lookup(daddr, &dir->dir_precedence, ipst); 7917 7918 isipv4 = IN6_IS_ADDR_V4MAPPED(daddr); 7919 if (isipv4) { 7920 IN6_V4MAPPED_TO_IPADDR(daddr, v4daddr); 7921 v4setsrc = INADDR_ANY; 7922 ire = ire_route_recursive_v4(v4daddr, 0, NULL, zoneid, 7923 NULL, match_ire, B_TRUE, 0, ipst, &v4setsrc, NULL, 7924 NULL); 7925 } else { 7926 v6setsrc = ipv6_all_zeros; 7927 ire = ire_route_recursive_v6(daddr, 0, NULL, zoneid, 7928 NULL, match_ire, B_TRUE, 0, ipst, &v6setsrc, NULL, 7929 NULL); 7930 } 7931 ASSERT(ire != NULL); 7932 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 7933 ire_refrele(ire); 7934 dir->dir_dreachable = 0; 7935 7936 /* move on to next dst addr */ 7937 continue; 7938 } 7939 dir->dir_dreachable = 1; 7940 7941 dst_ill = ire_nexthop_ill(ire); 7942 if (dst_ill == NULL) { 7943 ire_refrele(ire); 7944 continue; 7945 } 7946 7947 /* With ipmp we most likely look at the ipmp ill here */ 7948 dir->dir_dmactype = dst_ill->ill_mactype; 7949 7950 if (isipv4) { 7951 ipaddr_t v4saddr; 7952 7953 if (ip_select_source_v4(dst_ill, v4setsrc, v4daddr, 7954 connp->conn_ixa->ixa_multicast_ifaddr, zoneid, ipst, 7955 &v4saddr, NULL, &ipif_flags) != 0) { 7956 v4saddr = INADDR_ANY; 7957 ipif_flags = 0; 7958 } 7959 IN6_IPADDR_TO_V4MAPPED(v4saddr, saddr); 7960 } else { 7961 if (ip_select_source_v6(dst_ill, &v6setsrc, daddr, 7962 zoneid, ipst, B_FALSE, IPV6_PREFER_SRC_DEFAULT, 7963 saddr, NULL, &ipif_flags) != 0) { 7964 *saddr = ipv6_all_zeros; 7965 ipif_flags = 0; 7966 } 7967 } 7968 7969 dir->dir_sscope = ip_addr_scope_v6(saddr); 7970 slabel = ip6_asp_lookup(saddr, NULL, ipst); 7971 dir->dir_labelmatch = ip6_asp_labelcmp(dlabel, slabel); 7972 dir->dir_sdeprecated = (ipif_flags & IPIF_DEPRECATED) ? 
1 : 0; 7973 ire_refrele(ire); 7974 ill_refrele(dst_ill); 7975 } 7976 miocack(q, mp, iocp->ioc_count, 0); 7977 } 7978 7979 /* 7980 * Check if this is an address assigned to this machine. 7981 * Skips interfaces that are down by using ire checks. 7982 * Translates mapped addresses to v4 addresses and then 7983 * treats them as such, returning true if the v4 address 7984 * associated with this mapped address is configured. 7985 * Note: Applications will have to be careful what they do 7986 * with the response; use of mapped addresses limits 7987 * what can be done with the socket, especially with 7988 * respect to socket options and ioctls - neither IPv4 7989 * options nor IPv6 sticky options/ancillary data options 7990 * may be used. 7991 */ 7992 /* ARGSUSED */ 7993 int 7994 ip_sioctl_tmyaddr(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 7995 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 7996 { 7997 struct sioc_addrreq *sia; 7998 sin_t *sin; 7999 ire_t *ire; 8000 mblk_t *mp1; 8001 zoneid_t zoneid; 8002 ip_stack_t *ipst; 8003 8004 ip1dbg(("ip_sioctl_tmyaddr")); 8005 8006 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 8007 zoneid = Q_TO_CONN(q)->conn_zoneid; 8008 ipst = CONNQ_TO_IPST(q); 8009 8010 /* Existence verified in ip_wput_nondata */ 8011 mp1 = mp->b_cont->b_cont; 8012 sia = (struct sioc_addrreq *)mp1->b_rptr; 8013 sin = (sin_t *)&sia->sa_addr; 8014 switch (sin->sin_family) { 8015 case AF_INET6: { 8016 sin6_t *sin6 = (sin6_t *)sin; 8017 8018 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 8019 ipaddr_t v4_addr; 8020 8021 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, 8022 v4_addr); 8023 ire = ire_ftable_lookup_v4(v4_addr, 0, 0, 8024 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, NULL, 8025 MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL); 8026 } else { 8027 in6_addr_t v6addr; 8028 8029 v6addr = sin6->sin6_addr; 8030 ire = ire_ftable_lookup_v6(&v6addr, 0, 0, 8031 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, NULL, 8032 MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL); 8033 } 8034 break; 8035 } 8036 case AF_INET: { 8037 ipaddr_t v4addr; 8038 8039 v4addr = sin->sin_addr.s_addr; 8040 ire = ire_ftable_lookup_v4(v4addr, 0, 0, 8041 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 8042 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL); 8043 break; 8044 } 8045 default: 8046 return (EAFNOSUPPORT); 8047 } 8048 if (ire != NULL) { 8049 sia->sa_res = 1; 8050 ire_refrele(ire); 8051 } else { 8052 sia->sa_res = 0; 8053 } 8054 return (0); 8055 } 8056 8057 /* 8058 * Check if this is an address assigned on-link, i.e., to a neighbor, 8059 * and make sure it's reachable from the current zone. 8060 * Returns true for my addresses as well. 8061 * Translates mapped addresses to v4 addresses and then 8062 * treats them as such, returning true if the v4 address 8063 * associated with this mapped address is configured. 8064 * Note: Applications will have to be careful what they do 8065 * with the response; use of mapped addresses limits 8066 * what can be done with the socket, especially with 8067 * respect to socket options and ioctls - neither IPv4 8068 * options nor IPv6 sticky options/ancillary data options 8069 * may be used.
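 *
 * A hedged userland sketch of this probe (illustrative only; `s' is
 * an AF_INET socket and `sin' a filled-in struct sockaddr_in):
 *
 *	struct sioc_addrreq sia;
 *
 *	bzero(&sia, sizeof (sia));
 *	bcopy(&sin, &sia.sa_addr, sizeof (sin));
 *	if (ioctl(s, SIOCTONLINK, &sia) >= 0 && sia.sa_res == 1) {
 *		- the destination is on-link (or local)
 *	}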
8070 */ 8071 /* ARGSUSED */ 8072 int 8073 ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 8074 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 8075 { 8076 struct sioc_addrreq *sia; 8077 sin_t *sin; 8078 mblk_t *mp1; 8079 ire_t *ire = NULL; 8080 zoneid_t zoneid; 8081 ip_stack_t *ipst; 8082 8083 ip1dbg(("ip_sioctl_tonlink")); 8084 8085 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 8086 zoneid = Q_TO_CONN(q)->conn_zoneid; 8087 ipst = CONNQ_TO_IPST(q); 8088 8089 /* Existence verified in ip_wput_nondata */ 8090 mp1 = mp->b_cont->b_cont; 8091 sia = (struct sioc_addrreq *)mp1->b_rptr; 8092 sin = (sin_t *)&sia->sa_addr; 8093 8094 /* 8095 * We check for IRE_ONLINK and exclude IRE_BROADCAST|IRE_MULTICAST 8096 * to make sure we only look at on-link unicast addresses. 8097 */ 8098 switch (sin->sin_family) { 8099 case AF_INET6: { 8100 sin6_t *sin6 = (sin6_t *)sin; 8101 8102 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 8103 ipaddr_t v4_addr; 8104 8105 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, 8106 v4_addr); 8107 if (!CLASSD(v4_addr)) { 8108 ire = ire_ftable_lookup_v4(v4_addr, 0, 0, 0, 8109 NULL, zoneid, NULL, MATCH_IRE_DSTONLY, 8110 0, ipst, NULL); 8111 } 8112 } else { 8113 in6_addr_t v6addr; 8114 8115 v6addr = sin6->sin6_addr; 8116 if (!IN6_IS_ADDR_MULTICAST(&v6addr)) { 8117 ire = ire_ftable_lookup_v6(&v6addr, 0, 0, 0, 8118 NULL, zoneid, NULL, MATCH_IRE_DSTONLY, 0, 8119 ipst, NULL); 8120 } 8121 } 8122 break; 8123 } 8124 case AF_INET: { 8125 ipaddr_t v4addr; 8126 8127 v4addr = sin->sin_addr.s_addr; 8128 if (!CLASSD(v4addr)) { 8129 ire = ire_ftable_lookup_v4(v4addr, 0, 0, 0, NULL, 8130 zoneid, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL); 8131 } 8132 break; 8133 } 8134 default: 8135 return (EAFNOSUPPORT); 8136 } 8137 sia->sa_res = 0; 8138 if (ire != NULL) { 8139 ASSERT(!(ire->ire_type & IRE_MULTICAST)); 8140 8141 if ((ire->ire_type & IRE_ONLINK) && 8142 !(ire->ire_type & IRE_BROADCAST)) 8143 sia->sa_res = 1; 8144 ire_refrele(ire); 8145 } 8146 return (0); 8147 } 8148 8149 /* 8150 * TBD: implement when kernel maintains a list of site prefixes. 8151 */ 8152 /* ARGSUSED */ 8153 int 8154 ip_sioctl_tmysite(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 8155 ip_ioctl_cmd_t *ipip, void *ifreq) 8156 { 8157 return (ENXIO); 8158 } 8159 8160 /* ARP IOCTLs. */ 8161 /* ARGSUSED */ 8162 int 8163 ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 8164 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 8165 { 8166 int err; 8167 ipaddr_t ipaddr; 8168 struct iocblk *iocp; 8169 conn_t *connp; 8170 struct arpreq *ar; 8171 struct xarpreq *xar; 8172 int arp_flags, flags, alength; 8173 uchar_t *lladdr; 8174 ip_stack_t *ipst; 8175 ill_t *ill = ipif->ipif_ill; 8176 ill_t *proxy_ill = NULL; 8177 ipmp_arpent_t *entp = NULL; 8178 boolean_t proxyarp = B_FALSE; 8179 boolean_t if_arp_ioctl = B_FALSE; 8180 ncec_t *ncec = NULL; 8181 nce_t *nce; 8182 8183 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 8184 connp = Q_TO_CONN(q); 8185 ipst = connp->conn_netstack->netstack_ip; 8186 iocp = (struct iocblk *)mp->b_rptr; 8187 8188 if (ipip->ipi_cmd_type == XARP_CMD) { 8189 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->XARPREQ_MBLK */ 8190 xar = (struct xarpreq *)mp->b_cont->b_cont->b_rptr; 8191 ar = NULL; 8192 8193 arp_flags = xar->xarp_flags; 8194 lladdr = (uchar_t *)LLADDR(&xar->xarp_ha); 8195 if_arp_ioctl = (xar->xarp_ha.sdl_nlen != 0); 8196 /* 8197 * Validate the user's link-layer address length input 8198 * and the name and address length limits.
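 *
 * A well-formed SIOCSXARP request from userland looks roughly like
 * this (illustrative sketch only; `s' is an AF_INET socket, `addr'
 * an IPv4 address and `ether' a 6-byte hardware address):
 *
 *	struct xarpreq xar;
 *
 *	bzero(&xar, sizeof (xar));
 *	((struct sockaddr_in *)&xar.xarp_pa)->sin_family = AF_INET;
 *	((struct sockaddr_in *)&xar.xarp_pa)->sin_addr.s_addr = addr;
 *	xar.xarp_ha.sdl_family = AF_LINK;
 *	xar.xarp_ha.sdl_alen = 6;
 *	bcopy(ether, LLADDR(&xar.xarp_ha), 6);
 *	xar.xarp_flags = ATF_PERM;
 *	(void) ioctl(s, SIOCSXARP, &xar);
 *
 * (An sdl_nlen of 0 means the ipif is located by IP address; a
 * non-zero sdl_nlen names the interface directly.)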
8199 */ 8200 alength = ill->ill_phys_addr_length; 8201 if (ipip->ipi_cmd == SIOCSXARP) { 8202 if (alength != xar->xarp_ha.sdl_alen || 8203 (alength + xar->xarp_ha.sdl_nlen > 8204 sizeof (xar->xarp_ha.sdl_data))) 8205 return (EINVAL); 8206 } 8207 } else { 8208 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->ARPREQ_MBLK */ 8209 ar = (struct arpreq *)mp->b_cont->b_cont->b_rptr; 8210 xar = NULL; 8211 8212 arp_flags = ar->arp_flags; 8213 lladdr = (uchar_t *)ar->arp_ha.sa_data; 8214 /* 8215 * Theoretically, the sa_family could tell us what link 8216 * layer type this operation is trying to deal with. By 8217 * common usage AF_UNSPEC means ethernet. We'll assume 8218 * any attempt to use the SIOC?ARP ioctls is for ethernet, 8219 * for now. Our new SIOC*XARP ioctls can be used more 8220 * generally. 8221 * 8222 * If the underlying medium happens to have a non-6-byte 8223 * address, the arp module will fail set/get, but the del 8224 * operation will succeed. 8225 */ 8226 alength = 6; 8227 if ((ipip->ipi_cmd != SIOCDARP) && 8228 (alength != ill->ill_phys_addr_length)) { 8229 return (EINVAL); 8230 } 8231 } 8232 8233 /* Translate ATF* flags to NCE* flags */ 8234 flags = 0; 8235 if (arp_flags & ATF_AUTHORITY) 8236 flags |= NCE_F_AUTHORITY; 8237 if (arp_flags & ATF_PERM) 8238 flags |= NCE_F_NONUD; /* not subject to aging */ 8239 if (arp_flags & ATF_PUBL) 8240 flags |= NCE_F_PUBLISH; 8241 8242 /* 8243 * IPMP ARP special handling: 8244 * 8245 * 1. Since ARP mappings must appear consistent across the group, 8246 * prohibit changing ARP mappings on the underlying interfaces. 8247 * 8248 * 2. Since ARP mappings for IPMP data addresses are maintained by 8249 * IP itself, prohibit changing them. 8250 * 8251 * 3. For proxy ARP, use a functioning hardware address in the group, 8252 * provided one exists. If one doesn't, just add the entry as-is; 8253 * ipmp_illgrp_refresh_arpent() will refresh it if things change. 8254 */ 8255 if (IS_UNDER_IPMP(ill)) { 8256 if (ipip->ipi_cmd != SIOCGARP && ipip->ipi_cmd != SIOCGXARP) 8257 return (EPERM); 8258 } 8259 if (IS_IPMP(ill)) { 8260 ipmp_illgrp_t *illg = ill->ill_grp; 8261 8262 switch (ipip->ipi_cmd) { 8263 case SIOCSARP: 8264 case SIOCSXARP: 8265 proxy_ill = ipmp_illgrp_find_ill(illg, lladdr, alength); 8266 if (proxy_ill != NULL) { 8267 proxyarp = B_TRUE; 8268 if (!ipmp_ill_is_active(proxy_ill)) 8269 proxy_ill = ipmp_illgrp_next_ill(illg); 8270 if (proxy_ill != NULL) 8271 lladdr = proxy_ill->ill_phys_addr; 8272 } 8273 /* FALLTHRU */ 8274 } 8275 } 8276 8277 ipaddr = sin->sin_addr.s_addr; 8278 /* 8279 * don't match across illgrp per case (1) and (2). 8280 * XXX use IS_IPMP(ill) like ndp_sioc_update? 8281 */ 8282 nce = nce_lookup_v4(ill, &ipaddr); 8283 if (nce != NULL) 8284 ncec = nce->nce_common; 8285 8286 switch (iocp->ioc_cmd) { 8287 case SIOCDARP: 8288 case SIOCDXARP: { 8289 /* 8290 * Delete the NCE if any. 8291 */ 8292 if (ncec == NULL) { 8293 iocp->ioc_error = ENXIO; 8294 break; 8295 } 8296 /* Don't allow changes to arp mappings of local addresses. */ 8297 if (NCE_MYADDR(ncec)) { 8298 nce_refrele(nce); 8299 return (ENOTSUP); 8300 } 8301 iocp->ioc_error = 0; 8302 8303 /* 8304 * Delete the nce_common which has ncec_ill set to ipmp_ill. 8305 * This will delete all the nce entries on the under_ills. 8306 */ 8307 ncec_delete(ncec); 8308 /* 8309 * Once the NCE has been deleted, then the ire_dep* consistency 8310 * mechanism will find any IRE which depended on the now 8311 * condemned NCE (as part of sending packets).
8312 * That mechanism handles redirects by deleting redirects 8313 * that refer to UNREACHABLE nces. 8314 */ 8315 break; 8316 } 8317 case SIOCGARP: 8318 case SIOCGXARP: 8319 if (ncec != NULL) { 8320 lladdr = ncec->ncec_lladdr; 8321 flags = ncec->ncec_flags; 8322 iocp->ioc_error = 0; 8323 ip_sioctl_garp_reply(mp, ncec->ncec_ill, lladdr, flags); 8324 } else { 8325 iocp->ioc_error = ENXIO; 8326 } 8327 break; 8328 case SIOCSARP: 8329 case SIOCSXARP: 8330 /* Don't allow changes to arp mappings of local addresses. */ 8331 if (ncec != NULL && NCE_MYADDR(ncec)) { 8332 nce_refrele(nce); 8333 return (ENOTSUP); 8334 } 8335 8336 /* static arp entries will undergo NUD if ATF_PERM is not set */ 8337 flags |= NCE_F_STATIC; 8338 if (!if_arp_ioctl) { 8339 ip_nce_lookup_and_update(&ipaddr, NULL, ipst, 8340 lladdr, alength, flags); 8341 } else { 8342 ipif_t *ipif = ipif_get_next_ipif(NULL, ill); 8343 if (ipif != NULL) { 8344 ip_nce_lookup_and_update(&ipaddr, ipif, ipst, 8345 lladdr, alength, flags); 8346 ipif_refrele(ipif); 8347 } 8348 } 8349 if (nce != NULL) { 8350 nce_refrele(nce); 8351 nce = NULL; 8352 } 8353 /* 8354 * NCE_F_STATIC entries will be added in state ND_REACHABLE 8355 * by nce_add_common() 8356 */ 8357 err = nce_lookup_then_add_v4(ill, lladdr, 8358 ill->ill_phys_addr_length, &ipaddr, flags, ND_UNCHANGED, 8359 &nce); 8360 if (err == EEXIST) { 8361 ncec = nce->nce_common; 8362 mutex_enter(&ncec->ncec_lock); 8363 ncec->ncec_state = ND_REACHABLE; 8364 ncec->ncec_flags = flags; 8365 nce_update(ncec, ND_UNCHANGED, lladdr); 8366 mutex_exit(&ncec->ncec_lock); 8367 err = 0; 8368 } 8369 if (nce != NULL) { 8370 nce_refrele(nce); 8371 nce = NULL; 8372 } 8373 if (IS_IPMP(ill) && err == 0) { 8374 entp = ipmp_illgrp_create_arpent(ill->ill_grp, 8375 proxyarp, ipaddr, lladdr, ill->ill_phys_addr_length, 8376 flags); 8377 if (entp == NULL || (proxyarp && proxy_ill == NULL)) { 8378 iocp->ioc_error = (entp == NULL ? ENOMEM : 0); 8379 break; 8380 } 8381 } 8382 iocp->ioc_error = err; 8383 } 8384 8385 if (nce != NULL) { 8386 nce_refrele(nce); 8387 } 8388 8389 /* 8390 * If we created an IPMP ARP entry, mark that we've notified ARP. 8391 */ 8392 if (entp != NULL) 8393 ipmp_illgrp_mark_arpent(ill->ill_grp, entp); 8394 8395 return (iocp->ioc_error); 8396 } 8397 8398 /* 8399 * Parse an [x]arpreq structure coming down SIOC[GSD][X]ARP ioctls, identify 8400 * the associated sin and refhold and return the associated ipif via `ci'. 
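 *
 * (The classic, non-extended variant carries a struct arpreq instead;
 * a minimal SIOCGARP sketch, illustrative only, assuming socket `s'
 * and IPv4 address `addr':
 *
 *	struct arpreq ar;
 *
 *	bzero(&ar, sizeof (ar));
 *	((struct sockaddr_in *)&ar.arp_pa)->sin_family = AF_INET;
 *	((struct sockaddr_in *)&ar.arp_pa)->sin_addr.s_addr = addr;
 *	if (ioctl(s, SIOCGARP, &ar) >= 0 && (ar.arp_flags & ATF_COM))
 *		- ar.arp_ha.sa_data holds the 6-byte hardware address
 * )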
8401 */ 8402 int 8403 ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, 8404 cmd_info_t *ci) 8405 { 8406 mblk_t *mp1; 8407 sin_t *sin; 8408 conn_t *connp; 8409 ipif_t *ipif; 8410 ire_t *ire = NULL; 8411 ill_t *ill = NULL; 8412 boolean_t exists; 8413 ip_stack_t *ipst; 8414 struct arpreq *ar; 8415 struct xarpreq *xar; 8416 struct sockaddr_dl *sdl; 8417 8418 /* ioctl comes down on a conn */ 8419 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 8420 connp = Q_TO_CONN(q); 8421 if (connp->conn_family == AF_INET6) 8422 return (ENXIO); 8423 8424 ipst = connp->conn_netstack->netstack_ip; 8425 8426 /* Verified in ip_wput_nondata */ 8427 mp1 = mp->b_cont->b_cont; 8428 8429 if (ipip->ipi_cmd_type == XARP_CMD) { 8430 ASSERT(MBLKL(mp1) >= sizeof (struct xarpreq)); 8431 xar = (struct xarpreq *)mp1->b_rptr; 8432 sin = (sin_t *)&xar->xarp_pa; 8433 sdl = &xar->xarp_ha; 8434 8435 if (sdl->sdl_family != AF_LINK || sin->sin_family != AF_INET) 8436 return (ENXIO); 8437 if (sdl->sdl_nlen >= LIFNAMSIZ) 8438 return (EINVAL); 8439 } else { 8440 ASSERT(ipip->ipi_cmd_type == ARP_CMD); 8441 ASSERT(MBLKL(mp1) >= sizeof (struct arpreq)); 8442 ar = (struct arpreq *)mp1->b_rptr; 8443 sin = (sin_t *)&ar->arp_pa; 8444 } 8445 8446 if (ipip->ipi_cmd_type == XARP_CMD && sdl->sdl_nlen != 0) { 8447 ipif = ipif_lookup_on_name(sdl->sdl_data, sdl->sdl_nlen, 8448 B_FALSE, &exists, B_FALSE, ALL_ZONES, ipst); 8449 if (ipif == NULL) 8450 return (ENXIO); 8451 if (ipif->ipif_id != 0) { 8452 ipif_refrele(ipif); 8453 return (ENXIO); 8454 } 8455 } else { 8456 /* 8457 * Either an SIOC[DGS]ARP or an SIOC[DGS]XARP with an sdl_nlen 8458 * of 0: use the IP address to find the ipif. If the IP 8459 * address is an IPMP test address, ire_ftable_lookup() will 8460 * find the wrong ill, so we first do an ipif_lookup_addr(). 8461 */ 8462 ipif = ipif_lookup_addr(sin->sin_addr.s_addr, NULL, ALL_ZONES, 8463 ipst); 8464 if (ipif == NULL) { 8465 ire = ire_ftable_lookup_v4(sin->sin_addr.s_addr, 8466 0, 0, IRE_IF_RESOLVER, NULL, ALL_ZONES, 8467 NULL, MATCH_IRE_TYPE, 0, ipst, NULL); 8468 if (ire == NULL || ((ill = ire->ire_ill) == NULL)) { 8469 if (ire != NULL) 8470 ire_refrele(ire); 8471 return (ENXIO); 8472 } 8473 ASSERT(ire != NULL && ill != NULL); 8474 ipif = ill->ill_ipif; 8475 ipif_refhold(ipif); 8476 ire_refrele(ire); 8477 } 8478 } 8479 8480 if (ipif->ipif_ill->ill_net_type != IRE_IF_RESOLVER) { 8481 ipif_refrele(ipif); 8482 return (ENXIO); 8483 } 8484 8485 ci->ci_sin = sin; 8486 ci->ci_ipif = ipif; 8487 return (0); 8488 } 8489 8490 /* 8491 * Link or unlink the illgrp on IPMP meta-interface `ill' depending on the 8492 * value of `ioccmd'. While an illgrp is linked to an ipmp_grp_t, it is 8493 * accessible from that ipmp_grp_t, which means SIOCSLIFGROUPNAME can look it 8494 * up and thus an ill can join that illgrp. 8495 * 8496 * We use I_PLINK/I_PUNLINK to do the link/unlink operations rather than 8497 * open()/close() primarily because close() is not allowed to fail or block 8498 * forever. On the other hand, I_PUNLINK *can* fail, and there's no reason 8499 * why anyone should ever need to I_PUNLINK an in-use IPMP stream. To ensure 8500 * symmetric behavior (e.g., doing an I_PLINK after an I_PUNLINK undoes the 8501 * I_PUNLINK) we defer linking to I_PLINK. Separately, we also fail attempts 8502 * to I_LINK since I_UNLINK is optional and we'd end up in an inconsistent 8503 * state if I_UNLINK didn't occur.
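 *
 * For orientation, the userland half of a persistent link is the
 * standard STREAMS sequence; a heavily simplified, illustrative
 * sketch (`dev' is an assumption here; ifconfig does considerably
 * more than this):
 *
 *	int mux_fd = open("/dev/udp", O_RDWR);	- the IP multiplexor
 *	int ip_fd = open(dev, O_RDWR);		- the device stream
 *	(void) ioctl(ip_fd, I_PUSH, "ip");	- push the IP module
 *	- ...SIOCSLIFNAME etc. on ip_fd...
 *	int muxid = ioctl(mux_fd, I_PLINK, ip_fd);
 *	- ...later, to unplumb: ioctl(mux_fd, I_PUNLINK, muxid);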
8504 * 8505 * Note that for each plumb/unplumb operation, we may end up here more than 8506 * once because of the way ifconfig works. However, it's OK to link the same 8507 * illgrp more than once, or unlink an illgrp that's already unlinked. 8508 */ 8509 static int 8510 ip_sioctl_plink_ipmp(ill_t *ill, int ioccmd) 8511 { 8512 int err; 8513 ip_stack_t *ipst = ill->ill_ipst; 8514 8515 ASSERT(IS_IPMP(ill)); 8516 ASSERT(IAM_WRITER_ILL(ill)); 8517 8518 switch (ioccmd) { 8519 case I_LINK: 8520 return (ENOTSUP); 8521 8522 case I_PLINK: 8523 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 8524 ipmp_illgrp_link_grp(ill->ill_grp, ill->ill_phyint->phyint_grp); 8525 rw_exit(&ipst->ips_ipmp_lock); 8526 break; 8527 8528 case I_PUNLINK: 8529 /* 8530 * Require all UP ipifs be brought down prior to unlinking the 8531 * illgrp so that any associated IREs (and other state) are 8532 * torched. 8533 */ 8533 if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0) 8534 return (EBUSY); 8535 8536 /* 8537 * NOTE: We hold ipmp_lock across the unlink to prevent a race 8538 * with an SIOCSLIFGROUPNAME request from an ill trying to 8539 * join this group. Specifically: ills trying to join grab 8540 * ipmp_lock and bump a "pending join" counter checked by 8541 * ipmp_illgrp_unlink_grp(). During the unlink no new pending 8542 * joins can occur (since we have ipmp_lock). Once we drop 8543 * ipmp_lock, subsequent SIOCSLIFGROUPNAME requests will not 8544 * find the illgrp (since we unlinked it) and will return 8545 * EAFNOSUPPORT. This will then take them back through the 8546 * IPMP meta-interface plumbing logic in ifconfig, and thus 8547 * back through I_PLINK above. 8548 */ 8549 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 8550 err = ipmp_illgrp_unlink_grp(ill->ill_grp); 8551 rw_exit(&ipst->ips_ipmp_lock); 8552 return (err); 8553 default: 8554 break; 8555 } 8556 return (0); 8557 } 8558 8559 /* 8560 * Do I_PLINK/I_LINK or I_PUNLINK/I_UNLINK with consistency checks and also 8561 * atomically set/clear the muxids. Also complete the ioctl by acking or 8562 * naking it. Note that the code is structured such that persistent and 8563 * non-persistent links are treated equally. ifconfig(1M) and 8564 * its clones use the persistent link, while pppd(1M) and perhaps many 8565 * other daemons may use a non-persistent link. When combined with some 8566 * ill_t states, linking and unlinking lower streams may be used as 8567 * indicators of dynamic re-plumbing events [see PSARC/1999/348]. 8568 */ 8569 /* ARGSUSED */ 8570 void 8571 ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 8572 { 8573 mblk_t *mp1; 8574 struct linkblk *li; 8575 int ioccmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd; 8576 int err = 0; 8577 8578 ASSERT(ioccmd == I_PLINK || ioccmd == I_PUNLINK || 8579 ioccmd == I_LINK || ioccmd == I_UNLINK); 8580 8581 mp1 = mp->b_cont; /* This is the linkblk info */ 8582 li = (struct linkblk *)mp1->b_rptr; 8583 8584 err = ip_sioctl_plink_ipmod(ipsq, q, mp, ioccmd, li); 8585 if (err == EINPROGRESS) 8586 return; 8587 done: 8588 if (err == 0) 8589 miocack(q, mp, 0, 0); 8590 else 8591 miocnak(q, mp, 0, err); 8592 8593 /* Conn was refheld in ip_sioctl_copyin_setup */ 8594 if (CONN_Q(q)) 8595 CONN_OPER_PENDING_DONE(Q_TO_CONN(q)); 8596 } 8597 8598 /* 8599 * Process I_{P}LINK and I_{P}UNLINK requests named by `ioccmd' and pointed to 8600 * by `mp' and `li' for the IP module stream (if li->l_qbot is in fact an IP 8601 * module stream).
Do the extended consistency 8602 * checks requested by ifconfig(1M) and (atomically) set ill_muxid here. 8603 * Returns zero on success, EINPROGRESS if the operation is still pending, or 8604 * an error code on failure. 8605 */ 8606 static int 8607 ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, 8608 struct linkblk *li) 8609 { 8610 int err = 0; 8611 ill_t *ill; 8612 queue_t *ipwq, *dwq; 8613 const char *name; 8614 struct qinit *qinfo; 8615 boolean_t islink = (ioccmd == I_PLINK || ioccmd == I_LINK); 8616 boolean_t entered_ipsq = B_FALSE; 8617 boolean_t is_ip = B_FALSE; 8618 arl_t *arl; 8619 8620 /* 8621 * Walk the lower stream to verify it's the IP module stream. 8622 * The IP module is identified by its name, wput function, 8623 * and non-NULL q_next. STREAMS ensures that the lower stream 8624 * (li->l_qbot) will not vanish until this ioctl completes. 8625 */ 8626 for (ipwq = li->l_qbot; ipwq != NULL; ipwq = ipwq->q_next) { 8627 qinfo = ipwq->q_qinfo; 8628 name = qinfo->qi_minfo->mi_idname; 8629 if (name != NULL && strcmp(name, ip_mod_info.mi_idname) == 0 && 8630 qinfo->qi_putp != (pfi_t)ip_lwput && ipwq->q_next != NULL) { 8631 is_ip = B_TRUE; 8632 break; 8633 } 8634 if (name != NULL && strcmp(name, arp_mod_info.mi_idname) == 0 && 8635 qinfo->qi_putp != (pfi_t)ip_lwput && ipwq->q_next != NULL) { 8636 break; 8637 } 8638 } 8639 8640 /* 8641 * If this isn't an IP module stream, bail. 8642 */ 8643 if (ipwq == NULL) 8644 return (0); 8645 8646 if (!is_ip) { 8647 arl = (arl_t *)ipwq->q_ptr; 8648 ill = arl_to_ill(arl); 8649 if (ill == NULL) 8650 return (0); 8651 } else { 8652 ill = ipwq->q_ptr; 8653 } 8654 ASSERT(ill != NULL); 8655 8656 if (ipsq == NULL) { 8657 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink, 8658 NEW_OP, B_FALSE); 8659 if (ipsq == NULL) { 8660 if (!is_ip) 8661 ill_refrele(ill); 8662 return (EINPROGRESS); 8663 } 8664 entered_ipsq = B_TRUE; 8665 } 8666 ASSERT(IAM_WRITER_ILL(ill)); 8667 mutex_enter(&ill->ill_lock); 8668 if (!is_ip) { 8669 if (islink && ill->ill_muxid == 0) { 8670 /* 8671 * Plumbing has to be done with IP plumbed first, arp 8672 * second, but here we have arp being plumbed first. 8673 */ 8674 mutex_exit(&ill->ill_lock); 8675 ipsq_exit(ipsq); 8676 ill_refrele(ill); 8677 return (EINVAL); 8678 } 8679 } 8680 mutex_exit(&ill->ill_lock); 8681 if (!is_ip) { 8682 arl->arl_muxid = islink ? li->l_index : 0; 8683 ill_refrele(ill); 8684 goto done; 8685 } 8686 8687 if (IS_IPMP(ill) && (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0) 8688 goto done; 8689 8690 /* 8691 * As part of I_{P}LINKing, stash the number of downstream modules and 8692 * the read queue of the module immediately below IP in the ill. 8693 * These are used during the capability negotiation below. 8694 */ 8695 ill->ill_lmod_rq = NULL; 8696 ill->ill_lmod_cnt = 0; 8697 if (islink && ((dwq = ipwq->q_next) != NULL)) { 8698 ill->ill_lmod_rq = RD(dwq); 8699 for (; dwq != NULL; dwq = dwq->q_next) 8700 ill->ill_lmod_cnt++; 8701 } 8702 8703 ill->ill_muxid = islink ? li->l_index : 0; 8704 8705 /* 8706 * Mark the ipsq busy until the capability operations initiated below 8707 * complete. The PLINK/UNLINK ioctl itself completes when our caller 8708 * returns, but the capability operation may complete asynchronously 8709 * much later. 8710 */ 8711 ipsq_current_start(ipsq, ill->ill_ipif, ioccmd); 8712 /* 8713 * If there's at least one up ipif on this ill, then we're bound to 8714 * the underlying driver via DLPI.
In that case, renegotiate 8715 * capabilities to account for any possible change in modules 8716 * interposed between IP and the driver. 8717 */ 8718 if (ill->ill_ipif_up_count > 0) { 8719 if (islink) 8720 ill_capability_probe(ill); 8721 else 8722 ill_capability_reset(ill, B_FALSE); 8723 } 8724 ipsq_current_finish(ipsq); 8725 done: 8726 if (entered_ipsq) 8727 ipsq_exit(ipsq); 8728 8729 return (err); 8730 } 8731 8732 /* 8733 * Search for the ioctl command in the ioctl tables and return a pointer 8734 * to the ioctl command information. The ioctl command tables are 8735 * static and fully populated at compile time. 8736 */ 8737 ip_ioctl_cmd_t * 8738 ip_sioctl_lookup(int ioc_cmd) 8739 { 8740 int index; 8741 ip_ioctl_cmd_t *ipip; 8742 ip_ioctl_cmd_t *ipip_end; 8743 8744 if (ioc_cmd == IPI_DONTCARE) 8745 return (NULL); 8746 8747 /* 8748 * Do a 2 step search. First search the indexed table 8749 * based on the least significant byte of the ioctl cmd. 8750 * If we don't find a match, then search the misc table 8751 * serially. 8752 */ 8753 index = ioc_cmd & 0xFF; 8754 if (index < ip_ndx_ioctl_count) { 8755 ipip = &ip_ndx_ioctl_table[index]; 8756 if (ipip->ipi_cmd == ioc_cmd) { 8757 /* Found a match in the ndx table */ 8758 return (ipip); 8759 } 8760 } 8761 8762 /* Search the misc table */ 8763 ipip_end = &ip_misc_ioctl_table[ip_misc_ioctl_count]; 8764 for (ipip = ip_misc_ioctl_table; ipip < ipip_end; ipip++) { 8765 if (ipip->ipi_cmd == ioc_cmd) 8766 /* Found a match in the misc table */ 8767 return (ipip); 8768 } 8769 8770 return (NULL); 8771 } 8772 8773 /* 8774 * Wrapper function for resuming deferred ioctl processing. 8775 * Used for SIOCGDSTINFO, SIOCGIP6ADDRPOLICY, SIOCGMSFILTER, 8776 * SIOCSMSFILTER, SIOCGIPMSFILTER, and SIOCSIPMSFILTER currently. 8777 */ 8778 /* ARGSUSED */ 8779 void 8780 ip_sioctl_copyin_resume(ipsq_t *dummy_ipsq, queue_t *q, mblk_t *mp, 8781 void *dummy_arg) 8782 { 8783 ip_sioctl_copyin_setup(q, mp); 8784 } 8785 8786 /* 8787 * ip_sioctl_copyin_setup is called by ip_wput_nondata with any M_IOCTL message 8788 * that arrives. Most of the IOCTLs are "socket" IOCTLs which we handle 8789 * in either I_STR or TRANSPARENT form, using the mi_copy facility. 8790 * We establish here the size of the block to be copied in. mi_copyin 8791 * arranges for this to happen, and processing continues in ip_wput_nondata 8792 * with an M_IOCDATA message. 8793 */ 8794 void 8795 ip_sioctl_copyin_setup(queue_t *q, mblk_t *mp) 8796 { 8797 int copyin_size; 8798 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8799 ip_ioctl_cmd_t *ipip; 8800 cred_t *cr; 8801 ip_stack_t *ipst; 8802 8803 if (CONN_Q(q)) 8804 ipst = CONNQ_TO_IPST(q); 8805 else 8806 ipst = ILLQ_TO_IPST(q); 8807 8808 ipip = ip_sioctl_lookup(iocp->ioc_cmd); 8809 if (ipip == NULL) { 8810 /* 8811 * The ioctl is not one we understand or own. 8812 * Pass it along to be processed down stream, 8813 * if this is a module instance of IP, else nak 8814 * the ioctl. 8815 */ 8816 if (q->q_next == NULL) { 8817 goto nak; 8818 } else { 8819 putnext(q, mp); 8820 return; 8821 } 8822 } 8823 8824 /* 8825 * If this is deferred, then we will do all the checks when we 8826 * come back. 8827 */ 8828 if ((iocp->ioc_cmd == SIOCGDSTINFO || 8829 iocp->ioc_cmd == SIOCGIP6ADDRPOLICY) && !ip6_asp_can_lookup(ipst)) { 8830 ip6_asp_pending_op(q, mp, ip_sioctl_copyin_resume); 8831 return; 8832 } 8833 8834 /* 8835 * Only allow a very small subset of IP ioctls on this stream if 8836 * IP is a module and not a driver.
Allowing ioctls to be processed 8837 * in this case may cause assert failures or data corruption. 8838 * Typically G[L]IFFLAGS, SLIFNAME/IF_UNITSEL are the only few 8839 * ioctls allowed on an IP module stream, after which this stream 8840 * normally becomes a multiplexor (at which time the stream head 8841 * will fail all ioctls). 8842 */ 8843 if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) { 8844 goto nak; 8845 } 8846 8847 /* Make sure we have ioctl data to process. */ 8848 if (mp->b_cont == NULL && !(ipip->ipi_flags & IPI_NULL_BCONT)) 8849 goto nak; 8850 8851 /* 8852 * Prefer dblk credential over ioctl credential; some synthesized 8853 * ioctls have kcred set because there's no way to crhold() 8854 * a credential in some contexts. (ioc_cr is not crfree() by 8855 * the framework; the caller of ioctl needs to hold the reference 8856 * for the duration of the call). 8857 */ 8858 cr = msg_getcred(mp, NULL); 8859 if (cr == NULL) 8860 cr = iocp->ioc_cr; 8861 8862 /* Make sure normal users don't send down privileged ioctls */ 8863 if ((ipip->ipi_flags & IPI_PRIV) && 8864 (cr != NULL) && secpolicy_ip_config(cr, B_TRUE) != 0) { 8865 /* We checked the privilege earlier but log it here */ 8866 miocnak(q, mp, 0, secpolicy_ip_config(cr, B_FALSE)); 8867 return; 8868 } 8869 8870 /* 8871 * The ioctl command tables can only encode fixed length 8872 * ioctl data. If the length is variable, the table will 8873 * encode the length as zero. Such special cases are handled 8874 * below in the switch. 8875 */ 8876 if (ipip->ipi_copyin_size != 0) { 8877 mi_copyin(q, mp, NULL, ipip->ipi_copyin_size); 8878 return; 8879 } 8880 8881 switch (iocp->ioc_cmd) { 8882 case O_SIOCGIFCONF: 8883 case SIOCGIFCONF: 8884 /* 8885 * This IOCTL is hilarious. See comments in 8886 * ip_sioctl_get_ifconf for the story. 8887 */ 8888 if (iocp->ioc_count == TRANSPARENT) 8889 copyin_size = SIZEOF_STRUCT(ifconf, 8890 iocp->ioc_flag); 8891 else 8892 copyin_size = iocp->ioc_count; 8893 mi_copyin(q, mp, NULL, copyin_size); 8894 return; 8895 8896 case O_SIOCGLIFCONF: 8897 case SIOCGLIFCONF: 8898 copyin_size = SIZEOF_STRUCT(lifconf, iocp->ioc_flag); 8899 mi_copyin(q, mp, NULL, copyin_size); 8900 return; 8901 8902 case SIOCGLIFSRCOF: 8903 copyin_size = SIZEOF_STRUCT(lifsrcof, iocp->ioc_flag); 8904 mi_copyin(q, mp, NULL, copyin_size); 8905 return; 8906 case SIOCGIP6ADDRPOLICY: 8907 ip_sioctl_ip6addrpolicy(q, mp); 8908 ip6_asp_table_refrele(ipst); 8909 return; 8910 8911 case SIOCSIP6ADDRPOLICY: 8912 ip_sioctl_ip6addrpolicy(q, mp); 8913 return; 8914 8915 case SIOCGDSTINFO: 8916 ip_sioctl_dstinfo(q, mp); 8917 ip6_asp_table_refrele(ipst); 8918 return; 8919 8920 case I_PLINK: 8921 case I_PUNLINK: 8922 case I_LINK: 8923 case I_UNLINK: 8924 /* 8925 * We treat non-persistent link similarly as the persistent 8926 * link case, in terms of plumbing/unplumbing, as well as 8927 * dynamic re-plumbing events indicator. See comments 8928 * in ip_sioctl_plink() for more. 8929 * 8930 * Request can be enqueued in the 'ipsq' while waiting 8931 * to become exclusive. So bump up the conn ref. 8932 */ 8933 if (CONN_Q(q)) 8934 CONN_INC_REF(Q_TO_CONN(q)); 8935 ip_sioctl_plink(NULL, q, mp, NULL); 8936 return; 8937 8938 case ND_GET: 8939 case ND_SET: 8940 /* 8941 * Use of the nd table requires holding the reader lock. 8942 * Modifying the nd table thru nd_load/nd_unload requires 8943 * the writer lock. 
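 *
 * (These requests are what ndd(1M) generates; for example
 *	ndd -get /dev/ip ip_forwarding
 * arrives here as an ND_GET against the ips_ip_g_nd table.)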
8944 */ 8945 rw_enter(&ipst->ips_ip_g_nd_lock, RW_READER); 8946 if (nd_getset(q, ipst->ips_ip_g_nd, mp)) { 8947 rw_exit(&ipst->ips_ip_g_nd_lock); 8948 8949 if (iocp->ioc_error) 8950 iocp->ioc_count = 0; 8951 mp->b_datap->db_type = M_IOCACK; 8952 qreply(q, mp); 8953 return; 8954 } 8955 rw_exit(&ipst->ips_ip_g_nd_lock); 8956 /* 8957 * We don't understand this subioctl of ND_GET / ND_SET. 8958 * Maybe intended for some driver / module below us. 8959 */ 8960 if (q->q_next) { 8961 putnext(q, mp); 8962 } else { 8963 iocp->ioc_error = ENOENT; 8964 mp->b_datap->db_type = M_IOCNAK; 8965 iocp->ioc_count = 0; 8966 qreply(q, mp); 8967 } 8968 return; 8969 8970 case IP_IOCTL: 8971 ip_wput_ioctl(q, mp); 8972 return; 8973 8974 case SIOCILB: 8975 /* The ioctl length varies depending on the ILB command. */ 8976 copyin_size = iocp->ioc_count; 8977 if (copyin_size < sizeof (ilb_cmd_t)) 8978 goto nak; 8979 mi_copyin(q, mp, NULL, copyin_size); 8980 return; 8981 8982 default: 8983 cmn_err(CE_PANIC, "should not happen "); 8984 } 8985 nak: 8986 if (mp->b_cont != NULL) { 8987 freemsg(mp->b_cont); 8988 mp->b_cont = NULL; 8989 } 8990 iocp->ioc_error = EINVAL; 8991 mp->b_datap->db_type = M_IOCNAK; 8992 iocp->ioc_count = 0; 8993 qreply(q, mp); 8994 } 8995 8996 static void 8997 ip_sioctl_garp_reply(mblk_t *mp, ill_t *ill, void *hwaddr, int flags) 8998 { 8999 struct arpreq *ar; 9000 struct xarpreq *xar; 9001 mblk_t *tmp; 9002 struct iocblk *iocp; 9003 int x_arp_ioctl = B_FALSE; 9004 int *flagsp; 9005 char *storage = NULL; 9006 9007 ASSERT(ill != NULL); 9008 9009 iocp = (struct iocblk *)mp->b_rptr; 9010 ASSERT(iocp->ioc_cmd == SIOCGXARP || iocp->ioc_cmd == SIOCGARP); 9011 9012 tmp = (mp->b_cont)->b_cont; /* xarpreq/arpreq */ 9013 if ((iocp->ioc_cmd == SIOCGXARP) || 9014 (iocp->ioc_cmd == SIOCSXARP)) { 9015 x_arp_ioctl = B_TRUE; 9016 xar = (struct xarpreq *)tmp->b_rptr; 9017 flagsp = &xar->xarp_flags; 9018 storage = xar->xarp_ha.sdl_data; 9019 } else { 9020 ar = (struct arpreq *)tmp->b_rptr; 9021 flagsp = &ar->arp_flags; 9022 storage = ar->arp_ha.sa_data; 9023 } 9024 9025 /* 9026 * We're done if this is not an SIOCG{X}ARP 9027 */ 9028 if (x_arp_ioctl) { 9029 storage += ill_xarp_info(&xar->xarp_ha, ill); 9030 if ((ill->ill_phys_addr_length + ill->ill_name_length) > 9031 sizeof (xar->xarp_ha.sdl_data)) { 9032 iocp->ioc_error = EINVAL; 9033 return; 9034 } 9035 } 9036 *flagsp = ATF_INUSE; 9037 /* 9038 * If /sbin/arp told us we are the authority using the "permanent" 9039 * flag, or if this is one of my addresses, print "permanent" 9040 * in the /sbin/arp output. 9041 */ 9042 if ((flags & NCE_F_MYADDR) || (flags & NCE_F_AUTHORITY)) 9043 *flagsp |= ATF_AUTHORITY; 9044 if (flags & NCE_F_NONUD) 9045 *flagsp |= ATF_PERM; /* not subject to aging */ 9046 if (flags & NCE_F_PUBLISH) 9047 *flagsp |= ATF_PUBL; 9048 if (hwaddr != NULL) { 9049 *flagsp |= ATF_COM; 9050 bcopy((char *)hwaddr, storage, ill->ill_phys_addr_length); 9051 } 9052 } 9053 9054 /* 9055 * Create a new logical interface. If ipif_id is zero (i.e. not a logical 9056 * interface) create the next available logical interface for this 9057 * physical interface. 9058 * If ipif is NULL (i.e. the lookup didn't find one) attempt to create an 9059 * ipif with the specified name. 9060 * 9061 * If the address family is not AF_UNSPEC then set the address as well. 9062 * 9063 * If ip_sioctl_addr returns EINPROGRESS then the ioctl (the copyout) 9064 * is completed when the DL_BIND_ACK arrives in ip_rput_dlpi_writer. 9065 * 9066 * Executed as a writer on the ill.
9067 * So no lock is needed to traverse the ipif chain, or examine the 9068 * phyint flags. 9069 */ 9070 /* ARGSUSED */ 9071 int 9072 ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9073 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 9074 { 9075 mblk_t *mp1; 9076 struct lifreq *lifr; 9077 boolean_t isv6; 9078 boolean_t exists; 9079 char *name; 9080 char *endp; 9081 char *cp; 9082 int namelen; 9083 ipif_t *ipif; 9084 long id; 9085 ipsq_t *ipsq; 9086 ill_t *ill; 9087 sin_t *sin; 9088 int err = 0; 9089 boolean_t found_sep = B_FALSE; 9090 conn_t *connp; 9091 zoneid_t zoneid; 9092 ip_stack_t *ipst = CONNQ_TO_IPST(q); 9093 9094 ASSERT(q->q_next == NULL); 9095 ip1dbg(("ip_sioctl_addif\n")); 9096 /* Existence of mp1 has been checked in ip_wput_nondata */ 9097 mp1 = mp->b_cont->b_cont; 9098 /* 9099 * Null terminate the string to protect against buffer 9100 * overrun. String was generated by user code and may not 9101 * be trusted. 9102 */ 9103 lifr = (struct lifreq *)mp1->b_rptr; 9104 lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; 9105 name = lifr->lifr_name; 9106 ASSERT(CONN_Q(q)); 9107 connp = Q_TO_CONN(q); 9108 isv6 = (connp->conn_family == AF_INET6); 9109 zoneid = connp->conn_zoneid; 9110 namelen = mi_strlen(name); 9111 if (namelen == 0) 9112 return (EINVAL); 9113 9114 exists = B_FALSE; 9115 if ((namelen + 1 == sizeof (ipif_loopback_name)) && 9116 (mi_strcmp(name, ipif_loopback_name) == 0)) { 9117 /* 9118 * Allow creating lo0 using SIOCLIFADDIF. 9119 * There can't be any other writer thread, so we can pass null 9120 * below for the last 4 args to ipif_lookup_on_name. 9121 */ 9122 ipif = ipif_lookup_on_name(lifr->lifr_name, namelen, B_TRUE, 9123 &exists, isv6, zoneid, ipst); 9124 /* Prevent any further action */ 9125 if (ipif == NULL) { 9126 return (ENOBUFS); 9127 } else if (!exists) { 9128 /* We created the ipif now and as writer */ 9129 ipif_refrele(ipif); 9130 return (0); 9131 } else { 9132 ill = ipif->ipif_ill; 9133 ill_refhold(ill); 9134 ipif_refrele(ipif); 9135 } 9136 } else { 9137 /* Look for a colon in the name. */ 9138 endp = &name[namelen]; 9139 for (cp = endp; --cp > name; ) { 9140 if (*cp == IPIF_SEPARATOR_CHAR) { 9141 found_sep = B_TRUE; 9142 /* 9143 * Reject any non-decimal aliases for plumbing 9144 * of logical interfaces. Aliases with leading 9145 * zeroes are also rejected as they introduce 9146 * ambiguity in the naming of the interfaces. 9147 * Comparing with "0" takes care of all such 9148 * cases. 9149 */ 9150 if ((strncmp("0", cp+1, 1)) == 0) 9151 return (EINVAL); 9152 9153 if (ddi_strtol(cp+1, &endp, 10, &id) != 0 || 9154 id <= 0 || *endp != '\0') { 9155 return (EINVAL); 9156 } 9157 *cp = '\0'; 9158 break; 9159 } 9160 } 9161 ill = ill_lookup_on_name(name, B_FALSE, isv6, NULL, ipst); 9162 if (found_sep) 9163 *cp = IPIF_SEPARATOR_CHAR; 9164 if (ill == NULL) 9165 return (ENXIO); 9166 } 9167 9168 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_process_ioctl, NEW_OP, 9169 B_TRUE); 9170 9171 /* 9172 * Release the refhold due to the lookup, now that we are excl 9173 * or we are just returning. 9174 */ 9175 ill_refrele(ill); 9176 9177 if (ipsq == NULL) 9178 return (EINPROGRESS); 9179 9180 /* We are now exclusive on the IPSQ */ 9181 ASSERT(IAM_WRITER_ILL(ill)); 9182 9183 if (found_sep) { 9184 /* Now see if there is an IPIF with this unit number. */ 9185 for (ipif = ill->ill_ipif; ipif != NULL; 9186 ipif = ipif->ipif_next) { 9187 if (ipif->ipif_id == id) { 9188 err = EEXIST; 9189 goto done; 9190 } 9191 } 9192 } 9193 9194 /* 9195 * We use IRE_LOCAL for lo0:1 etc.
for "receive only" use 9196 * of lo0. Plumbing for lo0:0 happens in ipif_lookup_on_name() 9197 * instead. 9198 */ 9199 if ((ipif = ipif_allocate(ill, found_sep ? id : -1, IRE_LOCAL, 9200 B_TRUE, B_TRUE, &err)) == NULL) { 9201 goto done; 9202 } 9203 9204 /* Return created name with ioctl */ 9205 (void) sprintf(lifr->lifr_name, "%s%c%d", ill->ill_name, 9206 IPIF_SEPARATOR_CHAR, ipif->ipif_id); 9207 ip1dbg(("created %s\n", lifr->lifr_name)); 9208 9209 /* Set address */ 9210 sin = (sin_t *)&lifr->lifr_addr; 9211 if (sin->sin_family != AF_UNSPEC) { 9212 err = ip_sioctl_addr(ipif, sin, q, mp, 9213 &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], lifr); 9214 } 9215 9216 done: 9217 ipsq_exit(ipsq); 9218 return (err); 9219 } 9220 9221 /* 9222 * Remove an existing logical interface. If ipif_id is zero (i.e. not a logical 9223 * interface) delete it based on the IP address (on this physical interface). 9224 * Otherwise delete it based on the ipif_id. 9225 * Also, special handling to allow a removeif of lo0. 9226 */ 9227 /* ARGSUSED */ 9228 int 9229 ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9230 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 9231 { 9232 conn_t *connp; 9233 ill_t *ill = ipif->ipif_ill; 9234 boolean_t success; 9235 ip_stack_t *ipst; 9236 9237 ipst = CONNQ_TO_IPST(q); 9238 9239 ASSERT(q->q_next == NULL); 9240 ip1dbg(("ip_sioctl_remove_if(%s:%u %p)\n", 9241 ill->ill_name, ipif->ipif_id, (void *)ipif)); 9242 ASSERT(IAM_WRITER_IPIF(ipif)); 9243 9244 connp = Q_TO_CONN(q); 9245 /* 9246 * Special case for unplumbing lo0 (the loopback physical interface). 9247 * If unplumbing lo0, the incoming address structure has been 9248 * initialized to all zeros. When unplumbing lo0, all its logical 9249 * interfaces must be removed too. 9250 * 9251 * Note that this interface may be called to remove a specific 9252 * loopback logical interface (e.g., lo0:1). But in that case 9253 * ipif->ipif_id != 0 so that the code path for that case is the 9254 * same as any other interface (meaning it skips the code directly 9255 * below). 9256 */ 9257 if (ipif->ipif_id == 0 && ill->ill_net_type == IRE_LOOPBACK) { 9258 if (sin->sin_family == AF_UNSPEC && 9259 (IN6_IS_ADDR_UNSPECIFIED(&((sin6_t *)sin)->sin6_addr))) { 9260 /* 9261 * Mark it condemned. No new refs will be made to the ill.
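 *
 * (The all-zero address tested just above is how userland asks
 * for the full unplumb; an illustrative sketch, assuming an open
 * socket `s':
 *
 *	struct lifreq lifr;
 *
 *	bzero(&lifr, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "lo0", sizeof (lifr.lifr_name));
 *	(void) ioctl(s, SIOCLIFREMOVEIF, &lifr);
 *
 * bzero leaves lifr_addr's family as AF_UNSPEC with an unspecified
 * address, matching the test above.)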
9262 */ 9263 mutex_enter(&ill->ill_lock); 9264 ill->ill_state_flags |= ILL_CONDEMNED; 9265 for (ipif = ill->ill_ipif; ipif != NULL; 9266 ipif = ipif->ipif_next) { 9267 ipif->ipif_state_flags |= IPIF_CONDEMNED; 9268 } 9269 mutex_exit(&ill->ill_lock); 9270 9271 ipif = ill->ill_ipif; 9272 /* unplumb the loopback interface */ 9273 ill_delete(ill); 9274 mutex_enter(&connp->conn_lock); 9275 mutex_enter(&ill->ill_lock); 9276 9277 /* Are any references to this ill active */ 9278 if (ill_is_freeable(ill)) { 9279 mutex_exit(&ill->ill_lock); 9280 mutex_exit(&connp->conn_lock); 9281 ill_delete_tail(ill); 9282 mi_free(ill); 9283 return (0); 9284 } 9285 success = ipsq_pending_mp_add(connp, ipif, 9286 CONNP_TO_WQ(connp), mp, ILL_FREE); 9287 mutex_exit(&connp->conn_lock); 9288 mutex_exit(&ill->ill_lock); 9289 if (success) 9290 return (EINPROGRESS); 9291 else 9292 return (EINTR); 9293 } 9294 } 9295 9296 if (ipif->ipif_id == 0) { 9297 ipsq_t *ipsq; 9298 9299 /* Find based on address */ 9300 if (ipif->ipif_isv6) { 9301 sin6_t *sin6; 9302 9303 if (sin->sin_family != AF_INET6) 9304 return (EAFNOSUPPORT); 9305 9306 sin6 = (sin6_t *)sin; 9307 /* We are a writer, so we should be able to lookup */ 9308 ipif = ipif_lookup_addr_exact_v6(&sin6->sin6_addr, ill, 9309 ipst); 9310 } else { 9311 if (sin->sin_family != AF_INET) 9312 return (EAFNOSUPPORT); 9313 9314 /* We are a writer, so we should be able to lookup */ 9315 ipif = ipif_lookup_addr_exact(sin->sin_addr.s_addr, ill, 9316 ipst); 9317 } 9318 if (ipif == NULL) { 9319 return (EADDRNOTAVAIL); 9320 } 9321 9322 /* 9323 * It is possible for a user to send an SIOCLIFREMOVEIF with 9324 * lifr_name of the physical interface but with an ip address 9325 * lifr_addr of a logical interface plumbed over it. 9326 * So update ipx_current_ipif now that ipif points to the 9327 * correct one. 9328 */ 9329 ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq; 9330 ipsq->ipsq_xop->ipx_current_ipif = ipif; 9331 9332 /* This is a writer */ 9333 ipif_refrele(ipif); 9334 } 9335 9336 /* 9337 * Can not delete instance zero since it is tied to the ill. 9338 */ 9339 if (ipif->ipif_id == 0) 9340 return (EBUSY); 9341 9342 mutex_enter(&ill->ill_lock); 9343 ipif->ipif_state_flags |= IPIF_CONDEMNED; 9344 mutex_exit(&ill->ill_lock); 9345 9346 ipif_free(ipif); 9347 9348 mutex_enter(&connp->conn_lock); 9349 mutex_enter(&ill->ill_lock); 9350 9351 /* Are any references to this ipif active */ 9352 if (ipif_is_freeable(ipif)) { 9353 mutex_exit(&ill->ill_lock); 9354 mutex_exit(&connp->conn_lock); 9355 ipif_non_duplicate(ipif); 9356 (void) ipif_down_tail(ipif); 9357 ipif_free_tail(ipif); /* frees ipif */ 9358 return (0); 9359 } 9360 success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), mp, 9361 IPIF_FREE); 9362 mutex_exit(&ill->ill_lock); 9363 mutex_exit(&connp->conn_lock); 9364 if (success) 9365 return (EINPROGRESS); 9366 else 9367 return (EINTR); 9368 } 9369 9370 /* 9371 * Restart the removeif ioctl. The refcnt has gone down to 0. 9372 * The ipif is already condemned. So can't find it thru lookups. 
9373 */ 9374 /* ARGSUSED */ 9375 int 9376 ip_sioctl_removeif_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, 9377 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_if_req) 9378 { 9379 ill_t *ill = ipif->ipif_ill; 9380 9381 ASSERT(IAM_WRITER_IPIF(ipif)); 9382 ASSERT(ipif->ipif_state_flags & IPIF_CONDEMNED); 9383 9384 ip1dbg(("ip_sioctl_removeif_restart(%s:%u %p)\n", 9385 ill->ill_name, ipif->ipif_id, (void *)ipif)); 9386 9387 if (ipif->ipif_id == 0 && ill->ill_net_type == IRE_LOOPBACK) { 9388 ASSERT(ill->ill_state_flags & ILL_CONDEMNED); 9389 ill_delete_tail(ill); 9390 mi_free(ill); 9391 return (0); 9392 } 9393 9394 ipif_non_duplicate(ipif); 9395 (void) ipif_down_tail(ipif); 9396 ipif_free_tail(ipif); 9397 9398 return (0); 9399 } 9400 9401 /* 9402 * Set the local interface address. 9403 * Allow an address of all zero when the interface is down. 9404 */ 9405 /* ARGSUSED */ 9406 int 9407 ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9408 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 9409 { 9410 int err = 0; 9411 in6_addr_t v6addr; 9412 boolean_t need_up = B_FALSE; 9413 9414 ip1dbg(("ip_sioctl_addr(%s:%u %p)\n", 9415 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9416 9417 ASSERT(IAM_WRITER_IPIF(ipif)); 9418 9419 if (ipif->ipif_isv6) { 9420 sin6_t *sin6; 9421 ill_t *ill; 9422 phyint_t *phyi; 9423 9424 if (sin->sin_family != AF_INET6) 9425 return (EAFNOSUPPORT); 9426 9427 sin6 = (sin6_t *)sin; 9428 v6addr = sin6->sin6_addr; 9429 ill = ipif->ipif_ill; 9430 phyi = ill->ill_phyint; 9431 9432 /* 9433 * Enforce that true multicast interfaces have a link-local 9434 * address for logical unit 0. 9435 */ 9436 if (ipif->ipif_id == 0 && 9437 (ill->ill_flags & ILLF_MULTICAST) && 9438 !(ipif->ipif_flags & (IPIF_POINTOPOINT)) && 9439 !(phyi->phyint_flags & (PHYI_LOOPBACK)) && 9440 !IN6_IS_ADDR_LINKLOCAL(&v6addr)) { 9441 return (EADDRNOTAVAIL); 9442 } 9443 9444 /* 9445 * Up interfaces shouldn't have the unspecified address 9446 * unless they also have the IPIF_NOLOCAL flag set and 9447 * have a subnet assigned. 9448 */ 9449 if ((ipif->ipif_flags & IPIF_UP) && 9450 IN6_IS_ADDR_UNSPECIFIED(&v6addr) && 9451 (!(ipif->ipif_flags & IPIF_NOLOCAL) || 9452 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) { 9453 return (EADDRNOTAVAIL); 9454 } 9455 9456 if (!ip_local_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask)) 9457 return (EADDRNOTAVAIL); 9458 } else { 9459 ipaddr_t addr; 9460 9461 if (sin->sin_family != AF_INET) 9462 return (EAFNOSUPPORT); 9463 9464 addr = sin->sin_addr.s_addr; 9465 9466 /* Allow 0 as the local address. */ 9467 if (addr != 0 && !ip_addr_ok_v4(addr, ipif->ipif_net_mask)) 9468 return (EADDRNOTAVAIL); 9469 9470 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 9471 } 9472 9473 /* 9474 * Even if there is no change we redo things just to rerun 9475 * ipif_set_default. 9476 */ 9477 if (ipif->ipif_flags & IPIF_UP) { 9478 /* 9479 * Setting a new local address, make sure 9480 * we have net and subnet bcast ire's for 9481 * the old address if we need them. 9482 */ 9483 /* 9484 * If the interface is already marked up, 9485 * we call ipif_down which will take care 9486 * of ditching any IREs that have been set 9487 * up based on the old interface address.
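 *
 * (For orientation, the request that lands here is an ordinary
 * SIOCSLIFADDR from userland; illustrative sketch only, assuming
 * an AF_INET socket `s' and an interface name in `name':
 *
 *	struct lifreq lifr;
 *	struct sockaddr_in *sin4 =
 *	    (struct sockaddr_in *)&lifr.lifr_addr;
 *
 *	bzero(&lifr, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, name, sizeof (lifr.lifr_name));
 *	sin4->sin_family = AF_INET;
 *	sin4->sin_addr.s_addr = inet_addr("192.0.2.10");
 *	(void) ioctl(s, SIOCSLIFADDR, &lifr);
 * )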
9488 */ 9489 err = ipif_logical_down(ipif, q, mp); 9490 if (err == EINPROGRESS) 9491 return (err); 9492 (void) ipif_down_tail(ipif); 9493 need_up = B_TRUE; 9494 } 9495 9496 err = ip_sioctl_addr_tail(ipif, sin, q, mp, need_up); 9497 return (err); 9498 } 9499 9500 int 9501 ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9502 boolean_t need_up) 9503 { 9504 in6_addr_t v6addr; 9505 in6_addr_t ov6addr; 9506 ipaddr_t addr; 9507 sin6_t *sin6; 9508 int sinlen; 9509 int err = 0; 9510 ill_t *ill = ipif->ipif_ill; 9511 boolean_t need_dl_down; 9512 boolean_t need_arp_down; 9513 struct iocblk *iocp; 9514 9515 iocp = (mp != NULL) ? (struct iocblk *)mp->b_rptr : NULL; 9516 9517 ip1dbg(("ip_sioctl_addr_tail(%s:%u %p)\n", 9518 ill->ill_name, ipif->ipif_id, (void *)ipif)); 9519 ASSERT(IAM_WRITER_IPIF(ipif)); 9520 9521 /* Must cancel any pending timer before taking the ill_lock */ 9522 if (ipif->ipif_recovery_id != 0) 9523 (void) untimeout(ipif->ipif_recovery_id); 9524 ipif->ipif_recovery_id = 0; 9525 9526 if (ipif->ipif_isv6) { 9527 sin6 = (sin6_t *)sin; 9528 v6addr = sin6->sin6_addr; 9529 sinlen = sizeof (struct sockaddr_in6); 9530 } else { 9531 addr = sin->sin_addr.s_addr; 9532 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 9533 sinlen = sizeof (struct sockaddr_in); 9534 } 9535 mutex_enter(&ill->ill_lock); 9536 ov6addr = ipif->ipif_v6lcl_addr; 9537 ipif->ipif_v6lcl_addr = v6addr; 9538 sctp_update_ipif_addr(ipif, ov6addr); 9539 ipif->ipif_addr_ready = 0; 9540 9541 ip_rts_newaddrmsg(RTM_CHGADDR, 0, ipif, RTSQ_DEFAULT); 9542 9543 /* 9544 * If the interface was previously marked as a duplicate, then since 9545 * we've now got a "new" address, it should no longer be considered a 9546 * duplicate -- even if the "new" address is the same as the old one. 9547 * Note that if all ipifs are down, we may have a pending ARP down 9548 * event to handle. This is because we want to recover from duplicates 9549 * and thus delay tearing down ARP until the duplicates have been 9550 * removed or disabled. 9551 */ 9552 need_dl_down = need_arp_down = B_FALSE; 9553 if (ipif->ipif_flags & IPIF_DUPLICATE) { 9554 need_arp_down = !need_up; 9555 ipif->ipif_flags &= ~IPIF_DUPLICATE; 9556 if (--ill->ill_ipif_dup_count == 0 && !need_up && 9557 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) { 9558 need_dl_down = B_TRUE; 9559 } 9560 } 9561 9562 ipif_set_default(ipif); 9563 9564 /* 9565 * If we've just manually set the IPv6 link-local address (0th ipif), 9566 * tag the ill so that future updates to the interface ID don't result 9567 * in this address getting automatically reconfigured from under the 9568 * administrator. 9569 */ 9570 if (ipif->ipif_isv6 && ipif->ipif_id == 0) 9571 ill->ill_manual_linklocal = 1; 9572 9573 /* 9574 * When publishing an interface address change event, we only notify 9575 * the event listeners of the new address. It is assumed that if they 9576 * actively care about the addresses assigned, they will have 9577 * already discovered the previous address assigned (if there was one.) 9578 * 9579 * Don't attach nic event message for SIOCLIFADDIF ioctl. 9580 */ 9581 if (iocp != NULL && iocp->ioc_cmd != SIOCLIFADDIF) { 9582 ill_nic_event_dispatch(ill, MAP_IPIF_ID(ipif->ipif_id), 9583 NE_ADDRESS_CHANGE, sin, sinlen); 9584 } 9585 9586 mutex_exit(&ill->ill_lock); 9587 9588 if (need_up) { 9589 /* 9590 * Now bring the interface back up.
If this 9591 * is the only IPIF for the ILL, ipif_up 9592 * will have to re-bind to the device, so 9593 * we may get back EINPROGRESS, in which 9594 * case, this IOCTL will get completed in 9595 * ip_rput_dlpi when we see the DL_BIND_ACK. 9596 */ 9597 err = ipif_up(ipif, q, mp); 9598 } else { 9599 /* Perhaps ilgs should use this ill */ 9600 update_conn_ill(NULL, ill->ill_ipst); 9601 } 9602 9603 if (need_dl_down) 9604 ill_dl_down(ill); 9605 9606 if (need_arp_down && !ill->ill_isv6) 9607 (void) ipif_arp_down(ipif); 9608 9609 /* 9610 * The default multicast interface might have changed (for 9611 * instance if the IPv6 scope of the address changed) 9612 */ 9613 ire_increment_multicast_generation(ill->ill_ipst, ill->ill_isv6); 9614 9615 return (err); 9616 } 9617 9618 /* 9619 * Restart entry point to restart the address set operation after the 9620 * refcounts have dropped to zero. 9621 */ 9622 /* ARGSUSED */ 9623 int 9624 ip_sioctl_addr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9625 ip_ioctl_cmd_t *ipip, void *ifreq) 9626 { 9627 ip1dbg(("ip_sioctl_addr_restart(%s:%u %p)\n", 9628 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9629 ASSERT(IAM_WRITER_IPIF(ipif)); 9630 (void) ipif_down_tail(ipif); 9631 return (ip_sioctl_addr_tail(ipif, sin, q, mp, B_TRUE)); 9632 } 9633 9634 /* ARGSUSED */ 9635 int 9636 ip_sioctl_get_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9637 ip_ioctl_cmd_t *ipip, void *if_req) 9638 { 9639 sin6_t *sin6 = (struct sockaddr_in6 *)sin; 9640 struct lifreq *lifr = (struct lifreq *)if_req; 9641 9642 ip1dbg(("ip_sioctl_get_addr(%s:%u %p)\n", 9643 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9644 /* 9645 * The net mask and address can't change since we have a 9646 * reference to the ipif. So no lock is necessary. 9647 */ 9648 if (ipif->ipif_isv6) { 9649 *sin6 = sin6_null; 9650 sin6->sin6_family = AF_INET6; 9651 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 9652 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 9653 lifr->lifr_addrlen = 9654 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 9655 } else { 9656 *sin = sin_null; 9657 sin->sin_family = AF_INET; 9658 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 9659 if (ipip->ipi_cmd_type == LIF_CMD) { 9660 lifr->lifr_addrlen = 9661 ip_mask_to_plen(ipif->ipif_net_mask); 9662 } 9663 } 9664 return (0); 9665 } 9666 9667 /* 9668 * Set the destination address for a pt-pt interface. 
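 * (Reached via the SIOCSIFDSTADDR and SIOCSLIFDSTADDR ioctls.)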
9669 */ 9670 /* ARGSUSED */ 9671 int 9672 ip_sioctl_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9673 ip_ioctl_cmd_t *ipip, void *if_req) 9674 { 9675 int err = 0; 9676 in6_addr_t v6addr; 9677 boolean_t need_up = B_FALSE; 9678 9679 ip1dbg(("ip_sioctl_dstaddr(%s:%u %p)\n", 9680 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9681 ASSERT(IAM_WRITER_IPIF(ipif)); 9682 9683 if (ipif->ipif_isv6) { 9684 sin6_t *sin6; 9685 9686 if (sin->sin_family != AF_INET6) 9687 return (EAFNOSUPPORT); 9688 9689 sin6 = (sin6_t *)sin; 9690 v6addr = sin6->sin6_addr; 9691 9692 if (!ip_remote_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask)) 9693 return (EADDRNOTAVAIL); 9694 } else { 9695 ipaddr_t addr; 9696 9697 if (sin->sin_family != AF_INET) 9698 return (EAFNOSUPPORT); 9699 9700 addr = sin->sin_addr.s_addr; 9701 if (!ip_addr_ok_v4(addr, ipif->ipif_net_mask)) 9702 return (EADDRNOTAVAIL); 9703 9704 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 9705 } 9706 9707 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr, &v6addr)) 9708 return (0); /* No change */ 9709 9710 if (ipif->ipif_flags & IPIF_UP) { 9711 /* 9712 * If the interface is already marked up, 9713 * we call ipif_down which will take care 9714 * of ditching any IREs that have been set 9715 * up based on the old pp dst address. 9716 */ 9717 err = ipif_logical_down(ipif, q, mp); 9718 if (err == EINPROGRESS) 9719 return (err); 9720 (void) ipif_down_tail(ipif); 9721 need_up = B_TRUE; 9722 } 9723 /* 9724 * Could return EINPROGRESS. If so, the ioctl will complete in 9725 * ip_rput_dlpi_writer. 9726 */ 9727 err = ip_sioctl_dstaddr_tail(ipif, sin, q, mp, need_up); 9728 return (err); 9729 } 9730 9731 static int 9732 ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9733 boolean_t need_up) 9734 { 9735 in6_addr_t v6addr; 9736 ill_t *ill = ipif->ipif_ill; 9737 int err = 0; 9738 boolean_t need_dl_down; 9739 boolean_t need_arp_down; 9740 9741 ip1dbg(("ip_sioctl_dstaddr_tail(%s:%u %p)\n", ill->ill_name, 9742 ipif->ipif_id, (void *)ipif)); 9743 9744 /* Must cancel any pending timer before taking the ill_lock */ 9745 if (ipif->ipif_recovery_id != 0) 9746 (void) untimeout(ipif->ipif_recovery_id); 9747 ipif->ipif_recovery_id = 0; 9748 9749 if (ipif->ipif_isv6) { 9750 sin6_t *sin6; 9751 9752 sin6 = (sin6_t *)sin; 9753 v6addr = sin6->sin6_addr; 9754 } else { 9755 ipaddr_t addr; 9756 9757 addr = sin->sin_addr.s_addr; 9758 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 9759 } 9760 mutex_enter(&ill->ill_lock); 9761 /* Set point to point destination address. */ 9762 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 9763 /* 9764 * Allow this as a means of creating logical 9765 * pt-pt interfaces on top of e.g. an Ethernet. 9766 * XXX Undocumented HACK for testing. 9767 * pt-pt interfaces are created with NUD disabled. 9768 */ 9769 ipif->ipif_flags |= IPIF_POINTOPOINT; 9770 ipif->ipif_flags &= ~IPIF_BROADCAST; 9771 if (ipif->ipif_isv6) 9772 ill->ill_flags |= ILLF_NONUD; 9773 } 9774 9775 /* 9776 * If the interface was previously marked as a duplicate, then since 9777 * we've now got a "new" address, it should no longer be considered a 9778 * duplicate -- even if the "new" address is the same as the old one. 9779 * Note that if all ipifs are down, we may have a pending ARP down 9780 * event to handle.
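 * This is because we want to recover from duplicates and thus delay
 * tearing down ARP until the duplicates have been removed or disabled.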
9781 */ 9782 need_dl_down = need_arp_down = B_FALSE; 9783 if (ipif->ipif_flags & IPIF_DUPLICATE) { 9784 need_arp_down = !need_up; 9785 ipif->ipif_flags &= ~IPIF_DUPLICATE; 9786 if (--ill->ill_ipif_dup_count == 0 && !need_up && 9787 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) { 9788 need_dl_down = B_TRUE; 9789 } 9790 } 9791 9792 /* 9793 * If we've just manually set the IPv6 destination link-local address 9794 * (0th ipif), tag the ill so that future updates to the destination 9795 * interface ID (as can happen with interfaces over IP tunnels) don't 9796 * result in this address getting automatically reconfigured from 9797 * under the administrator. 9798 */ 9799 if (ipif->ipif_isv6 && ipif->ipif_id == 0) 9800 ill->ill_manual_dst_linklocal = 1; 9801 9802 /* Set the new address. */ 9803 ipif->ipif_v6pp_dst_addr = v6addr; 9804 /* Make sure subnet tracks pp_dst */ 9805 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 9806 mutex_exit(&ill->ill_lock); 9807 9808 if (need_up) { 9809 /* 9810 * Now bring the interface back up. If this 9811 * is the only IPIF for the ILL, ipif_up 9812 * will have to re-bind to the device, so 9813 * we may get back EINPROGRESS, in which 9814 * case, this IOCTL will get completed in 9815 * ip_rput_dlpi when we see the DL_BIND_ACK. 9816 */ 9817 err = ipif_up(ipif, q, mp); 9818 } 9819 9820 if (need_dl_down) 9821 ill_dl_down(ill); 9822 if (need_arp_down && !ipif->ipif_isv6) 9823 (void) ipif_arp_down(ipif); 9824 9825 return (err); 9826 } 9827 9828 /* 9829 * Restart entry point to restart the dst address set operation after the 9830 * refcounts have dropped to zero. 9831 */ 9832 /* ARGSUSED */ 9833 int 9834 ip_sioctl_dstaddr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9835 ip_ioctl_cmd_t *ipip, void *ifreq) 9836 { 9837 ip1dbg(("ip_sioctl_dstaddr_restart(%s:%u %p)\n", 9838 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9839 (void) ipif_down_tail(ipif); 9840 return (ip_sioctl_dstaddr_tail(ipif, sin, q, mp, B_TRUE)); 9841 } 9842 9843 /* ARGSUSED */ 9844 int 9845 ip_sioctl_get_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9846 ip_ioctl_cmd_t *ipip, void *if_req) 9847 { 9848 sin6_t *sin6 = (struct sockaddr_in6 *)sin; 9849 9850 ip1dbg(("ip_sioctl_get_dstaddr(%s:%u %p)\n", 9851 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9852 /* 9853 * Get point to point destination address. The addresses can't 9854 * change since we hold a reference to the ipif. 9855 */ 9856 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) 9857 return (EADDRNOTAVAIL); 9858 9859 if (ipif->ipif_isv6) { 9860 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 9861 *sin6 = sin6_null; 9862 sin6->sin6_family = AF_INET6; 9863 sin6->sin6_addr = ipif->ipif_v6pp_dst_addr; 9864 } else { 9865 *sin = sin_null; 9866 sin->sin_family = AF_INET; 9867 sin->sin_addr.s_addr = ipif->ipif_pp_dst_addr; 9868 } 9869 return (0); 9870 } 9871 9872 /* 9873 * Check which flags will change when the given flags are set, and 9874 * silently ignore flags which userland is not allowed to control. 9875 * (Because these flags may change between SIOCGLIFFLAGS and 9876 * SIOCSLIFFLAGS, and that's outside of userland's control, 9877 * we need to silently ignore them rather than fail.)
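 * For example, if the current flags include IFF_UP and the caller
 * passes IFF_DEPRECATED with IFF_UP cleared, then below turn_on ends
 * up as IFF_DEPRECATED and turn_off as IFF_UP: the first xor finds the
 * changeable bits that differ, turn_off is the subset of those that
 * are currently set, and the final xor removes turn_off from turn_on.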
9878 */ 9879 static void 9880 ip_sioctl_flags_onoff(ipif_t *ipif, uint64_t flags, uint64_t *onp, 9881 uint64_t *offp) 9882 { 9883 ill_t *ill = ipif->ipif_ill; 9884 phyint_t *phyi = ill->ill_phyint; 9885 uint64_t cantchange_flags, intf_flags; 9886 uint64_t turn_on, turn_off; 9887 9888 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags; 9889 cantchange_flags = IFF_CANTCHANGE; 9890 if (IS_IPMP(ill)) 9891 cantchange_flags |= IFF_IPMP_CANTCHANGE; 9892 turn_on = (flags ^ intf_flags) & ~cantchange_flags; 9893 turn_off = intf_flags & turn_on; 9894 turn_on ^= turn_off; 9895 *onp = turn_on; 9896 *offp = turn_off; 9897 } 9898 9899 /* 9900 * Set interface flags. Many flags require special handling (e.g., 9901 * bringing the interface down); see below for details. 9902 * 9903 * NOTE: We really don't enforce that ipif_id zero should be used 9904 * for setting any flags other than IFF_LOGINT_FLAGS. This 9905 * is because applications generally do SIOCGLIFFLAGS and 9906 * OR in the new flags (those that affect the logical) and then do a 9907 * SIOCSLIFFLAGS. Thus, "flags" below could contain bits other 9908 * than IFF_LOGINT_FLAGS. One could check whether "turn_on" - the 9909 * flags that will be turned on - is correct with respect to 9910 * ipif_id 0. For backward compatibility reasons, it is not done. 9911 */ 9912 /* ARGSUSED */ 9913 int 9914 ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9915 ip_ioctl_cmd_t *ipip, void *if_req) 9916 { 9917 uint64_t turn_on; 9918 uint64_t turn_off; 9919 int err = 0; 9920 phyint_t *phyi; 9921 ill_t *ill; 9922 conn_t *connp; 9923 uint64_t intf_flags; 9924 boolean_t phyint_flags_modified = B_FALSE; 9925 uint64_t flags; 9926 struct ifreq *ifr; 9927 struct lifreq *lifr; 9928 boolean_t set_linklocal = B_FALSE; 9929 9930 ip1dbg(("ip_sioctl_flags(%s:%u %p)\n", 9931 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9932 9933 ASSERT(IAM_WRITER_IPIF(ipif)); 9934 9935 ill = ipif->ipif_ill; 9936 phyi = ill->ill_phyint; 9937 9938 if (ipip->ipi_cmd_type == IF_CMD) { 9939 ifr = (struct ifreq *)if_req; 9940 flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff); 9941 } else { 9942 lifr = (struct lifreq *)if_req; 9943 flags = lifr->lifr_flags; 9944 } 9945 9946 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags; 9947 9948 /* 9949 * Have the flags been set correctly until now? 9950 */ 9951 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0); 9952 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0); 9953 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0); 9954 /* 9955 * Compare the new flags to the old, and partition 9956 * into those coming on and those going off. 9957 * For the 16 bit command keep the bits above bit 16 unchanged. 9958 */ 9959 if (ipip->ipi_cmd == SIOCSIFFLAGS) 9960 flags |= intf_flags & ~0xFFFF; 9961 9962 /* 9963 * Explicitly fail attempts to change flags that are always invalid on 9964 * an IPMP meta-interface. 9965 */ 9966 if (IS_IPMP(ill) && ((flags ^ intf_flags) & IFF_IPMP_INVALID)) 9967 return (EINVAL); 9968 9969 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off); 9970 if ((turn_on|turn_off) == 0) 9971 return (0); /* No change */ 9972 9973 /* 9974 * All test addresses must be IFF_DEPRECATED (to ensure source address 9975 * selection avoids them) -- so force IFF_DEPRECATED on, and do not 9976 * allow it to be turned off.
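 * (Test addresses are the IFF_NOFAILOVER addresses that in.mpathd uses
 * for probe traffic; the check below enforces the invariant.)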
9977 */ 9978 if ((turn_off & (IFF_DEPRECATED|IFF_NOFAILOVER)) == IFF_DEPRECATED && 9979 (turn_on|intf_flags) & IFF_NOFAILOVER) 9980 return (EINVAL); 9981 9982 if ((connp = Q_TO_CONN(q)) == NULL) 9983 return (EINVAL); 9984 9985 /* 9986 * Only vrrp control socket is allowed to change IFF_UP and 9987 * IFF_NOACCEPT flags when IFF_VRRP is set. 9988 */ 9989 if ((intf_flags & IFF_VRRP) && ((turn_off | turn_on) & IFF_UP)) { 9990 if (!connp->conn_isvrrp) 9991 return (EINVAL); 9992 } 9993 9994 /* 9995 * The IFF_NOACCEPT flag can only be set on an IFF_VRRP IP address by 9996 * VRRP control socket. 9997 */ 9998 if ((turn_off | turn_on) & IFF_NOACCEPT) { 9999 if (!connp->conn_isvrrp || !(intf_flags & IFF_VRRP)) 10000 return (EINVAL); 10001 } 10002 10003 if (turn_on & IFF_NOFAILOVER) { 10004 turn_on |= IFF_DEPRECATED; 10005 flags |= IFF_DEPRECATED; 10006 } 10007 10008 /* 10009 * On underlying interfaces, only allow applications to manage test 10010 * addresses -- otherwise, they may get confused when the address 10011 * moves as part of being brought up. Likewise, prevent an 10012 * application-managed test address from being converted to a data 10013 * address. To prevent migration of administratively up addresses in 10014 * the kernel, we don't allow them to be converted either. 10015 */ 10016 if (IS_UNDER_IPMP(ill)) { 10017 const uint64_t appflags = IFF_DHCPRUNNING | IFF_ADDRCONF; 10018 10019 if ((turn_on & appflags) && !(flags & IFF_NOFAILOVER)) 10020 return (EINVAL); 10021 10022 if ((turn_off & IFF_NOFAILOVER) && 10023 (flags & (appflags | IFF_UP | IFF_DUPLICATE))) 10024 return (EINVAL); 10025 } 10026 10027 /* 10028 * Only allow IFF_TEMPORARY flag to be set on 10029 * IPv6 interfaces. 10030 */ 10031 if ((turn_on & IFF_TEMPORARY) && !(ipif->ipif_isv6)) 10032 return (EINVAL); 10033 10034 /* 10035 * cannot turn off IFF_NOXMIT on VNI interfaces. 10036 */ 10037 if ((turn_off & IFF_NOXMIT) && IS_VNI(ipif->ipif_ill)) 10038 return (EINVAL); 10039 10040 /* 10041 * Don't allow the IFF_ROUTER flag to be turned on on loopback 10042 * interfaces. It makes no sense in that context. 10043 */ 10044 if ((turn_on & IFF_ROUTER) && (phyi->phyint_flags & PHYI_LOOPBACK)) 10045 return (EINVAL); 10046 10047 /* 10048 * For IPv6 ipif_id 0, don't allow the interface to be up without 10049 * a link local address if IFF_NOLOCAL or IFF_ANYCAST are not set. 10050 * If the link local address isn't set, and can be set, it will get 10051 * set later on in this function. 10052 */ 10053 if (ipif->ipif_id == 0 && ipif->ipif_isv6 && 10054 (flags & IFF_UP) && !(flags & (IFF_NOLOCAL|IFF_ANYCAST)) && 10055 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) { 10056 if (ipif_cant_setlinklocal(ipif)) 10057 return (EINVAL); 10058 set_linklocal = B_TRUE; 10059 } 10060 10061 /* 10062 * If we modify physical interface flags, we'll potentially need to 10063 * send up two routing socket messages for the changes (one for the 10064 * IPv4 ill, and another for the IPv6 ill). Note that here. 10065 */ 10066 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS) 10067 phyint_flags_modified = B_TRUE; 10068 10069 /* 10070 * All functioning PHYI_STANDBY interfaces start life PHYI_INACTIVE 10071 * (otherwise, we'd immediately use them, defeating standby). Also, 10072 * since PHYI_INACTIVE has a separate meaning when PHYI_STANDBY is not 10073 * set, don't allow PHYI_STANDBY to be set if PHYI_INACTIVE is already 10074 * set, and clear PHYI_INACTIVE if PHYI_STANDBY is being cleared. 
We 10075 * also don't allow PHYI_STANDBY if VNI is enabled since its semantics 10076 * will not be honored. 10077 */ 10078 if (turn_on & PHYI_STANDBY) { 10079 /* 10080 * No need to grab ill_g_usesrc_lock here; see the 10081 * synchronization notes in ip.c. 10082 */ 10083 if (ill->ill_usesrc_grp_next != NULL || 10084 intf_flags & PHYI_INACTIVE) 10085 return (EINVAL); 10086 if (!(flags & PHYI_FAILED)) { 10087 flags |= PHYI_INACTIVE; 10088 turn_on |= PHYI_INACTIVE; 10089 } 10090 } 10091 10092 if (turn_off & PHYI_STANDBY) { 10093 flags &= ~PHYI_INACTIVE; 10094 turn_off |= PHYI_INACTIVE; 10095 } 10096 10097 /* 10098 * PHYI_FAILED and PHYI_INACTIVE are mutually exclusive; fail if both 10099 * would end up on. 10100 */ 10101 if ((flags & (PHYI_FAILED | PHYI_INACTIVE)) == 10102 (PHYI_FAILED | PHYI_INACTIVE)) 10103 return (EINVAL); 10104 10105 /* 10106 * If ILLF_ROUTER changes, we need to change the ip forwarding 10107 * status of the interface. 10108 */ 10109 if ((turn_on | turn_off) & ILLF_ROUTER) 10110 (void) ill_forward_set(ill, ((turn_on & ILLF_ROUTER) != 0)); 10111 10112 /* 10113 * If the interface is not UP and we are not going to 10114 * bring it UP, record the flags and return. When the 10115 * interface comes UP later, the right actions will be 10116 * taken. 10117 */ 10118 if (!(ipif->ipif_flags & IPIF_UP) && 10119 !(turn_on & IPIF_UP)) { 10120 /* Record new flags in their respective places. */ 10121 mutex_enter(&ill->ill_lock); 10122 mutex_enter(&ill->ill_phyint->phyint_lock); 10123 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS); 10124 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS); 10125 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS); 10126 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS); 10127 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS); 10128 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS); 10129 mutex_exit(&ill->ill_lock); 10130 mutex_exit(&ill->ill_phyint->phyint_lock); 10131 10132 /* 10133 * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the 10134 * same to the kernel: if any of them has been set by 10135 * userland, the interface cannot be used for data traffic. 10136 */ 10137 if ((turn_on|turn_off) & 10138 (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) { 10139 ASSERT(!IS_IPMP(ill)); 10140 /* 10141 * It's possible the ill is part of an "anonymous" 10142 * IPMP group rather than a real group. In that case, 10143 * there are no other interfaces in the group and thus 10144 * no need to call ipmp_phyint_refresh_active(). 10145 */ 10146 if (IS_UNDER_IPMP(ill)) 10147 ipmp_phyint_refresh_active(phyi); 10148 } 10149 10150 if (phyint_flags_modified) { 10151 if (phyi->phyint_illv4 != NULL) { 10152 ip_rts_ifmsg(phyi->phyint_illv4-> 10153 ill_ipif, RTSQ_DEFAULT); 10154 } 10155 if (phyi->phyint_illv6 != NULL) { 10156 ip_rts_ifmsg(phyi->phyint_illv6-> 10157 ill_ipif, RTSQ_DEFAULT); 10158 } 10159 } 10160 /* The default multicast interface might have changed */ 10161 ire_increment_multicast_generation(ill->ill_ipst, 10162 ill->ill_isv6); 10163 10164 return (0); 10165 } else if (set_linklocal) { 10166 mutex_enter(&ill->ill_lock); 10167 if (set_linklocal) 10168 ipif->ipif_state_flags |= IPIF_SET_LINKLOCAL; 10169 mutex_exit(&ill->ill_lock); 10170 } 10171 10172 /* 10173 * Disallow IPv6 interfaces coming up that have the unspecified address, 10174 * or point-to-point interfaces with an unspecified destination. 
We do 10175 * allow the address to be unspecified for IPIF_NOLOCAL interfaces that 10176 * have a subnet assigned, which is how in.ndpd currently manages its 10177 * onlink prefix list when no addresses are configured with those 10178 * prefixes. 10179 */ 10180 if (ipif->ipif_isv6 && 10181 ((IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) && 10182 (!(ipif->ipif_flags & IPIF_NOLOCAL) && !(turn_on & IPIF_NOLOCAL) || 10183 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) || 10184 ((ipif->ipif_flags & IPIF_POINTOPOINT) && 10185 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr)))) { 10186 return (EINVAL); 10187 } 10188 10189 /* 10190 * Prevent IPv4 point-to-point interfaces with a 0.0.0.0 destination 10191 * from being brought up. 10192 */ 10193 if (!ipif->ipif_isv6 && 10194 ((ipif->ipif_flags & IPIF_POINTOPOINT) && 10195 ipif->ipif_pp_dst_addr == INADDR_ANY)) { 10196 return (EINVAL); 10197 } 10198 10199 /* 10200 * If we are going to change one or more of the flags that are 10201 * IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL, ILLF_NOARP, 10202 * ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, IPIF_PREFERRED, and 10203 * IPIF_NOFAILOVER, we will take special action. This is 10204 * done by bring the ipif down, changing the flags and bringing 10205 * it back up again. For IPIF_NOFAILOVER, the act of bringing it 10206 * back up will trigger the address to be moved. 10207 * 10208 * If we are going to change IFF_NOACCEPT, we need to bring 10209 * all the ipifs down then bring them up again. The act of 10210 * bringing all the ipifs back up will trigger the local 10211 * ires being recreated with "no_accept" set/cleared. 10212 * 10213 * Note that ILLF_NOACCEPT is always set separately from the 10214 * other flags. 10215 */ 10216 if ((turn_on|turn_off) & 10217 (IPIF_UP|IPIF_DEPRECATED|IPIF_NOXMIT|IPIF_NOLOCAL|ILLF_NOARP| 10218 ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED| 10219 IPIF_NOFAILOVER)) { 10220 /* 10221 * ipif_down() will ire_delete bcast ire's for the subnet, 10222 * while the ire_identical_ref tracks the case of IRE_BROADCAST 10223 * entries shared between multiple ipifs on the same subnet. 10224 */ 10225 if (((ipif->ipif_flags | turn_on) & IPIF_UP) && 10226 !(turn_off & IPIF_UP)) { 10227 if (ipif->ipif_flags & IPIF_UP) 10228 ill->ill_logical_down = 1; 10229 turn_on &= ~IPIF_UP; 10230 } 10231 err = ipif_down(ipif, q, mp); 10232 ip1dbg(("ipif_down returns %d err ", err)); 10233 if (err == EINPROGRESS) 10234 return (err); 10235 (void) ipif_down_tail(ipif); 10236 } else if ((turn_on|turn_off) & ILLF_NOACCEPT) { 10237 /* 10238 * If we can quiesce the ill, then continue. If not, then 10239 * ip_sioctl_flags_tail() will be called from 10240 * ipif_ill_refrele_tail(). 10241 */ 10242 ill_down_ipifs(ill, B_TRUE); 10243 10244 mutex_enter(&connp->conn_lock); 10245 mutex_enter(&ill->ill_lock); 10246 if (!ill_is_quiescent(ill)) { 10247 boolean_t success; 10248 10249 success = ipsq_pending_mp_add(connp, ill->ill_ipif, 10250 q, mp, ILL_DOWN); 10251 mutex_exit(&ill->ill_lock); 10252 mutex_exit(&connp->conn_lock); 10253 return (success ? 
EINPROGRESS : EINTR); 10254 } 10255 mutex_exit(&ill->ill_lock); 10256 mutex_exit(&connp->conn_lock); 10257 } 10258 return (ip_sioctl_flags_tail(ipif, flags, q, mp)); 10259 } 10260 10261 static int 10262 ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp) 10263 { 10264 ill_t *ill; 10265 phyint_t *phyi; 10266 uint64_t turn_on, turn_off; 10267 boolean_t phyint_flags_modified = B_FALSE; 10268 int err = 0; 10269 boolean_t set_linklocal = B_FALSE; 10270 10271 ip1dbg(("ip_sioctl_flags_tail(%s:%u)\n", 10272 ipif->ipif_ill->ill_name, ipif->ipif_id)); 10273 10274 ASSERT(IAM_WRITER_IPIF(ipif)); 10275 10276 ill = ipif->ipif_ill; 10277 phyi = ill->ill_phyint; 10278 10279 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off); 10280 10281 /* 10282 * IFF_UP is handled separately. 10283 */ 10284 turn_on &= ~IFF_UP; 10285 turn_off &= ~IFF_UP; 10286 10287 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS) 10288 phyint_flags_modified = B_TRUE; 10289 10290 /* 10291 * Now we change the flags. Track current value of 10292 * other flags in their respective places. 10293 */ 10294 mutex_enter(&ill->ill_lock); 10295 mutex_enter(&phyi->phyint_lock); 10296 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS); 10297 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS); 10298 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS); 10299 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS); 10300 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS); 10301 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS); 10302 if (ipif->ipif_state_flags & IPIF_SET_LINKLOCAL) { 10303 set_linklocal = B_TRUE; 10304 ipif->ipif_state_flags &= ~IPIF_SET_LINKLOCAL; 10305 } 10306 10307 mutex_exit(&ill->ill_lock); 10308 mutex_exit(&phyi->phyint_lock); 10309 10310 if (set_linklocal) 10311 (void) ipif_setlinklocal(ipif); 10312 10313 /* 10314 * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the same to 10315 * the kernel: if any of them has been set by userland, the interface 10316 * cannot be used for data traffic. 10317 */ 10318 if ((turn_on|turn_off) & (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) { 10319 ASSERT(!IS_IPMP(ill)); 10320 /* 10321 * It's possible the ill is part of an "anonymous" IPMP group 10322 * rather than a real group. In that case, there are no other 10323 * interfaces in the group and thus no need for us to call 10324 * ipmp_phyint_refresh_active(). 10325 */ 10326 if (IS_UNDER_IPMP(ill)) 10327 ipmp_phyint_refresh_active(phyi); 10328 } 10329 10330 if ((turn_on|turn_off) & ILLF_NOACCEPT) { 10331 /* 10332 * If the ILLF_NOACCEPT flag is changed, bring up all the 10333 * ipifs that were brought down. 10334 * 10335 * The routing sockets messages are sent as the result 10336 * of ill_up_ipifs(), further, SCTP's IPIF list was updated 10337 * as well. 10338 */ 10339 err = ill_up_ipifs(ill, q, mp); 10340 } else if ((flags & IFF_UP) && !(ipif->ipif_flags & IPIF_UP)) { 10341 /* 10342 * XXX ipif_up really does not know whether a phyint flags 10343 * was modified or not. So, it sends up information on 10344 * only one routing sockets message. As we don't bring up 10345 * the interface and also set PHYI_ flags simultaneously 10346 * it should be okay. 10347 */ 10348 err = ipif_up(ipif, q, mp); 10349 } else { 10350 /* 10351 * Make sure routing socket sees all changes to the flags. 10352 * ipif_up_done* handles this when we use ipif_up. 
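 * Since we did not go through ipif_up on this path, send them here
 * directly: on both ills if a phyint flag changed, else just for
 * this ipif.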
10353 */ 10354 if (phyint_flags_modified) { 10355 if (phyi->phyint_illv4 != NULL) { 10356 ip_rts_ifmsg(phyi->phyint_illv4-> 10357 ill_ipif, RTSQ_DEFAULT); 10358 } 10359 if (phyi->phyint_illv6 != NULL) { 10360 ip_rts_ifmsg(phyi->phyint_illv6-> 10361 ill_ipif, RTSQ_DEFAULT); 10362 } 10363 } else { 10364 ip_rts_ifmsg(ipif, RTSQ_DEFAULT); 10365 } 10366 /* 10367 * Update the flags in SCTP's IPIF list; ipif_up() will do 10368 * this in the need_up case. 10369 */ 10370 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 10371 } 10372 10373 /* The default multicast interface might have changed */ 10374 ire_increment_multicast_generation(ill->ill_ipst, ill->ill_isv6); 10375 return (err); 10376 } 10377 10378 /* 10379 * Restart the flags operation now that the refcounts have dropped to zero. 10380 */ 10381 /* ARGSUSED */ 10382 int 10383 ip_sioctl_flags_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10384 ip_ioctl_cmd_t *ipip, void *if_req) 10385 { 10386 uint64_t flags; 10387 struct ifreq *ifr = if_req; 10388 struct lifreq *lifr = if_req; 10389 uint64_t turn_on, turn_off; 10390 10391 ip1dbg(("ip_sioctl_flags_restart(%s:%u %p)\n", 10392 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10393 10394 if (ipip->ipi_cmd_type == IF_CMD) { 10395 /* cast to uint16_t prevents unwanted sign extension */ 10396 flags = (uint16_t)ifr->ifr_flags; 10397 } else { 10398 flags = lifr->lifr_flags; 10399 } 10400 10401 /* 10402 * If this function call is a result of the ILLF_NOACCEPT flag 10403 * change, do not call ipif_down_tail(). See ip_sioctl_flags(). 10404 */ 10405 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off); 10406 if (!((turn_on|turn_off) & ILLF_NOACCEPT)) 10407 (void) ipif_down_tail(ipif); 10408 10409 return (ip_sioctl_flags_tail(ipif, flags, q, mp)); 10410 } 10411 10412 /* 10413 * Can operate on either a module or a driver queue. 10414 */ 10415 /* ARGSUSED */ 10416 int 10417 ip_sioctl_get_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10418 ip_ioctl_cmd_t *ipip, void *if_req) 10419 { 10420 /* 10421 * Have the flags been set correctly till now? 10422 */ 10423 ill_t *ill = ipif->ipif_ill; 10424 phyint_t *phyi = ill->ill_phyint; 10425 10426 ip1dbg(("ip_sioctl_get_flags(%s:%u %p)\n", 10427 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10428 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0); 10429 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0); 10430 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0); 10431 10432 /* 10433 * Need a lock since some flags can be set even when there are 10434 * references to the ipif. 10435 */ 10436 mutex_enter(&ill->ill_lock); 10437 if (ipip->ipi_cmd_type == IF_CMD) { 10438 struct ifreq *ifr = (struct ifreq *)if_req; 10439 10440 /* Get interface flags (low 16 only). */ 10441 ifr->ifr_flags = ((ipif->ipif_flags | 10442 ill->ill_flags | phyi->phyint_flags) & 0xffff); 10443 } else { 10444 struct lifreq *lifr = (struct lifreq *)if_req; 10445 10446 /* Get interface flags. */ 10447 lifr->lifr_flags = ipif->ipif_flags | 10448 ill->ill_flags | phyi->phyint_flags; 10449 } 10450 mutex_exit(&ill->ill_lock); 10451 return (0); 10452 } 10453 10454 /* 10455 * We allow the MTU to be set on an ILL, but not have it be different 10456 * for different IPIFs since we don't actually send packets on IPIFs.
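 * The value therefore lives on the ill (ill_mtu), and a set on any
 * logical instance other than zero fails with EINVAL below.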
10457 */ 10458 /* ARGSUSED */ 10459 int 10460 ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10461 ip_ioctl_cmd_t *ipip, void *if_req) 10462 { 10463 int mtu; 10464 int ip_min_mtu; 10465 struct ifreq *ifr; 10466 struct lifreq *lifr; 10467 ill_t *ill; 10468 10469 ip1dbg(("ip_sioctl_mtu(%s:%u %p)\n", ipif->ipif_ill->ill_name, 10470 ipif->ipif_id, (void *)ipif)); 10471 if (ipip->ipi_cmd_type == IF_CMD) { 10472 ifr = (struct ifreq *)if_req; 10473 mtu = ifr->ifr_metric; 10474 } else { 10475 lifr = (struct lifreq *)if_req; 10476 mtu = lifr->lifr_mtu; 10477 } 10478 /* Only allow for logical unit zero i.e. not on "bge0:17" */ 10479 if (ipif->ipif_id != 0) 10480 return (EINVAL); 10481 10482 ill = ipif->ipif_ill; 10483 if (ipif->ipif_isv6) 10484 ip_min_mtu = IPV6_MIN_MTU; 10485 else 10486 ip_min_mtu = IP_MIN_MTU; 10487 10488 mutex_enter(&ill->ill_lock); 10489 if (mtu > ill->ill_max_frag || mtu < ip_min_mtu) { 10490 mutex_exit(&ill->ill_lock); 10491 return (EINVAL); 10492 } 10493 /* 10494 * The dce and fragmentation code can handle changes to ill_mtu 10495 * concurrent with sending/fragmenting packets. 10496 */ 10497 ill->ill_mtu = mtu; 10498 ill->ill_flags |= ILLF_FIXEDMTU; 10499 mutex_exit(&ill->ill_lock); 10500 10501 /* 10502 * Make sure all dce_generation checks find out 10503 * that ill_mtu has changed. 10504 */ 10505 dce_increment_all_generations(ill->ill_isv6, ill->ill_ipst); 10506 10507 /* Update the MTU in SCTP's list */ 10508 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 10509 return (0); 10510 } 10511 10512 /* Get interface MTU. */ 10513 /* ARGSUSED */ 10514 int 10515 ip_sioctl_get_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10516 ip_ioctl_cmd_t *ipip, void *if_req) 10517 { 10518 struct ifreq *ifr; 10519 struct lifreq *lifr; 10520 10521 ip1dbg(("ip_sioctl_get_mtu(%s:%u %p)\n", 10522 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10523 10524 /* 10525 * We allow a get on any logical interface even though the set 10526 * can only be done on logical unit 0. 10527 */ 10528 if (ipip->ipi_cmd_type == IF_CMD) { 10529 ifr = (struct ifreq *)if_req; 10530 ifr->ifr_metric = ipif->ipif_ill->ill_mtu; 10531 } else { 10532 lifr = (struct lifreq *)if_req; 10533 lifr->lifr_mtu = ipif->ipif_ill->ill_mtu; 10534 } 10535 return (0); 10536 } 10537 10538 /* Set interface broadcast address. */ 10539 /* ARGSUSED2 */ 10540 int 10541 ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10542 ip_ioctl_cmd_t *ipip, void *if_req) 10543 { 10544 ipaddr_t addr; 10545 ire_t *ire; 10546 ill_t *ill = ipif->ipif_ill; 10547 ip_stack_t *ipst = ill->ill_ipst; 10548 10549 ip1dbg(("ip_sioctl_brdaddr(%s:%u)\n", ill->ill_name, 10550 ipif->ipif_id)); 10551 10552 ASSERT(IAM_WRITER_IPIF(ipif)); 10553 if (!(ipif->ipif_flags & IPIF_BROADCAST)) 10554 return (EADDRNOTAVAIL); 10555 10556 ASSERT(!(ipif->ipif_isv6)); /* No IPv6 broadcast */ 10557 10558 if (sin->sin_family != AF_INET) 10559 return (EAFNOSUPPORT); 10560 10561 addr = sin->sin_addr.s_addr; 10562 if (ipif->ipif_flags & IPIF_UP) { 10563 /* 10564 * If we are already up, make sure the new 10565 * broadcast address makes sense. If it does, 10566 * there should be an IRE for it already. 10567 */ 10568 ire = ire_ftable_lookup_v4(addr, 0, 0, IRE_BROADCAST, 10569 ill, ipif->ipif_zoneid, NULL, 10570 (MATCH_IRE_ILL | MATCH_IRE_TYPE), 0, ipst, NULL); 10571 if (ire == NULL) { 10572 return (EINVAL); 10573 } else { 10574 ire_refrele(ire); 10575 } 10576 } 10577 /* 10578 * Changing the broadcast addr for this ipif. 
Since the IRE_BROADCAST 10579 * needs to already exist we never need to change the set of 10580 * IRE_BROADCASTs when we are UP. 10581 */ 10582 if (addr != ipif->ipif_brd_addr) 10583 IN6_IPADDR_TO_V4MAPPED(addr, &ipif->ipif_v6brd_addr); 10584 10585 return (0); 10586 } 10587 10588 /* Get interface broadcast address. */ 10589 /* ARGSUSED */ 10590 int 10591 ip_sioctl_get_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10592 ip_ioctl_cmd_t *ipip, void *if_req) 10593 { 10594 ip1dbg(("ip_sioctl_get_brdaddr(%s:%u %p)\n", 10595 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10596 if (!(ipif->ipif_flags & IPIF_BROADCAST)) 10597 return (EADDRNOTAVAIL); 10598 10599 /* IPIF_BROADCAST not possible with IPv6 */ 10600 ASSERT(!ipif->ipif_isv6); 10601 *sin = sin_null; 10602 sin->sin_family = AF_INET; 10603 sin->sin_addr.s_addr = ipif->ipif_brd_addr; 10604 return (0); 10605 } 10606 10607 /* 10608 * This routine is called to handle the SIOCS*IFNETMASK IOCTL. 10609 */ 10610 /* ARGSUSED */ 10611 int 10612 ip_sioctl_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10613 ip_ioctl_cmd_t *ipip, void *if_req) 10614 { 10615 int err = 0; 10616 in6_addr_t v6mask; 10617 10618 ip1dbg(("ip_sioctl_netmask(%s:%u %p)\n", 10619 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10620 10621 ASSERT(IAM_WRITER_IPIF(ipif)); 10622 10623 if (ipif->ipif_isv6) { 10624 sin6_t *sin6; 10625 10626 if (sin->sin_family != AF_INET6) 10627 return (EAFNOSUPPORT); 10628 10629 sin6 = (sin6_t *)sin; 10630 v6mask = sin6->sin6_addr; 10631 } else { 10632 ipaddr_t mask; 10633 10634 if (sin->sin_family != AF_INET) 10635 return (EAFNOSUPPORT); 10636 10637 mask = sin->sin_addr.s_addr; 10638 V4MASK_TO_V6(mask, v6mask); 10639 } 10640 10641 /* 10642 * No big deal if the interface isn't already up, or the mask 10643 * isn't really changing, or this is pt-pt. 10644 */ 10645 if (!(ipif->ipif_flags & IPIF_UP) || 10646 IN6_ARE_ADDR_EQUAL(&v6mask, &ipif->ipif_v6net_mask) || 10647 (ipif->ipif_flags & IPIF_POINTOPOINT)) { 10648 ipif->ipif_v6net_mask = v6mask; 10649 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 10650 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 10651 ipif->ipif_v6net_mask, 10652 ipif->ipif_v6subnet); 10653 } 10654 return (0); 10655 } 10656 /* 10657 * Make sure we have valid net and subnet broadcast ire's 10658 * for the old netmask, if needed by other logical interfaces. 10659 */ 10660 err = ipif_logical_down(ipif, q, mp); 10661 if (err == EINPROGRESS) 10662 return (err); 10663 (void) ipif_down_tail(ipif); 10664 err = ip_sioctl_netmask_tail(ipif, sin, q, mp); 10665 return (err); 10666 } 10667 10668 static int 10669 ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp) 10670 { 10671 in6_addr_t v6mask; 10672 int err = 0; 10673 10674 ip1dbg(("ip_sioctl_netmask_tail(%s:%u %p)\n", 10675 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10676 10677 if (ipif->ipif_isv6) { 10678 sin6_t *sin6; 10679 10680 sin6 = (sin6_t *)sin; 10681 v6mask = sin6->sin6_addr; 10682 } else { 10683 ipaddr_t mask; 10684 10685 mask = sin->sin_addr.s_addr; 10686 V4MASK_TO_V6(mask, v6mask); 10687 } 10688 10689 ipif->ipif_v6net_mask = v6mask; 10690 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 10691 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 10692 ipif->ipif_v6subnet); 10693 } 10694 err = ipif_up(ipif, q, mp); 10695 10696 if (err == 0 || err == EINPROGRESS) { 10697 /* 10698 * The interface must be DL_BOUND if this packet has to 10699 * go out on the wire. 
Since we only go through a logical 10700 * down and stay bound to the driver during an internal 10701 * down/up, that is satisfied. 10702 */ 10703 if (!ipif->ipif_isv6 && ipif->ipif_ill->ill_wq != NULL) { 10704 /* Potentially broadcast an address mask reply. */ 10705 ipif_mask_reply(ipif); 10706 } 10707 } 10708 return (err); 10709 } 10710 10711 /* ARGSUSED */ 10712 int 10713 ip_sioctl_netmask_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10714 ip_ioctl_cmd_t *ipip, void *if_req) 10715 { 10716 ip1dbg(("ip_sioctl_netmask_restart(%s:%u %p)\n", 10717 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10718 (void) ipif_down_tail(ipif); 10719 return (ip_sioctl_netmask_tail(ipif, sin, q, mp)); 10720 } 10721 10722 /* Get interface net mask. */ 10723 /* ARGSUSED */ 10724 int 10725 ip_sioctl_get_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10726 ip_ioctl_cmd_t *ipip, void *if_req) 10727 { 10728 struct lifreq *lifr = (struct lifreq *)if_req; 10729 struct sockaddr_in6 *sin6 = (sin6_t *)sin; 10730 10731 ip1dbg(("ip_sioctl_get_netmask(%s:%u %p)\n", 10732 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10733 10734 /* 10735 * The net mask can't change since we have a reference to the ipif. 10736 */ 10737 if (ipif->ipif_isv6) { 10738 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 10739 *sin6 = sin6_null; 10740 sin6->sin6_family = AF_INET6; 10741 sin6->sin6_addr = ipif->ipif_v6net_mask; 10742 lifr->lifr_addrlen = 10743 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 10744 } else { 10745 *sin = sin_null; 10746 sin->sin_family = AF_INET; 10747 sin->sin_addr.s_addr = ipif->ipif_net_mask; 10748 if (ipip->ipi_cmd_type == LIF_CMD) { 10749 lifr->lifr_addrlen = 10750 ip_mask_to_plen(ipif->ipif_net_mask); 10751 } 10752 } 10753 return (0); 10754 } 10755 10756 /* ARGSUSED */ 10757 int 10758 ip_sioctl_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10759 ip_ioctl_cmd_t *ipip, void *if_req) 10760 { 10761 ip1dbg(("ip_sioctl_metric(%s:%u %p)\n", 10762 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10763 10764 /* 10765 * Since no applications should ever be setting metrics on underlying 10766 * interfaces, we explicitly fail to smoke 'em out. 10767 */ 10768 if (IS_UNDER_IPMP(ipif->ipif_ill)) 10769 return (EINVAL); 10770 10771 /* 10772 * Set interface metric. We don't use this for 10773 * anything but we keep track of it in case it is 10774 * important to routing applications or such. 10775 */ 10776 if (ipip->ipi_cmd_type == IF_CMD) { 10777 struct ifreq *ifr; 10778 10779 ifr = (struct ifreq *)if_req; 10780 ipif->ipif_metric = ifr->ifr_metric; 10781 } else { 10782 struct lifreq *lifr; 10783 10784 lifr = (struct lifreq *)if_req; 10785 ipif->ipif_metric = lifr->lifr_metric; 10786 } 10787 return (0); 10788 } 10789 10790 /* ARGSUSED */ 10791 int 10792 ip_sioctl_get_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10793 ip_ioctl_cmd_t *ipip, void *if_req) 10794 { 10795 /* Get interface metric.
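 * Like the set side, this is pure bookkeeping; IP itself does not
 * consult ipif_metric when making forwarding decisions.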
*/ 10796 ip1dbg(("ip_sioctl_get_metric(%s:%u %p)\n", 10797 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10798 10799 if (ipip->ipi_cmd_type == IF_CMD) { 10800 struct ifreq *ifr; 10801 10802 ifr = (struct ifreq *)if_req; 10803 ifr->ifr_metric = ipif->ipif_metric; 10804 } else { 10805 struct lifreq *lifr; 10806 10807 lifr = (struct lifreq *)if_req; 10808 lifr->lifr_metric = ipif->ipif_metric; 10809 } 10810 10811 return (0); 10812 } 10813 10814 /* ARGSUSED */ 10815 int 10816 ip_sioctl_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10817 ip_ioctl_cmd_t *ipip, void *if_req) 10818 { 10819 int arp_muxid; 10820 10821 ip1dbg(("ip_sioctl_muxid(%s:%u %p)\n", 10822 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10823 /* 10824 * Set the muxid returned from I_PLINK. 10825 */ 10826 if (ipip->ipi_cmd_type == IF_CMD) { 10827 struct ifreq *ifr = (struct ifreq *)if_req; 10828 10829 ipif->ipif_ill->ill_muxid = ifr->ifr_ip_muxid; 10830 arp_muxid = ifr->ifr_arp_muxid; 10831 } else { 10832 struct lifreq *lifr = (struct lifreq *)if_req; 10833 10834 ipif->ipif_ill->ill_muxid = lifr->lifr_ip_muxid; 10835 arp_muxid = lifr->lifr_arp_muxid; 10836 } 10837 arl_set_muxid(ipif->ipif_ill, arp_muxid); 10838 return (0); 10839 } 10840 10841 /* ARGSUSED */ 10842 int 10843 ip_sioctl_get_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10844 ip_ioctl_cmd_t *ipip, void *if_req) 10845 { 10846 int arp_muxid = 0; 10847 10848 ip1dbg(("ip_sioctl_get_muxid(%s:%u %p)\n", 10849 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10850 /* 10851 * Get the muxid saved in ill for I_PUNLINK. 10852 */ 10853 arp_muxid = arl_get_muxid(ipif->ipif_ill); 10854 if (ipip->ipi_cmd_type == IF_CMD) { 10855 struct ifreq *ifr = (struct ifreq *)if_req; 10856 10857 ifr->ifr_ip_muxid = ipif->ipif_ill->ill_muxid; 10858 ifr->ifr_arp_muxid = arp_muxid; 10859 } else { 10860 struct lifreq *lifr = (struct lifreq *)if_req; 10861 10862 lifr->lifr_ip_muxid = ipif->ipif_ill->ill_muxid; 10863 lifr->lifr_arp_muxid = arp_muxid; 10864 } 10865 return (0); 10866 } 10867 10868 /* 10869 * Set the subnet prefix. Does not modify the broadcast address. 
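 * (For IPv4 the subnet is stored v4-mapped, so the supplied prefix
 * length is extended by 96 bits below before the mask is built.)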
10870 */ 10871 /* ARGSUSED */ 10872 int 10873 ip_sioctl_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10874 ip_ioctl_cmd_t *ipip, void *if_req) 10875 { 10876 int err = 0; 10877 in6_addr_t v6addr; 10878 in6_addr_t v6mask; 10879 boolean_t need_up = B_FALSE; 10880 int addrlen; 10881 10882 ip1dbg(("ip_sioctl_subnet(%s:%u %p)\n", 10883 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10884 10885 ASSERT(IAM_WRITER_IPIF(ipif)); 10886 addrlen = ((struct lifreq *)if_req)->lifr_addrlen; 10887 10888 if (ipif->ipif_isv6) { 10889 sin6_t *sin6; 10890 10891 if (sin->sin_family != AF_INET6) 10892 return (EAFNOSUPPORT); 10893 10894 sin6 = (sin6_t *)sin; 10895 v6addr = sin6->sin6_addr; 10896 if (!ip_remote_addr_ok_v6(&v6addr, &ipv6_all_ones)) 10897 return (EADDRNOTAVAIL); 10898 } else { 10899 ipaddr_t addr; 10900 10901 if (sin->sin_family != AF_INET) 10902 return (EAFNOSUPPORT); 10903 10904 addr = sin->sin_addr.s_addr; 10905 if (!ip_addr_ok_v4(addr, 0xFFFFFFFF)) 10906 return (EADDRNOTAVAIL); 10907 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 10908 /* Add 96 bits */ 10909 addrlen += IPV6_ABITS - IP_ABITS; 10910 } 10911 10912 if (ip_plen_to_mask_v6(addrlen, &v6mask) == NULL) 10913 return (EINVAL); 10914 10915 /* Check if bits in the address is set past the mask */ 10916 if (!V6_MASK_EQ(v6addr, v6mask, v6addr)) 10917 return (EINVAL); 10918 10919 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6subnet, &v6addr) && 10920 IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6net_mask, &v6mask)) 10921 return (0); /* No change */ 10922 10923 if (ipif->ipif_flags & IPIF_UP) { 10924 /* 10925 * If the interface is already marked up, 10926 * we call ipif_down which will take care 10927 * of ditching any IREs that have been set 10928 * up based on the old interface address. 10929 */ 10930 err = ipif_logical_down(ipif, q, mp); 10931 if (err == EINPROGRESS) 10932 return (err); 10933 (void) ipif_down_tail(ipif); 10934 need_up = B_TRUE; 10935 } 10936 10937 err = ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, need_up); 10938 return (err); 10939 } 10940 10941 static int 10942 ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t v6addr, in6_addr_t v6mask, 10943 queue_t *q, mblk_t *mp, boolean_t need_up) 10944 { 10945 ill_t *ill = ipif->ipif_ill; 10946 int err = 0; 10947 10948 ip1dbg(("ip_sioctl_subnet_tail(%s:%u %p)\n", 10949 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10950 10951 /* Set the new address. */ 10952 mutex_enter(&ill->ill_lock); 10953 ipif->ipif_v6net_mask = v6mask; 10954 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 10955 V6_MASK_COPY(v6addr, ipif->ipif_v6net_mask, 10956 ipif->ipif_v6subnet); 10957 } 10958 mutex_exit(&ill->ill_lock); 10959 10960 if (need_up) { 10961 /* 10962 * Now bring the interface back up. If this 10963 * is the only IPIF for the ILL, ipif_up 10964 * will have to re-bind to the device, so 10965 * we may get back EINPROGRESS, in which 10966 * case, this IOCTL will get completed in 10967 * ip_rput_dlpi when we see the DL_BIND_ACK. 
10968 */ 10969 err = ipif_up(ipif, q, mp); 10970 if (err == EINPROGRESS) 10971 return (err); 10972 } 10973 return (err); 10974 } 10975 10976 /* ARGSUSED */ 10977 int 10978 ip_sioctl_subnet_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10979 ip_ioctl_cmd_t *ipip, void *if_req) 10980 { 10981 int addrlen; 10982 in6_addr_t v6addr; 10983 in6_addr_t v6mask; 10984 struct lifreq *lifr = (struct lifreq *)if_req; 10985 10986 ip1dbg(("ip_sioctl_subnet_restart(%s:%u %p)\n", 10987 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10988 (void) ipif_down_tail(ipif); 10989 10990 addrlen = lifr->lifr_addrlen; 10991 if (ipif->ipif_isv6) { 10992 sin6_t *sin6; 10993 10994 sin6 = (sin6_t *)sin; 10995 v6addr = sin6->sin6_addr; 10996 } else { 10997 ipaddr_t addr; 10998 10999 addr = sin->sin_addr.s_addr; 11000 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 11001 addrlen += IPV6_ABITS - IP_ABITS; 11002 } 11003 (void) ip_plen_to_mask_v6(addrlen, &v6mask); 11004 11005 return (ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, B_TRUE)); 11006 } 11007 11008 /* ARGSUSED */ 11009 int 11010 ip_sioctl_get_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11011 ip_ioctl_cmd_t *ipip, void *if_req) 11012 { 11013 struct lifreq *lifr = (struct lifreq *)if_req; 11014 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sin; 11015 11016 ip1dbg(("ip_sioctl_get_subnet(%s:%u %p)\n", 11017 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11018 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 11019 11020 if (ipif->ipif_isv6) { 11021 *sin6 = sin6_null; 11022 sin6->sin6_family = AF_INET6; 11023 sin6->sin6_addr = ipif->ipif_v6subnet; 11024 lifr->lifr_addrlen = 11025 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 11026 } else { 11027 *sin = sin_null; 11028 sin->sin_family = AF_INET; 11029 sin->sin_addr.s_addr = ipif->ipif_subnet; 11030 lifr->lifr_addrlen = ip_mask_to_plen(ipif->ipif_net_mask); 11031 } 11032 return (0); 11033 } 11034 11035 /* 11036 * Set the IPv6 address token. 11037 */ 11038 /* ARGSUSED */ 11039 int 11040 ip_sioctl_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11041 ip_ioctl_cmd_t *ipi, void *if_req) 11042 { 11043 ill_t *ill = ipif->ipif_ill; 11044 int err; 11045 in6_addr_t v6addr; 11046 in6_addr_t v6mask; 11047 boolean_t need_up = B_FALSE; 11048 int i; 11049 sin6_t *sin6 = (sin6_t *)sin; 11050 struct lifreq *lifr = (struct lifreq *)if_req; 11051 int addrlen; 11052 11053 ip1dbg(("ip_sioctl_token(%s:%u %p)\n", 11054 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11055 ASSERT(IAM_WRITER_IPIF(ipif)); 11056 11057 addrlen = lifr->lifr_addrlen; 11058 /* Only allow for logical unit zero i.e. not on "le0:17" */ 11059 if (ipif->ipif_id != 0) 11060 return (EINVAL); 11061 11062 if (!ipif->ipif_isv6) 11063 return (EINVAL); 11064 11065 if (addrlen > IPV6_ABITS) 11066 return (EINVAL); 11067 11068 v6addr = sin6->sin6_addr; 11069 11070 /* 11071 * The length of the token is the length from the end. To get 11072 * the proper mask for this, compute the mask of the bits not 11073 * in the token; ie. the prefix, and then xor to get the mask. 
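 * For example, with a 64-bit token, ip_plen_to_mask_v6(128 - 64)
 * yields ffff:ffff:ffff:ffff:: and the xor flips it to
 * ::ffff:ffff:ffff:ffff, covering exactly the token bits.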
11074 */ 11075 if (ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask) == NULL) 11076 return (EINVAL); 11077 for (i = 0; i < 4; i++) { 11078 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 11079 } 11080 11081 if (V6_MASK_EQ(v6addr, v6mask, ill->ill_token) && 11082 ill->ill_token_length == addrlen) 11083 return (0); /* No change */ 11084 11085 if (ipif->ipif_flags & IPIF_UP) { 11086 err = ipif_logical_down(ipif, q, mp); 11087 if (err == EINPROGRESS) 11088 return (err); 11089 (void) ipif_down_tail(ipif); 11090 need_up = B_TRUE; 11091 } 11092 err = ip_sioctl_token_tail(ipif, sin6, addrlen, q, mp, need_up); 11093 return (err); 11094 } 11095 11096 static int 11097 ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, queue_t *q, 11098 mblk_t *mp, boolean_t need_up) 11099 { 11100 in6_addr_t v6addr; 11101 in6_addr_t v6mask; 11102 ill_t *ill = ipif->ipif_ill; 11103 int i; 11104 int err = 0; 11105 11106 ip1dbg(("ip_sioctl_token_tail(%s:%u %p)\n", 11107 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11108 v6addr = sin6->sin6_addr; 11109 /* 11110 * The length of the token is the length from the end. To get 11111 * the proper mask for this, compute the mask of the bits not 11112 * in the token; ie. the prefix, and then xor to get the mask. 11113 */ 11114 (void) ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask); 11115 for (i = 0; i < 4; i++) 11116 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 11117 11118 mutex_enter(&ill->ill_lock); 11119 V6_MASK_COPY(v6addr, v6mask, ill->ill_token); 11120 ill->ill_token_length = addrlen; 11121 ill->ill_manual_token = 1; 11122 11123 /* Reconfigure the link-local address based on this new token */ 11124 ipif_setlinklocal(ill->ill_ipif); 11125 11126 mutex_exit(&ill->ill_lock); 11127 11128 if (need_up) { 11129 /* 11130 * Now bring the interface back up. If this 11131 * is the only IPIF for the ILL, ipif_up 11132 * will have to re-bind to the device, so 11133 * we may get back EINPROGRESS, in which 11134 * case, this IOCTL will get completed in 11135 * ip_rput_dlpi when we see the DL_BIND_ACK. 11136 */ 11137 err = ipif_up(ipif, q, mp); 11138 if (err == EINPROGRESS) 11139 return (err); 11140 } 11141 return (err); 11142 } 11143 11144 /* ARGSUSED */ 11145 int 11146 ip_sioctl_get_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11147 ip_ioctl_cmd_t *ipi, void *if_req) 11148 { 11149 ill_t *ill; 11150 sin6_t *sin6 = (sin6_t *)sin; 11151 struct lifreq *lifr = (struct lifreq *)if_req; 11152 11153 ip1dbg(("ip_sioctl_get_token(%s:%u %p)\n", 11154 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11155 if (ipif->ipif_id != 0) 11156 return (EINVAL); 11157 11158 ill = ipif->ipif_ill; 11159 if (!ill->ill_isv6) 11160 return (ENXIO); 11161 11162 *sin6 = sin6_null; 11163 sin6->sin6_family = AF_INET6; 11164 ASSERT(!IN6_IS_ADDR_V4MAPPED(&ill->ill_token)); 11165 sin6->sin6_addr = ill->ill_token; 11166 lifr->lifr_addrlen = ill->ill_token_length; 11167 return (0); 11168 } 11169 11170 /* 11171 * Set (hardware) link specific information that might override 11172 * what was acquired through the DL_INFO_ACK. 
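 * (Reached via the SIOCSLIFLNKINFO ioctl; the values arrive in
 * lifr_ifinfo.)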
11173 */ 11174 /* ARGSUSED */ 11175 int 11176 ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11177 ip_ioctl_cmd_t *ipi, void *if_req) 11178 { 11179 ill_t *ill = ipif->ipif_ill; 11180 int ip_min_mtu; 11181 struct lifreq *lifr = (struct lifreq *)if_req; 11182 lif_ifinfo_req_t *lir; 11183 11184 ip1dbg(("ip_sioctl_lnkinfo(%s:%u %p)\n", 11185 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11186 lir = &lifr->lifr_ifinfo; 11187 ASSERT(IAM_WRITER_IPIF(ipif)); 11188 11189 /* Only allow for logical unit zero i.e. not on "bge0:17" */ 11190 if (ipif->ipif_id != 0) 11191 return (EINVAL); 11192 11193 /* Set interface MTU. */ 11194 if (ipif->ipif_isv6) 11195 ip_min_mtu = IPV6_MIN_MTU; 11196 else 11197 ip_min_mtu = IP_MIN_MTU; 11198 11199 /* 11200 * Verify values before we set anything. Allow zero to 11201 * mean unspecified. 11202 * 11203 * XXX We should be able to set the user-defined lir_mtu to some value 11204 * that is greater than ill_current_frag but less than ill_max_frag- the 11205 * ill_max_frag value tells us the max MTU that can be handled by the 11206 * datalink, whereas the ill_current_frag is dynamically computed for 11207 * some link-types like tunnels, based on the tunnel PMTU. However, 11208 * since there is currently no way of distinguishing between 11209 * administratively fixed link mtu values (e.g., those set via 11210 * /sbin/dladm) and dynamically discovered MTUs (e.g., those discovered 11211 * for tunnels) we conservatively choose the ill_current_frag as the 11212 * upper-bound. 11213 */ 11214 if (lir->lir_maxmtu != 0 && 11215 (lir->lir_maxmtu > ill->ill_current_frag || 11216 lir->lir_maxmtu < ip_min_mtu)) 11217 return (EINVAL); 11218 if (lir->lir_reachtime != 0 && 11219 lir->lir_reachtime > ND_MAX_REACHTIME) 11220 return (EINVAL); 11221 if (lir->lir_reachretrans != 0 && 11222 lir->lir_reachretrans > ND_MAX_REACHRETRANSTIME) 11223 return (EINVAL); 11224 11225 mutex_enter(&ill->ill_lock); 11226 /* 11227 * The dce and fragmentation code can handle changes to ill_mtu 11228 * concurrent with sending/fragmenting packets. 11229 */ 11230 if (lir->lir_maxmtu != 0) 11231 ill->ill_user_mtu = lir->lir_maxmtu; 11232 11233 if (lir->lir_reachtime != 0) 11234 ill->ill_reachable_time = lir->lir_reachtime; 11235 11236 if (lir->lir_reachretrans != 0) 11237 ill->ill_reachable_retrans_time = lir->lir_reachretrans; 11238 11239 ill->ill_max_hops = lir->lir_maxhops; 11240 ill->ill_max_buf = ND_MAX_Q; 11241 if (!(ill->ill_flags & ILLF_FIXEDMTU) && ill->ill_user_mtu != 0) { 11242 /* 11243 * ill_mtu is the actual interface MTU, obtained as the min 11244 * of user-configured mtu and the value announced by the 11245 * driver (via DL_NOTE_SDU_SIZE/DL_INFO_ACK). Note that since 11246 * we have already made the choice of requiring 11247 * ill_user_mtu < ill_current_frag by the time we get here, 11248 * the ill_mtu effectively gets assigned to the ill_user_mtu 11249 * here. 11250 */ 11251 ill->ill_mtu = MIN(ill->ill_current_frag, ill->ill_user_mtu); 11252 } 11253 mutex_exit(&ill->ill_lock); 11254 11255 /* 11256 * Make sure all dce_generation checks find out 11257 * that ill_mtu has changed. 11258 */ 11259 if (!(ill->ill_flags & ILLF_FIXEDMTU) && (lir->lir_maxmtu != 0)) 11260 dce_increment_all_generations(ill->ill_isv6, ill->ill_ipst); 11261 11262 /* 11263 * Refresh IPMP meta-interface MTU if necessary. 
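 * (ipmp_illgrp_refresh_mtu() recomputes the group MTU as the minimum
 * across the group's members.)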
11264 */ 11265 if (IS_UNDER_IPMP(ill)) 11266 ipmp_illgrp_refresh_mtu(ill->ill_grp); 11267 11268 return (0); 11269 } 11270 11271 /* ARGSUSED */ 11272 int 11273 ip_sioctl_get_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11274 ip_ioctl_cmd_t *ipi, void *if_req) 11275 { 11276 struct lif_ifinfo_req *lir; 11277 ill_t *ill = ipif->ipif_ill; 11278 11279 ip1dbg(("ip_sioctl_get_lnkinfo(%s:%u %p)\n", 11280 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11281 if (ipif->ipif_id != 0) 11282 return (EINVAL); 11283 11284 lir = &((struct lifreq *)if_req)->lifr_ifinfo; 11285 lir->lir_maxhops = ill->ill_max_hops; 11286 lir->lir_reachtime = ill->ill_reachable_time; 11287 lir->lir_reachretrans = ill->ill_reachable_retrans_time; 11288 lir->lir_maxmtu = ill->ill_mtu; 11289 11290 return (0); 11291 } 11292 11293 /* 11294 * Return best guess as to the subnet mask for the specified address. 11295 * Based on the subnet masks for all the configured interfaces. 11296 * 11297 * We end up returning a zero mask in the case of default, multicast or 11298 * experimental. 11299 */ 11300 static ipaddr_t 11301 ip_subnet_mask(ipaddr_t addr, ipif_t **ipifp, ip_stack_t *ipst) 11302 { 11303 ipaddr_t net_mask; 11304 ill_t *ill; 11305 ipif_t *ipif; 11306 ill_walk_context_t ctx; 11307 ipif_t *fallback_ipif = NULL; 11308 11309 net_mask = ip_net_mask(addr); 11310 if (net_mask == 0) { 11311 *ipifp = NULL; 11312 return (0); 11313 } 11314 11315 /* Let's check to see if this is maybe a local subnet route. */ 11316 /* this function only applies to IPv4 interfaces */ 11317 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 11318 ill = ILL_START_WALK_V4(&ctx, ipst); 11319 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 11320 mutex_enter(&ill->ill_lock); 11321 for (ipif = ill->ill_ipif; ipif != NULL; 11322 ipif = ipif->ipif_next) { 11323 if (IPIF_IS_CONDEMNED(ipif)) 11324 continue; 11325 if (!(ipif->ipif_flags & IPIF_UP)) 11326 continue; 11327 if ((ipif->ipif_subnet & net_mask) == 11328 (addr & net_mask)) { 11329 /* 11330 * Don't trust pt-pt interfaces if there are 11331 * other interfaces. 11332 */ 11333 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 11334 if (fallback_ipif == NULL) { 11335 ipif_refhold_locked(ipif); 11336 fallback_ipif = ipif; 11337 } 11338 continue; 11339 } 11340 11341 /* 11342 * Fine. Just assume the same net mask as the 11343 * directly attached subnet interface is using. 11344 */ 11345 ipif_refhold_locked(ipif); 11346 mutex_exit(&ill->ill_lock); 11347 rw_exit(&ipst->ips_ill_g_lock); 11348 if (fallback_ipif != NULL) 11349 ipif_refrele(fallback_ipif); 11350 *ipifp = ipif; 11351 return (ipif->ipif_net_mask); 11352 } 11353 } 11354 mutex_exit(&ill->ill_lock); 11355 } 11356 rw_exit(&ipst->ips_ill_g_lock); 11357 11358 *ipifp = fallback_ipif; 11359 return ((fallback_ipif != NULL) ? 11360 fallback_ipif->ipif_net_mask : net_mask); 11361 } 11362 11363 /* 11364 * ip_sioctl_copyin_setup calls ip_wput_ioctl to process the IP_IOCTL ioctl. 

/*
 * ip_sioctl_copyin_setup calls ip_wput_ioctl to process the IP_IOCTL ioctl.
 */
static void
ip_wput_ioctl(queue_t *q, mblk_t *mp)
{
	IOCP	iocp;
	ipft_t	*ipft;
	ipllc_t	*ipllc;
	mblk_t	*mp1;
	cred_t	*cr;
	int	error = 0;
	conn_t	*connp;

	ip1dbg(("ip_wput_ioctl"));
	iocp = (IOCP)mp->b_rptr;
	mp1 = mp->b_cont;
	if (mp1 == NULL) {
		iocp->ioc_error = EINVAL;
		mp->b_datap->db_type = M_IOCNAK;
		iocp->ioc_count = 0;
		qreply(q, mp);
		return;
	}

	/*
	 * These IOCTLs provide various control capabilities to
	 * upstream agents such as ULPs and processes. There
	 * are currently two such IOCTLs implemented. They
	 * are used by TCP to provide update information for
	 * existing IREs and to forcibly delete an IRE for a
	 * host that is not responding, thereby forcing an
	 * attempt at a new route.
	 */
	iocp->ioc_error = EINVAL;
	if (!pullupmsg(mp1, sizeof (ipllc->ipllc_cmd)))
		goto done;

	ipllc = (ipllc_t *)mp1->b_rptr;
	for (ipft = ip_ioctl_ftbl; ipft->ipft_pfi; ipft++) {
		if (ipllc->ipllc_cmd == ipft->ipft_cmd)
			break;
	}
	/*
	 * Prefer credential from mblk over ioctl;
	 * see ip_sioctl_copyin_setup.
	 */
	cr = msg_getcred(mp, NULL);
	if (cr == NULL)
		cr = iocp->ioc_cr;

	/*
	 * Refhold the conn in case the request gets queued up in some lookup.
	 */
	ASSERT(CONN_Q(q));
	connp = Q_TO_CONN(q);
	CONN_INC_REF(connp);
	if (ipft->ipft_pfi &&
	    ((mp1->b_wptr - mp1->b_rptr) >= ipft->ipft_min_size ||
	    pullupmsg(mp1, ipft->ipft_min_size))) {
		error = (*ipft->ipft_pfi)(q,
		    (ipft->ipft_flags & IPFT_F_SELF_REPLY) ? mp : mp1, cr);
	}
	if (ipft->ipft_flags & IPFT_F_SELF_REPLY) {
		/*
		 * CONN_OPER_PENDING_DONE happens in the function called
		 * through ipft_pfi above.
		 */
		return;
	}

	CONN_OPER_PENDING_DONE(connp);
	if (ipft->ipft_flags & IPFT_F_NO_REPLY) {
		freemsg(mp);
		return;
	}
	iocp->ioc_error = error;

done:
	mp->b_datap->db_type = M_IOCACK;
	if (iocp->ioc_error)
		iocp->ioc_count = 0;
	qreply(q, mp);
}

/*
 * Assign a unique id for the ipif. This is used by sctp_addr.c.
 * Note: remove if sctp_addr.c is redone to not shadow ill/ipif data
 * structures.
 */
static void
ipif_assign_seqid(ipif_t *ipif)
{
	ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;

	ipif->ipif_seqid = atomic_add_64_nv(&ipst->ips_ipif_g_seqid, 1);
}
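
/*
 * The lookup loop in ip_wput_ioctl() relies on ip_ioctl_ftbl being
 * terminated by an entry whose ipft_pfi is NULL, so an unmatched command
 * falls through to that sentinel and is answered with EINVAL. The same
 * sentinel-terminated table technique, reduced to a self-contained
 * sketch (names hypothetical):
 *
 *	typedef struct { int cmd; int (*fn)(void *); } tbl_t;
 *
 *	static tbl_t tbl[] = {
 *		{ CMD_A, do_a },
 *		{ CMD_B, do_b },
 *		{ 0, NULL }		// sentinel terminates the walk
 *	};
 *
 *	tbl_t *t;
 *	for (t = tbl; t->fn != NULL; t++)
 *		if (t->cmd == cmd)
 *			break;
 *	return (t->fn != NULL ? t->fn(arg) : EINVAL);
 */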

/*
 * Clone the contents of `sipif' to `dipif'. Requires that both ipifs are
 * administratively down (i.e., no DAD), of the same type, and locked. Note
 * that the clone is complete -- including the seqid -- and the expectation is
 * that the caller will either free or overwrite `sipif' before it's unlocked.
 */
static void
ipif_clone(const ipif_t *sipif, ipif_t *dipif)
{
	ASSERT(MUTEX_HELD(&sipif->ipif_ill->ill_lock));
	ASSERT(MUTEX_HELD(&dipif->ipif_ill->ill_lock));
	ASSERT(!(sipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE)));
	ASSERT(!(dipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE)));
	ASSERT(sipif->ipif_ire_type == dipif->ipif_ire_type);

	dipif->ipif_flags = sipif->ipif_flags;
	dipif->ipif_metric = sipif->ipif_metric;
	dipif->ipif_zoneid = sipif->ipif_zoneid;
	dipif->ipif_v6subnet = sipif->ipif_v6subnet;
	dipif->ipif_v6lcl_addr = sipif->ipif_v6lcl_addr;
	dipif->ipif_v6net_mask = sipif->ipif_v6net_mask;
	dipif->ipif_v6brd_addr = sipif->ipif_v6brd_addr;
	dipif->ipif_v6pp_dst_addr = sipif->ipif_v6pp_dst_addr;

	/*
	 * As per the comment atop the function, we assume that these sipif
	 * fields will be changed before sipif is unlocked.
	 */
	dipif->ipif_seqid = sipif->ipif_seqid;
	dipif->ipif_state_flags = sipif->ipif_state_flags;
}

/*
 * Transfer the contents of `sipif' to `dipif', and then free (if `virgipif'
 * is NULL) or overwrite `sipif' with `virgipif', which must be a virgin
 * (unreferenced) ipif. Also, if `sipif' is used by the current xop, then
 * transfer the xop to `dipif'. Requires that all ipifs are administratively
 * down (i.e., no DAD), of the same type, and unlocked.
 */
static void
ipif_transfer(ipif_t *sipif, ipif_t *dipif, ipif_t *virgipif)
{
	ipsq_t *ipsq = sipif->ipif_ill->ill_phyint->phyint_ipsq;
	ipxop_t *ipx = ipsq->ipsq_xop;

	ASSERT(sipif != dipif);
	ASSERT(sipif != virgipif);

	/*
	 * Grab all of the locks that protect the ipif in a defined order.
	 */
	GRAB_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill);

	ipif_clone(sipif, dipif);
	if (virgipif != NULL) {
		ipif_clone(virgipif, sipif);
		mi_free(virgipif);
	}

	RELEASE_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill);

	/*
	 * Transfer ownership of the current xop, if necessary.
	 */
	if (ipx->ipx_current_ipif == sipif) {
		ASSERT(ipx->ipx_pending_ipif == NULL);
		mutex_enter(&ipx->ipx_lock);
		ipx->ipx_current_ipif = dipif;
		mutex_exit(&ipx->ipx_lock);
	}

	if (virgipif == NULL)
		mi_free(sipif);
}

/*
 * Checks that:
 * - <ill_name>:<ipif_id> is at most LIFNAMSIZ - 1 characters, and
 * - the logical interface id is within the allowed range.
 */
static int
is_lifname_valid(ill_t *ill, unsigned int ipif_id)
{
	if (snprintf(NULL, 0, "%s:%d", ill->ill_name, ipif_id) >= LIFNAMSIZ)
		return (ENAMETOOLONG);

	if (ipif_id >= ill->ill_ipst->ips_ip_addrs_per_if)
		return (ERANGE);
	return (0);
}
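
/*
 * Aside on the idiom above: snprintf() with a NULL buffer and zero size
 * writes nothing and returns the length the formatted string would have
 * had, which is what lets is_lifname_valid() bound the name length
 * without a scratch buffer. A quick illustration (values hypothetical):
 *
 *	int n = snprintf(NULL, 0, "%s:%d", "bge0", 17);
 *	// n == 7 ("bge0:17"), and no buffer was touched;
 *	// n >= LIFNAMSIZ would mean the name cannot fit.
 */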

/*
 * Insert the ipif, so that the list of ipifs on the ill will be sorted
 * with respect to ipif_id. Note that an ipif with an ipif_id of -1 will
 * be inserted into the first space available in the list. The value of
 * ipif_id will then be set to the appropriate value for its position.
 */
static int
ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock)
{
	ill_t *ill;
	ipif_t *tipif;
	ipif_t **tipifp;
	int id, err;
	ip_stack_t	*ipst;

	ASSERT(ipif->ipif_ill->ill_net_type == IRE_LOOPBACK ||
	    IAM_WRITER_IPIF(ipif));

	ill = ipif->ipif_ill;
	ASSERT(ill != NULL);
	ipst = ill->ill_ipst;

	/*
	 * In the case of lo0:0 we already hold the ill_g_lock:
	 * ill_lookup_on_name (acquires ill_g_lock) -> ipif_allocate ->
	 * ipif_insert.
	 */
	if (acquire_g_lock)
		rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
	mutex_enter(&ill->ill_lock);
	id = ipif->ipif_id;
	tipifp = &(ill->ill_ipif);
	if (id == -1) {	/* need to find a real id */
		id = 0;
		while ((tipif = *tipifp) != NULL) {
			ASSERT(tipif->ipif_id >= id);
			if (tipif->ipif_id != id)
				break; /* non-consecutive id */
			id++;
			tipifp = &(tipif->ipif_next);
		}
		if ((err = is_lifname_valid(ill, id)) != 0) {
			mutex_exit(&ill->ill_lock);
			if (acquire_g_lock)
				rw_exit(&ipst->ips_ill_g_lock);
			return (err);
		}
		ipif->ipif_id = id; /* assign new id */
	} else if ((err = is_lifname_valid(ill, id)) == 0) {
		/* we have a real id; insert ipif in the right place */
		while ((tipif = *tipifp) != NULL) {
			ASSERT(tipif->ipif_id != id);
			if (tipif->ipif_id > id)
				break; /* found correct location */
			tipifp = &(tipif->ipif_next);
		}
	} else {
		mutex_exit(&ill->ill_lock);
		if (acquire_g_lock)
			rw_exit(&ipst->ips_ill_g_lock);
		return (err);
	}

	ASSERT(tipifp != &(ill->ill_ipif) || id == 0);

	ipif->ipif_next = tipif;
	*tipifp = ipif;
	mutex_exit(&ill->ill_lock);
	if (acquire_g_lock)
		rw_exit(&ipst->ips_ill_g_lock);

	return (0);
}

static void
ipif_remove(ipif_t *ipif)
{
	ipif_t	**ipifp;
	ill_t	*ill = ipif->ipif_ill;

	ASSERT(RW_WRITE_HELD(&ill->ill_ipst->ips_ill_g_lock));

	mutex_enter(&ill->ill_lock);
	ipifp = &ill->ill_ipif;
	for (; *ipifp != NULL; ipifp = &(*ipifp)->ipif_next) {
		if (*ipifp == ipif) {
			*ipifp = ipif->ipif_next;
			break;
		}
	}
	mutex_exit(&ill->ill_lock);
}
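
/*
 * Both routines above use the pointer-to-pointer list idiom: by walking
 * an `ipif_t **' rather than an `ipif_t *', the head of the list needs
 * no special casing for either insertion or removal. Reduced to its
 * essentials (generic names, purely illustrative):
 *
 *	typedef struct node { int id; struct node *next; } node_t;
 *
 *	void
 *	sorted_insert(node_t **head, node_t *n)
 *	{
 *		node_t **pp = head;
 *
 *		while (*pp != NULL && (*pp)->id < n->id)
 *			pp = &(*pp)->next;
 *		n->next = *pp;		// works even when *pp is the head
 *		*pp = n;
 *	}
 */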

/*
 * Allocate and initialize a new interface control structure. (Always
 * called as writer.)
 * When ipif_allocate() is called from ip_ll_subnet_defaults, the ill
 * is not part of the global linked list of ills. ipif_seqid is unique
 * in the system and to preserve the uniqueness, it is assigned only
 * when ill becomes part of the global list. At that point ill will
 * have a name. If it doesn't get assigned here, it will get assigned
 * in ipif_set_values() as part of SIOCSLIFNAME processing.
 * Additionally, if we come here from ip_ll_subnet_defaults, we don't set
 * the interface flags or any other information from the DL_INFO_ACK for
 * DL_STYLE2 drivers (initialize == B_FALSE), since we won't have them at
 * this point. The flags etc. will be set in ip_ll_subnet_defaults when the
 * second DL_INFO_ACK comes in from the driver.
 */
static ipif_t *
ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize,
    boolean_t insert, int *errorp)
{
	int err;
	ipif_t	*ipif;
	ip_stack_t *ipst = ill->ill_ipst;

	ip1dbg(("ipif_allocate(%s:%d ill %p)\n",
	    ill->ill_name, id, (void *)ill));
	ASSERT(ire_type == IRE_LOOPBACK || IAM_WRITER_ILL(ill));

	if (errorp != NULL)
		*errorp = 0;

	if ((ipif = mi_alloc(sizeof (ipif_t), BPRI_MED)) == NULL) {
		if (errorp != NULL)
			*errorp = ENOMEM;
		return (NULL);
	}
	*ipif = ipif_zero;	/* start clean */

	ipif->ipif_ill = ill;
	ipif->ipif_id = id;	/* could be -1 */
	/*
	 * Inherit the zoneid from the ill; for the shared stack instance
	 * this is always the global zone.
	 */
	ipif->ipif_zoneid = ill->ill_zoneid;

	ipif->ipif_refcnt = 0;

	if (insert) {
		if ((err = ipif_insert(ipif, ire_type != IRE_LOOPBACK)) != 0) {
			mi_free(ipif);
			if (errorp != NULL)
				*errorp = err;
			return (NULL);
		}
		/* -1 id should have been replaced by real id */
		id = ipif->ipif_id;
		ASSERT(id >= 0);
	}

	if (ill->ill_name[0] != '\0')
		ipif_assign_seqid(ipif);

	/*
	 * If this is the zeroth ipif on the IPMP ill, create the illgrp
	 * (which must not exist yet because the zeroth ipif is created once
	 * per ill). However, do not link it to the ipmp_grp_t until
	 * I_PLINK is called; see ip_sioctl_plink_ipmp() for details.
	 */
	if (id == 0 && IS_IPMP(ill)) {
		if (ipmp_illgrp_create(ill) == NULL) {
			if (insert) {
				rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
				ipif_remove(ipif);
				rw_exit(&ipst->ips_ill_g_lock);
			}
			mi_free(ipif);
			if (errorp != NULL)
				*errorp = ENOMEM;
			return (NULL);
		}
	}

	/*
	 * We grab ill_lock to protect the flag changes. The ipif is still
	 * not up and can't be looked up until the ioctl completes and the
	 * IPIF_CHANGING flag is cleared.
	 */
	mutex_enter(&ill->ill_lock);

	ipif->ipif_ire_type = ire_type;

	if (ipif->ipif_isv6) {
		ill->ill_flags |= ILLF_IPV6;
	} else {
		ipaddr_t inaddr_any = INADDR_ANY;

		ill->ill_flags |= ILLF_IPV4;

		/* Keep the IN6_IS_ADDR_V4MAPPED assertions happy */
		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
		    &ipif->ipif_v6lcl_addr);
		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
		    &ipif->ipif_v6subnet);
		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
		    &ipif->ipif_v6net_mask);
		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
		    &ipif->ipif_v6brd_addr);
		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
		    &ipif->ipif_v6pp_dst_addr);
	}

	/*
	 * Don't set the interface flags etc. now, will do it in
	 * ip_ll_subnet_defaults.
	 */
	if (!initialize)
		goto out;

	/*
	 * NOTE: The IPMP meta-interface is special-cased because it starts
	 * with no underlying interfaces (and thus an unknown broadcast
	 * address length), but all interfaces that can be placed into an IPMP
	 * group are required to be broadcast-capable.
	 */
	if (ill->ill_bcast_addr_length != 0 || IS_IPMP(ill)) {
		/*
		 * Later detect lack of DLPI driver multicast capability by
		 * catching DL_ENABMULTI_REQ errors in ip_rput_dlpi().
		 */
		ill->ill_flags |= ILLF_MULTICAST;
		if (!ipif->ipif_isv6)
			ipif->ipif_flags |= IPIF_BROADCAST;
	} else {
		if (ill->ill_net_type != IRE_LOOPBACK) {
			if (ipif->ipif_isv6)
				/*
				 * Note: xresolv interfaces will eventually
				 * need NOARP set here as well, but that will
				 * require those external resolvers to have
				 * some knowledge of that flag and act
				 * appropriately. Not to be changed at present.
				 */
				ill->ill_flags |= ILLF_NONUD;
			else
				ill->ill_flags |= ILLF_NOARP;
		}
		if (ill->ill_phys_addr_length == 0) {
			if (IS_VNI(ill)) {
				ipif->ipif_flags |= IPIF_NOXMIT;
			} else {
				/* pt-pt supports multicast. */
				ill->ill_flags |= ILLF_MULTICAST;
				if (ill->ill_net_type != IRE_LOOPBACK)
					ipif->ipif_flags |= IPIF_POINTOPOINT;
			}
		}
	}
out:
	mutex_exit(&ill->ill_lock);
	return (ipif);
}

/*
 * Remove the neighbor cache entries associated with this logical
 * interface.
 */
int
ipif_arp_down(ipif_t *ipif)
{
	ill_t	*ill = ipif->ipif_ill;
	int	err = 0;

	ip1dbg(("ipif_arp_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));
	ASSERT(IAM_WRITER_IPIF(ipif));

	DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_down",
	    ill_t *, ill, ipif_t *, ipif);
	ipif_nce_down(ipif);

	/*
	 * If this is the last ipif that is going down and there are no
	 * duplicate addresses we may yet attempt to re-probe, then we need to
	 * clean up ARP completely.
	 */
	if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 &&
	    !ill->ill_logical_down && ill->ill_net_type == IRE_IF_RESOLVER) {
		/*
		 * If this was the last ipif on an IPMP interface, purge any
		 * static ARP entries associated with it.
		 */
		if (IS_IPMP(ill))
			ipmp_illgrp_refresh_arpent(ill->ill_grp);

		/* UNBIND, DETACH */
		err = arp_ll_down(ill);
	}

	return (err);
}

/*
 * Get the resolver set up for a new IP address. (Always called as writer.)
 * Called both for IPv4 and IPv6 interfaces, though it only does some
 * basic DAD related initialization for IPv6. Honors ILLF_NOARP.
 *
 * The enumerated value res_act tunes the behavior:
 *	* Res_act_initial: set up all the resolver structures for a new
 *	  IP address.
 *	* Res_act_defend: tell ARP that it needs to send a single gratuitous
 *	  ARP message in defense of the address.
 *	* Res_act_rebind: tell ARP to change the hardware address for an IP
 *	  address (and issue gratuitous ARPs). Used by ipmp_ill_bind_ipif().
 *
 * Returns zero on success, or an errno upon failure.
 */
int
ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act)
{
	ill_t		*ill = ipif->ipif_ill;
	int		err;
	boolean_t	was_dup;

	ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n",
	    ill->ill_name, ipif->ipif_id, (uint_t)ipif->ipif_flags));
	ASSERT(IAM_WRITER_IPIF(ipif));

	was_dup = B_FALSE;
	if (res_act == Res_act_initial) {
		ipif->ipif_addr_ready = 0;
		/*
		 * We're bringing an interface up here. There's no way that we
		 * should need to shut down ARP now.
		 */
		mutex_enter(&ill->ill_lock);
		if (ipif->ipif_flags & IPIF_DUPLICATE) {
			ipif->ipif_flags &= ~IPIF_DUPLICATE;
			ill->ill_ipif_dup_count--;
			was_dup = B_TRUE;
		}
		mutex_exit(&ill->ill_lock);
	}
	if (ipif->ipif_recovery_id != 0)
		(void) untimeout(ipif->ipif_recovery_id);
	ipif->ipif_recovery_id = 0;
	if (ill->ill_net_type != IRE_IF_RESOLVER) {
		ipif->ipif_addr_ready = 1;
		return (0);
	}
	/* NDP will set the ipif_addr_ready flag when it's ready */
	if (ill->ill_isv6)
		return (0);

	err = ipif_arp_up(ipif, res_act, was_dup);
	return (err);
}

/*
 * This routine restarts IPv4/IPv6 duplicate address detection (DAD)
 * when a link has just gone back up.
 */
static void
ipif_nce_start_dad(ipif_t *ipif)
{
	ncec_t *ncec;
	ill_t *ill = ipif->ipif_ill;
	boolean_t isv6 = ill->ill_isv6;

	if (isv6) {
		ncec = ncec_lookup_illgrp_v6(ipif->ipif_ill,
		    &ipif->ipif_v6lcl_addr);
	} else {
		ipaddr_t v4addr;

		if (ill->ill_net_type != IRE_IF_RESOLVER ||
		    (ipif->ipif_flags & IPIF_UNNUMBERED) ||
		    ipif->ipif_lcl_addr == INADDR_ANY) {
			/*
			 * If we can't contact ARP for some reason,
			 * that's not really a problem. Just send
			 * out the routing socket notification that
			 * DAD completion would have done, and continue.
			 */
			ipif_mask_reply(ipif);
			ipif_up_notify(ipif);
			ipif->ipif_addr_ready = 1;
			return;
		}

		IN6_V4MAPPED_TO_IPADDR(&ipif->ipif_v6lcl_addr, v4addr);
		ncec = ncec_lookup_illgrp_v4(ipif->ipif_ill, &v4addr);
	}

	if (ncec == NULL) {
		ip1dbg(("couldn't find ncec for ipif %p leaving !ready\n",
		    (void *)ipif));
		return;
	}
	if (!nce_restart_dad(ncec)) {
		/*
		 * If we can't restart DAD for some reason, that's not really a
		 * problem. Just send out the routing socket notification that
		 * DAD completion would have done, and continue.
		 */
		ipif_up_notify(ipif);
		ipif->ipif_addr_ready = 1;
	}
	ncec_refrele(ncec);
}

/*
 * Restart duplicate address detection on all interfaces on the given ill.
 *
 * This is called when an interface transitions from down to up
 * (DL_NOTE_LINK_UP) or up to down (DL_NOTE_LINK_DOWN).
 *
 * Note that since the underlying physical link has transitioned, we must cause
 * at least one routing socket message to be sent here, either via DAD
 * completion or just by default on the first ipif. (If we don't do this, then
 * in.mpathd will see long delays when doing link-based failure recovery.)
 */
void
ill_restart_dad(ill_t *ill, boolean_t went_up)
{
	ipif_t *ipif;

	if (ill == NULL)
		return;

	/*
	 * If layer two doesn't support duplicate address detection, then just
	 * send the routing socket message now and be done with it.
	 */
	if (!ill->ill_isv6 && arp_no_defense) {
		ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT);
		return;
	}

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		if (went_up) {

			if (ipif->ipif_flags & IPIF_UP) {
				ipif_nce_start_dad(ipif);
			} else if (ipif->ipif_flags & IPIF_DUPLICATE) {
				/*
				 * Kick off the bring-up process now.
				 */
				ipif_do_recovery(ipif);
			} else {
				/*
				 * Unfortunately, the first ipif is "special"
				 * and represents the underlying ill in the
				 * routing socket messages. Thus, when this
				 * one ipif is down, we must still notify so
				 * that the user knows the IFF_RUNNING status
				 * change. (If the first ipif is up, then
				 * we'll handle eventual routing socket
				 * notification via DAD completion.)
				 */
				if (ipif == ill->ill_ipif) {
					ip_rts_ifmsg(ill->ill_ipif,
					    RTSQ_DEFAULT);
				}
			}
		} else {
			/*
			 * After link down, we'll need to send a new routing
			 * message when the link comes back, so clear
			 * ipif_addr_ready.
			 */
			ipif->ipif_addr_ready = 0;
		}
	}

	/*
	 * If we've torn down links, then notify the user right away.
	 */
	if (!went_up)
		ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT);
}

static void
ipsq_delete(ipsq_t *ipsq)
{
	ipxop_t *ipx = ipsq->ipsq_xop;

	ipsq->ipsq_ipst = NULL;
	ASSERT(ipsq->ipsq_phyint == NULL);
	ASSERT(ipsq->ipsq_xop != NULL);
	ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipx->ipx_mphead == NULL);
	ASSERT(ipx->ipx_pending_mp == NULL);
	kmem_free(ipsq, sizeof (ipsq_t));
}

static int
ill_up_ipifs_on_ill(ill_t *ill, queue_t *q, mblk_t *mp)
{
	int err = 0;
	ipif_t *ipif;

	if (ill == NULL)
		return (0);

	ASSERT(IAM_WRITER_ILL(ill));
	ill->ill_up_ipifs = B_TRUE;
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		if (ipif->ipif_was_up) {
			if (!(ipif->ipif_flags & IPIF_UP))
				err = ipif_up(ipif, q, mp);
			ipif->ipif_was_up = B_FALSE;
			if (err != 0) {
				ASSERT(err == EINPROGRESS);
				return (err);
			}
		}
	}
	ill->ill_up_ipifs = B_FALSE;
	return (0);
}

/*
 * This function is called to bring up all the ipifs that were up before
 * bringing the ill down via ill_down_ipifs().
 */
int
ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp)
{
	int err;

	ASSERT(IAM_WRITER_ILL(ill));

	if (ill->ill_replumbing) {
		ill->ill_replumbing = 0;
		/*
		 * Send down REPLUMB_DONE notification followed by the
		 * BIND_REQ on the arp stream.
		 */
		if (!ill->ill_isv6)
			arp_send_replumb_conf(ill);
	}
	err = ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv4, q, mp);
	if (err != 0)
		return (err);

	return (ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv6, q, mp));
}

/*
 * Bring down any IPIF_UP ipifs on ill. If "logical" is B_TRUE, we bring
 * down the ipifs without sending DL_UNBIND_REQ to the driver.
 */
static void
ill_down_ipifs(ill_t *ill, boolean_t logical)
{
	ipif_t *ipif;

	ASSERT(IAM_WRITER_ILL(ill));

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		/*
		 * We go through the ipif_down logic even if the ipif
		 * is already down, since routes can be added based
		 * on down ipifs. Going through ipif_down once again
		 * will delete any IREs created based on these routes.
		 */
		if (ipif->ipif_flags & IPIF_UP)
			ipif->ipif_was_up = B_TRUE;

		if (logical) {
			(void) ipif_logical_down(ipif, NULL, NULL);
			ipif_non_duplicate(ipif);
			(void) ipif_down_tail(ipif);
		} else {
			(void) ipif_down(ipif, NULL, NULL);
		}
	}
}

/*
 * Redo source address selection. This makes IXAF_VERIFY_SOURCE take
 * a look again at valid source addresses.
 * This should be called each time after the set of source addresses has been
 * changed.
 */
void
ip_update_source_selection(ip_stack_t *ipst)
{
	/* We skip past SRC_GENERATION_VERIFY */
	if (atomic_add_32_nv(&ipst->ips_src_generation, 1) ==
	    SRC_GENERATION_VERIFY)
		atomic_add_32(&ipst->ips_src_generation, 1);
}
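
/*
 * The generation counter above is a common invalidation technique: writers
 * bump a global counter, and readers that cached a value alongside the
 * counter re-validate when the two no longer match. A minimal sketch of
 * the consumer side (names hypothetical, not the actual ip_xmit_attr_t
 * layout):
 *
 *	struct cache { in6_addr_t src; uint32_t gen; };
 *
 *	if (cache.gen != ipst->ips_src_generation) {
 *		cache.src = select_source_address(...);	// recompute
 *		cache.gen = ipst->ips_src_generation;
 *	}
 *
 * The extra bump past SRC_GENERATION_VERIFY keeps that sentinel value
 * reserved, so it can mean "always re-verify" to such consumers.
 */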

/*
 * Finish the group join started in ip_sioctl_groupname().
 */
/* ARGSUSED */
static void
ip_join_illgrps(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy)
{
	ill_t		*ill = q->q_ptr;
	phyint_t	*phyi = ill->ill_phyint;
	ipmp_grp_t	*grp = phyi->phyint_grp;
	ip_stack_t	*ipst = ill->ill_ipst;

	/* IS_UNDER_IPMP() won't work until ipmp_ill_join_illgrp() is called */
	ASSERT(!IS_IPMP(ill) && grp != NULL);
	ASSERT(IAM_WRITER_IPSQ(ipsq));

	if (phyi->phyint_illv4 != NULL) {
		rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
		VERIFY(grp->gr_pendv4-- > 0);
		rw_exit(&ipst->ips_ipmp_lock);
		ipmp_ill_join_illgrp(phyi->phyint_illv4, grp->gr_v4);
	}
	if (phyi->phyint_illv6 != NULL) {
		rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
		VERIFY(grp->gr_pendv6-- > 0);
		rw_exit(&ipst->ips_ipmp_lock);
		ipmp_ill_join_illgrp(phyi->phyint_illv6, grp->gr_v6);
	}
	freemsg(mp);
}

/*
 * Process an SIOCSLIFGROUPNAME request.
 */
/* ARGSUSED */
int
ip_sioctl_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
	struct lifreq	*lifr = ifreq;
	ill_t		*ill = ipif->ipif_ill;
	ip_stack_t	*ipst = ill->ill_ipst;
	phyint_t	*phyi = ill->ill_phyint;
	ipmp_grp_t	*grp = phyi->phyint_grp;
	mblk_t		*ipsq_mp;
	int		err = 0;

	/*
	 * Note that phyint_grp can only change here, where we're exclusive.
	 */
	ASSERT(IAM_WRITER_ILL(ill));

	if (ipif->ipif_id != 0 || ill->ill_usesrc_grp_next != NULL ||
	    (phyi->phyint_flags & PHYI_VIRTUAL))
		return (EINVAL);

	lifr->lifr_groupname[LIFGRNAMSIZ - 1] = '\0';

	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);

	/*
	 * If the name hasn't changed, there's nothing to do.
	 */
	if (grp != NULL && strcmp(grp->gr_name, lifr->lifr_groupname) == 0)
		goto unlock;

	/*
	 * Handle requests to rename an IPMP meta-interface.
	 *
	 * Note that creation of the IPMP meta-interface is handled in
	 * userland through the standard plumbing sequence. As part of the
	 * plumbing the IPMP meta-interface, its initial groupname is set to
	 * the name of the interface (see ipif_set_values_tail()).
	 */
	if (IS_IPMP(ill)) {
		err = ipmp_grp_rename(grp, lifr->lifr_groupname);
		goto unlock;
	}

	/*
	 * Handle requests to add or remove an IP interface from a group.
	 */
	if (lifr->lifr_groupname[0] != '\0') {			/* add */
		/*
		 * Moves are handled by first removing the interface from
		 * its existing group, and then adding it to another group.
		 * So, fail if it's already in a group.
		 */
		if (IS_UNDER_IPMP(ill)) {
			err = EALREADY;
			goto unlock;
		}

		grp = ipmp_grp_lookup(lifr->lifr_groupname, ipst);
		if (grp == NULL) {
			err = ENOENT;
			goto unlock;
		}

		/*
		 * Check if the phyint and its ills are suitable for
		 * inclusion into the group.
		 */
		if ((err = ipmp_grp_vet_phyint(grp, phyi)) != 0)
			goto unlock;

		/*
		 * Checks pass; join the group, and enqueue the remaining
		 * illgrp joins for when we've become part of the group xop
		 * and are exclusive across its IPSQs. Since qwriter_ip()
		 * requires an mblk_t to scribble on, and since `mp' will be
		 * freed as part of completing the ioctl, allocate another.
		 */
		if ((ipsq_mp = allocb(0, BPRI_MED)) == NULL) {
			err = ENOMEM;
			goto unlock;
		}

		/*
		 * Before we drop ipmp_lock, bump gr_pend* to ensure that the
		 * IPMP meta-interface ills needed by `phyi' cannot go away
		 * before ip_join_illgrps() is called back. See the comments
		 * in ip_sioctl_plink_ipmp() for more.
		 */
		if (phyi->phyint_illv4 != NULL)
			grp->gr_pendv4++;
		if (phyi->phyint_illv6 != NULL)
			grp->gr_pendv6++;

		rw_exit(&ipst->ips_ipmp_lock);

		ipmp_phyint_join_grp(phyi, grp);
		ill_refhold(ill);
		qwriter_ip(ill, ill->ill_rq, ipsq_mp, ip_join_illgrps,
		    SWITCH_OP, B_FALSE);
		return (0);
	} else {
		/*
		 * Request to remove the interface from a group. If the
		 * interface is not in a group, this trivially succeeds.
		 */
		rw_exit(&ipst->ips_ipmp_lock);
		if (IS_UNDER_IPMP(ill))
			ipmp_phyint_leave_grp(phyi);
		return (0);
	}
unlock:
	rw_exit(&ipst->ips_ipmp_lock);
	return (err);
}

/*
 * Process an SIOCGLIFBINDING request.
 */
/* ARGSUSED */
int
ip_sioctl_get_binding(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
	ill_t		*ill;
	struct lifreq	*lifr = ifreq;
	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;

	if (!IS_IPMP(ipif->ipif_ill))
		return (EINVAL);

	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
	if ((ill = ipif->ipif_bound_ill) == NULL)
		lifr->lifr_binding[0] = '\0';
	else
		(void) strlcpy(lifr->lifr_binding, ill->ill_name, LIFNAMSIZ);
	rw_exit(&ipst->ips_ipmp_lock);
	return (0);
}

/*
 * Process an SIOCGLIFGROUPNAME request.
 */
/* ARGSUSED */
int
ip_sioctl_get_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
	ipmp_grp_t	*grp;
	struct lifreq	*lifr = ifreq;
	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;

	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
	if ((grp = ipif->ipif_ill->ill_phyint->phyint_grp) == NULL)
		lifr->lifr_groupname[0] = '\0';
	else
		(void) strlcpy(lifr->lifr_groupname, grp->gr_name,
		    LIFGRNAMSIZ);
	rw_exit(&ipst->ips_ipmp_lock);
	return (0);
}
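
/*
 * Illustrative sketch (not compiled here): placing an interface into an
 * existing IPMP group from user space is just the SIOCSLIFGROUPNAME path
 * above. Interface and group names are hypothetical:
 *
 *	struct lifreq lifr;
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	(void) memset(&lifr, 0, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "net0", sizeof (lifr.lifr_name));
 *	(void) strlcpy(lifr.lifr_groupname, "ipmp0", LIFGRNAMSIZ);
 *	if (ioctl(s, SIOCSLIFGROUPNAME, &lifr) == -1)
 *		perror("SIOCSLIFGROUPNAME");	// e.g. ENOENT: no such group
 *
 * Setting lifr_groupname to the empty string removes the interface from
 * its group, mirroring the "remove" branch above.
 */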

/*
 * Process an SIOCGLIFGROUPINFO request.
 */
/* ARGSUSED */
int
ip_sioctl_groupinfo(ipif_t *dummy_ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *dummy)
{
	ipmp_grp_t	*grp;
	lifgroupinfo_t	*lifgr;
	ip_stack_t	*ipst = CONNQ_TO_IPST(q);

	/* ip_wput_nondata() verified mp->b_cont->b_cont */
	lifgr = (lifgroupinfo_t *)mp->b_cont->b_cont->b_rptr;
	lifgr->gi_grname[LIFGRNAMSIZ - 1] = '\0';

	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
	if ((grp = ipmp_grp_lookup(lifgr->gi_grname, ipst)) == NULL) {
		rw_exit(&ipst->ips_ipmp_lock);
		return (ENOENT);
	}
	ipmp_grp_info(grp, lifgr);
	rw_exit(&ipst->ips_ipmp_lock);
	return (0);
}

static void
ill_dl_down(ill_t *ill)
{
	DTRACE_PROBE2(ill__downup, char *, "ill_dl_down", ill_t *, ill);

	/*
	 * The ill is down; unbind but stay attached since we're still
	 * associated with a PPA. If we have negotiated DLPI capabilities
	 * with the data link service provider (IDS_OK) then reset them.
	 * The interval between unbinding and rebinding is potentially
	 * unbounded, hence we cannot assume things will be the same.
	 * The DLPI capabilities will be probed again when the data link
	 * is brought up.
	 */
	mblk_t	*mp = ill->ill_unbind_mp;

	ip1dbg(("ill_dl_down(%s)\n", ill->ill_name));

	if (!ill->ill_replumbing) {
		/* Free all ilms for this ill */
		update_conn_ill(ill, ill->ill_ipst);
	} else {
		ill_leave_multicast(ill);
	}

	ill->ill_unbind_mp = NULL;
	if (mp != NULL) {
		ip1dbg(("ill_dl_down: %s (%u) for %s\n",
		    dl_primstr(*(int *)mp->b_rptr), *(int *)mp->b_rptr,
		    ill->ill_name));
		mutex_enter(&ill->ill_lock);
		ill->ill_state_flags |= ILL_DL_UNBIND_IN_PROGRESS;
		mutex_exit(&ill->ill_lock);
		/*
		 * ip_rput does not pass up normal (M_PROTO) DLPI messages
		 * after ILL_CONDEMNED is set. So in the unplumb case, we call
		 * ill_capability_dld_disable() right away. If this is not
		 * an unplumb operation then the disable happens on receipt of
		 * the capab ack via ip_rput_dlpi_writer ->
		 * ill_capability_ack_thr. In both cases the order of
		 * the operations seen by DLD is capability disable followed
		 * by DL_UNBIND. Also the DLD capability disable needs a
		 * cv_wait'able context.
		 */
		if (ill->ill_state_flags & ILL_CONDEMNED)
			ill_capability_dld_disable(ill);
		ill_capability_reset(ill, B_FALSE);
		ill_dlpi_send(ill, mp);
	}
	mutex_enter(&ill->ill_lock);
	ill->ill_dl_up = 0;
	ill_nic_event_dispatch(ill, 0, NE_DOWN, NULL, 0);
	mutex_exit(&ill->ill_lock);
}

void
ill_dlpi_dispatch(ill_t *ill, mblk_t *mp)
{
	union DL_primitives *dlp;
	t_uscalar_t prim;
	boolean_t waitack = B_FALSE;

	ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);

	dlp = (union DL_primitives *)mp->b_rptr;
	prim = dlp->dl_primitive;

	ip1dbg(("ill_dlpi_dispatch: sending %s (%u) to %s\n",
	    dl_primstr(prim), prim, ill->ill_name));

	switch (prim) {
	case DL_PHYS_ADDR_REQ:
	{
		dl_phys_addr_req_t *dlpap = (dl_phys_addr_req_t *)mp->b_rptr;
		ill->ill_phys_addr_pend = dlpap->dl_addr_type;
		break;
	}
	case DL_BIND_REQ:
		mutex_enter(&ill->ill_lock);
		ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS;
		mutex_exit(&ill->ill_lock);
		break;
	}

	/*
	 * Except for the ACKs for the M_PCPROTO messages, all other ACKs
	 * are dropped by ip_rput() if ILL_CONDEMNED is set. Therefore
	 * we only wait for the ACK of the DL_UNBIND_REQ.
	 */
	mutex_enter(&ill->ill_lock);
	if (!(ill->ill_state_flags & ILL_CONDEMNED) ||
	    (prim == DL_UNBIND_REQ)) {
		ill->ill_dlpi_pending = prim;
		waitack = B_TRUE;
	}

	mutex_exit(&ill->ill_lock);
	DTRACE_PROBE3(ill__dlpi, char *, "ill_dlpi_dispatch",
	    char *, dl_primstr(prim), ill_t *, ill);
	putnext(ill->ill_wq, mp);

	/*
	 * There is no ack for DL_NOTIFY_CONF messages
	 */
	if (waitack && prim == DL_NOTIFY_CONF)
		ill_dlpi_done(ill, prim);
}

/*
 * Helper function for ill_dlpi_send().
 */
/* ARGSUSED */
static void
ill_dlpi_send_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
{
	ill_dlpi_send(q->q_ptr, mp);
}

/*
 * Send a DLPI control message to the driver but make sure there
 * is only one outstanding message. Uses ill_dlpi_pending to tell
 * when it must queue. ip_rput_dlpi_writer calls ill_dlpi_done()
 * when an ACK or a NAK is received to process the next queued message.
 */
void
ill_dlpi_send(ill_t *ill, mblk_t *mp)
{
	mblk_t **mpp;

	ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);

	/*
	 * To ensure that any DLPI requests for current exclusive operation
	 * are always completely sent before any DLPI messages for other
	 * operations, require writer access before enqueuing.
	 */
	if (!IAM_WRITER_ILL(ill)) {
		ill_refhold(ill);
		/* qwriter_ip() does the ill_refrele() */
		qwriter_ip(ill, ill->ill_wq, mp, ill_dlpi_send_writer,
		    NEW_OP, B_TRUE);
		return;
	}

	mutex_enter(&ill->ill_lock);
	if (ill->ill_dlpi_pending != DL_PRIM_INVAL) {
		/* Must queue message. Tail insertion */
		mpp = &ill->ill_dlpi_deferred;
		while (*mpp != NULL)
			mpp = &((*mpp)->b_next);

		ip1dbg(("ill_dlpi_send: deferring request for %s "
		    "while %s pending\n", ill->ill_name,
		    dl_primstr(ill->ill_dlpi_pending)));

		*mpp = mp;
		mutex_exit(&ill->ill_lock);
		return;
	}
	mutex_exit(&ill->ill_lock);
	ill_dlpi_dispatch(ill, mp);
}

void
ill_capability_send(ill_t *ill, mblk_t *mp)
{
	ill->ill_capab_pending_cnt++;
	ill_dlpi_send(ill, mp);
}

void
ill_capability_done(ill_t *ill)
{
	ASSERT(ill->ill_capab_pending_cnt != 0);

	ill_dlpi_done(ill, DL_CAPABILITY_REQ);

	ill->ill_capab_pending_cnt--;
	if (ill->ill_capab_pending_cnt == 0 &&
	    ill->ill_dlpi_capab_state == IDCS_OK)
		ill_capability_reset_alloc(ill);
}

/*
 * Send all deferred DLPI messages without waiting for their ACKs.
 */
void
ill_dlpi_send_deferred(ill_t *ill)
{
	mblk_t *mp, *nextmp;

	/*
	 * Clear ill_dlpi_pending so that the message is not queued in
	 * ill_dlpi_send().
	 */
	mutex_enter(&ill->ill_lock);
	ill->ill_dlpi_pending = DL_PRIM_INVAL;
	mp = ill->ill_dlpi_deferred;
	ill->ill_dlpi_deferred = NULL;
	mutex_exit(&ill->ill_lock);

	for (; mp != NULL; mp = nextmp) {
		nextmp = mp->b_next;
		mp->b_next = NULL;
		ill_dlpi_send(ill, mp);
	}
}

/*
 * Check if the DLPI primitive `prim' is pending; print a warning if not.
 */
boolean_t
ill_dlpi_pending(ill_t *ill, t_uscalar_t prim)
{
	t_uscalar_t pending;

	mutex_enter(&ill->ill_lock);
	if (ill->ill_dlpi_pending == prim) {
		mutex_exit(&ill->ill_lock);
		return (B_TRUE);
	}

	/*
	 * During teardown, ill_dlpi_dispatch() will send DLPI requests
	 * without waiting, so don't print any warnings in that case.
	 */
	if (ill->ill_state_flags & ILL_CONDEMNED) {
		mutex_exit(&ill->ill_lock);
		return (B_FALSE);
	}
	pending = ill->ill_dlpi_pending;
	mutex_exit(&ill->ill_lock);

	if (pending == DL_PRIM_INVAL) {
		(void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE,
		    "received unsolicited ack for %s on %s\n",
		    dl_primstr(prim), ill->ill_name);
	} else {
		(void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE,
		    "received unexpected ack for %s on %s (expecting %s)\n",
		    dl_primstr(prim), ill->ill_name, dl_primstr(pending));
	}
	return (B_FALSE);
}
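
/*
 * ill_dlpi_send_deferred() above uses the detach-then-drain pattern:
 * the whole deferred chain is unhooked while holding ill_lock, and only
 * then walked with the lock dropped, so each ill_dlpi_send() call runs
 * without the lock held. The shape of the technique, reduced to a sketch
 * with hypothetical names:
 *
 *	mutex_enter(&lock);
 *	work = list_head;		// grab the whole list...
 *	list_head = NULL;		// ...and empty it atomically
 *	mutex_exit(&lock);
 *
 *	for (; work != NULL; work = next) {	// then process unlocked
 *		next = work->b_next;
 *		work->b_next = NULL;
 *		process(work);
 *	}
 */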
12610 */ 12611 void 12612 ill_dlpi_done(ill_t *ill, t_uscalar_t prim) 12613 { 12614 mblk_t *mp; 12615 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 12616 ipxop_t *ipx = ipsq->ipsq_xop; 12617 12618 ASSERT(IAM_WRITER_IPSQ(ipsq)); 12619 mutex_enter(&ill->ill_lock); 12620 12621 ASSERT(prim != DL_PRIM_INVAL); 12622 ASSERT(ill->ill_dlpi_pending == prim); 12623 12624 ip1dbg(("ill_dlpi_done: %s has completed %s (%u)\n", ill->ill_name, 12625 dl_primstr(ill->ill_dlpi_pending), ill->ill_dlpi_pending)); 12626 12627 if ((mp = ill->ill_dlpi_deferred) == NULL) { 12628 ill->ill_dlpi_pending = DL_PRIM_INVAL; 12629 if (ipx->ipx_current_done) { 12630 mutex_enter(&ipx->ipx_lock); 12631 ipx->ipx_current_ipif = NULL; 12632 mutex_exit(&ipx->ipx_lock); 12633 } 12634 cv_signal(&ill->ill_cv); 12635 mutex_exit(&ill->ill_lock); 12636 return; 12637 } 12638 12639 ill->ill_dlpi_deferred = mp->b_next; 12640 mp->b_next = NULL; 12641 mutex_exit(&ill->ill_lock); 12642 12643 ill_dlpi_dispatch(ill, mp); 12644 } 12645 12646 /* 12647 * Queue a (multicast) DLPI control message to be sent to the driver by 12648 * later calling ill_dlpi_send_queued. 12649 * We queue them while holding a lock (ill_mcast_lock) to ensure that they 12650 * are sent in order i.e., prevent a DL_DISABMULTI_REQ and DL_ENABMULTI_REQ 12651 * for the same group to race. 12652 * We send DLPI control messages in order using ill_lock. 12653 * For IPMP we should be called on the cast_ill. 12654 */ 12655 void 12656 ill_dlpi_queue(ill_t *ill, mblk_t *mp) 12657 { 12658 mblk_t **mpp; 12659 12660 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); 12661 12662 mutex_enter(&ill->ill_lock); 12663 /* Must queue message. Tail insertion */ 12664 mpp = &ill->ill_dlpi_deferred; 12665 while (*mpp != NULL) 12666 mpp = &((*mpp)->b_next); 12667 12668 *mpp = mp; 12669 mutex_exit(&ill->ill_lock); 12670 } 12671 12672 /* 12673 * Send the messages that were queued. Make sure there is only 12674 * one outstanding message. ip_rput_dlpi_writer calls ill_dlpi_done() 12675 * when an ACK or a NAK is received to process the next queued message. 12676 * For IPMP we are called on the upper ill, but when send what is queued 12677 * on the cast_ill. 12678 */ 12679 void 12680 ill_dlpi_send_queued(ill_t *ill) 12681 { 12682 mblk_t *mp; 12683 union DL_primitives *dlp; 12684 t_uscalar_t prim; 12685 ill_t *release_ill = NULL; 12686 12687 if (IS_IPMP(ill)) { 12688 /* On the upper IPMP ill. */ 12689 release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp); 12690 if (release_ill == NULL) { 12691 /* Avoid ever sending anything down to the ipmpstub */ 12692 return; 12693 } 12694 ill = release_ill; 12695 } 12696 mutex_enter(&ill->ill_lock); 12697 while ((mp = ill->ill_dlpi_deferred) != NULL) { 12698 if (ill->ill_dlpi_pending != DL_PRIM_INVAL) { 12699 /* Can't send. Somebody else will send it */ 12700 mutex_exit(&ill->ill_lock); 12701 goto done; 12702 } 12703 ill->ill_dlpi_deferred = mp->b_next; 12704 mp->b_next = NULL; 12705 if (!ill->ill_dl_up) { 12706 /* 12707 * Nobody there. All multicast addresses will be 12708 * re-joined when we get the DL_BIND_ACK bringing the 12709 * interface up. 
12710 */ 12711 freemsg(mp); 12712 continue; 12713 } 12714 dlp = (union DL_primitives *)mp->b_rptr; 12715 prim = dlp->dl_primitive; 12716 12717 if (!(ill->ill_state_flags & ILL_CONDEMNED) || 12718 (prim == DL_UNBIND_REQ)) { 12719 ill->ill_dlpi_pending = prim; 12720 } 12721 mutex_exit(&ill->ill_lock); 12722 12723 DTRACE_PROBE3(ill__dlpi, char *, "ill_dlpi_send_queued", 12724 char *, dl_primstr(prim), ill_t *, ill); 12725 putnext(ill->ill_wq, mp); 12726 mutex_enter(&ill->ill_lock); 12727 } 12728 mutex_exit(&ill->ill_lock); 12729 done: 12730 if (release_ill != NULL) 12731 ill_refrele(release_ill); 12732 } 12733 12734 /* 12735 * Queue an IP (IGMP/MLD) message to be sent by IP from 12736 * ill_mcast_send_queued 12737 * We queue them while holding a lock (ill_mcast_lock) to ensure that they 12738 * are sent in order i.e., prevent a IGMP leave and IGMP join for the same 12739 * group to race. 12740 * We send them in order using ill_lock. 12741 * For IPMP we are called on the upper ill, but we queue on the cast_ill. 12742 */ 12743 void 12744 ill_mcast_queue(ill_t *ill, mblk_t *mp) 12745 { 12746 mblk_t **mpp; 12747 ill_t *release_ill = NULL; 12748 12749 ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock)); 12750 12751 if (IS_IPMP(ill)) { 12752 /* On the upper IPMP ill. */ 12753 release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp); 12754 if (release_ill == NULL) { 12755 /* Discard instead of queuing for the ipmp interface */ 12756 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 12757 ip_drop_output("ipIfStatsOutDiscards - no cast_ill", 12758 mp, ill); 12759 freemsg(mp); 12760 return; 12761 } 12762 ill = release_ill; 12763 } 12764 12765 mutex_enter(&ill->ill_lock); 12766 /* Must queue message. Tail insertion */ 12767 mpp = &ill->ill_mcast_deferred; 12768 while (*mpp != NULL) 12769 mpp = &((*mpp)->b_next); 12770 12771 *mpp = mp; 12772 mutex_exit(&ill->ill_lock); 12773 if (release_ill != NULL) 12774 ill_refrele(release_ill); 12775 } 12776 12777 /* 12778 * Send the IP packets that were queued by ill_mcast_queue. 12779 * These are IGMP/MLD packets. 12780 * 12781 * For IPMP we are called on the upper ill, but when send what is queued 12782 * on the cast_ill. 12783 * 12784 * Request loopback of the report if we are acting as a multicast 12785 * router, so that the process-level routing demon can hear it. 12786 * This will run multiple times for the same group if there are members 12787 * on the same group for multiple ipif's on the same ill. The 12788 * igmp_input/mld_input code will suppress this due to the loopback thus we 12789 * always loopback membership report. 12790 * 12791 * We also need to make sure that this does not get load balanced 12792 * by IPMP. We do this by passing an ill to ip_output_simple. 12793 */ 12794 void 12795 ill_mcast_send_queued(ill_t *ill) 12796 { 12797 mblk_t *mp; 12798 ip_xmit_attr_t ixas; 12799 ill_t *release_ill = NULL; 12800 12801 if (IS_IPMP(ill)) { 12802 /* On the upper IPMP ill. */ 12803 release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp); 12804 if (release_ill == NULL) { 12805 /* 12806 * We should have no messages on the ipmp interface 12807 * but no point in trying to send them. 12808 */ 12809 return; 12810 } 12811 ill = release_ill; 12812 } 12813 bzero(&ixas, sizeof (ixas)); 12814 ixas.ixa_zoneid = ALL_ZONES; 12815 ixas.ixa_cred = kcred; 12816 ixas.ixa_cpid = NOPID; 12817 ixas.ixa_tsl = NULL; 12818 /* 12819 * Here we set ixa_ifindex. If IPMP it will be the lower ill which 12820 * makes ip_select_route pick the IRE_MULTICAST for the cast_ill. 
12821 * That is necessary to handle IGMP/MLD snooping switches. 12822 */ 12823 ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex; 12824 ixas.ixa_ipst = ill->ill_ipst; 12825 12826 mutex_enter(&ill->ill_lock); 12827 while ((mp = ill->ill_mcast_deferred) != NULL) { 12828 ill->ill_mcast_deferred = mp->b_next; 12829 mp->b_next = NULL; 12830 if (!ill->ill_dl_up) { 12831 /* 12832 * Nobody there. Just drop the ip packets. 12833 * IGMP/MLD will resend later, if this is a replumb. 12834 */ 12835 freemsg(mp); 12836 continue; 12837 } 12838 mutex_enter(&ill->ill_phyint->phyint_lock); 12839 if (IS_UNDER_IPMP(ill) && !ipmp_ill_is_active(ill)) { 12840 /* 12841 * When the ill is getting deactivated, we only want to 12842 * send the DLPI messages, so drop IGMP/MLD packets. 12843 * DLPI messages are handled by ill_dlpi_send_queued() 12844 */ 12845 mutex_exit(&ill->ill_phyint->phyint_lock); 12846 freemsg(mp); 12847 continue; 12848 } 12849 mutex_exit(&ill->ill_phyint->phyint_lock); 12850 mutex_exit(&ill->ill_lock); 12851 12852 /* Check whether we are sending IPv4 or IPv6. */ 12853 if (ill->ill_isv6) { 12854 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 12855 12856 ixas.ixa_multicast_ttl = ip6h->ip6_hops; 12857 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6; 12858 } else { 12859 ipha_t *ipha = (ipha_t *)mp->b_rptr; 12860 12861 ixas.ixa_multicast_ttl = ipha->ipha_ttl; 12862 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; 12863 ixas.ixa_flags &= ~IXAF_SET_ULP_CKSUM; 12864 } 12865 12866 ixas.ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_SOURCE; 12867 (void) ip_output_simple(mp, &ixas); 12868 ixa_cleanup(&ixas); 12869 12870 mutex_enter(&ill->ill_lock); 12871 } 12872 mutex_exit(&ill->ill_lock); 12873 12874 done: 12875 if (release_ill != NULL) 12876 ill_refrele(release_ill); 12877 } 12878 12879 /* 12880 * Take down a specific interface, but don't lose any information about it. 12881 * (Always called as writer.) 12882 * This function goes through the down sequence even if the interface is 12883 * already down. There are 2 reasons. 12884 * a. Currently we permit interface routes that depend on down interfaces 12885 * to be added. This behaviour itself is questionable. However it appears 12886 * that both Solaris and 4.3 BSD have exhibited this behaviour for a long 12887 * time. We go thru the cleanup in order to remove these routes. 12888 * b. The bringup of the interface could fail in ill_dl_up i.e. we get 12889 * DL_ERROR_ACK in response to the DL_BIND request. The interface is 12890 * down, but we need to cleanup i.e. do ill_dl_down and 12891 * ip_rput_dlpi_writer (DL_ERROR_ACK) -> ipif_down. 12892 * 12893 * IP-MT notes: 12894 * 12895 * Model of reference to interfaces. 12896 * 12897 * The following members in ipif_t track references to the ipif. 12898 * int ipif_refcnt; Active reference count 12899 * 12900 * The following members in ill_t track references to the ill. 12901 * int ill_refcnt; active refcnt 12902 * uint_t ill_ire_cnt; Number of ires referencing ill 12903 * uint_t ill_ncec_cnt; Number of ncecs referencing ill 12904 * uint_t ill_nce_cnt; Number of nces referencing ill 12905 * uint_t ill_ilm_cnt; Number of ilms referencing ill 12906 * 12907 * Reference to an ipif or ill can be obtained in any of the following ways. 12908 * 12909 * Through the lookup functions ipif_lookup_* / ill_lookup_* functions 12910 * Pointers to ipif / ill from other data structures viz ire and conn. 12911 * Implicit reference to the ipif / ill by holding a reference to the ire. 12912 * 12913 * The ipif/ill lookup functions return a reference held ipif / ill. 
12914 * ipif_refcnt and ill_refcnt track the reference counts respectively. 12915 * This is a purely dynamic reference count associated with threads holding 12916 * references to the ipif / ill. Pointers from other structures do not 12917 * count towards this reference count. 12918 * 12919 * ill_ire_cnt is the number of ire's associated with the 12920 * ill. This is incremented whenever a new ire is created referencing the 12921 * ill. This is done atomically inside ire_add_v[46] where the ire is 12922 * actually added to the ire hash table. The count is decremented in 12923 * ire_inactive where the ire is destroyed. 12924 * 12925 * ill_ncec_cnt is the number of ncec's referencing the ill thru ncec_ill. 12926 * This is incremented atomically in 12927 * ndp_add_v4()/ndp_add_v6() where the nce is actually added to the 12928 * table. Similarly it is decremented in ncec_inactive() where the ncec 12929 * is destroyed. 12930 * 12931 * ill_nce_cnt is the number of nce's referencing the ill thru nce_ill. This is 12932 * incremented atomically in nce_add() where the nce is actually added to the 12933 * ill_nce. Similarly it is decremented in nce_inactive() where the nce 12934 * is destroyed. 12935 * 12936 * ill_ilm_cnt is the ilm's reference to the ill. It is incremented in 12937 * ilm_add() and decremented before the ilm is freed in ilm_delete(). 12938 * 12939 * Flow of ioctls involving interface down/up 12940 * 12941 * The following is the sequence of an attempt to set some critical flags on an 12942 * up interface. 12943 * ip_sioctl_flags 12944 * ipif_down 12945 * wait for ipif to be quiescent 12946 * ipif_down_tail 12947 * ip_sioctl_flags_tail 12948 * 12949 * All set ioctls that involve down/up sequence would have a skeleton similar 12950 * to the above. All the *tail functions are called after the refcounts have 12951 * dropped to the appropriate values. 12952 * 12953 * SIOC ioctls during the IPIF_CHANGING interval. 12954 * 12955 * Threads handling SIOC set ioctls serialize on the squeue, but this 12956 * is not done for SIOC get ioctls. Since a set ioctl can cause several 12957 * steps of internal changes to the state, some of which are visible in 12958 * ipif_flags (such as IFF_UP being cleared and later set), and we want 12959 * the set ioctl to be atomic related to the get ioctls, the SIOC get code 12960 * will wait and restart ioctls if IPIF_CHANGING is set. The mblk is then 12961 * enqueued in the ipsq and the operation is restarted by ipsq_exit() when 12962 * the current exclusive operation completes. The IPIF_CHANGING check 12963 * and enqueue is atomic using the ill_lock and ipsq_lock. The 12964 * lookup is done holding the ill_lock. Hence the ill/ipif state flags can't 12965 * change while the ill_lock is held. Before dropping the ill_lock we acquire 12966 * the ipsq_lock and call ipsq_enq. This ensures that ipsq_exit can't finish 12967 * until we release the ipsq_lock, even though the ill/ipif state flags 12968 * can change after we drop the ill_lock. 
12969 */ 12970 int 12971 ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) 12972 { 12973 ill_t *ill = ipif->ipif_ill; 12974 conn_t *connp; 12975 boolean_t success; 12976 boolean_t ipif_was_up = B_FALSE; 12977 ip_stack_t *ipst = ill->ill_ipst; 12978 12979 ASSERT(IAM_WRITER_IPIF(ipif)); 12980 12981 ip1dbg(("ipif_down(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 12982 12983 DTRACE_PROBE3(ipif__downup, char *, "ipif_down", 12984 ill_t *, ill, ipif_t *, ipif); 12985 12986 if (ipif->ipif_flags & IPIF_UP) { 12987 mutex_enter(&ill->ill_lock); 12988 ipif->ipif_flags &= ~IPIF_UP; 12989 ASSERT(ill->ill_ipif_up_count > 0); 12990 --ill->ill_ipif_up_count; 12991 mutex_exit(&ill->ill_lock); 12992 ipif_was_up = B_TRUE; 12993 /* Update status in SCTP's list */ 12994 sctp_update_ipif(ipif, SCTP_IPIF_DOWN); 12995 ill_nic_event_dispatch(ipif->ipif_ill, 12996 MAP_IPIF_ID(ipif->ipif_id), NE_LIF_DOWN, NULL, 0); 12997 } 12998 12999 /* 13000 * Blow away memberships we established in ipif_multicast_up(). 13001 */ 13002 ipif_multicast_down(ipif); 13003 13004 /* 13005 * Remove from the mapping for __sin6_src_id. We insert only 13006 * when the address is not INADDR_ANY. As IPv4 addresses are 13007 * stored as mapped addresses, we need to check for mapped 13008 * INADDR_ANY also. 13009 */ 13010 if (ipif_was_up && !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) && 13011 !IN6_IS_ADDR_V4MAPPED_ANY(&ipif->ipif_v6lcl_addr) && 13012 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 13013 int err; 13014 13015 err = ip_srcid_remove(&ipif->ipif_v6lcl_addr, 13016 ipif->ipif_zoneid, ipst); 13017 if (err != 0) { 13018 ip0dbg(("ipif_down: srcid_remove %d\n", err)); 13019 } 13020 } 13021 13022 if (ipif_was_up) { 13023 /* only delete if we'd added ire's before */ 13024 if (ipif->ipif_isv6) 13025 ipif_delete_ires_v6(ipif); 13026 else 13027 ipif_delete_ires_v4(ipif); 13028 } 13029 13030 if (ipif_was_up && ill->ill_ipif_up_count == 0) { 13031 /* 13032 * Since the interface is now down, it may have just become 13033 * inactive. Note that this needs to be done even for a 13034 * lll_logical_down(), or ARP entries will not get correctly 13035 * restored when the interface comes back up. 13036 */ 13037 if (IS_UNDER_IPMP(ill)) 13038 ipmp_ill_refresh_active(ill); 13039 } 13040 13041 /* 13042 * neighbor-discovery or arp entries for this interface. The ipif 13043 * has to be quiesced, so we walk all the nce's and delete those 13044 * that point at the ipif->ipif_ill. At the same time, we also 13045 * update IPMP so that ipifs for data addresses are unbound. We dont 13046 * call ipif_arp_down to DL_UNBIND the arp stream itself here, but defer 13047 * that for ipif_down_tail() 13048 */ 13049 ipif_nce_down(ipif); 13050 13051 /* 13052 * If this is the last ipif on the ill, we also need to remove 13053 * any IREs with ire_ill set. Otherwise ipif_is_quiescent() will 13054 * never succeed. 13055 */ 13056 if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0) 13057 ire_walk_ill(0, 0, ill_downi, ill, ill); 13058 13059 /* 13060 * Walk all CONNs that can have a reference on an ire for this 13061 * ipif (we actually walk all that now have stale references). 13062 */ 13063 ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst); 13064 13065 /* 13066 * If mp is NULL the caller will wait for the appropriate refcnt. 13067 * Eg. 
ip_sioctl_removeif -> ipif_free -> ipif_down 13068 * and ill_delete -> ipif_free -> ipif_down 13069 */ 13070 if (mp == NULL) { 13071 ASSERT(q == NULL); 13072 return (0); 13073 } 13074 13075 if (CONN_Q(q)) { 13076 connp = Q_TO_CONN(q); 13077 mutex_enter(&connp->conn_lock); 13078 } else { 13079 connp = NULL; 13080 } 13081 mutex_enter(&ill->ill_lock); 13082 /* 13083 * Are there any ire's pointing to this ipif that are still active ? 13084 * If this is the last ipif going down, are there any ire's pointing 13085 * to this ill that are still active ? 13086 */ 13087 if (ipif_is_quiescent(ipif)) { 13088 mutex_exit(&ill->ill_lock); 13089 if (connp != NULL) 13090 mutex_exit(&connp->conn_lock); 13091 return (0); 13092 } 13093 13094 ip1dbg(("ipif_down: need to wait, adding pending mp %s ill %p", 13095 ill->ill_name, (void *)ill)); 13096 /* 13097 * Enqueue the mp atomically in ipsq_pending_mp. When the refcount 13098 * drops down, the operation will be restarted by ipif_ill_refrele_tail 13099 * which in turn is called by the last refrele on the ipif/ill/ire. 13100 */ 13101 success = ipsq_pending_mp_add(connp, ipif, q, mp, IPIF_DOWN); 13102 if (!success) { 13103 /* The conn is closing. So just return */ 13104 ASSERT(connp != NULL); 13105 mutex_exit(&ill->ill_lock); 13106 mutex_exit(&connp->conn_lock); 13107 return (EINTR); 13108 } 13109 13110 mutex_exit(&ill->ill_lock); 13111 if (connp != NULL) 13112 mutex_exit(&connp->conn_lock); 13113 return (EINPROGRESS); 13114 } 13115 13116 int 13117 ipif_down_tail(ipif_t *ipif) 13118 { 13119 ill_t *ill = ipif->ipif_ill; 13120 int err = 0; 13121 13122 DTRACE_PROBE3(ipif__downup, char *, "ipif_down_tail", 13123 ill_t *, ill, ipif_t *, ipif); 13124 13125 /* 13126 * Skip any loopback interface (null wq). 13127 * If this is the last logical interface on the ill 13128 * have ill_dl_down tell the driver we are gone (unbind) 13129 * Note that lun 0 can ipif_down even though 13130 * there are other logical units that are up. 13131 * This occurs e.g. when we change a "significant" IFF_ flag. 13132 */ 13133 if (ill->ill_wq != NULL && !ill->ill_logical_down && 13134 ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 && 13135 ill->ill_dl_up) { 13136 ill_dl_down(ill); 13137 } 13138 if (!ipif->ipif_isv6) 13139 err = ipif_arp_down(ipif); 13140 13141 ill->ill_logical_down = 0; 13142 13143 ip_rts_ifmsg(ipif, RTSQ_DEFAULT); 13144 ip_rts_newaddrmsg(RTM_DELETE, 0, ipif, RTSQ_DEFAULT); 13145 return (err); 13146 } 13147 13148 /* 13149 * Bring interface logically down without bringing the physical interface 13150 * down e.g. when the netmask is changed. This avoids long lasting link 13151 * negotiations between an ethernet interface and a certain switches. 13152 */ 13153 static int 13154 ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp) 13155 { 13156 DTRACE_PROBE3(ipif__downup, char *, "ipif_logical_down", 13157 ill_t *, ipif->ipif_ill, ipif_t *, ipif); 13158 13159 /* 13160 * The ill_logical_down flag is a transient flag. It is set here 13161 * and is cleared once the down has completed in ipif_down_tail. 13162 * This flag does not indicate whether the ill stream is in the 13163 * DL_BOUND state with the driver. Instead this flag is used by 13164 * ipif_down_tail to determine whether to DL_UNBIND the stream with 13165 * the driver. The state of the ill stream i.e. whether it is 13166 * DL_BOUND with the driver or not is indicated by the ill_dl_up flag. 
13167 	 */
13168 	ipif->ipif_ill->ill_logical_down = 1;
13169 	return (ipif_down(ipif, q, mp));
13170 }
13171
13172 /*
13173  * Initiate the deallocation of an IPIF. Always called as writer. Called by
13174  * ill_delete or ip_sioctl_removeif.
13175  */
13176 static void
13177 ipif_free(ipif_t *ipif)
13178 {
13179 	ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
13180
13181 	ASSERT(IAM_WRITER_IPIF(ipif));
13182
13183 	if (ipif->ipif_recovery_id != 0)
13184 		(void) untimeout(ipif->ipif_recovery_id);
13185 	ipif->ipif_recovery_id = 0;
13186
13187 	/*
13188 	 * Take down the interface. We can be called either from ill_delete
13189 	 * or from ip_sioctl_removeif.
13190 	 */
13191 	(void) ipif_down(ipif, NULL, NULL);
13192
13193 	/*
13194 	 * Now that the interface is down, there's no chance it can still
13195 	 * become a duplicate. Cancel any timer that may have been set while
13196 	 * tearing down.
13197 	 */
13198 	if (ipif->ipif_recovery_id != 0)
13199 		(void) untimeout(ipif->ipif_recovery_id);
13200 	ipif->ipif_recovery_id = 0;
13201
13202 	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
13203 	/* Remove pointers to this ill in the multicast routing tables */
13204 	reset_mrt_vif_ipif(ipif);
13205 	/* If necessary, clear the cached source ipif rotor. */
13206 	if (ipif->ipif_ill->ill_src_ipif == ipif)
13207 		ipif->ipif_ill->ill_src_ipif = NULL;
13208 	rw_exit(&ipst->ips_ill_g_lock);
13209 }
13210
13211 static void
13212 ipif_free_tail(ipif_t *ipif)
13213 {
13214 	ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
13215
13216 	/*
13217 	 * Need to hold both ill_g_lock and ill_lock while
13218 	 * inserting or removing an ipif from the linked list
13219 	 * of ipifs hanging off the ill.
13220 	 */
13221 	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
13222
13223 #ifdef DEBUG
13224 	ipif_trace_cleanup(ipif);
13225 #endif
13226
13227 	/* Ask SCTP to take it out of its list */
13228 	sctp_update_ipif(ipif, SCTP_IPIF_REMOVE);
13229 	ip_rts_newaddrmsg(RTM_FREEADDR, 0, ipif, RTSQ_DEFAULT);
13230
13231 	/* Get it out of the ILL interface list. */
13232 	ipif_remove(ipif);
13233 	rw_exit(&ipst->ips_ill_g_lock);
13234
13235 	ASSERT(!(ipif->ipif_flags & (IPIF_UP | IPIF_DUPLICATE)));
13236 	ASSERT(ipif->ipif_recovery_id == 0);
13237 	ASSERT(ipif->ipif_ire_local == NULL);
13238 	ASSERT(ipif->ipif_ire_if == NULL);
13239
13240 	/* Free the memory. */
13241 	mi_free(ipif);
13242 }
13243
13244 /*
13245  * Sets `buf' to an ipif name of the form "ill_name:id", or "ill_name" if "id"
13246  * is zero.
13247  */
13248 void
13249 ipif_get_name(const ipif_t *ipif, char *buf, int len)
13250 {
13251 	char	lbuf[LIFNAMSIZ];
13252 	char	*name;
13253 	size_t	name_len;
13254
13255 	buf[0] = '\0';
13256 	name = ipif->ipif_ill->ill_name;
13257 	name_len = ipif->ipif_ill->ill_name_length;
13258 	if (ipif->ipif_id != 0) {
13259 		(void) sprintf(lbuf, "%s%c%d", name, IPIF_SEPARATOR_CHAR,
13260 		    ipif->ipif_id);
13261 		name = lbuf;
13262 		name_len = mi_strlen(name) + 1;
13263 	}
13264 	len -= 1;
13265 	buf[len] = '\0';
13266 	len = MIN(len, name_len);
13267 	bcopy(name, buf, len);
13268 }
13269
13270 /*
13271  * Sets `buf' to an ill name.
13272  */
13273 void
13274 ill_get_name(const ill_t *ill, char *buf, int len)
13275 {
13276 	char	*name;
13277 	size_t	name_len;
13278
13279 	name = ill->ill_name;
13280 	name_len = ill->ill_name_length;
13281 	len -= 1;
13282 	buf[len] = '\0';
13283 	len = MIN(len, name_len);
13284 	bcopy(name, buf, len);
13285 }
13286
13287 /*
13288  * Find an IPIF based on the name passed in. Names can be of the form <phys>
13289  * (e.g., le0) or <phys>:<#> (e.g., le0:1). When there is no colon, the
13290  * implied unit id is zero. <phys> must correspond to the name of an ILL.
13291  * (May be called as writer.)
13292  */
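/*
 * Illustrative examples of the accepted name forms (a sketch, using a
 * hypothetical interface "hme0"):
 *
 *	"hme0"		ill "hme0", implied ipif id 0
 *	"hme0:3"	ill "hme0", ipif id 3
 *	"hme0:0"	accepted, to keep existing scripts working
 *	"hme0:03"	rejected below: multi-digit id with a leading zero
 */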
13293 static ipif_t *
13294 ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc,
13295     boolean_t *exists, boolean_t isv6, zoneid_t zoneid, ip_stack_t *ipst)
13296 {
13297 	char	*cp;
13298 	char	*endp;
13299 	long	id;
13300 	ill_t	*ill;
13301 	ipif_t	*ipif;
13302 	uint_t	ire_type;
13303 	boolean_t did_alloc = B_FALSE;
13304
13305 	/*
13306 	 * If the caller wants us to create the ipif, make sure we have a
13307 	 * valid zoneid.
13308 	 */
13309 	ASSERT(!do_alloc || zoneid != ALL_ZONES);
13310
13311 	if (namelen == 0) {
13312 		return (NULL);
13313 	}
13314
13315 	*exists = B_FALSE;
13316 	/* Look for a colon in the name. */
13317 	endp = &name[namelen];
13318 	for (cp = endp; --cp > name; ) {
13319 		if (*cp == IPIF_SEPARATOR_CHAR)
13320 			break;
13321 	}
13322
13323 	if (*cp == IPIF_SEPARATOR_CHAR) {
13324 		/*
13325 		 * Reject any non-decimal aliases for logical
13326 		 * interfaces. Aliases with leading zeroes
13327 		 * are also rejected as they introduce ambiguity
13328 		 * in the naming of the interfaces.
13329 		 * In order to conform to existing semantics,
13330 		 * and to not break any programs/scripts relying
13331 		 * on that behaviour, if<0>:0 is considered to be
13332 		 * a valid interface.
13333 		 *
13334 		 * If the alias has two or more digits and the first
13335 		 * is zero, fail.
13336 		 */
13337 		if (&cp[2] < endp && cp[1] == '0') {
13338 			return (NULL);
13339 		}
13340 	}
13341
13342 	if (cp <= name) {
13343 		cp = endp;
13344 	} else {
13345 		*cp = '\0';
13346 	}
13347
13348 	/*
13349 	 * Look up the ILL, based on the portion of the name
13350 	 * before the colon. ill_lookup_on_name returns a held ill.
13351 	 * did_alloc tells us whether the lookup had to allocate the
13352 	 * ill afresh (rather than finding an existing one).
13353 	 */
13354 	ill = ill_lookup_on_name(name, do_alloc, isv6,
13355 	    &did_alloc, ipst);
13356 	if (cp != endp)
13357 		*cp = IPIF_SEPARATOR_CHAR;
13358 	if (ill == NULL)
13359 		return (NULL);
13360
13361 	/* Establish the unit number in the name. */
13362 	id = 0;
13363 	if (cp < endp && *endp == '\0') {
13364 		/* If there was a colon, the unit number follows. */
13365 		cp++;
13366 		if (ddi_strtol(cp, NULL, 0, &id) != 0) {
13367 			ill_refrele(ill);
13368 			return (NULL);
13369 		}
13370 	}
13371
13372 	mutex_enter(&ill->ill_lock);
13373 	/* Now see if there is an IPIF with this unit number. */
13374 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
13375 		if (ipif->ipif_id == id) {
13376 			if (zoneid != ALL_ZONES &&
13377 			    zoneid != ipif->ipif_zoneid &&
13378 			    ipif->ipif_zoneid != ALL_ZONES) {
13379 				mutex_exit(&ill->ill_lock);
13380 				ill_refrele(ill);
13381 				return (NULL);
13382 			}
13383 			if (IPIF_CAN_LOOKUP(ipif)) {
13384 				ipif_refhold_locked(ipif);
13385 				mutex_exit(&ill->ill_lock);
13386 				if (!did_alloc)
13387 					*exists = B_TRUE;
13388 				/*
13389 				 * Drop locks before calling ill_refrele
13390 				 * since it can potentially call into
13391 				 * ipif_ill_refrele_tail which can end up
13392 				 * in trying to acquire any lock.
13393 				 */
13394 				ill_refrele(ill);
13395 				return (ipif);
13396 			}
13397 		}
13398 	}
13399
13400 	if (!do_alloc) {
13401 		mutex_exit(&ill->ill_lock);
13402 		ill_refrele(ill);
13403 		return (NULL);
13404 	}
13405
13406 	/*
13407 	 * If none found, atomically allocate and return a new one.
13408 	 * Historically, we used IRE_LOOPBACK only for lun 0, and IRE_LOCAL
13409 	 * to support "receive only" use of lo0:1 etc. as is still done
13410 	 * below as an initial guess.
13411 	 * However, this is now likely to be overridden later in ipif_up_done()
13412 	 * when we know for sure what address has been configured on the
13413 	 * interface, since we might have more than one loopback interface
13414 	 * with a loopback address, e.g. in the case of zones, and all the
13415 	 * interfaces with loopback addresses need to be marked IRE_LOOPBACK.
13416 	 */
13417 	if (ill->ill_net_type == IRE_LOOPBACK && id == 0)
13418 		ire_type = IRE_LOOPBACK;
13419 	else
13420 		ire_type = IRE_LOCAL;
13421 	ipif = ipif_allocate(ill, id, ire_type, B_TRUE, B_TRUE, NULL);
13422 	if (ipif != NULL)
13423 		ipif_refhold_locked(ipif);
13424 	mutex_exit(&ill->ill_lock);
13425 	ill_refrele(ill);
13426 	return (ipif);
13427 }
13428
13429 /*
13430  * This routine is called whenever a new address comes up on an ipif. If
13431  * we are configured to respond to address mask requests, then we are supposed
13432  * to broadcast an address mask reply at this time. This routine is also
13433  * called if we are already up, but a netmask change is made. This is legal
13434  * but might not make the system manager very popular. (May be called
13435  * as writer.)
13436  */
13437 void
13438 ipif_mask_reply(ipif_t *ipif)
13439 {
13440 	icmph_t	*icmph;
13441 	ipha_t	*ipha;
13442 	mblk_t	*mp;
13443 	ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
13444 	ip_xmit_attr_t ixas;
13445
13446 #define	REPLY_LEN	(sizeof (icmp_ipha) + sizeof (icmph_t) + IP_ADDR_LEN)
13447
13448 	if (!ipst->ips_ip_respond_to_address_mask_broadcast)
13449 		return;
13450
13451 	/* ICMP mask reply is IPv4 only */
13452 	ASSERT(!ipif->ipif_isv6);
13453 	/* ICMP mask reply is not for a loopback interface */
13454 	ASSERT(ipif->ipif_ill->ill_wq != NULL);
13455
13456 	if (ipif->ipif_lcl_addr == INADDR_ANY)
13457 		return;
13458
13459 	mp = allocb(REPLY_LEN, BPRI_HI);
13460 	if (mp == NULL)
13461 		return;
13462 	mp->b_wptr = mp->b_rptr + REPLY_LEN;
13463
13464 	ipha = (ipha_t *)mp->b_rptr;
13465 	bzero(ipha, REPLY_LEN);
13466 	*ipha = icmp_ipha;
13467 	ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl;
13468 	ipha->ipha_src = ipif->ipif_lcl_addr;
13469 	ipha->ipha_dst = ipif->ipif_brd_addr;
13470 	ipha->ipha_length = htons(REPLY_LEN);
13471 	ipha->ipha_ident = 0;
13472
13473 	icmph = (icmph_t *)&ipha[1];
13474 	icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY;
13475 	bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN);
13476 	icmph->icmph_checksum = IP_CSUM(mp, sizeof (ipha_t), 0);
13477
13478 	bzero(&ixas, sizeof (ixas));
13479 	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
13480 	ixas.ixa_flags |= IXAF_SET_SOURCE;
13481 	ixas.ixa_zoneid = ALL_ZONES;
13482 	ixas.ixa_ifindex = 0;
13483 	ixas.ixa_ipst = ipst;
13484 	ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
13485 	(void) ip_output_simple(mp, &ixas);
13486 	ixa_cleanup(&ixas);
13487 #undef	REPLY_LEN
13488 }
13489
13490 /*
13491  * Join the ipif-specific multicast groups.
13492  * Must be called after a mapping has been set up in the resolver. (Always
13493  * called as writer.)
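 *
 * As a concrete illustration of the groups joined below: an IPv4 ipif
 * joins the all-hosts group 224.0.0.1; an IPv6 ipif joins ff02::1 and,
 * unless IPIF_NOLOCAL is set, the solicited-node group formed by OR-ing
 * the low 32 bits of the local address into ff02::1:ff00:0 (e.g.
 * fe80::1:2:3:4 yields the group ff02::1:ff03:4).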
13494 */ 13495 void 13496 ipif_multicast_up(ipif_t *ipif) 13497 { 13498 int err; 13499 ill_t *ill; 13500 ilm_t *ilm; 13501 13502 ASSERT(IAM_WRITER_IPIF(ipif)); 13503 13504 ill = ipif->ipif_ill; 13505 13506 ip1dbg(("ipif_multicast_up\n")); 13507 if (!(ill->ill_flags & ILLF_MULTICAST) || 13508 ipif->ipif_allhosts_ilm != NULL) 13509 return; 13510 13511 if (ipif->ipif_isv6) { 13512 in6_addr_t v6allmc = ipv6_all_hosts_mcast; 13513 in6_addr_t v6solmc = ipv6_solicited_node_mcast; 13514 13515 v6solmc.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3]; 13516 13517 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) 13518 return; 13519 13520 ip1dbg(("ipif_multicast_up - addmulti\n")); 13521 13522 /* 13523 * Join the all hosts multicast address. We skip this for 13524 * underlying IPMP interfaces since they should be invisible. 13525 */ 13526 if (!IS_UNDER_IPMP(ill)) { 13527 ilm = ip_addmulti(&v6allmc, ill, ipif->ipif_zoneid, 13528 &err); 13529 if (ilm == NULL) { 13530 ASSERT(err != 0); 13531 ip0dbg(("ipif_multicast_up: " 13532 "all_hosts_mcast failed %d\n", err)); 13533 return; 13534 } 13535 ipif->ipif_allhosts_ilm = ilm; 13536 } 13537 13538 /* 13539 * Enable multicast for the solicited node multicast address. 13540 * If IPMP we need to put the membership on the upper ill. 13541 */ 13542 if (!(ipif->ipif_flags & IPIF_NOLOCAL)) { 13543 ill_t *mcast_ill = NULL; 13544 boolean_t need_refrele; 13545 13546 if (IS_UNDER_IPMP(ill) && 13547 (mcast_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) { 13548 need_refrele = B_TRUE; 13549 } else { 13550 mcast_ill = ill; 13551 need_refrele = B_FALSE; 13552 } 13553 13554 ilm = ip_addmulti(&v6solmc, mcast_ill, 13555 ipif->ipif_zoneid, &err); 13556 if (need_refrele) 13557 ill_refrele(mcast_ill); 13558 13559 if (ilm == NULL) { 13560 ASSERT(err != 0); 13561 ip0dbg(("ipif_multicast_up: solicited MC" 13562 " failed %d\n", err)); 13563 if ((ilm = ipif->ipif_allhosts_ilm) != NULL) { 13564 ipif->ipif_allhosts_ilm = NULL; 13565 (void) ip_delmulti(ilm); 13566 } 13567 return; 13568 } 13569 ipif->ipif_solmulti_ilm = ilm; 13570 } 13571 } else { 13572 in6_addr_t v6group; 13573 13574 if (ipif->ipif_lcl_addr == INADDR_ANY || IS_UNDER_IPMP(ill)) 13575 return; 13576 13577 /* Join the all hosts multicast address */ 13578 ip1dbg(("ipif_multicast_up - addmulti\n")); 13579 IN6_IPADDR_TO_V4MAPPED(htonl(INADDR_ALLHOSTS_GROUP), &v6group); 13580 13581 ilm = ip_addmulti(&v6group, ill, ipif->ipif_zoneid, &err); 13582 if (ilm == NULL) { 13583 ASSERT(err != 0); 13584 ip0dbg(("ipif_multicast_up: failed %d\n", err)); 13585 return; 13586 } 13587 ipif->ipif_allhosts_ilm = ilm; 13588 } 13589 } 13590 13591 /* 13592 * Blow away any multicast groups that we joined in ipif_multicast_up(). 13593 * (ilms from explicit memberships are handled in conn_update_ill.) 13594 */ 13595 void 13596 ipif_multicast_down(ipif_t *ipif) 13597 { 13598 ASSERT(IAM_WRITER_IPIF(ipif)); 13599 13600 ip1dbg(("ipif_multicast_down\n")); 13601 13602 if (ipif->ipif_allhosts_ilm != NULL) { 13603 (void) ip_delmulti(ipif->ipif_allhosts_ilm); 13604 ipif->ipif_allhosts_ilm = NULL; 13605 } 13606 if (ipif->ipif_solmulti_ilm != NULL) { 13607 (void) ip_delmulti(ipif->ipif_solmulti_ilm); 13608 ipif->ipif_solmulti_ilm = NULL; 13609 } 13610 } 13611 13612 /* 13613 * Used when an interface comes up to recreate any extra routes on this 13614 * interface. 
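 *
 * The saved routes are kept as a chain of mblks on ill_saved_ire_mp, one
 * ifrt_t per mblk; a sketch of the layout walked by the loop below:
 *
 *	ill_saved_ire_mp -> mblk [ifrt_t] -> b_cont -> mblk [ifrt_t] -> ...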
13615  */
13616 int
13617 ill_recover_saved_ire(ill_t *ill)
13618 {
13619 	mblk_t		*mp;
13620 	ip_stack_t	*ipst = ill->ill_ipst;
13621
13622 	ip1dbg(("ill_recover_saved_ire(%s)", ill->ill_name));
13623
13624 	mutex_enter(&ill->ill_saved_ire_lock);
13625 	for (mp = ill->ill_saved_ire_mp; mp != NULL; mp = mp->b_cont) {
13626 		ire_t		*ire, *nire;
13627 		ifrt_t		*ifrt;
13628
13629 		ifrt = (ifrt_t *)mp->b_rptr;
13630 		/*
13631 		 * Create a copy of the IRE with the saved address and netmask.
13632 		 */
13633 		if (ill->ill_isv6) {
13634 			ire = ire_create_v6(
13635 			    &ifrt->ifrt_v6addr,
13636 			    &ifrt->ifrt_v6mask,
13637 			    &ifrt->ifrt_v6gateway_addr,
13638 			    ifrt->ifrt_type,
13639 			    ill,
13640 			    ifrt->ifrt_zoneid,
13641 			    ifrt->ifrt_flags,
13642 			    NULL,
13643 			    ipst);
13644 		} else {
13645 			ire = ire_create(
13646 			    (uint8_t *)&ifrt->ifrt_addr,
13647 			    (uint8_t *)&ifrt->ifrt_mask,
13648 			    (uint8_t *)&ifrt->ifrt_gateway_addr,
13649 			    ifrt->ifrt_type,
13650 			    ill,
13651 			    ifrt->ifrt_zoneid,
13652 			    ifrt->ifrt_flags,
13653 			    NULL,
13654 			    ipst);
13655 		}
13656 		if (ire == NULL) {
13657 			mutex_exit(&ill->ill_saved_ire_lock);
13658 			return (ENOMEM);
13659 		}
13660
13661 		if (ifrt->ifrt_flags & RTF_SETSRC) {
13662 			if (ill->ill_isv6) {
13663 				ire->ire_setsrc_addr_v6 =
13664 				    ifrt->ifrt_v6setsrc_addr;
13665 			} else {
13666 				ire->ire_setsrc_addr = ifrt->ifrt_setsrc_addr;
13667 			}
13668 		}
13669
13670 		/*
13671 		 * Some software (for example, GateD and Sun Cluster) attempts
13672 		 * to create (what amount to) IRE_PREFIX routes with the
13673 		 * loopback address as the gateway. This is primarily done to
13674 		 * set up prefixes with the RTF_REJECT flag set (for example,
13675 		 * when generating aggregate routes.)
13676 		 *
13677 		 * If the IRE type (as defined by ill->ill_net_type) is
13678 		 * IRE_LOOPBACK, then we map the request into an
13679 		 * IRE_IF_NORESOLVER.
13680 		 */
13681 		if (ill->ill_net_type == IRE_LOOPBACK)
13682 			ire->ire_type = IRE_IF_NORESOLVER;
13683
13684 		/*
13685 		 * The ire is held by ire_add; it will be refrele'd towards
13686 		 * the end of ipif_up_done.
13687 		 */
13688 		nire = ire_add(ire);
13689 		/*
13690 		 * Check if it was a duplicate entry. This handles
13691 		 * the case of two racing route adds for the same route.
13692 		 */
13693 		if (nire == NULL) {
13694 			ip1dbg(("ill_recover_saved_ire: FAILED\n"));
13695 		} else if (nire != ire) {
13696 			ip1dbg(("ill_recover_saved_ire: duplicate ire %p\n",
13697 			    (void *)nire));
13698 			ire_delete(nire);
13699 		} else {
13700 			ip1dbg(("ill_recover_saved_ire: added ire %p\n",
13701 			    (void *)nire));
13702 		}
13703 		if (nire != NULL)
13704 			ire_refrele(nire);
13705 	}
13706 	mutex_exit(&ill->ill_saved_ire_lock);
13707 	return (0);
13708 }
13709
13710 /*
13711  * Used to set the netmask and broadcast address to default values when the
13712  * interface is brought up. (Always called as writer.)
13713  */
13714 static void
13715 ipif_set_default(ipif_t *ipif)
13716 {
13717 	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
13718
13719 	if (!ipif->ipif_isv6) {
13720 		/*
13721 		 * Interface holds an IPv4 address. Default
13722 		 * mask is the natural netmask.
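		 * For example (illustrative): for the class C address
		 * 192.168.5.9 the natural mask is 255.255.255.0, so with no
		 * explicit netmask the subnet becomes 192.168.5.0 and, when
		 * IPIF_BROADCAST is set, the default broadcast address
		 * computed below is subnet | ~mask = 192.168.5.255.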
13723 		 */
13724 		if (!ipif->ipif_net_mask) {
13725 			ipaddr_t	v4mask;
13726
13727 			v4mask = ip_net_mask(ipif->ipif_lcl_addr);
13728 			V4MASK_TO_V6(v4mask, ipif->ipif_v6net_mask);
13729 		}
13730 		if (ipif->ipif_flags & IPIF_POINTOPOINT) {
13731 			/* ipif_subnet is ipif_pp_dst_addr for pt-pt */
13732 			ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
13733 		} else {
13734 			V6_MASK_COPY(ipif->ipif_v6lcl_addr,
13735 			    ipif->ipif_v6net_mask, ipif->ipif_v6subnet);
13736 		}
13737 		/*
13738 		 * NOTE: SunOS 4.X does this even if the broadcast address
13739 		 * has already been set, thus we do the same here.
13740 		 */
13741 		if (ipif->ipif_flags & IPIF_BROADCAST) {
13742 			ipaddr_t	v4addr;
13743
13744 			v4addr = ipif->ipif_subnet | ~ipif->ipif_net_mask;
13745 			IN6_IPADDR_TO_V4MAPPED(v4addr, &ipif->ipif_v6brd_addr);
13746 		}
13747 	} else {
13748 		/*
13749 		 * Interface holds an IPv6-only address. Default
13750 		 * mask is all-ones.
13751 		 */
13752 		if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask))
13753 			ipif->ipif_v6net_mask = ipv6_all_ones;
13754 		if (ipif->ipif_flags & IPIF_POINTOPOINT) {
13755 			/* ipif_subnet is ipif_pp_dst_addr for pt-pt */
13756 			ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
13757 		} else {
13758 			V6_MASK_COPY(ipif->ipif_v6lcl_addr,
13759 			    ipif->ipif_v6net_mask, ipif->ipif_v6subnet);
13760 		}
13761 	}
13762 }
13763
13764 /*
13765  * Return 0 if this address can be used as a local address without causing
13766  * duplicate address problems. Otherwise, return EADDRNOTAVAIL if the address
13767  * is already up on a different ill, and EADDRINUSE if it's up on the same ill.
13768  * Note that the same IPv6 link-local address is allowed as long as the ills
13769  * are not on the same link.
13770  */
13771 int
13772 ip_addr_availability_check(ipif_t *new_ipif)
13773 {
13774 	in6_addr_t our_v6addr;
13775 	ill_t	*ill;
13776 	ipif_t	*ipif;
13777 	ill_walk_context_t ctx;
13778 	ip_stack_t	*ipst = new_ipif->ipif_ill->ill_ipst;
13779
13780 	ASSERT(IAM_WRITER_IPIF(new_ipif));
13781 	ASSERT(MUTEX_HELD(&ipst->ips_ip_addr_avail_lock));
13782 	ASSERT(RW_READ_HELD(&ipst->ips_ill_g_lock));
13783
13784 	new_ipif->ipif_flags &= ~IPIF_UNNUMBERED;
13785 	if (IN6_IS_ADDR_UNSPECIFIED(&new_ipif->ipif_v6lcl_addr) ||
13786 	    IN6_IS_ADDR_V4MAPPED_ANY(&new_ipif->ipif_v6lcl_addr))
13787 		return (0);
13788
13789 	our_v6addr = new_ipif->ipif_v6lcl_addr;
13790
13791 	if (new_ipif->ipif_isv6)
13792 		ill = ILL_START_WALK_V6(&ctx, ipst);
13793 	else
13794 		ill = ILL_START_WALK_V4(&ctx, ipst);
13795
13796 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
13797 		for (ipif = ill->ill_ipif; ipif != NULL;
13798 		    ipif = ipif->ipif_next) {
13799 			if ((ipif == new_ipif) ||
13800 			    !(ipif->ipif_flags & IPIF_UP) ||
13801 			    (ipif->ipif_flags & IPIF_UNNUMBERED) ||
13802 			    !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
13803 			    &our_v6addr))
13804 				continue;
13805
13806 			if (new_ipif->ipif_flags & IPIF_POINTOPOINT)
13807 				new_ipif->ipif_flags |= IPIF_UNNUMBERED;
13808 			else if (ipif->ipif_flags & IPIF_POINTOPOINT)
13809 				ipif->ipif_flags |= IPIF_UNNUMBERED;
13810 			else if ((IN6_IS_ADDR_LINKLOCAL(&our_v6addr) ||
13811 			    IN6_IS_ADDR_SITELOCAL(&our_v6addr)) &&
13812 			    !IS_ON_SAME_LAN(ill, new_ipif->ipif_ill))
13813 				continue;
13814 			else if (new_ipif->ipif_zoneid != ipif->ipif_zoneid &&
13815 			    ipif->ipif_zoneid != ALL_ZONES && IS_LOOPBACK(ill))
13816 				continue;
13817 			else if (new_ipif->ipif_ill == ill)
13818 				return (EADDRINUSE);
13819 			else
13820 				return (EADDRNOTAVAIL);
13821 		}
13822 	}
13823
13824 	return (0);
13825 }
13826
13827 /*
13828  * Bring up an ipif: bring up arp/ndp, bring up the DLPI stream, and add
13829  * IREs for the ipif.
13830  * When the routine returns EINPROGRESS then mp has been consumed and
13831  * the ioctl will be acked from ip_rput_dlpi.
13832  */
13833 int
13834 ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp)
13835 {
13836 	ill_t		*ill = ipif->ipif_ill;
13837 	boolean_t	isv6 = ipif->ipif_isv6;
13838 	int		err = 0;
13839 	boolean_t	success;
13840 	uint_t		ipif_orig_id;
13841 	ip_stack_t	*ipst = ill->ill_ipst;
13842
13843 	ASSERT(IAM_WRITER_IPIF(ipif));
13844
13845 	ip1dbg(("ipif_up(%s:%u)\n", ill->ill_name, ipif->ipif_id));
13846 	DTRACE_PROBE3(ipif__downup, char *, "ipif_up",
13847 	    ill_t *, ill, ipif_t *, ipif);
13848
13849 	/* Shouldn't get here if it is already up. */
13850 	if (ipif->ipif_flags & IPIF_UP)
13851 		return (EALREADY);
13852
13853 	/*
13854 	 * If this is a request to bring up a data address on an interface
13855 	 * under IPMP, then move the address to its IPMP meta-interface and
13856 	 * try to bring it up. One complication is that the zeroth ipif for
13857 	 * an ill is special, in that every ill always has one, and that code
13858 	 * throughout IP dereferences ill->ill_ipif without holding any locks.
13859 	 */
13860 	if (IS_UNDER_IPMP(ill) && ipmp_ipif_is_dataaddr(ipif) &&
13861 	    (!ipif->ipif_isv6 || !V6_IPIF_LINKLOCAL(ipif))) {
13862 		ipif_t	*stubipif = NULL, *moveipif = NULL;
13863 		ill_t	*ipmp_ill = ipmp_illgrp_ipmp_ill(ill->ill_grp);
13864
13865 		/*
13866 		 * The ipif being brought up should be quiesced. If it's not,
13867 		 * something has gone amiss and we need to bail out. (If it's
13868 		 * quiesced, we know it will remain so via IPIF_CONDEMNED.)
13869 		 */
13870 		mutex_enter(&ill->ill_lock);
13871 		if (!ipif_is_quiescent(ipif)) {
13872 			mutex_exit(&ill->ill_lock);
13873 			return (EINVAL);
13874 		}
13875 		mutex_exit(&ill->ill_lock);
13876
13877 		/*
13878 		 * If we're going to need to allocate ipifs, do it prior
13879 		 * to starting the move (and grabbing locks).
13880 		 */
13881 		if (ipif->ipif_id == 0) {
13882 			if ((moveipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE,
13883 			    B_FALSE, &err)) == NULL) {
13884 				return (err);
13885 			}
13886 			if ((stubipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE,
13887 			    B_FALSE, &err)) == NULL) {
13888 				mi_free(moveipif);
13889 				return (err);
13890 			}
13891 		}
13892
13893 		/*
13894 		 * Grab or transfer the ipif to move. During the move, keep
13895 		 * ill_g_lock held to prevent any ill walker threads from
13896 		 * seeing things in an inconsistent state.
13897 		 */
13898 		rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
13899 		if (ipif->ipif_id != 0) {
13900 			ipif_remove(ipif);
13901 		} else {
13902 			ipif_transfer(ipif, moveipif, stubipif);
13903 			ipif = moveipif;
13904 		}
13905
13906 		/*
13907 		 * Place the ipif on the IPMP ill. If the zeroth ipif on
13908 		 * the IPMP ill is a stub (0.0.0.0 down address) then we
13909 		 * replace that one. Otherwise, pick the next available slot.
13910 		 */
13911 		ipif->ipif_ill = ipmp_ill;
13912 		ipif_orig_id = ipif->ipif_id;
13913
13914 		if (ipmp_ipif_is_stubaddr(ipmp_ill->ill_ipif)) {
13915 			ipif_transfer(ipif, ipmp_ill->ill_ipif, NULL);
13916 			ipif = ipmp_ill->ill_ipif;
13917 		} else {
13918 			ipif->ipif_id = -1;
13919 			if ((err = ipif_insert(ipif, B_FALSE)) != 0) {
13920 				/*
13921 				 * No more available ipif_id's -- put it back
13922 				 * on the original ill and fail the operation.
13923 				 * Since we're writer on the ill, we can be
13924 				 * sure our old slot is still available.
13925 				 */
13926 				ipif->ipif_id = ipif_orig_id;
13927 				ipif->ipif_ill = ill;
13928 				if (ipif_orig_id == 0) {
13929 					ipif_transfer(ipif, ill->ill_ipif,
13930 					    NULL);
13931 				} else {
13932 					VERIFY(ipif_insert(ipif, B_FALSE) == 0);
13933 				}
13934 				rw_exit(&ipst->ips_ill_g_lock);
13935 				return (err);
13936 			}
13937 		}
13938 		rw_exit(&ipst->ips_ill_g_lock);
13939
13940 		/*
13941 		 * Tell SCTP that the ipif has moved. Note that even if we
13942 		 * had to allocate a new ipif, the original sequence id was
13943 		 * preserved and therefore SCTP won't know.
13944 		 */
13945 		sctp_move_ipif(ipif, ill, ipmp_ill);
13946
13947 		/*
13948 		 * If the ipif being brought up was on slot zero, then we
13949 		 * first need to bring up the placeholder we stuck there. In
13950 		 * ip_rput_dlpi_writer(), arp_bringup_done(), or the recursive
13951 		 * call to ipif_up() itself, if we successfully bring up the
13952 		 * placeholder, we'll check ill_move_ipif and bring it up too.
13953 		 */
13954 		if (ipif_orig_id == 0) {
13955 			ASSERT(ill->ill_move_ipif == NULL);
13956 			ill->ill_move_ipif = ipif;
13957 			if ((err = ipif_up(ill->ill_ipif, q, mp)) == 0)
13958 				ASSERT(ill->ill_move_ipif == NULL);
13959 			if (err != EINPROGRESS)
13960 				ill->ill_move_ipif = NULL;
13961 			return (err);
13962 		}
13963
13964 		/*
13965 		 * Bring it up on the IPMP ill.
13966 		 */
13967 		return (ipif_up(ipif, q, mp));
13968 	}
13969
13970 	/* Skip arp/ndp for any loopback interface. */
13971 	if (ill->ill_wq != NULL) {
13972 		conn_t *connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL;
13973 		ipsq_t	*ipsq = ill->ill_phyint->phyint_ipsq;
13974
13975 		if (!ill->ill_dl_up) {
13976 			/*
13977 			 * ill_dl_up is not yet set, i.e. we are yet to
13978 			 * DL_BIND with the driver and this is the first
13979 			 * logical interface on the ill to become "up".
13980 			 * Tell the driver to get going (via DL_BIND_REQ).
13981 			 * Note that changing "significant" IFF_ flags,
13982 			 * address/netmask etc. causes a down/up dance, but
13983 			 * does not cause an unbind (DL_UNBIND) with the driver.
13984 			 */
13985 			return (ill_dl_up(ill, ipif, mp, q));
13986 		}
13987
13988 		/*
13989 		 * ipif_resolver_up may end up needing to bind/attach
13990 		 * the ARP stream, which in turn necessitates a
13991 		 * DLPI message exchange with the driver. ioctls are
13992 		 * serialized and so we cannot send more than one
13993 		 * interface up message at a time. If ipif_resolver_up
13994 		 * does need to wait for the DLPI handshake for the ARP stream,
13995 		 * we get EINPROGRESS and we will complete in arp_bringup_done.
13996 		 */
13997
13998 		ASSERT(connp != NULL || !CONN_Q(q));
13999 		if (connp != NULL)
14000 			mutex_enter(&connp->conn_lock);
14001 		mutex_enter(&ill->ill_lock);
14002 		success = ipsq_pending_mp_add(connp, ipif, q, mp, 0);
14003 		mutex_exit(&ill->ill_lock);
14004 		if (connp != NULL)
14005 			mutex_exit(&connp->conn_lock);
14006 		if (!success)
14007 			return (EINTR);
14008
14009 		/*
14010 		 * Crank up IPv6 neighbor discovery. Unlike ARP, this should
14011 		 * complete when ipif_ndp_up returns.
14012 		 */
14013 		err = ipif_resolver_up(ipif, Res_act_initial);
14014 		if (err == EINPROGRESS) {
14015 			/* We will complete it in arp_bringup_done() */
14016 			return (err);
14017 		}
14018
14019 		if (isv6 && err == 0)
14020 			err = ipif_ndp_up(ipif, B_TRUE);
14021
14022 		ASSERT(err != EINPROGRESS);
14023 		mp = ipsq_pending_mp_get(ipsq, &connp);
14024 		ASSERT(mp != NULL);
14025 		if (err != 0)
14026 			return (err);
14027 	} else {
14028 		/*
14029 		 * Interfaces without underlying hardware don't do duplicate
14030 		 * address detection.
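		 * (i.e. the loopback case, where ill_wq is NULL); such an
		 * address is usable at once, so it is simply marked ready
		 * below.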
14031 */ 14032 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); 14033 ipif->ipif_addr_ready = 1; 14034 err = ill_add_ires(ill); 14035 /* allocation failure? */ 14036 if (err != 0) 14037 return (err); 14038 } 14039 14040 err = (isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif)); 14041 if (err == 0 && ill->ill_move_ipif != NULL) { 14042 ipif = ill->ill_move_ipif; 14043 ill->ill_move_ipif = NULL; 14044 return (ipif_up(ipif, q, mp)); 14045 } 14046 return (err); 14047 } 14048 14049 /* 14050 * Add any IREs tied to the ill. For now this is just an IRE_MULTICAST. 14051 * The identical set of IREs need to be removed in ill_delete_ires(). 14052 */ 14053 int 14054 ill_add_ires(ill_t *ill) 14055 { 14056 ire_t *ire; 14057 in6_addr_t dummy6 = {(uint32_t)V6_MCAST, 0, 0, 1}; 14058 in_addr_t dummy4 = htonl(INADDR_ALLHOSTS_GROUP); 14059 14060 if (ill->ill_ire_multicast != NULL) 14061 return (0); 14062 14063 /* 14064 * provide some dummy ire_addr for creating the ire. 14065 */ 14066 if (ill->ill_isv6) { 14067 ire = ire_create_v6(&dummy6, 0, 0, IRE_MULTICAST, ill, 14068 ALL_ZONES, RTF_UP, NULL, ill->ill_ipst); 14069 } else { 14070 ire = ire_create((uchar_t *)&dummy4, 0, 0, IRE_MULTICAST, ill, 14071 ALL_ZONES, RTF_UP, NULL, ill->ill_ipst); 14072 } 14073 if (ire == NULL) 14074 return (ENOMEM); 14075 14076 ill->ill_ire_multicast = ire; 14077 return (0); 14078 } 14079 14080 void 14081 ill_delete_ires(ill_t *ill) 14082 { 14083 if (ill->ill_ire_multicast != NULL) { 14084 /* 14085 * BIND/ATTACH completed; Release the ref for ill_ire_multicast 14086 * which was taken without any th_tracing enabled. 14087 * We also mark it as condemned (note that it was never added) 14088 * so that caching conn's can move off of it. 14089 */ 14090 ire_make_condemned(ill->ill_ire_multicast); 14091 ire_refrele_notr(ill->ill_ire_multicast); 14092 ill->ill_ire_multicast = NULL; 14093 } 14094 } 14095 14096 /* 14097 * Perform a bind for the physical device. 14098 * When the routine returns EINPROGRESS then mp has been consumed and 14099 * the ioctl will be acked from ip_rput_dlpi. 14100 * Allocate an unbind message and save it until ipif_down. 14101 */ 14102 static int 14103 ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) 14104 { 14105 mblk_t *bind_mp = NULL; 14106 mblk_t *unbind_mp = NULL; 14107 conn_t *connp; 14108 boolean_t success; 14109 int err; 14110 14111 DTRACE_PROBE2(ill__downup, char *, "ill_dl_up", ill_t *, ill); 14112 14113 ip1dbg(("ill_dl_up(%s)\n", ill->ill_name)); 14114 ASSERT(IAM_WRITER_ILL(ill)); 14115 ASSERT(mp != NULL); 14116 14117 /* 14118 * Make sure we have an IRE_MULTICAST in case we immediately 14119 * start receiving packets. 14120 */ 14121 err = ill_add_ires(ill); 14122 if (err != 0) 14123 goto bad; 14124 14125 bind_mp = ip_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long), 14126 DL_BIND_REQ); 14127 if (bind_mp == NULL) 14128 goto bad; 14129 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ill->ill_sap; 14130 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS; 14131 14132 unbind_mp = ip_dlpi_alloc(sizeof (dl_unbind_req_t), DL_UNBIND_REQ); 14133 if (unbind_mp == NULL) 14134 goto bad; 14135 14136 /* 14137 * Record state needed to complete this operation when the 14138 * DL_BIND_ACK shows up. Also remember the pre-allocated mblks. 14139 */ 14140 connp = CONN_Q(q) ? 
Q_TO_CONN(q) : NULL; 14141 ASSERT(connp != NULL || !CONN_Q(q)); 14142 GRAB_CONN_LOCK(q); 14143 mutex_enter(&ipif->ipif_ill->ill_lock); 14144 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0); 14145 mutex_exit(&ipif->ipif_ill->ill_lock); 14146 RELEASE_CONN_LOCK(q); 14147 if (!success) 14148 goto bad; 14149 14150 /* 14151 * Save the unbind message for ill_dl_down(); it will be consumed when 14152 * the interface goes down. 14153 */ 14154 ASSERT(ill->ill_unbind_mp == NULL); 14155 ill->ill_unbind_mp = unbind_mp; 14156 14157 ill_dlpi_send(ill, bind_mp); 14158 /* Send down link-layer capabilities probe if not already done. */ 14159 ill_capability_probe(ill); 14160 14161 /* 14162 * Sysid used to rely on the fact that netboots set domainname 14163 * and the like. Now that miniroot boots aren't strictly netboots 14164 * and miniroot network configuration is driven from userland 14165 * these things still need to be set. This situation can be detected 14166 * by comparing the interface being configured here to the one 14167 * dhcifname was set to reference by the boot loader. Once sysid is 14168 * converted to use dhcp_ipc_getinfo() this call can go away. 14169 */ 14170 if ((ipif->ipif_flags & IPIF_DHCPRUNNING) && 14171 (strcmp(ill->ill_name, dhcifname) == 0) && 14172 (strlen(srpc_domain) == 0)) { 14173 if (dhcpinit() != 0) 14174 cmn_err(CE_WARN, "no cached dhcp response"); 14175 } 14176 14177 /* 14178 * This operation will complete in ip_rput_dlpi with either 14179 * a DL_BIND_ACK or DL_ERROR_ACK. 14180 */ 14181 return (EINPROGRESS); 14182 bad: 14183 ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name)); 14184 14185 freemsg(bind_mp); 14186 freemsg(unbind_mp); 14187 return (ENOMEM); 14188 } 14189 14190 /* Add room for tcp+ip headers */ 14191 uint_t ip_loopback_mtuplus = IP_LOOPBACK_MTU + IP_SIMPLE_HDR_LENGTH + 20; 14192 14193 /* 14194 * DLPI and ARP is up. 14195 * Create all the IREs associated with an interface. Bring up multicast. 14196 * Set the interface flag and finish other initialization 14197 * that potentially had to be deferred to after DL_BIND_ACK. 14198 */ 14199 int 14200 ipif_up_done(ipif_t *ipif) 14201 { 14202 ill_t *ill = ipif->ipif_ill; 14203 int err = 0; 14204 boolean_t loopback = B_FALSE; 14205 boolean_t update_src_selection = B_TRUE; 14206 ipif_t *tmp_ipif; 14207 14208 ip1dbg(("ipif_up_done(%s:%u)\n", 14209 ipif->ipif_ill->ill_name, ipif->ipif_id)); 14210 DTRACE_PROBE3(ipif__downup, char *, "ipif_up_done", 14211 ill_t *, ill, ipif_t *, ipif); 14212 14213 /* Check if this is a loopback interface */ 14214 if (ipif->ipif_ill->ill_wq == NULL) 14215 loopback = B_TRUE; 14216 14217 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 14218 14219 /* 14220 * If all other interfaces for this ill are down or DEPRECATED, 14221 * or otherwise unsuitable for source address selection, 14222 * reset the src generation numbers to make sure source 14223 * address selection gets to take this new ipif into account. 
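 * (A generation bump is presumably what lets consumers of cached source
 * addresses notice staleness; cf. the generation handling in
 * ip_select_source_v4() further below.)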
14224 * No need to hold ill_lock while traversing the ipif list since 14225 * we are writer 14226 */ 14227 for (tmp_ipif = ill->ill_ipif; tmp_ipif; 14228 tmp_ipif = tmp_ipif->ipif_next) { 14229 if (((tmp_ipif->ipif_flags & 14230 (IPIF_NOXMIT|IPIF_ANYCAST|IPIF_NOLOCAL|IPIF_DEPRECATED)) || 14231 !(tmp_ipif->ipif_flags & IPIF_UP)) || 14232 (tmp_ipif == ipif)) 14233 continue; 14234 /* first useable pre-existing interface */ 14235 update_src_selection = B_FALSE; 14236 break; 14237 } 14238 if (update_src_selection) 14239 ip_update_source_selection(ill->ill_ipst); 14240 14241 if (IS_LOOPBACK(ill) || ill->ill_net_type == IRE_IF_NORESOLVER) { 14242 nce_t *loop_nce = NULL; 14243 uint16_t flags = (NCE_F_MYADDR | NCE_F_AUTHORITY | NCE_F_NONUD); 14244 14245 /* 14246 * lo0:1 and subsequent ipifs were marked IRE_LOCAL in 14247 * ipif_lookup_on_name(), but in the case of zones we can have 14248 * several loopback addresses on lo0. So all the interfaces with 14249 * loopback addresses need to be marked IRE_LOOPBACK. 14250 */ 14251 if (V4_PART_OF_V6(ipif->ipif_v6lcl_addr) == 14252 htonl(INADDR_LOOPBACK)) 14253 ipif->ipif_ire_type = IRE_LOOPBACK; 14254 else 14255 ipif->ipif_ire_type = IRE_LOCAL; 14256 if (ill->ill_net_type != IRE_LOOPBACK) 14257 flags |= NCE_F_PUBLISH; 14258 14259 /* add unicast nce for the local addr */ 14260 err = nce_lookup_then_add_v4(ill, NULL, 14261 ill->ill_phys_addr_length, &ipif->ipif_lcl_addr, flags, 14262 ND_REACHABLE, &loop_nce); 14263 /* A shared-IP zone sees EEXIST for lo0:N */ 14264 if (err == 0 || err == EEXIST) { 14265 ipif->ipif_added_nce = 1; 14266 loop_nce->nce_ipif_cnt++; 14267 nce_refrele(loop_nce); 14268 err = 0; 14269 } else { 14270 ASSERT(loop_nce == NULL); 14271 return (err); 14272 } 14273 } 14274 14275 /* Create all the IREs associated with this interface */ 14276 err = ipif_add_ires_v4(ipif, loopback); 14277 if (err != 0) { 14278 /* 14279 * see comments about return value from 14280 * ip_addr_availability_check() in ipif_add_ires_v4(). 14281 */ 14282 if (err != EADDRINUSE) { 14283 (void) ipif_arp_down(ipif); 14284 } else { 14285 /* 14286 * Make IPMP aware of the deleted ipif so that 14287 * the needed ipmp cleanup (e.g., of ipif_bound_ill) 14288 * can be completed. Note that we do not want to 14289 * destroy the nce that was created on the ipmp_ill 14290 * for the active copy of the duplicate address in 14291 * use. 14292 */ 14293 if (IS_IPMP(ill)) 14294 ipmp_illgrp_del_ipif(ill->ill_grp, ipif); 14295 err = EADDRNOTAVAIL; 14296 } 14297 return (err); 14298 } 14299 14300 if (ill->ill_ipif_up_count == 1 && !loopback) { 14301 /* Recover any additional IREs entries for this ill */ 14302 (void) ill_recover_saved_ire(ill); 14303 } 14304 14305 if (ill->ill_need_recover_multicast) { 14306 /* 14307 * Need to recover all multicast memberships in the driver. 14308 * This had to be deferred until we had attached. The same 14309 * code exists in ipif_up_done_v6() to recover IPv6 14310 * memberships. 14311 * 14312 * Note that it would be preferable to unconditionally do the 14313 * ill_recover_multicast() in ill_dl_up(), but we cannot do 14314 * that since ill_join_allmulti() depends on ill_dl_up being 14315 * set, and it is not set until we receive a DL_BIND_ACK after 14316 * having called ill_dl_up(). 14317 */ 14318 ill_recover_multicast(ill); 14319 } 14320 14321 if (ill->ill_ipif_up_count == 1) { 14322 /* 14323 * Since the interface is now up, it may now be active. 
14324 */ 14325 if (IS_UNDER_IPMP(ill)) 14326 ipmp_ill_refresh_active(ill); 14327 14328 /* 14329 * If this is an IPMP interface, we may now be able to 14330 * establish ARP entries. 14331 */ 14332 if (IS_IPMP(ill)) 14333 ipmp_illgrp_refresh_arpent(ill->ill_grp); 14334 } 14335 14336 /* Join the allhosts multicast address */ 14337 ipif_multicast_up(ipif); 14338 14339 if (!loopback && !update_src_selection && 14340 !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) 14341 ip_update_source_selection(ill->ill_ipst); 14342 14343 if (!loopback && ipif->ipif_addr_ready) { 14344 /* Broadcast an address mask reply. */ 14345 ipif_mask_reply(ipif); 14346 } 14347 /* Perhaps ilgs should use this ill */ 14348 update_conn_ill(NULL, ill->ill_ipst); 14349 14350 /* 14351 * This had to be deferred until we had bound. Tell routing sockets and 14352 * others that this interface is up if it looks like the address has 14353 * been validated. Otherwise, if it isn't ready yet, wait for 14354 * duplicate address detection to do its thing. 14355 */ 14356 if (ipif->ipif_addr_ready) 14357 ipif_up_notify(ipif); 14358 return (0); 14359 } 14360 14361 /* 14362 * Add the IREs associated with the ipif. 14363 * Those MUST be explicitly removed in ipif_delete_ires_v4. 14364 */ 14365 static int 14366 ipif_add_ires_v4(ipif_t *ipif, boolean_t loopback) 14367 { 14368 ill_t *ill = ipif->ipif_ill; 14369 ip_stack_t *ipst = ill->ill_ipst; 14370 ire_t *ire_array[20]; 14371 ire_t **irep = ire_array; 14372 ire_t **irep1; 14373 ipaddr_t net_mask = 0; 14374 ipaddr_t subnet_mask, route_mask; 14375 int err; 14376 ire_t *ire_local = NULL; /* LOCAL or LOOPBACK */ 14377 ire_t *ire_if = NULL; 14378 14379 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 14380 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 14381 /* 14382 * If we're on a labeled system then make sure that zone- 14383 * private addresses have proper remote host database entries. 14384 */ 14385 if (is_system_labeled() && 14386 ipif->ipif_ire_type != IRE_LOOPBACK && 14387 !tsol_check_interface_address(ipif)) 14388 return (EINVAL); 14389 14390 /* Register the source address for __sin6_src_id */ 14391 err = ip_srcid_insert(&ipif->ipif_v6lcl_addr, 14392 ipif->ipif_zoneid, ipst); 14393 if (err != 0) { 14394 ip0dbg(("ipif_add_ires: srcid_insert %d\n", err)); 14395 return (err); 14396 } 14397 14398 /* If the interface address is set, create the local IRE. */ 14399 ire_local = ire_create( 14400 (uchar_t *)&ipif->ipif_lcl_addr, /* dest address */ 14401 (uchar_t *)&ip_g_all_ones, /* mask */ 14402 NULL, /* no gateway */ 14403 ipif->ipif_ire_type, /* LOCAL or LOOPBACK */ 14404 ipif->ipif_ill, 14405 ipif->ipif_zoneid, 14406 ((ipif->ipif_flags & IPIF_PRIVATE) ? 
14407 RTF_PRIVATE : 0) | RTF_KERNEL, 14408 NULL, 14409 ipst); 14410 ip1dbg(("ipif_add_ires: 0x%p creating IRE %p type 0x%x" 14411 " for 0x%x\n", (void *)ipif, (void *)ire_local, 14412 ipif->ipif_ire_type, 14413 ntohl(ipif->ipif_lcl_addr))); 14414 if (ire_local == NULL) { 14415 ip1dbg(("ipif_up_done: NULL ire_local\n")); 14416 err = ENOMEM; 14417 goto bad; 14418 } 14419 } else { 14420 ip1dbg(( 14421 "ipif_add_ires: not creating IRE %d for 0x%x: flags 0x%x\n", 14422 ipif->ipif_ire_type, 14423 ntohl(ipif->ipif_lcl_addr), 14424 (uint_t)ipif->ipif_flags)); 14425 } 14426 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 14427 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 14428 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 14429 } else { 14430 net_mask = htonl(IN_CLASSA_NET); /* fallback */ 14431 } 14432 14433 subnet_mask = ipif->ipif_net_mask; 14434 14435 /* 14436 * If mask was not specified, use natural netmask of 14437 * interface address. Also, store this mask back into the 14438 * ipif struct. 14439 */ 14440 if (subnet_mask == 0) { 14441 subnet_mask = net_mask; 14442 V4MASK_TO_V6(subnet_mask, ipif->ipif_v6net_mask); 14443 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 14444 ipif->ipif_v6subnet); 14445 } 14446 14447 /* Set up the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. */ 14448 if (!loopback && !(ipif->ipif_flags & IPIF_NOXMIT) && 14449 ipif->ipif_subnet != INADDR_ANY) { 14450 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 14451 14452 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 14453 route_mask = IP_HOST_MASK; 14454 } else { 14455 route_mask = subnet_mask; 14456 } 14457 14458 ip1dbg(("ipif_add_ires: ipif 0x%p ill 0x%p " 14459 "creating if IRE ill_net_type 0x%x for 0x%x\n", 14460 (void *)ipif, (void *)ill, ill->ill_net_type, 14461 ntohl(ipif->ipif_subnet))); 14462 ire_if = ire_create( 14463 (uchar_t *)&ipif->ipif_subnet, 14464 (uchar_t *)&route_mask, 14465 (uchar_t *)&ipif->ipif_lcl_addr, 14466 ill->ill_net_type, 14467 ill, 14468 ipif->ipif_zoneid, 14469 ((ipif->ipif_flags & IPIF_PRIVATE) ? 14470 RTF_PRIVATE: 0) | RTF_KERNEL, 14471 NULL, 14472 ipst); 14473 if (ire_if == NULL) { 14474 ip1dbg(("ipif_up_done: NULL ire_if\n")); 14475 err = ENOMEM; 14476 goto bad; 14477 } 14478 } 14479 14480 /* 14481 * Create any necessary broadcast IREs. 14482 */ 14483 if ((ipif->ipif_flags & IPIF_BROADCAST) && 14484 !(ipif->ipif_flags & IPIF_NOXMIT)) 14485 irep = ipif_create_bcast_ires(ipif, irep); 14486 14487 /* If an earlier ire_create failed, get out now */ 14488 for (irep1 = irep; irep1 > ire_array; ) { 14489 irep1--; 14490 if (*irep1 == NULL) { 14491 ip1dbg(("ipif_up_done: NULL ire found in ire_array\n")); 14492 err = ENOMEM; 14493 goto bad; 14494 } 14495 } 14496 14497 /* 14498 * Need to atomically check for IP address availability under 14499 * ip_addr_avail_lock. ill_g_lock is held as reader to ensure no new 14500 * ills or new ipifs can be added while we are checking availability. 14501 */ 14502 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 14503 mutex_enter(&ipst->ips_ip_addr_avail_lock); 14504 /* Mark it up, and increment counters. */ 14505 ipif->ipif_flags |= IPIF_UP; 14506 ill->ill_ipif_up_count++; 14507 err = ip_addr_availability_check(ipif); 14508 mutex_exit(&ipst->ips_ip_addr_avail_lock); 14509 rw_exit(&ipst->ips_ill_g_lock); 14510 14511 if (err != 0) { 14512 /* 14513 * Our address may already be up on the same ill. In this case, 14514 * the ARP entry for our ipif replaced the one for the other 14515 * ipif. 
So we don't want to delete it (otherwise the other ipif 14516 * would be unable to send packets). 14517 * ip_addr_availability_check() identifies this case for us and 14518 * returns EADDRINUSE; Caller should turn it into EADDRNOTAVAIL 14519 * which is the expected error code. 14520 */ 14521 ill->ill_ipif_up_count--; 14522 ipif->ipif_flags &= ~IPIF_UP; 14523 goto bad; 14524 } 14525 14526 /* 14527 * Add in all newly created IREs. ire_create_bcast() has 14528 * already checked for duplicates of the IRE_BROADCAST type. 14529 * We add the IRE_INTERFACE before the IRE_LOCAL to ensure 14530 * that lookups find the IRE_LOCAL even if the IRE_INTERFACE is 14531 * a /32 route. 14532 */ 14533 if (ire_if != NULL) { 14534 ire_if = ire_add(ire_if); 14535 if (ire_if == NULL) { 14536 err = ENOMEM; 14537 goto bad2; 14538 } 14539 #ifdef DEBUG 14540 ire_refhold_notr(ire_if); 14541 ire_refrele(ire_if); 14542 #endif 14543 } 14544 if (ire_local != NULL) { 14545 ire_local = ire_add(ire_local); 14546 if (ire_local == NULL) { 14547 err = ENOMEM; 14548 goto bad2; 14549 } 14550 #ifdef DEBUG 14551 ire_refhold_notr(ire_local); 14552 ire_refrele(ire_local); 14553 #endif 14554 } 14555 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 14556 if (ire_local != NULL) 14557 ipif->ipif_ire_local = ire_local; 14558 if (ire_if != NULL) 14559 ipif->ipif_ire_if = ire_if; 14560 rw_exit(&ipst->ips_ill_g_lock); 14561 ire_local = NULL; 14562 ire_if = NULL; 14563 14564 /* 14565 * We first add all of them, and if that succeeds we refrele the 14566 * bunch. That enables us to delete all of them should any of the 14567 * ire_adds fail. 14568 */ 14569 for (irep1 = irep; irep1 > ire_array; ) { 14570 irep1--; 14571 ASSERT(!MUTEX_HELD(&((*irep1)->ire_ill->ill_lock))); 14572 *irep1 = ire_add(*irep1); 14573 if (*irep1 == NULL) { 14574 err = ENOMEM; 14575 goto bad2; 14576 } 14577 } 14578 14579 for (irep1 = irep; irep1 > ire_array; ) { 14580 irep1--; 14581 /* refheld by ire_add. */ 14582 if (*irep1 != NULL) { 14583 ire_refrele(*irep1); 14584 *irep1 = NULL; 14585 } 14586 } 14587 14588 if (!loopback) { 14589 /* 14590 * If the broadcast address has been set, make sure it makes 14591 * sense based on the interface address. 14592 * Only match on ill since we are sharing broadcast addresses. 14593 */ 14594 if ((ipif->ipif_brd_addr != INADDR_ANY) && 14595 (ipif->ipif_flags & IPIF_BROADCAST)) { 14596 ire_t *ire; 14597 14598 ire = ire_ftable_lookup_v4(ipif->ipif_brd_addr, 0, 0, 14599 IRE_BROADCAST, ipif->ipif_ill, ALL_ZONES, NULL, 14600 (MATCH_IRE_TYPE | MATCH_IRE_ILL), 0, ipst, NULL); 14601 14602 if (ire == NULL) { 14603 /* 14604 * If there isn't a matching broadcast IRE, 14605 * revert to the default for this netmask. 
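				 * For instance (illustrative): if a stale
				 * broadcast address such as 10.0.99.255 no
				 * longer matches any IRE_BROADCAST after a
				 * netmask change, ipif_set_default()
				 * recomputes the conventional default,
				 * subnet | ~netmask (10.1.2.255 for
				 * 10.1.2.3/24).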
14606 			 */
14607 			ipif->ipif_v6brd_addr = ipv6_all_zeros;
14608 			mutex_enter(&ipif->ipif_ill->ill_lock);
14609 			ipif_set_default(ipif);
14610 			mutex_exit(&ipif->ipif_ill->ill_lock);
14611 			} else {
14612 				ire_refrele(ire);
14613 			}
14614 		}
14615
14616 	}
14617 	return (0);
14618
14619 bad2:
14620 	ill->ill_ipif_up_count--;
14621 	ipif->ipif_flags &= ~IPIF_UP;
14622
14623 bad:
14624 	ip1dbg(("ipif_add_ires: FAILED\n"));
14625 	if (ire_local != NULL)
14626 		ire_delete(ire_local);
14627 	if (ire_if != NULL)
14628 		ire_delete(ire_if);
14629
14630 	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
14631 	ire_local = ipif->ipif_ire_local;
14632 	ipif->ipif_ire_local = NULL;
14633 	ire_if = ipif->ipif_ire_if;
14634 	ipif->ipif_ire_if = NULL;
14635 	rw_exit(&ipst->ips_ill_g_lock);
14636 	if (ire_local != NULL) {
14637 		ire_delete(ire_local);
14638 		ire_refrele_notr(ire_local);
14639 	}
14640 	if (ire_if != NULL) {
14641 		ire_delete(ire_if);
14642 		ire_refrele_notr(ire_if);
14643 	}
14644
14645 	while (irep > ire_array) {
14646 		irep--;
14647 		if (*irep != NULL) {
14648 			ire_delete(*irep);
14649 		}
14650 	}
14651 	(void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst);
14652
14653 	return (err);
14654 }
14655
14656 /* Remove all the IREs created by ipif_add_ires_v4 */
14657 void
14658 ipif_delete_ires_v4(ipif_t *ipif)
14659 {
14660 	ill_t		*ill = ipif->ipif_ill;
14661 	ip_stack_t	*ipst = ill->ill_ipst;
14662 	ire_t		*ire;
14663
14664 	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
14665 	ire = ipif->ipif_ire_local;
14666 	ipif->ipif_ire_local = NULL;
14667 	rw_exit(&ipst->ips_ill_g_lock);
14668 	if (ire != NULL) {
14669 		/*
14670 		 * Move the count to the ipif so we don't lose it due to
14671 		 * a down/up dance.
14672 		 */
14673 		atomic_add_32(&ipif->ipif_ib_pkt_count, ire->ire_ib_pkt_count);
14674
14675 		ire_delete(ire);
14676 		ire_refrele_notr(ire);
14677 	}
14678 	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
14679 	ire = ipif->ipif_ire_if;
14680 	ipif->ipif_ire_if = NULL;
14681 	rw_exit(&ipst->ips_ill_g_lock);
14682 	if (ire != NULL) {
14683 		ire_delete(ire);
14684 		ire_refrele_notr(ire);
14685 	}
14686
14687 	/*
14688 	 * Delete the broadcast IREs.
14689 	 */
14690 	if ((ipif->ipif_flags & IPIF_BROADCAST) &&
14691 	    !(ipif->ipif_flags & IPIF_NOXMIT))
14692 		ipif_delete_bcast_ires(ipif);
14693 }
14694
14695 /*
14696  * Checks for availability of a usable source address (if there is one) when
14697  * the destination ILL has the ill_usesrc_ifindex pointing to another ILL.
14698  * Note this selection is done regardless of the destination.
14699  */
14700 boolean_t
14701 ipif_zone_avail(uint_t ifindex, boolean_t isv6, zoneid_t zoneid,
14702     ip_stack_t *ipst)
14703 {
14704 	ipif_t		*ipif = NULL;
14705 	ill_t		*uill;
14706
14707 	ASSERT(ifindex != 0);
14708
14709 	uill = ill_lookup_on_ifindex(ifindex, isv6, ipst);
14710 	if (uill == NULL)
14711 		return (B_FALSE);
14712
14713 	mutex_enter(&uill->ill_lock);
14714 	for (ipif = uill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
14715 		if (IPIF_IS_CONDEMNED(ipif))
14716 			continue;
14717 		if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
14718 			continue;
14719 		if (!(ipif->ipif_flags & IPIF_UP))
14720 			continue;
14721 		if (ipif->ipif_zoneid != zoneid)
14722 			continue;
14723 		if (isv6 ?
IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) : 14724 ipif->ipif_lcl_addr == INADDR_ANY) 14725 continue; 14726 mutex_exit(&uill->ill_lock); 14727 ill_refrele(uill); 14728 return (B_TRUE); 14729 } 14730 mutex_exit(&uill->ill_lock); 14731 ill_refrele(uill); 14732 return (B_FALSE); 14733 } 14734 14735 /* 14736 * Find an ipif with a good local address on the ill+zoneid. 14737 */ 14738 ipif_t * 14739 ipif_good_addr(ill_t *ill, zoneid_t zoneid) 14740 { 14741 ipif_t *ipif; 14742 14743 mutex_enter(&ill->ill_lock); 14744 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 14745 if (IPIF_IS_CONDEMNED(ipif)) 14746 continue; 14747 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 14748 continue; 14749 if (!(ipif->ipif_flags & IPIF_UP)) 14750 continue; 14751 if (ipif->ipif_zoneid != zoneid && 14752 ipif->ipif_zoneid != ALL_ZONES && zoneid != ALL_ZONES) 14753 continue; 14754 if (ill->ill_isv6 ? 14755 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) : 14756 ipif->ipif_lcl_addr == INADDR_ANY) 14757 continue; 14758 ipif_refhold_locked(ipif); 14759 mutex_exit(&ill->ill_lock); 14760 return (ipif); 14761 } 14762 mutex_exit(&ill->ill_lock); 14763 return (NULL); 14764 } 14765 14766 /* 14767 * IP source address type, sorted from worst to best. For a given type, 14768 * always prefer IP addresses on the same subnet. All-zones addresses are 14769 * suboptimal because they pose problems with unlabeled destinations. 14770 */ 14771 typedef enum { 14772 IPIF_NONE, 14773 IPIF_DIFFNET_DEPRECATED, /* deprecated and different subnet */ 14774 IPIF_SAMENET_DEPRECATED, /* deprecated and same subnet */ 14775 IPIF_DIFFNET_ALLZONES, /* allzones and different subnet */ 14776 IPIF_SAMENET_ALLZONES, /* allzones and same subnet */ 14777 IPIF_DIFFNET, /* normal and different subnet */ 14778 IPIF_SAMENET, /* normal and same subnet */ 14779 IPIF_LOCALADDR /* local loopback */ 14780 } ipif_type_t; 14781 14782 /* 14783 * Pick the optimal ipif on `ill' for sending to destination `dst' from zone 14784 * `zoneid'. We rate usable ipifs from low -> high as per the ipif_type_t 14785 * enumeration, and return the highest-rated ipif. If there's a tie, we pick 14786 * the first one, unless IPMP is used in which case we round-robin among them; 14787 * see below for more. 14788 * 14789 * Returns NULL if there is no suitable source address for the ill. 14790 * This only occurs when there is no valid source address for the ill. 14791 */ 14792 ipif_t * 14793 ipif_select_source_v4(ill_t *ill, ipaddr_t dst, zoneid_t zoneid, 14794 boolean_t allow_usesrc, boolean_t *notreadyp) 14795 { 14796 ill_t *usill = NULL; 14797 ill_t *ipmp_ill = NULL; 14798 ipif_t *start_ipif, *next_ipif, *ipif, *best_ipif; 14799 ipif_type_t type, best_type; 14800 tsol_tpc_t *src_rhtp, *dst_rhtp; 14801 ip_stack_t *ipst = ill->ill_ipst; 14802 boolean_t samenet; 14803 14804 if (ill->ill_usesrc_ifindex != 0 && allow_usesrc) { 14805 usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex, 14806 B_FALSE, ipst); 14807 if (usill != NULL) 14808 ill = usill; /* Select source from usesrc ILL */ 14809 else 14810 return (NULL); 14811 } 14812 14813 /* 14814 * Test addresses should never be used for source address selection, 14815 * so if we were passed one, switch to the IPMP meta-interface. 
14816 */ 14817 if (IS_UNDER_IPMP(ill)) { 14818 if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) 14819 ill = ipmp_ill; /* Select source from IPMP ill */ 14820 else 14821 return (NULL); 14822 } 14823 14824 /* 14825 * If we're dealing with an unlabeled destination on a labeled system, 14826 * make sure that we ignore source addresses that are incompatible with 14827 * the destination's default label. That destination's default label 14828 * must dominate the minimum label on the source address. 14829 */ 14830 dst_rhtp = NULL; 14831 if (is_system_labeled()) { 14832 dst_rhtp = find_tpc(&dst, IPV4_VERSION, B_FALSE); 14833 if (dst_rhtp == NULL) 14834 return (NULL); 14835 if (dst_rhtp->tpc_tp.host_type != UNLABELED) { 14836 TPC_RELE(dst_rhtp); 14837 dst_rhtp = NULL; 14838 } 14839 } 14840 14841 /* 14842 * Hold the ill_g_lock as reader. This makes sure that no ipif/ill 14843 * can be deleted. But an ipif/ill can get CONDEMNED any time. 14844 * After selecting the right ipif, under ill_lock make sure ipif is 14845 * not condemned, and increment refcnt. If ipif is CONDEMNED, 14846 * we retry. Inside the loop we still need to check for CONDEMNED, 14847 * but not under a lock. 14848 */ 14849 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 14850 retry: 14851 /* 14852 * For source address selection, we treat the ipif list as circular 14853 * and continue until we get back to where we started. This allows 14854 * IPMP to vary source address selection (which improves inbound load 14855 * spreading) by caching its last ending point and starting from 14856 * there. NOTE: we don't have to worry about ill_src_ipif changing 14857 * ills since that can't happen on the IPMP ill. 14858 */ 14859 start_ipif = ill->ill_ipif; 14860 if (IS_IPMP(ill) && ill->ill_src_ipif != NULL) 14861 start_ipif = ill->ill_src_ipif; 14862 14863 ipif = start_ipif; 14864 best_ipif = NULL; 14865 best_type = IPIF_NONE; 14866 do { 14867 if ((next_ipif = ipif->ipif_next) == NULL) 14868 next_ipif = ill->ill_ipif; 14869 14870 if (IPIF_IS_CONDEMNED(ipif)) 14871 continue; 14872 /* Always skip NOLOCAL and ANYCAST interfaces */ 14873 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 14874 continue; 14875 /* Always skip NOACCEPT interfaces */ 14876 if (ipif->ipif_ill->ill_flags & ILLF_NOACCEPT) 14877 continue; 14878 if (!(ipif->ipif_flags & IPIF_UP)) 14879 continue; 14880 14881 if (!ipif->ipif_addr_ready) { 14882 if (notreadyp != NULL) 14883 *notreadyp = B_TRUE; 14884 continue; 14885 } 14886 14887 if (zoneid != ALL_ZONES && 14888 ipif->ipif_zoneid != zoneid && 14889 ipif->ipif_zoneid != ALL_ZONES) 14890 continue; 14891 14892 /* 14893 * Interfaces with 0.0.0.0 address are allowed to be UP, but 14894 * are not valid as source addresses. 14895 */ 14896 if (ipif->ipif_lcl_addr == INADDR_ANY) 14897 continue; 14898 14899 /* 14900 * Check compatibility of local address for destination's 14901 * default label if we're on a labeled system. Incompatible 14902 * addresses can't be used at all. 
14903 		 */
14904 		if (dst_rhtp != NULL) {
14905 			boolean_t incompat;
14906
14907 			src_rhtp = find_tpc(&ipif->ipif_lcl_addr,
14908 			    IPV4_VERSION, B_FALSE);
14909 			if (src_rhtp == NULL)
14910 				continue;
14911 			incompat = src_rhtp->tpc_tp.host_type != SUN_CIPSO ||
14912 			    src_rhtp->tpc_tp.tp_doi !=
14913 			    dst_rhtp->tpc_tp.tp_doi ||
14914 			    (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label,
14915 			    &src_rhtp->tpc_tp.tp_sl_range_cipso) &&
14916 			    !blinlset(&dst_rhtp->tpc_tp.tp_def_label,
14917 			    src_rhtp->tpc_tp.tp_sl_set_cipso));
14918 			TPC_RELE(src_rhtp);
14919 			if (incompat)
14920 				continue;
14921 		}
14922
14923 		samenet = ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet);
14924
14925 		if (ipif->ipif_lcl_addr == dst) {
14926 			type = IPIF_LOCALADDR;
14927 		} else if (ipif->ipif_flags & IPIF_DEPRECATED) {
14928 			type = samenet ? IPIF_SAMENET_DEPRECATED :
14929 			    IPIF_DIFFNET_DEPRECATED;
14930 		} else if (ipif->ipif_zoneid == ALL_ZONES) {
14931 			type = samenet ? IPIF_SAMENET_ALLZONES :
14932 			    IPIF_DIFFNET_ALLZONES;
14933 		} else {
14934 			type = samenet ? IPIF_SAMENET : IPIF_DIFFNET;
14935 		}
14936
14937 		if (type > best_type) {
14938 			best_type = type;
14939 			best_ipif = ipif;
14940 			if (best_type == IPIF_LOCALADDR)
14941 				break; /* can't get better */
14942 		}
14943 	} while ((ipif = next_ipif) != start_ipif);
14944
14945 	if ((ipif = best_ipif) != NULL) {
14946 		mutex_enter(&ipif->ipif_ill->ill_lock);
14947 		if (IPIF_IS_CONDEMNED(ipif)) {
14948 			mutex_exit(&ipif->ipif_ill->ill_lock);
14949 			goto retry;
14950 		}
14951 		ipif_refhold_locked(ipif);
14952
14953 		/*
14954 		 * For IPMP, update the source ipif rotor to the next ipif,
14955 		 * provided we can look it up. (We must not use it if it's
14956 		 * IPIF_CONDEMNED since we may have grabbed ill_g_lock after
14957 		 * ipif_free() checked ill_src_ipif.)
14958 		 */
14959 		if (IS_IPMP(ill) && ipif != NULL) {
14960 			next_ipif = ipif->ipif_next;
14961 			if (next_ipif != NULL && !IPIF_IS_CONDEMNED(next_ipif))
14962 				ill->ill_src_ipif = next_ipif;
14963 			else
14964 				ill->ill_src_ipif = NULL;
14965 		}
14966 		mutex_exit(&ipif->ipif_ill->ill_lock);
14967 	}
14968
14969 	rw_exit(&ipst->ips_ill_g_lock);
14970 	if (usill != NULL)
14971 		ill_refrele(usill);
14972 	if (ipmp_ill != NULL)
14973 		ill_refrele(ipmp_ill);
14974 	if (dst_rhtp != NULL)
14975 		TPC_RELE(dst_rhtp);
14976
14977 #ifdef DEBUG
14978 	if (ipif == NULL) {
14979 		char buf1[INET6_ADDRSTRLEN];
14980
14981 		ip1dbg(("ipif_select_source_v4(%s, %s) -> NULL\n",
14982 		    ill->ill_name,
14983 		    inet_ntop(AF_INET, &dst, buf1, sizeof (buf1))));
14984 	} else {
14985 		char buf1[INET6_ADDRSTRLEN];
14986 		char buf2[INET6_ADDRSTRLEN];
14987
14988 		ip1dbg(("ipif_select_source_v4(%s, %s) -> %s\n",
14989 		    ipif->ipif_ill->ill_name,
14990 		    inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)),
14991 		    inet_ntop(AF_INET, &ipif->ipif_lcl_addr,
14992 		    buf2, sizeof (buf2))));
14993 	}
14994 #endif /* DEBUG */
14995 	return (ipif);
14996 }
14997
14998 /*
14999  * Pick a source address based on the destination ill and an optional setsrc
15000  * address.
15001  * The result is stored in srcp. If generation is set, then put the source
15002  * generation number there before we look for the source address (to avoid
15003  * missing changes in the set of source addresses.)
15004  * If flagsp is set, then use it to pass back ipif_flags.
15005  *
15006  * If the caller wants to cache the returned source address and detect when
15007  * that might be stale, the caller should pass in a generation argument,
15008  * which the caller can later compare against ips_src_generation.
15009  *
15010  * The precedence order for selecting an IPv4 source address is:
15011  * - RTF_SETSRC on the offlink ire always wins.
15012  * - If usesrc is set, swap the ill to be the usesrc one.
15013  * - If IPMP is used on the ill, select a random address from the most
15014  *   preferred ones below:
15015  * 1. If onlink destination, same subnet and not deprecated, not ALL_ZONES
15016  * 2. Not deprecated, not ALL_ZONES
15017  * 3. If onlink destination, same subnet and not deprecated, ALL_ZONES
15018  * 4. Not deprecated, ALL_ZONES
15019  * 5. If onlink destination, same subnet and deprecated
15020  * 6. Deprecated.
15021  *
15022  * We have lower preference for ALL_ZONES IP addresses,
15023  * as they pose problems with unlabeled destinations.
15024  *
15025  * Note that when multiple IP addresses match e.g., #1 we pick
15026  * the first one if IPMP is not in use. With IPMP we randomize.
15027  */
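/*
 * Illustrative use (a sketch, not a verbatim caller; `ill', `dst', `zoneid'
 * and `ipst' are assumed to be in hand):
 *
 *	ipaddr_t src;
 *	uint32_t gen;
 *
 *	if (ip_select_source_v4(ill, INADDR_ANY, dst, INADDR_ANY, zoneid,
 *	    ipst, &src, &gen, NULL) == 0) {
 *		... transmit using src; treat a cached src as suspect once
 *		... gen != ipst->ips_src_generation.
 *	}
 */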
15005 * 15006 * If the caller wants to cache the returned source address and detect when 15007 * that might be stale, the caller should pass in a generation argument, 15008 * which the caller can later compare against ips_src_generation. 15009 * 15010 * The precedence order for selecting an IPv4 source address is: 15011 * - RTF_SETSRC on the offlink ire always wins. 15012 * - If usesrc is set, swap the ill to be the usesrc one. 15013 * - If IPMP is used on the ill, select a random address from the most 15014 * preferred ones below: 15015 * 1. If onlink destination, same subnet and not deprecated, not ALL_ZONES 15016 * 2. Not deprecated, not ALL_ZONES 15017 * 3. If onlink destination, same subnet and not deprecated, ALL_ZONES 15018 * 4. Not deprecated, ALL_ZONES 15019 * 5. If onlink destination, same subnet and deprecated 15020 * 6. Deprecated. 15021 * 15022 * We have lower preference for ALL_ZONES IP addresses, 15023 * as they pose problems with unlabeled destinations. 15024 * 15025 * Note that when multiple IP addresses match (e.g., #1), we pick 15026 * the first one if IPMP is not in use. With IPMP we randomize. 15027 */ 15028 int 15029 ip_select_source_v4(ill_t *ill, ipaddr_t setsrc, ipaddr_t dst, 15030 ipaddr_t multicast_ifaddr, 15031 zoneid_t zoneid, ip_stack_t *ipst, ipaddr_t *srcp, 15032 uint32_t *generation, uint64_t *flagsp) 15033 { 15034 ipif_t *ipif; 15035 boolean_t notready = B_FALSE; /* Set if !ipif_addr_ready found */ 15036 15037 if (flagsp != NULL) 15038 *flagsp = 0; 15039 15040 /* 15041 * Need to grab the generation number before we check to 15042 * avoid a race with a change to the set of local addresses. 15043 * No lock needed since the thread which updates the set of local 15044 * addresses uses ipif/ill locks and exits those (hence a store memory 15045 * barrier) before doing the atomic increment of ips_src_generation. 15046 */ 15047 if (generation != NULL) { 15048 *generation = ipst->ips_src_generation; 15049 } 15050 15051 if (CLASSD(dst) && multicast_ifaddr != INADDR_ANY) { 15052 *srcp = multicast_ifaddr; 15053 return (0); 15054 } 15055 15056 /* Was RTF_SETSRC set on the first IRE in the recursive lookup? */ 15057 if (setsrc != INADDR_ANY) { 15058 *srcp = setsrc; 15059 return (0); 15060 } 15061 ipif = ipif_select_source_v4(ill, dst, zoneid, B_TRUE, &notready); 15062 if (ipif == NULL) { 15063 if (notready) 15064 return (ENETDOWN); 15065 else 15066 return (EADDRNOTAVAIL); 15067 } 15068 *srcp = ipif->ipif_lcl_addr; 15069 if (flagsp != NULL) 15070 *flagsp = ipif->ipif_flags; 15071 ipif_refrele(ipif); 15072 return (0); 15073 } 15074 15075 /* ARGSUSED */ 15076 int 15077 if_unitsel_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 15078 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 15079 { 15080 /* 15081 * ill_phyint_reinit merged the v4 and v6 into a single 15082 * ipsq. We might not have been able to complete the 15083 * operation in ipif_set_values, if we could not become 15084 * exclusive. If so restart it here. 15085 */ 15086 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q)); 15087 } 15088 15089 /* 15090 * Can operate on either a module or a driver queue. 15091 * Returns an error if not a module queue.
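 * For illustration (hypothetical device): with IP plumbed over the "bge" driver and a unit-select of ppa 0, the code below walks to the bottom of the stream, reads the driver's mi_idname ("bge"), and builds the interface name "bge0".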
15092 */ 15093 /* ARGSUSED */ 15094 int 15095 if_unitsel(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 15096 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 15097 { 15098 queue_t *q1 = q; 15099 char *cp; 15100 char interf_name[LIFNAMSIZ]; 15101 uint_t ppa = *(uint_t *)mp->b_cont->b_cont->b_rptr; 15102 15103 if (q->q_next == NULL) { 15104 ip1dbg(( 15105 "if_unitsel: IF_UNITSEL: no q_next\n")); 15106 return (EINVAL); 15107 } 15108 15109 if (((ill_t *)(q->q_ptr))->ill_name[0] != '\0') 15110 return (EALREADY); 15111 15112 do { 15113 q1 = q1->q_next; 15114 } while (q1->q_next); 15115 cp = q1->q_qinfo->qi_minfo->mi_idname; 15116 (void) sprintf(interf_name, "%s%d", cp, ppa); 15117 15118 /* 15119 * Here we are not going to delay the ioack until after 15120 * ACKs from DL_ATTACH_REQ/DL_BIND_REQ. So no need to save the 15121 * original ioctl message before sending the requests. 15122 */ 15123 return (ipif_set_values(q, mp, interf_name, &ppa)); 15124 } 15125 15126 /* ARGSUSED */ 15127 int 15128 ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 15129 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 15130 { 15131 return (ENXIO); 15132 } 15133 15134 /* 15135 * Create any IRE_BROADCAST entries for `ipif', and store those entries in 15136 * `irep'. Returns a pointer to the next free `irep' entry. 15137 * A mirror exists in ipif_delete_bcast_ires(). 15138 * 15139 * The management of any "extra" or seemingly duplicate IRE_BROADCASTs is 15140 * done in ire_add. 15141 */ 15142 static ire_t ** 15143 ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep) 15144 { 15145 ipaddr_t addr; 15146 ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr); 15147 ipaddr_t subnetmask = ipif->ipif_net_mask; 15148 ill_t *ill = ipif->ipif_ill; 15149 zoneid_t zoneid = ipif->ipif_zoneid; 15150 15151 ip1dbg(("ipif_create_bcast_ires: creating broadcast IREs\n")); 15152 15153 ASSERT(ipif->ipif_flags & IPIF_BROADCAST); 15154 ASSERT(!(ipif->ipif_flags & IPIF_NOXMIT)); 15155 15156 if (ipif->ipif_lcl_addr == INADDR_ANY || 15157 (ipif->ipif_flags & IPIF_NOLOCAL)) 15158 netmask = htonl(IN_CLASSA_NET); /* fallback */ 15159 15160 irep = ire_create_bcast(ill, 0, zoneid, irep); 15161 irep = ire_create_bcast(ill, INADDR_BROADCAST, zoneid, irep); 15162 15163 /* 15164 * For backward compatibility, we create net broadcast IREs based on 15165 * the old "IP address class system", since some old machines only 15166 * respond to these class-derived net broadcasts. However, we must not 15167 * create these net broadcast IREs if the subnetmask is shorter than 15168 * the netmask derived from the IP address class. Otherwise, we may 15169 * create a net broadcast address which is the same as an IP address 15170 * on the subnet -- and then TCP will refuse to talk to that address. 15171 */ 15172 if (netmask < subnetmask) { 15173 addr = netmask & ipif->ipif_subnet; 15174 irep = ire_create_bcast(ill, addr, zoneid, irep); 15175 irep = ire_create_bcast(ill, ~netmask | addr, zoneid, irep); 15176 } 15177 15178 /* 15179 * Don't create IRE_BROADCAST IREs for the interface if the subnetmask 15180 * is 0xFFFFFFFF, as an IRE_LOCAL for that interface is already 15181 * created. Creating these broadcast IREs will only create confusion 15182 * as `addr' will be the same as the IP address.
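 * Worked example (hypothetical addresses): for ipif_lcl_addr 10.1.2.3 with subnetmask 255.255.0.0, the class-derived netmask is 255.0.0.0, so the IREs created are 0.0.0.0 and 255.255.255.255 (above), the class A net broadcasts 10.0.0.0 and 10.255.255.255, and the subnet broadcasts 10.1.0.0 and 10.1.255.255 (below).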
15183 */ 15184 if (subnetmask != 0xFFFFFFFF) { 15185 addr = ipif->ipif_subnet; 15186 irep = ire_create_bcast(ill, addr, zoneid, irep); 15187 irep = ire_create_bcast(ill, ~subnetmask | addr, zoneid, irep); 15188 } 15189 15190 return (irep); 15191 } 15192 15193 /* 15194 * Mirror of ipif_create_bcast_ires() 15195 */ 15196 static void 15197 ipif_delete_bcast_ires(ipif_t *ipif) 15198 { 15199 ipaddr_t addr; 15200 ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr); 15201 ipaddr_t subnetmask = ipif->ipif_net_mask; 15202 ill_t *ill = ipif->ipif_ill; 15203 zoneid_t zoneid = ipif->ipif_zoneid; 15204 ire_t *ire; 15205 15206 ASSERT(ipif->ipif_flags & IPIF_BROADCAST); 15207 ASSERT(!(ipif->ipif_flags & IPIF_NOXMIT)); 15208 15209 if (ipif->ipif_lcl_addr == INADDR_ANY || 15210 (ipif->ipif_flags & IPIF_NOLOCAL)) 15211 netmask = htonl(IN_CLASSA_NET); /* fallback */ 15212 15213 ire = ire_lookup_bcast(ill, 0, zoneid); 15214 ASSERT(ire != NULL); 15215 ire_delete(ire); ire_refrele(ire); 15216 ire = ire_lookup_bcast(ill, INADDR_BROADCAST, zoneid); 15217 ASSERT(ire != NULL); 15218 ire_delete(ire); ire_refrele(ire); 15219 15220 /* 15221 * For backward compatibility, ipif_create_bcast_ires() created net 15222 * broadcast IREs based on the old "IP address class system", since some 15223 * old machines only respond to these class-derived net broadcasts. Those 15224 * IREs exist only when the subnetmask is longer than the netmask derived 15225 * from the IP address class (see the comment in ipif_create_bcast_ires()), 15226 * so mirror that condition here and only look up and delete the net 15227 * broadcast IREs under the same test. 15228 */ 15229 if (netmask < subnetmask) { 15230 addr = netmask & ipif->ipif_subnet; 15231 ire = ire_lookup_bcast(ill, addr, zoneid); 15232 ASSERT(ire != NULL); 15233 ire_delete(ire); ire_refrele(ire); 15234 ire = ire_lookup_bcast(ill, ~netmask | addr, zoneid); 15235 ASSERT(ire != NULL); 15236 ire_delete(ire); ire_refrele(ire); 15237 } 15238 15239 /* 15240 * No IRE_BROADCAST IREs were created for the interface when the 15241 * subnetmask is 0xFFFFFFFF, as an IRE_LOCAL covers it and `addr' 15242 * would have been the same as the IP address, so there is nothing 15243 * to delete in that case. 15244 */ 15245 if (subnetmask != 0xFFFFFFFF) { 15246 addr = ipif->ipif_subnet; 15247 ire = ire_lookup_bcast(ill, addr, zoneid); 15248 ASSERT(ire != NULL); 15249 ire_delete(ire); ire_refrele(ire); 15250 ire = ire_lookup_bcast(ill, ~subnetmask | addr, zoneid); 15251 ASSERT(ire != NULL); 15252 ire_delete(ire); ire_refrele(ire); 15253 } 15254 } 15255 15256 /* 15257 * Extract the flags (including IFF_CANTCHANGE flags such as IFF_IPV*) 15258 * from lifr_flags and the name from lifr_name. 15259 * Set IFF_IPV* and ill_isv6 prior to doing the lookup 15260 * since ipif_lookup_on_name uses the _isv6 flags when matching. 15261 * Returns EINPROGRESS when mp has been consumed by queueing it on 15262 * ipx_pending_mp and the ioctl will complete in ip_rput. 15263 * 15264 * Can operate on either a module or a driver queue. 15265 * Returns an error if not a module queue.
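 * For illustration (hypothetical interface): plumbing "bge0" for IPv4 arrives here as SIOCSLIFNAME with lifr_name "bge0" and IFF_IPV4 set in lifr_flags; the same name with IFF_IPV6 instead attaches the ill to the phyint as its IPv6 half.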
15266 */ 15267 /* ARGSUSED */ 15268 int 15269 ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15270 ip_ioctl_cmd_t *ipip, void *if_req) 15271 { 15272 ill_t *ill = q->q_ptr; 15273 phyint_t *phyi; 15274 ip_stack_t *ipst; 15275 struct lifreq *lifr = if_req; 15276 uint64_t new_flags; 15277 15278 ASSERT(ipif != NULL); 15279 ip1dbg(("ip_sioctl_slifname %s\n", lifr->lifr_name)); 15280 15281 if (q->q_next == NULL) { 15282 ip1dbg(("ip_sioctl_slifname: SIOCSLIFNAME: no q_next\n")); 15283 return (EINVAL); 15284 } 15285 15286 /* 15287 * If we are not writer on 'q' then this interface exists already 15288 * and previous lookups (ip_extract_lifreq()) found this ipif -- 15289 * so return EALREADY. 15290 */ 15291 if (ill != ipif->ipif_ill) 15292 return (EALREADY); 15293 15294 if (ill->ill_name[0] != '\0') 15295 return (EALREADY); 15296 15297 /* 15298 * If there's another ill already with the requested name, ensure 15299 * that it's of the same type. Otherwise, ill_phyint_reinit() will 15300 * fuse together two unrelated ills, which will cause chaos. 15301 */ 15302 ipst = ill->ill_ipst; 15303 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 15304 lifr->lifr_name, NULL); 15305 if (phyi != NULL) { 15306 ill_t *ill_mate = phyi->phyint_illv4; 15307 15308 if (ill_mate == NULL) 15309 ill_mate = phyi->phyint_illv6; 15310 ASSERT(ill_mate != NULL); 15311 15312 if (ill_mate->ill_media->ip_m_mac_type != 15313 ill->ill_media->ip_m_mac_type) { 15314 ip1dbg(("ip_sioctl_slifname: SIOCSLIFNAME: attempt to " 15315 "use the same ill name on differing media\n")); 15316 return (EINVAL); 15317 } 15318 } 15319 15320 /* 15321 * We start off as IFF_IPV4 in ipif_allocate and become 15322 * IFF_IPV4 or IFF_IPV6 here depending on the lifr_flags value. 15323 * The only flags that we read from user space are IFF_IPV4, 15324 * IFF_IPV6, and IFF_BROADCAST. 15325 * 15326 * This ill has not been inserted into the global list. 15327 * So we are still single threaded and don't need any lock. 15328 * 15329 * Sanity check the flags. 15330 */ 15331 15332 if ((lifr->lifr_flags & IFF_BROADCAST) && 15333 ((lifr->lifr_flags & IFF_IPV6) || 15334 (!ill->ill_needs_attach && ill->ill_bcast_addr_length == 0))) { 15335 ip1dbg(("ip_sioctl_slifname: link not broadcast capable " 15336 "or IPv6 i.e., no broadcast\n")); 15337 return (EINVAL); 15338 } 15339 15340 new_flags = 15341 lifr->lifr_flags & (IFF_IPV6|IFF_IPV4|IFF_BROADCAST); 15342 15343 if ((new_flags ^ (IFF_IPV6|IFF_IPV4)) == 0) { 15344 ip1dbg(("ip_sioctl_slifname: flags must be exactly one of " 15345 "IFF_IPV4 or IFF_IPV6\n")); 15346 return (EINVAL); 15347 } 15348 15349 /* 15350 * We always start off as IPv4, so only need to check for IPv6. 15351 */ 15352 if ((new_flags & IFF_IPV6) != 0) { 15353 ill->ill_flags |= ILLF_IPV6; 15354 ill->ill_flags &= ~ILLF_IPV4; 15355 } 15356 15357 if ((new_flags & IFF_BROADCAST) != 0) 15358 ipif->ipif_flags |= IPIF_BROADCAST; 15359 else 15360 ipif->ipif_flags &= ~IPIF_BROADCAST; 15361 15362 /* We started off as V4. */ 15363 if (ill->ill_flags & ILLF_IPV6) { 15364 ill->ill_phyint->phyint_illv6 = ill; 15365 ill->ill_phyint->phyint_illv4 = NULL; 15366 } 15367 15368 return (ipif_set_values(q, mp, lifr->lifr_name, &lifr->lifr_ppa)); 15369 } 15370 15371 /* ARGSUSED */ 15372 int 15373 ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15374 ip_ioctl_cmd_t *ipip, void *if_req) 15375 { 15376 /* 15377 * ill_phyint_reinit merged the v4 and v6 into a single 15378 * ipsq.
We might not have been able to complete the 15379 * slifname in ipif_set_values, if we could not become 15380 * exclusive. If so, restart it here. 15381 */ 15382 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q)); 15383 } 15384 15385 /* 15386 * Return a pointer to the ipif which matches the index, IP version type and 15387 * zoneid. 15388 */ 15389 ipif_t * 15390 ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid, 15391 ip_stack_t *ipst) 15392 { 15393 ill_t *ill; 15394 ipif_t *ipif = NULL; 15395 15396 ill = ill_lookup_on_ifindex(index, isv6, ipst); 15397 if (ill != NULL) { 15398 mutex_enter(&ill->ill_lock); 15399 for (ipif = ill->ill_ipif; ipif != NULL; 15400 ipif = ipif->ipif_next) { 15401 if (!IPIF_IS_CONDEMNED(ipif) && (zoneid == ALL_ZONES || 15402 zoneid == ipif->ipif_zoneid || 15403 ipif->ipif_zoneid == ALL_ZONES)) { 15404 ipif_refhold_locked(ipif); 15405 break; 15406 } 15407 } 15408 mutex_exit(&ill->ill_lock); 15409 ill_refrele(ill); 15410 } 15411 return (ipif); 15412 } 15413 15414 /* 15415 * Change an existing physical interface's index. If the new index 15416 * is acceptable we update the index and the phyint_list_avl_by_index tree. 15417 * Finally, we update other systems which may have a dependence on the 15418 * index value. 15419 */ 15420 /* ARGSUSED */ 15421 int 15422 ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15423 ip_ioctl_cmd_t *ipip, void *ifreq) 15424 { 15425 ill_t *ill; 15426 phyint_t *phyi; 15427 struct ifreq *ifr = (struct ifreq *)ifreq; 15428 struct lifreq *lifr = (struct lifreq *)ifreq; 15429 uint_t old_index, index; 15430 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 15431 avl_index_t where; 15432 15433 if (ipip->ipi_cmd_type == IF_CMD) 15434 index = ifr->ifr_index; 15435 else 15436 index = lifr->lifr_index; 15437 15438 /* 15439 * Only allowed on the physical interface (ipif_id 0). Also, index 15440 * zero is illegal. 15441 */ 15442 ill = ipif->ipif_ill; 15443 phyi = ill->ill_phyint; 15444 if (ipif->ipif_id != 0 || index == 0) { 15445 return (EINVAL); 15446 } 15447 15448 /* If the index is not changing, no work to do */ 15449 if (phyi->phyint_ifindex == index) 15450 return (0); 15451 15452 /* 15453 * Use phyint_exists() to determine if the new interface index 15454 * is already in use. If the index is unused then we need to 15455 * change the phyint's position in the phyint_list_avl_by_index 15456 * tree. If we do not do this, subsequent lookups (using the new 15457 * index value) will not find the phyint. 15458 */ 15459 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 15460 if (phyint_exists(index, ipst)) { 15461 rw_exit(&ipst->ips_ill_g_lock); 15462 return (EEXIST); 15463 } 15464 15465 /* 15466 * The new index is unused. Set it in the phyint. However we must not 15467 * forget to trigger the NE_IFINDEX_CHANGE event before the ifindex 15468 * changes. The event must be bound to the old ifindex value.
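 * (For example, with hypothetical indexes: changing an interface's index from 2 to 5 dispatches NE_IFINDEX_CHANGE while phyint_ifindex is still 2, carrying the new index 5 as the event payload.)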
15468 */ 15469 ill_nic_event_dispatch(ill, 0, NE_IFINDEX_CHANGE, 15470 &index, sizeof (index)); 15471 15472 old_index = phyi->phyint_ifindex; 15473 phyi->phyint_ifindex = index; 15474 15475 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, phyi); 15476 (void) avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 15477 &index, &where); 15478 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 15479 phyi, where); 15480 rw_exit(&ipst->ips_ill_g_lock); 15481 15482 /* Update SCTP's ILL list */ 15483 sctp_ill_reindex(ill, old_index); 15484 15485 /* Send the routing sockets message */ 15486 ip_rts_ifmsg(ipif, RTSQ_DEFAULT); 15487 if (ILL_OTHER(ill)) 15488 ip_rts_ifmsg(ILL_OTHER(ill)->ill_ipif, RTSQ_DEFAULT); 15489 15490 /* Perhaps ilgs should use this ill */ 15491 update_conn_ill(NULL, ill->ill_ipst); 15492 return (0); 15493 } 15494 15495 /* ARGSUSED */ 15496 int 15497 ip_sioctl_get_lifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15498 ip_ioctl_cmd_t *ipip, void *ifreq) 15499 { 15500 struct ifreq *ifr = (struct ifreq *)ifreq; 15501 struct lifreq *lifr = (struct lifreq *)ifreq; 15502 15503 ip1dbg(("ip_sioctl_get_lifindex(%s:%u %p)\n", 15504 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 15505 /* Get the interface index */ 15506 if (ipip->ipi_cmd_type == IF_CMD) { 15507 ifr->ifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 15508 } else { 15509 lifr->lifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 15510 } 15511 return (0); 15512 } 15513 15514 /* ARGSUSED */ 15515 int 15516 ip_sioctl_get_lifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15517 ip_ioctl_cmd_t *ipip, void *ifreq) 15518 { 15519 struct lifreq *lifr = (struct lifreq *)ifreq; 15520 15521 ip1dbg(("ip_sioctl_get_lifzone(%s:%u %p)\n", 15522 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 15523 /* Get the interface zone */ 15524 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 15525 lifr->lifr_zoneid = ipif->ipif_zoneid; 15526 return (0); 15527 } 15528 15529 /* 15530 * Set the zoneid of an interface. 15531 */ 15532 /* ARGSUSED */ 15533 int 15534 ip_sioctl_slifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15535 ip_ioctl_cmd_t *ipip, void *ifreq) 15536 { 15537 struct lifreq *lifr = (struct lifreq *)ifreq; 15538 int err = 0; 15539 boolean_t need_up = B_FALSE; 15540 zone_t *zptr; 15541 zone_status_t status; 15542 zoneid_t zoneid; 15543 15544 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 15545 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) { 15546 if (!is_system_labeled()) 15547 return (ENOTSUP); 15548 zoneid = GLOBAL_ZONEID; 15549 } 15550 15551 /* cannot assign instance zero to a non-global zone */ 15552 if (ipif->ipif_id == 0 && zoneid != GLOBAL_ZONEID) 15553 return (ENOTSUP); 15554 15555 /* 15556 * Cannot assign to a zone that doesn't exist or is shutting down. In 15557 * the event of a race with the zone shutdown processing, since IP 15558 * serializes this ioctl and SIOCGLIFCONF/SIOCLIFREMOVEIF, we know the 15559 * interface will be cleaned up even if the zone is shut down 15560 * immediately after the status check. If the interface can't be brought 15561 * down right away, and the zone is shut down before the restart 15562 * function is called, we resolve the possible races by rechecking the 15563 * zone status in the restart function. 
15564 */ 15565 if ((zptr = zone_find_by_id(zoneid)) == NULL) 15566 return (EINVAL); 15567 status = zone_status_get(zptr); 15568 zone_rele(zptr); 15569 15570 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) 15571 return (EINVAL); 15572 15573 if (ipif->ipif_flags & IPIF_UP) { 15574 /* 15575 * If the interface is already marked up, 15576 * we call ipif_down which will take care 15577 * of ditching any IREs that have been set 15578 * up based on the old interface address. 15579 */ 15580 err = ipif_logical_down(ipif, q, mp); 15581 if (err == EINPROGRESS) 15582 return (err); 15583 (void) ipif_down_tail(ipif); 15584 need_up = B_TRUE; 15585 } 15586 15587 err = ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, need_up); 15588 return (err); 15589 } 15590 15591 static int 15592 ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid, 15593 queue_t *q, mblk_t *mp, boolean_t need_up) 15594 { 15595 int err = 0; 15596 ip_stack_t *ipst; 15597 15598 ip1dbg(("ip_sioctl_slifzone_tail(%s:%u %p)\n", 15599 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 15600 15601 if (CONN_Q(q)) 15602 ipst = CONNQ_TO_IPST(q); 15603 else 15604 ipst = ILLQ_TO_IPST(q); 15605 15606 /* 15607 * For exclusive stacks we don't allow a different zoneid than 15608 * global. 15609 */ 15610 if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID && 15611 zoneid != GLOBAL_ZONEID) 15612 return (EINVAL); 15613 15614 /* Set the new zone id. */ 15615 ipif->ipif_zoneid = zoneid; 15616 15617 /* Update sctp list */ 15618 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 15619 15620 /* The default multicast interface might have changed */ 15621 ire_increment_multicast_generation(ipst, ipif->ipif_ill->ill_isv6); 15622 15623 if (need_up) { 15624 /* 15625 * Now bring the interface back up. If this 15626 * is the only IPIF for the ILL, ipif_up 15627 * will have to re-bind to the device, so 15628 * we may get back EINPROGRESS, in which 15629 * case, this IOCTL will get completed in 15630 * ip_rput_dlpi when we see the DL_BIND_ACK. 15631 */ 15632 err = ipif_up(ipif, q, mp); 15633 } 15634 return (err); 15635 } 15636 15637 /* ARGSUSED */ 15638 int 15639 ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15640 ip_ioctl_cmd_t *ipip, void *if_req) 15641 { 15642 struct lifreq *lifr = (struct lifreq *)if_req; 15643 zoneid_t zoneid; 15644 zone_t *zptr; 15645 zone_status_t status; 15646 15647 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 15648 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) 15649 zoneid = GLOBAL_ZONEID; 15650 15651 ip1dbg(("ip_sioctl_slifzone_restart(%s:%u %p)\n", 15652 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 15653 15654 /* 15655 * We recheck the zone status to resolve the following race condition: 15656 * 1) process sends SIOCSLIFZONE to put hme0:1 in zone "myzone"; 15657 * 2) hme0:1 is up and can't be brought down right away; 15658 * ip_sioctl_slifzone() returns EINPROGRESS and the request is queued; 15659 * 3) zone "myzone" is halted; the zone status switches to 15660 * 'shutting_down' and the zones framework sends SIOCGLIFCONF to list 15661 * the interfaces to remove - hme0:1 is not returned because it's not 15662 * yet in "myzone", so it won't be removed; 15663 * 4) the restart function for SIOCSLIFZONE is called; without the 15664 * status check here, we would have hme0:1 in "myzone" after it's been 15665 * destroyed.
15666 * Note that if the status check fails, we need to bring the interface 15667 * back to its state prior to ip_sioctl_slifzone(), hence the call to 15668 * ipif_up_done[_v6](). 15669 */ 15670 status = ZONE_IS_UNINITIALIZED; 15671 if ((zptr = zone_find_by_id(zoneid)) != NULL) { 15672 status = zone_status_get(zptr); 15673 zone_rele(zptr); 15674 } 15675 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) { 15676 if (ipif->ipif_isv6) { 15677 (void) ipif_up_done_v6(ipif); 15678 } else { 15679 (void) ipif_up_done(ipif); 15680 } 15681 return (EINVAL); 15682 } 15683 15684 (void) ipif_down_tail(ipif); 15685 15686 return (ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, 15687 B_TRUE)); 15688 } 15689 15690 /* 15691 * Return the number of addresses on `ill' with one or more of the values 15692 * in `set' set and all of the values in `clear' clear. 15693 */ 15694 static uint_t 15695 ill_flagaddr_cnt(const ill_t *ill, uint64_t set, uint64_t clear) 15696 { 15697 ipif_t *ipif; 15698 uint_t cnt = 0; 15699 15700 ASSERT(IAM_WRITER_ILL(ill)); 15701 15702 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) 15703 if ((ipif->ipif_flags & set) && !(ipif->ipif_flags & clear)) 15704 cnt++; 15705 15706 return (cnt); 15707 } 15708 15709 /* 15710 * Return the number of migratable addresses on `ill' that are under 15711 * application control. 15712 */ 15713 uint_t 15714 ill_appaddr_cnt(const ill_t *ill) 15715 { 15716 return (ill_flagaddr_cnt(ill, IPIF_DHCPRUNNING | IPIF_ADDRCONF, 15717 IPIF_NOFAILOVER)); 15718 } 15719 15720 /* 15721 * Return the number of point-to-point addresses on `ill'. 15722 */ 15723 uint_t 15724 ill_ptpaddr_cnt(const ill_t *ill) 15725 { 15726 return (ill_flagaddr_cnt(ill, IPIF_POINTOPOINT, 0)); 15727 } 15728 15729 /* ARGSUSED */ 15730 int 15731 ip_sioctl_get_lifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15732 ip_ioctl_cmd_t *ipip, void *ifreq) 15733 { 15734 struct lifreq *lifr = ifreq; 15735 15736 ASSERT(q->q_next == NULL); 15737 ASSERT(CONN_Q(q)); 15738 15739 ip1dbg(("ip_sioctl_get_lifusesrc(%s:%u %p)\n", 15740 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 15741 lifr->lifr_index = ipif->ipif_ill->ill_usesrc_ifindex; 15742 ip1dbg(("ip_sioctl_get_lifusesrc:lifr_index = %d\n", lifr->lifr_index)); 15743 15744 return (0); 15745 } 15746 15747 /* Find the previous ILL in this usesrc group */ 15748 static ill_t * 15749 ill_prev_usesrc(ill_t *uill) 15750 { 15751 ill_t *ill; 15752 15753 for (ill = uill->ill_usesrc_grp_next; 15754 ASSERT(ill), ill->ill_usesrc_grp_next != uill; 15755 ill = ill->ill_usesrc_grp_next) 15756 /* do nothing */; 15757 return (ill); 15758 } 15759 15760 /* 15761 * Release all members of the usesrc group. This routine is called 15762 * from ill_delete when the interface being unplumbed is the 15763 * group head. 15764 * 15765 * This silently clears the usesrc that ifconfig set up. 15766 * An alternative would be to keep that ifindex, and drop packets on the floor 15767 * since no source address can be selected. 15768 * Even if we keep the current semantics, we don't need a lock or a linked 15769 * list. We can walk all the ills, checking if they have an 15770 * ill_usesrc_ifindex matching the one that is being removed. The issue is 15771 * how we return the usesrc users (SIOCGLIFSRCOF). We want to be able to find 15772 * the ills which have an ill_usesrc_ifindex matching a target ill. We could 15773 * also do that with an ill walk, but the walker would need to insert into the ioctl response.
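 * Illustration (hypothetical interfaces): with vni0 and vni1 both using hme0 as their source ill, the circular list is hme0 -> vni0 -> vni1 -> back to hme0 (via ill_usesrc_grp_next), where the head hme0 has ill_usesrc_ifindex == 0 and each client has ill_usesrc_ifindex set to hme0's ifindex.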
15774 */ 15775 static void 15776 ill_disband_usesrc_group(ill_t *uill) 15777 { 15778 ill_t *next_ill, *tmp_ill; 15779 ip_stack_t *ipst = uill->ill_ipst; 15780 15781 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock)); 15782 next_ill = uill->ill_usesrc_grp_next; 15783 15784 do { 15785 ASSERT(next_ill != NULL); 15786 tmp_ill = next_ill->ill_usesrc_grp_next; 15787 ASSERT(tmp_ill != NULL); 15788 next_ill->ill_usesrc_grp_next = NULL; 15789 next_ill->ill_usesrc_ifindex = 0; 15790 next_ill = tmp_ill; 15791 } while (next_ill->ill_usesrc_ifindex != 0); 15792 uill->ill_usesrc_grp_next = NULL; 15793 } 15794 15795 /* 15796 * Remove the client usesrc ILL from the list and relink it to a new list 15797 */ 15798 int 15799 ill_relink_usesrc_ills(ill_t *ucill, ill_t *uill, uint_t ifindex) 15800 { 15801 ill_t *ill, *tmp_ill; 15802 ip_stack_t *ipst = ucill->ill_ipst; 15803 15804 ASSERT((ucill != NULL) && (ucill->ill_usesrc_grp_next != NULL) && 15805 (uill != NULL) && RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock)); 15806 15807 /* 15808 * Sanity check: the usesrc client ILL being relinked must currently be 15809 * in use as a usesrc client (nonzero ill_usesrc_ifindex), and the new 15810 * usesrc ILL must not itself already be in use as a usesrc client 15811 * ILL. 15812 */ 15813 if ((ucill->ill_usesrc_ifindex == 0) || 15814 (uill->ill_usesrc_ifindex != 0)) { 15815 return (-1); 15816 } 15817 15818 ill = ill_prev_usesrc(ucill); 15819 ASSERT(ill->ill_usesrc_grp_next != NULL); 15820 15821 /* Remove from the current list */ 15822 if (ill->ill_usesrc_grp_next->ill_usesrc_grp_next == ill) { 15823 /* Only two elements in the list */ 15824 ASSERT(ill->ill_usesrc_ifindex == 0); 15825 ill->ill_usesrc_grp_next = NULL; 15826 } else { 15827 ill->ill_usesrc_grp_next = ucill->ill_usesrc_grp_next; 15828 } 15829 15830 if (ifindex == 0) { 15831 ucill->ill_usesrc_ifindex = 0; 15832 ucill->ill_usesrc_grp_next = NULL; 15833 return (0); 15834 } 15835 15836 ucill->ill_usesrc_ifindex = ifindex; 15837 tmp_ill = uill->ill_usesrc_grp_next; 15838 uill->ill_usesrc_grp_next = ucill; 15839 ucill->ill_usesrc_grp_next = 15840 (tmp_ill != NULL) ? tmp_ill : uill; 15841 return (0); 15842 } 15843 15844 /* 15845 * Set the ill_usesrc_ifindex and ill_usesrc_grp_next fields. See 15846 * synchronization notes in ip.c for locking details.
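 * A typical trigger (illustrative): "ifconfig vni0 usesrc hme0" sends SIOCSLIFUSESRC with lifr_index set to hme0's ifindex; lifr_index == 0 requests a reset of the existing usesrc binding.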
15847 */ 15848 /* ARGSUSED */ 15849 int 15850 ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15851 ip_ioctl_cmd_t *ipip, void *ifreq) 15852 { 15853 struct lifreq *lifr = (struct lifreq *)ifreq; 15854 boolean_t isv6 = B_FALSE, reset_flg = B_FALSE; 15855 ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill; 15856 int err = 0, ret; 15857 uint_t ifindex; 15858 ipsq_t *ipsq = NULL; 15859 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 15860 15861 ASSERT(IAM_WRITER_IPIF(ipif)); 15862 ASSERT(q->q_next == NULL); 15863 ASSERT(CONN_Q(q)); 15864 15865 isv6 = (Q_TO_CONN(q))->conn_family == AF_INET6; 15866 15867 ifindex = lifr->lifr_index; 15868 if (ifindex == 0) { 15869 if (usesrc_cli_ill->ill_usesrc_grp_next == NULL) { 15870 /* non usesrc group interface, nothing to reset */ 15871 return (0); 15872 } 15873 ifindex = usesrc_cli_ill->ill_usesrc_ifindex; 15874 /* valid reset request */ 15875 reset_flg = B_TRUE; 15876 } 15877 15878 usesrc_ill = ill_lookup_on_ifindex(ifindex, isv6, ipst); 15879 if (usesrc_ill == NULL) { 15880 return (ENXIO); 15881 } 15882 15883 ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl, 15884 NEW_OP, B_TRUE); 15885 if (ipsq == NULL) { 15886 err = EINPROGRESS; 15887 /* Operation enqueued on the ipsq of the usesrc ILL */ 15888 goto done; 15889 } 15890 15891 /* USESRC isn't currently supported with IPMP */ 15892 if (IS_IPMP(usesrc_ill) || IS_UNDER_IPMP(usesrc_ill)) { 15893 err = ENOTSUP; 15894 goto done; 15895 } 15896 15897 /* 15898 * USESRC isn't compatible with the STANDBY flag. (STANDBY is only 15899 * used by IPMP underlying interfaces, but someone might think it's 15900 * more general and try to use it independently with VNI.) 15901 */ 15902 if (usesrc_ill->ill_phyint->phyint_flags & PHYI_STANDBY) { 15903 err = ENOTSUP; 15904 goto done; 15905 } 15906 15907 /* 15908 * If the client is already in use as a usesrc_ill or a usesrc_ill is 15909 * already a client then return EINVAL 15910 */ 15911 if (IS_USESRC_ILL(usesrc_cli_ill) || IS_USESRC_CLI_ILL(usesrc_ill)) { 15912 err = EINVAL; 15913 goto done; 15914 } 15915 15916 /* 15917 * If the ill_usesrc_ifindex field is already set to what it needs to 15918 * be then this is a duplicate operation. 15919 */ 15920 if (!reset_flg && usesrc_cli_ill->ill_usesrc_ifindex == ifindex) { 15921 err = 0; 15922 goto done; 15923 } 15924 15925 ip1dbg(("ip_sioctl_slifusesrc: usesrc_cli_ill %s, usesrc_ill %s," 15926 " v6 = %d", usesrc_cli_ill->ill_name, usesrc_ill->ill_name, 15927 usesrc_ill->ill_isv6)); 15928 15929 /* 15930 * ill_g_usesrc_lock global lock protects the ill_usesrc_grp_next 15931 * and the ill_usesrc_ifindex fields 15932 */ 15933 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER); 15934 15935 if (reset_flg) { 15936 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 0); 15937 if (ret != 0) { 15938 err = EINVAL; 15939 } 15940 rw_exit(&ipst->ips_ill_g_usesrc_lock); 15941 goto done; 15942 } 15943 15944 /* 15945 * Four possibilities to consider: 15946 * 1. Both usesrc_ill and usesrc_cli_ill are not part of any usesrc grp 15947 * 2. usesrc_ill is part of a group but usesrc_cli_ill isn't 15948 * 3. usesrc_cli_ill is part of a group but usesrc_ill isn't 15949 * 4. 
Both are part of their respective usesrc groups 15950 */ 15951 if ((usesrc_ill->ill_usesrc_grp_next == NULL) && 15952 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) { 15953 ASSERT(usesrc_ill->ill_usesrc_ifindex == 0); 15954 usesrc_cli_ill->ill_usesrc_ifindex = ifindex; 15955 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill; 15956 usesrc_cli_ill->ill_usesrc_grp_next = usesrc_ill; 15957 } else if ((usesrc_ill->ill_usesrc_grp_next != NULL) && 15958 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) { 15959 usesrc_cli_ill->ill_usesrc_ifindex = ifindex; 15960 /* Insert at head of list */ 15961 usesrc_cli_ill->ill_usesrc_grp_next = 15962 usesrc_ill->ill_usesrc_grp_next; 15963 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill; 15964 } else { 15965 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 15966 ifindex); 15967 if (ret != 0) 15968 err = EINVAL; 15969 } 15970 rw_exit(&ipst->ips_ill_g_usesrc_lock); 15971 15972 done: 15973 if (ipsq != NULL) 15974 ipsq_exit(ipsq); 15975 /* The refrele on the lifr_name ipif is done by ip_process_ioctl */ 15976 ill_refrele(usesrc_ill); 15977 15978 /* Let conn_ixa caching know that source address selection changed */ 15979 ip_update_source_selection(ipst); 15980 15981 return (err); 15982 } 15983 15984 /* 15985 * comparison function used by avl. 15986 */ 15987 static int 15988 ill_phyint_compare_index(const void *index_ptr, const void *phyip) 15989 { 15990 15991 uint_t index; 15992 15993 ASSERT(phyip != NULL && index_ptr != NULL); 15994 15995 index = *((uint_t *)index_ptr); 15996 /* 15997 * let the phyint with the lowest index be on top. 15998 */ 15999 if (((phyint_t *)phyip)->phyint_ifindex < index) 16000 return (1); 16001 if (((phyint_t *)phyip)->phyint_ifindex > index) 16002 return (-1); 16003 return (0); 16004 } 16005 16006 /* 16007 * comparison function used by avl. 16008 */ 16009 static int 16010 ill_phyint_compare_name(const void *name_ptr, const void *phyip) 16011 { 16012 ill_t *ill; 16013 int res = 0; 16014 16015 ASSERT(phyip != NULL && name_ptr != NULL); 16016 16017 if (((phyint_t *)phyip)->phyint_illv4) 16018 ill = ((phyint_t *)phyip)->phyint_illv4; 16019 else 16020 ill = ((phyint_t *)phyip)->phyint_illv6; 16021 ASSERT(ill != NULL); 16022 16023 res = strcmp(ill->ill_name, (char *)name_ptr); 16024 if (res > 0) 16025 return (1); 16026 else if (res < 0) 16027 return (-1); 16028 return (0); 16029 } 16030 16031 /* 16032 * This function is called on the unplumb path via ill_glist_delete() when 16033 * there are no ills left on the phyint and thus the phyint can be freed. 16034 */ 16035 static void 16036 phyint_free(phyint_t *phyi) 16037 { 16038 ip_stack_t *ipst = PHYINT_TO_IPST(phyi); 16039 16040 ASSERT(phyi->phyint_illv4 == NULL && phyi->phyint_illv6 == NULL); 16041 16042 /* 16043 * If this phyint was an IPMP meta-interface, blow away the group. 16044 * This is safe to do because all of the illgrps have already been 16045 * removed by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find us. 16046 * If we're cleaning up as a result of failed initialization, 16047 * phyint_grp may be NULL. 16048 */ 16049 if ((phyi->phyint_flags & PHYI_IPMP) && (phyi->phyint_grp != NULL)) { 16050 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 16051 ipmp_grp_destroy(phyi->phyint_grp); 16052 phyi->phyint_grp = NULL; 16053 rw_exit(&ipst->ips_ipmp_lock); 16054 } 16055 16056 /* 16057 * If this interface was under IPMP, take it out of the group. 
16058 */ 16059 if (phyi->phyint_grp != NULL) 16060 ipmp_phyint_leave_grp(phyi); 16061 16062 /* 16063 * Delete the phyint and disassociate its ipsq. The ipsq itself 16064 * will be freed in ipsq_exit(). 16065 */ 16066 phyi->phyint_ipsq->ipsq_phyint = NULL; 16067 phyi->phyint_name[0] = '\0'; 16068 16069 mi_free(phyi); 16070 } 16071 16072 /* 16073 * Attach the ill to the phyint structure, which can be shared by both the 16074 * IPv4 and IPv6 ills. ill_init allocates a phyint just to hold flags. This 16075 * function is called from ipif_set_values and ill_lookup_on_name (for 16076 * loopback), where we know the name of the ill. We look up the ill and, 16077 * if one is already present with that name, use its phyint. Otherwise we 16078 * reuse the one allocated by ill_init. 16079 */ 16080 static void 16081 ill_phyint_reinit(ill_t *ill) 16082 { 16083 boolean_t isv6 = ill->ill_isv6; 16084 phyint_t *phyi_old; 16085 phyint_t *phyi; 16086 avl_index_t where = 0; 16087 ill_t *ill_other = NULL; 16088 ip_stack_t *ipst = ill->ill_ipst; 16089 16090 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 16091 16092 phyi_old = ill->ill_phyint; 16093 ASSERT(isv6 || (phyi_old->phyint_illv4 == ill && 16094 phyi_old->phyint_illv6 == NULL)); 16095 ASSERT(!isv6 || (phyi_old->phyint_illv6 == ill && 16096 phyi_old->phyint_illv4 == NULL)); 16097 ASSERT(phyi_old->phyint_ifindex == 0); 16098 16099 /* 16100 * Now that our ill has a name, set it in the phyint. 16101 */ 16102 (void) strlcpy(ill->ill_phyint->phyint_name, ill->ill_name, LIFNAMSIZ); 16103 16104 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 16105 ill->ill_name, &where); 16106 16107 /* 16108 * 1. We grabbed the ill_g_lock before inserting this ill into 16109 * the global list of ills. So no other thread could have located 16110 * this ill and hence the ipsq of this ill is guaranteed to be empty. 16111 * 2. Now locate the other protocol instance of this ill. 16112 * 3. Now grab both ill locks in the right order, and the phyint lock of 16113 * the new ipsq. Holding ill locks + ill_g_lock ensures that the ipsq 16114 * of neither ill can change. 16115 * 4. Merge the phyint (and thus the ipsq as well) of this ill onto the 16116 * other ill. 16117 * 5. Release all locks. 16118 */ 16119 16120 /* 16121 * Look for IPv4 if we are initializing IPv6 or look for IPv6 if 16122 * we are initializing IPv4. 16123 */ 16124 if (phyi != NULL) { 16125 ill_other = (isv6) ? phyi->phyint_illv4 : phyi->phyint_illv6; 16126 ASSERT(ill_other->ill_phyint != NULL); 16127 ASSERT((isv6 && !ill_other->ill_isv6) || 16128 (!isv6 && ill_other->ill_isv6)); 16129 GRAB_ILL_LOCKS(ill, ill_other); 16130 /* 16131 * We are potentially throwing away phyint_flags which 16132 * could differ from the ones that we obtain from 16133 * ill_other->ill_phyint. But that is okay as we are assuming 16134 * that the state maintained within IP is correct. 16135 */ 16136 mutex_enter(&phyi->phyint_lock); 16137 if (isv6) { 16138 ASSERT(phyi->phyint_illv6 == NULL); 16139 phyi->phyint_illv6 = ill; 16140 } else { 16141 ASSERT(phyi->phyint_illv4 == NULL); 16142 phyi->phyint_illv4 = ill; 16143 } 16144 16145 /* 16146 * Delete the old phyint and make its ipsq eligible 16147 * to be freed in ipsq_exit().
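 * (E.g., hypothetically: if bge0 was first plumbed for IPv4, plumbing it for IPv6 finds the existing phyint by name, attaches the v6 ill to it, and the phyint allocated by ill_init is discarded here.)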
16148 */ 16149 phyi_old->phyint_illv4 = NULL; 16150 phyi_old->phyint_illv6 = NULL; 16151 phyi_old->phyint_ipsq->ipsq_phyint = NULL; 16152 phyi_old->phyint_name[0] = '\0'; 16153 mi_free(phyi_old); 16154 } else { 16155 mutex_enter(&ill->ill_lock); 16156 /* 16157 * We don't need to acquire any lock, since 16158 * the ill is not yet visible globally and we 16159 * have not yet released the ill_g_lock. 16160 */ 16161 phyi = phyi_old; 16162 mutex_enter(&phyi->phyint_lock); 16163 /* XXX We need a recovery strategy here. */ 16164 if (!phyint_assign_ifindex(phyi, ipst)) 16165 cmn_err(CE_PANIC, "phyint_assign_ifindex() failed"); 16166 16167 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 16168 (void *)phyi, where); 16169 16170 (void) avl_find(&ipst->ips_phyint_g_list-> 16171 phyint_list_avl_by_index, 16172 &phyi->phyint_ifindex, &where); 16173 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 16174 (void *)phyi, where); 16175 } 16176 16177 /* 16178 * Reassigning ill_phyint automatically reassigns the ipsq also. 16179 * The pending mp is not affected because that is kept on a per-ill 16180 * basis. 16181 */ 16182 ill->ill_phyint = phyi; 16183 16184 /* 16185 * Now that the phyint's ifindex has been assigned, complete the 16186 * remaining MIB and multicast initialization that depends on it. 16187 */ 16188 ill->ill_ip_mib->ipIfStatsIfIndex = ill->ill_phyint->phyint_ifindex; 16189 if (ill->ill_isv6) { 16190 ill->ill_icmp6_mib->ipv6IfIcmpIfIndex = 16191 ill->ill_phyint->phyint_ifindex; 16192 ill->ill_mcast_type = ipst->ips_mld_max_version; 16193 } else { 16194 ill->ill_mcast_type = ipst->ips_igmp_max_version; 16195 } 16196 16197 /* 16198 * Generate an event within the hooks framework to indicate that 16199 * a new interface has just been added to IP. For this event to 16200 * be generated, the network interface must, at least, have an 16201 * ifindex assigned to it. (We don't generate the event for 16202 * loopback since ill_lookup_on_name() has its own NE_PLUMB event.) 16203 * 16204 * This needs to be run inside the ill_g_lock perimeter to ensure 16205 * that the ordering of delivered events to listeners matches the 16206 * order of them in the kernel. 16207 */ 16208 if (!IS_LOOPBACK(ill)) { 16209 ill_nic_event_dispatch(ill, 0, NE_PLUMB, ill->ill_name, 16210 ill->ill_name_length); 16211 } 16212 RELEASE_ILL_LOCKS(ill, ill_other); 16213 mutex_exit(&phyi->phyint_lock); 16214 } 16215 16216 /* 16217 * Notify any downstream modules of the name of this interface. 16218 * An M_IOCTL is used even though we don't expect a successful reply. 16219 * Any reply message from the driver (presumably an M_IOCNAK) will 16220 * eventually get discarded somewhere upstream. The message format is 16221 * simply an SIOCSLIFNAME ioctl just as might be sent from ifconfig 16222 * to IP.
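 * Sketch of the message built below: an M_IOCTL block (iocblk with ioc_cmd = SIOCSLIFNAME and ioc_count = sizeof (struct lifreq)) whose b_cont points at an M_DATA block holding the struct lifreq (lifr_name, lifr_ppa, lifr_flags).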
16222 */ 16223 static void 16224 ip_ifname_notify(ill_t *ill, queue_t *q) 16225 { 16226 mblk_t *mp1, *mp2; 16227 struct iocblk *iocp; 16228 struct lifreq *lifr; 16229 16230 mp1 = mkiocb(SIOCSLIFNAME); 16231 if (mp1 == NULL) 16232 return; 16233 mp2 = allocb(sizeof (struct lifreq), BPRI_HI); 16234 if (mp2 == NULL) { 16235 freeb(mp1); 16236 return; 16237 } 16238 16239 mp1->b_cont = mp2; 16240 iocp = (struct iocblk *)mp1->b_rptr; 16241 iocp->ioc_count = sizeof (struct lifreq); 16242 16243 lifr = (struct lifreq *)mp2->b_rptr; 16244 mp2->b_wptr += sizeof (struct lifreq); 16245 bzero(lifr, sizeof (struct lifreq)); 16246 16247 (void) strncpy(lifr->lifr_name, ill->ill_name, LIFNAMSIZ); 16248 lifr->lifr_ppa = ill->ill_ppa; 16249 lifr->lifr_flags = (ill->ill_flags & (ILLF_IPV4|ILLF_IPV6)); 16250 16251 DTRACE_PROBE3(ill__dlpi, char *, "ip_ifname_notify", 16252 char *, "SIOCSLIFNAME", ill_t *, ill); 16253 putnext(q, mp1); 16254 } 16255 16256 static int 16257 ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) 16258 { 16259 int err; 16260 ip_stack_t *ipst = ill->ill_ipst; 16261 phyint_t *phyi = ill->ill_phyint; 16262 16263 /* Set the obsolete NDD per-interface forwarding name. */ 16264 err = ill_set_ndd_name(ill); 16265 if (err != 0) { 16266 cmn_err(CE_WARN, "ipif_set_values: ill_set_ndd_name (%d)\n", 16267 err); 16268 } 16269 16270 /* 16271 * Now that ill_name is set, the configuration for the IPMP 16272 * meta-interface can be performed. 16273 */ 16274 if (IS_IPMP(ill)) { 16275 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 16276 /* 16277 * If phyi->phyint_grp is NULL, then this is the first IPMP 16278 * meta-interface and we need to create the IPMP group. 16279 */ 16280 if (phyi->phyint_grp == NULL) { 16281 /* 16282 * If someone has renamed another IPMP group to have 16283 * the same name as our interface, bail. 16284 */ 16285 if (ipmp_grp_lookup(ill->ill_name, ipst) != NULL) { 16286 rw_exit(&ipst->ips_ipmp_lock); 16287 return (EEXIST); 16288 } 16289 phyi->phyint_grp = ipmp_grp_create(ill->ill_name, phyi); 16290 if (phyi->phyint_grp == NULL) { 16291 rw_exit(&ipst->ips_ipmp_lock); 16292 return (ENOMEM); 16293 } 16294 } 16295 rw_exit(&ipst->ips_ipmp_lock); 16296 } 16297 16298 /* Tell downstream modules where they are. */ 16299 ip_ifname_notify(ill, q); 16300 16301 /* 16302 * ill_dl_phys returns EINPROGRESS in the usual case. 16303 * Error cases are ENOMEM ... 16304 */ 16305 err = ill_dl_phys(ill, ipif, mp, q); 16306 16307 if (ill->ill_isv6) { 16308 mutex_enter(&ipst->ips_mld_slowtimeout_lock); 16309 if (ipst->ips_mld_slowtimeout_id == 0) { 16310 ipst->ips_mld_slowtimeout_id = timeout(mld_slowtimo, 16311 (void *)ipst, 16312 MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL)); 16313 } 16314 mutex_exit(&ipst->ips_mld_slowtimeout_lock); 16315 } else { 16316 mutex_enter(&ipst->ips_igmp_slowtimeout_lock); 16317 if (ipst->ips_igmp_slowtimeout_id == 0) { 16318 ipst->ips_igmp_slowtimeout_id = timeout(igmp_slowtimo, 16319 (void *)ipst, 16320 MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL)); 16321 } 16322 mutex_exit(&ipst->ips_igmp_slowtimeout_lock); 16323 } 16324 16325 return (err); 16326 } 16327 16328 /* 16329 * Common routine for ppa and ifname setting. Should be called exclusively 16330 * (as writer). 16331 * 16332 * Returns EINPROGRESS when mp has been consumed by queueing it on 16333 * ipx_pending_mp and the ioctl will complete in ip_rput. 16334 * 16335 * NOTE: If ppa is UINT_MAX, we assign the next valid ppa and return 16336 * the new name and new ppa in lifr_name and lifr_ppa respectively.
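 * (Hypothetical example: a request naming "bge" with ppa UINT_MAX could come back as lifr_name "bge3" and lifr_ppa 3, if 3 is the next free ppa.)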
For SLIFNAME, we pass these values back to the userland. 16337 */ 16338 static int 16339 ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr) 16340 { 16341 ill_t *ill; 16342 ipif_t *ipif; 16343 ipsq_t *ipsq; 16344 char *ppa_ptr; 16345 char *old_ptr; 16346 char old_char; 16347 int error; 16348 ip_stack_t *ipst; 16349 16350 ip1dbg(("ipif_set_values: interface %s\n", interf_name)); 16351 ASSERT(q->q_next != NULL); 16352 ASSERT(interf_name != NULL); 16353 16354 ill = (ill_t *)q->q_ptr; 16355 ipst = ill->ill_ipst; 16356 16357 ASSERT(ill->ill_ipst != NULL); 16358 ASSERT(ill->ill_name[0] == '\0'); 16359 ASSERT(IAM_WRITER_ILL(ill)); 16360 ASSERT((mi_strlen(interf_name) + 1) <= LIFNAMSIZ); 16361 ASSERT(ill->ill_ppa == UINT_MAX); 16362 16363 ill->ill_defend_start = ill->ill_defend_count = 0; 16364 /* The ppa is sent down by ifconfig or is chosen */ 16365 if ((ppa_ptr = ill_get_ppa_ptr(interf_name)) == NULL) { 16366 return (EINVAL); 16367 } 16368 16369 /* 16370 * Make sure the ppa passed in is the same as the ppa in the name. 16371 * This check is not made when ppa == UINT_MAX; in that case the ppa 16372 * in the name could be anything. The system will choose a ppa and 16373 * update new_ppa_ptr and interf_name to contain the chosen ppa. 16374 */ 16375 if (*new_ppa_ptr != UINT_MAX) { 16376 /* stoi changes the pointer */ 16377 old_ptr = ppa_ptr; 16378 /* 16379 * ifconfig passed in 0 for the ppa for DLPI 1 style devices 16380 * (they don't have an externally visible ppa). We assign one 16381 * here so that we can manage the interface. Note that in 16382 * the past this value was always 0 for DLPI 1 drivers. 16383 */ 16384 if (*new_ppa_ptr == 0) 16385 *new_ppa_ptr = stoi(&old_ptr); 16386 else if (*new_ppa_ptr != (uint_t)stoi(&old_ptr)) 16387 return (EINVAL); 16388 } 16389 /* 16390 * Terminate the string before the ppa and 16391 * save the char at that location. 16392 */ 16393 old_char = ppa_ptr[0]; 16394 ppa_ptr[0] = '\0'; 16395 16396 ill->ill_ppa = *new_ppa_ptr; 16397 /* 16398 * Finish as much work now as possible before calling ill_glist_insert 16399 * which makes the ill globally visible and also merges it with the 16400 * other protocol instance of this phyint. The remaining work is 16401 * done after entering the ipsq which may happen sometime later. 16402 * ill_set_ndd_name occurs after the ill has been made globally visible. 16403 */ 16404 ipif = ill->ill_ipif; 16405 16406 /* We didn't do this when we allocated ipif in ip_ll_subnet_defaults */ 16407 ipif_assign_seqid(ipif); 16408 16409 if (!(ill->ill_flags & (ILLF_IPV4|ILLF_IPV6))) 16410 ill->ill_flags |= ILLF_IPV4; 16411 16412 ASSERT(ipif->ipif_next == NULL); /* Only one ipif on ill */ 16413 ASSERT((ipif->ipif_flags & IPIF_UP) == 0); 16414 16415 if (ill->ill_flags & ILLF_IPV6) { 16416 16417 ill->ill_isv6 = B_TRUE; 16418 ill_set_inputfn(ill); 16419 if (ill->ill_rq != NULL) { 16420 ill->ill_rq->q_qinfo = &iprinitv6; 16421 } 16422 16423 /* Keep the !IN6_IS_ADDR_V4MAPPED assertions happy */ 16424 ipif->ipif_v6lcl_addr = ipv6_all_zeros; 16425 ipif->ipif_v6subnet = ipv6_all_zeros; 16426 ipif->ipif_v6net_mask = ipv6_all_zeros; 16427 ipif->ipif_v6brd_addr = ipv6_all_zeros; 16428 ipif->ipif_v6pp_dst_addr = ipv6_all_zeros; 16429 ill->ill_reachable_retrans_time = ND_RETRANS_TIMER; 16430 /* 16431 * Point-to-point or non-multicast-capable 16432 * interfaces won't do NUD unless explicitly 16433 * configured to do so.
16434 */ 16435 if (ipif->ipif_flags & IPIF_POINTOPOINT || 16436 !(ill->ill_flags & ILLF_MULTICAST)) { 16437 ill->ill_flags |= ILLF_NONUD; 16438 } 16439 /* Make sure the IPv4-specific flag is not set on an IPv6 ill */ 16440 if (ill->ill_flags & ILLF_NOARP) { 16441 /* 16442 * Note: xresolv interfaces will eventually need 16443 * NOARP set here as well, but that will require 16444 * those external resolvers to have some 16445 * knowledge of that flag and act appropriately. 16446 * Not to be changed at present. 16447 */ 16448 ill->ill_flags &= ~ILLF_NOARP; 16449 } 16450 /* 16451 * Set the ILLF_ROUTER flag according to the global 16452 * IPv6 forwarding policy. 16453 */ 16454 if (ipst->ips_ipv6_forward != 0) 16455 ill->ill_flags |= ILLF_ROUTER; 16456 } else if (ill->ill_flags & ILLF_IPV4) { 16457 ill->ill_isv6 = B_FALSE; 16458 ill_set_inputfn(ill); 16459 ill->ill_reachable_retrans_time = ARP_RETRANS_TIMER; 16460 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6lcl_addr); 16461 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6subnet); 16462 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6net_mask); 16463 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6brd_addr); 16464 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6pp_dst_addr); 16465 /* 16466 * Set the ILLF_ROUTER flag according to the global 16467 * IPv4 forwarding policy. 16468 */ 16469 if (ipst->ips_ip_g_forward != 0) 16470 ill->ill_flags |= ILLF_ROUTER; 16471 } 16472 16473 ASSERT(ill->ill_phyint != NULL); 16474 16475 /* 16476 * The ipIfStatsIfIndex and ipv6IfIcmpIfIndex assignments will 16477 * be completed in ill_glist_insert -> ill_phyint_reinit 16478 */ 16479 if (!ill_allocate_mibs(ill)) 16480 return (ENOMEM); 16481 16482 /* 16483 * Pick a default sap until we get the DL_INFO_ACK back from 16484 * the driver. 16485 */ 16486 ill->ill_sap = (ill->ill_isv6) ? ill->ill_media->ip_m_ipv6sap : 16487 ill->ill_media->ip_m_ipv4sap; 16488 16489 ill->ill_ifname_pending = 1; 16490 ill->ill_ifname_pending_err = 0; 16491 16492 /* 16493 * When the first ipif comes up in ipif_up_done(), multicast groups 16494 * that were joined while this ill was not bound to the DLPI link need 16495 * to be recovered by ill_recover_multicast(). 16496 */ 16497 ill->ill_need_recover_multicast = 1; 16498 16499 ill_refhold(ill); 16500 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 16501 if ((error = ill_glist_insert(ill, interf_name, 16502 (ill->ill_flags & ILLF_IPV6) == ILLF_IPV6)) > 0) { 16503 ill->ill_ppa = UINT_MAX; 16504 ill->ill_name[0] = '\0'; 16505 /* 16506 * Undo the null termination done above. 16507 */ 16508 ppa_ptr[0] = old_char; 16509 rw_exit(&ipst->ips_ill_g_lock); 16510 ill_refrele(ill); 16511 return (error); 16512 } 16513 16514 ASSERT(ill->ill_name_length <= LIFNAMSIZ); 16515 16516 /* 16517 * When we return, the buffer pointed to by interf_name should contain 16518 * the same name as in ill_name. 16519 * If a ppa was chosen by the system (the ppa passed in was UINT_MAX), 16520 * the buffer pointed to by new_ppa_ptr would not contain the right ppa, 16521 * so copy the full name and update the ppa ptr. 16522 * When the ppa passed in != UINT_MAX, all values are correct; just undo 16523 * the null termination. This saves a bcopy. 16524 */ 16525 if (*new_ppa_ptr == UINT_MAX) { 16526 bcopy(ill->ill_name, interf_name, ill->ill_name_length); 16527 *new_ppa_ptr = ill->ill_ppa; 16528 } else { 16529 /* 16530 * Undo the null termination done above.
16531 */ 16532 ppa_ptr[0] = old_char; 16533 } 16534 16535 /* Let SCTP know about this ILL */ 16536 sctp_update_ill(ill, SCTP_ILL_INSERT); 16537 16538 /* 16539 * ill_glist_insert has made the ill visible globally, and 16540 * ill_phyint_reinit could have changed the ipsq. At this point, 16541 * we need to hold the ips_ill_g_lock across the call to enter the 16542 * ipsq to enforce atomicity and prevent reordering. In the event 16543 * the ipsq has changed, and if the new ipsq is currently busy, 16544 * we need to make sure that this half-completed ioctl is ahead of 16545 * any subsequent ioctl. We achieve this by not dropping the 16546 * ips_ill_g_lock which prevents any ill lookup itself thereby 16547 * ensuring that new ioctls can't start. 16548 */ 16549 ipsq = ipsq_try_enter_internal(ill, q, mp, ip_reprocess_ioctl, NEW_OP, 16550 B_TRUE); 16551 16552 rw_exit(&ipst->ips_ill_g_lock); 16553 ill_refrele(ill); 16554 if (ipsq == NULL) 16555 return (EINPROGRESS); 16556 16557 /* 16558 * If ill_phyint_reinit() changed our ipsq, then start on the new ipsq. 16559 */ 16560 if (ipsq->ipsq_xop->ipx_current_ipif == NULL) 16561 ipsq_current_start(ipsq, ipif, SIOCSLIFNAME); 16562 else 16563 ASSERT(ipsq->ipsq_xop->ipx_current_ipif == ipif); 16564 16565 error = ipif_set_values_tail(ill, ipif, mp, q); 16566 ipsq_exit(ipsq); 16567 if (error != 0 && error != EINPROGRESS) { 16568 /* 16569 * restore previous values 16570 */ 16571 ill->ill_isv6 = B_FALSE; 16572 ill_set_inputfn(ill); 16573 } 16574 return (error); 16575 } 16576 16577 void 16578 ipif_init(ip_stack_t *ipst) 16579 { 16580 int i; 16581 16582 for (i = 0; i < MAX_G_HEADS; i++) { 16583 ipst->ips_ill_g_heads[i].ill_g_list_head = 16584 (ill_if_t *)&ipst->ips_ill_g_heads[i]; 16585 ipst->ips_ill_g_heads[i].ill_g_list_tail = 16586 (ill_if_t *)&ipst->ips_ill_g_heads[i]; 16587 } 16588 16589 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 16590 ill_phyint_compare_index, 16591 sizeof (phyint_t), 16592 offsetof(struct phyint, phyint_avl_by_index)); 16593 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 16594 ill_phyint_compare_name, 16595 sizeof (phyint_t), 16596 offsetof(struct phyint, phyint_avl_by_name)); 16597 } 16598 16599 /* 16600 * Save enough information so that we can recreate the IRE if 16601 * the interface goes down and then up. 
16602 */ 16603 void 16604 ill_save_ire(ill_t *ill, ire_t *ire) 16605 { 16606 mblk_t *save_mp; 16607 16608 save_mp = allocb(sizeof (ifrt_t), BPRI_MED); 16609 if (save_mp != NULL) { 16610 ifrt_t *ifrt; 16611 16612 save_mp->b_wptr += sizeof (ifrt_t); 16613 ifrt = (ifrt_t *)save_mp->b_rptr; 16614 bzero(ifrt, sizeof (ifrt_t)); 16615 ifrt->ifrt_type = ire->ire_type; 16616 if (ire->ire_ipversion == IPV4_VERSION) { 16617 ASSERT(!ill->ill_isv6); 16618 ifrt->ifrt_addr = ire->ire_addr; 16619 ifrt->ifrt_gateway_addr = ire->ire_gateway_addr; 16620 ifrt->ifrt_setsrc_addr = ire->ire_setsrc_addr; 16621 ifrt->ifrt_mask = ire->ire_mask; 16622 } else { 16623 ASSERT(ill->ill_isv6); 16624 ifrt->ifrt_v6addr = ire->ire_addr_v6; 16625 /* ire_gateway_addr_v6 can change due to RTM_CHANGE */ 16626 mutex_enter(&ire->ire_lock); 16627 ifrt->ifrt_v6gateway_addr = ire->ire_gateway_addr_v6; 16628 mutex_exit(&ire->ire_lock); 16629 ifrt->ifrt_v6setsrc_addr = ire->ire_setsrc_addr_v6; 16630 ifrt->ifrt_v6mask = ire->ire_mask_v6; 16631 } 16632 ifrt->ifrt_flags = ire->ire_flags; 16633 ifrt->ifrt_zoneid = ire->ire_zoneid; 16634 mutex_enter(&ill->ill_saved_ire_lock); 16635 save_mp->b_cont = ill->ill_saved_ire_mp; 16636 ill->ill_saved_ire_mp = save_mp; 16637 ill->ill_saved_ire_cnt++; 16638 mutex_exit(&ill->ill_saved_ire_lock); 16639 } 16640 } 16641 16642 /* 16643 * Remove one entry from ill_saved_ire_mp. 16644 */ 16645 void 16646 ill_remove_saved_ire(ill_t *ill, ire_t *ire) 16647 { 16648 mblk_t **mpp; 16649 mblk_t *mp; 16650 ifrt_t *ifrt; 16651 16652 /* Remove from ill_saved_ire_mp list if it is there */ 16653 mutex_enter(&ill->ill_saved_ire_lock); 16654 for (mpp = &ill->ill_saved_ire_mp; *mpp != NULL; 16655 mpp = &(*mpp)->b_cont) { 16656 in6_addr_t gw_addr_v6; 16657 16658 /* 16659 * On a given ill, the tuple of address, gateway, mask, 16660 * ire_type, and zoneid is unique for each saved IRE. 16661 */ 16662 mp = *mpp; 16663 ifrt = (ifrt_t *)mp->b_rptr; 16664 /* ire_gateway_addr_v6 can change - need lock */ 16665 mutex_enter(&ire->ire_lock); 16666 gw_addr_v6 = ire->ire_gateway_addr_v6; 16667 mutex_exit(&ire->ire_lock); 16668 16669 if (ifrt->ifrt_zoneid != ire->ire_zoneid || 16670 ifrt->ifrt_type != ire->ire_type) 16671 continue; 16672 16673 if (ill->ill_isv6 ? 16674 (IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6addr, 16675 &ire->ire_addr_v6) && 16676 IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6gateway_addr, 16677 &gw_addr_v6) && 16678 IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6mask, 16679 &ire->ire_mask_v6)) : 16680 (ifrt->ifrt_addr == ire->ire_addr && 16681 ifrt->ifrt_gateway_addr == ire->ire_gateway_addr && 16682 ifrt->ifrt_mask == ire->ire_mask)) { 16683 *mpp = mp->b_cont; 16684 ill->ill_saved_ire_cnt--; 16685 freeb(mp); 16686 break; 16687 } 16688 } 16689 mutex_exit(&ill->ill_saved_ire_lock); 16690 } 16691 16692 /* 16693 * IP multirouting broadcast routes handling 16694 * Append CGTP broadcast IREs to regular ones created 16695 * at ifconfig time. 16696 * The usage is a route add <cgtp_bc> <nic_bc> -multirt i.e., both 16697 * the destination and the gateway are broadcast addresses. 16698 * The caller has verified that the destination is an IRE_BROADCAST and that 16699 * RTF_MULTIRT was set. Here if the gateway is a broadcast address, then 16700 * we create a MULTIRT IRE_BROADCAST. 16701 * Note that the IRE_HOST created by ire_rt_add doesn't get found by anything 16702 * since the IRE_BROADCAST takes precedence; ire_add_v4 does head insertion. 
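 * Hypothetical example: with redundant subnets 10.1.0.0/16 and 10.2.0.0/16, "route add 10.2.255.255 10.1.255.255 -multirt" lands here with ire_addr 10.2.255.255 and ire_gateway_addr 10.1.255.255, and a MULTIRT IRE_BROADCAST for 10.2.255.255 is added on the ill that already holds the 10.1.255.255 broadcast IRE.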
16703 */ 16704 static void 16705 ip_cgtp_bcast_add(ire_t *ire, ip_stack_t *ipst) 16706 { 16707 ire_t *ire_prim; 16708 16709 ASSERT(ire != NULL); 16710 16711 ire_prim = ire_ftable_lookup_v4(ire->ire_gateway_addr, 0, 0, 16712 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst, 16713 NULL); 16714 if (ire_prim != NULL) { 16715 /* 16716 * We are in the special case of broadcasts for 16717 * CGTP. We add an IRE_BROADCAST that holds 16718 * the RTF_MULTIRT flag, the destination 16719 * address and the low level 16720 * info of ire_prim. In other words, CGTP 16721 * broadcast is added to the redundant ipif. 16722 */ 16723 ill_t *ill_prim; 16724 ire_t *bcast_ire; 16725 16726 ill_prim = ire_prim->ire_ill; 16727 16728 ip2dbg(("ip_cgtp_filter_bcast_add: ire_prim %p, ill_prim %p\n", 16729 (void *)ire_prim, (void *)ill_prim)); 16730 16731 bcast_ire = ire_create( 16732 (uchar_t *)&ire->ire_addr, 16733 (uchar_t *)&ip_g_all_ones, 16734 (uchar_t *)&ire->ire_gateway_addr, 16735 IRE_BROADCAST, 16736 ill_prim, 16737 GLOBAL_ZONEID, /* CGTP is only for the global zone */ 16738 ire->ire_flags | RTF_KERNEL, 16739 NULL, 16740 ipst); 16741 16742 /* 16743 * Here we assume that ire_add does head insertion so that 16744 * the added IRE_BROADCAST comes before the existing IRE_HOST. 16745 */ 16746 if (bcast_ire != NULL) { 16747 if (ire->ire_flags & RTF_SETSRC) { 16748 bcast_ire->ire_setsrc_addr = 16749 ire->ire_setsrc_addr; 16750 } 16751 bcast_ire = ire_add(bcast_ire); 16752 if (bcast_ire != NULL) { 16753 ip2dbg(("ip_cgtp_filter_bcast_add: " 16754 "added bcast_ire %p\n", 16755 (void *)bcast_ire)); 16756 16757 ill_save_ire(ill_prim, bcast_ire); 16758 ire_refrele(bcast_ire); 16759 } 16760 } 16761 ire_refrele(ire_prim); 16762 } 16763 } 16764 16765 /* 16766 * IP multirouting broadcast routes handling 16767 * Remove the broadcast ire. 16768 * The usage is a route delete <cgtp_bc> <nic_bc> -multirt i.e., both 16769 * the destination and the gateway are broadcast addresses. 16770 * The caller has only verified that RTF_MULTIRT was set. We check 16771 * that the destination is broadcast and that the gateway is a broadcast 16772 * address, and if so delete the IRE added by ip_cgtp_bcast_add(). 16773 */ 16774 static void 16775 ip_cgtp_bcast_delete(ire_t *ire, ip_stack_t *ipst) 16776 { 16777 ASSERT(ire != NULL); 16778 16779 if (ip_type_v4(ire->ire_addr, ipst) == IRE_BROADCAST) { 16780 ire_t *ire_prim; 16781 16782 ire_prim = ire_ftable_lookup_v4(ire->ire_gateway_addr, 0, 0, 16783 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, 16784 ipst, NULL); 16785 if (ire_prim != NULL) { 16786 ill_t *ill_prim; 16787 ire_t *bcast_ire; 16788 16789 ill_prim = ire_prim->ire_ill; 16790 16791 ip2dbg(("ip_cgtp_filter_bcast_delete: " 16792 "ire_prim %p, ill_prim %p\n", 16793 (void *)ire_prim, (void *)ill_prim)); 16794 16795 bcast_ire = ire_ftable_lookup_v4(ire->ire_addr, 0, 16796 ire->ire_gateway_addr, IRE_BROADCAST, 16797 ill_prim, ALL_ZONES, NULL, 16798 MATCH_IRE_TYPE | MATCH_IRE_GW | MATCH_IRE_ILL | 16799 MATCH_IRE_MASK, 0, ipst, NULL); 16800 16801 if (bcast_ire != NULL) { 16802 ip2dbg(("ip_cgtp_filter_bcast_delete: " 16803 "looked up bcast_ire %p\n", 16804 (void *)bcast_ire)); 16805 ill_remove_saved_ire(bcast_ire->ire_ill, 16806 bcast_ire); 16807 ire_delete(bcast_ire); 16808 ire_refrele(bcast_ire); 16809 } 16810 ire_refrele(ire_prim); 16811 } 16812 } 16813 } 16814 16815 /* 16816 * Derive an interface id from the link layer address. 16817 * Knows about IEEE 802 and IEEE EUI-64 mappings. 
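 *
 * For example (MAC address is illustrative only), the Ethernet address
 * 00:14:4f:12:34:56 maps to the modified EUI-64 interface id
 * 02:14:4f:ff:fe:12:34:56: the bytes ff:fe are inserted between the
 * upper and lower three bytes, and the universal/local bit of the
 * first octet is toggled.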
16818 */ 16819 static void 16820 ip_ether_v6intfid(ill_t *ill, in6_addr_t *v6addr) 16821 { 16822 char *addr; 16823 16824 /* 16825 * Note that some IPv6 interfaces get plumbed over links that claim to 16826 * be DL_ETHER, but don't actually have Ethernet MAC addresses (e.g. 16827 * PPP links). The ETHERADDRL check here ensures that we only set the 16828 * interface ID on IPv6 interfaces above links that actually have real 16829 * Ethernet addresses. 16830 */ 16831 if (ill->ill_phys_addr_length == ETHERADDRL) { 16832 /* Form EUI-64 like address */ 16833 addr = (char *)&v6addr->s6_addr32[2]; 16834 bcopy(ill->ill_phys_addr, addr, 3); 16835 addr[0] ^= 0x2; /* Toggle Universal/Local bit */ 16836 addr[3] = (char)0xff; 16837 addr[4] = (char)0xfe; 16838 bcopy(ill->ill_phys_addr + 3, addr + 5, 3); 16839 } 16840 } 16841 16842 /* ARGSUSED */ 16843 static void 16844 ip_nodef_v6intfid(ill_t *ill, in6_addr_t *v6addr) 16845 { 16846 } 16847 16848 typedef struct ipmp_ifcookie { 16849 uint32_t ic_hostid; 16850 char ic_ifname[LIFNAMSIZ]; 16851 char ic_zonename[ZONENAME_MAX]; 16852 } ipmp_ifcookie_t; 16853 16854 /* 16855 * Construct a pseudo-random interface ID for the IPMP interface that's both 16856 * predictable and (almost) guaranteed to be unique. 16857 */ 16858 static void 16859 ip_ipmp_v6intfid(ill_t *ill, in6_addr_t *v6addr) 16860 { 16861 zone_t *zp; 16862 uint8_t *addr; 16863 uchar_t hash[16]; 16864 ulong_t hostid; 16865 MD5_CTX ctx; 16866 ipmp_ifcookie_t ic = { 0 }; 16867 16868 ASSERT(IS_IPMP(ill)); 16869 16870 (void) ddi_strtoul(hw_serial, NULL, 10, &hostid); 16871 ic.ic_hostid = htonl((uint32_t)hostid); 16872 16873 (void) strlcpy(ic.ic_ifname, ill->ill_name, LIFNAMSIZ); 16874 16875 if ((zp = zone_find_by_id(ill->ill_zoneid)) != NULL) { 16876 (void) strlcpy(ic.ic_zonename, zp->zone_name, ZONENAME_MAX); 16877 zone_rele(zp); 16878 } 16879 16880 MD5Init(&ctx); 16881 MD5Update(&ctx, &ic, sizeof (ic)); 16882 MD5Final(hash, &ctx); 16883 16884 /* 16885 * Map the hash to an interface ID per the basic approach in RFC3041. 16886 */ 16887 addr = &v6addr->s6_addr8[8]; 16888 bcopy(hash + 8, addr, sizeof (uint64_t)); 16889 addr[0] &= ~0x2; /* set local bit */ 16890 } 16891 16892 /* 16893 * Map the multicast in6_addr_t in m_ip6addr to the physaddr for ethernet. 16894 */ 16895 static void 16896 ip_ether_v6_mapping(ill_t *ill, uchar_t *m_ip6addr, uchar_t *m_physaddr) 16897 { 16898 phyint_t *phyi = ill->ill_phyint; 16899 16900 /* 16901 * Check PHYI_MULTI_BCAST and length of physical 16902 * address to determine if we use the mapping or the 16903 * broadcast address. 16904 */ 16905 if ((phyi->phyint_flags & PHYI_MULTI_BCAST) != 0 || 16906 ill->ill_phys_addr_length != ETHERADDRL) { 16907 ip_mbcast_mapping(ill, m_ip6addr, m_physaddr); 16908 return; 16909 } 16910 m_physaddr[0] = 0x33; 16911 m_physaddr[1] = 0x33; 16912 m_physaddr[2] = m_ip6addr[12]; 16913 m_physaddr[3] = m_ip6addr[13]; 16914 m_physaddr[4] = m_ip6addr[14]; 16915 m_physaddr[5] = m_ip6addr[15]; 16916 } 16917 16918 /* 16919 * Map the multicast ipaddr_t in m_ipaddr to the physaddr for ethernet. 16920 */ 16921 static void 16922 ip_ether_v4_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr) 16923 { 16924 phyint_t *phyi = ill->ill_phyint; 16925 16926 /* 16927 * Check PHYI_MULTI_BCAST and length of physical 16928 * address to determine if we use the mapping or the 16929 * broadcast address. 
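	 *
	 * For example (group address is illustrative only), the RFC 1112
	 * mapping below turns 224.10.8.5 into 01:00:5e:0a:08:05: the low
	 * 23 bits of the group address are placed in the low 23 bits of
	 * the MAC, under the fixed 01:00:5e prefix.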
16930 */ 16931 if ((phyi->phyint_flags & PHYI_MULTI_BCAST) != 0 || 16932 ill->ill_phys_addr_length != ETHERADDRL) { 16933 ip_mbcast_mapping(ill, m_ipaddr, m_physaddr); 16934 return; 16935 } 16936 m_physaddr[0] = 0x01; 16937 m_physaddr[1] = 0x00; 16938 m_physaddr[2] = 0x5e; 16939 m_physaddr[3] = m_ipaddr[1] & 0x7f; 16940 m_physaddr[4] = m_ipaddr[2]; 16941 m_physaddr[5] = m_ipaddr[3]; 16942 } 16943 16944 /* ARGSUSED */ 16945 static void 16946 ip_mbcast_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr) 16947 { 16948 /* 16949 * for the MULTI_BCAST case and other cases when we want to 16950 * use the link-layer broadcast address for multicast. 16951 */ 16952 uint8_t *bphys_addr; 16953 dl_unitdata_req_t *dlur; 16954 16955 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr; 16956 if (ill->ill_sap_length < 0) { 16957 bphys_addr = (uchar_t *)dlur + 16958 dlur->dl_dest_addr_offset; 16959 } else { 16960 bphys_addr = (uchar_t *)dlur + 16961 dlur->dl_dest_addr_offset + ill->ill_sap_length; 16962 } 16963 16964 bcopy(bphys_addr, m_physaddr, ill->ill_phys_addr_length); 16965 } 16966 16967 /* 16968 * Derive IPoIB interface id from the link layer address. 16969 */ 16970 static void 16971 ip_ib_v6intfid(ill_t *ill, in6_addr_t *v6addr) 16972 { 16973 char *addr; 16974 16975 ASSERT(ill->ill_phys_addr_length == 20); 16976 addr = (char *)&v6addr->s6_addr32[2]; 16977 bcopy(ill->ill_phys_addr + 12, addr, 8); 16978 /* 16979 * In IBA 1.1 timeframe, some vendors erroneously set the u/l bit 16980 * in the globally assigned EUI-64 GUID to 1, in violation of IEEE 16981 * rules. In these cases, the IBA considers these GUIDs to be in 16982 * "Modified EUI-64" format, and thus toggling the u/l bit is not 16983 * required; vendors are required not to assign global EUI-64's 16984 * that differ only in u/l bit values, thus guaranteeing uniqueness 16985 * of the interface identifier. Whether the GUID is in modified 16986 * or proper EUI-64 format, the ipv6 identifier must have the u/l 16987 * bit set to 1. 16988 */ 16989 addr[0] |= 2; /* Set Universal/Local bit to 1 */ 16990 } 16991 16992 /* 16993 * Map the multicast ipaddr_t in m_ipaddr to the physaddr for InfiniBand. 16994 * Note on mapping from multicast IP addresses to IPoIB multicast link 16995 * addresses. IPoIB multicast link addresses are based on IBA link addresses. 16996 * The format of an IPoIB multicast address is: 16997 * 16998 * 4 byte QPN Scope Sign. Pkey 16999 * +--------------------------------------------+ 17000 * | 00FFFFFF | FF | 1X | X01B | Pkey | GroupID | 17001 * +--------------------------------------------+ 17002 * 17003 * The Scope and Pkey components are properties of the IBA port and 17004 * network interface. They can be ascertained from the broadcast address. 17005 * The Sign. part is the signature, and is 401B for IPv4 and 601B for IPv6. 17006 */ 17007 static void 17008 ip_ib_v4_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr) 17009 { 17010 static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff, 17011 0xff, 0x10, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, 17012 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; 17013 uint8_t *bphys_addr; 17014 dl_unitdata_req_t *dlur; 17015 17016 bcopy(ipv4_g_phys_ibmulti_addr, m_physaddr, ill->ill_phys_addr_length); 17017 17018 /* 17019 * RFC 4391: IPv4 MGID is 28-bit long. 
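	 *
	 * For example (group address is illustrative only), for 224.10.8.5
	 * (0xe00a0805) the low 28 bits are 0x00a0805, so the four bytes
	 * stored below are 00:0a:08:05.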
	 */
	m_physaddr[16] = m_ipaddr[0] & 0x0f;
	m_physaddr[17] = m_ipaddr[1];
	m_physaddr[18] = m_ipaddr[2];
	m_physaddr[19] = m_ipaddr[3];

	dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
	if (ill->ill_sap_length < 0) {
		bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset;
	} else {
		bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset +
		    ill->ill_sap_length;
	}
	/*
	 * Now fill in the IBA scope/Pkey values from the broadcast address.
	 */
	m_physaddr[5] = bphys_addr[5];
	m_physaddr[8] = bphys_addr[8];
	m_physaddr[9] = bphys_addr[9];
}

static void
ip_ib_v6_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr)
{
	static uint8_t ipv6_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff,
	    0xff, 0x10, 0x60, 0x1b, 0x00, 0x00, 0x00, 0x00,
	    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
	uint8_t *bphys_addr;
	dl_unitdata_req_t *dlur;

	bcopy(ipv6_g_phys_ibmulti_addr, m_physaddr, ill->ill_phys_addr_length);

	/*
	 * RFC 4391: IPv6 MGID is 80-bit long.
	 */
	bcopy(&m_ipaddr[6], &m_physaddr[10], 10);

	dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
	if (ill->ill_sap_length < 0) {
		bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset;
	} else {
		bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset +
		    ill->ill_sap_length;
	}
	/*
	 * Now fill in the IBA scope/Pkey values from the broadcast address.
	 */
	m_physaddr[5] = bphys_addr[5];
	m_physaddr[8] = bphys_addr[8];
	m_physaddr[9] = bphys_addr[9];
}

/*
 * Derive IPv6 interface id from an IPv4 link-layer address (e.g. from an IPv4
 * tunnel). The IPv4 address simply gets placed in the lower 4 bytes of the
 * IPv6 interface id. This is a suggested mechanism described in section 3.7
 * of RFC4213.
 */
static void
ip_ipv4_genv6intfid(ill_t *ill, uint8_t *physaddr, in6_addr_t *v6addr)
{
	ASSERT(ill->ill_phys_addr_length == sizeof (ipaddr_t));
	v6addr->s6_addr32[2] = 0;
	bcopy(physaddr, &v6addr->s6_addr32[3], sizeof (ipaddr_t));
}

/*
 * Derive IPv6 interface id from an IPv6 link-layer address (e.g. from an IPv6
 * tunnel). The lower 8 bytes of the IPv6 address simply become the interface
 * id.
 */
static void
ip_ipv6_genv6intfid(ill_t *ill, uint8_t *physaddr, in6_addr_t *v6addr)
{
	in6_addr_t *v6lladdr = (in6_addr_t *)physaddr;

	ASSERT(ill->ill_phys_addr_length == sizeof (in6_addr_t));
	bcopy(&v6lladdr->s6_addr32[2], &v6addr->s6_addr32[2], 8);
}

static void
ip_ipv6_v6intfid(ill_t *ill, in6_addr_t *v6addr)
{
	ip_ipv6_genv6intfid(ill, ill->ill_phys_addr, v6addr);
}

static void
ip_ipv6_v6destintfid(ill_t *ill, in6_addr_t *v6addr)
{
	ip_ipv6_genv6intfid(ill, ill->ill_dest_addr, v6addr);
}

static void
ip_ipv4_v6intfid(ill_t *ill, in6_addr_t *v6addr)
{
	ip_ipv4_genv6intfid(ill, ill->ill_phys_addr, v6addr);
}

static void
ip_ipv4_v6destintfid(ill_t *ill, in6_addr_t *v6addr)
{
	ip_ipv4_genv6intfid(ill, ill->ill_dest_addr, v6addr);
}

/*
 * Lookup an ill and verify that the zoneid has an ipif on that ill.
 * Returns a held ill, or NULL.
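 *
 * Caller sketch (illustrative only):
 *
 *	ill = ill_lookup_on_ifindex_zoneid(ifindex, zoneid, isv6, ipst);
 *	if (ill != NULL) {
 *		... use the ill ...
 *		ill_refrele(ill);
 *	}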
 */
ill_t *
ill_lookup_on_ifindex_zoneid(uint_t index, zoneid_t zoneid, boolean_t isv6,
    ip_stack_t *ipst)
{
	ill_t	*ill;
	ipif_t	*ipif;

	ill = ill_lookup_on_ifindex(index, isv6, ipst);
	if (ill == NULL)
		return (NULL);

	mutex_enter(&ill->ill_lock);
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		if (IPIF_IS_CONDEMNED(ipif))
			continue;
		if (zoneid != ALL_ZONES && ipif->ipif_zoneid != zoneid &&
		    ipif->ipif_zoneid != ALL_ZONES)
			continue;

		mutex_exit(&ill->ill_lock);
		return (ill);
	}
	mutex_exit(&ill->ill_lock);
	ill_refrele(ill);
	return (NULL);
}

/*
 * Return a pointer to an ipif_t given a combination of (ill_idx, ipif_id).
 * If a pointer to an ipif_t is returned then the caller will need to do
 * an ipif_refrele().
 */
ipif_t *
ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6,
    ip_stack_t *ipst)
{
	ipif_t	*ipif;
	ill_t	*ill;

	ill = ill_lookup_on_ifindex(ifindex, isv6, ipst);
	if (ill == NULL)
		return (NULL);

	mutex_enter(&ill->ill_lock);
	if (ill->ill_state_flags & ILL_CONDEMNED) {
		mutex_exit(&ill->ill_lock);
		ill_refrele(ill);
		return (NULL);
	}

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		if (!IPIF_CAN_LOOKUP(ipif))
			continue;
		if (lifidx == ipif->ipif_id) {
			ipif_refhold_locked(ipif);
			break;
		}
	}

	mutex_exit(&ill->ill_lock);
	ill_refrele(ill);
	return (ipif);
}

/*
 * Set ill_inputfn based on the current known state.
 * This needs to be called when any of the factors taken into
 * account changes.
 */
void
ill_set_inputfn(ill_t *ill)
{
	ip_stack_t	*ipst = ill->ill_ipst;

	if (ill->ill_isv6) {
		if (is_system_labeled())
			ill->ill_inputfn = ill_input_full_v6;
		else
			ill->ill_inputfn = ill_input_short_v6;
	} else {
		if (is_system_labeled())
			ill->ill_inputfn = ill_input_full_v4;
		else if (ill->ill_dhcpinit != 0)
			ill->ill_inputfn = ill_input_full_v4;
		else if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head
		    != NULL)
			ill->ill_inputfn = ill_input_full_v4;
		else if (ipst->ips_ip_cgtp_filter &&
		    ipst->ips_ip_cgtp_filter_ops != NULL)
			ill->ill_inputfn = ill_input_full_v4;
		else
			ill->ill_inputfn = ill_input_short_v4;
	}
}

/*
 * Re-evaluate ill_inputfn for all the IPv4 ills.
 * Used when RSVP and CGTP come and go.
 */
void
ill_set_inputfn_all(ip_stack_t *ipst)
{
	ill_walk_context_t ctx;
	ill_t *ill;

	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V4(&ctx, ipst);
	for (; ill != NULL; ill = ill_next(&ctx, ill))
		ill_set_inputfn(ill);

	rw_exit(&ipst->ips_ill_g_lock);
}

/*
 * Set the physical address information for `ill' to the contents of the
 * dl_notify_ind_t pointed to by `mp'.  Must be called as writer, and will be
 * asynchronous if `ill' cannot immediately be quiesced -- in which case
 * EINPROGRESS will be returned.
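 *
 * Caller sketch (illustrative only): EINPROGRESS is not a failure; the
 * update completes later via ipif_ill_refrele_tail(), e.g.
 *
 *	err = ill_set_phys_addr(ill, mp);
 *	if (err == EINPROGRESS)
 *		return;		(tail runs once the ill quiesces)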
17247 */ 17248 int 17249 ill_set_phys_addr(ill_t *ill, mblk_t *mp) 17250 { 17251 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 17252 dl_notify_ind_t *dlindp = (dl_notify_ind_t *)mp->b_rptr; 17253 17254 ASSERT(IAM_WRITER_IPSQ(ipsq)); 17255 17256 if (dlindp->dl_data != DL_IPV6_LINK_LAYER_ADDR && 17257 dlindp->dl_data != DL_CURR_DEST_ADDR && 17258 dlindp->dl_data != DL_CURR_PHYS_ADDR) { 17259 /* Changing DL_IPV6_TOKEN is not yet supported */ 17260 return (0); 17261 } 17262 17263 /* 17264 * We need to store up to two copies of `mp' in `ill'. Due to the 17265 * design of ipsq_pending_mp_add(), we can't pass them as separate 17266 * arguments to ill_set_phys_addr_tail(). Instead, chain them 17267 * together here, then pull 'em apart in ill_set_phys_addr_tail(). 17268 */ 17269 if ((mp = copyb(mp)) == NULL || (mp->b_cont = copyb(mp)) == NULL) { 17270 freemsg(mp); 17271 return (ENOMEM); 17272 } 17273 17274 ipsq_current_start(ipsq, ill->ill_ipif, 0); 17275 mutex_enter(&ill->ill_lock); 17276 ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS; 17277 /* no more nce addition allowed */ 17278 mutex_exit(&ill->ill_lock); 17279 17280 /* 17281 * If we can quiesce the ill, then set the address. If not, then 17282 * ill_set_phys_addr_tail() will be called from ipif_ill_refrele_tail(). 17283 */ 17284 ill_down_ipifs(ill, B_TRUE); 17285 mutex_enter(&ill->ill_lock); 17286 if (!ill_is_quiescent(ill)) { 17287 /* call cannot fail since `conn_t *' argument is NULL */ 17288 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq, 17289 mp, ILL_DOWN); 17290 mutex_exit(&ill->ill_lock); 17291 return (EINPROGRESS); 17292 } 17293 mutex_exit(&ill->ill_lock); 17294 17295 ill_set_phys_addr_tail(ipsq, ill->ill_rq, mp, NULL); 17296 return (0); 17297 } 17298 17299 /* 17300 * Once the ill associated with `q' has quiesced, set its physical address 17301 * information to the values in `addrmp'. Note that two copies of `addrmp' 17302 * are passed (linked by b_cont), since we sometimes need to save two distinct 17303 * copies in the ill_t, and our context doesn't permit sleeping or allocation 17304 * failure (we'll free the other copy if it's not needed). Since the ill_t 17305 * is quiesced, we know any stale nce's with the old address information have 17306 * already been removed, so we don't need to call nce_flush(). 
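 *
 * Sketch of the convention (from the code below): the caller links the
 * two copies as addrmp->b_cont; ill_set_phys_addr_tail() pulls them
 * apart with unlinkb(addrmp) and frees the second copy (addrmp2) in the
 * cases that only need one, e.g. DL_CURR_DEST_ADDR.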
17307 */ 17308 /* ARGSUSED */ 17309 static void 17310 ill_set_phys_addr_tail(ipsq_t *ipsq, queue_t *q, mblk_t *addrmp, void *dummy) 17311 { 17312 ill_t *ill = q->q_ptr; 17313 mblk_t *addrmp2 = unlinkb(addrmp); 17314 dl_notify_ind_t *dlindp = (dl_notify_ind_t *)addrmp->b_rptr; 17315 uint_t addrlen, addroff; 17316 int status; 17317 17318 ASSERT(IAM_WRITER_IPSQ(ipsq)); 17319 17320 addroff = dlindp->dl_addr_offset; 17321 addrlen = dlindp->dl_addr_length - ABS(ill->ill_sap_length); 17322 17323 switch (dlindp->dl_data) { 17324 case DL_IPV6_LINK_LAYER_ADDR: 17325 ill_set_ndmp(ill, addrmp, addroff, addrlen); 17326 freemsg(addrmp2); 17327 break; 17328 17329 case DL_CURR_DEST_ADDR: 17330 freemsg(ill->ill_dest_addr_mp); 17331 ill->ill_dest_addr = addrmp->b_rptr + addroff; 17332 ill->ill_dest_addr_mp = addrmp; 17333 if (ill->ill_isv6) { 17334 ill_setdesttoken(ill); 17335 ipif_setdestlinklocal(ill->ill_ipif); 17336 } 17337 freemsg(addrmp2); 17338 break; 17339 17340 case DL_CURR_PHYS_ADDR: 17341 freemsg(ill->ill_phys_addr_mp); 17342 ill->ill_phys_addr = addrmp->b_rptr + addroff; 17343 ill->ill_phys_addr_mp = addrmp; 17344 ill->ill_phys_addr_length = addrlen; 17345 if (ill->ill_isv6) 17346 ill_set_ndmp(ill, addrmp2, addroff, addrlen); 17347 else 17348 freemsg(addrmp2); 17349 if (ill->ill_isv6) { 17350 ill_setdefaulttoken(ill); 17351 ipif_setlinklocal(ill->ill_ipif); 17352 } 17353 break; 17354 default: 17355 ASSERT(0); 17356 } 17357 17358 /* 17359 * If there are ipifs to bring up, ill_up_ipifs() will return 17360 * EINPROGRESS, and ipsq_current_finish() will be called by 17361 * ip_rput_dlpi_writer() or arp_bringup_done() when the last ipif is 17362 * brought up. 17363 */ 17364 status = ill_up_ipifs(ill, q, addrmp); 17365 mutex_enter(&ill->ill_lock); 17366 if (ill->ill_dl_up) 17367 ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS; 17368 mutex_exit(&ill->ill_lock); 17369 if (status != EINPROGRESS) 17370 ipsq_current_finish(ipsq); 17371 } 17372 17373 /* 17374 * Helper routine for setting the ill_nd_lla fields. 17375 */ 17376 void 17377 ill_set_ndmp(ill_t *ill, mblk_t *ndmp, uint_t addroff, uint_t addrlen) 17378 { 17379 freemsg(ill->ill_nd_lla_mp); 17380 ill->ill_nd_lla = ndmp->b_rptr + addroff; 17381 ill->ill_nd_lla_mp = ndmp; 17382 ill->ill_nd_lla_len = addrlen; 17383 } 17384 17385 /* 17386 * Replumb the ill. 17387 */ 17388 int 17389 ill_replumb(ill_t *ill, mblk_t *mp) 17390 { 17391 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 17392 17393 ASSERT(IAM_WRITER_IPSQ(ipsq)); 17394 17395 ipsq_current_start(ipsq, ill->ill_ipif, 0); 17396 17397 mutex_enter(&ill->ill_lock); 17398 ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS; 17399 /* no more nce addition allowed */ 17400 mutex_exit(&ill->ill_lock); 17401 17402 /* 17403 * If we can quiesce the ill, then continue. If not, then 17404 * ill_replumb_tail() will be called from ipif_ill_refrele_tail(). 
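	 *
	 * Caller sketch (illustrative only):
	 *
	 *	if (ill_replumb(ill, mp) == EINPROGRESS)
	 *		return;		(ill_replumb_tail() runs on quiesce)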
17405 */ 17406 ill_down_ipifs(ill, B_FALSE); 17407 17408 mutex_enter(&ill->ill_lock); 17409 if (!ill_is_quiescent(ill)) { 17410 /* call cannot fail since `conn_t *' argument is NULL */ 17411 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq, 17412 mp, ILL_DOWN); 17413 mutex_exit(&ill->ill_lock); 17414 return (EINPROGRESS); 17415 } 17416 mutex_exit(&ill->ill_lock); 17417 17418 ill_replumb_tail(ipsq, ill->ill_rq, mp, NULL); 17419 return (0); 17420 } 17421 17422 /* ARGSUSED */ 17423 static void 17424 ill_replumb_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy) 17425 { 17426 ill_t *ill = q->q_ptr; 17427 int err; 17428 conn_t *connp = NULL; 17429 17430 ASSERT(IAM_WRITER_IPSQ(ipsq)); 17431 freemsg(ill->ill_replumb_mp); 17432 ill->ill_replumb_mp = copyb(mp); 17433 17434 if (ill->ill_replumb_mp == NULL) { 17435 /* out of memory */ 17436 ipsq_current_finish(ipsq); 17437 return; 17438 } 17439 17440 mutex_enter(&ill->ill_lock); 17441 ill->ill_up_ipifs = ipsq_pending_mp_add(NULL, ill->ill_ipif, 17442 ill->ill_rq, ill->ill_replumb_mp, 0); 17443 mutex_exit(&ill->ill_lock); 17444 17445 if (!ill->ill_up_ipifs) { 17446 /* already closing */ 17447 ipsq_current_finish(ipsq); 17448 return; 17449 } 17450 ill->ill_replumbing = 1; 17451 err = ill_down_ipifs_tail(ill); 17452 17453 /* 17454 * Successfully quiesced and brought down the interface, now we send 17455 * the DL_NOTE_REPLUMB_DONE message down to the driver. Reuse the 17456 * DL_NOTE_REPLUMB message. 17457 */ 17458 mp = mexchange(NULL, mp, sizeof (dl_notify_conf_t), M_PROTO, 17459 DL_NOTIFY_CONF); 17460 ASSERT(mp != NULL); 17461 ((dl_notify_conf_t *)mp->b_rptr)->dl_notification = 17462 DL_NOTE_REPLUMB_DONE; 17463 ill_dlpi_send(ill, mp); 17464 17465 /* 17466 * For IPv4, we would usually get EINPROGRESS because the ETHERTYPE_ARP 17467 * streams have to be unbound. When all the DLPI exchanges are done, 17468 * ipsq_current_finish() will be called by arp_bringup_done(). The 17469 * remainder of ipif bringup via ill_up_ipifs() will also be done in 17470 * arp_bringup_done(). 17471 */ 17472 ASSERT(ill->ill_replumb_mp != NULL); 17473 if (err == EINPROGRESS) 17474 return; 17475 else 17476 ill->ill_replumb_mp = ipsq_pending_mp_get(ipsq, &connp); 17477 ASSERT(connp == NULL); 17478 if (err == 0 && ill->ill_replumb_mp != NULL && 17479 ill_up_ipifs(ill, q, ill->ill_replumb_mp) == EINPROGRESS) { 17480 return; 17481 } 17482 ipsq_current_finish(ipsq); 17483 } 17484 17485 /* 17486 * Issue ioctl `cmd' on `lh'; caller provides the initial payload in `buf' 17487 * which is `bufsize' bytes. On success, zero is returned and `buf' updated 17488 * as per the ioctl. On failure, an errno is returned. 17489 */ 17490 static int 17491 ip_ioctl(ldi_handle_t lh, int cmd, void *buf, uint_t bufsize, cred_t *cr) 17492 { 17493 int rval; 17494 struct strioctl iocb; 17495 17496 iocb.ic_cmd = cmd; 17497 iocb.ic_timout = 15; 17498 iocb.ic_len = bufsize; 17499 iocb.ic_dp = buf; 17500 17501 return (ldi_ioctl(lh, I_STR, (intptr_t)&iocb, FKIOCTL, cr, &rval)); 17502 } 17503 17504 /* 17505 * Issue an SIOCGLIFCONF for address family `af' and store the result into a 17506 * dynamically-allocated `lifcp' that will be `bufsizep' bytes on success. 
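 *
 * Caller sketch (illustrative only); note the caller frees the buffer:
 *
 *	if (ip_lifconf_ioctl(lh, AF_INET, &lifc, &bufsize, cr) == 0) {
 *		... walk lifc.lifc_req entries ...
 *		kmem_free(lifc.lifc_buf, bufsize);
 *	}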
17507 */ 17508 static int 17509 ip_lifconf_ioctl(ldi_handle_t lh, int af, struct lifconf *lifcp, 17510 uint_t *bufsizep, cred_t *cr) 17511 { 17512 int err; 17513 struct lifnum lifn; 17514 17515 bzero(&lifn, sizeof (lifn)); 17516 lifn.lifn_family = af; 17517 lifn.lifn_flags = LIFC_UNDER_IPMP; 17518 17519 if ((err = ip_ioctl(lh, SIOCGLIFNUM, &lifn, sizeof (lifn), cr)) != 0) 17520 return (err); 17521 17522 /* 17523 * Pad the interface count to account for additional interfaces that 17524 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF. 17525 */ 17526 lifn.lifn_count += 4; 17527 bzero(lifcp, sizeof (*lifcp)); 17528 lifcp->lifc_flags = LIFC_UNDER_IPMP; 17529 lifcp->lifc_family = af; 17530 lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq); 17531 lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP); 17532 17533 err = ip_ioctl(lh, SIOCGLIFCONF, lifcp, sizeof (*lifcp), cr); 17534 if (err != 0) { 17535 kmem_free(lifcp->lifc_buf, *bufsizep); 17536 return (err); 17537 } 17538 17539 return (0); 17540 } 17541 17542 /* 17543 * Helper for ip_interface_cleanup() that removes the loopback interface. 17544 */ 17545 static void 17546 ip_loopback_removeif(ldi_handle_t lh, boolean_t isv6, cred_t *cr) 17547 { 17548 int err; 17549 struct lifreq lifr; 17550 17551 bzero(&lifr, sizeof (lifr)); 17552 (void) strcpy(lifr.lifr_name, ipif_loopback_name); 17553 17554 err = ip_ioctl(lh, SIOCLIFREMOVEIF, &lifr, sizeof (lifr), cr); 17555 if (err != 0) { 17556 ip0dbg(("ip_loopback_removeif: IP%s SIOCLIFREMOVEIF failed: " 17557 "error %d\n", isv6 ? "v6" : "v4", err)); 17558 } 17559 } 17560 17561 /* 17562 * Helper for ip_interface_cleanup() that ensures no IP interfaces are in IPMP 17563 * groups and that IPMP data addresses are down. These conditions must be met 17564 * so that IPMP interfaces can be I_PUNLINK'd, as per ip_sioctl_plink_ipmp(). 17565 */ 17566 static void 17567 ip_ipmp_cleanup(ldi_handle_t lh, boolean_t isv6, cred_t *cr) 17568 { 17569 int af = isv6 ? AF_INET6 : AF_INET; 17570 int i, nifs; 17571 int err; 17572 uint_t bufsize; 17573 uint_t lifrsize = sizeof (struct lifreq); 17574 struct lifconf lifc; 17575 struct lifreq *lifrp; 17576 17577 if ((err = ip_lifconf_ioctl(lh, af, &lifc, &bufsize, cr)) != 0) { 17578 cmn_err(CE_WARN, "ip_ipmp_cleanup: cannot get interface list " 17579 "(error %d); any IPMP interfaces cannot be shutdown", err); 17580 return; 17581 } 17582 17583 nifs = lifc.lifc_len / lifrsize; 17584 for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) { 17585 err = ip_ioctl(lh, SIOCGLIFFLAGS, lifrp, lifrsize, cr); 17586 if (err != 0) { 17587 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot get " 17588 "flags: error %d", lifrp->lifr_name, err); 17589 continue; 17590 } 17591 17592 if (lifrp->lifr_flags & IFF_IPMP) { 17593 if ((lifrp->lifr_flags & (IFF_UP|IFF_DUPLICATE)) == 0) 17594 continue; 17595 17596 lifrp->lifr_flags &= ~IFF_UP; 17597 err = ip_ioctl(lh, SIOCSLIFFLAGS, lifrp, lifrsize, cr); 17598 if (err != 0) { 17599 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot " 17600 "bring down (error %d); IPMP interface may " 17601 "not be shutdown", lifrp->lifr_name, err); 17602 } 17603 17604 /* 17605 * Check if IFF_DUPLICATE is still set -- and if so, 17606 * reset the address to clear it. 
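			 *
			 * (The reset below is a SIOCGLIFADDR immediately
			 * followed by a SIOCSLIFADDR of the same address,
			 * which restarts duplicate address detection.)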
			 */
			err = ip_ioctl(lh, SIOCGLIFFLAGS, lifrp, lifrsize, cr);
			if (err != 0 || !(lifrp->lifr_flags & IFF_DUPLICATE))
				continue;

			err = ip_ioctl(lh, SIOCGLIFADDR, lifrp, lifrsize, cr);
			if (err != 0 || (err = ip_ioctl(lh, SIOCSLIFADDR,
			    lifrp, lifrsize, cr)) != 0) {
				cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot "
				    "reset DAD (error %d); IPMP interface may "
				    "not be shutdown", lifrp->lifr_name, err);
			}
			continue;
		}

		lifrp->lifr_groupname[0] = '\0';
		err = ip_ioctl(lh, SIOCSLIFGROUPNAME, lifrp, lifrsize, cr);
		if (err != 0) {
			cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot leave "
			    "IPMP group (error %d); associated IPMP interface "
			    "may not be shutdown", lifrp->lifr_name, err);
			continue;
		}
	}

	kmem_free(lifc.lifc_buf, bufsize);
}

#define	UDPDEV		"/devices/pseudo/udp@0:udp"
#define	UDP6DEV		"/devices/pseudo/udp6@0:udp6"

/*
 * Remove the loopback interfaces and prep the IPMP interfaces to be torn down.
 * Non-loopback interfaces are either I_LINK'd or I_PLINK'd; the former go away
 * when the user-level processes in the zone are killed and the latter are
 * cleaned up by str_stack_shutdown().
 */
void
ip_interface_cleanup(ip_stack_t *ipst)
{
	ldi_handle_t	lh;
	ldi_ident_t	li;
	cred_t		*cr;
	int		err;
	int		i;
	char		*devs[] = { UDP6DEV, UDPDEV };
	netstackid_t	stackid = ipst->ips_netstack->netstack_stackid;

	if ((err = ldi_ident_from_major(ddi_name_to_major("ip"), &li)) != 0) {
		cmn_err(CE_WARN, "ip_interface_cleanup: cannot get ldi ident:"
		    " error %d", err);
		return;
	}

	cr = zone_get_kcred(netstackid_to_zoneid(stackid));
	ASSERT(cr != NULL);

	/*
	 * NOTE: loop executes exactly twice and is hardcoded to know that the
	 * first iteration is IPv6.  (Unrolling yields repetitious code, hence
	 * the loop.)
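	 *
	 * devs[0] is UDP6DEV and devs[1] is UDPDEV, so the `i == 0' tests
	 * below pass isv6 == B_TRUE only on the IPv6 pass.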
	 */
	for (i = 0; i < 2; i++) {
		err = ldi_open_by_name(devs[i], FREAD|FWRITE, cr, &lh, li);
		if (err != 0) {
			cmn_err(CE_WARN, "ip_interface_cleanup: cannot open %s:"
			    " error %d", devs[i], err);
			continue;
		}

		ip_loopback_removeif(lh, i == 0, cr);
		ip_ipmp_cleanup(lh, i == 0, cr);

		(void) ldi_close(lh, FREAD|FWRITE, cr);
	}

	ldi_ident_release(li);
	crfree(cr);
}

/*
 * This needs to be kept in sync with the nic_event_t definition.
 */
static const char *
ill_hook_event2str(nic_event_t event)
{
	switch (event) {
	case NE_PLUMB:
		return ("PLUMB");
	case NE_UNPLUMB:
		return ("UNPLUMB");
	case NE_UP:
		return ("UP");
	case NE_DOWN:
		return ("DOWN");
	case NE_ADDRESS_CHANGE:
		return ("ADDRESS_CHANGE");
	case NE_LIF_UP:
		return ("LIF_UP");
	case NE_LIF_DOWN:
		return ("LIF_DOWN");
	case NE_IFINDEX_CHANGE:
		return ("IFINDEX_CHANGE");
	default:
		return ("UNKNOWN");
	}
}

void
ill_nic_event_dispatch(ill_t *ill, lif_if_t lif, nic_event_t event,
    nic_event_data_t data, size_t datalen)
{
	ip_stack_t		*ipst = ill->ill_ipst;
	hook_nic_event_int_t	*info;
	const char		*str = NULL;

	/* create a new nic event info */
	if ((info = kmem_alloc(sizeof (*info), KM_NOSLEEP)) == NULL)
		goto fail;

	info->hnei_event.hne_nic = ill->ill_phyint->phyint_ifindex;
	info->hnei_event.hne_lif = lif;
	info->hnei_event.hne_event = event;
	info->hnei_event.hne_protocol = ill->ill_isv6 ?
	    ipst->ips_ipv6_net_data : ipst->ips_ipv4_net_data;
	info->hnei_event.hne_data = NULL;
	info->hnei_event.hne_datalen = 0;
	info->hnei_stackid = ipst->ips_netstack->netstack_stackid;

	if (data != NULL && datalen != 0) {
		info->hnei_event.hne_data = kmem_alloc(datalen, KM_NOSLEEP);
		if (info->hnei_event.hne_data == NULL)
			goto fail;
		bcopy(data, info->hnei_event.hne_data, datalen);
		info->hnei_event.hne_datalen = datalen;
	}

	if (ddi_taskq_dispatch(eventq_queue_nic, ip_ne_queue_func, info,
	    DDI_NOSLEEP) == DDI_SUCCESS)
		return;

fail:
	if (info != NULL) {
		if (info->hnei_event.hne_data != NULL) {
			kmem_free(info->hnei_event.hne_data,
			    info->hnei_event.hne_datalen);
		}
		/* must match the kmem_alloc() size above */
		kmem_free(info, sizeof (*info));
	}
	str = ill_hook_event2str(event);
	ip2dbg(("ill_nic_event_dispatch: could not dispatch %s nic event "
	    "information for %s (ENOMEM)\n", str, ill->ill_name));
}

static int
ipif_arp_up_done_tail(ipif_t *ipif, enum ip_resolver_action res_act)
{
	int		err = 0;
	const in_addr_t	*addr = NULL;
	nce_t		*nce = NULL;
	ill_t		*ill = ipif->ipif_ill;
	ill_t		*bound_ill;
	boolean_t	added_ipif = B_FALSE;
	uint16_t	state;
	uint16_t	flags;

	DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_up_done_tail",
	    ill_t *, ill, ipif_t *, ipif);
	if (ipif->ipif_lcl_addr != INADDR_ANY) {
		addr = &ipif->ipif_lcl_addr;
	}

	if ((ipif->ipif_flags & IPIF_UNNUMBERED) || addr == NULL) {
		if (res_act != Res_act_initial)
			return (EINVAL);
	}

	if (addr != NULL) {
		ipmp_illgrp_t *illg = ill->ill_grp;

		/* add unicast nce for the local addr */

		if (IS_IPMP(ill)) {
			/*
17791 * If we're here via ipif_up(), then the ipif 17792 * won't be bound yet -- add it to the group, 17793 * which will bind it if possible. (We would 17794 * add it in ipif_up(), but deleting on failure 17795 * there is gruesome.) If we're here via 17796 * ipmp_ill_bind_ipif(), then the ipif has 17797 * already been added to the group and we 17798 * just need to use the binding. 17799 */ 17800 if ((bound_ill = ipmp_ipif_bound_ill(ipif)) == NULL) { 17801 bound_ill = ipmp_illgrp_add_ipif(illg, ipif); 17802 if (bound_ill == NULL) { 17803 /* 17804 * We couldn't bind the ipif to an ill 17805 * yet, so we have nothing to publish. 17806 * Mark the address as ready and return. 17807 */ 17808 ipif->ipif_addr_ready = 1; 17809 return (0); 17810 } 17811 added_ipif = B_TRUE; 17812 } 17813 } else { 17814 bound_ill = ill; 17815 } 17816 17817 flags = (NCE_F_MYADDR | NCE_F_PUBLISH | NCE_F_AUTHORITY | 17818 NCE_F_NONUD); 17819 /* 17820 * If this is an initial bring-up (or the ipif was never 17821 * completely brought up), do DAD. Otherwise, we're here 17822 * because IPMP has rebound an address to this ill: send 17823 * unsolicited advertisements (ARP announcements) to 17824 * inform others. 17825 */ 17826 if (res_act == Res_act_initial || !ipif->ipif_addr_ready) { 17827 state = ND_UNCHANGED; /* compute in nce_add_common() */ 17828 } else { 17829 state = ND_REACHABLE; 17830 flags |= NCE_F_UNSOL_ADV; 17831 } 17832 17833 retry: 17834 err = nce_lookup_then_add_v4(ill, 17835 bound_ill->ill_phys_addr, bound_ill->ill_phys_addr_length, 17836 addr, flags, state, &nce); 17837 17838 /* 17839 * note that we may encounter EEXIST if we are moving 17840 * the nce as a result of a rebind operation. 17841 */ 17842 switch (err) { 17843 case 0: 17844 ipif->ipif_added_nce = 1; 17845 nce->nce_ipif_cnt++; 17846 break; 17847 case EEXIST: 17848 ip1dbg(("ipif_arp_up: NCE already exists for %s\n", 17849 ill->ill_name)); 17850 if (!NCE_MYADDR(nce->nce_common)) { 17851 /* 17852 * A leftover nce from before this address 17853 * existed 17854 */ 17855 ncec_delete(nce->nce_common); 17856 nce_refrele(nce); 17857 nce = NULL; 17858 goto retry; 17859 } 17860 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 17861 nce_refrele(nce); 17862 nce = NULL; 17863 ip1dbg(("ipif_arp_up: NCE already exists " 17864 "for %s:%u\n", ill->ill_name, 17865 ipif->ipif_id)); 17866 goto arp_up_done; 17867 } 17868 /* 17869 * Duplicate local addresses are permissible for 17870 * IPIF_POINTOPOINT interfaces which will get marked 17871 * IPIF_UNNUMBERED later in 17872 * ip_addr_availability_check(). 17873 * 17874 * The nce_ipif_cnt field tracks the number of 17875 * ipifs that have nce_addr as their local address. 17876 */ 17877 ipif->ipif_addr_ready = 1; 17878 ipif->ipif_added_nce = 1; 17879 nce->nce_ipif_cnt++; 17880 err = 0; 17881 break; 17882 default: 17883 ASSERT(nce == NULL); 17884 goto arp_up_done; 17885 } 17886 if (arp_no_defense) { 17887 if ((ipif->ipif_flags & IPIF_UP) && 17888 !ipif->ipif_addr_ready) 17889 ipif_up_notify(ipif); 17890 ipif->ipif_addr_ready = 1; 17891 } 17892 } else { 17893 /* zero address. 
nothing to publish */ 17894 ipif->ipif_addr_ready = 1; 17895 } 17896 if (nce != NULL) 17897 nce_refrele(nce); 17898 arp_up_done: 17899 if (added_ipif && err != 0) 17900 ipmp_illgrp_del_ipif(ill->ill_grp, ipif); 17901 return (err); 17902 } 17903 17904 int 17905 ipif_arp_up(ipif_t *ipif, enum ip_resolver_action res_act, boolean_t was_dup) 17906 { 17907 int err = 0; 17908 ill_t *ill = ipif->ipif_ill; 17909 boolean_t first_interface, wait_for_dlpi = B_FALSE; 17910 17911 DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_up", 17912 ill_t *, ill, ipif_t *, ipif); 17913 17914 /* 17915 * need to bring up ARP or setup mcast mapping only 17916 * when the first interface is coming UP. 17917 */ 17918 first_interface = (ill->ill_ipif_up_count == 0 && 17919 ill->ill_ipif_dup_count == 0 && !was_dup); 17920 17921 if (res_act == Res_act_initial && first_interface) { 17922 /* 17923 * Send ATTACH + BIND 17924 */ 17925 err = arp_ll_up(ill); 17926 if (err != EINPROGRESS && err != 0) 17927 return (err); 17928 17929 /* 17930 * Add NCE for local address. Start DAD. 17931 * we'll wait to hear that DAD has finished 17932 * before using the interface. 17933 */ 17934 if (err == EINPROGRESS) 17935 wait_for_dlpi = B_TRUE; 17936 } 17937 17938 if (!wait_for_dlpi) 17939 (void) ipif_arp_up_done_tail(ipif, res_act); 17940 17941 return (!wait_for_dlpi ? 0 : EINPROGRESS); 17942 } 17943 17944 /* 17945 * Finish processing of "arp_up" after all the DLPI message 17946 * exchanges have completed between arp and the driver. 17947 */ 17948 void 17949 arp_bringup_done(ill_t *ill, int err) 17950 { 17951 mblk_t *mp1; 17952 ipif_t *ipif; 17953 conn_t *connp = NULL; 17954 ipsq_t *ipsq; 17955 queue_t *q; 17956 17957 ip1dbg(("arp_bringup_done(%s)\n", ill->ill_name)); 17958 17959 ASSERT(IAM_WRITER_ILL(ill)); 17960 17961 ipsq = ill->ill_phyint->phyint_ipsq; 17962 ipif = ipsq->ipsq_xop->ipx_pending_ipif; 17963 mp1 = ipsq_pending_mp_get(ipsq, &connp); 17964 ASSERT(!((mp1 != NULL) ^ (ipif != NULL))); 17965 if (mp1 == NULL) /* bringup was aborted by the user */ 17966 return; 17967 17968 /* 17969 * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we 17970 * must have an associated conn_t. Otherwise, we're bringing this 17971 * interface back up as part of handling an asynchronous event (e.g., 17972 * physical address change). 17973 */ 17974 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) { 17975 ASSERT(connp != NULL); 17976 q = CONNP_TO_WQ(connp); 17977 } else { 17978 ASSERT(connp == NULL); 17979 q = ill->ill_rq; 17980 } 17981 if (err == 0) { 17982 if (ipif->ipif_isv6) { 17983 if ((err = ipif_up_done_v6(ipif)) != 0) 17984 ip0dbg(("arp_bringup_done: init failed\n")); 17985 } else { 17986 err = ipif_arp_up_done_tail(ipif, Res_act_initial); 17987 if (err != 0 || 17988 (err = ipif_up_done(ipif)) != 0) { 17989 ip0dbg(("arp_bringup_done: " 17990 "init failed err %x\n", err)); 17991 (void) ipif_arp_down(ipif); 17992 } 17993 17994 } 17995 } else { 17996 ip0dbg(("arp_bringup_done: DL_BIND_REQ failed\n")); 17997 } 17998 17999 if ((err == 0) && (ill->ill_up_ipifs)) { 18000 err = ill_up_ipifs(ill, q, mp1); 18001 if (err == EINPROGRESS) 18002 return; 18003 } 18004 18005 /* 18006 * If we have a moved ipif to bring up, and everything has succeeded 18007 * to this point, bring it up on the IPMP ill. Otherwise, leave it 18008 * down -- the admin can try to bring it up by hand if need be. 
	 */
	if (ill->ill_move_ipif != NULL) {
		ipif = ill->ill_move_ipif;
		ip1dbg(("bringing up ipif %p on ill %s\n", (void *)ipif,
		    ipif->ipif_ill->ill_name));
		ill->ill_move_ipif = NULL;
		if (err == 0) {
			err = ipif_up(ipif, q, mp1);
			if (err == EINPROGRESS)
				return;
		}
	}

	/*
	 * The operation must complete without EINPROGRESS since
	 * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp.
	 * Otherwise, the operation will be stuck forever in the ipsq.
	 */
	ASSERT(err != EINPROGRESS);
	if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
		DTRACE_PROBE4(ipif__ioctl, char *, "arp_bringup_done finish",
		    int, ipsq->ipsq_xop->ipx_current_ioctl,
		    ill_t *, ill, ipif_t *, ipif);
		ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
	} else {
		ipsq_current_finish(ipsq);
	}
}

/*
 * Finish processing of arp replumb after all the DLPI message
 * exchanges have completed between arp and the driver.
 */
void
arp_replumb_done(ill_t *ill, int err)
{
	mblk_t	*mp1;
	ipif_t	*ipif;
	conn_t	*connp = NULL;
	ipsq_t	*ipsq;
	queue_t	*q;

	ASSERT(IAM_WRITER_ILL(ill));

	ipsq = ill->ill_phyint->phyint_ipsq;
	ipif = ipsq->ipsq_xop->ipx_pending_ipif;
	mp1 = ipsq_pending_mp_get(ipsq, &connp);
	ASSERT(!((mp1 != NULL) ^ (ipif != NULL)));
	if (mp1 == NULL) {
		ip0dbg(("arp_replumb_done: bringup aborted ioctl %x\n",
		    ipsq->ipsq_xop->ipx_current_ioctl));
		/* bringup was aborted by the user */
		return;
	}
	/*
	 * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we
	 * must have an associated conn_t.  Otherwise, we're bringing this
	 * interface back up as part of handling an asynchronous event (e.g.,
	 * physical address change).
	 */
	if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
		ASSERT(connp != NULL);
		q = CONNP_TO_WQ(connp);
	} else {
		ASSERT(connp == NULL);
		q = ill->ill_rq;
	}
	if ((err == 0) && (ill->ill_up_ipifs)) {
		err = ill_up_ipifs(ill, q, mp1);
		if (err == EINPROGRESS)
			return;
	}
	/*
	 * The operation must complete without EINPROGRESS since
	 * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp.
	 * Otherwise, the operation will be stuck forever in the ipsq.
	 */
	ASSERT(err != EINPROGRESS);
	if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
		DTRACE_PROBE4(ipif__ioctl, char *,
		    "arp_replumb_done finish",
		    int, ipsq->ipsq_xop->ipx_current_ioctl,
		    ill_t *, ill, ipif_t *, ipif);
		ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
	} else {
		ipsq_current_finish(ipsq);
	}
}

void
ipif_up_notify(ipif_t *ipif)
{
	ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
	ip_rts_newaddrmsg(RTM_ADD, 0, ipif, RTSQ_DEFAULT);
	sctp_update_ipif(ipif, SCTP_IPIF_UP);
	ill_nic_event_dispatch(ipif->ipif_ill, MAP_IPIF_ID(ipif->ipif_id),
	    NE_LIF_UP, NULL, 0);
}

/*
 * ILB ioctl uses cv_wait (such as deleting a rule or adding a server) and
 * this assumes the context is cv_wait-able.  Hence it shouldn't be used on
 * TPI end points with STREAMS modules pushed above.  This is assured by not
 * having the IPI_MODOK flag for the ioctl.
 * And IP ensures the ILB ioctl
 * never ends up on an ipsq, otherwise we may end up processing the ioctl
 * while unwinding from the ipsq and that could be a thread from the bottom.
 */
/* ARGSUSED */
int
ip_sioctl_ilb_cmd(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *arg)
{
	mblk_t *cmd_mp = mp->b_cont->b_cont;
	ilb_cmd_t command = *((ilb_cmd_t *)cmd_mp->b_rptr);
	int ret = 0;
	int i;
	size_t size;
	ip_stack_t *ipst;
	zoneid_t zoneid;
	ilb_stack_t *ilbs;

	ipst = CONNQ_TO_IPST(q);
	ilbs = ipst->ips_netstack->netstack_ilb;
	zoneid = Q_TO_CONN(q)->conn_zoneid;

	switch (command) {
	case ILB_CREATE_RULE: {
		ilb_rule_cmd_t *cmd = (ilb_rule_cmd_t *)cmd_mp->b_rptr;

		if (MBLKL(cmd_mp) != sizeof (ilb_rule_cmd_t)) {
			ret = EINVAL;
			break;
		}

		ret = ilb_rule_add(ilbs, zoneid, cmd);
		break;
	}
	case ILB_DESTROY_RULE:
	case ILB_ENABLE_RULE:
	case ILB_DISABLE_RULE: {
		ilb_name_cmd_t *cmd = (ilb_name_cmd_t *)cmd_mp->b_rptr;

		if (MBLKL(cmd_mp) != sizeof (ilb_name_cmd_t)) {
			ret = EINVAL;
			break;
		}

		if (cmd->flags & ILB_RULE_ALLRULES) {
			if (command == ILB_DESTROY_RULE) {
				ilb_rule_del_all(ilbs, zoneid);
				break;
			} else if (command == ILB_ENABLE_RULE) {
				ilb_rule_enable_all(ilbs, zoneid);
				break;
			} else if (command == ILB_DISABLE_RULE) {
				ilb_rule_disable_all(ilbs, zoneid);
				break;
			}
		} else {
			if (command == ILB_DESTROY_RULE) {
				ret = ilb_rule_del(ilbs, zoneid, cmd->name);
			} else if (command == ILB_ENABLE_RULE) {
				ret = ilb_rule_enable(ilbs, zoneid, cmd->name,
				    NULL);
			} else if (command == ILB_DISABLE_RULE) {
				ret = ilb_rule_disable(ilbs, zoneid, cmd->name,
				    NULL);
			}
		}
		break;
	}
	case ILB_NUM_RULES: {
		ilb_num_rules_cmd_t *cmd;

		if (MBLKL(cmd_mp) != sizeof (ilb_num_rules_cmd_t)) {
			ret = EINVAL;
			break;
		}
		cmd = (ilb_num_rules_cmd_t *)cmd_mp->b_rptr;
		ilb_get_num_rules(ilbs, zoneid, &(cmd->num));
		break;
	}
	case ILB_RULE_NAMES: {
		ilb_rule_names_cmd_t *cmd;

		cmd = (ilb_rule_names_cmd_t *)cmd_mp->b_rptr;
		if (MBLKL(cmd_mp) < sizeof (ilb_rule_names_cmd_t) ||
		    cmd->num_names == 0) {
			ret = EINVAL;
			break;
		}
		size = cmd->num_names * ILB_RULE_NAMESZ;
		if (cmd_mp->b_rptr + offsetof(ilb_rule_names_cmd_t, buf) +
		    size != cmd_mp->b_wptr) {
			ret = EINVAL;
			break;
		}
		ilb_get_rulenames(ilbs, zoneid, &cmd->num_names, cmd->buf);
		break;
	}
	case ILB_NUM_SERVERS: {
		ilb_num_servers_cmd_t *cmd;

		if (MBLKL(cmd_mp) != sizeof (ilb_num_servers_cmd_t)) {
			ret = EINVAL;
			break;
		}
		cmd = (ilb_num_servers_cmd_t *)cmd_mp->b_rptr;
		ret = ilb_get_num_servers(ilbs, zoneid, cmd->name,
		    &(cmd->num));
		break;
	}
	case ILB_LIST_RULE: {
		ilb_rule_cmd_t *cmd = (ilb_rule_cmd_t *)cmd_mp->b_rptr;

		if (MBLKL(cmd_mp) != sizeof (ilb_rule_cmd_t)) {
			ret = EINVAL;
			break;
		}
		ret = ilb_rule_list(ilbs, zoneid, cmd);
		break;
	}
	case ILB_LIST_SERVERS: {
		ilb_servers_info_cmd_t *cmd;

		cmd = (ilb_servers_info_cmd_t *)cmd_mp->b_rptr;
		if (MBLKL(cmd_mp) < sizeof (ilb_servers_info_cmd_t) ||
18236 cmd->num_servers == 0) { 18237 ret = EINVAL; 18238 break; 18239 } 18240 size = cmd->num_servers * sizeof (ilb_server_info_t); 18241 if (cmd_mp->b_rptr + offsetof(ilb_servers_info_cmd_t, servers) + 18242 size != cmd_mp->b_wptr) { 18243 ret = EINVAL; 18244 break; 18245 } 18246 18247 ret = ilb_get_servers(ilbs, zoneid, cmd->name, cmd->servers, 18248 &cmd->num_servers); 18249 break; 18250 } 18251 case ILB_ADD_SERVERS: { 18252 ilb_servers_info_cmd_t *cmd; 18253 ilb_rule_t *rule; 18254 18255 cmd = (ilb_servers_info_cmd_t *)cmd_mp->b_rptr; 18256 if (MBLKL(cmd_mp) < sizeof (ilb_servers_info_cmd_t)) { 18257 ret = EINVAL; 18258 break; 18259 } 18260 size = cmd->num_servers * sizeof (ilb_server_info_t); 18261 if (cmd_mp->b_rptr + offsetof(ilb_servers_info_cmd_t, servers) + 18262 size != cmd_mp->b_wptr) { 18263 ret = EINVAL; 18264 break; 18265 } 18266 rule = ilb_find_rule(ilbs, zoneid, cmd->name, &ret); 18267 if (rule == NULL) { 18268 ASSERT(ret != 0); 18269 break; 18270 } 18271 for (i = 0; i < cmd->num_servers; i++) { 18272 ilb_server_info_t *s; 18273 18274 s = &cmd->servers[i]; 18275 s->err = ilb_server_add(ilbs, rule, s); 18276 } 18277 ILB_RULE_REFRELE(rule); 18278 break; 18279 } 18280 case ILB_DEL_SERVERS: 18281 case ILB_ENABLE_SERVERS: 18282 case ILB_DISABLE_SERVERS: { 18283 ilb_servers_cmd_t *cmd; 18284 ilb_rule_t *rule; 18285 int (*f)(); 18286 18287 cmd = (ilb_servers_cmd_t *)cmd_mp->b_rptr; 18288 if (MBLKL(cmd_mp) < sizeof (ilb_servers_cmd_t)) { 18289 ret = EINVAL; 18290 break; 18291 } 18292 size = cmd->num_servers * sizeof (ilb_server_arg_t); 18293 if (cmd_mp->b_rptr + offsetof(ilb_servers_cmd_t, servers) + 18294 size != cmd_mp->b_wptr) { 18295 ret = EINVAL; 18296 break; 18297 } 18298 18299 if (command == ILB_DEL_SERVERS) 18300 f = ilb_server_del; 18301 else if (command == ILB_ENABLE_SERVERS) 18302 f = ilb_server_enable; 18303 else if (command == ILB_DISABLE_SERVERS) 18304 f = ilb_server_disable; 18305 18306 rule = ilb_find_rule(ilbs, zoneid, cmd->name, &ret); 18307 if (rule == NULL) { 18308 ASSERT(ret != 0); 18309 break; 18310 } 18311 18312 for (i = 0; i < cmd->num_servers; i++) { 18313 ilb_server_arg_t *s; 18314 18315 s = &cmd->servers[i]; 18316 s->err = f(ilbs, zoneid, NULL, rule, &s->addr); 18317 } 18318 ILB_RULE_REFRELE(rule); 18319 break; 18320 } 18321 case ILB_LIST_NAT_TABLE: { 18322 ilb_list_nat_cmd_t *cmd; 18323 18324 cmd = (ilb_list_nat_cmd_t *)cmd_mp->b_rptr; 18325 if (MBLKL(cmd_mp) < sizeof (ilb_list_nat_cmd_t)) { 18326 ret = EINVAL; 18327 break; 18328 } 18329 size = cmd->num_nat * sizeof (ilb_nat_entry_t); 18330 if (cmd_mp->b_rptr + offsetof(ilb_list_nat_cmd_t, entries) + 18331 size != cmd_mp->b_wptr) { 18332 ret = EINVAL; 18333 break; 18334 } 18335 18336 ret = ilb_list_nat(ilbs, zoneid, cmd->entries, &cmd->num_nat, 18337 &cmd->flags); 18338 break; 18339 } 18340 case ILB_LIST_STICKY_TABLE: { 18341 ilb_list_sticky_cmd_t *cmd; 18342 18343 cmd = (ilb_list_sticky_cmd_t *)cmd_mp->b_rptr; 18344 if (MBLKL(cmd_mp) < sizeof (ilb_list_sticky_cmd_t)) { 18345 ret = EINVAL; 18346 break; 18347 } 18348 size = cmd->num_sticky * sizeof (ilb_sticky_entry_t); 18349 if (cmd_mp->b_rptr + offsetof(ilb_list_sticky_cmd_t, entries) + 18350 size != cmd_mp->b_wptr) { 18351 ret = EINVAL; 18352 break; 18353 } 18354 18355 ret = ilb_list_sticky(ilbs, zoneid, cmd->entries, 18356 &cmd->num_sticky, &cmd->flags); 18357 break; 18358 } 18359 default: 18360 ret = EINVAL; 18361 break; 18362 } 18363 done: 18364 return (ret); 18365 } 18366 18367 /* Remove all cache entries for this logical interface */ 18368 void 
18369 ipif_nce_down(ipif_t *ipif) 18370 { 18371 ill_t *ill = ipif->ipif_ill; 18372 nce_t *nce; 18373 18374 DTRACE_PROBE3(ipif__downup, char *, "ipif_nce_down", 18375 ill_t *, ill, ipif_t *, ipif); 18376 if (ipif->ipif_added_nce) { 18377 if (ipif->ipif_isv6) 18378 nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr); 18379 else 18380 nce = nce_lookup_v4(ill, &ipif->ipif_lcl_addr); 18381 if (nce != NULL) { 18382 if (--nce->nce_ipif_cnt == 0) 18383 ncec_delete(nce->nce_common); 18384 ipif->ipif_added_nce = 0; 18385 nce_refrele(nce); 18386 } else { 18387 /* 18388 * nce may already be NULL because it was already 18389 * flushed, e.g., due to a call to nce_flush 18390 */ 18391 ipif->ipif_added_nce = 0; 18392 } 18393 } 18394 /* 18395 * Make IPMP aware of the deleted data address. 18396 */ 18397 if (IS_IPMP(ill)) 18398 ipmp_illgrp_del_ipif(ill->ill_grp, ipif); 18399 18400 /* 18401 * Remove all other nces dependent on this ill when the last ipif 18402 * is going away. 18403 */ 18404 if (ill->ill_ipif_up_count == 0) { 18405 ncec_walk(ill, (pfi_t)ncec_delete_per_ill, 18406 (uchar_t *)ill, ill->ill_ipst); 18407 if (IS_UNDER_IPMP(ill)) 18408 nce_flush(ill, B_TRUE); 18409 } 18410 } 18411