1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 /* Copyright (c) 1990 Mentat Inc. */ 26 27 /* 28 * This file contains the interface control functions for IP. 
29 */ 30 31 #include <sys/types.h> 32 #include <sys/stream.h> 33 #include <sys/dlpi.h> 34 #include <sys/stropts.h> 35 #include <sys/strsun.h> 36 #include <sys/sysmacros.h> 37 #include <sys/strsubr.h> 38 #include <sys/strlog.h> 39 #include <sys/ddi.h> 40 #include <sys/sunddi.h> 41 #include <sys/cmn_err.h> 42 #include <sys/kstat.h> 43 #include <sys/debug.h> 44 #include <sys/zone.h> 45 #include <sys/sunldi.h> 46 #include <sys/file.h> 47 #include <sys/bitmap.h> 48 #include <sys/cpuvar.h> 49 #include <sys/time.h> 50 #include <sys/ctype.h> 51 #include <sys/kmem.h> 52 #include <sys/systm.h> 53 #include <sys/param.h> 54 #include <sys/socket.h> 55 #include <sys/isa_defs.h> 56 #include <net/if.h> 57 #include <net/if_arp.h> 58 #include <net/if_types.h> 59 #include <net/if_dl.h> 60 #include <net/route.h> 61 #include <sys/sockio.h> 62 #include <netinet/in.h> 63 #include <netinet/ip6.h> 64 #include <netinet/icmp6.h> 65 #include <netinet/igmp_var.h> 66 #include <sys/policy.h> 67 #include <sys/ethernet.h> 68 #include <sys/callb.h> 69 #include <sys/md5.h> 70 71 #include <inet/common.h> /* for various inet/mi.h and inet/nd.h needs */ 72 #include <inet/mi.h> 73 #include <inet/nd.h> 74 #include <inet/arp.h> 75 #include <inet/ip_arp.h> 76 #include <inet/mib2.h> 77 #include <inet/ip.h> 78 #include <inet/ip6.h> 79 #include <inet/ip6_asp.h> 80 #include <inet/tcp.h> 81 #include <inet/ip_multi.h> 82 #include <inet/ip_ire.h> 83 #include <inet/ip_ftable.h> 84 #include <inet/ip_rts.h> 85 #include <inet/ip_ndp.h> 86 #include <inet/ip_if.h> 87 #include <inet/ip_impl.h> 88 #include <inet/sctp_ip.h> 89 #include <inet/ip_netinfo.h> 90 #include <inet/ilb_ip.h> 91 92 #include <netinet/igmp.h> 93 #include <inet/ip_listutils.h> 94 #include <inet/ipclassifier.h> 95 #include <sys/mac_client.h> 96 #include <sys/dld.h> 97 98 #include <sys/systeminfo.h> 99 #include <sys/bootconf.h> 100 101 #include <sys/tsol/tndb.h> 102 #include <sys/tsol/tnet.h> 103 104 /* The character which tells where the ill_name ends 
*/ 105 #define IPIF_SEPARATOR_CHAR ':' 106 107 /* IP ioctl function table entry */ 108 typedef struct ipft_s { 109 int ipft_cmd; 110 pfi_t ipft_pfi; 111 int ipft_min_size; 112 int ipft_flags; 113 } ipft_t; 114 #define IPFT_F_NO_REPLY 0x1 /* IP ioctl does not expect any reply */ 115 #define IPFT_F_SELF_REPLY 0x2 /* ioctl callee does the ioctl reply */ 116 117 static int nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *); 118 static int nd_ill_forward_set(queue_t *q, mblk_t *mp, 119 char *value, caddr_t cp, cred_t *ioc_cr); 120 121 static boolean_t ill_is_quiescent(ill_t *); 122 static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask); 123 static ip_m_t *ip_m_lookup(t_uscalar_t mac_type); 124 static int ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, 125 mblk_t *mp, boolean_t need_up); 126 static int ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, 127 mblk_t *mp, boolean_t need_up); 128 static int ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid, 129 queue_t *q, mblk_t *mp, boolean_t need_up); 130 static int ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, 131 mblk_t *mp); 132 static int ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, 133 mblk_t *mp); 134 static int ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t, 135 queue_t *q, mblk_t *mp, boolean_t need_up); 136 static int ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, 137 int ioccmd, struct linkblk *li); 138 static ipaddr_t ip_subnet_mask(ipaddr_t addr, ipif_t **, ip_stack_t *); 139 static void ip_wput_ioctl(queue_t *q, mblk_t *mp); 140 static void ipsq_flush(ill_t *ill); 141 142 static int ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, 143 queue_t *q, mblk_t *mp, boolean_t need_up); 144 static void ipsq_delete(ipsq_t *); 145 146 static ipif_t *ipif_allocate(ill_t *ill, int id, uint_t ire_type, 147 boolean_t initialize, boolean_t insert, int *errorp); 148 static ire_t **ipif_create_bcast_ires(ipif_t 
*ipif, ire_t **irep); 149 static void ipif_delete_bcast_ires(ipif_t *ipif); 150 static int ipif_add_ires_v4(ipif_t *, boolean_t); 151 static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, 152 boolean_t isv6); 153 static int ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp); 154 static void ipif_free(ipif_t *ipif); 155 static void ipif_free_tail(ipif_t *ipif); 156 static void ipif_set_default(ipif_t *ipif); 157 static int ipif_set_values(queue_t *q, mblk_t *mp, 158 char *interf_name, uint_t *ppa); 159 static int ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, 160 queue_t *q); 161 static ipif_t *ipif_lookup_on_name(char *name, size_t namelen, 162 boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid, 163 ip_stack_t *); 164 165 static int ill_alloc_ppa(ill_if_t *, ill_t *); 166 static void ill_delete_interface_type(ill_if_t *); 167 static int ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q); 168 static void ill_dl_down(ill_t *ill); 169 static void ill_down(ill_t *ill); 170 static void ill_down_ipifs(ill_t *, boolean_t); 171 static void ill_free_mib(ill_t *ill); 172 static void ill_glist_delete(ill_t *); 173 static void ill_phyint_reinit(ill_t *ill); 174 static void ill_set_nce_router_flags(ill_t *, boolean_t); 175 static void ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *); 176 static void ill_replumb_tail(ipsq_t *, queue_t *, mblk_t *, void *); 177 178 static ip_v6intfid_func_t ip_ether_v6intfid, ip_ib_v6intfid; 179 static ip_v6intfid_func_t ip_ipv4_v6intfid, ip_ipv6_v6intfid; 180 static ip_v6intfid_func_t ip_ipmp_v6intfid, ip_nodef_v6intfid; 181 static ip_v6intfid_func_t ip_ipv4_v6destintfid, ip_ipv6_v6destintfid; 182 static ip_v4mapinfo_func_t ip_ether_v4_mapping; 183 static ip_v6mapinfo_func_t ip_ether_v6_mapping; 184 static ip_v4mapinfo_func_t ip_ib_v4_mapping; 185 static ip_v6mapinfo_func_t ip_ib_v6_mapping; 186 static ip_v4mapinfo_func_t ip_mbcast_mapping; 187 static void 
ip_cgtp_bcast_add(ire_t *, ip_stack_t *); 188 static void ip_cgtp_bcast_delete(ire_t *, ip_stack_t *); 189 static void phyint_free(phyint_t *); 190 191 static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *); 192 static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *); 193 static void ill_capability_vrrp_ack(ill_t *, mblk_t *, dl_capability_sub_t *); 194 static void ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *); 195 static void ill_capability_hcksum_reset_fill(ill_t *, mblk_t *); 196 static void ill_capability_zerocopy_ack(ill_t *, mblk_t *, 197 dl_capability_sub_t *); 198 static void ill_capability_zerocopy_reset_fill(ill_t *, mblk_t *); 199 static void ill_capability_dld_reset_fill(ill_t *, mblk_t *); 200 static void ill_capability_dld_ack(ill_t *, mblk_t *, 201 dl_capability_sub_t *); 202 static void ill_capability_dld_enable(ill_t *); 203 static void ill_capability_ack_thr(void *); 204 static void ill_capability_lso_enable(ill_t *); 205 206 static ill_t *ill_prev_usesrc(ill_t *); 207 static int ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t); 208 static void ill_disband_usesrc_group(ill_t *); 209 static void ip_sioctl_garp_reply(mblk_t *, ill_t *, void *, int); 210 211 #ifdef DEBUG 212 static void ill_trace_cleanup(const ill_t *); 213 static void ipif_trace_cleanup(const ipif_t *); 214 #endif 215 216 /* 217 * if we go over the memory footprint limit more than once in this msec 218 * interval, we'll start pruning aggressively. 
219 */ 220 int ip_min_frag_prune_time = 0; 221 222 static ipft_t ip_ioctl_ftbl[] = { 223 { IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 }, 224 { IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t), 225 IPFT_F_NO_REPLY }, 226 { IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY }, 227 { 0 } 228 }; 229 230 /* Simple ICMP IP Header Template */ 231 static ipha_t icmp_ipha = { 232 IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP 233 }; 234 235 static uchar_t ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; 236 237 static ip_m_t ip_m_tbl[] = { 238 { DL_ETHER, IFT_ETHER, ETHERTYPE_IP, ETHERTYPE_IPV6, 239 ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid, 240 ip_nodef_v6intfid }, 241 { DL_CSMACD, IFT_ISO88023, ETHERTYPE_IP, ETHERTYPE_IPV6, 242 ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid, 243 ip_nodef_v6intfid }, 244 { DL_TPB, IFT_ISO88024, ETHERTYPE_IP, ETHERTYPE_IPV6, 245 ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid, 246 ip_nodef_v6intfid }, 247 { DL_TPR, IFT_ISO88025, ETHERTYPE_IP, ETHERTYPE_IPV6, 248 ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid, 249 ip_nodef_v6intfid }, 250 { DL_FDDI, IFT_FDDI, ETHERTYPE_IP, ETHERTYPE_IPV6, 251 ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid, 252 ip_nodef_v6intfid }, 253 { DL_IB, IFT_IB, ETHERTYPE_IP, ETHERTYPE_IPV6, 254 ip_ib_v4_mapping, ip_ib_v6_mapping, ip_ib_v6intfid, 255 ip_nodef_v6intfid }, 256 { DL_IPV4, IFT_IPV4, IPPROTO_ENCAP, IPPROTO_IPV6, 257 ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid, 258 ip_ipv4_v6destintfid }, 259 { DL_IPV6, IFT_IPV6, IPPROTO_ENCAP, IPPROTO_IPV6, 260 ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv6_v6intfid, 261 ip_ipv6_v6destintfid }, 262 { DL_6TO4, IFT_6TO4, IPPROTO_ENCAP, IPPROTO_IPV6, 263 ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid, 264 ip_nodef_v6intfid }, 265 { SUNW_DL_VNI, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6, 266 NULL, NULL, ip_nodef_v6intfid, ip_nodef_v6intfid }, 267 { 
SUNW_DL_IPMP, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6, 268 NULL, NULL, ip_ipmp_v6intfid, ip_nodef_v6intfid }, 269 { DL_OTHER, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6, 270 ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid, 271 ip_nodef_v6intfid } 272 }; 273 274 static ill_t ill_null; /* Empty ILL for init. */ 275 char ipif_loopback_name[] = "lo0"; 276 static char *ipv4_forward_suffix = ":ip_forwarding"; 277 static char *ipv6_forward_suffix = ":ip6_forwarding"; 278 static sin6_t sin6_null; /* Zero address for quick clears */ 279 static sin_t sin_null; /* Zero address for quick clears */ 280 281 /* When set search for unused ipif_seqid */ 282 static ipif_t ipif_zero; 283 284 /* 285 * ppa arena is created after these many 286 * interfaces have been plumbed. 287 */ 288 uint_t ill_no_arena = 12; /* Setable in /etc/system */ 289 290 /* 291 * Allocate per-interface mibs. 292 * Returns true if ok. False otherwise. 293 * ipsq may not yet be allocated (loopback case ). 294 */ 295 static boolean_t 296 ill_allocate_mibs(ill_t *ill) 297 { 298 /* Already allocated? 
*/ 299 if (ill->ill_ip_mib != NULL) { 300 if (ill->ill_isv6) 301 ASSERT(ill->ill_icmp6_mib != NULL); 302 return (B_TRUE); 303 } 304 305 ill->ill_ip_mib = kmem_zalloc(sizeof (*ill->ill_ip_mib), 306 KM_NOSLEEP); 307 if (ill->ill_ip_mib == NULL) { 308 return (B_FALSE); 309 } 310 311 /* Setup static information */ 312 SET_MIB(ill->ill_ip_mib->ipIfStatsEntrySize, 313 sizeof (mib2_ipIfStatsEntry_t)); 314 if (ill->ill_isv6) { 315 ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6; 316 SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize, 317 sizeof (mib2_ipv6AddrEntry_t)); 318 SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize, 319 sizeof (mib2_ipv6RouteEntry_t)); 320 SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize, 321 sizeof (mib2_ipv6NetToMediaEntry_t)); 322 SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize, 323 sizeof (ipv6_member_t)); 324 SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize, 325 sizeof (ipv6_grpsrc_t)); 326 } else { 327 ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4; 328 SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize, 329 sizeof (mib2_ipAddrEntry_t)); 330 SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize, 331 sizeof (mib2_ipRouteEntry_t)); 332 SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize, 333 sizeof (mib2_ipNetToMediaEntry_t)); 334 SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize, 335 sizeof (ip_member_t)); 336 SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize, 337 sizeof (ip_grpsrc_t)); 338 339 /* 340 * For a v4 ill, we are done at this point, because per ill 341 * icmp mibs are only used for v6. 
342 */ 343 return (B_TRUE); 344 } 345 346 ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib), 347 KM_NOSLEEP); 348 if (ill->ill_icmp6_mib == NULL) { 349 kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib)); 350 ill->ill_ip_mib = NULL; 351 return (B_FALSE); 352 } 353 /* static icmp info */ 354 ill->ill_icmp6_mib->ipv6IfIcmpEntrySize = 355 sizeof (mib2_ipv6IfIcmpEntry_t); 356 /* 357 * The ipIfStatsIfindex and ipv6IfIcmpIndex will be assigned later 358 * after the phyint merge occurs in ipif_set_values -> ill_glist_insert 359 * -> ill_phyint_reinit 360 */ 361 return (B_TRUE); 362 } 363 364 /* 365 * Completely vaporize a lower level tap and all associated interfaces. 366 * ill_delete is called only out of ip_close when the device control 367 * stream is being closed. 368 */ 369 void 370 ill_delete(ill_t *ill) 371 { 372 ipif_t *ipif; 373 ill_t *prev_ill; 374 ip_stack_t *ipst = ill->ill_ipst; 375 376 /* 377 * ill_delete may be forcibly entering the ipsq. The previous 378 * ioctl may not have completed and may need to be aborted. 379 * ipsq_flush takes care of it. If we don't need to enter the 380 * the ipsq forcibly, the 2nd invocation of ipsq_flush in 381 * ill_delete_tail is sufficient. 382 */ 383 ipsq_flush(ill); 384 385 /* 386 * Nuke all interfaces. ipif_free will take down the interface, 387 * remove it from the list, and free the data structure. 388 * Walk down the ipif list and remove the logical interfaces 389 * first before removing the main ipif. We can't unplumb 390 * zeroth interface first in the case of IPv6 as update_conn_ill 391 * -> ip_ll_multireq de-references ill_ipif for checking 392 * POINTOPOINT. 393 * 394 * If ill_ipif was not properly initialized (i.e low on memory), 395 * then no interfaces to clean up. In this case just clean up the 396 * ill. 
397 */ 398 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) 399 ipif_free(ipif); 400 401 /* 402 * clean out all the nce_t entries that depend on this 403 * ill for the ill_phys_addr. 404 */ 405 nce_flush(ill, B_TRUE); 406 407 /* Clean up msgs on pending upcalls for mrouted */ 408 reset_mrt_ill(ill); 409 410 update_conn_ill(ill, ipst); 411 412 /* 413 * Remove multicast references added as a result of calls to 414 * ip_join_allmulti(). 415 */ 416 ip_purge_allmulti(ill); 417 418 /* 419 * If the ill being deleted is under IPMP, boot it out of the illgrp. 420 */ 421 if (IS_UNDER_IPMP(ill)) 422 ipmp_ill_leave_illgrp(ill); 423 424 /* 425 * ill_down will arrange to blow off any IRE's dependent on this 426 * ILL, and shut down fragmentation reassembly. 427 */ 428 ill_down(ill); 429 430 /* Let SCTP know, so that it can remove this from its list. */ 431 sctp_update_ill(ill, SCTP_ILL_REMOVE); 432 433 /* 434 * Walk all CONNs that can have a reference on an ire or nce for this 435 * ill (we actually walk all that now have stale references). 436 */ 437 ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst); 438 439 /* With IPv6 we have dce_ifindex. Cleanup for neatness */ 440 if (ill->ill_isv6) 441 dce_cleanup(ill->ill_phyint->phyint_ifindex, ipst); 442 443 /* 444 * If an address on this ILL is being used as a source address then 445 * clear out the pointers in other ILLs that point to this ILL. 446 */ 447 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER); 448 if (ill->ill_usesrc_grp_next != NULL) { 449 if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? 
*/ 450 ill_disband_usesrc_group(ill); 451 } else { /* consumer of the usesrc ILL */ 452 prev_ill = ill_prev_usesrc(ill); 453 prev_ill->ill_usesrc_grp_next = 454 ill->ill_usesrc_grp_next; 455 } 456 } 457 rw_exit(&ipst->ips_ill_g_usesrc_lock); 458 } 459 460 static void 461 ipif_non_duplicate(ipif_t *ipif) 462 { 463 ill_t *ill = ipif->ipif_ill; 464 mutex_enter(&ill->ill_lock); 465 if (ipif->ipif_flags & IPIF_DUPLICATE) { 466 ipif->ipif_flags &= ~IPIF_DUPLICATE; 467 ASSERT(ill->ill_ipif_dup_count > 0); 468 ill->ill_ipif_dup_count--; 469 } 470 mutex_exit(&ill->ill_lock); 471 } 472 473 /* 474 * ill_delete_tail is called from ip_modclose after all references 475 * to the closing ill are gone. The wait is done in ip_modclose 476 */ 477 void 478 ill_delete_tail(ill_t *ill) 479 { 480 mblk_t **mpp; 481 ipif_t *ipif; 482 ip_stack_t *ipst = ill->ill_ipst; 483 484 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 485 ipif_non_duplicate(ipif); 486 (void) ipif_down_tail(ipif); 487 } 488 489 ASSERT(ill->ill_ipif_dup_count == 0); 490 491 /* 492 * If polling capability is enabled (which signifies direct 493 * upcall into IP and driver has ill saved as a handle), 494 * we need to make sure that unbind has completed before we 495 * let the ill disappear and driver no longer has any reference 496 * to this ill. 497 */ 498 mutex_enter(&ill->ill_lock); 499 while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS) 500 cv_wait(&ill->ill_cv, &ill->ill_lock); 501 mutex_exit(&ill->ill_lock); 502 ASSERT(!(ill->ill_capabilities & 503 (ILL_CAPAB_DLD | ILL_CAPAB_DLD_POLL | ILL_CAPAB_DLD_DIRECT))); 504 505 if (ill->ill_net_type != IRE_LOOPBACK) 506 qprocsoff(ill->ill_rq); 507 508 /* 509 * We do an ipsq_flush once again now. New messages could have 510 * landed up from below (M_ERROR or M_HANGUP). 
Similarly ioctls 511 * could also have landed up if an ioctl thread had looked up 512 * the ill before we set the ILL_CONDEMNED flag, but not yet 513 * enqueued the ioctl when we did the ipsq_flush last time. 514 */ 515 ipsq_flush(ill); 516 517 /* 518 * Free capabilities. 519 */ 520 if (ill->ill_hcksum_capab != NULL) { 521 kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t)); 522 ill->ill_hcksum_capab = NULL; 523 } 524 525 if (ill->ill_zerocopy_capab != NULL) { 526 kmem_free(ill->ill_zerocopy_capab, 527 sizeof (ill_zerocopy_capab_t)); 528 ill->ill_zerocopy_capab = NULL; 529 } 530 531 if (ill->ill_lso_capab != NULL) { 532 kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t)); 533 ill->ill_lso_capab = NULL; 534 } 535 536 if (ill->ill_dld_capab != NULL) { 537 kmem_free(ill->ill_dld_capab, sizeof (ill_dld_capab_t)); 538 ill->ill_dld_capab = NULL; 539 } 540 541 while (ill->ill_ipif != NULL) 542 ipif_free_tail(ill->ill_ipif); 543 544 /* 545 * We have removed all references to ilm from conn and the ones joined 546 * within the kernel. 547 * 548 * We don't walk conns, mrts and ires because 549 * 550 * 1) update_conn_ill and reset_mrt_ill cleans up conns and mrts. 551 * 2) ill_down ->ill_downi walks all the ires and cleans up 552 * ill references. 553 */ 554 555 /* 556 * If this ill is an IPMP meta-interface, blow away the illgrp. This 557 * is safe to do because the illgrp has already been unlinked from the 558 * group by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find it. 559 */ 560 if (IS_IPMP(ill)) { 561 ipmp_illgrp_destroy(ill->ill_grp); 562 ill->ill_grp = NULL; 563 } 564 565 /* 566 * Take us out of the list of ILLs. ill_glist_delete -> phyint_free 567 * could free the phyint. No more reference to the phyint after this 568 * point. 
569 */ 570 (void) ill_glist_delete(ill); 571 572 rw_enter(&ipst->ips_ip_g_nd_lock, RW_WRITER); 573 if (ill->ill_ndd_name != NULL) 574 nd_unload(&ipst->ips_ip_g_nd, ill->ill_ndd_name); 575 rw_exit(&ipst->ips_ip_g_nd_lock); 576 577 if (ill->ill_frag_ptr != NULL) { 578 uint_t count; 579 580 for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) { 581 mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock); 582 } 583 mi_free(ill->ill_frag_ptr); 584 ill->ill_frag_ptr = NULL; 585 ill->ill_frag_hash_tbl = NULL; 586 } 587 588 freemsg(ill->ill_nd_lla_mp); 589 /* Free all retained control messages. */ 590 mpp = &ill->ill_first_mp_to_free; 591 do { 592 while (mpp[0]) { 593 mblk_t *mp; 594 mblk_t *mp1; 595 596 mp = mpp[0]; 597 mpp[0] = mp->b_next; 598 for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) { 599 mp1->b_next = NULL; 600 mp1->b_prev = NULL; 601 } 602 freemsg(mp); 603 } 604 } while (mpp++ != &ill->ill_last_mp_to_free); 605 606 ill_free_mib(ill); 607 608 #ifdef DEBUG 609 ill_trace_cleanup(ill); 610 #endif 611 612 /* The default multicast interface might have changed */ 613 ire_increment_multicast_generation(ipst, ill->ill_isv6); 614 615 /* Drop refcnt here */ 616 netstack_rele(ill->ill_ipst->ips_netstack); 617 ill->ill_ipst = NULL; 618 } 619 620 static void 621 ill_free_mib(ill_t *ill) 622 { 623 ip_stack_t *ipst = ill->ill_ipst; 624 625 /* 626 * MIB statistics must not be lost, so when an interface 627 * goes away the counter values will be added to the global 628 * MIBs. 
629 */ 630 if (ill->ill_ip_mib != NULL) { 631 if (ill->ill_isv6) { 632 ip_mib2_add_ip_stats(&ipst->ips_ip6_mib, 633 ill->ill_ip_mib); 634 } else { 635 ip_mib2_add_ip_stats(&ipst->ips_ip_mib, 636 ill->ill_ip_mib); 637 } 638 639 kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib)); 640 ill->ill_ip_mib = NULL; 641 } 642 if (ill->ill_icmp6_mib != NULL) { 643 ip_mib2_add_icmp6_stats(&ipst->ips_icmp6_mib, 644 ill->ill_icmp6_mib); 645 kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib)); 646 ill->ill_icmp6_mib = NULL; 647 } 648 } 649 650 /* 651 * Concatenate together a physical address and a sap. 652 * 653 * Sap_lengths are interpreted as follows: 654 * sap_length == 0 ==> no sap 655 * sap_length > 0 ==> sap is at the head of the dlpi address 656 * sap_length < 0 ==> sap is at the tail of the dlpi address 657 */ 658 static void 659 ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length, 660 t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst) 661 { 662 uint16_t sap_addr = (uint16_t)sap_src; 663 664 if (sap_length == 0) { 665 if (phys_src == NULL) 666 bzero(dst, phys_length); 667 else 668 bcopy(phys_src, dst, phys_length); 669 } else if (sap_length < 0) { 670 if (phys_src == NULL) 671 bzero(dst, phys_length); 672 else 673 bcopy(phys_src, dst, phys_length); 674 bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr)); 675 } else { 676 bcopy(&sap_addr, dst, sizeof (sap_addr)); 677 if (phys_src == NULL) 678 bzero((char *)dst + sap_length, phys_length); 679 else 680 bcopy(phys_src, (char *)dst + sap_length, phys_length); 681 } 682 } 683 684 /* 685 * Generate a dl_unitdata_req mblk for the device and address given. 686 * addr_length is the length of the physical portion of the address. 687 * If addr is NULL include an all zero address of the specified length. 688 * TRUE? In any case, addr_length is taken to be the entire length of the 689 * dlpi address, including the absolute value of sap_length. 
690 */ 691 mblk_t * 692 ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap, 693 t_scalar_t sap_length) 694 { 695 dl_unitdata_req_t *dlur; 696 mblk_t *mp; 697 t_scalar_t abs_sap_length; /* absolute value */ 698 699 abs_sap_length = ABS(sap_length); 700 mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length, 701 DL_UNITDATA_REQ); 702 if (mp == NULL) 703 return (NULL); 704 dlur = (dl_unitdata_req_t *)mp->b_rptr; 705 /* HACK: accomodate incompatible DLPI drivers */ 706 if (addr_length == 8) 707 addr_length = 6; 708 dlur->dl_dest_addr_length = addr_length + abs_sap_length; 709 dlur->dl_dest_addr_offset = sizeof (*dlur); 710 dlur->dl_priority.dl_min = 0; 711 dlur->dl_priority.dl_max = 0; 712 ill_dlur_copy_address(addr, addr_length, sap, sap_length, 713 (uchar_t *)&dlur[1]); 714 return (mp); 715 } 716 717 /* 718 * Add the pending mp to the list. There can be only 1 pending mp 719 * in the list. Any exclusive ioctl that needs to wait for a response 720 * from another module or driver needs to use this function to set 721 * the ipx_pending_mp to the ioctl mblk and wait for the response from 722 * the other module/driver. This is also used while waiting for the 723 * ipif/ill/ire refcnts to drop to zero in bringing down an ipif. 724 */ 725 boolean_t 726 ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp, 727 int waitfor) 728 { 729 ipxop_t *ipx = ipif->ipif_ill->ill_phyint->phyint_ipsq->ipsq_xop; 730 731 ASSERT(IAM_WRITER_IPIF(ipif)); 732 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 733 ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL)); 734 ASSERT(ipx->ipx_pending_mp == NULL); 735 /* 736 * The caller may be using a different ipif than the one passed into 737 * ipsq_current_start() (e.g., suppose an ioctl that came in on the V4 738 * ill needs to wait for the V6 ill to quiesce). So we can't ASSERT 739 * that `ipx_current_ipif == ipif'. 
740 */ 741 ASSERT(ipx->ipx_current_ipif != NULL); 742 743 /* 744 * M_IOCDATA from ioctls, M_ERROR/M_HANGUP/M_PROTO/M_PCPROTO from the 745 * driver. 746 */ 747 ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_ERROR) || 748 (DB_TYPE(add_mp) == M_HANGUP) || (DB_TYPE(add_mp) == M_PROTO) || 749 (DB_TYPE(add_mp) == M_PCPROTO)); 750 751 if (connp != NULL) { 752 ASSERT(MUTEX_HELD(&connp->conn_lock)); 753 /* 754 * Return error if the conn has started closing. The conn 755 * could have finished cleaning up the pending mp list, 756 * If so we should not add another mp to the list negating 757 * the cleanup. 758 */ 759 if (connp->conn_state_flags & CONN_CLOSING) 760 return (B_FALSE); 761 } 762 mutex_enter(&ipx->ipx_lock); 763 ipx->ipx_pending_ipif = ipif; 764 /* 765 * Note down the queue in b_queue. This will be returned by 766 * ipsq_pending_mp_get. Caller will then use these values to restart 767 * the processing 768 */ 769 add_mp->b_next = NULL; 770 add_mp->b_queue = q; 771 ipx->ipx_pending_mp = add_mp; 772 ipx->ipx_waitfor = waitfor; 773 mutex_exit(&ipx->ipx_lock); 774 775 if (connp != NULL) 776 connp->conn_oper_pending_ill = ipif->ipif_ill; 777 778 return (B_TRUE); 779 } 780 781 /* 782 * Retrieve the ipx_pending_mp and return it. There can be only 1 mp 783 * queued in the list. 784 */ 785 mblk_t * 786 ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp) 787 { 788 mblk_t *curr = NULL; 789 ipxop_t *ipx = ipsq->ipsq_xop; 790 791 *connpp = NULL; 792 mutex_enter(&ipx->ipx_lock); 793 if (ipx->ipx_pending_mp == NULL) { 794 mutex_exit(&ipx->ipx_lock); 795 return (NULL); 796 } 797 798 /* There can be only 1 such excl message */ 799 curr = ipx->ipx_pending_mp; 800 ASSERT(curr->b_next == NULL); 801 ipx->ipx_pending_ipif = NULL; 802 ipx->ipx_pending_mp = NULL; 803 ipx->ipx_waitfor = 0; 804 mutex_exit(&ipx->ipx_lock); 805 806 if (CONN_Q(curr->b_queue)) { 807 /* 808 * This mp did a refhold on the conn, at the start of the ioctl. 
809 * So we can safely return a pointer to the conn to the caller. 810 */ 811 *connpp = Q_TO_CONN(curr->b_queue); 812 } else { 813 *connpp = NULL; 814 } 815 curr->b_next = NULL; 816 curr->b_prev = NULL; 817 return (curr); 818 } 819 820 /* 821 * Cleanup the ioctl mp queued in ipx_pending_mp 822 * - Called in the ill_delete path 823 * - Called in the M_ERROR or M_HANGUP path on the ill. 824 * - Called in the conn close path. 825 */ 826 boolean_t 827 ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp) 828 { 829 mblk_t *mp; 830 ipxop_t *ipx; 831 queue_t *q; 832 ipif_t *ipif; 833 int cmd; 834 835 ASSERT(IAM_WRITER_ILL(ill)); 836 ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop; 837 838 /* 839 * If connp is null, unconditionally clean up the ipx_pending_mp. 840 * This happens in M_ERROR/M_HANGUP. We need to abort the current ioctl 841 * even if it is meant for another ill, since we have to enqueue 842 * a new mp now in ipx_pending_mp to complete the ipif_down. 843 * If connp is non-null we are called from the conn close path. 844 */ 845 mutex_enter(&ipx->ipx_lock); 846 mp = ipx->ipx_pending_mp; 847 if (mp == NULL || (connp != NULL && 848 mp->b_queue != CONNP_TO_WQ(connp))) { 849 mutex_exit(&ipx->ipx_lock); 850 return (B_FALSE); 851 } 852 /* Now remove from the ipx_pending_mp */ 853 ipx->ipx_pending_mp = NULL; 854 q = mp->b_queue; 855 mp->b_next = NULL; 856 mp->b_prev = NULL; 857 mp->b_queue = NULL; 858 859 ipif = ipx->ipx_pending_ipif; 860 ipx->ipx_pending_ipif = NULL; 861 ipx->ipx_waitfor = 0; 862 ipx->ipx_current_ipif = NULL; 863 cmd = ipx->ipx_current_ioctl; 864 ipx->ipx_current_ioctl = 0; 865 ipx->ipx_current_done = B_TRUE; 866 mutex_exit(&ipx->ipx_lock); 867 868 if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) { 869 DTRACE_PROBE4(ipif__ioctl, 870 char *, "ipsq_pending_mp_cleanup", 871 int, cmd, ill_t *, ipif == NULL ? 
NULL : ipif->ipif_ill, 872 ipif_t *, ipif); 873 if (connp == NULL) { 874 ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL); 875 } else { 876 ip_ioctl_finish(q, mp, ENXIO, CONN_CLOSE, NULL); 877 mutex_enter(&ipif->ipif_ill->ill_lock); 878 ipif->ipif_state_flags &= ~IPIF_CHANGING; 879 mutex_exit(&ipif->ipif_ill->ill_lock); 880 } 881 } else { 882 /* 883 * IP-MT XXX In the case of TLI/XTI bind / optmgmt this can't 884 * be just inet_freemsg. we have to restart it 885 * otherwise the thread will be stuck. 886 */ 887 inet_freemsg(mp); 888 } 889 return (B_TRUE); 890 } 891 892 /* 893 * Called in the conn close path and ill delete path 894 */ 895 static void 896 ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp) 897 { 898 ipsq_t *ipsq; 899 mblk_t *prev; 900 mblk_t *curr; 901 mblk_t *next; 902 queue_t *q; 903 mblk_t *tmp_list = NULL; 904 905 ASSERT(IAM_WRITER_ILL(ill)); 906 if (connp != NULL) 907 q = CONNP_TO_WQ(connp); 908 else 909 q = ill->ill_wq; 910 911 ipsq = ill->ill_phyint->phyint_ipsq; 912 /* 913 * Cleanup the ioctl mp's queued in ipsq_xopq_pending_mp if any. 914 * In the case of ioctl from a conn, there can be only 1 mp 915 * queued on the ipsq. If an ill is being unplumbed, only messages 916 * related to this ill are flushed, like M_ERROR or M_HANGUP message. 917 * ioctls meant for this ill form conn's are not flushed. They will 918 * be processed during ipsq_exit and will not find the ill and will 919 * return error. 
920 */ 921 mutex_enter(&ipsq->ipsq_lock); 922 for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL; 923 curr = next) { 924 next = curr->b_next; 925 if (curr->b_queue == q || curr->b_queue == RD(q)) { 926 /* Unlink the mblk from the pending mp list */ 927 if (prev != NULL) { 928 prev->b_next = curr->b_next; 929 } else { 930 ASSERT(ipsq->ipsq_xopq_mphead == curr); 931 ipsq->ipsq_xopq_mphead = curr->b_next; 932 } 933 if (ipsq->ipsq_xopq_mptail == curr) 934 ipsq->ipsq_xopq_mptail = prev; 935 /* 936 * Create a temporary list and release the ipsq lock 937 * New elements are added to the head of the tmp_list 938 */ 939 curr->b_next = tmp_list; 940 tmp_list = curr; 941 } else { 942 prev = curr; 943 } 944 } 945 mutex_exit(&ipsq->ipsq_lock); 946 947 while (tmp_list != NULL) { 948 curr = tmp_list; 949 tmp_list = curr->b_next; 950 curr->b_next = NULL; 951 curr->b_prev = NULL; 952 curr->b_queue = NULL; 953 if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) { 954 DTRACE_PROBE4(ipif__ioctl, 955 char *, "ipsq_xopq_mp_cleanup", 956 int, 0, ill_t *, NULL, ipif_t *, NULL); 957 ip_ioctl_finish(q, curr, ENXIO, connp != NULL ? 958 CONN_CLOSE : NO_COPYOUT, NULL); 959 } else { 960 /* 961 * IP-MT XXX In the case of TLI/XTI bind / optmgmt 962 * this can't be just inet_freemsg. we have to 963 * restart it otherwise the thread will be stuck. 964 */ 965 inet_freemsg(curr); 966 } 967 } 968 } 969 970 /* 971 * This conn has started closing. Cleanup any pending ioctl from this conn. 972 * STREAMS ensures that there can be at most 1 ioctl pending on a stream. 973 */ 974 void 975 conn_ioctl_cleanup(conn_t *connp) 976 { 977 ipsq_t *ipsq; 978 ill_t *ill; 979 boolean_t refheld; 980 981 /* 982 * Is any exclusive ioctl pending ? If so clean it up. If the 983 * ioctl has not yet started, the mp is pending in the list headed by 984 * ipsq_xopq_head. If the ioctl has started the mp could be present in 985 * ipx_pending_mp. 
If the ioctl timed out in the streamhead but 986 * is currently executing now the mp is not queued anywhere but 987 * conn_oper_pending_ill is null. The conn close will wait 988 * till the conn_ref drops to zero. 989 */ 990 mutex_enter(&connp->conn_lock); 991 ill = connp->conn_oper_pending_ill; 992 if (ill == NULL) { 993 mutex_exit(&connp->conn_lock); 994 return; 995 } 996 997 /* 998 * We may not be able to refhold the ill if the ill/ipif 999 * is changing. But we need to make sure that the ill will 1000 * not vanish. So we just bump up the ill_waiter count. 1001 */ 1002 refheld = ill_waiter_inc(ill); 1003 mutex_exit(&connp->conn_lock); 1004 if (refheld) { 1005 if (ipsq_enter(ill, B_TRUE, NEW_OP)) { 1006 ill_waiter_dcr(ill); 1007 /* 1008 * Check whether this ioctl has started and is 1009 * pending. If it is not found there then check 1010 * whether this ioctl has not even started and is in 1011 * the ipsq_xopq list. 1012 */ 1013 if (!ipsq_pending_mp_cleanup(ill, connp)) 1014 ipsq_xopq_mp_cleanup(ill, connp); 1015 ipsq = ill->ill_phyint->phyint_ipsq; 1016 ipsq_exit(ipsq); 1017 return; 1018 } 1019 } 1020 1021 /* 1022 * The ill is also closing and we could not bump up the 1023 * ill_waiter_count or we could not enter the ipsq. Leave 1024 * the cleanup to ill_delete 1025 */ 1026 mutex_enter(&connp->conn_lock); 1027 while (connp->conn_oper_pending_ill != NULL) 1028 cv_wait(&connp->conn_refcv, &connp->conn_lock); 1029 mutex_exit(&connp->conn_lock); 1030 if (refheld) 1031 ill_waiter_dcr(ill); 1032 } 1033 1034 /* 1035 * ipcl_walk function for cleaning up conn_*_ill fields. 1036 * Note that we leave ixa_multicast_ifindex, conn_incoming_ifindex, and 1037 * conn_bound_if in place. We prefer dropping 1038 * packets instead of sending them out the wrong interface, or accepting 1039 * packets from the wrong ifindex. 
1040 */ 1041 static void 1042 conn_cleanup_ill(conn_t *connp, caddr_t arg) 1043 { 1044 ill_t *ill = (ill_t *)arg; 1045 1046 mutex_enter(&connp->conn_lock); 1047 if (connp->conn_dhcpinit_ill == ill) { 1048 connp->conn_dhcpinit_ill = NULL; 1049 ASSERT(ill->ill_dhcpinit != 0); 1050 atomic_dec_32(&ill->ill_dhcpinit); 1051 ill_set_inputfn(ill); 1052 } 1053 mutex_exit(&connp->conn_lock); 1054 } 1055 1056 static int 1057 ill_down_ipifs_tail(ill_t *ill) 1058 { 1059 ipif_t *ipif; 1060 int err; 1061 1062 ASSERT(IAM_WRITER_ILL(ill)); 1063 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 1064 ipif_non_duplicate(ipif); 1065 /* 1066 * ipif_down_tail will call arp_ll_down on the last ipif 1067 * and typically return EINPROGRESS when the DL_UNBIND is sent. 1068 */ 1069 if ((err = ipif_down_tail(ipif)) != 0) 1070 return (err); 1071 } 1072 return (0); 1073 } 1074 1075 /* ARGSUSED */ 1076 void 1077 ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 1078 { 1079 ASSERT(IAM_WRITER_IPSQ(ipsq)); 1080 (void) ill_down_ipifs_tail(q->q_ptr); 1081 freemsg(mp); 1082 ipsq_current_finish(ipsq); 1083 } 1084 1085 /* 1086 * ill_down_start is called when we want to down this ill and bring it up again 1087 * It is called when we receive an M_ERROR / M_HANGUP. In this case we shut down 1088 * all interfaces, but don't tear down any plumbing. 1089 */ 1090 boolean_t 1091 ill_down_start(queue_t *q, mblk_t *mp) 1092 { 1093 ill_t *ill = q->q_ptr; 1094 ipif_t *ipif; 1095 1096 ASSERT(IAM_WRITER_ILL(ill)); 1097 mutex_enter(&ill->ill_lock); 1098 ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS; 1099 /* no more nce addition allowed */ 1100 mutex_exit(&ill->ill_lock); 1101 1102 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) 1103 (void) ipif_down(ipif, NULL, NULL); 1104 1105 ill_down(ill); 1106 1107 /* 1108 * Walk all CONNs that can have a reference on an ire or nce for this 1109 * ill (we actually walk all that now have stale references). 
1110 */ 1111 ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ill->ill_ipst); 1112 1113 /* With IPv6 we have dce_ifindex. Cleanup for neatness */ 1114 if (ill->ill_isv6) 1115 dce_cleanup(ill->ill_phyint->phyint_ifindex, ill->ill_ipst); 1116 1117 1118 (void) ipsq_pending_mp_cleanup(ill, NULL); 1119 1120 ipsq_current_start(ill->ill_phyint->phyint_ipsq, ill->ill_ipif, 0); 1121 1122 /* 1123 * Atomically test and add the pending mp if references are active. 1124 */ 1125 mutex_enter(&ill->ill_lock); 1126 if (!ill_is_quiescent(ill)) { 1127 /* call cannot fail since `conn_t *' argument is NULL */ 1128 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq, 1129 mp, ILL_DOWN); 1130 mutex_exit(&ill->ill_lock); 1131 return (B_FALSE); 1132 } 1133 mutex_exit(&ill->ill_lock); 1134 return (B_TRUE); 1135 } 1136 1137 static void 1138 ill_down(ill_t *ill) 1139 { 1140 mblk_t *mp; 1141 ip_stack_t *ipst = ill->ill_ipst; 1142 1143 /* 1144 * Blow off any IREs dependent on this ILL. 1145 * The caller needs to handle conn_ixa_cleanup 1146 */ 1147 ill_delete_ires(ill); 1148 1149 ire_walk_ill(0, 0, ill_downi, ill, ill); 1150 1151 /* Remove any conn_*_ill depending on this ill */ 1152 ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst); 1153 1154 /* 1155 * Free state for additional IREs. 1156 */ 1157 mutex_enter(&ill->ill_saved_ire_lock); 1158 mp = ill->ill_saved_ire_mp; 1159 ill->ill_saved_ire_mp = NULL; 1160 ill->ill_saved_ire_cnt = 0; 1161 mutex_exit(&ill->ill_saved_ire_lock); 1162 freemsg(mp); 1163 } 1164 1165 /* 1166 * ire_walk routine used to delete every IRE that depends on 1167 * 'ill'. (Always called as writer.) 1168 * 1169 * Note: since the routes added by the kernel are deleted separately, 1170 * this will only be 1) IRE_IF_CLONE and 2) manually added IRE_INTERFACE. 1171 * 1172 * We also remove references on ire_nce_cache entries that refer to the ill. 
1173 */ 1174 void 1175 ill_downi(ire_t *ire, char *ill_arg) 1176 { 1177 ill_t *ill = (ill_t *)ill_arg; 1178 nce_t *nce; 1179 1180 mutex_enter(&ire->ire_lock); 1181 nce = ire->ire_nce_cache; 1182 if (nce != NULL && nce->nce_ill == ill) 1183 ire->ire_nce_cache = NULL; 1184 else 1185 nce = NULL; 1186 mutex_exit(&ire->ire_lock); 1187 if (nce != NULL) 1188 nce_refrele(nce); 1189 if (ire->ire_ill == ill) 1190 ire_delete(ire); 1191 } 1192 1193 /* Remove IRE_IF_CLONE on this ill */ 1194 void 1195 ill_downi_if_clone(ire_t *ire, char *ill_arg) 1196 { 1197 ill_t *ill = (ill_t *)ill_arg; 1198 1199 ASSERT(ire->ire_type & IRE_IF_CLONE); 1200 if (ire->ire_ill == ill) 1201 ire_delete(ire); 1202 } 1203 1204 /* Consume an M_IOCACK of the fastpath probe. */ 1205 void 1206 ill_fastpath_ack(ill_t *ill, mblk_t *mp) 1207 { 1208 mblk_t *mp1 = mp; 1209 1210 /* 1211 * If this was the first attempt turn on the fastpath probing. 1212 */ 1213 mutex_enter(&ill->ill_lock); 1214 if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS) 1215 ill->ill_dlpi_fastpath_state = IDS_OK; 1216 mutex_exit(&ill->ill_lock); 1217 1218 /* Free the M_IOCACK mblk, hold on to the data */ 1219 mp = mp->b_cont; 1220 freeb(mp1); 1221 if (mp == NULL) 1222 return; 1223 if (mp->b_cont != NULL) 1224 nce_fastpath_update(ill, mp); 1225 else 1226 ip0dbg(("ill_fastpath_ack: no b_cont\n")); 1227 freemsg(mp); 1228 } 1229 1230 /* 1231 * Throw an M_IOCTL message downstream asking "do you know fastpath?" 1232 * The data portion of the request is a dl_unitdata_req_t template for 1233 * what we would send downstream in the absence of a fastpath confirmation. 1234 */ 1235 int 1236 ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp) 1237 { 1238 struct iocblk *ioc; 1239 mblk_t *mp; 1240 1241 if (dlur_mp == NULL) 1242 return (EINVAL); 1243 1244 mutex_enter(&ill->ill_lock); 1245 switch (ill->ill_dlpi_fastpath_state) { 1246 case IDS_FAILED: 1247 /* 1248 * Driver NAKed the first fastpath ioctl - assume it doesn't 1249 * support it. 
1250 */ 1251 mutex_exit(&ill->ill_lock); 1252 return (ENOTSUP); 1253 case IDS_UNKNOWN: 1254 /* This is the first probe */ 1255 ill->ill_dlpi_fastpath_state = IDS_INPROGRESS; 1256 break; 1257 default: 1258 break; 1259 } 1260 mutex_exit(&ill->ill_lock); 1261 1262 if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL) 1263 return (EAGAIN); 1264 1265 mp->b_cont = copyb(dlur_mp); 1266 if (mp->b_cont == NULL) { 1267 freeb(mp); 1268 return (EAGAIN); 1269 } 1270 1271 ioc = (struct iocblk *)mp->b_rptr; 1272 ioc->ioc_count = msgdsize(mp->b_cont); 1273 1274 DTRACE_PROBE3(ill__dlpi, char *, "ill_fastpath_probe", 1275 char *, "DL_IOC_HDR_INFO", ill_t *, ill); 1276 putnext(ill->ill_wq, mp); 1277 return (0); 1278 } 1279 1280 void 1281 ill_capability_probe(ill_t *ill) 1282 { 1283 mblk_t *mp; 1284 1285 ASSERT(IAM_WRITER_ILL(ill)); 1286 1287 if (ill->ill_dlpi_capab_state != IDCS_UNKNOWN && 1288 ill->ill_dlpi_capab_state != IDCS_FAILED) 1289 return; 1290 1291 /* 1292 * We are starting a new cycle of capability negotiation. 1293 * Free up the capab reset messages of any previous incarnation. 1294 * We will do a fresh allocation when we get the response to our probe 1295 */ 1296 if (ill->ill_capab_reset_mp != NULL) { 1297 freemsg(ill->ill_capab_reset_mp); 1298 ill->ill_capab_reset_mp = NULL; 1299 } 1300 1301 ip1dbg(("ill_capability_probe: starting capability negotiation\n")); 1302 1303 mp = ip_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ); 1304 if (mp == NULL) 1305 return; 1306 1307 ill_capability_send(ill, mp); 1308 ill->ill_dlpi_capab_state = IDCS_PROBE_SENT; 1309 } 1310 1311 void 1312 ill_capability_reset(ill_t *ill, boolean_t reneg) 1313 { 1314 ASSERT(IAM_WRITER_ILL(ill)); 1315 1316 if (ill->ill_dlpi_capab_state != IDCS_OK) 1317 return; 1318 1319 ill->ill_dlpi_capab_state = reneg ? 
IDCS_RENEG : IDCS_RESET_SENT; 1320 1321 ill_capability_send(ill, ill->ill_capab_reset_mp); 1322 ill->ill_capab_reset_mp = NULL; 1323 /* 1324 * We turn off all capabilities except those pertaining to 1325 * direct function call capabilities viz. ILL_CAPAB_DLD* 1326 * which will be turned off by the corresponding reset functions. 1327 */ 1328 ill->ill_capabilities &= ~(ILL_CAPAB_HCKSUM | ILL_CAPAB_ZEROCOPY); 1329 } 1330 1331 static void 1332 ill_capability_reset_alloc(ill_t *ill) 1333 { 1334 mblk_t *mp; 1335 size_t size = 0; 1336 int err; 1337 dl_capability_req_t *capb; 1338 1339 ASSERT(IAM_WRITER_ILL(ill)); 1340 ASSERT(ill->ill_capab_reset_mp == NULL); 1341 1342 if (ILL_HCKSUM_CAPABLE(ill)) { 1343 size += sizeof (dl_capability_sub_t) + 1344 sizeof (dl_capab_hcksum_t); 1345 } 1346 1347 if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) { 1348 size += sizeof (dl_capability_sub_t) + 1349 sizeof (dl_capab_zerocopy_t); 1350 } 1351 1352 if (ill->ill_capabilities & ILL_CAPAB_DLD) { 1353 size += sizeof (dl_capability_sub_t) + 1354 sizeof (dl_capab_dld_t); 1355 } 1356 1357 mp = allocb_wait(size + sizeof (dl_capability_req_t), BPRI_MED, 1358 STR_NOSIG, &err); 1359 1360 mp->b_datap->db_type = M_PROTO; 1361 bzero(mp->b_rptr, size + sizeof (dl_capability_req_t)); 1362 1363 capb = (dl_capability_req_t *)mp->b_rptr; 1364 capb->dl_primitive = DL_CAPABILITY_REQ; 1365 capb->dl_sub_offset = sizeof (dl_capability_req_t); 1366 capb->dl_sub_length = size; 1367 1368 mp->b_wptr += sizeof (dl_capability_req_t); 1369 1370 /* 1371 * Each handler fills in the corresponding dl_capability_sub_t 1372 * inside the mblk, 1373 */ 1374 ill_capability_hcksum_reset_fill(ill, mp); 1375 ill_capability_zerocopy_reset_fill(ill, mp); 1376 ill_capability_dld_reset_fill(ill, mp); 1377 1378 ill->ill_capab_reset_mp = mp; 1379 } 1380 1381 static void 1382 ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers) 1383 { 1384 dl_capab_id_t *id_ic; 1385 uint_t sub_dl_cap = outers->dl_cap; 1386 
dl_capability_sub_t *inners; 1387 uint8_t *capend; 1388 1389 ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER); 1390 1391 /* 1392 * Note: range checks here are not absolutely sufficient to 1393 * make us robust against malformed messages sent by drivers; 1394 * this is in keeping with the rest of IP's dlpi handling. 1395 * (Remember, it's coming from something else in the kernel 1396 * address space) 1397 */ 1398 1399 capend = (uint8_t *)(outers + 1) + outers->dl_length; 1400 if (capend > mp->b_wptr) { 1401 cmn_err(CE_WARN, "ill_capability_id_ack: " 1402 "malformed sub-capability too long for mblk"); 1403 return; 1404 } 1405 1406 id_ic = (dl_capab_id_t *)(outers + 1); 1407 1408 if (outers->dl_length < sizeof (*id_ic) || 1409 (inners = &id_ic->id_subcap, 1410 inners->dl_length > (outers->dl_length - sizeof (*inners)))) { 1411 cmn_err(CE_WARN, "ill_capability_id_ack: malformed " 1412 "encapsulated capab type %d too long for mblk", 1413 inners->dl_cap); 1414 return; 1415 } 1416 1417 if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) { 1418 ip1dbg(("ill_capability_id_ack: mid token for capab type %d " 1419 "isn't as expected; pass-thru module(s) detected, " 1420 "discarding capability\n", inners->dl_cap)); 1421 return; 1422 } 1423 1424 /* Process the encapsulated sub-capability */ 1425 ill_capability_dispatch(ill, mp, inners); 1426 } 1427 1428 static void 1429 ill_capability_dld_reset_fill(ill_t *ill, mblk_t *mp) 1430 { 1431 dl_capability_sub_t *dl_subcap; 1432 1433 if (!(ill->ill_capabilities & ILL_CAPAB_DLD)) 1434 return; 1435 1436 /* 1437 * The dl_capab_dld_t that follows the dl_capability_sub_t is not 1438 * initialized below since it is not used by DLD. 
1439 */ 1440 dl_subcap = (dl_capability_sub_t *)mp->b_wptr; 1441 dl_subcap->dl_cap = DL_CAPAB_DLD; 1442 dl_subcap->dl_length = sizeof (dl_capab_dld_t); 1443 1444 mp->b_wptr += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t); 1445 } 1446 1447 static void 1448 ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp) 1449 { 1450 /* 1451 * If no ipif was brought up over this ill, this DL_CAPABILITY_REQ/ACK 1452 * is only to get the VRRP capability. 1453 * 1454 * Note that we cannot check ill_ipif_up_count here since 1455 * ill_ipif_up_count is only incremented when the resolver is setup. 1456 * That is done asynchronously, and can race with this function. 1457 */ 1458 if (!ill->ill_dl_up) { 1459 if (subp->dl_cap == DL_CAPAB_VRRP) 1460 ill_capability_vrrp_ack(ill, mp, subp); 1461 return; 1462 } 1463 1464 switch (subp->dl_cap) { 1465 case DL_CAPAB_HCKSUM: 1466 ill_capability_hcksum_ack(ill, mp, subp); 1467 break; 1468 case DL_CAPAB_ZEROCOPY: 1469 ill_capability_zerocopy_ack(ill, mp, subp); 1470 break; 1471 case DL_CAPAB_DLD: 1472 ill_capability_dld_ack(ill, mp, subp); 1473 break; 1474 case DL_CAPAB_VRRP: 1475 break; 1476 default: 1477 ip1dbg(("ill_capability_dispatch: unknown capab type %d\n", 1478 subp->dl_cap)); 1479 } 1480 } 1481 1482 /* 1483 * Process the vrrp capability received from a DLS Provider. isub must point 1484 * to the sub-capability (DL_CAPAB_VRRP) of a DL_CAPABILITY_ACK message. 1485 */ 1486 static void 1487 ill_capability_vrrp_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 1488 { 1489 dl_capab_vrrp_t *vrrp; 1490 uint_t sub_dl_cap = isub->dl_cap; 1491 uint8_t *capend; 1492 1493 ASSERT(IAM_WRITER_ILL(ill)); 1494 ASSERT(sub_dl_cap == DL_CAPAB_VRRP); 1495 1496 /* 1497 * Note: range checks here are not absolutely sufficient to 1498 * make us robust against malformed messages sent by drivers; 1499 * this is in keeping with the rest of IP's dlpi handling. 
1500 * (Remember, it's coming from something else in the kernel 1501 * address space) 1502 */ 1503 capend = (uint8_t *)(isub + 1) + isub->dl_length; 1504 if (capend > mp->b_wptr) { 1505 cmn_err(CE_WARN, "ill_capability_vrrp_ack: " 1506 "malformed sub-capability too long for mblk"); 1507 return; 1508 } 1509 vrrp = (dl_capab_vrrp_t *)(isub + 1); 1510 1511 /* 1512 * Compare the IP address family and set ILLF_VRRP for the right ill. 1513 */ 1514 if ((vrrp->vrrp_af == AF_INET6 && ill->ill_isv6) || 1515 (vrrp->vrrp_af == AF_INET && !ill->ill_isv6)) { 1516 ill->ill_flags |= ILLF_VRRP; 1517 } 1518 } 1519 1520 /* 1521 * Process a hardware checksum offload capability negotiation ack received 1522 * from a DLS Provider.isub must point to the sub-capability (DL_CAPAB_HCKSUM) 1523 * of a DL_CAPABILITY_ACK message. 1524 */ 1525 static void 1526 ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 1527 { 1528 dl_capability_req_t *ocap; 1529 dl_capab_hcksum_t *ihck, *ohck; 1530 ill_hcksum_capab_t **ill_hcksum; 1531 mblk_t *nmp = NULL; 1532 uint_t sub_dl_cap = isub->dl_cap; 1533 uint8_t *capend; 1534 1535 ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM); 1536 1537 ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab; 1538 1539 /* 1540 * Note: range checks here are not absolutely sufficient to 1541 * make us robust against malformed messages sent by drivers; 1542 * this is in keeping with the rest of IP's dlpi handling. 1543 * (Remember, it's coming from something else in the kernel 1544 * address space) 1545 */ 1546 capend = (uint8_t *)(isub + 1) + isub->dl_length; 1547 if (capend > mp->b_wptr) { 1548 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 1549 "malformed sub-capability too long for mblk"); 1550 return; 1551 } 1552 1553 /* 1554 * There are two types of acks we process here: 1555 * 1. acks in reply to a (first form) generic capability req 1556 * (no ENABLE flag set) 1557 * 2. acks in reply to a ENABLE capability req. 
1558 * (ENABLE flag set) 1559 */ 1560 ihck = (dl_capab_hcksum_t *)(isub + 1); 1561 1562 if (ihck->hcksum_version != HCKSUM_VERSION_1) { 1563 cmn_err(CE_CONT, "ill_capability_hcksum_ack: " 1564 "unsupported hardware checksum " 1565 "sub-capability (version %d, expected %d)", 1566 ihck->hcksum_version, HCKSUM_VERSION_1); 1567 return; 1568 } 1569 1570 if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) { 1571 ip1dbg(("ill_capability_hcksum_ack: mid token for hardware " 1572 "checksum capability isn't as expected; pass-thru " 1573 "module(s) detected, discarding capability\n")); 1574 return; 1575 } 1576 1577 #define CURR_HCKSUM_CAPAB \ 1578 (HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 | \ 1579 HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM) 1580 1581 if ((ihck->hcksum_txflags & HCKSUM_ENABLE) && 1582 (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) { 1583 /* do ENABLE processing */ 1584 if (*ill_hcksum == NULL) { 1585 *ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t), 1586 KM_NOSLEEP); 1587 1588 if (*ill_hcksum == NULL) { 1589 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 1590 "could not enable hcksum version %d " 1591 "for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION, 1592 ill->ill_name); 1593 return; 1594 } 1595 } 1596 1597 (*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version; 1598 (*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags; 1599 ill->ill_capabilities |= ILL_CAPAB_HCKSUM; 1600 ip1dbg(("ill_capability_hcksum_ack: interface %s " 1601 "has enabled hardware checksumming\n ", 1602 ill->ill_name)); 1603 } else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) { 1604 /* 1605 * Enabling hardware checksum offload 1606 * Currently IP supports {TCP,UDP}/IPv4 1607 * partial and full cksum offload and 1608 * IPv4 header checksum offload. 1609 * Allocate new mblk which will 1610 * contain a new capability request 1611 * to enable hardware checksum offload. 
1612 */ 1613 uint_t size; 1614 uchar_t *rptr; 1615 1616 size = sizeof (dl_capability_req_t) + 1617 sizeof (dl_capability_sub_t) + isub->dl_length; 1618 1619 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 1620 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 1621 "could not enable hardware cksum for %s (ENOMEM)\n", 1622 ill->ill_name); 1623 return; 1624 } 1625 1626 rptr = nmp->b_rptr; 1627 /* initialize dl_capability_req_t */ 1628 ocap = (dl_capability_req_t *)nmp->b_rptr; 1629 ocap->dl_sub_offset = 1630 sizeof (dl_capability_req_t); 1631 ocap->dl_sub_length = 1632 sizeof (dl_capability_sub_t) + 1633 isub->dl_length; 1634 nmp->b_rptr += sizeof (dl_capability_req_t); 1635 1636 /* initialize dl_capability_sub_t */ 1637 bcopy(isub, nmp->b_rptr, sizeof (*isub)); 1638 nmp->b_rptr += sizeof (*isub); 1639 1640 /* initialize dl_capab_hcksum_t */ 1641 ohck = (dl_capab_hcksum_t *)nmp->b_rptr; 1642 bcopy(ihck, ohck, sizeof (*ihck)); 1643 1644 nmp->b_rptr = rptr; 1645 ASSERT(nmp->b_wptr == (nmp->b_rptr + size)); 1646 1647 /* Set ENABLE flag */ 1648 ohck->hcksum_txflags &= CURR_HCKSUM_CAPAB; 1649 ohck->hcksum_txflags |= HCKSUM_ENABLE; 1650 1651 /* 1652 * nmp points to a DL_CAPABILITY_REQ message to enable 1653 * hardware checksum acceleration. 
1654 */ 1655 ill_capability_send(ill, nmp); 1656 } else { 1657 ip1dbg(("ill_capability_hcksum_ack: interface %s has " 1658 "advertised %x hardware checksum capability flags\n", 1659 ill->ill_name, ihck->hcksum_txflags)); 1660 } 1661 } 1662 1663 static void 1664 ill_capability_hcksum_reset_fill(ill_t *ill, mblk_t *mp) 1665 { 1666 dl_capab_hcksum_t *hck_subcap; 1667 dl_capability_sub_t *dl_subcap; 1668 1669 if (!ILL_HCKSUM_CAPABLE(ill)) 1670 return; 1671 1672 ASSERT(ill->ill_hcksum_capab != NULL); 1673 1674 dl_subcap = (dl_capability_sub_t *)mp->b_wptr; 1675 dl_subcap->dl_cap = DL_CAPAB_HCKSUM; 1676 dl_subcap->dl_length = sizeof (*hck_subcap); 1677 1678 hck_subcap = (dl_capab_hcksum_t *)(dl_subcap + 1); 1679 hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version; 1680 hck_subcap->hcksum_txflags = 0; 1681 1682 mp->b_wptr += sizeof (*dl_subcap) + sizeof (*hck_subcap); 1683 } 1684 1685 static void 1686 ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 1687 { 1688 mblk_t *nmp = NULL; 1689 dl_capability_req_t *oc; 1690 dl_capab_zerocopy_t *zc_ic, *zc_oc; 1691 ill_zerocopy_capab_t **ill_zerocopy_capab; 1692 uint_t sub_dl_cap = isub->dl_cap; 1693 uint8_t *capend; 1694 1695 ASSERT(sub_dl_cap == DL_CAPAB_ZEROCOPY); 1696 1697 ill_zerocopy_capab = (ill_zerocopy_capab_t **)&ill->ill_zerocopy_capab; 1698 1699 /* 1700 * Note: range checks here are not absolutely sufficient to 1701 * make us robust against malformed messages sent by drivers; 1702 * this is in keeping with the rest of IP's dlpi handling. 
1703 * (Remember, it's coming from something else in the kernel 1704 * address space) 1705 */ 1706 capend = (uint8_t *)(isub + 1) + isub->dl_length; 1707 if (capend > mp->b_wptr) { 1708 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 1709 "malformed sub-capability too long for mblk"); 1710 return; 1711 } 1712 1713 zc_ic = (dl_capab_zerocopy_t *)(isub + 1); 1714 if (zc_ic->zerocopy_version != ZEROCOPY_VERSION_1) { 1715 cmn_err(CE_CONT, "ill_capability_zerocopy_ack: " 1716 "unsupported ZEROCOPY sub-capability (version %d, " 1717 "expected %d)", zc_ic->zerocopy_version, 1718 ZEROCOPY_VERSION_1); 1719 return; 1720 } 1721 1722 if (!dlcapabcheckqid(&zc_ic->zerocopy_mid, ill->ill_lmod_rq)) { 1723 ip1dbg(("ill_capability_zerocopy_ack: mid token for zerocopy " 1724 "capability isn't as expected; pass-thru module(s) " 1725 "detected, discarding capability\n")); 1726 return; 1727 } 1728 1729 if ((zc_ic->zerocopy_flags & DL_CAPAB_VMSAFE_MEM) != 0) { 1730 if (*ill_zerocopy_capab == NULL) { 1731 *ill_zerocopy_capab = 1732 kmem_zalloc(sizeof (ill_zerocopy_capab_t), 1733 KM_NOSLEEP); 1734 1735 if (*ill_zerocopy_capab == NULL) { 1736 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 1737 "could not enable Zero-copy version %d " 1738 "for %s (ENOMEM)\n", ZEROCOPY_VERSION_1, 1739 ill->ill_name); 1740 return; 1741 } 1742 } 1743 1744 ip1dbg(("ill_capability_zerocopy_ack: interface %s " 1745 "supports Zero-copy version %d\n", ill->ill_name, 1746 ZEROCOPY_VERSION_1)); 1747 1748 (*ill_zerocopy_capab)->ill_zerocopy_version = 1749 zc_ic->zerocopy_version; 1750 (*ill_zerocopy_capab)->ill_zerocopy_flags = 1751 zc_ic->zerocopy_flags; 1752 1753 ill->ill_capabilities |= ILL_CAPAB_ZEROCOPY; 1754 } else { 1755 uint_t size; 1756 uchar_t *rptr; 1757 1758 size = sizeof (dl_capability_req_t) + 1759 sizeof (dl_capability_sub_t) + 1760 sizeof (dl_capab_zerocopy_t); 1761 1762 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 1763 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 1764 "could 
not enable zerocopy for %s (ENOMEM)\n", 1765 ill->ill_name); 1766 return; 1767 } 1768 1769 rptr = nmp->b_rptr; 1770 /* initialize dl_capability_req_t */ 1771 oc = (dl_capability_req_t *)rptr; 1772 oc->dl_sub_offset = sizeof (dl_capability_req_t); 1773 oc->dl_sub_length = sizeof (dl_capability_sub_t) + 1774 sizeof (dl_capab_zerocopy_t); 1775 rptr += sizeof (dl_capability_req_t); 1776 1777 /* initialize dl_capability_sub_t */ 1778 bcopy(isub, rptr, sizeof (*isub)); 1779 rptr += sizeof (*isub); 1780 1781 /* initialize dl_capab_zerocopy_t */ 1782 zc_oc = (dl_capab_zerocopy_t *)rptr; 1783 *zc_oc = *zc_ic; 1784 1785 ip1dbg(("ill_capability_zerocopy_ack: asking interface %s " 1786 "to enable zero-copy version %d\n", ill->ill_name, 1787 ZEROCOPY_VERSION_1)); 1788 1789 /* set VMSAFE_MEM flag */ 1790 zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM; 1791 1792 /* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */ 1793 ill_capability_send(ill, nmp); 1794 } 1795 } 1796 1797 static void 1798 ill_capability_zerocopy_reset_fill(ill_t *ill, mblk_t *mp) 1799 { 1800 dl_capab_zerocopy_t *zerocopy_subcap; 1801 dl_capability_sub_t *dl_subcap; 1802 1803 if (!(ill->ill_capabilities & ILL_CAPAB_ZEROCOPY)) 1804 return; 1805 1806 ASSERT(ill->ill_zerocopy_capab != NULL); 1807 1808 dl_subcap = (dl_capability_sub_t *)mp->b_wptr; 1809 dl_subcap->dl_cap = DL_CAPAB_ZEROCOPY; 1810 dl_subcap->dl_length = sizeof (*zerocopy_subcap); 1811 1812 zerocopy_subcap = (dl_capab_zerocopy_t *)(dl_subcap + 1); 1813 zerocopy_subcap->zerocopy_version = 1814 ill->ill_zerocopy_capab->ill_zerocopy_version; 1815 zerocopy_subcap->zerocopy_flags = 0; 1816 1817 mp->b_wptr += sizeof (*dl_subcap) + sizeof (*zerocopy_subcap); 1818 } 1819 1820 /* 1821 * DLD capability 1822 * Refer to dld.h for more information regarding the purpose and usage 1823 * of this capability. 
1824 */ 1825 static void 1826 ill_capability_dld_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 1827 { 1828 dl_capab_dld_t *dld_ic, dld; 1829 uint_t sub_dl_cap = isub->dl_cap; 1830 uint8_t *capend; 1831 ill_dld_capab_t *idc; 1832 1833 ASSERT(IAM_WRITER_ILL(ill)); 1834 ASSERT(sub_dl_cap == DL_CAPAB_DLD); 1835 1836 /* 1837 * Note: range checks here are not absolutely sufficient to 1838 * make us robust against malformed messages sent by drivers; 1839 * this is in keeping with the rest of IP's dlpi handling. 1840 * (Remember, it's coming from something else in the kernel 1841 * address space) 1842 */ 1843 capend = (uint8_t *)(isub + 1) + isub->dl_length; 1844 if (capend > mp->b_wptr) { 1845 cmn_err(CE_WARN, "ill_capability_dld_ack: " 1846 "malformed sub-capability too long for mblk"); 1847 return; 1848 } 1849 dld_ic = (dl_capab_dld_t *)(isub + 1); 1850 if (dld_ic->dld_version != DLD_CURRENT_VERSION) { 1851 cmn_err(CE_CONT, "ill_capability_dld_ack: " 1852 "unsupported DLD sub-capability (version %d, " 1853 "expected %d)", dld_ic->dld_version, 1854 DLD_CURRENT_VERSION); 1855 return; 1856 } 1857 if (!dlcapabcheckqid(&dld_ic->dld_mid, ill->ill_lmod_rq)) { 1858 ip1dbg(("ill_capability_dld_ack: mid token for dld " 1859 "capability isn't as expected; pass-thru module(s) " 1860 "detected, discarding capability\n")); 1861 return; 1862 } 1863 1864 /* 1865 * Copy locally to ensure alignment. 
1866 */ 1867 bcopy(dld_ic, &dld, sizeof (dl_capab_dld_t)); 1868 1869 if ((idc = ill->ill_dld_capab) == NULL) { 1870 idc = kmem_zalloc(sizeof (ill_dld_capab_t), KM_NOSLEEP); 1871 if (idc == NULL) { 1872 cmn_err(CE_WARN, "ill_capability_dld_ack: " 1873 "could not enable DLD version %d " 1874 "for %s (ENOMEM)\n", DLD_CURRENT_VERSION, 1875 ill->ill_name); 1876 return; 1877 } 1878 ill->ill_dld_capab = idc; 1879 } 1880 idc->idc_capab_df = (ip_capab_func_t)dld.dld_capab; 1881 idc->idc_capab_dh = (void *)dld.dld_capab_handle; 1882 ip1dbg(("ill_capability_dld_ack: interface %s " 1883 "supports DLD version %d\n", ill->ill_name, DLD_CURRENT_VERSION)); 1884 1885 ill_capability_dld_enable(ill); 1886 } 1887 1888 /* 1889 * Typically capability negotiation between IP and the driver happens via 1890 * DLPI message exchange. However GLD also offers a direct function call 1891 * mechanism to exchange the DLD_DIRECT_CAPAB and DLD_POLL_CAPAB capabilities, 1892 * But arbitrary function calls into IP or GLD are not permitted, since both 1893 * of them are protected by their own perimeter mechanism. The perimeter can 1894 * be viewed as a coarse lock or serialization mechanism. The hierarchy of 1895 * these perimeters is IP -> MAC. Thus for example to enable the squeue 1896 * polling, IP needs to enter its perimeter, then call ill_mac_perim_enter 1897 * to enter the mac perimeter and then do the direct function calls into 1898 * GLD to enable squeue polling. The ring related callbacks from the mac into 1899 * the stack to add, bind, quiesce, restart or cleanup a ring are all 1900 * protected by the mac perimeter. 
1901 */ 1902 static void 1903 ill_mac_perim_enter(ill_t *ill, mac_perim_handle_t *mphp) 1904 { 1905 ill_dld_capab_t *idc = ill->ill_dld_capab; 1906 int err; 1907 1908 err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mphp, 1909 DLD_ENABLE); 1910 ASSERT(err == 0); 1911 } 1912 1913 static void 1914 ill_mac_perim_exit(ill_t *ill, mac_perim_handle_t mph) 1915 { 1916 ill_dld_capab_t *idc = ill->ill_dld_capab; 1917 int err; 1918 1919 err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mph, 1920 DLD_DISABLE); 1921 ASSERT(err == 0); 1922 } 1923 1924 boolean_t 1925 ill_mac_perim_held(ill_t *ill) 1926 { 1927 ill_dld_capab_t *idc = ill->ill_dld_capab; 1928 1929 return (idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, NULL, 1930 DLD_QUERY)); 1931 } 1932 1933 static void 1934 ill_capability_direct_enable(ill_t *ill) 1935 { 1936 ill_dld_capab_t *idc = ill->ill_dld_capab; 1937 ill_dld_direct_t *idd = &idc->idc_direct; 1938 dld_capab_direct_t direct; 1939 int rc; 1940 1941 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); 1942 1943 bzero(&direct, sizeof (direct)); 1944 direct.di_rx_cf = (uintptr_t)ip_input; 1945 direct.di_rx_ch = ill; 1946 1947 rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, &direct, 1948 DLD_ENABLE); 1949 if (rc == 0) { 1950 idd->idd_tx_df = (ip_dld_tx_t)direct.di_tx_df; 1951 idd->idd_tx_dh = direct.di_tx_dh; 1952 idd->idd_tx_cb_df = (ip_dld_callb_t)direct.di_tx_cb_df; 1953 idd->idd_tx_cb_dh = direct.di_tx_cb_dh; 1954 idd->idd_tx_fctl_df = (ip_dld_fctl_t)direct.di_tx_fctl_df; 1955 idd->idd_tx_fctl_dh = direct.di_tx_fctl_dh; 1956 ASSERT(idd->idd_tx_cb_df != NULL); 1957 ASSERT(idd->idd_tx_fctl_df != NULL); 1958 ASSERT(idd->idd_tx_df != NULL); 1959 /* 1960 * One time registration of flow enable callback function 1961 */ 1962 ill->ill_flownotify_mh = idd->idd_tx_cb_df(idd->idd_tx_cb_dh, 1963 ill_flow_enable, ill); 1964 ill->ill_capabilities |= ILL_CAPAB_DLD_DIRECT; 1965 DTRACE_PROBE1(direct_on, (ill_t *), ill); 1966 } else { 1967 
cmn_err(CE_WARN, "warning: could not enable DIRECT " 1968 "capability, rc = %d\n", rc); 1969 DTRACE_PROBE2(direct_off, (ill_t *), ill, (int), rc); 1970 } 1971 } 1972 1973 static void 1974 ill_capability_poll_enable(ill_t *ill) 1975 { 1976 ill_dld_capab_t *idc = ill->ill_dld_capab; 1977 dld_capab_poll_t poll; 1978 int rc; 1979 1980 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); 1981 1982 bzero(&poll, sizeof (poll)); 1983 poll.poll_ring_add_cf = (uintptr_t)ip_squeue_add_ring; 1984 poll.poll_ring_remove_cf = (uintptr_t)ip_squeue_clean_ring; 1985 poll.poll_ring_quiesce_cf = (uintptr_t)ip_squeue_quiesce_ring; 1986 poll.poll_ring_restart_cf = (uintptr_t)ip_squeue_restart_ring; 1987 poll.poll_ring_bind_cf = (uintptr_t)ip_squeue_bind_ring; 1988 poll.poll_ring_ch = ill; 1989 rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, &poll, 1990 DLD_ENABLE); 1991 if (rc == 0) { 1992 ill->ill_capabilities |= ILL_CAPAB_DLD_POLL; 1993 DTRACE_PROBE1(poll_on, (ill_t *), ill); 1994 } else { 1995 ip1dbg(("warning: could not enable POLL " 1996 "capability, rc = %d\n", rc)); 1997 DTRACE_PROBE2(poll_off, (ill_t *), ill, (int), rc); 1998 } 1999 } 2000 2001 /* 2002 * Enable the LSO capability. 
2003 */ 2004 static void 2005 ill_capability_lso_enable(ill_t *ill) 2006 { 2007 ill_dld_capab_t *idc = ill->ill_dld_capab; 2008 dld_capab_lso_t lso; 2009 int rc; 2010 2011 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); 2012 2013 if (ill->ill_lso_capab == NULL) { 2014 ill->ill_lso_capab = kmem_zalloc(sizeof (ill_lso_capab_t), 2015 KM_NOSLEEP); 2016 if (ill->ill_lso_capab == NULL) { 2017 cmn_err(CE_WARN, "ill_capability_lso_enable: " 2018 "could not enable LSO for %s (ENOMEM)\n", 2019 ill->ill_name); 2020 return; 2021 } 2022 } 2023 2024 bzero(&lso, sizeof (lso)); 2025 if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, &lso, 2026 DLD_ENABLE)) == 0) { 2027 ill->ill_lso_capab->ill_lso_flags = lso.lso_flags; 2028 ill->ill_lso_capab->ill_lso_max = lso.lso_max; 2029 ill->ill_capabilities |= ILL_CAPAB_LSO; 2030 ip1dbg(("ill_capability_lso_enable: interface %s " 2031 "has enabled LSO\n ", ill->ill_name)); 2032 } else { 2033 kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t)); 2034 ill->ill_lso_capab = NULL; 2035 DTRACE_PROBE2(lso_off, (ill_t *), ill, (int), rc); 2036 } 2037 } 2038 2039 static void 2040 ill_capability_dld_enable(ill_t *ill) 2041 { 2042 mac_perim_handle_t mph; 2043 2044 ASSERT(IAM_WRITER_ILL(ill)); 2045 2046 if (ill->ill_isv6) 2047 return; 2048 2049 ill_mac_perim_enter(ill, &mph); 2050 if (!ill->ill_isv6) { 2051 ill_capability_direct_enable(ill); 2052 ill_capability_poll_enable(ill); 2053 ill_capability_lso_enable(ill); 2054 } 2055 ill->ill_capabilities |= ILL_CAPAB_DLD; 2056 ill_mac_perim_exit(ill, mph); 2057 } 2058 2059 static void 2060 ill_capability_dld_disable(ill_t *ill) 2061 { 2062 ill_dld_capab_t *idc; 2063 ill_dld_direct_t *idd; 2064 mac_perim_handle_t mph; 2065 2066 ASSERT(IAM_WRITER_ILL(ill)); 2067 2068 if (!(ill->ill_capabilities & ILL_CAPAB_DLD)) 2069 return; 2070 2071 ill_mac_perim_enter(ill, &mph); 2072 2073 idc = ill->ill_dld_capab; 2074 if ((ill->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0) { 2075 /* 2076 * For performance we 
avoid locks in the transmit data path 2077 * and don't maintain a count of the number of threads using 2078 * direct calls. Thus some threads could be using direct 2079 * transmit calls to GLD, even after the capability mechanism 2080 * turns it off. This is still safe since the handles used in 2081 * the direct calls continue to be valid until the unplumb is 2082 * completed. Remove the callback that was added (1-time) at 2083 * capab enable time. 2084 */ 2085 mutex_enter(&ill->ill_lock); 2086 ill->ill_capabilities &= ~ILL_CAPAB_DLD_DIRECT; 2087 mutex_exit(&ill->ill_lock); 2088 if (ill->ill_flownotify_mh != NULL) { 2089 idd = &idc->idc_direct; 2090 idd->idd_tx_cb_df(idd->idd_tx_cb_dh, NULL, 2091 ill->ill_flownotify_mh); 2092 ill->ill_flownotify_mh = NULL; 2093 } 2094 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, 2095 NULL, DLD_DISABLE); 2096 } 2097 2098 if ((ill->ill_capabilities & ILL_CAPAB_DLD_POLL) != 0) { 2099 ill->ill_capabilities &= ~ILL_CAPAB_DLD_POLL; 2100 ip_squeue_clean_all(ill); 2101 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, 2102 NULL, DLD_DISABLE); 2103 } 2104 2105 if ((ill->ill_capabilities & ILL_CAPAB_LSO) != 0) { 2106 ASSERT(ill->ill_lso_capab != NULL); 2107 /* 2108 * Clear the capability flag for LSO but retain the 2109 * ill_lso_capab structure since it's possible that another 2110 * thread is still referring to it. The structure only gets 2111 * deallocated when we destroy the ill. 2112 */ 2113 2114 ill->ill_capabilities &= ~ILL_CAPAB_LSO; 2115 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, 2116 NULL, DLD_DISABLE); 2117 } 2118 2119 ill->ill_capabilities &= ~ILL_CAPAB_DLD; 2120 ill_mac_perim_exit(ill, mph); 2121 } 2122 2123 /* 2124 * Capability Negotiation protocol 2125 * 2126 * We don't wait for DLPI capability operations to finish during interface 2127 * bringup or teardown. Doing so would introduce more asynchrony and the 2128 * interface up/down operations will need multiple return and restarts. 
2129 * Instead the 'ipsq_current_ipif' of the ipsq is not cleared as long as 2130 * the 'ill_dlpi_deferred' chain is non-empty. This ensures that the next 2131 * exclusive operation won't start until the DLPI operations of the previous 2132 * exclusive operation complete. 2133 * 2134 * The capability state machine is shown below. 2135 * 2136 * state next state event, action 2137 * 2138 * IDCS_UNKNOWN IDCS_PROBE_SENT ill_capability_probe 2139 * IDCS_PROBE_SENT IDCS_OK ill_capability_ack 2140 * IDCS_PROBE_SENT IDCS_FAILED ip_rput_dlpi_writer (nack) 2141 * IDCS_OK IDCS_RENEG Receipt of DL_NOTE_CAPAB_RENEG 2142 * IDCS_OK IDCS_RESET_SENT ill_capability_reset 2143 * IDCS_RESET_SENT IDCS_UNKNOWN ill_capability_ack_thr 2144 * IDCS_RENEG IDCS_PROBE_SENT ill_capability_ack_thr -> 2145 * ill_capability_probe. 2146 */ 2147 2148 /* 2149 * Dedicated thread started from ip_stack_init that handles capability 2150 * disable. This thread ensures the taskq dispatch does not fail by waiting 2151 * for resources using TQ_SLEEP. The taskq mechanism is used to ensure 2152 * that direct calls to DLD are done in a cv_waitable context. 
2153 */ 2154 void 2155 ill_taskq_dispatch(ip_stack_t *ipst) 2156 { 2157 callb_cpr_t cprinfo; 2158 char name[64]; 2159 mblk_t *mp; 2160 2161 (void) snprintf(name, sizeof (name), "ill_taskq_dispatch_%d", 2162 ipst->ips_netstack->netstack_stackid); 2163 CALLB_CPR_INIT(&cprinfo, &ipst->ips_capab_taskq_lock, callb_generic_cpr, 2164 name); 2165 mutex_enter(&ipst->ips_capab_taskq_lock); 2166 2167 for (;;) { 2168 mp = ipst->ips_capab_taskq_head; 2169 while (mp != NULL) { 2170 ipst->ips_capab_taskq_head = mp->b_next; 2171 if (ipst->ips_capab_taskq_head == NULL) 2172 ipst->ips_capab_taskq_tail = NULL; 2173 mutex_exit(&ipst->ips_capab_taskq_lock); 2174 mp->b_next = NULL; 2175 2176 VERIFY(taskq_dispatch(system_taskq, 2177 ill_capability_ack_thr, mp, TQ_SLEEP) != 0); 2178 mutex_enter(&ipst->ips_capab_taskq_lock); 2179 mp = ipst->ips_capab_taskq_head; 2180 } 2181 2182 if (ipst->ips_capab_taskq_quit) 2183 break; 2184 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2185 cv_wait(&ipst->ips_capab_taskq_cv, &ipst->ips_capab_taskq_lock); 2186 CALLB_CPR_SAFE_END(&cprinfo, &ipst->ips_capab_taskq_lock); 2187 } 2188 VERIFY(ipst->ips_capab_taskq_head == NULL); 2189 VERIFY(ipst->ips_capab_taskq_tail == NULL); 2190 CALLB_CPR_EXIT(&cprinfo); 2191 thread_exit(); 2192 } 2193 2194 /* 2195 * Consume a new-style hardware capabilities negotiation ack. 2196 * Called via taskq on receipt of DL_CAPABILITY_ACK. 2197 */ 2198 static void 2199 ill_capability_ack_thr(void *arg) 2200 { 2201 mblk_t *mp = arg; 2202 dl_capability_ack_t *capp; 2203 dl_capability_sub_t *subp, *endp; 2204 ill_t *ill; 2205 boolean_t reneg; 2206 2207 ill = (ill_t *)mp->b_prev; 2208 mp->b_prev = NULL; 2209 2210 VERIFY(ipsq_enter(ill, B_FALSE, CUR_OP) == B_TRUE); 2211 2212 if (ill->ill_dlpi_capab_state == IDCS_RESET_SENT || 2213 ill->ill_dlpi_capab_state == IDCS_RENEG) { 2214 /* 2215 * We have received the ack for our DL_CAPAB reset request. 2216 * There isnt' anything in the message that needs processing. 
2217 * All message based capabilities have been disabled, now 2218 * do the function call based capability disable. 2219 */ 2220 reneg = ill->ill_dlpi_capab_state == IDCS_RENEG; 2221 ill_capability_dld_disable(ill); 2222 ill->ill_dlpi_capab_state = IDCS_UNKNOWN; 2223 if (reneg) 2224 ill_capability_probe(ill); 2225 goto done; 2226 } 2227 2228 if (ill->ill_dlpi_capab_state == IDCS_PROBE_SENT) 2229 ill->ill_dlpi_capab_state = IDCS_OK; 2230 2231 capp = (dl_capability_ack_t *)mp->b_rptr; 2232 2233 if (capp->dl_sub_length == 0) { 2234 /* no new-style capabilities */ 2235 goto done; 2236 } 2237 2238 /* make sure the driver supplied correct dl_sub_length */ 2239 if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) { 2240 ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, " 2241 "invalid dl_sub_length (%d)\n", capp->dl_sub_length)); 2242 goto done; 2243 } 2244 2245 #define SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset)) 2246 /* 2247 * There are sub-capabilities. Process the ones we know about. 2248 * Loop until we don't have room for another sub-cap header.. 2249 */ 2250 for (subp = SC(capp, capp->dl_sub_offset), 2251 endp = SC(subp, capp->dl_sub_length - sizeof (*subp)); 2252 subp <= endp; 2253 subp = SC(subp, sizeof (dl_capability_sub_t) + subp->dl_length)) { 2254 2255 switch (subp->dl_cap) { 2256 case DL_CAPAB_ID_WRAPPER: 2257 ill_capability_id_ack(ill, mp, subp); 2258 break; 2259 default: 2260 ill_capability_dispatch(ill, mp, subp); 2261 break; 2262 } 2263 } 2264 #undef SC 2265 done: 2266 inet_freemsg(mp); 2267 ill_capability_done(ill); 2268 ipsq_exit(ill->ill_phyint->phyint_ipsq); 2269 } 2270 2271 /* 2272 * This needs to be started in a taskq thread to provide a cv_waitable 2273 * context. 
2274 */ 2275 void 2276 ill_capability_ack(ill_t *ill, mblk_t *mp) 2277 { 2278 ip_stack_t *ipst = ill->ill_ipst; 2279 2280 mp->b_prev = (mblk_t *)ill; 2281 ASSERT(mp->b_next == NULL); 2282 2283 if (taskq_dispatch(system_taskq, ill_capability_ack_thr, mp, 2284 TQ_NOSLEEP) != 0) 2285 return; 2286 2287 /* 2288 * The taskq dispatch failed. Signal the ill_taskq_dispatch thread 2289 * which will do the dispatch using TQ_SLEEP to guarantee success. 2290 */ 2291 mutex_enter(&ipst->ips_capab_taskq_lock); 2292 if (ipst->ips_capab_taskq_head == NULL) { 2293 ASSERT(ipst->ips_capab_taskq_tail == NULL); 2294 ipst->ips_capab_taskq_head = mp; 2295 } else { 2296 ipst->ips_capab_taskq_tail->b_next = mp; 2297 } 2298 ipst->ips_capab_taskq_tail = mp; 2299 2300 cv_signal(&ipst->ips_capab_taskq_cv); 2301 mutex_exit(&ipst->ips_capab_taskq_lock); 2302 } 2303 2304 /* 2305 * This routine is called to scan the fragmentation reassembly table for 2306 * the specified ILL for any packets that are starting to smell. 2307 * dead_interval is the maximum time in seconds that will be tolerated. It 2308 * will either be the value specified in ip_g_frag_timeout, or zero if the 2309 * ILL is shutting down and it is time to blow everything off. 2310 * 2311 * It returns the number of seconds (as a time_t) that the next frag timer 2312 * should be scheduled for, 0 meaning that the timer doesn't need to be 2313 * re-started. Note that the method of calculating next_timeout isn't 2314 * entirely accurate since time will flow between the time we grab 2315 * current_time and the time we schedule the next timeout. This isn't a 2316 * big problem since this is the timer for sending an ICMP reassembly time 2317 * exceeded messages, and it doesn't have to be exactly accurate. 2318 * 2319 * This function is 2320 * sometimes called as writer, although this is not required. 
2321 */ 2322 time_t 2323 ill_frag_timeout(ill_t *ill, time_t dead_interval) 2324 { 2325 ipfb_t *ipfb; 2326 ipfb_t *endp; 2327 ipf_t *ipf; 2328 ipf_t *ipfnext; 2329 mblk_t *mp; 2330 time_t current_time = gethrestime_sec(); 2331 time_t next_timeout = 0; 2332 uint32_t hdr_length; 2333 mblk_t *send_icmp_head; 2334 mblk_t *send_icmp_head_v6; 2335 ip_stack_t *ipst = ill->ill_ipst; 2336 ip_recv_attr_t iras; 2337 2338 bzero(&iras, sizeof (iras)); 2339 iras.ira_flags = 0; 2340 iras.ira_ill = iras.ira_rill = ill; 2341 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; 2342 iras.ira_rifindex = iras.ira_ruifindex; 2343 2344 ipfb = ill->ill_frag_hash_tbl; 2345 if (ipfb == NULL) 2346 return (B_FALSE); 2347 endp = &ipfb[ILL_FRAG_HASH_TBL_COUNT]; 2348 /* Walk the frag hash table. */ 2349 for (; ipfb < endp; ipfb++) { 2350 send_icmp_head = NULL; 2351 send_icmp_head_v6 = NULL; 2352 mutex_enter(&ipfb->ipfb_lock); 2353 while ((ipf = ipfb->ipfb_ipf) != 0) { 2354 time_t frag_time = current_time - ipf->ipf_timestamp; 2355 time_t frag_timeout; 2356 2357 if (frag_time < dead_interval) { 2358 /* 2359 * There are some outstanding fragments 2360 * that will timeout later. Make note of 2361 * the time so that we can reschedule the 2362 * next timeout appropriately. 2363 */ 2364 frag_timeout = dead_interval - frag_time; 2365 if (next_timeout == 0 || 2366 frag_timeout < next_timeout) { 2367 next_timeout = frag_timeout; 2368 } 2369 break; 2370 } 2371 /* Time's up. Get it out of here. */ 2372 hdr_length = ipf->ipf_nf_hdr_len; 2373 ipfnext = ipf->ipf_hash_next; 2374 if (ipfnext) 2375 ipfnext->ipf_ptphn = ipf->ipf_ptphn; 2376 *ipf->ipf_ptphn = ipfnext; 2377 mp = ipf->ipf_mp->b_cont; 2378 for (; mp; mp = mp->b_cont) { 2379 /* Extra points for neatness. 
*/ 2380 IP_REASS_SET_START(mp, 0); 2381 IP_REASS_SET_END(mp, 0); 2382 } 2383 mp = ipf->ipf_mp->b_cont; 2384 atomic_add_32(&ill->ill_frag_count, -ipf->ipf_count); 2385 ASSERT(ipfb->ipfb_count >= ipf->ipf_count); 2386 ipfb->ipfb_count -= ipf->ipf_count; 2387 ASSERT(ipfb->ipfb_frag_pkts > 0); 2388 ipfb->ipfb_frag_pkts--; 2389 /* 2390 * We do not send any icmp message from here because 2391 * we currently are holding the ipfb_lock for this 2392 * hash chain. If we try and send any icmp messages 2393 * from here we may end up via a put back into ip 2394 * trying to get the same lock, causing a recursive 2395 * mutex panic. Instead we build a list and send all 2396 * the icmp messages after we have dropped the lock. 2397 */ 2398 if (ill->ill_isv6) { 2399 if (hdr_length != 0) { 2400 mp->b_next = send_icmp_head_v6; 2401 send_icmp_head_v6 = mp; 2402 } else { 2403 freemsg(mp); 2404 } 2405 } else { 2406 if (hdr_length != 0) { 2407 mp->b_next = send_icmp_head; 2408 send_icmp_head = mp; 2409 } else { 2410 freemsg(mp); 2411 } 2412 } 2413 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); 2414 ip_drop_input("ipIfStatsReasmFails", ipf->ipf_mp, ill); 2415 freeb(ipf->ipf_mp); 2416 } 2417 mutex_exit(&ipfb->ipfb_lock); 2418 /* 2419 * Now need to send any icmp messages that we delayed from 2420 * above. 2421 */ 2422 while (send_icmp_head_v6 != NULL) { 2423 ip6_t *ip6h; 2424 2425 mp = send_icmp_head_v6; 2426 send_icmp_head_v6 = send_icmp_head_v6->b_next; 2427 mp->b_next = NULL; 2428 ip6h = (ip6_t *)mp->b_rptr; 2429 iras.ira_flags = 0; 2430 /* 2431 * This will result in an incorrect ALL_ZONES zoneid 2432 * for multicast packets, but we 2433 * don't send ICMP errors for those in any case. 
2434 */ 2435 iras.ira_zoneid = 2436 ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst, 2437 ill, ipst); 2438 ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill); 2439 icmp_time_exceeded_v6(mp, 2440 ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE, 2441 &iras); 2442 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 2443 } 2444 while (send_icmp_head != NULL) { 2445 ipaddr_t dst; 2446 2447 mp = send_icmp_head; 2448 send_icmp_head = send_icmp_head->b_next; 2449 mp->b_next = NULL; 2450 2451 dst = ((ipha_t *)mp->b_rptr)->ipha_dst; 2452 2453 iras.ira_flags = IRAF_IS_IPV4; 2454 /* 2455 * This will result in an incorrect ALL_ZONES zoneid 2456 * for broadcast and multicast packets, but we 2457 * don't send ICMP errors for those in any case. 2458 */ 2459 iras.ira_zoneid = ipif_lookup_addr_zoneid(dst, 2460 ill, ipst); 2461 ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill); 2462 icmp_time_exceeded(mp, 2463 ICMP_REASSEMBLY_TIME_EXCEEDED, &iras); 2464 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 2465 } 2466 } 2467 /* 2468 * A non-dying ILL will use the return value to decide whether to 2469 * restart the frag timer, and for how long. 2470 */ 2471 return (next_timeout); 2472 } 2473 2474 /* 2475 * This routine is called when the approximate count of mblk memory used 2476 * for the specified ILL has exceeded max_count. 2477 */ 2478 void 2479 ill_frag_prune(ill_t *ill, uint_t max_count) 2480 { 2481 ipfb_t *ipfb; 2482 ipf_t *ipf; 2483 size_t count; 2484 clock_t now; 2485 2486 /* 2487 * If we are here within ip_min_frag_prune_time msecs remove 2488 * ill_frag_free_num_pkts oldest packets from each bucket and increment 2489 * ill_frag_free_num_pkts. 2490 */ 2491 mutex_enter(&ill->ill_lock); 2492 now = ddi_get_lbolt(); 2493 if (TICK_TO_MSEC(now - ill->ill_last_frag_clean_time) <= 2494 (ip_min_frag_prune_time != 0 ? 
2495 ip_min_frag_prune_time : msec_per_tick)) { 2496 2497 ill->ill_frag_free_num_pkts++; 2498 2499 } else { 2500 ill->ill_frag_free_num_pkts = 0; 2501 } 2502 ill->ill_last_frag_clean_time = now; 2503 mutex_exit(&ill->ill_lock); 2504 2505 /* 2506 * free ill_frag_free_num_pkts oldest packets from each bucket. 2507 */ 2508 if (ill->ill_frag_free_num_pkts != 0) { 2509 int ix; 2510 2511 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 2512 ipfb = &ill->ill_frag_hash_tbl[ix]; 2513 mutex_enter(&ipfb->ipfb_lock); 2514 if (ipfb->ipfb_ipf != NULL) { 2515 ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 2516 ill->ill_frag_free_num_pkts); 2517 } 2518 mutex_exit(&ipfb->ipfb_lock); 2519 } 2520 } 2521 /* 2522 * While the reassembly list for this ILL is too big, prune a fragment 2523 * queue by age, oldest first. 2524 */ 2525 while (ill->ill_frag_count > max_count) { 2526 int ix; 2527 ipfb_t *oipfb = NULL; 2528 uint_t oldest = UINT_MAX; 2529 2530 count = 0; 2531 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 2532 ipfb = &ill->ill_frag_hash_tbl[ix]; 2533 mutex_enter(&ipfb->ipfb_lock); 2534 ipf = ipfb->ipfb_ipf; 2535 if (ipf != NULL && ipf->ipf_gen < oldest) { 2536 oldest = ipf->ipf_gen; 2537 oipfb = ipfb; 2538 } 2539 count += ipfb->ipfb_count; 2540 mutex_exit(&ipfb->ipfb_lock); 2541 } 2542 if (oipfb == NULL) 2543 break; 2544 2545 if (count <= max_count) 2546 return; /* Somebody beat us to it, nothing to do */ 2547 mutex_enter(&oipfb->ipfb_lock); 2548 ipf = oipfb->ipfb_ipf; 2549 if (ipf != NULL) { 2550 ill_frag_free_pkts(ill, oipfb, ipf, 1); 2551 } 2552 mutex_exit(&oipfb->ipfb_lock); 2553 } 2554 } 2555 2556 /* 2557 * free 'free_cnt' fragmented packets starting at ipf. 
2558 */ 2559 void 2560 ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt) 2561 { 2562 size_t count; 2563 mblk_t *mp; 2564 mblk_t *tmp; 2565 ipf_t **ipfp = ipf->ipf_ptphn; 2566 2567 ASSERT(MUTEX_HELD(&ipfb->ipfb_lock)); 2568 ASSERT(ipfp != NULL); 2569 ASSERT(ipf != NULL); 2570 2571 while (ipf != NULL && free_cnt-- > 0) { 2572 count = ipf->ipf_count; 2573 mp = ipf->ipf_mp; 2574 ipf = ipf->ipf_hash_next; 2575 for (tmp = mp; tmp; tmp = tmp->b_cont) { 2576 IP_REASS_SET_START(tmp, 0); 2577 IP_REASS_SET_END(tmp, 0); 2578 } 2579 atomic_add_32(&ill->ill_frag_count, -count); 2580 ASSERT(ipfb->ipfb_count >= count); 2581 ipfb->ipfb_count -= count; 2582 ASSERT(ipfb->ipfb_frag_pkts > 0); 2583 ipfb->ipfb_frag_pkts--; 2584 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); 2585 ip_drop_input("ipIfStatsReasmFails", mp, ill); 2586 freemsg(mp); 2587 } 2588 2589 if (ipf) 2590 ipf->ipf_ptphn = ipfp; 2591 ipfp[0] = ipf; 2592 } 2593 2594 #define ND_FORWARD_WARNING "The <if>:ip*_forwarding ndd variables are " \ 2595 "obsolete and may be removed in a future release of Solaris. Use " \ 2596 "ifconfig(1M) to manipulate the forwarding status of an interface." 2597 2598 /* 2599 * For obsolete per-interface forwarding configuration; 2600 * called in response to ND_GET. 2601 */ 2602 /* ARGSUSED */ 2603 static int 2604 nd_ill_forward_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr) 2605 { 2606 ill_t *ill = (ill_t *)cp; 2607 2608 cmn_err(CE_WARN, ND_FORWARD_WARNING); 2609 2610 (void) mi_mpprintf(mp, "%d", (ill->ill_flags & ILLF_ROUTER) != 0); 2611 return (0); 2612 } 2613 2614 /* 2615 * For obsolete per-interface forwarding configuration; 2616 * called in response to ND_SET. 
2617 */ 2618 /* ARGSUSED */ 2619 static int 2620 nd_ill_forward_set(queue_t *q, mblk_t *mp, char *valuestr, caddr_t cp, 2621 cred_t *ioc_cr) 2622 { 2623 long value; 2624 int retval; 2625 ip_stack_t *ipst = CONNQ_TO_IPST(q); 2626 2627 cmn_err(CE_WARN, ND_FORWARD_WARNING); 2628 2629 if (ddi_strtol(valuestr, NULL, 10, &value) != 0 || 2630 value < 0 || value > 1) { 2631 return (EINVAL); 2632 } 2633 2634 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 2635 retval = ill_forward_set((ill_t *)cp, (value != 0)); 2636 rw_exit(&ipst->ips_ill_g_lock); 2637 return (retval); 2638 } 2639 2640 /* 2641 * Helper function for ill_forward_set(). 2642 */ 2643 static void 2644 ill_forward_set_on_ill(ill_t *ill, boolean_t enable) 2645 { 2646 ip_stack_t *ipst = ill->ill_ipst; 2647 2648 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock)); 2649 2650 ip1dbg(("ill_forward_set: %s %s forwarding on %s", 2651 (enable ? "Enabling" : "Disabling"), 2652 (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name)); 2653 mutex_enter(&ill->ill_lock); 2654 if (enable) 2655 ill->ill_flags |= ILLF_ROUTER; 2656 else 2657 ill->ill_flags &= ~ILLF_ROUTER; 2658 mutex_exit(&ill->ill_lock); 2659 if (ill->ill_isv6) 2660 ill_set_nce_router_flags(ill, enable); 2661 /* Notify routing socket listeners of this change. */ 2662 if (ill->ill_ipif != NULL) 2663 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); 2664 } 2665 2666 /* 2667 * Set an ill's ILLF_ROUTER flag appropriately. Send up RTS_IFINFO routing 2668 * socket messages for each interface whose flags we change. 
2669 */ 2670 int 2671 ill_forward_set(ill_t *ill, boolean_t enable) 2672 { 2673 ipmp_illgrp_t *illg; 2674 ip_stack_t *ipst = ill->ill_ipst; 2675 2676 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock)); 2677 2678 if ((enable && (ill->ill_flags & ILLF_ROUTER)) || 2679 (!enable && !(ill->ill_flags & ILLF_ROUTER))) 2680 return (0); 2681 2682 if (IS_LOOPBACK(ill)) 2683 return (EINVAL); 2684 2685 if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) { 2686 /* 2687 * Update all of the interfaces in the group. 2688 */ 2689 illg = ill->ill_grp; 2690 ill = list_head(&illg->ig_if); 2691 for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) 2692 ill_forward_set_on_ill(ill, enable); 2693 2694 /* 2695 * Update the IPMP meta-interface. 2696 */ 2697 ill_forward_set_on_ill(ipmp_illgrp_ipmp_ill(illg), enable); 2698 return (0); 2699 } 2700 2701 ill_forward_set_on_ill(ill, enable); 2702 return (0); 2703 } 2704 2705 /* 2706 * Based on the ILLF_ROUTER flag of an ill, make sure all local nce's for 2707 * addresses assigned to the ill have the NCE_F_ISROUTER flag appropriately 2708 * set or clear. 2709 */ 2710 static void 2711 ill_set_nce_router_flags(ill_t *ill, boolean_t enable) 2712 { 2713 ipif_t *ipif; 2714 ncec_t *ncec; 2715 nce_t *nce; 2716 2717 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 2718 /* 2719 * NOTE: we match across the illgrp because nce's for 2720 * addresses on IPMP interfaces have an nce_ill that points to 2721 * the bound underlying ill. 2722 */ 2723 nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr); 2724 if (nce != NULL) { 2725 ncec = nce->nce_common; 2726 mutex_enter(&ncec->ncec_lock); 2727 if (enable) 2728 ncec->ncec_flags |= NCE_F_ISROUTER; 2729 else 2730 ncec->ncec_flags &= ~NCE_F_ISROUTER; 2731 mutex_exit(&ncec->ncec_lock); 2732 nce_refrele(nce); 2733 } 2734 } 2735 } 2736 2737 /* 2738 * Given an ill with a _valid_ name, add the ip_forwarding ndd variable 2739 * for this ill. 
Make sure the v6/v4 question has been answered about this 2740 * ill. The creation of this ndd variable is only for backwards compatibility. 2741 * The preferred way to control per-interface IP forwarding is through the 2742 * ILLF_ROUTER interface flag. 2743 */ 2744 static int 2745 ill_set_ndd_name(ill_t *ill) 2746 { 2747 char *suffix; 2748 ip_stack_t *ipst = ill->ill_ipst; 2749 2750 ASSERT(IAM_WRITER_ILL(ill)); 2751 2752 if (ill->ill_isv6) 2753 suffix = ipv6_forward_suffix; 2754 else 2755 suffix = ipv4_forward_suffix; 2756 2757 ill->ill_ndd_name = ill->ill_name + ill->ill_name_length; 2758 bcopy(ill->ill_name, ill->ill_ndd_name, ill->ill_name_length - 1); 2759 /* 2760 * Copies over the '\0'. 2761 * Note that strlen(suffix) is always bounded. 2762 */ 2763 bcopy(suffix, ill->ill_ndd_name + ill->ill_name_length - 1, 2764 strlen(suffix) + 1); 2765 2766 /* 2767 * Use of the nd table requires holding the reader lock. 2768 * Modifying the nd table thru nd_load/nd_unload requires 2769 * the writer lock. 2770 */ 2771 rw_enter(&ipst->ips_ip_g_nd_lock, RW_WRITER); 2772 if (!nd_load(&ipst->ips_ip_g_nd, ill->ill_ndd_name, nd_ill_forward_get, 2773 nd_ill_forward_set, (caddr_t)ill)) { 2774 /* 2775 * If the nd_load failed, it only meant that it could not 2776 * allocate a new bunch of room for further NDD expansion. 2777 * Because of that, the ill_ndd_name will be set to 0, and 2778 * this interface is at the mercy of the global ip_forwarding 2779 * variable. 2780 */ 2781 rw_exit(&ipst->ips_ip_g_nd_lock); 2782 ill->ill_ndd_name = NULL; 2783 return (ENOMEM); 2784 } 2785 rw_exit(&ipst->ips_ip_g_nd_lock); 2786 return (0); 2787 } 2788 2789 /* 2790 * Intializes the context structure and returns the first ill in the list 2791 * cuurently start_list and end_list can have values: 2792 * MAX_G_HEADS Traverse both IPV4 and IPV6 lists. 2793 * IP_V4_G_HEAD Traverse IPV4 list only. 2794 * IP_V6_G_HEAD Traverse IPV6 list only. 
2795 */ 2796 2797 /* 2798 * We don't check for CONDEMNED ills here. Caller must do that if 2799 * necessary under the ill lock. 2800 */ 2801 ill_t * 2802 ill_first(int start_list, int end_list, ill_walk_context_t *ctx, 2803 ip_stack_t *ipst) 2804 { 2805 ill_if_t *ifp; 2806 ill_t *ill; 2807 avl_tree_t *avl_tree; 2808 2809 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 2810 ASSERT(end_list <= MAX_G_HEADS && start_list >= 0); 2811 2812 /* 2813 * setup the lists to search 2814 */ 2815 if (end_list != MAX_G_HEADS) { 2816 ctx->ctx_current_list = start_list; 2817 ctx->ctx_last_list = end_list; 2818 } else { 2819 ctx->ctx_last_list = MAX_G_HEADS - 1; 2820 ctx->ctx_current_list = 0; 2821 } 2822 2823 while (ctx->ctx_current_list <= ctx->ctx_last_list) { 2824 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst); 2825 if (ifp != (ill_if_t *) 2826 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) { 2827 avl_tree = &ifp->illif_avl_by_ppa; 2828 ill = avl_first(avl_tree); 2829 /* 2830 * ill is guaranteed to be non NULL or ifp should have 2831 * not existed. 2832 */ 2833 ASSERT(ill != NULL); 2834 return (ill); 2835 } 2836 ctx->ctx_current_list++; 2837 } 2838 2839 return (NULL); 2840 } 2841 2842 /* 2843 * returns the next ill in the list. ill_first() must have been called 2844 * before calling ill_next() or bad things will happen. 2845 */ 2846 2847 /* 2848 * We don't check for CONDEMNED ills here. Caller must do that if 2849 * necessary under the ill lock. 2850 */ 2851 ill_t * 2852 ill_next(ill_walk_context_t *ctx, ill_t *lastill) 2853 { 2854 ill_if_t *ifp; 2855 ill_t *ill; 2856 ip_stack_t *ipst = lastill->ill_ipst; 2857 2858 ASSERT(lastill->ill_ifptr != (ill_if_t *) 2859 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)); 2860 if ((ill = avl_walk(&lastill->ill_ifptr->illif_avl_by_ppa, lastill, 2861 AVL_AFTER)) != NULL) { 2862 return (ill); 2863 } 2864 2865 /* goto next ill_ifp in the list. 
*/ 2866 ifp = lastill->ill_ifptr->illif_next; 2867 2868 /* make sure not at end of circular list */ 2869 while (ifp == 2870 (ill_if_t *)&IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) { 2871 if (++ctx->ctx_current_list > ctx->ctx_last_list) 2872 return (NULL); 2873 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst); 2874 } 2875 2876 return (avl_first(&ifp->illif_avl_by_ppa)); 2877 } 2878 2879 /* 2880 * Check interface name for correct format: [a-zA-Z]+[a-zA-Z0-9._]*[0-9]+ 2881 * The final number (PPA) must not have any leading zeros. Upon success, a 2882 * pointer to the start of the PPA is returned; otherwise NULL is returned. 2883 */ 2884 static char * 2885 ill_get_ppa_ptr(char *name) 2886 { 2887 int namelen = strlen(name); 2888 int end_ndx = namelen - 1; 2889 int ppa_ndx, i; 2890 2891 /* 2892 * Check that the first character is [a-zA-Z], and that the last 2893 * character is [0-9]. 2894 */ 2895 if (namelen == 0 || !isalpha(name[0]) || !isdigit(name[end_ndx])) 2896 return (NULL); 2897 2898 /* 2899 * Set `ppa_ndx' to the PPA start, and check for leading zeroes. 2900 */ 2901 for (ppa_ndx = end_ndx; ppa_ndx > 0; ppa_ndx--) 2902 if (!isdigit(name[ppa_ndx - 1])) 2903 break; 2904 2905 if (name[ppa_ndx] == '0' && ppa_ndx < end_ndx) 2906 return (NULL); 2907 2908 /* 2909 * Check that the intermediate characters are [a-z0-9.] 2910 */ 2911 for (i = 1; i < ppa_ndx; i++) { 2912 if (!isalpha(name[i]) && !isdigit(name[i]) && 2913 name[i] != '.' && name[i] != '_') { 2914 return (NULL); 2915 } 2916 } 2917 2918 return (name + ppa_ndx); 2919 } 2920 2921 /* 2922 * use avl tree to locate the ill. 
2923 */ 2924 static ill_t * 2925 ill_find_by_name(char *name, boolean_t isv6, ip_stack_t *ipst) 2926 { 2927 char *ppa_ptr = NULL; 2928 int len; 2929 uint_t ppa; 2930 ill_t *ill = NULL; 2931 ill_if_t *ifp; 2932 int list; 2933 2934 /* 2935 * get ppa ptr 2936 */ 2937 if (isv6) 2938 list = IP_V6_G_HEAD; 2939 else 2940 list = IP_V4_G_HEAD; 2941 2942 if ((ppa_ptr = ill_get_ppa_ptr(name)) == NULL) { 2943 return (NULL); 2944 } 2945 2946 len = ppa_ptr - name + 1; 2947 2948 ppa = stoi(&ppa_ptr); 2949 2950 ifp = IP_VX_ILL_G_LIST(list, ipst); 2951 2952 while (ifp != (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) { 2953 /* 2954 * match is done on len - 1 as the name is not null 2955 * terminated it contains ppa in addition to the interface 2956 * name. 2957 */ 2958 if ((ifp->illif_name_len == len) && 2959 bcmp(ifp->illif_name, name, len - 1) == 0) { 2960 break; 2961 } else { 2962 ifp = ifp->illif_next; 2963 } 2964 } 2965 2966 if (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) { 2967 /* 2968 * Even the interface type does not exist. 2969 */ 2970 return (NULL); 2971 } 2972 2973 ill = avl_find(&ifp->illif_avl_by_ppa, (void *) &ppa, NULL); 2974 if (ill != NULL) { 2975 mutex_enter(&ill->ill_lock); 2976 if (ILL_CAN_LOOKUP(ill)) { 2977 ill_refhold_locked(ill); 2978 mutex_exit(&ill->ill_lock); 2979 return (ill); 2980 } 2981 mutex_exit(&ill->ill_lock); 2982 } 2983 return (NULL); 2984 } 2985 2986 /* 2987 * comparison function for use with avl. 2988 */ 2989 static int 2990 ill_compare_ppa(const void *ppa_ptr, const void *ill_ptr) 2991 { 2992 uint_t ppa; 2993 uint_t ill_ppa; 2994 2995 ASSERT(ppa_ptr != NULL && ill_ptr != NULL); 2996 2997 ppa = *((uint_t *)ppa_ptr); 2998 ill_ppa = ((const ill_t *)ill_ptr)->ill_ppa; 2999 /* 3000 * We want the ill with the lowest ppa to be on the 3001 * top. 3002 */ 3003 if (ill_ppa < ppa) 3004 return (1); 3005 if (ill_ppa > ppa) 3006 return (-1); 3007 return (0); 3008 } 3009 3010 /* 3011 * remove an interface type from the global list. 
 */
static void
ill_delete_interface_type(ill_if_t *interface)
{
	ASSERT(interface != NULL);
	/* Only an interface type with no ills left may be deleted. */
	ASSERT(avl_numnodes(&interface->illif_avl_by_ppa) == 0);

	avl_destroy(&interface->illif_avl_by_ppa);
	/* The ppa arena exists only if ill_alloc_ppa() created one. */
	if (interface->illif_ppa_arena != NULL)
		vmem_destroy(interface->illif_ppa_arena);

	remque(interface);

	mi_free(interface);
}

/*
 * remove ill from the global list.
 *
 * Takes ill_g_lock as writer.  Also detaches the ill from its phyint, and
 * frees the phyint once neither its v4 nor its v6 ill remains.  A NULL ill
 * is ignored.
 */
static void
ill_glist_delete(ill_t *ill)
{
	ip_stack_t	*ipst;
	phyint_t	*phyi;

	if (ill == NULL)
		return;
	ipst = ill->ill_ipst;
	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);

	/*
	 * If the ill was never inserted into the AVL tree
	 * we skip the if branch.
	 */
	if (ill->ill_ifptr != NULL) {
		/*
		 * remove from AVL tree and free ppa number
		 */
		avl_remove(&ill->ill_ifptr->illif_avl_by_ppa, ill);

		/* The arena tracks ppa N as address N + 1. */
		if (ill->ill_ifptr->illif_ppa_arena != NULL) {
			vmem_free(ill->ill_ifptr->illif_ppa_arena,
			    (void *)(uintptr_t)(ill->ill_ppa+1), 1);
		}
		/* Last ill of this interface type: drop the type as well. */
		if (avl_numnodes(&ill->ill_ifptr->illif_avl_by_ppa) == 0) {
			ill_delete_interface_type(ill->ill_ifptr);
		}

		/*
		 * Indicate ill is no longer in the list.
		 */
		ill->ill_ifptr = NULL;
		ill->ill_name_length = 0;
		ill->ill_name[0] = '\0';
		ill->ill_ppa = UINT_MAX;
	}

	/*
	 * Generate one last event for this ill.
	 * NOTE(review): if the branch above ran, ill_name and
	 * ill_name_length have just been cleared, so this event is
	 * dispatched with an empty name -- confirm that is intended.
	 */
	ill_nic_event_dispatch(ill, 0, NE_UNPLUMB, ill->ill_name,
	    ill->ill_name_length);

	ASSERT(ill->ill_phyint != NULL);
	phyi = ill->ill_phyint;
	ill->ill_phyint = NULL;

	/*
	 * ill_init allocates a phyint always to store the copy
	 * of flags relevant to phyint. At that point in time, we could
	 * not assign the name and hence phyint_illv4/v6 could not be
	 * initialized. Later in ipif_set_values, we assign the name to
	 * the ill, at which point in time we assign phyint_illv4/v6.
	 * Thus we don't rely on phyint_illv6 to be initialized always.
	 */
	if (ill->ill_flags & ILLF_IPV6)
		phyi->phyint_illv6 = NULL;
	else
		phyi->phyint_illv4 = NULL;

	/* The phyint is still in use by the ill of the other IP version. */
	if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) {
		rw_exit(&ipst->ips_ill_g_lock);
		return;
	}

	/*
	 * There are no ills left on this phyint; pull it out of the phyint
	 * avl trees, and free it.
	 */
	if (phyi->phyint_ifindex > 0) {
		avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
		    phyi);
		avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
		    phyi);
	}
	rw_exit(&ipst->ips_ill_g_lock);

	phyint_free(phyi);
}

/*
 * allocate a ppa, if the number of plumbed interfaces of this type are
 * less than ill_no_arena do a linear search to find a unused ppa.
 * When the number goes beyond ill_no_arena switch to using an arena.
 * Note: ppa value of zero cannot be allocated from vmem_arena as it
 * is the return value for an error condition, so allocation starts at one
 * and is decremented by one.
 *
 * On entry ill->ill_ppa is either UINT_MAX, meaning any free ppa will do,
 * or the specific ppa wanted.  Returns 0 with ill->ill_ppa set on success,
 * EAGAIN if no ppa is free, or EEXIST if the specific ppa is not available.
 */
static int
ill_alloc_ppa(ill_if_t *ifp, ill_t *ill)
{
	ill_t *tmp_ill;
	uint_t start, end;
	int ppa;

	if (ifp->illif_ppa_arena == NULL &&
	    (avl_numnodes(&ifp->illif_avl_by_ppa) + 1 > ill_no_arena)) {
		/*
		 * Create an arena.
		 */
		ifp->illif_ppa_arena = vmem_create(ifp->illif_name,
		    (void *)1, UINT_MAX - 1, 1, NULL, NULL,
		    NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
		/* allocate what has already been assigned */
		for (tmp_ill = avl_first(&ifp->illif_avl_by_ppa);
		    tmp_ill != NULL; tmp_ill = avl_walk(&ifp->illif_avl_by_ppa,
		    tmp_ill, AVL_AFTER)) {
			ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena,
			    1,		/* size */
			    1,		/* align/quantum */
			    0,		/* phase */
			    0,		/* nocross */
			    /* minaddr */
			    (void *)((uintptr_t)tmp_ill->ill_ppa + 1),
			    /* maxaddr */
			    (void *)((uintptr_t)tmp_ill->ill_ppa + 2),
			    VM_NOSLEEP|VM_FIRSTFIT);
			if (ppa == 0) {
				/*
				 * Give up on the arena; the linear search
				 * at the end of the function is used
				 * instead.
				 */
				ip1dbg(("ill_alloc_ppa: ppa allocation"
				    " failed while switching"));
				vmem_destroy(ifp->illif_ppa_arena);
				ifp->illif_ppa_arena = NULL;
				break;
			}
		}
	}

	if (ifp->illif_ppa_arena != NULL) {
		if (ill->ill_ppa == UINT_MAX) {
			/* Any ppa will do: take what the arena hands out. */
			ppa = (int)(uintptr_t)vmem_alloc(ifp->illif_ppa_arena,
			    1, VM_NOSLEEP|VM_FIRSTFIT);
			if (ppa == 0)
				return (EAGAIN);
			/* Arena addresses are ppa + 1, hence the decrement. */
			ill->ill_ppa = --ppa;
		} else {
			/* A specific ppa was asked for: claim exactly it. */
			ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena,
			    1,		/* size */
			    1,		/* align/quantum */
			    0,		/* phase */
			    0,		/* nocross */
			    (void *)(uintptr_t)(ill->ill_ppa + 1), /* minaddr */
			    (void *)(uintptr_t)(ill->ill_ppa + 2), /* maxaddr */
			    VM_NOSLEEP|VM_FIRSTFIT);
			/*
			 * Most likely the allocation failed because
			 * the requested ppa was in use.
			 */
			if (ppa == 0)
				return (EEXIST);
		}
		return (0);
	}

	/*
	 * No arena is in use and not enough (>ill_no_arena) interfaces have
	 * been plumbed to create one. Do a linear search to get a unused ppa.
	 */
	if (ill->ill_ppa == UINT_MAX) {
		end = UINT_MAX - 1;
		start = 0;
	} else {
		end = start = ill->ill_ppa;
	}

	/*
	 * Starting from the ill that holds ppa 'start', if there is one,
	 * step through the tree while its ppas stay consecutive.  The first
	 * gap, or the end of the tree, leaves 'start' as an unused ppa.
	 */
	tmp_ill = avl_find(&ifp->illif_avl_by_ppa, (void *)&start, NULL);
	while (tmp_ill != NULL && tmp_ill->ill_ppa == start) {
		if (start++ >= end) {
			if (ill->ill_ppa == UINT_MAX)
				return (EAGAIN);
			else
				return (EEXIST);
		}
		tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, tmp_ill, AVL_AFTER);
	}
	ill->ill_ppa = start;
	return (0);
}

/*
 * Insert ill into the list of configured ill's. Once this function completes,
 * the ill is globally visible and is available through lookups. More precisely
 * this happens after the caller drops the ill_g_lock.
 */
static int
ill_glist_insert(ill_t *ill, char *name, boolean_t isv6)
{
	ill_if_t *ill_interface;
	avl_index_t where = 0;
	int error;
	int name_length;
	int index;
	boolean_t check_length = B_FALSE;
	ip_stack_t *ipst = ill->ill_ipst;

	ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));

	/* Counts the terminating '\0', to match illif_name_len. */
	name_length = mi_strlen(name) + 1;

	if (isv6)
		index = IP_V6_G_HEAD;
	else
		index = IP_V4_G_HEAD;

	ill_interface = IP_VX_ILL_G_LIST(index, ipst);
	/*
	 * Search for interface type based on name
	 */
	while (ill_interface != (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) {
		if ((ill_interface->illif_name_len == name_length) &&
		    (strcmp(ill_interface->illif_name, name) == 0)) {
			break;
		}
		ill_interface = ill_interface->illif_next;
	}

	/*
	 * Interface type not found, create one.
3247 */ 3248 if (ill_interface == (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) { 3249 ill_g_head_t ghead; 3250 3251 /* 3252 * allocate ill_if_t structure 3253 */ 3254 ill_interface = (ill_if_t *)mi_zalloc(sizeof (ill_if_t)); 3255 if (ill_interface == NULL) { 3256 return (ENOMEM); 3257 } 3258 3259 (void) strcpy(ill_interface->illif_name, name); 3260 ill_interface->illif_name_len = name_length; 3261 3262 avl_create(&ill_interface->illif_avl_by_ppa, 3263 ill_compare_ppa, sizeof (ill_t), 3264 offsetof(struct ill_s, ill_avl_byppa)); 3265 3266 /* 3267 * link the structure in the back to maintain order 3268 * of configuration for ifconfig output. 3269 */ 3270 ghead = ipst->ips_ill_g_heads[index]; 3271 insque(ill_interface, ghead.ill_g_list_tail); 3272 } 3273 3274 if (ill->ill_ppa == UINT_MAX) 3275 check_length = B_TRUE; 3276 3277 error = ill_alloc_ppa(ill_interface, ill); 3278 if (error != 0) { 3279 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0) 3280 ill_delete_interface_type(ill->ill_ifptr); 3281 return (error); 3282 } 3283 3284 /* 3285 * When the ppa is choosen by the system, check that there is 3286 * enough space to insert ppa. if a specific ppa was passed in this 3287 * check is not required as the interface name passed in will have 3288 * the right ppa in it. 3289 */ 3290 if (check_length) { 3291 /* 3292 * UINT_MAX - 1 should fit in 10 chars, alloc 12 chars. 3293 */ 3294 char buf[sizeof (uint_t) * 3]; 3295 3296 /* 3297 * convert ppa to string to calculate the amount of space 3298 * required for it in the name. 3299 */ 3300 numtos(ill->ill_ppa, buf); 3301 3302 /* Do we have enough space to insert ppa ? 
*/ 3303 3304 if ((mi_strlen(name) + mi_strlen(buf) + 1) > LIFNAMSIZ) { 3305 /* Free ppa and interface type struct */ 3306 if (ill_interface->illif_ppa_arena != NULL) { 3307 vmem_free(ill_interface->illif_ppa_arena, 3308 (void *)(uintptr_t)(ill->ill_ppa+1), 1); 3309 } 3310 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0) 3311 ill_delete_interface_type(ill->ill_ifptr); 3312 3313 return (EINVAL); 3314 } 3315 } 3316 3317 (void) sprintf(ill->ill_name, "%s%u", name, ill->ill_ppa); 3318 ill->ill_name_length = mi_strlen(ill->ill_name) + 1; 3319 3320 (void) avl_find(&ill_interface->illif_avl_by_ppa, &ill->ill_ppa, 3321 &where); 3322 ill->ill_ifptr = ill_interface; 3323 avl_insert(&ill_interface->illif_avl_by_ppa, ill, where); 3324 3325 ill_phyint_reinit(ill); 3326 return (0); 3327 } 3328 3329 /* Initialize the per phyint ipsq used for serialization */ 3330 static boolean_t 3331 ipsq_init(ill_t *ill, boolean_t enter) 3332 { 3333 ipsq_t *ipsq; 3334 ipxop_t *ipx; 3335 3336 if ((ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP)) == NULL) 3337 return (B_FALSE); 3338 3339 ill->ill_phyint->phyint_ipsq = ipsq; 3340 ipx = ipsq->ipsq_xop = &ipsq->ipsq_ownxop; 3341 ipx->ipx_ipsq = ipsq; 3342 ipsq->ipsq_next = ipsq; 3343 ipsq->ipsq_phyint = ill->ill_phyint; 3344 mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, 0); 3345 mutex_init(&ipx->ipx_lock, NULL, MUTEX_DEFAULT, 0); 3346 ipsq->ipsq_ipst = ill->ill_ipst; /* No netstack_hold */ 3347 if (enter) { 3348 ipx->ipx_writer = curthread; 3349 ipx->ipx_forced = B_FALSE; 3350 ipx->ipx_reentry_cnt = 1; 3351 #ifdef DEBUG 3352 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); 3353 #endif 3354 } 3355 return (B_TRUE); 3356 } 3357 3358 /* 3359 * ill_init is called by ip_open when a device control stream is opened. 3360 * It does a few initializations, and shoots a DL_INFO_REQ message down 3361 * to the driver. The response is later picked up in ip_rput_dlpi and 3362 * used to set up default mechanisms for talking to the driver. 
(Always 3363 * called as writer.) 3364 * 3365 * If this function returns error, ip_open will call ip_close which in 3366 * turn will call ill_delete to clean up any memory allocated here that 3367 * is not yet freed. 3368 */ 3369 int 3370 ill_init(queue_t *q, ill_t *ill) 3371 { 3372 int count; 3373 dl_info_req_t *dlir; 3374 mblk_t *info_mp; 3375 uchar_t *frag_ptr; 3376 3377 /* 3378 * The ill is initialized to zero by mi_alloc*(). In addition 3379 * some fields already contain valid values, initialized in 3380 * ip_open(), before we reach here. 3381 */ 3382 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, 0); 3383 mutex_init(&ill->ill_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL); 3384 ill->ill_saved_ire_cnt = 0; 3385 3386 ill->ill_rq = q; 3387 ill->ill_wq = WR(q); 3388 3389 info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)), 3390 BPRI_HI); 3391 if (info_mp == NULL) 3392 return (ENOMEM); 3393 3394 /* 3395 * Allocate sufficient space to contain our fragment hash table and 3396 * the device name. 3397 */ 3398 frag_ptr = (uchar_t *)mi_zalloc(ILL_FRAG_HASH_TBL_SIZE + 3399 2 * LIFNAMSIZ + strlen(ipv6_forward_suffix)); 3400 if (frag_ptr == NULL) { 3401 freemsg(info_mp); 3402 return (ENOMEM); 3403 } 3404 ill->ill_frag_ptr = frag_ptr; 3405 ill->ill_frag_free_num_pkts = 0; 3406 ill->ill_last_frag_clean_time = 0; 3407 ill->ill_frag_hash_tbl = (ipfb_t *)frag_ptr; 3408 ill->ill_name = (char *)(frag_ptr + ILL_FRAG_HASH_TBL_SIZE); 3409 for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) { 3410 mutex_init(&ill->ill_frag_hash_tbl[count].ipfb_lock, 3411 NULL, MUTEX_DEFAULT, NULL); 3412 } 3413 3414 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t)); 3415 if (ill->ill_phyint == NULL) { 3416 freemsg(info_mp); 3417 mi_free(frag_ptr); 3418 return (ENOMEM); 3419 } 3420 3421 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0); 3422 /* 3423 * For now pretend this is a v4 ill. 
We need to set phyint_ill* 3424 * at this point because of the following reason. If we can't 3425 * enter the ipsq at some point and cv_wait, the writer that 3426 * wakes us up tries to locate us using the list of all phyints 3427 * in an ipsq and the ills from the phyint thru the phyint_ill*. 3428 * If we don't set it now, we risk a missed wakeup. 3429 */ 3430 ill->ill_phyint->phyint_illv4 = ill; 3431 ill->ill_ppa = UINT_MAX; 3432 list_create(&ill->ill_nce, sizeof (nce_t), offsetof(nce_t, nce_node)); 3433 3434 ill_set_inputfn(ill); 3435 3436 if (!ipsq_init(ill, B_TRUE)) { 3437 freemsg(info_mp); 3438 mi_free(frag_ptr); 3439 mi_free(ill->ill_phyint); 3440 return (ENOMEM); 3441 } 3442 3443 ill->ill_state_flags |= ILL_LL_SUBNET_PENDING; 3444 3445 /* Frag queue limit stuff */ 3446 ill->ill_frag_count = 0; 3447 ill->ill_ipf_gen = 0; 3448 3449 rw_init(&ill->ill_mcast_lock, NULL, RW_DEFAULT, NULL); 3450 mutex_init(&ill->ill_mcast_serializer, NULL, MUTEX_DEFAULT, NULL); 3451 ill->ill_global_timer = INFINITY; 3452 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0; 3453 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0; 3454 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS; 3455 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL; 3456 3457 /* 3458 * Initialize IPv6 configuration variables. The IP module is always 3459 * opened as an IPv4 module. Instead tracking down the cases where 3460 * it switches to do ipv6, we'll just initialize the IPv6 configuration 3461 * here for convenience, this has no effect until the ill is set to do 3462 * IPv6. 3463 */ 3464 ill->ill_reachable_time = ND_REACHABLE_TIME; 3465 ill->ill_xmit_count = ND_MAX_MULTICAST_SOLICIT; 3466 ill->ill_max_buf = ND_MAX_Q; 3467 ill->ill_refcnt = 0; 3468 3469 /* Send down the Info Request to the driver. 
*/ 3470 info_mp->b_datap->db_type = M_PCPROTO; 3471 dlir = (dl_info_req_t *)info_mp->b_rptr; 3472 info_mp->b_wptr = (uchar_t *)&dlir[1]; 3473 dlir->dl_primitive = DL_INFO_REQ; 3474 3475 ill->ill_dlpi_pending = DL_PRIM_INVAL; 3476 3477 qprocson(q); 3478 ill_dlpi_send(ill, info_mp); 3479 3480 return (0); 3481 } 3482 3483 /* 3484 * ill_dls_info 3485 * creates datalink socket info from the device. 3486 */ 3487 int 3488 ill_dls_info(struct sockaddr_dl *sdl, const ill_t *ill) 3489 { 3490 size_t len; 3491 3492 sdl->sdl_family = AF_LINK; 3493 sdl->sdl_index = ill_get_upper_ifindex(ill); 3494 sdl->sdl_type = ill->ill_type; 3495 ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data)); 3496 len = strlen(sdl->sdl_data); 3497 ASSERT(len < 256); 3498 sdl->sdl_nlen = (uchar_t)len; 3499 sdl->sdl_alen = ill->ill_phys_addr_length; 3500 sdl->sdl_slen = 0; 3501 if (ill->ill_phys_addr_length != 0 && ill->ill_phys_addr != NULL) 3502 bcopy(ill->ill_phys_addr, &sdl->sdl_data[len], sdl->sdl_alen); 3503 3504 return (sizeof (struct sockaddr_dl)); 3505 } 3506 3507 /* 3508 * ill_xarp_info 3509 * creates xarp info from the device. 
3510 */ 3511 static int 3512 ill_xarp_info(struct sockaddr_dl *sdl, ill_t *ill) 3513 { 3514 sdl->sdl_family = AF_LINK; 3515 sdl->sdl_index = ill->ill_phyint->phyint_ifindex; 3516 sdl->sdl_type = ill->ill_type; 3517 ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data)); 3518 sdl->sdl_nlen = (uchar_t)mi_strlen(sdl->sdl_data); 3519 sdl->sdl_alen = ill->ill_phys_addr_length; 3520 sdl->sdl_slen = 0; 3521 return (sdl->sdl_nlen); 3522 } 3523 3524 static int 3525 loopback_kstat_update(kstat_t *ksp, int rw) 3526 { 3527 kstat_named_t *kn; 3528 netstackid_t stackid; 3529 netstack_t *ns; 3530 ip_stack_t *ipst; 3531 3532 if (ksp == NULL || ksp->ks_data == NULL) 3533 return (EIO); 3534 3535 if (rw == KSTAT_WRITE) 3536 return (EACCES); 3537 3538 kn = KSTAT_NAMED_PTR(ksp); 3539 stackid = (zoneid_t)(uintptr_t)ksp->ks_private; 3540 3541 ns = netstack_find_by_stackid(stackid); 3542 if (ns == NULL) 3543 return (-1); 3544 3545 ipst = ns->netstack_ip; 3546 if (ipst == NULL) { 3547 netstack_rele(ns); 3548 return (-1); 3549 } 3550 kn[0].value.ui32 = ipst->ips_loopback_packets; 3551 kn[1].value.ui32 = ipst->ips_loopback_packets; 3552 netstack_rele(ns); 3553 return (0); 3554 } 3555 3556 /* 3557 * Has ifindex been plumbed already? 3558 */ 3559 static boolean_t 3560 phyint_exists(uint_t index, ip_stack_t *ipst) 3561 { 3562 ASSERT(index != 0); 3563 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 3564 3565 return (avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 3566 &index, NULL) != NULL); 3567 } 3568 3569 /* Pick a unique ifindex */ 3570 boolean_t 3571 ip_assign_ifindex(uint_t *indexp, ip_stack_t *ipst) 3572 { 3573 uint_t starting_index; 3574 3575 if (!ipst->ips_ill_index_wrap) { 3576 *indexp = ipst->ips_ill_index++; 3577 if (ipst->ips_ill_index == 0) { 3578 /* Reached the uint_t limit Next time wrap */ 3579 ipst->ips_ill_index_wrap = B_TRUE; 3580 } 3581 return (B_TRUE); 3582 } 3583 3584 /* 3585 * Start reusing unused indexes. 
Note that we hold the ill_g_lock 3586 * at this point and don't want to call any function that attempts 3587 * to get the lock again. 3588 */ 3589 starting_index = ipst->ips_ill_index++; 3590 for (; ipst->ips_ill_index != starting_index; ipst->ips_ill_index++) { 3591 if (ipst->ips_ill_index != 0 && 3592 !phyint_exists(ipst->ips_ill_index, ipst)) { 3593 /* found unused index - use it */ 3594 *indexp = ipst->ips_ill_index; 3595 return (B_TRUE); 3596 } 3597 } 3598 3599 /* 3600 * all interface indicies are inuse. 3601 */ 3602 return (B_FALSE); 3603 } 3604 3605 /* 3606 * Assign a unique interface index for the phyint. 3607 */ 3608 static boolean_t 3609 phyint_assign_ifindex(phyint_t *phyi, ip_stack_t *ipst) 3610 { 3611 ASSERT(phyi->phyint_ifindex == 0); 3612 return (ip_assign_ifindex(&phyi->phyint_ifindex, ipst)); 3613 } 3614 3615 /* 3616 * Initialize the flags on `phyi' as per the provided mactype. 3617 */ 3618 static void 3619 phyint_flags_init(phyint_t *phyi, t_uscalar_t mactype) 3620 { 3621 uint64_t flags = 0; 3622 3623 /* 3624 * Initialize PHYI_RUNNING and PHYI_FAILED. For non-IPMP interfaces, 3625 * we always presume the underlying hardware is working and set 3626 * PHYI_RUNNING (if it's not, the driver will subsequently send a 3627 * DL_NOTE_LINK_DOWN message). For IPMP interfaces, at initialization 3628 * there are no active interfaces in the group so we set PHYI_FAILED. 3629 */ 3630 if (mactype == SUNW_DL_IPMP) 3631 flags |= PHYI_FAILED; 3632 else 3633 flags |= PHYI_RUNNING; 3634 3635 switch (mactype) { 3636 case SUNW_DL_VNI: 3637 flags |= PHYI_VIRTUAL; 3638 break; 3639 case SUNW_DL_IPMP: 3640 flags |= PHYI_IPMP; 3641 break; 3642 case DL_LOOP: 3643 flags |= (PHYI_LOOPBACK | PHYI_VIRTUAL); 3644 break; 3645 } 3646 3647 mutex_enter(&phyi->phyint_lock); 3648 phyi->phyint_flags |= flags; 3649 mutex_exit(&phyi->phyint_lock); 3650 } 3651 3652 /* 3653 * Return a pointer to the ill which matches the supplied name. 
Note that 3654 * the ill name length includes the null termination character. (May be 3655 * called as writer.) 3656 * If do_alloc and the interface is "lo0" it will be automatically created. 3657 * Cannot bump up reference on condemned ills. So dup detect can't be done 3658 * using this func. 3659 */ 3660 ill_t * 3661 ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, 3662 boolean_t *did_alloc, ip_stack_t *ipst) 3663 { 3664 ill_t *ill; 3665 ipif_t *ipif; 3666 ipsq_t *ipsq; 3667 kstat_named_t *kn; 3668 boolean_t isloopback; 3669 in6_addr_t ov6addr; 3670 3671 isloopback = mi_strcmp(name, ipif_loopback_name) == 0; 3672 3673 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3674 ill = ill_find_by_name(name, isv6, ipst); 3675 rw_exit(&ipst->ips_ill_g_lock); 3676 if (ill != NULL) 3677 return (ill); 3678 3679 /* 3680 * Couldn't find it. Does this happen to be a lookup for the 3681 * loopback device and are we allowed to allocate it? 3682 */ 3683 if (!isloopback || !do_alloc) 3684 return (NULL); 3685 3686 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 3687 ill = ill_find_by_name(name, isv6, ipst); 3688 if (ill != NULL) { 3689 rw_exit(&ipst->ips_ill_g_lock); 3690 return (ill); 3691 } 3692 3693 /* Create the loopback device on demand */ 3694 ill = (ill_t *)(mi_alloc(sizeof (ill_t) + 3695 sizeof (ipif_loopback_name), BPRI_MED)); 3696 if (ill == NULL) 3697 goto done; 3698 3699 *ill = ill_null; 3700 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, NULL); 3701 ill->ill_ipst = ipst; 3702 list_create(&ill->ill_nce, sizeof (nce_t), offsetof(nce_t, nce_node)); 3703 netstack_hold(ipst->ips_netstack); 3704 /* 3705 * For exclusive stacks we set the zoneid to zero 3706 * to make IP operate as if in the global zone. 
3707 */ 3708 ill->ill_zoneid = GLOBAL_ZONEID; 3709 3710 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t)); 3711 if (ill->ill_phyint == NULL) 3712 goto done; 3713 3714 if (isv6) 3715 ill->ill_phyint->phyint_illv6 = ill; 3716 else 3717 ill->ill_phyint->phyint_illv4 = ill; 3718 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0); 3719 phyint_flags_init(ill->ill_phyint, DL_LOOP); 3720 3721 if (isv6) { 3722 ill->ill_isv6 = B_TRUE; 3723 ill->ill_max_frag = ip_loopback_mtu_v6plus; 3724 } else { 3725 ill->ill_max_frag = ip_loopback_mtuplus; 3726 } 3727 if (!ill_allocate_mibs(ill)) 3728 goto done; 3729 ill->ill_current_frag = ill->ill_max_frag; 3730 ill->ill_mtu = ill->ill_max_frag; /* Initial value */ 3731 /* 3732 * ipif_loopback_name can't be pointed at directly because its used 3733 * by both the ipv4 and ipv6 interfaces. When the ill is removed 3734 * from the glist, ill_glist_delete() sets the first character of 3735 * ill_name to '\0'. 3736 */ 3737 ill->ill_name = (char *)ill + sizeof (*ill); 3738 (void) strcpy(ill->ill_name, ipif_loopback_name); 3739 ill->ill_name_length = sizeof (ipif_loopback_name); 3740 /* Set ill_dlpi_pending for ipsq_current_finish() to work properly */ 3741 ill->ill_dlpi_pending = DL_PRIM_INVAL; 3742 3743 rw_init(&ill->ill_mcast_lock, NULL, RW_DEFAULT, NULL); 3744 mutex_init(&ill->ill_mcast_serializer, NULL, MUTEX_DEFAULT, NULL); 3745 ill->ill_global_timer = INFINITY; 3746 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0; 3747 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0; 3748 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS; 3749 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL; 3750 3751 /* No resolver here. 
*/ 3752 ill->ill_net_type = IRE_LOOPBACK; 3753 3754 /* Initialize the ipsq */ 3755 if (!ipsq_init(ill, B_FALSE)) 3756 goto done; 3757 3758 ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE, B_TRUE, NULL); 3759 if (ipif == NULL) 3760 goto done; 3761 3762 ill->ill_flags = ILLF_MULTICAST; 3763 3764 ov6addr = ipif->ipif_v6lcl_addr; 3765 /* Set up default loopback address and mask. */ 3766 if (!isv6) { 3767 ipaddr_t inaddr_loopback = htonl(INADDR_LOOPBACK); 3768 3769 IN6_IPADDR_TO_V4MAPPED(inaddr_loopback, &ipif->ipif_v6lcl_addr); 3770 V4MASK_TO_V6(htonl(IN_CLASSA_NET), ipif->ipif_v6net_mask); 3771 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 3772 ipif->ipif_v6subnet); 3773 ill->ill_flags |= ILLF_IPV4; 3774 } else { 3775 ipif->ipif_v6lcl_addr = ipv6_loopback; 3776 ipif->ipif_v6net_mask = ipv6_all_ones; 3777 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 3778 ipif->ipif_v6subnet); 3779 ill->ill_flags |= ILLF_IPV6; 3780 } 3781 3782 /* 3783 * Chain us in at the end of the ill list. hold the ill 3784 * before we make it globally visible. 1 for the lookup. 3785 */ 3786 ill->ill_refcnt = 0; 3787 ill_refhold(ill); 3788 3789 ill->ill_frag_count = 0; 3790 ill->ill_frag_free_num_pkts = 0; 3791 ill->ill_last_frag_clean_time = 0; 3792 3793 ipsq = ill->ill_phyint->phyint_ipsq; 3794 3795 ill_set_inputfn(ill); 3796 3797 if (ill_glist_insert(ill, "lo", isv6) != 0) 3798 cmn_err(CE_PANIC, "cannot insert loopback interface"); 3799 3800 /* Let SCTP know so that it can add this to its list */ 3801 sctp_update_ill(ill, SCTP_ILL_INSERT); 3802 3803 /* 3804 * We have already assigned ipif_v6lcl_addr above, but we need to 3805 * call sctp_update_ipif_addr() after SCTP_ILL_INSERT, which 3806 * requires to be after ill_glist_insert() since we need the 3807 * ill_index set. Pass on ipv6_loopback as the old address. 
3808 */ 3809 sctp_update_ipif_addr(ipif, ov6addr); 3810 3811 ip_rts_newaddrmsg(RTM_CHGADDR, 0, ipif, RTSQ_DEFAULT); 3812 3813 /* 3814 * ill_glist_insert() -> ill_phyint_reinit() may have merged IPSQs. 3815 * If so, free our original one. 3816 */ 3817 if (ipsq != ill->ill_phyint->phyint_ipsq) 3818 ipsq_delete(ipsq); 3819 3820 if (ipst->ips_loopback_ksp == NULL) { 3821 /* Export loopback interface statistics */ 3822 ipst->ips_loopback_ksp = kstat_create_netstack("lo", 0, 3823 ipif_loopback_name, "net", 3824 KSTAT_TYPE_NAMED, 2, 0, 3825 ipst->ips_netstack->netstack_stackid); 3826 if (ipst->ips_loopback_ksp != NULL) { 3827 ipst->ips_loopback_ksp->ks_update = 3828 loopback_kstat_update; 3829 kn = KSTAT_NAMED_PTR(ipst->ips_loopback_ksp); 3830 kstat_named_init(&kn[0], "ipackets", KSTAT_DATA_UINT32); 3831 kstat_named_init(&kn[1], "opackets", KSTAT_DATA_UINT32); 3832 ipst->ips_loopback_ksp->ks_private = 3833 (void *)(uintptr_t)ipst->ips_netstack-> 3834 netstack_stackid; 3835 kstat_install(ipst->ips_loopback_ksp); 3836 } 3837 } 3838 3839 *did_alloc = B_TRUE; 3840 rw_exit(&ipst->ips_ill_g_lock); 3841 ill_nic_event_dispatch(ill, MAP_IPIF_ID(ill->ill_ipif->ipif_id), 3842 NE_PLUMB, ill->ill_name, ill->ill_name_length); 3843 return (ill); 3844 done: 3845 if (ill != NULL) { 3846 if (ill->ill_phyint != NULL) { 3847 ipsq = ill->ill_phyint->phyint_ipsq; 3848 if (ipsq != NULL) { 3849 ipsq->ipsq_phyint = NULL; 3850 ipsq_delete(ipsq); 3851 } 3852 mi_free(ill->ill_phyint); 3853 } 3854 ill_free_mib(ill); 3855 if (ill->ill_ipst != NULL) 3856 netstack_rele(ill->ill_ipst->ips_netstack); 3857 mi_free(ill); 3858 } 3859 rw_exit(&ipst->ips_ill_g_lock); 3860 return (NULL); 3861 } 3862 3863 /* 3864 * For IPP calls - use the ip_stack_t for global stack. 
3865 */ 3866 ill_t * 3867 ill_lookup_on_ifindex_global_instance(uint_t index, boolean_t isv6) 3868 { 3869 ip_stack_t *ipst; 3870 ill_t *ill; 3871 3872 ipst = netstack_find_by_stackid(GLOBAL_NETSTACKID)->netstack_ip; 3873 if (ipst == NULL) { 3874 cmn_err(CE_WARN, "No ip_stack_t for zoneid zero!\n"); 3875 return (NULL); 3876 } 3877 3878 ill = ill_lookup_on_ifindex(index, isv6, ipst); 3879 netstack_rele(ipst->ips_netstack); 3880 return (ill); 3881 } 3882 3883 /* 3884 * Return a pointer to the ill which matches the index and IP version type. 3885 */ 3886 ill_t * 3887 ill_lookup_on_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst) 3888 { 3889 ill_t *ill; 3890 phyint_t *phyi; 3891 3892 /* 3893 * Indexes are stored in the phyint - a common structure 3894 * to both IPv4 and IPv6. 3895 */ 3896 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3897 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 3898 (void *) &index, NULL); 3899 if (phyi != NULL) { 3900 ill = isv6 ? phyi->phyint_illv6: phyi->phyint_illv4; 3901 if (ill != NULL) { 3902 mutex_enter(&ill->ill_lock); 3903 if (!ILL_IS_CONDEMNED(ill)) { 3904 ill_refhold_locked(ill); 3905 mutex_exit(&ill->ill_lock); 3906 rw_exit(&ipst->ips_ill_g_lock); 3907 return (ill); 3908 } 3909 mutex_exit(&ill->ill_lock); 3910 } 3911 } 3912 rw_exit(&ipst->ips_ill_g_lock); 3913 return (NULL); 3914 } 3915 3916 /* 3917 * Verify whether or not an interface index is valid. 3918 * It can be zero (meaning "reset") or an interface index assigned 3919 * to a non-VNI interface. (We don't use VNI interface to send packets.) 
3920 */ 3921 boolean_t 3922 ip_ifindex_valid(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst) 3923 { 3924 ill_t *ill; 3925 3926 if (ifindex == 0) 3927 return (B_TRUE); 3928 3929 ill = ill_lookup_on_ifindex(ifindex, isv6, ipst); 3930 if (ill == NULL) 3931 return (B_FALSE); 3932 if (IS_VNI(ill)) { 3933 ill_refrele(ill); 3934 return (B_FALSE); 3935 } 3936 ill_refrele(ill); 3937 return (B_TRUE); 3938 } 3939 3940 /* 3941 * Return the ifindex next in sequence after the passed in ifindex. 3942 * If there is no next ifindex for the given protocol, return 0. 3943 */ 3944 uint_t 3945 ill_get_next_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst) 3946 { 3947 phyint_t *phyi; 3948 phyint_t *phyi_initial; 3949 uint_t ifindex; 3950 3951 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3952 3953 if (index == 0) { 3954 phyi = avl_first( 3955 &ipst->ips_phyint_g_list->phyint_list_avl_by_index); 3956 } else { 3957 phyi = phyi_initial = avl_find( 3958 &ipst->ips_phyint_g_list->phyint_list_avl_by_index, 3959 (void *) &index, NULL); 3960 } 3961 3962 for (; phyi != NULL; 3963 phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 3964 phyi, AVL_AFTER)) { 3965 /* 3966 * If we're not returning the first interface in the tree 3967 * and we still haven't moved past the phyint_t that 3968 * corresponds to index, avl_walk needs to be called again 3969 */ 3970 if (!((index != 0) && (phyi == phyi_initial))) { 3971 if (isv6) { 3972 if ((phyi->phyint_illv6) && 3973 ILL_CAN_LOOKUP(phyi->phyint_illv6) && 3974 (phyi->phyint_illv6->ill_isv6 == 1)) 3975 break; 3976 } else { 3977 if ((phyi->phyint_illv4) && 3978 ILL_CAN_LOOKUP(phyi->phyint_illv4) && 3979 (phyi->phyint_illv4->ill_isv6 == 0)) 3980 break; 3981 } 3982 } 3983 } 3984 3985 rw_exit(&ipst->ips_ill_g_lock); 3986 3987 if (phyi != NULL) 3988 ifindex = phyi->phyint_ifindex; 3989 else 3990 ifindex = 0; 3991 3992 return (ifindex); 3993 } 3994 3995 /* 3996 * Return the ifindex for the named interface. 
3997 * If there is no next ifindex for the interface, return 0. 3998 */ 3999 uint_t 4000 ill_get_ifindex_by_name(char *name, ip_stack_t *ipst) 4001 { 4002 phyint_t *phyi; 4003 avl_index_t where = 0; 4004 uint_t ifindex; 4005 4006 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4007 4008 if ((phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 4009 name, &where)) == NULL) { 4010 rw_exit(&ipst->ips_ill_g_lock); 4011 return (0); 4012 } 4013 4014 ifindex = phyi->phyint_ifindex; 4015 4016 rw_exit(&ipst->ips_ill_g_lock); 4017 4018 return (ifindex); 4019 } 4020 4021 /* 4022 * Return the ifindex to be used by upper layer protocols for instance 4023 * for IPV6_RECVPKTINFO. If IPMP this is the one for the upper ill. 4024 */ 4025 uint_t 4026 ill_get_upper_ifindex(const ill_t *ill) 4027 { 4028 if (IS_UNDER_IPMP(ill)) 4029 return (ipmp_ill_get_ipmp_ifindex(ill)); 4030 else 4031 return (ill->ill_phyint->phyint_ifindex); 4032 } 4033 4034 4035 /* 4036 * Obtain a reference to the ill. The ill_refcnt is a dynamic refcnt 4037 * that gives a running thread a reference to the ill. This reference must be 4038 * released by the thread when it is done accessing the ill and related 4039 * objects. ill_refcnt can not be used to account for static references 4040 * such as other structures pointing to an ill. Callers must generally 4041 * check whether an ill can be refheld by using ILL_CAN_LOOKUP macros 4042 * or be sure that the ill is not being deleted or changing state before 4043 * calling the refhold functions. A non-zero ill_refcnt ensures that the 4044 * ill won't change any of its critical state such as address, netmask etc. 
4045 */ 4046 void 4047 ill_refhold(ill_t *ill) 4048 { 4049 mutex_enter(&ill->ill_lock); 4050 ill->ill_refcnt++; 4051 ILL_TRACE_REF(ill); 4052 mutex_exit(&ill->ill_lock); 4053 } 4054 4055 void 4056 ill_refhold_locked(ill_t *ill) 4057 { 4058 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4059 ill->ill_refcnt++; 4060 ILL_TRACE_REF(ill); 4061 } 4062 4063 /* Returns true if we managed to get a refhold */ 4064 boolean_t 4065 ill_check_and_refhold(ill_t *ill) 4066 { 4067 mutex_enter(&ill->ill_lock); 4068 if (!ILL_IS_CONDEMNED(ill)) { 4069 ill_refhold_locked(ill); 4070 mutex_exit(&ill->ill_lock); 4071 return (B_TRUE); 4072 } 4073 mutex_exit(&ill->ill_lock); 4074 return (B_FALSE); 4075 } 4076 4077 /* 4078 * Must not be called while holding any locks. Otherwise if this is 4079 * the last reference to be released, there is a chance of recursive mutex 4080 * panic due to ill_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 4081 * to restart an ioctl. 4082 */ 4083 void 4084 ill_refrele(ill_t *ill) 4085 { 4086 mutex_enter(&ill->ill_lock); 4087 ASSERT(ill->ill_refcnt != 0); 4088 ill->ill_refcnt--; 4089 ILL_UNTRACE_REF(ill); 4090 if (ill->ill_refcnt != 0) { 4091 /* Every ire pointing to the ill adds 1 to ill_refcnt */ 4092 mutex_exit(&ill->ill_lock); 4093 return; 4094 } 4095 4096 /* Drops the ill_lock */ 4097 ipif_ill_refrele_tail(ill); 4098 } 4099 4100 /* 4101 * Obtain a weak reference count on the ill. This reference ensures the 4102 * ill won't be freed, but the ill may change any of its critical state 4103 * such as netmask, address etc. Returns an error if the ill has started 4104 * closing. 
4105 */ 4106 boolean_t 4107 ill_waiter_inc(ill_t *ill) 4108 { 4109 mutex_enter(&ill->ill_lock); 4110 if (ill->ill_state_flags & ILL_CONDEMNED) { 4111 mutex_exit(&ill->ill_lock); 4112 return (B_FALSE); 4113 } 4114 ill->ill_waiters++; 4115 mutex_exit(&ill->ill_lock); 4116 return (B_TRUE); 4117 } 4118 4119 void 4120 ill_waiter_dcr(ill_t *ill) 4121 { 4122 mutex_enter(&ill->ill_lock); 4123 ill->ill_waiters--; 4124 if (ill->ill_waiters == 0) 4125 cv_broadcast(&ill->ill_cv); 4126 mutex_exit(&ill->ill_lock); 4127 } 4128 4129 /* 4130 * ip_ll_subnet_defaults is called when we get the DL_INFO_ACK back from the 4131 * driver. We construct best guess defaults for lower level information that 4132 * we need. If an interface is brought up without injection of any overriding 4133 * information from outside, we have to be ready to go with these defaults. 4134 * When we get the first DL_INFO_ACK (from ip_open() sending a DL_INFO_REQ) 4135 * we primarely want the dl_provider_style. 4136 * The subsequent DL_INFO_ACK is received after doing a DL_ATTACH and DL_BIND 4137 * at which point we assume the other part of the information is valid. 4138 */ 4139 void 4140 ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) 4141 { 4142 uchar_t *brdcst_addr; 4143 uint_t brdcst_addr_length, phys_addr_length; 4144 t_scalar_t sap_length; 4145 dl_info_ack_t *dlia; 4146 ip_m_t *ipm; 4147 dl_qos_cl_sel1_t *sel1; 4148 int min_mtu; 4149 4150 ASSERT(IAM_WRITER_ILL(ill)); 4151 4152 /* 4153 * Till the ill is fully up the ill is not globally visible. 4154 * So no need for a lock. 4155 */ 4156 dlia = (dl_info_ack_t *)mp->b_rptr; 4157 ill->ill_mactype = dlia->dl_mac_type; 4158 4159 ipm = ip_m_lookup(dlia->dl_mac_type); 4160 if (ipm == NULL) { 4161 ipm = ip_m_lookup(DL_OTHER); 4162 ASSERT(ipm != NULL); 4163 } 4164 ill->ill_media = ipm; 4165 4166 /* 4167 * When the new DLPI stuff is ready we'll pull lengths 4168 * from dlia. 
4169 */ 4170 if (dlia->dl_version == DL_VERSION_2) { 4171 brdcst_addr_length = dlia->dl_brdcst_addr_length; 4172 brdcst_addr = mi_offset_param(mp, dlia->dl_brdcst_addr_offset, 4173 brdcst_addr_length); 4174 if (brdcst_addr == NULL) { 4175 brdcst_addr_length = 0; 4176 } 4177 sap_length = dlia->dl_sap_length; 4178 phys_addr_length = dlia->dl_addr_length - ABS(sap_length); 4179 ip1dbg(("ip: bcast_len %d, sap_len %d, phys_len %d\n", 4180 brdcst_addr_length, sap_length, phys_addr_length)); 4181 } else { 4182 brdcst_addr_length = 6; 4183 brdcst_addr = ip_six_byte_all_ones; 4184 sap_length = -2; 4185 phys_addr_length = brdcst_addr_length; 4186 } 4187 4188 ill->ill_bcast_addr_length = brdcst_addr_length; 4189 ill->ill_phys_addr_length = phys_addr_length; 4190 ill->ill_sap_length = sap_length; 4191 4192 /* 4193 * Synthetic DLPI types such as SUNW_DL_IPMP specify a zero SDU, 4194 * but we must ensure a minimum IP MTU is used since other bits of 4195 * IP will fly apart otherwise. 4196 */ 4197 min_mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU; 4198 ill->ill_max_frag = MAX(min_mtu, dlia->dl_max_sdu); 4199 ill->ill_current_frag = ill->ill_max_frag; 4200 ill->ill_mtu = ill->ill_max_frag; 4201 4202 ill->ill_type = ipm->ip_m_type; 4203 4204 if (!ill->ill_dlpi_style_set) { 4205 if (dlia->dl_provider_style == DL_STYLE2) 4206 ill->ill_needs_attach = 1; 4207 4208 phyint_flags_init(ill->ill_phyint, ill->ill_mactype); 4209 4210 /* 4211 * Allocate the first ipif on this ill. We don't delay it 4212 * further as ioctl handling assumes at least one ipif exists. 4213 * 4214 * At this point we don't know whether the ill is v4 or v6. 4215 * We will know this whan the SIOCSLIFNAME happens and 4216 * the correct value for ill_isv6 will be assigned in 4217 * ipif_set_values(). We need to hold the ill lock and 4218 * clear the ILL_LL_SUBNET_PENDING flag and atomically do 4219 * the wakeup. 
4220 */ 4221 (void) ipif_allocate(ill, 0, IRE_LOCAL, 4222 dlia->dl_provider_style != DL_STYLE2, B_TRUE, NULL); 4223 mutex_enter(&ill->ill_lock); 4224 ASSERT(ill->ill_dlpi_style_set == 0); 4225 ill->ill_dlpi_style_set = 1; 4226 ill->ill_state_flags &= ~ILL_LL_SUBNET_PENDING; 4227 cv_broadcast(&ill->ill_cv); 4228 mutex_exit(&ill->ill_lock); 4229 freemsg(mp); 4230 return; 4231 } 4232 ASSERT(ill->ill_ipif != NULL); 4233 /* 4234 * We know whether it is IPv4 or IPv6 now, as this is the 4235 * second DL_INFO_ACK we are recieving in response to the 4236 * DL_INFO_REQ sent in ipif_set_values. 4237 */ 4238 ill->ill_sap = (ill->ill_isv6) ? ipm->ip_m_ipv6sap : ipm->ip_m_ipv4sap; 4239 /* 4240 * Clear all the flags that were set based on ill_bcast_addr_length 4241 * and ill_phys_addr_length (in ipif_set_values) as these could have 4242 * changed now and we need to re-evaluate. 4243 */ 4244 ill->ill_flags &= ~(ILLF_MULTICAST | ILLF_NONUD | ILLF_NOARP); 4245 ill->ill_ipif->ipif_flags &= ~(IPIF_BROADCAST | IPIF_POINTOPOINT); 4246 4247 /* 4248 * Free ill_bcast_mp as things could have changed now. 4249 * 4250 * NOTE: The IPMP meta-interface is special-cased because it starts 4251 * with no underlying interfaces (and thus an unknown broadcast 4252 * address length), but we enforce that an interface is broadcast- 4253 * capable as part of allowing it to join a group. 4254 */ 4255 if (ill->ill_bcast_addr_length == 0 && !IS_IPMP(ill)) { 4256 if (ill->ill_bcast_mp != NULL) 4257 freemsg(ill->ill_bcast_mp); 4258 ill->ill_net_type = IRE_IF_NORESOLVER; 4259 4260 ill->ill_bcast_mp = ill_dlur_gen(NULL, 4261 ill->ill_phys_addr_length, 4262 ill->ill_sap, 4263 ill->ill_sap_length); 4264 4265 if (ill->ill_isv6) 4266 /* 4267 * Note: xresolv interfaces will eventually need NOARP 4268 * set here as well, but that will require those 4269 * external resolvers to have some knowledge of 4270 * that flag and act appropriately. Not to be changed 4271 * at present. 
4272 */ 4273 ill->ill_flags |= ILLF_NONUD; 4274 else 4275 ill->ill_flags |= ILLF_NOARP; 4276 4277 if (ill->ill_mactype == SUNW_DL_VNI) { 4278 ill->ill_ipif->ipif_flags |= IPIF_NOXMIT; 4279 } else if (ill->ill_phys_addr_length == 0 || 4280 ill->ill_mactype == DL_IPV4 || 4281 ill->ill_mactype == DL_IPV6) { 4282 /* 4283 * The underying link is point-to-point, so mark the 4284 * interface as such. We can do IP multicast over 4285 * such a link since it transmits all network-layer 4286 * packets to the remote side the same way. 4287 */ 4288 ill->ill_flags |= ILLF_MULTICAST; 4289 ill->ill_ipif->ipif_flags |= IPIF_POINTOPOINT; 4290 } 4291 } else { 4292 ill->ill_net_type = IRE_IF_RESOLVER; 4293 if (ill->ill_bcast_mp != NULL) 4294 freemsg(ill->ill_bcast_mp); 4295 ill->ill_bcast_mp = ill_dlur_gen(brdcst_addr, 4296 ill->ill_bcast_addr_length, ill->ill_sap, 4297 ill->ill_sap_length); 4298 /* 4299 * Later detect lack of DLPI driver multicast 4300 * capability by catching DL_ENABMULTI errors in 4301 * ip_rput_dlpi. 4302 */ 4303 ill->ill_flags |= ILLF_MULTICAST; 4304 if (!ill->ill_isv6) 4305 ill->ill_ipif->ipif_flags |= IPIF_BROADCAST; 4306 } 4307 4308 /* For IPMP, PHYI_IPMP should already be set by phyint_flags_init() */ 4309 if (ill->ill_mactype == SUNW_DL_IPMP) 4310 ASSERT(ill->ill_phyint->phyint_flags & PHYI_IPMP); 4311 4312 /* By default an interface does not support any CoS marking */ 4313 ill->ill_flags &= ~ILLF_COS_ENABLED; 4314 4315 /* 4316 * If we get QoS information in DL_INFO_ACK, the device supports 4317 * some form of CoS marking, set ILLF_COS_ENABLED. 4318 */ 4319 sel1 = (dl_qos_cl_sel1_t *)mi_offset_param(mp, dlia->dl_qos_offset, 4320 dlia->dl_qos_length); 4321 if ((sel1 != NULL) && (sel1->dl_qos_type == DL_QOS_CL_SEL1)) { 4322 ill->ill_flags |= ILLF_COS_ENABLED; 4323 } 4324 4325 /* Clear any previous error indication. 
*/ 4326 ill->ill_error = 0; 4327 freemsg(mp); 4328 } 4329 4330 /* 4331 * Perform various checks to verify that an address would make sense as a 4332 * local, remote, or subnet interface address. 4333 */ 4334 static boolean_t 4335 ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask) 4336 { 4337 ipaddr_t net_mask; 4338 4339 /* 4340 * Don't allow all zeroes, or all ones, but allow 4341 * all ones netmask. 4342 */ 4343 if ((net_mask = ip_net_mask(addr)) == 0) 4344 return (B_FALSE); 4345 /* A given netmask overrides the "guess" netmask */ 4346 if (subnet_mask != 0) 4347 net_mask = subnet_mask; 4348 if ((net_mask != ~(ipaddr_t)0) && ((addr == (addr & net_mask)) || 4349 (addr == (addr | ~net_mask)))) { 4350 return (B_FALSE); 4351 } 4352 4353 /* 4354 * Even if the netmask is all ones, we do not allow address to be 4355 * 255.255.255.255 4356 */ 4357 if (addr == INADDR_BROADCAST) 4358 return (B_FALSE); 4359 4360 if (CLASSD(addr)) 4361 return (B_FALSE); 4362 4363 return (B_TRUE); 4364 } 4365 4366 #define V6_IPIF_LINKLOCAL(p) \ 4367 IN6_IS_ADDR_LINKLOCAL(&(p)->ipif_v6lcl_addr) 4368 4369 /* 4370 * Compare two given ipifs and check if the second one is better than 4371 * the first one using the order of preference (not taking deprecated 4372 * into acount) specified in ipif_lookup_multicast(). 4373 */ 4374 static boolean_t 4375 ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, boolean_t isv6) 4376 { 4377 /* Check the least preferred first. */ 4378 if (IS_LOOPBACK(old_ipif->ipif_ill)) { 4379 /* If both ipifs are the same, use the first one. */ 4380 if (IS_LOOPBACK(new_ipif->ipif_ill)) 4381 return (B_FALSE); 4382 else 4383 return (B_TRUE); 4384 } 4385 4386 /* For IPv6, check for link local address. */ 4387 if (isv6 && V6_IPIF_LINKLOCAL(old_ipif)) { 4388 if (IS_LOOPBACK(new_ipif->ipif_ill) || 4389 V6_IPIF_LINKLOCAL(new_ipif)) { 4390 /* The second one is equal or less preferred. 
*/ 4391 return (B_FALSE); 4392 } else { 4393 return (B_TRUE); 4394 } 4395 } 4396 4397 /* Then check for point to point interface. */ 4398 if (old_ipif->ipif_flags & IPIF_POINTOPOINT) { 4399 if (IS_LOOPBACK(new_ipif->ipif_ill) || 4400 (isv6 && V6_IPIF_LINKLOCAL(new_ipif)) || 4401 (new_ipif->ipif_flags & IPIF_POINTOPOINT)) { 4402 return (B_FALSE); 4403 } else { 4404 return (B_TRUE); 4405 } 4406 } 4407 4408 /* old_ipif is a normal interface, so no need to use the new one. */ 4409 return (B_FALSE); 4410 } 4411 4412 /* 4413 * Find a mulitcast-capable ipif given an IP instance and zoneid. 4414 * The ipif must be up, and its ill must multicast-capable, not 4415 * condemned, not an underlying interface in an IPMP group, and 4416 * not a VNI interface. Order of preference: 4417 * 4418 * 1a. normal 4419 * 1b. normal, but deprecated 4420 * 2a. point to point 4421 * 2b. point to point, but deprecated 4422 * 3a. link local 4423 * 3b. link local, but deprecated 4424 * 4. loopback. 4425 */ 4426 static ipif_t * 4427 ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) 4428 { 4429 ill_t *ill; 4430 ill_walk_context_t ctx; 4431 ipif_t *ipif; 4432 ipif_t *saved_ipif = NULL; 4433 ipif_t *dep_ipif = NULL; 4434 4435 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4436 if (isv6) 4437 ill = ILL_START_WALK_V6(&ctx, ipst); 4438 else 4439 ill = ILL_START_WALK_V4(&ctx, ipst); 4440 4441 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 4442 mutex_enter(&ill->ill_lock); 4443 if (IS_VNI(ill) || IS_UNDER_IPMP(ill) || 4444 ILL_IS_CONDEMNED(ill) || 4445 !(ill->ill_flags & ILLF_MULTICAST)) { 4446 mutex_exit(&ill->ill_lock); 4447 continue; 4448 } 4449 for (ipif = ill->ill_ipif; ipif != NULL; 4450 ipif = ipif->ipif_next) { 4451 if (zoneid != ipif->ipif_zoneid && 4452 zoneid != ALL_ZONES && 4453 ipif->ipif_zoneid != ALL_ZONES) { 4454 continue; 4455 } 4456 if (!(ipif->ipif_flags & IPIF_UP) || 4457 IPIF_IS_CONDEMNED(ipif)) { 4458 continue; 4459 } 4460 4461 /* 4462 * Found one candidate. 
If it is deprecated, 4463 * remember it in dep_ipif. If it is not deprecated, 4464 * remember it in saved_ipif. 4465 */ 4466 if (ipif->ipif_flags & IPIF_DEPRECATED) { 4467 if (dep_ipif == NULL) { 4468 dep_ipif = ipif; 4469 } else if (ipif_comp_multi(dep_ipif, ipif, 4470 isv6)) { 4471 /* 4472 * If the previous dep_ipif does not 4473 * belong to the same ill, we've done 4474 * a ipif_refhold() on it. So we need 4475 * to release it. 4476 */ 4477 if (dep_ipif->ipif_ill != ill) 4478 ipif_refrele(dep_ipif); 4479 dep_ipif = ipif; 4480 } 4481 continue; 4482 } 4483 if (saved_ipif == NULL) { 4484 saved_ipif = ipif; 4485 } else { 4486 if (ipif_comp_multi(saved_ipif, ipif, isv6)) { 4487 if (saved_ipif->ipif_ill != ill) 4488 ipif_refrele(saved_ipif); 4489 saved_ipif = ipif; 4490 } 4491 } 4492 } 4493 /* 4494 * Before going to the next ill, do a ipif_refhold() on the 4495 * saved ones. 4496 */ 4497 if (saved_ipif != NULL && saved_ipif->ipif_ill == ill) 4498 ipif_refhold_locked(saved_ipif); 4499 if (dep_ipif != NULL && dep_ipif->ipif_ill == ill) 4500 ipif_refhold_locked(dep_ipif); 4501 mutex_exit(&ill->ill_lock); 4502 } 4503 rw_exit(&ipst->ips_ill_g_lock); 4504 4505 /* 4506 * If we have only the saved_ipif, return it. But if we have both 4507 * saved_ipif and dep_ipif, check to see which one is better. 
4508 */ 4509 if (saved_ipif != NULL) { 4510 if (dep_ipif != NULL) { 4511 if (ipif_comp_multi(saved_ipif, dep_ipif, isv6)) { 4512 ipif_refrele(saved_ipif); 4513 return (dep_ipif); 4514 } else { 4515 ipif_refrele(dep_ipif); 4516 return (saved_ipif); 4517 } 4518 } 4519 return (saved_ipif); 4520 } else { 4521 return (dep_ipif); 4522 } 4523 } 4524 4525 ill_t * 4526 ill_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) 4527 { 4528 ipif_t *ipif; 4529 ill_t *ill; 4530 4531 ipif = ipif_lookup_multicast(ipst, zoneid, isv6); 4532 if (ipif == NULL) 4533 return (NULL); 4534 4535 ill = ipif->ipif_ill; 4536 ill_refhold(ill); 4537 ipif_refrele(ipif); 4538 return (ill); 4539 } 4540 4541 /* 4542 * This function is called when an application does not specify an interface 4543 * to be used for multicast traffic (joining a group/sending data). It 4544 * calls ire_lookup_multi() to look for an interface route for the 4545 * specified multicast group. Doing this allows the administrator to add 4546 * prefix routes for multicast to indicate which interface to be used for 4547 * multicast traffic in the above scenario. The route could be for all 4548 * multicast (224.0/4), for a single multicast group (a /32 route) or 4549 * anything in between. If there is no such multicast route, we just find 4550 * any multicast capable interface and return it. The returned ipif 4551 * is refhold'ed. 4552 * 4553 * We support MULTIRT and RTF_SETSRC on the multicast routes added to the 4554 * unicast table. This is used by CGTP. 4555 */ 4556 ill_t * 4557 ill_lookup_group_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst, 4558 boolean_t *multirtp, ipaddr_t *setsrcp) 4559 { 4560 ill_t *ill; 4561 4562 ill = ire_lookup_multi_ill_v4(group, zoneid, ipst, multirtp, setsrcp); 4563 if (ill != NULL) 4564 return (ill); 4565 4566 return (ill_lookup_multicast(ipst, zoneid, B_FALSE)); 4567 } 4568 4569 /* 4570 * Look for an ipif with the specified interface address and destination. 
4571 * The destination address is used only for matching point-to-point interfaces. 4572 */ 4573 ipif_t * 4574 ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, ip_stack_t *ipst) 4575 { 4576 ipif_t *ipif; 4577 ill_t *ill; 4578 ill_walk_context_t ctx; 4579 4580 /* 4581 * First match all the point-to-point interfaces 4582 * before looking at non-point-to-point interfaces. 4583 * This is done to avoid returning non-point-to-point 4584 * ipif instead of unnumbered point-to-point ipif. 4585 */ 4586 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4587 ill = ILL_START_WALK_V4(&ctx, ipst); 4588 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 4589 mutex_enter(&ill->ill_lock); 4590 for (ipif = ill->ill_ipif; ipif != NULL; 4591 ipif = ipif->ipif_next) { 4592 /* Allow the ipif to be down */ 4593 if ((ipif->ipif_flags & IPIF_POINTOPOINT) && 4594 (ipif->ipif_lcl_addr == if_addr) && 4595 (ipif->ipif_pp_dst_addr == dst)) { 4596 if (!IPIF_IS_CONDEMNED(ipif)) { 4597 ipif_refhold_locked(ipif); 4598 mutex_exit(&ill->ill_lock); 4599 rw_exit(&ipst->ips_ill_g_lock); 4600 return (ipif); 4601 } 4602 } 4603 } 4604 mutex_exit(&ill->ill_lock); 4605 } 4606 rw_exit(&ipst->ips_ill_g_lock); 4607 4608 /* lookup the ipif based on interface address */ 4609 ipif = ipif_lookup_addr(if_addr, NULL, ALL_ZONES, ipst); 4610 ASSERT(ipif == NULL || !ipif->ipif_isv6); 4611 return (ipif); 4612 } 4613 4614 /* 4615 * Common function for ipif_lookup_addr() and ipif_lookup_addr_exact(). 4616 */ 4617 static ipif_t * 4618 ipif_lookup_addr_common(ipaddr_t addr, ill_t *match_ill, uint32_t match_flags, 4619 zoneid_t zoneid, ip_stack_t *ipst) 4620 { 4621 ipif_t *ipif; 4622 ill_t *ill; 4623 boolean_t ptp = B_FALSE; 4624 ill_walk_context_t ctx; 4625 boolean_t match_illgrp = (match_flags & IPIF_MATCH_ILLGRP); 4626 boolean_t no_duplicate = (match_flags & IPIF_MATCH_NONDUP); 4627 4628 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4629 /* 4630 * Repeat twice, first based on local addresses and 4631 * next time for pointopoint. 
4632 */ 4633 repeat: 4634 ill = ILL_START_WALK_V4(&ctx, ipst); 4635 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 4636 if (match_ill != NULL && ill != match_ill && 4637 (!match_illgrp || !IS_IN_SAME_ILLGRP(ill, match_ill))) { 4638 continue; 4639 } 4640 mutex_enter(&ill->ill_lock); 4641 for (ipif = ill->ill_ipif; ipif != NULL; 4642 ipif = ipif->ipif_next) { 4643 if (zoneid != ALL_ZONES && 4644 zoneid != ipif->ipif_zoneid && 4645 ipif->ipif_zoneid != ALL_ZONES) 4646 continue; 4647 4648 if (no_duplicate && !(ipif->ipif_flags & IPIF_UP)) 4649 continue; 4650 4651 /* Allow the ipif to be down */ 4652 if ((!ptp && (ipif->ipif_lcl_addr == addr) && 4653 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || 4654 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) && 4655 (ipif->ipif_pp_dst_addr == addr))) { 4656 if (!IPIF_IS_CONDEMNED(ipif)) { 4657 ipif_refhold_locked(ipif); 4658 mutex_exit(&ill->ill_lock); 4659 rw_exit(&ipst->ips_ill_g_lock); 4660 return (ipif); 4661 } 4662 } 4663 } 4664 mutex_exit(&ill->ill_lock); 4665 } 4666 4667 /* If we already did the ptp case, then we are done */ 4668 if (ptp) { 4669 rw_exit(&ipst->ips_ill_g_lock); 4670 return (NULL); 4671 } 4672 ptp = B_TRUE; 4673 goto repeat; 4674 } 4675 4676 /* 4677 * Lookup an ipif with the specified address. For point-to-point links we 4678 * look for matches on either the destination address or the local address, 4679 * but we skip the local address check if IPIF_UNNUMBERED is set. If the 4680 * `match_ill' argument is non-NULL, the lookup is restricted to that ill 4681 * (or illgrp if `match_ill' is in an IPMP group). 4682 */ 4683 ipif_t * 4684 ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, 4685 ip_stack_t *ipst) 4686 { 4687 return (ipif_lookup_addr_common(addr, match_ill, IPIF_MATCH_ILLGRP, 4688 zoneid, ipst)); 4689 } 4690 4691 /* 4692 * Lookup an ipif with the specified address. 
Similar to ipif_lookup_addr, 4693 * except that we will only return an address if it is not marked as 4694 * IPIF_DUPLICATE 4695 */ 4696 ipif_t * 4697 ipif_lookup_addr_nondup(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, 4698 ip_stack_t *ipst) 4699 { 4700 return (ipif_lookup_addr_common(addr, match_ill, 4701 (IPIF_MATCH_ILLGRP | IPIF_MATCH_NONDUP), 4702 zoneid, ipst)); 4703 } 4704 4705 /* 4706 * Special abbreviated version of ipif_lookup_addr() that doesn't match 4707 * `match_ill' across the IPMP group. This function is only needed in some 4708 * corner-cases; almost everything should use ipif_lookup_addr(). 4709 */ 4710 ipif_t * 4711 ipif_lookup_addr_exact(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst) 4712 { 4713 ASSERT(match_ill != NULL); 4714 return (ipif_lookup_addr_common(addr, match_ill, 0, ALL_ZONES, 4715 ipst)); 4716 } 4717 4718 /* 4719 * Look for an ipif with the specified address. For point-point links 4720 * we look for matches on either the destination address and the local 4721 * address, but we ignore the check on the local address if IPIF_UNNUMBERED 4722 * is set. 4723 * If the `match_ill' argument is non-NULL, the lookup is restricted to that 4724 * ill (or illgrp if `match_ill' is in an IPMP group). 4725 * Return the zoneid for the ipif which matches. ALL_ZONES if no match. 4726 */ 4727 zoneid_t 4728 ipif_lookup_addr_zoneid(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst) 4729 { 4730 zoneid_t zoneid; 4731 ipif_t *ipif; 4732 ill_t *ill; 4733 boolean_t ptp = B_FALSE; 4734 ill_walk_context_t ctx; 4735 4736 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4737 /* 4738 * Repeat twice, first based on local addresses and 4739 * next time for pointopoint. 
4740 */ 4741 repeat: 4742 ill = ILL_START_WALK_V4(&ctx, ipst); 4743 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 4744 if (match_ill != NULL && ill != match_ill && 4745 !IS_IN_SAME_ILLGRP(ill, match_ill)) { 4746 continue; 4747 } 4748 mutex_enter(&ill->ill_lock); 4749 for (ipif = ill->ill_ipif; ipif != NULL; 4750 ipif = ipif->ipif_next) { 4751 /* Allow the ipif to be down */ 4752 if ((!ptp && (ipif->ipif_lcl_addr == addr) && 4753 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || 4754 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) && 4755 (ipif->ipif_pp_dst_addr == addr)) && 4756 !(ipif->ipif_state_flags & IPIF_CONDEMNED)) { 4757 zoneid = ipif->ipif_zoneid; 4758 mutex_exit(&ill->ill_lock); 4759 rw_exit(&ipst->ips_ill_g_lock); 4760 /* 4761 * If ipif_zoneid was ALL_ZONES then we have 4762 * a trusted extensions shared IP address. 4763 * In that case GLOBAL_ZONEID works to send. 4764 */ 4765 if (zoneid == ALL_ZONES) 4766 zoneid = GLOBAL_ZONEID; 4767 return (zoneid); 4768 } 4769 } 4770 mutex_exit(&ill->ill_lock); 4771 } 4772 4773 /* If we already did the ptp case, then we are done */ 4774 if (ptp) { 4775 rw_exit(&ipst->ips_ill_g_lock); 4776 return (ALL_ZONES); 4777 } 4778 ptp = B_TRUE; 4779 goto repeat; 4780 } 4781 4782 /* 4783 * Look for an ipif that matches the specified remote address i.e. the 4784 * ipif that would receive the specified packet. 4785 * First look for directly connected interfaces and then do a recursive 4786 * IRE lookup and pick the first ipif corresponding to the source address in the 4787 * ire. 4788 * Returns: held ipif 4789 * 4790 * This is only used for ICMP_ADDRESS_MASK_REQUESTs 4791 */ 4792 ipif_t * 4793 ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid) 4794 { 4795 ipif_t *ipif; 4796 4797 ASSERT(!ill->ill_isv6); 4798 4799 /* 4800 * Someone could be changing this ipif currently or change it 4801 * after we return this. Thus a few packets could use the old 4802 * old values. 
However structure updates/creates (ire, ilg, ilm etc) 4803 * will atomically be updated or cleaned up with the new value 4804 * Thus we don't need a lock to check the flags or other attrs below. 4805 */ 4806 mutex_enter(&ill->ill_lock); 4807 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 4808 if (IPIF_IS_CONDEMNED(ipif)) 4809 continue; 4810 if (zoneid != ALL_ZONES && zoneid != ipif->ipif_zoneid && 4811 ipif->ipif_zoneid != ALL_ZONES) 4812 continue; 4813 /* Allow the ipif to be down */ 4814 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 4815 if ((ipif->ipif_pp_dst_addr == addr) || 4816 (!(ipif->ipif_flags & IPIF_UNNUMBERED) && 4817 ipif->ipif_lcl_addr == addr)) { 4818 ipif_refhold_locked(ipif); 4819 mutex_exit(&ill->ill_lock); 4820 return (ipif); 4821 } 4822 } else if (ipif->ipif_subnet == (addr & ipif->ipif_net_mask)) { 4823 ipif_refhold_locked(ipif); 4824 mutex_exit(&ill->ill_lock); 4825 return (ipif); 4826 } 4827 } 4828 mutex_exit(&ill->ill_lock); 4829 /* 4830 * For a remote destination it isn't possible to nail down a particular 4831 * ipif. 4832 */ 4833 4834 /* Pick the first interface */ 4835 ipif = ipif_get_next_ipif(NULL, ill); 4836 return (ipif); 4837 } 4838 4839 /* 4840 * This func does not prevent refcnt from increasing. 
But if 4841 * the caller has taken steps to that effect, then this func 4842 * can be used to determine whether the ill has become quiescent 4843 */ 4844 static boolean_t 4845 ill_is_quiescent(ill_t *ill) 4846 { 4847 ipif_t *ipif; 4848 4849 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4850 4851 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 4852 if (ipif->ipif_refcnt != 0) 4853 return (B_FALSE); 4854 } 4855 if (!ILL_DOWN_OK(ill) || ill->ill_refcnt != 0) { 4856 return (B_FALSE); 4857 } 4858 return (B_TRUE); 4859 } 4860 4861 boolean_t 4862 ill_is_freeable(ill_t *ill) 4863 { 4864 ipif_t *ipif; 4865 4866 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4867 4868 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 4869 if (ipif->ipif_refcnt != 0) { 4870 return (B_FALSE); 4871 } 4872 } 4873 if (!ILL_FREE_OK(ill) || ill->ill_refcnt != 0) { 4874 return (B_FALSE); 4875 } 4876 return (B_TRUE); 4877 } 4878 4879 /* 4880 * This func does not prevent refcnt from increasing. But if 4881 * the caller has taken steps to that effect, then this func 4882 * can be used to determine whether the ipif has become quiescent 4883 */ 4884 static boolean_t 4885 ipif_is_quiescent(ipif_t *ipif) 4886 { 4887 ill_t *ill; 4888 4889 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 4890 4891 if (ipif->ipif_refcnt != 0) 4892 return (B_FALSE); 4893 4894 ill = ipif->ipif_ill; 4895 if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 || 4896 ill->ill_logical_down) { 4897 return (B_TRUE); 4898 } 4899 4900 /* This is the last ipif going down or being deleted on this ill */ 4901 if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0) { 4902 return (B_FALSE); 4903 } 4904 4905 return (B_TRUE); 4906 } 4907 4908 /* 4909 * return true if the ipif can be destroyed: the ipif has to be quiescent 4910 * with zero references from ire/ilm to it. 
4911 */ 4912 static boolean_t 4913 ipif_is_freeable(ipif_t *ipif) 4914 { 4915 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 4916 ASSERT(ipif->ipif_id != 0); 4917 return (ipif->ipif_refcnt == 0); 4918 } 4919 4920 /* 4921 * The ipif/ill/ire has been refreled. Do the tail processing. 4922 * Determine if the ipif or ill in question has become quiescent and if so 4923 * wakeup close and/or restart any queued pending ioctl that is waiting 4924 * for the ipif_down (or ill_down) 4925 */ 4926 void 4927 ipif_ill_refrele_tail(ill_t *ill) 4928 { 4929 mblk_t *mp; 4930 conn_t *connp; 4931 ipsq_t *ipsq; 4932 ipxop_t *ipx; 4933 ipif_t *ipif; 4934 dl_notify_ind_t *dlindp; 4935 4936 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4937 4938 if ((ill->ill_state_flags & ILL_CONDEMNED) && ill_is_freeable(ill)) { 4939 /* ip_modclose() may be waiting */ 4940 cv_broadcast(&ill->ill_cv); 4941 } 4942 4943 ipsq = ill->ill_phyint->phyint_ipsq; 4944 mutex_enter(&ipsq->ipsq_lock); 4945 ipx = ipsq->ipsq_xop; 4946 mutex_enter(&ipx->ipx_lock); 4947 if (ipx->ipx_waitfor == 0) /* no one's waiting; bail */ 4948 goto unlock; 4949 4950 ASSERT(ipx->ipx_pending_mp != NULL && ipx->ipx_pending_ipif != NULL); 4951 4952 ipif = ipx->ipx_pending_ipif; 4953 if (ipif->ipif_ill != ill) /* wait is for another ill; bail */ 4954 goto unlock; 4955 4956 switch (ipx->ipx_waitfor) { 4957 case IPIF_DOWN: 4958 if (!ipif_is_quiescent(ipif)) 4959 goto unlock; 4960 break; 4961 case IPIF_FREE: 4962 if (!ipif_is_freeable(ipif)) 4963 goto unlock; 4964 break; 4965 case ILL_DOWN: 4966 if (!ill_is_quiescent(ill)) 4967 goto unlock; 4968 break; 4969 case ILL_FREE: 4970 /* 4971 * ILL_FREE is only for loopback; normal ill teardown waits 4972 * synchronously in ip_modclose() without using ipx_waitfor, 4973 * handled by the cv_broadcast() at the top of this function. 
4974 */ 4975 if (!ill_is_freeable(ill)) 4976 goto unlock; 4977 break; 4978 default: 4979 cmn_err(CE_PANIC, "ipsq: %p unknown ipx_waitfor %d\n", 4980 (void *)ipsq, ipx->ipx_waitfor); 4981 } 4982 4983 ill_refhold_locked(ill); /* for qwriter_ip() call below */ 4984 mutex_exit(&ipx->ipx_lock); 4985 mp = ipsq_pending_mp_get(ipsq, &connp); 4986 mutex_exit(&ipsq->ipsq_lock); 4987 mutex_exit(&ill->ill_lock); 4988 4989 ASSERT(mp != NULL); 4990 /* 4991 * NOTE: all of the qwriter_ip() calls below use CUR_OP since 4992 * we can only get here when the current operation decides it 4993 * it needs to quiesce via ipsq_pending_mp_add(). 4994 */ 4995 switch (mp->b_datap->db_type) { 4996 case M_PCPROTO: 4997 case M_PROTO: 4998 /* 4999 * For now, only DL_NOTIFY_IND messages can use this facility. 5000 */ 5001 dlindp = (dl_notify_ind_t *)mp->b_rptr; 5002 ASSERT(dlindp->dl_primitive == DL_NOTIFY_IND); 5003 5004 switch (dlindp->dl_notification) { 5005 case DL_NOTE_PHYS_ADDR: 5006 qwriter_ip(ill, ill->ill_rq, mp, 5007 ill_set_phys_addr_tail, CUR_OP, B_TRUE); 5008 return; 5009 case DL_NOTE_REPLUMB: 5010 qwriter_ip(ill, ill->ill_rq, mp, 5011 ill_replumb_tail, CUR_OP, B_TRUE); 5012 return; 5013 default: 5014 ASSERT(0); 5015 ill_refrele(ill); 5016 } 5017 break; 5018 5019 case M_ERROR: 5020 case M_HANGUP: 5021 qwriter_ip(ill, ill->ill_rq, mp, ipif_all_down_tail, CUR_OP, 5022 B_TRUE); 5023 return; 5024 5025 case M_IOCTL: 5026 case M_IOCDATA: 5027 qwriter_ip(ill, (connp != NULL ? 
CONNP_TO_WQ(connp) : 5028 ill->ill_wq), mp, ip_reprocess_ioctl, CUR_OP, B_TRUE); 5029 return; 5030 5031 default: 5032 cmn_err(CE_PANIC, "ipif_ill_refrele_tail mp %p " 5033 "db_type %d\n", (void *)mp, mp->b_datap->db_type); 5034 } 5035 return; 5036 unlock: 5037 mutex_exit(&ipsq->ipsq_lock); 5038 mutex_exit(&ipx->ipx_lock); 5039 mutex_exit(&ill->ill_lock); 5040 } 5041 5042 #ifdef DEBUG 5043 /* Reuse trace buffer from beginning (if reached the end) and record trace */ 5044 static void 5045 th_trace_rrecord(th_trace_t *th_trace) 5046 { 5047 tr_buf_t *tr_buf; 5048 uint_t lastref; 5049 5050 lastref = th_trace->th_trace_lastref; 5051 lastref++; 5052 if (lastref == TR_BUF_MAX) 5053 lastref = 0; 5054 th_trace->th_trace_lastref = lastref; 5055 tr_buf = &th_trace->th_trbuf[lastref]; 5056 tr_buf->tr_time = ddi_get_lbolt(); 5057 tr_buf->tr_depth = getpcstack(tr_buf->tr_stack, TR_STACK_DEPTH); 5058 } 5059 5060 static void 5061 th_trace_free(void *value) 5062 { 5063 th_trace_t *th_trace = value; 5064 5065 ASSERT(th_trace->th_refcnt == 0); 5066 kmem_free(th_trace, sizeof (*th_trace)); 5067 } 5068 5069 /* 5070 * Find or create the per-thread hash table used to track object references. 5071 * The ipst argument is NULL if we shouldn't allocate. 5072 * 5073 * Accesses per-thread data, so there's no need to lock here. 5074 */ 5075 static mod_hash_t * 5076 th_trace_gethash(ip_stack_t *ipst) 5077 { 5078 th_hash_t *thh; 5079 5080 if ((thh = tsd_get(ip_thread_data)) == NULL && ipst != NULL) { 5081 mod_hash_t *mh; 5082 char name[256]; 5083 size_t objsize, rshift; 5084 int retv; 5085 5086 if ((thh = kmem_alloc(sizeof (*thh), KM_NOSLEEP)) == NULL) 5087 return (NULL); 5088 (void) snprintf(name, sizeof (name), "th_trace_%p", 5089 (void *)curthread); 5090 5091 /* 5092 * We use mod_hash_create_extended here rather than the more 5093 * obvious mod_hash_create_ptrhash because the latter has a 5094 * hard-coded KM_SLEEP, and we'd prefer to fail rather than 5095 * block. 
5096 */ 5097 objsize = MAX(MAX(sizeof (ill_t), sizeof (ipif_t)), 5098 MAX(sizeof (ire_t), sizeof (ncec_t))); 5099 rshift = highbit(objsize); 5100 mh = mod_hash_create_extended(name, 64, mod_hash_null_keydtor, 5101 th_trace_free, mod_hash_byptr, (void *)rshift, 5102 mod_hash_ptrkey_cmp, KM_NOSLEEP); 5103 if (mh == NULL) { 5104 kmem_free(thh, sizeof (*thh)); 5105 return (NULL); 5106 } 5107 thh->thh_hash = mh; 5108 thh->thh_ipst = ipst; 5109 /* 5110 * We trace ills, ipifs, ires, and nces. All of these are 5111 * per-IP-stack, so the lock on the thread list is as well. 5112 */ 5113 rw_enter(&ip_thread_rwlock, RW_WRITER); 5114 list_insert_tail(&ip_thread_list, thh); 5115 rw_exit(&ip_thread_rwlock); 5116 retv = tsd_set(ip_thread_data, thh); 5117 ASSERT(retv == 0); 5118 } 5119 return (thh != NULL ? thh->thh_hash : NULL); 5120 } 5121 5122 boolean_t 5123 th_trace_ref(const void *obj, ip_stack_t *ipst) 5124 { 5125 th_trace_t *th_trace; 5126 mod_hash_t *mh; 5127 mod_hash_val_t val; 5128 5129 if ((mh = th_trace_gethash(ipst)) == NULL) 5130 return (B_FALSE); 5131 5132 /* 5133 * Attempt to locate the trace buffer for this obj and thread. 5134 * If it does not exist, then allocate a new trace buffer and 5135 * insert into the hash. 
5136 */ 5137 if (mod_hash_find(mh, (mod_hash_key_t)obj, &val) == MH_ERR_NOTFOUND) { 5138 th_trace = kmem_zalloc(sizeof (th_trace_t), KM_NOSLEEP); 5139 if (th_trace == NULL) 5140 return (B_FALSE); 5141 5142 th_trace->th_id = curthread; 5143 if (mod_hash_insert(mh, (mod_hash_key_t)obj, 5144 (mod_hash_val_t)th_trace) != 0) { 5145 kmem_free(th_trace, sizeof (th_trace_t)); 5146 return (B_FALSE); 5147 } 5148 } else { 5149 th_trace = (th_trace_t *)val; 5150 } 5151 5152 ASSERT(th_trace->th_refcnt >= 0 && 5153 th_trace->th_refcnt < TR_BUF_MAX - 1); 5154 5155 th_trace->th_refcnt++; 5156 th_trace_rrecord(th_trace); 5157 return (B_TRUE); 5158 } 5159 5160 /* 5161 * For the purpose of tracing a reference release, we assume that global 5162 * tracing is always on and that the same thread initiated the reference hold 5163 * is releasing. 5164 */ 5165 void 5166 th_trace_unref(const void *obj) 5167 { 5168 int retv; 5169 mod_hash_t *mh; 5170 th_trace_t *th_trace; 5171 mod_hash_val_t val; 5172 5173 mh = th_trace_gethash(NULL); 5174 retv = mod_hash_find(mh, (mod_hash_key_t)obj, &val); 5175 ASSERT(retv == 0); 5176 th_trace = (th_trace_t *)val; 5177 5178 ASSERT(th_trace->th_refcnt > 0); 5179 th_trace->th_refcnt--; 5180 th_trace_rrecord(th_trace); 5181 } 5182 5183 /* 5184 * If tracing has been disabled, then we assume that the reference counts are 5185 * now useless, and we clear them out before destroying the entries. 
5186 */ 5187 void 5188 th_trace_cleanup(const void *obj, boolean_t trace_disable) 5189 { 5190 th_hash_t *thh; 5191 mod_hash_t *mh; 5192 mod_hash_val_t val; 5193 th_trace_t *th_trace; 5194 int retv; 5195 5196 rw_enter(&ip_thread_rwlock, RW_READER); 5197 for (thh = list_head(&ip_thread_list); thh != NULL; 5198 thh = list_next(&ip_thread_list, thh)) { 5199 if (mod_hash_find(mh = thh->thh_hash, (mod_hash_key_t)obj, 5200 &val) == 0) { 5201 th_trace = (th_trace_t *)val; 5202 if (trace_disable) 5203 th_trace->th_refcnt = 0; 5204 retv = mod_hash_destroy(mh, (mod_hash_key_t)obj); 5205 ASSERT(retv == 0); 5206 } 5207 } 5208 rw_exit(&ip_thread_rwlock); 5209 } 5210 5211 void 5212 ipif_trace_ref(ipif_t *ipif) 5213 { 5214 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 5215 5216 if (ipif->ipif_trace_disable) 5217 return; 5218 5219 if (!th_trace_ref(ipif, ipif->ipif_ill->ill_ipst)) { 5220 ipif->ipif_trace_disable = B_TRUE; 5221 ipif_trace_cleanup(ipif); 5222 } 5223 } 5224 5225 void 5226 ipif_untrace_ref(ipif_t *ipif) 5227 { 5228 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 5229 5230 if (!ipif->ipif_trace_disable) 5231 th_trace_unref(ipif); 5232 } 5233 5234 void 5235 ill_trace_ref(ill_t *ill) 5236 { 5237 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5238 5239 if (ill->ill_trace_disable) 5240 return; 5241 5242 if (!th_trace_ref(ill, ill->ill_ipst)) { 5243 ill->ill_trace_disable = B_TRUE; 5244 ill_trace_cleanup(ill); 5245 } 5246 } 5247 5248 void 5249 ill_untrace_ref(ill_t *ill) 5250 { 5251 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5252 5253 if (!ill->ill_trace_disable) 5254 th_trace_unref(ill); 5255 } 5256 5257 /* 5258 * Called when ipif is unplumbed or when memory alloc fails. Note that on 5259 * failure, ipif_trace_disable is set. 5260 */ 5261 static void 5262 ipif_trace_cleanup(const ipif_t *ipif) 5263 { 5264 th_trace_cleanup(ipif, ipif->ipif_trace_disable); 5265 } 5266 5267 /* 5268 * Called when ill is unplumbed or when memory alloc fails. Note that on 5269 * failure, ill_trace_disable is set. 
5270 */ 5271 static void 5272 ill_trace_cleanup(const ill_t *ill) 5273 { 5274 th_trace_cleanup(ill, ill->ill_trace_disable); 5275 } 5276 #endif /* DEBUG */ 5277 5278 void 5279 ipif_refhold_locked(ipif_t *ipif) 5280 { 5281 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 5282 ipif->ipif_refcnt++; 5283 IPIF_TRACE_REF(ipif); 5284 } 5285 5286 void 5287 ipif_refhold(ipif_t *ipif) 5288 { 5289 ill_t *ill; 5290 5291 ill = ipif->ipif_ill; 5292 mutex_enter(&ill->ill_lock); 5293 ipif->ipif_refcnt++; 5294 IPIF_TRACE_REF(ipif); 5295 mutex_exit(&ill->ill_lock); 5296 } 5297 5298 /* 5299 * Must not be called while holding any locks. Otherwise if this is 5300 * the last reference to be released there is a chance of recursive mutex 5301 * panic due to ipif_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 5302 * to restart an ioctl. 5303 */ 5304 void 5305 ipif_refrele(ipif_t *ipif) 5306 { 5307 ill_t *ill; 5308 5309 ill = ipif->ipif_ill; 5310 5311 mutex_enter(&ill->ill_lock); 5312 ASSERT(ipif->ipif_refcnt != 0); 5313 ipif->ipif_refcnt--; 5314 IPIF_UNTRACE_REF(ipif); 5315 if (ipif->ipif_refcnt != 0) { 5316 mutex_exit(&ill->ill_lock); 5317 return; 5318 } 5319 5320 /* Drops the ill_lock */ 5321 ipif_ill_refrele_tail(ill); 5322 } 5323 5324 ipif_t * 5325 ipif_get_next_ipif(ipif_t *curr, ill_t *ill) 5326 { 5327 ipif_t *ipif; 5328 5329 mutex_enter(&ill->ill_lock); 5330 for (ipif = (curr == NULL ? 
ill->ill_ipif : curr->ipif_next); 5331 ipif != NULL; ipif = ipif->ipif_next) { 5332 if (IPIF_IS_CONDEMNED(ipif)) 5333 continue; 5334 ipif_refhold_locked(ipif); 5335 mutex_exit(&ill->ill_lock); 5336 return (ipif); 5337 } 5338 mutex_exit(&ill->ill_lock); 5339 return (NULL); 5340 } 5341 5342 /* 5343 * TODO: make this table extendible at run time 5344 * Return a pointer to the mac type info for 'mac_type' 5345 */ 5346 static ip_m_t * 5347 ip_m_lookup(t_uscalar_t mac_type) 5348 { 5349 ip_m_t *ipm; 5350 5351 for (ipm = ip_m_tbl; ipm < A_END(ip_m_tbl); ipm++) 5352 if (ipm->ip_m_mac_type == mac_type) 5353 return (ipm); 5354 return (NULL); 5355 } 5356 5357 /* 5358 * Make a link layer address from the multicast IP address *addr. 5359 * To form the link layer address, invoke the ip_m_v*mapping function 5360 * associated with the link-layer type. 5361 */ 5362 void 5363 ip_mcast_mapping(ill_t *ill, uchar_t *addr, uchar_t *hwaddr) 5364 { 5365 ip_m_t *ipm; 5366 5367 if (ill->ill_net_type == IRE_IF_NORESOLVER) 5368 return; 5369 5370 ASSERT(addr != NULL); 5371 5372 ipm = ip_m_lookup(ill->ill_mactype); 5373 if (ipm == NULL || 5374 (ill->ill_isv6 && ipm->ip_m_v6mapping == NULL) || 5375 (!ill->ill_isv6 && ipm->ip_m_v4mapping == NULL)) { 5376 ip0dbg(("no mapping for ill %s mactype 0x%x\n", 5377 ill->ill_name, ill->ill_mactype)); 5378 return; 5379 } 5380 if (ill->ill_isv6) 5381 (*ipm->ip_m_v6mapping)(ill, addr, hwaddr); 5382 else 5383 (*ipm->ip_m_v4mapping)(ill, addr, hwaddr); 5384 } 5385 5386 /* 5387 * ip_rt_add is called to add an IPv4 route to the forwarding table. 5388 * ill is passed in to associate it with the correct interface. 5389 * If ire_arg is set, then we return the held IRE in that location. 
5390 */ 5391 int 5392 ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, 5393 ipaddr_t src_addr, int flags, ill_t *ill, ire_t **ire_arg, 5394 boolean_t ioctl_msg, struct rtsa_s *sp, ip_stack_t *ipst, zoneid_t zoneid) 5395 { 5396 ire_t *ire, *nire; 5397 ire_t *gw_ire = NULL; 5398 ipif_t *ipif = NULL; 5399 uint_t type; 5400 int match_flags = MATCH_IRE_TYPE; 5401 tsol_gc_t *gc = NULL; 5402 tsol_gcgrp_t *gcgrp = NULL; 5403 boolean_t gcgrp_xtraref = B_FALSE; 5404 boolean_t cgtp_broadcast; 5405 5406 ip1dbg(("ip_rt_add:")); 5407 5408 if (ire_arg != NULL) 5409 *ire_arg = NULL; 5410 5411 /* 5412 * If this is the case of RTF_HOST being set, then we set the netmask 5413 * to all ones (regardless if one was supplied). 5414 */ 5415 if (flags & RTF_HOST) 5416 mask = IP_HOST_MASK; 5417 5418 /* 5419 * Prevent routes with a zero gateway from being created (since 5420 * interfaces can currently be plumbed and brought up no assigned 5421 * address). 5422 */ 5423 if (gw_addr == 0) 5424 return (ENETUNREACH); 5425 /* 5426 * Get the ipif, if any, corresponding to the gw_addr 5427 * If -ifp was specified we restrict ourselves to the ill, otherwise 5428 * we match on the gatway and destination to handle unnumbered pt-pt 5429 * interfaces. 5430 */ 5431 if (ill != NULL) 5432 ipif = ipif_lookup_addr(gw_addr, ill, ALL_ZONES, ipst); 5433 else 5434 ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst); 5435 if (ipif != NULL) { 5436 if (IS_VNI(ipif->ipif_ill)) { 5437 ipif_refrele(ipif); 5438 return (EINVAL); 5439 } 5440 } 5441 5442 /* 5443 * GateD will attempt to create routes with a loopback interface 5444 * address as the gateway and with RTF_GATEWAY set. We allow 5445 * these routes to be added, but create them as interface routes 5446 * since the gateway is an interface address. 
5447 */ 5448 if ((ipif != NULL) && (ipif->ipif_ire_type == IRE_LOOPBACK)) { 5449 flags &= ~RTF_GATEWAY; 5450 if (gw_addr == INADDR_LOOPBACK && dst_addr == INADDR_LOOPBACK && 5451 mask == IP_HOST_MASK) { 5452 ire = ire_ftable_lookup_v4(dst_addr, 0, 0, IRE_LOOPBACK, 5453 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst, 5454 NULL); 5455 if (ire != NULL) { 5456 ire_refrele(ire); 5457 ipif_refrele(ipif); 5458 return (EEXIST); 5459 } 5460 ip1dbg(("ip_rt_add: 0x%p creating IRE 0x%x" 5461 "for 0x%x\n", (void *)ipif, 5462 ipif->ipif_ire_type, 5463 ntohl(ipif->ipif_lcl_addr))); 5464 ire = ire_create( 5465 (uchar_t *)&dst_addr, /* dest address */ 5466 (uchar_t *)&mask, /* mask */ 5467 NULL, /* no gateway */ 5468 ipif->ipif_ire_type, /* LOOPBACK */ 5469 ipif->ipif_ill, 5470 zoneid, 5471 (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE : 0, 5472 NULL, 5473 ipst); 5474 5475 if (ire == NULL) { 5476 ipif_refrele(ipif); 5477 return (ENOMEM); 5478 } 5479 /* src address assigned by the caller? */ 5480 if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC)) 5481 ire->ire_setsrc_addr = src_addr; 5482 5483 nire = ire_add(ire); 5484 if (nire == NULL) { 5485 /* 5486 * In the result of failure, ire_add() will have 5487 * already deleted the ire in question, so there 5488 * is no need to do that here. 5489 */ 5490 ipif_refrele(ipif); 5491 return (ENOMEM); 5492 } 5493 /* 5494 * Check if it was a duplicate entry. This handles 5495 * the case of two racing route adds for the same route 5496 */ 5497 if (nire != ire) { 5498 ASSERT(nire->ire_identical_ref > 1); 5499 ire_delete(nire); 5500 ire_refrele(nire); 5501 ipif_refrele(ipif); 5502 return (EEXIST); 5503 } 5504 ire = nire; 5505 goto save_ire; 5506 } 5507 } 5508 5509 /* 5510 * The routes for multicast with CGTP are quite special in that 5511 * the gateway is the local interface address, yet RTF_GATEWAY 5512 * is set. We turn off RTF_GATEWAY to provide compatibility with 5513 * this undocumented and unusual use of multicast routes. 
5514 */ 5515 if ((flags & RTF_MULTIRT) && ipif != NULL) 5516 flags &= ~RTF_GATEWAY; 5517 5518 /* 5519 * Traditionally, interface routes are ones where RTF_GATEWAY isn't set 5520 * and the gateway address provided is one of the system's interface 5521 * addresses. By using the routing socket interface and supplying an 5522 * RTA_IFP sockaddr with an interface index, an alternate method of 5523 * specifying an interface route to be created is available which uses 5524 * the interface index that specifies the outgoing interface rather than 5525 * the address of an outgoing interface (which may not be able to 5526 * uniquely identify an interface). When coupled with the RTF_GATEWAY 5527 * flag, routes can be specified which not only specify the next-hop to 5528 * be used when routing to a certain prefix, but also which outgoing 5529 * interface should be used. 5530 * 5531 * Previously, interfaces would have unique addresses assigned to them 5532 * and so the address assigned to a particular interface could be used 5533 * to identify a particular interface. One exception to this was the 5534 * case of an unnumbered interface (where IPIF_UNNUMBERED was set). 5535 * 5536 * With the advent of IPv6 and its link-local addresses, this 5537 * restriction was relaxed and interfaces could share addresses between 5538 * themselves. In fact, typically all of the link-local interfaces on 5539 * an IPv6 node or router will have the same link-local address. In 5540 * order to differentiate between these interfaces, the use of an 5541 * interface index is necessary and this index can be carried inside a 5542 * RTA_IFP sockaddr (which is actually a sockaddr_dl). One restriction 5543 * of using the interface index, however, is that all of the ipif's that 5544 * are part of an ill have the same index and so the RTA_IFP sockaddr 5545 * cannot be used to differentiate between ipif's (or logical 5546 * interfaces) that belong to the same ill (physical interface). 
5547 * 5548 * For example, in the following case involving IPv4 interfaces and 5549 * logical interfaces 5550 * 5551 * 192.0.2.32 255.255.255.224 192.0.2.33 U if0 5552 * 192.0.2.32 255.255.255.224 192.0.2.34 U if0 5553 * 192.0.2.32 255.255.255.224 192.0.2.35 U if0 5554 * 5555 * the ipif's corresponding to each of these interface routes can be 5556 * uniquely identified by the "gateway" (actually interface address). 5557 * 5558 * In this case involving multiple IPv6 default routes to a particular 5559 * link-local gateway, the use of RTA_IFP is necessary to specify which 5560 * default route is of interest: 5561 * 5562 * default fe80::123:4567:89ab:cdef U if0 5563 * default fe80::123:4567:89ab:cdef U if1 5564 */ 5565 5566 /* RTF_GATEWAY not set */ 5567 if (!(flags & RTF_GATEWAY)) { 5568 if (sp != NULL) { 5569 ip2dbg(("ip_rt_add: gateway security attributes " 5570 "cannot be set with interface route\n")); 5571 if (ipif != NULL) 5572 ipif_refrele(ipif); 5573 return (EINVAL); 5574 } 5575 5576 /* 5577 * Whether or not ill (RTA_IFP) is set, we require that 5578 * the gateway is one of our local addresses. 5579 */ 5580 if (ipif == NULL) 5581 return (ENETUNREACH); 5582 5583 /* 5584 * We use MATCH_IRE_ILL here. If the caller specified an 5585 * interface (from the RTA_IFP sockaddr) we use it, otherwise 5586 * we use the ill derived from the gateway address. 5587 * We can always match the gateway address since we record it 5588 * in ire_gateway_addr. 5589 * We don't allow RTA_IFP to specify a different ill than the 5590 * one matching the ipif to make sure we can delete the route. 5591 */ 5592 match_flags |= MATCH_IRE_GW | MATCH_IRE_ILL; 5593 if (ill == NULL) { 5594 ill = ipif->ipif_ill; 5595 } else if (ill != ipif->ipif_ill) { 5596 ipif_refrele(ipif); 5597 return (EINVAL); 5598 } 5599 5600 /* 5601 * We check for an existing entry at this point. 
5602 * 5603 * Since a netmask isn't passed in via the ioctl interface 5604 * (SIOCADDRT), we don't check for a matching netmask in that 5605 * case. 5606 */ 5607 if (!ioctl_msg) 5608 match_flags |= MATCH_IRE_MASK; 5609 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, 5610 IRE_INTERFACE, ill, ALL_ZONES, NULL, match_flags, 0, ipst, 5611 NULL); 5612 if (ire != NULL) { 5613 ire_refrele(ire); 5614 ipif_refrele(ipif); 5615 return (EEXIST); 5616 } 5617 5618 /* 5619 * Create a copy of the IRE_LOOPBACK, IRE_IF_NORESOLVER or 5620 * IRE_IF_RESOLVER with the modified address, netmask, and 5621 * gateway. 5622 */ 5623 ire = ire_create( 5624 (uchar_t *)&dst_addr, 5625 (uint8_t *)&mask, 5626 (uint8_t *)&gw_addr, 5627 ill->ill_net_type, 5628 ill, 5629 zoneid, 5630 flags, 5631 NULL, 5632 ipst); 5633 if (ire == NULL) { 5634 ipif_refrele(ipif); 5635 return (ENOMEM); 5636 } 5637 5638 /* 5639 * Some software (for example, GateD and Sun Cluster) attempts 5640 * to create (what amount to) IRE_PREFIX routes with the 5641 * loopback address as the gateway. This is primarily done to 5642 * set up prefixes with the RTF_REJECT flag set (for example, 5643 * when generating aggregate routes.) 5644 * 5645 * If the IRE type (as defined by ill->ill_net_type) is 5646 * IRE_LOOPBACK, then we map the request into a 5647 * IRE_IF_NORESOLVER. We also OR in the RTF_BLACKHOLE flag as 5648 * these interface routes, by definition, can only be that. 5649 * 5650 * Needless to say, the real IRE_LOOPBACK is NOT created by this 5651 * routine, but rather using ire_create() directly. 5652 * 5653 */ 5654 if (ill->ill_net_type == IRE_LOOPBACK) { 5655 ire->ire_type = IRE_IF_NORESOLVER; 5656 ire->ire_flags |= RTF_BLACKHOLE; 5657 } 5658 5659 /* src address assigned by the caller? 
*/ 5660 if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC)) 5661 ire->ire_setsrc_addr = src_addr; 5662 5663 nire = ire_add(ire); 5664 if (nire == NULL) { 5665 /* 5666 * In the result of failure, ire_add() will have 5667 * already deleted the ire in question, so there 5668 * is no need to do that here. 5669 */ 5670 ipif_refrele(ipif); 5671 return (ENOMEM); 5672 } 5673 /* 5674 * Check if it was a duplicate entry. This handles 5675 * the case of two racing route adds for the same route 5676 */ 5677 if (nire != ire) { 5678 ire_delete(nire); 5679 ire_refrele(nire); 5680 ipif_refrele(ipif); 5681 return (EEXIST); 5682 } 5683 ire = nire; 5684 goto save_ire; 5685 } 5686 5687 /* 5688 * Get an interface IRE for the specified gateway. 5689 * If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the 5690 * gateway, it is currently unreachable and we fail the request 5691 * accordingly. 5692 * If RTA_IFP was specified we look on that particular ill. 5693 */ 5694 if (ill != NULL) 5695 match_flags |= MATCH_IRE_ILL; 5696 5697 /* Check whether the gateway is reachable. */ 5698 again: 5699 type = IRE_INTERFACE; 5700 if (flags & RTF_INDIRECT) 5701 type |= IRE_OFFLINK; 5702 5703 gw_ire = ire_ftable_lookup_v4(gw_addr, 0, 0, type, ill, 5704 ALL_ZONES, NULL, match_flags, 0, ipst, NULL); 5705 if (gw_ire == NULL) { 5706 /* 5707 * With IPMP, we allow host routes to influence in.mpathd's 5708 * target selection. However, if the test addresses are on 5709 * their own network, the above lookup will fail since the 5710 * underlying IRE_INTERFACEs are marked hidden. So allow 5711 * hidden test IREs to be found and try again. 5712 */ 5713 if (!(match_flags & MATCH_IRE_TESTHIDDEN)) { 5714 match_flags |= MATCH_IRE_TESTHIDDEN; 5715 goto again; 5716 } 5717 5718 if (ipif != NULL) 5719 ipif_refrele(ipif); 5720 return (ENETUNREACH); 5721 } 5722 5723 /* 5724 * We create one of three types of IREs as a result of this request 5725 * based on the netmask. 
A netmask of all ones (which is automatically 5726 * assumed when RTF_HOST is set) results in an IRE_HOST being created. 5727 * An all zeroes netmask implies a default route so an IRE_DEFAULT is 5728 * created. Otherwise, an IRE_PREFIX route is created for the 5729 * destination prefix. 5730 */ 5731 if (mask == IP_HOST_MASK) 5732 type = IRE_HOST; 5733 else if (mask == 0) 5734 type = IRE_DEFAULT; 5735 else 5736 type = IRE_PREFIX; 5737 5738 /* check for a duplicate entry */ 5739 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill, 5740 ALL_ZONES, NULL, match_flags | MATCH_IRE_MASK | MATCH_IRE_GW, 5741 0, ipst, NULL); 5742 if (ire != NULL) { 5743 if (ipif != NULL) 5744 ipif_refrele(ipif); 5745 ire_refrele(gw_ire); 5746 ire_refrele(ire); 5747 return (EEXIST); 5748 } 5749 5750 /* Security attribute exists */ 5751 if (sp != NULL) { 5752 tsol_gcgrp_addr_t ga; 5753 5754 /* find or create the gateway credentials group */ 5755 ga.ga_af = AF_INET; 5756 IN6_IPADDR_TO_V4MAPPED(gw_addr, &ga.ga_addr); 5757 5758 /* we hold reference to it upon success */ 5759 gcgrp = gcgrp_lookup(&ga, B_TRUE); 5760 if (gcgrp == NULL) { 5761 if (ipif != NULL) 5762 ipif_refrele(ipif); 5763 ire_refrele(gw_ire); 5764 return (ENOMEM); 5765 } 5766 5767 /* 5768 * Create and add the security attribute to the group; a 5769 * reference to the group is made upon allocating a new 5770 * entry successfully. If it finds an already-existing 5771 * entry for the security attribute in the group, it simply 5772 * returns it and no new reference is made to the group. 5773 */ 5774 gc = gc_create(sp, gcgrp, &gcgrp_xtraref); 5775 if (gc == NULL) { 5776 if (ipif != NULL) 5777 ipif_refrele(ipif); 5778 /* release reference held by gcgrp_lookup */ 5779 GCGRP_REFRELE(gcgrp); 5780 ire_refrele(gw_ire); 5781 return (ENOMEM); 5782 } 5783 } 5784 5785 /* Create the IRE. 
*/ 5786 ire = ire_create( 5787 (uchar_t *)&dst_addr, /* dest address */ 5788 (uchar_t *)&mask, /* mask */ 5789 (uchar_t *)&gw_addr, /* gateway address */ 5790 (ushort_t)type, /* IRE type */ 5791 ill, 5792 zoneid, 5793 flags, 5794 gc, /* security attribute */ 5795 ipst); 5796 5797 /* 5798 * The ire holds a reference to the 'gc' and the 'gc' holds a 5799 * reference to the 'gcgrp'. We can now release the extra reference 5800 * the 'gcgrp' acquired in the gcgrp_lookup, if it was not used. 5801 */ 5802 if (gcgrp_xtraref) 5803 GCGRP_REFRELE(gcgrp); 5804 if (ire == NULL) { 5805 if (gc != NULL) 5806 GC_REFRELE(gc); 5807 if (ipif != NULL) 5808 ipif_refrele(ipif); 5809 ire_refrele(gw_ire); 5810 return (ENOMEM); 5811 } 5812 5813 /* Before we add, check if an extra CGTP broadcast is needed */ 5814 cgtp_broadcast = ((flags & RTF_MULTIRT) && 5815 ip_type_v4(ire->ire_addr, ipst) == IRE_BROADCAST); 5816 5817 /* src address assigned by the caller? */ 5818 if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC)) 5819 ire->ire_setsrc_addr = src_addr; 5820 5821 /* 5822 * POLICY: should we allow an RTF_HOST with address INADDR_ANY? 5823 * SUN/OS socket stuff does but do we really want to allow 0.0.0.0? 5824 */ 5825 5826 /* Add the new IRE. */ 5827 nire = ire_add(ire); 5828 if (nire == NULL) { 5829 /* 5830 * In the result of failure, ire_add() will have 5831 * already deleted the ire in question, so there 5832 * is no need to do that here. 5833 */ 5834 if (ipif != NULL) 5835 ipif_refrele(ipif); 5836 ire_refrele(gw_ire); 5837 return (ENOMEM); 5838 } 5839 /* 5840 * Check if it was a duplicate entry. 
This handles 5841 * the case of two racing route adds for the same route 5842 */ 5843 if (nire != ire) { 5844 ire_delete(nire); 5845 ire_refrele(nire); 5846 if (ipif != NULL) 5847 ipif_refrele(ipif); 5848 ire_refrele(gw_ire); 5849 return (EEXIST); 5850 } 5851 ire = nire; 5852 5853 if (flags & RTF_MULTIRT) { 5854 /* 5855 * Invoke the CGTP (multirouting) filtering module 5856 * to add the dst address in the filtering database. 5857 * Replicated inbound packets coming from that address 5858 * will be filtered to discard the duplicates. 5859 * It is not necessary to call the CGTP filter hook 5860 * when the dst address is a broadcast or multicast, 5861 * because an IP source address cannot be a broadcast 5862 * or a multicast. 5863 */ 5864 if (cgtp_broadcast) { 5865 ip_cgtp_bcast_add(ire, ipst); 5866 goto save_ire; 5867 } 5868 if (ipst->ips_ip_cgtp_filter_ops != NULL && 5869 !CLASSD(ire->ire_addr)) { 5870 int res; 5871 ipif_t *src_ipif; 5872 5873 /* Find the source address corresponding to gw_ire */ 5874 src_ipif = ipif_lookup_addr(gw_ire->ire_gateway_addr, 5875 NULL, zoneid, ipst); 5876 if (src_ipif != NULL) { 5877 res = ipst->ips_ip_cgtp_filter_ops-> 5878 cfo_add_dest_v4( 5879 ipst->ips_netstack->netstack_stackid, 5880 ire->ire_addr, 5881 ire->ire_gateway_addr, 5882 ire->ire_setsrc_addr, 5883 src_ipif->ipif_lcl_addr); 5884 ipif_refrele(src_ipif); 5885 } else { 5886 res = EADDRNOTAVAIL; 5887 } 5888 if (res != 0) { 5889 if (ipif != NULL) 5890 ipif_refrele(ipif); 5891 ire_refrele(gw_ire); 5892 ire_delete(ire); 5893 ire_refrele(ire); /* Held in ire_add */ 5894 return (res); 5895 } 5896 } 5897 } 5898 5899 save_ire: 5900 if (gw_ire != NULL) { 5901 ire_refrele(gw_ire); 5902 gw_ire = NULL; 5903 } 5904 if (ill != NULL) { 5905 /* 5906 * Save enough information so that we can recreate the IRE if 5907 * the interface goes down and then up. The metrics associated 5908 * with the route will be saved as well when rts_setmetrics() is 5909 * called after the IRE has been created. 
In the case where 5910 * memory cannot be allocated, none of this information will be 5911 * saved. 5912 */ 5913 ill_save_ire(ill, ire); 5914 } 5915 if (ioctl_msg) 5916 ip_rts_rtmsg(RTM_OLDADD, ire, 0, ipst); 5917 if (ire_arg != NULL) { 5918 /* 5919 * Store the ire that was successfully added into where ire_arg 5920 * points to so that callers don't have to look it up 5921 * themselves (but they are responsible for ire_refrele()ing 5922 * the ire when they are finished with it). 5923 */ 5924 *ire_arg = ire; 5925 } else { 5926 ire_refrele(ire); /* Held in ire_add */ 5927 } 5928 if (ipif != NULL) 5929 ipif_refrele(ipif); 5930 return (0); 5931 } 5932 5933 /* 5934 * ip_rt_delete is called to delete an IPv4 route. 5935 * ill is passed in to associate it with the correct interface. 5936 */ 5937 /* ARGSUSED4 */ 5938 int 5939 ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, 5940 uint_t rtm_addrs, int flags, ill_t *ill, boolean_t ioctl_msg, 5941 ip_stack_t *ipst, zoneid_t zoneid) 5942 { 5943 ire_t *ire = NULL; 5944 ipif_t *ipif; 5945 uint_t type; 5946 uint_t match_flags = MATCH_IRE_TYPE; 5947 int err = 0; 5948 5949 ip1dbg(("ip_rt_delete:")); 5950 /* 5951 * If this is the case of RTF_HOST being set, then we set the netmask 5952 * to all ones. Otherwise, we use the netmask if one was supplied. 5953 */ 5954 if (flags & RTF_HOST) { 5955 mask = IP_HOST_MASK; 5956 match_flags |= MATCH_IRE_MASK; 5957 } else if (rtm_addrs & RTA_NETMASK) { 5958 match_flags |= MATCH_IRE_MASK; 5959 } 5960 5961 /* 5962 * Note that RTF_GATEWAY is never set on a delete, therefore 5963 * we check if the gateway address is one of our interfaces first, 5964 * and fall back on RTF_GATEWAY routes. 5965 * 5966 * This makes it possible to delete an original 5967 * IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1. 5968 * However, we have RTF_KERNEL set on the ones created by ipif_up 5969 * and those can not be deleted here. 5970 * 5971 * We use MATCH_IRE_ILL if we know the interface. 
If the caller 5972 * specified an interface (from the RTA_IFP sockaddr) we use it, 5973 * otherwise we use the ill derived from the gateway address. 5974 * We can always match the gateway address since we record it 5975 * in ire_gateway_addr. 5976 * 5977 * For more detail on specifying routes by gateway address and by 5978 * interface index, see the comments in ip_rt_add(). 5979 */ 5980 ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst); 5981 if (ipif != NULL) { 5982 ill_t *ill_match; 5983 5984 if (ill != NULL) 5985 ill_match = ill; 5986 else 5987 ill_match = ipif->ipif_ill; 5988 5989 match_flags |= MATCH_IRE_ILL; 5990 if (ipif->ipif_ire_type == IRE_LOOPBACK) { 5991 ire = ire_ftable_lookup_v4(dst_addr, 0, 0, IRE_LOOPBACK, 5992 ill_match, ALL_ZONES, NULL, match_flags, 0, ipst, 5993 NULL); 5994 } 5995 if (ire == NULL) { 5996 match_flags |= MATCH_IRE_GW; 5997 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, 5998 IRE_INTERFACE, ill_match, ALL_ZONES, NULL, 5999 match_flags, 0, ipst, NULL); 6000 } 6001 /* Avoid deleting routes created by kernel from an ipif */ 6002 if (ire != NULL && (ire->ire_flags & RTF_KERNEL)) { 6003 ire_refrele(ire); 6004 ire = NULL; 6005 } 6006 6007 /* Restore in case we didn't find a match */ 6008 match_flags &= ~(MATCH_IRE_GW|MATCH_IRE_ILL); 6009 } 6010 6011 if (ire == NULL) { 6012 /* 6013 * At this point, the gateway address is not one of our own 6014 * addresses or a matching interface route was not found. We 6015 * set the IRE type to lookup based on whether 6016 * this is a host route, a default route or just a prefix. 6017 * 6018 * If an ill was passed in, then the lookup is based on an 6019 * interface index so MATCH_IRE_ILL is added to match_flags. 
6020 */ 6021 match_flags |= MATCH_IRE_GW; 6022 if (ill != NULL) 6023 match_flags |= MATCH_IRE_ILL; 6024 if (mask == IP_HOST_MASK) 6025 type = IRE_HOST; 6026 else if (mask == 0) 6027 type = IRE_DEFAULT; 6028 else 6029 type = IRE_PREFIX; 6030 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill, 6031 ALL_ZONES, NULL, match_flags, 0, ipst, NULL); 6032 } 6033 6034 if (ipif != NULL) { 6035 ipif_refrele(ipif); 6036 ipif = NULL; 6037 } 6038 6039 if (ire == NULL) 6040 return (ESRCH); 6041 6042 if (ire->ire_flags & RTF_MULTIRT) { 6043 /* 6044 * Invoke the CGTP (multirouting) filtering module 6045 * to remove the dst address from the filtering database. 6046 * Packets coming from that address will no longer be 6047 * filtered to remove duplicates. 6048 */ 6049 if (ipst->ips_ip_cgtp_filter_ops != NULL) { 6050 err = ipst->ips_ip_cgtp_filter_ops->cfo_del_dest_v4( 6051 ipst->ips_netstack->netstack_stackid, 6052 ire->ire_addr, ire->ire_gateway_addr); 6053 } 6054 ip_cgtp_bcast_delete(ire, ipst); 6055 } 6056 6057 ill = ire->ire_ill; 6058 if (ill != NULL) 6059 ill_remove_saved_ire(ill, ire); 6060 if (ioctl_msg) 6061 ip_rts_rtmsg(RTM_OLDDEL, ire, 0, ipst); 6062 ire_delete(ire); 6063 ire_refrele(ire); 6064 return (err); 6065 } 6066 6067 /* 6068 * ip_siocaddrt is called to complete processing of an SIOCADDRT IOCTL. 
6069 */ 6070 /* ARGSUSED */ 6071 int 6072 ip_siocaddrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 6073 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 6074 { 6075 ipaddr_t dst_addr; 6076 ipaddr_t gw_addr; 6077 ipaddr_t mask; 6078 int error = 0; 6079 mblk_t *mp1; 6080 struct rtentry *rt; 6081 ipif_t *ipif = NULL; 6082 ip_stack_t *ipst; 6083 6084 ASSERT(q->q_next == NULL); 6085 ipst = CONNQ_TO_IPST(q); 6086 6087 ip1dbg(("ip_siocaddrt:")); 6088 /* Existence of mp1 verified in ip_wput_nondata */ 6089 mp1 = mp->b_cont->b_cont; 6090 rt = (struct rtentry *)mp1->b_rptr; 6091 6092 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 6093 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 6094 6095 /* 6096 * If the RTF_HOST flag is on, this is a request to assign a gateway 6097 * to a particular host address. In this case, we set the netmask to 6098 * all ones for the particular destination address. Otherwise, 6099 * determine the netmask to be used based on dst_addr and the interfaces 6100 * in use. 6101 */ 6102 if (rt->rt_flags & RTF_HOST) { 6103 mask = IP_HOST_MASK; 6104 } else { 6105 /* 6106 * Note that ip_subnet_mask returns a zero mask in the case of 6107 * default (an all-zeroes address). 6108 */ 6109 mask = ip_subnet_mask(dst_addr, &ipif, ipst); 6110 } 6111 6112 error = ip_rt_add(dst_addr, mask, gw_addr, 0, rt->rt_flags, NULL, NULL, 6113 B_TRUE, NULL, ipst, ALL_ZONES); 6114 if (ipif != NULL) 6115 ipif_refrele(ipif); 6116 return (error); 6117 } 6118 6119 /* 6120 * ip_siocdelrt is called to complete processing of an SIOCDELRT IOCTL. 
6121 */ 6122 /* ARGSUSED */ 6123 int 6124 ip_siocdelrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 6125 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 6126 { 6127 ipaddr_t dst_addr; 6128 ipaddr_t gw_addr; 6129 ipaddr_t mask; 6130 int error; 6131 mblk_t *mp1; 6132 struct rtentry *rt; 6133 ipif_t *ipif = NULL; 6134 ip_stack_t *ipst; 6135 6136 ASSERT(q->q_next == NULL); 6137 ipst = CONNQ_TO_IPST(q); 6138 6139 ip1dbg(("ip_siocdelrt:")); 6140 /* Existence of mp1 verified in ip_wput_nondata */ 6141 mp1 = mp->b_cont->b_cont; 6142 rt = (struct rtentry *)mp1->b_rptr; 6143 6144 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 6145 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 6146 6147 /* 6148 * If the RTF_HOST flag is on, this is a request to delete a gateway 6149 * to a particular host address. In this case, we set the netmask to 6150 * all ones for the particular destination address. Otherwise, 6151 * determine the netmask to be used based on dst_addr and the interfaces 6152 * in use. 6153 */ 6154 if (rt->rt_flags & RTF_HOST) { 6155 mask = IP_HOST_MASK; 6156 } else { 6157 /* 6158 * Note that ip_subnet_mask returns a zero mask in the case of 6159 * default (an all-zeroes address). 6160 */ 6161 mask = ip_subnet_mask(dst_addr, &ipif, ipst); 6162 } 6163 6164 error = ip_rt_delete(dst_addr, mask, gw_addr, 6165 RTA_DST | RTA_GATEWAY | RTA_NETMASK, rt->rt_flags, NULL, B_TRUE, 6166 ipst, ALL_ZONES); 6167 if (ipif != NULL) 6168 ipif_refrele(ipif); 6169 return (error); 6170 } 6171 6172 /* 6173 * Enqueue the mp onto the ipsq, chained by b_next. 6174 * b_prev stores the function to be executed later, and b_queue the queue 6175 * where this mp originated. 
6176 */ 6177 void 6178 ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, 6179 ill_t *pending_ill) 6180 { 6181 conn_t *connp; 6182 ipxop_t *ipx = ipsq->ipsq_xop; 6183 6184 ASSERT(MUTEX_HELD(&ipsq->ipsq_lock)); 6185 ASSERT(MUTEX_HELD(&ipx->ipx_lock)); 6186 ASSERT(func != NULL); 6187 6188 mp->b_queue = q; 6189 mp->b_prev = (void *)func; 6190 mp->b_next = NULL; 6191 6192 switch (type) { 6193 case CUR_OP: 6194 if (ipx->ipx_mptail != NULL) { 6195 ASSERT(ipx->ipx_mphead != NULL); 6196 ipx->ipx_mptail->b_next = mp; 6197 } else { 6198 ASSERT(ipx->ipx_mphead == NULL); 6199 ipx->ipx_mphead = mp; 6200 } 6201 ipx->ipx_mptail = mp; 6202 break; 6203 6204 case NEW_OP: 6205 if (ipsq->ipsq_xopq_mptail != NULL) { 6206 ASSERT(ipsq->ipsq_xopq_mphead != NULL); 6207 ipsq->ipsq_xopq_mptail->b_next = mp; 6208 } else { 6209 ASSERT(ipsq->ipsq_xopq_mphead == NULL); 6210 ipsq->ipsq_xopq_mphead = mp; 6211 } 6212 ipsq->ipsq_xopq_mptail = mp; 6213 ipx->ipx_ipsq_queued = B_TRUE; 6214 break; 6215 6216 case SWITCH_OP: 6217 ASSERT(ipsq->ipsq_swxop != NULL); 6218 /* only one switch operation is currently allowed */ 6219 ASSERT(ipsq->ipsq_switch_mp == NULL); 6220 ipsq->ipsq_switch_mp = mp; 6221 ipx->ipx_ipsq_queued = B_TRUE; 6222 break; 6223 default: 6224 cmn_err(CE_PANIC, "ipsq_enq %d type \n", type); 6225 } 6226 6227 if (CONN_Q(q) && pending_ill != NULL) { 6228 connp = Q_TO_CONN(q); 6229 ASSERT(MUTEX_HELD(&connp->conn_lock)); 6230 connp->conn_oper_pending_ill = pending_ill; 6231 } 6232 } 6233 6234 /* 6235 * Dequeue the next message that requested exclusive access to this IPSQ's 6236 * xop. Specifically: 6237 * 6238 * 1. If we're still processing the current operation on `ipsq', then 6239 * dequeue the next message for the operation (from ipx_mphead), or 6240 * return NULL if there are no queued messages for the operation. 6241 * These messages are queued via CUR_OP to qwriter_ip() and friends. 6242 * 6243 * 2. 
If the current operation on `ipsq' has completed (ipx_current_ipif is
 *     not set) see if the ipsq has requested an xop switch.  If so, switch
 *     `ipsq' to a different xop.  Xop switches only happen when joining or
 *     leaving IPMP groups and require a careful dance -- see the comments
 *     in-line below for details.  If we're leaving a group xop or if we're
 *     joining a group xop and become writer on it, then we proceed to (3).
 *     Otherwise, we return NULL and exit the xop.
 *
 *  3. For each IPSQ in the xop, return any switch operation stored on
 *     ipsq_switch_mp (set via SWITCH_OP); these must be processed before
 *     any other messages queued on the IPSQ.  Otherwise, dequeue the next
 *     exclusive operation (queued via NEW_OP) stored on ipsq_xopq_mphead.
 *     Note that if the phyint tied to `ipsq' is not using IPMP there will
 *     only be one IPSQ in the xop.  Otherwise, there will be one IPSQ for
 *     each phyint in the group, including the IPMP meta-interface phyint.
 *
 * Returns the dequeued message, or NULL if nothing was dequeued.  The
 * b_next field of a returned message is overwritten with a pointer to an
 * IPSQ: `ipsq' itself for a current-operation message, otherwise the IPSQ
 * whose switch or new-operation list the message was taken from.  When no
 * message is found the xop's writer state is cleared and threads waiting on
 * the ill_cv of each associated ill are woken.
 */
static mblk_t *
ipsq_dq(ipsq_t *ipsq)
{
	ill_t		*illv4, *illv6;
	mblk_t		*mp;
	ipsq_t		*xopipsq;
	ipsq_t		*leftipsq = NULL;
	ipxop_t		*ipx;
	phyint_t	*phyi = ipsq->ipsq_phyint;
	ip_stack_t	*ipst = ipsq->ipsq_ipst;
	boolean_t	emptied = B_FALSE;

	/*
	 * Grab all the locks we need in the defined order (ill_g_lock ->
	 * ipsq_lock -> ipx_lock); ill_g_lock is needed to use ipsq_next.
	 * It is taken as writer only when an xop switch is pending, since
	 * the switch code below edits the ipsq_next list.
	 */
	rw_enter(&ipst->ips_ill_g_lock,
	    ipsq->ipsq_swxop != NULL ? RW_WRITER : RW_READER);
	mutex_enter(&ipsq->ipsq_lock);
	ipx = ipsq->ipsq_xop;
	mutex_enter(&ipx->ipx_lock);

	/*
	 * Dequeue the next message associated with the current exclusive
	 * operation, if any.
	 */
	if ((mp = ipx->ipx_mphead) != NULL) {
		ipx->ipx_mphead = mp->b_next;
		if (ipx->ipx_mphead == NULL)
			ipx->ipx_mptail = NULL;
		/* b_next is reused to carry the IPSQ pointer with the mp */
		mp->b_next = (void *)ipsq;
		goto out;
	}

	/*
	 * The current operation has not completed but has nothing queued:
	 * give up writer status without looking at the other queues.
	 */
	if (ipx->ipx_current_ipif != NULL)
		goto empty;

	if (ipsq->ipsq_swxop != NULL) {
		/*
		 * The exclusive operation that is now being completed has
		 * requested a switch to a different xop.  This happens
		 * when an interface joins or leaves an IPMP group.  Joins
		 * happen through SIOCSLIFGROUPNAME (ip_sioctl_groupname()).
		 * Leaves happen via SIOCSLIFGROUPNAME, interface unplumb
		 * (phyint_free()), or interface plumb for an ill type
		 * not in the IPMP group (ip_rput_dlpi_writer()).
		 *
		 * Xop switches are not allowed on the IPMP meta-interface.
		 */
		ASSERT(phyi == NULL || !(phyi->phyint_flags & PHYI_IPMP));
		ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
		DTRACE_PROBE1(ipsq__switch, (ipsq_t *), ipsq);

		if (ipsq->ipsq_swxop == &ipsq->ipsq_ownxop) {
			/*
			 * We're switching back to our own xop, so we have two
			 * xop's to drain/exit: our own, and the group xop
			 * that we are leaving.
			 *
			 * First, pull ourselves out of the group ipsq list.
			 * This is safe since we're writer on ill_g_lock.
			 */
			ASSERT(ipsq->ipsq_xop != &ipsq->ipsq_ownxop);

			/* Find our predecessor on the circular list. */
			xopipsq = ipx->ipx_ipsq;
			while (xopipsq->ipsq_next != ipsq)
				xopipsq = xopipsq->ipsq_next;

			xopipsq->ipsq_next = ipsq->ipsq_next;
			ipsq->ipsq_next = ipsq;
			ipsq->ipsq_xop = ipsq->ipsq_swxop;
			ipsq->ipsq_swxop = NULL;

			/*
			 * Second, prepare to exit the group xop.  The actual
			 * ipsq_exit() is done at the end of this function
			 * since we cannot hold any locks across ipsq_exit().
			 * Note that although we drop the group's ipx_lock, no
			 * threads can proceed since we're still ipx_writer.
			 */
			leftipsq = xopipsq;
			mutex_exit(&ipx->ipx_lock);

			/*
			 * Third, set ipx to point to our own xop (which was
			 * inactive and therefore can be entered).
			 */
			ipx = ipsq->ipsq_xop;
			mutex_enter(&ipx->ipx_lock);
			ASSERT(ipx->ipx_writer == NULL);
			ASSERT(ipx->ipx_current_ipif == NULL);
		} else {
			/*
			 * We're switching from our own xop to a group xop.
			 * The requestor of the switch must ensure that the
			 * group xop cannot go away (e.g. by ensuring the
			 * phyint associated with the xop cannot go away).
			 *
			 * If we can become writer on our new xop, then we'll
			 * do the drain.  Otherwise, the current writer of our
			 * new xop will do the drain when it exits.
			 *
			 * First, splice ourselves into the group IPSQ list.
			 * This is safe since we're writer on ill_g_lock.
			 */
			ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop);

			/* Find the last IPSQ on the group's circular list. */
			xopipsq = ipsq->ipsq_swxop->ipx_ipsq;
			while (xopipsq->ipsq_next != ipsq->ipsq_swxop->ipx_ipsq)
				xopipsq = xopipsq->ipsq_next;

			xopipsq->ipsq_next = ipsq;
			ipsq->ipsq_next = ipsq->ipsq_swxop->ipx_ipsq;
			ipsq->ipsq_xop = ipsq->ipsq_swxop;
			ipsq->ipsq_swxop = NULL;

			/*
			 * Second, exit our own xop, since it's now unused.
			 * This is safe since we've got the only reference.
			 */
			ASSERT(ipx->ipx_writer == curthread);
			ipx->ipx_writer = NULL;
			VERIFY(--ipx->ipx_reentry_cnt == 0);
			ipx->ipx_ipsq_queued = B_FALSE;
			mutex_exit(&ipx->ipx_lock);

			/*
			 * Third, set ipx to point to our new xop, and check
			 * if we can become writer on it.  If we cannot, then
			 * the current writer will drain the IPSQ group when
			 * it exits.  Our ipsq_xop is guaranteed to be stable
			 * because we're still holding ipsq_lock.
			 */
			ipx = ipsq->ipsq_xop;
			mutex_enter(&ipx->ipx_lock);
			if (ipx->ipx_writer != NULL ||
			    ipx->ipx_current_ipif != NULL) {
				/* mp is NULL here; nothing is returned. */
				goto out;
			}
		}

		/*
		 * Fourth, become writer on our new ipx before we continue
		 * with the drain.  Note that we never dropped ipsq_lock
		 * above, so no other thread could've raced with us to
		 * become writer first.  Also, we're holding ipx_lock, so
		 * no other thread can examine the ipx right now.
		 */
		ASSERT(ipx->ipx_current_ipif == NULL);
		ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL);
		VERIFY(ipx->ipx_reentry_cnt++ == 0);
		ipx->ipx_writer = curthread;
		ipx->ipx_forced = B_FALSE;
#ifdef DEBUG
		ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
#endif
	}

	/* Walk every IPSQ in the xop, starting with our own. */
	xopipsq = ipsq;
	do {
		/*
		 * So that other operations operate on a consistent and
		 * complete phyint, a switch message on an IPSQ must be
		 * handled prior to any other operations on that IPSQ.
		 */
		if ((mp = xopipsq->ipsq_switch_mp) != NULL) {
			xopipsq->ipsq_switch_mp = NULL;
			ASSERT(mp->b_next == NULL);
			mp->b_next = (void *)xopipsq;
			goto out;
		}

		if ((mp = xopipsq->ipsq_xopq_mphead) != NULL) {
			xopipsq->ipsq_xopq_mphead = mp->b_next;
			if (xopipsq->ipsq_xopq_mphead == NULL)
				xopipsq->ipsq_xopq_mptail = NULL;
			mp->b_next = (void *)xopipsq;
			goto out;
		}
	} while ((xopipsq = xopipsq->ipsq_next) != ipsq);
empty:
	/*
	 * There are no messages.  Further, we are holding ipx_lock, hence no
	 * new messages can end up on any IPSQ in the xop.
	 */
	ipx->ipx_writer = NULL;
	ipx->ipx_forced = B_FALSE;
	VERIFY(--ipx->ipx_reentry_cnt == 0);
	ipx->ipx_ipsq_queued = B_FALSE;
	emptied = B_TRUE;
#ifdef DEBUG
	ipx->ipx_depth = 0;
#endif
out:
	mutex_exit(&ipx->ipx_lock);
	mutex_exit(&ipsq->ipsq_lock);

	/*
	 * If we completely emptied the xop, then wake up any threads waiting
	 * to enter any of the IPSQ's associated with it.
	 */
	if (emptied) {
		xopipsq = ipsq;
		do {
			/* `continue' still evaluates the loop condition. */
			if ((phyi = xopipsq->ipsq_phyint) == NULL)
				continue;

			illv4 = phyi->phyint_illv4;
			illv6 = phyi->phyint_illv6;

			GRAB_ILL_LOCKS(illv4, illv6);
			if (illv4 != NULL)
				cv_broadcast(&illv4->ill_cv);
			if (illv6 != NULL)
				cv_broadcast(&illv6->ill_cv);
			RELEASE_ILL_LOCKS(illv4, illv6);
		} while ((xopipsq = xopipsq->ipsq_next) != ipsq);
	}
	rw_exit(&ipst->ips_ill_g_lock);

	/*
	 * Now that all locks are dropped, exit the IPSQ we left.
	 */
	if (leftipsq != NULL)
		ipsq_exit(leftipsq);

	return (mp);
}

/*
 * Return completion status of previously initiated DLPI operations on
 * ills in the purview of an ipsq.
 */
static boolean_t
ipsq_dlpi_done(ipsq_t *ipsq)
{
	ipsq_t		*ipsq_start;
	phyint_t	*phyi;
	ill_t		*ill;

	ASSERT(RW_LOCK_HELD(&ipsq->ipsq_ipst->ips_ill_g_lock));
	ipsq_start = ipsq;

	do {
		/*
		 * The only current users of this function are ipsq_try_enter
		 * and ipsq_enter which have made sure that ipsq_writer is
		 * NULL before we reach here. ill_dlpi_pending is modified
		 * only by an ipsq writer
		 */
		ASSERT(ipsq->ipsq_xop->ipx_writer == NULL);
		phyi = ipsq->ipsq_phyint;
		/*
		 * phyi could be NULL if a phyint that is part of an
		 * IPMP group is being unplumbed.
A more detailed 6514 * comment is in ipmp_grp_update_kstats() 6515 */ 6516 if (phyi != NULL) { 6517 ill = phyi->phyint_illv4; 6518 if (ill != NULL && 6519 (ill->ill_dlpi_pending != DL_PRIM_INVAL || 6520 ill->ill_arl_dlpi_pending)) 6521 return (B_FALSE); 6522 6523 ill = phyi->phyint_illv6; 6524 if (ill != NULL && 6525 ill->ill_dlpi_pending != DL_PRIM_INVAL) 6526 return (B_FALSE); 6527 } 6528 6529 } while ((ipsq = ipsq->ipsq_next) != ipsq_start); 6530 6531 return (B_TRUE); 6532 } 6533 6534 /* 6535 * Enter the ipsq corresponding to ill, by waiting synchronously till 6536 * we can enter the ipsq exclusively. Unless 'force' is used, the ipsq 6537 * will have to drain completely before ipsq_enter returns success. 6538 * ipx_current_ipif will be set if some exclusive op is in progress, 6539 * and the ipsq_exit logic will start the next enqueued op after 6540 * completion of the current op. If 'force' is used, we don't wait 6541 * for the enqueued ops. This is needed when a conn_close wants to 6542 * enter the ipsq and abort an ioctl that is somehow stuck. Unplumb 6543 * of an ill can also use this option. But we dont' use it currently. 6544 */ 6545 #define ENTER_SQ_WAIT_TICKS 100 6546 boolean_t 6547 ipsq_enter(ill_t *ill, boolean_t force, int type) 6548 { 6549 ipsq_t *ipsq; 6550 ipxop_t *ipx; 6551 boolean_t waited_enough = B_FALSE; 6552 ip_stack_t *ipst = ill->ill_ipst; 6553 6554 /* 6555 * Note that the relationship between ill and ipsq is fixed as long as 6556 * the ill is not ILL_CONDEMNED. Holding ipsq_lock ensures the 6557 * relationship between the IPSQ and xop cannot change. However, 6558 * since we cannot hold ipsq_lock across the cv_wait(), it may change 6559 * while we're waiting. We wait on ill_cv and rely on ipsq_exit() 6560 * waking up all ills in the xop when it becomes available. 
6561 */ 6562 for (;;) { 6563 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 6564 mutex_enter(&ill->ill_lock); 6565 if (ill->ill_state_flags & ILL_CONDEMNED) { 6566 mutex_exit(&ill->ill_lock); 6567 rw_exit(&ipst->ips_ill_g_lock); 6568 return (B_FALSE); 6569 } 6570 6571 ipsq = ill->ill_phyint->phyint_ipsq; 6572 mutex_enter(&ipsq->ipsq_lock); 6573 ipx = ipsq->ipsq_xop; 6574 mutex_enter(&ipx->ipx_lock); 6575 6576 if (ipx->ipx_writer == NULL && (type == CUR_OP || 6577 (ipx->ipx_current_ipif == NULL && ipsq_dlpi_done(ipsq)) || 6578 waited_enough)) 6579 break; 6580 6581 rw_exit(&ipst->ips_ill_g_lock); 6582 6583 if (!force || ipx->ipx_writer != NULL) { 6584 mutex_exit(&ipx->ipx_lock); 6585 mutex_exit(&ipsq->ipsq_lock); 6586 cv_wait(&ill->ill_cv, &ill->ill_lock); 6587 } else { 6588 mutex_exit(&ipx->ipx_lock); 6589 mutex_exit(&ipsq->ipsq_lock); 6590 (void) cv_reltimedwait(&ill->ill_cv, 6591 &ill->ill_lock, ENTER_SQ_WAIT_TICKS, TR_CLOCK_TICK); 6592 waited_enough = B_TRUE; 6593 } 6594 mutex_exit(&ill->ill_lock); 6595 } 6596 6597 ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL); 6598 ASSERT(ipx->ipx_reentry_cnt == 0); 6599 ipx->ipx_writer = curthread; 6600 ipx->ipx_forced = (ipx->ipx_current_ipif != NULL); 6601 ipx->ipx_reentry_cnt++; 6602 #ifdef DEBUG 6603 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); 6604 #endif 6605 mutex_exit(&ipx->ipx_lock); 6606 mutex_exit(&ipsq->ipsq_lock); 6607 mutex_exit(&ill->ill_lock); 6608 rw_exit(&ipst->ips_ill_g_lock); 6609 6610 return (B_TRUE); 6611 } 6612 6613 /* 6614 * ipif_set_values() has a constraint that it cannot drop the ips_ill_g_lock 6615 * across the call to the core interface ipsq_try_enter() and hence calls this 6616 * function directly. This is explained more fully in ipif_set_values(). 
6617 * In order to support the above constraint, ipsq_try_enter is implemented as 6618 * a wrapper that grabs the ips_ill_g_lock and calls this function subsequently 6619 */ 6620 static ipsq_t * 6621 ipsq_try_enter_internal(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, 6622 int type, boolean_t reentry_ok) 6623 { 6624 ipsq_t *ipsq; 6625 ipxop_t *ipx; 6626 ip_stack_t *ipst = ill->ill_ipst; 6627 6628 /* 6629 * lock ordering: 6630 * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock. 6631 * 6632 * ipx of an ipsq can't change when ipsq_lock is held. 6633 */ 6634 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 6635 GRAB_CONN_LOCK(q); 6636 mutex_enter(&ill->ill_lock); 6637 ipsq = ill->ill_phyint->phyint_ipsq; 6638 mutex_enter(&ipsq->ipsq_lock); 6639 ipx = ipsq->ipsq_xop; 6640 mutex_enter(&ipx->ipx_lock); 6641 6642 /* 6643 * 1. Enter the ipsq if we are already writer and reentry is ok. 6644 * (Note: If the caller does not specify reentry_ok then neither 6645 * 'func' nor any of its callees must ever attempt to enter the ipsq 6646 * again. Otherwise it can lead to an infinite loop 6647 * 2. Enter the ipsq if there is no current writer and this attempted 6648 * entry is part of the current operation 6649 * 3. Enter the ipsq if there is no current writer and this is a new 6650 * operation and the operation queue is empty and there is no 6651 * operation currently in progress and if all previously initiated 6652 * DLPI operations have completed. 6653 */ 6654 if ((ipx->ipx_writer == curthread && reentry_ok) || 6655 (ipx->ipx_writer == NULL && (type == CUR_OP || (type == NEW_OP && 6656 !ipx->ipx_ipsq_queued && ipx->ipx_current_ipif == NULL && 6657 ipsq_dlpi_done(ipsq))))) { 6658 /* Success. 
*/ 6659 ipx->ipx_reentry_cnt++; 6660 ipx->ipx_writer = curthread; 6661 ipx->ipx_forced = B_FALSE; 6662 mutex_exit(&ipx->ipx_lock); 6663 mutex_exit(&ipsq->ipsq_lock); 6664 mutex_exit(&ill->ill_lock); 6665 RELEASE_CONN_LOCK(q); 6666 #ifdef DEBUG 6667 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); 6668 #endif 6669 return (ipsq); 6670 } 6671 6672 if (func != NULL) 6673 ipsq_enq(ipsq, q, mp, func, type, ill); 6674 6675 mutex_exit(&ipx->ipx_lock); 6676 mutex_exit(&ipsq->ipsq_lock); 6677 mutex_exit(&ill->ill_lock); 6678 RELEASE_CONN_LOCK(q); 6679 return (NULL); 6680 } 6681 6682 /* 6683 * The ipsq_t (ipsq) is the synchronization data structure used to serialize 6684 * certain critical operations like plumbing (i.e. most set ioctls), etc. 6685 * There is one ipsq per phyint. The ipsq 6686 * serializes exclusive ioctls issued by applications on a per ipsq basis in 6687 * ipsq_xopq_mphead. It also protects against multiple threads executing in 6688 * the ipsq. Responses from the driver pertain to the current ioctl (say a 6689 * DL_BIND_ACK in response to a DL_BIND_REQ initiated as part of bringing 6690 * up the interface) and are enqueued in ipx_mphead. 6691 * 6692 * If a thread does not want to reenter the ipsq when it is already writer, 6693 * it must make sure that the specified reentry point to be called later 6694 * when the ipsq is empty, nor any code path starting from the specified reentry 6695 * point must never ever try to enter the ipsq again. Otherwise it can lead 6696 * to an infinite loop. The reentry point ip_rput_dlpi_writer is an example. 6697 * When the thread that is currently exclusive finishes, it (ipsq_exit) 6698 * dequeues the requests waiting to become exclusive in ipx_mphead and calls 6699 * the reentry point. When the list at ipx_mphead becomes empty ipsq_exit 6700 * proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next 6701 * ioctl if the current ioctl has completed. 
If the current ioctl is still 6702 * in progress it simply returns. The current ioctl could be waiting for 6703 * a response from another module (the driver or could be waiting for 6704 * the ipif/ill/ire refcnts to drop to zero. In such a case the ipx_pending_mp 6705 * and ipx_pending_ipif are set. ipx_current_ipif is set throughout the 6706 * execution of the ioctl and ipsq_exit does not start the next ioctl unless 6707 * ipx_current_ipif is NULL which happens only once the ioctl is complete and 6708 * all associated DLPI operations have completed. 6709 */ 6710 6711 /* 6712 * Try to enter the IPSQ corresponding to `ipif' or `ill' exclusively (`ipif' 6713 * and `ill' cannot both be specified). Returns a pointer to the entered IPSQ 6714 * on success, or NULL on failure. The caller ensures ipif/ill is valid by 6715 * refholding it as necessary. If the IPSQ cannot be entered and `func' is 6716 * non-NULL, then `func' will be called back with `q' and `mp' once the IPSQ 6717 * can be entered. If `func' is NULL, then `q' and `mp' are ignored. 6718 */ 6719 ipsq_t * 6720 ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp, 6721 ipsq_func_t func, int type, boolean_t reentry_ok) 6722 { 6723 ip_stack_t *ipst; 6724 ipsq_t *ipsq; 6725 6726 /* Only 1 of ipif or ill can be specified */ 6727 ASSERT((ipif != NULL) ^ (ill != NULL)); 6728 6729 if (ipif != NULL) 6730 ill = ipif->ipif_ill; 6731 ipst = ill->ill_ipst; 6732 6733 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 6734 ipsq = ipsq_try_enter_internal(ill, q, mp, func, type, reentry_ok); 6735 rw_exit(&ipst->ips_ill_g_lock); 6736 6737 return (ipsq); 6738 } 6739 6740 /* 6741 * Try to enter the IPSQ corresponding to `ill' as writer. The caller ensures 6742 * ill is valid by refholding it if necessary; we will refrele. If the IPSQ 6743 * cannot be entered, the mp is queued for completion. 
6744 */ 6745 void 6746 qwriter_ip(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, 6747 boolean_t reentry_ok) 6748 { 6749 ipsq_t *ipsq; 6750 6751 ipsq = ipsq_try_enter(NULL, ill, q, mp, func, type, reentry_ok); 6752 6753 /* 6754 * Drop the caller's refhold on the ill. This is safe since we either 6755 * entered the IPSQ (and thus are exclusive), or failed to enter the 6756 * IPSQ, in which case we return without accessing ill anymore. This 6757 * is needed because func needs to see the correct refcount. 6758 * e.g. removeif can work only then. 6759 */ 6760 ill_refrele(ill); 6761 if (ipsq != NULL) { 6762 (*func)(ipsq, q, mp, NULL); 6763 ipsq_exit(ipsq); 6764 } 6765 } 6766 6767 /* 6768 * Exit the specified IPSQ. If this is the final exit on it then drain it 6769 * prior to exiting. Caller must be writer on the specified IPSQ. 6770 */ 6771 void 6772 ipsq_exit(ipsq_t *ipsq) 6773 { 6774 mblk_t *mp; 6775 ipsq_t *mp_ipsq; 6776 queue_t *q; 6777 phyint_t *phyi; 6778 ipsq_func_t func; 6779 6780 ASSERT(IAM_WRITER_IPSQ(ipsq)); 6781 6782 ASSERT(ipsq->ipsq_xop->ipx_reentry_cnt >= 1); 6783 if (ipsq->ipsq_xop->ipx_reentry_cnt != 1) { 6784 ipsq->ipsq_xop->ipx_reentry_cnt--; 6785 return; 6786 } 6787 6788 for (;;) { 6789 phyi = ipsq->ipsq_phyint; 6790 mp = ipsq_dq(ipsq); 6791 mp_ipsq = (mp == NULL) ? NULL : (ipsq_t *)mp->b_next; 6792 6793 /* 6794 * If we've changed to a new IPSQ, and the phyint associated 6795 * with the old one has gone away, free the old IPSQ. Note 6796 * that this cannot happen while the IPSQ is in a group. 
6797 */ 6798 if (mp_ipsq != ipsq && phyi == NULL) { 6799 ASSERT(ipsq->ipsq_next == ipsq); 6800 ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop); 6801 ipsq_delete(ipsq); 6802 } 6803 6804 if (mp == NULL) 6805 break; 6806 6807 q = mp->b_queue; 6808 func = (ipsq_func_t)mp->b_prev; 6809 ipsq = mp_ipsq; 6810 mp->b_next = mp->b_prev = NULL; 6811 mp->b_queue = NULL; 6812 6813 /* 6814 * If 'q' is an conn queue, it is valid, since we did a 6815 * a refhold on the conn at the start of the ioctl. 6816 * If 'q' is an ill queue, it is valid, since close of an 6817 * ill will clean up its IPSQ. 6818 */ 6819 (*func)(ipsq, q, mp, NULL); 6820 } 6821 } 6822 6823 /* 6824 * Used to start any igmp or mld timers that could not be started 6825 * while holding ill_mcast_lock. The timers can't be started while holding 6826 * the lock, since mld/igmp_start_timers may need to call untimeout() 6827 * which can't be done while holding the lock which the timeout handler 6828 * acquires. Otherwise 6829 * there could be a deadlock since the timeout handlers 6830 * mld_timeout_handler_per_ill/igmp_timeout_handler_per_ill also acquire 6831 * ill_mcast_lock. 6832 */ 6833 void 6834 ill_mcast_timer_start(ip_stack_t *ipst) 6835 { 6836 int next; 6837 6838 mutex_enter(&ipst->ips_igmp_timer_lock); 6839 next = ipst->ips_igmp_deferred_next; 6840 ipst->ips_igmp_deferred_next = INFINITY; 6841 mutex_exit(&ipst->ips_igmp_timer_lock); 6842 6843 if (next != INFINITY) 6844 igmp_start_timers(next, ipst); 6845 6846 mutex_enter(&ipst->ips_mld_timer_lock); 6847 next = ipst->ips_mld_deferred_next; 6848 ipst->ips_mld_deferred_next = INFINITY; 6849 mutex_exit(&ipst->ips_mld_timer_lock); 6850 6851 if (next != INFINITY) 6852 mld_start_timers(next, ipst); 6853 } 6854 6855 /* 6856 * Start the current exclusive operation on `ipsq'; associate it with `ipif' 6857 * and `ioccmd'. 
6858 */ 6859 void 6860 ipsq_current_start(ipsq_t *ipsq, ipif_t *ipif, int ioccmd) 6861 { 6862 ill_t *ill = ipif->ipif_ill; 6863 ipxop_t *ipx = ipsq->ipsq_xop; 6864 6865 ASSERT(IAM_WRITER_IPSQ(ipsq)); 6866 ASSERT(ipx->ipx_current_ipif == NULL); 6867 ASSERT(ipx->ipx_current_ioctl == 0); 6868 6869 ipx->ipx_current_done = B_FALSE; 6870 ipx->ipx_current_ioctl = ioccmd; 6871 mutex_enter(&ipx->ipx_lock); 6872 ipx->ipx_current_ipif = ipif; 6873 mutex_exit(&ipx->ipx_lock); 6874 6875 /* 6876 * Set IPIF_CHANGING on one or more ipifs associated with the 6877 * current exclusive operation. IPIF_CHANGING prevents any new 6878 * references to the ipif (so that the references will eventually 6879 * drop to zero) and also prevents any "get" operations (e.g., 6880 * SIOCGLIFFLAGS) from being able to access the ipif until the 6881 * operation has completed and the ipif is again in a stable state. 6882 * 6883 * For ioctls, IPIF_CHANGING is set on the ipif associated with the 6884 * ioctl. For internal operations (where ioccmd is zero), all ipifs 6885 * on the ill are marked with IPIF_CHANGING since it's unclear which 6886 * ipifs will be affected. 6887 * 6888 * Note that SIOCLIFREMOVEIF is a special case as it sets 6889 * IPIF_CONDEMNED internally after identifying the right ipif to 6890 * operate on. 6891 */ 6892 switch (ioccmd) { 6893 case SIOCLIFREMOVEIF: 6894 break; 6895 case 0: 6896 mutex_enter(&ill->ill_lock); 6897 ipif = ipif->ipif_ill->ill_ipif; 6898 for (; ipif != NULL; ipif = ipif->ipif_next) 6899 ipif->ipif_state_flags |= IPIF_CHANGING; 6900 mutex_exit(&ill->ill_lock); 6901 break; 6902 default: 6903 mutex_enter(&ill->ill_lock); 6904 ipif->ipif_state_flags |= IPIF_CHANGING; 6905 mutex_exit(&ill->ill_lock); 6906 } 6907 } 6908 6909 /* 6910 * Finish the current exclusive operation on `ipsq'. Usually, this will allow 6911 * the next exclusive operation to begin once we ipsq_exit(). 
* However, if
 * pending DLPI operations remain, then we will wait for the queue to drain
 * before allowing the next exclusive operation to begin.  This ensures that
 * DLPI operations from one exclusive operation are never improperly processed
 * as part of a subsequent exclusive operation.
 */
void
ipsq_current_finish(ipsq_t *ipsq)
{
	ipxop_t		*ipx = ipsq->ipsq_xop;
	ipif_t		*cur_ipif = ipx->ipx_current_ipif;
	t_uscalar_t	pending = DL_PRIM_INVAL;

	ASSERT(IAM_WRITER_IPSQ(ipsq));

	/*
	 * For SIOCLIFREMOVEIF, the ipif has been already been blown away
	 * (but in that case, IPIF_CHANGING will already be clear and no
	 * pending DLPI messages can remain).
	 */
	if (ipx->ipx_current_ioctl != SIOCLIFREMOVEIF) {
		ill_t	*ill = cur_ipif->ipif_ill;
		ipif_t	*walk;

		mutex_enter(&ill->ill_lock);
		pending = ill->ill_dlpi_pending;
		if (ipx->ipx_current_ioctl != 0) {
			/* An ioctl marked only its own ipif. */
			cur_ipif->ipif_state_flags &= ~IPIF_CHANGING;
		} else {
			/* An internal operation marked every ipif. */
			for (walk = ill->ill_ipif; walk != NULL;
			    walk = walk->ipif_next)
				walk->ipif_state_flags &= ~IPIF_CHANGING;
		}
		mutex_exit(&ill->ill_lock);
	}

	ASSERT(!ipx->ipx_current_done);
	ipx->ipx_current_done = B_TRUE;
	ipx->ipx_current_ioctl = 0;

	/*
	 * Only clear ipx_current_ipif when no DLPI primitive was pending
	 * on the ill; otherwise it is left set here.
	 */
	if (pending == DL_PRIM_INVAL) {
		mutex_enter(&ipx->ipx_lock);
		ipx->ipx_current_ipif = NULL;
		mutex_exit(&ipx->ipx_lock);
	}
}

/*
 * The ill is closing. Flush all messages on the ipsq that originated
 * from this ill. Usually there won't be any messages on the ipsq_xopq_mphead
 * for this ill since ipsq_enter could not have entered until then.
 * New messages can't be queued since the CONDEMNED flag is set.
6961 */ 6962 static void 6963 ipsq_flush(ill_t *ill) 6964 { 6965 queue_t *q; 6966 mblk_t *prev; 6967 mblk_t *mp; 6968 mblk_t *mp_next; 6969 ipxop_t *ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop; 6970 6971 ASSERT(IAM_WRITER_ILL(ill)); 6972 6973 /* 6974 * Flush any messages sent up by the driver. 6975 */ 6976 mutex_enter(&ipx->ipx_lock); 6977 for (prev = NULL, mp = ipx->ipx_mphead; mp != NULL; mp = mp_next) { 6978 mp_next = mp->b_next; 6979 q = mp->b_queue; 6980 if (q == ill->ill_rq || q == ill->ill_wq) { 6981 /* dequeue mp */ 6982 if (prev == NULL) 6983 ipx->ipx_mphead = mp->b_next; 6984 else 6985 prev->b_next = mp->b_next; 6986 if (ipx->ipx_mptail == mp) { 6987 ASSERT(mp_next == NULL); 6988 ipx->ipx_mptail = prev; 6989 } 6990 inet_freemsg(mp); 6991 } else { 6992 prev = mp; 6993 } 6994 } 6995 mutex_exit(&ipx->ipx_lock); 6996 (void) ipsq_pending_mp_cleanup(ill, NULL); 6997 ipsq_xopq_mp_cleanup(ill, NULL); 6998 } 6999 7000 /* 7001 * Parse an ifreq or lifreq struct coming down ioctls and refhold 7002 * and return the associated ipif. 7003 * Return value: 7004 * Non zero: An error has occurred. ci may not be filled out. 7005 * zero : ci is filled out with the ioctl cmd in ci.ci_name, and 7006 * a held ipif in ci.ci_ipif. 
7007 */ 7008 int 7009 ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, 7010 cmd_info_t *ci) 7011 { 7012 char *name; 7013 struct ifreq *ifr; 7014 struct lifreq *lifr; 7015 ipif_t *ipif = NULL; 7016 ill_t *ill; 7017 conn_t *connp; 7018 boolean_t isv6; 7019 boolean_t exists; 7020 mblk_t *mp1; 7021 zoneid_t zoneid; 7022 ip_stack_t *ipst; 7023 7024 if (q->q_next != NULL) { 7025 ill = (ill_t *)q->q_ptr; 7026 isv6 = ill->ill_isv6; 7027 connp = NULL; 7028 zoneid = ALL_ZONES; 7029 ipst = ill->ill_ipst; 7030 } else { 7031 ill = NULL; 7032 connp = Q_TO_CONN(q); 7033 isv6 = (connp->conn_family == AF_INET6); 7034 zoneid = connp->conn_zoneid; 7035 if (zoneid == GLOBAL_ZONEID) { 7036 /* global zone can access ipifs in all zones */ 7037 zoneid = ALL_ZONES; 7038 } 7039 ipst = connp->conn_netstack->netstack_ip; 7040 } 7041 7042 /* Has been checked in ip_wput_nondata */ 7043 mp1 = mp->b_cont->b_cont; 7044 7045 if (ipip->ipi_cmd_type == IF_CMD) { 7046 /* This a old style SIOC[GS]IF* command */ 7047 ifr = (struct ifreq *)mp1->b_rptr; 7048 /* 7049 * Null terminate the string to protect against buffer 7050 * overrun. String was generated by user code and may not 7051 * be trusted. 7052 */ 7053 ifr->ifr_name[IFNAMSIZ - 1] = '\0'; 7054 name = ifr->ifr_name; 7055 ci->ci_sin = (sin_t *)&ifr->ifr_addr; 7056 ci->ci_sin6 = NULL; 7057 ci->ci_lifr = (struct lifreq *)ifr; 7058 } else { 7059 /* This a new style SIOC[GS]LIF* command */ 7060 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 7061 lifr = (struct lifreq *)mp1->b_rptr; 7062 /* 7063 * Null terminate the string to protect against buffer 7064 * overrun. String was generated by user code and may not 7065 * be trusted. 
7066 */ 7067 lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; 7068 name = lifr->lifr_name; 7069 ci->ci_sin = (sin_t *)&lifr->lifr_addr; 7070 ci->ci_sin6 = (sin6_t *)&lifr->lifr_addr; 7071 ci->ci_lifr = lifr; 7072 } 7073 7074 if (ipip->ipi_cmd == SIOCSLIFNAME) { 7075 /* 7076 * The ioctl will be failed if the ioctl comes down 7077 * an conn stream 7078 */ 7079 if (ill == NULL) { 7080 /* 7081 * Not an ill queue, return EINVAL same as the 7082 * old error code. 7083 */ 7084 return (ENXIO); 7085 } 7086 ipif = ill->ill_ipif; 7087 ipif_refhold(ipif); 7088 } else { 7089 ipif = ipif_lookup_on_name(name, mi_strlen(name), B_FALSE, 7090 &exists, isv6, zoneid, ipst); 7091 7092 /* 7093 * Ensure that get ioctls don't see any internal state changes 7094 * caused by set ioctls by deferring them if IPIF_CHANGING is 7095 * set. 7096 */ 7097 if (ipif != NULL && !(ipip->ipi_flags & IPI_WR) && 7098 !IAM_WRITER_IPIF(ipif)) { 7099 ipsq_t *ipsq; 7100 7101 if (connp != NULL) 7102 mutex_enter(&connp->conn_lock); 7103 mutex_enter(&ipif->ipif_ill->ill_lock); 7104 if (IPIF_IS_CHANGING(ipif) && 7105 !IPIF_IS_CONDEMNED(ipif)) { 7106 ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq; 7107 mutex_enter(&ipsq->ipsq_lock); 7108 mutex_enter(&ipsq->ipsq_xop->ipx_lock); 7109 mutex_exit(&ipif->ipif_ill->ill_lock); 7110 ipsq_enq(ipsq, q, mp, ip_process_ioctl, 7111 NEW_OP, ipif->ipif_ill); 7112 mutex_exit(&ipsq->ipsq_xop->ipx_lock); 7113 mutex_exit(&ipsq->ipsq_lock); 7114 if (connp != NULL) 7115 mutex_exit(&connp->conn_lock); 7116 ipif_refrele(ipif); 7117 return (EINPROGRESS); 7118 } 7119 mutex_exit(&ipif->ipif_ill->ill_lock); 7120 if (connp != NULL) 7121 mutex_exit(&connp->conn_lock); 7122 } 7123 } 7124 7125 /* 7126 * Old style [GS]IFCMD does not admit IPv6 ipif 7127 */ 7128 if (ipif != NULL && ipif->ipif_isv6 && ipip->ipi_cmd_type == IF_CMD) { 7129 ipif_refrele(ipif); 7130 return (ENXIO); 7131 } 7132 7133 if (ipif == NULL && ill != NULL && ill->ill_ipif != NULL && 7134 name[0] == '\0') { 7135 /* 7136 * Handle a or a 
SIOC?IF* with a null name 7137 * during plumb (on the ill queue before the I_PLINK). 7138 */ 7139 ipif = ill->ill_ipif; 7140 ipif_refhold(ipif); 7141 } 7142 7143 if (ipif == NULL) 7144 return (ENXIO); 7145 7146 DTRACE_PROBE4(ipif__ioctl, char *, "ip_extract_lifreq", 7147 int, ipip->ipi_cmd, ill_t *, ipif->ipif_ill, ipif_t *, ipif); 7148 7149 ci->ci_ipif = ipif; 7150 return (0); 7151 } 7152 7153 /* 7154 * Return the total number of ipifs. 7155 */ 7156 static uint_t 7157 ip_get_numifs(zoneid_t zoneid, ip_stack_t *ipst) 7158 { 7159 uint_t numifs = 0; 7160 ill_t *ill; 7161 ill_walk_context_t ctx; 7162 ipif_t *ipif; 7163 7164 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7165 ill = ILL_START_WALK_V4(&ctx, ipst); 7166 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 7167 if (IS_UNDER_IPMP(ill)) 7168 continue; 7169 for (ipif = ill->ill_ipif; ipif != NULL; 7170 ipif = ipif->ipif_next) { 7171 if (ipif->ipif_zoneid == zoneid || 7172 ipif->ipif_zoneid == ALL_ZONES) 7173 numifs++; 7174 } 7175 } 7176 rw_exit(&ipst->ips_ill_g_lock); 7177 return (numifs); 7178 } 7179 7180 /* 7181 * Return the total number of ipifs. 
7182 */ 7183 static uint_t 7184 ip_get_numlifs(int family, int lifn_flags, zoneid_t zoneid, ip_stack_t *ipst) 7185 { 7186 uint_t numifs = 0; 7187 ill_t *ill; 7188 ipif_t *ipif; 7189 ill_walk_context_t ctx; 7190 7191 ip1dbg(("ip_get_numlifs(%d %u %d)\n", family, lifn_flags, (int)zoneid)); 7192 7193 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7194 if (family == AF_INET) 7195 ill = ILL_START_WALK_V4(&ctx, ipst); 7196 else if (family == AF_INET6) 7197 ill = ILL_START_WALK_V6(&ctx, ipst); 7198 else 7199 ill = ILL_START_WALK_ALL(&ctx, ipst); 7200 7201 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 7202 if (IS_UNDER_IPMP(ill) && !(lifn_flags & LIFC_UNDER_IPMP)) 7203 continue; 7204 7205 for (ipif = ill->ill_ipif; ipif != NULL; 7206 ipif = ipif->ipif_next) { 7207 if ((ipif->ipif_flags & IPIF_NOXMIT) && 7208 !(lifn_flags & LIFC_NOXMIT)) 7209 continue; 7210 if ((ipif->ipif_flags & IPIF_TEMPORARY) && 7211 !(lifn_flags & LIFC_TEMPORARY)) 7212 continue; 7213 if (((ipif->ipif_flags & 7214 (IPIF_NOXMIT|IPIF_NOLOCAL| 7215 IPIF_DEPRECATED)) || 7216 IS_LOOPBACK(ill) || 7217 !(ipif->ipif_flags & IPIF_UP)) && 7218 (lifn_flags & LIFC_EXTERNAL_SOURCE)) 7219 continue; 7220 7221 if (zoneid != ipif->ipif_zoneid && 7222 ipif->ipif_zoneid != ALL_ZONES && 7223 (zoneid != GLOBAL_ZONEID || 7224 !(lifn_flags & LIFC_ALLZONES))) 7225 continue; 7226 7227 numifs++; 7228 } 7229 } 7230 rw_exit(&ipst->ips_ill_g_lock); 7231 return (numifs); 7232 } 7233 7234 uint_t 7235 ip_get_lifsrcofnum(ill_t *ill) 7236 { 7237 uint_t numifs = 0; 7238 ill_t *ill_head = ill; 7239 ip_stack_t *ipst = ill->ill_ipst; 7240 7241 /* 7242 * ill_g_usesrc_lock protects ill_usesrc_grp_next, for example, some 7243 * other thread may be trying to relink the ILLs in this usesrc group 7244 * and adjusting the ill_usesrc_grp_next pointers 7245 */ 7246 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER); 7247 if ((ill->ill_usesrc_ifindex == 0) && 7248 (ill->ill_usesrc_grp_next != NULL)) { 7249 for (; (ill != NULL) && 
(ill->ill_usesrc_grp_next != ill_head); 7250 ill = ill->ill_usesrc_grp_next) 7251 numifs++; 7252 } 7253 rw_exit(&ipst->ips_ill_g_usesrc_lock); 7254 7255 return (numifs); 7256 } 7257 7258 /* Null values are passed in for ipif, sin, and ifreq */ 7259 /* ARGSUSED */ 7260 int 7261 ip_sioctl_get_ifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 7262 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7263 { 7264 int *nump; 7265 conn_t *connp = Q_TO_CONN(q); 7266 7267 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 7268 7269 /* Existence of b_cont->b_cont checked in ip_wput_nondata */ 7270 nump = (int *)mp->b_cont->b_cont->b_rptr; 7271 7272 *nump = ip_get_numifs(connp->conn_zoneid, 7273 connp->conn_netstack->netstack_ip); 7274 ip1dbg(("ip_sioctl_get_ifnum numifs %d", *nump)); 7275 return (0); 7276 } 7277 7278 /* Null values are passed in for ipif, sin, and ifreq */ 7279 /* ARGSUSED */ 7280 int 7281 ip_sioctl_get_lifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, 7282 queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7283 { 7284 struct lifnum *lifn; 7285 mblk_t *mp1; 7286 conn_t *connp = Q_TO_CONN(q); 7287 7288 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 7289 7290 /* Existence checked in ip_wput_nondata */ 7291 mp1 = mp->b_cont->b_cont; 7292 7293 lifn = (struct lifnum *)mp1->b_rptr; 7294 switch (lifn->lifn_family) { 7295 case AF_UNSPEC: 7296 case AF_INET: 7297 case AF_INET6: 7298 break; 7299 default: 7300 return (EAFNOSUPPORT); 7301 } 7302 7303 lifn->lifn_count = ip_get_numlifs(lifn->lifn_family, lifn->lifn_flags, 7304 connp->conn_zoneid, connp->conn_netstack->netstack_ip); 7305 ip1dbg(("ip_sioctl_get_lifnum numifs %d", lifn->lifn_count)); 7306 return (0); 7307 } 7308 7309 /* ARGSUSED */ 7310 int 7311 ip_sioctl_get_ifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 7312 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7313 { 7314 STRUCT_HANDLE(ifconf, ifc); 7315 mblk_t *mp1; 7316 struct iocblk *iocp; 7317 struct 
ifreq *ifr; 7318 ill_walk_context_t ctx; 7319 ill_t *ill; 7320 ipif_t *ipif; 7321 struct sockaddr_in *sin; 7322 int32_t ifclen; 7323 zoneid_t zoneid; 7324 ip_stack_t *ipst = CONNQ_TO_IPST(q); 7325 7326 ASSERT(q->q_next == NULL); /* not valid ioctls for ip as a module */ 7327 7328 ip1dbg(("ip_sioctl_get_ifconf")); 7329 /* Existence verified in ip_wput_nondata */ 7330 mp1 = mp->b_cont->b_cont; 7331 iocp = (struct iocblk *)mp->b_rptr; 7332 zoneid = Q_TO_CONN(q)->conn_zoneid; 7333 7334 /* 7335 * The original SIOCGIFCONF passed in a struct ifconf which specified 7336 * the user buffer address and length into which the list of struct 7337 * ifreqs was to be copied. Since AT&T Streams does not seem to 7338 * allow M_COPYOUT to be used in conjunction with I_STR IOCTLS, 7339 * the SIOCGIFCONF operation was redefined to simply provide 7340 * a large output buffer into which we are supposed to jam the ifreq 7341 * array. The same ioctl command code was used, despite the fact that 7342 * both the applications and the kernel code had to change, thus making 7343 * it impossible to support both interfaces. 7344 * 7345 * For reasons not good enough to try to explain, the following 7346 * algorithm is used for deciding what to do with one of these: 7347 * If the IOCTL comes in as an I_STR, it is assumed to be of the new 7348 * form with the output buffer coming down as the continuation message. 7349 * If it arrives as a TRANSPARENT IOCTL, it is assumed to be old style, 7350 * and we have to copy in the ifconf structure to find out how big the 7351 * output buffer is and where to copy out to. Sure no problem... 7352 * 7353 */ 7354 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, NULL); 7355 if ((mp1->b_wptr - mp1->b_rptr) == STRUCT_SIZE(ifc)) { 7356 int numifs = 0; 7357 size_t ifc_bufsize; 7358 7359 /* 7360 * Must be (better be!) continuation of a TRANSPARENT 7361 * IOCTL. We just copied in the ifconf structure. 
7362 */ 7363 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, 7364 (struct ifconf *)mp1->b_rptr); 7365 7366 /* 7367 * Allocate a buffer to hold requested information. 7368 * 7369 * If ifc_len is larger than what is needed, we only 7370 * allocate what we will use. 7371 * 7372 * If ifc_len is smaller than what is needed, return 7373 * EINVAL. 7374 * 7375 * XXX: the ill_t structure can hava 2 counters, for 7376 * v4 and v6 (not just ill_ipif_up_count) to store the 7377 * number of interfaces for a device, so we don't need 7378 * to count them here... 7379 */ 7380 numifs = ip_get_numifs(zoneid, ipst); 7381 7382 ifclen = STRUCT_FGET(ifc, ifc_len); 7383 ifc_bufsize = numifs * sizeof (struct ifreq); 7384 if (ifc_bufsize > ifclen) { 7385 if (iocp->ioc_cmd == O_SIOCGIFCONF) { 7386 /* old behaviour */ 7387 return (EINVAL); 7388 } else { 7389 ifc_bufsize = ifclen; 7390 } 7391 } 7392 7393 mp1 = mi_copyout_alloc(q, mp, 7394 STRUCT_FGETP(ifc, ifc_buf), ifc_bufsize, B_FALSE); 7395 if (mp1 == NULL) 7396 return (ENOMEM); 7397 7398 mp1->b_wptr = mp1->b_rptr + ifc_bufsize; 7399 } 7400 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr); 7401 /* 7402 * the SIOCGIFCONF ioctl only knows about 7403 * IPv4 addresses, so don't try to tell 7404 * it about interfaces with IPv6-only 7405 * addresses. 
(Last parm 'isv6' is B_FALSE) 7406 */ 7407 7408 ifr = (struct ifreq *)mp1->b_rptr; 7409 7410 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7411 ill = ILL_START_WALK_V4(&ctx, ipst); 7412 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 7413 if (IS_UNDER_IPMP(ill)) 7414 continue; 7415 for (ipif = ill->ill_ipif; ipif != NULL; 7416 ipif = ipif->ipif_next) { 7417 if (zoneid != ipif->ipif_zoneid && 7418 ipif->ipif_zoneid != ALL_ZONES) 7419 continue; 7420 if ((uchar_t *)&ifr[1] > mp1->b_wptr) { 7421 if (iocp->ioc_cmd == O_SIOCGIFCONF) { 7422 /* old behaviour */ 7423 rw_exit(&ipst->ips_ill_g_lock); 7424 return (EINVAL); 7425 } else { 7426 goto if_copydone; 7427 } 7428 } 7429 ipif_get_name(ipif, ifr->ifr_name, 7430 sizeof (ifr->ifr_name)); 7431 sin = (sin_t *)&ifr->ifr_addr; 7432 *sin = sin_null; 7433 sin->sin_family = AF_INET; 7434 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 7435 ifr++; 7436 } 7437 } 7438 if_copydone: 7439 rw_exit(&ipst->ips_ill_g_lock); 7440 mp1->b_wptr = (uchar_t *)ifr; 7441 7442 if (STRUCT_BUF(ifc) != NULL) { 7443 STRUCT_FSET(ifc, ifc_len, 7444 (int)((uchar_t *)ifr - mp1->b_rptr)); 7445 } 7446 return (0); 7447 } 7448 7449 /* 7450 * Get the interfaces using the address hosted on the interface passed in, 7451 * as a source adddress 7452 */ 7453 /* ARGSUSED */ 7454 int 7455 ip_sioctl_get_lifsrcof(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 7456 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7457 { 7458 mblk_t *mp1; 7459 ill_t *ill, *ill_head; 7460 ipif_t *ipif, *orig_ipif; 7461 int numlifs = 0; 7462 size_t lifs_bufsize, lifsmaxlen; 7463 struct lifreq *lifr; 7464 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 7465 uint_t ifindex; 7466 zoneid_t zoneid; 7467 boolean_t isv6 = B_FALSE; 7468 struct sockaddr_in *sin; 7469 struct sockaddr_in6 *sin6; 7470 STRUCT_HANDLE(lifsrcof, lifs); 7471 ip_stack_t *ipst; 7472 7473 ipst = CONNQ_TO_IPST(q); 7474 7475 ASSERT(q->q_next == NULL); 7476 7477 zoneid = Q_TO_CONN(q)->conn_zoneid; 7478 7479 /* Existence verified 
in ip_wput_nondata */ 7480 mp1 = mp->b_cont->b_cont; 7481 7482 /* 7483 * Must be (better be!) continuation of a TRANSPARENT 7484 * IOCTL. We just copied in the lifsrcof structure. 7485 */ 7486 STRUCT_SET_HANDLE(lifs, iocp->ioc_flag, 7487 (struct lifsrcof *)mp1->b_rptr); 7488 7489 if (MBLKL(mp1) != STRUCT_SIZE(lifs)) 7490 return (EINVAL); 7491 7492 ifindex = STRUCT_FGET(lifs, lifs_ifindex); 7493 isv6 = (Q_TO_CONN(q))->conn_family == AF_INET6; 7494 ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid, ipst); 7495 if (ipif == NULL) { 7496 ip1dbg(("ip_sioctl_get_lifsrcof: no ipif for ifindex %d\n", 7497 ifindex)); 7498 return (ENXIO); 7499 } 7500 7501 /* Allocate a buffer to hold requested information */ 7502 numlifs = ip_get_lifsrcofnum(ipif->ipif_ill); 7503 lifs_bufsize = numlifs * sizeof (struct lifreq); 7504 lifsmaxlen = STRUCT_FGET(lifs, lifs_maxlen); 7505 /* The actual size needed is always returned in lifs_len */ 7506 STRUCT_FSET(lifs, lifs_len, lifs_bufsize); 7507 7508 /* If the amount we need is more than what is passed in, abort */ 7509 if (lifs_bufsize > lifsmaxlen || lifs_bufsize == 0) { 7510 ipif_refrele(ipif); 7511 return (0); 7512 } 7513 7514 mp1 = mi_copyout_alloc(q, mp, 7515 STRUCT_FGETP(lifs, lifs_buf), lifs_bufsize, B_FALSE); 7516 if (mp1 == NULL) { 7517 ipif_refrele(ipif); 7518 return (ENOMEM); 7519 } 7520 7521 mp1->b_wptr = mp1->b_rptr + lifs_bufsize; 7522 bzero(mp1->b_rptr, lifs_bufsize); 7523 7524 lifr = (struct lifreq *)mp1->b_rptr; 7525 7526 ill = ill_head = ipif->ipif_ill; 7527 orig_ipif = ipif; 7528 7529 /* ill_g_usesrc_lock protects ill_usesrc_grp_next */ 7530 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER); 7531 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7532 7533 ill = ill->ill_usesrc_grp_next; /* start from next ill */ 7534 for (; (ill != NULL) && (ill != ill_head); 7535 ill = ill->ill_usesrc_grp_next) { 7536 7537 if ((uchar_t *)&lifr[1] > mp1->b_wptr) 7538 break; 7539 7540 ipif = ill->ill_ipif; 7541 ipif_get_name(ipif, lifr->lifr_name, 
sizeof (lifr->lifr_name)); 7542 if (ipif->ipif_isv6) { 7543 sin6 = (sin6_t *)&lifr->lifr_addr; 7544 *sin6 = sin6_null; 7545 sin6->sin6_family = AF_INET6; 7546 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 7547 lifr->lifr_addrlen = ip_mask_to_plen_v6( 7548 &ipif->ipif_v6net_mask); 7549 } else { 7550 sin = (sin_t *)&lifr->lifr_addr; 7551 *sin = sin_null; 7552 sin->sin_family = AF_INET; 7553 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 7554 lifr->lifr_addrlen = ip_mask_to_plen( 7555 ipif->ipif_net_mask); 7556 } 7557 lifr++; 7558 } 7559 rw_exit(&ipst->ips_ill_g_usesrc_lock); 7560 rw_exit(&ipst->ips_ill_g_lock); 7561 ipif_refrele(orig_ipif); 7562 mp1->b_wptr = (uchar_t *)lifr; 7563 STRUCT_FSET(lifs, lifs_len, (int)((uchar_t *)lifr - mp1->b_rptr)); 7564 7565 return (0); 7566 } 7567 7568 /* ARGSUSED */ 7569 int 7570 ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 7571 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7572 { 7573 mblk_t *mp1; 7574 int list; 7575 ill_t *ill; 7576 ipif_t *ipif; 7577 int flags; 7578 int numlifs = 0; 7579 size_t lifc_bufsize; 7580 struct lifreq *lifr; 7581 sa_family_t family; 7582 struct sockaddr_in *sin; 7583 struct sockaddr_in6 *sin6; 7584 ill_walk_context_t ctx; 7585 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 7586 int32_t lifclen; 7587 zoneid_t zoneid; 7588 STRUCT_HANDLE(lifconf, lifc); 7589 ip_stack_t *ipst = CONNQ_TO_IPST(q); 7590 7591 ip1dbg(("ip_sioctl_get_lifconf")); 7592 7593 ASSERT(q->q_next == NULL); 7594 7595 zoneid = Q_TO_CONN(q)->conn_zoneid; 7596 7597 /* Existence verified in ip_wput_nondata */ 7598 mp1 = mp->b_cont->b_cont; 7599 7600 /* 7601 * An extended version of SIOCGIFCONF that takes an 7602 * additional address family and flags field. 7603 * AF_UNSPEC retrieve both IPv4 and IPv6. 7604 * Unless LIFC_NOXMIT is specified the IPIF_NOXMIT 7605 * interfaces are omitted. 7606 * Similarly, IPIF_TEMPORARY interfaces are omitted 7607 * unless LIFC_TEMPORARY is specified. 
7608 * If LIFC_EXTERNAL_SOURCE is specified, IPIF_NOXMIT, 7609 * IPIF_NOLOCAL, PHYI_LOOPBACK, IPIF_DEPRECATED and 7610 * not IPIF_UP interfaces are omitted. LIFC_EXTERNAL_SOURCE 7611 * has priority over LIFC_NOXMIT. 7612 */ 7613 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, NULL); 7614 7615 if ((mp1->b_wptr - mp1->b_rptr) != STRUCT_SIZE(lifc)) 7616 return (EINVAL); 7617 7618 /* 7619 * Must be (better be!) continuation of a TRANSPARENT 7620 * IOCTL. We just copied in the lifconf structure. 7621 */ 7622 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, (struct lifconf *)mp1->b_rptr); 7623 7624 family = STRUCT_FGET(lifc, lifc_family); 7625 flags = STRUCT_FGET(lifc, lifc_flags); 7626 7627 switch (family) { 7628 case AF_UNSPEC: 7629 /* 7630 * walk all ILL's. 7631 */ 7632 list = MAX_G_HEADS; 7633 break; 7634 case AF_INET: 7635 /* 7636 * walk only IPV4 ILL's. 7637 */ 7638 list = IP_V4_G_HEAD; 7639 break; 7640 case AF_INET6: 7641 /* 7642 * walk only IPV6 ILL's. 7643 */ 7644 list = IP_V6_G_HEAD; 7645 break; 7646 default: 7647 return (EAFNOSUPPORT); 7648 } 7649 7650 /* 7651 * Allocate a buffer to hold requested information. 7652 * 7653 * If lifc_len is larger than what is needed, we only 7654 * allocate what we will use. 7655 * 7656 * If lifc_len is smaller than what is needed, return 7657 * EINVAL. 
7658 */ 7659 numlifs = ip_get_numlifs(family, flags, zoneid, ipst); 7660 lifc_bufsize = numlifs * sizeof (struct lifreq); 7661 lifclen = STRUCT_FGET(lifc, lifc_len); 7662 if (lifc_bufsize > lifclen) { 7663 if (iocp->ioc_cmd == O_SIOCGLIFCONF) 7664 return (EINVAL); 7665 else 7666 lifc_bufsize = lifclen; 7667 } 7668 7669 mp1 = mi_copyout_alloc(q, mp, 7670 STRUCT_FGETP(lifc, lifc_buf), lifc_bufsize, B_FALSE); 7671 if (mp1 == NULL) 7672 return (ENOMEM); 7673 7674 mp1->b_wptr = mp1->b_rptr + lifc_bufsize; 7675 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr); 7676 7677 lifr = (struct lifreq *)mp1->b_rptr; 7678 7679 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7680 ill = ill_first(list, list, &ctx, ipst); 7681 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 7682 if (IS_UNDER_IPMP(ill) && !(flags & LIFC_UNDER_IPMP)) 7683 continue; 7684 7685 for (ipif = ill->ill_ipif; ipif != NULL; 7686 ipif = ipif->ipif_next) { 7687 if ((ipif->ipif_flags & IPIF_NOXMIT) && 7688 !(flags & LIFC_NOXMIT)) 7689 continue; 7690 7691 if ((ipif->ipif_flags & IPIF_TEMPORARY) && 7692 !(flags & LIFC_TEMPORARY)) 7693 continue; 7694 7695 if (((ipif->ipif_flags & 7696 (IPIF_NOXMIT|IPIF_NOLOCAL| 7697 IPIF_DEPRECATED)) || 7698 IS_LOOPBACK(ill) || 7699 !(ipif->ipif_flags & IPIF_UP)) && 7700 (flags & LIFC_EXTERNAL_SOURCE)) 7701 continue; 7702 7703 if (zoneid != ipif->ipif_zoneid && 7704 ipif->ipif_zoneid != ALL_ZONES && 7705 (zoneid != GLOBAL_ZONEID || 7706 !(flags & LIFC_ALLZONES))) 7707 continue; 7708 7709 if ((uchar_t *)&lifr[1] > mp1->b_wptr) { 7710 if (iocp->ioc_cmd == O_SIOCGLIFCONF) { 7711 rw_exit(&ipst->ips_ill_g_lock); 7712 return (EINVAL); 7713 } else { 7714 goto lif_copydone; 7715 } 7716 } 7717 7718 ipif_get_name(ipif, lifr->lifr_name, 7719 sizeof (lifr->lifr_name)); 7720 lifr->lifr_type = ill->ill_type; 7721 if (ipif->ipif_isv6) { 7722 sin6 = (sin6_t *)&lifr->lifr_addr; 7723 *sin6 = sin6_null; 7724 sin6->sin6_family = AF_INET6; 7725 sin6->sin6_addr = 7726 ipif->ipif_v6lcl_addr; 7727 
lifr->lifr_addrlen = 7728 ip_mask_to_plen_v6( 7729 &ipif->ipif_v6net_mask); 7730 } else { 7731 sin = (sin_t *)&lifr->lifr_addr; 7732 *sin = sin_null; 7733 sin->sin_family = AF_INET; 7734 sin->sin_addr.s_addr = 7735 ipif->ipif_lcl_addr; 7736 lifr->lifr_addrlen = 7737 ip_mask_to_plen( 7738 ipif->ipif_net_mask); 7739 } 7740 lifr++; 7741 } 7742 } 7743 lif_copydone: 7744 rw_exit(&ipst->ips_ill_g_lock); 7745 7746 mp1->b_wptr = (uchar_t *)lifr; 7747 if (STRUCT_BUF(lifc) != NULL) { 7748 STRUCT_FSET(lifc, lifc_len, 7749 (int)((uchar_t *)lifr - mp1->b_rptr)); 7750 } 7751 return (0); 7752 } 7753 7754 static void 7755 ip_sioctl_ip6addrpolicy(queue_t *q, mblk_t *mp) 7756 { 7757 ip6_asp_t *table; 7758 size_t table_size; 7759 mblk_t *data_mp; 7760 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 7761 ip_stack_t *ipst; 7762 7763 if (q->q_next == NULL) 7764 ipst = CONNQ_TO_IPST(q); 7765 else 7766 ipst = ILLQ_TO_IPST(q); 7767 7768 /* These two ioctls are I_STR only */ 7769 if (iocp->ioc_count == TRANSPARENT) { 7770 miocnak(q, mp, 0, EINVAL); 7771 return; 7772 } 7773 7774 data_mp = mp->b_cont; 7775 if (data_mp == NULL) { 7776 /* The user passed us a NULL argument */ 7777 table = NULL; 7778 table_size = iocp->ioc_count; 7779 } else { 7780 /* 7781 * The user provided a table. The stream head 7782 * may have copied in the user data in chunks, 7783 * so make sure everything is pulled up 7784 * properly. 
7785 */ 7786 if (MBLKL(data_mp) < iocp->ioc_count) { 7787 mblk_t *new_data_mp; 7788 if ((new_data_mp = msgpullup(data_mp, -1)) == 7789 NULL) { 7790 miocnak(q, mp, 0, ENOMEM); 7791 return; 7792 } 7793 freemsg(data_mp); 7794 data_mp = new_data_mp; 7795 mp->b_cont = data_mp; 7796 } 7797 table = (ip6_asp_t *)data_mp->b_rptr; 7798 table_size = iocp->ioc_count; 7799 } 7800 7801 switch (iocp->ioc_cmd) { 7802 case SIOCGIP6ADDRPOLICY: 7803 iocp->ioc_rval = ip6_asp_get(table, table_size, ipst); 7804 if (iocp->ioc_rval == -1) 7805 iocp->ioc_error = EINVAL; 7806 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4 7807 else if (table != NULL && 7808 (iocp->ioc_flag & IOC_MODELS) == IOC_ILP32) { 7809 ip6_asp_t *src = table; 7810 ip6_asp32_t *dst = (void *)table; 7811 int count = table_size / sizeof (ip6_asp_t); 7812 int i; 7813 7814 /* 7815 * We need to do an in-place shrink of the array 7816 * to match the alignment attributes of the 7817 * 32-bit ABI looking at it. 7818 */ 7819 /* LINTED: logical expression always true: op "||" */ 7820 ASSERT(sizeof (*src) > sizeof (*dst)); 7821 for (i = 1; i < count; i++) 7822 bcopy(src + i, dst + i, sizeof (*dst)); 7823 } 7824 #endif 7825 break; 7826 7827 case SIOCSIP6ADDRPOLICY: 7828 ASSERT(mp->b_prev == NULL); 7829 mp->b_prev = (void *)q; 7830 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4 7831 /* 7832 * We pass in the datamodel here so that the ip6_asp_replace() 7833 * routine can handle converting from 32-bit to native formats 7834 * where necessary. 7835 * 7836 * A better way to handle this might be to convert the inbound 7837 * data structure here, and hang it off a new 'mp'; thus the 7838 * ip6_asp_replace() logic would always be dealing with native 7839 * format data structures.. 7840 * 7841 * (An even simpler way to handle these ioctls is to just 7842 * add a 32-bit trailing 'pad' field to the ip6_asp_t structure 7843 * and just recompile everything that depends on it.) 
7844 */ 7845 #endif 7846 ip6_asp_replace(mp, table, table_size, B_FALSE, ipst, 7847 iocp->ioc_flag & IOC_MODELS); 7848 return; 7849 } 7850 7851 DB_TYPE(mp) = (iocp->ioc_error == 0) ? M_IOCACK : M_IOCNAK; 7852 qreply(q, mp); 7853 } 7854 7855 static void 7856 ip_sioctl_dstinfo(queue_t *q, mblk_t *mp) 7857 { 7858 mblk_t *data_mp; 7859 struct dstinforeq *dir; 7860 uint8_t *end, *cur; 7861 in6_addr_t *daddr, *saddr; 7862 ipaddr_t v4daddr; 7863 ire_t *ire; 7864 ipaddr_t v4setsrc; 7865 in6_addr_t v6setsrc; 7866 char *slabel, *dlabel; 7867 boolean_t isipv4; 7868 int match_ire; 7869 ill_t *dst_ill; 7870 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 7871 conn_t *connp = Q_TO_CONN(q); 7872 zoneid_t zoneid = IPCL_ZONEID(connp); 7873 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 7874 uint64_t ipif_flags; 7875 7876 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 7877 7878 /* 7879 * This ioctl is I_STR only, and must have a 7880 * data mblk following the M_IOCTL mblk. 7881 */ 7882 data_mp = mp->b_cont; 7883 if (iocp->ioc_count == TRANSPARENT || data_mp == NULL) { 7884 miocnak(q, mp, 0, EINVAL); 7885 return; 7886 } 7887 7888 if (MBLKL(data_mp) < iocp->ioc_count) { 7889 mblk_t *new_data_mp; 7890 7891 if ((new_data_mp = msgpullup(data_mp, -1)) == NULL) { 7892 miocnak(q, mp, 0, ENOMEM); 7893 return; 7894 } 7895 freemsg(data_mp); 7896 data_mp = new_data_mp; 7897 mp->b_cont = data_mp; 7898 } 7899 match_ire = MATCH_IRE_DSTONLY; 7900 7901 for (cur = data_mp->b_rptr, end = data_mp->b_wptr; 7902 end - cur >= sizeof (struct dstinforeq); 7903 cur += sizeof (struct dstinforeq)) { 7904 dir = (struct dstinforeq *)cur; 7905 daddr = &dir->dir_daddr; 7906 saddr = &dir->dir_saddr; 7907 7908 /* 7909 * ip_addr_scope_v6() and ip6_asp_lookup() handle 7910 * v4 mapped addresses; ire_ftable_lookup_v6() 7911 * and ip_select_source_v6() do not. 
7912 */ 7913 dir->dir_dscope = ip_addr_scope_v6(daddr); 7914 dlabel = ip6_asp_lookup(daddr, &dir->dir_precedence, ipst); 7915 7916 isipv4 = IN6_IS_ADDR_V4MAPPED(daddr); 7917 if (isipv4) { 7918 IN6_V4MAPPED_TO_IPADDR(daddr, v4daddr); 7919 v4setsrc = INADDR_ANY; 7920 ire = ire_route_recursive_v4(v4daddr, 0, NULL, zoneid, 7921 NULL, match_ire, B_TRUE, 0, ipst, &v4setsrc, NULL, 7922 NULL); 7923 } else { 7924 v6setsrc = ipv6_all_zeros; 7925 ire = ire_route_recursive_v6(daddr, 0, NULL, zoneid, 7926 NULL, match_ire, B_TRUE, 0, ipst, &v6setsrc, NULL, 7927 NULL); 7928 } 7929 ASSERT(ire != NULL); 7930 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 7931 ire_refrele(ire); 7932 dir->dir_dreachable = 0; 7933 7934 /* move on to next dst addr */ 7935 continue; 7936 } 7937 dir->dir_dreachable = 1; 7938 7939 dst_ill = ire_nexthop_ill(ire); 7940 if (dst_ill == NULL) { 7941 ire_refrele(ire); 7942 continue; 7943 } 7944 7945 /* With ipmp we most likely look at the ipmp ill here */ 7946 dir->dir_dmactype = dst_ill->ill_mactype; 7947 7948 if (isipv4) { 7949 ipaddr_t v4saddr; 7950 7951 if (ip_select_source_v4(dst_ill, v4setsrc, v4daddr, 7952 connp->conn_ixa->ixa_multicast_ifaddr, zoneid, ipst, 7953 &v4saddr, NULL, &ipif_flags) != 0) { 7954 v4saddr = INADDR_ANY; 7955 ipif_flags = 0; 7956 } 7957 IN6_IPADDR_TO_V4MAPPED(v4saddr, saddr); 7958 } else { 7959 if (ip_select_source_v6(dst_ill, &v6setsrc, daddr, 7960 zoneid, ipst, B_FALSE, IPV6_PREFER_SRC_DEFAULT, 7961 saddr, NULL, &ipif_flags) != 0) { 7962 *saddr = ipv6_all_zeros; 7963 ipif_flags = 0; 7964 } 7965 } 7966 7967 dir->dir_sscope = ip_addr_scope_v6(saddr); 7968 slabel = ip6_asp_lookup(saddr, NULL, ipst); 7969 dir->dir_labelmatch = ip6_asp_labelcmp(dlabel, slabel); 7970 dir->dir_sdeprecated = (ipif_flags & IPIF_DEPRECATED) ? 1 : 0; 7971 ire_refrele(ire); 7972 ill_refrele(dst_ill); 7973 } 7974 miocack(q, mp, iocp->ioc_count, 0); 7975 } 7976 7977 /* 7978 * Check if this is an address assigned to this machine. 
7979 * Skips interfaces that are down by using ire checks. 7980 * Translates mapped addresses to v4 addresses and then 7981 * treats them as such, returning true if the v4 address 7982 * associated with this mapped address is configured. 7983 * Note: Applications will have to be careful what they do 7984 * with the response; use of mapped addresses limits 7985 * what can be done with the socket, especially with 7986 * respect to socket options and ioctls - neither IPv4 7987 * options nor IPv6 sticky options/ancillary data options 7988 * may be used. 7989 */ 7990 /* ARGSUSED */ 7991 int 7992 ip_sioctl_tmyaddr(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 7993 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 7994 { 7995 struct sioc_addrreq *sia; 7996 sin_t *sin; 7997 ire_t *ire; 7998 mblk_t *mp1; 7999 zoneid_t zoneid; 8000 ip_stack_t *ipst; 8001 8002 ip1dbg(("ip_sioctl_tmyaddr")); 8003 8004 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 8005 zoneid = Q_TO_CONN(q)->conn_zoneid; 8006 ipst = CONNQ_TO_IPST(q); 8007 8008 /* Existence verified in ip_wput_nondata */ 8009 mp1 = mp->b_cont->b_cont; 8010 sia = (struct sioc_addrreq *)mp1->b_rptr; 8011 sin = (sin_t *)&sia->sa_addr; 8012 switch (sin->sin_family) { 8013 case AF_INET6: { 8014 sin6_t *sin6 = (sin6_t *)sin; 8015 8016 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 8017 ipaddr_t v4_addr; 8018 8019 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, 8020 v4_addr); 8021 ire = ire_ftable_lookup_v4(v4_addr, 0, 0, 8022 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, NULL, 8023 MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL); 8024 } else { 8025 in6_addr_t v6addr; 8026 8027 v6addr = sin6->sin6_addr; 8028 ire = ire_ftable_lookup_v6(&v6addr, 0, 0, 8029 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, NULL, 8030 MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL); 8031 } 8032 break; 8033 } 8034 case AF_INET: { 8035 ipaddr_t v4addr; 8036 8037 v4addr = sin->sin_addr.s_addr; 8038 ire = ire_ftable_lookup_v4(v4addr, 0, 0, 8039 
IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 8040 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL); 8041 break; 8042 } 8043 default: 8044 return (EAFNOSUPPORT); 8045 } 8046 if (ire != NULL) { 8047 sia->sa_res = 1; 8048 ire_refrele(ire); 8049 } else { 8050 sia->sa_res = 0; 8051 } 8052 return (0); 8053 } 8054 8055 /* 8056 * Check if this is an address assigned on-link i.e. neighbor, 8057 * and makes sure it's reachable from the current zone. 8058 * Returns true for my addresses as well. 8059 * Translates mapped addresses to v4 addresses and then 8060 * treats them as such, returning true if the v4 address 8061 * associated with this mapped address is configured. 8062 * Note: Applications will have to be careful what they do 8063 * with the response; use of mapped addresses limits 8064 * what can be done with the socket, especially with 8065 * respect to socket options and ioctls - neither IPv4 8066 * options nor IPv6 sticky options/ancillary data options 8067 * may be used. 8068 */ 8069 /* ARGSUSED */ 8070 int 8071 ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 8072 ip_ioctl_cmd_t *ipip, void *duymmy_ifreq) 8073 { 8074 struct sioc_addrreq *sia; 8075 sin_t *sin; 8076 mblk_t *mp1; 8077 ire_t *ire = NULL; 8078 zoneid_t zoneid; 8079 ip_stack_t *ipst; 8080 8081 ip1dbg(("ip_sioctl_tonlink")); 8082 8083 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 8084 zoneid = Q_TO_CONN(q)->conn_zoneid; 8085 ipst = CONNQ_TO_IPST(q); 8086 8087 /* Existence verified in ip_wput_nondata */ 8088 mp1 = mp->b_cont->b_cont; 8089 sia = (struct sioc_addrreq *)mp1->b_rptr; 8090 sin = (sin_t *)&sia->sa_addr; 8091 8092 /* 8093 * We check for IRE_ONLINK and exclude IRE_BROADCAST|IRE_MULTICAST 8094 * to make sure we only look at on-link unicast address. 
8095 */ 8096 switch (sin->sin_family) { 8097 case AF_INET6: { 8098 sin6_t *sin6 = (sin6_t *)sin; 8099 8100 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 8101 ipaddr_t v4_addr; 8102 8103 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, 8104 v4_addr); 8105 if (!CLASSD(v4_addr)) { 8106 ire = ire_ftable_lookup_v4(v4_addr, 0, 0, 0, 8107 NULL, zoneid, NULL, MATCH_IRE_DSTONLY, 8108 0, ipst, NULL); 8109 } 8110 } else { 8111 in6_addr_t v6addr; 8112 8113 v6addr = sin6->sin6_addr; 8114 if (!IN6_IS_ADDR_MULTICAST(&v6addr)) { 8115 ire = ire_ftable_lookup_v6(&v6addr, 0, 0, 0, 8116 NULL, zoneid, NULL, MATCH_IRE_DSTONLY, 0, 8117 ipst, NULL); 8118 } 8119 } 8120 break; 8121 } 8122 case AF_INET: { 8123 ipaddr_t v4addr; 8124 8125 v4addr = sin->sin_addr.s_addr; 8126 if (!CLASSD(v4addr)) { 8127 ire = ire_ftable_lookup_v4(v4addr, 0, 0, 0, NULL, 8128 zoneid, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL); 8129 } 8130 break; 8131 } 8132 default: 8133 return (EAFNOSUPPORT); 8134 } 8135 sia->sa_res = 0; 8136 if (ire != NULL) { 8137 ASSERT(!(ire->ire_type & IRE_MULTICAST)); 8138 8139 if ((ire->ire_type & IRE_ONLINK) && 8140 !(ire->ire_type & IRE_BROADCAST)) 8141 sia->sa_res = 1; 8142 ire_refrele(ire); 8143 } 8144 return (0); 8145 } 8146 8147 /* 8148 * TBD: implement when kernel maintaines a list of site prefixes. 8149 */ 8150 /* ARGSUSED */ 8151 int 8152 ip_sioctl_tmysite(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 8153 ip_ioctl_cmd_t *ipip, void *ifreq) 8154 { 8155 return (ENXIO); 8156 } 8157 8158 /* ARP IOCTLs. 
*/
/*
 * Handle the SIOC[GSD]ARP and SIOC[GSD]XARP ioctls for the ill of `ipif'
 * and the protocol address in `sin'.  The value returned is the one left
 * in iocp->ioc_error, except for the early EINVAL, EPERM and ENOTSUP
 * returns.
 */
/* ARGSUSED */
int
ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
{
	ill_t		*ill = ipif->ipif_ill;
	ill_t		*proxy_ill = NULL;
	ipmp_arpent_t	*entp = NULL;
	ncec_t		*ncec = NULL;
	nce_t		*nce;
	boolean_t	proxyarp = B_FALSE;
	boolean_t	if_arp_ioctl = B_FALSE;
	struct iocblk	*iocp;
	conn_t		*connp;
	ip_stack_t	*ipst;
	uchar_t		*lladdr;
	ipaddr_t	ipaddr;
	int		arp_flags, flags, alength;
	int		err;

	ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
	connp = Q_TO_CONN(q);
	ipst = connp->conn_netstack->netstack_ip;
	iocp = (struct iocblk *)mp->b_rptr;

	if (ipip->ipi_cmd_type == XARP_CMD) {
		/* We have a chain - M_IOCTL-->MI_COPY_MBLK-->XARPREQ_MBLK */
		struct xarpreq	*xar =
		    (struct xarpreq *)mp->b_cont->b_cont->b_rptr;

		arp_flags = xar->xarp_flags;
		lladdr = (uchar_t *)LLADDR(&xar->xarp_ha);
		if_arp_ioctl = (xar->xarp_ha.sdl_nlen != 0);
		/*
		 * Validate against user's link layer address length
		 * input and name and addr length limits.
		 */
		alength = ill->ill_phys_addr_length;
		if (ipip->ipi_cmd == SIOCSXARP &&
		    (alength != xar->xarp_ha.sdl_alen ||
		    (alength + xar->xarp_ha.sdl_nlen >
		    sizeof (xar->xarp_ha.sdl_data))))
			return (EINVAL);
	} else {
		/* We have a chain - M_IOCTL-->MI_COPY_MBLK-->ARPREQ_MBLK */
		struct arpreq	*ar =
		    (struct arpreq *)mp->b_cont->b_cont->b_rptr;

		arp_flags = ar->arp_flags;
		lladdr = (uchar_t *)ar->arp_ha.sa_data;
		/*
		 * Theoretically, the sa_family could tell us what link
		 * layer type this operation is trying to deal with. By
		 * common usage AF_UNSPEC means ethernet. We'll assume
		 * any attempt to use the SIOC?ARP ioctls is for ethernet,
		 * for now. Our new SIOC*XARP ioctls can be used more
		 * generally.
		 *
		 * If the underlying media happens to have a non 6 byte
		 * address, arp module will fail set/get, but the del
		 * operation will succeed.
		 */
		alength = 6;
		if (ipip->ipi_cmd != SIOCDARP &&
		    alength != ill->ill_phys_addr_length)
			return (EINVAL);
	}

	/* Translate ATF* flags to NCE* flags */
	flags = 0;
	if (arp_flags & ATF_AUTHORITY)
		flags |= NCE_F_AUTHORITY;
	if (arp_flags & ATF_PERM)
		flags |= NCE_F_NONUD;	/* not subject to aging */
	if (arp_flags & ATF_PUBL)
		flags |= NCE_F_PUBLISH;

	/*
	 * IPMP ARP special handling:
	 *
	 * 1. Since ARP mappings must appear consistent across the group,
	 *    prohibit changing ARP mappings on the underlying interfaces.
	 *
	 * 2. Since ARP mappings for IPMP data addresses are maintained by
	 *    IP itself, prohibit changing them.
	 *
	 * 3. For proxy ARP, use a functioning hardware address in the group,
	 *    provided one exists.  If one doesn't, just add the entry as-is;
	 *    ipmp_illgrp_refresh_arpent() will refresh it if things change.
	 */
	if (IS_UNDER_IPMP(ill) &&
	    ipip->ipi_cmd != SIOCGARP && ipip->ipi_cmd != SIOCGXARP)
		return (EPERM);

	if (IS_IPMP(ill) &&
	    (ipip->ipi_cmd == SIOCSARP || ipip->ipi_cmd == SIOCSXARP)) {
		ipmp_illgrp_t	*illg = ill->ill_grp;

		proxy_ill = ipmp_illgrp_find_ill(illg, lladdr, alength);
		if (proxy_ill != NULL) {
			proxyarp = B_TRUE;
			if (!ipmp_ill_is_active(proxy_ill))
				proxy_ill = ipmp_illgrp_next_ill(illg);
			if (proxy_ill != NULL)
				lladdr = proxy_ill->ill_phys_addr;
		}
	}

	ipaddr = sin->sin_addr.s_addr;
	/*
	 * don't match across illgrp per case (1) and (2).
	 * XXX use IS_IPMP(ill) like ndp_sioc_update?
	 */
	nce = nce_lookup_v4(ill, &ipaddr);
	if (nce != NULL)
		ncec = nce->nce_common;

	switch (iocp->ioc_cmd) {
	case SIOCDARP:
	case SIOCDXARP:
		/*
		 * Delete the NCE if any.
		 */
		if (ncec == NULL) {
			iocp->ioc_error = ENXIO;
			break;
		}
		/* Don't allow changes to arp mappings of local addresses. */
		if (NCE_MYADDR(ncec)) {
			nce_refrele(nce);
			return (ENOTSUP);
		}
		iocp->ioc_error = 0;

		/*
		 * Delete the nce_common which has ncec_ill set to ipmp_ill.
		 * This will delete all the nce entries on the under_ills.
		 */
		ncec_delete(ncec);
		/*
		 * Once the NCE has been deleted, then the ire_dep* consistency
		 * mechanism will find any IRE which depended on the now
		 * condemned NCE (as part of sending packets).
		 * That mechanism handles redirects by deleting redirects
		 * that refer to UNREACHABLE nces.
		 */
		break;

	case SIOCGARP:
	case SIOCGXARP:
		if (ncec == NULL) {
			iocp->ioc_error = ENXIO;
			break;
		}
		/* Reply with the cached link-layer address and flags. */
		lladdr = ncec->ncec_lladdr;
		flags = ncec->ncec_flags;
		iocp->ioc_error = 0;
		ip_sioctl_garp_reply(mp, ncec->ncec_ill, lladdr, flags);
		break;

	case SIOCSARP:
	case SIOCSXARP:
		/* Don't allow changes to arp mappings of local addresses. */
		if (ncec != NULL && NCE_MYADDR(ncec)) {
			nce_refrele(nce);
			return (ENOTSUP);
		}

		/* static arp entries will undergo NUD if ATF_PERM is not set */
		flags |= NCE_F_STATIC;
		if (if_arp_ioctl) {
			/* Renamed so it no longer shadows the parameter. */
			ipif_t	*first_ipif = ipif_get_next_ipif(NULL, ill);

			if (first_ipif != NULL) {
				ip_nce_lookup_and_update(&ipaddr, first_ipif,
				    ipst, lladdr, alength, flags);
				ipif_refrele(first_ipif);
			}
		} else {
			ip_nce_lookup_and_update(&ipaddr, NULL, ipst,
			    lladdr, alength, flags);
		}

		/* Drop the lookup reference before looking up/adding anew. */
		if (nce != NULL) {
			nce_refrele(nce);
			nce = NULL;
		}
		/*
		 * NCE_F_STATIC entries will be added in state ND_REACHABLE
		 * by nce_add_common()
		 */
		err = nce_lookup_then_add_v4(ill, lladdr,
		    ill->ill_phys_addr_length, &ipaddr, flags, ND_UNCHANGED,
		    &nce);
		if (err == EEXIST) {
			/* An entry already exists: update it in place. */
			ncec = nce->nce_common;
			mutex_enter(&ncec->ncec_lock);
			ncec->ncec_state = ND_REACHABLE;
			ncec->ncec_flags = flags;
			nce_update(ncec, ND_UNCHANGED, lladdr);
			mutex_exit(&ncec->ncec_lock);
			err = 0;
		}
		if (nce != NULL) {
			nce_refrele(nce);
			nce = NULL;
		}
		if (IS_IPMP(ill) && err == 0) {
			entp = ipmp_illgrp_create_arpent(ill->ill_grp,
			    proxyarp, ipaddr, lladdr, ill->ill_phys_addr_length,
			    flags);
			if (entp == NULL) {
				iocp->ioc_error = ENOMEM;
				break;
			}
			if (proxyarp && proxy_ill == NULL) {
				iocp->ioc_error = 0;
				break;
			}
		}
		iocp->ioc_error = err;
	}

	if (nce != NULL)
		nce_refrele(nce);

	/*
	 * If we created an IPMP ARP entry, mark that we've notified ARP.
	 */
	if (entp != NULL)
		ipmp_illgrp_mark_arpent(ill->ill_grp, entp);

	return (iocp->ioc_error);
}

/*
 * Parse an [x]arpreq structure coming down SIOC[GSD][X]ARP ioctls, identify
 * the associated sin and refhold and return the associated ipif via `ci'.
8399 */ 8400 int 8401 ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, 8402 cmd_info_t *ci) 8403 { 8404 mblk_t *mp1; 8405 sin_t *sin; 8406 conn_t *connp; 8407 ipif_t *ipif; 8408 ire_t *ire = NULL; 8409 ill_t *ill = NULL; 8410 boolean_t exists; 8411 ip_stack_t *ipst; 8412 struct arpreq *ar; 8413 struct xarpreq *xar; 8414 struct sockaddr_dl *sdl; 8415 8416 /* ioctl comes down on a conn */ 8417 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 8418 connp = Q_TO_CONN(q); 8419 if (connp->conn_family == AF_INET6) 8420 return (ENXIO); 8421 8422 ipst = connp->conn_netstack->netstack_ip; 8423 8424 /* Verified in ip_wput_nondata */ 8425 mp1 = mp->b_cont->b_cont; 8426 8427 if (ipip->ipi_cmd_type == XARP_CMD) { 8428 ASSERT(MBLKL(mp1) >= sizeof (struct xarpreq)); 8429 xar = (struct xarpreq *)mp1->b_rptr; 8430 sin = (sin_t *)&xar->xarp_pa; 8431 sdl = &xar->xarp_ha; 8432 8433 if (sdl->sdl_family != AF_LINK || sin->sin_family != AF_INET) 8434 return (ENXIO); 8435 if (sdl->sdl_nlen >= LIFNAMSIZ) 8436 return (EINVAL); 8437 } else { 8438 ASSERT(ipip->ipi_cmd_type == ARP_CMD); 8439 ASSERT(MBLKL(mp1) >= sizeof (struct arpreq)); 8440 ar = (struct arpreq *)mp1->b_rptr; 8441 sin = (sin_t *)&ar->arp_pa; 8442 } 8443 8444 if (ipip->ipi_cmd_type == XARP_CMD && sdl->sdl_nlen != 0) { 8445 ipif = ipif_lookup_on_name(sdl->sdl_data, sdl->sdl_nlen, 8446 B_FALSE, &exists, B_FALSE, ALL_ZONES, ipst); 8447 if (ipif == NULL) 8448 return (ENXIO); 8449 if (ipif->ipif_id != 0) { 8450 ipif_refrele(ipif); 8451 return (ENXIO); 8452 } 8453 } else { 8454 /* 8455 * Either an SIOC[DGS]ARP or an SIOC[DGS]XARP with an sdl_nlen 8456 * of 0: use the IP address to find the ipif. If the IP 8457 * address is an IPMP test address, ire_ftable_lookup() will 8458 * find the wrong ill, so we first do an ipif_lookup_addr(). 
8459 */ 8460 ipif = ipif_lookup_addr(sin->sin_addr.s_addr, NULL, ALL_ZONES, 8461 ipst); 8462 if (ipif == NULL) { 8463 ire = ire_ftable_lookup_v4(sin->sin_addr.s_addr, 8464 0, 0, IRE_IF_RESOLVER, NULL, ALL_ZONES, 8465 NULL, MATCH_IRE_TYPE, 0, ipst, NULL); 8466 if (ire == NULL || ((ill = ire->ire_ill) == NULL)) { 8467 if (ire != NULL) 8468 ire_refrele(ire); 8469 return (ENXIO); 8470 } 8471 ASSERT(ire != NULL && ill != NULL); 8472 ipif = ill->ill_ipif; 8473 ipif_refhold(ipif); 8474 ire_refrele(ire); 8475 } 8476 } 8477 8478 if (ipif->ipif_ill->ill_net_type != IRE_IF_RESOLVER) { 8479 ipif_refrele(ipif); 8480 return (ENXIO); 8481 } 8482 8483 ci->ci_sin = sin; 8484 ci->ci_ipif = ipif; 8485 return (0); 8486 } 8487 8488 /* 8489 * Link or unlink the illgrp on IPMP meta-interface `ill' depending on the 8490 * value of `ioccmd'. While an illgrp is linked to an ipmp_grp_t, it is 8491 * accessible from that ipmp_grp_t, which means SIOCSLIFGROUPNAME can look it 8492 * up and thus an ill can join that illgrp. 8493 * 8494 * We use I_PLINK/I_PUNLINK to do the link/unlink operations rather than 8495 * open()/close() primarily because close() is not allowed to fail or block 8496 * forever. On the other hand, I_PUNLINK *can* fail, and there's no reason 8497 * why anyone should ever need to I_PUNLINK an in-use IPMP stream. To ensure 8498 * symmetric behavior (e.g., doing an I_PLINK after and I_PUNLINK undoes the 8499 * I_PUNLINK) we defer linking to I_PLINK. Separately, we also fail attempts 8500 * to I_LINK since I_UNLINK is optional and we'd end up in an inconsistent 8501 * state if I_UNLINK didn't occur. 8502 * 8503 * Note that for each plumb/unplumb operation, we may end up here more than 8504 * once because of the way ifconfig works. However, it's OK to link the same 8505 * illgrp more than once, or unlink an illgrp that's already unlinked. 
8506 */ 8507 static int 8508 ip_sioctl_plink_ipmp(ill_t *ill, int ioccmd) 8509 { 8510 int err; 8511 ip_stack_t *ipst = ill->ill_ipst; 8512 8513 ASSERT(IS_IPMP(ill)); 8514 ASSERT(IAM_WRITER_ILL(ill)); 8515 8516 switch (ioccmd) { 8517 case I_LINK: 8518 return (ENOTSUP); 8519 8520 case I_PLINK: 8521 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 8522 ipmp_illgrp_link_grp(ill->ill_grp, ill->ill_phyint->phyint_grp); 8523 rw_exit(&ipst->ips_ipmp_lock); 8524 break; 8525 8526 case I_PUNLINK: 8527 /* 8528 * Require all UP ipifs be brought down prior to unlinking the 8529 * illgrp so any associated IREs (and other state) is torched. 8530 */ 8531 if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0) 8532 return (EBUSY); 8533 8534 /* 8535 * NOTE: We hold ipmp_lock across the unlink to prevent a race 8536 * with an SIOCSLIFGROUPNAME request from an ill trying to 8537 * join this group. Specifically: ills trying to join grab 8538 * ipmp_lock and bump a "pending join" counter checked by 8539 * ipmp_illgrp_unlink_grp(). During the unlink no new pending 8540 * joins can occur (since we have ipmp_lock). Once we drop 8541 * ipmp_lock, subsequent SIOCSLIFGROUPNAME requests will not 8542 * find the illgrp (since we unlinked it) and will return 8543 * EAFNOSUPPORT. This will then take them back through the 8544 * IPMP meta-interface plumbing logic in ifconfig, and thus 8545 * back through I_PLINK above. 8546 */ 8547 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 8548 err = ipmp_illgrp_unlink_grp(ill->ill_grp); 8549 rw_exit(&ipst->ips_ipmp_lock); 8550 return (err); 8551 default: 8552 break; 8553 } 8554 return (0); 8555 } 8556 8557 /* 8558 * Do I_PLINK/I_LINK or I_PUNLINK/I_UNLINK with consistency checks and also 8559 * atomically set/clear the muxids. Also complete the ioctl by acking or 8560 * naking it. Note that the code is structured such that the link type, 8561 * whether it's persistent or not, is treated equally. 
ifconfig(1M) and 8562 * its clones use the persistent link, while pppd(1M) and perhaps many 8563 * other daemons may use non-persistent link. When combined with some 8564 * ill_t states, linking and unlinking lower streams may be used as 8565 * indicators of dynamic re-plumbing events [see PSARC/1999/348]. 8566 */ 8567 /* ARGSUSED */ 8568 void 8569 ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 8570 { 8571 mblk_t *mp1; 8572 struct linkblk *li; 8573 int ioccmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd; 8574 int err = 0; 8575 8576 ASSERT(ioccmd == I_PLINK || ioccmd == I_PUNLINK || 8577 ioccmd == I_LINK || ioccmd == I_UNLINK); 8578 8579 mp1 = mp->b_cont; /* This is the linkblk info */ 8580 li = (struct linkblk *)mp1->b_rptr; 8581 8582 err = ip_sioctl_plink_ipmod(ipsq, q, mp, ioccmd, li); 8583 if (err == EINPROGRESS) 8584 return; 8585 done: 8586 if (err == 0) 8587 miocack(q, mp, 0, 0); 8588 else 8589 miocnak(q, mp, 0, err); 8590 8591 /* Conn was refheld in ip_sioctl_copyin_setup */ 8592 if (CONN_Q(q)) 8593 CONN_OPER_PENDING_DONE(Q_TO_CONN(q)); 8594 } 8595 8596 /* 8597 * Process I_{P}LINK and I_{P}UNLINK requests named by `ioccmd' and pointed to 8598 * by `mp' and `li' for the IP module stream (if li->q_bot is in fact an IP 8599 * module stream). If `doconsist' is set, then do the extended consistency 8600 * checks requested by ifconfig(1M) and (atomically) set ill_muxid here. 8601 * Returns zero on success, EINPROGRESS if the operation is still pending, or 8602 * an error code on failure. 
8603 */ 8604 static int 8605 ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, 8606 struct linkblk *li) 8607 { 8608 int err = 0; 8609 ill_t *ill; 8610 queue_t *ipwq, *dwq; 8611 const char *name; 8612 struct qinit *qinfo; 8613 boolean_t islink = (ioccmd == I_PLINK || ioccmd == I_LINK); 8614 boolean_t entered_ipsq = B_FALSE; 8615 boolean_t is_ip = B_FALSE; 8616 arl_t *arl; 8617 8618 /* 8619 * Walk the lower stream to verify it's the IP module stream. 8620 * The IP module is identified by its name, wput function, 8621 * and non-NULL q_next. STREAMS ensures that the lower stream 8622 * (li->l_qbot) will not vanish until this ioctl completes. 8623 */ 8624 for (ipwq = li->l_qbot; ipwq != NULL; ipwq = ipwq->q_next) { 8625 qinfo = ipwq->q_qinfo; 8626 name = qinfo->qi_minfo->mi_idname; 8627 if (name != NULL && strcmp(name, ip_mod_info.mi_idname) == 0 && 8628 qinfo->qi_putp != (pfi_t)ip_lwput && ipwq->q_next != NULL) { 8629 is_ip = B_TRUE; 8630 break; 8631 } 8632 if (name != NULL && strcmp(name, arp_mod_info.mi_idname) == 0 && 8633 qinfo->qi_putp != (pfi_t)ip_lwput && ipwq->q_next != NULL) { 8634 break; 8635 } 8636 } 8637 8638 /* 8639 * If this isn't an IP module stream, bail. 8640 */ 8641 if (ipwq == NULL) 8642 return (0); 8643 8644 if (!is_ip) { 8645 arl = (arl_t *)ipwq->q_ptr; 8646 ill = arl_to_ill(arl); 8647 if (ill == NULL) 8648 return (0); 8649 } else { 8650 ill = ipwq->q_ptr; 8651 } 8652 ASSERT(ill != NULL); 8653 8654 if (ipsq == NULL) { 8655 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink, 8656 NEW_OP, B_FALSE); 8657 if (ipsq == NULL) { 8658 if (!is_ip) 8659 ill_refrele(ill); 8660 return (EINPROGRESS); 8661 } 8662 entered_ipsq = B_TRUE; 8663 } 8664 ASSERT(IAM_WRITER_ILL(ill)); 8665 mutex_enter(&ill->ill_lock); 8666 if (!is_ip) { 8667 if (islink && ill->ill_muxid == 0) { 8668 /* 8669 * Plumbing has to be done with IP plumbed first, arp 8670 * second, but here we have arp being plumbed first. 
8671 */ 8672 mutex_exit(&ill->ill_lock); 8673 ipsq_exit(ipsq); 8674 ill_refrele(ill); 8675 return (EINVAL); 8676 } 8677 } 8678 mutex_exit(&ill->ill_lock); 8679 if (!is_ip) { 8680 arl->arl_muxid = islink ? li->l_index : 0; 8681 ill_refrele(ill); 8682 goto done; 8683 } 8684 8685 if (IS_IPMP(ill) && (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0) 8686 goto done; 8687 8688 /* 8689 * As part of I_{P}LINKing, stash the number of downstream modules and 8690 * the read queue of the module immediately below IP in the ill. 8691 * These are used during the capability negotiation below. 8692 */ 8693 ill->ill_lmod_rq = NULL; 8694 ill->ill_lmod_cnt = 0; 8695 if (islink && ((dwq = ipwq->q_next) != NULL)) { 8696 ill->ill_lmod_rq = RD(dwq); 8697 for (; dwq != NULL; dwq = dwq->q_next) 8698 ill->ill_lmod_cnt++; 8699 } 8700 8701 ill->ill_muxid = islink ? li->l_index : 0; 8702 8703 /* 8704 * Mark the ipsq busy until the capability operations initiated below 8705 * complete. The PLINK/UNLINK ioctl itself completes when our caller 8706 * returns, but the capability operation may complete asynchronously 8707 * much later. 8708 */ 8709 ipsq_current_start(ipsq, ill->ill_ipif, ioccmd); 8710 /* 8711 * If there's at least one up ipif on this ill, then we're bound to 8712 * the underlying driver via DLPI. In that case, renegotiate 8713 * capabilities to account for any possible change in modules 8714 * interposed between IP and the driver. 8715 */ 8716 if (ill->ill_ipif_up_count > 0) { 8717 if (islink) 8718 ill_capability_probe(ill); 8719 else 8720 ill_capability_reset(ill, B_FALSE); 8721 } 8722 ipsq_current_finish(ipsq); 8723 done: 8724 if (entered_ipsq) 8725 ipsq_exit(ipsq); 8726 8727 return (err); 8728 } 8729 8730 /* 8731 * Search the ioctl command in the ioctl tables and return a pointer 8732 * to the ioctl command information. The ioctl command tables are 8733 * static and fully populated at compile time. 
8734 */ 8735 ip_ioctl_cmd_t * 8736 ip_sioctl_lookup(int ioc_cmd) 8737 { 8738 int index; 8739 ip_ioctl_cmd_t *ipip; 8740 ip_ioctl_cmd_t *ipip_end; 8741 8742 if (ioc_cmd == IPI_DONTCARE) 8743 return (NULL); 8744 8745 /* 8746 * Do a 2 step search. First search the indexed table 8747 * based on the least significant byte of the ioctl cmd. 8748 * If we don't find a match, then search the misc table 8749 * serially. 8750 */ 8751 index = ioc_cmd & 0xFF; 8752 if (index < ip_ndx_ioctl_count) { 8753 ipip = &ip_ndx_ioctl_table[index]; 8754 if (ipip->ipi_cmd == ioc_cmd) { 8755 /* Found a match in the ndx table */ 8756 return (ipip); 8757 } 8758 } 8759 8760 /* Search the misc table */ 8761 ipip_end = &ip_misc_ioctl_table[ip_misc_ioctl_count]; 8762 for (ipip = ip_misc_ioctl_table; ipip < ipip_end; ipip++) { 8763 if (ipip->ipi_cmd == ioc_cmd) 8764 /* Found a match in the misc table */ 8765 return (ipip); 8766 } 8767 8768 return (NULL); 8769 } 8770 8771 /* 8772 * Wrapper function for resuming deferred ioctl processing 8773 * Used for SIOCGDSTINFO, SIOCGIP6ADDRPOLICY, SIOCGMSFILTER, 8774 * SIOCSMSFILTER, SIOCGIPMSFILTER, and SIOCSIPMSFILTER currently. 8775 */ 8776 /* ARGSUSED */ 8777 void 8778 ip_sioctl_copyin_resume(ipsq_t *dummy_ipsq, queue_t *q, mblk_t *mp, 8779 void *dummy_arg) 8780 { 8781 ip_sioctl_copyin_setup(q, mp); 8782 } 8783 8784 /* 8785 * ip_sioctl_copyin_setup is called by ip_wput_nondata with any M_IOCTL message 8786 * that arrives. Most of the IOCTLs are "socket" IOCTLs which we handle 8787 * in either I_STR or TRANSPARENT form, using the mi_copy facility. 8788 * We establish here the size of the block to be copied in. mi_copyin 8789 * arranges for this to happen, an processing continues in ip_wput_nondata with 8790 * an M_IOCDATA message. 
8791 */ 8792 void 8793 ip_sioctl_copyin_setup(queue_t *q, mblk_t *mp) 8794 { 8795 int copyin_size; 8796 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8797 ip_ioctl_cmd_t *ipip; 8798 cred_t *cr; 8799 ip_stack_t *ipst; 8800 8801 if (CONN_Q(q)) 8802 ipst = CONNQ_TO_IPST(q); 8803 else 8804 ipst = ILLQ_TO_IPST(q); 8805 8806 ipip = ip_sioctl_lookup(iocp->ioc_cmd); 8807 if (ipip == NULL) { 8808 /* 8809 * The ioctl is not one we understand or own. 8810 * Pass it along to be processed down stream, 8811 * if this is a module instance of IP, else nak 8812 * the ioctl. 8813 */ 8814 if (q->q_next == NULL) { 8815 goto nak; 8816 } else { 8817 putnext(q, mp); 8818 return; 8819 } 8820 } 8821 8822 /* 8823 * If this is deferred, then we will do all the checks when we 8824 * come back. 8825 */ 8826 if ((iocp->ioc_cmd == SIOCGDSTINFO || 8827 iocp->ioc_cmd == SIOCGIP6ADDRPOLICY) && !ip6_asp_can_lookup(ipst)) { 8828 ip6_asp_pending_op(q, mp, ip_sioctl_copyin_resume); 8829 return; 8830 } 8831 8832 /* 8833 * Only allow a very small subset of IP ioctls on this stream if 8834 * IP is a module and not a driver. Allowing ioctls to be processed 8835 * in this case may cause assert failures or data corruption. 8836 * Typically G[L]IFFLAGS, SLIFNAME/IF_UNITSEL are the only few 8837 * ioctls allowed on an IP module stream, after which this stream 8838 * normally becomes a multiplexor (at which time the stream head 8839 * will fail all ioctls). 8840 */ 8841 if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) { 8842 goto nak; 8843 } 8844 8845 /* Make sure we have ioctl data to process. */ 8846 if (mp->b_cont == NULL && !(ipip->ipi_flags & IPI_NULL_BCONT)) 8847 goto nak; 8848 8849 /* 8850 * Prefer dblk credential over ioctl credential; some synthesized 8851 * ioctls have kcred set because there's no way to crhold() 8852 * a credential in some contexts. (ioc_cr is not crfree() by 8853 * the framework; the caller of ioctl needs to hold the reference 8854 * for the duration of the call). 
8855 */ 8856 cr = msg_getcred(mp, NULL); 8857 if (cr == NULL) 8858 cr = iocp->ioc_cr; 8859 8860 /* Make sure normal users don't send down privileged ioctls */ 8861 if ((ipip->ipi_flags & IPI_PRIV) && 8862 (cr != NULL) && secpolicy_ip_config(cr, B_TRUE) != 0) { 8863 /* We checked the privilege earlier but log it here */ 8864 miocnak(q, mp, 0, secpolicy_ip_config(cr, B_FALSE)); 8865 return; 8866 } 8867 8868 /* 8869 * The ioctl command tables can only encode fixed length 8870 * ioctl data. If the length is variable, the table will 8871 * encode the length as zero. Such special cases are handled 8872 * below in the switch. 8873 */ 8874 if (ipip->ipi_copyin_size != 0) { 8875 mi_copyin(q, mp, NULL, ipip->ipi_copyin_size); 8876 return; 8877 } 8878 8879 switch (iocp->ioc_cmd) { 8880 case O_SIOCGIFCONF: 8881 case SIOCGIFCONF: 8882 /* 8883 * This IOCTL is hilarious. See comments in 8884 * ip_sioctl_get_ifconf for the story. 8885 */ 8886 if (iocp->ioc_count == TRANSPARENT) 8887 copyin_size = SIZEOF_STRUCT(ifconf, 8888 iocp->ioc_flag); 8889 else 8890 copyin_size = iocp->ioc_count; 8891 mi_copyin(q, mp, NULL, copyin_size); 8892 return; 8893 8894 case O_SIOCGLIFCONF: 8895 case SIOCGLIFCONF: 8896 copyin_size = SIZEOF_STRUCT(lifconf, iocp->ioc_flag); 8897 mi_copyin(q, mp, NULL, copyin_size); 8898 return; 8899 8900 case SIOCGLIFSRCOF: 8901 copyin_size = SIZEOF_STRUCT(lifsrcof, iocp->ioc_flag); 8902 mi_copyin(q, mp, NULL, copyin_size); 8903 return; 8904 case SIOCGIP6ADDRPOLICY: 8905 ip_sioctl_ip6addrpolicy(q, mp); 8906 ip6_asp_table_refrele(ipst); 8907 return; 8908 8909 case SIOCSIP6ADDRPOLICY: 8910 ip_sioctl_ip6addrpolicy(q, mp); 8911 return; 8912 8913 case SIOCGDSTINFO: 8914 ip_sioctl_dstinfo(q, mp); 8915 ip6_asp_table_refrele(ipst); 8916 return; 8917 8918 case I_PLINK: 8919 case I_PUNLINK: 8920 case I_LINK: 8921 case I_UNLINK: 8922 /* 8923 * We treat non-persistent link similarly as the persistent 8924 * link case, in terms of plumbing/unplumbing, as well as 8925 * dynamic 
re-plumbing events indicator. See comments 8926 * in ip_sioctl_plink() for more. 8927 * 8928 * Request can be enqueued in the 'ipsq' while waiting 8929 * to become exclusive. So bump up the conn ref. 8930 */ 8931 if (CONN_Q(q)) 8932 CONN_INC_REF(Q_TO_CONN(q)); 8933 ip_sioctl_plink(NULL, q, mp, NULL); 8934 return; 8935 8936 case ND_GET: 8937 case ND_SET: 8938 /* 8939 * Use of the nd table requires holding the reader lock. 8940 * Modifying the nd table thru nd_load/nd_unload requires 8941 * the writer lock. 8942 */ 8943 rw_enter(&ipst->ips_ip_g_nd_lock, RW_READER); 8944 if (nd_getset(q, ipst->ips_ip_g_nd, mp)) { 8945 rw_exit(&ipst->ips_ip_g_nd_lock); 8946 8947 if (iocp->ioc_error) 8948 iocp->ioc_count = 0; 8949 mp->b_datap->db_type = M_IOCACK; 8950 qreply(q, mp); 8951 return; 8952 } 8953 rw_exit(&ipst->ips_ip_g_nd_lock); 8954 /* 8955 * We don't understand this subioctl of ND_GET / ND_SET. 8956 * Maybe intended for some driver / module below us 8957 */ 8958 if (q->q_next) { 8959 putnext(q, mp); 8960 } else { 8961 iocp->ioc_error = ENOENT; 8962 mp->b_datap->db_type = M_IOCNAK; 8963 iocp->ioc_count = 0; 8964 qreply(q, mp); 8965 } 8966 return; 8967 8968 case IP_IOCTL: 8969 ip_wput_ioctl(q, mp); 8970 return; 8971 8972 case SIOCILB: 8973 /* The ioctl length varies depending on the ILB command. 
*/ 8974 copyin_size = iocp->ioc_count; 8975 if (copyin_size < sizeof (ilb_cmd_t)) 8976 goto nak; 8977 mi_copyin(q, mp, NULL, copyin_size); 8978 return; 8979 8980 default: 8981 cmn_err(CE_PANIC, "should not happen "); 8982 } 8983 nak: 8984 if (mp->b_cont != NULL) { 8985 freemsg(mp->b_cont); 8986 mp->b_cont = NULL; 8987 } 8988 iocp->ioc_error = EINVAL; 8989 mp->b_datap->db_type = M_IOCNAK; 8990 iocp->ioc_count = 0; 8991 qreply(q, mp); 8992 } 8993 8994 static void 8995 ip_sioctl_garp_reply(mblk_t *mp, ill_t *ill, void *hwaddr, int flags) 8996 { 8997 struct arpreq *ar; 8998 struct xarpreq *xar; 8999 mblk_t *tmp; 9000 struct iocblk *iocp; 9001 int x_arp_ioctl = B_FALSE; 9002 int *flagsp; 9003 char *storage = NULL; 9004 9005 ASSERT(ill != NULL); 9006 9007 iocp = (struct iocblk *)mp->b_rptr; 9008 ASSERT(iocp->ioc_cmd == SIOCGXARP || iocp->ioc_cmd == SIOCGARP); 9009 9010 tmp = (mp->b_cont)->b_cont; /* xarpreq/arpreq */ 9011 if ((iocp->ioc_cmd == SIOCGXARP) || 9012 (iocp->ioc_cmd == SIOCSXARP)) { 9013 x_arp_ioctl = B_TRUE; 9014 xar = (struct xarpreq *)tmp->b_rptr; 9015 flagsp = &xar->xarp_flags; 9016 storage = xar->xarp_ha.sdl_data; 9017 } else { 9018 ar = (struct arpreq *)tmp->b_rptr; 9019 flagsp = &ar->arp_flags; 9020 storage = ar->arp_ha.sa_data; 9021 } 9022 9023 /* 9024 * We're done if this is not an SIOCG{X}ARP 9025 */ 9026 if (x_arp_ioctl) { 9027 storage += ill_xarp_info(&xar->xarp_ha, ill); 9028 if ((ill->ill_phys_addr_length + ill->ill_name_length) > 9029 sizeof (xar->xarp_ha.sdl_data)) { 9030 iocp->ioc_error = EINVAL; 9031 return; 9032 } 9033 } 9034 *flagsp = ATF_INUSE; 9035 /* 9036 * If /sbin/arp told us we are the authority using the "permanent" 9037 * flag, or if this is one of my addresses print "permanent" 9038 * in the /sbin/arp output. 
9039 */ 9040 if ((flags & NCE_F_MYADDR) || (flags & NCE_F_AUTHORITY)) 9041 *flagsp |= ATF_AUTHORITY; 9042 if (flags & NCE_F_NONUD) 9043 *flagsp |= ATF_PERM; /* not subject to aging */ 9044 if (flags & NCE_F_PUBLISH) 9045 *flagsp |= ATF_PUBL; 9046 if (hwaddr != NULL) { 9047 *flagsp |= ATF_COM; 9048 bcopy((char *)hwaddr, storage, ill->ill_phys_addr_length); 9049 } 9050 } 9051 9052 /* 9053 * Create a new logical interface. If ipif_id is zero (i.e. not a logical 9054 * interface) create the next available logical interface for this 9055 * physical interface. 9056 * If ipif is NULL (i.e. the lookup didn't find one) attempt to create an 9057 * ipif with the specified name. 9058 * 9059 * If the address family is not AF_UNSPEC then set the address as well. 9060 * 9061 * If ip_sioctl_addr returns EINPROGRESS then the ioctl (the copyout) 9062 * is completed when the DL_BIND_ACK arrive in ip_rput_dlpi_writer. 9063 * 9064 * Executed as a writer on the ill. 9065 * So no lock is needed to traverse the ipif chain, or examine the 9066 * phyint flags. 9067 */ 9068 /* ARGSUSED */ 9069 int 9070 ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9071 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 9072 { 9073 mblk_t *mp1; 9074 struct lifreq *lifr; 9075 boolean_t isv6; 9076 boolean_t exists; 9077 char *name; 9078 char *endp; 9079 char *cp; 9080 int namelen; 9081 ipif_t *ipif; 9082 long id; 9083 ipsq_t *ipsq; 9084 ill_t *ill; 9085 sin_t *sin; 9086 int err = 0; 9087 boolean_t found_sep = B_FALSE; 9088 conn_t *connp; 9089 zoneid_t zoneid; 9090 ip_stack_t *ipst = CONNQ_TO_IPST(q); 9091 9092 ASSERT(q->q_next == NULL); 9093 ip1dbg(("ip_sioctl_addif\n")); 9094 /* Existence of mp1 has been checked in ip_wput_nondata */ 9095 mp1 = mp->b_cont->b_cont; 9096 /* 9097 * Null terminate the string to protect against buffer 9098 * overrun. String was generated by user code and may not 9099 * be trusted. 
9100 */ 9101 lifr = (struct lifreq *)mp1->b_rptr; 9102 lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; 9103 name = lifr->lifr_name; 9104 ASSERT(CONN_Q(q)); 9105 connp = Q_TO_CONN(q); 9106 isv6 = (connp->conn_family == AF_INET6); 9107 zoneid = connp->conn_zoneid; 9108 namelen = mi_strlen(name); 9109 if (namelen == 0) 9110 return (EINVAL); 9111 9112 exists = B_FALSE; 9113 if ((namelen + 1 == sizeof (ipif_loopback_name)) && 9114 (mi_strcmp(name, ipif_loopback_name) == 0)) { 9115 /* 9116 * Allow creating lo0 using SIOCLIFADDIF. 9117 * can't be any other writer thread. So can pass null below 9118 * for the last 4 args to ipif_lookup_name. 9119 */ 9120 ipif = ipif_lookup_on_name(lifr->lifr_name, namelen, B_TRUE, 9121 &exists, isv6, zoneid, ipst); 9122 /* Prevent any further action */ 9123 if (ipif == NULL) { 9124 return (ENOBUFS); 9125 } else if (!exists) { 9126 /* We created the ipif now and as writer */ 9127 ipif_refrele(ipif); 9128 return (0); 9129 } else { 9130 ill = ipif->ipif_ill; 9131 ill_refhold(ill); 9132 ipif_refrele(ipif); 9133 } 9134 } else { 9135 /* Look for a colon in the name. */ 9136 endp = &name[namelen]; 9137 for (cp = endp; --cp > name; ) { 9138 if (*cp == IPIF_SEPARATOR_CHAR) { 9139 found_sep = B_TRUE; 9140 /* 9141 * Reject any non-decimal aliases for plumbing 9142 * of logical interfaces. Aliases with leading 9143 * zeroes are also rejected as they introduce 9144 * ambiguity in the naming of the interfaces. 9145 * Comparing with "0" takes care of all such 9146 * cases. 
9147 */ 9148 if ((strncmp("0", cp+1, 1)) == 0) 9149 return (EINVAL); 9150 9151 if (ddi_strtol(cp+1, &endp, 10, &id) != 0 || 9152 id <= 0 || *endp != '\0') { 9153 return (EINVAL); 9154 } 9155 *cp = '\0'; 9156 break; 9157 } 9158 } 9159 ill = ill_lookup_on_name(name, B_FALSE, isv6, NULL, ipst); 9160 if (found_sep) 9161 *cp = IPIF_SEPARATOR_CHAR; 9162 if (ill == NULL) 9163 return (ENXIO); 9164 } 9165 9166 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_process_ioctl, NEW_OP, 9167 B_TRUE); 9168 9169 /* 9170 * Release the refhold due to the lookup, now that we are excl 9171 * or we are just returning 9172 */ 9173 ill_refrele(ill); 9174 9175 if (ipsq == NULL) 9176 return (EINPROGRESS); 9177 9178 /* We are now exclusive on the IPSQ */ 9179 ASSERT(IAM_WRITER_ILL(ill)); 9180 9181 if (found_sep) { 9182 /* Now see if there is an IPIF with this unit number. */ 9183 for (ipif = ill->ill_ipif; ipif != NULL; 9184 ipif = ipif->ipif_next) { 9185 if (ipif->ipif_id == id) { 9186 err = EEXIST; 9187 goto done; 9188 } 9189 } 9190 } 9191 9192 /* 9193 * We use IRE_LOCAL for lo0:1 etc. for "receive only" use 9194 * of lo0. Plumbing for lo0:0 happens in ipif_lookup_on_name() 9195 * instead. 9196 */ 9197 if ((ipif = ipif_allocate(ill, found_sep ? id : -1, IRE_LOCAL, 9198 B_TRUE, B_TRUE, &err)) == NULL) { 9199 goto done; 9200 } 9201 9202 /* Return created name with ioctl */ 9203 (void) sprintf(lifr->lifr_name, "%s%c%d", ill->ill_name, 9204 IPIF_SEPARATOR_CHAR, ipif->ipif_id); 9205 ip1dbg(("created %s\n", lifr->lifr_name)); 9206 9207 /* Set address */ 9208 sin = (sin_t *)&lifr->lifr_addr; 9209 if (sin->sin_family != AF_UNSPEC) { 9210 err = ip_sioctl_addr(ipif, sin, q, mp, 9211 &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], lifr); 9212 } 9213 9214 done: 9215 ipsq_exit(ipsq); 9216 return (err); 9217 } 9218 9219 /* 9220 * Remove an existing logical interface. If ipif_id is zero (i.e. not a logical 9221 * interface) delete it based on the IP address (on this physical interface). 
9222 * Otherwise delete it based on the ipif_id. 9223 * Also, special handling to allow a removeif of lo0. 9224 */ 9225 /* ARGSUSED */ 9226 int 9227 ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9228 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 9229 { 9230 conn_t *connp; 9231 ill_t *ill = ipif->ipif_ill; 9232 boolean_t success; 9233 ip_stack_t *ipst; 9234 9235 ipst = CONNQ_TO_IPST(q); 9236 9237 ASSERT(q->q_next == NULL); 9238 ip1dbg(("ip_sioctl_remove_if(%s:%u %p)\n", 9239 ill->ill_name, ipif->ipif_id, (void *)ipif)); 9240 ASSERT(IAM_WRITER_IPIF(ipif)); 9241 9242 connp = Q_TO_CONN(q); 9243 /* 9244 * Special case for unplumbing lo0 (the loopback physical interface). 9245 * If unplumbing lo0, the incoming address structure has been 9246 * initialized to all zeros. When unplumbing lo0, all its logical 9247 * interfaces must be removed too. 9248 * 9249 * Note that this interface may be called to remove a specific 9250 * loopback logical interface (eg, lo0:1). But in that case 9251 * ipif->ipif_id != 0 so that the code path for that case is the 9252 * same as any other interface (meaning it skips the code directly 9253 * below). 9254 */ 9255 if (ipif->ipif_id == 0 && ill->ill_net_type == IRE_LOOPBACK) { 9256 if (sin->sin_family == AF_UNSPEC && 9257 (IN6_IS_ADDR_UNSPECIFIED(&((sin6_t *)sin)->sin6_addr))) { 9258 /* 9259 * Mark it condemned. No new ref. will be made to ill. 
9260 */ 9261 mutex_enter(&ill->ill_lock); 9262 ill->ill_state_flags |= ILL_CONDEMNED; 9263 for (ipif = ill->ill_ipif; ipif != NULL; 9264 ipif = ipif->ipif_next) { 9265 ipif->ipif_state_flags |= IPIF_CONDEMNED; 9266 } 9267 mutex_exit(&ill->ill_lock); 9268 9269 ipif = ill->ill_ipif; 9270 /* unplumb the loopback interface */ 9271 ill_delete(ill); 9272 mutex_enter(&connp->conn_lock); 9273 mutex_enter(&ill->ill_lock); 9274 9275 /* Are any references to this ill active */ 9276 if (ill_is_freeable(ill)) { 9277 mutex_exit(&ill->ill_lock); 9278 mutex_exit(&connp->conn_lock); 9279 ill_delete_tail(ill); 9280 mi_free(ill); 9281 return (0); 9282 } 9283 success = ipsq_pending_mp_add(connp, ipif, 9284 CONNP_TO_WQ(connp), mp, ILL_FREE); 9285 mutex_exit(&connp->conn_lock); 9286 mutex_exit(&ill->ill_lock); 9287 if (success) 9288 return (EINPROGRESS); 9289 else 9290 return (EINTR); 9291 } 9292 } 9293 9294 if (ipif->ipif_id == 0) { 9295 ipsq_t *ipsq; 9296 9297 /* Find based on address */ 9298 if (ipif->ipif_isv6) { 9299 sin6_t *sin6; 9300 9301 if (sin->sin_family != AF_INET6) 9302 return (EAFNOSUPPORT); 9303 9304 sin6 = (sin6_t *)sin; 9305 /* We are a writer, so we should be able to lookup */ 9306 ipif = ipif_lookup_addr_exact_v6(&sin6->sin6_addr, ill, 9307 ipst); 9308 } else { 9309 if (sin->sin_family != AF_INET) 9310 return (EAFNOSUPPORT); 9311 9312 /* We are a writer, so we should be able to lookup */ 9313 ipif = ipif_lookup_addr_exact(sin->sin_addr.s_addr, ill, 9314 ipst); 9315 } 9316 if (ipif == NULL) { 9317 return (EADDRNOTAVAIL); 9318 } 9319 9320 /* 9321 * It is possible for a user to send an SIOCLIFREMOVEIF with 9322 * lifr_name of the physical interface but with an ip address 9323 * lifr_addr of a logical interface plumbed over it. 9324 * So update ipx_current_ipif now that ipif points to the 9325 * correct one. 
9326 */ 9327 ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq; 9328 ipsq->ipsq_xop->ipx_current_ipif = ipif; 9329 9330 /* This is a writer */ 9331 ipif_refrele(ipif); 9332 } 9333 9334 /* 9335 * Can not delete instance zero since it is tied to the ill. 9336 */ 9337 if (ipif->ipif_id == 0) 9338 return (EBUSY); 9339 9340 mutex_enter(&ill->ill_lock); 9341 ipif->ipif_state_flags |= IPIF_CONDEMNED; 9342 mutex_exit(&ill->ill_lock); 9343 9344 ipif_free(ipif); 9345 9346 mutex_enter(&connp->conn_lock); 9347 mutex_enter(&ill->ill_lock); 9348 9349 /* Are any references to this ipif active */ 9350 if (ipif_is_freeable(ipif)) { 9351 mutex_exit(&ill->ill_lock); 9352 mutex_exit(&connp->conn_lock); 9353 ipif_non_duplicate(ipif); 9354 (void) ipif_down_tail(ipif); 9355 ipif_free_tail(ipif); /* frees ipif */ 9356 return (0); 9357 } 9358 success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), mp, 9359 IPIF_FREE); 9360 mutex_exit(&ill->ill_lock); 9361 mutex_exit(&connp->conn_lock); 9362 if (success) 9363 return (EINPROGRESS); 9364 else 9365 return (EINTR); 9366 } 9367 9368 /* 9369 * Restart the removeif ioctl. The refcnt has gone down to 0. 9370 * The ipif is already condemned. So can't find it thru lookups. 
9371 */ 9372 /* ARGSUSED */ 9373 int 9374 ip_sioctl_removeif_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, 9375 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_if_req) 9376 { 9377 ill_t *ill = ipif->ipif_ill; 9378 9379 ASSERT(IAM_WRITER_IPIF(ipif)); 9380 ASSERT(ipif->ipif_state_flags & IPIF_CONDEMNED); 9381 9382 ip1dbg(("ip_sioctl_removeif_restart(%s:%u %p)\n", 9383 ill->ill_name, ipif->ipif_id, (void *)ipif)); 9384 9385 if (ipif->ipif_id == 0 && ill->ill_net_type == IRE_LOOPBACK) { 9386 ASSERT(ill->ill_state_flags & ILL_CONDEMNED); 9387 ill_delete_tail(ill); 9388 mi_free(ill); 9389 return (0); 9390 } 9391 9392 ipif_non_duplicate(ipif); 9393 (void) ipif_down_tail(ipif); 9394 ipif_free_tail(ipif); 9395 9396 return (0); 9397 } 9398 9399 /* 9400 * Set the local interface address. 9401 * Allow an address of all zero when the interface is down. 9402 */ 9403 /* ARGSUSED */ 9404 int 9405 ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9406 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 9407 { 9408 int err = 0; 9409 in6_addr_t v6addr; 9410 boolean_t need_up = B_FALSE; 9411 9412 ip1dbg(("ip_sioctl_addr(%s:%u %p)\n", 9413 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9414 9415 ASSERT(IAM_WRITER_IPIF(ipif)); 9416 9417 if (ipif->ipif_isv6) { 9418 sin6_t *sin6; 9419 ill_t *ill; 9420 phyint_t *phyi; 9421 9422 if (sin->sin_family != AF_INET6) 9423 return (EAFNOSUPPORT); 9424 9425 sin6 = (sin6_t *)sin; 9426 v6addr = sin6->sin6_addr; 9427 ill = ipif->ipif_ill; 9428 phyi = ill->ill_phyint; 9429 9430 /* 9431 * Enforce that true multicast interfaces have a link-local 9432 * address for logical unit 0. 
9433 */ 9434 if (ipif->ipif_id == 0 && 9435 (ill->ill_flags & ILLF_MULTICAST) && 9436 !(ipif->ipif_flags & (IPIF_POINTOPOINT)) && 9437 !(phyi->phyint_flags & (PHYI_LOOPBACK)) && 9438 !IN6_IS_ADDR_LINKLOCAL(&v6addr)) { 9439 return (EADDRNOTAVAIL); 9440 } 9441 9442 /* 9443 * up interfaces shouldn't have the unspecified address 9444 * unless they also have the IPIF_NOLOCAL flags set and 9445 * have a subnet assigned. 9446 */ 9447 if ((ipif->ipif_flags & IPIF_UP) && 9448 IN6_IS_ADDR_UNSPECIFIED(&v6addr) && 9449 (!(ipif->ipif_flags & IPIF_NOLOCAL) || 9450 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) { 9451 return (EADDRNOTAVAIL); 9452 } 9453 9454 if (!ip_local_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask)) 9455 return (EADDRNOTAVAIL); 9456 } else { 9457 ipaddr_t addr; 9458 9459 if (sin->sin_family != AF_INET) 9460 return (EAFNOSUPPORT); 9461 9462 addr = sin->sin_addr.s_addr; 9463 9464 /* Allow 0 as the local address. */ 9465 if (addr != 0 && !ip_addr_ok_v4(addr, ipif->ipif_net_mask)) 9466 return (EADDRNOTAVAIL); 9467 9468 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 9469 } 9470 9471 /* 9472 * Even if there is no change we redo things just to rerun 9473 * ipif_set_default. 9474 */ 9475 if (ipif->ipif_flags & IPIF_UP) { 9476 /* 9477 * Setting a new local address, make sure 9478 * we have net and subnet bcast ire's for 9479 * the old address if we need them. 9480 */ 9481 /* 9482 * If the interface is already marked up, 9483 * we call ipif_down which will take care 9484 * of ditching any IREs that have been set 9485 * up based on the old interface address. 
9486 */ 9487 err = ipif_logical_down(ipif, q, mp); 9488 if (err == EINPROGRESS) 9489 return (err); 9490 (void) ipif_down_tail(ipif); 9491 need_up = 1; 9492 } 9493 9494 err = ip_sioctl_addr_tail(ipif, sin, q, mp, need_up); 9495 return (err); 9496 } 9497 9498 int 9499 ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9500 boolean_t need_up) 9501 { 9502 in6_addr_t v6addr; 9503 in6_addr_t ov6addr; 9504 ipaddr_t addr; 9505 sin6_t *sin6; 9506 int sinlen; 9507 int err = 0; 9508 ill_t *ill = ipif->ipif_ill; 9509 boolean_t need_dl_down; 9510 boolean_t need_arp_down; 9511 struct iocblk *iocp; 9512 9513 iocp = (mp != NULL) ? (struct iocblk *)mp->b_rptr : NULL; 9514 9515 ip1dbg(("ip_sioctl_addr_tail(%s:%u %p)\n", 9516 ill->ill_name, ipif->ipif_id, (void *)ipif)); 9517 ASSERT(IAM_WRITER_IPIF(ipif)); 9518 9519 /* Must cancel any pending timer before taking the ill_lock */ 9520 if (ipif->ipif_recovery_id != 0) 9521 (void) untimeout(ipif->ipif_recovery_id); 9522 ipif->ipif_recovery_id = 0; 9523 9524 if (ipif->ipif_isv6) { 9525 sin6 = (sin6_t *)sin; 9526 v6addr = sin6->sin6_addr; 9527 sinlen = sizeof (struct sockaddr_in6); 9528 } else { 9529 addr = sin->sin_addr.s_addr; 9530 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 9531 sinlen = sizeof (struct sockaddr_in); 9532 } 9533 mutex_enter(&ill->ill_lock); 9534 ov6addr = ipif->ipif_v6lcl_addr; 9535 ipif->ipif_v6lcl_addr = v6addr; 9536 sctp_update_ipif_addr(ipif, ov6addr); 9537 ipif->ipif_addr_ready = 0; 9538 9539 ip_rts_newaddrmsg(RTM_CHGADDR, 0, ipif, RTSQ_DEFAULT); 9540 9541 /* 9542 * If the interface was previously marked as a duplicate, then since 9543 * we've now got a "new" address, it should no longer be considered a 9544 * duplicate -- even if the "new" address is the same as the old one. 9545 * Note that if all ipifs are down, we may have a pending ARP down 9546 * event to handle. 
This is because we want to recover from duplicates 9547 * and thus delay tearing down ARP until the duplicates have been 9548 * removed or disabled. 9549 */ 9550 need_dl_down = need_arp_down = B_FALSE; 9551 if (ipif->ipif_flags & IPIF_DUPLICATE) { 9552 need_arp_down = !need_up; 9553 ipif->ipif_flags &= ~IPIF_DUPLICATE; 9554 if (--ill->ill_ipif_dup_count == 0 && !need_up && 9555 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) { 9556 need_dl_down = B_TRUE; 9557 } 9558 } 9559 9560 ipif_set_default(ipif); 9561 9562 /* 9563 * If we've just manually set the IPv6 link-local address (0th ipif), 9564 * tag the ill so that future updates to the interface ID don't result 9565 * in this address getting automatically reconfigured from under the 9566 * administrator. 9567 */ 9568 if (ipif->ipif_isv6 && ipif->ipif_id == 0) 9569 ill->ill_manual_linklocal = 1; 9570 9571 /* 9572 * When publishing an interface address change event, we only notify 9573 * the event listeners of the new address. It is assumed that if they 9574 * actively care about the addresses assigned that they will have 9575 * already discovered the previous address assigned (if there was one.) 9576 * 9577 * Don't attach nic event message for SIOCLIFADDIF ioctl. 9578 */ 9579 if (iocp != NULL && iocp->ioc_cmd != SIOCLIFADDIF) { 9580 ill_nic_event_dispatch(ill, MAP_IPIF_ID(ipif->ipif_id), 9581 NE_ADDRESS_CHANGE, sin, sinlen); 9582 } 9583 9584 mutex_exit(&ill->ill_lock); 9585 9586 if (need_up) { 9587 /* 9588 * Now bring the interface back up. If this 9589 * is the only IPIF for the ILL, ipif_up 9590 * will have to re-bind to the device, so 9591 * we may get back EINPROGRESS, in which 9592 * case, this IOCTL will get completed in 9593 * ip_rput_dlpi when we see the DL_BIND_ACK. 
9594 */ 9595 err = ipif_up(ipif, q, mp); 9596 } else { 9597 /* Perhaps ilgs should use this ill */ 9598 update_conn_ill(NULL, ill->ill_ipst); 9599 } 9600 9601 if (need_dl_down) 9602 ill_dl_down(ill); 9603 9604 if (need_arp_down && !ill->ill_isv6) 9605 (void) ipif_arp_down(ipif); 9606 9607 /* 9608 * The default multicast interface might have changed (for 9609 * instance if the IPv6 scope of the address changed) 9610 */ 9611 ire_increment_multicast_generation(ill->ill_ipst, ill->ill_isv6); 9612 9613 return (err); 9614 } 9615 9616 /* 9617 * Restart entry point to restart the address set operation after the 9618 * refcounts have dropped to zero. 9619 */ 9620 /* ARGSUSED */ 9621 int 9622 ip_sioctl_addr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9623 ip_ioctl_cmd_t *ipip, void *ifreq) 9624 { 9625 ip1dbg(("ip_sioctl_addr_restart(%s:%u %p)\n", 9626 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9627 ASSERT(IAM_WRITER_IPIF(ipif)); 9628 (void) ipif_down_tail(ipif); 9629 return (ip_sioctl_addr_tail(ipif, sin, q, mp, B_TRUE)); 9630 } 9631 9632 /* ARGSUSED */ 9633 int 9634 ip_sioctl_get_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9635 ip_ioctl_cmd_t *ipip, void *if_req) 9636 { 9637 sin6_t *sin6 = (struct sockaddr_in6 *)sin; 9638 struct lifreq *lifr = (struct lifreq *)if_req; 9639 9640 ip1dbg(("ip_sioctl_get_addr(%s:%u %p)\n", 9641 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9642 /* 9643 * The net mask and address can't change since we have a 9644 * reference to the ipif. So no lock is necessary. 
9645 */ 9646 if (ipif->ipif_isv6) { 9647 *sin6 = sin6_null; 9648 sin6->sin6_family = AF_INET6; 9649 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 9650 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 9651 lifr->lifr_addrlen = 9652 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 9653 } else { 9654 *sin = sin_null; 9655 sin->sin_family = AF_INET; 9656 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 9657 if (ipip->ipi_cmd_type == LIF_CMD) { 9658 lifr->lifr_addrlen = 9659 ip_mask_to_plen(ipif->ipif_net_mask); 9660 } 9661 } 9662 return (0); 9663 } 9664 9665 /* 9666 * Set the destination address for a pt-pt interface. 9667 */ 9668 /* ARGSUSED */ 9669 int 9670 ip_sioctl_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9671 ip_ioctl_cmd_t *ipip, void *if_req) 9672 { 9673 int err = 0; 9674 in6_addr_t v6addr; 9675 boolean_t need_up = B_FALSE; 9676 9677 ip1dbg(("ip_sioctl_dstaddr(%s:%u %p)\n", 9678 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9679 ASSERT(IAM_WRITER_IPIF(ipif)); 9680 9681 if (ipif->ipif_isv6) { 9682 sin6_t *sin6; 9683 9684 if (sin->sin_family != AF_INET6) 9685 return (EAFNOSUPPORT); 9686 9687 sin6 = (sin6_t *)sin; 9688 v6addr = sin6->sin6_addr; 9689 9690 if (!ip_remote_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask)) 9691 return (EADDRNOTAVAIL); 9692 } else { 9693 ipaddr_t addr; 9694 9695 if (sin->sin_family != AF_INET) 9696 return (EAFNOSUPPORT); 9697 9698 addr = sin->sin_addr.s_addr; 9699 if (!ip_addr_ok_v4(addr, ipif->ipif_net_mask)) 9700 return (EADDRNOTAVAIL); 9701 9702 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 9703 } 9704 9705 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr, &v6addr)) 9706 return (0); /* No change */ 9707 9708 if (ipif->ipif_flags & IPIF_UP) { 9709 /* 9710 * If the interface is already marked up, 9711 * we call ipif_down which will take care 9712 * of ditching any IREs that have been set 9713 * up based on the old pp dst address. 
/*
 * Second stage of setting the point-to-point destination address.
 *
 * Installs the new destination (and makes ipif_v6subnet track it), marks
 * the ipif IPIF_POINTOPOINT if it was not already, clears any
 * duplicate-address state, and brings the ipif back up when need_up is set.
 *
 *	ipif	- logical interface being changed
 *	sin	- new destination; read as a sin6_t when ipif_isv6 is set
 *	q, mp	- handed to ipif_up()
 *	need_up	- B_TRUE if the ipif has to be brought back up here
 *
 * Returns the value of ipif_up() when need_up is set, otherwise 0.
 */
static int
ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    boolean_t need_up)
{
	in6_addr_t v6addr;
	ill_t	*ill = ipif->ipif_ill;
	int	err = 0;
	boolean_t need_dl_down;
	boolean_t need_arp_down;

	ip1dbg(("ip_sioctl_dstaddr_tail(%s:%u %p)\n", ill->ill_name,
	    ipif->ipif_id, (void *)ipif));

	/* Must cancel any pending timer before taking the ill_lock */
	if (ipif->ipif_recovery_id != 0)
		(void) untimeout(ipif->ipif_recovery_id);
	ipif->ipif_recovery_id = 0;

	/* Normalize the new destination; IPv4 is stored v4-mapped. */
	if (ipif->ipif_isv6) {
		sin6_t *sin6;

		sin6 = (sin6_t *)sin;
		v6addr = sin6->sin6_addr;
	} else {
		ipaddr_t addr;

		addr = sin->sin_addr.s_addr;
		IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
	}
	mutex_enter(&ill->ill_lock);
	/* Set point to point destination address. */
	if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
		/*
		 * Allow this as a means of creating logical
		 * pt-pt interfaces on top of e.g. an Ethernet.
		 * XXX Undocumented HACK for testing.
		 * pt-pt interfaces are created with NUD disabled.
		 */
		ipif->ipif_flags |= IPIF_POINTOPOINT;
		ipif->ipif_flags &= ~IPIF_BROADCAST;
		if (ipif->ipif_isv6)
			ill->ill_flags |= ILLF_NONUD;
	}

	/*
	 * If the interface was previously marked as a duplicate, then since
	 * we've now got a "new" address, it should no longer be considered a
	 * duplicate -- even if the "new" address is the same as the old one.
	 * Note that if all ipifs are down, we may have a pending ARP down
	 * event to handle.
	 */
	need_dl_down = need_arp_down = B_FALSE;
	if (ipif->ipif_flags & IPIF_DUPLICATE) {
		need_arp_down = !need_up;
		ipif->ipif_flags &= ~IPIF_DUPLICATE;
		/*
		 * The data link is only taken down when this was the last
		 * duplicate, nothing is up, and we are not about to bring
		 * this ipif up ourselves.
		 */
		if (--ill->ill_ipif_dup_count == 0 && !need_up &&
		    ill->ill_ipif_up_count == 0 && ill->ill_dl_up) {
			need_dl_down = B_TRUE;
		}
	}

	/* Set the new address. */
	ipif->ipif_v6pp_dst_addr = v6addr;
	/* Make sure subnet tracks pp_dst */
	ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
	mutex_exit(&ill->ill_lock);

	if (need_up) {
		/*
		 * Now bring the interface back up.  If this
		 * is the only IPIF for the ILL, ipif_up
		 * will have to re-bind to the device, so
		 * we may get back EINPROGRESS, in which
		 * case, this IOCTL will get completed in
		 * ip_rput_dlpi when we see the DL_BIND_ACK.
		 */
		err = ipif_up(ipif, q, mp);
	}

	/* Deferred teardown decided while ill_lock was held above. */
	if (need_dl_down)
		ill_dl_down(ill);
	if (need_arp_down && !ipif->ipif_isv6)
		(void) ipif_arp_down(ipif);

	return (err);
}
9779 */ 9780 need_dl_down = need_arp_down = B_FALSE; 9781 if (ipif->ipif_flags & IPIF_DUPLICATE) { 9782 need_arp_down = !need_up; 9783 ipif->ipif_flags &= ~IPIF_DUPLICATE; 9784 if (--ill->ill_ipif_dup_count == 0 && !need_up && 9785 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) { 9786 need_dl_down = B_TRUE; 9787 } 9788 } 9789 9790 /* Set the new address. */ 9791 ipif->ipif_v6pp_dst_addr = v6addr; 9792 /* Make sure subnet tracks pp_dst */ 9793 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 9794 mutex_exit(&ill->ill_lock); 9795 9796 if (need_up) { 9797 /* 9798 * Now bring the interface back up. If this 9799 * is the only IPIF for the ILL, ipif_up 9800 * will have to re-bind to the device, so 9801 * we may get back EINPROGRESS, in which 9802 * case, this IOCTL will get completed in 9803 * ip_rput_dlpi when we see the DL_BIND_ACK. 9804 */ 9805 err = ipif_up(ipif, q, mp); 9806 } 9807 9808 if (need_dl_down) 9809 ill_dl_down(ill); 9810 if (need_arp_down && !ipif->ipif_isv6) 9811 (void) ipif_arp_down(ipif); 9812 9813 return (err); 9814 } 9815 9816 /* 9817 * Restart entry point to restart the dstaddress set operation after the 9818 * refcounts have dropped to zero. 9819 */ 9820 /* ARGSUSED */ 9821 int 9822 ip_sioctl_dstaddr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9823 ip_ioctl_cmd_t *ipip, void *ifreq) 9824 { 9825 ip1dbg(("ip_sioctl_dstaddr_restart(%s:%u %p)\n", 9826 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9827 (void) ipif_down_tail(ipif); 9828 return (ip_sioctl_dstaddr_tail(ipif, sin, q, mp, B_TRUE)); 9829 } 9830 9831 /* ARGSUSED */ 9832 int 9833 ip_sioctl_get_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9834 ip_ioctl_cmd_t *ipip, void *if_req) 9835 { 9836 sin6_t *sin6 = (struct sockaddr_in6 *)sin; 9837 9838 ip1dbg(("ip_sioctl_get_dstaddr(%s:%u %p)\n", 9839 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9840 /* 9841 * Get point to point destination address. 
The addresses can't 9842 * change since we hold a reference to the ipif. 9843 */ 9844 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) 9845 return (EADDRNOTAVAIL); 9846 9847 if (ipif->ipif_isv6) { 9848 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 9849 *sin6 = sin6_null; 9850 sin6->sin6_family = AF_INET6; 9851 sin6->sin6_addr = ipif->ipif_v6pp_dst_addr; 9852 } else { 9853 *sin = sin_null; 9854 sin->sin_family = AF_INET; 9855 sin->sin_addr.s_addr = ipif->ipif_pp_dst_addr; 9856 } 9857 return (0); 9858 } 9859 9860 /* 9861 * Check which flags will change by the given flags being set 9862 * silently ignore flags which userland is not allowed to control. 9863 * (Because these flags may change between SIOCGLIFFLAGS and 9864 * SIOCSLIFFLAGS, and that's outside of userland's control, 9865 * we need to silently ignore them rather than fail.) 9866 */ 9867 static void 9868 ip_sioctl_flags_onoff(ipif_t *ipif, uint64_t flags, uint64_t *onp, 9869 uint64_t *offp) 9870 { 9871 ill_t *ill = ipif->ipif_ill; 9872 phyint_t *phyi = ill->ill_phyint; 9873 uint64_t cantchange_flags, intf_flags; 9874 uint64_t turn_on, turn_off; 9875 9876 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags; 9877 cantchange_flags = IFF_CANTCHANGE; 9878 if (IS_IPMP(ill)) 9879 cantchange_flags |= IFF_IPMP_CANTCHANGE; 9880 turn_on = (flags ^ intf_flags) & ~cantchange_flags; 9881 turn_off = intf_flags & turn_on; 9882 turn_on ^= turn_off; 9883 *onp = turn_on; 9884 *offp = turn_off; 9885 } 9886 9887 /* 9888 * Set interface flags. Many flags require special handling (e.g., 9889 * bringing the interface down); see below for details. 9890 * 9891 * NOTE : We really don't enforce that ipif_id zero should be used 9892 * for setting any flags other than IFF_LOGINT_FLAGS. This 9893 * is because applications generally does SICGLIFFLAGS and 9894 * ORs in the new flags (that affects the logical) and does a 9895 * SIOCSLIFFLAGS. Thus, "flags" below could contain bits other 9896 * than IFF_LOGINT_FLAGS. 
One could check whether "turn_on" - the 9897 * flags that will be turned on is correct with respect to 9898 * ipif_id 0. For backward compatibility reasons, it is not done. 9899 */ 9900 /* ARGSUSED */ 9901 int 9902 ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9903 ip_ioctl_cmd_t *ipip, void *if_req) 9904 { 9905 uint64_t turn_on; 9906 uint64_t turn_off; 9907 int err = 0; 9908 phyint_t *phyi; 9909 ill_t *ill; 9910 conn_t *connp; 9911 uint64_t intf_flags; 9912 boolean_t phyint_flags_modified = B_FALSE; 9913 uint64_t flags; 9914 struct ifreq *ifr; 9915 struct lifreq *lifr; 9916 boolean_t set_linklocal = B_FALSE; 9917 9918 ip1dbg(("ip_sioctl_flags(%s:%u %p)\n", 9919 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9920 9921 ASSERT(IAM_WRITER_IPIF(ipif)); 9922 9923 ill = ipif->ipif_ill; 9924 phyi = ill->ill_phyint; 9925 9926 if (ipip->ipi_cmd_type == IF_CMD) { 9927 ifr = (struct ifreq *)if_req; 9928 flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff); 9929 } else { 9930 lifr = (struct lifreq *)if_req; 9931 flags = lifr->lifr_flags; 9932 } 9933 9934 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags; 9935 9936 /* 9937 * Have the flags been set correctly until now? 9938 */ 9939 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0); 9940 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0); 9941 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0); 9942 /* 9943 * Compare the new flags to the old, and partition 9944 * into those coming on and those going off. 9945 * For the 16 bit command keep the bits above bit 16 unchanged. 9946 */ 9947 if (ipip->ipi_cmd == SIOCSIFFLAGS) 9948 flags |= intf_flags & ~0xFFFF; 9949 9950 /* 9951 * Explicitly fail attempts to change flags that are always invalid on 9952 * an IPMP meta-interface. 
9953 */ 9954 if (IS_IPMP(ill) && ((flags ^ intf_flags) & IFF_IPMP_INVALID)) 9955 return (EINVAL); 9956 9957 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off); 9958 if ((turn_on|turn_off) == 0) 9959 return (0); /* No change */ 9960 9961 /* 9962 * All test addresses must be IFF_DEPRECATED (to ensure source address 9963 * selection avoids them) -- so force IFF_DEPRECATED on, and do not 9964 * allow it to be turned off. 9965 */ 9966 if ((turn_off & (IFF_DEPRECATED|IFF_NOFAILOVER)) == IFF_DEPRECATED && 9967 (turn_on|intf_flags) & IFF_NOFAILOVER) 9968 return (EINVAL); 9969 9970 if ((connp = Q_TO_CONN(q)) == NULL) 9971 return (EINVAL); 9972 9973 /* 9974 * Only vrrp control socket is allowed to change IFF_UP and 9975 * IFF_NOACCEPT flags when IFF_VRRP is set. 9976 */ 9977 if ((intf_flags & IFF_VRRP) && ((turn_off | turn_on) & IFF_UP)) { 9978 if (!connp->conn_isvrrp) 9979 return (EINVAL); 9980 } 9981 9982 /* 9983 * The IFF_NOACCEPT flag can only be set on an IFF_VRRP IP address by 9984 * VRRP control socket. 9985 */ 9986 if ((turn_off | turn_on) & IFF_NOACCEPT) { 9987 if (!connp->conn_isvrrp || !(intf_flags & IFF_VRRP)) 9988 return (EINVAL); 9989 } 9990 9991 if (turn_on & IFF_NOFAILOVER) { 9992 turn_on |= IFF_DEPRECATED; 9993 flags |= IFF_DEPRECATED; 9994 } 9995 9996 /* 9997 * On underlying interfaces, only allow applications to manage test 9998 * addresses -- otherwise, they may get confused when the address 9999 * moves as part of being brought up. Likewise, prevent an 10000 * application-managed test address from being converted to a data 10001 * address. To prevent migration of administratively up addresses in 10002 * the kernel, we don't allow them to be converted either. 
10003 */ 10004 if (IS_UNDER_IPMP(ill)) { 10005 const uint64_t appflags = IFF_DHCPRUNNING | IFF_ADDRCONF; 10006 10007 if ((turn_on & appflags) && !(flags & IFF_NOFAILOVER)) 10008 return (EINVAL); 10009 10010 if ((turn_off & IFF_NOFAILOVER) && 10011 (flags & (appflags | IFF_UP | IFF_DUPLICATE))) 10012 return (EINVAL); 10013 } 10014 10015 /* 10016 * Only allow IFF_TEMPORARY flag to be set on 10017 * IPv6 interfaces. 10018 */ 10019 if ((turn_on & IFF_TEMPORARY) && !(ipif->ipif_isv6)) 10020 return (EINVAL); 10021 10022 /* 10023 * cannot turn off IFF_NOXMIT on VNI interfaces. 10024 */ 10025 if ((turn_off & IFF_NOXMIT) && IS_VNI(ipif->ipif_ill)) 10026 return (EINVAL); 10027 10028 /* 10029 * Don't allow the IFF_ROUTER flag to be turned on on loopback 10030 * interfaces. It makes no sense in that context. 10031 */ 10032 if ((turn_on & IFF_ROUTER) && (phyi->phyint_flags & PHYI_LOOPBACK)) 10033 return (EINVAL); 10034 10035 /* 10036 * For IPv6 ipif_id 0, don't allow the interface to be up without 10037 * a link local address if IFF_NOLOCAL or IFF_ANYCAST are not set. 10038 * If the link local address isn't set, and can be set, it will get 10039 * set later on in this function. 10040 */ 10041 if (ipif->ipif_id == 0 && ipif->ipif_isv6 && 10042 (flags & IFF_UP) && !(flags & (IFF_NOLOCAL|IFF_ANYCAST)) && 10043 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) { 10044 if (ipif_cant_setlinklocal(ipif)) 10045 return (EINVAL); 10046 set_linklocal = B_TRUE; 10047 } 10048 10049 /* 10050 * If we modify physical interface flags, we'll potentially need to 10051 * send up two routing socket messages for the changes (one for the 10052 * IPv4 ill, and another for the IPv6 ill). Note that here. 10053 */ 10054 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS) 10055 phyint_flags_modified = B_TRUE; 10056 10057 /* 10058 * All functioning PHYI_STANDBY interfaces start life PHYI_INACTIVE 10059 * (otherwise, we'd immediately use them, defeating standby). 
Also, 10060 * since PHYI_INACTIVE has a separate meaning when PHYI_STANDBY is not 10061 * set, don't allow PHYI_STANDBY to be set if PHYI_INACTIVE is already 10062 * set, and clear PHYI_INACTIVE if PHYI_STANDBY is being cleared. We 10063 * also don't allow PHYI_STANDBY if VNI is enabled since its semantics 10064 * will not be honored. 10065 */ 10066 if (turn_on & PHYI_STANDBY) { 10067 /* 10068 * No need to grab ill_g_usesrc_lock here; see the 10069 * synchronization notes in ip.c. 10070 */ 10071 if (ill->ill_usesrc_grp_next != NULL || 10072 intf_flags & PHYI_INACTIVE) 10073 return (EINVAL); 10074 if (!(flags & PHYI_FAILED)) { 10075 flags |= PHYI_INACTIVE; 10076 turn_on |= PHYI_INACTIVE; 10077 } 10078 } 10079 10080 if (turn_off & PHYI_STANDBY) { 10081 flags &= ~PHYI_INACTIVE; 10082 turn_off |= PHYI_INACTIVE; 10083 } 10084 10085 /* 10086 * PHYI_FAILED and PHYI_INACTIVE are mutually exclusive; fail if both 10087 * would end up on. 10088 */ 10089 if ((flags & (PHYI_FAILED | PHYI_INACTIVE)) == 10090 (PHYI_FAILED | PHYI_INACTIVE)) 10091 return (EINVAL); 10092 10093 /* 10094 * If ILLF_ROUTER changes, we need to change the ip forwarding 10095 * status of the interface. 10096 */ 10097 if ((turn_on | turn_off) & ILLF_ROUTER) 10098 (void) ill_forward_set(ill, ((turn_on & ILLF_ROUTER) != 0)); 10099 10100 /* 10101 * If the interface is not UP and we are not going to 10102 * bring it UP, record the flags and return. When the 10103 * interface comes UP later, the right actions will be 10104 * taken. 10105 */ 10106 if (!(ipif->ipif_flags & IPIF_UP) && 10107 !(turn_on & IPIF_UP)) { 10108 /* Record new flags in their respective places. 
*/ 10109 mutex_enter(&ill->ill_lock); 10110 mutex_enter(&ill->ill_phyint->phyint_lock); 10111 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS); 10112 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS); 10113 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS); 10114 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS); 10115 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS); 10116 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS); 10117 mutex_exit(&ill->ill_lock); 10118 mutex_exit(&ill->ill_phyint->phyint_lock); 10119 10120 /* 10121 * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the 10122 * same to the kernel: if any of them has been set by 10123 * userland, the interface cannot be used for data traffic. 10124 */ 10125 if ((turn_on|turn_off) & 10126 (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) { 10127 ASSERT(!IS_IPMP(ill)); 10128 /* 10129 * It's possible the ill is part of an "anonymous" 10130 * IPMP group rather than a real group. In that case, 10131 * there are no other interfaces in the group and thus 10132 * no need to call ipmp_phyint_refresh_active(). 10133 */ 10134 if (IS_UNDER_IPMP(ill)) 10135 ipmp_phyint_refresh_active(phyi); 10136 } 10137 10138 if (phyint_flags_modified) { 10139 if (phyi->phyint_illv4 != NULL) { 10140 ip_rts_ifmsg(phyi->phyint_illv4-> 10141 ill_ipif, RTSQ_DEFAULT); 10142 } 10143 if (phyi->phyint_illv6 != NULL) { 10144 ip_rts_ifmsg(phyi->phyint_illv6-> 10145 ill_ipif, RTSQ_DEFAULT); 10146 } 10147 } 10148 /* The default multicast interface might have changed */ 10149 ire_increment_multicast_generation(ill->ill_ipst, 10150 ill->ill_isv6); 10151 10152 return (0); 10153 } else if (set_linklocal) { 10154 mutex_enter(&ill->ill_lock); 10155 if (set_linklocal) 10156 ipif->ipif_state_flags |= IPIF_SET_LINKLOCAL; 10157 mutex_exit(&ill->ill_lock); 10158 } 10159 10160 /* 10161 * Disallow IPv6 interfaces coming up that have the unspecified address, 10162 * or point-to-point interfaces with an unspecified destination. 
We do 10163 * allow the address to be unspecified for IPIF_NOLOCAL interfaces that 10164 * have a subnet assigned, which is how in.ndpd currently manages its 10165 * onlink prefix list when no addresses are configured with those 10166 * prefixes. 10167 */ 10168 if (ipif->ipif_isv6 && 10169 ((IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) && 10170 (!(ipif->ipif_flags & IPIF_NOLOCAL) && !(turn_on & IPIF_NOLOCAL) || 10171 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) || 10172 ((ipif->ipif_flags & IPIF_POINTOPOINT) && 10173 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr)))) { 10174 return (EINVAL); 10175 } 10176 10177 /* 10178 * Prevent IPv4 point-to-point interfaces with a 0.0.0.0 destination 10179 * from being brought up. 10180 */ 10181 if (!ipif->ipif_isv6 && 10182 ((ipif->ipif_flags & IPIF_POINTOPOINT) && 10183 ipif->ipif_pp_dst_addr == INADDR_ANY)) { 10184 return (EINVAL); 10185 } 10186 10187 /* 10188 * If we are going to change one or more of the flags that are 10189 * IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL, ILLF_NOARP, 10190 * ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, IPIF_PREFERRED, and 10191 * IPIF_NOFAILOVER, we will take special action. This is 10192 * done by bring the ipif down, changing the flags and bringing 10193 * it back up again. For IPIF_NOFAILOVER, the act of bringing it 10194 * back up will trigger the address to be moved. 10195 * 10196 * If we are going to change IFF_NOACCEPT, we need to bring 10197 * all the ipifs down then bring them up again. The act of 10198 * bringing all the ipifs back up will trigger the local 10199 * ires being recreated with "no_accept" set/cleared. 10200 * 10201 * Note that ILLF_NOACCEPT is always set separately from the 10202 * other flags. 
10203 */ 10204 if ((turn_on|turn_off) & 10205 (IPIF_UP|IPIF_DEPRECATED|IPIF_NOXMIT|IPIF_NOLOCAL|ILLF_NOARP| 10206 ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED| 10207 IPIF_NOFAILOVER)) { 10208 /* 10209 * ipif_down() will ire_delete bcast ire's for the subnet, 10210 * while the ire_identical_ref tracks the case of IRE_BROADCAST 10211 * entries shared between multiple ipifs on the same subnet. 10212 */ 10213 if (((ipif->ipif_flags | turn_on) & IPIF_UP) && 10214 !(turn_off & IPIF_UP)) { 10215 if (ipif->ipif_flags & IPIF_UP) 10216 ill->ill_logical_down = 1; 10217 turn_on &= ~IPIF_UP; 10218 } 10219 err = ipif_down(ipif, q, mp); 10220 ip1dbg(("ipif_down returns %d err ", err)); 10221 if (err == EINPROGRESS) 10222 return (err); 10223 (void) ipif_down_tail(ipif); 10224 } else if ((turn_on|turn_off) & ILLF_NOACCEPT) { 10225 /* 10226 * If we can quiesce the ill, then continue. If not, then 10227 * ip_sioctl_flags_tail() will be called from 10228 * ipif_ill_refrele_tail(). 10229 */ 10230 ill_down_ipifs(ill, B_TRUE); 10231 10232 mutex_enter(&connp->conn_lock); 10233 mutex_enter(&ill->ill_lock); 10234 if (!ill_is_quiescent(ill)) { 10235 boolean_t success; 10236 10237 success = ipsq_pending_mp_add(connp, ill->ill_ipif, 10238 q, mp, ILL_DOWN); 10239 mutex_exit(&ill->ill_lock); 10240 mutex_exit(&connp->conn_lock); 10241 return (success ? 
/*
 * Second stage of SIOCS*IFFLAGS processing: commit the flag changes and
 * bring interfaces back up as needed.
 *
 * The on/off sets are recomputed from `flags' against the current state
 * (IFF_UP is excluded and handled separately below), written into the ipif,
 * ill and phyint flag words, and then:
 *   - if ILLF_NOACCEPT changed, every ipif on the ill is brought up with
 *     ill_up_ipifs();
 *   - else if IFF_UP is requested and the ipif is down, ipif_up() is called;
 *   - otherwise routing-socket messages are sent and SCTP's ipif list is
 *     updated directly.
 *
 *	ipif	- logical interface being changed; caller must be writer
 *	flags	- complete set of flags being requested
 *	q, mp	- handed to ill_up_ipifs() / ipif_up()
 *
 * Returns the value of ill_up_ipifs() or ipif_up() when one of them is
 * called, otherwise 0.
 */
static int
ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp)
{
	ill_t	*ill;
	phyint_t *phyi;
	uint64_t turn_on, turn_off;
	boolean_t phyint_flags_modified = B_FALSE;
	int	err = 0;
	boolean_t set_linklocal = B_FALSE;

	ip1dbg(("ip_sioctl_flags_tail(%s:%u)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id));

	ASSERT(IAM_WRITER_IPIF(ipif));

	ill = ipif->ipif_ill;
	phyi = ill->ill_phyint;

	ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off);

	/*
	 * IFF_UP is handled separately.
	 */
	turn_on &= ~IFF_UP;
	turn_off &= ~IFF_UP;

	if ((turn_on|turn_off) & IFF_PHYINT_FLAGS)
		phyint_flags_modified = B_TRUE;

	/*
	 * Now we change the flags. Track current value of
	 * other flags in their respective places.
	 */
	mutex_enter(&ill->ill_lock);
	mutex_enter(&phyi->phyint_lock);
	ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS);
	ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS);
	ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS);
	ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS);
	phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS);
	phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS);
	/* Consume the marker left behind by ip_sioctl_flags(). */
	if (ipif->ipif_state_flags & IPIF_SET_LINKLOCAL) {
		set_linklocal = B_TRUE;
		ipif->ipif_state_flags &= ~IPIF_SET_LINKLOCAL;
	}

	mutex_exit(&ill->ill_lock);
	mutex_exit(&phyi->phyint_lock);

	/* Done after both locks have been dropped. */
	if (set_linklocal)
		(void) ipif_setlinklocal(ipif);

	/*
	 * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the same to
	 * the kernel: if any of them has been set by userland, the interface
	 * cannot be used for data traffic.
	 */
	if ((turn_on|turn_off) & (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) {
		ASSERT(!IS_IPMP(ill));
		/*
		 * It's possible the ill is part of an "anonymous" IPMP group
		 * rather than a real group.  In that case, there are no other
		 * interfaces in the group and thus no need for us to call
		 * ipmp_phyint_refresh_active().
		 */
		if (IS_UNDER_IPMP(ill))
			ipmp_phyint_refresh_active(phyi);
	}

	if ((turn_on|turn_off) & ILLF_NOACCEPT) {
		/*
		 * If the ILLF_NOACCEPT flag is changed, bring up all the
		 * ipifs that were brought down.
		 *
		 * The routing sockets messages are sent as the result
		 * of ill_up_ipifs(), further, SCTP's IPIF list was updated
		 * as well.
		 */
		err = ill_up_ipifs(ill, q, mp);
	} else if ((flags & IFF_UP) && !(ipif->ipif_flags & IPIF_UP)) {
		/*
		 * XXX ipif_up really does not know whether a phyint flags
		 * was modified or not. So, it sends up information on
		 * only one routing sockets message. As we don't bring up
		 * the interface and also set PHYI_ flags simultaneously
		 * it should be okay.
		 */
		err = ipif_up(ipif, q, mp);
	} else {
		/*
		 * Make sure routing socket sees all changes to the flags.
		 * ipif_up_done* handles this when we use ipif_up.
		 */
		if (phyint_flags_modified) {
			if (phyi->phyint_illv4 != NULL) {
				ip_rts_ifmsg(phyi->phyint_illv4->
				    ill_ipif, RTSQ_DEFAULT);
			}
			if (phyi->phyint_illv6 != NULL) {
				ip_rts_ifmsg(phyi->phyint_illv6->
				    ill_ipif, RTSQ_DEFAULT);
			}
		} else {
			ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
		}
		/*
		 * Update the flags in SCTP's IPIF list, ipif_up() will do
		 * this in need_up case.
		 */
		sctp_update_ipif(ipif, SCTP_IPIF_UPDATE);
	}

	/* The default multicast interface might have changed */
	ire_increment_multicast_generation(ill->ill_ipst, ill->ill_isv6);
	return (err);
}
As we don't bring up 10333 * the interface and also set PHYI_ flags simultaneously 10334 * it should be okay. 10335 */ 10336 err = ipif_up(ipif, q, mp); 10337 } else { 10338 /* 10339 * Make sure routing socket sees all changes to the flags. 10340 * ipif_up_done* handles this when we use ipif_up. 10341 */ 10342 if (phyint_flags_modified) { 10343 if (phyi->phyint_illv4 != NULL) { 10344 ip_rts_ifmsg(phyi->phyint_illv4-> 10345 ill_ipif, RTSQ_DEFAULT); 10346 } 10347 if (phyi->phyint_illv6 != NULL) { 10348 ip_rts_ifmsg(phyi->phyint_illv6-> 10349 ill_ipif, RTSQ_DEFAULT); 10350 } 10351 } else { 10352 ip_rts_ifmsg(ipif, RTSQ_DEFAULT); 10353 } 10354 /* 10355 * Update the flags in SCTP's IPIF list, ipif_up() will do 10356 * this in need_up case. 10357 */ 10358 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 10359 } 10360 10361 /* The default multicast interface might have changed */ 10362 ire_increment_multicast_generation(ill->ill_ipst, ill->ill_isv6); 10363 return (err); 10364 } 10365 10366 /* 10367 * Restart the flags operation now that the refcounts have dropped to zero. 10368 */ 10369 /* ARGSUSED */ 10370 int 10371 ip_sioctl_flags_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10372 ip_ioctl_cmd_t *ipip, void *if_req) 10373 { 10374 uint64_t flags; 10375 struct ifreq *ifr = if_req; 10376 struct lifreq *lifr = if_req; 10377 uint64_t turn_on, turn_off; 10378 10379 ip1dbg(("ip_sioctl_flags_restart(%s:%u %p)\n", 10380 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10381 10382 if (ipip->ipi_cmd_type == IF_CMD) { 10383 /* cast to uint16_t prevents unwanted sign extension */ 10384 flags = (uint16_t)ifr->ifr_flags; 10385 } else { 10386 flags = lifr->lifr_flags; 10387 } 10388 10389 /* 10390 * If this function call is a result of the ILLF_NOACCEPT flag 10391 * change, do not call ipif_down_tail(). See ip_sioctl_flags(). 
10392 */ 10393 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off); 10394 if (!((turn_on|turn_off) & ILLF_NOACCEPT)) 10395 (void) ipif_down_tail(ipif); 10396 10397 return (ip_sioctl_flags_tail(ipif, flags, q, mp)); 10398 } 10399 10400 /* 10401 * Can operate on either a module or a driver queue. 10402 */ 10403 /* ARGSUSED */ 10404 int 10405 ip_sioctl_get_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10406 ip_ioctl_cmd_t *ipip, void *if_req) 10407 { 10408 /* 10409 * Has the flags been set correctly till now ? 10410 */ 10411 ill_t *ill = ipif->ipif_ill; 10412 phyint_t *phyi = ill->ill_phyint; 10413 10414 ip1dbg(("ip_sioctl_get_flags(%s:%u %p)\n", 10415 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10416 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0); 10417 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0); 10418 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0); 10419 10420 /* 10421 * Need a lock since some flags can be set even when there are 10422 * references to the ipif. 10423 */ 10424 mutex_enter(&ill->ill_lock); 10425 if (ipip->ipi_cmd_type == IF_CMD) { 10426 struct ifreq *ifr = (struct ifreq *)if_req; 10427 10428 /* Get interface flags (low 16 only). */ 10429 ifr->ifr_flags = ((ipif->ipif_flags | 10430 ill->ill_flags | phyi->phyint_flags) & 0xffff); 10431 } else { 10432 struct lifreq *lifr = (struct lifreq *)if_req; 10433 10434 /* Get interface flags. */ 10435 lifr->lifr_flags = ipif->ipif_flags | 10436 ill->ill_flags | phyi->phyint_flags; 10437 } 10438 mutex_exit(&ill->ill_lock); 10439 return (0); 10440 } 10441 10442 /* 10443 * We allow the MTU to be set on an ILL, but not have it be different 10444 * for different IPIFs since we don't actually send packets on IPIFs. 
10445 */ 10446 /* ARGSUSED */ 10447 int 10448 ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10449 ip_ioctl_cmd_t *ipip, void *if_req) 10450 { 10451 int mtu; 10452 int ip_min_mtu; 10453 struct ifreq *ifr; 10454 struct lifreq *lifr; 10455 ill_t *ill; 10456 10457 ip1dbg(("ip_sioctl_mtu(%s:%u %p)\n", ipif->ipif_ill->ill_name, 10458 ipif->ipif_id, (void *)ipif)); 10459 if (ipip->ipi_cmd_type == IF_CMD) { 10460 ifr = (struct ifreq *)if_req; 10461 mtu = ifr->ifr_metric; 10462 } else { 10463 lifr = (struct lifreq *)if_req; 10464 mtu = lifr->lifr_mtu; 10465 } 10466 /* Only allow for logical unit zero i.e. not on "bge0:17" */ 10467 if (ipif->ipif_id != 0) 10468 return (EINVAL); 10469 10470 ill = ipif->ipif_ill; 10471 if (ipif->ipif_isv6) 10472 ip_min_mtu = IPV6_MIN_MTU; 10473 else 10474 ip_min_mtu = IP_MIN_MTU; 10475 10476 mutex_enter(&ill->ill_lock); 10477 if (mtu > ill->ill_max_frag || mtu < ip_min_mtu) { 10478 mutex_exit(&ill->ill_lock); 10479 return (EINVAL); 10480 } 10481 /* 10482 * The dce and fragmentation code can handle changes to ill_mtu 10483 * concurrent with sending/fragmenting packets. 10484 */ 10485 ill->ill_mtu = mtu; 10486 ill->ill_flags |= ILLF_FIXEDMTU; 10487 mutex_exit(&ill->ill_lock); 10488 10489 /* 10490 * Make sure all dce_generation checks find out 10491 * that ill_mtu has changed. 10492 */ 10493 dce_increment_all_generations(ill->ill_isv6, ill->ill_ipst); 10494 10495 /* Update the MTU in SCTP's list */ 10496 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 10497 return (0); 10498 } 10499 10500 /* Get interface MTU. 
*/ 10501 /* ARGSUSED */ 10502 int 10503 ip_sioctl_get_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10504 ip_ioctl_cmd_t *ipip, void *if_req) 10505 { 10506 struct ifreq *ifr; 10507 struct lifreq *lifr; 10508 10509 ip1dbg(("ip_sioctl_get_mtu(%s:%u %p)\n", 10510 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10511 10512 /* 10513 * We allow a get on any logical interface even though the set 10514 * can only be done on logical unit 0. 10515 */ 10516 if (ipip->ipi_cmd_type == IF_CMD) { 10517 ifr = (struct ifreq *)if_req; 10518 ifr->ifr_metric = ipif->ipif_ill->ill_mtu; 10519 } else { 10520 lifr = (struct lifreq *)if_req; 10521 lifr->lifr_mtu = ipif->ipif_ill->ill_mtu; 10522 } 10523 return (0); 10524 } 10525 10526 /* Set interface broadcast address. */ 10527 /* ARGSUSED2 */ 10528 int 10529 ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10530 ip_ioctl_cmd_t *ipip, void *if_req) 10531 { 10532 ipaddr_t addr; 10533 ire_t *ire; 10534 ill_t *ill = ipif->ipif_ill; 10535 ip_stack_t *ipst = ill->ill_ipst; 10536 10537 ip1dbg(("ip_sioctl_brdaddr(%s:%u)\n", ill->ill_name, 10538 ipif->ipif_id)); 10539 10540 ASSERT(IAM_WRITER_IPIF(ipif)); 10541 if (!(ipif->ipif_flags & IPIF_BROADCAST)) 10542 return (EADDRNOTAVAIL); 10543 10544 ASSERT(!(ipif->ipif_isv6)); /* No IPv6 broadcast */ 10545 10546 if (sin->sin_family != AF_INET) 10547 return (EAFNOSUPPORT); 10548 10549 addr = sin->sin_addr.s_addr; 10550 if (ipif->ipif_flags & IPIF_UP) { 10551 /* 10552 * If we are already up, make sure the new 10553 * broadcast address makes sense. If it does, 10554 * there should be an IRE for it already. 10555 */ 10556 ire = ire_ftable_lookup_v4(addr, 0, 0, IRE_BROADCAST, 10557 ill, ipif->ipif_zoneid, NULL, 10558 (MATCH_IRE_ILL | MATCH_IRE_TYPE), 0, ipst, NULL); 10559 if (ire == NULL) { 10560 return (EINVAL); 10561 } else { 10562 ire_refrele(ire); 10563 } 10564 } 10565 /* 10566 * Changing the broadcast addr for this ipif. 
Since the IRE_BROADCAST 10567 * needs to already exist we never need to change the set of 10568 * IRE_BROADCASTs when we are UP. 10569 */ 10570 if (addr != ipif->ipif_brd_addr) 10571 IN6_IPADDR_TO_V4MAPPED(addr, &ipif->ipif_v6brd_addr); 10572 10573 return (0); 10574 } 10575 10576 /* Get interface broadcast address. */ 10577 /* ARGSUSED */ 10578 int 10579 ip_sioctl_get_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10580 ip_ioctl_cmd_t *ipip, void *if_req) 10581 { 10582 ip1dbg(("ip_sioctl_get_brdaddr(%s:%u %p)\n", 10583 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10584 if (!(ipif->ipif_flags & IPIF_BROADCAST)) 10585 return (EADDRNOTAVAIL); 10586 10587 /* IPIF_BROADCAST not possible with IPv6 */ 10588 ASSERT(!ipif->ipif_isv6); 10589 *sin = sin_null; 10590 sin->sin_family = AF_INET; 10591 sin->sin_addr.s_addr = ipif->ipif_brd_addr; 10592 return (0); 10593 } 10594 10595 /* 10596 * This routine is called to handle the SIOCS*IFNETMASK IOCTL. 10597 */ 10598 /* ARGSUSED */ 10599 int 10600 ip_sioctl_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10601 ip_ioctl_cmd_t *ipip, void *if_req) 10602 { 10603 int err = 0; 10604 in6_addr_t v6mask; 10605 10606 ip1dbg(("ip_sioctl_netmask(%s:%u %p)\n", 10607 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10608 10609 ASSERT(IAM_WRITER_IPIF(ipif)); 10610 10611 if (ipif->ipif_isv6) { 10612 sin6_t *sin6; 10613 10614 if (sin->sin_family != AF_INET6) 10615 return (EAFNOSUPPORT); 10616 10617 sin6 = (sin6_t *)sin; 10618 v6mask = sin6->sin6_addr; 10619 } else { 10620 ipaddr_t mask; 10621 10622 if (sin->sin_family != AF_INET) 10623 return (EAFNOSUPPORT); 10624 10625 mask = sin->sin_addr.s_addr; 10626 V4MASK_TO_V6(mask, v6mask); 10627 } 10628 10629 /* 10630 * No big deal if the interface isn't already up, or the mask 10631 * isn't really changing, or this is pt-pt. 
10632 */ 10633 if (!(ipif->ipif_flags & IPIF_UP) || 10634 IN6_ARE_ADDR_EQUAL(&v6mask, &ipif->ipif_v6net_mask) || 10635 (ipif->ipif_flags & IPIF_POINTOPOINT)) { 10636 ipif->ipif_v6net_mask = v6mask; 10637 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 10638 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 10639 ipif->ipif_v6net_mask, 10640 ipif->ipif_v6subnet); 10641 } 10642 return (0); 10643 } 10644 /* 10645 * Make sure we have valid net and subnet broadcast ire's 10646 * for the old netmask, if needed by other logical interfaces. 10647 */ 10648 err = ipif_logical_down(ipif, q, mp); 10649 if (err == EINPROGRESS) 10650 return (err); 10651 (void) ipif_down_tail(ipif); 10652 err = ip_sioctl_netmask_tail(ipif, sin, q, mp); 10653 return (err); 10654 } 10655 10656 static int 10657 ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp) 10658 { 10659 in6_addr_t v6mask; 10660 int err = 0; 10661 10662 ip1dbg(("ip_sioctl_netmask_tail(%s:%u %p)\n", 10663 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10664 10665 if (ipif->ipif_isv6) { 10666 sin6_t *sin6; 10667 10668 sin6 = (sin6_t *)sin; 10669 v6mask = sin6->sin6_addr; 10670 } else { 10671 ipaddr_t mask; 10672 10673 mask = sin->sin_addr.s_addr; 10674 V4MASK_TO_V6(mask, v6mask); 10675 } 10676 10677 ipif->ipif_v6net_mask = v6mask; 10678 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 10679 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 10680 ipif->ipif_v6subnet); 10681 } 10682 err = ipif_up(ipif, q, mp); 10683 10684 if (err == 0 || err == EINPROGRESS) { 10685 /* 10686 * The interface must be DL_BOUND if this packet has to 10687 * go out on the wire. Since we only go through a logical 10688 * down and are bound with the driver during an internal 10689 * down/up that is satisfied. 10690 */ 10691 if (!ipif->ipif_isv6 && ipif->ipif_ill->ill_wq != NULL) { 10692 /* Potentially broadcast an address mask reply. 
*/ 10693 ipif_mask_reply(ipif); 10694 } 10695 } 10696 return (err); 10697 } 10698 10699 /* ARGSUSED */ 10700 int 10701 ip_sioctl_netmask_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10702 ip_ioctl_cmd_t *ipip, void *if_req) 10703 { 10704 ip1dbg(("ip_sioctl_netmask_restart(%s:%u %p)\n", 10705 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10706 (void) ipif_down_tail(ipif); 10707 return (ip_sioctl_netmask_tail(ipif, sin, q, mp)); 10708 } 10709 10710 /* Get interface net mask. */ 10711 /* ARGSUSED */ 10712 int 10713 ip_sioctl_get_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10714 ip_ioctl_cmd_t *ipip, void *if_req) 10715 { 10716 struct lifreq *lifr = (struct lifreq *)if_req; 10717 struct sockaddr_in6 *sin6 = (sin6_t *)sin; 10718 10719 ip1dbg(("ip_sioctl_get_netmask(%s:%u %p)\n", 10720 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10721 10722 /* 10723 * net mask can't change since we have a reference to the ipif. 10724 */ 10725 if (ipif->ipif_isv6) { 10726 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 10727 *sin6 = sin6_null; 10728 sin6->sin6_family = AF_INET6; 10729 sin6->sin6_addr = ipif->ipif_v6net_mask; 10730 lifr->lifr_addrlen = 10731 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 10732 } else { 10733 *sin = sin_null; 10734 sin->sin_family = AF_INET; 10735 sin->sin_addr.s_addr = ipif->ipif_net_mask; 10736 if (ipip->ipi_cmd_type == LIF_CMD) { 10737 lifr->lifr_addrlen = 10738 ip_mask_to_plen(ipif->ipif_net_mask); 10739 } 10740 } 10741 return (0); 10742 } 10743 10744 /* ARGSUSED */ 10745 int 10746 ip_sioctl_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10747 ip_ioctl_cmd_t *ipip, void *if_req) 10748 { 10749 ip1dbg(("ip_sioctl_metric(%s:%u %p)\n", 10750 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10751 10752 /* 10753 * Since no applications should ever be setting metrics on underlying 10754 * interfaces, we explicitly fail to smoke 'em out. 
10755 */ 10756 if (IS_UNDER_IPMP(ipif->ipif_ill)) 10757 return (EINVAL); 10758 10759 /* 10760 * Set interface metric. We don't use this for 10761 * anything but we keep track of it in case it is 10762 * important to routing applications or such. 10763 */ 10764 if (ipip->ipi_cmd_type == IF_CMD) { 10765 struct ifreq *ifr; 10766 10767 ifr = (struct ifreq *)if_req; 10768 ipif->ipif_metric = ifr->ifr_metric; 10769 } else { 10770 struct lifreq *lifr; 10771 10772 lifr = (struct lifreq *)if_req; 10773 ipif->ipif_metric = lifr->lifr_metric; 10774 } 10775 return (0); 10776 } 10777 10778 /* ARGSUSED */ 10779 int 10780 ip_sioctl_get_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10781 ip_ioctl_cmd_t *ipip, void *if_req) 10782 { 10783 /* Get interface metric. */ 10784 ip1dbg(("ip_sioctl_get_metric(%s:%u %p)\n", 10785 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10786 10787 if (ipip->ipi_cmd_type == IF_CMD) { 10788 struct ifreq *ifr; 10789 10790 ifr = (struct ifreq *)if_req; 10791 ifr->ifr_metric = ipif->ipif_metric; 10792 } else { 10793 struct lifreq *lifr; 10794 10795 lifr = (struct lifreq *)if_req; 10796 lifr->lifr_metric = ipif->ipif_metric; 10797 } 10798 10799 return (0); 10800 } 10801 10802 /* ARGSUSED */ 10803 int 10804 ip_sioctl_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10805 ip_ioctl_cmd_t *ipip, void *if_req) 10806 { 10807 int arp_muxid; 10808 10809 ip1dbg(("ip_sioctl_muxid(%s:%u %p)\n", 10810 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10811 /* 10812 * Set the muxid returned from I_PLINK. 
10813 */ 10814 if (ipip->ipi_cmd_type == IF_CMD) { 10815 struct ifreq *ifr = (struct ifreq *)if_req; 10816 10817 ipif->ipif_ill->ill_muxid = ifr->ifr_ip_muxid; 10818 arp_muxid = ifr->ifr_arp_muxid; 10819 } else { 10820 struct lifreq *lifr = (struct lifreq *)if_req; 10821 10822 ipif->ipif_ill->ill_muxid = lifr->lifr_ip_muxid; 10823 arp_muxid = lifr->lifr_arp_muxid; 10824 } 10825 arl_set_muxid(ipif->ipif_ill, arp_muxid); 10826 return (0); 10827 } 10828 10829 /* ARGSUSED */ 10830 int 10831 ip_sioctl_get_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10832 ip_ioctl_cmd_t *ipip, void *if_req) 10833 { 10834 int arp_muxid = 0; 10835 10836 ip1dbg(("ip_sioctl_get_muxid(%s:%u %p)\n", 10837 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10838 /* 10839 * Get the muxid saved in ill for I_PUNLINK. 10840 */ 10841 arp_muxid = arl_get_muxid(ipif->ipif_ill); 10842 if (ipip->ipi_cmd_type == IF_CMD) { 10843 struct ifreq *ifr = (struct ifreq *)if_req; 10844 10845 ifr->ifr_ip_muxid = ipif->ipif_ill->ill_muxid; 10846 ifr->ifr_arp_muxid = arp_muxid; 10847 } else { 10848 struct lifreq *lifr = (struct lifreq *)if_req; 10849 10850 lifr->lifr_ip_muxid = ipif->ipif_ill->ill_muxid; 10851 lifr->lifr_arp_muxid = arp_muxid; 10852 } 10853 return (0); 10854 } 10855 10856 /* 10857 * Set the subnet prefix. Does not modify the broadcast address. 
10858 */ 10859 /* ARGSUSED */ 10860 int 10861 ip_sioctl_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10862 ip_ioctl_cmd_t *ipip, void *if_req) 10863 { 10864 int err = 0; 10865 in6_addr_t v6addr; 10866 in6_addr_t v6mask; 10867 boolean_t need_up = B_FALSE; 10868 int addrlen; 10869 10870 ip1dbg(("ip_sioctl_subnet(%s:%u %p)\n", 10871 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10872 10873 ASSERT(IAM_WRITER_IPIF(ipif)); 10874 addrlen = ((struct lifreq *)if_req)->lifr_addrlen; 10875 10876 if (ipif->ipif_isv6) { 10877 sin6_t *sin6; 10878 10879 if (sin->sin_family != AF_INET6) 10880 return (EAFNOSUPPORT); 10881 10882 sin6 = (sin6_t *)sin; 10883 v6addr = sin6->sin6_addr; 10884 if (!ip_remote_addr_ok_v6(&v6addr, &ipv6_all_ones)) 10885 return (EADDRNOTAVAIL); 10886 } else { 10887 ipaddr_t addr; 10888 10889 if (sin->sin_family != AF_INET) 10890 return (EAFNOSUPPORT); 10891 10892 addr = sin->sin_addr.s_addr; 10893 if (!ip_addr_ok_v4(addr, 0xFFFFFFFF)) 10894 return (EADDRNOTAVAIL); 10895 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 10896 /* Add 96 bits */ 10897 addrlen += IPV6_ABITS - IP_ABITS; 10898 } 10899 10900 if (ip_plen_to_mask_v6(addrlen, &v6mask) == NULL) 10901 return (EINVAL); 10902 10903 /* Check if bits in the address is set past the mask */ 10904 if (!V6_MASK_EQ(v6addr, v6mask, v6addr)) 10905 return (EINVAL); 10906 10907 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6subnet, &v6addr) && 10908 IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6net_mask, &v6mask)) 10909 return (0); /* No change */ 10910 10911 if (ipif->ipif_flags & IPIF_UP) { 10912 /* 10913 * If the interface is already marked up, 10914 * we call ipif_down which will take care 10915 * of ditching any IREs that have been set 10916 * up based on the old interface address. 
10917 */ 10918 err = ipif_logical_down(ipif, q, mp); 10919 if (err == EINPROGRESS) 10920 return (err); 10921 (void) ipif_down_tail(ipif); 10922 need_up = B_TRUE; 10923 } 10924 10925 err = ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, need_up); 10926 return (err); 10927 } 10928 10929 static int 10930 ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t v6addr, in6_addr_t v6mask, 10931 queue_t *q, mblk_t *mp, boolean_t need_up) 10932 { 10933 ill_t *ill = ipif->ipif_ill; 10934 int err = 0; 10935 10936 ip1dbg(("ip_sioctl_subnet_tail(%s:%u %p)\n", 10937 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10938 10939 /* Set the new address. */ 10940 mutex_enter(&ill->ill_lock); 10941 ipif->ipif_v6net_mask = v6mask; 10942 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 10943 V6_MASK_COPY(v6addr, ipif->ipif_v6net_mask, 10944 ipif->ipif_v6subnet); 10945 } 10946 mutex_exit(&ill->ill_lock); 10947 10948 if (need_up) { 10949 /* 10950 * Now bring the interface back up. If this 10951 * is the only IPIF for the ILL, ipif_up 10952 * will have to re-bind to the device, so 10953 * we may get back EINPROGRESS, in which 10954 * case, this IOCTL will get completed in 10955 * ip_rput_dlpi when we see the DL_BIND_ACK. 
10956 */ 10957 err = ipif_up(ipif, q, mp); 10958 if (err == EINPROGRESS) 10959 return (err); 10960 } 10961 return (err); 10962 } 10963 10964 /* ARGSUSED */ 10965 int 10966 ip_sioctl_subnet_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10967 ip_ioctl_cmd_t *ipip, void *if_req) 10968 { 10969 int addrlen; 10970 in6_addr_t v6addr; 10971 in6_addr_t v6mask; 10972 struct lifreq *lifr = (struct lifreq *)if_req; 10973 10974 ip1dbg(("ip_sioctl_subnet_restart(%s:%u %p)\n", 10975 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10976 (void) ipif_down_tail(ipif); 10977 10978 addrlen = lifr->lifr_addrlen; 10979 if (ipif->ipif_isv6) { 10980 sin6_t *sin6; 10981 10982 sin6 = (sin6_t *)sin; 10983 v6addr = sin6->sin6_addr; 10984 } else { 10985 ipaddr_t addr; 10986 10987 addr = sin->sin_addr.s_addr; 10988 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 10989 addrlen += IPV6_ABITS - IP_ABITS; 10990 } 10991 (void) ip_plen_to_mask_v6(addrlen, &v6mask); 10992 10993 return (ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, B_TRUE)); 10994 } 10995 10996 /* ARGSUSED */ 10997 int 10998 ip_sioctl_get_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10999 ip_ioctl_cmd_t *ipip, void *if_req) 11000 { 11001 struct lifreq *lifr = (struct lifreq *)if_req; 11002 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sin; 11003 11004 ip1dbg(("ip_sioctl_get_subnet(%s:%u %p)\n", 11005 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11006 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 11007 11008 if (ipif->ipif_isv6) { 11009 *sin6 = sin6_null; 11010 sin6->sin6_family = AF_INET6; 11011 sin6->sin6_addr = ipif->ipif_v6subnet; 11012 lifr->lifr_addrlen = 11013 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 11014 } else { 11015 *sin = sin_null; 11016 sin->sin_family = AF_INET; 11017 sin->sin_addr.s_addr = ipif->ipif_subnet; 11018 lifr->lifr_addrlen = ip_mask_to_plen(ipif->ipif_net_mask); 11019 } 11020 return (0); 11021 } 11022 11023 /* 11024 * Set the IPv6 address token. 
11025 */ 11026 /* ARGSUSED */ 11027 int 11028 ip_sioctl_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11029 ip_ioctl_cmd_t *ipi, void *if_req) 11030 { 11031 ill_t *ill = ipif->ipif_ill; 11032 int err; 11033 in6_addr_t v6addr; 11034 in6_addr_t v6mask; 11035 boolean_t need_up = B_FALSE; 11036 int i; 11037 sin6_t *sin6 = (sin6_t *)sin; 11038 struct lifreq *lifr = (struct lifreq *)if_req; 11039 int addrlen; 11040 11041 ip1dbg(("ip_sioctl_token(%s:%u %p)\n", 11042 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11043 ASSERT(IAM_WRITER_IPIF(ipif)); 11044 11045 addrlen = lifr->lifr_addrlen; 11046 /* Only allow for logical unit zero i.e. not on "le0:17" */ 11047 if (ipif->ipif_id != 0) 11048 return (EINVAL); 11049 11050 if (!ipif->ipif_isv6) 11051 return (EINVAL); 11052 11053 if (addrlen > IPV6_ABITS) 11054 return (EINVAL); 11055 11056 v6addr = sin6->sin6_addr; 11057 11058 /* 11059 * The length of the token is the length from the end. To get 11060 * the proper mask for this, compute the mask of the bits not 11061 * in the token; ie. the prefix, and then xor to get the mask. 
11062 */ 11063 if (ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask) == NULL) 11064 return (EINVAL); 11065 for (i = 0; i < 4; i++) { 11066 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 11067 } 11068 11069 if (V6_MASK_EQ(v6addr, v6mask, ill->ill_token) && 11070 ill->ill_token_length == addrlen) 11071 return (0); /* No change */ 11072 11073 if (ipif->ipif_flags & IPIF_UP) { 11074 err = ipif_logical_down(ipif, q, mp); 11075 if (err == EINPROGRESS) 11076 return (err); 11077 (void) ipif_down_tail(ipif); 11078 need_up = B_TRUE; 11079 } 11080 err = ip_sioctl_token_tail(ipif, sin6, addrlen, q, mp, need_up); 11081 return (err); 11082 } 11083 11084 static int 11085 ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, queue_t *q, 11086 mblk_t *mp, boolean_t need_up) 11087 { 11088 in6_addr_t v6addr; 11089 in6_addr_t v6mask; 11090 ill_t *ill = ipif->ipif_ill; 11091 int i; 11092 int err = 0; 11093 11094 ip1dbg(("ip_sioctl_token_tail(%s:%u %p)\n", 11095 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11096 v6addr = sin6->sin6_addr; 11097 /* 11098 * The length of the token is the length from the end. To get 11099 * the proper mask for this, compute the mask of the bits not 11100 * in the token; ie. the prefix, and then xor to get the mask. 11101 */ 11102 (void) ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask); 11103 for (i = 0; i < 4; i++) 11104 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 11105 11106 mutex_enter(&ill->ill_lock); 11107 V6_MASK_COPY(v6addr, v6mask, ill->ill_token); 11108 ill->ill_token_length = addrlen; 11109 ill->ill_manual_token = 1; 11110 11111 /* Reconfigure the link-local address based on this new token */ 11112 ipif_setlinklocal(ill->ill_ipif); 11113 11114 mutex_exit(&ill->ill_lock); 11115 11116 if (need_up) { 11117 /* 11118 * Now bring the interface back up. 
If this 11119 * is the only IPIF for the ILL, ipif_up 11120 * will have to re-bind to the device, so 11121 * we may get back EINPROGRESS, in which 11122 * case, this IOCTL will get completed in 11123 * ip_rput_dlpi when we see the DL_BIND_ACK. 11124 */ 11125 err = ipif_up(ipif, q, mp); 11126 if (err == EINPROGRESS) 11127 return (err); 11128 } 11129 return (err); 11130 } 11131 11132 /* ARGSUSED */ 11133 int 11134 ip_sioctl_get_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11135 ip_ioctl_cmd_t *ipi, void *if_req) 11136 { 11137 ill_t *ill; 11138 sin6_t *sin6 = (sin6_t *)sin; 11139 struct lifreq *lifr = (struct lifreq *)if_req; 11140 11141 ip1dbg(("ip_sioctl_get_token(%s:%u %p)\n", 11142 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11143 if (ipif->ipif_id != 0) 11144 return (EINVAL); 11145 11146 ill = ipif->ipif_ill; 11147 if (!ill->ill_isv6) 11148 return (ENXIO); 11149 11150 *sin6 = sin6_null; 11151 sin6->sin6_family = AF_INET6; 11152 ASSERT(!IN6_IS_ADDR_V4MAPPED(&ill->ill_token)); 11153 sin6->sin6_addr = ill->ill_token; 11154 lifr->lifr_addrlen = ill->ill_token_length; 11155 return (0); 11156 } 11157 11158 /* 11159 * Set (hardware) link specific information that might override 11160 * what was acquired through the DL_INFO_ACK. 11161 */ 11162 /* ARGSUSED */ 11163 int 11164 ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11165 ip_ioctl_cmd_t *ipi, void *if_req) 11166 { 11167 ill_t *ill = ipif->ipif_ill; 11168 int ip_min_mtu; 11169 struct lifreq *lifr = (struct lifreq *)if_req; 11170 lif_ifinfo_req_t *lir; 11171 11172 ip1dbg(("ip_sioctl_lnkinfo(%s:%u %p)\n", 11173 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11174 lir = &lifr->lifr_ifinfo; 11175 ASSERT(IAM_WRITER_IPIF(ipif)); 11176 11177 /* Only allow for logical unit zero i.e. not on "bge0:17" */ 11178 if (ipif->ipif_id != 0) 11179 return (EINVAL); 11180 11181 /* Set interface MTU. 
*/ 11182 if (ipif->ipif_isv6) 11183 ip_min_mtu = IPV6_MIN_MTU; 11184 else 11185 ip_min_mtu = IP_MIN_MTU; 11186 11187 /* 11188 * Verify values before we set anything. Allow zero to 11189 * mean unspecified. 11190 * 11191 * XXX We should be able to set the user-defined lir_mtu to some value 11192 * that is greater than ill_current_frag but less than ill_max_frag- the 11193 * ill_max_frag value tells us the max MTU that can be handled by the 11194 * datalink, whereas the ill_current_frag is dynamically computed for 11195 * some link-types like tunnels, based on the tunnel PMTU. However, 11196 * since there is currently no way of distinguishing between 11197 * administratively fixed link mtu values (e.g., those set via 11198 * /sbin/dladm) and dynamically discovered MTUs (e.g., those discovered 11199 * for tunnels) we conservatively choose the ill_current_frag as the 11200 * upper-bound. 11201 */ 11202 if (lir->lir_maxmtu != 0 && 11203 (lir->lir_maxmtu > ill->ill_current_frag || 11204 lir->lir_maxmtu < ip_min_mtu)) 11205 return (EINVAL); 11206 if (lir->lir_reachtime != 0 && 11207 lir->lir_reachtime > ND_MAX_REACHTIME) 11208 return (EINVAL); 11209 if (lir->lir_reachretrans != 0 && 11210 lir->lir_reachretrans > ND_MAX_REACHRETRANSTIME) 11211 return (EINVAL); 11212 11213 mutex_enter(&ill->ill_lock); 11214 /* 11215 * The dce and fragmentation code can handle changes to ill_mtu 11216 * concurrent with sending/fragmenting packets. 
11217 */ 11218 if (lir->lir_maxmtu != 0) 11219 ill->ill_user_mtu = lir->lir_maxmtu; 11220 11221 if (lir->lir_reachtime != 0) 11222 ill->ill_reachable_time = lir->lir_reachtime; 11223 11224 if (lir->lir_reachretrans != 0) 11225 ill->ill_reachable_retrans_time = lir->lir_reachretrans; 11226 11227 ill->ill_max_hops = lir->lir_maxhops; 11228 ill->ill_max_buf = ND_MAX_Q; 11229 if (!(ill->ill_flags & ILLF_FIXEDMTU) && ill->ill_user_mtu != 0) { 11230 /* 11231 * ill_mtu is the actual interface MTU, obtained as the min 11232 * of user-configured mtu and the value announced by the 11233 * driver (via DL_NOTE_SDU_SIZE/DL_INFO_ACK). Note that since 11234 * we have already made the choice of requiring 11235 * ill_user_mtu < ill_current_frag by the time we get here, 11236 * the ill_mtu effectively gets assigned to the ill_user_mtu 11237 * here. 11238 */ 11239 ill->ill_mtu = MIN(ill->ill_current_frag, ill->ill_user_mtu); 11240 } 11241 mutex_exit(&ill->ill_lock); 11242 11243 /* 11244 * Make sure all dce_generation checks find out 11245 * that ill_mtu has changed. 11246 */ 11247 if (!(ill->ill_flags & ILLF_FIXEDMTU) && (lir->lir_maxmtu != 0)) 11248 dce_increment_all_generations(ill->ill_isv6, ill->ill_ipst); 11249 11250 /* 11251 * Refresh IPMP meta-interface MTU if necessary. 
11252 */ 11253 if (IS_UNDER_IPMP(ill)) 11254 ipmp_illgrp_refresh_mtu(ill->ill_grp); 11255 11256 return (0); 11257 } 11258 11259 /* ARGSUSED */ 11260 int 11261 ip_sioctl_get_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11262 ip_ioctl_cmd_t *ipi, void *if_req) 11263 { 11264 struct lif_ifinfo_req *lir; 11265 ill_t *ill = ipif->ipif_ill; 11266 11267 ip1dbg(("ip_sioctl_get_lnkinfo(%s:%u %p)\n", 11268 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11269 if (ipif->ipif_id != 0) 11270 return (EINVAL); 11271 11272 lir = &((struct lifreq *)if_req)->lifr_ifinfo; 11273 lir->lir_maxhops = ill->ill_max_hops; 11274 lir->lir_reachtime = ill->ill_reachable_time; 11275 lir->lir_reachretrans = ill->ill_reachable_retrans_time; 11276 lir->lir_maxmtu = ill->ill_mtu; 11277 11278 return (0); 11279 } 11280 11281 /* 11282 * Return best guess as to the subnet mask for the specified address. 11283 * Based on the subnet masks for all the configured interfaces. 11284 * 11285 * We end up returning a zero mask in the case of default, multicast or 11286 * experimental. 11287 */ 11288 static ipaddr_t 11289 ip_subnet_mask(ipaddr_t addr, ipif_t **ipifp, ip_stack_t *ipst) 11290 { 11291 ipaddr_t net_mask; 11292 ill_t *ill; 11293 ipif_t *ipif; 11294 ill_walk_context_t ctx; 11295 ipif_t *fallback_ipif = NULL; 11296 11297 net_mask = ip_net_mask(addr); 11298 if (net_mask == 0) { 11299 *ipifp = NULL; 11300 return (0); 11301 } 11302 11303 /* Let's check to see if this is maybe a local subnet route. 
*/ 11304 /* this function only applies to IPv4 interfaces */ 11305 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 11306 ill = ILL_START_WALK_V4(&ctx, ipst); 11307 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 11308 mutex_enter(&ill->ill_lock); 11309 for (ipif = ill->ill_ipif; ipif != NULL; 11310 ipif = ipif->ipif_next) { 11311 if (IPIF_IS_CONDEMNED(ipif)) 11312 continue; 11313 if (!(ipif->ipif_flags & IPIF_UP)) 11314 continue; 11315 if ((ipif->ipif_subnet & net_mask) == 11316 (addr & net_mask)) { 11317 /* 11318 * Don't trust pt-pt interfaces if there are 11319 * other interfaces. 11320 */ 11321 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 11322 if (fallback_ipif == NULL) { 11323 ipif_refhold_locked(ipif); 11324 fallback_ipif = ipif; 11325 } 11326 continue; 11327 } 11328 11329 /* 11330 * Fine. Just assume the same net mask as the 11331 * directly attached subnet interface is using. 11332 */ 11333 ipif_refhold_locked(ipif); 11334 mutex_exit(&ill->ill_lock); 11335 rw_exit(&ipst->ips_ill_g_lock); 11336 if (fallback_ipif != NULL) 11337 ipif_refrele(fallback_ipif); 11338 *ipifp = ipif; 11339 return (ipif->ipif_net_mask); 11340 } 11341 } 11342 mutex_exit(&ill->ill_lock); 11343 } 11344 rw_exit(&ipst->ips_ill_g_lock); 11345 11346 *ipifp = fallback_ipif; 11347 return ((fallback_ipif != NULL) ? 11348 fallback_ipif->ipif_net_mask : net_mask); 11349 } 11350 11351 /* 11352 * ip_sioctl_copyin_setup calls ip_wput_ioctl to process the IP_IOCTL ioctl. 
11353 */ 11354 static void 11355 ip_wput_ioctl(queue_t *q, mblk_t *mp) 11356 { 11357 IOCP iocp; 11358 ipft_t *ipft; 11359 ipllc_t *ipllc; 11360 mblk_t *mp1; 11361 cred_t *cr; 11362 int error = 0; 11363 conn_t *connp; 11364 11365 ip1dbg(("ip_wput_ioctl")); 11366 iocp = (IOCP)mp->b_rptr; 11367 mp1 = mp->b_cont; 11368 if (mp1 == NULL) { 11369 iocp->ioc_error = EINVAL; 11370 mp->b_datap->db_type = M_IOCNAK; 11371 iocp->ioc_count = 0; 11372 qreply(q, mp); 11373 return; 11374 } 11375 11376 /* 11377 * These IOCTLs provide various control capabilities to 11378 * upstream agents such as ULPs and processes. There 11379 * are currently two such IOCTLs implemented. They 11380 * are used by TCP to provide update information for 11381 * existing IREs and to forcibly delete an IRE for a 11382 * host that is not responding, thereby forcing an 11383 * attempt at a new route. 11384 */ 11385 iocp->ioc_error = EINVAL; 11386 if (!pullupmsg(mp1, sizeof (ipllc->ipllc_cmd))) 11387 goto done; 11388 11389 ipllc = (ipllc_t *)mp1->b_rptr; 11390 for (ipft = ip_ioctl_ftbl; ipft->ipft_pfi; ipft++) { 11391 if (ipllc->ipllc_cmd == ipft->ipft_cmd) 11392 break; 11393 } 11394 /* 11395 * prefer credential from mblk over ioctl; 11396 * see ip_sioctl_copyin_setup 11397 */ 11398 cr = msg_getcred(mp, NULL); 11399 if (cr == NULL) 11400 cr = iocp->ioc_cr; 11401 11402 /* 11403 * Refhold the conn in case the request gets queued up in some lookup 11404 */ 11405 ASSERT(CONN_Q(q)); 11406 connp = Q_TO_CONN(q); 11407 CONN_INC_REF(connp); 11408 if (ipft->ipft_pfi && 11409 ((mp1->b_wptr - mp1->b_rptr) >= ipft->ipft_min_size || 11410 pullupmsg(mp1, ipft->ipft_min_size))) { 11411 error = (*ipft->ipft_pfi)(q, 11412 (ipft->ipft_flags & IPFT_F_SELF_REPLY) ? mp : mp1, cr); 11413 } 11414 if (ipft->ipft_flags & IPFT_F_SELF_REPLY) { 11415 /* 11416 * CONN_OPER_PENDING_DONE happens in the function called 11417 * through ipft_pfi above. 
11418 */ 11419 return; 11420 } 11421 11422 CONN_OPER_PENDING_DONE(connp); 11423 if (ipft->ipft_flags & IPFT_F_NO_REPLY) { 11424 freemsg(mp); 11425 return; 11426 } 11427 iocp->ioc_error = error; 11428 11429 done: 11430 mp->b_datap->db_type = M_IOCACK; 11431 if (iocp->ioc_error) 11432 iocp->ioc_count = 0; 11433 qreply(q, mp); 11434 } 11435 11436 /* 11437 * Assign a unique id for the ipif. This is used by sctp_addr.c 11438 * Note: remove if sctp_addr.c is redone to not shadow ill/ipif data structures. 11439 */ 11440 static void 11441 ipif_assign_seqid(ipif_t *ipif) 11442 { 11443 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 11444 11445 ipif->ipif_seqid = atomic_add_64_nv(&ipst->ips_ipif_g_seqid, 1); 11446 } 11447 11448 /* 11449 * Clone the contents of `sipif' to `dipif'. Requires that both ipifs are 11450 * administratively down (i.e., no DAD), of the same type, and locked. Note 11451 * that the clone is complete -- including the seqid -- and the expectation is 11452 * that the caller will either free or overwrite `sipif' before it's unlocked. 
11453 */ 11454 static void 11455 ipif_clone(const ipif_t *sipif, ipif_t *dipif) 11456 { 11457 ASSERT(MUTEX_HELD(&sipif->ipif_ill->ill_lock)); 11458 ASSERT(MUTEX_HELD(&dipif->ipif_ill->ill_lock)); 11459 ASSERT(!(sipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE))); 11460 ASSERT(!(dipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE))); 11461 ASSERT(sipif->ipif_ire_type == dipif->ipif_ire_type); 11462 11463 dipif->ipif_flags = sipif->ipif_flags; 11464 dipif->ipif_metric = sipif->ipif_metric; 11465 dipif->ipif_zoneid = sipif->ipif_zoneid; 11466 dipif->ipif_v6subnet = sipif->ipif_v6subnet; 11467 dipif->ipif_v6lcl_addr = sipif->ipif_v6lcl_addr; 11468 dipif->ipif_v6net_mask = sipif->ipif_v6net_mask; 11469 dipif->ipif_v6brd_addr = sipif->ipif_v6brd_addr; 11470 dipif->ipif_v6pp_dst_addr = sipif->ipif_v6pp_dst_addr; 11471 11472 /* 11473 * As per the comment atop the function, we assume that these sipif 11474 * fields will be changed before sipif is unlocked. 11475 */ 11476 dipif->ipif_seqid = sipif->ipif_seqid; 11477 dipif->ipif_state_flags = sipif->ipif_state_flags; 11478 } 11479 11480 /* 11481 * Transfer the contents of `sipif' to `dipif', and then free (if `virgipif' 11482 * is NULL) or overwrite `sipif' with `virgipif', which must be a virgin 11483 * (unreferenced) ipif. Also, if `sipif' is used by the current xop, then 11484 * transfer the xop to `dipif'. Requires that all ipifs are administratively 11485 * down (i.e., no DAD), of the same type, and unlocked. 11486 */ 11487 static void 11488 ipif_transfer(ipif_t *sipif, ipif_t *dipif, ipif_t *virgipif) 11489 { 11490 ipsq_t *ipsq = sipif->ipif_ill->ill_phyint->phyint_ipsq; 11491 ipxop_t *ipx = ipsq->ipsq_xop; 11492 11493 ASSERT(sipif != dipif); 11494 ASSERT(sipif != virgipif); 11495 11496 /* 11497 * Grab all of the locks that protect the ipif in a defined order. 
11498 */ 11499 GRAB_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill); 11500 11501 ipif_clone(sipif, dipif); 11502 if (virgipif != NULL) { 11503 ipif_clone(virgipif, sipif); 11504 mi_free(virgipif); 11505 } 11506 11507 RELEASE_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill); 11508 11509 /* 11510 * Transfer ownership of the current xop, if necessary. 11511 */ 11512 if (ipx->ipx_current_ipif == sipif) { 11513 ASSERT(ipx->ipx_pending_ipif == NULL); 11514 mutex_enter(&ipx->ipx_lock); 11515 ipx->ipx_current_ipif = dipif; 11516 mutex_exit(&ipx->ipx_lock); 11517 } 11518 11519 if (virgipif == NULL) 11520 mi_free(sipif); 11521 } 11522 11523 /* 11524 * checks if: 11525 * - <ill_name>:<ipif_id> is at most LIFNAMSIZ - 1 and 11526 * - logical interface is within the allowed range 11527 */ 11528 static int 11529 is_lifname_valid(ill_t *ill, unsigned int ipif_id) 11530 { 11531 if (snprintf(NULL, 0, "%s:%d", ill->ill_name, ipif_id) >= LIFNAMSIZ) 11532 return (ENAMETOOLONG); 11533 11534 if (ipif_id >= ill->ill_ipst->ips_ip_addrs_per_if) 11535 return (ERANGE); 11536 return (0); 11537 } 11538 11539 /* 11540 * Insert the ipif, so that the list of ipifs on the ill will be sorted 11541 * with respect to ipif_id. Note that an ipif with an ipif_id of -1 will 11542 * be inserted into the first space available in the list. The value of 11543 * ipif_id will then be set to the appropriate value for its position. 11544 */ 11545 static int 11546 ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock) 11547 { 11548 ill_t *ill; 11549 ipif_t *tipif; 11550 ipif_t **tipifp; 11551 int id, err; 11552 ip_stack_t *ipst; 11553 11554 ASSERT(ipif->ipif_ill->ill_net_type == IRE_LOOPBACK || 11555 IAM_WRITER_IPIF(ipif)); 11556 11557 ill = ipif->ipif_ill; 11558 ASSERT(ill != NULL); 11559 ipst = ill->ill_ipst; 11560 11561 /* 11562 * In the case of lo0:0 we already hold the ill_g_lock. 11563 * ill_lookup_on_name (acquires ill_g_lock) -> ipif_allocate -> 11564 * ipif_insert. 
11565 */ 11566 if (acquire_g_lock) 11567 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 11568 mutex_enter(&ill->ill_lock); 11569 id = ipif->ipif_id; 11570 tipifp = &(ill->ill_ipif); 11571 if (id == -1) { /* need to find a real id */ 11572 id = 0; 11573 while ((tipif = *tipifp) != NULL) { 11574 ASSERT(tipif->ipif_id >= id); 11575 if (tipif->ipif_id != id) 11576 break; /* non-consecutive id */ 11577 id++; 11578 tipifp = &(tipif->ipif_next); 11579 } 11580 if ((err = is_lifname_valid(ill, id)) != 0) { 11581 mutex_exit(&ill->ill_lock); 11582 if (acquire_g_lock) 11583 rw_exit(&ipst->ips_ill_g_lock); 11584 return (err); 11585 } 11586 ipif->ipif_id = id; /* assign new id */ 11587 } else if ((err = is_lifname_valid(ill, id)) == 0) { 11588 /* we have a real id; insert ipif in the right place */ 11589 while ((tipif = *tipifp) != NULL) { 11590 ASSERT(tipif->ipif_id != id); 11591 if (tipif->ipif_id > id) 11592 break; /* found correct location */ 11593 tipifp = &(tipif->ipif_next); 11594 } 11595 } else { 11596 mutex_exit(&ill->ill_lock); 11597 if (acquire_g_lock) 11598 rw_exit(&ipst->ips_ill_g_lock); 11599 return (err); 11600 } 11601 11602 ASSERT(tipifp != &(ill->ill_ipif) || id == 0); 11603 11604 ipif->ipif_next = tipif; 11605 *tipifp = ipif; 11606 mutex_exit(&ill->ill_lock); 11607 if (acquire_g_lock) 11608 rw_exit(&ipst->ips_ill_g_lock); 11609 11610 return (0); 11611 } 11612 11613 static void 11614 ipif_remove(ipif_t *ipif) 11615 { 11616 ipif_t **ipifp; 11617 ill_t *ill = ipif->ipif_ill; 11618 11619 ASSERT(RW_WRITE_HELD(&ill->ill_ipst->ips_ill_g_lock)); 11620 11621 mutex_enter(&ill->ill_lock); 11622 ipifp = &ill->ill_ipif; 11623 for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) { 11624 if (*ipifp == ipif) { 11625 *ipifp = ipif->ipif_next; 11626 break; 11627 } 11628 } 11629 mutex_exit(&ill->ill_lock); 11630 } 11631 11632 /* 11633 * Allocate and initialize a new interface control structure. (Always 11634 * called as writer.) 
11635 * When ipif_allocate() is called from ip_ll_subnet_defaults, the ill 11636 * is not part of the global linked list of ills. ipif_seqid is unique 11637 * in the system and to preserve the uniqueness, it is assigned only 11638 * when ill becomes part of the global list. At that point ill will 11639 * have a name. If it doesn't get assigned here, it will get assigned 11640 * in ipif_set_values() as part of SIOCSLIFNAME processing. 11641 * Aditionally, if we come here from ip_ll_subnet_defaults, we don't set 11642 * the interface flags or any other information from the DL_INFO_ACK for 11643 * DL_STYLE2 drivers (initialize == B_FALSE), since we won't have them at 11644 * this point. The flags etc. will be set in ip_ll_subnet_defaults when the 11645 * second DL_INFO_ACK comes in from the driver. 11646 */ 11647 static ipif_t * 11648 ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize, 11649 boolean_t insert, int *errorp) 11650 { 11651 int err; 11652 ipif_t *ipif; 11653 ip_stack_t *ipst = ill->ill_ipst; 11654 11655 ip1dbg(("ipif_allocate(%s:%d ill %p)\n", 11656 ill->ill_name, id, (void *)ill)); 11657 ASSERT(ire_type == IRE_LOOPBACK || IAM_WRITER_ILL(ill)); 11658 11659 if (errorp != NULL) 11660 *errorp = 0; 11661 11662 if ((ipif = mi_alloc(sizeof (ipif_t), BPRI_MED)) == NULL) { 11663 if (errorp != NULL) 11664 *errorp = ENOMEM; 11665 return (NULL); 11666 } 11667 *ipif = ipif_zero; /* start clean */ 11668 11669 ipif->ipif_ill = ill; 11670 ipif->ipif_id = id; /* could be -1 */ 11671 /* 11672 * Inherit the zoneid from the ill; for the shared stack instance 11673 * this is always the global zone 11674 */ 11675 ipif->ipif_zoneid = ill->ill_zoneid; 11676 11677 ipif->ipif_refcnt = 0; 11678 11679 if (insert) { 11680 if ((err = ipif_insert(ipif, ire_type != IRE_LOOPBACK)) != 0) { 11681 mi_free(ipif); 11682 if (errorp != NULL) 11683 *errorp = err; 11684 return (NULL); 11685 } 11686 /* -1 id should have been replaced by real id */ 11687 id = ipif->ipif_id; 
11688 ASSERT(id >= 0); 11689 } 11690 11691 if (ill->ill_name[0] != '\0') 11692 ipif_assign_seqid(ipif); 11693 11694 /* 11695 * If this is the zeroth ipif on the IPMP ill, create the illgrp 11696 * (which must not exist yet because the zeroth ipif is created once 11697 * per ill). However, do not not link it to the ipmp_grp_t until 11698 * I_PLINK is called; see ip_sioctl_plink_ipmp() for details. 11699 */ 11700 if (id == 0 && IS_IPMP(ill)) { 11701 if (ipmp_illgrp_create(ill) == NULL) { 11702 if (insert) { 11703 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 11704 ipif_remove(ipif); 11705 rw_exit(&ipst->ips_ill_g_lock); 11706 } 11707 mi_free(ipif); 11708 if (errorp != NULL) 11709 *errorp = ENOMEM; 11710 return (NULL); 11711 } 11712 } 11713 11714 /* 11715 * We grab ill_lock to protect the flag changes. The ipif is still 11716 * not up and can't be looked up until the ioctl completes and the 11717 * IPIF_CHANGING flag is cleared. 11718 */ 11719 mutex_enter(&ill->ill_lock); 11720 11721 ipif->ipif_ire_type = ire_type; 11722 11723 if (ipif->ipif_isv6) { 11724 ill->ill_flags |= ILLF_IPV6; 11725 } else { 11726 ipaddr_t inaddr_any = INADDR_ANY; 11727 11728 ill->ill_flags |= ILLF_IPV4; 11729 11730 /* Keep the IN6_IS_ADDR_V4MAPPED assertions happy */ 11731 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 11732 &ipif->ipif_v6lcl_addr); 11733 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 11734 &ipif->ipif_v6subnet); 11735 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 11736 &ipif->ipif_v6net_mask); 11737 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 11738 &ipif->ipif_v6brd_addr); 11739 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 11740 &ipif->ipif_v6pp_dst_addr); 11741 } 11742 11743 /* 11744 * Don't set the interface flags etc. now, will do it in 11745 * ip_ll_subnet_defaults. 
11746 */ 11747 if (!initialize) 11748 goto out; 11749 11750 /* 11751 * NOTE: The IPMP meta-interface is special-cased because it starts 11752 * with no underlying interfaces (and thus an unknown broadcast 11753 * address length), but all interfaces that can be placed into an IPMP 11754 * group are required to be broadcast-capable. 11755 */ 11756 if (ill->ill_bcast_addr_length != 0 || IS_IPMP(ill)) { 11757 /* 11758 * Later detect lack of DLPI driver multicast capability by 11759 * catching DL_ENABMULTI_REQ errors in ip_rput_dlpi(). 11760 */ 11761 ill->ill_flags |= ILLF_MULTICAST; 11762 if (!ipif->ipif_isv6) 11763 ipif->ipif_flags |= IPIF_BROADCAST; 11764 } else { 11765 if (ill->ill_net_type != IRE_LOOPBACK) { 11766 if (ipif->ipif_isv6) 11767 /* 11768 * Note: xresolv interfaces will eventually need 11769 * NOARP set here as well, but that will require 11770 * those external resolvers to have some 11771 * knowledge of that flag and act appropriately. 11772 * Not to be changed at present. 11773 */ 11774 ill->ill_flags |= ILLF_NONUD; 11775 else 11776 ill->ill_flags |= ILLF_NOARP; 11777 } 11778 if (ill->ill_phys_addr_length == 0) { 11779 if (IS_VNI(ill)) { 11780 ipif->ipif_flags |= IPIF_NOXMIT; 11781 } else { 11782 /* pt-pt supports multicast. */ 11783 ill->ill_flags |= ILLF_MULTICAST; 11784 if (ill->ill_net_type != IRE_LOOPBACK) 11785 ipif->ipif_flags |= IPIF_POINTOPOINT; 11786 } 11787 } 11788 } 11789 out: 11790 mutex_exit(&ill->ill_lock); 11791 return (ipif); 11792 } 11793 11794 /* 11795 * Remove the neighbor cache entries associated with this logical 11796 * interface. 
11797 */ 11798 int 11799 ipif_arp_down(ipif_t *ipif) 11800 { 11801 ill_t *ill = ipif->ipif_ill; 11802 int err = 0; 11803 11804 ip1dbg(("ipif_arp_down(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 11805 ASSERT(IAM_WRITER_IPIF(ipif)); 11806 11807 DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_down", 11808 ill_t *, ill, ipif_t *, ipif); 11809 ipif_nce_down(ipif); 11810 11811 /* 11812 * If this is the last ipif that is going down and there are no 11813 * duplicate addresses we may yet attempt to re-probe, then we need to 11814 * clean up ARP completely. 11815 */ 11816 if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 && 11817 !ill->ill_logical_down && ill->ill_net_type == IRE_IF_RESOLVER) { 11818 /* 11819 * If this was the last ipif on an IPMP interface, purge any 11820 * static ARP entries associated with it. 11821 */ 11822 if (IS_IPMP(ill)) 11823 ipmp_illgrp_refresh_arpent(ill->ill_grp); 11824 11825 /* UNBIND, DETACH */ 11826 err = arp_ll_down(ill); 11827 } 11828 11829 return (err); 11830 } 11831 11832 /* 11833 * Get the resolver set up for a new IP address. (Always called as writer.) 11834 * Called both for IPv4 and IPv6 interfaces, though it only does some 11835 * basic DAD related initialization for IPv6. Honors ILLF_NOARP. 11836 * 11837 * The enumerated value res_act tunes the behavior: 11838 * * Res_act_initial: set up all the resolver structures for a new 11839 * IP address. 11840 * * Res_act_defend: tell ARP that it needs to send a single gratuitous 11841 * ARP message in defense of the address. 11842 * * Res_act_rebind: tell ARP to change the hardware address for an IP 11843 * address (and issue gratuitous ARPs). Used by ipmp_ill_bind_ipif(). 11844 * 11845 * Returns zero on success, or an errno upon failure. 
11846 */ 11847 int 11848 ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) 11849 { 11850 ill_t *ill = ipif->ipif_ill; 11851 int err; 11852 boolean_t was_dup; 11853 11854 ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n", 11855 ill->ill_name, ipif->ipif_id, (uint_t)ipif->ipif_flags)); 11856 ASSERT(IAM_WRITER_IPIF(ipif)); 11857 11858 was_dup = B_FALSE; 11859 if (res_act == Res_act_initial) { 11860 ipif->ipif_addr_ready = 0; 11861 /* 11862 * We're bringing an interface up here. There's no way that we 11863 * should need to shut down ARP now. 11864 */ 11865 mutex_enter(&ill->ill_lock); 11866 if (ipif->ipif_flags & IPIF_DUPLICATE) { 11867 ipif->ipif_flags &= ~IPIF_DUPLICATE; 11868 ill->ill_ipif_dup_count--; 11869 was_dup = B_TRUE; 11870 } 11871 mutex_exit(&ill->ill_lock); 11872 } 11873 if (ipif->ipif_recovery_id != 0) 11874 (void) untimeout(ipif->ipif_recovery_id); 11875 ipif->ipif_recovery_id = 0; 11876 if (ill->ill_net_type != IRE_IF_RESOLVER) { 11877 ipif->ipif_addr_ready = 1; 11878 return (0); 11879 } 11880 /* NDP will set the ipif_addr_ready flag when it's ready */ 11881 if (ill->ill_isv6) 11882 return (0); 11883 11884 err = ipif_arp_up(ipif, res_act, was_dup); 11885 return (err); 11886 } 11887 11888 /* 11889 * This routine restarts IPv4/IPv6 duplicate address detection (DAD) 11890 * when a link has just gone back up. 11891 */ 11892 static void 11893 ipif_nce_start_dad(ipif_t *ipif) 11894 { 11895 ncec_t *ncec; 11896 ill_t *ill = ipif->ipif_ill; 11897 boolean_t isv6 = ill->ill_isv6; 11898 11899 if (isv6) { 11900 ncec = ncec_lookup_illgrp_v6(ipif->ipif_ill, 11901 &ipif->ipif_v6lcl_addr); 11902 } else { 11903 ipaddr_t v4addr; 11904 11905 if (ill->ill_net_type != IRE_IF_RESOLVER || 11906 (ipif->ipif_flags & IPIF_UNNUMBERED) || 11907 ipif->ipif_lcl_addr == INADDR_ANY) { 11908 /* 11909 * If we can't contact ARP for some reason, 11910 * that's not really a problem. 
Just send 11911 * out the routing socket notification that 11912 * DAD completion would have done, and continue. 11913 */ 11914 ipif_mask_reply(ipif); 11915 ipif_up_notify(ipif); 11916 ipif->ipif_addr_ready = 1; 11917 return; 11918 } 11919 11920 IN6_V4MAPPED_TO_IPADDR(&ipif->ipif_v6lcl_addr, v4addr); 11921 ncec = ncec_lookup_illgrp_v4(ipif->ipif_ill, &v4addr); 11922 } 11923 11924 if (ncec == NULL) { 11925 ip1dbg(("couldn't find ncec for ipif %p leaving !ready\n", 11926 (void *)ipif)); 11927 return; 11928 } 11929 if (!nce_restart_dad(ncec)) { 11930 /* 11931 * If we can't restart DAD for some reason, that's not really a 11932 * problem. Just send out the routing socket notification that 11933 * DAD completion would have done, and continue. 11934 */ 11935 ipif_up_notify(ipif); 11936 ipif->ipif_addr_ready = 1; 11937 } 11938 ncec_refrele(ncec); 11939 } 11940 11941 /* 11942 * Restart duplicate address detection on all interfaces on the given ill. 11943 * 11944 * This is called when an interface transitions from down to up 11945 * (DL_NOTE_LINK_UP) or up to down (DL_NOTE_LINK_DOWN). 11946 * 11947 * Note that since the underlying physical link has transitioned, we must cause 11948 * at least one routing socket message to be sent here, either via DAD 11949 * completion or just by default on the first ipif. (If we don't do this, then 11950 * in.mpathd will see long delays when doing link-based failure recovery.) 11951 */ 11952 void 11953 ill_restart_dad(ill_t *ill, boolean_t went_up) 11954 { 11955 ipif_t *ipif; 11956 11957 if (ill == NULL) 11958 return; 11959 11960 /* 11961 * If layer two doesn't support duplicate address detection, then just 11962 * send the routing socket message now and be done with it. 
11963 */ 11964 if (!ill->ill_isv6 && arp_no_defense) { 11965 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); 11966 return; 11967 } 11968 11969 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 11970 if (went_up) { 11971 11972 if (ipif->ipif_flags & IPIF_UP) { 11973 ipif_nce_start_dad(ipif); 11974 } else if (ipif->ipif_flags & IPIF_DUPLICATE) { 11975 /* 11976 * kick off the bring-up process now. 11977 */ 11978 ipif_do_recovery(ipif); 11979 } else { 11980 /* 11981 * Unfortunately, the first ipif is "special" 11982 * and represents the underlying ill in the 11983 * routing socket messages. Thus, when this 11984 * one ipif is down, we must still notify so 11985 * that the user knows the IFF_RUNNING status 11986 * change. (If the first ipif is up, then 11987 * we'll handle eventual routing socket 11988 * notification via DAD completion.) 11989 */ 11990 if (ipif == ill->ill_ipif) { 11991 ip_rts_ifmsg(ill->ill_ipif, 11992 RTSQ_DEFAULT); 11993 } 11994 } 11995 } else { 11996 /* 11997 * After link down, we'll need to send a new routing 11998 * message when the link comes back, so clear 11999 * ipif_addr_ready. 12000 */ 12001 ipif->ipif_addr_ready = 0; 12002 } 12003 } 12004 12005 /* 12006 * If we've torn down links, then notify the user right away. 
12007 */ 12008 if (!went_up) 12009 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); 12010 } 12011 12012 static void 12013 ipsq_delete(ipsq_t *ipsq) 12014 { 12015 ipxop_t *ipx = ipsq->ipsq_xop; 12016 12017 ipsq->ipsq_ipst = NULL; 12018 ASSERT(ipsq->ipsq_phyint == NULL); 12019 ASSERT(ipsq->ipsq_xop != NULL); 12020 ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipx->ipx_mphead == NULL); 12021 ASSERT(ipx->ipx_pending_mp == NULL); 12022 kmem_free(ipsq, sizeof (ipsq_t)); 12023 } 12024 12025 static int 12026 ill_up_ipifs_on_ill(ill_t *ill, queue_t *q, mblk_t *mp) 12027 { 12028 int err = 0; 12029 ipif_t *ipif; 12030 12031 if (ill == NULL) 12032 return (0); 12033 12034 ASSERT(IAM_WRITER_ILL(ill)); 12035 ill->ill_up_ipifs = B_TRUE; 12036 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 12037 if (ipif->ipif_was_up) { 12038 if (!(ipif->ipif_flags & IPIF_UP)) 12039 err = ipif_up(ipif, q, mp); 12040 ipif->ipif_was_up = B_FALSE; 12041 if (err != 0) { 12042 ASSERT(err == EINPROGRESS); 12043 return (err); 12044 } 12045 } 12046 } 12047 ill->ill_up_ipifs = B_FALSE; 12048 return (0); 12049 } 12050 12051 /* 12052 * This function is called to bring up all the ipifs that were up before 12053 * bringing the ill down via ill_down_ipifs(). 12054 */ 12055 int 12056 ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp) 12057 { 12058 int err; 12059 12060 ASSERT(IAM_WRITER_ILL(ill)); 12061 12062 if (ill->ill_replumbing) { 12063 ill->ill_replumbing = 0; 12064 /* 12065 * Send down REPLUMB_DONE notification followed by the 12066 * BIND_REQ on the arp stream. 12067 */ 12068 if (!ill->ill_isv6) 12069 arp_send_replumb_conf(ill); 12070 } 12071 err = ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv4, q, mp); 12072 if (err != 0) 12073 return (err); 12074 12075 return (ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv6, q, mp)); 12076 } 12077 12078 /* 12079 * Bring down any IPIF_UP ipifs on ill. If "logical" is B_TRUE, we bring 12080 * down the ipifs without sending DL_UNBIND_REQ to the driver. 
12081 */ 12082 static void 12083 ill_down_ipifs(ill_t *ill, boolean_t logical) 12084 { 12085 ipif_t *ipif; 12086 12087 ASSERT(IAM_WRITER_ILL(ill)); 12088 12089 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 12090 /* 12091 * We go through the ipif_down logic even if the ipif 12092 * is already down, since routes can be added based 12093 * on down ipifs. Going through ipif_down once again 12094 * will delete any IREs created based on these routes. 12095 */ 12096 if (ipif->ipif_flags & IPIF_UP) 12097 ipif->ipif_was_up = B_TRUE; 12098 12099 if (logical) { 12100 (void) ipif_logical_down(ipif, NULL, NULL); 12101 ipif_non_duplicate(ipif); 12102 (void) ipif_down_tail(ipif); 12103 } else { 12104 (void) ipif_down(ipif, NULL, NULL); 12105 } 12106 } 12107 } 12108 12109 /* 12110 * Redo source address selection. This makes IXAF_VERIFY_SOURCE take 12111 * a look again at valid source addresses. 12112 * This should be called each time after the set of source addresses has been 12113 * changed. 12114 */ 12115 void 12116 ip_update_source_selection(ip_stack_t *ipst) 12117 { 12118 /* We skip past SRC_GENERATION_VERIFY */ 12119 if (atomic_add_32_nv(&ipst->ips_src_generation, 1) == 12120 SRC_GENERATION_VERIFY) 12121 atomic_add_32(&ipst->ips_src_generation, 1); 12122 } 12123 12124 /* 12125 * Finish the group join started in ip_sioctl_groupname(). 
12126 */ 12127 /* ARGSUSED */ 12128 static void 12129 ip_join_illgrps(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy) 12130 { 12131 ill_t *ill = q->q_ptr; 12132 phyint_t *phyi = ill->ill_phyint; 12133 ipmp_grp_t *grp = phyi->phyint_grp; 12134 ip_stack_t *ipst = ill->ill_ipst; 12135 12136 /* IS_UNDER_IPMP() won't work until ipmp_ill_join_illgrp() is called */ 12137 ASSERT(!IS_IPMP(ill) && grp != NULL); 12138 ASSERT(IAM_WRITER_IPSQ(ipsq)); 12139 12140 if (phyi->phyint_illv4 != NULL) { 12141 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 12142 VERIFY(grp->gr_pendv4-- > 0); 12143 rw_exit(&ipst->ips_ipmp_lock); 12144 ipmp_ill_join_illgrp(phyi->phyint_illv4, grp->gr_v4); 12145 } 12146 if (phyi->phyint_illv6 != NULL) { 12147 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 12148 VERIFY(grp->gr_pendv6-- > 0); 12149 rw_exit(&ipst->ips_ipmp_lock); 12150 ipmp_ill_join_illgrp(phyi->phyint_illv6, grp->gr_v6); 12151 } 12152 freemsg(mp); 12153 } 12154 12155 /* 12156 * Process an SIOCSLIFGROUPNAME request. 12157 */ 12158 /* ARGSUSED */ 12159 int 12160 ip_sioctl_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12161 ip_ioctl_cmd_t *ipip, void *ifreq) 12162 { 12163 struct lifreq *lifr = ifreq; 12164 ill_t *ill = ipif->ipif_ill; 12165 ip_stack_t *ipst = ill->ill_ipst; 12166 phyint_t *phyi = ill->ill_phyint; 12167 ipmp_grp_t *grp = phyi->phyint_grp; 12168 mblk_t *ipsq_mp; 12169 int err = 0; 12170 12171 /* 12172 * Note that phyint_grp can only change here, where we're exclusive. 12173 */ 12174 ASSERT(IAM_WRITER_ILL(ill)); 12175 12176 if (ipif->ipif_id != 0 || ill->ill_usesrc_grp_next != NULL || 12177 (phyi->phyint_flags & PHYI_VIRTUAL)) 12178 return (EINVAL); 12179 12180 lifr->lifr_groupname[LIFGRNAMSIZ - 1] = '\0'; 12181 12182 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 12183 12184 /* 12185 * If the name hasn't changed, there's nothing to do. 
12186 */ 12187 if (grp != NULL && strcmp(grp->gr_name, lifr->lifr_groupname) == 0) 12188 goto unlock; 12189 12190 /* 12191 * Handle requests to rename an IPMP meta-interface. 12192 * 12193 * Note that creation of the IPMP meta-interface is handled in 12194 * userland through the standard plumbing sequence. As part of the 12195 * plumbing the IPMP meta-interface, its initial groupname is set to 12196 * the name of the interface (see ipif_set_values_tail()). 12197 */ 12198 if (IS_IPMP(ill)) { 12199 err = ipmp_grp_rename(grp, lifr->lifr_groupname); 12200 goto unlock; 12201 } 12202 12203 /* 12204 * Handle requests to add or remove an IP interface from a group. 12205 */ 12206 if (lifr->lifr_groupname[0] != '\0') { /* add */ 12207 /* 12208 * Moves are handled by first removing the interface from 12209 * its existing group, and then adding it to another group. 12210 * So, fail if it's already in a group. 12211 */ 12212 if (IS_UNDER_IPMP(ill)) { 12213 err = EALREADY; 12214 goto unlock; 12215 } 12216 12217 grp = ipmp_grp_lookup(lifr->lifr_groupname, ipst); 12218 if (grp == NULL) { 12219 err = ENOENT; 12220 goto unlock; 12221 } 12222 12223 /* 12224 * Check if the phyint and its ills are suitable for 12225 * inclusion into the group. 12226 */ 12227 if ((err = ipmp_grp_vet_phyint(grp, phyi)) != 0) 12228 goto unlock; 12229 12230 /* 12231 * Checks pass; join the group, and enqueue the remaining 12232 * illgrp joins for when we've become part of the group xop 12233 * and are exclusive across its IPSQs. Since qwriter_ip() 12234 * requires an mblk_t to scribble on, and since `mp' will be 12235 * freed as part of completing the ioctl, allocate another. 12236 */ 12237 if ((ipsq_mp = allocb(0, BPRI_MED)) == NULL) { 12238 err = ENOMEM; 12239 goto unlock; 12240 } 12241 12242 /* 12243 * Before we drop ipmp_lock, bump gr_pend* to ensure that the 12244 * IPMP meta-interface ills needed by `phyi' cannot go away 12245 * before ip_join_illgrps() is called back. 
See the comments 12246 * in ip_sioctl_plink_ipmp() for more. 12247 */ 12248 if (phyi->phyint_illv4 != NULL) 12249 grp->gr_pendv4++; 12250 if (phyi->phyint_illv6 != NULL) 12251 grp->gr_pendv6++; 12252 12253 rw_exit(&ipst->ips_ipmp_lock); 12254 12255 ipmp_phyint_join_grp(phyi, grp); 12256 ill_refhold(ill); 12257 qwriter_ip(ill, ill->ill_rq, ipsq_mp, ip_join_illgrps, 12258 SWITCH_OP, B_FALSE); 12259 return (0); 12260 } else { 12261 /* 12262 * Request to remove the interface from a group. If the 12263 * interface is not in a group, this trivially succeeds. 12264 */ 12265 rw_exit(&ipst->ips_ipmp_lock); 12266 if (IS_UNDER_IPMP(ill)) 12267 ipmp_phyint_leave_grp(phyi); 12268 return (0); 12269 } 12270 unlock: 12271 rw_exit(&ipst->ips_ipmp_lock); 12272 return (err); 12273 } 12274 12275 /* 12276 * Process an SIOCGLIFBINDING request. 12277 */ 12278 /* ARGSUSED */ 12279 int 12280 ip_sioctl_get_binding(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12281 ip_ioctl_cmd_t *ipip, void *ifreq) 12282 { 12283 ill_t *ill; 12284 struct lifreq *lifr = ifreq; 12285 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 12286 12287 if (!IS_IPMP(ipif->ipif_ill)) 12288 return (EINVAL); 12289 12290 rw_enter(&ipst->ips_ipmp_lock, RW_READER); 12291 if ((ill = ipif->ipif_bound_ill) == NULL) 12292 lifr->lifr_binding[0] = '\0'; 12293 else 12294 (void) strlcpy(lifr->lifr_binding, ill->ill_name, LIFNAMSIZ); 12295 rw_exit(&ipst->ips_ipmp_lock); 12296 return (0); 12297 } 12298 12299 /* 12300 * Process an SIOCGLIFGROUPNAME request. 
12301 */ 12302 /* ARGSUSED */ 12303 int 12304 ip_sioctl_get_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12305 ip_ioctl_cmd_t *ipip, void *ifreq) 12306 { 12307 ipmp_grp_t *grp; 12308 struct lifreq *lifr = ifreq; 12309 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 12310 12311 rw_enter(&ipst->ips_ipmp_lock, RW_READER); 12312 if ((grp = ipif->ipif_ill->ill_phyint->phyint_grp) == NULL) 12313 lifr->lifr_groupname[0] = '\0'; 12314 else 12315 (void) strlcpy(lifr->lifr_groupname, grp->gr_name, LIFGRNAMSIZ); 12316 rw_exit(&ipst->ips_ipmp_lock); 12317 return (0); 12318 } 12319 12320 /* 12321 * Process an SIOCGLIFGROUPINFO request. 12322 */ 12323 /* ARGSUSED */ 12324 int 12325 ip_sioctl_groupinfo(ipif_t *dummy_ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12326 ip_ioctl_cmd_t *ipip, void *dummy) 12327 { 12328 ipmp_grp_t *grp; 12329 lifgroupinfo_t *lifgr; 12330 ip_stack_t *ipst = CONNQ_TO_IPST(q); 12331 12332 /* ip_wput_nondata() verified mp->b_cont->b_cont */ 12333 lifgr = (lifgroupinfo_t *)mp->b_cont->b_cont->b_rptr; 12334 lifgr->gi_grname[LIFGRNAMSIZ - 1] = '\0'; 12335 12336 rw_enter(&ipst->ips_ipmp_lock, RW_READER); 12337 if ((grp = ipmp_grp_lookup(lifgr->gi_grname, ipst)) == NULL) { 12338 rw_exit(&ipst->ips_ipmp_lock); 12339 return (ENOENT); 12340 } 12341 ipmp_grp_info(grp, lifgr); 12342 rw_exit(&ipst->ips_ipmp_lock); 12343 return (0); 12344 } 12345 12346 static void 12347 ill_dl_down(ill_t *ill) 12348 { 12349 DTRACE_PROBE2(ill__downup, char *, "ill_dl_down", ill_t *, ill); 12350 12351 /* 12352 * The ill is down; unbind but stay attached since we're still 12353 * associated with a PPA. If we have negotiated DLPI capabilites 12354 * with the data link service provider (IDS_OK) then reset them. 12355 * The interval between unbinding and rebinding is potentially 12356 * unbounded hence we cannot assume things will be the same. 12357 * The DLPI capabilities will be probed again when the data link 12358 * is brought up. 
12359 */ 12360 mblk_t *mp = ill->ill_unbind_mp; 12361 12362 ip1dbg(("ill_dl_down(%s)\n", ill->ill_name)); 12363 12364 if (!ill->ill_replumbing) { 12365 /* Free all ilms for this ill */ 12366 update_conn_ill(ill, ill->ill_ipst); 12367 } else { 12368 ill_leave_multicast(ill); 12369 } 12370 12371 ill->ill_unbind_mp = NULL; 12372 if (mp != NULL) { 12373 ip1dbg(("ill_dl_down: %s (%u) for %s\n", 12374 dl_primstr(*(int *)mp->b_rptr), *(int *)mp->b_rptr, 12375 ill->ill_name)); 12376 mutex_enter(&ill->ill_lock); 12377 ill->ill_state_flags |= ILL_DL_UNBIND_IN_PROGRESS; 12378 mutex_exit(&ill->ill_lock); 12379 /* 12380 * ip_rput does not pass up normal (M_PROTO) DLPI messages 12381 * after ILL_CONDEMNED is set. So in the unplumb case, we call 12382 * ill_capability_dld_disable disable rightaway. If this is not 12383 * an unplumb operation then the disable happens on receipt of 12384 * the capab ack via ip_rput_dlpi_writer -> 12385 * ill_capability_ack_thr. In both cases the order of 12386 * the operations seen by DLD is capability disable followed 12387 * by DL_UNBIND. Also the DLD capability disable needs a 12388 * cv_wait'able context. 
12389 */ 12390 if (ill->ill_state_flags & ILL_CONDEMNED) 12391 ill_capability_dld_disable(ill); 12392 ill_capability_reset(ill, B_FALSE); 12393 ill_dlpi_send(ill, mp); 12394 } 12395 mutex_enter(&ill->ill_lock); 12396 ill->ill_dl_up = 0; 12397 ill_nic_event_dispatch(ill, 0, NE_DOWN, NULL, 0); 12398 mutex_exit(&ill->ill_lock); 12399 } 12400 12401 void 12402 ill_dlpi_dispatch(ill_t *ill, mblk_t *mp) 12403 { 12404 union DL_primitives *dlp; 12405 t_uscalar_t prim; 12406 boolean_t waitack = B_FALSE; 12407 12408 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); 12409 12410 dlp = (union DL_primitives *)mp->b_rptr; 12411 prim = dlp->dl_primitive; 12412 12413 ip1dbg(("ill_dlpi_dispatch: sending %s (%u) to %s\n", 12414 dl_primstr(prim), prim, ill->ill_name)); 12415 12416 switch (prim) { 12417 case DL_PHYS_ADDR_REQ: 12418 { 12419 dl_phys_addr_req_t *dlpap = (dl_phys_addr_req_t *)mp->b_rptr; 12420 ill->ill_phys_addr_pend = dlpap->dl_addr_type; 12421 break; 12422 } 12423 case DL_BIND_REQ: 12424 mutex_enter(&ill->ill_lock); 12425 ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS; 12426 mutex_exit(&ill->ill_lock); 12427 break; 12428 } 12429 12430 /* 12431 * Except for the ACKs for the M_PCPROTO messages, all other ACKs 12432 * are dropped by ip_rput() if ILL_CONDEMNED is set. Therefore 12433 * we only wait for the ACK of the DL_UNBIND_REQ. 12434 */ 12435 mutex_enter(&ill->ill_lock); 12436 if (!(ill->ill_state_flags & ILL_CONDEMNED) || 12437 (prim == DL_UNBIND_REQ)) { 12438 ill->ill_dlpi_pending = prim; 12439 waitack = B_TRUE; 12440 } 12441 12442 mutex_exit(&ill->ill_lock); 12443 DTRACE_PROBE3(ill__dlpi, char *, "ill_dlpi_dispatch", 12444 char *, dl_primstr(prim), ill_t *, ill); 12445 putnext(ill->ill_wq, mp); 12446 12447 /* 12448 * There is no ack for DL_NOTIFY_CONF messages 12449 */ 12450 if (waitack && prim == DL_NOTIFY_CONF) 12451 ill_dlpi_done(ill, prim); 12452 } 12453 12454 /* 12455 * Helper function for ill_dlpi_send(). 
12456 */ 12457 /* ARGSUSED */ 12458 static void 12459 ill_dlpi_send_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) 12460 { 12461 ill_dlpi_send(q->q_ptr, mp); 12462 } 12463 12464 /* 12465 * Send a DLPI control message to the driver but make sure there 12466 * is only one outstanding message. Uses ill_dlpi_pending to tell 12467 * when it must queue. ip_rput_dlpi_writer calls ill_dlpi_done() 12468 * when an ACK or a NAK is received to process the next queued message. 12469 */ 12470 void 12471 ill_dlpi_send(ill_t *ill, mblk_t *mp) 12472 { 12473 mblk_t **mpp; 12474 12475 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); 12476 12477 /* 12478 * To ensure that any DLPI requests for current exclusive operation 12479 * are always completely sent before any DLPI messages for other 12480 * operations, require writer access before enqueuing. 12481 */ 12482 if (!IAM_WRITER_ILL(ill)) { 12483 ill_refhold(ill); 12484 /* qwriter_ip() does the ill_refrele() */ 12485 qwriter_ip(ill, ill->ill_wq, mp, ill_dlpi_send_writer, 12486 NEW_OP, B_TRUE); 12487 return; 12488 } 12489 12490 mutex_enter(&ill->ill_lock); 12491 if (ill->ill_dlpi_pending != DL_PRIM_INVAL) { 12492 /* Must queue message. 
Tail insertion */ 12493 mpp = &ill->ill_dlpi_deferred; 12494 while (*mpp != NULL) 12495 mpp = &((*mpp)->b_next); 12496 12497 ip1dbg(("ill_dlpi_send: deferring request for %s " 12498 "while %s pending\n", ill->ill_name, 12499 dl_primstr(ill->ill_dlpi_pending))); 12500 12501 *mpp = mp; 12502 mutex_exit(&ill->ill_lock); 12503 return; 12504 } 12505 mutex_exit(&ill->ill_lock); 12506 ill_dlpi_dispatch(ill, mp); 12507 } 12508 12509 void 12510 ill_capability_send(ill_t *ill, mblk_t *mp) 12511 { 12512 ill->ill_capab_pending_cnt++; 12513 ill_dlpi_send(ill, mp); 12514 } 12515 12516 void 12517 ill_capability_done(ill_t *ill) 12518 { 12519 ASSERT(ill->ill_capab_pending_cnt != 0); 12520 12521 ill_dlpi_done(ill, DL_CAPABILITY_REQ); 12522 12523 ill->ill_capab_pending_cnt--; 12524 if (ill->ill_capab_pending_cnt == 0 && 12525 ill->ill_dlpi_capab_state == IDCS_OK) 12526 ill_capability_reset_alloc(ill); 12527 } 12528 12529 /* 12530 * Send all deferred DLPI messages without waiting for their ACKs. 12531 */ 12532 void 12533 ill_dlpi_send_deferred(ill_t *ill) 12534 { 12535 mblk_t *mp, *nextmp; 12536 12537 /* 12538 * Clear ill_dlpi_pending so that the message is not queued in 12539 * ill_dlpi_send(). 12540 */ 12541 mutex_enter(&ill->ill_lock); 12542 ill->ill_dlpi_pending = DL_PRIM_INVAL; 12543 mp = ill->ill_dlpi_deferred; 12544 ill->ill_dlpi_deferred = NULL; 12545 mutex_exit(&ill->ill_lock); 12546 12547 for (; mp != NULL; mp = nextmp) { 12548 nextmp = mp->b_next; 12549 mp->b_next = NULL; 12550 ill_dlpi_send(ill, mp); 12551 } 12552 } 12553 12554 /* 12555 * Check if the DLPI primitive `prim' is pending; print a warning if not. 
12556 */ 12557 boolean_t 12558 ill_dlpi_pending(ill_t *ill, t_uscalar_t prim) 12559 { 12560 t_uscalar_t pending; 12561 12562 mutex_enter(&ill->ill_lock); 12563 if (ill->ill_dlpi_pending == prim) { 12564 mutex_exit(&ill->ill_lock); 12565 return (B_TRUE); 12566 } 12567 12568 /* 12569 * During teardown, ill_dlpi_dispatch() will send DLPI requests 12570 * without waiting, so don't print any warnings in that case. 12571 */ 12572 if (ill->ill_state_flags & ILL_CONDEMNED) { 12573 mutex_exit(&ill->ill_lock); 12574 return (B_FALSE); 12575 } 12576 pending = ill->ill_dlpi_pending; 12577 mutex_exit(&ill->ill_lock); 12578 12579 if (pending == DL_PRIM_INVAL) { 12580 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE, 12581 "received unsolicited ack for %s on %s\n", 12582 dl_primstr(prim), ill->ill_name); 12583 } else { 12584 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE, 12585 "received unexpected ack for %s on %s (expecting %s)\n", 12586 dl_primstr(prim), ill->ill_name, dl_primstr(pending)); 12587 } 12588 return (B_FALSE); 12589 } 12590 12591 /* 12592 * Complete the current DLPI operation associated with `prim' on `ill' and 12593 * start the next queued DLPI operation (if any). If there are no queued DLPI 12594 * operations and the ill's current exclusive IPSQ operation has finished 12595 * (i.e., ipsq_current_finish() was called), then clear ipsq_current_ipif to 12596 * allow the next exclusive IPSQ operation to begin upon ipsq_exit(). See 12597 * the comments above ipsq_current_finish() for details. 
12598 */ 12599 void 12600 ill_dlpi_done(ill_t *ill, t_uscalar_t prim) 12601 { 12602 mblk_t *mp; 12603 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 12604 ipxop_t *ipx = ipsq->ipsq_xop; 12605 12606 ASSERT(IAM_WRITER_IPSQ(ipsq)); 12607 mutex_enter(&ill->ill_lock); 12608 12609 ASSERT(prim != DL_PRIM_INVAL); 12610 ASSERT(ill->ill_dlpi_pending == prim); 12611 12612 ip1dbg(("ill_dlpi_done: %s has completed %s (%u)\n", ill->ill_name, 12613 dl_primstr(ill->ill_dlpi_pending), ill->ill_dlpi_pending)); 12614 12615 if ((mp = ill->ill_dlpi_deferred) == NULL) { 12616 ill->ill_dlpi_pending = DL_PRIM_INVAL; 12617 if (ipx->ipx_current_done) { 12618 mutex_enter(&ipx->ipx_lock); 12619 ipx->ipx_current_ipif = NULL; 12620 mutex_exit(&ipx->ipx_lock); 12621 } 12622 cv_signal(&ill->ill_cv); 12623 mutex_exit(&ill->ill_lock); 12624 return; 12625 } 12626 12627 ill->ill_dlpi_deferred = mp->b_next; 12628 mp->b_next = NULL; 12629 mutex_exit(&ill->ill_lock); 12630 12631 ill_dlpi_dispatch(ill, mp); 12632 } 12633 12634 /* 12635 * Queue a (multicast) DLPI control message to be sent to the driver by 12636 * later calling ill_dlpi_send_queued. 12637 * We queue them while holding a lock (ill_mcast_lock) to ensure that they 12638 * are sent in order i.e., prevent a DL_DISABMULTI_REQ and DL_ENABMULTI_REQ 12639 * for the same group to race. 12640 * We send DLPI control messages in order using ill_lock. 12641 * For IPMP we should be called on the cast_ill. 12642 */ 12643 void 12644 ill_dlpi_queue(ill_t *ill, mblk_t *mp) 12645 { 12646 mblk_t **mpp; 12647 12648 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); 12649 12650 mutex_enter(&ill->ill_lock); 12651 /* Must queue message. Tail insertion */ 12652 mpp = &ill->ill_dlpi_deferred; 12653 while (*mpp != NULL) 12654 mpp = &((*mpp)->b_next); 12655 12656 *mpp = mp; 12657 mutex_exit(&ill->ill_lock); 12658 } 12659 12660 /* 12661 * Send the messages that were queued. Make sure there is only 12662 * one outstanding message. 
ip_rput_dlpi_writer calls ill_dlpi_done() 12663 * when an ACK or a NAK is received to process the next queued message. 12664 * For IPMP we are called on the upper ill, but when send what is queued 12665 * on the cast_ill. 12666 */ 12667 void 12668 ill_dlpi_send_queued(ill_t *ill) 12669 { 12670 mblk_t *mp; 12671 union DL_primitives *dlp; 12672 t_uscalar_t prim; 12673 ill_t *release_ill = NULL; 12674 12675 if (IS_IPMP(ill)) { 12676 /* On the upper IPMP ill. */ 12677 release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp); 12678 if (release_ill == NULL) { 12679 /* Avoid ever sending anything down to the ipmpstub */ 12680 return; 12681 } 12682 ill = release_ill; 12683 } 12684 mutex_enter(&ill->ill_lock); 12685 while ((mp = ill->ill_dlpi_deferred) != NULL) { 12686 if (ill->ill_dlpi_pending != DL_PRIM_INVAL) { 12687 /* Can't send. Somebody else will send it */ 12688 mutex_exit(&ill->ill_lock); 12689 goto done; 12690 } 12691 ill->ill_dlpi_deferred = mp->b_next; 12692 mp->b_next = NULL; 12693 if (!ill->ill_dl_up) { 12694 /* 12695 * Nobody there. All multicast addresses will be 12696 * re-joined when we get the DL_BIND_ACK bringing the 12697 * interface up. 
12698 */ 12699 freemsg(mp); 12700 continue; 12701 } 12702 dlp = (union DL_primitives *)mp->b_rptr; 12703 prim = dlp->dl_primitive; 12704 12705 if (!(ill->ill_state_flags & ILL_CONDEMNED) || 12706 (prim == DL_UNBIND_REQ)) { 12707 ill->ill_dlpi_pending = prim; 12708 } 12709 mutex_exit(&ill->ill_lock); 12710 12711 DTRACE_PROBE3(ill__dlpi, char *, "ill_dlpi_send_queued", 12712 char *, dl_primstr(prim), ill_t *, ill); 12713 putnext(ill->ill_wq, mp); 12714 mutex_enter(&ill->ill_lock); 12715 } 12716 mutex_exit(&ill->ill_lock); 12717 done: 12718 if (release_ill != NULL) 12719 ill_refrele(release_ill); 12720 } 12721 12722 /* 12723 * Queue an IP (IGMP/MLD) message to be sent by IP from 12724 * ill_mcast_send_queued 12725 * We queue them while holding a lock (ill_mcast_lock) to ensure that they 12726 * are sent in order i.e., prevent a IGMP leave and IGMP join for the same 12727 * group to race. 12728 * We send them in order using ill_lock. 12729 * For IPMP we are called on the upper ill, but we queue on the cast_ill. 12730 */ 12731 void 12732 ill_mcast_queue(ill_t *ill, mblk_t *mp) 12733 { 12734 mblk_t **mpp; 12735 ill_t *release_ill = NULL; 12736 12737 ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock)); 12738 12739 if (IS_IPMP(ill)) { 12740 /* On the upper IPMP ill. */ 12741 release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp); 12742 if (release_ill == NULL) { 12743 /* Discard instead of queuing for the ipmp interface */ 12744 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 12745 ip_drop_output("ipIfStatsOutDiscards - no cast_ill", 12746 mp, ill); 12747 freemsg(mp); 12748 return; 12749 } 12750 ill = release_ill; 12751 } 12752 12753 mutex_enter(&ill->ill_lock); 12754 /* Must queue message. 
Tail insertion */ 12755 mpp = &ill->ill_mcast_deferred; 12756 while (*mpp != NULL) 12757 mpp = &((*mpp)->b_next); 12758 12759 *mpp = mp; 12760 mutex_exit(&ill->ill_lock); 12761 if (release_ill != NULL) 12762 ill_refrele(release_ill); 12763 } 12764 12765 /* 12766 * Send the IP packets that were queued by ill_mcast_queue. 12767 * These are IGMP/MLD packets. 12768 * 12769 * For IPMP we are called on the upper ill, but when send what is queued 12770 * on the cast_ill. 12771 * 12772 * Request loopback of the report if we are acting as a multicast 12773 * router, so that the process-level routing demon can hear it. 12774 * This will run multiple times for the same group if there are members 12775 * on the same group for multiple ipif's on the same ill. The 12776 * igmp_input/mld_input code will suppress this due to the loopback thus we 12777 * always loopback membership report. 12778 * 12779 * We also need to make sure that this does not get load balanced 12780 * by IPMP. We do this by passing an ill to ip_output_simple. 12781 */ 12782 void 12783 ill_mcast_send_queued(ill_t *ill) 12784 { 12785 mblk_t *mp; 12786 ip_xmit_attr_t ixas; 12787 ill_t *release_ill = NULL; 12788 12789 if (IS_IPMP(ill)) { 12790 /* On the upper IPMP ill. */ 12791 release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp); 12792 if (release_ill == NULL) { 12793 /* 12794 * We should have no messages on the ipmp interface 12795 * but no point in trying to send them. 12796 */ 12797 return; 12798 } 12799 ill = release_ill; 12800 } 12801 bzero(&ixas, sizeof (ixas)); 12802 ixas.ixa_zoneid = ALL_ZONES; 12803 ixas.ixa_cred = kcred; 12804 ixas.ixa_cpid = NOPID; 12805 ixas.ixa_tsl = NULL; 12806 /* 12807 * Here we set ixa_ifindex. If IPMP it will be the lower ill which 12808 * makes ip_select_route pick the IRE_MULTICAST for the cast_ill. 12809 * That is necessary to handle IGMP/MLD snooping switches. 
12810 */ 12811 ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex; 12812 ixas.ixa_ipst = ill->ill_ipst; 12813 12814 mutex_enter(&ill->ill_lock); 12815 while ((mp = ill->ill_mcast_deferred) != NULL) { 12816 ill->ill_mcast_deferred = mp->b_next; 12817 mp->b_next = NULL; 12818 if (!ill->ill_dl_up) { 12819 /* 12820 * Nobody there. Just drop the ip packets. 12821 * IGMP/MLD will resend later, if this is a replumb. 12822 */ 12823 freemsg(mp); 12824 continue; 12825 } 12826 mutex_enter(&ill->ill_phyint->phyint_lock); 12827 if (IS_UNDER_IPMP(ill) && !ipmp_ill_is_active(ill)) { 12828 /* 12829 * When the ill is getting deactivated, we only want to 12830 * send the DLPI messages, so drop IGMP/MLD packets. 12831 * DLPI messages are handled by ill_dlpi_send_queued() 12832 */ 12833 mutex_exit(&ill->ill_phyint->phyint_lock); 12834 freemsg(mp); 12835 continue; 12836 } 12837 mutex_exit(&ill->ill_phyint->phyint_lock); 12838 mutex_exit(&ill->ill_lock); 12839 12840 /* Check whether we are sending IPv4 or IPv6. */ 12841 if (ill->ill_isv6) { 12842 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 12843 12844 ixas.ixa_multicast_ttl = ip6h->ip6_hops; 12845 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6; 12846 } else { 12847 ipha_t *ipha = (ipha_t *)mp->b_rptr; 12848 12849 ixas.ixa_multicast_ttl = ipha->ipha_ttl; 12850 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; 12851 ixas.ixa_flags &= ~IXAF_SET_ULP_CKSUM; 12852 } 12853 12854 ixas.ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_SOURCE; 12855 (void) ip_output_simple(mp, &ixas); 12856 ixa_cleanup(&ixas); 12857 12858 mutex_enter(&ill->ill_lock); 12859 } 12860 mutex_exit(&ill->ill_lock); 12861 12862 done: 12863 if (release_ill != NULL) 12864 ill_refrele(release_ill); 12865 } 12866 12867 /* 12868 * Take down a specific interface, but don't lose any information about it. 12869 * (Always called as writer.) 12870 * This function goes through the down sequence even if the interface is 12871 * already down. There are 2 reasons. 12872 * a. 
Currently we permit interface routes that depend on down interfaces
 *    to be added. This behaviour itself is questionable. However it appears
 *    that both Solaris and 4.3 BSD have exhibited this behaviour for a long
 *    time. We go thru the cleanup in order to remove these routes.
 * b. The bringup of the interface could fail in ill_dl_up i.e. we get
 *    DL_ERROR_ACK in response to the DL_BIND request. The interface is
 *    down, but we need to cleanup i.e. do ill_dl_down and
 *    ip_rput_dlpi_writer (DL_ERROR_ACK) -> ipif_down.
 *
 * IP-MT notes:
 *
 * Model of reference to interfaces.
 *
 * The following members in ipif_t track references to the ipif.
 *	int	ipif_refcnt;	Active reference count
 *
 * The following members in ill_t track references to the ill.
 *	int	ill_refcnt;	active refcnt
 *	uint_t	ill_ire_cnt;	Number of ires referencing ill
 *	uint_t	ill_ncec_cnt;	Number of ncecs referencing ill
 *	uint_t	ill_nce_cnt;	Number of nces referencing ill
 *	uint_t	ill_ilm_cnt;	Number of ilms referencing ill
 *
 * Reference to an ipif or ill can be obtained in any of the following ways.
 *
 * Through the ipif_lookup_* / ill_lookup_* lookup functions.
 * Pointers to ipif / ill from other data structures viz ire and conn.
 * Implicit reference to the ipif / ill by holding a reference to the ire.
 *
 * The ipif/ill lookup functions return a reference held ipif / ill.
 * ipif_refcnt and ill_refcnt track the reference counts respectively.
 * This is a purely dynamic reference count associated with threads holding
 * references to the ipif / ill. Pointers from other structures do not
 * count towards this reference count.
 *
 * ill_ire_cnt is the number of ire's associated with the
 * ill. This is incremented whenever a new ire is created referencing the
 * ill. This is done atomically inside ire_add_v[46] where the ire is
 * actually added to the ire hash table. The count is decremented in
 * ire_inactive where the ire is destroyed.
 *
 * ill_ncec_cnt is the number of ncec's referencing the ill thru ncec_ill.
 * This is incremented atomically in
 * ndp_add_v4()/ndp_add_v6() where the nce is actually added to the
 * table. Similarly it is decremented in ncec_inactive() where the ncec
 * is destroyed.
 *
 * ill_nce_cnt is the number of nce's referencing the ill thru nce_ill. This is
 * incremented atomically in nce_add() where the nce is actually added to the
 * ill_nce. Similarly it is decremented in nce_inactive() where the nce
 * is destroyed.
 *
 * ill_ilm_cnt is the ilm's reference to the ill. It is incremented in
 * ilm_add() and decremented before the ilm is freed in ilm_delete().
 *
 * Flow of ioctls involving interface down/up
 *
 * The following is the sequence of an attempt to set some critical flags on an
 * up interface.
 * ip_sioctl_flags
 * ipif_down
 * wait for ipif to be quiescent
 * ipif_down_tail
 * ip_sioctl_flags_tail
 *
 * All set ioctls that involve down/up sequence would have a skeleton similar
 * to the above. All the *tail functions are called after the refcounts have
 * dropped to the appropriate values.
 *
 * SIOC ioctls during the IPIF_CHANGING interval.
 *
 * Threads handling SIOC set ioctls serialize on the squeue, but this
 * is not done for SIOC get ioctls. Since a set ioctl can cause several
 * steps of internal changes to the state, some of which are visible in
 * ipif_flags (such as IFF_UP being cleared and later set), and we want
 * the set ioctl to be atomic with respect to the get ioctls, the SIOC get
 * code will wait and restart ioctls if IPIF_CHANGING is set. 
The mblk is then 12949 * enqueued in the ipsq and the operation is restarted by ipsq_exit() when 12950 * the current exclusive operation completes. The IPIF_CHANGING check 12951 * and enqueue is atomic using the ill_lock and ipsq_lock. The 12952 * lookup is done holding the ill_lock. Hence the ill/ipif state flags can't 12953 * change while the ill_lock is held. Before dropping the ill_lock we acquire 12954 * the ipsq_lock and call ipsq_enq. This ensures that ipsq_exit can't finish 12955 * until we release the ipsq_lock, even though the ill/ipif state flags 12956 * can change after we drop the ill_lock. 12957 */ 12958 int 12959 ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) 12960 { 12961 ill_t *ill = ipif->ipif_ill; 12962 conn_t *connp; 12963 boolean_t success; 12964 boolean_t ipif_was_up = B_FALSE; 12965 ip_stack_t *ipst = ill->ill_ipst; 12966 12967 ASSERT(IAM_WRITER_IPIF(ipif)); 12968 12969 ip1dbg(("ipif_down(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 12970 12971 DTRACE_PROBE3(ipif__downup, char *, "ipif_down", 12972 ill_t *, ill, ipif_t *, ipif); 12973 12974 if (ipif->ipif_flags & IPIF_UP) { 12975 mutex_enter(&ill->ill_lock); 12976 ipif->ipif_flags &= ~IPIF_UP; 12977 ASSERT(ill->ill_ipif_up_count > 0); 12978 --ill->ill_ipif_up_count; 12979 mutex_exit(&ill->ill_lock); 12980 ipif_was_up = B_TRUE; 12981 /* Update status in SCTP's list */ 12982 sctp_update_ipif(ipif, SCTP_IPIF_DOWN); 12983 ill_nic_event_dispatch(ipif->ipif_ill, 12984 MAP_IPIF_ID(ipif->ipif_id), NE_LIF_DOWN, NULL, 0); 12985 } 12986 12987 /* 12988 * Blow away memberships we established in ipif_multicast_up(). 12989 */ 12990 ipif_multicast_down(ipif); 12991 12992 /* 12993 * Remove from the mapping for __sin6_src_id. We insert only 12994 * when the address is not INADDR_ANY. As IPv4 addresses are 12995 * stored as mapped addresses, we need to check for mapped 12996 * INADDR_ANY also. 
12997 */ 12998 if (ipif_was_up && !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) && 12999 !IN6_IS_ADDR_V4MAPPED_ANY(&ipif->ipif_v6lcl_addr) && 13000 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 13001 int err; 13002 13003 err = ip_srcid_remove(&ipif->ipif_v6lcl_addr, 13004 ipif->ipif_zoneid, ipst); 13005 if (err != 0) { 13006 ip0dbg(("ipif_down: srcid_remove %d\n", err)); 13007 } 13008 } 13009 13010 if (ipif_was_up) { 13011 /* only delete if we'd added ire's before */ 13012 if (ipif->ipif_isv6) 13013 ipif_delete_ires_v6(ipif); 13014 else 13015 ipif_delete_ires_v4(ipif); 13016 } 13017 13018 if (ipif_was_up && ill->ill_ipif_up_count == 0) { 13019 /* 13020 * Since the interface is now down, it may have just become 13021 * inactive. Note that this needs to be done even for a 13022 * lll_logical_down(), or ARP entries will not get correctly 13023 * restored when the interface comes back up. 13024 */ 13025 if (IS_UNDER_IPMP(ill)) 13026 ipmp_ill_refresh_active(ill); 13027 } 13028 13029 /* 13030 * neighbor-discovery or arp entries for this interface. The ipif 13031 * has to be quiesced, so we walk all the nce's and delete those 13032 * that point at the ipif->ipif_ill. At the same time, we also 13033 * update IPMP so that ipifs for data addresses are unbound. We dont 13034 * call ipif_arp_down to DL_UNBIND the arp stream itself here, but defer 13035 * that for ipif_down_tail() 13036 */ 13037 ipif_nce_down(ipif); 13038 13039 /* 13040 * If this is the last ipif on the ill, we also need to remove 13041 * any IREs with ire_ill set. Otherwise ipif_is_quiescent() will 13042 * never succeed. 13043 */ 13044 if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0) 13045 ire_walk_ill(0, 0, ill_downi, ill, ill); 13046 13047 /* 13048 * Walk all CONNs that can have a reference on an ire for this 13049 * ipif (we actually walk all that now have stale references). 
13050 */ 13051 ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst); 13052 13053 /* 13054 * If mp is NULL the caller will wait for the appropriate refcnt. 13055 * Eg. ip_sioctl_removeif -> ipif_free -> ipif_down 13056 * and ill_delete -> ipif_free -> ipif_down 13057 */ 13058 if (mp == NULL) { 13059 ASSERT(q == NULL); 13060 return (0); 13061 } 13062 13063 if (CONN_Q(q)) { 13064 connp = Q_TO_CONN(q); 13065 mutex_enter(&connp->conn_lock); 13066 } else { 13067 connp = NULL; 13068 } 13069 mutex_enter(&ill->ill_lock); 13070 /* 13071 * Are there any ire's pointing to this ipif that are still active ? 13072 * If this is the last ipif going down, are there any ire's pointing 13073 * to this ill that are still active ? 13074 */ 13075 if (ipif_is_quiescent(ipif)) { 13076 mutex_exit(&ill->ill_lock); 13077 if (connp != NULL) 13078 mutex_exit(&connp->conn_lock); 13079 return (0); 13080 } 13081 13082 ip1dbg(("ipif_down: need to wait, adding pending mp %s ill %p", 13083 ill->ill_name, (void *)ill)); 13084 /* 13085 * Enqueue the mp atomically in ipsq_pending_mp. When the refcount 13086 * drops down, the operation will be restarted by ipif_ill_refrele_tail 13087 * which in turn is called by the last refrele on the ipif/ill/ire. 13088 */ 13089 success = ipsq_pending_mp_add(connp, ipif, q, mp, IPIF_DOWN); 13090 if (!success) { 13091 /* The conn is closing. So just return */ 13092 ASSERT(connp != NULL); 13093 mutex_exit(&ill->ill_lock); 13094 mutex_exit(&connp->conn_lock); 13095 return (EINTR); 13096 } 13097 13098 mutex_exit(&ill->ill_lock); 13099 if (connp != NULL) 13100 mutex_exit(&connp->conn_lock); 13101 return (EINPROGRESS); 13102 } 13103 13104 int 13105 ipif_down_tail(ipif_t *ipif) 13106 { 13107 ill_t *ill = ipif->ipif_ill; 13108 int err = 0; 13109 13110 DTRACE_PROBE3(ipif__downup, char *, "ipif_down_tail", 13111 ill_t *, ill, ipif_t *, ipif); 13112 13113 /* 13114 * Skip any loopback interface (null wq). 
13115 * If this is the last logical interface on the ill 13116 * have ill_dl_down tell the driver we are gone (unbind) 13117 * Note that lun 0 can ipif_down even though 13118 * there are other logical units that are up. 13119 * This occurs e.g. when we change a "significant" IFF_ flag. 13120 */ 13121 if (ill->ill_wq != NULL && !ill->ill_logical_down && 13122 ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 && 13123 ill->ill_dl_up) { 13124 ill_dl_down(ill); 13125 } 13126 if (!ipif->ipif_isv6) 13127 err = ipif_arp_down(ipif); 13128 13129 ill->ill_logical_down = 0; 13130 13131 ip_rts_ifmsg(ipif, RTSQ_DEFAULT); 13132 ip_rts_newaddrmsg(RTM_DELETE, 0, ipif, RTSQ_DEFAULT); 13133 return (err); 13134 } 13135 13136 /* 13137 * Bring interface logically down without bringing the physical interface 13138 * down e.g. when the netmask is changed. This avoids long lasting link 13139 * negotiations between an ethernet interface and a certain switches. 13140 */ 13141 static int 13142 ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp) 13143 { 13144 DTRACE_PROBE3(ipif__downup, char *, "ipif_logical_down", 13145 ill_t *, ipif->ipif_ill, ipif_t *, ipif); 13146 13147 /* 13148 * The ill_logical_down flag is a transient flag. It is set here 13149 * and is cleared once the down has completed in ipif_down_tail. 13150 * This flag does not indicate whether the ill stream is in the 13151 * DL_BOUND state with the driver. Instead this flag is used by 13152 * ipif_down_tail to determine whether to DL_UNBIND the stream with 13153 * the driver. The state of the ill stream i.e. whether it is 13154 * DL_BOUND with the driver or not is indicated by the ill_dl_up flag. 13155 */ 13156 ipif->ipif_ill->ill_logical_down = 1; 13157 return (ipif_down(ipif, q, mp)); 13158 } 13159 13160 /* 13161 * Initiate deallocate of an IPIF. Always called as writer. Called by 13162 * ill_delete or ip_sioctl_removeif. 
13163 */ 13164 static void 13165 ipif_free(ipif_t *ipif) 13166 { 13167 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 13168 13169 ASSERT(IAM_WRITER_IPIF(ipif)); 13170 13171 if (ipif->ipif_recovery_id != 0) 13172 (void) untimeout(ipif->ipif_recovery_id); 13173 ipif->ipif_recovery_id = 0; 13174 13175 /* 13176 * Take down the interface. We can be called either from ill_delete 13177 * or from ip_sioctl_removeif. 13178 */ 13179 (void) ipif_down(ipif, NULL, NULL); 13180 13181 /* 13182 * Now that the interface is down, there's no chance it can still 13183 * become a duplicate. Cancel any timer that may have been set while 13184 * tearing down. 13185 */ 13186 if (ipif->ipif_recovery_id != 0) 13187 (void) untimeout(ipif->ipif_recovery_id); 13188 ipif->ipif_recovery_id = 0; 13189 13190 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 13191 /* Remove pointers to this ill in the multicast routing tables */ 13192 reset_mrt_vif_ipif(ipif); 13193 /* If necessary, clear the cached source ipif rotor. */ 13194 if (ipif->ipif_ill->ill_src_ipif == ipif) 13195 ipif->ipif_ill->ill_src_ipif = NULL; 13196 rw_exit(&ipst->ips_ill_g_lock); 13197 } 13198 13199 static void 13200 ipif_free_tail(ipif_t *ipif) 13201 { 13202 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 13203 13204 /* 13205 * Need to hold both ill_g_lock and ill_lock while 13206 * inserting or removing an ipif from the linked list 13207 * of ipifs hanging off the ill. 13208 */ 13209 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 13210 13211 #ifdef DEBUG 13212 ipif_trace_cleanup(ipif); 13213 #endif 13214 13215 /* Ask SCTP to take it out of it list */ 13216 sctp_update_ipif(ipif, SCTP_IPIF_REMOVE); 13217 ip_rts_newaddrmsg(RTM_FREEADDR, 0, ipif, RTSQ_DEFAULT); 13218 13219 /* Get it out of the ILL interface list. 
*/ 13220 ipif_remove(ipif); 13221 rw_exit(&ipst->ips_ill_g_lock); 13222 13223 ASSERT(!(ipif->ipif_flags & (IPIF_UP | IPIF_DUPLICATE))); 13224 ASSERT(ipif->ipif_recovery_id == 0); 13225 ASSERT(ipif->ipif_ire_local == NULL); 13226 ASSERT(ipif->ipif_ire_if == NULL); 13227 13228 /* Free the memory. */ 13229 mi_free(ipif); 13230 } 13231 13232 /* 13233 * Sets `buf' to an ipif name of the form "ill_name:id", or "ill_name" if "id" 13234 * is zero. 13235 */ 13236 void 13237 ipif_get_name(const ipif_t *ipif, char *buf, int len) 13238 { 13239 char lbuf[LIFNAMSIZ]; 13240 char *name; 13241 size_t name_len; 13242 13243 buf[0] = '\0'; 13244 name = ipif->ipif_ill->ill_name; 13245 name_len = ipif->ipif_ill->ill_name_length; 13246 if (ipif->ipif_id != 0) { 13247 (void) sprintf(lbuf, "%s%c%d", name, IPIF_SEPARATOR_CHAR, 13248 ipif->ipif_id); 13249 name = lbuf; 13250 name_len = mi_strlen(name) + 1; 13251 } 13252 len -= 1; 13253 buf[len] = '\0'; 13254 len = MIN(len, name_len); 13255 bcopy(name, buf, len); 13256 } 13257 13258 /* 13259 * Sets `buf' to an ill name. 13260 */ 13261 void 13262 ill_get_name(const ill_t *ill, char *buf, int len) 13263 { 13264 char *name; 13265 size_t name_len; 13266 13267 name = ill->ill_name; 13268 name_len = ill->ill_name_length; 13269 len -= 1; 13270 buf[len] = '\0'; 13271 len = MIN(len, name_len); 13272 bcopy(name, buf, len); 13273 } 13274 13275 /* 13276 * Find an IPIF based on the name passed in. Names can be of the form <phys> 13277 * (e.g., le0) or <phys>:<#> (e.g., le0:1). When there is no colon, the 13278 * implied unit id is zero. <phys> must correspond to the name of an ILL. 13279 * (May be called as writer.) 
13280 */ 13281 static ipif_t * 13282 ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, 13283 boolean_t *exists, boolean_t isv6, zoneid_t zoneid, ip_stack_t *ipst) 13284 { 13285 char *cp; 13286 char *endp; 13287 long id; 13288 ill_t *ill; 13289 ipif_t *ipif; 13290 uint_t ire_type; 13291 boolean_t did_alloc = B_FALSE; 13292 13293 /* 13294 * If the caller wants to us to create the ipif, make sure we have a 13295 * valid zoneid 13296 */ 13297 ASSERT(!do_alloc || zoneid != ALL_ZONES); 13298 13299 if (namelen == 0) { 13300 return (NULL); 13301 } 13302 13303 *exists = B_FALSE; 13304 /* Look for a colon in the name. */ 13305 endp = &name[namelen]; 13306 for (cp = endp; --cp > name; ) { 13307 if (*cp == IPIF_SEPARATOR_CHAR) 13308 break; 13309 } 13310 13311 if (*cp == IPIF_SEPARATOR_CHAR) { 13312 /* 13313 * Reject any non-decimal aliases for logical 13314 * interfaces. Aliases with leading zeroes 13315 * are also rejected as they introduce ambiguity 13316 * in the naming of the interfaces. 13317 * In order to confirm with existing semantics, 13318 * and to not break any programs/script relying 13319 * on that behaviour, if<0>:0 is considered to be 13320 * a valid interface. 13321 * 13322 * If alias has two or more digits and the first 13323 * is zero, fail. 13324 */ 13325 if (&cp[2] < endp && cp[1] == '0') { 13326 return (NULL); 13327 } 13328 } 13329 13330 if (cp <= name) { 13331 cp = endp; 13332 } else { 13333 *cp = '\0'; 13334 } 13335 13336 /* 13337 * Look up the ILL, based on the portion of the name 13338 * before the slash. ill_lookup_on_name returns a held ill. 13339 * Temporary to check whether ill exists already. If so 13340 * ill_lookup_on_name will clear it. 13341 */ 13342 ill = ill_lookup_on_name(name, do_alloc, isv6, 13343 &did_alloc, ipst); 13344 if (cp != endp) 13345 *cp = IPIF_SEPARATOR_CHAR; 13346 if (ill == NULL) 13347 return (NULL); 13348 13349 /* Establish the unit number in the name. 
*/ 13350 id = 0; 13351 if (cp < endp && *endp == '\0') { 13352 /* If there was a colon, the unit number follows. */ 13353 cp++; 13354 if (ddi_strtol(cp, NULL, 0, &id) != 0) { 13355 ill_refrele(ill); 13356 return (NULL); 13357 } 13358 } 13359 13360 mutex_enter(&ill->ill_lock); 13361 /* Now see if there is an IPIF with this unit number. */ 13362 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 13363 if (ipif->ipif_id == id) { 13364 if (zoneid != ALL_ZONES && 13365 zoneid != ipif->ipif_zoneid && 13366 ipif->ipif_zoneid != ALL_ZONES) { 13367 mutex_exit(&ill->ill_lock); 13368 ill_refrele(ill); 13369 return (NULL); 13370 } 13371 if (IPIF_CAN_LOOKUP(ipif)) { 13372 ipif_refhold_locked(ipif); 13373 mutex_exit(&ill->ill_lock); 13374 if (!did_alloc) 13375 *exists = B_TRUE; 13376 /* 13377 * Drop locks before calling ill_refrele 13378 * since it can potentially call into 13379 * ipif_ill_refrele_tail which can end up 13380 * in trying to acquire any lock. 13381 */ 13382 ill_refrele(ill); 13383 return (ipif); 13384 } 13385 } 13386 } 13387 13388 if (!do_alloc) { 13389 mutex_exit(&ill->ill_lock); 13390 ill_refrele(ill); 13391 return (NULL); 13392 } 13393 13394 /* 13395 * If none found, atomically allocate and return a new one. 13396 * Historically, we used IRE_LOOPBACK only for lun 0, and IRE_LOCAL 13397 * to support "receive only" use of lo0:1 etc. as is still done 13398 * below as an initial guess. 13399 * However, this is now likely to be overriden later in ipif_up_done() 13400 * when we know for sure what address has been configured on the 13401 * interface, since we might have more than one loopback interface 13402 * with a loopback address, e.g. in the case of zones, and all the 13403 * interfaces with loopback addresses need to be marked IRE_LOOPBACK. 
13404 */ 13405 if (ill->ill_net_type == IRE_LOOPBACK && id == 0) 13406 ire_type = IRE_LOOPBACK; 13407 else 13408 ire_type = IRE_LOCAL; 13409 ipif = ipif_allocate(ill, id, ire_type, B_TRUE, B_TRUE, NULL); 13410 if (ipif != NULL) 13411 ipif_refhold_locked(ipif); 13412 mutex_exit(&ill->ill_lock); 13413 ill_refrele(ill); 13414 return (ipif); 13415 } 13416 13417 /* 13418 * This routine is called whenever a new address comes up on an ipif. If 13419 * we are configured to respond to address mask requests, then we are supposed 13420 * to broadcast an address mask reply at this time. This routine is also 13421 * called if we are already up, but a netmask change is made. This is legal 13422 * but might not make the system manager very popular. (May be called 13423 * as writer.) 13424 */ 13425 void 13426 ipif_mask_reply(ipif_t *ipif) 13427 { 13428 icmph_t *icmph; 13429 ipha_t *ipha; 13430 mblk_t *mp; 13431 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 13432 ip_xmit_attr_t ixas; 13433 13434 #define REPLY_LEN (sizeof (icmp_ipha) + sizeof (icmph_t) + IP_ADDR_LEN) 13435 13436 if (!ipst->ips_ip_respond_to_address_mask_broadcast) 13437 return; 13438 13439 /* ICMP mask reply is IPv4 only */ 13440 ASSERT(!ipif->ipif_isv6); 13441 /* ICMP mask reply is not for a loopback interface */ 13442 ASSERT(ipif->ipif_ill->ill_wq != NULL); 13443 13444 if (ipif->ipif_lcl_addr == INADDR_ANY) 13445 return; 13446 13447 mp = allocb(REPLY_LEN, BPRI_HI); 13448 if (mp == NULL) 13449 return; 13450 mp->b_wptr = mp->b_rptr + REPLY_LEN; 13451 13452 ipha = (ipha_t *)mp->b_rptr; 13453 bzero(ipha, REPLY_LEN); 13454 *ipha = icmp_ipha; 13455 ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl; 13456 ipha->ipha_src = ipif->ipif_lcl_addr; 13457 ipha->ipha_dst = ipif->ipif_brd_addr; 13458 ipha->ipha_length = htons(REPLY_LEN); 13459 ipha->ipha_ident = 0; 13460 13461 icmph = (icmph_t *)&ipha[1]; 13462 icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY; 13463 bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN); 13464 
icmph->icmph_checksum = IP_CSUM(mp, sizeof (ipha_t), 0); 13465 13466 bzero(&ixas, sizeof (ixas)); 13467 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; 13468 ixas.ixa_flags |= IXAF_SET_SOURCE; 13469 ixas.ixa_zoneid = ALL_ZONES; 13470 ixas.ixa_ifindex = 0; 13471 ixas.ixa_ipst = ipst; 13472 ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; 13473 (void) ip_output_simple(mp, &ixas); 13474 ixa_cleanup(&ixas); 13475 #undef REPLY_LEN 13476 } 13477 13478 /* 13479 * Join the ipif specific multicast groups. 13480 * Must be called after a mapping has been set up in the resolver. (Always 13481 * called as writer.) 13482 */ 13483 void 13484 ipif_multicast_up(ipif_t *ipif) 13485 { 13486 int err; 13487 ill_t *ill; 13488 ilm_t *ilm; 13489 13490 ASSERT(IAM_WRITER_IPIF(ipif)); 13491 13492 ill = ipif->ipif_ill; 13493 13494 ip1dbg(("ipif_multicast_up\n")); 13495 if (!(ill->ill_flags & ILLF_MULTICAST) || 13496 ipif->ipif_allhosts_ilm != NULL) 13497 return; 13498 13499 if (ipif->ipif_isv6) { 13500 in6_addr_t v6allmc = ipv6_all_hosts_mcast; 13501 in6_addr_t v6solmc = ipv6_solicited_node_mcast; 13502 13503 v6solmc.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3]; 13504 13505 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) 13506 return; 13507 13508 ip1dbg(("ipif_multicast_up - addmulti\n")); 13509 13510 /* 13511 * Join the all hosts multicast address. We skip this for 13512 * underlying IPMP interfaces since they should be invisible. 13513 */ 13514 if (!IS_UNDER_IPMP(ill)) { 13515 ilm = ip_addmulti(&v6allmc, ill, ipif->ipif_zoneid, 13516 &err); 13517 if (ilm == NULL) { 13518 ASSERT(err != 0); 13519 ip0dbg(("ipif_multicast_up: " 13520 "all_hosts_mcast failed %d\n", err)); 13521 return; 13522 } 13523 ipif->ipif_allhosts_ilm = ilm; 13524 } 13525 13526 /* 13527 * Enable multicast for the solicited node multicast address. 13528 * If IPMP we need to put the membership on the upper ill. 
13529 */ 13530 if (!(ipif->ipif_flags & IPIF_NOLOCAL)) { 13531 ill_t *mcast_ill = NULL; 13532 boolean_t need_refrele; 13533 13534 if (IS_UNDER_IPMP(ill) && 13535 (mcast_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) { 13536 need_refrele = B_TRUE; 13537 } else { 13538 mcast_ill = ill; 13539 need_refrele = B_FALSE; 13540 } 13541 13542 ilm = ip_addmulti(&v6solmc, mcast_ill, 13543 ipif->ipif_zoneid, &err); 13544 if (need_refrele) 13545 ill_refrele(mcast_ill); 13546 13547 if (ilm == NULL) { 13548 ASSERT(err != 0); 13549 ip0dbg(("ipif_multicast_up: solicited MC" 13550 " failed %d\n", err)); 13551 if ((ilm = ipif->ipif_allhosts_ilm) != NULL) { 13552 ipif->ipif_allhosts_ilm = NULL; 13553 (void) ip_delmulti(ilm); 13554 } 13555 return; 13556 } 13557 ipif->ipif_solmulti_ilm = ilm; 13558 } 13559 } else { 13560 in6_addr_t v6group; 13561 13562 if (ipif->ipif_lcl_addr == INADDR_ANY || IS_UNDER_IPMP(ill)) 13563 return; 13564 13565 /* Join the all hosts multicast address */ 13566 ip1dbg(("ipif_multicast_up - addmulti\n")); 13567 IN6_IPADDR_TO_V4MAPPED(htonl(INADDR_ALLHOSTS_GROUP), &v6group); 13568 13569 ilm = ip_addmulti(&v6group, ill, ipif->ipif_zoneid, &err); 13570 if (ilm == NULL) { 13571 ASSERT(err != 0); 13572 ip0dbg(("ipif_multicast_up: failed %d\n", err)); 13573 return; 13574 } 13575 ipif->ipif_allhosts_ilm = ilm; 13576 } 13577 } 13578 13579 /* 13580 * Blow away any multicast groups that we joined in ipif_multicast_up(). 13581 * (ilms from explicit memberships are handled in conn_update_ill.) 
13582 */ 13583 void 13584 ipif_multicast_down(ipif_t *ipif) 13585 { 13586 ASSERT(IAM_WRITER_IPIF(ipif)); 13587 13588 ip1dbg(("ipif_multicast_down\n")); 13589 13590 if (ipif->ipif_allhosts_ilm != NULL) { 13591 (void) ip_delmulti(ipif->ipif_allhosts_ilm); 13592 ipif->ipif_allhosts_ilm = NULL; 13593 } 13594 if (ipif->ipif_solmulti_ilm != NULL) { 13595 (void) ip_delmulti(ipif->ipif_solmulti_ilm); 13596 ipif->ipif_solmulti_ilm = NULL; 13597 } 13598 } 13599 13600 /* 13601 * Used when an interface comes up to recreate any extra routes on this 13602 * interface. 13603 */ 13604 int 13605 ill_recover_saved_ire(ill_t *ill) 13606 { 13607 mblk_t *mp; 13608 ip_stack_t *ipst = ill->ill_ipst; 13609 13610 ip1dbg(("ill_recover_saved_ire(%s)", ill->ill_name)); 13611 13612 mutex_enter(&ill->ill_saved_ire_lock); 13613 for (mp = ill->ill_saved_ire_mp; mp != NULL; mp = mp->b_cont) { 13614 ire_t *ire, *nire; 13615 ifrt_t *ifrt; 13616 13617 ifrt = (ifrt_t *)mp->b_rptr; 13618 /* 13619 * Create a copy of the IRE with the saved address and netmask. 
13620 */ 13621 if (ill->ill_isv6) { 13622 ire = ire_create_v6( 13623 &ifrt->ifrt_v6addr, 13624 &ifrt->ifrt_v6mask, 13625 &ifrt->ifrt_v6gateway_addr, 13626 ifrt->ifrt_type, 13627 ill, 13628 ifrt->ifrt_zoneid, 13629 ifrt->ifrt_flags, 13630 NULL, 13631 ipst); 13632 } else { 13633 ire = ire_create( 13634 (uint8_t *)&ifrt->ifrt_addr, 13635 (uint8_t *)&ifrt->ifrt_mask, 13636 (uint8_t *)&ifrt->ifrt_gateway_addr, 13637 ifrt->ifrt_type, 13638 ill, 13639 ifrt->ifrt_zoneid, 13640 ifrt->ifrt_flags, 13641 NULL, 13642 ipst); 13643 } 13644 if (ire == NULL) { 13645 mutex_exit(&ill->ill_saved_ire_lock); 13646 return (ENOMEM); 13647 } 13648 13649 if (ifrt->ifrt_flags & RTF_SETSRC) { 13650 if (ill->ill_isv6) { 13651 ire->ire_setsrc_addr_v6 = 13652 ifrt->ifrt_v6setsrc_addr; 13653 } else { 13654 ire->ire_setsrc_addr = ifrt->ifrt_setsrc_addr; 13655 } 13656 } 13657 13658 /* 13659 * Some software (for example, GateD and Sun Cluster) attempts 13660 * to create (what amount to) IRE_PREFIX routes with the 13661 * loopback address as the gateway. This is primarily done to 13662 * set up prefixes with the RTF_REJECT flag set (for example, 13663 * when generating aggregate routes.) 13664 * 13665 * If the IRE type (as defined by ill->ill_net_type) is 13666 * IRE_LOOPBACK, then we map the request into a 13667 * IRE_IF_NORESOLVER. 13668 */ 13669 if (ill->ill_net_type == IRE_LOOPBACK) 13670 ire->ire_type = IRE_IF_NORESOLVER; 13671 13672 /* 13673 * ire held by ire_add, will be refreled' towards the 13674 * the end of ipif_up_done 13675 */ 13676 nire = ire_add(ire); 13677 /* 13678 * Check if it was a duplicate entry. 
This handles 13679 * the case of two racing route adds for the same route 13680 */ 13681 if (nire == NULL) { 13682 ip1dbg(("ill_recover_saved_ire: FAILED\n")); 13683 } else if (nire != ire) { 13684 ip1dbg(("ill_recover_saved_ire: duplicate ire %p\n", 13685 (void *)nire)); 13686 ire_delete(nire); 13687 } else { 13688 ip1dbg(("ill_recover_saved_ire: added ire %p\n", 13689 (void *)nire)); 13690 } 13691 if (nire != NULL) 13692 ire_refrele(nire); 13693 } 13694 mutex_exit(&ill->ill_saved_ire_lock); 13695 return (0); 13696 } 13697 13698 /* 13699 * Used to set the netmask and broadcast address to default values when the 13700 * interface is brought up. (Always called as writer.) 13701 */ 13702 static void 13703 ipif_set_default(ipif_t *ipif) 13704 { 13705 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 13706 13707 if (!ipif->ipif_isv6) { 13708 /* 13709 * Interface holds an IPv4 address. Default 13710 * mask is the natural netmask. 13711 */ 13712 if (!ipif->ipif_net_mask) { 13713 ipaddr_t v4mask; 13714 13715 v4mask = ip_net_mask(ipif->ipif_lcl_addr); 13716 V4MASK_TO_V6(v4mask, ipif->ipif_v6net_mask); 13717 } 13718 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 13719 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 13720 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 13721 } else { 13722 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 13723 ipif->ipif_v6net_mask, ipif->ipif_v6subnet); 13724 } 13725 /* 13726 * NOTE: SunOS 4.X does this even if the broadcast address 13727 * has been already set thus we do the same here. 13728 */ 13729 if (ipif->ipif_flags & IPIF_BROADCAST) { 13730 ipaddr_t v4addr; 13731 13732 v4addr = ipif->ipif_subnet | ~ipif->ipif_net_mask; 13733 IN6_IPADDR_TO_V4MAPPED(v4addr, &ipif->ipif_v6brd_addr); 13734 } 13735 } else { 13736 /* 13737 * Interface holds an IPv6-only address. Default 13738 * mask is all-ones. 
13739 */ 13740 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask)) 13741 ipif->ipif_v6net_mask = ipv6_all_ones; 13742 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 13743 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 13744 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 13745 } else { 13746 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 13747 ipif->ipif_v6net_mask, ipif->ipif_v6subnet); 13748 } 13749 } 13750 } 13751 13752 /* 13753 * Return 0 if this address can be used as local address without causing 13754 * duplicate address problems. Otherwise, return EADDRNOTAVAIL if the address 13755 * is already up on a different ill, and EADDRINUSE if it's up on the same ill. 13756 * Note that the same IPv6 link-local address is allowed as long as the ills 13757 * are not on the same link. 13758 */ 13759 int 13760 ip_addr_availability_check(ipif_t *new_ipif) 13761 { 13762 in6_addr_t our_v6addr; 13763 ill_t *ill; 13764 ipif_t *ipif; 13765 ill_walk_context_t ctx; 13766 ip_stack_t *ipst = new_ipif->ipif_ill->ill_ipst; 13767 13768 ASSERT(IAM_WRITER_IPIF(new_ipif)); 13769 ASSERT(MUTEX_HELD(&ipst->ips_ip_addr_avail_lock)); 13770 ASSERT(RW_READ_HELD(&ipst->ips_ill_g_lock)); 13771 13772 new_ipif->ipif_flags &= ~IPIF_UNNUMBERED; 13773 if (IN6_IS_ADDR_UNSPECIFIED(&new_ipif->ipif_v6lcl_addr) || 13774 IN6_IS_ADDR_V4MAPPED_ANY(&new_ipif->ipif_v6lcl_addr)) 13775 return (0); 13776 13777 our_v6addr = new_ipif->ipif_v6lcl_addr; 13778 13779 if (new_ipif->ipif_isv6) 13780 ill = ILL_START_WALK_V6(&ctx, ipst); 13781 else 13782 ill = ILL_START_WALK_V4(&ctx, ipst); 13783 13784 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 13785 for (ipif = ill->ill_ipif; ipif != NULL; 13786 ipif = ipif->ipif_next) { 13787 if ((ipif == new_ipif) || 13788 !(ipif->ipif_flags & IPIF_UP) || 13789 (ipif->ipif_flags & IPIF_UNNUMBERED) || 13790 !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, 13791 &our_v6addr)) 13792 continue; 13793 13794 if (new_ipif->ipif_flags & IPIF_POINTOPOINT) 13795 new_ipif->ipif_flags |= IPIF_UNNUMBERED; 13796 
else if (ipif->ipif_flags & IPIF_POINTOPOINT) 13797 ipif->ipif_flags |= IPIF_UNNUMBERED; 13798 else if ((IN6_IS_ADDR_LINKLOCAL(&our_v6addr) || 13799 IN6_IS_ADDR_SITELOCAL(&our_v6addr)) && 13800 !IS_ON_SAME_LAN(ill, new_ipif->ipif_ill)) 13801 continue; 13802 else if (new_ipif->ipif_zoneid != ipif->ipif_zoneid && 13803 ipif->ipif_zoneid != ALL_ZONES && IS_LOOPBACK(ill)) 13804 continue; 13805 else if (new_ipif->ipif_ill == ill) 13806 return (EADDRINUSE); 13807 else 13808 return (EADDRNOTAVAIL); 13809 } 13810 } 13811 13812 return (0); 13813 } 13814 13815 /* 13816 * Bring up an ipif: bring up arp/ndp, bring up the DLPI stream, and add 13817 * IREs for the ipif. 13818 * When the routine returns EINPROGRESS then mp has been consumed and 13819 * the ioctl will be acked from ip_rput_dlpi. 13820 */ 13821 int 13822 ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) 13823 { 13824 ill_t *ill = ipif->ipif_ill; 13825 boolean_t isv6 = ipif->ipif_isv6; 13826 int err = 0; 13827 boolean_t success; 13828 uint_t ipif_orig_id; 13829 ip_stack_t *ipst = ill->ill_ipst; 13830 13831 ASSERT(IAM_WRITER_IPIF(ipif)); 13832 13833 ip1dbg(("ipif_up(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 13834 DTRACE_PROBE3(ipif__downup, char *, "ipif_up", 13835 ill_t *, ill, ipif_t *, ipif); 13836 13837 /* Shouldn't get here if it is already up. */ 13838 if (ipif->ipif_flags & IPIF_UP) 13839 return (EALREADY); 13840 13841 /* 13842 * If this is a request to bring up a data address on an interface 13843 * under IPMP, then move the address to its IPMP meta-interface and 13844 * try to bring it up. One complication is that the zeroth ipif for 13845 * an ill is special, in that every ill always has one, and that code 13846 * throughout IP deferences ill->ill_ipif without holding any locks. 
13847 */ 13848 if (IS_UNDER_IPMP(ill) && ipmp_ipif_is_dataaddr(ipif) && 13849 (!ipif->ipif_isv6 || !V6_IPIF_LINKLOCAL(ipif))) { 13850 ipif_t *stubipif = NULL, *moveipif = NULL; 13851 ill_t *ipmp_ill = ipmp_illgrp_ipmp_ill(ill->ill_grp); 13852 13853 /* 13854 * The ipif being brought up should be quiesced. If it's not, 13855 * something has gone amiss and we need to bail out. (If it's 13856 * quiesced, we know it will remain so via IPIF_CONDEMNED.) 13857 */ 13858 mutex_enter(&ill->ill_lock); 13859 if (!ipif_is_quiescent(ipif)) { 13860 mutex_exit(&ill->ill_lock); 13861 return (EINVAL); 13862 } 13863 mutex_exit(&ill->ill_lock); 13864 13865 /* 13866 * If we're going to need to allocate ipifs, do it prior 13867 * to starting the move (and grabbing locks). 13868 */ 13869 if (ipif->ipif_id == 0) { 13870 if ((moveipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE, 13871 B_FALSE, &err)) == NULL) { 13872 return (err); 13873 } 13874 if ((stubipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE, 13875 B_FALSE, &err)) == NULL) { 13876 mi_free(moveipif); 13877 return (err); 13878 } 13879 } 13880 13881 /* 13882 * Grab or transfer the ipif to move. During the move, keep 13883 * ill_g_lock held to prevent any ill walker threads from 13884 * seeing things in an inconsistent state. 13885 */ 13886 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 13887 if (ipif->ipif_id != 0) { 13888 ipif_remove(ipif); 13889 } else { 13890 ipif_transfer(ipif, moveipif, stubipif); 13891 ipif = moveipif; 13892 } 13893 13894 /* 13895 * Place the ipif on the IPMP ill. If the zeroth ipif on 13896 * the IPMP ill is a stub (0.0.0.0 down address) then we 13897 * replace that one. Otherwise, pick the next available slot. 
13898 */ 13899 ipif->ipif_ill = ipmp_ill; 13900 ipif_orig_id = ipif->ipif_id; 13901 13902 if (ipmp_ipif_is_stubaddr(ipmp_ill->ill_ipif)) { 13903 ipif_transfer(ipif, ipmp_ill->ill_ipif, NULL); 13904 ipif = ipmp_ill->ill_ipif; 13905 } else { 13906 ipif->ipif_id = -1; 13907 if ((err = ipif_insert(ipif, B_FALSE)) != 0) { 13908 /* 13909 * No more available ipif_id's -- put it back 13910 * on the original ill and fail the operation. 13911 * Since we're writer on the ill, we can be 13912 * sure our old slot is still available. 13913 */ 13914 ipif->ipif_id = ipif_orig_id; 13915 ipif->ipif_ill = ill; 13916 if (ipif_orig_id == 0) { 13917 ipif_transfer(ipif, ill->ill_ipif, 13918 NULL); 13919 } else { 13920 VERIFY(ipif_insert(ipif, B_FALSE) == 0); 13921 } 13922 rw_exit(&ipst->ips_ill_g_lock); 13923 return (err); 13924 } 13925 } 13926 rw_exit(&ipst->ips_ill_g_lock); 13927 13928 /* 13929 * Tell SCTP that the ipif has moved. Note that even if we 13930 * had to allocate a new ipif, the original sequence id was 13931 * preserved and therefore SCTP won't know. 13932 */ 13933 sctp_move_ipif(ipif, ill, ipmp_ill); 13934 13935 /* 13936 * If the ipif being brought up was on slot zero, then we 13937 * first need to bring up the placeholder we stuck there. In 13938 * ip_rput_dlpi_writer(), arp_bringup_done(), or the recursive 13939 * call to ipif_up() itself, if we successfully bring up the 13940 * placeholder, we'll check ill_move_ipif and bring it up too. 13941 */ 13942 if (ipif_orig_id == 0) { 13943 ASSERT(ill->ill_move_ipif == NULL); 13944 ill->ill_move_ipif = ipif; 13945 if ((err = ipif_up(ill->ill_ipif, q, mp)) == 0) 13946 ASSERT(ill->ill_move_ipif == NULL); 13947 if (err != EINPROGRESS) 13948 ill->ill_move_ipif = NULL; 13949 return (err); 13950 } 13951 13952 /* 13953 * Bring it up on the IPMP ill. 13954 */ 13955 return (ipif_up(ipif, q, mp)); 13956 } 13957 13958 /* Skip arp/ndp for any loopback interface. */ 13959 if (ill->ill_wq != NULL) { 13960 conn_t *connp = CONN_Q(q) ? 
Q_TO_CONN(q) : NULL; 13961 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 13962 13963 if (!ill->ill_dl_up) { 13964 /* 13965 * ill_dl_up is not yet set. i.e. we are yet to 13966 * DL_BIND with the driver and this is the first 13967 * logical interface on the ill to become "up". 13968 * Tell the driver to get going (via DL_BIND_REQ). 13969 * Note that changing "significant" IFF_ flags 13970 * address/netmask etc cause a down/up dance, but 13971 * does not cause an unbind (DL_UNBIND) with the driver 13972 */ 13973 return (ill_dl_up(ill, ipif, mp, q)); 13974 } 13975 13976 /* 13977 * ipif_resolver_up may end up needeing to bind/attach 13978 * the ARP stream, which in turn necessitates a 13979 * DLPI message exchange with the driver. ioctls are 13980 * serialized and so we cannot send more than one 13981 * interface up message at a time. If ipif_resolver_up 13982 * does need to wait for the DLPI handshake for the ARP stream, 13983 * we get EINPROGRESS and we will complete in arp_bringup_done. 13984 */ 13985 13986 ASSERT(connp != NULL || !CONN_Q(q)); 13987 if (connp != NULL) 13988 mutex_enter(&connp->conn_lock); 13989 mutex_enter(&ill->ill_lock); 13990 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0); 13991 mutex_exit(&ill->ill_lock); 13992 if (connp != NULL) 13993 mutex_exit(&connp->conn_lock); 13994 if (!success) 13995 return (EINTR); 13996 13997 /* 13998 * Crank up IPv6 neighbor discovery. Unlike ARP, this should 13999 * complete when ipif_ndp_up returns. 14000 */ 14001 err = ipif_resolver_up(ipif, Res_act_initial); 14002 if (err == EINPROGRESS) { 14003 /* We will complete it in arp_bringup_done() */ 14004 return (err); 14005 } 14006 14007 if (isv6 && err == 0) 14008 err = ipif_ndp_up(ipif, B_TRUE); 14009 14010 ASSERT(err != EINPROGRESS); 14011 mp = ipsq_pending_mp_get(ipsq, &connp); 14012 ASSERT(mp != NULL); 14013 if (err != 0) 14014 return (err); 14015 } else { 14016 /* 14017 * Interfaces without underlying hardware don't do duplicate 14018 * address detection. 
14019 */ 14020 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); 14021 ipif->ipif_addr_ready = 1; 14022 err = ill_add_ires(ill); 14023 /* allocation failure? */ 14024 if (err != 0) 14025 return (err); 14026 } 14027 14028 err = (isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif)); 14029 if (err == 0 && ill->ill_move_ipif != NULL) { 14030 ipif = ill->ill_move_ipif; 14031 ill->ill_move_ipif = NULL; 14032 return (ipif_up(ipif, q, mp)); 14033 } 14034 return (err); 14035 } 14036 14037 /* 14038 * Add any IREs tied to the ill. For now this is just an IRE_MULTICAST. 14039 * The identical set of IREs need to be removed in ill_delete_ires(). 14040 */ 14041 int 14042 ill_add_ires(ill_t *ill) 14043 { 14044 ire_t *ire; 14045 in6_addr_t dummy6 = {(uint32_t)V6_MCAST, 0, 0, 1}; 14046 in_addr_t dummy4 = htonl(INADDR_ALLHOSTS_GROUP); 14047 14048 if (ill->ill_ire_multicast != NULL) 14049 return (0); 14050 14051 /* 14052 * provide some dummy ire_addr for creating the ire. 14053 */ 14054 if (ill->ill_isv6) { 14055 ire = ire_create_v6(&dummy6, 0, 0, IRE_MULTICAST, ill, 14056 ALL_ZONES, RTF_UP, NULL, ill->ill_ipst); 14057 } else { 14058 ire = ire_create((uchar_t *)&dummy4, 0, 0, IRE_MULTICAST, ill, 14059 ALL_ZONES, RTF_UP, NULL, ill->ill_ipst); 14060 } 14061 if (ire == NULL) 14062 return (ENOMEM); 14063 14064 ill->ill_ire_multicast = ire; 14065 return (0); 14066 } 14067 14068 void 14069 ill_delete_ires(ill_t *ill) 14070 { 14071 if (ill->ill_ire_multicast != NULL) { 14072 /* 14073 * BIND/ATTACH completed; Release the ref for ill_ire_multicast 14074 * which was taken without any th_tracing enabled. 14075 * We also mark it as condemned (note that it was never added) 14076 * so that caching conn's can move off of it. 14077 */ 14078 ire_make_condemned(ill->ill_ire_multicast); 14079 ire_refrele_notr(ill->ill_ire_multicast); 14080 ill->ill_ire_multicast = NULL; 14081 } 14082 } 14083 14084 /* 14085 * Perform a bind for the physical device. 
14086 * When the routine returns EINPROGRESS then mp has been consumed and 14087 * the ioctl will be acked from ip_rput_dlpi. 14088 * Allocate an unbind message and save it until ipif_down. 14089 */ 14090 static int 14091 ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) 14092 { 14093 mblk_t *bind_mp = NULL; 14094 mblk_t *unbind_mp = NULL; 14095 conn_t *connp; 14096 boolean_t success; 14097 int err; 14098 14099 DTRACE_PROBE2(ill__downup, char *, "ill_dl_up", ill_t *, ill); 14100 14101 ip1dbg(("ill_dl_up(%s)\n", ill->ill_name)); 14102 ASSERT(IAM_WRITER_ILL(ill)); 14103 ASSERT(mp != NULL); 14104 14105 /* 14106 * Make sure we have an IRE_MULTICAST in case we immediately 14107 * start receiving packets. 14108 */ 14109 err = ill_add_ires(ill); 14110 if (err != 0) 14111 goto bad; 14112 14113 bind_mp = ip_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long), 14114 DL_BIND_REQ); 14115 if (bind_mp == NULL) 14116 goto bad; 14117 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ill->ill_sap; 14118 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS; 14119 14120 unbind_mp = ip_dlpi_alloc(sizeof (dl_unbind_req_t), DL_UNBIND_REQ); 14121 if (unbind_mp == NULL) 14122 goto bad; 14123 14124 /* 14125 * Record state needed to complete this operation when the 14126 * DL_BIND_ACK shows up. Also remember the pre-allocated mblks. 14127 */ 14128 connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL; 14129 ASSERT(connp != NULL || !CONN_Q(q)); 14130 GRAB_CONN_LOCK(q); 14131 mutex_enter(&ipif->ipif_ill->ill_lock); 14132 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0); 14133 mutex_exit(&ipif->ipif_ill->ill_lock); 14134 RELEASE_CONN_LOCK(q); 14135 if (!success) 14136 goto bad; 14137 14138 /* 14139 * Save the unbind message for ill_dl_down(); it will be consumed when 14140 * the interface goes down. 
14141 */ 14142 ASSERT(ill->ill_unbind_mp == NULL); 14143 ill->ill_unbind_mp = unbind_mp; 14144 14145 ill_dlpi_send(ill, bind_mp); 14146 /* Send down link-layer capabilities probe if not already done. */ 14147 ill_capability_probe(ill); 14148 14149 /* 14150 * Sysid used to rely on the fact that netboots set domainname 14151 * and the like. Now that miniroot boots aren't strictly netboots 14152 * and miniroot network configuration is driven from userland 14153 * these things still need to be set. This situation can be detected 14154 * by comparing the interface being configured here to the one 14155 * dhcifname was set to reference by the boot loader. Once sysid is 14156 * converted to use dhcp_ipc_getinfo() this call can go away. 14157 */ 14158 if ((ipif->ipif_flags & IPIF_DHCPRUNNING) && 14159 (strcmp(ill->ill_name, dhcifname) == 0) && 14160 (strlen(srpc_domain) == 0)) { 14161 if (dhcpinit() != 0) 14162 cmn_err(CE_WARN, "no cached dhcp response"); 14163 } 14164 14165 /* 14166 * This operation will complete in ip_rput_dlpi with either 14167 * a DL_BIND_ACK or DL_ERROR_ACK. 14168 */ 14169 return (EINPROGRESS); 14170 bad: 14171 ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name)); 14172 14173 freemsg(bind_mp); 14174 freemsg(unbind_mp); 14175 return (ENOMEM); 14176 } 14177 14178 /* Add room for tcp+ip headers */ 14179 uint_t ip_loopback_mtuplus = IP_LOOPBACK_MTU + IP_SIMPLE_HDR_LENGTH + 20; 14180 14181 /* 14182 * DLPI and ARP is up. 14183 * Create all the IREs associated with an interface. Bring up multicast. 14184 * Set the interface flag and finish other initialization 14185 * that potentially had to be deferred to after DL_BIND_ACK. 
14186 */ 14187 int 14188 ipif_up_done(ipif_t *ipif) 14189 { 14190 ill_t *ill = ipif->ipif_ill; 14191 int err = 0; 14192 boolean_t loopback = B_FALSE; 14193 boolean_t update_src_selection = B_TRUE; 14194 ipif_t *tmp_ipif; 14195 14196 ip1dbg(("ipif_up_done(%s:%u)\n", 14197 ipif->ipif_ill->ill_name, ipif->ipif_id)); 14198 DTRACE_PROBE3(ipif__downup, char *, "ipif_up_done", 14199 ill_t *, ill, ipif_t *, ipif); 14200 14201 /* Check if this is a loopback interface */ 14202 if (ipif->ipif_ill->ill_wq == NULL) 14203 loopback = B_TRUE; 14204 14205 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 14206 14207 /* 14208 * If all other interfaces for this ill are down or DEPRECATED, 14209 * or otherwise unsuitable for source address selection, 14210 * reset the src generation numbers to make sure source 14211 * address selection gets to take this new ipif into account. 14212 * No need to hold ill_lock while traversing the ipif list since 14213 * we are writer 14214 */ 14215 for (tmp_ipif = ill->ill_ipif; tmp_ipif; 14216 tmp_ipif = tmp_ipif->ipif_next) { 14217 if (((tmp_ipif->ipif_flags & 14218 (IPIF_NOXMIT|IPIF_ANYCAST|IPIF_NOLOCAL|IPIF_DEPRECATED)) || 14219 !(tmp_ipif->ipif_flags & IPIF_UP)) || 14220 (tmp_ipif == ipif)) 14221 continue; 14222 /* first useable pre-existing interface */ 14223 update_src_selection = B_FALSE; 14224 break; 14225 } 14226 if (update_src_selection) 14227 ip_update_source_selection(ill->ill_ipst); 14228 14229 if (IS_LOOPBACK(ill) || ill->ill_net_type == IRE_IF_NORESOLVER) { 14230 nce_t *loop_nce = NULL; 14231 uint16_t flags = (NCE_F_MYADDR | NCE_F_AUTHORITY | NCE_F_NONUD); 14232 14233 /* 14234 * lo0:1 and subsequent ipifs were marked IRE_LOCAL in 14235 * ipif_lookup_on_name(), but in the case of zones we can have 14236 * several loopback addresses on lo0. So all the interfaces with 14237 * loopback addresses need to be marked IRE_LOOPBACK. 
14238 */ 14239 if (V4_PART_OF_V6(ipif->ipif_v6lcl_addr) == 14240 htonl(INADDR_LOOPBACK)) 14241 ipif->ipif_ire_type = IRE_LOOPBACK; 14242 else 14243 ipif->ipif_ire_type = IRE_LOCAL; 14244 if (ill->ill_net_type != IRE_LOOPBACK) 14245 flags |= NCE_F_PUBLISH; 14246 14247 /* add unicast nce for the local addr */ 14248 err = nce_lookup_then_add_v4(ill, NULL, 14249 ill->ill_phys_addr_length, &ipif->ipif_lcl_addr, flags, 14250 ND_REACHABLE, &loop_nce); 14251 /* A shared-IP zone sees EEXIST for lo0:N */ 14252 if (err == 0 || err == EEXIST) { 14253 ipif->ipif_added_nce = 1; 14254 loop_nce->nce_ipif_cnt++; 14255 nce_refrele(loop_nce); 14256 err = 0; 14257 } else { 14258 ASSERT(loop_nce == NULL); 14259 return (err); 14260 } 14261 } 14262 14263 /* Create all the IREs associated with this interface */ 14264 err = ipif_add_ires_v4(ipif, loopback); 14265 if (err != 0) { 14266 /* 14267 * see comments about return value from 14268 * ip_addr_availability_check() in ipif_add_ires_v4(). 14269 */ 14270 if (err != EADDRINUSE) { 14271 (void) ipif_arp_down(ipif); 14272 } else { 14273 /* 14274 * Make IPMP aware of the deleted ipif so that 14275 * the needed ipmp cleanup (e.g., of ipif_bound_ill) 14276 * can be completed. Note that we do not want to 14277 * destroy the nce that was created on the ipmp_ill 14278 * for the active copy of the duplicate address in 14279 * use. 14280 */ 14281 if (IS_IPMP(ill)) 14282 ipmp_illgrp_del_ipif(ill->ill_grp, ipif); 14283 err = EADDRNOTAVAIL; 14284 } 14285 return (err); 14286 } 14287 14288 if (ill->ill_ipif_up_count == 1 && !loopback) { 14289 /* Recover any additional IREs entries for this ill */ 14290 (void) ill_recover_saved_ire(ill); 14291 } 14292 14293 if (ill->ill_need_recover_multicast) { 14294 /* 14295 * Need to recover all multicast memberships in the driver. 14296 * This had to be deferred until we had attached. The same 14297 * code exists in ipif_up_done_v6() to recover IPv6 14298 * memberships. 
14299 * 14300 * Note that it would be preferable to unconditionally do the 14301 * ill_recover_multicast() in ill_dl_up(), but we cannot do 14302 * that since ill_join_allmulti() depends on ill_dl_up being 14303 * set, and it is not set until we receive a DL_BIND_ACK after 14304 * having called ill_dl_up(). 14305 */ 14306 ill_recover_multicast(ill); 14307 } 14308 14309 if (ill->ill_ipif_up_count == 1) { 14310 /* 14311 * Since the interface is now up, it may now be active. 14312 */ 14313 if (IS_UNDER_IPMP(ill)) 14314 ipmp_ill_refresh_active(ill); 14315 14316 /* 14317 * If this is an IPMP interface, we may now be able to 14318 * establish ARP entries. 14319 */ 14320 if (IS_IPMP(ill)) 14321 ipmp_illgrp_refresh_arpent(ill->ill_grp); 14322 } 14323 14324 /* Join the allhosts multicast address */ 14325 ipif_multicast_up(ipif); 14326 14327 if (!loopback && !update_src_selection && 14328 !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) 14329 ip_update_source_selection(ill->ill_ipst); 14330 14331 if (!loopback && ipif->ipif_addr_ready) { 14332 /* Broadcast an address mask reply. */ 14333 ipif_mask_reply(ipif); 14334 } 14335 /* Perhaps ilgs should use this ill */ 14336 update_conn_ill(NULL, ill->ill_ipst); 14337 14338 /* 14339 * This had to be deferred until we had bound. Tell routing sockets and 14340 * others that this interface is up if it looks like the address has 14341 * been validated. Otherwise, if it isn't ready yet, wait for 14342 * duplicate address detection to do its thing. 14343 */ 14344 if (ipif->ipif_addr_ready) 14345 ipif_up_notify(ipif); 14346 return (0); 14347 } 14348 14349 /* 14350 * Add the IREs associated with the ipif. 14351 * Those MUST be explicitly removed in ipif_delete_ires_v4. 
14352 */ 14353 static int 14354 ipif_add_ires_v4(ipif_t *ipif, boolean_t loopback) 14355 { 14356 ill_t *ill = ipif->ipif_ill; 14357 ip_stack_t *ipst = ill->ill_ipst; 14358 ire_t *ire_array[20]; 14359 ire_t **irep = ire_array; 14360 ire_t **irep1; 14361 ipaddr_t net_mask = 0; 14362 ipaddr_t subnet_mask, route_mask; 14363 int err; 14364 ire_t *ire_local = NULL; /* LOCAL or LOOPBACK */ 14365 ire_t *ire_if = NULL; 14366 14367 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 14368 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 14369 /* 14370 * If we're on a labeled system then make sure that zone- 14371 * private addresses have proper remote host database entries. 14372 */ 14373 if (is_system_labeled() && 14374 ipif->ipif_ire_type != IRE_LOOPBACK && 14375 !tsol_check_interface_address(ipif)) 14376 return (EINVAL); 14377 14378 /* Register the source address for __sin6_src_id */ 14379 err = ip_srcid_insert(&ipif->ipif_v6lcl_addr, 14380 ipif->ipif_zoneid, ipst); 14381 if (err != 0) { 14382 ip0dbg(("ipif_add_ires: srcid_insert %d\n", err)); 14383 return (err); 14384 } 14385 14386 /* If the interface address is set, create the local IRE. */ 14387 ire_local = ire_create( 14388 (uchar_t *)&ipif->ipif_lcl_addr, /* dest address */ 14389 (uchar_t *)&ip_g_all_ones, /* mask */ 14390 NULL, /* no gateway */ 14391 ipif->ipif_ire_type, /* LOCAL or LOOPBACK */ 14392 ipif->ipif_ill, 14393 ipif->ipif_zoneid, 14394 ((ipif->ipif_flags & IPIF_PRIVATE) ? 
14395 RTF_PRIVATE : 0) | RTF_KERNEL, 14396 NULL, 14397 ipst); 14398 ip1dbg(("ipif_add_ires: 0x%p creating IRE %p type 0x%x" 14399 " for 0x%x\n", (void *)ipif, (void *)ire_local, 14400 ipif->ipif_ire_type, 14401 ntohl(ipif->ipif_lcl_addr))); 14402 if (ire_local == NULL) { 14403 ip1dbg(("ipif_up_done: NULL ire_local\n")); 14404 err = ENOMEM; 14405 goto bad; 14406 } 14407 } else { 14408 ip1dbg(( 14409 "ipif_add_ires: not creating IRE %d for 0x%x: flags 0x%x\n", 14410 ipif->ipif_ire_type, 14411 ntohl(ipif->ipif_lcl_addr), 14412 (uint_t)ipif->ipif_flags)); 14413 } 14414 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 14415 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 14416 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 14417 } else { 14418 net_mask = htonl(IN_CLASSA_NET); /* fallback */ 14419 } 14420 14421 subnet_mask = ipif->ipif_net_mask; 14422 14423 /* 14424 * If mask was not specified, use natural netmask of 14425 * interface address. Also, store this mask back into the 14426 * ipif struct. 14427 */ 14428 if (subnet_mask == 0) { 14429 subnet_mask = net_mask; 14430 V4MASK_TO_V6(subnet_mask, ipif->ipif_v6net_mask); 14431 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 14432 ipif->ipif_v6subnet); 14433 } 14434 14435 /* Set up the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. 
*/ 14436 if (!loopback && !(ipif->ipif_flags & IPIF_NOXMIT) && 14437 ipif->ipif_subnet != INADDR_ANY) { 14438 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 14439 14440 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 14441 route_mask = IP_HOST_MASK; 14442 } else { 14443 route_mask = subnet_mask; 14444 } 14445 14446 ip1dbg(("ipif_add_ires: ipif 0x%p ill 0x%p " 14447 "creating if IRE ill_net_type 0x%x for 0x%x\n", 14448 (void *)ipif, (void *)ill, ill->ill_net_type, 14449 ntohl(ipif->ipif_subnet))); 14450 ire_if = ire_create( 14451 (uchar_t *)&ipif->ipif_subnet, 14452 (uchar_t *)&route_mask, 14453 (uchar_t *)&ipif->ipif_lcl_addr, 14454 ill->ill_net_type, 14455 ill, 14456 ipif->ipif_zoneid, 14457 ((ipif->ipif_flags & IPIF_PRIVATE) ? 14458 RTF_PRIVATE: 0) | RTF_KERNEL, 14459 NULL, 14460 ipst); 14461 if (ire_if == NULL) { 14462 ip1dbg(("ipif_up_done: NULL ire_if\n")); 14463 err = ENOMEM; 14464 goto bad; 14465 } 14466 } 14467 14468 /* 14469 * Create any necessary broadcast IREs. 14470 */ 14471 if ((ipif->ipif_flags & IPIF_BROADCAST) && 14472 !(ipif->ipif_flags & IPIF_NOXMIT)) 14473 irep = ipif_create_bcast_ires(ipif, irep); 14474 14475 /* If an earlier ire_create failed, get out now */ 14476 for (irep1 = irep; irep1 > ire_array; ) { 14477 irep1--; 14478 if (*irep1 == NULL) { 14479 ip1dbg(("ipif_up_done: NULL ire found in ire_array\n")); 14480 err = ENOMEM; 14481 goto bad; 14482 } 14483 } 14484 14485 /* 14486 * Need to atomically check for IP address availability under 14487 * ip_addr_avail_lock. ill_g_lock is held as reader to ensure no new 14488 * ills or new ipifs can be added while we are checking availability. 14489 */ 14490 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 14491 mutex_enter(&ipst->ips_ip_addr_avail_lock); 14492 /* Mark it up, and increment counters. 
*/ 14493 ipif->ipif_flags |= IPIF_UP; 14494 ill->ill_ipif_up_count++; 14495 err = ip_addr_availability_check(ipif); 14496 mutex_exit(&ipst->ips_ip_addr_avail_lock); 14497 rw_exit(&ipst->ips_ill_g_lock); 14498 14499 if (err != 0) { 14500 /* 14501 * Our address may already be up on the same ill. In this case, 14502 * the ARP entry for our ipif replaced the one for the other 14503 * ipif. So we don't want to delete it (otherwise the other ipif 14504 * would be unable to send packets). 14505 * ip_addr_availability_check() identifies this case for us and 14506 * returns EADDRINUSE; Caller should turn it into EADDRNOTAVAIL 14507 * which is the expected error code. 14508 */ 14509 ill->ill_ipif_up_count--; 14510 ipif->ipif_flags &= ~IPIF_UP; 14511 goto bad; 14512 } 14513 14514 /* 14515 * Add in all newly created IREs. ire_create_bcast() has 14516 * already checked for duplicates of the IRE_BROADCAST type. 14517 * We add the IRE_INTERFACE before the IRE_LOCAL to ensure 14518 * that lookups find the IRE_LOCAL even if the IRE_INTERFACE is 14519 * a /32 route. 14520 */ 14521 if (ire_if != NULL) { 14522 ire_if = ire_add(ire_if); 14523 if (ire_if == NULL) { 14524 err = ENOMEM; 14525 goto bad2; 14526 } 14527 #ifdef DEBUG 14528 ire_refhold_notr(ire_if); 14529 ire_refrele(ire_if); 14530 #endif 14531 } 14532 if (ire_local != NULL) { 14533 ire_local = ire_add(ire_local); 14534 if (ire_local == NULL) { 14535 err = ENOMEM; 14536 goto bad2; 14537 } 14538 #ifdef DEBUG 14539 ire_refhold_notr(ire_local); 14540 ire_refrele(ire_local); 14541 #endif 14542 } 14543 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 14544 if (ire_local != NULL) 14545 ipif->ipif_ire_local = ire_local; 14546 if (ire_if != NULL) 14547 ipif->ipif_ire_if = ire_if; 14548 rw_exit(&ipst->ips_ill_g_lock); 14549 ire_local = NULL; 14550 ire_if = NULL; 14551 14552 /* 14553 * We first add all of them, and if that succeeds we refrele the 14554 * bunch. That enables us to delete all of them should any of the 14555 * ire_adds fail. 
14556 */ 14557 for (irep1 = irep; irep1 > ire_array; ) { 14558 irep1--; 14559 ASSERT(!MUTEX_HELD(&((*irep1)->ire_ill->ill_lock))); 14560 *irep1 = ire_add(*irep1); 14561 if (*irep1 == NULL) { 14562 err = ENOMEM; 14563 goto bad2; 14564 } 14565 } 14566 14567 for (irep1 = irep; irep1 > ire_array; ) { 14568 irep1--; 14569 /* refheld by ire_add. */ 14570 if (*irep1 != NULL) { 14571 ire_refrele(*irep1); 14572 *irep1 = NULL; 14573 } 14574 } 14575 14576 if (!loopback) { 14577 /* 14578 * If the broadcast address has been set, make sure it makes 14579 * sense based on the interface address. 14580 * Only match on ill since we are sharing broadcast addresses. 14581 */ 14582 if ((ipif->ipif_brd_addr != INADDR_ANY) && 14583 (ipif->ipif_flags & IPIF_BROADCAST)) { 14584 ire_t *ire; 14585 14586 ire = ire_ftable_lookup_v4(ipif->ipif_brd_addr, 0, 0, 14587 IRE_BROADCAST, ipif->ipif_ill, ALL_ZONES, NULL, 14588 (MATCH_IRE_TYPE | MATCH_IRE_ILL), 0, ipst, NULL); 14589 14590 if (ire == NULL) { 14591 /* 14592 * If there isn't a matching broadcast IRE, 14593 * revert to the default for this netmask. 
14594 */ 14595 ipif->ipif_v6brd_addr = ipv6_all_zeros; 14596 mutex_enter(&ipif->ipif_ill->ill_lock); 14597 ipif_set_default(ipif); 14598 mutex_exit(&ipif->ipif_ill->ill_lock); 14599 } else { 14600 ire_refrele(ire); 14601 } 14602 } 14603 14604 } 14605 return (0); 14606 14607 bad2: 14608 ill->ill_ipif_up_count--; 14609 ipif->ipif_flags &= ~IPIF_UP; 14610 14611 bad: 14612 ip1dbg(("ipif_add_ires: FAILED \n")); 14613 if (ire_local != NULL) 14614 ire_delete(ire_local); 14615 if (ire_if != NULL) 14616 ire_delete(ire_if); 14617 14618 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 14619 ire_local = ipif->ipif_ire_local; 14620 ipif->ipif_ire_local = NULL; 14621 ire_if = ipif->ipif_ire_if; 14622 ipif->ipif_ire_if = NULL; 14623 rw_exit(&ipst->ips_ill_g_lock); 14624 if (ire_local != NULL) { 14625 ire_delete(ire_local); 14626 ire_refrele_notr(ire_local); 14627 } 14628 if (ire_if != NULL) { 14629 ire_delete(ire_if); 14630 ire_refrele_notr(ire_if); 14631 } 14632 14633 while (irep > ire_array) { 14634 irep--; 14635 if (*irep != NULL) { 14636 ire_delete(*irep); 14637 } 14638 } 14639 (void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst); 14640 14641 return (err); 14642 } 14643 14644 /* Remove all the IREs created by ipif_add_ires_v4 */ 14645 void 14646 ipif_delete_ires_v4(ipif_t *ipif) 14647 { 14648 ill_t *ill = ipif->ipif_ill; 14649 ip_stack_t *ipst = ill->ill_ipst; 14650 ire_t *ire; 14651 14652 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 14653 ire = ipif->ipif_ire_local; 14654 ipif->ipif_ire_local = NULL; 14655 rw_exit(&ipst->ips_ill_g_lock); 14656 if (ire != NULL) { 14657 /* 14658 * Move count to ipif so we don't loose the count due to 14659 * a down/up dance. 
14660 */ 14661 atomic_add_32(&ipif->ipif_ib_pkt_count, ire->ire_ib_pkt_count); 14662 14663 ire_delete(ire); 14664 ire_refrele_notr(ire); 14665 } 14666 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 14667 ire = ipif->ipif_ire_if; 14668 ipif->ipif_ire_if = NULL; 14669 rw_exit(&ipst->ips_ill_g_lock); 14670 if (ire != NULL) { 14671 ire_delete(ire); 14672 ire_refrele_notr(ire); 14673 } 14674 14675 /* 14676 * Delete the broadcast IREs. 14677 */ 14678 if ((ipif->ipif_flags & IPIF_BROADCAST) && 14679 !(ipif->ipif_flags & IPIF_NOXMIT)) 14680 ipif_delete_bcast_ires(ipif); 14681 } 14682 14683 /* 14684 * Checks for availbility of a usable source address (if there is one) when the 14685 * destination ILL has the ill_usesrc_ifindex pointing to another ILL. Note 14686 * this selection is done regardless of the destination. 14687 */ 14688 boolean_t 14689 ipif_zone_avail(uint_t ifindex, boolean_t isv6, zoneid_t zoneid, 14690 ip_stack_t *ipst) 14691 { 14692 ipif_t *ipif = NULL; 14693 ill_t *uill; 14694 14695 ASSERT(ifindex != 0); 14696 14697 uill = ill_lookup_on_ifindex(ifindex, isv6, ipst); 14698 if (uill == NULL) 14699 return (B_FALSE); 14700 14701 mutex_enter(&uill->ill_lock); 14702 for (ipif = uill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 14703 if (IPIF_IS_CONDEMNED(ipif)) 14704 continue; 14705 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 14706 continue; 14707 if (!(ipif->ipif_flags & IPIF_UP)) 14708 continue; 14709 if (ipif->ipif_zoneid != zoneid) 14710 continue; 14711 if (isv6 ? IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) : 14712 ipif->ipif_lcl_addr == INADDR_ANY) 14713 continue; 14714 mutex_exit(&uill->ill_lock); 14715 ill_refrele(uill); 14716 return (B_TRUE); 14717 } 14718 mutex_exit(&uill->ill_lock); 14719 ill_refrele(uill); 14720 return (B_FALSE); 14721 } 14722 14723 /* 14724 * Find an ipif with a good local address on the ill+zoneid. 
14725 */ 14726 ipif_t * 14727 ipif_good_addr(ill_t *ill, zoneid_t zoneid) 14728 { 14729 ipif_t *ipif; 14730 14731 mutex_enter(&ill->ill_lock); 14732 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 14733 if (IPIF_IS_CONDEMNED(ipif)) 14734 continue; 14735 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 14736 continue; 14737 if (!(ipif->ipif_flags & IPIF_UP)) 14738 continue; 14739 if (ipif->ipif_zoneid != zoneid && 14740 ipif->ipif_zoneid != ALL_ZONES && zoneid != ALL_ZONES) 14741 continue; 14742 if (ill->ill_isv6 ? 14743 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) : 14744 ipif->ipif_lcl_addr == INADDR_ANY) 14745 continue; 14746 ipif_refhold_locked(ipif); 14747 mutex_exit(&ill->ill_lock); 14748 return (ipif); 14749 } 14750 mutex_exit(&ill->ill_lock); 14751 return (NULL); 14752 } 14753 14754 /* 14755 * IP source address type, sorted from worst to best. For a given type, 14756 * always prefer IP addresses on the same subnet. All-zones addresses are 14757 * suboptimal because they pose problems with unlabeled destinations. 14758 */ 14759 typedef enum { 14760 IPIF_NONE, 14761 IPIF_DIFFNET_DEPRECATED, /* deprecated and different subnet */ 14762 IPIF_SAMENET_DEPRECATED, /* deprecated and same subnet */ 14763 IPIF_DIFFNET_ALLZONES, /* allzones and different subnet */ 14764 IPIF_SAMENET_ALLZONES, /* allzones and same subnet */ 14765 IPIF_DIFFNET, /* normal and different subnet */ 14766 IPIF_SAMENET, /* normal and same subnet */ 14767 IPIF_LOCALADDR /* local loopback */ 14768 } ipif_type_t; 14769 14770 /* 14771 * Pick the optimal ipif on `ill' for sending to destination `dst' from zone 14772 * `zoneid'. We rate usable ipifs from low -> high as per the ipif_type_t 14773 * enumeration, and return the highest-rated ipif. If there's a tie, we pick 14774 * the first one, unless IPMP is used in which case we round-robin among them; 14775 * see below for more. 14776 * 14777 * Returns NULL if there is no suitable source address for the ill. 
14778 * This only occurs when there is no valid source address for the ill. 14779 */ 14780 ipif_t * 14781 ipif_select_source_v4(ill_t *ill, ipaddr_t dst, zoneid_t zoneid, 14782 boolean_t allow_usesrc, boolean_t *notreadyp) 14783 { 14784 ill_t *usill = NULL; 14785 ill_t *ipmp_ill = NULL; 14786 ipif_t *start_ipif, *next_ipif, *ipif, *best_ipif; 14787 ipif_type_t type, best_type; 14788 tsol_tpc_t *src_rhtp, *dst_rhtp; 14789 ip_stack_t *ipst = ill->ill_ipst; 14790 boolean_t samenet; 14791 14792 if (ill->ill_usesrc_ifindex != 0 && allow_usesrc) { 14793 usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex, 14794 B_FALSE, ipst); 14795 if (usill != NULL) 14796 ill = usill; /* Select source from usesrc ILL */ 14797 else 14798 return (NULL); 14799 } 14800 14801 /* 14802 * Test addresses should never be used for source address selection, 14803 * so if we were passed one, switch to the IPMP meta-interface. 14804 */ 14805 if (IS_UNDER_IPMP(ill)) { 14806 if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) 14807 ill = ipmp_ill; /* Select source from IPMP ill */ 14808 else 14809 return (NULL); 14810 } 14811 14812 /* 14813 * If we're dealing with an unlabeled destination on a labeled system, 14814 * make sure that we ignore source addresses that are incompatible with 14815 * the destination's default label. That destination's default label 14816 * must dominate the minimum label on the source address. 14817 */ 14818 dst_rhtp = NULL; 14819 if (is_system_labeled()) { 14820 dst_rhtp = find_tpc(&dst, IPV4_VERSION, B_FALSE); 14821 if (dst_rhtp == NULL) 14822 return (NULL); 14823 if (dst_rhtp->tpc_tp.host_type != UNLABELED) { 14824 TPC_RELE(dst_rhtp); 14825 dst_rhtp = NULL; 14826 } 14827 } 14828 14829 /* 14830 * Hold the ill_g_lock as reader. This makes sure that no ipif/ill 14831 * can be deleted. But an ipif/ill can get CONDEMNED any time. 14832 * After selecting the right ipif, under ill_lock make sure ipif is 14833 * not condemned, and increment refcnt. 
If ipif is CONDEMNED, 14834 * we retry. Inside the loop we still need to check for CONDEMNED, 14835 * but not under a lock. 14836 */ 14837 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 14838 retry: 14839 /* 14840 * For source address selection, we treat the ipif list as circular 14841 * and continue until we get back to where we started. This allows 14842 * IPMP to vary source address selection (which improves inbound load 14843 * spreading) by caching its last ending point and starting from 14844 * there. NOTE: we don't have to worry about ill_src_ipif changing 14845 * ills since that can't happen on the IPMP ill. 14846 */ 14847 start_ipif = ill->ill_ipif; 14848 if (IS_IPMP(ill) && ill->ill_src_ipif != NULL) 14849 start_ipif = ill->ill_src_ipif; 14850 14851 ipif = start_ipif; 14852 best_ipif = NULL; 14853 best_type = IPIF_NONE; 14854 do { 14855 if ((next_ipif = ipif->ipif_next) == NULL) 14856 next_ipif = ill->ill_ipif; 14857 14858 if (IPIF_IS_CONDEMNED(ipif)) 14859 continue; 14860 /* Always skip NOLOCAL and ANYCAST interfaces */ 14861 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 14862 continue; 14863 /* Always skip NOACCEPT interfaces */ 14864 if (ipif->ipif_ill->ill_flags & ILLF_NOACCEPT) 14865 continue; 14866 if (!(ipif->ipif_flags & IPIF_UP)) 14867 continue; 14868 14869 if (!ipif->ipif_addr_ready) { 14870 if (notreadyp != NULL) 14871 *notreadyp = B_TRUE; 14872 continue; 14873 } 14874 14875 if (zoneid != ALL_ZONES && 14876 ipif->ipif_zoneid != zoneid && 14877 ipif->ipif_zoneid != ALL_ZONES) 14878 continue; 14879 14880 /* 14881 * Interfaces with 0.0.0.0 address are allowed to be UP, but 14882 * are not valid as source addresses. 14883 */ 14884 if (ipif->ipif_lcl_addr == INADDR_ANY) 14885 continue; 14886 14887 /* 14888 * Check compatibility of local address for destination's 14889 * default label if we're on a labeled system. Incompatible 14890 * addresses can't be used at all. 
14891 */ 14892 if (dst_rhtp != NULL) { 14893 boolean_t incompat; 14894 14895 src_rhtp = find_tpc(&ipif->ipif_lcl_addr, 14896 IPV4_VERSION, B_FALSE); 14897 if (src_rhtp == NULL) 14898 continue; 14899 incompat = src_rhtp->tpc_tp.host_type != SUN_CIPSO || 14900 src_rhtp->tpc_tp.tp_doi != 14901 dst_rhtp->tpc_tp.tp_doi || 14902 (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label, 14903 &src_rhtp->tpc_tp.tp_sl_range_cipso) && 14904 !blinlset(&dst_rhtp->tpc_tp.tp_def_label, 14905 src_rhtp->tpc_tp.tp_sl_set_cipso)); 14906 TPC_RELE(src_rhtp); 14907 if (incompat) 14908 continue; 14909 } 14910 14911 samenet = ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet); 14912 14913 if (ipif->ipif_lcl_addr == dst) { 14914 type = IPIF_LOCALADDR; 14915 } else if (ipif->ipif_flags & IPIF_DEPRECATED) { 14916 type = samenet ? IPIF_SAMENET_DEPRECATED : 14917 IPIF_DIFFNET_DEPRECATED; 14918 } else if (ipif->ipif_zoneid == ALL_ZONES) { 14919 type = samenet ? IPIF_SAMENET_ALLZONES : 14920 IPIF_DIFFNET_ALLZONES; 14921 } else { 14922 type = samenet ? IPIF_SAMENET : IPIF_DIFFNET; 14923 } 14924 14925 if (type > best_type) { 14926 best_type = type; 14927 best_ipif = ipif; 14928 if (best_type == IPIF_LOCALADDR) 14929 break; /* can't get better */ 14930 } 14931 } while ((ipif = next_ipif) != start_ipif); 14932 14933 if ((ipif = best_ipif) != NULL) { 14934 mutex_enter(&ipif->ipif_ill->ill_lock); 14935 if (IPIF_IS_CONDEMNED(ipif)) { 14936 mutex_exit(&ipif->ipif_ill->ill_lock); 14937 goto retry; 14938 } 14939 ipif_refhold_locked(ipif); 14940 14941 /* 14942 * For IPMP, update the source ipif rotor to the next ipif, 14943 * provided we can look it up. (We must not use it if it's 14944 * IPIF_CONDEMNED since we may have grabbed ill_g_lock after 14945 * ipif_free() checked ill_src_ipif.) 
14946 */ 14947 if (IS_IPMP(ill) && ipif != NULL) { 14948 next_ipif = ipif->ipif_next; 14949 if (next_ipif != NULL && !IPIF_IS_CONDEMNED(next_ipif)) 14950 ill->ill_src_ipif = next_ipif; 14951 else 14952 ill->ill_src_ipif = NULL; 14953 } 14954 mutex_exit(&ipif->ipif_ill->ill_lock); 14955 } 14956 14957 rw_exit(&ipst->ips_ill_g_lock); 14958 if (usill != NULL) 14959 ill_refrele(usill); 14960 if (ipmp_ill != NULL) 14961 ill_refrele(ipmp_ill); 14962 if (dst_rhtp != NULL) 14963 TPC_RELE(dst_rhtp); 14964 14965 #ifdef DEBUG 14966 if (ipif == NULL) { 14967 char buf1[INET6_ADDRSTRLEN]; 14968 14969 ip1dbg(("ipif_select_source_v4(%s, %s) -> NULL\n", 14970 ill->ill_name, 14971 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)))); 14972 } else { 14973 char buf1[INET6_ADDRSTRLEN]; 14974 char buf2[INET6_ADDRSTRLEN]; 14975 14976 ip1dbg(("ipif_select_source_v4(%s, %s) -> %s\n", 14977 ipif->ipif_ill->ill_name, 14978 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)), 14979 inet_ntop(AF_INET, &ipif->ipif_lcl_addr, 14980 buf2, sizeof (buf2)))); 14981 } 14982 #endif /* DEBUG */ 14983 return (ipif); 14984 } 14985 14986 /* 14987 * Pick a source address based on the destination ill and an optional setsrc 14988 * address. 14989 * The result is stored in srcp. If generation is set, then put the source 14990 * generation number there before we look for the source address (to avoid 14991 * missing changes in the set of source addresses. 14992 * If flagsp is set, then us it to pass back ipif_flags. 14993 * 14994 * If the caller wants to cache the returned source address and detect when 14995 * that might be stale, the caller should pass in a generation argument, 14996 * which the caller can later compare against ips_src_generation 14997 * 14998 * The precedence order for selecting an IPv4 source address is: 14999 * - RTF_SETSRC on the offlink ire always wins. 15000 * - If usrsrc is set, swap the ill to be the usesrc one. 
15001 * - If IPMP is used on the ill, select a random address from the most 15002 * preferred ones below: 15003 * 1. If onlink destination, same subnet and not deprecated, not ALL_ZONES 15004 * 2. Not deprecated, not ALL_ZONES 15005 * 3. If onlink destination, same subnet and not deprecated, ALL_ZONES 15006 * 4. Not deprecated, ALL_ZONES 15007 * 5. If onlink destination, same subnet and deprecated 15008 * 6. Deprecated. 15009 * 15010 * We have lower preference for ALL_ZONES IP addresses, 15011 * as they pose problems with unlabeled destinations. 15012 * 15013 * Note that when multiple IP addresses match e.g., #1 we pick 15014 * the first one if IPMP is not in use. With IPMP we randomize. 15015 */ 15016 int 15017 ip_select_source_v4(ill_t *ill, ipaddr_t setsrc, ipaddr_t dst, 15018 ipaddr_t multicast_ifaddr, 15019 zoneid_t zoneid, ip_stack_t *ipst, ipaddr_t *srcp, 15020 uint32_t *generation, uint64_t *flagsp) 15021 { 15022 ipif_t *ipif; 15023 boolean_t notready = B_FALSE; /* Set if !ipif_addr_ready found */ 15024 15025 if (flagsp != NULL) 15026 *flagsp = 0; 15027 15028 /* 15029 * Need to grab the generation number before we check to 15030 * avoid a race with a change to the set of local addresses. 15031 * No lock needed since the thread which updates the set of local 15032 * addresses use ipif/ill locks and exit those (hence a store memory 15033 * barrier) before doing the atomic increase of ips_src_generation. 15034 */ 15035 if (generation != NULL) { 15036 *generation = ipst->ips_src_generation; 15037 } 15038 15039 if (CLASSD(dst) && multicast_ifaddr != INADDR_ANY) { 15040 *srcp = multicast_ifaddr; 15041 return (0); 15042 } 15043 15044 /* Was RTF_SETSRC set on the first IRE in the recursive lookup? 
*/ 15045 if (setsrc != INADDR_ANY) { 15046 *srcp = setsrc; 15047 return (0); 15048 } 15049 ipif = ipif_select_source_v4(ill, dst, zoneid, B_TRUE, ¬ready); 15050 if (ipif == NULL) { 15051 if (notready) 15052 return (ENETDOWN); 15053 else 15054 return (EADDRNOTAVAIL); 15055 } 15056 *srcp = ipif->ipif_lcl_addr; 15057 if (flagsp != NULL) 15058 *flagsp = ipif->ipif_flags; 15059 ipif_refrele(ipif); 15060 return (0); 15061 } 15062 15063 /* ARGSUSED */ 15064 int 15065 if_unitsel_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 15066 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 15067 { 15068 /* 15069 * ill_phyint_reinit merged the v4 and v6 into a single 15070 * ipsq. We might not have been able to complete the 15071 * operation in ipif_set_values, if we could not become 15072 * exclusive. If so restart it here. 15073 */ 15074 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q)); 15075 } 15076 15077 /* 15078 * Can operate on either a module or a driver queue. 15079 * Returns an error if not a module queue. 15080 */ 15081 /* ARGSUSED */ 15082 int 15083 if_unitsel(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 15084 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 15085 { 15086 queue_t *q1 = q; 15087 char *cp; 15088 char interf_name[LIFNAMSIZ]; 15089 uint_t ppa = *(uint_t *)mp->b_cont->b_cont->b_rptr; 15090 15091 if (q->q_next == NULL) { 15092 ip1dbg(( 15093 "if_unitsel: IF_UNITSEL: no q_next\n")); 15094 return (EINVAL); 15095 } 15096 15097 if (((ill_t *)(q->q_ptr))->ill_name[0] != '\0') 15098 return (EALREADY); 15099 15100 do { 15101 q1 = q1->q_next; 15102 } while (q1->q_next); 15103 cp = q1->q_qinfo->qi_minfo->mi_idname; 15104 (void) sprintf(interf_name, "%s%d", cp, ppa); 15105 15106 /* 15107 * Here we are not going to delay the ioack until after 15108 * ACKs from DL_ATTACH_REQ/DL_BIND_REQ. So no need to save the 15109 * original ioctl message before sending the requests. 
15110 */ 15111 return (ipif_set_values(q, mp, interf_name, &ppa)); 15112 } 15113 15114 /* ARGSUSED */ 15115 int 15116 ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 15117 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 15118 { 15119 return (ENXIO); 15120 } 15121 15122 /* 15123 * Create any IRE_BROADCAST entries for `ipif', and store those entries in 15124 * `irep'. Returns a pointer to the next free `irep' entry 15125 * A mirror exists in ipif_delete_bcast_ires(). 15126 * 15127 * The management of any "extra" or seemingly duplicate IRE_BROADCASTs is 15128 * done in ire_add. 15129 */ 15130 static ire_t ** 15131 ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep) 15132 { 15133 ipaddr_t addr; 15134 ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr); 15135 ipaddr_t subnetmask = ipif->ipif_net_mask; 15136 ill_t *ill = ipif->ipif_ill; 15137 zoneid_t zoneid = ipif->ipif_zoneid; 15138 15139 ip1dbg(("ipif_create_bcast_ires: creating broadcast IREs\n")); 15140 15141 ASSERT(ipif->ipif_flags & IPIF_BROADCAST); 15142 ASSERT(!(ipif->ipif_flags & IPIF_NOXMIT)); 15143 15144 if (ipif->ipif_lcl_addr == INADDR_ANY || 15145 (ipif->ipif_flags & IPIF_NOLOCAL)) 15146 netmask = htonl(IN_CLASSA_NET); /* fallback */ 15147 15148 irep = ire_create_bcast(ill, 0, zoneid, irep); 15149 irep = ire_create_bcast(ill, INADDR_BROADCAST, zoneid, irep); 15150 15151 /* 15152 * For backward compatibility, we create net broadcast IREs based on 15153 * the old "IP address class system", since some old machines only 15154 * respond to these class derived net broadcast. However, we must not 15155 * create these net broadcast IREs if the subnetmask is shorter than 15156 * the IP address class based derived netmask. Otherwise, we may 15157 * create a net broadcast address which is the same as an IP address 15158 * on the subnet -- and then TCP will refuse to talk to that address. 
15159 */ 15160 if (netmask < subnetmask) { 15161 addr = netmask & ipif->ipif_subnet; 15162 irep = ire_create_bcast(ill, addr, zoneid, irep); 15163 irep = ire_create_bcast(ill, ~netmask | addr, zoneid, irep); 15164 } 15165 15166 /* 15167 * Don't create IRE_BROADCAST IREs for the interface if the subnetmask 15168 * is 0xFFFFFFFF, as an IRE_LOCAL for that interface is already 15169 * created. Creating these broadcast IREs will only create confusion 15170 * as `addr' will be the same as the IP address. 15171 */ 15172 if (subnetmask != 0xFFFFFFFF) { 15173 addr = ipif->ipif_subnet; 15174 irep = ire_create_bcast(ill, addr, zoneid, irep); 15175 irep = ire_create_bcast(ill, ~subnetmask | addr, zoneid, irep); 15176 } 15177 15178 return (irep); 15179 } 15180 15181 /* 15182 * Mirror of ipif_create_bcast_ires() 15183 */ 15184 static void 15185 ipif_delete_bcast_ires(ipif_t *ipif) 15186 { 15187 ipaddr_t addr; 15188 ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr); 15189 ipaddr_t subnetmask = ipif->ipif_net_mask; 15190 ill_t *ill = ipif->ipif_ill; 15191 zoneid_t zoneid = ipif->ipif_zoneid; 15192 ire_t *ire; 15193 15194 ASSERT(ipif->ipif_flags & IPIF_BROADCAST); 15195 ASSERT(!(ipif->ipif_flags & IPIF_NOXMIT)); 15196 15197 if (ipif->ipif_lcl_addr == INADDR_ANY || 15198 (ipif->ipif_flags & IPIF_NOLOCAL)) 15199 netmask = htonl(IN_CLASSA_NET); /* fallback */ 15200 15201 ire = ire_lookup_bcast(ill, 0, zoneid); 15202 ASSERT(ire != NULL); 15203 ire_delete(ire); ire_refrele(ire); 15204 ire = ire_lookup_bcast(ill, INADDR_BROADCAST, zoneid); 15205 ASSERT(ire != NULL); 15206 ire_delete(ire); ire_refrele(ire); 15207 15208 /* 15209 * For backward compatibility, we create net broadcast IREs based on 15210 * the old "IP address class system", since some old machines only 15211 * respond to these class derived net broadcast. However, we must not 15212 * create these net broadcast IREs if the subnetmask is shorter than 15213 * the IP address class based derived netmask. 
Otherwise, we may 15214 * create a net broadcast address which is the same as an IP address 15215 * on the subnet -- and then TCP will refuse to talk to that address. 15216 */ 15217 if (netmask < subnetmask) { 15218 addr = netmask & ipif->ipif_subnet; 15219 ire = ire_lookup_bcast(ill, addr, zoneid); 15220 ASSERT(ire != NULL); 15221 ire_delete(ire); ire_refrele(ire); 15222 ire = ire_lookup_bcast(ill, ~netmask | addr, zoneid); 15223 ASSERT(ire != NULL); 15224 ire_delete(ire); ire_refrele(ire); 15225 } 15226 15227 /* 15228 * Don't create IRE_BROADCAST IREs for the interface if the subnetmask 15229 * is 0xFFFFFFFF, as an IRE_LOCAL for that interface is already 15230 * created. Creating these broadcast IREs will only create confusion 15231 * as `addr' will be the same as the IP address. 15232 */ 15233 if (subnetmask != 0xFFFFFFFF) { 15234 addr = ipif->ipif_subnet; 15235 ire = ire_lookup_bcast(ill, addr, zoneid); 15236 ASSERT(ire != NULL); 15237 ire_delete(ire); ire_refrele(ire); 15238 ire = ire_lookup_bcast(ill, ~subnetmask | addr, zoneid); 15239 ASSERT(ire != NULL); 15240 ire_delete(ire); ire_refrele(ire); 15241 } 15242 } 15243 15244 /* 15245 * Extract both the flags (including IFF_CANTCHANGE) such as IFF_IPV* 15246 * from lifr_flags and the name from lifr_name. 15247 * Set IFF_IPV* and ill_isv6 prior to doing the lookup 15248 * since ipif_lookup_on_name uses the _isv6 flags when matching. 15249 * Returns EINPROGRESS when mp has been consumed by queueing it on 15250 * ipx_pending_mp and the ioctl will complete in ip_rput. 15251 * 15252 * Can operate on either a module or a driver queue. 15253 * Returns an error if not a module queue. 
15254 */ 15255 /* ARGSUSED */ 15256 int 15257 ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15258 ip_ioctl_cmd_t *ipip, void *if_req) 15259 { 15260 ill_t *ill = q->q_ptr; 15261 phyint_t *phyi; 15262 ip_stack_t *ipst; 15263 struct lifreq *lifr = if_req; 15264 uint64_t new_flags; 15265 15266 ASSERT(ipif != NULL); 15267 ip1dbg(("ip_sioctl_slifname %s\n", lifr->lifr_name)); 15268 15269 if (q->q_next == NULL) { 15270 ip1dbg(("if_sioctl_slifname: SIOCSLIFNAME: no q_next\n")); 15271 return (EINVAL); 15272 } 15273 15274 /* 15275 * If we are not writer on 'q' then this interface exists already 15276 * and previous lookups (ip_extract_lifreq()) found this ipif -- 15277 * so return EALREADY. 15278 */ 15279 if (ill != ipif->ipif_ill) 15280 return (EALREADY); 15281 15282 if (ill->ill_name[0] != '\0') 15283 return (EALREADY); 15284 15285 /* 15286 * If there's another ill already with the requested name, ensure 15287 * that it's of the same type. Otherwise, ill_phyint_reinit() will 15288 * fuse together two unrelated ills, which will cause chaos. 15289 */ 15290 ipst = ill->ill_ipst; 15291 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 15292 lifr->lifr_name, NULL); 15293 if (phyi != NULL) { 15294 ill_t *ill_mate = phyi->phyint_illv4; 15295 15296 if (ill_mate == NULL) 15297 ill_mate = phyi->phyint_illv6; 15298 ASSERT(ill_mate != NULL); 15299 15300 if (ill_mate->ill_media->ip_m_mac_type != 15301 ill->ill_media->ip_m_mac_type) { 15302 ip1dbg(("if_sioctl_slifname: SIOCSLIFNAME: attempt to " 15303 "use the same ill name on differing media\n")); 15304 return (EINVAL); 15305 } 15306 } 15307 15308 /* 15309 * We start off as IFF_IPV4 in ipif_allocate and become 15310 * IFF_IPV4 or IFF_IPV6 here depending on lifr_flags value. 15311 * The only flags that we read from user space are IFF_IPV4, 15312 * IFF_IPV6, and IFF_BROADCAST. 15313 * 15314 * This ill has not been inserted into the global list. 
15315 * So we are still single threaded and don't need any lock 15316 * 15317 * Saniy check the flags. 15318 */ 15319 15320 if ((lifr->lifr_flags & IFF_BROADCAST) && 15321 ((lifr->lifr_flags & IFF_IPV6) || 15322 (!ill->ill_needs_attach && ill->ill_bcast_addr_length == 0))) { 15323 ip1dbg(("ip_sioctl_slifname: link not broadcast capable " 15324 "or IPv6 i.e., no broadcast \n")); 15325 return (EINVAL); 15326 } 15327 15328 new_flags = 15329 lifr->lifr_flags & (IFF_IPV6|IFF_IPV4|IFF_BROADCAST); 15330 15331 if ((new_flags ^ (IFF_IPV6|IFF_IPV4)) == 0) { 15332 ip1dbg(("ip_sioctl_slifname: flags must be exactly one of " 15333 "IFF_IPV4 or IFF_IPV6\n")); 15334 return (EINVAL); 15335 } 15336 15337 /* 15338 * We always start off as IPv4, so only need to check for IPv6. 15339 */ 15340 if ((new_flags & IFF_IPV6) != 0) { 15341 ill->ill_flags |= ILLF_IPV6; 15342 ill->ill_flags &= ~ILLF_IPV4; 15343 } 15344 15345 if ((new_flags & IFF_BROADCAST) != 0) 15346 ipif->ipif_flags |= IPIF_BROADCAST; 15347 else 15348 ipif->ipif_flags &= ~IPIF_BROADCAST; 15349 15350 /* We started off as V4. */ 15351 if (ill->ill_flags & ILLF_IPV6) { 15352 ill->ill_phyint->phyint_illv6 = ill; 15353 ill->ill_phyint->phyint_illv4 = NULL; 15354 } 15355 15356 return (ipif_set_values(q, mp, lifr->lifr_name, &lifr->lifr_ppa)); 15357 } 15358 15359 /* ARGSUSED */ 15360 int 15361 ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15362 ip_ioctl_cmd_t *ipip, void *if_req) 15363 { 15364 /* 15365 * ill_phyint_reinit merged the v4 and v6 into a single 15366 * ipsq. We might not have been able to complete the 15367 * slifname in ipif_set_values, if we could not become 15368 * exclusive. If so restart it here 15369 */ 15370 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q)); 15371 } 15372 15373 /* 15374 * Return a pointer to the ipif which matches the index, IP version type and 15375 * zoneid. 
15376 */ 15377 ipif_t * 15378 ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid, 15379 ip_stack_t *ipst) 15380 { 15381 ill_t *ill; 15382 ipif_t *ipif = NULL; 15383 15384 ill = ill_lookup_on_ifindex(index, isv6, ipst); 15385 if (ill != NULL) { 15386 mutex_enter(&ill->ill_lock); 15387 for (ipif = ill->ill_ipif; ipif != NULL; 15388 ipif = ipif->ipif_next) { 15389 if (!IPIF_IS_CONDEMNED(ipif) && (zoneid == ALL_ZONES || 15390 zoneid == ipif->ipif_zoneid || 15391 ipif->ipif_zoneid == ALL_ZONES)) { 15392 ipif_refhold_locked(ipif); 15393 break; 15394 } 15395 } 15396 mutex_exit(&ill->ill_lock); 15397 ill_refrele(ill); 15398 } 15399 return (ipif); 15400 } 15401 15402 /* 15403 * Change an existing physical interface's index. If the new index 15404 * is acceptable we update the index and the phyint_list_avl_by_index tree. 15405 * Finally, we update other systems which may have a dependence on the 15406 * index value. 15407 */ 15408 /* ARGSUSED */ 15409 int 15410 ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15411 ip_ioctl_cmd_t *ipip, void *ifreq) 15412 { 15413 ill_t *ill; 15414 phyint_t *phyi; 15415 struct ifreq *ifr = (struct ifreq *)ifreq; 15416 struct lifreq *lifr = (struct lifreq *)ifreq; 15417 uint_t old_index, index; 15418 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 15419 avl_index_t where; 15420 15421 if (ipip->ipi_cmd_type == IF_CMD) 15422 index = ifr->ifr_index; 15423 else 15424 index = lifr->lifr_index; 15425 15426 /* 15427 * Only allow on physical interface. Also, index zero is illegal. 15428 */ 15429 ill = ipif->ipif_ill; 15430 phyi = ill->ill_phyint; 15431 if (ipif->ipif_id != 0 || index == 0) { 15432 return (EINVAL); 15433 } 15434 15435 /* If the index is not changing, no work to do */ 15436 if (phyi->phyint_ifindex == index) 15437 return (0); 15438 15439 /* 15440 * Use phyint_exists() to determine if the new interface index 15441 * is already in use. 
If the index is unused then we need to 15442 * change the phyint's position in the phyint_list_avl_by_index 15443 * tree. If we do not do this, subsequent lookups (using the new 15444 * index value) will not find the phyint. 15445 */ 15446 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 15447 if (phyint_exists(index, ipst)) { 15448 rw_exit(&ipst->ips_ill_g_lock); 15449 return (EEXIST); 15450 } 15451 15452 /* 15453 * The new index is unused. Set it in the phyint. However we must not 15454 * forget to trigger NE_IFINDEX_CHANGE event before the ifindex 15455 * changes. The event must be bound to old ifindex value. 15456 */ 15457 ill_nic_event_dispatch(ill, 0, NE_IFINDEX_CHANGE, 15458 &index, sizeof (index)); 15459 15460 old_index = phyi->phyint_ifindex; 15461 phyi->phyint_ifindex = index; 15462 15463 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, phyi); 15464 (void) avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 15465 &index, &where); 15466 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 15467 phyi, where); 15468 rw_exit(&ipst->ips_ill_g_lock); 15469 15470 /* Update SCTP's ILL list */ 15471 sctp_ill_reindex(ill, old_index); 15472 15473 /* Send the routing sockets message */ 15474 ip_rts_ifmsg(ipif, RTSQ_DEFAULT); 15475 if (ILL_OTHER(ill)) 15476 ip_rts_ifmsg(ILL_OTHER(ill)->ill_ipif, RTSQ_DEFAULT); 15477 15478 /* Perhaps ilgs should use this ill */ 15479 update_conn_ill(NULL, ill->ill_ipst); 15480 return (0); 15481 } 15482 15483 /* ARGSUSED */ 15484 int 15485 ip_sioctl_get_lifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15486 ip_ioctl_cmd_t *ipip, void *ifreq) 15487 { 15488 struct ifreq *ifr = (struct ifreq *)ifreq; 15489 struct lifreq *lifr = (struct lifreq *)ifreq; 15490 15491 ip1dbg(("ip_sioctl_get_lifindex(%s:%u %p)\n", 15492 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 15493 /* Get the interface index */ 15494 if (ipip->ipi_cmd_type == IF_CMD) { 15495 ifr->ifr_index = 
ipif->ipif_ill->ill_phyint->phyint_ifindex; 15496 } else { 15497 lifr->lifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 15498 } 15499 return (0); 15500 } 15501 15502 /* ARGSUSED */ 15503 int 15504 ip_sioctl_get_lifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15505 ip_ioctl_cmd_t *ipip, void *ifreq) 15506 { 15507 struct lifreq *lifr = (struct lifreq *)ifreq; 15508 15509 ip1dbg(("ip_sioctl_get_lifzone(%s:%u %p)\n", 15510 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 15511 /* Get the interface zone */ 15512 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 15513 lifr->lifr_zoneid = ipif->ipif_zoneid; 15514 return (0); 15515 } 15516 15517 /* 15518 * Set the zoneid of an interface. 15519 */ 15520 /* ARGSUSED */ 15521 int 15522 ip_sioctl_slifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15523 ip_ioctl_cmd_t *ipip, void *ifreq) 15524 { 15525 struct lifreq *lifr = (struct lifreq *)ifreq; 15526 int err = 0; 15527 boolean_t need_up = B_FALSE; 15528 zone_t *zptr; 15529 zone_status_t status; 15530 zoneid_t zoneid; 15531 15532 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 15533 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) { 15534 if (!is_system_labeled()) 15535 return (ENOTSUP); 15536 zoneid = GLOBAL_ZONEID; 15537 } 15538 15539 /* cannot assign instance zero to a non-global zone */ 15540 if (ipif->ipif_id == 0 && zoneid != GLOBAL_ZONEID) 15541 return (ENOTSUP); 15542 15543 /* 15544 * Cannot assign to a zone that doesn't exist or is shutting down. In 15545 * the event of a race with the zone shutdown processing, since IP 15546 * serializes this ioctl and SIOCGLIFCONF/SIOCLIFREMOVEIF, we know the 15547 * interface will be cleaned up even if the zone is shut down 15548 * immediately after the status check. If the interface can't be brought 15549 * down right away, and the zone is shut down before the restart 15550 * function is called, we resolve the possible races by rechecking the 15551 * zone status in the restart function. 
15552 */ 15553 if ((zptr = zone_find_by_id(zoneid)) == NULL) 15554 return (EINVAL); 15555 status = zone_status_get(zptr); 15556 zone_rele(zptr); 15557 15558 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) 15559 return (EINVAL); 15560 15561 if (ipif->ipif_flags & IPIF_UP) { 15562 /* 15563 * If the interface is already marked up, 15564 * we call ipif_down which will take care 15565 * of ditching any IREs that have been set 15566 * up based on the old interface address. 15567 */ 15568 err = ipif_logical_down(ipif, q, mp); 15569 if (err == EINPROGRESS) 15570 return (err); 15571 (void) ipif_down_tail(ipif); 15572 need_up = B_TRUE; 15573 } 15574 15575 err = ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, need_up); 15576 return (err); 15577 } 15578 15579 static int 15580 ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid, 15581 queue_t *q, mblk_t *mp, boolean_t need_up) 15582 { 15583 int err = 0; 15584 ip_stack_t *ipst; 15585 15586 ip1dbg(("ip_sioctl_zoneid_tail(%s:%u %p)\n", 15587 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 15588 15589 if (CONN_Q(q)) 15590 ipst = CONNQ_TO_IPST(q); 15591 else 15592 ipst = ILLQ_TO_IPST(q); 15593 15594 /* 15595 * For exclusive stacks we don't allow a different zoneid than 15596 * global. 15597 */ 15598 if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID && 15599 zoneid != GLOBAL_ZONEID) 15600 return (EINVAL); 15601 15602 /* Set the new zone id. */ 15603 ipif->ipif_zoneid = zoneid; 15604 15605 /* Update sctp list */ 15606 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 15607 15608 /* The default multicast interface might have changed */ 15609 ire_increment_multicast_generation(ipst, ipif->ipif_ill->ill_isv6); 15610 15611 if (need_up) { 15612 /* 15613 * Now bring the interface back up. 
If this 15614 * is the only IPIF for the ILL, ipif_up 15615 * will have to re-bind to the device, so 15616 * we may get back EINPROGRESS, in which 15617 * case, this IOCTL will get completed in 15618 * ip_rput_dlpi when we see the DL_BIND_ACK. 15619 */ 15620 err = ipif_up(ipif, q, mp); 15621 } 15622 return (err); 15623 } 15624 15625 /* ARGSUSED */ 15626 int 15627 ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15628 ip_ioctl_cmd_t *ipip, void *if_req) 15629 { 15630 struct lifreq *lifr = (struct lifreq *)if_req; 15631 zoneid_t zoneid; 15632 zone_t *zptr; 15633 zone_status_t status; 15634 15635 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 15636 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) 15637 zoneid = GLOBAL_ZONEID; 15638 15639 ip1dbg(("ip_sioctl_slifzone_restart(%s:%u %p)\n", 15640 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 15641 15642 /* 15643 * We recheck the zone status to resolve the following race condition: 15644 * 1) process sends SIOCSLIFZONE to put hme0:1 in zone "myzone"; 15645 * 2) hme0:1 is up and can't be brought down right away; 15646 * ip_sioctl_slifzone() returns EINPROGRESS and the request is queued; 15647 * 3) zone "myzone" is halted; the zone status switches to 15648 * 'shutting_down' and the zones framework sends SIOCGLIFCONF to list 15649 * the interfaces to remove - hme0:1 is not returned because it's not 15650 * yet in "myzone", so it won't be removed; 15651 * 4) the restart function for SIOCSLIFZONE is called; without the 15652 * status check here, we would have hme0:1 in "myzone" after it's been 15653 * destroyed. 15654 * Note that if the status check fails, we need to bring the interface 15655 * back to its state prior to ip_sioctl_slifzone(), hence the call to 15656 * ipif_up_done[_v6](). 
15657 */ 15658 status = ZONE_IS_UNINITIALIZED; 15659 if ((zptr = zone_find_by_id(zoneid)) != NULL) { 15660 status = zone_status_get(zptr); 15661 zone_rele(zptr); 15662 } 15663 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) { 15664 if (ipif->ipif_isv6) { 15665 (void) ipif_up_done_v6(ipif); 15666 } else { 15667 (void) ipif_up_done(ipif); 15668 } 15669 return (EINVAL); 15670 } 15671 15672 (void) ipif_down_tail(ipif); 15673 15674 return (ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, 15675 B_TRUE)); 15676 } 15677 15678 /* 15679 * Return the number of addresses on `ill' with one or more of the values 15680 * in `set' set and all of the values in `clear' clear. 15681 */ 15682 static uint_t 15683 ill_flagaddr_cnt(const ill_t *ill, uint64_t set, uint64_t clear) 15684 { 15685 ipif_t *ipif; 15686 uint_t cnt = 0; 15687 15688 ASSERT(IAM_WRITER_ILL(ill)); 15689 15690 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) 15691 if ((ipif->ipif_flags & set) && !(ipif->ipif_flags & clear)) 15692 cnt++; 15693 15694 return (cnt); 15695 } 15696 15697 /* 15698 * Return the number of migratable addresses on `ill' that are under 15699 * application control. 15700 */ 15701 uint_t 15702 ill_appaddr_cnt(const ill_t *ill) 15703 { 15704 return (ill_flagaddr_cnt(ill, IPIF_DHCPRUNNING | IPIF_ADDRCONF, 15705 IPIF_NOFAILOVER)); 15706 } 15707 15708 /* 15709 * Return the number of point-to-point addresses on `ill'. 
15710 */ 15711 uint_t 15712 ill_ptpaddr_cnt(const ill_t *ill) 15713 { 15714 return (ill_flagaddr_cnt(ill, IPIF_POINTOPOINT, 0)); 15715 } 15716 15717 /* ARGSUSED */ 15718 int 15719 ip_sioctl_get_lifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15720 ip_ioctl_cmd_t *ipip, void *ifreq) 15721 { 15722 struct lifreq *lifr = ifreq; 15723 15724 ASSERT(q->q_next == NULL); 15725 ASSERT(CONN_Q(q)); 15726 15727 ip1dbg(("ip_sioctl_get_lifusesrc(%s:%u %p)\n", 15728 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 15729 lifr->lifr_index = ipif->ipif_ill->ill_usesrc_ifindex; 15730 ip1dbg(("ip_sioctl_get_lifusesrc:lifr_index = %d\n", lifr->lifr_index)); 15731 15732 return (0); 15733 } 15734 15735 /* Find the previous ILL in this usesrc group */ 15736 static ill_t * 15737 ill_prev_usesrc(ill_t *uill) 15738 { 15739 ill_t *ill; 15740 15741 for (ill = uill->ill_usesrc_grp_next; 15742 ASSERT(ill), ill->ill_usesrc_grp_next != uill; 15743 ill = ill->ill_usesrc_grp_next) 15744 /* do nothing */; 15745 return (ill); 15746 } 15747 15748 /* 15749 * Release all members of the usesrc group. This routine is called 15750 * from ill_delete when the interface being unplumbed is the 15751 * group head. 15752 * 15753 * This silently clears the usesrc that ifconfig setup. 15754 * An alternative would be to keep that ifindex, and drop packets on the floor 15755 * since no source address can be selected. 15756 * Even if we keep the current semantics, don't need a lock and a linked list. 15757 * Can walk all the ills checking if they have a ill_usesrc_ifindex matching 15758 * the one that is being removed. Issue is how we return the usesrc users 15759 * (SIOCGLIFSRCOF). We want to be able to find the ills which have an 15760 * ill_usesrc_ifindex matching a target ill. We could also do that with an 15761 * ill walk, but the walker would need to insert in the ioctl response. 
15762 */ 15763 static void 15764 ill_disband_usesrc_group(ill_t *uill) 15765 { 15766 ill_t *next_ill, *tmp_ill; 15767 ip_stack_t *ipst = uill->ill_ipst; 15768 15769 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock)); 15770 next_ill = uill->ill_usesrc_grp_next; 15771 15772 do { 15773 ASSERT(next_ill != NULL); 15774 tmp_ill = next_ill->ill_usesrc_grp_next; 15775 ASSERT(tmp_ill != NULL); 15776 next_ill->ill_usesrc_grp_next = NULL; 15777 next_ill->ill_usesrc_ifindex = 0; 15778 next_ill = tmp_ill; 15779 } while (next_ill->ill_usesrc_ifindex != 0); 15780 uill->ill_usesrc_grp_next = NULL; 15781 } 15782 15783 /* 15784 * Remove the client usesrc ILL from the list and relink to a new list 15785 */ 15786 int 15787 ill_relink_usesrc_ills(ill_t *ucill, ill_t *uill, uint_t ifindex) 15788 { 15789 ill_t *ill, *tmp_ill; 15790 ip_stack_t *ipst = ucill->ill_ipst; 15791 15792 ASSERT((ucill != NULL) && (ucill->ill_usesrc_grp_next != NULL) && 15793 (uill != NULL) && RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock)); 15794 15795 /* 15796 * Check if the usesrc client ILL passed in is not already 15797 * in use as a usesrc ILL i.e one whose source address is 15798 * in use OR a usesrc ILL is not already in use as a usesrc 15799 * client ILL 15800 */ 15801 if ((ucill->ill_usesrc_ifindex == 0) || 15802 (uill->ill_usesrc_ifindex != 0)) { 15803 return (-1); 15804 } 15805 15806 ill = ill_prev_usesrc(ucill); 15807 ASSERT(ill->ill_usesrc_grp_next != NULL); 15808 15809 /* Remove from the current list */ 15810 if (ill->ill_usesrc_grp_next->ill_usesrc_grp_next == ill) { 15811 /* Only two elements in the list */ 15812 ASSERT(ill->ill_usesrc_ifindex == 0); 15813 ill->ill_usesrc_grp_next = NULL; 15814 } else { 15815 ill->ill_usesrc_grp_next = ucill->ill_usesrc_grp_next; 15816 } 15817 15818 if (ifindex == 0) { 15819 ucill->ill_usesrc_ifindex = 0; 15820 ucill->ill_usesrc_grp_next = NULL; 15821 return (0); 15822 } 15823 15824 ucill->ill_usesrc_ifindex = ifindex; 15825 tmp_ill = uill->ill_usesrc_grp_next; 
15826 uill->ill_usesrc_grp_next = ucill; 15827 ucill->ill_usesrc_grp_next = 15828 (tmp_ill != NULL) ? tmp_ill : uill; 15829 return (0); 15830 } 15831 15832 /* 15833 * Set the ill_usesrc and ill_usesrc_head fields. See synchronization notes in 15834 * ip.c for locking details. 15835 */ 15836 /* ARGSUSED */ 15837 int 15838 ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15839 ip_ioctl_cmd_t *ipip, void *ifreq) 15840 { 15841 struct lifreq *lifr = (struct lifreq *)ifreq; 15842 boolean_t isv6 = B_FALSE, reset_flg = B_FALSE; 15843 ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill; 15844 int err = 0, ret; 15845 uint_t ifindex; 15846 ipsq_t *ipsq = NULL; 15847 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 15848 15849 ASSERT(IAM_WRITER_IPIF(ipif)); 15850 ASSERT(q->q_next == NULL); 15851 ASSERT(CONN_Q(q)); 15852 15853 isv6 = (Q_TO_CONN(q))->conn_family == AF_INET6; 15854 15855 ifindex = lifr->lifr_index; 15856 if (ifindex == 0) { 15857 if (usesrc_cli_ill->ill_usesrc_grp_next == NULL) { 15858 /* non usesrc group interface, nothing to reset */ 15859 return (0); 15860 } 15861 ifindex = usesrc_cli_ill->ill_usesrc_ifindex; 15862 /* valid reset request */ 15863 reset_flg = B_TRUE; 15864 } 15865 15866 usesrc_ill = ill_lookup_on_ifindex(ifindex, isv6, ipst); 15867 if (usesrc_ill == NULL) { 15868 return (ENXIO); 15869 } 15870 15871 ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl, 15872 NEW_OP, B_TRUE); 15873 if (ipsq == NULL) { 15874 err = EINPROGRESS; 15875 /* Operation enqueued on the ipsq of the usesrc ILL */ 15876 goto done; 15877 } 15878 15879 /* USESRC isn't currently supported with IPMP */ 15880 if (IS_IPMP(usesrc_ill) || IS_UNDER_IPMP(usesrc_ill)) { 15881 err = ENOTSUP; 15882 goto done; 15883 } 15884 15885 /* 15886 * USESRC isn't compatible with the STANDBY flag. (STANDBY is only 15887 * used by IPMP underlying interfaces, but someone might think it's 15888 * more general and try to use it independently with VNI.) 
15889 */ 15890 if (usesrc_ill->ill_phyint->phyint_flags & PHYI_STANDBY) { 15891 err = ENOTSUP; 15892 goto done; 15893 } 15894 15895 /* 15896 * If the client is already in use as a usesrc_ill or a usesrc_ill is 15897 * already a client then return EINVAL 15898 */ 15899 if (IS_USESRC_ILL(usesrc_cli_ill) || IS_USESRC_CLI_ILL(usesrc_ill)) { 15900 err = EINVAL; 15901 goto done; 15902 } 15903 15904 /* 15905 * If the ill_usesrc_ifindex field is already set to what it needs to 15906 * be then this is a duplicate operation. 15907 */ 15908 if (!reset_flg && usesrc_cli_ill->ill_usesrc_ifindex == ifindex) { 15909 err = 0; 15910 goto done; 15911 } 15912 15913 ip1dbg(("ip_sioctl_slifusesrc: usesrc_cli_ill %s, usesrc_ill %s," 15914 " v6 = %d", usesrc_cli_ill->ill_name, usesrc_ill->ill_name, 15915 usesrc_ill->ill_isv6)); 15916 15917 /* 15918 * ill_g_usesrc_lock global lock protects the ill_usesrc_grp_next 15919 * and the ill_usesrc_ifindex fields 15920 */ 15921 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER); 15922 15923 if (reset_flg) { 15924 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 0); 15925 if (ret != 0) { 15926 err = EINVAL; 15927 } 15928 rw_exit(&ipst->ips_ill_g_usesrc_lock); 15929 goto done; 15930 } 15931 15932 /* 15933 * Four possibilities to consider: 15934 * 1. Both usesrc_ill and usesrc_cli_ill are not part of any usesrc grp 15935 * 2. usesrc_ill is part of a group but usesrc_cli_ill isn't 15936 * 3. usesrc_cli_ill is part of a group but usesrc_ill isn't 15937 * 4. 
Both are part of their respective usesrc groups 15938 */ 15939 if ((usesrc_ill->ill_usesrc_grp_next == NULL) && 15940 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) { 15941 ASSERT(usesrc_ill->ill_usesrc_ifindex == 0); 15942 usesrc_cli_ill->ill_usesrc_ifindex = ifindex; 15943 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill; 15944 usesrc_cli_ill->ill_usesrc_grp_next = usesrc_ill; 15945 } else if ((usesrc_ill->ill_usesrc_grp_next != NULL) && 15946 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) { 15947 usesrc_cli_ill->ill_usesrc_ifindex = ifindex; 15948 /* Insert at head of list */ 15949 usesrc_cli_ill->ill_usesrc_grp_next = 15950 usesrc_ill->ill_usesrc_grp_next; 15951 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill; 15952 } else { 15953 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 15954 ifindex); 15955 if (ret != 0) 15956 err = EINVAL; 15957 } 15958 rw_exit(&ipst->ips_ill_g_usesrc_lock); 15959 15960 done: 15961 if (ipsq != NULL) 15962 ipsq_exit(ipsq); 15963 /* The refrele on the lifr_name ipif is done by ip_process_ioctl */ 15964 ill_refrele(usesrc_ill); 15965 15966 /* Let conn_ixa caching know that source address selection changed */ 15967 ip_update_source_selection(ipst); 15968 15969 return (err); 15970 } 15971 15972 /* 15973 * comparison function used by avl. 15974 */ 15975 static int 15976 ill_phyint_compare_index(const void *index_ptr, const void *phyip) 15977 { 15978 15979 uint_t index; 15980 15981 ASSERT(phyip != NULL && index_ptr != NULL); 15982 15983 index = *((uint_t *)index_ptr); 15984 /* 15985 * let the phyint with the lowest index be on top. 15986 */ 15987 if (((phyint_t *)phyip)->phyint_ifindex < index) 15988 return (1); 15989 if (((phyint_t *)phyip)->phyint_ifindex > index) 15990 return (-1); 15991 return (0); 15992 } 15993 15994 /* 15995 * comparison function used by avl. 
15996 */ 15997 static int 15998 ill_phyint_compare_name(const void *name_ptr, const void *phyip) 15999 { 16000 ill_t *ill; 16001 int res = 0; 16002 16003 ASSERT(phyip != NULL && name_ptr != NULL); 16004 16005 if (((phyint_t *)phyip)->phyint_illv4) 16006 ill = ((phyint_t *)phyip)->phyint_illv4; 16007 else 16008 ill = ((phyint_t *)phyip)->phyint_illv6; 16009 ASSERT(ill != NULL); 16010 16011 res = strcmp(ill->ill_name, (char *)name_ptr); 16012 if (res > 0) 16013 return (1); 16014 else if (res < 0) 16015 return (-1); 16016 return (0); 16017 } 16018 16019 /* 16020 * This function is called on the unplumb path via ill_glist_delete() when 16021 * there are no ills left on the phyint and thus the phyint can be freed. 16022 */ 16023 static void 16024 phyint_free(phyint_t *phyi) 16025 { 16026 ip_stack_t *ipst = PHYINT_TO_IPST(phyi); 16027 16028 ASSERT(phyi->phyint_illv4 == NULL && phyi->phyint_illv6 == NULL); 16029 16030 /* 16031 * If this phyint was an IPMP meta-interface, blow away the group. 16032 * This is safe to do because all of the illgrps have already been 16033 * removed by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find us. 16034 * If we're cleaning up as a result of failed initialization, 16035 * phyint_grp may be NULL. 16036 */ 16037 if ((phyi->phyint_flags & PHYI_IPMP) && (phyi->phyint_grp != NULL)) { 16038 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 16039 ipmp_grp_destroy(phyi->phyint_grp); 16040 phyi->phyint_grp = NULL; 16041 rw_exit(&ipst->ips_ipmp_lock); 16042 } 16043 16044 /* 16045 * If this interface was under IPMP, take it out of the group. 16046 */ 16047 if (phyi->phyint_grp != NULL) 16048 ipmp_phyint_leave_grp(phyi); 16049 16050 /* 16051 * Delete the phyint and disassociate its ipsq. The ipsq itself 16052 * will be freed in ipsq_exit(). 
16053 */ 16054 phyi->phyint_ipsq->ipsq_phyint = NULL; 16055 phyi->phyint_name[0] = '\0'; 16056 16057 mi_free(phyi); 16058 } 16059 16060 /* 16061 * Attach the ill to the phyint structure which can be shared by both 16062 * IPv4 and IPv6 ill. ill_init allocates a phyint to just hold flags. This 16063 * function is called from ipif_set_values and ill_lookup_on_name (for 16064 * loopback) where we know the name of the ill. We lookup the ill and if 16065 * there is one present already with the name use that phyint. Otherwise 16066 * reuse the one allocated by ill_init. 16067 */ 16068 static void 16069 ill_phyint_reinit(ill_t *ill) 16070 { 16071 boolean_t isv6 = ill->ill_isv6; 16072 phyint_t *phyi_old; 16073 phyint_t *phyi; 16074 avl_index_t where = 0; 16075 ill_t *ill_other = NULL; 16076 ip_stack_t *ipst = ill->ill_ipst; 16077 16078 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 16079 16080 phyi_old = ill->ill_phyint; 16081 ASSERT(isv6 || (phyi_old->phyint_illv4 == ill && 16082 phyi_old->phyint_illv6 == NULL)); 16083 ASSERT(!isv6 || (phyi_old->phyint_illv6 == ill && 16084 phyi_old->phyint_illv4 == NULL)); 16085 ASSERT(phyi_old->phyint_ifindex == 0); 16086 16087 /* 16088 * Now that our ill has a name, set it in the phyint. 16089 */ 16090 (void) strlcpy(ill->ill_phyint->phyint_name, ill->ill_name, LIFNAMSIZ); 16091 16092 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 16093 ill->ill_name, &where); 16094 16095 /* 16096 * 1. We grabbed the ill_g_lock before inserting this ill into 16097 * the global list of ills. So no other thread could have located 16098 * this ill and hence the ipsq of this ill is guaranteed to be empty. 16099 * 2. Now locate the other protocol instance of this ill. 16100 * 3. Now grab both ill locks in the right order, and the phyint lock of 16101 * the new ipsq. Holding ill locks + ill_g_lock ensures that the ipsq 16102 * of neither ill can change. 16103 * 4. 
Merge the phyint and thus the ipsq as well of this ill onto the 16104 * other ill. 16105 * 5. Release all locks. 16106 */ 16107 16108 /* 16109 * Look for IPv4 if we are initializing IPv6 or look for IPv6 if 16110 * we are initializing IPv4. 16111 */ 16112 if (phyi != NULL) { 16113 ill_other = (isv6) ? phyi->phyint_illv4 : phyi->phyint_illv6; 16114 ASSERT(ill_other->ill_phyint != NULL); 16115 ASSERT((isv6 && !ill_other->ill_isv6) || 16116 (!isv6 && ill_other->ill_isv6)); 16117 GRAB_ILL_LOCKS(ill, ill_other); 16118 /* 16119 * We are potentially throwing away phyint_flags which 16120 * could be different from the one that we obtain from 16121 * ill_other->ill_phyint. But it is okay as we are assuming 16122 * that the state maintained within IP is correct. 16123 */ 16124 mutex_enter(&phyi->phyint_lock); 16125 if (isv6) { 16126 ASSERT(phyi->phyint_illv6 == NULL); 16127 phyi->phyint_illv6 = ill; 16128 } else { 16129 ASSERT(phyi->phyint_illv4 == NULL); 16130 phyi->phyint_illv4 = ill; 16131 } 16132 16133 /* 16134 * Delete the old phyint and make its ipsq eligible 16135 * to be freed in ipsq_exit(). 16136 */ 16137 phyi_old->phyint_illv4 = NULL; 16138 phyi_old->phyint_illv6 = NULL; 16139 phyi_old->phyint_ipsq->ipsq_phyint = NULL; 16140 phyi_old->phyint_name[0] = '\0'; 16141 mi_free(phyi_old); 16142 } else { 16143 mutex_enter(&ill->ill_lock); 16144 /* 16145 * We don't need to acquire any lock, since 16146 * the ill is not yet visible globally and we 16147 * have not yet released the ill_g_lock. 16148 */ 16149 phyi = phyi_old; 16150 mutex_enter(&phyi->phyint_lock); 16151 /* XXX We need a recovery strategy here. 
*/ 16152 if (!phyint_assign_ifindex(phyi, ipst)) 16153 cmn_err(CE_PANIC, "phyint_assign_ifindex() failed"); 16154 16155 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 16156 (void *)phyi, where); 16157 16158 (void) avl_find(&ipst->ips_phyint_g_list-> 16159 phyint_list_avl_by_index, 16160 &phyi->phyint_ifindex, &where); 16161 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 16162 (void *)phyi, where); 16163 } 16164 16165 /* 16166 * Reassigning ill_phyint automatically reassigns the ipsq also. 16167 * pending mp is not affected because that is per ill basis. 16168 */ 16169 ill->ill_phyint = phyi; 16170 16171 /* 16172 * Now that the phyint's ifindex has been assigned, complete the 16173 * remaining 16174 */ 16175 ill->ill_ip_mib->ipIfStatsIfIndex = ill->ill_phyint->phyint_ifindex; 16176 if (ill->ill_isv6) { 16177 ill->ill_icmp6_mib->ipv6IfIcmpIfIndex = 16178 ill->ill_phyint->phyint_ifindex; 16179 ill->ill_mcast_type = ipst->ips_mld_max_version; 16180 } else { 16181 ill->ill_mcast_type = ipst->ips_igmp_max_version; 16182 } 16183 16184 /* 16185 * Generate an event within the hooks framework to indicate that 16186 * a new interface has just been added to IP. For this event to 16187 * be generated, the network interface must, at least, have an 16188 * ifindex assigned to it. (We don't generate the event for 16189 * loopback since ill_lookup_on_name() has its own NE_PLUMB event.) 16190 * 16191 * This needs to be run inside the ill_g_lock perimeter to ensure 16192 * that the ordering of delivered events to listeners matches the 16193 * order of them in the kernel. 16194 */ 16195 if (!IS_LOOPBACK(ill)) { 16196 ill_nic_event_dispatch(ill, 0, NE_PLUMB, ill->ill_name, 16197 ill->ill_name_length); 16198 } 16199 RELEASE_ILL_LOCKS(ill, ill_other); 16200 mutex_exit(&phyi->phyint_lock); 16201 } 16202 16203 /* 16204 * Notify any downstream modules of the name of this interface. 16205 * An M_IOCTL is used even though we don't expect a successful reply. 
16206 * Any reply message from the driver (presumably an M_IOCNAK) will 16207 * eventually get discarded somewhere upstream. The message format is 16208 * simply an SIOCSLIFNAME ioctl just as might be sent from ifconfig 16209 * to IP. 16210 */ 16211 static void 16212 ip_ifname_notify(ill_t *ill, queue_t *q) 16213 { 16214 mblk_t *mp1, *mp2; 16215 struct iocblk *iocp; 16216 struct lifreq *lifr; 16217 16218 mp1 = mkiocb(SIOCSLIFNAME); 16219 if (mp1 == NULL) 16220 return; 16221 mp2 = allocb(sizeof (struct lifreq), BPRI_HI); 16222 if (mp2 == NULL) { 16223 freeb(mp1); 16224 return; 16225 } 16226 16227 mp1->b_cont = mp2; 16228 iocp = (struct iocblk *)mp1->b_rptr; 16229 iocp->ioc_count = sizeof (struct lifreq); 16230 16231 lifr = (struct lifreq *)mp2->b_rptr; 16232 mp2->b_wptr += sizeof (struct lifreq); 16233 bzero(lifr, sizeof (struct lifreq)); 16234 16235 (void) strncpy(lifr->lifr_name, ill->ill_name, LIFNAMSIZ); 16236 lifr->lifr_ppa = ill->ill_ppa; 16237 lifr->lifr_flags = (ill->ill_flags & (ILLF_IPV4|ILLF_IPV6)); 16238 16239 DTRACE_PROBE3(ill__dlpi, char *, "ip_ifname_notify", 16240 char *, "SIOCSLIFNAME", ill_t *, ill); 16241 putnext(q, mp1); 16242 } 16243 16244 static int 16245 ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) 16246 { 16247 int err; 16248 ip_stack_t *ipst = ill->ill_ipst; 16249 phyint_t *phyi = ill->ill_phyint; 16250 16251 /* Set the obsolete NDD per-interface forwarding name. */ 16252 err = ill_set_ndd_name(ill); 16253 if (err != 0) { 16254 cmn_err(CE_WARN, "ipif_set_values: ill_set_ndd_name (%d)\n", 16255 err); 16256 } 16257 16258 /* 16259 * Now that ill_name is set, the configuration for the IPMP 16260 * meta-interface can be performed. 16261 */ 16262 if (IS_IPMP(ill)) { 16263 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 16264 /* 16265 * If phyi->phyint_grp is NULL, then this is the first IPMP 16266 * meta-interface and we need to create the IPMP group. 
16267 */ 16268 if (phyi->phyint_grp == NULL) { 16269 /* 16270 * If someone has renamed another IPMP group to have 16271 * the same name as our interface, bail. 16272 */ 16273 if (ipmp_grp_lookup(ill->ill_name, ipst) != NULL) { 16274 rw_exit(&ipst->ips_ipmp_lock); 16275 return (EEXIST); 16276 } 16277 phyi->phyint_grp = ipmp_grp_create(ill->ill_name, phyi); 16278 if (phyi->phyint_grp == NULL) { 16279 rw_exit(&ipst->ips_ipmp_lock); 16280 return (ENOMEM); 16281 } 16282 } 16283 rw_exit(&ipst->ips_ipmp_lock); 16284 } 16285 16286 /* Tell downstream modules where they are. */ 16287 ip_ifname_notify(ill, q); 16288 16289 /* 16290 * ill_dl_phys returns EINPROGRESS in the usual case. 16291 * Error cases are ENOMEM ... 16292 */ 16293 err = ill_dl_phys(ill, ipif, mp, q); 16294 16295 if (ill->ill_isv6) { 16296 mutex_enter(&ipst->ips_mld_slowtimeout_lock); 16297 if (ipst->ips_mld_slowtimeout_id == 0) { 16298 ipst->ips_mld_slowtimeout_id = timeout(mld_slowtimo, 16299 (void *)ipst, 16300 MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL)); 16301 } 16302 mutex_exit(&ipst->ips_mld_slowtimeout_lock); 16303 } else { 16304 mutex_enter(&ipst->ips_igmp_slowtimeout_lock); 16305 if (ipst->ips_igmp_slowtimeout_id == 0) { 16306 ipst->ips_igmp_slowtimeout_id = timeout(igmp_slowtimo, 16307 (void *)ipst, 16308 MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL)); 16309 } 16310 mutex_exit(&ipst->ips_igmp_slowtimeout_lock); 16311 } 16312 16313 return (err); 16314 } 16315 16316 /* 16317 * Common routine for ppa and ifname setting. Should be called exclusive. 16318 * 16319 * Returns EINPROGRESS when mp has been consumed by queueing it on 16320 * ipx_pending_mp and the ioctl will complete in ip_rput. 16321 * 16322 * NOTE : If ppa is UNIT_MAX, we assign the next valid ppa and return 16323 * the new name and new ppa in lifr_name and lifr_ppa respectively. 16324 * For SLIFNAME, we pass these values back to the userland. 
16325 */ 16326 static int 16327 ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr) 16328 { 16329 ill_t *ill; 16330 ipif_t *ipif; 16331 ipsq_t *ipsq; 16332 char *ppa_ptr; 16333 char *old_ptr; 16334 char old_char; 16335 int error; 16336 ip_stack_t *ipst; 16337 16338 ip1dbg(("ipif_set_values: interface %s\n", interf_name)); 16339 ASSERT(q->q_next != NULL); 16340 ASSERT(interf_name != NULL); 16341 16342 ill = (ill_t *)q->q_ptr; 16343 ipst = ill->ill_ipst; 16344 16345 ASSERT(ill->ill_ipst != NULL); 16346 ASSERT(ill->ill_name[0] == '\0'); 16347 ASSERT(IAM_WRITER_ILL(ill)); 16348 ASSERT((mi_strlen(interf_name) + 1) <= LIFNAMSIZ); 16349 ASSERT(ill->ill_ppa == UINT_MAX); 16350 16351 ill->ill_defend_start = ill->ill_defend_count = 0; 16352 /* The ppa is sent down by ifconfig or is chosen */ 16353 if ((ppa_ptr = ill_get_ppa_ptr(interf_name)) == NULL) { 16354 return (EINVAL); 16355 } 16356 16357 /* 16358 * make sure ppa passed in is same as ppa in the name. 16359 * This check is not made when ppa == UINT_MAX in that case ppa 16360 * in the name could be anything. System will choose a ppa and 16361 * update new_ppa_ptr and inter_name to contain the choosen ppa. 16362 */ 16363 if (*new_ppa_ptr != UINT_MAX) { 16364 /* stoi changes the pointer */ 16365 old_ptr = ppa_ptr; 16366 /* 16367 * ifconfig passed in 0 for the ppa for DLPI 1 style devices 16368 * (they don't have an externally visible ppa). We assign one 16369 * here so that we can manage the interface. Note that in 16370 * the past this value was always 0 for DLPI 1 drivers. 16371 */ 16372 if (*new_ppa_ptr == 0) 16373 *new_ppa_ptr = stoi(&old_ptr); 16374 else if (*new_ppa_ptr != (uint_t)stoi(&old_ptr)) 16375 return (EINVAL); 16376 } 16377 /* 16378 * terminate string before ppa 16379 * save char at that location. 
16380 */ 16381 old_char = ppa_ptr[0]; 16382 ppa_ptr[0] = '\0'; 16383 16384 ill->ill_ppa = *new_ppa_ptr; 16385 /* 16386 * Finish as much work now as possible before calling ill_glist_insert 16387 * which makes the ill globally visible and also merges it with the 16388 * other protocol instance of this phyint. The remaining work is 16389 * done after entering the ipsq which may happen sometime later. 16390 * ill_set_ndd_name occurs after the ill has been made globally visible. 16391 */ 16392 ipif = ill->ill_ipif; 16393 16394 /* We didn't do this when we allocated ipif in ip_ll_subnet_defaults */ 16395 ipif_assign_seqid(ipif); 16396 16397 if (!(ill->ill_flags & (ILLF_IPV4|ILLF_IPV6))) 16398 ill->ill_flags |= ILLF_IPV4; 16399 16400 ASSERT(ipif->ipif_next == NULL); /* Only one ipif on ill */ 16401 ASSERT((ipif->ipif_flags & IPIF_UP) == 0); 16402 16403 if (ill->ill_flags & ILLF_IPV6) { 16404 16405 ill->ill_isv6 = B_TRUE; 16406 ill_set_inputfn(ill); 16407 if (ill->ill_rq != NULL) { 16408 ill->ill_rq->q_qinfo = &iprinitv6; 16409 } 16410 16411 /* Keep the !IN6_IS_ADDR_V4MAPPED assertions happy */ 16412 ipif->ipif_v6lcl_addr = ipv6_all_zeros; 16413 ipif->ipif_v6subnet = ipv6_all_zeros; 16414 ipif->ipif_v6net_mask = ipv6_all_zeros; 16415 ipif->ipif_v6brd_addr = ipv6_all_zeros; 16416 ipif->ipif_v6pp_dst_addr = ipv6_all_zeros; 16417 ill->ill_reachable_retrans_time = ND_RETRANS_TIMER; 16418 /* 16419 * point-to-point or Non-mulicast capable 16420 * interfaces won't do NUD unless explicitly 16421 * configured to do so. 16422 */ 16423 if (ipif->ipif_flags & IPIF_POINTOPOINT || 16424 !(ill->ill_flags & ILLF_MULTICAST)) { 16425 ill->ill_flags |= ILLF_NONUD; 16426 } 16427 /* Make sure IPv4 specific flag is not set on IPv6 if */ 16428 if (ill->ill_flags & ILLF_NOARP) { 16429 /* 16430 * Note: xresolv interfaces will eventually need 16431 * NOARP set here as well, but that will require 16432 * those external resolvers to have some 16433 * knowledge of that flag and act appropriately. 
16434 * Not to be changed at present. 16435 */ 16436 ill->ill_flags &= ~ILLF_NOARP; 16437 } 16438 /* 16439 * Set the ILLF_ROUTER flag according to the global 16440 * IPv6 forwarding policy. 16441 */ 16442 if (ipst->ips_ipv6_forward != 0) 16443 ill->ill_flags |= ILLF_ROUTER; 16444 } else if (ill->ill_flags & ILLF_IPV4) { 16445 ill->ill_isv6 = B_FALSE; 16446 ill_set_inputfn(ill); 16447 ill->ill_reachable_retrans_time = ARP_RETRANS_TIMER; 16448 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6lcl_addr); 16449 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6subnet); 16450 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6net_mask); 16451 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6brd_addr); 16452 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6pp_dst_addr); 16453 /* 16454 * Set the ILLF_ROUTER flag according to the global 16455 * IPv4 forwarding policy. 16456 */ 16457 if (ipst->ips_ip_g_forward != 0) 16458 ill->ill_flags |= ILLF_ROUTER; 16459 } 16460 16461 ASSERT(ill->ill_phyint != NULL); 16462 16463 /* 16464 * The ipIfStatsIfindex and ipv6IfIcmpIfIndex assignments will 16465 * be completed in ill_glist_insert -> ill_phyint_reinit 16466 */ 16467 if (!ill_allocate_mibs(ill)) 16468 return (ENOMEM); 16469 16470 /* 16471 * Pick a default sap until we get the DL_INFO_ACK back from 16472 * the driver. 16473 */ 16474 ill->ill_sap = (ill->ill_isv6) ? ill->ill_media->ip_m_ipv6sap : 16475 ill->ill_media->ip_m_ipv4sap; 16476 16477 ill->ill_ifname_pending = 1; 16478 ill->ill_ifname_pending_err = 0; 16479 16480 /* 16481 * When the first ipif comes up in ipif_up_done(), multicast groups 16482 * that were joined while this ill was not bound to the DLPI link need 16483 * to be recovered by ill_recover_multicast(). 
16484 */ 16485 ill->ill_need_recover_multicast = 1; 16486 16487 ill_refhold(ill); 16488 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 16489 if ((error = ill_glist_insert(ill, interf_name, 16490 (ill->ill_flags & ILLF_IPV6) == ILLF_IPV6)) > 0) { 16491 ill->ill_ppa = UINT_MAX; 16492 ill->ill_name[0] = '\0'; 16493 /* 16494 * undo null termination done above. 16495 */ 16496 ppa_ptr[0] = old_char; 16497 rw_exit(&ipst->ips_ill_g_lock); 16498 ill_refrele(ill); 16499 return (error); 16500 } 16501 16502 ASSERT(ill->ill_name_length <= LIFNAMSIZ); 16503 16504 /* 16505 * When we return the buffer pointed to by interf_name should contain 16506 * the same name as in ill_name. 16507 * If a ppa was choosen by the system (ppa passed in was UINT_MAX) 16508 * the buffer pointed to by new_ppa_ptr would not contain the right ppa 16509 * so copy full name and update the ppa ptr. 16510 * When ppa passed in != UINT_MAX all values are correct just undo 16511 * null termination, this saves a bcopy. 16512 */ 16513 if (*new_ppa_ptr == UINT_MAX) { 16514 bcopy(ill->ill_name, interf_name, ill->ill_name_length); 16515 *new_ppa_ptr = ill->ill_ppa; 16516 } else { 16517 /* 16518 * undo null termination done above. 16519 */ 16520 ppa_ptr[0] = old_char; 16521 } 16522 16523 /* Let SCTP know about this ILL */ 16524 sctp_update_ill(ill, SCTP_ILL_INSERT); 16525 16526 /* 16527 * ill_glist_insert has made the ill visible globally, and 16528 * ill_phyint_reinit could have changed the ipsq. At this point, 16529 * we need to hold the ips_ill_g_lock across the call to enter the 16530 * ipsq to enforce atomicity and prevent reordering. In the event 16531 * the ipsq has changed, and if the new ipsq is currently busy, 16532 * we need to make sure that this half-completed ioctl is ahead of 16533 * any subsequent ioctl. We achieve this by not dropping the 16534 * ips_ill_g_lock which prevents any ill lookup itself thereby 16535 * ensuring that new ioctls can't start. 
16536 */ 16537 ipsq = ipsq_try_enter_internal(ill, q, mp, ip_reprocess_ioctl, NEW_OP, 16538 B_TRUE); 16539 16540 rw_exit(&ipst->ips_ill_g_lock); 16541 ill_refrele(ill); 16542 if (ipsq == NULL) 16543 return (EINPROGRESS); 16544 16545 /* 16546 * If ill_phyint_reinit() changed our ipsq, then start on the new ipsq. 16547 */ 16548 if (ipsq->ipsq_xop->ipx_current_ipif == NULL) 16549 ipsq_current_start(ipsq, ipif, SIOCSLIFNAME); 16550 else 16551 ASSERT(ipsq->ipsq_xop->ipx_current_ipif == ipif); 16552 16553 error = ipif_set_values_tail(ill, ipif, mp, q); 16554 ipsq_exit(ipsq); 16555 if (error != 0 && error != EINPROGRESS) { 16556 /* 16557 * restore previous values 16558 */ 16559 ill->ill_isv6 = B_FALSE; 16560 ill_set_inputfn(ill); 16561 } 16562 return (error); 16563 } 16564 16565 void 16566 ipif_init(ip_stack_t *ipst) 16567 { 16568 int i; 16569 16570 for (i = 0; i < MAX_G_HEADS; i++) { 16571 ipst->ips_ill_g_heads[i].ill_g_list_head = 16572 (ill_if_t *)&ipst->ips_ill_g_heads[i]; 16573 ipst->ips_ill_g_heads[i].ill_g_list_tail = 16574 (ill_if_t *)&ipst->ips_ill_g_heads[i]; 16575 } 16576 16577 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 16578 ill_phyint_compare_index, 16579 sizeof (phyint_t), 16580 offsetof(struct phyint, phyint_avl_by_index)); 16581 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 16582 ill_phyint_compare_name, 16583 sizeof (phyint_t), 16584 offsetof(struct phyint, phyint_avl_by_name)); 16585 } 16586 16587 /* 16588 * Save enough information so that we can recreate the IRE if 16589 * the interface goes down and then up. 
16590 */ 16591 void 16592 ill_save_ire(ill_t *ill, ire_t *ire) 16593 { 16594 mblk_t *save_mp; 16595 16596 save_mp = allocb(sizeof (ifrt_t), BPRI_MED); 16597 if (save_mp != NULL) { 16598 ifrt_t *ifrt; 16599 16600 save_mp->b_wptr += sizeof (ifrt_t); 16601 ifrt = (ifrt_t *)save_mp->b_rptr; 16602 bzero(ifrt, sizeof (ifrt_t)); 16603 ifrt->ifrt_type = ire->ire_type; 16604 if (ire->ire_ipversion == IPV4_VERSION) { 16605 ASSERT(!ill->ill_isv6); 16606 ifrt->ifrt_addr = ire->ire_addr; 16607 ifrt->ifrt_gateway_addr = ire->ire_gateway_addr; 16608 ifrt->ifrt_setsrc_addr = ire->ire_setsrc_addr; 16609 ifrt->ifrt_mask = ire->ire_mask; 16610 } else { 16611 ASSERT(ill->ill_isv6); 16612 ifrt->ifrt_v6addr = ire->ire_addr_v6; 16613 /* ire_gateway_addr_v6 can change due to RTM_CHANGE */ 16614 mutex_enter(&ire->ire_lock); 16615 ifrt->ifrt_v6gateway_addr = ire->ire_gateway_addr_v6; 16616 mutex_exit(&ire->ire_lock); 16617 ifrt->ifrt_v6setsrc_addr = ire->ire_setsrc_addr_v6; 16618 ifrt->ifrt_v6mask = ire->ire_mask_v6; 16619 } 16620 ifrt->ifrt_flags = ire->ire_flags; 16621 ifrt->ifrt_zoneid = ire->ire_zoneid; 16622 mutex_enter(&ill->ill_saved_ire_lock); 16623 save_mp->b_cont = ill->ill_saved_ire_mp; 16624 ill->ill_saved_ire_mp = save_mp; 16625 ill->ill_saved_ire_cnt++; 16626 mutex_exit(&ill->ill_saved_ire_lock); 16627 } 16628 } 16629 16630 /* 16631 * Remove one entry from ill_saved_ire_mp. 16632 */ 16633 void 16634 ill_remove_saved_ire(ill_t *ill, ire_t *ire) 16635 { 16636 mblk_t **mpp; 16637 mblk_t *mp; 16638 ifrt_t *ifrt; 16639 16640 /* Remove from ill_saved_ire_mp list if it is there */ 16641 mutex_enter(&ill->ill_saved_ire_lock); 16642 for (mpp = &ill->ill_saved_ire_mp; *mpp != NULL; 16643 mpp = &(*mpp)->b_cont) { 16644 in6_addr_t gw_addr_v6; 16645 16646 /* 16647 * On a given ill, the tuple of address, gateway, mask, 16648 * ire_type, and zoneid is unique for each saved IRE. 
16649 */ 16650 mp = *mpp; 16651 ifrt = (ifrt_t *)mp->b_rptr; 16652 /* ire_gateway_addr_v6 can change - need lock */ 16653 mutex_enter(&ire->ire_lock); 16654 gw_addr_v6 = ire->ire_gateway_addr_v6; 16655 mutex_exit(&ire->ire_lock); 16656 16657 if (ifrt->ifrt_zoneid != ire->ire_zoneid || 16658 ifrt->ifrt_type != ire->ire_type) 16659 continue; 16660 16661 if (ill->ill_isv6 ? 16662 (IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6addr, 16663 &ire->ire_addr_v6) && 16664 IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6gateway_addr, 16665 &gw_addr_v6) && 16666 IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6mask, 16667 &ire->ire_mask_v6)) : 16668 (ifrt->ifrt_addr == ire->ire_addr && 16669 ifrt->ifrt_gateway_addr == ire->ire_gateway_addr && 16670 ifrt->ifrt_mask == ire->ire_mask)) { 16671 *mpp = mp->b_cont; 16672 ill->ill_saved_ire_cnt--; 16673 freeb(mp); 16674 break; 16675 } 16676 } 16677 mutex_exit(&ill->ill_saved_ire_lock); 16678 } 16679 16680 /* 16681 * IP multirouting broadcast routes handling 16682 * Append CGTP broadcast IREs to regular ones created 16683 * at ifconfig time. 16684 * The usage is a route add <cgtp_bc> <nic_bc> -multirt i.e., both 16685 * the destination and the gateway are broadcast addresses. 16686 * The caller has verified that the destination is an IRE_BROADCAST and that 16687 * RTF_MULTIRT was set. Here if the gateway is a broadcast address, then 16688 * we create a MULTIRT IRE_BROADCAST. 16689 * Note that the IRE_HOST created by ire_rt_add doesn't get found by anything 16690 * since the IRE_BROADCAST takes precedence; ire_add_v4 does head insertion. 16691 */ 16692 static void 16693 ip_cgtp_bcast_add(ire_t *ire, ip_stack_t *ipst) 16694 { 16695 ire_t *ire_prim; 16696 16697 ASSERT(ire != NULL); 16698 16699 ire_prim = ire_ftable_lookup_v4(ire->ire_gateway_addr, 0, 0, 16700 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst, 16701 NULL); 16702 if (ire_prim != NULL) { 16703 /* 16704 * We are in the special case of broadcasts for 16705 * CGTP. 
We add an IRE_BROADCAST that holds 16706 * the RTF_MULTIRT flag, the destination 16707 * address and the low level 16708 * info of ire_prim. In other words, CGTP 16709 * broadcast is added to the redundant ipif. 16710 */ 16711 ill_t *ill_prim; 16712 ire_t *bcast_ire; 16713 16714 ill_prim = ire_prim->ire_ill; 16715 16716 ip2dbg(("ip_cgtp_filter_bcast_add: ire_prim %p, ill_prim %p\n", 16717 (void *)ire_prim, (void *)ill_prim)); 16718 16719 bcast_ire = ire_create( 16720 (uchar_t *)&ire->ire_addr, 16721 (uchar_t *)&ip_g_all_ones, 16722 (uchar_t *)&ire->ire_gateway_addr, 16723 IRE_BROADCAST, 16724 ill_prim, 16725 GLOBAL_ZONEID, /* CGTP is only for the global zone */ 16726 ire->ire_flags | RTF_KERNEL, 16727 NULL, 16728 ipst); 16729 16730 /* 16731 * Here we assume that ire_add does head insertion so that 16732 * the added IRE_BROADCAST comes before the existing IRE_HOST. 16733 */ 16734 if (bcast_ire != NULL) { 16735 if (ire->ire_flags & RTF_SETSRC) { 16736 bcast_ire->ire_setsrc_addr = 16737 ire->ire_setsrc_addr; 16738 } 16739 bcast_ire = ire_add(bcast_ire); 16740 if (bcast_ire != NULL) { 16741 ip2dbg(("ip_cgtp_filter_bcast_add: " 16742 "added bcast_ire %p\n", 16743 (void *)bcast_ire)); 16744 16745 ill_save_ire(ill_prim, bcast_ire); 16746 ire_refrele(bcast_ire); 16747 } 16748 } 16749 ire_refrele(ire_prim); 16750 } 16751 } 16752 16753 /* 16754 * IP multirouting broadcast routes handling 16755 * Remove the broadcast ire. 16756 * The usage is a route delete <cgtp_bc> <nic_bc> -multirt i.e., both 16757 * the destination and the gateway are broadcast addresses. 16758 * The caller has only verified that RTF_MULTIRT was set. We check 16759 * that the destination is broadcast and that the gateway is a broadcast 16760 * address, and if so delete the IRE added by ip_cgtp_bcast_add(). 
16761 */ 16762 static void 16763 ip_cgtp_bcast_delete(ire_t *ire, ip_stack_t *ipst) 16764 { 16765 ASSERT(ire != NULL); 16766 16767 if (ip_type_v4(ire->ire_addr, ipst) == IRE_BROADCAST) { 16768 ire_t *ire_prim; 16769 16770 ire_prim = ire_ftable_lookup_v4(ire->ire_gateway_addr, 0, 0, 16771 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, 16772 ipst, NULL); 16773 if (ire_prim != NULL) { 16774 ill_t *ill_prim; 16775 ire_t *bcast_ire; 16776 16777 ill_prim = ire_prim->ire_ill; 16778 16779 ip2dbg(("ip_cgtp_filter_bcast_delete: " 16780 "ire_prim %p, ill_prim %p\n", 16781 (void *)ire_prim, (void *)ill_prim)); 16782 16783 bcast_ire = ire_ftable_lookup_v4(ire->ire_addr, 0, 16784 ire->ire_gateway_addr, IRE_BROADCAST, 16785 ill_prim, ALL_ZONES, NULL, 16786 MATCH_IRE_TYPE | MATCH_IRE_GW | MATCH_IRE_ILL | 16787 MATCH_IRE_MASK, 0, ipst, NULL); 16788 16789 if (bcast_ire != NULL) { 16790 ip2dbg(("ip_cgtp_filter_bcast_delete: " 16791 "looked up bcast_ire %p\n", 16792 (void *)bcast_ire)); 16793 ill_remove_saved_ire(bcast_ire->ire_ill, 16794 bcast_ire); 16795 ire_delete(bcast_ire); 16796 ire_refrele(bcast_ire); 16797 } 16798 ire_refrele(ire_prim); 16799 } 16800 } 16801 } 16802 16803 /* 16804 * Derive an interface id from the link layer address. 16805 * Knows about IEEE 802 and IEEE EUI-64 mappings. 16806 */ 16807 static void 16808 ip_ether_v6intfid(ill_t *ill, in6_addr_t *v6addr) 16809 { 16810 char *addr; 16811 16812 /* 16813 * Note that some IPv6 interfaces get plumbed over links that claim to 16814 * be DL_ETHER, but don't actually have Ethernet MAC addresses (e.g. 16815 * PPP links). The ETHERADDRL check here ensures that we only set the 16816 * interface ID on IPv6 interfaces above links that actually have real 16817 * Ethernet addresses. 
16818 */ 16819 if (ill->ill_phys_addr_length == ETHERADDRL) { 16820 /* Form EUI-64 like address */ 16821 addr = (char *)&v6addr->s6_addr32[2]; 16822 bcopy(ill->ill_phys_addr, addr, 3); 16823 addr[0] ^= 0x2; /* Toggle Universal/Local bit */ 16824 addr[3] = (char)0xff; 16825 addr[4] = (char)0xfe; 16826 bcopy(ill->ill_phys_addr + 3, addr + 5, 3); 16827 } 16828 } 16829 16830 /* ARGSUSED */ 16831 static void 16832 ip_nodef_v6intfid(ill_t *ill, in6_addr_t *v6addr) 16833 { 16834 } 16835 16836 typedef struct ipmp_ifcookie { 16837 uint32_t ic_hostid; 16838 char ic_ifname[LIFNAMSIZ]; 16839 char ic_zonename[ZONENAME_MAX]; 16840 } ipmp_ifcookie_t; 16841 16842 /* 16843 * Construct a pseudo-random interface ID for the IPMP interface that's both 16844 * predictable and (almost) guaranteed to be unique. 16845 */ 16846 static void 16847 ip_ipmp_v6intfid(ill_t *ill, in6_addr_t *v6addr) 16848 { 16849 zone_t *zp; 16850 uint8_t *addr; 16851 uchar_t hash[16]; 16852 ulong_t hostid; 16853 MD5_CTX ctx; 16854 ipmp_ifcookie_t ic = { 0 }; 16855 16856 ASSERT(IS_IPMP(ill)); 16857 16858 (void) ddi_strtoul(hw_serial, NULL, 10, &hostid); 16859 ic.ic_hostid = htonl((uint32_t)hostid); 16860 16861 (void) strlcpy(ic.ic_ifname, ill->ill_name, LIFNAMSIZ); 16862 16863 if ((zp = zone_find_by_id(ill->ill_zoneid)) != NULL) { 16864 (void) strlcpy(ic.ic_zonename, zp->zone_name, ZONENAME_MAX); 16865 zone_rele(zp); 16866 } 16867 16868 MD5Init(&ctx); 16869 MD5Update(&ctx, &ic, sizeof (ic)); 16870 MD5Final(hash, &ctx); 16871 16872 /* 16873 * Map the hash to an interface ID per the basic approach in RFC3041. 16874 */ 16875 addr = &v6addr->s6_addr8[8]; 16876 bcopy(hash + 8, addr, sizeof (uint64_t)); 16877 addr[0] &= ~0x2; /* set local bit */ 16878 } 16879 16880 /* 16881 * Map the multicast in6_addr_t in m_ip6addr to the physaddr for ethernet. 
16882 */ 16883 static void 16884 ip_ether_v6_mapping(ill_t *ill, uchar_t *m_ip6addr, uchar_t *m_physaddr) 16885 { 16886 phyint_t *phyi = ill->ill_phyint; 16887 16888 /* 16889 * Check PHYI_MULTI_BCAST and length of physical 16890 * address to determine if we use the mapping or the 16891 * broadcast address. 16892 */ 16893 if ((phyi->phyint_flags & PHYI_MULTI_BCAST) != 0 || 16894 ill->ill_phys_addr_length != ETHERADDRL) { 16895 ip_mbcast_mapping(ill, m_ip6addr, m_physaddr); 16896 return; 16897 } 16898 m_physaddr[0] = 0x33; 16899 m_physaddr[1] = 0x33; 16900 m_physaddr[2] = m_ip6addr[12]; 16901 m_physaddr[3] = m_ip6addr[13]; 16902 m_physaddr[4] = m_ip6addr[14]; 16903 m_physaddr[5] = m_ip6addr[15]; 16904 } 16905 16906 /* 16907 * Map the multicast ipaddr_t in m_ipaddr to the physaddr for ethernet. 16908 */ 16909 static void 16910 ip_ether_v4_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr) 16911 { 16912 phyint_t *phyi = ill->ill_phyint; 16913 16914 /* 16915 * Check PHYI_MULTI_BCAST and length of physical 16916 * address to determine if we use the mapping or the 16917 * broadcast address. 16918 */ 16919 if ((phyi->phyint_flags & PHYI_MULTI_BCAST) != 0 || 16920 ill->ill_phys_addr_length != ETHERADDRL) { 16921 ip_mbcast_mapping(ill, m_ipaddr, m_physaddr); 16922 return; 16923 } 16924 m_physaddr[0] = 0x01; 16925 m_physaddr[1] = 0x00; 16926 m_physaddr[2] = 0x5e; 16927 m_physaddr[3] = m_ipaddr[1] & 0x7f; 16928 m_physaddr[4] = m_ipaddr[2]; 16929 m_physaddr[5] = m_ipaddr[3]; 16930 } 16931 16932 /* ARGSUSED */ 16933 static void 16934 ip_mbcast_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr) 16935 { 16936 /* 16937 * for the MULTI_BCAST case and other cases when we want to 16938 * use the link-layer broadcast address for multicast. 
16939 */ 16940 uint8_t *bphys_addr; 16941 dl_unitdata_req_t *dlur; 16942 16943 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr; 16944 if (ill->ill_sap_length < 0) { 16945 bphys_addr = (uchar_t *)dlur + 16946 dlur->dl_dest_addr_offset; 16947 } else { 16948 bphys_addr = (uchar_t *)dlur + 16949 dlur->dl_dest_addr_offset + ill->ill_sap_length; 16950 } 16951 16952 bcopy(bphys_addr, m_physaddr, ill->ill_phys_addr_length); 16953 } 16954 16955 /* 16956 * Derive IPoIB interface id from the link layer address. 16957 */ 16958 static void 16959 ip_ib_v6intfid(ill_t *ill, in6_addr_t *v6addr) 16960 { 16961 char *addr; 16962 16963 ASSERT(ill->ill_phys_addr_length == 20); 16964 addr = (char *)&v6addr->s6_addr32[2]; 16965 bcopy(ill->ill_phys_addr + 12, addr, 8); 16966 /* 16967 * In IBA 1.1 timeframe, some vendors erroneously set the u/l bit 16968 * in the globally assigned EUI-64 GUID to 1, in violation of IEEE 16969 * rules. In these cases, the IBA considers these GUIDs to be in 16970 * "Modified EUI-64" format, and thus toggling the u/l bit is not 16971 * required; vendors are required not to assign global EUI-64's 16972 * that differ only in u/l bit values, thus guaranteeing uniqueness 16973 * of the interface identifier. Whether the GUID is in modified 16974 * or proper EUI-64 format, the ipv6 identifier must have the u/l 16975 * bit set to 1. 16976 */ 16977 addr[0] |= 2; /* Set Universal/Local bit to 1 */ 16978 } 16979 16980 /* 16981 * Map the multicast ipaddr_t in m_ipaddr to the physaddr for InfiniBand. 16982 * Note on mapping from multicast IP addresses to IPoIB multicast link 16983 * addresses. IPoIB multicast link addresses are based on IBA link addresses. 16984 * The format of an IPoIB multicast address is: 16985 * 16986 * 4 byte QPN Scope Sign. 
Pkey 16987 * +--------------------------------------------+ 16988 * | 00FFFFFF | FF | 1X | X01B | Pkey | GroupID | 16989 * +--------------------------------------------+ 16990 * 16991 * The Scope and Pkey components are properties of the IBA port and 16992 * network interface. They can be ascertained from the broadcast address. 16993 * The Sign. part is the signature, and is 401B for IPv4 and 601B for IPv6. 16994 */ 16995 static void 16996 ip_ib_v4_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr) 16997 { 16998 static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff, 16999 0xff, 0x10, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, 17000 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; 17001 uint8_t *bphys_addr; 17002 dl_unitdata_req_t *dlur; 17003 17004 bcopy(ipv4_g_phys_ibmulti_addr, m_physaddr, ill->ill_phys_addr_length); 17005 17006 /* 17007 * RFC 4391: IPv4 MGID is 28-bit long. 17008 */ 17009 m_physaddr[16] = m_ipaddr[0] & 0x0f; 17010 m_physaddr[17] = m_ipaddr[1]; 17011 m_physaddr[18] = m_ipaddr[2]; 17012 m_physaddr[19] = m_ipaddr[3]; 17013 17014 17015 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr; 17016 if (ill->ill_sap_length < 0) { 17017 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset; 17018 } else { 17019 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset + 17020 ill->ill_sap_length; 17021 } 17022 /* 17023 * Now fill in the IBA scope/Pkey values from the broadcast address. 
17024 */ 17025 m_physaddr[5] = bphys_addr[5]; 17026 m_physaddr[8] = bphys_addr[8]; 17027 m_physaddr[9] = bphys_addr[9]; 17028 } 17029 17030 static void 17031 ip_ib_v6_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr) 17032 { 17033 static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff, 17034 0xff, 0x10, 0x60, 0x1b, 0x00, 0x00, 0x00, 0x00, 17035 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; 17036 uint8_t *bphys_addr; 17037 dl_unitdata_req_t *dlur; 17038 17039 bcopy(ipv4_g_phys_ibmulti_addr, m_physaddr, ill->ill_phys_addr_length); 17040 17041 /* 17042 * RFC 4391: IPv4 MGID is 80-bit long. 17043 */ 17044 bcopy(&m_ipaddr[6], &m_physaddr[10], 10); 17045 17046 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr; 17047 if (ill->ill_sap_length < 0) { 17048 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset; 17049 } else { 17050 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset + 17051 ill->ill_sap_length; 17052 } 17053 /* 17054 * Now fill in the IBA scope/Pkey values from the broadcast address. 17055 */ 17056 m_physaddr[5] = bphys_addr[5]; 17057 m_physaddr[8] = bphys_addr[8]; 17058 m_physaddr[9] = bphys_addr[9]; 17059 } 17060 17061 /* 17062 * Derive IPv6 interface id from an IPv4 link-layer address (e.g. from an IPv4 17063 * tunnel). The IPv4 address simply get placed in the lower 4 bytes of the 17064 * IPv6 interface id. This is a suggested mechanism described in section 3.7 17065 * of RFC4213. 17066 */ 17067 static void 17068 ip_ipv4_genv6intfid(ill_t *ill, uint8_t *physaddr, in6_addr_t *v6addr) 17069 { 17070 ASSERT(ill->ill_phys_addr_length == sizeof (ipaddr_t)); 17071 v6addr->s6_addr32[2] = 0; 17072 bcopy(physaddr, &v6addr->s6_addr32[3], sizeof (ipaddr_t)); 17073 } 17074 17075 /* 17076 * Derive IPv6 interface id from an IPv6 link-layer address (e.g. from an IPv6 17077 * tunnel). The lower 8 bytes of the IPv6 address simply become the interface 17078 * id. 
17079 */ 17080 static void 17081 ip_ipv6_genv6intfid(ill_t *ill, uint8_t *physaddr, in6_addr_t *v6addr) 17082 { 17083 in6_addr_t *v6lladdr = (in6_addr_t *)physaddr; 17084 17085 ASSERT(ill->ill_phys_addr_length == sizeof (in6_addr_t)); 17086 bcopy(&v6lladdr->s6_addr32[2], &v6addr->s6_addr32[2], 8); 17087 } 17088 17089 static void 17090 ip_ipv6_v6intfid(ill_t *ill, in6_addr_t *v6addr) 17091 { 17092 ip_ipv6_genv6intfid(ill, ill->ill_phys_addr, v6addr); 17093 } 17094 17095 static void 17096 ip_ipv6_v6destintfid(ill_t *ill, in6_addr_t *v6addr) 17097 { 17098 ip_ipv6_genv6intfid(ill, ill->ill_dest_addr, v6addr); 17099 } 17100 17101 static void 17102 ip_ipv4_v6intfid(ill_t *ill, in6_addr_t *v6addr) 17103 { 17104 ip_ipv4_genv6intfid(ill, ill->ill_phys_addr, v6addr); 17105 } 17106 17107 static void 17108 ip_ipv4_v6destintfid(ill_t *ill, in6_addr_t *v6addr) 17109 { 17110 ip_ipv4_genv6intfid(ill, ill->ill_dest_addr, v6addr); 17111 } 17112 17113 /* 17114 * Lookup an ill and verify that the zoneid has an ipif on that ill. 17115 * Returns an held ill, or NULL. 17116 */ 17117 ill_t * 17118 ill_lookup_on_ifindex_zoneid(uint_t index, zoneid_t zoneid, boolean_t isv6, 17119 ip_stack_t *ipst) 17120 { 17121 ill_t *ill; 17122 ipif_t *ipif; 17123 17124 ill = ill_lookup_on_ifindex(index, isv6, ipst); 17125 if (ill == NULL) 17126 return (NULL); 17127 17128 mutex_enter(&ill->ill_lock); 17129 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 17130 if (IPIF_IS_CONDEMNED(ipif)) 17131 continue; 17132 if (zoneid != ALL_ZONES && ipif->ipif_zoneid != zoneid && 17133 ipif->ipif_zoneid != ALL_ZONES) 17134 continue; 17135 17136 mutex_exit(&ill->ill_lock); 17137 return (ill); 17138 } 17139 mutex_exit(&ill->ill_lock); 17140 ill_refrele(ill); 17141 return (NULL); 17142 } 17143 17144 /* 17145 * Return a pointer to an ipif_t given a combination of (ill_idx,ipif_id) 17146 * If a pointer to an ipif_t is returned then the caller will need to do 17147 * an ill_refrele(). 
17148 */ 17149 ipif_t * 17150 ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6, 17151 ip_stack_t *ipst) 17152 { 17153 ipif_t *ipif; 17154 ill_t *ill; 17155 17156 ill = ill_lookup_on_ifindex(ifindex, isv6, ipst); 17157 if (ill == NULL) 17158 return (NULL); 17159 17160 mutex_enter(&ill->ill_lock); 17161 if (ill->ill_state_flags & ILL_CONDEMNED) { 17162 mutex_exit(&ill->ill_lock); 17163 ill_refrele(ill); 17164 return (NULL); 17165 } 17166 17167 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 17168 if (!IPIF_CAN_LOOKUP(ipif)) 17169 continue; 17170 if (lifidx == ipif->ipif_id) { 17171 ipif_refhold_locked(ipif); 17172 break; 17173 } 17174 } 17175 17176 mutex_exit(&ill->ill_lock); 17177 ill_refrele(ill); 17178 return (ipif); 17179 } 17180 17181 /* 17182 * Set ill_inputfn based on the current know state. 17183 * This needs to be called when any of the factors taken into 17184 * account changes. 17185 */ 17186 void 17187 ill_set_inputfn(ill_t *ill) 17188 { 17189 ip_stack_t *ipst = ill->ill_ipst; 17190 17191 if (ill->ill_isv6) { 17192 if (is_system_labeled()) 17193 ill->ill_inputfn = ill_input_full_v6; 17194 else 17195 ill->ill_inputfn = ill_input_short_v6; 17196 } else { 17197 if (is_system_labeled()) 17198 ill->ill_inputfn = ill_input_full_v4; 17199 else if (ill->ill_dhcpinit != 0) 17200 ill->ill_inputfn = ill_input_full_v4; 17201 else if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head 17202 != NULL) 17203 ill->ill_inputfn = ill_input_full_v4; 17204 else if (ipst->ips_ip_cgtp_filter && 17205 ipst->ips_ip_cgtp_filter_ops != NULL) 17206 ill->ill_inputfn = ill_input_full_v4; 17207 else 17208 ill->ill_inputfn = ill_input_short_v4; 17209 } 17210 } 17211 17212 /* 17213 * Re-evaluate ill_inputfn for all the IPv4 ills. 17214 * Used when RSVP and CGTP comes and goes. 
17215 */ 17216 void 17217 ill_set_inputfn_all(ip_stack_t *ipst) 17218 { 17219 ill_walk_context_t ctx; 17220 ill_t *ill; 17221 17222 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 17223 ill = ILL_START_WALK_V4(&ctx, ipst); 17224 for (; ill != NULL; ill = ill_next(&ctx, ill)) 17225 ill_set_inputfn(ill); 17226 17227 rw_exit(&ipst->ips_ill_g_lock); 17228 } 17229 17230 /* 17231 * Set the physical address information for `ill' to the contents of the 17232 * dl_notify_ind_t pointed to by `mp'. Must be called as writer, and will be 17233 * asynchronous if `ill' cannot immediately be quiesced -- in which case 17234 * EINPROGRESS will be returned. 17235 */ 17236 int 17237 ill_set_phys_addr(ill_t *ill, mblk_t *mp) 17238 { 17239 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 17240 dl_notify_ind_t *dlindp = (dl_notify_ind_t *)mp->b_rptr; 17241 17242 ASSERT(IAM_WRITER_IPSQ(ipsq)); 17243 17244 if (dlindp->dl_data != DL_IPV6_LINK_LAYER_ADDR && 17245 dlindp->dl_data != DL_CURR_DEST_ADDR && 17246 dlindp->dl_data != DL_CURR_PHYS_ADDR) { 17247 /* Changing DL_IPV6_TOKEN is not yet supported */ 17248 return (0); 17249 } 17250 17251 /* 17252 * We need to store up to two copies of `mp' in `ill'. Due to the 17253 * design of ipsq_pending_mp_add(), we can't pass them as separate 17254 * arguments to ill_set_phys_addr_tail(). Instead, chain them 17255 * together here, then pull 'em apart in ill_set_phys_addr_tail(). 17256 */ 17257 if ((mp = copyb(mp)) == NULL || (mp->b_cont = copyb(mp)) == NULL) { 17258 freemsg(mp); 17259 return (ENOMEM); 17260 } 17261 17262 ipsq_current_start(ipsq, ill->ill_ipif, 0); 17263 mutex_enter(&ill->ill_lock); 17264 ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS; 17265 /* no more nce addition allowed */ 17266 mutex_exit(&ill->ill_lock); 17267 17268 /* 17269 * If we can quiesce the ill, then set the address. If not, then 17270 * ill_set_phys_addr_tail() will be called from ipif_ill_refrele_tail(). 
17271 */ 17272 ill_down_ipifs(ill, B_TRUE); 17273 mutex_enter(&ill->ill_lock); 17274 if (!ill_is_quiescent(ill)) { 17275 /* call cannot fail since `conn_t *' argument is NULL */ 17276 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq, 17277 mp, ILL_DOWN); 17278 mutex_exit(&ill->ill_lock); 17279 return (EINPROGRESS); 17280 } 17281 mutex_exit(&ill->ill_lock); 17282 17283 ill_set_phys_addr_tail(ipsq, ill->ill_rq, mp, NULL); 17284 return (0); 17285 } 17286 17287 /* 17288 * Once the ill associated with `q' has quiesced, set its physical address 17289 * information to the values in `addrmp'. Note that two copies of `addrmp' 17290 * are passed (linked by b_cont), since we sometimes need to save two distinct 17291 * copies in the ill_t, and our context doesn't permit sleeping or allocation 17292 * failure (we'll free the other copy if it's not needed). Since the ill_t 17293 * is quiesced, we know any stale nce's with the old address information have 17294 * already been removed, so we don't need to call nce_flush(). 
17295 */ 17296 /* ARGSUSED */ 17297 static void 17298 ill_set_phys_addr_tail(ipsq_t *ipsq, queue_t *q, mblk_t *addrmp, void *dummy) 17299 { 17300 ill_t *ill = q->q_ptr; 17301 mblk_t *addrmp2 = unlinkb(addrmp); 17302 dl_notify_ind_t *dlindp = (dl_notify_ind_t *)addrmp->b_rptr; 17303 uint_t addrlen, addroff; 17304 int status; 17305 17306 ASSERT(IAM_WRITER_IPSQ(ipsq)); 17307 17308 addroff = dlindp->dl_addr_offset; 17309 addrlen = dlindp->dl_addr_length - ABS(ill->ill_sap_length); 17310 17311 switch (dlindp->dl_data) { 17312 case DL_IPV6_LINK_LAYER_ADDR: 17313 ill_set_ndmp(ill, addrmp, addroff, addrlen); 17314 freemsg(addrmp2); 17315 break; 17316 17317 case DL_CURR_DEST_ADDR: 17318 freemsg(ill->ill_dest_addr_mp); 17319 ill->ill_dest_addr = addrmp->b_rptr + addroff; 17320 ill->ill_dest_addr_mp = addrmp; 17321 if (ill->ill_isv6) { 17322 ill_setdesttoken(ill); 17323 ipif_setdestlinklocal(ill->ill_ipif); 17324 } 17325 freemsg(addrmp2); 17326 break; 17327 17328 case DL_CURR_PHYS_ADDR: 17329 freemsg(ill->ill_phys_addr_mp); 17330 ill->ill_phys_addr = addrmp->b_rptr + addroff; 17331 ill->ill_phys_addr_mp = addrmp; 17332 ill->ill_phys_addr_length = addrlen; 17333 if (ill->ill_isv6) 17334 ill_set_ndmp(ill, addrmp2, addroff, addrlen); 17335 else 17336 freemsg(addrmp2); 17337 if (ill->ill_isv6) { 17338 ill_setdefaulttoken(ill); 17339 ipif_setlinklocal(ill->ill_ipif); 17340 } 17341 break; 17342 default: 17343 ASSERT(0); 17344 } 17345 17346 /* 17347 * If there are ipifs to bring up, ill_up_ipifs() will return 17348 * EINPROGRESS, and ipsq_current_finish() will be called by 17349 * ip_rput_dlpi_writer() or arp_bringup_done() when the last ipif is 17350 * brought up. 
17351 */ 17352 status = ill_up_ipifs(ill, q, addrmp); 17353 mutex_enter(&ill->ill_lock); 17354 if (ill->ill_dl_up) 17355 ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS; 17356 mutex_exit(&ill->ill_lock); 17357 if (status != EINPROGRESS) 17358 ipsq_current_finish(ipsq); 17359 } 17360 17361 /* 17362 * Helper routine for setting the ill_nd_lla fields. 17363 */ 17364 void 17365 ill_set_ndmp(ill_t *ill, mblk_t *ndmp, uint_t addroff, uint_t addrlen) 17366 { 17367 freemsg(ill->ill_nd_lla_mp); 17368 ill->ill_nd_lla = ndmp->b_rptr + addroff; 17369 ill->ill_nd_lla_mp = ndmp; 17370 ill->ill_nd_lla_len = addrlen; 17371 } 17372 17373 /* 17374 * Replumb the ill. 17375 */ 17376 int 17377 ill_replumb(ill_t *ill, mblk_t *mp) 17378 { 17379 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 17380 17381 ASSERT(IAM_WRITER_IPSQ(ipsq)); 17382 17383 ipsq_current_start(ipsq, ill->ill_ipif, 0); 17384 17385 mutex_enter(&ill->ill_lock); 17386 ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS; 17387 /* no more nce addition allowed */ 17388 mutex_exit(&ill->ill_lock); 17389 17390 /* 17391 * If we can quiesce the ill, then continue. If not, then 17392 * ill_replumb_tail() will be called from ipif_ill_refrele_tail(). 
17393 */ 17394 ill_down_ipifs(ill, B_FALSE); 17395 17396 mutex_enter(&ill->ill_lock); 17397 if (!ill_is_quiescent(ill)) { 17398 /* call cannot fail since `conn_t *' argument is NULL */ 17399 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq, 17400 mp, ILL_DOWN); 17401 mutex_exit(&ill->ill_lock); 17402 return (EINPROGRESS); 17403 } 17404 mutex_exit(&ill->ill_lock); 17405 17406 ill_replumb_tail(ipsq, ill->ill_rq, mp, NULL); 17407 return (0); 17408 } 17409 17410 /* ARGSUSED */ 17411 static void 17412 ill_replumb_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy) 17413 { 17414 ill_t *ill = q->q_ptr; 17415 int err; 17416 conn_t *connp = NULL; 17417 17418 ASSERT(IAM_WRITER_IPSQ(ipsq)); 17419 freemsg(ill->ill_replumb_mp); 17420 ill->ill_replumb_mp = copyb(mp); 17421 17422 if (ill->ill_replumb_mp == NULL) { 17423 /* out of memory */ 17424 ipsq_current_finish(ipsq); 17425 return; 17426 } 17427 17428 mutex_enter(&ill->ill_lock); 17429 ill->ill_up_ipifs = ipsq_pending_mp_add(NULL, ill->ill_ipif, 17430 ill->ill_rq, ill->ill_replumb_mp, 0); 17431 mutex_exit(&ill->ill_lock); 17432 17433 if (!ill->ill_up_ipifs) { 17434 /* already closing */ 17435 ipsq_current_finish(ipsq); 17436 return; 17437 } 17438 ill->ill_replumbing = 1; 17439 err = ill_down_ipifs_tail(ill); 17440 17441 /* 17442 * Successfully quiesced and brought down the interface, now we send 17443 * the DL_NOTE_REPLUMB_DONE message down to the driver. Reuse the 17444 * DL_NOTE_REPLUMB message. 17445 */ 17446 mp = mexchange(NULL, mp, sizeof (dl_notify_conf_t), M_PROTO, 17447 DL_NOTIFY_CONF); 17448 ASSERT(mp != NULL); 17449 ((dl_notify_conf_t *)mp->b_rptr)->dl_notification = 17450 DL_NOTE_REPLUMB_DONE; 17451 ill_dlpi_send(ill, mp); 17452 17453 /* 17454 * For IPv4, we would usually get EINPROGRESS because the ETHERTYPE_ARP 17455 * streams have to be unbound. When all the DLPI exchanges are done, 17456 * ipsq_current_finish() will be called by arp_bringup_done(). 
The 17457 * remainder of ipif bringup via ill_up_ipifs() will also be done in 17458 * arp_bringup_done(). 17459 */ 17460 ASSERT(ill->ill_replumb_mp != NULL); 17461 if (err == EINPROGRESS) 17462 return; 17463 else 17464 ill->ill_replumb_mp = ipsq_pending_mp_get(ipsq, &connp); 17465 ASSERT(connp == NULL); 17466 if (err == 0 && ill->ill_replumb_mp != NULL && 17467 ill_up_ipifs(ill, q, ill->ill_replumb_mp) == EINPROGRESS) { 17468 return; 17469 } 17470 ipsq_current_finish(ipsq); 17471 } 17472 17473 /* 17474 * Issue ioctl `cmd' on `lh'; caller provides the initial payload in `buf' 17475 * which is `bufsize' bytes. On success, zero is returned and `buf' updated 17476 * as per the ioctl. On failure, an errno is returned. 17477 */ 17478 static int 17479 ip_ioctl(ldi_handle_t lh, int cmd, void *buf, uint_t bufsize, cred_t *cr) 17480 { 17481 int rval; 17482 struct strioctl iocb; 17483 17484 iocb.ic_cmd = cmd; 17485 iocb.ic_timout = 15; 17486 iocb.ic_len = bufsize; 17487 iocb.ic_dp = buf; 17488 17489 return (ldi_ioctl(lh, I_STR, (intptr_t)&iocb, FKIOCTL, cr, &rval)); 17490 } 17491 17492 /* 17493 * Issue an SIOCGLIFCONF for address family `af' and store the result into a 17494 * dynamically-allocated `lifcp' that will be `bufsizep' bytes on success. 17495 */ 17496 static int 17497 ip_lifconf_ioctl(ldi_handle_t lh, int af, struct lifconf *lifcp, 17498 uint_t *bufsizep, cred_t *cr) 17499 { 17500 int err; 17501 struct lifnum lifn; 17502 17503 bzero(&lifn, sizeof (lifn)); 17504 lifn.lifn_family = af; 17505 lifn.lifn_flags = LIFC_UNDER_IPMP; 17506 17507 if ((err = ip_ioctl(lh, SIOCGLIFNUM, &lifn, sizeof (lifn), cr)) != 0) 17508 return (err); 17509 17510 /* 17511 * Pad the interface count to account for additional interfaces that 17512 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF. 
17513 */ 17514 lifn.lifn_count += 4; 17515 bzero(lifcp, sizeof (*lifcp)); 17516 lifcp->lifc_flags = LIFC_UNDER_IPMP; 17517 lifcp->lifc_family = af; 17518 lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq); 17519 lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP); 17520 17521 err = ip_ioctl(lh, SIOCGLIFCONF, lifcp, sizeof (*lifcp), cr); 17522 if (err != 0) { 17523 kmem_free(lifcp->lifc_buf, *bufsizep); 17524 return (err); 17525 } 17526 17527 return (0); 17528 } 17529 17530 /* 17531 * Helper for ip_interface_cleanup() that removes the loopback interface. 17532 */ 17533 static void 17534 ip_loopback_removeif(ldi_handle_t lh, boolean_t isv6, cred_t *cr) 17535 { 17536 int err; 17537 struct lifreq lifr; 17538 17539 bzero(&lifr, sizeof (lifr)); 17540 (void) strcpy(lifr.lifr_name, ipif_loopback_name); 17541 17542 err = ip_ioctl(lh, SIOCLIFREMOVEIF, &lifr, sizeof (lifr), cr); 17543 if (err != 0) { 17544 ip0dbg(("ip_loopback_removeif: IP%s SIOCLIFREMOVEIF failed: " 17545 "error %d\n", isv6 ? "v6" : "v4", err)); 17546 } 17547 } 17548 17549 /* 17550 * Helper for ip_interface_cleanup() that ensures no IP interfaces are in IPMP 17551 * groups and that IPMP data addresses are down. These conditions must be met 17552 * so that IPMP interfaces can be I_PUNLINK'd, as per ip_sioctl_plink_ipmp(). 17553 */ 17554 static void 17555 ip_ipmp_cleanup(ldi_handle_t lh, boolean_t isv6, cred_t *cr) 17556 { 17557 int af = isv6 ? 
AF_INET6 : AF_INET; 17558 int i, nifs; 17559 int err; 17560 uint_t bufsize; 17561 uint_t lifrsize = sizeof (struct lifreq); 17562 struct lifconf lifc; 17563 struct lifreq *lifrp; 17564 17565 if ((err = ip_lifconf_ioctl(lh, af, &lifc, &bufsize, cr)) != 0) { 17566 cmn_err(CE_WARN, "ip_ipmp_cleanup: cannot get interface list " 17567 "(error %d); any IPMP interfaces cannot be shutdown", err); 17568 return; 17569 } 17570 17571 nifs = lifc.lifc_len / lifrsize; 17572 for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) { 17573 err = ip_ioctl(lh, SIOCGLIFFLAGS, lifrp, lifrsize, cr); 17574 if (err != 0) { 17575 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot get " 17576 "flags: error %d", lifrp->lifr_name, err); 17577 continue; 17578 } 17579 17580 if (lifrp->lifr_flags & IFF_IPMP) { 17581 if ((lifrp->lifr_flags & (IFF_UP|IFF_DUPLICATE)) == 0) 17582 continue; 17583 17584 lifrp->lifr_flags &= ~IFF_UP; 17585 err = ip_ioctl(lh, SIOCSLIFFLAGS, lifrp, lifrsize, cr); 17586 if (err != 0) { 17587 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot " 17588 "bring down (error %d); IPMP interface may " 17589 "not be shutdown", lifrp->lifr_name, err); 17590 } 17591 17592 /* 17593 * Check if IFF_DUPLICATE is still set -- and if so, 17594 * reset the address to clear it. 
17595 */ 17596 err = ip_ioctl(lh, SIOCGLIFFLAGS, lifrp, lifrsize, cr); 17597 if (err != 0 || !(lifrp->lifr_flags & IFF_DUPLICATE)) 17598 continue; 17599 17600 err = ip_ioctl(lh, SIOCGLIFADDR, lifrp, lifrsize, cr); 17601 if (err != 0 || (err = ip_ioctl(lh, SIOCGLIFADDR, 17602 lifrp, lifrsize, cr)) != 0) { 17603 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot " 17604 "reset DAD (error %d); IPMP interface may " 17605 "not be shutdown", lifrp->lifr_name, err); 17606 } 17607 continue; 17608 } 17609 17610 lifrp->lifr_groupname[0] = '\0'; 17611 err = ip_ioctl(lh, SIOCSLIFGROUPNAME, lifrp, lifrsize, cr); 17612 if (err != 0) { 17613 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot leave " 17614 "IPMP group (error %d); associated IPMP interface " 17615 "may not be shutdown", lifrp->lifr_name, err); 17616 continue; 17617 } 17618 } 17619 17620 kmem_free(lifc.lifc_buf, bufsize); 17621 } 17622 17623 #define UDPDEV "/devices/pseudo/udp@0:udp" 17624 #define UDP6DEV "/devices/pseudo/udp6@0:udp6" 17625 17626 /* 17627 * Remove the loopback interfaces and prep the IPMP interfaces to be torn down. 17628 * Non-loopback interfaces are either I_LINK'd or I_PLINK'd; the former go away 17629 * when the user-level processes in the zone are killed and the latter are 17630 * cleaned up by str_stack_shutdown(). 17631 */ 17632 void 17633 ip_interface_cleanup(ip_stack_t *ipst) 17634 { 17635 ldi_handle_t lh; 17636 ldi_ident_t li; 17637 cred_t *cr; 17638 int err; 17639 int i; 17640 char *devs[] = { UDP6DEV, UDPDEV }; 17641 netstackid_t stackid = ipst->ips_netstack->netstack_stackid; 17642 17643 if ((err = ldi_ident_from_major(ddi_name_to_major("ip"), &li)) != 0) { 17644 cmn_err(CE_WARN, "ip_interface_cleanup: cannot get ldi ident:" 17645 " error %d", err); 17646 return; 17647 } 17648 17649 cr = zone_get_kcred(netstackid_to_zoneid(stackid)); 17650 ASSERT(cr != NULL); 17651 17652 /* 17653 * NOTE: loop executes exactly twice and is hardcoded to know that the 17654 * first iteration is IPv6. 
(Unrolling yields repetitious code, hence 17655 * the loop.) 17656 */ 17657 for (i = 0; i < 2; i++) { 17658 err = ldi_open_by_name(devs[i], FREAD|FWRITE, cr, &lh, li); 17659 if (err != 0) { 17660 cmn_err(CE_WARN, "ip_interface_cleanup: cannot open %s:" 17661 " error %d", devs[i], err); 17662 continue; 17663 } 17664 17665 ip_loopback_removeif(lh, i == 0, cr); 17666 ip_ipmp_cleanup(lh, i == 0, cr); 17667 17668 (void) ldi_close(lh, FREAD|FWRITE, cr); 17669 } 17670 17671 ldi_ident_release(li); 17672 crfree(cr); 17673 } 17674 17675 /* 17676 * This needs to be in-sync with nic_event_t definition 17677 */ 17678 static const char * 17679 ill_hook_event2str(nic_event_t event) 17680 { 17681 switch (event) { 17682 case NE_PLUMB: 17683 return ("PLUMB"); 17684 case NE_UNPLUMB: 17685 return ("UNPLUMB"); 17686 case NE_UP: 17687 return ("UP"); 17688 case NE_DOWN: 17689 return ("DOWN"); 17690 case NE_ADDRESS_CHANGE: 17691 return ("ADDRESS_CHANGE"); 17692 case NE_LIF_UP: 17693 return ("LIF_UP"); 17694 case NE_LIF_DOWN: 17695 return ("LIF_DOWN"); 17696 case NE_IFINDEX_CHANGE: 17697 return ("IFINDEX_CHANGE"); 17698 default: 17699 return ("UNKNOWN"); 17700 } 17701 } 17702 17703 void 17704 ill_nic_event_dispatch(ill_t *ill, lif_if_t lif, nic_event_t event, 17705 nic_event_data_t data, size_t datalen) 17706 { 17707 ip_stack_t *ipst = ill->ill_ipst; 17708 hook_nic_event_int_t *info; 17709 const char *str = NULL; 17710 17711 /* create a new nic event info */ 17712 if ((info = kmem_alloc(sizeof (*info), KM_NOSLEEP)) == NULL) 17713 goto fail; 17714 17715 info->hnei_event.hne_nic = ill->ill_phyint->phyint_ifindex; 17716 info->hnei_event.hne_lif = lif; 17717 info->hnei_event.hne_event = event; 17718 info->hnei_event.hne_protocol = ill->ill_isv6 ? 
17719 ipst->ips_ipv6_net_data : ipst->ips_ipv4_net_data; 17720 info->hnei_event.hne_data = NULL; 17721 info->hnei_event.hne_datalen = 0; 17722 info->hnei_stackid = ipst->ips_netstack->netstack_stackid; 17723 17724 if (data != NULL && datalen != 0) { 17725 info->hnei_event.hne_data = kmem_alloc(datalen, KM_NOSLEEP); 17726 if (info->hnei_event.hne_data == NULL) 17727 goto fail; 17728 bcopy(data, info->hnei_event.hne_data, datalen); 17729 info->hnei_event.hne_datalen = datalen; 17730 } 17731 17732 if (ddi_taskq_dispatch(eventq_queue_nic, ip_ne_queue_func, info, 17733 DDI_NOSLEEP) == DDI_SUCCESS) 17734 return; 17735 17736 fail: 17737 if (info != NULL) { 17738 if (info->hnei_event.hne_data != NULL) { 17739 kmem_free(info->hnei_event.hne_data, 17740 info->hnei_event.hne_datalen); 17741 } 17742 kmem_free(info, sizeof (hook_nic_event_t)); 17743 } 17744 str = ill_hook_event2str(event); 17745 ip2dbg(("ill_nic_event_dispatch: could not dispatch %s nic event " 17746 "information for %s (ENOMEM)\n", str, ill->ill_name)); 17747 } 17748 17749 static int 17750 ipif_arp_up_done_tail(ipif_t *ipif, enum ip_resolver_action res_act) 17751 { 17752 int err = 0; 17753 const in_addr_t *addr = NULL; 17754 nce_t *nce = NULL; 17755 ill_t *ill = ipif->ipif_ill; 17756 ill_t *bound_ill; 17757 boolean_t added_ipif = B_FALSE; 17758 uint16_t state; 17759 uint16_t flags; 17760 17761 DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_up_done_tail", 17762 ill_t *, ill, ipif_t *, ipif); 17763 if (ipif->ipif_lcl_addr != INADDR_ANY) { 17764 addr = &ipif->ipif_lcl_addr; 17765 } 17766 17767 if ((ipif->ipif_flags & IPIF_UNNUMBERED) || addr == NULL) { 17768 if (res_act != Res_act_initial) 17769 return (EINVAL); 17770 } 17771 17772 if (addr != NULL) { 17773 ipmp_illgrp_t *illg = ill->ill_grp; 17774 17775 /* add unicast nce for the local addr */ 17776 17777 if (IS_IPMP(ill)) { 17778 /* 17779 * If we're here via ipif_up(), then the ipif 17780 * won't be bound yet -- add it to the group, 17781 * which will bind it if 
possible. (We would 17782 * add it in ipif_up(), but deleting on failure 17783 * there is gruesome.) If we're here via 17784 * ipmp_ill_bind_ipif(), then the ipif has 17785 * already been added to the group and we 17786 * just need to use the binding. 17787 */ 17788 if ((bound_ill = ipmp_ipif_bound_ill(ipif)) == NULL) { 17789 bound_ill = ipmp_illgrp_add_ipif(illg, ipif); 17790 if (bound_ill == NULL) { 17791 /* 17792 * We couldn't bind the ipif to an ill 17793 * yet, so we have nothing to publish. 17794 * Mark the address as ready and return. 17795 */ 17796 ipif->ipif_addr_ready = 1; 17797 return (0); 17798 } 17799 added_ipif = B_TRUE; 17800 } 17801 } else { 17802 bound_ill = ill; 17803 } 17804 17805 flags = (NCE_F_MYADDR | NCE_F_PUBLISH | NCE_F_AUTHORITY | 17806 NCE_F_NONUD); 17807 /* 17808 * If this is an initial bring-up (or the ipif was never 17809 * completely brought up), do DAD. Otherwise, we're here 17810 * because IPMP has rebound an address to this ill: send 17811 * unsolicited advertisements (ARP announcements) to 17812 * inform others. 17813 */ 17814 if (res_act == Res_act_initial || !ipif->ipif_addr_ready) { 17815 state = ND_UNCHANGED; /* compute in nce_add_common() */ 17816 } else { 17817 state = ND_REACHABLE; 17818 flags |= NCE_F_UNSOL_ADV; 17819 } 17820 17821 retry: 17822 err = nce_lookup_then_add_v4(ill, 17823 bound_ill->ill_phys_addr, bound_ill->ill_phys_addr_length, 17824 addr, flags, state, &nce); 17825 17826 /* 17827 * note that we may encounter EEXIST if we are moving 17828 * the nce as a result of a rebind operation. 
17829 */ 17830 switch (err) { 17831 case 0: 17832 ipif->ipif_added_nce = 1; 17833 nce->nce_ipif_cnt++; 17834 break; 17835 case EEXIST: 17836 ip1dbg(("ipif_arp_up: NCE already exists for %s\n", 17837 ill->ill_name)); 17838 if (!NCE_MYADDR(nce->nce_common)) { 17839 /* 17840 * A leftover nce from before this address 17841 * existed 17842 */ 17843 ncec_delete(nce->nce_common); 17844 nce_refrele(nce); 17845 nce = NULL; 17846 goto retry; 17847 } 17848 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 17849 nce_refrele(nce); 17850 nce = NULL; 17851 ip1dbg(("ipif_arp_up: NCE already exists " 17852 "for %s:%u\n", ill->ill_name, 17853 ipif->ipif_id)); 17854 goto arp_up_done; 17855 } 17856 /* 17857 * Duplicate local addresses are permissible for 17858 * IPIF_POINTOPOINT interfaces which will get marked 17859 * IPIF_UNNUMBERED later in 17860 * ip_addr_availability_check(). 17861 * 17862 * The nce_ipif_cnt field tracks the number of 17863 * ipifs that have nce_addr as their local address. 17864 */ 17865 ipif->ipif_addr_ready = 1; 17866 ipif->ipif_added_nce = 1; 17867 nce->nce_ipif_cnt++; 17868 err = 0; 17869 break; 17870 default: 17871 ASSERT(nce == NULL); 17872 goto arp_up_done; 17873 } 17874 if (arp_no_defense) { 17875 if ((ipif->ipif_flags & IPIF_UP) && 17876 !ipif->ipif_addr_ready) 17877 ipif_up_notify(ipif); 17878 ipif->ipif_addr_ready = 1; 17879 } 17880 } else { 17881 /* zero address. 
nothing to publish */ 17882 ipif->ipif_addr_ready = 1; 17883 } 17884 if (nce != NULL) 17885 nce_refrele(nce); 17886 arp_up_done: 17887 if (added_ipif && err != 0) 17888 ipmp_illgrp_del_ipif(ill->ill_grp, ipif); 17889 return (err); 17890 } 17891 17892 int 17893 ipif_arp_up(ipif_t *ipif, enum ip_resolver_action res_act, boolean_t was_dup) 17894 { 17895 int err = 0; 17896 ill_t *ill = ipif->ipif_ill; 17897 boolean_t first_interface, wait_for_dlpi = B_FALSE; 17898 17899 DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_up", 17900 ill_t *, ill, ipif_t *, ipif); 17901 17902 /* 17903 * need to bring up ARP or setup mcast mapping only 17904 * when the first interface is coming UP. 17905 */ 17906 first_interface = (ill->ill_ipif_up_count == 0 && 17907 ill->ill_ipif_dup_count == 0 && !was_dup); 17908 17909 if (res_act == Res_act_initial && first_interface) { 17910 /* 17911 * Send ATTACH + BIND 17912 */ 17913 err = arp_ll_up(ill); 17914 if (err != EINPROGRESS && err != 0) 17915 return (err); 17916 17917 /* 17918 * Add NCE for local address. Start DAD. 17919 * we'll wait to hear that DAD has finished 17920 * before using the interface. 17921 */ 17922 if (err == EINPROGRESS) 17923 wait_for_dlpi = B_TRUE; 17924 } 17925 17926 if (!wait_for_dlpi) 17927 (void) ipif_arp_up_done_tail(ipif, res_act); 17928 17929 return (!wait_for_dlpi ? 0 : EINPROGRESS); 17930 } 17931 17932 /* 17933 * Finish processing of "arp_up" after all the DLPI message 17934 * exchanges have completed between arp and the driver. 
17935 */ 17936 void 17937 arp_bringup_done(ill_t *ill, int err) 17938 { 17939 mblk_t *mp1; 17940 ipif_t *ipif; 17941 conn_t *connp = NULL; 17942 ipsq_t *ipsq; 17943 queue_t *q; 17944 17945 ip1dbg(("arp_bringup_done(%s)\n", ill->ill_name)); 17946 17947 ASSERT(IAM_WRITER_ILL(ill)); 17948 17949 ipsq = ill->ill_phyint->phyint_ipsq; 17950 ipif = ipsq->ipsq_xop->ipx_pending_ipif; 17951 mp1 = ipsq_pending_mp_get(ipsq, &connp); 17952 ASSERT(!((mp1 != NULL) ^ (ipif != NULL))); 17953 if (mp1 == NULL) /* bringup was aborted by the user */ 17954 return; 17955 17956 /* 17957 * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we 17958 * must have an associated conn_t. Otherwise, we're bringing this 17959 * interface back up as part of handling an asynchronous event (e.g., 17960 * physical address change). 17961 */ 17962 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) { 17963 ASSERT(connp != NULL); 17964 q = CONNP_TO_WQ(connp); 17965 } else { 17966 ASSERT(connp == NULL); 17967 q = ill->ill_rq; 17968 } 17969 if (err == 0) { 17970 if (ipif->ipif_isv6) { 17971 if ((err = ipif_up_done_v6(ipif)) != 0) 17972 ip0dbg(("arp_bringup_done: init failed\n")); 17973 } else { 17974 err = ipif_arp_up_done_tail(ipif, Res_act_initial); 17975 if (err != 0 || 17976 (err = ipif_up_done(ipif)) != 0) { 17977 ip0dbg(("arp_bringup_done: " 17978 "init failed err %x\n", err)); 17979 (void) ipif_arp_down(ipif); 17980 } 17981 17982 } 17983 } else { 17984 ip0dbg(("arp_bringup_done: DL_BIND_REQ failed\n")); 17985 } 17986 17987 if ((err == 0) && (ill->ill_up_ipifs)) { 17988 err = ill_up_ipifs(ill, q, mp1); 17989 if (err == EINPROGRESS) 17990 return; 17991 } 17992 17993 /* 17994 * If we have a moved ipif to bring up, and everything has succeeded 17995 * to this point, bring it up on the IPMP ill. Otherwise, leave it 17996 * down -- the admin can try to bring it up by hand if need be. 
17997 */ 17998 if (ill->ill_move_ipif != NULL) { 17999 ipif = ill->ill_move_ipif; 18000 ip1dbg(("bringing up ipif %p on ill %s\n", (void *)ipif, 18001 ipif->ipif_ill->ill_name)); 18002 ill->ill_move_ipif = NULL; 18003 if (err == 0) { 18004 err = ipif_up(ipif, q, mp1); 18005 if (err == EINPROGRESS) 18006 return; 18007 } 18008 } 18009 18010 /* 18011 * The operation must complete without EINPROGRESS since 18012 * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp. 18013 * Otherwise, the operation will be stuck forever in the ipsq. 18014 */ 18015 ASSERT(err != EINPROGRESS); 18016 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) { 18017 DTRACE_PROBE4(ipif__ioctl, char *, "arp_bringup_done finish", 18018 int, ipsq->ipsq_xop->ipx_current_ioctl, 18019 ill_t *, ill, ipif_t *, ipif); 18020 ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq); 18021 } else { 18022 ipsq_current_finish(ipsq); 18023 } 18024 } 18025 18026 /* 18027 * Finish processing of arp replumb after all the DLPI message 18028 * exchanges have completed between arp and the driver. 18029 */ 18030 void 18031 arp_replumb_done(ill_t *ill, int err) 18032 { 18033 mblk_t *mp1; 18034 ipif_t *ipif; 18035 conn_t *connp = NULL; 18036 ipsq_t *ipsq; 18037 queue_t *q; 18038 18039 ASSERT(IAM_WRITER_ILL(ill)); 18040 18041 ipsq = ill->ill_phyint->phyint_ipsq; 18042 ipif = ipsq->ipsq_xop->ipx_pending_ipif; 18043 mp1 = ipsq_pending_mp_get(ipsq, &connp); 18044 ASSERT(!((mp1 != NULL) ^ (ipif != NULL))); 18045 if (mp1 == NULL) { 18046 ip0dbg(("arp_replumb_done: bringup aborted ioctl %x\n", 18047 ipsq->ipsq_xop->ipx_current_ioctl)); 18048 /* bringup was aborted by the user */ 18049 return; 18050 } 18051 /* 18052 * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we 18053 * must have an associated conn_t. Otherwise, we're bringing this 18054 * interface back up as part of handling an asynchronous event (e.g., 18055 * physical address change). 
18056 */ 18057 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) { 18058 ASSERT(connp != NULL); 18059 q = CONNP_TO_WQ(connp); 18060 } else { 18061 ASSERT(connp == NULL); 18062 q = ill->ill_rq; 18063 } 18064 if ((err == 0) && (ill->ill_up_ipifs)) { 18065 err = ill_up_ipifs(ill, q, mp1); 18066 if (err == EINPROGRESS) 18067 return; 18068 } 18069 /* 18070 * The operation must complete without EINPROGRESS since 18071 * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp. 18072 * Otherwise, the operation will be stuck forever in the ipsq. 18073 */ 18074 ASSERT(err != EINPROGRESS); 18075 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) { 18076 DTRACE_PROBE4(ipif__ioctl, char *, 18077 "arp_replumb_done finish", 18078 int, ipsq->ipsq_xop->ipx_current_ioctl, 18079 ill_t *, ill, ipif_t *, ipif); 18080 ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq); 18081 } else { 18082 ipsq_current_finish(ipsq); 18083 } 18084 } 18085 18086 void 18087 ipif_up_notify(ipif_t *ipif) 18088 { 18089 ip_rts_ifmsg(ipif, RTSQ_DEFAULT); 18090 ip_rts_newaddrmsg(RTM_ADD, 0, ipif, RTSQ_DEFAULT); 18091 sctp_update_ipif(ipif, SCTP_IPIF_UP); 18092 ill_nic_event_dispatch(ipif->ipif_ill, MAP_IPIF_ID(ipif->ipif_id), 18093 NE_LIF_UP, NULL, 0); 18094 } 18095 18096 /* 18097 * ILB ioctl uses cv_wait (such as deleting a rule or adding a server) and 18098 * this assumes the context is cv_wait'able. Hence it shouldnt' be used on 18099 * TPI end points with STREAMS modules pushed above. This is assured by not 18100 * having the IPI_MODOK flag for the ioctl. And IP ensures the ILB ioctl 18101 * never ends up on an ipsq, otherwise we may end up processing the ioctl 18102 * while unwinding from the ispq and that could be a thread from the bottom. 
18103 */ 18104 /* ARGSUSED */ 18105 int 18106 ip_sioctl_ilb_cmd(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 18107 ip_ioctl_cmd_t *ipip, void *arg) 18108 { 18109 mblk_t *cmd_mp = mp->b_cont->b_cont; 18110 ilb_cmd_t command = *((ilb_cmd_t *)cmd_mp->b_rptr); 18111 int ret = 0; 18112 int i; 18113 size_t size; 18114 ip_stack_t *ipst; 18115 zoneid_t zoneid; 18116 ilb_stack_t *ilbs; 18117 18118 ipst = CONNQ_TO_IPST(q); 18119 ilbs = ipst->ips_netstack->netstack_ilb; 18120 zoneid = Q_TO_CONN(q)->conn_zoneid; 18121 18122 switch (command) { 18123 case ILB_CREATE_RULE: { 18124 ilb_rule_cmd_t *cmd = (ilb_rule_cmd_t *)cmd_mp->b_rptr; 18125 18126 if (MBLKL(cmd_mp) != sizeof (ilb_rule_cmd_t)) { 18127 ret = EINVAL; 18128 break; 18129 } 18130 18131 ret = ilb_rule_add(ilbs, zoneid, cmd); 18132 break; 18133 } 18134 case ILB_DESTROY_RULE: 18135 case ILB_ENABLE_RULE: 18136 case ILB_DISABLE_RULE: { 18137 ilb_name_cmd_t *cmd = (ilb_name_cmd_t *)cmd_mp->b_rptr; 18138 18139 if (MBLKL(cmd_mp) != sizeof (ilb_name_cmd_t)) { 18140 ret = EINVAL; 18141 break; 18142 } 18143 18144 if (cmd->flags & ILB_RULE_ALLRULES) { 18145 if (command == ILB_DESTROY_RULE) { 18146 ilb_rule_del_all(ilbs, zoneid); 18147 break; 18148 } else if (command == ILB_ENABLE_RULE) { 18149 ilb_rule_enable_all(ilbs, zoneid); 18150 break; 18151 } else if (command == ILB_DISABLE_RULE) { 18152 ilb_rule_disable_all(ilbs, zoneid); 18153 break; 18154 } 18155 } else { 18156 if (command == ILB_DESTROY_RULE) { 18157 ret = ilb_rule_del(ilbs, zoneid, cmd->name); 18158 } else if (command == ILB_ENABLE_RULE) { 18159 ret = ilb_rule_enable(ilbs, zoneid, cmd->name, 18160 NULL); 18161 } else if (command == ILB_DISABLE_RULE) { 18162 ret = ilb_rule_disable(ilbs, zoneid, cmd->name, 18163 NULL); 18164 } 18165 } 18166 break; 18167 } 18168 case ILB_NUM_RULES: { 18169 ilb_num_rules_cmd_t *cmd; 18170 18171 if (MBLKL(cmd_mp) != sizeof (ilb_num_rules_cmd_t)) { 18172 ret = EINVAL; 18173 break; 18174 } 18175 cmd = (ilb_num_rules_cmd_t 
*)cmd_mp->b_rptr; 18176 ilb_get_num_rules(ilbs, zoneid, &(cmd->num)); 18177 break; 18178 } 18179 case ILB_RULE_NAMES: { 18180 ilb_rule_names_cmd_t *cmd; 18181 18182 cmd = (ilb_rule_names_cmd_t *)cmd_mp->b_rptr; 18183 if (MBLKL(cmd_mp) < sizeof (ilb_rule_names_cmd_t) || 18184 cmd->num_names == 0) { 18185 ret = EINVAL; 18186 break; 18187 } 18188 size = cmd->num_names * ILB_RULE_NAMESZ; 18189 if (cmd_mp->b_rptr + offsetof(ilb_rule_names_cmd_t, buf) + 18190 size != cmd_mp->b_wptr) { 18191 ret = EINVAL; 18192 break; 18193 } 18194 ilb_get_rulenames(ilbs, zoneid, &cmd->num_names, cmd->buf); 18195 break; 18196 } 18197 case ILB_NUM_SERVERS: { 18198 ilb_num_servers_cmd_t *cmd; 18199 18200 if (MBLKL(cmd_mp) != sizeof (ilb_num_servers_cmd_t)) { 18201 ret = EINVAL; 18202 break; 18203 } 18204 cmd = (ilb_num_servers_cmd_t *)cmd_mp->b_rptr; 18205 ret = ilb_get_num_servers(ilbs, zoneid, cmd->name, 18206 &(cmd->num)); 18207 break; 18208 } 18209 case ILB_LIST_RULE: { 18210 ilb_rule_cmd_t *cmd = (ilb_rule_cmd_t *)cmd_mp->b_rptr; 18211 18212 if (MBLKL(cmd_mp) != sizeof (ilb_rule_cmd_t)) { 18213 ret = EINVAL; 18214 break; 18215 } 18216 ret = ilb_rule_list(ilbs, zoneid, cmd); 18217 break; 18218 } 18219 case ILB_LIST_SERVERS: { 18220 ilb_servers_info_cmd_t *cmd; 18221 18222 cmd = (ilb_servers_info_cmd_t *)cmd_mp->b_rptr; 18223 if (MBLKL(cmd_mp) < sizeof (ilb_servers_info_cmd_t) || 18224 cmd->num_servers == 0) { 18225 ret = EINVAL; 18226 break; 18227 } 18228 size = cmd->num_servers * sizeof (ilb_server_info_t); 18229 if (cmd_mp->b_rptr + offsetof(ilb_servers_info_cmd_t, servers) + 18230 size != cmd_mp->b_wptr) { 18231 ret = EINVAL; 18232 break; 18233 } 18234 18235 ret = ilb_get_servers(ilbs, zoneid, cmd->name, cmd->servers, 18236 &cmd->num_servers); 18237 break; 18238 } 18239 case ILB_ADD_SERVERS: { 18240 ilb_servers_info_cmd_t *cmd; 18241 ilb_rule_t *rule; 18242 18243 cmd = (ilb_servers_info_cmd_t *)cmd_mp->b_rptr; 18244 if (MBLKL(cmd_mp) < sizeof (ilb_servers_info_cmd_t)) { 18245 ret = 
EINVAL; 18246 break; 18247 } 18248 size = cmd->num_servers * sizeof (ilb_server_info_t); 18249 if (cmd_mp->b_rptr + offsetof(ilb_servers_info_cmd_t, servers) + 18250 size != cmd_mp->b_wptr) { 18251 ret = EINVAL; 18252 break; 18253 } 18254 rule = ilb_find_rule(ilbs, zoneid, cmd->name, &ret); 18255 if (rule == NULL) { 18256 ASSERT(ret != 0); 18257 break; 18258 } 18259 for (i = 0; i < cmd->num_servers; i++) { 18260 ilb_server_info_t *s; 18261 18262 s = &cmd->servers[i]; 18263 s->err = ilb_server_add(ilbs, rule, s); 18264 } 18265 ILB_RULE_REFRELE(rule); 18266 break; 18267 } 18268 case ILB_DEL_SERVERS: 18269 case ILB_ENABLE_SERVERS: 18270 case ILB_DISABLE_SERVERS: { 18271 ilb_servers_cmd_t *cmd; 18272 ilb_rule_t *rule; 18273 int (*f)(); 18274 18275 cmd = (ilb_servers_cmd_t *)cmd_mp->b_rptr; 18276 if (MBLKL(cmd_mp) < sizeof (ilb_servers_cmd_t)) { 18277 ret = EINVAL; 18278 break; 18279 } 18280 size = cmd->num_servers * sizeof (ilb_server_arg_t); 18281 if (cmd_mp->b_rptr + offsetof(ilb_servers_cmd_t, servers) + 18282 size != cmd_mp->b_wptr) { 18283 ret = EINVAL; 18284 break; 18285 } 18286 18287 if (command == ILB_DEL_SERVERS) 18288 f = ilb_server_del; 18289 else if (command == ILB_ENABLE_SERVERS) 18290 f = ilb_server_enable; 18291 else if (command == ILB_DISABLE_SERVERS) 18292 f = ilb_server_disable; 18293 18294 rule = ilb_find_rule(ilbs, zoneid, cmd->name, &ret); 18295 if (rule == NULL) { 18296 ASSERT(ret != 0); 18297 break; 18298 } 18299 18300 for (i = 0; i < cmd->num_servers; i++) { 18301 ilb_server_arg_t *s; 18302 18303 s = &cmd->servers[i]; 18304 s->err = f(ilbs, zoneid, NULL, rule, &s->addr); 18305 } 18306 ILB_RULE_REFRELE(rule); 18307 break; 18308 } 18309 case ILB_LIST_NAT_TABLE: { 18310 ilb_list_nat_cmd_t *cmd; 18311 18312 cmd = (ilb_list_nat_cmd_t *)cmd_mp->b_rptr; 18313 if (MBLKL(cmd_mp) < sizeof (ilb_list_nat_cmd_t)) { 18314 ret = EINVAL; 18315 break; 18316 } 18317 size = cmd->num_nat * sizeof (ilb_nat_entry_t); 18318 if (cmd_mp->b_rptr + 
offsetof(ilb_list_nat_cmd_t, entries) + 18319 size != cmd_mp->b_wptr) { 18320 ret = EINVAL; 18321 break; 18322 } 18323 18324 ret = ilb_list_nat(ilbs, zoneid, cmd->entries, &cmd->num_nat, 18325 &cmd->flags); 18326 break; 18327 } 18328 case ILB_LIST_STICKY_TABLE: { 18329 ilb_list_sticky_cmd_t *cmd; 18330 18331 cmd = (ilb_list_sticky_cmd_t *)cmd_mp->b_rptr; 18332 if (MBLKL(cmd_mp) < sizeof (ilb_list_sticky_cmd_t)) { 18333 ret = EINVAL; 18334 break; 18335 } 18336 size = cmd->num_sticky * sizeof (ilb_sticky_entry_t); 18337 if (cmd_mp->b_rptr + offsetof(ilb_list_sticky_cmd_t, entries) + 18338 size != cmd_mp->b_wptr) { 18339 ret = EINVAL; 18340 break; 18341 } 18342 18343 ret = ilb_list_sticky(ilbs, zoneid, cmd->entries, 18344 &cmd->num_sticky, &cmd->flags); 18345 break; 18346 } 18347 default: 18348 ret = EINVAL; 18349 break; 18350 } 18351 done: 18352 return (ret); 18353 } 18354 18355 /* Remove all cache entries for this logical interface */ 18356 void 18357 ipif_nce_down(ipif_t *ipif) 18358 { 18359 ill_t *ill = ipif->ipif_ill; 18360 nce_t *nce; 18361 18362 DTRACE_PROBE3(ipif__downup, char *, "ipif_nce_down", 18363 ill_t *, ill, ipif_t *, ipif); 18364 if (ipif->ipif_added_nce) { 18365 if (ipif->ipif_isv6) 18366 nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr); 18367 else 18368 nce = nce_lookup_v4(ill, &ipif->ipif_lcl_addr); 18369 if (nce != NULL) { 18370 if (--nce->nce_ipif_cnt == 0) 18371 ncec_delete(nce->nce_common); 18372 ipif->ipif_added_nce = 0; 18373 nce_refrele(nce); 18374 } else { 18375 /* 18376 * nce may already be NULL because it was already 18377 * flushed, e.g., due to a call to nce_flush 18378 */ 18379 ipif->ipif_added_nce = 0; 18380 } 18381 } 18382 /* 18383 * Make IPMP aware of the deleted data address. 18384 */ 18385 if (IS_IPMP(ill)) 18386 ipmp_illgrp_del_ipif(ill->ill_grp, ipif); 18387 18388 /* 18389 * Remove all other nces dependent on this ill when the last ipif 18390 * is going away. 
18391 */ 18392 if (ill->ill_ipif_up_count == 0) { 18393 ncec_walk(ill, (pfi_t)ncec_delete_per_ill, 18394 (uchar_t *)ill, ill->ill_ipst); 18395 if (IS_UNDER_IPMP(ill)) 18396 nce_flush(ill, B_TRUE); 18397 } 18398 } 18399