/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/* Copyright (c) 1990 Mentat Inc. */

/*
 * This file contains the interface control functions for IP.
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/dlpi.h>
#include <sys/stropts.h>
#include <sys/strsun.h>
#include <sys/sysmacros.h>
#include <sys/strsubr.h>
#include <sys/strlog.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/debug.h>
#include <sys/zone.h>
#include <sys/sunldi.h>
#include <sys/file.h>
#include <sys/bitmap.h>
#include <sys/cpuvar.h>
#include <sys/time.h>
#include <sys/ctype.h>
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/isa_defs.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_types.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <sys/sockio.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet/igmp_var.h>
#include <sys/policy.h>
#include <sys/ethernet.h>
#include <sys/callb.h>
#include <sys/md5.h>

#include <inet/common.h>	/* for various inet/mi.h and inet/nd.h needs */
#include <inet/mi.h>
#include <inet/nd.h>
#include <inet/arp.h>
#include <inet/ip_arp.h>
#include <inet/mib2.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/tcp.h>
#include <inet/ip_multi.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ip_if.h>
#include <inet/ip_impl.h>
#include <inet/sctp_ip.h>
#include <inet/ip_netinfo.h>
#include <inet/ilb_ip.h>

#include <netinet/igmp.h>
#include <inet/ip_listutils.h>
#include <inet/ipclassifier.h>
#include <sys/mac_client.h>
#include <sys/dld.h>

#include <sys/systeminfo.h>
#include <sys/bootconf.h>

#include <sys/tsol/tndb.h>
#include <sys/tsol/tnet.h>

/* The character which tells where the ill_name ends */
#define	IPIF_SEPARATOR_CHAR	':'

/* IP ioctl function table entry */
typedef struct ipft_s {
	int	ipft_cmd;
	pfi_t	ipft_pfi;
	int	ipft_min_size;
	int	ipft_flags;
} ipft_t;
#define	IPFT_F_NO_REPLY		0x1	/* IP ioctl does not expect any reply */
#define	IPFT_F_SELF_REPLY	0x2	/* ioctl callee does the ioctl reply */

static int	nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int	nd_ill_forward_set(queue_t *q, mblk_t *mp,
    char *value, caddr_t cp, cred_t *ioc_cr);

static boolean_t ill_is_quiescent(ill_t *);
static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask);
static ip_m_t	*ip_m_lookup(t_uscalar_t mac_type);
static int	ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q,
    mblk_t *mp);
static int	ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp);
static int	ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp,
    int ioccmd, struct linkblk *li);
static ipaddr_t	ip_subnet_mask(ipaddr_t addr, ipif_t **, ip_stack_t *);
static void	ip_wput_ioctl(queue_t *q, mblk_t *mp);
static void	ipsq_flush(ill_t *ill);

static int	ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static void	ipsq_delete(ipsq_t *);

static ipif_t	*ipif_allocate(ill_t *ill, int id, uint_t ire_type,
    boolean_t initialize, boolean_t insert, int *errorp);
static ire_t	**ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep);
static void	ipif_delete_bcast_ires(ipif_t *ipif);
static int	ipif_add_ires_v4(ipif_t *, boolean_t);
static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif,
    boolean_t isv6);
static int	ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp);
static void	ipif_free(ipif_t *ipif);
static void	ipif_free_tail(ipif_t *ipif);
static void	ipif_set_default(ipif_t *ipif);
static int	ipif_set_values(queue_t *q, mblk_t *mp,
    char *interf_name, uint_t *ppa);
static int	ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp,
    queue_t *q);
static ipif_t	*ipif_lookup_on_name(char *name, size_t namelen,
    boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid,
    ip_stack_t *);

static int	ill_alloc_ppa(ill_if_t *, ill_t *);
static void	ill_delete_interface_type(ill_if_t *);
static int	ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q);
static void	ill_dl_down(ill_t *ill);
static void	ill_down(ill_t *ill);
static void	ill_down_ipifs(ill_t *, boolean_t);
static void	ill_free_mib(ill_t *ill);
static void	ill_glist_delete(ill_t *);
static void	ill_phyint_reinit(ill_t *ill);
static void	ill_set_nce_router_flags(ill_t *, boolean_t);
static void	ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *);
static void	ill_replumb_tail(ipsq_t *, queue_t *, mblk_t *, void *);

static ip_v6intfid_func_t ip_ether_v6intfid, ip_ib_v6intfid;
static ip_v6intfid_func_t ip_ipv4_v6intfid, ip_ipv6_v6intfid;
static ip_v6intfid_func_t ip_ipmp_v6intfid, ip_nodef_v6intfid;
static ip_v6intfid_func_t ip_ipv4_v6destintfid, ip_ipv6_v6destintfid;
static ip_v4mapinfo_func_t ip_ether_v4_mapping;
static ip_v6mapinfo_func_t ip_ether_v6_mapping;
static ip_v4mapinfo_func_t ip_ib_v4_mapping;
static ip_v6mapinfo_func_t ip_ib_v6_mapping;
static ip_v4mapinfo_func_t ip_mbcast_mapping;
static void	ip_cgtp_bcast_add(ire_t *, ip_stack_t *);
static void	ip_cgtp_bcast_delete(ire_t *, ip_stack_t *);
static void	phyint_free(phyint_t *);

static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_vrrp_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_hcksum_reset_fill(ill_t *, mblk_t *);
static void ill_capability_zerocopy_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void ill_capability_zerocopy_reset_fill(ill_t *, mblk_t *);
static void ill_capability_dld_reset_fill(ill_t *, mblk_t *);
static void ill_capability_dld_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void ill_capability_dld_enable(ill_t *);
static void ill_capability_ack_thr(void *);
static void ill_capability_lso_enable(ill_t *);

static ill_t	*ill_prev_usesrc(ill_t *);
static int	ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t);
static void	ill_disband_usesrc_group(ill_t *);
static void	ip_sioctl_garp_reply(mblk_t *, ill_t *, void *, int);

#ifdef DEBUG
static void	ill_trace_cleanup(const ill_t *);
static void	ipif_trace_cleanup(const ipif_t *);
#endif

static void	ill_dlpi_clear_deferred(ill_t *ill);

/*
 * if we go over the memory footprint limit more than once in this msec
 * interval, we'll start pruning aggressively.
 */
int ip_min_frag_prune_time = 0;

static ipft_t	ip_ioctl_ftbl[] = {
	{ IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 },
	{ IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t),
	    IPFT_F_NO_REPLY },
	{ IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY },
	{ 0 }
};

/* Simple ICMP IP Header Template */
static ipha_t icmp_ipha = {
	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
};

static uchar_t	ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };

static ip_m_t	ip_m_tbl[] = {
	{ DL_ETHER, IFT_ETHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_CSMACD, IFT_ISO88023, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_TPB, IFT_ISO88024, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_TPR, IFT_ISO88025, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_FDDI, IFT_FDDI, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_IB, IFT_IB, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ib_v4_mapping, ip_ib_v6_mapping, ip_ib_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_IPV4, IFT_IPV4, IPPROTO_ENCAP, IPPROTO_IPV6,
	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid,
	    ip_ipv4_v6destintfid },
	{ DL_IPV6, IFT_IPV6, IPPROTO_ENCAP, IPPROTO_IPV6,
	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv6_v6intfid,
	    ip_ipv6_v6destintfid },
	{ DL_6TO4, IFT_6TO4, IPPROTO_ENCAP, IPPROTO_IPV6,
	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid,
	    ip_nodef_v6intfid },
	{ SUNW_DL_VNI, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    NULL, NULL, ip_nodef_v6intfid, ip_nodef_v6intfid },
	{ SUNW_DL_IPMP, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    NULL, NULL, ip_ipmp_v6intfid, ip_nodef_v6intfid },
	{ DL_OTHER, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid }
};
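
/*
 * ip_m_lookup() (later in this file) resolves a DLPI mac type such as
 * DL_ETHER to its ip_m_t entry with a linear scan of ip_m_tbl.  When a
 * mac type is not listed, callers generally fall back to the DL_OTHER
 * entry, which is why it is kept as the last row of the table.
 */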
static ill_t	ill_null;		/* Empty ILL for init. */
char	ipif_loopback_name[] = "lo0";
static char *ipv4_forward_suffix = ":ip_forwarding";
static char *ipv6_forward_suffix = ":ip6_forwarding";

/* These are used by all IP network modules. */
sin6_t	sin6_null;	/* Zero address for quick clears */
sin_t	sin_null;	/* Zero address for quick clears */

/* When set search for unused ipif_seqid */
static ipif_t	ipif_zero;

/*
 * ppa arena is created after this many
 * interfaces have been plumbed.
 */
uint_t	ill_no_arena = 12;	/* Settable in /etc/system */

/*
 * Allocate per-interface mibs.
 * Returns true if ok.  False otherwise.
 * ipsq may not yet be allocated (loopback case).
 */
static boolean_t
ill_allocate_mibs(ill_t *ill)
{
	/* Already allocated? */
	if (ill->ill_ip_mib != NULL) {
		if (ill->ill_isv6)
			ASSERT(ill->ill_icmp6_mib != NULL);
		return (B_TRUE);
	}

	ill->ill_ip_mib = kmem_zalloc(sizeof (*ill->ill_ip_mib),
	    KM_NOSLEEP);
	if (ill->ill_ip_mib == NULL) {
		return (B_FALSE);
	}

	/* Setup static information */
	SET_MIB(ill->ill_ip_mib->ipIfStatsEntrySize,
	    sizeof (mib2_ipIfStatsEntry_t));
	if (ill->ill_isv6) {
		ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6;
		SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
		    sizeof (mib2_ipv6AddrEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
		    sizeof (mib2_ipv6RouteEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
		    sizeof (mib2_ipv6NetToMediaEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
		    sizeof (ipv6_member_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
		    sizeof (ipv6_grpsrc_t));
	} else {
		ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4;
		SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
		    sizeof (mib2_ipAddrEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
		    sizeof (mib2_ipRouteEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
		    sizeof (mib2_ipNetToMediaEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
		    sizeof (ip_member_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
		    sizeof (ip_grpsrc_t));

		/*
		 * For a v4 ill, we are done at this point, because per-ill
		 * icmp mibs are only used for v6.
		 */
		return (B_TRUE);
	}

	ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib),
	    KM_NOSLEEP);
	if (ill->ill_icmp6_mib == NULL) {
		kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
		ill->ill_ip_mib = NULL;
		return (B_FALSE);
	}
	/* static icmp info */
	ill->ill_icmp6_mib->ipv6IfIcmpEntrySize =
	    sizeof (mib2_ipv6IfIcmpEntry_t);
	/*
	 * The ipIfStatsIfindex and ipv6IfIcmpIndex will be assigned later
	 * after the phyint merge occurs in ipif_set_values -> ill_glist_insert
	 * -> ill_phyint_reinit
	 */
	return (B_TRUE);
}

/*
 * Completely vaporize a lower level tap and all associated interfaces.
 * ill_delete is called only out of ip_close when the device control
 * stream is being closed.
 */
void
ill_delete(ill_t *ill)
{
	ipif_t	*ipif;
	ill_t	*prev_ill;
	ip_stack_t	*ipst = ill->ill_ipst;

	/*
	 * ill_delete may be forcibly entering the ipsq.  The previous
	 * ioctl may not have completed and may need to be aborted.
	 * ipsq_flush takes care of it.  If we don't need to enter the
	 * ipsq forcibly, the 2nd invocation of ipsq_flush in
	 * ill_delete_tail is sufficient.
	 */
	ipsq_flush(ill);

	/*
	 * Nuke all interfaces.  ipif_free will take down the interface,
	 * remove it from the list, and free the data structure.
	 * Walk down the ipif list and remove the logical interfaces
	 * first before removing the main ipif.  We can't unplumb
	 * zeroth interface first in the case of IPv6 as update_conn_ill
	 * -> ip_ll_multireq de-references ill_ipif for checking
	 * POINTOPOINT.
	 *
	 * If ill_ipif was not properly initialized (i.e., low on memory),
	 * then there are no interfaces to clean up.  In this case just
	 * clean up the ill.
	 */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		ipif_free(ipif);

	/*
	 * clean out all the nce_t entries that depend on this
	 * ill for the ill_phys_addr.
	 */
	nce_flush(ill, B_TRUE);

	/* Clean up msgs on pending upcalls for mrouted */
	reset_mrt_ill(ill);

	update_conn_ill(ill, ipst);

	/*
	 * Remove multicast references added as a result of calls to
	 * ip_join_allmulti().
	 */
	ip_purge_allmulti(ill);

	/*
	 * If the ill being deleted is under IPMP, boot it out of the illgrp.
	 */
	if (IS_UNDER_IPMP(ill))
		ipmp_ill_leave_illgrp(ill);

	/*
	 * ill_down will arrange to blow off any IRE's dependent on this
	 * ILL, and shut down fragmentation reassembly.
	 */
	ill_down(ill);

	/* Let SCTP know, so that it can remove this from its list. */
	sctp_update_ill(ill, SCTP_ILL_REMOVE);

	/*
	 * Walk all CONNs that can have a reference on an ire or nce for this
	 * ill (we actually walk all that now have stale references).
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst);

	/* With IPv6 we have dce_ifindex. Cleanup for neatness */
	if (ill->ill_isv6)
		dce_cleanup(ill->ill_phyint->phyint_ifindex, ipst);

	/*
	 * If an address on this ILL is being used as a source address then
	 * clear out the pointers in other ILLs that point to this ILL.
	 */
	rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
	if (ill->ill_usesrc_grp_next != NULL) {
		if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? */
			ill_disband_usesrc_group(ill);
		} else {	/* consumer of the usesrc ILL */
			prev_ill = ill_prev_usesrc(ill);
			prev_ill->ill_usesrc_grp_next =
			    ill->ill_usesrc_grp_next;
		}
	}
	rw_exit(&ipst->ips_ill_g_usesrc_lock);
}

static void
ipif_non_duplicate(ipif_t *ipif)
{
	ill_t *ill = ipif->ipif_ill;

	mutex_enter(&ill->ill_lock);
	if (ipif->ipif_flags & IPIF_DUPLICATE) {
		ipif->ipif_flags &= ~IPIF_DUPLICATE;
		ASSERT(ill->ill_ipif_dup_count > 0);
		ill->ill_ipif_dup_count--;
	}
	mutex_exit(&ill->ill_lock);
}

/*
 * ill_delete_tail is called from ip_modclose after all references
 * to the closing ill are gone.  The wait is done in ip_modclose.
 */
void
ill_delete_tail(ill_t *ill)
{
	mblk_t	**mpp;
	ipif_t	*ipif;
	ip_stack_t *ipst = ill->ill_ipst;

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		ipif_non_duplicate(ipif);
		(void) ipif_down_tail(ipif);
	}

	ASSERT(ill->ill_ipif_dup_count == 0);

	/*
	 * If polling capability is enabled (which signifies direct
	 * upcall into IP and driver has ill saved as a handle),
	 * we need to make sure that unbind has completed before we
	 * let the ill disappear and driver no longer has any reference
	 * to this ill.
	 */
	mutex_enter(&ill->ill_lock);
	while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS)
		cv_wait(&ill->ill_cv, &ill->ill_lock);
	mutex_exit(&ill->ill_lock);
	ASSERT(!(ill->ill_capabilities &
	    (ILL_CAPAB_DLD | ILL_CAPAB_DLD_POLL | ILL_CAPAB_DLD_DIRECT)));

	if (ill->ill_net_type != IRE_LOOPBACK)
		qprocsoff(ill->ill_rq);

	/*
	 * We do an ipsq_flush once again now.  New messages could have
	 * arrived from below (M_ERROR or M_HANGUP).  Similarly, ioctls
	 * could also have arrived if an ioctl thread had looked up the
	 * ill before we set the ILL_CONDEMNED flag, but had not yet
	 * enqueued the ioctl when we did the ipsq_flush last time.
	 */
	ipsq_flush(ill);

	/*
	 * Free capabilities.
	 */
	if (ill->ill_hcksum_capab != NULL) {
		kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t));
		ill->ill_hcksum_capab = NULL;
	}

	if (ill->ill_zerocopy_capab != NULL) {
		kmem_free(ill->ill_zerocopy_capab,
		    sizeof (ill_zerocopy_capab_t));
		ill->ill_zerocopy_capab = NULL;
	}

	if (ill->ill_lso_capab != NULL) {
		kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t));
		ill->ill_lso_capab = NULL;
	}

	if (ill->ill_dld_capab != NULL) {
		kmem_free(ill->ill_dld_capab, sizeof (ill_dld_capab_t));
		ill->ill_dld_capab = NULL;
	}

	while (ill->ill_ipif != NULL)
		ipif_free_tail(ill->ill_ipif);

	/*
	 * We have removed all references to ilm from conn and the ones joined
	 * within the kernel.
	 *
	 * We don't walk conns, mrts and ires because
	 *
	 * 1) update_conn_ill and reset_mrt_ill cleans up conns and mrts.
	 * 2) ill_down -> ill_downi walks all the ires and cleans up
	 *    ill references.
	 */

	/*
	 * If this ill is an IPMP meta-interface, blow away the illgrp.  This
	 * is safe to do because the illgrp has already been unlinked from the
	 * group by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find it.
	 */
	if (IS_IPMP(ill)) {
		ipmp_illgrp_destroy(ill->ill_grp);
		ill->ill_grp = NULL;
	}

	/*
	 * Take us out of the list of ILLs.  ill_glist_delete -> phyint_free
	 * could free the phyint.  No more reference to the phyint after this
	 * point.
	 */
	(void) ill_glist_delete(ill);

	rw_enter(&ipst->ips_ip_g_nd_lock, RW_WRITER);
	if (ill->ill_ndd_name != NULL)
		nd_unload(&ipst->ips_ip_g_nd, ill->ill_ndd_name);
	rw_exit(&ipst->ips_ip_g_nd_lock);

	if (ill->ill_frag_ptr != NULL) {
		uint_t count;

		for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
			mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock);
		}
		mi_free(ill->ill_frag_ptr);
		ill->ill_frag_ptr = NULL;
		ill->ill_frag_hash_tbl = NULL;
	}

	freemsg(ill->ill_nd_lla_mp);
	/* Free all retained control messages. */
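	/*
	 * ill_first_mp_to_free through ill_last_mp_to_free are consecutive
	 * mblk_t * members of the ill_t, so the loop below can walk them
	 * as an array, clearing the b_next/b_prev linkage of each retained
	 * chain before handing it to freemsg().
	 */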
	mpp = &ill->ill_first_mp_to_free;
	do {
		while (mpp[0]) {
			mblk_t	*mp;
			mblk_t	*mp1;

			mp = mpp[0];
			mpp[0] = mp->b_next;
			for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) {
				mp1->b_next = NULL;
				mp1->b_prev = NULL;
			}
			freemsg(mp);
		}
	} while (mpp++ != &ill->ill_last_mp_to_free);

	ill_free_mib(ill);

#ifdef DEBUG
	ill_trace_cleanup(ill);
#endif

	/* The default multicast interface might have changed */
	ire_increment_multicast_generation(ipst, ill->ill_isv6);

	/* Drop refcnt here */
	netstack_rele(ill->ill_ipst->ips_netstack);
	ill->ill_ipst = NULL;
}

static void
ill_free_mib(ill_t *ill)
{
	ip_stack_t *ipst = ill->ill_ipst;

	/*
	 * MIB statistics must not be lost, so when an interface
	 * goes away the counter values will be added to the global
	 * MIBs.
	 */
	if (ill->ill_ip_mib != NULL) {
		if (ill->ill_isv6) {
			ip_mib2_add_ip_stats(&ipst->ips_ip6_mib,
			    ill->ill_ip_mib);
		} else {
			ip_mib2_add_ip_stats(&ipst->ips_ip_mib,
			    ill->ill_ip_mib);
		}

		kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
		ill->ill_ip_mib = NULL;
	}
	if (ill->ill_icmp6_mib != NULL) {
		ip_mib2_add_icmp6_stats(&ipst->ips_icmp6_mib,
		    ill->ill_icmp6_mib);
		kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib));
		ill->ill_icmp6_mib = NULL;
	}
}

/*
 * Concatenate a physical address and a sap.
 *
 * Sap_lengths are interpreted as follows:
 *   sap_length == 0	==> no sap
 *   sap_length > 0	==> sap is at the head of the dlpi address
 *   sap_length < 0	==> sap is at the tail of the dlpi address
 */
static void
ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length,
    t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst)
{
	uint16_t sap_addr = (uint16_t)sap_src;

	if (sap_length == 0) {
		if (phys_src == NULL)
			bzero(dst, phys_length);
		else
			bcopy(phys_src, dst, phys_length);
	} else if (sap_length < 0) {
		if (phys_src == NULL)
			bzero(dst, phys_length);
		else
			bcopy(phys_src, dst, phys_length);
		bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr));
	} else {
		bcopy(&sap_addr, dst, sizeof (sap_addr));
		if (phys_src == NULL)
			bzero((char *)dst + sap_length, phys_length);
		else
			bcopy(phys_src, (char *)dst + sap_length, phys_length);
	}
}
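
/*
 * For example, an Ethernet DLS provider conventionally reports a
 * sap_length of -2, so a 6-byte MAC address with SAP 0x0800 would be
 * laid out by ill_dlur_copy_address() as <6-byte phys><2-byte sap>;
 * a provider reporting a positive sap_length would instead get
 * <sap><phys>.
 */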
/*
 * Generate a dl_unitdata_req mblk for the device and address given.
 * addr_length is the length of the physical portion of the address.
 * If addr is NULL include an all zero address of the specified length.
 * TRUE? In any case, addr_length is taken to be the entire length of the
 * dlpi address, including the absolute value of sap_length.
 */
mblk_t *
ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap,
    t_scalar_t sap_length)
{
	dl_unitdata_req_t *dlur;
	mblk_t	*mp;
	t_scalar_t	abs_sap_length;		/* absolute value */

	abs_sap_length = ABS(sap_length);
	mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length,
	    DL_UNITDATA_REQ);
	if (mp == NULL)
		return (NULL);
	dlur = (dl_unitdata_req_t *)mp->b_rptr;
	/* HACK: accommodate incompatible DLPI drivers */
	if (addr_length == 8)
		addr_length = 6;
	dlur->dl_dest_addr_length = addr_length + abs_sap_length;
	dlur->dl_dest_addr_offset = sizeof (*dlur);
	dlur->dl_priority.dl_min = 0;
	dlur->dl_priority.dl_max = 0;
	ill_dlur_copy_address(addr, addr_length, sap, sap_length,
	    (uchar_t *)&dlur[1]);
	return (mp);
}
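
/*
 * A minimal usage sketch (the values are illustrative, matching the
 * conventional Ethernet sap_length of -2): a caller holding a 6-byte
 * link-layer address could build the template with
 *
 *	mp = ill_dlur_gen(lladdr, 6, ETHERTYPE_IP, -2);
 *
 * and, on success, retain mp as the dl_unitdata_req template that is
 * sent downstream (or passed to ill_fastpath_probe() below).
 */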
/*
 * Add the pending mp to the list.  There can be only 1 pending mp
 * in the list.  Any exclusive ioctl that needs to wait for a response
 * from another module or driver needs to use this function to set
 * the ipx_pending_mp to the ioctl mblk and wait for the response from
 * the other module/driver.  This is also used while waiting for the
 * ipif/ill/ire refcnts to drop to zero in bringing down an ipif.
 */
boolean_t
ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp,
    int waitfor)
{
	ipxop_t	*ipx = ipif->ipif_ill->ill_phyint->phyint_ipsq->ipsq_xop;

	ASSERT(IAM_WRITER_IPIF(ipif));
	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
	ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
	ASSERT(ipx->ipx_pending_mp == NULL);
	/*
	 * The caller may be using a different ipif than the one passed into
	 * ipsq_current_start() (e.g., suppose an ioctl that came in on the V4
	 * ill needs to wait for the V6 ill to quiesce).  So we can't ASSERT
	 * that `ipx_current_ipif == ipif'.
	 */
	ASSERT(ipx->ipx_current_ipif != NULL);

	/*
	 * M_IOCDATA from ioctls, M_ERROR/M_HANGUP/M_PROTO/M_PCPROTO from the
	 * driver.
	 */
	ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_ERROR) ||
	    (DB_TYPE(add_mp) == M_HANGUP) || (DB_TYPE(add_mp) == M_PROTO) ||
	    (DB_TYPE(add_mp) == M_PCPROTO));

	if (connp != NULL) {
		ASSERT(MUTEX_HELD(&connp->conn_lock));
		/*
		 * Return error if the conn has started closing.  The conn
		 * could have finished cleaning up the pending mp list.
		 * If so, we should not add another mp to the list, negating
		 * the cleanup.
		 */
		if (connp->conn_state_flags & CONN_CLOSING)
			return (B_FALSE);
	}
	mutex_enter(&ipx->ipx_lock);
	ipx->ipx_pending_ipif = ipif;
	/*
	 * Note down the queue in b_queue.  This will be returned by
	 * ipsq_pending_mp_get.  Caller will then use these values to restart
	 * the processing.
	 */
	add_mp->b_next = NULL;
	add_mp->b_queue = q;
	ipx->ipx_pending_mp = add_mp;
	ipx->ipx_waitfor = waitfor;
	mutex_exit(&ipx->ipx_lock);

	if (connp != NULL)
		connp->conn_oper_pending_ill = ipif->ipif_ill;

	return (B_TRUE);
}

/*
 * Retrieve the ipx_pending_mp and return it.  There can be only 1 mp
 * queued in the list.
 */
mblk_t *
ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp)
{
	mblk_t	*curr = NULL;
	ipxop_t	*ipx = ipsq->ipsq_xop;

	*connpp = NULL;
	mutex_enter(&ipx->ipx_lock);
	if (ipx->ipx_pending_mp == NULL) {
		mutex_exit(&ipx->ipx_lock);
		return (NULL);
	}

	/* There can be only 1 such excl message */
	curr = ipx->ipx_pending_mp;
	ASSERT(curr->b_next == NULL);
	ipx->ipx_pending_ipif = NULL;
	ipx->ipx_pending_mp = NULL;
	ipx->ipx_waitfor = 0;
	mutex_exit(&ipx->ipx_lock);

	if (CONN_Q(curr->b_queue)) {
		/*
		 * This mp did a refhold on the conn, at the start of the
		 * ioctl.  So we can safely return a pointer to the conn
		 * to the caller.
		 */
		*connpp = Q_TO_CONN(curr->b_queue);
	} else {
		*connpp = NULL;
	}
	curr->b_next = NULL;
	curr->b_prev = NULL;
	return (curr);
}
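
/*
 * Taken together, ipsq_pending_mp_add() and ipsq_pending_mp_get() form a
 * simple hand-off: the writer parks the ioctl mblk along with the queue
 * it arrived on, and whoever later observes that the wait condition has
 * been satisfied (e.g. the refcnts have dropped) retrieves the mblk and
 * restarts the operation on that queue.
 */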
/*
 * Clean up the ioctl mp queued in ipx_pending_mp
 * - Called in the ill_delete path
 * - Called in the M_ERROR or M_HANGUP path on the ill.
 * - Called in the conn close path.
 *
 * Returns success on finding the pending mblk associated with the ioctl or
 * exclusive operation in progress, failure otherwise.
 */
boolean_t
ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp)
{
	mblk_t	*mp;
	ipxop_t	*ipx;
	queue_t	*q;
	ipif_t	*ipif;
	int	cmd;

	ASSERT(IAM_WRITER_ILL(ill));
	ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop;

	mutex_enter(&ipx->ipx_lock);
	mp = ipx->ipx_pending_mp;
	if (connp != NULL) {
		if (mp == NULL || mp->b_queue != CONNP_TO_WQ(connp)) {
			/*
			 * Nothing to clean since the conn that is closing
			 * does not have a matching pending mblk in
			 * ipx_pending_mp.
			 */
			mutex_exit(&ipx->ipx_lock);
			return (B_FALSE);
		}
	} else {
		/*
		 * A non-zero ill_error signifies we are called in the
		 * M_ERROR or M_HANGUP path and we need to unconditionally
		 * abort any current ioctl and do the corresponding cleanup.
		 * A zero ill_error means we are in the ill_delete path and
		 * we do the cleanup only if there is a pending mp.
		 */
		if (mp == NULL && ill->ill_error == 0) {
			mutex_exit(&ipx->ipx_lock);
			return (B_FALSE);
		}
	}

	/* Now remove from the ipx_pending_mp */
	ipx->ipx_pending_mp = NULL;
	ipif = ipx->ipx_pending_ipif;
	ipx->ipx_pending_ipif = NULL;
	ipx->ipx_waitfor = 0;
	ipx->ipx_current_ipif = NULL;
	cmd = ipx->ipx_current_ioctl;
	ipx->ipx_current_ioctl = 0;
	ipx->ipx_current_done = B_TRUE;
	mutex_exit(&ipx->ipx_lock);

	if (mp == NULL)
		return (B_FALSE);

	q = mp->b_queue;
	mp->b_next = NULL;
	mp->b_prev = NULL;
	mp->b_queue = NULL;

	if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) {
		DTRACE_PROBE4(ipif__ioctl,
		    char *, "ipsq_pending_mp_cleanup",
		    int, cmd, ill_t *, ipif == NULL ? NULL : ipif->ipif_ill,
		    ipif_t *, ipif);
		if (connp == NULL) {
			ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL);
		} else {
			ip_ioctl_finish(q, mp, ENXIO, CONN_CLOSE, NULL);
			mutex_enter(&ipif->ipif_ill->ill_lock);
			ipif->ipif_state_flags &= ~IPIF_CHANGING;
			mutex_exit(&ipif->ipif_ill->ill_lock);
		}
	} else {
		inet_freemsg(mp);
	}
	return (B_TRUE);
}

/*
 * Called in the conn close path and ill delete path
 */
static void
ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp)
{
	ipsq_t	*ipsq;
	mblk_t	*prev;
	mblk_t	*curr;
	mblk_t	*next;
	queue_t	*wq, *rq = NULL;
	mblk_t	*tmp_list = NULL;

	ASSERT(IAM_WRITER_ILL(ill));
	if (connp != NULL)
		wq = CONNP_TO_WQ(connp);
	else
		wq = ill->ill_wq;

	/*
	 * In the case of lo0 being unplumbed, ill_wq will be NULL.  Guard
	 * against this here.
	 */
	if (wq != NULL)
		rq = RD(wq);

	ipsq = ill->ill_phyint->phyint_ipsq;
	/*
	 * Clean up the ioctl mps queued in ipsq_xopq_pending_mp, if any.
	 * In the case of an ioctl from a conn, there can be only 1 mp
	 * queued on the ipsq.  If an ill is being unplumbed, flush all
	 * the messages.
	 */
	mutex_enter(&ipsq->ipsq_lock);
	for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL;
	    curr = next) {
		next = curr->b_next;
		if (connp == NULL ||
		    (curr->b_queue == wq || curr->b_queue == rq)) {
			/* Unlink the mblk from the pending mp list */
			if (prev != NULL) {
				prev->b_next = curr->b_next;
			} else {
				ASSERT(ipsq->ipsq_xopq_mphead == curr);
				ipsq->ipsq_xopq_mphead = curr->b_next;
			}
			if (ipsq->ipsq_xopq_mptail == curr)
				ipsq->ipsq_xopq_mptail = prev;
			/*
			 * Create a temporary list and release the ipsq lock.
			 * New elements are added to the head of the tmp_list.
			 */
			curr->b_next = tmp_list;
			tmp_list = curr;
		} else {
			prev = curr;
		}
	}
	mutex_exit(&ipsq->ipsq_lock);

	while (tmp_list != NULL) {
		curr = tmp_list;
		tmp_list = curr->b_next;
		curr->b_next = NULL;
		curr->b_prev = NULL;
		curr->b_queue = NULL;
		if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) {
			DTRACE_PROBE4(ipif__ioctl,
			    char *, "ipsq_xopq_mp_cleanup",
			    int, 0, ill_t *, NULL, ipif_t *, NULL);
			ip_ioctl_finish(wq, curr, ENXIO, connp != NULL ?
			    CONN_CLOSE : NO_COPYOUT, NULL);
		} else {
			/*
			 * IP-MT XXX In the case of TLI/XTI bind / optmgmt
			 * this can't be just inet_freemsg.  We have to
			 * restart it otherwise the thread will be stuck.
			 */
			inet_freemsg(curr);
		}
	}
}
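
/*
 * The temporary list built above exists so that ipsq_lock is not held
 * across the calls to ip_ioctl_finish() and inet_freemsg(): the mblks
 * are unlinked under the lock and completed only after it is dropped.
 */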
/*
 * This conn has started closing.  Cleanup any pending ioctl from this conn.
 * STREAMS ensures that there can be at most 1 active ioctl on a stream.
 */
void
conn_ioctl_cleanup(conn_t *connp)
{
	ipsq_t	*ipsq;
	ill_t	*ill;
	boolean_t refheld;

	/*
	 * Check for a queued ioctl.  If the ioctl has not yet started, the mp
	 * is pending in the list headed by ipsq_xopq_head.  If the ioctl has
	 * started the mp could be present in ipx_pending_mp.  Note that if
	 * conn_oper_pending_ill is NULL, the ioctl may still be in flight and
	 * not yet queued anywhere.  In this case, the conn close code will
	 * wait until the conn_ref is dropped.  If the stream was a tcp stream,
	 * then tcp_close will wait first until all ioctls have completed for
	 * this conn.
	 */
	mutex_enter(&connp->conn_lock);
	ill = connp->conn_oper_pending_ill;
	if (ill == NULL) {
		mutex_exit(&connp->conn_lock);
		return;
	}

	/*
	 * We may not be able to refhold the ill if the ill/ipif
	 * is changing.  But we need to make sure that the ill will
	 * not vanish.  So we just bump up the ill_waiter count.
	 */
	refheld = ill_waiter_inc(ill);
	mutex_exit(&connp->conn_lock);
	if (refheld) {
		if (ipsq_enter(ill, B_TRUE, NEW_OP)) {
			ill_waiter_dcr(ill);
			/*
			 * Check whether this ioctl has started and is
			 * pending.  If it is not found there then check
			 * whether this ioctl has not even started and is in
			 * the ipsq_xopq list.
			 */
			if (!ipsq_pending_mp_cleanup(ill, connp))
				ipsq_xopq_mp_cleanup(ill, connp);
			ipsq = ill->ill_phyint->phyint_ipsq;
			ipsq_exit(ipsq);
			return;
		}
	}

	/*
	 * The ill is also closing and we could not bump up the
	 * ill_waiter_count or we could not enter the ipsq.  Leave
	 * the cleanup to ill_delete.
	 */
	mutex_enter(&connp->conn_lock);
	while (connp->conn_oper_pending_ill != NULL)
		cv_wait(&connp->conn_refcv, &connp->conn_lock);
	mutex_exit(&connp->conn_lock);
	if (refheld)
		ill_waiter_dcr(ill);
}

/*
 * ipcl_walk function for cleaning up conn_*_ill fields.
 * Note that we leave ixa_multicast_ifindex, conn_incoming_ifindex, and
 * conn_bound_if in place.  We prefer dropping
 * packets instead of sending them out the wrong interface, or accepting
 * packets from the wrong ifindex.
 */
static void
conn_cleanup_ill(conn_t *connp, caddr_t arg)
{
	ill_t	*ill = (ill_t *)arg;

	mutex_enter(&connp->conn_lock);
	if (connp->conn_dhcpinit_ill == ill) {
		connp->conn_dhcpinit_ill = NULL;
		ASSERT(ill->ill_dhcpinit != 0);
		atomic_dec_32(&ill->ill_dhcpinit);
		ill_set_inputfn(ill);
	}
	mutex_exit(&connp->conn_lock);
}

static int
ill_down_ipifs_tail(ill_t *ill)
{
	ipif_t	*ipif;
	int	err;

	ASSERT(IAM_WRITER_ILL(ill));
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		ipif_non_duplicate(ipif);
		/*
		 * ipif_down_tail will call arp_ll_down on the last ipif
		 * and typically return EINPROGRESS when the DL_UNBIND is sent.
		 */
		if ((err = ipif_down_tail(ipif)) != 0)
			return (err);
	}
	return (0);
}

/* ARGSUSED */
void
ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
	ASSERT(IAM_WRITER_IPSQ(ipsq));
	(void) ill_down_ipifs_tail(q->q_ptr);
	freemsg(mp);
	ipsq_current_finish(ipsq);
}

/*
 * ill_down_start is called when we want to down this ill and bring it up
 * again.  It is called when we receive an M_ERROR / M_HANGUP.  In this case
 * we shut down all interfaces, but don't tear down any plumbing.
 */
boolean_t
ill_down_start(queue_t *q, mblk_t *mp)
{
	ill_t	*ill = q->q_ptr;
	ipif_t	*ipif;

	ASSERT(IAM_WRITER_ILL(ill));
	mutex_enter(&ill->ill_lock);
	ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS;
	/* no more nce addition allowed */
	mutex_exit(&ill->ill_lock);

	/*
	 * It is possible that some ioctl is already in progress while we
	 * received the M_ERROR / M_HANGUP, in which case we need to abort
	 * the ioctl.
	 * ill_down_start() is being processed as CUR_OP rather than as
	 * NEW_OP since the cause of the M_ERROR / M_HANGUP may prevent
	 * the in-progress ioctl from ever completing.
	 *
	 * The thread that started the ioctl (if any) must have returned,
	 * since we are now executing as writer.  After the 2 calls below,
	 * the state of the ipsq and the ill would reflect no trace of any
	 * pending operation.  Subsequently if there is any response to the
	 * original ioctl from the driver, it would be discarded as an
	 * unsolicited message from the driver.
	 */
	(void) ipsq_pending_mp_cleanup(ill, NULL);
	ill_dlpi_clear_deferred(ill);

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		(void) ipif_down(ipif, NULL, NULL);

	ill_down(ill);

	/*
	 * Walk all CONNs that can have a reference on an ire or nce for this
	 * ill (we actually walk all that now have stale references).
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ill->ill_ipst);

	/* With IPv6 we have dce_ifindex. Cleanup for neatness */
	if (ill->ill_isv6)
		dce_cleanup(ill->ill_phyint->phyint_ifindex, ill->ill_ipst);

	ipsq_current_start(ill->ill_phyint->phyint_ipsq, ill->ill_ipif, 0);

	/*
	 * Atomically test and add the pending mp if references are active.
	 */
	mutex_enter(&ill->ill_lock);
	if (!ill_is_quiescent(ill)) {
		/* call cannot fail since `conn_t *' argument is NULL */
		(void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
		    mp, ILL_DOWN);
		mutex_exit(&ill->ill_lock);
		return (B_FALSE);
	}
	mutex_exit(&ill->ill_lock);
	return (B_TRUE);
}

static void
ill_down(ill_t *ill)
{
	mblk_t	*mp;
	ip_stack_t	*ipst = ill->ill_ipst;

	/*
	 * Blow off any IREs dependent on this ILL.
	 * The caller needs to handle conn_ixa_cleanup.
	 */
	ill_delete_ires(ill);

	ire_walk_ill(0, 0, ill_downi, ill, ill);

	/* Remove any conn_*_ill depending on this ill */
	ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst);

	/*
	 * Free state for additional IREs.
	 */
	mutex_enter(&ill->ill_saved_ire_lock);
	mp = ill->ill_saved_ire_mp;
	ill->ill_saved_ire_mp = NULL;
	ill->ill_saved_ire_cnt = 0;
	mutex_exit(&ill->ill_saved_ire_lock);
	freemsg(mp);
}

/*
 * ire_walk routine used to delete every IRE that depends on
 * 'ill'.  (Always called as writer, and may only be called from ire_walk.)
 *
 * Note: since the routes added by the kernel are deleted separately,
 * this will only be 1) IRE_IF_CLONE and 2) manually added IRE_INTERFACE.
 *
 * We also remove references on ire_nce_cache entries that refer to the ill.
 */
void
ill_downi(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;
	nce_t	*nce;

	mutex_enter(&ire->ire_lock);
	nce = ire->ire_nce_cache;
	if (nce != NULL && nce->nce_ill == ill)
		ire->ire_nce_cache = NULL;
	else
		nce = NULL;
	mutex_exit(&ire->ire_lock);
	if (nce != NULL)
		nce_refrele(nce);
	if (ire->ire_ill == ill) {
		/*
		 * The existing interface binding for ire must be
		 * deleted before trying to bind the route to another
		 * interface.  However, since we are using the contents of
		 * the ire after ire_delete, the caller has to ensure that
		 * CONDEMNED (deleted) ire's are not removed from the list
		 * when ire_delete() returns.
		 * Currently ill_downi() is only called as part of
		 * ire_walk*() routines, so that the irb_refhold() done by
		 * ire_walk*() will ensure that ire_delete() does not lead
		 * to ire_inactive().
		 */
		ASSERT(ire->ire_bucket->irb_refcnt > 0);
		ire_delete(ire);
		if (ire->ire_unbound)
			ire_rebind(ire);
	}
}

/* Remove IRE_IF_CLONE on this ill */
void
ill_downi_if_clone(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;

	ASSERT(ire->ire_type & IRE_IF_CLONE);
	if (ire->ire_ill == ill)
		ire_delete(ire);
}

/* Consume an M_IOCACK of the fastpath probe. */
void
ill_fastpath_ack(ill_t *ill, mblk_t *mp)
{
	mblk_t	*mp1 = mp;

	/*
	 * If this was the first attempt, turn on fastpath probing.
	 */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS)
		ill->ill_dlpi_fastpath_state = IDS_OK;
	mutex_exit(&ill->ill_lock);

	/* Free the M_IOCACK mblk, hold on to the data */
	mp = mp->b_cont;
	freeb(mp1);
	if (mp == NULL)
		return;
	if (mp->b_cont != NULL)
		nce_fastpath_update(ill, mp);
	else
		ip0dbg(("ill_fastpath_ack: no b_cont\n"));
	freemsg(mp);
}

/*
 * Throw an M_IOCTL message downstream asking "do you know fastpath?"
 * The data portion of the request is a dl_unitdata_req_t template for
 * what we would send downstream in the absence of a fastpath confirmation.
 */
int
ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp)
{
	struct iocblk	*ioc;
	mblk_t	*mp;

	if (dlur_mp == NULL)
		return (EINVAL);

	mutex_enter(&ill->ill_lock);
	switch (ill->ill_dlpi_fastpath_state) {
	case IDS_FAILED:
		/*
		 * Driver NAKed the first fastpath ioctl - assume it doesn't
		 * support it.
		 */
		mutex_exit(&ill->ill_lock);
		return (ENOTSUP);
	case IDS_UNKNOWN:
		/* This is the first probe */
		ill->ill_dlpi_fastpath_state = IDS_INPROGRESS;
		break;
	default:
		break;
	}
	mutex_exit(&ill->ill_lock);

	if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL)
		return (EAGAIN);

	mp->b_cont = copyb(dlur_mp);
	if (mp->b_cont == NULL) {
		freeb(mp);
		return (EAGAIN);
	}

	ioc = (struct iocblk *)mp->b_rptr;
	ioc->ioc_count = msgdsize(mp->b_cont);

	DTRACE_PROBE3(ill__dlpi, char *, "ill_fastpath_probe",
	    char *, "DL_IOC_HDR_INFO", ill_t *, ill);
	putnext(ill->ill_wq, mp);
	return (0);
}
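
/*
 * Schematically, the fastpath exchange initiated above looks like:
 *
 *	IP -> driver:	M_IOCTL(DL_IOC_HDR_INFO), with b_cont set to the
 *			dl_unitdata_req template (see ill_dlur_gen()).
 *	driver -> IP:	M_IOCACK, with the template returned on b_cont and
 *			the prebuilt link-layer header chained behind it;
 *			this is consumed by ill_fastpath_ack() above, which
 *			hands the result to nce_fastpath_update().
 */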
void
ill_capability_probe(ill_t *ill)
{
	mblk_t	*mp;

	ASSERT(IAM_WRITER_ILL(ill));

	if (ill->ill_dlpi_capab_state != IDCS_UNKNOWN &&
	    ill->ill_dlpi_capab_state != IDCS_FAILED)
		return;

	/*
	 * We are starting a new cycle of capability negotiation.
	 * Free up the capab reset messages of any previous incarnation.
	 * We will do a fresh allocation when we get the response to our
	 * probe.
	 */
	if (ill->ill_capab_reset_mp != NULL) {
		freemsg(ill->ill_capab_reset_mp);
		ill->ill_capab_reset_mp = NULL;
	}

	ip1dbg(("ill_capability_probe: starting capability negotiation\n"));

	mp = ip_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ);
	if (mp == NULL)
		return;

	ill_capability_send(ill, mp);
	ill->ill_dlpi_capab_state = IDCS_PROBE_SENT;
}

void
ill_capability_reset(ill_t *ill, boolean_t reneg)
{
	ASSERT(IAM_WRITER_ILL(ill));

	if (ill->ill_dlpi_capab_state != IDCS_OK)
		return;

	ill->ill_dlpi_capab_state = reneg ? IDCS_RENEG : IDCS_RESET_SENT;

	ill_capability_send(ill, ill->ill_capab_reset_mp);
	ill->ill_capab_reset_mp = NULL;
	/*
	 * We turn off all capabilities except the direct-function-call
	 * capabilities (ILL_CAPAB_DLD*), which are turned off by the
	 * corresponding reset functions.
	 */
	ill->ill_capabilities &= ~(ILL_CAPAB_HCKSUM | ILL_CAPAB_ZEROCOPY);
}

static void
ill_capability_reset_alloc(ill_t *ill)
{
	mblk_t *mp;
	size_t	size = 0;
	int	err;
	dl_capability_req_t	*capb;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(ill->ill_capab_reset_mp == NULL);

	if (ILL_HCKSUM_CAPABLE(ill)) {
		size += sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_hcksum_t);
	}

	if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) {
		size += sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_zerocopy_t);
	}

	if (ill->ill_capabilities & ILL_CAPAB_DLD) {
		size += sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_dld_t);
	}

	mp = allocb_wait(size + sizeof (dl_capability_req_t), BPRI_MED,
	    STR_NOSIG, &err);

	mp->b_datap->db_type = M_PROTO;
	bzero(mp->b_rptr, size + sizeof (dl_capability_req_t));

	capb = (dl_capability_req_t *)mp->b_rptr;
	capb->dl_primitive = DL_CAPABILITY_REQ;
	capb->dl_sub_offset = sizeof (dl_capability_req_t);
	capb->dl_sub_length = size;

	mp->b_wptr += sizeof (dl_capability_req_t);

	/*
	 * Each handler fills in the corresponding dl_capability_sub_t
	 * inside the mblk.
	 */
	ill_capability_hcksum_reset_fill(ill, mp);
	ill_capability_zerocopy_reset_fill(ill, mp);
	ill_capability_dld_reset_fill(ill, mp);

	ill->ill_capab_reset_mp = mp;
}

static void
ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers)
{
	dl_capab_id_t *id_ic;
	uint_t sub_dl_cap = outers->dl_cap;
	dl_capability_sub_t *inners;
	uint8_t *capend;

	ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER);

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */

	capend = (uint8_t *)(outers + 1) + outers->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_id_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}

	id_ic = (dl_capab_id_t *)(outers + 1);

	if (outers->dl_length < sizeof (*id_ic) ||
	    (inners = &id_ic->id_subcap,
	    inners->dl_length > (outers->dl_length - sizeof (*inners)))) {
		cmn_err(CE_WARN, "ill_capability_id_ack: malformed "
		    "encapsulated capab type %d too long for mblk",
		    inners->dl_cap);
		return;
	}

	if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) {
		ip1dbg(("ill_capability_id_ack: mid token for capab type %d "
		    "isn't as expected; pass-thru module(s) detected, "
		    "discarding capability\n", inners->dl_cap));
		return;
	}

	/* Process the encapsulated sub-capability */
	ill_capability_dispatch(ill, mp, inners);
}

static void
ill_capability_dld_reset_fill(ill_t *ill, mblk_t *mp)
{
	dl_capability_sub_t *dl_subcap;

	if (!(ill->ill_capabilities & ILL_CAPAB_DLD))
		return;

	/*
	 * The dl_capab_dld_t that follows the dl_capability_sub_t is not
	 * initialized below since it is not used by DLD.
	 */
	dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
	dl_subcap->dl_cap = DL_CAPAB_DLD;
	dl_subcap->dl_length = sizeof (dl_capab_dld_t);

	mp->b_wptr += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t);
}

static void
ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp)
{
	/*
	 * If no ipif was brought up over this ill, this DL_CAPABILITY_REQ/ACK
	 * is only to get the VRRP capability.
	 *
	 * Note that we cannot check ill_ipif_up_count here since
	 * ill_ipif_up_count is only incremented when the resolver is setup.
	 * That is done asynchronously, and can race with this function.
	 */
	if (!ill->ill_dl_up) {
		if (subp->dl_cap == DL_CAPAB_VRRP)
			ill_capability_vrrp_ack(ill, mp, subp);
		return;
	}

	switch (subp->dl_cap) {
	case DL_CAPAB_HCKSUM:
		ill_capability_hcksum_ack(ill, mp, subp);
		break;
	case DL_CAPAB_ZEROCOPY:
		ill_capability_zerocopy_ack(ill, mp, subp);
		break;
	case DL_CAPAB_DLD:
		ill_capability_dld_ack(ill, mp, subp);
		break;
	case DL_CAPAB_VRRP:
		break;
	default:
		ip1dbg(("ill_capability_dispatch: unknown capab type %d\n",
		    subp->dl_cap));
	}
}

/*
 * Process the vrrp capability received from a DLS Provider.  isub must point
 * to the sub-capability (DL_CAPAB_VRRP) of a DL_CAPABILITY_ACK message.
 */
static void
ill_capability_vrrp_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
	dl_capab_vrrp_t	*vrrp;
	uint_t		sub_dl_cap = isub->dl_cap;
	uint8_t		*capend;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(sub_dl_cap == DL_CAPAB_VRRP);

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */
	capend = (uint8_t *)(isub + 1) + isub->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_vrrp_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}
	vrrp = (dl_capab_vrrp_t *)(isub + 1);

	/*
	 * Compare the IP address family and set ILLF_VRRP for the right ill.
	 */
	if ((vrrp->vrrp_af == AF_INET6 && ill->ill_isv6) ||
	    (vrrp->vrrp_af == AF_INET && !ill->ill_isv6)) {
		ill->ill_flags |= ILLF_VRRP;
	}
}

/*
 * Process a hardware checksum offload capability negotiation ack received
 * from a DLS Provider.  isub must point to the sub-capability
 * (DL_CAPAB_HCKSUM) of a DL_CAPABILITY_ACK message.
 */
static void
ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
	dl_capability_req_t	*ocap;
	dl_capab_hcksum_t	*ihck, *ohck;
	ill_hcksum_capab_t	**ill_hcksum;
	mblk_t			*nmp = NULL;
	uint_t			sub_dl_cap = isub->dl_cap;
	uint8_t			*capend;

	ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM);

	ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab;

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */
	capend = (uint8_t *)(isub + 1) + isub->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}

	/*
	 * There are two types of acks we process here:
	 * 1. acks in reply to a (first form) generic capability req
	 *    (no ENABLE flag set)
	 * 2. acks in reply to an ENABLE capability req.
	 *    (ENABLE flag set)
	 */
	ihck = (dl_capab_hcksum_t *)(isub + 1);

	if (ihck->hcksum_version != HCKSUM_VERSION_1) {
		cmn_err(CE_CONT, "ill_capability_hcksum_ack: "
		    "unsupported hardware checksum "
		    "sub-capability (version %d, expected %d)",
		    ihck->hcksum_version, HCKSUM_VERSION_1);
		return;
	}

	if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) {
		ip1dbg(("ill_capability_hcksum_ack: mid token for hardware "
		    "checksum capability isn't as expected; pass-thru "
		    "module(s) detected, discarding capability\n"));
		return;
	}

#define	CURR_HCKSUM_CAPAB				\
	(HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 |	\
	HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM)

	if ((ihck->hcksum_txflags & HCKSUM_ENABLE) &&
	    (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) {
		/* do ENABLE processing */
		if (*ill_hcksum == NULL) {
			*ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t),
			    KM_NOSLEEP);

			if (*ill_hcksum == NULL) {
				cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
				    "could not enable hcksum version %d "
				    "for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION,
				    ill->ill_name);
				return;
			}
		}

		(*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version;
		(*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags;
		ill->ill_capabilities |= ILL_CAPAB_HCKSUM;
		ip1dbg(("ill_capability_hcksum_ack: interface %s "
		    "has enabled hardware checksumming\n", ill->ill_name));
	} else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) {
		/*
		 * Enabling hardware checksum offload.
		 * Currently IP supports {TCP,UDP}/IPv4
		 * partial and full cksum offload and
		 * IPv4 header checksum offload.
		 * Allocate new mblk which will
		 * contain a new capability request
		 * to enable hardware checksum offload.
		 */
		uint_t	size;
		uchar_t	*rptr;

		size = sizeof (dl_capability_req_t) +
		    sizeof (dl_capability_sub_t) + isub->dl_length;

		if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
			cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
			    "could not enable hardware cksum for %s (ENOMEM)\n",
			    ill->ill_name);
			return;
		}

		rptr = nmp->b_rptr;
		/* initialize dl_capability_req_t */
		ocap = (dl_capability_req_t *)nmp->b_rptr;
		ocap->dl_sub_offset =
		    sizeof (dl_capability_req_t);
		ocap->dl_sub_length =
		    sizeof (dl_capability_sub_t) +
		    isub->dl_length;
		nmp->b_rptr += sizeof (dl_capability_req_t);

		/* initialize dl_capability_sub_t */
		bcopy(isub, nmp->b_rptr, sizeof (*isub));
		nmp->b_rptr += sizeof (*isub);

		/* initialize dl_capab_hcksum_t */
		ohck = (dl_capab_hcksum_t *)nmp->b_rptr;
		bcopy(ihck, ohck, sizeof (*ihck));

		nmp->b_rptr = rptr;
		ASSERT(nmp->b_wptr == (nmp->b_rptr + size));

		/* Set ENABLE flag */
		ohck->hcksum_txflags &= CURR_HCKSUM_CAPAB;
		ohck->hcksum_txflags |= HCKSUM_ENABLE;

		/*
		 * nmp points to a DL_CAPABILITY_REQ message to enable
		 * hardware checksum acceleration.
		 */
		ill_capability_send(ill, nmp);
	} else {
		ip1dbg(("ill_capability_hcksum_ack: interface %s has "
		    "advertised %x hardware checksum capability flags\n",
		    ill->ill_name, ihck->hcksum_txflags));
	}
}
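
/*
 * The resulting hardware checksum negotiation is two-phased;
 * schematically:
 *
 *	IP -> driver:	DL_CAPABILITY_REQ			(probe)
 *	driver -> IP:	DL_CAPABILITY_ACK, hcksum_txflags	(advertise)
 *	IP -> driver:	DL_CAPABILITY_REQ + HCKSUM_ENABLE	(enable)
 *	driver -> IP:	DL_CAPABILITY_ACK + HCKSUM_ENABLE
 *
 * Only the final ack causes ILL_CAPAB_HCKSUM to be set above.
 */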
not enable zerocopy for %s (ENOMEM)\n", 1817 ill->ill_name); 1818 return; 1819 } 1820 1821 rptr = nmp->b_rptr; 1822 /* initialize dl_capability_req_t */ 1823 oc = (dl_capability_req_t *)rptr; 1824 oc->dl_sub_offset = sizeof (dl_capability_req_t); 1825 oc->dl_sub_length = sizeof (dl_capability_sub_t) + 1826 sizeof (dl_capab_zerocopy_t); 1827 rptr += sizeof (dl_capability_req_t); 1828 1829 /* initialize dl_capability_sub_t */ 1830 bcopy(isub, rptr, sizeof (*isub)); 1831 rptr += sizeof (*isub); 1832 1833 /* initialize dl_capab_zerocopy_t */ 1834 zc_oc = (dl_capab_zerocopy_t *)rptr; 1835 *zc_oc = *zc_ic; 1836 1837 ip1dbg(("ill_capability_zerocopy_ack: asking interface %s " 1838 "to enable zero-copy version %d\n", ill->ill_name, 1839 ZEROCOPY_VERSION_1)); 1840 1841 /* set VMSAFE_MEM flag */ 1842 zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM; 1843 1844 /* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */ 1845 ill_capability_send(ill, nmp); 1846 } 1847 } 1848 1849 static void 1850 ill_capability_zerocopy_reset_fill(ill_t *ill, mblk_t *mp) 1851 { 1852 dl_capab_zerocopy_t *zerocopy_subcap; 1853 dl_capability_sub_t *dl_subcap; 1854 1855 if (!(ill->ill_capabilities & ILL_CAPAB_ZEROCOPY)) 1856 return; 1857 1858 ASSERT(ill->ill_zerocopy_capab != NULL); 1859 1860 dl_subcap = (dl_capability_sub_t *)mp->b_wptr; 1861 dl_subcap->dl_cap = DL_CAPAB_ZEROCOPY; 1862 dl_subcap->dl_length = sizeof (*zerocopy_subcap); 1863 1864 zerocopy_subcap = (dl_capab_zerocopy_t *)(dl_subcap + 1); 1865 zerocopy_subcap->zerocopy_version = 1866 ill->ill_zerocopy_capab->ill_zerocopy_version; 1867 zerocopy_subcap->zerocopy_flags = 0; 1868 1869 mp->b_wptr += sizeof (*dl_subcap) + sizeof (*zerocopy_subcap); 1870 } 1871 1872 /* 1873 * DLD capability 1874 * Refer to dld.h for more information regarding the purpose and usage 1875 * of this capability. 1876 */ 1877 static void 1878 ill_capability_dld_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 1879 { 1880 dl_capab_dld_t *dld_ic, dld; 1881 uint_t sub_dl_cap = isub->dl_cap; 1882 uint8_t *capend; 1883 ill_dld_capab_t *idc; 1884 1885 ASSERT(IAM_WRITER_ILL(ill)); 1886 ASSERT(sub_dl_cap == DL_CAPAB_DLD); 1887 1888 /* 1889 * Note: range checks here are not absolutely sufficient to 1890 * make us robust against malformed messages sent by drivers; 1891 * this is in keeping with the rest of IP's dlpi handling. 1892 * (Remember, it's coming from something else in the kernel 1893 * address space) 1894 */ 1895 capend = (uint8_t *)(isub + 1) + isub->dl_length; 1896 if (capend > mp->b_wptr) { 1897 cmn_err(CE_WARN, "ill_capability_dld_ack: " 1898 "malformed sub-capability too long for mblk"); 1899 return; 1900 } 1901 dld_ic = (dl_capab_dld_t *)(isub + 1); 1902 if (dld_ic->dld_version != DLD_CURRENT_VERSION) { 1903 cmn_err(CE_CONT, "ill_capability_dld_ack: " 1904 "unsupported DLD sub-capability (version %d, " 1905 "expected %d)", dld_ic->dld_version, 1906 DLD_CURRENT_VERSION); 1907 return; 1908 } 1909 if (!dlcapabcheckqid(&dld_ic->dld_mid, ill->ill_lmod_rq)) { 1910 ip1dbg(("ill_capability_dld_ack: mid token for dld " 1911 "capability isn't as expected; pass-thru module(s) " 1912 "detected, discarding capability\n")); 1913 return; 1914 } 1915 1916 /* 1917 * Copy locally to ensure alignment. 
1918 */ 1919 bcopy(dld_ic, &dld, sizeof (dl_capab_dld_t)); 1920 1921 if ((idc = ill->ill_dld_capab) == NULL) { 1922 idc = kmem_zalloc(sizeof (ill_dld_capab_t), KM_NOSLEEP); 1923 if (idc == NULL) { 1924 cmn_err(CE_WARN, "ill_capability_dld_ack: " 1925 "could not enable DLD version %d " 1926 "for %s (ENOMEM)\n", DLD_CURRENT_VERSION, 1927 ill->ill_name); 1928 return; 1929 } 1930 ill->ill_dld_capab = idc; 1931 } 1932 idc->idc_capab_df = (ip_capab_func_t)dld.dld_capab; 1933 idc->idc_capab_dh = (void *)dld.dld_capab_handle; 1934 ip1dbg(("ill_capability_dld_ack: interface %s " 1935 "supports DLD version %d\n", ill->ill_name, DLD_CURRENT_VERSION)); 1936 1937 ill_capability_dld_enable(ill); 1938 } 1939 1940 /* 1941 * Typically capability negotiation between IP and the driver happens via 1942 * DLPI message exchange. However GLD also offers a direct function call 1943 * mechanism to exchange the DLD_DIRECT_CAPAB and DLD_POLL_CAPAB capabilities. 1944 * But arbitrary function calls into IP or GLD are not permitted, since both 1945 * of them are protected by their own perimeter mechanism. The perimeter can 1946 * be viewed as a coarse lock or serialization mechanism. The hierarchy of 1947 * these perimeters is IP -> MAC. Thus, for example, to enable squeue 1948 * polling, IP needs to enter its perimeter, then call ill_mac_perim_enter 1949 * to enter the mac perimeter and then do the direct function calls into 1950 * GLD to enable squeue polling. The ring-related callbacks from the mac into 1951 * the stack to add, bind, quiesce, restart or cleanup a ring are all 1952 * protected by the mac perimeter. 1953 */ 1954 static void 1955 ill_mac_perim_enter(ill_t *ill, mac_perim_handle_t *mphp) 1956 { 1957 ill_dld_capab_t *idc = ill->ill_dld_capab; 1958 int err; 1959 1960 err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mphp, 1961 DLD_ENABLE); 1962 ASSERT(err == 0); 1963 } 1964 1965 static void 1966 ill_mac_perim_exit(ill_t *ill, mac_perim_handle_t mph) 1967 { 1968 ill_dld_capab_t *idc = ill->ill_dld_capab; 1969 int err; 1970 1971 err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mph, 1972 DLD_DISABLE); 1973 ASSERT(err == 0); 1974 } 1975 1976 boolean_t 1977 ill_mac_perim_held(ill_t *ill) 1978 { 1979 ill_dld_capab_t *idc = ill->ill_dld_capab; 1980 1981 return (idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, NULL, 1982 DLD_QUERY)); 1983 } 1984 1985 static void 1986 ill_capability_direct_enable(ill_t *ill) 1987 { 1988 ill_dld_capab_t *idc = ill->ill_dld_capab; 1989 ill_dld_direct_t *idd = &idc->idc_direct; 1990 dld_capab_direct_t direct; 1991 int rc; 1992 1993 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); 1994 1995 bzero(&direct, sizeof (direct)); 1996 direct.di_rx_cf = (uintptr_t)ip_input; 1997 direct.di_rx_ch = ill; 1998 1999 rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, &direct, 2000 DLD_ENABLE); 2001 if (rc == 0) { 2002 idd->idd_tx_df = (ip_dld_tx_t)direct.di_tx_df; 2003 idd->idd_tx_dh = direct.di_tx_dh; 2004 idd->idd_tx_cb_df = (ip_dld_callb_t)direct.di_tx_cb_df; 2005 idd->idd_tx_cb_dh = direct.di_tx_cb_dh; 2006 idd->idd_tx_fctl_df = (ip_dld_fctl_t)direct.di_tx_fctl_df; 2007 idd->idd_tx_fctl_dh = direct.di_tx_fctl_dh; 2008 ASSERT(idd->idd_tx_cb_df != NULL); 2009 ASSERT(idd->idd_tx_fctl_df != NULL); 2010 ASSERT(idd->idd_tx_df != NULL); 2011 /* 2012 * One-time registration of the flow enable callback function 2013 */ 2014 ill->ill_flownotify_mh = idd->idd_tx_cb_df(idd->idd_tx_cb_dh, 2015 ill_flow_enable, ill); 2016 ill->ill_capabilities |= ILL_CAPAB_DLD_DIRECT; 2017
DTRACE_PROBE1(direct_on, (ill_t *), ill); 2018 } else { 2019 cmn_err(CE_WARN, "warning: could not enable DIRECT " 2020 "capability, rc = %d\n", rc); 2021 DTRACE_PROBE2(direct_off, (ill_t *), ill, (int), rc); 2022 } 2023 } 2024 2025 static void 2026 ill_capability_poll_enable(ill_t *ill) 2027 { 2028 ill_dld_capab_t *idc = ill->ill_dld_capab; 2029 dld_capab_poll_t poll; 2030 int rc; 2031 2032 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); 2033 2034 bzero(&poll, sizeof (poll)); 2035 poll.poll_ring_add_cf = (uintptr_t)ip_squeue_add_ring; 2036 poll.poll_ring_remove_cf = (uintptr_t)ip_squeue_clean_ring; 2037 poll.poll_ring_quiesce_cf = (uintptr_t)ip_squeue_quiesce_ring; 2038 poll.poll_ring_restart_cf = (uintptr_t)ip_squeue_restart_ring; 2039 poll.poll_ring_bind_cf = (uintptr_t)ip_squeue_bind_ring; 2040 poll.poll_ring_ch = ill; 2041 rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, &poll, 2042 DLD_ENABLE); 2043 if (rc == 0) { 2044 ill->ill_capabilities |= ILL_CAPAB_DLD_POLL; 2045 DTRACE_PROBE1(poll_on, (ill_t *), ill); 2046 } else { 2047 ip1dbg(("warning: could not enable POLL " 2048 "capability, rc = %d\n", rc)); 2049 DTRACE_PROBE2(poll_off, (ill_t *), ill, (int), rc); 2050 } 2051 } 2052 2053 /* 2054 * Enable the LSO capability. 2055 */ 2056 static void 2057 ill_capability_lso_enable(ill_t *ill) 2058 { 2059 ill_dld_capab_t *idc = ill->ill_dld_capab; 2060 dld_capab_lso_t lso; 2061 int rc; 2062 2063 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); 2064 2065 if (ill->ill_lso_capab == NULL) { 2066 ill->ill_lso_capab = kmem_zalloc(sizeof (ill_lso_capab_t), 2067 KM_NOSLEEP); 2068 if (ill->ill_lso_capab == NULL) { 2069 cmn_err(CE_WARN, "ill_capability_lso_enable: " 2070 "could not enable LSO for %s (ENOMEM)\n", 2071 ill->ill_name); 2072 return; 2073 } 2074 } 2075 2076 bzero(&lso, sizeof (lso)); 2077 if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, &lso, 2078 DLD_ENABLE)) == 0) { 2079 ill->ill_lso_capab->ill_lso_flags = lso.lso_flags; 2080 ill->ill_lso_capab->ill_lso_max = lso.lso_max; 2081 ill->ill_capabilities |= ILL_CAPAB_LSO; 2082 ip1dbg(("ill_capability_lso_enable: interface %s " 2083 "has enabled LSO\n ", ill->ill_name)); 2084 } else { 2085 kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t)); 2086 ill->ill_lso_capab = NULL; 2087 DTRACE_PROBE2(lso_off, (ill_t *), ill, (int), rc); 2088 } 2089 } 2090 2091 static void 2092 ill_capability_dld_enable(ill_t *ill) 2093 { 2094 mac_perim_handle_t mph; 2095 2096 ASSERT(IAM_WRITER_ILL(ill)); 2097 2098 if (ill->ill_isv6) 2099 return; 2100 2101 ill_mac_perim_enter(ill, &mph); 2102 if (!ill->ill_isv6) { 2103 ill_capability_direct_enable(ill); 2104 ill_capability_poll_enable(ill); 2105 ill_capability_lso_enable(ill); 2106 } 2107 ill->ill_capabilities |= ILL_CAPAB_DLD; 2108 ill_mac_perim_exit(ill, mph); 2109 } 2110 2111 static void 2112 ill_capability_dld_disable(ill_t *ill) 2113 { 2114 ill_dld_capab_t *idc; 2115 ill_dld_direct_t *idd; 2116 mac_perim_handle_t mph; 2117 2118 ASSERT(IAM_WRITER_ILL(ill)); 2119 2120 if (!(ill->ill_capabilities & ILL_CAPAB_DLD)) 2121 return; 2122 2123 ill_mac_perim_enter(ill, &mph); 2124 2125 idc = ill->ill_dld_capab; 2126 if ((ill->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0) { 2127 /* 2128 * For performance we avoid locks in the transmit data path 2129 * and don't maintain a count of the number of threads using 2130 * direct calls. Thus some threads could be using direct 2131 * transmit calls to GLD, even after the capability mechanism 2132 * turns it off. 
This is still safe since the handles used in 2133 * the direct calls continue to be valid until the unplumb is 2134 * completed. Remove the callback that was added (one-time) at 2135 * capability enable time. 2136 */ 2137 mutex_enter(&ill->ill_lock); 2138 ill->ill_capabilities &= ~ILL_CAPAB_DLD_DIRECT; 2139 mutex_exit(&ill->ill_lock); 2140 if (ill->ill_flownotify_mh != NULL) { 2141 idd = &idc->idc_direct; 2142 idd->idd_tx_cb_df(idd->idd_tx_cb_dh, NULL, 2143 ill->ill_flownotify_mh); 2144 ill->ill_flownotify_mh = NULL; 2145 } 2146 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, 2147 NULL, DLD_DISABLE); 2148 } 2149 2150 if ((ill->ill_capabilities & ILL_CAPAB_DLD_POLL) != 0) { 2151 ill->ill_capabilities &= ~ILL_CAPAB_DLD_POLL; 2152 ip_squeue_clean_all(ill); 2153 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, 2154 NULL, DLD_DISABLE); 2155 } 2156 2157 if ((ill->ill_capabilities & ILL_CAPAB_LSO) != 0) { 2158 ASSERT(ill->ill_lso_capab != NULL); 2159 /* 2160 * Clear the capability flag for LSO but retain the 2161 * ill_lso_capab structure since it's possible that another 2162 * thread is still referring to it. The structure only gets 2163 * deallocated when we destroy the ill. 2164 */ 2165 2166 ill->ill_capabilities &= ~ILL_CAPAB_LSO; 2167 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, 2168 NULL, DLD_DISABLE); 2169 } 2170 2171 ill->ill_capabilities &= ~ILL_CAPAB_DLD; 2172 ill_mac_perim_exit(ill, mph); 2173 } 2174 2175 /* 2176 * Capability Negotiation protocol 2177 * 2178 * We don't wait for DLPI capability operations to finish during interface 2179 * bringup or teardown. Doing so would introduce more asynchrony and the 2180 * interface up/down operations will need multiple returns and restarts. 2181 * Instead the 'ipsq_current_ipif' of the ipsq is not cleared as long as 2182 * the 'ill_dlpi_deferred' chain is non-empty. This ensures that the next 2183 * exclusive operation won't start until the DLPI operations of the previous 2184 * exclusive operation complete. 2185 * 2186 * The capability state machine is shown below. 2187 * 2188 * state next state event, action 2189 * 2190 * IDCS_UNKNOWN IDCS_PROBE_SENT ill_capability_probe 2191 * IDCS_PROBE_SENT IDCS_OK ill_capability_ack 2192 * IDCS_PROBE_SENT IDCS_FAILED ip_rput_dlpi_writer (nack) 2193 * IDCS_OK IDCS_RENEG Receipt of DL_NOTE_CAPAB_RENEG 2194 * IDCS_OK IDCS_RESET_SENT ill_capability_reset 2195 * IDCS_RESET_SENT IDCS_UNKNOWN ill_capability_ack_thr 2196 * IDCS_RENEG IDCS_PROBE_SENT ill_capability_ack_thr -> 2197 * ill_capability_probe. 2198 */ 2199 2200 /* 2201 * Dedicated thread started from ip_stack_init that handles capability 2202 * disable. This thread ensures the taskq dispatch does not fail by waiting 2203 * for resources using TQ_SLEEP. The taskq mechanism is used to ensure 2204 * that direct calls to DLD are done in a cv_waitable context.
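 * The producer side of this handoff is ill_capability_ack() below: if its TQ_NOSLEEP taskq_dispatch() fails, it chains the mblk on the ips_capab_taskq_head/ips_capab_taskq_tail list and signals ips_capab_taskq_cv so that this thread can redo the dispatch with TQ_SLEEP.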
2205 */ 2206 void 2207 ill_taskq_dispatch(ip_stack_t *ipst) 2208 { 2209 callb_cpr_t cprinfo; 2210 char name[64]; 2211 mblk_t *mp; 2212 2213 (void) snprintf(name, sizeof (name), "ill_taskq_dispatch_%d", 2214 ipst->ips_netstack->netstack_stackid); 2215 CALLB_CPR_INIT(&cprinfo, &ipst->ips_capab_taskq_lock, callb_generic_cpr, 2216 name); 2217 mutex_enter(&ipst->ips_capab_taskq_lock); 2218 2219 for (;;) { 2220 mp = ipst->ips_capab_taskq_head; 2221 while (mp != NULL) { 2222 ipst->ips_capab_taskq_head = mp->b_next; 2223 if (ipst->ips_capab_taskq_head == NULL) 2224 ipst->ips_capab_taskq_tail = NULL; 2225 mutex_exit(&ipst->ips_capab_taskq_lock); 2226 mp->b_next = NULL; 2227 2228 VERIFY(taskq_dispatch(system_taskq, 2229 ill_capability_ack_thr, mp, TQ_SLEEP) != 0); 2230 mutex_enter(&ipst->ips_capab_taskq_lock); 2231 mp = ipst->ips_capab_taskq_head; 2232 } 2233 2234 if (ipst->ips_capab_taskq_quit) 2235 break; 2236 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2237 cv_wait(&ipst->ips_capab_taskq_cv, &ipst->ips_capab_taskq_lock); 2238 CALLB_CPR_SAFE_END(&cprinfo, &ipst->ips_capab_taskq_lock); 2239 } 2240 VERIFY(ipst->ips_capab_taskq_head == NULL); 2241 VERIFY(ipst->ips_capab_taskq_tail == NULL); 2242 CALLB_CPR_EXIT(&cprinfo); 2243 thread_exit(); 2244 } 2245 2246 /* 2247 * Consume a new-style hardware capabilities negotiation ack. 2248 * Called via taskq on receipt of DL_CAPABILITY_ACK. 2249 */ 2250 static void 2251 ill_capability_ack_thr(void *arg) 2252 { 2253 mblk_t *mp = arg; 2254 dl_capability_ack_t *capp; 2255 dl_capability_sub_t *subp, *endp; 2256 ill_t *ill; 2257 boolean_t reneg; 2258 2259 ill = (ill_t *)mp->b_prev; 2260 mp->b_prev = NULL; 2261 2262 VERIFY(ipsq_enter(ill, B_FALSE, CUR_OP) == B_TRUE); 2263 2264 if (ill->ill_dlpi_capab_state == IDCS_RESET_SENT || 2265 ill->ill_dlpi_capab_state == IDCS_RENEG) { 2266 /* 2267 * We have received the ack for our DL_CAPAB reset request. 2268 * There isn't anything in the message that needs processing. 2269 * All message-based capabilities have been disabled; now 2270 * do the function-call-based capability disable. 2271 */ 2272 reneg = ill->ill_dlpi_capab_state == IDCS_RENEG; 2273 ill_capability_dld_disable(ill); 2274 ill->ill_dlpi_capab_state = IDCS_UNKNOWN; 2275 if (reneg) 2276 ill_capability_probe(ill); 2277 goto done; 2278 } 2279 2280 if (ill->ill_dlpi_capab_state == IDCS_PROBE_SENT) 2281 ill->ill_dlpi_capab_state = IDCS_OK; 2282 2283 capp = (dl_capability_ack_t *)mp->b_rptr; 2284 2285 if (capp->dl_sub_length == 0) { 2286 /* no new-style capabilities */ 2287 goto done; 2288 } 2289 2290 /* Make sure the driver supplied a correct dl_sub_length */ 2291 if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) { 2292 ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, " 2293 "invalid dl_sub_length (%d)\n", capp->dl_sub_length)); 2294 goto done; 2295 } 2296 2297 #define SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset)) 2298 /* 2299 * There are sub-capabilities. Process the ones we know about. 2300 * Loop until we don't have room for another sub-cap header.
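 * Note that endp is computed as the last offset at which a complete dl_capability_sub_t header can still start within dl_sub_length, so the subp <= endp test below never examines a truncated header.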
2301 */ 2302 for (subp = SC(capp, capp->dl_sub_offset), 2303 endp = SC(subp, capp->dl_sub_length - sizeof (*subp)); 2304 subp <= endp; 2305 subp = SC(subp, sizeof (dl_capability_sub_t) + subp->dl_length)) { 2306 2307 switch (subp->dl_cap) { 2308 case DL_CAPAB_ID_WRAPPER: 2309 ill_capability_id_ack(ill, mp, subp); 2310 break; 2311 default: 2312 ill_capability_dispatch(ill, mp, subp); 2313 break; 2314 } 2315 } 2316 #undef SC 2317 done: 2318 inet_freemsg(mp); 2319 ill_capability_done(ill); 2320 ipsq_exit(ill->ill_phyint->phyint_ipsq); 2321 } 2322 2323 /* 2324 * This needs to be started in a taskq thread to provide a cv_waitable 2325 * context. 2326 */ 2327 void 2328 ill_capability_ack(ill_t *ill, mblk_t *mp) 2329 { 2330 ip_stack_t *ipst = ill->ill_ipst; 2331 2332 mp->b_prev = (mblk_t *)ill; 2333 ASSERT(mp->b_next == NULL); 2334 2335 if (taskq_dispatch(system_taskq, ill_capability_ack_thr, mp, 2336 TQ_NOSLEEP) != 0) 2337 return; 2338 2339 /* 2340 * The taskq dispatch failed. Signal the ill_taskq_dispatch thread 2341 * which will do the dispatch using TQ_SLEEP to guarantee success. 2342 */ 2343 mutex_enter(&ipst->ips_capab_taskq_lock); 2344 if (ipst->ips_capab_taskq_head == NULL) { 2345 ASSERT(ipst->ips_capab_taskq_tail == NULL); 2346 ipst->ips_capab_taskq_head = mp; 2347 } else { 2348 ipst->ips_capab_taskq_tail->b_next = mp; 2349 } 2350 ipst->ips_capab_taskq_tail = mp; 2351 2352 cv_signal(&ipst->ips_capab_taskq_cv); 2353 mutex_exit(&ipst->ips_capab_taskq_lock); 2354 } 2355 2356 /* 2357 * This routine is called to scan the fragmentation reassembly table for 2358 * the specified ILL for any packets that are starting to smell. 2359 * dead_interval is the maximum time in seconds that will be tolerated. It 2360 * will either be the value specified in ip_g_frag_timeout, or zero if the 2361 * ILL is shutting down and it is time to blow everything off. 2362 * 2363 * It returns the number of seconds (as a time_t) that the next frag timer 2364 * should be scheduled for, 0 meaning that the timer doesn't need to be 2365 * re-started. Note that the method of calculating next_timeout isn't 2366 * entirely accurate since time will flow between the time we grab 2367 * current_time and the time we schedule the next timeout. This isn't a 2368 * big problem since this is the timer for sending ICMP reassembly time 2369 * exceeded messages, and it doesn't have to be exactly accurate. 2370 * 2371 * This function is 2372 * sometimes called as writer, although this is not required. 2373 */ 2374 time_t 2375 ill_frag_timeout(ill_t *ill, time_t dead_interval) 2376 { 2377 ipfb_t *ipfb; 2378 ipfb_t *endp; 2379 ipf_t *ipf; 2380 ipf_t *ipfnext; 2381 mblk_t *mp; 2382 time_t current_time = gethrestime_sec(); 2383 time_t next_timeout = 0; 2384 uint32_t hdr_length; 2385 mblk_t *send_icmp_head; 2386 mblk_t *send_icmp_head_v6; 2387 ip_stack_t *ipst = ill->ill_ipst; 2388 ip_recv_attr_t iras; 2389 2390 bzero(&iras, sizeof (iras)); 2391 iras.ira_flags = 0; 2392 iras.ira_ill = iras.ira_rill = ill; 2393 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; 2394 iras.ira_rifindex = iras.ira_ruifindex; 2395 2396 ipfb = ill->ill_frag_hash_tbl; 2397 if (ipfb == NULL) 2398 return (0); 2399 endp = &ipfb[ILL_FRAG_HASH_TBL_COUNT]; 2400 /* Walk the frag hash table.
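 * For each bucket we expire old fragment chains while holding ipfb_lock, queueing the corresponding ICMP time exceeded errors on a local list; the errors are only sent once the bucket lock has been dropped (see below).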
*/ 2401 for (; ipfb < endp; ipfb++) { 2402 send_icmp_head = NULL; 2403 send_icmp_head_v6 = NULL; 2404 mutex_enter(&ipfb->ipfb_lock); 2405 while ((ipf = ipfb->ipfb_ipf) != NULL) { 2406 time_t frag_time = current_time - ipf->ipf_timestamp; 2407 time_t frag_timeout; 2408 2409 if (frag_time < dead_interval) { 2410 /* 2411 * There are some outstanding fragments 2412 * that will time out later. Make note of 2413 * the time so that we can reschedule the 2414 * next timeout appropriately. 2415 */ 2416 frag_timeout = dead_interval - frag_time; 2417 if (next_timeout == 0 || 2418 frag_timeout < next_timeout) { 2419 next_timeout = frag_timeout; 2420 } 2421 break; 2422 } 2423 /* Time's up. Get it out of here. */ 2424 hdr_length = ipf->ipf_nf_hdr_len; 2425 ipfnext = ipf->ipf_hash_next; 2426 if (ipfnext) 2427 ipfnext->ipf_ptphn = ipf->ipf_ptphn; 2428 *ipf->ipf_ptphn = ipfnext; 2429 mp = ipf->ipf_mp->b_cont; 2430 for (; mp; mp = mp->b_cont) { 2431 /* Extra points for neatness. */ 2432 IP_REASS_SET_START(mp, 0); 2433 IP_REASS_SET_END(mp, 0); 2434 } 2435 mp = ipf->ipf_mp->b_cont; 2436 atomic_add_32(&ill->ill_frag_count, -ipf->ipf_count); 2437 ASSERT(ipfb->ipfb_count >= ipf->ipf_count); 2438 ipfb->ipfb_count -= ipf->ipf_count; 2439 ASSERT(ipfb->ipfb_frag_pkts > 0); 2440 ipfb->ipfb_frag_pkts--; 2441 /* 2442 * We do not send any icmp message from here because 2443 * we currently are holding the ipfb_lock for this 2444 * hash chain. If we try to send any icmp messages 2445 * from here we may end up via a put back into ip 2446 * trying to get the same lock, causing a recursive 2447 * mutex panic. Instead we build a list and send all 2448 * the icmp messages after we have dropped the lock. 2449 */ 2450 if (ill->ill_isv6) { 2451 if (hdr_length != 0) { 2452 mp->b_next = send_icmp_head_v6; 2453 send_icmp_head_v6 = mp; 2454 } else { 2455 freemsg(mp); 2456 } 2457 } else { 2458 if (hdr_length != 0) { 2459 mp->b_next = send_icmp_head; 2460 send_icmp_head = mp; 2461 } else { 2462 freemsg(mp); 2463 } 2464 } 2465 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); 2466 ip_drop_input("ipIfStatsReasmFails", ipf->ipf_mp, ill); 2467 freeb(ipf->ipf_mp); 2468 } 2469 mutex_exit(&ipfb->ipfb_lock); 2470 /* 2471 * Now we need to send any icmp messages that we delayed from 2472 * above. 2473 */ 2474 while (send_icmp_head_v6 != NULL) { 2475 ip6_t *ip6h; 2476 2477 mp = send_icmp_head_v6; 2478 send_icmp_head_v6 = send_icmp_head_v6->b_next; 2479 mp->b_next = NULL; 2480 ip6h = (ip6_t *)mp->b_rptr; 2481 iras.ira_flags = 0; 2482 /* 2483 * This will result in an incorrect ALL_ZONES zoneid 2484 * for multicast packets, but we 2485 * don't send ICMP errors for those in any case. 2486 */ 2487 iras.ira_zoneid = 2488 ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst, 2489 ill, ipst); 2490 ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill); 2491 icmp_time_exceeded_v6(mp, 2492 ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE, 2493 &iras); 2494 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 2495 } 2496 while (send_icmp_head != NULL) { 2497 ipaddr_t dst; 2498 2499 mp = send_icmp_head; 2500 send_icmp_head = send_icmp_head->b_next; 2501 mp->b_next = NULL; 2502 2503 dst = ((ipha_t *)mp->b_rptr)->ipha_dst; 2504 2505 iras.ira_flags = IRAF_IS_IPV4; 2506 /* 2507 * This will result in an incorrect ALL_ZONES zoneid 2508 * for broadcast and multicast packets, but we 2509 * don't send ICMP errors for those in any case.
2510 */ 2511 iras.ira_zoneid = ipif_lookup_addr_zoneid(dst, 2512 ill, ipst); 2513 ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill); 2514 icmp_time_exceeded(mp, 2515 ICMP_REASSEMBLY_TIME_EXCEEDED, &iras); 2516 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 2517 } 2518 } 2519 /* 2520 * A non-dying ILL will use the return value to decide whether to 2521 * restart the frag timer, and for how long. 2522 */ 2523 return (next_timeout); 2524 } 2525 2526 /* 2527 * This routine is called when the approximate count of mblk memory used 2528 * for the specified ILL has exceeded max_count. 2529 */ 2530 void 2531 ill_frag_prune(ill_t *ill, uint_t max_count) 2532 { 2533 ipfb_t *ipfb; 2534 ipf_t *ipf; 2535 size_t count; 2536 clock_t now; 2537 2538 /* 2539 * If we are called again within ip_min_frag_prune_time msecs of the 2540 * last call, increment ill_frag_free_num_pkts so that we free that 2541 * many of the oldest packets from each bucket below; otherwise reset it. 2542 */ 2543 mutex_enter(&ill->ill_lock); 2544 now = ddi_get_lbolt(); 2545 if (TICK_TO_MSEC(now - ill->ill_last_frag_clean_time) <= 2546 (ip_min_frag_prune_time != 0 ? 2547 ip_min_frag_prune_time : msec_per_tick)) { 2548 2549 ill->ill_frag_free_num_pkts++; 2550 2551 } else { 2552 ill->ill_frag_free_num_pkts = 0; 2553 } 2554 ill->ill_last_frag_clean_time = now; 2555 mutex_exit(&ill->ill_lock); 2556 2557 /* 2558 * Free the ill_frag_free_num_pkts oldest packets from each bucket. 2559 */ 2560 if (ill->ill_frag_free_num_pkts != 0) { 2561 int ix; 2562 2563 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 2564 ipfb = &ill->ill_frag_hash_tbl[ix]; 2565 mutex_enter(&ipfb->ipfb_lock); 2566 if (ipfb->ipfb_ipf != NULL) { 2567 ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 2568 ill->ill_frag_free_num_pkts); 2569 } 2570 mutex_exit(&ipfb->ipfb_lock); 2571 } 2572 } 2573 /* 2574 * While the reassembly list for this ILL is too big, prune a fragment 2575 * queue by age, oldest first. 2576 */ 2577 while (ill->ill_frag_count > max_count) { 2578 int ix; 2579 ipfb_t *oipfb = NULL; 2580 uint_t oldest = UINT_MAX; 2581 2582 count = 0; 2583 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 2584 ipfb = &ill->ill_frag_hash_tbl[ix]; 2585 mutex_enter(&ipfb->ipfb_lock); 2586 ipf = ipfb->ipfb_ipf; 2587 if (ipf != NULL && ipf->ipf_gen < oldest) { 2588 oldest = ipf->ipf_gen; 2589 oipfb = ipfb; 2590 } 2591 count += ipfb->ipfb_count; 2592 mutex_exit(&ipfb->ipfb_lock); 2593 } 2594 if (oipfb == NULL) 2595 break; 2596 2597 if (count <= max_count) 2598 return; /* Somebody beat us to it, nothing to do */ 2599 mutex_enter(&oipfb->ipfb_lock); 2600 ipf = oipfb->ipfb_ipf; 2601 if (ipf != NULL) { 2602 ill_frag_free_pkts(ill, oipfb, ipf, 1); 2603 } 2604 mutex_exit(&oipfb->ipfb_lock); 2605 } 2606 } 2607 2608 /* 2609 * Free 'free_cnt' fragmented packets starting at ipf.
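 * The caller must hold the ipfb_lock of the bucket that ipf hangs off of (asserted below).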
2610 */ 2611 void 2612 ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt) 2613 { 2614 size_t count; 2615 mblk_t *mp; 2616 mblk_t *tmp; 2617 ipf_t **ipfp = ipf->ipf_ptphn; 2618 2619 ASSERT(MUTEX_HELD(&ipfb->ipfb_lock)); 2620 ASSERT(ipfp != NULL); 2621 ASSERT(ipf != NULL); 2622 2623 while (ipf != NULL && free_cnt-- > 0) { 2624 count = ipf->ipf_count; 2625 mp = ipf->ipf_mp; 2626 ipf = ipf->ipf_hash_next; 2627 for (tmp = mp; tmp; tmp = tmp->b_cont) { 2628 IP_REASS_SET_START(tmp, 0); 2629 IP_REASS_SET_END(tmp, 0); 2630 } 2631 atomic_add_32(&ill->ill_frag_count, -count); 2632 ASSERT(ipfb->ipfb_count >= count); 2633 ipfb->ipfb_count -= count; 2634 ASSERT(ipfb->ipfb_frag_pkts > 0); 2635 ipfb->ipfb_frag_pkts--; 2636 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); 2637 ip_drop_input("ipIfStatsReasmFails", mp, ill); 2638 freemsg(mp); 2639 } 2640 2641 if (ipf) 2642 ipf->ipf_ptphn = ipfp; 2643 ipfp[0] = ipf; 2644 } 2645 2646 #define ND_FORWARD_WARNING "The <if>:ip*_forwarding ndd variables are " \ 2647 "obsolete and may be removed in a future release of Solaris. Use " \ 2648 "ifconfig(1M) to manipulate the forwarding status of an interface." 2649 2650 /* 2651 * For obsolete per-interface forwarding configuration; 2652 * called in response to ND_GET. 2653 */ 2654 /* ARGSUSED */ 2655 static int 2656 nd_ill_forward_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr) 2657 { 2658 ill_t *ill = (ill_t *)cp; 2659 2660 cmn_err(CE_WARN, ND_FORWARD_WARNING); 2661 2662 (void) mi_mpprintf(mp, "%d", (ill->ill_flags & ILLF_ROUTER) != 0); 2663 return (0); 2664 } 2665 2666 /* 2667 * For obsolete per-interface forwarding configuration; 2668 * called in response to ND_SET. 2669 */ 2670 /* ARGSUSED */ 2671 static int 2672 nd_ill_forward_set(queue_t *q, mblk_t *mp, char *valuestr, caddr_t cp, 2673 cred_t *ioc_cr) 2674 { 2675 long value; 2676 int retval; 2677 ip_stack_t *ipst = CONNQ_TO_IPST(q); 2678 2679 cmn_err(CE_WARN, ND_FORWARD_WARNING); 2680 2681 if (ddi_strtol(valuestr, NULL, 10, &value) != 0 || 2682 value < 0 || value > 1) { 2683 return (EINVAL); 2684 } 2685 2686 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 2687 retval = ill_forward_set((ill_t *)cp, (value != 0)); 2688 rw_exit(&ipst->ips_ill_g_lock); 2689 return (retval); 2690 } 2691 2692 /* 2693 * Helper function for ill_forward_set(). 2694 */ 2695 static void 2696 ill_forward_set_on_ill(ill_t *ill, boolean_t enable) 2697 { 2698 ip_stack_t *ipst = ill->ill_ipst; 2699 2700 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock)); 2701 2702 ip1dbg(("ill_forward_set: %s %s forwarding on %s", 2703 (enable ? "Enabling" : "Disabling"), 2704 (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name)); 2705 mutex_enter(&ill->ill_lock); 2706 if (enable) 2707 ill->ill_flags |= ILLF_ROUTER; 2708 else 2709 ill->ill_flags &= ~ILLF_ROUTER; 2710 mutex_exit(&ill->ill_lock); 2711 if (ill->ill_isv6) 2712 ill_set_nce_router_flags(ill, enable); 2713 /* Notify routing socket listeners of this change. */ 2714 if (ill->ill_ipif != NULL) 2715 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); 2716 } 2717 2718 /* 2719 * Set an ill's ILLF_ROUTER flag appropriately. Send up RTS_IFINFO routing 2720 * socket messages for each interface whose flags we change. 
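 * For IPMP, the ILLF_ROUTER setting is kept consistent across the whole group: changing it on any interface under IPMP, or on the IPMP meta-interface itself, updates every ill in the group, as the loop below shows.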
2721 */ 2722 int 2723 ill_forward_set(ill_t *ill, boolean_t enable) 2724 { 2725 ipmp_illgrp_t *illg; 2726 ip_stack_t *ipst = ill->ill_ipst; 2727 2728 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock)); 2729 2730 if ((enable && (ill->ill_flags & ILLF_ROUTER)) || 2731 (!enable && !(ill->ill_flags & ILLF_ROUTER))) 2732 return (0); 2733 2734 if (IS_LOOPBACK(ill)) 2735 return (EINVAL); 2736 2737 if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) { 2738 /* 2739 * Update all of the interfaces in the group. 2740 */ 2741 illg = ill->ill_grp; 2742 ill = list_head(&illg->ig_if); 2743 for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) 2744 ill_forward_set_on_ill(ill, enable); 2745 2746 /* 2747 * Update the IPMP meta-interface. 2748 */ 2749 ill_forward_set_on_ill(ipmp_illgrp_ipmp_ill(illg), enable); 2750 return (0); 2751 } 2752 2753 ill_forward_set_on_ill(ill, enable); 2754 return (0); 2755 } 2756 2757 /* 2758 * Based on the ILLF_ROUTER flag of an ill, make sure all local nce's for 2759 * addresses assigned to the ill have the NCE_F_ISROUTER flag appropriately 2760 * set or clear. 2761 */ 2762 static void 2763 ill_set_nce_router_flags(ill_t *ill, boolean_t enable) 2764 { 2765 ipif_t *ipif; 2766 ncec_t *ncec; 2767 nce_t *nce; 2768 2769 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 2770 /* 2771 * NOTE: we match across the illgrp because nce's for 2772 * addresses on IPMP interfaces have an nce_ill that points to 2773 * the bound underlying ill. 2774 */ 2775 nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr); 2776 if (nce != NULL) { 2777 ncec = nce->nce_common; 2778 mutex_enter(&ncec->ncec_lock); 2779 if (enable) 2780 ncec->ncec_flags |= NCE_F_ISROUTER; 2781 else 2782 ncec->ncec_flags &= ~NCE_F_ISROUTER; 2783 mutex_exit(&ncec->ncec_lock); 2784 nce_refrele(nce); 2785 } 2786 } 2787 } 2788 2789 /* 2790 * Given an ill with a _valid_ name, add the ip_forwarding ndd variable 2791 * for this ill. Make sure the v6/v4 question has been answered about this 2792 * ill. The creation of this ndd variable is only for backwards compatibility. 2793 * The preferred way to control per-interface IP forwarding is through the 2794 * ILLF_ROUTER interface flag. 2795 */ 2796 static int 2797 ill_set_ndd_name(ill_t *ill) 2798 { 2799 char *suffix; 2800 ip_stack_t *ipst = ill->ill_ipst; 2801 2802 ASSERT(IAM_WRITER_ILL(ill)); 2803 2804 if (ill->ill_isv6) 2805 suffix = ipv6_forward_suffix; 2806 else 2807 suffix = ipv4_forward_suffix; 2808 2809 ill->ill_ndd_name = ill->ill_name + ill->ill_name_length; 2810 bcopy(ill->ill_name, ill->ill_ndd_name, ill->ill_name_length - 1); 2811 /* 2812 * Copies over the '\0'. 2813 * Note that strlen(suffix) is always bounded. 2814 */ 2815 bcopy(suffix, ill->ill_ndd_name + ill->ill_name_length - 1, 2816 strlen(suffix) + 1); 2817 2818 /* 2819 * Use of the nd table requires holding the reader lock. 2820 * Modifying the nd table thru nd_load/nd_unload requires 2821 * the writer lock. 2822 */ 2823 rw_enter(&ipst->ips_ip_g_nd_lock, RW_WRITER); 2824 if (!nd_load(&ipst->ips_ip_g_nd, ill->ill_ndd_name, nd_ill_forward_get, 2825 nd_ill_forward_set, (caddr_t)ill)) { 2826 /* 2827 * If the nd_load failed, it only meant that it could not 2828 * allocate a new bunch of room for further NDD expansion. 2829 * Because of that, the ill_ndd_name will be set to 0, and 2830 * this interface is at the mercy of the global ip_forwarding 2831 * variable. 
2832 */ 2833 rw_exit(&ipst->ips_ip_g_nd_lock); 2834 ill->ill_ndd_name = NULL; 2835 return (ENOMEM); 2836 } 2837 rw_exit(&ipst->ips_ip_g_nd_lock); 2838 return (0); 2839 } 2840 2841 /* 2842 * Initializes the context structure and returns the first ill in the list. 2843 * Currently start_list and end_list can have the following values: 2844 * MAX_G_HEADS Traverse both IPV4 and IPV6 lists. 2845 * IP_V4_G_HEAD Traverse IPV4 list only. 2846 * IP_V6_G_HEAD Traverse IPV6 list only. 2847 */ 2848 2849 /* 2850 * We don't check for CONDEMNED ills here. Caller must do that if 2851 * necessary under the ill lock. 2852 */ 2853 ill_t * 2854 ill_first(int start_list, int end_list, ill_walk_context_t *ctx, 2855 ip_stack_t *ipst) 2856 { 2857 ill_if_t *ifp; 2858 ill_t *ill; 2859 avl_tree_t *avl_tree; 2860 2861 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 2862 ASSERT(end_list <= MAX_G_HEADS && start_list >= 0); 2863 2864 /* 2865 * Set up the lists to search. 2866 */ 2867 if (end_list != MAX_G_HEADS) { 2868 ctx->ctx_current_list = start_list; 2869 ctx->ctx_last_list = end_list; 2870 } else { 2871 ctx->ctx_last_list = MAX_G_HEADS - 1; 2872 ctx->ctx_current_list = 0; 2873 } 2874 2875 while (ctx->ctx_current_list <= ctx->ctx_last_list) { 2876 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst); 2877 if (ifp != (ill_if_t *) 2878 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) { 2879 avl_tree = &ifp->illif_avl_by_ppa; 2880 ill = avl_first(avl_tree); 2881 /* 2882 * ill is guaranteed to be non-NULL, or ifp would 2883 * not have existed. 2884 */ 2885 ASSERT(ill != NULL); 2886 return (ill); 2887 } 2888 ctx->ctx_current_list++; 2889 } 2890 2891 return (NULL); 2892 } 2893 2894 /* 2895 * Returns the next ill in the list. ill_first() must have been called 2896 * before calling ill_next(), or bad things will happen. 2897 */ 2898 2899 /* 2900 * We don't check for CONDEMNED ills here. Caller must do that if 2901 * necessary under the ill lock. 2902 */ 2903 ill_t * 2904 ill_next(ill_walk_context_t *ctx, ill_t *lastill) 2905 { 2906 ill_if_t *ifp; 2907 ill_t *ill; 2908 ip_stack_t *ipst = lastill->ill_ipst; 2909 2910 ASSERT(lastill->ill_ifptr != (ill_if_t *) 2911 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)); 2912 if ((ill = avl_walk(&lastill->ill_ifptr->illif_avl_by_ppa, lastill, 2913 AVL_AFTER)) != NULL) { 2914 return (ill); 2915 } 2916 2917 /* Go to the next ill_ifp in the list. */ 2918 ifp = lastill->ill_ifptr->illif_next; 2919 2920 /* Make sure we are not at the end of the circular list. */ 2921 while (ifp == 2922 (ill_if_t *)&IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) { 2923 if (++ctx->ctx_current_list > ctx->ctx_last_list) 2924 return (NULL); 2925 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst); 2926 } 2927 2928 return (avl_first(&ifp->illif_avl_by_ppa)); 2929 } 2930 2931 /* 2932 * Check interface name for correct format: [a-zA-Z]+[a-zA-Z0-9._]*[0-9]+ 2933 * The final number (PPA) must not have any leading zeros. Upon success, a 2934 * pointer to the start of the PPA is returned; otherwise NULL is returned. 2935 */ 2936 static char * 2937 ill_get_ppa_ptr(char *name) 2938 { 2939 int namelen = strlen(name); 2940 int end_ndx = namelen - 1; 2941 int ppa_ndx, i; 2942 2943 /* 2944 * Check that the first character is [a-zA-Z], and that the last 2945 * character is [0-9]. 2946 */ 2947 if (namelen == 0 || !isalpha(name[0]) || !isdigit(name[end_ndx])) 2948 return (NULL); 2949 2950 /* 2951 * Set `ppa_ndx' to the PPA start, and check for leading zeroes.
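 * For example (illustrative names only): for "bge0" the returned pointer is at the trailing '0', while "bge007" fails the leading-zero check below and NULL is returned.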
2952 */ 2953 for (ppa_ndx = end_ndx; ppa_ndx > 0; ppa_ndx--) 2954 if (!isdigit(name[ppa_ndx - 1])) 2955 break; 2956 2957 if (name[ppa_ndx] == '0' && ppa_ndx < end_ndx) 2958 return (NULL); 2959 2960 /* 2961 * Check that the intermediate characters are [a-zA-Z0-9._] 2962 */ 2963 for (i = 1; i < ppa_ndx; i++) { 2964 if (!isalpha(name[i]) && !isdigit(name[i]) && 2965 name[i] != '.' && name[i] != '_') { 2966 return (NULL); 2967 } 2968 } 2969 2970 return (name + ppa_ndx); 2971 } 2972 2973 /* 2974 * Use the avl tree to locate the ill. 2975 */ 2976 static ill_t * 2977 ill_find_by_name(char *name, boolean_t isv6, ip_stack_t *ipst) 2978 { 2979 char *ppa_ptr = NULL; 2980 int len; 2981 uint_t ppa; 2982 ill_t *ill = NULL; 2983 ill_if_t *ifp; 2984 int list; 2985 2986 /* 2987 * Get the ppa pointer. 2988 */ 2989 if (isv6) 2990 list = IP_V6_G_HEAD; 2991 else 2992 list = IP_V4_G_HEAD; 2993 2994 if ((ppa_ptr = ill_get_ppa_ptr(name)) == NULL) { 2995 return (NULL); 2996 } 2997 2998 len = ppa_ptr - name + 1; 2999 3000 ppa = stoi(&ppa_ptr); 3001 3002 ifp = IP_VX_ILL_G_LIST(list, ipst); 3003 3004 while (ifp != (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) { 3005 /* 3006 * The match is done on len - 1 since the name is not null 3007 * terminated; it contains the ppa in addition to the interface 3008 * name. 3009 */ 3010 if ((ifp->illif_name_len == len) && 3011 bcmp(ifp->illif_name, name, len - 1) == 0) { 3012 break; 3013 } else { 3014 ifp = ifp->illif_next; 3015 } 3016 } 3017 3018 if (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) { 3019 /* 3020 * Even the interface type does not exist. 3021 */ 3022 return (NULL); 3023 } 3024 3025 ill = avl_find(&ifp->illif_avl_by_ppa, (void *) &ppa, NULL); 3026 if (ill != NULL) { 3027 mutex_enter(&ill->ill_lock); 3028 if (ILL_CAN_LOOKUP(ill)) { 3029 ill_refhold_locked(ill); 3030 mutex_exit(&ill->ill_lock); 3031 return (ill); 3032 } 3033 mutex_exit(&ill->ill_lock); 3034 } 3035 return (NULL); 3036 } 3037 3038 /* 3039 * Comparison function for use with avl. 3040 */ 3041 static int 3042 ill_compare_ppa(const void *ppa_ptr, const void *ill_ptr) 3043 { 3044 uint_t ppa; 3045 uint_t ill_ppa; 3046 3047 ASSERT(ppa_ptr != NULL && ill_ptr != NULL); 3048 3049 ppa = *((uint_t *)ppa_ptr); 3050 ill_ppa = ((const ill_t *)ill_ptr)->ill_ppa; 3051 /* 3052 * We want the ill with the lowest ppa to be on the 3053 * top. 3054 */ 3055 if (ill_ppa < ppa) 3056 return (1); 3057 if (ill_ppa > ppa) 3058 return (-1); 3059 return (0); 3060 } 3061 3062 /* 3063 * Remove an interface type from the global list. 3064 */ 3065 static void 3066 ill_delete_interface_type(ill_if_t *interface) 3067 { 3068 ASSERT(interface != NULL); 3069 ASSERT(avl_numnodes(&interface->illif_avl_by_ppa) == 0); 3070 3071 avl_destroy(&interface->illif_avl_by_ppa); 3072 if (interface->illif_ppa_arena != NULL) 3073 vmem_destroy(interface->illif_ppa_arena); 3074 3075 remque(interface); 3076 3077 mi_free(interface); 3078 } 3079 3080 /* 3081 * Remove the ill from the global list. 3082 */ 3083 static void 3084 ill_glist_delete(ill_t *ill) 3085 { 3086 ip_stack_t *ipst; 3087 phyint_t *phyi; 3088 3089 if (ill == NULL) 3090 return; 3091 ipst = ill->ill_ipst; 3092 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 3093 3094 /* 3095 * If the ill was never inserted into the AVL tree 3096 * we skip the if branch.
3097 */ 3098 if (ill->ill_ifptr != NULL) { 3099 /* 3100 * Remove from the AVL tree and free the ppa number. 3101 */ 3102 avl_remove(&ill->ill_ifptr->illif_avl_by_ppa, ill); 3103 3104 if (ill->ill_ifptr->illif_ppa_arena != NULL) { 3105 vmem_free(ill->ill_ifptr->illif_ppa_arena, 3106 (void *)(uintptr_t)(ill->ill_ppa+1), 1); 3107 } 3108 if (avl_numnodes(&ill->ill_ifptr->illif_avl_by_ppa) == 0) { 3109 ill_delete_interface_type(ill->ill_ifptr); 3110 } 3111 3112 /* 3113 * Indicate ill is no longer in the list. 3114 */ 3115 ill->ill_ifptr = NULL; 3116 ill->ill_name_length = 0; 3117 ill->ill_name[0] = '\0'; 3118 ill->ill_ppa = UINT_MAX; 3119 } 3120 3121 /* Generate one last event for this ill. */ 3122 ill_nic_event_dispatch(ill, 0, NE_UNPLUMB, ill->ill_name, 3123 ill->ill_name_length); 3124 3125 ASSERT(ill->ill_phyint != NULL); 3126 phyi = ill->ill_phyint; 3127 ill->ill_phyint = NULL; 3128 3129 /* 3130 * ill_init always allocates a phyint to store the copy 3131 * of flags relevant to the phyint. At that point in time, we could 3132 * not assign the name and hence phyint_illv4/v6 could not be 3133 * initialized. Later, in ipif_set_values, we assign the name to 3134 * the ill, at which point we assign phyint_illv4/v6. 3135 * Thus we don't rely on phyint_illv6 always being initialized. 3136 */ 3137 if (ill->ill_flags & ILLF_IPV6) 3138 phyi->phyint_illv6 = NULL; 3139 else 3140 phyi->phyint_illv4 = NULL; 3141 3142 if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) { 3143 rw_exit(&ipst->ips_ill_g_lock); 3144 return; 3145 } 3146 3147 /* 3148 * There are no ills left on this phyint; pull it out of the phyint 3149 * avl trees, and free it. 3150 */ 3151 if (phyi->phyint_ifindex > 0) { 3152 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 3153 phyi); 3154 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 3155 phyi); 3156 } 3157 rw_exit(&ipst->ips_ill_g_lock); 3158 3159 phyint_free(phyi); 3160 } 3161 3162 /* 3163 * Allocate a ppa. If the number of plumbed interfaces of this type is 3164 * less than ill_no_arena, do a linear search to find an unused ppa. 3165 * When the number goes beyond ill_no_arena, switch to using an arena. 3166 * Note: a ppa value of zero cannot be allocated from the vmem arena, as zero 3167 * is the return value for an error condition; allocation therefore starts at 3168 * one and the result is decremented by one. 3169 */ 3170 static int 3171 ill_alloc_ppa(ill_if_t *ifp, ill_t *ill) 3172 { 3173 ill_t *tmp_ill; 3174 uint_t start, end; 3175 int ppa; 3176 3177 if (ifp->illif_ppa_arena == NULL && 3178 (avl_numnodes(&ifp->illif_avl_by_ppa) + 1 > ill_no_arena)) { 3179 /* 3180 * Create an arena.
3181 */ 3182 ifp->illif_ppa_arena = vmem_create(ifp->illif_name, 3183 (void *)1, UINT_MAX - 1, 1, NULL, NULL, 3184 NULL, 0, VM_SLEEP | VMC_IDENTIFIER); 3185 /* Allocate what has already been assigned. */ 3186 for (tmp_ill = avl_first(&ifp->illif_avl_by_ppa); 3187 tmp_ill != NULL; tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, 3188 tmp_ill, AVL_AFTER)) { 3189 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena, 3190 1, /* size */ 3191 1, /* align/quantum */ 3192 0, /* phase */ 3193 0, /* nocross */ 3194 /* minaddr */ 3195 (void *)((uintptr_t)tmp_ill->ill_ppa + 1), 3196 /* maxaddr */ 3197 (void *)((uintptr_t)tmp_ill->ill_ppa + 2), 3198 VM_NOSLEEP|VM_FIRSTFIT); 3199 if (ppa == 0) { 3200 ip1dbg(("ill_alloc_ppa: ppa allocation" 3201 " failed while switching")); 3202 vmem_destroy(ifp->illif_ppa_arena); 3203 ifp->illif_ppa_arena = NULL; 3204 break; 3205 } 3206 } 3207 } 3208 3209 if (ifp->illif_ppa_arena != NULL) { 3210 if (ill->ill_ppa == UINT_MAX) { 3211 ppa = (int)(uintptr_t)vmem_alloc(ifp->illif_ppa_arena, 3212 1, VM_NOSLEEP|VM_FIRSTFIT); 3213 if (ppa == 0) 3214 return (EAGAIN); 3215 ill->ill_ppa = --ppa; 3216 } else { 3217 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena, 3218 1, /* size */ 3219 1, /* align/quantum */ 3220 0, /* phase */ 3221 0, /* nocross */ 3222 (void *)(uintptr_t)(ill->ill_ppa + 1), /* minaddr */ 3223 (void *)(uintptr_t)(ill->ill_ppa + 2), /* maxaddr */ 3224 VM_NOSLEEP|VM_FIRSTFIT); 3225 /* 3226 * Most likely the allocation failed because 3227 * the requested ppa was in use. 3228 */ 3229 if (ppa == 0) 3230 return (EEXIST); 3231 } 3232 return (0); 3233 } 3234 3235 /* 3236 * No arena is in use and not enough (>ill_no_arena) interfaces have 3237 * been plumbed to create one. Do a linear search to get an unused ppa. 3238 */ 3239 if (ill->ill_ppa == UINT_MAX) { 3240 end = UINT_MAX - 1; 3241 start = 0; 3242 } else { 3243 end = start = ill->ill_ppa; 3244 } 3245 3246 tmp_ill = avl_find(&ifp->illif_avl_by_ppa, (void *)&start, NULL); 3247 while (tmp_ill != NULL && tmp_ill->ill_ppa == start) { 3248 if (start++ >= end) { 3249 if (ill->ill_ppa == UINT_MAX) 3250 return (EAGAIN); 3251 else 3252 return (EEXIST); 3253 } 3254 tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, tmp_ill, AVL_AFTER); 3255 } 3256 ill->ill_ppa = start; 3257 return (0); 3258 } 3259 3260 /* 3261 * Insert the ill into the list of configured ills. Once this function 3262 * completes, the ill is globally visible and is available through lookups. 3263 * More precisely, this happens after the caller drops the ill_g_lock. 3264 */ 3265 static int 3266 ill_glist_insert(ill_t *ill, char *name, boolean_t isv6) 3267 { 3268 ill_if_t *ill_interface; 3269 avl_index_t where = 0; 3270 int error; 3271 int name_length; 3272 int index; 3273 boolean_t check_length = B_FALSE; 3274 ip_stack_t *ipst = ill->ill_ipst; 3275 3276 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 3277 3278 name_length = mi_strlen(name) + 1; 3279 3280 if (isv6) 3281 index = IP_V6_G_HEAD; 3282 else 3283 index = IP_V4_G_HEAD; 3284 3285 ill_interface = IP_VX_ILL_G_LIST(index, ipst); 3286 /* 3287 * Search for the interface type based on the name. 3288 */ 3289 while (ill_interface != (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) { 3290 if ((ill_interface->illif_name_len == name_length) && 3291 (strcmp(ill_interface->illif_name, name) == 0)) { 3292 break; 3293 } 3294 ill_interface = ill_interface->illif_next; 3295 } 3296 3297 /* 3298 * Interface type not found, create one.
3299 */ 3300 if (ill_interface == (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) { 3301 ill_g_head_t ghead; 3302 3303 /* 3304 * Allocate the ill_if_t structure. 3305 */ 3306 ill_interface = (ill_if_t *)mi_zalloc(sizeof (ill_if_t)); 3307 if (ill_interface == NULL) { 3308 return (ENOMEM); 3309 } 3310 3311 (void) strcpy(ill_interface->illif_name, name); 3312 ill_interface->illif_name_len = name_length; 3313 3314 avl_create(&ill_interface->illif_avl_by_ppa, 3315 ill_compare_ppa, sizeof (ill_t), 3316 offsetof(struct ill_s, ill_avl_byppa)); 3317 3318 /* 3319 * Link the structure at the back to maintain the order 3320 * of configuration for ifconfig output. 3321 */ 3322 ghead = ipst->ips_ill_g_heads[index]; 3323 insque(ill_interface, ghead.ill_g_list_tail); 3324 } 3325 3326 if (ill->ill_ppa == UINT_MAX) 3327 check_length = B_TRUE; 3328 3329 error = ill_alloc_ppa(ill_interface, ill); 3330 if (error != 0) { 3331 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0) 3332 ill_delete_interface_type(ill->ill_ifptr); 3333 return (error); 3334 } 3335 3336 /* 3337 * When the ppa is chosen by the system, check that there is 3338 * enough space to insert the ppa. If a specific ppa was passed in, 3339 * this check is not required, as the interface name passed in will 3340 * have the right ppa in it. 3341 */ 3342 if (check_length) { 3343 /* 3344 * UINT_MAX - 1 should fit in 10 chars, alloc 12 chars. 3345 */ 3346 char buf[sizeof (uint_t) * 3]; 3347 3348 /* 3349 * Convert the ppa to a string to calculate the amount of space 3350 * required for it in the name. 3351 */ 3352 numtos(ill->ill_ppa, buf); 3353 3354 /* Do we have enough space to insert the ppa? */ 3355 3356 if ((mi_strlen(name) + mi_strlen(buf) + 1) > LIFNAMSIZ) { 3357 /* Free the ppa and the interface type struct. */ 3358 if (ill_interface->illif_ppa_arena != NULL) { 3359 vmem_free(ill_interface->illif_ppa_arena, 3360 (void *)(uintptr_t)(ill->ill_ppa+1), 1); 3361 } 3362 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0) 3363 ill_delete_interface_type(ill->ill_ifptr); 3364 3365 return (EINVAL); 3366 } 3367 } 3368 3369 (void) sprintf(ill->ill_name, "%s%u", name, ill->ill_ppa); 3370 ill->ill_name_length = mi_strlen(ill->ill_name) + 1; 3371 3372 (void) avl_find(&ill_interface->illif_avl_by_ppa, &ill->ill_ppa, 3373 &where); 3374 ill->ill_ifptr = ill_interface; 3375 avl_insert(&ill_interface->illif_avl_by_ppa, ill, where); 3376 3377 ill_phyint_reinit(ill); 3378 return (0); 3379 } 3380 3381 /* Initialize the per phyint ipsq used for serialization */ 3382 static boolean_t 3383 ipsq_init(ill_t *ill, boolean_t enter) 3384 { 3385 ipsq_t *ipsq; 3386 ipxop_t *ipx; 3387 3388 if ((ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP)) == NULL) 3389 return (B_FALSE); 3390 3391 ill->ill_phyint->phyint_ipsq = ipsq; 3392 ipx = ipsq->ipsq_xop = &ipsq->ipsq_ownxop; 3393 ipx->ipx_ipsq = ipsq; 3394 ipsq->ipsq_next = ipsq; 3395 ipsq->ipsq_phyint = ill->ill_phyint; 3396 mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, 0); 3397 mutex_init(&ipx->ipx_lock, NULL, MUTEX_DEFAULT, 0); 3398 ipsq->ipsq_ipst = ill->ill_ipst; /* No netstack_hold */ 3399 if (enter) { 3400 ipx->ipx_writer = curthread; 3401 ipx->ipx_forced = B_FALSE; 3402 ipx->ipx_reentry_cnt = 1; 3403 #ifdef DEBUG 3404 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); 3405 #endif 3406 } 3407 return (B_TRUE); 3408 } 3409 3410 /* 3411 * ill_init is called by ip_open when a device control stream is opened. 3412 * It does a few initializations, and shoots a DL_INFO_REQ message down 3413 * to the driver.
The response is later picked up in ip_rput_dlpi and 3414 * used to set up default mechanisms for talking to the driver. (Always 3415 * called as writer.) 3416 * 3417 * If this function returns error, ip_open will call ip_close which in 3418 * turn will call ill_delete to clean up any memory allocated here that 3419 * is not yet freed. 3420 */ 3421 int 3422 ill_init(queue_t *q, ill_t *ill) 3423 { 3424 int count; 3425 dl_info_req_t *dlir; 3426 mblk_t *info_mp; 3427 uchar_t *frag_ptr; 3428 3429 /* 3430 * The ill is initialized to zero by mi_alloc*(). In addition 3431 * some fields already contain valid values, initialized in 3432 * ip_open(), before we reach here. 3433 */ 3434 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, 0); 3435 mutex_init(&ill->ill_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL); 3436 ill->ill_saved_ire_cnt = 0; 3437 3438 ill->ill_rq = q; 3439 ill->ill_wq = WR(q); 3440 3441 info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)), 3442 BPRI_HI); 3443 if (info_mp == NULL) 3444 return (ENOMEM); 3445 3446 /* 3447 * Allocate sufficient space to contain our fragment hash table and 3448 * the device name. 3449 */ 3450 frag_ptr = (uchar_t *)mi_zalloc(ILL_FRAG_HASH_TBL_SIZE + 3451 2 * LIFNAMSIZ + strlen(ipv6_forward_suffix)); 3452 if (frag_ptr == NULL) { 3453 freemsg(info_mp); 3454 return (ENOMEM); 3455 } 3456 ill->ill_frag_ptr = frag_ptr; 3457 ill->ill_frag_free_num_pkts = 0; 3458 ill->ill_last_frag_clean_time = 0; 3459 ill->ill_frag_hash_tbl = (ipfb_t *)frag_ptr; 3460 ill->ill_name = (char *)(frag_ptr + ILL_FRAG_HASH_TBL_SIZE); 3461 for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) { 3462 mutex_init(&ill->ill_frag_hash_tbl[count].ipfb_lock, 3463 NULL, MUTEX_DEFAULT, NULL); 3464 } 3465 3466 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t)); 3467 if (ill->ill_phyint == NULL) { 3468 freemsg(info_mp); 3469 mi_free(frag_ptr); 3470 return (ENOMEM); 3471 } 3472 3473 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0); 3474 /* 3475 * For now pretend this is a v4 ill. We need to set phyint_ill* 3476 * at this point because of the following reason. If we can't 3477 * enter the ipsq at some point and cv_wait, the writer that 3478 * wakes us up tries to locate us using the list of all phyints 3479 * in an ipsq and the ills from the phyint thru the phyint_ill*. 3480 * If we don't set it now, we risk a missed wakeup. 3481 */ 3482 ill->ill_phyint->phyint_illv4 = ill; 3483 ill->ill_ppa = UINT_MAX; 3484 list_create(&ill->ill_nce, sizeof (nce_t), offsetof(nce_t, nce_node)); 3485 3486 ill_set_inputfn(ill); 3487 3488 if (!ipsq_init(ill, B_TRUE)) { 3489 freemsg(info_mp); 3490 mi_free(frag_ptr); 3491 mi_free(ill->ill_phyint); 3492 return (ENOMEM); 3493 } 3494 3495 ill->ill_state_flags |= ILL_LL_SUBNET_PENDING; 3496 3497 /* Frag queue limit stuff */ 3498 ill->ill_frag_count = 0; 3499 ill->ill_ipf_gen = 0; 3500 3501 rw_init(&ill->ill_mcast_lock, NULL, RW_DEFAULT, NULL); 3502 mutex_init(&ill->ill_mcast_serializer, NULL, MUTEX_DEFAULT, NULL); 3503 ill->ill_global_timer = INFINITY; 3504 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0; 3505 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0; 3506 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS; 3507 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL; 3508 3509 /* 3510 * Initialize IPv6 configuration variables. The IP module is always 3511 * opened as an IPv4 module. 
Instead of tracking down the cases where 3512 * it switches to do IPv6, we'll just initialize the IPv6 configuration 3513 * here for convenience; this has no effect until the ill is set to do 3514 * IPv6. 3515 */ 3516 ill->ill_reachable_time = ND_REACHABLE_TIME; 3517 ill->ill_xmit_count = ND_MAX_MULTICAST_SOLICIT; 3518 ill->ill_max_buf = ND_MAX_Q; 3519 ill->ill_refcnt = 0; 3520 3521 /* Send down the Info Request to the driver. */ 3522 info_mp->b_datap->db_type = M_PCPROTO; 3523 dlir = (dl_info_req_t *)info_mp->b_rptr; 3524 info_mp->b_wptr = (uchar_t *)&dlir[1]; 3525 dlir->dl_primitive = DL_INFO_REQ; 3526 3527 ill->ill_dlpi_pending = DL_PRIM_INVAL; 3528 3529 qprocson(q); 3530 ill_dlpi_send(ill, info_mp); 3531 3532 return (0); 3533 } 3534 3535 /* 3536 * ill_dls_info 3537 * creates datalink socket info from the device. 3538 */ 3539 int 3540 ill_dls_info(struct sockaddr_dl *sdl, const ill_t *ill) 3541 { 3542 size_t len; 3543 3544 sdl->sdl_family = AF_LINK; 3545 sdl->sdl_index = ill_get_upper_ifindex(ill); 3546 sdl->sdl_type = ill->ill_type; 3547 ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data)); 3548 len = strlen(sdl->sdl_data); 3549 ASSERT(len < 256); 3550 sdl->sdl_nlen = (uchar_t)len; 3551 sdl->sdl_alen = ill->ill_phys_addr_length; 3552 sdl->sdl_slen = 0; 3553 if (ill->ill_phys_addr_length != 0 && ill->ill_phys_addr != NULL) 3554 bcopy(ill->ill_phys_addr, &sdl->sdl_data[len], sdl->sdl_alen); 3555 3556 return (sizeof (struct sockaddr_dl)); 3557 } 3558 3559 /* 3560 * ill_xarp_info 3561 * creates xarp info from the device. 3562 */ 3563 static int 3564 ill_xarp_info(struct sockaddr_dl *sdl, ill_t *ill) 3565 { 3566 sdl->sdl_family = AF_LINK; 3567 sdl->sdl_index = ill->ill_phyint->phyint_ifindex; 3568 sdl->sdl_type = ill->ill_type; 3569 ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data)); 3570 sdl->sdl_nlen = (uchar_t)mi_strlen(sdl->sdl_data); 3571 sdl->sdl_alen = ill->ill_phys_addr_length; 3572 sdl->sdl_slen = 0; 3573 return (sdl->sdl_nlen); 3574 } 3575 3576 static int 3577 loopback_kstat_update(kstat_t *ksp, int rw) 3578 { 3579 kstat_named_t *kn; 3580 netstackid_t stackid; 3581 netstack_t *ns; 3582 ip_stack_t *ipst; 3583 3584 if (ksp == NULL || ksp->ks_data == NULL) 3585 return (EIO); 3586 3587 if (rw == KSTAT_WRITE) 3588 return (EACCES); 3589 3590 kn = KSTAT_NAMED_PTR(ksp); 3591 stackid = (zoneid_t)(uintptr_t)ksp->ks_private; 3592 3593 ns = netstack_find_by_stackid(stackid); 3594 if (ns == NULL) 3595 return (-1); 3596 3597 ipst = ns->netstack_ip; 3598 if (ipst == NULL) { 3599 netstack_rele(ns); 3600 return (-1); 3601 } 3602 kn[0].value.ui32 = ipst->ips_loopback_packets; 3603 kn[1].value.ui32 = ipst->ips_loopback_packets; 3604 netstack_rele(ns); 3605 return (0); 3606 } 3607 3608 /* 3609 * Has ifindex been plumbed already? 3610 */ 3611 static boolean_t 3612 phyint_exists(uint_t index, ip_stack_t *ipst) 3613 { 3614 ASSERT(index != 0); 3615 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 3616 3617 return (avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 3618 &index, NULL) != NULL); 3619 } 3620 3621 /* Pick a unique ifindex */ 3622 boolean_t 3623 ip_assign_ifindex(uint_t *indexp, ip_stack_t *ipst) 3624 { 3625 uint_t starting_index; 3626 3627 if (!ipst->ips_ill_index_wrap) { 3628 *indexp = ipst->ips_ill_index++; 3629 if (ipst->ips_ill_index == 0) { 3630 /* Reached the uint_t limit; wrap next time. */ 3631 ipst->ips_ill_index_wrap = B_TRUE; 3632 } 3633 return (B_TRUE); 3634 } 3635 3636 /* 3637 * Start reusing unused indexes.
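 * We scan at most one full cycle of the index space, starting just past the last index that was handed out.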
Note that we hold the ill_g_lock 3638 * at this point and don't want to call any function that attempts 3639 * to get the lock again. 3640 */ 3641 starting_index = ipst->ips_ill_index++; 3642 for (; ipst->ips_ill_index != starting_index; ipst->ips_ill_index++) { 3643 if (ipst->ips_ill_index != 0 && 3644 !phyint_exists(ipst->ips_ill_index, ipst)) { 3645 /* Found an unused index; use it. */ 3646 *indexp = ipst->ips_ill_index; 3647 return (B_TRUE); 3648 } 3649 } 3650 3651 /* 3652 * All interface indices are in use. 3653 */ 3654 return (B_FALSE); 3655 } 3656 3657 /* 3658 * Assign a unique interface index for the phyint. 3659 */ 3660 static boolean_t 3661 phyint_assign_ifindex(phyint_t *phyi, ip_stack_t *ipst) 3662 { 3663 ASSERT(phyi->phyint_ifindex == 0); 3664 return (ip_assign_ifindex(&phyi->phyint_ifindex, ipst)); 3665 } 3666 3667 /* 3668 * Initialize the flags on `phyi' as per the provided mactype. 3669 */ 3670 static void 3671 phyint_flags_init(phyint_t *phyi, t_uscalar_t mactype) 3672 { 3673 uint64_t flags = 0; 3674 3675 /* 3676 * Initialize PHYI_RUNNING and PHYI_FAILED. For non-IPMP interfaces, 3677 * we always presume the underlying hardware is working and set 3678 * PHYI_RUNNING (if it's not, the driver will subsequently send a 3679 * DL_NOTE_LINK_DOWN message). For IPMP interfaces, at initialization 3680 * there are no active interfaces in the group so we set PHYI_FAILED. 3681 */ 3682 if (mactype == SUNW_DL_IPMP) 3683 flags |= PHYI_FAILED; 3684 else 3685 flags |= PHYI_RUNNING; 3686 3687 switch (mactype) { 3688 case SUNW_DL_VNI: 3689 flags |= PHYI_VIRTUAL; 3690 break; 3691 case SUNW_DL_IPMP: 3692 flags |= PHYI_IPMP; 3693 break; 3694 case DL_LOOP: 3695 flags |= (PHYI_LOOPBACK | PHYI_VIRTUAL); 3696 break; 3697 } 3698 3699 mutex_enter(&phyi->phyint_lock); 3700 phyi->phyint_flags |= flags; 3701 mutex_exit(&phyi->phyint_lock); 3702 } 3703 3704 /* 3705 * Return a pointer to the ill which matches the supplied name. Note that 3706 * the ill name length includes the null termination character. (May be 3707 * called as writer.) 3708 * If do_alloc is set and the interface is "lo0", it will be automatically 3709 * created. Cannot bump up the reference on condemned ills, so duplicate 3710 * detection can't be done using this function. 3711 */ 3712 ill_t * 3713 ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, 3714 boolean_t *did_alloc, ip_stack_t *ipst) 3715 { 3716 ill_t *ill; 3717 ipif_t *ipif; 3718 ipsq_t *ipsq; 3719 kstat_named_t *kn; 3720 boolean_t isloopback; 3721 in6_addr_t ov6addr; 3722 3723 isloopback = mi_strcmp(name, ipif_loopback_name) == 0; 3724 3725 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3726 ill = ill_find_by_name(name, isv6, ipst); 3727 rw_exit(&ipst->ips_ill_g_lock); 3728 if (ill != NULL) 3729 return (ill); 3730 3731 /* 3732 * Couldn't find it. Does this happen to be a lookup for the 3733 * loopback device and are we allowed to allocate it?
3734 */
3735 if (!isloopback || !do_alloc)
3736 return (NULL);
3737
3738 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
3739 ill = ill_find_by_name(name, isv6, ipst);
3740 if (ill != NULL) {
3741 rw_exit(&ipst->ips_ill_g_lock);
3742 return (ill);
3743 }
3744
3745 /* Create the loopback device on demand */
3746 ill = (ill_t *)(mi_alloc(sizeof (ill_t) +
3747 sizeof (ipif_loopback_name), BPRI_MED));
3748 if (ill == NULL)
3749 goto done;
3750
3751 *ill = ill_null;
3752 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, NULL);
3753 ill->ill_ipst = ipst;
3754 list_create(&ill->ill_nce, sizeof (nce_t), offsetof(nce_t, nce_node));
3755 netstack_hold(ipst->ips_netstack);
3756 /*
3757 * For exclusive stacks we set the zoneid to zero
3758 * to make IP operate as if in the global zone.
3759 */
3760 ill->ill_zoneid = GLOBAL_ZONEID;
3761
3762 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t));
3763 if (ill->ill_phyint == NULL)
3764 goto done;
3765
3766 if (isv6)
3767 ill->ill_phyint->phyint_illv6 = ill;
3768 else
3769 ill->ill_phyint->phyint_illv4 = ill;
3770 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0);
3771 phyint_flags_init(ill->ill_phyint, DL_LOOP);
3772
3773 if (isv6) {
3774 ill->ill_isv6 = B_TRUE;
3775 ill->ill_max_frag = ip_loopback_mtu_v6plus;
3776 } else {
3777 ill->ill_max_frag = ip_loopback_mtuplus;
3778 }
3779 if (!ill_allocate_mibs(ill))
3780 goto done;
3781 ill->ill_current_frag = ill->ill_max_frag;
3782 ill->ill_mtu = ill->ill_max_frag; /* Initial value */
3783 /*
3784 * ipif_loopback_name can't be pointed at directly because it's used
3785 * by both the IPv4 and IPv6 interfaces. When the ill is removed
3786 * from the glist, ill_glist_delete() sets the first character of
3787 * ill_name to '\0'.
3788 */
3789 ill->ill_name = (char *)ill + sizeof (*ill);
3790 (void) strcpy(ill->ill_name, ipif_loopback_name);
3791 ill->ill_name_length = sizeof (ipif_loopback_name);
3792 /* Set ill_dlpi_pending for ipsq_current_finish() to work properly */
3793 ill->ill_dlpi_pending = DL_PRIM_INVAL;
3794
3795 rw_init(&ill->ill_mcast_lock, NULL, RW_DEFAULT, NULL);
3796 mutex_init(&ill->ill_mcast_serializer, NULL, MUTEX_DEFAULT, NULL);
3797 ill->ill_global_timer = INFINITY;
3798 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0;
3799 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0;
3800 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
3801 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL;
3802
3803 /* No resolver here. */
3804 ill->ill_net_type = IRE_LOOPBACK;
3805
3806 /* Initialize the ipsq */
3807 if (!ipsq_init(ill, B_FALSE))
3808 goto done;
3809
3810 ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE, B_TRUE, NULL);
3811 if (ipif == NULL)
3812 goto done;
3813
3814 ill->ill_flags = ILLF_MULTICAST;
3815
3816 ov6addr = ipif->ipif_v6lcl_addr;
3817 /* Set up default loopback address and mask. */
3818 if (!isv6) {
3819 ipaddr_t inaddr_loopback = htonl(INADDR_LOOPBACK);
3820
3821 IN6_IPADDR_TO_V4MAPPED(inaddr_loopback, &ipif->ipif_v6lcl_addr);
3822 V4MASK_TO_V6(htonl(IN_CLASSA_NET), ipif->ipif_v6net_mask);
3823 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
3824 ipif->ipif_v6subnet);
3825 ill->ill_flags |= ILLF_IPV4;
3826 } else {
3827 ipif->ipif_v6lcl_addr = ipv6_loopback;
3828 ipif->ipif_v6net_mask = ipv6_all_ones;
3829 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
3830 ipif->ipif_v6subnet);
3831 ill->ill_flags |= ILLF_IPV6;
3832 }
3833
3834 /*
3835 * Chain us in at the end of the ill list. Hold the ill
3836 * before we make it globally visible; the reference is the one for this lookup.
3837 */
3838 ill->ill_refcnt = 0;
3839 ill_refhold(ill);
3840
3841 ill->ill_frag_count = 0;
3842 ill->ill_frag_free_num_pkts = 0;
3843 ill->ill_last_frag_clean_time = 0;
3844
3845 ipsq = ill->ill_phyint->phyint_ipsq;
3846
3847 ill_set_inputfn(ill);
3848
3849 if (ill_glist_insert(ill, "lo", isv6) != 0)
3850 cmn_err(CE_PANIC, "cannot insert loopback interface");
3851
3852 /* Let SCTP know so that it can add this to its list */
3853 sctp_update_ill(ill, SCTP_ILL_INSERT);
3854
3855 /*
3856 * We have already assigned ipif_v6lcl_addr above, but we need to
3857 * call sctp_update_ipif_addr() after SCTP_ILL_INSERT, which
3858 * must happen after ill_glist_insert() since we need the
3859 * ill_index set. Pass on ipv6_loopback as the old address.
3860 */
3861 sctp_update_ipif_addr(ipif, ov6addr);
3862
3863 ip_rts_newaddrmsg(RTM_CHGADDR, 0, ipif, RTSQ_DEFAULT);
3864
3865 /*
3866 * ill_glist_insert() -> ill_phyint_reinit() may have merged IPSQs.
3867 * If so, free our original one.
3868 */
3869 if (ipsq != ill->ill_phyint->phyint_ipsq)
3870 ipsq_delete(ipsq);
3871
3872 if (ipst->ips_loopback_ksp == NULL) {
3873 /* Export loopback interface statistics */
3874 ipst->ips_loopback_ksp = kstat_create_netstack("lo", 0,
3875 ipif_loopback_name, "net",
3876 KSTAT_TYPE_NAMED, 2, 0,
3877 ipst->ips_netstack->netstack_stackid);
3878 if (ipst->ips_loopback_ksp != NULL) {
3879 ipst->ips_loopback_ksp->ks_update =
3880 loopback_kstat_update;
3881 kn = KSTAT_NAMED_PTR(ipst->ips_loopback_ksp);
3882 kstat_named_init(&kn[0], "ipackets", KSTAT_DATA_UINT32);
3883 kstat_named_init(&kn[1], "opackets", KSTAT_DATA_UINT32);
3884 ipst->ips_loopback_ksp->ks_private =
3885 (void *)(uintptr_t)ipst->ips_netstack->
3886 netstack_stackid;
3887 kstat_install(ipst->ips_loopback_ksp);
3888 }
3889 }
3890
3891 *did_alloc = B_TRUE;
3892 rw_exit(&ipst->ips_ill_g_lock);
3893 ill_nic_event_dispatch(ill, MAP_IPIF_ID(ill->ill_ipif->ipif_id),
3894 NE_PLUMB, ill->ill_name, ill->ill_name_length);
3895 return (ill);
3896 done:
3897 if (ill != NULL) {
3898 if (ill->ill_phyint != NULL) {
3899 ipsq = ill->ill_phyint->phyint_ipsq;
3900 if (ipsq != NULL) {
3901 ipsq->ipsq_phyint = NULL;
3902 ipsq_delete(ipsq);
3903 }
3904 mi_free(ill->ill_phyint);
3905 }
3906 ill_free_mib(ill);
3907 if (ill->ill_ipst != NULL)
3908 netstack_rele(ill->ill_ipst->ips_netstack);
3909 mi_free(ill);
3910 }
3911 rw_exit(&ipst->ips_ill_g_lock);
3912 return (NULL);
3913 }
3914
3915 /*
3916 * For IPP calls - use the ip_stack_t for global stack.
3917 */
3918 ill_t *
3919 ill_lookup_on_ifindex_global_instance(uint_t index, boolean_t isv6)
3920 {
3921 ip_stack_t *ipst;
3922 ill_t *ill;
3923
3924 ipst = netstack_find_by_stackid(GLOBAL_NETSTACKID)->netstack_ip;
3925 if (ipst == NULL) {
3926 cmn_err(CE_WARN, "No ip_stack_t for zoneid zero!\n");
3927 return (NULL);
3928 }
3929
3930 ill = ill_lookup_on_ifindex(index, isv6, ipst);
3931 netstack_rele(ipst->ips_netstack);
3932 return (ill);
3933 }
3934
3935 /*
3936 * Return a pointer to the ill which matches the index and IP version type.
3937 */
3938 ill_t *
3939 ill_lookup_on_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst)
3940 {
3941 ill_t *ill;
3942 phyint_t *phyi;
3943
3944 /*
3945 * Indexes are stored in the phyint - a common structure
3946 * to both IPv4 and IPv6.
3947 */
3948 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3949 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
3950 (void *) &index, NULL);
3951 if (phyi != NULL) {
3952 ill = isv6 ?
phyi->phyint_illv6: phyi->phyint_illv4;
3953 if (ill != NULL) {
3954 mutex_enter(&ill->ill_lock);
3955 if (!ILL_IS_CONDEMNED(ill)) {
3956 ill_refhold_locked(ill);
3957 mutex_exit(&ill->ill_lock);
3958 rw_exit(&ipst->ips_ill_g_lock);
3959 return (ill);
3960 }
3961 mutex_exit(&ill->ill_lock);
3962 }
3963 }
3964 rw_exit(&ipst->ips_ill_g_lock);
3965 return (NULL);
3966 }
3967
3968 /*
3969 * Verify whether or not an interface index is valid for the specified zoneid
3970 * to transmit packets.
3971 * It can be zero (meaning "reset") or an interface index assigned
3972 * to a non-VNI interface. (We don't use VNI interfaces to send packets.)
3973 */
3974 boolean_t
3975 ip_xmit_ifindex_valid(uint_t ifindex, zoneid_t zoneid, boolean_t isv6,
3976 ip_stack_t *ipst)
3977 {
3978 ill_t *ill;
3979
3980 if (ifindex == 0)
3981 return (B_TRUE);
3982
3983 ill = ill_lookup_on_ifindex_zoneid(ifindex, zoneid, isv6, ipst);
3984 if (ill == NULL)
3985 return (B_FALSE);
3986 if (IS_VNI(ill)) {
3987 ill_refrele(ill);
3988 return (B_FALSE);
3989 }
3990 ill_refrele(ill);
3991 return (B_TRUE);
3992 }
3993
3994 /*
3995 * Return the ifindex next in sequence after the passed in ifindex.
3996 * If there is no next ifindex for the given protocol, return 0.
3997 */
3998 uint_t
3999 ill_get_next_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst)
4000 {
4001 phyint_t *phyi;
4002 phyint_t *phyi_initial;
4003 uint_t ifindex;
4004
4005 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
4006
4007 if (index == 0) {
4008 phyi = avl_first(
4009 &ipst->ips_phyint_g_list->phyint_list_avl_by_index);
4010 } else {
4011 phyi = phyi_initial = avl_find(
4012 &ipst->ips_phyint_g_list->phyint_list_avl_by_index,
4013 (void *) &index, NULL);
4014 }
4015
4016 for (; phyi != NULL;
4017 phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
4018 phyi, AVL_AFTER)) {
4019 /*
4020 * If we're not returning the first interface in the tree
4021 * and we still haven't moved past the phyint_t that
4022 * corresponds to index, avl_walk needs to be called again.
4023 */
4024 if (!((index != 0) && (phyi == phyi_initial))) {
4025 if (isv6) {
4026 if ((phyi->phyint_illv6) &&
4027 ILL_CAN_LOOKUP(phyi->phyint_illv6) &&
4028 (phyi->phyint_illv6->ill_isv6 == 1))
4029 break;
4030 } else {
4031 if ((phyi->phyint_illv4) &&
4032 ILL_CAN_LOOKUP(phyi->phyint_illv4) &&
4033 (phyi->phyint_illv4->ill_isv6 == 0))
4034 break;
4035 }
4036 }
4037 }
4038
4039 rw_exit(&ipst->ips_ill_g_lock);
4040
4041 if (phyi != NULL)
4042 ifindex = phyi->phyint_ifindex;
4043 else
4044 ifindex = 0;
4045
4046 return (ifindex);
4047 }
4048
4049 /*
4050 * Return the ifindex for the named interface.
4051 * If there is no such interface, return 0.
4052 */
4053 uint_t
4054 ill_get_ifindex_by_name(char *name, ip_stack_t *ipst)
4055 {
4056 phyint_t *phyi;
4057 avl_index_t where = 0;
4058 uint_t ifindex;
4059
4060 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
4061
4062 if ((phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
4063 name, &where)) == NULL) {
4064 rw_exit(&ipst->ips_ill_g_lock);
4065 return (0);
4066 }
4067
4068 ifindex = phyi->phyint_ifindex;
4069
4070 rw_exit(&ipst->ips_ill_g_lock);
4071
4072 return (ifindex);
4073 }
4074
4075 /*
4076 * Return the ifindex to be used by upper layer protocols, for instance
4077 * for IPV6_RECVPKTINFO. For IPMP, this is the ifindex of the IPMP
meta-interface (the upper ill).
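*
* Illustrative use (a sketch, not code from this file): a consumer filling
* in ancillary data for IPV6_RECVPKTINFO might do
*
*	pktinfo->ipi6_ifindex = ill_get_upper_ifindex(ill);
*
* so that applications always see the IPMP meta-interface index rather
* than that of an underlying interface.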
4078 */
4079 uint_t
4080 ill_get_upper_ifindex(const ill_t *ill)
4081 {
4082 if (IS_UNDER_IPMP(ill))
4083 return (ipmp_ill_get_ipmp_ifindex(ill));
4084 else
4085 return (ill->ill_phyint->phyint_ifindex);
4086 }
4087
4088
4089 /*
4090 * Obtain a reference to the ill. The ill_refcnt is a dynamic refcnt
4091 * that gives a running thread a reference to the ill. This reference must be
4092 * released by the thread when it is done accessing the ill and related
4093 * objects. ill_refcnt cannot be used to account for static references
4094 * such as other structures pointing to an ill. Callers must generally
4095 * check whether an ill can be refheld by using the ILL_CAN_LOOKUP macro
4096 * or be sure that the ill is not being deleted or changing state before
4097 * calling the refhold functions. A non-zero ill_refcnt ensures that the
4098 * ill won't change any of its critical state such as address, netmask etc.
4099 */
4100 void
4101 ill_refhold(ill_t *ill)
4102 {
4103 mutex_enter(&ill->ill_lock);
4104 ill->ill_refcnt++;
4105 ILL_TRACE_REF(ill);
4106 mutex_exit(&ill->ill_lock);
4107 }
4108
4109 void
4110 ill_refhold_locked(ill_t *ill)
4111 {
4112 ASSERT(MUTEX_HELD(&ill->ill_lock));
4113 ill->ill_refcnt++;
4114 ILL_TRACE_REF(ill);
4115 }
4116
4117 /* Returns true if we managed to get a refhold */
4118 boolean_t
4119 ill_check_and_refhold(ill_t *ill)
4120 {
4121 mutex_enter(&ill->ill_lock);
4122 if (!ILL_IS_CONDEMNED(ill)) {
4123 ill_refhold_locked(ill);
4124 mutex_exit(&ill->ill_lock);
4125 return (B_TRUE);
4126 }
4127 mutex_exit(&ill->ill_lock);
4128 return (B_FALSE);
4129 }
4130
4131 /*
4132 * Must not be called while holding any locks. Otherwise if this is
4133 * the last reference to be released, there is a chance of recursive mutex
4134 * panic due to ill_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying
4135 * to restart an ioctl.
4136 */
4137 void
4138 ill_refrele(ill_t *ill)
4139 {
4140 mutex_enter(&ill->ill_lock);
4141 ASSERT(ill->ill_refcnt != 0);
4142 ill->ill_refcnt--;
4143 ILL_UNTRACE_REF(ill);
4144 if (ill->ill_refcnt != 0) {
4145 /* Every ire pointing to the ill adds 1 to ill_refcnt */
4146 mutex_exit(&ill->ill_lock);
4147 return;
4148 }
4149
4150 /* Drops the ill_lock */
4151 ipif_ill_refrele_tail(ill);
4152 }
4153
4154 /*
4155 * Obtain a weak reference count on the ill. This reference ensures the
4156 * ill won't be freed, but the ill may change any of its critical state
4157 * such as netmask, address etc. Returns B_FALSE if the ill has started
4158 * closing.
4159 */
4160 boolean_t
4161 ill_waiter_inc(ill_t *ill)
4162 {
4163 mutex_enter(&ill->ill_lock);
4164 if (ill->ill_state_flags & ILL_CONDEMNED) {
4165 mutex_exit(&ill->ill_lock);
4166 return (B_FALSE);
4167 }
4168 ill->ill_waiters++;
4169 mutex_exit(&ill->ill_lock);
4170 return (B_TRUE);
4171 }
4172
4173 void
4174 ill_waiter_dcr(ill_t *ill)
4175 {
4176 mutex_enter(&ill->ill_lock);
4177 ill->ill_waiters--;
4178 if (ill->ill_waiters == 0)
4179 cv_broadcast(&ill->ill_cv);
4180 mutex_exit(&ill->ill_lock);
4181 }
4182
4183 /*
4184 * ip_ll_subnet_defaults is called when we get the DL_INFO_ACK back from the
4185 * driver. We construct best guess defaults for lower level information that
4186 * we need. If an interface is brought up without injection of any overriding
4187 * information from outside, we have to be ready to go with these defaults.
4188 * When we get the first DL_INFO_ACK (from ip_open() sending a DL_INFO_REQ)
4189 * we primarily want the dl_provider_style.
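* (DL_STYLE2 providers require an explicit DL_ATTACH_REQ to select a PPA
* before they can be bound; that is what ill_needs_attach below records.)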
4190 * The subsequent DL_INFO_ACK is received after doing a DL_ATTACH and DL_BIND,
4191 * at which point we assume the other part of the information is valid.
4192 */
4193 void
4194 ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp)
4195 {
4196 uchar_t *brdcst_addr;
4197 uint_t brdcst_addr_length, phys_addr_length;
4198 t_scalar_t sap_length;
4199 dl_info_ack_t *dlia;
4200 ip_m_t *ipm;
4201 dl_qos_cl_sel1_t *sel1;
4202 int min_mtu;
4203
4204 ASSERT(IAM_WRITER_ILL(ill));
4205
4206 /*
4207 * Until the ill is fully up, it is not globally visible.
4208 * So no need for a lock.
4209 */
4210 dlia = (dl_info_ack_t *)mp->b_rptr;
4211 ill->ill_mactype = dlia->dl_mac_type;
4212
4213 ipm = ip_m_lookup(dlia->dl_mac_type);
4214 if (ipm == NULL) {
4215 ipm = ip_m_lookup(DL_OTHER);
4216 ASSERT(ipm != NULL);
4217 }
4218 ill->ill_media = ipm;
4219
4220 /*
4221 * When the new DLPI stuff is ready we'll pull lengths
4222 * from dlia.
4223 */
4224 if (dlia->dl_version == DL_VERSION_2) {
4225 brdcst_addr_length = dlia->dl_brdcst_addr_length;
4226 brdcst_addr = mi_offset_param(mp, dlia->dl_brdcst_addr_offset,
4227 brdcst_addr_length);
4228 if (brdcst_addr == NULL) {
4229 brdcst_addr_length = 0;
4230 }
4231 sap_length = dlia->dl_sap_length;
4232 phys_addr_length = dlia->dl_addr_length - ABS(sap_length);
4233 ip1dbg(("ip: bcast_len %d, sap_len %d, phys_len %d\n",
4234 brdcst_addr_length, sap_length, phys_addr_length));
4235 } else {
4236 brdcst_addr_length = 6;
4237 brdcst_addr = ip_six_byte_all_ones;
4238 sap_length = -2;
4239 phys_addr_length = brdcst_addr_length;
4240 }
4241
4242 ill->ill_bcast_addr_length = brdcst_addr_length;
4243 ill->ill_phys_addr_length = phys_addr_length;
4244 ill->ill_sap_length = sap_length;
4245
4246 /*
4247 * Synthetic DLPI types such as SUNW_DL_IPMP specify a zero SDU,
4248 * but we must ensure a minimum IP MTU is used since other bits of
4249 * IP will fly apart otherwise.
4250 */
4251 min_mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU;
4252 ill->ill_max_frag = MAX(min_mtu, dlia->dl_max_sdu);
4253 ill->ill_current_frag = ill->ill_max_frag;
4254 ill->ill_mtu = ill->ill_max_frag;
4255
4256 ill->ill_type = ipm->ip_m_type;
4257
4258 if (!ill->ill_dlpi_style_set) {
4259 if (dlia->dl_provider_style == DL_STYLE2)
4260 ill->ill_needs_attach = 1;
4261
4262 phyint_flags_init(ill->ill_phyint, ill->ill_mactype);
4263
4264 /*
4265 * Allocate the first ipif on this ill. We don't delay it
4266 * further as ioctl handling assumes at least one ipif exists.
4267 *
4268 * At this point we don't know whether the ill is v4 or v6.
4269 * We will know this when the SIOCSLIFNAME happens and
4270 * the correct value for ill_isv6 will be assigned in
4271 * ipif_set_values(). We need to hold the ill lock and
4272 * clear the ILL_LL_SUBNET_PENDING flag and atomically do
4273 * the wakeup.
4274 */
4275 (void) ipif_allocate(ill, 0, IRE_LOCAL,
4276 dlia->dl_provider_style != DL_STYLE2, B_TRUE, NULL);
4277 mutex_enter(&ill->ill_lock);
4278 ASSERT(ill->ill_dlpi_style_set == 0);
4279 ill->ill_dlpi_style_set = 1;
4280 ill->ill_state_flags &= ~ILL_LL_SUBNET_PENDING;
4281 cv_broadcast(&ill->ill_cv);
4282 mutex_exit(&ill->ill_lock);
4283 freemsg(mp);
4284 return;
4285 }
4286 ASSERT(ill->ill_ipif != NULL);
4287 /*
4288 * We know whether it is IPv4 or IPv6 now, as this is the
4289 * second DL_INFO_ACK we are receiving in response to the
4290 * DL_INFO_REQ sent in ipif_set_values.
4291 */
4292 ill->ill_sap = (ill->ill_isv6) ?
ipm->ip_m_ipv6sap : ipm->ip_m_ipv4sap;
4293 /*
4294 * Clear all the flags that were set based on ill_bcast_addr_length
4295 * and ill_phys_addr_length (in ipif_set_values) as these could have
4296 * changed now and we need to re-evaluate.
4297 */
4298 ill->ill_flags &= ~(ILLF_MULTICAST | ILLF_NONUD | ILLF_NOARP);
4299 ill->ill_ipif->ipif_flags &= ~(IPIF_BROADCAST | IPIF_POINTOPOINT);
4300
4301 /*
4302 * Free ill_bcast_mp as things could have changed now.
4303 *
4304 * NOTE: The IPMP meta-interface is special-cased because it starts
4305 * with no underlying interfaces (and thus an unknown broadcast
4306 * address length), but we enforce that an interface is broadcast-
4307 * capable as part of allowing it to join a group.
4308 */
4309 if (ill->ill_bcast_addr_length == 0 && !IS_IPMP(ill)) {
4310 if (ill->ill_bcast_mp != NULL)
4311 freemsg(ill->ill_bcast_mp);
4312 ill->ill_net_type = IRE_IF_NORESOLVER;
4313
4314 ill->ill_bcast_mp = ill_dlur_gen(NULL,
4315 ill->ill_phys_addr_length,
4316 ill->ill_sap,
4317 ill->ill_sap_length);
4318
4319 if (ill->ill_isv6)
4320 /*
4321 * Note: xresolv interfaces will eventually need NOARP
4322 * set here as well, but that will require those
4323 * external resolvers to have some knowledge of
4324 * that flag and act appropriately. Not to be changed
4325 * at present.
4326 */
4327 ill->ill_flags |= ILLF_NONUD;
4328 else
4329 ill->ill_flags |= ILLF_NOARP;
4330
4331 if (ill->ill_mactype == SUNW_DL_VNI) {
4332 ill->ill_ipif->ipif_flags |= IPIF_NOXMIT;
4333 } else if (ill->ill_phys_addr_length == 0 ||
4334 ill->ill_mactype == DL_IPV4 ||
4335 ill->ill_mactype == DL_IPV6) {
4336 /*
4337 * The underlying link is point-to-point, so mark the
4338 * interface as such. We can do IP multicast over
4339 * such a link since it transmits all network-layer
4340 * packets to the remote side the same way.
4341 */
4342 ill->ill_flags |= ILLF_MULTICAST;
4343 ill->ill_ipif->ipif_flags |= IPIF_POINTOPOINT;
4344 }
4345 } else {
4346 ill->ill_net_type = IRE_IF_RESOLVER;
4347 if (ill->ill_bcast_mp != NULL)
4348 freemsg(ill->ill_bcast_mp);
4349 ill->ill_bcast_mp = ill_dlur_gen(brdcst_addr,
4350 ill->ill_bcast_addr_length, ill->ill_sap,
4351 ill->ill_sap_length);
4352 /*
4353 * Later detect lack of DLPI driver multicast
4354 * capability by catching DL_ENABMULTI errors in
4355 * ip_rput_dlpi.
4356 */
4357 ill->ill_flags |= ILLF_MULTICAST;
4358 if (!ill->ill_isv6)
4359 ill->ill_ipif->ipif_flags |= IPIF_BROADCAST;
4360 }
4361
4362 /* For IPMP, PHYI_IPMP should already be set by phyint_flags_init() */
4363 if (ill->ill_mactype == SUNW_DL_IPMP)
4364 ASSERT(ill->ill_phyint->phyint_flags & PHYI_IPMP);
4365
4366 /* By default an interface does not support any CoS marking */
4367 ill->ill_flags &= ~ILLF_COS_ENABLED;
4368
4369 /*
4370 * If we get QoS information in DL_INFO_ACK, the device supports
4371 * some form of CoS marking; set ILLF_COS_ENABLED.
4372 */
4373 sel1 = (dl_qos_cl_sel1_t *)mi_offset_param(mp, dlia->dl_qos_offset,
4374 dlia->dl_qos_length);
4375 if ((sel1 != NULL) && (sel1->dl_qos_type == DL_QOS_CL_SEL1)) {
4376 ill->ill_flags |= ILLF_COS_ENABLED;
4377 }
4378
4379 /* Clear any previous error indication. */
4380 ill->ill_error = 0;
4381 freemsg(mp);
4382 }
4383
4384 /*
4385 * Perform various checks to verify that an address would make sense as a
4386 * local, remote, or subnet interface address.
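*
* For example (illustrative): with a subnet mask of 255.255.255.0, the
* addresses 192.0.2.0 (all-zeroes host part) and 192.0.2.255 (all-ones
* host part) are rejected while 192.0.2.1 is accepted; 255.255.255.255
* and class D (multicast) addresses are always rejected.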
4387 */
4388 static boolean_t
4389 ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask)
4390 {
4391 ipaddr_t net_mask;
4392
4393 /*
4394 * Don't allow all zeroes, or all ones, but allow
4395 * all ones netmask.
4396 */
4397 if ((net_mask = ip_net_mask(addr)) == 0)
4398 return (B_FALSE);
4399 /* A given netmask overrides the "guess" netmask */
4400 if (subnet_mask != 0)
4401 net_mask = subnet_mask;
4402 if ((net_mask != ~(ipaddr_t)0) && ((addr == (addr & net_mask)) ||
4403 (addr == (addr | ~net_mask)))) {
4404 return (B_FALSE);
4405 }
4406
4407 /*
4408 * Even if the netmask is all ones, we do not allow address to be
4409 * 255.255.255.255
4410 */
4411 if (addr == INADDR_BROADCAST)
4412 return (B_FALSE);
4413
4414 if (CLASSD(addr))
4415 return (B_FALSE);
4416
4417 return (B_TRUE);
4418 }
4419
4420 #define V6_IPIF_LINKLOCAL(p) \
4421 IN6_IS_ADDR_LINKLOCAL(&(p)->ipif_v6lcl_addr)
4422
4423 /*
4424 * Compare two given ipifs and check if the second one is better than
4425 * the first one using the order of preference (not taking deprecated
4426 * into account) specified in ipif_lookup_multicast().
4427 */
4428 static boolean_t
4429 ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, boolean_t isv6)
4430 {
4431 /* Check the least preferred first. */
4432 if (IS_LOOPBACK(old_ipif->ipif_ill)) {
4433 /* If both ipifs are loopback, use the first one. */
4434 if (IS_LOOPBACK(new_ipif->ipif_ill))
4435 return (B_FALSE);
4436 else
4437 return (B_TRUE);
4438 }
4439
4440 /* For IPv6, check for link local address. */
4441 if (isv6 && V6_IPIF_LINKLOCAL(old_ipif)) {
4442 if (IS_LOOPBACK(new_ipif->ipif_ill) ||
4443 V6_IPIF_LINKLOCAL(new_ipif)) {
4444 /* The second one is equal or less preferred. */
4445 return (B_FALSE);
4446 } else {
4447 return (B_TRUE);
4448 }
4449 }
4450
4451 /* Then check for a point-to-point interface. */
4452 if (old_ipif->ipif_flags & IPIF_POINTOPOINT) {
4453 if (IS_LOOPBACK(new_ipif->ipif_ill) ||
4454 (isv6 && V6_IPIF_LINKLOCAL(new_ipif)) ||
4455 (new_ipif->ipif_flags & IPIF_POINTOPOINT)) {
4456 return (B_FALSE);
4457 } else {
4458 return (B_TRUE);
4459 }
4460 }
4461
4462 /* old_ipif is a normal interface, so no need to use the new one. */
4463 return (B_FALSE);
4464 }
4465
4466 /*
4467 * Find a multicast-capable ipif given an IP instance and zoneid.
4468 * The ipif must be up, and its ill must be multicast-capable, not
4469 * condemned, not an underlying interface in an IPMP group, and
4470 * not a VNI interface. Order of preference:
4471 *
4472 * 1a. normal
4473 * 1b. normal, but deprecated
4474 * 2a. point to point
4475 * 2b. point to point, but deprecated
4476 * 3a. link local
4477 * 3b. link local, but deprecated
4478 * 4. loopback.
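*
* (Deprecation is handled by keeping two running candidates below: the
* best non-deprecated ipif in saved_ipif and the best deprecated one in
* dep_ipif, compared once at the end; so, per the table above, even a
* deprecated "normal" address is preferred to a point-to-point one.)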
4479 */
4480 static ipif_t *
4481 ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6)
4482 {
4483 ill_t *ill;
4484 ill_walk_context_t ctx;
4485 ipif_t *ipif;
4486 ipif_t *saved_ipif = NULL;
4487 ipif_t *dep_ipif = NULL;
4488
4489 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
4490 if (isv6)
4491 ill = ILL_START_WALK_V6(&ctx, ipst);
4492 else
4493 ill = ILL_START_WALK_V4(&ctx, ipst);
4494
4495 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
4496 mutex_enter(&ill->ill_lock);
4497 if (IS_VNI(ill) || IS_UNDER_IPMP(ill) ||
4498 ILL_IS_CONDEMNED(ill) ||
4499 !(ill->ill_flags & ILLF_MULTICAST)) {
4500 mutex_exit(&ill->ill_lock);
4501 continue;
4502 }
4503 for (ipif = ill->ill_ipif; ipif != NULL;
4504 ipif = ipif->ipif_next) {
4505 if (zoneid != ipif->ipif_zoneid &&
4506 zoneid != ALL_ZONES &&
4507 ipif->ipif_zoneid != ALL_ZONES) {
4508 continue;
4509 }
4510 if (!(ipif->ipif_flags & IPIF_UP) ||
4511 IPIF_IS_CONDEMNED(ipif)) {
4512 continue;
4513 }
4514
4515 /*
4516 * Found one candidate. If it is deprecated,
4517 * remember it in dep_ipif. If it is not deprecated,
4518 * remember it in saved_ipif.
4519 */
4520 if (ipif->ipif_flags & IPIF_DEPRECATED) {
4521 if (dep_ipif == NULL) {
4522 dep_ipif = ipif;
4523 } else if (ipif_comp_multi(dep_ipif, ipif,
4524 isv6)) {
4525 /*
4526 * If the previous dep_ipif does not
4527 * belong to the same ill, we've done
4528 * an ipif_refhold() on it. So we need
4529 * to release it.
4530 */
4531 if (dep_ipif->ipif_ill != ill)
4532 ipif_refrele(dep_ipif);
4533 dep_ipif = ipif;
4534 }
4535 continue;
4536 }
4537 if (saved_ipif == NULL) {
4538 saved_ipif = ipif;
4539 } else {
4540 if (ipif_comp_multi(saved_ipif, ipif, isv6)) {
4541 if (saved_ipif->ipif_ill != ill)
4542 ipif_refrele(saved_ipif);
4543 saved_ipif = ipif;
4544 }
4545 }
4546 }
4547 /*
4548 * Before going to the next ill, do an ipif_refhold() on the
4549 * saved ones.
4550 */
4551 if (saved_ipif != NULL && saved_ipif->ipif_ill == ill)
4552 ipif_refhold_locked(saved_ipif);
4553 if (dep_ipif != NULL && dep_ipif->ipif_ill == ill)
4554 ipif_refhold_locked(dep_ipif);
4555 mutex_exit(&ill->ill_lock);
4556 }
4557 rw_exit(&ipst->ips_ill_g_lock);
4558
4559 /*
4560 * If we have only the saved_ipif, return it. But if we have both
4561 * saved_ipif and dep_ipif, check to see which one is better.
4562 */
4563 if (saved_ipif != NULL) {
4564 if (dep_ipif != NULL) {
4565 if (ipif_comp_multi(saved_ipif, dep_ipif, isv6)) {
4566 ipif_refrele(saved_ipif);
4567 return (dep_ipif);
4568 } else {
4569 ipif_refrele(dep_ipif);
4570 return (saved_ipif);
4571 }
4572 }
4573 return (saved_ipif);
4574 } else {
4575 return (dep_ipif);
4576 }
4577 }
4578
4579 ill_t *
4580 ill_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6)
4581 {
4582 ipif_t *ipif;
4583 ill_t *ill;
4584
4585 ipif = ipif_lookup_multicast(ipst, zoneid, isv6);
4586 if (ipif == NULL)
4587 return (NULL);
4588
4589 ill = ipif->ipif_ill;
4590 ill_refhold(ill);
4591 ipif_refrele(ipif);
4592 return (ill);
4593 }
4594
4595 /*
4596 * This function is called when an application does not specify an interface
4597 * to be used for multicast traffic (joining a group/sending data). It
4598 * calls ire_lookup_multi_ill_v4() to look for an interface route for the
4599 * specified multicast group. Doing this allows the administrator to add
4600 * prefix routes for multicast to indicate which interface should be used for
4601 * multicast traffic in the above scenario.
The route could be for all
4602 * multicast (224.0/4), for a single multicast group (a /32 route) or
4603 * anything in between. If there is no such multicast route, we just find
4604 * any multicast capable interface and return it. The returned ipif
4605 * is refhold'ed.
4606 *
4607 * We support MULTIRT and RTF_SETSRC on the multicast routes added to the
4608 * unicast table. This is used by CGTP.
4609 */
4610 ill_t *
4611 ill_lookup_group_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst,
4612 boolean_t *multirtp, ipaddr_t *setsrcp)
4613 {
4614 ill_t *ill;
4615
4616 ill = ire_lookup_multi_ill_v4(group, zoneid, ipst, multirtp, setsrcp);
4617 if (ill != NULL)
4618 return (ill);
4619
4620 return (ill_lookup_multicast(ipst, zoneid, B_FALSE));
4621 }
4622
4623 /*
4624 * Look for an ipif with the specified interface address and destination.
4625 * The destination address is used only for matching point-to-point interfaces.
4626 */
4627 ipif_t *
4628 ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, ip_stack_t *ipst)
4629 {
4630 ipif_t *ipif;
4631 ill_t *ill;
4632 ill_walk_context_t ctx;
4633
4634 /*
4635 * First match all the point-to-point interfaces
4636 * before looking at non-point-to-point interfaces.
4637 * This is done to avoid returning a non-point-to-point
4638 * ipif instead of an unnumbered point-to-point ipif.
4639 */
4640 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
4641 ill = ILL_START_WALK_V4(&ctx, ipst);
4642 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
4643 mutex_enter(&ill->ill_lock);
4644 for (ipif = ill->ill_ipif; ipif != NULL;
4645 ipif = ipif->ipif_next) {
4646 /* Allow the ipif to be down */
4647 if ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
4648 (ipif->ipif_lcl_addr == if_addr) &&
4649 (ipif->ipif_pp_dst_addr == dst)) {
4650 if (!IPIF_IS_CONDEMNED(ipif)) {
4651 ipif_refhold_locked(ipif);
4652 mutex_exit(&ill->ill_lock);
4653 rw_exit(&ipst->ips_ill_g_lock);
4654 return (ipif);
4655 }
4656 }
4657 }
4658 mutex_exit(&ill->ill_lock);
4659 }
4660 rw_exit(&ipst->ips_ill_g_lock);
4661
4662 /* lookup the ipif based on interface address */
4663 ipif = ipif_lookup_addr(if_addr, NULL, ALL_ZONES, ipst);
4664 ASSERT(ipif == NULL || !ipif->ipif_isv6);
4665 return (ipif);
4666 }
4667
4668 /*
4669 * Common function for ipif_lookup_addr() and ipif_lookup_addr_exact().
4670 */
4671 static ipif_t *
4672 ipif_lookup_addr_common(ipaddr_t addr, ill_t *match_ill, uint32_t match_flags,
4673 zoneid_t zoneid, ip_stack_t *ipst)
4674 {
4675 ipif_t *ipif;
4676 ill_t *ill;
4677 boolean_t ptp = B_FALSE;
4678 ill_walk_context_t ctx;
4679 boolean_t match_illgrp = (match_flags & IPIF_MATCH_ILLGRP);
4680 boolean_t no_duplicate = (match_flags & IPIF_MATCH_NONDUP);
4681
4682 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
4683 /*
4684 * Repeat twice, first based on local addresses and
4685 * next time for point-to-point.
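*
* Illustrative sketch of the two passes (hypothetical addresses): for an
* unnumbered point-to-point link plumbed as
*
*	ifconfig ip.tun0 plumb 192.0.2.1 192.0.2.9 up
*
* a lookup for 192.0.2.9 fails the first (local address) pass and then
* matches ipif_pp_dst_addr on the second (point-to-point) pass.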
4686 */
4687 repeat:
4688 ill = ILL_START_WALK_V4(&ctx, ipst);
4689 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
4690 if (match_ill != NULL && ill != match_ill &&
4691 (!match_illgrp || !IS_IN_SAME_ILLGRP(ill, match_ill))) {
4692 continue;
4693 }
4694 mutex_enter(&ill->ill_lock);
4695 for (ipif = ill->ill_ipif; ipif != NULL;
4696 ipif = ipif->ipif_next) {
4697 if (zoneid != ALL_ZONES &&
4698 zoneid != ipif->ipif_zoneid &&
4699 ipif->ipif_zoneid != ALL_ZONES)
4700 continue;
4701
4702 if (no_duplicate && !(ipif->ipif_flags & IPIF_UP))
4703 continue;
4704
4705 /* Allow the ipif to be down */
4706 if ((!ptp && (ipif->ipif_lcl_addr == addr) &&
4707 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) ||
4708 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) &&
4709 (ipif->ipif_pp_dst_addr == addr))) {
4710 if (!IPIF_IS_CONDEMNED(ipif)) {
4711 ipif_refhold_locked(ipif);
4712 mutex_exit(&ill->ill_lock);
4713 rw_exit(&ipst->ips_ill_g_lock);
4714 return (ipif);
4715 }
4716 }
4717 }
4718 mutex_exit(&ill->ill_lock);
4719 }
4720
4721 /* If we already did the ptp case, then we are done */
4722 if (ptp) {
4723 rw_exit(&ipst->ips_ill_g_lock);
4724 return (NULL);
4725 }
4726 ptp = B_TRUE;
4727 goto repeat;
4728 }
4729
4730 /*
4731 * Look up an ipif with the specified address. For point-to-point links we
4732 * look for matches on either the destination address or the local address,
4733 * but we skip the local address check if IPIF_UNNUMBERED is set. If the
4734 * `match_ill' argument is non-NULL, the lookup is restricted to that ill
4735 * (or illgrp if `match_ill' is in an IPMP group).
4736 */
4737 ipif_t *
4738 ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid,
4739 ip_stack_t *ipst)
4740 {
4741 return (ipif_lookup_addr_common(addr, match_ill, IPIF_MATCH_ILLGRP,
4742 zoneid, ipst));
4743 }
4744
4745 /*
4746 * Look up an ipif with the specified address. Similar to ipif_lookup_addr,
4747 * except that we will only return an address if it is not marked as
4748 * IPIF_DUPLICATE.
4749 */
4750 ipif_t *
4751 ipif_lookup_addr_nondup(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid,
4752 ip_stack_t *ipst)
4753 {
4754 return (ipif_lookup_addr_common(addr, match_ill,
4755 (IPIF_MATCH_ILLGRP | IPIF_MATCH_NONDUP),
4756 zoneid, ipst));
4757 }
4758
4759 /*
4760 * Special abbreviated version of ipif_lookup_addr() that doesn't match
4761 * `match_ill' across the IPMP group. This function is only needed in some
4762 * corner-cases; almost everything should use ipif_lookup_addr().
4763 */
4764 ipif_t *
4765 ipif_lookup_addr_exact(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst)
4766 {
4767 ASSERT(match_ill != NULL);
4768 return (ipif_lookup_addr_common(addr, match_ill, 0, ALL_ZONES,
4769 ipst));
4770 }
4771
4772 /*
4773 * Look for an ipif with the specified address. For point-to-point links
4774 * we look for matches on either the destination address or the local
4775 * address, but we ignore the check on the local address if IPIF_UNNUMBERED
4776 * is set.
4777 * If the `match_ill' argument is non-NULL, the lookup is restricted to that
4778 * ill (or illgrp if `match_ill' is in an IPMP group).
4779 * Return the zoneid for the ipif which matches. ALL_ZONES if no match.
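*
* A minimal caller sketch (hypothetical):
*
*	zoneid = ipif_lookup_addr_zoneid(src, NULL, ipst);
*	if (zoneid == ALL_ZONES)
*		... `src' is not a local address; treat accordingly ...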
4780 */
4781 zoneid_t
4782 ipif_lookup_addr_zoneid(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst)
4783 {
4784 zoneid_t zoneid;
4785 ipif_t *ipif;
4786 ill_t *ill;
4787 boolean_t ptp = B_FALSE;
4788 ill_walk_context_t ctx;
4789
4790 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
4791 /*
4792 * Repeat twice, first based on local addresses and
4793 * next time for point-to-point.
4794 */
4795 repeat:
4796 ill = ILL_START_WALK_V4(&ctx, ipst);
4797 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
4798 if (match_ill != NULL && ill != match_ill &&
4799 !IS_IN_SAME_ILLGRP(ill, match_ill)) {
4800 continue;
4801 }
4802 mutex_enter(&ill->ill_lock);
4803 for (ipif = ill->ill_ipif; ipif != NULL;
4804 ipif = ipif->ipif_next) {
4805 /* Allow the ipif to be down */
4806 if ((!ptp && (ipif->ipif_lcl_addr == addr) &&
4807 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) ||
4808 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) &&
4809 (ipif->ipif_pp_dst_addr == addr)) &&
4810 !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
4811 zoneid = ipif->ipif_zoneid;
4812 mutex_exit(&ill->ill_lock);
4813 rw_exit(&ipst->ips_ill_g_lock);
4814 /*
4815 * If ipif_zoneid was ALL_ZONES then we have
4816 * a trusted extensions shared IP address.
4817 * In that case GLOBAL_ZONEID works to send.
4818 */
4819 if (zoneid == ALL_ZONES)
4820 zoneid = GLOBAL_ZONEID;
4821 return (zoneid);
4822 }
4823 }
4824 mutex_exit(&ill->ill_lock);
4825 }
4826
4827 /* If we already did the ptp case, then we are done */
4828 if (ptp) {
4829 rw_exit(&ipst->ips_ill_g_lock);
4830 return (ALL_ZONES);
4831 }
4832 ptp = B_TRUE;
4833 goto repeat;
4834 }
4835
4836 /*
4837 * Look for an ipif that matches the specified remote address, i.e., the
4838 * ipif that would receive the specified packet.
4839 * First look for directly connected interfaces and then do a recursive
4840 * IRE lookup and pick the first ipif corresponding to the source address in the
4841 * ire.
4842 * Returns: held ipif
4843 *
4844 * This is only used for ICMP_ADDRESS_MASK_REQUESTs
4845 */
4846 ipif_t *
4847 ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid)
4848 {
4849 ipif_t *ipif;
4850
4851 ASSERT(!ill->ill_isv6);
4852
4853 /*
4854 * Someone could be changing this ipif currently or change it
4855 * after we return this. Thus a few packets could use the old
4856 * values. However, structure updates/creates (ire, ilg, ilm etc.)
4857 * will atomically be updated or cleaned up with the new value.
4858 * Thus we don't need a lock to check the flags or other attrs below.
4859 */
4860 mutex_enter(&ill->ill_lock);
4861 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
4862 if (IPIF_IS_CONDEMNED(ipif))
4863 continue;
4864 if (zoneid != ALL_ZONES && zoneid != ipif->ipif_zoneid &&
4865 ipif->ipif_zoneid != ALL_ZONES)
4866 continue;
4867 /* Allow the ipif to be down */
4868 if (ipif->ipif_flags & IPIF_POINTOPOINT) {
4869 if ((ipif->ipif_pp_dst_addr == addr) ||
4870 (!(ipif->ipif_flags & IPIF_UNNUMBERED) &&
4871 ipif->ipif_lcl_addr == addr)) {
4872 ipif_refhold_locked(ipif);
4873 mutex_exit(&ill->ill_lock);
4874 return (ipif);
4875 }
4876 } else if (ipif->ipif_subnet == (addr & ipif->ipif_net_mask)) {
4877 ipif_refhold_locked(ipif);
4878 mutex_exit(&ill->ill_lock);
4879 return (ipif);
4880 }
4881 }
4882 mutex_exit(&ill->ill_lock);
4883 /*
4884 * For a remote destination it isn't possible to nail down a particular
4885 * ipif.
4886 */
4887
4888 /* Pick the first interface */
4889 ipif = ipif_get_next_ipif(NULL, ill);
4890 return (ipif);
4891 }
4892
4893 /*
4894 * This func does not prevent refcnt from increasing. But if
4895 * the caller has taken steps to that effect, then this func
4896 * can be used to determine whether the ill has become quiescent.
4897 */
4898 static boolean_t
4899 ill_is_quiescent(ill_t *ill)
4900 {
4901 ipif_t *ipif;
4902
4903 ASSERT(MUTEX_HELD(&ill->ill_lock));
4904
4905 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
4906 if (ipif->ipif_refcnt != 0)
4907 return (B_FALSE);
4908 }
4909 if (!ILL_DOWN_OK(ill) || ill->ill_refcnt != 0) {
4910 return (B_FALSE);
4911 }
4912 return (B_TRUE);
4913 }
4914
4915 boolean_t
4916 ill_is_freeable(ill_t *ill)
4917 {
4918 ipif_t *ipif;
4919
4920 ASSERT(MUTEX_HELD(&ill->ill_lock));
4921
4922 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
4923 if (ipif->ipif_refcnt != 0) {
4924 return (B_FALSE);
4925 }
4926 }
4927 if (!ILL_FREE_OK(ill) || ill->ill_refcnt != 0) {
4928 return (B_FALSE);
4929 }
4930 return (B_TRUE);
4931 }
4932
4933 /*
4934 * This func does not prevent refcnt from increasing. But if
4935 * the caller has taken steps to that effect, then this func
4936 * can be used to determine whether the ipif has become quiescent.
4937 */
4938 static boolean_t
4939 ipif_is_quiescent(ipif_t *ipif)
4940 {
4941 ill_t *ill;
4942
4943 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
4944
4945 if (ipif->ipif_refcnt != 0)
4946 return (B_FALSE);
4947
4948 ill = ipif->ipif_ill;
4949 if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 ||
4950 ill->ill_logical_down) {
4951 return (B_TRUE);
4952 }
4953
4954 /* This is the last ipif going down or being deleted on this ill */
4955 if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0) {
4956 return (B_FALSE);
4957 }
4958
4959 return (B_TRUE);
4960 }
4961
4962 /*
4963 * Return B_TRUE if the ipif can be destroyed: the ipif has to be quiescent
4964 * with zero references from ire/ilm to it.
4965 */
4966 static boolean_t
4967 ipif_is_freeable(ipif_t *ipif)
4968 {
4969 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
4970 ASSERT(ipif->ipif_id != 0);
4971 return (ipif->ipif_refcnt == 0);
4972 }
4973
4974 /*
4975 * The ipif/ill/ire has been refrele'd. Do the tail processing.
4976 * Determine if the ipif or ill in question has become quiescent, and if so
4977 * wake up the close and/or restart any queued pending ioctl that is waiting
4978 * for the ipif_down (or ill_down).
4979 */
4980 void
4981 ipif_ill_refrele_tail(ill_t *ill)
4982 {
4983 mblk_t *mp;
4984 conn_t *connp;
4985 ipsq_t *ipsq;
4986 ipxop_t *ipx;
4987 ipif_t *ipif;
4988 dl_notify_ind_t *dlindp;
4989
4990 ASSERT(MUTEX_HELD(&ill->ill_lock));
4991
4992 if ((ill->ill_state_flags & ILL_CONDEMNED) && ill_is_freeable(ill)) {
4993 /* ip_modclose() may be waiting */
4994 cv_broadcast(&ill->ill_cv);
4995 }
4996
4997 ipsq = ill->ill_phyint->phyint_ipsq;
4998 mutex_enter(&ipsq->ipsq_lock);
4999 ipx = ipsq->ipsq_xop;
5000 mutex_enter(&ipx->ipx_lock);
5001 if (ipx->ipx_waitfor == 0) /* no one's waiting; bail */
5002 goto unlock;
5003
5004 ASSERT(ipx->ipx_pending_mp != NULL && ipx->ipx_pending_ipif != NULL);
5005
5006 ipif = ipx->ipx_pending_ipif;
5007 if (ipif->ipif_ill != ill) /* wait is for another ill; bail */
5008 goto unlock;
5009
5010 switch (ipx->ipx_waitfor) {
5011 case IPIF_DOWN:
5012 if (!ipif_is_quiescent(ipif))
5013 goto unlock;
5014 break;
5015 case IPIF_FREE:
5016 if (!ipif_is_freeable(ipif))
5017 goto unlock;
5018 break;
5019 case ILL_DOWN:
5020 if (!ill_is_quiescent(ill))
5021 goto unlock;
5022 break;
5023 case ILL_FREE:
5024 /*
5025 * ILL_FREE is only for loopback; normal ill teardown waits
5026 * synchronously in ip_modclose() without using ipx_waitfor,
5027 * handled by the cv_broadcast() at the top of this function.
5028 */
5029 if (!ill_is_freeable(ill))
5030 goto unlock;
5031 break;
5032 default:
5033 cmn_err(CE_PANIC, "ipsq: %p unknown ipx_waitfor %d\n",
5034 (void *)ipsq, ipx->ipx_waitfor);
5035 }
5036
5037 ill_refhold_locked(ill); /* for qwriter_ip() call below */
5038 mutex_exit(&ipx->ipx_lock);
5039 mp = ipsq_pending_mp_get(ipsq, &connp);
5040 mutex_exit(&ipsq->ipsq_lock);
5041 mutex_exit(&ill->ill_lock);
5042
5043 ASSERT(mp != NULL);
5044 /*
5045 * NOTE: all of the qwriter_ip() calls below use CUR_OP since
5046 * we can only get here when the current operation decides it
5047 * needs to quiesce via ipsq_pending_mp_add().
5048 */
5049 switch (mp->b_datap->db_type) {
5050 case M_PCPROTO:
5051 case M_PROTO:
5052 /*
5053 * For now, only DL_NOTIFY_IND messages can use this facility.
5054 */
5055 dlindp = (dl_notify_ind_t *)mp->b_rptr;
5056 ASSERT(dlindp->dl_primitive == DL_NOTIFY_IND);
5057
5058 switch (dlindp->dl_notification) {
5059 case DL_NOTE_PHYS_ADDR:
5060 qwriter_ip(ill, ill->ill_rq, mp,
5061 ill_set_phys_addr_tail, CUR_OP, B_TRUE);
5062 return;
5063 case DL_NOTE_REPLUMB:
5064 qwriter_ip(ill, ill->ill_rq, mp,
5065 ill_replumb_tail, CUR_OP, B_TRUE);
5066 return;
5067 default:
5068 ASSERT(0);
5069 ill_refrele(ill);
5070 }
5071 break;
5072
5073 case M_ERROR:
5074 case M_HANGUP:
5075 qwriter_ip(ill, ill->ill_rq, mp, ipif_all_down_tail, CUR_OP,
5076 B_TRUE);
5077 return;
5078
5079 case M_IOCTL:
5080 case M_IOCDATA:
5081 qwriter_ip(ill, (connp != NULL ?
CONNP_TO_WQ(connp) :
5082 ill->ill_wq), mp, ip_reprocess_ioctl, CUR_OP, B_TRUE);
5083 return;
5084
5085 default:
5086 cmn_err(CE_PANIC, "ipif_ill_refrele_tail mp %p "
5087 "db_type %d\n", (void *)mp, mp->b_datap->db_type);
5088 }
5089 return;
5090 unlock:
5091 mutex_exit(&ipsq->ipsq_lock);
5092 mutex_exit(&ipx->ipx_lock);
5093 mutex_exit(&ill->ill_lock);
5094 }
5095
5096 #ifdef DEBUG
5097 /* Reuse trace buffer from beginning (if reached the end) and record trace */
5098 static void
5099 th_trace_rrecord(th_trace_t *th_trace)
5100 {
5101 tr_buf_t *tr_buf;
5102 uint_t lastref;
5103
5104 lastref = th_trace->th_trace_lastref;
5105 lastref++;
5106 if (lastref == TR_BUF_MAX)
5107 lastref = 0;
5108 th_trace->th_trace_lastref = lastref;
5109 tr_buf = &th_trace->th_trbuf[lastref];
5110 tr_buf->tr_time = ddi_get_lbolt();
5111 tr_buf->tr_depth = getpcstack(tr_buf->tr_stack, TR_STACK_DEPTH);
5112 }
5113
5114 static void
5115 th_trace_free(void *value)
5116 {
5117 th_trace_t *th_trace = value;
5118
5119 ASSERT(th_trace->th_refcnt == 0);
5120 kmem_free(th_trace, sizeof (*th_trace));
5121 }
5122
5123 /*
5124 * Find or create the per-thread hash table used to track object references.
5125 * The ipst argument is NULL if we shouldn't allocate.
5126 *
5127 * Accesses per-thread data, so there's no need to lock here.
5128 */
5129 static mod_hash_t *
5130 th_trace_gethash(ip_stack_t *ipst)
5131 {
5132 th_hash_t *thh;
5133
5134 if ((thh = tsd_get(ip_thread_data)) == NULL && ipst != NULL) {
5135 mod_hash_t *mh;
5136 char name[256];
5137 size_t objsize, rshift;
5138 int retv;
5139
5140 if ((thh = kmem_alloc(sizeof (*thh), KM_NOSLEEP)) == NULL)
5141 return (NULL);
5142 (void) snprintf(name, sizeof (name), "th_trace_%p",
5143 (void *)curthread);
5144
5145 /*
5146 * We use mod_hash_create_extended here rather than the more
5147 * obvious mod_hash_create_ptrhash because the latter has a
5148 * hard-coded KM_SLEEP, and we'd prefer to fail rather than
5149 * block.
5150 */
5151 objsize = MAX(MAX(sizeof (ill_t), sizeof (ipif_t)),
5152 MAX(sizeof (ire_t), sizeof (ncec_t)));
5153 rshift = highbit(objsize);
5154 mh = mod_hash_create_extended(name, 64, mod_hash_null_keydtor,
5155 th_trace_free, mod_hash_byptr, (void *)rshift,
5156 mod_hash_ptrkey_cmp, KM_NOSLEEP);
5157 if (mh == NULL) {
5158 kmem_free(thh, sizeof (*thh));
5159 return (NULL);
5160 }
5161 thh->thh_hash = mh;
5162 thh->thh_ipst = ipst;
5163 /*
5164 * We trace ills, ipifs, ires, and nces. All of these are
5165 * per-IP-stack, so the lock on the thread list is as well.
5166 */
5167 rw_enter(&ip_thread_rwlock, RW_WRITER);
5168 list_insert_tail(&ip_thread_list, thh);
5169 rw_exit(&ip_thread_rwlock);
5170 retv = tsd_set(ip_thread_data, thh);
5171 ASSERT(retv == 0);
5172 }
5173 return (thh != NULL ? thh->thh_hash : NULL);
5174 }
5175
5176 boolean_t
5177 th_trace_ref(const void *obj, ip_stack_t *ipst)
5178 {
5179 th_trace_t *th_trace;
5180 mod_hash_t *mh;
5181 mod_hash_val_t val;
5182
5183 if ((mh = th_trace_gethash(ipst)) == NULL)
5184 return (B_FALSE);
5185
5186 /*
5187 * Attempt to locate the trace buffer for this obj and thread.
5188 * If it does not exist, then allocate a new trace buffer and
5189 * insert into the hash.
5190 */
5191 if (mod_hash_find(mh, (mod_hash_key_t)obj, &val) == MH_ERR_NOTFOUND) {
5192 th_trace = kmem_zalloc(sizeof (th_trace_t), KM_NOSLEEP);
5193 if (th_trace == NULL)
5194 return (B_FALSE);
5195
5196 th_trace->th_id = curthread;
5197 if (mod_hash_insert(mh, (mod_hash_key_t)obj,
5198 (mod_hash_val_t)th_trace) != 0) {
5199 kmem_free(th_trace, sizeof (th_trace_t));
5200 return (B_FALSE);
5201 }
5202 } else {
5203 th_trace = (th_trace_t *)val;
5204 }
5205
5206 ASSERT(th_trace->th_refcnt >= 0 &&
5207 th_trace->th_refcnt < TR_BUF_MAX - 1);
5208
5209 th_trace->th_refcnt++;
5210 th_trace_rrecord(th_trace);
5211 return (B_TRUE);
5212 }
5213
5214 /*
5215 * For the purpose of tracing a reference release, we assume that global
5216 * tracing is always on and that the same thread that initiated the reference
5217 * hold is releasing it.
5218 */
5219 void
5220 th_trace_unref(const void *obj)
5221 {
5222 int retv;
5223 mod_hash_t *mh;
5224 th_trace_t *th_trace;
5225 mod_hash_val_t val;
5226
5227 mh = th_trace_gethash(NULL);
5228 retv = mod_hash_find(mh, (mod_hash_key_t)obj, &val);
5229 ASSERT(retv == 0);
5230 th_trace = (th_trace_t *)val;
5231
5232 ASSERT(th_trace->th_refcnt > 0);
5233 th_trace->th_refcnt--;
5234 th_trace_rrecord(th_trace);
5235 }
5236
5237 /*
5238 * If tracing has been disabled, then we assume that the reference counts are
5239 * now useless, and we clear them out before destroying the entries.
5240 */
5241 void
5242 th_trace_cleanup(const void *obj, boolean_t trace_disable)
5243 {
5244 th_hash_t *thh;
5245 mod_hash_t *mh;
5246 mod_hash_val_t val;
5247 th_trace_t *th_trace;
5248 int retv;
5249
5250 rw_enter(&ip_thread_rwlock, RW_READER);
5251 for (thh = list_head(&ip_thread_list); thh != NULL;
5252 thh = list_next(&ip_thread_list, thh)) {
5253 if (mod_hash_find(mh = thh->thh_hash, (mod_hash_key_t)obj,
5254 &val) == 0) {
5255 th_trace = (th_trace_t *)val;
5256 if (trace_disable)
5257 th_trace->th_refcnt = 0;
5258 retv = mod_hash_destroy(mh, (mod_hash_key_t)obj);
5259 ASSERT(retv == 0);
5260 }
5261 }
5262 rw_exit(&ip_thread_rwlock);
5263 }
5264
5265 void
5266 ipif_trace_ref(ipif_t *ipif)
5267 {
5268 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
5269
5270 if (ipif->ipif_trace_disable)
5271 return;
5272
5273 if (!th_trace_ref(ipif, ipif->ipif_ill->ill_ipst)) {
5274 ipif->ipif_trace_disable = B_TRUE;
5275 ipif_trace_cleanup(ipif);
5276 }
5277 }
5278
5279 void
5280 ipif_untrace_ref(ipif_t *ipif)
5281 {
5282 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
5283
5284 if (!ipif->ipif_trace_disable)
5285 th_trace_unref(ipif);
5286 }
5287
5288 void
5289 ill_trace_ref(ill_t *ill)
5290 {
5291 ASSERT(MUTEX_HELD(&ill->ill_lock));
5292
5293 if (ill->ill_trace_disable)
5294 return;
5295
5296 if (!th_trace_ref(ill, ill->ill_ipst)) {
5297 ill->ill_trace_disable = B_TRUE;
5298 ill_trace_cleanup(ill);
5299 }
5300 }
5301
5302 void
5303 ill_untrace_ref(ill_t *ill)
5304 {
5305 ASSERT(MUTEX_HELD(&ill->ill_lock));
5306
5307 if (!ill->ill_trace_disable)
5308 th_trace_unref(ill);
5309 }
5310
5311 /*
5312 * Called when ipif is unplumbed or when memory alloc fails. Note that on
5313 * failure, ipif_trace_disable is set.
5314 */
5315 static void
5316 ipif_trace_cleanup(const ipif_t *ipif)
5317 {
5318 th_trace_cleanup(ipif, ipif->ipif_trace_disable);
5319 }
5320
5321 /*
5322 * Called when ill is unplumbed or when memory alloc fails. Note that on
5323 * failure, ill_trace_disable is set.
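*
* (Under DEBUG, every ill_refhold()/ill_refrele() funnels through
* ill_trace_ref()/ill_untrace_ref() via the ILL_TRACE_REF and
* ILL_UNTRACE_REF macros above, so a leaked reference shows up as a
* th_trace_t with a non-zero th_refcnt and recorded stacks in th_trbuf[].)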
5324 */
5325 static void
5326 ill_trace_cleanup(const ill_t *ill)
5327 {
5328 th_trace_cleanup(ill, ill->ill_trace_disable);
5329 }
5330 #endif /* DEBUG */
5331
5332 void
5333 ipif_refhold_locked(ipif_t *ipif)
5334 {
5335 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
5336 ipif->ipif_refcnt++;
5337 IPIF_TRACE_REF(ipif);
5338 }
5339
5340 void
5341 ipif_refhold(ipif_t *ipif)
5342 {
5343 ill_t *ill;
5344
5345 ill = ipif->ipif_ill;
5346 mutex_enter(&ill->ill_lock);
5347 ipif->ipif_refcnt++;
5348 IPIF_TRACE_REF(ipif);
5349 mutex_exit(&ill->ill_lock);
5350 }
5351
5352 /*
5353 * Must not be called while holding any locks. Otherwise if this is
5354 * the last reference to be released, there is a chance of recursive mutex
5355 * panic due to ipif_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying
5356 * to restart an ioctl.
5357 */
5358 void
5359 ipif_refrele(ipif_t *ipif)
5360 {
5361 ill_t *ill;
5362
5363 ill = ipif->ipif_ill;
5364
5365 mutex_enter(&ill->ill_lock);
5366 ASSERT(ipif->ipif_refcnt != 0);
5367 ipif->ipif_refcnt--;
5368 IPIF_UNTRACE_REF(ipif);
5369 if (ipif->ipif_refcnt != 0) {
5370 mutex_exit(&ill->ill_lock);
5371 return;
5372 }
5373
5374 /* Drops the ill_lock */
5375 ipif_ill_refrele_tail(ill);
5376 }
5377
5378 ipif_t *
5379 ipif_get_next_ipif(ipif_t *curr, ill_t *ill)
5380 {
5381 ipif_t *ipif;
5382
5383 mutex_enter(&ill->ill_lock);
5384 for (ipif = (curr == NULL ? ill->ill_ipif : curr->ipif_next);
5385 ipif != NULL; ipif = ipif->ipif_next) {
5386 if (IPIF_IS_CONDEMNED(ipif))
5387 continue;
5388 ipif_refhold_locked(ipif);
5389 mutex_exit(&ill->ill_lock);
5390 return (ipif);
5391 }
5392 mutex_exit(&ill->ill_lock);
5393 return (NULL);
5394 }
5395
5396 /*
5397 * TODO: make this table extendible at run time
5398 * Return a pointer to the mac type info for 'mac_type'
5399 */
5400 static ip_m_t *
5401 ip_m_lookup(t_uscalar_t mac_type)
5402 {
5403 ip_m_t *ipm;
5404
5405 for (ipm = ip_m_tbl; ipm < A_END(ip_m_tbl); ipm++)
5406 if (ipm->ip_m_mac_type == mac_type)
5407 return (ipm);
5408 return (NULL);
5409 }
5410
5411 /*
5412 * Make a link layer address from the multicast IP address *addr.
5413 * To form the link layer address, invoke the ip_m_v*mapping function
5414 * associated with the link-layer type.
5415 */
5416 void
5417 ip_mcast_mapping(ill_t *ill, uchar_t *addr, uchar_t *hwaddr)
5418 {
5419 ip_m_t *ipm;
5420
5421 if (ill->ill_net_type == IRE_IF_NORESOLVER)
5422 return;
5423
5424 ASSERT(addr != NULL);
5425
5426 ipm = ip_m_lookup(ill->ill_mactype);
5427 if (ipm == NULL ||
5428 (ill->ill_isv6 && ipm->ip_m_v6mapping == NULL) ||
5429 (!ill->ill_isv6 && ipm->ip_m_v4mapping == NULL)) {
5430 ip0dbg(("no mapping for ill %s mactype 0x%x\n",
5431 ill->ill_name, ill->ill_mactype));
5432 return;
5433 }
5434 if (ill->ill_isv6)
5435 (*ipm->ip_m_v6mapping)(ill, addr, hwaddr);
5436 else
5437 (*ipm->ip_m_v4mapping)(ill, addr, hwaddr);
5438 }
5439
5440 /*
5441 * ip_rt_add is called to add an IPv4 route to the forwarding table.
5442 * ill is passed in to associate it with the correct interface.
5443 * If ire_arg is set, then we return the held IRE in that location.
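*
* A hedged usage sketch (argument values are hypothetical): adding a
* default route via gateway 192.0.2.1 might look like
*
*	error = ip_rt_add(0, 0, htonl(0xC0000201), INADDR_ANY,
*	    RTF_GATEWAY | RTF_UP, NULL, NULL, B_FALSE, NULL, ipst,
*	    GLOBAL_ZONEID);
*
* i.e. a zero dst_addr/mask selects an IRE_DEFAULT, and the gateway must
* resolve to an existing interface route.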
5444 */
5445 int
5446 ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
5447 ipaddr_t src_addr, int flags, ill_t *ill, ire_t **ire_arg,
5448 boolean_t ioctl_msg, struct rtsa_s *sp, ip_stack_t *ipst, zoneid_t zoneid)
5449 {
5450 ire_t *ire, *nire;
5451 ire_t *gw_ire = NULL;
5452 ipif_t *ipif = NULL;
5453 uint_t type;
5454 int match_flags = MATCH_IRE_TYPE;
5455 tsol_gc_t *gc = NULL;
5456 tsol_gcgrp_t *gcgrp = NULL;
5457 boolean_t gcgrp_xtraref = B_FALSE;
5458 boolean_t cgtp_broadcast;
5459 boolean_t unbound = B_FALSE;
5460
5461 ip1dbg(("ip_rt_add:"));
5462
5463 if (ire_arg != NULL)
5464 *ire_arg = NULL;
5465
5466 /*
5467 * If this is the case of RTF_HOST being set, then we set the netmask
5468 * to all ones (regardless of whether one was supplied).
5469 */
5470 if (flags & RTF_HOST)
5471 mask = IP_HOST_MASK;
5472
5473 /*
5474 * Prevent routes with a zero gateway from being created (since
5475 * interfaces can currently be plumbed and brought up with no assigned
5476 * address).
5477 */
5478 if (gw_addr == 0)
5479 return (ENETUNREACH);
5480 /*
5481 * Get the ipif, if any, corresponding to the gw_addr.
5482 * If -ifp was specified we restrict ourselves to the ill, otherwise
5483 * we match on the gateway and destination to handle unnumbered pt-pt
5484 * interfaces.
5485 */
5486 if (ill != NULL)
5487 ipif = ipif_lookup_addr(gw_addr, ill, ALL_ZONES, ipst);
5488 else
5489 ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst);
5490 if (ipif != NULL) {
5491 if (IS_VNI(ipif->ipif_ill)) {
5492 ipif_refrele(ipif);
5493 return (EINVAL);
5494 }
5495 }
5496
5497 /*
5498 * GateD will attempt to create routes with a loopback interface
5499 * address as the gateway and with RTF_GATEWAY set. We allow
5500 * these routes to be added, but create them as interface routes
5501 * since the gateway is an interface address.
5502 */
5503 if ((ipif != NULL) && (ipif->ipif_ire_type == IRE_LOOPBACK)) {
5504 flags &= ~RTF_GATEWAY;
5505 if (gw_addr == INADDR_LOOPBACK && dst_addr == INADDR_LOOPBACK &&
5506 mask == IP_HOST_MASK) {
5507 ire = ire_ftable_lookup_v4(dst_addr, 0, 0, IRE_LOOPBACK,
5508 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst,
5509 NULL);
5510 if (ire != NULL) {
5511 ire_refrele(ire);
5512 ipif_refrele(ipif);
5513 return (EEXIST);
5514 }
5515 ip1dbg(("ip_rt_add: 0x%p creating IRE 0x%x "
5516 "for 0x%x\n", (void *)ipif,
5517 ipif->ipif_ire_type,
5518 ntohl(ipif->ipif_lcl_addr)));
5519 ire = ire_create(
5520 (uchar_t *)&dst_addr, /* dest address */
5521 (uchar_t *)&mask, /* mask */
5522 NULL, /* no gateway */
5523 ipif->ipif_ire_type, /* LOOPBACK */
5524 ipif->ipif_ill,
5525 zoneid,
5526 (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE : 0,
5527 NULL,
5528 ipst);
5529
5530 if (ire == NULL) {
5531 ipif_refrele(ipif);
5532 return (ENOMEM);
5533 }
5534 /* src address assigned by the caller? */
5535 if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC))
5536 ire->ire_setsrc_addr = src_addr;
5537
5538 nire = ire_add(ire);
5539 if (nire == NULL) {
5540 /*
5541 * In the event of failure, ire_add() will have
5542 * already deleted the ire in question, so there
5543 * is no need to do that here.
5544 */
5545 ipif_refrele(ipif);
5546 return (ENOMEM);
5547 }
5548 /*
5549 * Check if it was a duplicate entry.
This handles
5550 * the case of two racing route adds for the same route.
5551 */
5552 if (nire != ire) {
5553 ASSERT(nire->ire_identical_ref > 1);
5554 ire_delete(nire);
5555 ire_refrele(nire);
5556 ipif_refrele(ipif);
5557 return (EEXIST);
5558 }
5559 ire = nire;
5560 goto save_ire;
5561 }
5562 }
5563
5564 /*
5565 * The routes for multicast with CGTP are quite special in that
5566 * the gateway is the local interface address, yet RTF_GATEWAY
5567 * is set. We turn off RTF_GATEWAY to provide compatibility with
5568 * this undocumented and unusual use of multicast routes.
5569 */
5570 if ((flags & RTF_MULTIRT) && ipif != NULL)
5571 flags &= ~RTF_GATEWAY;
5572
5573 /*
5574 * Traditionally, interface routes are ones where RTF_GATEWAY isn't set
5575 * and the gateway address provided is one of the system's interface
5576 * addresses. By using the routing socket interface and supplying an
5577 * RTA_IFP sockaddr with an interface index, an alternate method of
5578 * specifying an interface route to be created is available which uses
5579 * the interface index that specifies the outgoing interface rather than
5580 * the address of an outgoing interface (which may not be able to
5581 * uniquely identify an interface). When coupled with the RTF_GATEWAY
5582 * flag, routes can be specified which not only specify the next-hop to
5583 * be used when routing to a certain prefix, but also which outgoing
5584 * interface should be used.
5585 *
5586 * Previously, interfaces would have unique addresses assigned to them
5587 * and so the address assigned to a particular interface could be used
5588 * to identify a particular interface. One exception to this was the
5589 * case of an unnumbered interface (where IPIF_UNNUMBERED was set).
5590 *
5591 * With the advent of IPv6 and its link-local addresses, this
5592 * restriction was relaxed and interfaces could share addresses between
5593 * themselves. In fact, typically all of the link-local interfaces on
5594 * an IPv6 node or router will have the same link-local address. In
5595 * order to differentiate between these interfaces, the use of an
5596 * interface index is necessary and this index can be carried inside a
5597 * RTA_IFP sockaddr (which is actually a sockaddr_dl). One restriction
5598 * of using the interface index, however, is that all of the ipif's that
5599 * are part of an ill have the same index and so the RTA_IFP sockaddr
5600 * cannot be used to differentiate between ipif's (or logical
5601 * interfaces) that belong to the same ill (physical interface).
5602 *
5603 * For example, in the following case involving IPv4 interfaces and
5604 * logical interfaces
5605 *
5606 * 192.0.2.32 255.255.255.224 192.0.2.33 U if0
5607 * 192.0.2.32 255.255.255.224 192.0.2.34 U if0
5608 * 192.0.2.32 255.255.255.224 192.0.2.35 U if0
5609 *
5610 * the ipif's corresponding to each of these interface routes can be
5611 * uniquely identified by the "gateway" (actually interface address).
5612 * 5613 * In this case involving multiple IPv6 default routes to a particular 5614 * link-local gateway, the use of RTA_IFP is necessary to specify which 5615 * default route is of interest: 5616 * 5617 * default fe80::123:4567:89ab:cdef U if0 5618 * default fe80::123:4567:89ab:cdef U if1 5619 */ 5620 5621 /* RTF_GATEWAY not set */ 5622 if (!(flags & RTF_GATEWAY)) { 5623 if (sp != NULL) { 5624 ip2dbg(("ip_rt_add: gateway security attributes " 5625 "cannot be set with interface route\n")); 5626 if (ipif != NULL) 5627 ipif_refrele(ipif); 5628 return (EINVAL); 5629 } 5630 5631 /* 5632 * Whether or not ill (RTA_IFP) is set, we require that 5633 * the gateway is one of our local addresses. 5634 */ 5635 if (ipif == NULL) 5636 return (ENETUNREACH); 5637 5638 /* 5639 * We use MATCH_IRE_ILL here. If the caller specified an 5640 * interface (from the RTA_IFP sockaddr) we use it, otherwise 5641 * we use the ill derived from the gateway address. 5642 * We can always match the gateway address since we record it 5643 * in ire_gateway_addr. 5644 * We don't allow RTA_IFP to specify a different ill than the 5645 * one matching the ipif to make sure we can delete the route. 5646 */ 5647 match_flags |= MATCH_IRE_GW | MATCH_IRE_ILL; 5648 if (ill == NULL) { 5649 ill = ipif->ipif_ill; 5650 } else if (ill != ipif->ipif_ill) { 5651 ipif_refrele(ipif); 5652 return (EINVAL); 5653 } 5654 5655 /* 5656 * We check for an existing entry at this point. 5657 * 5658 * Since a netmask isn't passed in via the ioctl interface 5659 * (SIOCADDRT), we don't check for a matching netmask in that 5660 * case. 5661 */ 5662 if (!ioctl_msg) 5663 match_flags |= MATCH_IRE_MASK; 5664 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, 5665 IRE_INTERFACE, ill, ALL_ZONES, NULL, match_flags, 0, ipst, 5666 NULL); 5667 if (ire != NULL) { 5668 ire_refrele(ire); 5669 ipif_refrele(ipif); 5670 return (EEXIST); 5671 } 5672 5673 /* 5674 * Create a copy of the IRE_LOOPBACK, IRE_IF_NORESOLVER or 5675 * IRE_IF_RESOLVER with the modified address, netmask, and 5676 * gateway. 5677 */ 5678 ire = ire_create( 5679 (uchar_t *)&dst_addr, 5680 (uint8_t *)&mask, 5681 (uint8_t *)&gw_addr, 5682 ill->ill_net_type, 5683 ill, 5684 zoneid, 5685 flags, 5686 NULL, 5687 ipst); 5688 if (ire == NULL) { 5689 ipif_refrele(ipif); 5690 return (ENOMEM); 5691 } 5692 5693 /* 5694 * Some software (for example, GateD and Sun Cluster) attempts 5695 * to create (what amount to) IRE_PREFIX routes with the 5696 * loopback address as the gateway. This is primarily done to 5697 * set up prefixes with the RTF_REJECT flag set (for example, 5698 * when generating aggregate routes.) 5699 * 5700 * If the IRE type (as defined by ill->ill_net_type) is 5701 * IRE_LOOPBACK, then we map the request into a 5702 * IRE_IF_NORESOLVER. We also OR in the RTF_BLACKHOLE flag as 5703 * these interface routes, by definition, can only be that. 5704 * 5705 * Needless to say, the real IRE_LOOPBACK is NOT created by this 5706 * routine, but rather using ire_create() directly. 5707 * 5708 */ 5709 if (ill->ill_net_type == IRE_LOOPBACK) { 5710 ire->ire_type = IRE_IF_NORESOLVER; 5711 ire->ire_flags |= RTF_BLACKHOLE; 5712 } 5713 5714 /* src address assigned by the caller? */ 5715 if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC)) 5716 ire->ire_setsrc_addr = src_addr; 5717 5718 nire = ire_add(ire); 5719 if (nire == NULL) { 5720 /* 5721 * In the result of failure, ire_add() will have 5722 * already deleted the ire in question, so there 5723 * is no need to do that here. 
5724 */ 5725 ipif_refrele(ipif); 5726 return (ENOMEM); 5727 } 5728 /* 5729 * Check if it was a duplicate entry. This handles 5730 * the case of two racing route adds for the same route 5731 */ 5732 if (nire != ire) { 5733 ire_delete(nire); 5734 ire_refrele(nire); 5735 ipif_refrele(ipif); 5736 return (EEXIST); 5737 } 5738 ire = nire; 5739 goto save_ire; 5740 } 5741 5742 /* 5743 * Get an interface IRE for the specified gateway. 5744 * If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the 5745 * gateway, it is currently unreachable and we fail the request 5746 * accordingly. We reject any RTF_GATEWAY routes where the gateway 5747 * is an IRE_LOCAL or IRE_LOOPBACK. 5748 * If RTA_IFP was specified we look on that particular ill. 5749 */ 5750 if (ill != NULL) 5751 match_flags |= MATCH_IRE_ILL; 5752 5753 /* Check whether the gateway is reachable. */ 5754 again: 5755 type = IRE_INTERFACE | IRE_LOCAL | IRE_LOOPBACK; 5756 if (flags & RTF_INDIRECT) 5757 type |= IRE_OFFLINK; 5758 5759 gw_ire = ire_ftable_lookup_v4(gw_addr, 0, 0, type, ill, 5760 ALL_ZONES, NULL, match_flags, 0, ipst, NULL); 5761 if (gw_ire == NULL) { 5762 /* 5763 * With IPMP, we allow host routes to influence in.mpathd's 5764 * target selection. However, if the test addresses are on 5765 * their own network, the above lookup will fail since the 5766 * underlying IRE_INTERFACEs are marked hidden. So allow 5767 * hidden test IREs to be found and try again. 5768 */ 5769 if (!(match_flags & MATCH_IRE_TESTHIDDEN)) { 5770 match_flags |= MATCH_IRE_TESTHIDDEN; 5771 goto again; 5772 } 5773 if (ipif != NULL) 5774 ipif_refrele(ipif); 5775 return (ENETUNREACH); 5776 } 5777 if (gw_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) { 5778 ire_refrele(gw_ire); 5779 if (ipif != NULL) 5780 ipif_refrele(ipif); 5781 return (ENETUNREACH); 5782 } 5783 5784 if (ill == NULL && !(flags & RTF_INDIRECT)) { 5785 unbound = B_TRUE; 5786 if (ipst->ips_ip_strict_src_multihoming > 0) 5787 ill = gw_ire->ire_ill; 5788 } 5789 5790 /* 5791 * We create one of three types of IREs as a result of this request 5792 * based on the netmask. A netmask of all ones (which is automatically 5793 * assumed when RTF_HOST is set) results in an IRE_HOST being created. 5794 * An all zeroes netmask implies a default route so an IRE_DEFAULT is 5795 * created. Otherwise, an IRE_PREFIX route is created for the 5796 * destination prefix. 5797 */ 5798 if (mask == IP_HOST_MASK) 5799 type = IRE_HOST; 5800 else if (mask == 0) 5801 type = IRE_DEFAULT; 5802 else 5803 type = IRE_PREFIX; 5804 5805 /* check for a duplicate entry */ 5806 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill, 5807 ALL_ZONES, NULL, match_flags | MATCH_IRE_MASK | MATCH_IRE_GW, 5808 0, ipst, NULL); 5809 if (ire != NULL) { 5810 if (ipif != NULL) 5811 ipif_refrele(ipif); 5812 ire_refrele(gw_ire); 5813 ire_refrele(ire); 5814 return (EEXIST); 5815 } 5816 5817 /* Security attribute exists */ 5818 if (sp != NULL) { 5819 tsol_gcgrp_addr_t ga; 5820 5821 /* find or create the gateway credentials group */ 5822 ga.ga_af = AF_INET; 5823 IN6_IPADDR_TO_V4MAPPED(gw_addr, &ga.ga_addr); 5824 5825 /* we hold reference to it upon success */ 5826 gcgrp = gcgrp_lookup(&ga, B_TRUE); 5827 if (gcgrp == NULL) { 5828 if (ipif != NULL) 5829 ipif_refrele(ipif); 5830 ire_refrele(gw_ire); 5831 return (ENOMEM); 5832 } 5833 5834 /* 5835 * Create and add the security attribute to the group; a 5836 * reference to the group is made upon allocating a new 5837 * entry successfully. 
If it finds an already-existing 5838 * entry for the security attribute in the group, it simply 5839 * returns it and no new reference is made to the group. 5840 */ 5841 gc = gc_create(sp, gcgrp, &gcgrp_xtraref); 5842 if (gc == NULL) { 5843 if (ipif != NULL) 5844 ipif_refrele(ipif); 5845 /* release reference held by gcgrp_lookup */ 5846 GCGRP_REFRELE(gcgrp); 5847 ire_refrele(gw_ire); 5848 return (ENOMEM); 5849 } 5850 } 5851 5852 /* Create the IRE. */ 5853 ire = ire_create( 5854 (uchar_t *)&dst_addr, /* dest address */ 5855 (uchar_t *)&mask, /* mask */ 5856 (uchar_t *)&gw_addr, /* gateway address */ 5857 (ushort_t)type, /* IRE type */ 5858 ill, 5859 zoneid, 5860 flags, 5861 gc, /* security attribute */ 5862 ipst); 5863 5864 /* 5865 * The ire holds a reference to the 'gc' and the 'gc' holds a 5866 * reference to the 'gcgrp'. We can now release the extra reference 5867 * the 'gcgrp' acquired in the gcgrp_lookup, if it was not used. 5868 */ 5869 if (gcgrp_xtraref) 5870 GCGRP_REFRELE(gcgrp); 5871 if (ire == NULL) { 5872 if (gc != NULL) 5873 GC_REFRELE(gc); 5874 if (ipif != NULL) 5875 ipif_refrele(ipif); 5876 ire_refrele(gw_ire); 5877 return (ENOMEM); 5878 } 5879 5880 /* Before we add, check if an extra CGTP broadcast is needed */ 5881 cgtp_broadcast = ((flags & RTF_MULTIRT) && 5882 ip_type_v4(ire->ire_addr, ipst) == IRE_BROADCAST); 5883 5884 /* src address assigned by the caller? */ 5885 if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC)) 5886 ire->ire_setsrc_addr = src_addr; 5887 5888 ire->ire_unbound = unbound; 5889 5890 /* 5891 * POLICY: should we allow an RTF_HOST with address INADDR_ANY? 5892 * SUN/OS socket stuff does but do we really want to allow 0.0.0.0? 5893 */ 5894 5895 /* Add the new IRE. */ 5896 nire = ire_add(ire); 5897 if (nire == NULL) { 5898 /* 5899 * In the result of failure, ire_add() will have 5900 * already deleted the ire in question, so there 5901 * is no need to do that here. 5902 */ 5903 if (ipif != NULL) 5904 ipif_refrele(ipif); 5905 ire_refrele(gw_ire); 5906 return (ENOMEM); 5907 } 5908 /* 5909 * Check if it was a duplicate entry. This handles 5910 * the case of two racing route adds for the same route 5911 */ 5912 if (nire != ire) { 5913 ire_delete(nire); 5914 ire_refrele(nire); 5915 if (ipif != NULL) 5916 ipif_refrele(ipif); 5917 ire_refrele(gw_ire); 5918 return (EEXIST); 5919 } 5920 ire = nire; 5921 5922 if (flags & RTF_MULTIRT) { 5923 /* 5924 * Invoke the CGTP (multirouting) filtering module 5925 * to add the dst address in the filtering database. 5926 * Replicated inbound packets coming from that address 5927 * will be filtered to discard the duplicates. 5928 * It is not necessary to call the CGTP filter hook 5929 * when the dst address is a broadcast or multicast, 5930 * because an IP source address cannot be a broadcast 5931 * or a multicast. 
5932 */ 5933 if (cgtp_broadcast) { 5934 ip_cgtp_bcast_add(ire, ipst); 5935 goto save_ire; 5936 } 5937 if (ipst->ips_ip_cgtp_filter_ops != NULL && 5938 !CLASSD(ire->ire_addr)) { 5939 int res; 5940 ipif_t *src_ipif; 5941 5942 /* Find the source address corresponding to gw_ire */ 5943 src_ipif = ipif_lookup_addr(gw_ire->ire_gateway_addr, 5944 NULL, zoneid, ipst); 5945 if (src_ipif != NULL) { 5946 res = ipst->ips_ip_cgtp_filter_ops-> 5947 cfo_add_dest_v4( 5948 ipst->ips_netstack->netstack_stackid, 5949 ire->ire_addr, 5950 ire->ire_gateway_addr, 5951 ire->ire_setsrc_addr, 5952 src_ipif->ipif_lcl_addr); 5953 ipif_refrele(src_ipif); 5954 } else { 5955 res = EADDRNOTAVAIL; 5956 } 5957 if (res != 0) { 5958 if (ipif != NULL) 5959 ipif_refrele(ipif); 5960 ire_refrele(gw_ire); 5961 ire_delete(ire); 5962 ire_refrele(ire); /* Held in ire_add */ 5963 return (res); 5964 } 5965 } 5966 } 5967 5968 save_ire: 5969 if (gw_ire != NULL) { 5970 ire_refrele(gw_ire); 5971 gw_ire = NULL; 5972 } 5973 if (ill != NULL) { 5974 /* 5975 * Save enough information so that we can recreate the IRE if 5976 * the interface goes down and then up. The metrics associated 5977 * with the route will be saved as well when rts_setmetrics() is 5978 * called after the IRE has been created. In the case where 5979 * memory cannot be allocated, none of this information will be 5980 * saved. 5981 */ 5982 ill_save_ire(ill, ire); 5983 } 5984 if (ioctl_msg) 5985 ip_rts_rtmsg(RTM_OLDADD, ire, 0, ipst); 5986 if (ire_arg != NULL) { 5987 /* 5988 * Store the ire that was successfully added into where ire_arg 5989 * points to so that callers don't have to look it up 5990 * themselves (but they are responsible for ire_refrele()ing 5991 * the ire when they are finished with it). 5992 */ 5993 *ire_arg = ire; 5994 } else { 5995 ire_refrele(ire); /* Held in ire_add */ 5996 } 5997 if (ipif != NULL) 5998 ipif_refrele(ipif); 5999 return (0); 6000 } 6001 6002 /* 6003 * ip_rt_delete is called to delete an IPv4 route. 6004 * ill is passed in to associate it with the correct interface. 6005 */ 6006 /* ARGSUSED4 */ 6007 int 6008 ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, 6009 uint_t rtm_addrs, int flags, ill_t *ill, boolean_t ioctl_msg, 6010 ip_stack_t *ipst, zoneid_t zoneid) 6011 { 6012 ire_t *ire = NULL; 6013 ipif_t *ipif; 6014 uint_t type; 6015 uint_t match_flags = MATCH_IRE_TYPE; 6016 int err = 0; 6017 6018 ip1dbg(("ip_rt_delete:")); 6019 /* 6020 * If this is the case of RTF_HOST being set, then we set the netmask 6021 * to all ones. Otherwise, we use the netmask if one was supplied. 6022 */ 6023 if (flags & RTF_HOST) { 6024 mask = IP_HOST_MASK; 6025 match_flags |= MATCH_IRE_MASK; 6026 } else if (rtm_addrs & RTA_NETMASK) { 6027 match_flags |= MATCH_IRE_MASK; 6028 } 6029 6030 /* 6031 * Note that RTF_GATEWAY is never set on a delete, therefore 6032 * we check if the gateway address is one of our interfaces first, 6033 * and fall back on RTF_GATEWAY routes. 6034 * 6035 * This makes it possible to delete an original 6036 * IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1. 6037 * However, we have RTF_KERNEL set on the ones created by ipif_up 6038 * and those can not be deleted here. 6039 * 6040 * We use MATCH_IRE_ILL if we know the interface. If the caller 6041 * specified an interface (from the RTA_IFP sockaddr) we use it, 6042 * otherwise we use the ill derived from the gateway address. 6043 * We can always match the gateway address since we record it 6044 * in ire_gateway_addr. 
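 *
 * Concretely, the lookup below first tries an IRE_LOOPBACK or
 * IRE_INTERFACE entry restricted by MATCH_IRE_ILL (adding MATCH_IRE_GW
 * for the interface case), discards anything marked RTF_KERNEL, and
 * only then falls back to an IRE_HOST/IRE_PREFIX/IRE_DEFAULT lookup
 * keyed on the gateway address.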
6045 * 6046 * For more detail on specifying routes by gateway address and by 6047 * interface index, see the comments in ip_rt_add(). 6048 */ 6049 ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst); 6050 if (ipif != NULL) { 6051 ill_t *ill_match; 6052 6053 if (ill != NULL) 6054 ill_match = ill; 6055 else 6056 ill_match = ipif->ipif_ill; 6057 6058 match_flags |= MATCH_IRE_ILL; 6059 if (ipif->ipif_ire_type == IRE_LOOPBACK) { 6060 ire = ire_ftable_lookup_v4(dst_addr, 0, 0, IRE_LOOPBACK, 6061 ill_match, ALL_ZONES, NULL, match_flags, 0, ipst, 6062 NULL); 6063 } 6064 if (ire == NULL) { 6065 match_flags |= MATCH_IRE_GW; 6066 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, 6067 IRE_INTERFACE, ill_match, ALL_ZONES, NULL, 6068 match_flags, 0, ipst, NULL); 6069 } 6070 /* Avoid deleting routes created by kernel from an ipif */ 6071 if (ire != NULL && (ire->ire_flags & RTF_KERNEL)) { 6072 ire_refrele(ire); 6073 ire = NULL; 6074 } 6075 6076 /* Restore in case we didn't find a match */ 6077 match_flags &= ~(MATCH_IRE_GW|MATCH_IRE_ILL); 6078 } 6079 6080 if (ire == NULL) { 6081 /* 6082 * At this point, the gateway address is not one of our own 6083 * addresses or a matching interface route was not found. We 6084 * set the IRE type to lookup based on whether 6085 * this is a host route, a default route or just a prefix. 6086 * 6087 * If an ill was passed in, then the lookup is based on an 6088 * interface index so MATCH_IRE_ILL is added to match_flags. 6089 */ 6090 match_flags |= MATCH_IRE_GW; 6091 if (ill != NULL) 6092 match_flags |= MATCH_IRE_ILL; 6093 if (mask == IP_HOST_MASK) 6094 type = IRE_HOST; 6095 else if (mask == 0) 6096 type = IRE_DEFAULT; 6097 else 6098 type = IRE_PREFIX; 6099 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill, 6100 ALL_ZONES, NULL, match_flags, 0, ipst, NULL); 6101 } 6102 6103 if (ipif != NULL) { 6104 ipif_refrele(ipif); 6105 ipif = NULL; 6106 } 6107 6108 if (ire == NULL) 6109 return (ESRCH); 6110 6111 if (ire->ire_flags & RTF_MULTIRT) { 6112 /* 6113 * Invoke the CGTP (multirouting) filtering module 6114 * to remove the dst address from the filtering database. 6115 * Packets coming from that address will no longer be 6116 * filtered to remove duplicates. 6117 */ 6118 if (ipst->ips_ip_cgtp_filter_ops != NULL) { 6119 err = ipst->ips_ip_cgtp_filter_ops->cfo_del_dest_v4( 6120 ipst->ips_netstack->netstack_stackid, 6121 ire->ire_addr, ire->ire_gateway_addr); 6122 } 6123 ip_cgtp_bcast_delete(ire, ipst); 6124 } 6125 6126 ill = ire->ire_ill; 6127 if (ill != NULL) 6128 ill_remove_saved_ire(ill, ire); 6129 if (ioctl_msg) 6130 ip_rts_rtmsg(RTM_OLDDEL, ire, 0, ipst); 6131 ire_delete(ire); 6132 ire_refrele(ire); 6133 return (err); 6134 } 6135 6136 /* 6137 * ip_siocaddrt is called to complete processing of an SIOCADDRT IOCTL. 
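 *
 * A minimal user-level sketch of the legacy interface this implements
 * (illustrative only; error handling elided, `s' is an AF_INET socket):
 *
 *	struct rtentry rt;
 *	struct sockaddr_in *sin;
 *
 *	bzero(&rt, sizeof (rt));
 *	sin = (struct sockaddr_in *)&rt.rt_dst;
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("192.0.2.0");
 *	sin = (struct sockaddr_in *)&rt.rt_gateway;
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("192.0.2.33");
 *	rt.rt_flags = RTF_UP | RTF_GATEWAY;
 *	(void) ioctl(s, SIOCADDRT, &rt);
 *
 * Note that struct rtentry carries no netmask, which is why the code
 * below infers one via ip_subnet_mask() unless RTF_HOST is set.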
6138 */ 6139 /* ARGSUSED */ 6140 int 6141 ip_siocaddrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 6142 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 6143 { 6144 ipaddr_t dst_addr; 6145 ipaddr_t gw_addr; 6146 ipaddr_t mask; 6147 int error = 0; 6148 mblk_t *mp1; 6149 struct rtentry *rt; 6150 ipif_t *ipif = NULL; 6151 ip_stack_t *ipst; 6152 6153 ASSERT(q->q_next == NULL); 6154 ipst = CONNQ_TO_IPST(q); 6155 6156 ip1dbg(("ip_siocaddrt:")); 6157 /* Existence of mp1 verified in ip_wput_nondata */ 6158 mp1 = mp->b_cont->b_cont; 6159 rt = (struct rtentry *)mp1->b_rptr; 6160 6161 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 6162 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 6163 6164 /* 6165 * If the RTF_HOST flag is on, this is a request to assign a gateway 6166 * to a particular host address. In this case, we set the netmask to 6167 * all ones for the particular destination address. Otherwise, 6168 * determine the netmask to be used based on dst_addr and the interfaces 6169 * in use. 6170 */ 6171 if (rt->rt_flags & RTF_HOST) { 6172 mask = IP_HOST_MASK; 6173 } else { 6174 /* 6175 * Note that ip_subnet_mask returns a zero mask in the case of 6176 * default (an all-zeroes address). 6177 */ 6178 mask = ip_subnet_mask(dst_addr, &ipif, ipst); 6179 } 6180 6181 error = ip_rt_add(dst_addr, mask, gw_addr, 0, rt->rt_flags, NULL, NULL, 6182 B_TRUE, NULL, ipst, ALL_ZONES); 6183 if (ipif != NULL) 6184 ipif_refrele(ipif); 6185 return (error); 6186 } 6187 6188 /* 6189 * ip_siocdelrt is called to complete processing of an SIOCDELRT IOCTL. 6190 */ 6191 /* ARGSUSED */ 6192 int 6193 ip_siocdelrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 6194 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 6195 { 6196 ipaddr_t dst_addr; 6197 ipaddr_t gw_addr; 6198 ipaddr_t mask; 6199 int error; 6200 mblk_t *mp1; 6201 struct rtentry *rt; 6202 ipif_t *ipif = NULL; 6203 ip_stack_t *ipst; 6204 6205 ASSERT(q->q_next == NULL); 6206 ipst = CONNQ_TO_IPST(q); 6207 6208 ip1dbg(("ip_siocdelrt:")); 6209 /* Existence of mp1 verified in ip_wput_nondata */ 6210 mp1 = mp->b_cont->b_cont; 6211 rt = (struct rtentry *)mp1->b_rptr; 6212 6213 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 6214 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 6215 6216 /* 6217 * If the RTF_HOST flag is on, this is a request to delete a gateway 6218 * to a particular host address. In this case, we set the netmask to 6219 * all ones for the particular destination address. Otherwise, 6220 * determine the netmask to be used based on dst_addr and the interfaces 6221 * in use. 6222 */ 6223 if (rt->rt_flags & RTF_HOST) { 6224 mask = IP_HOST_MASK; 6225 } else { 6226 /* 6227 * Note that ip_subnet_mask returns a zero mask in the case of 6228 * default (an all-zeroes address). 6229 */ 6230 mask = ip_subnet_mask(dst_addr, &ipif, ipst); 6231 } 6232 6233 error = ip_rt_delete(dst_addr, mask, gw_addr, 6234 RTA_DST | RTA_GATEWAY | RTA_NETMASK, rt->rt_flags, NULL, B_TRUE, 6235 ipst, ALL_ZONES); 6236 if (ipif != NULL) 6237 ipif_refrele(ipif); 6238 return (error); 6239 } 6240 6241 /* 6242 * Enqueue the mp onto the ipsq, chained by b_next. 6243 * b_prev stores the function to be executed later, and b_queue the queue 6244 * where this mp originated. 
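 *
 * The dequeue side reverses this encoding; ipsq_exit(), for example,
 * recovers the deferred call roughly as
 *
 *	func = (ipsq_func_t)mp->b_prev;
 *	q = mp->b_queue;
 *
 * before clearing both fields and dispatching (*func)(ipsq, q, mp, NULL).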
6245 */ 6246 void 6247 ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, 6248 ill_t *pending_ill) 6249 { 6250 conn_t *connp; 6251 ipxop_t *ipx = ipsq->ipsq_xop; 6252 6253 ASSERT(MUTEX_HELD(&ipsq->ipsq_lock)); 6254 ASSERT(MUTEX_HELD(&ipx->ipx_lock)); 6255 ASSERT(func != NULL); 6256 6257 mp->b_queue = q; 6258 mp->b_prev = (void *)func; 6259 mp->b_next = NULL; 6260 6261 switch (type) { 6262 case CUR_OP: 6263 if (ipx->ipx_mptail != NULL) { 6264 ASSERT(ipx->ipx_mphead != NULL); 6265 ipx->ipx_mptail->b_next = mp; 6266 } else { 6267 ASSERT(ipx->ipx_mphead == NULL); 6268 ipx->ipx_mphead = mp; 6269 } 6270 ipx->ipx_mptail = mp; 6271 break; 6272 6273 case NEW_OP: 6274 if (ipsq->ipsq_xopq_mptail != NULL) { 6275 ASSERT(ipsq->ipsq_xopq_mphead != NULL); 6276 ipsq->ipsq_xopq_mptail->b_next = mp; 6277 } else { 6278 ASSERT(ipsq->ipsq_xopq_mphead == NULL); 6279 ipsq->ipsq_xopq_mphead = mp; 6280 } 6281 ipsq->ipsq_xopq_mptail = mp; 6282 ipx->ipx_ipsq_queued = B_TRUE; 6283 break; 6284 6285 case SWITCH_OP: 6286 ASSERT(ipsq->ipsq_swxop != NULL); 6287 /* only one switch operation is currently allowed */ 6288 ASSERT(ipsq->ipsq_switch_mp == NULL); 6289 ipsq->ipsq_switch_mp = mp; 6290 ipx->ipx_ipsq_queued = B_TRUE; 6291 break; 6292 default: 6293 cmn_err(CE_PANIC, "ipsq_enq %d type \n", type); 6294 } 6295 6296 if (CONN_Q(q) && pending_ill != NULL) { 6297 connp = Q_TO_CONN(q); 6298 ASSERT(MUTEX_HELD(&connp->conn_lock)); 6299 connp->conn_oper_pending_ill = pending_ill; 6300 } 6301 } 6302 6303 /* 6304 * Dequeue the next message that requested exclusive access to this IPSQ's 6305 * xop. Specifically: 6306 * 6307 * 1. If we're still processing the current operation on `ipsq', then 6308 * dequeue the next message for the operation (from ipx_mphead), or 6309 * return NULL if there are no queued messages for the operation. 6310 * These messages are queued via CUR_OP to qwriter_ip() and friends. 6311 * 6312 * 2. If the current operation on `ipsq' has completed (ipx_current_ipif is 6313 * not set) see if the ipsq has requested an xop switch. If so, switch 6314 * `ipsq' to a different xop. Xop switches only happen when joining or 6315 * leaving IPMP groups and require a careful dance -- see the comments 6316 * in-line below for details. If we're leaving a group xop or if we're 6317 * joining a group xop and become writer on it, then we proceed to (3). 6318 * Otherwise, we return NULL and exit the xop. 6319 * 6320 * 3. For each IPSQ in the xop, return any switch operation stored on 6321 * ipsq_switch_mp (set via SWITCH_OP); these must be processed before 6322 * any other messages queued on the IPSQ. Otherwise, dequeue the next 6323 * exclusive operation (queued via NEW_OP) stored on ipsq_xopq_mphead. 6324 * Note that if the phyint tied to `ipsq' is not using IPMP there will 6325 * only be one IPSQ in the xop. Otherwise, there will be one IPSQ for 6326 * each phyint in the group, including the IPMP meta-interface phyint. 6327 */ 6328 static mblk_t * 6329 ipsq_dq(ipsq_t *ipsq) 6330 { 6331 ill_t *illv4, *illv6; 6332 mblk_t *mp; 6333 ipsq_t *xopipsq; 6334 ipsq_t *leftipsq = NULL; 6335 ipxop_t *ipx; 6336 phyint_t *phyi = ipsq->ipsq_phyint; 6337 ip_stack_t *ipst = ipsq->ipsq_ipst; 6338 boolean_t emptied = B_FALSE; 6339 6340 /* 6341 * Grab all the locks we need in the defined order (ill_g_lock -> 6342 * ipsq_lock -> ipx_lock); ill_g_lock is needed to use ipsq_next. 6343 */ 6344 rw_enter(&ipst->ips_ill_g_lock, 6345 ipsq->ipsq_swxop != NULL ? 
RW_WRITER : RW_READER); 6346 mutex_enter(&ipsq->ipsq_lock); 6347 ipx = ipsq->ipsq_xop; 6348 mutex_enter(&ipx->ipx_lock); 6349 6350 /* 6351 * Dequeue the next message associated with the current exclusive 6352 * operation, if any. 6353 */ 6354 if ((mp = ipx->ipx_mphead) != NULL) { 6355 ipx->ipx_mphead = mp->b_next; 6356 if (ipx->ipx_mphead == NULL) 6357 ipx->ipx_mptail = NULL; 6358 mp->b_next = (void *)ipsq; 6359 goto out; 6360 } 6361 6362 if (ipx->ipx_current_ipif != NULL) 6363 goto empty; 6364 6365 if (ipsq->ipsq_swxop != NULL) { 6366 /* 6367 * The exclusive operation that is now being completed has 6368 * requested a switch to a different xop. This happens 6369 * when an interface joins or leaves an IPMP group. Joins 6370 * happen through SIOCSLIFGROUPNAME (ip_sioctl_groupname()). 6371 * Leaves happen via SIOCSLIFGROUPNAME, interface unplumb 6372 * (phyint_free()), or interface plumb for an ill type 6373 * not in the IPMP group (ip_rput_dlpi_writer()). 6374 * 6375 * Xop switches are not allowed on the IPMP meta-interface. 6376 */ 6377 ASSERT(phyi == NULL || !(phyi->phyint_flags & PHYI_IPMP)); 6378 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 6379 DTRACE_PROBE1(ipsq__switch, (ipsq_t *), ipsq); 6380 6381 if (ipsq->ipsq_swxop == &ipsq->ipsq_ownxop) { 6382 /* 6383 * We're switching back to our own xop, so we have two 6384 * xop's to drain/exit: our own, and the group xop 6385 * that we are leaving. 6386 * 6387 * First, pull ourselves out of the group ipsq list. 6388 * This is safe since we're writer on ill_g_lock. 6389 */ 6390 ASSERT(ipsq->ipsq_xop != &ipsq->ipsq_ownxop); 6391 6392 xopipsq = ipx->ipx_ipsq; 6393 while (xopipsq->ipsq_next != ipsq) 6394 xopipsq = xopipsq->ipsq_next; 6395 6396 xopipsq->ipsq_next = ipsq->ipsq_next; 6397 ipsq->ipsq_next = ipsq; 6398 ipsq->ipsq_xop = ipsq->ipsq_swxop; 6399 ipsq->ipsq_swxop = NULL; 6400 6401 /* 6402 * Second, prepare to exit the group xop. The actual 6403 * ipsq_exit() is done at the end of this function 6404 * since we cannot hold any locks across ipsq_exit(). 6405 * Note that although we drop the group's ipx_lock, no 6406 * threads can proceed since we're still ipx_writer. 6407 */ 6408 leftipsq = xopipsq; 6409 mutex_exit(&ipx->ipx_lock); 6410 6411 /* 6412 * Third, set ipx to point to our own xop (which was 6413 * inactive and therefore can be entered). 6414 */ 6415 ipx = ipsq->ipsq_xop; 6416 mutex_enter(&ipx->ipx_lock); 6417 ASSERT(ipx->ipx_writer == NULL); 6418 ASSERT(ipx->ipx_current_ipif == NULL); 6419 } else { 6420 /* 6421 * We're switching from our own xop to a group xop. 6422 * The requestor of the switch must ensure that the 6423 * group xop cannot go away (e.g. by ensuring the 6424 * phyint associated with the xop cannot go away). 6425 * 6426 * If we can become writer on our new xop, then we'll 6427 * do the drain. Otherwise, the current writer of our 6428 * new xop will do the drain when it exits. 6429 * 6430 * First, splice ourselves into the group IPSQ list. 6431 * This is safe since we're writer on ill_g_lock. 6432 */ 6433 ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop); 6434 6435 xopipsq = ipsq->ipsq_swxop->ipx_ipsq; 6436 while (xopipsq->ipsq_next != ipsq->ipsq_swxop->ipx_ipsq) 6437 xopipsq = xopipsq->ipsq_next; 6438 6439 xopipsq->ipsq_next = ipsq; 6440 ipsq->ipsq_next = ipsq->ipsq_swxop->ipx_ipsq; 6441 ipsq->ipsq_xop = ipsq->ipsq_swxop; 6442 ipsq->ipsq_swxop = NULL; 6443 6444 /* 6445 * Second, exit our own xop, since it's now unused. 6446 * This is safe since we've got the only reference. 
6447 */ 6448 ASSERT(ipx->ipx_writer == curthread); 6449 ipx->ipx_writer = NULL; 6450 VERIFY(--ipx->ipx_reentry_cnt == 0); 6451 ipx->ipx_ipsq_queued = B_FALSE; 6452 mutex_exit(&ipx->ipx_lock); 6453 6454 /* 6455 * Third, set ipx to point to our new xop, and check 6456 * if we can become writer on it. If we cannot, then 6457 * the current writer will drain the IPSQ group when 6458 * it exits. Our ipsq_xop is guaranteed to be stable 6459 * because we're still holding ipsq_lock. 6460 */ 6461 ipx = ipsq->ipsq_xop; 6462 mutex_enter(&ipx->ipx_lock); 6463 if (ipx->ipx_writer != NULL || 6464 ipx->ipx_current_ipif != NULL) { 6465 goto out; 6466 } 6467 } 6468 6469 /* 6470 * Fourth, become writer on our new ipx before we continue 6471 * with the drain. Note that we never dropped ipsq_lock 6472 * above, so no other thread could've raced with us to 6473 * become writer first. Also, we're holding ipx_lock, so 6474 * no other thread can examine the ipx right now. 6475 */ 6476 ASSERT(ipx->ipx_current_ipif == NULL); 6477 ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL); 6478 VERIFY(ipx->ipx_reentry_cnt++ == 0); 6479 ipx->ipx_writer = curthread; 6480 ipx->ipx_forced = B_FALSE; 6481 #ifdef DEBUG 6482 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); 6483 #endif 6484 } 6485 6486 xopipsq = ipsq; 6487 do { 6488 /* 6489 * So that other operations operate on a consistent and 6490 * complete phyint, a switch message on an IPSQ must be 6491 * handled prior to any other operations on that IPSQ. 6492 */ 6493 if ((mp = xopipsq->ipsq_switch_mp) != NULL) { 6494 xopipsq->ipsq_switch_mp = NULL; 6495 ASSERT(mp->b_next == NULL); 6496 mp->b_next = (void *)xopipsq; 6497 goto out; 6498 } 6499 6500 if ((mp = xopipsq->ipsq_xopq_mphead) != NULL) { 6501 xopipsq->ipsq_xopq_mphead = mp->b_next; 6502 if (xopipsq->ipsq_xopq_mphead == NULL) 6503 xopipsq->ipsq_xopq_mptail = NULL; 6504 mp->b_next = (void *)xopipsq; 6505 goto out; 6506 } 6507 } while ((xopipsq = xopipsq->ipsq_next) != ipsq); 6508 empty: 6509 /* 6510 * There are no messages. Further, we are holding ipx_lock, hence no 6511 * new messages can end up on any IPSQ in the xop. 6512 */ 6513 ipx->ipx_writer = NULL; 6514 ipx->ipx_forced = B_FALSE; 6515 VERIFY(--ipx->ipx_reentry_cnt == 0); 6516 ipx->ipx_ipsq_queued = B_FALSE; 6517 emptied = B_TRUE; 6518 #ifdef DEBUG 6519 ipx->ipx_depth = 0; 6520 #endif 6521 out: 6522 mutex_exit(&ipx->ipx_lock); 6523 mutex_exit(&ipsq->ipsq_lock); 6524 6525 /* 6526 * If we completely emptied the xop, then wake up any threads waiting 6527 * to enter any of the IPSQ's associated with it. 6528 */ 6529 if (emptied) { 6530 xopipsq = ipsq; 6531 do { 6532 if ((phyi = xopipsq->ipsq_phyint) == NULL) 6533 continue; 6534 6535 illv4 = phyi->phyint_illv4; 6536 illv6 = phyi->phyint_illv6; 6537 6538 GRAB_ILL_LOCKS(illv4, illv6); 6539 if (illv4 != NULL) 6540 cv_broadcast(&illv4->ill_cv); 6541 if (illv6 != NULL) 6542 cv_broadcast(&illv6->ill_cv); 6543 RELEASE_ILL_LOCKS(illv4, illv6); 6544 } while ((xopipsq = xopipsq->ipsq_next) != ipsq); 6545 } 6546 rw_exit(&ipst->ips_ill_g_lock); 6547 6548 /* 6549 * Now that all locks are dropped, exit the IPSQ we left. 6550 */ 6551 if (leftipsq != NULL) 6552 ipsq_exit(leftipsq); 6553 6554 return (mp); 6555 } 6556 6557 /* 6558 * Return completion status of previously initiated DLPI operations on 6559 * ills in the purview of an ipsq. 
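 * Returns B_TRUE only if, for each phyint in the ipsq's ring, neither
 * the v4 ill nor the v6 ill has a DLPI primitive outstanding (for v4
 * this includes any outstanding ARP DLPI request).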
6560 */
6561 static boolean_t
6562 ipsq_dlpi_done(ipsq_t *ipsq)
6563 {
6564 ipsq_t *ipsq_start;
6565 phyint_t *phyi;
6566 ill_t *ill;
6567
6568 ASSERT(RW_LOCK_HELD(&ipsq->ipsq_ipst->ips_ill_g_lock));
6569 ipsq_start = ipsq;
6570
6571 do {
6572 /*
6573 * The only current users of this function are ipsq_try_enter
6574 * and ipsq_enter, which have made sure that ipx_writer is
6575 * NULL before we reach here. ill_dlpi_pending is modified
6576 * only by an ipsq writer.
6577 */
6578 ASSERT(ipsq->ipsq_xop->ipx_writer == NULL);
6579 phyi = ipsq->ipsq_phyint;
6580 /*
6581 * phyi could be NULL if a phyint that is part of an
6582 * IPMP group is being unplumbed. A more detailed
6583 * comment is in ipmp_grp_update_kstats().
6584 */
6585 if (phyi != NULL) {
6586 ill = phyi->phyint_illv4;
6587 if (ill != NULL &&
6588 (ill->ill_dlpi_pending != DL_PRIM_INVAL ||
6589 ill->ill_arl_dlpi_pending))
6590 return (B_FALSE);
6591
6592 ill = phyi->phyint_illv6;
6593 if (ill != NULL &&
6594 ill->ill_dlpi_pending != DL_PRIM_INVAL)
6595 return (B_FALSE);
6596 }
6597
6598 } while ((ipsq = ipsq->ipsq_next) != ipsq_start);
6599
6600 return (B_TRUE);
6601 }
6602
6603 /*
6604 * Enter the ipsq corresponding to ill, by waiting synchronously till
6605 * we can enter the ipsq exclusively. Unless 'force' is used, the ipsq
6606 * will have to drain completely before ipsq_enter returns success.
6607 * ipx_current_ipif will be set if some exclusive op is in progress,
6608 * and the ipsq_exit logic will start the next enqueued op after
6609 * completion of the current op. If 'force' is used, we don't wait
6610 * for the enqueued ops. This is needed when a conn_close wants to
6611 * enter the ipsq and abort an ioctl that is somehow stuck. Unplumb
6612 * of an ill can also use this option. But we don't use it currently.
6613 */
6614 #define ENTER_SQ_WAIT_TICKS 100
6615 boolean_t
6616 ipsq_enter(ill_t *ill, boolean_t force, int type)
6617 {
6618 ipsq_t *ipsq;
6619 ipxop_t *ipx;
6620 boolean_t waited_enough = B_FALSE;
6621 ip_stack_t *ipst = ill->ill_ipst;
6622
6623 /*
6624 * Note that the relationship between ill and ipsq is fixed as long as
6625 * the ill is not ILL_CONDEMNED. Holding ipsq_lock ensures the
6626 * relationship between the IPSQ and xop cannot change. However,
6627 * since we cannot hold ipsq_lock across the cv_wait(), it may change
6628 * while we're waiting. We wait on ill_cv and rely on ipsq_exit()
6629 * waking up all ills in the xop when it becomes available.
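 *
 * A typical caller (sketch; `ill' must be refheld by the caller):
 *
 *	if (!ipsq_enter(ill, B_FALSE, NEW_OP))
 *		return;		(fails only if the ill is condemned)
 *	... perform the exclusive operation ...
 *	ipsq_exit(ill->ill_phyint->phyint_ipsq);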
6630 */ 6631 for (;;) { 6632 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 6633 mutex_enter(&ill->ill_lock); 6634 if (ill->ill_state_flags & ILL_CONDEMNED) { 6635 mutex_exit(&ill->ill_lock); 6636 rw_exit(&ipst->ips_ill_g_lock); 6637 return (B_FALSE); 6638 } 6639 6640 ipsq = ill->ill_phyint->phyint_ipsq; 6641 mutex_enter(&ipsq->ipsq_lock); 6642 ipx = ipsq->ipsq_xop; 6643 mutex_enter(&ipx->ipx_lock); 6644 6645 if (ipx->ipx_writer == NULL && (type == CUR_OP || 6646 (ipx->ipx_current_ipif == NULL && ipsq_dlpi_done(ipsq)) || 6647 waited_enough)) 6648 break; 6649 6650 rw_exit(&ipst->ips_ill_g_lock); 6651 6652 if (!force || ipx->ipx_writer != NULL) { 6653 mutex_exit(&ipx->ipx_lock); 6654 mutex_exit(&ipsq->ipsq_lock); 6655 cv_wait(&ill->ill_cv, &ill->ill_lock); 6656 } else { 6657 mutex_exit(&ipx->ipx_lock); 6658 mutex_exit(&ipsq->ipsq_lock); 6659 (void) cv_reltimedwait(&ill->ill_cv, 6660 &ill->ill_lock, ENTER_SQ_WAIT_TICKS, TR_CLOCK_TICK); 6661 waited_enough = B_TRUE; 6662 } 6663 mutex_exit(&ill->ill_lock); 6664 } 6665 6666 ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL); 6667 ASSERT(ipx->ipx_reentry_cnt == 0); 6668 ipx->ipx_writer = curthread; 6669 ipx->ipx_forced = (ipx->ipx_current_ipif != NULL); 6670 ipx->ipx_reentry_cnt++; 6671 #ifdef DEBUG 6672 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); 6673 #endif 6674 mutex_exit(&ipx->ipx_lock); 6675 mutex_exit(&ipsq->ipsq_lock); 6676 mutex_exit(&ill->ill_lock); 6677 rw_exit(&ipst->ips_ill_g_lock); 6678 6679 return (B_TRUE); 6680 } 6681 6682 /* 6683 * ipif_set_values() has a constraint that it cannot drop the ips_ill_g_lock 6684 * across the call to the core interface ipsq_try_enter() and hence calls this 6685 * function directly. This is explained more fully in ipif_set_values(). 6686 * In order to support the above constraint, ipsq_try_enter is implemented as 6687 * a wrapper that grabs the ips_ill_g_lock and calls this function subsequently 6688 */ 6689 static ipsq_t * 6690 ipsq_try_enter_internal(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, 6691 int type, boolean_t reentry_ok) 6692 { 6693 ipsq_t *ipsq; 6694 ipxop_t *ipx; 6695 ip_stack_t *ipst = ill->ill_ipst; 6696 6697 /* 6698 * lock ordering: 6699 * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock. 6700 * 6701 * ipx of an ipsq can't change when ipsq_lock is held. 6702 */ 6703 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 6704 GRAB_CONN_LOCK(q); 6705 mutex_enter(&ill->ill_lock); 6706 ipsq = ill->ill_phyint->phyint_ipsq; 6707 mutex_enter(&ipsq->ipsq_lock); 6708 ipx = ipsq->ipsq_xop; 6709 mutex_enter(&ipx->ipx_lock); 6710 6711 /* 6712 * 1. Enter the ipsq if we are already writer and reentry is ok. 6713 * (Note: If the caller does not specify reentry_ok then neither 6714 * 'func' nor any of its callees must ever attempt to enter the ipsq 6715 * again. Otherwise it can lead to an infinite loop 6716 * 2. Enter the ipsq if there is no current writer and this attempted 6717 * entry is part of the current operation 6718 * 3. Enter the ipsq if there is no current writer and this is a new 6719 * operation and the operation queue is empty and there is no 6720 * operation currently in progress and if all previously initiated 6721 * DLPI operations have completed. 6722 */ 6723 if ((ipx->ipx_writer == curthread && reentry_ok) || 6724 (ipx->ipx_writer == NULL && (type == CUR_OP || (type == NEW_OP && 6725 !ipx->ipx_ipsq_queued && ipx->ipx_current_ipif == NULL && 6726 ipsq_dlpi_done(ipsq))))) { 6727 /* Success. 
*/
6728 ipx->ipx_reentry_cnt++;
6729 ipx->ipx_writer = curthread;
6730 ipx->ipx_forced = B_FALSE;
6731 mutex_exit(&ipx->ipx_lock);
6732 mutex_exit(&ipsq->ipsq_lock);
6733 mutex_exit(&ill->ill_lock);
6734 RELEASE_CONN_LOCK(q);
6735 #ifdef DEBUG
6736 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
6737 #endif
6738 return (ipsq);
6739 }
6740
6741 if (func != NULL)
6742 ipsq_enq(ipsq, q, mp, func, type, ill);
6743
6744 mutex_exit(&ipx->ipx_lock);
6745 mutex_exit(&ipsq->ipsq_lock);
6746 mutex_exit(&ill->ill_lock);
6747 RELEASE_CONN_LOCK(q);
6748 return (NULL);
6749 }
6750
6751 /*
6752 * The ipsq_t (ipsq) is the synchronization data structure used to serialize
6753 * certain critical operations like plumbing (i.e. most set ioctls), etc.
6754 * There is one ipsq per phyint. The ipsq
6755 * serializes exclusive ioctls issued by applications on a per ipsq basis in
6756 * ipsq_xopq_mphead. It also protects against multiple threads executing in
6757 * the ipsq. Responses from the driver pertain to the current ioctl (say a
6758 * DL_BIND_ACK in response to a DL_BIND_REQ initiated as part of bringing
6759 * up the interface) and are enqueued in ipx_mphead.
6760 *
6761 * If a thread does not want to reenter the ipsq when it is already writer,
6762 * it must make sure that neither the specified reentry point, to be called
6763 * later when the ipsq is empty, nor any code path starting from that reentry
6764 * point ever tries to enter the ipsq again. Otherwise it can lead
6765 * to an infinite loop. The reentry point ip_rput_dlpi_writer is an example.
6766 * When the thread that is currently exclusive finishes, it (ipsq_exit)
6767 * dequeues the requests waiting to become exclusive in ipx_mphead and calls
6768 * the reentry point. When the list at ipx_mphead becomes empty ipsq_exit
6769 * proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next
6770 * ioctl if the current ioctl has completed. If the current ioctl is still
6771 * in progress it simply returns. The current ioctl could be waiting for
6772 * a response from another module (the driver) or could be waiting for
6773 * the ipif/ill/ire refcnts to drop to zero. In such a case the ipx_pending_mp
6774 * and ipx_pending_ipif are set. ipx_current_ipif is set throughout the
6775 * execution of the ioctl and ipsq_exit does not start the next ioctl unless
6776 * ipx_current_ipif is NULL, which happens only once the ioctl is complete and
6777 * all associated DLPI operations have completed.
6778 */
6779
6780 /*
6781 * Try to enter the IPSQ corresponding to `ipif' or `ill' exclusively (`ipif'
6782 * and `ill' cannot both be specified). Returns a pointer to the entered IPSQ
6783 * on success, or NULL on failure. The caller ensures ipif/ill is valid by
6784 * refholding it as necessary. If the IPSQ cannot be entered and `func' is
6785 * non-NULL, then `func' will be called back with `q' and `mp' once the IPSQ
6786 * can be entered. If `func' is NULL, then `q' and `mp' are ignored.
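 *
 * For example (sketch only), the ioctl path conceptually does:
 *
 *	ipsq = ipsq_try_enter(ipif, NULL, q, mp, ip_process_ioctl,
 *	    NEW_OP, B_TRUE);
 *	if (ipsq == NULL)
 *		return;		(ip_process_ioctl will be called back)
 *	... we are now exclusive; process the ioctl ...
 *	ipsq_exit(ipsq);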
6787 */ 6788 ipsq_t * 6789 ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp, 6790 ipsq_func_t func, int type, boolean_t reentry_ok) 6791 { 6792 ip_stack_t *ipst; 6793 ipsq_t *ipsq; 6794 6795 /* Only 1 of ipif or ill can be specified */ 6796 ASSERT((ipif != NULL) ^ (ill != NULL)); 6797 6798 if (ipif != NULL) 6799 ill = ipif->ipif_ill; 6800 ipst = ill->ill_ipst; 6801 6802 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 6803 ipsq = ipsq_try_enter_internal(ill, q, mp, func, type, reentry_ok); 6804 rw_exit(&ipst->ips_ill_g_lock); 6805 6806 return (ipsq); 6807 } 6808 6809 /* 6810 * Try to enter the IPSQ corresponding to `ill' as writer. The caller ensures 6811 * ill is valid by refholding it if necessary; we will refrele. If the IPSQ 6812 * cannot be entered, the mp is queued for completion. 6813 */ 6814 void 6815 qwriter_ip(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, 6816 boolean_t reentry_ok) 6817 { 6818 ipsq_t *ipsq; 6819 6820 ipsq = ipsq_try_enter(NULL, ill, q, mp, func, type, reentry_ok); 6821 6822 /* 6823 * Drop the caller's refhold on the ill. This is safe since we either 6824 * entered the IPSQ (and thus are exclusive), or failed to enter the 6825 * IPSQ, in which case we return without accessing ill anymore. This 6826 * is needed because func needs to see the correct refcount. 6827 * e.g. removeif can work only then. 6828 */ 6829 ill_refrele(ill); 6830 if (ipsq != NULL) { 6831 (*func)(ipsq, q, mp, NULL); 6832 ipsq_exit(ipsq); 6833 } 6834 } 6835 6836 /* 6837 * Exit the specified IPSQ. If this is the final exit on it then drain it 6838 * prior to exiting. Caller must be writer on the specified IPSQ. 6839 */ 6840 void 6841 ipsq_exit(ipsq_t *ipsq) 6842 { 6843 mblk_t *mp; 6844 ipsq_t *mp_ipsq; 6845 queue_t *q; 6846 phyint_t *phyi; 6847 ipsq_func_t func; 6848 6849 ASSERT(IAM_WRITER_IPSQ(ipsq)); 6850 6851 ASSERT(ipsq->ipsq_xop->ipx_reentry_cnt >= 1); 6852 if (ipsq->ipsq_xop->ipx_reentry_cnt != 1) { 6853 ipsq->ipsq_xop->ipx_reentry_cnt--; 6854 return; 6855 } 6856 6857 for (;;) { 6858 phyi = ipsq->ipsq_phyint; 6859 mp = ipsq_dq(ipsq); 6860 mp_ipsq = (mp == NULL) ? NULL : (ipsq_t *)mp->b_next; 6861 6862 /* 6863 * If we've changed to a new IPSQ, and the phyint associated 6864 * with the old one has gone away, free the old IPSQ. Note 6865 * that this cannot happen while the IPSQ is in a group. 6866 */ 6867 if (mp_ipsq != ipsq && phyi == NULL) { 6868 ASSERT(ipsq->ipsq_next == ipsq); 6869 ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop); 6870 ipsq_delete(ipsq); 6871 } 6872 6873 if (mp == NULL) 6874 break; 6875 6876 q = mp->b_queue; 6877 func = (ipsq_func_t)mp->b_prev; 6878 ipsq = mp_ipsq; 6879 mp->b_next = mp->b_prev = NULL; 6880 mp->b_queue = NULL; 6881 6882 /* 6883 * If 'q' is an conn queue, it is valid, since we did a 6884 * a refhold on the conn at the start of the ioctl. 6885 * If 'q' is an ill queue, it is valid, since close of an 6886 * ill will clean up its IPSQ. 6887 */ 6888 (*func)(ipsq, q, mp, NULL); 6889 } 6890 } 6891 6892 /* 6893 * Used to start any igmp or mld timers that could not be started 6894 * while holding ill_mcast_lock. The timers can't be started while holding 6895 * the lock, since mld/igmp_start_timers may need to call untimeout() 6896 * which can't be done while holding the lock which the timeout handler 6897 * acquires. Otherwise 6898 * there could be a deadlock since the timeout handlers 6899 * mld_timeout_handler_per_ill/igmp_timeout_handler_per_ill also acquire 6900 * ill_mcast_lock. 
6901 */
6902 void
6903 ill_mcast_timer_start(ip_stack_t *ipst)
6904 {
6905 int next;
6906
6907 mutex_enter(&ipst->ips_igmp_timer_lock);
6908 next = ipst->ips_igmp_deferred_next;
6909 ipst->ips_igmp_deferred_next = INFINITY;
6910 mutex_exit(&ipst->ips_igmp_timer_lock);
6911
6912 if (next != INFINITY)
6913 igmp_start_timers(next, ipst);
6914
6915 mutex_enter(&ipst->ips_mld_timer_lock);
6916 next = ipst->ips_mld_deferred_next;
6917 ipst->ips_mld_deferred_next = INFINITY;
6918 mutex_exit(&ipst->ips_mld_timer_lock);
6919
6920 if (next != INFINITY)
6921 mld_start_timers(next, ipst);
6922 }
6923
6924 /*
6925 * Start the current exclusive operation on `ipsq'; associate it with `ipif'
6926 * and `ioccmd'.
6927 */
6928 void
6929 ipsq_current_start(ipsq_t *ipsq, ipif_t *ipif, int ioccmd)
6930 {
6931 ill_t *ill = ipif->ipif_ill;
6932 ipxop_t *ipx = ipsq->ipsq_xop;
6933
6934 ASSERT(IAM_WRITER_IPSQ(ipsq));
6935 ASSERT(ipx->ipx_current_ipif == NULL);
6936 ASSERT(ipx->ipx_current_ioctl == 0);
6937
6938 ipx->ipx_current_done = B_FALSE;
6939 ipx->ipx_current_ioctl = ioccmd;
6940 mutex_enter(&ipx->ipx_lock);
6941 ipx->ipx_current_ipif = ipif;
6942 mutex_exit(&ipx->ipx_lock);
6943
6944 /*
6945 * Set IPIF_CHANGING on one or more ipifs associated with the
6946 * current exclusive operation. IPIF_CHANGING prevents any new
6947 * references to the ipif (so that the references will eventually
6948 * drop to zero) and also prevents any "get" operations (e.g.,
6949 * SIOCGLIFFLAGS) from being able to access the ipif until the
6950 * operation has completed and the ipif is again in a stable state.
6951 *
6952 * For ioctls, IPIF_CHANGING is set on the ipif associated with the
6953 * ioctl. For internal operations (where ioccmd is zero), all ipifs
6954 * on the ill are marked with IPIF_CHANGING since it's unclear which
6955 * ipifs will be affected.
6956 *
6957 * Note that SIOCLIFREMOVEIF is a special case as it sets
6958 * IPIF_CONDEMNED internally after identifying the right ipif to
6959 * operate on.
6960 */
6961 switch (ioccmd) {
6962 case SIOCLIFREMOVEIF:
6963 break;
6964 case 0:
6965 mutex_enter(&ill->ill_lock);
6966 ipif = ipif->ipif_ill->ill_ipif;
6967 for (; ipif != NULL; ipif = ipif->ipif_next)
6968 ipif->ipif_state_flags |= IPIF_CHANGING;
6969 mutex_exit(&ill->ill_lock);
6970 break;
6971 default:
6972 mutex_enter(&ill->ill_lock);
6973 ipif->ipif_state_flags |= IPIF_CHANGING;
6974 mutex_exit(&ill->ill_lock);
6975 }
6976 }
6977
6978 /*
6979 * Finish the current exclusive operation on `ipsq'. Usually, this will allow
6980 * the next exclusive operation to begin once we ipsq_exit(). However, if
6981 * pending DLPI operations remain, then we will wait for the queue to drain
6982 * before allowing the next exclusive operation to begin. This ensures that
6983 * DLPI operations from one exclusive operation are never improperly processed
6984 * as part of a subsequent exclusive operation.
6985 */
6986 void
6987 ipsq_current_finish(ipsq_t *ipsq)
6988 {
6989 ipxop_t *ipx = ipsq->ipsq_xop;
6990 t_uscalar_t dlpi_pending = DL_PRIM_INVAL;
6991 ipif_t *ipif = ipx->ipx_current_ipif;
6992
6993 ASSERT(IAM_WRITER_IPSQ(ipsq));
6994
6995 /*
6996 * For SIOCLIFREMOVEIF, the ipif has already been blown away
6997 * (but in that case, IPIF_CHANGING will already be clear and no
6998 * pending DLPI messages can remain).
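 * Consequently `ipif' must not be dereferenced in that case, which is
 * why the block below is skipped for SIOCLIFREMOVEIF.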
6999 */
7000 if (ipx->ipx_current_ioctl != SIOCLIFREMOVEIF) {
7001 ill_t *ill = ipif->ipif_ill;
7002
7003 mutex_enter(&ill->ill_lock);
7004 dlpi_pending = ill->ill_dlpi_pending;
7005 if (ipx->ipx_current_ioctl == 0) {
7006 ipif = ill->ill_ipif;
7007 for (; ipif != NULL; ipif = ipif->ipif_next)
7008 ipif->ipif_state_flags &= ~IPIF_CHANGING;
7009 } else {
7010 ipif->ipif_state_flags &= ~IPIF_CHANGING;
7011 }
7012 mutex_exit(&ill->ill_lock);
7013 }
7014
7015 ASSERT(!ipx->ipx_current_done);
7016 ipx->ipx_current_done = B_TRUE;
7017 ipx->ipx_current_ioctl = 0;
7018 if (dlpi_pending == DL_PRIM_INVAL) {
7019 mutex_enter(&ipx->ipx_lock);
7020 ipx->ipx_current_ipif = NULL;
7021 mutex_exit(&ipx->ipx_lock);
7022 }
7023 }
7024
7025 /*
7026 * The ill is closing. Flush all messages on the ipsq that originated
7027 * from this ill. Usually there won't be any messages on the ipsq_xopq_mphead
7028 * for this ill since ipsq_enter could not have entered until then.
7029 * New messages can't be queued since the CONDEMNED flag is set.
7030 */
7031 static void
7032 ipsq_flush(ill_t *ill)
7033 {
7034 queue_t *q;
7035 mblk_t *prev;
7036 mblk_t *mp;
7037 mblk_t *mp_next;
7038 ipxop_t *ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop;
7039
7040 ASSERT(IAM_WRITER_ILL(ill));
7041
7042 /*
7043 * Flush any messages sent up by the driver.
7044 */
7045 mutex_enter(&ipx->ipx_lock);
7046 for (prev = NULL, mp = ipx->ipx_mphead; mp != NULL; mp = mp_next) {
7047 mp_next = mp->b_next;
7048 q = mp->b_queue;
7049 if (q == ill->ill_rq || q == ill->ill_wq) {
7050 /* dequeue mp */
7051 if (prev == NULL)
7052 ipx->ipx_mphead = mp->b_next;
7053 else
7054 prev->b_next = mp->b_next;
7055 if (ipx->ipx_mptail == mp) {
7056 ASSERT(mp_next == NULL);
7057 ipx->ipx_mptail = prev;
7058 }
7059 inet_freemsg(mp);
7060 } else {
7061 prev = mp;
7062 }
7063 }
7064 mutex_exit(&ipx->ipx_lock);
7065 (void) ipsq_pending_mp_cleanup(ill, NULL);
7066 ipsq_xopq_mp_cleanup(ill, NULL);
7067 }
7068
7069 /*
7070 * Parse an ifreq or lifreq struct coming down in an ioctl; refhold
7071 * and return the associated ipif.
7072 * Return value:
7073 * Non-zero: An error has occurred. ci may not be filled out.
7074 * Zero: ci is filled out with the ioctl cmd in ci.ci_name, and
7075 * a held ipif in ci.ci_ipif.
7076 */
7077 int
7078 ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
7079 cmd_info_t *ci)
7080 {
7081 char *name;
7082 struct ifreq *ifr;
7083 struct lifreq *lifr;
7084 ipif_t *ipif = NULL;
7085 ill_t *ill;
7086 conn_t *connp;
7087 boolean_t isv6;
7088 boolean_t exists;
7089 mblk_t *mp1;
7090 zoneid_t zoneid;
7091 ip_stack_t *ipst;
7092
7093 if (q->q_next != NULL) {
7094 ill = (ill_t *)q->q_ptr;
7095 isv6 = ill->ill_isv6;
7096 connp = NULL;
7097 zoneid = ALL_ZONES;
7098 ipst = ill->ill_ipst;
7099 } else {
7100 ill = NULL;
7101 connp = Q_TO_CONN(q);
7102 isv6 = (connp->conn_family == AF_INET6);
7103 zoneid = connp->conn_zoneid;
7104 if (zoneid == GLOBAL_ZONEID) {
7105 /* global zone can access ipifs in all zones */
7106 zoneid = ALL_ZONES;
7107 }
7108 ipst = connp->conn_netstack->netstack_ip;
7109 }
7110
7111 /* Has been checked in ip_wput_nondata */
7112 mp1 = mp->b_cont->b_cont;
7113
7114 if (ipip->ipi_cmd_type == IF_CMD) {
7115 /* This is an old style SIOC[GS]IF* command */
7116 ifr = (struct ifreq *)mp1->b_rptr;
7117 /*
7118 * Null terminate the string to protect against buffer
7119 * overrun. String was generated by user code and may not
7120 * be trusted.
7121 */
7122 ifr->ifr_name[IFNAMSIZ - 1] = '\0';
7123 name = ifr->ifr_name;
7124 ci->ci_sin = (sin_t *)&ifr->ifr_addr;
7125 ci->ci_sin6 = NULL;
7126 ci->ci_lifr = (struct lifreq *)ifr;
7127 } else {
7128 /* This is a new style SIOC[GS]LIF* command */
7129 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
7130 lifr = (struct lifreq *)mp1->b_rptr;
7131 /*
7132 * Null terminate the string to protect against buffer
7133 * overrun. String was generated by user code and may not
7134 * be trusted.
7135 */
7136 lifr->lifr_name[LIFNAMSIZ - 1] = '\0';
7137 name = lifr->lifr_name;
7138 ci->ci_sin = (sin_t *)&lifr->lifr_addr;
7139 ci->ci_sin6 = (sin6_t *)&lifr->lifr_addr;
7140 ci->ci_lifr = lifr;
7141 }
7142
7143 if (ipip->ipi_cmd == SIOCSLIFNAME) {
7144 /*
7145 * The ioctl will fail if it comes down a conn stream
7146 * rather than an ill stream.
7147 */
7148 if (ill == NULL) {
7149 /*
7150 * Not an ill queue, so SIOCSLIFNAME cannot be
7151 * handled here; fail with ENXIO.
7152 */
7153 return (ENXIO);
7154 }
7155 ipif = ill->ill_ipif;
7156 ipif_refhold(ipif);
7157 } else {
7158 ipif = ipif_lookup_on_name(name, mi_strlen(name), B_FALSE,
7159 &exists, isv6, zoneid, ipst);
7160
7161 /*
7162 * Ensure that get ioctls don't see any internal state changes
7163 * caused by set ioctls by deferring them if IPIF_CHANGING is
7164 * set.
7165 */
7166 if (ipif != NULL && !(ipip->ipi_flags & IPI_WR) &&
7167 !IAM_WRITER_IPIF(ipif)) {
7168 ipsq_t *ipsq;
7169
7170 if (connp != NULL)
7171 mutex_enter(&connp->conn_lock);
7172 mutex_enter(&ipif->ipif_ill->ill_lock);
7173 if (IPIF_IS_CHANGING(ipif) &&
7174 !IPIF_IS_CONDEMNED(ipif)) {
7175 ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq;
7176 mutex_enter(&ipsq->ipsq_lock);
7177 mutex_enter(&ipsq->ipsq_xop->ipx_lock);
7178 mutex_exit(&ipif->ipif_ill->ill_lock);
7179 ipsq_enq(ipsq, q, mp, ip_process_ioctl,
7180 NEW_OP, ipif->ipif_ill);
7181 mutex_exit(&ipsq->ipsq_xop->ipx_lock);
7182 mutex_exit(&ipsq->ipsq_lock);
7183 if (connp != NULL)
7184 mutex_exit(&connp->conn_lock);
7185 ipif_refrele(ipif);
7186 return (EINPROGRESS);
7187 }
7188 mutex_exit(&ipif->ipif_ill->ill_lock);
7189 if (connp != NULL)
7190 mutex_exit(&connp->conn_lock);
7191 }
7192 }
7193
7194 /*
7195 * The old style [GS]IF* commands do not admit an IPv6 ipif
7196 */
7197 if (ipif != NULL && ipif->ipif_isv6 && ipip->ipi_cmd_type == IF_CMD) {
7198 ipif_refrele(ipif);
7199 return (ENXIO);
7200 }
7201
7202 if (ipif == NULL && ill != NULL && ill->ill_ipif != NULL &&
7203 name[0] == '\0') {
7204 /*
7205 * Handle a SIOC?IF* ioctl with a null name
7206 * during plumb (on the ill queue before the I_PLINK).
7207 */
7208 ipif = ill->ill_ipif;
7209 ipif_refhold(ipif);
7210 }
7211
7212 if (ipif == NULL)
7213 return (ENXIO);
7214
7215 DTRACE_PROBE4(ipif__ioctl, char *, "ip_extract_lifreq",
7216 int, ipip->ipi_cmd, ill_t *, ipif->ipif_ill, ipif_t *, ipif);
7217
7218 ci->ci_ipif = ipif;
7219 return (0);
7220 }
7221
7222 /*
7223 * Return the total number of ipifs.
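 * Only ipifs visible to `zoneid' (or in ALL_ZONES) on IPv4 ills that
 * are not IPMP underlying interfaces are counted.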
7224 */ 7225 static uint_t 7226 ip_get_numifs(zoneid_t zoneid, ip_stack_t *ipst) 7227 { 7228 uint_t numifs = 0; 7229 ill_t *ill; 7230 ill_walk_context_t ctx; 7231 ipif_t *ipif; 7232 7233 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7234 ill = ILL_START_WALK_V4(&ctx, ipst); 7235 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 7236 if (IS_UNDER_IPMP(ill)) 7237 continue; 7238 for (ipif = ill->ill_ipif; ipif != NULL; 7239 ipif = ipif->ipif_next) { 7240 if (ipif->ipif_zoneid == zoneid || 7241 ipif->ipif_zoneid == ALL_ZONES) 7242 numifs++; 7243 } 7244 } 7245 rw_exit(&ipst->ips_ill_g_lock); 7246 return (numifs); 7247 } 7248 7249 /* 7250 * Return the total number of ipifs. 7251 */ 7252 static uint_t 7253 ip_get_numlifs(int family, int lifn_flags, zoneid_t zoneid, ip_stack_t *ipst) 7254 { 7255 uint_t numifs = 0; 7256 ill_t *ill; 7257 ipif_t *ipif; 7258 ill_walk_context_t ctx; 7259 7260 ip1dbg(("ip_get_numlifs(%d %u %d)\n", family, lifn_flags, (int)zoneid)); 7261 7262 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7263 if (family == AF_INET) 7264 ill = ILL_START_WALK_V4(&ctx, ipst); 7265 else if (family == AF_INET6) 7266 ill = ILL_START_WALK_V6(&ctx, ipst); 7267 else 7268 ill = ILL_START_WALK_ALL(&ctx, ipst); 7269 7270 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 7271 if (IS_UNDER_IPMP(ill) && !(lifn_flags & LIFC_UNDER_IPMP)) 7272 continue; 7273 7274 for (ipif = ill->ill_ipif; ipif != NULL; 7275 ipif = ipif->ipif_next) { 7276 if ((ipif->ipif_flags & IPIF_NOXMIT) && 7277 !(lifn_flags & LIFC_NOXMIT)) 7278 continue; 7279 if ((ipif->ipif_flags & IPIF_TEMPORARY) && 7280 !(lifn_flags & LIFC_TEMPORARY)) 7281 continue; 7282 if (((ipif->ipif_flags & 7283 (IPIF_NOXMIT|IPIF_NOLOCAL| 7284 IPIF_DEPRECATED)) || 7285 IS_LOOPBACK(ill) || 7286 !(ipif->ipif_flags & IPIF_UP)) && 7287 (lifn_flags & LIFC_EXTERNAL_SOURCE)) 7288 continue; 7289 7290 if (zoneid != ipif->ipif_zoneid && 7291 ipif->ipif_zoneid != ALL_ZONES && 7292 (zoneid != GLOBAL_ZONEID || 7293 !(lifn_flags & LIFC_ALLZONES))) 7294 continue; 7295 7296 numifs++; 7297 } 7298 } 7299 rw_exit(&ipst->ips_ill_g_lock); 7300 return (numifs); 7301 } 7302 7303 uint_t 7304 ip_get_lifsrcofnum(ill_t *ill) 7305 { 7306 uint_t numifs = 0; 7307 ill_t *ill_head = ill; 7308 ip_stack_t *ipst = ill->ill_ipst; 7309 7310 /* 7311 * ill_g_usesrc_lock protects ill_usesrc_grp_next, for example, some 7312 * other thread may be trying to relink the ILLs in this usesrc group 7313 * and adjusting the ill_usesrc_grp_next pointers 7314 */ 7315 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER); 7316 if ((ill->ill_usesrc_ifindex == 0) && 7317 (ill->ill_usesrc_grp_next != NULL)) { 7318 for (; (ill != NULL) && (ill->ill_usesrc_grp_next != ill_head); 7319 ill = ill->ill_usesrc_grp_next) 7320 numifs++; 7321 } 7322 rw_exit(&ipst->ips_ill_g_usesrc_lock); 7323 7324 return (numifs); 7325 } 7326 7327 /* Null values are passed in for ipif, sin, and ifreq */ 7328 /* ARGSUSED */ 7329 int 7330 ip_sioctl_get_ifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 7331 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7332 { 7333 int *nump; 7334 conn_t *connp = Q_TO_CONN(q); 7335 7336 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 7337 7338 /* Existence of b_cont->b_cont checked in ip_wput_nondata */ 7339 nump = (int *)mp->b_cont->b_cont->b_rptr; 7340 7341 *nump = ip_get_numifs(connp->conn_zoneid, 7342 connp->conn_netstack->netstack_ip); 7343 ip1dbg(("ip_sioctl_get_ifnum numifs %d", *nump)); 7344 return (0); 7345 } 7346 7347 /* Null values are passed in for ipif, sin, and ifreq */ 7348 /* 
ARGSUSED */
7349 int
7350 ip_sioctl_get_lifnum(ipif_t *dummy_ipif, sin_t *dummy_sin,
7351 queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
7352 {
7353 struct lifnum *lifn;
7354 mblk_t *mp1;
7355 conn_t *connp = Q_TO_CONN(q);
7356
7357 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */
7358
7359 /* Existence checked in ip_wput_nondata */
7360 mp1 = mp->b_cont->b_cont;
7361
7362 lifn = (struct lifnum *)mp1->b_rptr;
7363 switch (lifn->lifn_family) {
7364 case AF_UNSPEC:
7365 case AF_INET:
7366 case AF_INET6:
7367 break;
7368 default:
7369 return (EAFNOSUPPORT);
7370 }
7371
7372 lifn->lifn_count = ip_get_numlifs(lifn->lifn_family, lifn->lifn_flags,
7373 connp->conn_zoneid, connp->conn_netstack->netstack_ip);
7374 ip1dbg(("ip_sioctl_get_lifnum numifs %d", lifn->lifn_count));
7375 return (0);
7376 }
7377
7378 /* ARGSUSED */
7379 int
7380 ip_sioctl_get_ifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
7381 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
7382 {
7383 STRUCT_HANDLE(ifconf, ifc);
7384 mblk_t *mp1;
7385 struct iocblk *iocp;
7386 struct ifreq *ifr;
7387 ill_walk_context_t ctx;
7388 ill_t *ill;
7389 ipif_t *ipif;
7390 struct sockaddr_in *sin;
7391 int32_t ifclen;
7392 zoneid_t zoneid;
7393 ip_stack_t *ipst = CONNQ_TO_IPST(q);
7394
7395 ASSERT(q->q_next == NULL); /* not valid ioctls for ip as a module */
7396
7397 ip1dbg(("ip_sioctl_get_ifconf"));
7398 /* Existence verified in ip_wput_nondata */
7399 mp1 = mp->b_cont->b_cont;
7400 iocp = (struct iocblk *)mp->b_rptr;
7401 zoneid = Q_TO_CONN(q)->conn_zoneid;
7402
7403 /*
7404 * The original SIOCGIFCONF passed in a struct ifconf which specified
7405 * the user buffer address and length into which the list of struct
7406 * ifreqs was to be copied. Since AT&T Streams does not seem to
7407 * allow M_COPYOUT to be used in conjunction with I_STR IOCTLS,
7408 * the SIOCGIFCONF operation was redefined to simply provide
7409 * a large output buffer into which we are supposed to jam the ifreq
7410 * array. The same ioctl command code was used, despite the fact that
7411 * both the applications and the kernel code had to change, thus making
7412 * it impossible to support both interfaces.
7413 *
7414 * For reasons not good enough to try to explain, the following
7415 * algorithm is used for deciding what to do with one of these:
7416 * If the IOCTL comes in as an I_STR, it is assumed to be of the new
7417 * form with the output buffer coming down as the continuation message.
7418 * If it arrives as a TRANSPARENT IOCTL, it is assumed to be old style,
7419 * and we have to copy in the ifconf structure to find out how big the
7420 * output buffer is and where to copy out to. Sure no problem...
7421 *
7422 */
7423 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, NULL);
7424 if ((mp1->b_wptr - mp1->b_rptr) == STRUCT_SIZE(ifc)) {
7425 int numifs = 0;
7426 size_t ifc_bufsize;
7427
7428 /*
7429 * Must be (better be!) continuation of a TRANSPARENT
7430 * IOCTL. We just copied in the ifconf structure.
7431 */
7432 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag,
7433 (struct ifconf *)mp1->b_rptr);
7434
7435 /*
7436 * Allocate a buffer to hold requested information.
7437 *
7438 * If ifc_len is larger than what is needed, we only
7439 * allocate what we will use.
7440 *
7441 * If ifc_len is smaller than what is needed, return
7442 * EINVAL.
7443 *
7444 * XXX: the ill_t structure can have 2 counters, for
7445 * v4 and v6 (not just ill_ipif_up_count) to store the
7446 * number of interfaces for a device, so we don't need
7447 * to count them here...
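 *
 * For example, with numifs == 3 the code below computes ifc_bufsize as
 * 3 * sizeof (struct ifreq); a caller whose ifc_len is smaller than
 * that either gets EINVAL (old O_SIOCGIFCONF behaviour) or a listing
 * truncated to the buffer it supplied.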
7448 */ 7449 numifs = ip_get_numifs(zoneid, ipst); 7450 7451 ifclen = STRUCT_FGET(ifc, ifc_len); 7452 ifc_bufsize = numifs * sizeof (struct ifreq); 7453 if (ifc_bufsize > ifclen) { 7454 if (iocp->ioc_cmd == O_SIOCGIFCONF) { 7455 /* old behaviour */ 7456 return (EINVAL); 7457 } else { 7458 ifc_bufsize = ifclen; 7459 } 7460 } 7461 7462 mp1 = mi_copyout_alloc(q, mp, 7463 STRUCT_FGETP(ifc, ifc_buf), ifc_bufsize, B_FALSE); 7464 if (mp1 == NULL) 7465 return (ENOMEM); 7466 7467 mp1->b_wptr = mp1->b_rptr + ifc_bufsize; 7468 } 7469 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr); 7470 /* 7471 * the SIOCGIFCONF ioctl only knows about 7472 * IPv4 addresses, so don't try to tell 7473 * it about interfaces with IPv6-only 7474 * addresses. (Last parm 'isv6' is B_FALSE) 7475 */ 7476 7477 ifr = (struct ifreq *)mp1->b_rptr; 7478 7479 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7480 ill = ILL_START_WALK_V4(&ctx, ipst); 7481 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 7482 if (IS_UNDER_IPMP(ill)) 7483 continue; 7484 for (ipif = ill->ill_ipif; ipif != NULL; 7485 ipif = ipif->ipif_next) { 7486 if (zoneid != ipif->ipif_zoneid && 7487 ipif->ipif_zoneid != ALL_ZONES) 7488 continue; 7489 if ((uchar_t *)&ifr[1] > mp1->b_wptr) { 7490 if (iocp->ioc_cmd == O_SIOCGIFCONF) { 7491 /* old behaviour */ 7492 rw_exit(&ipst->ips_ill_g_lock); 7493 return (EINVAL); 7494 } else { 7495 goto if_copydone; 7496 } 7497 } 7498 ipif_get_name(ipif, ifr->ifr_name, 7499 sizeof (ifr->ifr_name)); 7500 sin = (sin_t *)&ifr->ifr_addr; 7501 *sin = sin_null; 7502 sin->sin_family = AF_INET; 7503 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 7504 ifr++; 7505 } 7506 } 7507 if_copydone: 7508 rw_exit(&ipst->ips_ill_g_lock); 7509 mp1->b_wptr = (uchar_t *)ifr; 7510 7511 if (STRUCT_BUF(ifc) != NULL) { 7512 STRUCT_FSET(ifc, ifc_len, 7513 (int)((uchar_t *)ifr - mp1->b_rptr)); 7514 } 7515 return (0); 7516 } 7517 7518 /* 7519 * Get the interfaces that use the address hosted on the interface passed 7520 * in as a source address. 7521 */ 7522 /* ARGSUSED */ 7523 int 7524 ip_sioctl_get_lifsrcof(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 7525 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7526 { 7527 mblk_t *mp1; 7528 ill_t *ill, *ill_head; 7529 ipif_t *ipif, *orig_ipif; 7530 int numlifs = 0; 7531 size_t lifs_bufsize, lifsmaxlen; 7532 struct lifreq *lifr; 7533 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 7534 uint_t ifindex; 7535 zoneid_t zoneid; 7536 boolean_t isv6 = B_FALSE; 7537 struct sockaddr_in *sin; 7538 struct sockaddr_in6 *sin6; 7539 STRUCT_HANDLE(lifsrcof, lifs); 7540 ip_stack_t *ipst; 7541 7542 ipst = CONNQ_TO_IPST(q); 7543 7544 ASSERT(q->q_next == NULL); 7545 7546 zoneid = Q_TO_CONN(q)->conn_zoneid; 7547 7548 /* Existence verified in ip_wput_nondata */ 7549 mp1 = mp->b_cont->b_cont; 7550 7551 /* 7552 * Must be (better be!) continuation of a TRANSPARENT 7553 * IOCTL. We just copied in the lifsrcof structure.
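 *
 * (A hedged sketch of the hypothetical userland caller, for
 * illustration; the interface name and buffer handling here are
 * assumptions:
 *
 *	struct lifsrcof lifs;
 *
 *	lifs.lifs_ifindex = if_nametoindex("vni0");
 *	lifs.lifs_maxlen = bufsize;
 *	lifs.lifs_buf = buf;
 *	(void) ioctl(s, SIOCGLIFSRCOF, &lifs);
 *
 * The space actually consumed comes back in lifs_len, as set below.)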
7554 */ 7555 STRUCT_SET_HANDLE(lifs, iocp->ioc_flag, 7556 (struct lifsrcof *)mp1->b_rptr); 7557 7558 if (MBLKL(mp1) != STRUCT_SIZE(lifs)) 7559 return (EINVAL); 7560 7561 ifindex = STRUCT_FGET(lifs, lifs_ifindex); 7562 isv6 = (Q_TO_CONN(q))->conn_family == AF_INET6; 7563 ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid, ipst); 7564 if (ipif == NULL) { 7565 ip1dbg(("ip_sioctl_get_lifsrcof: no ipif for ifindex %d\n", 7566 ifindex)); 7567 return (ENXIO); 7568 } 7569 7570 /* Allocate a buffer to hold requested information */ 7571 numlifs = ip_get_lifsrcofnum(ipif->ipif_ill); 7572 lifs_bufsize = numlifs * sizeof (struct lifreq); 7573 lifsmaxlen = STRUCT_FGET(lifs, lifs_maxlen); 7574 /* The actual size needed is always returned in lifs_len */ 7575 STRUCT_FSET(lifs, lifs_len, lifs_bufsize); 7576 7577 /* If the amount we need is more than what is passed in, abort */ 7578 if (lifs_bufsize > lifsmaxlen || lifs_bufsize == 0) { 7579 ipif_refrele(ipif); 7580 return (0); 7581 } 7582 7583 mp1 = mi_copyout_alloc(q, mp, 7584 STRUCT_FGETP(lifs, lifs_buf), lifs_bufsize, B_FALSE); 7585 if (mp1 == NULL) { 7586 ipif_refrele(ipif); 7587 return (ENOMEM); 7588 } 7589 7590 mp1->b_wptr = mp1->b_rptr + lifs_bufsize; 7591 bzero(mp1->b_rptr, lifs_bufsize); 7592 7593 lifr = (struct lifreq *)mp1->b_rptr; 7594 7595 ill = ill_head = ipif->ipif_ill; 7596 orig_ipif = ipif; 7597 7598 /* ill_g_usesrc_lock protects ill_usesrc_grp_next */ 7599 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER); 7600 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7601 7602 ill = ill->ill_usesrc_grp_next; /* start from next ill */ 7603 for (; (ill != NULL) && (ill != ill_head); 7604 ill = ill->ill_usesrc_grp_next) { 7605 7606 if ((uchar_t *)&lifr[1] > mp1->b_wptr) 7607 break; 7608 7609 ipif = ill->ill_ipif; 7610 ipif_get_name(ipif, lifr->lifr_name, sizeof (lifr->lifr_name)); 7611 if (ipif->ipif_isv6) { 7612 sin6 = (sin6_t *)&lifr->lifr_addr; 7613 *sin6 = sin6_null; 7614 sin6->sin6_family = AF_INET6; 7615 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 7616 lifr->lifr_addrlen = ip_mask_to_plen_v6( 7617 &ipif->ipif_v6net_mask); 7618 } else { 7619 sin = (sin_t *)&lifr->lifr_addr; 7620 *sin = sin_null; 7621 sin->sin_family = AF_INET; 7622 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 7623 lifr->lifr_addrlen = ip_mask_to_plen( 7624 ipif->ipif_net_mask); 7625 } 7626 lifr++; 7627 } 7628 rw_exit(&ipst->ips_ill_g_lock); 7629 rw_exit(&ipst->ips_ill_g_usesrc_lock); 7630 ipif_refrele(orig_ipif); 7631 mp1->b_wptr = (uchar_t *)lifr; 7632 STRUCT_FSET(lifs, lifs_len, (int)((uchar_t *)lifr - mp1->b_rptr)); 7633 7634 return (0); 7635 } 7636 7637 /* ARGSUSED */ 7638 int 7639 ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 7640 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7641 { 7642 mblk_t *mp1; 7643 int list; 7644 ill_t *ill; 7645 ipif_t *ipif; 7646 int flags; 7647 int numlifs = 0; 7648 size_t lifc_bufsize; 7649 struct lifreq *lifr; 7650 sa_family_t family; 7651 struct sockaddr_in *sin; 7652 struct sockaddr_in6 *sin6; 7653 ill_walk_context_t ctx; 7654 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 7655 int32_t lifclen; 7656 zoneid_t zoneid; 7657 STRUCT_HANDLE(lifconf, lifc); 7658 ip_stack_t *ipst = CONNQ_TO_IPST(q); 7659 7660 ip1dbg(("ip_sioctl_get_lifconf")); 7661 7662 ASSERT(q->q_next == NULL); 7663 7664 zoneid = Q_TO_CONN(q)->conn_zoneid; 7665 7666 /* Existence verified in ip_wput_nondata */ 7667 mp1 = mp->b_cont->b_cont; 7668 7669 /* 7670 * An extended version of SIOCGIFCONF that takes an 7671 * additional address family and flags field. 
7672 * AF_UNSPEC retrieve both IPv4 and IPv6. 7673 * Unless LIFC_NOXMIT is specified the IPIF_NOXMIT 7674 * interfaces are omitted. 7675 * Similarly, IPIF_TEMPORARY interfaces are omitted 7676 * unless LIFC_TEMPORARY is specified. 7677 * If LIFC_EXTERNAL_SOURCE is specified, IPIF_NOXMIT, 7678 * IPIF_NOLOCAL, PHYI_LOOPBACK, IPIF_DEPRECATED and 7679 * not IPIF_UP interfaces are omitted. LIFC_EXTERNAL_SOURCE 7680 * has priority over LIFC_NOXMIT. 7681 */ 7682 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, NULL); 7683 7684 if ((mp1->b_wptr - mp1->b_rptr) != STRUCT_SIZE(lifc)) 7685 return (EINVAL); 7686 7687 /* 7688 * Must be (better be!) continuation of a TRANSPARENT 7689 * IOCTL. We just copied in the lifconf structure. 7690 */ 7691 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, (struct lifconf *)mp1->b_rptr); 7692 7693 family = STRUCT_FGET(lifc, lifc_family); 7694 flags = STRUCT_FGET(lifc, lifc_flags); 7695 7696 switch (family) { 7697 case AF_UNSPEC: 7698 /* 7699 * walk all ILL's. 7700 */ 7701 list = MAX_G_HEADS; 7702 break; 7703 case AF_INET: 7704 /* 7705 * walk only IPV4 ILL's. 7706 */ 7707 list = IP_V4_G_HEAD; 7708 break; 7709 case AF_INET6: 7710 /* 7711 * walk only IPV6 ILL's. 7712 */ 7713 list = IP_V6_G_HEAD; 7714 break; 7715 default: 7716 return (EAFNOSUPPORT); 7717 } 7718 7719 /* 7720 * Allocate a buffer to hold requested information. 7721 * 7722 * If lifc_len is larger than what is needed, we only 7723 * allocate what we will use. 7724 * 7725 * If lifc_len is smaller than what is needed, return 7726 * EINVAL. 7727 */ 7728 numlifs = ip_get_numlifs(family, flags, zoneid, ipst); 7729 lifc_bufsize = numlifs * sizeof (struct lifreq); 7730 lifclen = STRUCT_FGET(lifc, lifc_len); 7731 if (lifc_bufsize > lifclen) { 7732 if (iocp->ioc_cmd == O_SIOCGLIFCONF) 7733 return (EINVAL); 7734 else 7735 lifc_bufsize = lifclen; 7736 } 7737 7738 mp1 = mi_copyout_alloc(q, mp, 7739 STRUCT_FGETP(lifc, lifc_buf), lifc_bufsize, B_FALSE); 7740 if (mp1 == NULL) 7741 return (ENOMEM); 7742 7743 mp1->b_wptr = mp1->b_rptr + lifc_bufsize; 7744 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr); 7745 7746 lifr = (struct lifreq *)mp1->b_rptr; 7747 7748 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7749 ill = ill_first(list, list, &ctx, ipst); 7750 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 7751 if (IS_UNDER_IPMP(ill) && !(flags & LIFC_UNDER_IPMP)) 7752 continue; 7753 7754 for (ipif = ill->ill_ipif; ipif != NULL; 7755 ipif = ipif->ipif_next) { 7756 if ((ipif->ipif_flags & IPIF_NOXMIT) && 7757 !(flags & LIFC_NOXMIT)) 7758 continue; 7759 7760 if ((ipif->ipif_flags & IPIF_TEMPORARY) && 7761 !(flags & LIFC_TEMPORARY)) 7762 continue; 7763 7764 if (((ipif->ipif_flags & 7765 (IPIF_NOXMIT|IPIF_NOLOCAL| 7766 IPIF_DEPRECATED)) || 7767 IS_LOOPBACK(ill) || 7768 !(ipif->ipif_flags & IPIF_UP)) && 7769 (flags & LIFC_EXTERNAL_SOURCE)) 7770 continue; 7771 7772 if (zoneid != ipif->ipif_zoneid && 7773 ipif->ipif_zoneid != ALL_ZONES && 7774 (zoneid != GLOBAL_ZONEID || 7775 !(flags & LIFC_ALLZONES))) 7776 continue; 7777 7778 if ((uchar_t *)&lifr[1] > mp1->b_wptr) { 7779 if (iocp->ioc_cmd == O_SIOCGLIFCONF) { 7780 rw_exit(&ipst->ips_ill_g_lock); 7781 return (EINVAL); 7782 } else { 7783 goto lif_copydone; 7784 } 7785 } 7786 7787 ipif_get_name(ipif, lifr->lifr_name, 7788 sizeof (lifr->lifr_name)); 7789 lifr->lifr_type = ill->ill_type; 7790 if (ipif->ipif_isv6) { 7791 sin6 = (sin6_t *)&lifr->lifr_addr; 7792 *sin6 = sin6_null; 7793 sin6->sin6_family = AF_INET6; 7794 sin6->sin6_addr = 7795 ipif->ipif_v6lcl_addr; 7796 lifr->lifr_addrlen = 7797 ip_mask_to_plen_v6( 
7798 &ipif->ipif_v6net_mask); 7799 } else { 7800 sin = (sin_t *)&lifr->lifr_addr; 7801 *sin = sin_null; 7802 sin->sin_family = AF_INET; 7803 sin->sin_addr.s_addr = 7804 ipif->ipif_lcl_addr; 7805 lifr->lifr_addrlen = 7806 ip_mask_to_plen( 7807 ipif->ipif_net_mask); 7808 } 7809 lifr++; 7810 } 7811 } 7812 lif_copydone: 7813 rw_exit(&ipst->ips_ill_g_lock); 7814 7815 mp1->b_wptr = (uchar_t *)lifr; 7816 if (STRUCT_BUF(lifc) != NULL) { 7817 STRUCT_FSET(lifc, lifc_len, 7818 (int)((uchar_t *)lifr - mp1->b_rptr)); 7819 } 7820 return (0); 7821 } 7822 7823 static void 7824 ip_sioctl_ip6addrpolicy(queue_t *q, mblk_t *mp) 7825 { 7826 ip6_asp_t *table; 7827 size_t table_size; 7828 mblk_t *data_mp; 7829 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 7830 ip_stack_t *ipst; 7831 7832 if (q->q_next == NULL) 7833 ipst = CONNQ_TO_IPST(q); 7834 else 7835 ipst = ILLQ_TO_IPST(q); 7836 7837 /* These two ioctls are I_STR only */ 7838 if (iocp->ioc_count == TRANSPARENT) { 7839 miocnak(q, mp, 0, EINVAL); 7840 return; 7841 } 7842 7843 data_mp = mp->b_cont; 7844 if (data_mp == NULL) { 7845 /* The user passed us a NULL argument */ 7846 table = NULL; 7847 table_size = iocp->ioc_count; 7848 } else { 7849 /* 7850 * The user provided a table. The stream head 7851 * may have copied in the user data in chunks, 7852 * so make sure everything is pulled up 7853 * properly. 7854 */ 7855 if (MBLKL(data_mp) < iocp->ioc_count) { 7856 mblk_t *new_data_mp; 7857 if ((new_data_mp = msgpullup(data_mp, -1)) == 7858 NULL) { 7859 miocnak(q, mp, 0, ENOMEM); 7860 return; 7861 } 7862 freemsg(data_mp); 7863 data_mp = new_data_mp; 7864 mp->b_cont = data_mp; 7865 } 7866 table = (ip6_asp_t *)data_mp->b_rptr; 7867 table_size = iocp->ioc_count; 7868 } 7869 7870 switch (iocp->ioc_cmd) { 7871 case SIOCGIP6ADDRPOLICY: 7872 iocp->ioc_rval = ip6_asp_get(table, table_size, ipst); 7873 if (iocp->ioc_rval == -1) 7874 iocp->ioc_error = EINVAL; 7875 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4 7876 else if (table != NULL && 7877 (iocp->ioc_flag & IOC_MODELS) == IOC_ILP32) { 7878 ip6_asp_t *src = table; 7879 ip6_asp32_t *dst = (void *)table; 7880 int count = table_size / sizeof (ip6_asp_t); 7881 int i; 7882 7883 /* 7884 * We need to do an in-place shrink of the array 7885 * to match the alignment attributes of the 7886 * 32-bit ABI looking at it. 7887 */ 7888 /* LINTED: logical expression always true: op "||" */ 7889 ASSERT(sizeof (*src) > sizeof (*dst)); 7890 for (i = 1; i < count; i++) 7891 bcopy(src + i, dst + i, sizeof (*dst)); 7892 } 7893 #endif 7894 break; 7895 7896 case SIOCSIP6ADDRPOLICY: 7897 ASSERT(mp->b_prev == NULL); 7898 mp->b_prev = (void *)q; 7899 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4 7900 /* 7901 * We pass in the datamodel here so that the ip6_asp_replace() 7902 * routine can handle converting from 32-bit to native formats 7903 * where necessary. 7904 * 7905 * A better way to handle this might be to convert the inbound 7906 * data structure here, and hang it off a new 'mp'; thus the 7907 * ip6_asp_replace() logic would always be dealing with native 7908 * format data structures.. 7909 * 7910 * (An even simpler way to handle these ioctls is to just 7911 * add a 32-bit trailing 'pad' field to the ip6_asp_t structure 7912 * and just recompile everything that depends on it.) 7913 */ 7914 #endif 7915 ip6_asp_replace(mp, table, table_size, B_FALSE, ipst, 7916 iocp->ioc_flag & IOC_MODELS); 7917 return; 7918 } 7919 7920 DB_TYPE(mp) = (iocp->ioc_error == 0) ? 
M_IOCACK : M_IOCNAK; 7921 qreply(q, mp); 7922 } 7923 7924 static void 7925 ip_sioctl_dstinfo(queue_t *q, mblk_t *mp) 7926 { 7927 mblk_t *data_mp; 7928 struct dstinforeq *dir; 7929 uint8_t *end, *cur; 7930 in6_addr_t *daddr, *saddr; 7931 ipaddr_t v4daddr; 7932 ire_t *ire; 7933 ipaddr_t v4setsrc; 7934 in6_addr_t v6setsrc; 7935 char *slabel, *dlabel; 7936 boolean_t isipv4; 7937 int match_ire; 7938 ill_t *dst_ill; 7939 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 7940 conn_t *connp = Q_TO_CONN(q); 7941 zoneid_t zoneid = IPCL_ZONEID(connp); 7942 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 7943 uint64_t ipif_flags; 7944 7945 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 7946 7947 /* 7948 * This ioctl is I_STR only, and must have a 7949 * data mblk following the M_IOCTL mblk. 7950 */ 7951 data_mp = mp->b_cont; 7952 if (iocp->ioc_count == TRANSPARENT || data_mp == NULL) { 7953 miocnak(q, mp, 0, EINVAL); 7954 return; 7955 } 7956 7957 if (MBLKL(data_mp) < iocp->ioc_count) { 7958 mblk_t *new_data_mp; 7959 7960 if ((new_data_mp = msgpullup(data_mp, -1)) == NULL) { 7961 miocnak(q, mp, 0, ENOMEM); 7962 return; 7963 } 7964 freemsg(data_mp); 7965 data_mp = new_data_mp; 7966 mp->b_cont = data_mp; 7967 } 7968 match_ire = MATCH_IRE_DSTONLY; 7969 7970 for (cur = data_mp->b_rptr, end = data_mp->b_wptr; 7971 end - cur >= sizeof (struct dstinforeq); 7972 cur += sizeof (struct dstinforeq)) { 7973 dir = (struct dstinforeq *)cur; 7974 daddr = &dir->dir_daddr; 7975 saddr = &dir->dir_saddr; 7976 7977 /* 7978 * ip_addr_scope_v6() and ip6_asp_lookup() handle 7979 * v4 mapped addresses; ire_ftable_lookup_v6() 7980 * and ip_select_source_v6() do not. 7981 */ 7982 dir->dir_dscope = ip_addr_scope_v6(daddr); 7983 dlabel = ip6_asp_lookup(daddr, &dir->dir_precedence, ipst); 7984 7985 isipv4 = IN6_IS_ADDR_V4MAPPED(daddr); 7986 if (isipv4) { 7987 IN6_V4MAPPED_TO_IPADDR(daddr, v4daddr); 7988 v4setsrc = INADDR_ANY; 7989 ire = ire_route_recursive_v4(v4daddr, 0, NULL, zoneid, 7990 NULL, match_ire, IRR_ALLOCATE, 0, ipst, &v4setsrc, 7991 NULL, NULL); 7992 } else { 7993 v6setsrc = ipv6_all_zeros; 7994 ire = ire_route_recursive_v6(daddr, 0, NULL, zoneid, 7995 NULL, match_ire, IRR_ALLOCATE, 0, ipst, &v6setsrc, 7996 NULL, NULL); 7997 } 7998 ASSERT(ire != NULL); 7999 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 8000 ire_refrele(ire); 8001 dir->dir_dreachable = 0; 8002 8003 /* move on to next dst addr */ 8004 continue; 8005 } 8006 dir->dir_dreachable = 1; 8007 8008 dst_ill = ire_nexthop_ill(ire); 8009 if (dst_ill == NULL) { 8010 ire_refrele(ire); 8011 continue; 8012 } 8013 8014 /* With ipmp we most likely look at the ipmp ill here */ 8015 dir->dir_dmactype = dst_ill->ill_mactype; 8016 8017 if (isipv4) { 8018 ipaddr_t v4saddr; 8019 8020 if (ip_select_source_v4(dst_ill, v4setsrc, v4daddr, 8021 connp->conn_ixa->ixa_multicast_ifaddr, zoneid, ipst, 8022 &v4saddr, NULL, &ipif_flags) != 0) { 8023 v4saddr = INADDR_ANY; 8024 ipif_flags = 0; 8025 } 8026 IN6_IPADDR_TO_V4MAPPED(v4saddr, saddr); 8027 } else { 8028 if (ip_select_source_v6(dst_ill, &v6setsrc, daddr, 8029 zoneid, ipst, B_FALSE, IPV6_PREFER_SRC_DEFAULT, 8030 saddr, NULL, &ipif_flags) != 0) { 8031 *saddr = ipv6_all_zeros; 8032 ipif_flags = 0; 8033 } 8034 } 8035 8036 dir->dir_sscope = ip_addr_scope_v6(saddr); 8037 slabel = ip6_asp_lookup(saddr, NULL, ipst); 8038 dir->dir_labelmatch = ip6_asp_labelcmp(dlabel, slabel); 8039 dir->dir_sdeprecated = (ipif_flags & IPIF_DEPRECATED) ? 
1 : 0; 8040 ire_refrele(ire); 8041 ill_refrele(dst_ill); 8042 } 8043 miocack(q, mp, iocp->ioc_count, 0); 8044 } 8045 8046 /* 8047 * Checks if this is an address assigned to this machine. 8048 * Skips interfaces that are down by using ire checks. 8049 * Translates mapped addresses to v4 addresses and then 8050 * treats them as such, returning true if the v4 address 8051 * associated with this mapped address is configured. 8052 * Note: Applications will have to be careful what they do 8053 * with the response; use of mapped addresses limits 8054 * what can be done with the socket, especially with 8055 * respect to socket options and ioctls - neither IPv4 8056 * options nor IPv6 sticky options/ancillary data options 8057 * may be used. 8058 */ 8059 /* ARGSUSED */ 8060 int 8061 ip_sioctl_tmyaddr(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 8062 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 8063 { 8064 struct sioc_addrreq *sia; 8065 sin_t *sin; 8066 ire_t *ire; 8067 mblk_t *mp1; 8068 zoneid_t zoneid; 8069 ip_stack_t *ipst; 8070 8071 ip1dbg(("ip_sioctl_tmyaddr")); 8072 8073 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 8074 zoneid = Q_TO_CONN(q)->conn_zoneid; 8075 ipst = CONNQ_TO_IPST(q); 8076 8077 /* Existence verified in ip_wput_nondata */ 8078 mp1 = mp->b_cont->b_cont; 8079 sia = (struct sioc_addrreq *)mp1->b_rptr; 8080 sin = (sin_t *)&sia->sa_addr; 8081 switch (sin->sin_family) { 8082 case AF_INET6: { 8083 sin6_t *sin6 = (sin6_t *)sin; 8084 8085 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 8086 ipaddr_t v4_addr; 8087 8088 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, 8089 v4_addr); 8090 ire = ire_ftable_lookup_v4(v4_addr, 0, 0, 8091 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, NULL, 8092 MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL); 8093 } else { 8094 in6_addr_t v6addr; 8095 8096 v6addr = sin6->sin6_addr; 8097 ire = ire_ftable_lookup_v6(&v6addr, 0, 0, 8098 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, NULL, 8099 MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL); 8100 } 8101 break; 8102 } 8103 case AF_INET: { 8104 ipaddr_t v4addr; 8105 8106 v4addr = sin->sin_addr.s_addr; 8107 ire = ire_ftable_lookup_v4(v4addr, 0, 0, 8108 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 8109 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL); 8110 break; 8111 } 8112 default: 8113 return (EAFNOSUPPORT); 8114 } 8115 if (ire != NULL) { 8116 sia->sa_res = 1; 8117 ire_refrele(ire); 8118 } else { 8119 sia->sa_res = 0; 8120 } 8121 return (0); 8122 } 8123 8124 /* 8125 * Checks if this is an address assigned on-link, i.e., a neighbor, 8126 * and makes sure it's reachable from the current zone. 8127 * Returns true for my addresses as well. 8128 * Translates mapped addresses to v4 addresses and then 8129 * treats them as such, returning true if the v4 address 8130 * associated with this mapped address is configured. 8131 * Note: Applications will have to be careful what they do 8132 * with the response; use of mapped addresses limits 8133 * what can be done with the socket, especially with 8134 * respect to socket options and ioctls - neither IPv4 8135 * options nor IPv6 sticky options/ancillary data options 8136 * may be used.
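 *
 * (For illustration, a minimal sketch of a hypothetical userland
 * caller of SIOCTONLINK -- SIOCTMYADDR above has the same shape; the
 * variables `s', `sin6' and `onlink' are assumptions:
 *
 *	struct sioc_addrreq ar;
 *
 *	bcopy(&sin6, &ar.sa_addr, sizeof (sin6));
 *	(void) ioctl(s, SIOCTONLINK, &ar);
 *	onlink = (ar.sa_res != 0);
 * )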
8137 */ 8138 /* ARGSUSED */ 8139 int 8140 ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 8141 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 8142 { 8143 struct sioc_addrreq *sia; 8144 sin_t *sin; 8145 mblk_t *mp1; 8146 ire_t *ire = NULL; 8147 zoneid_t zoneid; 8148 ip_stack_t *ipst; 8149 8150 ip1dbg(("ip_sioctl_tonlink")); 8151 8152 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 8153 zoneid = Q_TO_CONN(q)->conn_zoneid; 8154 ipst = CONNQ_TO_IPST(q); 8155 8156 /* Existence verified in ip_wput_nondata */ 8157 mp1 = mp->b_cont->b_cont; 8158 sia = (struct sioc_addrreq *)mp1->b_rptr; 8159 sin = (sin_t *)&sia->sa_addr; 8160 8161 /* 8162 * We check for IRE_ONLINK and exclude IRE_BROADCAST|IRE_MULTICAST 8163 * to make sure we only look at on-link unicast addresses. 8164 */ 8165 switch (sin->sin_family) { 8166 case AF_INET6: { 8167 sin6_t *sin6 = (sin6_t *)sin; 8168 8169 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 8170 ipaddr_t v4_addr; 8171 8172 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, 8173 v4_addr); 8174 if (!CLASSD(v4_addr)) { 8175 ire = ire_ftable_lookup_v4(v4_addr, 0, 0, 0, 8176 NULL, zoneid, NULL, MATCH_IRE_DSTONLY, 8177 0, ipst, NULL); 8178 } 8179 } else { 8180 in6_addr_t v6addr; 8181 8182 v6addr = sin6->sin6_addr; 8183 if (!IN6_IS_ADDR_MULTICAST(&v6addr)) { 8184 ire = ire_ftable_lookup_v6(&v6addr, 0, 0, 0, 8185 NULL, zoneid, NULL, MATCH_IRE_DSTONLY, 0, 8186 ipst, NULL); 8187 } 8188 } 8189 break; 8190 } 8191 case AF_INET: { 8192 ipaddr_t v4addr; 8193 8194 v4addr = sin->sin_addr.s_addr; 8195 if (!CLASSD(v4addr)) { 8196 ire = ire_ftable_lookup_v4(v4addr, 0, 0, 0, NULL, 8197 zoneid, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL); 8198 } 8199 break; 8200 } 8201 default: 8202 return (EAFNOSUPPORT); 8203 } 8204 sia->sa_res = 0; 8205 if (ire != NULL) { 8206 ASSERT(!(ire->ire_type & IRE_MULTICAST)); 8207 8208 if ((ire->ire_type & IRE_ONLINK) && 8209 !(ire->ire_type & IRE_BROADCAST)) 8210 sia->sa_res = 1; 8211 ire_refrele(ire); 8212 } 8213 return (0); 8214 } 8215 8216 /* 8217 * TBD: implement when kernel maintains a list of site prefixes. 8218 */ 8219 /* ARGSUSED */ 8220 int 8221 ip_sioctl_tmysite(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 8222 ip_ioctl_cmd_t *ipip, void *ifreq) 8223 { 8224 return (ENXIO); 8225 } 8226 8227 /* ARP IOCTLs. */ 8228 /* ARGSUSED */ 8229 int 8230 ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 8231 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 8232 { 8233 int err; 8234 ipaddr_t ipaddr; 8235 struct iocblk *iocp; 8236 conn_t *connp; 8237 struct arpreq *ar; 8238 struct xarpreq *xar; 8239 int arp_flags, flags, alength; 8240 uchar_t *lladdr; 8241 ip_stack_t *ipst; 8242 ill_t *ill = ipif->ipif_ill; 8243 ill_t *proxy_ill = NULL; 8244 ipmp_arpent_t *entp = NULL; 8245 boolean_t proxyarp = B_FALSE; 8246 boolean_t if_arp_ioctl = B_FALSE; 8247 ncec_t *ncec = NULL; 8248 nce_t *nce; 8249 8250 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 8251 connp = Q_TO_CONN(q); 8252 ipst = connp->conn_netstack->netstack_ip; 8253 iocp = (struct iocblk *)mp->b_rptr; 8254 8255 if (ipip->ipi_cmd_type == XARP_CMD) { 8256 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->XARPREQ_MBLK */ 8257 xar = (struct xarpreq *)mp->b_cont->b_cont->b_rptr; 8258 ar = NULL; 8259 8260 arp_flags = xar->xarp_flags; 8261 lladdr = (uchar_t *)LLADDR(&xar->xarp_ha); 8262 if_arp_ioctl = (xar->xarp_ha.sdl_nlen != 0); 8263 /* 8264 * Validate the user's link-layer address length input and 8265 * the name and addr length limits.
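 *
 * (In other words, for a set operation we require, roughly,
 *
 *	xarp_ha.sdl_alen == ill->ill_phys_addr_length &&
 *	xarp_ha.sdl_alen + xarp_ha.sdl_nlen <=
 *	    sizeof (xarp_ha.sdl_data)
 *
 * since the link name and the link-layer address share sdl_data[].)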
8266 */ 8267 alength = ill->ill_phys_addr_length; 8268 if (ipip->ipi_cmd == SIOCSXARP) { 8269 if (alength != xar->xarp_ha.sdl_alen || 8270 (alength + xar->xarp_ha.sdl_nlen > 8271 sizeof (xar->xarp_ha.sdl_data))) 8272 return (EINVAL); 8273 } 8274 } else { 8275 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->ARPREQ_MBLK */ 8276 ar = (struct arpreq *)mp->b_cont->b_cont->b_rptr; 8277 xar = NULL; 8278 8279 arp_flags = ar->arp_flags; 8280 lladdr = (uchar_t *)ar->arp_ha.sa_data; 8281 /* 8282 * Theoretically, the sa_family could tell us what link 8283 * layer type this operation is trying to deal with. By 8284 * common usage AF_UNSPEC means ethernet. We'll assume 8285 * any attempt to use the SIOC?ARP ioctls is for ethernet, 8286 * for now. Our new SIOC*XARP ioctls can be used more 8287 * generally. 8288 * 8289 * If the underlying media happens to have a non 6 byte 8290 * address, arp module will fail set/get, but the del 8291 * operation will succeed. 8292 */ 8293 alength = 6; 8294 if ((ipip->ipi_cmd != SIOCDARP) && 8295 (alength != ill->ill_phys_addr_length)) { 8296 return (EINVAL); 8297 } 8298 } 8299 8300 /* Translate ATF* flags to NCE* flags */ 8301 flags = 0; 8302 if (arp_flags & ATF_AUTHORITY) 8303 flags |= NCE_F_AUTHORITY; 8304 if (arp_flags & ATF_PERM) 8305 flags |= NCE_F_NONUD; /* not subject to aging */ 8306 if (arp_flags & ATF_PUBL) 8307 flags |= NCE_F_PUBLISH; 8308 8309 /* 8310 * IPMP ARP special handling: 8311 * 8312 * 1. Since ARP mappings must appear consistent across the group, 8313 * prohibit changing ARP mappings on the underlying interfaces. 8314 * 8315 * 2. Since ARP mappings for IPMP data addresses are maintained by 8316 * IP itself, prohibit changing them. 8317 * 8318 * 3. For proxy ARP, use a functioning hardware address in the group, 8319 * provided one exists. If one doesn't, just add the entry as-is; 8320 * ipmp_illgrp_refresh_arpent() will refresh it if things change. 8321 */ 8322 if (IS_UNDER_IPMP(ill)) { 8323 if (ipip->ipi_cmd != SIOCGARP && ipip->ipi_cmd != SIOCGXARP) 8324 return (EPERM); 8325 } 8326 if (IS_IPMP(ill)) { 8327 ipmp_illgrp_t *illg = ill->ill_grp; 8328 8329 switch (ipip->ipi_cmd) { 8330 case SIOCSARP: 8331 case SIOCSXARP: 8332 proxy_ill = ipmp_illgrp_find_ill(illg, lladdr, alength); 8333 if (proxy_ill != NULL) { 8334 proxyarp = B_TRUE; 8335 if (!ipmp_ill_is_active(proxy_ill)) 8336 proxy_ill = ipmp_illgrp_next_ill(illg); 8337 if (proxy_ill != NULL) 8338 lladdr = proxy_ill->ill_phys_addr; 8339 } 8340 /* FALLTHRU */ 8341 } 8342 } 8343 8344 ipaddr = sin->sin_addr.s_addr; 8345 /* 8346 * don't match across illgrp per case (1) and (2). 8347 * XXX use IS_IPMP(ill) like ndp_sioc_update? 8348 */ 8349 nce = nce_lookup_v4(ill, &ipaddr); 8350 if (nce != NULL) 8351 ncec = nce->nce_common; 8352 8353 switch (iocp->ioc_cmd) { 8354 case SIOCDARP: 8355 case SIOCDXARP: { 8356 /* 8357 * Delete the NCE if any. 8358 */ 8359 if (ncec == NULL) { 8360 iocp->ioc_error = ENXIO; 8361 break; 8362 } 8363 /* Don't allow changes to arp mappings of local addresses. */ 8364 if (NCE_MYADDR(ncec)) { 8365 nce_refrele(nce); 8366 return (ENOTSUP); 8367 } 8368 iocp->ioc_error = 0; 8369 8370 /* 8371 * Delete the nce_common which has ncec_ill set to ipmp_ill. 8372 * This will delete all the nce entries on the under_ills. 8373 */ 8374 ncec_delete(ncec); 8375 /* 8376 * Once the NCE has been deleted, then the ire_dep* consistency 8377 * mechanism will find any IRE which depended on the now 8378 * condemned NCE (as part of sending packets). 
8379 * That mechanism handles redirects by deleting redirects 8380 * that refer to UNREACHABLE nces. 8381 */ 8382 break; 8383 } 8384 case SIOCGARP: 8385 case SIOCGXARP: 8386 if (ncec != NULL) { 8387 lladdr = ncec->ncec_lladdr; 8388 flags = ncec->ncec_flags; 8389 iocp->ioc_error = 0; 8390 ip_sioctl_garp_reply(mp, ncec->ncec_ill, lladdr, flags); 8391 } else { 8392 iocp->ioc_error = ENXIO; 8393 } 8394 break; 8395 case SIOCSARP: 8396 case SIOCSXARP: 8397 /* Don't allow changes to arp mappings of local addresses. */ 8398 if (ncec != NULL && NCE_MYADDR(ncec)) { 8399 nce_refrele(nce); 8400 return (ENOTSUP); 8401 } 8402 8403 /* static arp entries will undergo NUD if ATF_PERM is not set */ 8404 flags |= NCE_F_STATIC; 8405 if (!if_arp_ioctl) { 8406 ip_nce_lookup_and_update(&ipaddr, NULL, ipst, 8407 lladdr, alength, flags); 8408 } else { 8409 ipif_t *ipif = ipif_get_next_ipif(NULL, ill); 8410 if (ipif != NULL) { 8411 ip_nce_lookup_and_update(&ipaddr, ipif, ipst, 8412 lladdr, alength, flags); 8413 ipif_refrele(ipif); 8414 } 8415 } 8416 if (nce != NULL) { 8417 nce_refrele(nce); 8418 nce = NULL; 8419 } 8420 /* 8421 * NCE_F_STATIC entries will be added in state ND_REACHABLE 8422 * by nce_add_common() 8423 */ 8424 err = nce_lookup_then_add_v4(ill, lladdr, 8425 ill->ill_phys_addr_length, &ipaddr, flags, ND_UNCHANGED, 8426 &nce); 8427 if (err == EEXIST) { 8428 ncec = nce->nce_common; 8429 mutex_enter(&ncec->ncec_lock); 8430 ncec->ncec_state = ND_REACHABLE; 8431 ncec->ncec_flags = flags; 8432 nce_update(ncec, ND_UNCHANGED, lladdr); 8433 mutex_exit(&ncec->ncec_lock); 8434 err = 0; 8435 } 8436 if (nce != NULL) { 8437 nce_refrele(nce); 8438 nce = NULL; 8439 } 8440 if (IS_IPMP(ill) && err == 0) { 8441 entp = ipmp_illgrp_create_arpent(ill->ill_grp, 8442 proxyarp, ipaddr, lladdr, ill->ill_phys_addr_length, 8443 flags); 8444 if (entp == NULL || (proxyarp && proxy_ill == NULL)) { 8445 iocp->ioc_error = (entp == NULL ? ENOMEM : 0); 8446 break; 8447 } 8448 } 8449 iocp->ioc_error = err; 8450 } 8451 8452 if (nce != NULL) { 8453 nce_refrele(nce); 8454 } 8455 8456 /* 8457 * If we created an IPMP ARP entry, mark that we've notified ARP. 8458 */ 8459 if (entp != NULL) 8460 ipmp_illgrp_mark_arpent(ill->ill_grp, entp); 8461 8462 return (iocp->ioc_error); 8463 } 8464 8465 /* 8466 * Parse an [x]arpreq structure coming down SIOC[GSD][X]ARP ioctls, identify 8467 * the associated sin and refhold and return the associated ipif via `ci'. 
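 *
 * (For illustration: a static-entry request in the style of
 * "arp -s 10.0.0.1 8:0:20:c0:ff:ee" would typically arrive here as an
 * SIOCSXARP, with the protocol address in xarp_pa and the hardware
 * address in xarp_ha; whether the ipif is found by name or by address
 * then depends on xarp_ha.sdl_nlen, as below.)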
8468 */ 8469 int 8470 ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, 8471 cmd_info_t *ci) 8472 { 8473 mblk_t *mp1; 8474 sin_t *sin; 8475 conn_t *connp; 8476 ipif_t *ipif; 8477 ire_t *ire = NULL; 8478 ill_t *ill = NULL; 8479 boolean_t exists; 8480 ip_stack_t *ipst; 8481 struct arpreq *ar; 8482 struct xarpreq *xar; 8483 struct sockaddr_dl *sdl; 8484 8485 /* ioctl comes down on a conn */ 8486 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 8487 connp = Q_TO_CONN(q); 8488 if (connp->conn_family == AF_INET6) 8489 return (ENXIO); 8490 8491 ipst = connp->conn_netstack->netstack_ip; 8492 8493 /* Verified in ip_wput_nondata */ 8494 mp1 = mp->b_cont->b_cont; 8495 8496 if (ipip->ipi_cmd_type == XARP_CMD) { 8497 ASSERT(MBLKL(mp1) >= sizeof (struct xarpreq)); 8498 xar = (struct xarpreq *)mp1->b_rptr; 8499 sin = (sin_t *)&xar->xarp_pa; 8500 sdl = &xar->xarp_ha; 8501 8502 if (sdl->sdl_family != AF_LINK || sin->sin_family != AF_INET) 8503 return (ENXIO); 8504 if (sdl->sdl_nlen >= LIFNAMSIZ) 8505 return (EINVAL); 8506 } else { 8507 ASSERT(ipip->ipi_cmd_type == ARP_CMD); 8508 ASSERT(MBLKL(mp1) >= sizeof (struct arpreq)); 8509 ar = (struct arpreq *)mp1->b_rptr; 8510 sin = (sin_t *)&ar->arp_pa; 8511 } 8512 8513 if (ipip->ipi_cmd_type == XARP_CMD && sdl->sdl_nlen != 0) { 8514 ipif = ipif_lookup_on_name(sdl->sdl_data, sdl->sdl_nlen, 8515 B_FALSE, &exists, B_FALSE, ALL_ZONES, ipst); 8516 if (ipif == NULL) 8517 return (ENXIO); 8518 if (ipif->ipif_id != 0) { 8519 ipif_refrele(ipif); 8520 return (ENXIO); 8521 } 8522 } else { 8523 /* 8524 * Either an SIOC[DGS]ARP or an SIOC[DGS]XARP with an sdl_nlen 8525 * of 0: use the IP address to find the ipif. If the IP 8526 * address is an IPMP test address, ire_ftable_lookup() will 8527 * find the wrong ill, so we first do an ipif_lookup_addr(). 8528 */ 8529 ipif = ipif_lookup_addr(sin->sin_addr.s_addr, NULL, ALL_ZONES, 8530 ipst); 8531 if (ipif == NULL) { 8532 ire = ire_ftable_lookup_v4(sin->sin_addr.s_addr, 8533 0, 0, IRE_IF_RESOLVER, NULL, ALL_ZONES, 8534 NULL, MATCH_IRE_TYPE, 0, ipst, NULL); 8535 if (ire == NULL || ((ill = ire->ire_ill) == NULL)) { 8536 if (ire != NULL) 8537 ire_refrele(ire); 8538 return (ENXIO); 8539 } 8540 ASSERT(ire != NULL && ill != NULL); 8541 ipif = ill->ill_ipif; 8542 ipif_refhold(ipif); 8543 ire_refrele(ire); 8544 } 8545 } 8546 8547 if (ipif->ipif_ill->ill_net_type != IRE_IF_RESOLVER) { 8548 ipif_refrele(ipif); 8549 return (ENXIO); 8550 } 8551 8552 ci->ci_sin = sin; 8553 ci->ci_ipif = ipif; 8554 return (0); 8555 } 8556 8557 /* 8558 * Link or unlink the illgrp on IPMP meta-interface `ill' depending on the 8559 * value of `ioccmd'. While an illgrp is linked to an ipmp_grp_t, it is 8560 * accessible from that ipmp_grp_t, which means SIOCSLIFGROUPNAME can look it 8561 * up and thus an ill can join that illgrp. 8562 * 8563 * We use I_PLINK/I_PUNLINK to do the link/unlink operations rather than 8564 * open()/close() primarily because close() is not allowed to fail or block 8565 * forever. On the other hand, I_PUNLINK *can* fail, and there's no reason 8566 * why anyone should ever need to I_PUNLINK an in-use IPMP stream. To ensure 8567 * symmetric behavior (e.g., doing an I_PLINK after an I_PUNLINK undoes the 8568 * I_PUNLINK) we defer linking to I_PLINK. Separately, we also fail attempts 8569 * to I_LINK since I_UNLINK is optional and we'd end up in an inconsistent 8570 * state if I_UNLINK didn't occur.
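 *
 * (Illustrative lifecycle, roughly: ifconfig I_PLINKs the IPMP stream
 * at plumb time; SIOCSLIFGROUPNAME can then find the illgrp and ills
 * can join the group; at unplumb time all UP ipifs are brought down
 * and the final I_PUNLINK unlinks the illgrp again.)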
8571 * 8572 * Note that for each plumb/unplumb operation, we may end up here more than 8573 * once because of the way ifconfig works. However, it's OK to link the same 8574 * illgrp more than once, or unlink an illgrp that's already unlinked. 8575 */ 8576 static int 8577 ip_sioctl_plink_ipmp(ill_t *ill, int ioccmd) 8578 { 8579 int err; 8580 ip_stack_t *ipst = ill->ill_ipst; 8581 8582 ASSERT(IS_IPMP(ill)); 8583 ASSERT(IAM_WRITER_ILL(ill)); 8584 8585 switch (ioccmd) { 8586 case I_LINK: 8587 return (ENOTSUP); 8588 8589 case I_PLINK: 8590 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 8591 ipmp_illgrp_link_grp(ill->ill_grp, ill->ill_phyint->phyint_grp); 8592 rw_exit(&ipst->ips_ipmp_lock); 8593 break; 8594 8595 case I_PUNLINK: 8596 /* 8597 * Require all UP ipifs be brought down prior to unlinking the 8598 * illgrp so any associated IREs (and other state) are torched. 8599 */ 8600 if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0) 8601 return (EBUSY); 8602 8603 /* 8604 * NOTE: We hold ipmp_lock across the unlink to prevent a race 8605 * with an SIOCSLIFGROUPNAME request from an ill trying to 8606 * join this group. Specifically: ills trying to join grab 8607 * ipmp_lock and bump a "pending join" counter checked by 8608 * ipmp_illgrp_unlink_grp(). During the unlink no new pending 8609 * joins can occur (since we have ipmp_lock). Once we drop 8610 * ipmp_lock, subsequent SIOCSLIFGROUPNAME requests will not 8611 * find the illgrp (since we unlinked it) and will return 8612 * EAFNOSUPPORT. This will then take them back through the 8613 * IPMP meta-interface plumbing logic in ifconfig, and thus 8614 * back through I_PLINK above. 8615 */ 8616 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 8617 err = ipmp_illgrp_unlink_grp(ill->ill_grp); 8618 rw_exit(&ipst->ips_ipmp_lock); 8619 return (err); 8620 default: 8621 break; 8622 } 8623 return (0); 8624 } 8625 8626 /* 8627 * Do I_PLINK/I_LINK or I_PUNLINK/I_UNLINK with consistency checks and also 8628 * atomically set/clear the muxids. Also complete the ioctl by acking or 8629 * naking it. Note that the code is structured such that the link type, 8630 * whether it's persistent or not, is treated equally. ifconfig(1M) and 8631 * its clones use the persistent link, while pppd(1M) and perhaps many 8632 * other daemons may use non-persistent link. When combined with some 8633 * ill_t states, linking and unlinking lower streams may be used as 8634 * indicators of dynamic re-plumbing events [see PSARC/1999/348]. 8635 */ 8636 /* ARGSUSED */ 8637 void 8638 ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 8639 { 8640 mblk_t *mp1; 8641 struct linkblk *li; 8642 int ioccmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd; 8643 int err = 0; 8644 8645 ASSERT(ioccmd == I_PLINK || ioccmd == I_PUNLINK || 8646 ioccmd == I_LINK || ioccmd == I_UNLINK); 8647 8648 mp1 = mp->b_cont; /* This is the linkblk info */ 8649 li = (struct linkblk *)mp1->b_rptr; 8650 8651 err = ip_sioctl_plink_ipmod(ipsq, q, mp, ioccmd, li); 8652 if (err == EINPROGRESS) 8653 return; 8654 done: 8655 if (err == 0) 8656 miocack(q, mp, 0, 0); 8657 else 8658 miocnak(q, mp, 0, err); 8659 8660 /* Conn was refheld in ip_sioctl_copyin_setup */ 8661 if (CONN_Q(q)) 8662 CONN_OPER_PENDING_DONE(Q_TO_CONN(q)); 8663 } 8664 8665 /* 8666 * Process I_{P}LINK and I_{P}UNLINK requests named by `ioccmd' and pointed to 8667 * by `mp' and `li' for the IP module stream (if li->l_qbot is in fact an IP 8668 * module stream).
The extended consistency checks requested 8669 * by ifconfig(1M) are done here, and ill_muxid is (atomically) set. 8670 * Returns zero on success, EINPROGRESS if the operation is still pending, or 8671 * an error code on failure. 8672 */ 8673 static int 8674 ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, 8675 struct linkblk *li) 8676 { 8677 int err = 0; 8678 ill_t *ill; 8679 queue_t *ipwq, *dwq; 8680 const char *name; 8681 struct qinit *qinfo; 8682 boolean_t islink = (ioccmd == I_PLINK || ioccmd == I_LINK); 8683 boolean_t entered_ipsq = B_FALSE; 8684 boolean_t is_ip = B_FALSE; 8685 arl_t *arl; 8686 8687 /* 8688 * Walk the lower stream to verify it's the IP module stream. 8689 * The IP module is identified by its name, wput function, 8690 * and non-NULL q_next. STREAMS ensures that the lower stream 8691 * (li->l_qbot) will not vanish until this ioctl completes. 8692 */ 8693 for (ipwq = li->l_qbot; ipwq != NULL; ipwq = ipwq->q_next) { 8694 qinfo = ipwq->q_qinfo; 8695 name = qinfo->qi_minfo->mi_idname; 8696 if (name != NULL && strcmp(name, ip_mod_info.mi_idname) == 0 && 8697 qinfo->qi_putp != (pfi_t)ip_lwput && ipwq->q_next != NULL) { 8698 is_ip = B_TRUE; 8699 break; 8700 } 8701 if (name != NULL && strcmp(name, arp_mod_info.mi_idname) == 0 && 8702 qinfo->qi_putp != (pfi_t)ip_lwput && ipwq->q_next != NULL) { 8703 break; 8704 } 8705 } 8706 8707 /* 8708 * If this isn't an IP module stream, bail. 8709 */ 8710 if (ipwq == NULL) 8711 return (0); 8712 8713 if (!is_ip) { 8714 arl = (arl_t *)ipwq->q_ptr; 8715 ill = arl_to_ill(arl); 8716 if (ill == NULL) 8717 return (0); 8718 } else { 8719 ill = ipwq->q_ptr; 8720 } 8721 ASSERT(ill != NULL); 8722 8723 if (ipsq == NULL) { 8724 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink, 8725 NEW_OP, B_FALSE); 8726 if (ipsq == NULL) { 8727 if (!is_ip) 8728 ill_refrele(ill); 8729 return (EINPROGRESS); 8730 } 8731 entered_ipsq = B_TRUE; 8732 } 8733 ASSERT(IAM_WRITER_ILL(ill)); 8734 mutex_enter(&ill->ill_lock); 8735 if (!is_ip) { 8736 if (islink && ill->ill_muxid == 0) { 8737 /* 8738 * Plumbing has to be done with IP plumbed first, arp 8739 * second, but here we have arp being plumbed first. 8740 */ 8741 mutex_exit(&ill->ill_lock); 8742 ipsq_exit(ipsq); 8743 ill_refrele(ill); 8744 return (EINVAL); 8745 } 8746 } 8747 mutex_exit(&ill->ill_lock); 8748 if (!is_ip) { 8749 arl->arl_muxid = islink ? li->l_index : 0; 8750 ill_refrele(ill); 8751 goto done; 8752 } 8753 8754 if (IS_IPMP(ill) && (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0) 8755 goto done; 8756 8757 /* 8758 * As part of I_{P}LINKing, stash the number of downstream modules and 8759 * the read queue of the module immediately below IP in the ill. 8760 * These are used during the capability negotiation below. 8761 */ 8762 ill->ill_lmod_rq = NULL; 8763 ill->ill_lmod_cnt = 0; 8764 if (islink && ((dwq = ipwq->q_next) != NULL)) { 8765 ill->ill_lmod_rq = RD(dwq); 8766 for (; dwq != NULL; dwq = dwq->q_next) 8767 ill->ill_lmod_cnt++; 8768 } 8769 8770 ill->ill_muxid = islink ? li->l_index : 0; 8771 8772 /* 8773 * Mark the ipsq busy until the capability operations initiated below 8774 * complete. The PLINK/UNLINK ioctl itself completes when our caller 8775 * returns, but the capability operation may complete asynchronously 8776 * much later. 8777 */ 8778 ipsq_current_start(ipsq, ill->ill_ipif, ioccmd); 8779 /* 8780 * If there's at least one up ipif on this ill, then we're bound to 8781 * the underlying driver via DLPI.
In that case, renegotiate 8782 * capabilities to account for any possible change in modules 8783 * interposed between IP and the driver. 8784 */ 8785 if (ill->ill_ipif_up_count > 0) { 8786 if (islink) 8787 ill_capability_probe(ill); 8788 else 8789 ill_capability_reset(ill, B_FALSE); 8790 } 8791 ipsq_current_finish(ipsq); 8792 done: 8793 if (entered_ipsq) 8794 ipsq_exit(ipsq); 8795 8796 return (err); 8797 } 8798 8799 /* 8800 * Search the ioctl command in the ioctl tables and return a pointer 8801 * to the ioctl command information. The ioctl command tables are 8802 * static and fully populated at compile time. 8803 */ 8804 ip_ioctl_cmd_t * 8805 ip_sioctl_lookup(int ioc_cmd) 8806 { 8807 int index; 8808 ip_ioctl_cmd_t *ipip; 8809 ip_ioctl_cmd_t *ipip_end; 8810 8811 if (ioc_cmd == IPI_DONTCARE) 8812 return (NULL); 8813 8814 /* 8815 * Do a 2 step search. First search the indexed table 8816 * based on the least significant byte of the ioctl cmd. 8817 * If we don't find a match, then search the misc table 8818 * serially. 8819 */ 8820 index = ioc_cmd & 0xFF; 8821 if (index < ip_ndx_ioctl_count) { 8822 ipip = &ip_ndx_ioctl_table[index]; 8823 if (ipip->ipi_cmd == ioc_cmd) { 8824 /* Found a match in the ndx table */ 8825 return (ipip); 8826 } 8827 } 8828 8829 /* Search the misc table */ 8830 ipip_end = &ip_misc_ioctl_table[ip_misc_ioctl_count]; 8831 for (ipip = ip_misc_ioctl_table; ipip < ipip_end; ipip++) { 8832 if (ipip->ipi_cmd == ioc_cmd) 8833 /* Found a match in the misc table */ 8834 return (ipip); 8835 } 8836 8837 return (NULL); 8838 } 8839 8840 /* 8841 * Wrapper function for resuming deferred ioctl processing. 8842 * Used for SIOCGDSTINFO, SIOCGIP6ADDRPOLICY, SIOCGMSFILTER, 8843 * SIOCSMSFILTER, SIOCGIPMSFILTER, and SIOCSIPMSFILTER currently. 8844 */ 8845 /* ARGSUSED */ 8846 void 8847 ip_sioctl_copyin_resume(ipsq_t *dummy_ipsq, queue_t *q, mblk_t *mp, 8848 void *dummy_arg) 8849 { 8850 ip_sioctl_copyin_setup(q, mp); 8851 } 8852 8853 /* 8854 * ip_sioctl_copyin_setup is called by ip_wput_nondata with any M_IOCTL message 8855 * that arrives. Most of the IOCTLs are "socket" IOCTLs which we handle 8856 * in either I_STR or TRANSPARENT form, using the mi_copy facility. 8857 * We establish here the size of the block to be copied in. mi_copyin 8858 * arranges for this to happen, and processing continues in ip_wput_nondata 8859 * with an M_IOCDATA message. 8860 */ 8861 void 8862 ip_sioctl_copyin_setup(queue_t *q, mblk_t *mp) 8863 { 8864 int copyin_size; 8865 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8866 ip_ioctl_cmd_t *ipip; 8867 cred_t *cr; 8868 ip_stack_t *ipst; 8869 8870 if (CONN_Q(q)) 8871 ipst = CONNQ_TO_IPST(q); 8872 else 8873 ipst = ILLQ_TO_IPST(q); 8874 8875 ipip = ip_sioctl_lookup(iocp->ioc_cmd); 8876 if (ipip == NULL) { 8877 /* 8878 * The ioctl is not one we understand or own. 8879 * Pass it along to be processed downstream, 8880 * if this is a module instance of IP, else nak 8881 * the ioctl. 8882 */ 8883 if (q->q_next == NULL) { 8884 goto nak; 8885 } else { 8886 putnext(q, mp); 8887 return; 8888 } 8889 } 8890 8891 /* 8892 * If this is deferred, then we will do all the checks when we 8893 * come back. 8894 */ 8895 if ((iocp->ioc_cmd == SIOCGDSTINFO || 8896 iocp->ioc_cmd == SIOCGIP6ADDRPOLICY) && !ip6_asp_can_lookup(ipst)) { 8897 ip6_asp_pending_op(q, mp, ip_sioctl_copyin_resume); 8898 return; 8899 } 8900 8901 /* 8902 * Only allow a very small subset of IP ioctls on this stream if 8903 * IP is a module and not a driver.
Allowing ioctls to be processed 8904 * in this case may cause assert failures or data corruption. 8905 * Typically G[L]IFFLAGS, SLIFNAME/IF_UNITSEL are the only few 8906 * ioctls allowed on an IP module stream, after which this stream 8907 * normally becomes a multiplexor (at which time the stream head 8908 * will fail all ioctls). 8909 */ 8910 if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) { 8911 goto nak; 8912 } 8913 8914 /* Make sure we have ioctl data to process. */ 8915 if (mp->b_cont == NULL && !(ipip->ipi_flags & IPI_NULL_BCONT)) 8916 goto nak; 8917 8918 /* 8919 * Prefer dblk credential over ioctl credential; some synthesized 8920 * ioctls have kcred set because there's no way to crhold() 8921 * a credential in some contexts. (ioc_cr is not crfree() by 8922 * the framework; the caller of ioctl needs to hold the reference 8923 * for the duration of the call). 8924 */ 8925 cr = msg_getcred(mp, NULL); 8926 if (cr == NULL) 8927 cr = iocp->ioc_cr; 8928 8929 /* Make sure normal users don't send down privileged ioctls */ 8930 if ((ipip->ipi_flags & IPI_PRIV) && 8931 (cr != NULL) && secpolicy_ip_config(cr, B_TRUE) != 0) { 8932 /* We checked the privilege earlier but log it here */ 8933 miocnak(q, mp, 0, secpolicy_ip_config(cr, B_FALSE)); 8934 return; 8935 } 8936 8937 /* 8938 * The ioctl command tables can only encode fixed length 8939 * ioctl data. If the length is variable, the table will 8940 * encode the length as zero. Such special cases are handled 8941 * below in the switch. 8942 */ 8943 if (ipip->ipi_copyin_size != 0) { 8944 mi_copyin(q, mp, NULL, ipip->ipi_copyin_size); 8945 return; 8946 } 8947 8948 switch (iocp->ioc_cmd) { 8949 case O_SIOCGIFCONF: 8950 case SIOCGIFCONF: 8951 /* 8952 * This IOCTL is hilarious. See comments in 8953 * ip_sioctl_get_ifconf for the story. 8954 */ 8955 if (iocp->ioc_count == TRANSPARENT) 8956 copyin_size = SIZEOF_STRUCT(ifconf, 8957 iocp->ioc_flag); 8958 else 8959 copyin_size = iocp->ioc_count; 8960 mi_copyin(q, mp, NULL, copyin_size); 8961 return; 8962 8963 case O_SIOCGLIFCONF: 8964 case SIOCGLIFCONF: 8965 copyin_size = SIZEOF_STRUCT(lifconf, iocp->ioc_flag); 8966 mi_copyin(q, mp, NULL, copyin_size); 8967 return; 8968 8969 case SIOCGLIFSRCOF: 8970 copyin_size = SIZEOF_STRUCT(lifsrcof, iocp->ioc_flag); 8971 mi_copyin(q, mp, NULL, copyin_size); 8972 return; 8973 case SIOCGIP6ADDRPOLICY: 8974 ip_sioctl_ip6addrpolicy(q, mp); 8975 ip6_asp_table_refrele(ipst); 8976 return; 8977 8978 case SIOCSIP6ADDRPOLICY: 8979 ip_sioctl_ip6addrpolicy(q, mp); 8980 return; 8981 8982 case SIOCGDSTINFO: 8983 ip_sioctl_dstinfo(q, mp); 8984 ip6_asp_table_refrele(ipst); 8985 return; 8986 8987 case I_PLINK: 8988 case I_PUNLINK: 8989 case I_LINK: 8990 case I_UNLINK: 8991 /* 8992 * We treat non-persistent link similarly as the persistent 8993 * link case, in terms of plumbing/unplumbing, as well as 8994 * dynamic re-plumbing events indicator. See comments 8995 * in ip_sioctl_plink() for more. 8996 * 8997 * Request can be enqueued in the 'ipsq' while waiting 8998 * to become exclusive. So bump up the conn ref. 8999 */ 9000 if (CONN_Q(q)) 9001 CONN_INC_REF(Q_TO_CONN(q)); 9002 ip_sioctl_plink(NULL, q, mp, NULL); 9003 return; 9004 9005 case ND_GET: 9006 case ND_SET: 9007 /* 9008 * Use of the nd table requires holding the reader lock. 9009 * Modifying the nd table thru nd_load/nd_unload requires 9010 * the writer lock. 
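 *
 * (These are the requests generated by ndd(1M); for illustration,
 * "ndd /dev/ip ip_forwarding" arrives as an ND_GET, and
 * "ndd -set /dev/ip ip_forwarding 1" as an ND_SET.)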
9011 */ 9012 rw_enter(&ipst->ips_ip_g_nd_lock, RW_READER); 9013 if (nd_getset(q, ipst->ips_ip_g_nd, mp)) { 9014 rw_exit(&ipst->ips_ip_g_nd_lock); 9015 9016 if (iocp->ioc_error) 9017 iocp->ioc_count = 0; 9018 mp->b_datap->db_type = M_IOCACK; 9019 qreply(q, mp); 9020 return; 9021 } 9022 rw_exit(&ipst->ips_ip_g_nd_lock); 9023 /* 9024 * We don't understand this subioctl of ND_GET / ND_SET. 9025 * Maybe intended for some driver / module below us 9026 */ 9027 if (q->q_next) { 9028 putnext(q, mp); 9029 } else { 9030 iocp->ioc_error = ENOENT; 9031 mp->b_datap->db_type = M_IOCNAK; 9032 iocp->ioc_count = 0; 9033 qreply(q, mp); 9034 } 9035 return; 9036 9037 case IP_IOCTL: 9038 ip_wput_ioctl(q, mp); 9039 return; 9040 9041 case SIOCILB: 9042 /* The ioctl length varies depending on the ILB command. */ 9043 copyin_size = iocp->ioc_count; 9044 if (copyin_size < sizeof (ilb_cmd_t)) 9045 goto nak; 9046 mi_copyin(q, mp, NULL, copyin_size); 9047 return; 9048 9049 default: 9050 cmn_err(CE_PANIC, "should not happen "); 9051 } 9052 nak: 9053 if (mp->b_cont != NULL) { 9054 freemsg(mp->b_cont); 9055 mp->b_cont = NULL; 9056 } 9057 iocp->ioc_error = EINVAL; 9058 mp->b_datap->db_type = M_IOCNAK; 9059 iocp->ioc_count = 0; 9060 qreply(q, mp); 9061 } 9062 9063 static void 9064 ip_sioctl_garp_reply(mblk_t *mp, ill_t *ill, void *hwaddr, int flags) 9065 { 9066 struct arpreq *ar; 9067 struct xarpreq *xar; 9068 mblk_t *tmp; 9069 struct iocblk *iocp; 9070 int x_arp_ioctl = B_FALSE; 9071 int *flagsp; 9072 char *storage = NULL; 9073 9074 ASSERT(ill != NULL); 9075 9076 iocp = (struct iocblk *)mp->b_rptr; 9077 ASSERT(iocp->ioc_cmd == SIOCGXARP || iocp->ioc_cmd == SIOCGARP); 9078 9079 tmp = (mp->b_cont)->b_cont; /* xarpreq/arpreq */ 9080 if ((iocp->ioc_cmd == SIOCGXARP) || 9081 (iocp->ioc_cmd == SIOCSXARP)) { 9082 x_arp_ioctl = B_TRUE; 9083 xar = (struct xarpreq *)tmp->b_rptr; 9084 flagsp = &xar->xarp_flags; 9085 storage = xar->xarp_ha.sdl_data; 9086 } else { 9087 ar = (struct arpreq *)tmp->b_rptr; 9088 flagsp = &ar->arp_flags; 9089 storage = ar->arp_ha.sa_data; 9090 } 9091 9092 /* 9093 * We're done if this is not an SIOCG{X}ARP 9094 */ 9095 if (x_arp_ioctl) { 9096 storage += ill_xarp_info(&xar->xarp_ha, ill); 9097 if ((ill->ill_phys_addr_length + ill->ill_name_length) > 9098 sizeof (xar->xarp_ha.sdl_data)) { 9099 iocp->ioc_error = EINVAL; 9100 return; 9101 } 9102 } 9103 *flagsp = ATF_INUSE; 9104 /* 9105 * If /sbin/arp told us we are the authority using the "permanent" 9106 * flag, or if this is one of my addresses print "permanent" 9107 * in the /sbin/arp output. 9108 */ 9109 if ((flags & NCE_F_MYADDR) || (flags & NCE_F_AUTHORITY)) 9110 *flagsp |= ATF_AUTHORITY; 9111 if (flags & NCE_F_NONUD) 9112 *flagsp |= ATF_PERM; /* not subject to aging */ 9113 if (flags & NCE_F_PUBLISH) 9114 *flagsp |= ATF_PUBL; 9115 if (hwaddr != NULL) { 9116 *flagsp |= ATF_COM; 9117 bcopy((char *)hwaddr, storage, ill->ill_phys_addr_length); 9118 } 9119 } 9120 9121 /* 9122 * Create a new logical interface. If ipif_id is zero (i.e. not a logical 9123 * interface) create the next available logical interface for this 9124 * physical interface. 9125 * If ipif is NULL (i.e. the lookup didn't find one) attempt to create an 9126 * ipif with the specified name. 9127 * 9128 * If the address family is not AF_UNSPEC then set the address as well. 9129 * 9130 * If ip_sioctl_addr returns EINPROGRESS then the ioctl (the copyout) 9131 * is completed when the DL_BIND_ACK arrive in ip_rput_dlpi_writer. 9132 * 9133 * Executed as a writer on the ill. 
9134 * So no lock is needed to traverse the ipif chain, or examine the 9135 * phyint flags. 9136 */ 9137 /* ARGSUSED */ 9138 int 9139 ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9140 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 9141 { 9142 mblk_t *mp1; 9143 struct lifreq *lifr; 9144 boolean_t isv6; 9145 boolean_t exists; 9146 char *name; 9147 char *endp; 9148 char *cp; 9149 int namelen; 9150 ipif_t *ipif; 9151 long id; 9152 ipsq_t *ipsq; 9153 ill_t *ill; 9154 sin_t *sin; 9155 int err = 0; 9156 boolean_t found_sep = B_FALSE; 9157 conn_t *connp; 9158 zoneid_t zoneid; 9159 ip_stack_t *ipst = CONNQ_TO_IPST(q); 9160 9161 ASSERT(q->q_next == NULL); 9162 ip1dbg(("ip_sioctl_addif\n")); 9163 /* Existence of mp1 has been checked in ip_wput_nondata */ 9164 mp1 = mp->b_cont->b_cont; 9165 /* 9166 * Null-terminate the string to protect against buffer 9167 * overrun. String was generated by user code and may not 9168 * be trusted. 9169 */ 9170 lifr = (struct lifreq *)mp1->b_rptr; 9171 lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; 9172 name = lifr->lifr_name; 9173 ASSERT(CONN_Q(q)); 9174 connp = Q_TO_CONN(q); 9175 isv6 = (connp->conn_family == AF_INET6); 9176 zoneid = connp->conn_zoneid; 9177 namelen = mi_strlen(name); 9178 if (namelen == 0) 9179 return (EINVAL); 9180 9181 exists = B_FALSE; 9182 if ((namelen + 1 == sizeof (ipif_loopback_name)) && 9183 (mi_strcmp(name, ipif_loopback_name) == 0)) { 9184 /* 9185 * Allow creating lo0 using SIOCLIFADDIF. 9186 * There can't be any other writer thread, so it is safe 9187 * to do the ipif_lookup_on_name() below. 9188 */ 9189 ipif = ipif_lookup_on_name(lifr->lifr_name, namelen, B_TRUE, 9190 &exists, isv6, zoneid, ipst); 9191 /* Prevent any further action */ 9192 if (ipif == NULL) { 9193 return (ENOBUFS); 9194 } else if (!exists) { 9195 /* We created the ipif now and as writer */ 9196 ipif_refrele(ipif); 9197 return (0); 9198 } else { 9199 ill = ipif->ipif_ill; 9200 ill_refhold(ill); 9201 ipif_refrele(ipif); 9202 } 9203 } else { 9204 /* Look for a colon in the name. */ 9205 endp = &name[namelen]; 9206 for (cp = endp; --cp > name; ) { 9207 if (*cp == IPIF_SEPARATOR_CHAR) { 9208 found_sep = B_TRUE; 9209 /* 9210 * Reject any non-decimal aliases for plumbing 9211 * of logical interfaces. Aliases with leading 9212 * zeroes are also rejected as they introduce 9213 * ambiguity in the naming of the interfaces. 9214 * Comparing with "0" takes care of all such 9215 * cases. 9216 */ 9217 if ((strncmp("0", cp+1, 1)) == 0) 9218 return (EINVAL); 9219 9220 if (ddi_strtol(cp+1, &endp, 10, &id) != 0 || 9221 id <= 0 || *endp != '\0') { 9222 return (EINVAL); 9223 } 9224 *cp = '\0'; 9225 break; 9226 } 9227 } 9228 ill = ill_lookup_on_name(name, B_FALSE, isv6, NULL, ipst); 9229 if (found_sep) 9230 *cp = IPIF_SEPARATOR_CHAR; 9231 if (ill == NULL) 9232 return (ENXIO); 9233 } 9234 9235 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_process_ioctl, NEW_OP, 9236 B_TRUE); 9237 9238 /* 9239 * Release the refhold due to the lookup, now that we are excl 9240 * or we are just returning 9241 */ 9242 ill_refrele(ill); 9243 9244 if (ipsq == NULL) 9245 return (EINPROGRESS); 9246 9247 /* We are now exclusive on the IPSQ */ 9248 ASSERT(IAM_WRITER_ILL(ill)); 9249 9250 if (found_sep) { 9251 /* Now see if there is an IPIF with this unit number. */ 9252 for (ipif = ill->ill_ipif; ipif != NULL; 9253 ipif = ipif->ipif_next) { 9254 if (ipif->ipif_id == id) { 9255 err = EEXIST; 9256 goto done; 9257 } 9258 } 9259 } 9260 9261 /* 9262 * We use IRE_LOCAL for lo0:1 etc.
for "receive only" use 9263 * of lo0. Plumbing for lo0:0 happens in ipif_lookup_on_name() 9264 * instead. 9265 */ 9266 if ((ipif = ipif_allocate(ill, found_sep ? id : -1, IRE_LOCAL, 9267 B_TRUE, B_TRUE, &err)) == NULL) { 9268 goto done; 9269 } 9270 9271 /* Return created name with ioctl */ 9272 (void) sprintf(lifr->lifr_name, "%s%c%d", ill->ill_name, 9273 IPIF_SEPARATOR_CHAR, ipif->ipif_id); 9274 ip1dbg(("created %s\n", lifr->lifr_name)); 9275 9276 /* Set address */ 9277 sin = (sin_t *)&lifr->lifr_addr; 9278 if (sin->sin_family != AF_UNSPEC) { 9279 err = ip_sioctl_addr(ipif, sin, q, mp, 9280 &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], lifr); 9281 } 9282 9283 done: 9284 ipsq_exit(ipsq); 9285 return (err); 9286 } 9287 9288 /* 9289 * Remove an existing logical interface. If ipif_id is zero (i.e. not a logical 9290 * interface) delete it based on the IP address (on this physical interface). 9291 * Otherwise delete it based on the ipif_id. 9292 * Also, special handling to allow a removeif of lo0. 9293 */ 9294 /* ARGSUSED */ 9295 int 9296 ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9297 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 9298 { 9299 conn_t *connp; 9300 ill_t *ill = ipif->ipif_ill; 9301 boolean_t success; 9302 ip_stack_t *ipst; 9303 9304 ipst = CONNQ_TO_IPST(q); 9305 9306 ASSERT(q->q_next == NULL); 9307 ip1dbg(("ip_sioctl_remove_if(%s:%u %p)\n", 9308 ill->ill_name, ipif->ipif_id, (void *)ipif)); 9309 ASSERT(IAM_WRITER_IPIF(ipif)); 9310 9311 connp = Q_TO_CONN(q); 9312 /* 9313 * Special case for unplumbing lo0 (the loopback physical interface). 9314 * If unplumbing lo0, the incoming address structure has been 9315 * initialized to all zeros. When unplumbing lo0, all its logical 9316 * interfaces must be removed too. 9317 * 9318 * Note that this interface may be called to remove a specific 9319 * loopback logical interface (eg, lo0:1). But in that case 9320 * ipif->ipif_id != 0 so that the code path for that case is the 9321 * same as any other interface (meaning it skips the code directly 9322 * below). 9323 */ 9324 if (ipif->ipif_id == 0 && ill->ill_net_type == IRE_LOOPBACK) { 9325 if (sin->sin_family == AF_UNSPEC && 9326 (IN6_IS_ADDR_UNSPECIFIED(&((sin6_t *)sin)->sin6_addr))) { 9327 /* 9328 * Mark it condemned. No new ref. will be made to ill. 
9329 */ 9330 mutex_enter(&ill->ill_lock); 9331 ill->ill_state_flags |= ILL_CONDEMNED; 9332 for (ipif = ill->ill_ipif; ipif != NULL; 9333 ipif = ipif->ipif_next) { 9334 ipif->ipif_state_flags |= IPIF_CONDEMNED; 9335 } 9336 mutex_exit(&ill->ill_lock); 9337 9338 ipif = ill->ill_ipif; 9339 /* unplumb the loopback interface */ 9340 ill_delete(ill); 9341 mutex_enter(&connp->conn_lock); 9342 mutex_enter(&ill->ill_lock); 9343 9344 /* Are any references to this ill active? */ 9345 if (ill_is_freeable(ill)) { 9346 mutex_exit(&ill->ill_lock); 9347 mutex_exit(&connp->conn_lock); 9348 ill_delete_tail(ill); 9349 mi_free(ill); 9350 return (0); 9351 } 9352 success = ipsq_pending_mp_add(connp, ipif, 9353 CONNP_TO_WQ(connp), mp, ILL_FREE); 9354 mutex_exit(&connp->conn_lock); 9355 mutex_exit(&ill->ill_lock); 9356 if (success) 9357 return (EINPROGRESS); 9358 else 9359 return (EINTR); 9360 } 9361 } 9362 9363 if (ipif->ipif_id == 0) { 9364 ipsq_t *ipsq; 9365 9366 /* Find based on address */ 9367 if (ipif->ipif_isv6) { 9368 sin6_t *sin6; 9369 9370 if (sin->sin_family != AF_INET6) 9371 return (EAFNOSUPPORT); 9372 9373 sin6 = (sin6_t *)sin; 9374 /* We are a writer, so we should be able to lookup */ 9375 ipif = ipif_lookup_addr_exact_v6(&sin6->sin6_addr, ill, 9376 ipst); 9377 } else { 9378 if (sin->sin_family != AF_INET) 9379 return (EAFNOSUPPORT); 9380 9381 /* We are a writer, so we should be able to lookup */ 9382 ipif = ipif_lookup_addr_exact(sin->sin_addr.s_addr, ill, 9383 ipst); 9384 } 9385 if (ipif == NULL) { 9386 return (EADDRNOTAVAIL); 9387 } 9388 9389 /* 9390 * It is possible for a user to send an SIOCLIFREMOVEIF with 9391 * lifr_name naming the physical interface but with lifr_addr 9392 * holding the IP address of a logical interface plumbed over it. 9393 * So update ipx_current_ipif now that ipif points to the 9394 * correct one. 9395 */ 9396 ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq; 9397 ipsq->ipsq_xop->ipx_current_ipif = ipif; 9398 9399 /* This is a writer */ 9400 ipif_refrele(ipif); 9401 } 9402 9403 /* 9404 * Cannot delete instance zero, since it is tied to the ill. 9405 */ 9406 if (ipif->ipif_id == 0) 9407 return (EBUSY); 9408 9409 mutex_enter(&ill->ill_lock); 9410 ipif->ipif_state_flags |= IPIF_CONDEMNED; 9411 mutex_exit(&ill->ill_lock); 9412 9413 ipif_free(ipif); 9414 9415 mutex_enter(&connp->conn_lock); 9416 mutex_enter(&ill->ill_lock); 9417 9418 /* Are any references to this ipif active? */ 9419 if (ipif_is_freeable(ipif)) { 9420 mutex_exit(&ill->ill_lock); 9421 mutex_exit(&connp->conn_lock); 9422 ipif_non_duplicate(ipif); 9423 (void) ipif_down_tail(ipif); 9424 ipif_free_tail(ipif); /* frees ipif */ 9425 return (0); 9426 } 9427 success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), mp, 9428 IPIF_FREE); 9429 mutex_exit(&ill->ill_lock); 9430 mutex_exit(&connp->conn_lock); 9431 if (success) 9432 return (EINPROGRESS); 9433 else 9434 return (EINTR); 9435 } 9436 9437 /* 9438 * Restart the removeif ioctl. The refcnt has gone down to 0. 9439 * The ipif is already condemned. So can't find it thru lookups.
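 *
 * (This is the deferred tail of SIOCLIFREMOVEIF: ip_sioctl_removeif()
 * returned EINPROGRESS after queueing the ioctl via
 * ipsq_pending_mp_add(), and we are re-dispatched here once the last
 * reference has been dropped.)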
9440 */ 9441 /* ARGSUSED */ 9442 int 9443 ip_sioctl_removeif_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, 9444 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_if_req) 9445 { 9446 ill_t *ill = ipif->ipif_ill; 9447 9448 ASSERT(IAM_WRITER_IPIF(ipif)); 9449 ASSERT(ipif->ipif_state_flags & IPIF_CONDEMNED); 9450 9451 ip1dbg(("ip_sioctl_removeif_restart(%s:%u %p)\n", 9452 ill->ill_name, ipif->ipif_id, (void *)ipif)); 9453 9454 if (ipif->ipif_id == 0 && ill->ill_net_type == IRE_LOOPBACK) { 9455 ASSERT(ill->ill_state_flags & ILL_CONDEMNED); 9456 ill_delete_tail(ill); 9457 mi_free(ill); 9458 return (0); 9459 } 9460 9461 ipif_non_duplicate(ipif); 9462 (void) ipif_down_tail(ipif); 9463 ipif_free_tail(ipif); 9464 9465 return (0); 9466 } 9467 9468 /* 9469 * Set the local interface address. 9470 * Allow an address of all zero when the interface is down. 9471 */ 9472 /* ARGSUSED */ 9473 int 9474 ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9475 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 9476 { 9477 int err = 0; 9478 in6_addr_t v6addr; 9479 boolean_t need_up = B_FALSE; 9480 9481 ip1dbg(("ip_sioctl_addr(%s:%u %p)\n", 9482 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9483 9484 ASSERT(IAM_WRITER_IPIF(ipif)); 9485 9486 if (ipif->ipif_isv6) { 9487 sin6_t *sin6; 9488 ill_t *ill; 9489 phyint_t *phyi; 9490 9491 if (sin->sin_family != AF_INET6) 9492 return (EAFNOSUPPORT); 9493 9494 sin6 = (sin6_t *)sin; 9495 v6addr = sin6->sin6_addr; 9496 ill = ipif->ipif_ill; 9497 phyi = ill->ill_phyint; 9498 9499 /* 9500 * Enforce that true multicast interfaces have a link-local 9501 * address for logical unit 0. 9502 */ 9503 if (ipif->ipif_id == 0 && 9504 (ill->ill_flags & ILLF_MULTICAST) && 9505 !(ipif->ipif_flags & (IPIF_POINTOPOINT)) && 9506 !(phyi->phyint_flags & (PHYI_LOOPBACK)) && 9507 !IN6_IS_ADDR_LINKLOCAL(&v6addr)) { 9508 return (EADDRNOTAVAIL); 9509 } 9510 9511 /* 9512 * up interfaces shouldn't have the unspecified address 9513 * unless they also have the IPIF_NOLOCAL flags set and 9514 * have a subnet assigned. 9515 */ 9516 if ((ipif->ipif_flags & IPIF_UP) && 9517 IN6_IS_ADDR_UNSPECIFIED(&v6addr) && 9518 (!(ipif->ipif_flags & IPIF_NOLOCAL) || 9519 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) { 9520 return (EADDRNOTAVAIL); 9521 } 9522 9523 if (!ip_local_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask)) 9524 return (EADDRNOTAVAIL); 9525 } else { 9526 ipaddr_t addr; 9527 9528 if (sin->sin_family != AF_INET) 9529 return (EAFNOSUPPORT); 9530 9531 addr = sin->sin_addr.s_addr; 9532 9533 /* Allow 0 as the local address. */ 9534 if (addr != 0 && !ip_addr_ok_v4(addr, ipif->ipif_net_mask)) 9535 return (EADDRNOTAVAIL); 9536 9537 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 9538 } 9539 9540 /* 9541 * Even if there is no change we redo things just to rerun 9542 * ipif_set_default. 9543 */ 9544 if (ipif->ipif_flags & IPIF_UP) { 9545 /* 9546 * Setting a new local address, make sure 9547 * we have net and subnet bcast ire's for 9548 * the old address if we need them. 9549 */ 9550 /* 9551 * If the interface is already marked up, 9552 * we call ipif_down which will take care 9553 * of ditching any IREs that have been set 9554 * up based on the old interface address. 
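 *
 * (Aside, not in the original source: this logical down/up is what a
 * plain address change drives.  Reusing the socket and lifreq setup
 * from the SIOCLIFREMOVEIF sketch earlier -- the name and address are
 * illustrative only, and inet_addr() needs <arpa/inet.h>:
 *
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&lifr.lifr_addr;
 *
 *	(void) strlcpy(lifr.lifr_name, "hme0", sizeof (lifr.lifr_name));
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("192.0.2.10");
 *	if (ioctl(s, SIOCSLIFADDR, &lifr) == -1)
 *		perror("SIOCSLIFADDR");
 *
 * which lands in ip_sioctl_addr() via the ioctl table.)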
9555 */ 9556 err = ipif_logical_down(ipif, q, mp); 9557 if (err == EINPROGRESS) 9558 return (err); 9559 (void) ipif_down_tail(ipif); 9560 need_up = B_TRUE; 9561 } 9562 9563 err = ip_sioctl_addr_tail(ipif, sin, q, mp, need_up); 9564 return (err); 9565 } 9566 9567 int 9568 ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9569 boolean_t need_up) 9570 { 9571 in6_addr_t v6addr; 9572 in6_addr_t ov6addr; 9573 ipaddr_t addr; 9574 sin6_t *sin6; 9575 int sinlen; 9576 int err = 0; 9577 ill_t *ill = ipif->ipif_ill; 9578 boolean_t need_dl_down; 9579 boolean_t need_arp_down; 9580 struct iocblk *iocp; 9581 9582 iocp = (mp != NULL) ? (struct iocblk *)mp->b_rptr : NULL; 9583 9584 ip1dbg(("ip_sioctl_addr_tail(%s:%u %p)\n", 9585 ill->ill_name, ipif->ipif_id, (void *)ipif)); 9586 ASSERT(IAM_WRITER_IPIF(ipif)); 9587 9588 /* Must cancel any pending timer before taking the ill_lock */ 9589 if (ipif->ipif_recovery_id != 0) 9590 (void) untimeout(ipif->ipif_recovery_id); 9591 ipif->ipif_recovery_id = 0; 9592 9593 if (ipif->ipif_isv6) { 9594 sin6 = (sin6_t *)sin; 9595 v6addr = sin6->sin6_addr; 9596 sinlen = sizeof (struct sockaddr_in6); 9597 } else { 9598 addr = sin->sin_addr.s_addr; 9599 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 9600 sinlen = sizeof (struct sockaddr_in); 9601 } 9602 mutex_enter(&ill->ill_lock); 9603 ov6addr = ipif->ipif_v6lcl_addr; 9604 ipif->ipif_v6lcl_addr = v6addr; 9605 sctp_update_ipif_addr(ipif, ov6addr); 9606 ipif->ipif_addr_ready = 0; 9607 9608 ip_rts_newaddrmsg(RTM_CHGADDR, 0, ipif, RTSQ_DEFAULT); 9609 9610 /* 9611 * If the interface was previously marked as a duplicate, then since 9612 * we've now got a "new" address, it should no longer be considered a 9613 * duplicate -- even if the "new" address is the same as the old one. 9614 * Note that if all ipifs are down, we may have a pending ARP down 9615 * event to handle. This is because we want to recover from duplicates 9616 * and thus delay tearing down ARP until the duplicates have been 9617 * removed or disabled. 9618 */ 9619 need_dl_down = need_arp_down = B_FALSE; 9620 if (ipif->ipif_flags & IPIF_DUPLICATE) { 9621 need_arp_down = !need_up; 9622 ipif->ipif_flags &= ~IPIF_DUPLICATE; 9623 if (--ill->ill_ipif_dup_count == 0 && !need_up && 9624 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) { 9625 need_dl_down = B_TRUE; 9626 } 9627 } 9628 9629 ipif_set_default(ipif); 9630 9631 /* 9632 * If we've just manually set the IPv6 link-local address (0th ipif), 9633 * tag the ill so that future updates to the interface ID don't result 9634 * in this address getting automatically reconfigured from under the 9635 * administrator. 9636 */ 9637 if (ipif->ipif_isv6 && ipif->ipif_id == 0) 9638 ill->ill_manual_linklocal = 1; 9639 9640 /* 9641 * When publishing an interface address change event, we only notify 9642 * the event listeners of the new address. It is assumed that if they 9643 * actively care about the addresses assigned, they will have 9644 * already discovered the previous address (if there was one). 9645 * 9646 * Don't attach nic event message for SIOCLIFADDIF ioctl. 9647 */ 9648 if (iocp != NULL && iocp->ioc_cmd != SIOCLIFADDIF) { 9649 ill_nic_event_dispatch(ill, MAP_IPIF_ID(ipif->ipif_id), 9650 NE_ADDRESS_CHANGE, sin, sinlen); 9651 } 9652 9653 mutex_exit(&ill->ill_lock); 9654 9655 if (need_up) { 9656 /* 9657 * Now bring the interface back up.
If this 9658 * is the only IPIF for the ILL, ipif_up 9659 * will have to re-bind to the device, so 9660 * we may get back EINPROGRESS, in which 9661 * case, this IOCTL will get completed in 9662 * ip_rput_dlpi when we see the DL_BIND_ACK. 9663 */ 9664 err = ipif_up(ipif, q, mp); 9665 } else { 9666 /* Perhaps ilgs should use this ill */ 9667 update_conn_ill(NULL, ill->ill_ipst); 9668 } 9669 9670 if (need_dl_down) 9671 ill_dl_down(ill); 9672 9673 if (need_arp_down && !ill->ill_isv6) 9674 (void) ipif_arp_down(ipif); 9675 9676 /* 9677 * The default multicast interface might have changed (for 9678 * instance if the IPv6 scope of the address changed) 9679 */ 9680 ire_increment_multicast_generation(ill->ill_ipst, ill->ill_isv6); 9681 9682 return (err); 9683 } 9684 9685 /* 9686 * Restart entry point to restart the address set operation after the 9687 * refcounts have dropped to zero. 9688 */ 9689 /* ARGSUSED */ 9690 int 9691 ip_sioctl_addr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9692 ip_ioctl_cmd_t *ipip, void *ifreq) 9693 { 9694 ip1dbg(("ip_sioctl_addr_restart(%s:%u %p)\n", 9695 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9696 ASSERT(IAM_WRITER_IPIF(ipif)); 9697 (void) ipif_down_tail(ipif); 9698 return (ip_sioctl_addr_tail(ipif, sin, q, mp, B_TRUE)); 9699 } 9700 9701 /* ARGSUSED */ 9702 int 9703 ip_sioctl_get_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9704 ip_ioctl_cmd_t *ipip, void *if_req) 9705 { 9706 sin6_t *sin6 = (struct sockaddr_in6 *)sin; 9707 struct lifreq *lifr = (struct lifreq *)if_req; 9708 9709 ip1dbg(("ip_sioctl_get_addr(%s:%u %p)\n", 9710 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9711 /* 9712 * The net mask and address can't change since we have a 9713 * reference to the ipif. So no lock is necessary. 9714 */ 9715 if (ipif->ipif_isv6) { 9716 *sin6 = sin6_null; 9717 sin6->sin6_family = AF_INET6; 9718 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 9719 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 9720 lifr->lifr_addrlen = 9721 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 9722 } else { 9723 *sin = sin_null; 9724 sin->sin_family = AF_INET; 9725 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 9726 if (ipip->ipi_cmd_type == LIF_CMD) { 9727 lifr->lifr_addrlen = 9728 ip_mask_to_plen(ipif->ipif_net_mask); 9729 } 9730 } 9731 return (0); 9732 } 9733 9734 /* 9735 * Set the destination address for a pt-pt interface. 
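 *
 * Illustrative userland usage, on the same pattern as the earlier
 * sketches (the point-to-point interface name "ipdptp0" and the peer
 * address are assumptions, not from this file):
 *
 *	struct sockaddr_in *sin =
 *	    (struct sockaddr_in *)&lifr.lifr_dstaddr;
 *
 *	(void) strlcpy(lifr.lifr_name, "ipdptp0",
 *	    sizeof (lifr.lifr_name));
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("192.0.2.1");
 *	if (ioctl(s, SIOCSLIFDSTADDR, &lifr) == -1)
 *		perror("SIOCSLIFDSTADDR");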
9736 */ 9737 /* ARGSUSED */ 9738 int 9739 ip_sioctl_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9740 ip_ioctl_cmd_t *ipip, void *if_req) 9741 { 9742 int err = 0; 9743 in6_addr_t v6addr; 9744 boolean_t need_up = B_FALSE; 9745 9746 ip1dbg(("ip_sioctl_dstaddr(%s:%u %p)\n", 9747 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9748 ASSERT(IAM_WRITER_IPIF(ipif)); 9749 9750 if (ipif->ipif_isv6) { 9751 sin6_t *sin6; 9752 9753 if (sin->sin_family != AF_INET6) 9754 return (EAFNOSUPPORT); 9755 9756 sin6 = (sin6_t *)sin; 9757 v6addr = sin6->sin6_addr; 9758 9759 if (!ip_remote_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask)) 9760 return (EADDRNOTAVAIL); 9761 } else { 9762 ipaddr_t addr; 9763 9764 if (sin->sin_family != AF_INET) 9765 return (EAFNOSUPPORT); 9766 9767 addr = sin->sin_addr.s_addr; 9768 if (!ip_addr_ok_v4(addr, ipif->ipif_net_mask)) 9769 return (EADDRNOTAVAIL); 9770 9771 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 9772 } 9773 9774 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr, &v6addr)) 9775 return (0); /* No change */ 9776 9777 if (ipif->ipif_flags & IPIF_UP) { 9778 /* 9779 * If the interface is already marked up, 9780 * we call ipif_down which will take care 9781 * of ditching any IREs that have been set 9782 * up based on the old pp dst address. 9783 */ 9784 err = ipif_logical_down(ipif, q, mp); 9785 if (err == EINPROGRESS) 9786 return (err); 9787 (void) ipif_down_tail(ipif); 9788 need_up = B_TRUE; 9789 } 9790 /* 9791 * could return EINPROGRESS. If so ioctl will complete in 9792 * ip_rput_dlpi_writer 9793 */ 9794 err = ip_sioctl_dstaddr_tail(ipif, sin, q, mp, need_up); 9795 return (err); 9796 } 9797 9798 static int 9799 ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9800 boolean_t need_up) 9801 { 9802 in6_addr_t v6addr; 9803 ill_t *ill = ipif->ipif_ill; 9804 int err = 0; 9805 boolean_t need_dl_down; 9806 boolean_t need_arp_down; 9807 9808 ip1dbg(("ip_sioctl_dstaddr_tail(%s:%u %p)\n", ill->ill_name, 9809 ipif->ipif_id, (void *)ipif)); 9810 9811 /* Must cancel any pending timer before taking the ill_lock */ 9812 if (ipif->ipif_recovery_id != 0) 9813 (void) untimeout(ipif->ipif_recovery_id); 9814 ipif->ipif_recovery_id = 0; 9815 9816 if (ipif->ipif_isv6) { 9817 sin6_t *sin6; 9818 9819 sin6 = (sin6_t *)sin; 9820 v6addr = sin6->sin6_addr; 9821 } else { 9822 ipaddr_t addr; 9823 9824 addr = sin->sin_addr.s_addr; 9825 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 9826 } 9827 mutex_enter(&ill->ill_lock); 9828 /* Set point to point destination address. */ 9829 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 9830 /* 9831 * Allow this as a means of creating logical 9832 * pt-pt interfaces on top of e.g. an Ethernet. 9833 * XXX Undocumented HACK for testing. 9834 * pt-pt interfaces are created with NUD disabled. 9835 */ 9836 ipif->ipif_flags |= IPIF_POINTOPOINT; 9837 ipif->ipif_flags &= ~IPIF_BROADCAST; 9838 if (ipif->ipif_isv6) 9839 ill->ill_flags |= ILLF_NONUD; 9840 } 9841 9842 /* 9843 * If the interface was previously marked as a duplicate, then since 9844 * we've now got a "new" address, it should no longer be considered a 9845 * duplicate -- even if the "new" address is the same as the old one. 9846 * Note that if all ipifs are down, we may have a pending ARP down 9847 * event to handle. 
9848 */ 9849 need_dl_down = need_arp_down = B_FALSE; 9850 if (ipif->ipif_flags & IPIF_DUPLICATE) { 9851 need_arp_down = !need_up; 9852 ipif->ipif_flags &= ~IPIF_DUPLICATE; 9853 if (--ill->ill_ipif_dup_count == 0 && !need_up && 9854 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) { 9855 need_dl_down = B_TRUE; 9856 } 9857 } 9858 9859 /* 9860 * If we've just manually set the IPv6 destination link-local address 9861 * (0th ipif), tag the ill so that future updates to the destination 9862 * interface ID (as can happen with interfaces over IP tunnels) don't 9863 * result in this address getting automatically reconfigured from 9864 * under the administrator. 9865 */ 9866 if (ipif->ipif_isv6 && ipif->ipif_id == 0) 9867 ill->ill_manual_dst_linklocal = 1; 9868 9869 /* Set the new address. */ 9870 ipif->ipif_v6pp_dst_addr = v6addr; 9871 /* Make sure subnet tracks pp_dst */ 9872 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 9873 mutex_exit(&ill->ill_lock); 9874 9875 if (need_up) { 9876 /* 9877 * Now bring the interface back up. If this 9878 * is the only IPIF for the ILL, ipif_up 9879 * will have to re-bind to the device, so 9880 * we may get back EINPROGRESS, in which 9881 * case, this IOCTL will get completed in 9882 * ip_rput_dlpi when we see the DL_BIND_ACK. 9883 */ 9884 err = ipif_up(ipif, q, mp); 9885 } 9886 9887 if (need_dl_down) 9888 ill_dl_down(ill); 9889 if (need_arp_down && !ipif->ipif_isv6) 9890 (void) ipif_arp_down(ipif); 9891 9892 return (err); 9893 } 9894 9895 /* 9896 * Restart entry point to restart the dstaddress set operation after the 9897 * refcounts have dropped to zero. 9898 */ 9899 /* ARGSUSED */ 9900 int 9901 ip_sioctl_dstaddr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9902 ip_ioctl_cmd_t *ipip, void *ifreq) 9903 { 9904 ip1dbg(("ip_sioctl_dstaddr_restart(%s:%u %p)\n", 9905 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9906 (void) ipif_down_tail(ipif); 9907 return (ip_sioctl_dstaddr_tail(ipif, sin, q, mp, B_TRUE)); 9908 } 9909 9910 /* ARGSUSED */ 9911 int 9912 ip_sioctl_get_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9913 ip_ioctl_cmd_t *ipip, void *if_req) 9914 { 9915 sin6_t *sin6 = (struct sockaddr_in6 *)sin; 9916 9917 ip1dbg(("ip_sioctl_get_dstaddr(%s:%u %p)\n", 9918 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9919 /* 9920 * Get point to point destination address. The addresses can't 9921 * change since we hold a reference to the ipif. 9922 */ 9923 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) 9924 return (EADDRNOTAVAIL); 9925 9926 if (ipif->ipif_isv6) { 9927 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 9928 *sin6 = sin6_null; 9929 sin6->sin6_family = AF_INET6; 9930 sin6->sin6_addr = ipif->ipif_v6pp_dst_addr; 9931 } else { 9932 *sin = sin_null; 9933 sin->sin_family = AF_INET; 9934 sin->sin_addr.s_addr = ipif->ipif_pp_dst_addr; 9935 } 9936 return (0); 9937 } 9938 9939 /* 9940 * Check which flags will change by the given flags being set 9941 * silently ignore flags which userland is not allowed to control. 9942 * (Because these flags may change between SIOCGLIFFLAGS and 9943 * SIOCSLIFFLAGS, and that's outside of userland's control, 9944 * we need to silently ignore them rather than fail.) 
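 *
 * A worked example of the partition below (flag choices illustrative):
 * if intf_flags = IFF_UP|IFF_BROADCAST and the caller passes
 * flags = IFF_BROADCAST|IFF_PRIVATE, then (flags ^ intf_flags) is
 * IFF_UP|IFF_PRIVATE.  Assuming neither bit is in cantchange_flags,
 * turn_off = intf_flags & turn_on = IFF_UP, and the final
 * turn_on ^= turn_off leaves turn_on = IFF_PRIVATE: the caller is
 * clearing IFF_UP and setting IFF_PRIVATE.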
9945 */ 9946 static void 9947 ip_sioctl_flags_onoff(ipif_t *ipif, uint64_t flags, uint64_t *onp, 9948 uint64_t *offp) 9949 { 9950 ill_t *ill = ipif->ipif_ill; 9951 phyint_t *phyi = ill->ill_phyint; 9952 uint64_t cantchange_flags, intf_flags; 9953 uint64_t turn_on, turn_off; 9954 9955 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags; 9956 cantchange_flags = IFF_CANTCHANGE; 9957 if (IS_IPMP(ill)) 9958 cantchange_flags |= IFF_IPMP_CANTCHANGE; 9959 turn_on = (flags ^ intf_flags) & ~cantchange_flags; 9960 turn_off = intf_flags & turn_on; 9961 turn_on ^= turn_off; 9962 *onp = turn_on; 9963 *offp = turn_off; 9964 } 9965 9966 /* 9967 * Set interface flags. Many flags require special handling (e.g., 9968 * bringing the interface down); see below for details. 9969 * 9970 * NOTE: We really don't enforce that ipif_id zero should be used 9971 * for setting any flags other than IFF_LOGINT_FLAGS. This is 9972 * because applications generally do a SIOCGLIFFLAGS, OR in the new 9973 * flags (those that affect the logical interface), and then do a 9974 * SIOCSLIFFLAGS (see the sketch further below). Thus, "flags" below 9975 * could contain bits other than IFF_LOGINT_FLAGS. One could check 9976 * whether "turn_on" -- the flags that will be turned on -- is correct 9977 * with respect to ipif_id 0. For backward compatibility reasons, it is not done. 9978 */ 9979 /* ARGSUSED */ 9980 int 9981 ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9982 ip_ioctl_cmd_t *ipip, void *if_req) 9983 { 9984 uint64_t turn_on; 9985 uint64_t turn_off; 9986 int err = 0; 9987 phyint_t *phyi; 9988 ill_t *ill; 9989 conn_t *connp; 9990 uint64_t intf_flags; 9991 boolean_t phyint_flags_modified = B_FALSE; 9992 uint64_t flags; 9993 struct ifreq *ifr; 9994 struct lifreq *lifr; 9995 boolean_t set_linklocal = B_FALSE; 9996 9997 ip1dbg(("ip_sioctl_flags(%s:%u %p)\n", 9998 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9999 10000 ASSERT(IAM_WRITER_IPIF(ipif)); 10001 10002 ill = ipif->ipif_ill; 10003 phyi = ill->ill_phyint; 10004 10005 if (ipip->ipi_cmd_type == IF_CMD) { 10006 ifr = (struct ifreq *)if_req; 10007 flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff); 10008 } else { 10009 lifr = (struct lifreq *)if_req; 10010 flags = lifr->lifr_flags; 10011 } 10012 10013 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags; 10014 10015 /* 10016 * Have the flags been set correctly until now? 10017 */ 10018 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0); 10019 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0); 10020 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0); 10021 /* 10022 * Compare the new flags to the old, and partition 10023 * into those coming on and those going off. 10024 * For the 16 bit command keep the bits above bit 16 unchanged. 10025 */ 10026 if (ipip->ipi_cmd == SIOCSIFFLAGS) 10027 flags |= intf_flags & ~0xFFFF; 10028 10029 /* 10030 * Explicitly fail attempts to change flags that are always invalid on 10031 * an IPMP meta-interface. 10032 */ 10033 if (IS_IPMP(ill) && ((flags ^ intf_flags) & IFF_IPMP_INVALID)) 10034 return (EINVAL); 10035 10036 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off); 10037 if ((turn_on|turn_off) == 0) 10038 return (0); /* No change */ 10039 10040 /* 10041 * All test addresses must be IFF_DEPRECATED (to ensure source address 10042 * selection avoids them) -- so force IFF_DEPRECATED on, and do not 10043 * allow it to be turned off.
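 *
 * (For illustration only: the get/modify/set sequence described in the
 * NOTE above ip_sioctl_flags() looks roughly like this in a userland
 * caller, reusing the socket and lifreq setup from the earlier
 * sketches:
 *
 *	(void) strlcpy(lifr.lifr_name, "hme0", sizeof (lifr.lifr_name));
 *	if (ioctl(s, SIOCGLIFFLAGS, &lifr) == -1)
 *		perror("SIOCGLIFFLAGS");
 *	lifr.lifr_flags |= IFF_PRIVATE;
 *	if (ioctl(s, SIOCSLIFFLAGS, &lifr) == -1)
 *		perror("SIOCSLIFFLAGS");
 *
 * Any IFF_CANTCHANGE bits echoed back by SIOCGLIFFLAGS are silently
 * ignored by ip_sioctl_flags_onoff() rather than causing a failure.)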
10044 */ 10045 if ((turn_off & (IFF_DEPRECATED|IFF_NOFAILOVER)) == IFF_DEPRECATED && 10046 (turn_on|intf_flags) & IFF_NOFAILOVER) 10047 return (EINVAL); 10048 10049 if ((connp = Q_TO_CONN(q)) == NULL) 10050 return (EINVAL); 10051 10052 /* 10053 * Only vrrp control socket is allowed to change IFF_UP and 10054 * IFF_NOACCEPT flags when IFF_VRRP is set. 10055 */ 10056 if ((intf_flags & IFF_VRRP) && ((turn_off | turn_on) & IFF_UP)) { 10057 if (!connp->conn_isvrrp) 10058 return (EINVAL); 10059 } 10060 10061 /* 10062 * The IFF_NOACCEPT flag can only be set on an IFF_VRRP IP address by 10063 * VRRP control socket. 10064 */ 10065 if ((turn_off | turn_on) & IFF_NOACCEPT) { 10066 if (!connp->conn_isvrrp || !(intf_flags & IFF_VRRP)) 10067 return (EINVAL); 10068 } 10069 10070 if (turn_on & IFF_NOFAILOVER) { 10071 turn_on |= IFF_DEPRECATED; 10072 flags |= IFF_DEPRECATED; 10073 } 10074 10075 /* 10076 * On underlying interfaces, only allow applications to manage test 10077 * addresses -- otherwise, they may get confused when the address 10078 * moves as part of being brought up. Likewise, prevent an 10079 * application-managed test address from being converted to a data 10080 * address. To prevent migration of administratively up addresses in 10081 * the kernel, we don't allow them to be converted either. 10082 */ 10083 if (IS_UNDER_IPMP(ill)) { 10084 const uint64_t appflags = IFF_DHCPRUNNING | IFF_ADDRCONF; 10085 10086 if ((turn_on & appflags) && !(flags & IFF_NOFAILOVER)) 10087 return (EINVAL); 10088 10089 if ((turn_off & IFF_NOFAILOVER) && 10090 (flags & (appflags | IFF_UP | IFF_DUPLICATE))) 10091 return (EINVAL); 10092 } 10093 10094 /* 10095 * Only allow IFF_TEMPORARY flag to be set on 10096 * IPv6 interfaces. 10097 */ 10098 if ((turn_on & IFF_TEMPORARY) && !(ipif->ipif_isv6)) 10099 return (EINVAL); 10100 10101 /* 10102 * cannot turn off IFF_NOXMIT on VNI interfaces. 10103 */ 10104 if ((turn_off & IFF_NOXMIT) && IS_VNI(ipif->ipif_ill)) 10105 return (EINVAL); 10106 10107 /* 10108 * Don't allow the IFF_ROUTER flag to be turned on on loopback 10109 * interfaces. It makes no sense in that context. 10110 */ 10111 if ((turn_on & IFF_ROUTER) && (phyi->phyint_flags & PHYI_LOOPBACK)) 10112 return (EINVAL); 10113 10114 /* 10115 * For IPv6 ipif_id 0, don't allow the interface to be up without 10116 * a link local address if IFF_NOLOCAL or IFF_ANYCAST are not set. 10117 * If the link local address isn't set, and can be set, it will get 10118 * set later on in this function. 10119 */ 10120 if (ipif->ipif_id == 0 && ipif->ipif_isv6 && 10121 (flags & IFF_UP) && !(flags & (IFF_NOLOCAL|IFF_ANYCAST)) && 10122 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) { 10123 if (ipif_cant_setlinklocal(ipif)) 10124 return (EINVAL); 10125 set_linklocal = B_TRUE; 10126 } 10127 10128 /* 10129 * If we modify physical interface flags, we'll potentially need to 10130 * send up two routing socket messages for the changes (one for the 10131 * IPv4 ill, and another for the IPv6 ill). Note that here. 10132 */ 10133 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS) 10134 phyint_flags_modified = B_TRUE; 10135 10136 /* 10137 * All functioning PHYI_STANDBY interfaces start life PHYI_INACTIVE 10138 * (otherwise, we'd immediately use them, defeating standby). Also, 10139 * since PHYI_INACTIVE has a separate meaning when PHYI_STANDBY is not 10140 * set, don't allow PHYI_STANDBY to be set if PHYI_INACTIVE is already 10141 * set, and clear PHYI_INACTIVE if PHYI_STANDBY is being cleared. 
We 10142 * also don't allow PHYI_STANDBY if VNI is enabled since its semantics 10143 * will not be honored. 10144 */ 10145 if (turn_on & PHYI_STANDBY) { 10146 /* 10147 * No need to grab ill_g_usesrc_lock here; see the 10148 * synchronization notes in ip.c. 10149 */ 10150 if (ill->ill_usesrc_grp_next != NULL || 10151 intf_flags & PHYI_INACTIVE) 10152 return (EINVAL); 10153 if (!(flags & PHYI_FAILED)) { 10154 flags |= PHYI_INACTIVE; 10155 turn_on |= PHYI_INACTIVE; 10156 } 10157 } 10158 10159 if (turn_off & PHYI_STANDBY) { 10160 flags &= ~PHYI_INACTIVE; 10161 turn_off |= PHYI_INACTIVE; 10162 } 10163 10164 /* 10165 * PHYI_FAILED and PHYI_INACTIVE are mutually exclusive; fail if both 10166 * would end up on. 10167 */ 10168 if ((flags & (PHYI_FAILED | PHYI_INACTIVE)) == 10169 (PHYI_FAILED | PHYI_INACTIVE)) 10170 return (EINVAL); 10171 10172 /* 10173 * If ILLF_ROUTER changes, we need to change the ip forwarding 10174 * status of the interface. 10175 */ 10176 if ((turn_on | turn_off) & ILLF_ROUTER) 10177 (void) ill_forward_set(ill, ((turn_on & ILLF_ROUTER) != 0)); 10178 10179 /* 10180 * If the interface is not UP and we are not going to 10181 * bring it UP, record the flags and return. When the 10182 * interface comes UP later, the right actions will be 10183 * taken. 10184 */ 10185 if (!(ipif->ipif_flags & IPIF_UP) && 10186 !(turn_on & IPIF_UP)) { 10187 /* Record new flags in their respective places. */ 10188 mutex_enter(&ill->ill_lock); 10189 mutex_enter(&ill->ill_phyint->phyint_lock); 10190 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS); 10191 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS); 10192 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS); 10193 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS); 10194 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS); 10195 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS); 10196 mutex_exit(&ill->ill_lock); 10197 mutex_exit(&ill->ill_phyint->phyint_lock); 10198 10199 /* 10200 * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the 10201 * same to the kernel: if any of them has been set by 10202 * userland, the interface cannot be used for data traffic. 10203 */ 10204 if ((turn_on|turn_off) & 10205 (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) { 10206 ASSERT(!IS_IPMP(ill)); 10207 /* 10208 * It's possible the ill is part of an "anonymous" 10209 * IPMP group rather than a real group. In that case, 10210 * there are no other interfaces in the group and thus 10211 * no need to call ipmp_phyint_refresh_active(). 10212 */ 10213 if (IS_UNDER_IPMP(ill)) 10214 ipmp_phyint_refresh_active(phyi); 10215 } 10216 10217 if (phyint_flags_modified) { 10218 if (phyi->phyint_illv4 != NULL) { 10219 ip_rts_ifmsg(phyi->phyint_illv4-> 10220 ill_ipif, RTSQ_DEFAULT); 10221 } 10222 if (phyi->phyint_illv6 != NULL) { 10223 ip_rts_ifmsg(phyi->phyint_illv6-> 10224 ill_ipif, RTSQ_DEFAULT); 10225 } 10226 } 10227 /* The default multicast interface might have changed */ 10228 ire_increment_multicast_generation(ill->ill_ipst, 10229 ill->ill_isv6); 10230 10231 return (0); 10232 } else if (set_linklocal) { 10233 mutex_enter(&ill->ill_lock); 10234 if (set_linklocal) 10235 ipif->ipif_state_flags |= IPIF_SET_LINKLOCAL; 10236 mutex_exit(&ill->ill_lock); 10237 } 10238 10239 /* 10240 * Disallow IPv6 interfaces coming up that have the unspecified address, 10241 * or point-to-point interfaces with an unspecified destination. 
We do 10242 * allow the address to be unspecified for IPIF_NOLOCAL interfaces that 10243 * have a subnet assigned, which is how in.ndpd currently manages its 10244 * onlink prefix list when no addresses are configured with those 10245 * prefixes. 10246 */ 10247 if (ipif->ipif_isv6 && 10248 ((IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) && 10249 (!(ipif->ipif_flags & IPIF_NOLOCAL) && !(turn_on & IPIF_NOLOCAL) || 10250 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) || 10251 ((ipif->ipif_flags & IPIF_POINTOPOINT) && 10252 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr)))) { 10253 return (EINVAL); 10254 } 10255 10256 /* 10257 * Prevent IPv4 point-to-point interfaces with a 0.0.0.0 destination 10258 * from being brought up. 10259 */ 10260 if (!ipif->ipif_isv6 && 10261 ((ipif->ipif_flags & IPIF_POINTOPOINT) && 10262 ipif->ipif_pp_dst_addr == INADDR_ANY)) { 10263 return (EINVAL); 10264 } 10265 10266 /* 10267 * If we are going to change one or more of the flags that are 10268 * IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL, ILLF_NOARP, 10269 * ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, IPIF_PREFERRED, and 10270 * IPIF_NOFAILOVER, we will take special action. This is 10271 * done by bring the ipif down, changing the flags and bringing 10272 * it back up again. For IPIF_NOFAILOVER, the act of bringing it 10273 * back up will trigger the address to be moved. 10274 * 10275 * If we are going to change IFF_NOACCEPT, we need to bring 10276 * all the ipifs down then bring them up again. The act of 10277 * bringing all the ipifs back up will trigger the local 10278 * ires being recreated with "no_accept" set/cleared. 10279 * 10280 * Note that ILLF_NOACCEPT is always set separately from the 10281 * other flags. 10282 */ 10283 if ((turn_on|turn_off) & 10284 (IPIF_UP|IPIF_DEPRECATED|IPIF_NOXMIT|IPIF_NOLOCAL|ILLF_NOARP| 10285 ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED| 10286 IPIF_NOFAILOVER)) { 10287 /* 10288 * ipif_down() will ire_delete bcast ire's for the subnet, 10289 * while the ire_identical_ref tracks the case of IRE_BROADCAST 10290 * entries shared between multiple ipifs on the same subnet. 10291 */ 10292 if (((ipif->ipif_flags | turn_on) & IPIF_UP) && 10293 !(turn_off & IPIF_UP)) { 10294 if (ipif->ipif_flags & IPIF_UP) 10295 ill->ill_logical_down = 1; 10296 turn_on &= ~IPIF_UP; 10297 } 10298 err = ipif_down(ipif, q, mp); 10299 ip1dbg(("ipif_down returns %d err ", err)); 10300 if (err == EINPROGRESS) 10301 return (err); 10302 (void) ipif_down_tail(ipif); 10303 } else if ((turn_on|turn_off) & ILLF_NOACCEPT) { 10304 /* 10305 * If we can quiesce the ill, then continue. If not, then 10306 * ip_sioctl_flags_tail() will be called from 10307 * ipif_ill_refrele_tail(). 10308 */ 10309 ill_down_ipifs(ill, B_TRUE); 10310 10311 mutex_enter(&connp->conn_lock); 10312 mutex_enter(&ill->ill_lock); 10313 if (!ill_is_quiescent(ill)) { 10314 boolean_t success; 10315 10316 success = ipsq_pending_mp_add(connp, ill->ill_ipif, 10317 q, mp, ILL_DOWN); 10318 mutex_exit(&ill->ill_lock); 10319 mutex_exit(&connp->conn_lock); 10320 return (success ? 
EINPROGRESS : EINTR); 10321 } 10322 mutex_exit(&ill->ill_lock); 10323 mutex_exit(&connp->conn_lock); 10324 } 10325 return (ip_sioctl_flags_tail(ipif, flags, q, mp)); 10326 } 10327 10328 static int 10329 ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp) 10330 { 10331 ill_t *ill; 10332 phyint_t *phyi; 10333 uint64_t turn_on, turn_off; 10334 boolean_t phyint_flags_modified = B_FALSE; 10335 int err = 0; 10336 boolean_t set_linklocal = B_FALSE; 10337 10338 ip1dbg(("ip_sioctl_flags_tail(%s:%u)\n", 10339 ipif->ipif_ill->ill_name, ipif->ipif_id)); 10340 10341 ASSERT(IAM_WRITER_IPIF(ipif)); 10342 10343 ill = ipif->ipif_ill; 10344 phyi = ill->ill_phyint; 10345 10346 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off); 10347 10348 /* 10349 * IFF_UP is handled separately. 10350 */ 10351 turn_on &= ~IFF_UP; 10352 turn_off &= ~IFF_UP; 10353 10354 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS) 10355 phyint_flags_modified = B_TRUE; 10356 10357 /* 10358 * Now we change the flags. Track current value of 10359 * other flags in their respective places. 10360 */ 10361 mutex_enter(&ill->ill_lock); 10362 mutex_enter(&phyi->phyint_lock); 10363 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS); 10364 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS); 10365 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS); 10366 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS); 10367 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS); 10368 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS); 10369 if (ipif->ipif_state_flags & IPIF_SET_LINKLOCAL) { 10370 set_linklocal = B_TRUE; 10371 ipif->ipif_state_flags &= ~IPIF_SET_LINKLOCAL; 10372 } 10373 10374 mutex_exit(&ill->ill_lock); 10375 mutex_exit(&phyi->phyint_lock); 10376 10377 if (set_linklocal) 10378 (void) ipif_setlinklocal(ipif); 10379 10380 /* 10381 * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the same to 10382 * the kernel: if any of them has been set by userland, the interface 10383 * cannot be used for data traffic. 10384 */ 10385 if ((turn_on|turn_off) & (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) { 10386 ASSERT(!IS_IPMP(ill)); 10387 /* 10388 * It's possible the ill is part of an "anonymous" IPMP group 10389 * rather than a real group. In that case, there are no other 10390 * interfaces in the group and thus no need for us to call 10391 * ipmp_phyint_refresh_active(). 10392 */ 10393 if (IS_UNDER_IPMP(ill)) 10394 ipmp_phyint_refresh_active(phyi); 10395 } 10396 10397 if ((turn_on|turn_off) & ILLF_NOACCEPT) { 10398 /* 10399 * If the ILLF_NOACCEPT flag is changed, bring up all the 10400 * ipifs that were brought down. 10401 * 10402 * The routing sockets messages are sent as the result 10403 * of ill_up_ipifs(), further, SCTP's IPIF list was updated 10404 * as well. 10405 */ 10406 err = ill_up_ipifs(ill, q, mp); 10407 } else if ((flags & IFF_UP) && !(ipif->ipif_flags & IPIF_UP)) { 10408 /* 10409 * XXX ipif_up really does not know whether a phyint flags 10410 * was modified or not. So, it sends up information on 10411 * only one routing sockets message. As we don't bring up 10412 * the interface and also set PHYI_ flags simultaneously 10413 * it should be okay. 10414 */ 10415 err = ipif_up(ipif, q, mp); 10416 } else { 10417 /* 10418 * Make sure routing socket sees all changes to the flags. 10419 * ipif_up_done* handles this when we use ipif_up. 
10420 */ 10421 if (phyint_flags_modified) { 10422 if (phyi->phyint_illv4 != NULL) { 10423 ip_rts_ifmsg(phyi->phyint_illv4-> 10424 ill_ipif, RTSQ_DEFAULT); 10425 } 10426 if (phyi->phyint_illv6 != NULL) { 10427 ip_rts_ifmsg(phyi->phyint_illv6-> 10428 ill_ipif, RTSQ_DEFAULT); 10429 } 10430 } else { 10431 ip_rts_ifmsg(ipif, RTSQ_DEFAULT); 10432 } 10433 /* 10434 * Update the flags in SCTP's IPIF list; ipif_up() will do 10435 * this in the need_up case. 10436 */ 10437 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 10438 } 10439 10440 /* The default multicast interface might have changed */ 10441 ire_increment_multicast_generation(ill->ill_ipst, ill->ill_isv6); 10442 return (err); 10443 } 10444 10445 /* 10446 * Restart the flags operation now that the refcounts have dropped to zero. 10447 */ 10448 /* ARGSUSED */ 10449 int 10450 ip_sioctl_flags_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10451 ip_ioctl_cmd_t *ipip, void *if_req) 10452 { 10453 uint64_t flags; 10454 struct ifreq *ifr = if_req; 10455 struct lifreq *lifr = if_req; 10456 uint64_t turn_on, turn_off; 10457 10458 ip1dbg(("ip_sioctl_flags_restart(%s:%u %p)\n", 10459 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10460 10461 if (ipip->ipi_cmd_type == IF_CMD) { 10462 /* cast to uint16_t prevents unwanted sign extension */ 10463 flags = (uint16_t)ifr->ifr_flags; 10464 } else { 10465 flags = lifr->lifr_flags; 10466 } 10467 10468 /* 10469 * If this function call is a result of the ILLF_NOACCEPT flag 10470 * change, do not call ipif_down_tail(). See ip_sioctl_flags(). 10471 */ 10472 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off); 10473 if (!((turn_on|turn_off) & ILLF_NOACCEPT)) 10474 (void) ipif_down_tail(ipif); 10475 10476 return (ip_sioctl_flags_tail(ipif, flags, q, mp)); 10477 } 10478 10479 /* 10480 * Can operate on either a module or a driver queue. 10481 */ 10482 /* ARGSUSED */ 10483 int 10484 ip_sioctl_get_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10485 ip_ioctl_cmd_t *ipip, void *if_req) 10486 { 10487 /* 10488 * Have the flags been set correctly till now? 10489 */ 10490 ill_t *ill = ipif->ipif_ill; 10491 phyint_t *phyi = ill->ill_phyint; 10492 10493 ip1dbg(("ip_sioctl_get_flags(%s:%u %p)\n", 10494 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10495 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0); 10496 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0); 10497 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0); 10498 10499 /* 10500 * Need a lock since some flags can be set even when there are 10501 * references to the ipif. 10502 */ 10503 mutex_enter(&ill->ill_lock); 10504 if (ipip->ipi_cmd_type == IF_CMD) { 10505 struct ifreq *ifr = (struct ifreq *)if_req; 10506 10507 /* Get interface flags (low 16 only). */ 10508 ifr->ifr_flags = ((ipif->ipif_flags | 10509 ill->ill_flags | phyi->phyint_flags) & 0xffff); 10510 } else { 10511 struct lifreq *lifr = (struct lifreq *)if_req; 10512 10513 /* Get interface flags. */ 10514 lifr->lifr_flags = ipif->ipif_flags | 10515 ill->ill_flags | phyi->phyint_flags; 10516 } 10517 mutex_exit(&ill->ill_lock); 10518 return (0); 10519 } 10520 10521 /* 10522 * We allow the MTU to be set on an ILL, but we do not allow it to 10523 * differ across IPIFs, since we don't actually send packets on IPIFs.
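 *
 * Illustrative userland usage (same setup as the earlier sketches; the
 * name and value are assumptions, and the set must target logical unit
 * zero, i.e. "hme0" rather than "hme0:1"):
 *
 *	(void) strlcpy(lifr.lifr_name, "hme0", sizeof (lifr.lifr_name));
 *	lifr.lifr_mtu = 1400;
 *	if (ioctl(s, SIOCSLIFMTU, &lifr) == -1)
 *		perror("SIOCSLIFMTU");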
10524 */ 10525 /* ARGSUSED */ 10526 int 10527 ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10528 ip_ioctl_cmd_t *ipip, void *if_req) 10529 { 10530 int mtu; 10531 int ip_min_mtu; 10532 struct ifreq *ifr; 10533 struct lifreq *lifr; 10534 ill_t *ill; 10535 10536 ip1dbg(("ip_sioctl_mtu(%s:%u %p)\n", ipif->ipif_ill->ill_name, 10537 ipif->ipif_id, (void *)ipif)); 10538 if (ipip->ipi_cmd_type == IF_CMD) { 10539 ifr = (struct ifreq *)if_req; 10540 mtu = ifr->ifr_metric; 10541 } else { 10542 lifr = (struct lifreq *)if_req; 10543 mtu = lifr->lifr_mtu; 10544 } 10545 /* Only allow for logical unit zero i.e. not on "bge0:17" */ 10546 if (ipif->ipif_id != 0) 10547 return (EINVAL); 10548 10549 ill = ipif->ipif_ill; 10550 if (ipif->ipif_isv6) 10551 ip_min_mtu = IPV6_MIN_MTU; 10552 else 10553 ip_min_mtu = IP_MIN_MTU; 10554 10555 mutex_enter(&ill->ill_lock); 10556 if (mtu > ill->ill_max_frag || mtu < ip_min_mtu) { 10557 mutex_exit(&ill->ill_lock); 10558 return (EINVAL); 10559 } 10560 /* 10561 * The dce and fragmentation code can handle changes to ill_mtu 10562 * concurrent with sending/fragmenting packets. 10563 */ 10564 ill->ill_mtu = mtu; 10565 ill->ill_flags |= ILLF_FIXEDMTU; 10566 mutex_exit(&ill->ill_lock); 10567 10568 /* 10569 * Make sure all dce_generation checks find out 10570 * that ill_mtu has changed. 10571 */ 10572 dce_increment_all_generations(ill->ill_isv6, ill->ill_ipst); 10573 10574 /* Update the MTU in SCTP's list */ 10575 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 10576 return (0); 10577 } 10578 10579 /* Get interface MTU. */ 10580 /* ARGSUSED */ 10581 int 10582 ip_sioctl_get_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10583 ip_ioctl_cmd_t *ipip, void *if_req) 10584 { 10585 struct ifreq *ifr; 10586 struct lifreq *lifr; 10587 10588 ip1dbg(("ip_sioctl_get_mtu(%s:%u %p)\n", 10589 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10590 10591 /* 10592 * We allow a get on any logical interface even though the set 10593 * can only be done on logical unit 0. 10594 */ 10595 if (ipip->ipi_cmd_type == IF_CMD) { 10596 ifr = (struct ifreq *)if_req; 10597 ifr->ifr_metric = ipif->ipif_ill->ill_mtu; 10598 } else { 10599 lifr = (struct lifreq *)if_req; 10600 lifr->lifr_mtu = ipif->ipif_ill->ill_mtu; 10601 } 10602 return (0); 10603 } 10604 10605 /* Set interface broadcast address. */ 10606 /* ARGSUSED2 */ 10607 int 10608 ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10609 ip_ioctl_cmd_t *ipip, void *if_req) 10610 { 10611 ipaddr_t addr; 10612 ire_t *ire; 10613 ill_t *ill = ipif->ipif_ill; 10614 ip_stack_t *ipst = ill->ill_ipst; 10615 10616 ip1dbg(("ip_sioctl_brdaddr(%s:%u)\n", ill->ill_name, 10617 ipif->ipif_id)); 10618 10619 ASSERT(IAM_WRITER_IPIF(ipif)); 10620 if (!(ipif->ipif_flags & IPIF_BROADCAST)) 10621 return (EADDRNOTAVAIL); 10622 10623 ASSERT(!(ipif->ipif_isv6)); /* No IPv6 broadcast */ 10624 10625 if (sin->sin_family != AF_INET) 10626 return (EAFNOSUPPORT); 10627 10628 addr = sin->sin_addr.s_addr; 10629 if (ipif->ipif_flags & IPIF_UP) { 10630 /* 10631 * If we are already up, make sure the new 10632 * broadcast address makes sense. If it does, 10633 * there should be an IRE for it already. 10634 */ 10635 ire = ire_ftable_lookup_v4(addr, 0, 0, IRE_BROADCAST, 10636 ill, ipif->ipif_zoneid, NULL, 10637 (MATCH_IRE_ILL | MATCH_IRE_TYPE), 0, ipst, NULL); 10638 if (ire == NULL) { 10639 return (EINVAL); 10640 } else { 10641 ire_refrele(ire); 10642 } 10643 } 10644 /* 10645 * Changing the broadcast addr for this ipif. 
Since the IRE_BROADCAST 10646 * needs to already exist we never need to change the set of 10647 * IRE_BROADCASTs when we are UP. 10648 */ 10649 if (addr != ipif->ipif_brd_addr) 10650 IN6_IPADDR_TO_V4MAPPED(addr, &ipif->ipif_v6brd_addr); 10651 10652 return (0); 10653 } 10654 10655 /* Get interface broadcast address. */ 10656 /* ARGSUSED */ 10657 int 10658 ip_sioctl_get_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10659 ip_ioctl_cmd_t *ipip, void *if_req) 10660 { 10661 ip1dbg(("ip_sioctl_get_brdaddr(%s:%u %p)\n", 10662 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10663 if (!(ipif->ipif_flags & IPIF_BROADCAST)) 10664 return (EADDRNOTAVAIL); 10665 10666 /* IPIF_BROADCAST not possible with IPv6 */ 10667 ASSERT(!ipif->ipif_isv6); 10668 *sin = sin_null; 10669 sin->sin_family = AF_INET; 10670 sin->sin_addr.s_addr = ipif->ipif_brd_addr; 10671 return (0); 10672 } 10673 10674 /* 10675 * This routine is called to handle the SIOCS*IFNETMASK IOCTL. 10676 */ 10677 /* ARGSUSED */ 10678 int 10679 ip_sioctl_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10680 ip_ioctl_cmd_t *ipip, void *if_req) 10681 { 10682 int err = 0; 10683 in6_addr_t v6mask; 10684 10685 ip1dbg(("ip_sioctl_netmask(%s:%u %p)\n", 10686 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10687 10688 ASSERT(IAM_WRITER_IPIF(ipif)); 10689 10690 if (ipif->ipif_isv6) { 10691 sin6_t *sin6; 10692 10693 if (sin->sin_family != AF_INET6) 10694 return (EAFNOSUPPORT); 10695 10696 sin6 = (sin6_t *)sin; 10697 v6mask = sin6->sin6_addr; 10698 } else { 10699 ipaddr_t mask; 10700 10701 if (sin->sin_family != AF_INET) 10702 return (EAFNOSUPPORT); 10703 10704 mask = sin->sin_addr.s_addr; 10705 V4MASK_TO_V6(mask, v6mask); 10706 } 10707 10708 /* 10709 * No big deal if the interface isn't already up, or the mask 10710 * isn't really changing, or this is pt-pt. 10711 */ 10712 if (!(ipif->ipif_flags & IPIF_UP) || 10713 IN6_ARE_ADDR_EQUAL(&v6mask, &ipif->ipif_v6net_mask) || 10714 (ipif->ipif_flags & IPIF_POINTOPOINT)) { 10715 ipif->ipif_v6net_mask = v6mask; 10716 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 10717 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 10718 ipif->ipif_v6net_mask, 10719 ipif->ipif_v6subnet); 10720 } 10721 return (0); 10722 } 10723 /* 10724 * Make sure we have valid net and subnet broadcast ire's 10725 * for the old netmask, if needed by other logical interfaces. 10726 */ 10727 err = ipif_logical_down(ipif, q, mp); 10728 if (err == EINPROGRESS) 10729 return (err); 10730 (void) ipif_down_tail(ipif); 10731 err = ip_sioctl_netmask_tail(ipif, sin, q, mp); 10732 return (err); 10733 } 10734 10735 static int 10736 ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp) 10737 { 10738 in6_addr_t v6mask; 10739 int err = 0; 10740 10741 ip1dbg(("ip_sioctl_netmask_tail(%s:%u %p)\n", 10742 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10743 10744 if (ipif->ipif_isv6) { 10745 sin6_t *sin6; 10746 10747 sin6 = (sin6_t *)sin; 10748 v6mask = sin6->sin6_addr; 10749 } else { 10750 ipaddr_t mask; 10751 10752 mask = sin->sin_addr.s_addr; 10753 V4MASK_TO_V6(mask, v6mask); 10754 } 10755 10756 ipif->ipif_v6net_mask = v6mask; 10757 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 10758 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 10759 ipif->ipif_v6subnet); 10760 } 10761 err = ipif_up(ipif, q, mp); 10762 10763 if (err == 0 || err == EINPROGRESS) { 10764 /* 10765 * The interface must be DL_BOUND if this packet has to 10766 * go out on the wire. 
Since we only go through a logical 10767 * down and are bound with the driver during an internal 10768 * down/up that is satisfied. 10769 */ 10770 if (!ipif->ipif_isv6 && ipif->ipif_ill->ill_wq != NULL) { 10771 /* Potentially broadcast an address mask reply. */ 10772 ipif_mask_reply(ipif); 10773 } 10774 } 10775 return (err); 10776 } 10777 10778 /* ARGSUSED */ 10779 int 10780 ip_sioctl_netmask_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10781 ip_ioctl_cmd_t *ipip, void *if_req) 10782 { 10783 ip1dbg(("ip_sioctl_netmask_restart(%s:%u %p)\n", 10784 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10785 (void) ipif_down_tail(ipif); 10786 return (ip_sioctl_netmask_tail(ipif, sin, q, mp)); 10787 } 10788 10789 /* Get interface net mask. */ 10790 /* ARGSUSED */ 10791 int 10792 ip_sioctl_get_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10793 ip_ioctl_cmd_t *ipip, void *if_req) 10794 { 10795 struct lifreq *lifr = (struct lifreq *)if_req; 10796 struct sockaddr_in6 *sin6 = (sin6_t *)sin; 10797 10798 ip1dbg(("ip_sioctl_get_netmask(%s:%u %p)\n", 10799 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10800 10801 /* 10802 * net mask can't change since we have a reference to the ipif. 10803 */ 10804 if (ipif->ipif_isv6) { 10805 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 10806 *sin6 = sin6_null; 10807 sin6->sin6_family = AF_INET6; 10808 sin6->sin6_addr = ipif->ipif_v6net_mask; 10809 lifr->lifr_addrlen = 10810 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 10811 } else { 10812 *sin = sin_null; 10813 sin->sin_family = AF_INET; 10814 sin->sin_addr.s_addr = ipif->ipif_net_mask; 10815 if (ipip->ipi_cmd_type == LIF_CMD) { 10816 lifr->lifr_addrlen = 10817 ip_mask_to_plen(ipif->ipif_net_mask); 10818 } 10819 } 10820 return (0); 10821 } 10822 10823 /* ARGSUSED */ 10824 int 10825 ip_sioctl_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10826 ip_ioctl_cmd_t *ipip, void *if_req) 10827 { 10828 ip1dbg(("ip_sioctl_metric(%s:%u %p)\n", 10829 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10830 10831 /* 10832 * Since no applications should ever be setting metrics on underlying 10833 * interfaces, we explicitly fail to smoke 'em out. 10834 */ 10835 if (IS_UNDER_IPMP(ipif->ipif_ill)) 10836 return (EINVAL); 10837 10838 /* 10839 * Set interface metric. We don't use this for 10840 * anything but we keep track of it in case it is 10841 * important to routing applications or such. 10842 */ 10843 if (ipip->ipi_cmd_type == IF_CMD) { 10844 struct ifreq *ifr; 10845 10846 ifr = (struct ifreq *)if_req; 10847 ipif->ipif_metric = ifr->ifr_metric; 10848 } else { 10849 struct lifreq *lifr; 10850 10851 lifr = (struct lifreq *)if_req; 10852 ipif->ipif_metric = lifr->lifr_metric; 10853 } 10854 return (0); 10855 } 10856 10857 /* ARGSUSED */ 10858 int 10859 ip_sioctl_get_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10860 ip_ioctl_cmd_t *ipip, void *if_req) 10861 { 10862 /* Get interface metric. 
*/ 10863 ip1dbg(("ip_sioctl_get_metric(%s:%u %p)\n", 10864 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10865 10866 if (ipip->ipi_cmd_type == IF_CMD) { 10867 struct ifreq *ifr; 10868 10869 ifr = (struct ifreq *)if_req; 10870 ifr->ifr_metric = ipif->ipif_metric; 10871 } else { 10872 struct lifreq *lifr; 10873 10874 lifr = (struct lifreq *)if_req; 10875 lifr->lifr_metric = ipif->ipif_metric; 10876 } 10877 10878 return (0); 10879 } 10880 10881 /* ARGSUSED */ 10882 int 10883 ip_sioctl_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10884 ip_ioctl_cmd_t *ipip, void *if_req) 10885 { 10886 int arp_muxid; 10887 10888 ip1dbg(("ip_sioctl_muxid(%s:%u %p)\n", 10889 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10890 /* 10891 * Set the muxid returned from I_PLINK. 10892 */ 10893 if (ipip->ipi_cmd_type == IF_CMD) { 10894 struct ifreq *ifr = (struct ifreq *)if_req; 10895 10896 ipif->ipif_ill->ill_muxid = ifr->ifr_ip_muxid; 10897 arp_muxid = ifr->ifr_arp_muxid; 10898 } else { 10899 struct lifreq *lifr = (struct lifreq *)if_req; 10900 10901 ipif->ipif_ill->ill_muxid = lifr->lifr_ip_muxid; 10902 arp_muxid = lifr->lifr_arp_muxid; 10903 } 10904 arl_set_muxid(ipif->ipif_ill, arp_muxid); 10905 return (0); 10906 } 10907 10908 /* ARGSUSED */ 10909 int 10910 ip_sioctl_get_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10911 ip_ioctl_cmd_t *ipip, void *if_req) 10912 { 10913 int arp_muxid = 0; 10914 10915 ip1dbg(("ip_sioctl_get_muxid(%s:%u %p)\n", 10916 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10917 /* 10918 * Get the muxid saved in ill for I_PUNLINK. 10919 */ 10920 arp_muxid = arl_get_muxid(ipif->ipif_ill); 10921 if (ipip->ipi_cmd_type == IF_CMD) { 10922 struct ifreq *ifr = (struct ifreq *)if_req; 10923 10924 ifr->ifr_ip_muxid = ipif->ipif_ill->ill_muxid; 10925 ifr->ifr_arp_muxid = arp_muxid; 10926 } else { 10927 struct lifreq *lifr = (struct lifreq *)if_req; 10928 10929 lifr->lifr_ip_muxid = ipif->ipif_ill->ill_muxid; 10930 lifr->lifr_arp_muxid = arp_muxid; 10931 } 10932 return (0); 10933 } 10934 10935 /* 10936 * Set the subnet prefix. Does not modify the broadcast address. 
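 *
 * Illustrative userland usage (assumptions as in the earlier sketches):
 * the prefix goes in lifr_addr and its length, in bits, in
 * lifr_addrlen.
 *
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&lifr.lifr_addr;
 *
 *	(void) strlcpy(lifr.lifr_name, "hme0", sizeof (lifr.lifr_name));
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("192.0.2.0");
 *	lifr.lifr_addrlen = 24;
 *	if (ioctl(s, SIOCSLIFSUBNET, &lifr) == -1)
 *		perror("SIOCSLIFSUBNET");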
10937 */ 10938 /* ARGSUSED */ 10939 int 10940 ip_sioctl_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10941 ip_ioctl_cmd_t *ipip, void *if_req) 10942 { 10943 int err = 0; 10944 in6_addr_t v6addr; 10945 in6_addr_t v6mask; 10946 boolean_t need_up = B_FALSE; 10947 int addrlen; 10948 10949 ip1dbg(("ip_sioctl_subnet(%s:%u %p)\n", 10950 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10951 10952 ASSERT(IAM_WRITER_IPIF(ipif)); 10953 addrlen = ((struct lifreq *)if_req)->lifr_addrlen; 10954 10955 if (ipif->ipif_isv6) { 10956 sin6_t *sin6; 10957 10958 if (sin->sin_family != AF_INET6) 10959 return (EAFNOSUPPORT); 10960 10961 sin6 = (sin6_t *)sin; 10962 v6addr = sin6->sin6_addr; 10963 if (!ip_remote_addr_ok_v6(&v6addr, &ipv6_all_ones)) 10964 return (EADDRNOTAVAIL); 10965 } else { 10966 ipaddr_t addr; 10967 10968 if (sin->sin_family != AF_INET) 10969 return (EAFNOSUPPORT); 10970 10971 addr = sin->sin_addr.s_addr; 10972 if (!ip_addr_ok_v4(addr, 0xFFFFFFFF)) 10973 return (EADDRNOTAVAIL); 10974 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 10975 /* Add 96 bits */ 10976 addrlen += IPV6_ABITS - IP_ABITS; 10977 } 10978 10979 if (ip_plen_to_mask_v6(addrlen, &v6mask) == NULL) 10980 return (EINVAL); 10981 10982 /* Check if bits in the address is set past the mask */ 10983 if (!V6_MASK_EQ(v6addr, v6mask, v6addr)) 10984 return (EINVAL); 10985 10986 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6subnet, &v6addr) && 10987 IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6net_mask, &v6mask)) 10988 return (0); /* No change */ 10989 10990 if (ipif->ipif_flags & IPIF_UP) { 10991 /* 10992 * If the interface is already marked up, 10993 * we call ipif_down which will take care 10994 * of ditching any IREs that have been set 10995 * up based on the old interface address. 10996 */ 10997 err = ipif_logical_down(ipif, q, mp); 10998 if (err == EINPROGRESS) 10999 return (err); 11000 (void) ipif_down_tail(ipif); 11001 need_up = B_TRUE; 11002 } 11003 11004 err = ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, need_up); 11005 return (err); 11006 } 11007 11008 static int 11009 ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t v6addr, in6_addr_t v6mask, 11010 queue_t *q, mblk_t *mp, boolean_t need_up) 11011 { 11012 ill_t *ill = ipif->ipif_ill; 11013 int err = 0; 11014 11015 ip1dbg(("ip_sioctl_subnet_tail(%s:%u %p)\n", 11016 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11017 11018 /* Set the new address. */ 11019 mutex_enter(&ill->ill_lock); 11020 ipif->ipif_v6net_mask = v6mask; 11021 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 11022 V6_MASK_COPY(v6addr, ipif->ipif_v6net_mask, 11023 ipif->ipif_v6subnet); 11024 } 11025 mutex_exit(&ill->ill_lock); 11026 11027 if (need_up) { 11028 /* 11029 * Now bring the interface back up. If this 11030 * is the only IPIF for the ILL, ipif_up 11031 * will have to re-bind to the device, so 11032 * we may get back EINPROGRESS, in which 11033 * case, this IOCTL will get completed in 11034 * ip_rput_dlpi when we see the DL_BIND_ACK. 
11035 */ 11036 err = ipif_up(ipif, q, mp); 11037 if (err == EINPROGRESS) 11038 return (err); 11039 } 11040 return (err); 11041 } 11042 11043 /* ARGSUSED */ 11044 int 11045 ip_sioctl_subnet_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11046 ip_ioctl_cmd_t *ipip, void *if_req) 11047 { 11048 int addrlen; 11049 in6_addr_t v6addr; 11050 in6_addr_t v6mask; 11051 struct lifreq *lifr = (struct lifreq *)if_req; 11052 11053 ip1dbg(("ip_sioctl_subnet_restart(%s:%u %p)\n", 11054 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11055 (void) ipif_down_tail(ipif); 11056 11057 addrlen = lifr->lifr_addrlen; 11058 if (ipif->ipif_isv6) { 11059 sin6_t *sin6; 11060 11061 sin6 = (sin6_t *)sin; 11062 v6addr = sin6->sin6_addr; 11063 } else { 11064 ipaddr_t addr; 11065 11066 addr = sin->sin_addr.s_addr; 11067 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 11068 addrlen += IPV6_ABITS - IP_ABITS; 11069 } 11070 (void) ip_plen_to_mask_v6(addrlen, &v6mask); 11071 11072 return (ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, B_TRUE)); 11073 } 11074 11075 /* ARGSUSED */ 11076 int 11077 ip_sioctl_get_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11078 ip_ioctl_cmd_t *ipip, void *if_req) 11079 { 11080 struct lifreq *lifr = (struct lifreq *)if_req; 11081 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sin; 11082 11083 ip1dbg(("ip_sioctl_get_subnet(%s:%u %p)\n", 11084 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11085 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 11086 11087 if (ipif->ipif_isv6) { 11088 *sin6 = sin6_null; 11089 sin6->sin6_family = AF_INET6; 11090 sin6->sin6_addr = ipif->ipif_v6subnet; 11091 lifr->lifr_addrlen = 11092 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 11093 } else { 11094 *sin = sin_null; 11095 sin->sin_family = AF_INET; 11096 sin->sin_addr.s_addr = ipif->ipif_subnet; 11097 lifr->lifr_addrlen = ip_mask_to_plen(ipif->ipif_net_mask); 11098 } 11099 return (0); 11100 } 11101 11102 /* 11103 * Set the IPv6 address token. 11104 */ 11105 /* ARGSUSED */ 11106 int 11107 ip_sioctl_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11108 ip_ioctl_cmd_t *ipi, void *if_req) 11109 { 11110 ill_t *ill = ipif->ipif_ill; 11111 int err; 11112 in6_addr_t v6addr; 11113 in6_addr_t v6mask; 11114 boolean_t need_up = B_FALSE; 11115 int i; 11116 sin6_t *sin6 = (sin6_t *)sin; 11117 struct lifreq *lifr = (struct lifreq *)if_req; 11118 int addrlen; 11119 11120 ip1dbg(("ip_sioctl_token(%s:%u %p)\n", 11121 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11122 ASSERT(IAM_WRITER_IPIF(ipif)); 11123 11124 addrlen = lifr->lifr_addrlen; 11125 /* Only allow for logical unit zero i.e. not on "le0:17" */ 11126 if (ipif->ipif_id != 0) 11127 return (EINVAL); 11128 11129 if (!ipif->ipif_isv6) 11130 return (EINVAL); 11131 11132 if (addrlen > IPV6_ABITS) 11133 return (EINVAL); 11134 11135 v6addr = sin6->sin6_addr; 11136 11137 /* 11138 * The length of the token is the length from the end. To get 11139 * the proper mask for this, compute the mask of the bits not 11140 * in the token; ie. the prefix, and then xor to get the mask. 
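 *
 * For example, a 64-bit token set with lifr_addrlen = 64:
 * ip_plen_to_mask_v6(128 - 64) yields ffff:ffff:ffff:ffff::, and the
 * XOR with all-ones below flips that to ::ffff:ffff:ffff:ffff -- a
 * mask covering exactly the low-order interface-ID bits that the
 * token occupies.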
11141 */ 11142 if (ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask) == NULL) 11143 return (EINVAL); 11144 for (i = 0; i < 4; i++) { 11145 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 11146 } 11147 11148 if (V6_MASK_EQ(v6addr, v6mask, ill->ill_token) && 11149 ill->ill_token_length == addrlen) 11150 return (0); /* No change */ 11151 11152 if (ipif->ipif_flags & IPIF_UP) { 11153 err = ipif_logical_down(ipif, q, mp); 11154 if (err == EINPROGRESS) 11155 return (err); 11156 (void) ipif_down_tail(ipif); 11157 need_up = B_TRUE; 11158 } 11159 err = ip_sioctl_token_tail(ipif, sin6, addrlen, q, mp, need_up); 11160 return (err); 11161 } 11162 11163 static int 11164 ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, queue_t *q, 11165 mblk_t *mp, boolean_t need_up) 11166 { 11167 in6_addr_t v6addr; 11168 in6_addr_t v6mask; 11169 ill_t *ill = ipif->ipif_ill; 11170 int i; 11171 int err = 0; 11172 11173 ip1dbg(("ip_sioctl_token_tail(%s:%u %p)\n", 11174 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11175 v6addr = sin6->sin6_addr; 11176 /* 11177 * The length of the token is the length from the end. To get 11178 * the proper mask for this, compute the mask of the bits not 11179 * in the token; ie. the prefix, and then xor to get the mask. 11180 */ 11181 (void) ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask); 11182 for (i = 0; i < 4; i++) 11183 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 11184 11185 mutex_enter(&ill->ill_lock); 11186 V6_MASK_COPY(v6addr, v6mask, ill->ill_token); 11187 ill->ill_token_length = addrlen; 11188 ill->ill_manual_token = 1; 11189 11190 /* Reconfigure the link-local address based on this new token */ 11191 ipif_setlinklocal(ill->ill_ipif); 11192 11193 mutex_exit(&ill->ill_lock); 11194 11195 if (need_up) { 11196 /* 11197 * Now bring the interface back up. If this 11198 * is the only IPIF for the ILL, ipif_up 11199 * will have to re-bind to the device, so 11200 * we may get back EINPROGRESS, in which 11201 * case, this IOCTL will get completed in 11202 * ip_rput_dlpi when we see the DL_BIND_ACK. 11203 */ 11204 err = ipif_up(ipif, q, mp); 11205 if (err == EINPROGRESS) 11206 return (err); 11207 } 11208 return (err); 11209 } 11210 11211 /* ARGSUSED */ 11212 int 11213 ip_sioctl_get_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11214 ip_ioctl_cmd_t *ipi, void *if_req) 11215 { 11216 ill_t *ill; 11217 sin6_t *sin6 = (sin6_t *)sin; 11218 struct lifreq *lifr = (struct lifreq *)if_req; 11219 11220 ip1dbg(("ip_sioctl_get_token(%s:%u %p)\n", 11221 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11222 if (ipif->ipif_id != 0) 11223 return (EINVAL); 11224 11225 ill = ipif->ipif_ill; 11226 if (!ill->ill_isv6) 11227 return (ENXIO); 11228 11229 *sin6 = sin6_null; 11230 sin6->sin6_family = AF_INET6; 11231 ASSERT(!IN6_IS_ADDR_V4MAPPED(&ill->ill_token)); 11232 sin6->sin6_addr = ill->ill_token; 11233 lifr->lifr_addrlen = ill->ill_token_length; 11234 return (0); 11235 } 11236 11237 /* 11238 * Set (hardware) link specific information that might override 11239 * what was acquired through the DL_INFO_ACK. 
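 *
 * Illustrative userland usage (field values are only an example; zero
 * in lir_maxmtu, lir_reachtime or lir_reachretrans means
 * "unspecified", per the checks below):
 *
 *	(void) memset(&lifr, 0, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "hme0", sizeof (lifr.lifr_name));
 *	lifr.lifr_ifinfo.lir_maxhops = 64;
 *	if (ioctl(s, SIOCSLIFLNKINFO, &lifr) == -1)
 *		perror("SIOCSLIFLNKINFO");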
11240 */ 11241 /* ARGSUSED */ 11242 int 11243 ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11244 ip_ioctl_cmd_t *ipi, void *if_req) 11245 { 11246 ill_t *ill = ipif->ipif_ill; 11247 int ip_min_mtu; 11248 struct lifreq *lifr = (struct lifreq *)if_req; 11249 lif_ifinfo_req_t *lir; 11250 11251 ip1dbg(("ip_sioctl_lnkinfo(%s:%u %p)\n", 11252 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11253 lir = &lifr->lifr_ifinfo; 11254 ASSERT(IAM_WRITER_IPIF(ipif)); 11255 11256 /* Only allow for logical unit zero i.e. not on "bge0:17" */ 11257 if (ipif->ipif_id != 0) 11258 return (EINVAL); 11259 11260 /* Set interface MTU. */ 11261 if (ipif->ipif_isv6) 11262 ip_min_mtu = IPV6_MIN_MTU; 11263 else 11264 ip_min_mtu = IP_MIN_MTU; 11265 11266 /* 11267 * Verify values before we set anything. Allow zero to 11268 * mean unspecified. 11269 * 11270 * XXX We should be able to set the user-defined lir_mtu to some value 11271 * that is greater than ill_current_frag but less than ill_max_frag- the 11272 * ill_max_frag value tells us the max MTU that can be handled by the 11273 * datalink, whereas the ill_current_frag is dynamically computed for 11274 * some link-types like tunnels, based on the tunnel PMTU. However, 11275 * since there is currently no way of distinguishing between 11276 * administratively fixed link mtu values (e.g., those set via 11277 * /sbin/dladm) and dynamically discovered MTUs (e.g., those discovered 11278 * for tunnels) we conservatively choose the ill_current_frag as the 11279 * upper-bound. 11280 */ 11281 if (lir->lir_maxmtu != 0 && 11282 (lir->lir_maxmtu > ill->ill_current_frag || 11283 lir->lir_maxmtu < ip_min_mtu)) 11284 return (EINVAL); 11285 if (lir->lir_reachtime != 0 && 11286 lir->lir_reachtime > ND_MAX_REACHTIME) 11287 return (EINVAL); 11288 if (lir->lir_reachretrans != 0 && 11289 lir->lir_reachretrans > ND_MAX_REACHRETRANSTIME) 11290 return (EINVAL); 11291 11292 mutex_enter(&ill->ill_lock); 11293 /* 11294 * The dce and fragmentation code can handle changes to ill_mtu 11295 * concurrent with sending/fragmenting packets. 11296 */ 11297 if (lir->lir_maxmtu != 0) 11298 ill->ill_user_mtu = lir->lir_maxmtu; 11299 11300 if (lir->lir_reachtime != 0) 11301 ill->ill_reachable_time = lir->lir_reachtime; 11302 11303 if (lir->lir_reachretrans != 0) 11304 ill->ill_reachable_retrans_time = lir->lir_reachretrans; 11305 11306 ill->ill_max_hops = lir->lir_maxhops; 11307 ill->ill_max_buf = ND_MAX_Q; 11308 if (!(ill->ill_flags & ILLF_FIXEDMTU) && ill->ill_user_mtu != 0) { 11309 /* 11310 * ill_mtu is the actual interface MTU, obtained as the min 11311 * of user-configured mtu and the value announced by the 11312 * driver (via DL_NOTE_SDU_SIZE/DL_INFO_ACK). Note that since 11313 * we have already made the choice of requiring 11314 * ill_user_mtu < ill_current_frag by the time we get here, 11315 * the ill_mtu effectively gets assigned to the ill_user_mtu 11316 * here. 11317 */ 11318 ill->ill_mtu = MIN(ill->ill_current_frag, ill->ill_user_mtu); 11319 } 11320 mutex_exit(&ill->ill_lock); 11321 11322 /* 11323 * Make sure all dce_generation checks find out 11324 * that ill_mtu has changed. 11325 */ 11326 if (!(ill->ill_flags & ILLF_FIXEDMTU) && (lir->lir_maxmtu != 0)) 11327 dce_increment_all_generations(ill->ill_isv6, ill->ill_ipst); 11328 11329 /* 11330 * Refresh IPMP meta-interface MTU if necessary. 
/* ARGSUSED */
int
ip_sioctl_get_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipi, void *if_req)
{
    struct lif_ifinfo_req *lir;
    ill_t *ill = ipif->ipif_ill;

    ip1dbg(("ip_sioctl_get_lnkinfo(%s:%u %p)\n",
        ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
    if (ipif->ipif_id != 0)
        return (EINVAL);

    lir = &((struct lifreq *)if_req)->lifr_ifinfo;
    lir->lir_maxhops = ill->ill_max_hops;
    lir->lir_reachtime = ill->ill_reachable_time;
    lir->lir_reachretrans = ill->ill_reachable_retrans_time;
    lir->lir_maxmtu = ill->ill_mtu;

    return (0);
}

/*
 * Return the best guess as to the subnet mask for the specified address,
 * based on the subnet masks of all the configured interfaces.
 *
 * We end up returning a zero mask in the case of default, multicast or
 * experimental addresses.
 */
static ipaddr_t
ip_subnet_mask(ipaddr_t addr, ipif_t **ipifp, ip_stack_t *ipst)
{
    ipaddr_t net_mask;
    ill_t *ill;
    ipif_t *ipif;
    ill_walk_context_t ctx;
    ipif_t *fallback_ipif = NULL;

    net_mask = ip_net_mask(addr);
    if (net_mask == 0) {
        *ipifp = NULL;
        return (0);
    }

    /* Let's check to see if this is maybe a local subnet route. */
    /* this function only applies to IPv4 interfaces */
    rw_enter(&ipst->ips_ill_g_lock, RW_READER);
    ill = ILL_START_WALK_V4(&ctx, ipst);
    for (; ill != NULL; ill = ill_next(&ctx, ill)) {
        mutex_enter(&ill->ill_lock);
        for (ipif = ill->ill_ipif; ipif != NULL;
            ipif = ipif->ipif_next) {
            if (IPIF_IS_CONDEMNED(ipif))
                continue;
            if (!(ipif->ipif_flags & IPIF_UP))
                continue;
            if ((ipif->ipif_subnet & net_mask) ==
                (addr & net_mask)) {
                /*
                 * Don't trust pt-pt interfaces if there are
                 * other interfaces.
                 */
                if (ipif->ipif_flags & IPIF_POINTOPOINT) {
                    if (fallback_ipif == NULL) {
                        ipif_refhold_locked(ipif);
                        fallback_ipif = ipif;
                    }
                    continue;
                }

                /*
                 * Fine.  Just assume the same net mask as the
                 * directly attached subnet interface is using.
                 */
                ipif_refhold_locked(ipif);
                mutex_exit(&ill->ill_lock);
                rw_exit(&ipst->ips_ill_g_lock);
                if (fallback_ipif != NULL)
                    ipif_refrele(fallback_ipif);
                *ipifp = ipif;
                return (ipif->ipif_net_mask);
            }
        }
        mutex_exit(&ill->ill_lock);
    }
    rw_exit(&ipst->ips_ill_g_lock);

    *ipifp = fallback_ipif;
    return ((fallback_ipif != NULL) ?
        fallback_ipif->ipif_net_mask : net_mask);
}
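/*
 * Illustrative note (not from the original source): suppose bge0 is up with
 * address 192.168.1.5 and netmask 255.255.255.0, and ip_subnet_mask() is
 * asked about 192.168.1.77.  The classful guess from ip_net_mask() for this
 * class-C address is 255.255.255.0; bge0's subnet matches under that mask,
 * so the function returns bge0's configured ipif_net_mask and a held bge0
 * ipif.  Had only a point-to-point interface matched, it would be used only
 * as the fallback answer.
 */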
/*
 * ip_sioctl_copyin_setup calls ip_wput_ioctl to process the IP_IOCTL ioctl.
 */
static void
ip_wput_ioctl(queue_t *q, mblk_t *mp)
{
    IOCP iocp;
    ipft_t *ipft;
    ipllc_t *ipllc;
    mblk_t *mp1;
    cred_t *cr;
    int error = 0;
    conn_t *connp;

    ip1dbg(("ip_wput_ioctl"));
    iocp = (IOCP)mp->b_rptr;
    mp1 = mp->b_cont;
    if (mp1 == NULL) {
        iocp->ioc_error = EINVAL;
        mp->b_datap->db_type = M_IOCNAK;
        iocp->ioc_count = 0;
        qreply(q, mp);
        return;
    }

    /*
     * These IOCTLs provide various control capabilities to
     * upstream agents such as ULPs and processes.  There
     * are currently two such IOCTLs implemented.  They
     * are used by TCP to provide update information for
     * existing IREs and to forcibly delete an IRE for a
     * host that is not responding, thereby forcing an
     * attempt at a new route.
     */
    iocp->ioc_error = EINVAL;
    if (!pullupmsg(mp1, sizeof (ipllc->ipllc_cmd)))
        goto done;

    ipllc = (ipllc_t *)mp1->b_rptr;
    for (ipft = ip_ioctl_ftbl; ipft->ipft_pfi; ipft++) {
        if (ipllc->ipllc_cmd == ipft->ipft_cmd)
            break;
    }
    /*
     * prefer credential from mblk over ioctl;
     * see ip_sioctl_copyin_setup
     */
    cr = msg_getcred(mp, NULL);
    if (cr == NULL)
        cr = iocp->ioc_cr;

    /*
     * Refhold the conn in case the request gets queued up in some lookup
     */
    ASSERT(CONN_Q(q));
    connp = Q_TO_CONN(q);
    CONN_INC_REF(connp);
    if (ipft->ipft_pfi &&
        ((mp1->b_wptr - mp1->b_rptr) >= ipft->ipft_min_size ||
        pullupmsg(mp1, ipft->ipft_min_size))) {
        error = (*ipft->ipft_pfi)(q,
            (ipft->ipft_flags & IPFT_F_SELF_REPLY) ? mp : mp1, cr);
    }
    if (ipft->ipft_flags & IPFT_F_SELF_REPLY) {
        /*
         * CONN_OPER_PENDING_DONE happens in the function called
         * through ipft_pfi above.
         */
        return;
    }

    CONN_OPER_PENDING_DONE(connp);
    if (ipft->ipft_flags & IPFT_F_NO_REPLY) {
        freemsg(mp);
        return;
    }
    iocp->ioc_error = error;

done:
    mp->b_datap->db_type = M_IOCACK;
    if (iocp->ioc_error)
        iocp->ioc_count = 0;
    qreply(q, mp);
}

/*
 * Assign a unique id for the ipif.  This is used by sctp_addr.c.
 * Note: remove if sctp_addr.c is redone to not shadow ill/ipif data
 * structures.
 */
static void
ipif_assign_seqid(ipif_t *ipif)
{
    ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;

    ipif->ipif_seqid = atomic_add_64_nv(&ipst->ips_ipif_g_seqid, 1);
}
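/*
 * Illustrative note (not from the original source): the ioctl dispatch in
 * ip_wput_ioctl() is a linear scan of ip_ioctl_ftbl, whose last entry has a
 * NULL ipft_pfi and so terminates the loop whether or not a command matched.
 * A matched handler is invoked only once at least ipft_min_size bytes of the
 * payload are contiguous, pulling the message up if they are not:
 *
 *    for (ipft = ip_ioctl_ftbl; ipft->ipft_pfi; ipft++)
 *        if (cmd == ipft->ipft_cmd)
 *            break;    (the NULL-pfi terminator matches nothing)
 */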
/*
 * Clone the contents of `sipif' to `dipif'.  Requires that both ipifs are
 * administratively down (i.e., no DAD), of the same type, and locked.  Note
 * that the clone is complete -- including the seqid -- and the expectation is
 * that the caller will either free or overwrite `sipif' before it's unlocked.
 */
static void
ipif_clone(const ipif_t *sipif, ipif_t *dipif)
{
    ASSERT(MUTEX_HELD(&sipif->ipif_ill->ill_lock));
    ASSERT(MUTEX_HELD(&dipif->ipif_ill->ill_lock));
    ASSERT(!(sipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE)));
    ASSERT(!(dipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE)));
    ASSERT(sipif->ipif_ire_type == dipif->ipif_ire_type);

    dipif->ipif_flags = sipif->ipif_flags;
    dipif->ipif_metric = sipif->ipif_metric;
    dipif->ipif_zoneid = sipif->ipif_zoneid;
    dipif->ipif_v6subnet = sipif->ipif_v6subnet;
    dipif->ipif_v6lcl_addr = sipif->ipif_v6lcl_addr;
    dipif->ipif_v6net_mask = sipif->ipif_v6net_mask;
    dipif->ipif_v6brd_addr = sipif->ipif_v6brd_addr;
    dipif->ipif_v6pp_dst_addr = sipif->ipif_v6pp_dst_addr;

    /*
     * As per the comment atop the function, we assume that these sipif
     * fields will be changed before sipif is unlocked.
     */
    dipif->ipif_seqid = sipif->ipif_seqid;
    dipif->ipif_state_flags = sipif->ipif_state_flags;
}

/*
 * Transfer the contents of `sipif' to `dipif', and then free (if `virgipif'
 * is NULL) or overwrite `sipif' with `virgipif', which must be a virgin
 * (unreferenced) ipif.  Also, if `sipif' is used by the current xop, then
 * transfer the xop to `dipif'.  Requires that all ipifs are administratively
 * down (i.e., no DAD), of the same type, and unlocked.
 */
static void
ipif_transfer(ipif_t *sipif, ipif_t *dipif, ipif_t *virgipif)
{
    ipsq_t *ipsq = sipif->ipif_ill->ill_phyint->phyint_ipsq;
    ipxop_t *ipx = ipsq->ipsq_xop;

    ASSERT(sipif != dipif);
    ASSERT(sipif != virgipif);

    /*
     * Grab all of the locks that protect the ipif in a defined order.
     */
    GRAB_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill);

    ipif_clone(sipif, dipif);
    if (virgipif != NULL) {
        ipif_clone(virgipif, sipif);
        mi_free(virgipif);
    }

    RELEASE_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill);

    /*
     * Transfer ownership of the current xop, if necessary.
     */
    if (ipx->ipx_current_ipif == sipif) {
        ASSERT(ipx->ipx_pending_ipif == NULL);
        mutex_enter(&ipx->ipx_lock);
        ipx->ipx_current_ipif = dipif;
        mutex_exit(&ipx->ipx_lock);
    }

    if (virgipif == NULL)
        mi_free(sipif);
}

/*
 * Checks that:
 * - <ill_name>:<ipif_id> is at most LIFNAMSIZ - 1 characters long, and
 * - the logical interface id is within the allowed range.
 */
static int
is_lifname_valid(ill_t *ill, unsigned int ipif_id)
{
    if (snprintf(NULL, 0, "%s:%d", ill->ill_name, ipif_id) >= LIFNAMSIZ)
        return (ENAMETOOLONG);

    if (ipif_id >= ill->ill_ipst->ips_ip_addrs_per_if)
        return (ERANGE);
    return (0);
}
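/*
 * Illustrative note (not from the original source): snprintf(NULL, 0, ...)
 * returns the number of characters the formatted name would need, not
 * counting the terminating NUL.  For ill_name "bge0" and ipif_id 17 it
 * returns 7 ("bge0:17"), which passes against a LIFNAMSIZ of 32, whereas a
 * 30-character ill_name plus ":17" would yield 33 >= 32 and fail with
 * ENAMETOOLONG.
 */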
/*
 * Insert the ipif, so that the list of ipifs on the ill will be sorted
 * with respect to ipif_id.  Note that an ipif with an ipif_id of -1 will
 * be inserted into the first space available in the list.  The value of
 * ipif_id will then be set to the appropriate value for its position.
 */
static int
ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock)
{
    ill_t *ill;
    ipif_t *tipif;
    ipif_t **tipifp;
    int id, err;
    ip_stack_t *ipst;

    ASSERT(ipif->ipif_ill->ill_net_type == IRE_LOOPBACK ||
        IAM_WRITER_IPIF(ipif));

    ill = ipif->ipif_ill;
    ASSERT(ill != NULL);
    ipst = ill->ill_ipst;

    /*
     * In the case of lo0:0 we already hold the ill_g_lock:
     * ill_lookup_on_name (acquires ill_g_lock) -> ipif_allocate ->
     * ipif_insert.
     */
    if (acquire_g_lock)
        rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
    mutex_enter(&ill->ill_lock);
    id = ipif->ipif_id;
    tipifp = &(ill->ill_ipif);
    if (id == -1) {    /* need to find a real id */
        id = 0;
        while ((tipif = *tipifp) != NULL) {
            ASSERT(tipif->ipif_id >= id);
            if (tipif->ipif_id != id)
                break;    /* non-consecutive id */
            id++;
            tipifp = &(tipif->ipif_next);
        }
        if ((err = is_lifname_valid(ill, id)) != 0) {
            mutex_exit(&ill->ill_lock);
            if (acquire_g_lock)
                rw_exit(&ipst->ips_ill_g_lock);
            return (err);
        }
        ipif->ipif_id = id;    /* assign new id */
    } else if ((err = is_lifname_valid(ill, id)) == 0) {
        /* we have a real id; insert ipif in the right place */
        while ((tipif = *tipifp) != NULL) {
            ASSERT(tipif->ipif_id != id);
            if (tipif->ipif_id > id)
                break;    /* found correct location */
            tipifp = &(tipif->ipif_next);
        }
    } else {
        mutex_exit(&ill->ill_lock);
        if (acquire_g_lock)
            rw_exit(&ipst->ips_ill_g_lock);
        return (err);
    }

    ASSERT(tipifp != &(ill->ill_ipif) || id == 0);

    ipif->ipif_next = tipif;
    *tipifp = ipif;
    mutex_exit(&ill->ill_lock);
    if (acquire_g_lock)
        rw_exit(&ipst->ips_ill_g_lock);

    return (0);
}

static void
ipif_remove(ipif_t *ipif)
{
    ipif_t **ipifp;
    ill_t *ill = ipif->ipif_ill;

    ASSERT(RW_WRITE_HELD(&ill->ill_ipst->ips_ill_g_lock));

    mutex_enter(&ill->ill_lock);
    ipifp = &ill->ill_ipif;
    for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) {
        if (*ipifp == ipif) {
            *ipifp = ipif->ipif_next;
            break;
        }
    }
    mutex_exit(&ill->ill_lock);
}
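/*
 * Illustrative note (not from the original source): with existing ipif_ids
 * 0, 1 and 3 on an ill, inserting an ipif with ipif_id == -1 walks the
 * sorted list until the first gap -- ids 0 and 1 are consecutive, 3 is
 * not -- so the new ipif is assigned id 2 and linked in between 1 and 3,
 * keeping the list sorted.
 */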
/*
 * Allocate and initialize a new interface control structure.  (Always
 * called as writer.)
 * When ipif_allocate() is called from ip_ll_subnet_defaults, the ill
 * is not part of the global linked list of ills.  ipif_seqid is unique
 * in the system and to preserve the uniqueness, it is assigned only
 * when the ill becomes part of the global list.  At that point the ill will
 * have a name.  If it doesn't get assigned here, it will get assigned
 * in ipif_set_values() as part of SIOCSLIFNAME processing.
 * Additionally, if we come here from ip_ll_subnet_defaults, we don't set
 * the interface flags or any other information from the DL_INFO_ACK for
 * DL_STYLE2 drivers (initialize == B_FALSE), since we won't have them at
 * this point.  The flags etc. will be set in ip_ll_subnet_defaults when the
 * second DL_INFO_ACK comes in from the driver.
 */
static ipif_t *
ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize,
    boolean_t insert, int *errorp)
{
    int err;
    ipif_t *ipif;
    ip_stack_t *ipst = ill->ill_ipst;

    ip1dbg(("ipif_allocate(%s:%d ill %p)\n",
        ill->ill_name, id, (void *)ill));
    ASSERT(ire_type == IRE_LOOPBACK || IAM_WRITER_ILL(ill));

    if (errorp != NULL)
        *errorp = 0;

    if ((ipif = mi_alloc(sizeof (ipif_t), BPRI_MED)) == NULL) {
        if (errorp != NULL)
            *errorp = ENOMEM;
        return (NULL);
    }
    *ipif = ipif_zero;    /* start clean */

    ipif->ipif_ill = ill;
    ipif->ipif_id = id;    /* could be -1 */
    /*
     * Inherit the zoneid from the ill; for the shared stack instance
     * this is always the global zone
     */
    ipif->ipif_zoneid = ill->ill_zoneid;

    ipif->ipif_refcnt = 0;

    if (insert) {
        if ((err = ipif_insert(ipif, ire_type != IRE_LOOPBACK)) != 0) {
            mi_free(ipif);
            if (errorp != NULL)
                *errorp = err;
            return (NULL);
        }
        /* -1 id should have been replaced by real id */
        id = ipif->ipif_id;
        ASSERT(id >= 0);
    }

    if (ill->ill_name[0] != '\0')
        ipif_assign_seqid(ipif);

    /*
     * If this is the zeroth ipif on the IPMP ill, create the illgrp
     * (which must not exist yet because the zeroth ipif is created once
     * per ill).  However, do not link it to the ipmp_grp_t until
     * I_PLINK is called; see ip_sioctl_plink_ipmp() for details.
     */
    if (id == 0 && IS_IPMP(ill)) {
        if (ipmp_illgrp_create(ill) == NULL) {
            if (insert) {
                rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
                ipif_remove(ipif);
                rw_exit(&ipst->ips_ill_g_lock);
            }
            mi_free(ipif);
            if (errorp != NULL)
                *errorp = ENOMEM;
            return (NULL);
        }
    }

    /*
     * We grab ill_lock to protect the flag changes.  The ipif is still
     * not up and can't be looked up until the ioctl completes and the
     * IPIF_CHANGING flag is cleared.
     */
    mutex_enter(&ill->ill_lock);

    ipif->ipif_ire_type = ire_type;

    if (ipif->ipif_isv6) {
        ill->ill_flags |= ILLF_IPV6;
    } else {
        ipaddr_t inaddr_any = INADDR_ANY;

        ill->ill_flags |= ILLF_IPV4;

        /* Keep the IN6_IS_ADDR_V4MAPPED assertions happy */
        IN6_IPADDR_TO_V4MAPPED(inaddr_any,
            &ipif->ipif_v6lcl_addr);
        IN6_IPADDR_TO_V4MAPPED(inaddr_any,
            &ipif->ipif_v6subnet);
        IN6_IPADDR_TO_V4MAPPED(inaddr_any,
            &ipif->ipif_v6net_mask);
        IN6_IPADDR_TO_V4MAPPED(inaddr_any,
            &ipif->ipif_v6brd_addr);
        IN6_IPADDR_TO_V4MAPPED(inaddr_any,
            &ipif->ipif_v6pp_dst_addr);
    }

    /*
     * Don't set the interface flags etc. now; we will do it in
     * ip_ll_subnet_defaults.
     */
    if (!initialize)
        goto out;

    /*
     * NOTE: The IPMP meta-interface is special-cased because it starts
     * with no underlying interfaces (and thus an unknown broadcast
     * address length), but all interfaces that can be placed into an IPMP
     * group are required to be broadcast-capable.
     */
    if (ill->ill_bcast_addr_length != 0 || IS_IPMP(ill)) {
        /*
         * Later detect lack of DLPI driver multicast capability by
         * catching DL_ENABMULTI_REQ errors in ip_rput_dlpi().
         */
        ill->ill_flags |= ILLF_MULTICAST;
        if (!ipif->ipif_isv6)
            ipif->ipif_flags |= IPIF_BROADCAST;
    } else {
        if (ill->ill_net_type != IRE_LOOPBACK) {
            if (ipif->ipif_isv6)
                /*
                 * Note: xresolv interfaces will eventually need
                 * NOARP set here as well, but that will require
                 * those external resolvers to have some
                 * knowledge of that flag and act appropriately.
                 * Not to be changed at present.
                 */
                ill->ill_flags |= ILLF_NONUD;
            else
                ill->ill_flags |= ILLF_NOARP;
        }
        if (ill->ill_phys_addr_length == 0) {
            if (IS_VNI(ill)) {
                ipif->ipif_flags |= IPIF_NOXMIT;
            } else {
                /* pt-pt supports multicast. */
                ill->ill_flags |= ILLF_MULTICAST;
                if (ill->ill_net_type != IRE_LOOPBACK)
                    ipif->ipif_flags |= IPIF_POINTOPOINT;
            }
        }
    }
out:
    mutex_exit(&ill->ill_lock);
    return (ipif);
}

/*
 * Remove the neighbor cache entries associated with this logical
 * interface.
 */
int
ipif_arp_down(ipif_t *ipif)
{
    ill_t *ill = ipif->ipif_ill;
    int err = 0;

    ip1dbg(("ipif_arp_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));
    ASSERT(IAM_WRITER_IPIF(ipif));

    DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_down",
        ill_t *, ill, ipif_t *, ipif);
    ipif_nce_down(ipif);

    /*
     * If this is the last ipif that is going down and there are no
     * duplicate addresses we may yet attempt to re-probe, then we need to
     * clean up ARP completely.
     */
    if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 &&
        !ill->ill_logical_down && ill->ill_net_type == IRE_IF_RESOLVER) {
        /*
         * If this was the last ipif on an IPMP interface, purge any
         * static ARP entries associated with it.
         */
        if (IS_IPMP(ill))
            ipmp_illgrp_refresh_arpent(ill->ill_grp);

        /* UNBIND, DETACH */
        err = arp_ll_down(ill);
    }

    return (err);
}
/*
 * Get the resolver set up for a new IP address.  (Always called as writer.)
 * Called both for IPv4 and IPv6 interfaces, though it only does some
 * basic DAD related initialization for IPv6.  Honors ILLF_NOARP.
 *
 * The enumerated value res_act tunes the behavior:
 *	* Res_act_initial: set up all the resolver structures for a new
 *	  IP address.
 *	* Res_act_defend: tell ARP that it needs to send a single gratuitous
 *	  ARP message in defense of the address.
 *	* Res_act_rebind: tell ARP to change the hardware address for an IP
 *	  address (and issue gratuitous ARPs).  Used by ipmp_ill_bind_ipif().
 *
 * Returns zero on success, or an errno upon failure.
 */
int
ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act)
{
    ill_t *ill = ipif->ipif_ill;
    int err;
    boolean_t was_dup;

    ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n",
        ill->ill_name, ipif->ipif_id, (uint_t)ipif->ipif_flags));
    ASSERT(IAM_WRITER_IPIF(ipif));

    was_dup = B_FALSE;
    if (res_act == Res_act_initial) {
        ipif->ipif_addr_ready = 0;
        /*
         * We're bringing an interface up here.  There's no way that we
         * should need to shut down ARP now.
         */
        mutex_enter(&ill->ill_lock);
        if (ipif->ipif_flags & IPIF_DUPLICATE) {
            ipif->ipif_flags &= ~IPIF_DUPLICATE;
            ill->ill_ipif_dup_count--;
            was_dup = B_TRUE;
        }
        mutex_exit(&ill->ill_lock);
    }
    if (ipif->ipif_recovery_id != 0)
        (void) untimeout(ipif->ipif_recovery_id);
    ipif->ipif_recovery_id = 0;
    if (ill->ill_net_type != IRE_IF_RESOLVER) {
        ipif->ipif_addr_ready = 1;
        return (0);
    }
    /* NDP will set the ipif_addr_ready flag when it's ready */
    if (ill->ill_isv6)
        return (0);

    err = ipif_arp_up(ipif, res_act, was_dup);
    return (err);
}

/*
 * This routine restarts IPv4/IPv6 duplicate address detection (DAD)
 * when a link has just gone back up.
 */
static void
ipif_nce_start_dad(ipif_t *ipif)
{
    ncec_t *ncec;
    ill_t *ill = ipif->ipif_ill;
    boolean_t isv6 = ill->ill_isv6;

    if (isv6) {
        ncec = ncec_lookup_illgrp_v6(ipif->ipif_ill,
            &ipif->ipif_v6lcl_addr);
    } else {
        ipaddr_t v4addr;

        if (ill->ill_net_type != IRE_IF_RESOLVER ||
            (ipif->ipif_flags & IPIF_UNNUMBERED) ||
            ipif->ipif_lcl_addr == INADDR_ANY) {
            /*
             * If we can't contact ARP for some reason,
             * that's not really a problem.  Just send
             * out the routing socket notification that
             * DAD completion would have done, and continue.
             */
            ipif_mask_reply(ipif);
            ipif_up_notify(ipif);
            ipif->ipif_addr_ready = 1;
            return;
        }

        IN6_V4MAPPED_TO_IPADDR(&ipif->ipif_v6lcl_addr, v4addr);
        ncec = ncec_lookup_illgrp_v4(ipif->ipif_ill, &v4addr);
    }

    if (ncec == NULL) {
        ip1dbg(("couldn't find ncec for ipif %p leaving !ready\n",
            (void *)ipif));
        return;
    }
    if (!nce_restart_dad(ncec)) {
        /*
         * If we can't restart DAD for some reason, that's not really a
         * problem.  Just send out the routing socket notification that
         * DAD completion would have done, and continue.
         */
        ipif_up_notify(ipif);
        ipif->ipif_addr_ready = 1;
    }
    ncec_refrele(ncec);
}
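/*
 * Illustrative note (not from the original source): the v4 fallback in
 * ipif_nce_start_dad() means a non-resolver (or unnumbered, or
 * unset-address) ipif skips DAD entirely: ipif_mask_reply() and
 * ipif_up_notify() stand in for the routing socket traffic that DAD
 * completion would have produced, and the address is marked ready
 * immediately.  Only when an ncec exists does nce_restart_dad() actually
 * re-run duplicate address detection.
 */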
/*
 * Restart duplicate address detection on all interfaces on the given ill.
 *
 * This is called when an interface transitions from down to up
 * (DL_NOTE_LINK_UP) or up to down (DL_NOTE_LINK_DOWN).
 *
 * Note that since the underlying physical link has transitioned, we must cause
 * at least one routing socket message to be sent here, either via DAD
 * completion or just by default on the first ipif.  (If we don't do this, then
 * in.mpathd will see long delays when doing link-based failure recovery.)
 */
void
ill_restart_dad(ill_t *ill, boolean_t went_up)
{
    ipif_t *ipif;

    if (ill == NULL)
        return;

    /*
     * If layer two doesn't support duplicate address detection, then just
     * send the routing socket message now and be done with it.
     */
    if (!ill->ill_isv6 && arp_no_defense) {
        ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT);
        return;
    }

    for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
        if (went_up) {

            if (ipif->ipif_flags & IPIF_UP) {
                ipif_nce_start_dad(ipif);
            } else if (ipif->ipif_flags & IPIF_DUPLICATE) {
                /*
                 * kick off the bring-up process now.
                 */
                ipif_do_recovery(ipif);
            } else {
                /*
                 * Unfortunately, the first ipif is "special"
                 * and represents the underlying ill in the
                 * routing socket messages.  Thus, when this
                 * one ipif is down, we must still notify so
                 * that the user knows the IFF_RUNNING status
                 * change.  (If the first ipif is up, then
                 * we'll handle eventual routing socket
                 * notification via DAD completion.)
                 */
                if (ipif == ill->ill_ipif) {
                    ip_rts_ifmsg(ill->ill_ipif,
                        RTSQ_DEFAULT);
                }
            }
        } else {
            /*
             * After link down, we'll need to send a new routing
             * message when the link comes back, so clear
             * ipif_addr_ready.
             */
            ipif->ipif_addr_ready = 0;
        }
    }

    /*
     * If we've torn down links, then notify the user right away.
     */
    if (!went_up)
        ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT);
}

static void
ipsq_delete(ipsq_t *ipsq)
{
    ipxop_t *ipx = ipsq->ipsq_xop;

    ipsq->ipsq_ipst = NULL;
    ASSERT(ipsq->ipsq_phyint == NULL);
    ASSERT(ipsq->ipsq_xop != NULL);
    ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipx->ipx_mphead == NULL);
    ASSERT(ipx->ipx_pending_mp == NULL);
    kmem_free(ipsq, sizeof (ipsq_t));
}

static int
ill_up_ipifs_on_ill(ill_t *ill, queue_t *q, mblk_t *mp)
{
    int err = 0;
    ipif_t *ipif;

    if (ill == NULL)
        return (0);

    ASSERT(IAM_WRITER_ILL(ill));
    ill->ill_up_ipifs = B_TRUE;
    for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
        if (ipif->ipif_was_up) {
            if (!(ipif->ipif_flags & IPIF_UP))
                err = ipif_up(ipif, q, mp);
            ipif->ipif_was_up = B_FALSE;
            if (err != 0) {
                ASSERT(err == EINPROGRESS);
                return (err);
            }
        }
    }
    ill->ill_up_ipifs = B_FALSE;
    return (0);
}

/*
 * This function is called to bring up all the ipifs that were up before
 * bringing the ill down via ill_down_ipifs().
 */
int
ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp)
{
    int err;

    ASSERT(IAM_WRITER_ILL(ill));

    if (ill->ill_replumbing) {
        ill->ill_replumbing = 0;
        /*
         * Send down REPLUMB_DONE notification followed by the
         * BIND_REQ on the arp stream.
         */
        if (!ill->ill_isv6)
            arp_send_replumb_conf(ill);
    }
    err = ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv4, q, mp);
    if (err != 0)
        return (err);

    return (ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv6, q, mp));
}
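/*
 * Illustrative note (not from the original source): ill_down_ipifs() and
 * ill_up_ipifs() pair up around operations that must bounce an interface,
 * with ipif_was_up recording which logical interfaces to restore:
 *
 *    ill_down_ipifs(ill, B_FALSE);    (marks IPIF_UP ipifs ipif_was_up)
 *    ... reconfigure the ill ...
 *    err = ill_up_ipifs(ill, q, mp);  (re-plumbs only those ipifs)
 *
 * An EINPROGRESS return means the bring-up completes asynchronously in
 * ip_rput_dlpi once the DL_BIND_ACK arrives.
 */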
/*
 * Bring down any IPIF_UP ipifs on ill.  If "logical" is B_TRUE, we bring
 * down the ipifs without sending DL_UNBIND_REQ to the driver.
 */
static void
ill_down_ipifs(ill_t *ill, boolean_t logical)
{
    ipif_t *ipif;

    ASSERT(IAM_WRITER_ILL(ill));

    for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
        /*
         * We go through the ipif_down logic even if the ipif
         * is already down, since routes can be added based
         * on down ipifs.  Going through ipif_down once again
         * will delete any IREs created based on these routes.
         */
        if (ipif->ipif_flags & IPIF_UP)
            ipif->ipif_was_up = B_TRUE;

        if (logical) {
            (void) ipif_logical_down(ipif, NULL, NULL);
            ipif_non_duplicate(ipif);
            (void) ipif_down_tail(ipif);
        } else {
            (void) ipif_down(ipif, NULL, NULL);
        }
    }
}

/*
 * Redo source address selection.  This makes IXAF_VERIFY_SOURCE take
 * another look at the valid source addresses.
 * This should be called each time the set of source addresses has
 * changed.
 */
void
ip_update_source_selection(ip_stack_t *ipst)
{
    /* We skip past SRC_GENERATION_VERIFY */
    if (atomic_add_32_nv(&ipst->ips_src_generation, 1) ==
        SRC_GENERATION_VERIFY)
        atomic_add_32(&ipst->ips_src_generation, 1);
}

/*
 * Finish the group join started in ip_sioctl_groupname().
 */
/* ARGSUSED */
static void
ip_join_illgrps(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy)
{
    ill_t *ill = q->q_ptr;
    phyint_t *phyi = ill->ill_phyint;
    ipmp_grp_t *grp = phyi->phyint_grp;
    ip_stack_t *ipst = ill->ill_ipst;

    /* IS_UNDER_IPMP() won't work until ipmp_ill_join_illgrp() is called */
    ASSERT(!IS_IPMP(ill) && grp != NULL);
    ASSERT(IAM_WRITER_IPSQ(ipsq));

    if (phyi->phyint_illv4 != NULL) {
        rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
        VERIFY(grp->gr_pendv4-- > 0);
        rw_exit(&ipst->ips_ipmp_lock);
        ipmp_ill_join_illgrp(phyi->phyint_illv4, grp->gr_v4);
    }
    if (phyi->phyint_illv6 != NULL) {
        rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
        VERIFY(grp->gr_pendv6-- > 0);
        rw_exit(&ipst->ips_ipmp_lock);
        ipmp_ill_join_illgrp(phyi->phyint_illv6, grp->gr_v6);
    }
    freemsg(mp);
}
/*
 * Process an SIOCSLIFGROUPNAME request.
 */
/* ARGSUSED */
int
ip_sioctl_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
    struct lifreq *lifr = ifreq;
    ill_t *ill = ipif->ipif_ill;
    ip_stack_t *ipst = ill->ill_ipst;
    phyint_t *phyi = ill->ill_phyint;
    ipmp_grp_t *grp = phyi->phyint_grp;
    mblk_t *ipsq_mp;
    int err = 0;

    /*
     * Note that phyint_grp can only change here, where we're exclusive.
     */
    ASSERT(IAM_WRITER_ILL(ill));

    if (ipif->ipif_id != 0 || ill->ill_usesrc_grp_next != NULL ||
        (phyi->phyint_flags & PHYI_VIRTUAL))
        return (EINVAL);

    lifr->lifr_groupname[LIFGRNAMSIZ - 1] = '\0';

    rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);

    /*
     * If the name hasn't changed, there's nothing to do.
     */
    if (grp != NULL && strcmp(grp->gr_name, lifr->lifr_groupname) == 0)
        goto unlock;

    /*
     * Handle requests to rename an IPMP meta-interface.
     *
     * Note that creation of the IPMP meta-interface is handled in
     * userland through the standard plumbing sequence.  As part of the
     * plumbing the IPMP meta-interface, its initial groupname is set to
     * the name of the interface (see ipif_set_values_tail()).
     */
    if (IS_IPMP(ill)) {
        err = ipmp_grp_rename(grp, lifr->lifr_groupname);
        goto unlock;
    }

    /*
     * Handle requests to add or remove an IP interface from a group.
     */
    if (lifr->lifr_groupname[0] != '\0') {            /* add */
        /*
         * Moves are handled by first removing the interface from
         * its existing group, and then adding it to another group.
         * So, fail if it's already in a group.
         */
        if (IS_UNDER_IPMP(ill)) {
            err = EALREADY;
            goto unlock;
        }

        grp = ipmp_grp_lookup(lifr->lifr_groupname, ipst);
        if (grp == NULL) {
            err = ENOENT;
            goto unlock;
        }

        /*
         * Check if the phyint and its ills are suitable for
         * inclusion into the group.
         */
        if ((err = ipmp_grp_vet_phyint(grp, phyi)) != 0)
            goto unlock;

        /*
         * Checks pass; join the group, and enqueue the remaining
         * illgrp joins for when we've become part of the group xop
         * and are exclusive across its IPSQs.  Since qwriter_ip()
         * requires an mblk_t to scribble on, and since `mp' will be
         * freed as part of completing the ioctl, allocate another.
         */
        if ((ipsq_mp = allocb(0, BPRI_MED)) == NULL) {
            err = ENOMEM;
            goto unlock;
        }

        /*
         * Before we drop ipmp_lock, bump gr_pend* to ensure that the
         * IPMP meta-interface ills needed by `phyi' cannot go away
         * before ip_join_illgrps() is called back.  See the comments
         * in ip_sioctl_plink_ipmp() for more.
         */
        if (phyi->phyint_illv4 != NULL)
            grp->gr_pendv4++;
        if (phyi->phyint_illv6 != NULL)
            grp->gr_pendv6++;

        rw_exit(&ipst->ips_ipmp_lock);

        ipmp_phyint_join_grp(phyi, grp);
        ill_refhold(ill);
        qwriter_ip(ill, ill->ill_rq, ipsq_mp, ip_join_illgrps,
            SWITCH_OP, B_FALSE);
        return (0);
    } else {
        /*
         * Request to remove the interface from a group.  If the
         * interface is not in a group, this trivially succeeds.
         */
        rw_exit(&ipst->ips_ipmp_lock);
        if (IS_UNDER_IPMP(ill))
            ipmp_phyint_leave_grp(phyi);
        return (0);
    }
unlock:
    rw_exit(&ipst->ips_ipmp_lock);
    return (err);
}

/*
 * Process an SIOCGLIFBINDING request.
 */
/* ARGSUSED */
int
ip_sioctl_get_binding(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
    ill_t *ill;
    struct lifreq *lifr = ifreq;
    ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;

    if (!IS_IPMP(ipif->ipif_ill))
        return (EINVAL);

    rw_enter(&ipst->ips_ipmp_lock, RW_READER);
    if ((ill = ipif->ipif_bound_ill) == NULL)
        lifr->lifr_binding[0] = '\0';
    else
        (void) strlcpy(lifr->lifr_binding, ill->ill_name, LIFNAMSIZ);
    rw_exit(&ipst->ips_ipmp_lock);
    return (0);
}

/*
 * Process an SIOCGLIFGROUPNAME request.
 */
/* ARGSUSED */
int
ip_sioctl_get_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
    ipmp_grp_t *grp;
    struct lifreq *lifr = ifreq;
    ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;

    rw_enter(&ipst->ips_ipmp_lock, RW_READER);
    if ((grp = ipif->ipif_ill->ill_phyint->phyint_grp) == NULL)
        lifr->lifr_groupname[0] = '\0';
    else
        (void) strlcpy(lifr->lifr_groupname, grp->gr_name,
            LIFGRNAMSIZ);
    rw_exit(&ipst->ips_ipmp_lock);
    return (0);
}
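/*
 * Illustrative note (not from the original source): the group-add path in
 * ip_sioctl_groupname() is a two-phase join.  Phase one (under ipmp_lock)
 * vets the phyint, bumps gr_pendv4/gr_pendv6 so the group's meta-interface
 * ills cannot disappear, and joins the phyint.  Phase two runs later from
 * ip_join_illgrps(), once qwriter_ip() has made us exclusive across the
 * group's IPSQ, and performs the actual ipmp_ill_join_illgrp() calls while
 * dropping the pend counts.
 */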
/*
 * Process an SIOCGLIFGROUPINFO request.
 */
/* ARGSUSED */
int
ip_sioctl_groupinfo(ipif_t *dummy_ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *dummy)
{
    ipmp_grp_t *grp;
    lifgroupinfo_t *lifgr;
    ip_stack_t *ipst = CONNQ_TO_IPST(q);

    /* ip_wput_nondata() verified mp->b_cont->b_cont */
    lifgr = (lifgroupinfo_t *)mp->b_cont->b_cont->b_rptr;
    lifgr->gi_grname[LIFGRNAMSIZ - 1] = '\0';

    rw_enter(&ipst->ips_ipmp_lock, RW_READER);
    if ((grp = ipmp_grp_lookup(lifgr->gi_grname, ipst)) == NULL) {
        rw_exit(&ipst->ips_ipmp_lock);
        return (ENOENT);
    }
    ipmp_grp_info(grp, lifgr);
    rw_exit(&ipst->ips_ipmp_lock);
    return (0);
}
static void
ill_dl_down(ill_t *ill)
{
    DTRACE_PROBE2(ill__downup, char *, "ill_dl_down", ill_t *, ill);

    /*
     * The ill is down; unbind but stay attached since we're still
     * associated with a PPA.  If we have negotiated DLPI capabilities
     * with the data link service provider (IDS_OK) then reset them.
     * The interval between unbinding and rebinding is potentially
     * unbounded hence we cannot assume things will be the same.
     * The DLPI capabilities will be probed again when the data link
     * is brought up.
     */
    mblk_t *mp = ill->ill_unbind_mp;

    ip1dbg(("ill_dl_down(%s)\n", ill->ill_name));

    if (!ill->ill_replumbing) {
        /* Free all ilms for this ill */
        update_conn_ill(ill, ill->ill_ipst);
    } else {
        ill_leave_multicast(ill);
    }

    ill->ill_unbind_mp = NULL;
    if (mp != NULL) {
        ip1dbg(("ill_dl_down: %s (%u) for %s\n",
            dl_primstr(*(int *)mp->b_rptr), *(int *)mp->b_rptr,
            ill->ill_name));
        mutex_enter(&ill->ill_lock);
        ill->ill_state_flags |= ILL_DL_UNBIND_IN_PROGRESS;
        mutex_exit(&ill->ill_lock);
        /*
         * ip_rput does not pass up normal (M_PROTO) DLPI messages
         * after ILL_CONDEMNED is set.  So in the unplumb case, we
         * call ill_capability_dld_disable right away.  If this is not
         * an unplumb operation then the disable happens on receipt of
         * the capab ack via ip_rput_dlpi_writer ->
         * ill_capability_ack_thr.  In both cases the order of
         * the operations seen by DLD is capability disable followed
         * by DL_UNBIND.  Also the DLD capability disable needs a
         * cv_wait'able context.
         */
        if (ill->ill_state_flags & ILL_CONDEMNED)
            ill_capability_dld_disable(ill);
        ill_capability_reset(ill, B_FALSE);
        ill_dlpi_send(ill, mp);
    }
    mutex_enter(&ill->ill_lock);
    ill->ill_dl_up = 0;
    ill_nic_event_dispatch(ill, 0, NE_DOWN, NULL, 0);
    mutex_exit(&ill->ill_lock);
}

void
ill_dlpi_dispatch(ill_t *ill, mblk_t *mp)
{
    union DL_primitives *dlp;
    t_uscalar_t prim;
    boolean_t waitack = B_FALSE;

    ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);

    dlp = (union DL_primitives *)mp->b_rptr;
    prim = dlp->dl_primitive;

    ip1dbg(("ill_dlpi_dispatch: sending %s (%u) to %s\n",
        dl_primstr(prim), prim, ill->ill_name));

    switch (prim) {
    case DL_PHYS_ADDR_REQ:
    {
        dl_phys_addr_req_t *dlpap = (dl_phys_addr_req_t *)mp->b_rptr;
        ill->ill_phys_addr_pend = dlpap->dl_addr_type;
        break;
    }
    case DL_BIND_REQ:
        mutex_enter(&ill->ill_lock);
        ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS;
        mutex_exit(&ill->ill_lock);
        break;
    }

    /*
     * Except for the ACKs for the M_PCPROTO messages, all other ACKs
     * are dropped by ip_rput() if ILL_CONDEMNED is set.  Therefore
     * we only wait for the ACK of the DL_UNBIND_REQ.
     */
    mutex_enter(&ill->ill_lock);
    if (!(ill->ill_state_flags & ILL_CONDEMNED) ||
        (prim == DL_UNBIND_REQ)) {
        ill->ill_dlpi_pending = prim;
        waitack = B_TRUE;
    }

    mutex_exit(&ill->ill_lock);
    DTRACE_PROBE3(ill__dlpi, char *, "ill_dlpi_dispatch",
        char *, dl_primstr(prim), ill_t *, ill);
    putnext(ill->ill_wq, mp);

    /*
     * There is no ack for DL_NOTIFY_CONF messages
     */
    if (waitack && prim == DL_NOTIFY_CONF)
        ill_dlpi_done(ill, prim);
}
/*
 * Helper function for ill_dlpi_send().
 */
/* ARGSUSED */
static void
ill_dlpi_send_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
{
    ill_dlpi_send(q->q_ptr, mp);
}

/*
 * Send a DLPI control message to the driver but make sure there
 * is only one outstanding message.  Uses ill_dlpi_pending to tell
 * when it must queue.  ip_rput_dlpi_writer calls ill_dlpi_done()
 * when an ACK or a NAK is received to process the next queued message.
 */
void
ill_dlpi_send(ill_t *ill, mblk_t *mp)
{
    mblk_t **mpp;

    ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);

    /*
     * To ensure that any DLPI requests for the current exclusive
     * operation are always completely sent before any DLPI messages for
     * other operations, require writer access before enqueuing.
     */
    if (!IAM_WRITER_ILL(ill)) {
        ill_refhold(ill);
        /* qwriter_ip() does the ill_refrele() */
        qwriter_ip(ill, ill->ill_wq, mp, ill_dlpi_send_writer,
            NEW_OP, B_TRUE);
        return;
    }

    mutex_enter(&ill->ill_lock);
    if (ill->ill_dlpi_pending != DL_PRIM_INVAL) {
        /* Must queue message.  Tail insertion */
        mpp = &ill->ill_dlpi_deferred;
        while (*mpp != NULL)
            mpp = &((*mpp)->b_next);

        ip1dbg(("ill_dlpi_send: deferring request for %s "
            "while %s pending\n", ill->ill_name,
            dl_primstr(ill->ill_dlpi_pending)));

        *mpp = mp;
        mutex_exit(&ill->ill_lock);
        return;
    }
    mutex_exit(&ill->ill_lock);
    ill_dlpi_dispatch(ill, mp);
}

void
ill_capability_send(ill_t *ill, mblk_t *mp)
{
    ill->ill_capab_pending_cnt++;
    ill_dlpi_send(ill, mp);
}

void
ill_capability_done(ill_t *ill)
{
    ASSERT(ill->ill_capab_pending_cnt != 0);

    ill_dlpi_done(ill, DL_CAPABILITY_REQ);

    ill->ill_capab_pending_cnt--;
    if (ill->ill_capab_pending_cnt == 0 &&
        ill->ill_dlpi_capab_state == IDCS_OK)
        ill_capability_reset_alloc(ill);
}

/*
 * Send all deferred DLPI messages without waiting for their ACKs.
 */
void
ill_dlpi_send_deferred(ill_t *ill)
{
    mblk_t *mp, *nextmp;

    /*
     * Clear ill_dlpi_pending so that the message is not queued in
     * ill_dlpi_send().
     */
    mutex_enter(&ill->ill_lock);
    ill->ill_dlpi_pending = DL_PRIM_INVAL;
    mp = ill->ill_dlpi_deferred;
    ill->ill_dlpi_deferred = NULL;
    mutex_exit(&ill->ill_lock);

    for (; mp != NULL; mp = nextmp) {
        nextmp = mp->b_next;
        mp->b_next = NULL;
        ill_dlpi_send(ill, mp);
    }
}

/*
 * Clear all the deferred DLPI messages.  Called on receiving an M_ERROR
 * or M_HANGUP.
 */
static void
ill_dlpi_clear_deferred(ill_t *ill)
{
    mblk_t *mp, *nextmp;

    mutex_enter(&ill->ill_lock);
    ill->ill_dlpi_pending = DL_PRIM_INVAL;
    mp = ill->ill_dlpi_deferred;
    ill->ill_dlpi_deferred = NULL;
    mutex_exit(&ill->ill_lock);

    for (; mp != NULL; mp = nextmp) {
        nextmp = mp->b_next;
        inet_freemsg(mp);
    }
}

/*
 * Check if the DLPI primitive `prim' is pending; print a warning if not.
 */
boolean_t
ill_dlpi_pending(ill_t *ill, t_uscalar_t prim)
{
    t_uscalar_t pending;

    mutex_enter(&ill->ill_lock);
    if (ill->ill_dlpi_pending == prim) {
        mutex_exit(&ill->ill_lock);
        return (B_TRUE);
    }

    /*
     * During teardown, ill_dlpi_dispatch() will send DLPI requests
     * without waiting, so don't print any warnings in that case.
     */
    if (ill->ill_state_flags & ILL_CONDEMNED) {
        mutex_exit(&ill->ill_lock);
        return (B_FALSE);
    }
    pending = ill->ill_dlpi_pending;
    mutex_exit(&ill->ill_lock);

    if (pending == DL_PRIM_INVAL) {
        (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE,
            "received unsolicited ack for %s on %s\n",
            dl_primstr(prim), ill->ill_name);
    } else {
        (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE,
            "received unexpected ack for %s on %s (expecting %s)\n",
            dl_primstr(prim), ill->ill_name, dl_primstr(pending));
    }
    return (B_FALSE);
}
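/*
 * Illustrative note (not from the original source): the single-outstanding
 * DLPI protocol implemented above, end to end.  A caller hands
 * ill_dlpi_send() a DL_BIND_REQ while a DL_ATTACH_REQ is still pending; the
 * bind is queued on ill_dlpi_deferred.  When the ack for the attach arrives,
 * ip_rput_dlpi_writer calls ill_dlpi_done(DL_ATTACH_REQ), which dequeues the
 * bind and hands it to ill_dlpi_dispatch(), setting ill_dlpi_pending to
 * DL_BIND_REQ until its ack arrives in turn.
 */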
/*
 * Complete the current DLPI operation associated with `prim' on `ill' and
 * start the next queued DLPI operation (if any).  If there are no queued DLPI
 * operations and the ill's current exclusive IPSQ operation has finished
 * (i.e., ipsq_current_finish() was called), then clear ipsq_current_ipif to
 * allow the next exclusive IPSQ operation to begin upon ipsq_exit().  See
 * the comments above ipsq_current_finish() for details.
 */
void
ill_dlpi_done(ill_t *ill, t_uscalar_t prim)
{
    mblk_t *mp;
    ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
    ipxop_t *ipx = ipsq->ipsq_xop;

    ASSERT(IAM_WRITER_IPSQ(ipsq));
    mutex_enter(&ill->ill_lock);

    ASSERT(prim != DL_PRIM_INVAL);
    ASSERT(ill->ill_dlpi_pending == prim);

    ip1dbg(("ill_dlpi_done: %s has completed %s (%u)\n", ill->ill_name,
        dl_primstr(ill->ill_dlpi_pending), ill->ill_dlpi_pending));

    if ((mp = ill->ill_dlpi_deferred) == NULL) {
        ill->ill_dlpi_pending = DL_PRIM_INVAL;
        if (ipx->ipx_current_done) {
            mutex_enter(&ipx->ipx_lock);
            ipx->ipx_current_ipif = NULL;
            mutex_exit(&ipx->ipx_lock);
        }
        cv_signal(&ill->ill_cv);
        mutex_exit(&ill->ill_lock);
        return;
    }

    ill->ill_dlpi_deferred = mp->b_next;
    mp->b_next = NULL;
    mutex_exit(&ill->ill_lock);

    ill_dlpi_dispatch(ill, mp);
}

/*
 * Queue a (multicast) DLPI control message to be sent to the driver by
 * later calling ill_dlpi_send_queued.
 * We queue them while holding a lock (ill_mcast_lock) to ensure that they
 * are sent in order, i.e., to prevent a DL_DISABMULTI_REQ and
 * DL_ENABMULTI_REQ for the same group from racing.
 * We send DLPI control messages in order using ill_lock.
 * For IPMP we should be called on the cast_ill.
 */
void
ill_dlpi_queue(ill_t *ill, mblk_t *mp)
{
    mblk_t **mpp;

    ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);

    mutex_enter(&ill->ill_lock);
    /* Must queue message.  Tail insertion */
    mpp = &ill->ill_dlpi_deferred;
    while (*mpp != NULL)
        mpp = &((*mpp)->b_next);

    *mpp = mp;
    mutex_exit(&ill->ill_lock);
}
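/*
 * Illustrative note (not from the original source): the ordering guarantee
 * ill_dlpi_queue() provides.  A thread leaving and rejoining the same
 * multicast group queues DL_DISABMULTI_REQ then DL_ENABMULTI_REQ under
 * ill_mcast_lock; because both land on the same tail-inserted
 * ill_dlpi_deferred list and ill_dlpi_send_queued() drains it in order,
 * the driver can never observe the enable before the disable.
 */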
/*
 * Send the messages that were queued.  Make sure there is only
 * one outstanding message.  ip_rput_dlpi_writer calls ill_dlpi_done()
 * when an ACK or a NAK is received to process the next queued message.
 * For IPMP we are called on the upper ill, but we send what is queued
 * on the cast_ill.
 */
void
ill_dlpi_send_queued(ill_t *ill)
{
    mblk_t *mp;
    union DL_primitives *dlp;
    t_uscalar_t prim;
    ill_t *release_ill = NULL;

    if (IS_IPMP(ill)) {
        /* On the upper IPMP ill. */
        release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
        if (release_ill == NULL) {
            /* Avoid ever sending anything down to the ipmpstub */
            return;
        }
        ill = release_ill;
    }
    mutex_enter(&ill->ill_lock);
    while ((mp = ill->ill_dlpi_deferred) != NULL) {
        if (ill->ill_dlpi_pending != DL_PRIM_INVAL) {
            /* Can't send.  Somebody else will send it */
            mutex_exit(&ill->ill_lock);
            goto done;
        }
        ill->ill_dlpi_deferred = mp->b_next;
        mp->b_next = NULL;
        if (!ill->ill_dl_up) {
            /*
             * Nobody there.  All multicast addresses will be
             * re-joined when we get the DL_BIND_ACK bringing the
             * interface up.
             */
            freemsg(mp);
            continue;
        }
        dlp = (union DL_primitives *)mp->b_rptr;
        prim = dlp->dl_primitive;

        if (!(ill->ill_state_flags & ILL_CONDEMNED) ||
            (prim == DL_UNBIND_REQ)) {
            ill->ill_dlpi_pending = prim;
        }
        mutex_exit(&ill->ill_lock);

        DTRACE_PROBE3(ill__dlpi, char *, "ill_dlpi_send_queued",
            char *, dl_primstr(prim), ill_t *, ill);
        putnext(ill->ill_wq, mp);
        mutex_enter(&ill->ill_lock);
    }
    mutex_exit(&ill->ill_lock);
done:
    if (release_ill != NULL)
        ill_refrele(release_ill);
}

/*
 * Queue an IP (IGMP/MLD) message to be sent by IP from
 * ill_mcast_send_queued.
 * We queue them while holding a lock (ill_mcast_lock) to ensure that they
 * are sent in order, i.e., to prevent an IGMP leave and IGMP join for the
 * same group from racing.
 * We send them in order using ill_lock.
 * For IPMP we are called on the upper ill, but we queue on the cast_ill.
 */
void
ill_mcast_queue(ill_t *ill, mblk_t *mp)
{
    mblk_t **mpp;
    ill_t *release_ill = NULL;

    ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));

    if (IS_IPMP(ill)) {
        /* On the upper IPMP ill. */
        release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
        if (release_ill == NULL) {
            /* Discard instead of queuing for the ipmp interface */
            BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
            ip_drop_output("ipIfStatsOutDiscards - no cast_ill",
                mp, ill);
            freemsg(mp);
            return;
        }
        ill = release_ill;
    }

    mutex_enter(&ill->ill_lock);
    /* Must queue message.  Tail insertion */
    mpp = &ill->ill_mcast_deferred;
    while (*mpp != NULL)
        mpp = &((*mpp)->b_next);

    *mpp = mp;
    mutex_exit(&ill->ill_lock);
    if (release_ill != NULL)
        ill_refrele(release_ill);
}
/*
 * Send the IP packets that were queued by ill_mcast_queue.
 * These are IGMP/MLD packets.
 *
 * For IPMP we are called on the upper ill, but we send what is queued
 * on the cast_ill.
 *
 * Request loopback of the report if we are acting as a multicast
 * router, so that the process-level routing daemon can hear it.
 * This will run multiple times for the same group if there are members
 * on the same group for multiple ipifs on the same ill.  The
 * igmp_input/mld_input code will suppress the duplicates caused by the
 * loopback, thus we always loop back the membership report.
 *
 * We also need to make sure that this does not get load balanced
 * by IPMP.  We do this by passing an ill to ip_output_simple.
 */
void
ill_mcast_send_queued(ill_t *ill)
{
    mblk_t *mp;
    ip_xmit_attr_t ixas;
    ill_t *release_ill = NULL;

    if (IS_IPMP(ill)) {
        /* On the upper IPMP ill. */
        release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
        if (release_ill == NULL) {
            /*
             * We should have no messages on the ipmp interface,
             * and there would be no point in trying to send them.
             */
            return;
        }
        ill = release_ill;
    }
    bzero(&ixas, sizeof (ixas));
    ixas.ixa_zoneid = ALL_ZONES;
    ixas.ixa_cred = kcred;
    ixas.ixa_cpid = NOPID;
    ixas.ixa_tsl = NULL;
    /*
     * Here we set ixa_ifindex.  If IPMP it will be the lower ill which
     * makes ip_select_route pick the IRE_MULTICAST for the cast_ill.
     * That is necessary to handle IGMP/MLD snooping switches.
     */
    ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex;
    ixas.ixa_ipst = ill->ill_ipst;

    mutex_enter(&ill->ill_lock);
    while ((mp = ill->ill_mcast_deferred) != NULL) {
        ill->ill_mcast_deferred = mp->b_next;
        mp->b_next = NULL;
        if (!ill->ill_dl_up) {
            /*
             * Nobody there.  Just drop the ip packets.
             * IGMP/MLD will resend later, if this is a replumb.
             */
            freemsg(mp);
            continue;
        }
        mutex_enter(&ill->ill_phyint->phyint_lock);
        if (IS_UNDER_IPMP(ill) && !ipmp_ill_is_active(ill)) {
            /*
             * When the ill is getting deactivated, we only want to
             * send the DLPI messages, so drop IGMP/MLD packets.
             * DLPI messages are handled by ill_dlpi_send_queued().
             */
            mutex_exit(&ill->ill_phyint->phyint_lock);
            freemsg(mp);
            continue;
        }
        mutex_exit(&ill->ill_phyint->phyint_lock);
        mutex_exit(&ill->ill_lock);

        /* Check whether we are sending IPv4 or IPv6. */
        if (ill->ill_isv6) {
            ip6_t *ip6h = (ip6_t *)mp->b_rptr;

            ixas.ixa_multicast_ttl = ip6h->ip6_hops;
            ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6;
        } else {
            ipha_t *ipha = (ipha_t *)mp->b_rptr;

            ixas.ixa_multicast_ttl = ipha->ipha_ttl;
            ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
            ixas.ixa_flags &= ~IXAF_SET_ULP_CKSUM;
        }

        ixas.ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_SOURCE;
        (void) ip_output_simple(mp, &ixas);
        ixa_cleanup(&ixas);

        mutex_enter(&ill->ill_lock);
    }
    mutex_exit(&ill->ill_lock);

    if (release_ill != NULL)
        ill_refrele(release_ill);
}
/*
 * Take down a specific interface, but don't lose any information about it.
 * (Always called as writer.)
 * This function goes through the down sequence even if the interface is
 * already down.  There are 2 reasons.
 * a. Currently we permit interface routes that depend on down interfaces
 *    to be added.  This behaviour itself is questionable.  However it appears
 *    that both Solaris and 4.3 BSD have exhibited this behaviour for a long
 *    time.  We go through the cleanup in order to remove these routes.
 * b. The bringup of the interface could fail in ill_dl_up i.e. we get
 *    DL_ERROR_ACK in response to the DL_BIND request.  The interface is
 *    down, but we need to cleanup i.e. do ill_dl_down and
 *    ip_rput_dlpi_writer (DL_ERROR_ACK) -> ipif_down.
 *
 * IP-MT notes:
 *
 * Model of reference to interfaces.
 *
 * The following members in ipif_t track references to the ipif.
 *	int	ipif_refcnt;	Active reference count
 *
 * The following members in ill_t track references to the ill.
 *	int	ill_refcnt;	active refcnt
 *	uint_t	ill_ire_cnt;	Number of ires referencing ill
 *	uint_t	ill_ncec_cnt;	Number of ncecs referencing ill
 *	uint_t	ill_nce_cnt;	Number of nces referencing ill
 *	uint_t	ill_ilm_cnt;	Number of ilms referencing ill
 *
 * A reference to an ipif or ill can be obtained in any of the following ways.
 *
 *	Through the lookup functions ipif_lookup_* / ill_lookup_*
 *	Pointers to ipif / ill from other data structures viz ire and conn.
 *	Implicit reference to the ipif / ill by holding a reference to the ire.
 *
 * The ipif/ill lookup functions return a reference held ipif / ill.
 * ipif_refcnt and ill_refcnt track the reference counts respectively.
 * This is a purely dynamic reference count associated with threads holding
 * references to the ipif / ill.  Pointers from other structures do not
 * count towards this reference count.
 *
 * ill_ire_cnt is the number of ire's associated with the
 * ill.  This is incremented whenever a new ire is created referencing the
 * ill.  This is done atomically inside ire_add_v[46] where the ire is
 * actually added to the ire hash table.  The count is decremented in
 * ire_inactive where the ire is destroyed.
 *
 * ill_ncec_cnt is the number of ncec's referencing the ill through ncec_ill.
 * This is incremented atomically in
 * ndp_add_v4()/ndp_add_v6() where the nce is actually added to the
 * table.  Similarly it is decremented in ncec_inactive() where the ncec
 * is destroyed.
 *
 * ill_nce_cnt is the number of nce's referencing the ill through nce_ill.
 * This is incremented atomically in nce_add() where the nce is actually
 * added to the ill_nce.  Similarly it is decremented in nce_inactive()
 * where the nce is destroyed.
 *
 * ill_ilm_cnt is the ilm's reference to the ill.  It is incremented in
 * ilm_add() and decremented before the ilm is freed in ilm_delete().
 *
 * Flow of ioctls involving interface down/up
 *
 * The following is the sequence of an attempt to set some critical flags on an
 * up interface.
 *	ip_sioctl_flags
 *	ipif_down
 *	wait for ipif to be quiescent
 *	ipif_down_tail
 *	ip_sioctl_flags_tail
 *
 * All set ioctls that involve a down/up sequence would have a skeleton similar
 * to the above.  All the *tail functions are called after the refcounts have
 * dropped to the appropriate values.
 *
 * SIOC ioctls during the IPIF_CHANGING interval.
 *
 * Threads handling SIOC set ioctls serialize on the squeue, but this
 * is not done for SIOC get ioctls.  Since a set ioctl can cause several
 * steps of internal changes to the state, some of which are visible in
 * ipif_flags (such as IFF_UP being cleared and later set), and we want
 * the set ioctl to be atomic with respect to the get ioctls, the SIOC get
 * code will wait and restart ioctls if IPIF_CHANGING is set.  The mblk is
 * then enqueued in the ipsq and the operation is restarted by ipsq_exit()
 * when the current exclusive operation completes.  The IPIF_CHANGING check
 * and enqueue is atomic using the ill_lock and ipsq_lock.  The
 * lookup is done holding the ill_lock.  Hence the ill/ipif state flags can't
 * change while the ill_lock is held.  Before dropping the ill_lock we acquire
 * the ipsq_lock and call ipsq_enq.  This ensures that ipsq_exit can't finish
 * until we release the ipsq_lock, even though the ill/ipif state flags
 * can change after we drop the ill_lock.
 */
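/*
 * Illustrative note (not from the original source): how the asynchronous
 * half of the skeleton above completes.  If ipif_down() finds the ipif is
 * not yet quiescent, it stores the ioctl mblk via ipsq_pending_mp_add()
 * and returns EINPROGRESS; when the final ire/ncec/ilm reference is
 * dropped, ipif_ill_refrele_tail() retrieves that mblk and restarts the
 * operation, which then proceeds through ipif_down_tail() and the ioctl's
 * *_tail function:
 *
 *	err = ipif_down(ipif, q, mp);
 *	if (err == EINPROGRESS)
 *		return (err);	(resumed later by ipif_ill_refrele_tail)
 *	(void) ipif_down_tail(ipif);
 */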
int
ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
{
    ill_t *ill = ipif->ipif_ill;
    conn_t *connp;
    boolean_t success;
    boolean_t ipif_was_up = B_FALSE;
    ip_stack_t *ipst = ill->ill_ipst;

    ASSERT(IAM_WRITER_IPIF(ipif));

    ip1dbg(("ipif_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));

    DTRACE_PROBE3(ipif__downup, char *, "ipif_down",
        ill_t *, ill, ipif_t *, ipif);

    if (ipif->ipif_flags & IPIF_UP) {
        mutex_enter(&ill->ill_lock);
        ipif->ipif_flags &= ~IPIF_UP;
        ASSERT(ill->ill_ipif_up_count > 0);
        --ill->ill_ipif_up_count;
        mutex_exit(&ill->ill_lock);
        ipif_was_up = B_TRUE;
        /* Update status in SCTP's list */
        sctp_update_ipif(ipif, SCTP_IPIF_DOWN);
        ill_nic_event_dispatch(ipif->ipif_ill,
            MAP_IPIF_ID(ipif->ipif_id), NE_LIF_DOWN, NULL, 0);
    }

    /*
     * Blow away memberships we established in ipif_multicast_up().
     */
    ipif_multicast_down(ipif);

    /*
     * Remove from the mapping for __sin6_src_id.  We insert only
     * when the address is not INADDR_ANY.  As IPv4 addresses are
     * stored as mapped addresses, we need to check for mapped
     * INADDR_ANY also.
     */
    if (ipif_was_up && !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) &&
        !IN6_IS_ADDR_V4MAPPED_ANY(&ipif->ipif_v6lcl_addr) &&
        !(ipif->ipif_flags & IPIF_NOLOCAL)) {
        int err;

        err = ip_srcid_remove(&ipif->ipif_v6lcl_addr,
            ipif->ipif_zoneid, ipst);
        if (err != 0) {
            ip0dbg(("ipif_down: srcid_remove %d\n", err));
        }
    }

    if (ipif_was_up) {
        /* only delete if we'd added ire's before */
        if (ipif->ipif_isv6)
            ipif_delete_ires_v6(ipif);
        else
            ipif_delete_ires_v4(ipif);
    }

    if (ipif_was_up && ill->ill_ipif_up_count == 0) {
        /*
         * Since the interface is now down, it may have just become
         * inactive.  Note that this needs to be done even for an
         * ipif_logical_down(), or ARP entries will not get correctly
         * restored when the interface comes back up.
         */
        if (IS_UNDER_IPMP(ill))
            ipmp_ill_refresh_active(ill);
    }

    /*
     * Delete the neighbor-discovery or arp entries for this interface.
     * The ipif has to be quiesced, so we walk all the nce's and delete
     * those that point at the ipif->ipif_ill.  At the same time, we also
     * update IPMP so that ipifs for data addresses are unbound.  We don't
     * call ipif_arp_down to DL_UNBIND the arp stream itself here, but
     * defer that to ipif_down_tail().
     */
    ipif_nce_down(ipif);

    /*
     * If this is the last ipif on the ill, we also need to remove
     * any IREs with ire_ill set.  Otherwise ipif_is_quiescent() will
     * never succeed.
     */
    if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0)
        ire_walk_ill(0, 0, ill_downi, ill, ill);

    /*
     * Walk all CONNs that can have a reference on an ire for this
     * ipif (we actually walk all that now have stale references).
     */
    ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst);

    /*
     * If mp is NULL the caller will wait for the appropriate refcnt.
     * E.g. ip_sioctl_removeif -> ipif_free -> ipif_down
     * and ill_delete -> ipif_free -> ipif_down
     */
ip_sioctl_removeif -> ipif_free -> ipif_down 13156 * and ill_delete -> ipif_free -> ipif_down 13157 */ 13158 if (mp == NULL) { 13159 ASSERT(q == NULL); 13160 return (0); 13161 } 13162 13163 if (CONN_Q(q)) { 13164 connp = Q_TO_CONN(q); 13165 mutex_enter(&connp->conn_lock); 13166 } else { 13167 connp = NULL; 13168 } 13169 mutex_enter(&ill->ill_lock); 13170 /* 13171 * Are there any ire's pointing to this ipif that are still active? 13172 * If this is the last ipif going down, are there any ire's pointing 13173 * to this ill that are still active? 13174 */ 13175 if (ipif_is_quiescent(ipif)) { 13176 mutex_exit(&ill->ill_lock); 13177 if (connp != NULL) 13178 mutex_exit(&connp->conn_lock); 13179 return (0); 13180 } 13181 13182 ip1dbg(("ipif_down: need to wait, adding pending mp %s ill %p\n", 13183 ill->ill_name, (void *)ill)); 13184 /* 13185 * Enqueue the mp atomically in ipsq_pending_mp. When the refcount 13186 * drops down, the operation will be restarted by ipif_ill_refrele_tail 13187 * which in turn is called by the last refrele on the ipif/ill/ire. 13188 */ 13189 success = ipsq_pending_mp_add(connp, ipif, q, mp, IPIF_DOWN); 13190 if (!success) { 13191 /* The conn is closing. So just return */ 13192 ASSERT(connp != NULL); 13193 mutex_exit(&ill->ill_lock); 13194 mutex_exit(&connp->conn_lock); 13195 return (EINTR); 13196 } 13197 13198 mutex_exit(&ill->ill_lock); 13199 if (connp != NULL) 13200 mutex_exit(&connp->conn_lock); 13201 return (EINPROGRESS); 13202 } 13203 13204 int 13205 ipif_down_tail(ipif_t *ipif) 13206 { 13207 ill_t *ill = ipif->ipif_ill; 13208 int err = 0; 13209 13210 DTRACE_PROBE3(ipif__downup, char *, "ipif_down_tail", 13211 ill_t *, ill, ipif_t *, ipif); 13212 13213 /* 13214 * Skip any loopback interface (null wq). 13215 * If this is the last logical interface on the ill, 13216 * have ill_dl_down tell the driver we are gone (unbind). 13217 * Note that lun 0 can ipif_down even though 13218 * there are other logical units that are up. 13219 * This occurs e.g. when we change a "significant" IFF_ flag. 13220 */ 13221 if (ill->ill_wq != NULL && !ill->ill_logical_down && 13222 ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 && 13223 ill->ill_dl_up) { 13224 ill_dl_down(ill); 13225 } 13226 if (!ipif->ipif_isv6) 13227 err = ipif_arp_down(ipif); 13228 13229 ill->ill_logical_down = 0; 13230 13231 ip_rts_ifmsg(ipif, RTSQ_DEFAULT); 13232 ip_rts_newaddrmsg(RTM_DELETE, 0, ipif, RTSQ_DEFAULT); 13233 return (err); 13234 } 13235 13236 /* 13237 * Bring the interface logically down without bringing the physical interface 13238 * down, e.g. when the netmask is changed. This avoids long-lasting link 13239 * negotiations between an ethernet interface and certain switches. 13240 */ 13241 static int 13242 ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp) 13243 { 13244 DTRACE_PROBE3(ipif__downup, char *, "ipif_logical_down", 13245 ill_t *, ipif->ipif_ill, ipif_t *, ipif); 13246 13247 /* 13248 * The ill_logical_down flag is a transient flag. It is set here 13249 * and is cleared once the down has completed in ipif_down_tail. 13250 * This flag does not indicate whether the ill stream is in the 13251 * DL_BOUND state with the driver. Instead this flag is used by 13252 * ipif_down_tail to determine whether to DL_UNBIND the stream with 13253 * the driver. The state of the ill stream i.e. whether it is 13254 * DL_BOUND with the driver or not is indicated by the ill_dl_up flag.
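 *
 * For example, when the netmask changes on an up interface, the down
 * half of the dance sketched above is:
 *	ipif_logical_down	sets ill_logical_down
 *	ipif_down		waits for the ipif to quiesce
 *	ipif_down_tail		sees ill_logical_down, skips ill_dl_down()
 *				(so no DL_UNBIND), then clears the flag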
13255 */ 13256 ipif->ipif_ill->ill_logical_down = 1; 13257 return (ipif_down(ipif, q, mp)); 13258 } 13259 13260 /* 13261 * Initiate deallocation of an IPIF. Always called as writer. Called by 13262 * ill_delete or ip_sioctl_removeif. 13263 */ 13264 static void 13265 ipif_free(ipif_t *ipif) 13266 { 13267 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 13268 13269 ASSERT(IAM_WRITER_IPIF(ipif)); 13270 13271 if (ipif->ipif_recovery_id != 0) 13272 (void) untimeout(ipif->ipif_recovery_id); 13273 ipif->ipif_recovery_id = 0; 13274 13275 /* 13276 * Take down the interface. We can be called either from ill_delete 13277 * or from ip_sioctl_removeif. 13278 */ 13279 (void) ipif_down(ipif, NULL, NULL); 13280 13281 /* 13282 * Now that the interface is down, there's no chance it can still 13283 * become a duplicate. Cancel any timer that may have been set while 13284 * tearing down. 13285 */ 13286 if (ipif->ipif_recovery_id != 0) 13287 (void) untimeout(ipif->ipif_recovery_id); 13288 ipif->ipif_recovery_id = 0; 13289 13290 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 13291 /* Remove pointers to this ill in the multicast routing tables */ 13292 reset_mrt_vif_ipif(ipif); 13293 /* If necessary, clear the cached source ipif rotor. */ 13294 if (ipif->ipif_ill->ill_src_ipif == ipif) 13295 ipif->ipif_ill->ill_src_ipif = NULL; 13296 rw_exit(&ipst->ips_ill_g_lock); 13297 } 13298 13299 static void 13300 ipif_free_tail(ipif_t *ipif) 13301 { 13302 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 13303 13304 /* 13305 * Need to hold both ill_g_lock and ill_lock while 13306 * inserting or removing an ipif from the linked list 13307 * of ipifs hanging off the ill. 13308 */ 13309 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 13310 13311 #ifdef DEBUG 13312 ipif_trace_cleanup(ipif); 13313 #endif 13314 13315 /* Ask SCTP to take it out of its list */ 13316 sctp_update_ipif(ipif, SCTP_IPIF_REMOVE); 13317 ip_rts_newaddrmsg(RTM_FREEADDR, 0, ipif, RTSQ_DEFAULT); 13318 13319 /* Get it out of the ILL interface list. */ 13320 ipif_remove(ipif); 13321 rw_exit(&ipst->ips_ill_g_lock); 13322 13323 ASSERT(!(ipif->ipif_flags & (IPIF_UP | IPIF_DUPLICATE))); 13324 ASSERT(ipif->ipif_recovery_id == 0); 13325 ASSERT(ipif->ipif_ire_local == NULL); 13326 ASSERT(ipif->ipif_ire_if == NULL); 13327 13328 /* Free the memory. */ 13329 mi_free(ipif); 13330 } 13331 13332 /* 13333 * Sets `buf' to an ipif name of the form "ill_name:id", or "ill_name" if "id" 13334 * is zero. 13335 */ 13336 void 13337 ipif_get_name(const ipif_t *ipif, char *buf, int len) 13338 { 13339 char lbuf[LIFNAMSIZ]; 13340 char *name; 13341 size_t name_len; 13342 13343 buf[0] = '\0'; 13344 name = ipif->ipif_ill->ill_name; 13345 name_len = ipif->ipif_ill->ill_name_length; 13346 if (ipif->ipif_id != 0) { 13347 (void) sprintf(lbuf, "%s%c%d", name, IPIF_SEPARATOR_CHAR, 13348 ipif->ipif_id); 13349 name = lbuf; 13350 name_len = mi_strlen(name) + 1; 13351 } 13352 len -= 1; 13353 buf[len] = '\0'; 13354 len = MIN(len, name_len); 13355 bcopy(name, buf, len); 13356 } 13357 13358 /* 13359 * Sets `buf' to an ill name. 13360 */ 13361 void 13362 ill_get_name(const ill_t *ill, char *buf, int len) 13363 { 13364 char *name; 13365 size_t name_len; 13366 13367 name = ill->ill_name; 13368 name_len = ill->ill_name_length; 13369 len -= 1; 13370 buf[len] = '\0'; 13371 len = MIN(len, name_len); 13372 bcopy(name, buf, len); 13373 } 13374 13375 /* 13376 * Find an IPIF based on the name passed in. Names can be of the form <phys> 13377 * (e.g., le0) or <phys>:<#> (e.g., le0:1).
When there is no colon, the 13378 * implied unit id is zero. <phys> must correspond to the name of an ILL. 13379 * (May be called as writer.) 13380 */ 13381 static ipif_t * 13382 ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, 13383 boolean_t *exists, boolean_t isv6, zoneid_t zoneid, ip_stack_t *ipst) 13384 { 13385 char *cp; 13386 char *endp; 13387 long id; 13388 ill_t *ill; 13389 ipif_t *ipif; 13390 uint_t ire_type; 13391 boolean_t did_alloc = B_FALSE; 13392 13393 /* 13394 * If the caller wants us to create the ipif, make sure we have a 13395 * valid zoneid. 13396 */ 13397 ASSERT(!do_alloc || zoneid != ALL_ZONES); 13398 13399 if (namelen == 0) { 13400 return (NULL); 13401 } 13402 13403 *exists = B_FALSE; 13404 /* Look for a colon in the name. */ 13405 endp = &name[namelen]; 13406 for (cp = endp; --cp > name; ) { 13407 if (*cp == IPIF_SEPARATOR_CHAR) 13408 break; 13409 } 13410 13411 if (*cp == IPIF_SEPARATOR_CHAR) { 13412 /* 13413 * Reject any non-decimal aliases for logical 13414 * interfaces. Aliases with leading zeroes 13415 * are also rejected as they introduce ambiguity 13416 * in the naming of the interfaces. 13417 * In order to conform to existing semantics, 13418 * and to not break any programs/scripts relying 13419 * on that behaviour, if<0>:0 is considered to be 13420 * a valid interface. 13421 * 13422 * If the alias has two or more digits and the first 13423 * is zero, fail. 13424 */ 13425 if (&cp[2] < endp && cp[1] == '0') { 13426 return (NULL); 13427 } 13428 } 13429 13430 if (cp <= name) { 13431 cp = endp; 13432 } else { 13433 *cp = '\0'; 13434 } 13435 13436 /* 13437 * Look up the ILL, based on the portion of the name 13438 * before the colon. ill_lookup_on_name returns a held ill. 13439 * did_alloc is a temporary used to check whether the ill already 13440 * exists; if so, ill_lookup_on_name will clear it. 13441 */ 13442 ill = ill_lookup_on_name(name, do_alloc, isv6, 13443 &did_alloc, ipst); 13444 if (cp != endp) 13445 *cp = IPIF_SEPARATOR_CHAR; 13446 if (ill == NULL) 13447 return (NULL); 13448 13449 /* Establish the unit number in the name. */ 13450 id = 0; 13451 if (cp < endp && *endp == '\0') { 13452 /* If there was a colon, the unit number follows. */ 13453 cp++; 13454 if (ddi_strtol(cp, NULL, 0, &id) != 0) { 13455 ill_refrele(ill); 13456 return (NULL); 13457 } 13458 } 13459 13460 mutex_enter(&ill->ill_lock); 13461 /* Now see if there is an IPIF with this unit number. */ 13462 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 13463 if (ipif->ipif_id == id) { 13464 if (zoneid != ALL_ZONES && 13465 zoneid != ipif->ipif_zoneid && 13466 ipif->ipif_zoneid != ALL_ZONES) { 13467 mutex_exit(&ill->ill_lock); 13468 ill_refrele(ill); 13469 return (NULL); 13470 } 13471 if (IPIF_CAN_LOOKUP(ipif)) { 13472 ipif_refhold_locked(ipif); 13473 mutex_exit(&ill->ill_lock); 13474 if (!did_alloc) 13475 *exists = B_TRUE; 13476 /* 13477 * Drop locks before calling ill_refrele 13478 * since it can potentially call into 13479 * ipif_ill_refrele_tail which can end up 13480 * in trying to acquire any lock. 13481 */ 13482 ill_refrele(ill); 13483 return (ipif); 13484 } 13485 } 13486 } 13487 13488 if (!do_alloc) { 13489 mutex_exit(&ill->ill_lock); 13490 ill_refrele(ill); 13491 return (NULL); 13492 } 13493 13494 /* 13495 * If none found, atomically allocate and return a new one. 13496 * Historically, we used IRE_LOOPBACK only for lun 0, and IRE_LOCAL 13497 * to support "receive only" use of lo0:1 etc. as is still done 13498 * below as an initial guess.
13499 * However, this is now likely to be overridden later in ipif_up_done() 13500 * when we know for sure what address has been configured on the 13501 * interface, since we might have more than one loopback interface 13502 * with a loopback address, e.g. in the case of zones, and all the 13503 * interfaces with loopback addresses need to be marked IRE_LOOPBACK. 13504 */ 13505 if (ill->ill_net_type == IRE_LOOPBACK && id == 0) 13506 ire_type = IRE_LOOPBACK; 13507 else 13508 ire_type = IRE_LOCAL; 13509 ipif = ipif_allocate(ill, id, ire_type, B_TRUE, B_TRUE, NULL); 13510 if (ipif != NULL) 13511 ipif_refhold_locked(ipif); 13512 mutex_exit(&ill->ill_lock); 13513 ill_refrele(ill); 13514 return (ipif); 13515 } 13516 13517 /* 13518 * This routine is called whenever a new address comes up on an ipif. If 13519 * we are configured to respond to address mask requests, then we are supposed 13520 * to broadcast an address mask reply at this time. This routine is also 13521 * called if we are already up, but a netmask change is made. This is legal 13522 * but might not make the system manager very popular. (May be called 13523 * as writer.) 13524 */ 13525 void 13526 ipif_mask_reply(ipif_t *ipif) 13527 { 13528 icmph_t *icmph; 13529 ipha_t *ipha; 13530 mblk_t *mp; 13531 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 13532 ip_xmit_attr_t ixas; 13533 13534 #define REPLY_LEN (sizeof (icmp_ipha) + sizeof (icmph_t) + IP_ADDR_LEN) 13535 13536 if (!ipst->ips_ip_respond_to_address_mask_broadcast) 13537 return; 13538 13539 /* ICMP mask reply is IPv4 only */ 13540 ASSERT(!ipif->ipif_isv6); 13541 /* ICMP mask reply is not for a loopback interface */ 13542 ASSERT(ipif->ipif_ill->ill_wq != NULL); 13543 13544 if (ipif->ipif_lcl_addr == INADDR_ANY) 13545 return; 13546 13547 mp = allocb(REPLY_LEN, BPRI_HI); 13548 if (mp == NULL) 13549 return; 13550 mp->b_wptr = mp->b_rptr + REPLY_LEN; 13551 13552 ipha = (ipha_t *)mp->b_rptr; 13553 bzero(ipha, REPLY_LEN); 13554 *ipha = icmp_ipha; 13555 ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl; 13556 ipha->ipha_src = ipif->ipif_lcl_addr; 13557 ipha->ipha_dst = ipif->ipif_brd_addr; 13558 ipha->ipha_length = htons(REPLY_LEN); 13559 ipha->ipha_ident = 0; 13560 13561 icmph = (icmph_t *)&ipha[1]; 13562 icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY; 13563 bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN); 13564 icmph->icmph_checksum = IP_CSUM(mp, sizeof (ipha_t), 0); 13565 13566 bzero(&ixas, sizeof (ixas)); 13567 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; 13568 ixas.ixa_flags |= IXAF_SET_SOURCE; 13569 ixas.ixa_zoneid = ALL_ZONES; 13570 ixas.ixa_ifindex = 0; 13571 ixas.ixa_ipst = ipst; 13572 ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; 13573 (void) ip_output_simple(mp, &ixas); 13574 ixa_cleanup(&ixas); 13575 #undef REPLY_LEN 13576 } 13577 13578 /* 13579 * Join the ipif-specific multicast groups. 13580 * Must be called after a mapping has been set up in the resolver. (Always 13581 * called as writer.)
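 *
 * As a worked example of the v6 solicited-node computation done below:
 * starting from ff02::1:ff00:0 and OR-ing in the low 32 bits of the local
 * address, fe80::1:2 joins the group ff02::1:ff01:2.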
13582 */ 13583 void 13584 ipif_multicast_up(ipif_t *ipif) 13585 { 13586 int err; 13587 ill_t *ill; 13588 ilm_t *ilm; 13589 13590 ASSERT(IAM_WRITER_IPIF(ipif)); 13591 13592 ill = ipif->ipif_ill; 13593 13594 ip1dbg(("ipif_multicast_up\n")); 13595 if (!(ill->ill_flags & ILLF_MULTICAST) || 13596 ipif->ipif_allhosts_ilm != NULL) 13597 return; 13598 13599 if (ipif->ipif_isv6) { 13600 in6_addr_t v6allmc = ipv6_all_hosts_mcast; 13601 in6_addr_t v6solmc = ipv6_solicited_node_mcast; 13602 13603 v6solmc.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3]; 13604 13605 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) 13606 return; 13607 13608 ip1dbg(("ipif_multicast_up - addmulti\n")); 13609 13610 /* 13611 * Join the all hosts multicast address. We skip this for 13612 * underlying IPMP interfaces since they should be invisible. 13613 */ 13614 if (!IS_UNDER_IPMP(ill)) { 13615 ilm = ip_addmulti(&v6allmc, ill, ipif->ipif_zoneid, 13616 &err); 13617 if (ilm == NULL) { 13618 ASSERT(err != 0); 13619 ip0dbg(("ipif_multicast_up: " 13620 "all_hosts_mcast failed %d\n", err)); 13621 return; 13622 } 13623 ipif->ipif_allhosts_ilm = ilm; 13624 } 13625 13626 /* 13627 * Enable multicast for the solicited node multicast address. 13628 * If IPMP we need to put the membership on the upper ill. 13629 */ 13630 if (!(ipif->ipif_flags & IPIF_NOLOCAL)) { 13631 ill_t *mcast_ill = NULL; 13632 boolean_t need_refrele; 13633 13634 if (IS_UNDER_IPMP(ill) && 13635 (mcast_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) { 13636 need_refrele = B_TRUE; 13637 } else { 13638 mcast_ill = ill; 13639 need_refrele = B_FALSE; 13640 } 13641 13642 ilm = ip_addmulti(&v6solmc, mcast_ill, 13643 ipif->ipif_zoneid, &err); 13644 if (need_refrele) 13645 ill_refrele(mcast_ill); 13646 13647 if (ilm == NULL) { 13648 ASSERT(err != 0); 13649 ip0dbg(("ipif_multicast_up: solicited MC" 13650 " failed %d\n", err)); 13651 if ((ilm = ipif->ipif_allhosts_ilm) != NULL) { 13652 ipif->ipif_allhosts_ilm = NULL; 13653 (void) ip_delmulti(ilm); 13654 } 13655 return; 13656 } 13657 ipif->ipif_solmulti_ilm = ilm; 13658 } 13659 } else { 13660 in6_addr_t v6group; 13661 13662 if (ipif->ipif_lcl_addr == INADDR_ANY || IS_UNDER_IPMP(ill)) 13663 return; 13664 13665 /* Join the all hosts multicast address */ 13666 ip1dbg(("ipif_multicast_up - addmulti\n")); 13667 IN6_IPADDR_TO_V4MAPPED(htonl(INADDR_ALLHOSTS_GROUP), &v6group); 13668 13669 ilm = ip_addmulti(&v6group, ill, ipif->ipif_zoneid, &err); 13670 if (ilm == NULL) { 13671 ASSERT(err != 0); 13672 ip0dbg(("ipif_multicast_up: failed %d\n", err)); 13673 return; 13674 } 13675 ipif->ipif_allhosts_ilm = ilm; 13676 } 13677 } 13678 13679 /* 13680 * Blow away any multicast groups that we joined in ipif_multicast_up(). 13681 * (ilms from explicit memberships are handled in conn_update_ill.) 13682 */ 13683 void 13684 ipif_multicast_down(ipif_t *ipif) 13685 { 13686 ASSERT(IAM_WRITER_IPIF(ipif)); 13687 13688 ip1dbg(("ipif_multicast_down\n")); 13689 13690 if (ipif->ipif_allhosts_ilm != NULL) { 13691 (void) ip_delmulti(ipif->ipif_allhosts_ilm); 13692 ipif->ipif_allhosts_ilm = NULL; 13693 } 13694 if (ipif->ipif_solmulti_ilm != NULL) { 13695 (void) ip_delmulti(ipif->ipif_solmulti_ilm); 13696 ipif->ipif_solmulti_ilm = NULL; 13697 } 13698 } 13699 13700 /* 13701 * Used when an interface comes up to recreate any extra routes on this 13702 * interface. 
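 * (For example, static routes that were added over this interface are
 * remembered as ifrt_t records on ill_saved_ire_mp and re-created here
 * when the interface comes back up.)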
13703 */ 13704 int 13705 ill_recover_saved_ire(ill_t *ill) 13706 { 13707 mblk_t *mp; 13708 ip_stack_t *ipst = ill->ill_ipst; 13709 13710 ip1dbg(("ill_recover_saved_ire(%s)\n", ill->ill_name)); 13711 13712 mutex_enter(&ill->ill_saved_ire_lock); 13713 for (mp = ill->ill_saved_ire_mp; mp != NULL; mp = mp->b_cont) { 13714 ire_t *ire, *nire; 13715 ifrt_t *ifrt; 13716 13717 ifrt = (ifrt_t *)mp->b_rptr; 13718 /* 13719 * Create a copy of the IRE with the saved address and netmask. 13720 */ 13721 if (ill->ill_isv6) { 13722 ire = ire_create_v6( 13723 &ifrt->ifrt_v6addr, 13724 &ifrt->ifrt_v6mask, 13725 &ifrt->ifrt_v6gateway_addr, 13726 ifrt->ifrt_type, 13727 ill, 13728 ifrt->ifrt_zoneid, 13729 ifrt->ifrt_flags, 13730 NULL, 13731 ipst); 13732 } else { 13733 ire = ire_create( 13734 (uint8_t *)&ifrt->ifrt_addr, 13735 (uint8_t *)&ifrt->ifrt_mask, 13736 (uint8_t *)&ifrt->ifrt_gateway_addr, 13737 ifrt->ifrt_type, 13738 ill, 13739 ifrt->ifrt_zoneid, 13740 ifrt->ifrt_flags, 13741 NULL, 13742 ipst); 13743 } 13744 if (ire == NULL) { 13745 mutex_exit(&ill->ill_saved_ire_lock); 13746 return (ENOMEM); 13747 } 13748 13749 if (ifrt->ifrt_flags & RTF_SETSRC) { 13750 if (ill->ill_isv6) { 13751 ire->ire_setsrc_addr_v6 = 13752 ifrt->ifrt_v6setsrc_addr; 13753 } else { 13754 ire->ire_setsrc_addr = ifrt->ifrt_setsrc_addr; 13755 } 13756 } 13757 13758 /* 13759 * Some software (for example, GateD and Sun Cluster) attempts 13760 * to create (what amount to) IRE_PREFIX routes with the 13761 * loopback address as the gateway. This is primarily done to 13762 * set up prefixes with the RTF_REJECT flag set (for example, 13763 * when generating aggregate routes.) 13764 * 13765 * If the IRE type (as defined by ill->ill_net_type) is 13766 * IRE_LOOPBACK, then we map the request into a 13767 * IRE_IF_NORESOLVER. 13768 */ 13769 if (ill->ill_net_type == IRE_LOOPBACK) 13770 ire->ire_type = IRE_IF_NORESOLVER; 13771 13772 /* 13773 * The ire is held by ire_add; it will be refrele'd towards the 13774 * end of ipif_up_done. 13775 */ 13776 nire = ire_add(ire); 13777 /* 13778 * Check if it was a duplicate entry. This handles 13779 * the case of two racing route adds for the same route. 13780 */ 13781 if (nire == NULL) { 13782 ip1dbg(("ill_recover_saved_ire: FAILED\n")); 13783 } else if (nire != ire) { 13784 ip1dbg(("ill_recover_saved_ire: duplicate ire %p\n", 13785 (void *)nire)); 13786 ire_delete(nire); 13787 } else { 13788 ip1dbg(("ill_recover_saved_ire: added ire %p\n", 13789 (void *)nire)); 13790 } 13791 if (nire != NULL) 13792 ire_refrele(nire); 13793 } 13794 mutex_exit(&ill->ill_saved_ire_lock); 13795 return (0); 13796 } 13797 13798 /* 13799 * Used to set the netmask and broadcast address to default values when the 13800 * interface is brought up. (Always called as writer.) 13801 */ 13802 static void 13803 ipif_set_default(ipif_t *ipif) 13804 { 13805 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 13806 13807 if (!ipif->ipif_isv6) { 13808 /* 13809 * Interface holds an IPv4 address. Default 13810 * mask is the natural netmask.
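 * For example, ip_net_mask() yields the classful mask: 10.1.2.3 maps
 * to 255.0.0.0, 172.16.1.2 to 255.255.0.0, and 192.168.1.2 to
 * 255.255.255.0.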
13811 */ 13812 if (!ipif->ipif_net_mask) { 13813 ipaddr_t v4mask; 13814 13815 v4mask = ip_net_mask(ipif->ipif_lcl_addr); 13816 V4MASK_TO_V6(v4mask, ipif->ipif_v6net_mask); 13817 } 13818 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 13819 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 13820 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 13821 } else { 13822 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 13823 ipif->ipif_v6net_mask, ipif->ipif_v6subnet); 13824 } 13825 /* 13826 * NOTE: SunOS 4.X does this even if the broadcast address 13827 * has been already set thus we do the same here. 13828 */ 13829 if (ipif->ipif_flags & IPIF_BROADCAST) { 13830 ipaddr_t v4addr; 13831 13832 v4addr = ipif->ipif_subnet | ~ipif->ipif_net_mask; 13833 IN6_IPADDR_TO_V4MAPPED(v4addr, &ipif->ipif_v6brd_addr); 13834 } 13835 } else { 13836 /* 13837 * Interface holds an IPv6-only address. Default 13838 * mask is all-ones. 13839 */ 13840 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask)) 13841 ipif->ipif_v6net_mask = ipv6_all_ones; 13842 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 13843 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 13844 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 13845 } else { 13846 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 13847 ipif->ipif_v6net_mask, ipif->ipif_v6subnet); 13848 } 13849 } 13850 } 13851 13852 /* 13853 * Return 0 if this address can be used as local address without causing 13854 * duplicate address problems. Otherwise, return EADDRNOTAVAIL if the address 13855 * is already up on a different ill, and EADDRINUSE if it's up on the same ill. 13856 * Note that the same IPv6 link-local address is allowed as long as the ills 13857 * are not on the same link. 13858 */ 13859 int 13860 ip_addr_availability_check(ipif_t *new_ipif) 13861 { 13862 in6_addr_t our_v6addr; 13863 ill_t *ill; 13864 ipif_t *ipif; 13865 ill_walk_context_t ctx; 13866 ip_stack_t *ipst = new_ipif->ipif_ill->ill_ipst; 13867 13868 ASSERT(IAM_WRITER_IPIF(new_ipif)); 13869 ASSERT(MUTEX_HELD(&ipst->ips_ip_addr_avail_lock)); 13870 ASSERT(RW_READ_HELD(&ipst->ips_ill_g_lock)); 13871 13872 new_ipif->ipif_flags &= ~IPIF_UNNUMBERED; 13873 if (IN6_IS_ADDR_UNSPECIFIED(&new_ipif->ipif_v6lcl_addr) || 13874 IN6_IS_ADDR_V4MAPPED_ANY(&new_ipif->ipif_v6lcl_addr)) 13875 return (0); 13876 13877 our_v6addr = new_ipif->ipif_v6lcl_addr; 13878 13879 if (new_ipif->ipif_isv6) 13880 ill = ILL_START_WALK_V6(&ctx, ipst); 13881 else 13882 ill = ILL_START_WALK_V4(&ctx, ipst); 13883 13884 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 13885 for (ipif = ill->ill_ipif; ipif != NULL; 13886 ipif = ipif->ipif_next) { 13887 if ((ipif == new_ipif) || 13888 !(ipif->ipif_flags & IPIF_UP) || 13889 (ipif->ipif_flags & IPIF_UNNUMBERED) || 13890 !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, 13891 &our_v6addr)) 13892 continue; 13893 13894 if (new_ipif->ipif_flags & IPIF_POINTOPOINT) 13895 new_ipif->ipif_flags |= IPIF_UNNUMBERED; 13896 else if (ipif->ipif_flags & IPIF_POINTOPOINT) 13897 ipif->ipif_flags |= IPIF_UNNUMBERED; 13898 else if ((IN6_IS_ADDR_LINKLOCAL(&our_v6addr) || 13899 IN6_IS_ADDR_SITELOCAL(&our_v6addr)) && 13900 !IS_ON_SAME_LAN(ill, new_ipif->ipif_ill)) 13901 continue; 13902 else if (new_ipif->ipif_zoneid != ipif->ipif_zoneid && 13903 ipif->ipif_zoneid != ALL_ZONES && IS_LOOPBACK(ill)) 13904 continue; 13905 else if (new_ipif->ipif_ill == ill) 13906 return (EADDRINUSE); 13907 else 13908 return (EADDRNOTAVAIL); 13909 } 13910 } 13911 13912 return (0); 13913 } 13914 13915 /* 13916 * Bring up an ipif: bring up arp/ndp, bring up the DLPI stream, and add 13917 * IREs for 
the ipif. 13918 * When the routine returns EINPROGRESS then mp has been consumed and 13919 * the ioctl will be acked from ip_rput_dlpi. 13920 */ 13921 int 13922 ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) 13923 { 13924 ill_t *ill = ipif->ipif_ill; 13925 boolean_t isv6 = ipif->ipif_isv6; 13926 int err = 0; 13927 boolean_t success; 13928 uint_t ipif_orig_id; 13929 ip_stack_t *ipst = ill->ill_ipst; 13930 13931 ASSERT(IAM_WRITER_IPIF(ipif)); 13932 13933 ip1dbg(("ipif_up(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 13934 DTRACE_PROBE3(ipif__downup, char *, "ipif_up", 13935 ill_t *, ill, ipif_t *, ipif); 13936 13937 /* Shouldn't get here if it is already up. */ 13938 if (ipif->ipif_flags & IPIF_UP) 13939 return (EALREADY); 13940 13941 /* 13942 * If this is a request to bring up a data address on an interface 13943 * under IPMP, then move the address to its IPMP meta-interface and 13944 * try to bring it up. One complication is that the zeroth ipif for 13945 * an ill is special, in that every ill always has one, and that code 13946 * throughout IP dereferences ill->ill_ipif without holding any locks. 13947 */ 13948 if (IS_UNDER_IPMP(ill) && ipmp_ipif_is_dataaddr(ipif) && 13949 (!ipif->ipif_isv6 || !V6_IPIF_LINKLOCAL(ipif))) { 13950 ipif_t *stubipif = NULL, *moveipif = NULL; 13951 ill_t *ipmp_ill = ipmp_illgrp_ipmp_ill(ill->ill_grp); 13952 13953 /* 13954 * The ipif being brought up should be quiesced. If it's not, 13955 * something has gone amiss and we need to bail out. (If it's 13956 * quiesced, we know it will remain so via IPIF_CONDEMNED.) 13957 */ 13958 mutex_enter(&ill->ill_lock); 13959 if (!ipif_is_quiescent(ipif)) { 13960 mutex_exit(&ill->ill_lock); 13961 return (EINVAL); 13962 } 13963 mutex_exit(&ill->ill_lock); 13964 13965 /* 13966 * If we're going to need to allocate ipifs, do it prior 13967 * to starting the move (and grabbing locks). 13968 */ 13969 if (ipif->ipif_id == 0) { 13970 if ((moveipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE, 13971 B_FALSE, &err)) == NULL) { 13972 return (err); 13973 } 13974 if ((stubipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE, 13975 B_FALSE, &err)) == NULL) { 13976 mi_free(moveipif); 13977 return (err); 13978 } 13979 } 13980 13981 /* 13982 * Grab or transfer the ipif to move. During the move, keep 13983 * ill_g_lock held to prevent any ill walker threads from 13984 * seeing things in an inconsistent state. 13985 */ 13986 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 13987 if (ipif->ipif_id != 0) { 13988 ipif_remove(ipif); 13989 } else { 13990 ipif_transfer(ipif, moveipif, stubipif); 13991 ipif = moveipif; 13992 } 13993 13994 /* 13995 * Place the ipif on the IPMP ill. If the zeroth ipif on 13996 * the IPMP ill is a stub (0.0.0.0 down address) then we 13997 * replace that one. Otherwise, pick the next available slot. 13998 */ 13999 ipif->ipif_ill = ipmp_ill; 14000 ipif_orig_id = ipif->ipif_id; 14001 14002 if (ipmp_ipif_is_stubaddr(ipmp_ill->ill_ipif)) { 14003 ipif_transfer(ipif, ipmp_ill->ill_ipif, NULL); 14004 ipif = ipmp_ill->ill_ipif; 14005 } else { 14006 ipif->ipif_id = -1; 14007 if ((err = ipif_insert(ipif, B_FALSE)) != 0) { 14008 /* 14009 * No more available ipif_id's -- put it back 14010 * on the original ill and fail the operation. 14011 * Since we're writer on the ill, we can be 14012 * sure our old slot is still available.
14013 */ 14014 ipif->ipif_id = ipif_orig_id; 14015 ipif->ipif_ill = ill; 14016 if (ipif_orig_id == 0) { 14017 ipif_transfer(ipif, ill->ill_ipif, 14018 NULL); 14019 } else { 14020 VERIFY(ipif_insert(ipif, B_FALSE) == 0); 14021 } 14022 rw_exit(&ipst->ips_ill_g_lock); 14023 return (err); 14024 } 14025 } 14026 rw_exit(&ipst->ips_ill_g_lock); 14027 14028 /* 14029 * Tell SCTP that the ipif has moved. Note that even if we 14030 * had to allocate a new ipif, the original sequence id was 14031 * preserved and therefore SCTP won't know. 14032 */ 14033 sctp_move_ipif(ipif, ill, ipmp_ill); 14034 14035 /* 14036 * If the ipif being brought up was on slot zero, then we 14037 * first need to bring up the placeholder we stuck there. In 14038 * ip_rput_dlpi_writer(), arp_bringup_done(), or the recursive 14039 * call to ipif_up() itself, if we successfully bring up the 14040 * placeholder, we'll check ill_move_ipif and bring it up too. 14041 */ 14042 if (ipif_orig_id == 0) { 14043 ASSERT(ill->ill_move_ipif == NULL); 14044 ill->ill_move_ipif = ipif; 14045 if ((err = ipif_up(ill->ill_ipif, q, mp)) == 0) 14046 ASSERT(ill->ill_move_ipif == NULL); 14047 if (err != EINPROGRESS) 14048 ill->ill_move_ipif = NULL; 14049 return (err); 14050 } 14051 14052 /* 14053 * Bring it up on the IPMP ill. 14054 */ 14055 return (ipif_up(ipif, q, mp)); 14056 } 14057 14058 /* Skip arp/ndp for any loopback interface. */ 14059 if (ill->ill_wq != NULL) { 14060 conn_t *connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL; 14061 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 14062 14063 if (!ill->ill_dl_up) { 14064 /* 14065 * ill_dl_up is not yet set, i.e., we have yet to 14066 * DL_BIND with the driver and this is the first 14067 * logical interface on the ill to become "up". 14068 * Tell the driver to get going (via DL_BIND_REQ). 14069 * Note that changing "significant" IFF_ flags or the 14070 * address/netmask etc. causes a down/up dance, but 14071 * does not cause an unbind (DL_UNBIND) with the driver. 14072 */ 14073 return (ill_dl_up(ill, ipif, mp, q)); 14074 } 14075 14076 /* 14077 * ipif_resolver_up may end up needing to bind/attach 14078 * the ARP stream, which in turn necessitates a 14079 * DLPI message exchange with the driver. ioctls are 14080 * serialized and so we cannot send more than one 14081 * interface up message at a time. If ipif_resolver_up 14082 * does need to wait for the DLPI handshake for the ARP stream, 14083 * we get EINPROGRESS and we will complete in arp_bringup_done. 14084 */ 14085 14086 ASSERT(connp != NULL || !CONN_Q(q)); 14087 if (connp != NULL) 14088 mutex_enter(&connp->conn_lock); 14089 mutex_enter(&ill->ill_lock); 14090 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0); 14091 mutex_exit(&ill->ill_lock); 14092 if (connp != NULL) 14093 mutex_exit(&connp->conn_lock); 14094 if (!success) 14095 return (EINTR); 14096 14097 /* 14098 * Crank up IPv6 neighbor discovery. Unlike ARP, this should 14099 * complete when ipif_ndp_up returns. 14100 */ 14101 err = ipif_resolver_up(ipif, Res_act_initial); 14102 if (err == EINPROGRESS) { 14103 /* We will complete it in arp_bringup_done() */ 14104 return (err); 14105 } 14106 14107 if (isv6 && err == 0) 14108 err = ipif_ndp_up(ipif, B_TRUE); 14109 14110 ASSERT(err != EINPROGRESS); 14111 mp = ipsq_pending_mp_get(ipsq, &connp); 14112 ASSERT(mp != NULL); 14113 if (err != 0) 14114 return (err); 14115 } else { 14116 /* 14117 * Interfaces without underlying hardware don't do duplicate 14118 * address detection.
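 * (This is the loopback/no-hardware case: ill_wq is NULL here, so the
 * address is marked ready immediately and ill_add_ires() is called
 * directly below.)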
14119 */ 14120 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); 14121 ipif->ipif_addr_ready = 1; 14122 err = ill_add_ires(ill); 14123 /* allocation failure? */ 14124 if (err != 0) 14125 return (err); 14126 } 14127 14128 err = (isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif)); 14129 if (err == 0 && ill->ill_move_ipif != NULL) { 14130 ipif = ill->ill_move_ipif; 14131 ill->ill_move_ipif = NULL; 14132 return (ipif_up(ipif, q, mp)); 14133 } 14134 return (err); 14135 } 14136 14137 /* 14138 * Add any IREs tied to the ill. For now this is just an IRE_MULTICAST. 14139 * The identical set of IREs need to be removed in ill_delete_ires(). 14140 */ 14141 int 14142 ill_add_ires(ill_t *ill) 14143 { 14144 ire_t *ire; 14145 in6_addr_t dummy6 = {(uint32_t)V6_MCAST, 0, 0, 1}; 14146 in_addr_t dummy4 = htonl(INADDR_ALLHOSTS_GROUP); 14147 14148 if (ill->ill_ire_multicast != NULL) 14149 return (0); 14150 14151 /* 14152 * provide some dummy ire_addr for creating the ire. 14153 */ 14154 if (ill->ill_isv6) { 14155 ire = ire_create_v6(&dummy6, 0, 0, IRE_MULTICAST, ill, 14156 ALL_ZONES, RTF_UP, NULL, ill->ill_ipst); 14157 } else { 14158 ire = ire_create((uchar_t *)&dummy4, 0, 0, IRE_MULTICAST, ill, 14159 ALL_ZONES, RTF_UP, NULL, ill->ill_ipst); 14160 } 14161 if (ire == NULL) 14162 return (ENOMEM); 14163 14164 ill->ill_ire_multicast = ire; 14165 return (0); 14166 } 14167 14168 void 14169 ill_delete_ires(ill_t *ill) 14170 { 14171 if (ill->ill_ire_multicast != NULL) { 14172 /* 14173 * BIND/ATTACH completed; Release the ref for ill_ire_multicast 14174 * which was taken without any th_tracing enabled. 14175 * We also mark it as condemned (note that it was never added) 14176 * so that caching conn's can move off of it. 14177 */ 14178 ire_make_condemned(ill->ill_ire_multicast); 14179 ire_refrele_notr(ill->ill_ire_multicast); 14180 ill->ill_ire_multicast = NULL; 14181 } 14182 } 14183 14184 /* 14185 * Perform a bind for the physical device. 14186 * When the routine returns EINPROGRESS then mp has been consumed and 14187 * the ioctl will be acked from ip_rput_dlpi. 14188 * Allocate an unbind message and save it until ipif_down. 14189 */ 14190 static int 14191 ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) 14192 { 14193 mblk_t *bind_mp = NULL; 14194 mblk_t *unbind_mp = NULL; 14195 conn_t *connp; 14196 boolean_t success; 14197 int err; 14198 14199 DTRACE_PROBE2(ill__downup, char *, "ill_dl_up", ill_t *, ill); 14200 14201 ip1dbg(("ill_dl_up(%s)\n", ill->ill_name)); 14202 ASSERT(IAM_WRITER_ILL(ill)); 14203 ASSERT(mp != NULL); 14204 14205 /* 14206 * Make sure we have an IRE_MULTICAST in case we immediately 14207 * start receiving packets. 14208 */ 14209 err = ill_add_ires(ill); 14210 if (err != 0) 14211 goto bad; 14212 14213 bind_mp = ip_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long), 14214 DL_BIND_REQ); 14215 if (bind_mp == NULL) 14216 goto bad; 14217 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ill->ill_sap; 14218 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS; 14219 14220 unbind_mp = ip_dlpi_alloc(sizeof (dl_unbind_req_t), DL_UNBIND_REQ); 14221 if (unbind_mp == NULL) 14222 goto bad; 14223 14224 /* 14225 * Record state needed to complete this operation when the 14226 * DL_BIND_ACK shows up. Also remember the pre-allocated mblks. 14227 */ 14228 connp = CONN_Q(q) ? 
Q_TO_CONN(q) : NULL; 14229 ASSERT(connp != NULL || !CONN_Q(q)); 14230 GRAB_CONN_LOCK(q); 14231 mutex_enter(&ipif->ipif_ill->ill_lock); 14232 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0); 14233 mutex_exit(&ipif->ipif_ill->ill_lock); 14234 RELEASE_CONN_LOCK(q); 14235 if (!success) 14236 goto bad; 14237 14238 /* 14239 * Save the unbind message for ill_dl_down(); it will be consumed when 14240 * the interface goes down. 14241 */ 14242 ASSERT(ill->ill_unbind_mp == NULL); 14243 ill->ill_unbind_mp = unbind_mp; 14244 14245 ill_dlpi_send(ill, bind_mp); 14246 /* Send down link-layer capabilities probe if not already done. */ 14247 ill_capability_probe(ill); 14248 14249 /* 14250 * Sysid used to rely on the fact that netboots set domainname 14251 * and the like. Now that miniroot boots aren't strictly netboots 14252 * and miniroot network configuration is driven from userland, 14253 * these things still need to be set. This situation can be detected 14254 * by comparing the interface being configured here to the one 14255 * dhcifname was set to reference by the boot loader. Once sysid is 14256 * converted to use dhcp_ipc_getinfo() this call can go away. 14257 */ 14258 if ((ipif->ipif_flags & IPIF_DHCPRUNNING) && 14259 (strcmp(ill->ill_name, dhcifname) == 0) && 14260 (strlen(srpc_domain) == 0)) { 14261 if (dhcpinit() != 0) 14262 cmn_err(CE_WARN, "no cached dhcp response"); 14263 } 14264 14265 /* 14266 * This operation will complete in ip_rput_dlpi with either 14267 * a DL_BIND_ACK or DL_ERROR_ACK. 14268 */ 14269 return (EINPROGRESS); 14270 bad: 14271 ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name)); 14272 14273 freemsg(bind_mp); 14274 freemsg(unbind_mp); 14275 return (ENOMEM); 14276 } 14277 14278 /* Add room for tcp+ip headers */ 14279 uint_t ip_loopback_mtuplus = IP_LOOPBACK_MTU + IP_SIMPLE_HDR_LENGTH + 20; 14280 14281 /* 14282 * DLPI and ARP are up. 14283 * Create all the IREs associated with an interface. Bring up multicast. 14284 * Set the interface flag and finish other initialization 14285 * that potentially had to be deferred to after DL_BIND_ACK. 14286 */ 14287 int 14288 ipif_up_done(ipif_t *ipif) 14289 { 14290 ill_t *ill = ipif->ipif_ill; 14291 int err = 0; 14292 boolean_t loopback = B_FALSE; 14293 boolean_t update_src_selection = B_TRUE; 14294 ipif_t *tmp_ipif; 14295 14296 ip1dbg(("ipif_up_done(%s:%u)\n", 14297 ipif->ipif_ill->ill_name, ipif->ipif_id)); 14298 DTRACE_PROBE3(ipif__downup, char *, "ipif_up_done", 14299 ill_t *, ill, ipif_t *, ipif); 14300 14301 /* Check if this is a loopback interface */ 14302 if (ipif->ipif_ill->ill_wq == NULL) 14303 loopback = B_TRUE; 14304 14305 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 14306 14307 /* 14308 * If all other interfaces for this ill are down or DEPRECATED, 14309 * or otherwise unsuitable for source address selection, 14310 * reset the src generation numbers to make sure source 14311 * address selection gets to take this new ipif into account.
14312 * No need to hold ill_lock while traversing the ipif list since 14313 * we are writer. 14314 */ 14315 for (tmp_ipif = ill->ill_ipif; tmp_ipif; 14316 tmp_ipif = tmp_ipif->ipif_next) { 14317 if (((tmp_ipif->ipif_flags & 14318 (IPIF_NOXMIT|IPIF_ANYCAST|IPIF_NOLOCAL|IPIF_DEPRECATED)) || 14319 !(tmp_ipif->ipif_flags & IPIF_UP)) || 14320 (tmp_ipif == ipif)) 14321 continue; 14322 /* first usable pre-existing interface */ 14323 update_src_selection = B_FALSE; 14324 break; 14325 } 14326 if (update_src_selection) 14327 ip_update_source_selection(ill->ill_ipst); 14328 14329 if (IS_LOOPBACK(ill) || ill->ill_net_type == IRE_IF_NORESOLVER) { 14330 nce_t *loop_nce = NULL; 14331 uint16_t flags = (NCE_F_MYADDR | NCE_F_AUTHORITY | NCE_F_NONUD); 14332 14333 /* 14334 * lo0:1 and subsequent ipifs were marked IRE_LOCAL in 14335 * ipif_lookup_on_name(), but in the case of zones we can have 14336 * several loopback addresses on lo0. So all the interfaces with 14337 * loopback addresses need to be marked IRE_LOOPBACK. 14338 */ 14339 if (V4_PART_OF_V6(ipif->ipif_v6lcl_addr) == 14340 htonl(INADDR_LOOPBACK)) 14341 ipif->ipif_ire_type = IRE_LOOPBACK; 14342 else 14343 ipif->ipif_ire_type = IRE_LOCAL; 14344 if (ill->ill_net_type != IRE_LOOPBACK) 14345 flags |= NCE_F_PUBLISH; 14346 14347 /* add unicast nce for the local addr */ 14348 err = nce_lookup_then_add_v4(ill, NULL, 14349 ill->ill_phys_addr_length, &ipif->ipif_lcl_addr, flags, 14350 ND_REACHABLE, &loop_nce); 14351 /* A shared-IP zone sees EEXIST for lo0:N */ 14352 if (err == 0 || err == EEXIST) { 14353 ipif->ipif_added_nce = 1; 14354 loop_nce->nce_ipif_cnt++; 14355 nce_refrele(loop_nce); 14356 err = 0; 14357 } else { 14358 ASSERT(loop_nce == NULL); 14359 return (err); 14360 } 14361 } 14362 14363 /* Create all the IREs associated with this interface */ 14364 err = ipif_add_ires_v4(ipif, loopback); 14365 if (err != 0) { 14366 /* 14367 * See comments about return value from 14368 * ip_addr_availability_check() in ipif_add_ires_v4(). 14369 */ 14370 if (err != EADDRINUSE) { 14371 (void) ipif_arp_down(ipif); 14372 } else { 14373 /* 14374 * Make IPMP aware of the deleted ipif so that 14375 * the needed ipmp cleanup (e.g., of ipif_bound_ill) 14376 * can be completed. Note that we do not want to 14377 * destroy the nce that was created on the ipmp_ill 14378 * for the active copy of the duplicate address in 14379 * use. 14380 */ 14381 if (IS_IPMP(ill)) 14382 ipmp_illgrp_del_ipif(ill->ill_grp, ipif); 14383 err = EADDRNOTAVAIL; 14384 } 14385 return (err); 14386 } 14387 14388 if (ill->ill_ipif_up_count == 1 && !loopback) { 14389 /* Recover any additional IRE entries for this ill */ 14390 (void) ill_recover_saved_ire(ill); 14391 } 14392 14393 if (ill->ill_need_recover_multicast) { 14394 /* 14395 * Need to recover all multicast memberships in the driver. 14396 * This had to be deferred until we had attached. The same 14397 * code exists in ipif_up_done_v6() to recover IPv6 14398 * memberships. 14399 * 14400 * Note that it would be preferable to unconditionally do the 14401 * ill_recover_multicast() in ill_dl_up(), but we cannot do 14402 * that since ill_join_allmulti() depends on ill_dl_up being 14403 * set, and it is not set until we receive a DL_BIND_ACK after 14404 * having called ill_dl_up(). 14405 */ 14406 ill_recover_multicast(ill); 14407 } 14408 14409 if (ill->ill_ipif_up_count == 1) { 14410 /* 14411 * Since the interface is now up, it may now be active.
14412 */ 14413 if (IS_UNDER_IPMP(ill)) 14414 ipmp_ill_refresh_active(ill); 14415 14416 /* 14417 * If this is an IPMP interface, we may now be able to 14418 * establish ARP entries. 14419 */ 14420 if (IS_IPMP(ill)) 14421 ipmp_illgrp_refresh_arpent(ill->ill_grp); 14422 } 14423 14424 /* Join the allhosts multicast address */ 14425 ipif_multicast_up(ipif); 14426 14427 if (!loopback && !update_src_selection && 14428 !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) 14429 ip_update_source_selection(ill->ill_ipst); 14430 14431 if (!loopback && ipif->ipif_addr_ready) { 14432 /* Broadcast an address mask reply. */ 14433 ipif_mask_reply(ipif); 14434 } 14435 /* Perhaps ilgs should use this ill */ 14436 update_conn_ill(NULL, ill->ill_ipst); 14437 14438 /* 14439 * This had to be deferred until we had bound. Tell routing sockets and 14440 * others that this interface is up if it looks like the address has 14441 * been validated. Otherwise, if it isn't ready yet, wait for 14442 * duplicate address detection to do its thing. 14443 */ 14444 if (ipif->ipif_addr_ready) 14445 ipif_up_notify(ipif); 14446 return (0); 14447 } 14448 14449 /* 14450 * Add the IREs associated with the ipif. 14451 * Those MUST be explicitly removed in ipif_delete_ires_v4. 14452 */ 14453 static int 14454 ipif_add_ires_v4(ipif_t *ipif, boolean_t loopback) 14455 { 14456 ill_t *ill = ipif->ipif_ill; 14457 ip_stack_t *ipst = ill->ill_ipst; 14458 ire_t *ire_array[20]; 14459 ire_t **irep = ire_array; 14460 ire_t **irep1; 14461 ipaddr_t net_mask = 0; 14462 ipaddr_t subnet_mask, route_mask; 14463 int err; 14464 ire_t *ire_local = NULL; /* LOCAL or LOOPBACK */ 14465 ire_t *ire_if = NULL; 14466 uchar_t *gw; 14467 14468 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 14469 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 14470 /* 14471 * If we're on a labeled system then make sure that zone- 14472 * private addresses have proper remote host database entries. 14473 */ 14474 if (is_system_labeled() && 14475 ipif->ipif_ire_type != IRE_LOOPBACK && 14476 !tsol_check_interface_address(ipif)) 14477 return (EINVAL); 14478 14479 /* Register the source address for __sin6_src_id */ 14480 err = ip_srcid_insert(&ipif->ipif_v6lcl_addr, 14481 ipif->ipif_zoneid, ipst); 14482 if (err != 0) { 14483 ip0dbg(("ipif_add_ires: srcid_insert %d\n", err)); 14484 return (err); 14485 } 14486 14487 if (loopback) 14488 gw = (uchar_t *)&ipif->ipif_lcl_addr; 14489 else 14490 gw = NULL; 14491 14492 /* If the interface address is set, create the local IRE. */ 14493 ire_local = ire_create( 14494 (uchar_t *)&ipif->ipif_lcl_addr, /* dest address */ 14495 (uchar_t *)&ip_g_all_ones, /* mask */ 14496 gw, /* gateway */ 14497 ipif->ipif_ire_type, /* LOCAL or LOOPBACK */ 14498 ipif->ipif_ill, 14499 ipif->ipif_zoneid, 14500 ((ipif->ipif_flags & IPIF_PRIVATE) ? 
14501 RTF_PRIVATE : 0) | RTF_KERNEL, 14502 NULL, 14503 ipst); 14504 ip1dbg(("ipif_add_ires: 0x%p creating IRE %p type 0x%x" 14505 " for 0x%x\n", (void *)ipif, (void *)ire_local, 14506 ipif->ipif_ire_type, 14507 ntohl(ipif->ipif_lcl_addr))); 14508 if (ire_local == NULL) { 14509 ip1dbg(("ipif_up_done: NULL ire_local\n")); 14510 err = ENOMEM; 14511 goto bad; 14512 } 14513 } else { 14514 ip1dbg(( 14515 "ipif_add_ires: not creating IRE %d for 0x%x: flags 0x%x\n", 14516 ipif->ipif_ire_type, 14517 ntohl(ipif->ipif_lcl_addr), 14518 (uint_t)ipif->ipif_flags)); 14519 } 14520 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 14521 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 14522 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 14523 } else { 14524 net_mask = htonl(IN_CLASSA_NET); /* fallback */ 14525 } 14526 14527 subnet_mask = ipif->ipif_net_mask; 14528 14529 /* 14530 * If mask was not specified, use natural netmask of 14531 * interface address. Also, store this mask back into the 14532 * ipif struct. 14533 */ 14534 if (subnet_mask == 0) { 14535 subnet_mask = net_mask; 14536 V4MASK_TO_V6(subnet_mask, ipif->ipif_v6net_mask); 14537 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 14538 ipif->ipif_v6subnet); 14539 } 14540 14541 /* Set up the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. */ 14542 if (!loopback && !(ipif->ipif_flags & IPIF_NOXMIT) && 14543 ipif->ipif_subnet != INADDR_ANY) { 14544 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 14545 14546 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 14547 route_mask = IP_HOST_MASK; 14548 } else { 14549 route_mask = subnet_mask; 14550 } 14551 14552 ip1dbg(("ipif_add_ires: ipif 0x%p ill 0x%p " 14553 "creating if IRE ill_net_type 0x%x for 0x%x\n", 14554 (void *)ipif, (void *)ill, ill->ill_net_type, 14555 ntohl(ipif->ipif_subnet))); 14556 ire_if = ire_create( 14557 (uchar_t *)&ipif->ipif_subnet, 14558 (uchar_t *)&route_mask, 14559 (uchar_t *)&ipif->ipif_lcl_addr, 14560 ill->ill_net_type, 14561 ill, 14562 ipif->ipif_zoneid, 14563 ((ipif->ipif_flags & IPIF_PRIVATE) ? 14564 RTF_PRIVATE: 0) | RTF_KERNEL, 14565 NULL, 14566 ipst); 14567 if (ire_if == NULL) { 14568 ip1dbg(("ipif_up_done: NULL ire_if\n")); 14569 err = ENOMEM; 14570 goto bad; 14571 } 14572 } 14573 14574 /* 14575 * Create any necessary broadcast IREs. 14576 */ 14577 if ((ipif->ipif_flags & IPIF_BROADCAST) && 14578 !(ipif->ipif_flags & IPIF_NOXMIT)) 14579 irep = ipif_create_bcast_ires(ipif, irep); 14580 14581 /* If an earlier ire_create failed, get out now */ 14582 for (irep1 = irep; irep1 > ire_array; ) { 14583 irep1--; 14584 if (*irep1 == NULL) { 14585 ip1dbg(("ipif_up_done: NULL ire found in ire_array\n")); 14586 err = ENOMEM; 14587 goto bad; 14588 } 14589 } 14590 14591 /* 14592 * Need to atomically check for IP address availability under 14593 * ip_addr_avail_lock. ill_g_lock is held as reader to ensure no new 14594 * ills or new ipifs can be added while we are checking availability. 14595 */ 14596 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 14597 mutex_enter(&ipst->ips_ip_addr_avail_lock); 14598 /* Mark it up, and increment counters. */ 14599 ipif->ipif_flags |= IPIF_UP; 14600 ill->ill_ipif_up_count++; 14601 err = ip_addr_availability_check(ipif); 14602 mutex_exit(&ipst->ips_ip_addr_avail_lock); 14603 rw_exit(&ipst->ips_ill_g_lock); 14604 14605 if (err != 0) { 14606 /* 14607 * Our address may already be up on the same ill. In this case, 14608 * the ARP entry for our ipif replaced the one for the other 14609 * ipif. 
So we don't want to delete it (otherwise the other ipif 14610 * would be unable to send packets). 14611 * ip_addr_availability_check() identifies this case for us and 14612 * returns EADDRINUSE; Caller should turn it into EADDRNOTAVAIL 14613 * which is the expected error code. 14614 */ 14615 ill->ill_ipif_up_count--; 14616 ipif->ipif_flags &= ~IPIF_UP; 14617 goto bad; 14618 } 14619 14620 /* 14621 * Add in all newly created IREs. ire_create_bcast() has 14622 * already checked for duplicates of the IRE_BROADCAST type. 14623 * We add the IRE_INTERFACE before the IRE_LOCAL to ensure 14624 * that lookups find the IRE_LOCAL even if the IRE_INTERFACE is 14625 * a /32 route. 14626 */ 14627 if (ire_if != NULL) { 14628 ire_if = ire_add(ire_if); 14629 if (ire_if == NULL) { 14630 err = ENOMEM; 14631 goto bad2; 14632 } 14633 #ifdef DEBUG 14634 ire_refhold_notr(ire_if); 14635 ire_refrele(ire_if); 14636 #endif 14637 } 14638 if (ire_local != NULL) { 14639 ire_local = ire_add(ire_local); 14640 if (ire_local == NULL) { 14641 err = ENOMEM; 14642 goto bad2; 14643 } 14644 #ifdef DEBUG 14645 ire_refhold_notr(ire_local); 14646 ire_refrele(ire_local); 14647 #endif 14648 } 14649 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 14650 if (ire_local != NULL) 14651 ipif->ipif_ire_local = ire_local; 14652 if (ire_if != NULL) 14653 ipif->ipif_ire_if = ire_if; 14654 rw_exit(&ipst->ips_ill_g_lock); 14655 ire_local = NULL; 14656 ire_if = NULL; 14657 14658 /* 14659 * We first add all of them, and if that succeeds we refrele the 14660 * bunch. That enables us to delete all of them should any of the 14661 * ire_adds fail. 14662 */ 14663 for (irep1 = irep; irep1 > ire_array; ) { 14664 irep1--; 14665 ASSERT(!MUTEX_HELD(&((*irep1)->ire_ill->ill_lock))); 14666 *irep1 = ire_add(*irep1); 14667 if (*irep1 == NULL) { 14668 err = ENOMEM; 14669 goto bad2; 14670 } 14671 } 14672 14673 for (irep1 = irep; irep1 > ire_array; ) { 14674 irep1--; 14675 /* refheld by ire_add. */ 14676 if (*irep1 != NULL) { 14677 ire_refrele(*irep1); 14678 *irep1 = NULL; 14679 } 14680 } 14681 14682 if (!loopback) { 14683 /* 14684 * If the broadcast address has been set, make sure it makes 14685 * sense based on the interface address. 14686 * Only match on ill since we are sharing broadcast addresses. 14687 */ 14688 if ((ipif->ipif_brd_addr != INADDR_ANY) && 14689 (ipif->ipif_flags & IPIF_BROADCAST)) { 14690 ire_t *ire; 14691 14692 ire = ire_ftable_lookup_v4(ipif->ipif_brd_addr, 0, 0, 14693 IRE_BROADCAST, ipif->ipif_ill, ALL_ZONES, NULL, 14694 (MATCH_IRE_TYPE | MATCH_IRE_ILL), 0, ipst, NULL); 14695 14696 if (ire == NULL) { 14697 /* 14698 * If there isn't a matching broadcast IRE, 14699 * revert to the default for this netmask. 
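 * (ipif_set_default() recomputes it as ipif_subnet | ~ipif_net_mask,
 * so e.g. a 192.168.1.0/24 subnet gets broadcast address
 * 192.168.1.255.)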
14700 */ 14701 ipif->ipif_v6brd_addr = ipv6_all_zeros; 14702 mutex_enter(&ipif->ipif_ill->ill_lock); 14703 ipif_set_default(ipif); 14704 mutex_exit(&ipif->ipif_ill->ill_lock); 14705 } else { 14706 ire_refrele(ire); 14707 } 14708 } 14709 14710 } 14711 return (0); 14712 14713 bad2: 14714 ill->ill_ipif_up_count--; 14715 ipif->ipif_flags &= ~IPIF_UP; 14716 14717 bad: 14718 ip1dbg(("ipif_add_ires: FAILED \n")); 14719 if (ire_local != NULL) 14720 ire_delete(ire_local); 14721 if (ire_if != NULL) 14722 ire_delete(ire_if); 14723 14724 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 14725 ire_local = ipif->ipif_ire_local; 14726 ipif->ipif_ire_local = NULL; 14727 ire_if = ipif->ipif_ire_if; 14728 ipif->ipif_ire_if = NULL; 14729 rw_exit(&ipst->ips_ill_g_lock); 14730 if (ire_local != NULL) { 14731 ire_delete(ire_local); 14732 ire_refrele_notr(ire_local); 14733 } 14734 if (ire_if != NULL) { 14735 ire_delete(ire_if); 14736 ire_refrele_notr(ire_if); 14737 } 14738 14739 while (irep > ire_array) { 14740 irep--; 14741 if (*irep != NULL) { 14742 ire_delete(*irep); 14743 } 14744 } 14745 (void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst); 14746 14747 return (err); 14748 } 14749 14750 /* Remove all the IREs created by ipif_add_ires_v4 */ 14751 void 14752 ipif_delete_ires_v4(ipif_t *ipif) 14753 { 14754 ill_t *ill = ipif->ipif_ill; 14755 ip_stack_t *ipst = ill->ill_ipst; 14756 ire_t *ire; 14757 14758 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 14759 ire = ipif->ipif_ire_local; 14760 ipif->ipif_ire_local = NULL; 14761 rw_exit(&ipst->ips_ill_g_lock); 14762 if (ire != NULL) { 14763 /* 14764 * Move count to ipif so we don't lose the count due to 14765 * a down/up dance. 14766 */ 14767 atomic_add_32(&ipif->ipif_ib_pkt_count, ire->ire_ib_pkt_count); 14768 14769 ire_delete(ire); 14770 ire_refrele_notr(ire); 14771 } 14772 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 14773 ire = ipif->ipif_ire_if; 14774 ipif->ipif_ire_if = NULL; 14775 rw_exit(&ipst->ips_ill_g_lock); 14776 if (ire != NULL) { 14777 ire_delete(ire); 14778 ire_refrele_notr(ire); 14779 } 14780 14781 /* 14782 * Delete the broadcast IREs. 14783 */ 14784 if ((ipif->ipif_flags & IPIF_BROADCAST) && 14785 !(ipif->ipif_flags & IPIF_NOXMIT)) 14786 ipif_delete_bcast_ires(ipif); 14787 } 14788 14789 /* 14790 * Checks for availability of a usable source address (if there is one) when the 14791 * destination ILL has the ill_usesrc_ifindex pointing to another ILL. Note 14792 * this selection is done regardless of the destination. 14793 */ 14794 boolean_t 14795 ipif_zone_avail(uint_t ifindex, boolean_t isv6, zoneid_t zoneid, 14796 ip_stack_t *ipst) 14797 { 14798 ipif_t *ipif = NULL; 14799 ill_t *uill; 14800 14801 ASSERT(ifindex != 0); 14802 14803 uill = ill_lookup_on_ifindex(ifindex, isv6, ipst); 14804 if (uill == NULL) 14805 return (B_FALSE); 14806 14807 mutex_enter(&uill->ill_lock); 14808 for (ipif = uill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 14809 if (IPIF_IS_CONDEMNED(ipif)) 14810 continue; 14811 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 14812 continue; 14813 if (!(ipif->ipif_flags & IPIF_UP)) 14814 continue; 14815 if (ipif->ipif_zoneid != zoneid) 14816 continue; 14817 if (isv6 ?
IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) : 14818 ipif->ipif_lcl_addr == INADDR_ANY) 14819 continue; 14820 mutex_exit(&uill->ill_lock); 14821 ill_refrele(uill); 14822 return (B_TRUE); 14823 } 14824 mutex_exit(&uill->ill_lock); 14825 ill_refrele(uill); 14826 return (B_FALSE); 14827 } 14828 14829 /* 14830 * Find an ipif with a good local address on the ill+zoneid. 14831 */ 14832 ipif_t * 14833 ipif_good_addr(ill_t *ill, zoneid_t zoneid) 14834 { 14835 ipif_t *ipif; 14836 14837 mutex_enter(&ill->ill_lock); 14838 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 14839 if (IPIF_IS_CONDEMNED(ipif)) 14840 continue; 14841 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 14842 continue; 14843 if (!(ipif->ipif_flags & IPIF_UP)) 14844 continue; 14845 if (ipif->ipif_zoneid != zoneid && 14846 ipif->ipif_zoneid != ALL_ZONES && zoneid != ALL_ZONES) 14847 continue; 14848 if (ill->ill_isv6 ? 14849 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) : 14850 ipif->ipif_lcl_addr == INADDR_ANY) 14851 continue; 14852 ipif_refhold_locked(ipif); 14853 mutex_exit(&ill->ill_lock); 14854 return (ipif); 14855 } 14856 mutex_exit(&ill->ill_lock); 14857 return (NULL); 14858 } 14859 14860 /* 14861 * IP source address type, sorted from worst to best. For a given type, 14862 * always prefer IP addresses on the same subnet. All-zones addresses are 14863 * suboptimal because they pose problems with unlabeled destinations. 14864 */ 14865 typedef enum { 14866 IPIF_NONE, 14867 IPIF_DIFFNET_DEPRECATED, /* deprecated and different subnet */ 14868 IPIF_SAMENET_DEPRECATED, /* deprecated and same subnet */ 14869 IPIF_DIFFNET_ALLZONES, /* allzones and different subnet */ 14870 IPIF_SAMENET_ALLZONES, /* allzones and same subnet */ 14871 IPIF_DIFFNET, /* normal and different subnet */ 14872 IPIF_SAMENET, /* normal and same subnet */ 14873 IPIF_LOCALADDR /* local loopback */ 14874 } ipif_type_t; 14875 14876 /* 14877 * Pick the optimal ipif on `ill' for sending to destination `dst' from zone 14878 * `zoneid'. We rate usable ipifs from low -> high as per the ipif_type_t 14879 * enumeration, and return the highest-rated ipif. If there's a tie, we pick 14880 * the first one, unless IPMP is used in which case we round-robin among them; 14881 * see below for more. 14882 * 14883 * Returns NULL if there is no suitable source address for the ill, 14884 * i.e., when the ill has no valid source address at all. 14885 */ 14886 ipif_t * 14887 ipif_select_source_v4(ill_t *ill, ipaddr_t dst, zoneid_t zoneid, 14888 boolean_t allow_usesrc, boolean_t *notreadyp) 14889 { 14890 ill_t *usill = NULL; 14891 ill_t *ipmp_ill = NULL; 14892 ipif_t *start_ipif, *next_ipif, *ipif, *best_ipif; 14893 ipif_type_t type, best_type; 14894 tsol_tpc_t *src_rhtp, *dst_rhtp; 14895 ip_stack_t *ipst = ill->ill_ipst; 14896 boolean_t samenet; 14897 14898 if (ill->ill_usesrc_ifindex != 0 && allow_usesrc) { 14899 usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex, 14900 B_FALSE, ipst); 14901 if (usill != NULL) 14902 ill = usill; /* Select source from usesrc ILL */ 14903 else 14904 return (NULL); 14905 } 14906 14907 /* 14908 * Test addresses should never be used for source address selection, 14909 * so if we were passed one, switch to the IPMP meta-interface.
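 * (In the IPMP model, data addresses are hosted on the IPMP
 * meta-interface while underlying ills carry only test addresses,
 * which must not be handed out as ordinary source addresses.)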
14910 */ 14911 if (IS_UNDER_IPMP(ill)) { 14912 if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) 14913 ill = ipmp_ill; /* Select source from IPMP ill */ 14914 else 14915 return (NULL); 14916 } 14917 14918 /* 14919 * If we're dealing with an unlabeled destination on a labeled system, 14920 * make sure that we ignore source addresses that are incompatible with 14921 * the destination's default label. That destination's default label 14922 * must dominate the minimum label on the source address. 14923 */ 14924 dst_rhtp = NULL; 14925 if (is_system_labeled()) { 14926 dst_rhtp = find_tpc(&dst, IPV4_VERSION, B_FALSE); 14927 if (dst_rhtp == NULL) 14928 return (NULL); 14929 if (dst_rhtp->tpc_tp.host_type != UNLABELED) { 14930 TPC_RELE(dst_rhtp); 14931 dst_rhtp = NULL; 14932 } 14933 } 14934 14935 /* 14936 * Hold the ill_g_lock as reader. This makes sure that no ipif/ill 14937 * can be deleted. But an ipif/ill can get CONDEMNED any time. 14938 * After selecting the right ipif, under ill_lock make sure ipif is 14939 * not condemned, and increment refcnt. If ipif is CONDEMNED, 14940 * we retry. Inside the loop we still need to check for CONDEMNED, 14941 * but not under a lock. 14942 */ 14943 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 14944 retry: 14945 /* 14946 * For source address selection, we treat the ipif list as circular 14947 * and continue until we get back to where we started. This allows 14948 * IPMP to vary source address selection (which improves inbound load 14949 * spreading) by caching its last ending point and starting from 14950 * there. NOTE: we don't have to worry about ill_src_ipif changing 14951 * ills since that can't happen on the IPMP ill. 14952 */ 14953 start_ipif = ill->ill_ipif; 14954 if (IS_IPMP(ill) && ill->ill_src_ipif != NULL) 14955 start_ipif = ill->ill_src_ipif; 14956 14957 ipif = start_ipif; 14958 best_ipif = NULL; 14959 best_type = IPIF_NONE; 14960 do { 14961 if ((next_ipif = ipif->ipif_next) == NULL) 14962 next_ipif = ill->ill_ipif; 14963 14964 if (IPIF_IS_CONDEMNED(ipif)) 14965 continue; 14966 /* Always skip NOLOCAL and ANYCAST interfaces */ 14967 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 14968 continue; 14969 /* Always skip NOACCEPT interfaces */ 14970 if (ipif->ipif_ill->ill_flags & ILLF_NOACCEPT) 14971 continue; 14972 if (!(ipif->ipif_flags & IPIF_UP)) 14973 continue; 14974 14975 if (!ipif->ipif_addr_ready) { 14976 if (notreadyp != NULL) 14977 *notreadyp = B_TRUE; 14978 continue; 14979 } 14980 14981 if (zoneid != ALL_ZONES && 14982 ipif->ipif_zoneid != zoneid && 14983 ipif->ipif_zoneid != ALL_ZONES) 14984 continue; 14985 14986 /* 14987 * Interfaces with 0.0.0.0 address are allowed to be UP, but 14988 * are not valid as source addresses. 14989 */ 14990 if (ipif->ipif_lcl_addr == INADDR_ANY) 14991 continue; 14992 14993 /* 14994 * Check compatibility of local address for destination's 14995 * default label if we're on a labeled system. Incompatible 14996 * addresses can't be used at all. 
14997 */ 14998 if (dst_rhtp != NULL) { 14999 boolean_t incompat; 15000 15001 src_rhtp = find_tpc(&ipif->ipif_lcl_addr, 15002 IPV4_VERSION, B_FALSE); 15003 if (src_rhtp == NULL) 15004 continue; 15005 incompat = src_rhtp->tpc_tp.host_type != SUN_CIPSO || 15006 src_rhtp->tpc_tp.tp_doi != 15007 dst_rhtp->tpc_tp.tp_doi || 15008 (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label, 15009 &src_rhtp->tpc_tp.tp_sl_range_cipso) && 15010 !blinlset(&dst_rhtp->tpc_tp.tp_def_label, 15011 src_rhtp->tpc_tp.tp_sl_set_cipso)); 15012 TPC_RELE(src_rhtp); 15013 if (incompat) 15014 continue; 15015 } 15016 15017 samenet = ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet); 15018 15019 if (ipif->ipif_lcl_addr == dst) { 15020 type = IPIF_LOCALADDR; 15021 } else if (ipif->ipif_flags & IPIF_DEPRECATED) { 15022 type = samenet ? IPIF_SAMENET_DEPRECATED : 15023 IPIF_DIFFNET_DEPRECATED; 15024 } else if (ipif->ipif_zoneid == ALL_ZONES) { 15025 type = samenet ? IPIF_SAMENET_ALLZONES : 15026 IPIF_DIFFNET_ALLZONES; 15027 } else { 15028 type = samenet ? IPIF_SAMENET : IPIF_DIFFNET; 15029 } 15030 15031 if (type > best_type) { 15032 best_type = type; 15033 best_ipif = ipif; 15034 if (best_type == IPIF_LOCALADDR) 15035 break; /* can't get better */ 15036 } 15037 } while ((ipif = next_ipif) != start_ipif); 15038 15039 if ((ipif = best_ipif) != NULL) { 15040 mutex_enter(&ipif->ipif_ill->ill_lock); 15041 if (IPIF_IS_CONDEMNED(ipif)) { 15042 mutex_exit(&ipif->ipif_ill->ill_lock); 15043 goto retry; 15044 } 15045 ipif_refhold_locked(ipif); 15046 15047 /* 15048 * For IPMP, update the source ipif rotor to the next ipif, 15049 * provided we can look it up. (We must not use it if it's 15050 * IPIF_CONDEMNED since we may have grabbed ill_g_lock after 15051 * ipif_free() checked ill_src_ipif.) 15052 */ 15053 if (IS_IPMP(ill) && ipif != NULL) { 15054 next_ipif = ipif->ipif_next; 15055 if (next_ipif != NULL && !IPIF_IS_CONDEMNED(next_ipif)) 15056 ill->ill_src_ipif = next_ipif; 15057 else 15058 ill->ill_src_ipif = NULL; 15059 } 15060 mutex_exit(&ipif->ipif_ill->ill_lock); 15061 } 15062 15063 rw_exit(&ipst->ips_ill_g_lock); 15064 if (usill != NULL) 15065 ill_refrele(usill); 15066 if (ipmp_ill != NULL) 15067 ill_refrele(ipmp_ill); 15068 if (dst_rhtp != NULL) 15069 TPC_RELE(dst_rhtp); 15070 15071 #ifdef DEBUG 15072 if (ipif == NULL) { 15073 char buf1[INET6_ADDRSTRLEN]; 15074 15075 ip1dbg(("ipif_select_source_v4(%s, %s) -> NULL\n", 15076 ill->ill_name, 15077 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)))); 15078 } else { 15079 char buf1[INET6_ADDRSTRLEN]; 15080 char buf2[INET6_ADDRSTRLEN]; 15081 15082 ip1dbg(("ipif_select_source_v4(%s, %s) -> %s\n", 15083 ipif->ipif_ill->ill_name, 15084 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)), 15085 inet_ntop(AF_INET, &ipif->ipif_lcl_addr, 15086 buf2, sizeof (buf2)))); 15087 } 15088 #endif /* DEBUG */ 15089 return (ipif); 15090 } 15091 15092 /* 15093 * Pick a source address based on the destination ill and an optional setsrc 15094 * address. 15095 * The result is stored in srcp. If generation is set, then put the source 15096 * generation number there before we look for the source address (to avoid 15097 * missing changes in the set of source addresses). 15098 * If flagsp is set, then use it to pass back ipif_flags.
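 *
 * Illustrative example (editor's addition, with made-up addresses): if
 * dst is 10.1.2.99 and the ill has two usable ipifs, A = 10.1.2.5/24
 * marked IPIF_DEPRECATED and B = 192.168.7.5/24 with no special flags,
 * then A rates IPIF_SAMENET_DEPRECATED and B rates IPIF_DIFFNET. Since
 * IPIF_DIFFNET ranks higher in the ipif_type_t ordering, 192.168.7.5 is
 * chosen even though A is on the destination's subnet: a subnet match
 * only breaks ties between addresses of the same class.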
15099 * 15100 * If the caller wants to cache the returned source address and detect when 15101 * that might be stale, the caller should pass in a generation argument, 15102 * which the caller can later compare against ips_src_generation 15103 * 15104 * The precedence order for selecting an IPv4 source address is: 15105 * - RTF_SETSRC on the offlink ire always wins. 15106 * - If usrsrc is set, swap the ill to be the usesrc one. 15107 * - If IPMP is used on the ill, select a random address from the most 15108 * preferred ones below: 15109 * 1. If onlink destination, same subnet and not deprecated, not ALL_ZONES 15110 * 2. Not deprecated, not ALL_ZONES 15111 * 3. If onlink destination, same subnet and not deprecated, ALL_ZONES 15112 * 4. Not deprecated, ALL_ZONES 15113 * 5. If onlink destination, same subnet and deprecated 15114 * 6. Deprecated. 15115 * 15116 * We have lower preference for ALL_ZONES IP addresses, 15117 * as they pose problems with unlabeled destinations. 15118 * 15119 * Note that when multiple IP addresses match e.g., #1 we pick 15120 * the first one if IPMP is not in use. With IPMP we randomize. 15121 */ 15122 int 15123 ip_select_source_v4(ill_t *ill, ipaddr_t setsrc, ipaddr_t dst, 15124 ipaddr_t multicast_ifaddr, 15125 zoneid_t zoneid, ip_stack_t *ipst, ipaddr_t *srcp, 15126 uint32_t *generation, uint64_t *flagsp) 15127 { 15128 ipif_t *ipif; 15129 boolean_t notready = B_FALSE; /* Set if !ipif_addr_ready found */ 15130 15131 if (flagsp != NULL) 15132 *flagsp = 0; 15133 15134 /* 15135 * Need to grab the generation number before we check to 15136 * avoid a race with a change to the set of local addresses. 15137 * No lock needed since the thread which updates the set of local 15138 * addresses use ipif/ill locks and exit those (hence a store memory 15139 * barrier) before doing the atomic increase of ips_src_generation. 15140 */ 15141 if (generation != NULL) { 15142 *generation = ipst->ips_src_generation; 15143 } 15144 15145 if (CLASSD(dst) && multicast_ifaddr != INADDR_ANY) { 15146 *srcp = multicast_ifaddr; 15147 return (0); 15148 } 15149 15150 /* Was RTF_SETSRC set on the first IRE in the recursive lookup? */ 15151 if (setsrc != INADDR_ANY) { 15152 *srcp = setsrc; 15153 return (0); 15154 } 15155 ipif = ipif_select_source_v4(ill, dst, zoneid, B_TRUE, ¬ready); 15156 if (ipif == NULL) { 15157 if (notready) 15158 return (ENETDOWN); 15159 else 15160 return (EADDRNOTAVAIL); 15161 } 15162 *srcp = ipif->ipif_lcl_addr; 15163 if (flagsp != NULL) 15164 *flagsp = ipif->ipif_flags; 15165 ipif_refrele(ipif); 15166 return (0); 15167 } 15168 15169 /* ARGSUSED */ 15170 int 15171 if_unitsel_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 15172 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 15173 { 15174 /* 15175 * ill_phyint_reinit merged the v4 and v6 into a single 15176 * ipsq. We might not have been able to complete the 15177 * operation in ipif_set_values, if we could not become 15178 * exclusive. If so restart it here. 15179 */ 15180 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q)); 15181 } 15182 15183 /* 15184 * Can operate on either a module or a driver queue. 15185 * Returns an error if not a module queue. 
15186 */ 15187 /* ARGSUSED */ 15188 int 15189 if_unitsel(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 15190 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 15191 { 15192 queue_t *q1 = q; 15193 char *cp; 15194 char interf_name[LIFNAMSIZ]; 15195 uint_t ppa = *(uint_t *)mp->b_cont->b_cont->b_rptr; 15196 15197 if (q->q_next == NULL) { 15198 ip1dbg(( 15199 "if_unitsel: IF_UNITSEL: no q_next\n")); 15200 return (EINVAL); 15201 } 15202 15203 if (((ill_t *)(q->q_ptr))->ill_name[0] != '\0') 15204 return (EALREADY); 15205 15206 do { 15207 q1 = q1->q_next; 15208 } while (q1->q_next); 15209 cp = q1->q_qinfo->qi_minfo->mi_idname; 15210 (void) sprintf(interf_name, "%s%d", cp, ppa); 15211 15212 /* 15213 * Here we are not going to delay the ioack until after 15214 * ACKs from DL_ATTACH_REQ/DL_BIND_REQ. So no need to save the 15215 * original ioctl message before sending the requests. 15216 */ 15217 return (ipif_set_values(q, mp, interf_name, &ppa)); 15218 } 15219 15220 /* ARGSUSED */ 15221 int 15222 ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 15223 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 15224 { 15225 return (ENXIO); 15226 } 15227 15228 /* 15229 * Create any IRE_BROADCAST entries for `ipif', and store those entries in 15230 * `irep'. Returns a pointer to the next free `irep' entry. 15231 * A mirror exists in ipif_delete_bcast_ires(). 15232 * 15233 * The management of any "extra" or seemingly duplicate IRE_BROADCASTs is 15234 * done in ire_add. 15235 */ 15236 static ire_t ** 15237 ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep) 15238 { 15239 ipaddr_t addr; 15240 ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr); 15241 ipaddr_t subnetmask = ipif->ipif_net_mask; 15242 ill_t *ill = ipif->ipif_ill; 15243 zoneid_t zoneid = ipif->ipif_zoneid; 15244 15245 ip1dbg(("ipif_create_bcast_ires: creating broadcast IREs\n")); 15246 15247 ASSERT(ipif->ipif_flags & IPIF_BROADCAST); 15248 ASSERT(!(ipif->ipif_flags & IPIF_NOXMIT)); 15249 15250 if (ipif->ipif_lcl_addr == INADDR_ANY || 15251 (ipif->ipif_flags & IPIF_NOLOCAL)) 15252 netmask = htonl(IN_CLASSA_NET); /* fallback */ 15253 15254 irep = ire_create_bcast(ill, 0, zoneid, irep); 15255 irep = ire_create_bcast(ill, INADDR_BROADCAST, zoneid, irep); 15256 15257 /* 15258 * For backward compatibility, we create net broadcast IREs based on 15259 * the old "IP address class system", since some old machines only 15260 * respond to these class-derived net broadcasts. However, we must not 15261 * create these net broadcast IREs if the subnetmask is shorter than 15262 * the IP address class based derived netmask. Otherwise, we may 15263 * create a net broadcast address which is the same as an IP address 15264 * on the subnet -- and then TCP will refuse to talk to that address. 15265 */ 15266 if (netmask < subnetmask) { 15267 addr = netmask & ipif->ipif_subnet; 15268 irep = ire_create_bcast(ill, addr, zoneid, irep); 15269 irep = ire_create_bcast(ill, ~netmask | addr, zoneid, irep); 15270 } 15271 15272 /* 15273 * Don't create IRE_BROADCAST IREs for the interface if the subnetmask 15274 * is 0xFFFFFFFF, as an IRE_LOCAL for that interface is already 15275 * created. Creating these broadcast IREs will only create confusion 15276 * as `addr' will be the same as the IP address.
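 *
 * Worked example (editor's addition, with a made-up address): for an
 * ipif configured as 10.1.2.3 with subnetmask 255.255.255.0,
 * ip_net_mask() derives the classful netmask 255.0.0.0. Since that is
 * shorter than the subnetmask, the code above creates IREs for 0.0.0.0,
 * 255.255.255.255, 10.0.0.0 and 10.255.255.255, and the test below adds
 * 10.1.2.0 and 10.1.2.255. With a host subnetmask (255.255.255.255) the
 * subnet pair is skipped for the reason described here.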
15277 */ 15278 if (subnetmask != 0xFFFFFFFF) { 15279 addr = ipif->ipif_subnet; 15280 irep = ire_create_bcast(ill, addr, zoneid, irep); 15281 irep = ire_create_bcast(ill, ~subnetmask | addr, zoneid, irep); 15282 } 15283 15284 return (irep); 15285 } 15286 15287 /* 15288 * Mirror of ipif_create_bcast_ires() 15289 */ 15290 static void 15291 ipif_delete_bcast_ires(ipif_t *ipif) 15292 { 15293 ipaddr_t addr; 15294 ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr); 15295 ipaddr_t subnetmask = ipif->ipif_net_mask; 15296 ill_t *ill = ipif->ipif_ill; 15297 zoneid_t zoneid = ipif->ipif_zoneid; 15298 ire_t *ire; 15299 15300 ASSERT(ipif->ipif_flags & IPIF_BROADCAST); 15301 ASSERT(!(ipif->ipif_flags & IPIF_NOXMIT)); 15302 15303 if (ipif->ipif_lcl_addr == INADDR_ANY || 15304 (ipif->ipif_flags & IPIF_NOLOCAL)) 15305 netmask = htonl(IN_CLASSA_NET); /* fallback */ 15306 15307 ire = ire_lookup_bcast(ill, 0, zoneid); 15308 ASSERT(ire != NULL); 15309 ire_delete(ire); ire_refrele(ire); 15310 ire = ire_lookup_bcast(ill, INADDR_BROADCAST, zoneid); 15311 ASSERT(ire != NULL); 15312 ire_delete(ire); ire_refrele(ire); 15313 15314 /* 15315 * For backward compatibility, we create net broadcast IREs based on 15316 * the old "IP address class system", since some old machines only 15317 * respond to these class derived net broadcast. However, we must not 15318 * create these net broadcast IREs if the subnetmask is shorter than 15319 * the IP address class based derived netmask. Otherwise, we may 15320 * create a net broadcast address which is the same as an IP address 15321 * on the subnet -- and then TCP will refuse to talk to that address. 15322 */ 15323 if (netmask < subnetmask) { 15324 addr = netmask & ipif->ipif_subnet; 15325 ire = ire_lookup_bcast(ill, addr, zoneid); 15326 ASSERT(ire != NULL); 15327 ire_delete(ire); ire_refrele(ire); 15328 ire = ire_lookup_bcast(ill, ~netmask | addr, zoneid); 15329 ASSERT(ire != NULL); 15330 ire_delete(ire); ire_refrele(ire); 15331 } 15332 15333 /* 15334 * Don't create IRE_BROADCAST IREs for the interface if the subnetmask 15335 * is 0xFFFFFFFF, as an IRE_LOCAL for that interface is already 15336 * created. Creating these broadcast IREs will only create confusion 15337 * as `addr' will be the same as the IP address. 15338 */ 15339 if (subnetmask != 0xFFFFFFFF) { 15340 addr = ipif->ipif_subnet; 15341 ire = ire_lookup_bcast(ill, addr, zoneid); 15342 ASSERT(ire != NULL); 15343 ire_delete(ire); ire_refrele(ire); 15344 ire = ire_lookup_bcast(ill, ~subnetmask | addr, zoneid); 15345 ASSERT(ire != NULL); 15346 ire_delete(ire); ire_refrele(ire); 15347 } 15348 } 15349 15350 /* 15351 * Extract both the flags (including IFF_CANTCHANGE) such as IFF_IPV* 15352 * from lifr_flags and the name from lifr_name. 15353 * Set IFF_IPV* and ill_isv6 prior to doing the lookup 15354 * since ipif_lookup_on_name uses the _isv6 flags when matching. 15355 * Returns EINPROGRESS when mp has been consumed by queueing it on 15356 * ipx_pending_mp and the ioctl will complete in ip_rput. 15357 * 15358 * Can operate on either a module or a driver queue. 15359 * Returns an error if not a module queue. 
15360 */ 15361 /* ARGSUSED */ 15362 int 15363 ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15364 ip_ioctl_cmd_t *ipip, void *if_req) 15365 { 15366 ill_t *ill = q->q_ptr; 15367 phyint_t *phyi; 15368 ip_stack_t *ipst; 15369 struct lifreq *lifr = if_req; 15370 uint64_t new_flags; 15371 15372 ASSERT(ipif != NULL); 15373 ip1dbg(("ip_sioctl_slifname %s\n", lifr->lifr_name)); 15374 15375 if (q->q_next == NULL) { 15376 ip1dbg(("if_sioctl_slifname: SIOCSLIFNAME: no q_next\n")); 15377 return (EINVAL); 15378 } 15379 15380 /* 15381 * If we are not writer on 'q' then this interface exists already 15382 * and previous lookups (ip_extract_lifreq()) found this ipif -- 15383 * so return EALREADY. 15384 */ 15385 if (ill != ipif->ipif_ill) 15386 return (EALREADY); 15387 15388 if (ill->ill_name[0] != '\0') 15389 return (EALREADY); 15390 15391 /* 15392 * If there's another ill already with the requested name, ensure 15393 * that it's of the same type. Otherwise, ill_phyint_reinit() will 15394 * fuse together two unrelated ills, which will cause chaos. 15395 */ 15396 ipst = ill->ill_ipst; 15397 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 15398 lifr->lifr_name, NULL); 15399 if (phyi != NULL) { 15400 ill_t *ill_mate = phyi->phyint_illv4; 15401 15402 if (ill_mate == NULL) 15403 ill_mate = phyi->phyint_illv6; 15404 ASSERT(ill_mate != NULL); 15405 15406 if (ill_mate->ill_media->ip_m_mac_type != 15407 ill->ill_media->ip_m_mac_type) { 15408 ip1dbg(("if_sioctl_slifname: SIOCSLIFNAME: attempt to " 15409 "use the same ill name on differing media\n")); 15410 return (EINVAL); 15411 } 15412 } 15413 15414 /* 15415 * We start off as IFF_IPV4 in ipif_allocate and become 15416 * IFF_IPV4 or IFF_IPV6 here depending on lifr_flags value. 15417 * The only flags that we read from user space are IFF_IPV4, 15418 * IFF_IPV6, and IFF_BROADCAST. 15419 * 15420 * This ill has not been inserted into the global list. 15421 * So we are still single threaded and don't need any lock. 15422 * 15423 * Sanity check the flags. 15424 */ 15425 15426 if ((lifr->lifr_flags & IFF_BROADCAST) && 15427 ((lifr->lifr_flags & IFF_IPV6) || 15428 (!ill->ill_needs_attach && ill->ill_bcast_addr_length == 0))) { 15429 ip1dbg(("ip_sioctl_slifname: link not broadcast capable " 15430 "or IPv6 i.e., no broadcast\n")); 15431 return (EINVAL); 15432 } 15433 15434 new_flags = 15435 lifr->lifr_flags & (IFF_IPV6|IFF_IPV4|IFF_BROADCAST); 15436 15437 if ((new_flags ^ (IFF_IPV6|IFF_IPV4)) == 0) { 15438 ip1dbg(("ip_sioctl_slifname: flags must be exactly one of " 15439 "IFF_IPV4 or IFF_IPV6\n")); 15440 return (EINVAL); 15441 } 15442 15443 /* 15444 * We always start off as IPv4, so only need to check for IPv6. 15445 */ 15446 if ((new_flags & IFF_IPV6) != 0) { 15447 ill->ill_flags |= ILLF_IPV6; 15448 ill->ill_flags &= ~ILLF_IPV4; 15449 } 15450 15451 if ((new_flags & IFF_BROADCAST) != 0) 15452 ipif->ipif_flags |= IPIF_BROADCAST; 15453 else 15454 ipif->ipif_flags &= ~IPIF_BROADCAST; 15455 15456 /* We started off as V4. */ 15457 if (ill->ill_flags & ILLF_IPV6) { 15458 ill->ill_phyint->phyint_illv6 = ill; 15459 ill->ill_phyint->phyint_illv4 = NULL; 15460 } 15461 15462 return (ipif_set_values(q, mp, lifr->lifr_name, &lifr->lifr_ppa)); 15463 } 15464 15465 /* ARGSUSED */ 15466 int 15467 ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15468 ip_ioctl_cmd_t *ipip, void *if_req) 15469 { 15470 /* 15471 * ill_phyint_reinit merged the v4 and v6 into a single 15472 * ipsq.
We might not have been able to complete the 15473 * slifname in ipif_set_values, if we could not become 15474 * exclusive. If so, restart it here. 15475 */ 15476 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q)); 15477 } 15478 15479 /* 15480 * Return a pointer to the ipif which matches the index, IP version type and 15481 * zoneid. 15482 */ 15483 ipif_t * 15484 ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid, 15485 ip_stack_t *ipst) 15486 { 15487 ill_t *ill; 15488 ipif_t *ipif = NULL; 15489 15490 ill = ill_lookup_on_ifindex(index, isv6, ipst); 15491 if (ill != NULL) { 15492 mutex_enter(&ill->ill_lock); 15493 for (ipif = ill->ill_ipif; ipif != NULL; 15494 ipif = ipif->ipif_next) { 15495 if (!IPIF_IS_CONDEMNED(ipif) && (zoneid == ALL_ZONES || 15496 zoneid == ipif->ipif_zoneid || 15497 ipif->ipif_zoneid == ALL_ZONES)) { 15498 ipif_refhold_locked(ipif); 15499 break; 15500 } 15501 } 15502 mutex_exit(&ill->ill_lock); 15503 ill_refrele(ill); 15504 } 15505 return (ipif); 15506 } 15507 15508 /* 15509 * Change an existing physical interface's index. If the new index 15510 * is acceptable we update the index and the phyint_list_avl_by_index tree. 15511 * Finally, we update other systems which may have a dependence on the 15512 * index value. 15513 */ 15514 /* ARGSUSED */ 15515 int 15516 ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15517 ip_ioctl_cmd_t *ipip, void *ifreq) 15518 { 15519 ill_t *ill; 15520 phyint_t *phyi; 15521 struct ifreq *ifr = (struct ifreq *)ifreq; 15522 struct lifreq *lifr = (struct lifreq *)ifreq; 15523 uint_t old_index, index; 15524 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 15525 avl_index_t where; 15526 15527 if (ipip->ipi_cmd_type == IF_CMD) 15528 index = ifr->ifr_index; 15529 else 15530 index = lifr->lifr_index; 15531 15532 /* 15533 * Only allow on physical interface. Also, index zero is illegal. 15534 */ 15535 ill = ipif->ipif_ill; 15536 phyi = ill->ill_phyint; 15537 if (ipif->ipif_id != 0 || index == 0) { 15538 return (EINVAL); 15539 } 15540 15541 /* If the index is not changing, no work to do */ 15542 if (phyi->phyint_ifindex == index) 15543 return (0); 15544 15545 /* 15546 * Use phyint_exists() to determine if the new interface index 15547 * is already in use. If the index is unused then we need to 15548 * change the phyint's position in the phyint_list_avl_by_index 15549 * tree. If we do not do this, subsequent lookups (using the new 15550 * index value) will not find the phyint. 15551 */ 15552 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 15553 if (phyint_exists(index, ipst)) { 15554 rw_exit(&ipst->ips_ill_g_lock); 15555 return (EEXIST); 15556 } 15557 15558 /* 15559 * The new index is unused. Set it in the phyint. However, we must 15560 * not forget to trigger the NE_IFINDEX_CHANGE event before the 15561 * ifindex changes; the event must be bound to the old ifindex value.
15562 */ 15563 ill_nic_event_dispatch(ill, 0, NE_IFINDEX_CHANGE, 15564 &index, sizeof (index)); 15565 15566 old_index = phyi->phyint_ifindex; 15567 phyi->phyint_ifindex = index; 15568 15569 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, phyi); 15570 (void) avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 15571 &index, &where); 15572 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 15573 phyi, where); 15574 rw_exit(&ipst->ips_ill_g_lock); 15575 15576 /* Update SCTP's ILL list */ 15577 sctp_ill_reindex(ill, old_index); 15578 15579 /* Send the routing sockets message */ 15580 ip_rts_ifmsg(ipif, RTSQ_DEFAULT); 15581 if (ILL_OTHER(ill)) 15582 ip_rts_ifmsg(ILL_OTHER(ill)->ill_ipif, RTSQ_DEFAULT); 15583 15584 /* Perhaps ilgs should use this ill */ 15585 update_conn_ill(NULL, ill->ill_ipst); 15586 return (0); 15587 } 15588 15589 /* ARGSUSED */ 15590 int 15591 ip_sioctl_get_lifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15592 ip_ioctl_cmd_t *ipip, void *ifreq) 15593 { 15594 struct ifreq *ifr = (struct ifreq *)ifreq; 15595 struct lifreq *lifr = (struct lifreq *)ifreq; 15596 15597 ip1dbg(("ip_sioctl_get_lifindex(%s:%u %p)\n", 15598 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 15599 /* Get the interface index */ 15600 if (ipip->ipi_cmd_type == IF_CMD) { 15601 ifr->ifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 15602 } else { 15603 lifr->lifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 15604 } 15605 return (0); 15606 } 15607 15608 /* ARGSUSED */ 15609 int 15610 ip_sioctl_get_lifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15611 ip_ioctl_cmd_t *ipip, void *ifreq) 15612 { 15613 struct lifreq *lifr = (struct lifreq *)ifreq; 15614 15615 ip1dbg(("ip_sioctl_get_lifzone(%s:%u %p)\n", 15616 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 15617 /* Get the interface zone */ 15618 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 15619 lifr->lifr_zoneid = ipif->ipif_zoneid; 15620 return (0); 15621 } 15622 15623 /* 15624 * Set the zoneid of an interface. 15625 */ 15626 /* ARGSUSED */ 15627 int 15628 ip_sioctl_slifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15629 ip_ioctl_cmd_t *ipip, void *ifreq) 15630 { 15631 struct lifreq *lifr = (struct lifreq *)ifreq; 15632 int err = 0; 15633 boolean_t need_up = B_FALSE; 15634 zone_t *zptr; 15635 zone_status_t status; 15636 zoneid_t zoneid; 15637 15638 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 15639 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) { 15640 if (!is_system_labeled()) 15641 return (ENOTSUP); 15642 zoneid = GLOBAL_ZONEID; 15643 } 15644 15645 /* cannot assign instance zero to a non-global zone */ 15646 if (ipif->ipif_id == 0 && zoneid != GLOBAL_ZONEID) 15647 return (ENOTSUP); 15648 15649 /* 15650 * Cannot assign to a zone that doesn't exist or is shutting down. In 15651 * the event of a race with the zone shutdown processing, since IP 15652 * serializes this ioctl and SIOCGLIFCONF/SIOCLIFREMOVEIF, we know the 15653 * interface will be cleaned up even if the zone is shut down 15654 * immediately after the status check. If the interface can't be brought 15655 * down right away, and the zone is shut down before the restart 15656 * function is called, we resolve the possible races by rechecking the 15657 * zone status in the restart function. 
15658 */ 15659 if ((zptr = zone_find_by_id(zoneid)) == NULL) 15660 return (EINVAL); 15661 status = zone_status_get(zptr); 15662 zone_rele(zptr); 15663 15664 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) 15665 return (EINVAL); 15666 15667 if (ipif->ipif_flags & IPIF_UP) { 15668 /* 15669 * If the interface is already marked up, 15670 * we call ipif_down which will take care 15671 * of ditching any IREs that have been set 15672 * up based on the old interface address. 15673 */ 15674 err = ipif_logical_down(ipif, q, mp); 15675 if (err == EINPROGRESS) 15676 return (err); 15677 (void) ipif_down_tail(ipif); 15678 need_up = B_TRUE; 15679 } 15680 15681 err = ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, need_up); 15682 return (err); 15683 } 15684 15685 static int 15686 ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid, 15687 queue_t *q, mblk_t *mp, boolean_t need_up) 15688 { 15689 int err = 0; 15690 ip_stack_t *ipst; 15691 15692 ip1dbg(("ip_sioctl_zoneid_tail(%s:%u %p)\n", 15693 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 15694 15695 if (CONN_Q(q)) 15696 ipst = CONNQ_TO_IPST(q); 15697 else 15698 ipst = ILLQ_TO_IPST(q); 15699 15700 /* 15701 * For exclusive stacks we don't allow a different zoneid than 15702 * global. 15703 */ 15704 if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID && 15705 zoneid != GLOBAL_ZONEID) 15706 return (EINVAL); 15707 15708 /* Set the new zone id. */ 15709 ipif->ipif_zoneid = zoneid; 15710 15711 /* Update sctp list */ 15712 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 15713 15714 /* The default multicast interface might have changed */ 15715 ire_increment_multicast_generation(ipst, ipif->ipif_ill->ill_isv6); 15716 15717 if (need_up) { 15718 /* 15719 * Now bring the interface back up. If this 15720 * is the only IPIF for the ILL, ipif_up 15721 * will have to re-bind to the device, so 15722 * we may get back EINPROGRESS, in which 15723 * case, this IOCTL will get completed in 15724 * ip_rput_dlpi when we see the DL_BIND_ACK. 15725 */ 15726 err = ipif_up(ipif, q, mp); 15727 } 15728 return (err); 15729 } 15730 15731 /* ARGSUSED */ 15732 int 15733 ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15734 ip_ioctl_cmd_t *ipip, void *if_req) 15735 { 15736 struct lifreq *lifr = (struct lifreq *)if_req; 15737 zoneid_t zoneid; 15738 zone_t *zptr; 15739 zone_status_t status; 15740 15741 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 15742 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) 15743 zoneid = GLOBAL_ZONEID; 15744 15745 ip1dbg(("ip_sioctl_slifzone_restart(%s:%u %p)\n", 15746 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 15747 15748 /* 15749 * We recheck the zone status to resolve the following race condition: 15750 * 1) process sends SIOCSLIFZONE to put hme0:1 in zone "myzone"; 15751 * 2) hme0:1 is up and can't be brought down right away; 15752 * ip_sioctl_slifzone() returns EINPROGRESS and the request is queued; 15753 * 3) zone "myzone" is halted; the zone status switches to 15754 * 'shutting_down' and the zones framework sends SIOCGLIFCONF to list 15755 * the interfaces to remove - hme0:1 is not returned because it's not 15756 * yet in "myzone", so it won't be removed; 15757 * 4) the restart function for SIOCSLIFZONE is called; without the 15758 * status check here, we would have hme0:1 in "myzone" after it's been 15759 * destroyed. 
15760 * Note that if the status check fails, we need to bring the interface 15761 * back to its state prior to ip_sioctl_slifzone(), hence the call to 15762 * ipif_up_done[_v6](). 15763 */ 15764 status = ZONE_IS_UNINITIALIZED; 15765 if ((zptr = zone_find_by_id(zoneid)) != NULL) { 15766 status = zone_status_get(zptr); 15767 zone_rele(zptr); 15768 } 15769 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) { 15770 if (ipif->ipif_isv6) { 15771 (void) ipif_up_done_v6(ipif); 15772 } else { 15773 (void) ipif_up_done(ipif); 15774 } 15775 return (EINVAL); 15776 } 15777 15778 (void) ipif_down_tail(ipif); 15779 15780 return (ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, 15781 B_TRUE)); 15782 } 15783 15784 /* 15785 * Return the number of addresses on `ill' with one or more of the values 15786 * in `set' set and all of the values in `clear' clear. 15787 */ 15788 static uint_t 15789 ill_flagaddr_cnt(const ill_t *ill, uint64_t set, uint64_t clear) 15790 { 15791 ipif_t *ipif; 15792 uint_t cnt = 0; 15793 15794 ASSERT(IAM_WRITER_ILL(ill)); 15795 15796 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) 15797 if ((ipif->ipif_flags & set) && !(ipif->ipif_flags & clear)) 15798 cnt++; 15799 15800 return (cnt); 15801 } 15802 15803 /* 15804 * Return the number of migratable addresses on `ill' that are under 15805 * application control. 15806 */ 15807 uint_t 15808 ill_appaddr_cnt(const ill_t *ill) 15809 { 15810 return (ill_flagaddr_cnt(ill, IPIF_DHCPRUNNING | IPIF_ADDRCONF, 15811 IPIF_NOFAILOVER)); 15812 } 15813 15814 /* 15815 * Return the number of point-to-point addresses on `ill'. 15816 */ 15817 uint_t 15818 ill_ptpaddr_cnt(const ill_t *ill) 15819 { 15820 return (ill_flagaddr_cnt(ill, IPIF_POINTOPOINT, 0)); 15821 } 15822 15823 /* ARGSUSED */ 15824 int 15825 ip_sioctl_get_lifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15826 ip_ioctl_cmd_t *ipip, void *ifreq) 15827 { 15828 struct lifreq *lifr = ifreq; 15829 15830 ASSERT(q->q_next == NULL); 15831 ASSERT(CONN_Q(q)); 15832 15833 ip1dbg(("ip_sioctl_get_lifusesrc(%s:%u %p)\n", 15834 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 15835 lifr->lifr_index = ipif->ipif_ill->ill_usesrc_ifindex; 15836 ip1dbg(("ip_sioctl_get_lifusesrc:lifr_index = %d\n", lifr->lifr_index)); 15837 15838 return (0); 15839 } 15840 15841 /* Find the previous ILL in this usesrc group */ 15842 static ill_t * 15843 ill_prev_usesrc(ill_t *uill) 15844 { 15845 ill_t *ill; 15846 15847 for (ill = uill->ill_usesrc_grp_next; 15848 ASSERT(ill), ill->ill_usesrc_grp_next != uill; 15849 ill = ill->ill_usesrc_grp_next) 15850 /* do nothing */; 15851 return (ill); 15852 } 15853 15854 /* 15855 * Release all members of the usesrc group. This routine is called 15856 * from ill_delete when the interface being unplumbed is the 15857 * group head. 15858 * 15859 * This silently clears the usesrc that ifconfig set up. 15860 * An alternative would be to keep that ifindex, and drop packets on the floor 15861 * since no source address can be selected. 15862 * Even if we keep the current semantics, we don't need a lock and a linked 15863 * list: we could walk all the ills, checking whether they have an 15864 * ill_usesrc_ifindex matching the one being removed. The issue is how we 15865 * return the usesrc users (SIOCGLIFSRCOF): we want to be able to find the 15866 * ills whose ill_usesrc_ifindex matches a target ill. We could do that with 15867 * an ill walk too, but the walker would need to insert in the ioctl response.
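 *
 * Illustrative layout (editor's addition, with made-up names): with
 * "ifconfig vni0 usesrc hme0" and "ifconfig vni1 usesrc hme0" in
 * effect, the group is a circular list rooted at the usesrc ill:
 *
 *	hme0 (ill_usesrc_ifindex == 0)
 *	  -> vni0 (ill_usesrc_ifindex == hme0's ifindex)
 *	  -> vni1 (ill_usesrc_ifindex == hme0's ifindex)
 *	  -> back to hme0
 *
 * The loop below walks the clients (every member with a nonzero
 * ill_usesrc_ifindex) and unlinks each one in turn.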
15868 */ 15869 static void 15870 ill_disband_usesrc_group(ill_t *uill) 15871 { 15872 ill_t *next_ill, *tmp_ill; 15873 ip_stack_t *ipst = uill->ill_ipst; 15874 15875 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock)); 15876 next_ill = uill->ill_usesrc_grp_next; 15877 15878 do { 15879 ASSERT(next_ill != NULL); 15880 tmp_ill = next_ill->ill_usesrc_grp_next; 15881 ASSERT(tmp_ill != NULL); 15882 next_ill->ill_usesrc_grp_next = NULL; 15883 next_ill->ill_usesrc_ifindex = 0; 15884 next_ill = tmp_ill; 15885 } while (next_ill->ill_usesrc_ifindex != 0); 15886 uill->ill_usesrc_grp_next = NULL; 15887 } 15888 15889 /* 15890 * Remove the client usesrc ILL from the list and relink to a new list. 15891 */ 15892 int 15893 ill_relink_usesrc_ills(ill_t *ucill, ill_t *uill, uint_t ifindex) 15894 { 15895 ill_t *ill, *tmp_ill; 15896 ip_stack_t *ipst = ucill->ill_ipst; 15897 15898 ASSERT((ucill != NULL) && (ucill->ill_usesrc_grp_next != NULL) && 15899 (uill != NULL) && RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock)); 15900 15901 /* 15902 * Check that the usesrc client ILL passed in is not already 15903 * in use as a usesrc ILL, i.e., one whose source address is 15904 * in use, and that the usesrc ILL is not already in use as a 15905 * usesrc client ILL. 15906 */ 15907 if ((ucill->ill_usesrc_ifindex == 0) || 15908 (uill->ill_usesrc_ifindex != 0)) { 15909 return (-1); 15910 } 15911 15912 ill = ill_prev_usesrc(ucill); 15913 ASSERT(ill->ill_usesrc_grp_next != NULL); 15914 15915 /* Remove from the current list */ 15916 if (ill->ill_usesrc_grp_next->ill_usesrc_grp_next == ill) { 15917 /* Only two elements in the list */ 15918 ASSERT(ill->ill_usesrc_ifindex == 0); 15919 ill->ill_usesrc_grp_next = NULL; 15920 } else { 15921 ill->ill_usesrc_grp_next = ucill->ill_usesrc_grp_next; 15922 } 15923 15924 if (ifindex == 0) { 15925 ucill->ill_usesrc_ifindex = 0; 15926 ucill->ill_usesrc_grp_next = NULL; 15927 return (0); 15928 } 15929 15930 ucill->ill_usesrc_ifindex = ifindex; 15931 tmp_ill = uill->ill_usesrc_grp_next; 15932 uill->ill_usesrc_grp_next = ucill; 15933 ucill->ill_usesrc_grp_next = 15934 (tmp_ill != NULL) ? tmp_ill : uill; 15935 return (0); 15936 } 15937 15938 /* 15939 * Set the ill_usesrc_ifindex and ill_usesrc_grp_next fields. See 15940 * synchronization notes in ip.c for locking details.
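 *
 * Usage sketch (editor's addition; interface names are made up): an
 * administrator runs "ifconfig vni0 usesrc hme0", which arrives here
 * as SIOCSLIFUSESRC with lifr_index set to hme0's ifindex; thereafter
 * source address selection for packets sent over vni0 draws from
 * hme0's addresses. "ifconfig vni0 usesrc none" arrives with
 * lifr_index == 0 and undoes the binding via the reset path below.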
15941 */ 15942 /* ARGSUSED */ 15943 int 15944 ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15945 ip_ioctl_cmd_t *ipip, void *ifreq) 15946 { 15947 struct lifreq *lifr = (struct lifreq *)ifreq; 15948 boolean_t isv6 = B_FALSE, reset_flg = B_FALSE; 15949 ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill; 15950 int err = 0, ret; 15951 uint_t ifindex; 15952 ipsq_t *ipsq = NULL; 15953 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 15954 15955 ASSERT(IAM_WRITER_IPIF(ipif)); 15956 ASSERT(q->q_next == NULL); 15957 ASSERT(CONN_Q(q)); 15958 15959 isv6 = (Q_TO_CONN(q))->conn_family == AF_INET6; 15960 15961 ifindex = lifr->lifr_index; 15962 if (ifindex == 0) { 15963 if (usesrc_cli_ill->ill_usesrc_grp_next == NULL) { 15964 /* non usesrc group interface, nothing to reset */ 15965 return (0); 15966 } 15967 ifindex = usesrc_cli_ill->ill_usesrc_ifindex; 15968 /* valid reset request */ 15969 reset_flg = B_TRUE; 15970 } 15971 15972 usesrc_ill = ill_lookup_on_ifindex(ifindex, isv6, ipst); 15973 if (usesrc_ill == NULL) { 15974 return (ENXIO); 15975 } 15976 15977 ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl, 15978 NEW_OP, B_TRUE); 15979 if (ipsq == NULL) { 15980 err = EINPROGRESS; 15981 /* Operation enqueued on the ipsq of the usesrc ILL */ 15982 goto done; 15983 } 15984 15985 /* USESRC isn't currently supported with IPMP */ 15986 if (IS_IPMP(usesrc_ill) || IS_UNDER_IPMP(usesrc_ill)) { 15987 err = ENOTSUP; 15988 goto done; 15989 } 15990 15991 /* 15992 * USESRC isn't compatible with the STANDBY flag. (STANDBY is only 15993 * used by IPMP underlying interfaces, but someone might think it's 15994 * more general and try to use it independently with VNI.) 15995 */ 15996 if (usesrc_ill->ill_phyint->phyint_flags & PHYI_STANDBY) { 15997 err = ENOTSUP; 15998 goto done; 15999 } 16000 16001 /* 16002 * If the client is already in use as a usesrc_ill or a usesrc_ill is 16003 * already a client then return EINVAL 16004 */ 16005 if (IS_USESRC_ILL(usesrc_cli_ill) || IS_USESRC_CLI_ILL(usesrc_ill)) { 16006 err = EINVAL; 16007 goto done; 16008 } 16009 16010 /* 16011 * If the ill_usesrc_ifindex field is already set to what it needs to 16012 * be then this is a duplicate operation. 16013 */ 16014 if (!reset_flg && usesrc_cli_ill->ill_usesrc_ifindex == ifindex) { 16015 err = 0; 16016 goto done; 16017 } 16018 16019 ip1dbg(("ip_sioctl_slifusesrc: usesrc_cli_ill %s, usesrc_ill %s," 16020 " v6 = %d", usesrc_cli_ill->ill_name, usesrc_ill->ill_name, 16021 usesrc_ill->ill_isv6)); 16022 16023 /* 16024 * ill_g_usesrc_lock global lock protects the ill_usesrc_grp_next 16025 * and the ill_usesrc_ifindex fields 16026 */ 16027 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER); 16028 16029 if (reset_flg) { 16030 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 0); 16031 if (ret != 0) { 16032 err = EINVAL; 16033 } 16034 rw_exit(&ipst->ips_ill_g_usesrc_lock); 16035 goto done; 16036 } 16037 16038 /* 16039 * Four possibilities to consider: 16040 * 1. Both usesrc_ill and usesrc_cli_ill are not part of any usesrc grp 16041 * 2. usesrc_ill is part of a group but usesrc_cli_ill isn't 16042 * 3. usesrc_cli_ill is part of a group but usesrc_ill isn't 16043 * 4. 
Both are part of their respective usesrc groups 16044 */ 16045 if ((usesrc_ill->ill_usesrc_grp_next == NULL) && 16046 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) { 16047 ASSERT(usesrc_ill->ill_usesrc_ifindex == 0); 16048 usesrc_cli_ill->ill_usesrc_ifindex = ifindex; 16049 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill; 16050 usesrc_cli_ill->ill_usesrc_grp_next = usesrc_ill; 16051 } else if ((usesrc_ill->ill_usesrc_grp_next != NULL) && 16052 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) { 16053 usesrc_cli_ill->ill_usesrc_ifindex = ifindex; 16054 /* Insert at head of list */ 16055 usesrc_cli_ill->ill_usesrc_grp_next = 16056 usesrc_ill->ill_usesrc_grp_next; 16057 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill; 16058 } else { 16059 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 16060 ifindex); 16061 if (ret != 0) 16062 err = EINVAL; 16063 } 16064 rw_exit(&ipst->ips_ill_g_usesrc_lock); 16065 16066 done: 16067 if (ipsq != NULL) 16068 ipsq_exit(ipsq); 16069 /* The refrele on the lifr_name ipif is done by ip_process_ioctl */ 16070 ill_refrele(usesrc_ill); 16071 16072 /* Let conn_ixa caching know that source address selection changed */ 16073 ip_update_source_selection(ipst); 16074 16075 return (err); 16076 } 16077 16078 /* 16079 * comparison function used by avl. 16080 */ 16081 static int 16082 ill_phyint_compare_index(const void *index_ptr, const void *phyip) 16083 { 16084 16085 uint_t index; 16086 16087 ASSERT(phyip != NULL && index_ptr != NULL); 16088 16089 index = *((uint_t *)index_ptr); 16090 /* 16091 * let the phyint with the lowest index be on top. 16092 */ 16093 if (((phyint_t *)phyip)->phyint_ifindex < index) 16094 return (1); 16095 if (((phyint_t *)phyip)->phyint_ifindex > index) 16096 return (-1); 16097 return (0); 16098 } 16099 16100 /* 16101 * comparison function used by avl. 16102 */ 16103 static int 16104 ill_phyint_compare_name(const void *name_ptr, const void *phyip) 16105 { 16106 ill_t *ill; 16107 int res = 0; 16108 16109 ASSERT(phyip != NULL && name_ptr != NULL); 16110 16111 if (((phyint_t *)phyip)->phyint_illv4) 16112 ill = ((phyint_t *)phyip)->phyint_illv4; 16113 else 16114 ill = ((phyint_t *)phyip)->phyint_illv6; 16115 ASSERT(ill != NULL); 16116 16117 res = strcmp(ill->ill_name, (char *)name_ptr); 16118 if (res > 0) 16119 return (1); 16120 else if (res < 0) 16121 return (-1); 16122 return (0); 16123 } 16124 16125 /* 16126 * This function is called on the unplumb path via ill_glist_delete() when 16127 * there are no ills left on the phyint and thus the phyint can be freed. 16128 */ 16129 static void 16130 phyint_free(phyint_t *phyi) 16131 { 16132 ip_stack_t *ipst = PHYINT_TO_IPST(phyi); 16133 16134 ASSERT(phyi->phyint_illv4 == NULL && phyi->phyint_illv6 == NULL); 16135 16136 /* 16137 * If this phyint was an IPMP meta-interface, blow away the group. 16138 * This is safe to do because all of the illgrps have already been 16139 * removed by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find us. 16140 * If we're cleaning up as a result of failed initialization, 16141 * phyint_grp may be NULL. 16142 */ 16143 if ((phyi->phyint_flags & PHYI_IPMP) && (phyi->phyint_grp != NULL)) { 16144 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 16145 ipmp_grp_destroy(phyi->phyint_grp); 16146 phyi->phyint_grp = NULL; 16147 rw_exit(&ipst->ips_ipmp_lock); 16148 } 16149 16150 /* 16151 * If this interface was under IPMP, take it out of the group. 
16152 */ 16153 if (phyi->phyint_grp != NULL) 16154 ipmp_phyint_leave_grp(phyi); 16155 16156 /* 16157 * Delete the phyint and disassociate its ipsq. The ipsq itself 16158 * will be freed in ipsq_exit(). 16159 */ 16160 phyi->phyint_ipsq->ipsq_phyint = NULL; 16161 phyi->phyint_name[0] = '\0'; 16162 16163 mi_free(phyi); 16164 } 16165 16166 /* 16167 * Attach the ill to the phyint structure which can be shared by both 16168 * IPv4 and IPv6 ill. ill_init allocates a phyint to just hold flags. This 16169 * function is called from ipif_set_values and ill_lookup_on_name (for 16170 * loopback) where we know the name of the ill. We lookup the ill and if 16171 * there is one present already with the name use that phyint. Otherwise 16172 * reuse the one allocated by ill_init. 16173 */ 16174 static void 16175 ill_phyint_reinit(ill_t *ill) 16176 { 16177 boolean_t isv6 = ill->ill_isv6; 16178 phyint_t *phyi_old; 16179 phyint_t *phyi; 16180 avl_index_t where = 0; 16181 ill_t *ill_other = NULL; 16182 ip_stack_t *ipst = ill->ill_ipst; 16183 16184 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 16185 16186 phyi_old = ill->ill_phyint; 16187 ASSERT(isv6 || (phyi_old->phyint_illv4 == ill && 16188 phyi_old->phyint_illv6 == NULL)); 16189 ASSERT(!isv6 || (phyi_old->phyint_illv6 == ill && 16190 phyi_old->phyint_illv4 == NULL)); 16191 ASSERT(phyi_old->phyint_ifindex == 0); 16192 16193 /* 16194 * Now that our ill has a name, set it in the phyint. 16195 */ 16196 (void) strlcpy(ill->ill_phyint->phyint_name, ill->ill_name, LIFNAMSIZ); 16197 16198 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 16199 ill->ill_name, &where); 16200 16201 /* 16202 * 1. We grabbed the ill_g_lock before inserting this ill into 16203 * the global list of ills. So no other thread could have located 16204 * this ill and hence the ipsq of this ill is guaranteed to be empty. 16205 * 2. Now locate the other protocol instance of this ill. 16206 * 3. Now grab both ill locks in the right order, and the phyint lock of 16207 * the new ipsq. Holding ill locks + ill_g_lock ensures that the ipsq 16208 * of neither ill can change. 16209 * 4. Merge the phyint and thus the ipsq as well of this ill onto the 16210 * other ill. 16211 * 5. Release all locks. 16212 */ 16213 16214 /* 16215 * Look for IPv4 if we are initializing IPv6 or look for IPv6 if 16216 * we are initializing IPv4. 16217 */ 16218 if (phyi != NULL) { 16219 ill_other = (isv6) ? phyi->phyint_illv4 : phyi->phyint_illv6; 16220 ASSERT(ill_other->ill_phyint != NULL); 16221 ASSERT((isv6 && !ill_other->ill_isv6) || 16222 (!isv6 && ill_other->ill_isv6)); 16223 GRAB_ILL_LOCKS(ill, ill_other); 16224 /* 16225 * We are potentially throwing away phyint_flags which 16226 * could be different from the one that we obtain from 16227 * ill_other->ill_phyint. But it is okay as we are assuming 16228 * that the state maintained within IP is correct. 16229 */ 16230 mutex_enter(&phyi->phyint_lock); 16231 if (isv6) { 16232 ASSERT(phyi->phyint_illv6 == NULL); 16233 phyi->phyint_illv6 = ill; 16234 } else { 16235 ASSERT(phyi->phyint_illv4 == NULL); 16236 phyi->phyint_illv4 = ill; 16237 } 16238 16239 /* 16240 * Delete the old phyint and make its ipsq eligible 16241 * to be freed in ipsq_exit(). 
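 *
 * Concrete case (editor's addition, with a made-up name): if bge0 is
 * already plumbed for IPv4 and "ifconfig bge0 inet6 plumb" creates the
 * IPv6 ill, the avl_find() above locates bge0's existing phyint, so the
 * IPv6 ill adopts that phyint (along with its ipsq and ifindex) and the
 * freshly allocated phyi_old becomes garbage, disposed of here.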
16242 */ 16243 phyi_old->phyint_illv4 = NULL; 16244 phyi_old->phyint_illv6 = NULL; 16245 phyi_old->phyint_ipsq->ipsq_phyint = NULL; 16246 phyi_old->phyint_name[0] = '\0'; 16247 mi_free(phyi_old); 16248 } else { 16249 mutex_enter(&ill->ill_lock); 16250 /* 16251 * We don't need to acquire any lock, since 16252 * the ill is not yet visible globally and we 16253 * have not yet released the ill_g_lock. 16254 */ 16255 phyi = phyi_old; 16256 mutex_enter(&phyi->phyint_lock); 16257 /* XXX We need a recovery strategy here. */ 16258 if (!phyint_assign_ifindex(phyi, ipst)) 16259 cmn_err(CE_PANIC, "phyint_assign_ifindex() failed"); 16260 16261 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 16262 (void *)phyi, where); 16263 16264 (void) avl_find(&ipst->ips_phyint_g_list-> 16265 phyint_list_avl_by_index, 16266 &phyi->phyint_ifindex, &where); 16267 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 16268 (void *)phyi, where); 16269 } 16270 16271 /* 16272 * Reassigning ill_phyint automatically reassigns the ipsq as well. 16273 * The pending mp is not affected because that is kept on a per-ill basis. 16274 */ 16275 ill->ill_phyint = phyi; 16276 16277 /* 16278 * Now that the phyint's ifindex has been assigned, complete the 16279 * remaining MIB and multicast initialization, which depends on it. 16280 */ 16281 ill->ill_ip_mib->ipIfStatsIfIndex = ill->ill_phyint->phyint_ifindex; 16282 if (ill->ill_isv6) { 16283 ill->ill_icmp6_mib->ipv6IfIcmpIfIndex = 16284 ill->ill_phyint->phyint_ifindex; 16285 ill->ill_mcast_type = ipst->ips_mld_max_version; 16286 } else { 16287 ill->ill_mcast_type = ipst->ips_igmp_max_version; 16288 } 16289 16290 /* 16291 * Generate an event within the hooks framework to indicate that 16292 * a new interface has just been added to IP. For this event to 16293 * be generated, the network interface must, at least, have an 16294 * ifindex assigned to it. (We don't generate the event for 16295 * loopback since ill_lookup_on_name() has its own NE_PLUMB event.) 16296 * 16297 * This needs to be run inside the ill_g_lock perimeter to ensure 16298 * that the ordering of delivered events to listeners matches the 16299 * order of them in the kernel. 16300 */ 16301 if (!IS_LOOPBACK(ill)) { 16302 ill_nic_event_dispatch(ill, 0, NE_PLUMB, ill->ill_name, 16303 ill->ill_name_length); 16304 } 16305 RELEASE_ILL_LOCKS(ill, ill_other); 16306 mutex_exit(&phyi->phyint_lock); 16307 } 16308 16309 /* 16310 * Notify any downstream modules of the name of this interface. 16311 * An M_IOCTL is used even though we don't expect a successful reply. 16312 * Any reply message from the driver (presumably an M_IOCNAK) will 16313 * eventually get discarded somewhere upstream. The message format is 16314 * simply an SIOCSLIFNAME ioctl just as might be sent from ifconfig 16315 * to IP.
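 *
 * Sketch of the message built below (editor's addition): a two-block
 * chain, with the iocblk in the first block and the lifreq in the
 * second:
 *
 *	mp1 (M_IOCTL, iocblk: ioc_cmd = SIOCSLIFNAME,
 *	     ioc_count = sizeof (struct lifreq))
 *	  b_cont -> mp2 (lifreq: lifr_name = ill_name,
 *	     lifr_ppa = ill_ppa, lifr_flags = ILLF_IPV4|ILLF_IPV6 bits)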
16316 */ 16317 static void 16318 ip_ifname_notify(ill_t *ill, queue_t *q) 16319 { 16320 mblk_t *mp1, *mp2; 16321 struct iocblk *iocp; 16322 struct lifreq *lifr; 16323 16324 mp1 = mkiocb(SIOCSLIFNAME); 16325 if (mp1 == NULL) 16326 return; 16327 mp2 = allocb(sizeof (struct lifreq), BPRI_HI); 16328 if (mp2 == NULL) { 16329 freeb(mp1); 16330 return; 16331 } 16332 16333 mp1->b_cont = mp2; 16334 iocp = (struct iocblk *)mp1->b_rptr; 16335 iocp->ioc_count = sizeof (struct lifreq); 16336 16337 lifr = (struct lifreq *)mp2->b_rptr; 16338 mp2->b_wptr += sizeof (struct lifreq); 16339 bzero(lifr, sizeof (struct lifreq)); 16340 16341 (void) strncpy(lifr->lifr_name, ill->ill_name, LIFNAMSIZ); 16342 lifr->lifr_ppa = ill->ill_ppa; 16343 lifr->lifr_flags = (ill->ill_flags & (ILLF_IPV4|ILLF_IPV6)); 16344 16345 DTRACE_PROBE3(ill__dlpi, char *, "ip_ifname_notify", 16346 char *, "SIOCSLIFNAME", ill_t *, ill); 16347 putnext(q, mp1); 16348 } 16349 16350 static int 16351 ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) 16352 { 16353 int err; 16354 ip_stack_t *ipst = ill->ill_ipst; 16355 phyint_t *phyi = ill->ill_phyint; 16356 16357 /* Set the obsolete NDD per-interface forwarding name. */ 16358 err = ill_set_ndd_name(ill); 16359 if (err != 0) { 16360 cmn_err(CE_WARN, "ipif_set_values: ill_set_ndd_name (%d)\n", 16361 err); 16362 } 16363 16364 /* 16365 * Now that ill_name is set, the configuration for the IPMP 16366 * meta-interface can be performed. 16367 */ 16368 if (IS_IPMP(ill)) { 16369 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 16370 /* 16371 * If phyi->phyint_grp is NULL, then this is the first IPMP 16372 * meta-interface and we need to create the IPMP group. 16373 */ 16374 if (phyi->phyint_grp == NULL) { 16375 /* 16376 * If someone has renamed another IPMP group to have 16377 * the same name as our interface, bail. 16378 */ 16379 if (ipmp_grp_lookup(ill->ill_name, ipst) != NULL) { 16380 rw_exit(&ipst->ips_ipmp_lock); 16381 return (EEXIST); 16382 } 16383 phyi->phyint_grp = ipmp_grp_create(ill->ill_name, phyi); 16384 if (phyi->phyint_grp == NULL) { 16385 rw_exit(&ipst->ips_ipmp_lock); 16386 return (ENOMEM); 16387 } 16388 } 16389 rw_exit(&ipst->ips_ipmp_lock); 16390 } 16391 16392 /* Tell downstream modules where they are. */ 16393 ip_ifname_notify(ill, q); 16394 16395 /* 16396 * ill_dl_phys returns EINPROGRESS in the usual case. 16397 * Error cases are ENOMEM ... 16398 */ 16399 err = ill_dl_phys(ill, ipif, mp, q); 16400 16401 if (ill->ill_isv6) { 16402 mutex_enter(&ipst->ips_mld_slowtimeout_lock); 16403 if (ipst->ips_mld_slowtimeout_id == 0) { 16404 ipst->ips_mld_slowtimeout_id = timeout(mld_slowtimo, 16405 (void *)ipst, 16406 MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL)); 16407 } 16408 mutex_exit(&ipst->ips_mld_slowtimeout_lock); 16409 } else { 16410 mutex_enter(&ipst->ips_igmp_slowtimeout_lock); 16411 if (ipst->ips_igmp_slowtimeout_id == 0) { 16412 ipst->ips_igmp_slowtimeout_id = timeout(igmp_slowtimo, 16413 (void *)ipst, 16414 MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL)); 16415 } 16416 mutex_exit(&ipst->ips_igmp_slowtimeout_lock); 16417 } 16418 16419 return (err); 16420 } 16421 16422 /* 16423 * Common routine for ppa and ifname setting. Should be called exclusive. 16424 * 16425 * Returns EINPROGRESS when mp has been consumed by queueing it on 16426 * ipx_pending_mp and the ioctl will complete in ip_rput. 16427 * 16428 * NOTE : If ppa is UNIT_MAX, we assign the next valid ppa and return 16429 * the new name and new ppa in lifr_name and lifr_ppa respectively. 
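 *
 * Example (editor's addition, with a made-up name): for interf_name
 * "bge2" and *new_ppa_ptr == 2, ill_get_ppa_ptr() points at the "2",
 * stoi() confirms the trailing digits match, the string is temporarily
 * terminated just before the digits (and restored later), and
 * ill_ppa becomes 2. With *new_ppa_ptr == UINT_MAX the system instead
 * picks the next free ppa itself, as described above.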
For SLIFNAME, we pass these values back to the userland. 16431 */ 16432 static int 16433 ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr) 16434 { 16435 ill_t *ill; 16436 ipif_t *ipif; 16437 ipsq_t *ipsq; 16438 char *ppa_ptr; 16439 char *old_ptr; 16440 char old_char; 16441 int error; 16442 ip_stack_t *ipst; 16443 16444 ip1dbg(("ipif_set_values: interface %s\n", interf_name)); 16445 ASSERT(q->q_next != NULL); 16446 ASSERT(interf_name != NULL); 16447 16448 ill = (ill_t *)q->q_ptr; 16449 ipst = ill->ill_ipst; 16450 16451 ASSERT(ill->ill_ipst != NULL); 16452 ASSERT(ill->ill_name[0] == '\0'); 16453 ASSERT(IAM_WRITER_ILL(ill)); 16454 ASSERT((mi_strlen(interf_name) + 1) <= LIFNAMSIZ); 16455 ASSERT(ill->ill_ppa == UINT_MAX); 16456 16457 ill->ill_defend_start = ill->ill_defend_count = 0; 16458 /* The ppa is sent down by ifconfig or is chosen */ 16459 if ((ppa_ptr = ill_get_ppa_ptr(interf_name)) == NULL) { 16460 return (EINVAL); 16461 } 16462 16463 /* 16464 * Make sure the ppa passed in is the same as the ppa in the name. 16465 * This check is not made when ppa == UINT_MAX; in that case the ppa 16466 * in the name could be anything. The system will choose a ppa and 16467 * update new_ppa_ptr and interf_name to contain the chosen ppa. 16468 */ 16469 if (*new_ppa_ptr != UINT_MAX) { 16470 /* stoi changes the pointer */ 16471 old_ptr = ppa_ptr; 16472 /* 16473 * ifconfig passed in 0 for the ppa for DLPI 1 style devices 16474 * (they don't have an externally visible ppa). We assign one 16475 * here so that we can manage the interface. Note that in 16476 * the past this value was always 0 for DLPI 1 drivers. 16477 */ 16478 if (*new_ppa_ptr == 0) 16479 *new_ppa_ptr = stoi(&old_ptr); 16480 else if (*new_ppa_ptr != (uint_t)stoi(&old_ptr)) 16481 return (EINVAL); 16482 } 16483 /* 16484 * Terminate the string before the ppa and 16485 * save the char at that location. 16486 */ 16487 old_char = ppa_ptr[0]; 16488 ppa_ptr[0] = '\0'; 16489 16490 ill->ill_ppa = *new_ppa_ptr; 16491 /* 16492 * Finish as much work now as possible before calling ill_glist_insert 16493 * which makes the ill globally visible and also merges it with the 16494 * other protocol instance of this phyint. The remaining work is 16495 * done after entering the ipsq which may happen sometime later. 16496 * ill_set_ndd_name occurs after the ill has been made globally visible. 16497 */ 16498 ipif = ill->ill_ipif; 16499 16500 /* We didn't do this when we allocated ipif in ip_ll_subnet_defaults */ 16501 ipif_assign_seqid(ipif); 16502 16503 if (!(ill->ill_flags & (ILLF_IPV4|ILLF_IPV6))) 16504 ill->ill_flags |= ILLF_IPV4; 16505 16506 ASSERT(ipif->ipif_next == NULL); /* Only one ipif on ill */ 16507 ASSERT((ipif->ipif_flags & IPIF_UP) == 0); 16508 16509 if (ill->ill_flags & ILLF_IPV6) { 16510 16511 ill->ill_isv6 = B_TRUE; 16512 ill_set_inputfn(ill); 16513 if (ill->ill_rq != NULL) { 16514 ill->ill_rq->q_qinfo = &iprinitv6; 16515 } 16516 16517 /* Keep the !IN6_IS_ADDR_V4MAPPED assertions happy */ 16518 ipif->ipif_v6lcl_addr = ipv6_all_zeros; 16519 ipif->ipif_v6subnet = ipv6_all_zeros; 16520 ipif->ipif_v6net_mask = ipv6_all_zeros; 16521 ipif->ipif_v6brd_addr = ipv6_all_zeros; 16522 ipif->ipif_v6pp_dst_addr = ipv6_all_zeros; 16523 ill->ill_reachable_retrans_time = ND_RETRANS_TIMER; 16524 /* 16525 * Point-to-point or non-multicast-capable 16526 * interfaces won't do NUD unless explicitly 16527 * configured to do so.
16528 */ 16529 if (ipif->ipif_flags & IPIF_POINTOPOINT || 16530 !(ill->ill_flags & ILLF_MULTICAST)) { 16531 ill->ill_flags |= ILLF_NONUD; 16532 } 16533 /* Make sure the IPv4-specific flag is not set on an IPv6 ill */ 16534 if (ill->ill_flags & ILLF_NOARP) { 16535 /* 16536 * Note: xresolv interfaces will eventually need 16537 * NOARP set here as well, but that will require 16538 * those external resolvers to have some 16539 * knowledge of that flag and act appropriately. 16540 * Not to be changed at present. 16541 */ 16542 ill->ill_flags &= ~ILLF_NOARP; 16543 } 16544 /* 16545 * Set the ILLF_ROUTER flag according to the global 16546 * IPv6 forwarding policy. 16547 */ 16548 if (ipst->ips_ipv6_forward != 0) 16549 ill->ill_flags |= ILLF_ROUTER; 16550 } else if (ill->ill_flags & ILLF_IPV4) { 16551 ill->ill_isv6 = B_FALSE; 16552 ill_set_inputfn(ill); 16553 ill->ill_reachable_retrans_time = ARP_RETRANS_TIMER; 16554 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6lcl_addr); 16555 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6subnet); 16556 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6net_mask); 16557 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6brd_addr); 16558 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6pp_dst_addr); 16559 /* 16560 * Set the ILLF_ROUTER flag according to the global 16561 * IPv4 forwarding policy. 16562 */ 16563 if (ipst->ips_ip_g_forward != 0) 16564 ill->ill_flags |= ILLF_ROUTER; 16565 } 16566 16567 ASSERT(ill->ill_phyint != NULL); 16568 16569 /* 16570 * The ipIfStatsIfindex and ipv6IfIcmpIfIndex assignments will 16571 * be completed in ill_glist_insert -> ill_phyint_reinit 16572 */ 16573 if (!ill_allocate_mibs(ill)) 16574 return (ENOMEM); 16575 16576 /* 16577 * Pick a default sap until we get the DL_INFO_ACK back from 16578 * the driver. 16579 */ 16580 ill->ill_sap = (ill->ill_isv6) ? ill->ill_media->ip_m_ipv6sap : 16581 ill->ill_media->ip_m_ipv4sap; 16582 16583 ill->ill_ifname_pending = 1; 16584 ill->ill_ifname_pending_err = 0; 16585 16586 /* 16587 * When the first ipif comes up in ipif_up_done(), multicast groups 16588 * that were joined while this ill was not bound to the DLPI link need 16589 * to be recovered by ill_recover_multicast(). 16590 */ 16591 ill->ill_need_recover_multicast = 1; 16592 16593 ill_refhold(ill); 16594 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 16595 if ((error = ill_glist_insert(ill, interf_name, 16596 (ill->ill_flags & ILLF_IPV6) == ILLF_IPV6)) > 0) { 16597 ill->ill_ppa = UINT_MAX; 16598 ill->ill_name[0] = '\0'; 16599 /* 16600 * undo null termination done above. 16601 */ 16602 ppa_ptr[0] = old_char; 16603 rw_exit(&ipst->ips_ill_g_lock); 16604 ill_refrele(ill); 16605 return (error); 16606 } 16607 16608 ASSERT(ill->ill_name_length <= LIFNAMSIZ); 16609 16610 /* 16611 * When we return, the buffer pointed to by interf_name should contain 16612 * the same name as in ill_name. 16613 * If a ppa was chosen by the system (the ppa passed in was UINT_MAX), 16614 * the buffer pointed to by new_ppa_ptr would not contain the right ppa, 16615 * so copy the full name and update the ppa ptr. 16616 * When the ppa passed in != UINT_MAX, all values are correct; just undo 16617 * the null termination, which saves a bcopy. 16618 */ 16619 if (*new_ppa_ptr == UINT_MAX) { 16620 bcopy(ill->ill_name, interf_name, ill->ill_name_length); 16621 *new_ppa_ptr = ill->ill_ppa; 16622 } else { 16623 /* 16624 * undo null termination done above.
16625 */ 16626 ppa_ptr[0] = old_char; 16627 } 16628 16629 /* Let SCTP know about this ILL */ 16630 sctp_update_ill(ill, SCTP_ILL_INSERT); 16631 16632 /* 16633 * ill_glist_insert has made the ill visible globally, and 16634 * ill_phyint_reinit could have changed the ipsq. At this point, 16635 * we need to hold the ips_ill_g_lock across the call to enter the 16636 * ipsq to enforce atomicity and prevent reordering. In the event 16637 * the ipsq has changed, and if the new ipsq is currently busy, 16638 * we need to make sure that this half-completed ioctl is ahead of 16639 * any subsequent ioctl. We achieve this by not dropping the 16640 * ips_ill_g_lock which prevents any ill lookup itself thereby 16641 * ensuring that new ioctls can't start. 16642 */ 16643 ipsq = ipsq_try_enter_internal(ill, q, mp, ip_reprocess_ioctl, NEW_OP, 16644 B_TRUE); 16645 16646 rw_exit(&ipst->ips_ill_g_lock); 16647 ill_refrele(ill); 16648 if (ipsq == NULL) 16649 return (EINPROGRESS); 16650 16651 /* 16652 * If ill_phyint_reinit() changed our ipsq, then start on the new ipsq. 16653 */ 16654 if (ipsq->ipsq_xop->ipx_current_ipif == NULL) 16655 ipsq_current_start(ipsq, ipif, SIOCSLIFNAME); 16656 else 16657 ASSERT(ipsq->ipsq_xop->ipx_current_ipif == ipif); 16658 16659 error = ipif_set_values_tail(ill, ipif, mp, q); 16660 ipsq_exit(ipsq); 16661 if (error != 0 && error != EINPROGRESS) { 16662 /* 16663 * restore previous values 16664 */ 16665 ill->ill_isv6 = B_FALSE; 16666 ill_set_inputfn(ill); 16667 } 16668 return (error); 16669 } 16670 16671 void 16672 ipif_init(ip_stack_t *ipst) 16673 { 16674 int i; 16675 16676 for (i = 0; i < MAX_G_HEADS; i++) { 16677 ipst->ips_ill_g_heads[i].ill_g_list_head = 16678 (ill_if_t *)&ipst->ips_ill_g_heads[i]; 16679 ipst->ips_ill_g_heads[i].ill_g_list_tail = 16680 (ill_if_t *)&ipst->ips_ill_g_heads[i]; 16681 } 16682 16683 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 16684 ill_phyint_compare_index, 16685 sizeof (phyint_t), 16686 offsetof(struct phyint, phyint_avl_by_index)); 16687 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 16688 ill_phyint_compare_name, 16689 sizeof (phyint_t), 16690 offsetof(struct phyint, phyint_avl_by_name)); 16691 } 16692 16693 /* 16694 * Save enough information so that we can recreate the IRE if 16695 * the interface goes down and then up. 
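 *
 * For instance (editor's addition): when an interface route is added,
 * the ifrt_t stashed below captures ire_type, the address, mask,
 * gateway and setsrc addresses, the flags and the zoneid -- enough to
 * rebuild an equivalent IRE when the interface comes back up, without
 * consulting the long-gone original request.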
16696 */ 16697 void 16698 ill_save_ire(ill_t *ill, ire_t *ire) 16699 { 16700 mblk_t *save_mp; 16701 16702 save_mp = allocb(sizeof (ifrt_t), BPRI_MED); 16703 if (save_mp != NULL) { 16704 ifrt_t *ifrt; 16705 16706 save_mp->b_wptr += sizeof (ifrt_t); 16707 ifrt = (ifrt_t *)save_mp->b_rptr; 16708 bzero(ifrt, sizeof (ifrt_t)); 16709 ifrt->ifrt_type = ire->ire_type; 16710 if (ire->ire_ipversion == IPV4_VERSION) { 16711 ASSERT(!ill->ill_isv6); 16712 ifrt->ifrt_addr = ire->ire_addr; 16713 ifrt->ifrt_gateway_addr = ire->ire_gateway_addr; 16714 ifrt->ifrt_setsrc_addr = ire->ire_setsrc_addr; 16715 ifrt->ifrt_mask = ire->ire_mask; 16716 } else { 16717 ASSERT(ill->ill_isv6); 16718 ifrt->ifrt_v6addr = ire->ire_addr_v6; 16719 /* ire_gateway_addr_v6 can change due to RTM_CHANGE */ 16720 mutex_enter(&ire->ire_lock); 16721 ifrt->ifrt_v6gateway_addr = ire->ire_gateway_addr_v6; 16722 mutex_exit(&ire->ire_lock); 16723 ifrt->ifrt_v6setsrc_addr = ire->ire_setsrc_addr_v6; 16724 ifrt->ifrt_v6mask = ire->ire_mask_v6; 16725 } 16726 ifrt->ifrt_flags = ire->ire_flags; 16727 ifrt->ifrt_zoneid = ire->ire_zoneid; 16728 mutex_enter(&ill->ill_saved_ire_lock); 16729 save_mp->b_cont = ill->ill_saved_ire_mp; 16730 ill->ill_saved_ire_mp = save_mp; 16731 ill->ill_saved_ire_cnt++; 16732 mutex_exit(&ill->ill_saved_ire_lock); 16733 } 16734 } 16735 16736 /* 16737 * Remove one entry from ill_saved_ire_mp. 16738 */ 16739 void 16740 ill_remove_saved_ire(ill_t *ill, ire_t *ire) 16741 { 16742 mblk_t **mpp; 16743 mblk_t *mp; 16744 ifrt_t *ifrt; 16745 16746 /* Remove from ill_saved_ire_mp list if it is there */ 16747 mutex_enter(&ill->ill_saved_ire_lock); 16748 for (mpp = &ill->ill_saved_ire_mp; *mpp != NULL; 16749 mpp = &(*mpp)->b_cont) { 16750 in6_addr_t gw_addr_v6; 16751 16752 /* 16753 * On a given ill, the tuple of address, gateway, mask, 16754 * ire_type, and zoneid is unique for each saved IRE. 16755 */ 16756 mp = *mpp; 16757 ifrt = (ifrt_t *)mp->b_rptr; 16758 /* ire_gateway_addr_v6 can change - need lock */ 16759 mutex_enter(&ire->ire_lock); 16760 gw_addr_v6 = ire->ire_gateway_addr_v6; 16761 mutex_exit(&ire->ire_lock); 16762 16763 if (ifrt->ifrt_zoneid != ire->ire_zoneid || 16764 ifrt->ifrt_type != ire->ire_type) 16765 continue; 16766 16767 if (ill->ill_isv6 ? 16768 (IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6addr, 16769 &ire->ire_addr_v6) && 16770 IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6gateway_addr, 16771 &gw_addr_v6) && 16772 IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6mask, 16773 &ire->ire_mask_v6)) : 16774 (ifrt->ifrt_addr == ire->ire_addr && 16775 ifrt->ifrt_gateway_addr == ire->ire_gateway_addr && 16776 ifrt->ifrt_mask == ire->ire_mask)) { 16777 *mpp = mp->b_cont; 16778 ill->ill_saved_ire_cnt--; 16779 freeb(mp); 16780 break; 16781 } 16782 } 16783 mutex_exit(&ill->ill_saved_ire_lock); 16784 } 16785 16786 /* 16787 * IP multirouting broadcast routes handling 16788 * Append CGTP broadcast IREs to regular ones created 16789 * at ifconfig time. 16790 * The usage is a route add <cgtp_bc> <nic_bc> -multirt i.e., both 16791 * the destination and the gateway are broadcast addresses. 16792 * The caller has verified that the destination is an IRE_BROADCAST and that 16793 * RTF_MULTIRT was set. Here if the gateway is a broadcast address, then 16794 * we create a MULTIRT IRE_BROADCAST. 16795 * Note that the IRE_HOST created by ire_rt_add doesn't get found by anything 16796 * since the IRE_BROADCAST takes precedence; ire_add_v4 does head insertion. 
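 *
 * For example (addresses purely illustrative), with an underlying NIC
 * on 192.168.10.0/24 an administrator could add the redundant CGTP
 * broadcast route with:
 *
 *	route add 192.168.20.255 192.168.10.255 -multirt
 *
 * where 192.168.20.255 is the CGTP broadcast destination and
 * 192.168.10.255 is the broadcast address of the NIC.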
16797 */ 16798 static void 16799 ip_cgtp_bcast_add(ire_t *ire, ip_stack_t *ipst) 16800 { 16801 ire_t *ire_prim; 16802 16803 ASSERT(ire != NULL); 16804 16805 ire_prim = ire_ftable_lookup_v4(ire->ire_gateway_addr, 0, 0, 16806 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst, 16807 NULL); 16808 if (ire_prim != NULL) { 16809 /* 16810 * We are in the special case of broadcasts for 16811 * CGTP. We add an IRE_BROADCAST that holds 16812 * the RTF_MULTIRT flag, the destination 16813 * address and the low level 16814 * info of ire_prim. In other words, CGTP 16815 * broadcast is added to the redundant ipif. 16816 */ 16817 ill_t *ill_prim; 16818 ire_t *bcast_ire; 16819 16820 ill_prim = ire_prim->ire_ill; 16821 16822 ip2dbg(("ip_cgtp_filter_bcast_add: ire_prim %p, ill_prim %p\n", 16823 (void *)ire_prim, (void *)ill_prim)); 16824 16825 bcast_ire = ire_create( 16826 (uchar_t *)&ire->ire_addr, 16827 (uchar_t *)&ip_g_all_ones, 16828 (uchar_t *)&ire->ire_gateway_addr, 16829 IRE_BROADCAST, 16830 ill_prim, 16831 GLOBAL_ZONEID, /* CGTP is only for the global zone */ 16832 ire->ire_flags | RTF_KERNEL, 16833 NULL, 16834 ipst); 16835 16836 /* 16837 * Here we assume that ire_add does head insertion so that 16838 * the added IRE_BROADCAST comes before the existing IRE_HOST. 16839 */ 16840 if (bcast_ire != NULL) { 16841 if (ire->ire_flags & RTF_SETSRC) { 16842 bcast_ire->ire_setsrc_addr = 16843 ire->ire_setsrc_addr; 16844 } 16845 bcast_ire = ire_add(bcast_ire); 16846 if (bcast_ire != NULL) { 16847 ip2dbg(("ip_cgtp_filter_bcast_add: " 16848 "added bcast_ire %p\n", 16849 (void *)bcast_ire)); 16850 16851 ill_save_ire(ill_prim, bcast_ire); 16852 ire_refrele(bcast_ire); 16853 } 16854 } 16855 ire_refrele(ire_prim); 16856 } 16857 } 16858 16859 /* 16860 * IP multirouting broadcast routes handling 16861 * Remove the broadcast ire. 16862 * The usage is a route delete <cgtp_bc> <nic_bc> -multirt i.e., both 16863 * the destination and the gateway are broadcast addresses. 16864 * The caller has only verified that RTF_MULTIRT was set. We check 16865 * that the destination is broadcast and that the gateway is a broadcast 16866 * address, and if so delete the IRE added by ip_cgtp_bcast_add(). 16867 */ 16868 static void 16869 ip_cgtp_bcast_delete(ire_t *ire, ip_stack_t *ipst) 16870 { 16871 ASSERT(ire != NULL); 16872 16873 if (ip_type_v4(ire->ire_addr, ipst) == IRE_BROADCAST) { 16874 ire_t *ire_prim; 16875 16876 ire_prim = ire_ftable_lookup_v4(ire->ire_gateway_addr, 0, 0, 16877 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, 16878 ipst, NULL); 16879 if (ire_prim != NULL) { 16880 ill_t *ill_prim; 16881 ire_t *bcast_ire; 16882 16883 ill_prim = ire_prim->ire_ill; 16884 16885 ip2dbg(("ip_cgtp_filter_bcast_delete: " 16886 "ire_prim %p, ill_prim %p\n", 16887 (void *)ire_prim, (void *)ill_prim)); 16888 16889 bcast_ire = ire_ftable_lookup_v4(ire->ire_addr, 0, 16890 ire->ire_gateway_addr, IRE_BROADCAST, 16891 ill_prim, ALL_ZONES, NULL, 16892 MATCH_IRE_TYPE | MATCH_IRE_GW | MATCH_IRE_ILL | 16893 MATCH_IRE_MASK, 0, ipst, NULL); 16894 16895 if (bcast_ire != NULL) { 16896 ip2dbg(("ip_cgtp_filter_bcast_delete: " 16897 "looked up bcast_ire %p\n", 16898 (void *)bcast_ire)); 16899 ill_remove_saved_ire(bcast_ire->ire_ill, 16900 bcast_ire); 16901 ire_delete(bcast_ire); 16902 ire_refrele(bcast_ire); 16903 } 16904 ire_refrele(ire_prim); 16905 } 16906 } 16907 } 16908 16909 /* 16910 * Derive an interface id from the link layer address. 16911 * Knows about IEEE 802 and IEEE EUI-64 mappings. 
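 *
 * For example (MAC address purely illustrative), the Ethernet address
 * 00:14:4f:12:34:56 yields the interface id 0214:4fff:fe12:3456:
 * ff:fe is inserted between the OUI and the serial number, and the
 * universal/local bit in the first octet is inverted, exactly as
 * ip_ether_v6intfid() below does.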
16912 */
16913 static void
16914 ip_ether_v6intfid(ill_t *ill, in6_addr_t *v6addr)
16915 {
16916 char *addr;
16917
16918 /*
16919 * Note that some IPv6 interfaces get plumbed over links that claim to
16920 * be DL_ETHER, but don't actually have Ethernet MAC addresses (e.g.
16921 * PPP links). The ETHERADDRL check here ensures that we only set the
16922 * interface ID on IPv6 interfaces above links that actually have real
16923 * Ethernet addresses.
16924 */
16925 if (ill->ill_phys_addr_length == ETHERADDRL) {
16926 /* Form an EUI-64-like address */
16927 addr = (char *)&v6addr->s6_addr32[2];
16928 bcopy(ill->ill_phys_addr, addr, 3);
16929 addr[0] ^= 0x2; /* Toggle Universal/Local bit */
16930 addr[3] = (char)0xff;
16931 addr[4] = (char)0xfe;
16932 bcopy(ill->ill_phys_addr + 3, addr + 5, 3);
16933 }
16934 }
16935
16936 /* ARGSUSED */
16937 static void
16938 ip_nodef_v6intfid(ill_t *ill, in6_addr_t *v6addr)
16939 {
16940 }
16941
16942 typedef struct ipmp_ifcookie {
16943 uint32_t ic_hostid;
16944 char ic_ifname[LIFNAMSIZ];
16945 char ic_zonename[ZONENAME_MAX];
16946 } ipmp_ifcookie_t;
16947
16948 /*
16949 * Construct a pseudo-random interface ID for the IPMP interface that's both
16950 * predictable and (almost) guaranteed to be unique.
16951 */
16952 static void
16953 ip_ipmp_v6intfid(ill_t *ill, in6_addr_t *v6addr)
16954 {
16955 zone_t *zp;
16956 uint8_t *addr;
16957 uchar_t hash[16];
16958 ulong_t hostid;
16959 MD5_CTX ctx;
16960 ipmp_ifcookie_t ic = { 0 };
16961
16962 ASSERT(IS_IPMP(ill));
16963
16964 (void) ddi_strtoul(hw_serial, NULL, 10, &hostid);
16965 ic.ic_hostid = htonl((uint32_t)hostid);
16966
16967 (void) strlcpy(ic.ic_ifname, ill->ill_name, LIFNAMSIZ);
16968
16969 if ((zp = zone_find_by_id(ill->ill_zoneid)) != NULL) {
16970 (void) strlcpy(ic.ic_zonename, zp->zone_name, ZONENAME_MAX);
16971 zone_rele(zp);
16972 }
16973
16974 MD5Init(&ctx);
16975 MD5Update(&ctx, &ic, sizeof (ic));
16976 MD5Final(hash, &ctx);
16977
16978 /*
16979 * Map the hash to an interface ID per the basic approach in RFC3041.
16980 */
16981 addr = &v6addr->s6_addr8[8];
16982 bcopy(hash + 8, addr, sizeof (uint64_t));
16983 addr[0] &= ~0x2; /* clear universal bit: locally administered */
16984 }
16985
16986 /*
16987 * Map the multicast in6_addr_t in m_ip6addr to the physaddr for ethernet.
16988 */
16989 static void
16990 ip_ether_v6_mapping(ill_t *ill, uchar_t *m_ip6addr, uchar_t *m_physaddr)
16991 {
16992 phyint_t *phyi = ill->ill_phyint;
16993
16994 /*
16995 * Check PHYI_MULTI_BCAST and the length of the physical
16996 * address to determine if we use the mapping or the
16997 * broadcast address.
16998 */
16999 if ((phyi->phyint_flags & PHYI_MULTI_BCAST) != 0 ||
17000 ill->ill_phys_addr_length != ETHERADDRL) {
17001 ip_mbcast_mapping(ill, m_ip6addr, m_physaddr);
17002 return;
17003 }
17004 m_physaddr[0] = 0x33;
17005 m_physaddr[1] = 0x33;
17006 m_physaddr[2] = m_ip6addr[12];
17007 m_physaddr[3] = m_ip6addr[13];
17008 m_physaddr[4] = m_ip6addr[14];
17009 m_physaddr[5] = m_ip6addr[15];
17010 }
17011
17012 /*
17013 * Map the multicast ipaddr_t in m_ipaddr to the physaddr for ethernet.
17014 */
17015 static void
17016 ip_ether_v4_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr)
17017 {
17018 phyint_t *phyi = ill->ill_phyint;
17019
17020 /*
17021 * Check PHYI_MULTI_BCAST and the length of the physical
17022 * address to determine if we use the mapping or the
17023 * broadcast address.
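 *
 * For example (group purely illustrative), 224.1.2.3 maps to
 * 01:00:5e:01:02:03. Only the low 23 bits of the group address
 * survive (m_ipaddr[1] is masked with 0x7f), so 32 IPv4 groups
 * share each Ethernet multicast address.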
17024 */ 17025 if ((phyi->phyint_flags & PHYI_MULTI_BCAST) != 0 || 17026 ill->ill_phys_addr_length != ETHERADDRL) { 17027 ip_mbcast_mapping(ill, m_ipaddr, m_physaddr); 17028 return; 17029 } 17030 m_physaddr[0] = 0x01; 17031 m_physaddr[1] = 0x00; 17032 m_physaddr[2] = 0x5e; 17033 m_physaddr[3] = m_ipaddr[1] & 0x7f; 17034 m_physaddr[4] = m_ipaddr[2]; 17035 m_physaddr[5] = m_ipaddr[3]; 17036 } 17037 17038 /* ARGSUSED */ 17039 static void 17040 ip_mbcast_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr) 17041 { 17042 /* 17043 * for the MULTI_BCAST case and other cases when we want to 17044 * use the link-layer broadcast address for multicast. 17045 */ 17046 uint8_t *bphys_addr; 17047 dl_unitdata_req_t *dlur; 17048 17049 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr; 17050 if (ill->ill_sap_length < 0) { 17051 bphys_addr = (uchar_t *)dlur + 17052 dlur->dl_dest_addr_offset; 17053 } else { 17054 bphys_addr = (uchar_t *)dlur + 17055 dlur->dl_dest_addr_offset + ill->ill_sap_length; 17056 } 17057 17058 bcopy(bphys_addr, m_physaddr, ill->ill_phys_addr_length); 17059 } 17060 17061 /* 17062 * Derive IPoIB interface id from the link layer address. 17063 */ 17064 static void 17065 ip_ib_v6intfid(ill_t *ill, in6_addr_t *v6addr) 17066 { 17067 char *addr; 17068 17069 ASSERT(ill->ill_phys_addr_length == 20); 17070 addr = (char *)&v6addr->s6_addr32[2]; 17071 bcopy(ill->ill_phys_addr + 12, addr, 8); 17072 /* 17073 * In IBA 1.1 timeframe, some vendors erroneously set the u/l bit 17074 * in the globally assigned EUI-64 GUID to 1, in violation of IEEE 17075 * rules. In these cases, the IBA considers these GUIDs to be in 17076 * "Modified EUI-64" format, and thus toggling the u/l bit is not 17077 * required; vendors are required not to assign global EUI-64's 17078 * that differ only in u/l bit values, thus guaranteeing uniqueness 17079 * of the interface identifier. Whether the GUID is in modified 17080 * or proper EUI-64 format, the ipv6 identifier must have the u/l 17081 * bit set to 1. 17082 */ 17083 addr[0] |= 2; /* Set Universal/Local bit to 1 */ 17084 } 17085 17086 /* 17087 * Map the multicast ipaddr_t in m_ipaddr to the physaddr for InfiniBand. 17088 * Note on mapping from multicast IP addresses to IPoIB multicast link 17089 * addresses. IPoIB multicast link addresses are based on IBA link addresses. 17090 * The format of an IPoIB multicast address is: 17091 * 17092 * 4 byte QPN Scope Sign. Pkey 17093 * +--------------------------------------------+ 17094 * | 00FFFFFF | FF | 1X | X01B | Pkey | GroupID | 17095 * +--------------------------------------------+ 17096 * 17097 * The Scope and Pkey components are properties of the IBA port and 17098 * network interface. They can be ascertained from the broadcast address. 17099 * The Sign. part is the signature, and is 401B for IPv4 and 601B for IPv6. 17100 */ 17101 static void 17102 ip_ib_v4_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr) 17103 { 17104 static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff, 17105 0xff, 0x10, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, 17106 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; 17107 uint8_t *bphys_addr; 17108 dl_unitdata_req_t *dlur; 17109 17110 bcopy(ipv4_g_phys_ibmulti_addr, m_physaddr, ill->ill_phys_addr_length); 17111 17112 /* 17113 * RFC 4391: IPv4 MGID is 28-bit long. 
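 * Only the low 28 bits of the group address are copied below; e.g.,
 * for the (illustrative) group 224.1.2.3, bytes 16-19 of the MGID
 * become 0x00 0x01 0x02 0x03, the top four bits of the first octet
 * having been masked off.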
17114 */
17115 m_physaddr[16] = m_ipaddr[0] & 0x0f;
17116 m_physaddr[17] = m_ipaddr[1];
17117 m_physaddr[18] = m_ipaddr[2];
17118 m_physaddr[19] = m_ipaddr[3];
17119
17120
17121 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
17122 if (ill->ill_sap_length < 0) {
17123 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset;
17124 } else {
17125 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset +
17126 ill->ill_sap_length;
17127 }
17128 /*
17129 * Now fill in the IBA scope/Pkey values from the broadcast address.
17130 */
17131 m_physaddr[5] = bphys_addr[5];
17132 m_physaddr[8] = bphys_addr[8];
17133 m_physaddr[9] = bphys_addr[9];
17134 }
17135
17136 static void
17137 ip_ib_v6_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr)
17138 {
17139 static uint8_t ipv6_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff,
17140 0xff, 0x10, 0x60, 0x1b, 0x00, 0x00, 0x00, 0x00,
17141 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
17142 uint8_t *bphys_addr;
17143 dl_unitdata_req_t *dlur;
17144
17145 bcopy(ipv6_g_phys_ibmulti_addr, m_physaddr, ill->ill_phys_addr_length);
17146
17147 /*
17148 * RFC 4391: IPv6 MGID is 80-bit long.
17149 */
17150 bcopy(&m_ipaddr[6], &m_physaddr[10], 10);
17151
17152 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
17153 if (ill->ill_sap_length < 0) {
17154 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset;
17155 } else {
17156 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset +
17157 ill->ill_sap_length;
17158 }
17159 /*
17160 * Now fill in the IBA scope/Pkey values from the broadcast address.
17161 */
17162 m_physaddr[5] = bphys_addr[5];
17163 m_physaddr[8] = bphys_addr[8];
17164 m_physaddr[9] = bphys_addr[9];
17165 }
17166
17167 /*
17168 * Derive IPv6 interface id from an IPv4 link-layer address (e.g. from an IPv4
17169 * tunnel). The IPv4 address simply gets placed in the lower 4 bytes of the
17170 * IPv6 interface id. This is a suggested mechanism described in section 3.7
17171 * of RFC4213.
17172 */
17173 static void
17174 ip_ipv4_genv6intfid(ill_t *ill, uint8_t *physaddr, in6_addr_t *v6addr)
17175 {
17176 ASSERT(ill->ill_phys_addr_length == sizeof (ipaddr_t));
17177 v6addr->s6_addr32[2] = 0;
17178 bcopy(physaddr, &v6addr->s6_addr32[3], sizeof (ipaddr_t));
17179 }
17180
17181 /*
17182 * Derive IPv6 interface id from an IPv6 link-layer address (e.g. from an IPv6
17183 * tunnel). The lower 8 bytes of the IPv6 address simply become the interface
17184 * id.
17185 */
17186 static void
17187 ip_ipv6_genv6intfid(ill_t *ill, uint8_t *physaddr, in6_addr_t *v6addr)
17188 {
17189 in6_addr_t *v6lladdr = (in6_addr_t *)physaddr;
17190
17191 ASSERT(ill->ill_phys_addr_length == sizeof (in6_addr_t));
17192 bcopy(&v6lladdr->s6_addr32[2], &v6addr->s6_addr32[2], 8);
17193 }
17194
17195 static void
17196 ip_ipv6_v6intfid(ill_t *ill, in6_addr_t *v6addr)
17197 {
17198 ip_ipv6_genv6intfid(ill, ill->ill_phys_addr, v6addr);
17199 }
17200
17201 static void
17202 ip_ipv6_v6destintfid(ill_t *ill, in6_addr_t *v6addr)
17203 {
17204 ip_ipv6_genv6intfid(ill, ill->ill_dest_addr, v6addr);
17205 }
17206
17207 static void
17208 ip_ipv4_v6intfid(ill_t *ill, in6_addr_t *v6addr)
17209 {
17210 ip_ipv4_genv6intfid(ill, ill->ill_phys_addr, v6addr);
17211 }
17212
17213 static void
17214 ip_ipv4_v6destintfid(ill_t *ill, in6_addr_t *v6addr)
17215 {
17216 ip_ipv4_genv6intfid(ill, ill->ill_dest_addr, v6addr);
17217 }
17218
17219 /*
17220 * Look up an ill and verify that the zoneid has an ipif on that ill.
17221 * Returns a held ill, or NULL.
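 * The caller must ill_refrele() the returned ill when done with it.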
17222 */
17223 ill_t *
17224 ill_lookup_on_ifindex_zoneid(uint_t index, zoneid_t zoneid, boolean_t isv6,
17225 ip_stack_t *ipst)
17226 {
17227 ill_t *ill;
17228 ipif_t *ipif;
17229
17230 ill = ill_lookup_on_ifindex(index, isv6, ipst);
17231 if (ill == NULL)
17232 return (NULL);
17233
17234 mutex_enter(&ill->ill_lock);
17235 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
17236 if (IPIF_IS_CONDEMNED(ipif))
17237 continue;
17238 if (zoneid != ALL_ZONES && ipif->ipif_zoneid != zoneid &&
17239 ipif->ipif_zoneid != ALL_ZONES)
17240 continue;
17241
17242 mutex_exit(&ill->ill_lock);
17243 return (ill);
17244 }
17245 mutex_exit(&ill->ill_lock);
17246 ill_refrele(ill);
17247 return (NULL);
17248 }
17249
17250 /*
17251 * Return a pointer to an ipif_t given a combination of (ill_idx, ipif_id).
17252 * If a pointer to an ipif_t is returned then the caller will need to do
17253 * an ipif_refrele() (the ill itself is released here before returning).
17254 */
17255 ipif_t *
17256 ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6,
17257 ip_stack_t *ipst)
17258 {
17259 ipif_t *ipif;
17260 ill_t *ill;
17261
17262 ill = ill_lookup_on_ifindex(ifindex, isv6, ipst);
17263 if (ill == NULL)
17264 return (NULL);
17265
17266 mutex_enter(&ill->ill_lock);
17267 if (ill->ill_state_flags & ILL_CONDEMNED) {
17268 mutex_exit(&ill->ill_lock);
17269 ill_refrele(ill);
17270 return (NULL);
17271 }
17272
17273 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
17274 if (!IPIF_CAN_LOOKUP(ipif))
17275 continue;
17276 if (lifidx == ipif->ipif_id) {
17277 ipif_refhold_locked(ipif);
17278 break;
17279 }
17280 }
17281
17282 mutex_exit(&ill->ill_lock);
17283 ill_refrele(ill);
17284 return (ipif);
17285 }
17286
17287 /*
17288 * Set ill_inputfn based on the current known state.
17289 * This needs to be called when any of the factors taken into
17290 * account changes.
17291 */
17292 void
17293 ill_set_inputfn(ill_t *ill)
17294 {
17295 ip_stack_t *ipst = ill->ill_ipst;
17296
17297 if (ill->ill_isv6) {
17298 if (is_system_labeled())
17299 ill->ill_inputfn = ill_input_full_v6;
17300 else
17301 ill->ill_inputfn = ill_input_short_v6;
17302 } else {
17303 if (is_system_labeled())
17304 ill->ill_inputfn = ill_input_full_v4;
17305 else if (ill->ill_dhcpinit != 0)
17306 ill->ill_inputfn = ill_input_full_v4;
17307 else if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head
17308 != NULL)
17309 ill->ill_inputfn = ill_input_full_v4;
17310 else if (ipst->ips_ip_cgtp_filter &&
17311 ipst->ips_ip_cgtp_filter_ops != NULL)
17312 ill->ill_inputfn = ill_input_full_v4;
17313 else
17314 ill->ill_inputfn = ill_input_short_v4;
17315 }
17316 }
17317
17318 /*
17319 * Re-evaluate ill_inputfn for all the IPv4 ills.
17320 * Used when RSVP and CGTP come and go.
17321 */
17322 void
17323 ill_set_inputfn_all(ip_stack_t *ipst)
17324 {
17325 ill_walk_context_t ctx;
17326 ill_t *ill;
17327
17328 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
17329 ill = ILL_START_WALK_V4(&ctx, ipst);
17330 for (; ill != NULL; ill = ill_next(&ctx, ill))
17331 ill_set_inputfn(ill);
17332
17333 rw_exit(&ipst->ips_ill_g_lock);
17334 }
17335
17336 /*
17337 * Set the physical address information for `ill' to the contents of the
17338 * dl_notify_ind_t pointed to by `mp'. Must be called as writer, and will be
17339 * asynchronous if `ill' cannot immediately be quiesced -- in which case
17340 * EINPROGRESS will be returned.
17341 */ 17342 int 17343 ill_set_phys_addr(ill_t *ill, mblk_t *mp) 17344 { 17345 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 17346 dl_notify_ind_t *dlindp = (dl_notify_ind_t *)mp->b_rptr; 17347 17348 ASSERT(IAM_WRITER_IPSQ(ipsq)); 17349 17350 if (dlindp->dl_data != DL_IPV6_LINK_LAYER_ADDR && 17351 dlindp->dl_data != DL_CURR_DEST_ADDR && 17352 dlindp->dl_data != DL_CURR_PHYS_ADDR) { 17353 /* Changing DL_IPV6_TOKEN is not yet supported */ 17354 return (0); 17355 } 17356 17357 /* 17358 * We need to store up to two copies of `mp' in `ill'. Due to the 17359 * design of ipsq_pending_mp_add(), we can't pass them as separate 17360 * arguments to ill_set_phys_addr_tail(). Instead, chain them 17361 * together here, then pull 'em apart in ill_set_phys_addr_tail(). 17362 */ 17363 if ((mp = copyb(mp)) == NULL || (mp->b_cont = copyb(mp)) == NULL) { 17364 freemsg(mp); 17365 return (ENOMEM); 17366 } 17367 17368 ipsq_current_start(ipsq, ill->ill_ipif, 0); 17369 mutex_enter(&ill->ill_lock); 17370 ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS; 17371 /* no more nce addition allowed */ 17372 mutex_exit(&ill->ill_lock); 17373 17374 /* 17375 * If we can quiesce the ill, then set the address. If not, then 17376 * ill_set_phys_addr_tail() will be called from ipif_ill_refrele_tail(). 17377 */ 17378 ill_down_ipifs(ill, B_TRUE); 17379 mutex_enter(&ill->ill_lock); 17380 if (!ill_is_quiescent(ill)) { 17381 /* call cannot fail since `conn_t *' argument is NULL */ 17382 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq, 17383 mp, ILL_DOWN); 17384 mutex_exit(&ill->ill_lock); 17385 return (EINPROGRESS); 17386 } 17387 mutex_exit(&ill->ill_lock); 17388 17389 ill_set_phys_addr_tail(ipsq, ill->ill_rq, mp, NULL); 17390 return (0); 17391 } 17392 17393 /* 17394 * Once the ill associated with `q' has quiesced, set its physical address 17395 * information to the values in `addrmp'. Note that two copies of `addrmp' 17396 * are passed (linked by b_cont), since we sometimes need to save two distinct 17397 * copies in the ill_t, and our context doesn't permit sleeping or allocation 17398 * failure (we'll free the other copy if it's not needed). Since the ill_t 17399 * is quiesced, we know any stale nce's with the old address information have 17400 * already been removed, so we don't need to call nce_flush(). 
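 *
 * The two copies are pulled apart with unlinkb() below; for a
 * DL_CURR_PHYS_ADDR change on an IPv6 ill the second copy becomes the
 * new ill_nd_lla_mp, and in all other cases it is freed.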
17401 */ 17402 /* ARGSUSED */ 17403 static void 17404 ill_set_phys_addr_tail(ipsq_t *ipsq, queue_t *q, mblk_t *addrmp, void *dummy) 17405 { 17406 ill_t *ill = q->q_ptr; 17407 mblk_t *addrmp2 = unlinkb(addrmp); 17408 dl_notify_ind_t *dlindp = (dl_notify_ind_t *)addrmp->b_rptr; 17409 uint_t addrlen, addroff; 17410 int status; 17411 17412 ASSERT(IAM_WRITER_IPSQ(ipsq)); 17413 17414 addroff = dlindp->dl_addr_offset; 17415 addrlen = dlindp->dl_addr_length - ABS(ill->ill_sap_length); 17416 17417 switch (dlindp->dl_data) { 17418 case DL_IPV6_LINK_LAYER_ADDR: 17419 ill_set_ndmp(ill, addrmp, addroff, addrlen); 17420 freemsg(addrmp2); 17421 break; 17422 17423 case DL_CURR_DEST_ADDR: 17424 freemsg(ill->ill_dest_addr_mp); 17425 ill->ill_dest_addr = addrmp->b_rptr + addroff; 17426 ill->ill_dest_addr_mp = addrmp; 17427 if (ill->ill_isv6) { 17428 ill_setdesttoken(ill); 17429 ipif_setdestlinklocal(ill->ill_ipif); 17430 } 17431 freemsg(addrmp2); 17432 break; 17433 17434 case DL_CURR_PHYS_ADDR: 17435 freemsg(ill->ill_phys_addr_mp); 17436 ill->ill_phys_addr = addrmp->b_rptr + addroff; 17437 ill->ill_phys_addr_mp = addrmp; 17438 ill->ill_phys_addr_length = addrlen; 17439 if (ill->ill_isv6) 17440 ill_set_ndmp(ill, addrmp2, addroff, addrlen); 17441 else 17442 freemsg(addrmp2); 17443 if (ill->ill_isv6) { 17444 ill_setdefaulttoken(ill); 17445 ipif_setlinklocal(ill->ill_ipif); 17446 } 17447 break; 17448 default: 17449 ASSERT(0); 17450 } 17451 17452 /* 17453 * If there are ipifs to bring up, ill_up_ipifs() will return 17454 * EINPROGRESS, and ipsq_current_finish() will be called by 17455 * ip_rput_dlpi_writer() or arp_bringup_done() when the last ipif is 17456 * brought up. 17457 */ 17458 status = ill_up_ipifs(ill, q, addrmp); 17459 mutex_enter(&ill->ill_lock); 17460 if (ill->ill_dl_up) 17461 ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS; 17462 mutex_exit(&ill->ill_lock); 17463 if (status != EINPROGRESS) 17464 ipsq_current_finish(ipsq); 17465 } 17466 17467 /* 17468 * Helper routine for setting the ill_nd_lla fields. 17469 */ 17470 void 17471 ill_set_ndmp(ill_t *ill, mblk_t *ndmp, uint_t addroff, uint_t addrlen) 17472 { 17473 freemsg(ill->ill_nd_lla_mp); 17474 ill->ill_nd_lla = ndmp->b_rptr + addroff; 17475 ill->ill_nd_lla_mp = ndmp; 17476 ill->ill_nd_lla_len = addrlen; 17477 } 17478 17479 /* 17480 * Replumb the ill. 17481 */ 17482 int 17483 ill_replumb(ill_t *ill, mblk_t *mp) 17484 { 17485 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 17486 17487 ASSERT(IAM_WRITER_IPSQ(ipsq)); 17488 17489 ipsq_current_start(ipsq, ill->ill_ipif, 0); 17490 17491 mutex_enter(&ill->ill_lock); 17492 ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS; 17493 /* no more nce addition allowed */ 17494 mutex_exit(&ill->ill_lock); 17495 17496 /* 17497 * If we can quiesce the ill, then continue. If not, then 17498 * ill_replumb_tail() will be called from ipif_ill_refrele_tail(). 
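 * (This mirrors the quiesce logic in ill_set_phys_addr() above.)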
17499 */ 17500 ill_down_ipifs(ill, B_FALSE); 17501 17502 mutex_enter(&ill->ill_lock); 17503 if (!ill_is_quiescent(ill)) { 17504 /* call cannot fail since `conn_t *' argument is NULL */ 17505 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq, 17506 mp, ILL_DOWN); 17507 mutex_exit(&ill->ill_lock); 17508 return (EINPROGRESS); 17509 } 17510 mutex_exit(&ill->ill_lock); 17511 17512 ill_replumb_tail(ipsq, ill->ill_rq, mp, NULL); 17513 return (0); 17514 } 17515 17516 /* ARGSUSED */ 17517 static void 17518 ill_replumb_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy) 17519 { 17520 ill_t *ill = q->q_ptr; 17521 int err; 17522 conn_t *connp = NULL; 17523 17524 ASSERT(IAM_WRITER_IPSQ(ipsq)); 17525 freemsg(ill->ill_replumb_mp); 17526 ill->ill_replumb_mp = copyb(mp); 17527 17528 if (ill->ill_replumb_mp == NULL) { 17529 /* out of memory */ 17530 ipsq_current_finish(ipsq); 17531 return; 17532 } 17533 17534 mutex_enter(&ill->ill_lock); 17535 ill->ill_up_ipifs = ipsq_pending_mp_add(NULL, ill->ill_ipif, 17536 ill->ill_rq, ill->ill_replumb_mp, 0); 17537 mutex_exit(&ill->ill_lock); 17538 17539 if (!ill->ill_up_ipifs) { 17540 /* already closing */ 17541 ipsq_current_finish(ipsq); 17542 return; 17543 } 17544 ill->ill_replumbing = 1; 17545 err = ill_down_ipifs_tail(ill); 17546 17547 /* 17548 * Successfully quiesced and brought down the interface, now we send 17549 * the DL_NOTE_REPLUMB_DONE message down to the driver. Reuse the 17550 * DL_NOTE_REPLUMB message. 17551 */ 17552 mp = mexchange(NULL, mp, sizeof (dl_notify_conf_t), M_PROTO, 17553 DL_NOTIFY_CONF); 17554 ASSERT(mp != NULL); 17555 ((dl_notify_conf_t *)mp->b_rptr)->dl_notification = 17556 DL_NOTE_REPLUMB_DONE; 17557 ill_dlpi_send(ill, mp); 17558 17559 /* 17560 * For IPv4, we would usually get EINPROGRESS because the ETHERTYPE_ARP 17561 * streams have to be unbound. When all the DLPI exchanges are done, 17562 * ipsq_current_finish() will be called by arp_bringup_done(). The 17563 * remainder of ipif bringup via ill_up_ipifs() will also be done in 17564 * arp_bringup_done(). 17565 */ 17566 ASSERT(ill->ill_replumb_mp != NULL); 17567 if (err == EINPROGRESS) 17568 return; 17569 else 17570 ill->ill_replumb_mp = ipsq_pending_mp_get(ipsq, &connp); 17571 ASSERT(connp == NULL); 17572 if (err == 0 && ill->ill_replumb_mp != NULL && 17573 ill_up_ipifs(ill, q, ill->ill_replumb_mp) == EINPROGRESS) { 17574 return; 17575 } 17576 ipsq_current_finish(ipsq); 17577 } 17578 17579 /* 17580 * Issue ioctl `cmd' on `lh'; caller provides the initial payload in `buf' 17581 * which is `bufsize' bytes. On success, zero is returned and `buf' updated 17582 * as per the ioctl. On failure, an errno is returned. 17583 */ 17584 static int 17585 ip_ioctl(ldi_handle_t lh, int cmd, void *buf, uint_t bufsize, cred_t *cr) 17586 { 17587 int rval; 17588 struct strioctl iocb; 17589 17590 iocb.ic_cmd = cmd; 17591 iocb.ic_timout = 15; 17592 iocb.ic_len = bufsize; 17593 iocb.ic_dp = buf; 17594 17595 return (ldi_ioctl(lh, I_STR, (intptr_t)&iocb, FKIOCTL, cr, &rval)); 17596 } 17597 17598 /* 17599 * Issue an SIOCGLIFCONF for address family `af' and store the result into a 17600 * dynamically-allocated `lifcp' that will be `bufsizep' bytes on success. 
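 * On success the caller owns lifcp->lifc_buf and must free it with the
 * returned size, as ip_ipmp_cleanup() below does. A minimal sketch of
 * the contract:
 *
 *	struct lifconf lifc;
 *	uint_t bufsize;
 *
 *	if (ip_lifconf_ioctl(lh, AF_INET, &lifc, &bufsize, cr) == 0) {
 *		... walk the lifc.lifc_len / sizeof (struct lifreq)
 *		... entries starting at lifc.lifc_req
 *		kmem_free(lifc.lifc_buf, bufsize);
 *	}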
17601 */
17602 static int
17603 ip_lifconf_ioctl(ldi_handle_t lh, int af, struct lifconf *lifcp,
17604 uint_t *bufsizep, cred_t *cr)
17605 {
17606 int err;
17607 struct lifnum lifn;
17608
17609 bzero(&lifn, sizeof (lifn));
17610 lifn.lifn_family = af;
17611 lifn.lifn_flags = LIFC_UNDER_IPMP;
17612
17613 if ((err = ip_ioctl(lh, SIOCGLIFNUM, &lifn, sizeof (lifn), cr)) != 0)
17614 return (err);
17615
17616 /*
17617 * Pad the interface count to account for additional interfaces that
17618 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
17619 */
17620 lifn.lifn_count += 4;
17621 bzero(lifcp, sizeof (*lifcp));
17622 lifcp->lifc_flags = LIFC_UNDER_IPMP;
17623 lifcp->lifc_family = af;
17624 lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
17625 lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);
17626
17627 err = ip_ioctl(lh, SIOCGLIFCONF, lifcp, sizeof (*lifcp), cr);
17628 if (err != 0) {
17629 kmem_free(lifcp->lifc_buf, *bufsizep);
17630 return (err);
17631 }
17632
17633 return (0);
17634 }
17635
17636 /*
17637 * Helper for ip_interface_cleanup() that removes the loopback interface.
17638 */
17639 static void
17640 ip_loopback_removeif(ldi_handle_t lh, boolean_t isv6, cred_t *cr)
17641 {
17642 int err;
17643 struct lifreq lifr;
17644
17645 bzero(&lifr, sizeof (lifr));
17646 (void) strcpy(lifr.lifr_name, ipif_loopback_name);
17647
17648 err = ip_ioctl(lh, SIOCLIFREMOVEIF, &lifr, sizeof (lifr), cr);
17649 if (err != 0) {
17650 ip0dbg(("ip_loopback_removeif: IP%s SIOCLIFREMOVEIF failed: "
17651 "error %d\n", isv6 ? "v6" : "v4", err));
17652 }
17653 }
17654
17655 /*
17656 * Helper for ip_interface_cleanup() that ensures no IP interfaces are in IPMP
17657 * groups and that IPMP data addresses are down. These conditions must be met
17658 * so that IPMP interfaces can be I_PUNLINK'd, as per ip_sioctl_plink_ipmp().
17659 */
17660 static void
17661 ip_ipmp_cleanup(ldi_handle_t lh, boolean_t isv6, cred_t *cr)
17662 {
17663 int af = isv6 ? AF_INET6 : AF_INET;
17664 int i, nifs;
17665 int err;
17666 uint_t bufsize;
17667 uint_t lifrsize = sizeof (struct lifreq);
17668 struct lifconf lifc;
17669 struct lifreq *lifrp;
17670
17671 if ((err = ip_lifconf_ioctl(lh, af, &lifc, &bufsize, cr)) != 0) {
17672 cmn_err(CE_WARN, "ip_ipmp_cleanup: cannot get interface list "
17673 "(error %d); any IPMP interfaces cannot be shut down", err);
17674 return;
17675 }
17676
17677 nifs = lifc.lifc_len / lifrsize;
17678 for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
17679 err = ip_ioctl(lh, SIOCGLIFFLAGS, lifrp, lifrsize, cr);
17680 if (err != 0) {
17681 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot get "
17682 "flags: error %d", lifrp->lifr_name, err);
17683 continue;
17684 }
17685
17686 if (lifrp->lifr_flags & IFF_IPMP) {
17687 if ((lifrp->lifr_flags & (IFF_UP|IFF_DUPLICATE)) == 0)
17688 continue;
17689
17690 lifrp->lifr_flags &= ~IFF_UP;
17691 err = ip_ioctl(lh, SIOCSLIFFLAGS, lifrp, lifrsize, cr);
17692 if (err != 0) {
17693 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot "
17694 "bring down (error %d); IPMP interface may "
17695 "not be shut down", lifrp->lifr_name, err);
17696 }
17697
17698 /*
17699 * Check if IFF_DUPLICATE is still set -- and if so,
17700 * reset the address to clear it.
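 * (Re-setting the same address restarts duplicate address detection,
 * which clears IFF_DUPLICATE if the conflict has gone away.)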
17701 */
17702 err = ip_ioctl(lh, SIOCGLIFFLAGS, lifrp, lifrsize, cr);
17703 if (err != 0 || !(lifrp->lifr_flags & IFF_DUPLICATE))
17704 continue;
17705
17706 err = ip_ioctl(lh, SIOCGLIFADDR, lifrp, lifrsize, cr);
17707 if (err != 0 || (err = ip_ioctl(lh, SIOCSLIFADDR,
17708 lifrp, lifrsize, cr)) != 0) {
17709 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot "
17710 "reset DAD (error %d); IPMP interface may "
17711 "not be shut down", lifrp->lifr_name, err);
17712 }
17713 continue;
17714 }
17715
17716 lifrp->lifr_groupname[0] = '\0';
17717 err = ip_ioctl(lh, SIOCSLIFGROUPNAME, lifrp, lifrsize, cr);
17718 if (err != 0) {
17719 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot leave "
17720 "IPMP group (error %d); associated IPMP interface "
17721 "may not be shut down", lifrp->lifr_name, err);
17722 continue;
17723 }
17724 }
17725
17726 kmem_free(lifc.lifc_buf, bufsize);
17727 }
17728
17729 #define UDPDEV "/devices/pseudo/udp@0:udp"
17730 #define UDP6DEV "/devices/pseudo/udp6@0:udp6"
17731
17732 /*
17733 * Remove the loopback interfaces and prep the IPMP interfaces to be torn down.
17734 * Non-loopback interfaces are either I_LINK'd or I_PLINK'd; the former go away
17735 * when the user-level processes in the zone are killed and the latter are
17736 * cleaned up by str_stack_shutdown().
17737 */
17738 void
17739 ip_interface_cleanup(ip_stack_t *ipst)
17740 {
17741 ldi_handle_t lh;
17742 ldi_ident_t li;
17743 cred_t *cr;
17744 int err;
17745 int i;
17746 char *devs[] = { UDP6DEV, UDPDEV };
17747 netstackid_t stackid = ipst->ips_netstack->netstack_stackid;
17748
17749 if ((err = ldi_ident_from_major(ddi_name_to_major("ip"), &li)) != 0) {
17750 cmn_err(CE_WARN, "ip_interface_cleanup: cannot get ldi ident:"
17751 " error %d", err);
17752 return;
17753 }
17754
17755 cr = zone_get_kcred(netstackid_to_zoneid(stackid));
17756 ASSERT(cr != NULL);
17757
17758 /*
17759 * NOTE: loop executes exactly twice and is hardcoded to know that the
17760 * first iteration is IPv6. (Unrolling yields repetitious code, hence
17761 * the loop.)
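 * (devs[0] is UDP6DEV, so the `i == 0' arguments passed below are
 * simply the isv6 flag.)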
17762 */
17763 for (i = 0; i < 2; i++) {
17764 err = ldi_open_by_name(devs[i], FREAD|FWRITE, cr, &lh, li);
17765 if (err != 0) {
17766 cmn_err(CE_WARN, "ip_interface_cleanup: cannot open %s:"
17767 " error %d", devs[i], err);
17768 continue;
17769 }
17770
17771 ip_loopback_removeif(lh, i == 0, cr);
17772 ip_ipmp_cleanup(lh, i == 0, cr);
17773
17774 (void) ldi_close(lh, FREAD|FWRITE, cr);
17775 }
17776
17777 ldi_ident_release(li);
17778 crfree(cr);
17779 }
17780
17781 /*
17782 * This needs to be kept in sync with the nic_event_t definition.
17783 */
17784 static const char *
17785 ill_hook_event2str(nic_event_t event)
17786 {
17787 switch (event) {
17788 case NE_PLUMB:
17789 return ("PLUMB");
17790 case NE_UNPLUMB:
17791 return ("UNPLUMB");
17792 case NE_UP:
17793 return ("UP");
17794 case NE_DOWN:
17795 return ("DOWN");
17796 case NE_ADDRESS_CHANGE:
17797 return ("ADDRESS_CHANGE");
17798 case NE_LIF_UP:
17799 return ("LIF_UP");
17800 case NE_LIF_DOWN:
17801 return ("LIF_DOWN");
17802 case NE_IFINDEX_CHANGE:
17803 return ("IFINDEX_CHANGE");
17804 default:
17805 return ("UNKNOWN");
17806 }
17807 }
17808
17809 void
17810 ill_nic_event_dispatch(ill_t *ill, lif_if_t lif, nic_event_t event,
17811 nic_event_data_t data, size_t datalen)
17812 {
17813 ip_stack_t *ipst = ill->ill_ipst;
17814 hook_nic_event_int_t *info;
17815 const char *str = NULL;
17816
17817 /* create a new nic event info */
17818 if ((info = kmem_alloc(sizeof (*info), KM_NOSLEEP)) == NULL)
17819 goto fail;
17820
17821 info->hnei_event.hne_nic = ill->ill_phyint->phyint_ifindex;
17822 info->hnei_event.hne_lif = lif;
17823 info->hnei_event.hne_event = event;
17824 info->hnei_event.hne_protocol = ill->ill_isv6 ?
17825 ipst->ips_ipv6_net_data : ipst->ips_ipv4_net_data;
17826 info->hnei_event.hne_data = NULL;
17827 info->hnei_event.hne_datalen = 0;
17828 info->hnei_stackid = ipst->ips_netstack->netstack_stackid;
17829
17830 if (data != NULL && datalen != 0) {
17831 info->hnei_event.hne_data = kmem_alloc(datalen, KM_NOSLEEP);
17832 if (info->hnei_event.hne_data == NULL)
17833 goto fail;
17834 bcopy(data, info->hnei_event.hne_data, datalen);
17835 info->hnei_event.hne_datalen = datalen;
17836 }
17837
17838 if (ddi_taskq_dispatch(eventq_queue_nic, ip_ne_queue_func, info,
17839 DDI_NOSLEEP) == DDI_SUCCESS)
17840 return;
17841
17842 fail:
17843 if (info != NULL) {
17844 if (info->hnei_event.hne_data != NULL) {
17845 kmem_free(info->hnei_event.hne_data,
17846 info->hnei_event.hne_datalen);
17847 }
17848 kmem_free(info, sizeof (*info)); /* must match sizeof (*info) alloc */
17849 }
17850 str = ill_hook_event2str(event);
17851 ip2dbg(("ill_nic_event_dispatch: could not dispatch %s nic event "
17852 "information for %s (ENOMEM)\n", str, ill->ill_name));
17853 }
17854
17855 static int
17856 ipif_arp_up_done_tail(ipif_t *ipif, enum ip_resolver_action res_act)
17857 {
17858 int err = 0;
17859 const in_addr_t *addr = NULL;
17860 nce_t *nce = NULL;
17861 ill_t *ill = ipif->ipif_ill;
17862 ill_t *bound_ill;
17863 boolean_t added_ipif = B_FALSE;
17864 uint16_t state;
17865 uint16_t flags;
17866
17867 DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_up_done_tail",
17868 ill_t *, ill, ipif_t *, ipif);
17869 if (ipif->ipif_lcl_addr != INADDR_ANY) {
17870 addr = &ipif->ipif_lcl_addr;
17871 }
17872
17873 if ((ipif->ipif_flags & IPIF_UNNUMBERED) || addr == NULL) {
17874 if (res_act != Res_act_initial)
17875 return (EINVAL);
17876 }
17877
17878 if (addr != NULL) {
17879 ipmp_illgrp_t *illg = ill->ill_grp;
17880
17881 /* add unicast nce for the local addr */
17882
17883 if (IS_IPMP(ill)) {
17884 /*
17885 * If we're here via ipif_up(), then the ipif 17886 * won't be bound yet -- add it to the group, 17887 * which will bind it if possible. (We would 17888 * add it in ipif_up(), but deleting on failure 17889 * there is gruesome.) If we're here via 17890 * ipmp_ill_bind_ipif(), then the ipif has 17891 * already been added to the group and we 17892 * just need to use the binding. 17893 */ 17894 if ((bound_ill = ipmp_ipif_bound_ill(ipif)) == NULL) { 17895 bound_ill = ipmp_illgrp_add_ipif(illg, ipif); 17896 if (bound_ill == NULL) { 17897 /* 17898 * We couldn't bind the ipif to an ill 17899 * yet, so we have nothing to publish. 17900 * Mark the address as ready and return. 17901 */ 17902 ipif->ipif_addr_ready = 1; 17903 return (0); 17904 } 17905 added_ipif = B_TRUE; 17906 } 17907 } else { 17908 bound_ill = ill; 17909 } 17910 17911 flags = (NCE_F_MYADDR | NCE_F_PUBLISH | NCE_F_AUTHORITY | 17912 NCE_F_NONUD); 17913 /* 17914 * If this is an initial bring-up (or the ipif was never 17915 * completely brought up), do DAD. Otherwise, we're here 17916 * because IPMP has rebound an address to this ill: send 17917 * unsolicited advertisements (ARP announcements) to 17918 * inform others. 17919 */ 17920 if (res_act == Res_act_initial || !ipif->ipif_addr_ready) { 17921 state = ND_UNCHANGED; /* compute in nce_add_common() */ 17922 } else { 17923 state = ND_REACHABLE; 17924 flags |= NCE_F_UNSOL_ADV; 17925 } 17926 17927 retry: 17928 err = nce_lookup_then_add_v4(ill, 17929 bound_ill->ill_phys_addr, bound_ill->ill_phys_addr_length, 17930 addr, flags, state, &nce); 17931 17932 /* 17933 * note that we may encounter EEXIST if we are moving 17934 * the nce as a result of a rebind operation. 17935 */ 17936 switch (err) { 17937 case 0: 17938 ipif->ipif_added_nce = 1; 17939 nce->nce_ipif_cnt++; 17940 break; 17941 case EEXIST: 17942 ip1dbg(("ipif_arp_up: NCE already exists for %s\n", 17943 ill->ill_name)); 17944 if (!NCE_MYADDR(nce->nce_common)) { 17945 /* 17946 * A leftover nce from before this address 17947 * existed 17948 */ 17949 ncec_delete(nce->nce_common); 17950 nce_refrele(nce); 17951 nce = NULL; 17952 goto retry; 17953 } 17954 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 17955 nce_refrele(nce); 17956 nce = NULL; 17957 ip1dbg(("ipif_arp_up: NCE already exists " 17958 "for %s:%u\n", ill->ill_name, 17959 ipif->ipif_id)); 17960 goto arp_up_done; 17961 } 17962 /* 17963 * Duplicate local addresses are permissible for 17964 * IPIF_POINTOPOINT interfaces which will get marked 17965 * IPIF_UNNUMBERED later in 17966 * ip_addr_availability_check(). 17967 * 17968 * The nce_ipif_cnt field tracks the number of 17969 * ipifs that have nce_addr as their local address. 17970 */ 17971 ipif->ipif_addr_ready = 1; 17972 ipif->ipif_added_nce = 1; 17973 nce->nce_ipif_cnt++; 17974 err = 0; 17975 break; 17976 default: 17977 ASSERT(nce == NULL); 17978 goto arp_up_done; 17979 } 17980 if (arp_no_defense) { 17981 if ((ipif->ipif_flags & IPIF_UP) && 17982 !ipif->ipif_addr_ready) 17983 ipif_up_notify(ipif); 17984 ipif->ipif_addr_ready = 1; 17985 } 17986 } else { 17987 /* zero address. 
nothing to publish */
17988 ipif->ipif_addr_ready = 1;
17989 }
17990 if (nce != NULL)
17991 nce_refrele(nce);
17992 arp_up_done:
17993 if (added_ipif && err != 0)
17994 ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
17995 return (err);
17996 }
17997
17998 int
17999 ipif_arp_up(ipif_t *ipif, enum ip_resolver_action res_act, boolean_t was_dup)
18000 {
18001 int err = 0;
18002 ill_t *ill = ipif->ipif_ill;
18003 boolean_t first_interface, wait_for_dlpi = B_FALSE;
18004
18005 DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_up",
18006 ill_t *, ill, ipif_t *, ipif);
18007
18008 /*
18009 * We need to bring up ARP or set up the mcast mapping only
18010 * when the first interface is coming UP.
18011 */
18012 first_interface = (ill->ill_ipif_up_count == 0 &&
18013 ill->ill_ipif_dup_count == 0 && !was_dup);
18014
18015 if (res_act == Res_act_initial && first_interface) {
18016 /*
18017 * Send ATTACH + BIND
18018 */
18019 err = arp_ll_up(ill);
18020 if (err != EINPROGRESS && err != 0)
18021 return (err);
18022
18023 /*
18024 * Add an NCE for the local address and start DAD;
18025 * we'll wait to hear that DAD has finished
18026 * before using the interface.
18027 */
18028 if (err == EINPROGRESS)
18029 wait_for_dlpi = B_TRUE;
18030 }
18031
18032 if (!wait_for_dlpi)
18033 (void) ipif_arp_up_done_tail(ipif, res_act);
18034
18035 return (!wait_for_dlpi ? 0 : EINPROGRESS);
18036 }
18037
18038 /*
18039 * Finish processing of "arp_up" after all the DLPI message
18040 * exchanges have completed between arp and the driver.
18041 */
18042 void
18043 arp_bringup_done(ill_t *ill, int err)
18044 {
18045 mblk_t *mp1;
18046 ipif_t *ipif;
18047 conn_t *connp = NULL;
18048 ipsq_t *ipsq;
18049 queue_t *q;
18050
18051 ip1dbg(("arp_bringup_done(%s)\n", ill->ill_name));
18052
18053 ASSERT(IAM_WRITER_ILL(ill));
18054
18055 ipsq = ill->ill_phyint->phyint_ipsq;
18056 ipif = ipsq->ipsq_xop->ipx_pending_ipif;
18057 mp1 = ipsq_pending_mp_get(ipsq, &connp);
18058 ASSERT(!((mp1 != NULL) ^ (ipif != NULL)));
18059 if (mp1 == NULL) /* bringup was aborted by the user */
18060 return;
18061
18062 /*
18063 * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we
18064 * must have an associated conn_t. Otherwise, we're bringing this
18065 * interface back up as part of handling an asynchronous event (e.g.,
18066 * physical address change).
18067 */
18068 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18069 ASSERT(connp != NULL);
18070 q = CONNP_TO_WQ(connp);
18071 } else {
18072 ASSERT(connp == NULL);
18073 q = ill->ill_rq;
18074 }
18075 if (err == 0) {
18076 if (ipif->ipif_isv6) {
18077 if ((err = ipif_up_done_v6(ipif)) != 0)
18078 ip0dbg(("arp_bringup_done: init failed\n"));
18079 } else {
18080 err = ipif_arp_up_done_tail(ipif, Res_act_initial);
18081 if (err != 0 ||
18082 (err = ipif_up_done(ipif)) != 0) {
18083 ip0dbg(("arp_bringup_done: "
18084 "init failed err %x\n", err));
18085 (void) ipif_arp_down(ipif);
18086 }
18087
18088 }
18089 } else {
18090 ip0dbg(("arp_bringup_done: DL_BIND_REQ failed\n"));
18091 }
18092
18093 if ((err == 0) && (ill->ill_up_ipifs)) {
18094 err = ill_up_ipifs(ill, q, mp1);
18095 if (err == EINPROGRESS)
18096 return;
18097 }
18098
18099 /*
18100 * If we have a moved ipif to bring up, and everything has succeeded
18101 * to this point, bring it up on the IPMP ill. Otherwise, leave it
18102 * down -- the admin can try to bring it up by hand if need be.
18103 */
18104 if (ill->ill_move_ipif != NULL) {
18105 ipif = ill->ill_move_ipif;
18106 ip1dbg(("bringing up ipif %p on ill %s\n", (void *)ipif,
18107 ipif->ipif_ill->ill_name));
18108 ill->ill_move_ipif = NULL;
18109 if (err == 0) {
18110 err = ipif_up(ipif, q, mp1);
18111 if (err == EINPROGRESS)
18112 return;
18113 }
18114 }
18115
18116 /*
18117 * The operation must complete without EINPROGRESS since
18118 * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp.
18119 * Otherwise, the operation will be stuck forever in the ipsq.
18120 */
18121 ASSERT(err != EINPROGRESS);
18122 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18123 DTRACE_PROBE4(ipif__ioctl, char *, "arp_bringup_done finish",
18124 int, ipsq->ipsq_xop->ipx_current_ioctl,
18125 ill_t *, ill, ipif_t *, ipif);
18126 ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
18127 } else {
18128 ipsq_current_finish(ipsq);
18129 }
18130 }
18131
18132 /*
18133 * Finish processing of arp replumb after all the DLPI message
18134 * exchanges have completed between arp and the driver.
18135 */
18136 void
18137 arp_replumb_done(ill_t *ill, int err)
18138 {
18139 mblk_t *mp1;
18140 ipif_t *ipif;
18141 conn_t *connp = NULL;
18142 ipsq_t *ipsq;
18143 queue_t *q;
18144
18145 ASSERT(IAM_WRITER_ILL(ill));
18146
18147 ipsq = ill->ill_phyint->phyint_ipsq;
18148 ipif = ipsq->ipsq_xop->ipx_pending_ipif;
18149 mp1 = ipsq_pending_mp_get(ipsq, &connp);
18150 ASSERT(!((mp1 != NULL) ^ (ipif != NULL)));
18151 if (mp1 == NULL) {
18152 ip0dbg(("arp_replumb_done: bringup aborted ioctl %x\n",
18153 ipsq->ipsq_xop->ipx_current_ioctl));
18154 /* bringup was aborted by the user */
18155 return;
18156 }
18157 /*
18158 * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we
18159 * must have an associated conn_t. Otherwise, we're bringing this
18160 * interface back up as part of handling an asynchronous event (e.g.,
18161 * physical address change).
18162 */
18163 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18164 ASSERT(connp != NULL);
18165 q = CONNP_TO_WQ(connp);
18166 } else {
18167 ASSERT(connp == NULL);
18168 q = ill->ill_rq;
18169 }
18170 if ((err == 0) && (ill->ill_up_ipifs)) {
18171 err = ill_up_ipifs(ill, q, mp1);
18172 if (err == EINPROGRESS)
18173 return;
18174 }
18175 /*
18176 * The operation must complete without EINPROGRESS since
18177 * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp.
18178 * Otherwise, the operation will be stuck forever in the ipsq.
18179 */
18180 ASSERT(err != EINPROGRESS);
18181 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18182 DTRACE_PROBE4(ipif__ioctl, char *,
18183 "arp_replumb_done finish",
18184 int, ipsq->ipsq_xop->ipx_current_ioctl,
18185 ill_t *, ill, ipif_t *, ipif);
18186 ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
18187 } else {
18188 ipsq_current_finish(ipsq);
18189 }
18190 }
18191
18192 void
18193 ipif_up_notify(ipif_t *ipif)
18194 {
18195 ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
18196 ip_rts_newaddrmsg(RTM_ADD, 0, ipif, RTSQ_DEFAULT);
18197 sctp_update_ipif(ipif, SCTP_IPIF_UP);
18198 ill_nic_event_dispatch(ipif->ipif_ill, MAP_IPIF_ID(ipif->ipif_id),
18199 NE_LIF_UP, NULL, 0);
18200 }
18201
18202 /*
18203 * ILB ioctl uses cv_wait (such as deleting a rule or adding a server) and
18204 * this assumes the context is cv_wait'able. Hence it shouldn't be used on
18205 * TPI end points with STREAMS modules pushed above. This is assured by not
18206 * having the IPI_MODOK flag for the ioctl.
And IP ensures the ILB ioctl
18207 * never ends up on an ipsq, otherwise we may end up processing the ioctl
18208 * while unwinding from the ipsq and that could be a thread from the bottom.
18209 */
18210 /* ARGSUSED */
18211 int
18212 ip_sioctl_ilb_cmd(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
18213 ip_ioctl_cmd_t *ipip, void *arg)
18214 {
18215 mblk_t *cmd_mp = mp->b_cont->b_cont;
18216 ilb_cmd_t command = *((ilb_cmd_t *)cmd_mp->b_rptr);
18217 int ret = 0;
18218 int i;
18219 size_t size;
18220 ip_stack_t *ipst;
18221 zoneid_t zoneid;
18222 ilb_stack_t *ilbs;
18223
18224 ipst = CONNQ_TO_IPST(q);
18225 ilbs = ipst->ips_netstack->netstack_ilb;
18226 zoneid = Q_TO_CONN(q)->conn_zoneid;
18227
18228 switch (command) {
18229 case ILB_CREATE_RULE: {
18230 ilb_rule_cmd_t *cmd = (ilb_rule_cmd_t *)cmd_mp->b_rptr;
18231
18232 if (MBLKL(cmd_mp) != sizeof (ilb_rule_cmd_t)) {
18233 ret = EINVAL;
18234 break;
18235 }
18236
18237 ret = ilb_rule_add(ilbs, zoneid, cmd);
18238 break;
18239 }
18240 case ILB_DESTROY_RULE:
18241 case ILB_ENABLE_RULE:
18242 case ILB_DISABLE_RULE: {
18243 ilb_name_cmd_t *cmd = (ilb_name_cmd_t *)cmd_mp->b_rptr;
18244
18245 if (MBLKL(cmd_mp) != sizeof (ilb_name_cmd_t)) {
18246 ret = EINVAL;
18247 break;
18248 }
18249
18250 if (cmd->flags & ILB_RULE_ALLRULES) {
18251 if (command == ILB_DESTROY_RULE) {
18252 ilb_rule_del_all(ilbs, zoneid);
18253 break;
18254 } else if (command == ILB_ENABLE_RULE) {
18255 ilb_rule_enable_all(ilbs, zoneid);
18256 break;
18257 } else if (command == ILB_DISABLE_RULE) {
18258 ilb_rule_disable_all(ilbs, zoneid);
18259 break;
18260 }
18261 } else {
18262 if (command == ILB_DESTROY_RULE) {
18263 ret = ilb_rule_del(ilbs, zoneid, cmd->name);
18264 } else if (command == ILB_ENABLE_RULE) {
18265 ret = ilb_rule_enable(ilbs, zoneid, cmd->name,
18266 NULL);
18267 } else if (command == ILB_DISABLE_RULE) {
18268 ret = ilb_rule_disable(ilbs, zoneid, cmd->name,
18269 NULL);
18270 }
18271 }
18272 break;
18273 }
18274 case ILB_NUM_RULES: {
18275 ilb_num_rules_cmd_t *cmd;
18276
18277 if (MBLKL(cmd_mp) != sizeof (ilb_num_rules_cmd_t)) {
18278 ret = EINVAL;
18279 break;
18280 }
18281 cmd = (ilb_num_rules_cmd_t *)cmd_mp->b_rptr;
18282 ilb_get_num_rules(ilbs, zoneid, &(cmd->num));
18283 break;
18284 }
18285 case ILB_RULE_NAMES: {
18286 ilb_rule_names_cmd_t *cmd;
18287
18288 cmd = (ilb_rule_names_cmd_t *)cmd_mp->b_rptr;
18289 if (MBLKL(cmd_mp) < sizeof (ilb_rule_names_cmd_t) ||
18290 cmd->num_names == 0) {
18291 ret = EINVAL;
18292 break;
18293 }
18294 size = cmd->num_names * ILB_RULE_NAMESZ;
18295 if (cmd_mp->b_rptr + offsetof(ilb_rule_names_cmd_t, buf) +
18296 size != cmd_mp->b_wptr) {
18297 ret = EINVAL;
18298 break;
18299 }
18300 ilb_get_rulenames(ilbs, zoneid, &cmd->num_names, cmd->buf);
18301 break;
18302 }
18303 case ILB_NUM_SERVERS: {
18304 ilb_num_servers_cmd_t *cmd;
18305
18306 if (MBLKL(cmd_mp) != sizeof (ilb_num_servers_cmd_t)) {
18307 ret = EINVAL;
18308 break;
18309 }
18310 cmd = (ilb_num_servers_cmd_t *)cmd_mp->b_rptr;
18311 ret = ilb_get_num_servers(ilbs, zoneid, cmd->name,
18312 &(cmd->num));
18313 break;
18314 }
18315 case ILB_LIST_RULE: {
18316 ilb_rule_cmd_t *cmd = (ilb_rule_cmd_t *)cmd_mp->b_rptr;
18317
18318 if (MBLKL(cmd_mp) != sizeof (ilb_rule_cmd_t)) {
18319 ret = EINVAL;
18320 break;
18321 }
18322 ret = ilb_rule_list(ilbs, zoneid, cmd);
18323 break;
18324 }
18325 case ILB_LIST_SERVERS: {
18326 ilb_servers_info_cmd_t *cmd;
18327
18328 cmd = (ilb_servers_info_cmd_t *)cmd_mp->b_rptr;
18329 if (MBLKL(cmd_mp) < sizeof (ilb_servers_info_cmd_t) ||
18330 cmd->num_servers == 0) { 18331 ret = EINVAL; 18332 break; 18333 } 18334 size = cmd->num_servers * sizeof (ilb_server_info_t); 18335 if (cmd_mp->b_rptr + offsetof(ilb_servers_info_cmd_t, servers) + 18336 size != cmd_mp->b_wptr) { 18337 ret = EINVAL; 18338 break; 18339 } 18340 18341 ret = ilb_get_servers(ilbs, zoneid, cmd->name, cmd->servers, 18342 &cmd->num_servers); 18343 break; 18344 } 18345 case ILB_ADD_SERVERS: { 18346 ilb_servers_info_cmd_t *cmd; 18347 ilb_rule_t *rule; 18348 18349 cmd = (ilb_servers_info_cmd_t *)cmd_mp->b_rptr; 18350 if (MBLKL(cmd_mp) < sizeof (ilb_servers_info_cmd_t)) { 18351 ret = EINVAL; 18352 break; 18353 } 18354 size = cmd->num_servers * sizeof (ilb_server_info_t); 18355 if (cmd_mp->b_rptr + offsetof(ilb_servers_info_cmd_t, servers) + 18356 size != cmd_mp->b_wptr) { 18357 ret = EINVAL; 18358 break; 18359 } 18360 rule = ilb_find_rule(ilbs, zoneid, cmd->name, &ret); 18361 if (rule == NULL) { 18362 ASSERT(ret != 0); 18363 break; 18364 } 18365 for (i = 0; i < cmd->num_servers; i++) { 18366 ilb_server_info_t *s; 18367 18368 s = &cmd->servers[i]; 18369 s->err = ilb_server_add(ilbs, rule, s); 18370 } 18371 ILB_RULE_REFRELE(rule); 18372 break; 18373 } 18374 case ILB_DEL_SERVERS: 18375 case ILB_ENABLE_SERVERS: 18376 case ILB_DISABLE_SERVERS: { 18377 ilb_servers_cmd_t *cmd; 18378 ilb_rule_t *rule; 18379 int (*f)(); 18380 18381 cmd = (ilb_servers_cmd_t *)cmd_mp->b_rptr; 18382 if (MBLKL(cmd_mp) < sizeof (ilb_servers_cmd_t)) { 18383 ret = EINVAL; 18384 break; 18385 } 18386 size = cmd->num_servers * sizeof (ilb_server_arg_t); 18387 if (cmd_mp->b_rptr + offsetof(ilb_servers_cmd_t, servers) + 18388 size != cmd_mp->b_wptr) { 18389 ret = EINVAL; 18390 break; 18391 } 18392 18393 if (command == ILB_DEL_SERVERS) 18394 f = ilb_server_del; 18395 else if (command == ILB_ENABLE_SERVERS) 18396 f = ilb_server_enable; 18397 else if (command == ILB_DISABLE_SERVERS) 18398 f = ilb_server_disable; 18399 18400 rule = ilb_find_rule(ilbs, zoneid, cmd->name, &ret); 18401 if (rule == NULL) { 18402 ASSERT(ret != 0); 18403 break; 18404 } 18405 18406 for (i = 0; i < cmd->num_servers; i++) { 18407 ilb_server_arg_t *s; 18408 18409 s = &cmd->servers[i]; 18410 s->err = f(ilbs, zoneid, NULL, rule, &s->addr); 18411 } 18412 ILB_RULE_REFRELE(rule); 18413 break; 18414 } 18415 case ILB_LIST_NAT_TABLE: { 18416 ilb_list_nat_cmd_t *cmd; 18417 18418 cmd = (ilb_list_nat_cmd_t *)cmd_mp->b_rptr; 18419 if (MBLKL(cmd_mp) < sizeof (ilb_list_nat_cmd_t)) { 18420 ret = EINVAL; 18421 break; 18422 } 18423 size = cmd->num_nat * sizeof (ilb_nat_entry_t); 18424 if (cmd_mp->b_rptr + offsetof(ilb_list_nat_cmd_t, entries) + 18425 size != cmd_mp->b_wptr) { 18426 ret = EINVAL; 18427 break; 18428 } 18429 18430 ret = ilb_list_nat(ilbs, zoneid, cmd->entries, &cmd->num_nat, 18431 &cmd->flags); 18432 break; 18433 } 18434 case ILB_LIST_STICKY_TABLE: { 18435 ilb_list_sticky_cmd_t *cmd; 18436 18437 cmd = (ilb_list_sticky_cmd_t *)cmd_mp->b_rptr; 18438 if (MBLKL(cmd_mp) < sizeof (ilb_list_sticky_cmd_t)) { 18439 ret = EINVAL; 18440 break; 18441 } 18442 size = cmd->num_sticky * sizeof (ilb_sticky_entry_t); 18443 if (cmd_mp->b_rptr + offsetof(ilb_list_sticky_cmd_t, entries) + 18444 size != cmd_mp->b_wptr) { 18445 ret = EINVAL; 18446 break; 18447 } 18448 18449 ret = ilb_list_sticky(ilbs, zoneid, cmd->entries, 18450 &cmd->num_sticky, &cmd->flags); 18451 break; 18452 } 18453 default: 18454 ret = EINVAL; 18455 break; 18456 } 18457 done: 18458 return (ret); 18459 } 18460 18461 /* Remove all cache entries for this logical interface */ 18462 void 
18463 ipif_nce_down(ipif_t *ipif) 18464 { 18465 ill_t *ill = ipif->ipif_ill; 18466 nce_t *nce; 18467 18468 DTRACE_PROBE3(ipif__downup, char *, "ipif_nce_down", 18469 ill_t *, ill, ipif_t *, ipif); 18470 if (ipif->ipif_added_nce) { 18471 if (ipif->ipif_isv6) 18472 nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr); 18473 else 18474 nce = nce_lookup_v4(ill, &ipif->ipif_lcl_addr); 18475 if (nce != NULL) { 18476 if (--nce->nce_ipif_cnt == 0) 18477 ncec_delete(nce->nce_common); 18478 ipif->ipif_added_nce = 0; 18479 nce_refrele(nce); 18480 } else { 18481 /* 18482 * nce may already be NULL because it was already 18483 * flushed, e.g., due to a call to nce_flush 18484 */ 18485 ipif->ipif_added_nce = 0; 18486 } 18487 } 18488 /* 18489 * Make IPMP aware of the deleted data address. 18490 */ 18491 if (IS_IPMP(ill)) 18492 ipmp_illgrp_del_ipif(ill->ill_grp, ipif); 18493 18494 /* 18495 * Remove all other nces dependent on this ill when the last ipif 18496 * is going away. 18497 */ 18498 if (ill->ill_ipif_up_count == 0) { 18499 ncec_walk(ill, (pfi_t)ncec_delete_per_ill, 18500 (uchar_t *)ill, ill->ill_ipst); 18501 if (IS_UNDER_IPMP(ill)) 18502 nce_flush(ill, B_TRUE); 18503 } 18504 } 18505 18506 /* 18507 * find the first interface that uses usill for its source address. 18508 */ 18509 ill_t * 18510 ill_lookup_usesrc(ill_t *usill) 18511 { 18512 ip_stack_t *ipst = usill->ill_ipst; 18513 ill_t *ill; 18514 18515 ASSERT(usill != NULL); 18516 18517 /* ill_g_usesrc_lock protects ill_usesrc_grp_next */ 18518 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER); 18519 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 18520 for (ill = usill->ill_usesrc_grp_next; ill != NULL && ill != usill; 18521 ill = ill->ill_usesrc_grp_next) { 18522 if (!IS_UNDER_IPMP(ill) && (ill->ill_flags & ILLF_MULTICAST) && 18523 !ILL_IS_CONDEMNED(ill)) { 18524 ill_refhold(ill); 18525 break; 18526 } 18527 } 18528 rw_exit(&ipst->ips_ill_g_lock); 18529 rw_exit(&ipst->ips_ill_g_usesrc_lock); 18530 return (ill); 18531 } 18532