/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/* Copyright (c) 1990 Mentat Inc. */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains the interface control functions for IP.
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/dlpi.h>
#include <sys/stropts.h>
#include <sys/strsun.h>
#include <sys/sysmacros.h>
#include <sys/strlog.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/debug.h>
#include <sys/zone.h>

#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/isa_defs.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_types.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <sys/sockio.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet/igmp_var.h>
#include <sys/strsun.h>
#include <sys/policy.h>
#include <sys/ethernet.h>

#include <inet/common.h>	/* for various inet/mi.h and inet/nd.h needs */
#include <inet/mi.h>
#include <inet/nd.h>
#include <inet/arp.h>
#include <inet/mib2.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/tcp.h>
#include <inet/ip_multi.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ip_if.h>
#include <inet/ip_impl.h>
#include <inet/tun.h>
#include <inet/sctp_ip.h>
#include <inet/ip_netinfo.h>

#include <net/pfkeyv2.h>
#include <inet/ipsec_info.h>
#include <inet/sadb.h>
#include <inet/ipsec_impl.h>
#include <sys/iphada.h>

#include <netinet/igmp.h>
#include <inet/ip_listutils.h>
#include <inet/ipclassifier.h>
#include <sys/mac.h>

#include <sys/systeminfo.h>
#include <sys/bootconf.h>

#include <sys/tsol/tndb.h>
#include <sys/tsol/tnet.h>

/* The character which tells where the ill_name ends */
#define	IPIF_SEPARATOR_CHAR	':'

/* IP ioctl function table entry */
typedef struct ipft_s {
	int	ipft_cmd;
	pfi_t	ipft_pfi;
	int	ipft_min_size;
	int	ipft_flags;
} ipft_t;
#define	IPFT_F_NO_REPLY		0x1	/* IP ioctl does not expect any reply */
#define	IPFT_F_SELF_REPLY	0x2	/* ioctl callee does the ioctl reply */

typedef struct ip_sock_ar_s {
	union {
		area_t	ip_sock_area;
		ared_t	ip_sock_ared;
		areq_t	ip_sock_areq;
	} ip_sock_ar_u;
	queue_t	*ip_sock_ar_q;
} ip_sock_ar_t;
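/*
 * The union above is sized by the largest of the three ARP command
 * structures, so a buffer described by sizeof (ip_sock_ar_t) is big enough
 * for whichever command (AR_ENTRY_ADD/DELETE/QUERY) a socket ioctl is
 * translated into; the area/ared/areq templates further down compute their
 * address offsets from that same size for this reason.
 */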
static int	nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int	nd_ill_forward_set(queue_t *q, mblk_t *mp,
    char *value, caddr_t cp, cred_t *ioc_cr);

static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask);
static ip_m_t	*ip_m_lookup(t_uscalar_t mac_type);
static int	ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp);
static int	ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_arp_common(ill_t *ill, queue_t *q, mblk_t *mp,
    sin_t *sin, boolean_t x_arp_ioctl, boolean_t if_arp_ioctl);
static ipaddr_t	ip_subnet_mask(ipaddr_t addr, ipif_t **);
static void	ip_wput_ioctl(queue_t *q, mblk_t *mp);
static void	ipsq_flush(ill_t *ill);
static void	ipsq_clean_all(ill_t *ill);
static void	ipsq_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring);
static int	ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static void	ipsq_delete(ipsq_t *);

static ipif_t	*ipif_allocate(ill_t *ill, int id, uint_t ire_type,
    boolean_t initialize);
static void	ipif_check_bcast_ires(ipif_t *test_ipif);
static void	ipif_down_delete_ire(ire_t *ire, char *ipif);
static void	ipif_delete_cache_ire(ire_t *, char *);
static int	ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp);
static void	ipif_free(ipif_t *ipif);
static void	ipif_free_tail(ipif_t *ipif);
static void	ipif_mtu_change(ire_t *ire, char *ipif_arg);
static void	ipif_multicast_down(ipif_t *ipif);
static void	ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif);
static void	ipif_set_default(ipif_t *ipif);
static int	ipif_set_values(queue_t *q, mblk_t *mp,
    char *interf_name, uint_t *ppa);
static int	ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp,
    queue_t *q);
static ipif_t	*ipif_lookup_on_name(char *name, size_t namelen,
    boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid,
    queue_t *q, mblk_t *mp, ipsq_func_t func, int *error);
static int	ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp);
static void	ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp);

static int	ill_alloc_ppa(ill_if_t *, ill_t *);
static int	ill_arp_off(ill_t *ill);
static int	ill_arp_on(ill_t *ill);
static void	ill_delete_interface_type(ill_if_t *);
static int	ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q);
static void	ill_dl_down(ill_t *ill);
static void	ill_down(ill_t *ill);
static void	ill_downi(ire_t *ire, char *ill_arg);
static void	ill_downi_mrtun_srcif(ire_t *ire, char *ill_arg);
static void	ill_down_tail(ill_t *ill);
static void	ill_free_mib(ill_t *ill);
static void	ill_glist_delete(ill_t *);
static boolean_t ill_has_usable_ipif(ill_t *);
static int	ill_lock_ipsq_ills(ipsq_t *sq, ill_t **list, int);
static void	ill_nominate_bcast_rcv(ill_group_t *illgrp);
static void	ill_phyint_free(ill_t *ill);
static void	ill_phyint_reinit(ill_t *ill);
static void	ill_set_nce_router_flags(ill_t *, boolean_t);
static void	ill_signal_ipsq_ills(ipsq_t *, boolean_t);
static boolean_t ill_split_ipsq(ipsq_t *cur_sq);
static void	ill_stq_cache_delete(ire_t *, char *);

static boolean_t ip_ether_v6intfid(uint_t, uint8_t *, in6_addr_t *);
static boolean_t ip_nodef_v6intfid(uint_t, uint8_t *, in6_addr_t *);
static boolean_t ip_ether_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
    in6_addr_t *);
static boolean_t ip_ether_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
    ipaddr_t *);
static boolean_t ip_ib_v6intfid(uint_t, uint8_t *, in6_addr_t *);
static boolean_t ip_ib_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
    in6_addr_t *);
static boolean_t ip_ib_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
    ipaddr_t *);

static void	ipif_save_ire(ipif_t *, ire_t *);
static void	ipif_remove_ire(ipif_t *, ire_t *);
static void	ip_cgtp_bcast_add(ire_t *, ire_t *);
static void	ip_cgtp_bcast_delete(ire_t *);

/*
 * Per-ill IPsec capabilities management.
 */
static ill_ipsec_capab_t *ill_ipsec_capab_alloc(void);
static void	ill_ipsec_capab_free(ill_ipsec_capab_t *);
static void	ill_ipsec_capab_add(ill_t *, uint_t, boolean_t);
static void	ill_ipsec_capab_delete(ill_t *, uint_t);
static boolean_t ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *, int);
static void	ill_capability_proto(ill_t *, int, mblk_t *);
static void	ill_capability_dispatch(ill_t *, mblk_t *,
    dl_capability_sub_t *, boolean_t);
static void	ill_capability_id_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void	ill_capability_mdt_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void	ill_capability_mdt_reset(ill_t *, mblk_t **);
static void	ill_capability_ipsec_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void	ill_capability_ipsec_reset(ill_t *, mblk_t **);
static void	ill_capability_hcksum_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void	ill_capability_hcksum_reset(ill_t *, mblk_t **);
static void	ill_capability_zerocopy_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void	ill_capability_zerocopy_reset(ill_t *, mblk_t **);

static void	ill_capability_dls_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static mac_resource_handle_t ill_ring_add(void *, mac_resource_t *);
static void	ill_capability_dls_reset(ill_t *, mblk_t **);
static void	ill_capability_dls_disable(ill_t *);

static void	illgrp_cache_delete(ire_t *, char *);
static void	illgrp_delete(ill_t *ill);
static void	illgrp_reset_schednext(ill_t *ill);

static ill_t	*ill_prev_usesrc(ill_t *);
static int	ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t);
static void	ill_disband_usesrc_group(ill_t *);

static void	conn_cleanup_stale_ire(conn_t *, caddr_t);

/*
 * if we go over the memory footprint limit more than once in this msec
 * interval, we'll start pruning aggressively.
 */
int ip_min_frag_prune_time = 0;

/*
 * max # of IPsec algorithms supported.  Limited to 1 byte by PF_KEY
 * and the IPsec DOI
 */
#define	MAX_IPSEC_ALGS	256

#define	BITSPERBYTE	8
#define	BITS(type)	(BITSPERBYTE * (long)sizeof (type))

#define	IPSEC_ALG_ENABLE(algs, algid) \
	((algs)[(algid) / BITS(ipsec_capab_elem_t)] |= \
	    (1 << ((algid) % BITS(ipsec_capab_elem_t))))

#define	IPSEC_ALG_IS_ENABLED(algid, algs) \
	((algs)[(algid) / BITS(ipsec_capab_elem_t)] & \
	    (1 << ((algid) % BITS(ipsec_capab_elem_t))))

typedef uint8_t ipsec_capab_elem_t;
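/*
 * With ipsec_capab_elem_t being a uint8_t, BITS(ipsec_capab_elem_t) is 8,
 * so e.g. IPSEC_ALG_ENABLE(algs, 19) sets bit (19 % 8) == 3 of algs[19 / 8]
 * == algs[2].  A full bitmap covering MAX_IPSEC_ALGS (256) algorithms thus
 * takes 32 bytes.
 */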
/*
 * Per-algorithm parameters.  Note that at present, only encryption
 * algorithms have variable keysize (IKE does not provide a way to negotiate
 * auth algorithm keysize).
 *
 * All sizes here are in bits.
 */
typedef struct
{
	uint16_t	minkeylen;
	uint16_t	maxkeylen;
} ipsec_capab_algparm_t;

/*
 * Per-ill capabilities.
 */
struct ill_ipsec_capab_s {
	ipsec_capab_elem_t *encr_hw_algs;
	ipsec_capab_elem_t *auth_hw_algs;
	uint32_t algs_size;	/* size of _hw_algs in bytes */
	/* algorithm key lengths */
	ipsec_capab_algparm_t *encr_algparm;
	uint32_t encr_algparm_size;
	uint32_t encr_algparm_end;
};

/*
 * List of AH and ESP IPsec acceleration capable ills
 */
typedef struct ipsec_capab_ill_s {
	uint_t ill_index;
	boolean_t ill_isv6;
	struct ipsec_capab_ill_s *next;
} ipsec_capab_ill_t;

static ipsec_capab_ill_t *ipsec_capab_ills_ah;
static ipsec_capab_ill_t *ipsec_capab_ills_esp;
krwlock_t ipsec_capab_ills_lock;

/*
 * The field values are larger than strictly necessary for simple
 * AR_ENTRY_ADDs but the padding lets us accommodate the socket ioctls.
 */
static area_t	ip_area_template = {
	AR_ENTRY_ADD,			/* area_cmd */
	sizeof (ip_sock_ar_t) + (IP_ADDR_LEN*2) + sizeof (struct sockaddr_dl),
					/* area_name_offset */
	/* area_name_length temporarily holds this structure length */
	sizeof (area_t),		/* area_name_length */
	IP_ARP_PROTO_TYPE,		/* area_proto */
	sizeof (ip_sock_ar_t),		/* area_proto_addr_offset */
	IP_ADDR_LEN,			/* area_proto_addr_length */
	sizeof (ip_sock_ar_t) + IP_ADDR_LEN,
					/* area_proto_mask_offset */
	0,				/* area_flags */
	sizeof (ip_sock_ar_t) + IP_ADDR_LEN + IP_ADDR_LEN,
					/* area_hw_addr_offset */
	/* Zero length hw_addr_length means 'use your idea of the address' */
	0				/* area_hw_addr_length */
};

/*
 * AR_ENTRY_ADD/DELETE templates have been added for IPv6 external resolver
 * support
 */
static area_t	ip6_area_template = {
	AR_ENTRY_ADD,			/* area_cmd */
	sizeof (ip_sock_ar_t) + (IPV6_ADDR_LEN*2) + sizeof (sin6_t),
					/* area_name_offset */
	/* area_name_length temporarily holds this structure length */
	sizeof (area_t),		/* area_name_length */
	IP_ARP_PROTO_TYPE,		/* area_proto */
	sizeof (ip_sock_ar_t),		/* area_proto_addr_offset */
	IPV6_ADDR_LEN,			/* area_proto_addr_length */
	sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN,
					/* area_proto_mask_offset */
	0,				/* area_flags */
	sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN + IPV6_ADDR_LEN,
					/* area_hw_addr_offset */
	/* Zero length hw_addr_length means 'use your idea of the address' */
	0				/* area_hw_addr_length */
};
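/*
 * For the IPv4 template above (IP_ADDR_LEN == 4), the buffer that
 * ill_arp_alloc() builds is laid out as:
 *
 *	[ip_sock_ar_t][proto addr (4)][proto mask (4)][hw addr room][ill name]
 *
 * where the hardware-address room works out to sizeof (struct sockaddr_dl),
 * since area_name_offset exceeds area_hw_addr_offset by exactly that much.
 */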
static ared_t	ip_ared_template = {
	AR_ENTRY_DELETE,
	sizeof (ared_t) + IP_ADDR_LEN,
	sizeof (ared_t),
	IP_ARP_PROTO_TYPE,
	sizeof (ared_t),
	IP_ADDR_LEN
};

static ared_t	ip6_ared_template = {
	AR_ENTRY_DELETE,
	sizeof (ared_t) + IPV6_ADDR_LEN,
	sizeof (ared_t),
	IP_ARP_PROTO_TYPE,
	sizeof (ared_t),
	IPV6_ADDR_LEN
};

/*
 * A template for an IPv6 AR_ENTRY_QUERY has not been created, as the areq
 * doesn't include an IP address in ill_dl_up() (the only place an areq is
 * used).
 */
static areq_t	ip_areq_template = {
	AR_ENTRY_QUERY,			/* cmd */
	sizeof (areq_t)+(2*IP_ADDR_LEN),	/* name offset */
	sizeof (areq_t),	/* name len (filled by ill_arp_alloc) */
	IP_ARP_PROTO_TYPE,	/* protocol, from arp's perspective */
	sizeof (areq_t),	/* target addr offset */
	IP_ADDR_LEN,		/* target addr_length */
	0,			/* flags */
	sizeof (areq_t) + IP_ADDR_LEN,	/* sender addr offset */
	IP_ADDR_LEN,		/* sender addr length */
	6,			/* xmit_count */
	1000,			/* (re)xmit_interval in milliseconds */
	4			/* max # of requests to buffer */
	/* anything else filled in by the code */
};

static arc_t	ip_aru_template = {
	AR_INTERFACE_UP,
	sizeof (arc_t),		/* Name offset */
	sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
};

static arc_t	ip_ard_template = {
	AR_INTERFACE_DOWN,
	sizeof (arc_t),		/* Name offset */
	sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
};

static arc_t	ip_aron_template = {
	AR_INTERFACE_ON,
	sizeof (arc_t),		/* Name offset */
	sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
};

static arc_t	ip_aroff_template = {
	AR_INTERFACE_OFF,
	sizeof (arc_t),		/* Name offset */
	sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
};

static arma_t	ip_arma_multi_template = {
	AR_MAPPING_ADD,
	sizeof (arma_t) + 3*IP_ADDR_LEN + IP_MAX_HW_LEN,
				/* Name offset */
	sizeof (arma_t),	/* Name length (set by ill_arp_alloc) */
	IP_ARP_PROTO_TYPE,
	sizeof (arma_t),			/* proto_addr_offset */
	IP_ADDR_LEN,				/* proto_addr_length */
	sizeof (arma_t) + IP_ADDR_LEN,		/* proto_mask_offset */
	sizeof (arma_t) + 2*IP_ADDR_LEN,	/* proto_extract_mask_offset */
	ACE_F_PERMANENT | ACE_F_MAPPING,	/* flags */
	sizeof (arma_t) + 3*IP_ADDR_LEN,	/* hw_addr_offset */
	IP_MAX_HW_LEN,				/* hw_addr_length */
	0,					/* hw_mapping_start */
};

static ipft_t	ip_ioctl_ftbl[] = {
	{ IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 },
	{ IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t),
	    IPFT_F_NO_REPLY },
	{ IP_IOC_IRE_ADVISE_NO_REPLY, ip_ire_advise, sizeof (ipic_t),
	    IPFT_F_NO_REPLY },
	{ IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY },
	{ 0 }
};
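/*
 * The table above is scanned for a matching ipft_cmd when an IP-private
 * ioctl arrives (see ip_wput_ioctl()): the payload must be at least
 * ipft_min_size bytes before the ipft_pfi handler is called, and the
 * IPFT_F_NO_REPLY / IPFT_F_SELF_REPLY flags determine whether the caller
 * acks the ioctl or leaves that to the handler.
 */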
/* Simple ICMP IP Header Template */
static ipha_t icmp_ipha = {
	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
};

/* Flag descriptors for ip_ipif_report */
static nv_t	ipif_nv_tbl[] = {
	{ IPIF_UP,		"UP" },
	{ IPIF_BROADCAST,	"BROADCAST" },
	{ ILLF_DEBUG,		"DEBUG" },
	{ PHYI_LOOPBACK,	"LOOPBACK" },
	{ IPIF_POINTOPOINT,	"POINTOPOINT" },
	{ ILLF_NOTRAILERS,	"NOTRAILERS" },
	{ PHYI_RUNNING,		"RUNNING" },
	{ ILLF_NOARP,		"NOARP" },
	{ PHYI_PROMISC,		"PROMISC" },
	{ PHYI_ALLMULTI,	"ALLMULTI" },
	{ PHYI_INTELLIGENT,	"INTELLIGENT" },
	{ ILLF_MULTICAST,	"MULTICAST" },
	{ PHYI_MULTI_BCAST,	"MULTI_BCAST" },
	{ IPIF_UNNUMBERED,	"UNNUMBERED" },
	{ IPIF_DHCPRUNNING,	"DHCP" },
	{ IPIF_PRIVATE,		"PRIVATE" },
	{ IPIF_NOXMIT,		"NOXMIT" },
	{ IPIF_NOLOCAL,		"NOLOCAL" },
	{ IPIF_DEPRECATED,	"DEPRECATED" },
	{ IPIF_PREFERRED,	"PREFERRED" },
	{ IPIF_TEMPORARY,	"TEMPORARY" },
	{ IPIF_ADDRCONF,	"ADDRCONF" },
	{ PHYI_VIRTUAL,		"VIRTUAL" },
	{ ILLF_ROUTER,		"ROUTER" },
	{ ILLF_NONUD,		"NONUD" },
	{ IPIF_ANYCAST,		"ANYCAST" },
	{ ILLF_NORTEXCH,	"NORTEXCH" },
	{ ILLF_IPV4,		"IPV4" },
	{ ILLF_IPV6,		"IPV6" },
	{ IPIF_MIPRUNNING,	"MIP" },
	{ IPIF_NOFAILOVER,	"NOFAILOVER" },
	{ PHYI_FAILED,		"FAILED" },
	{ PHYI_STANDBY,		"STANDBY" },
	{ PHYI_INACTIVE,	"INACTIVE" },
	{ PHYI_OFFLINE,		"OFFLINE" },
};

static uchar_t	ip_six_byte_all_ones[] =
    { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };

static ip_m_t	ip_m_tbl[] = {
	{ DL_ETHER, IFT_ETHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
	    ip_ether_v6intfid },
	{ DL_CSMACD, IFT_ISO88023, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
	    ip_nodef_v6intfid },
	{ DL_TPB, IFT_ISO88024, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
	    ip_nodef_v6intfid },
	{ DL_TPR, IFT_ISO88025, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
	    ip_nodef_v6intfid },
	{ DL_FDDI, IFT_FDDI, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
	    ip_ether_v6intfid },
	{ DL_IB, IFT_IB, ip_ib_v4mapinfo, ip_ib_v6mapinfo,
	    ip_ib_v6intfid },
	{ SUNW_DL_VNI, IFT_OTHER, NULL, NULL, NULL},
	{ DL_OTHER, IFT_OTHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
	    ip_nodef_v6intfid }
};

static ill_t	ill_null;		/* Empty ILL for init. */
char	ipif_loopback_name[] = "lo0";
static char *ipv4_forward_suffix = ":ip_forwarding";
static char *ipv6_forward_suffix = ":ip6_forwarding";
static kstat_t	*loopback_ksp = NULL;
static	sin6_t	sin6_null;	/* Zero address for quick clears */
static	sin_t	sin_null;	/* Zero address for quick clears */
static	uint_t	ill_index = 1;	/* Used to assign interface indices */
/* When set search for unused index */
static boolean_t ill_index_wrap = B_FALSE;
/* When set search for unused ipif_seqid */
static ipif_t	ipif_zero;
uint_t	ipif_src_random;

/*
 * For details on the protection offered by these locks please refer
 * to the notes under the Synchronization section at the start of ip.c
 */
krwlock_t	ill_g_lock;		/* The global ill_g_lock */
kmutex_t	ip_addr_avail_lock;	/* Address availability check lock */
ipsq_t		*ipsq_g_head;		/* List of all ipsq's on the system */

krwlock_t ill_g_usesrc_lock;		/* Protects usesrc related fields */

/*
 * illgrp_head/ifgrp_head is protected by IP's perimeter.
 */
static ill_group_t *illgrp_head_v4;	/* Head of IPv4 ill groups */
ill_group_t *illgrp_head_v6;		/* Head of IPv6 ill groups */

ill_g_head_t	ill_g_heads[MAX_G_HEADS];	/* ILL List Head */

/*
 * ppa arena is created after these many
 * interfaces have been plumbed.
 */
uint_t	ill_no_arena = 12;

#pragma align CACHE_ALIGN_SIZE(phyint_g_list)
static phyint_list_t	phyint_g_list;	/* start of phyint list */

/*
 * Reflects the value of the FAILBACK variable in the IPMP config file
 * /etc/default/mpathd. Default value is B_TRUE.
 * Set to B_FALSE if the user disabled failback by configuring "FAILBACK=no".
 * in.mpathd uses the SIOCSIPMPFAILBACK ioctl to pass this information to
 * the kernel.
 */
static boolean_t ipmp_enable_failback = B_TRUE;

/*
 * Enable soft rings if ip_squeue_soft_ring or ip_squeue_fanout
 * is set and ip_soft_rings_cnt > 0. ip_squeue_soft_ring is
 * set through platform specific code (Niagara/Ontario).
 */
#define	SOFT_RINGS_ENABLED()	(ip_soft_rings_cnt ? \
	(ip_squeue_soft_ring || ip_squeue_fanout) : B_FALSE)

#define	ILL_CAPAB_DLS	(ILL_CAPAB_SOFT_RING | ILL_CAPAB_POLL)

static uint_t
ipif_rand(void)
{
	ipif_src_random = ipif_src_random * 1103515245 + 12345;
	return ((ipif_src_random >> 16) & 0x7fff);
}
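/*
 * ipif_rand() above is the classic C-library linear congruential generator
 * (the 1103515245/12345 constants from the traditional rand()); taking
 * bits 16..30 of the state yields 15 pseudo-random bits per call, which is
 * sufficient for spreading source-address selection.
 */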
/*
 * Allocate per-interface mibs. Only used for ipv6.
 * Returns true if ok. False otherwise.
 * ipsq may not yet be allocated (loopback case).
 */
static boolean_t
ill_allocate_mibs(ill_t *ill)
{
	ASSERT(ill->ill_isv6);

	/* Already allocated? */
	if (ill->ill_ip6_mib != NULL) {
		ASSERT(ill->ill_icmp6_mib != NULL);
		return (B_TRUE);
	}

	ill->ill_ip6_mib = kmem_zalloc(sizeof (*ill->ill_ip6_mib),
	    KM_NOSLEEP);
	if (ill->ill_ip6_mib == NULL) {
		return (B_FALSE);
	}
	ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib),
	    KM_NOSLEEP);
	if (ill->ill_icmp6_mib == NULL) {
		kmem_free(ill->ill_ip6_mib, sizeof (*ill->ill_ip6_mib));
		ill->ill_ip6_mib = NULL;
		return (B_FALSE);
	}
	/*
	 * The ipv6Ifindex and ipv6IfIcmpIndex will be assigned later
	 * after the phyint merge occurs in ipif_set_values ->
	 * ill_glist_insert -> ill_phyint_reinit
	 */
	return (B_TRUE);
}

/*
 * Common code for preparation of ARP commands.  Two points to remember:
 *	1) The ill_name is tacked on at the end of the allocated space so
 *	   the templates name_offset field must contain the total space
 *	   to allocate less the name length.
 *
 *	2) The templates name_length field should contain the *template*
 *	   length.  We use it as a parameter to bcopy() and then write
 *	   the real ill_name_length into the name_length field of the copy.
 * (Always called as writer.)
 */
mblk_t *
ill_arp_alloc(ill_t *ill, uchar_t *template, caddr_t addr)
{
	arc_t	*arc = (arc_t *)template;
	char	*cp;
	int	len;
	mblk_t	*mp;
	uint_t	name_length = ill->ill_name_length;
	uint_t	template_len = arc->arc_name_length;

	len = arc->arc_name_offset + name_length;
	mp = allocb(len, BPRI_HI);
	if (mp == NULL)
		return (NULL);
	cp = (char *)mp->b_rptr;
	mp->b_wptr = (uchar_t *)&cp[len];
	if (template_len)
		bcopy(template, cp, template_len);
	if (len > template_len)
		bzero(&cp[template_len], len - template_len);
	mp->b_datap->db_type = M_PROTO;

	arc = (arc_t *)cp;
	arc->arc_name_length = name_length;
	cp = (char *)arc + arc->arc_name_offset;
	bcopy(ill->ill_name, cp, name_length);

	if (addr) {
		area_t	*area = (area_t *)mp->b_rptr;

		cp = (char *)area + area->area_proto_addr_offset;
		bcopy(addr, cp, area->area_proto_addr_length);
		if (area->area_cmd == AR_ENTRY_ADD) {
			cp = (char *)area;
			len = area->area_proto_addr_length;
			if (area->area_proto_mask_offset)
				cp += area->area_proto_mask_offset;
			else
				cp += area->area_proto_addr_offset + len;
			while (len-- > 0)
				*cp++ = (char)~0;
		}
	}
	return (mp);
}
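/*
 * For example, with ip_ared_template the arc_name_offset is
 * sizeof (ared_t) + IP_ADDR_LEN, so ill_arp_alloc() allocates that plus
 * ill_name_length, copies the template, writes the protocol address right
 * after the ared_t, and tacks the ill name (e.g. "lo0") onto the end.
 */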
mblk_t *
ipif_area_alloc(ipif_t *ipif)
{
	return (ill_arp_alloc(ipif->ipif_ill, (uchar_t *)&ip_area_template,
	    (char *)&ipif->ipif_lcl_addr));
}

mblk_t *
ipif_ared_alloc(ipif_t *ipif)
{
	return (ill_arp_alloc(ipif->ipif_ill, (uchar_t *)&ip_ared_template,
	    (char *)&ipif->ipif_lcl_addr));
}

mblk_t *
ill_ared_alloc(ill_t *ill, ipaddr_t addr)
{
	return (ill_arp_alloc(ill, (uchar_t *)&ip_ared_template,
	    (char *)&addr));
}

/*
 * Completely vaporize a lower level tap and all associated interfaces.
 * ill_delete is called only out of ip_close when the device control
 * stream is being closed.
 */
void
ill_delete(ill_t *ill)
{
	ipif_t	*ipif;
	ill_t	*prev_ill;

	/*
	 * ill_delete may be forcibly entering the ipsq. The previous
	 * ioctl may not have completed and may need to be aborted.
	 * ipsq_flush takes care of it. If we don't need to enter the
	 * ipsq forcibly, the 2nd invocation of ipsq_flush in
	 * ill_delete_tail is sufficient.
	 */
	ipsq_flush(ill);

	/*
	 * Nuke all interfaces.  ipif_free will take down the interface,
	 * remove it from the list, and free the data structure.
	 * Walk down the ipif list and remove the logical interfaces
	 * first before removing the main ipif. We can't unplumb
	 * zeroth interface first in the case of IPv6 as reset_conn_ill
	 * -> ip_ll_delmulti_v6 de-references ill_ipif for checking
	 * POINTOPOINT.
	 *
	 * If ill_ipif was not properly initialized (i.e., low on memory),
	 * then there are no interfaces to clean up. In this case just
	 * clean up the ill.
	 */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		ipif_free(ipif);

	/*
	 * Used only by ill_arp_on and ill_arp_off, which are writers.
	 * So nobody can be using this mp now. Free the mp allocated for
	 * honoring ILLF_NOARP
	 */
	freemsg(ill->ill_arp_on_mp);
	ill->ill_arp_on_mp = NULL;

	/* Clean up msgs on pending upcalls for mrouted */
	reset_mrt_ill(ill);

	/*
	 * ipif_free -> reset_conn_ipif will remove all multicast
	 * references for IPv4. For IPv6, we need to do it here as
	 * it points only at ills.
	 */
	reset_conn_ill(ill);

	/*
	 * ill_down will arrange to blow off any IRE's dependent on this
	 * ILL, and shut down fragmentation reassembly.
	 */
	ill_down(ill);

	/* Let SCTP know, so that it can remove this from its list. */
	sctp_update_ill(ill, SCTP_ILL_REMOVE);

	/*
	 * If an address on this ILL is being used as a source address then
	 * clear out the pointers in other ILLs that point to this ILL.
	 */
	rw_enter(&ill_g_usesrc_lock, RW_WRITER);
	if (ill->ill_usesrc_grp_next != NULL) {
		if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? */
			ill_disband_usesrc_group(ill);
		} else {	/* consumer of the usesrc ILL */
			prev_ill = ill_prev_usesrc(ill);
			prev_ill->ill_usesrc_grp_next =
			    ill->ill_usesrc_grp_next;
		}
	}
	rw_exit(&ill_g_usesrc_lock);
}

static void
ipif_non_duplicate(ipif_t *ipif)
{
	ill_t *ill = ipif->ipif_ill;

	mutex_enter(&ill->ill_lock);
	if (ipif->ipif_flags & IPIF_DUPLICATE) {
		ipif->ipif_flags &= ~IPIF_DUPLICATE;
		ASSERT(ill->ill_ipif_dup_count > 0);
		ill->ill_ipif_dup_count--;
	}
	mutex_exit(&ill->ill_lock);
}

/*
 * ill_delete_tail is called from ip_modclose after all references
 * to the closing ill are gone. The wait is done in ip_modclose
 */
void
ill_delete_tail(ill_t *ill)
{
	mblk_t	**mpp;
	ipif_t	*ipif;

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		ipif_non_duplicate(ipif);
		ipif_down_tail(ipif);
	}

	ASSERT(ill->ill_ipif_dup_count == 0 &&
	    ill->ill_arp_down_mp == NULL &&
	    ill->ill_arp_del_mapping_mp == NULL);

	/*
	 * If polling capability is enabled (which signifies direct
	 * upcall into IP and driver has ill saved as a handle),
	 * we need to make sure that unbind has completed before we
	 * let the ill disappear and driver no longer has any reference
	 * to this ill.
	 */
	mutex_enter(&ill->ill_lock);
	while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS)
		cv_wait(&ill->ill_cv, &ill->ill_lock);
	mutex_exit(&ill->ill_lock);

	/*
	 * Clean up polling and soft ring capabilities
	 */
	if (ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING))
		ill_capability_dls_disable(ill);

	/*
	 * Send the detach if there's one to send (i.e., if we're above a
	 * style 2 DLPI driver).
	 */
	if (ill->ill_detach_mp != NULL) {
		ill_dlpi_send(ill, ill->ill_detach_mp);
		ill->ill_detach_mp = NULL;
	}

	if (ill->ill_net_type != IRE_LOOPBACK)
		qprocsoff(ill->ill_rq);

	/*
	 * We do an ipsq_flush once again now. New messages could have
	 * landed up from below (M_ERROR or M_HANGUP). Similarly ioctls
	 * could also have landed up if an ioctl thread had looked up
	 * the ill before we set the ILL_CONDEMNED flag, but not yet
	 * enqueued the ioctl when we did the ipsq_flush last time.
	 */
	ipsq_flush(ill);

	/*
	 * Free capabilities.
	 */
	if (ill->ill_ipsec_capab_ah != NULL) {
		ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_AH);
		ill_ipsec_capab_free(ill->ill_ipsec_capab_ah);
		ill->ill_ipsec_capab_ah = NULL;
	}

	if (ill->ill_ipsec_capab_esp != NULL) {
		ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_ESP);
		ill_ipsec_capab_free(ill->ill_ipsec_capab_esp);
		ill->ill_ipsec_capab_esp = NULL;
	}

	if (ill->ill_mdt_capab != NULL) {
		kmem_free(ill->ill_mdt_capab, sizeof (ill_mdt_capab_t));
		ill->ill_mdt_capab = NULL;
	}

	if (ill->ill_hcksum_capab != NULL) {
		kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t));
		ill->ill_hcksum_capab = NULL;
	}

	if (ill->ill_zerocopy_capab != NULL) {
		kmem_free(ill->ill_zerocopy_capab,
		    sizeof (ill_zerocopy_capab_t));
		ill->ill_zerocopy_capab = NULL;
	}

	if (ill->ill_dls_capab != NULL) {
		CONN_DEC_REF(ill->ill_dls_capab->ill_unbind_conn);
		ill->ill_dls_capab->ill_unbind_conn = NULL;
		kmem_free(ill->ill_dls_capab,
		    sizeof (ill_dls_capab_t) +
		    (sizeof (ill_rx_ring_t) * ILL_MAX_RINGS));
		ill->ill_dls_capab = NULL;
	}

	ASSERT(!(ill->ill_capabilities & ILL_CAPAB_POLL));

	while (ill->ill_ipif != NULL)
		ipif_free_tail(ill->ill_ipif);

	ill_down_tail(ill);

	/*
	 * We have removed all references to ilm from conn and the ones joined
	 * within the kernel.
	 *
	 * We don't walk conns, mrts and ires because
	 *
	 * 1) reset_conn_ill and reset_mrt_ill cleans up conns and mrts.
	 * 2) ill_down -> ill_downi walks all the ires and cleans up
	 *    ill references.
	 */
	ASSERT(ilm_walk_ill(ill) == 0);
	/*
	 * Take us out of the list of ILLs. ill_glist_delete -> ill_phyint_free
	 * could free the phyint. No more reference to the phyint after this
	 * point.
	 */
	(void) ill_glist_delete(ill);

	rw_enter(&ip_g_nd_lock, RW_WRITER);
	if (ill->ill_ndd_name != NULL)
		nd_unload(&ip_g_nd, ill->ill_ndd_name);
	rw_exit(&ip_g_nd_lock);

	if (ill->ill_frag_ptr != NULL) {
		uint_t count;

		for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
			mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock);
		}
		mi_free(ill->ill_frag_ptr);
		ill->ill_frag_ptr = NULL;
		ill->ill_frag_hash_tbl = NULL;
	}
	if (ill->ill_nd_lla_mp != NULL)
		freemsg(ill->ill_nd_lla_mp);
	/* Free all retained control messages. */
	mpp = &ill->ill_first_mp_to_free;
	do {
		while (mpp[0]) {
			mblk_t	*mp;
			mblk_t	*mp1;

			mp = mpp[0];
			mpp[0] = mp->b_next;
			for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) {
				mp1->b_next = NULL;
				mp1->b_prev = NULL;
			}
			freemsg(mp);
		}
	} while (mpp++ != &ill->ill_last_mp_to_free);

	ill_free_mib(ill);
	ILL_TRACE_CLEANUP(ill);
}

static void
ill_free_mib(ill_t *ill)
{
	if (ill->ill_ip6_mib != NULL) {
		kmem_free(ill->ill_ip6_mib, sizeof (*ill->ill_ip6_mib));
		ill->ill_ip6_mib = NULL;
	}
	if (ill->ill_icmp6_mib != NULL) {
		kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib));
		ill->ill_icmp6_mib = NULL;
	}
}
/*
 * Concatenate together a physical address and a sap.
 *
 * Sap_lengths are interpreted as follows:
 *	sap_length == 0	==>	no sap
 *	sap_length > 0	==>	sap is at the head of the dlpi address
 *	sap_length < 0	==>	sap is at the tail of the dlpi address
 */
static void
ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length,
    t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst)
{
	uint16_t sap_addr = (uint16_t)sap_src;

	if (sap_length == 0) {
		if (phys_src == NULL)
			bzero(dst, phys_length);
		else
			bcopy(phys_src, dst, phys_length);
	} else if (sap_length < 0) {
		if (phys_src == NULL)
			bzero(dst, phys_length);
		else
			bcopy(phys_src, dst, phys_length);
		bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr));
	} else {
		bcopy(&sap_addr, dst, sizeof (sap_addr));
		if (phys_src == NULL)
			bzero((char *)dst + sap_length, phys_length);
		else
			bcopy(phys_src, (char *)dst + sap_length, phys_length);
	}
}

/*
 * Generate a dl_unitdata_req mblk for the device and address given.
 * addr_length is the length of the physical portion of the address.
 * If addr is NULL include an all zero address of the specified length.
 * TRUE? In any case, addr_length is taken to be the entire length of the
 * dlpi address, including the absolute value of sap_length.
 */
mblk_t *
ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap,
    t_scalar_t sap_length)
{
	dl_unitdata_req_t *dlur;
	mblk_t	*mp;
	t_scalar_t	abs_sap_length;		/* absolute value */

	abs_sap_length = ABS(sap_length);
	mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length,
	    DL_UNITDATA_REQ);
	if (mp == NULL)
		return (NULL);
	dlur = (dl_unitdata_req_t *)mp->b_rptr;
	/* HACK: accommodate incompatible DLPI drivers */
	if (addr_length == 8)
		addr_length = 6;
	dlur->dl_dest_addr_length = addr_length + abs_sap_length;
	dlur->dl_dest_addr_offset = sizeof (*dlur);
	dlur->dl_priority.dl_min = 0;
	dlur->dl_priority.dl_max = 0;
	ill_dlur_copy_address(addr, addr_length, sap, sap_length,
	    (uchar_t *)&dlur[1]);
	return (mp);
}
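/*
 * For example, with a provider whose dl_sap_length is -2 (as Ethernet
 * drivers report), the address portion of the generated DL_UNITDATA_REQ
 * is 8 bytes: the 6-byte MAC followed by the 16-bit sap, exactly the
 * tail placement ill_dlur_copy_address() uses for a negative sap_length.
 */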
/*
 * Add the 'mp' to the list of pending mp's headed by ill_pending_mp
 * Return an error if we already have 1 or more ioctls in progress.
 * This is used only for non-exclusive ioctls. Currently this is used
 * for SIOC*ARP and SIOCGTUNPARAM ioctls. Most set ioctls are exclusive
 * and thus need to use ipsq_pending_mp_add.
 */
boolean_t
ill_pending_mp_add(ill_t *ill, conn_t *connp, mblk_t *add_mp)
{
	ASSERT(MUTEX_HELD(&ill->ill_lock));
	ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
	/*
	 * M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls.
	 */
	ASSERT((add_mp->b_datap->db_type == M_IOCDATA) ||
	    (add_mp->b_datap->db_type == M_IOCTL));

	ASSERT(MUTEX_HELD(&connp->conn_lock));
	/*
	 * Return error if the conn has started closing. The conn
	 * could have finished cleaning up the pending mp list.
	 * If so we should not add another mp to the list negating
	 * the cleanup.
	 */
	if (connp->conn_state_flags & CONN_CLOSING)
		return (B_FALSE);
	/*
	 * Add the pending mp to the head of the list, chained by b_next.
	 * Note down the conn's write queue in b_queue; this will be used
	 * to later get the conn, when we get a response on the ill queue
	 * from some other module (typically arp).
	 */
	add_mp->b_next = (void *)ill->ill_pending_mp;
	add_mp->b_queue = CONNP_TO_WQ(connp);
	ill->ill_pending_mp = add_mp;
	if (connp != NULL)
		connp->conn_oper_pending_ill = ill;
	return (B_TRUE);
}

/*
 * Retrieve the ill_pending_mp and return it. We have to walk the list
 * of mblks starting at ill_pending_mp, and match based on the ioc_id.
 */
mblk_t *
ill_pending_mp_get(ill_t *ill, conn_t **connpp, uint_t ioc_id)
{
	mblk_t	*prev = NULL;
	mblk_t	*curr = NULL;
	uint_t	id;
	conn_t	*connp;

	/*
	 * When the conn closes, conn_ioctl_cleanup needs to clean
	 * up the pending mp, but it does not know the ioc_id and
	 * passes in a zero for it.
	 */
	mutex_enter(&ill->ill_lock);
	if (ioc_id != 0)
		*connpp = NULL;

	/* Search the list for the appropriate ioctl based on ioc_id */
	for (prev = NULL, curr = ill->ill_pending_mp; curr != NULL;
	    prev = curr, curr = curr->b_next) {
		id = ((struct iocblk *)curr->b_rptr)->ioc_id;
		connp = Q_TO_CONN(curr->b_queue);
		/* Match based on the ioc_id or based on the conn */
		if ((id == ioc_id) || (ioc_id == 0 && connp == *connpp))
			break;
	}

	if (curr != NULL) {
		/* Unlink the mblk from the pending mp list */
		if (prev != NULL) {
			prev->b_next = curr->b_next;
		} else {
			ASSERT(ill->ill_pending_mp == curr);
			ill->ill_pending_mp = curr->b_next;
		}

		/*
		 * conn refcnt must have been bumped up at the start of
		 * the ioctl. So we can safely access the conn.
		 */
		ASSERT(CONN_Q(curr->b_queue));
		*connpp = Q_TO_CONN(curr->b_queue);
		curr->b_next = NULL;
		curr->b_queue = NULL;
	}

	mutex_exit(&ill->ill_lock);

	return (curr);
}
/*
 * Add the pending mp to the list. There can be only 1 pending mp
 * in the list. Any exclusive ioctl that needs to wait for a response
 * from another module or driver needs to use this function to set
 * the ipsq_pending_mp to the ioctl mblk and wait for the response from
 * the other module/driver. This is also used while waiting for the
 * ipif/ill/ire refcnts to drop to zero in bringing down an ipif.
 */
boolean_t
ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp,
    int waitfor)
{
	ipsq_t	*ipsq;

	ASSERT(IAM_WRITER_IPIF(ipif));
	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
	ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
	/*
	 * M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls,
	 * M_ERROR/M_HANGUP from driver
	 */
	ASSERT((DB_TYPE(add_mp) == M_IOCDATA) ||
	    (DB_TYPE(add_mp) == M_IOCTL) ||
	    (DB_TYPE(add_mp) == M_ERROR) || (DB_TYPE(add_mp) == M_HANGUP));

	ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq;
	if (connp != NULL) {
		ASSERT(MUTEX_HELD(&connp->conn_lock));
		/*
		 * Return error if the conn has started closing. The conn
		 * could have finished cleaning up the pending mp list.
		 * If so we should not add another mp to the list negating
		 * the cleanup.
		 */
		if (connp->conn_state_flags & CONN_CLOSING)
			return (B_FALSE);
	}
	mutex_enter(&ipsq->ipsq_lock);
	ipsq->ipsq_pending_ipif = ipif;
	/*
	 * Note down the queue in b_queue. This will be returned by
	 * ipsq_pending_mp_get. Caller will then use these values to restart
	 * the processing
	 */
	add_mp->b_next = NULL;
	add_mp->b_queue = q;
	ipsq->ipsq_pending_mp = add_mp;
	ipsq->ipsq_waitfor = waitfor;
	/*
	 * ipsq_current_ipif is needed to restart the operation from
	 * ipif_ill_refrele_tail when the last reference to the ipif/ill
	 * is gone. Since this is not an ioctl ipsq_current_ipif has not
	 * been set until now.
	 */
	if (DB_TYPE(add_mp) == M_ERROR || DB_TYPE(add_mp) == M_HANGUP) {
		ASSERT(ipsq->ipsq_current_ipif == NULL);
		ipsq->ipsq_current_ipif = ipif;
		ipsq->ipsq_last_cmd = DB_TYPE(add_mp);
	}
	if (connp != NULL)
		connp->conn_oper_pending_ill = ipif->ipif_ill;
	mutex_exit(&ipsq->ipsq_lock);
	return (B_TRUE);
}

/*
 * Retrieve the ipsq_pending_mp and return it. There can be only 1 mp
 * queued in the list.
 */
mblk_t *
ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp)
{
	mblk_t	*curr = NULL;

	mutex_enter(&ipsq->ipsq_lock);
	*connpp = NULL;
	if (ipsq->ipsq_pending_mp == NULL) {
		mutex_exit(&ipsq->ipsq_lock);
		return (NULL);
	}

	/* There can be only 1 such excl message */
	curr = ipsq->ipsq_pending_mp;
	ASSERT(curr != NULL && curr->b_next == NULL);
	ipsq->ipsq_pending_ipif = NULL;
	ipsq->ipsq_pending_mp = NULL;
	ipsq->ipsq_waitfor = 0;
	mutex_exit(&ipsq->ipsq_lock);

	if (CONN_Q(curr->b_queue)) {
		/*
		 * This mp did a refhold on the conn, at the start of the
		 * ioctl. So we can safely return a pointer to the conn
		 * to the caller.
		 */
		*connpp = Q_TO_CONN(curr->b_queue);
	} else {
		*connpp = NULL;
	}
	curr->b_next = NULL;
	curr->b_prev = NULL;
	return (curr);
}
/*
 * Cleanup the ioctl mp queued in ipsq_pending_mp
 *	- Called in the ill_delete path
 *	- Called in the M_ERROR or M_HANGUP path on the ill.
 *	- Called in the conn close path.
 */
boolean_t
ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp)
{
	mblk_t	*mp;
	ipsq_t	*ipsq;
	queue_t	*q;
	ipif_t	*ipif;

	ASSERT(IAM_WRITER_ILL(ill));
	ipsq = ill->ill_phyint->phyint_ipsq;
	mutex_enter(&ipsq->ipsq_lock);
	/*
	 * If connp is null, unconditionally clean up the ipsq_pending_mp.
	 * This happens in M_ERROR/M_HANGUP. We need to abort the current
	 * ioctl even if it is meant for another ill, since we have to
	 * enqueue a new mp now in ipsq_pending_mp to complete the ipif_down.
	 * If connp is non-null we are called from the conn close path.
	 */
	mp = ipsq->ipsq_pending_mp;
	if (mp == NULL || (connp != NULL &&
	    mp->b_queue != CONNP_TO_WQ(connp))) {
		mutex_exit(&ipsq->ipsq_lock);
		return (B_FALSE);
	}
	/* Now remove from the ipsq_pending_mp */
	ipsq->ipsq_pending_mp = NULL;
	q = mp->b_queue;
	mp->b_next = NULL;
	mp->b_prev = NULL;
	mp->b_queue = NULL;

	/* If MOVE was in progress, clear the move_in_progress fields also. */
	ill = ipsq->ipsq_pending_ipif->ipif_ill;
	if (ill->ill_move_in_progress) {
		ILL_CLEAR_MOVE(ill);
	} else if (ill->ill_up_ipifs) {
		ill_group_cleanup(ill);
	}

	ipif = ipsq->ipsq_pending_ipif;
	ipsq->ipsq_pending_ipif = NULL;
	ipsq->ipsq_waitfor = 0;
	ipsq->ipsq_current_ipif = NULL;
	mutex_exit(&ipsq->ipsq_lock);

	if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) {
		ip_ioctl_finish(q, mp, ENXIO, connp != NULL ? CONN_CLOSE :
		    NO_COPYOUT, connp != NULL ? ipif : NULL, NULL);
	} else {
		/*
		 * IP-MT XXX In the case of TLI/XTI bind / optmgmt this can't
		 * be just inet_freemsg. We have to restart it
		 * otherwise the thread will be stuck.
		 */
		inet_freemsg(mp);
	}
	return (B_TRUE);
}

/*
 * The ill is closing. Cleanup all the pending mps. Called exclusively
 * towards the end of ill_delete. The refcount has gone to 0. So nobody
 * knows this ill, and hence nobody can add an mp to this list
 */
static void
ill_pending_mp_cleanup(ill_t *ill)
{
	mblk_t	*mp;
	queue_t	*q;

	ASSERT(IAM_WRITER_ILL(ill));

	mutex_enter(&ill->ill_lock);
	/*
	 * Every mp on the pending mp list originating from an ioctl
	 * added 1 to the conn refcnt, at the start of the ioctl.
	 * So bump it down now. See comments in ip_wput_nondata()
	 */
	while (ill->ill_pending_mp != NULL) {
		mp = ill->ill_pending_mp;
		ill->ill_pending_mp = mp->b_next;
		mutex_exit(&ill->ill_lock);

		q = mp->b_queue;
		ASSERT(CONN_Q(q));
		mp->b_next = NULL;
		mp->b_prev = NULL;
		mp->b_queue = NULL;
		ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL, NULL);
		mutex_enter(&ill->ill_lock);
	}
	ill->ill_pending_ipif = NULL;

	mutex_exit(&ill->ill_lock);
}

/*
 * Called in the conn close path and ill delete path
 */
static void
ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp)
{
	ipsq_t	*ipsq;
	mblk_t	*prev;
	mblk_t	*curr;
	mblk_t	*next;
	queue_t	*q;
	mblk_t	*tmp_list = NULL;

	ASSERT(IAM_WRITER_ILL(ill));
	if (connp != NULL)
		q = CONNP_TO_WQ(connp);
	else
		q = ill->ill_wq;

	ipsq = ill->ill_phyint->phyint_ipsq;
	/*
	 * Cleanup the ioctl mp's queued in ipsq_xopq_pending_mp if any.
	 * In the case of ioctl from a conn, there can be only 1 mp
	 * queued on the ipsq. If an ill is being unplumbed, only messages
	 * related to this ill are flushed, like M_ERROR or M_HANGUP message.
	 * ioctls meant for this ill from conns are not flushed. They will
	 * be processed during ipsq_exit and will not find the ill and will
	 * return error.
	 */
	mutex_enter(&ipsq->ipsq_lock);
	for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL;
	    curr = next) {
		next = curr->b_next;
		if (curr->b_queue == q || curr->b_queue == RD(q)) {
			/* Unlink the mblk from the pending mp list */
			if (prev != NULL) {
				prev->b_next = curr->b_next;
			} else {
				ASSERT(ipsq->ipsq_xopq_mphead == curr);
				ipsq->ipsq_xopq_mphead = curr->b_next;
			}
			if (ipsq->ipsq_xopq_mptail == curr)
				ipsq->ipsq_xopq_mptail = prev;
			/*
			 * Create a temporary list and release the ipsq lock.
			 * New elements are added to the head of the tmp_list
			 */
			curr->b_next = tmp_list;
			tmp_list = curr;
		} else {
			prev = curr;
		}
	}
	mutex_exit(&ipsq->ipsq_lock);

	while (tmp_list != NULL) {
		curr = tmp_list;
		tmp_list = curr->b_next;
		curr->b_next = NULL;
		curr->b_prev = NULL;
		curr->b_queue = NULL;
		if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) {
			ip_ioctl_finish(q, curr, ENXIO, connp != NULL ?
			    CONN_CLOSE : NO_COPYOUT, NULL, NULL);
		} else {
			/*
			 * IP-MT XXX In the case of TLI/XTI bind / optmgmt
			 * this can't be just inet_freemsg. We have to
			 * restart it otherwise the thread will be stuck.
			 */
			inet_freemsg(curr);
		}
	}
}

/*
 * This conn has started closing. Cleanup any pending ioctl from this conn.
 * STREAMS ensures that there can be at most 1 ioctl pending on a stream.
 */
void
conn_ioctl_cleanup(conn_t *connp)
{
	mblk_t *curr;
	ipsq_t	*ipsq;
	ill_t	*ill;
	boolean_t refheld;

	/*
	 * Is any exclusive ioctl pending? If so clean it up. If the
	 * ioctl has not yet started, the mp is pending in the list headed by
	 * ipsq_xopq_head. If the ioctl has started the mp could be present in
	 * ipsq_pending_mp. If the ioctl timed out in the streamhead but
	 * is currently executing now the mp is not queued anywhere but
	 * conn_oper_pending_ill is null. The conn close will wait
	 * till the conn_ref drops to zero.
	 */
	mutex_enter(&connp->conn_lock);
	ill = connp->conn_oper_pending_ill;
	if (ill == NULL) {
		mutex_exit(&connp->conn_lock);
		return;
	}

	curr = ill_pending_mp_get(ill, &connp, 0);
	if (curr != NULL) {
		mutex_exit(&connp->conn_lock);
		CONN_DEC_REF(connp);
		inet_freemsg(curr);
		return;
	}
	/*
	 * We may not be able to refhold the ill if the ill/ipif
	 * is changing. But we need to make sure that the ill will
	 * not vanish. So we just bump up the ill_waiter count.
	 */
	refheld = ill_waiter_inc(ill);
	mutex_exit(&connp->conn_lock);
	if (refheld) {
		if (ipsq_enter(ill, B_TRUE)) {
			ill_waiter_dcr(ill);
			/*
			 * Check whether this ioctl has started and is
			 * pending now in ipsq_pending_mp. If it is not
			 * found there then check whether this ioctl has
			 * not even started and is in the ipsq_xopq list.
			 */
			if (!ipsq_pending_mp_cleanup(ill, connp))
				ipsq_xopq_mp_cleanup(ill, connp);
			ipsq = ill->ill_phyint->phyint_ipsq;
			ipsq_exit(ipsq, B_TRUE, B_TRUE);
			return;
		}
	}

	/*
	 * The ill is also closing and we could not bump up the
	 * ill_waiter_count or we could not enter the ipsq.
	 * Leave the cleanup to ill_delete.
	 */
	mutex_enter(&connp->conn_lock);
	while (connp->conn_oper_pending_ill != NULL)
		cv_wait(&connp->conn_refcv, &connp->conn_lock);
	mutex_exit(&connp->conn_lock);
	if (refheld)
		ill_waiter_dcr(ill);
}

/*
 * ipcl_walk function for cleaning up conn_*_ill fields.
 */
static void
conn_cleanup_ill(conn_t *connp, caddr_t arg)
{
	ill_t	*ill = (ill_t *)arg;
	ire_t	*ire;

	mutex_enter(&connp->conn_lock);
	if (connp->conn_multicast_ill == ill) {
		/* Revert to late binding */
		connp->conn_multicast_ill = NULL;
		connp->conn_orig_multicast_ifindex = 0;
	}
	if (connp->conn_incoming_ill == ill)
		connp->conn_incoming_ill = NULL;
	if (connp->conn_outgoing_ill == ill)
		connp->conn_outgoing_ill = NULL;
	if (connp->conn_outgoing_pill == ill)
		connp->conn_outgoing_pill = NULL;
	if (connp->conn_nofailover_ill == ill)
		connp->conn_nofailover_ill = NULL;
	if (connp->conn_xmit_if_ill == ill)
		connp->conn_xmit_if_ill = NULL;
	if (connp->conn_ire_cache != NULL) {
		ire = connp->conn_ire_cache;
		/*
		 * ip_newroute creates IRE_CACHE with ire_stq coming from
		 * interface X and ipif coming from interface Y, if interface
		 * X and Y are part of the same IPMP group. Thus whenever
		 * interface X goes down, remove all references to it by
		 * checking both on ire_ipif and ire_stq.
		 */
		if ((ire->ire_ipif != NULL &&
		    ire->ire_ipif->ipif_ill == ill) ||
		    (ire->ire_type == IRE_CACHE &&
		    ire->ire_stq == ill->ill_wq)) {
			connp->conn_ire_cache = NULL;
			mutex_exit(&connp->conn_lock);
			ire_refrele_notr(ire);
			return;
		}
	}
	mutex_exit(&connp->conn_lock);
}

/* ARGSUSED */
void
ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
	ill_t	*ill = q->q_ptr;
	ipif_t	*ipif;

	ASSERT(IAM_WRITER_IPSQ(ipsq));
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		ipif_non_duplicate(ipif);
		ipif_down_tail(ipif);
	}
	ill_down_tail(ill);
	freemsg(mp);
	ipsq->ipsq_current_ipif = NULL;
}

/*
 * ill_down_start is called when we want to down this ill and bring it up
 * again. It is called when we receive an M_ERROR / M_HANGUP. In this case
 * we shut down all interfaces, but don't tear down any plumbing.
 */
boolean_t
ill_down_start(queue_t *q, mblk_t *mp)
{
	ill_t	*ill;
	ipif_t	*ipif;

	ill = q->q_ptr;

	ASSERT(IAM_WRITER_ILL(ill));

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		(void) ipif_down(ipif, NULL, NULL);

	ill_down(ill);

	(void) ipsq_pending_mp_cleanup(ill, NULL);
	mutex_enter(&ill->ill_lock);
	/*
	 * Atomically test and add the pending mp if references are
	 * still active.
	 */
	if (!ill_is_quiescent(ill)) {
		/*
		 * Get rid of any pending mps and cleanup. Call will
		 * not fail since we are passing a null connp.
		 */
		(void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
		    mp, ILL_DOWN);
		mutex_exit(&ill->ill_lock);
		return (B_FALSE);
	}
	mutex_exit(&ill->ill_lock);
	return (B_TRUE);
}
static void
ill_down(ill_t *ill)
{
	/* Blow off any IREs dependent on this ILL. */
	ire_walk(ill_downi, (char *)ill);

	mutex_enter(&ire_mrtun_lock);
	if (ire_mrtun_count != 0) {
		mutex_exit(&ire_mrtun_lock);
		ire_walk_ill_mrtun(0, 0, ill_downi_mrtun_srcif,
		    (char *)ill, NULL);
	} else {
		mutex_exit(&ire_mrtun_lock);
	}

	/*
	 * If any interface based forwarding table exists,
	 * blow off the ires there dependent on this ill
	 */
	mutex_enter(&ire_srcif_table_lock);
	if (ire_srcif_table_count > 0) {
		mutex_exit(&ire_srcif_table_lock);
		ire_walk_srcif_table_v4(ill_downi_mrtun_srcif, (char *)ill);
	} else {
		mutex_exit(&ire_srcif_table_lock);
	}

	/* Remove any conn_*_ill depending on this ill */
	ipcl_walk(conn_cleanup_ill, (caddr_t)ill);

	if (ill->ill_group != NULL) {
		illgrp_delete(ill);
	}
}

static void
ill_down_tail(ill_t *ill)
{
	int	i;

	/* Destroy ill_srcif_table if it exists */
	/* Lock not reqd really because nobody should be able to access */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_srcif_table != NULL) {
		ill->ill_srcif_refcnt = 0;
		for (i = 0; i < IP_SRCIF_TABLE_SIZE; i++) {
			rw_destroy(&ill->ill_srcif_table[i].irb_lock);
		}
		kmem_free(ill->ill_srcif_table,
		    IP_SRCIF_TABLE_SIZE * sizeof (irb_t));
		ill->ill_srcif_table = NULL;
		ill->ill_srcif_refcnt = 0;
		ill->ill_mrtun_refcnt = 0;
	}
	mutex_exit(&ill->ill_lock);
}

/*
 * ire_walk routine used to delete every IRE that depends on queues
 * associated with 'ill'. (Always called as writer.)
 */
static void
ill_downi(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;

	/*
	 * ip_newroute creates IRE_CACHE with ire_stq coming from
	 * interface X and ipif coming from interface Y, if interface
	 * X and Y are part of the same IPMP group. Thus whenever interface
	 * X goes down, remove all references to it by checking both
	 * on ire_ipif and ire_stq.
	 */
	if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) ||
	    (ire->ire_type == IRE_CACHE && ire->ire_stq == ill->ill_wq)) {
		ire_delete(ire);
	}
}

/*
 * A separate routine for deleting revtun and srcif based routes
 * is needed because these ires are only deleted when the interface
 * is unplumbed. Also these ires have ire_in_ill non-null as well.
 * We want to keep mobile IP specific code separate.
 */
static void
ill_downi_mrtun_srcif(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;

	ASSERT(ire->ire_in_ill != NULL);

	if ((ire->ire_in_ill != NULL && ire->ire_in_ill == ill) ||
	    (ire->ire_stq == ill->ill_wq) || (ire->ire_stq == ill->ill_rq)) {
		ire_delete(ire);
	}
}

/*
 * Remove ire/nce from the fastpath list.
 */
void
ill_fastpath_nack(ill_t *ill)
{
	if (ill->ill_isv6) {
		nce_fastpath_list_dispatch(ill, NULL, NULL);
	} else {
		ire_fastpath_list_dispatch(ill, NULL, NULL);
	}
}
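/*
 * Fastpath in brief: ill_fastpath_probe() (below) sends the driver a
 * DL_IOC_HDR_INFO M_IOCTL whose payload is the dl_unitdata_req_t we would
 * otherwise prepend to every packet.  A driver that supports fastpath
 * M_IOCACKs with that dlur followed by an mblk carrying the ready-made
 * link-layer header, and ill_fastpath_ack() (below) hands the result to
 * the IREs/NCEs waiting on the fastpath list.
 */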
/* Consume an M_IOCACK of the fastpath probe. */
void
ill_fastpath_ack(ill_t *ill, mblk_t *mp)
{
	mblk_t	*mp1 = mp;

	/*
	 * If this was the first attempt, turn on the fastpath probing.
	 */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS)
		ill->ill_dlpi_fastpath_state = IDS_OK;
	mutex_exit(&ill->ill_lock);

	/* Free the M_IOCACK mblk, hold on to the data */
	mp = mp->b_cont;
	freeb(mp1);
	if (mp == NULL)
		return;
	if (mp->b_cont != NULL) {
		/*
		 * Update all IRE's or NCE's that are waiting for
		 * fastpath update.
		 */
		if (ill->ill_isv6) {
			/*
			 * update nce's in the fastpath list.
			 */
			nce_fastpath_list_dispatch(ill,
			    ndp_fastpath_update, mp);
		} else {
			/*
			 * update ire's in the fastpath list.
			 */
			ire_fastpath_list_dispatch(ill,
			    ire_fastpath_update, mp);
			/*
			 * Check if we need to traverse reverse tunnel table.
			 * Since there is only single ire_type (IRE_MIPRTUN)
			 * in the table, we don't need to match on ire_type.
			 * We have to check ire_mrtun_count and not the
			 * ill_mrtun_refcnt since ill_mrtun_refcnt is set
			 * on the incoming ill and here we are dealing with
			 * outgoing ill.
			 */
			mutex_enter(&ire_mrtun_lock);
			if (ire_mrtun_count != 0) {
				mutex_exit(&ire_mrtun_lock);
				ire_walk_ill_mrtun(MATCH_IRE_WQ, IRE_MIPRTUN,
				    (void (*)(ire_t *, void *))
				    ire_fastpath_update, mp, ill);
			} else {
				mutex_exit(&ire_mrtun_lock);
			}
		}
		mp1 = mp->b_cont;
		freeb(mp);
		mp = mp1;
	} else {
		ip0dbg(("ill_fastpath_ack:  no b_cont\n"));
	}

	freeb(mp);
}

/*
 * Throw an M_IOCTL message downstream asking "do you know fastpath?"
 * The data portion of the request is a dl_unitdata_req_t template for
 * what we would send downstream in the absence of a fastpath confirmation.
 */
int
ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp)
{
	struct iocblk	*ioc;
	mblk_t	*mp;

	if (dlur_mp == NULL)
		return (EINVAL);

	mutex_enter(&ill->ill_lock);
	switch (ill->ill_dlpi_fastpath_state) {
	case IDS_FAILED:
		/*
		 * Driver NAKed the first fastpath ioctl - assume it doesn't
		 * support it.
		 */
		mutex_exit(&ill->ill_lock);
		return (ENOTSUP);
	case IDS_UNKNOWN:
		/* This is the first probe */
		ill->ill_dlpi_fastpath_state = IDS_INPROGRESS;
		break;
	default:
		break;
	}
	mutex_exit(&ill->ill_lock);

	if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL)
		return (EAGAIN);

	mp->b_cont = copyb(dlur_mp);
	if (mp->b_cont == NULL) {
		freeb(mp);
		return (EAGAIN);
	}

	ioc = (struct iocblk *)mp->b_rptr;
	ioc->ioc_count = msgdsize(mp->b_cont);

	putnext(ill->ill_wq, mp);
	return (0);
}

void
ill_capability_probe(ill_t *ill)
{
	/*
	 * Do so only if negotiation is enabled, capabilities are unknown,
	 * and a capability negotiation is not already in progress.
	 */
	if (ill->ill_dlpi_capab_state != IDS_UNKNOWN &&
	    ill->ill_dlpi_capab_state != IDS_RENEG)
		return;

	ill->ill_dlpi_capab_state = IDS_INPROGRESS;
	ip1dbg(("ill_capability_probe: starting capability negotiation\n"));
	ill_capability_proto(ill, DL_CAPABILITY_REQ, NULL);
}

void
ill_capability_reset(ill_t *ill)
{
	mblk_t *sc_mp = NULL;
	mblk_t *tmp;

	/*
	 * Note here that we reset the state to UNKNOWN, and later send
	 * down the DL_CAPABILITY_REQ without first setting the state to
	 * INPROGRESS.  We do this in order to distinguish the
1833 	 * DL_CAPABILITY_ACK response which may come back in response to
1834 	 * a "reset" apart from the "probe" DL_CAPABILITY_REQ. This also
1835 	 * handles the case where the driver doesn't send us back
1836 	 * a DL_CAPABILITY_ACK in response, since the "probe" routine
1837 	 * requires the state to be UNKNOWN anyway. In any case, all
1838 	 * features are turned off until the state reaches IDS_OK.
1839 	 */
1840 	ill->ill_dlpi_capab_state = IDS_UNKNOWN;
1841 
1842 	/*
1843 	 * Disable sub-capabilities and request a list of sub-capability
1844 	 * messages which will be sent down to the driver. Each handler
1845 	 * allocates the corresponding dl_capability_sub_t inside an
1846 	 * mblk, and links it to the existing sc_mp mblk, or returns it
1847 	 * as sc_mp if it's the first sub-capability (the passed in
1848 	 * sc_mp is NULL). Upon returning from all capability handlers,
1849 	 * sc_mp will be pulled up before being passed downstream.
1850 	 */
1851 	ill_capability_mdt_reset(ill, &sc_mp);
1852 	ill_capability_hcksum_reset(ill, &sc_mp);
1853 	ill_capability_zerocopy_reset(ill, &sc_mp);
1854 	ill_capability_ipsec_reset(ill, &sc_mp);
1855 	ill_capability_dls_reset(ill, &sc_mp);
1856 
1857 	/* Nothing to send down in order to disable the capabilities? */
1858 	if (sc_mp == NULL)
1859 		return;
1860 
1861 	tmp = msgpullup(sc_mp, -1);
1862 	freemsg(sc_mp);
1863 	if ((sc_mp = tmp) == NULL) {
1864 		cmn_err(CE_WARN, "ill_capability_reset: unable to send down "
1865 		    "DL_CAPABILITY_REQ (ENOMEM)\n");
1866 		return;
1867 	}
1868 
1869 	ip1dbg(("ill_capability_reset: resetting negotiated capabilities\n"));
1870 	ill_capability_proto(ill, DL_CAPABILITY_REQ, sc_mp);
1871 }
1872 
1873 /*
1874  * Request or set new-style hardware capabilities supported by the DLS provider.
1875  */
1876 static void
1877 ill_capability_proto(ill_t *ill, int type, mblk_t *reqp)
1878 {
1879 	mblk_t	*mp;
1880 	dl_capability_req_t	*capb;
1881 	size_t	size = 0;
1882 	uint8_t	*ptr;
1883 
1884 	if (reqp != NULL)
1885 		size = MBLKL(reqp);
1886 
1887 	mp = ip_dlpi_alloc(sizeof (dl_capability_req_t) + size, type);
1888 	if (mp == NULL) {
1889 		freemsg(reqp);
1890 		return;
1891 	}
1892 	ptr = mp->b_rptr;
1893 
1894 	capb = (dl_capability_req_t *)ptr;
1895 	ptr += sizeof (dl_capability_req_t);
1896 
1897 	if (reqp != NULL) {
1898 		capb->dl_sub_offset = sizeof (dl_capability_req_t);
1899 		capb->dl_sub_length = size;
1900 		bcopy(reqp->b_rptr, ptr, size);
1901 		ptr += size;
1902 		mp->b_cont = reqp->b_cont;
1903 		freeb(reqp);
1904 	}
1905 	ASSERT(ptr == mp->b_wptr);
1906 
1907 	ill_dlpi_send(ill, mp);
1908 }
1909 
1910 static void
1911 ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers)
1912 {
1913 	dl_capab_id_t	*id_ic;
1914 	uint_t	sub_dl_cap = outers->dl_cap;
1915 	dl_capability_sub_t	*inners;
1916 	uint8_t	*capend;
1917 
1918 	ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER);
1919 
1920 	/*
1921 	 * Note: range checks here are not absolutely sufficient to
1922 	 * make us robust against malformed messages sent by drivers;
1923 	 * this is in keeping with the rest of IP's dlpi handling.
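	 * For reference, the wrapper layout being checked below is (a
	 * sketch inferred from the parsing code, not a normative DLPI
	 * description):
	 *
	 *	dl_capability_sub_t	outer header, dl_cap set to
	 *				DL_CAPAB_ID_WRAPPER
	 *	dl_capab_id_t		the id_mid module-ID token plus
	 *				id_subcap, the inner header
	 *	...			the encapsulated sub-capability's
	 *				data, inners->dl_length bytes
	 *
	 * The checks only guard against honest length mismatches.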
1924 * (Remember, it's coming from something else in the kernel 1925 * address space) 1926 */ 1927 1928 capend = (uint8_t *)(outers + 1) + outers->dl_length; 1929 if (capend > mp->b_wptr) { 1930 cmn_err(CE_WARN, "ill_capability_id_ack: " 1931 "malformed sub-capability too long for mblk"); 1932 return; 1933 } 1934 1935 id_ic = (dl_capab_id_t *)(outers + 1); 1936 1937 if (outers->dl_length < sizeof (*id_ic) || 1938 (inners = &id_ic->id_subcap, 1939 inners->dl_length > (outers->dl_length - sizeof (*inners)))) { 1940 cmn_err(CE_WARN, "ill_capability_id_ack: malformed " 1941 "encapsulated capab type %d too long for mblk", 1942 inners->dl_cap); 1943 return; 1944 } 1945 1946 if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) { 1947 ip1dbg(("ill_capability_id_ack: mid token for capab type %d " 1948 "isn't as expected; pass-thru module(s) detected, " 1949 "discarding capability\n", inners->dl_cap)); 1950 return; 1951 } 1952 1953 /* Process the encapsulated sub-capability */ 1954 ill_capability_dispatch(ill, mp, inners, B_TRUE); 1955 } 1956 1957 /* 1958 * Process Multidata Transmit capability negotiation ack received from a 1959 * DLS Provider. isub must point to the sub-capability (DL_CAPAB_MDT) of a 1960 * DL_CAPABILITY_ACK message. 1961 */ 1962 static void 1963 ill_capability_mdt_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 1964 { 1965 mblk_t *nmp = NULL; 1966 dl_capability_req_t *oc; 1967 dl_capab_mdt_t *mdt_ic, *mdt_oc; 1968 ill_mdt_capab_t **ill_mdt_capab; 1969 uint_t sub_dl_cap = isub->dl_cap; 1970 uint8_t *capend; 1971 1972 ASSERT(sub_dl_cap == DL_CAPAB_MDT); 1973 1974 ill_mdt_capab = (ill_mdt_capab_t **)&ill->ill_mdt_capab; 1975 1976 /* 1977 * Note: range checks here are not absolutely sufficient to 1978 * make us robust against malformed messages sent by drivers; 1979 * this is in keeping with the rest of IP's dlpi handling. 
1980 	 * (Remember, it's coming from something else in the kernel
1981 	 * address space)
1982 	 */
1983 
1984 	capend = (uint8_t *)(isub + 1) + isub->dl_length;
1985 	if (capend > mp->b_wptr) {
1986 		cmn_err(CE_WARN, "ill_capability_mdt_ack: "
1987 		    "malformed sub-capability too long for mblk");
1988 		return;
1989 	}
1990 
1991 	mdt_ic = (dl_capab_mdt_t *)(isub + 1);
1992 
1993 	if (mdt_ic->mdt_version != MDT_VERSION_2) {
1994 		cmn_err(CE_CONT, "ill_capability_mdt_ack: "
1995 		    "unsupported MDT sub-capability (version %d, expected %d)",
1996 		    mdt_ic->mdt_version, MDT_VERSION_2);
1997 		return;
1998 	}
1999 
2000 	if (!dlcapabcheckqid(&mdt_ic->mdt_mid, ill->ill_lmod_rq)) {
2001 		ip1dbg(("ill_capability_mdt_ack: mid token for MDT "
2002 		    "capability isn't as expected; pass-thru module(s) "
2003 		    "detected, discarding capability\n"));
2004 		return;
2005 	}
2006 
2007 	if (mdt_ic->mdt_flags & DL_CAPAB_MDT_ENABLE) {
2008 
2009 		if (*ill_mdt_capab == NULL) {
2010 			*ill_mdt_capab = kmem_zalloc(sizeof (ill_mdt_capab_t),
2011 			    KM_NOSLEEP);
2012 
2013 			if (*ill_mdt_capab == NULL) {
2014 				cmn_err(CE_WARN, "ill_capability_mdt_ack: "
2015 				    "could not enable MDT version %d "
2016 				    "for %s (ENOMEM)\n", MDT_VERSION_2,
2017 				    ill->ill_name);
2018 				return;
2019 			}
2020 		}
2021 
2022 		ip1dbg(("ill_capability_mdt_ack: interface %s supports "
2023 		    "MDT version %d (%d bytes leading, %d bytes trailing "
2024 		    "header spaces, %d max pld bufs, %d span limit)\n",
2025 		    ill->ill_name, MDT_VERSION_2,
2026 		    mdt_ic->mdt_hdr_head, mdt_ic->mdt_hdr_tail,
2027 		    mdt_ic->mdt_max_pld, mdt_ic->mdt_span_limit));
2028 
2029 		(*ill_mdt_capab)->ill_mdt_version = MDT_VERSION_2;
2030 		(*ill_mdt_capab)->ill_mdt_on = 1;
2031 		/*
2032 		 * Round the following values up to the nearest 32-bit
2033 		 * boundary; the ULP may further adjust them to accommodate
2034 		 * additional protocol headers. We pass these values to
2035 		 * the ULP during bind time.
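		 *
		 * For example (illustrative numbers only): a driver
		 * advertising mdt_hdr_head = 18 and mdt_hdr_tail = 2 is
		 * recorded here as 20 and 4 respectively, since
		 * roundup(x, 4) rounds each byte count up to the next
		 * multiple of 4 bytes (32 bits).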
2036 */ 2037 (*ill_mdt_capab)->ill_mdt_hdr_head = 2038 roundup(mdt_ic->mdt_hdr_head, 4); 2039 (*ill_mdt_capab)->ill_mdt_hdr_tail = 2040 roundup(mdt_ic->mdt_hdr_tail, 4); 2041 (*ill_mdt_capab)->ill_mdt_max_pld = mdt_ic->mdt_max_pld; 2042 (*ill_mdt_capab)->ill_mdt_span_limit = mdt_ic->mdt_span_limit; 2043 2044 ill->ill_capabilities |= ILL_CAPAB_MDT; 2045 } else { 2046 uint_t size; 2047 uchar_t *rptr; 2048 2049 size = sizeof (dl_capability_req_t) + 2050 sizeof (dl_capability_sub_t) + sizeof (dl_capab_mdt_t); 2051 2052 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 2053 cmn_err(CE_WARN, "ill_capability_mdt_ack: " 2054 "could not enable MDT for %s (ENOMEM)\n", 2055 ill->ill_name); 2056 return; 2057 } 2058 2059 rptr = nmp->b_rptr; 2060 /* initialize dl_capability_req_t */ 2061 oc = (dl_capability_req_t *)nmp->b_rptr; 2062 oc->dl_sub_offset = sizeof (dl_capability_req_t); 2063 oc->dl_sub_length = sizeof (dl_capability_sub_t) + 2064 sizeof (dl_capab_mdt_t); 2065 nmp->b_rptr += sizeof (dl_capability_req_t); 2066 2067 /* initialize dl_capability_sub_t */ 2068 bcopy(isub, nmp->b_rptr, sizeof (*isub)); 2069 nmp->b_rptr += sizeof (*isub); 2070 2071 /* initialize dl_capab_mdt_t */ 2072 mdt_oc = (dl_capab_mdt_t *)nmp->b_rptr; 2073 bcopy(mdt_ic, mdt_oc, sizeof (*mdt_ic)); 2074 2075 nmp->b_rptr = rptr; 2076 2077 ip1dbg(("ill_capability_mdt_ack: asking interface %s " 2078 "to enable MDT version %d\n", ill->ill_name, 2079 MDT_VERSION_2)); 2080 2081 /* set ENABLE flag */ 2082 mdt_oc->mdt_flags |= DL_CAPAB_MDT_ENABLE; 2083 2084 /* nmp points to a DL_CAPABILITY_REQ message to enable MDT */ 2085 ill_dlpi_send(ill, nmp); 2086 } 2087 } 2088 2089 static void 2090 ill_capability_mdt_reset(ill_t *ill, mblk_t **sc_mp) 2091 { 2092 mblk_t *mp; 2093 dl_capab_mdt_t *mdt_subcap; 2094 dl_capability_sub_t *dl_subcap; 2095 int size; 2096 2097 if (!ILL_MDT_CAPABLE(ill)) 2098 return; 2099 2100 ASSERT(ill->ill_mdt_capab != NULL); 2101 /* 2102 * Clear the capability flag for MDT but retain the ill_mdt_capab 2103 * structure since it's possible that another thread is still 2104 * referring to it. The structure only gets deallocated when 2105 * we destroy the ill. 2106 */ 2107 ill->ill_capabilities &= ~ILL_CAPAB_MDT; 2108 2109 size = sizeof (*dl_subcap) + sizeof (*mdt_subcap); 2110 2111 mp = allocb(size, BPRI_HI); 2112 if (mp == NULL) { 2113 ip1dbg(("ill_capability_mdt_reset: unable to allocate " 2114 "request to disable MDT\n")); 2115 return; 2116 } 2117 2118 mp->b_wptr = mp->b_rptr + size; 2119 2120 dl_subcap = (dl_capability_sub_t *)mp->b_rptr; 2121 dl_subcap->dl_cap = DL_CAPAB_MDT; 2122 dl_subcap->dl_length = sizeof (*mdt_subcap); 2123 2124 mdt_subcap = (dl_capab_mdt_t *)(dl_subcap + 1); 2125 mdt_subcap->mdt_version = ill->ill_mdt_capab->ill_mdt_version; 2126 mdt_subcap->mdt_flags = 0; 2127 mdt_subcap->mdt_hdr_head = 0; 2128 mdt_subcap->mdt_hdr_tail = 0; 2129 2130 if (*sc_mp != NULL) 2131 linkb(*sc_mp, mp); 2132 else 2133 *sc_mp = mp; 2134 } 2135 2136 /* 2137 * Send a DL_NOTIFY_REQ to the specified ill to enable 2138 * DL_NOTE_PROMISC_ON/OFF_PHYS notifications. 2139 * Invoked by ill_capability_ipsec_ack() before enabling IPsec hardware 2140 * acceleration. 2141 * Returns B_TRUE on success, B_FALSE if the message could not be sent. 
2142 */ 2143 static boolean_t 2144 ill_enable_promisc_notify(ill_t *ill) 2145 { 2146 mblk_t *mp; 2147 dl_notify_req_t *req; 2148 2149 IPSECHW_DEBUG(IPSECHW_PKT, ("ill_enable_promisc_notify:\n")); 2150 2151 mp = ip_dlpi_alloc(sizeof (dl_notify_req_t), DL_NOTIFY_REQ); 2152 if (mp == NULL) 2153 return (B_FALSE); 2154 2155 req = (dl_notify_req_t *)mp->b_rptr; 2156 req->dl_notifications = DL_NOTE_PROMISC_ON_PHYS | 2157 DL_NOTE_PROMISC_OFF_PHYS; 2158 2159 ill_dlpi_send(ill, mp); 2160 2161 return (B_TRUE); 2162 } 2163 2164 2165 /* 2166 * Allocate an IPsec capability request which will be filled by our 2167 * caller to turn on support for one or more algorithms. 2168 */ 2169 static mblk_t * 2170 ill_alloc_ipsec_cap_req(ill_t *ill, dl_capability_sub_t *isub) 2171 { 2172 mblk_t *nmp; 2173 dl_capability_req_t *ocap; 2174 dl_capab_ipsec_t *ocip; 2175 dl_capab_ipsec_t *icip; 2176 uint8_t *ptr; 2177 icip = (dl_capab_ipsec_t *)(isub + 1); 2178 2179 /* 2180 * The first time around, we send a DL_NOTIFY_REQ to enable 2181 * PROMISC_ON/OFF notification from the provider. We need to 2182 * do this before enabling the algorithms to avoid leakage of 2183 * cleartext packets. 2184 */ 2185 2186 if (!ill_enable_promisc_notify(ill)) 2187 return (NULL); 2188 2189 /* 2190 * Allocate new mblk which will contain a new capability 2191 * request to enable the capabilities. 2192 */ 2193 2194 nmp = ip_dlpi_alloc(sizeof (dl_capability_req_t) + 2195 sizeof (dl_capability_sub_t) + isub->dl_length, DL_CAPABILITY_REQ); 2196 if (nmp == NULL) 2197 return (NULL); 2198 2199 ptr = nmp->b_rptr; 2200 2201 /* initialize dl_capability_req_t */ 2202 ocap = (dl_capability_req_t *)ptr; 2203 ocap->dl_sub_offset = sizeof (dl_capability_req_t); 2204 ocap->dl_sub_length = sizeof (dl_capability_sub_t) + isub->dl_length; 2205 ptr += sizeof (dl_capability_req_t); 2206 2207 /* initialize dl_capability_sub_t */ 2208 bcopy(isub, ptr, sizeof (*isub)); 2209 ptr += sizeof (*isub); 2210 2211 /* initialize dl_capab_ipsec_t */ 2212 ocip = (dl_capab_ipsec_t *)ptr; 2213 bcopy(icip, ocip, sizeof (*icip)); 2214 2215 nmp->b_wptr = (uchar_t *)(&ocip->cip_data[0]); 2216 return (nmp); 2217 } 2218 2219 /* 2220 * Process an IPsec capability negotiation ack received from a DLS Provider. 2221 * isub must point to the sub-capability (DL_CAPAB_IPSEC_AH or 2222 * DL_CAPAB_IPSEC_ESP) of a DL_CAPABILITY_ACK message. 2223 */ 2224 static void 2225 ill_capability_ipsec_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 2226 { 2227 dl_capab_ipsec_t *icip; 2228 dl_capab_ipsec_alg_t *ialg; /* ptr to input alg spec. */ 2229 dl_capab_ipsec_alg_t *oalg; /* ptr to output alg spec. */ 2230 uint_t cipher, nciphers; 2231 mblk_t *nmp; 2232 uint_t alg_len; 2233 boolean_t need_sadb_dump; 2234 uint_t sub_dl_cap = isub->dl_cap; 2235 ill_ipsec_capab_t **ill_capab; 2236 uint64_t ill_capab_flag; 2237 uint8_t *capend, *ciphend; 2238 boolean_t sadb_resync; 2239 2240 ASSERT(sub_dl_cap == DL_CAPAB_IPSEC_AH || 2241 sub_dl_cap == DL_CAPAB_IPSEC_ESP); 2242 2243 if (sub_dl_cap == DL_CAPAB_IPSEC_AH) { 2244 ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_ah; 2245 ill_capab_flag = ILL_CAPAB_AH; 2246 } else { 2247 ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_esp; 2248 ill_capab_flag = ILL_CAPAB_ESP; 2249 } 2250 2251 /* 2252 * If the ill capability structure exists, then this incoming 2253 * DL_CAPABILITY_ACK is a response to a "renegotiation" cycle. 2254 * If this is so, then we'd need to resynchronize the SADB 2255 * after re-enabling the offloaded ciphers. 
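	 * (Resynchronizing means re-sending the relevant AH or ESP SAs
	 * to the interface: see step 4 of the outline below and the
	 * ill_ipsec_capab_add() call, with sadb_resync passed in, at the
	 * end of this function.)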
2256 	 */
2257 	sadb_resync = (*ill_capab != NULL);
2258 
2259 	/*
2260 	 * Note: range checks here are not absolutely sufficient to
2261 	 * make us robust against malformed messages sent by drivers;
2262 	 * this is in keeping with the rest of IP's dlpi handling.
2263 	 * (Remember, it's coming from something else in the kernel
2264 	 * address space)
2265 	 */
2266 
2267 	capend = (uint8_t *)(isub + 1) + isub->dl_length;
2268 	if (capend > mp->b_wptr) {
2269 		cmn_err(CE_WARN, "ill_capability_ipsec_ack: "
2270 		    "malformed sub-capability too long for mblk");
2271 		return;
2272 	}
2273 
2274 	/*
2275 	 * There are two types of acks we process here:
2276 	 * 1. acks in reply to a (first form) generic capability req
2277 	 *    (no ENABLE flag set)
2278 	 * 2. acks in reply to an ENABLE capability req.
2279 	 *    (ENABLE flag set)
2280 	 *
2281 	 * We process the subcapability passed as argument as follows:
2282 	 * 1 do initializations
2283 	 *   1.1 initialize nmp = NULL
2284 	 *   1.2 set need_sadb_dump to B_FALSE
2285 	 * 2 for each cipher in subcapability:
2286 	 *   2.1 if ENABLE flag is set:
2287 	 *	2.1.1 update per-ill ipsec capabilities info
2288 	 *	2.1.2 set need_sadb_dump to B_TRUE
2289 	 *   2.2 if ENABLE flag is not set:
2290 	 *	2.2.1 if nmp is NULL:
2291 	 *		2.2.1.1 allocate and initialize nmp
2292 	 *		2.2.1.2 init current pos in nmp
2293 	 *	2.2.2 copy current cipher to current pos in nmp
2294 	 *	2.2.3 set ENABLE flag in nmp
2295 	 *	2.2.4 update current pos
2296 	 * 3 if nmp is not equal to NULL, send enable request
2297 	 *   3.1 send capability request
2298 	 * 4 if need_sadb_dump is B_TRUE
2299 	 *   4.1 enable promiscuous on/off notifications
2300 	 *   4.2 call ill_dlpi_send(isub->dlcap) to send all
2301 	 *	AH or ESP SAs to the interface.
2302 	 */
2303 
2304 	nmp = NULL;
2305 	oalg = NULL;
2306 	need_sadb_dump = B_FALSE;
2307 	icip = (dl_capab_ipsec_t *)(isub + 1);
2308 	ialg = (dl_capab_ipsec_alg_t *)(&icip->cip_data[0]);
2309 
2310 	nciphers = icip->cip_nciphers;
2311 	ciphend = (uint8_t *)(ialg + icip->cip_nciphers);
2312 
2313 	if (ciphend > capend) {
2314 		cmn_err(CE_WARN, "ill_capability_ipsec_ack: "
2315 		    "too many ciphers for sub-capability len");
2316 		return;
2317 	}
2318 
2319 	for (cipher = 0; cipher < nciphers; cipher++) {
2320 		alg_len = sizeof (dl_capab_ipsec_alg_t);
2321 
2322 		if (ialg->alg_flag & DL_CAPAB_ALG_ENABLE) {
2323 			/*
2324 			 * TBD: when we provide a way to disable capabilities
2325 			 * from above, we need to manage the request-pending
2326 			 * state and fail if we were not expecting this ACK.
2327 			 */
2328 			IPSECHW_DEBUG(IPSECHW_CAPAB,
2329 			    ("ill_capability_ipsec_ack: got ENABLE ACK\n"));
2330 
2331 			/*
2332 			 * Update IPsec capabilities for this ill
2333 			 */
2334 
2335 			if (*ill_capab == NULL) {
2336 				IPSECHW_DEBUG(IPSECHW_CAPAB,
2337 				    ("ill_capability_ipsec_ack: "
2338 				    "allocating ipsec_capab for ill\n"));
2339 				*ill_capab = ill_ipsec_capab_alloc();
2340 
2341 				if (*ill_capab == NULL) {
2342 					cmn_err(CE_WARN,
2343 					    "ill_capability_ipsec_ack: "
2344 					    "could not enable IPsec Hardware "
2345 					    "acceleration for %s (ENOMEM)\n",
2346 					    ill->ill_name);
2347 					return;
2348 				}
2349 			}
2350 
2351 			ASSERT(ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH ||
2352 			    ialg->alg_type == DL_CAPAB_IPSEC_ALG_ENCR);
2353 
2354 			if (ialg->alg_prim >= MAX_IPSEC_ALGS) {
2355 				cmn_err(CE_WARN,
2356 				    "ill_capability_ipsec_ack: "
2357 				    "malformed IPsec algorithm id %d",
2358 				    ialg->alg_prim);
2359 				continue;
2360 			}
2361 
2362 			if (ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH) {
2363 				IPSEC_ALG_ENABLE((*ill_capab)->auth_hw_algs,
2364 				    ialg->alg_prim);
2365 			} else {
2366 				ipsec_capab_algparm_t *alp;
2367 
2368 				IPSEC_ALG_ENABLE((*ill_capab)->encr_hw_algs,
2369 				    ialg->alg_prim);
2370 				if (!ill_ipsec_capab_resize_algparm(*ill_capab,
2371 				    ialg->alg_prim)) {
2372 					cmn_err(CE_WARN,
2373 					    "ill_capability_ipsec_ack: "
2374 					    "no space for IPsec alg id %d",
2375 					    ialg->alg_prim);
2376 					continue;
2377 				}
2378 				alp = &((*ill_capab)->encr_algparm[
2379 				    ialg->alg_prim]);
2380 				alp->minkeylen = ialg->alg_minbits;
2381 				alp->maxkeylen = ialg->alg_maxbits;
2382 			}
2383 			ill->ill_capabilities |= ill_capab_flag;
2384 			/*
2385 			 * Indicate that a capability was enabled, which
2386 			 * will be used below to kick off a SADB dump
2387 			 * to the ill.
2388 			 */
2389 			need_sadb_dump = B_TRUE;
2390 		} else {
2391 			IPSECHW_DEBUG(IPSECHW_CAPAB,
2392 			    ("ill_capability_ipsec_ack: enabling alg 0x%x\n",
2393 			    ialg->alg_prim));
2394 
2395 			if (nmp == NULL) {
2396 				nmp = ill_alloc_ipsec_cap_req(ill, isub);
2397 				if (nmp == NULL) {
2398 					/*
2399 					 * Sending the PROMISC_ON/OFF
2400 					 * notification request failed.
2401 					 * We cannot enable the algorithms
2402 					 * since the Provider will not
2403 					 * notify IP of promiscuous mode
2404 					 * changes, which could lead
2405 					 * to leakage of packets.
2406 					 */
2407 					cmn_err(CE_WARN,
2408 					    "ill_capability_ipsec_ack: "
2409 					    "could not enable IPsec Hardware "
2410 					    "acceleration for %s (ENOMEM)\n",
2411 					    ill->ill_name);
2412 					return;
2413 				}
2414 				/* ptr to current output alg specifier */
2415 				oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr;
2416 			}
2417 
2418 			/*
2419 			 * Copy the current alg specifier, set the ENABLE
2420 			 * flag, and advance to the next output alg.
2421 			 * For now we enable all IPsec capabilities.
2422 			 */
2423 			ASSERT(oalg != NULL);
2424 			bcopy(ialg, oalg, alg_len);
2425 			oalg->alg_flag |= DL_CAPAB_ALG_ENABLE;
2426 			nmp->b_wptr += alg_len;
2427 			oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr;
2428 		}
2429 
2430 		/* move to the next input algorithm specifier */
2431 		ialg = (dl_capab_ipsec_alg_t *)
2432 		    ((char *)ialg + alg_len);
2433 	}
2434 
2435 	if (nmp != NULL)
2436 		/*
2437 		 * nmp points to a DL_CAPABILITY_REQ message to enable
2438 		 * IPsec hardware acceleration.
2439 		 */
2440 		ill_dlpi_send(ill, nmp);
2441 
2442 	if (need_sadb_dump)
2443 		/*
2444 		 * An acknowledgement corresponding to a request to
2445 		 * enable acceleration was received, notify SADB.
2446 		 */
2447 		ill_ipsec_capab_add(ill, sub_dl_cap, sadb_resync);
2448 }
2449 
2450 /*
2451  * Given an mblk with enough space in it, create sub-capability entries for
2452  * DL_CAPAB_IPSEC_{AH,ESP} types which consist of previously-advertised
2453  * offloaded ciphers (both AUTH and ENCR) with their enable flags cleared,
2454  * in preparation for the reset DL_CAPABILITY_REQ message.
2455  */
2456 static void
2457 ill_fill_ipsec_reset(uint_t nciphers, int stype, uint_t slen,
2458     ill_ipsec_capab_t *ill_cap, mblk_t *mp)
2459 {
2460 	dl_capab_ipsec_t *oipsec;
2461 	dl_capab_ipsec_alg_t *oalg;
2462 	dl_capability_sub_t *dl_subcap;
2463 	int i, k;
2464 
2465 	ASSERT(nciphers > 0);
2466 	ASSERT(ill_cap != NULL);
2467 	ASSERT(mp != NULL);
2468 	ASSERT(MBLKTAIL(mp) >= sizeof (*dl_subcap) + sizeof (*oipsec) + slen);
2469 
2470 	/* dl_capability_sub_t for "stype" */
2471 	dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
2472 	dl_subcap->dl_cap = stype;
2473 	dl_subcap->dl_length = sizeof (dl_capab_ipsec_t) + slen;
2474 	mp->b_wptr += sizeof (dl_capability_sub_t);
2475 
2476 	/* dl_capab_ipsec_t for "stype" */
2477 	oipsec = (dl_capab_ipsec_t *)mp->b_wptr;
2478 	oipsec->cip_version = 1;
2479 	oipsec->cip_nciphers = nciphers;
2480 	mp->b_wptr = (uchar_t *)&oipsec->cip_data[0];
2481 
2482 	/* create entries for "stype" AUTH ciphers */
2483 	for (i = 0; i < ill_cap->algs_size; i++) {
2484 		for (k = 0; k < BITSPERBYTE; k++) {
2485 			if ((ill_cap->auth_hw_algs[i] & (1 << k)) == 0)
2486 				continue;
2487 
2488 			oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr;
2489 			bzero((void *)oalg, sizeof (*oalg));
2490 			oalg->alg_type = DL_CAPAB_IPSEC_ALG_AUTH;
2491 			oalg->alg_prim = k + (BITSPERBYTE * i);
2492 			mp->b_wptr += sizeof (dl_capab_ipsec_alg_t);
2493 		}
2494 	}
2495 	/* create entries for "stype" ENCR ciphers */
2496 	for (i = 0; i < ill_cap->algs_size; i++) {
2497 		for (k = 0; k < BITSPERBYTE; k++) {
2498 			if ((ill_cap->encr_hw_algs[i] & (1 << k)) == 0)
2499 				continue;
2500 
2501 			oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr;
2502 			bzero((void *)oalg, sizeof (*oalg));
2503 			oalg->alg_type = DL_CAPAB_IPSEC_ALG_ENCR;
2504 			oalg->alg_prim = k + (BITSPERBYTE * i);
2505 			mp->b_wptr += sizeof (dl_capab_ipsec_alg_t);
2506 		}
2507 	}
2508 }
2509 
2510 /*
2511  * Macro to count the number of 1s in a byte (8-bit word). The total count
2512  * is accumulated into the passed-in argument (sum). We could use SPARCv9's
2513  * POPC instruction, but our macro is more flexible for an arbitrary length
2514  * of bytes, such as {auth,encr}_hw_algs. These variables are currently
2515  * 256 bits long (MAX_IPSEC_ALGS), so if we know for sure that the length
2516  * stays that way, we can reduce the number of iterations required.
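 *
 * As a worked example of the bit-twiddling below: COUNT_1S(0xb5, sum)
 * adds 5 to sum, since 0xb5 is 10110101. The first step sums adjacent
 * bit pairs (0xb5 -> 0x65), the second sums adjacent 2-bit fields
 * (0x65 -> 0x32), and the final step adds the two nibbles (3 + 2 = 5).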
2517 */ 2518 #define COUNT_1S(val, sum) { \ 2519 uint8_t x = val & 0xff; \ 2520 x = (x & 0x55) + ((x >> 1) & 0x55); \ 2521 x = (x & 0x33) + ((x >> 2) & 0x33); \ 2522 sum += (x & 0xf) + ((x >> 4) & 0xf); \ 2523 } 2524 2525 /* ARGSUSED */ 2526 static void 2527 ill_capability_ipsec_reset(ill_t *ill, mblk_t **sc_mp) 2528 { 2529 mblk_t *mp; 2530 ill_ipsec_capab_t *cap_ah = ill->ill_ipsec_capab_ah; 2531 ill_ipsec_capab_t *cap_esp = ill->ill_ipsec_capab_esp; 2532 uint64_t ill_capabilities = ill->ill_capabilities; 2533 int ah_cnt = 0, esp_cnt = 0; 2534 int ah_len = 0, esp_len = 0; 2535 int i, size = 0; 2536 2537 if (!(ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP))) 2538 return; 2539 2540 ASSERT(cap_ah != NULL || !(ill_capabilities & ILL_CAPAB_AH)); 2541 ASSERT(cap_esp != NULL || !(ill_capabilities & ILL_CAPAB_ESP)); 2542 2543 /* Find out the number of ciphers for AH */ 2544 if (cap_ah != NULL) { 2545 for (i = 0; i < cap_ah->algs_size; i++) { 2546 COUNT_1S(cap_ah->auth_hw_algs[i], ah_cnt); 2547 COUNT_1S(cap_ah->encr_hw_algs[i], ah_cnt); 2548 } 2549 if (ah_cnt > 0) { 2550 size += sizeof (dl_capability_sub_t) + 2551 sizeof (dl_capab_ipsec_t); 2552 /* dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */ 2553 ah_len = (ah_cnt - 1) * sizeof (dl_capab_ipsec_alg_t); 2554 size += ah_len; 2555 } 2556 } 2557 2558 /* Find out the number of ciphers for ESP */ 2559 if (cap_esp != NULL) { 2560 for (i = 0; i < cap_esp->algs_size; i++) { 2561 COUNT_1S(cap_esp->auth_hw_algs[i], esp_cnt); 2562 COUNT_1S(cap_esp->encr_hw_algs[i], esp_cnt); 2563 } 2564 if (esp_cnt > 0) { 2565 size += sizeof (dl_capability_sub_t) + 2566 sizeof (dl_capab_ipsec_t); 2567 /* dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */ 2568 esp_len = (esp_cnt - 1) * sizeof (dl_capab_ipsec_alg_t); 2569 size += esp_len; 2570 } 2571 } 2572 2573 if (size == 0) { 2574 ip1dbg(("ill_capability_ipsec_reset: capabilities exist but " 2575 "there's nothing to reset\n")); 2576 return; 2577 } 2578 2579 mp = allocb(size, BPRI_HI); 2580 if (mp == NULL) { 2581 ip1dbg(("ill_capability_ipsec_reset: unable to allocate " 2582 "request to disable IPSEC Hardware Acceleration\n")); 2583 return; 2584 } 2585 2586 /* 2587 * Clear the capability flags for IPSec HA but retain the ill 2588 * capability structures since it's possible that another thread 2589 * is still referring to them. The structures only get deallocated 2590 * when we destroy the ill. 2591 * 2592 * Various places check the flags to see if the ill is capable of 2593 * hardware acceleration, and by clearing them we ensure that new 2594 * outbound IPSec packets are sent down encrypted. 2595 */ 2596 ill->ill_capabilities &= ~(ILL_CAPAB_AH | ILL_CAPAB_ESP); 2597 2598 /* Fill in DL_CAPAB_IPSEC_AH sub-capability entries */ 2599 if (ah_cnt > 0) { 2600 ill_fill_ipsec_reset(ah_cnt, DL_CAPAB_IPSEC_AH, ah_len, 2601 cap_ah, mp); 2602 ASSERT(mp->b_rptr + size >= mp->b_wptr); 2603 } 2604 2605 /* Fill in DL_CAPAB_IPSEC_ESP sub-capability entries */ 2606 if (esp_cnt > 0) { 2607 ill_fill_ipsec_reset(esp_cnt, DL_CAPAB_IPSEC_ESP, esp_len, 2608 cap_esp, mp); 2609 ASSERT(mp->b_rptr + size >= mp->b_wptr); 2610 } 2611 2612 /* 2613 * At this point we've composed a bunch of sub-capabilities to be 2614 * encapsulated in a DL_CAPABILITY_REQ and later sent downstream 2615 * by the caller. Upon receiving this reset message, the driver 2616 * must stop inbound decryption (by destroying all inbound SAs) 2617 * and let the corresponding packets come in encrypted. 
2618 	 */
2619 
2620 	if (*sc_mp != NULL)
2621 		linkb(*sc_mp, mp);
2622 	else
2623 		*sc_mp = mp;
2624 }
2625 
2626 static void
2627 ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp,
2628     boolean_t encapsulated)
2629 {
2630 	boolean_t legacy = B_FALSE;
2631 
2632 	/*
2633 	 * If this DL_CAPABILITY_ACK came in as a response to our "reset"
2634 	 * DL_CAPABILITY_REQ, ignore it during this cycle. We've just
2635 	 * instructed the driver to disable its advertised capabilities,
2636 	 * so there's no point in accepting any response at this moment.
2637 	 */
2638 	if (ill->ill_dlpi_capab_state == IDS_UNKNOWN)
2639 		return;
2640 
2641 	/*
2642 	 * Note that only the following two sub-capabilities may be
2643 	 * considered as "legacy", since their original definitions
2644 	 * do not incorporate the dl_mid_t module ID token, and hence
2645 	 * may require the use of the wrapper sub-capability.
2646 	 */
2647 	switch (subp->dl_cap) {
2648 	case DL_CAPAB_IPSEC_AH:
2649 	case DL_CAPAB_IPSEC_ESP:
2650 		legacy = B_TRUE;
2651 		break;
2652 	}
2653 
2654 	/*
2655 	 * For legacy sub-capabilities which don't incorporate a queue_t
2656 	 * pointer in their structures, discard them if we detect that
2657 	 * there are intermediate modules in between IP and the driver.
2658 	 */
2659 	if (!encapsulated && legacy && ill->ill_lmod_cnt > 1) {
2660 		ip1dbg(("ill_capability_dispatch: unencapsulated capab type "
2661 		    "%d discarded; %d module(s) present below IP\n",
2662 		    subp->dl_cap, ill->ill_lmod_cnt));
2663 		return;
2664 	}
2665 
2666 	switch (subp->dl_cap) {
2667 	case DL_CAPAB_IPSEC_AH:
2668 	case DL_CAPAB_IPSEC_ESP:
2669 		ill_capability_ipsec_ack(ill, mp, subp);
2670 		break;
2671 	case DL_CAPAB_MDT:
2672 		ill_capability_mdt_ack(ill, mp, subp);
2673 		break;
2674 	case DL_CAPAB_HCKSUM:
2675 		ill_capability_hcksum_ack(ill, mp, subp);
2676 		break;
2677 	case DL_CAPAB_ZEROCOPY:
2678 		ill_capability_zerocopy_ack(ill, mp, subp);
2679 		break;
2680 	case DL_CAPAB_POLL:
2681 		if (!SOFT_RINGS_ENABLED())
2682 			ill_capability_dls_ack(ill, mp, subp);
2683 		break;
2684 	case DL_CAPAB_SOFT_RING:
2685 		if (SOFT_RINGS_ENABLED())
2686 			ill_capability_dls_ack(ill, mp, subp);
2687 		break;
2688 	default:
2689 		ip1dbg(("ill_capability_dispatch: unknown capab type %d\n",
2690 		    subp->dl_cap));
2691 	}
2692 }
2693 
2694 /*
2695  * As part of negotiating polling capability, the driver tells us
2696  * the default (or normal) blanking interval and packet threshold
2697  * (the receive timer fires if the blanking interval is reached or
2698  * the packet threshold is reached).
2699  *
2700  * As part of manipulating the polling interval, we always use our
2701  * estimated interval (avg service time * number of packets queued
2702  * on the squeue) but we try to blank for a minimum of
2703  * rr_normal_blank_time * rr_max_blank_ratio. We disable the
2704  * packet threshold during this time. When we are not in polling mode
2705  * we set the blank interval typically lower, rr_normal_blank_time *
2706  * rr_min_blank_ratio, but raise the packet cnt by a ratio of
2707  * rr_min_pkt_cnt_ratio so that we are still getting chains if
2708  * possible, although for a shorter interval.
2709  */
2710 #define	RR_MAX_BLANK_RATIO	20
2711 #define	RR_MIN_BLANK_RATIO	10
2712 #define	RR_MAX_PKT_CNT_RATIO	3
2713 #define	RR_MIN_PKT_CNT_RATIO	3
2714 
2715 /*
2716  * These can be tuned via /etc/system.
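 *
 * For example, with the defaults above and a driver that advertises a
 * normal blanking interval of 128 usec and a packet threshold of 8
 * (illustrative numbers only), ill_ring_add() below computes
 * rr_max_blank_time = 2560, rr_min_blank_time = 1280, and both
 * rr_max_pkt_cnt and rr_min_pkt_cnt = 24. To change a ratio, a line
 * such as the following (assuming these symbols live in the ip module)
 * can be added to /etc/system:
 *
 *	set ip:rr_max_blank_ratio = 40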
2717  */
2718 int rr_max_blank_ratio = RR_MAX_BLANK_RATIO;
2719 int rr_min_blank_ratio = RR_MIN_BLANK_RATIO;
2720 int rr_max_pkt_cnt_ratio = RR_MAX_PKT_CNT_RATIO;
2721 int rr_min_pkt_cnt_ratio = RR_MIN_PKT_CNT_RATIO;
2722 
2723 static mac_resource_handle_t
2724 ill_ring_add(void *arg, mac_resource_t *mrp)
2725 {
2726 	ill_t	*ill = (ill_t *)arg;
2727 	mac_rx_fifo_t	*mrfp = (mac_rx_fifo_t *)mrp;
2728 	ill_rx_ring_t	*rx_ring;
2729 	int	ip_rx_index;
2730 
2731 	ASSERT(mrp != NULL);
2732 	if (mrp->mr_type != MAC_RX_FIFO) {
2733 		return (NULL);
2734 	}
2735 	ASSERT(ill != NULL);
2736 	ASSERT(ill->ill_dls_capab != NULL);
2737 
2738 	mutex_enter(&ill->ill_lock);
2739 	for (ip_rx_index = 0; ip_rx_index < ILL_MAX_RINGS; ip_rx_index++) {
2740 		rx_ring = &ill->ill_dls_capab->ill_ring_tbl[ip_rx_index];
2741 		ASSERT(rx_ring != NULL);
2742 
2743 		if (rx_ring->rr_ring_state == ILL_RING_FREE) {
2744 			time_t normal_blank_time =
2745 			    mrfp->mrf_normal_blank_time;
2746 			uint_t normal_pkt_cnt =
2747 			    mrfp->mrf_normal_pkt_count;
2748 
2749 			bzero(rx_ring, sizeof (ill_rx_ring_t));
2750 
2751 			rx_ring->rr_blank = mrfp->mrf_blank;
2752 			rx_ring->rr_handle = mrfp->mrf_arg;
2753 			rx_ring->rr_ill = ill;
2754 			rx_ring->rr_normal_blank_time = normal_blank_time;
2755 			rx_ring->rr_normal_pkt_cnt = normal_pkt_cnt;
2756 
2757 			rx_ring->rr_max_blank_time =
2758 			    normal_blank_time * rr_max_blank_ratio;
2759 			rx_ring->rr_min_blank_time =
2760 			    normal_blank_time * rr_min_blank_ratio;
2761 			rx_ring->rr_max_pkt_cnt =
2762 			    normal_pkt_cnt * rr_max_pkt_cnt_ratio;
2763 			rx_ring->rr_min_pkt_cnt =
2764 			    normal_pkt_cnt * rr_min_pkt_cnt_ratio;
2765 
2766 			rx_ring->rr_ring_state = ILL_RING_INUSE;
2767 			mutex_exit(&ill->ill_lock);
2768 
2769 			DTRACE_PROBE2(ill__ring__add, (void *), ill,
2770 			    (int), ip_rx_index);
2771 			return ((mac_resource_handle_t)rx_ring);
2772 		}
2773 	}
2774 
2775 	/*
2776 	 * We ran out of ILL_MAX_RINGS worth of rx_ring structures. If
2777 	 * we have devices which can overwhelm this limit, ILL_MAX_RINGS
2778 	 * should be made configurable. Meanwhile this causes no panic,
2779 	 * because the driver will pass ip_input a NULL handle, which will
2780 	 * make IP allocate the default squeue and polling mode will not
2781 	 * be used for this ring.
2782 */ 2783 cmn_err(CE_NOTE, "Reached maximum number of receiving rings (%d) " 2784 "for %s\n", ILL_MAX_RINGS, ill->ill_name); 2785 2786 mutex_exit(&ill->ill_lock); 2787 return (NULL); 2788 } 2789 2790 static boolean_t 2791 ill_capability_dls_init(ill_t *ill) 2792 { 2793 ill_dls_capab_t *ill_dls = ill->ill_dls_capab; 2794 conn_t *connp; 2795 size_t sz; 2796 2797 if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) { 2798 if (ill_dls == NULL) { 2799 cmn_err(CE_PANIC, "ill_capability_dls_init: " 2800 "soft_ring enabled for ill=%s (%p) but data " 2801 "structs uninitialized\n", ill->ill_name, 2802 (void *)ill); 2803 } 2804 return (B_TRUE); 2805 } else if (ill->ill_capabilities & ILL_CAPAB_POLL) { 2806 if (ill_dls == NULL) { 2807 cmn_err(CE_PANIC, "ill_capability_dls_init: " 2808 "polling enabled for ill=%s (%p) but data " 2809 "structs uninitialized\n", ill->ill_name, 2810 (void *)ill); 2811 } 2812 return (B_TRUE); 2813 } 2814 2815 if (ill_dls != NULL) { 2816 ill_rx_ring_t *rx_ring = ill_dls->ill_ring_tbl; 2817 /* Soft_Ring or polling is being re-enabled */ 2818 2819 connp = ill_dls->ill_unbind_conn; 2820 ASSERT(rx_ring != NULL); 2821 bzero((void *)ill_dls, sizeof (ill_dls_capab_t)); 2822 bzero((void *)rx_ring, 2823 sizeof (ill_rx_ring_t) * ILL_MAX_RINGS); 2824 ill_dls->ill_ring_tbl = rx_ring; 2825 ill_dls->ill_unbind_conn = connp; 2826 return (B_TRUE); 2827 } 2828 2829 if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP)) == NULL) 2830 return (B_FALSE); 2831 2832 sz = sizeof (ill_dls_capab_t); 2833 sz += sizeof (ill_rx_ring_t) * ILL_MAX_RINGS; 2834 2835 ill_dls = kmem_zalloc(sz, KM_NOSLEEP); 2836 if (ill_dls == NULL) { 2837 cmn_err(CE_WARN, "ill_capability_dls_init: could not " 2838 "allocate dls_capab for %s (%p)\n", ill->ill_name, 2839 (void *)ill); 2840 CONN_DEC_REF(connp); 2841 return (B_FALSE); 2842 } 2843 2844 /* Allocate space to hold ring table */ 2845 ill_dls->ill_ring_tbl = (ill_rx_ring_t *)&ill_dls[1]; 2846 ill->ill_dls_capab = ill_dls; 2847 ill_dls->ill_unbind_conn = connp; 2848 return (B_TRUE); 2849 } 2850 2851 /* 2852 * ill_capability_dls_disable: disable soft_ring and/or polling 2853 * capability. Since any of the rings might already be in use, need 2854 * to call ipsq_clean_all() which gets behind the squeue to disable 2855 * direct calls if necessary. 
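 * ("Direct calls" here are the function pointers cleared below: the
 * ill_tx transmit entry point obtained from dld and, for soft rings,
 * the bind/unbind/change_status hooks. This is a sketch of the
 * relationship inferred from this file, not from dld documentation.)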
2856 */ 2857 static void 2858 ill_capability_dls_disable(ill_t *ill) 2859 { 2860 ill_dls_capab_t *ill_dls = ill->ill_dls_capab; 2861 2862 if (ill->ill_capabilities & ILL_CAPAB_DLS) { 2863 ipsq_clean_all(ill); 2864 ill_dls->ill_tx = NULL; 2865 ill_dls->ill_tx_handle = NULL; 2866 ill_dls->ill_dls_change_status = NULL; 2867 ill_dls->ill_dls_bind = NULL; 2868 ill_dls->ill_dls_unbind = NULL; 2869 } 2870 2871 ASSERT(!(ill->ill_capabilities & ILL_CAPAB_DLS)); 2872 } 2873 2874 static void 2875 ill_capability_dls_capable(ill_t *ill, dl_capab_dls_t *idls, 2876 dl_capability_sub_t *isub) 2877 { 2878 uint_t size; 2879 uchar_t *rptr; 2880 dl_capab_dls_t dls, *odls; 2881 ill_dls_capab_t *ill_dls; 2882 mblk_t *nmp = NULL; 2883 dl_capability_req_t *ocap; 2884 uint_t sub_dl_cap = isub->dl_cap; 2885 2886 if (!ill_capability_dls_init(ill)) 2887 return; 2888 ill_dls = ill->ill_dls_capab; 2889 2890 /* Copy locally to get the members aligned */ 2891 bcopy((void *)idls, (void *)&dls, 2892 sizeof (dl_capab_dls_t)); 2893 2894 /* Get the tx function and handle from dld */ 2895 ill_dls->ill_tx = (ip_dld_tx_t)dls.dls_tx; 2896 ill_dls->ill_tx_handle = (void *)dls.dls_tx_handle; 2897 2898 if (sub_dl_cap == DL_CAPAB_SOFT_RING) { 2899 ill_dls->ill_dls_change_status = 2900 (ip_dls_chg_soft_ring_t)dls.dls_ring_change_status; 2901 ill_dls->ill_dls_bind = (ip_dls_bind_t)dls.dls_ring_bind; 2902 ill_dls->ill_dls_unbind = 2903 (ip_dls_unbind_t)dls.dls_ring_unbind; 2904 ill_dls->ill_dls_soft_ring_cnt = ip_soft_rings_cnt; 2905 } 2906 2907 size = sizeof (dl_capability_req_t) + sizeof (dl_capability_sub_t) + 2908 isub->dl_length; 2909 2910 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 2911 cmn_err(CE_WARN, "ill_capability_dls_capable: could " 2912 "not allocate memory for CAPAB_REQ for %s (%p)\n", 2913 ill->ill_name, (void *)ill); 2914 return; 2915 } 2916 2917 /* initialize dl_capability_req_t */ 2918 rptr = nmp->b_rptr; 2919 ocap = (dl_capability_req_t *)rptr; 2920 ocap->dl_sub_offset = sizeof (dl_capability_req_t); 2921 ocap->dl_sub_length = sizeof (dl_capability_sub_t) + isub->dl_length; 2922 rptr += sizeof (dl_capability_req_t); 2923 2924 /* initialize dl_capability_sub_t */ 2925 bcopy(isub, rptr, sizeof (*isub)); 2926 rptr += sizeof (*isub); 2927 2928 odls = (dl_capab_dls_t *)rptr; 2929 rptr += sizeof (dl_capab_dls_t); 2930 2931 /* initialize dl_capab_dls_t to be sent down */ 2932 dls.dls_rx_handle = (uintptr_t)ill; 2933 dls.dls_rx = (uintptr_t)ip_input; 2934 dls.dls_ring_add = (uintptr_t)ill_ring_add; 2935 2936 if (sub_dl_cap == DL_CAPAB_SOFT_RING) { 2937 dls.dls_ring_cnt = ip_soft_rings_cnt; 2938 dls.dls_ring_assign = (uintptr_t)ip_soft_ring_assignment; 2939 dls.dls_flags = SOFT_RING_ENABLE; 2940 } else { 2941 dls.dls_flags = POLL_ENABLE; 2942 ip1dbg(("ill_capability_dls_capable: asking interface %s " 2943 "to enable polling\n", ill->ill_name)); 2944 } 2945 bcopy((void *)&dls, (void *)odls, 2946 sizeof (dl_capab_dls_t)); 2947 ASSERT(nmp->b_wptr == (nmp->b_rptr + size)); 2948 /* 2949 * nmp points to a DL_CAPABILITY_REQ message to 2950 * enable either soft_ring or polling 2951 */ 2952 ill_dlpi_send(ill, nmp); 2953 } 2954 2955 static void 2956 ill_capability_dls_reset(ill_t *ill, mblk_t **sc_mp) 2957 { 2958 mblk_t *mp; 2959 dl_capab_dls_t *idls; 2960 dl_capability_sub_t *dl_subcap; 2961 int size; 2962 2963 if (!(ill->ill_capabilities & ILL_CAPAB_DLS)) 2964 return; 2965 2966 ASSERT(ill->ill_dls_capab != NULL); 2967 2968 size = sizeof (*dl_subcap) + sizeof (*idls); 2969 2970 mp = allocb(size, BPRI_HI); 2971 if (mp 
== NULL) {
2972 		ip1dbg(("ill_capability_dls_reset: unable to allocate "
2973 		    "request to disable soft_ring\n"));
2974 		return;
2975 	}
2976 
2977 	mp->b_wptr = mp->b_rptr + size;
2978 
2979 	dl_subcap = (dl_capability_sub_t *)mp->b_rptr;
2980 	dl_subcap->dl_length = sizeof (*idls);
2981 	if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING)
2982 		dl_subcap->dl_cap = DL_CAPAB_SOFT_RING;
2983 	else
2984 		dl_subcap->dl_cap = DL_CAPAB_POLL;
2985 
2986 	idls = (dl_capab_dls_t *)(dl_subcap + 1);
2987 	if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING)
2988 		idls->dls_flags = SOFT_RING_DISABLE;
2989 	else
2990 		idls->dls_flags = POLL_DISABLE;
2991 
2992 	if (*sc_mp != NULL)
2993 		linkb(*sc_mp, mp);
2994 	else
2995 		*sc_mp = mp;
2996 }
2997 
2998 /*
2999  * Process a soft_ring/poll capability negotiation ack received
3000  * from a DLS Provider. isub must point to the sub-capability
3001  * (DL_CAPAB_SOFT_RING/DL_CAPAB_POLL) of a DL_CAPABILITY_ACK message.
3002  */
3003 static void
3004 ill_capability_dls_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
3005 {
3006 	dl_capab_dls_t *idls;
3007 	uint_t sub_dl_cap = isub->dl_cap;
3008 	uint8_t *capend;
3009 
3010 	ASSERT(sub_dl_cap == DL_CAPAB_SOFT_RING ||
3011 	    sub_dl_cap == DL_CAPAB_POLL);
3012 
3013 	if (ill->ill_isv6)
3014 		return;
3015 
3016 	/*
3017 	 * Note: range checks here are not absolutely sufficient to
3018 	 * make us robust against malformed messages sent by drivers;
3019 	 * this is in keeping with the rest of IP's dlpi handling.
3020 	 * (Remember, it's coming from something else in the kernel
3021 	 * address space)
3022 	 */
3023 	capend = (uint8_t *)(isub + 1) + isub->dl_length;
3024 	if (capend > mp->b_wptr) {
3025 		cmn_err(CE_WARN, "ill_capability_dls_ack: "
3026 		    "malformed sub-capability too long for mblk");
3027 		return;
3028 	}
3029 
3030 	/*
3031 	 * There are two types of acks we process here:
3032 	 * 1. acks in reply to a (first form) generic capability req
3033 	 *    (dls_flag will be set to SOFT_RING_CAPABLE or POLL_CAPABLE)
3034 	 * 2. acks in reply to a SOFT_RING_ENABLE or POLL_ENABLE
3035 	 *    capability req.
3036 	 */
3037 	idls = (dl_capab_dls_t *)(isub + 1);
3038 
3039 	if (!dlcapabcheckqid(&idls->dls_mid, ill->ill_lmod_rq)) {
3040 		ip1dbg(("ill_capability_dls_ack: mid token for dls "
3041 		    "capability isn't as expected; pass-thru "
3042 		    "module(s) detected, discarding capability\n"));
3043 		if (ill->ill_capabilities & ILL_CAPAB_DLS) {
3044 			/*
3045 			 * This is a capability renegotiation case.
3046 			 * The interface had better be unusable at this
3047 			 * point; otherwise bad things will happen
3048 			 * if we disable direct calls on a running
3049 			 * and up interface.
3050 			 */
3051 			ill_capability_dls_disable(ill);
3052 		}
3053 		return;
3054 	}
3055 
3056 	switch (idls->dls_flags) {
3057 	default:
3058 		/* Disable if unknown flag */
3059 	case SOFT_RING_DISABLE:
3060 	case POLL_DISABLE:
3061 		ill_capability_dls_disable(ill);
3062 		break;
3063 	case SOFT_RING_CAPABLE:
3064 	case POLL_CAPABLE:
3065 		/*
3066 		 * If the capability was already enabled, it's safe
3067 		 * to disable it first to get rid of stale information
3068 		 * and then start enabling it again.
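		 * (The overall handshake, as reflected in this switch: the
		 * driver first answers our generic probe with *_CAPABLE,
		 * ill_capability_dls_capable() then sends down an enable
		 * request, and the driver's *_ENABLE ack below is what
		 * finally sets ILL_CAPAB_SOFT_RING or ILL_CAPAB_POLL.)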
3069 		 */
3070 		ill_capability_dls_disable(ill);
3071 		ill_capability_dls_capable(ill, idls, isub);
3072 		break;
3073 	case SOFT_RING_ENABLE:
3074 	case POLL_ENABLE:
3075 		mutex_enter(&ill->ill_lock);
3076 		if (sub_dl_cap == DL_CAPAB_SOFT_RING &&
3077 		    !(ill->ill_capabilities & ILL_CAPAB_SOFT_RING)) {
3078 			ASSERT(ill->ill_dls_capab != NULL);
3079 			ill->ill_capabilities |= ILL_CAPAB_SOFT_RING;
3080 		}
3081 		if (sub_dl_cap == DL_CAPAB_POLL &&
3082 		    !(ill->ill_capabilities & ILL_CAPAB_POLL)) {
3083 			ASSERT(ill->ill_dls_capab != NULL);
3084 			ill->ill_capabilities |= ILL_CAPAB_POLL;
3085 			ip1dbg(("ill_capability_dls_ack: interface %s "
3086 			    "has enabled polling\n", ill->ill_name));
3087 		}
3088 		mutex_exit(&ill->ill_lock);
3089 		break;
3090 	}
3091 }
3092 
3093 /*
3094  * Process a hardware checksum offload capability negotiation ack received
3095  * from a DLS Provider. isub must point to the sub-capability (DL_CAPAB_HCKSUM)
3096  * of a DL_CAPABILITY_ACK message.
3097  */
3098 static void
3099 ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
3100 {
3101 	dl_capability_req_t	*ocap;
3102 	dl_capab_hcksum_t	*ihck, *ohck;
3103 	ill_hcksum_capab_t **ill_hcksum;
3104 	mblk_t *nmp = NULL;
3105 	uint_t sub_dl_cap = isub->dl_cap;
3106 	uint8_t *capend;
3107 
3108 	ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM);
3109 
3110 	ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab;
3111 
3112 	/*
3113 	 * Note: range checks here are not absolutely sufficient to
3114 	 * make us robust against malformed messages sent by drivers;
3115 	 * this is in keeping with the rest of IP's dlpi handling.
3116 	 * (Remember, it's coming from something else in the kernel
3117 	 * address space)
3118 	 */
3119 	capend = (uint8_t *)(isub + 1) + isub->dl_length;
3120 	if (capend > mp->b_wptr) {
3121 		cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
3122 		    "malformed sub-capability too long for mblk");
3123 		return;
3124 	}
3125 
3126 	/*
3127 	 * There are two types of acks we process here:
3128 	 * 1. acks in reply to a (first form) generic capability req
3129 	 *    (no ENABLE flag set)
3130 	 * 2. acks in reply to an ENABLE capability req.
3131 	 *    (ENABLE flag set)
3132 	 */
3133 	ihck = (dl_capab_hcksum_t *)(isub + 1);
3134 
3135 	if (ihck->hcksum_version != HCKSUM_VERSION_1) {
3136 		cmn_err(CE_CONT, "ill_capability_hcksum_ack: "
3137 		    "unsupported hardware checksum "
3138 		    "sub-capability (version %d, expected %d)",
3139 		    ihck->hcksum_version, HCKSUM_VERSION_1);
3140 		return;
3141 	}
3142 
3143 	if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) {
3144 		ip1dbg(("ill_capability_hcksum_ack: mid token for hardware "
3145 		    "checksum capability isn't as expected; pass-thru "
3146 		    "module(s) detected, discarding capability\n"));
3147 		return;
3148 	}
3149 
3150 #define	CURR_HCKSUM_CAPAB				\
3151 	(HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 |	\
3152 	HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM)
3153 
3154 	if ((ihck->hcksum_txflags & HCKSUM_ENABLE) &&
3155 	    (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) {
3156 		/* do ENABLE processing */
3157 		if (*ill_hcksum == NULL) {
3158 			*ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t),
3159 			    KM_NOSLEEP);
3160 
3161 			if (*ill_hcksum == NULL) {
3162 				cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
3163 				    "could not enable hcksum version %d "
3164 				    "for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION,
3165 				    ill->ill_name);
3166 				return;
3167 			}
3168 		}
3169 
3170 		(*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version;
3171 		(*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags;
3172 		ill->ill_capabilities |= ILL_CAPAB_HCKSUM;
3173 		ip1dbg(("ill_capability_hcksum_ack: interface %s "
3174 		    "has enabled hardware checksumming\n",
3175 		    ill->ill_name));
3176 	} else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) {
3177 		/*
3178 		 * Enabling hardware checksum offload.
3179 		 * Currently IP supports {TCP,UDP}/IPv4
3180 		 * partial and full cksum offload and
3181 		 * IPv4 header checksum offload.
3182 		 * Allocate a new mblk which will
3183 		 * contain a new capability request
3184 		 * to enable hardware checksum offload.
3185 		 */
3186 		uint_t	size;
3187 		uchar_t	*rptr;
3188 
3189 		size = sizeof (dl_capability_req_t) +
3190 		    sizeof (dl_capability_sub_t) + isub->dl_length;
3191 
3192 		if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
3193 			cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
3194 			    "could not enable hardware cksum for %s (ENOMEM)\n",
3195 			    ill->ill_name);
3196 			return;
3197 		}
3198 
3199 		rptr = nmp->b_rptr;
3200 		/* initialize dl_capability_req_t */
3201 		ocap = (dl_capability_req_t *)nmp->b_rptr;
3202 		ocap->dl_sub_offset =
3203 		    sizeof (dl_capability_req_t);
3204 		ocap->dl_sub_length =
3205 		    sizeof (dl_capability_sub_t) +
3206 		    isub->dl_length;
3207 		nmp->b_rptr += sizeof (dl_capability_req_t);
3208 
3209 		/* initialize dl_capability_sub_t */
3210 		bcopy(isub, nmp->b_rptr, sizeof (*isub));
3211 		nmp->b_rptr += sizeof (*isub);
3212 
3213 		/* initialize dl_capab_hcksum_t */
3214 		ohck = (dl_capab_hcksum_t *)nmp->b_rptr;
3215 		bcopy(ihck, ohck, sizeof (*ihck));
3216 
3217 		nmp->b_rptr = rptr;
3218 		ASSERT(nmp->b_wptr == (nmp->b_rptr + size));
3219 
3220 		/* Set ENABLE flag */
3221 		ohck->hcksum_txflags &= CURR_HCKSUM_CAPAB;
3222 		ohck->hcksum_txflags |= HCKSUM_ENABLE;
3223 
3224 		/*
3225 		 * nmp points to a DL_CAPABILITY_REQ message to enable
3226 		 * hardware checksum acceleration.
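		 * (Roughly, the CURR_HCKSUM_CAPAB flags above mean:
		 * HCKSUM_INET_PARTIAL - the hardware can finish a partial
		 * ones-complement sum computed over a range we specify;
		 * HCKSUM_INET_FULL_V4/_V6 - it can compute the entire
		 * TCP/UDP checksum itself; HCKSUM_IPHDRCKSUM - it can
		 * fill in the IPv4 header checksum. This gloss follows
		 * the DLPI hcksum convention, not anything in this file.)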
3227 */ 3228 ill_dlpi_send(ill, nmp); 3229 } else { 3230 ip1dbg(("ill_capability_hcksum_ack: interface %s has " 3231 "advertised %x hardware checksum capability flags\n", 3232 ill->ill_name, ihck->hcksum_txflags)); 3233 } 3234 } 3235 3236 static void 3237 ill_capability_hcksum_reset(ill_t *ill, mblk_t **sc_mp) 3238 { 3239 mblk_t *mp; 3240 dl_capab_hcksum_t *hck_subcap; 3241 dl_capability_sub_t *dl_subcap; 3242 int size; 3243 3244 if (!ILL_HCKSUM_CAPABLE(ill)) 3245 return; 3246 3247 ASSERT(ill->ill_hcksum_capab != NULL); 3248 /* 3249 * Clear the capability flag for hardware checksum offload but 3250 * retain the ill_hcksum_capab structure since it's possible that 3251 * another thread is still referring to it. The structure only 3252 * gets deallocated when we destroy the ill. 3253 */ 3254 ill->ill_capabilities &= ~ILL_CAPAB_HCKSUM; 3255 3256 size = sizeof (*dl_subcap) + sizeof (*hck_subcap); 3257 3258 mp = allocb(size, BPRI_HI); 3259 if (mp == NULL) { 3260 ip1dbg(("ill_capability_hcksum_reset: unable to allocate " 3261 "request to disable hardware checksum offload\n")); 3262 return; 3263 } 3264 3265 mp->b_wptr = mp->b_rptr + size; 3266 3267 dl_subcap = (dl_capability_sub_t *)mp->b_rptr; 3268 dl_subcap->dl_cap = DL_CAPAB_HCKSUM; 3269 dl_subcap->dl_length = sizeof (*hck_subcap); 3270 3271 hck_subcap = (dl_capab_hcksum_t *)(dl_subcap + 1); 3272 hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version; 3273 hck_subcap->hcksum_txflags = 0; 3274 3275 if (*sc_mp != NULL) 3276 linkb(*sc_mp, mp); 3277 else 3278 *sc_mp = mp; 3279 } 3280 3281 static void 3282 ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 3283 { 3284 mblk_t *nmp = NULL; 3285 dl_capability_req_t *oc; 3286 dl_capab_zerocopy_t *zc_ic, *zc_oc; 3287 ill_zerocopy_capab_t **ill_zerocopy_capab; 3288 uint_t sub_dl_cap = isub->dl_cap; 3289 uint8_t *capend; 3290 3291 ASSERT(sub_dl_cap == DL_CAPAB_ZEROCOPY); 3292 3293 ill_zerocopy_capab = (ill_zerocopy_capab_t **)&ill->ill_zerocopy_capab; 3294 3295 /* 3296 * Note: range checks here are not absolutely sufficient to 3297 * make us robust against malformed messages sent by drivers; 3298 * this is in keeping with the rest of IP's dlpi handling. 
3299 * (Remember, it's coming from something else in the kernel 3300 * address space) 3301 */ 3302 capend = (uint8_t *)(isub + 1) + isub->dl_length; 3303 if (capend > mp->b_wptr) { 3304 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 3305 "malformed sub-capability too long for mblk"); 3306 return; 3307 } 3308 3309 zc_ic = (dl_capab_zerocopy_t *)(isub + 1); 3310 if (zc_ic->zerocopy_version != ZEROCOPY_VERSION_1) { 3311 cmn_err(CE_CONT, "ill_capability_zerocopy_ack: " 3312 "unsupported ZEROCOPY sub-capability (version %d, " 3313 "expected %d)", zc_ic->zerocopy_version, 3314 ZEROCOPY_VERSION_1); 3315 return; 3316 } 3317 3318 if (!dlcapabcheckqid(&zc_ic->zerocopy_mid, ill->ill_lmod_rq)) { 3319 ip1dbg(("ill_capability_zerocopy_ack: mid token for zerocopy " 3320 "capability isn't as expected; pass-thru module(s) " 3321 "detected, discarding capability\n")); 3322 return; 3323 } 3324 3325 if ((zc_ic->zerocopy_flags & DL_CAPAB_VMSAFE_MEM) != 0) { 3326 if (*ill_zerocopy_capab == NULL) { 3327 *ill_zerocopy_capab = 3328 kmem_zalloc(sizeof (ill_zerocopy_capab_t), 3329 KM_NOSLEEP); 3330 3331 if (*ill_zerocopy_capab == NULL) { 3332 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 3333 "could not enable Zero-copy version %d " 3334 "for %s (ENOMEM)\n", ZEROCOPY_VERSION_1, 3335 ill->ill_name); 3336 return; 3337 } 3338 } 3339 3340 ip1dbg(("ill_capability_zerocopy_ack: interface %s " 3341 "supports Zero-copy version %d\n", ill->ill_name, 3342 ZEROCOPY_VERSION_1)); 3343 3344 (*ill_zerocopy_capab)->ill_zerocopy_version = 3345 zc_ic->zerocopy_version; 3346 (*ill_zerocopy_capab)->ill_zerocopy_flags = 3347 zc_ic->zerocopy_flags; 3348 3349 ill->ill_capabilities |= ILL_CAPAB_ZEROCOPY; 3350 } else { 3351 uint_t size; 3352 uchar_t *rptr; 3353 3354 size = sizeof (dl_capability_req_t) + 3355 sizeof (dl_capability_sub_t) + 3356 sizeof (dl_capab_zerocopy_t); 3357 3358 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 3359 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 3360 "could not enable zerocopy for %s (ENOMEM)\n", 3361 ill->ill_name); 3362 return; 3363 } 3364 3365 rptr = nmp->b_rptr; 3366 /* initialize dl_capability_req_t */ 3367 oc = (dl_capability_req_t *)rptr; 3368 oc->dl_sub_offset = sizeof (dl_capability_req_t); 3369 oc->dl_sub_length = sizeof (dl_capability_sub_t) + 3370 sizeof (dl_capab_zerocopy_t); 3371 rptr += sizeof (dl_capability_req_t); 3372 3373 /* initialize dl_capability_sub_t */ 3374 bcopy(isub, rptr, sizeof (*isub)); 3375 rptr += sizeof (*isub); 3376 3377 /* initialize dl_capab_zerocopy_t */ 3378 zc_oc = (dl_capab_zerocopy_t *)rptr; 3379 *zc_oc = *zc_ic; 3380 3381 ip1dbg(("ill_capability_zerocopy_ack: asking interface %s " 3382 "to enable zero-copy version %d\n", ill->ill_name, 3383 ZEROCOPY_VERSION_1)); 3384 3385 /* set VMSAFE_MEM flag */ 3386 zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM; 3387 3388 /* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */ 3389 ill_dlpi_send(ill, nmp); 3390 } 3391 } 3392 3393 static void 3394 ill_capability_zerocopy_reset(ill_t *ill, mblk_t **sc_mp) 3395 { 3396 mblk_t *mp; 3397 dl_capab_zerocopy_t *zerocopy_subcap; 3398 dl_capability_sub_t *dl_subcap; 3399 int size; 3400 3401 if (!(ill->ill_capabilities & ILL_CAPAB_ZEROCOPY)) 3402 return; 3403 3404 ASSERT(ill->ill_zerocopy_capab != NULL); 3405 /* 3406 * Clear the capability flag for Zero-copy but retain the 3407 * ill_zerocopy_capab structure since it's possible that another 3408 * thread is still referring to it. The structure only gets 3409 * deallocated when we destroy the ill. 
3410 	 */
3411 	ill->ill_capabilities &= ~ILL_CAPAB_ZEROCOPY;
3412 
3413 	size = sizeof (*dl_subcap) + sizeof (*zerocopy_subcap);
3414 
3415 	mp = allocb(size, BPRI_HI);
3416 	if (mp == NULL) {
3417 		ip1dbg(("ill_capability_zerocopy_reset: unable to allocate "
3418 		    "request to disable Zero-copy\n"));
3419 		return;
3420 	}
3421 
3422 	mp->b_wptr = mp->b_rptr + size;
3423 
3424 	dl_subcap = (dl_capability_sub_t *)mp->b_rptr;
3425 	dl_subcap->dl_cap = DL_CAPAB_ZEROCOPY;
3426 	dl_subcap->dl_length = sizeof (*zerocopy_subcap);
3427 
3428 	zerocopy_subcap = (dl_capab_zerocopy_t *)(dl_subcap + 1);
3429 	zerocopy_subcap->zerocopy_version =
3430 	    ill->ill_zerocopy_capab->ill_zerocopy_version;
3431 	zerocopy_subcap->zerocopy_flags = 0;
3432 
3433 	if (*sc_mp != NULL)
3434 		linkb(*sc_mp, mp);
3435 	else
3436 		*sc_mp = mp;
3437 }
3438 
3439 /*
3440  * Consume a new-style hardware capabilities negotiation ack.
3441  * Called from ip_rput_dlpi_writer().
3442  */
3443 void
3444 ill_capability_ack(ill_t *ill, mblk_t *mp)
3445 {
3446 	dl_capability_ack_t *capp;
3447 	dl_capability_sub_t *subp, *endp;
3448 
3449 	if (ill->ill_dlpi_capab_state == IDS_INPROGRESS)
3450 		ill->ill_dlpi_capab_state = IDS_OK;
3451 
3452 	capp = (dl_capability_ack_t *)mp->b_rptr;
3453 
3454 	if (capp->dl_sub_length == 0)
3455 		/* no new-style capabilities */
3456 		return;
3457 
3458 	/* make sure the driver supplied correct dl_sub_length */
3459 	if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) {
3460 		ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, "
3461 		    "invalid dl_sub_length (%d)\n", capp->dl_sub_length));
3462 		return;
3463 	}
3464 
3465 #define	SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset))
3466 	/*
3467 	 * There are sub-capabilities. Process the ones we know about.
3468 	 * Loop until we don't have room for another sub-cap header.
3469 	 */
3470 	for (subp = SC(capp, capp->dl_sub_offset),
3471 	    endp = SC(subp, capp->dl_sub_length - sizeof (*subp));
3472 	    subp <= endp;
3473 	    subp = SC(subp, sizeof (dl_capability_sub_t) + subp->dl_length)) {
3474 
3475 		switch (subp->dl_cap) {
3476 		case DL_CAPAB_ID_WRAPPER:
3477 			ill_capability_id_ack(ill, mp, subp);
3478 			break;
3479 		default:
3480 			ill_capability_dispatch(ill, mp, subp, B_FALSE);
3481 			break;
3482 		}
3483 	}
3484 #undef SC
3485 }
3486 
3487 /*
3488  * This routine is called to scan the fragmentation reassembly table for
3489  * the specified ILL for any packets that are starting to smell.
3490  * dead_interval is the maximum time in seconds that will be tolerated. It
3491  * will either be the value specified in ip_g_frag_timeout, or zero if the
3492  * ILL is shutting down and it is time to blow everything off.
3493  *
3494  * It returns the number of seconds (as a time_t) that the next frag timer
3495  * should be scheduled for, 0 meaning that the timer doesn't need to be
3496  * re-started. Note that the method of calculating next_timeout isn't
3497  * entirely accurate since time will flow between the time we grab
3498  * current_time and the time we schedule the next timeout. This isn't a
3499  * big problem since this is the timer for sending ICMP reassembly time
3500  * exceeded messages, and it doesn't have to be exactly accurate.
3501  *
3502  * This function is
3503  * sometimes called as writer, although this is not required.
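 *
 * For example (hypothetical numbers): with a dead_interval of 60 seconds,
 * a bucket whose oldest fragment is 45 seconds old contributes a candidate
 * timeout of 60 - 45 = 15, and the minimum such value across all buckets
 * is what gets returned and used to rearm the timer.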
3504  */
3505 time_t
3506 ill_frag_timeout(ill_t *ill, time_t dead_interval)
3507 {
3508 	ipfb_t	*ipfb;
3509 	ipfb_t	*endp;
3510 	ipf_t	*ipf;
3511 	ipf_t	*ipfnext;
3512 	mblk_t	*mp;
3513 	time_t	current_time = gethrestime_sec();
3514 	time_t	next_timeout = 0;
3515 	uint32_t	hdr_length;
3516 	mblk_t	*send_icmp_head;
3517 	mblk_t	*send_icmp_head_v6;
3518 	zoneid_t zoneid;
3519 
3520 	ipfb = ill->ill_frag_hash_tbl;
3521 	if (ipfb == NULL)
3522 		return (0);
3523 	endp = &ipfb[ILL_FRAG_HASH_TBL_COUNT];
3524 	/* Walk the frag hash table. */
3525 	for (; ipfb < endp; ipfb++) {
3526 		send_icmp_head = NULL;
3527 		send_icmp_head_v6 = NULL;
3528 		mutex_enter(&ipfb->ipfb_lock);
3529 		while ((ipf = ipfb->ipfb_ipf) != NULL) {
3530 			time_t frag_time = current_time - ipf->ipf_timestamp;
3531 			time_t frag_timeout;
3532 
3533 			if (frag_time < dead_interval) {
3534 				/*
3535 				 * There are some outstanding fragments
3536 				 * that will time out later. Make note of
3537 				 * the time so that we can reschedule the
3538 				 * next timeout appropriately.
3539 				 */
3540 				frag_timeout = dead_interval - frag_time;
3541 				if (next_timeout == 0 ||
3542 				    frag_timeout < next_timeout) {
3543 					next_timeout = frag_timeout;
3544 				}
3545 				break;
3546 			}
3547 			/* Time's up. Get it out of here. */
3548 			hdr_length = ipf->ipf_nf_hdr_len;
3549 			ipfnext = ipf->ipf_hash_next;
3550 			if (ipfnext)
3551 				ipfnext->ipf_ptphn = ipf->ipf_ptphn;
3552 			*ipf->ipf_ptphn = ipfnext;
3553 			mp = ipf->ipf_mp->b_cont;
3554 			for (; mp; mp = mp->b_cont) {
3555 				/* Extra points for neatness. */
3556 				IP_REASS_SET_START(mp, 0);
3557 				IP_REASS_SET_END(mp, 0);
3558 			}
3559 			mp = ipf->ipf_mp->b_cont;
3560 			ill->ill_frag_count -= ipf->ipf_count;
3561 			ASSERT(ipfb->ipfb_count >= ipf->ipf_count);
3562 			ipfb->ipfb_count -= ipf->ipf_count;
3563 			ASSERT(ipfb->ipfb_frag_pkts > 0);
3564 			ipfb->ipfb_frag_pkts--;
3565 			/*
3566 			 * We do not send any icmp message from here because
3567 			 * we currently are holding the ipfb_lock for this
3568 			 * hash chain. If we try to send any icmp messages
3569 			 * from here, we may end up (via a put back into IP)
3570 			 * trying to get the same lock, causing a recursive
3571 			 * mutex panic. Instead we build a list and send all
3572 			 * the icmp messages after we have dropped the lock.
3573 			 */
3574 			if (ill->ill_isv6) {
3575 				BUMP_MIB(ill->ill_ip6_mib, ipv6ReasmFails);
3576 				if (hdr_length != 0) {
3577 					mp->b_next = send_icmp_head_v6;
3578 					send_icmp_head_v6 = mp;
3579 				} else {
3580 					freemsg(mp);
3581 				}
3582 			} else {
3583 				BUMP_MIB(&ip_mib, ipReasmFails);
3584 				if (hdr_length != 0) {
3585 					mp->b_next = send_icmp_head;
3586 					send_icmp_head = mp;
3587 				} else {
3588 					freemsg(mp);
3589 				}
3590 			}
3591 			freeb(ipf->ipf_mp);
3592 		}
3593 		mutex_exit(&ipfb->ipfb_lock);
3594 		/*
3595 		 * Now we need to send any icmp messages that we delayed
3596 		 * above.
3597 */ 3598 while (send_icmp_head_v6 != NULL) { 3599 ip6_t *ip6h; 3600 3601 mp = send_icmp_head_v6; 3602 send_icmp_head_v6 = send_icmp_head_v6->b_next; 3603 mp->b_next = NULL; 3604 if (mp->b_datap->db_type == M_CTL) 3605 ip6h = (ip6_t *)mp->b_cont->b_rptr; 3606 else 3607 ip6h = (ip6_t *)mp->b_rptr; 3608 zoneid = ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst, 3609 ill); 3610 if (zoneid == ALL_ZONES) { 3611 freemsg(mp); 3612 } else { 3613 icmp_time_exceeded_v6(ill->ill_wq, mp, 3614 ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE, 3615 B_FALSE, zoneid); 3616 } 3617 } 3618 while (send_icmp_head != NULL) { 3619 ipaddr_t dst; 3620 3621 mp = send_icmp_head; 3622 send_icmp_head = send_icmp_head->b_next; 3623 mp->b_next = NULL; 3624 3625 if (mp->b_datap->db_type == M_CTL) 3626 dst = ((ipha_t *)mp->b_cont->b_rptr)->ipha_dst; 3627 else 3628 dst = ((ipha_t *)mp->b_rptr)->ipha_dst; 3629 3630 zoneid = ipif_lookup_addr_zoneid(dst, ill); 3631 if (zoneid == ALL_ZONES) { 3632 freemsg(mp); 3633 } else { 3634 icmp_time_exceeded(ill->ill_wq, mp, 3635 ICMP_REASSEMBLY_TIME_EXCEEDED, zoneid); 3636 } 3637 } 3638 } 3639 /* 3640 * A non-dying ILL will use the return value to decide whether to 3641 * restart the frag timer, and for how long. 3642 */ 3643 return (next_timeout); 3644 } 3645 3646 /* 3647 * This routine is called when the approximate count of mblk memory used 3648 * for the specified ILL has exceeded max_count. 3649 */ 3650 void 3651 ill_frag_prune(ill_t *ill, uint_t max_count) 3652 { 3653 ipfb_t *ipfb; 3654 ipf_t *ipf; 3655 size_t count; 3656 3657 /* 3658 * If we are here within ip_min_frag_prune_time msecs remove 3659 * ill_frag_free_num_pkts oldest packets from each bucket and increment 3660 * ill_frag_free_num_pkts. 3661 */ 3662 mutex_enter(&ill->ill_lock); 3663 if (TICK_TO_MSEC(lbolt - ill->ill_last_frag_clean_time) <= 3664 (ip_min_frag_prune_time != 0 ? 3665 ip_min_frag_prune_time : msec_per_tick)) { 3666 3667 ill->ill_frag_free_num_pkts++; 3668 3669 } else { 3670 ill->ill_frag_free_num_pkts = 0; 3671 } 3672 ill->ill_last_frag_clean_time = lbolt; 3673 mutex_exit(&ill->ill_lock); 3674 3675 /* 3676 * free ill_frag_free_num_pkts oldest packets from each bucket. 3677 */ 3678 if (ill->ill_frag_free_num_pkts != 0) { 3679 int ix; 3680 3681 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 3682 ipfb = &ill->ill_frag_hash_tbl[ix]; 3683 mutex_enter(&ipfb->ipfb_lock); 3684 if (ipfb->ipfb_ipf != NULL) { 3685 ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 3686 ill->ill_frag_free_num_pkts); 3687 } 3688 mutex_exit(&ipfb->ipfb_lock); 3689 } 3690 } 3691 /* 3692 * While the reassembly list for this ILL is too big, prune a fragment 3693 * queue by age, oldest first. Note that the per ILL count is 3694 * approximate, while the per frag hash bucket counts are accurate. 
3695 */ 3696 while (ill->ill_frag_count > max_count) { 3697 int ix; 3698 ipfb_t *oipfb = NULL; 3699 uint_t oldest = UINT_MAX; 3700 3701 count = 0; 3702 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 3703 ipfb = &ill->ill_frag_hash_tbl[ix]; 3704 mutex_enter(&ipfb->ipfb_lock); 3705 ipf = ipfb->ipfb_ipf; 3706 if (ipf != NULL && ipf->ipf_gen < oldest) { 3707 oldest = ipf->ipf_gen; 3708 oipfb = ipfb; 3709 } 3710 count += ipfb->ipfb_count; 3711 mutex_exit(&ipfb->ipfb_lock); 3712 } 3713 /* Refresh the per ILL count */ 3714 ill->ill_frag_count = count; 3715 if (oipfb == NULL) { 3716 ill->ill_frag_count = 0; 3717 break; 3718 } 3719 if (count <= max_count) 3720 return; /* Somebody beat us to it, nothing to do */ 3721 mutex_enter(&oipfb->ipfb_lock); 3722 ipf = oipfb->ipfb_ipf; 3723 if (ipf != NULL) { 3724 ill_frag_free_pkts(ill, oipfb, ipf, 1); 3725 } 3726 mutex_exit(&oipfb->ipfb_lock); 3727 } 3728 } 3729 3730 /* 3731 * free 'free_cnt' fragmented packets starting at ipf. 3732 */ 3733 void 3734 ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt) 3735 { 3736 size_t count; 3737 mblk_t *mp; 3738 mblk_t *tmp; 3739 ipf_t **ipfp = ipf->ipf_ptphn; 3740 3741 ASSERT(MUTEX_HELD(&ipfb->ipfb_lock)); 3742 ASSERT(ipfp != NULL); 3743 ASSERT(ipf != NULL); 3744 3745 while (ipf != NULL && free_cnt-- > 0) { 3746 count = ipf->ipf_count; 3747 mp = ipf->ipf_mp; 3748 ipf = ipf->ipf_hash_next; 3749 for (tmp = mp; tmp; tmp = tmp->b_cont) { 3750 IP_REASS_SET_START(tmp, 0); 3751 IP_REASS_SET_END(tmp, 0); 3752 } 3753 ill->ill_frag_count -= count; 3754 ASSERT(ipfb->ipfb_count >= count); 3755 ipfb->ipfb_count -= count; 3756 ASSERT(ipfb->ipfb_frag_pkts > 0); 3757 ipfb->ipfb_frag_pkts--; 3758 freemsg(mp); 3759 BUMP_MIB(&ip_mib, ipReasmFails); 3760 } 3761 3762 if (ipf) 3763 ipf->ipf_ptphn = ipfp; 3764 ipfp[0] = ipf; 3765 } 3766 3767 #define ND_FORWARD_WARNING "The <if>:ip*_forwarding ndd variables are " \ 3768 "obsolete and may be removed in a future release of Solaris. Use " \ 3769 "ifconfig(1M) to manipulate the forwarding status of an interface." 3770 3771 /* 3772 * For obsolete per-interface forwarding configuration; 3773 * called in response to ND_GET. 3774 */ 3775 /* ARGSUSED */ 3776 static int 3777 nd_ill_forward_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr) 3778 { 3779 ill_t *ill = (ill_t *)cp; 3780 3781 cmn_err(CE_WARN, ND_FORWARD_WARNING); 3782 3783 (void) mi_mpprintf(mp, "%d", (ill->ill_flags & ILLF_ROUTER) != 0); 3784 return (0); 3785 } 3786 3787 /* 3788 * For obsolete per-interface forwarding configuration; 3789 * called in response to ND_SET. 3790 */ 3791 /* ARGSUSED */ 3792 static int 3793 nd_ill_forward_set(queue_t *q, mblk_t *mp, char *valuestr, caddr_t cp, 3794 cred_t *ioc_cr) 3795 { 3796 long value; 3797 int retval; 3798 3799 cmn_err(CE_WARN, ND_FORWARD_WARNING); 3800 3801 if (ddi_strtol(valuestr, NULL, 10, &value) != 0 || 3802 value < 0 || value > 1) { 3803 return (EINVAL); 3804 } 3805 3806 rw_enter(&ill_g_lock, RW_READER); 3807 retval = ill_forward_set(q, mp, (value != 0), cp); 3808 rw_exit(&ill_g_lock); 3809 return (retval); 3810 } 3811 3812 /* 3813 * Set an ill's ILLF_ROUTER flag appropriately. If the ill is part of an 3814 * IPMP group, make sure all ill's in the group adopt the new policy. Send 3815 * up RTS_IFINFO routing socket messages for each interface whose flags we 3816 * change. 
3817 */ 3818 /* ARGSUSED */ 3819 int 3820 ill_forward_set(queue_t *q, mblk_t *mp, boolean_t enable, caddr_t cp) 3821 { 3822 ill_t *ill = (ill_t *)cp; 3823 ill_group_t *illgrp; 3824 3825 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ill_g_lock)); 3826 3827 if ((enable && (ill->ill_flags & ILLF_ROUTER)) || 3828 (!enable && !(ill->ill_flags & ILLF_ROUTER)) || 3829 (ill->ill_phyint->phyint_flags & PHYI_LOOPBACK)) 3830 return (EINVAL); 3831 3832 /* 3833 * If the ill is in an IPMP group, set the forwarding policy on all 3834 * members of the group to the same value. 3835 */ 3836 illgrp = ill->ill_group; 3837 if (illgrp != NULL) { 3838 ill_t *tmp_ill; 3839 3840 for (tmp_ill = illgrp->illgrp_ill; tmp_ill != NULL; 3841 tmp_ill = tmp_ill->ill_group_next) { 3842 ip1dbg(("ill_forward_set: %s %s forwarding on %s", 3843 (enable ? "Enabling" : "Disabling"), 3844 (tmp_ill->ill_isv6 ? "IPv6" : "IPv4"), 3845 tmp_ill->ill_name)); 3846 mutex_enter(&tmp_ill->ill_lock); 3847 if (enable) 3848 tmp_ill->ill_flags |= ILLF_ROUTER; 3849 else 3850 tmp_ill->ill_flags &= ~ILLF_ROUTER; 3851 mutex_exit(&tmp_ill->ill_lock); 3852 if (tmp_ill->ill_isv6) 3853 ill_set_nce_router_flags(tmp_ill, enable); 3854 /* Notify routing socket listeners of this change. */ 3855 ip_rts_ifmsg(tmp_ill->ill_ipif); 3856 } 3857 } else { 3858 ip1dbg(("ill_forward_set: %s %s forwarding on %s", 3859 (enable ? "Enabling" : "Disabling"), 3860 (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name)); 3861 mutex_enter(&ill->ill_lock); 3862 if (enable) 3863 ill->ill_flags |= ILLF_ROUTER; 3864 else 3865 ill->ill_flags &= ~ILLF_ROUTER; 3866 mutex_exit(&ill->ill_lock); 3867 if (ill->ill_isv6) 3868 ill_set_nce_router_flags(ill, enable); 3869 /* Notify routing socket listeners of this change. */ 3870 ip_rts_ifmsg(ill->ill_ipif); 3871 } 3872 3873 return (0); 3874 } 3875 3876 /* 3877 * Based on the ILLF_ROUTER flag of an ill, make sure all local nce's for 3878 * addresses assigned to the ill have the NCE_F_ISROUTER flag appropriately 3879 * set or clear. 3880 */ 3881 static void 3882 ill_set_nce_router_flags(ill_t *ill, boolean_t enable) 3883 { 3884 ipif_t *ipif; 3885 nce_t *nce; 3886 3887 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 3888 nce = ndp_lookup_v6(ill, &ipif->ipif_v6lcl_addr, B_FALSE); 3889 if (nce != NULL) { 3890 mutex_enter(&nce->nce_lock); 3891 if (enable) 3892 nce->nce_flags |= NCE_F_ISROUTER; 3893 else 3894 nce->nce_flags &= ~NCE_F_ISROUTER; 3895 mutex_exit(&nce->nce_lock); 3896 NCE_REFRELE(nce); 3897 } 3898 } 3899 } 3900 3901 /* 3902 * Given an ill with a _valid_ name, add the ip_forwarding ndd variable 3903 * for this ill. Make sure the v6/v4 question has been answered about this 3904 * ill. The creation of this ndd variable is only for backwards compatibility. 3905 * The preferred way to control per-interface IP forwarding is through the 3906 * ILLF_ROUTER interface flag. 3907 */ 3908 static int 3909 ill_set_ndd_name(ill_t *ill) 3910 { 3911 char *suffix; 3912 3913 ASSERT(IAM_WRITER_ILL(ill)); 3914 3915 if (ill->ill_isv6) 3916 suffix = ipv6_forward_suffix; 3917 else 3918 suffix = ipv4_forward_suffix; 3919 3920 ill->ill_ndd_name = ill->ill_name + ill->ill_name_length; 3921 bcopy(ill->ill_name, ill->ill_ndd_name, ill->ill_name_length - 1); 3922 /* 3923 * Copies over the '\0'. 3924 * Note that strlen(suffix) is always bounded. 3925 */ 3926 bcopy(suffix, ill->ill_ndd_name + ill->ill_name_length - 1, 3927 strlen(suffix) + 1); 3928 3929 /* 3930 * Use of the nd table requires holding the reader lock. 
3931 * Modifying the nd table through nd_load/nd_unload requires 3932 * the writer lock. 3933 */ 3934 rw_enter(&ip_g_nd_lock, RW_WRITER); 3935 if (!nd_load(&ip_g_nd, ill->ill_ndd_name, nd_ill_forward_get, 3936 nd_ill_forward_set, (caddr_t)ill)) { 3937 /* 3938 * If the nd_load failed, it only means that it could not 3939 * allocate room for further NDD expansion. In that case 3940 * ill_ndd_name is set back to NULL, and this interface is 3941 * governed by the global ip_forwarding 3942 * variable. 3943 */ 3944 rw_exit(&ip_g_nd_lock); 3945 ill->ill_ndd_name = NULL; 3946 return (ENOMEM); 3947 } 3948 rw_exit(&ip_g_nd_lock); 3949 return (0); 3950 } 3951 3952 /* 3953 * Initializes the context structure and returns the first ill in the list. 3954 * Currently start_list and end_list can have the following values: 3955 * MAX_G_HEADS Traverse both IPV4 and IPV6 lists. 3956 * IP_V4_G_HEAD Traverse IPV4 list only. 3957 * IP_V6_G_HEAD Traverse IPV6 list only. 3958 */ 3959 3960 /* 3961 * We don't check for CONDEMNED ills here. Caller must do that if 3962 * necessary under the ill lock. 3963 */ 3964 ill_t * 3965 ill_first(int start_list, int end_list, ill_walk_context_t *ctx) 3966 { 3967 ill_if_t *ifp; 3968 ill_t *ill; 3969 avl_tree_t *avl_tree; 3970 3971 ASSERT(RW_LOCK_HELD(&ill_g_lock)); 3972 ASSERT(end_list <= MAX_G_HEADS && start_list >= 0); 3973 3974 /* 3975 * Set up the lists to search. 3976 */ 3977 if (end_list != MAX_G_HEADS) { 3978 ctx->ctx_current_list = start_list; 3979 ctx->ctx_last_list = end_list; 3980 } else { 3981 ctx->ctx_last_list = MAX_G_HEADS - 1; 3982 ctx->ctx_current_list = 0; 3983 } 3984 3985 while (ctx->ctx_current_list <= ctx->ctx_last_list) { 3986 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list); 3987 if (ifp != (ill_if_t *) 3988 &IP_VX_ILL_G_LIST(ctx->ctx_current_list)) { 3989 avl_tree = &ifp->illif_avl_by_ppa; 3990 ill = avl_first(avl_tree); 3991 /* 3992 * ill is guaranteed to be non-NULL, or ifp would 3993 * not exist. 3994 */ 3995 ASSERT(ill != NULL); 3996 return (ill); 3997 } 3998 ctx->ctx_current_list++; 3999 } 4000 4001 return (NULL); 4002 } 4003 4004 /* 4005 * Returns the next ill in the list. ill_first() must have been called 4006 * before calling ill_next(), or bad things will happen. 4007 */ 4008 4009 /* 4010 * We don't check for CONDEMNED ills here. Caller must do that if 4011 * necessary under the ill lock. 4012 */ 4013 ill_t * 4014 ill_next(ill_walk_context_t *ctx, ill_t *lastill) 4015 { 4016 ill_if_t *ifp; 4017 ill_t *ill; 4018 4019 4020 ASSERT(RW_LOCK_HELD(&ill_g_lock)); 4021 ASSERT(lastill->ill_ifptr != (ill_if_t *) 4022 &IP_VX_ILL_G_LIST(ctx->ctx_current_list)); 4023 if ((ill = avl_walk(&lastill->ill_ifptr->illif_avl_by_ppa, lastill, 4024 AVL_AFTER)) != NULL) { 4025 return (ill); 4026 } 4027 4028 /* Go to the next ill_ifp in the list. */ 4029 ifp = lastill->ill_ifptr->illif_next; 4030 4031 /* Make sure we are not at the end of the circular list. */ 4032 while (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(ctx->ctx_current_list)) { 4033 if (++ctx->ctx_current_list > ctx->ctx_last_list) 4034 return (NULL); 4035 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list); 4036 } 4037 4038 return (avl_first(&ifp->illif_avl_by_ppa)); 4039 } 4040 4041 /* 4042 * Check the interface name for the correct format, which is name+ppa. 4043 * The name can contain characters and digits; the rightmost digits 4044 * make up the ppa number. Use of octal is not allowed; the name must 4045 * contain a ppa. Return a pointer to the start of the ppa. 4046 * In case of error return NULL.
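 *
 * For example, "hme0" yields a pointer to "0" and "e1000g11" yields
 * a pointer to "11", while "hme" (no trailing digits), "0" (digits
 * only) and "hme01" (leading zero, would read as octal) all fail.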
4047 */ 4048 static char * 4049 ill_get_ppa_ptr(char *name) 4050 { 4051 int namelen = mi_strlen(name); 4052 4053 int len = namelen; 4054 4055 name += len; 4056 while (len > 0) { 4057 name--; 4058 if (*name < '0' || *name > '9') 4059 break; 4060 len--; 4061 } 4062 4063 /* empty string, all digits, or no trailing digits */ 4064 if (len == 0 || len == (int)namelen) 4065 return (NULL); 4066 4067 name++; 4068 /* check for attempted use of octal */ 4069 if (*name == '0' && len != (int)namelen - 1) 4070 return (NULL); 4071 return (name); 4072 } 4073 4074 /* 4075 * use avl tree to locate the ill. 4076 */ 4077 static ill_t * 4078 ill_find_by_name(char *name, boolean_t isv6, queue_t *q, mblk_t *mp, 4079 ipsq_func_t func, int *error) 4080 { 4081 char *ppa_ptr = NULL; 4082 int len; 4083 uint_t ppa; 4084 ill_t *ill = NULL; 4085 ill_if_t *ifp; 4086 int list; 4087 ipsq_t *ipsq; 4088 4089 if (error != NULL) 4090 *error = 0; 4091 4092 /* 4093 * get ppa ptr 4094 */ 4095 if (isv6) 4096 list = IP_V6_G_HEAD; 4097 else 4098 list = IP_V4_G_HEAD; 4099 4100 if ((ppa_ptr = ill_get_ppa_ptr(name)) == NULL) { 4101 if (error != NULL) 4102 *error = ENXIO; 4103 return (NULL); 4104 } 4105 4106 len = ppa_ptr - name + 1; 4107 4108 ppa = stoi(&ppa_ptr); 4109 4110 ifp = IP_VX_ILL_G_LIST(list); 4111 4112 while (ifp != (ill_if_t *)&IP_VX_ILL_G_LIST(list)) { 4113 /* 4114 * match is done on len - 1 as the name is not null 4115 * terminated it contains ppa in addition to the interface 4116 * name. 4117 */ 4118 if ((ifp->illif_name_len == len) && 4119 bcmp(ifp->illif_name, name, len - 1) == 0) { 4120 break; 4121 } else { 4122 ifp = ifp->illif_next; 4123 } 4124 } 4125 4126 4127 if (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(list)) { 4128 /* 4129 * Even the interface type does not exist. 4130 */ 4131 if (error != NULL) 4132 *error = ENXIO; 4133 return (NULL); 4134 } 4135 4136 ill = avl_find(&ifp->illif_avl_by_ppa, (void *) &ppa, NULL); 4137 if (ill != NULL) { 4138 /* 4139 * The block comment at the start of ipif_down 4140 * explains the use of the macros used below 4141 */ 4142 GRAB_CONN_LOCK(q); 4143 mutex_enter(&ill->ill_lock); 4144 if (ILL_CAN_LOOKUP(ill)) { 4145 ill_refhold_locked(ill); 4146 mutex_exit(&ill->ill_lock); 4147 RELEASE_CONN_LOCK(q); 4148 return (ill); 4149 } else if (ILL_CAN_WAIT(ill, q)) { 4150 ipsq = ill->ill_phyint->phyint_ipsq; 4151 mutex_enter(&ipsq->ipsq_lock); 4152 mutex_exit(&ill->ill_lock); 4153 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 4154 mutex_exit(&ipsq->ipsq_lock); 4155 RELEASE_CONN_LOCK(q); 4156 *error = EINPROGRESS; 4157 return (NULL); 4158 } 4159 mutex_exit(&ill->ill_lock); 4160 RELEASE_CONN_LOCK(q); 4161 } 4162 if (error != NULL) 4163 *error = ENXIO; 4164 return (NULL); 4165 } 4166 4167 /* 4168 * comparison function for use with avl. 4169 */ 4170 static int 4171 ill_compare_ppa(const void *ppa_ptr, const void *ill_ptr) 4172 { 4173 uint_t ppa; 4174 uint_t ill_ppa; 4175 4176 ASSERT(ppa_ptr != NULL && ill_ptr != NULL); 4177 4178 ppa = *((uint_t *)ppa_ptr); 4179 ill_ppa = ((const ill_t *)ill_ptr)->ill_ppa; 4180 /* 4181 * We want the ill with the lowest ppa to be on the 4182 * top. 4183 */ 4184 if (ill_ppa < ppa) 4185 return (1); 4186 if (ill_ppa > ppa) 4187 return (-1); 4188 return (0); 4189 } 4190 4191 /* 4192 * remove an interface type from the global list. 
4193 */ 4194 static void 4195 ill_delete_interface_type(ill_if_t *interface) 4196 { 4197 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 4198 4199 ASSERT(interface != NULL); 4200 ASSERT(avl_numnodes(&interface->illif_avl_by_ppa) == 0); 4201 4202 avl_destroy(&interface->illif_avl_by_ppa); 4203 if (interface->illif_ppa_arena != NULL) 4204 vmem_destroy(interface->illif_ppa_arena); 4205 4206 remque(interface); 4207 4208 mi_free(interface); 4209 } 4210 4211 /* Defined in ip_netinfo.c */ 4212 extern ddi_taskq_t *eventq_queue_nic; 4213 4214 /* 4215 * remove ill from the global list. 4216 */ 4217 static void 4218 ill_glist_delete(ill_t *ill) 4219 { 4220 char *nicname; 4221 size_t nicnamelen; 4222 hook_nic_event_t *info; 4223 4224 if (ill == NULL) 4225 return; 4226 4227 rw_enter(&ill_g_lock, RW_WRITER); 4228 4229 if (ill->ill_name != NULL) { 4230 nicname = kmem_alloc(ill->ill_name_length, KM_NOSLEEP); 4231 if (nicname != NULL) { 4232 bcopy(ill->ill_name, nicname, ill->ill_name_length); 4233 nicnamelen = ill->ill_name_length; 4234 } 4235 } else { 4236 nicname = NULL; 4237 nicnamelen = 0; 4238 } 4239 4240 /* 4241 * If the ill was never inserted into the AVL tree 4242 * we skip the if branch. 4243 */ 4244 if (ill->ill_ifptr != NULL) { 4245 /* 4246 * remove from AVL tree and free ppa number 4247 */ 4248 avl_remove(&ill->ill_ifptr->illif_avl_by_ppa, ill); 4249 4250 if (ill->ill_ifptr->illif_ppa_arena != NULL) { 4251 vmem_free(ill->ill_ifptr->illif_ppa_arena, 4252 (void *)(uintptr_t)(ill->ill_ppa+1), 1); 4253 } 4254 if (avl_numnodes(&ill->ill_ifptr->illif_avl_by_ppa) == 0) { 4255 ill_delete_interface_type(ill->ill_ifptr); 4256 } 4257 4258 /* 4259 * Indicate ill is no longer in the list. 4260 */ 4261 ill->ill_ifptr = NULL; 4262 ill->ill_name_length = 0; 4263 ill->ill_name[0] = '\0'; 4264 ill->ill_ppa = UINT_MAX; 4265 } 4266 4267 /* 4268 * Run the unplumb hook after the NIC has disappeared from being 4269 * visible so that attempts to revalidate its existance will fail. 4270 * 4271 * This needs to be run inside the ill_g_lock perimeter to ensure 4272 * that the ordering of delivered events to listeners matches the 4273 * order of them in the kernel. 4274 */ 4275 if ((info = ill->ill_nic_event_info) != NULL) { 4276 if (info->hne_event != NE_DOWN) { 4277 ip2dbg(("ill_glist_delete: unexpected nic event %d " 4278 "attached for %s\n", info->hne_event, 4279 ill->ill_name)); 4280 if (info->hne_data != NULL) 4281 kmem_free(info->hne_data, info->hne_datalen); 4282 kmem_free(info, sizeof (hook_nic_event_t)); 4283 } else { 4284 if (ddi_taskq_dispatch(eventq_queue_nic, 4285 ip_ne_queue_func, (void *)info, DDI_SLEEP) 4286 == DDI_FAILURE) { 4287 ip2dbg(("ill_glist_delete: ddi_taskq_dispatch " 4288 "failed\n")); 4289 if (info->hne_data != NULL) 4290 kmem_free(info->hne_data, 4291 info->hne_datalen); 4292 kmem_free(info, sizeof (hook_nic_event_t)); 4293 } 4294 } 4295 } 4296 4297 info = kmem_alloc(sizeof (hook_nic_event_t), KM_NOSLEEP); 4298 if (info != NULL) { 4299 info->hne_nic = ill->ill_phyint->phyint_ifindex; 4300 info->hne_lif = 0; 4301 info->hne_event = NE_UNPLUMB; 4302 info->hne_data = nicname; 4303 info->hne_datalen = nicnamelen; 4304 info->hne_family = ill->ill_isv6 ? 
ipv6 : ipv4; 4305 } else { 4306 ip2dbg(("ill_glist_delete: could not attach UNPLUMB nic event " 4307 "information for %s (ENOMEM)\n", ill->ill_name)); 4308 if (nicname != NULL) 4309 kmem_free(nicname, nicnamelen); 4310 } 4311 4312 ill->ill_nic_event_info = info; 4313 4314 ill_phyint_free(ill); 4315 4316 rw_exit(&ill_g_lock); 4317 } 4318 4319 /* 4320 * Allocate a ppa. If the number of plumbed interfaces of this type is 4321 * less than ill_no_arena, do a linear search to find an unused ppa. 4322 * When the number goes beyond ill_no_arena, switch to using an arena. 4323 * Note: a ppa value of zero cannot be allocated from the vmem arena, as 4324 * it is the return value for an error condition, so each allocation is 4325 * made at ppa + 1 and decremented by one on return. 4326 */ 4327 static int 4328 ill_alloc_ppa(ill_if_t *ifp, ill_t *ill) 4329 { 4330 ill_t *tmp_ill; 4331 uint_t start, end; 4332 int ppa; 4333 4334 if (ifp->illif_ppa_arena == NULL && 4335 (avl_numnodes(&ifp->illif_avl_by_ppa) + 1 > ill_no_arena)) { 4336 /* 4337 * Create an arena. 4338 */ 4339 ifp->illif_ppa_arena = vmem_create(ifp->illif_name, 4340 (void *)1, UINT_MAX - 1, 1, NULL, NULL, 4341 NULL, 0, VM_SLEEP | VMC_IDENTIFIER); 4342 /* Allocate what has already been assigned. */ 4343 for (tmp_ill = avl_first(&ifp->illif_avl_by_ppa); 4344 tmp_ill != NULL; tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, 4345 tmp_ill, AVL_AFTER)) { 4346 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena, 4347 1, /* size */ 4348 1, /* align/quantum */ 4349 0, /* phase */ 4350 0, /* nocross */ 4351 (void *)((uintptr_t)tmp_ill->ill_ppa + 1), /* minaddr */ 4352 (void *)((uintptr_t)tmp_ill->ill_ppa + 2), /* maxaddr */ 4353 VM_NOSLEEP|VM_FIRSTFIT); 4354 if (ppa == 0) { 4355 ip1dbg(("ill_alloc_ppa: ppa allocation" 4356 " failed while switching")); 4357 vmem_destroy(ifp->illif_ppa_arena); 4358 ifp->illif_ppa_arena = NULL; 4359 break; 4360 } 4361 } 4362 } 4363 4364 if (ifp->illif_ppa_arena != NULL) { 4365 if (ill->ill_ppa == UINT_MAX) { 4366 ppa = (int)(uintptr_t)vmem_alloc(ifp->illif_ppa_arena, 4367 1, VM_NOSLEEP|VM_FIRSTFIT); 4368 if (ppa == 0) 4369 return (EAGAIN); 4370 ill->ill_ppa = --ppa; 4371 } else { 4372 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena, 4373 1, /* size */ 4374 1, /* align/quantum */ 4375 0, /* phase */ 4376 0, /* nocross */ 4377 (void *)(uintptr_t)(ill->ill_ppa + 1), /* minaddr */ 4378 (void *)(uintptr_t)(ill->ill_ppa + 2), /* maxaddr */ 4379 VM_NOSLEEP|VM_FIRSTFIT); 4380 /* 4381 * Most likely the allocation failed because 4382 * the requested ppa was in use. 4383 */ 4384 if (ppa == 0) 4385 return (EEXIST); 4386 } 4387 return (0); 4388 } 4389 4390 /* 4391 * No arena is in use and not enough (>ill_no_arena) interfaces have 4392 * been plumbed to create one. Do a linear search for an unused ppa. 4393 */ 4394 if (ill->ill_ppa == UINT_MAX) { 4395 end = UINT_MAX - 1; 4396 start = 0; 4397 } else { 4398 end = start = ill->ill_ppa; 4399 } 4400 4401 tmp_ill = avl_find(&ifp->illif_avl_by_ppa, (void *)&start, NULL); 4402 while (tmp_ill != NULL && tmp_ill->ill_ppa == start) { 4403 if (start++ >= end) { 4404 if (ill->ill_ppa == UINT_MAX) 4405 return (EAGAIN); 4406 else 4407 return (EEXIST); 4408 } 4409 tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, tmp_ill, AVL_AFTER); 4410 } 4411 ill->ill_ppa = start; 4412 return (0); 4413 } 4414 4415 /* 4416 * Insert ill into the list of configured ill's. Once this function completes, 4417 * the ill is globally visible and is available through lookups. More 4418 * precisely, this happens after the caller drops the ill_g_lock.
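 *
 * For example, inserting the loopback ill with name "lo" and letting
 * ill_alloc_ppa() pick ppa 0 leaves "lo0" in ill_name.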
4419 */ 4420 static int 4421 ill_glist_insert(ill_t *ill, char *name, boolean_t isv6) 4422 { 4423 ill_if_t *ill_interface; 4424 avl_index_t where = 0; 4425 int error; 4426 int name_length; 4427 int index; 4428 boolean_t check_length = B_FALSE; 4429 4430 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 4431 4432 name_length = mi_strlen(name) + 1; 4433 4434 if (isv6) 4435 index = IP_V6_G_HEAD; 4436 else 4437 index = IP_V4_G_HEAD; 4438 4439 ill_interface = IP_VX_ILL_G_LIST(index); 4440 /* 4441 * Search for interface type based on name 4442 */ 4443 while (ill_interface != (ill_if_t *)&IP_VX_ILL_G_LIST(index)) { 4444 if ((ill_interface->illif_name_len == name_length) && 4445 (strcmp(ill_interface->illif_name, name) == 0)) { 4446 break; 4447 } 4448 ill_interface = ill_interface->illif_next; 4449 } 4450 4451 /* 4452 * Interface type not found, create one. 4453 */ 4454 if (ill_interface == (ill_if_t *)&IP_VX_ILL_G_LIST(index)) { 4455 4456 ill_g_head_t ghead; 4457 4458 /* 4459 * allocate ill_if_t structure 4460 */ 4461 4462 ill_interface = (ill_if_t *)mi_zalloc(sizeof (ill_if_t)); 4463 if (ill_interface == NULL) { 4464 return (ENOMEM); 4465 } 4466 4467 4468 4469 (void) strcpy(ill_interface->illif_name, name); 4470 ill_interface->illif_name_len = name_length; 4471 4472 avl_create(&ill_interface->illif_avl_by_ppa, 4473 ill_compare_ppa, sizeof (ill_t), 4474 offsetof(struct ill_s, ill_avl_byppa)); 4475 4476 /* 4477 * link the structure in the back to maintain order 4478 * of configuration for ifconfig output. 4479 */ 4480 ghead = ill_g_heads[index]; 4481 insque(ill_interface, ghead.ill_g_list_tail); 4482 4483 } 4484 4485 if (ill->ill_ppa == UINT_MAX) 4486 check_length = B_TRUE; 4487 4488 error = ill_alloc_ppa(ill_interface, ill); 4489 if (error != 0) { 4490 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0) 4491 ill_delete_interface_type(ill->ill_ifptr); 4492 return (error); 4493 } 4494 4495 /* 4496 * When the ppa is choosen by the system, check that there is 4497 * enough space to insert ppa. if a specific ppa was passed in this 4498 * check is not required as the interface name passed in will have 4499 * the right ppa in it. 4500 */ 4501 if (check_length) { 4502 /* 4503 * UINT_MAX - 1 should fit in 10 chars, alloc 12 chars. 4504 */ 4505 char buf[sizeof (uint_t) * 3]; 4506 4507 /* 4508 * convert ppa to string to calculate the amount of space 4509 * required for it in the name. 4510 */ 4511 numtos(ill->ill_ppa, buf); 4512 4513 /* Do we have enough space to insert ppa ? 
*/ 4514 4515 if ((mi_strlen(name) + mi_strlen(buf) + 1) > LIFNAMSIZ) { 4516 /* Free the ppa and the interface type struct. */ 4517 if (ill_interface->illif_ppa_arena != NULL) { 4518 vmem_free(ill_interface->illif_ppa_arena, 4519 (void *)(uintptr_t)(ill->ill_ppa+1), 1); 4520 } 4521 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 4522 0) { 4523 ill_delete_interface_type(ill->ill_ifptr); 4524 } 4525 4526 return (EINVAL); 4527 } 4528 } 4529 4530 (void) sprintf(ill->ill_name, "%s%u", name, ill->ill_ppa); 4531 ill->ill_name_length = mi_strlen(ill->ill_name) + 1; 4532 4533 (void) avl_find(&ill_interface->illif_avl_by_ppa, &ill->ill_ppa, 4534 &where); 4535 ill->ill_ifptr = ill_interface; 4536 avl_insert(&ill_interface->illif_avl_by_ppa, ill, where); 4537 4538 ill_phyint_reinit(ill); 4539 return (0); 4540 } 4541 4542 /* Initialize the per phyint (per IPMP group) ipsq used for serialization */ 4543 static boolean_t 4544 ipsq_init(ill_t *ill) 4545 { 4546 ipsq_t *ipsq; 4547 4548 /* Init the ipsq and implicitly enter as writer */ 4549 ill->ill_phyint->phyint_ipsq = 4550 kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP); 4551 if (ill->ill_phyint->phyint_ipsq == NULL) 4552 return (B_FALSE); 4553 ipsq = ill->ill_phyint->phyint_ipsq; 4554 ipsq->ipsq_phyint_list = ill->ill_phyint; 4555 ill->ill_phyint->phyint_ipsq_next = NULL; 4556 mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, 0); 4557 ipsq->ipsq_refs = 1; 4558 ipsq->ipsq_writer = curthread; 4559 ipsq->ipsq_reentry_cnt = 1; 4560 #ifdef ILL_DEBUG 4561 ipsq->ipsq_depth = getpcstack((pc_t *)ipsq->ipsq_stack, IP_STACK_DEPTH); 4562 #endif 4563 (void) strcpy(ipsq->ipsq_name, ill->ill_name); 4564 return (B_TRUE); 4565 } 4566 4567 /* 4568 * ill_init is called by ip_open when a device control stream is opened. 4569 * It does a few initializations, and shoots a DL_INFO_REQ message down 4570 * to the driver. The response is later picked up in ip_rput_dlpi and 4571 * used to set up default mechanisms for talking to the driver. (Always 4572 * called as writer.) 4573 * 4574 * If this function returns error, ip_open will call ip_close which in 4575 * turn will call ill_delete to clean up any memory allocated here that 4576 * is not yet freed. 4577 */ 4578 int 4579 ill_init(queue_t *q, ill_t *ill) 4580 { 4581 int count; 4582 dl_info_req_t *dlir; 4583 mblk_t *info_mp; 4584 uchar_t *frag_ptr; 4585 4586 /* 4587 * The ill is initialized to zero by mi_alloc*(). In addition 4588 * some fields already contain valid values, initialized in 4589 * ip_open(), before we reach here. 4590 */ 4591 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, 0); 4592 4593 ill->ill_rq = q; 4594 ill->ill_wq = WR(q); 4595 4596 info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)), 4597 BPRI_HI); 4598 if (info_mp == NULL) 4599 return (ENOMEM); 4600 4601 /* 4602 * Allocate sufficient space to contain our fragment hash table and 4603 * the device name.
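 *
 * The single allocation is carved up as sketched below (sizes are
 * those requested in the mi_zalloc() call that follows):
 *
 *	frag_ptr: [ ILL_FRAG_HASH_TBL_SIZE | name and ndd-suffix space ]
 *	           ^ill_frag_hash_tbl        ^ill_name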
4604 */ 4605 frag_ptr = (uchar_t *)mi_zalloc(ILL_FRAG_HASH_TBL_SIZE + 4606 2 * LIFNAMSIZ + 5 + strlen(ipv6_forward_suffix)); 4607 if (frag_ptr == NULL) { 4608 freemsg(info_mp); 4609 return (ENOMEM); 4610 } 4611 ill->ill_frag_ptr = frag_ptr; 4612 ill->ill_frag_free_num_pkts = 0; 4613 ill->ill_last_frag_clean_time = 0; 4614 ill->ill_frag_hash_tbl = (ipfb_t *)frag_ptr; 4615 ill->ill_name = (char *)(frag_ptr + ILL_FRAG_HASH_TBL_SIZE); 4616 for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) { 4617 mutex_init(&ill->ill_frag_hash_tbl[count].ipfb_lock, 4618 NULL, MUTEX_DEFAULT, NULL); 4619 } 4620 4621 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t)); 4622 if (ill->ill_phyint == NULL) { 4623 freemsg(info_mp); 4624 mi_free(frag_ptr); 4625 return (ENOMEM); 4626 } 4627 4628 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0); 4629 /* 4630 * For now pretend this is a v4 ill. We need to set phyint_ill* 4631 * at this point because of the following reason. If we can't 4632 * enter the ipsq at some point and cv_wait, the writer that 4633 * wakes us up tries to locate us using the list of all phyints 4634 * in an ipsq and the ills from the phyint thru the phyint_ill*. 4635 * If we don't set it now, we risk a missed wakeup. 4636 */ 4637 ill->ill_phyint->phyint_illv4 = ill; 4638 ill->ill_ppa = UINT_MAX; 4639 ill->ill_fastpath_list = &ill->ill_fastpath_list; 4640 4641 if (!ipsq_init(ill)) { 4642 freemsg(info_mp); 4643 mi_free(frag_ptr); 4644 mi_free(ill->ill_phyint); 4645 return (ENOMEM); 4646 } 4647 4648 ill->ill_state_flags |= ILL_LL_SUBNET_PENDING; 4649 4650 4651 /* Frag queue limit stuff */ 4652 ill->ill_frag_count = 0; 4653 ill->ill_ipf_gen = 0; 4654 4655 ill->ill_global_timer = INFINITY; 4656 ill->ill_mcast_type = IGMP_V3_ROUTER; /* == MLD_V2_ROUTER */ 4657 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0; 4658 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0; 4659 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS; 4660 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL; 4661 4662 /* 4663 * Initialize IPv6 configuration variables. The IP module is always 4664 * opened as an IPv4 module. Instead tracking down the cases where 4665 * it switches to do ipv6, we'll just initialize the IPv6 configuration 4666 * here for convenience, this has no effect until the ill is set to do 4667 * IPv6. 4668 */ 4669 ill->ill_reachable_time = ND_REACHABLE_TIME; 4670 ill->ill_reachable_retrans_time = ND_RETRANS_TIMER; 4671 ill->ill_xmit_count = ND_MAX_MULTICAST_SOLICIT; 4672 ill->ill_max_buf = ND_MAX_Q; 4673 ill->ill_refcnt = 0; 4674 4675 /* Send down the Info Request to the driver. */ 4676 info_mp->b_datap->db_type = M_PCPROTO; 4677 dlir = (dl_info_req_t *)info_mp->b_rptr; 4678 info_mp->b_wptr = (uchar_t *)&dlir[1]; 4679 dlir->dl_primitive = DL_INFO_REQ; 4680 4681 ill->ill_dlpi_pending = DL_PRIM_INVAL; 4682 4683 qprocson(q); 4684 ill_dlpi_send(ill, info_mp); 4685 4686 return (0); 4687 } 4688 4689 /* 4690 * ill_dls_info 4691 * creates datalink socket info from the device. 
4692 */ 4693 int 4694 ill_dls_info(struct sockaddr_dl *sdl, const ipif_t *ipif) 4695 { 4696 size_t length; 4697 ill_t *ill = ipif->ipif_ill; 4698 4699 sdl->sdl_family = AF_LINK; 4700 sdl->sdl_index = ill->ill_phyint->phyint_ifindex; 4701 sdl->sdl_type = ipif->ipif_type; 4702 (void) ipif_get_name(ipif, sdl->sdl_data, sizeof (sdl->sdl_data)); 4703 length = mi_strlen(sdl->sdl_data); 4704 ASSERT(length < 256); 4705 sdl->sdl_nlen = (uchar_t)length; 4706 sdl->sdl_alen = ill->ill_phys_addr_length; 4707 mutex_enter(&ill->ill_lock); 4708 if (ill->ill_phys_addr_length != 0 && ill->ill_phys_addr != NULL) { 4709 bcopy(ill->ill_phys_addr, &sdl->sdl_data[length], 4710 ill->ill_phys_addr_length); 4711 } 4712 mutex_exit(&ill->ill_lock); 4713 sdl->sdl_slen = 0; 4714 return (sizeof (struct sockaddr_dl)); 4715 } 4716 4717 /* 4718 * ill_xarp_info 4719 * creates xarp info from the device. 4720 */ 4721 static int 4722 ill_xarp_info(struct sockaddr_dl *sdl, ill_t *ill) 4723 { 4724 sdl->sdl_family = AF_LINK; 4725 sdl->sdl_index = ill->ill_phyint->phyint_ifindex; 4726 sdl->sdl_type = ill->ill_type; 4727 (void) ipif_get_name(ill->ill_ipif, sdl->sdl_data, 4728 sizeof (sdl->sdl_data)); 4729 sdl->sdl_nlen = (uchar_t)mi_strlen(sdl->sdl_data); 4730 sdl->sdl_alen = ill->ill_phys_addr_length; 4731 sdl->sdl_slen = 0; 4732 return (sdl->sdl_nlen); 4733 } 4734 4735 static int 4736 loopback_kstat_update(kstat_t *ksp, int rw) 4737 { 4738 kstat_named_t *kn = KSTAT_NAMED_PTR(ksp); 4739 4740 if (rw == KSTAT_WRITE) 4741 return (EACCES); 4742 kn[0].value.ui32 = loopback_packets; 4743 kn[1].value.ui32 = loopback_packets; 4744 return (0); 4745 } 4746 4747 4748 /* 4749 * Has ifindex been plumbed already. 4750 */ 4751 static boolean_t 4752 phyint_exists(uint_t index) 4753 { 4754 phyint_t *phyi; 4755 4756 ASSERT(RW_LOCK_HELD(&ill_g_lock)); 4757 /* 4758 * Indexes are stored in the phyint - a common structure 4759 * to both IPv4 and IPv6. 4760 */ 4761 phyi = avl_find(&phyint_g_list.phyint_list_avl_by_index, 4762 (void *) &index, NULL); 4763 return (phyi != NULL); 4764 } 4765 4766 /* 4767 * Assign a unique interface index for the phyint. 4768 */ 4769 static boolean_t 4770 phyint_assign_ifindex(phyint_t *phyi) 4771 { 4772 uint_t starting_index; 4773 4774 ASSERT(phyi->phyint_ifindex == 0); 4775 if (!ill_index_wrap) { 4776 phyi->phyint_ifindex = ill_index++; 4777 if (ill_index == 0) { 4778 /* Reached the uint_t limit Next time wrap */ 4779 ill_index_wrap = B_TRUE; 4780 } 4781 return (B_TRUE); 4782 } 4783 4784 /* 4785 * Start reusing unused indexes. Note that we hold the ill_g_lock 4786 * at this point and don't want to call any function that attempts 4787 * to get the lock again. 4788 */ 4789 starting_index = ill_index++; 4790 for (; ill_index != starting_index; ill_index++) { 4791 if (ill_index != 0 && !phyint_exists(ill_index)) { 4792 /* found unused index - use it */ 4793 phyi->phyint_ifindex = ill_index; 4794 return (B_TRUE); 4795 } 4796 } 4797 4798 /* 4799 * all interface indicies are inuse. 4800 */ 4801 return (B_FALSE); 4802 } 4803 4804 /* 4805 * Return a pointer to the ill which matches the supplied name. Note that 4806 * the ill name length includes the null termination character. (May be 4807 * called as writer.) 4808 * If do_alloc and the interface is "lo0" it will be automatically created. 4809 * Cannot bump up reference on condemned ills. So dup detect can't be done 4810 * using this func. 
4811 */ 4812 ill_t * 4813 ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, 4814 queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, boolean_t *did_alloc) 4815 { 4816 ill_t *ill; 4817 ipif_t *ipif; 4818 kstat_named_t *kn; 4819 boolean_t isloopback; 4820 ipsq_t *old_ipsq; 4821 4822 isloopback = mi_strcmp(name, ipif_loopback_name) == 0; 4823 4824 rw_enter(&ill_g_lock, RW_READER); 4825 ill = ill_find_by_name(name, isv6, q, mp, func, error); 4826 rw_exit(&ill_g_lock); 4827 if (ill != NULL || (error != NULL && *error == EINPROGRESS)) 4828 return (ill); 4829 4830 /* 4831 * Couldn't find it. Does this happen to be a lookup for the 4832 * loopback device and are we allowed to allocate it? 4833 */ 4834 if (!isloopback || !do_alloc) 4835 return (NULL); 4836 4837 rw_enter(&ill_g_lock, RW_WRITER); 4838 4839 ill = ill_find_by_name(name, isv6, q, mp, func, error); 4840 if (ill != NULL || (error != NULL && *error == EINPROGRESS)) { 4841 rw_exit(&ill_g_lock); 4842 return (ill); 4843 } 4844 4845 /* Create the loopback device on demand */ 4846 ill = (ill_t *)(mi_alloc(sizeof (ill_t) + 4847 sizeof (ipif_loopback_name), BPRI_MED)); 4848 if (ill == NULL) 4849 goto done; 4850 4851 *ill = ill_null; 4852 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, NULL); 4853 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t)); 4854 if (ill->ill_phyint == NULL) 4855 goto done; 4856 4857 if (isv6) 4858 ill->ill_phyint->phyint_illv6 = ill; 4859 else 4860 ill->ill_phyint->phyint_illv4 = ill; 4861 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0); 4862 ill->ill_max_frag = IP_LOOPBACK_MTU; 4863 /* Add room for tcp+ip headers */ 4864 if (isv6) { 4865 ill->ill_isv6 = B_TRUE; 4866 ill->ill_max_frag += IPV6_HDR_LEN + 20; /* for TCP */ 4867 if (!ill_allocate_mibs(ill)) 4868 goto done; 4869 } else { 4870 ill->ill_max_frag += IP_SIMPLE_HDR_LENGTH + 20; 4871 } 4872 ill->ill_max_mtu = ill->ill_max_frag; 4873 /* 4874 * ipif_loopback_name can't be pointed at directly because its used 4875 * by both the ipv4 and ipv6 interfaces. When the ill is removed 4876 * from the glist, ill_glist_delete() sets the first character of 4877 * ill_name to '\0'. 4878 */ 4879 ill->ill_name = (char *)ill + sizeof (*ill); 4880 (void) strcpy(ill->ill_name, ipif_loopback_name); 4881 ill->ill_name_length = sizeof (ipif_loopback_name); 4882 /* Set ill_name_set for ill_phyint_reinit to work properly */ 4883 4884 ill->ill_global_timer = INFINITY; 4885 ill->ill_mcast_type = IGMP_V3_ROUTER; /* == MLD_V2_ROUTER */ 4886 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0; 4887 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0; 4888 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS; 4889 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL; 4890 4891 /* No resolver here. */ 4892 ill->ill_net_type = IRE_LOOPBACK; 4893 4894 /* Initialize the ipsq */ 4895 if (!ipsq_init(ill)) 4896 goto done; 4897 4898 ill->ill_phyint->phyint_ipsq->ipsq_writer = NULL; 4899 ill->ill_phyint->phyint_ipsq->ipsq_reentry_cnt--; 4900 ASSERT(ill->ill_phyint->phyint_ipsq->ipsq_reentry_cnt == 0); 4901 #ifdef ILL_DEBUG 4902 ill->ill_phyint->phyint_ipsq->ipsq_depth = 0; 4903 #endif 4904 ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE); 4905 if (ipif == NULL) 4906 goto done; 4907 4908 ill->ill_flags = ILLF_MULTICAST; 4909 4910 /* Set up default loopback address and mask. 
*/ 4911 if (!isv6) { 4912 ipaddr_t inaddr_loopback = htonl(INADDR_LOOPBACK); 4913 4914 IN6_IPADDR_TO_V4MAPPED(inaddr_loopback, &ipif->ipif_v6lcl_addr); 4915 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; 4916 V4MASK_TO_V6(htonl(IN_CLASSA_NET), ipif->ipif_v6net_mask); 4917 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 4918 ipif->ipif_v6subnet); 4919 ill->ill_flags |= ILLF_IPV4; 4920 } else { 4921 ipif->ipif_v6lcl_addr = ipv6_loopback; 4922 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; 4923 ipif->ipif_v6net_mask = ipv6_all_ones; 4924 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 4925 ipif->ipif_v6subnet); 4926 ill->ill_flags |= ILLF_IPV6; 4927 } 4928 4929 /* 4930 * Chain us in at the end of the ill list. hold the ill 4931 * before we make it globally visible. 1 for the lookup. 4932 */ 4933 ill->ill_refcnt = 0; 4934 ill_refhold(ill); 4935 4936 ill->ill_frag_count = 0; 4937 ill->ill_frag_free_num_pkts = 0; 4938 ill->ill_last_frag_clean_time = 0; 4939 4940 old_ipsq = ill->ill_phyint->phyint_ipsq; 4941 4942 if (ill_glist_insert(ill, "lo", isv6) != 0) 4943 cmn_err(CE_PANIC, "cannot insert loopback interface"); 4944 4945 /* Let SCTP know so that it can add this to its list */ 4946 sctp_update_ill(ill, SCTP_ILL_INSERT); 4947 4948 /* Let SCTP know about this IPIF, so that it can add it to its list */ 4949 sctp_update_ipif(ipif, SCTP_IPIF_INSERT); 4950 4951 /* 4952 * If the ipsq was changed in ill_phyint_reinit free the old ipsq. 4953 */ 4954 if (old_ipsq != ill->ill_phyint->phyint_ipsq) { 4955 /* Loopback ills aren't in any IPMP group */ 4956 ASSERT(!(old_ipsq->ipsq_flags & IPSQ_GROUP)); 4957 ipsq_delete(old_ipsq); 4958 } 4959 4960 /* 4961 * Delay this till the ipif is allocated as ipif_allocate 4962 * de-references ill_phyint for getting the ifindex. We 4963 * can't do this before ipif_allocate because ill_phyint_reinit 4964 * -> phyint_assign_ifindex expects ipif to be present. 4965 */ 4966 mutex_enter(&ill->ill_phyint->phyint_lock); 4967 ill->ill_phyint->phyint_flags |= PHYI_LOOPBACK | PHYI_VIRTUAL; 4968 mutex_exit(&ill->ill_phyint->phyint_lock); 4969 4970 if (loopback_ksp == NULL) { 4971 /* Export loopback interface statistics */ 4972 loopback_ksp = kstat_create("lo", 0, ipif_loopback_name, "net", 4973 KSTAT_TYPE_NAMED, 2, 0); 4974 if (loopback_ksp != NULL) { 4975 loopback_ksp->ks_update = loopback_kstat_update; 4976 kn = KSTAT_NAMED_PTR(loopback_ksp); 4977 kstat_named_init(&kn[0], "ipackets", KSTAT_DATA_UINT32); 4978 kstat_named_init(&kn[1], "opackets", KSTAT_DATA_UINT32); 4979 kstat_install(loopback_ksp); 4980 } 4981 } 4982 4983 if (error != NULL) 4984 *error = 0; 4985 *did_alloc = B_TRUE; 4986 rw_exit(&ill_g_lock); 4987 return (ill); 4988 done: 4989 if (ill != NULL) { 4990 if (ill->ill_phyint != NULL) { 4991 ipsq_t *ipsq; 4992 4993 ipsq = ill->ill_phyint->phyint_ipsq; 4994 if (ipsq != NULL) 4995 kmem_free(ipsq, sizeof (ipsq_t)); 4996 mi_free(ill->ill_phyint); 4997 } 4998 ill_free_mib(ill); 4999 mi_free(ill); 5000 } 5001 rw_exit(&ill_g_lock); 5002 if (error != NULL) 5003 *error = ENOMEM; 5004 return (NULL); 5005 } 5006 5007 /* 5008 * Return a pointer to the ill which matches the index and IP version type. 
5009 */ 5010 ill_t * 5011 ill_lookup_on_ifindex(uint_t index, boolean_t isv6, queue_t *q, mblk_t *mp, 5012 ipsq_func_t func, int *err) 5013 { 5014 ill_t *ill; 5015 ipsq_t *ipsq; 5016 phyint_t *phyi; 5017 5018 ASSERT((q == NULL && mp == NULL && func == NULL && err == NULL) || 5019 (q != NULL && mp != NULL && func != NULL && err != NULL)); 5020 5021 if (err != NULL) 5022 *err = 0; 5023 5024 /* 5025 * Indexes are stored in the phyint - a common structure 5026 * to both IPv4 and IPv6. 5027 */ 5028 rw_enter(&ill_g_lock, RW_READER); 5029 phyi = avl_find(&phyint_g_list.phyint_list_avl_by_index, 5030 (void *) &index, NULL); 5031 if (phyi != NULL) { 5032 ill = isv6 ? phyi->phyint_illv6: phyi->phyint_illv4; 5033 if (ill != NULL) { 5034 /* 5035 * The block comment at the start of ipif_down 5036 * explains the use of the macros used below 5037 */ 5038 GRAB_CONN_LOCK(q); 5039 mutex_enter(&ill->ill_lock); 5040 if (ILL_CAN_LOOKUP(ill)) { 5041 ill_refhold_locked(ill); 5042 mutex_exit(&ill->ill_lock); 5043 RELEASE_CONN_LOCK(q); 5044 rw_exit(&ill_g_lock); 5045 return (ill); 5046 } else if (ILL_CAN_WAIT(ill, q)) { 5047 ipsq = ill->ill_phyint->phyint_ipsq; 5048 mutex_enter(&ipsq->ipsq_lock); 5049 rw_exit(&ill_g_lock); 5050 mutex_exit(&ill->ill_lock); 5051 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 5052 mutex_exit(&ipsq->ipsq_lock); 5053 RELEASE_CONN_LOCK(q); 5054 *err = EINPROGRESS; 5055 return (NULL); 5056 } 5057 RELEASE_CONN_LOCK(q); 5058 mutex_exit(&ill->ill_lock); 5059 } 5060 } 5061 rw_exit(&ill_g_lock); 5062 if (err != NULL) 5063 *err = ENXIO; 5064 return (NULL); 5065 } 5066 5067 /* 5068 * Return the ifindex next in sequence after the passed in ifindex. 5069 * If there is no next ifindex for the given protocol, return 0. 5070 */ 5071 uint_t 5072 ill_get_next_ifindex(uint_t index, boolean_t isv6) 5073 { 5074 phyint_t *phyi; 5075 phyint_t *phyi_initial; 5076 uint_t ifindex; 5077 5078 rw_enter(&ill_g_lock, RW_READER); 5079 5080 if (index == 0) { 5081 phyi = avl_first(&phyint_g_list.phyint_list_avl_by_index); 5082 } else { 5083 phyi = phyi_initial = avl_find( 5084 &phyint_g_list.phyint_list_avl_by_index, 5085 (void *) &index, NULL); 5086 } 5087 5088 for (; phyi != NULL; 5089 phyi = avl_walk(&phyint_g_list.phyint_list_avl_by_index, 5090 phyi, AVL_AFTER)) { 5091 /* 5092 * If we're not returning the first interface in the tree 5093 * and we still haven't moved past the phyint_t that 5094 * corresponds to index, avl_walk needs to be called again 5095 */ 5096 if (!((index != 0) && (phyi == phyi_initial))) { 5097 if (isv6) { 5098 if ((phyi->phyint_illv6) && 5099 ILL_CAN_LOOKUP(phyi->phyint_illv6) && 5100 (phyi->phyint_illv6->ill_isv6 == 1)) 5101 break; 5102 } else { 5103 if ((phyi->phyint_illv4) && 5104 ILL_CAN_LOOKUP(phyi->phyint_illv4) && 5105 (phyi->phyint_illv4->ill_isv6 == 0)) 5106 break; 5107 } 5108 } 5109 } 5110 5111 rw_exit(&ill_g_lock); 5112 5113 if (phyi != NULL) 5114 ifindex = phyi->phyint_ifindex; 5115 else 5116 ifindex = 0; 5117 5118 return (ifindex); 5119 } 5120 5121 5122 /* 5123 * Return the ifindex for the named interface. 5124 * If there is no next ifindex for the interface, return 0. 
5125 */ 5126 uint_t 5127 ill_get_ifindex_by_name(char *name) 5128 { 5129 phyint_t *phyi; 5130 avl_index_t where = 0; 5131 uint_t ifindex; 5132 5133 rw_enter(&ill_g_lock, RW_READER); 5134 5135 if ((phyi = avl_find(&phyint_g_list.phyint_list_avl_by_name, 5136 name, &where)) == NULL) { 5137 rw_exit(&ill_g_lock); 5138 return (0); 5139 } 5140 5141 ifindex = phyi->phyint_ifindex; 5142 5143 rw_exit(&ill_g_lock); 5144 5145 return (ifindex); 5146 } 5147 5148 5149 /* 5150 * Obtain a reference to the ill. The ill_refcnt is a dynamic refcnt 5151 * that gives a running thread a reference to the ill. This reference must be 5152 * released by the thread when it is done accessing the ill and related 5153 * objects. ill_refcnt can not be used to account for static references 5154 * such as other structures pointing to an ill. Callers must generally 5155 * check whether an ill can be refheld by using ILL_CAN_LOOKUP macros 5156 * or be sure that the ill is not being deleted or changing state before 5157 * calling the refhold functions. A non-zero ill_refcnt ensures that the 5158 * ill won't change any of its critical state such as address, netmask etc. 5159 */ 5160 void 5161 ill_refhold(ill_t *ill) 5162 { 5163 mutex_enter(&ill->ill_lock); 5164 ill->ill_refcnt++; 5165 ILL_TRACE_REF(ill); 5166 mutex_exit(&ill->ill_lock); 5167 } 5168 5169 void 5170 ill_refhold_locked(ill_t *ill) 5171 { 5172 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5173 ill->ill_refcnt++; 5174 ILL_TRACE_REF(ill); 5175 } 5176 5177 int 5178 ill_check_and_refhold(ill_t *ill) 5179 { 5180 mutex_enter(&ill->ill_lock); 5181 if (ILL_CAN_LOOKUP(ill)) { 5182 ill_refhold_locked(ill); 5183 mutex_exit(&ill->ill_lock); 5184 return (0); 5185 } 5186 mutex_exit(&ill->ill_lock); 5187 return (ILL_LOOKUP_FAILED); 5188 } 5189 5190 /* 5191 * Must not be called while holding any locks. Otherwise if this is 5192 * the last reference to be released, there is a chance of recursive mutex 5193 * panic due to ill_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 5194 * to restart an ioctl. 5195 */ 5196 void 5197 ill_refrele(ill_t *ill) 5198 { 5199 mutex_enter(&ill->ill_lock); 5200 ASSERT(ill->ill_refcnt != 0); 5201 ill->ill_refcnt--; 5202 ILL_UNTRACE_REF(ill); 5203 if (ill->ill_refcnt != 0) { 5204 /* Every ire pointing to the ill adds 1 to ill_refcnt */ 5205 mutex_exit(&ill->ill_lock); 5206 return; 5207 } 5208 5209 /* Drops the ill_lock */ 5210 ipif_ill_refrele_tail(ill); 5211 } 5212 5213 /* 5214 * Obtain a weak reference count on the ill. This reference ensures the 5215 * ill won't be freed, but the ill may change any of its critical state 5216 * such as netmask, address etc. Returns an error if the ill has started 5217 * closing. 5218 */ 5219 boolean_t 5220 ill_waiter_inc(ill_t *ill) 5221 { 5222 mutex_enter(&ill->ill_lock); 5223 if (ill->ill_state_flags & ILL_CONDEMNED) { 5224 mutex_exit(&ill->ill_lock); 5225 return (B_FALSE); 5226 } 5227 ill->ill_waiters++; 5228 mutex_exit(&ill->ill_lock); 5229 return (B_TRUE); 5230 } 5231 5232 void 5233 ill_waiter_dcr(ill_t *ill) 5234 { 5235 mutex_enter(&ill->ill_lock); 5236 ill->ill_waiters--; 5237 if (ill->ill_waiters == 0) 5238 cv_broadcast(&ill->ill_cv); 5239 mutex_exit(&ill->ill_lock); 5240 } 5241 5242 /* 5243 * Named Dispatch routine to produce a formatted report on all ILLs. 5244 * This report is accessed by using the ndd utility to "get" ND variable 5245 * "ip_ill_status". 
5246 */ 5247 /* ARGSUSED */ 5248 int 5249 ip_ill_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr) 5250 { 5251 ill_t *ill; 5252 ill_walk_context_t ctx; 5253 5254 (void) mi_mpprintf(mp, 5255 "ILL " MI_COL_HDRPAD_STR 5256 /* 01234567[89ABCDEF] */ 5257 "rq " MI_COL_HDRPAD_STR 5258 /* 01234567[89ABCDEF] */ 5259 "wq " MI_COL_HDRPAD_STR 5260 /* 01234567[89ABCDEF] */ 5261 "upcnt mxfrg err name"); 5262 /* 12345 12345 123 xxxxxxxx */ 5263 5264 rw_enter(&ill_g_lock, RW_READER); 5265 ill = ILL_START_WALK_ALL(&ctx); 5266 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5267 (void) mi_mpprintf(mp, 5268 MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR 5269 "%05u %05u %03d %s", 5270 (void *)ill, (void *)ill->ill_rq, (void *)ill->ill_wq, 5271 ill->ill_ipif_up_count, 5272 ill->ill_max_frag, ill->ill_error, ill->ill_name); 5273 } 5274 rw_exit(&ill_g_lock); 5275 5276 return (0); 5277 } 5278 5279 /* 5280 * Named Dispatch routine to produce a formatted report on all IPIFs. 5281 * This report is accessed by using the ndd utility to "get" ND variable 5282 * "ip_ipif_status". 5283 */ 5284 /* ARGSUSED */ 5285 int 5286 ip_ipif_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr) 5287 { 5288 char buf1[INET6_ADDRSTRLEN]; 5289 char buf2[INET6_ADDRSTRLEN]; 5290 char buf3[INET6_ADDRSTRLEN]; 5291 char buf4[INET6_ADDRSTRLEN]; 5292 char buf5[INET6_ADDRSTRLEN]; 5293 char buf6[INET6_ADDRSTRLEN]; 5294 char buf[LIFNAMSIZ]; 5295 ill_t *ill; 5296 ipif_t *ipif; 5297 nv_t *nvp; 5298 uint64_t flags; 5299 zoneid_t zoneid; 5300 ill_walk_context_t ctx; 5301 5302 (void) mi_mpprintf(mp, 5303 "IPIF metric mtu in/out/forward name zone flags...\n" 5304 "\tlocal address\n" 5305 "\tsrc address\n" 5306 "\tsubnet\n" 5307 "\tmask\n" 5308 "\tbroadcast\n" 5309 "\tp-p-dst"); 5310 5311 ASSERT(q->q_next == NULL); 5312 zoneid = Q_TO_CONN(q)->conn_zoneid; /* IP is a driver */ 5313 5314 rw_enter(&ill_g_lock, RW_READER); 5315 ill = ILL_START_WALK_ALL(&ctx); 5316 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5317 for (ipif = ill->ill_ipif; ipif != NULL; 5318 ipif = ipif->ipif_next) { 5319 if (zoneid != GLOBAL_ZONEID && 5320 zoneid != ipif->ipif_zoneid && 5321 ipif->ipif_zoneid != ALL_ZONES) 5322 continue; 5323 (void) mi_mpprintf(mp, 5324 MI_COL_PTRFMT_STR 5325 "%04u %05u %u/%u/%u %s %d", 5326 (void *)ipif, 5327 ipif->ipif_metric, ipif->ipif_mtu, 5328 ipif->ipif_ib_pkt_count, 5329 ipif->ipif_ob_pkt_count, 5330 ipif->ipif_fo_pkt_count, 5331 ipif_get_name(ipif, buf, sizeof (buf)), 5332 ipif->ipif_zoneid); 5333 5334 flags = ipif->ipif_flags | ipif->ipif_ill->ill_flags | 5335 ipif->ipif_ill->ill_phyint->phyint_flags; 5336 5337 /* Tack on text strings for any flags. */ 5338 nvp = ipif_nv_tbl; 5339 for (; nvp < A_END(ipif_nv_tbl); nvp++) { 5340 if (nvp->nv_value & flags) 5341 (void) mi_mpprintf_nr(mp, " %s", 5342 nvp->nv_name); 5343 } 5344 (void) mi_mpprintf(mp, 5345 "\t%s\n\t%s\n\t%s\n\t%s\n\t%s\n\t%s", 5346 inet_ntop(AF_INET6, 5347 &ipif->ipif_v6lcl_addr, buf1, sizeof (buf1)), 5348 inet_ntop(AF_INET6, 5349 &ipif->ipif_v6src_addr, buf2, sizeof (buf2)), 5350 inet_ntop(AF_INET6, 5351 &ipif->ipif_v6subnet, buf3, sizeof (buf3)), 5352 inet_ntop(AF_INET6, 5353 &ipif->ipif_v6net_mask, buf4, sizeof (buf4)), 5354 inet_ntop(AF_INET6, 5355 &ipif->ipif_v6brd_addr, buf5, sizeof (buf5)), 5356 inet_ntop(AF_INET6, 5357 &ipif->ipif_v6pp_dst_addr, 5358 buf6, sizeof (buf6))); 5359 } 5360 } 5361 rw_exit(&ill_g_lock); 5362 return (0); 5363 } 5364 5365 /* 5366 * ip_ll_subnet_defaults is called when we get the DL_INFO_ACK back from the 5367 * driver. 
We construct best guess defaults for lower level information that 5368 * we need. If an interface is brought up without injection of any overriding 5369 * information from outside, we have to be ready to go with these defaults. 5370 * When we get the first DL_INFO_ACK (from ip_open() sending a DL_INFO_REQ) 5371 * we primarily want the dl_provider_style. 5372 * The subsequent DL_INFO_ACK is received after doing a DL_ATTACH and DL_BIND 5373 * at which point we assume the other part of the information is valid. 5374 */ 5375 void 5376 ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) 5377 { 5378 uchar_t *brdcst_addr; 5379 uint_t brdcst_addr_length, phys_addr_length; 5380 t_scalar_t sap_length; 5381 dl_info_ack_t *dlia; 5382 ip_m_t *ipm; 5383 dl_qos_cl_sel1_t *sel1; 5384 5385 ASSERT(IAM_WRITER_ILL(ill)); 5386 5387 /* 5388 * Till the ill is fully up ILL_CHANGING will be set and 5389 * the ill is not globally visible. So no need for a lock. 5390 */ 5391 dlia = (dl_info_ack_t *)mp->b_rptr; 5392 ill->ill_mactype = dlia->dl_mac_type; 5393 5394 ipm = ip_m_lookup(dlia->dl_mac_type); 5395 if (ipm == NULL) { 5396 ipm = ip_m_lookup(DL_OTHER); 5397 ASSERT(ipm != NULL); 5398 } 5399 ill->ill_media = ipm; 5400 5401 /* 5402 * When the new DLPI stuff is ready we'll pull lengths 5403 * from dlia. 5404 */ 5405 if (dlia->dl_version == DL_VERSION_2) { 5406 brdcst_addr_length = dlia->dl_brdcst_addr_length; 5407 brdcst_addr = mi_offset_param(mp, dlia->dl_brdcst_addr_offset, 5408 brdcst_addr_length); 5409 if (brdcst_addr == NULL) { 5410 brdcst_addr_length = 0; 5411 } 5412 sap_length = dlia->dl_sap_length; 5413 phys_addr_length = dlia->dl_addr_length - ABS(sap_length); 5414 ip1dbg(("ip: bcast_len %d, sap_len %d, phys_len %d\n", 5415 brdcst_addr_length, sap_length, phys_addr_length)); 5416 } else { 5417 brdcst_addr_length = 6; 5418 brdcst_addr = ip_six_byte_all_ones; 5419 sap_length = -2; 5420 phys_addr_length = brdcst_addr_length; 5421 } 5422 5423 ill->ill_bcast_addr_length = brdcst_addr_length; 5424 ill->ill_phys_addr_length = phys_addr_length; 5425 ill->ill_sap_length = sap_length; 5426 ill->ill_max_frag = dlia->dl_max_sdu; 5427 ill->ill_max_mtu = ill->ill_max_frag; 5428 5429 ill->ill_type = ipm->ip_m_type; 5430 5431 if (!ill->ill_dlpi_style_set) { 5432 if (dlia->dl_provider_style == DL_STYLE2) 5433 ill->ill_needs_attach = 1; 5434 5435 /* 5436 * Allocate the first ipif on this ill. We don't delay it 5437 * further, as ioctl handling assumes at least one ipif to 5438 * be present. 5439 * 5440 * At this point we don't know whether the ill is v4 or v6. 5441 * We will know this when the SIOCSLIFNAME happens and 5442 * the correct value for ill_isv6 will be assigned in 5443 * ipif_set_values(). We need to hold the ill lock and 5444 * clear the ILL_LL_SUBNET_PENDING flag and atomically do 5445 * the wakeup. 5446 */ 5447 (void) ipif_allocate(ill, 0, IRE_LOCAL, 5448 dlia->dl_provider_style == DL_STYLE2 ? B_FALSE : B_TRUE); 5449 mutex_enter(&ill->ill_lock); 5450 ASSERT(ill->ill_dlpi_style_set == 0); 5451 ill->ill_dlpi_style_set = 1; 5452 ill->ill_state_flags &= ~ILL_LL_SUBNET_PENDING; 5453 cv_broadcast(&ill->ill_cv); 5454 mutex_exit(&ill->ill_lock); 5455 freemsg(mp); 5456 return; 5457 } 5458 ASSERT(ill->ill_ipif != NULL); 5459 /* 5460 * We know whether it is IPv4 or IPv6 now, as this is the 5461 * second DL_INFO_ACK we are receiving in response to the 5462 * DL_INFO_REQ sent in ipif_set_values.
5463 */ 5464 if (ill->ill_isv6) 5465 ill->ill_sap = IP6_DL_SAP; 5466 else 5467 ill->ill_sap = IP_DL_SAP; 5468 /* 5469 * Set ipif_mtu which is used to set the IRE's 5470 * ire_max_frag value. The driver could have sent 5471 * a different mtu from what it sent last time. No 5472 * need to call ipif_mtu_change because IREs have 5473 * not yet been created. 5474 */ 5475 ill->ill_ipif->ipif_mtu = ill->ill_max_mtu; 5476 /* 5477 * Clear all the flags that were set based on ill_bcast_addr_length 5478 * and ill_phys_addr_length (in ipif_set_values) as these could have 5479 * changed now and we need to re-evaluate. 5480 */ 5481 ill->ill_flags &= ~(ILLF_MULTICAST | ILLF_NONUD | ILLF_NOARP); 5482 ill->ill_ipif->ipif_flags &= ~(IPIF_BROADCAST | IPIF_POINTOPOINT); 5483 5484 /* 5485 * Free ill_resolver_mp and ill_bcast_mp as things could have 5486 * changed now. 5487 */ 5488 if (ill->ill_bcast_addr_length == 0) { 5489 if (ill->ill_resolver_mp != NULL) 5490 freemsg(ill->ill_resolver_mp); 5491 if (ill->ill_bcast_mp != NULL) 5492 freemsg(ill->ill_bcast_mp); 5493 if (ill->ill_flags & ILLF_XRESOLV) 5494 ill->ill_net_type = IRE_IF_RESOLVER; 5495 else 5496 ill->ill_net_type = IRE_IF_NORESOLVER; 5497 ill->ill_resolver_mp = ill_dlur_gen(NULL, 5498 ill->ill_phys_addr_length, 5499 ill->ill_sap, 5500 ill->ill_sap_length); 5501 ill->ill_bcast_mp = copymsg(ill->ill_resolver_mp); 5502 5503 if (ill->ill_isv6) 5504 /* 5505 * Note: xresolv interfaces will eventually need NOARP 5506 * set here as well, but that will require those 5507 * external resolvers to have some knowledge of 5508 * that flag and act appropriately. Not to be changed 5509 * at present. 5510 */ 5511 ill->ill_flags |= ILLF_NONUD; 5512 else 5513 ill->ill_flags |= ILLF_NOARP; 5514 5515 if (ill->ill_phys_addr_length == 0) { 5516 if (ill->ill_media->ip_m_mac_type == SUNW_DL_VNI) { 5517 ill->ill_ipif->ipif_flags |= IPIF_NOXMIT; 5518 ill->ill_phyint->phyint_flags |= PHYI_VIRTUAL; 5519 } else { 5520 /* pt-pt supports multicast. */ 5521 ill->ill_flags |= ILLF_MULTICAST; 5522 ill->ill_ipif->ipif_flags |= IPIF_POINTOPOINT; 5523 } 5524 } 5525 } else { 5526 ill->ill_net_type = IRE_IF_RESOLVER; 5527 if (ill->ill_bcast_mp != NULL) 5528 freemsg(ill->ill_bcast_mp); 5529 ill->ill_bcast_mp = ill_dlur_gen(brdcst_addr, 5530 ill->ill_bcast_addr_length, ill->ill_sap, 5531 ill->ill_sap_length); 5532 /* 5533 * Later detect lack of DLPI driver multicast 5534 * capability by catching DL_ENABMULTI errors in 5535 * ip_rput_dlpi. 5536 */ 5537 ill->ill_flags |= ILLF_MULTICAST; 5538 if (!ill->ill_isv6) 5539 ill->ill_ipif->ipif_flags |= IPIF_BROADCAST; 5540 } 5541 /* By default an interface does not support any CoS marking */ 5542 ill->ill_flags &= ~ILLF_COS_ENABLED; 5543 5544 /* 5545 * If we get QoS information in DL_INFO_ACK, the device supports 5546 * some form of CoS marking, set ILLF_COS_ENABLED. 5547 */ 5548 sel1 = (dl_qos_cl_sel1_t *)mi_offset_param(mp, dlia->dl_qos_offset, 5549 dlia->dl_qos_length); 5550 if ((sel1 != NULL) && (sel1->dl_qos_type == DL_QOS_CL_SEL1)) { 5551 ill->ill_flags |= ILLF_COS_ENABLED; 5552 } 5553 5554 /* Clear any previous error indication. */ 5555 ill->ill_error = 0; 5556 freemsg(mp); 5557 } 5558 5559 /* 5560 * Perform various checks to verify that an address would make sense as a 5561 * local, remote, or subnet interface address. 
5562 */
5563 static boolean_t
5564 ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask)
5565 {
5566 ipaddr_t net_mask;
5567
5568 /*
5569 * Don't allow all zeroes, all ones or experimental address, but allow
5570 * all ones netmask.
5571 */
5572 if ((net_mask = ip_net_mask(addr)) == 0)
5573 return (B_FALSE);
5574 /* A given netmask overrides the "guess" netmask */
5575 if (subnet_mask != 0)
5576 net_mask = subnet_mask;
5577 if ((net_mask != ~(ipaddr_t)0) && ((addr == (addr & net_mask)) ||
5578 (addr == (addr | ~net_mask)))) {
5579 return (B_FALSE);
5580 }
5581 if (CLASSD(addr))
5582 return (B_FALSE);
5583
5584 return (B_TRUE);
5585 }
5586
5587 /*
5588 * ipif_lookup_group
5589 * Returns held ipif
5590 */
5591 ipif_t *
5592 ipif_lookup_group(ipaddr_t group, zoneid_t zoneid)
5593 {
5594 ire_t *ire;
5595 ipif_t *ipif;
5596
5597 ire = ire_lookup_multi(group, zoneid);
5598 if (ire == NULL)
5599 return (NULL);
5600 ipif = ire->ire_ipif;
5601 ipif_refhold(ipif);
5602 ire_refrele(ire);
5603 return (ipif);
5604 }
5605
5606 /*
5607 * Look for an ipif with the specified interface address and destination.
5608 * The destination address is used only for matching point-to-point interfaces.
5609 */
5610 ipif_t *
5611 ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp,
5612 ipsq_func_t func, int *error)
5613 {
5614 ipif_t *ipif;
5615 ill_t *ill;
5616 ill_walk_context_t ctx;
5617 ipsq_t *ipsq;
5618
5619 if (error != NULL)
5620 *error = 0;
5621
5622 /*
5623 * First match all the point-to-point interfaces
5624 * before looking at non-point-to-point interfaces.
5625 * This is done to avoid returning non-point-to-point
5626 * ipif instead of unnumbered point-to-point ipif.
5627 */
5628 rw_enter(&ill_g_lock, RW_READER);
5629 ill = ILL_START_WALK_V4(&ctx);
5630 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
5631 GRAB_CONN_LOCK(q);
5632 mutex_enter(&ill->ill_lock);
5633 for (ipif = ill->ill_ipif; ipif != NULL;
5634 ipif = ipif->ipif_next) {
5635 /* Allow the ipif to be down */
5636 if ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
5637 (ipif->ipif_lcl_addr == if_addr) &&
5638 (ipif->ipif_pp_dst_addr == dst)) {
5639 /*
5640 * The block comment at the start of ipif_down
5641 * explains the use of the macros used below
5642 */
5643 if (IPIF_CAN_LOOKUP(ipif)) {
5644 ipif_refhold_locked(ipif);
5645 mutex_exit(&ill->ill_lock);
5646 RELEASE_CONN_LOCK(q);
5647 rw_exit(&ill_g_lock);
5648 return (ipif);
5649 } else if (IPIF_CAN_WAIT(ipif, q)) {
5650 ipsq = ill->ill_phyint->phyint_ipsq;
5651 mutex_enter(&ipsq->ipsq_lock);
5652 mutex_exit(&ill->ill_lock);
5653 rw_exit(&ill_g_lock);
5654 ipsq_enq(ipsq, q, mp, func, NEW_OP,
5655 ill);
5656 mutex_exit(&ipsq->ipsq_lock);
5657 RELEASE_CONN_LOCK(q);
5658 *error = EINPROGRESS;
5659 return (NULL);
5660 }
5661 }
5662 }
5663 mutex_exit(&ill->ill_lock);
5664 RELEASE_CONN_LOCK(q);
5665 }
5666 rw_exit(&ill_g_lock);
5667
5668 /* lookup the ipif based on interface address */
5669 ipif = ipif_lookup_addr(if_addr, NULL, ALL_ZONES, q, mp, func, error);
5670 ASSERT(ipif == NULL || !ipif->ipif_isv6);
5671 return (ipif);
5672 }
5673
5674 /*
5675 * Look for an ipif with the specified address. For point-to-point links
5676 * we look for matches on either the destination address or the local
5677 * address, but we ignore the check on the local address if IPIF_UNNUMBERED
5678 * is set.
5679 * Matches on a specific ill if match_ill is set.
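 *
 * A small worked example (hypothetical addresses): given an unnumbered
 * point-to-point ipif with local address 192.0.2.1 and peer 192.0.2.2,
 * a lookup of 192.0.2.1 skips it on the first (local address) pass
 * because IPIF_UNNUMBERED is set, while a lookup of 192.0.2.2 matches
 * it on the second, point-to-point pass.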
5680 */
5681 ipif_t *
5682 ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q,
5683 mblk_t *mp, ipsq_func_t func, int *error)
5684 {
5685 ipif_t *ipif;
5686 ill_t *ill;
5687 boolean_t ptp = B_FALSE;
5688 ipsq_t *ipsq;
5689 ill_walk_context_t ctx;
5690
5691 if (error != NULL)
5692 *error = 0;
5693
5694 rw_enter(&ill_g_lock, RW_READER);
5695 /*
5696 * Repeat twice, first based on local addresses and
5697 * next time for pointopoint.
5698 */
5699 repeat:
5700 ill = ILL_START_WALK_V4(&ctx);
5701 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
5702 if (match_ill != NULL && ill != match_ill) {
5703 continue;
5704 }
5705 GRAB_CONN_LOCK(q);
5706 mutex_enter(&ill->ill_lock);
5707 for (ipif = ill->ill_ipif; ipif != NULL;
5708 ipif = ipif->ipif_next) {
5709 if (zoneid != ALL_ZONES &&
5710 zoneid != ipif->ipif_zoneid &&
5711 ipif->ipif_zoneid != ALL_ZONES)
5712 continue;
5713 /* Allow the ipif to be down */
5714 if ((!ptp && (ipif->ipif_lcl_addr == addr) &&
5715 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) ||
5716 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) &&
5717 (ipif->ipif_pp_dst_addr == addr))) {
5718 /*
5719 * The block comment at the start of ipif_down
5720 * explains the use of the macros used below
5721 */
5722 if (IPIF_CAN_LOOKUP(ipif)) {
5723 ipif_refhold_locked(ipif);
5724 mutex_exit(&ill->ill_lock);
5725 RELEASE_CONN_LOCK(q);
5726 rw_exit(&ill_g_lock);
5727 return (ipif);
5728 } else if (IPIF_CAN_WAIT(ipif, q)) {
5729 ipsq = ill->ill_phyint->phyint_ipsq;
5730 mutex_enter(&ipsq->ipsq_lock);
5731 mutex_exit(&ill->ill_lock);
5732 rw_exit(&ill_g_lock);
5733 ipsq_enq(ipsq, q, mp, func, NEW_OP,
5734 ill);
5735 mutex_exit(&ipsq->ipsq_lock);
5736 RELEASE_CONN_LOCK(q);
5737 *error = EINPROGRESS;
5738 return (NULL);
5739 }
5740 }
5741 }
5742 mutex_exit(&ill->ill_lock);
5743 RELEASE_CONN_LOCK(q);
5744 }
5745
5746 /* If we already did the ptp case, then we are done */
5747 if (ptp) {
5748 rw_exit(&ill_g_lock);
5749 if (error != NULL)
5750 *error = ENXIO;
5751 return (NULL);
5752 }
5753 ptp = B_TRUE;
5754 goto repeat;
5755 }
5756
5757 /*
5758 * Look for an ipif with the specified address. For point-to-point links
5759 * we look for matches on either the destination address or the local
5760 * address, but we ignore the check on the local address if IPIF_UNNUMBERED
5761 * is set.
5762 * Matches on a specific ill if match_ill is set.
5763 * Return the zoneid for the ipif which matches. ALL_ZONES if no match.
5764 */
5765 zoneid_t
5766 ipif_lookup_addr_zoneid(ipaddr_t addr, ill_t *match_ill)
5767 {
5768 zoneid_t zoneid;
5769 ipif_t *ipif;
5770 ill_t *ill;
5771 boolean_t ptp = B_FALSE;
5772 ill_walk_context_t ctx;
5773
5774 rw_enter(&ill_g_lock, RW_READER);
5775 /*
5776 * Repeat twice, first based on local addresses and
5777 * next time for pointopoint.
5778 */
5779 repeat:
5780 ill = ILL_START_WALK_V4(&ctx);
5781 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
5782 if (match_ill != NULL && ill != match_ill) {
5783 continue;
5784 }
5785 mutex_enter(&ill->ill_lock);
5786 for (ipif = ill->ill_ipif; ipif != NULL;
5787 ipif = ipif->ipif_next) {
5788 /* Allow the ipif to be down */
5789 if (((!ptp && (ipif->ipif_lcl_addr == addr) &&
5790 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) ||
5791 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) &&
5792 (ipif->ipif_pp_dst_addr == addr))) &&
5793 !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
5794 zoneid = ipif->ipif_zoneid;
5795 mutex_exit(&ill->ill_lock);
5796 rw_exit(&ill_g_lock);
5797 /*
5798 * If ipif_zoneid was ALL_ZONES then we have
5799 * a trusted extensions shared IP address.
5800 * In that case GLOBAL_ZONEID works to send.
5801 */
5802 if (zoneid == ALL_ZONES)
5803 zoneid = GLOBAL_ZONEID;
5804 return (zoneid);
5805 }
5806 }
5807 mutex_exit(&ill->ill_lock);
5808 }
5809
5810 /* If we already did the ptp case, then we are done */
5811 if (ptp) {
5812 rw_exit(&ill_g_lock);
5813 return (ALL_ZONES);
5814 }
5815 ptp = B_TRUE;
5816 goto repeat;
5817 }
5818
5819 /*
5820 * Look for an ipif that matches the specified remote address i.e. the
5821 * ipif that would receive the specified packet.
5822 * First look for directly connected interfaces and then do a recursive
5823 * IRE lookup and pick the first ipif corresponding to the source address in the
5824 * ire.
5825 * Returns: held ipif
5826 */
5827 ipif_t *
5828 ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid)
5829 {
5830 ipif_t *ipif;
5831 ire_t *ire;
5832
5833 ASSERT(!ill->ill_isv6);
5834
5835 /*
5836 * Someone could be changing this ipif currently or change it
5837 * after we return this. Thus a few packets could use the old
5838 * values. However structure updates/creates (ire, ilg, ilm etc)
5839 * will atomically be updated or cleaned up with the new value.
5840 * Thus we don't need a lock to check the flags or other attrs below.
5841 */
5842 mutex_enter(&ill->ill_lock);
5843 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
5844 if (!IPIF_CAN_LOOKUP(ipif))
5845 continue;
5846 if (zoneid != ALL_ZONES && zoneid != ipif->ipif_zoneid &&
5847 ipif->ipif_zoneid != ALL_ZONES)
5848 continue;
5849 /* Allow the ipif to be down */
5850 if (ipif->ipif_flags & IPIF_POINTOPOINT) {
5851 if ((ipif->ipif_pp_dst_addr == addr) ||
5852 (!(ipif->ipif_flags & IPIF_UNNUMBERED) &&
5853 ipif->ipif_lcl_addr == addr)) {
5854 ipif_refhold_locked(ipif);
5855 mutex_exit(&ill->ill_lock);
5856 return (ipif);
5857 }
5858 } else if (ipif->ipif_subnet == (addr & ipif->ipif_net_mask)) {
5859 ipif_refhold_locked(ipif);
5860 mutex_exit(&ill->ill_lock);
5861 return (ipif);
5862 }
5863 }
5864 mutex_exit(&ill->ill_lock);
5865 ire = ire_route_lookup(addr, 0, 0, 0, NULL, NULL, zoneid,
5866 NULL, MATCH_IRE_RECURSIVE);
5867 if (ire != NULL) {
5868 /*
5869 * The callers of this function want to know the
5870 * interface on which they have to send the replies
5871 * back. For IRE_CACHES that have ire_stq and ire_ipif
5872 * derived from different ills, we really don't care
5873 * what we return here.
5874 */
5875 ipif = ire->ire_ipif;
5876 if (ipif != NULL) {
5877 ipif_refhold(ipif);
5878 ire_refrele(ire);
5879 return (ipif);
5880 }
5881 ire_refrele(ire);
5882 }
5883 /* Pick the first interface */
5884 ipif = ipif_get_next_ipif(NULL, ill);
5885 return (ipif);
5886 }
5887
5888 /*
5889 * This func does not prevent refcnt from increasing.
But if 5890 * the caller has taken steps to that effect, then this func 5891 * can be used to determine whether the ill has become quiescent 5892 */ 5893 boolean_t 5894 ill_is_quiescent(ill_t *ill) 5895 { 5896 ipif_t *ipif; 5897 5898 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5899 5900 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 5901 if (ipif->ipif_refcnt != 0 || ipif->ipif_ire_cnt != 0) { 5902 return (B_FALSE); 5903 } 5904 } 5905 if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0 || 5906 ill->ill_nce_cnt != 0 || ill->ill_srcif_refcnt != 0 || 5907 ill->ill_mrtun_refcnt != 0) { 5908 return (B_FALSE); 5909 } 5910 return (B_TRUE); 5911 } 5912 5913 /* 5914 * This func does not prevent refcnt from increasing. But if 5915 * the caller has taken steps to that effect, then this func 5916 * can be used to determine whether the ipif has become quiescent 5917 */ 5918 static boolean_t 5919 ipif_is_quiescent(ipif_t *ipif) 5920 { 5921 ill_t *ill; 5922 5923 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 5924 5925 if (ipif->ipif_refcnt != 0 || ipif->ipif_ire_cnt != 0) { 5926 return (B_FALSE); 5927 } 5928 5929 ill = ipif->ipif_ill; 5930 if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 || 5931 ill->ill_logical_down) { 5932 return (B_TRUE); 5933 } 5934 5935 /* This is the last ipif going down or being deleted on this ill */ 5936 if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0) { 5937 return (B_FALSE); 5938 } 5939 5940 return (B_TRUE); 5941 } 5942 5943 /* 5944 * This func does not prevent refcnt from increasing. But if 5945 * the caller has taken steps to that effect, then this func 5946 * can be used to determine whether the ipifs marked with IPIF_MOVING 5947 * have become quiescent and can be moved in a failover/failback. 5948 */ 5949 static ipif_t * 5950 ill_quiescent_to_move(ill_t *ill) 5951 { 5952 ipif_t *ipif; 5953 5954 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5955 5956 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 5957 if (ipif->ipif_state_flags & IPIF_MOVING) { 5958 if (ipif->ipif_refcnt != 0 || ipif->ipif_ire_cnt != 0) { 5959 return (ipif); 5960 } 5961 } 5962 } 5963 return (NULL); 5964 } 5965 5966 /* 5967 * The ipif/ill/ire has been refreled. Do the tail processing. 5968 * Determine if the ipif or ill in question has become quiescent and if so 5969 * wakeup close and/or restart any queued pending ioctl that is waiting 5970 * for the ipif_down (or ill_down) 5971 */ 5972 void 5973 ipif_ill_refrele_tail(ill_t *ill) 5974 { 5975 mblk_t *mp; 5976 conn_t *connp; 5977 ipsq_t *ipsq; 5978 ipif_t *ipif; 5979 5980 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5981 5982 if ((ill->ill_state_flags & ILL_CONDEMNED) && 5983 ill_is_quiescent(ill)) { 5984 /* ill_close may be waiting */ 5985 cv_broadcast(&ill->ill_cv); 5986 } 5987 5988 /* ipsq can't change because ill_lock is held */ 5989 ipsq = ill->ill_phyint->phyint_ipsq; 5990 if (ipsq->ipsq_waitfor == 0) { 5991 /* Not waiting for anything, just return. */ 5992 mutex_exit(&ill->ill_lock); 5993 return; 5994 } 5995 ASSERT(ipsq->ipsq_pending_mp != NULL && 5996 ipsq->ipsq_pending_ipif != NULL); 5997 /* 5998 * ipif->ipif_refcnt must go down to zero for restarting REMOVEIF. 5999 * Last ipif going down needs to down the ill, so ill_ire_cnt must 6000 * be zero for restarting an ioctl that ends up downing the ill. 6001 */ 6002 ipif = ipsq->ipsq_pending_ipif; 6003 if (ipif->ipif_ill != ill) { 6004 /* The ioctl is pending on some other ill. 
*/ 6005 mutex_exit(&ill->ill_lock); 6006 return; 6007 } 6008 6009 switch (ipsq->ipsq_waitfor) { 6010 case IPIF_DOWN: 6011 case IPIF_FREE: 6012 if (!ipif_is_quiescent(ipif)) { 6013 mutex_exit(&ill->ill_lock); 6014 return; 6015 } 6016 break; 6017 6018 case ILL_DOWN: 6019 case ILL_FREE: 6020 /* 6021 * case ILL_FREE arises only for loopback. otherwise ill_delete 6022 * waits synchronously in ip_close, and no message is queued in 6023 * ipsq_pending_mp at all in this case 6024 */ 6025 if (!ill_is_quiescent(ill)) { 6026 mutex_exit(&ill->ill_lock); 6027 return; 6028 } 6029 6030 break; 6031 6032 case ILL_MOVE_OK: 6033 if (ill_quiescent_to_move(ill) != NULL) { 6034 mutex_exit(&ill->ill_lock); 6035 return; 6036 } 6037 6038 break; 6039 default: 6040 cmn_err(CE_PANIC, "ipsq: %p unknown ipsq_waitfor %d\n", 6041 (void *)ipsq, ipsq->ipsq_waitfor); 6042 } 6043 6044 /* 6045 * Incr refcnt for the qwriter_ip call below which 6046 * does a refrele 6047 */ 6048 ill_refhold_locked(ill); 6049 mutex_exit(&ill->ill_lock); 6050 6051 mp = ipsq_pending_mp_get(ipsq, &connp); 6052 ASSERT(mp != NULL); 6053 6054 switch (mp->b_datap->db_type) { 6055 case M_ERROR: 6056 case M_HANGUP: 6057 (void) qwriter_ip(NULL, ill, ill->ill_rq, mp, 6058 ipif_all_down_tail, CUR_OP, B_TRUE); 6059 return; 6060 6061 case M_IOCTL: 6062 case M_IOCDATA: 6063 (void) qwriter_ip(NULL, ill, 6064 (connp != NULL ? CONNP_TO_WQ(connp) : ill->ill_wq), mp, 6065 ip_reprocess_ioctl, CUR_OP, B_TRUE); 6066 return; 6067 6068 default: 6069 cmn_err(CE_PANIC, "ipif_ill_refrele_tail mp %p " 6070 "db_type %d\n", (void *)mp, mp->b_datap->db_type); 6071 } 6072 } 6073 6074 #ifdef ILL_DEBUG 6075 /* Reuse trace buffer from beginning (if reached the end) and record trace */ 6076 void 6077 th_trace_rrecord(th_trace_t *th_trace) 6078 { 6079 tr_buf_t *tr_buf; 6080 uint_t lastref; 6081 6082 lastref = th_trace->th_trace_lastref; 6083 lastref++; 6084 if (lastref == TR_BUF_MAX) 6085 lastref = 0; 6086 th_trace->th_trace_lastref = lastref; 6087 tr_buf = &th_trace->th_trbuf[lastref]; 6088 tr_buf->tr_depth = getpcstack(tr_buf->tr_stack, IP_STACK_DEPTH); 6089 } 6090 6091 th_trace_t * 6092 th_trace_ipif_lookup(ipif_t *ipif) 6093 { 6094 int bucket_id; 6095 th_trace_t *th_trace; 6096 6097 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6098 6099 bucket_id = IP_TR_HASH(curthread); 6100 ASSERT(bucket_id < IP_TR_HASH_MAX); 6101 6102 for (th_trace = ipif->ipif_trace[bucket_id]; th_trace != NULL; 6103 th_trace = th_trace->th_next) { 6104 if (th_trace->th_id == curthread) 6105 return (th_trace); 6106 } 6107 return (NULL); 6108 } 6109 6110 void 6111 ipif_trace_ref(ipif_t *ipif) 6112 { 6113 int bucket_id; 6114 th_trace_t *th_trace; 6115 6116 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6117 6118 if (ipif->ipif_trace_disable) 6119 return; 6120 6121 /* 6122 * Attempt to locate the trace buffer for the curthread. 
6123 * If it does not exist, then allocate a new trace buffer 6124 * and link it in list of trace bufs for this ipif, at the head 6125 */ 6126 th_trace = th_trace_ipif_lookup(ipif); 6127 if (th_trace == NULL) { 6128 bucket_id = IP_TR_HASH(curthread); 6129 th_trace = (th_trace_t *)kmem_zalloc(sizeof (th_trace_t), 6130 KM_NOSLEEP); 6131 if (th_trace == NULL) { 6132 ipif->ipif_trace_disable = B_TRUE; 6133 ipif_trace_cleanup(ipif); 6134 return; 6135 } 6136 th_trace->th_id = curthread; 6137 th_trace->th_next = ipif->ipif_trace[bucket_id]; 6138 th_trace->th_prev = &ipif->ipif_trace[bucket_id]; 6139 if (th_trace->th_next != NULL) 6140 th_trace->th_next->th_prev = &th_trace->th_next; 6141 ipif->ipif_trace[bucket_id] = th_trace; 6142 } 6143 ASSERT(th_trace->th_refcnt >= 0 && 6144 th_trace->th_refcnt < TR_BUF_MAX -1); 6145 th_trace->th_refcnt++; 6146 th_trace_rrecord(th_trace); 6147 } 6148 6149 void 6150 ipif_untrace_ref(ipif_t *ipif) 6151 { 6152 th_trace_t *th_trace; 6153 6154 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6155 6156 if (ipif->ipif_trace_disable) 6157 return; 6158 th_trace = th_trace_ipif_lookup(ipif); 6159 ASSERT(th_trace != NULL); 6160 ASSERT(th_trace->th_refcnt > 0); 6161 6162 th_trace->th_refcnt--; 6163 th_trace_rrecord(th_trace); 6164 } 6165 6166 th_trace_t * 6167 th_trace_ill_lookup(ill_t *ill) 6168 { 6169 th_trace_t *th_trace; 6170 int bucket_id; 6171 6172 ASSERT(MUTEX_HELD(&ill->ill_lock)); 6173 6174 bucket_id = IP_TR_HASH(curthread); 6175 ASSERT(bucket_id < IP_TR_HASH_MAX); 6176 6177 for (th_trace = ill->ill_trace[bucket_id]; th_trace != NULL; 6178 th_trace = th_trace->th_next) { 6179 if (th_trace->th_id == curthread) 6180 return (th_trace); 6181 } 6182 return (NULL); 6183 } 6184 6185 void 6186 ill_trace_ref(ill_t *ill) 6187 { 6188 int bucket_id; 6189 th_trace_t *th_trace; 6190 6191 ASSERT(MUTEX_HELD(&ill->ill_lock)); 6192 if (ill->ill_trace_disable) 6193 return; 6194 /* 6195 * Attempt to locate the trace buffer for the curthread. 
6196 * If it does not exist, then allocate a new trace buffer 6197 * and link it in list of trace bufs for this ill, at the head 6198 */ 6199 th_trace = th_trace_ill_lookup(ill); 6200 if (th_trace == NULL) { 6201 bucket_id = IP_TR_HASH(curthread); 6202 th_trace = (th_trace_t *)kmem_zalloc(sizeof (th_trace_t), 6203 KM_NOSLEEP); 6204 if (th_trace == NULL) { 6205 ill->ill_trace_disable = B_TRUE; 6206 ill_trace_cleanup(ill); 6207 return; 6208 } 6209 th_trace->th_id = curthread; 6210 th_trace->th_next = ill->ill_trace[bucket_id]; 6211 th_trace->th_prev = &ill->ill_trace[bucket_id]; 6212 if (th_trace->th_next != NULL) 6213 th_trace->th_next->th_prev = &th_trace->th_next; 6214 ill->ill_trace[bucket_id] = th_trace; 6215 } 6216 ASSERT(th_trace->th_refcnt >= 0 && 6217 th_trace->th_refcnt < TR_BUF_MAX - 1); 6218 6219 th_trace->th_refcnt++; 6220 th_trace_rrecord(th_trace); 6221 } 6222 6223 void 6224 ill_untrace_ref(ill_t *ill) 6225 { 6226 th_trace_t *th_trace; 6227 6228 ASSERT(MUTEX_HELD(&ill->ill_lock)); 6229 6230 if (ill->ill_trace_disable) 6231 return; 6232 th_trace = th_trace_ill_lookup(ill); 6233 ASSERT(th_trace != NULL); 6234 ASSERT(th_trace->th_refcnt > 0); 6235 6236 th_trace->th_refcnt--; 6237 th_trace_rrecord(th_trace); 6238 } 6239 6240 /* 6241 * Verify that this thread has no refs to the ipif and free 6242 * the trace buffers 6243 */ 6244 /* ARGSUSED */ 6245 void 6246 ipif_thread_exit(ipif_t *ipif, void *dummy) 6247 { 6248 th_trace_t *th_trace; 6249 6250 mutex_enter(&ipif->ipif_ill->ill_lock); 6251 6252 th_trace = th_trace_ipif_lookup(ipif); 6253 if (th_trace == NULL) { 6254 mutex_exit(&ipif->ipif_ill->ill_lock); 6255 return; 6256 } 6257 ASSERT(th_trace->th_refcnt == 0); 6258 /* unlink th_trace and free it */ 6259 *th_trace->th_prev = th_trace->th_next; 6260 if (th_trace->th_next != NULL) 6261 th_trace->th_next->th_prev = th_trace->th_prev; 6262 th_trace->th_next = NULL; 6263 th_trace->th_prev = NULL; 6264 kmem_free(th_trace, sizeof (th_trace_t)); 6265 6266 mutex_exit(&ipif->ipif_ill->ill_lock); 6267 } 6268 6269 /* 6270 * Verify that this thread has no refs to the ill and free 6271 * the trace buffers 6272 */ 6273 /* ARGSUSED */ 6274 void 6275 ill_thread_exit(ill_t *ill, void *dummy) 6276 { 6277 th_trace_t *th_trace; 6278 6279 mutex_enter(&ill->ill_lock); 6280 6281 th_trace = th_trace_ill_lookup(ill); 6282 if (th_trace == NULL) { 6283 mutex_exit(&ill->ill_lock); 6284 return; 6285 } 6286 ASSERT(th_trace->th_refcnt == 0); 6287 /* unlink th_trace and free it */ 6288 *th_trace->th_prev = th_trace->th_next; 6289 if (th_trace->th_next != NULL) 6290 th_trace->th_next->th_prev = th_trace->th_prev; 6291 th_trace->th_next = NULL; 6292 th_trace->th_prev = NULL; 6293 kmem_free(th_trace, sizeof (th_trace_t)); 6294 6295 mutex_exit(&ill->ill_lock); 6296 } 6297 #endif 6298 6299 #ifdef ILL_DEBUG 6300 void 6301 ip_thread_exit(void) 6302 { 6303 ill_t *ill; 6304 ipif_t *ipif; 6305 ill_walk_context_t ctx; 6306 6307 rw_enter(&ill_g_lock, RW_READER); 6308 ill = ILL_START_WALK_ALL(&ctx); 6309 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 6310 for (ipif = ill->ill_ipif; ipif != NULL; 6311 ipif = ipif->ipif_next) { 6312 ipif_thread_exit(ipif, NULL); 6313 } 6314 ill_thread_exit(ill, NULL); 6315 } 6316 rw_exit(&ill_g_lock); 6317 6318 ire_walk(ire_thread_exit, NULL); 6319 ndp_walk_common(&ndp4, NULL, nce_thread_exit, NULL, B_FALSE); 6320 ndp_walk_common(&ndp6, NULL, nce_thread_exit, NULL, B_FALSE); 6321 } 6322 6323 /* 6324 * Called when ipif is unplumbed or when memory alloc fails 6325 */ 6326 void 6327 
ipif_trace_cleanup(ipif_t *ipif) 6328 { 6329 int i; 6330 th_trace_t *th_trace; 6331 th_trace_t *th_trace_next; 6332 6333 for (i = 0; i < IP_TR_HASH_MAX; i++) { 6334 for (th_trace = ipif->ipif_trace[i]; th_trace != NULL; 6335 th_trace = th_trace_next) { 6336 th_trace_next = th_trace->th_next; 6337 kmem_free(th_trace, sizeof (th_trace_t)); 6338 } 6339 ipif->ipif_trace[i] = NULL; 6340 } 6341 } 6342 6343 /* 6344 * Called when ill is unplumbed or when memory alloc fails 6345 */ 6346 void 6347 ill_trace_cleanup(ill_t *ill) 6348 { 6349 int i; 6350 th_trace_t *th_trace; 6351 th_trace_t *th_trace_next; 6352 6353 for (i = 0; i < IP_TR_HASH_MAX; i++) { 6354 for (th_trace = ill->ill_trace[i]; th_trace != NULL; 6355 th_trace = th_trace_next) { 6356 th_trace_next = th_trace->th_next; 6357 kmem_free(th_trace, sizeof (th_trace_t)); 6358 } 6359 ill->ill_trace[i] = NULL; 6360 } 6361 } 6362 6363 #else 6364 void ip_thread_exit(void) {} 6365 #endif 6366 6367 void 6368 ipif_refhold_locked(ipif_t *ipif) 6369 { 6370 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6371 ipif->ipif_refcnt++; 6372 IPIF_TRACE_REF(ipif); 6373 } 6374 6375 void 6376 ipif_refhold(ipif_t *ipif) 6377 { 6378 ill_t *ill; 6379 6380 ill = ipif->ipif_ill; 6381 mutex_enter(&ill->ill_lock); 6382 ipif->ipif_refcnt++; 6383 IPIF_TRACE_REF(ipif); 6384 mutex_exit(&ill->ill_lock); 6385 } 6386 6387 /* 6388 * Must not be called while holding any locks. Otherwise if this is 6389 * the last reference to be released there is a chance of recursive mutex 6390 * panic due to ipif_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 6391 * to restart an ioctl. 6392 */ 6393 void 6394 ipif_refrele(ipif_t *ipif) 6395 { 6396 ill_t *ill; 6397 6398 ill = ipif->ipif_ill; 6399 6400 mutex_enter(&ill->ill_lock); 6401 ASSERT(ipif->ipif_refcnt != 0); 6402 ipif->ipif_refcnt--; 6403 IPIF_UNTRACE_REF(ipif); 6404 if (ipif->ipif_refcnt != 0) { 6405 mutex_exit(&ill->ill_lock); 6406 return; 6407 } 6408 6409 /* Drops the ill_lock */ 6410 ipif_ill_refrele_tail(ill); 6411 } 6412 6413 ipif_t * 6414 ipif_get_next_ipif(ipif_t *curr, ill_t *ill) 6415 { 6416 ipif_t *ipif; 6417 6418 mutex_enter(&ill->ill_lock); 6419 for (ipif = (curr == NULL ? ill->ill_ipif : curr->ipif_next); 6420 ipif != NULL; ipif = ipif->ipif_next) { 6421 if (!IPIF_CAN_LOOKUP(ipif)) 6422 continue; 6423 ipif_refhold_locked(ipif); 6424 mutex_exit(&ill->ill_lock); 6425 return (ipif); 6426 } 6427 mutex_exit(&ill->ill_lock); 6428 return (NULL); 6429 } 6430 6431 /* 6432 * TODO: make this table extendible at run time 6433 * Return a pointer to the mac type info for 'mac_type' 6434 */ 6435 static ip_m_t * 6436 ip_m_lookup(t_uscalar_t mac_type) 6437 { 6438 ip_m_t *ipm; 6439 6440 for (ipm = ip_m_tbl; ipm < A_END(ip_m_tbl); ipm++) 6441 if (ipm->ip_m_mac_type == mac_type) 6442 return (ipm); 6443 return (NULL); 6444 } 6445 6446 /* 6447 * ip_rt_add is called to add an IPv4 route to the forwarding table. 6448 * ipif_arg is passed in to associate it with the correct interface. 6449 * We may need to restart this operation if the ipif cannot be looked up 6450 * due to an exclusive operation that is currently in progress. 
The restart
6451 * entry point is specified by 'func'
6452 */
6453 int
6454 ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
6455 ipaddr_t src_addr, int flags, ipif_t *ipif_arg, ipif_t *src_ipif,
6456 ire_t **ire_arg, boolean_t ioctl_msg, queue_t *q, mblk_t *mp,
6457 ipsq_func_t func, struct rtsa_s *sp)
6458 {
6459 ire_t *ire;
6460 ire_t *gw_ire = NULL;
6461 ipif_t *ipif = NULL;
6462 boolean_t ipif_refheld = B_FALSE;
6463 uint_t type;
6464 int match_flags = MATCH_IRE_TYPE;
6465 int error;
6466 tsol_gc_t *gc = NULL;
6467 tsol_gcgrp_t *gcgrp = NULL;
6468 boolean_t gcgrp_xtraref = B_FALSE;
6469
6470 ip1dbg(("ip_rt_add:"));
6471
6472 if (ire_arg != NULL)
6473 *ire_arg = NULL;
6474
6475 /*
6476 * If this is the case of RTF_HOST being set, then we set the netmask
6477 * to all ones (regardless if one was supplied).
6478 */
6479 if (flags & RTF_HOST)
6480 mask = IP_HOST_MASK;
6481
6482 /*
6483 * Prevent routes with a zero gateway from being created (since
6484 * interfaces can currently be plumbed and brought up with no assigned
6485 * address).
6486 * For routes with RTA_SRCIFP, the gateway address can be 0.0.0.0.
6487 */
6488 if (gw_addr == 0 && src_ipif == NULL)
6489 return (ENETUNREACH);
6490 /*
6491 * Get the ipif, if any, corresponding to the gw_addr
6492 */
6493 if (gw_addr != 0) {
6494 ipif = ipif_lookup_interface(gw_addr, dst_addr, q, mp, func,
6495 &error);
6496 if (ipif != NULL) {
6497 if (IS_VNI(ipif->ipif_ill)) {
6498 ipif_refrele(ipif);
6499 return (EINVAL);
6500 }
6501 ipif_refheld = B_TRUE;
6502 } else if (error == EINPROGRESS) {
6503 ip1dbg(("ip_rt_add: null and EINPROGRESS"));
6504 return (EINPROGRESS);
6505 } else {
6506 error = 0;
6507 }
6508 }
6509
6510 if (ipif != NULL) {
6511 ip1dbg(("ip_rt_add: ipif_lookup_interface done ipif nonnull"));
6512 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
6513 } else {
6514 ip1dbg(("ip_rt_add: ipif_lookup_interface done ipif is null"));
6515 }
6516
6517 /*
6518 * GateD will attempt to create routes with a loopback interface
6519 * address as the gateway and with RTF_GATEWAY set. We allow
6520 * these routes to be added, but create them as interface routes
6521 * since the gateway is an interface address.
6522 */
6523 if ((ipif != NULL) && (ipif->ipif_ire_type == IRE_LOOPBACK)) {
6524 flags &= ~RTF_GATEWAY;
6525 if (gw_addr == INADDR_LOOPBACK && dst_addr == INADDR_LOOPBACK &&
6526 mask == IP_HOST_MASK) {
6527 ire = ire_ctable_lookup(dst_addr, 0, IRE_LOOPBACK, ipif,
6528 ALL_ZONES, NULL, match_flags);
6529 if (ire != NULL) {
6530 ire_refrele(ire);
6531 if (ipif_refheld)
6532 ipif_refrele(ipif);
6533 return (EEXIST);
6534 }
6535 ip1dbg(("ipif_up_done: 0x%p creating IRE 0x%x"
6536 "for 0x%x\n", (void *)ipif,
6537 ipif->ipif_ire_type,
6538 ntohl(ipif->ipif_lcl_addr)));
6539 ire = ire_create(
6540 (uchar_t *)&dst_addr, /* dest address */
6541 (uchar_t *)&mask, /* mask */
6542 (uchar_t *)&ipif->ipif_src_addr,
6543 NULL, /* no gateway */
6544 NULL,
6545 &ipif->ipif_mtu,
6546 NULL,
6547 ipif->ipif_rq, /* recv-from queue */
6548 NULL, /* no send-to queue */
6549 ipif->ipif_ire_type, /* LOOPBACK */
6550 NULL,
6551 ipif,
6552 NULL,
6553 0,
6554 0,
6555 0,
6556 (ipif->ipif_flags & IPIF_PRIVATE) ?
6557 RTF_PRIVATE : 0, 6558 &ire_uinfo_null, 6559 NULL, 6560 NULL); 6561 6562 if (ire == NULL) { 6563 if (ipif_refheld) 6564 ipif_refrele(ipif); 6565 return (ENOMEM); 6566 } 6567 error = ire_add(&ire, q, mp, func, B_FALSE); 6568 if (error == 0) 6569 goto save_ire; 6570 if (ipif_refheld) 6571 ipif_refrele(ipif); 6572 return (error); 6573 6574 } 6575 } 6576 6577 /* 6578 * Traditionally, interface routes are ones where RTF_GATEWAY isn't set 6579 * and the gateway address provided is one of the system's interface 6580 * addresses. By using the routing socket interface and supplying an 6581 * RTA_IFP sockaddr with an interface index, an alternate method of 6582 * specifying an interface route to be created is available which uses 6583 * the interface index that specifies the outgoing interface rather than 6584 * the address of an outgoing interface (which may not be able to 6585 * uniquely identify an interface). When coupled with the RTF_GATEWAY 6586 * flag, routes can be specified which not only specify the next-hop to 6587 * be used when routing to a certain prefix, but also which outgoing 6588 * interface should be used. 6589 * 6590 * Previously, interfaces would have unique addresses assigned to them 6591 * and so the address assigned to a particular interface could be used 6592 * to identify a particular interface. One exception to this was the 6593 * case of an unnumbered interface (where IPIF_UNNUMBERED was set). 6594 * 6595 * With the advent of IPv6 and its link-local addresses, this 6596 * restriction was relaxed and interfaces could share addresses between 6597 * themselves. In fact, typically all of the link-local interfaces on 6598 * an IPv6 node or router will have the same link-local address. In 6599 * order to differentiate between these interfaces, the use of an 6600 * interface index is necessary and this index can be carried inside a 6601 * RTA_IFP sockaddr (which is actually a sockaddr_dl). One restriction 6602 * of using the interface index, however, is that all of the ipif's that 6603 * are part of an ill have the same index and so the RTA_IFP sockaddr 6604 * cannot be used to differentiate between ipif's (or logical 6605 * interfaces) that belong to the same ill (physical interface). 6606 * 6607 * For example, in the following case involving IPv4 interfaces and 6608 * logical interfaces 6609 * 6610 * 192.0.2.32 255.255.255.224 192.0.2.33 U if0 6611 * 192.0.2.32 255.255.255.224 192.0.2.34 U if0:1 6612 * 192.0.2.32 255.255.255.224 192.0.2.35 U if0:2 6613 * 6614 * the ipif's corresponding to each of these interface routes can be 6615 * uniquely identified by the "gateway" (actually interface address). 6616 * 6617 * In this case involving multiple IPv6 default routes to a particular 6618 * link-local gateway, the use of RTA_IFP is necessary to specify which 6619 * default route is of interest: 6620 * 6621 * default fe80::123:4567:89ab:cdef U if0 6622 * default fe80::123:4567:89ab:cdef U if1 6623 */ 6624 6625 /* RTF_GATEWAY not set */ 6626 if (!(flags & RTF_GATEWAY)) { 6627 queue_t *stq; 6628 queue_t *rfq = NULL; 6629 ill_t *in_ill = NULL; 6630 6631 if (sp != NULL) { 6632 ip2dbg(("ip_rt_add: gateway security attributes " 6633 "cannot be set with interface route\n")); 6634 if (ipif_refheld) 6635 ipif_refrele(ipif); 6636 return (EINVAL); 6637 } 6638 6639 /* 6640 * As the interface index specified with the RTA_IFP sockaddr is 6641 * the same for all ipif's off of an ill, the matching logic 6642 * below uses MATCH_IRE_ILL if such an index was specified. 
6643 * This means that routes sharing the same prefix when added 6644 * using a RTA_IFP sockaddr must have distinct interface 6645 * indices (namely, they must be on distinct ill's). 6646 * 6647 * On the other hand, since the gateway address will usually be 6648 * different for each ipif on the system, the matching logic 6649 * uses MATCH_IRE_IPIF in the case of a traditional interface 6650 * route. This means that interface routes for the same prefix 6651 * can be created if they belong to distinct ipif's and if a 6652 * RTA_IFP sockaddr is not present. 6653 */ 6654 if (ipif_arg != NULL) { 6655 if (ipif_refheld) { 6656 ipif_refrele(ipif); 6657 ipif_refheld = B_FALSE; 6658 } 6659 ipif = ipif_arg; 6660 match_flags |= MATCH_IRE_ILL; 6661 } else { 6662 /* 6663 * Check the ipif corresponding to the gw_addr 6664 */ 6665 if (ipif == NULL) 6666 return (ENETUNREACH); 6667 match_flags |= MATCH_IRE_IPIF; 6668 } 6669 ASSERT(ipif != NULL); 6670 /* 6671 * If src_ipif is not NULL, we have to create 6672 * an ire with non-null ire_in_ill value 6673 */ 6674 if (src_ipif != NULL) { 6675 in_ill = src_ipif->ipif_ill; 6676 } 6677 6678 /* 6679 * We check for an existing entry at this point. 6680 * 6681 * Since a netmask isn't passed in via the ioctl interface 6682 * (SIOCADDRT), we don't check for a matching netmask in that 6683 * case. 6684 */ 6685 if (!ioctl_msg) 6686 match_flags |= MATCH_IRE_MASK; 6687 if (src_ipif != NULL) { 6688 /* Look up in the special table */ 6689 ire = ire_srcif_table_lookup(dst_addr, IRE_INTERFACE, 6690 ipif, src_ipif->ipif_ill, match_flags); 6691 } else { 6692 ire = ire_ftable_lookup(dst_addr, mask, 0, 6693 IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, 6694 NULL, match_flags); 6695 } 6696 if (ire != NULL) { 6697 ire_refrele(ire); 6698 if (ipif_refheld) 6699 ipif_refrele(ipif); 6700 return (EEXIST); 6701 } 6702 6703 if (src_ipif != NULL) { 6704 /* 6705 * Create the special ire for the IRE table 6706 * which hangs out of ire_in_ill. This ire 6707 * is in-between IRE_CACHE and IRE_INTERFACE. 6708 * Thus rfq is non-NULL. 6709 */ 6710 rfq = ipif->ipif_rq; 6711 } 6712 /* Create the usual interface ires */ 6713 6714 stq = (ipif->ipif_net_type == IRE_IF_RESOLVER) 6715 ? ipif->ipif_rq : ipif->ipif_wq; 6716 6717 /* 6718 * Create a copy of the IRE_LOOPBACK, 6719 * IRE_IF_NORESOLVER or IRE_IF_RESOLVER with 6720 * the modified address and netmask. 6721 */ 6722 ire = ire_create( 6723 (uchar_t *)&dst_addr, 6724 (uint8_t *)&mask, 6725 (uint8_t *)&ipif->ipif_src_addr, 6726 NULL, 6727 NULL, 6728 &ipif->ipif_mtu, 6729 NULL, 6730 rfq, 6731 stq, 6732 ipif->ipif_net_type, 6733 ipif->ipif_resolver_mp, 6734 ipif, 6735 in_ill, 6736 0, 6737 0, 6738 0, 6739 flags, 6740 &ire_uinfo_null, 6741 NULL, 6742 NULL); 6743 if (ire == NULL) { 6744 if (ipif_refheld) 6745 ipif_refrele(ipif); 6746 return (ENOMEM); 6747 } 6748 6749 /* 6750 * Some software (for example, GateD and Sun Cluster) attempts 6751 * to create (what amount to) IRE_PREFIX routes with the 6752 * loopback address as the gateway. This is primarily done to 6753 * set up prefixes with the RTF_REJECT flag set (for example, 6754 * when generating aggregate routes.) 6755 * 6756 * If the IRE type (as defined by ipif->ipif_net_type) is 6757 * IRE_LOOPBACK, then we map the request into a 6758 * IRE_IF_NORESOLVER. 6759 * 6760 * Needless to say, the real IRE_LOOPBACK is NOT created by this 6761 * routine, but rather using ire_create() directly. 
6762 *
6763 */
6764 if (ipif->ipif_net_type == IRE_LOOPBACK)
6765 ire->ire_type = IRE_IF_NORESOLVER;
6766
6767 error = ire_add(&ire, q, mp, func, B_FALSE);
6768 if (error == 0)
6769 goto save_ire;
6770
6771 /*
6772 * In the case of failure, ire_add() will have already
6773 * deleted the ire in question, so there is no need to
6774 * do that here.
6775 */
6776 if (ipif_refheld)
6777 ipif_refrele(ipif);
6778 return (error);
6779 }
6780 if (ipif_refheld) {
6781 ipif_refrele(ipif);
6782 ipif_refheld = B_FALSE;
6783 }
6784
6785 if (src_ipif != NULL) {
6786 /* RTA_SRCIFP is not supported on RTF_GATEWAY */
6787 ip2dbg(("ip_rt_add: SRCIF cannot be set with gateway route\n"));
6788 return (EINVAL);
6789 }
6790 /*
6791 * Get an interface IRE for the specified gateway.
6792 * If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the
6793 * gateway, it is currently unreachable and we fail the request
6794 * accordingly.
6795 */
6796 ipif = ipif_arg;
6797 if (ipif_arg != NULL)
6798 match_flags |= MATCH_IRE_ILL;
6799 gw_ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE, ipif_arg, NULL,
6800 ALL_ZONES, 0, NULL, match_flags);
6801 if (gw_ire == NULL)
6802 return (ENETUNREACH);
6803
6804 /*
6805 * We create one of three types of IREs as a result of this request
6806 * based on the netmask. A netmask of all ones (which is automatically
6807 * assumed when RTF_HOST is set) results in an IRE_HOST being created.
6808 * An all zeroes netmask implies a default route so an IRE_DEFAULT is
6809 * created. Otherwise, an IRE_PREFIX route is created for the
6810 * destination prefix.
6811 */
6812 if (mask == IP_HOST_MASK)
6813 type = IRE_HOST;
6814 else if (mask == 0)
6815 type = IRE_DEFAULT;
6816 else
6817 type = IRE_PREFIX;
6818
6819 /* check for a duplicate entry */
6820 ire = ire_ftable_lookup(dst_addr, mask, gw_addr, type, ipif_arg,
6821 NULL, ALL_ZONES, 0, NULL,
6822 match_flags | MATCH_IRE_MASK | MATCH_IRE_GW);
6823 if (ire != NULL) {
6824 ire_refrele(gw_ire);
6825 ire_refrele(ire);
6826 return (EEXIST);
6827 }
6828
6829 /* Security attribute exists */
6830 if (sp != NULL) {
6831 tsol_gcgrp_addr_t ga;
6832
6833 /* find or create the gateway credentials group */
6834 ga.ga_af = AF_INET;
6835 IN6_IPADDR_TO_V4MAPPED(gw_addr, &ga.ga_addr);
6836
6837 /* we hold reference to it upon success */
6838 gcgrp = gcgrp_lookup(&ga, B_TRUE);
6839 if (gcgrp == NULL) {
6840 ire_refrele(gw_ire);
6841 return (ENOMEM);
6842 }
6843
6844 /*
6845 * Create and add the security attribute to the group; a
6846 * reference to the group is made upon allocating a new
6847 * entry successfully. If it finds an already-existing
6848 * entry for the security attribute in the group, it simply
6849 * returns it and no new reference is made to the group.
6850 */
6851 gc = gc_create(sp, gcgrp, &gcgrp_xtraref);
6852 if (gc == NULL) {
6853 /* release reference held by gcgrp_lookup */
6854 GCGRP_REFRELE(gcgrp);
6855 ire_refrele(gw_ire);
6856 return (ENOMEM);
6857 }
6858 }
6859
6860 /* Create the IRE. */
6861 ire = ire_create(
6862 (uchar_t *)&dst_addr, /* dest address */
6863 (uchar_t *)&mask, /* mask */
6864 /* src address assigned by the caller? */
6865 (uchar_t *)(((src_addr != INADDR_ANY) &&
6866 (flags & RTF_SETSRC)) ?
&src_addr : NULL),
6867 (uchar_t *)&gw_addr, /* gateway address */
6868 NULL, /* no in-srcaddress */
6869 &gw_ire->ire_max_frag,
6870 NULL, /* no Fast Path header */
6871 NULL, /* no recv-from queue */
6872 NULL, /* no send-to queue */
6873 (ushort_t)type, /* IRE type */
6874 NULL,
6875 ipif_arg,
6876 NULL,
6877 0,
6878 0,
6879 0,
6880 flags,
6881 &gw_ire->ire_uinfo, /* Inherit ULP info from gw */
6882 gc, /* security attribute */
6883 NULL);
6884 /*
6885 * The ire holds a reference to the 'gc' and the 'gc' holds a
6886 * reference to the 'gcgrp'. We can now release the extra reference
6887 * the 'gcgrp' acquired in the gcgrp_lookup, if it was not used.
6888 */
6889 if (gcgrp_xtraref)
6890 GCGRP_REFRELE(gcgrp);
6891 if (ire == NULL) {
6892 if (gc != NULL)
6893 GC_REFRELE(gc);
6894 ire_refrele(gw_ire);
6895 return (ENOMEM);
6896 }
6897
6898 /*
6899 * POLICY: should we allow an RTF_HOST with address INADDR_ANY?
6900 * SUN/OS socket stuff does but do we really want to allow 0.0.0.0?
6901 */
6902
6903 /* Add the new IRE. */
6904 error = ire_add(&ire, q, mp, func, B_FALSE);
6905 if (error != 0) {
6906 /*
6907 * In the case of failure, ire_add() will have already
6908 * deleted the ire in question, so there is no need to
6909 * do that here.
6910 */
6911 ire_refrele(gw_ire);
6912 return (error);
6913 }
6914
6915 if (flags & RTF_MULTIRT) {
6916 /*
6917 * Invoke the CGTP (multirouting) filtering module
6918 * to add the dst address in the filtering database.
6919 * Replicated inbound packets coming from that address
6920 * will be filtered to discard the duplicates.
6921 * It is not necessary to call the CGTP filter hook
6922 * when the dst address is a broadcast or multicast,
6923 * because an IP source address cannot be a broadcast
6924 * or a multicast.
6925 */
6926 ire_t *ire_dst = ire_ctable_lookup(ire->ire_addr, 0,
6927 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE);
6928 if (ire_dst != NULL) {
6929 ip_cgtp_bcast_add(ire, ire_dst);
6930 ire_refrele(ire_dst);
6931 goto save_ire;
6932 }
6933 if ((ip_cgtp_filter_ops != NULL) && !CLASSD(ire->ire_addr)) {
6934 int res = ip_cgtp_filter_ops->cfo_add_dest_v4(
6935 ire->ire_addr,
6936 ire->ire_gateway_addr,
6937 ire->ire_src_addr,
6938 gw_ire->ire_src_addr);
6939 if (res != 0) {
6940 ire_refrele(gw_ire);
6941 ire_delete(ire);
6942 return (res);
6943 }
6944 }
6945 }
6946
6947 /*
6948 * Now that the prefix IRE entry has been created, delete any
6949 * existing gateway IRE cache entries as well as any IRE caches
6950 * using the gateway, and force them to be created through
6951 * ip_newroute.
6952 */
6953 if (gc != NULL) {
6954 ASSERT(gcgrp != NULL);
6955 ire_clookup_delete_cache_gw(gw_addr, ALL_ZONES);
6956 }
6957
6958 save_ire:
6959 if (gw_ire != NULL) {
6960 ire_refrele(gw_ire);
6961 }
6962 /*
6963 * We do not do save_ire for the routes added with RTA_SRCIFP
6964 * flag. This route is only added and deleted by mipagent.
6965 * So, for simplicity of design, we refrain from saving
6966 * ires that are created with srcif value. This may change
6967 * in future if we find more usage of srcifp feature.
6968 */
6969 if (ipif != NULL && src_ipif == NULL) {
6970 /*
6971 * Save enough information so that we can recreate the IRE if
6972 * the interface goes down and then up. The metrics associated
6973 * with the route will be saved as well when rts_setmetrics() is
6974 * called after the IRE has been created. In the case where
6975 * memory cannot be allocated, none of this information will be
6976 * saved.
6977 */
6978 ipif_save_ire(ipif, ire);
6979 }
6980 if (ioctl_msg)
6981 ip_rts_rtmsg(RTM_OLDADD, ire, 0);
6982 if (ire_arg != NULL) {
6983 /*
6984 * Store the ire that was successfully added into where ire_arg
6985 * points to so that callers don't have to look it up
6986 * themselves (but they are responsible for ire_refrele()ing
6987 * the ire when they are finished with it).
6988 */
6989 *ire_arg = ire;
6990 } else {
6991 ire_refrele(ire); /* Held in ire_add */
6992 }
6993 if (ipif_refheld)
6994 ipif_refrele(ipif);
6995 return (0);
6996 }
6997
6998 /*
6999 * ip_rt_delete is called to delete an IPv4 route.
7000 * ipif_arg is passed in to associate it with the correct interface.
7001 * src_ipif is passed to associate the incoming interface of the packet.
7002 * We may need to restart this operation if the ipif cannot be looked up
7003 * due to an exclusive operation that is currently in progress. The restart
7004 * entry point is specified by 'func'
7005 */
7006 /* ARGSUSED4 */
7007 int
7008 ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
7009 uint_t rtm_addrs, int flags, ipif_t *ipif_arg, ipif_t *src_ipif,
7010 boolean_t ioctl_msg, queue_t *q, mblk_t *mp, ipsq_func_t func)
7011 {
7012 ire_t *ire = NULL;
7013 ipif_t *ipif;
7014 boolean_t ipif_refheld = B_FALSE;
7015 uint_t type;
7016 uint_t match_flags = MATCH_IRE_TYPE;
7017 int err = 0;
7018
7019 ip1dbg(("ip_rt_delete:"));
7020 /*
7021 * If this is the case of RTF_HOST being set, then we set the netmask
7022 * to all ones. Otherwise, we use the netmask if one was supplied.
7023 */
7024 if (flags & RTF_HOST) {
7025 mask = IP_HOST_MASK;
7026 match_flags |= MATCH_IRE_MASK;
7027 } else if (rtm_addrs & RTA_NETMASK) {
7028 match_flags |= MATCH_IRE_MASK;
7029 }
7030
7031 /*
7032 * Note that RTF_GATEWAY is never set on a delete, therefore
7033 * we check if the gateway address is one of our interfaces first,
7034 * and fall back on RTF_GATEWAY routes.
7035 *
7036 * This makes it possible to delete an original
7037 * IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1.
7038 *
7039 * As the interface index specified with the RTA_IFP sockaddr is the
7040 * same for all ipif's off of an ill, the matching logic below uses
7041 * MATCH_IRE_ILL if such an index was specified. This means a route
7042 * sharing the same prefix and interface index as the route
7043 * intended to be deleted might be deleted instead if a RTA_IFP sockaddr
7044 * is specified in the request.
7045 *
7046 * On the other hand, since the gateway address will usually be
7047 * different for each ipif on the system, the matching logic
7048 * uses MATCH_IRE_IPIF in the case of a traditional interface
7049 * route. This means that interface routes for the same prefix can be
7050 * uniquely identified if they belong to distinct ipif's and if a
7051 * RTA_IFP sockaddr is not present.
7052 *
7053 * For more detail on specifying routes by gateway address and by
7054 * interface index, see the comments in ip_rt_add().
7055 * gw_addr could be zero in some cases when both RTA_SRCIFP and
7056 * RTA_IFP are specified. If RTA_SRCIFP is specified and both
7057 * RTA_IFP and gateway_addr are NULL/zero, then delete will not
7058 * succeed.
7059 */
7060 if (src_ipif != NULL) {
7061 if (ipif_arg == NULL && gw_addr != 0) {
7062 ipif_arg = ipif_lookup_interface(gw_addr, dst_addr,
7063 q, mp, func, &err);
7064 if (ipif_arg != NULL)
7065 ipif_refheld = B_TRUE;
7066 }
7067 if (ipif_arg == NULL) {
7068 err = (err == EINPROGRESS) ?
err : ESRCH; 7069 return (err); 7070 } 7071 ipif = ipif_arg; 7072 } else { 7073 ipif = ipif_lookup_interface(gw_addr, dst_addr, 7074 q, mp, func, &err); 7075 if (ipif != NULL) 7076 ipif_refheld = B_TRUE; 7077 else if (err == EINPROGRESS) 7078 return (err); 7079 else 7080 err = 0; 7081 } 7082 if (ipif != NULL) { 7083 if (ipif_arg != NULL) { 7084 if (ipif_refheld) { 7085 ipif_refrele(ipif); 7086 ipif_refheld = B_FALSE; 7087 } 7088 ipif = ipif_arg; 7089 match_flags |= MATCH_IRE_ILL; 7090 } else { 7091 match_flags |= MATCH_IRE_IPIF; 7092 } 7093 if (src_ipif != NULL) { 7094 ire = ire_srcif_table_lookup(dst_addr, IRE_INTERFACE, 7095 ipif, src_ipif->ipif_ill, match_flags); 7096 } else { 7097 if (ipif->ipif_ire_type == IRE_LOOPBACK) { 7098 ire = ire_ctable_lookup(dst_addr, 0, 7099 IRE_LOOPBACK, ipif, ALL_ZONES, NULL, 7100 match_flags); 7101 } 7102 if (ire == NULL) { 7103 ire = ire_ftable_lookup(dst_addr, mask, 0, 7104 IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, 7105 NULL, match_flags); 7106 } 7107 } 7108 } 7109 7110 if (ire == NULL) { 7111 /* 7112 * At this point, the gateway address is not one of our own 7113 * addresses or a matching interface route was not found. We 7114 * set the IRE type to lookup based on whether 7115 * this is a host route, a default route or just a prefix. 7116 * 7117 * If an ipif_arg was passed in, then the lookup is based on an 7118 * interface index so MATCH_IRE_ILL is added to match_flags. 7119 * In any case, MATCH_IRE_IPIF is cleared and MATCH_IRE_GW is 7120 * set as the route being looked up is not a traditional 7121 * interface route. 7122 * Since we do not add gateway route with srcipif, we don't 7123 * expect to find it either. 7124 */ 7125 if (src_ipif != NULL) { 7126 if (ipif_refheld) 7127 ipif_refrele(ipif); 7128 return (ESRCH); 7129 } else { 7130 match_flags &= ~MATCH_IRE_IPIF; 7131 match_flags |= MATCH_IRE_GW; 7132 if (ipif_arg != NULL) 7133 match_flags |= MATCH_IRE_ILL; 7134 if (mask == IP_HOST_MASK) 7135 type = IRE_HOST; 7136 else if (mask == 0) 7137 type = IRE_DEFAULT; 7138 else 7139 type = IRE_PREFIX; 7140 ire = ire_ftable_lookup(dst_addr, mask, gw_addr, type, 7141 ipif_arg, NULL, ALL_ZONES, 0, NULL, match_flags); 7142 } 7143 } 7144 7145 if (ipif_refheld) 7146 ipif_refrele(ipif); 7147 7148 /* ipif is not refheld anymore */ 7149 if (ire == NULL) 7150 return (ESRCH); 7151 7152 if (ire->ire_flags & RTF_MULTIRT) { 7153 /* 7154 * Invoke the CGTP (multirouting) filtering module 7155 * to remove the dst address from the filtering database. 7156 * Packets coming from that address will no longer be 7157 * filtered to remove duplicates. 7158 */ 7159 if (ip_cgtp_filter_ops != NULL) { 7160 err = ip_cgtp_filter_ops->cfo_del_dest_v4(ire->ire_addr, 7161 ire->ire_gateway_addr); 7162 } 7163 ip_cgtp_bcast_delete(ire); 7164 } 7165 7166 ipif = ire->ire_ipif; 7167 /* 7168 * Removing from ipif_saved_ire_mp is not necessary 7169 * when src_ipif being non-NULL. ip_rt_add does not 7170 * save the ires which src_ipif being non-NULL. 7171 */ 7172 if (ipif != NULL && src_ipif == NULL) { 7173 ipif_remove_ire(ipif, ire); 7174 } 7175 if (ioctl_msg) 7176 ip_rts_rtmsg(RTM_OLDDEL, ire, 0); 7177 ire_delete(ire); 7178 ire_refrele(ire); 7179 return (err); 7180 } 7181 7182 /* 7183 * ip_siocaddrt is called to complete processing of an SIOCADDRT IOCTL. 
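 *
 * A hedged sketch of the userland view this handler serves (error
 * handling omitted; socket and addresses are hypothetical, not taken
 * from this file):
 *
 *	struct rtentry rt;
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	bzero(&rt, sizeof (rt));
 *	((struct sockaddr_in *)&rt.rt_dst)->sin_family = AF_INET;
 *	((struct sockaddr_in *)&rt.rt_dst)->sin_addr.s_addr =
 *	    inet_addr("192.0.2.0");
 *	((struct sockaddr_in *)&rt.rt_gateway)->sin_family = AF_INET;
 *	((struct sockaddr_in *)&rt.rt_gateway)->sin_addr.s_addr =
 *	    inet_addr("10.0.0.1");
 *	rt.rt_flags = RTF_UP | RTF_GATEWAY;
 *	(void) ioctl(s, SIOCADDRT, &rt);
 *
 * Since RTF_HOST is not set in this sketch, the netmask is guessed
 * from dst_addr and the interfaces in use (see ip_subnet_mask below).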
7184 */ 7185 /* ARGSUSED */ 7186 int 7187 ip_siocaddrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 7188 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 7189 { 7190 ipaddr_t dst_addr; 7191 ipaddr_t gw_addr; 7192 ipaddr_t mask; 7193 int error = 0; 7194 mblk_t *mp1; 7195 struct rtentry *rt; 7196 ipif_t *ipif = NULL; 7197 7198 ip1dbg(("ip_siocaddrt:")); 7199 /* Existence of mp1 verified in ip_wput_nondata */ 7200 mp1 = mp->b_cont->b_cont; 7201 rt = (struct rtentry *)mp1->b_rptr; 7202 7203 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 7204 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 7205 7206 /* 7207 * If the RTF_HOST flag is on, this is a request to assign a gateway 7208 * to a particular host address. In this case, we set the netmask to 7209 * all ones for the particular destination address. Otherwise, 7210 * determine the netmask to be used based on dst_addr and the interfaces 7211 * in use. 7212 */ 7213 if (rt->rt_flags & RTF_HOST) { 7214 mask = IP_HOST_MASK; 7215 } else { 7216 /* 7217 * Note that ip_subnet_mask returns a zero mask in the case of 7218 * default (an all-zeroes address). 7219 */ 7220 mask = ip_subnet_mask(dst_addr, &ipif); 7221 } 7222 7223 error = ip_rt_add(dst_addr, mask, gw_addr, 0, rt->rt_flags, NULL, NULL, 7224 NULL, B_TRUE, q, mp, ip_process_ioctl, NULL); 7225 if (ipif != NULL) 7226 ipif_refrele(ipif); 7227 return (error); 7228 } 7229 7230 /* 7231 * ip_siocdelrt is called to complete processing of an SIOCDELRT IOCTL. 7232 */ 7233 /* ARGSUSED */ 7234 int 7235 ip_siocdelrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 7236 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 7237 { 7238 ipaddr_t dst_addr; 7239 ipaddr_t gw_addr; 7240 ipaddr_t mask; 7241 int error; 7242 mblk_t *mp1; 7243 struct rtentry *rt; 7244 ipif_t *ipif = NULL; 7245 7246 ip1dbg(("ip_siocdelrt:")); 7247 /* Existence of mp1 verified in ip_wput_nondata */ 7248 mp1 = mp->b_cont->b_cont; 7249 rt = (struct rtentry *)mp1->b_rptr; 7250 7251 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 7252 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 7253 7254 /* 7255 * If the RTF_HOST flag is on, this is a request to delete a gateway 7256 * to a particular host address. In this case, we set the netmask to 7257 * all ones for the particular destination address. Otherwise, 7258 * determine the netmask to be used based on dst_addr and the interfaces 7259 * in use. 7260 */ 7261 if (rt->rt_flags & RTF_HOST) { 7262 mask = IP_HOST_MASK; 7263 } else { 7264 /* 7265 * Note that ip_subnet_mask returns a zero mask in the case of 7266 * default (an all-zeroes address). 7267 */ 7268 mask = ip_subnet_mask(dst_addr, &ipif); 7269 } 7270 7271 error = ip_rt_delete(dst_addr, mask, gw_addr, 7272 RTA_DST | RTA_GATEWAY | RTA_NETMASK, rt->rt_flags, NULL, NULL, 7273 B_TRUE, q, mp, ip_process_ioctl); 7274 if (ipif != NULL) 7275 ipif_refrele(ipif); 7276 return (error); 7277 } 7278 7279 /* 7280 * Enqueue the mp onto the ipsq, chained by b_next. 7281 * b_prev stores the function to be executed later, and b_queue the queue 7282 * where this mp originated. 
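 *
 * The fields are recovered on the dequeue side (see ipsq_exit below),
 * roughly:
 *
 *	func = (ipsq_func_t)mp->b_prev;
 *	q = (queue_t *)mp->b_queue;
 *	...
 *	(*func)(ipsq, q, mp, NULL);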
7283 */
7284 void
7285 ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type,
7286 ill_t *pending_ill)
7287 {
7288 conn_t *connp = NULL;
7289
7290 ASSERT(MUTEX_HELD(&ipsq->ipsq_lock));
7291 ASSERT(func != NULL);
7292
7293 mp->b_queue = q;
7294 mp->b_prev = (void *)func;
7295 mp->b_next = NULL;
7296
7297 switch (type) {
7298 case CUR_OP:
7299 if (ipsq->ipsq_mptail != NULL) {
7300 ASSERT(ipsq->ipsq_mphead != NULL);
7301 ipsq->ipsq_mptail->b_next = mp;
7302 } else {
7303 ASSERT(ipsq->ipsq_mphead == NULL);
7304 ipsq->ipsq_mphead = mp;
7305 }
7306 ipsq->ipsq_mptail = mp;
7307 break;
7308
7309 case NEW_OP:
7310 if (ipsq->ipsq_xopq_mptail != NULL) {
7311 ASSERT(ipsq->ipsq_xopq_mphead != NULL);
7312 ipsq->ipsq_xopq_mptail->b_next = mp;
7313 } else {
7314 ASSERT(ipsq->ipsq_xopq_mphead == NULL);
7315 ipsq->ipsq_xopq_mphead = mp;
7316 }
7317 ipsq->ipsq_xopq_mptail = mp;
7318 break;
7319 default:
7320 cmn_err(CE_PANIC, "ipsq_enq %d type \n", type);
7321 }
7322
7323 if (CONN_Q(q) && pending_ill != NULL) {
7324 connp = Q_TO_CONN(q);
7325
7326 ASSERT(MUTEX_HELD(&connp->conn_lock));
7327 connp->conn_oper_pending_ill = pending_ill;
7328 }
7329 }
7330
7331 /*
7332 * Return the mp at the head of the ipsq. After emptying the ipsq
7333 * look at the next ioctl, if this ioctl is complete. Otherwise
7334 * return, we will resume when we complete the current ioctl.
7335 * The current ioctl will wait till it gets a response from the
7336 * driver below.
7337 */
7338 static mblk_t *
7339 ipsq_dq(ipsq_t *ipsq)
7340 {
7341 mblk_t *mp;
7342
7343 ASSERT(MUTEX_HELD(&ipsq->ipsq_lock));
7344
7345 mp = ipsq->ipsq_mphead;
7346 if (mp != NULL) {
7347 ipsq->ipsq_mphead = mp->b_next;
7348 if (ipsq->ipsq_mphead == NULL)
7349 ipsq->ipsq_mptail = NULL;
7350 mp->b_next = NULL;
7351 return (mp);
7352 }
7353 if (ipsq->ipsq_current_ipif != NULL)
7354 return (NULL);
7355 mp = ipsq->ipsq_xopq_mphead;
7356 if (mp != NULL) {
7357 ipsq->ipsq_xopq_mphead = mp->b_next;
7358 if (ipsq->ipsq_xopq_mphead == NULL)
7359 ipsq->ipsq_xopq_mptail = NULL;
7360 mp->b_next = NULL;
7361 return (mp);
7362 }
7363 return (NULL);
7364 }
7365
7366 /*
7367 * Enter the ipsq corresponding to ill, by waiting synchronously till
7368 * we can enter the ipsq exclusively. Unless 'force' is used, the ipsq
7369 * will have to drain completely before ipsq_enter returns success.
7370 * ipsq_current_ipif will be set if some exclusive ioctl is in progress,
7371 * and the ipsq_exit logic will start the next enqueued ioctl after
7372 * completion of the current ioctl. If 'force' is used, we don't wait
7373 * for the enqueued ioctls. This is needed when a conn_close wants to
7374 * enter the ipsq and abort an ioctl that is somehow stuck. Unplumb
7375 * of an ill can also use this option. But we don't use it currently.
7376 */
7377 #define ENTER_SQ_WAIT_TICKS 100
7378 boolean_t
7379 ipsq_enter(ill_t *ill, boolean_t force)
7380 {
7381 ipsq_t *ipsq;
7382 boolean_t waited_enough = B_FALSE;
7383
7384 /*
7385 * Holding the ill_lock prevents <ill-ipsq> assocs from changing.
7386 * Since the <ill-ipsq> assocs could change while we wait for the
7387 * writer, it is easier to wait on a fixed global rather than try to
7388 * cv_wait on a changing ipsq.
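 *
 * A minimal caller sketch (assuming the standard pairing with
 * ipsq_exit; the ill must be refheld by the caller):
 *
 *	if (ipsq_enter(ill, B_FALSE)) {
 *		... perform the exclusive operation ...
 *		ipsq_exit(ill->ill_phyint->phyint_ipsq, B_TRUE, B_TRUE);
 *	}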
7389 */
7390 mutex_enter(&ill->ill_lock);
7391 for (;;) {
7392 if (ill->ill_state_flags & ILL_CONDEMNED) {
7393 mutex_exit(&ill->ill_lock);
7394 return (B_FALSE);
7395 }
7396
7397 ipsq = ill->ill_phyint->phyint_ipsq;
7398 mutex_enter(&ipsq->ipsq_lock);
7399 if (ipsq->ipsq_writer == NULL &&
7400 (ipsq->ipsq_current_ipif == NULL || waited_enough)) {
7401 break;
7402 } else if (ipsq->ipsq_writer != NULL) {
7403 mutex_exit(&ipsq->ipsq_lock);
7404 cv_wait(&ill->ill_cv, &ill->ill_lock);
7405 } else {
7406 mutex_exit(&ipsq->ipsq_lock);
7407 if (force) {
7408 (void) cv_timedwait(&ill->ill_cv,
7409 &ill->ill_lock,
7410 lbolt + ENTER_SQ_WAIT_TICKS);
7411 waited_enough = B_TRUE;
7412 continue;
7413 } else {
7414 cv_wait(&ill->ill_cv, &ill->ill_lock);
7415 }
7416 }
7417 }
7418
7419 ASSERT(ipsq->ipsq_mphead == NULL && ipsq->ipsq_mptail == NULL);
7420 ASSERT(ipsq->ipsq_reentry_cnt == 0);
7421 ipsq->ipsq_writer = curthread;
7422 ipsq->ipsq_reentry_cnt++;
7423 #ifdef ILL_DEBUG
7424 ipsq->ipsq_depth = getpcstack(ipsq->ipsq_stack, IP_STACK_DEPTH);
7425 #endif
7426 mutex_exit(&ipsq->ipsq_lock);
7427 mutex_exit(&ill->ill_lock);
7428 return (B_TRUE);
7429 }
7430
7431 /*
7432 * The ipsq_t (ipsq) is the synchronization data structure used to serialize
7433 * certain critical operations like plumbing (i.e. most set ioctls),
7434 * multicast joins, igmp/mld timers, IPMP operations etc. On a non-IPMP
7435 * system there is 1 ipsq per phyint. On an IPMP system there is 1 ipsq per
7436 * IPMP group. The ipsq serializes exclusive ioctls issued by applications
7437 * on a per ipsq basis in ipsq_xopq_mphead. It also protects against multiple
7438 * threads executing in the ipsq. Responses from the driver pertain to the
7439 * current ioctl (say a DL_BIND_ACK in response to a DL_BIND_REQUEST initiated
7440 * as part of bringing up the interface) and are enqueued in ipsq_mphead.
7441 *
7442 * If a thread does not want to reenter the ipsq when it is already writer,
7443 * it must make sure that neither the specified reentry point, to be called
7444 * later when the ipsq is empty, nor any code path starting from that reentry
7445 * point ever tries to enter the ipsq again. Otherwise it can lead
7446 * to an infinite loop. The reentry point ip_rput_dlpi_writer is an example.
7447 * When the thread that is currently exclusive finishes, it (ipsq_exit)
7448 * dequeues the requests waiting to become exclusive in ipsq_mphead and calls
7449 * the reentry point. When the list at ipsq_mphead becomes empty ipsq_exit
7450 * proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next
7451 * ioctl if the current ioctl has completed. If the current ioctl is still
7452 * in progress it simply returns. The current ioctl could be waiting for
7453 * a response from another module (arp or the driver) or could be waiting for
7454 * the ipif/ill/ire refcnts to drop to zero. In such a case the ipsq_pending_mp
7455 * and ipsq_pending_ipif are set. ipsq_current_ipif is set throughout the
7456 * execution of the ioctl and ipsq_exit does not start the next ioctl unless
7457 * ipsq_current_ipif is clear which happens only on ioctl completion.
7458 */
7459
7460 /*
7461 * Try to enter the ipsq exclusively, corresponding to ipif or ill. (only 1 of
7462 * ipif or ill can be specified). The caller ensures ipif or ill is valid by
7463 * ref-holding it if necessary. If the ipsq cannot be entered, the mp is queued
7464 * for completion.
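 *
 * A hedged sketch of the usual ioctl-path caller (the EINPROGRESS
 * convention is how enqueued operations report a pending restart):
 *
 *	ipsq = ipsq_try_enter(ipif, NULL, q, mp, ip_process_ioctl,
 *	    NEW_OP, B_TRUE);
 *	if (ipsq == NULL)
 *		return (EINPROGRESS);	/- mp queued; func reruns it -/
 *	... exclusive work ...
 *	ipsq_exit(ipsq, B_TRUE, B_TRUE);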
7465 */ 7466 ipsq_t * 7467 ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp, 7468 ipsq_func_t func, int type, boolean_t reentry_ok) 7469 { 7470 ipsq_t *ipsq; 7471 7472 /* Only 1 of ipif or ill can be specified */ 7473 ASSERT((ipif != NULL) ^ (ill != NULL)); 7474 if (ipif != NULL) 7475 ill = ipif->ipif_ill; 7476 7477 /* 7478 * lock ordering ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock. 7479 * The ipsq of an ill can't change when ill_lock is held. 7480 */ 7481 GRAB_CONN_LOCK(q); 7482 mutex_enter(&ill->ill_lock); 7483 ipsq = ill->ill_phyint->phyint_ipsq; 7484 mutex_enter(&ipsq->ipsq_lock); 7485 7486 /* 7487 * 1. Enter the ipsq if we are already writer and reentry is ok. 7488 * (Note: If the caller does not specify reentry_ok then neither 7489 * 'func' nor any of its callees must ever attempt to enter the ipsq 7490 * again. Otherwise it can lead to an infinite loop.) 7491 * 2. Enter the ipsq if there is no current writer and this attempted 7492 * entry is part of the current ioctl or operation. 7493 * 3. Enter the ipsq if there is no current writer and this is a new 7494 * ioctl (or operation) and the ioctl (or operation) queue is 7495 * empty and there is no ioctl (or operation) currently in progress. 7496 */ 7497 if ((ipsq->ipsq_writer == NULL && ((type == CUR_OP) || 7498 (type == NEW_OP && ipsq->ipsq_xopq_mphead == NULL && 7499 ipsq->ipsq_current_ipif == NULL))) || 7500 (ipsq->ipsq_writer == curthread && reentry_ok)) { 7501 /* Success. */ 7502 ipsq->ipsq_reentry_cnt++; 7503 ipsq->ipsq_writer = curthread; 7504 mutex_exit(&ipsq->ipsq_lock); 7505 mutex_exit(&ill->ill_lock); 7506 RELEASE_CONN_LOCK(q); 7507 #ifdef ILL_DEBUG 7508 ipsq->ipsq_depth = getpcstack(ipsq->ipsq_stack, IP_STACK_DEPTH); 7509 #endif 7510 return (ipsq); 7511 } 7512 7513 ipsq_enq(ipsq, q, mp, func, type, ill); 7514 7515 mutex_exit(&ipsq->ipsq_lock); 7516 mutex_exit(&ill->ill_lock); 7517 RELEASE_CONN_LOCK(q); 7518 return (NULL); 7519 } 7520 7521 /* 7522 * Try to enter the ipsq exclusively, corresponding to ipif or ill. (Only 1 of 7523 * ipif or ill can be specified.) The caller ensures ipif or ill is valid by 7524 * ref-holding it if necessary. If the ipsq cannot be entered, the mp is queued 7525 * for later completion. 7526 * 7527 * This function does a refrele on the ipif/ill. 7528 */ 7529 void 7530 qwriter_ip(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp, 7531 ipsq_func_t func, int type, boolean_t reentry_ok) 7532 { 7533 ipsq_t *ipsq; 7534 7535 ipsq = ipsq_try_enter(ipif, ill, q, mp, func, type, reentry_ok); 7536 /* 7537 * Caller must have done a refhold on the ipif. ipif_refrele 7538 * happens on the passed ipif. We can do this since either we are 7539 * already exclusive, or we won't access ipif henceforth; both 7540 * this function and the caller just return if ipsq_try_enter 7541 * fails above. The refrele is needed because func needs to 7542 * see the correct refcount. E.g. removeif can work only then. 7543 */ 7544 if (ipif != NULL) 7545 ipif_refrele(ipif); 7546 else 7547 ill_refrele(ill); 7548 if (ipsq != NULL) { 7549 (*func)(ipsq, q, mp, NULL); 7550 ipsq_exit(ipsq, B_TRUE, B_TRUE); 7551 } 7552 } 7553 7554 /* 7555 * If there are more than ILL_GRP_CNT ills in a group, 7556 * we use kmem alloc'd buffers, else use the stack. 7557 */ 7558 #define ILL_GRP_CNT 14 7559 /* 7560 * Drain the ipsq, if there are messages on it, and then leave the ipsq. 7561 * Called by a thread that is currently exclusive on this ipsq.
7562 */ 7563 void 7564 ipsq_exit(ipsq_t *ipsq, boolean_t start_igmp_timer, boolean_t start_mld_timer) 7565 { 7566 queue_t *q; 7567 mblk_t *mp; 7568 ipsq_func_t func; 7569 int next; 7570 ill_t **ill_list = NULL; 7571 size_t ill_list_size = 0; 7572 int cnt = 0; 7573 boolean_t need_ipsq_free = B_FALSE; 7574 7575 ASSERT(IAM_WRITER_IPSQ(ipsq)); 7576 mutex_enter(&ipsq->ipsq_lock); 7577 ASSERT(ipsq->ipsq_reentry_cnt >= 1); 7578 if (ipsq->ipsq_reentry_cnt != 1) { 7579 ipsq->ipsq_reentry_cnt--; 7580 mutex_exit(&ipsq->ipsq_lock); 7581 return; 7582 } 7583 7584 mp = ipsq_dq(ipsq); 7585 while (mp != NULL) { 7586 again: 7587 mutex_exit(&ipsq->ipsq_lock); 7588 func = (ipsq_func_t)mp->b_prev; 7589 q = (queue_t *)mp->b_queue; 7590 mp->b_prev = NULL; 7591 mp->b_queue = NULL; 7592 7593 /* 7594 * If 'q' is a conn queue, it is valid, since we did a 7595 * refhold on the connp at the start of the ioctl. 7596 * If 'q' is an ill queue, it is valid, since close of an 7597 * ill will clean up the 'ipsq'. 7598 */ 7599 (*func)(ipsq, q, mp, NULL); 7600 7601 mutex_enter(&ipsq->ipsq_lock); 7602 mp = ipsq_dq(ipsq); 7603 } 7604 7605 mutex_exit(&ipsq->ipsq_lock); 7606 7607 /* 7608 * Need to grab the locks in the right order. Need to 7609 * atomically check (under ipsq_lock) that there are no 7610 * messages before relinquishing the ipsq. Also need to 7611 * atomically wakeup waiters on ill_cv while holding ill_lock. 7612 * Holding ill_g_lock ensures that the ipsq list of ills is stable. 7613 * If we need to call ill_split_ipsq and change <ill-ipsq> we need 7614 * to grab ill_g_lock as writer. 7615 */ 7616 rw_enter(&ill_g_lock, ipsq->ipsq_split ? RW_WRITER : RW_READER); 7617 7618 /* ipsq_refs can't change while ill_g_lock is held as reader */ 7619 if (ipsq->ipsq_refs != 0) { 7620 /* At most 2 ills v4/v6 per phyint */ 7621 cnt = ipsq->ipsq_refs << 1; 7622 ill_list_size = cnt * sizeof (ill_t *); 7623 /* 7624 * If memory allocation fails, we will do the split 7625 * the next time ipsq_exit is called for whatever reason. 7626 * As long as the ipsq_split flag is set the need to 7627 * split is remembered. 7628 */ 7629 ill_list = kmem_zalloc(ill_list_size, KM_NOSLEEP); 7630 if (ill_list != NULL) 7631 cnt = ill_lock_ipsq_ills(ipsq, ill_list, cnt); 7632 } 7633 mutex_enter(&ipsq->ipsq_lock); 7634 mp = ipsq_dq(ipsq); 7635 if (mp != NULL) { 7636 /* oops, some message has landed, we can't get out */ 7637 if (ill_list != NULL) 7638 ill_unlock_ills(ill_list, cnt); 7639 rw_exit(&ill_g_lock); 7640 if (ill_list != NULL) 7641 kmem_free(ill_list, ill_list_size); 7642 ill_list = NULL; 7643 ill_list_size = 0; 7644 cnt = 0; 7645 goto again; 7646 } 7647 7648 /* 7649 * Split only if no ioctl is pending and if memory alloc succeeded 7650 * above. 7651 */ 7652 if (ipsq->ipsq_split && ipsq->ipsq_current_ipif == NULL && 7653 ill_list != NULL) { 7654 /* 7655 * No new ill can join this ipsq since we are holding the 7656 * ill_g_lock. Hence ill_split_ipsq can safely traverse the 7657 * ipsq. ill_split_ipsq may fail due to memory shortage. 7658 * If so we will retry on the next ipsq_exit. 7659 */ 7660 ipsq->ipsq_split = ill_split_ipsq(ipsq); 7661 } 7662 7663 /* 7664 * We are holding the ipsq lock, hence no new messages can 7665 * land up on the ipsq, and there are no messages currently. 7666 * Now safe to get out. Wake up waiters and relinquish ipsq 7667 * atomically while holding ill locks.
7668 */ 7669 ipsq->ipsq_writer = NULL; 7670 ipsq->ipsq_reentry_cnt--; 7671 ASSERT(ipsq->ipsq_reentry_cnt == 0); 7672 #ifdef ILL_DEBUG 7673 ipsq->ipsq_depth = 0; 7674 #endif 7675 mutex_exit(&ipsq->ipsq_lock); 7676 /* 7677 * For IPMP this should wake up all ills in this ipsq. 7678 * We need to hold the ill_lock while waking up waiters to 7679 * avoid missed wakeups. But there is no need to acquire all 7680 * the ill locks and then wakeup. If we have not acquired all 7681 * the locks (due to memory failure above) ill_signal_ipsq_ills 7682 * wakes up ills one at a time after getting the right ill_lock. 7683 */ 7684 ill_signal_ipsq_ills(ipsq, ill_list != NULL); 7685 if (ill_list != NULL) 7686 ill_unlock_ills(ill_list, cnt); 7687 if (ipsq->ipsq_refs == 0) 7688 need_ipsq_free = B_TRUE; 7689 rw_exit(&ill_g_lock); 7690 if (ill_list != NULL) 7691 kmem_free(ill_list, ill_list_size); 7692 7693 if (need_ipsq_free) { 7694 /* 7695 * Free the ipsq. ipsq_refs can't increase because the ipsq can't 7696 * be looked up. The ipsq can be looked up only thru ill or phyint 7697 * and there are no ills/phyint on this ipsq. 7698 */ 7699 ipsq_delete(ipsq); 7700 } 7701 /* 7702 * Now start any igmp or mld timers that could not be started 7703 * while inside the ipsq. The timers can't be started while inside 7704 * the ipsq, since igmp_start_timers may need to call untimeout() 7705 * which can't be done while holding a lock, i.e. the ipsq. Otherwise 7706 * there could be a deadlock since the timeout handlers 7707 * mld_timeout_handler / igmp_timeout_handler also synchronously 7708 * wait in ipsq_enter() trying to get the ipsq. 7709 * 7710 * However there is one exception to the above. If this thread is 7711 * itself the igmp/mld timeout handler thread, then we don't want 7712 * to start any new timer until the current handler is done. The 7713 * handler thread passes in B_FALSE for start_igmp/mld_timers, while 7714 * all others pass B_TRUE. 7715 */ 7716 if (start_igmp_timer) { 7717 mutex_enter(&igmp_timer_lock); 7718 next = igmp_deferred_next; 7719 igmp_deferred_next = INFINITY; 7720 mutex_exit(&igmp_timer_lock); 7721 7722 if (next != INFINITY) 7723 igmp_start_timers(next); 7724 } 7725 7726 if (start_mld_timer) { 7727 mutex_enter(&mld_timer_lock); 7728 next = mld_deferred_next; 7729 mld_deferred_next = INFINITY; 7730 mutex_exit(&mld_timer_lock); 7731 7732 if (next != INFINITY) 7733 mld_start_timers(next); 7734 } 7735 } 7736 7737 /* 7738 * The ill is closing. Flush all messages on the ipsq that originated 7739 * from this ill. Usually there won't be any messages on the ipsq_xopq_mphead 7740 * for this ill since ipsq_enter could not have entered until then. 7741 * New messages can't be queued since the CONDEMNED flag is set. 7742 */ 7743 static void 7744 ipsq_flush(ill_t *ill) 7745 { 7746 queue_t *q; 7747 mblk_t *prev; 7748 mblk_t *mp; 7749 mblk_t *mp_next; 7750 ipsq_t *ipsq; 7751 7752 ASSERT(IAM_WRITER_ILL(ill)); 7753 ipsq = ill->ill_phyint->phyint_ipsq; 7754 /* 7755 * Flush any messages sent up by the driver.
7756 */ 7757 mutex_enter(&ipsq->ipsq_lock); 7758 for (prev = NULL, mp = ipsq->ipsq_mphead; mp != NULL; mp = mp_next) { 7759 mp_next = mp->b_next; 7760 q = mp->b_queue; 7761 if (q == ill->ill_rq || q == ill->ill_wq) { 7762 /* Remove the mp from the ipsq */ 7763 if (prev == NULL) 7764 ipsq->ipsq_mphead = mp->b_next; 7765 else 7766 prev->b_next = mp->b_next; 7767 if (ipsq->ipsq_mptail == mp) { 7768 ASSERT(mp_next == NULL); 7769 ipsq->ipsq_mptail = prev; 7770 } 7771 inet_freemsg(mp); 7772 } else { 7773 prev = mp; 7774 } 7775 } 7776 mutex_exit(&ipsq->ipsq_lock); 7777 (void) ipsq_pending_mp_cleanup(ill, NULL); 7778 ipsq_xopq_mp_cleanup(ill, NULL); 7779 ill_pending_mp_cleanup(ill); 7780 } 7781 7782 /* 7783 * Clean up one squeue element. ill_inuse_ref is protected by ill_lock. 7784 * The real cleanup happens behind the squeue via the ip_squeue_clean function, 7785 * but we need to protect ourselves from 2 threads trying to clean up at the 7786 * same time (possible with one port going down for aggr and someone tearing 7787 * down the entire aggr simultaneously). So we use ill_inuse_ref protected by 7788 * ill_lock to indicate when the cleanup has started (1 ref) and when the 7789 * cleanup is done (0 ref). When a new ring gets assigned to a squeue, we 7790 * start by putting 2 refs on ill_inuse_ref. 7791 */ 7792 static void 7793 ipsq_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring) 7794 { 7795 conn_t *connp; 7796 squeue_t *sqp; 7797 mblk_t *mp; 7798 7799 ASSERT(rx_ring != NULL); 7800 7801 /* Just clean one squeue */ 7802 mutex_enter(&ill->ill_lock); 7803 /* 7804 * Reset the ILL_SOFT_RING_ASSIGN bit so that 7805 * ip_squeue_soft_ring_affinity() will not go 7806 * ahead with assigning rings. 7807 */ 7808 ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN; 7809 while (rx_ring->rr_ring_state == ILL_RING_INPROC) 7810 /* Some operations pending on the ring. Wait */ 7811 cv_wait(&ill->ill_cv, &ill->ill_lock); 7812 7813 if (rx_ring->rr_ring_state != ILL_RING_INUSE) { 7814 /* 7815 * Someone is already trying to clean 7816 * this squeue or it has already been cleaned. 7817 */ 7818 mutex_exit(&ill->ill_lock); 7819 return; 7820 } 7821 sqp = rx_ring->rr_sqp; 7822 7823 if (sqp == NULL) { 7824 /* 7825 * The rx_ring never had a squeue assigned to it. 7826 * We are under ill_lock so we can clean it up 7827 * here itself since no one can get to it.
7828 */ 7829 rx_ring->rr_blank = NULL; 7830 rx_ring->rr_handle = NULL; 7831 rx_ring->rr_sqp = NULL; 7832 rx_ring->rr_ring_state = ILL_RING_FREE; 7833 mutex_exit(&ill->ill_lock); 7834 return; 7835 } 7836 7837 /* Set the state to indicate that it is being cleaned */ 7838 rx_ring->rr_ring_state = ILL_RING_BEING_FREED; 7839 ASSERT(sqp != NULL); 7840 mutex_exit(&ill->ill_lock); 7841 7842 /* 7843 * Use the preallocated ill_unbind_conn for this purpose 7844 */ 7845 connp = ill->ill_dls_capab->ill_unbind_conn; 7846 mp = &connp->conn_tcp->tcp_closemp; 7847 CONN_INC_REF(connp); 7848 squeue_enter(sqp, mp, ip_squeue_clean, connp, NULL); 7849 7850 mutex_enter(&ill->ill_lock); 7851 while (rx_ring->rr_ring_state != ILL_RING_FREE) 7852 cv_wait(&ill->ill_cv, &ill->ill_lock); 7853 7854 mutex_exit(&ill->ill_lock); 7855 } 7856 7857 static void 7858 ipsq_clean_all(ill_t *ill) 7859 { 7860 int idx; 7861 7862 /* 7863 * No need to clean if poll_capab isn't set for this ill 7864 */ 7865 if (!(ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING))) 7866 return; 7867 7868 for (idx = 0; idx < ILL_MAX_RINGS; idx++) { 7869 ill_rx_ring_t *ipr = &ill->ill_dls_capab->ill_ring_tbl[idx]; 7870 ipsq_clean_ring(ill, ipr); 7871 } 7872 7873 ill->ill_capabilities &= ~(ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING); 7874 } 7875 7876 /* ARGSUSED */ 7877 int 7878 ip_sioctl_slifoindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 7879 ip_ioctl_cmd_t *ipip, void *ifreq) 7880 { 7881 ill_t *ill; 7882 struct lifreq *lifr = (struct lifreq *)ifreq; 7883 boolean_t isv6; 7884 conn_t *connp; 7885 7886 connp = Q_TO_CONN(q); 7887 isv6 = connp->conn_af_isv6; 7888 /* 7889 * Set original index. 7890 * Failover and failback move logical interfaces 7891 * from one physical interface to another. The 7892 * original index indicates the parent of a logical 7893 * interface, in other words, the physical interface 7894 * the logical interface will be moved back to on 7895 * failback. 7896 */ 7897 7898 /* 7899 * Don't allow the original index to be changed 7900 * for non-failover addresses, autoconfigured 7901 * addresses, or IPv6 link local addresses. 7902 */ 7903 if (((ipif->ipif_flags & (IPIF_NOFAILOVER | IPIF_ADDRCONF)) != 0) || 7904 (isv6 && IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr))) { 7905 return (EINVAL); 7906 } 7907 /* 7908 * The new original index must be in use by some 7909 * physical interface. 7910 */ 7911 ill = ill_lookup_on_ifindex(lifr->lifr_index, isv6, NULL, NULL, 7912 NULL, NULL); 7913 if (ill == NULL) 7914 return (ENXIO); 7915 ill_refrele(ill); 7916 7917 ipif->ipif_orig_ifindex = lifr->lifr_index; 7918 /* 7919 * When this ipif gets failed back, don't 7920 * preserve the original id, as it is no 7921 * longer applicable. 7922 */ 7923 ipif->ipif_orig_ipifid = 0; 7924 /* 7925 * For IPv4, change the original index of any 7926 * multicast addresses associated with the 7927 * ipif to the new value. 7928 */ 7929 if (!isv6) { 7930 ilm_t *ilm; 7931 7932 mutex_enter(&ipif->ipif_ill->ill_lock); 7933 for (ilm = ipif->ipif_ill->ill_ilm; ilm != NULL; 7934 ilm = ilm->ilm_next) { 7935 if (ilm->ilm_ipif == ipif) { 7936 ilm->ilm_orig_ifindex = lifr->lifr_index; 7937 } 7938 } 7939 mutex_exit(&ipif->ipif_ill->ill_lock); 7940 } 7941 return (0); 7942 } 7943 7944 /* ARGSUSED */ 7945 int 7946 ip_sioctl_get_oindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 7947 ip_ioctl_cmd_t *ipip, void *ifreq) 7948 { 7949 struct lifreq *lifr = (struct lifreq *)ifreq; 7950 7951 /* 7952 * Get the original interface index, i.e. the one 7953 * before FAILOVER if it ever happened.
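 *
 * Illustrative userland sketch (assumes an open AF_INET socket 's';
 * the interface name is hypothetical):
 *
 *	struct lifreq lifr;
 *
 *	bzero(&lifr, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "hme0:1", sizeof (lifr.lifr_name));
 *	if (ioctl(s, SIOCGLIFOINDEX, &lifr) == 0)
 *		... lifr.lifr_index is the pre-failover index ...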
7954 */ 7955 lifr->lifr_index = ipif->ipif_orig_ifindex; 7956 return (0); 7957 } 7958 7959 /* 7960 * Parse an iftun_req structure coming down SIOC[GS]TUNPARAM ioctls; 7961 * refhold and return the associated ipif. 7962 */ 7963 int 7964 ip_extract_tunreq(queue_t *q, mblk_t *mp, ipif_t **ipifp, ipsq_func_t func) 7965 { 7966 boolean_t exists; 7967 struct iftun_req *ta; 7968 ipif_t *ipif; 7969 ill_t *ill; 7970 boolean_t isv6; 7971 mblk_t *mp1; 7972 int error; 7973 conn_t *connp; 7974 7975 /* Existence verified in ip_wput_nondata */ 7976 mp1 = mp->b_cont->b_cont; 7977 ta = (struct iftun_req *)mp1->b_rptr; 7978 /* 7979 * Null terminate the string to protect against buffer 7980 * overrun. String was generated by user code and may not 7981 * be trusted. 7982 */ 7983 ta->ifta_lifr_name[LIFNAMSIZ - 1] = '\0'; 7984 7985 connp = Q_TO_CONN(q); 7986 isv6 = connp->conn_af_isv6; 7987 7988 /* Disallow implicit create */ 7989 ipif = ipif_lookup_on_name(ta->ifta_lifr_name, 7990 mi_strlen(ta->ifta_lifr_name), B_FALSE, &exists, isv6, 7991 connp->conn_zoneid, CONNP_TO_WQ(connp), mp, func, &error); 7992 if (ipif == NULL) 7993 return (error); 7994 7995 if (ipif->ipif_id != 0) { 7996 /* 7997 * We really don't want to set/get tunnel parameters 7998 * on virtual tunnel interfaces. Only allow the 7999 * base tunnel to do these. 8000 */ 8001 ipif_refrele(ipif); 8002 return (EINVAL); 8003 } 8004 8005 /* 8006 * Send down to tunnel mod for ioctl processing. 8007 * Will finish ioctl in ip_rput_other(). 8008 */ 8009 ill = ipif->ipif_ill; 8010 if (ill->ill_net_type == IRE_LOOPBACK) { 8011 ipif_refrele(ipif); 8012 return (EOPNOTSUPP); 8013 } 8014 8015 if (ill->ill_wq == NULL) { 8016 ipif_refrele(ipif); 8017 return (ENXIO); 8018 } 8019 /* 8020 * Mark the ioctl as coming from an IPv6 interface for 8021 * tun's convenience. 8022 */ 8023 if (ill->ill_isv6) 8024 ta->ifta_flags |= 0x80000000; 8025 *ipifp = ipif; 8026 return (0); 8027 } 8028 8029 /* 8030 * Parse an ifreq or lifreq struct coming down ioctls and refhold 8031 * and return the associated ipif. 8032 * Return value: 8033 * Non-zero: An error has occurred. ci may not be filled out. 8034 * Zero: ci is filled out with the ioctl cmd in ci.ci_name, and 8035 * a held ipif in ci.ci_ipif. 8036 */ 8037 int 8038 ip_extract_lifreq_cmn(queue_t *q, mblk_t *mp, int cmd_type, int flags, 8039 cmd_info_t *ci, ipsq_func_t func) 8040 { 8041 sin_t *sin; 8042 sin6_t *sin6; 8043 char *name; 8044 struct ifreq *ifr; 8045 struct lifreq *lifr; 8046 ipif_t *ipif = NULL; 8047 ill_t *ill; 8048 conn_t *connp; 8049 boolean_t isv6; 8050 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8051 boolean_t exists; 8052 int err; 8053 mblk_t *mp1; 8054 zoneid_t zoneid; 8055 8056 if (q->q_next != NULL) { 8057 ill = (ill_t *)q->q_ptr; 8058 isv6 = ill->ill_isv6; 8059 connp = NULL; 8060 zoneid = ALL_ZONES; 8061 } else { 8062 ill = NULL; 8063 connp = Q_TO_CONN(q); 8064 isv6 = connp->conn_af_isv6; 8065 zoneid = connp->conn_zoneid; 8066 if (zoneid == GLOBAL_ZONEID) { 8067 /* global zone can access ipifs in all zones */ 8068 zoneid = ALL_ZONES; 8069 } 8070 } 8071 8072 /* Has been checked in ip_wput_nondata */ 8073 mp1 = mp->b_cont->b_cont; 8074 8075 8076 if (cmd_type == IF_CMD) { 8077 /* This is an old style SIOC[GS]IF* command */ 8078 ifr = (struct ifreq *)mp1->b_rptr; 8079 /* 8080 * Null terminate the string to protect against buffer 8081 * overrun. String was generated by user code and may not 8082 * be trusted.
8083 */ 8084 ifr->ifr_name[IFNAMSIZ - 1] = '\0'; 8085 sin = (sin_t *)&ifr->ifr_addr; 8086 name = ifr->ifr_name; 8087 ci->ci_sin = sin; 8088 ci->ci_sin6 = NULL; 8089 ci->ci_lifr = (struct lifreq *)ifr; 8090 } else { 8091 /* This is a new style SIOC[GS]LIF* command */ 8092 ASSERT(cmd_type == LIF_CMD); 8093 lifr = (struct lifreq *)mp1->b_rptr; 8094 /* 8095 * Null terminate the string to protect against buffer 8096 * overrun. String was generated by user code and may not 8097 * be trusted. 8098 */ 8099 lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; 8100 name = lifr->lifr_name; 8101 sin = (sin_t *)&lifr->lifr_addr; 8102 sin6 = (sin6_t *)&lifr->lifr_addr; 8103 if (iocp->ioc_cmd == SIOCSLIFGROUPNAME) { 8104 (void) strncpy(ci->ci_groupname, lifr->lifr_groupname, 8105 LIFNAMSIZ); 8106 } 8107 ci->ci_sin = sin; 8108 ci->ci_sin6 = sin6; 8109 ci->ci_lifr = lifr; 8110 } 8111 8112 8113 if (iocp->ioc_cmd == SIOCSLIFNAME) { 8114 /* 8115 * The ioctl fails if it comes down 8116 * a conn stream. 8117 */ 8118 if (ill == NULL) { 8119 /* 8120 * Not an ill queue; return ENXIO, matching the 8121 * old error code. 8122 */ 8123 return (ENXIO); 8124 } 8125 ipif = ill->ill_ipif; 8126 ipif_refhold(ipif); 8127 } else { 8128 ipif = ipif_lookup_on_name(name, mi_strlen(name), B_FALSE, 8129 &exists, isv6, zoneid, 8130 (connp == NULL) ? q : CONNP_TO_WQ(connp), mp, func, &err); 8131 if (ipif == NULL) { 8132 if (err == EINPROGRESS) 8133 return (err); 8134 if (iocp->ioc_cmd == SIOCLIFFAILOVER || 8135 iocp->ioc_cmd == SIOCLIFFAILBACK) { 8136 /* 8137 * Need to try both v4 and v6 since this 8138 * ioctl can come down either v4 or v6 8139 * socket. The lifreq.lifr_family passed 8140 * down by this ioctl is AF_UNSPEC. 8141 */ 8142 ipif = ipif_lookup_on_name(name, 8143 mi_strlen(name), B_FALSE, &exists, !isv6, 8144 zoneid, (connp == NULL) ? q : 8145 CONNP_TO_WQ(connp), mp, func, &err); 8146 if (err == EINPROGRESS) 8147 return (err); 8148 } 8149 err = 0; /* Ensure we don't use it below */ 8150 } 8151 } 8152 8153 /* 8154 * Old style [GS]IFCMD does not admit IPv6 ipif 8155 */ 8156 if (ipif != NULL && ipif->ipif_isv6 && cmd_type == IF_CMD) { 8157 ipif_refrele(ipif); 8158 return (ENXIO); 8159 } 8160 8161 if (ipif == NULL && ill != NULL && ill->ill_ipif != NULL && 8162 name[0] == '\0') { 8163 /* 8164 * Handle a SIOC?IF* with a null name 8165 * during plumb (on the ill queue before the I_PLINK). 8166 */ 8167 ipif = ill->ill_ipif; 8168 ipif_refhold(ipif); 8169 } 8170 8171 if (ipif == NULL) 8172 return (ENXIO); 8173 8174 /* 8175 * Allow only GET operations if this ipif has been created 8176 * temporarily due to a MOVE operation. 8177 */ 8178 if (ipif->ipif_replace_zero && !(flags & IPI_REPL)) { 8179 ipif_refrele(ipif); 8180 return (EINVAL); 8181 } 8182 8183 ci->ci_ipif = ipif; 8184 return (0); 8185 } 8186 8187 /* 8188 * Return the total number of ipifs. 8189 */ 8190 static uint_t 8191 ip_get_numifs(zoneid_t zoneid) 8192 { 8193 uint_t numifs = 0; 8194 ill_t *ill; 8195 ill_walk_context_t ctx; 8196 ipif_t *ipif; 8197 8198 rw_enter(&ill_g_lock, RW_READER); 8199 ill = ILL_START_WALK_V4(&ctx); 8200 8201 while (ill != NULL) { 8202 for (ipif = ill->ill_ipif; ipif != NULL; 8203 ipif = ipif->ipif_next) { 8204 if (ipif->ipif_zoneid == zoneid || 8205 ipif->ipif_zoneid == ALL_ZONES) 8206 numifs++; 8207 } 8208 ill = ill_next(&ctx, ill); 8209 } 8210 rw_exit(&ill_g_lock); 8211 return (numifs); 8212 } 8213 8214 /* 8215 * Return the number of ipifs matching the given family, flags and zone.
8216 */ 8217 static uint_t 8218 ip_get_numlifs(int family, int lifn_flags, zoneid_t zoneid) 8219 { 8220 uint_t numifs = 0; 8221 ill_t *ill; 8222 ipif_t *ipif; 8223 ill_walk_context_t ctx; 8224 8225 ip1dbg(("ip_get_numlifs(%d %u %d)\n", family, lifn_flags, (int)zoneid)); 8226 8227 rw_enter(&ill_g_lock, RW_READER); 8228 if (family == AF_INET) 8229 ill = ILL_START_WALK_V4(&ctx); 8230 else if (family == AF_INET6) 8231 ill = ILL_START_WALK_V6(&ctx); 8232 else 8233 ill = ILL_START_WALK_ALL(&ctx); 8234 8235 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 8236 for (ipif = ill->ill_ipif; ipif != NULL; 8237 ipif = ipif->ipif_next) { 8238 if ((ipif->ipif_flags & IPIF_NOXMIT) && 8239 !(lifn_flags & LIFC_NOXMIT)) 8240 continue; 8241 if ((ipif->ipif_flags & IPIF_TEMPORARY) && 8242 !(lifn_flags & LIFC_TEMPORARY)) 8243 continue; 8244 if (((ipif->ipif_flags & 8245 (IPIF_NOXMIT|IPIF_NOLOCAL| 8246 IPIF_DEPRECATED)) || 8247 (ill->ill_phyint->phyint_flags & 8248 PHYI_LOOPBACK) || 8249 !(ipif->ipif_flags & IPIF_UP)) && 8250 (lifn_flags & LIFC_EXTERNAL_SOURCE)) 8251 continue; 8252 8253 if (zoneid != ipif->ipif_zoneid && 8254 ipif->ipif_zoneid != ALL_ZONES && 8255 (zoneid != GLOBAL_ZONEID || 8256 !(lifn_flags & LIFC_ALLZONES))) 8257 continue; 8258 8259 numifs++; 8260 } 8261 } 8262 rw_exit(&ill_g_lock); 8263 return (numifs); 8264 } 8265 8266 uint_t 8267 ip_get_lifsrcofnum(ill_t *ill) 8268 { 8269 uint_t numifs = 0; 8270 ill_t *ill_head = ill; 8271 8272 /* 8273 * ill_g_usesrc_lock protects ill_usesrc_grp_next, for example, some 8274 * other thread may be trying to relink the ILLs in this usesrc group 8275 * and adjusting the ill_usesrc_grp_next pointers 8276 */ 8277 rw_enter(&ill_g_usesrc_lock, RW_READER); 8278 if ((ill->ill_usesrc_ifindex == 0) && 8279 (ill->ill_usesrc_grp_next != NULL)) { 8280 for (; (ill != NULL) && (ill->ill_usesrc_grp_next != ill_head); 8281 ill = ill->ill_usesrc_grp_next) 8282 numifs++; 8283 } 8284 rw_exit(&ill_g_usesrc_lock); 8285 8286 return (numifs); 8287 } 8288 8289 /* Null values are passed in for ipif, sin, and ifreq */ 8290 /* ARGSUSED */ 8291 int 8292 ip_sioctl_get_ifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8293 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8294 { 8295 int *nump; 8296 8297 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 8298 8299 /* Existence of b_cont->b_cont checked in ip_wput_nondata */ 8300 nump = (int *)mp->b_cont->b_cont->b_rptr; 8301 8302 *nump = ip_get_numifs(Q_TO_CONN(q)->conn_zoneid); 8303 ip1dbg(("ip_sioctl_get_ifnum numifs %d", *nump)); 8304 return (0); 8305 } 8306 8307 /* Null values are passed in for ipif, sin, and ifreq */ 8308 /* ARGSUSED */ 8309 int 8310 ip_sioctl_get_lifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, 8311 queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8312 { 8313 struct lifnum *lifn; 8314 mblk_t *mp1; 8315 8316 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 8317 8318 /* Existence checked in ip_wput_nondata */ 8319 mp1 = mp->b_cont->b_cont; 8320 8321 lifn = (struct lifnum *)mp1->b_rptr; 8322 switch (lifn->lifn_family) { 8323 case AF_UNSPEC: 8324 case AF_INET: 8325 case AF_INET6: 8326 break; 8327 default: 8328 return (EAFNOSUPPORT); 8329 } 8330 8331 lifn->lifn_count = ip_get_numlifs(lifn->lifn_family, lifn->lifn_flags, 8332 Q_TO_CONN(q)->conn_zoneid); 8333 ip1dbg(("ip_sioctl_get_lifnum numifs %d", lifn->lifn_count)); 8334 return (0); 8335 } 8336 8337 /* ARGSUSED */ 8338 int 8339 ip_sioctl_get_ifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8340 mblk_t *mp, 
ip_ioctl_cmd_t *ipip, void *ifreq) 8341 { 8342 STRUCT_HANDLE(ifconf, ifc); 8343 mblk_t *mp1; 8344 struct iocblk *iocp; 8345 struct ifreq *ifr; 8346 ill_walk_context_t ctx; 8347 ill_t *ill; 8348 ipif_t *ipif; 8349 struct sockaddr_in *sin; 8350 int32_t ifclen; 8351 zoneid_t zoneid; 8352 8353 ASSERT(q->q_next == NULL); /* not valid ioctls for ip as a module */ 8354 8355 ip1dbg(("ip_sioctl_get_ifconf")); 8356 /* Existence verified in ip_wput_nondata */ 8357 mp1 = mp->b_cont->b_cont; 8358 iocp = (struct iocblk *)mp->b_rptr; 8359 zoneid = Q_TO_CONN(q)->conn_zoneid; 8360 8361 /* 8362 * The original SIOCGIFCONF passed in a struct ifconf which specified 8363 * the user buffer address and length into which the list of struct 8364 * ifreqs was to be copied. Since AT&T Streams does not seem to 8365 * allow M_COPYOUT to be used in conjunction with I_STR IOCTLS, 8366 * the SIOCGIFCONF operation was redefined to simply provide 8367 * a large output buffer into which we are supposed to jam the ifreq 8368 * array. The same ioctl command code was used, despite the fact that 8369 * both the applications and the kernel code had to change, thus making 8370 * it impossible to support both interfaces. 8371 * 8372 * For reasons not good enough to try to explain, the following 8373 * algorithm is used for deciding what to do with one of these: 8374 * If the IOCTL comes in as an I_STR, it is assumed to be of the new 8375 * form with the output buffer coming down as the continuation message. 8376 * If it arrives as a TRANSPARENT IOCTL, it is assumed to be old style, 8377 * and we have to copy in the ifconf structure to find out how big the 8378 * output buffer is and where to copy out to. Sure no problem... 8379 * 8380 */ 8381 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, NULL); 8382 if ((mp1->b_wptr - mp1->b_rptr) == STRUCT_SIZE(ifc)) { 8383 int numifs = 0; 8384 size_t ifc_bufsize; 8385 8386 /* 8387 * Must be (better be!) continuation of a TRANSPARENT 8388 * IOCTL. We just copied in the ifconf structure. 8389 */ 8390 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, 8391 (struct ifconf *)mp1->b_rptr); 8392 8393 /* 8394 * Allocate a buffer to hold requested information. 8395 * 8396 * If ifc_len is larger than what is needed, we only 8397 * allocate what we will use. 8398 * 8399 * If ifc_len is smaller than what is needed, return 8400 * EINVAL. 8401 * 8402 * XXX: the ill_t structure can have 2 counters, for 8403 * v4 and v6 (not just ill_ipif_up_count) to store the 8404 * number of interfaces for a device, so we don't need 8405 * to count them here... 8406 */ 8407 numifs = ip_get_numifs(zoneid); 8408 8409 ifclen = STRUCT_FGET(ifc, ifc_len); 8410 ifc_bufsize = numifs * sizeof (struct ifreq); 8411 if (ifc_bufsize > ifclen) { 8412 if (iocp->ioc_cmd == O_SIOCGIFCONF) { 8413 /* old behaviour */ 8414 return (EINVAL); 8415 } else { 8416 ifc_bufsize = ifclen; 8417 } 8418 } 8419 8420 mp1 = mi_copyout_alloc(q, mp, 8421 STRUCT_FGETP(ifc, ifc_buf), ifc_bufsize, B_FALSE); 8422 if (mp1 == NULL) 8423 return (ENOMEM); 8424 8425 mp1->b_wptr = mp1->b_rptr + ifc_bufsize; 8426 } 8427 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr); 8428 /* 8429 * the SIOCGIFCONF ioctl only knows about 8430 * IPv4 addresses, so don't try to tell 8431 * it about interfaces with IPv6-only 8432 * addresses.
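 *
 * Illustrative userland sketch of the transparent form (the buffer
 * size is a hypothetical choice):
 *
 *	struct ifconf ifc;
 *	char buf[8192];
 *
 *	ifc.ifc_len = sizeof (buf);
 *	ifc.ifc_buf = buf;
 *	if (ioctl(s, SIOCGIFCONF, &ifc) == 0)
 *		... ifc.ifc_len / sizeof (struct ifreq) entries are valid ...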
(Last parm 'isv6' is B_FALSE) 8433 */ 8434 8435 ifr = (struct ifreq *)mp1->b_rptr; 8436 8437 rw_enter(&ill_g_lock, RW_READER); 8438 ill = ILL_START_WALK_V4(&ctx); 8439 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 8440 for (ipif = ill->ill_ipif; ipif != NULL; 8441 ipif = ipif->ipif_next) { 8442 if (zoneid != ipif->ipif_zoneid && 8443 ipif->ipif_zoneid != ALL_ZONES) 8444 continue; 8445 if ((uchar_t *)&ifr[1] > mp1->b_wptr) { 8446 if (iocp->ioc_cmd == O_SIOCGIFCONF) { 8447 /* old behaviour */ 8448 rw_exit(&ill_g_lock); 8449 return (EINVAL); 8450 } else { 8451 goto if_copydone; 8452 } 8453 } 8454 (void) ipif_get_name(ipif, 8455 ifr->ifr_name, 8456 sizeof (ifr->ifr_name)); 8457 sin = (sin_t *)&ifr->ifr_addr; 8458 *sin = sin_null; 8459 sin->sin_family = AF_INET; 8460 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 8461 ifr++; 8462 } 8463 } 8464 if_copydone: 8465 rw_exit(&ill_g_lock); 8466 mp1->b_wptr = (uchar_t *)ifr; 8467 8468 if (STRUCT_BUF(ifc) != NULL) { 8469 STRUCT_FSET(ifc, ifc_len, 8470 (int)((uchar_t *)ifr - mp1->b_rptr)); 8471 } 8472 return (0); 8473 } 8474 8475 /* 8476 * Get the interfaces using the address hosted on the interface passed in, 8477 * as a source address. 8478 */ 8479 /* ARGSUSED */ 8480 int 8481 ip_sioctl_get_lifsrcof(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8482 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8483 { 8484 mblk_t *mp1; 8485 ill_t *ill, *ill_head; 8486 ipif_t *ipif, *orig_ipif; 8487 int numlifs = 0; 8488 size_t lifs_bufsize, lifsmaxlen; 8489 struct lifreq *lifr; 8490 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8491 uint_t ifindex; 8492 zoneid_t zoneid; 8493 int err = 0; 8494 boolean_t isv6 = B_FALSE; 8495 struct sockaddr_in *sin; 8496 struct sockaddr_in6 *sin6; 8497 8498 STRUCT_HANDLE(lifsrcof, lifs); 8499 8500 ASSERT(q->q_next == NULL); 8501 8502 zoneid = Q_TO_CONN(q)->conn_zoneid; 8503 8504 /* Existence verified in ip_wput_nondata */ 8505 mp1 = mp->b_cont->b_cont; 8506 8507 /* 8508 * Must be (better be!) continuation of a TRANSPARENT 8509 * IOCTL. We just copied in the lifsrcof structure.
8510 */ 8511 STRUCT_SET_HANDLE(lifs, iocp->ioc_flag, 8512 (struct lifsrcof *)mp1->b_rptr); 8513 8514 if (MBLKL(mp1) != STRUCT_SIZE(lifs)) 8515 return (EINVAL); 8516 8517 ifindex = STRUCT_FGET(lifs, lifs_ifindex); 8518 isv6 = (Q_TO_CONN(q))->conn_af_isv6; 8519 ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid, q, mp, 8520 ip_process_ioctl, &err); 8521 if (ipif == NULL) { 8522 ip1dbg(("ip_sioctl_get_lifsrcof: no ipif for ifindex %d\n", 8523 ifindex)); 8524 return (err); 8525 } 8526 8527 8528 /* Allocate a buffer to hold requested information */ 8529 numlifs = ip_get_lifsrcofnum(ipif->ipif_ill); 8530 lifs_bufsize = numlifs * sizeof (struct lifreq); 8531 lifsmaxlen = STRUCT_FGET(lifs, lifs_maxlen); 8532 /* The actual size needed is always returned in lifs_len */ 8533 STRUCT_FSET(lifs, lifs_len, lifs_bufsize); 8534 8535 /* If the amount we need is more than what is passed in, abort */ 8536 if (lifs_bufsize > lifsmaxlen || lifs_bufsize == 0) { 8537 ipif_refrele(ipif); 8538 return (0); 8539 } 8540 8541 mp1 = mi_copyout_alloc(q, mp, 8542 STRUCT_FGETP(lifs, lifs_buf), lifs_bufsize, B_FALSE); 8543 if (mp1 == NULL) { 8544 ipif_refrele(ipif); 8545 return (ENOMEM); 8546 } 8547 8548 mp1->b_wptr = mp1->b_rptr + lifs_bufsize; 8549 bzero(mp1->b_rptr, lifs_bufsize); 8550 8551 lifr = (struct lifreq *)mp1->b_rptr; 8552 8553 ill = ill_head = ipif->ipif_ill; 8554 orig_ipif = ipif; 8555 8556 /* ill_g_usesrc_lock protects ill_usesrc_grp_next */ 8557 rw_enter(&ill_g_usesrc_lock, RW_READER); 8558 rw_enter(&ill_g_lock, RW_READER); 8559 8560 ill = ill->ill_usesrc_grp_next; /* start from next ill */ 8561 for (; (ill != NULL) && (ill != ill_head); 8562 ill = ill->ill_usesrc_grp_next) { 8563 8564 if ((uchar_t *)&lifr[1] > mp1->b_wptr) 8565 break; 8566 8567 ipif = ill->ill_ipif; 8568 (void) ipif_get_name(ipif, 8569 lifr->lifr_name, sizeof (lifr->lifr_name)); 8570 if (ipif->ipif_isv6) { 8571 sin6 = (sin6_t *)&lifr->lifr_addr; 8572 *sin6 = sin6_null; 8573 sin6->sin6_family = AF_INET6; 8574 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 8575 lifr->lifr_addrlen = ip_mask_to_plen_v6( 8576 &ipif->ipif_v6net_mask); 8577 } else { 8578 sin = (sin_t *)&lifr->lifr_addr; 8579 *sin = sin_null; 8580 sin->sin_family = AF_INET; 8581 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 8582 lifr->lifr_addrlen = ip_mask_to_plen( 8583 ipif->ipif_net_mask); 8584 } 8585 lifr++; 8586 } 8587 rw_exit(&ill_g_usesrc_lock); 8588 rw_exit(&ill_g_lock); 8589 ipif_refrele(orig_ipif); 8590 mp1->b_wptr = (uchar_t *)lifr; 8591 STRUCT_FSET(lifs, lifs_len, (int)((uchar_t *)lifr - mp1->b_rptr)); 8592 8593 return (0); 8594 } 8595 8596 /* ARGSUSED */ 8597 int 8598 ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8599 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8600 { 8601 mblk_t *mp1; 8602 int list; 8603 ill_t *ill; 8604 ipif_t *ipif; 8605 int flags; 8606 int numlifs = 0; 8607 size_t lifc_bufsize; 8608 struct lifreq *lifr; 8609 sa_family_t family; 8610 struct sockaddr_in *sin; 8611 struct sockaddr_in6 *sin6; 8612 ill_walk_context_t ctx; 8613 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8614 int32_t lifclen; 8615 zoneid_t zoneid; 8616 STRUCT_HANDLE(lifconf, lifc); 8617 8618 ip1dbg(("ip_sioctl_get_lifconf")); 8619 8620 ASSERT(q->q_next == NULL); 8621 8622 zoneid = Q_TO_CONN(q)->conn_zoneid; 8623 8624 /* Existence verified in ip_wput_nondata */ 8625 mp1 = mp->b_cont->b_cont; 8626 8627 /* 8628 * An extended version of SIOCGIFCONF that takes an 8629 * additional address family and flags field. 8630 * AF_UNSPEC retrieves both IPv4 and IPv6.
8631 * Unless LIFC_NOXMIT is specified the IPIF_NOXMIT 8632 * interfaces are omitted. 8633 * Similarly, IPIF_TEMPORARY interfaces are omitted 8634 * unless LIFC_TEMPORARY is specified. 8635 * If LIFC_EXTERNAL_SOURCE is specified, IPIF_NOXMIT, 8636 * IPIF_NOLOCAL, PHYI_LOOPBACK, IPIF_DEPRECATED and 8637 * not IPIF_UP interfaces are omitted. LIFC_EXTERNAL_SOURCE 8638 * has priority over LIFC_NOXMIT. 8639 */ 8640 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, NULL); 8641 8642 if ((mp1->b_wptr - mp1->b_rptr) != STRUCT_SIZE(lifc)) 8643 return (EINVAL); 8644 8645 /* 8646 * Must be (better be!) continuation of a TRANSPARENT 8647 * IOCTL. We just copied in the lifconf structure. 8648 */ 8649 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, (struct lifconf *)mp1->b_rptr); 8650 8651 family = STRUCT_FGET(lifc, lifc_family); 8652 flags = STRUCT_FGET(lifc, lifc_flags); 8653 8654 switch (family) { 8655 case AF_UNSPEC: 8656 /* 8657 * walk all ILL's. 8658 */ 8659 list = MAX_G_HEADS; 8660 break; 8661 case AF_INET: 8662 /* 8663 * walk only IPV4 ILL's. 8664 */ 8665 list = IP_V4_G_HEAD; 8666 break; 8667 case AF_INET6: 8668 /* 8669 * walk only IPV6 ILL's. 8670 */ 8671 list = IP_V6_G_HEAD; 8672 break; 8673 default: 8674 return (EAFNOSUPPORT); 8675 } 8676 8677 /* 8678 * Allocate a buffer to hold requested information. 8679 * 8680 * If lifc_len is larger than what is needed, we only 8681 * allocate what we will use. 8682 * 8683 * If lifc_len is smaller than what is needed, return 8684 * EINVAL. 8685 */ 8686 numlifs = ip_get_numlifs(family, flags, zoneid); 8687 lifc_bufsize = numlifs * sizeof (struct lifreq); 8688 lifclen = STRUCT_FGET(lifc, lifc_len); 8689 if (lifc_bufsize > lifclen) { 8690 if (iocp->ioc_cmd == O_SIOCGLIFCONF) 8691 return (EINVAL); 8692 else 8693 lifc_bufsize = lifclen; 8694 } 8695 8696 mp1 = mi_copyout_alloc(q, mp, 8697 STRUCT_FGETP(lifc, lifc_buf), lifc_bufsize, B_FALSE); 8698 if (mp1 == NULL) 8699 return (ENOMEM); 8700 8701 mp1->b_wptr = mp1->b_rptr + lifc_bufsize; 8702 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr); 8703 8704 lifr = (struct lifreq *)mp1->b_rptr; 8705 8706 rw_enter(&ill_g_lock, RW_READER); 8707 ill = ill_first(list, list, &ctx); 8708 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 8709 for (ipif = ill->ill_ipif; ipif != NULL; 8710 ipif = ipif->ipif_next) { 8711 if ((ipif->ipif_flags & IPIF_NOXMIT) && 8712 !(flags & LIFC_NOXMIT)) 8713 continue; 8714 8715 if ((ipif->ipif_flags & IPIF_TEMPORARY) && 8716 !(flags & LIFC_TEMPORARY)) 8717 continue; 8718 8719 if (((ipif->ipif_flags & 8720 (IPIF_NOXMIT|IPIF_NOLOCAL| 8721 IPIF_DEPRECATED)) || 8722 (ill->ill_phyint->phyint_flags & 8723 PHYI_LOOPBACK) || 8724 !(ipif->ipif_flags & IPIF_UP)) && 8725 (flags & LIFC_EXTERNAL_SOURCE)) 8726 continue; 8727 8728 if (zoneid != ipif->ipif_zoneid && 8729 ipif->ipif_zoneid != ALL_ZONES && 8730 (zoneid != GLOBAL_ZONEID || 8731 !(flags & LIFC_ALLZONES))) 8732 continue; 8733 8734 if ((uchar_t *)&lifr[1] > mp1->b_wptr) { 8735 if (iocp->ioc_cmd == O_SIOCGLIFCONF) { 8736 rw_exit(&ill_g_lock); 8737 return (EINVAL); 8738 } else { 8739 goto lif_copydone; 8740 } 8741 } 8742 8743 (void) ipif_get_name(ipif, 8744 lifr->lifr_name, 8745 sizeof (lifr->lifr_name)); 8746 if (ipif->ipif_isv6) { 8747 sin6 = (sin6_t *)&lifr->lifr_addr; 8748 *sin6 = sin6_null; 8749 sin6->sin6_family = AF_INET6; 8750 sin6->sin6_addr = 8751 ipif->ipif_v6lcl_addr; 8752 lifr->lifr_addrlen = 8753 ip_mask_to_plen_v6( 8754 &ipif->ipif_v6net_mask); 8755 } else { 8756 sin = (sin_t *)&lifr->lifr_addr; 8757 *sin = sin_null; 8758 sin->sin_family = AF_INET; 8759 
sin->sin_addr.s_addr = 8760 ipif->ipif_lcl_addr; 8761 lifr->lifr_addrlen = 8762 ip_mask_to_plen( 8763 ipif->ipif_net_mask); 8764 } 8765 lifr++; 8766 } 8767 } 8768 lif_copydone: 8769 rw_exit(&ill_g_lock); 8770 8771 mp1->b_wptr = (uchar_t *)lifr; 8772 if (STRUCT_BUF(lifc) != NULL) { 8773 STRUCT_FSET(lifc, lifc_len, 8774 (int)((uchar_t *)lifr - mp1->b_rptr)); 8775 } 8776 return (0); 8777 } 8778 8779 /* ARGSUSED */ 8780 int 8781 ip_sioctl_set_ipmpfailback(ipif_t *dummy_ipif, sin_t *dummy_sin, 8782 queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8783 { 8784 /* Existence of b_cont->b_cont checked in ip_wput_nondata */ 8785 ipmp_enable_failback = *(int *)mp->b_cont->b_cont->b_rptr; 8786 return (0); 8787 } 8788 8789 static void 8790 ip_sioctl_ip6addrpolicy(queue_t *q, mblk_t *mp) 8791 { 8792 ip6_asp_t *table; 8793 size_t table_size; 8794 mblk_t *data_mp; 8795 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8796 8797 /* These two ioctls are I_STR only */ 8798 if (iocp->ioc_count == TRANSPARENT) { 8799 miocnak(q, mp, 0, EINVAL); 8800 return; 8801 } 8802 8803 data_mp = mp->b_cont; 8804 if (data_mp == NULL) { 8805 /* The user passed us a NULL argument */ 8806 table = NULL; 8807 table_size = iocp->ioc_count; 8808 } else { 8809 /* 8810 * The user provided a table. The stream head 8811 * may have copied in the user data in chunks, 8812 * so make sure everything is pulled up 8813 * properly. 8814 */ 8815 if (MBLKL(data_mp) < iocp->ioc_count) { 8816 mblk_t *new_data_mp; 8817 if ((new_data_mp = msgpullup(data_mp, -1)) == 8818 NULL) { 8819 miocnak(q, mp, 0, ENOMEM); 8820 return; 8821 } 8822 freemsg(data_mp); 8823 data_mp = new_data_mp; 8824 mp->b_cont = data_mp; 8825 } 8826 table = (ip6_asp_t *)data_mp->b_rptr; 8827 table_size = iocp->ioc_count; 8828 } 8829 8830 switch (iocp->ioc_cmd) { 8831 case SIOCGIP6ADDRPOLICY: 8832 iocp->ioc_rval = ip6_asp_get(table, table_size); 8833 if (iocp->ioc_rval == -1) 8834 iocp->ioc_error = EINVAL; 8835 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4 8836 else if (table != NULL && 8837 (iocp->ioc_flag & IOC_MODELS) == IOC_ILP32) { 8838 ip6_asp_t *src = table; 8839 ip6_asp32_t *dst = (void *)table; 8840 int count = table_size / sizeof (ip6_asp_t); 8841 int i; 8842 8843 /* 8844 * We need to do an in-place shrink of the array 8845 * to match the alignment attributes of the 8846 * 32-bit ABI looking at it. 8847 */ 8848 /* LINTED: logical expression always true: op "||" */ 8849 ASSERT(sizeof (*src) > sizeof (*dst)); 8850 for (i = 1; i < count; i++) 8851 bcopy(src + i, dst + i, sizeof (*dst)); 8852 } 8853 #endif 8854 break; 8855 8856 case SIOCSIP6ADDRPOLICY: 8857 ASSERT(mp->b_prev == NULL); 8858 mp->b_prev = (void *)q; 8859 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4 8860 /* 8861 * We pass in the datamodel here so that the ip6_asp_replace() 8862 * routine can handle converting from 32-bit to native formats 8863 * where necessary. 8864 * 8865 * A better way to handle this might be to convert the inbound 8866 * data structure here, and hang it off a new 'mp'; thus the 8867 * ip6_asp_replace() logic would always be dealing with native 8868 * format data structures.. 8869 * 8870 * (An even simpler way to handle these ioctls is to just 8871 * add a 32-bit trailing 'pad' field to the ip6_asp_t structure 8872 * and just recompile everything that depends on it.) 8873 */ 8874 #endif 8875 ip6_asp_replace(mp, table, table_size, B_FALSE, 8876 iocp->ioc_flag & IOC_MODELS); 8877 return; 8878 } 8879 8880 DB_TYPE(mp) = (iocp->ioc_error == 0) ? 
M_IOCACK : M_IOCNAK; 8881 qreply(q, mp); 8882 } 8883 8884 static void 8885 ip_sioctl_dstinfo(queue_t *q, mblk_t *mp) 8886 { 8887 mblk_t *data_mp; 8888 struct dstinforeq *dir; 8889 uint8_t *end, *cur; 8890 in6_addr_t *daddr, *saddr; 8891 ipaddr_t v4daddr; 8892 ire_t *ire; 8893 char *slabel, *dlabel; 8894 boolean_t isipv4; 8895 int match_ire; 8896 ill_t *dst_ill; 8897 ipif_t *src_ipif, *ire_ipif; 8898 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8899 zoneid_t zoneid; 8900 8901 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 8902 zoneid = Q_TO_CONN(q)->conn_zoneid; 8903 8904 /* 8905 * This ioctl is I_STR only, and must have a 8906 * data mblk following the M_IOCTL mblk. 8907 */ 8908 data_mp = mp->b_cont; 8909 if (iocp->ioc_count == TRANSPARENT || data_mp == NULL) { 8910 miocnak(q, mp, 0, EINVAL); 8911 return; 8912 } 8913 8914 if (MBLKL(data_mp) < iocp->ioc_count) { 8915 mblk_t *new_data_mp; 8916 8917 if ((new_data_mp = msgpullup(data_mp, -1)) == NULL) { 8918 miocnak(q, mp, 0, ENOMEM); 8919 return; 8920 } 8921 freemsg(data_mp); 8922 data_mp = new_data_mp; 8923 mp->b_cont = data_mp; 8924 } 8925 match_ire = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_PARENT; 8926 8927 for (cur = data_mp->b_rptr, end = data_mp->b_wptr; 8928 end - cur >= sizeof (struct dstinforeq); 8929 cur += sizeof (struct dstinforeq)) { 8930 dir = (struct dstinforeq *)cur; 8931 daddr = &dir->dir_daddr; 8932 saddr = &dir->dir_saddr; 8933 8934 /* 8935 * ip_addr_scope_v6() and ip6_asp_lookup() handle 8936 * v4 mapped addresses; ire_ftable_lookup[_v6]() 8937 * and ipif_select_source[_v6]() do not. 8938 */ 8939 dir->dir_dscope = ip_addr_scope_v6(daddr); 8940 dlabel = ip6_asp_lookup(daddr, &dir->dir_precedence); 8941 8942 isipv4 = IN6_IS_ADDR_V4MAPPED(daddr); 8943 if (isipv4) { 8944 IN6_V4MAPPED_TO_IPADDR(daddr, v4daddr); 8945 ire = ire_ftable_lookup(v4daddr, NULL, NULL, 8946 0, NULL, NULL, zoneid, 0, NULL, match_ire); 8947 } else { 8948 ire = ire_ftable_lookup_v6(daddr, NULL, NULL, 8949 0, NULL, NULL, zoneid, 0, NULL, match_ire); 8950 } 8951 if (ire == NULL) { 8952 dir->dir_dreachable = 0; 8953 8954 /* move on to next dst addr */ 8955 continue; 8956 } 8957 dir->dir_dreachable = 1; 8958 8959 ire_ipif = ire->ire_ipif; 8960 if (ire_ipif == NULL) 8961 goto next_dst; 8962 8963 /* 8964 * We expect to get back an interface ire or a 8965 * gateway ire cache entry. For both types, the 8966 * output interface is ire_ipif->ipif_ill. 8967 */ 8968 dst_ill = ire_ipif->ipif_ill; 8969 dir->dir_dmactype = dst_ill->ill_mactype; 8970 8971 if (isipv4) { 8972 src_ipif = ipif_select_source(dst_ill, v4daddr, zoneid); 8973 } else { 8974 src_ipif = ipif_select_source_v6(dst_ill, 8975 daddr, RESTRICT_TO_NONE, IPV6_PREFER_SRC_DEFAULT, 8976 zoneid); 8977 } 8978 if (src_ipif == NULL) 8979 goto next_dst; 8980 8981 *saddr = src_ipif->ipif_v6lcl_addr; 8982 dir->dir_sscope = ip_addr_scope_v6(saddr); 8983 slabel = ip6_asp_lookup(saddr, NULL); 8984 dir->dir_labelmatch = ip6_asp_labelcmp(dlabel, slabel); 8985 dir->dir_sdeprecated = 8986 (src_ipif->ipif_flags & IPIF_DEPRECATED) ? 1 : 0; 8987 ipif_refrele(src_ipif); 8988 next_dst: 8989 ire_refrele(ire); 8990 } 8991 miocack(q, mp, iocp->ioc_count, 0); 8992 } 8993 8994 8995 /* 8996 * Check if this is an address assigned to this machine. 8997 * Skips interfaces that are down by using ire checks. 8998 * Translates mapped addresses to v4 addresses and then 8999 * treats them as such, returning true if the v4 address 9000 * associated with this mapped address is configured. 
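 *
 * Illustrative userland sketch (SIOCTMYADDR is the command served
 * here; 's' and 'addr' are assumed):
 *
 *	struct sioc_addrreq sia;
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&sia.sa_addr;
 *
 *	bzero(&sia, sizeof (sia));
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = addr;
 *	if (ioctl(s, SIOCTMYADDR, &sia) == 0)
 *		... sia.sa_res is 1 iff addr is configured on this node ...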
9001 * Note: Applications will have to be careful what they do 9002 * with the response; use of mapped addresses limits 9003 * what can be done with the socket, especially with 9004 * respect to socket options and ioctls - neither IPv4 9005 * options nor IPv6 sticky options/ancillary data options 9006 * may be used. 9007 */ 9008 /* ARGSUSED */ 9009 int 9010 ip_sioctl_tmyaddr(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9011 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 9012 { 9013 struct sioc_addrreq *sia; 9014 sin_t *sin; 9015 ire_t *ire; 9016 mblk_t *mp1; 9017 zoneid_t zoneid; 9018 9019 ip1dbg(("ip_sioctl_tmyaddr")); 9020 9021 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 9022 zoneid = Q_TO_CONN(q)->conn_zoneid; 9023 9024 /* Existence verified in ip_wput_nondata */ 9025 mp1 = mp->b_cont->b_cont; 9026 sia = (struct sioc_addrreq *)mp1->b_rptr; 9027 sin = (sin_t *)&sia->sa_addr; 9028 switch (sin->sin_family) { 9029 case AF_INET6: { 9030 sin6_t *sin6 = (sin6_t *)sin; 9031 9032 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 9033 ipaddr_t v4_addr; 9034 9035 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, 9036 v4_addr); 9037 ire = ire_ctable_lookup(v4_addr, 0, 9038 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 9039 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY); 9040 } else { 9041 in6_addr_t v6addr; 9042 9043 v6addr = sin6->sin6_addr; 9044 ire = ire_ctable_lookup_v6(&v6addr, 0, 9045 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 9046 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY); 9047 } 9048 break; 9049 } 9050 case AF_INET: { 9051 ipaddr_t v4addr; 9052 9053 v4addr = sin->sin_addr.s_addr; 9054 ire = ire_ctable_lookup(v4addr, 0, 9055 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 9056 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY); 9057 break; 9058 } 9059 default: 9060 return (EAFNOSUPPORT); 9061 } 9062 if (ire != NULL) { 9063 sia->sa_res = 1; 9064 ire_refrele(ire); 9065 } else { 9066 sia->sa_res = 0; 9067 } 9068 return (0); 9069 } 9070 9071 /* 9072 * Check if this is an address assigned on-link, i.e., a neighbor, 9073 * and make sure it's reachable from the current zone. 9074 * Returns true for my addresses as well. 9075 * Translates mapped addresses to v4 addresses and then 9076 * treats them as such, returning true if the v4 address 9077 * associated with this mapped address is configured. 9078 * Note: Applications will have to be careful what they do 9079 * with the response; use of mapped addresses limits 9080 * what can be done with the socket, especially with 9081 * respect to socket options and ioctls - neither IPv4 9082 * options nor IPv6 sticky options/ancillary data options 9083 * may be used. 9084 */ 9085 /* ARGSUSED */ 9086 int 9087 ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9088 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 9089 { 9090 struct sioc_addrreq *sia; 9091 sin_t *sin; 9092 mblk_t *mp1; 9093 ire_t *ire = NULL; 9094 zoneid_t zoneid; 9095 9096 ip1dbg(("ip_sioctl_tonlink")); 9097 9098 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 9099 zoneid = Q_TO_CONN(q)->conn_zoneid; 9100 9101 /* Existence verified in ip_wput_nondata */ 9102 mp1 = mp->b_cont->b_cont; 9103 sia = (struct sioc_addrreq *)mp1->b_rptr; 9104 sin = (sin_t *)&sia->sa_addr; 9105 9106 /* 9107 * Match addresses with a zero gateway field to avoid 9108 * routes going through a router. 9109 * Exclude broadcast and multicast addresses.
9110 */ 9111 switch (sin->sin_family) { 9112 case AF_INET6: { 9113 sin6_t *sin6 = (sin6_t *)sin; 9114 9115 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 9116 ipaddr_t v4_addr; 9117 9118 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, 9119 v4_addr); 9120 if (!CLASSD(v4_addr)) { 9121 ire = ire_route_lookup(v4_addr, 0, 0, 0, 9122 NULL, NULL, zoneid, NULL, 9123 MATCH_IRE_GW); 9124 } 9125 } else { 9126 in6_addr_t v6addr; 9127 in6_addr_t v6gw; 9128 9129 v6addr = sin6->sin6_addr; 9130 v6gw = ipv6_all_zeros; 9131 if (!IN6_IS_ADDR_MULTICAST(&v6addr)) { 9132 ire = ire_route_lookup_v6(&v6addr, 0, 9133 &v6gw, 0, NULL, NULL, zoneid, 9134 NULL, MATCH_IRE_GW); 9135 } 9136 } 9137 break; 9138 } 9139 case AF_INET: { 9140 ipaddr_t v4addr; 9141 9142 v4addr = sin->sin_addr.s_addr; 9143 if (!CLASSD(v4addr)) { 9144 ire = ire_route_lookup(v4addr, 0, 0, 0, 9145 NULL, NULL, zoneid, NULL, 9146 MATCH_IRE_GW); 9147 } 9148 break; 9149 } 9150 default: 9151 return (EAFNOSUPPORT); 9152 } 9153 sia->sa_res = 0; 9154 if (ire != NULL) { 9155 if (ire->ire_type & (IRE_INTERFACE|IRE_CACHE| 9156 IRE_LOCAL|IRE_LOOPBACK)) { 9157 sia->sa_res = 1; 9158 } 9159 ire_refrele(ire); 9160 } 9161 return (0); 9162 } 9163 9164 /* 9165 * TBD: implement when kernel maintains a list of site prefixes. 9166 */ 9167 /* ARGSUSED */ 9168 int 9169 ip_sioctl_tmysite(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9170 ip_ioctl_cmd_t *ipip, void *ifreq) 9171 { 9172 return (ENXIO); 9173 } 9174 9175 /* ARGSUSED */ 9176 int 9177 ip_sioctl_tunparam(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9178 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 9179 { 9180 ill_t *ill; 9181 mblk_t *mp1; 9182 conn_t *connp; 9183 boolean_t success; 9184 9185 ip1dbg(("ip_sioctl_tunparam(%s:%u %p)\n", 9186 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9187 /* ioctl comes down on a conn */ 9188 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 9189 connp = Q_TO_CONN(q); 9190 9191 mp->b_datap->db_type = M_IOCTL; 9192 9193 /* 9194 * Send down a copy. (copymsg does not copy b_next/b_prev). 9195 * The original mp contains contaminated b_next values due to 'mi', 9196 * which is needed to do the mi_copy_done. Unfortunately if we 9197 * send down the original mblk itself and if we are popped due to 9198 * an unplumb before the response comes back from the tunnel, 9199 * the streamhead (which does a freemsg) will see this contaminated 9200 * message and the assertion in freemsg about non-null b_next/b_prev 9201 * will panic a DEBUG kernel.
9202 */ 9203 mp1 = copymsg(mp); 9204 if (mp1 == NULL) 9205 return (ENOMEM); 9206 9207 ill = ipif->ipif_ill; 9208 mutex_enter(&connp->conn_lock); 9209 mutex_enter(&ill->ill_lock); 9210 if (ipip->ipi_cmd == SIOCSTUNPARAM || ipip->ipi_cmd == OSIOCSTUNPARAM) { 9211 success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), 9212 mp, 0); 9213 } else { 9214 success = ill_pending_mp_add(ill, connp, mp); 9215 } 9216 mutex_exit(&ill->ill_lock); 9217 mutex_exit(&connp->conn_lock); 9218 9219 if (success) { 9220 ip1dbg(("sending down tunparam request ")); 9221 putnext(ill->ill_wq, mp1); 9222 return (EINPROGRESS); 9223 } else { 9224 /* The conn has started closing */ 9225 freemsg(mp1); 9226 return (EINTR); 9227 } 9228 } 9229 9230 static int 9231 ip_sioctl_arp_common(ill_t *ill, queue_t *q, mblk_t *mp, sin_t *sin, 9232 boolean_t x_arp_ioctl, boolean_t if_arp_ioctl) 9233 { 9234 mblk_t *mp1; 9235 mblk_t *mp2; 9236 mblk_t *pending_mp; 9237 ipaddr_t ipaddr; 9238 area_t *area; 9239 struct iocblk *iocp; 9240 conn_t *connp; 9241 struct arpreq *ar; 9242 struct xarpreq *xar; 9243 boolean_t success; 9244 int flags, alength; 9245 char *lladdr; 9246 9247 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 9248 connp = Q_TO_CONN(q); 9249 9250 iocp = (struct iocblk *)mp->b_rptr; 9251 /* 9252 * ill has already been set depending on whether 9253 * bsd style or interface style ioctl. 9254 */ 9255 ASSERT(ill != NULL); 9256 9257 /* 9258 * Is this one of the new SIOC*XARP ioctls? 9259 */ 9260 if (x_arp_ioctl) { 9261 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->XARPREQ_MBLK */ 9262 xar = (struct xarpreq *)mp->b_cont->b_cont->b_rptr; 9263 ar = NULL; 9264 9265 flags = xar->xarp_flags; 9266 lladdr = LLADDR(&xar->xarp_ha); 9267 /* 9268 * Validate against user's link layer address length 9269 * input and name and addr length limits. 9270 */ 9271 alength = ill->ill_phys_addr_length; 9272 if (iocp->ioc_cmd == SIOCSXARP) { 9273 if (alength != xar->xarp_ha.sdl_alen || 9274 (alength + xar->xarp_ha.sdl_nlen > 9275 sizeof (xar->xarp_ha.sdl_data))) 9276 return (EINVAL); 9277 } 9278 } else { 9279 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->ARPREQ_MBLK */ 9280 ar = (struct arpreq *)mp->b_cont->b_cont->b_rptr; 9281 xar = NULL; 9282 9283 flags = ar->arp_flags; 9284 lladdr = ar->arp_ha.sa_data; 9285 /* 9286 * Theoretically, the sa_family could tell us what link 9287 * layer type this operation is trying to deal with. By 9288 * common usage AF_UNSPEC means ethernet. We'll assume 9289 * any attempt to use the SIOC?ARP ioctls is for ethernet, 9290 * for now. Our new SIOC*XARP ioctls can be used more 9291 * generally. 9292 * 9293 * If the underlying media happens to have a non 6 byte 9294 * address, arp module will fail set/get, but the del 9295 * operation will succeed. 9296 */ 9297 alength = 6; 9298 if ((iocp->ioc_cmd != SIOCDARP) && 9299 (alength != ill->ill_phys_addr_length)) { 9300 return (EINVAL); 9301 } 9302 } 9303 9304 /* 9305 * We are going to pass up to ARP a packet chain that looks 9306 * like: 9307 * 9308 * M_IOCTL-->ARP_op_MBLK-->ORIG_M_IOCTL-->MI_COPY_MBLK-->[X]ARPREQ_MBLK 9309 * 9310 * Get a copy of the original IOCTL mblk to head the chain, 9311 * to be sent up (in mp1). Also get another copy to store 9312 * in the ill_pending_mp list, for matching the response 9313 * when it comes back from ARP. 
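 *
 * Sketch of the roles assembled below (descriptive only): mp1 heads the
 * chain as the new M_IOCTL, mp2 carries the AR_ENTRY_* request built
 * from ip_area_template, mp is the original ioctl pulled in as data,
 * and pending_mp is parked on the ill via ill_pending_mp_add() to match
 * ARP's eventual M_IOCACK/M_IOCNAK response.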
9314 */ 9315 mp1 = copyb(mp); 9316 pending_mp = copymsg(mp); 9317 if (mp1 == NULL || pending_mp == NULL) { 9318 if (mp1 != NULL) 9319 freeb(mp1); 9320 if (pending_mp != NULL) 9321 inet_freemsg(pending_mp); 9322 return (ENOMEM); 9323 } 9324 9325 ipaddr = sin->sin_addr.s_addr; 9326 9327 mp2 = ill_arp_alloc(ill, (uchar_t *)&ip_area_template, 9328 (caddr_t)&ipaddr); 9329 if (mp2 == NULL) { 9330 freeb(mp1); 9331 inet_freemsg(pending_mp); 9332 return (ENOMEM); 9333 } 9334 /* Put together the chain. */ 9335 mp1->b_cont = mp2; 9336 mp1->b_datap->db_type = M_IOCTL; 9337 mp2->b_cont = mp; 9338 mp2->b_datap->db_type = M_DATA; 9339 9340 iocp = (struct iocblk *)mp1->b_rptr; 9341 9342 /* 9343 * An M_IOCDATA's payload (struct copyresp) is mostly the same as an 9344 * M_IOCTL's payload (struct iocblk), but 'struct copyresp' has a 9345 * cp_private field (or cp_rval on 32-bit systems) in place of the 9346 * ioc_count field; set ioc_count to be correct. 9347 */ 9348 iocp->ioc_count = MBLKL(mp1->b_cont); 9349 9350 /* 9351 * Set the proper command in the ARP message. 9352 * Convert the SIOC{G|S|D}ARP calls into our 9353 * AR_ENTRY_xxx calls. 9354 */ 9355 area = (area_t *)mp2->b_rptr; 9356 switch (iocp->ioc_cmd) { 9357 case SIOCDARP: 9358 case SIOCDXARP: 9359 /* 9360 * We defer deleting the corresponding IRE until 9361 * we return from arp. 9362 */ 9363 area->area_cmd = AR_ENTRY_DELETE; 9364 area->area_proto_mask_offset = 0; 9365 break; 9366 case SIOCGARP: 9367 case SIOCGXARP: 9368 area->area_cmd = AR_ENTRY_SQUERY; 9369 area->area_proto_mask_offset = 0; 9370 break; 9371 case SIOCSARP: 9372 case SIOCSXARP: { 9373 /* 9374 * Delete the corresponding ire to make sure IP will 9375 * pick up any change from arp. 9376 */ 9377 if (!if_arp_ioctl) { 9378 (void) ip_ire_clookup_and_delete(ipaddr, NULL); 9379 break; 9380 } else { 9381 ipif_t *ipif = ipif_get_next_ipif(NULL, ill); 9382 if (ipif != NULL) { 9383 (void) ip_ire_clookup_and_delete(ipaddr, ipif); 9384 ipif_refrele(ipif); 9385 } 9386 break; 9387 } 9388 } 9389 } 9390 iocp->ioc_cmd = area->area_cmd; 9391 9392 /* 9393 * Before sending 'mp' to ARP, we have to clear the b_next 9394 * and b_prev. Otherwise if STREAMS encounters such a message 9395 * in freemsg() (because ARP can close any time), it can cause 9396 * a panic. But the mi code needs the b_next and b_prev values of 9397 * mp->b_cont to complete the ioctl. So we store them here 9398 * in pending_mp->b_cont, and restore them in ip_sioctl_iocack() 9399 * when the response comes down from ARP. 9400 */ 9401 pending_mp->b_cont->b_next = mp->b_cont->b_next; 9402 pending_mp->b_cont->b_prev = mp->b_cont->b_prev; 9403 mp->b_cont->b_next = NULL; 9404 mp->b_cont->b_prev = NULL; 9405 9406 mutex_enter(&connp->conn_lock); 9407 mutex_enter(&ill->ill_lock); 9408 /* conn has not yet started closing, hence this can't fail */ 9409 success = ill_pending_mp_add(ill, connp, pending_mp); 9410 ASSERT(success); 9411 mutex_exit(&ill->ill_lock); 9412 mutex_exit(&connp->conn_lock); 9413 9414 /* 9415 * Fill in the rest of the ARP operation fields. 9416 */ 9417 area->area_hw_addr_length = alength; 9418 bcopy(lladdr, 9419 (char *)area + area->area_hw_addr_offset, 9420 area->area_hw_addr_length); 9421 /* Translate the flags. */ 9422 if (flags & ATF_PERM) 9423 area->area_flags |= ACE_F_PERMANENT; 9424 if (flags & ATF_PUBL) 9425 area->area_flags |= ACE_F_PUBLISH; 9426 if (flags & ATF_AUTHORITY) 9427 area->area_flags |= ACE_F_AUTHORITY; 9428 9429 /* 9430 * Up to ARP it goes.
The response will come
9431 * back in ip_wput as an M_IOCACK message, and
9432 * will be handed to ip_sioctl_iocack for
9433 * completion.
9434 */
9435 putnext(ill->ill_rq, mp1);
9436 return (EINPROGRESS);
9437 }
9438
9439 /* ARGSUSED */
9440 int
9441 ip_sioctl_xarp(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
9442 ip_ioctl_cmd_t *ipip, void *ifreq)
9443 {
9444 struct xarpreq *xar;
9445 boolean_t isv6;
9446 mblk_t *mp1;
9447 int err;
9448 conn_t *connp;
9449 int ifnamelen;
9450 ire_t *ire = NULL;
9451 ill_t *ill = NULL;
9452 struct sockaddr_in *sin;
9453 boolean_t if_arp_ioctl = B_FALSE;
9454
9455 /* ioctl comes down on a conn */
9456 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
9457 connp = Q_TO_CONN(q);
9458 isv6 = connp->conn_af_isv6;
9459
9460 /* Existence verified in ip_wput_nondata */
9461 mp1 = mp->b_cont->b_cont;
9462
9463 ASSERT(MBLKL(mp1) >= sizeof (*xar));
9464 xar = (struct xarpreq *)mp1->b_rptr;
9465 sin = (sin_t *)&xar->xarp_pa;
9466
9467 if (isv6 || (xar->xarp_ha.sdl_family != AF_LINK) ||
9468 (xar->xarp_pa.ss_family != AF_INET))
9469 return (ENXIO);
9470
9471 ifnamelen = xar->xarp_ha.sdl_nlen;
9472 if (ifnamelen != 0) {
9473 char *cptr, cval;
9474
9475 if (ifnamelen >= LIFNAMSIZ)
9476 return (EINVAL);
9477
9478 /*
9479 * Instead of bcopying a bunch of bytes,
9480 * null-terminate the string in-situ.
9481 */
9482 cptr = xar->xarp_ha.sdl_data + ifnamelen;
9483 cval = *cptr;
9484 *cptr = '\0';
9485 ill = ill_lookup_on_name(xar->xarp_ha.sdl_data,
9486 B_FALSE, isv6, CONNP_TO_WQ(connp), mp, ip_process_ioctl,
9487 &err, NULL);
9488 *cptr = cval;
9489 if (ill == NULL)
9490 return (err);
9491 if (ill->ill_net_type != IRE_IF_RESOLVER) {
9492 ill_refrele(ill);
9493 return (ENXIO);
9494 }
9495
9496 if_arp_ioctl = B_TRUE;
9497 } else {
9498 /*
9499 * PSARC 2003/088 states that if sdl_nlen == 0, it behaves
9500 * as an extended BSD ioctl. The kernel uses the IP address
9501 * to figure out the network interface.
9502 */
9503 ire = ire_cache_lookup(sin->sin_addr.s_addr, ALL_ZONES, NULL);
9504 if ((ire == NULL) || (ire->ire_type == IRE_LOOPBACK) ||
9505 ((ill = ire_to_ill(ire)) == NULL) ||
9506 (ill->ill_net_type != IRE_IF_RESOLVER)) {
9507 if (ire != NULL)
9508 ire_refrele(ire);
9509 ire = ire_ftable_lookup(sin->sin_addr.s_addr,
9510 0, 0, IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0,
9511 NULL, MATCH_IRE_TYPE);
9512 if ((ire == NULL) ||
9513 ((ill = ire_to_ill(ire)) == NULL)) {
9514 if (ire != NULL)
9515 ire_refrele(ire);
9516 return (ENXIO);
9517 }
9518 }
9519 ASSERT(ire != NULL && ill != NULL);
9520 }
9521
9522 err = ip_sioctl_arp_common(ill, q, mp, sin, B_TRUE, if_arp_ioctl);
9523 if (if_arp_ioctl)
9524 ill_refrele(ill);
9525 if (ire != NULL)
9526 ire_refrele(ire);
9527
9528 return (err);
9529 }
9530
9531 /*
9532 * ARP IOCTLs.
9533 * How does IP get in the business of fronting ARP configuration/queries?
9534 * Well, it's like this: the Berkeley ARP IOCTLs (SIOCGARP, SIOCDARP, SIOCSARP)
9535 * are by tradition passed in through a datagram socket. That lands in IP.
9536 * As it happens, this is just as well since the interface is quite crude in
9537 * that it passes in no information about protocol or hardware types, or
9538 * interface association. After making the protocol assumption, IP is in
9539 * the position to look up the name of the ILL, which ARP will need, and
9540 * format a request that can be handled by ARP. The request is passed up
9541 * stream to ARP, and the original IOCTL is completed by IP when ARP passes
9542 * back a response.
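 * As an illustrative sketch (userland code, not part of this file,
 * with a purely example address), a traditional consumer of SIOCGARP
 * looks like:
 *
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *	struct arpreq ar;
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&ar.arp_pa;
 *
 *	bzero(&ar, sizeof (ar));
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("192.168.1.1");
 *	if (ioctl(s, SIOCGARP, (caddr_t)&ar) == 0)
 *		ar.arp_ha.sa_data now holds the 6-byte hardware address.
 *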
ARP supports its own set of more general IOCTLs, in
9543 * case anyone is interested.
9544 */
9545 /* ARGSUSED */
9546 int
9547 ip_sioctl_arp(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
9548 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
9549 {
9550 struct arpreq *ar;
9551 struct sockaddr_in *sin;
9552 ire_t *ire;
9553 boolean_t isv6;
9554 mblk_t *mp1;
9555 int err;
9556 conn_t *connp;
9557 ill_t *ill;
9558
9559 /* ioctl comes down on a conn */
9560 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
9561 connp = Q_TO_CONN(q);
9562 isv6 = connp->conn_af_isv6;
9563 if (isv6)
9564 return (ENXIO);
9565
9566 /* Existence verified in ip_wput_nondata */
9567 mp1 = mp->b_cont->b_cont;
9568
9569 ar = (struct arpreq *)mp1->b_rptr;
9570 sin = (sin_t *)&ar->arp_pa;
9571
9572 /*
9573 * We need to let ARP know on which interface the IP
9574 * address has an ARP mapping. In the IPMP case, a
9575 * simple forwarding table lookup will return the
9576 * IRE_IF_RESOLVER for the first interface in the group,
9577 * which might not be the interface on which the
9578 * requested IP address was resolved due to the ill
9579 * selection algorithm (see ip_newroute_get_dst_ill()).
9580 * So we do a cache table lookup first: if the IRE cache
9581 * entry for the IP address is still there, it will
9582 * contain the ill pointer for the right interface, so
9583 * we use that. If the cache entry has been flushed, we
9584 * fall back to the forwarding table lookup. This should
9585 * be rare enough since IRE cache entries have a longer
9586 * life expectancy than ARP cache entries.
9587 */
9588 ire = ire_cache_lookup(sin->sin_addr.s_addr, ALL_ZONES, NULL);
9589 if ((ire == NULL) || (ire->ire_type == IRE_LOOPBACK) ||
9590 ((ill = ire_to_ill(ire)) == NULL)) {
9591 if (ire != NULL)
9592 ire_refrele(ire);
9593 ire = ire_ftable_lookup(sin->sin_addr.s_addr,
9594 0, 0, IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0,
9595 NULL, MATCH_IRE_TYPE);
9596 if ((ire == NULL) || ((ill = ire_to_ill(ire)) == NULL)) {
9597 if (ire != NULL)
9598 ire_refrele(ire);
9599 return (ENXIO);
9600 }
9601 }
9602 ASSERT(ire != NULL && ill != NULL);
9603
9604 err = ip_sioctl_arp_common(ill, q, mp, sin, B_FALSE, B_FALSE);
9605 ire_refrele(ire);
9606 return (err);
9607 }
9608
9609 /*
9610 * Do I_PLINK/I_LINK or I_PUNLINK/I_UNLINK with consistency checks and also
9611 * atomically set/clear the muxids. Also complete the ioctl by acking or
9612 * naking it. Note that the code is structured such that the link type,
9613 * whether it's persistent or not, is treated equally. ifconfig(1M) and
9614 * its clones use the persistent link, while pppd(1M) and perhaps many
9615 * other daemons may use a non-persistent link. When combined with some
9616 * ill_t states, linking and unlinking lower streams may be used as
9617 * indicators of dynamic re-plumbing events [see PSARC/1999/348].
9618 */
9619 /* ARGSUSED */
9620 void
9621 ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
9622 {
9623 mblk_t *mp1;
9624 mblk_t *mp2;
9625 struct linkblk *li;
9626 queue_t *ipwq;
9627 char *name;
9628 struct qinit *qinfo;
9629 struct ipmx_s *ipmxp;
9630 ill_t *ill = NULL;
9631 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
9632 int err = 0;
9633 boolean_t entered_ipsq = B_FALSE;
9634 boolean_t islink;
9635 queue_t *dwq = NULL;
9636
9637 ASSERT(iocp->ioc_cmd == I_PLINK || iocp->ioc_cmd == I_PUNLINK ||
9638 iocp->ioc_cmd == I_LINK || iocp->ioc_cmd == I_UNLINK);
9639
9640 islink = (iocp->ioc_cmd == I_PLINK || iocp->ioc_cmd == I_LINK) ?
9641 B_TRUE : B_FALSE;
9642
9643 mp1 = mp->b_cont; /* This is the linkblk info */
9644 li = (struct linkblk *)mp1->b_rptr;
9645
9646 /*
9647 * ARP has added this special mblk, and the utility is asking us
9648 * to perform consistency checks, and also atomically set the
9649 * muxid. Ifconfig is an example. It achieves this by using
9650 * /dev/arp as the mux to plink the arp stream, and by pushing arp
9651 * onto the /dev/udp[6] stream for use as the mux when plinking the
9652 * IP stream. SIOCSLIFMUXID is not required. See ifconfig.c, arp.c
9653 * and other comments in this routine for more details.
9654 */
9655 mp2 = mp1->b_cont; /* This is added by ARP */
9656
9657 /*
9658 * If I_{P}LINK/I_{P}UNLINK is issued by a utility other than
9659 * ifconfig which didn't push ARP on top of the dummy mux, we won't
9660 * get the special mblk above. For backward compatibility, we just
9661 * return success. The utility will use SIOCSLIFMUXID to store
9662 * the muxids. This is not atomic, and can leave the streams
9663 * unplumbable if the utility is interrupted before it does the
9664 * SIOCSLIFMUXID.
9665 */
9666 if (mp2 == NULL) {
9667 /*
9668 * At this point we don't know whether or not this is the
9669 * IP module stream or the ARP device stream. We need to
9670 * walk the lower stream in order to find this out, since
9671 * the capability negotiation is done only on the IP module
9672 * stream. An IP module instance is identified by the module
9673 * name IP, non-null q_next, and its wput not being ip_lwput.
9674 * STREAMS ensures that the lower stream (l_qbot) will not
9675 * vanish until this ioctl completes. So we can safely walk
9676 * the stream or refer to the q_ptr.
9677 */
9678 ipwq = li->l_qbot;
9679 while (ipwq != NULL) {
9680 qinfo = ipwq->q_qinfo;
9681 name = qinfo->qi_minfo->mi_idname;
9682 if (name != NULL && name[0] != '\0' &&
9683 (strcmp(name, ip_mod_info.mi_idname) == 0) &&
9684 ((void *)(qinfo->qi_putp) != (void *)ip_lwput) &&
9685 (ipwq->q_next != NULL)) {
9686 break;
9687 }
9688 ipwq = ipwq->q_next;
9689 }
9690 /*
9691 * This looks like an IP module stream, so trigger
9692 * the capability reset or re-negotiation if necessary.
9693 */
9694 if (ipwq != NULL) {
9695 ill = ipwq->q_ptr;
9696 ASSERT(ill != NULL);
9697
9698 if (ipsq == NULL) {
9699 ipsq = ipsq_try_enter(NULL, ill, q, mp,
9700 ip_sioctl_plink, NEW_OP, B_TRUE);
9701 if (ipsq == NULL)
9702 return;
9703 entered_ipsq = B_TRUE;
9704 }
9705 ASSERT(IAM_WRITER_ILL(ill));
9706 /*
9707 * Store the upper read queue of the module
9708 * immediately below IP, and count the total
9709 * number of lower modules. Do this only
9710 * for the I_PLINK or I_LINK event.
9711 */
9712 ill->ill_lmod_rq = NULL;
9713 ill->ill_lmod_cnt = 0;
9714 if (islink && (dwq = ipwq->q_next) != NULL) {
9715 ill->ill_lmod_rq = RD(dwq);
9716
9717 while (dwq != NULL) {
9718 ill->ill_lmod_cnt++;
9719 dwq = dwq->q_next;
9720 }
9721 }
9722 /*
9723 * There's no point in resetting or re-negotiating if
9724 * we are not bound to the driver, so only do this if
9725 * the DLPI state is idle (up); we assume such state
9726 * since ill_ipif_up_count gets incremented in
9727 * ipif_up_done(), which is after we are bound to the
9728 * driver. Note that in the case of logical
9729 * interfaces, IP won't rebind to the driver unless
9730 * the ill_ipif_up_count is 0, meaning that all other
9731 * IP interfaces (including the main ipif) are in the
9732 * down state.
Because of this, we use such a counter
9733 * as an indicator, instead of relying on the IPIF_UP
9734 * flag, which is per ipif instance.
9735 */
9736 if (ill->ill_ipif_up_count > 0) {
9737 if (islink)
9738 ill_capability_probe(ill);
9739 else
9740 ill_capability_reset(ill);
9741 }
9742 }
9743 goto done;
9744 }
9745
9746 /*
9747 * This is an I_{P}LINK sent down by ifconfig on
9748 * /dev/arp. ARP has appended this last (3rd) mblk,
9749 * giving more info. STREAMS ensures that the lower
9750 * stream (l_qbot) will not vanish until this ioctl
9751 * completes. So we can safely walk the stream or refer
9752 * to the q_ptr.
9753 */
9754 ipmxp = (struct ipmx_s *)mp2->b_rptr;
9755 if (ipmxp->ipmx_arpdev_stream) {
9756 /*
9757 * The operation is occurring on the arp-device
9758 * stream.
9759 */
9760 ill = ill_lookup_on_name(ipmxp->ipmx_name, B_FALSE, B_FALSE,
9761 q, mp, ip_sioctl_plink, &err, NULL);
9762 if (ill == NULL) {
9763 if (err == EINPROGRESS) {
9764 return;
9765 } else {
9766 err = EINVAL;
9767 goto done;
9768 }
9769 }
9770
9771 if (ipsq == NULL) {
9772 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink,
9773 NEW_OP, B_TRUE);
9774 if (ipsq == NULL) {
9775 ill_refrele(ill);
9776 return;
9777 }
9778 entered_ipsq = B_TRUE;
9779 }
9780 ASSERT(IAM_WRITER_ILL(ill));
9781 ill_refrele(ill);
9782 /*
9783 * To ensure consistency between IP and ARP,
9784 * the following LIFO scheme is used in
9785 * plink/punlink. (IP first, ARP last).
9786 * This is because the muxids are stored
9787 * in the IP stream on the ill.
9788 *
9789 * I_{P}LINK: ifconfig plinks the IP stream before
9790 * plinking the ARP stream. On an arp-dev
9791 * stream, IP checks that it is not yet
9792 * plinked, and it also checks that the
9793 * corresponding IP stream is already plinked.
9794 *
9795 * I_{P}UNLINK: ifconfig punlinks the ARP stream
9796 * before punlinking the IP stream. IP does
9797 * not allow punlink of the IP stream unless
9798 * the arp stream has been punlinked.
9799 *
9800 */
9801 if ((islink &&
9802 (ill->ill_arp_muxid != 0 || ill->ill_ip_muxid == 0)) ||
9803 (!islink &&
9804 ill->ill_arp_muxid != li->l_index)) {
9805 err = EINVAL;
9806 goto done;
9807 }
9808 if (islink) {
9809 ill->ill_arp_muxid = li->l_index;
9810 } else {
9811 ill->ill_arp_muxid = 0;
9812 }
9813 } else {
9814 /*
9815 * This must be the IP module stream with or
9816 * without arp. Walk the stream and locate the
9817 * IP module. An IP module instance is
9818 * identified by the module name IP, non-null
9819 * q_next, and its wput not being ip_lwput.
9820 */
9821 ipwq = li->l_qbot;
9822 while (ipwq != NULL) {
9823 qinfo = ipwq->q_qinfo;
9824 name = qinfo->qi_minfo->mi_idname;
9825 if (name != NULL && name[0] != '\0' &&
9826 (strcmp(name, ip_mod_info.mi_idname) == 0) &&
9827 ((void *)(qinfo->qi_putp) != (void *)ip_lwput) &&
9828 (ipwq->q_next != NULL)) {
9829 break;
9830 }
9831 ipwq = ipwq->q_next;
9832 }
9833 if (ipwq != NULL) {
9834 ill = ipwq->q_ptr;
9835 ASSERT(ill != NULL);
9836
9837 if (ipsq == NULL) {
9838 ipsq = ipsq_try_enter(NULL, ill, q, mp,
9839 ip_sioctl_plink, NEW_OP, B_TRUE);
9840 if (ipsq == NULL)
9841 return;
9842 entered_ipsq = B_TRUE;
9843 }
9844 ASSERT(IAM_WRITER_ILL(ill));
9845 /*
9846 * Return error if the ill_ip_muxid is
9847 * non-zero and command is I_{P}LINK.
9848 * If command is I_{P}UNLINK, return
9849 * error if the arp-dev stream is not
9850 * yet punlinked.
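 *
 * In table form, for the IP module stream below:
 *
 *	I_{P}LINK   with ill_ip_muxid != 0	error (already linked)
 *	I_{P}UNLINK with ill_arp_muxid != 0	error (arp-dev stream
 *						still linked)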
9851 */
9852 if ((islink && ill->ill_ip_muxid != 0) ||
9853 (!islink && ill->ill_arp_muxid != 0)) {
9854 err = EINVAL;
9855 goto done;
9856 }
9857 ill->ill_lmod_rq = NULL;
9858 ill->ill_lmod_cnt = 0;
9859 if (islink) {
9860 /*
9861 * Store the upper read queue of the module
9862 * immediately below IP, and count the total
9863 * number of lower modules.
9864 */
9865 if ((dwq = ipwq->q_next) != NULL) {
9866 ill->ill_lmod_rq = RD(dwq);
9867
9868 while (dwq != NULL) {
9869 ill->ill_lmod_cnt++;
9870 dwq = dwq->q_next;
9871 }
9872 }
9873 ill->ill_ip_muxid = li->l_index;
9874 } else {
9875 ill->ill_ip_muxid = 0;
9876 }
9877
9878 /*
9879 * See comments above about resetting/re-
9880 * negotiating driver sub-capabilities.
9881 */
9882 if (ill->ill_ipif_up_count > 0) {
9883 if (islink)
9884 ill_capability_probe(ill);
9885 else
9886 ill_capability_reset(ill);
9887 }
9888 }
9889 }
9890 done:
9891 iocp->ioc_count = 0;
9892 iocp->ioc_error = err;
9893 if (err == 0)
9894 mp->b_datap->db_type = M_IOCACK;
9895 else
9896 mp->b_datap->db_type = M_IOCNAK;
9897 qreply(q, mp);
9898
9899 /* Conn was refheld in ip_sioctl_copyin_setup */
9900 if (CONN_Q(q))
9901 CONN_OPER_PENDING_DONE(Q_TO_CONN(q));
9902 if (entered_ipsq)
9903 ipsq_exit(ipsq, B_TRUE, B_TRUE);
9904 }
9905
9906 /*
9907 * Search for the ioctl command in the ioctl tables and return a pointer
9908 * to the ioctl command information. The ioctl command tables are
9909 * static and fully populated at compile time.
9910 */
9911 ip_ioctl_cmd_t *
9912 ip_sioctl_lookup(int ioc_cmd)
9913 {
9914 int index;
9915 ip_ioctl_cmd_t *ipip;
9916 ip_ioctl_cmd_t *ipip_end;
9917
9918 if (ioc_cmd == IPI_DONTCARE)
9919 return (NULL);
9920
9921 /*
9922 * Do a two-step search. First search the indexed table
9923 * based on the least significant byte of the ioctl cmd.
9924 * If we don't find a match, then search the misc table
9925 * serially.
9926 */
9927 index = ioc_cmd & 0xFF;
9928 if (index < ip_ndx_ioctl_count) {
9929 ipip = &ip_ndx_ioctl_table[index];
9930 if (ipip->ipi_cmd == ioc_cmd) {
9931 /* Found a match in the ndx table */
9932 return (ipip);
9933 }
9934 }
9935
9936 /* Search the misc table */
9937 ipip_end = &ip_misc_ioctl_table[ip_misc_ioctl_count];
9938 for (ipip = ip_misc_ioctl_table; ipip < ipip_end; ipip++) {
9939 if (ipip->ipi_cmd == ioc_cmd)
9940 /* Found a match in the misc table */
9941 return (ipip);
9942 }
9943
9944 return (NULL);
9945 }
9946
9947 /*
9948 * Wrapper function for resuming deferred ioctl processing
9949 * Used for SIOCGDSTINFO, SIOCGIP6ADDRPOLICY, SIOCGMSFILTER,
9950 * SIOCSMSFILTER, SIOCGIPMSFILTER, and SIOCSIPMSFILTER currently.
9951 */
9952 /* ARGSUSED */
9953 void
9954 ip_sioctl_copyin_resume(ipsq_t *dummy_ipsq, queue_t *q, mblk_t *mp,
9955 void *dummy_arg)
9956 {
9957 ip_sioctl_copyin_setup(q, mp);
9958 }
9959
9960 /*
9961 * ip_sioctl_copyin_setup is called by ip_wput with any M_IOCTL message
9962 * that arrives. Most of the IOCTLs are "socket" IOCTLs which we handle
9963 * in either I_STR or TRANSPARENT form, using the mi_copy facility.
9964 * We establish here the size of the block to be copied in. mi_copyin
9965 * arranges for this to happen, and processing continues in ip_wput with
9966 * an M_IOCDATA message.
9967 */
9968 void
9969 ip_sioctl_copyin_setup(queue_t *q, mblk_t *mp)
9970 {
9971 int copyin_size;
9972 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
9973 ip_ioctl_cmd_t *ipip;
9974 cred_t *cr;
9975
9976 ipip = ip_sioctl_lookup(iocp->ioc_cmd);
9977 if (ipip == NULL) {
9978 /*
9979 * The ioctl is not one we understand or own.
9980 * Pass it along to be processed downstream,
9981 * if this is a module instance of IP, else nak
9982 * the ioctl.
9983 */
9984 if (q->q_next == NULL) {
9985 goto nak;
9986 } else {
9987 putnext(q, mp);
9988 return;
9989 }
9990 }
9991
9992 /*
9993 * If this is deferred, then we will do all the checks when we
9994 * come back.
9995 */
9996 if ((iocp->ioc_cmd == SIOCGDSTINFO ||
9997 iocp->ioc_cmd == SIOCGIP6ADDRPOLICY) && !ip6_asp_can_lookup()) {
9998 ip6_asp_pending_op(q, mp, ip_sioctl_copyin_resume);
9999 return;
10000 }
10001
10002 /*
10003 * Only allow a very small subset of IP ioctls on this stream if
10004 * IP is a module and not a driver. Allowing ioctls to be processed
10005 * in this case may cause assert failures or data corruption.
10006 * Typically G[L]IFFLAGS, SLIFNAME/IF_UNITSEL are the only few
10007 * ioctls allowed on an IP module stream, after which this stream
10008 * normally becomes a multiplexor (at which time the stream head
10009 * will fail all ioctls).
10010 */
10011 if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) {
10012 if (ipip->ipi_flags & IPI_PASS_DOWN) {
10013 /*
10014 * Pass common Streams ioctls which the IP
10015 * module does not own or consume along to
10016 * be processed downstream.
10017 */
10018 putnext(q, mp);
10019 return;
10020 } else {
10021 goto nak;
10022 }
10023 }
10024
10025 /* Make sure we have ioctl data to process. */
10026 if (mp->b_cont == NULL && !(ipip->ipi_flags & IPI_NULL_BCONT))
10027 goto nak;
10028
10029 /*
10030 * Prefer dblk credential over ioctl credential; some synthesized
10031 * ioctls have kcred set because there's no way to crhold()
10032 * a credential in some contexts. (ioc_cr is not crfree()'d by
10033 * the framework; the caller of the ioctl needs to hold the reference
10034 * for the duration of the call.)
10035 */
10036 cr = DB_CREDDEF(mp, iocp->ioc_cr);
10037
10038 /* Make sure normal users don't send down privileged ioctls */
10039 if ((ipip->ipi_flags & IPI_PRIV) &&
10040 (cr != NULL) && secpolicy_net_config(cr, B_TRUE) != 0) {
10041 /* We checked the privilege earlier but log it here */
10042 miocnak(q, mp, 0, secpolicy_net_config(cr, B_FALSE));
10043 return;
10044 }
10045
10046 /*
10047 * The ioctl command tables can only encode fixed length
10048 * ioctl data. If the length is variable, the table will
10049 * encode the length as zero. Such special cases are handled
10050 * below in the switch.
10051 */
10052 if (ipip->ipi_copyin_size != 0) {
10053 mi_copyin(q, mp, NULL, ipip->ipi_copyin_size);
10054 return;
10055 }
10056
10057 switch (iocp->ioc_cmd) {
10058 case O_SIOCGIFCONF:
10059 case SIOCGIFCONF:
10060 /*
10061 * This IOCTL is hilarious. See comments in
10062 * ip_sioctl_get_ifconf for the story.
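 * The short version as it applies here: a TRANSPARENT ioc_count
 * means the ioctl came down TRANSPARENT, so we copy in a struct
 * ifconf sized for the caller's data model; otherwise ioc_count
 * is the byte count the I_STR caller supplied.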
10063 */
10064 if (iocp->ioc_count == TRANSPARENT)
10065 copyin_size = SIZEOF_STRUCT(ifconf,
10066 iocp->ioc_flag);
10067 else
10068 copyin_size = iocp->ioc_count;
10069 mi_copyin(q, mp, NULL, copyin_size);
10070 return;
10071
10072 case O_SIOCGLIFCONF:
10073 case SIOCGLIFCONF:
10074 copyin_size = SIZEOF_STRUCT(lifconf, iocp->ioc_flag);
10075 mi_copyin(q, mp, NULL, copyin_size);
10076 return;
10077
10078 case SIOCGLIFSRCOF:
10079 copyin_size = SIZEOF_STRUCT(lifsrcof, iocp->ioc_flag);
10080 mi_copyin(q, mp, NULL, copyin_size);
10081 return;
10082 case SIOCGIP6ADDRPOLICY:
10083 ip_sioctl_ip6addrpolicy(q, mp);
10084 ip6_asp_table_refrele();
10085 return;
10086
10087 case SIOCSIP6ADDRPOLICY:
10088 ip_sioctl_ip6addrpolicy(q, mp);
10089 return;
10090
10091 case SIOCGDSTINFO:
10092 ip_sioctl_dstinfo(q, mp);
10093 ip6_asp_table_refrele();
10094 return;
10095
10096 case I_PLINK:
10097 case I_PUNLINK:
10098 case I_LINK:
10099 case I_UNLINK:
10100 /*
10101 * We treat the non-persistent link similarly to the persistent
10102 * link case, in terms of plumbing/unplumbing, as well as
10103 * dynamic re-plumbing events indicator. See comments
10104 * in ip_sioctl_plink() for more.
10105 *
10106 * Request can be enqueued in the 'ipsq' while waiting
10107 * to become exclusive. So bump up the conn ref.
10108 */
10109 if (CONN_Q(q))
10110 CONN_INC_REF(Q_TO_CONN(q));
10111 ip_sioctl_plink(NULL, q, mp, NULL);
10112 return;
10113
10114 case ND_GET:
10115 case ND_SET:
10116 /*
10117 * Use of the nd table requires holding the reader lock.
10118 * Modifying the nd table through nd_load/nd_unload requires
10119 * the writer lock.
10120 */
10121 rw_enter(&ip_g_nd_lock, RW_READER);
10122 if (nd_getset(q, ip_g_nd, mp)) {
10123 rw_exit(&ip_g_nd_lock);
10124
10125 if (iocp->ioc_error)
10126 iocp->ioc_count = 0;
10127 mp->b_datap->db_type = M_IOCACK;
10128 qreply(q, mp);
10129 return;
10130 }
10131 rw_exit(&ip_g_nd_lock);
10132 /*
10133 * We don't understand this subioctl of ND_GET / ND_SET.
10134 * Maybe intended for some driver / module below us.
10135 */
10136 if (q->q_next) {
10137 putnext(q, mp);
10138 } else {
10139 iocp->ioc_error = ENOENT;
10140 mp->b_datap->db_type = M_IOCNAK;
10141 iocp->ioc_count = 0;
10142 qreply(q, mp);
10143 }
10144 return;
10145
10146 case IP_IOCTL:
10147 ip_wput_ioctl(q, mp);
10148 return;
10149 default:
10150 cmn_err(CE_PANIC, "should not happen ");
10151 }
10152 nak:
10153 if (mp->b_cont != NULL) {
10154 freemsg(mp->b_cont);
10155 mp->b_cont = NULL;
10156 }
10157 iocp->ioc_error = EINVAL;
10158 mp->b_datap->db_type = M_IOCNAK;
10159 iocp->ioc_count = 0;
10160 qreply(q, mp);
10161 }
10162
10163 /* ip_wput hands off ARP IOCTL responses to us */
10164 void
10165 ip_sioctl_iocack(queue_t *q, mblk_t *mp)
10166 {
10167 struct arpreq *ar;
10168 struct xarpreq *xar;
10169 area_t *area;
10170 mblk_t *area_mp;
10171 struct iocblk *iocp;
10172 mblk_t *orig_ioc_mp, *tmp;
10173 struct iocblk *orig_iocp;
10174 ill_t *ill;
10175 conn_t *connp = NULL;
10176 uint_t ioc_id;
10177 mblk_t *pending_mp;
10178 int x_arp_ioctl = B_FALSE, ifx_arp_ioctl = B_FALSE;
10179 int *flagsp;
10180 char *storage = NULL;
10181 sin_t *sin;
10182 ipaddr_t addr;
10183 int err;
10184
10185 ill = q->q_ptr;
10186 ASSERT(ill != NULL);
10187
10188 /*
10189 * We should get back from ARP a packet chain that looks like:
10190 * M_IOCACK-->ARP_op_MBLK-->ORIG_M_IOCTL-->MI_COPY_MBLK-->[X]ARPREQ_MBLK
10191 */
10192 if (!(area_mp = mp->b_cont) ||
10193 (area_mp->b_wptr - area_mp->b_rptr) < sizeof (ip_sock_ar_t) ||
10194 !(orig_ioc_mp = area_mp->b_cont) ||
10195 !orig_ioc_mp->b_cont || !orig_ioc_mp->b_cont->b_cont) {
10196 freemsg(mp);
10197 return;
10198 }
10199
10200 orig_iocp = (struct iocblk *)orig_ioc_mp->b_rptr;
10201
10202 tmp = (orig_ioc_mp->b_cont)->b_cont;
10203 if ((orig_iocp->ioc_cmd == SIOCGXARP) ||
10204 (orig_iocp->ioc_cmd == SIOCSXARP) ||
10205 (orig_iocp->ioc_cmd == SIOCDXARP)) {
10206 x_arp_ioctl = B_TRUE;
10207 xar = (struct xarpreq *)tmp->b_rptr;
10208 sin = (sin_t *)&xar->xarp_pa;
10209 flagsp = &xar->xarp_flags;
10210 storage = xar->xarp_ha.sdl_data;
10211 if (xar->xarp_ha.sdl_nlen != 0)
10212 ifx_arp_ioctl = B_TRUE;
10213 } else {
10214 ar = (struct arpreq *)tmp->b_rptr;
10215 sin = (sin_t *)&ar->arp_pa;
10216 flagsp = &ar->arp_flags;
10217 storage = ar->arp_ha.sa_data;
10218 }
10219
10220 iocp = (struct iocblk *)mp->b_rptr;
10221
10222 /*
10223 * Pick out the originating queue based on the ioc_id.
10224 */
10225 ioc_id = iocp->ioc_id;
10226 pending_mp = ill_pending_mp_get(ill, &connp, ioc_id);
10227 if (pending_mp == NULL) {
10228 ASSERT(connp == NULL);
10229 inet_freemsg(mp);
10230 return;
10231 }
10232 ASSERT(connp != NULL);
10233 q = CONNP_TO_WQ(connp);
10234
10235 /* Uncouple the internally generated IOCTL from the original one */
10236 area = (area_t *)area_mp->b_rptr;
10237 area_mp->b_cont = NULL;
10238
10239 /*
10240 * Restore the b_next and b_prev used by mi code. This is needed
10241 * to complete the ioctl using mi* functions. We stored them in
10242 * the pending mp prior to sending the request to ARP.
10243 */
10244 orig_ioc_mp->b_cont->b_next = pending_mp->b_cont->b_next;
10245 orig_ioc_mp->b_cont->b_prev = pending_mp->b_cont->b_prev;
10246 inet_freemsg(pending_mp);
10247
10248 /*
10249 * We're done if there was an error or if this is not an SIOCG{X}ARP.
10250 * Catch the case where there is an IRE_CACHE but no entry in the
10251 * arp table.
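 * (ARP will have failed the AR_ENTRY_SQUERY in that case, but the
 * link-layer address may still be cached in an IRE; the lookup
 * below tries to answer the SIOCG{X}ARP from there.)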
10252 */
10253 addr = sin->sin_addr.s_addr;
10254 if (iocp->ioc_error && iocp->ioc_cmd == AR_ENTRY_SQUERY) {
10255 ire_t *ire;
10256 dl_unitdata_req_t *dlup;
10257 mblk_t *llmp;
10258 int addr_len;
10259 ill_t *ipsqill = NULL;
10260
10261 if (ifx_arp_ioctl) {
10262 /*
10263 * There's no need to lookup the ill, since
10264 * we've already done that when we started
10265 * processing the ioctl and sent the message
10266 * to ARP on that ill. So use the ill that
10267 * is stored in q->q_ptr.
10268 */
10269 ipsqill = ill;
10270 ire = ire_ctable_lookup(addr, 0, IRE_CACHE,
10271 ipsqill->ill_ipif, ALL_ZONES,
10272 NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL);
10273 } else {
10274 ire = ire_ctable_lookup(addr, 0, IRE_CACHE,
10275 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE);
10276 if (ire != NULL)
10277 ipsqill = ire_to_ill(ire);
10278 }
10279
10280 if ((x_arp_ioctl) && (ipsqill != NULL))
10281 storage += ill_xarp_info(&xar->xarp_ha, ipsqill);
10282
10283 if (ire != NULL) {
10284 /*
10285 * Since the ire obtained from cachetable is used for
10286 * mac addr copying below, treat an incomplete ire as
10287 * if we never found it.
10288 */
10289 if (ire->ire_nce != NULL &&
10290 ire->ire_nce->nce_state != ND_REACHABLE) {
10291 ire_refrele(ire);
10292 ire = NULL;
10293 ipsqill = NULL;
10294 goto errack;
10295 }
10296 *flagsp = ATF_INUSE;
10297 llmp = (ire->ire_nce != NULL ?
10298 ire->ire_nce->nce_res_mp : NULL);
10299 if (llmp != NULL && ipsqill != NULL) {
10300 uchar_t *macaddr;
10301
10302 addr_len = ipsqill->ill_phys_addr_length;
10303 if (x_arp_ioctl && ((addr_len +
10304 ipsqill->ill_name_length) >
10305 sizeof (xar->xarp_ha.sdl_data))) {
10306 ire_refrele(ire);
10307 freemsg(mp);
10308 ip_ioctl_finish(q, orig_ioc_mp,
10309 EINVAL, NO_COPYOUT, NULL, NULL);
10310 return;
10311 }
10312 *flagsp |= ATF_COM;
10313 dlup = (dl_unitdata_req_t *)llmp->b_rptr;
10314 if (ipsqill->ill_sap_length < 0)
10315 macaddr = llmp->b_rptr +
10316 dlup->dl_dest_addr_offset;
10317 else
10318 macaddr = llmp->b_rptr +
10319 dlup->dl_dest_addr_offset +
10320 ipsqill->ill_sap_length;
10321 /*
10322 * For SIOCGARP, MAC address length
10323 * validation has already been done
10324 * before the ioctl was issued to ARP to
10325 * allow it to progress only on 6 byte
10326 * addressable (ethernet like) media. Thus
10327 * the mac address copying can not overwrite
10328 * the sa_data area below.
10329 */
10330 bcopy(macaddr, storage, addr_len);
10331 }
10332 /* Ditch the internal IOCTL. */
10333 freemsg(mp);
10334 ire_refrele(ire);
10335 ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, NULL, NULL);
10336 return;
10337 }
10338 }
10339
10340 /*
10341 * Delete the corresponding IRE_CACHE if any.
10342 * Reset the error if there was one (in case there was no entry
10343 * in arp).
10344 */
10345 if (iocp->ioc_cmd == AR_ENTRY_DELETE) {
10346 ipif_t *ipintf = NULL;
10347
10348 if (ifx_arp_ioctl) {
10349 /*
10350 * There's no need to lookup the ill, since
10351 * we've already done that when we started
10352 * processing the ioctl and sent the message
10353 * to ARP on that ill. So use the ill that
10354 * is stored in q->q_ptr.
10355 */
10356 ipintf = ill->ill_ipif;
10357 }
10358 if (ip_ire_clookup_and_delete(addr, ipintf)) {
10359 /*
10360 * The address in "addr" may be an entry for a
10361 * router. If that's true, then any off-net
10362 * IRE_CACHE entries that go through the router
10363 * with address "addr" must be clobbered. Use
10364 * ire_walk to achieve this goal.
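 *
 * For example (the address is purely illustrative): deleting the
 * ARP entry for a gateway at 10.0.0.1 also deletes, via
 * ire_delete_cache_gw(), every off-net IRE_CACHE whose gateway
 * address is 10.0.0.1.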
10365 */
10366 if (ifx_arp_ioctl)
10367 ire_walk_ill_v4(MATCH_IRE_ILL, 0,
10368 ire_delete_cache_gw, (char *)&addr, ill);
10369 else
10370 ire_walk_v4(ire_delete_cache_gw, (char *)&addr,
10371 ALL_ZONES);
10372 iocp->ioc_error = 0;
10373 }
10374 }
10375 errack:
10376 if (iocp->ioc_error || iocp->ioc_cmd != AR_ENTRY_SQUERY) {
10377 err = iocp->ioc_error;
10378 freemsg(mp);
10379 ip_ioctl_finish(q, orig_ioc_mp, err, NO_COPYOUT, NULL, NULL);
10380 return;
10381 }
10382
10383 /*
10384 * Completion of an SIOCG{X}ARP. Translate the information from
10385 * the area_t into the struct {x}arpreq.
10386 */
10387 if (x_arp_ioctl) {
10388 storage += ill_xarp_info(&xar->xarp_ha, ill);
10389 if ((ill->ill_phys_addr_length + ill->ill_name_length) >
10390 sizeof (xar->xarp_ha.sdl_data)) {
10391 freemsg(mp);
10392 ip_ioctl_finish(q, orig_ioc_mp, EINVAL,
10393 NO_COPYOUT, NULL, NULL);
10394 return;
10395 }
10396 }
10397 *flagsp = ATF_INUSE;
10398 if (area->area_flags & ACE_F_PERMANENT)
10399 *flagsp |= ATF_PERM;
10400 if (area->area_flags & ACE_F_PUBLISH)
10401 *flagsp |= ATF_PUBL;
10402 if (area->area_flags & ACE_F_AUTHORITY)
10403 *flagsp |= ATF_AUTHORITY;
10404 if (area->area_hw_addr_length != 0) {
10405 *flagsp |= ATF_COM;
10406 /*
10407 * For SIOCGARP, MAC address length validation has
10408 * already been done before the ioctl was issued to ARP
10409 * to allow it to progress only on 6 byte addressable
10410 * (ethernet like) media. Thus the mac address copying
10411 * can not overwrite the sa_data area below.
10412 */
10413 bcopy((char *)area + area->area_hw_addr_offset,
10414 storage, area->area_hw_addr_length);
10415 }
10416
10417 /* Ditch the internal IOCTL. */
10418 freemsg(mp);
10419 /* Complete the original. */
10420 ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, NULL, NULL);
10421 }
10422
10423 /*
10424 * Create a new logical interface. If ipif_id is zero (i.e. not a logical
10425 * interface), create the next available logical interface for this
10426 * physical interface.
10427 * If ipif is NULL (i.e. the lookup didn't find one), attempt to create an
10428 * ipif with the specified name.
10429 *
10430 * If the address family is not AF_UNSPEC then set the address as well.
10431 *
10432 * If ip_sioctl_addr returns EINPROGRESS then the ioctl (the copyout)
10433 * is completed when the DL_BIND_ACK arrives in ip_rput_dlpi_writer.
10434 *
10435 * Executed as a writer on the ill or ill group.
10436 * So no lock is needed to traverse the ipif chain, or examine the
10437 * phyint flags.
10438 */
10439 /* ARGSUSED */
10440 int
10441 ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
10442 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq)
10443 {
10444 mblk_t *mp1;
10445 struct lifreq *lifr;
10446 boolean_t isv6;
10447 boolean_t exists;
10448 char *name;
10449 char *endp;
10450 char *cp;
10451 int namelen;
10452 ipif_t *ipif;
10453 long id;
10454 ipsq_t *ipsq;
10455 ill_t *ill;
10456 sin_t *sin;
10457 int err = 0;
10458 boolean_t found_sep = B_FALSE;
10459 conn_t *connp;
10460 zoneid_t zoneid;
10461 int orig_ifindex = 0;
10462
10463 ip1dbg(("ip_sioctl_addif\n"));
10464 /* Existence of mp1 has been checked in ip_wput_nondata */
10465 mp1 = mp->b_cont->b_cont;
10466 /*
10467 * Null-terminate the string to protect against buffer
10468 * overrun. The string was generated by user code and may not
10469 * be trusted.
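 * (Without the termination below, a lifr_name of LIFNAMSIZ non-null
 * bytes would run mi_strlen() and the colon search past the end of
 * the structure.)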
10470 */
10471 lifr = (struct lifreq *)mp1->b_rptr;
10472 lifr->lifr_name[LIFNAMSIZ - 1] = '\0';
10473 name = lifr->lifr_name;
10474 ASSERT(CONN_Q(q));
10475 connp = Q_TO_CONN(q);
10476 isv6 = connp->conn_af_isv6;
10477 zoneid = connp->conn_zoneid;
10478 namelen = mi_strlen(name);
10479 if (namelen == 0)
10480 return (EINVAL);
10481
10482 exists = B_FALSE;
10483 if ((namelen + 1 == sizeof (ipif_loopback_name)) &&
10484 (mi_strcmp(name, ipif_loopback_name) == 0)) {
10485 /*
10486 * Allow creating lo0 using SIOCLIFADDIF.
10487 * There can't be any other writer thread, so we can pass
10488 * NULL below for the last 4 args to ipif_lookup_on_name.
10489 */
10490 ipif = ipif_lookup_on_name(lifr->lifr_name, namelen,
10491 B_TRUE, &exists, isv6, zoneid, NULL, NULL, NULL, NULL);
10492 /* Prevent any further action */
10493 if (ipif == NULL) {
10494 return (ENOBUFS);
10495 } else if (!exists) {
10496 /* We created the ipif now and as writer */
10497 ipif_refrele(ipif);
10498 return (0);
10499 } else {
10500 ill = ipif->ipif_ill;
10501 ill_refhold(ill);
10502 ipif_refrele(ipif);
10503 }
10504 } else {
10505 /* Look for a colon in the name. */
10506 endp = &name[namelen];
10507 for (cp = endp; --cp > name; ) {
10508 if (*cp == IPIF_SEPARATOR_CHAR) {
10509 found_sep = B_TRUE;
10510 /*
10511 * Reject any non-decimal aliases for plumbing
10512 * of logical interfaces. Aliases with leading
10513 * zeroes are also rejected as they introduce
10514 * ambiguity in the naming of the interfaces.
10515 * Comparing with "0" takes care of all such
10516 * cases.
10517 */
10518 if ((strncmp("0", cp+1, 1)) == 0)
10519 return (EINVAL);
10520
10521 if (ddi_strtol(cp+1, &endp, 10, &id) != 0 ||
10522 id <= 0 || *endp != '\0') {
10523 return (EINVAL);
10524 }
10525 *cp = '\0';
10526 break;
10527 }
10528 }
10529 ill = ill_lookup_on_name(name, B_FALSE, isv6,
10530 CONNP_TO_WQ(connp), mp, ip_process_ioctl, &err, NULL);
10531 if (found_sep)
10532 *cp = IPIF_SEPARATOR_CHAR;
10533 if (ill == NULL)
10534 return (err);
10535 }
10536
10537 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_process_ioctl, NEW_OP,
10538 B_TRUE);
10539
10540 /*
10541 * Release the refhold due to the lookup, now that we are excl
10542 * or we are just returning.
10543 */
10544 ill_refrele(ill);
10545
10546 if (ipsq == NULL)
10547 return (EINPROGRESS);
10548
10549 /*
10550 * If the interface is failed, inactive or offlined, look for a working
10551 * interface in the ill group and create the ipif there. If we can't
10552 * find a good interface, create the ipif anyway so that in.mpathd can
10553 * move it to the first repaired interface.
10554 */
10555 if ((ill->ill_phyint->phyint_flags &
10556 (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) &&
10557 ill->ill_phyint->phyint_groupname_len != 0) {
10558 phyint_t *phyi;
10559 char *groupname = ill->ill_phyint->phyint_groupname;
10560
10561 /*
10562 * We're looking for a working interface, but it doesn't matter
10563 * if it's up or down; so instead of following the group lists,
10564 * we look at each physical interface and compare the groupname.
10565 * We're only interested in interfaces with IPv4 (resp. IPv6)
10566 * plumbed when we're adding an IPv4 (resp. IPv6) ipif.
10567 * Otherwise we create the ipif on the failed interface.
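 *
 * (The AVL walk below visits each phyint in ifindex order and picks
 * the first member of the same group that is not FAILED, INACTIVE or
 * OFFLINE and that has an ill of the required IP version plumbed.)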
10568 */ 10569 rw_enter(&ill_g_lock, RW_READER); 10570 phyi = avl_first(&phyint_g_list.phyint_list_avl_by_index); 10571 for (; phyi != NULL; 10572 phyi = avl_walk(&phyint_g_list.phyint_list_avl_by_index, 10573 phyi, AVL_AFTER)) { 10574 if (phyi->phyint_groupname_len == 0) 10575 continue; 10576 ASSERT(phyi->phyint_groupname != NULL); 10577 if (mi_strcmp(groupname, phyi->phyint_groupname) == 0 && 10578 !(phyi->phyint_flags & 10579 (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) && 10580 (ill->ill_isv6 ? (phyi->phyint_illv6 != NULL) : 10581 (phyi->phyint_illv4 != NULL))) { 10582 break; 10583 } 10584 } 10585 rw_exit(&ill_g_lock); 10586 10587 if (phyi != NULL) { 10588 orig_ifindex = ill->ill_phyint->phyint_ifindex; 10589 ill = (ill->ill_isv6 ? phyi->phyint_illv6 : 10590 phyi->phyint_illv4); 10591 } 10592 } 10593 10594 /* 10595 * We are now exclusive on the ipsq, so an ill move will be serialized 10596 * before or after us. 10597 */ 10598 ASSERT(IAM_WRITER_ILL(ill)); 10599 ASSERT(ill->ill_move_in_progress == B_FALSE); 10600 10601 if (found_sep && orig_ifindex == 0) { 10602 /* Now see if there is an IPIF with this unit number. */ 10603 for (ipif = ill->ill_ipif; ipif != NULL; 10604 ipif = ipif->ipif_next) { 10605 if (ipif->ipif_id == id) { 10606 err = EEXIST; 10607 goto done; 10608 } 10609 } 10610 } 10611 10612 /* 10613 * We use IRE_LOCAL for lo0:1 etc. for "receive only" use 10614 * of lo0. We never come here when we plumb lo0:0. It 10615 * happens in ipif_lookup_on_name. 10616 * The specified unit number is ignored when we create the ipif on a 10617 * different interface. However, we save it in ipif_orig_ipifid below so 10618 * that the ipif fails back to the right position. 10619 */ 10620 if ((ipif = ipif_allocate(ill, (found_sep && orig_ifindex == 0) ? 10621 id : -1, IRE_LOCAL, B_TRUE)) == NULL) { 10622 err = ENOBUFS; 10623 goto done; 10624 } 10625 10626 /* Return created name with ioctl */ 10627 (void) sprintf(lifr->lifr_name, "%s%c%d", ill->ill_name, 10628 IPIF_SEPARATOR_CHAR, ipif->ipif_id); 10629 ip1dbg(("created %s\n", lifr->lifr_name)); 10630 10631 /* Set address */ 10632 sin = (sin_t *)&lifr->lifr_addr; 10633 if (sin->sin_family != AF_UNSPEC) { 10634 err = ip_sioctl_addr(ipif, sin, q, mp, 10635 &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], lifr); 10636 } 10637 10638 /* Set ifindex and unit number for failback */ 10639 if (err == 0 && orig_ifindex != 0) { 10640 ipif->ipif_orig_ifindex = orig_ifindex; 10641 if (found_sep) { 10642 ipif->ipif_orig_ipifid = id; 10643 } 10644 } 10645 10646 done: 10647 ipsq_exit(ipsq, B_TRUE, B_TRUE); 10648 return (err); 10649 } 10650 10651 /* 10652 * Remove an existing logical interface. If ipif_id is zero (i.e. not a logical 10653 * interface) delete it based on the IP address (on this physical interface). 10654 * Otherwise delete it based on the ipif_id. 10655 * Also, special handling to allow a removeif of lo0. 10656 */ 10657 /* ARGSUSED */ 10658 int 10659 ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10660 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 10661 { 10662 conn_t *connp; 10663 ill_t *ill = ipif->ipif_ill; 10664 boolean_t success; 10665 10666 ip1dbg(("ip_sioctl_remove_if(%s:%u %p)\n", 10667 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10668 ASSERT(IAM_WRITER_IPIF(ipif)); 10669 10670 connp = Q_TO_CONN(q); 10671 /* 10672 * Special case for unplumbing lo0 (the loopback physical interface). 10673 * If unplumbing lo0, the incoming address structure has been 10674 * initialized to all zeros. 
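 * (e.g. as issued by an "ifconfig lo0 unplumb"; the AF_UNSPEC,
 * all-zeroes test below identifies this case.)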
When unplumbing lo0, all its logical
10675 * interfaces must be removed too.
10676 *
10677 * Note that this interface may be called to remove a specific
10678 * loopback logical interface (e.g., lo0:1). But in that case
10679 * ipif->ipif_id != 0 so that the code path for that case is the
10680 * same as any other interface (meaning it skips the code directly
10681 * below).
10682 */
10683 if (ipif->ipif_id == 0 && ipif->ipif_net_type == IRE_LOOPBACK) {
10684 if (sin->sin_family == AF_UNSPEC &&
10685 (IN6_IS_ADDR_UNSPECIFIED(&((sin6_t *)sin)->sin6_addr))) {
10686 /*
10687 * Mark it condemned. No new ref. will be made to ill.
10688 */
10689 mutex_enter(&ill->ill_lock);
10690 ill->ill_state_flags |= ILL_CONDEMNED;
10691 for (ipif = ill->ill_ipif; ipif != NULL;
10692 ipif = ipif->ipif_next) {
10693 ipif->ipif_state_flags |= IPIF_CONDEMNED;
10694 }
10695 mutex_exit(&ill->ill_lock);
10696
10697 ipif = ill->ill_ipif;
10698 /* unplumb the loopback interface */
10699 ill_delete(ill);
10700 mutex_enter(&connp->conn_lock);
10701 mutex_enter(&ill->ill_lock);
10702 ASSERT(ill->ill_group == NULL);
10703
10704 /* Are any references to this ill active */
10705 if (ill_is_quiescent(ill)) {
10706 mutex_exit(&ill->ill_lock);
10707 mutex_exit(&connp->conn_lock);
10708 ill_delete_tail(ill);
10709 mi_free(ill);
10710 return (0);
10711 }
10712 success = ipsq_pending_mp_add(connp, ipif,
10713 CONNP_TO_WQ(connp), mp, ILL_FREE);
10714 mutex_exit(&connp->conn_lock);
10715 mutex_exit(&ill->ill_lock);
10716 if (success)
10717 return (EINPROGRESS);
10718 else
10719 return (EINTR);
10720 }
10721 }
10722
10723 /*
10724 * We are exclusive on the ipsq, so an ill move will be serialized
10725 * before or after us.
10726 */
10727 ASSERT(ill->ill_move_in_progress == B_FALSE);
10728
10729 if (ipif->ipif_id == 0) {
10730 /* Find based on address */
10731 if (ipif->ipif_isv6) {
10732 sin6_t *sin6;
10733
10734 if (sin->sin_family != AF_INET6)
10735 return (EAFNOSUPPORT);
10736
10737 sin6 = (sin6_t *)sin;
10738 /* We are a writer, so we should be able to lookup */
10739 ipif = ipif_lookup_addr_v6(&sin6->sin6_addr,
10740 ill, ALL_ZONES, NULL, NULL, NULL, NULL);
10741 if (ipif == NULL) {
10742 /*
10743 * Maybe the address is on another interface in
10744 * the same IPMP group? We check this below.
10745 */
10746 ipif = ipif_lookup_addr_v6(&sin6->sin6_addr,
10747 NULL, ALL_ZONES, NULL, NULL, NULL, NULL);
10748 }
10749 } else {
10750 ipaddr_t addr;
10751
10752 if (sin->sin_family != AF_INET)
10753 return (EAFNOSUPPORT);
10754
10755 addr = sin->sin_addr.s_addr;
10756 /* We are a writer, so we should be able to lookup */
10757 ipif = ipif_lookup_addr(addr, ill, ALL_ZONES, NULL,
10758 NULL, NULL, NULL);
10759 if (ipif == NULL) {
10760 /*
10761 * Maybe the address is on another interface in
10762 * the same IPMP group? We check this below.
10763 */
10764 ipif = ipif_lookup_addr(addr, NULL, ALL_ZONES,
10765 NULL, NULL, NULL, NULL);
10766 }
10767 }
10768 if (ipif == NULL) {
10769 return (EADDRNOTAVAIL);
10770 }
10771 /*
10772 * When the address to be removed is hosted on a different
10773 * interface, we check if the interface is in the same IPMP
10774 * group as the specified one; if so we proceed with the
10775 * removal.
10776 * ill->ill_group is NULL when the ill is down, so we have to
10777 * compare the group names instead.
10778 */
10779 if (ipif->ipif_ill != ill &&
10780 (ipif->ipif_ill->ill_phyint->phyint_groupname_len == 0 ||
10781 ill->ill_phyint->phyint_groupname_len == 0 ||
10782 mi_strcmp(ipif->ipif_ill->ill_phyint->phyint_groupname,
10783 ill->ill_phyint->phyint_groupname) != 0)) {
10784 ipif_refrele(ipif);
10785 return (EADDRNOTAVAIL);
10786 }
10787
10788 /* This is a writer */
10789 ipif_refrele(ipif);
10790 }
10791
10792 /*
10793 * Cannot delete instance zero, since it is tied to the ill.
10794 */
10795 if (ipif->ipif_id == 0)
10796 return (EBUSY);
10797
10798 mutex_enter(&ill->ill_lock);
10799 ipif->ipif_state_flags |= IPIF_CONDEMNED;
10800 mutex_exit(&ill->ill_lock);
10801
10802 ipif_free(ipif);
10803
10804 mutex_enter(&connp->conn_lock);
10805 mutex_enter(&ill->ill_lock);
10806
10807 /* Are any references to this ipif active */
10808 if (ipif->ipif_refcnt == 0 && ipif->ipif_ire_cnt == 0) {
10809 mutex_exit(&ill->ill_lock);
10810 mutex_exit(&connp->conn_lock);
10811 ipif_non_duplicate(ipif);
10812 ipif_down_tail(ipif);
10813 ipif_free_tail(ipif);
10814 return (0);
10815 }
10816 success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), mp,
10817 IPIF_FREE);
10818 mutex_exit(&ill->ill_lock);
10819 mutex_exit(&connp->conn_lock);
10820 if (success)
10821 return (EINPROGRESS);
10822 else
10823 return (EINTR);
10824 }
10825
10826 /*
10827 * Restart the removeif ioctl. The refcnt has gone down to 0.
10828 * The ipif is already condemned, so it can't be found through lookups.
10829 */
10830 /* ARGSUSED */
10831 int
10832 ip_sioctl_removeif_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q,
10833 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_if_req)
10834 {
10835 ill_t *ill;
10836
10837 ip1dbg(("ip_sioctl_removeif_restart(%s:%u %p)\n",
10838 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10839 if (ipif->ipif_id == 0 && ipif->ipif_net_type == IRE_LOOPBACK) {
10840 ill = ipif->ipif_ill;
10841 ASSERT(IAM_WRITER_ILL(ill));
10842 ASSERT((ipif->ipif_state_flags & IPIF_CONDEMNED) &&
10843 (ill->ill_state_flags & ILL_CONDEMNED));
10844 ill_delete_tail(ill);
10845 mi_free(ill);
10846 return (0);
10847 }
10848
10849 ill = ipif->ipif_ill;
10850 ASSERT(IAM_WRITER_IPIF(ipif));
10851 ASSERT(ipif->ipif_state_flags & IPIF_CONDEMNED);
10852
10853 ipif_non_duplicate(ipif);
10854 ipif_down_tail(ipif);
10855 ipif_free_tail(ipif);
10856
10857 ILL_UNMARK_CHANGING(ill);
10858 return (0);
10859 }
10860
10861 /*
10862 * Set the local interface address.
10863 * Allow an address of all zero when the interface is down.
10864 */
10865 /* ARGSUSED */
10866 int
10867 ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10868 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq)
10869 {
10870 int err = 0;
10871 in6_addr_t v6addr;
10872 boolean_t need_up = B_FALSE;
10873
10874 ip1dbg(("ip_sioctl_addr(%s:%u %p)\n",
10875 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10876
10877 ASSERT(IAM_WRITER_IPIF(ipif));
10878
10879 if (ipif->ipif_isv6) {
10880 sin6_t *sin6;
10881 ill_t *ill;
10882 phyint_t *phyi;
10883
10884 if (sin->sin_family != AF_INET6)
10885 return (EAFNOSUPPORT);
10886
10887 sin6 = (sin6_t *)sin;
10888 v6addr = sin6->sin6_addr;
10889 ill = ipif->ipif_ill;
10890 phyi = ill->ill_phyint;
10891
10892 /*
10893 * Enforce that true multicast interfaces have a link-local
10894 * address for logical unit 0.
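 * (Point-to-point and loopback interfaces are exempted from this
 * check below.)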
10895 */
10896 if (ipif->ipif_id == 0 &&
10897 (ill->ill_flags & ILLF_MULTICAST) &&
10898 !(ipif->ipif_flags & (IPIF_POINTOPOINT)) &&
10899 !(phyi->phyint_flags & (PHYI_LOOPBACK)) &&
10900 !IN6_IS_ADDR_LINKLOCAL(&v6addr)) {
10901 return (EADDRNOTAVAIL);
10902 }
10903
10904 /*
10905 * Up interfaces shouldn't have the unspecified address
10906 * unless they also have the IPIF_NOLOCAL flag set and
10907 * have a subnet assigned.
10908 */
10909 if ((ipif->ipif_flags & IPIF_UP) &&
10910 IN6_IS_ADDR_UNSPECIFIED(&v6addr) &&
10911 (!(ipif->ipif_flags & IPIF_NOLOCAL) ||
10912 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) {
10913 return (EADDRNOTAVAIL);
10914 }
10915
10916 if (!ip_local_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask))
10917 return (EADDRNOTAVAIL);
10918 } else {
10919 ipaddr_t addr;
10920
10921 if (sin->sin_family != AF_INET)
10922 return (EAFNOSUPPORT);
10923
10924 addr = sin->sin_addr.s_addr;
10925
10926 /* Allow 0 as the local address. */
10927 if (addr != 0 && !ip_addr_ok_v4(addr, ipif->ipif_net_mask))
10928 return (EADDRNOTAVAIL);
10929
10930 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
10931 }
10932
10933
10934 /*
10935 * Even if there is no change, we redo things just to rerun
10936 * ipif_set_default.
10937 */
10938 if (ipif->ipif_flags & IPIF_UP) {
10939 /*
10940 * Setting a new local address, make sure
10941 * we have net and subnet bcast ires for
10942 * the old address if we need them.
10943 */
10944 if (!ipif->ipif_isv6)
10945 ipif_check_bcast_ires(ipif);
10946 /*
10947 * If the interface is already marked up,
10948 * we call ipif_down which will take care
10949 * of ditching any IREs that have been set
10950 * up based on the old interface address.
10951 */
10952 err = ipif_logical_down(ipif, q, mp);
10953 if (err == EINPROGRESS)
10954 return (err);
10955 ipif_down_tail(ipif);
10956 need_up = B_TRUE;
10957 }
10958
10959 err = ip_sioctl_addr_tail(ipif, sin, q, mp, need_up);
10960 return (err);
10961 }
11008 * Note that if all ipifs are down, we may have a pending ARP down 11009 * event to handle. This is because we want to recover from duplicates 11010 * and thus delay tearing down ARP until the duplicates have been 11011 * removed or disabled. 11012 */ 11013 need_dl_down = need_arp_down = B_FALSE; 11014 if (ipif->ipif_flags & IPIF_DUPLICATE) { 11015 need_arp_down = !need_up; 11016 ipif->ipif_flags &= ~IPIF_DUPLICATE; 11017 if (--ill->ill_ipif_dup_count == 0 && !need_up && 11018 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) { 11019 need_dl_down = B_TRUE; 11020 } 11021 } 11022 11023 if (ipif->ipif_isv6 && IN6_IS_ADDR_6TO4(&v6addr) && 11024 !ill->ill_is_6to4tun) { 11025 queue_t *wqp = ill->ill_wq; 11026 11027 /* 11028 * The local address of this interface is a 6to4 address, 11029 * check if this interface is in fact a 6to4 tunnel or just 11030 * an interface configured with a 6to4 address. We are only 11031 * interested in the former. 11032 */ 11033 if (wqp != NULL) { 11034 while ((wqp->q_next != NULL) && 11035 (wqp->q_next->q_qinfo != NULL) && 11036 (wqp->q_next->q_qinfo->qi_minfo != NULL)) { 11037 11038 if (wqp->q_next->q_qinfo->qi_minfo->mi_idnum 11039 == TUN6TO4_MODID) { 11040 /* set for use in IP */ 11041 ill->ill_is_6to4tun = 1; 11042 break; 11043 } 11044 wqp = wqp->q_next; 11045 } 11046 } 11047 } 11048 11049 ipif_set_default(ipif); 11050 11051 /* 11052 * When publishing an interface address change event, we only notify 11053 * the event listeners of the new address. It is assumed that if they 11054 * actively care about the addresses assigned that they will have 11055 * already discovered the previous address assigned (if there was one.) 11056 * 11057 * Don't attach nic event message for SIOCLIFADDIF ioctl. 11058 */ 11059 if (iocp->ioc_cmd != SIOCLIFADDIF) { 11060 hook_nic_event_t *info; 11061 if ((info = ipif->ipif_ill->ill_nic_event_info) != NULL) { 11062 ip2dbg(("ip_sioctl_addr_tail: unexpected nic event %d " 11063 "attached for %s\n", info->hne_event, 11064 ill->ill_name)); 11065 if (info->hne_data != NULL) 11066 kmem_free(info->hne_data, info->hne_datalen); 11067 kmem_free(info, sizeof (hook_nic_event_t)); 11068 } 11069 11070 info = kmem_alloc(sizeof (hook_nic_event_t), KM_NOSLEEP); 11071 if (info != NULL) { 11072 info->hne_nic = 11073 ipif->ipif_ill->ill_phyint->phyint_ifindex; 11074 info->hne_lif = MAP_IPIF_ID(ipif->ipif_id); 11075 info->hne_event = NE_ADDRESS_CHANGE; 11076 info->hne_family = ipif->ipif_isv6 ? ipv6 : ipv4; 11077 info->hne_data = kmem_alloc(sinlen, KM_NOSLEEP); 11078 if (info->hne_data != NULL) { 11079 info->hne_datalen = sinlen; 11080 bcopy(sin, info->hne_data, sinlen); 11081 } else { 11082 ip2dbg(("ip_sioctl_addr_tail: could not attach " 11083 "address information for ADDRESS_CHANGE nic" 11084 " event of %s (ENOMEM)\n", 11085 ipif->ipif_ill->ill_name)); 11086 kmem_free(info, sizeof (hook_nic_event_t)); 11087 } 11088 } else 11089 ip2dbg(("ip_sioctl_addr_tail: could not attach " 11090 "ADDRESS_CHANGE nic event information for %s " 11091 "(ENOMEM)\n", ipif->ipif_ill->ill_name)); 11092 11093 ipif->ipif_ill->ill_nic_event_info = info; 11094 } 11095 11096 mutex_exit(&ipif->ipif_ill->ill_lock); 11097 11098 if (need_up) { 11099 /* 11100 * Now bring the interface back up. If this 11101 * is the only IPIF for the ILL, ipif_up 11102 * will have to re-bind to the device, so 11103 * we may get back EINPROGRESS, in which 11104 * case, this IOCTL will get completed in 11105 * ip_rput_dlpi when we see the DL_BIND_ACK. 
11106 */ 11107 err = ipif_up(ipif, q, mp); 11108 } else { 11109 /* 11110 * Update the IPIF list in SCTP, ipif_up_done() will do it 11111 * if need_up is true. 11112 */ 11113 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 11114 } 11115 11116 if (need_dl_down) 11117 ill_dl_down(ill); 11118 if (need_arp_down) 11119 ipif_arp_down(ipif); 11120 11121 return (err); 11122 } 11123 11124 11125 /* 11126 * Restart entry point to restart the address set operation after the 11127 * refcounts have dropped to zero. 11128 */ 11129 /* ARGSUSED */ 11130 int 11131 ip_sioctl_addr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11132 ip_ioctl_cmd_t *ipip, void *ifreq) 11133 { 11134 ip1dbg(("ip_sioctl_addr_restart(%s:%u %p)\n", 11135 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11136 ASSERT(IAM_WRITER_IPIF(ipif)); 11137 ipif_down_tail(ipif); 11138 return (ip_sioctl_addr_tail(ipif, sin, q, mp, B_TRUE)); 11139 } 11140 11141 /* ARGSUSED */ 11142 int 11143 ip_sioctl_get_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11144 ip_ioctl_cmd_t *ipip, void *if_req) 11145 { 11146 sin6_t *sin6 = (struct sockaddr_in6 *)sin; 11147 struct lifreq *lifr = (struct lifreq *)if_req; 11148 11149 ip1dbg(("ip_sioctl_get_addr(%s:%u %p)\n", 11150 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11151 /* 11152 * The net mask and address can't change since we have a 11153 * reference to the ipif. So no lock is necessary. 11154 */ 11155 if (ipif->ipif_isv6) { 11156 *sin6 = sin6_null; 11157 sin6->sin6_family = AF_INET6; 11158 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 11159 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 11160 lifr->lifr_addrlen = 11161 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 11162 } else { 11163 *sin = sin_null; 11164 sin->sin_family = AF_INET; 11165 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 11166 if (ipip->ipi_cmd_type == LIF_CMD) { 11167 lifr->lifr_addrlen = 11168 ip_mask_to_plen(ipif->ipif_net_mask); 11169 } 11170 } 11171 return (0); 11172 } 11173 11174 /* 11175 * Set the destination address for a pt-pt interface. 11176 */ 11177 /* ARGSUSED */ 11178 int 11179 ip_sioctl_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11180 ip_ioctl_cmd_t *ipip, void *if_req) 11181 { 11182 int err = 0; 11183 in6_addr_t v6addr; 11184 boolean_t need_up = B_FALSE; 11185 11186 ip1dbg(("ip_sioctl_dstaddr(%s:%u %p)\n", 11187 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11188 ASSERT(IAM_WRITER_IPIF(ipif)); 11189 11190 if (ipif->ipif_isv6) { 11191 sin6_t *sin6; 11192 11193 if (sin->sin_family != AF_INET6) 11194 return (EAFNOSUPPORT); 11195 11196 sin6 = (sin6_t *)sin; 11197 v6addr = sin6->sin6_addr; 11198 11199 if (!ip_remote_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask)) 11200 return (EADDRNOTAVAIL); 11201 } else { 11202 ipaddr_t addr; 11203 11204 if (sin->sin_family != AF_INET) 11205 return (EAFNOSUPPORT); 11206 11207 addr = sin->sin_addr.s_addr; 11208 if (!ip_addr_ok_v4(addr, ipif->ipif_net_mask)) 11209 return (EADDRNOTAVAIL); 11210 11211 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 11212 } 11213 11214 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr, &v6addr)) 11215 return (0); /* No change */ 11216 11217 if (ipif->ipif_flags & IPIF_UP) { 11218 /* 11219 * If the interface is already marked up, 11220 * we call ipif_down which will take care 11221 * of ditching any IREs that have been set 11222 * up based on the old pp dst address. 
11223 */
11224 err = ipif_logical_down(ipif, q, mp);
11225 if (err == EINPROGRESS)
11226 return (err);
11227 ipif_down_tail(ipif);
11228 need_up = B_TRUE;
11229 }
11230 /*
11231 * This could return EINPROGRESS; if so, the ioctl will complete in
11232 * ip_rput_dlpi_writer.
11233 */
11234 err = ip_sioctl_dstaddr_tail(ipif, sin, q, mp, need_up);
11235 return (err);
11236 }
11237
11238 static int
11239 ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11240 boolean_t need_up)
11241 {
11242 in6_addr_t v6addr;
11243 ill_t *ill = ipif->ipif_ill;
11244 int err = 0;
11245 boolean_t need_dl_down;
11246 boolean_t need_arp_down;
11247
11248 ip1dbg(("ip_sioctl_dstaddr_tail(%s:%u %p)\n", ill->ill_name,
11249 ipif->ipif_id, (void *)ipif));
11250
11251 /* Must cancel any pending timer before taking the ill_lock */
11252 if (ipif->ipif_recovery_id != 0)
11253 (void) untimeout(ipif->ipif_recovery_id);
11254 ipif->ipif_recovery_id = 0;
11255
11256 if (ipif->ipif_isv6) {
11257 sin6_t *sin6;
11258
11259 sin6 = (sin6_t *)sin;
11260 v6addr = sin6->sin6_addr;
11261 } else {
11262 ipaddr_t addr;
11263
11264 addr = sin->sin_addr.s_addr;
11265 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
11266 }
11267 mutex_enter(&ill->ill_lock);
11268 /* Set point to point destination address. */
11269 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
11270 /*
11271 * Allow this as a means of creating logical
11272 * pt-pt interfaces on top of e.g. an Ethernet.
11273 * XXX Undocumented HACK for testing.
11274 * pt-pt interfaces are created with NUD disabled.
11275 */
11276 ipif->ipif_flags |= IPIF_POINTOPOINT;
11277 ipif->ipif_flags &= ~IPIF_BROADCAST;
11278 if (ipif->ipif_isv6)
11279 ill->ill_flags |= ILLF_NONUD;
11280 }
11281
11282 /*
11283 * If the interface was previously marked as a duplicate, then since
11284 * we've now got a "new" address, it should no longer be considered a
11285 * duplicate -- even if the "new" address is the same as the old one.
11286 * Note that if all ipifs are down, we may have a pending ARP down
11287 * event to handle.
11288 */
11289 need_dl_down = need_arp_down = B_FALSE;
11290 if (ipif->ipif_flags & IPIF_DUPLICATE) {
11291 need_arp_down = !need_up;
11292 ipif->ipif_flags &= ~IPIF_DUPLICATE;
11293 if (--ill->ill_ipif_dup_count == 0 && !need_up &&
11294 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) {
11295 need_dl_down = B_TRUE;
11296 }
11297 }
11298
11299 /* Set the new address. */
11300 ipif->ipif_v6pp_dst_addr = v6addr;
11301 /* Make sure subnet tracks pp_dst */
11302 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
11303 mutex_exit(&ill->ill_lock);
11304
11305 if (need_up) {
11306 /*
11307 * Now bring the interface back up. If this
11308 * is the only IPIF for the ILL, ipif_up
11309 * will have to re-bind to the device, so
11310 * we may get back EINPROGRESS, in which
11311 * case, this IOCTL will get completed in
11312 * ip_rput_dlpi when we see the DL_BIND_ACK.
11313 */
11314 err = ipif_up(ipif, q, mp);
11315 }
11316
11317 if (need_dl_down)
11318 ill_dl_down(ill);
11319
11320 if (need_arp_down)
11321 ipif_arp_down(ipif);
11322 return (err);
11323 }
11324
11325 /*
11326 * Restart entry point to restart the dstaddress set operation after the
11327 * refcounts have dropped to zero.
11328 */
11329 /* ARGSUSED */
11330 int
11331 ip_sioctl_dstaddr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11332     ip_ioctl_cmd_t *ipip, void *ifreq)
11333 {
11334 	ip1dbg(("ip_sioctl_dstaddr_restart(%s:%u %p)\n",
11335 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11336 	ipif_down_tail(ipif);
11337 	return (ip_sioctl_dstaddr_tail(ipif, sin, q, mp, B_TRUE));
11338 }
11339
11340 /* ARGSUSED */
11341 int
11342 ip_sioctl_get_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11343     ip_ioctl_cmd_t *ipip, void *if_req)
11344 {
11345 	sin6_t *sin6 = (struct sockaddr_in6 *)sin;
11346
11347 	ip1dbg(("ip_sioctl_get_dstaddr(%s:%u %p)\n",
11348 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11349 	/*
11350 	 * Get point to point destination address. The addresses can't
11351 	 * change since we hold a reference to the ipif.
11352 	 */
11353 	if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0)
11354 		return (EADDRNOTAVAIL);
11355
11356 	if (ipif->ipif_isv6) {
11357 		ASSERT(ipip->ipi_cmd_type == LIF_CMD);
11358 		*sin6 = sin6_null;
11359 		sin6->sin6_family = AF_INET6;
11360 		sin6->sin6_addr = ipif->ipif_v6pp_dst_addr;
11361 	} else {
11362 		*sin = sin_null;
11363 		sin->sin_family = AF_INET;
11364 		sin->sin_addr.s_addr = ipif->ipif_pp_dst_addr;
11365 	}
11366 	return (0);
11367 }
11368
11369 /*
11370  * TODO (IPMP): make this function return the active/inactive state so that
11371  * the caller can set it once atomically instead of multiple mutex_enter/mutex_exit
11372  */
11373 /*
11374  * This function either sets or clears the IFF_INACTIVE flag.
11375  *
11376  * As long as there are some addresses or multicast memberships on the
11377  * IPv4 or IPv6 interfaces of the "phyi" that do not belong to it, we
11378  * consider the interface to be ACTIVE (clear IFF_INACTIVE), i.e. the
11379  * interface will be used for outbound packets.
11380  *
11381  * Caller needs to verify the validity of setting IFF_INACTIVE.
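 *
 * To illustrate the check below (the ifindex values are hypothetical):
 * if this phyint has ifindex 2 and one of its ipifs or ilms has an
 * *_orig_ifindex of 3, that address or membership failed over here
 * from another interface, so the phyint stays ACTIVE.  Only when
 * every ipif and ilm on both the v4 and v6 ills originated on this
 * phyint do we set PHYI_INACTIVE.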
11382 */
11383 static void
11384 phyint_inactive(phyint_t *phyi)
11385 {
11386 	ill_t *ill_v4;
11387 	ill_t *ill_v6;
11388 	ipif_t *ipif;
11389 	ilm_t *ilm;
11390
11391 	ill_v4 = phyi->phyint_illv4;
11392 	ill_v6 = phyi->phyint_illv6;
11393
11394 	/*
11395 	 * No need for a lock while traversing the list since I am
11396 	 * a writer.
11397 	 */
11398 	if (ill_v4 != NULL) {
11399 		ASSERT(IAM_WRITER_ILL(ill_v4));
11400 		for (ipif = ill_v4->ill_ipif; ipif != NULL;
11401 		    ipif = ipif->ipif_next) {
11402 			if (ipif->ipif_orig_ifindex != phyi->phyint_ifindex) {
11403 				mutex_enter(&phyi->phyint_lock);
11404 				phyi->phyint_flags &= ~PHYI_INACTIVE;
11405 				mutex_exit(&phyi->phyint_lock);
11406 				return;
11407 			}
11408 		}
11409 		for (ilm = ill_v4->ill_ilm; ilm != NULL;
11410 		    ilm = ilm->ilm_next) {
11411 			if (ilm->ilm_orig_ifindex != phyi->phyint_ifindex) {
11412 				mutex_enter(&phyi->phyint_lock);
11413 				phyi->phyint_flags &= ~PHYI_INACTIVE;
11414 				mutex_exit(&phyi->phyint_lock);
11415 				return;
11416 			}
11417 		}
11418 	}
11419 	if (ill_v6 != NULL) {
11420 		ill_v6 = phyi->phyint_illv6;
11421 		for (ipif = ill_v6->ill_ipif; ipif != NULL;
11422 		    ipif = ipif->ipif_next) {
11423 			if (ipif->ipif_orig_ifindex != phyi->phyint_ifindex) {
11424 				mutex_enter(&phyi->phyint_lock);
11425 				phyi->phyint_flags &= ~PHYI_INACTIVE;
11426 				mutex_exit(&phyi->phyint_lock);
11427 				return;
11428 			}
11429 		}
11430 		for (ilm = ill_v6->ill_ilm; ilm != NULL;
11431 		    ilm = ilm->ilm_next) {
11432 			if (ilm->ilm_orig_ifindex != phyi->phyint_ifindex) {
11433 				mutex_enter(&phyi->phyint_lock);
11434 				phyi->phyint_flags &= ~PHYI_INACTIVE;
11435 				mutex_exit(&phyi->phyint_lock);
11436 				return;
11437 			}
11438 		}
11439 	}
11440 	mutex_enter(&phyi->phyint_lock);
11441 	phyi->phyint_flags |= PHYI_INACTIVE;
11442 	mutex_exit(&phyi->phyint_lock);
11443 }
11444
11445 /*
11446  * This function is called only when the phyint flags change. Currently
11447  * called from ip_sioctl_flags. We re-do the broadcast nomination so
11448  * that we can select a good ill.
11449  */
11450 static void
11451 ip_redo_nomination(phyint_t *phyi)
11452 {
11453 	ill_t *ill_v4;
11454
11455 	ill_v4 = phyi->phyint_illv4;
11456
11457 	if (ill_v4 != NULL && ill_v4->ill_group != NULL) {
11458 		ASSERT(IAM_WRITER_ILL(ill_v4));
11459 		if (ill_v4->ill_group->illgrp_ill_count > 1)
11460 			ill_nominate_bcast_rcv(ill_v4->ill_group);
11461 	}
11462 }
11463
11464 /*
11465  * Heuristic to check if ill is INACTIVE.
11466  * Checks if ill has an ipif with a usable IP address.
11467  *
11468  * Return values:
11469  *	B_TRUE - ill is INACTIVE; has no usable ipif
11470  *	B_FALSE - ill is not INACTIVE; ill has at least one usable ipif
11471  */
11472 static boolean_t
11473 ill_is_inactive(ill_t *ill)
11474 {
11475 	ipif_t *ipif;
11476
11477 	/* Check whether it is in an IPMP group */
11478 	if (ill->ill_phyint->phyint_groupname == NULL)
11479 		return (B_FALSE);
11480
11481 	if (ill->ill_ipif_up_count == 0)
11482 		return (B_TRUE);
11483
11484 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
11485 		uint64_t flags = ipif->ipif_flags;
11486
11487 		/*
11488 		 * This ipif is usable if it is IPIF_UP and not a
11489 		 * dedicated test address.  A dedicated test address
11490 		 * is marked IPIF_NOFAILOVER *and* IPIF_DEPRECATED
11491 		 * (note in particular that V6 test addresses are
11492 		 * link-local data addresses and thus are marked
11493 		 * IPIF_NOFAILOVER but not IPIF_DEPRECATED).
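		 *
		 * As a concrete reading of that rule (flag combinations
		 * only, not from any particular configuration): an ipif
		 * carrying IPIF_UP|IPIF_NOFAILOVER alone (a V6 link-local
		 * test address) counts as usable, while one carrying
		 * IPIF_UP|IPIF_NOFAILOVER|IPIF_DEPRECATED (a dedicated
		 * V4 test address) does not.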
11494 */
11495 		if ((flags & IPIF_UP) &&
11496 		    ((flags & (IPIF_DEPRECATED|IPIF_NOFAILOVER)) !=
11497 		    (IPIF_DEPRECATED|IPIF_NOFAILOVER)))
11498 			return (B_FALSE);
11499 	}
11500 	return (B_TRUE);
11501 }
11502
11503 /*
11504  * Set interface flags.
11505  * Need to do special action for IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT,
11506  * IPIF_NOLOCAL, ILLF_NONUD, ILLF_NOARP, IPIF_PRIVATE, IPIF_ANYCAST,
11507  * IPIF_PREFERRED, PHYI_STANDBY, PHYI_FAILED and PHYI_OFFLINE.
11508  *
11509  * NOTE : We really don't enforce that ipif_id zero should be used
11510  *	  for setting any flags other than IFF_LOGINT_FLAGS. This
11511  *	  is because applications generally do SIOCGLIFFLAGS and
11512  *	  OR in the new flags (those that affect the logical) and do a
11513  *	  SIOCSLIFFLAGS. Thus, "flags" below could contain bits other
11514  *	  than IFF_LOGINT_FLAGS. One could check whether "turn_on" - the
11515  *	  flags that will be turned on - is correct with respect to
11516  *	  ipif_id 0. For backward compatibility reasons, it is not done.
11517  */
11518 /* ARGSUSED */
11519 int
11520 ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11521     ip_ioctl_cmd_t *ipip, void *if_req)
11522 {
11523 	uint64_t turn_on;
11524 	uint64_t turn_off;
11525 	int	err;
11526 	boolean_t need_up = B_FALSE;
11527 	phyint_t *phyi;
11528 	ill_t *ill;
11529 	uint64_t intf_flags;
11530 	boolean_t phyint_flags_modified = B_FALSE;
11531 	uint64_t flags;
11532 	struct ifreq *ifr;
11533 	struct lifreq *lifr;
11534 	boolean_t set_linklocal = B_FALSE;
11535 	boolean_t zero_source = B_FALSE;
11536
11537 	ip1dbg(("ip_sioctl_flags(%s:%u %p)\n",
11538 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11539
11540 	ASSERT(IAM_WRITER_IPIF(ipif));
11541
11542 	ill = ipif->ipif_ill;
11543 	phyi = ill->ill_phyint;
11544
11545 	if (ipip->ipi_cmd_type == IF_CMD) {
11546 		ifr = (struct ifreq *)if_req;
11547 		flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff);
11548 	} else {
11549 		lifr = (struct lifreq *)if_req;
11550 		flags = lifr->lifr_flags;
11551 	}
11552
11553 	intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags;
11554
11555 	/*
11556 	 * Have the flags been set correctly until now?
11557 	 */
11558 	ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0);
11559 	ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0);
11560 	ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0);
11561 	/*
11562 	 * Compare the new flags to the old, and partition
11563 	 * into those coming on and those going off.
11564 	 * For the 16 bit command keep the bits above bit 16 unchanged.
11565 	 */
11566 	if (ipip->ipi_cmd == SIOCSIFFLAGS)
11567 		flags |= intf_flags & ~0xFFFF;
11568
11569 	/*
11570 	 * First check which bits will change and then which will
11571 	 * go on and off
11572 	 */
11573 	turn_on = (flags ^ intf_flags) & ~IFF_CANTCHANGE;
11574 	if (!turn_on)
11575 		return (0);	/* No change */
11576
11577 	turn_off = intf_flags & turn_on;
11578 	turn_on ^= turn_off;
11579 	err = 0;
11580
11581 	/*
11582 	 * Don't allow any bits belonging to the logical interface
11583 	 * to be set or cleared on the replacement ipif that was
11584 	 * created temporarily during a MOVE.
11585 	 */
11586 	if (ipif->ipif_replace_zero &&
11587 	    ((turn_on|turn_off) & IFF_LOGINT_FLAGS) != 0) {
11588 		return (EINVAL);
11589 	}
11590
11591 	/*
11592 	 * Only allow the IFF_XRESOLV and IFF_TEMPORARY flags to be set on
11593 	 * IPv6 interfaces.
11594 	 */
11595 	if ((turn_on & (IFF_XRESOLV|IFF_TEMPORARY)) && !(ipif->ipif_isv6))
11596 		return (EINVAL);
11597
11598 	/*
11599 	 * Don't allow the IFF_ROUTER flag to be set on loopback
11600 	 * interfaces.
It makes no sense in that context. 11601 */ 11602 if ((turn_on & IFF_ROUTER) && (phyi->phyint_flags & PHYI_LOOPBACK)) 11603 return (EINVAL); 11604 11605 if (flags & (IFF_NOLOCAL|IFF_ANYCAST)) 11606 zero_source = B_TRUE; 11607 11608 /* 11609 * For IPv6 ipif_id 0, don't allow the interface to be up without 11610 * a link local address if IFF_NOLOCAL or IFF_ANYCAST are not set. 11611 * If the link local address isn't set, and can be set, it will get 11612 * set later on in this function. 11613 */ 11614 if (ipif->ipif_id == 0 && ipif->ipif_isv6 && 11615 (flags & IFF_UP) && !zero_source && 11616 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) { 11617 if (ipif_cant_setlinklocal(ipif)) 11618 return (EINVAL); 11619 set_linklocal = B_TRUE; 11620 } 11621 11622 /* 11623 * ILL cannot be part of a usesrc group and and IPMP group at the 11624 * same time. No need to grab ill_g_usesrc_lock here, see 11625 * synchronization notes in ip.c 11626 */ 11627 if (turn_on & PHYI_STANDBY && 11628 ipif->ipif_ill->ill_usesrc_grp_next != NULL) { 11629 return (EINVAL); 11630 } 11631 11632 /* 11633 * If we modify physical interface flags, we'll potentially need to 11634 * send up two routing socket messages for the changes (one for the 11635 * IPv4 ill, and another for the IPv6 ill). Note that here. 11636 */ 11637 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS) 11638 phyint_flags_modified = B_TRUE; 11639 11640 /* 11641 * If we are setting or clearing FAILED or STANDBY or OFFLINE, 11642 * we need to flush the IRE_CACHES belonging to this ill. 11643 * We handle this case here without doing the DOWN/UP dance 11644 * like it is done for other flags. If some other flags are 11645 * being turned on/off with FAILED/STANDBY/OFFLINE, the code 11646 * below will handle it by bringing it down and then 11647 * bringing it UP. 11648 */ 11649 if ((turn_on|turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE)) { 11650 ill_t *ill_v4, *ill_v6; 11651 11652 ill_v4 = phyi->phyint_illv4; 11653 ill_v6 = phyi->phyint_illv6; 11654 11655 /* 11656 * First set the INACTIVE flag if needed. Then delete the ires. 11657 * ire_add will atomically prevent creating new IRE_CACHEs 11658 * unless hidden flag is set. 11659 * PHYI_FAILED and PHYI_INACTIVE are exclusive 11660 */ 11661 if ((turn_on & PHYI_FAILED) && 11662 ((intf_flags & PHYI_STANDBY) || !ipmp_enable_failback)) { 11663 /* Reset PHYI_INACTIVE when PHYI_FAILED is being set */ 11664 phyi->phyint_flags &= ~PHYI_INACTIVE; 11665 } 11666 if ((turn_off & PHYI_FAILED) && 11667 ((intf_flags & PHYI_STANDBY) || 11668 (!ipmp_enable_failback && ill_is_inactive(ill)))) { 11669 phyint_inactive(phyi); 11670 } 11671 11672 if (turn_on & PHYI_STANDBY) { 11673 /* 11674 * We implicitly set INACTIVE only when STANDBY is set. 11675 * INACTIVE is also set on non-STANDBY phyint when user 11676 * disables FAILBACK using configuration file. 11677 * Do not allow STANDBY to be set on such INACTIVE 11678 * phyint 11679 */ 11680 if (phyi->phyint_flags & PHYI_INACTIVE) 11681 return (EINVAL); 11682 if (!(phyi->phyint_flags & PHYI_FAILED)) 11683 phyint_inactive(phyi); 11684 } 11685 if (turn_off & PHYI_STANDBY) { 11686 if (ipmp_enable_failback) { 11687 /* 11688 * Reset PHYI_INACTIVE. 
11689 */ 11690 phyi->phyint_flags &= ~PHYI_INACTIVE; 11691 } else if (ill_is_inactive(ill) && 11692 !(phyi->phyint_flags & PHYI_FAILED)) { 11693 /* 11694 * Need to set INACTIVE, when user sets 11695 * STANDBY on a non-STANDBY phyint and 11696 * later resets STANDBY 11697 */ 11698 phyint_inactive(phyi); 11699 } 11700 } 11701 /* 11702 * We should always send up a message so that the 11703 * daemons come to know of it. Note that the zeroth 11704 * interface can be down and the check below for IPIF_UP 11705 * will not make sense as we are actually setting 11706 * a phyint flag here. We assume that the ipif used 11707 * is always the zeroth ipif. (ip_rts_ifmsg does not 11708 * send up any message for non-zero ipifs). 11709 */ 11710 phyint_flags_modified = B_TRUE; 11711 11712 if (ill_v4 != NULL) { 11713 ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, 11714 IRE_CACHE, ill_stq_cache_delete, 11715 (char *)ill_v4, ill_v4); 11716 illgrp_reset_schednext(ill_v4); 11717 } 11718 if (ill_v6 != NULL) { 11719 ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, 11720 IRE_CACHE, ill_stq_cache_delete, 11721 (char *)ill_v6, ill_v6); 11722 illgrp_reset_schednext(ill_v6); 11723 } 11724 } 11725 11726 /* 11727 * If ILLF_ROUTER changes, we need to change the ip forwarding 11728 * status of the interface and, if the interface is part of an IPMP 11729 * group, all other interfaces that are part of the same IPMP 11730 * group. 11731 */ 11732 if ((turn_on | turn_off) & ILLF_ROUTER) { 11733 (void) ill_forward_set(q, mp, ((turn_on & ILLF_ROUTER) != 0), 11734 (caddr_t)ill); 11735 } 11736 11737 /* 11738 * If the interface is not UP and we are not going to 11739 * bring it UP, record the flags and return. When the 11740 * interface comes UP later, the right actions will be 11741 * taken. 11742 */ 11743 if (!(ipif->ipif_flags & IPIF_UP) && 11744 !(turn_on & IPIF_UP)) { 11745 /* Record new flags in their respective places. */ 11746 mutex_enter(&ill->ill_lock); 11747 mutex_enter(&ill->ill_phyint->phyint_lock); 11748 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS); 11749 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS); 11750 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS); 11751 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS); 11752 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS); 11753 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS); 11754 mutex_exit(&ill->ill_lock); 11755 mutex_exit(&ill->ill_phyint->phyint_lock); 11756 11757 /* 11758 * We do the broadcast and nomination here rather 11759 * than waiting for a FAILOVER/FAILBACK to happen. In 11760 * the case of FAILBACK from INACTIVE standby to the 11761 * interface that has been repaired, PHYI_FAILED has not 11762 * been cleared yet. If there are only two interfaces in 11763 * that group, all we have is a FAILED and INACTIVE 11764 * interface. If we do the nomination soon after a failback, 11765 * the broadcast nomination code would select the 11766 * INACTIVE interface for receiving broadcasts as FAILED is 11767 * not yet cleared. As we don't want STANDBY/INACTIVE to 11768 * receive broadcast packets, we need to redo nomination 11769 * when the FAILED is cleared here. Thus, in general we 11770 * always do the nomination here for FAILED, STANDBY 11771 * and OFFLINE. 
11772 */
11773 		if (((turn_on | turn_off) &
11774 		    (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE))) {
11775 			ip_redo_nomination(phyi);
11776 		}
11777 		if (phyint_flags_modified) {
11778 			if (phyi->phyint_illv4 != NULL) {
11779 				ip_rts_ifmsg(phyi->phyint_illv4->
11780 				    ill_ipif);
11781 			}
11782 			if (phyi->phyint_illv6 != NULL) {
11783 				ip_rts_ifmsg(phyi->phyint_illv6->
11784 				    ill_ipif);
11785 			}
11786 		}
11787 		return (0);
11788 	} else if (set_linklocal || zero_source) {
11789 		mutex_enter(&ill->ill_lock);
11790 		if (set_linklocal)
11791 			ipif->ipif_state_flags |= IPIF_SET_LINKLOCAL;
11792 		if (zero_source)
11793 			ipif->ipif_state_flags |= IPIF_ZERO_SOURCE;
11794 		mutex_exit(&ill->ill_lock);
11795 	}
11796
11797 	/*
11798 	 * Disallow IPv6 interfaces coming up that have the unspecified address,
11799 	 * or point-to-point interfaces with an unspecified destination. We do
11800 	 * allow the address to be unspecified for IPIF_NOLOCAL interfaces that
11801 	 * have a subnet assigned, which is how in.ndpd currently manages its
11802 	 * onlink prefix list when no addresses are configured with those
11803 	 * prefixes.
11804 	 */
11805 	if (ipif->ipif_isv6 &&
11806 	    ((IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) &&
11807 	    (!(ipif->ipif_flags & IPIF_NOLOCAL) && !(turn_on & IPIF_NOLOCAL) ||
11808 	    IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) ||
11809 	    ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
11810 	    IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr)))) {
11811 		return (EINVAL);
11812 	}
11813
11814 	/*
11815 	 * Prevent IPv4 point-to-point interfaces with a 0.0.0.0 destination
11816 	 * from being brought up.
11817 	 */
11818 	if (!ipif->ipif_isv6 &&
11819 	    ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
11820 	    ipif->ipif_pp_dst_addr == INADDR_ANY)) {
11821 		return (EINVAL);
11822 	}
11823
11824 	/*
11825 	 * The only flag changes that we currently take specific action on
11826 	 * are IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL,
11827 	 * ILLF_NOARP, ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, and
11828 	 * IPIF_PREFERRED. This is done by bringing the ipif down, changing
11829 	 * the flags and bringing it back up again.
11830 	 */
11831 	if ((turn_on|turn_off) &
11832 	    (IPIF_UP|IPIF_DEPRECATED|IPIF_NOXMIT|IPIF_NOLOCAL|ILLF_NOARP|
11833 	    ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED)) {
11834 		/*
11835 		 * Since we are taking this ipif down, make sure we have
11836 		 * valid net and subnet bcast ire's for other
11837 		 * logical interfaces, if we need them.
11838 */ 11839 if (!ipif->ipif_isv6) 11840 ipif_check_bcast_ires(ipif); 11841 11842 if (((ipif->ipif_flags | turn_on) & IPIF_UP) && 11843 !(turn_off & IPIF_UP)) { 11844 need_up = B_TRUE; 11845 if (ipif->ipif_flags & IPIF_UP) 11846 ill->ill_logical_down = 1; 11847 turn_on &= ~IPIF_UP; 11848 } 11849 err = ipif_down(ipif, q, mp); 11850 ip1dbg(("ipif_down returns %d err ", err)); 11851 if (err == EINPROGRESS) 11852 return (err); 11853 ipif_down_tail(ipif); 11854 } 11855 return (ip_sioctl_flags_tail(ipif, flags, q, mp, need_up)); 11856 } 11857 11858 static int 11859 ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp, 11860 boolean_t need_up) 11861 { 11862 ill_t *ill; 11863 phyint_t *phyi; 11864 uint64_t turn_on; 11865 uint64_t turn_off; 11866 uint64_t intf_flags; 11867 boolean_t phyint_flags_modified = B_FALSE; 11868 int err = 0; 11869 boolean_t set_linklocal = B_FALSE; 11870 boolean_t zero_source = B_FALSE; 11871 11872 ip1dbg(("ip_sioctl_flags_tail(%s:%u)\n", 11873 ipif->ipif_ill->ill_name, ipif->ipif_id)); 11874 11875 ASSERT(IAM_WRITER_IPIF(ipif)); 11876 11877 ill = ipif->ipif_ill; 11878 phyi = ill->ill_phyint; 11879 11880 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags; 11881 turn_on = (flags ^ intf_flags) & ~(IFF_CANTCHANGE | IFF_UP); 11882 11883 turn_off = intf_flags & turn_on; 11884 turn_on ^= turn_off; 11885 11886 if ((turn_on|turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE)) 11887 phyint_flags_modified = B_TRUE; 11888 11889 /* 11890 * Now we change the flags. Track current value of 11891 * other flags in their respective places. 11892 */ 11893 mutex_enter(&ill->ill_lock); 11894 mutex_enter(&phyi->phyint_lock); 11895 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS); 11896 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS); 11897 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS); 11898 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS); 11899 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS); 11900 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS); 11901 if (ipif->ipif_state_flags & IPIF_SET_LINKLOCAL) { 11902 set_linklocal = B_TRUE; 11903 ipif->ipif_state_flags &= ~IPIF_SET_LINKLOCAL; 11904 } 11905 if (ipif->ipif_state_flags & IPIF_ZERO_SOURCE) { 11906 zero_source = B_TRUE; 11907 ipif->ipif_state_flags &= ~IPIF_ZERO_SOURCE; 11908 } 11909 mutex_exit(&ill->ill_lock); 11910 mutex_exit(&phyi->phyint_lock); 11911 11912 if (((turn_on | turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE))) 11913 ip_redo_nomination(phyi); 11914 11915 if (set_linklocal) 11916 (void) ipif_setlinklocal(ipif); 11917 11918 if (zero_source) 11919 ipif->ipif_v6src_addr = ipv6_all_zeros; 11920 else 11921 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; 11922 11923 if (need_up) { 11924 /* 11925 * XXX ipif_up really does not know whether a phyint flags 11926 * was modified or not. So, it sends up information on 11927 * only one routing sockets message. As we don't bring up 11928 * the interface and also set STANDBY/FAILED simultaneously 11929 * it should be okay. 11930 */ 11931 err = ipif_up(ipif, q, mp); 11932 } else { 11933 /* 11934 * Make sure routing socket sees all changes to the flags. 11935 * ipif_up_done* handles this when we use ipif_up. 
11936 */
11937 		if (phyint_flags_modified) {
11938 			if (phyi->phyint_illv4 != NULL) {
11939 				ip_rts_ifmsg(phyi->phyint_illv4->
11940 				    ill_ipif);
11941 			}
11942 			if (phyi->phyint_illv6 != NULL) {
11943 				ip_rts_ifmsg(phyi->phyint_illv6->
11944 				    ill_ipif);
11945 			}
11946 		} else {
11947 			ip_rts_ifmsg(ipif);
11948 		}
11949 	}
11950 	return (err);
11951 }
11952
11953 /*
11954  * Restart entry point to restart the flags set operation after the
11955  * refcounts have dropped to zero.
11956  */
11957 /* ARGSUSED */
11958 int
11959 ip_sioctl_flags_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11960     ip_ioctl_cmd_t *ipip, void *if_req)
11961 {
11962 	int err;
11963 	struct ifreq *ifr = (struct ifreq *)if_req;
11964 	struct lifreq *lifr = (struct lifreq *)if_req;
11965
11966 	ip1dbg(("ip_sioctl_flags_restart(%s:%u %p)\n",
11967 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11968
11969 	ipif_down_tail(ipif);
11970 	if (ipip->ipi_cmd_type == IF_CMD) {
11971 		/*
11972 		 * Since ip_sioctl_flags_tail expects a uint64_t and
11973 		 * ifr_flags is a short, we mask ifr_flags before
11974 		 * widening it, to avoid having sign extension cause
11975 		 * bits to get set that should not be.
11976 		 */
11977 		err = ip_sioctl_flags_tail(ipif,
11978 		    (uint64_t)(ifr->ifr_flags & 0x0000ffff),
11979 		    q, mp, B_TRUE);
11980 	} else {
11981 		err = ip_sioctl_flags_tail(ipif, lifr->lifr_flags,
11982 		    q, mp, B_TRUE);
11983 	}
11984 	return (err);
11985 }
11986
11987 /* ARGSUSED */
11988 int
11989 ip_sioctl_get_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11990     ip_ioctl_cmd_t *ipip, void *if_req)
11991 {
11992 	/*
11993 	 * Have the flags been set correctly until now?
11994 	 */
11995 	ill_t *ill = ipif->ipif_ill;
11996 	phyint_t *phyi = ill->ill_phyint;
11997
11998 	ip1dbg(("ip_sioctl_get_flags(%s:%u %p)\n",
11999 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12000 	ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0);
12001 	ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0);
12002 	ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0);
12003
12004 	/*
12005 	 * Need a lock since some flags can be set even when there are
12006 	 * references to the ipif.
12007 	 */
12008 	mutex_enter(&ill->ill_lock);
12009 	if (ipip->ipi_cmd_type == IF_CMD) {
12010 		struct ifreq *ifr = (struct ifreq *)if_req;
12011
12012 		/* Get interface flags (low 16 only). */
12013 		ifr->ifr_flags = ((ipif->ipif_flags |
12014 		    ill->ill_flags | phyi->phyint_flags) & 0xffff);
12015 	} else {
12016 		struct lifreq *lifr = (struct lifreq *)if_req;
12017
12018 		/* Get interface flags. */
12019 		lifr->lifr_flags = ipif->ipif_flags |
12020 		    ill->ill_flags | phyi->phyint_flags;
12021 	}
12022 	mutex_exit(&ill->ill_lock);
12023 	return (0);
12024 }
12025
12026 /* ARGSUSED */
12027 int
12028 ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12029     ip_ioctl_cmd_t *ipip, void *if_req)
12030 {
12031 	int mtu;
12032 	int ip_min_mtu;
12033 	struct ifreq *ifr;
12034 	struct lifreq *lifr;
12035 	ire_t *ire;
12036
12037 	ip1dbg(("ip_sioctl_mtu(%s:%u %p)\n", ipif->ipif_ill->ill_name,
12038 	    ipif->ipif_id, (void *)ipif));
12039 	if (ipip->ipi_cmd_type == IF_CMD) {
12040 		ifr = (struct ifreq *)if_req;
12041 		mtu = ifr->ifr_metric;
12042 	} else {
12043 		lifr = (struct lifreq *)if_req;
12044 		mtu = lifr->lifr_mtu;
12045 	}
12046
12047 	if (ipif->ipif_isv6)
12048 		ip_min_mtu = IPV6_MIN_MTU;
12049 	else
12050 		ip_min_mtu = IP_MIN_MTU;
12051
12052 	if (mtu > ipif->ipif_ill->ill_max_frag || mtu < ip_min_mtu)
12053 		return (EINVAL);
12054
12055 	/*
12056 	 * Change the MTU size in all relevant ire's.
12057 * Mtu change Vs. new ire creation - protocol below. 12058 * First change ipif_mtu and the ire_max_frag of the 12059 * interface ire. Then do an ire walk and change the 12060 * ire_max_frag of all affected ires. During ire_add 12061 * under the bucket lock, set the ire_max_frag of the 12062 * new ire being created from the ipif/ire from which 12063 * it is being derived. If an mtu change happens after 12064 * the ire is added, the new ire will be cleaned up. 12065 * Conversely if the mtu change happens before the ire 12066 * is added, ire_add will see the new value of the mtu. 12067 */ 12068 ipif->ipif_mtu = mtu; 12069 ipif->ipif_flags |= IPIF_FIXEDMTU; 12070 12071 if (ipif->ipif_isv6) 12072 ire = ipif_to_ire_v6(ipif); 12073 else 12074 ire = ipif_to_ire(ipif); 12075 if (ire != NULL) { 12076 ire->ire_max_frag = ipif->ipif_mtu; 12077 ire_refrele(ire); 12078 } 12079 if (ipif->ipif_flags & IPIF_UP) { 12080 if (ipif->ipif_isv6) 12081 ire_walk_v6(ipif_mtu_change, (char *)ipif, ALL_ZONES); 12082 else 12083 ire_walk_v4(ipif_mtu_change, (char *)ipif, ALL_ZONES); 12084 } 12085 /* Update the MTU in SCTP's list */ 12086 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 12087 return (0); 12088 } 12089 12090 /* Get interface MTU. */ 12091 /* ARGSUSED */ 12092 int 12093 ip_sioctl_get_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12094 ip_ioctl_cmd_t *ipip, void *if_req) 12095 { 12096 struct ifreq *ifr; 12097 struct lifreq *lifr; 12098 12099 ip1dbg(("ip_sioctl_get_mtu(%s:%u %p)\n", 12100 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12101 if (ipip->ipi_cmd_type == IF_CMD) { 12102 ifr = (struct ifreq *)if_req; 12103 ifr->ifr_metric = ipif->ipif_mtu; 12104 } else { 12105 lifr = (struct lifreq *)if_req; 12106 lifr->lifr_mtu = ipif->ipif_mtu; 12107 } 12108 return (0); 12109 } 12110 12111 /* Set interface broadcast address. */ 12112 /* ARGSUSED2 */ 12113 int 12114 ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12115 ip_ioctl_cmd_t *ipip, void *if_req) 12116 { 12117 ipaddr_t addr; 12118 ire_t *ire; 12119 12120 ip1dbg(("ip_sioctl_brdaddr(%s:%u)\n", ipif->ipif_ill->ill_name, 12121 ipif->ipif_id)); 12122 12123 ASSERT(IAM_WRITER_IPIF(ipif)); 12124 if (!(ipif->ipif_flags & IPIF_BROADCAST)) 12125 return (EADDRNOTAVAIL); 12126 12127 ASSERT(!(ipif->ipif_isv6)); /* No IPv6 broadcast */ 12128 12129 if (sin->sin_family != AF_INET) 12130 return (EAFNOSUPPORT); 12131 12132 addr = sin->sin_addr.s_addr; 12133 if (ipif->ipif_flags & IPIF_UP) { 12134 /* 12135 * If we are already up, make sure the new 12136 * broadcast address makes sense. If it does, 12137 * there should be an IRE for it already. 12138 * Don't match on ipif, only on the ill 12139 * since we are sharing these now. Don't use 12140 * MATCH_IRE_ILL_GROUP as we are looking for 12141 * the broadcast ire on this ill and each ill 12142 * in the group has its own broadcast ire. 12143 */ 12144 ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, 12145 ipif, ALL_ZONES, NULL, 12146 (MATCH_IRE_ILL | MATCH_IRE_TYPE)); 12147 if (ire == NULL) { 12148 return (EINVAL); 12149 } else { 12150 ire_refrele(ire); 12151 } 12152 } 12153 /* 12154 * Changing the broadcast addr for this ipif. 12155 * Make sure we have valid net and subnet bcast 12156 * ire's for other logical interfaces, if needed. 12157 */ 12158 if (addr != ipif->ipif_brd_addr) 12159 ipif_check_bcast_ires(ipif); 12160 IN6_IPADDR_TO_V4MAPPED(addr, &ipif->ipif_v6brd_addr); 12161 return (0); 12162 } 12163 12164 /* Get interface broadcast address. 
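 *
 * A hypothetical userland sketch (not part of this file) of how the
 * get handler below is typically reached; the socket fd "s", the
 * interface name and the error handling are all illustrative:
 *
 *	struct lifreq lifr;
 *	(void) strlcpy(lifr.lifr_name, "hme0", sizeof (lifr.lifr_name));
 *	if (ioctl(s, SIOCGLIFBRDADDR, (caddr_t)&lifr) == 0) {
 *		struct sockaddr_in *bsin =
 *		    (struct sockaddr_in *)&lifr.lifr_broadaddr;
 *		... bsin->sin_addr now holds the broadcast address ...
 *	}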
*/ 12165 /* ARGSUSED */ 12166 int 12167 ip_sioctl_get_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12168 ip_ioctl_cmd_t *ipip, void *if_req) 12169 { 12170 ip1dbg(("ip_sioctl_get_brdaddr(%s:%u %p)\n", 12171 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12172 if (!(ipif->ipif_flags & IPIF_BROADCAST)) 12173 return (EADDRNOTAVAIL); 12174 12175 /* IPIF_BROADCAST not possible with IPv6 */ 12176 ASSERT(!ipif->ipif_isv6); 12177 *sin = sin_null; 12178 sin->sin_family = AF_INET; 12179 sin->sin_addr.s_addr = ipif->ipif_brd_addr; 12180 return (0); 12181 } 12182 12183 /* 12184 * This routine is called to handle the SIOCS*IFNETMASK IOCTL. 12185 */ 12186 /* ARGSUSED */ 12187 int 12188 ip_sioctl_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12189 ip_ioctl_cmd_t *ipip, void *if_req) 12190 { 12191 int err = 0; 12192 in6_addr_t v6mask; 12193 12194 ip1dbg(("ip_sioctl_netmask(%s:%u %p)\n", 12195 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12196 12197 ASSERT(IAM_WRITER_IPIF(ipif)); 12198 12199 if (ipif->ipif_isv6) { 12200 sin6_t *sin6; 12201 12202 if (sin->sin_family != AF_INET6) 12203 return (EAFNOSUPPORT); 12204 12205 sin6 = (sin6_t *)sin; 12206 v6mask = sin6->sin6_addr; 12207 } else { 12208 ipaddr_t mask; 12209 12210 if (sin->sin_family != AF_INET) 12211 return (EAFNOSUPPORT); 12212 12213 mask = sin->sin_addr.s_addr; 12214 V4MASK_TO_V6(mask, v6mask); 12215 } 12216 12217 /* 12218 * No big deal if the interface isn't already up, or the mask 12219 * isn't really changing, or this is pt-pt. 12220 */ 12221 if (!(ipif->ipif_flags & IPIF_UP) || 12222 IN6_ARE_ADDR_EQUAL(&v6mask, &ipif->ipif_v6net_mask) || 12223 (ipif->ipif_flags & IPIF_POINTOPOINT)) { 12224 ipif->ipif_v6net_mask = v6mask; 12225 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 12226 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 12227 ipif->ipif_v6net_mask, 12228 ipif->ipif_v6subnet); 12229 } 12230 return (0); 12231 } 12232 /* 12233 * Make sure we have valid net and subnet broadcast ire's 12234 * for the old netmask, if needed by other logical interfaces. 12235 */ 12236 if (!ipif->ipif_isv6) 12237 ipif_check_bcast_ires(ipif); 12238 12239 err = ipif_logical_down(ipif, q, mp); 12240 if (err == EINPROGRESS) 12241 return (err); 12242 ipif_down_tail(ipif); 12243 err = ip_sioctl_netmask_tail(ipif, sin, q, mp); 12244 return (err); 12245 } 12246 12247 static int 12248 ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp) 12249 { 12250 in6_addr_t v6mask; 12251 int err = 0; 12252 12253 ip1dbg(("ip_sioctl_netmask_tail(%s:%u %p)\n", 12254 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12255 12256 if (ipif->ipif_isv6) { 12257 sin6_t *sin6; 12258 12259 sin6 = (sin6_t *)sin; 12260 v6mask = sin6->sin6_addr; 12261 } else { 12262 ipaddr_t mask; 12263 12264 mask = sin->sin_addr.s_addr; 12265 V4MASK_TO_V6(mask, v6mask); 12266 } 12267 12268 ipif->ipif_v6net_mask = v6mask; 12269 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 12270 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 12271 ipif->ipif_v6subnet); 12272 } 12273 err = ipif_up(ipif, q, mp); 12274 12275 if (err == 0 || err == EINPROGRESS) { 12276 /* 12277 * The interface must be DL_BOUND if this packet has to 12278 * go out on the wire. Since we only go through a logical 12279 * down and are bound with the driver during an internal 12280 * down/up that is satisfied. 12281 */ 12282 if (!ipif->ipif_isv6 && ipif->ipif_ill->ill_wq != NULL) { 12283 /* Potentially broadcast an address mask reply. 
*/ 12284 ipif_mask_reply(ipif); 12285 } 12286 } 12287 return (err); 12288 } 12289 12290 /* ARGSUSED */ 12291 int 12292 ip_sioctl_netmask_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12293 ip_ioctl_cmd_t *ipip, void *if_req) 12294 { 12295 ip1dbg(("ip_sioctl_netmask_restart(%s:%u %p)\n", 12296 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12297 ipif_down_tail(ipif); 12298 return (ip_sioctl_netmask_tail(ipif, sin, q, mp)); 12299 } 12300 12301 /* Get interface net mask. */ 12302 /* ARGSUSED */ 12303 int 12304 ip_sioctl_get_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12305 ip_ioctl_cmd_t *ipip, void *if_req) 12306 { 12307 struct lifreq *lifr = (struct lifreq *)if_req; 12308 struct sockaddr_in6 *sin6 = (sin6_t *)sin; 12309 12310 ip1dbg(("ip_sioctl_get_netmask(%s:%u %p)\n", 12311 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12312 12313 /* 12314 * net mask can't change since we have a reference to the ipif. 12315 */ 12316 if (ipif->ipif_isv6) { 12317 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 12318 *sin6 = sin6_null; 12319 sin6->sin6_family = AF_INET6; 12320 sin6->sin6_addr = ipif->ipif_v6net_mask; 12321 lifr->lifr_addrlen = 12322 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 12323 } else { 12324 *sin = sin_null; 12325 sin->sin_family = AF_INET; 12326 sin->sin_addr.s_addr = ipif->ipif_net_mask; 12327 if (ipip->ipi_cmd_type == LIF_CMD) { 12328 lifr->lifr_addrlen = 12329 ip_mask_to_plen(ipif->ipif_net_mask); 12330 } 12331 } 12332 return (0); 12333 } 12334 12335 /* ARGSUSED */ 12336 int 12337 ip_sioctl_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12338 ip_ioctl_cmd_t *ipip, void *if_req) 12339 { 12340 12341 ip1dbg(("ip_sioctl_metric(%s:%u %p)\n", 12342 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12343 /* 12344 * Set interface metric. We don't use this for 12345 * anything but we keep track of it in case it is 12346 * important to routing applications or such. 12347 */ 12348 if (ipip->ipi_cmd_type == IF_CMD) { 12349 struct ifreq *ifr; 12350 12351 ifr = (struct ifreq *)if_req; 12352 ipif->ipif_metric = ifr->ifr_metric; 12353 } else { 12354 struct lifreq *lifr; 12355 12356 lifr = (struct lifreq *)if_req; 12357 ipif->ipif_metric = lifr->lifr_metric; 12358 } 12359 return (0); 12360 } 12361 12362 12363 /* ARGSUSED */ 12364 int 12365 ip_sioctl_get_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12366 ip_ioctl_cmd_t *ipip, void *if_req) 12367 { 12368 12369 /* Get interface metric. */ 12370 ip1dbg(("ip_sioctl_get_metric(%s:%u %p)\n", 12371 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12372 if (ipip->ipi_cmd_type == IF_CMD) { 12373 struct ifreq *ifr; 12374 12375 ifr = (struct ifreq *)if_req; 12376 ifr->ifr_metric = ipif->ipif_metric; 12377 } else { 12378 struct lifreq *lifr; 12379 12380 lifr = (struct lifreq *)if_req; 12381 lifr->lifr_metric = ipif->ipif_metric; 12382 } 12383 12384 return (0); 12385 } 12386 12387 /* ARGSUSED */ 12388 int 12389 ip_sioctl_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12390 ip_ioctl_cmd_t *ipip, void *if_req) 12391 { 12392 12393 ip1dbg(("ip_sioctl_muxid(%s:%u %p)\n", 12394 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12395 /* 12396 * Set the muxid returned from I_PLINK. 
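 *
 * For orientation, a hedged sketch of the userland sequence that
 * normally precedes this ioctl (the details are ifconfig's business,
 * and the variable names here are made up):
 *
 *	muxid = ioctl(ip_fd, I_PLINK, dev_fd);
 *	lifr.lifr_ip_muxid = muxid;
 *	(void) ioctl(ip_fd, SIOCSLIFMUXID, (caddr_t)&lifr);
 *
 * The saved muxids are later handed back by SIOCGLIFMUXID so the
 * links can be dismantled with I_PUNLINK.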
12397 */
12398 	if (ipip->ipi_cmd_type == IF_CMD) {
12399 		struct ifreq *ifr = (struct ifreq *)if_req;
12400
12401 		ipif->ipif_ill->ill_ip_muxid = ifr->ifr_ip_muxid;
12402 		ipif->ipif_ill->ill_arp_muxid = ifr->ifr_arp_muxid;
12403 	} else {
12404 		struct lifreq *lifr = (struct lifreq *)if_req;
12405
12406 		ipif->ipif_ill->ill_ip_muxid = lifr->lifr_ip_muxid;
12407 		ipif->ipif_ill->ill_arp_muxid = lifr->lifr_arp_muxid;
12408 	}
12409 	return (0);
12410 }
12411
12412 /* ARGSUSED */
12413 int
12414 ip_sioctl_get_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12415     ip_ioctl_cmd_t *ipip, void *if_req)
12416 {
12417
12418 	ip1dbg(("ip_sioctl_get_muxid(%s:%u %p)\n",
12419 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12420 	/*
12421 	 * Get the muxid saved in ill for I_PUNLINK.
12422 	 */
12423 	if (ipip->ipi_cmd_type == IF_CMD) {
12424 		struct ifreq *ifr = (struct ifreq *)if_req;
12425
12426 		ifr->ifr_ip_muxid = ipif->ipif_ill->ill_ip_muxid;
12427 		ifr->ifr_arp_muxid = ipif->ipif_ill->ill_arp_muxid;
12428 	} else {
12429 		struct lifreq *lifr = (struct lifreq *)if_req;
12430
12431 		lifr->lifr_ip_muxid = ipif->ipif_ill->ill_ip_muxid;
12432 		lifr->lifr_arp_muxid = ipif->ipif_ill->ill_arp_muxid;
12433 	}
12434 	return (0);
12435 }
12436
12437 /*
12438  * Set the subnet prefix. Does not modify the broadcast address.
12439  */
12440 /* ARGSUSED */
12441 int
12442 ip_sioctl_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12443     ip_ioctl_cmd_t *ipip, void *if_req)
12444 {
12445 	int err = 0;
12446 	in6_addr_t v6addr;
12447 	in6_addr_t v6mask;
12448 	boolean_t need_up = B_FALSE;
12449 	int addrlen;
12450
12451 	ip1dbg(("ip_sioctl_subnet(%s:%u %p)\n",
12452 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12453
12454 	ASSERT(IAM_WRITER_IPIF(ipif));
12455 	addrlen = ((struct lifreq *)if_req)->lifr_addrlen;
12456
12457 	if (ipif->ipif_isv6) {
12458 		sin6_t *sin6;
12459
12460 		if (sin->sin_family != AF_INET6)
12461 			return (EAFNOSUPPORT);
12462
12463 		sin6 = (sin6_t *)sin;
12464 		v6addr = sin6->sin6_addr;
12465 		if (!ip_remote_addr_ok_v6(&v6addr, &ipv6_all_ones))
12466 			return (EADDRNOTAVAIL);
12467 	} else {
12468 		ipaddr_t addr;
12469
12470 		if (sin->sin_family != AF_INET)
12471 			return (EAFNOSUPPORT);
12472
12473 		addr = sin->sin_addr.s_addr;
12474 		if (!ip_addr_ok_v4(addr, 0xFFFFFFFF))
12475 			return (EADDRNOTAVAIL);
12476 		IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
12477 		/* Add 96 bits */
12478 		addrlen += IPV6_ABITS - IP_ABITS;
12479 	}
12480
12481 	if (ip_plen_to_mask_v6(addrlen, &v6mask) == NULL)
12482 		return (EINVAL);
12483
12484 	/* Check if bits in the address are set past the mask */
12485 	if (!V6_MASK_EQ(v6addr, v6mask, v6addr))
12486 		return (EINVAL);
12487
12488 	if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6subnet, &v6addr) &&
12489 	    IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6net_mask, &v6mask))
12490 		return (0);	/* No change */
12491
12492 	if (ipif->ipif_flags & IPIF_UP) {
12493 		/*
12494 		 * If the interface is already marked up,
12495 		 * we call ipif_down which will take care
12496 		 * of ditching any IREs that have been set
12497 		 * up based on the old interface address.
12498 */ 12499 err = ipif_logical_down(ipif, q, mp); 12500 if (err == EINPROGRESS) 12501 return (err); 12502 ipif_down_tail(ipif); 12503 need_up = B_TRUE; 12504 } 12505 12506 err = ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, need_up); 12507 return (err); 12508 } 12509 12510 static int 12511 ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t v6addr, in6_addr_t v6mask, 12512 queue_t *q, mblk_t *mp, boolean_t need_up) 12513 { 12514 ill_t *ill = ipif->ipif_ill; 12515 int err = 0; 12516 12517 ip1dbg(("ip_sioctl_subnet_tail(%s:%u %p)\n", 12518 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12519 12520 /* Set the new address. */ 12521 mutex_enter(&ill->ill_lock); 12522 ipif->ipif_v6net_mask = v6mask; 12523 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 12524 V6_MASK_COPY(v6addr, ipif->ipif_v6net_mask, 12525 ipif->ipif_v6subnet); 12526 } 12527 mutex_exit(&ill->ill_lock); 12528 12529 if (need_up) { 12530 /* 12531 * Now bring the interface back up. If this 12532 * is the only IPIF for the ILL, ipif_up 12533 * will have to re-bind to the device, so 12534 * we may get back EINPROGRESS, in which 12535 * case, this IOCTL will get completed in 12536 * ip_rput_dlpi when we see the DL_BIND_ACK. 12537 */ 12538 err = ipif_up(ipif, q, mp); 12539 if (err == EINPROGRESS) 12540 return (err); 12541 } 12542 return (err); 12543 } 12544 12545 /* ARGSUSED */ 12546 int 12547 ip_sioctl_subnet_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12548 ip_ioctl_cmd_t *ipip, void *if_req) 12549 { 12550 int addrlen; 12551 in6_addr_t v6addr; 12552 in6_addr_t v6mask; 12553 struct lifreq *lifr = (struct lifreq *)if_req; 12554 12555 ip1dbg(("ip_sioctl_subnet_restart(%s:%u %p)\n", 12556 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12557 ipif_down_tail(ipif); 12558 12559 addrlen = lifr->lifr_addrlen; 12560 if (ipif->ipif_isv6) { 12561 sin6_t *sin6; 12562 12563 sin6 = (sin6_t *)sin; 12564 v6addr = sin6->sin6_addr; 12565 } else { 12566 ipaddr_t addr; 12567 12568 addr = sin->sin_addr.s_addr; 12569 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 12570 addrlen += IPV6_ABITS - IP_ABITS; 12571 } 12572 (void) ip_plen_to_mask_v6(addrlen, &v6mask); 12573 12574 return (ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, B_TRUE)); 12575 } 12576 12577 /* ARGSUSED */ 12578 int 12579 ip_sioctl_get_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12580 ip_ioctl_cmd_t *ipip, void *if_req) 12581 { 12582 struct lifreq *lifr = (struct lifreq *)if_req; 12583 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sin; 12584 12585 ip1dbg(("ip_sioctl_get_subnet(%s:%u %p)\n", 12586 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12587 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 12588 12589 if (ipif->ipif_isv6) { 12590 *sin6 = sin6_null; 12591 sin6->sin6_family = AF_INET6; 12592 sin6->sin6_addr = ipif->ipif_v6subnet; 12593 lifr->lifr_addrlen = 12594 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 12595 } else { 12596 *sin = sin_null; 12597 sin->sin_family = AF_INET; 12598 sin->sin_addr.s_addr = ipif->ipif_subnet; 12599 lifr->lifr_addrlen = ip_mask_to_plen(ipif->ipif_net_mask); 12600 } 12601 return (0); 12602 } 12603 12604 /* 12605 * Set the IPv6 address token. 
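 *
 * Worked example of the mask computation done below (the values are
 * illustrative): with lifr_addrlen = 64, ip_plen_to_mask_v6(128 - 64)
 * yields ffff:ffff:ffff:ffff::, and xor-ing each 32-bit word with
 * 0xffffffff inverts that to ::ffff:ffff:ffff:ffff, i.e. a mask that
 * selects the low 64 token bits of the supplied address.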
12606 */ 12607 /* ARGSUSED */ 12608 int 12609 ip_sioctl_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12610 ip_ioctl_cmd_t *ipi, void *if_req) 12611 { 12612 ill_t *ill = ipif->ipif_ill; 12613 int err; 12614 in6_addr_t v6addr; 12615 in6_addr_t v6mask; 12616 boolean_t need_up = B_FALSE; 12617 int i; 12618 sin6_t *sin6 = (sin6_t *)sin; 12619 struct lifreq *lifr = (struct lifreq *)if_req; 12620 int addrlen; 12621 12622 ip1dbg(("ip_sioctl_token(%s:%u %p)\n", 12623 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12624 ASSERT(IAM_WRITER_IPIF(ipif)); 12625 12626 addrlen = lifr->lifr_addrlen; 12627 /* Only allow for logical unit zero i.e. not on "le0:17" */ 12628 if (ipif->ipif_id != 0) 12629 return (EINVAL); 12630 12631 if (!ipif->ipif_isv6) 12632 return (EINVAL); 12633 12634 if (addrlen > IPV6_ABITS) 12635 return (EINVAL); 12636 12637 v6addr = sin6->sin6_addr; 12638 12639 /* 12640 * The length of the token is the length from the end. To get 12641 * the proper mask for this, compute the mask of the bits not 12642 * in the token; ie. the prefix, and then xor to get the mask. 12643 */ 12644 if (ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask) == NULL) 12645 return (EINVAL); 12646 for (i = 0; i < 4; i++) { 12647 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 12648 } 12649 12650 if (V6_MASK_EQ(v6addr, v6mask, ill->ill_token) && 12651 ill->ill_token_length == addrlen) 12652 return (0); /* No change */ 12653 12654 if (ipif->ipif_flags & IPIF_UP) { 12655 err = ipif_logical_down(ipif, q, mp); 12656 if (err == EINPROGRESS) 12657 return (err); 12658 ipif_down_tail(ipif); 12659 need_up = B_TRUE; 12660 } 12661 err = ip_sioctl_token_tail(ipif, sin6, addrlen, q, mp, need_up); 12662 return (err); 12663 } 12664 12665 static int 12666 ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, queue_t *q, 12667 mblk_t *mp, boolean_t need_up) 12668 { 12669 in6_addr_t v6addr; 12670 in6_addr_t v6mask; 12671 ill_t *ill = ipif->ipif_ill; 12672 int i; 12673 int err = 0; 12674 12675 ip1dbg(("ip_sioctl_token_tail(%s:%u %p)\n", 12676 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12677 v6addr = sin6->sin6_addr; 12678 /* 12679 * The length of the token is the length from the end. To get 12680 * the proper mask for this, compute the mask of the bits not 12681 * in the token; ie. the prefix, and then xor to get the mask. 12682 */ 12683 (void) ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask); 12684 for (i = 0; i < 4; i++) 12685 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 12686 12687 mutex_enter(&ill->ill_lock); 12688 V6_MASK_COPY(v6addr, v6mask, ill->ill_token); 12689 ill->ill_token_length = addrlen; 12690 mutex_exit(&ill->ill_lock); 12691 12692 if (need_up) { 12693 /* 12694 * Now bring the interface back up. If this 12695 * is the only IPIF for the ILL, ipif_up 12696 * will have to re-bind to the device, so 12697 * we may get back EINPROGRESS, in which 12698 * case, this IOCTL will get completed in 12699 * ip_rput_dlpi when we see the DL_BIND_ACK. 
12700 */
12701 		err = ipif_up(ipif, q, mp);
12702 		if (err == EINPROGRESS)
12703 			return (err);
12704 	}
12705 	return (err);
12706 }
12707
12708 /* ARGSUSED */
12709 int
12710 ip_sioctl_get_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12711     ip_ioctl_cmd_t *ipi, void *if_req)
12712 {
12713 	ill_t *ill;
12714 	sin6_t *sin6 = (sin6_t *)sin;
12715 	struct lifreq *lifr = (struct lifreq *)if_req;
12716
12717 	ip1dbg(("ip_sioctl_get_token(%s:%u %p)\n",
12718 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12719 	if (ipif->ipif_id != 0)
12720 		return (EINVAL);
12721
12722 	ill = ipif->ipif_ill;
12723 	if (!ill->ill_isv6)
12724 		return (ENXIO);
12725
12726 	*sin6 = sin6_null;
12727 	sin6->sin6_family = AF_INET6;
12728 	ASSERT(!IN6_IS_ADDR_V4MAPPED(&ill->ill_token));
12729 	sin6->sin6_addr = ill->ill_token;
12730 	lifr->lifr_addrlen = ill->ill_token_length;
12731 	return (0);
12732 }
12733
12734 /*
12735  * Set (hardware) link specific information that might override
12736  * what was acquired through the DL_INFO_ACK.
12737  * The logic is as follows.
12738  *
12739  * become exclusive
12740  * set CHANGING flag
12741  * change mtu on affected IREs
12742  * clear CHANGING flag
12743  *
12744  * An ire add that occurs before the CHANGING flag is set will have its mtu
12745  * changed by ip_sioctl_lnkinfo.
12746  *
12747  * During the time the CHANGING flag is set, no new ires will be added to the
12748  * bucket, and ire add will fail (due to the CHANGING flag).
12749  *
12750  * An ire add that occurs after the CHANGING flag is set will have the right mtu
12751  * before it is added to the bucket.
12752  *
12753  * Obviously only 1 thread can set the CHANGING flag and we need to become
12754  * exclusive to set the flag.
12755  */
12756 /* ARGSUSED */
12757 int
12758 ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12759     ip_ioctl_cmd_t *ipi, void *if_req)
12760 {
12761 	ill_t *ill = ipif->ipif_ill;
12762 	ipif_t *nipif;
12763 	int ip_min_mtu;
12764 	boolean_t mtu_walk = B_FALSE;
12765 	struct lifreq *lifr = (struct lifreq *)if_req;
12766 	lif_ifinfo_req_t *lir;
12767 	ire_t *ire;
12768
12769 	ip1dbg(("ip_sioctl_lnkinfo(%s:%u %p)\n",
12770 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12771 	lir = &lifr->lifr_ifinfo;
12772 	ASSERT(IAM_WRITER_IPIF(ipif));
12773
12774 	/* Only allow for logical unit zero i.e. not on "le0:17" */
12775 	if (ipif->ipif_id != 0)
12776 		return (EINVAL);
12777
12778 	/* Set interface MTU. */
12779 	if (ipif->ipif_isv6)
12780 		ip_min_mtu = IPV6_MIN_MTU;
12781 	else
12782 		ip_min_mtu = IP_MIN_MTU;
12783
12784 	/*
12785 	 * Verify values before we set anything. Allow zero to
12786 	 * mean unspecified.
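	 *
	 * As an illustration (the numbers are invented): on an ill whose
	 * ill_max_frag is 1500, lir_maxmtu = 1400 passes the checks
	 * below, 9000 fails the upper bound, and anything below
	 * ip_min_mtu (IPV6_MIN_MTU, 1280, on a v6 ill) fails the lower
	 * bound.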
12787 */ 12788 if (lir->lir_maxmtu != 0 && 12789 (lir->lir_maxmtu > ill->ill_max_frag || 12790 lir->lir_maxmtu < ip_min_mtu)) 12791 return (EINVAL); 12792 if (lir->lir_reachtime != 0 && 12793 lir->lir_reachtime > ND_MAX_REACHTIME) 12794 return (EINVAL); 12795 if (lir->lir_reachretrans != 0 && 12796 lir->lir_reachretrans > ND_MAX_REACHRETRANSTIME) 12797 return (EINVAL); 12798 12799 mutex_enter(&ill->ill_lock); 12800 ill->ill_state_flags |= ILL_CHANGING; 12801 for (nipif = ill->ill_ipif; nipif != NULL; 12802 nipif = nipif->ipif_next) { 12803 nipif->ipif_state_flags |= IPIF_CHANGING; 12804 } 12805 12806 mutex_exit(&ill->ill_lock); 12807 12808 if (lir->lir_maxmtu != 0) { 12809 ill->ill_max_mtu = lir->lir_maxmtu; 12810 ill->ill_mtu_userspecified = 1; 12811 mtu_walk = B_TRUE; 12812 } 12813 12814 if (lir->lir_reachtime != 0) 12815 ill->ill_reachable_time = lir->lir_reachtime; 12816 12817 if (lir->lir_reachretrans != 0) 12818 ill->ill_reachable_retrans_time = lir->lir_reachretrans; 12819 12820 ill->ill_max_hops = lir->lir_maxhops; 12821 12822 ill->ill_max_buf = ND_MAX_Q; 12823 12824 if (mtu_walk) { 12825 /* 12826 * Set the MTU on all ipifs associated with this ill except 12827 * for those whose MTU was fixed via SIOCSLIFMTU. 12828 */ 12829 for (nipif = ill->ill_ipif; nipif != NULL; 12830 nipif = nipif->ipif_next) { 12831 if (nipif->ipif_flags & IPIF_FIXEDMTU) 12832 continue; 12833 12834 nipif->ipif_mtu = ill->ill_max_mtu; 12835 12836 if (!(nipif->ipif_flags & IPIF_UP)) 12837 continue; 12838 12839 if (nipif->ipif_isv6) 12840 ire = ipif_to_ire_v6(nipif); 12841 else 12842 ire = ipif_to_ire(nipif); 12843 if (ire != NULL) { 12844 ire->ire_max_frag = ipif->ipif_mtu; 12845 ire_refrele(ire); 12846 } 12847 if (ill->ill_isv6) { 12848 ire_walk_ill_v6(MATCH_IRE_ILL, 0, 12849 ipif_mtu_change, (char *)nipif, 12850 ill); 12851 } else { 12852 ire_walk_ill_v4(MATCH_IRE_ILL, 0, 12853 ipif_mtu_change, (char *)nipif, 12854 ill); 12855 } 12856 } 12857 } 12858 12859 mutex_enter(&ill->ill_lock); 12860 for (nipif = ill->ill_ipif; nipif != NULL; 12861 nipif = nipif->ipif_next) { 12862 nipif->ipif_state_flags &= ~IPIF_CHANGING; 12863 } 12864 ILL_UNMARK_CHANGING(ill); 12865 mutex_exit(&ill->ill_lock); 12866 12867 return (0); 12868 } 12869 12870 /* ARGSUSED */ 12871 int 12872 ip_sioctl_get_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12873 ip_ioctl_cmd_t *ipi, void *if_req) 12874 { 12875 struct lif_ifinfo_req *lir; 12876 ill_t *ill = ipif->ipif_ill; 12877 12878 ip1dbg(("ip_sioctl_get_lnkinfo(%s:%u %p)\n", 12879 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12880 if (ipif->ipif_id != 0) 12881 return (EINVAL); 12882 12883 lir = &((struct lifreq *)if_req)->lifr_ifinfo; 12884 lir->lir_maxhops = ill->ill_max_hops; 12885 lir->lir_reachtime = ill->ill_reachable_time; 12886 lir->lir_reachretrans = ill->ill_reachable_retrans_time; 12887 lir->lir_maxmtu = ill->ill_max_mtu; 12888 12889 return (0); 12890 } 12891 12892 /* 12893 * Return best guess as to the subnet mask for the specified address. 12894 * Based on the subnet masks for all the configured interfaces. 12895 * 12896 * We end up returning a zero mask in the case of default, multicast or 12897 * experimental. 
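 *
 * Worked example (addresses invented for illustration): for addr
 * 192.168.5.7, ip_net_mask() supplies the classful 255.255.255.0.
 * If an interface is configured as 192.168.5.1 with netmask
 * 255.255.255.192, it matches on that classful net and we return
 * 255.255.255.192; if no configured interface matches, the classful
 * guess itself is returned.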
12898 */ 12899 static ipaddr_t 12900 ip_subnet_mask(ipaddr_t addr, ipif_t **ipifp) 12901 { 12902 ipaddr_t net_mask; 12903 ill_t *ill; 12904 ipif_t *ipif; 12905 ill_walk_context_t ctx; 12906 ipif_t *fallback_ipif = NULL; 12907 12908 net_mask = ip_net_mask(addr); 12909 if (net_mask == 0) { 12910 *ipifp = NULL; 12911 return (0); 12912 } 12913 12914 /* Let's check to see if this is maybe a local subnet route. */ 12915 /* this function only applies to IPv4 interfaces */ 12916 rw_enter(&ill_g_lock, RW_READER); 12917 ill = ILL_START_WALK_V4(&ctx); 12918 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 12919 mutex_enter(&ill->ill_lock); 12920 for (ipif = ill->ill_ipif; ipif != NULL; 12921 ipif = ipif->ipif_next) { 12922 if (!IPIF_CAN_LOOKUP(ipif)) 12923 continue; 12924 if (!(ipif->ipif_flags & IPIF_UP)) 12925 continue; 12926 if ((ipif->ipif_subnet & net_mask) == 12927 (addr & net_mask)) { 12928 /* 12929 * Don't trust pt-pt interfaces if there are 12930 * other interfaces. 12931 */ 12932 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 12933 if (fallback_ipif == NULL) { 12934 ipif_refhold_locked(ipif); 12935 fallback_ipif = ipif; 12936 } 12937 continue; 12938 } 12939 12940 /* 12941 * Fine. Just assume the same net mask as the 12942 * directly attached subnet interface is using. 12943 */ 12944 ipif_refhold_locked(ipif); 12945 mutex_exit(&ill->ill_lock); 12946 rw_exit(&ill_g_lock); 12947 if (fallback_ipif != NULL) 12948 ipif_refrele(fallback_ipif); 12949 *ipifp = ipif; 12950 return (ipif->ipif_net_mask); 12951 } 12952 } 12953 mutex_exit(&ill->ill_lock); 12954 } 12955 rw_exit(&ill_g_lock); 12956 12957 *ipifp = fallback_ipif; 12958 return ((fallback_ipif != NULL) ? 12959 fallback_ipif->ipif_net_mask : net_mask); 12960 } 12961 12962 /* 12963 * ip_sioctl_copyin_setup calls ip_wput_ioctl to process the IP_IOCTL ioctl. 12964 */ 12965 static void 12966 ip_wput_ioctl(queue_t *q, mblk_t *mp) 12967 { 12968 IOCP iocp; 12969 ipft_t *ipft; 12970 ipllc_t *ipllc; 12971 mblk_t *mp1; 12972 cred_t *cr; 12973 int error = 0; 12974 conn_t *connp; 12975 12976 ip1dbg(("ip_wput_ioctl")); 12977 iocp = (IOCP)mp->b_rptr; 12978 mp1 = mp->b_cont; 12979 if (mp1 == NULL) { 12980 iocp->ioc_error = EINVAL; 12981 mp->b_datap->db_type = M_IOCNAK; 12982 iocp->ioc_count = 0; 12983 qreply(q, mp); 12984 return; 12985 } 12986 12987 /* 12988 * These IOCTLs provide various control capabilities to 12989 * upstream agents such as ULPs and processes. There 12990 * are currently two such IOCTLs implemented. They 12991 * are used by TCP to provide update information for 12992 * existing IREs and to forcibly delete an IRE for a 12993 * host that is not responding, thereby forcing an 12994 * attempt at a new route. 12995 */ 12996 iocp->ioc_error = EINVAL; 12997 if (!pullupmsg(mp1, sizeof (ipllc->ipllc_cmd))) 12998 goto done; 12999 13000 ipllc = (ipllc_t *)mp1->b_rptr; 13001 for (ipft = ip_ioctl_ftbl; ipft->ipft_pfi; ipft++) { 13002 if (ipllc->ipllc_cmd == ipft->ipft_cmd) 13003 break; 13004 } 13005 /* 13006 * prefer credential from mblk over ioctl; 13007 * see ip_sioctl_copyin_setup 13008 */ 13009 cr = DB_CREDDEF(mp, iocp->ioc_cr); 13010 13011 /* 13012 * Refhold the conn in case the request gets queued up in some lookup 13013 */ 13014 ASSERT(CONN_Q(q)); 13015 connp = Q_TO_CONN(q); 13016 CONN_INC_REF(connp); 13017 if (ipft->ipft_pfi && 13018 ((mp1->b_wptr - mp1->b_rptr) >= ipft->ipft_min_size || 13019 pullupmsg(mp1, ipft->ipft_min_size))) { 13020 error = (*ipft->ipft_pfi)(q, 13021 (ipft->ipft_flags & IPFT_F_SELF_REPLY) ? 
mp : mp1, cr); 13022 } 13023 if (ipft->ipft_flags & IPFT_F_SELF_REPLY) { 13024 /* 13025 * CONN_OPER_PENDING_DONE happens in the function called 13026 * through ipft_pfi above. 13027 */ 13028 return; 13029 } 13030 13031 CONN_OPER_PENDING_DONE(connp); 13032 if (ipft->ipft_flags & IPFT_F_NO_REPLY) { 13033 freemsg(mp); 13034 return; 13035 } 13036 iocp->ioc_error = error; 13037 13038 done: 13039 mp->b_datap->db_type = M_IOCACK; 13040 if (iocp->ioc_error) 13041 iocp->ioc_count = 0; 13042 qreply(q, mp); 13043 } 13044 13045 /* 13046 * Lookup an ipif using the sequence id (ipif_seqid) 13047 */ 13048 ipif_t * 13049 ipif_lookup_seqid(ill_t *ill, uint_t seqid) 13050 { 13051 ipif_t *ipif; 13052 13053 ASSERT(MUTEX_HELD(&ill->ill_lock)); 13054 13055 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 13056 if (ipif->ipif_seqid == seqid && IPIF_CAN_LOOKUP(ipif)) 13057 return (ipif); 13058 } 13059 return (NULL); 13060 } 13061 13062 uint64_t ipif_g_seqid; 13063 13064 /* 13065 * Assign a unique id for the ipif. This is used later when we send 13066 * IRES to ARP for resolution where we initialize ire_ipif_seqid 13067 * to the value pointed by ire_ipif->ipif_seqid. Later when the 13068 * IRE is added, we verify that ipif has not disappeared. 13069 */ 13070 13071 static void 13072 ipif_assign_seqid(ipif_t *ipif) 13073 { 13074 ipif->ipif_seqid = atomic_add_64_nv(&ipif_g_seqid, 1); 13075 } 13076 13077 /* 13078 * Insert the ipif, so that the list of ipifs on the ill will be sorted 13079 * with respect to ipif_id. Note that an ipif with an ipif_id of -1 will 13080 * be inserted into the first space available in the list. The value of 13081 * ipif_id will then be set to the appropriate value for its position. 13082 */ 13083 static int 13084 ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock) 13085 { 13086 ill_t *ill; 13087 ipif_t *tipif; 13088 ipif_t **tipifp; 13089 int id; 13090 13091 ASSERT(ipif->ipif_ill->ill_net_type == IRE_LOOPBACK || 13092 IAM_WRITER_IPIF(ipif)); 13093 13094 ill = ipif->ipif_ill; 13095 ASSERT(ill != NULL); 13096 13097 /* 13098 * In the case of lo0:0 we already hold the ill_g_lock. 13099 * ill_lookup_on_name (acquires ill_g_lock) -> ipif_allocate -> 13100 * ipif_insert. Another such caller is ipif_move. 
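	 *
	 * To illustrate the id assignment below (hypothetical state):
	 * if the ill already has ipifs 0, 1 and 3 and this ipif arrives
	 * with ipif_id -1, it is inserted between 1 and 3 and assigned
	 * id 2.  An explicit id is inserted at its sorted position
	 * (callers guarantee it is not already in use), and any id at
	 * or above ip_addrs_per_if is refused.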
13101 */
13102 	if (acquire_g_lock)
13103 		rw_enter(&ill_g_lock, RW_WRITER);
13104 	if (acquire_ill_lock)
13105 		mutex_enter(&ill->ill_lock);
13106 	id = ipif->ipif_id;
13107 	tipifp = &(ill->ill_ipif);
13108 	if (id == -1) {	/* need to find a real id */
13109 		id = 0;
13110 		while ((tipif = *tipifp) != NULL) {
13111 			ASSERT(tipif->ipif_id >= id);
13112 			if (tipif->ipif_id != id)
13113 				break;	/* non-consecutive id */
13114 			id++;
13115 			tipifp = &(tipif->ipif_next);
13116 		}
13117 		/* limit number of logical interfaces */
13118 		if (id >= ip_addrs_per_if) {
13119 			if (acquire_ill_lock)
13120 				mutex_exit(&ill->ill_lock);
13121 			if (acquire_g_lock)
13122 				rw_exit(&ill_g_lock);
13123 			return (-1);
13124 		}
13125 		ipif->ipif_id = id;	/* assign new id */
13126 	} else if (id < ip_addrs_per_if) {
13127 		/* we have a real id; insert ipif in the right place */
13128 		while ((tipif = *tipifp) != NULL) {
13129 			ASSERT(tipif->ipif_id != id);
13130 			if (tipif->ipif_id > id)
13131 				break;	/* found correct location */
13132 			tipifp = &(tipif->ipif_next);
13133 		}
13134 	} else {
13135 		if (acquire_ill_lock)
13136 			mutex_exit(&ill->ill_lock);
13137 		if (acquire_g_lock)
13138 			rw_exit(&ill_g_lock);
13139 		return (-1);
13140 	}
13141
13142 	ASSERT(tipifp != &(ill->ill_ipif) || id == 0);
13143
13144 	ipif->ipif_next = tipif;
13145 	*tipifp = ipif;
13146 	if (acquire_ill_lock)
13147 		mutex_exit(&ill->ill_lock);
13148 	if (acquire_g_lock)
13149 		rw_exit(&ill_g_lock);
13150 	return (0);
13151 }
13152
13153 /*
13154  * Allocate and initialize a new interface control structure.  (Always
13155  * called as writer.)
13156  * When ipif_allocate() is called from ip_ll_subnet_defaults, the ill
13157  * is not part of the global linked list of ills. ipif_seqid is unique
13158  * in the system and to preserve the uniqueness, it is assigned only
13159  * when ill becomes part of the global list. At that point ill will
13160  * have a name. If it doesn't get assigned here, it will get assigned
13161  * in ipif_set_values() as part of SIOCSLIFNAME processing.
13162  * Additionally, if we come here from ip_ll_subnet_defaults, we don't set
13163  * the interface flags or any other information from the DL_INFO_ACK for
13164  * DL_STYLE2 drivers (initialize == B_FALSE), since we won't have them at
13165  * this point. The flags etc. will be set in ip_ll_subnet_defaults when the
13166  * second DL_INFO_ACK comes in from the driver.

/*
 * Allocate and initialize a new interface control structure. (Always
 * called as writer.)
 * When ipif_allocate() is called from ip_ll_subnet_defaults, the ill
 * is not part of the global linked list of ills. ipif_seqid is unique
 * in the system and to preserve the uniqueness, it is assigned only
 * when ill becomes part of the global list. At that point ill will
 * have a name. If it doesn't get assigned here, it will get assigned
 * in ipif_set_values() as part of SIOCSLIFNAME processing.
 * Additionally, if we come here from ip_ll_subnet_defaults, we don't set
 * the interface flags or any other information from the DL_INFO_ACK for
 * DL_STYLE2 drivers (initialize == B_FALSE), since we won't have them at
 * this point. The flags etc. will be set in ip_ll_subnet_defaults when the
 * second DL_INFO_ACK comes in from the driver.
 */
static ipif_t *
ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize)
{
	ipif_t	*ipif;
	phyint_t *phyi;

	ip1dbg(("ipif_allocate(%s:%d ill %p)\n",
	    ill->ill_name, id, (void *)ill));
	ASSERT(ire_type == IRE_LOOPBACK || IAM_WRITER_ILL(ill));

	if ((ipif = (ipif_t *)mi_alloc(sizeof (ipif_t), BPRI_MED)) == NULL)
		return (NULL);
	*ipif = ipif_zero;	/* start clean */

	ipif->ipif_ill = ill;
	ipif->ipif_id = id;	/* could be -1 */
	ipif->ipif_zoneid = GLOBAL_ZONEID;

	mutex_init(&ipif->ipif_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL);

	ipif->ipif_refcnt = 0;
	ipif->ipif_saved_ire_cnt = 0;

	if (ipif_insert(ipif, ire_type != IRE_LOOPBACK, B_TRUE)) {
		mi_free(ipif);
		return (NULL);
	}
	/* -1 id should have been replaced by real id */
	id = ipif->ipif_id;
	ASSERT(id >= 0);

	if (ill->ill_name[0] != '\0') {
		ipif_assign_seqid(ipif);
		if (ill->ill_phyint->phyint_ifindex != 0)
			sctp_update_ipif(ipif, SCTP_IPIF_INSERT);
	}
	/*
	 * Keep a copy of original id in ipif_orig_ipifid. Failback
	 * will attempt to restore the original id. The SIOCSLIFOINDEX
	 * ioctl sets ipif_orig_ipifid to zero.
	 */
	ipif->ipif_orig_ipifid = id;

	/*
	 * We grab the ill_lock and phyint_lock to protect the flag changes.
	 * The ipif is still not up and can't be looked up until the
	 * ioctl completes and the IPIF_CHANGING flag is cleared.
	 */
	mutex_enter(&ill->ill_lock);
	mutex_enter(&ill->ill_phyint->phyint_lock);
	/*
	 * Set the running flag when logical interface zero is created.
	 * For subsequent logical interfaces, a DLPI link down
	 * notification message may have cleared the running flag to
	 * indicate the link is down, so we shouldn't just blindly set it.
	 */
	if (id == 0)
		ill->ill_phyint->phyint_flags |= PHYI_RUNNING;
	ipif->ipif_ire_type = ire_type;
	phyi = ill->ill_phyint;
	ipif->ipif_orig_ifindex = phyi->phyint_ifindex;

	if (ipif->ipif_isv6) {
		ill->ill_flags |= ILLF_IPV6;
	} else {
		ipaddr_t inaddr_any = INADDR_ANY;

		ill->ill_flags |= ILLF_IPV4;

		/* Keep the IN6_IS_ADDR_V4MAPPED assertions happy */
		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
		    &ipif->ipif_v6lcl_addr);
		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
		    &ipif->ipif_v6src_addr);
		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
		    &ipif->ipif_v6subnet);
		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
		    &ipif->ipif_v6net_mask);
		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
		    &ipif->ipif_v6brd_addr);
		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
		    &ipif->ipif_v6pp_dst_addr);
	}

	/*
	 * Don't set the interface flags etc. now, will do it in
	 * ip_ll_subnet_defaults.
	 */
	if (!initialize) {
		mutex_exit(&ill->ill_lock);
		mutex_exit(&ill->ill_phyint->phyint_lock);
		return (ipif);
	}
	ipif->ipif_mtu = ill->ill_max_mtu;

	if (ill->ill_bcast_addr_length != 0) {
		/*
		 * Later detect lack of DLPI driver multicast
		 * capability by catching DL_ENABMULTI errors in
		 * ip_rput_dlpi.
		 */
		ill->ill_flags |= ILLF_MULTICAST;
		if (!ipif->ipif_isv6)
			ipif->ipif_flags |= IPIF_BROADCAST;
	} else {
		if (ill->ill_net_type != IRE_LOOPBACK) {
			if (ipif->ipif_isv6)
				/*
				 * Note: xresolv interfaces will eventually need
				 * NOARP set here as well, but that will require
				 * those external resolvers to have some
				 * knowledge of that flag and act appropriately.
				 * Not to be changed at present.
				 */
				ill->ill_flags |= ILLF_NONUD;
			else
				ill->ill_flags |= ILLF_NOARP;
		}
		if (ill->ill_phys_addr_length == 0) {
			if (ill->ill_media &&
			    ill->ill_media->ip_m_mac_type == SUNW_DL_VNI) {
				ipif->ipif_flags |= IPIF_NOXMIT;
				phyi->phyint_flags |= PHYI_VIRTUAL;
			} else {
				/* pt-pt supports multicast. */
				ill->ill_flags |= ILLF_MULTICAST;
				if (ill->ill_net_type == IRE_LOOPBACK) {
					phyi->phyint_flags |=
					    (PHYI_LOOPBACK | PHYI_VIRTUAL);
				} else {
					ipif->ipif_flags |= IPIF_POINTOPOINT;
				}
			}
		}
	}
	mutex_exit(&ill->ill_lock);
	mutex_exit(&ill->ill_phyint->phyint_lock);
	return (ipif);
}

/*
 * If appropriate, send a message up to the resolver to delete the entry
 * for the address of this interface which is going out of business.
 * (Always called as writer).
 *
 * NOTE : We need to check for NULL mps as some of the fields are
 *	  initialized only for some interface types. See ipif_resolver_up()
 *	  for details.
 */
void
ipif_arp_down(ipif_t *ipif)
{
	mblk_t	*mp;
	ill_t	*ill = ipif->ipif_ill;

	ip1dbg(("ipif_arp_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));
	ASSERT(IAM_WRITER_IPIF(ipif));

	/* Delete the mapping for the local address */
	mp = ipif->ipif_arp_del_mp;
	if (mp != NULL) {
		ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n",
		    *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id));
		putnext(ill->ill_rq, mp);
		ipif->ipif_arp_del_mp = NULL;
	}

	/*
	 * If this is the last ipif that is going down and there are no
	 * duplicate addresses we may yet attempt to re-probe, then we need to
	 * clean up ARP completely.
	 */
	if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0) {

		/* Send up AR_INTERFACE_DOWN message */
		mp = ill->ill_arp_down_mp;
		if (mp != NULL) {
			ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n",
			    *(unsigned *)mp->b_rptr, ill->ill_name,
			    ipif->ipif_id));
			putnext(ill->ill_rq, mp);
			ill->ill_arp_down_mp = NULL;
		}

		/* Tell ARP to delete the multicast mappings */
		mp = ill->ill_arp_del_mapping_mp;
		if (mp != NULL) {
			ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n",
			    *(unsigned *)mp->b_rptr, ill->ill_name,
			    ipif->ipif_id));
			putnext(ill->ill_rq, mp);
			ill->ill_arp_del_mapping_mp = NULL;
		}
	}
}
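
/*
 * Editorial aside: note that ipif_arp_down() above never allocates; it only
 * sends messages (ipif_arp_del_mp, ill_arp_down_mp, ill_arp_del_mapping_mp)
 * that were pre-allocated at bring-up time, so the down path cannot fail for
 * want of memory. A minimal userland sketch of that pattern follows; res_t
 * and the two helpers are hypothetical names, not kernel interfaces.
 */
#if 0
#include <stdlib.h>

typedef struct res {
	void	*res_handle;	/* the live resource */
	void	*res_undo;	/* teardown request built in advance */
} res_t;

static int
res_setup(res_t *rp)
{
	rp->res_handle = malloc(64);
	rp->res_undo = malloc(64);	/* build the undo message now */
	if (rp->res_handle == NULL || rp->res_undo == NULL) {
		free(rp->res_handle);
		free(rp->res_undo);
		return (-1);		/* fail while failing is still easy */
	}
	return (0);
}

static void
res_teardown(res_t *rp)
{
	/* no allocation here; just consume the saved undo request */
	free(rp->res_undo);
	rp->res_undo = NULL;
	free(rp->res_handle);
	rp->res_handle = NULL;
}
#endif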

/*
 * This function sets up the multicast mappings in ARP. When ipif_resolver_up
 * calls this function, it passes a non-NULL arp_add_mapping_mp indicating
 * that it wants the add_mp allocated in this function to be returned
 * without sending it to ARP. When ip_rput_dlpi_writer calls this to
 * just re-do the multicast, it wants us to send the add_mp to ARP also.
 * ipif_resolver_up does not want us to do the "add", i.e. sending to ARP,
 * as it does an ipif_arp_down after calling this function, which will
 * remove what we add here.
 *
 * Returns -1 on failures and 0 on success.
 */
int
ipif_arp_setup_multicast(ipif_t *ipif, mblk_t **arp_add_mapping_mp)
{
	mblk_t	*del_mp = NULL;
	mblk_t	*add_mp = NULL;
	mblk_t	*mp;
	ill_t	*ill = ipif->ipif_ill;
	phyint_t *phyi = ill->ill_phyint;
	ipaddr_t addr, mask, extract_mask = 0;
	arma_t	*arma;
	uint8_t *maddr, *bphys_addr;
	uint32_t hw_start;
	dl_unitdata_req_t *dlur;

	ASSERT(IAM_WRITER_IPIF(ipif));
	if (ipif->ipif_flags & IPIF_POINTOPOINT)
		return (0);

	/*
	 * Delete the existing mapping from ARP. Normally ipif_down
	 * -> ipif_arp_down should send this up to ARP. The only
	 * reason we would find this is when we are switching from
	 * multicast to broadcast where we did not do a down.
	 */
	mp = ill->ill_arp_del_mapping_mp;
	if (mp != NULL) {
		ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n",
		    *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id));
		putnext(ill->ill_rq, mp);
		ill->ill_arp_del_mapping_mp = NULL;
	}

	if (arp_add_mapping_mp != NULL)
		*arp_add_mapping_mp = NULL;

	/*
	 * Check that the address is not too long for the constant
	 * length reserved in the template arma_t.
	 */
	if (ill->ill_phys_addr_length > IP_MAX_HW_LEN)
		return (-1);

	/* Add mapping mblk */
	addr = (ipaddr_t)htonl(INADDR_UNSPEC_GROUP);
	mask = (ipaddr_t)htonl(IN_CLASSD_NET);
	add_mp = ill_arp_alloc(ill, (uchar_t *)&ip_arma_multi_template,
	    (caddr_t)&addr);
	if (add_mp == NULL)
		return (-1);
	arma = (arma_t *)add_mp->b_rptr;
	maddr = (uint8_t *)arma + arma->arma_hw_addr_offset;
	bcopy(&mask, (char *)arma + arma->arma_proto_mask_offset, IP_ADDR_LEN);
	arma->arma_hw_addr_length = ill->ill_phys_addr_length;

	/*
	 * Determine the broadcast address.
	 */
	dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
	if (ill->ill_sap_length < 0)
		bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset;
	else
		bphys_addr = (uchar_t *)dlur +
		    dlur->dl_dest_addr_offset + ill->ill_sap_length;
	/*
	 * Check PHYI_MULTI_BCAST and length of physical
	 * address to determine if we use the mapping or the
	 * broadcast address.
	 */
	if (!(phyi->phyint_flags & PHYI_MULTI_BCAST))
		if (!MEDIA_V4MINFO(ill->ill_media, ill->ill_phys_addr_length,
		    bphys_addr, maddr, &hw_start, &extract_mask))
			phyi->phyint_flags |= PHYI_MULTI_BCAST;

	if ((phyi->phyint_flags & PHYI_MULTI_BCAST) ||
	    (ill->ill_flags & ILLF_MULTICAST)) {
		/* Make sure this will not match the "exact" entry. */
		addr = (ipaddr_t)htonl(INADDR_ALLHOSTS_GROUP);
		del_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ared_template,
		    (caddr_t)&addr);
		if (del_mp == NULL) {
			freemsg(add_mp);
			return (-1);
		}
		bcopy(&extract_mask, (char *)arma +
		    arma->arma_proto_extract_mask_offset, IP_ADDR_LEN);
		if (phyi->phyint_flags & PHYI_MULTI_BCAST) {
			/* Use link-layer broadcast address for MULTI_BCAST */
			bcopy(bphys_addr, maddr, ill->ill_phys_addr_length);
			ip2dbg(("ipif_arp_setup_multicast: adding"
			    " MULTI_BCAST ARP setup for %s\n", ill->ill_name));
		} else {
			arma->arma_hw_mapping_start = hw_start;
			ip2dbg(("ipif_arp_setup_multicast: adding multicast"
			    " ARP setup for %s\n", ill->ill_name));
		}
	} else {
		freemsg(add_mp);
		ASSERT(del_mp == NULL);
		/* It is neither MULTICAST nor MULTI_BCAST */
		return (0);
	}
	ASSERT(add_mp != NULL && del_mp != NULL);
	ASSERT(ill->ill_arp_del_mapping_mp == NULL);
	ill->ill_arp_del_mapping_mp = del_mp;
	if (arp_add_mapping_mp != NULL) {
		/* The caller just wants the mblks allocated */
		*arp_add_mapping_mp = add_mp;
	} else {
		/* The caller wants us to send it to ARP */
		putnext(ill->ill_rq, add_mp);
	}
	return (0);
}

/*
 * Get the resolver set up for a new interface address.
 * (Always called as writer.)
 * Called both for IPv4 and IPv6 interfaces,
 * though it only sets up the resolver for v6
 * if it's an xresolv interface (one using an external resolver).
 * Honors ILLF_NOARP.
 * The enumerated value res_act is used to tune the behavior.
 * If set to Res_act_initial, then we set up all the resolver
 * structures for a new interface. If set to Res_act_move, then
 * we just send an AR_ENTRY_ADD message up to ARP for IPv4
 * interfaces; this is called by ip_rput_dlpi_writer() to handle
 * asynchronous hardware address change notification. If set to
 * Res_act_defend, then we tell ARP that it needs to send a single
 * gratuitous message in defense of the address.
 * Returns error on failure.
 */
int
ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act)
{
	caddr_t	addr;
	mblk_t	*arp_up_mp = NULL;
	mblk_t	*arp_down_mp = NULL;
	mblk_t	*arp_add_mp = NULL;
	mblk_t	*arp_del_mp = NULL;
	mblk_t	*arp_add_mapping_mp = NULL;
	mblk_t	*arp_del_mapping_mp = NULL;
	ill_t	*ill = ipif->ipif_ill;
	uchar_t	*area_p = NULL;
	uchar_t	*ared_p = NULL;
	int	err = ENOMEM;
	boolean_t was_dup;

	ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n",
	    ill->ill_name, ipif->ipif_id, (uint_t)ipif->ipif_flags));
	ASSERT(IAM_WRITER_IPIF(ipif));

	was_dup = B_FALSE;
	if (res_act == Res_act_initial) {
		ipif->ipif_addr_ready = 0;
		/*
		 * We're bringing an interface up here. There's no way that we
		 * should need to shut down ARP now.
		 */
		mutex_enter(&ill->ill_lock);
		if (ipif->ipif_flags & IPIF_DUPLICATE) {
			ipif->ipif_flags &= ~IPIF_DUPLICATE;
			ill->ill_ipif_dup_count--;
			was_dup = B_TRUE;
		}
		mutex_exit(&ill->ill_lock);
	}
	if (ipif->ipif_recovery_id != 0)
		(void) untimeout(ipif->ipif_recovery_id);
	ipif->ipif_recovery_id = 0;
	if (ill->ill_net_type != IRE_IF_RESOLVER) {
		ipif->ipif_addr_ready = 1;
		return (0);
	}
	/* NDP will set the ipif_addr_ready flag when it's ready */
	if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV))
		return (0);

	if (ill->ill_isv6) {
		/*
		 * External resolver for IPv6
		 */
		ASSERT(res_act == Res_act_initial);
		if (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) {
			addr = (caddr_t)&ipif->ipif_v6lcl_addr;
			area_p = (uchar_t *)&ip6_area_template;
			ared_p = (uchar_t *)&ip6_ared_template;
		}
	} else {
		/*
		 * IPv4 ARP case. If the ARP stream has already started
		 * closing, fail this request for ARP bringup. Else
		 * record the fact that an ARP bringup is pending.
		 */
		mutex_enter(&ill->ill_lock);
		if (ill->ill_arp_closing) {
			mutex_exit(&ill->ill_lock);
			err = EINVAL;
			goto failed;
		} else {
			if (ill->ill_ipif_up_count == 0 &&
			    ill->ill_ipif_dup_count == 0 && !was_dup)
				ill->ill_arp_bringup_pending = 1;
			mutex_exit(&ill->ill_lock);
		}
		if (ipif->ipif_lcl_addr != INADDR_ANY) {
			addr = (caddr_t)&ipif->ipif_lcl_addr;
			area_p = (uchar_t *)&ip_area_template;
			ared_p = (uchar_t *)&ip_ared_template;
		}
	}

	/*
	 * Add an entry for the local address in ARP only if it
	 * is not UNNUMBERED and the address is not INADDR_ANY.
	 */
	if (!(ipif->ipif_flags & IPIF_UNNUMBERED) && area_p != NULL) {
		area_t *area;

		/* Now ask ARP to publish our address. */
		arp_add_mp = ill_arp_alloc(ill, area_p, addr);
		if (arp_add_mp == NULL)
			goto failed;
		area = (area_t *)arp_add_mp->b_rptr;
		if (res_act != Res_act_initial) {
			/*
			 * Copy the new hardware address and length into
			 * arp_add_mp to be sent to ARP.
			 */
			area->area_hw_addr_length =
			    ill->ill_phys_addr_length;
			bcopy((char *)ill->ill_phys_addr,
			    ((char *)area + area->area_hw_addr_offset),
			    area->area_hw_addr_length);
		}

		area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH |
		    ACE_F_MYADDR;

		if (res_act == Res_act_defend) {
			area->area_flags |= ACE_F_DEFEND;
			/*
			 * If we're just defending our address now, then
			 * there's no need to set up ARP multicast mappings.
			 * The publish command is enough.
			 */
			goto done;
		}

		if (res_act != Res_act_initial)
			goto arp_setup_multicast;

		/*
		 * Allocate an ARP deletion message so we know we can tell ARP
		 * when the interface goes down.
		 */
		arp_del_mp = ill_arp_alloc(ill, ared_p, addr);
		if (arp_del_mp == NULL)
			goto failed;

	} else {
		if (res_act != Res_act_initial)
			goto done;
	}
	/*
	 * Need to bring up ARP or set up the multicast mapping only
	 * when the first interface is coming UP.
	 */
	if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 ||
	    was_dup) {
		goto done;
	}

	/*
	 * Allocate an ARP down message (to be saved) and an ARP up
	 * message.
	 */
	arp_down_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ard_template, 0);
	if (arp_down_mp == NULL)
		goto failed;

	arp_up_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aru_template, 0);
	if (arp_up_mp == NULL)
		goto failed;

	if (ipif->ipif_flags & IPIF_POINTOPOINT)
		goto done;

arp_setup_multicast:
	/*
	 * Set up the multicast mappings. This function initializes
	 * ill_arp_del_mapping_mp also. This does not need to be done for
	 * IPv6.
	 */
	if (!ill->ill_isv6) {
		err = ipif_arp_setup_multicast(ipif, &arp_add_mapping_mp);
		if (err != 0)
			goto failed;
		ASSERT(ill->ill_arp_del_mapping_mp != NULL);
		ASSERT(arp_add_mapping_mp != NULL);
	}

done:
	if (arp_del_mp != NULL) {
		ASSERT(ipif->ipif_arp_del_mp == NULL);
		ipif->ipif_arp_del_mp = arp_del_mp;
	}
	if (arp_down_mp != NULL) {
		ASSERT(ill->ill_arp_down_mp == NULL);
		ill->ill_arp_down_mp = arp_down_mp;
	}
	if (arp_del_mapping_mp != NULL) {
		ASSERT(ill->ill_arp_del_mapping_mp == NULL);
		ill->ill_arp_del_mapping_mp = arp_del_mapping_mp;
	}
	if (arp_up_mp != NULL) {
		ip1dbg(("ipif_resolver_up: ARP_UP for %s:%u\n",
		    ill->ill_name, ipif->ipif_id));
		putnext(ill->ill_rq, arp_up_mp);
	}
	if (arp_add_mp != NULL) {
		ip1dbg(("ipif_resolver_up: ARP_ADD for %s:%u\n",
		    ill->ill_name, ipif->ipif_id));
		/*
		 * If it's an extended ARP implementation, then we'll wait to
		 * hear that DAD has finished before using the interface.
		 */
		if (!ill->ill_arp_extend)
			ipif->ipif_addr_ready = 1;
		putnext(ill->ill_rq, arp_add_mp);
	} else {
		ipif->ipif_addr_ready = 1;
	}
	if (arp_add_mapping_mp != NULL) {
		ip1dbg(("ipif_resolver_up: MAPPING_ADD for %s:%u\n",
		    ill->ill_name, ipif->ipif_id));
		putnext(ill->ill_rq, arp_add_mapping_mp);
	}
	if (res_act != Res_act_initial)
		return (0);

	if (ill->ill_flags & ILLF_NOARP)
		err = ill_arp_off(ill);
	else
		err = ill_arp_on(ill);
	if (err != 0) {
		ip0dbg(("ipif_resolver_up: arp_on/off failed %d\n", err));
		freemsg(ipif->ipif_arp_del_mp);
		freemsg(ill->ill_arp_down_mp);
		freemsg(ill->ill_arp_del_mapping_mp);
		ipif->ipif_arp_del_mp = NULL;
		ill->ill_arp_down_mp = NULL;
		ill->ill_arp_del_mapping_mp = NULL;
		return (err);
	}
	return ((ill->ill_ipif_up_count != 0 || was_dup ||
	    ill->ill_ipif_dup_count != 0) ? 0 : EINPROGRESS);

failed:
	ip1dbg(("ipif_resolver_up: FAILED\n"));
	freemsg(arp_add_mp);
	freemsg(arp_del_mp);
	freemsg(arp_add_mapping_mp);
	freemsg(arp_up_mp);
	freemsg(arp_down_mp);
	ill->ill_arp_bringup_pending = 0;
	return (err);
}
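
/*
 * Editorial aside: ipif_resolver_up() above is a textbook instance of the
 * "allocate everything, publish only after every allocation has succeeded,
 * otherwise free the lot under a failed: label" structure. A compact
 * stand-alone sketch of that control flow (hypothetical names, not kernel
 * interfaces):
 */
#if 0
#include <stdlib.h>

static int
bringup(void **a, void **b, void **c)
{
	*a = *b = *c = NULL;
	if ((*a = malloc(32)) == NULL)
		goto failed;
	if ((*b = malloc(32)) == NULL)
		goto failed;
	if ((*c = malloc(32)) == NULL)
		goto failed;
	return (0);	/* all allocations succeeded; publish the results */
failed:
	free(*a);	/* free(NULL) is a no-op, so no flags are needed */
	free(*b);
	free(*c);
	*a = *b = *c = NULL;
	return (-1);
}
#endif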

/*
 * This routine restarts IPv4 duplicate address detection (DAD) when a link
 * has just gone back up.
 */
static void
ipif_arp_start_dad(ipif_t *ipif)
{
	ill_t	*ill = ipif->ipif_ill;
	mblk_t	*arp_add_mp;
	area_t	*area;

	if (ill->ill_net_type != IRE_IF_RESOLVER || ill->ill_arp_closing ||
	    (ipif->ipif_flags & IPIF_UNNUMBERED) ||
	    ipif->ipif_lcl_addr == INADDR_ANY ||
	    (arp_add_mp = ill_arp_alloc(ill, (uchar_t *)&ip_area_template,
	    (char *)&ipif->ipif_lcl_addr)) == NULL) {
		/*
		 * If we can't contact ARP for some reason, that's not really a
		 * problem. Just send out the routing socket notification that
		 * DAD completion would have done, and continue.
		 */
		ipif_mask_reply(ipif);
		ip_rts_ifmsg(ipif);
		ip_rts_newaddrmsg(RTM_ADD, 0, ipif);
		sctp_update_ipif(ipif, SCTP_IPIF_UP);
		ipif->ipif_addr_ready = 1;
		return;
	}

	/* Setting the 'unverified' flag restarts DAD */
	area = (area_t *)arp_add_mp->b_rptr;
	area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR |
	    ACE_F_UNVERIFIED;
	putnext(ill->ill_rq, arp_add_mp);
}

static void
ipif_ndp_start_dad(ipif_t *ipif)
{
	nce_t	*nce;

	nce = ndp_lookup_v6(ipif->ipif_ill, &ipif->ipif_v6lcl_addr, B_FALSE);
	if (nce == NULL)
		return;

	if (!ndp_restart_dad(nce)) {
		/*
		 * If we can't restart DAD for some reason, that's not really a
		 * problem. Just send out the routing socket notification that
		 * DAD completion would have done, and continue.
		 */
		ip_rts_ifmsg(ipif);
		ip_rts_newaddrmsg(RTM_ADD, 0, ipif);
		sctp_update_ipif(ipif, SCTP_IPIF_UP);
		ipif->ipif_addr_ready = 1;
	}
	NCE_REFRELE(nce);
}

/*
 * Restart duplicate address detection on all interfaces on the given ill.
 *
 * This is called when an interface transitions from down to up
 * (DL_NOTE_LINK_UP) or up to down (DL_NOTE_LINK_DOWN).
 *
 * Note that since the underlying physical link has transitioned, we must cause
 * at least one routing socket message to be sent here, either via DAD
 * completion or just by default on the first ipif. (If we don't do this, then
 * in.mpathd will see long delays when doing link-based failure recovery.)
 */
void
ill_restart_dad(ill_t *ill, boolean_t went_up)
{
	ipif_t	*ipif;

	if (ill == NULL)
		return;

	/*
	 * If layer two doesn't support duplicate address detection, then just
	 * send the routing socket message now and be done with it.
	 */
	if ((ill->ill_isv6 && (ill->ill_flags & ILLF_XRESOLV)) ||
	    (!ill->ill_isv6 && !ill->ill_arp_extend)) {
		ip_rts_ifmsg(ill->ill_ipif);
		return;
	}

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		if (went_up) {
			if (ipif->ipif_flags & IPIF_UP) {
				if (ill->ill_isv6)
					ipif_ndp_start_dad(ipif);
				else
					ipif_arp_start_dad(ipif);
			} else if (ill->ill_isv6 &&
			    (ipif->ipif_flags & IPIF_DUPLICATE)) {
				/*
				 * For IPv4, the ARP module itself will
				 * automatically start the DAD process when it
				 * sees DL_NOTE_LINK_UP. We respond to the
				 * AR_CN_READY at the completion of that task.
				 * For IPv6, we must kick off the bring-up
				 * process now.
				 */
				ndp_do_recovery(ipif);
			} else {
				/*
				 * Unfortunately, the first ipif is "special"
				 * and represents the underlying ill in the
				 * routing socket messages. Thus, when this
				 * one ipif is down, we must still notify so
				 * that the user knows the IFF_RUNNING status
				 * change. (If the first ipif is up, then
				 * we'll handle eventual routing socket
				 * notification via DAD completion.)
				 */
				if (ipif == ill->ill_ipif)
					ip_rts_ifmsg(ill->ill_ipif);
			}
		} else {
			/*
			 * After link down, we'll need to send a new routing
			 * message when the link comes back, so clear
			 * ipif_addr_ready.
			 */
			ipif->ipif_addr_ready = 0;
		}
	}

	/*
	 * If we've torn down links, then notify the user right away.
	 */
	if (!went_up)
		ip_rts_ifmsg(ill->ill_ipif);
}

/*
 * Wake up all threads waiting to enter the ipsq, and sleeping
 * on any of the ills in this ipsq. The ill_lock of the ill
 * must be held so that waiters don't miss wakeups.
 */
static void
ill_signal_ipsq_ills(ipsq_t *ipsq, boolean_t caller_holds_lock)
{
	phyint_t *phyint;

	phyint = ipsq->ipsq_phyint_list;
	while (phyint != NULL) {
		if (phyint->phyint_illv4) {
			if (!caller_holds_lock)
				mutex_enter(&phyint->phyint_illv4->ill_lock);
			ASSERT(MUTEX_HELD(&phyint->phyint_illv4->ill_lock));
			cv_broadcast(&phyint->phyint_illv4->ill_cv);
			if (!caller_holds_lock)
				mutex_exit(&phyint->phyint_illv4->ill_lock);
		}
		if (phyint->phyint_illv6) {
			if (!caller_holds_lock)
				mutex_enter(&phyint->phyint_illv6->ill_lock);
			ASSERT(MUTEX_HELD(&phyint->phyint_illv6->ill_lock));
			cv_broadcast(&phyint->phyint_illv6->ill_cv);
			if (!caller_holds_lock)
				mutex_exit(&phyint->phyint_illv6->ill_lock);
		}
		phyint = phyint->phyint_ipsq_next;
	}
}

static ipsq_t *
ipsq_create(char *groupname)
{
	ipsq_t	*ipsq;

	ASSERT(RW_WRITE_HELD(&ill_g_lock));
	ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP);
	if (ipsq == NULL) {
		return (NULL);
	}

	if (groupname != NULL)
		(void) strcpy(ipsq->ipsq_name, groupname);
	else
		ipsq->ipsq_name[0] = '\0';

	mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, NULL);
	ipsq->ipsq_flags |= IPSQ_GROUP;
	ipsq->ipsq_next = ipsq_g_head;
	ipsq_g_head = ipsq;
	return (ipsq);
}
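
/*
 * Editorial aside: as the comment ahead of ip_ipsq_lookup() below explains,
 * an ipsq cannot be found by its ipsq_name alone while IPMP groups are being
 * merged, so the lookup also scans each member's own group name. The
 * following stand-alone sketch shows that two-level match; sq_t, member_t
 * and sq_lookup() are illustrative names, not kernel types.
 */
#if 0
#include <stddef.h>
#include <string.h>

typedef struct member {
	const char	*m_group;	/* the member's own group name */
	struct member	*m_next;
} member_t;

typedef struct sq {
	const char	*sq_name;	/* may be stale during a merge */
	member_t	*sq_members;
	struct sq	*sq_next;
} sq_t;

static sq_t *
sq_lookup(sq_t *head, const char *group)
{
	sq_t *sq;
	member_t *m;

	for (sq = head; sq != NULL; sq = sq->sq_next) {
		if (strcmp(sq->sq_name, group) == 0)
			return (sq);	/* fast path: the name matches */
		/* slow path: a merged-in member may carry the name */
		for (m = sq->sq_members; m != NULL; m = m->m_next) {
			if (strcmp(m->m_group, group) == 0)
				return (sq);
		}
	}
	return (NULL);
}
#endif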

/*
 * Return an ipsq corresponding to the groupname. If 'create' is true,
 * allocate a new ipsq if one does not exist. Usually an ipsq is associated
 * uniquely with an IPMP group. However, during IPMP groupname operations,
 * multiple IPMP groups may be associated with a single ipsq. But no
 * IPMP group can be associated with more than 1 ipsq at any time.
 * For example
 *	Interfaces		IPMP grpname	ipsq	ipsq_name	ipsq_refs
 *	hme1, hme2		mpk17-84	ipsq1	mpk17-84	2
 *	hme3, hme4		mpk17-85	ipsq2	mpk17-85	2
 *
 * Now the command ifconfig hme3 group mpk17-84 results in the temporary
 * status shown below during the execution of the above command.
 *	hme1, hme2, hme3, hme4	mpk17-84, mpk17-85	ipsq1	mpk17-84  4
 *
 * After the completion of the above groupname command we return to the stable
 * state shown below.
 *	hme1, hme2, hme3	mpk17-84	ipsq1	mpk17-84	3
 *	hme4			mpk17-85	ipsq2	mpk17-85	1
 *
 * Because of the above, we don't search based on the ipsq_name since that
 * would miss the correct ipsq during certain windows as shown above.
 * The ipsq_name is only used during split of an ipsq to return the ipsq to its
 * natural state.
 */
static ipsq_t *
ip_ipsq_lookup(char *groupname, boolean_t create, ipsq_t *exclude_ipsq)
{
	ipsq_t	*ipsq;
	int	group_len;
	phyint_t *phyint;

	ASSERT(RW_LOCK_HELD(&ill_g_lock));

	group_len = strlen(groupname);
	ASSERT(group_len != 0);
	group_len++;

	for (ipsq = ipsq_g_head; ipsq != NULL; ipsq = ipsq->ipsq_next) {
		/*
		 * When an ipsq is being split, and ill_split_ipsq
		 * calls this function, we exclude it from being considered.
		 */
		if (ipsq == exclude_ipsq)
			continue;

		/*
		 * Compare against the ipsq_name. The groupname change happens
		 * in 2 phases. The 1st phase merges the from group into
		 * the to group's ipsq, by calling ill_merge_groups and
		 * restarting the ioctl. The 2nd phase then locates the ipsq
		 * again through ipsq_name. At this point the phyint_groupname
		 * has not been updated.
		 */
		if ((group_len == strlen(ipsq->ipsq_name) + 1) &&
		    (bcmp(ipsq->ipsq_name, groupname, group_len) == 0)) {
			/*
			 * Verify that an ipmp groupname is exactly
			 * part of 1 ipsq and is not found in any other
			 * ipsq.
			 */
			ASSERT(ip_ipsq_lookup(groupname, B_FALSE, ipsq) ==
			    NULL);
			return (ipsq);
		}

		/*
		 * Comparison against ipsq_name alone is not sufficient.
		 * In the case when groups are currently being
		 * merged, the ipsq could hold other IPMP groups temporarily,
		 * so we walk the phyint list and compare against the
		 * phyint_groupname as well.
		 */
		phyint = ipsq->ipsq_phyint_list;
		while (phyint != NULL) {
			if ((group_len == phyint->phyint_groupname_len) &&
			    (bcmp(phyint->phyint_groupname, groupname,
			    group_len) == 0)) {
				/*
				 * Verify that an ipmp groupname is exactly
				 * part of 1 ipsq and is not found in any other
				 * ipsq.
				 */
				ASSERT(ip_ipsq_lookup(groupname, B_FALSE, ipsq)
				    == NULL);
				return (ipsq);
			}
			phyint = phyint->phyint_ipsq_next;
		}
	}
	if (create)
		ipsq = ipsq_create(groupname);
	return (ipsq);
}

static void
ipsq_delete(ipsq_t *ipsq)
{
	ipsq_t	*nipsq;
	ipsq_t	*pipsq = NULL;

	/*
	 * We don't hold the ipsq lock, but we are sure no new
	 * messages can land up, since the ipsq_refs is zero,
	 * i.e. this ipsq is unnamed and no phyint or phyint group
	 * is associated with this ipsq. (Lookups are based on ill_name
	 * or phyint_group_name)
	 */
	ASSERT(ipsq->ipsq_refs == 0);
	ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipsq->ipsq_mphead == NULL);
	ASSERT(ipsq->ipsq_pending_mp == NULL);
	if (!(ipsq->ipsq_flags & IPSQ_GROUP)) {
		/*
		 * This is not the ipsq of an IPMP group.
		 */
		kmem_free(ipsq, sizeof (ipsq_t));
		return;
	}

	rw_enter(&ill_g_lock, RW_WRITER);

	/*
	 * Locate the ipsq before we can remove it from
	 * the singly linked list of ipsq's.
	 */
	for (nipsq = ipsq_g_head; nipsq != NULL; nipsq = nipsq->ipsq_next) {
		if (nipsq == ipsq) {
			break;
		}
		pipsq = nipsq;
	}

	ASSERT(nipsq == ipsq);

	/* unlink ipsq from the list */
	if (pipsq != NULL)
		pipsq->ipsq_next = ipsq->ipsq_next;
	else
		ipsq_g_head = ipsq->ipsq_next;
	kmem_free(ipsq, sizeof (ipsq_t));
	rw_exit(&ill_g_lock);
}

static void
ill_move_to_new_ipsq(ipsq_t *old_ipsq, ipsq_t *new_ipsq, mblk_t *current_mp,
    queue_t *q)
{
	ASSERT(MUTEX_HELD(&new_ipsq->ipsq_lock));
	ASSERT(old_ipsq->ipsq_mphead == NULL && old_ipsq->ipsq_mptail == NULL);
	ASSERT(old_ipsq->ipsq_pending_ipif == NULL);
	ASSERT(old_ipsq->ipsq_pending_mp == NULL);
	ASSERT(current_mp != NULL);

	ipsq_enq(new_ipsq, q, current_mp, (ipsq_func_t)ip_process_ioctl,
	    NEW_OP, NULL);

	ASSERT(new_ipsq->ipsq_xopq_mptail != NULL &&
	    new_ipsq->ipsq_xopq_mphead != NULL);

	/*
	 * move from old ipsq to the new ipsq.
	 */
	new_ipsq->ipsq_xopq_mptail->b_next = old_ipsq->ipsq_xopq_mphead;
	if (old_ipsq->ipsq_xopq_mphead != NULL)
		new_ipsq->ipsq_xopq_mptail = old_ipsq->ipsq_xopq_mptail;

	old_ipsq->ipsq_xopq_mphead = old_ipsq->ipsq_xopq_mptail = NULL;
}

void
ill_group_cleanup(ill_t *ill)
{
	ill_t	*ill_v4;
	ill_t	*ill_v6;
	ipif_t	*ipif;

	ill_v4 = ill->ill_phyint->phyint_illv4;
	ill_v6 = ill->ill_phyint->phyint_illv6;

	if (ill_v4 != NULL) {
		mutex_enter(&ill_v4->ill_lock);
		for (ipif = ill_v4->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			IPIF_UNMARK_MOVING(ipif);
		}
		ill_v4->ill_up_ipifs = B_FALSE;
		mutex_exit(&ill_v4->ill_lock);
	}

	if (ill_v6 != NULL) {
		mutex_enter(&ill_v6->ill_lock);
		for (ipif = ill_v6->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			IPIF_UNMARK_MOVING(ipif);
		}
		ill_v6->ill_up_ipifs = B_FALSE;
		mutex_exit(&ill_v6->ill_lock);
	}
}

/*
 * This function is called when an ill has had a change in its group status
 * to bring up all the ipifs that were up before the change.
 */
int
ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp)
{
	ipif_t	*ipif;
	ill_t	*ill_v4;
	ill_t	*ill_v6;
	ill_t	*from_ill;
	int	err = 0;

	ASSERT(IAM_WRITER_ILL(ill));

	/*
	 * Except for ipif_state_flags and ill_state_flags the other
	 * fields of the ipif/ill that are modified below are protected
	 * implicitly since we are a writer. We would have tried to down
	 * even an ipif that was already down, in ill_down_ipifs. So we
	 * just blindly clear the IPIF_CHANGING flag here on all ipifs.
	 */
	ill_v4 = ill->ill_phyint->phyint_illv4;
	ill_v6 = ill->ill_phyint->phyint_illv6;
	if (ill_v4 != NULL) {
		ill_v4->ill_up_ipifs = B_TRUE;
		for (ipif = ill_v4->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			mutex_enter(&ill_v4->ill_lock);
			ipif->ipif_state_flags &= ~IPIF_CHANGING;
			IPIF_UNMARK_MOVING(ipif);
			mutex_exit(&ill_v4->ill_lock);
			if (ipif->ipif_was_up) {
				if (!(ipif->ipif_flags & IPIF_UP))
					err = ipif_up(ipif, q, mp);
				ipif->ipif_was_up = B_FALSE;
				if (err != 0) {
					/*
					 * Can there be any other error ?
					 */
					ASSERT(err == EINPROGRESS);
					return (err);
				}
			}
		}
		mutex_enter(&ill_v4->ill_lock);
		ill_v4->ill_state_flags &= ~ILL_CHANGING;
		mutex_exit(&ill_v4->ill_lock);
		ill_v4->ill_up_ipifs = B_FALSE;
		if (ill_v4->ill_move_in_progress) {
			ASSERT(ill_v4->ill_move_peer != NULL);
			ill_v4->ill_move_in_progress = B_FALSE;
			from_ill = ill_v4->ill_move_peer;
			from_ill->ill_move_in_progress = B_FALSE;
			from_ill->ill_move_peer = NULL;
			mutex_enter(&from_ill->ill_lock);
			from_ill->ill_state_flags &= ~ILL_CHANGING;
			mutex_exit(&from_ill->ill_lock);
			if (ill_v6 == NULL) {
				if (from_ill->ill_phyint->phyint_flags &
				    PHYI_STANDBY) {
					phyint_inactive(from_ill->ill_phyint);
				}
				if (ill_v4->ill_phyint->phyint_flags &
				    PHYI_STANDBY) {
					phyint_inactive(ill_v4->ill_phyint);
				}
			}
			ill_v4->ill_move_peer = NULL;
		}
	}

	if (ill_v6 != NULL) {
		ill_v6->ill_up_ipifs = B_TRUE;
		for (ipif = ill_v6->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			mutex_enter(&ill_v6->ill_lock);
			ipif->ipif_state_flags &= ~IPIF_CHANGING;
			IPIF_UNMARK_MOVING(ipif);
			mutex_exit(&ill_v6->ill_lock);
			if (ipif->ipif_was_up) {
				if (!(ipif->ipif_flags & IPIF_UP))
					err = ipif_up(ipif, q, mp);
				ipif->ipif_was_up = B_FALSE;
				if (err != 0) {
					/*
					 * Can there be any other error ?
					 */
					ASSERT(err == EINPROGRESS);
					return (err);
				}
			}
		}
		mutex_enter(&ill_v6->ill_lock);
		ill_v6->ill_state_flags &= ~ILL_CHANGING;
		mutex_exit(&ill_v6->ill_lock);
		ill_v6->ill_up_ipifs = B_FALSE;
		if (ill_v6->ill_move_in_progress) {
			ASSERT(ill_v6->ill_move_peer != NULL);
			ill_v6->ill_move_in_progress = B_FALSE;
			from_ill = ill_v6->ill_move_peer;
			from_ill->ill_move_in_progress = B_FALSE;
			from_ill->ill_move_peer = NULL;
			mutex_enter(&from_ill->ill_lock);
			from_ill->ill_state_flags &= ~ILL_CHANGING;
			mutex_exit(&from_ill->ill_lock);
			if (from_ill->ill_phyint->phyint_flags & PHYI_STANDBY) {
				phyint_inactive(from_ill->ill_phyint);
			}
			if (ill_v6->ill_phyint->phyint_flags & PHYI_STANDBY) {
				phyint_inactive(ill_v6->ill_phyint);
			}
			ill_v6->ill_move_peer = NULL;
		}
	}
	return (0);
}

/*
 * Bring down all the appropriate ipifs.
 */
/* ARGSUSED */
static void
ill_down_ipifs(ill_t *ill, mblk_t *mp, int index, boolean_t chk_nofailover)
{
	ipif_t	*ipif;

	ASSERT(IAM_WRITER_ILL(ill));

	/*
	 * Except for ipif_state_flags the other fields of the ipif/ill that
	 * are modified below are protected implicitly since we are a writer
	 */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		if (chk_nofailover && (ipif->ipif_flags & IPIF_NOFAILOVER))
			continue;
		if (index == 0 || index == ipif->ipif_orig_ifindex) {
			/*
			 * We go through the ipif_down logic even if the ipif
			 * is already down, since routes can be added based
			 * on down ipifs. Going through ipif_down once again
			 * will delete any IREs created based on these routes.
			 */
			if (ipif->ipif_flags & IPIF_UP)
				ipif->ipif_was_up = B_TRUE;
			/*
			 * If called with chk_nofailover true, the ipif is
			 * moving.
			 */
			mutex_enter(&ill->ill_lock);
			if (chk_nofailover) {
				ipif->ipif_state_flags |=
				    IPIF_MOVING | IPIF_CHANGING;
			} else {
				ipif->ipif_state_flags |= IPIF_CHANGING;
			}
			mutex_exit(&ill->ill_lock);
			/*
			 * Need to re-create net/subnet bcast ires if
			 * they are dependent on ipif.
			 */
			if (!ipif->ipif_isv6)
				ipif_check_bcast_ires(ipif);
			(void) ipif_logical_down(ipif, NULL, NULL);
			ipif_non_duplicate(ipif);
			ipif_down_tail(ipif);
			/*
			 * We don't do ipif_multicast_down for IPv4 in
			 * ipif_down. We need to set this so that
			 * ipif_multicast_up will join the
			 * ALLHOSTS_GROUP on to_ill.
			 */
			ipif->ipif_multicast_up = B_FALSE;
		}
	}
}

#define	IPSQ_INC_REF(ipsq)	{			\
	ASSERT(RW_WRITE_HELD(&ill_g_lock));		\
	(ipsq)->ipsq_refs++;				\
}

#define	IPSQ_DEC_REF(ipsq)	{			\
	ASSERT(RW_WRITE_HELD(&ill_g_lock));		\
	(ipsq)->ipsq_refs--;				\
	if ((ipsq)->ipsq_refs == 0)			\
		(ipsq)->ipsq_name[0] = '\0';		\
}

/*
 * Change the ipsq of all the ill's whose current ipsq is 'cur_ipsq' to
 * new_ipsq.
 */
static void
ill_merge_ipsq(ipsq_t *cur_ipsq, ipsq_t *new_ipsq)
{
	phyint_t *phyint;
	phyint_t *next_phyint;

	/*
	 * To change the ipsq of an ill, we need to hold the ill_g_lock as
	 * writer and the ill_lock of the ill in question. Also the dest
	 * ipsq can't vanish while we hold the ill_g_lock as writer.
	 */
	ASSERT(RW_WRITE_HELD(&ill_g_lock));

	phyint = cur_ipsq->ipsq_phyint_list;
	cur_ipsq->ipsq_phyint_list = NULL;
	while (phyint != NULL) {
		next_phyint = phyint->phyint_ipsq_next;
		IPSQ_DEC_REF(cur_ipsq);
		phyint->phyint_ipsq_next = new_ipsq->ipsq_phyint_list;
		new_ipsq->ipsq_phyint_list = phyint;
		IPSQ_INC_REF(new_ipsq);
		phyint->phyint_ipsq = new_ipsq;
		phyint = next_phyint;
	}
}

#define	SPLIT_SUCCESS		0
#define	SPLIT_NOT_NEEDED	1
#define	SPLIT_FAILED		2

int
ill_split_to_grp_ipsq(phyint_t *phyint, ipsq_t *cur_ipsq, boolean_t need_retry)
{
	ipsq_t *newipsq = NULL;

	/*
	 * Assertions denote pre-requisites for changing the ipsq of
	 * a phyint
	 */
	ASSERT(RW_WRITE_HELD(&ill_g_lock));
	/*
	 * <ill-phyint> assocs can't change while ill_g_lock
	 * is held as writer. See ill_phyint_reinit()
	 */
	ASSERT(phyint->phyint_illv4 == NULL ||
	    MUTEX_HELD(&phyint->phyint_illv4->ill_lock));
	ASSERT(phyint->phyint_illv6 == NULL ||
	    MUTEX_HELD(&phyint->phyint_illv6->ill_lock));

	if ((phyint->phyint_groupname_len !=
	    (strlen(cur_ipsq->ipsq_name) + 1) ||
	    bcmp(phyint->phyint_groupname, cur_ipsq->ipsq_name,
	    phyint->phyint_groupname_len) != 0)) {
		/*
		 * Once we fail in creating a new ipsq due to memory shortage,
		 * don't attempt to create new ipsq again, based on another
		 * phyint, since we want all phyints belonging to an IPMP group
		 * to be in the same ipsq even in the event of a memory
		 * allocation failure.
		 */
		newipsq = ip_ipsq_lookup(phyint->phyint_groupname, !need_retry,
		    cur_ipsq);
		if (newipsq == NULL) {
			/* Memory allocation failure */
			return (SPLIT_FAILED);
		} else {
			/* ipsq_refs protected by ill_g_lock (writer) */
			IPSQ_DEC_REF(cur_ipsq);
			phyint->phyint_ipsq = newipsq;
			phyint->phyint_ipsq_next = newipsq->ipsq_phyint_list;
			newipsq->ipsq_phyint_list = phyint;
			IPSQ_INC_REF(newipsq);
			return (SPLIT_SUCCESS);
		}
	}
	return (SPLIT_NOT_NEEDED);
}

/*
 * The ill locks of the phyint and the ill_g_lock (writer) must be held
 * to do this split
 */
static int
ill_split_to_own_ipsq(phyint_t *phyint, ipsq_t *cur_ipsq)
{
	ipsq_t *newipsq;

	ASSERT(RW_WRITE_HELD(&ill_g_lock));
	/*
	 * <ill-phyint> assocs can't change while ill_g_lock
	 * is held as writer. See ill_phyint_reinit()
	 */
	ASSERT(phyint->phyint_illv4 == NULL ||
	    MUTEX_HELD(&phyint->phyint_illv4->ill_lock));
	ASSERT(phyint->phyint_illv6 == NULL ||
	    MUTEX_HELD(&phyint->phyint_illv6->ill_lock));

	if (!ipsq_init((phyint->phyint_illv4 != NULL) ?
	    phyint->phyint_illv4 : phyint->phyint_illv6)) {
		/*
		 * ipsq_init failed due to no memory;
		 * the caller will use the same ipsq.
		 */
		return (SPLIT_FAILED);
	}

	/* ipsq_ref is protected by ill_g_lock (writer) */
	IPSQ_DEC_REF(cur_ipsq);

	/*
	 * This is a new ipsq that is unknown to the world.
	 * So we don't need to hold ipsq_lock.
	 */
	newipsq = phyint->phyint_ipsq;
	newipsq->ipsq_writer = NULL;
	newipsq->ipsq_reentry_cnt--;
	ASSERT(newipsq->ipsq_reentry_cnt == 0);
#ifdef ILL_DEBUG
	newipsq->ipsq_depth = 0;
#endif

	return (SPLIT_SUCCESS);
}

/*
 * Change the ipsq of all the ill's whose current ipsq is 'cur_ipsq' to
 * ipsq's representing their individual groups or themselves. Return
 * whether split needs to be retried again later.
 */
static boolean_t
ill_split_ipsq(ipsq_t *cur_ipsq)
{
	phyint_t *phyint;
	phyint_t *next_phyint;
	int	error;
	boolean_t need_retry = B_FALSE;

	phyint = cur_ipsq->ipsq_phyint_list;
	cur_ipsq->ipsq_phyint_list = NULL;
	while (phyint != NULL) {
		next_phyint = phyint->phyint_ipsq_next;
		/*
		 * 'created' will tell us whether the callee actually
		 * created an ipsq. Lack of memory may force the callee
		 * to return without creating an ipsq.
		 */
		if (phyint->phyint_groupname == NULL) {
			error = ill_split_to_own_ipsq(phyint, cur_ipsq);
		} else {
			error = ill_split_to_grp_ipsq(phyint, cur_ipsq,
			    need_retry);
		}

		switch (error) {
		case SPLIT_FAILED:
			need_retry = B_TRUE;
			/* FALLTHRU */
		case SPLIT_NOT_NEEDED:
			/*
			 * Keep it on the list.
			 */
			phyint->phyint_ipsq_next = cur_ipsq->ipsq_phyint_list;
			cur_ipsq->ipsq_phyint_list = phyint;
			break;
		case SPLIT_SUCCESS:
			break;
		default:
			ASSERT(0);
		}

		phyint = next_phyint;
	}
	return (need_retry);
}
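
/*
 * Editorial aside: ill_lock_ipsq_ills() and ill_lock_ills() below avoid
 * deadlock by always acquiring the ill locks in one canonical order (sorted
 * by pointer value) and by skipping adjacent duplicates so no lock is taken
 * twice. A stand-alone sketch of the same discipline using POSIX mutexes
 * (illustrative only; the kernel code happens to sort in the opposite
 * direction, which is equally valid as long as every caller agrees):
 */
#if 0
#include <pthread.h>

static void
lock_all(pthread_mutex_t **v, int cnt)
{
	int i, j;
	pthread_mutex_t *tmp;

	/* bubble sort by address: every caller locks in the same order */
	for (i = 0; i < cnt - 1; i++) {
		for (j = 0; j < cnt - 1 - i; j++) {
			if (v[j] > v[j + 1]) {
				tmp = v[j];
				v[j] = v[j + 1];
				v[j + 1] = tmp;
			}
		}
	}
	for (i = 0; i < cnt; i++) {
		if (i > 0 && v[i] == v[i - 1])
			continue;	/* duplicate entry: already held */
		(void) pthread_mutex_lock(v[i]);
	}
}
#endif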

/*
 * Given an ipsq 'ipsq', lock all the ills associated with this ipsq
 * and return them in 'list'; the caller needs this list in order to
 * unlock all the ills later. The <ill-ipsq> associations could change
 * between the lock and the unlock, so the unlock can't traverse the
 * ipsq to get the list of ills.
 */
static int
ill_lock_ipsq_ills(ipsq_t *ipsq, ill_t **list, int list_max)
{
	int	cnt = 0;
	phyint_t *phyint;

	/*
	 * The caller holds ill_g_lock to ensure that the ill memberships
	 * of the ipsq don't change
	 */
	ASSERT(RW_LOCK_HELD(&ill_g_lock));

	phyint = ipsq->ipsq_phyint_list;
	while (phyint != NULL) {
		if (phyint->phyint_illv4 != NULL) {
			ASSERT(cnt < list_max);
			list[cnt++] = phyint->phyint_illv4;
		}
		if (phyint->phyint_illv6 != NULL) {
			ASSERT(cnt < list_max);
			list[cnt++] = phyint->phyint_illv6;
		}
		phyint = phyint->phyint_ipsq_next;
	}
	ill_lock_ills(list, cnt);
	return (cnt);
}

void
ill_lock_ills(ill_t **list, int cnt)
{
	int	i;

	if (cnt > 1) {
		boolean_t try_again;
		do {
			try_again = B_FALSE;
			for (i = 0; i < cnt - 1; i++) {
				if (list[i] < list[i + 1]) {
					ill_t *tmp;

					/* swap the elements */
					tmp = list[i];
					list[i] = list[i + 1];
					list[i + 1] = tmp;
					try_again = B_TRUE;
				}
			}
		} while (try_again);
	}

	for (i = 0; i < cnt; i++) {
		if (i == 0) {
			if (list[i] != NULL)
				mutex_enter(&list[i]->ill_lock);
			else
				return;
		} else if ((list[i - 1] != list[i]) && (list[i] != NULL)) {
			mutex_enter(&list[i]->ill_lock);
		}
	}
}

void
ill_unlock_ills(ill_t **list, int cnt)
{
	int	i;

	for (i = 0; i < cnt; i++) {
		if ((i == 0) && (list[i] != NULL)) {
			mutex_exit(&list[i]->ill_lock);
		} else if ((list[i - 1] != list[i]) && (list[i] != NULL)) {
			mutex_exit(&list[i]->ill_lock);
		}
	}
}

/*
 * Merge all the ills from 1 ipsq group into another ipsq group.
 * The source ipsq group is specified by the ipsq associated with
 * 'from_ill'. The destination ipsq group is specified by the ipsq
 * associated with 'to_ill' or 'groupname' respectively.
 * Note that ipsq itself does not have a reference count mechanism
 * and functions don't look up an ipsq and pass it around. Instead
 * functions pass around an ill or groupname, and the ipsq is looked
 * up from the ill or groupname and the required operation performed
 * atomically with the lookup on the ipsq.
 */
static int
ill_merge_groups(ill_t *from_ill, ill_t *to_ill, char *groupname, mblk_t *mp,
    queue_t *q)
{
	ipsq_t	*old_ipsq;
	ipsq_t	*new_ipsq;
	ill_t	**ill_list;
	int	cnt;
	size_t	ill_list_size;
	boolean_t became_writer_on_new_sq = B_FALSE;

	/* Exactly 1 of 'to_ill' and groupname can be specified. */
	ASSERT((to_ill != NULL) ^ (groupname != NULL));

	/*
	 * Need to hold ill_g_lock as writer and also the ill_lock to
	 * change the <ill-ipsq> assoc of an ill. Need to hold the
	 * ipsq_lock to prevent new messages from landing on an ipsq.
	 */
	rw_enter(&ill_g_lock, RW_WRITER);

	old_ipsq = from_ill->ill_phyint->phyint_ipsq;
	if (groupname != NULL)
		new_ipsq = ip_ipsq_lookup(groupname, B_TRUE, NULL);
	else {
		new_ipsq = to_ill->ill_phyint->phyint_ipsq;
	}

	ASSERT(old_ipsq != NULL && new_ipsq != NULL);

	/*
	 * Both groups are on the same ipsq.
	 */
	if (old_ipsq == new_ipsq) {
		rw_exit(&ill_g_lock);
		return (0);
	}

	cnt = old_ipsq->ipsq_refs << 1;
	ill_list_size = cnt * sizeof (ill_t *);
	ill_list = kmem_zalloc(ill_list_size, KM_NOSLEEP);
	if (ill_list == NULL) {
		rw_exit(&ill_g_lock);
		return (ENOMEM);
	}
	cnt = ill_lock_ipsq_ills(old_ipsq, ill_list, cnt);

	/* Need ipsq lock to enqueue messages on new ipsq or to become writer */
	mutex_enter(&new_ipsq->ipsq_lock);
	if ((new_ipsq->ipsq_writer == NULL &&
	    new_ipsq->ipsq_current_ipif == NULL) ||
	    (new_ipsq->ipsq_writer == curthread)) {
		new_ipsq->ipsq_writer = curthread;
		new_ipsq->ipsq_reentry_cnt++;
		became_writer_on_new_sq = B_TRUE;
	}

	/*
	 * We are holding ill_g_lock as writer and all the ill locks of
	 * the old ipsq. So the old_ipsq can't be looked up, and hence no new
	 * message can land up on the old ipsq even though we don't hold the
	 * ipsq_lock of the old_ipsq. Now move all messages to the new ipsq.
	 */
	ill_move_to_new_ipsq(old_ipsq, new_ipsq, mp, q);

	/*
	 * Now change the ipsq of all ills in the 'old_ipsq' to 'new_ipsq'.
	 * 'new_ipsq' has been looked up, and it can't change its <ill-ipsq>
	 * assocs till we release the ill_g_lock, and hence it can't vanish.
	 */
	ill_merge_ipsq(old_ipsq, new_ipsq);

	/*
	 * Mark the new ipsq as needing a split since it is currently
	 * being shared by more than 1 IPMP group. The split will
	 * occur at the end of ipsq_exit.
	 */
	new_ipsq->ipsq_split = B_TRUE;

	/* Now release all the locks */
	mutex_exit(&new_ipsq->ipsq_lock);
	ill_unlock_ills(ill_list, cnt);
	rw_exit(&ill_g_lock);

	kmem_free(ill_list, ill_list_size);

	/*
	 * If we succeeded in becoming writer on the new ipsq, then
	 * drain the new ipsq and start processing all enqueued messages
	 * including the current ioctl we are processing which is either
	 * a set groupname or failover/failback.
	 */
	if (became_writer_on_new_sq)
		ipsq_exit(new_ipsq, B_TRUE, B_TRUE);

	/*
	 * syncq has been changed and all the messages have been moved.
	 */
	mutex_enter(&old_ipsq->ipsq_lock);
	old_ipsq->ipsq_current_ipif = NULL;
	mutex_exit(&old_ipsq->ipsq_lock);
	return (EINPROGRESS);
}

/*
 * Delete and add the loopback copy and non-loopback copy of
 * the BROADCAST ire corresponding to ill and addr. Used to
 * group broadcast ires together when ill becomes part of
 * a group.
 *
 * This function is also called when ill is leaving the group
 * so that the ires belonging to the group get re-grouped.
 */
static void
ill_bcast_delete_and_add(ill_t *ill, ipaddr_t addr)
{
	ire_t *ire, *nire, *nire_next, *ire_head = NULL;
	ire_t **ire_ptpn = &ire_head;

	/*
	 * The loopback and non-loopback IREs are inserted in the order in
	 * which they're found, on the basis that they are correctly ordered
	 * (loopback first).
	 */
	for (;;) {
		ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ill->ill_ipif,
		    ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL);
		if (ire == NULL)
			break;

		/*
		 * We are passing in KM_SLEEP because it is not easy to
		 * go back to a sane state in case of memory failure.
		 */
		nire = kmem_cache_alloc(ire_cache, KM_SLEEP);
		ASSERT(nire != NULL);
		bzero(nire, sizeof (ire_t));
		/*
		 * Don't use ire_max_frag directly since we don't
		 * hold on to 'ire' until we add the new ire 'nire' and
		 * we don't want the new ire to have a dangling reference
		 * to 'ire'. The ire_max_frag of a broadcast ire must
		 * be in sync with the ipif_mtu of the associated ipif.
		 * For example, this happens as a result of SIOCSLIFNAME,
		 * SIOCSLIFLNKINFO or a DL_NOTE_SDU_SIZE initiated by
		 * the driver. A change in ire_max_frag triggered as
		 * a result of path MTU discovery, due to an
		 * IP_IOC_IRE_ADVISE_NOREPLY from the transport, or due to a
		 * 'route change -mtu' command does not apply to broadcast
		 * ires.
		 *
		 * XXX We need a recovery strategy here if ire_init fails
		 */
		if (ire_init(nire,
		    (uchar_t *)&ire->ire_addr,
		    (uchar_t *)&ire->ire_mask,
		    (uchar_t *)&ire->ire_src_addr,
		    (uchar_t *)&ire->ire_gateway_addr,
		    (uchar_t *)&ire->ire_in_src_addr,
		    ire->ire_stq == NULL ? &ip_loopback_mtu :
		    &ire->ire_ipif->ipif_mtu,
		    (ire->ire_nce != NULL ? ire->ire_nce->nce_fp_mp : NULL),
		    ire->ire_rfq,
		    ire->ire_stq,
		    ire->ire_type,
		    (ire->ire_nce != NULL ? ire->ire_nce->nce_res_mp : NULL),
		    ire->ire_ipif,
		    ire->ire_in_ill,
		    ire->ire_cmask,
		    ire->ire_phandle,
		    ire->ire_ihandle,
		    ire->ire_flags,
		    &ire->ire_uinfo,
		    NULL,
		    NULL) == NULL) {
			cmn_err(CE_PANIC, "ire_init() failed");
		}
		ire_delete(ire);
		ire_refrele(ire);

		/*
		 * The newly created IREs are inserted at the tail of the list
		 * starting with ire_head. As we've just allocated them no one
		 * knows about them so it's safe.
		 */
		*ire_ptpn = nire;
		ire_ptpn = &nire->ire_next;
	}

	for (nire = ire_head; nire != NULL; nire = nire_next) {
		int error;
		ire_t *oire;
		/* unlink the IRE from our list before calling ire_add() */
		nire_next = nire->ire_next;
		nire->ire_next = NULL;

		/* ire_add adds the ire at the right place in the list */
		oire = nire;
		error = ire_add(&nire, NULL, NULL, NULL, B_FALSE);
		ASSERT(error == 0);
		ASSERT(oire == nire);
		ire_refrele(nire);	/* Held in ire_add */
	}
}
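
/*
 * Editorial aside: ill_bcast_delete_and_add() above never relinks an
 * existing IRE, because concurrent walkers may still hold references to it;
 * it builds a fresh copy, deletes the original, and inserts the copy. The
 * stand-alone sketch below shows that copy-delete-reinsert idea on a plain
 * singly linked list; entry_t and reposition() are hypothetical names.
 */
#if 0
#include <stdlib.h>
#include <string.h>

typedef struct entry {
	int		e_key;
	struct entry	*e_next;
} entry_t;

/* Move the entry with 'key' to the head by recreating it, not relinking. */
static int
reposition(entry_t **headp, int key)
{
	entry_t **pp, *e, *ne;

	for (pp = headp; (e = *pp) != NULL; pp = &e->e_next)
		if (e->e_key == key)
			break;
	if (e == NULL)
		return (-1);			/* no such entry */
	if ((ne = malloc(sizeof (entry_t))) == NULL)
		return (-1);
	(void) memcpy(ne, e, sizeof (entry_t));	/* copy the payload */
	*pp = e->e_next;			/* unlink and free the old */
	free(e);
	ne->e_next = *headp;			/* reinsert the copy */
	*headp = ne;
	return (0);
}
#endif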

/*
 * This function is usually called when an ill is inserted in
 * a group and all the ipifs are already UP. As all the ipifs
 * are already UP, the broadcast ires have already been created
 * and inserted. But ire_add_v4 would not have grouped them properly.
 * We need to re-group for the benefit of ip_wput_ire which
 * expects BROADCAST ires to be grouped properly to avoid sending
 * more than one copy of the broadcast packet per group.
 *
 * NOTE : We don't check for ill_ipif_up_count to be non-zero here
 *	  because when ipif_up_done ends up calling this, ires have
 *	  already been added before illgrp_insert i.e. before ill_group
 *	  has been initialized.
 */
static void
ill_group_bcast_for_xmit(ill_t *ill)
{
	ill_group_t *illgrp;
	ipif_t	*ipif;
	ipaddr_t addr;
	ipaddr_t net_mask;
	ipaddr_t subnet_netmask;

	illgrp = ill->ill_group;

	/*
	 * This function is called even when an ill is deleted from
	 * the group. Hence, illgrp could be null.
	 */
	if (illgrp != NULL && illgrp->illgrp_ill_count == 1)
		return;

	/*
	 * Delete all the BROADCAST ires matching this ill and add
	 * them back. This time, ire_add_v4 should take care of
	 * grouping them with others because ill is part of the
	 * group.
	 */
	ill_bcast_delete_and_add(ill, 0);
	ill_bcast_delete_and_add(ill, INADDR_BROADCAST);

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {

		if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
		    !(ipif->ipif_flags & IPIF_NOLOCAL)) {
			net_mask = ip_net_mask(ipif->ipif_lcl_addr);
		} else {
			net_mask = htonl(IN_CLASSA_NET);
		}
		addr = net_mask & ipif->ipif_subnet;
		ill_bcast_delete_and_add(ill, addr);
		ill_bcast_delete_and_add(ill, ~net_mask | addr);

		subnet_netmask = ipif->ipif_net_mask;
		addr = ipif->ipif_subnet;
		ill_bcast_delete_and_add(ill, addr);
		ill_bcast_delete_and_add(ill, ~subnet_netmask | addr);
	}
}

/*
 * This function is called from illgrp_delete when ill is being deleted
 * from the group.
 *
 * As ill is not there in the group anymore, any address belonging
 * to this ill should be cleared of IRE_MARK_NORECV.
 */
static void
ill_clear_bcast_mark(ill_t *ill, ipaddr_t addr)
{
	ire_t	*ire;
	irb_t	*irb;

	ASSERT(ill->ill_group == NULL);

	ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ill->ill_ipif,
	    ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL);

	if (ire != NULL) {
		/*
		 * IPMP and plumbing operations are serialized on the ipsq, so
		 * no one will insert or delete a broadcast ire under our feet.
		 */
		irb = ire->ire_bucket;
		rw_enter(&irb->irb_lock, RW_READER);
		ire_refrele(ire);

		for (; ire != NULL; ire = ire->ire_next) {
			if (ire->ire_addr != addr)
				break;
			if (ire_to_ill(ire) != ill)
				continue;

			ASSERT(!(ire->ire_marks & IRE_MARK_CONDEMNED));
			ire->ire_marks &= ~IRE_MARK_NORECV;
		}
		rw_exit(&irb->irb_lock);
	}
}
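
/*
 * Editorial aside: the heart of ill_mark_bcast() below is a small
 * nomination policy: of all the duplicate receivers for an address, exactly
 * one is left without IRE_MARK_NORECV; healthy interfaces are preferred,
 * an INACTIVE one is used only as a fallback, and an OFFLINE one is never
 * chosen. A stand-alone sketch of the selection rule in isolation (the
 * flag values and nominate() are illustrative, not kernel definitions):
 */
#if 0
#define	F_FAILED	0x1
#define	F_INACTIVE	0x2
#define	F_OFFLINE	0x4

/*
 * Return the index of the member to nominate, or -1 if none is usable;
 * 'flags' holds one flag word per member.
 */
static int
nominate(const unsigned int *flags, int n)
{
	int i, fallback = -1;

	for (i = 0; i < n; i++) {
		if (flags[i] & F_OFFLINE)
			continue;		/* never pick OFFLINE */
		if (flags[i] & F_FAILED)
			continue;
		if (flags[i] & F_INACTIVE) {
			if (fallback == -1)
				fallback = i;	/* remember, don't pick yet */
			continue;
		}
		return (i);			/* first healthy member wins */
	}
	return (fallback);
}
#endif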

/*
 * This function must be called only after the broadcast ires
 * have been grouped together. For a given address addr, nominate
 * only one of the ires whose interface is not FAILED or OFFLINE.
 *
 * This is also called when an ipif goes down, so that we can nominate
 * a different ire with the same address for receiving.
 */
static void
ill_mark_bcast(ill_group_t *illgrp, ipaddr_t addr)
{
	irb_t *irb;
	ire_t *ire;
	ire_t *ire1;
	ire_t *save_ire;
	ire_t **irep = NULL;
	boolean_t first = B_TRUE;
	ire_t *clear_ire = NULL;
	ire_t *start_ire = NULL;
	ire_t	*new_lb_ire;
	ire_t	*new_nlb_ire;
	boolean_t new_lb_ire_used = B_FALSE;
	boolean_t new_nlb_ire_used = B_FALSE;
	uint64_t match_flags;
	uint64_t phyi_flags;
	boolean_t fallback = B_FALSE;

	ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, NULL, ALL_ZONES,
	    NULL, MATCH_IRE_TYPE);
	/*
	 * We may not be able to find some ires if a previous
	 * ire_create failed. This happens when an ipif goes
	 * down and we are unable to create BROADCAST ires due
	 * to memory failure. Thus, we have to check for NULL
	 * below. This should handle the case for LOOPBACK,
	 * POINTOPOINT and interfaces with some POINTOPOINT
	 * logicals for which there are no BROADCAST ires.
	 */
	if (ire == NULL)
		return;
	/*
	 * Currently IRE_BROADCASTS are deleted when an ipif
	 * goes down which runs exclusively. Thus, setting
	 * IRE_MARK_RCVD should not race with ire_delete marking
	 * IRE_MARK_CONDEMNED. We grab the lock below just to
	 * be consistent with other parts of the code that walk
	 * a given bucket.
	 */
	save_ire = ire;
	irb = ire->ire_bucket;
	new_lb_ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
	if (new_lb_ire == NULL) {
		ire_refrele(ire);
		return;
	}
	new_nlb_ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
	if (new_nlb_ire == NULL) {
		ire_refrele(ire);
		kmem_cache_free(ire_cache, new_lb_ire);
		return;
	}
	IRB_REFHOLD(irb);
	rw_enter(&irb->irb_lock, RW_WRITER);
	/*
	 * Get to the first ire matching the address and the
	 * group. If the address does not match we are done
	 * as we could not find the IRE. If the address matches
	 * we should get to the first one matching the group.
	 */
	while (ire != NULL) {
		if (ire->ire_addr != addr ||
		    ire->ire_ipif->ipif_ill->ill_group == illgrp) {
			break;
		}
		ire = ire->ire_next;
	}
	match_flags = PHYI_FAILED | PHYI_INACTIVE;
	start_ire = ire;
redo:
	while (ire != NULL && ire->ire_addr == addr &&
	    ire->ire_ipif->ipif_ill->ill_group == illgrp) {
		/*
		 * The first ire for any address within a group
		 * should always be the one with IRE_MARK_NORECV cleared
		 * so that ip_wput_ire can avoid searching for one.
		 * Note down the insertion point which will be used
		 * later.
		 */
		if (first && (irep == NULL))
			irep = ire->ire_ptpn;
		/*
		 * PHYI_FAILED is set when the interface fails.
		 * This interface might have become good, but the
		 * daemon has not yet detected it. We should still
		 * not receive on this. PHYI_OFFLINE should never
		 * be picked as this has been offlined and will soon
		 * be removed.
                 */
                phyi_flags = ire->ire_ipif->ipif_ill->ill_phyint->phyint_flags;
                if (phyi_flags & PHYI_OFFLINE) {
                        ire->ire_marks |= IRE_MARK_NORECV;
                        ire = ire->ire_next;
                        continue;
                }
                if (phyi_flags & match_flags) {
                        ire->ire_marks |= IRE_MARK_NORECV;
                        ire = ire->ire_next;
                        if ((phyi_flags & (PHYI_FAILED | PHYI_INACTIVE)) ==
                            PHYI_INACTIVE) {
                                fallback = B_TRUE;
                        }
                        continue;
                }
                if (first) {
                        /*
                         * We will move this to the front of the list later
                         * on.
                         */
                        clear_ire = ire;
                        ire->ire_marks &= ~IRE_MARK_NORECV;
                } else {
                        ire->ire_marks |= IRE_MARK_NORECV;
                }
                first = B_FALSE;
                ire = ire->ire_next;
        }
        /*
         * If we never nominated anybody, try nominating at least
         * an INACTIVE, if we found one. Do it only once though.
         */
        if (first && (match_flags == (PHYI_FAILED | PHYI_INACTIVE)) &&
            fallback) {
                match_flags = PHYI_FAILED;
                ire = start_ire;
                irep = NULL;
                goto redo;
        }
        ire_refrele(save_ire);

        /*
         * irep non-NULL indicates that we entered the while loop
         * above. If clear_ire is at the insertion point, we don't
         * have to do anything. clear_ire will be NULL if all the
         * interfaces are failed.
         *
         * We cannot unlink and reinsert the ire at the right place
         * in the list since there can be other walkers of this bucket.
         * Instead we delete and recreate the ire.
         */
        if (clear_ire != NULL && irep != NULL && *irep != clear_ire) {
                ire_t *clear_ire_stq = NULL;
                mblk_t *fp_mp = NULL, *res_mp = NULL;

                bzero(new_lb_ire, sizeof (ire_t));
                if (clear_ire->ire_nce != NULL) {
                        fp_mp = clear_ire->ire_nce->nce_fp_mp;
                        res_mp = clear_ire->ire_nce->nce_res_mp;
                }
                /* XXX We need a recovery strategy here. */
                if (ire_init(new_lb_ire,
                    (uchar_t *)&clear_ire->ire_addr,
                    (uchar_t *)&clear_ire->ire_mask,
                    (uchar_t *)&clear_ire->ire_src_addr,
                    (uchar_t *)&clear_ire->ire_gateway_addr,
                    (uchar_t *)&clear_ire->ire_in_src_addr,
                    &clear_ire->ire_max_frag,
                    fp_mp,
                    clear_ire->ire_rfq,
                    clear_ire->ire_stq,
                    clear_ire->ire_type,
                    res_mp,
                    clear_ire->ire_ipif,
                    clear_ire->ire_in_ill,
                    clear_ire->ire_cmask,
                    clear_ire->ire_phandle,
                    clear_ire->ire_ihandle,
                    clear_ire->ire_flags,
                    &clear_ire->ire_uinfo,
                    NULL,
                    NULL) == NULL)
                        cmn_err(CE_PANIC, "ire_init() failed");
                if (clear_ire->ire_stq == NULL) {
                        ire_t *ire_next = clear_ire->ire_next;
                        if (ire_next != NULL &&
                            ire_next->ire_stq != NULL &&
                            ire_next->ire_addr == clear_ire->ire_addr &&
                            ire_next->ire_ipif->ipif_ill ==
                            clear_ire->ire_ipif->ipif_ill) {
                                clear_ire_stq = ire_next;

                                bzero(new_nlb_ire, sizeof (ire_t));
                                if (clear_ire_stq->ire_nce != NULL) {
                                        fp_mp =
                                            clear_ire_stq->ire_nce->nce_fp_mp;
                                        res_mp =
                                            clear_ire_stq->ire_nce->nce_res_mp;
                                } else {
                                        fp_mp = res_mp = NULL;
                                }
                                /* XXX We need a recovery strategy here.
                                 */
                                if (ire_init(new_nlb_ire,
                                    (uchar_t *)&clear_ire_stq->ire_addr,
                                    (uchar_t *)&clear_ire_stq->ire_mask,
                                    (uchar_t *)&clear_ire_stq->ire_src_addr,
                                    (uchar_t *)&clear_ire_stq->ire_gateway_addr,
                                    (uchar_t *)&clear_ire_stq->ire_in_src_addr,
                                    &clear_ire_stq->ire_max_frag,
                                    fp_mp,
                                    clear_ire_stq->ire_rfq,
                                    clear_ire_stq->ire_stq,
                                    clear_ire_stq->ire_type,
                                    res_mp,
                                    clear_ire_stq->ire_ipif,
                                    clear_ire_stq->ire_in_ill,
                                    clear_ire_stq->ire_cmask,
                                    clear_ire_stq->ire_phandle,
                                    clear_ire_stq->ire_ihandle,
                                    clear_ire_stq->ire_flags,
                                    &clear_ire_stq->ire_uinfo,
                                    NULL,
                                    NULL) == NULL)
                                        cmn_err(CE_PANIC, "ire_init() failed");
                        }
                }

                /*
                 * Delete the ire. We can't call ire_delete() since
                 * we are holding the bucket lock. We can't release the
                 * bucket lock since we can't allow irep to change. So just
                 * mark it CONDEMNED. The IRB_REFRELE will delete the
                 * ire from the list and do the refrele.
                 */
                clear_ire->ire_marks |= IRE_MARK_CONDEMNED;
                irb->irb_marks |= IRB_MARK_CONDEMNED;

                if (clear_ire_stq != NULL) {
                        ire_fastpath_list_delete(
                            (ill_t *)clear_ire_stq->ire_stq->q_ptr,
                            clear_ire_stq);
                        clear_ire_stq->ire_marks |= IRE_MARK_CONDEMNED;
                }

                /*
                 * Also take care of other fields like the ib/ob packet
                 * counts, etc.; they need to be duplicated. Ditto in
                 * ill_bcast_delete_and_add.
                 */

                /* Add the new ires. Insert at *irep. */
                new_lb_ire->ire_bucket = clear_ire->ire_bucket;
                ire1 = *irep;
                if (ire1 != NULL)
                        ire1->ire_ptpn = &new_lb_ire->ire_next;
                new_lb_ire->ire_next = ire1;
                /* Link the new one in. */
                new_lb_ire->ire_ptpn = irep;
                membar_producer();
                *irep = new_lb_ire;
                new_lb_ire_used = B_TRUE;
                BUMP_IRE_STATS(ire_stats_v4, ire_stats_inserted);
                new_lb_ire->ire_bucket->irb_ire_cnt++;
                new_lb_ire->ire_ipif->ipif_ire_cnt++;

                if (clear_ire_stq != NULL) {
                        new_nlb_ire->ire_bucket = clear_ire->ire_bucket;
                        irep = &new_lb_ire->ire_next;
                        /* Add the new ire. Insert at *irep. */
                        ire1 = *irep;
                        if (ire1 != NULL)
                                ire1->ire_ptpn = &new_nlb_ire->ire_next;
                        new_nlb_ire->ire_next = ire1;
                        /* Link the new one in. */
                        new_nlb_ire->ire_ptpn = irep;
                        membar_producer();
                        *irep = new_nlb_ire;
                        new_nlb_ire_used = B_TRUE;
                        BUMP_IRE_STATS(ire_stats_v4, ire_stats_inserted);
                        new_nlb_ire->ire_bucket->irb_ire_cnt++;
                        new_nlb_ire->ire_ipif->ipif_ire_cnt++;
                        ((ill_t *)new_nlb_ire->ire_stq->q_ptr)->ill_ire_cnt++;
                }
        }
        rw_exit(&irb->irb_lock);
        if (!new_lb_ire_used)
                kmem_cache_free(ire_cache, new_lb_ire);
        if (!new_nlb_ire_used)
                kmem_cache_free(ire_cache, new_nlb_ire);
        IRB_REFRELE(irb);
}

/*
 * Whenever an ipif goes down we have to renominate a different
 * broadcast ire to receive. Whenever an ipif comes up, we need
 * to make sure that we have only one ire nominated to receive.
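 *
 * As a worked example (hypothetical addresses): for an ipif with
 * ipif_lcl_addr 172.16.5.9 and ipif_net_mask 255.255.255.0, the
 * function below renominates receivers for six addresses:
 * 0.0.0.0 and 255.255.255.255 (common to every group);
 * 172.16.0.0 and 172.16.255.255 (classful net address and broadcast,
 * since ip_net_mask yields the class B mask 255.255.0.0); and
 * 172.16.5.0 and 172.16.5.255 (subnet address and broadcast).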
15198 */ 15199 static void 15200 ipif_renominate_bcast(ipif_t *ipif) 15201 { 15202 ill_t *ill = ipif->ipif_ill; 15203 ipaddr_t subnet_addr; 15204 ipaddr_t net_addr; 15205 ipaddr_t net_mask = 0; 15206 ipaddr_t subnet_netmask; 15207 ipaddr_t addr; 15208 ill_group_t *illgrp; 15209 15210 illgrp = ill->ill_group; 15211 /* 15212 * If this is the last ipif going down, it might take 15213 * the ill out of the group. In that case ipif_down -> 15214 * illgrp_delete takes care of doing the nomination. 15215 * ipif_down does not call for this case. 15216 */ 15217 ASSERT(illgrp != NULL); 15218 15219 /* There could not have been any ires associated with this */ 15220 if (ipif->ipif_subnet == 0) 15221 return; 15222 15223 ill_mark_bcast(illgrp, 0); 15224 ill_mark_bcast(illgrp, INADDR_BROADCAST); 15225 15226 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 15227 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 15228 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 15229 } else { 15230 net_mask = htonl(IN_CLASSA_NET); 15231 } 15232 addr = net_mask & ipif->ipif_subnet; 15233 ill_mark_bcast(illgrp, addr); 15234 15235 net_addr = ~net_mask | addr; 15236 ill_mark_bcast(illgrp, net_addr); 15237 15238 subnet_netmask = ipif->ipif_net_mask; 15239 addr = ipif->ipif_subnet; 15240 ill_mark_bcast(illgrp, addr); 15241 15242 subnet_addr = ~subnet_netmask | addr; 15243 ill_mark_bcast(illgrp, subnet_addr); 15244 } 15245 15246 /* 15247 * Whenever we form or delete ill groups, we need to nominate one set of 15248 * BROADCAST ires for receiving in the group. 15249 * 15250 * 1) When ipif_up_done -> ilgrp_insert calls this function, BROADCAST ires 15251 * have been added, but ill_ipif_up_count is 0. Thus, we don't assert 15252 * for ill_ipif_up_count to be non-zero. This is the only case where 15253 * ill_ipif_up_count is zero and we would still find the ires. 15254 * 15255 * 2) ip_sioctl_group_name/ifgrp_insert calls this function, at least one 15256 * ipif is UP and we just have to do the nomination. 15257 * 15258 * 3) When ill_handoff_responsibility calls us, some ill has been removed 15259 * from the group. So, we have to do the nomination. 15260 * 15261 * Because of (3), there could be just one ill in the group. But we have 15262 * to nominate still as IRE_MARK_NORCV may have been marked on this. 15263 * Thus, this function does not optimize when there is only one ill as 15264 * it is not correct for (3). 15265 */ 15266 static void 15267 ill_nominate_bcast_rcv(ill_group_t *illgrp) 15268 { 15269 ill_t *ill; 15270 ipif_t *ipif; 15271 ipaddr_t subnet_addr; 15272 ipaddr_t prev_subnet_addr = 0; 15273 ipaddr_t net_addr; 15274 ipaddr_t prev_net_addr = 0; 15275 ipaddr_t net_mask = 0; 15276 ipaddr_t subnet_netmask; 15277 ipaddr_t addr; 15278 15279 /* 15280 * When the last memeber is leaving, there is nothing to 15281 * nominate. 15282 */ 15283 if (illgrp->illgrp_ill_count == 0) { 15284 ASSERT(illgrp->illgrp_ill == NULL); 15285 return; 15286 } 15287 15288 ill = illgrp->illgrp_ill; 15289 ASSERT(!ill->ill_isv6); 15290 /* 15291 * We assume that ires with same address and belonging to the 15292 * same group, has been grouped together. Nominating a *single* 15293 * ill in the group for sending and receiving broadcast is done 15294 * by making sure that the first BROADCAST ire (which will be 15295 * the one returned by ire_ctable_lookup for ip_rput and the 15296 * one that will be used in ip_wput_ire) will be the one that 15297 * will not have IRE_MARK_NORECV set. 
15298 * 15299 * 1) ip_rput checks and discards packets received on ires marked 15300 * with IRE_MARK_NORECV. Thus, we don't send up duplicate 15301 * broadcast packets. We need to clear IRE_MARK_NORECV on the 15302 * first ire in the group for every broadcast address in the group. 15303 * ip_rput will accept packets only on the first ire i.e only 15304 * one copy of the ill. 15305 * 15306 * 2) ip_wput_ire needs to send out just one copy of the broadcast 15307 * packet for the whole group. It needs to send out on the ill 15308 * whose ire has not been marked with IRE_MARK_NORECV. If it sends 15309 * on the one marked with IRE_MARK_NORECV, ip_rput will accept 15310 * the copy echoed back on other port where the ire is not marked 15311 * with IRE_MARK_NORECV. 15312 * 15313 * Note that we just need to have the first IRE either loopback or 15314 * non-loopback (either of them may not exist if ire_create failed 15315 * during ipif_down) with IRE_MARK_NORECV not set. ip_rput will 15316 * always hit the first one and hence will always accept one copy. 15317 * 15318 * We have a broadcast ire per ill for all the unique prefixes 15319 * hosted on that ill. As we don't have a way of knowing the 15320 * unique prefixes on a given ill and hence in the whole group, 15321 * we just call ill_mark_bcast on all the prefixes that exist 15322 * in the group. For the common case of one prefix, the code 15323 * below optimizes by remebering the last address used for 15324 * markng. In the case of multiple prefixes, this will still 15325 * optimize depending the order of prefixes. 15326 * 15327 * The only unique address across the whole group is 0.0.0.0 and 15328 * 255.255.255.255 and thus we call only once. ill_mark_bcast enables 15329 * the first ire in the bucket for receiving and disables the 15330 * others. 15331 */ 15332 ill_mark_bcast(illgrp, 0); 15333 ill_mark_bcast(illgrp, INADDR_BROADCAST); 15334 for (; ill != NULL; ill = ill->ill_group_next) { 15335 15336 for (ipif = ill->ill_ipif; ipif != NULL; 15337 ipif = ipif->ipif_next) { 15338 15339 if (!(ipif->ipif_flags & IPIF_UP) || 15340 ipif->ipif_subnet == 0) { 15341 continue; 15342 } 15343 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 15344 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 15345 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 15346 } else { 15347 net_mask = htonl(IN_CLASSA_NET); 15348 } 15349 addr = net_mask & ipif->ipif_subnet; 15350 if (prev_net_addr == 0 || prev_net_addr != addr) { 15351 ill_mark_bcast(illgrp, addr); 15352 net_addr = ~net_mask | addr; 15353 ill_mark_bcast(illgrp, net_addr); 15354 } 15355 prev_net_addr = addr; 15356 15357 subnet_netmask = ipif->ipif_net_mask; 15358 addr = ipif->ipif_subnet; 15359 if (prev_subnet_addr == 0 || 15360 prev_subnet_addr != addr) { 15361 ill_mark_bcast(illgrp, addr); 15362 subnet_addr = ~subnet_netmask | addr; 15363 ill_mark_bcast(illgrp, subnet_addr); 15364 } 15365 prev_subnet_addr = addr; 15366 } 15367 } 15368 } 15369 15370 /* 15371 * This function is called while forming ill groups. 15372 * 15373 * Currently, we handle only allmulti groups. We want to join 15374 * allmulti on only one of the ills in the groups. In future, 15375 * when we have link aggregation, we may have to join normal 15376 * multicast groups on multiple ills as switch does inbound load 15377 * balancing. Following are the functions that calls this 15378 * function : 15379 * 15380 * 1) ill_recover_multicast : Interface is coming back UP. 
 *    When the first ipif comes back UP, ipif_up_done/ipif_up_done_v6
 *    will call ill_recover_multicast to recover all the multicast
 *    groups. We need to make sure that only one member is joined
 *    in the ill group.
 *
 * 2) ip_addmulti/ip_addmulti_v6 : The ill group has already been
 *    formed and somebody is joining allmulti. We need to make sure
 *    that only one member is joined in the group.
 *
 * 3) illgrp_insert : If allmulti has already been joined, we need
 *    to make sure that only one member is joined in the group.
 *
 * 4) ip_delmulti/ip_delmulti_v6 : The member of the group that we
 *    have nominated is leaving allmulti. We need to pick some other
 *    ill.
 *
 * 5) illgrp_delete : The ill we nominated is leaving the group;
 *    we need to pick a new ill to join the group.
 *
 * For (1), (2), (5) - we just have to check whether there is
 * a good ill joined in the group. If we could not find any ill
 * joined in the group, we should join.
 *
 * For (4), the one that was nominated to receive left the group.
 * There could be nobody joined in the group when this function is
 * called.
 *
 * For (3) - we need to explicitly check whether there are multiple
 * ills joined in the group.
 *
 * For simplicity, we don't differentiate any of the above cases. We
 * just leave the group if it is joined on any of them and join on
 * the first good ill.
 */
int
ill_nominate_mcast_rcv(ill_group_t *illgrp)
{
        ilm_t *ilm;
        ill_t *ill;
        ill_t *fallback_inactive_ill = NULL;
        ill_t *fallback_failed_ill = NULL;
        int ret = 0;

        /*
         * Leave allmulti on all the ills and start fresh.
         */
        for (ill = illgrp->illgrp_ill; ill != NULL;
            ill = ill->ill_group_next) {
                if (ill->ill_join_allmulti)
                        (void) ip_leave_allmulti(ill->ill_ipif);
        }

        /*
         * Choose a good ill. Fall back to an INACTIVE or FAILED one
         * if none is available. We need to fall back to FAILED in the
         * case where we have 2 interfaces in a group - where
         * one of them is failed and the other is a good one and
         * the good one (not marked inactive) is leaving the group.
         */
        ret = 0;
        for (ill = illgrp->illgrp_ill; ill != NULL;
            ill = ill->ill_group_next) {
                /* Never pick an offline interface */
                if (ill->ill_phyint->phyint_flags & PHYI_OFFLINE)
                        continue;

                if (ill->ill_phyint->phyint_flags & PHYI_FAILED) {
                        fallback_failed_ill = ill;
                        continue;
                }
                if (ill->ill_phyint->phyint_flags & PHYI_INACTIVE) {
                        fallback_inactive_ill = ill;
                        continue;
                }
                for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
                        if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
                                ret = ip_join_allmulti(ill->ill_ipif);
                                /*
                                 * ip_join_allmulti can fail because of memory
                                 * failures. So, make sure we join at least
                                 * on one ill.
                                 */
                                if (ill->ill_join_allmulti)
                                        return (0);
                        }
                }
        }
        if (ret != 0) {
                /*
                 * If we tried nominating above and failed to do so,
                 * return the error. We might have tried multiple times,
                 * but return the latest error.
15472 */ 15473 return (ret); 15474 } 15475 if ((ill = fallback_inactive_ill) != NULL) { 15476 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 15477 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 15478 ret = ip_join_allmulti(ill->ill_ipif); 15479 return (ret); 15480 } 15481 } 15482 } else if ((ill = fallback_failed_ill) != NULL) { 15483 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 15484 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 15485 ret = ip_join_allmulti(ill->ill_ipif); 15486 return (ret); 15487 } 15488 } 15489 } 15490 return (0); 15491 } 15492 15493 /* 15494 * This function is called from illgrp_delete after it is 15495 * deleted from the group to reschedule responsibilities 15496 * to a different ill. 15497 */ 15498 static void 15499 ill_handoff_responsibility(ill_t *ill, ill_group_t *illgrp) 15500 { 15501 ilm_t *ilm; 15502 ipif_t *ipif; 15503 ipaddr_t subnet_addr; 15504 ipaddr_t net_addr; 15505 ipaddr_t net_mask = 0; 15506 ipaddr_t subnet_netmask; 15507 ipaddr_t addr; 15508 15509 ASSERT(ill->ill_group == NULL); 15510 /* 15511 * Broadcast Responsibility: 15512 * 15513 * 1. If this ill has been nominated for receiving broadcast 15514 * packets, we need to find a new one. Before we find a new 15515 * one, we need to re-group the ires that are part of this new 15516 * group (assumed by ill_nominate_bcast_rcv). We do this by 15517 * calling ill_group_bcast_for_xmit(ill) which will do the right 15518 * thing for us. 15519 * 15520 * 2. If this ill was not nominated for receiving broadcast 15521 * packets, we need to clear the IRE_MARK_NORECV flag 15522 * so that we continue to send up broadcast packets. 15523 */ 15524 if (!ill->ill_isv6) { 15525 /* 15526 * Case 1 above : No optimization here. Just redo the 15527 * nomination. 15528 */ 15529 ill_group_bcast_for_xmit(ill); 15530 ill_nominate_bcast_rcv(illgrp); 15531 15532 /* 15533 * Case 2 above : Lookup and clear IRE_MARK_NORECV. 15534 */ 15535 ill_clear_bcast_mark(ill, 0); 15536 ill_clear_bcast_mark(ill, INADDR_BROADCAST); 15537 15538 for (ipif = ill->ill_ipif; ipif != NULL; 15539 ipif = ipif->ipif_next) { 15540 15541 if (!(ipif->ipif_flags & IPIF_UP) || 15542 ipif->ipif_subnet == 0) { 15543 continue; 15544 } 15545 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 15546 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 15547 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 15548 } else { 15549 net_mask = htonl(IN_CLASSA_NET); 15550 } 15551 addr = net_mask & ipif->ipif_subnet; 15552 ill_clear_bcast_mark(ill, addr); 15553 15554 net_addr = ~net_mask | addr; 15555 ill_clear_bcast_mark(ill, net_addr); 15556 15557 subnet_netmask = ipif->ipif_net_mask; 15558 addr = ipif->ipif_subnet; 15559 ill_clear_bcast_mark(ill, addr); 15560 15561 subnet_addr = ~subnet_netmask | addr; 15562 ill_clear_bcast_mark(ill, subnet_addr); 15563 } 15564 } 15565 15566 /* 15567 * Multicast Responsibility. 15568 * 15569 * If we have joined allmulti on this one, find a new member 15570 * in the group to join allmulti. As this ill is already part 15571 * of allmulti, we don't have to join on this one. 15572 * 15573 * If we have not joined allmulti on this one, there is no 15574 * responsibility to handoff. But we need to take new 15575 * responsibility i.e, join allmulti on this one if we need 15576 * to. 
15577 */ 15578 if (ill->ill_join_allmulti) { 15579 (void) ill_nominate_mcast_rcv(illgrp); 15580 } else { 15581 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 15582 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 15583 (void) ip_join_allmulti(ill->ill_ipif); 15584 break; 15585 } 15586 } 15587 } 15588 15589 /* 15590 * We intentionally do the flushing of IRE_CACHES only matching 15591 * on the ill and not on groups. Note that we are already deleted 15592 * from the group. 15593 * 15594 * This will make sure that all IRE_CACHES whose stq is pointing 15595 * at ill_wq or ire_ipif->ipif_ill pointing at this ill will get 15596 * deleted and IRE_CACHES that are not pointing at this ill will 15597 * be left alone. 15598 */ 15599 if (ill->ill_isv6) { 15600 ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, 15601 IRE_CACHE, illgrp_cache_delete, (char *)ill, ill); 15602 } else { 15603 ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, 15604 IRE_CACHE, illgrp_cache_delete, (char *)ill, ill); 15605 } 15606 15607 /* 15608 * Some conn may have cached one of the IREs deleted above. By removing 15609 * the ire reference, we clean up the extra reference to the ill held in 15610 * ire->ire_stq. 15611 */ 15612 ipcl_walk(conn_cleanup_stale_ire, NULL); 15613 15614 /* 15615 * Re-do source address selection for all the members in the 15616 * group, if they borrowed source address from one of the ipifs 15617 * in this ill. 15618 */ 15619 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 15620 if (ill->ill_isv6) { 15621 ipif_update_other_ipifs_v6(ipif, illgrp); 15622 } else { 15623 ipif_update_other_ipifs(ipif, illgrp); 15624 } 15625 } 15626 } 15627 15628 /* 15629 * Delete the ill from the group. The caller makes sure that it is 15630 * in a group and it okay to delete from the group. So, we always 15631 * delete here. 15632 */ 15633 static void 15634 illgrp_delete(ill_t *ill) 15635 { 15636 ill_group_t *illgrp; 15637 ill_group_t *tmpg; 15638 ill_t *tmp_ill; 15639 15640 /* 15641 * Reset illgrp_ill_schednext if it was pointing at us. 15642 * We need to do this before we set ill_group to NULL. 15643 */ 15644 rw_enter(&ill_g_lock, RW_WRITER); 15645 mutex_enter(&ill->ill_lock); 15646 15647 illgrp_reset_schednext(ill); 15648 15649 illgrp = ill->ill_group; 15650 15651 /* Delete the ill from illgrp. */ 15652 if (illgrp->illgrp_ill == ill) { 15653 illgrp->illgrp_ill = ill->ill_group_next; 15654 } else { 15655 tmp_ill = illgrp->illgrp_ill; 15656 while (tmp_ill->ill_group_next != ill) { 15657 tmp_ill = tmp_ill->ill_group_next; 15658 ASSERT(tmp_ill != NULL); 15659 } 15660 tmp_ill->ill_group_next = ill->ill_group_next; 15661 } 15662 ill->ill_group = NULL; 15663 ill->ill_group_next = NULL; 15664 15665 illgrp->illgrp_ill_count--; 15666 mutex_exit(&ill->ill_lock); 15667 rw_exit(&ill_g_lock); 15668 15669 /* 15670 * As this ill is leaving the group, we need to hand off 15671 * the responsibilities to the other ills in the group, if 15672 * this ill had some responsibilities. 
15673 */ 15674 15675 ill_handoff_responsibility(ill, illgrp); 15676 15677 rw_enter(&ill_g_lock, RW_WRITER); 15678 15679 if (illgrp->illgrp_ill_count == 0) { 15680 15681 ASSERT(illgrp->illgrp_ill == NULL); 15682 if (ill->ill_isv6) { 15683 if (illgrp == illgrp_head_v6) { 15684 illgrp_head_v6 = illgrp->illgrp_next; 15685 } else { 15686 tmpg = illgrp_head_v6; 15687 while (tmpg->illgrp_next != illgrp) { 15688 tmpg = tmpg->illgrp_next; 15689 ASSERT(tmpg != NULL); 15690 } 15691 tmpg->illgrp_next = illgrp->illgrp_next; 15692 } 15693 } else { 15694 if (illgrp == illgrp_head_v4) { 15695 illgrp_head_v4 = illgrp->illgrp_next; 15696 } else { 15697 tmpg = illgrp_head_v4; 15698 while (tmpg->illgrp_next != illgrp) { 15699 tmpg = tmpg->illgrp_next; 15700 ASSERT(tmpg != NULL); 15701 } 15702 tmpg->illgrp_next = illgrp->illgrp_next; 15703 } 15704 } 15705 mutex_destroy(&illgrp->illgrp_lock); 15706 mi_free(illgrp); 15707 } 15708 rw_exit(&ill_g_lock); 15709 15710 /* 15711 * Even though the ill is out of the group its not necessary 15712 * to set ipsq_split as TRUE as the ipifs could be down temporarily 15713 * We will split the ipsq when phyint_groupname is set to NULL. 15714 */ 15715 15716 /* 15717 * Send a routing sockets message if we are deleting from 15718 * groups with names. 15719 */ 15720 if (ill->ill_phyint->phyint_groupname_len != 0) 15721 ip_rts_ifmsg(ill->ill_ipif); 15722 } 15723 15724 /* 15725 * Re-do source address selection. This is normally called when 15726 * an ill joins the group or when a non-NOLOCAL/DEPRECATED/ANYCAST 15727 * ipif comes up. 15728 */ 15729 void 15730 ill_update_source_selection(ill_t *ill) 15731 { 15732 ipif_t *ipif; 15733 15734 ASSERT(IAM_WRITER_ILL(ill)); 15735 15736 if (ill->ill_group != NULL) 15737 ill = ill->ill_group->illgrp_ill; 15738 15739 for (; ill != NULL; ill = ill->ill_group_next) { 15740 for (ipif = ill->ill_ipif; ipif != NULL; 15741 ipif = ipif->ipif_next) { 15742 if (ill->ill_isv6) 15743 ipif_recreate_interface_routes_v6(NULL, ipif); 15744 else 15745 ipif_recreate_interface_routes(NULL, ipif); 15746 } 15747 } 15748 } 15749 15750 /* 15751 * Insert ill in a group headed by illgrp_head. The caller can either 15752 * pass a groupname in which case we search for a group with the 15753 * same name to insert in or pass a group to insert in. This function 15754 * would only search groups with names. 15755 * 15756 * NOTE : The caller should make sure that there is at least one ipif 15757 * UP on this ill so that illgrp_scheduler can pick this ill 15758 * for outbound packets. If ill_ipif_up_count is zero, we have 15759 * already sent a DL_UNBIND to the driver and we don't want to 15760 * send anymore packets. We don't assert for ipif_up_count 15761 * to be greater than zero, because ipif_up_done wants to call 15762 * this function before bumping up the ipif_up_count. See 15763 * ipif_up_done() for details. 15764 */ 15765 int 15766 illgrp_insert(ill_group_t **illgrp_head, ill_t *ill, char *groupname, 15767 ill_group_t *grp_to_insert, boolean_t ipif_is_coming_up) 15768 { 15769 ill_group_t *illgrp; 15770 ill_t *prev_ill; 15771 phyint_t *phyi; 15772 15773 ASSERT(ill->ill_group == NULL); 15774 15775 rw_enter(&ill_g_lock, RW_WRITER); 15776 mutex_enter(&ill->ill_lock); 15777 15778 if (groupname != NULL) { 15779 /* 15780 * Look for a group with a matching groupname to insert. 
15781 */ 15782 for (illgrp = *illgrp_head; illgrp != NULL; 15783 illgrp = illgrp->illgrp_next) { 15784 15785 ill_t *tmp_ill; 15786 15787 /* 15788 * If we have an ill_group_t in the list which has 15789 * no ill_t assigned then we must be in the process of 15790 * removing this group. We skip this as illgrp_delete() 15791 * will remove it from the list. 15792 */ 15793 if ((tmp_ill = illgrp->illgrp_ill) == NULL) { 15794 ASSERT(illgrp->illgrp_ill_count == 0); 15795 continue; 15796 } 15797 15798 ASSERT(tmp_ill->ill_phyint != NULL); 15799 phyi = tmp_ill->ill_phyint; 15800 /* 15801 * Look at groups which has names only. 15802 */ 15803 if (phyi->phyint_groupname_len == 0) 15804 continue; 15805 /* 15806 * Names are stored in the phyint common to both 15807 * IPv4 and IPv6. 15808 */ 15809 if (mi_strcmp(phyi->phyint_groupname, 15810 groupname) == 0) { 15811 break; 15812 } 15813 } 15814 } else { 15815 /* 15816 * If the caller passes in a NULL "grp_to_insert", we 15817 * allocate one below and insert this singleton. 15818 */ 15819 illgrp = grp_to_insert; 15820 } 15821 15822 ill->ill_group_next = NULL; 15823 15824 if (illgrp == NULL) { 15825 illgrp = (ill_group_t *)mi_zalloc(sizeof (ill_group_t)); 15826 if (illgrp == NULL) { 15827 return (ENOMEM); 15828 } 15829 illgrp->illgrp_next = *illgrp_head; 15830 *illgrp_head = illgrp; 15831 illgrp->illgrp_ill = ill; 15832 illgrp->illgrp_ill_count = 1; 15833 ill->ill_group = illgrp; 15834 /* 15835 * Used in illgrp_scheduler to protect multiple threads 15836 * from traversing the list. 15837 */ 15838 mutex_init(&illgrp->illgrp_lock, NULL, MUTEX_DEFAULT, 0); 15839 } else { 15840 ASSERT(ill->ill_net_type == 15841 illgrp->illgrp_ill->ill_net_type); 15842 ASSERT(ill->ill_type == illgrp->illgrp_ill->ill_type); 15843 15844 /* Insert ill at tail of this group */ 15845 prev_ill = illgrp->illgrp_ill; 15846 while (prev_ill->ill_group_next != NULL) 15847 prev_ill = prev_ill->ill_group_next; 15848 prev_ill->ill_group_next = ill; 15849 ill->ill_group = illgrp; 15850 illgrp->illgrp_ill_count++; 15851 /* 15852 * Inherit group properties. Currently only forwarding 15853 * is the property we try to keep the same with all the 15854 * ills. When there are more, we will abstract this into 15855 * a function. 15856 */ 15857 ill->ill_flags &= ~ILLF_ROUTER; 15858 ill->ill_flags |= (illgrp->illgrp_ill->ill_flags & ILLF_ROUTER); 15859 } 15860 mutex_exit(&ill->ill_lock); 15861 rw_exit(&ill_g_lock); 15862 15863 /* 15864 * 1) When ipif_up_done() calls this function, ipif_up_count 15865 * may be zero as it has not yet been bumped. But the ires 15866 * have already been added. So, we do the nomination here 15867 * itself. But, when ip_sioctl_groupname calls this, it checks 15868 * for ill_ipif_up_count != 0. Thus we don't check for 15869 * ill_ipif_up_count here while nominating broadcast ires for 15870 * receive. 15871 * 15872 * 2) Similarly, we need to call ill_group_bcast_for_xmit here 15873 * to group them properly as ire_add() has already happened 15874 * in the ipif_up_done() case. For ip_sioctl_groupname/ifgrp_insert 15875 * case, we need to do it here anyway. 15876 */ 15877 if (!ill->ill_isv6) { 15878 ill_group_bcast_for_xmit(ill); 15879 ill_nominate_bcast_rcv(illgrp); 15880 } 15881 15882 if (!ipif_is_coming_up) { 15883 /* 15884 * When ipif_up_done() calls this function, the multicast 15885 * groups have not been joined yet. So, there is no point in 15886 * nomination. ip_join_allmulti will handle groups when 15887 * ill_recover_multicast is called from ipif_up_done() later. 
15888 */ 15889 (void) ill_nominate_mcast_rcv(illgrp); 15890 /* 15891 * ipif_up_done calls ill_update_source_selection 15892 * anyway. Moreover, we don't want to re-create 15893 * interface routes while ipif_up_done() still has reference 15894 * to them. Refer to ipif_up_done() for more details. 15895 */ 15896 ill_update_source_selection(ill); 15897 } 15898 15899 /* 15900 * Send a routing sockets message if we are inserting into 15901 * groups with names. 15902 */ 15903 if (groupname != NULL) 15904 ip_rts_ifmsg(ill->ill_ipif); 15905 return (0); 15906 } 15907 15908 /* 15909 * Return the first phyint matching the groupname. There could 15910 * be more than one when there are ill groups. 15911 * 15912 * Needs work: called only from ip_sioctl_groupname 15913 */ 15914 static phyint_t * 15915 phyint_lookup_group(char *groupname) 15916 { 15917 phyint_t *phyi; 15918 15919 ASSERT(RW_LOCK_HELD(&ill_g_lock)); 15920 /* 15921 * Group names are stored in the phyint - a common structure 15922 * to both IPv4 and IPv6. 15923 */ 15924 phyi = avl_first(&phyint_g_list.phyint_list_avl_by_index); 15925 for (; phyi != NULL; 15926 phyi = avl_walk(&phyint_g_list.phyint_list_avl_by_index, 15927 phyi, AVL_AFTER)) { 15928 if (phyi->phyint_groupname_len == 0) 15929 continue; 15930 ASSERT(phyi->phyint_groupname != NULL); 15931 if (mi_strcmp(groupname, phyi->phyint_groupname) == 0) 15932 return (phyi); 15933 } 15934 return (NULL); 15935 } 15936 15937 15938 15939 /* 15940 * MT notes on creation and deletion of IPMP groups 15941 * 15942 * Creation and deletion of IPMP groups introduce the need to merge or 15943 * split the associated serialization objects i.e the ipsq's. Normally all 15944 * the ills in an IPMP group would map to a single ipsq. If IPMP is not enabled 15945 * an ill-pair(v4, v6) i.e. phyint would map to a single ipsq. However during 15946 * the execution of the SIOCSLIFGROUPNAME command the picture changes. There 15947 * is a need to change the <ill-ipsq> association and we have to operate on both 15948 * the source and destination IPMP groups. For eg. attempting to set the 15949 * groupname of hme0 to mpk17-85 when it already belongs to mpk17-84 has to 15950 * handle 2 IPMP groups and 2 ipsqs. All the ills belonging to either of the 15951 * source or destination IPMP group are mapped to a single ipsq for executing 15952 * the SIOCSLIFGROUPNAME command. This is termed as a merge of the ipsq's. 15953 * The <ill-ipsq> mapping is restored back to normal at a later point. This is 15954 * termed as a split of the ipsq. The converse of the merge i.e. a split of the 15955 * ipsq happens while unwinding from ipsq_exit. If at least 1 set groupname 15956 * occurred on the ipsq, then the ipsq_split flag is set. This indicates the 15957 * ipsq has to be examined for redoing the <ill-ipsq> associations. 15958 * 15959 * In the above example the ioctl handling code locates the current ipsq of hme0 15960 * which is ipsq(mpk17-84). It then enters the above ipsq immediately or 15961 * eventually (after queueing the ioctl in ipsq(mpk17-84)). Then it locates 15962 * the destination ipsq which is ipsq(mpk17-85) and merges the source ipsq into 15963 * the destination ipsq. If the destination ipsq is not busy, it also enters 15964 * the destination ipsq exclusively. Now the actual groupname setting operation 15965 * can proceed. If the destination ipsq is busy, the operation is enqueued 15966 * on the destination (merged) ipsq and will be handled in the unwind from 15967 * ipsq_exit. 
15968 * 15969 * To prevent other threads accessing the ill while the group name change is 15970 * in progres, we bring down the ipifs which also removes the ill from the 15971 * group. The group is changed in phyint and when the first ipif on the ill 15972 * is brought up, the ill is inserted into the right IPMP group by 15973 * illgrp_insert. 15974 */ 15975 /* ARGSUSED */ 15976 int 15977 ip_sioctl_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15978 ip_ioctl_cmd_t *ipip, void *ifreq) 15979 { 15980 int i; 15981 char *tmp; 15982 int namelen; 15983 ill_t *ill = ipif->ipif_ill; 15984 ill_t *ill_v4, *ill_v6; 15985 int err = 0; 15986 phyint_t *phyi; 15987 phyint_t *phyi_tmp; 15988 struct lifreq *lifr; 15989 mblk_t *mp1; 15990 char *groupname; 15991 ipsq_t *ipsq; 15992 15993 ASSERT(IAM_WRITER_IPIF(ipif)); 15994 15995 /* Existance verified in ip_wput_nondata */ 15996 mp1 = mp->b_cont->b_cont; 15997 lifr = (struct lifreq *)mp1->b_rptr; 15998 groupname = lifr->lifr_groupname; 15999 16000 if (ipif->ipif_id != 0) 16001 return (EINVAL); 16002 16003 phyi = ill->ill_phyint; 16004 ASSERT(phyi != NULL); 16005 16006 if (phyi->phyint_flags & PHYI_VIRTUAL) 16007 return (EINVAL); 16008 16009 tmp = groupname; 16010 for (i = 0; i < LIFNAMSIZ && *tmp != '\0'; tmp++, i++) 16011 ; 16012 16013 if (i == LIFNAMSIZ) { 16014 /* no null termination */ 16015 return (EINVAL); 16016 } 16017 16018 /* 16019 * Calculate the namelen exclusive of the null 16020 * termination character. 16021 */ 16022 namelen = tmp - groupname; 16023 16024 ill_v4 = phyi->phyint_illv4; 16025 ill_v6 = phyi->phyint_illv6; 16026 16027 /* 16028 * ILL cannot be part of a usesrc group and and IPMP group at the 16029 * same time. No need to grab the ill_g_usesrc_lock here, see 16030 * synchronization notes in ip.c 16031 */ 16032 if (ipif->ipif_ill->ill_usesrc_grp_next != NULL) { 16033 return (EINVAL); 16034 } 16035 16036 /* 16037 * mark the ill as changing. 16038 * this should queue all new requests on the syncq. 16039 */ 16040 GRAB_ILL_LOCKS(ill_v4, ill_v6); 16041 16042 if (ill_v4 != NULL) 16043 ill_v4->ill_state_flags |= ILL_CHANGING; 16044 if (ill_v6 != NULL) 16045 ill_v6->ill_state_flags |= ILL_CHANGING; 16046 RELEASE_ILL_LOCKS(ill_v4, ill_v6); 16047 16048 if (namelen == 0) { 16049 /* 16050 * Null string means remove this interface from the 16051 * existing group. 16052 */ 16053 if (phyi->phyint_groupname_len == 0) { 16054 /* 16055 * Never was in a group. 16056 */ 16057 err = 0; 16058 goto done; 16059 } 16060 16061 /* 16062 * IPv4 or IPv6 may be temporarily out of the group when all 16063 * the ipifs are down. Thus, we need to check for ill_group to 16064 * be non-NULL. 
16065 */ 16066 if (ill_v4 != NULL && ill_v4->ill_group != NULL) { 16067 ill_down_ipifs(ill_v4, mp, 0, B_FALSE); 16068 mutex_enter(&ill_v4->ill_lock); 16069 if (!ill_is_quiescent(ill_v4)) { 16070 /* 16071 * ipsq_pending_mp_add will not fail since 16072 * connp is NULL 16073 */ 16074 (void) ipsq_pending_mp_add(NULL, 16075 ill_v4->ill_ipif, q, mp, ILL_DOWN); 16076 mutex_exit(&ill_v4->ill_lock); 16077 err = EINPROGRESS; 16078 goto done; 16079 } 16080 mutex_exit(&ill_v4->ill_lock); 16081 } 16082 16083 if (ill_v6 != NULL && ill_v6->ill_group != NULL) { 16084 ill_down_ipifs(ill_v6, mp, 0, B_FALSE); 16085 mutex_enter(&ill_v6->ill_lock); 16086 if (!ill_is_quiescent(ill_v6)) { 16087 (void) ipsq_pending_mp_add(NULL, 16088 ill_v6->ill_ipif, q, mp, ILL_DOWN); 16089 mutex_exit(&ill_v6->ill_lock); 16090 err = EINPROGRESS; 16091 goto done; 16092 } 16093 mutex_exit(&ill_v6->ill_lock); 16094 } 16095 16096 rw_enter(&ill_g_lock, RW_WRITER); 16097 GRAB_ILL_LOCKS(ill_v4, ill_v6); 16098 mutex_enter(&phyi->phyint_lock); 16099 ASSERT(phyi->phyint_groupname != NULL); 16100 mi_free(phyi->phyint_groupname); 16101 phyi->phyint_groupname = NULL; 16102 phyi->phyint_groupname_len = 0; 16103 mutex_exit(&phyi->phyint_lock); 16104 RELEASE_ILL_LOCKS(ill_v4, ill_v6); 16105 rw_exit(&ill_g_lock); 16106 err = ill_up_ipifs(ill, q, mp); 16107 16108 /* 16109 * set the split flag so that the ipsq can be split 16110 */ 16111 mutex_enter(&phyi->phyint_ipsq->ipsq_lock); 16112 phyi->phyint_ipsq->ipsq_split = B_TRUE; 16113 mutex_exit(&phyi->phyint_ipsq->ipsq_lock); 16114 16115 } else { 16116 if (phyi->phyint_groupname_len != 0) { 16117 ASSERT(phyi->phyint_groupname != NULL); 16118 /* Are we inserting in the same group ? */ 16119 if (mi_strcmp(groupname, 16120 phyi->phyint_groupname) == 0) { 16121 err = 0; 16122 goto done; 16123 } 16124 } 16125 16126 rw_enter(&ill_g_lock, RW_READER); 16127 /* 16128 * Merge ipsq for the group's. 16129 * This check is here as multiple groups/ills might be 16130 * sharing the same ipsq. 16131 * If we have to merege than the operation is restarted 16132 * on the new ipsq. 16133 */ 16134 ipsq = ip_ipsq_lookup(groupname, B_FALSE, NULL); 16135 if (phyi->phyint_ipsq != ipsq) { 16136 rw_exit(&ill_g_lock); 16137 err = ill_merge_groups(ill, NULL, groupname, mp, q); 16138 goto done; 16139 } 16140 /* 16141 * Running exclusive on new ipsq. 16142 */ 16143 16144 ASSERT(ipsq != NULL); 16145 ASSERT(ipsq->ipsq_writer == curthread); 16146 16147 /* 16148 * Check whether the ill_type and ill_net_type matches before 16149 * we allocate any memory so that the cleanup is easier. 16150 * 16151 * We can't group dissimilar ones as we can't load spread 16152 * packets across the group because of potential link-level 16153 * header differences. 
16154 */ 16155 phyi_tmp = phyint_lookup_group(groupname); 16156 if (phyi_tmp != NULL) { 16157 if ((ill_v4 != NULL && 16158 phyi_tmp->phyint_illv4 != NULL) && 16159 ((ill_v4->ill_net_type != 16160 phyi_tmp->phyint_illv4->ill_net_type) || 16161 (ill_v4->ill_type != 16162 phyi_tmp->phyint_illv4->ill_type))) { 16163 mutex_enter(&phyi->phyint_ipsq->ipsq_lock); 16164 phyi->phyint_ipsq->ipsq_split = B_TRUE; 16165 mutex_exit(&phyi->phyint_ipsq->ipsq_lock); 16166 rw_exit(&ill_g_lock); 16167 return (EINVAL); 16168 } 16169 if ((ill_v6 != NULL && 16170 phyi_tmp->phyint_illv6 != NULL) && 16171 ((ill_v6->ill_net_type != 16172 phyi_tmp->phyint_illv6->ill_net_type) || 16173 (ill_v6->ill_type != 16174 phyi_tmp->phyint_illv6->ill_type))) { 16175 mutex_enter(&phyi->phyint_ipsq->ipsq_lock); 16176 phyi->phyint_ipsq->ipsq_split = B_TRUE; 16177 mutex_exit(&phyi->phyint_ipsq->ipsq_lock); 16178 rw_exit(&ill_g_lock); 16179 return (EINVAL); 16180 } 16181 } 16182 16183 rw_exit(&ill_g_lock); 16184 16185 /* 16186 * bring down all v4 ipifs. 16187 */ 16188 if (ill_v4 != NULL) { 16189 ill_down_ipifs(ill_v4, mp, 0, B_FALSE); 16190 } 16191 16192 /* 16193 * bring down all v6 ipifs. 16194 */ 16195 if (ill_v6 != NULL) { 16196 ill_down_ipifs(ill_v6, mp, 0, B_FALSE); 16197 } 16198 16199 /* 16200 * make sure all ipifs are down and there are no active 16201 * references. Call to ipsq_pending_mp_add will not fail 16202 * since connp is NULL. 16203 */ 16204 if (ill_v4 != NULL) { 16205 mutex_enter(&ill_v4->ill_lock); 16206 if (!ill_is_quiescent(ill_v4)) { 16207 (void) ipsq_pending_mp_add(NULL, 16208 ill_v4->ill_ipif, q, mp, ILL_DOWN); 16209 mutex_exit(&ill_v4->ill_lock); 16210 err = EINPROGRESS; 16211 goto done; 16212 } 16213 mutex_exit(&ill_v4->ill_lock); 16214 } 16215 16216 if (ill_v6 != NULL) { 16217 mutex_enter(&ill_v6->ill_lock); 16218 if (!ill_is_quiescent(ill_v6)) { 16219 (void) ipsq_pending_mp_add(NULL, 16220 ill_v6->ill_ipif, q, mp, ILL_DOWN); 16221 mutex_exit(&ill_v6->ill_lock); 16222 err = EINPROGRESS; 16223 goto done; 16224 } 16225 mutex_exit(&ill_v6->ill_lock); 16226 } 16227 16228 /* 16229 * allocate including space for null terminator 16230 * before we insert. 16231 */ 16232 tmp = (char *)mi_alloc(namelen + 1, BPRI_MED); 16233 if (tmp == NULL) 16234 return (ENOMEM); 16235 16236 rw_enter(&ill_g_lock, RW_WRITER); 16237 GRAB_ILL_LOCKS(ill_v4, ill_v6); 16238 mutex_enter(&phyi->phyint_lock); 16239 if (phyi->phyint_groupname_len != 0) { 16240 ASSERT(phyi->phyint_groupname != NULL); 16241 mi_free(phyi->phyint_groupname); 16242 } 16243 16244 /* 16245 * setup the new group name. 16246 */ 16247 phyi->phyint_groupname = tmp; 16248 bcopy(groupname, phyi->phyint_groupname, namelen + 1); 16249 phyi->phyint_groupname_len = namelen + 1; 16250 mutex_exit(&phyi->phyint_lock); 16251 RELEASE_ILL_LOCKS(ill_v4, ill_v6); 16252 rw_exit(&ill_g_lock); 16253 16254 err = ill_up_ipifs(ill, q, mp); 16255 } 16256 16257 done: 16258 /* 16259 * normally ILL_CHANGING is cleared in ill_up_ipifs. 
16260 */ 16261 if (err != EINPROGRESS) { 16262 GRAB_ILL_LOCKS(ill_v4, ill_v6); 16263 if (ill_v4 != NULL) 16264 ill_v4->ill_state_flags &= ~ILL_CHANGING; 16265 if (ill_v6 != NULL) 16266 ill_v6->ill_state_flags &= ~ILL_CHANGING; 16267 RELEASE_ILL_LOCKS(ill_v4, ill_v6); 16268 } 16269 return (err); 16270 } 16271 16272 /* ARGSUSED */ 16273 int 16274 ip_sioctl_get_groupname(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, 16275 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 16276 { 16277 ill_t *ill; 16278 phyint_t *phyi; 16279 struct lifreq *lifr; 16280 mblk_t *mp1; 16281 16282 /* Existence verified in ip_wput_nondata */ 16283 mp1 = mp->b_cont->b_cont; 16284 lifr = (struct lifreq *)mp1->b_rptr; 16285 ill = ipif->ipif_ill; 16286 phyi = ill->ill_phyint; 16287 16288 lifr->lifr_groupname[0] = '\0'; 16289 /* 16290 * ill_group may be null if all the interfaces 16291 * are down. But still, the phyint should always 16292 * hold the name. 16293 */ 16294 if (phyi->phyint_groupname_len != 0) { 16295 bcopy(phyi->phyint_groupname, lifr->lifr_groupname, 16296 phyi->phyint_groupname_len); 16297 } 16298 16299 return (0); 16300 } 16301 16302 16303 typedef struct conn_move_s { 16304 ill_t *cm_from_ill; 16305 ill_t *cm_to_ill; 16306 int cm_ifindex; 16307 } conn_move_t; 16308 16309 /* 16310 * ipcl_walk function for moving conn_multicast_ill for a given ill. 16311 */ 16312 static void 16313 conn_move(conn_t *connp, caddr_t arg) 16314 { 16315 conn_move_t *connm; 16316 int ifindex; 16317 int i; 16318 ill_t *from_ill; 16319 ill_t *to_ill; 16320 ilg_t *ilg; 16321 ilm_t *ret_ilm; 16322 16323 connm = (conn_move_t *)arg; 16324 ifindex = connm->cm_ifindex; 16325 from_ill = connm->cm_from_ill; 16326 to_ill = connm->cm_to_ill; 16327 16328 /* Change IP_BOUND_IF/IPV6_BOUND_IF associations. */ 16329 16330 /* All multicast fields protected by conn_lock */ 16331 mutex_enter(&connp->conn_lock); 16332 ASSERT(connp->conn_outgoing_ill == connp->conn_incoming_ill); 16333 if ((connp->conn_outgoing_ill == from_ill) && 16334 (ifindex == 0 || connp->conn_orig_bound_ifindex == ifindex)) { 16335 connp->conn_outgoing_ill = to_ill; 16336 connp->conn_incoming_ill = to_ill; 16337 } 16338 16339 /* Change IP_MULTICAST_IF/IPV6_MULTICAST_IF associations */ 16340 16341 if ((connp->conn_multicast_ill == from_ill) && 16342 (ifindex == 0 || connp->conn_orig_multicast_ifindex == ifindex)) { 16343 connp->conn_multicast_ill = connm->cm_to_ill; 16344 } 16345 16346 /* Change IP_XMIT_IF associations */ 16347 if ((connp->conn_xmit_if_ill == from_ill) && 16348 (ifindex == 0 || connp->conn_orig_xmit_ifindex == ifindex)) { 16349 connp->conn_xmit_if_ill = to_ill; 16350 } 16351 /* 16352 * Change the ilg_ill to point to the new one. This assumes 16353 * ilm_move_v6 has moved the ilms to new_ill and the driver 16354 * has been told to receive packets on this interface. 16355 * ilm_move_v6 FAILBACKS all the ilms successfully always. 16356 * But when doing a FAILOVER, it might fail with ENOMEM and so 16357 * some ilms may not have moved. We check to see whether 16358 * the ilms have moved to to_ill. We can't check on from_ill 16359 * as in the process of moving, we could have split an ilm 16360 * in to two - which has the same orig_ifindex and v6group. 16361 * 16362 * For IPv4, ilg_ipif moves implicitly. The code below really 16363 * does not do anything for IPv4 as ilg_ill is NULL for IPv4. 
16364 */ 16365 for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) { 16366 ilg = &connp->conn_ilg[i]; 16367 if ((ilg->ilg_ill == from_ill) && 16368 (ifindex == 0 || ilg->ilg_orig_ifindex == ifindex)) { 16369 /* ifindex != 0 indicates failback */ 16370 if (ifindex != 0) { 16371 connp->conn_ilg[i].ilg_ill = to_ill; 16372 continue; 16373 } 16374 16375 ret_ilm = ilm_lookup_ill_index_v6(to_ill, 16376 &ilg->ilg_v6group, ilg->ilg_orig_ifindex, 16377 connp->conn_zoneid); 16378 16379 if (ret_ilm != NULL) 16380 connp->conn_ilg[i].ilg_ill = to_ill; 16381 } 16382 } 16383 mutex_exit(&connp->conn_lock); 16384 } 16385 16386 static void 16387 conn_move_ill(ill_t *from_ill, ill_t *to_ill, int ifindex) 16388 { 16389 conn_move_t connm; 16390 16391 connm.cm_from_ill = from_ill; 16392 connm.cm_to_ill = to_ill; 16393 connm.cm_ifindex = ifindex; 16394 16395 ipcl_walk(conn_move, (caddr_t)&connm); 16396 } 16397 16398 /* 16399 * ilm has been moved from from_ill to to_ill. 16400 * Send DL_DISABMULTI_REQ to ill and DL_ENABMULTI_REQ on to_ill. 16401 * appropriately. 16402 * 16403 * NOTE : We can't reuse the code in ip_ll_addmulti/delmulti because 16404 * the code there de-references ipif_ill to get the ill to 16405 * send multicast requests. It does not work as ipif is on its 16406 * move and already moved when this function is called. 16407 * Thus, we need to use from_ill and to_ill send down multicast 16408 * requests. 16409 */ 16410 static void 16411 ilm_send_multicast_reqs(ill_t *from_ill, ill_t *to_ill) 16412 { 16413 ipif_t *ipif; 16414 ilm_t *ilm; 16415 16416 /* 16417 * See whether we need to send down DL_ENABMULTI_REQ on 16418 * to_ill as ilm has just been added. 16419 */ 16420 ASSERT(IAM_WRITER_ILL(to_ill)); 16421 ASSERT(IAM_WRITER_ILL(from_ill)); 16422 16423 ILM_WALKER_HOLD(to_ill); 16424 for (ilm = to_ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 16425 16426 if (!ilm->ilm_is_new || (ilm->ilm_flags & ILM_DELETED)) 16427 continue; 16428 /* 16429 * no locks held, ill/ipif cannot dissappear as long 16430 * as we are writer. 16431 */ 16432 ipif = to_ill->ill_ipif; 16433 /* 16434 * No need to hold any lock as we are the writer and this 16435 * can only be changed by a writer. 16436 */ 16437 ilm->ilm_is_new = B_FALSE; 16438 16439 if (to_ill->ill_net_type != IRE_IF_RESOLVER || 16440 ipif->ipif_flags & IPIF_POINTOPOINT) { 16441 ip1dbg(("ilm_send_multicast_reqs: to_ill not " 16442 "resolver\n")); 16443 continue; /* Must be IRE_IF_NORESOLVER */ 16444 } 16445 16446 16447 if (to_ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) { 16448 ip1dbg(("ilm_send_multicast_reqs: " 16449 "to_ill MULTI_BCAST\n")); 16450 goto from; 16451 } 16452 16453 if (to_ill->ill_isv6) 16454 mld_joingroup(ilm); 16455 else 16456 igmp_joingroup(ilm); 16457 16458 if (to_ill->ill_ipif_up_count == 0) { 16459 /* 16460 * Nobody there. All multicast addresses will be 16461 * re-joined when we get the DL_BIND_ACK bringing the 16462 * interface up. 16463 */ 16464 ilm->ilm_notify_driver = B_FALSE; 16465 ip1dbg(("ilm_send_multicast_reqs: to_ill nobody up\n")); 16466 goto from; 16467 } 16468 16469 /* 16470 * For allmulti address, we want to join on only one interface. 16471 * Checking for ilm_numentries_v6 is not correct as you may 16472 * find an ilm with zero address on to_ill, but we may not 16473 * have nominated to_ill for receiving. 
                 * Thus, if we have nominated from_ill (ill_join_allmulti
                 * is set), nominate to_ill only if it is not already
                 * nominated. (to_ill normally should not have been
                 * nominated if "from_ill" has already been nominated;
                 * as we don't prevent failovers from happening across
                 * groups, we don't assert.)
                 */
                if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
                        /*
                         * There is no need to hold ill locks as we are
                         * writer on both ills and when ill_join_allmulti
                         * is changed the thread is always a writer.
                         */
                        if (from_ill->ill_join_allmulti &&
                            !to_ill->ill_join_allmulti) {
                                (void) ip_join_allmulti(to_ill->ill_ipif);
                        }
                } else if (ilm->ilm_notify_driver) {

                        /*
                         * This is a newly moved ilm so we need to tell the
                         * driver about the new group. There can be more than
                         * one ilm for the same group in the list, each with a
                         * different orig_ifindex. We have to inform the driver
                         * once. In ilm_move_v[4,6] we only set the flag
                         * ilm_notify_driver for the first ilm.
                         */

                        (void) ip_ll_send_enabmulti_req(to_ill,
                            &ilm->ilm_v6addr);
                }

                ilm->ilm_notify_driver = B_FALSE;

                /*
                 * See whether we need to send down DL_DISABMULTI_REQ on
                 * from_ill as ilm has just been removed.
                 */
from:
                ipif = from_ill->ill_ipif;
                if (from_ill->ill_net_type != IRE_IF_RESOLVER ||
                    ipif->ipif_flags & IPIF_POINTOPOINT) {
                        ip1dbg(("ilm_send_multicast_reqs: "
                            "from_ill not resolver\n"));
                        continue;		/* Must be IRE_IF_NORESOLVER */
                }

                if (from_ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) {
                        ip1dbg(("ilm_send_multicast_reqs: "
                            "from_ill MULTI_BCAST\n"));
                        continue;
                }

                if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
                        if (from_ill->ill_join_allmulti)
                                (void) ip_leave_allmulti(from_ill->ill_ipif);
                } else if (ilm_numentries_v6(from_ill, &ilm->ilm_v6addr) == 0) {
                        (void) ip_ll_send_disabmulti_req(from_ill,
                            &ilm->ilm_v6addr);
                }
        }
        ILM_WALKER_RELE(to_ill);
}

/*
 * This function is called when all multicast memberships need
 * to be moved from "from_ill" to "to_ill" for IPv6. This function is
 * called only once, unlike the IPv4 counterpart where it is called after
 * every logical interface is moved. The reason is that multicast
 * memberships are joined using an interface address in IPv4, while in
 * IPv6 the interface index is used.
 */
static void
ilm_move_v6(ill_t *from_ill, ill_t *to_ill, int ifindex)
{
        ilm_t *ilm;
        ilm_t *ilm_next;
        ilm_t *new_ilm;
        ilm_t **ilmp;
        int count;
        char buf[INET6_ADDRSTRLEN];
        in6_addr_t ipv6_snm = ipv6_solicited_node_mcast;

        ASSERT(MUTEX_HELD(&to_ill->ill_lock));
        ASSERT(MUTEX_HELD(&from_ill->ill_lock));
        ASSERT(RW_WRITE_HELD(&ill_g_lock));

        if (ifindex == 0) {
                /*
                 * Form the solicited node mcast address which is used later.
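                 * ipv6_solicited_node_mcast is the base address
                 * ff02::1:ff00:0; OR-ing in the low 32 bits of the
                 * ipif address below yields the standard solicited
                 * node address formed from the low 24 bits, since the
                 * top byte of that word is already 0xff. For example,
                 * for the hypothetical address fe80::a00:20ff:fe9c:1234
                 * the result is ff02::1:ff9c:1234.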
16563 */ 16564 ipif_t *ipif; 16565 16566 ipif = from_ill->ill_ipif; 16567 ASSERT(ipif->ipif_id == 0); 16568 16569 ipv6_snm.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3]; 16570 } 16571 16572 ilmp = &from_ill->ill_ilm; 16573 for (ilm = from_ill->ill_ilm; ilm != NULL; ilm = ilm_next) { 16574 ilm_next = ilm->ilm_next; 16575 16576 if (ilm->ilm_flags & ILM_DELETED) { 16577 ilmp = &ilm->ilm_next; 16578 continue; 16579 } 16580 16581 new_ilm = ilm_lookup_ill_index_v6(to_ill, &ilm->ilm_v6addr, 16582 ilm->ilm_orig_ifindex, ilm->ilm_zoneid); 16583 ASSERT(ilm->ilm_orig_ifindex != 0); 16584 if (ilm->ilm_orig_ifindex == ifindex) { 16585 /* 16586 * We are failing back multicast memberships. 16587 * If the same ilm exists in to_ill, it means somebody 16588 * has joined the same group there e.g. ff02::1 16589 * is joined within the kernel when the interfaces 16590 * came UP. 16591 */ 16592 ASSERT(ilm->ilm_ipif == NULL); 16593 if (new_ilm != NULL) { 16594 new_ilm->ilm_refcnt += ilm->ilm_refcnt; 16595 if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE || 16596 !SLIST_IS_EMPTY(new_ilm->ilm_filter)) { 16597 new_ilm->ilm_is_new = B_TRUE; 16598 } 16599 } else { 16600 /* 16601 * check if we can just move the ilm 16602 */ 16603 if (from_ill->ill_ilm_walker_cnt != 0) { 16604 /* 16605 * We have walkers we cannot move 16606 * the ilm, so allocate a new ilm, 16607 * this (old) ilm will be marked 16608 * ILM_DELETED at the end of the loop 16609 * and will be freed when the 16610 * last walker exits. 16611 */ 16612 new_ilm = (ilm_t *)mi_zalloc 16613 (sizeof (ilm_t)); 16614 if (new_ilm == NULL) { 16615 ip0dbg(("ilm_move_v6: " 16616 "FAILBACK of IPv6" 16617 " multicast address %s : " 16618 "from %s to" 16619 " %s failed : ENOMEM \n", 16620 inet_ntop(AF_INET6, 16621 &ilm->ilm_v6addr, buf, 16622 sizeof (buf)), 16623 from_ill->ill_name, 16624 to_ill->ill_name)); 16625 16626 ilmp = &ilm->ilm_next; 16627 continue; 16628 } 16629 *new_ilm = *ilm; 16630 /* 16631 * we don't want new_ilm linked to 16632 * ilm's filter list. 16633 */ 16634 new_ilm->ilm_filter = NULL; 16635 } else { 16636 /* 16637 * No walkers we can move the ilm. 16638 * lets take it out of the list. 16639 */ 16640 *ilmp = ilm->ilm_next; 16641 ilm->ilm_next = NULL; 16642 new_ilm = ilm; 16643 } 16644 16645 /* 16646 * if this is the first ilm for the group 16647 * set ilm_notify_driver so that we notify the 16648 * driver in ilm_send_multicast_reqs. 16649 */ 16650 if (ilm_lookup_ill_v6(to_ill, 16651 &new_ilm->ilm_v6addr, ALL_ZONES) == NULL) 16652 new_ilm->ilm_notify_driver = B_TRUE; 16653 16654 new_ilm->ilm_ill = to_ill; 16655 /* Add to the to_ill's list */ 16656 new_ilm->ilm_next = to_ill->ill_ilm; 16657 to_ill->ill_ilm = new_ilm; 16658 /* 16659 * set the flag so that mld_joingroup is 16660 * called in ilm_send_multicast_reqs(). 16661 */ 16662 new_ilm->ilm_is_new = B_TRUE; 16663 } 16664 goto bottom; 16665 } else if (ifindex != 0) { 16666 /* 16667 * If this is FAILBACK (ifindex != 0) and the ifindex 16668 * has not matched above, look at the next ilm. 16669 */ 16670 ilmp = &ilm->ilm_next; 16671 continue; 16672 } 16673 /* 16674 * If we are here, it means ifindex is 0. Failover 16675 * everything. 16676 * 16677 * We need to handle solicited node mcast address 16678 * and all_nodes mcast address differently as they 16679 * are joined witin the kenrel (ipif_multicast_up) 16680 * and potentially from the userland. We are called 16681 * after the ipifs of from_ill has been moved. 
                 * If we still find ilms on the ill with the solicited node
                 * mcast address or the all_nodes mcast address, they must
                 * belong to the UP interface that has not moved, e.g.
                 * ipif_id 0 with the link local prefix does not move.
                 * We join this on the new ill, accounting for all the
                 * userland memberships, so that applications don't
                 * see any failure.
                 *
                 * We need to make sure that we account only for the
                 * solicited node and all node multicast addresses
                 * that were brought UP on these ipifs. In the case of
                 * a failover from A to B, we might have ilms belonging
                 * to A (ilm_orig_ifindex pointing at A) on B accounting
                 * for the membership from the userland. If we are failing
                 * over from B to C now, we will find the ones belonging
                 * to A on B. These don't account for the ill_ipif_up_count.
                 * They just move from B to C. The check below on
                 * ilm_orig_ifindex ensures that.
                 */
                if ((ilm->ilm_orig_ifindex ==
                    from_ill->ill_phyint->phyint_ifindex) &&
                    (IN6_ARE_ADDR_EQUAL(&ipv6_snm, &ilm->ilm_v6addr) ||
                    IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast,
                    &ilm->ilm_v6addr))) {
                        ASSERT(ilm->ilm_refcnt > 0);
                        count = ilm->ilm_refcnt - from_ill->ill_ipif_up_count;
                        /*
                         * For indentation reasons, we are not using an
                         * "else" here.
                         */
                        if (count == 0) {
                                ilmp = &ilm->ilm_next;
                                continue;
                        }
                        ilm->ilm_refcnt -= count;
                        if (new_ilm != NULL) {
                                /*
                                 * We can find one with the same
                                 * ilm_orig_ifindex, if we are failing
                                 * over to a STANDBY. This happens
                                 * when somebody wants to join a group
                                 * on a STANDBY interface and we
                                 * internally join on a different one.
                                 * If we had joined on from_ill, then a
                                 * failover now will find a new ilm
                                 * with this index.
                                 */
                                ip1dbg(("ilm_move_v6: FAILOVER, found"
                                    " new ilm on %s, group address %s\n",
                                    to_ill->ill_name,
                                    inet_ntop(AF_INET6,
                                    &ilm->ilm_v6addr, buf,
                                    sizeof (buf))));
                                new_ilm->ilm_refcnt += count;
                                if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE ||
                                    !SLIST_IS_EMPTY(new_ilm->ilm_filter)) {
                                        new_ilm->ilm_is_new = B_TRUE;
                                }
                        } else {
                                new_ilm = (ilm_t *)mi_zalloc(sizeof (ilm_t));
                                if (new_ilm == NULL) {
                                        ip0dbg(("ilm_move_v6: FAILOVER of IPv6"
                                            " multicast address %s : from %s to"
                                            " %s failed : ENOMEM \n",
                                            inet_ntop(AF_INET6,
                                            &ilm->ilm_v6addr, buf,
                                            sizeof (buf)), from_ill->ill_name,
                                            to_ill->ill_name));
                                        ilmp = &ilm->ilm_next;
                                        continue;
                                }
                                *new_ilm = *ilm;
                                new_ilm->ilm_filter = NULL;
                                new_ilm->ilm_refcnt = count;
                                new_ilm->ilm_timer = INFINITY;
                                new_ilm->ilm_rtx.rtx_timer = INFINITY;
                                new_ilm->ilm_is_new = B_TRUE;
                                /*
                                 * If the to_ill has not joined this
                                 * group, we need to tell the driver in
                                 * ilm_send_multicast_reqs.
				new_ilm->ilm_is_new = B_TRUE;
				/*
				 * If the to_ill has not joined this
				 * group we need to tell the driver in
				 * ilm_send_multicast_reqs.
				 */
				if (ilm_lookup_ill_v6(to_ill,
				    &new_ilm->ilm_v6addr, ALL_ZONES) == NULL)
					new_ilm->ilm_notify_driver = B_TRUE;

				new_ilm->ilm_ill = to_ill;
				/* Add to the to_ill's list */
				new_ilm->ilm_next = to_ill->ill_ilm;
				to_ill->ill_ilm = new_ilm;
				ASSERT(new_ilm->ilm_ipif == NULL);
			}
			if (ilm->ilm_refcnt == 0) {
				goto bottom;
			} else {
				new_ilm->ilm_fmode = MODE_IS_EXCLUDE;
				CLEAR_SLIST(new_ilm->ilm_filter);
				ilmp = &ilm->ilm_next;
			}
			continue;
		} else {
			/*
			 * ifindex = 0 means, move everything pointing at
			 * from_ill. We are doing this because the ill has
			 * either FAILED or became INACTIVE.
			 *
			 * As we would like to move things later back to
			 * from_ill, we want to retain the identity of this
			 * ilm. Thus, we don't blindly increment the reference
			 * count on the ilms matching the address alone. We
			 * need to match on the ilm_orig_index also. new_ilm
			 * was obtained by matching ilm_orig_index also.
			 */
			if (new_ilm != NULL) {
				/*
				 * This is possible only if a previous restore
				 * was incomplete i.e. restore to
				 * ilm_orig_ifindex left some ilms because
				 * of some failures. Thus when we are failing
				 * again, we might find our old friends there.
				 */
				ip1dbg(("ilm_move_v6: FAILOVER, found new ilm"
				    " on %s, group address %s\n",
				    to_ill->ill_name,
				    inet_ntop(AF_INET6,
				    &ilm->ilm_v6addr, buf,
				    sizeof (buf))));
				new_ilm->ilm_refcnt += ilm->ilm_refcnt;
				if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE ||
				    !SLIST_IS_EMPTY(new_ilm->ilm_filter)) {
					new_ilm->ilm_is_new = B_TRUE;
				}
			} else {
				if (from_ill->ill_ilm_walker_cnt != 0) {
					new_ilm = (ilm_t *)
					    mi_zalloc(sizeof (ilm_t));
					if (new_ilm == NULL) {
						ip0dbg(("ilm_move_v6: "
						    "FAILOVER of IPv6"
						    " multicast address %s : "
						    "from %s to"
						    " %s failed : ENOMEM \n",
						    inet_ntop(AF_INET6,
						    &ilm->ilm_v6addr, buf,
						    sizeof (buf)),
						    from_ill->ill_name,
						    to_ill->ill_name));

						ilmp = &ilm->ilm_next;
						continue;
					}
					*new_ilm = *ilm;
					new_ilm->ilm_filter = NULL;
				} else {
					*ilmp = ilm->ilm_next;
					new_ilm = ilm;
				}
				/*
				 * If the to_ill has not joined this
				 * group we need to tell the driver in
				 * ilm_send_multicast_reqs.
				 */
				if (ilm_lookup_ill_v6(to_ill,
				    &new_ilm->ilm_v6addr, ALL_ZONES) == NULL)
					new_ilm->ilm_notify_driver = B_TRUE;

				/* Add to the to_ill's list */
				new_ilm->ilm_next = to_ill->ill_ilm;
				to_ill->ill_ilm = new_ilm;
				ASSERT(ilm->ilm_ipif == NULL);
				new_ilm->ilm_ill = to_ill;
				new_ilm->ilm_is_new = B_TRUE;
			}

		}

	bottom:
		/*
		 * Revert multicast filter state to (EXCLUDE, NULL).
		 * new_ilm->ilm_is_new should already be set if needed.
		 */
		new_ilm->ilm_fmode = MODE_IS_EXCLUDE;
		CLEAR_SLIST(new_ilm->ilm_filter);
		/*
		 * We allocated/got a new ilm, free the old one.
		 */
		if (new_ilm != ilm) {
			if (from_ill->ill_ilm_walker_cnt == 0) {
				*ilmp = ilm->ilm_next;
				ilm->ilm_next = NULL;
				FREE_SLIST(ilm->ilm_filter);
				FREE_SLIST(ilm->ilm_pendsrcs);
				FREE_SLIST(ilm->ilm_rtx.rtx_allow);
				FREE_SLIST(ilm->ilm_rtx.rtx_block);
				mi_free((char *)ilm);
			} else {
				ilm->ilm_flags |= ILM_DELETED;
				from_ill->ill_ilm_cleanup_reqd = 1;
				ilmp = &ilm->ilm_next;
			}
		}
	}
}
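
/*
 * Note on the list manipulation above: the walker count decides between
 * unlinking an ilm in place and deferring the free. A minimal sketch of
 * the same pattern (illustrative only, not compiled; "elem", "elemp" and
 * "walkers" are hypothetical names):
 */
#if 0
	if (walkers == 0) {
		*elemp = elem->next;		/* unlink in place */
		free_elem(elem);
	} else {
		elem->flags |= DELETED;		/* walkers still in the list */
		cleanup_reqd = 1;		/* freed when last walker exits */
		elemp = &elem->next;
	}
#endif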

/*
 * Move all the multicast memberships to to_ill. Called when
 * an ipif moves from "from_ill" to "to_ill". This function is slightly
 * different from its IPv6 counterpart, as multicast memberships are
 * associated with ills in IPv6. This function is called after every ipif
 * is moved, unlike IPv6, where it is called only once.
 */
static void
ilm_move_v4(ill_t *from_ill, ill_t *to_ill, ipif_t *ipif)
{
	ilm_t	*ilm;
	ilm_t	*ilm_next;
	ilm_t	*new_ilm;
	ilm_t	**ilmp;

	ASSERT(MUTEX_HELD(&to_ill->ill_lock));
	ASSERT(MUTEX_HELD(&from_ill->ill_lock));
	ASSERT(RW_WRITE_HELD(&ill_g_lock));

	ilmp = &from_ill->ill_ilm;
	for (ilm = from_ill->ill_ilm; ilm != NULL; ilm = ilm_next) {
		ilm_next = ilm->ilm_next;

		if (ilm->ilm_flags & ILM_DELETED) {
			ilmp = &ilm->ilm_next;
			continue;
		}

		ASSERT(ilm->ilm_ipif != NULL);

		if (ilm->ilm_ipif != ipif) {
			ilmp = &ilm->ilm_next;
			continue;
		}

		if (V4_PART_OF_V6(ilm->ilm_v6addr) ==
		    htonl(INADDR_ALLHOSTS_GROUP)) {
			/*
			 * We joined this in ipif_multicast_up
			 * and we never did an ipif_multicast_down
			 * for IPv4. If nobody else from the userland
			 * has a reference, we free the ilm, and later
			 * when this ipif comes up on the new ill,
			 * we will join this again.
			 */
			if (--ilm->ilm_refcnt == 0)
				goto delete_ilm;

			new_ilm = ilm_lookup_ipif(ipif,
			    V4_PART_OF_V6(ilm->ilm_v6addr));
			if (new_ilm != NULL) {
				new_ilm->ilm_refcnt += ilm->ilm_refcnt;
				/*
				 * We still need to deal with the from_ill.
				 */
				new_ilm->ilm_is_new = B_TRUE;
				new_ilm->ilm_fmode = MODE_IS_EXCLUDE;
				CLEAR_SLIST(new_ilm->ilm_filter);
				goto delete_ilm;
			}
			/*
			 * If we could not find one, e.g. the ipif is
			 * still down on to_ill, we add this ilm
			 * to to_ill's list to preserve the reference
			 * count.
			 */
		}
		/*
		 * When ipifs move, ilms always move with them
		 * to the NEW ill. Thus we should never be
		 * able to find an ilm till we really move it here.
		 */
		ASSERT(ilm_lookup_ipif(ipif,
		    V4_PART_OF_V6(ilm->ilm_v6addr)) == NULL);

		if (from_ill->ill_ilm_walker_cnt != 0) {
			new_ilm = (ilm_t *)mi_zalloc(sizeof (ilm_t));
			if (new_ilm == NULL) {
				char buf[INET6_ADDRSTRLEN];
				ip0dbg(("ilm_move_v4: FAILBACK of IPv4"
				    " multicast address %s : "
				    "from %s to"
				    " %s failed : ENOMEM \n",
				    inet_ntop(AF_INET,
				    &ilm->ilm_v6addr, buf,
				    sizeof (buf)),
				    from_ill->ill_name,
				    to_ill->ill_name));

				ilmp = &ilm->ilm_next;
				continue;
			}
			*new_ilm = *ilm;
			/* We don't want new_ilm linked to ilm's filter list */
			new_ilm->ilm_filter = NULL;
		} else {
			/* Remove from the list */
			*ilmp = ilm->ilm_next;
			new_ilm = ilm;
		}

		/*
		 * If we have never joined this group on the to_ill,
		 * make sure we tell the driver.
		 */
		if (ilm_lookup_ill_v6(to_ill, &new_ilm->ilm_v6addr,
		    ALL_ZONES) == NULL)
			new_ilm->ilm_notify_driver = B_TRUE;

		/* Add to the to_ill's list */
		new_ilm->ilm_next = to_ill->ill_ilm;
		to_ill->ill_ilm = new_ilm;
		new_ilm->ilm_is_new = B_TRUE;

		/*
		 * Revert multicast filter state to (EXCLUDE, NULL)
		 */
		new_ilm->ilm_fmode = MODE_IS_EXCLUDE;
		CLEAR_SLIST(new_ilm->ilm_filter);

		/*
		 * Delete only if we have allocated a new ilm.
		 */
		if (new_ilm != ilm) {
delete_ilm:
			if (from_ill->ill_ilm_walker_cnt == 0) {
				/* Remove from the list */
				*ilmp = ilm->ilm_next;
				ilm->ilm_next = NULL;
				FREE_SLIST(ilm->ilm_filter);
				FREE_SLIST(ilm->ilm_pendsrcs);
				FREE_SLIST(ilm->ilm_rtx.rtx_allow);
				FREE_SLIST(ilm->ilm_rtx.rtx_block);
				mi_free((char *)ilm);
			} else {
				ilm->ilm_flags |= ILM_DELETED;
				from_ill->ill_ilm_cleanup_reqd = 1;
				ilmp = &ilm->ilm_next;
			}
		}
	}
}

static uint_t
ipif_get_id(ill_t *ill, uint_t id)
{
	uint_t	unit;
	ipif_t	*tipif;
	boolean_t found = B_FALSE;

	/*
	 * During failback, we want to go back to the same id
	 * instead of the smallest id so that the original
	 * configuration is maintained. id is non-zero in that
	 * case.
	 */
	if (id != 0) {
		/*
		 * While failing back, if we still have an ipif with
		 * MAX_ADDRS_PER_IF, it means this will be replaced
		 * as soon as we return from this function. It was
		 * set to MAX_ADDRS_PER_IF by the caller so that
		 * we can choose the smallest id. Thus we return zero
		 * in that case, ignoring the hint.
		 */
		if (ill->ill_ipif->ipif_id == MAX_ADDRS_PER_IF)
			return (0);
		for (tipif = ill->ill_ipif; tipif != NULL;
		    tipif = tipif->ipif_next) {
			if (tipif->ipif_id == id) {
				found = B_TRUE;
				break;
			}
		}
		/*
		 * If somebody has already plumbed another logical
		 * interface with the same id, we cannot go back to it;
		 * otherwise honor the hint and reuse the original id.
		 */
		if (!found)
			return (id);
	}
	for (unit = 0; unit <= ip_addrs_per_if; unit++) {
		found = B_FALSE;
		for (tipif = ill->ill_ipif; tipif != NULL;
		    tipif = tipif->ipif_next) {
			if (tipif->ipif_id == unit) {
				found = B_TRUE;
				break;
			}
		}
		if (!found)
			break;
	}
	return (unit);
}
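
/*
 * Worked example (hypothetical ipif layout): with ipifs 0, 1 and 3
 * plumbed on an ill, ipif_get_id(ill, 0) scans upward and returns 2;
 * a failback hint of 3 finds 3 already taken and likewise falls back
 * to 2, while a hint of 5 is honored and 5 is returned directly.
 */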

/* ARGSUSED */
static int
ipif_move(ipif_t *ipif, ill_t *to_ill, queue_t *q, mblk_t *mp,
    ipif_t **rep_ipif_ptr)
{
	ill_t	*from_ill;
	ipif_t	*rep_ipif;
	ipif_t	**ipifp;
	uint_t	unit;
	int	err = 0;
	ipif_t	*to_ipif;
	struct iocblk	*iocp;
	boolean_t failback_cmd;
	boolean_t remove_ipif;
	int	rc;

	ASSERT(IAM_WRITER_ILL(to_ill));
	ASSERT(IAM_WRITER_IPIF(ipif));

	iocp = (struct iocblk *)mp->b_rptr;
	failback_cmd = (iocp->ioc_cmd == SIOCLIFFAILBACK);
	remove_ipif = B_FALSE;

	from_ill = ipif->ipif_ill;

	ASSERT(MUTEX_HELD(&to_ill->ill_lock));
	ASSERT(MUTEX_HELD(&from_ill->ill_lock));
	ASSERT(RW_WRITE_HELD(&ill_g_lock));

	/*
	 * Don't move LINK LOCAL addresses as they are tied to
	 * the physical interface.
	 */
	if (from_ill->ill_isv6 &&
	    IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr)) {
		ipif->ipif_was_up = B_FALSE;
		IPIF_UNMARK_MOVING(ipif);
		return (0);
	}

	/*
	 * We set the ipif_id to the maximum so that the search for
	 * an ipif_id will pick the lowest number, i.e. 0, in the
	 * following 2 cases:
	 *
	 * 1) We have a replacement ipif at the head of to_ill.
	 *    We can't remove it yet as we can exceed ip_addrs_per_if
	 *    on to_ill and hence the MOVE might fail. We want to
	 *    remove it only if we could move the ipif. Thus, by
	 *    setting it to the MAX value, we make the search in
	 *    ipif_get_id return the zeroth id.
	 *
	 * 2) When DR pulls out the NIC and re-plumbs the interface,
	 *    we might just have a zero address plumbed on the ipif
	 *    with zero id in the case of IPv4. We remove that while
	 *    doing the failback. We want to remove it only if we
	 *    could move the ipif. Thus, by setting it to the MAX
	 *    value, we make the search in ipif_get_id return the
	 *    zeroth id.
	 *
	 * Both (1) and (2) are done only when we are moving
	 * an ipif (either due to failover/failback) which originally
	 * belonged to this interface i.e. the ipif_orig_ifindex is
	 * the same as to_ill's ifindex. This is needed so that
	 * FAILOVER from A -> B (A failed) followed by FAILOVER
	 * from B -> A (B is being removed from the group) and
	 * FAILBACK from A -> B restores the original configuration.
	 * Without the check for orig_ifindex, the second FAILOVER
	 * could make the ipif belonging to B replace A's zeroth
	 * ipif and the subsequent failback re-create the replacement
	 * ipif again.
	 *
	 * NOTE : We created the replacement ipif when we did a
	 * FAILOVER (see below). We could check for FAILBACK and
	 * then look for a replacement ipif to be removed. But we don't
	 * want to do that because we want to allow the possibility
	 * of a FAILOVER from A -> B (which creates the replacement ipif),
	 * followed by a *FAILOVER* from B -> A instead of a FAILBACK
	 * from B -> A.
	 */
	to_ipif = to_ill->ill_ipif;
	if ((to_ill->ill_phyint->phyint_ifindex ==
	    ipif->ipif_orig_ifindex) &&
	    IPIF_REPL_CHECK(to_ipif, failback_cmd)) {
		ASSERT(to_ipif->ipif_id == 0);
		remove_ipif = B_TRUE;
		to_ipif->ipif_id = MAX_ADDRS_PER_IF;
	}
	/*
	 * Find the lowest logical unit number on the to_ill.
	 * If we are failing back, try to get the original id
	 * rather than the lowest one so that the original
	 * configuration is maintained.
	 *
	 * XXX need a better scheme for this.
	 */
	if (failback_cmd) {
		unit = ipif_get_id(to_ill, ipif->ipif_orig_ipifid);
	} else {
		unit = ipif_get_id(to_ill, 0);
	}

	/* Reset back to zero in case we fail below */
	if (to_ipif->ipif_id == MAX_ADDRS_PER_IF)
		to_ipif->ipif_id = 0;

	if (unit == ip_addrs_per_if) {
		ipif->ipif_was_up = B_FALSE;
		IPIF_UNMARK_MOVING(ipif);
		return (EINVAL);
	}

	/*
	 * The ipif is ready to move from "from_ill" to "to_ill".
	 *
	 * 1) If we are moving the ipif with id zero, create a
	 *    replacement ipif for this ipif on from_ill. If this fails,
	 *    fail the MOVE operation.
	 *
	 * 2) Remove the replacement ipif on to_ill if any.
	 *    We could remove the replacement ipif when we are moving
	 *    the ipif with id zero. But what if somebody already
	 *    unplumbed it? Thus we always remove it if it is present.
	 *    We want to do it only if we are sure we are going to
	 *    move the ipif to to_ill, which is why there are no
	 *    returns due to error till the ipif is linked to to_ill.
	 *    Note that the first ipif that we failback will always
	 *    be zero if it is present.
	 */
	if (ipif->ipif_id == 0) {
		ipaddr_t inaddr_any = INADDR_ANY;

		rep_ipif = (ipif_t *)mi_alloc(sizeof (ipif_t), BPRI_MED);
		if (rep_ipif == NULL) {
			ipif->ipif_was_up = B_FALSE;
			IPIF_UNMARK_MOVING(ipif);
			return (ENOMEM);
		}
		*rep_ipif = ipif_zero;
		/*
		 * Before we put the ipif on the list, store the addresses
		 * as mapped addresses, as some of the ioctls, e.g.
		 * SIOCGIFADDR, assume so. This logic is not any different
		 * from what ipif_allocate does.
		 */
		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
		    &rep_ipif->ipif_v6lcl_addr);
		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
		    &rep_ipif->ipif_v6src_addr);
		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
		    &rep_ipif->ipif_v6subnet);
		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
		    &rep_ipif->ipif_v6net_mask);
		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
		    &rep_ipif->ipif_v6brd_addr);
		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
		    &rep_ipif->ipif_v6pp_dst_addr);
		/*
		 * We mark IPIF_NOFAILOVER so that this can never
		 * move.
		 */
		rep_ipif->ipif_flags = ipif->ipif_flags | IPIF_NOFAILOVER;
		rep_ipif->ipif_flags &= ~IPIF_UP & ~IPIF_DUPLICATE;
		rep_ipif->ipif_replace_zero = B_TRUE;
		mutex_init(&rep_ipif->ipif_saved_ire_lock, NULL,
		    MUTEX_DEFAULT, NULL);
		rep_ipif->ipif_id = 0;
		rep_ipif->ipif_ire_type = ipif->ipif_ire_type;
		rep_ipif->ipif_ill = from_ill;
		rep_ipif->ipif_orig_ifindex =
		    from_ill->ill_phyint->phyint_ifindex;
		/* Insert at head */
		rep_ipif->ipif_next = from_ill->ill_ipif;
		from_ill->ill_ipif = rep_ipif;
		/*
		 * We don't really care to let apps know about
		 * this interface.
		 */
	}

	if (remove_ipif) {
		/*
		 * We set to a max value above for this case to get
		 * id zero. ASSERT that we did get one.
		 */
		ASSERT((to_ipif->ipif_id == 0) && (unit == 0));
		rep_ipif = to_ipif;
		to_ill->ill_ipif = rep_ipif->ipif_next;
		rep_ipif->ipif_next = NULL;
		/*
		 * If some apps scanned and found this interface,
		 * it is time to let them know, so that they can
		 * delete it.
		 */

		*rep_ipif_ptr = rep_ipif;
	}

	/* Get it out of the ILL interface list. */
	ipifp = &ipif->ipif_ill->ill_ipif;
	for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) {
		if (*ipifp == ipif) {
			*ipifp = ipif->ipif_next;
			break;
		}
	}

	/* Assign the new ill */
	ipif->ipif_ill = to_ill;
	ipif->ipif_id = unit;
	/* id has already been checked */
	rc = ipif_insert(ipif, B_FALSE, B_FALSE);
	ASSERT(rc == 0);
	/* Let SCTP update its list */
	sctp_move_ipif(ipif, from_ill, to_ill);
	/*
	 * Handle the failover and failback of ipif_t between
	 * ill_t's that have differing maximum mtu values.
	 */
	if (ipif->ipif_mtu > to_ill->ill_max_mtu) {
		if (ipif->ipif_saved_mtu == 0) {
			/*
			 * As this ipif_t is moving to an ill_t
			 * that has a lower ill_max_mtu, its
			 * ipif_mtu needs to be saved so it can
			 * be restored during failback or during
			 * failover to an ill_t which has a
			 * higher ill_max_mtu.
			 */
			ipif->ipif_saved_mtu = ipif->ipif_mtu;
			ipif->ipif_mtu = to_ill->ill_max_mtu;
		} else {
			/*
			 * The ipif_t is, once again, moving to
			 * an ill_t that has a lower maximum mtu
			 * value.
			 */
			ipif->ipif_mtu = to_ill->ill_max_mtu;
		}
	} else if (ipif->ipif_mtu < to_ill->ill_max_mtu &&
	    ipif->ipif_saved_mtu != 0) {
		/*
		 * The mtu of this ipif_t had to be reduced
		 * during an earlier failover; this is an
		 * opportunity for it to be increased (either as
		 * part of another failover or a failback).
		 */
		if (ipif->ipif_saved_mtu <= to_ill->ill_max_mtu) {
			ipif->ipif_mtu = ipif->ipif_saved_mtu;
			ipif->ipif_saved_mtu = 0;
		} else {
			ipif->ipif_mtu = to_ill->ill_max_mtu;
		}
	}

	/*
	 * We preserve all the other fields of the ipif including
	 * ipif_saved_ire_mp. The routes that are saved here will
	 * be recreated on the new interface and back on the old
	 * interface when we move back.
	 */
	ASSERT(ipif->ipif_arp_del_mp == NULL);

	return (err);
}
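
/*
 * Worked example of the MTU handling above (hypothetical mtu values):
 * an ipif with ipif_mtu 9000 failing over to an ill with ill_max_mtu
 * 1500 saves 9000 in ipif_saved_mtu and runs at 1500; a later move to
 * an ill whose ill_max_mtu is >= 9000 restores 9000 and clears the
 * saved value.
 */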

static int
ipif_move_all(ill_t *from_ill, ill_t *to_ill, queue_t *q, mblk_t *mp,
    int ifindex, ipif_t **rep_ipif_ptr)
{
	ipif_t	*mipif;
	ipif_t	*ipif_next;
	int	err;

	/*
	 * We don't really try to MOVE back things if some of the
	 * operations fail. The daemon will take care of moving again
	 * later on.
	 */
	for (mipif = from_ill->ill_ipif; mipif != NULL; mipif = ipif_next) {
		ipif_next = mipif->ipif_next;
		if (!(mipif->ipif_flags & IPIF_NOFAILOVER) &&
		    (ifindex == 0 || ifindex == mipif->ipif_orig_ifindex)) {

			err = ipif_move(mipif, to_ill, q, mp, rep_ipif_ptr);

			/*
			 * When the MOVE fails, it is the job of the
			 * application to take care of this properly,
			 * i.e. try again if it is ENOMEM.
			 */
			if (mipif->ipif_ill != from_ill) {
				/*
				 * The ipif has moved.
				 *
				 * Move the multicast memberships associated
				 * with this ipif to the new ill. For IPv6, we
				 * do it once after all the ipifs are moved
				 * (in ill_move) as they are not associated
				 * with ipifs.
				 *
				 * We need to move the ilms now, as the ipif
				 * has already been moved to a new ill even
				 * in the case of errors. If we don't,
				 * neither ilm_free(ipif) (when somebody
				 * unplumbs this ipif) nor ilm_delete(ilm)
				 * will be able to find the ilm.
				 */
				if (!from_ill->ill_isv6)
					ilm_move_v4(from_ill, to_ill, mipif);
			}

			if (err != 0)
				return (err);
		}
	}
	return (0);
}

static int
ill_move(ill_t *from_ill, ill_t *to_ill, queue_t *q, mblk_t *mp)
{
	int	ifindex;
	int	err;
	struct iocblk	*iocp;
	ipif_t	*ipif;
	ipif_t	*rep_ipif_ptr = NULL;
	ipif_t	*from_ipif = NULL;
	boolean_t check_rep_if = B_FALSE;

	iocp = (struct iocblk *)mp->b_rptr;
	if (iocp->ioc_cmd == SIOCLIFFAILOVER) {
		/*
		 * Move everything pointing at from_ill to to_ill.
		 * We achieve this by passing in 0 as ifindex.
		 */
		ifindex = 0;
	} else {
		/*
		 * Move everything pointing at from_ill whose original
		 * ifindex (of connp, ipif, ilm) points at to_ill's
		 * ifindex. We achieve this by passing in that ifindex
		 * rather than 0. Multicast vifs and ilgs move implicitly
		 * because ipifs move.
		 */
		ASSERT(iocp->ioc_cmd == SIOCLIFFAILBACK);
		ifindex = to_ill->ill_phyint->phyint_ifindex;
	}

	/*
	 * Determine if there is at least one ipif that would move from
	 * 'from_ill' to 'to_ill'. If so, it is possible that the replacement
	 * ipif (if it exists) on the to_ill would be consumed as a result of
	 * the move, in which case we need to quiesce the replacement ipif
	 * also.
	 */
	for (from_ipif = from_ill->ill_ipif; from_ipif != NULL;
	    from_ipif = from_ipif->ipif_next) {
		if (((ifindex == 0) ||
		    (ifindex == from_ipif->ipif_orig_ifindex)) &&
		    !(from_ipif->ipif_flags & IPIF_NOFAILOVER)) {
			check_rep_if = B_TRUE;
			break;
		}
	}

	ill_down_ipifs(from_ill, mp, ifindex, B_TRUE);

	GRAB_ILL_LOCKS(from_ill, to_ill);
	if ((ipif = ill_quiescent_to_move(from_ill)) != NULL) {
		(void) ipsq_pending_mp_add(NULL, ipif, q,
		    mp, ILL_MOVE_OK);
		RELEASE_ILL_LOCKS(from_ill, to_ill);
		return (EINPROGRESS);
	}

	/* Check if the replacement ipif is quiescent to delete */
	if (check_rep_if && IPIF_REPL_CHECK(to_ill->ill_ipif,
	    (iocp->ioc_cmd == SIOCLIFFAILBACK))) {
		to_ill->ill_ipif->ipif_state_flags |=
		    IPIF_MOVING | IPIF_CHANGING;
		if ((ipif = ill_quiescent_to_move(to_ill)) != NULL) {
			(void) ipsq_pending_mp_add(NULL, ipif, q,
			    mp, ILL_MOVE_OK);
			RELEASE_ILL_LOCKS(from_ill, to_ill);
			return (EINPROGRESS);
		}
	}
	RELEASE_ILL_LOCKS(from_ill, to_ill);

	ASSERT(!MUTEX_HELD(&to_ill->ill_lock));
	rw_enter(&ill_g_lock, RW_WRITER);
	GRAB_ILL_LOCKS(from_ill, to_ill);
	err = ipif_move_all(from_ill, to_ill, q, mp, ifindex, &rep_ipif_ptr);

	/* ilm_move is done inside ipif_move for IPv4 */
	if (err == 0 && from_ill->ill_isv6)
		ilm_move_v6(from_ill, to_ill, ifindex);

	RELEASE_ILL_LOCKS(from_ill, to_ill);
	rw_exit(&ill_g_lock);

	/*
	 * Send rts messages and multicast messages.
	 */
	if (rep_ipif_ptr != NULL) {
		ip_rts_ifmsg(rep_ipif_ptr);
		ip_rts_newaddrmsg(RTM_DELETE, 0, rep_ipif_ptr);
		IPIF_TRACE_CLEANUP(rep_ipif_ptr);
		mi_free(rep_ipif_ptr);
	}

	conn_move_ill(from_ill, to_ill, ifindex);

	return (err);
}

/*
 * Used to extract the arguments for the FAILOVER/FAILBACK ioctls.
 * Also checks for the validity of the arguments.
 * Note: We are already exclusive inside the from group.
 * It is up to the caller to release the refcnt on the to_ill's.
 */
static int
ip_extract_move_args(queue_t *q, mblk_t *mp, ill_t **ill_from_v4,
    ill_t **ill_from_v6, ill_t **ill_to_v4, ill_t **ill_to_v6)
{
	int	dst_index;
	ipif_t	*ipif_v4, *ipif_v6;
	struct lifreq	*lifr;
	mblk_t	*mp1;
	boolean_t exists;
	sin_t	*sin;
	int	err = 0;

	if ((mp1 = mp->b_cont) == NULL)
		return (EPROTO);

	if ((mp1 = mp1->b_cont) == NULL)
		return (EPROTO);

	lifr = (struct lifreq *)mp1->b_rptr;
	sin = (sin_t *)&lifr->lifr_addr;

	/*
	 * We operate on both IPv4 and IPv6. Thus, we don't allow IPv4/IPv6
	 * specific operations.
	 */
	if (sin->sin_family != AF_UNSPEC)
		return (EINVAL);

	/*
	 * Get the ipif with id 0. We are writer on the from ill. So we can
	 * pass NULLs for the last 4 args and we know the lookup won't fail
	 * with EINPROGRESS.
	 */
	ipif_v4 = ipif_lookup_on_name(lifr->lifr_name,
	    mi_strlen(lifr->lifr_name), B_FALSE, &exists, B_FALSE,
	    ALL_ZONES, NULL, NULL, NULL, NULL);
	ipif_v6 = ipif_lookup_on_name(lifr->lifr_name,
	    mi_strlen(lifr->lifr_name), B_FALSE, &exists, B_TRUE,
	    ALL_ZONES, NULL, NULL, NULL, NULL);

	if (ipif_v4 == NULL && ipif_v6 == NULL)
		return (ENXIO);

	if (ipif_v4 != NULL) {
		ASSERT(ipif_v4->ipif_refcnt != 0);
		if (ipif_v4->ipif_id != 0) {
			err = EINVAL;
			goto done;
		}

		ASSERT(IAM_WRITER_IPIF(ipif_v4));
		*ill_from_v4 = ipif_v4->ipif_ill;
	}

	if (ipif_v6 != NULL) {
		ASSERT(ipif_v6->ipif_refcnt != 0);
		if (ipif_v6->ipif_id != 0) {
			err = EINVAL;
			goto done;
		}

		ASSERT(IAM_WRITER_IPIF(ipif_v6));
		*ill_from_v6 = ipif_v6->ipif_ill;
	}

	err = 0;
	dst_index = lifr->lifr_movetoindex;
	*ill_to_v4 = ill_lookup_on_ifindex(dst_index, B_FALSE,
	    q, mp, ip_process_ioctl, &err);
	if (err != 0) {
		/*
		 * There could be only v6.
		 */
		if (err != ENXIO)
			goto done;
		err = 0;
	}

	*ill_to_v6 = ill_lookup_on_ifindex(dst_index, B_TRUE,
	    q, mp, ip_process_ioctl, &err);
	if (err != 0) {
		if (err != ENXIO)
			goto done;
		if (*ill_to_v4 == NULL) {
			err = ENXIO;
			goto done;
		}
		err = 0;
	}

	/*
	 * If we have something to MOVE, i.e. "from" is not NULL,
	 * "to" should be non-NULL.
	 */
	if ((*ill_from_v4 != NULL && *ill_to_v4 == NULL) ||
	    (*ill_from_v6 != NULL && *ill_to_v6 == NULL)) {
		err = EINVAL;
	}

done:
	if (ipif_v4 != NULL)
		ipif_refrele(ipif_v4);
	if (ipif_v6 != NULL)
		ipif_refrele(ipif_v6);
	return (err);
}
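
/*
 * For reference, a minimal sketch of how a management daemon might drive
 * these ioctls from userland, matching the argument checks above (sketch
 * only, not compiled; error handling omitted and "hme0"/"hme1" are
 * hypothetical interface names):
 */
#if 0
	struct lifreq lifr;
	int s = socket(AF_INET, SOCK_DGRAM, 0);

	(void) strlcpy(lifr.lifr_name, "hme0", sizeof (lifr.lifr_name));
	/* must be AF_UNSPEC: the MOVE covers both IPv4 and IPv6 */
	lifr.lifr_addr.ss_family = AF_UNSPEC;
	/* the destination ill is named by ifindex, not by name */
	lifr.lifr_movetoindex = if_nametoindex("hme1");
	(void) ioctl(s, SIOCLIFFAILOVER, &lifr);
#endif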

/*
 * FAILOVER and FAILBACK are modelled as MOVE operations.
 *
 * We don't check whether the MOVE is within the same group or
 * not, because this ioctl can be used as a generic mechanism
 * to failover from interface A to B, though things will function
 * only if they are really part of the same group. Moreover,
 * all ipifs may be down and hence temporarily out of the group.
 *
 * ipif's that need to be moved are first brought down; V4 ipifs are brought
 * down first and then V6. For each we wait for the ipif's to become
 * quiescent. Bringing down the ipifs ensures that all ires pointing to
 * these ipifs have been deleted and there are no active references. Once
 * quiescent the ipif's are moved and brought up on the new ill.
 *
 * Normally the source ill and destination ill belong to the same IPMP group
 * and hence the same ipsq_t. In the event they don't belong to the same
 * group the two ipsq's are first merged into one ipsq - that of the
 * to_ill. The multicast memberships on the source and destination ill cannot
 * change during the move operation since multicast joins/leaves also have to
 * execute on the same ipsq and are hence serialized.
 */
/* ARGSUSED */
int
ip_sioctl_move(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
	ill_t *ill_to_v4 = NULL;
	ill_t *ill_to_v6 = NULL;
	ill_t *ill_from_v4 = NULL;
	ill_t *ill_from_v6 = NULL;
	int err = 0;

	/*
	 * Set up the from and to ill's; we can get EINPROGRESS only for
	 * the to_ill's.
	 */
	err = ip_extract_move_args(q, mp, &ill_from_v4, &ill_from_v6,
	    &ill_to_v4, &ill_to_v6);

	if (err != 0) {
		ip0dbg(("ip_sioctl_move: extract args failed\n"));
		goto done;
	}

	/*
	 * Nothing to do.
	 */
	if ((ill_from_v4 != NULL) && (ill_from_v4 == ill_to_v4)) {
		goto done;
	}

	/*
	 * Nothing to do.
	 */
	if ((ill_from_v6 != NULL) && (ill_from_v6 == ill_to_v6)) {
		goto done;
	}

	/*
	 * Mark the ill as changing.
	 * The ILL_CHANGING flag is cleared when the ipif's are brought up
	 * in ill_up_ipifs; in case of error it is cleared below.
	 */
	GRAB_ILL_LOCKS(ill_from_v4, ill_from_v6);
	if (ill_from_v4 != NULL)
		ill_from_v4->ill_state_flags |= ILL_CHANGING;
	if (ill_from_v6 != NULL)
		ill_from_v6->ill_state_flags |= ILL_CHANGING;
	RELEASE_ILL_LOCKS(ill_from_v4, ill_from_v6);

	/*
	 * Make sure that both src and dst are
	 * in the same syncq group. If not, make it happen.
	 * We are not holding any locks because we are the writer
	 * on the from_ipsq and we will hold locks in ill_merge_groups
	 * to protect to_ipsq against changing.
	 */
	if (ill_from_v4 != NULL) {
		if (ill_from_v4->ill_phyint->phyint_ipsq !=
		    ill_to_v4->ill_phyint->phyint_ipsq) {
			err = ill_merge_groups(ill_from_v4, ill_to_v4,
			    NULL, mp, q);
			goto err_ret;

		}
		ASSERT(!MUTEX_HELD(&ill_to_v4->ill_lock));
	} else {

		if (ill_from_v6->ill_phyint->phyint_ipsq !=
		    ill_to_v6->ill_phyint->phyint_ipsq) {
			err = ill_merge_groups(ill_from_v6, ill_to_v6,
			    NULL, mp, q);
			goto err_ret;

		}
		ASSERT(!MUTEX_HELD(&ill_to_v6->ill_lock));
	}

	/*
	 * Now that the ipsq's have been merged and we are the writer,
	 * let's mark to_ill as changing as well.
	 */
	GRAB_ILL_LOCKS(ill_to_v4, ill_to_v6);
	if (ill_to_v4 != NULL)
		ill_to_v4->ill_state_flags |= ILL_CHANGING;
	if (ill_to_v6 != NULL)
		ill_to_v6->ill_state_flags |= ILL_CHANGING;
	RELEASE_ILL_LOCKS(ill_to_v4, ill_to_v6);

	/*
	 * It's ok for us to proceed with the move even if
	 * ill_pending_mp is non null on one of the from ill's, as the reply
	 * should not be looking at the ipif; it should only care about the
	 * ill itself.
	 */

	/*
	 * Let's move IPv4 first.
	 */
	if (ill_from_v4 != NULL) {
		ASSERT(IAM_WRITER_ILL(ill_to_v4));
		ill_from_v4->ill_move_in_progress = B_TRUE;
		ill_to_v4->ill_move_in_progress = B_TRUE;
		ill_to_v4->ill_move_peer = ill_from_v4;
		ill_from_v4->ill_move_peer = ill_to_v4;
		err = ill_move(ill_from_v4, ill_to_v4, q, mp);
	}

	/*
	 * Now let's move IPv6.
	 */
	if (err == 0 && ill_from_v6 != NULL) {
		ASSERT(IAM_WRITER_ILL(ill_to_v6));
		ill_from_v6->ill_move_in_progress = B_TRUE;
		ill_to_v6->ill_move_in_progress = B_TRUE;
		ill_to_v6->ill_move_peer = ill_from_v6;
		ill_from_v6->ill_move_peer = ill_to_v6;
		err = ill_move(ill_from_v6, ill_to_v6, q, mp);
	}

err_ret:
	/*
	 * EINPROGRESS means we are waiting for the ipif's that need to be
	 * moved to become quiescent.
	 */
	if (err == EINPROGRESS) {
		goto done;
	}

	/*
	 * If err is set, ill_up_ipifs will not be called, so let's
	 * clear the flags here.
	 */
	GRAB_ILL_LOCKS(ill_to_v4, ill_to_v6);
	GRAB_ILL_LOCKS(ill_from_v4, ill_from_v6);
	/*
	 * Some of the clearing may be redundant. But it is simpler
	 * not to make any extra checks.
	 */
	if (ill_from_v6 != NULL) {
		ill_from_v6->ill_move_in_progress = B_FALSE;
		ill_from_v6->ill_move_peer = NULL;
		ill_from_v6->ill_state_flags &= ~ILL_CHANGING;
	}
	if (ill_from_v4 != NULL) {
		ill_from_v4->ill_move_in_progress = B_FALSE;
		ill_from_v4->ill_move_peer = NULL;
		ill_from_v4->ill_state_flags &= ~ILL_CHANGING;
	}
	if (ill_to_v6 != NULL) {
		ill_to_v6->ill_move_in_progress = B_FALSE;
		ill_to_v6->ill_move_peer = NULL;
		ill_to_v6->ill_state_flags &= ~ILL_CHANGING;
	}
	if (ill_to_v4 != NULL) {
		ill_to_v4->ill_move_in_progress = B_FALSE;
		ill_to_v4->ill_move_peer = NULL;
		ill_to_v4->ill_state_flags &= ~ILL_CHANGING;
	}

	/*
	 * Check for setting INACTIVE, if STANDBY is set and FAILED is not set.
	 * Do this always to maintain proper state i.e. even in case of errors.
	 * As phyint_inactive looks at both v4 and v6 interfaces,
	 * we need not call on both v4 and v6 interfaces.
	 */
	if (ill_from_v4 != NULL) {
		if ((ill_from_v4->ill_phyint->phyint_flags &
		    (PHYI_STANDBY | PHYI_FAILED)) == PHYI_STANDBY) {
			phyint_inactive(ill_from_v4->ill_phyint);
		}
	} else if (ill_from_v6 != NULL) {
		if ((ill_from_v6->ill_phyint->phyint_flags &
		    (PHYI_STANDBY | PHYI_FAILED)) == PHYI_STANDBY) {
			phyint_inactive(ill_from_v6->ill_phyint);
		}
	}

	if (ill_to_v4 != NULL) {
		if (ill_to_v4->ill_phyint->phyint_flags & PHYI_INACTIVE) {
			ill_to_v4->ill_phyint->phyint_flags &= ~PHYI_INACTIVE;
		}
	} else if (ill_to_v6 != NULL) {
		if (ill_to_v6->ill_phyint->phyint_flags & PHYI_INACTIVE) {
			ill_to_v6->ill_phyint->phyint_flags &= ~PHYI_INACTIVE;
		}
	}

	RELEASE_ILL_LOCKS(ill_to_v4, ill_to_v6);
	RELEASE_ILL_LOCKS(ill_from_v4, ill_from_v6);

no_err:
	/*
	 * Let's bring the interfaces up on the to_ill.
	 */
	if (err == 0) {
		err = ill_up_ipifs(ill_to_v4 == NULL ?
		    ill_to_v6 : ill_to_v4, q, mp);
	}

	if (err == 0) {
		if (ill_from_v4 != NULL && ill_to_v4 != NULL)
			ilm_send_multicast_reqs(ill_from_v4, ill_to_v4);

		if (ill_from_v6 != NULL && ill_to_v6 != NULL)
			ilm_send_multicast_reqs(ill_from_v6, ill_to_v6);
	}
done:

	if (ill_to_v4 != NULL) {
		ill_refrele(ill_to_v4);
	}
	if (ill_to_v6 != NULL) {
		ill_refrele(ill_to_v6);
	}

	return (err);
}

static void
ill_dl_down(ill_t *ill)
{
	/*
	 * The ill is down; unbind but stay attached since we're still
	 * associated with a PPA. If we have negotiated DLPI capabilities
	 * with the data link service provider (IDS_OK) then reset them.
	 * The interval between unbinding and rebinding is potentially
	 * unbounded hence we cannot assume things will be the same.
	 * The DLPI capabilities will be probed again when the data link
	 * is brought up.
	 */
	mblk_t	*mp = ill->ill_unbind_mp;
	hook_nic_event_t *info;

	ip1dbg(("ill_dl_down(%s)\n", ill->ill_name));

	ill->ill_unbind_mp = NULL;
	if (mp != NULL) {
		ip1dbg(("ill_dl_down: %s (%u) for %s\n",
		    dlpi_prim_str(*(int *)mp->b_rptr), *(int *)mp->b_rptr,
		    ill->ill_name));
		mutex_enter(&ill->ill_lock);
		ill->ill_state_flags |= ILL_DL_UNBIND_IN_PROGRESS;
		mutex_exit(&ill->ill_lock);
		if (ill->ill_dlpi_capab_state == IDS_OK)
			ill_capability_reset(ill);
		ill_dlpi_send(ill, mp);
	}

	/*
	 * Toss all of our multicast memberships. We could keep them, but
	 * then we'd have to do bookkeeping of any joins and leaves performed
	 * by the application while the interface is down (we can't just
	 * issue them because arp cannot currently process AR_ENTRY_SQUERY's
	 * on a downed interface).
	 */
	ill_leave_multicast(ill);

	mutex_enter(&ill->ill_lock);

	ill->ill_dl_up = 0;

	if ((info = ill->ill_nic_event_info) != NULL) {
		ip2dbg(("ill_dl_down: unexpected nic event %d attached for %s\n",
		    info->hne_event, ill->ill_name));
		if (info->hne_data != NULL)
			kmem_free(info->hne_data, info->hne_datalen);
		kmem_free(info, sizeof (hook_nic_event_t));
	}

	info = kmem_alloc(sizeof (hook_nic_event_t), KM_NOSLEEP);
	if (info != NULL) {
		info->hne_nic = ill->ill_phyint->phyint_ifindex;
		info->hne_lif = 0;
		info->hne_event = NE_DOWN;
		info->hne_data = NULL;
		info->hne_datalen = 0;
		info->hne_family = ill->ill_isv6 ? ipv6 : ipv4;
	} else
		ip2dbg(("ill_dl_down: could not attach DOWN nic event "
		    "information for %s (ENOMEM)\n", ill->ill_name));

	ill->ill_nic_event_info = info;

	mutex_exit(&ill->ill_lock);
}

void
ill_dlpi_dispatch(ill_t *ill, mblk_t *mp)
{
	union DL_primitives *dlp;
	t_uscalar_t prim;

	ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);

	dlp = (union DL_primitives *)mp->b_rptr;
	prim = dlp->dl_primitive;

	ip1dbg(("ill_dlpi_dispatch: sending %s (%u) to %s\n",
	    dlpi_prim_str(prim), prim, ill->ill_name));

	switch (prim) {
	case DL_PHYS_ADDR_REQ:
	{
		dl_phys_addr_req_t *dlpap = (dl_phys_addr_req_t *)mp->b_rptr;
		ill->ill_phys_addr_pend = dlpap->dl_addr_type;
		break;
	}
	case DL_BIND_REQ:
		mutex_enter(&ill->ill_lock);
		ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS;
		mutex_exit(&ill->ill_lock);
		break;
	}

	ill->ill_dlpi_pending = prim;

	/*
	 * Some drivers send M_FLUSH up to IP as part of the unbind
	 * request. When this M_FLUSH is sent back to the driver,
	 * it can go after we send the detach request if the
	 * M_FLUSH ends up in IP's syncq. To avoid that, we reply
	 * to the M_FLUSH in ip_rput and locally generate another
	 * M_FLUSH for correctness. This will get freed in
	 * ip_wput_nondata.
	 */
	if (prim == DL_UNBIND_REQ)
		(void) putnextctl1(ill->ill_rq, M_FLUSH, FLUSHRW);

	putnext(ill->ill_wq, mp);
}

/*
 * Send a DLPI control message to the driver but make sure there
 * is only one outstanding message. Uses ill_dlpi_pending to tell
 * when it must queue. ip_rput_dlpi_writer calls ill_dlpi_done()
 * when an ACK or a NAK is received to process the next queued message.
 *
 * We don't protect ill_dlpi_pending with any lock. This is okay as
 * every place where it is accessed, ip is exclusive while accessing
 * ill_dlpi_pending, except when this function is called from ill_init().
 */
void
ill_dlpi_send(ill_t *ill, mblk_t *mp)
{
	mblk_t **mpp;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);

	if (ill->ill_dlpi_pending != DL_PRIM_INVAL) {
		/* Must queue message. Tail insertion */
		mpp = &ill->ill_dlpi_deferred;
		while (*mpp != NULL)
			mpp = &((*mpp)->b_next);

		ip1dbg(("ill_dlpi_send: deferring request for %s\n",
		    ill->ill_name));

		*mpp = mp;
		return;
	}

	ill_dlpi_dispatch(ill, mp);
}
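
/*
 * For illustration, the typical way a caller hands a primitive to this
 * machinery (sketch only, not compiled; assumes the caller is writer on
 * the ill and that a DLPI mblk allocation helper such as the file's
 * ip_dlpi_alloc is available):
 */
#if 0
	mblk_t *mp;

	/* build a DL_UNBIND_REQ M_PROTO mblk, then hand it off */
	mp = ip_dlpi_alloc(sizeof (dl_unbind_req_t), DL_UNBIND_REQ);
	if (mp != NULL)
		ill_dlpi_send(ill, mp);	/* deferred if one is outstanding */
	/* ip_rput_dlpi_writer -> ill_dlpi_done() then sends the next one */
#endif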

/*
 * Called when a DLPI control message has been acked or nacked to
 * send down the next queued message (if any).
 */
void
ill_dlpi_done(ill_t *ill, t_uscalar_t prim)
{
	mblk_t *mp;

	ASSERT(IAM_WRITER_ILL(ill));

	ASSERT(prim != DL_PRIM_INVAL);
	if (ill->ill_dlpi_pending != prim) {
		if (ill->ill_dlpi_pending == DL_PRIM_INVAL) {
			(void) mi_strlog(ill->ill_rq, 1,
			    SL_CONSOLE|SL_ERROR|SL_TRACE,
			    "ill_dlpi_done: unsolicited ack for %s from %s\n",
			    dlpi_prim_str(prim), ill->ill_name);
		} else {
			(void) mi_strlog(ill->ill_rq, 1,
			    SL_CONSOLE|SL_ERROR|SL_TRACE,
			    "ill_dlpi_done: unexpected ack for %s from %s "
			    "(expecting ack for %s)\n",
			    dlpi_prim_str(prim), ill->ill_name,
			    dlpi_prim_str(ill->ill_dlpi_pending));
		}
		return;
	}

	ip1dbg(("ill_dlpi_done: %s has completed %s (%u)\n", ill->ill_name,
	    dlpi_prim_str(ill->ill_dlpi_pending), ill->ill_dlpi_pending));

	if ((mp = ill->ill_dlpi_deferred) == NULL) {
		ill->ill_dlpi_pending = DL_PRIM_INVAL;
		return;
	}

	ill->ill_dlpi_deferred = mp->b_next;
	mp->b_next = NULL;

	ill_dlpi_dispatch(ill, mp);
}

void
conn_delete_ire(conn_t *connp, caddr_t arg)
{
	ipif_t	*ipif = (ipif_t *)arg;
	ire_t	*ire;

	/*
	 * Look at the cached ires on conns which have pointers to ipifs.
	 * We just call ire_refrele, which clears up the reference
	 * to the ire. Called when a conn closes. Also called from ipif_free
	 * to cleanup indirect references to the stale ipif via the cached ire.
	 */
	mutex_enter(&connp->conn_lock);
	ire = connp->conn_ire_cache;
	if (ire != NULL && (ipif == NULL || ire->ire_ipif == ipif)) {
		connp->conn_ire_cache = NULL;
		mutex_exit(&connp->conn_lock);
		IRE_REFRELE_NOTR(ire);
		return;
	}
	mutex_exit(&connp->conn_lock);
}

/*
 * Some operations (illgrp_delete(), ipif_down()) conditionally delete a number
 * of IREs. Those IREs may have been previously cached in the conn structure.
 * This ipcl_walk() walker function releases all references to such IREs based
 * on the condemned flag.
 */
/* ARGSUSED */
void
conn_cleanup_stale_ire(conn_t *connp, caddr_t arg)
{
	ire_t	*ire;

	mutex_enter(&connp->conn_lock);
	ire = connp->conn_ire_cache;
	if (ire != NULL && (ire->ire_marks & IRE_MARK_CONDEMNED)) {
		connp->conn_ire_cache = NULL;
		mutex_exit(&connp->conn_lock);
		IRE_REFRELE_NOTR(ire);
		return;
	}
	mutex_exit(&connp->conn_lock);
}
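
/*
 * Both of the walker functions above are run across every conn via
 * ipcl_walk(); e.g. ipif_down() below does
 * ipcl_walk(conn_cleanup_stale_ire, NULL), and a per-ipif flush would
 * look like the following sketch (illustrative only, not compiled):
 */
#if 0
	/* release every conn's cached ire that points at this ipif */
	ipcl_walk(conn_delete_ire, (caddr_t)ipif);
#endif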

/*
 * Take down a specific interface, but don't lose any information about it.
 * Also delete the interface from its interface group (ifgrp).
 * (Always called as writer.)
 * This function goes through the down sequence even if the interface is
 * already down. There are 2 reasons.
 * a. Currently we permit interface routes that depend on down interfaces
 *    to be added. This behaviour itself is questionable. However it appears
 *    that both Solaris and 4.3 BSD have exhibited this behaviour for a long
 *    time. We go thru the cleanup in order to remove these routes.
 * b. The bringup of the interface could fail in ill_dl_up i.e. we get
 *    DL_ERROR_ACK in response to the DL_BIND request. The interface is
 *    down, but we need to cleanup i.e. do ill_dl_down and
 *    ip_rput_dlpi_writer (DL_ERROR_ACK) -> ipif_down.
 *
 * IP-MT notes:
 *
 * Model of reference to interfaces.
 *
 * The following members in ipif_t track references to the ipif.
 *	int	ipif_refcnt;	Active reference count
 *	uint_t	ipif_ire_cnt;	Number of ire's referencing this ipif
 * The following members in ill_t track references to the ill.
 *	int	ill_refcnt;	active refcnt
 *	uint_t	ill_ire_cnt;	Number of ires referencing ill
 *	uint_t	ill_nce_cnt;	Number of nces referencing ill
 *
 * Reference to an ipif or ill can be obtained in any of the following ways.
 *
 *	Through the ipif_lookup_* / ill_lookup_* functions
 *	Pointers to ipif / ill from other data structures viz ire and conn.
 *	Implicit reference to the ipif / ill by holding a reference to the ire.
 *
 * The ipif/ill lookup functions return a reference-held ipif / ill.
 * ipif_refcnt and ill_refcnt track the reference counts respectively.
 * This is a purely dynamic reference count associated with threads holding
 * references to the ipif / ill. Pointers from other structures do not
 * count towards this reference count.
 *
 * ipif_ire_cnt/ill_ire_cnt is the number of ire's associated with the
 * ipif/ill. This is incremented whenever a new ire is created referencing the
 * ipif/ill. This is done atomically inside ire_add_v[46] where the ire is
 * actually added to the ire hash table. The count is decremented in
 * ire_inactive where the ire is destroyed.
 *
 * nce's reference ill's thru nce_ill and the count of nce's associated with
 * an ill is recorded in ill_nce_cnt. This is incremented atomically in
 * ndp_add() where the nce is actually added to the table. Similarly it is
 * decremented in ndp_inactive() where the nce is destroyed.
 *
 * Flow of ioctls involving interface down/up
 *
 * The following is the sequence of an attempt to set some critical flags on an
 * up interface.
 * ip_sioctl_flags
 *	ipif_down
 *	wait for ipif to be quiescent
 *	ipif_down_tail
 *	ip_sioctl_flags_tail
 *
 * All set ioctls that involve a down/up sequence would have a skeleton similar
 * to the above. All the *tail functions are called after the refcounts have
 * dropped to the appropriate values.
 *
 * The mechanism to quiesce an ipif is as follows.
 *
 * Mark the ipif as IPIF_CHANGING. No more lookups will be allowed
 * on the ipif. Callers either pass a flag requesting wait or the lookup
 * functions will return NULL.
 *
 * Delete all ires referencing this ipif.
 *
 * Any thread attempting to do an ipif_refhold on an ipif that has been
 * obtained thru a cached pointer will first make sure that
 * the ipif can be refheld using the macro IPIF_CAN_LOOKUP and only then
 * increment the refcount.
 *
 * The above guarantees that the ipif refcount will eventually come down to
 * zero and the ipif will quiesce, once all threads that currently hold a
 * reference to the ipif refrelease the ipif. The ipif is quiescent after the
 * ipif_refcount has dropped to zero and all ire's associated with this ipif
 * have also been ire_inactive'd, i.e. when ipif_ire_cnt and ipif_refcnt both
 * drop to zero.
 *
 * Lookups during the IPIF_CHANGING/ILL_CHANGING interval.
 *
 * Threads trying to lookup an ipif or ill can pass a flag requesting
 * wait and restart if the ipif / ill cannot be looked up currently.
 * For example, bind and route operations (e.g. route add / delete) cannot
 * return failure if the ipif is currently undergoing an exclusive operation,
 * and hence pass the flag. The mblk is then enqueued in the ipsq and the
 * operation is restarted by ipsq_exit() when the currently exclusive ioctl
 * completes. The lookup and enqueue is atomic using the ill_lock and
 * ipsq_lock. The lookup is done holding the ill_lock. Hence the ill/ipif
 * state flags can't change while the ill_lock is held. Before dropping the
 * ill_lock we acquire the ipsq_lock and call ipsq_enq. This ensures that
 * ipsq_exit can't finish until we release the ipsq_lock, even though the
 * ill/ipif state flags can change after we drop the ill_lock.
 *
 * An attempt to send out a packet using an ipif that is currently
 * IPIF_CHANGING will fail. No attempt is made in this case to enqueue this
 * operation and restart it later when the exclusive condition on the ipif
 * ends. This is an example of not passing the wait flag to the lookup
 * functions. For example an attempt to refhold and use
 * conn->conn_multicast_ipif and send out a multicast packet on that ipif
 * will fail while the ipif is IPIF_CHANGING. An attempt to create an
 * IRE_CACHE using an ipif that is currently IPIF_CHANGING will also fail.
 */
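
/*
 * The refhold-from-cached-pointer rule described above looks roughly
 * like this in practice (illustrative sketch only, not compiled):
 */
#if 0
	mutex_enter(&ill->ill_lock);
	if (IPIF_CAN_LOOKUP(ipif)) {
		ipif_refhold_locked(ipif);	/* safe: not CHANGING */
		mutex_exit(&ill->ill_lock);
		/* ... use the ipif, then ipif_refrele(ipif) ... */
	} else {
		/* CHANGING/CONDEMNED: bail out, or enqueue and wait */
		mutex_exit(&ill->ill_lock);
	}
#endif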

int
ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
{
	ill_t	*ill = ipif->ipif_ill;
	phyint_t *phyi;
	conn_t	*connp;
	boolean_t success;
	boolean_t ipif_was_up = B_FALSE;

	ASSERT(IAM_WRITER_IPIF(ipif));

	ip1dbg(("ipif_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));

	if (ipif->ipif_flags & IPIF_UP) {
		mutex_enter(&ill->ill_lock);
		ipif->ipif_flags &= ~IPIF_UP;
		ASSERT(ill->ill_ipif_up_count > 0);
		--ill->ill_ipif_up_count;
		mutex_exit(&ill->ill_lock);
		ipif_was_up = B_TRUE;
		/* Update status in SCTP's list */
		sctp_update_ipif(ipif, SCTP_IPIF_DOWN);
	}

	/*
	 * Blow away v6 memberships we established in ipif_multicast_up(); the
	 * v4 ones are left alone (as is the ipif_multicast_up flag, so we
	 * know not to rejoin when the interface is brought back up).
	 */
	if (ipif->ipif_isv6)
		ipif_multicast_down(ipif);
	/*
	 * Remove from the mapping for __sin6_src_id. We insert only
	 * when the address is not INADDR_ANY. As IPv4 addresses are
	 * stored as mapped addresses, we need to check for mapped
	 * INADDR_ANY also.
	 */
	if (ipif_was_up && !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) &&
	    !IN6_IS_ADDR_V4MAPPED_ANY(&ipif->ipif_v6lcl_addr) &&
	    !(ipif->ipif_flags & IPIF_NOLOCAL)) {
		int err;

		err = ip_srcid_remove(&ipif->ipif_v6lcl_addr,
		    ipif->ipif_zoneid);
		if (err != 0) {
			ip0dbg(("ipif_down: srcid_remove %d\n", err));
		}
	}

	/*
	 * Before we delete the ill from the group (if any), we need
	 * to make sure that we delete all the routes dependent on
	 * this and also any ipifs dependent on this ipif for
	 * source address. We need to do this before we delete from
	 * the group because
	 *
	 * 1) ipif_down_delete_ire de-references ill->ill_group.
	 *
	 * 2) ipif_update_other_ipifs needs to walk the whole group
	 *    for re-doing source address selection. Note that
	 *    ipif_select_source[_v6] called from
	 *    ipif_update_other_ipifs[_v6] will not pick this ipif
	 *    because we have already marked it down here, i.e. cleared
	 *    IPIF_UP.
	 */
	if (ipif->ipif_isv6)
		ire_walk_v6(ipif_down_delete_ire, (char *)ipif, ALL_ZONES);
	else
		ire_walk_v4(ipif_down_delete_ire, (char *)ipif, ALL_ZONES);

	/*
	 * These also need to be saved and restored when the
	 * ipif is brought down and up.
	 */
	mutex_enter(&ire_mrtun_lock);
	if (ire_mrtun_count != 0) {
		mutex_exit(&ire_mrtun_lock);
		ire_walk_ill_mrtun(0, 0, ipif_down_delete_ire,
		    (char *)ipif, NULL);
	} else {
		mutex_exit(&ire_mrtun_lock);
	}

	mutex_enter(&ire_srcif_table_lock);
	if (ire_srcif_table_count > 0) {
		mutex_exit(&ire_srcif_table_lock);
		ire_walk_srcif_table_v4(ipif_down_delete_ire, (char *)ipif);
	} else {
		mutex_exit(&ire_srcif_table_lock);
	}

	/*
	 * Cleaning up the conn_ire_cache or conns must be done only after the
	 * ires have been deleted above. Otherwise a thread could end up
	 * caching an ire in a conn after we have finished the cleanup of the
	 * conn. The caching is done after making sure that the ire is not yet
	 * condemned. Also documented in the block comment above ip_output.
	 */
	ipcl_walk(conn_cleanup_stale_ire, NULL);
	/* Also, delete the ires cached in SCTP */
	sctp_ire_cache_flush(ipif);

	/* Resolve any IPsec/IKE NAT-T instances that depend on this ipif. */
	nattymod_clean_ipif(ipif);

	/*
	 * Update any other ipifs which have used "our" local address as
	 * a source address. This entails removing and recreating IRE_INTERFACE
	 * entries for such ipifs.
	 */
	if (ipif->ipif_isv6)
		ipif_update_other_ipifs_v6(ipif, ill->ill_group);
	else
		ipif_update_other_ipifs(ipif, ill->ill_group);

	if (ipif_was_up) {
		/*
		 * Check whether it is the last ipif to leave this group.
		 * If this is the last ipif to leave, we should remove
		 * this ill from the group as ipif_select_source will not
		 * be able to find any useful ipifs if this ill is selected
		 * for load balancing.
		 *
		 * For nameless groups, we should call ifgrp_delete if this
		 * belongs to some group. As this ipif is going down, we may
		 * need to reconstruct groups.
		 */
		phyi = ill->ill_phyint;
		/*
		 * If the phyint_groupname_len is 0, it may or may not
		 * be in the nameless group. If the phyint_groupname_len is
		 * not 0, then this ill should be part of some group.
		 * As we always insert this ill in the group if
		 * phyint_groupname_len is not zero when the first ipif
		 * comes up (in ipif_up_done), it should be in a group
		 * when the namelen is not 0.
		 *
		 * NOTE : When we delete the ill from the group, it will
		 * blow away all the IRE_CACHES pointing either at this ipif or
		 * ill_wq (illgrp_cache_delete does this). Thus, no IREs
		 * should be pointing at this ill.
		 */
		ASSERT(phyi->phyint_groupname_len == 0 ||
		    (phyi->phyint_groupname != NULL && ill->ill_group != NULL));

		if (phyi->phyint_groupname_len != 0) {
			if (ill->ill_ipif_up_count == 0)
				illgrp_delete(ill);
		}

		/*
		 * If we have deleted some of the broadcast ires associated
		 * with this ipif, we need to re-nominate somebody else if
		 * the ires that we deleted were the nominated ones.
		 */
		if (ill->ill_group != NULL && !ill->ill_isv6)
			ipif_renominate_bcast(ipif);
	}

	/*
	 * Delete the neighbor-discovery or arp entries for this interface.
	 */
	ipif_ndp_down(ipif);

	/*
	 * If mp is NULL the caller will wait for the appropriate refcnt.
	 * E.g. ip_sioctl_removeif -> ipif_free -> ipif_down
	 * and ill_delete -> ipif_free -> ipif_down
	 */
	if (mp == NULL) {
		ASSERT(q == NULL);
		return (0);
	}

	if (CONN_Q(q)) {
		connp = Q_TO_CONN(q);
		mutex_enter(&connp->conn_lock);
	} else {
		connp = NULL;
	}
	mutex_enter(&ill->ill_lock);
	/*
	 * Are there any ire's pointing to this ipif that are still active?
	 * If this is the last ipif going down, are there any ire's pointing
	 * to this ill that are still active?
	 */
	if (ipif_is_quiescent(ipif)) {
		mutex_exit(&ill->ill_lock);
		if (connp != NULL)
			mutex_exit(&connp->conn_lock);
		return (0);
	}

	ip1dbg(("ipif_down: need to wait, adding pending mp %s ill %p",
	    ill->ill_name, (void *)ill));
	/*
	 * Enqueue the mp atomically in ipsq_pending_mp. When the refcount
	 * drops down, the operation will be restarted by ipif_ill_refrele_tail
	 * which in turn is called by the last refrele on the ipif/ill/ire.
	 */
	success = ipsq_pending_mp_add(connp, ipif, q, mp, IPIF_DOWN);
	if (!success) {
		/* The conn is closing. So just return */
		ASSERT(connp != NULL);
		mutex_exit(&ill->ill_lock);
		mutex_exit(&connp->conn_lock);
		return (EINTR);
	}

	mutex_exit(&ill->ill_lock);
	if (connp != NULL)
		mutex_exit(&connp->conn_lock);
	return (EINPROGRESS);
}

void
ipif_down_tail(ipif_t *ipif)
{
	ill_t	*ill = ipif->ipif_ill;

	/*
	 * Skip any loopback interface (null wq).
	 * If this is the last logical interface on the ill,
	 * have ill_dl_down tell the driver we are gone (unbind).
	 * Note that lun 0 can ipif_down even though
	 * there are other logical units that are up.
	 * This occurs e.g. when we change a "significant" IFF_ flag.
	 */
	if (ill->ill_wq != NULL && !ill->ill_logical_down &&
	    ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 &&
	    ill->ill_dl_up) {
		ill_dl_down(ill);
	}
	ill->ill_logical_down = 0;

	/*
	 * Has to be after removing the routes in ipif_down_delete_ire.
	 */
	if (ipif->ipif_isv6) {
		if (ill->ill_flags & ILLF_XRESOLV)
			ipif_arp_down(ipif);
	} else {
		ipif_arp_down(ipif);
	}

	ip_rts_ifmsg(ipif);
	ip_rts_newaddrmsg(RTM_DELETE, 0, ipif);
}

/*
 * Bring the interface logically down without bringing the physical interface
 * down, e.g. when the netmask is changed. This avoids long-lasting link
 * negotiations between an ethernet interface and certain switches.
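 *
 * (For orientation: a typical trigger is an address/netmask-changing ioctl
 * whose down/up sequence goes through this function; ipif_down_tail above
 * then sees ill_logical_down set and skips the ill_dl_down unbind.)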
 */
static int
ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
{
	/*
	 * The ill_logical_down flag is a transient flag. It is set here
	 * and is cleared once the down has completed in ipif_down_tail.
	 * This flag does not indicate whether the ill stream is in the
	 * DL_BOUND state with the driver. Instead this flag is used by
	 * ipif_down_tail to determine whether to DL_UNBIND the stream with
	 * the driver. The state of the ill stream, i.e. whether it is
	 * DL_BOUND with the driver or not, is indicated by the ill_dl_up flag.
	 */
	ipif->ipif_ill->ill_logical_down = 1;
	return (ipif_down(ipif, q, mp));
}

/*
 * This is called when the SIOCSLIFUSESRC ioctl is processed in IP.
 * Whether or not the usesrc client ILL is already part of a usesrc group,
 * in either case an ire_stq with the matching usesrc client ILL will
 * locate the IREs that need to be deleted. We want IREs to be created
 * with the new source address.
 */
static void
ipif_delete_cache_ire(ire_t *ire, char *ill_arg)
{
	ill_t	*ucill = (ill_t *)ill_arg;

	ASSERT(IAM_WRITER_ILL(ucill));

	if (ire->ire_stq == NULL)
		return;

	if ((ire->ire_type == IRE_CACHE) &&
	    ((ill_t *)ire->ire_stq->q_ptr == ucill))
		ire_delete(ire);
}

/*
 * ire_walk routine to delete every IRE dependent on the interface
 * address that is going down. (Always called as writer.)
 * Works for both v4 and v6.
 * In addition to checking for ire_ipif matches, it also checks for
 * IRE_CACHE entries which have the same source address as the
 * disappearing ipif, since ipif_select_source might have picked
 * that source. Note that ipif_down/ipif_update_other_ipifs takes
 * care of any IRE_INTERFACE with the disappearing source address.
 */
static void
ipif_down_delete_ire(ire_t *ire, char *ipif_arg)
{
	ipif_t	*ipif = (ipif_t *)ipif_arg;
	ill_t	*ire_ill;
	ill_t	*ipif_ill;

	ASSERT(IAM_WRITER_IPIF(ipif));
	if (ire->ire_ipif == NULL)
		return;

	/*
	 * For IPv4, we derive source addresses for an IRE from ipif's
	 * belonging to the same IPMP group as the IRE's outgoing
	 * interface. If an IRE's outgoing interface isn't in the
	 * same IPMP group as a particular ipif, then that ipif
	 * couldn't have been used as a source address for this IRE.
	 *
	 * For IPv6, source addresses are only restricted to the IPMP group
	 * if the IRE is for a link-local address or a multicast address.
	 * Otherwise, source addresses for an IRE can be chosen from
	 * interfaces other than the outgoing interface for that IRE.
	 *
	 * For source address selection details, see ipif_select_source()
	 * and ipif_select_source_v6().
	 */
	if (ire->ire_ipversion == IPV4_VERSION ||
	    IN6_IS_ADDR_LINKLOCAL(&ire->ire_addr_v6) ||
	    IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6)) {
		ire_ill = ire->ire_ipif->ipif_ill;
		ipif_ill = ipif->ipif_ill;

		if (ire_ill->ill_group != ipif_ill->ill_group) {
			return;
		}
	}

	if (ire->ire_ipif != ipif) {
		/*
		 * Look for a matching source address.
		 */
		if (ire->ire_type != IRE_CACHE)
			return;
		if (ipif->ipif_flags & IPIF_NOLOCAL)
			return;

		if (ire->ire_ipversion == IPV4_VERSION) {
			if (ire->ire_src_addr != ipif->ipif_src_addr)
				return;
		} else {
			if (!IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6,
			    &ipif->ipif_v6lcl_addr))
				return;
		}
		ire_delete(ire);
		return;
	}
	/*
	 * ire_delete() will do an ire_flush_cache which will delete
	 * all ire_ipif matches.
	 */
	ire_delete(ire);
}

/*
 * ire_walk_ill function for deleting all IRE_CACHE entries for an ill when
 * 1) an ipif (on that ill) changes the IPIF_DEPRECATED flags, or
 * 2) when an interface is brought up or down (on that ill).
 * This ensures that the IRE_CACHE entries don't retain stale source
 * address selection results.
 */
void
ill_ipif_cache_delete(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;
	ill_t	*ipif_ill;

	ASSERT(IAM_WRITER_ILL(ill));
	/*
	 * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4.
	 * Hence this should be IRE_CACHE.
	 */
	ASSERT(ire->ire_type == IRE_CACHE);

	/*
	 * We are called for IRE_CACHEs whose ire_ipif matches ill.
	 * We are only interested in IRE_CACHEs that have borrowed
	 * the source address from ill_arg, e.g. in ipif_up_done[_v6],
	 * so we need to check whether ire_ipif->ipif_ill matches ill.
	 */
	ASSERT(ire->ire_ipif != NULL);
	ipif_ill = ire->ire_ipif->ipif_ill;
	if (ipif_ill == ill || (ill->ill_group != NULL &&
	    ipif_ill->ill_group == ill->ill_group)) {
		ire_delete(ire);
	}
}

/*
 * Delete all the IREs whose stq references ill_arg.
 */
static void
ill_stq_cache_delete(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;
	ill_t	*ire_ill;

	ASSERT(IAM_WRITER_ILL(ill));
	/*
	 * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4.
	 * Hence this should be IRE_CACHE.
	 */
	ASSERT(ire->ire_type == IRE_CACHE);

	/*
	 * We are called for IRE_CACHEs whose ire_stq or ire_ipif
	 * matches ill. We are only interested in IRE_CACHEs that
	 * have ire_stq->q_ptr pointing at ill_arg. Thus we do the
	 * filtering here.
	 */
	ire_ill = (ill_t *)ire->ire_stq->q_ptr;

	if (ire_ill == ill)
		ire_delete(ire);
}

/*
 * This is called when an ill leaves the group. We want to delete
 * all IRE_CACHEs whose stq is pointing at ill_wq or whose ire_ipif is
 * pointing at ill.
 */
static void
illgrp_cache_delete(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(ill->ill_group == NULL);
	/*
	 * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4.
	 * Hence this should be IRE_CACHE.
	 */
	ASSERT(ire->ire_type == IRE_CACHE);
	/*
	 * We are called for IRE_CACHEs whose ire_stq or ire_ipif
	 * matches ill. We are interested in both.
	 */
	ASSERT((ill == (ill_t *)ire->ire_stq->q_ptr) ||
	    (ire->ire_ipif->ipif_ill == ill));

	ire_delete(ire);
}

/*
 * Initiate the deallocation of an IPIF. Always called as writer. Called
 * by ill_delete or ip_sioctl_removeif.
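 * ipif_free() only initiates the teardown; the final cleanup happens in
 * ipif_free_tail() once the last reference on the ipif has gone.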
 */
static void
ipif_free(ipif_t *ipif)
{
	ASSERT(IAM_WRITER_IPIF(ipif));

	if (ipif->ipif_recovery_id != 0)
		(void) untimeout(ipif->ipif_recovery_id);
	ipif->ipif_recovery_id = 0;

	/* Remove conn references */
	reset_conn_ipif(ipif);

	/*
	 * Make sure we have valid net and subnet broadcast ires for the
	 * other ipifs which share them with this ipif.
	 */
	if (!ipif->ipif_isv6)
		ipif_check_bcast_ires(ipif);

	/*
	 * Take down the interface. We can be called either from ill_delete
	 * or from ip_sioctl_removeif.
	 */
	(void) ipif_down(ipif, NULL, NULL);

	rw_enter(&ill_g_lock, RW_WRITER);
	/* Remove pointers to this ill in the multicast routing tables */
	reset_mrt_vif_ipif(ipif);
	rw_exit(&ill_g_lock);
}

static void
ipif_free_tail(ipif_t *ipif)
{
	mblk_t	*mp;
	ipif_t	**ipifp;

	/*
	 * Free state for additional IRE_IF_[NO]RESOLVER ires.
	 */
	mutex_enter(&ipif->ipif_saved_ire_lock);
	mp = ipif->ipif_saved_ire_mp;
	ipif->ipif_saved_ire_mp = NULL;
	mutex_exit(&ipif->ipif_saved_ire_lock);
	freemsg(mp);

	/*
	 * Need to hold both ill_g_lock and ill_lock while
	 * inserting or removing an ipif from the linked list
	 * of ipifs hanging off the ill.
	 */
	rw_enter(&ill_g_lock, RW_WRITER);
	/*
	 * Remove all multicast memberships on the interface now.
	 * This removes IPv4 multicast memberships joined within
	 * the kernel, as ipif_down does not do ipif_multicast_down
	 * for IPv4. IPv6 is not handled here as the multicast memberships
	 * are based on the ill and not on the ipif.
	 */
	ilm_free(ipif);

	/*
	 * Since we held the ill_g_lock while doing the ilm_free above,
	 * we can assert the ilms were really deleted and not just marked
	 * ILM_DELETED.
	 */
	ASSERT(ilm_walk_ipif(ipif) == 0);

	IPIF_TRACE_CLEANUP(ipif);

	/* Ask SCTP to take it out of its list */
	sctp_update_ipif(ipif, SCTP_IPIF_REMOVE);

	mutex_enter(&ipif->ipif_ill->ill_lock);
	/* Get it out of the ILL interface list. */
	ipifp = &ipif->ipif_ill->ill_ipif;
	for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) {
		if (*ipifp == ipif) {
			*ipifp = ipif->ipif_next;
			break;
		}
	}

	mutex_exit(&ipif->ipif_ill->ill_lock);
	rw_exit(&ill_g_lock);

	mutex_destroy(&ipif->ipif_saved_ire_lock);

	ASSERT(!(ipif->ipif_flags & (IPIF_UP | IPIF_DUPLICATE)));

	/* Free the memory. */
	mi_free((char *)ipif);
}

/*
 * Returns an ipif name in the form "ill_name:unit" if ipif_id is not
 * zero, "ill_name" otherwise. (The separator is IPIF_SEPARATOR_CHAR.)
 */
char *
ipif_get_name(const ipif_t *ipif, char *buf, int len)
{
	char	lbuf[32];
	char	*name;
	size_t	name_len;

	buf[0] = '\0';
	if (!ipif)
		return (buf);
	name = ipif->ipif_ill->ill_name;
	name_len = ipif->ipif_ill->ill_name_length;
	if (ipif->ipif_id != 0) {
		(void) sprintf(lbuf, "%s%c%d", name, IPIF_SEPARATOR_CHAR,
		    ipif->ipif_id);
		name = lbuf;
		name_len = mi_strlen(name) + 1;
	}
	len -= 1;
	buf[len] = '\0';
	len = MIN(len, name_len);
	bcopy(name, buf, len);
	return (buf);
}

/*
 * Find an IPIF based on the name passed in.
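 * (This is roughly the inverse of ipif_get_name() above.)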
 * Names can be of the
 * form <phys> (e.g., le0) or <phys>:<#> (e.g., le0:1).
 * The <phys> string can have forms like <dev><#> (e.g., le0),
 * <dev><#>.<module> (e.g. le0.foo), or <dev>.<module><#> (e.g. ip.tun3).
 * When there is no colon, the implied unit id is zero. <phys> must
 * correspond to the name of an ILL. (May be called as writer.)
 */
static ipif_t *
ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc,
    boolean_t *exists, boolean_t isv6, zoneid_t zoneid, queue_t *q,
    mblk_t *mp, ipsq_func_t func, int *error)
{
	char	*cp;
	char	*endp;
	long	id;
	ill_t	*ill;
	ipif_t	*ipif;
	uint_t	ire_type;
	boolean_t did_alloc = B_FALSE;
	ipsq_t	*ipsq;

	if (error != NULL)
		*error = 0;

	/*
	 * If the caller wants us to create the ipif, make sure we have a
	 * valid zoneid.
	 */
	ASSERT(!do_alloc || zoneid != ALL_ZONES);

	if (namelen == 0) {
		if (error != NULL)
			*error = ENXIO;
		return (NULL);
	}

	*exists = B_FALSE;
	/* Look for a colon in the name. */
	endp = &name[namelen];
	for (cp = endp; --cp > name; ) {
		if (*cp == IPIF_SEPARATOR_CHAR)
			break;
	}

	if (*cp == IPIF_SEPARATOR_CHAR) {
		/*
		 * Reject any non-decimal aliases for logical
		 * interfaces. Aliases with leading zeroes
		 * are also rejected as they introduce ambiguity
		 * in the naming of the interfaces.
		 * In order to conform to existing semantics,
		 * and to not break any programs/scripts relying
		 * on that behaviour, if<0>:0 is considered to be
		 * a valid interface.
		 *
		 * If the alias has two or more digits and the first
		 * is zero, fail.
		 */
		if (&cp[2] < endp && cp[1] == '0')
			return (NULL);
	}

	if (cp <= name) {
		cp = endp;
	} else {
		*cp = '\0';
	}

	/*
	 * Look up the ILL, based on the portion of the name
	 * before the colon. ill_lookup_on_name returns a held ill.
	 * did_alloc is a temporary used to check whether the ill already
	 * exists; if it does, ill_lookup_on_name leaves it clear.
	 */
	ill = ill_lookup_on_name(name, do_alloc, isv6,
	    q, mp, func, error, &did_alloc);
	if (cp != endp)
		*cp = IPIF_SEPARATOR_CHAR;
	if (ill == NULL)
		return (NULL);

	/* Establish the unit number in the name. */
	id = 0;
	if (cp < endp && *endp == '\0') {
		/* If there was a colon, the unit number follows. */
		cp++;
		if (ddi_strtol(cp, NULL, 0, &id) != 0) {
			ill_refrele(ill);
			if (error != NULL)
				*error = ENXIO;
			return (NULL);
		}
	}

	GRAB_CONN_LOCK(q);
	mutex_enter(&ill->ill_lock);
	/*
	 * Now see if there is an IPIF with this unit number.
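	 * The scan runs under ill_lock; the zoneid check below keeps a
	 * caller in one non-global zone from finding another zone's
	 * logical interfaces.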
	 */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		if (ipif->ipif_id == id) {
			if (zoneid != ALL_ZONES &&
			    zoneid != ipif->ipif_zoneid &&
			    ipif->ipif_zoneid != ALL_ZONES) {
				mutex_exit(&ill->ill_lock);
				RELEASE_CONN_LOCK(q);
				ill_refrele(ill);
				if (error != NULL)
					*error = ENXIO;
				return (NULL);
			}
			/*
			 * The block comment at the start of ipif_down
			 * explains the use of the macros used below.
			 */
			if (IPIF_CAN_LOOKUP(ipif)) {
				ipif_refhold_locked(ipif);
				mutex_exit(&ill->ill_lock);
				if (!did_alloc)
					*exists = B_TRUE;
				/*
				 * Drop locks before calling ill_refrele
				 * since it can potentially call into
				 * ipif_ill_refrele_tail which can end up
				 * trying to acquire other locks.
				 */
				RELEASE_CONN_LOCK(q);
				ill_refrele(ill);
				return (ipif);
			} else if (IPIF_CAN_WAIT(ipif, q)) {
				ipsq = ill->ill_phyint->phyint_ipsq;
				mutex_enter(&ipsq->ipsq_lock);
				mutex_exit(&ill->ill_lock);
				ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
				mutex_exit(&ipsq->ipsq_lock);
				RELEASE_CONN_LOCK(q);
				ill_refrele(ill);
				*error = EINPROGRESS;
				return (NULL);
			}
		}
	}
	RELEASE_CONN_LOCK(q);

	if (!do_alloc) {
		mutex_exit(&ill->ill_lock);
		ill_refrele(ill);
		if (error != NULL)
			*error = ENXIO;
		return (NULL);
	}

	/*
	 * If none was found, atomically allocate and return a new one.
	 * Historically, we used IRE_LOOPBACK only for lun 0, and IRE_LOCAL
	 * to support "receive only" use of lo0:1 etc. as is still done
	 * below as an initial guess.
	 * However, this is now likely to be overridden later in
	 * ipif_up_done() when we know for sure what address has been
	 * configured on the interface, since we might have more than one
	 * loopback interface with a loopback address, e.g. in the case of
	 * zones, and all the interfaces with loopback addresses need to be
	 * marked IRE_LOOPBACK.
	 */
	if (ill->ill_net_type == IRE_LOOPBACK && id == 0)
		ire_type = IRE_LOOPBACK;
	else
		ire_type = IRE_LOCAL;
	ipif = ipif_allocate(ill, id, ire_type, B_TRUE);
	if (ipif != NULL)
		ipif_refhold_locked(ipif);
	else if (error != NULL)
		*error = ENOMEM;
	mutex_exit(&ill->ill_lock);
	ill_refrele(ill);
	return (ipif);
}

/*
 * This routine is called whenever a new address comes up on an ipif. If
 * we are configured to respond to address mask requests, then we are
 * supposed to broadcast an address mask reply at this time. This routine
 * is also called if we are already up, but a netmask change is made.
 * This is legal but might not make the system manager very popular.
 * (May be called as writer.)
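 * The reply is an ICMP address mask reply as described in RFC 950,
 * sent to the ipif's broadcast address.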
 */
void
ipif_mask_reply(ipif_t *ipif)
{
	icmph_t	*icmph;
	ipha_t	*ipha;
	mblk_t	*mp;

#define	REPLY_LEN	(sizeof (icmp_ipha) + sizeof (icmph_t) + IP_ADDR_LEN)

	if (!ip_respond_to_address_mask_broadcast)
		return;

	/* ICMP mask reply is IPv4 only */
	ASSERT(!ipif->ipif_isv6);
	/* ICMP mask reply is not for a loopback interface */
	ASSERT(ipif->ipif_ill->ill_wq != NULL);

	mp = allocb(REPLY_LEN, BPRI_HI);
	if (mp == NULL)
		return;
	mp->b_wptr = mp->b_rptr + REPLY_LEN;

	ipha = (ipha_t *)mp->b_rptr;
	bzero(ipha, REPLY_LEN);
	*ipha = icmp_ipha;
	ipha->ipha_ttl = ip_broadcast_ttl;
	ipha->ipha_src = ipif->ipif_src_addr;
	ipha->ipha_dst = ipif->ipif_brd_addr;
	ipha->ipha_length = htons(REPLY_LEN);
	ipha->ipha_ident = 0;

	icmph = (icmph_t *)&ipha[1];
	icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY;
	bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN);
	icmph->icmph_checksum = IP_CSUM(mp, sizeof (ipha_t), 0);
	if (icmph->icmph_checksum == 0)
		icmph->icmph_checksum = 0xffff;

	put(ipif->ipif_wq, mp);

#undef	REPLY_LEN
}

/*
 * When the mtu in the ipif changes, we call this routine through ire_walk
 * to update all the relevant IREs.
 * Skip IRE_LOCAL and "loopback" IRE_BROADCAST by checking ire_stq.
 */
static void
ipif_mtu_change(ire_t *ire, char *ipif_arg)
{
	ipif_t *ipif = (ipif_t *)ipif_arg;

	if (ire->ire_stq == NULL || ire->ire_ipif != ipif)
		return;
	ire->ire_max_frag = MIN(ipif->ipif_mtu, IP_MAXPACKET);
}

/*
 * When the mtu in the ill changes, we call this routine through ire_walk
 * to update all the relevant IREs.
 * Skip IRE_LOCAL and "loopback" IRE_BROADCAST by checking ire_stq.
 */
void
ill_mtu_change(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;

	if (ire->ire_stq == NULL || ire->ire_ipif->ipif_ill != ill)
		return;
	ire->ire_max_frag = ire->ire_ipif->ipif_mtu;
}

/*
 * Join the ipif-specific multicast groups.
 * Must be called after a mapping has been set up in the resolver.
 * (Always called as writer.)
 */
void
ipif_multicast_up(ipif_t *ipif)
{
	int err, index;
	ill_t *ill;

	ASSERT(IAM_WRITER_IPIF(ipif));

	ill = ipif->ipif_ill;
	index = ill->ill_phyint->phyint_ifindex;

	ip1dbg(("ipif_multicast_up\n"));
	if (!(ill->ill_flags & ILLF_MULTICAST) || ipif->ipif_multicast_up)
		return;

	if (ipif->ipif_isv6) {
		if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr))
			return;

		/* Join the all hosts multicast address */
		ip1dbg(("ipif_multicast_up - addmulti\n"));
		/*
		 * Passing B_TRUE means we have to join the multicast
		 * membership on this interface even though the interface
		 * is FAILED. If we join on a different one in the group,
		 * we will not be able to delete the membership later
		 * as we currently don't track where we join when we
		 * join within the kernel, unlike applications where
		 * we have ilg/ilg_orig_index. See ip_addmulti_v6
		 * for more on this.
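		 * The matching delete in ipif_multicast_down() must
		 * therefore be issued on this same ill.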
		 */
		err = ip_addmulti_v6(&ipv6_all_hosts_mcast, ill, index,
		    ipif->ipif_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL);
		if (err != 0) {
			ip0dbg(("ipif_multicast_up: "
			    "all_hosts_mcast failed %d\n",
			    err));
			return;
		}
		/*
		 * Enable multicast for the solicited node multicast address.
		 */
		if (!(ipif->ipif_flags & IPIF_NOLOCAL)) {
			in6_addr_t ipv6_multi = ipv6_solicited_node_mcast;

			ipv6_multi.s6_addr32[3] |=
			    ipif->ipif_v6lcl_addr.s6_addr32[3];

			err = ip_addmulti_v6(&ipv6_multi, ill, index,
			    ipif->ipif_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE,
			    NULL);
			if (err != 0) {
				ip0dbg(("ipif_multicast_up: solicited MC"
				    " failed %d\n", err));
				(void) ip_delmulti_v6(&ipv6_all_hosts_mcast,
				    ill, ill->ill_phyint->phyint_ifindex,
				    ipif->ipif_zoneid, B_TRUE, B_TRUE);
				return;
			}
		}
	} else {
		if (ipif->ipif_lcl_addr == INADDR_ANY)
			return;

		/* Join the all hosts multicast address */
		ip1dbg(("ipif_multicast_up - addmulti\n"));
		err = ip_addmulti(htonl(INADDR_ALLHOSTS_GROUP), ipif,
		    ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL);
		if (err) {
			ip0dbg(("ipif_multicast_up: failed %d\n", err));
			return;
		}
	}
	ipif->ipif_multicast_up = 1;
}

/*
 * Blow away any IPv6 multicast groups that we joined in
 * ipif_multicast_up(); any explicit memberships are blown away in
 * ill_leave_multicast() when the ill is brought down.
 */
static void
ipif_multicast_down(ipif_t *ipif)
{
	int err;

	ASSERT(IAM_WRITER_IPIF(ipif));

	ip1dbg(("ipif_multicast_down\n"));
	if (!ipif->ipif_multicast_up)
		return;

	ASSERT(ipif->ipif_isv6);

	ip1dbg(("ipif_multicast_down - delmulti\n"));

	/*
	 * Leave the all hosts multicast address. Similar to ip_addmulti_v6,
	 * we should look for ilms on this ill rather than the ones that have
	 * been failed over here. They are here temporarily. As
	 * ipif_multicast_up has joined on this ill, we should delete only
	 * from this ill.
	 */
	err = ip_delmulti_v6(&ipv6_all_hosts_mcast, ipif->ipif_ill,
	    ipif->ipif_ill->ill_phyint->phyint_ifindex, ipif->ipif_zoneid,
	    B_TRUE, B_TRUE);
	if (err != 0) {
		ip0dbg(("ipif_multicast_down: all_hosts_mcast failed %d\n",
		    err));
	}
	/*
	 * Disable multicast for the solicited node multicast address.
	 */
	if (!(ipif->ipif_flags & IPIF_NOLOCAL)) {
		in6_addr_t ipv6_multi = ipv6_solicited_node_mcast;

		ipv6_multi.s6_addr32[3] |=
		    ipif->ipif_v6lcl_addr.s6_addr32[3];

		err = ip_delmulti_v6(&ipv6_multi, ipif->ipif_ill,
		    ipif->ipif_ill->ill_phyint->phyint_ifindex,
		    ipif->ipif_zoneid, B_TRUE, B_TRUE);

		if (err != 0) {
			ip0dbg(("ipif_multicast_down: sol MC failed %d\n",
			    err));
		}
	}

	ipif->ipif_multicast_up = 0;
}

/*
 * Used when an interface comes up to recreate any extra routes on this
 * interface.
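 * These are the routes that ip_rt_add() saved in ipif_saved_ire_mp,
 * e.g. static routes added over this interface while it was up.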
 */
static ire_t **
ipif_recover_ire(ipif_t *ipif)
{
	mblk_t	*mp;
	ire_t	**ipif_saved_irep;
	ire_t	**irep;

	ip1dbg(("ipif_recover_ire(%s:%u)", ipif->ipif_ill->ill_name,
	    ipif->ipif_id));

	mutex_enter(&ipif->ipif_saved_ire_lock);
	ipif_saved_irep = (ire_t **)kmem_zalloc(sizeof (ire_t *) *
	    ipif->ipif_saved_ire_cnt, KM_NOSLEEP);
	if (ipif_saved_irep == NULL) {
		mutex_exit(&ipif->ipif_saved_ire_lock);
		return (NULL);
	}

	irep = ipif_saved_irep;
	for (mp = ipif->ipif_saved_ire_mp; mp != NULL; mp = mp->b_cont) {
		ire_t		*ire;
		queue_t		*rfq;
		queue_t		*stq;
		ifrt_t		*ifrt;
		uchar_t		*src_addr;
		uchar_t		*gateway_addr;
		mblk_t		*resolver_mp;
		ushort_t	type;

		/*
		 * When the ire was initially created and then added in
		 * ip_rt_add(), it was created either using
		 * ipif->ipif_net_type in the case of a traditional
		 * interface route, or as one of the IRE_OFFSUBNET types
		 * (with the exception of IRE_HOST type ires, which are
		 * created by icmp_redirect() and which we don't need to
		 * save or recover). In the case where ipif->ipif_net_type
		 * was IRE_LOOPBACK, ip_rt_add() will update the ire_type
		 * to IRE_IF_NORESOLVER before calling ire_add() to satisfy
		 * software like GateD and Sun Cluster which creates routes
		 * using the loopback interface's address as a gateway.
		 *
		 * As ifrt->ifrt_type reflects the already updated ire_type
		 * and since ire_create() expects that IRE_IF_NORESOLVER
		 * will have a valid nce_res_mp field (which doesn't make
		 * sense for an IRE_LOOPBACK), ire_create() will be called
		 * in the same way here as in ip_rt_add(), namely using
		 * ipif->ipif_net_type when the route looks like a
		 * traditional interface route (where
		 * ifrt->ifrt_type & IRE_INTERFACE is true) and otherwise
		 * using the saved ifrt->ifrt_type. This means that in the
		 * case where ipif->ipif_net_type is IRE_LOOPBACK, the ire
		 * created by ire_create() will be an IRE_LOOPBACK; it will
		 * then be turned into an IRE_IF_NORESOLVER and then added
		 * by ire_add().
		 */
		ifrt = (ifrt_t *)mp->b_rptr;
		if (ifrt->ifrt_type & IRE_INTERFACE) {
			rfq = NULL;
			stq = (ipif->ipif_net_type == IRE_IF_RESOLVER)
			    ? ipif->ipif_rq : ipif->ipif_wq;
			src_addr = (ifrt->ifrt_flags & RTF_SETSRC)
			    ? (uint8_t *)&ifrt->ifrt_src_addr
			    : (uint8_t *)&ipif->ipif_src_addr;
			gateway_addr = NULL;
			resolver_mp = ipif->ipif_resolver_mp;
			type = ipif->ipif_net_type;
		} else if (ifrt->ifrt_type & IRE_BROADCAST) {
			/* Recover multiroute broadcast IRE. */
			rfq = ipif->ipif_rq;
			stq = ipif->ipif_wq;
			src_addr = (ifrt->ifrt_flags & RTF_SETSRC)
			    ? (uint8_t *)&ifrt->ifrt_src_addr
			    : (uint8_t *)&ipif->ipif_src_addr;
			gateway_addr = (uint8_t *)&ifrt->ifrt_gateway_addr;
			resolver_mp = ipif->ipif_bcast_mp;
			type = ifrt->ifrt_type;
		} else {
			rfq = NULL;
			stq = NULL;
			src_addr = (ifrt->ifrt_flags & RTF_SETSRC)
			    ? (uint8_t *)&ifrt->ifrt_src_addr : NULL;
			gateway_addr = (uint8_t *)&ifrt->ifrt_gateway_addr;
			resolver_mp = NULL;
			type = ifrt->ifrt_type;
		}

		/*
		 * Create a copy of the IRE with the saved address and
		 * netmask.
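		 * The ire_create() arguments below mirror those used by
		 * ip_rt_add(); ifrt_max_frag and ifrt_iulp_info carry the
		 * metrics the route was saved with.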
		 */
		ip1dbg(("ipif_recover_ire: creating IRE %s (%d) for "
		    "0x%x/0x%x\n",
		    ip_nv_lookup(ire_nv_tbl, ifrt->ifrt_type), ifrt->ifrt_type,
		    ntohl(ifrt->ifrt_addr),
		    ntohl(ifrt->ifrt_mask)));
		ire = ire_create(
		    (uint8_t *)&ifrt->ifrt_addr,
		    (uint8_t *)&ifrt->ifrt_mask,
		    src_addr,
		    gateway_addr,
		    NULL,
		    &ifrt->ifrt_max_frag,
		    NULL,
		    rfq,
		    stq,
		    type,
		    resolver_mp,
		    ipif,
		    NULL,
		    0,
		    0,
		    0,
		    ifrt->ifrt_flags,
		    &ifrt->ifrt_iulp_info,
		    NULL,
		    NULL);

		if (ire == NULL) {
			mutex_exit(&ipif->ipif_saved_ire_lock);
			kmem_free(ipif_saved_irep,
			    ipif->ipif_saved_ire_cnt * sizeof (ire_t *));
			return (NULL);
		}

		/*
		 * Some software (for example, GateD and Sun Cluster) attempts
		 * to create (what amount to) IRE_PREFIX routes with the
		 * loopback address as the gateway. This is primarily done to
		 * set up prefixes with the RTF_REJECT flag set (for example,
		 * when generating aggregate routes.)
		 *
		 * If the IRE type (as defined by ipif->ipif_net_type) is
		 * IRE_LOOPBACK, then we map the request into an
		 * IRE_IF_NORESOLVER.
		 */
		if (ipif->ipif_net_type == IRE_LOOPBACK)
			ire->ire_type = IRE_IF_NORESOLVER;
		/*
		 * The ire is held by ire_add; it will be refrele'd toward
		 * the end of ipif_up_done.
		 */
		(void) ire_add(&ire, NULL, NULL, NULL, B_FALSE);
		*irep = ire;
		irep++;
		ip1dbg(("ipif_recover_ire: added ire %p\n", (void *)ire));
	}
	mutex_exit(&ipif->ipif_saved_ire_lock);
	return (ipif_saved_irep);
}

/*
 * Used to set the netmask and broadcast address to default values when the
 * interface is brought up. (Always called as writer.)
 */
static void
ipif_set_default(ipif_t *ipif)
{
	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));

	if (!ipif->ipif_isv6) {
		/*
		 * Interface holds an IPv4 address. Default
		 * mask is the natural netmask.
		 */
		if (!ipif->ipif_net_mask) {
			ipaddr_t	v4mask;

			v4mask = ip_net_mask(ipif->ipif_lcl_addr);
			V4MASK_TO_V6(v4mask, ipif->ipif_v6net_mask);
		}
		if (ipif->ipif_flags & IPIF_POINTOPOINT) {
			/* ipif_subnet is ipif_pp_dst_addr for pt-pt */
			ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
		} else {
			V6_MASK_COPY(ipif->ipif_v6lcl_addr,
			    ipif->ipif_v6net_mask, ipif->ipif_v6subnet);
		}
		/*
		 * NOTE: SunOS 4.X does this even if the broadcast address
		 * has already been set, so we do the same here.
		 */
		if (ipif->ipif_flags & IPIF_BROADCAST) {
			ipaddr_t	v4addr;

			v4addr = ipif->ipif_subnet | ~ipif->ipif_net_mask;
			IN6_IPADDR_TO_V4MAPPED(v4addr, &ipif->ipif_v6brd_addr);
		}
	} else {
		/*
		 * Interface holds an IPv6-only address. Default
		 * mask is all-ones.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask))
			ipif->ipif_v6net_mask = ipv6_all_ones;
		if (ipif->ipif_flags & IPIF_POINTOPOINT) {
			/* ipif_subnet is ipif_pp_dst_addr for pt-pt */
			ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
		} else {
			V6_MASK_COPY(ipif->ipif_v6lcl_addr,
			    ipif->ipif_v6net_mask, ipif->ipif_v6subnet);
		}
	}
}

/*
 * Return 0 if this address can be used as a local address without causing
 * duplicate address problems.
 * Otherwise, return EADDRNOTAVAIL if the address is already up on a
 * different ill, and EADDRINUSE if it's up on the same ill.
 * Special checks are needed to allow the same IPv6 link-local address
 * on different ills.
 * TODO: allow the same site-local address on different ills.
 */
int
ip_addr_availability_check(ipif_t *new_ipif)
{
	in6_addr_t our_v6addr;
	ill_t	*ill;
	ipif_t	*ipif;
	ill_walk_context_t ctx;

	ASSERT(IAM_WRITER_IPIF(new_ipif));
	ASSERT(MUTEX_HELD(&ip_addr_avail_lock));
	ASSERT(RW_READ_HELD(&ill_g_lock));

	new_ipif->ipif_flags &= ~IPIF_UNNUMBERED;
	if (IN6_IS_ADDR_UNSPECIFIED(&new_ipif->ipif_v6lcl_addr) ||
	    IN6_IS_ADDR_V4MAPPED_ANY(&new_ipif->ipif_v6lcl_addr))
		return (0);

	our_v6addr = new_ipif->ipif_v6lcl_addr;

	if (new_ipif->ipif_isv6)
		ill = ILL_START_WALK_V6(&ctx);
	else
		ill = ILL_START_WALK_V4(&ctx);

	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if ((ipif == new_ipif) ||
			    !(ipif->ipif_flags & IPIF_UP) ||
			    (ipif->ipif_flags & IPIF_UNNUMBERED))
				continue;
			if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
			    &our_v6addr)) {
				if (new_ipif->ipif_flags & IPIF_POINTOPOINT)
					new_ipif->ipif_flags |= IPIF_UNNUMBERED;
				else if (ipif->ipif_flags & IPIF_POINTOPOINT)
					ipif->ipif_flags |= IPIF_UNNUMBERED;
				else if (IN6_IS_ADDR_LINKLOCAL(&our_v6addr) &&
				    new_ipif->ipif_ill != ill)
					continue;
				else if (IN6_IS_ADDR_SITELOCAL(&our_v6addr) &&
				    new_ipif->ipif_ill != ill)
					continue;
				else if (new_ipif->ipif_zoneid !=
				    ipif->ipif_zoneid &&
				    ipif->ipif_zoneid != ALL_ZONES &&
				    (ill->ill_phyint->phyint_flags &
				    PHYI_LOOPBACK))
					continue;
				else if (new_ipif->ipif_ill == ill)
					return (EADDRINUSE);
				else
					return (EADDRNOTAVAIL);
			}
		}
	}

	return (0);
}

/*
 * Bring up an ipif: bring up arp/ndp, bring up the DLPI stream, and add
 * IREs for the ipif.
 * When the routine returns EINPROGRESS then mp has been consumed and
 * the ioctl will be acked from ip_rput_dlpi.
 */
static int
ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp)
{
	ill_t	*ill = ipif->ipif_ill;
	boolean_t isv6 = ipif->ipif_isv6;
	int	err = 0;
	boolean_t success;

	ASSERT(IAM_WRITER_IPIF(ipif));

	ip1dbg(("ipif_up(%s:%u)\n", ill->ill_name, ipif->ipif_id));

	/* Shouldn't get here if it is already up. */
	if (ipif->ipif_flags & IPIF_UP)
		return (EALREADY);

	/* Skip arp/ndp for any loopback interface. */
	if (ill->ill_wq != NULL) {
		conn_t *connp = Q_TO_CONN(q);
		ipsq_t	*ipsq = ill->ill_phyint->phyint_ipsq;

		if (!ill->ill_dl_up) {
			/*
			 * ill_dl_up is not yet set, i.e. we are yet to
			 * DL_BIND with the driver and this is the first
			 * logical interface on the ill to become "up".
			 * Tell the driver to get going (via DL_BIND_REQ).
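			 * ill_dl_up() consumes mp and returns EINPROGRESS;
			 * the ioctl is then acked from ip_rput_dlpi once
			 * the DL_BIND_ACK arrives.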
			 * Note that changing "significant" IFF_ flags
			 * (address/netmask etc.) causes a down/up dance, but
			 * does not cause an unbind (DL_UNBIND) with the
			 * driver.
			 */
			return (ill_dl_up(ill, ipif, mp, q));
		}

		/*
		 * ipif_resolver_up may end up sending an
		 * AR_INTERFACE_UP message to ARP, which would, in
		 * turn send a DLPI message to the driver. ioctls are
		 * serialized and so we cannot send more than one
		 * interface up message at a time. If ipif_resolver_up
		 * does send an interface up message to ARP, we get
		 * EINPROGRESS and we will complete in ip_arp_done.
		 */

		ASSERT(connp != NULL);
		ASSERT(ipsq->ipsq_pending_mp == NULL);
		mutex_enter(&connp->conn_lock);
		mutex_enter(&ill->ill_lock);
		success = ipsq_pending_mp_add(connp, ipif, q, mp, 0);
		mutex_exit(&ill->ill_lock);
		mutex_exit(&connp->conn_lock);
		if (!success)
			return (EINTR);

		/*
		 * Crank up IPv6 neighbor discovery.
		 * Unlike ARP, this should complete when
		 * ipif_ndp_up returns. However, for
		 * ILLF_XRESOLV interfaces we also send an
		 * AR_INTERFACE_UP to the external resolver.
		 * That ioctl will complete in ip_rput.
		 */
		if (isv6) {
			err = ipif_ndp_up(ipif, &ipif->ipif_v6lcl_addr,
			    B_FALSE);
			if (err != 0) {
				if (err != EINPROGRESS)
					mp = ipsq_pending_mp_get(ipsq, &connp);
				return (err);
			}
		}
		/* Now, ARP */
		err = ipif_resolver_up(ipif, Res_act_initial);
		if (err == EINPROGRESS) {
			/* We will complete it in ip_arp_done */
			return (err);
		}
		mp = ipsq_pending_mp_get(ipsq, &connp);
		ASSERT(mp != NULL);
		if (err != 0)
			return (err);
	} else {
		/*
		 * Interfaces without underlying hardware don't do duplicate
		 * address detection.
		 */
		ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
		ipif->ipif_addr_ready = 1;
	}
	return (isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif));
}

/*
 * Perform a bind for the physical device.
 * When the routine returns EINPROGRESS then mp has been consumed and
 * the ioctl will be acked from ip_rput_dlpi.
 * Allocate an unbind message and save it until ipif_down.
 */
static int
ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
{
	mblk_t	*areq_mp = NULL;
	mblk_t	*bind_mp = NULL;
	mblk_t	*unbind_mp = NULL;
	conn_t	*connp;
	boolean_t success;

	ip1dbg(("ill_dl_up(%s)\n", ill->ill_name));
	ASSERT(IAM_WRITER_ILL(ill));

	ASSERT(mp != NULL);

	/* Create a resolver cookie for ARP */
	if (!ill->ill_isv6 && ill->ill_net_type == IRE_IF_RESOLVER) {
		areq_t		*areq;
		uint16_t	sap_addr;

		areq_mp = ill_arp_alloc(ill,
		    (uchar_t *)&ip_areq_template, 0);
		if (areq_mp == NULL) {
			return (ENOMEM);
		}
		freemsg(ill->ill_resolver_mp);
		ill->ill_resolver_mp = areq_mp;
		areq = (areq_t *)areq_mp->b_rptr;
		sap_addr = ill->ill_sap;
		bcopy(&sap_addr, areq->areq_sap, sizeof (sap_addr));
		/*
		 * Wait till we call ill_pending_mp_add to determine
		 * the success before we free the ill_resolver_mp and
		 * attach areq_mp in its place.
		 */
	}
	bind_mp = ip_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long),
	    DL_BIND_REQ);
	if (bind_mp == NULL)
		goto bad;
	((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ill->ill_sap;
	((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS;

	unbind_mp = ip_dlpi_alloc(sizeof (dl_unbind_req_t), DL_UNBIND_REQ);
	if (unbind_mp == NULL)
		goto bad;

	/*
	 * Record state needed to complete this operation when the
	 * DL_BIND_ACK shows up. Also remember the pre-allocated mblks.
	 */
	if (WR(q)->q_next == NULL) {
		connp = Q_TO_CONN(q);
		mutex_enter(&connp->conn_lock);
	} else {
		connp = NULL;
	}
	mutex_enter(&ipif->ipif_ill->ill_lock);
	success = ipsq_pending_mp_add(connp, ipif, q, mp, 0);
	mutex_exit(&ipif->ipif_ill->ill_lock);
	if (connp != NULL)
		mutex_exit(&connp->conn_lock);
	if (!success)
		goto bad;

	/*
	 * Save the unbind message for ill_dl_down(); it will be consumed when
	 * the interface goes down.
	 */
	ASSERT(ill->ill_unbind_mp == NULL);
	ill->ill_unbind_mp = unbind_mp;

	ill_dlpi_send(ill, bind_mp);
	/* Send down link-layer capabilities probe if not already done. */
	ill_capability_probe(ill);

	/*
	 * Sysid used to rely on the fact that netboots set domainname
	 * and the like. Now that miniroot boots aren't strictly netboots
	 * and miniroot network configuration is driven from userland
	 * these things still need to be set. This situation can be detected
	 * by comparing the interface being configured here to the one
	 * dhcack was set to reference by the boot loader. Once sysid is
	 * converted to use dhcp_ipc_getinfo() this call can go away.
	 */
	if ((ipif->ipif_flags & IPIF_DHCPRUNNING) && (dhcack != NULL) &&
	    (strcmp(ill->ill_name, dhcack) == 0) &&
	    (strlen(srpc_domain) == 0)) {
		if (dhcpinit() != 0)
			cmn_err(CE_WARN, "no cached dhcp response");
	}

	/*
	 * This operation will complete in ip_rput_dlpi with either
	 * a DL_BIND_ACK or DL_ERROR_ACK.
	 */
	return (EINPROGRESS);
bad:
	ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name));
	/*
	 * We don't have to check for possible removal from illgrp
	 * as we have not yet inserted in illgrp. For groups
	 * without names, this ipif is still not UP and hence
	 * this could not have possibly had any influence in forming
	 * groups.
	 */

	if (bind_mp != NULL)
		freemsg(bind_mp);
	if (unbind_mp != NULL)
		freemsg(unbind_mp);
	return (ENOMEM);
}

uint_t ip_loopback_mtuplus = IP_LOOPBACK_MTU + IP_SIMPLE_HDR_LENGTH + 20;

/*
 * DLPI and ARP are up.
 * Create all the IREs associated with an interface, bring up multicast.
 * Set the interface flag and finish other initialization
 * that potentially had to be deferred until after DL_BIND_ACK.
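 * Roughly: ipif_up() -> ill_dl_up() (DL_BIND_REQ) -> ip_rput_dlpi()
 * (DL_BIND_ACK) -> ipif_up_done().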
 */
int
ipif_up_done(ipif_t *ipif)
{
	ire_t	*ire_array[20];
	ire_t	**irep = ire_array;
	ire_t	**irep1;
	ipaddr_t net_mask = 0;
	ipaddr_t subnet_mask, route_mask;
	ill_t	*ill = ipif->ipif_ill;
	queue_t	*stq;
	ipif_t	*src_ipif;
	ipif_t	*tmp_ipif;
	boolean_t flush_ire_cache = B_TRUE;
	int	err = 0;
	phyint_t *phyi;
	ire_t	**ipif_saved_irep = NULL;
	int	ipif_saved_ire_cnt;
	int	cnt;
	boolean_t src_ipif_held = B_FALSE;
	boolean_t ire_added = B_FALSE;
	boolean_t loopback = B_FALSE;

	ip1dbg(("ipif_up_done(%s:%u)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id));
	/* Check if this is a loopback interface */
	if (ipif->ipif_ill->ill_wq == NULL)
		loopback = B_TRUE;

	ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
	/*
	 * If all other interfaces for this ill are down or DEPRECATED,
	 * or otherwise unsuitable for source address selection, remove
	 * any IRE_CACHE entries for this ill to make sure source
	 * address selection gets to take this new ipif into account.
	 * No need to hold ill_lock while traversing the ipif list since
	 * we are the writer.
	 */
	for (tmp_ipif = ill->ill_ipif; tmp_ipif;
	    tmp_ipif = tmp_ipif->ipif_next) {
		if (((tmp_ipif->ipif_flags &
		    (IPIF_NOXMIT|IPIF_ANYCAST|IPIF_NOLOCAL|IPIF_DEPRECATED)) ||
		    !(tmp_ipif->ipif_flags & IPIF_UP)) ||
		    (tmp_ipif == ipif))
			continue;
		/* first usable pre-existing interface */
		flush_ire_cache = B_FALSE;
		break;
	}
	if (flush_ire_cache)
		ire_walk_ill_v4(MATCH_IRE_ILL_GROUP | MATCH_IRE_TYPE,
		    IRE_CACHE, ill_ipif_cache_delete, (char *)ill, ill);

	/*
	 * Figure out which way the send-to queue should go. Only
	 * IRE_IF_RESOLVER or IRE_IF_NORESOLVER or IRE_LOOPBACK
	 * should show up here.
	 */
	switch (ill->ill_net_type) {
	case IRE_IF_RESOLVER:
		stq = ill->ill_rq;
		break;
	case IRE_IF_NORESOLVER:
	case IRE_LOOPBACK:
		stq = ill->ill_wq;
		break;
	default:
		return (EINVAL);
	}

	if (ill->ill_phyint->phyint_flags & PHYI_LOOPBACK) {
		/*
		 * lo0:1 and subsequent ipifs were marked IRE_LOCAL in
		 * ipif_lookup_on_name(), but in the case of zones we can have
		 * several loopback addresses on lo0. So all the interfaces
		 * with loopback addresses need to be marked IRE_LOOPBACK.
		 */
		if (V4_PART_OF_V6(ipif->ipif_v6lcl_addr) ==
		    htonl(INADDR_LOOPBACK))
			ipif->ipif_ire_type = IRE_LOOPBACK;
		else
			ipif->ipif_ire_type = IRE_LOCAL;
	}

	if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED)) {
		/*
		 * Can't use our source address. Select a different
		 * source address for the IRE_INTERFACE and IRE_LOCAL.
		 */
		src_ipif = ipif_select_source(ipif->ipif_ill,
		    ipif->ipif_subnet, ipif->ipif_zoneid);
		if (src_ipif == NULL)
			src_ipif = ipif;	/* Last resort */
		else
			src_ipif_held = B_TRUE;
	} else {
		src_ipif = ipif;
	}

	/* Create all the IREs associated with this interface */
	if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
	    !(ipif->ipif_flags & IPIF_NOLOCAL)) {

		/*
		 * If we're on a labeled system then make sure that zone-
		 * private addresses have proper remote host database entries.
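		 * If tsol_check_interface_address() disagrees, the
		 * bring-up is rejected with EINVAL below.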
		 */
		if (is_system_labeled() &&
		    ipif->ipif_ire_type != IRE_LOOPBACK &&
		    !tsol_check_interface_address(ipif))
			return (EINVAL);

		/* Register the source address for __sin6_src_id */
		err = ip_srcid_insert(&ipif->ipif_v6lcl_addr,
		    ipif->ipif_zoneid);
		if (err != 0) {
			ip0dbg(("ipif_up_done: srcid_insert %d\n", err));
			return (err);
		}

		/* If the interface address is set, create the local IRE. */
		ip1dbg(("ipif_up_done: 0x%p creating IRE 0x%x for 0x%x\n",
		    (void *)ipif,
		    ipif->ipif_ire_type,
		    ntohl(ipif->ipif_lcl_addr)));
		*irep++ = ire_create(
		    (uchar_t *)&ipif->ipif_lcl_addr,	/* dest address */
		    (uchar_t *)&ip_g_all_ones,		/* mask */
		    (uchar_t *)&src_ipif->ipif_src_addr, /* source address */
		    NULL,				/* no gateway */
		    NULL,
		    &ip_loopback_mtuplus,		/* max frag size */
		    NULL,
		    ipif->ipif_rq,			/* recv-from queue */
		    NULL,				/* no send-to queue */
		    ipif->ipif_ire_type,		/* LOCAL or LOOPBACK */
		    NULL,
		    ipif,
		    NULL,
		    0,
		    0,
		    0,
		    (ipif->ipif_flags & IPIF_PRIVATE) ?
		    RTF_PRIVATE : 0,
		    &ire_uinfo_null,
		    NULL,
		    NULL);
	} else {
		ip1dbg((
		    "ipif_up_done: not creating IRE %d for 0x%x: flags 0x%x\n",
		    ipif->ipif_ire_type,
		    ntohl(ipif->ipif_lcl_addr),
		    (uint_t)ipif->ipif_flags));
	}
	if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
	    !(ipif->ipif_flags & IPIF_NOLOCAL)) {
		net_mask = ip_net_mask(ipif->ipif_lcl_addr);
	} else {
		net_mask = htonl(IN_CLASSA_NET);	/* fallback */
	}

	subnet_mask = ipif->ipif_net_mask;

	/*
	 * If mask was not specified, use natural netmask of
	 * interface address. Also, store this mask back into the
	 * ipif struct.
	 */
	if (subnet_mask == 0) {
		subnet_mask = net_mask;
		V4MASK_TO_V6(subnet_mask, ipif->ipif_v6net_mask);
		V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
		    ipif->ipif_v6subnet);
	}

	/* Set up the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. */
	if (stq != NULL && !(ipif->ipif_flags & IPIF_NOXMIT) &&
	    ipif->ipif_subnet != INADDR_ANY) {
		/* ipif_subnet is ipif_pp_dst_addr for pt-pt */

		if (ipif->ipif_flags & IPIF_POINTOPOINT) {
			route_mask = IP_HOST_MASK;
		} else {
			route_mask = subnet_mask;
		}

		ip1dbg(("ipif_up_done: ipif 0x%p ill 0x%p "
		    "creating if IRE ill_net_type 0x%x for 0x%x\n",
		    (void *)ipif, (void *)ill,
		    ill->ill_net_type,
		    ntohl(ipif->ipif_subnet)));
		*irep++ = ire_create(
		    (uchar_t *)&ipif->ipif_subnet,	/* dest address */
		    (uchar_t *)&route_mask,		/* mask */
		    (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */
		    NULL,				/* no gateway */
		    NULL,
		    &ipif->ipif_mtu,			/* max frag */
		    NULL,
		    NULL,				/* no recv queue */
		    stq,				/* send-to queue */
		    ill->ill_net_type,			/* IF_[NO]RESOLVER */
		    ill->ill_resolver_mp,		/* xmit header */
		    ipif,
		    NULL,
		    0,
		    0,
		    0,
		    (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE : 0,
		    &ire_uinfo_null,
		    NULL,
		    NULL);
	}

	/*
	 * If the interface address is set, create the broadcast IREs.
	 *
	 * ire_create_bcast checks if the proposed new IRE matches
	 * any existing IRE's with the same physical interface (ILL).
	 * This should get rid of duplicates.
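	 * For illustration, a hypothetical 192.168.1.10/24 address ends up
	 * proposing 0.0.0.0, 255.255.255.255, 192.168.1.0 and 192.168.1.255
	 * below; extra class-derived net broadcasts are added only when the
	 * subnet mask is longer than the natural class mask.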
	 * ire_create_bcast also checks IPIF_NOXMIT and does not create
	 * any broadcast ires in that case.
	 */
	if ((ipif->ipif_subnet != INADDR_ANY) &&
	    (ipif->ipif_flags & IPIF_BROADCAST)) {
		ipaddr_t addr;

		ip1dbg(("ipif_up_done: creating broadcast IRE\n"));
		irep = ire_check_and_create_bcast(ipif, 0, irep,
		    (MATCH_IRE_TYPE | MATCH_IRE_ILL));
		irep = ire_check_and_create_bcast(ipif, INADDR_BROADCAST, irep,
		    (MATCH_IRE_TYPE | MATCH_IRE_ILL));

		/*
		 * For backward compatibility, we need to create net
		 * broadcast ires based on the old "IP address class
		 * system." The reason is that some old machines only
		 * respond to these class-derived net broadcasts.
		 *
		 * But we should not create these net broadcast ires if
		 * the subnet_mask is shorter than the IP address class
		 * derived netmask. Otherwise, we may create a net
		 * broadcast address which is the same as an IP address
		 * on the subnet. Then TCP will refuse to talk to that
		 * address.
		 *
		 * Nor do we need IRE_BROADCAST ires for an interface
		 * with the netmask as 0xFFFFFFFF, as the IRE_LOCAL for
		 * that interface is already created. Creating these
		 * broadcast ires will only create confusion as the
		 * "addr" is going to be the same as the IP address of
		 * the interface.
		 */
		if (net_mask < subnet_mask) {
			addr = net_mask & ipif->ipif_subnet;
			irep = ire_check_and_create_bcast(ipif, addr, irep,
			    (MATCH_IRE_TYPE | MATCH_IRE_ILL));
			irep = ire_check_and_create_bcast(ipif,
			    ~net_mask | addr, irep,
			    (MATCH_IRE_TYPE | MATCH_IRE_ILL));
		}

		if (subnet_mask != 0xFFFFFFFF) {
			addr = ipif->ipif_subnet;
			irep = ire_check_and_create_bcast(ipif, addr, irep,
			    (MATCH_IRE_TYPE | MATCH_IRE_ILL));
			irep = ire_check_and_create_bcast(ipif,
			    ~subnet_mask | addr, irep,
			    (MATCH_IRE_TYPE | MATCH_IRE_ILL));
		}
	}

	ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));

	/* If an earlier ire_create failed, get out now */
	for (irep1 = irep; irep1 > ire_array; ) {
		irep1--;
		if (*irep1 == NULL) {
			ip1dbg(("ipif_up_done: NULL ire found in "
			    "ire_array\n"));
			err = ENOMEM;
			goto bad;
		}
	}

	/*
	 * Need to atomically run ip_addr_availability_check under
	 * ip_addr_avail_lock; if it fails, go to bad and also remove
	 * the ipif from the group. The ill_g_lock is grabbed as reader
	 * just to make sure no new ills or new ipifs are being added
	 * to the system while we are checking the uniqueness of
	 * addresses.
	 */
	rw_enter(&ill_g_lock, RW_READER);
	mutex_enter(&ip_addr_avail_lock);
	/* Mark it up, and increment counters. */
	ipif->ipif_flags |= IPIF_UP;
	ill->ill_ipif_up_count++;
	err = ip_addr_availability_check(ipif);
	mutex_exit(&ip_addr_avail_lock);
	rw_exit(&ill_g_lock);

	if (err != 0) {
		/*
		 * Our address may already be up on the same ill. In this case,
		 * the ARP entry for our ipif replaced the one for the other
		 * ipif. So we don't want to delete it (otherwise the other
		 * ipif would be unable to send packets).
		 * ip_addr_availability_check() identifies this case for us and
		 * returns EADDRINUSE; we need to turn it into EADDRNOTAVAIL
		 * which is the expected error code.
		 */
		if (err == EADDRINUSE) {
			freemsg(ipif->ipif_arp_del_mp);
			ipif->ipif_arp_del_mp = NULL;
			err = EADDRNOTAVAIL;
		}
		ill->ill_ipif_up_count--;
		ipif->ipif_flags &= ~IPIF_UP;
		goto bad;
	}

	/*
	 * Add in all newly created IREs. ire_create_bcast() has
	 * already checked for duplicates of the IRE_BROADCAST type.
	 * We want to add before we call ifgrp_insert which wants
	 * to know whether IRE_IF_RESOLVER exists or not.
	 *
	 * NOTE : We refrele the ire though we may branch to "bad"
	 *	  later on where we do ire_delete. This is okay
	 *	  because nobody can delete it as we are running
	 *	  exclusively.
	 */
	for (irep1 = irep; irep1 > ire_array; ) {
		irep1--;
		ASSERT(!MUTEX_HELD(&((*irep1)->ire_ipif->ipif_ill->ill_lock)));
		/*
		 * refheld by ire_add; refrele'd towards the end of the
		 * function.
		 */
		(void) ire_add(irep1, NULL, NULL, NULL, B_FALSE);
	}
	ire_added = B_TRUE;
	/*
	 * Form groups if possible.
	 *
	 * If we are supposed to be in an ill_group with a name, insert it
	 * now as we know that at least one ipif is UP. Otherwise form
	 * nameless groups.
	 *
	 * If ip_enable_group_ifs is set and the ipif address is not 0, insert
	 * this ipif into the appropriate interface group, or create a
	 * new one. If this is already in a nameless group, we try to form
	 * a bigger group looking at other ills potentially sharing this
	 * ipif's prefix.
	 */
	phyi = ill->ill_phyint;
	if (phyi->phyint_groupname_len != 0) {
		ASSERT(phyi->phyint_groupname != NULL);
		if (ill->ill_ipif_up_count == 1) {
			ASSERT(ill->ill_group == NULL);
			err = illgrp_insert(&illgrp_head_v4, ill,
			    phyi->phyint_groupname, NULL, B_TRUE);
			if (err != 0) {
				ip1dbg(("ipif_up_done: illgrp allocation "
				    "failed, error %d\n", err));
				goto bad;
			}
		}
		ASSERT(ill->ill_group != NULL);
	}

	/*
	 * When this is part of a group, we need to make sure that
	 * any broadcast ires created because of this ipif coming
	 * UP get marked/cleared with IRE_MARK_NORECV appropriately
	 * so that we don't receive duplicate broadcast packets.
	 */
	if (ill->ill_group != NULL && ill->ill_ipif_up_count != 0)
		ipif_renominate_bcast(ipif);

	/* Recover any additional IRE_IF_[NO]RESOLVER entries for this ipif */
	ipif_saved_ire_cnt = ipif->ipif_saved_ire_cnt;
	ipif_saved_irep = ipif_recover_ire(ipif);

	if (!loopback) {
		/*
		 * If the broadcast address has been set, make sure it makes
		 * sense based on the interface address.
		 * Only match on ill since we are sharing broadcast addresses.
		 */
		if ((ipif->ipif_brd_addr != INADDR_ANY) &&
		    (ipif->ipif_flags & IPIF_BROADCAST)) {
			ire_t	*ire;

			ire = ire_ctable_lookup(ipif->ipif_brd_addr, 0,
			    IRE_BROADCAST, ipif, ALL_ZONES,
			    NULL, (MATCH_IRE_TYPE | MATCH_IRE_ILL));

			if (ire == NULL) {
				/*
				 * If there isn't a matching broadcast IRE,
				 * revert to the default for this netmask.
				 */
				ipif->ipif_v6brd_addr = ipv6_all_zeros;
				mutex_enter(&ipif->ipif_ill->ill_lock);
				ipif_set_default(ipif);
				mutex_exit(&ipif->ipif_ill->ill_lock);
			} else {
				ire_refrele(ire);
			}
		}

	}

	/* This is the first interface on this ill */
	if (ill->ill_ipif_up_count == 1 && !loopback) {
		/*
		 * Need to recover all multicast memberships in the driver.
		 * This had to be deferred until we had attached.
		 */
		ill_recover_multicast(ill);
	}
	/* Join the allhosts multicast address */
	ipif_multicast_up(ipif);

	if (!loopback) {
		/*
		 * See whether anybody else would benefit from the
		 * new ipif that we added. We always call this (rather
		 * than only when adding a non-IPIF_NOLOCAL/DEPRECATED/
		 * ANYCAST ipif) for the benefit of illgrp_insert (done
		 * above), which does not do source address selection
		 * itself since it does not want to re-create the
		 * interface routes that we hold references to here.
		 */
		ill_update_source_selection(ill);
	}

	for (irep1 = irep; irep1 > ire_array; ) {
		irep1--;
		if (*irep1 != NULL) {
			/* was held in ire_add */
			ire_refrele(*irep1);
		}
	}

	cnt = ipif_saved_ire_cnt;
	for (irep1 = ipif_saved_irep; cnt > 0; irep1++, cnt--) {
		if (*irep1 != NULL) {
			/* was held in ire_add */
			ire_refrele(*irep1);
		}
	}

	if (!loopback && ipif->ipif_addr_ready) {
		/* Broadcast an address mask reply. */
		ipif_mask_reply(ipif);
	}
	if (ipif_saved_irep != NULL) {
		kmem_free(ipif_saved_irep,
		    ipif_saved_ire_cnt * sizeof (ire_t *));
	}
	if (src_ipif_held)
		ipif_refrele(src_ipif);

	/*
	 * This had to be deferred until we had bound. Tell routing sockets and
	 * others that this interface is up if it looks like the address has
	 * been validated. Otherwise, if it isn't ready yet, wait for
	 * duplicate address detection to do its thing.
	 */
	if (ipif->ipif_addr_ready) {
		ip_rts_ifmsg(ipif);
		ip_rts_newaddrmsg(RTM_ADD, 0, ipif);
		/* Let SCTP update the status for this ipif */
		sctp_update_ipif(ipif, SCTP_IPIF_UP);
	}
	return (0);

bad:
	ip1dbg(("ipif_up_done: FAILED \n"));
	/*
	 * We don't have to bother removing from ill groups because
	 *
	 * 1) For groups with names, we insert only when the first ipif
	 *    comes up. In that case if it fails, it will not be in any
	 *    group. So, we need not try to remove for that case.
	 *
	 * 2) For groups without names, either we tried to insert ipif_ill
	 *    in a group as a singleton or found some other group to become
	 *    a bigger group. For the former, if it fails we don't have
	 *    anything to do as ipif_ill is not in the group and for the
	 *    latter, there are no failures in illgrp_insert/illgrp_delete
	 *    (ENOMEM can't occur for this. Check ifgrp_insert).
	 */
	while (irep > ire_array) {
		irep--;
		if (*irep != NULL) {
			ire_delete(*irep);
			if (ire_added)
				ire_refrele(*irep);
		}
	}
	(void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid);

	if (ipif_saved_irep != NULL) {
		kmem_free(ipif_saved_irep,
		    ipif_saved_ire_cnt * sizeof (ire_t *));
	}
	if (src_ipif_held)
		ipif_refrele(src_ipif);

	ipif_arp_down(ipif);
	return (err);
}

/*
 * Turn off ARP, in response to the ILLF_NOARP flag being set.
 */
static int
ill_arp_off(ill_t *ill)
{
	mblk_t	*arp_off_mp = NULL;
	mblk_t	*arp_on_mp = NULL;

	ip1dbg(("ill_arp_off(%s)\n", ill->ill_name));

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);

	/*
	 * If the on message is still around we've already done
	 * an arp_off without doing an arp_on, thus there is no
	 * work needed.
	 */
	if (ill->ill_arp_on_mp != NULL)
		return (0);

	/*
	 * Allocate an ARP on message (to be saved) and an ARP off message.
	 */
	arp_off_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aroff_template, 0);
	if (!arp_off_mp)
		return (ENOMEM);

	arp_on_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aron_template, 0);
	if (!arp_on_mp)
		goto failed;

	ASSERT(ill->ill_arp_on_mp == NULL);
	ill->ill_arp_on_mp = arp_on_mp;

	/* Send an AR_INTERFACE_OFF request */
	putnext(ill->ill_rq, arp_off_mp);
	return (0);
failed:

	if (arp_off_mp)
		freemsg(arp_off_mp);
	return (ENOMEM);
}

/*
 * Turn ARP back on, in response to the ILLF_NOARP flag being cleared.
 */
static int
ill_arp_on(ill_t *ill)
{
	mblk_t	*mp;

	ip1dbg(("ill_arp_on(%s)\n", ill->ill_name));

	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);

	ASSERT(IAM_WRITER_ILL(ill));
	/*
	 * Send an AR_INTERFACE_ON request if we have already done
	 * an arp_off (which allocated the message).
	 */
	if (ill->ill_arp_on_mp != NULL) {
		mp = ill->ill_arp_on_mp;
		ill->ill_arp_on_mp = NULL;
		putnext(ill->ill_rq, mp);
	}
	return (0);
}

/*
 * Called either after deleting the ill from the group or when setting
 * FAILED or STANDBY on the interface.
 */
static void
illgrp_reset_schednext(ill_t *ill)
{
	ill_group_t *illgrp;
	ill_t *save_ill;

	ASSERT(IAM_WRITER_ILL(ill));
	/*
	 * When called from illgrp_delete, ill_group will be non-NULL.
	 * But when called from ip_sioctl_flags, it could be NULL if
	 * somebody is setting FAILED/INACTIVE on some interface which
	 * is not part of a group.
	 */
	illgrp = ill->ill_group;
	if (illgrp == NULL)
		return;
	if (illgrp->illgrp_ill_schednext != ill)
		return;

	illgrp->illgrp_ill_schednext = NULL;
	save_ill = ill;
	/*
	 * Choose a good ill to be the next one for
	 * outbound traffic. As the FAILED/STANDBY flags
	 * are not yet set when called from ip_sioctl_flags,
	 * we check for ill separately.
	 */
	for (ill = illgrp->illgrp_ill; ill != NULL;
	    ill = ill->ill_group_next) {
		if ((ill != save_ill) &&
		    !(ill->ill_phyint->phyint_flags &
		    (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE))) {
			illgrp->illgrp_ill_schednext = ill;
			return;
		}
	}
}

/*
 * Given an ill, find the next ill in the group to be scheduled.
 * (This should be called by ip_newroute() before ire_create().)
 * The passed in ill may be pulled out of the group, after we have picked
 * up a different outgoing ill from the same group. However ire add will
 * atomically check this.
 */
ill_t *
illgrp_scheduler(ill_t *ill)
{
	ill_t *retill;
	ill_group_t *illgrp;
	int illcnt;
	int i;
	uint64_t flags;

	/*
	 * We don't use a lock to check for the ill_group. If this ill
	 * is currently being inserted we may end up just returning this
	 * ill itself. That is ok.
	 */
	if (ill->ill_group == NULL) {
		ill_refhold(ill);
		return (ill);
	}

	/*
	 * Grab the ill_g_lock as reader to make sure we are dealing with
	 * a set of stable ills. No ill can be added or deleted or change
	 * group while we hold the reader lock.
	 */
	rw_enter(&ill_g_lock, RW_READER);
	if ((illgrp = ill->ill_group) == NULL) {
		rw_exit(&ill_g_lock);
		ill_refhold(ill);
		return (ill);
	}

	illcnt = illgrp->illgrp_ill_count;
	mutex_enter(&illgrp->illgrp_lock);
	retill = illgrp->illgrp_ill_schednext;

	if (retill == NULL)
		retill = illgrp->illgrp_ill;

	/*
	 * We do a circular search beginning at illgrp_ill_schednext
	 * or illgrp_ill. We don't check the flags under the ill lock
	 * since they can change at any time. The ire creation will be
	 * atomic and will fail if the ill is FAILED or OFFLINE.
	 */
	for (i = 0; i < illcnt; i++) {
		flags = retill->ill_phyint->phyint_flags;

		if (!(flags & (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) &&
		    ILL_CAN_LOOKUP(retill)) {
			illgrp->illgrp_ill_schednext = retill->ill_group_next;
			ill_refhold(retill);
			break;
		}
		retill = retill->ill_group_next;
		if (retill == NULL)
			retill = illgrp->illgrp_ill;
	}
	mutex_exit(&illgrp->illgrp_lock);
	rw_exit(&ill_g_lock);

	return (i == illcnt ? NULL : retill);
}

/*
 * Checks for the availability of a usable source address (if there is one)
 * when the destination ILL has the ill_usesrc_ifindex pointing to another
 * ILL. Note this selection is done regardless of the destination.
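 * Returns B_TRUE only if the usesrc ILL has at least one UP,
 * non-NOLOCAL/ANYCAST ipif with a non-zero address in the given zone.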
20377  */
20378 boolean_t
20379 ipif_usesrc_avail(ill_t *ill, zoneid_t zoneid)
20380 {
20381 	uint_t ifindex;
20382 	ipif_t *ipif = NULL;
20383 	ill_t *uill;
20384 	boolean_t isv6;
20385 
20386 	ASSERT(ill != NULL);
20387 
20388 	isv6 = ill->ill_isv6;
20389 	ifindex = ill->ill_usesrc_ifindex;
20390 	if (ifindex != 0) {
20391 		uill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL,
20392 		    NULL);
20393 		if (uill == NULL)
20394 			return (B_FALSE);
20395 		mutex_enter(&uill->ill_lock);
20396 		for (ipif = uill->ill_ipif; ipif != NULL;
20397 		    ipif = ipif->ipif_next) {
20398 			if (!IPIF_CAN_LOOKUP(ipif))
20399 				continue;
20400 			if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
20401 				continue;
20402 			if (!(ipif->ipif_flags & IPIF_UP))
20403 				continue;
20404 			if (ipif->ipif_zoneid != zoneid)
20405 				continue;
20406 			if ((isv6 &&
20407 			    IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) ||
20408 			    (ipif->ipif_lcl_addr == INADDR_ANY))
20409 				continue;
20410 			mutex_exit(&uill->ill_lock);
20411 			ill_refrele(uill);
20412 			return (B_TRUE);
20413 		}
20414 		mutex_exit(&uill->ill_lock);
20415 		ill_refrele(uill);
20416 	}
20417 	return (B_FALSE);
20418 }
20419 
20420 /*
20421  * Determine the best source address given a destination address and an ill.
20422  * Prefers non-deprecated over deprecated but will return a deprecated
20423  * address if there is no other choice. If there is a usable source address
20424  * on the interface pointed to by ill_usesrc_ifindex then that is given
20425  * first preference.
20426  *
20427  * Returns NULL if there is no suitable source address for the ill,
20428  * which only occurs when the ill has no valid source address at all.
20429  */
20430 ipif_t *
20431 ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid)
20432 {
20433 	ipif_t *ipif;
20434 	ipif_t *ipif_dep = NULL;	/* Fallback to deprecated */
20435 	ipif_t *ipif_arr[MAX_IPIF_SELECT_SOURCE];
20436 	int index = 0;
20437 	boolean_t wrapped = B_FALSE;
20438 	boolean_t same_subnet_only = B_FALSE;
20439 	boolean_t ipif_same_found, ipif_other_found;
20440 	boolean_t specific_found;
20441 	ill_t *till, *usill = NULL;
20442 	tsol_tpc_t *src_rhtp, *dst_rhtp;
20443 
20444 	if (ill->ill_usesrc_ifindex != 0) {
20445 		usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex, B_FALSE,
20446 		    NULL, NULL, NULL, NULL);
20447 		if (usill != NULL)
20448 			ill = usill;	/* Select source from usesrc ILL */
20449 		else
20450 			return (NULL);
20451 	}
20452 
20453 	/*
20454 	 * If we're dealing with an unlabeled destination on a labeled system,
20455 	 * make sure that we ignore source addresses that are incompatible with
20456 	 * the destination's default label. That destination's default label
20457 	 * must dominate the minimum label on the source address.
20458 	 */
20459 	dst_rhtp = NULL;
20460 	if (is_system_labeled()) {
20461 		dst_rhtp = find_tpc(&dst, IPV4_VERSION, B_FALSE);
20462 		if (dst_rhtp == NULL)
20463 			return (NULL);
20464 		if (dst_rhtp->tpc_tp.host_type != UNLABELED) {
20465 			TPC_RELE(dst_rhtp);
20466 			dst_rhtp = NULL;
20467 		}
20468 	}
20469 
20470 	/*
20471 	 * Holds the ill_g_lock as reader. This makes sure that no ipif/ill
20472 	 * can be deleted. But an ipif/ill can get CONDEMNED any time.
20473 	 * After selecting the right ipif, under ill_lock make sure ipif is
20474 	 * not condemned, and increment refcnt. If ipif is CONDEMNED,
20475 	 * we retry. Inside the loop we still need to check for CONDEMNED,
20476 	 * but not under a lock.
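	 *
	 * In sketch form, the hold-or-retry step at the bottom of this
	 * function looks like this (illustrative, matching the code below):
	 *
	 *	mutex_enter(&ipif->ipif_ill->ill_lock);
	 *	if (!IPIF_CAN_LOOKUP(ipif)) {		-- became CONDEMNED
	 *		mutex_exit(&ipif->ipif_ill->ill_lock);
	 *		goto retry;
	 *	}
	 *	ipif_refhold_locked(ipif);
	 *	mutex_exit(&ipif->ipif_ill->ill_lock);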
20477 	 */
20478 	rw_enter(&ill_g_lock, RW_READER);
20479 
20480 retry:
20481 	till = ill;
20482 	ipif_arr[0] = NULL;
20483 
20484 	if (till->ill_group != NULL)
20485 		till = till->ill_group->illgrp_ill;
20486 
20487 	/*
20488 	 * Choose one good source address from each ill across the group.
20489 	 * If possible choose a source address in the same subnet as
20490 	 * the destination address.
20491 	 *
20492 	 * We don't check for PHYI_FAILED or PHYI_INACTIVE or PHYI_OFFLINE.
20493 	 * This is okay because of the following.
20494 	 *
20495 	 * If PHYI_FAILED is set and we still have non-deprecated
20496 	 * addresses, it means the addresses have not yet been
20497 	 * failed over to a different interface. We potentially
20498 	 * select them to create IRE_CACHEs, which will be later
20499 	 * flushed when the addresses move over.
20500 	 *
20501 	 * If PHYI_INACTIVE is set and we still have non-deprecated
20502 	 * addresses, it means either the user has configured them
20503 	 * or PHYI_INACTIVE has not been cleared after the addresses
20504 	 * have been moved over. For the former, in.mpathd does a
20505 	 * failover when the interface becomes INACTIVE and hence we
20506 	 * should not find them. Once INACTIVE is set, we don't allow
20507 	 * them to create logical interfaces anymore. For the latter,
20508 	 * a flush will happen when INACTIVE is cleared which will
20509 	 * flush the IRE_CACHEs.
20510 	 *
20511 	 * If PHYI_OFFLINE is set, all the addresses will be failed
20512 	 * over soon. We potentially select them to create IRE_CACHEs,
20513 	 * which will be later flushed when the addresses move over.
20514 	 *
20515 	 * NOTE : As ipif_select_source is called to borrow a source
20516 	 * address for an ipif that is part of a group, source address
20517 	 * selection will be re-done whenever the group changes, i.e.
20518 	 * on any insertion into or deletion from the group.
20519 	 *
20520 	 * Fill ipif_arr[] with source addresses, using these rules:
20521 	 *
20522 	 *	1. At most one source address from a given ill ends up
20523 	 *	   in ipif_arr[] -- that is, at most one of the ipif's
20524 	 *	   associated with a given ill ends up in ipif_arr[].
20525 	 *
20526 	 *	2. If there is at least one non-deprecated ipif in the
20527 	 *	   IPMP group with a source address on the same subnet as
20528 	 *	   our destination, then fill ipif_arr[] only with
20529 	 *	   source addresses on the same subnet as our destination.
20530 	 *	   Note that because of (1), only the first
20531 	 *	   non-deprecated ipif found with a source address
20532 	 *	   matching the destination ends up in ipif_arr[].
20533 	 *
20534 	 *	3. Otherwise, fill ipif_arr[] with non-deprecated source
20535 	 *	   addresses not in the same subnet as our destination.
20536 	 *	   Again, because of (1), only the first off-subnet source
20537 	 *	   address will be chosen.
20538 	 *
20539 	 *	4. If there are no non-deprecated ipifs, then just use
20540 	 *	   the source address associated with the last deprecated
20541 	 *	   one we find that happens to be on the same subnet,
20542 	 *	   otherwise the first one not in the same subnet.
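	 *
	 * A worked example with purely hypothetical addresses: suppose
	 * dst is 192.168.1.40 and the group has hme0 with 192.168.1.10/24
	 * (non-deprecated), hme1 with 10.0.0.1/24 (non-deprecated) and
	 * hme2 with 192.168.1.20/24 (deprecated). Rule 2 applies: once
	 * 192.168.1.10 is seen, same_subnet_only is set and only
	 * on-subnet, non-deprecated addresses are collected, so
	 * ipif_arr[] ends up holding just hme0's ipif, while
	 * 192.168.1.20 is remembered only in ipif_dep as the rule-4
	 * fallback.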
20543 */ 20544 specific_found = B_FALSE; 20545 for (; till != NULL; till = till->ill_group_next) { 20546 ipif_same_found = B_FALSE; 20547 ipif_other_found = B_FALSE; 20548 for (ipif = till->ill_ipif; ipif != NULL; 20549 ipif = ipif->ipif_next) { 20550 if (!IPIF_CAN_LOOKUP(ipif)) 20551 continue; 20552 /* Always skip NOLOCAL and ANYCAST interfaces */ 20553 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 20554 continue; 20555 if (!(ipif->ipif_flags & IPIF_UP) || 20556 !ipif->ipif_addr_ready) 20557 continue; 20558 if (ipif->ipif_zoneid != zoneid && 20559 ipif->ipif_zoneid != ALL_ZONES) 20560 continue; 20561 /* 20562 * Interfaces with 0.0.0.0 address are allowed to be UP, 20563 * but are not valid as source addresses. 20564 */ 20565 if (ipif->ipif_lcl_addr == INADDR_ANY) 20566 continue; 20567 20568 /* 20569 * Check compatibility of local address for 20570 * destination's default label if we're on a labeled 20571 * system. Incompatible addresses can't be used at 20572 * all. 20573 */ 20574 if (dst_rhtp != NULL) { 20575 boolean_t incompat; 20576 20577 src_rhtp = find_tpc(&ipif->ipif_lcl_addr, 20578 IPV4_VERSION, B_FALSE); 20579 if (src_rhtp == NULL) 20580 continue; 20581 incompat = 20582 src_rhtp->tpc_tp.host_type != SUN_CIPSO || 20583 src_rhtp->tpc_tp.tp_doi != 20584 dst_rhtp->tpc_tp.tp_doi || 20585 (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label, 20586 &src_rhtp->tpc_tp.tp_sl_range_cipso) && 20587 !blinlset(&dst_rhtp->tpc_tp.tp_def_label, 20588 src_rhtp->tpc_tp.tp_sl_set_cipso)); 20589 TPC_RELE(src_rhtp); 20590 if (incompat) 20591 continue; 20592 } 20593 20594 /* 20595 * We prefer not to use all all-zones addresses, if we 20596 * can avoid it, as they pose problems with unlabeled 20597 * destinations. 20598 */ 20599 if (ipif->ipif_zoneid != ALL_ZONES) { 20600 if (!specific_found && 20601 (!same_subnet_only || 20602 (ipif->ipif_net_mask & dst) == 20603 ipif->ipif_subnet)) { 20604 index = 0; 20605 specific_found = B_TRUE; 20606 ipif_other_found = B_FALSE; 20607 } 20608 } else { 20609 if (specific_found) 20610 continue; 20611 } 20612 if (ipif->ipif_flags & IPIF_DEPRECATED) { 20613 if (ipif_dep == NULL || 20614 (ipif->ipif_net_mask & dst) == 20615 ipif->ipif_subnet) 20616 ipif_dep = ipif; 20617 continue; 20618 } 20619 if ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet) { 20620 /* found a source address in the same subnet */ 20621 if (!same_subnet_only) { 20622 same_subnet_only = B_TRUE; 20623 index = 0; 20624 } 20625 ipif_same_found = B_TRUE; 20626 } else { 20627 if (same_subnet_only || ipif_other_found) 20628 continue; 20629 ipif_other_found = B_TRUE; 20630 } 20631 ipif_arr[index++] = ipif; 20632 if (index == MAX_IPIF_SELECT_SOURCE) { 20633 wrapped = B_TRUE; 20634 index = 0; 20635 } 20636 if (ipif_same_found) 20637 break; 20638 } 20639 } 20640 20641 if (ipif_arr[0] == NULL) { 20642 ipif = ipif_dep; 20643 } else { 20644 if (wrapped) 20645 index = MAX_IPIF_SELECT_SOURCE; 20646 ipif = ipif_arr[ipif_rand() % index]; 20647 ASSERT(ipif != NULL); 20648 } 20649 20650 if (ipif != NULL) { 20651 mutex_enter(&ipif->ipif_ill->ill_lock); 20652 if (!IPIF_CAN_LOOKUP(ipif)) { 20653 mutex_exit(&ipif->ipif_ill->ill_lock); 20654 goto retry; 20655 } 20656 ipif_refhold_locked(ipif); 20657 mutex_exit(&ipif->ipif_ill->ill_lock); 20658 } 20659 20660 rw_exit(&ill_g_lock); 20661 if (usill != NULL) 20662 ill_refrele(usill); 20663 if (dst_rhtp != NULL) 20664 TPC_RELE(dst_rhtp); 20665 20666 #ifdef DEBUG 20667 if (ipif == NULL) { 20668 char buf1[INET6_ADDRSTRLEN]; 20669 20670 ip1dbg(("ipif_select_source(%s, %s) -> NULL\n", 
20671 ill->ill_name, 20672 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)))); 20673 } else { 20674 char buf1[INET6_ADDRSTRLEN]; 20675 char buf2[INET6_ADDRSTRLEN]; 20676 20677 ip1dbg(("ipif_select_source(%s, %s) -> %s\n", 20678 ipif->ipif_ill->ill_name, 20679 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)), 20680 inet_ntop(AF_INET, &ipif->ipif_lcl_addr, 20681 buf2, sizeof (buf2)))); 20682 } 20683 #endif /* DEBUG */ 20684 return (ipif); 20685 } 20686 20687 20688 /* 20689 * If old_ipif is not NULL, see if ipif was derived from old 20690 * ipif and if so, recreate the interface route by re-doing 20691 * source address selection. This happens when ipif_down -> 20692 * ipif_update_other_ipifs calls us. 20693 * 20694 * If old_ipif is NULL, just redo the source address selection 20695 * if needed. This happens when illgrp_insert or ipif_up_done 20696 * calls us. 20697 */ 20698 static void 20699 ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif) 20700 { 20701 ire_t *ire; 20702 ire_t *ipif_ire; 20703 queue_t *stq; 20704 ipif_t *nipif; 20705 ill_t *ill; 20706 boolean_t need_rele = B_FALSE; 20707 20708 ASSERT(old_ipif == NULL || IAM_WRITER_IPIF(old_ipif)); 20709 ASSERT(IAM_WRITER_IPIF(ipif)); 20710 20711 ill = ipif->ipif_ill; 20712 if (!(ipif->ipif_flags & 20713 (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) { 20714 /* 20715 * Can't possibly have borrowed the source 20716 * from old_ipif. 20717 */ 20718 return; 20719 } 20720 20721 /* 20722 * Is there any work to be done? No work if the address 20723 * is INADDR_ANY, loopback or NOLOCAL or ANYCAST ( 20724 * ipif_select_source() does not borrow addresses from 20725 * NOLOCAL and ANYCAST interfaces). 20726 */ 20727 if ((old_ipif != NULL) && 20728 ((old_ipif->ipif_lcl_addr == INADDR_ANY) || 20729 (old_ipif->ipif_ill->ill_wq == NULL) || 20730 (old_ipif->ipif_flags & 20731 (IPIF_NOLOCAL|IPIF_ANYCAST)))) { 20732 return; 20733 } 20734 20735 /* 20736 * Perform the same checks as when creating the 20737 * IRE_INTERFACE in ipif_up_done. 20738 */ 20739 if (!(ipif->ipif_flags & IPIF_UP)) 20740 return; 20741 20742 if ((ipif->ipif_flags & IPIF_NOXMIT) || 20743 (ipif->ipif_subnet == INADDR_ANY)) 20744 return; 20745 20746 ipif_ire = ipif_to_ire(ipif); 20747 if (ipif_ire == NULL) 20748 return; 20749 20750 /* 20751 * We know that ipif uses some other source for its 20752 * IRE_INTERFACE. Is it using the source of this 20753 * old_ipif? 20754 */ 20755 if (old_ipif != NULL && 20756 old_ipif->ipif_lcl_addr != ipif_ire->ire_src_addr) { 20757 ire_refrele(ipif_ire); 20758 return; 20759 } 20760 if (ip_debug > 2) { 20761 /* ip1dbg */ 20762 pr_addr_dbg("ipif_recreate_interface_routes: deleting IRE for" 20763 " src %s\n", AF_INET, &ipif_ire->ire_src_addr); 20764 } 20765 20766 stq = ipif_ire->ire_stq; 20767 20768 /* 20769 * Can't use our source address. Select a different 20770 * source address for the IRE_INTERFACE. 
20771 	 */
20772 	nipif = ipif_select_source(ill, ipif->ipif_subnet, ipif->ipif_zoneid);
20773 	if (nipif == NULL) {
20774 		/* Last resort - all ipif's have IPIF_NOLOCAL */
20775 		nipif = ipif;
20776 	} else {
20777 		need_rele = B_TRUE;
20778 	}
20779 
20780 	ire = ire_create(
20781 	    (uchar_t *)&ipif->ipif_subnet,	/* dest pref */
20782 	    (uchar_t *)&ipif->ipif_net_mask,	/* mask */
20783 	    (uchar_t *)&nipif->ipif_src_addr,	/* src addr */
20784 	    NULL,				/* no gateway */
20785 	    NULL,
20786 	    &ipif->ipif_mtu,			/* max frag */
20787 	    NULL,				/* fast path header */
20788 	    NULL,				/* no recv from queue */
20789 	    stq,				/* send-to queue */
20790 	    ill->ill_net_type,			/* IF_[NO]RESOLVER */
20791 	    ill->ill_resolver_mp,		/* xmit header */
20792 	    ipif,
20793 	    NULL,
20794 	    0,
20795 	    0,
20796 	    0,
20797 	    0,
20798 	    &ire_uinfo_null,
20799 	    NULL,
20800 	    NULL);
20801 
20802 	if (ire != NULL) {
20803 		ire_t *ret_ire;
20804 		int error;
20805 
20806 		/*
20807 		 * We don't need ipif_ire anymore. We need to delete
20808 		 * before we add so that ire_add does not detect
20809 		 * duplicates.
20810 		 */
20811 		ire_delete(ipif_ire);
20812 		ret_ire = ire;
20813 		error = ire_add(&ret_ire, NULL, NULL, NULL, B_FALSE);
20814 		ASSERT(error == 0);
20815 		ASSERT(ire == ret_ire);
20816 		/* Held in ire_add */
20817 		ire_refrele(ret_ire);
20818 	}
20819 	/*
20820 	 * Either we are falling through from above or could not
20821 	 * allocate a replacement.
20822 	 */
20823 	ire_refrele(ipif_ire);
20824 	if (need_rele)
20825 		ipif_refrele(nipif);
20826 }
20827 
20828 /*
20829  * This old_ipif is going away.
20830  *
20831  * Determine if any other ipifs are using our address as their source
20832  * address (due to those being IPIF_NOLOCAL, IPIF_ANYCAST, or
20833  * IPIF_DEPRECATED).
20834  * Find the IRE_INTERFACE for such ipifs and recreate them
20835  * to use a different source address following the rules in
20836  * ipif_up_done.
20837  *
20838  * This function takes an illgrp as an argument so that illgrp_delete
20839  * can call this to update the source address even after deleting the
20840  * old_ipif->ipif_ill from the ill group.
20841  */
20842 static void
20843 ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp)
20844 {
20845 	ipif_t *ipif;
20846 	ill_t *ill;
20847 	char buf[INET6_ADDRSTRLEN];
20848 
20849 	ASSERT(IAM_WRITER_IPIF(old_ipif));
20850 	ASSERT(illgrp == NULL || IAM_WRITER_IPIF(old_ipif));
20851 
20852 	ill = old_ipif->ipif_ill;
20853 
20854 	ip1dbg(("ipif_update_other_ipifs(%s, %s)\n",
20855 	    ill->ill_name,
20856 	    inet_ntop(AF_INET, &old_ipif->ipif_lcl_addr,
20857 	    buf, sizeof (buf))));
20858 	/*
20859 	 * If this is part of a group, look at all ills, as
20860 	 * ipif_select_source borrows a source address across all
20861 	 * the ills in the group.
20862 	 */
20863 	if (illgrp != NULL)
20864 		ill = illgrp->illgrp_ill;
20865 
20866 	for (; ill != NULL; ill = ill->ill_group_next) {
20867 		for (ipif = ill->ill_ipif; ipif != NULL;
20868 		    ipif = ipif->ipif_next) {
20869 
20870 			if (ipif == old_ipif)
20871 				continue;
20872 
20873 			ipif_recreate_interface_routes(old_ipif, ipif);
20874 		}
20875 	}
20876 }
20877 
20878 /* ARGSUSED */
20879 int
20880 if_unitsel_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
20881     ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
20882 {
20883 	/*
20884 	 * ill_phyint_reinit merged the v4 and v6 into a single
20885 	 * ipsq. Could also have become part of an IPMP group in the
20886 	 * process, and we might not have been able to complete the
20887 	 * operation in ipif_set_values, if we could not become
20888 	 * exclusive. If so restart it here.
20889 	 */
20890 	return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q));
20891 }
20892 
20893 /* ARGSUSED */
20894 int
20895 if_unitsel(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
20896     ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
20897 {
20898 	queue_t *q1 = q;
20899 	char *cp;
20900 	char interf_name[LIFNAMSIZ];
20901 	uint_t ppa = *(uint_t *)mp->b_cont->b_cont->b_rptr;
20902 
20903 	if (!q->q_next) {
20904 		ip1dbg((
20905 		    "if_unitsel: IF_UNITSEL: no q_next\n"));
20906 		return (EINVAL);
20907 	}
20908 
20909 	if (((ill_t *)(q->q_ptr))->ill_name[0] != '\0')
20910 		return (EALREADY);
20911 
20912 	do {
20913 		q1 = q1->q_next;
20914 	} while (q1->q_next);
20915 	cp = q1->q_qinfo->qi_minfo->mi_idname;
20916 	(void) sprintf(interf_name, "%s%d", cp, ppa);
20917 
20918 	/*
20919 	 * Here we are not going to delay the ioack until after
20920 	 * ACKs from DL_ATTACH_REQ/DL_BIND_REQ. So no need to save the
20921 	 * original ioctl message before sending the requests.
20922 	 */
20923 	return (ipif_set_values(q, mp, interf_name, &ppa));
20924 }
20925 
20926 /* ARGSUSED */
20927 int
20928 ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
20929     ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
20930 {
20931 	return (ENXIO);
20932 }
20933 
20934 /*
20935  * Net and subnet broadcast ire's are now specific to the particular
20936  * physical interface (ill) and not to any one logical interface (ipif).
20937  * However, if a particular logical interface is being taken down, its
20938  * associated ire's will be taken down as well. Hence, when we go to
20939  * take down or change the local address, broadcast address or netmask
20940  * of a specific logical interface, we must check to make sure that we
20941  * have valid net and subnet broadcast ire's for the other logical
20942  * interfaces which may have been shared with the logical interface
20943  * being brought down or changed.
20944  *
20945  * There is one set of 0.0.0.0 and 255.255.255.255 per ill. Usually it
20946  * is tied to the first interface coming UP. If that ipif is going down,
20947  * we need to recreate them on the next valid ipif.
20948  *
20949  * Note: assume that the ipif passed in is still up so that its IRE
20950  * entries are still valid.
20951  */
20952 static void
20953 ipif_check_bcast_ires(ipif_t *test_ipif)
20954 {
20955 	ipif_t	*ipif;
20956 	ire_t	*test_subnet_ire, *test_net_ire;
20957 	ire_t	*test_allzero_ire, *test_allone_ire;
20958 	ire_t	*ire_array[12];
20959 	ire_t	**irep = &ire_array[0];
20960 	ire_t	**irep1;
20961 
20962 	ipaddr_t net_addr, subnet_addr, net_mask, subnet_mask;
20963 	ipaddr_t test_net_addr, test_subnet_addr;
20964 	ipaddr_t test_net_mask, test_subnet_mask;
20965 	boolean_t need_net_bcast_ire = B_FALSE;
20966 	boolean_t need_subnet_bcast_ire = B_FALSE;
20967 	boolean_t allzero_bcast_ire_created = B_FALSE;
20968 	boolean_t allone_bcast_ire_created = B_FALSE;
20969 	boolean_t net_bcast_ire_created = B_FALSE;
20970 	boolean_t subnet_bcast_ire_created = B_FALSE;
20971 
20972 	ipif_t	*backup_ipif_net = (ipif_t *)NULL;
20973 	ipif_t	*backup_ipif_subnet = (ipif_t *)NULL;
20974 	ipif_t	*backup_ipif_allzeros = (ipif_t *)NULL;
20975 	ipif_t	*backup_ipif_allones = (ipif_t *)NULL;
20976 	uint64_t check_flags = IPIF_DEPRECATED | IPIF_NOLOCAL | IPIF_ANYCAST;
20977 
20978 	ASSERT(!test_ipif->ipif_isv6);
20979 	ASSERT(IAM_WRITER_IPIF(test_ipif));
20980 
20981 	/*
20982 	 * No broadcast IREs for the LOOPBACK interface
20983 	 * or others such as point to point and IPIF_NOXMIT.
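	 * (Neither loopback nor point-to-point ipifs have IPIF_BROADCAST
	 * set, so the single test below covers all of these cases.)
	 *
	 * For a concrete, purely hypothetical picture of the work done
	 * below: if hme0:1 owns the 10.1.2.255 subnet broadcast IRE and
	 * is being taken down while hme0:2 is also up on 10.1.2.0/24 in
	 * the same zone, the loop below notices this and recreates the
	 * broadcast IREs on hme0:2 before they are torn down with hme0:1.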
20984 	 */
20985 	if (!(test_ipif->ipif_flags & IPIF_BROADCAST) ||
20986 	    (test_ipif->ipif_flags & IPIF_NOXMIT))
20987 		return;
20988 
20989 	test_allzero_ire = ire_ctable_lookup(0, 0, IRE_BROADCAST,
20990 	    test_ipif, ALL_ZONES, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF));
20991 
20992 	test_allone_ire = ire_ctable_lookup(INADDR_BROADCAST, 0, IRE_BROADCAST,
20993 	    test_ipif, ALL_ZONES, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF));
20994 
20995 	test_net_mask = ip_net_mask(test_ipif->ipif_subnet);
20996 	test_subnet_mask = test_ipif->ipif_net_mask;
20997 
20998 	/*
20999 	 * If no net mask set, assume the default based on net class.
21000 	 */
21001 	if (test_subnet_mask == 0)
21002 		test_subnet_mask = test_net_mask;
21003 
21004 	/*
21005 	 * Check if there is a network broadcast ire associated with this ipif
21006 	 */
21007 	test_net_addr = test_net_mask & test_ipif->ipif_subnet;
21008 	test_net_ire = ire_ctable_lookup(test_net_addr, 0, IRE_BROADCAST,
21009 	    test_ipif, ALL_ZONES, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF));
21010 
21011 	/*
21012 	 * Check if there is a subnet broadcast IRE associated with this ipif
21013 	 */
21014 	test_subnet_addr = test_subnet_mask & test_ipif->ipif_subnet;
21015 	test_subnet_ire = ire_ctable_lookup(test_subnet_addr, 0, IRE_BROADCAST,
21016 	    test_ipif, ALL_ZONES, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF));
21017 
21018 	/*
21019 	 * No broadcast ire's associated with this ipif.
21020 	 */
21021 	if ((test_subnet_ire == NULL) && (test_net_ire == NULL) &&
21022 	    (test_allzero_ire == NULL) && (test_allone_ire == NULL)) {
21023 		return;
21024 	}
21025 
21026 	/*
21027 	 * We have established which bcast ires have to be replaced.
21028 	 * Next we try to locate ipifs that match those ires.
21029 	 * The rules are simple: If we find an ipif that matches on the subnet
21030 	 * address it will also match on the net address, the allzeros and
21031 	 * allones address. Any ipif that matches only on the net address will
21032 	 * also match the allzeros and allones addresses.
21033 	 * The other criterion is the ipif_flags. We look for non-deprecated
21034 	 * (and non-anycast and non-nolocal) ipifs as the best choice.
21035 	 * ipifs with check_flags matching (deprecated, etc.) are used only
21036 	 * if good ipifs are not available. While looping, we save existing
21037 	 * deprecated ipifs as backup_ipif.
21038 	 * We loop through all the ipifs for this ill looking for ipifs
21039 	 * whose broadcast addrs match the ipif passed in, but do not have
21040 	 * their own broadcast ires. For creating 0.0.0.0 and
21041 	 * 255.255.255.255 we just need any ipif on this ill.
21042 	 */
21043 	for (ipif = test_ipif->ipif_ill->ill_ipif; ipif != NULL;
21044 	    ipif = ipif->ipif_next) {
21045 
21046 		ASSERT(!ipif->ipif_isv6);
21047 		/*
21048 		 * Already checked the ipif passed in.
21049 		 */
21050 		if (ipif == test_ipif) {
21051 			continue;
21052 		}
21053 
21054 		/*
21055 		 * We only need to recreate broadcast ires if another ipif in
21056 		 * the same zone uses them. The new ires must be created in the
21057 		 * same zone.
21058 		 */
21059 		if (ipif->ipif_zoneid != test_ipif->ipif_zoneid) {
21060 			continue;
21061 		}
21062 
21063 		/*
21064 		 * Only interested in logical interfaces with valid local
21065 		 * addresses or with the ability to broadcast.
21066 		 */
21067 		if ((ipif->ipif_subnet == 0) ||
21068 		    !(ipif->ipif_flags & IPIF_BROADCAST) ||
21069 		    (ipif->ipif_flags & IPIF_NOXMIT) ||
21070 		    !(ipif->ipif_flags & IPIF_UP)) {
21071 			continue;
21072 		}
21073 		/*
21074 		 * Check if there is a net broadcast ire for this
21075 		 * net address.
If it turns out that the ipif we are 21076 * about to take down owns this ire, we must make a 21077 * new one because it is potentially going away. 21078 */ 21079 if (test_net_ire && (!net_bcast_ire_created)) { 21080 net_mask = ip_net_mask(ipif->ipif_subnet); 21081 net_addr = net_mask & ipif->ipif_subnet; 21082 if (net_addr == test_net_addr) { 21083 need_net_bcast_ire = B_TRUE; 21084 /* 21085 * Use DEPRECATED ipif only if no good 21086 * ires are available. subnet_addr is 21087 * a better match than net_addr. 21088 */ 21089 if ((ipif->ipif_flags & check_flags) && 21090 (backup_ipif_net == NULL)) { 21091 backup_ipif_net = ipif; 21092 } 21093 } 21094 } 21095 /* 21096 * Check if there is a subnet broadcast ire for this 21097 * net address. If it turns out that the ipif we are 21098 * about to take down owns this ire, we must make a 21099 * new one because it is potentially going away. 21100 */ 21101 if (test_subnet_ire && (!subnet_bcast_ire_created)) { 21102 subnet_mask = ipif->ipif_net_mask; 21103 subnet_addr = ipif->ipif_subnet; 21104 if (subnet_addr == test_subnet_addr) { 21105 need_subnet_bcast_ire = B_TRUE; 21106 if ((ipif->ipif_flags & check_flags) && 21107 (backup_ipif_subnet == NULL)) { 21108 backup_ipif_subnet = ipif; 21109 } 21110 } 21111 } 21112 21113 21114 /* Short circuit here if this ipif is deprecated */ 21115 if (ipif->ipif_flags & check_flags) { 21116 if ((test_allzero_ire != NULL) && 21117 (!allzero_bcast_ire_created) && 21118 (backup_ipif_allzeros == NULL)) { 21119 backup_ipif_allzeros = ipif; 21120 } 21121 if ((test_allone_ire != NULL) && 21122 (!allone_bcast_ire_created) && 21123 (backup_ipif_allones == NULL)) { 21124 backup_ipif_allones = ipif; 21125 } 21126 continue; 21127 } 21128 21129 /* 21130 * Found an ipif which has the same broadcast ire as the 21131 * ipif passed in and the ipif passed in "owns" the ire. 21132 * Create new broadcast ire's for this broadcast addr. 21133 */ 21134 if (need_net_bcast_ire && !net_bcast_ire_created) { 21135 irep = ire_create_bcast(ipif, net_addr, irep); 21136 irep = ire_create_bcast(ipif, 21137 ~net_mask | net_addr, irep); 21138 net_bcast_ire_created = B_TRUE; 21139 } 21140 if (need_subnet_bcast_ire && !subnet_bcast_ire_created) { 21141 irep = ire_create_bcast(ipif, subnet_addr, irep); 21142 irep = ire_create_bcast(ipif, 21143 ~subnet_mask | subnet_addr, irep); 21144 subnet_bcast_ire_created = B_TRUE; 21145 } 21146 if (test_allzero_ire != NULL && !allzero_bcast_ire_created) { 21147 irep = ire_create_bcast(ipif, 0, irep); 21148 allzero_bcast_ire_created = B_TRUE; 21149 } 21150 if (test_allone_ire != NULL && !allone_bcast_ire_created) { 21151 irep = ire_create_bcast(ipif, INADDR_BROADCAST, irep); 21152 allone_bcast_ire_created = B_TRUE; 21153 } 21154 /* 21155 * Once we have created all the appropriate ires, we 21156 * just break out of this loop to add what we have created. 21157 * This has been indented similar to ire_match_args for 21158 * readability. 21159 */ 21160 if (((test_net_ire == NULL) || 21161 (net_bcast_ire_created)) && 21162 ((test_subnet_ire == NULL) || 21163 (subnet_bcast_ire_created)) && 21164 ((test_allzero_ire == NULL) || 21165 (allzero_bcast_ire_created)) && 21166 ((test_allone_ire == NULL) || 21167 (allone_bcast_ire_created))) { 21168 break; 21169 } 21170 } 21171 21172 /* 21173 * Create bcast ires on deprecated ipifs if no non-deprecated ipifs 21174 * exist. 6 pairs of bcast ires are needed. 21175 * Note - the old ires are deleted in ipif_down. 
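	 * (The "6 pairs" match ire_array[12] above: up to six broadcast
	 * addresses -- net, ~net, subnet, ~subnet, 0.0.0.0 and
	 * 255.255.255.255 -- where, as the comment before the ire_add
	 * loop below notes, each address is expected to have both a
	 * loopback and a non-loopback IRE copy.)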
21176 */ 21177 if (need_net_bcast_ire && !net_bcast_ire_created && backup_ipif_net) { 21178 ipif = backup_ipif_net; 21179 irep = ire_create_bcast(ipif, net_addr, irep); 21180 irep = ire_create_bcast(ipif, ~net_mask | net_addr, irep); 21181 net_bcast_ire_created = B_TRUE; 21182 } 21183 if (need_subnet_bcast_ire && !subnet_bcast_ire_created && 21184 backup_ipif_subnet) { 21185 ipif = backup_ipif_subnet; 21186 irep = ire_create_bcast(ipif, subnet_addr, irep); 21187 irep = ire_create_bcast(ipif, 21188 ~subnet_mask | subnet_addr, irep); 21189 subnet_bcast_ire_created = B_TRUE; 21190 } 21191 if (test_allzero_ire != NULL && !allzero_bcast_ire_created && 21192 backup_ipif_allzeros) { 21193 irep = ire_create_bcast(backup_ipif_allzeros, 0, irep); 21194 allzero_bcast_ire_created = B_TRUE; 21195 } 21196 if (test_allone_ire != NULL && !allone_bcast_ire_created && 21197 backup_ipif_allones) { 21198 irep = ire_create_bcast(backup_ipif_allones, 21199 INADDR_BROADCAST, irep); 21200 allone_bcast_ire_created = B_TRUE; 21201 } 21202 21203 /* 21204 * If we can't create all of them, don't add any of them. 21205 * Code in ip_wput_ire and ire_to_ill assumes that we 21206 * always have a non-loopback copy and loopback copy 21207 * for a given address. 21208 */ 21209 for (irep1 = irep; irep1 > ire_array; ) { 21210 irep1--; 21211 if (*irep1 == NULL) { 21212 ip0dbg(("ipif_check_bcast_ires: can't create " 21213 "IRE_BROADCAST, memory allocation failure\n")); 21214 while (irep > ire_array) { 21215 irep--; 21216 if (*irep != NULL) 21217 ire_delete(*irep); 21218 } 21219 goto bad; 21220 } 21221 } 21222 for (irep1 = irep; irep1 > ire_array; ) { 21223 int error; 21224 21225 irep1--; 21226 error = ire_add(irep1, NULL, NULL, NULL, B_FALSE); 21227 if (error == 0) { 21228 ire_refrele(*irep1); /* Held in ire_add */ 21229 } 21230 } 21231 bad: 21232 if (test_allzero_ire != NULL) 21233 ire_refrele(test_allzero_ire); 21234 if (test_allone_ire != NULL) 21235 ire_refrele(test_allone_ire); 21236 if (test_net_ire != NULL) 21237 ire_refrele(test_net_ire); 21238 if (test_subnet_ire != NULL) 21239 ire_refrele(test_subnet_ire); 21240 } 21241 21242 /* 21243 * Extract both the flags (including IFF_CANTCHANGE) such as IFF_IPV* 21244 * from lifr_flags and the name from lifr_name. 21245 * Set IFF_IPV* and ill_isv6 prior to doing the lookup 21246 * since ipif_lookup_on_name uses the _isv6 flags when matching. 21247 * Returns EINPROGRESS when mp has been consumed by queueing it on 21248 * ill_pending_mp and the ioctl will complete in ip_rput. 21249 */ 21250 /* ARGSUSED */ 21251 int 21252 ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21253 ip_ioctl_cmd_t *ipip, void *if_req) 21254 { 21255 int err; 21256 ill_t *ill; 21257 struct lifreq *lifr = (struct lifreq *)if_req; 21258 21259 ASSERT(ipif != NULL); 21260 ip1dbg(("ip_sioctl_slifname %s\n", lifr->lifr_name)); 21261 ASSERT(q->q_next != NULL); 21262 21263 ill = (ill_t *)q->q_ptr; 21264 /* 21265 * If we are not writer on 'q' then this interface exists already 21266 * and previous lookups (ipif_extract_lifreq_cmn) found this ipif. 21267 * So return EALREADY 21268 */ 21269 if (ill != ipif->ipif_ill) 21270 return (EALREADY); 21271 21272 if (ill->ill_name[0] != '\0') 21273 return (EALREADY); 21274 21275 /* 21276 * Set all the flags. Allows all kinds of override. Provide some 21277 * sanity checking by not allowing IFF_BROADCAST and IFF_MULTICAST 21278 * unless there is either multicast/broadcast support in the driver 21279 * or it is a pt-pt link. 
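 *
 * For example (illustrative only): plumbing an IPv6 interface is
 * expected to arrive here with exactly IFF_IPV6 set in lifr_flags --
 * never both IFF_IPV4 and IFF_IPV6, and never IFF_UP, which can only
 * be set later via SIOCSLIFFLAGS; the checks below enforce this.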
21280 */ 21281 if (lifr->lifr_flags & (IFF_PROMISC|IFF_ALLMULTI)) { 21282 /* Meaningless to IP thus don't allow them to be set. */ 21283 ip1dbg(("ip_setname: EINVAL 1\n")); 21284 return (EINVAL); 21285 } 21286 /* 21287 * For a DL_STYLE2 driver (ill_needs_attach), we would not have the 21288 * ill_bcast_addr_length info. 21289 */ 21290 if (!ill->ill_needs_attach && 21291 ((lifr->lifr_flags & IFF_MULTICAST) && 21292 !(lifr->lifr_flags & IFF_POINTOPOINT) && 21293 ill->ill_bcast_addr_length == 0)) { 21294 /* Link not broadcast/pt-pt capable i.e. no multicast */ 21295 ip1dbg(("ip_setname: EINVAL 2\n")); 21296 return (EINVAL); 21297 } 21298 if ((lifr->lifr_flags & IFF_BROADCAST) && 21299 ((lifr->lifr_flags & IFF_IPV6) || 21300 (!ill->ill_needs_attach && ill->ill_bcast_addr_length == 0))) { 21301 /* Link not broadcast capable or IPv6 i.e. no broadcast */ 21302 ip1dbg(("ip_setname: EINVAL 3\n")); 21303 return (EINVAL); 21304 } 21305 if (lifr->lifr_flags & IFF_UP) { 21306 /* Can only be set with SIOCSLIFFLAGS */ 21307 ip1dbg(("ip_setname: EINVAL 4\n")); 21308 return (EINVAL); 21309 } 21310 if ((lifr->lifr_flags & (IFF_IPV6|IFF_IPV4)) != IFF_IPV6 && 21311 (lifr->lifr_flags & (IFF_IPV6|IFF_IPV4)) != IFF_IPV4) { 21312 ip1dbg(("ip_setname: EINVAL 5\n")); 21313 return (EINVAL); 21314 } 21315 /* 21316 * Only allow the IFF_XRESOLV flag to be set on IPv6 interfaces. 21317 */ 21318 if ((lifr->lifr_flags & IFF_XRESOLV) && 21319 !(lifr->lifr_flags & IFF_IPV6) && 21320 !(ipif->ipif_isv6)) { 21321 ip1dbg(("ip_setname: EINVAL 6\n")); 21322 return (EINVAL); 21323 } 21324 21325 /* 21326 * The user has done SIOCGLIFFLAGS prior to this ioctl and hence 21327 * we have all the flags here. So, we assign rather than we OR. 21328 * We can't OR the flags here because we don't want to set 21329 * both IFF_IPV4 and IFF_IPV6. We start off as IFF_IPV4 in 21330 * ipif_allocate and become IFF_IPV4 or IFF_IPV6 here depending 21331 * on lifr_flags value here. 21332 */ 21333 /* 21334 * This ill has not been inserted into the global list. 21335 * So we are still single threaded and don't need any lock 21336 */ 21337 ipif->ipif_flags = lifr->lifr_flags & IFF_LOGINT_FLAGS & 21338 ~IFF_DUPLICATE; 21339 ill->ill_flags = lifr->lifr_flags & IFF_PHYINTINST_FLAGS; 21340 ill->ill_phyint->phyint_flags = lifr->lifr_flags & IFF_PHYINT_FLAGS; 21341 21342 /* We started off as V4. */ 21343 if (ill->ill_flags & ILLF_IPV6) { 21344 ill->ill_phyint->phyint_illv6 = ill; 21345 ill->ill_phyint->phyint_illv4 = NULL; 21346 } 21347 err = ipif_set_values(q, mp, lifr->lifr_name, &lifr->lifr_ppa); 21348 return (err); 21349 } 21350 21351 /* ARGSUSED */ 21352 int 21353 ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21354 ip_ioctl_cmd_t *ipip, void *if_req) 21355 { 21356 /* 21357 * ill_phyint_reinit merged the v4 and v6 into a single 21358 * ipsq. Could also have become part of a ipmp group in the 21359 * process, and we might not have been able to complete the 21360 * slifname in ipif_set_values, if we could not become 21361 * exclusive. If so restart it here 21362 */ 21363 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q)); 21364 } 21365 21366 /* 21367 * Return a pointer to the ipif which matches the index, IP version type and 21368 * zoneid. 
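 *
 * A typical caller pattern, in sketch form only (not a verbatim
 * caller from this file):
 *
 *	ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid,
 *	    q, mp, ip_process_ioctl, &err);
 *	if (ipif == NULL) {
 *		if (err == EINPROGRESS)
 *			return;		-- queued; restarted later
 *		return (err);		-- ENXIO: no such ipif
 *	}
 *	-- ... use the held ipif ...
 *	ipif_refrele(ipif);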
21369 */ 21370 ipif_t * 21371 ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid, 21372 queue_t *q, mblk_t *mp, ipsq_func_t func, int *err) 21373 { 21374 ill_t *ill; 21375 ipsq_t *ipsq; 21376 phyint_t *phyi; 21377 ipif_t *ipif; 21378 21379 ASSERT((q == NULL && mp == NULL && func == NULL && err == NULL) || 21380 (q != NULL && mp != NULL && func != NULL && err != NULL)); 21381 21382 if (err != NULL) 21383 *err = 0; 21384 21385 /* 21386 * Indexes are stored in the phyint - a common structure 21387 * to both IPv4 and IPv6. 21388 */ 21389 21390 rw_enter(&ill_g_lock, RW_READER); 21391 phyi = avl_find(&phyint_g_list.phyint_list_avl_by_index, 21392 (void *) &index, NULL); 21393 if (phyi != NULL) { 21394 ill = isv6 ? phyi->phyint_illv6 : phyi->phyint_illv4; 21395 if (ill == NULL) { 21396 rw_exit(&ill_g_lock); 21397 if (err != NULL) 21398 *err = ENXIO; 21399 return (NULL); 21400 } 21401 GRAB_CONN_LOCK(q); 21402 mutex_enter(&ill->ill_lock); 21403 if (ILL_CAN_LOOKUP(ill)) { 21404 for (ipif = ill->ill_ipif; ipif != NULL; 21405 ipif = ipif->ipif_next) { 21406 if (IPIF_CAN_LOOKUP(ipif) && 21407 (zoneid == ALL_ZONES || 21408 zoneid == ipif->ipif_zoneid || 21409 ipif->ipif_zoneid == ALL_ZONES)) { 21410 ipif_refhold_locked(ipif); 21411 mutex_exit(&ill->ill_lock); 21412 RELEASE_CONN_LOCK(q); 21413 rw_exit(&ill_g_lock); 21414 return (ipif); 21415 } 21416 } 21417 } else if (ILL_CAN_WAIT(ill, q)) { 21418 ipsq = ill->ill_phyint->phyint_ipsq; 21419 mutex_enter(&ipsq->ipsq_lock); 21420 rw_exit(&ill_g_lock); 21421 mutex_exit(&ill->ill_lock); 21422 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 21423 mutex_exit(&ipsq->ipsq_lock); 21424 RELEASE_CONN_LOCK(q); 21425 *err = EINPROGRESS; 21426 return (NULL); 21427 } 21428 mutex_exit(&ill->ill_lock); 21429 RELEASE_CONN_LOCK(q); 21430 } 21431 rw_exit(&ill_g_lock); 21432 if (err != NULL) 21433 *err = ENXIO; 21434 return (NULL); 21435 } 21436 21437 typedef struct conn_change_s { 21438 uint_t cc_old_ifindex; 21439 uint_t cc_new_ifindex; 21440 } conn_change_t; 21441 21442 /* 21443 * ipcl_walk function for changing interface index. 21444 */ 21445 static void 21446 conn_change_ifindex(conn_t *connp, caddr_t arg) 21447 { 21448 conn_change_t *connc; 21449 uint_t old_ifindex; 21450 uint_t new_ifindex; 21451 int i; 21452 ilg_t *ilg; 21453 21454 connc = (conn_change_t *)arg; 21455 old_ifindex = connc->cc_old_ifindex; 21456 new_ifindex = connc->cc_new_ifindex; 21457 21458 if (connp->conn_orig_bound_ifindex == old_ifindex) 21459 connp->conn_orig_bound_ifindex = new_ifindex; 21460 21461 if (connp->conn_orig_multicast_ifindex == old_ifindex) 21462 connp->conn_orig_multicast_ifindex = new_ifindex; 21463 21464 if (connp->conn_orig_xmit_ifindex == old_ifindex) 21465 connp->conn_orig_xmit_ifindex = new_ifindex; 21466 21467 for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) { 21468 ilg = &connp->conn_ilg[i]; 21469 if (ilg->ilg_orig_ifindex == old_ifindex) 21470 ilg->ilg_orig_ifindex = new_ifindex; 21471 } 21472 } 21473 21474 /* 21475 * Walk all the ipifs and ilms on this ill and change the orig_ifindex 21476 * to new_index if it matches the old_index. 21477 * 21478 * Failovers typically happen within a group of ills. But somebody 21479 * can remove an ill from the group after a failover happened. If 21480 * we are setting the ifindex after this, we potentially need to 21481 * look at all the ills rather than just the ones in the group. 21482 * We cut down the work by looking at matching ill_net_types 21483 * and ill_types as we could not possibly grouped them together. 
21484 */ 21485 static void 21486 ip_change_ifindex(ill_t *ill_orig, conn_change_t *connc) 21487 { 21488 ill_t *ill; 21489 ipif_t *ipif; 21490 uint_t old_ifindex; 21491 uint_t new_ifindex; 21492 ilm_t *ilm; 21493 ill_walk_context_t ctx; 21494 21495 old_ifindex = connc->cc_old_ifindex; 21496 new_ifindex = connc->cc_new_ifindex; 21497 21498 rw_enter(&ill_g_lock, RW_READER); 21499 ill = ILL_START_WALK_ALL(&ctx); 21500 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 21501 if ((ill_orig->ill_net_type != ill->ill_net_type) || 21502 (ill_orig->ill_type != ill->ill_type)) { 21503 continue; 21504 } 21505 for (ipif = ill->ill_ipif; ipif != NULL; 21506 ipif = ipif->ipif_next) { 21507 if (ipif->ipif_orig_ifindex == old_ifindex) 21508 ipif->ipif_orig_ifindex = new_ifindex; 21509 } 21510 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 21511 if (ilm->ilm_orig_ifindex == old_ifindex) 21512 ilm->ilm_orig_ifindex = new_ifindex; 21513 } 21514 } 21515 rw_exit(&ill_g_lock); 21516 } 21517 21518 /* 21519 * We first need to ensure that the new index is unique, and 21520 * then carry the change across both v4 and v6 ill representation 21521 * of the physical interface. 21522 */ 21523 /* ARGSUSED */ 21524 int 21525 ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21526 ip_ioctl_cmd_t *ipip, void *ifreq) 21527 { 21528 ill_t *ill; 21529 ill_t *ill_other; 21530 phyint_t *phyi; 21531 int old_index; 21532 conn_change_t connc; 21533 struct ifreq *ifr = (struct ifreq *)ifreq; 21534 struct lifreq *lifr = (struct lifreq *)ifreq; 21535 uint_t index; 21536 ill_t *ill_v4; 21537 ill_t *ill_v6; 21538 21539 if (ipip->ipi_cmd_type == IF_CMD) 21540 index = ifr->ifr_index; 21541 else 21542 index = lifr->lifr_index; 21543 21544 /* 21545 * Only allow on physical interface. Also, index zero is illegal. 21546 * 21547 * Need to check for PHYI_FAILED and PHYI_INACTIVE 21548 * 21549 * 1) If PHYI_FAILED is set, a failover could have happened which 21550 * implies a possible failback might have to happen. As failback 21551 * depends on the old index, we should fail setting the index. 21552 * 21553 * 2) If PHYI_INACTIVE is set, in.mpathd does a failover so that 21554 * any addresses or multicast memberships are failed over to 21555 * a non-STANDBY interface. As failback depends on the old 21556 * index, we should fail setting the index for this case also. 21557 * 21558 * 3) If PHYI_OFFLINE is set, a possible failover has happened. 21559 * Be consistent with PHYI_FAILED and fail the ioctl. 21560 */ 21561 ill = ipif->ipif_ill; 21562 phyi = ill->ill_phyint; 21563 if ((phyi->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) || 21564 ipif->ipif_id != 0 || index == 0) { 21565 return (EINVAL); 21566 } 21567 old_index = phyi->phyint_ifindex; 21568 21569 /* If the index is not changing, no work to do */ 21570 if (old_index == index) 21571 return (0); 21572 21573 /* 21574 * Use ill_lookup_on_ifindex to determine if the 21575 * new index is unused and if so allow the change. 21576 */ 21577 ill_v6 = ill_lookup_on_ifindex(index, B_TRUE, NULL, NULL, NULL, NULL); 21578 ill_v4 = ill_lookup_on_ifindex(index, B_FALSE, NULL, NULL, NULL, NULL); 21579 if (ill_v6 != NULL || ill_v4 != NULL) { 21580 if (ill_v4 != NULL) 21581 ill_refrele(ill_v4); 21582 if (ill_v6 != NULL) 21583 ill_refrele(ill_v6); 21584 return (EBUSY); 21585 } 21586 21587 /* 21588 * The new index is unused. Set it in the phyint. 21589 * Locate the other ill so that we can send a routing 21590 * sockets message. 
21591 */ 21592 if (ill->ill_isv6) { 21593 ill_other = phyi->phyint_illv4; 21594 } else { 21595 ill_other = phyi->phyint_illv6; 21596 } 21597 21598 phyi->phyint_ifindex = index; 21599 21600 connc.cc_old_ifindex = old_index; 21601 connc.cc_new_ifindex = index; 21602 ip_change_ifindex(ill, &connc); 21603 ipcl_walk(conn_change_ifindex, (caddr_t)&connc); 21604 21605 /* Send the routing sockets message */ 21606 ip_rts_ifmsg(ipif); 21607 if (ill_other != NULL) 21608 ip_rts_ifmsg(ill_other->ill_ipif); 21609 21610 return (0); 21611 } 21612 21613 /* ARGSUSED */ 21614 int 21615 ip_sioctl_get_lifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21616 ip_ioctl_cmd_t *ipip, void *ifreq) 21617 { 21618 struct ifreq *ifr = (struct ifreq *)ifreq; 21619 struct lifreq *lifr = (struct lifreq *)ifreq; 21620 21621 ip1dbg(("ip_sioctl_get_lifindex(%s:%u %p)\n", 21622 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 21623 /* Get the interface index */ 21624 if (ipip->ipi_cmd_type == IF_CMD) { 21625 ifr->ifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 21626 } else { 21627 lifr->lifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 21628 } 21629 return (0); 21630 } 21631 21632 /* ARGSUSED */ 21633 int 21634 ip_sioctl_get_lifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21635 ip_ioctl_cmd_t *ipip, void *ifreq) 21636 { 21637 struct lifreq *lifr = (struct lifreq *)ifreq; 21638 21639 ip1dbg(("ip_sioctl_get_lifzone(%s:%u %p)\n", 21640 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 21641 /* Get the interface zone */ 21642 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 21643 lifr->lifr_zoneid = ipif->ipif_zoneid; 21644 return (0); 21645 } 21646 21647 /* 21648 * Set the zoneid of an interface. 21649 */ 21650 /* ARGSUSED */ 21651 int 21652 ip_sioctl_slifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21653 ip_ioctl_cmd_t *ipip, void *ifreq) 21654 { 21655 struct lifreq *lifr = (struct lifreq *)ifreq; 21656 int err = 0; 21657 boolean_t need_up = B_FALSE; 21658 zone_t *zptr; 21659 zone_status_t status; 21660 zoneid_t zoneid; 21661 21662 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 21663 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) { 21664 if (!is_system_labeled()) 21665 return (ENOTSUP); 21666 zoneid = GLOBAL_ZONEID; 21667 } 21668 21669 /* cannot assign instance zero to a non-global zone */ 21670 if (ipif->ipif_id == 0 && zoneid != GLOBAL_ZONEID) 21671 return (ENOTSUP); 21672 21673 /* 21674 * Cannot assign to a zone that doesn't exist or is shutting down. In 21675 * the event of a race with the zone shutdown processing, since IP 21676 * serializes this ioctl and SIOCGLIFCONF/SIOCLIFREMOVEIF, we know the 21677 * interface will be cleaned up even if the zone is shut down 21678 * immediately after the status check. If the interface can't be brought 21679 * down right away, and the zone is shut down before the restart 21680 * function is called, we resolve the possible races by rechecking the 21681 * zone status in the restart function. 21682 */ 21683 if ((zptr = zone_find_by_id(zoneid)) == NULL) 21684 return (EINVAL); 21685 status = zone_status_get(zptr); 21686 zone_rele(zptr); 21687 21688 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) 21689 return (EINVAL); 21690 21691 if (ipif->ipif_flags & IPIF_UP) { 21692 /* 21693 * If the interface is already marked up, 21694 * we call ipif_down which will take care 21695 * of ditching any IREs that have been set 21696 * up based on the old interface address. 
21697 */ 21698 err = ipif_logical_down(ipif, q, mp); 21699 if (err == EINPROGRESS) 21700 return (err); 21701 ipif_down_tail(ipif); 21702 need_up = B_TRUE; 21703 } 21704 21705 err = ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, need_up); 21706 return (err); 21707 } 21708 21709 static int 21710 ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid, 21711 queue_t *q, mblk_t *mp, boolean_t need_up) 21712 { 21713 int err = 0; 21714 21715 ip1dbg(("ip_sioctl_zoneid_tail(%s:%u %p)\n", 21716 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 21717 21718 /* Set the new zone id. */ 21719 ipif->ipif_zoneid = zoneid; 21720 21721 /* Update sctp list */ 21722 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 21723 21724 if (need_up) { 21725 /* 21726 * Now bring the interface back up. If this 21727 * is the only IPIF for the ILL, ipif_up 21728 * will have to re-bind to the device, so 21729 * we may get back EINPROGRESS, in which 21730 * case, this IOCTL will get completed in 21731 * ip_rput_dlpi when we see the DL_BIND_ACK. 21732 */ 21733 err = ipif_up(ipif, q, mp); 21734 } 21735 return (err); 21736 } 21737 21738 /* ARGSUSED */ 21739 int 21740 ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21741 ip_ioctl_cmd_t *ipip, void *if_req) 21742 { 21743 struct lifreq *lifr = (struct lifreq *)if_req; 21744 zoneid_t zoneid; 21745 zone_t *zptr; 21746 zone_status_t status; 21747 21748 ASSERT(ipif->ipif_id != 0); 21749 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 21750 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) 21751 zoneid = GLOBAL_ZONEID; 21752 21753 ip1dbg(("ip_sioctl_slifzone_restart(%s:%u %p)\n", 21754 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 21755 21756 /* 21757 * We recheck the zone status to resolve the following race condition: 21758 * 1) process sends SIOCSLIFZONE to put hme0:1 in zone "myzone"; 21759 * 2) hme0:1 is up and can't be brought down right away; 21760 * ip_sioctl_slifzone() returns EINPROGRESS and the request is queued; 21761 * 3) zone "myzone" is halted; the zone status switches to 21762 * 'shutting_down' and the zones framework sends SIOCGLIFCONF to list 21763 * the interfaces to remove - hme0:1 is not returned because it's not 21764 * yet in "myzone", so it won't be removed; 21765 * 4) the restart function for SIOCSLIFZONE is called; without the 21766 * status check here, we would have hme0:1 in "myzone" after it's been 21767 * destroyed. 21768 * Note that if the status check fails, we need to bring the interface 21769 * back to its state prior to ip_sioctl_slifzone(), hence the call to 21770 * ipif_up_done[_v6](). 
21771 */ 21772 status = ZONE_IS_UNINITIALIZED; 21773 if ((zptr = zone_find_by_id(zoneid)) != NULL) { 21774 status = zone_status_get(zptr); 21775 zone_rele(zptr); 21776 } 21777 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) { 21778 if (ipif->ipif_isv6) { 21779 (void) ipif_up_done_v6(ipif); 21780 } else { 21781 (void) ipif_up_done(ipif); 21782 } 21783 return (EINVAL); 21784 } 21785 21786 ipif_down_tail(ipif); 21787 21788 return (ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, 21789 B_TRUE)); 21790 } 21791 21792 /* ARGSUSED */ 21793 int 21794 ip_sioctl_get_lifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21795 ip_ioctl_cmd_t *ipip, void *ifreq) 21796 { 21797 struct lifreq *lifr = ifreq; 21798 21799 ASSERT(q->q_next == NULL); 21800 ASSERT(CONN_Q(q)); 21801 21802 ip1dbg(("ip_sioctl_get_lifusesrc(%s:%u %p)\n", 21803 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 21804 lifr->lifr_index = ipif->ipif_ill->ill_usesrc_ifindex; 21805 ip1dbg(("ip_sioctl_get_lifusesrc:lifr_index = %d\n", lifr->lifr_index)); 21806 21807 return (0); 21808 } 21809 21810 21811 /* Find the previous ILL in this usesrc group */ 21812 static ill_t * 21813 ill_prev_usesrc(ill_t *uill) 21814 { 21815 ill_t *ill; 21816 21817 for (ill = uill->ill_usesrc_grp_next; 21818 ASSERT(ill), ill->ill_usesrc_grp_next != uill; 21819 ill = ill->ill_usesrc_grp_next) 21820 /* do nothing */; 21821 return (ill); 21822 } 21823 21824 /* 21825 * Release all members of the usesrc group. This routine is called 21826 * from ill_delete when the interface being unplumbed is the 21827 * group head. 21828 */ 21829 static void 21830 ill_disband_usesrc_group(ill_t *uill) 21831 { 21832 ill_t *next_ill, *tmp_ill; 21833 ASSERT(RW_WRITE_HELD(&ill_g_usesrc_lock)); 21834 next_ill = uill->ill_usesrc_grp_next; 21835 21836 do { 21837 ASSERT(next_ill != NULL); 21838 tmp_ill = next_ill->ill_usesrc_grp_next; 21839 ASSERT(tmp_ill != NULL); 21840 next_ill->ill_usesrc_grp_next = NULL; 21841 next_ill->ill_usesrc_ifindex = 0; 21842 next_ill = tmp_ill; 21843 } while (next_ill->ill_usesrc_ifindex != 0); 21844 uill->ill_usesrc_grp_next = NULL; 21845 } 21846 21847 /* 21848 * Remove the client usesrc ILL from the list and relink to a new list 21849 */ 21850 int 21851 ill_relink_usesrc_ills(ill_t *ucill, ill_t *uill, uint_t ifindex) 21852 { 21853 ill_t *ill, *tmp_ill; 21854 21855 ASSERT((ucill != NULL) && (ucill->ill_usesrc_grp_next != NULL) && 21856 (uill != NULL) && RW_WRITE_HELD(&ill_g_usesrc_lock)); 21857 21858 /* 21859 * Check if the usesrc client ILL passed in is not already 21860 * in use as a usesrc ILL i.e one whose source address is 21861 * in use OR a usesrc ILL is not already in use as a usesrc 21862 * client ILL 21863 */ 21864 if ((ucill->ill_usesrc_ifindex == 0) || 21865 (uill->ill_usesrc_ifindex != 0)) { 21866 return (-1); 21867 } 21868 21869 ill = ill_prev_usesrc(ucill); 21870 ASSERT(ill->ill_usesrc_grp_next != NULL); 21871 21872 /* Remove from the current list */ 21873 if (ill->ill_usesrc_grp_next->ill_usesrc_grp_next == ill) { 21874 /* Only two elements in the list */ 21875 ASSERT(ill->ill_usesrc_ifindex == 0); 21876 ill->ill_usesrc_grp_next = NULL; 21877 } else { 21878 ill->ill_usesrc_grp_next = ucill->ill_usesrc_grp_next; 21879 } 21880 21881 if (ifindex == 0) { 21882 ucill->ill_usesrc_ifindex = 0; 21883 ucill->ill_usesrc_grp_next = NULL; 21884 return (0); 21885 } 21886 21887 ucill->ill_usesrc_ifindex = ifindex; 21888 tmp_ill = uill->ill_usesrc_grp_next; 21889 uill->ill_usesrc_grp_next = ucill; 21890 
	ucill->ill_usesrc_grp_next =
21891 	    (tmp_ill != NULL) ? tmp_ill : uill;
21892 	return (0);
21893 }
21894 
21895 /*
21896  * Set the ill_usesrc_ifindex and ill_usesrc_grp_next fields. See
21897  * synchronization notes in ip.c for locking details.
21898  */
21899 /* ARGSUSED */
21900 int
21901 ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
21902     ip_ioctl_cmd_t *ipip, void *ifreq)
21903 {
21904 	struct lifreq *lifr = (struct lifreq *)ifreq;
21905 	boolean_t isv6 = B_FALSE, reset_flg = B_FALSE,
21906 	    ill_flag_changed = B_FALSE;
21907 	ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill;
21908 	int err = 0, ret;
21909 	uint_t ifindex;
21910 	phyint_t *us_phyint, *us_cli_phyint;
21911 	ipsq_t *ipsq = NULL;
21912 
21913 	ASSERT(IAM_WRITER_IPIF(ipif));
21914 	ASSERT(q->q_next == NULL);
21915 	ASSERT(CONN_Q(q));
21916 
21917 	isv6 = (Q_TO_CONN(q))->conn_af_isv6;
21918 	us_cli_phyint = usesrc_cli_ill->ill_phyint;
21919 
21920 	ASSERT(us_cli_phyint != NULL);
21921 
21922 	/*
21923 	 * If the client ILL is being used for IPMP, abort.
21924 	 * Note, this can be done before ipsq_try_enter since we are already
21925 	 * exclusive on this ILL.
21926 	 */
21927 	if ((us_cli_phyint->phyint_groupname != NULL) ||
21928 	    (us_cli_phyint->phyint_flags & PHYI_STANDBY)) {
21929 		return (EINVAL);
21930 	}
21931 
21932 	ifindex = lifr->lifr_index;
21933 	if (ifindex == 0) {
21934 		if (usesrc_cli_ill->ill_usesrc_grp_next == NULL) {
21935 			/* non usesrc group interface, nothing to reset */
21936 			return (0);
21937 		}
21938 		ifindex = usesrc_cli_ill->ill_usesrc_ifindex;
21939 		/* valid reset request */
21940 		reset_flg = B_TRUE;
21941 	}
21942 
21943 	usesrc_ill = ill_lookup_on_ifindex(ifindex, isv6, q, mp,
21944 	    ip_process_ioctl, &err);
21945 
21946 	if (usesrc_ill == NULL) {
21947 		return (err);
21948 	}
21949 
21950 	/*
21951 	 * The usesrc_cli_ill or the usesrc_ill cannot be part of an IPMP
21952 	 * group nor can either of the interfaces be used for standby. So
21953 	 * to guarantee mutual exclusion with ip_sioctl_flags (which sets
21954 	 * PHYI_STANDBY) and ip_sioctl_groupname (which sets the groupname)
21955 	 * we need to be exclusive on the ipsq belonging to the usesrc_ill.
21956 	 * We are already exclusive on this ipsq, i.e. the ipsq corresponding
21957 	 * to the usesrc_cli_ill.
21958 	 */
21959 	ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl,
21960 	    NEW_OP, B_TRUE);
21961 	if (ipsq == NULL) {
21962 		err = EINPROGRESS;
21963 		/* Operation enqueued on the ipsq of the usesrc ILL */
21964 		goto done;
21965 	}
21966 
21967 	/* Check if the usesrc_ill is used for IPMP */
21968 	us_phyint = usesrc_ill->ill_phyint;
21969 	if ((us_phyint->phyint_groupname != NULL) ||
21970 	    (us_phyint->phyint_flags & PHYI_STANDBY)) {
21971 		err = EINVAL;
21972 		goto done;
21973 	}
21974 
21975 	/*
21976 	 * If the client is already in use as a usesrc_ill or a usesrc_ill is
21977 	 * already a client then return EINVAL
21978 	 */
21979 	if (IS_USESRC_ILL(usesrc_cli_ill) || IS_USESRC_CLI_ILL(usesrc_ill)) {
21980 		err = EINVAL;
21981 		goto done;
21982 	}
21983 
21984 	/*
21985 	 * If the ill_usesrc_ifindex field is already set to what it needs to
21986 	 * be then this is a duplicate operation.
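	 * (For example, the same SIOCSLIFUSESRC request issued twice in
	 * a row: we simply return success without relinking anything.)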
21987 	 */
21988 	if (!reset_flg && usesrc_cli_ill->ill_usesrc_ifindex == ifindex) {
21989 		err = 0;
21990 		goto done;
21991 	}
21992 
21993 	ip1dbg(("ip_sioctl_slifusesrc: usesrc_cli_ill %s, usesrc_ill %s,"
21994 	    " v6 = %d", usesrc_cli_ill->ill_name, usesrc_ill->ill_name,
21995 	    usesrc_ill->ill_isv6));
21996 
21997 	/*
21998 	 * The next step ensures that no new ires will be created referencing
21999 	 * the client ill, until the ILL_CHANGING flag is cleared. Then
22000 	 * we go through an ire walk deleting all ire caches that reference
22001 	 * the client ill. New ires referencing the client ill that are added
22002 	 * to the ire table before the ILL_CHANGING flag is set, will be
22003 	 * cleaned up by the ire walk below. Attempts to add new ires
22004 	 * referencing the client ill while the ILL_CHANGING flag is set will
22005 	 * fail during the ire_add in ire_atomic_start. ire_atomic_start
22006 	 * atomically checks (under the ill_g_usesrc_lock) that the ire being
22007 	 * added is not stale, i.e. the ire_stq and ire_ipif are consistent
22008 	 * and belong to the same usesrc group.
22009 	 */
22010 	mutex_enter(&usesrc_cli_ill->ill_lock);
22011 	usesrc_cli_ill->ill_state_flags |= ILL_CHANGING;
22012 	mutex_exit(&usesrc_cli_ill->ill_lock);
22013 	ill_flag_changed = B_TRUE;
22014 
22015 	if (ipif->ipif_isv6)
22016 		ire_walk_v6(ipif_delete_cache_ire, (char *)usesrc_cli_ill,
22017 		    ALL_ZONES);
22018 	else
22019 		ire_walk_v4(ipif_delete_cache_ire, (char *)usesrc_cli_ill,
22020 		    ALL_ZONES);
22021 
22022 	/*
22023 	 * The global ill_g_usesrc_lock protects the ill_usesrc_grp_next
22024 	 * and the ill_usesrc_ifindex fields.
22025 	 */
22026 	rw_enter(&ill_g_usesrc_lock, RW_WRITER);
22027 
22028 	if (reset_flg) {
22029 		ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 0);
22030 		if (ret != 0) {
22031 			err = EINVAL;
22032 		}
22033 		rw_exit(&ill_g_usesrc_lock);
22034 		goto done;
22035 	}
22036 
22037 	/*
22038 	 * Four possibilities to consider:
22039 	 * 1. Both usesrc_ill and usesrc_cli_ill are not part of any usesrc grp
22040 	 * 2. usesrc_ill is part of a group but usesrc_cli_ill isn't
22041 	 * 3. usesrc_cli_ill is part of a group but usesrc_ill isn't
22042 	 * 4. Both are part of their respective usesrc groups
22043 	 */
22044 	if ((usesrc_ill->ill_usesrc_grp_next == NULL) &&
22045 	    (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) {
22046 		ASSERT(usesrc_ill->ill_usesrc_ifindex == 0);
22047 		usesrc_cli_ill->ill_usesrc_ifindex = ifindex;
22048 		usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill;
22049 		usesrc_cli_ill->ill_usesrc_grp_next = usesrc_ill;
22050 	} else if ((usesrc_ill->ill_usesrc_grp_next != NULL) &&
22051 	    (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) {
22052 		usesrc_cli_ill->ill_usesrc_ifindex = ifindex;
22053 		/* Insert at head of list */
22054 		usesrc_cli_ill->ill_usesrc_grp_next =
22055 		    usesrc_ill->ill_usesrc_grp_next;
22056 		usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill;
22057 	} else {
22058 		ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill,
22059 		    ifindex);
22060 		if (ret != 0)
22061 			err = EINVAL;
22062 	}
22063 	rw_exit(&ill_g_usesrc_lock);
22064 
22065 done:
22066 	if (ill_flag_changed) {
22067 		mutex_enter(&usesrc_cli_ill->ill_lock);
22068 		usesrc_cli_ill->ill_state_flags &= ~ILL_CHANGING;
22069 		mutex_exit(&usesrc_cli_ill->ill_lock);
22070 	}
22071 	if (ipsq != NULL)
22072 		ipsq_exit(ipsq, B_TRUE, B_TRUE);
22073 	/* The refrele on the lifr_name ipif is done by ip_process_ioctl */
22074 	ill_refrele(usesrc_ill);
22075 	return (err);
22076 }
22077 
22078 /*
22079  * comparison function used by avl.
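 * It compares the ifindex being searched for (index_ptr) against the
 * phyint node handed in by the AVL code and returns -1, 0 or 1 as
 * avl_find() expects, yielding a tree ordered by ascending ifindex.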
22080 */ 22081 static int 22082 ill_phyint_compare_index(const void *index_ptr, const void *phyip) 22083 { 22084 22085 uint_t index; 22086 22087 ASSERT(phyip != NULL && index_ptr != NULL); 22088 22089 index = *((uint_t *)index_ptr); 22090 /* 22091 * let the phyint with the lowest index be on top. 22092 */ 22093 if (((phyint_t *)phyip)->phyint_ifindex < index) 22094 return (1); 22095 if (((phyint_t *)phyip)->phyint_ifindex > index) 22096 return (-1); 22097 return (0); 22098 } 22099 22100 /* 22101 * comparison function used by avl. 22102 */ 22103 static int 22104 ill_phyint_compare_name(const void *name_ptr, const void *phyip) 22105 { 22106 ill_t *ill; 22107 int res = 0; 22108 22109 ASSERT(phyip != NULL && name_ptr != NULL); 22110 22111 if (((phyint_t *)phyip)->phyint_illv4) 22112 ill = ((phyint_t *)phyip)->phyint_illv4; 22113 else 22114 ill = ((phyint_t *)phyip)->phyint_illv6; 22115 ASSERT(ill != NULL); 22116 22117 res = strcmp(ill->ill_name, (char *)name_ptr); 22118 if (res > 0) 22119 return (1); 22120 else if (res < 0) 22121 return (-1); 22122 return (0); 22123 } 22124 /* 22125 * This function is called from ill_delete when the ill is being 22126 * unplumbed. We remove the reference from the phyint and we also 22127 * free the phyint when there are no more references to it. 22128 */ 22129 static void 22130 ill_phyint_free(ill_t *ill) 22131 { 22132 phyint_t *phyi; 22133 phyint_t *next_phyint; 22134 ipsq_t *cur_ipsq; 22135 22136 ASSERT(ill->ill_phyint != NULL); 22137 22138 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 22139 phyi = ill->ill_phyint; 22140 ill->ill_phyint = NULL; 22141 /* 22142 * ill_init allocates a phyint always to store the copy 22143 * of flags relevant to phyint. At that point in time, we could 22144 * not assign the name and hence phyint_illv4/v6 could not be 22145 * initialized. Later in ipif_set_values, we assign the name to 22146 * the ill, at which point in time we assign phyint_illv4/v6. 22147 * Thus we don't rely on phyint_illv6 to be initialized always. 22148 */ 22149 if (ill->ill_flags & ILLF_IPV6) { 22150 phyi->phyint_illv6 = NULL; 22151 } else { 22152 phyi->phyint_illv4 = NULL; 22153 } 22154 /* 22155 * ipif_down removes it from the group when the last ipif goes 22156 * down. 22157 */ 22158 ASSERT(ill->ill_group == NULL); 22159 22160 if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) 22161 return; 22162 22163 /* 22164 * Make sure this phyint was put in the list. 22165 */ 22166 if (phyi->phyint_ifindex > 0) { 22167 avl_remove(&phyint_g_list.phyint_list_avl_by_index, 22168 phyi); 22169 avl_remove(&phyint_g_list.phyint_list_avl_by_name, 22170 phyi); 22171 } 22172 /* 22173 * remove phyint from the ipsq list. 22174 */ 22175 cur_ipsq = phyi->phyint_ipsq; 22176 if (phyi == cur_ipsq->ipsq_phyint_list) { 22177 cur_ipsq->ipsq_phyint_list = phyi->phyint_ipsq_next; 22178 } else { 22179 next_phyint = cur_ipsq->ipsq_phyint_list; 22180 while (next_phyint != NULL) { 22181 if (next_phyint->phyint_ipsq_next == phyi) { 22182 next_phyint->phyint_ipsq_next = 22183 phyi->phyint_ipsq_next; 22184 break; 22185 } 22186 next_phyint = next_phyint->phyint_ipsq_next; 22187 } 22188 ASSERT(next_phyint != NULL); 22189 } 22190 IPSQ_DEC_REF(cur_ipsq); 22191 22192 if (phyi->phyint_groupname_len != 0) { 22193 ASSERT(phyi->phyint_groupname != NULL); 22194 mi_free(phyi->phyint_groupname); 22195 } 22196 mi_free(phyi); 22197 } 22198 22199 /* 22200 * Attach the ill to the phyint structure which can be shared by both 22201 * IPv4 and IPv6 ill. ill_init allocates a phyint to just hold flags. 
/*
 * Attach the ill to the phyint structure, which can be shared by both
 * the IPv4 and IPv6 ills. ill_init allocates a phyint just to hold flags.
 * This function is called from ipif_set_values and ill_lookup_on_name
 * (for loopback), where we know the name of the ill. We look up the ill
 * and, if one is already present with that name, use its phyint.
 * Otherwise reuse the one allocated by ill_init.
 */
static void
ill_phyint_reinit(ill_t *ill)
{
	boolean_t isv6 = ill->ill_isv6;
	phyint_t *phyi_old;
	phyint_t *phyi;
	avl_index_t where = 0;
	ill_t *ill_other = NULL;
	ipsq_t *ipsq;

	ASSERT(RW_WRITE_HELD(&ill_g_lock));

	phyi_old = ill->ill_phyint;
	ASSERT(isv6 || (phyi_old->phyint_illv4 == ill &&
	    phyi_old->phyint_illv6 == NULL));
	ASSERT(!isv6 || (phyi_old->phyint_illv6 == ill &&
	    phyi_old->phyint_illv4 == NULL));
	ASSERT(phyi_old->phyint_ifindex == 0);

	phyi = avl_find(&phyint_g_list.phyint_list_avl_by_name,
	    ill->ill_name, &where);

	/*
	 * 1. We grabbed the ill_g_lock before inserting this ill into
	 *    the global list of ills. So no other thread could have located
	 *    this ill and hence the ipsq of this ill is guaranteed to be
	 *    empty.
	 * 2. Now locate the other protocol instance of this ill.
	 * 3. Now grab both ill locks in the right order, and the phyint lock
	 *    of the new ipsq. Holding ill locks + ill_g_lock ensures that the
	 *    ipsq of neither ill can change.
	 * 4. Merge the phyint and thus the ipsq as well of this ill onto the
	 *    other ill.
	 * 5. Release all locks.
	 */

	/*
	 * Look for IPv4 if we are initializing IPv6 or look for IPv6 if
	 * we are initializing IPv4.
	 */
	if (phyi != NULL) {
		ill_other = (isv6) ? phyi->phyint_illv4 :
		    phyi->phyint_illv6;
		ASSERT(ill_other->ill_phyint != NULL);
		ASSERT((isv6 && !ill_other->ill_isv6) ||
		    (!isv6 && ill_other->ill_isv6));
		GRAB_ILL_LOCKS(ill, ill_other);
		/*
		 * We are potentially throwing away phyint_flags which
		 * could be different from the one that we obtain from
		 * ill_other->ill_phyint. But it is okay as we are assuming
		 * that the state maintained within IP is correct.
		 */
		mutex_enter(&phyi->phyint_lock);
		if (isv6) {
			ASSERT(phyi->phyint_illv6 == NULL);
			phyi->phyint_illv6 = ill;
		} else {
			ASSERT(phyi->phyint_illv4 == NULL);
			phyi->phyint_illv4 = ill;
		}
		/*
		 * This is a new ill, currently undergoing SLIFNAME,
		 * so it could not have joined an IPMP group until now.
		 */
		ASSERT(phyi_old->phyint_ipsq_next == NULL &&
		    phyi_old->phyint_groupname == NULL);

		/*
		 * This phyi_old is going away. Decref ipsq_refs and
		 * assert it is zero. The ipsq itself will be freed in
		 * ipsq_exit.
		 */
		ipsq = phyi_old->phyint_ipsq;
		IPSQ_DEC_REF(ipsq);
		ASSERT(ipsq->ipsq_refs == 0);
		/* Get the singleton phyint out of the ipsq list */
		ASSERT(phyi_old->phyint_ipsq_next == NULL);
		ipsq->ipsq_phyint_list = NULL;
		phyi_old->phyint_illv4 = NULL;
		phyi_old->phyint_illv6 = NULL;
		mi_free(phyi_old);
	} else {
		mutex_enter(&ill->ill_lock);
		/*
		 * We don't need to acquire any lock, since
		 * the ill is not yet visible globally and we
		 * have not yet released the ill_g_lock.
		 */
		phyi = phyi_old;
		mutex_enter(&phyi->phyint_lock);
		/* XXX We need a recovery strategy here. */
		if (!phyint_assign_ifindex(phyi))
			cmn_err(CE_PANIC, "phyint_assign_ifindex() failed");

		avl_insert(&phyint_g_list.phyint_list_avl_by_name,
		    (void *)phyi, where);

		(void) avl_find(&phyint_g_list.phyint_list_avl_by_index,
		    &phyi->phyint_ifindex, &where);
		avl_insert(&phyint_g_list.phyint_list_avl_by_index,
		    (void *)phyi, where);
	}

	/*
	 * Reassigning ill_phyint automatically reassigns the ipsq also.
	 * The pending mp is not affected because that is maintained per ill.
	 */
	ill->ill_phyint = phyi;

	/*
	 * Save the index in ipif_orig_ifindex to be used by FAILOVER.
	 * We do this here as when the first ipif was allocated,
	 * ipif_allocate did not know the right interface index.
	 */
	ill->ill_ipif->ipif_orig_ifindex = ill->ill_phyint->phyint_ifindex;

	/*
	 * Now that the phyint's ifindex has been assigned, complete the
	 * remaining MIB index initialization.
	 */
	if (ill->ill_isv6) {
		ill->ill_ip6_mib->ipv6IfIndex =
		    ill->ill_phyint->phyint_ifindex;
		ill->ill_icmp6_mib->ipv6IfIcmpIfIndex =
		    ill->ill_phyint->phyint_ifindex;
	}

	/*
	 * Generate an event within the hooks framework to indicate that
	 * a new interface has just been added to IP. For this event to
	 * be generated, the network interface must, at least, have an
	 * ifindex assigned to it.
	 *
	 * This needs to be run inside the ill_g_lock perimeter to ensure
	 * that the ordering of delivered events to listeners matches the
	 * order in which they occurred in the kernel.
	 *
	 * This function could be called from ill_lookup_on_name. In that case
	 * the interface is loopback "lo", which will not generate a NIC event.
	 */
	if (ill->ill_name_length <= 2 ||
	    ill->ill_name[0] != 'l' || ill->ill_name[1] != 'o') {
		hook_nic_event_t *info;

		if ((info = ill->ill_nic_event_info) != NULL) {
			ip2dbg(("ill_phyint_reinit: unexpected nic event %d "
			    "attached for %s\n", info->hne_event,
			    ill->ill_name));
			if (info->hne_data != NULL)
				kmem_free(info->hne_data, info->hne_datalen);
			kmem_free(info, sizeof (hook_nic_event_t));
		}

		info = kmem_alloc(sizeof (hook_nic_event_t), KM_NOSLEEP);
		if (info != NULL) {
			info->hne_nic = ill->ill_phyint->phyint_ifindex;
			info->hne_lif = 0;
			info->hne_event = NE_PLUMB;
			info->hne_family = ill->ill_isv6 ? ipv6 : ipv4;
			info->hne_data = kmem_alloc(ill->ill_name_length,
			    KM_NOSLEEP);
			if (info->hne_data != NULL) {
				info->hne_datalen = ill->ill_name_length;
				bcopy(ill->ill_name, info->hne_data,
				    info->hne_datalen);
			} else {
				ip2dbg(("ill_phyint_reinit: could not attach "
				    "ill_name information for PLUMB nic event "
				    "of %s (ENOMEM)\n", ill->ill_name));
				kmem_free(info, sizeof (hook_nic_event_t));
			}
		} else
			ip2dbg(("ill_phyint_reinit: could not attach PLUMB nic "
			    "event information for %s (ENOMEM)\n",
			    ill->ill_name));

		ill->ill_nic_event_info = info;
	}

	RELEASE_ILL_LOCKS(ill, ill_other);
	mutex_exit(&phyi->phyint_lock);
}
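/*
 * Illustrative sketch (an editorial addition, not part of the original
 * source): ill_phyint_reinit uses the avl_find()/avl_insert() idiom in
 * which a failed lookup records, in 'where', the position at which the
 * missing node would live, so the subsequent insert needs no second
 * search. The #if 0 block is a minimal demo of that idiom;
 * demo_node_t and demo_compare are hypothetical names.
 */
#if 0
#include <sys/avl.h>
#include <stddef.h>
#include <assert.h>

typedef struct demo_node {
	unsigned int dn_index;
	avl_node_t dn_avl;
} demo_node_t;

static int
demo_compare(const void *a, const void *b)
{
	const demo_node_t *l = a;
	const demo_node_t *r = b;

	if (l->dn_index < r->dn_index)
		return (-1);
	if (l->dn_index > r->dn_index)
		return (1);
	return (0);
}

static void
avl_where_demo(avl_tree_t *tree, demo_node_t *node)
{
	avl_index_t where;

	avl_create(tree, demo_compare, sizeof (demo_node_t),
	    offsetof(demo_node_t, dn_avl));

	/* The failed find yields the insertion point in 'where'. */
	assert(avl_find(tree, node, &where) == NULL);
	avl_insert(tree, node, where);
}
#endif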
/*
 * Notify any downstream modules of the name of this interface.
 * An M_IOCTL is used even though we don't expect a successful reply.
 * Any reply message from the driver (presumably an M_IOCNAK) will
 * eventually get discarded somewhere upstream. The message format is
 * simply an SIOCSLIFNAME ioctl just as might be sent from ifconfig
 * to IP.
 */
static void
ip_ifname_notify(ill_t *ill, queue_t *q)
{
	mblk_t *mp1, *mp2;
	struct iocblk *iocp;
	struct lifreq *lifr;

	mp1 = mkiocb(SIOCSLIFNAME);
	if (mp1 == NULL)
		return;
	mp2 = allocb(sizeof (struct lifreq), BPRI_HI);
	if (mp2 == NULL) {
		freeb(mp1);
		return;
	}

	mp1->b_cont = mp2;
	iocp = (struct iocblk *)mp1->b_rptr;
	iocp->ioc_count = sizeof (struct lifreq);

	lifr = (struct lifreq *)mp2->b_rptr;
	mp2->b_wptr += sizeof (struct lifreq);
	bzero(lifr, sizeof (struct lifreq));

	(void) strncpy(lifr->lifr_name, ill->ill_name, LIFNAMSIZ);
	lifr->lifr_ppa = ill->ill_ppa;
	lifr->lifr_flags = (ill->ill_flags & (ILLF_IPV4|ILLF_IPV6));

	putnext(q, mp1);
}

static boolean_t ip_trash_timer_started = B_FALSE;

static int
ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
{
	int err;

	/* Set the obsolete NDD per-interface forwarding name. */
	err = ill_set_ndd_name(ill);
	if (err != 0) {
		cmn_err(CE_WARN, "ipif_set_values: ill_set_ndd_name (%d)\n",
		    err);
	}

	/* Tell downstream modules where they are. */
	ip_ifname_notify(ill, q);

	/*
	 * ill_dl_phys returns EINPROGRESS in the usual case.
	 * Error cases are ENOMEM ...
	 */
	err = ill_dl_phys(ill, ipif, mp, q);

	/*
	 * If there is no IRE expiration timer running, get one started.
	 * igmp and mld timers will be triggered by the first multicast.
	 */
	if (!ip_trash_timer_started) {
		/*
		 * Acquire the lock and check again.
		 */
		mutex_enter(&ip_trash_timer_lock);
		if (!ip_trash_timer_started) {
			ip_ire_expire_id = timeout(ip_trash_timer_expire, NULL,
			    MSEC_TO_TICK(ip_timer_interval));
			ip_trash_timer_started = B_TRUE;
		}
		mutex_exit(&ip_trash_timer_lock);
	}

	if (ill->ill_isv6) {
		mutex_enter(&mld_slowtimeout_lock);
		if (mld_slowtimeout_id == 0) {
			mld_slowtimeout_id = timeout(mld_slowtimo, NULL,
			    MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
		}
		mutex_exit(&mld_slowtimeout_lock);
	} else {
		mutex_enter(&igmp_slowtimeout_lock);
		if (igmp_slowtimeout_id == 0) {
			igmp_slowtimeout_id = timeout(igmp_slowtimo, NULL,
			    MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
		}
		mutex_exit(&igmp_slowtimeout_lock);
	}

	return (err);
}
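/*
 * Illustrative sketch (an editorial addition, not part of the original
 * source): the timer startup above uses the classic check/lock/re-check
 * pattern, taking the mutex only on the (rare) path where the timer may
 * still need to be armed. The #if 0 block restates the pattern as a
 * standalone demo; pthreads stand in for kmutex_t, and the names are
 * hypothetical.
 */
#if 0
#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t start_lock = PTHREAD_MUTEX_INITIALIZER;
static bool started;

static void
start_once(void (*start)(void))
{
	if (started)				/* cheap unlocked check */
		return;
	pthread_mutex_lock(&start_lock);
	if (!started) {				/* re-check under the lock */
		start();			/* e.g. arm the expiry timer */
		started = true;
	}
	pthread_mutex_unlock(&start_lock);
}
#endif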
/*
 * Common routine for ppa and ifname setting. Should be called exclusive.
 *
 * Returns EINPROGRESS when mp has been consumed by queueing it on
 * ill_pending_mp and the ioctl will complete in ip_rput.
 *
 * NOTE : If ppa is UINT_MAX, we assign the next valid ppa and return
 * the new name and new ppa in lifr_name and lifr_ppa respectively.
 * For SLIFNAME, we pass these values back to userland.
 */
static int
ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr)
{
	ill_t *ill;
	ipif_t *ipif;
	ipsq_t *ipsq;
	char *ppa_ptr;
	char *old_ptr;
	char old_char;
	int error;

	ip1dbg(("ipif_set_values: interface %s\n", interf_name));
	ASSERT(q->q_next != NULL);
	ASSERT(interf_name != NULL);

	ill = (ill_t *)q->q_ptr;

	ASSERT(ill->ill_name[0] == '\0');
	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT((mi_strlen(interf_name) + 1) <= LIFNAMSIZ);
	ASSERT(ill->ill_ppa == UINT_MAX);

	/* The ppa is sent down by ifconfig or is chosen */
	if ((ppa_ptr = ill_get_ppa_ptr(interf_name)) == NULL) {
		return (EINVAL);
	}

	/*
	 * Make sure the ppa passed in is the same as the ppa in the name.
	 * This check is not made when ppa == UINT_MAX; in that case the ppa
	 * in the name could be anything. The system will choose a ppa and
	 * update new_ppa_ptr and interf_name to contain the chosen ppa.
	 */
	if (*new_ppa_ptr != UINT_MAX) {
		/* stoi changes the pointer */
		old_ptr = ppa_ptr;
		/*
		 * ifconfig passed in 0 for the ppa for DLPI 1 style devices
		 * (they don't have an externally visible ppa). We assign one
		 * here so that we can manage the interface. Note that in
		 * the past this value was always 0 for DLPI 1 drivers.
		 */
		if (*new_ppa_ptr == 0)
			*new_ppa_ptr = stoi(&old_ptr);
		else if (*new_ppa_ptr != (uint_t)stoi(&old_ptr))
			return (EINVAL);
	}
	/*
	 * Terminate the string before the ppa and save the character at
	 * that location.
	 */
	old_char = ppa_ptr[0];
	ppa_ptr[0] = '\0';

	ill->ill_ppa = *new_ppa_ptr;
	/*
	 * Finish as much work now as possible before calling ill_glist_insert
	 * which makes the ill globally visible and also merges it with the
	 * other protocol instance of this phyint. The remaining work is
	 * done after entering the ipsq which may happen sometime later.
	 * ill_set_ndd_name occurs after the ill has been made globally
	 * visible.
	 */
	ipif = ill->ill_ipif;

	/* We didn't do this when we allocated ipif in ip_ll_subnet_defaults */
	ipif_assign_seqid(ipif);

	if (!(ill->ill_flags & (ILLF_IPV4|ILLF_IPV6)))
		ill->ill_flags |= ILLF_IPV4;

	ASSERT(ipif->ipif_next == NULL);	/* Only one ipif on ill */
	ASSERT((ipif->ipif_flags & IPIF_UP) == 0);

	if (ill->ill_flags & ILLF_IPV6) {

		ill->ill_isv6 = B_TRUE;
		if (ill->ill_rq != NULL) {
			ill->ill_rq->q_qinfo = &rinit_ipv6;
			ill->ill_wq->q_qinfo = &winit_ipv6;
		}

		/* Keep the !IN6_IS_ADDR_V4MAPPED assertions happy */
		ipif->ipif_v6lcl_addr = ipv6_all_zeros;
		ipif->ipif_v6src_addr = ipv6_all_zeros;
		ipif->ipif_v6subnet = ipv6_all_zeros;
		ipif->ipif_v6net_mask = ipv6_all_zeros;
		ipif->ipif_v6brd_addr = ipv6_all_zeros;
		ipif->ipif_v6pp_dst_addr = ipv6_all_zeros;
		/*
		 * Point-to-point or non-multicast-capable
		 * interfaces won't do NUD unless explicitly
		 * configured to do so.
		 */
		if (ipif->ipif_flags & IPIF_POINTOPOINT ||
		    !(ill->ill_flags & ILLF_MULTICAST)) {
			ill->ill_flags |= ILLF_NONUD;
		}
		/* Make sure the IPv4-specific flag is not set on an IPv6 if */
		if (ill->ill_flags & ILLF_NOARP) {
			/*
			 * Note: xresolv interfaces will eventually need
			 * NOARP set here as well, but that will require
			 * those external resolvers to have some
			 * knowledge of that flag and act appropriately.
			 * Not to be changed at present.
			 */
			ill->ill_flags &= ~ILLF_NOARP;
		}
		/*
		 * Set the ILLF_ROUTER flag according to the global
		 * IPv6 forwarding policy.
		 */
		if (ipv6_forward != 0)
			ill->ill_flags |= ILLF_ROUTER;
	} else if (ill->ill_flags & ILLF_IPV4) {
		ill->ill_isv6 = B_FALSE;
		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6lcl_addr);
		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6src_addr);
		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6subnet);
		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6net_mask);
		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6brd_addr);
		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6pp_dst_addr);
		/*
		 * Set the ILLF_ROUTER flag according to the global
		 * IPv4 forwarding policy.
		 */
		if (ip_g_forward != 0)
			ill->ill_flags |= ILLF_ROUTER;
	}

	ASSERT(ill->ill_phyint != NULL);

	/*
	 * The ipv6IfIndex and ipv6IfIcmpIfIndex assignments will
	 * be completed in ill_glist_insert -> ill_phyint_reinit.
	 */
	if (ill->ill_isv6) {
		/* allocate v6 mib */
		if (!ill_allocate_mibs(ill))
			return (ENOMEM);
	}

	/*
	 * Pick a default sap until we get the DL_INFO_ACK back from
	 * the driver.
	 */
	if (ill->ill_sap == 0) {
		if (ill->ill_isv6)
			ill->ill_sap = IP6_DL_SAP;
		else
			ill->ill_sap = IP_DL_SAP;
	}

	ill->ill_ifname_pending = 1;
	ill->ill_ifname_pending_err = 0;

	ill_refhold(ill);
	rw_enter(&ill_g_lock, RW_WRITER);
	if ((error = ill_glist_insert(ill, interf_name,
	    (ill->ill_flags & ILLF_IPV6) == ILLF_IPV6)) > 0) {
		ill->ill_ppa = UINT_MAX;
		ill->ill_name[0] = '\0';
		/*
		 * Undo the null termination done above.
		 */
		ppa_ptr[0] = old_char;
		rw_exit(&ill_g_lock);
		ill_refrele(ill);
		return (error);
	}

	ASSERT(ill->ill_name_length <= LIFNAMSIZ);

	/*
	 * When we return, the buffer pointed to by interf_name should contain
	 * the same name as in ill_name.
	 * If a ppa was chosen by the system (ppa passed in was UINT_MAX),
	 * the buffer pointed to by new_ppa_ptr would not contain the right
	 * ppa, so copy the full name and update the ppa ptr.
	 * When the ppa passed in != UINT_MAX, all values are correct; just
	 * undo the null termination. This saves a bcopy.
	 */
	if (*new_ppa_ptr == UINT_MAX) {
		bcopy(ill->ill_name, interf_name, ill->ill_name_length);
		*new_ppa_ptr = ill->ill_ppa;
	} else {
		/*
		 * Undo the null termination done above.
		 */
		ppa_ptr[0] = old_char;
	}

	/* Let SCTP know about this ILL */
	sctp_update_ill(ill, SCTP_ILL_INSERT);

	/* and also about the first ipif */
	sctp_update_ipif(ipif, SCTP_IPIF_INSERT);

	ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_reprocess_ioctl, NEW_OP,
	    B_TRUE);

	rw_exit(&ill_g_lock);
	ill_refrele(ill);
	if (ipsq == NULL)
		return (EINPROGRESS);

	/*
	 * Need to set the ipsq_current_ipif now, if we have changed ipsq
	 * due to the phyint merge in ill_phyint_reinit.
	 */
	ASSERT(ipsq->ipsq_current_ipif == NULL ||
	    ipsq->ipsq_current_ipif == ipif);
	ipsq->ipsq_current_ipif = ipif;
	ipsq->ipsq_last_cmd = SIOCSLIFNAME;
	error = ipif_set_values_tail(ill, ipif, mp, q);
	ipsq_exit(ipsq, B_TRUE, B_TRUE);
	if (error != 0 && error != EINPROGRESS) {
		/*
		 * Restore previous values.
		 */
		ill->ill_isv6 = B_FALSE;
	}
	return (error);
}

extern void (*ip_cleanup_func)(void);

void
ipif_init(void)
{
	hrtime_t hrt;
	int i;

	/*
	 * Can't call drv_getparm here as it is too early in the boot.
	 * As we use ipif_src_random just for picking a different
	 * source address every time, this need not be really random.
	 */
	hrt = gethrtime();
	ipif_src_random = ((hrt >> 32) & 0xffffffff) * (hrt & 0xffffffff);

	for (i = 0; i < MAX_G_HEADS; i++) {
		ill_g_heads[i].ill_g_list_head = (ill_if_t *)&ill_g_heads[i];
		ill_g_heads[i].ill_g_list_tail = (ill_if_t *)&ill_g_heads[i];
	}

	avl_create(&phyint_g_list.phyint_list_avl_by_index,
	    ill_phyint_compare_index,
	    sizeof (phyint_t),
	    offsetof(struct phyint, phyint_avl_by_index));
	avl_create(&phyint_g_list.phyint_list_avl_by_name,
	    ill_phyint_compare_name,
	    sizeof (phyint_t),
	    offsetof(struct phyint, phyint_avl_by_name));

	ip_cleanup_func = ip_thread_exit;
}
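/*
 * Illustrative sketch (an editorial addition, not part of the original
 * source): ipif_set_values above splits a name such as "hme0" into the
 * device part and the ppa by null-terminating just before the trailing
 * digits, then restores the saved character. The #if 0 block shows the
 * idea standalone; ppa_ptr_demo is a hypothetical stand-in for
 * ill_get_ppa_ptr(), whose exact semantics are assumed here.
 */
#if 0
#include <assert.h>
#include <ctype.h>
#include <stdlib.h>
#include <string.h>

static char *
ppa_ptr_demo(char *name)
{
	char *p = name + strlen(name);

	while (p > name && isdigit((unsigned char)p[-1]))
		p--;
	return (*p == '\0' ? NULL : p);		/* NULL: no trailing digits */
}

static void
ppa_demo(void)
{
	char name[] = "hme0";
	char *ppa_ptr = ppa_ptr_demo(name);
	char old_char;
	unsigned int ppa;

	ppa = (unsigned int)atoi(ppa_ptr);	/* stand-in for stoi() */
	old_char = ppa_ptr[0];
	ppa_ptr[0] = '\0';			/* terminate before the ppa */
	assert(strcmp(name, "hme") == 0 && ppa == 0);
	ppa_ptr[0] = old_char;			/* undo, as the code does */
}
#endif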
/*
 * This is called by ip_rt_add when the src_addr value is non-zero.
 * src_addr signifies the source address of the incoming packet. For
 * a reverse tunnel route we need to create a source-address-based routing
 * table. This routine creates ip_mrtun_table if it's empty and then
 * it adds the route entry hashed by source address. It verifies that
 * the outgoing interface is always a non-resolver interface (tunnel).
 */
int
ip_mrtun_rt_add(ipaddr_t in_src_addr, int flags, ipif_t *ipif_arg,
    ipif_t *src_ipif, ire_t **ire_arg, queue_t *q, mblk_t *mp, ipsq_func_t func)
{
	ire_t *ire;
	ire_t *save_ire;
	ipif_t *ipif;
	ill_t *in_ill = NULL;
	ill_t *out_ill;
	queue_t *stq;
	mblk_t *dlureq_mp;
	int error;

	if (ire_arg != NULL)
		*ire_arg = NULL;
	ASSERT(in_src_addr != INADDR_ANY);

	ipif = ipif_arg;
	if (ipif != NULL) {
		out_ill = ipif->ipif_ill;
	} else {
		ip1dbg(("ip_mrtun_rt_add: ipif is NULL\n"));
		return (EINVAL);
	}

	if (src_ipif == NULL) {
		ip1dbg(("ip_mrtun_rt_add: src_ipif is NULL\n"));
		return (EINVAL);
	}
	in_ill = src_ipif->ipif_ill;

	/*
	 * Check for duplicates. We don't need to
	 * match out_ill, because the uniqueness of
	 * a route is only dependent on src_addr and
	 * in_ill.
	 */
	ire = ire_mrtun_lookup(in_src_addr, in_ill);
	if (ire != NULL) {
		ire_refrele(ire);
		return (EEXIST);
	}
	if (ipif->ipif_net_type != IRE_IF_NORESOLVER) {
		ip2dbg(("ip_mrtun_rt_add: outgoing interface is type %d\n",
		    ipif->ipif_net_type));
		return (EINVAL);
	}

	stq = ipif->ipif_wq;
	ASSERT(stq != NULL);

	/*
	 * The outgoing interface must be a non-resolver
	 * interface.
	 */
	dlureq_mp = ill_dlur_gen(NULL,
	    out_ill->ill_phys_addr_length, out_ill->ill_sap,
	    out_ill->ill_sap_length);

	if (dlureq_mp == NULL) {
		ip1dbg(("ip_newroute: dlureq_mp NULL\n"));
		return (ENOMEM);
	}

	/* Create the IRE. */

	ire = ire_create(
	    NULL,			/* Zero dst addr */
	    NULL,			/* Zero mask */
	    NULL,			/* Zero gateway addr */
	    NULL,			/* Zero ipif_src addr */
	    (uint8_t *)&in_src_addr,	/* in_src_addr */
	    &ipif->ipif_mtu,
	    NULL,
	    NULL,			/* rfq */
	    stq,
	    IRE_MIPRTUN,
	    dlureq_mp,
	    ipif,
	    in_ill,
	    0,
	    0,
	    0,
	    flags,
	    &ire_uinfo_null,
	    NULL,
	    NULL);

	if (ire == NULL) {
		freeb(dlureq_mp);
		return (ENOMEM);
	}
	ip2dbg(("ip_mrtun_rt_add: mrtun route is created with type %d\n",
	    ire->ire_type));
	save_ire = ire;
	ASSERT(save_ire != NULL);
	error = ire_add_mrtun(&ire, q, mp, func);
	/*
	 * If ire_add_mrtun() failed, the ire passed in was freed,
	 * so there is no need to do so here.
	 */
	if (error != 0) {
		return (error);
	}

	/* Duplicate check */
	if (ire != save_ire) {
		/* route already exists by now */
		ire_refrele(ire);
		return (EEXIST);
	}

	if (ire_arg != NULL) {
		/*
		 * Store the ire that was just added. The caller,
		 * ip_rts_request, is responsible for doing ire_refrele()
		 * on it.
		 */
		*ire_arg = ire;
	} else {
		ire_refrele(ire);	/* held in ire_add_mrtun */
	}

	return (0);
}

/*
 * This is called by ip_rt_delete() only when the mipagent requests deletion
 * of a reverse tunnel route that was previously added by ip_mrtun_rt_add().
 */
int
ip_mrtun_rt_delete(ipaddr_t in_src_addr, ipif_t *src_ipif)
{
	ire_t *ire = NULL;

	if (in_src_addr == INADDR_ANY)
		return (EINVAL);
	if (src_ipif == NULL)
		return (EINVAL);

	/* Search for this route in the ip_mrtun_table */
	ire = ire_mrtun_lookup(in_src_addr, src_ipif->ipif_ill);
	if (ire == NULL) {
		ip2dbg(("ip_mrtun_rt_delete: ire not found\n"));
		return (ESRCH);
	}
	ire_delete(ire);
	ire_refrele(ire);
	return (0);
}

/*
 * Lookup the ipif corresponding to the onlink destination address. For
 * point-to-point interfaces, it matches with remote endpoint destination
 * address. For point-to-multipoint interfaces it only tries to match the
 * destination with the interface's subnet address. The longest, most specific
 * match is found to take care of such rare network configurations like -
 *	le0: 129.146.1.1/16
 *	le1: 129.146.2.2/24
 * It is used only by SO_DONTROUTE at the moment.
 */
ipif_t *
ipif_lookup_onlink_addr(ipaddr_t addr, zoneid_t zoneid)
{
	ipif_t *ipif, *best_ipif;
	ill_t *ill;
	ill_walk_context_t ctx;

	ASSERT(zoneid != ALL_ZONES);
	best_ipif = NULL;

	rw_enter(&ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V4(&ctx);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		mutex_enter(&ill->ill_lock);
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if (!IPIF_CAN_LOOKUP(ipif))
				continue;
			if (ipif->ipif_zoneid != zoneid &&
			    ipif->ipif_zoneid != ALL_ZONES)
				continue;
			/*
			 * Point-to-point case. Look for exact match with
			 * destination address.
			 */
			if (ipif->ipif_flags & IPIF_POINTOPOINT) {
				if (ipif->ipif_pp_dst_addr == addr) {
					ipif_refhold_locked(ipif);
					mutex_exit(&ill->ill_lock);
					rw_exit(&ill_g_lock);
					if (best_ipif != NULL)
						ipif_refrele(best_ipif);
					return (ipif);
				}
			} else if (ipif->ipif_subnet == (addr &
			    ipif->ipif_net_mask)) {
				/*
				 * Point-to-multipoint case. Loop through to
				 * find the most specific match. If there are
				 * multiple best-match ipifs then prefer ipifs
				 * that are UP. If there is only one best match
				 * ipif and it is DOWN we must still return it.
				 */
				if ((best_ipif == NULL) ||
				    (ipif->ipif_net_mask >
				    best_ipif->ipif_net_mask) ||
				    ((ipif->ipif_net_mask ==
				    best_ipif->ipif_net_mask) &&
				    ((ipif->ipif_flags & IPIF_UP) &&
				    (!(best_ipif->ipif_flags & IPIF_UP))))) {
					ipif_refhold_locked(ipif);
					mutex_exit(&ill->ill_lock);
					rw_exit(&ill_g_lock);
					if (best_ipif != NULL)
						ipif_refrele(best_ipif);
					best_ipif = ipif;
					rw_enter(&ill_g_lock, RW_READER);
					mutex_enter(&ill->ill_lock);
				}
			}
		}
		mutex_exit(&ill->ill_lock);
	}
	rw_exit(&ill_g_lock);
	return (best_ipif);
}
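/*
 * Illustrative sketch (an editorial addition, not part of the original
 * source): a worked instance of the most-specific-match rule above,
 * using the le0/le1 configuration from the function comment. For a
 * destination on both subnets, the /24 wins over the /16 because its
 * netmask covers more bits. The #if 0 block is a standalone check.
 */
#if 0
#include <netinet/in.h>
#include <arpa/inet.h>
#include <assert.h>

static void
onlink_match_demo(void)
{
	in_addr_t dst = inet_addr("129.146.2.5");
	in_addr_t le0_subnet = inet_addr("129.146.0.0");
	in_addr_t le0_mask = inet_addr("255.255.0.0");		/* /16 */
	in_addr_t le1_subnet = inet_addr("129.146.2.0");
	in_addr_t le1_mask = inet_addr("255.255.255.0");	/* /24 */

	/* Both interfaces match the destination... */
	assert((dst & le0_mask) == le0_subnet);
	assert((dst & le1_mask) == le1_subnet);

	/* ...but le1 is more specific: its mask covers more bits. */
	assert(ntohl(le1_mask) > ntohl(le0_mask));
}
#endif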
/*
 * Save enough information so that we can recreate the IRE if
 * the interface goes down and then up.
 */
static void
ipif_save_ire(ipif_t *ipif, ire_t *ire)
{
	mblk_t *save_mp;

	save_mp = allocb(sizeof (ifrt_t), BPRI_MED);
	if (save_mp != NULL) {
		ifrt_t *ifrt;

		save_mp->b_wptr += sizeof (ifrt_t);
		ifrt = (ifrt_t *)save_mp->b_rptr;
		bzero(ifrt, sizeof (ifrt_t));
		ifrt->ifrt_type = ire->ire_type;
		ifrt->ifrt_addr = ire->ire_addr;
		ifrt->ifrt_gateway_addr = ire->ire_gateway_addr;
		ifrt->ifrt_src_addr = ire->ire_src_addr;
		ifrt->ifrt_mask = ire->ire_mask;
		ifrt->ifrt_flags = ire->ire_flags;
		ifrt->ifrt_max_frag = ire->ire_max_frag;
		mutex_enter(&ipif->ipif_saved_ire_lock);
		save_mp->b_cont = ipif->ipif_saved_ire_mp;
		ipif->ipif_saved_ire_mp = save_mp;
		ipif->ipif_saved_ire_cnt++;
		mutex_exit(&ipif->ipif_saved_ire_lock);
	}
}

static void
ipif_remove_ire(ipif_t *ipif, ire_t *ire)
{
	mblk_t **mpp;
	mblk_t *mp;
	ifrt_t *ifrt;

	/* Remove from ipif_saved_ire_mp list if it is there */
	mutex_enter(&ipif->ipif_saved_ire_lock);
	for (mpp = &ipif->ipif_saved_ire_mp; *mpp != NULL;
	    mpp = &(*mpp)->b_cont) {
		/*
		 * On a given ipif, the triple of address, gateway and
		 * mask is unique for each saved IRE (in the case of
		 * ordinary interface routes, the gateway address is
		 * all-zeroes).
		 */
		mp = *mpp;
		ifrt = (ifrt_t *)mp->b_rptr;
		if (ifrt->ifrt_addr == ire->ire_addr &&
		    ifrt->ifrt_gateway_addr == ire->ire_gateway_addr &&
		    ifrt->ifrt_mask == ire->ire_mask) {
			*mpp = mp->b_cont;
			ipif->ipif_saved_ire_cnt--;
			freeb(mp);
			break;
		}
	}
	mutex_exit(&ipif->ipif_saved_ire_lock);
}
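/*
 * Illustrative sketch (an editorial addition, not part of the original
 * source): ipif_remove_ire walks the saved-IRE chain with a pointer to
 * the b_cont field, so the head and interior cases share one splice.
 * The #if 0 block shows the same pointer-to-pointer unlink idiom on a
 * plain list; struct node and unlink_demo are hypothetical names.
 */
#if 0
#include <stddef.h>

struct node {
	int key;
	struct node *next;
};

static void
unlink_demo(struct node **headp, int key)
{
	struct node **npp;

	for (npp = headp; *npp != NULL; npp = &(*npp)->next) {
		if ((*npp)->key == key) {
			*npp = (*npp)->next;	/* splice, head or interior */
			break;
		}
	}
}
#endif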
/*
 * IP multirouting broadcast routes handling.
 * Append CGTP broadcast IREs to the regular ones created
 * at ifconfig time.
 */
static void
ip_cgtp_bcast_add(ire_t *ire, ire_t *ire_dst)
{
	ire_t *ire_prim;

	ASSERT(ire != NULL);
	ASSERT(ire_dst != NULL);

	ire_prim = ire_ctable_lookup(ire->ire_gateway_addr, 0,
	    IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE);
	if (ire_prim != NULL) {
		/*
		 * We are in the special case of broadcasts for
		 * CGTP. We add an IRE_BROADCAST that holds
		 * the RTF_MULTIRT flag, the destination
		 * address of ire_dst and the low level
		 * info of ire_prim. In other words, CGTP
		 * broadcast is added to the redundant ipif.
		 */
		ipif_t *ipif_prim;
		ire_t *bcast_ire;

		ipif_prim = ire_prim->ire_ipif;

		ip2dbg(("ip_cgtp_filter_bcast_add: "
		    "ire_dst %p, ire_prim %p, ipif_prim %p\n",
		    (void *)ire_dst, (void *)ire_prim,
		    (void *)ipif_prim));

		bcast_ire = ire_create(
		    (uchar_t *)&ire->ire_addr,
		    (uchar_t *)&ip_g_all_ones,
		    (uchar_t *)&ire_dst->ire_src_addr,
		    (uchar_t *)&ire->ire_gateway_addr,
		    NULL,
		    &ipif_prim->ipif_mtu,
		    NULL,
		    ipif_prim->ipif_rq,
		    ipif_prim->ipif_wq,
		    IRE_BROADCAST,
		    ipif_prim->ipif_bcast_mp,
		    ipif_prim,
		    NULL,
		    0,
		    0,
		    0,
		    ire->ire_flags,
		    &ire_uinfo_null,
		    NULL,
		    NULL);

		if (bcast_ire != NULL) {

			if (ire_add(&bcast_ire, NULL, NULL, NULL,
			    B_FALSE) == 0) {
				ip2dbg(("ip_cgtp_filter_bcast_add: "
				    "added bcast_ire %p\n",
				    (void *)bcast_ire));

				ipif_save_ire(bcast_ire->ire_ipif,
				    bcast_ire);
				ire_refrele(bcast_ire);
			}
		}
		ire_refrele(ire_prim);
	}
}

/*
 * IP multirouting broadcast routes handling.
 * Remove the broadcast ire.
 */
static void
ip_cgtp_bcast_delete(ire_t *ire)
{
	ire_t *ire_dst;

	ASSERT(ire != NULL);
	ire_dst = ire_ctable_lookup(ire->ire_addr, 0, IRE_BROADCAST,
	    NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE);
	if (ire_dst != NULL) {
		ire_t *ire_prim;

		ire_prim = ire_ctable_lookup(ire->ire_gateway_addr, 0,
		    IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE);
		if (ire_prim != NULL) {
			ipif_t *ipif_prim;
			ire_t *bcast_ire;

			ipif_prim = ire_prim->ire_ipif;

			ip2dbg(("ip_cgtp_filter_bcast_delete: "
			    "ire_dst %p, ire_prim %p, ipif_prim %p\n",
			    (void *)ire_dst, (void *)ire_prim,
			    (void *)ipif_prim));

			bcast_ire = ire_ctable_lookup(ire->ire_addr,
			    ire->ire_gateway_addr,
			    IRE_BROADCAST,
			    ipif_prim, ALL_ZONES,
			    NULL,
			    MATCH_IRE_TYPE | MATCH_IRE_GW | MATCH_IRE_IPIF |
			    MATCH_IRE_MASK);

			if (bcast_ire != NULL) {
				ip2dbg(("ip_cgtp_filter_bcast_delete: "
				    "looked up bcast_ire %p\n",
				    (void *)bcast_ire));
				ipif_remove_ire(bcast_ire->ire_ipif,
				    bcast_ire);
				ire_delete(bcast_ire);
			}
			ire_refrele(ire_prim);
		}
		ire_refrele(ire_dst);
	}
}

/*
 * IPsec hardware acceleration capabilities related functions.
 */

/*
 * Free a per-ill IPsec capabilities structure.
 */
static void
ill_ipsec_capab_free(ill_ipsec_capab_t *capab)
{
	if (capab->auth_hw_algs != NULL)
		kmem_free(capab->auth_hw_algs, capab->algs_size);
	if (capab->encr_hw_algs != NULL)
		kmem_free(capab->encr_hw_algs, capab->algs_size);
	if (capab->encr_algparm != NULL)
		kmem_free(capab->encr_algparm, capab->encr_algparm_size);
	kmem_free(capab, sizeof (ill_ipsec_capab_t));
}

/*
 * Allocate a new per-ill IPsec capabilities structure. This structure
 * is specific to an IPsec protocol (AH or ESP). It is implemented as
 * an array which specifies, for each algorithm, whether this algorithm
 * is supported by the ill or not.
 */
static ill_ipsec_capab_t *
ill_ipsec_capab_alloc(void)
{
	ill_ipsec_capab_t *capab;
	uint_t nelems;

	capab = kmem_zalloc(sizeof (ill_ipsec_capab_t), KM_NOSLEEP);
	if (capab == NULL)
		return (NULL);

	/* we need one bit per algorithm */
	nelems = MAX_IPSEC_ALGS / BITS(ipsec_capab_elem_t);
	capab->algs_size = nelems * sizeof (ipsec_capab_elem_t);

	/* allocate memory to store algorithm flags */
	capab->encr_hw_algs = kmem_zalloc(capab->algs_size, KM_NOSLEEP);
	if (capab->encr_hw_algs == NULL)
		goto nomem;
	capab->auth_hw_algs = kmem_zalloc(capab->algs_size, KM_NOSLEEP);
	if (capab->auth_hw_algs == NULL)
		goto nomem;
	/*
	 * Leave encr_algparm NULL for now since we won't need it half
	 * the time.
	 */
	return (capab);

nomem:
	ill_ipsec_capab_free(capab);
	return (NULL);
}

/*
 * Resize capability array. Since we're exclusive, this is OK.
 */
static boolean_t
ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *capab, int algid)
{
	ipsec_capab_algparm_t *nalp, *oalp;
	uint32_t olen, nlen;

	oalp = capab->encr_algparm;
	olen = capab->encr_algparm_size;

	if (oalp != NULL) {
		if (algid < capab->encr_algparm_end)
			return (B_TRUE);
	}

	nlen = (algid + 1) * sizeof (*nalp);
	nalp = kmem_zalloc(nlen, KM_NOSLEEP);
	if (nalp == NULL)
		return (B_FALSE);

	if (oalp != NULL) {
		bcopy(oalp, nalp, olen);
		kmem_free(oalp, olen);
	}
	capab->encr_algparm = nalp;
	capab->encr_algparm_size = nlen;
	capab->encr_algparm_end = algid + 1;

	return (B_TRUE);
}
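/*
 * Illustrative sketch (an editorial addition, not part of the original
 * source): the capability structure allocated above keeps one bit per
 * algorithm in an array of fixed-width elements. The #if 0 block shows
 * the usual index/shift arithmetic for setting and testing such a bit,
 * roughly what IPSEC_ALG_IS_ENABLED() has to do; all names here are
 * hypothetical.
 */
#if 0
#include <assert.h>
#include <limits.h>

#define	DEMO_MAX_ALGS	256
typedef unsigned int demo_elem_t;
#define	DEMO_BITS	(sizeof (demo_elem_t) * CHAR_BIT)

static demo_elem_t algs[DEMO_MAX_ALGS / DEMO_BITS];

static void
alg_bitmap_demo(unsigned int algid)
{
	assert(algid < DEMO_MAX_ALGS);

	/* Mark the algorithm as hardware accelerated... */
	algs[algid / DEMO_BITS] |= 1u << (algid % DEMO_BITS);

	/* ...and test it again. */
	assert((algs[algid / DEMO_BITS] & (1u << (algid % DEMO_BITS))) != 0);
}
#endif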
/*
 * Compare the capabilities of the specified ill with the protocol
 * and algorithms specified by the SA passed as argument.
 * Returns B_TRUE if they match, B_FALSE if they do not.
 *
 * The ill can be passed as a pointer to it, or by specifying its index
 * and whether it is an IPv6 ill (ill_index and ill_isv6 arguments).
 *
 * Called by ipsec_out_is_accelerated() to decide whether an outbound
 * packet is eligible for hardware acceleration, and by
 * ill_ipsec_capab_send_all() to decide whether an SA must be sent down
 * to a particular ill.
 */
boolean_t
ipsec_capab_match(ill_t *ill, uint_t ill_index, boolean_t ill_isv6,
    ipsa_t *sa)
{
	boolean_t sa_isv6;
	uint_t algid;
	struct ill_ipsec_capab_s *cpp;
	boolean_t need_refrele = B_FALSE;

	if (ill == NULL) {
		ill = ill_lookup_on_ifindex(ill_index, ill_isv6, NULL,
		    NULL, NULL, NULL);
		if (ill == NULL) {
			ip0dbg(("ipsec_capab_match: ill doesn't exist\n"));
			return (B_FALSE);
		}
		need_refrele = B_TRUE;
	}

	/*
	 * Use the address length specified by the SA to determine
	 * if it corresponds to an IPv6 address, and fail the matching
	 * if the isv6 flag passed as argument does not match.
	 * Note: this check is used for SADB capability checking before
	 * sending SA information to an ill.
	 */
	sa_isv6 = (sa->ipsa_addrfam == AF_INET6);
	if (sa_isv6 != ill_isv6)
		/* protocol mismatch */
		goto done;

	/*
	 * Check if the ill supports the protocol, algorithm(s) and
	 * key size(s) specified by the SA, and get the pointers to
	 * the algorithms supported by the ill.
	 */
	switch (sa->ipsa_type) {

	case SADB_SATYPE_ESP:
		if (!(ill->ill_capabilities & ILL_CAPAB_ESP))
			/* ill does not support ESP acceleration */
			goto done;
		cpp = ill->ill_ipsec_capab_esp;
		algid = sa->ipsa_auth_alg;
		if (!IPSEC_ALG_IS_ENABLED(algid, cpp->auth_hw_algs))
			goto done;
		algid = sa->ipsa_encr_alg;
		if (!IPSEC_ALG_IS_ENABLED(algid, cpp->encr_hw_algs))
			goto done;
		if (algid < cpp->encr_algparm_end) {
			ipsec_capab_algparm_t *alp = &cpp->encr_algparm[algid];

			if (sa->ipsa_encrkeybits < alp->minkeylen)
				goto done;
			if (sa->ipsa_encrkeybits > alp->maxkeylen)
				goto done;
		}
		break;

	case SADB_SATYPE_AH:
		if (!(ill->ill_capabilities & ILL_CAPAB_AH))
			/* ill does not support AH acceleration */
			goto done;
		if (!IPSEC_ALG_IS_ENABLED(sa->ipsa_auth_alg,
		    ill->ill_ipsec_capab_ah->auth_hw_algs))
			goto done;
		break;
	}

	if (need_refrele)
		ill_refrele(ill);
	return (B_TRUE);
done:
	if (need_refrele)
		ill_refrele(ill);
	return (B_FALSE);
}

/*
 * Add a new ill to the list of IPsec capable ills.
 * Called from ill_capability_ipsec_ack() when an ACK was received
 * indicating that IPsec hardware processing was enabled for an ill.
 *
 * ill must point to the ill for which acceleration was enabled.
 * dl_cap must be set to DL_CAPAB_IPSEC_AH or DL_CAPAB_IPSEC_ESP.
 */
static void
ill_ipsec_capab_add(ill_t *ill, uint_t dl_cap, boolean_t sadb_resync)
{
	ipsec_capab_ill_t **ills, *cur_ill, *new_ill;
	uint_t sa_type;
	uint_t ipproto;

	ASSERT((dl_cap == DL_CAPAB_IPSEC_AH) ||
	    (dl_cap == DL_CAPAB_IPSEC_ESP));

	switch (dl_cap) {
	case DL_CAPAB_IPSEC_AH:
		sa_type = SADB_SATYPE_AH;
		ills = &ipsec_capab_ills_ah;
		ipproto = IPPROTO_AH;
		break;
	case DL_CAPAB_IPSEC_ESP:
		sa_type = SADB_SATYPE_ESP;
		ills = &ipsec_capab_ills_esp;
		ipproto = IPPROTO_ESP;
		break;
	}

	rw_enter(&ipsec_capab_ills_lock, RW_WRITER);

	/*
	 * Add ill index to list of hardware accelerators. If
	 * already in list, do nothing.
	 */
	for (cur_ill = *ills; cur_ill != NULL &&
	    (cur_ill->ill_index != ill->ill_phyint->phyint_ifindex ||
	    cur_ill->ill_isv6 != ill->ill_isv6); cur_ill = cur_ill->next)
		;

	if (cur_ill == NULL) {
		/* if this is a new entry for this ill */
		new_ill = kmem_zalloc(sizeof (ipsec_capab_ill_t), KM_NOSLEEP);
		if (new_ill == NULL) {
			rw_exit(&ipsec_capab_ills_lock);
			return;
		}

		new_ill->ill_index = ill->ill_phyint->phyint_ifindex;
		new_ill->ill_isv6 = ill->ill_isv6;
		new_ill->next = *ills;
		*ills = new_ill;
	} else if (!sadb_resync) {
		/* not resync'ing SADB and an entry exists for this ill */
		rw_exit(&ipsec_capab_ills_lock);
		return;
	}

	rw_exit(&ipsec_capab_ills_lock);

	if (ipcl_proto_fanout_v6[ipproto].connf_head != NULL)
		/*
		 * IPsec module for protocol loaded, initiate dump
		 * of the SADB to this ill.
		 */
		sadb_ill_download(ill, sa_type);
}

/*
 * Remove an ill from the list of IPsec capable ills.
 */
static void
ill_ipsec_capab_delete(ill_t *ill, uint_t dl_cap)
{
	ipsec_capab_ill_t **ills, *cur_ill, *prev_ill;

	ASSERT(dl_cap == DL_CAPAB_IPSEC_AH ||
	    dl_cap == DL_CAPAB_IPSEC_ESP);

	ills = (dl_cap == DL_CAPAB_IPSEC_AH) ? &ipsec_capab_ills_ah :
	    &ipsec_capab_ills_esp;

	rw_enter(&ipsec_capab_ills_lock, RW_WRITER);

	prev_ill = NULL;
	for (cur_ill = *ills; cur_ill != NULL && (cur_ill->ill_index !=
	    ill->ill_phyint->phyint_ifindex || cur_ill->ill_isv6 !=
	    ill->ill_isv6); prev_ill = cur_ill, cur_ill = cur_ill->next)
		;
	if (cur_ill == NULL) {
		/* entry not found */
		rw_exit(&ipsec_capab_ills_lock);
		return;
	}
	if (prev_ill == NULL) {
		/* entry at front of list */
		*ills = cur_ill->next;
	} else {
		prev_ill->next = cur_ill->next;
	}
	kmem_free(cur_ill, sizeof (ipsec_capab_ill_t));
	rw_exit(&ipsec_capab_ills_lock);
}

/*
 * Handling of DL_CONTROL_REQ messages that must be sent down to
 * an ill while having exclusive access.
 */
/* ARGSUSED */
static void
ill_ipsec_capab_send_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
{
	ill_t *ill = (ill_t *)q->q_ptr;

	ill_dlpi_send(ill, mp);
}

/*
 * Called by SADB to send a DL_CONTROL_REQ message to every ill
 * supporting the specified IPsec protocol acceleration.
 * sa_type must be SADB_SATYPE_AH or SADB_SATYPE_ESP.
 * We free the mblk and, if sa is non-null, release the held reference.
 */
void
ill_ipsec_capab_send_all(uint_t sa_type, mblk_t *mp, ipsa_t *sa)
{
	ipsec_capab_ill_t *ici, *cur_ici;
	ill_t *ill;
	mblk_t *nmp, *mp_ship_list = NULL, *next_mp;

	ici = (sa_type == SADB_SATYPE_AH) ? ipsec_capab_ills_ah :
	    ipsec_capab_ills_esp;

	rw_enter(&ipsec_capab_ills_lock, RW_READER);

	for (cur_ici = ici; cur_ici != NULL; cur_ici = cur_ici->next) {
		ill = ill_lookup_on_ifindex(cur_ici->ill_index,
		    cur_ici->ill_isv6, NULL, NULL, NULL, NULL);

		/*
		 * Handle the case where the ill goes away while the SADB is
		 * attempting to send messages. If it's going away, it's
		 * nuking its shadow SADB, so we don't care.
		 */
		if (ill == NULL)
			continue;

		if (sa != NULL) {
			/*
			 * Make sure capabilities match before
			 * sending SA to ill.
			 */
			if (!ipsec_capab_match(ill, cur_ici->ill_index,
			    cur_ici->ill_isv6, sa)) {
				ill_refrele(ill);
				continue;
			}

			mutex_enter(&sa->ipsa_lock);
			sa->ipsa_flags |= IPSA_F_HW;
			mutex_exit(&sa->ipsa_lock);
		}

		/*
		 * Copy template message, and add it to the front
		 * of the mblk ship list. We want to avoid holding
		 * the ipsec_capab_ills_lock while sending the
		 * message to the ills.
		 *
		 * The b_next and b_prev are temporarily used
		 * to build a list of mblks to be sent down, and to
		 * save the ill to which they must be sent.
		 */
		nmp = copymsg(mp);
		if (nmp == NULL) {
			ill_refrele(ill);
			continue;
		}
		ASSERT(nmp->b_next == NULL && nmp->b_prev == NULL);
		nmp->b_next = mp_ship_list;
		mp_ship_list = nmp;
		nmp->b_prev = (mblk_t *)ill;
	}

	rw_exit(&ipsec_capab_ills_lock);

	nmp = mp_ship_list;
	while (nmp != NULL) {
		/* restore the mblk to a sane state */
		next_mp = nmp->b_next;
		nmp->b_next = NULL;
		ill = (ill_t *)nmp->b_prev;
		nmp->b_prev = NULL;

		/*
		 * Ship the mblk to the ill, must be exclusive. Keep the
		 * reference to the ill as qwriter_ip() does an ill_refrele().
		 */
		(void) qwriter_ip(NULL, ill, ill->ill_wq, nmp,
		    ill_ipsec_capab_send_writer, NEW_OP, B_TRUE);

		nmp = next_mp;
	}

	if (sa != NULL)
		IPSA_REFRELE(sa);
	freemsg(mp);
}

/*
 * Derive an interface id from the link layer address.
 * Knows about IEEE 802 and IEEE EUI-64 mappings.
 */
static boolean_t
ip_ether_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr)
{
	char *addr;

	if (phys_length != ETHERADDRL)
		return (B_FALSE);

	/* Form EUI-64 like address */
	addr = (char *)&v6addr->s6_addr32[2];
	bcopy((char *)phys_addr, addr, 3);
	addr[0] ^= 0x2;			/* Toggle Universal/Local bit */
	addr[3] = (char)0xff;
	addr[4] = (char)0xfe;
	bcopy((char *)phys_addr + 3, addr + 5, 3);
	return (B_TRUE);
}
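/*
 * Illustrative sketch (an editorial addition, not part of the original
 * source): a worked instance of the EUI-64 style mapping performed by
 * ip_ether_v6intfid above. The MAC 00:14:4f:01:02:03 yields the
 * interface id 0214:4fff:fe01:0203 (u/l bit toggled, ff:fe inserted).
 * The #if 0 block is a standalone check of those bytes.
 */
#if 0
#include <assert.h>
#include <stdint.h>
#include <string.h>

static void
eui64_demo(void)
{
	uint8_t mac[6] = { 0x00, 0x14, 0x4f, 0x01, 0x02, 0x03 };
	uint8_t iid[8];

	memcpy(iid, mac, 3);
	iid[0] ^= 0x02;			/* toggle the Universal/Local bit */
	iid[3] = 0xff;
	iid[4] = 0xfe;
	memcpy(iid + 5, mac + 3, 3);

	assert(iid[0] == 0x02 && iid[1] == 0x14 && iid[2] == 0x4f);
	assert(iid[3] == 0xff && iid[4] == 0xfe);
	assert(iid[5] == 0x01 && iid[6] == 0x02 && iid[7] == 0x03);
}
#endif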
/* ARGSUSED */
static boolean_t
ip_nodef_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr)
{
	return (B_FALSE);
}

/* ARGSUSED */
static boolean_t
ip_ether_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr,
    uint32_t *hw_start, in6_addr_t *v6_extract_mask)
{
	/*
	 * Multicast address mappings used over Ethernet/802.X.
	 * This address is used as a base for mappings.
	 */
	static uint8_t ipv6_g_phys_multi_addr[] = {0x33, 0x33, 0x00,
	    0x00, 0x00, 0x00};

	/*
	 * Extract low order 32 bits from IPv6 multicast address.
	 * Or that into the link layer address, starting from the
	 * second byte.
	 */
	*hw_start = 2;
	v6_extract_mask->s6_addr32[0] = 0;
	v6_extract_mask->s6_addr32[1] = 0;
	v6_extract_mask->s6_addr32[2] = 0;
	v6_extract_mask->s6_addr32[3] = 0xffffffffU;
	bcopy(ipv6_g_phys_multi_addr, maddr, lla_length);
	return (B_TRUE);
}

/*
 * Indicate by return value whether multicast is supported. If not,
 * this code should not touch/change any parameters.
 */
/* ARGSUSED */
static boolean_t
ip_ether_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr,
    uint32_t *hw_start, ipaddr_t *extract_mask)
{
	/*
	 * Multicast address mappings used over Ethernet/802.X.
	 * This address is used as a base for mappings.
	 */
	static uint8_t ip_g_phys_multi_addr[] = { 0x01, 0x00, 0x5e,
	    0x00, 0x00, 0x00 };

	if (phys_length != ETHERADDRL)
		return (B_FALSE);

	*extract_mask = htonl(0x007fffff);
	*hw_start = 2;
	bcopy(ip_g_phys_multi_addr, maddr, ETHERADDRL);
	return (B_TRUE);
}
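/*
 * Illustrative sketch (an editorial addition, not part of the original
 * source): the mask and offset set up by ip_ether_v4mapinfo above give
 * the classic RFC 1112 mapping, where the low 23 bits of the group
 * address are OR'd into the 01:00:5e:00:00:00 base starting at byte 2.
 * The #if 0 block works through 224.10.8.5 -> 01:00:5e:0a:08:05.
 */
#if 0
#include <netinet/in.h>
#include <arpa/inet.h>
#include <assert.h>
#include <stdint.h>

static void
v4_mcast_map_demo(void)
{
	uint8_t maddr[6] = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x00 };
	uint32_t group = ntohl(inet_addr("224.10.8.5"));
	uint32_t low23 = group & 0x007fffff;	/* the extract mask above */

	/* OR the masked word in, starting at byte 2 (hw_start). */
	maddr[2] |= (low23 >> 24) & 0xff;	/* always 0: 23-bit mask */
	maddr[3] |= (low23 >> 16) & 0xff;
	maddr[4] |= (low23 >> 8) & 0xff;
	maddr[5] |= low23 & 0xff;

	assert(maddr[3] == 0x0a && maddr[4] == 0x08 && maddr[5] == 0x05);
}
#endif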
/*
 * Derive IPoIB interface id from the link layer address.
 */
static boolean_t
ip_ib_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr)
{
	char *addr;

	if (phys_length != 20)
		return (B_FALSE);
	addr = (char *)&v6addr->s6_addr32[2];
	bcopy(phys_addr + 12, addr, 8);
	/*
	 * In IBA 1.1 timeframe, some vendors erroneously set the u/l bit
	 * in the globally assigned EUI-64 GUID to 1, in violation of IEEE
	 * rules. In these cases, the IBA considers these GUIDs to be in
	 * "Modified EUI-64" format, and thus toggling the u/l bit is not
	 * required; vendors are required not to assign global EUI-64's
	 * that differ only in u/l bit values, thus guaranteeing uniqueness
	 * of the interface identifier. Whether the GUID is in modified
	 * or proper EUI-64 format, the ipv6 identifier must have the u/l
	 * bit set to 1.
	 */
	addr[0] |= 2;			/* Set Universal/Local bit to 1 */
	return (B_TRUE);
}

/*
 * Note on mapping from multicast IP addresses to IPoIB multicast link
 * addresses. IPoIB multicast link addresses are based on IBA link addresses.
 * The format of an IPoIB multicast address is:
 *
 *	4 byte QPN	Scope Sign.	Pkey
 * +--------------------------------------------+
 * | 00FFFFFF | FF | 1X | X01B |  Pkey | GroupID |
 * +--------------------------------------------+
 *
 * The Scope and Pkey components are properties of the IBA port and
 * network interface. They can be ascertained from the broadcast address.
 * The Sign. part is the signature, and is 401B for IPv4 and 601B for IPv6.
 */
static boolean_t
ip_ib_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr,
    uint32_t *hw_start, in6_addr_t *v6_extract_mask)
{
	/*
	 * Base IPoIB IPv6 multicast address used for mappings.
	 * Does not contain the IBA scope/Pkey values.
	 */
	static uint8_t ipv6_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff,
	    0xff, 0x10, 0x60, 0x1b, 0x00, 0x00, 0x00, 0x00,
	    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };

	/*
	 * Extract low order 80 bits from IPv6 multicast address.
	 * Or that into the link layer address, starting from the
	 * sixth byte.
	 */
	*hw_start = 6;
	bcopy(ipv6_g_phys_ibmulti_addr, maddr, lla_length);

	/*
	 * Now fill in the IBA scope/Pkey values from the broadcast address.
	 */
	*(maddr + 5) = *(bphys_addr + 5);
	*(maddr + 8) = *(bphys_addr + 8);
	*(maddr + 9) = *(bphys_addr + 9);

	v6_extract_mask->s6_addr32[0] = 0;
	v6_extract_mask->s6_addr32[1] = htonl(0x0000ffff);
	v6_extract_mask->s6_addr32[2] = 0xffffffffU;
	v6_extract_mask->s6_addr32[3] = 0xffffffffU;
	return (B_TRUE);
}

static boolean_t
ip_ib_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr,
    uint32_t *hw_start, ipaddr_t *extract_mask)
{
	/*
	 * Base IPoIB IPv4 multicast address used for mappings.
	 * Does not contain the IBA scope/Pkey values.
	 */
	static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff,
	    0xff, 0x10, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
	    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };

	if (phys_length != sizeof (ipv4_g_phys_ibmulti_addr))
		return (B_FALSE);

	/*
	 * Extract low order 28 bits from IPv4 multicast address.
	 * Or that into the link layer address, starting from the
	 * sixteenth byte.
	 */
	*extract_mask = htonl(0x0fffffff);
	*hw_start = 16;
	bcopy(ipv4_g_phys_ibmulti_addr, maddr, phys_length);

	/*
	 * Now fill in the IBA scope/Pkey values from the broadcast address.
	 */
	*(maddr + 5) = *(bphys_addr + 5);
	*(maddr + 8) = *(bphys_addr + 8);
	*(maddr + 9) = *(bphys_addr + 9);
	return (B_TRUE);
}

/*
 * Returns B_TRUE if an ipif is present in the given zone, matching some flags
 * (typically IPIF_UP). If ipifp is non-null, the held ipif is returned there.
 * This works for both IPv4 and IPv6; if the passed-in ill is v6, the ipif with
 * the link-local address is preferred.
 */
boolean_t
ipif_lookup_zoneid(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp)
{
	ipif_t *ipif;
	ipif_t *maybe_ipif = NULL;

	mutex_enter(&ill->ill_lock);
	if (ill->ill_state_flags & ILL_CONDEMNED) {
		mutex_exit(&ill->ill_lock);
		if (ipifp != NULL)
			*ipifp = NULL;
		return (B_FALSE);
	}
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		if (!IPIF_CAN_LOOKUP(ipif))
			continue;
		if (zoneid != ALL_ZONES && ipif->ipif_zoneid != zoneid &&
		    ipif->ipif_zoneid != ALL_ZONES)
			continue;
		if ((ipif->ipif_flags & flags) != flags)
			continue;

		if (ipifp == NULL) {
			mutex_exit(&ill->ill_lock);
			ASSERT(maybe_ipif == NULL);
			return (B_TRUE);
		}
		if (!ill->ill_isv6 ||
		    IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6src_addr)) {
			ipif_refhold_locked(ipif);
			mutex_exit(&ill->ill_lock);
			*ipifp = ipif;
			return (B_TRUE);
		}
		if (maybe_ipif == NULL)
			maybe_ipif = ipif;
	}
	if (ipifp != NULL) {
		if (maybe_ipif != NULL)
			ipif_refhold_locked(maybe_ipif);
		*ipifp = maybe_ipif;
	}
	mutex_exit(&ill->ill_lock);
	return (maybe_ipif != NULL);
}

/*
 * Same as ipif_lookup_zoneid() but looks at all the ills in the same group.
 */
boolean_t
ipif_lookup_zoneid_group(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp)
{
	ill_t *illg;

	/*
	 * We look at the passed-in ill first without grabbing ill_g_lock.
	 */
	if (ipif_lookup_zoneid(ill, zoneid, flags, ipifp)) {
		return (B_TRUE);
	}
	rw_enter(&ill_g_lock, RW_READER);
	if (ill->ill_group == NULL) {
		/* ill not in a group */
		rw_exit(&ill_g_lock);
		return (B_FALSE);
	}

	/*
	 * There's no ipif in the zone on ill; however, ill is part of an IPMP
	 * group, so we need to look for an ipif in the zone on all the ills
	 * in the group.
	 */
	illg = ill->ill_group->illgrp_ill;
	do {
		/*
		 * We don't call ipif_lookup_zoneid() on ill as we already know
		 * that it's not there.
		 */
		if (illg != ill &&
		    ipif_lookup_zoneid(illg, zoneid, flags, ipifp)) {
			break;
		}
	} while ((illg = illg->ill_group_next) != NULL);
	rw_exit(&ill_g_lock);
	return (illg != NULL);
}

/*
 * Check if this ill is only being used to send ICMP probes for IPMP.
 */
boolean_t
ill_is_probeonly(ill_t *ill)
{
	/*
	 * Check if the interface is FAILED or INACTIVE.
	 */
	if (ill->ill_phyint->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE))
		return (B_TRUE);

	return (B_FALSE);
}

/*
 * Return a pointer to an ipif_t given a combination of (ill_idx, ipif_id).
 * If a pointer to an ipif_t is returned then the caller will need to do
 * an ipif_refrele().
 */
ipif_t *
ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6)
{
	ipif_t *ipif;
	ill_t *ill;

	ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL);
	if (ill == NULL)
		return (NULL);

	mutex_enter(&ill->ill_lock);
	if (ill->ill_state_flags & ILL_CONDEMNED) {
		mutex_exit(&ill->ill_lock);
		ill_refrele(ill);
		return (NULL);
	}

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		if (!IPIF_CAN_LOOKUP(ipif))
			continue;
		if (lifidx == ipif->ipif_id) {
			ipif_refhold_locked(ipif);
			break;
		}
	}

	mutex_exit(&ill->ill_lock);
	ill_refrele(ill);
	return (ipif);
}