/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/* Copyright (c) 1990 Mentat Inc. */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains the interface control functions for IP.
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/dlpi.h>
#include <sys/stropts.h>
#include <sys/strsun.h>
#include <sys/sysmacros.h>
#include <sys/strlog.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/debug.h>
#include <sys/zone.h>
#include <sys/sunldi.h>
#include <sys/file.h>

#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/isa_defs.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_types.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <sys/sockio.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet/igmp_var.h>
#include <sys/policy.h>
#include <sys/ethernet.h>

#include <inet/common.h>	/* for various inet/mi.h and inet/nd.h needs */
#include <inet/mi.h>
#include <inet/nd.h>
#include <inet/arp.h>
#include <inet/mib2.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/tcp.h>
#include <inet/ip_multi.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ip_if.h>
#include <inet/ip_impl.h>
#include <inet/tun.h>
#include <inet/sctp_ip.h>
#include <inet/ip_netinfo.h>

#include <net/pfkeyv2.h>
#include <inet/ipsec_info.h>
#include <inet/sadb.h>
#include <inet/ipsec_impl.h>
#include <sys/iphada.h>

#include <netinet/igmp.h>
#include <inet/ip_listutils.h>
#include <inet/ipclassifier.h>
#include <sys/mac.h>

#include <sys/systeminfo.h>
#include <sys/bootconf.h>

#include <sys/tsol/tndb.h>
#include <sys/tsol/tnet.h>

/* The character which tells where the ill_name ends */
#define	IPIF_SEPARATOR_CHAR	':'

/* IP ioctl function table entry */
typedef struct ipft_s {
	int	ipft_cmd;
	pfi_t	ipft_pfi;
	int	ipft_min_size;
	int	ipft_flags;
} ipft_t;
#define	IPFT_F_NO_REPLY		0x1	/* IP ioctl does not expect any reply */
#define	IPFT_F_SELF_REPLY	0x2	/* ioctl callee does the ioctl reply */
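/*
 * Illustrative note (not in the original source): a dispatcher for this
 * table scans ip_ioctl_ftbl (defined below) for an entry whose ipft_cmd
 * matches the ioctl, verifies the payload is at least ipft_min_size
 * bytes, and calls ipft_pfi.  A reply is generated unless
 * IPFT_F_NO_REPLY is set; with IPFT_F_SELF_REPLY the callee sends its
 * own reply.  See ip_wput_ioctl().
 */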
typedef struct ip_sock_ar_s {
	union {
		area_t	ip_sock_area;
		ared_t	ip_sock_ared;
		areq_t	ip_sock_areq;
	} ip_sock_ar_u;
	queue_t	*ip_sock_ar_q;
} ip_sock_ar_t;

static int	nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int	nd_ill_forward_set(queue_t *q, mblk_t *mp,
		    char *value, caddr_t cp, cred_t *ioc_cr);

static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask);
static ip_m_t	*ip_m_lookup(t_uscalar_t mac_type);
static int	ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp);
static int	ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_arp_common(ill_t *ill, queue_t *q, mblk_t *mp,
    sin_t *sin, boolean_t x_arp_ioctl, boolean_t if_arp_ioctl);
static ipaddr_t	ip_subnet_mask(ipaddr_t addr, ipif_t **, ip_stack_t *);
static void	ip_wput_ioctl(queue_t *q, mblk_t *mp);
static void	ipsq_flush(ill_t *ill);

static int	ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static void	ipsq_delete(ipsq_t *);

static ipif_t	*ipif_allocate(ill_t *ill, int id, uint_t ire_type,
    boolean_t initialize);
static void	ipif_check_bcast_ires(ipif_t *test_ipif);
static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif,
    boolean_t isv6);
static void	ipif_down_delete_ire(ire_t *ire, char *ipif);
static void	ipif_delete_cache_ire(ire_t *, char *);
static int	ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp);
static void	ipif_free(ipif_t *ipif);
static void	ipif_free_tail(ipif_t *ipif);
static void	ipif_mtu_change(ire_t *ire, char *ipif_arg);
static void	ipif_multicast_down(ipif_t *ipif);
static void	ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif);
static void	ipif_set_default(ipif_t *ipif);
static int	ipif_set_values(queue_t *q, mblk_t *mp,
    char *interf_name, uint_t *ppa);
static int	ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp,
    queue_t *q);
static ipif_t	*ipif_lookup_on_name(char *name, size_t namelen,
    boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid,
    queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *);
static int	ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp);
static void	ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp);

static int	ill_alloc_ppa(ill_if_t *, ill_t *);
static int	ill_arp_off(ill_t *ill);
static int	ill_arp_on(ill_t *ill);
static void	ill_delete_interface_type(ill_if_t *);
static int	ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q);
static void	ill_dl_down(ill_t *ill);
static void	ill_down(ill_t *ill);
static void	ill_downi(ire_t *ire, char *ill_arg);
static void	ill_downi_mrtun_srcif(ire_t *ire, char *ill_arg);
static void	ill_down_tail(ill_t *ill);
static void	ill_free_mib(ill_t *ill);
static void	ill_glist_delete(ill_t *);
static boolean_t ill_has_usable_ipif(ill_t *);
static int	ill_lock_ipsq_ills(ipsq_t *sq, ill_t **list, int);
static void	ill_nominate_bcast_rcv(ill_group_t *illgrp);
static void	ill_phyint_free(ill_t *ill);
static void	ill_phyint_reinit(ill_t *ill);
static void	ill_set_nce_router_flags(ill_t *, boolean_t);
static void	ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *);
static void	ill_signal_ipsq_ills(ipsq_t *, boolean_t);
static boolean_t ill_split_ipsq(ipsq_t *cur_sq);
static void	ill_stq_cache_delete(ire_t *, char *);

static boolean_t ip_ether_v6intfid(uint_t, uint8_t *, in6_addr_t *);
static boolean_t ip_nodef_v6intfid(uint_t, uint8_t *, in6_addr_t *);
static boolean_t ip_ether_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
    in6_addr_t *);
static boolean_t ip_ether_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
    ipaddr_t *);
static boolean_t ip_ib_v6intfid(uint_t, uint8_t *, in6_addr_t *);
static boolean_t ip_ib_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
    in6_addr_t *);
static boolean_t ip_ib_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
    ipaddr_t *);

static void	ipif_save_ire(ipif_t *, ire_t *);
static void	ipif_remove_ire(ipif_t *, ire_t *);
static void	ip_cgtp_bcast_add(ire_t *, ire_t *, ip_stack_t *);
static void	ip_cgtp_bcast_delete(ire_t *, ip_stack_t *);

/*
 * Per-ill IPsec capabilities management.
 */
static ill_ipsec_capab_t *ill_ipsec_capab_alloc(void);
static void	ill_ipsec_capab_free(ill_ipsec_capab_t *);
static void	ill_ipsec_capab_add(ill_t *, uint_t, boolean_t);
static void	ill_ipsec_capab_delete(ill_t *, uint_t);
static boolean_t ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *, int);
static void	ill_capability_proto(ill_t *, int, mblk_t *);
static void	ill_capability_dispatch(ill_t *, mblk_t *,
    dl_capability_sub_t *, boolean_t);
static void	ill_capability_id_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void	ill_capability_mdt_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void	ill_capability_mdt_reset(ill_t *, mblk_t **);
static void	ill_capability_ipsec_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void	ill_capability_ipsec_reset(ill_t *, mblk_t **);
static void	ill_capability_hcksum_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void	ill_capability_hcksum_reset(ill_t *, mblk_t **);
static void	ill_capability_zerocopy_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void	ill_capability_zerocopy_reset(ill_t *, mblk_t **);
static void	ill_capability_lso_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void	ill_capability_lso_reset(ill_t *, mblk_t **);
static void	ill_capability_dls_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static mac_resource_handle_t ill_ring_add(void *, mac_resource_t *);
static void	ill_capability_dls_reset(ill_t *, mblk_t **);
static void	ill_capability_dls_disable(ill_t *);

static void	illgrp_cache_delete(ire_t *, char *);
static void	illgrp_delete(ill_t *ill);
static void	illgrp_reset_schednext(ill_t *ill);

static ill_t	*ill_prev_usesrc(ill_t *);
static int	ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t);
static void	ill_disband_usesrc_group(ill_t *);

static void	conn_cleanup_stale_ire(conn_t *, caddr_t);

/*
 * If we go over the memory footprint limit more than once in this msec
 * interval, we'll start pruning aggressively.
 */
int ip_min_frag_prune_time = 0;

/*
 * Maximum number of IPsec algorithms supported.  Limited to 1 byte by
 * PF_KEY and the IPsec DOI.
 */
#define	MAX_IPSEC_ALGS	256

#define	BITSPERBYTE	8
#define	BITS(type)	(BITSPERBYTE * (long)sizeof (type))

#define	IPSEC_ALG_ENABLE(algs, algid) \
		((algs)[(algid) / BITS(ipsec_capab_elem_t)] |= \
		(1 << ((algid) % BITS(ipsec_capab_elem_t))))

#define	IPSEC_ALG_IS_ENABLED(algid, algs) \
		((algs)[(algid) / BITS(ipsec_capab_elem_t)] & \
		(1 << ((algid) % BITS(ipsec_capab_elem_t))))

typedef uint8_t ipsec_capab_elem_t;

/*
 * Per-algorithm parameters.  Note that at present, only encryption
 * algorithms have variable keysize (IKE does not provide a way to negotiate
 * auth algorithm keysize).
 *
 * All sizes here are in bits.
 */
typedef struct
{
	uint16_t	minkeylen;
	uint16_t	maxkeylen;
} ipsec_capab_algparm_t;

/*
 * Per-ill capabilities.
 */
struct ill_ipsec_capab_s {
	ipsec_capab_elem_t *encr_hw_algs;
	ipsec_capab_elem_t *auth_hw_algs;
	uint32_t algs_size;	/* size of _hw_algs in bytes */
	/* algorithm key lengths */
	ipsec_capab_algparm_t *encr_algparm;
	uint32_t encr_algparm_size;
	uint32_t encr_algparm_end;
};
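/*
 * Worked example (added for clarity): with ipsec_capab_elem_t being a
 * uint8_t, BITS(ipsec_capab_elem_t) is 8, so IPSEC_ALG_ENABLE(algs, 10)
 * expands to algs[1] |= (1 << 2): element 10 / 8 == 1, bit 10 % 8 == 2
 * of the bitmap.
 */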
/*
 * The field values are larger than strictly necessary for simple
 * AR_ENTRY_ADDs but the padding lets us accommodate the socket ioctls.
 */
static area_t	ip_area_template = {
	AR_ENTRY_ADD,			/* area_cmd */
	sizeof (ip_sock_ar_t) + (IP_ADDR_LEN*2) + sizeof (struct sockaddr_dl),
					/* area_name_offset */
	/* area_name_length temporarily holds this structure length */
	sizeof (area_t),		/* area_name_length */
	IP_ARP_PROTO_TYPE,		/* area_proto */
	sizeof (ip_sock_ar_t),		/* area_proto_addr_offset */
	IP_ADDR_LEN,			/* area_proto_addr_length */
	sizeof (ip_sock_ar_t) + IP_ADDR_LEN,
					/* area_proto_mask_offset */
	0,				/* area_flags */
	sizeof (ip_sock_ar_t) + IP_ADDR_LEN + IP_ADDR_LEN,
					/* area_hw_addr_offset */
	/* Zero length hw_addr_length means 'use your idea of the address' */
	0				/* area_hw_addr_length */
};

/*
 * AR_ENTRY_ADD/DELETE templates have been added for IPv6 external resolver
 * support.
 */
static area_t	ip6_area_template = {
	AR_ENTRY_ADD,			/* area_cmd */
	sizeof (ip_sock_ar_t) + (IPV6_ADDR_LEN*2) + sizeof (sin6_t),
					/* area_name_offset */
	/* area_name_length temporarily holds this structure length */
	sizeof (area_t),		/* area_name_length */
	IP_ARP_PROTO_TYPE,		/* area_proto */
	sizeof (ip_sock_ar_t),		/* area_proto_addr_offset */
	IPV6_ADDR_LEN,			/* area_proto_addr_length */
	sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN,
					/* area_proto_mask_offset */
	0,				/* area_flags */
	sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN + IPV6_ADDR_LEN,
					/* area_hw_addr_offset */
	/* Zero length hw_addr_length means 'use your idea of the address' */
	0				/* area_hw_addr_length */
};

static ared_t	ip_ared_template = {
	AR_ENTRY_DELETE,
	sizeof (ared_t) + IP_ADDR_LEN,
	sizeof (ared_t),
	IP_ARP_PROTO_TYPE,
	sizeof (ared_t),
	IP_ADDR_LEN
};

static ared_t	ip6_ared_template = {
	AR_ENTRY_DELETE,
	sizeof (ared_t) + IPV6_ADDR_LEN,
	sizeof (ared_t),
	IP_ARP_PROTO_TYPE,
	sizeof (ared_t),
	IPV6_ADDR_LEN
};

/*
 * An IPv6 AR_ENTRY_QUERY template has not been created, as the areq
 * doesn't include an IP address in ill_dl_up() (the only place an areq
 * is used).
 */
static areq_t	ip_areq_template = {
	AR_ENTRY_QUERY,			/* cmd */
	sizeof (areq_t)+(2*IP_ADDR_LEN),	/* name offset */
	sizeof (areq_t),	/* name len (filled by ill_arp_alloc) */
	IP_ARP_PROTO_TYPE,	/* protocol, from ARP's perspective */
	sizeof (areq_t),	/* target addr offset */
	IP_ADDR_LEN,		/* target addr_length */
	0,			/* flags */
	sizeof (areq_t) + IP_ADDR_LEN,	/* sender addr offset */
	IP_ADDR_LEN,		/* sender addr length */
	AR_EQ_DEFAULT_XMIT_COUNT,	/* xmit_count */
	AR_EQ_DEFAULT_XMIT_INTERVAL,	/* (re)xmit_interval in milliseconds */
	AR_EQ_DEFAULT_MAX_BUFFERED	/* max # of requests to buffer */
	/* anything else filled in by the code */
};

static arc_t	ip_aru_template = {
	AR_INTERFACE_UP,
	sizeof (arc_t),		/* Name offset */
	sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
};

static arc_t	ip_ard_template = {
	AR_INTERFACE_DOWN,
	sizeof (arc_t),		/* Name offset */
	sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
};

static arc_t	ip_aron_template = {
	AR_INTERFACE_ON,
	sizeof (arc_t),		/* Name offset */
	sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
};

static arc_t	ip_aroff_template = {
	AR_INTERFACE_OFF,
	sizeof (arc_t),		/* Name offset */
	sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
};

static arma_t	ip_arma_multi_template = {
	AR_MAPPING_ADD,
	sizeof (arma_t) + 3*IP_ADDR_LEN + IP_MAX_HW_LEN,
				/* Name offset */
	sizeof (arma_t),	/* Name length (set by ill_arp_alloc) */
	IP_ARP_PROTO_TYPE,
	sizeof (arma_t),			/* proto_addr_offset */
	IP_ADDR_LEN,				/* proto_addr_length */
	sizeof (arma_t) + IP_ADDR_LEN,		/* proto_mask_offset */
	sizeof (arma_t) + 2*IP_ADDR_LEN,	/* proto_extract_mask_offset */
	ACE_F_PERMANENT | ACE_F_MAPPING,	/* flags */
	sizeof (arma_t) + 3*IP_ADDR_LEN,	/* hw_addr_offset */
	IP_MAX_HW_LEN,				/* hw_addr_length */
	0,					/* hw_mapping_start */
};

static ipft_t	ip_ioctl_ftbl[] = {
	{ IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 },
	{ IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t),
		IPFT_F_NO_REPLY },
	{ IP_IOC_IRE_ADVISE_NO_REPLY, ip_ire_advise, sizeof (ipic_t),
		IPFT_F_NO_REPLY },
	{ IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY },
	{ 0 }
};

/* Simple ICMP IP Header Template */
static ipha_t icmp_ipha = {
	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
};

/* Flag descriptors for ip_ipif_report */
static nv_t	ipif_nv_tbl[] = {
	{ IPIF_UP,		"UP" },
	{ IPIF_BROADCAST,	"BROADCAST" },
	{ ILLF_DEBUG,		"DEBUG" },
	{ PHYI_LOOPBACK,	"LOOPBACK" },
	{ IPIF_POINTOPOINT,	"POINTOPOINT" },
	{ ILLF_NOTRAILERS,	"NOTRAILERS" },
	{ PHYI_RUNNING,		"RUNNING" },
	{ ILLF_NOARP,		"NOARP" },
	{ PHYI_PROMISC,		"PROMISC" },
	{ PHYI_ALLMULTI,	"ALLMULTI" },
	{ PHYI_INTELLIGENT,	"INTELLIGENT" },
	{ ILLF_MULTICAST,	"MULTICAST" },
	{ PHYI_MULTI_BCAST,	"MULTI_BCAST" },
	{ IPIF_UNNUMBERED,	"UNNUMBERED" },
	{ IPIF_DHCPRUNNING,	"DHCP" },
	{ IPIF_PRIVATE,		"PRIVATE" },
	{ IPIF_NOXMIT,		"NOXMIT" },
	{ IPIF_NOLOCAL,		"NOLOCAL" },
	{ IPIF_DEPRECATED,	"DEPRECATED" },
	{ IPIF_PREFERRED,	"PREFERRED" },
	{ IPIF_TEMPORARY,	"TEMPORARY" },
	{ IPIF_ADDRCONF,	"ADDRCONF" },
	{ PHYI_VIRTUAL,		"VIRTUAL" },
	{ ILLF_ROUTER,		"ROUTER" },
	{ ILLF_NONUD,		"NONUD" },
	{ IPIF_ANYCAST,		"ANYCAST" },
	{ ILLF_NORTEXCH,	"NORTEXCH" },
	{ ILLF_IPV4,		"IPV4" },
	{ ILLF_IPV6,		"IPV6" },
	{ IPIF_MIPRUNNING,	"MIP" },
	{ IPIF_NOFAILOVER,	"NOFAILOVER" },
	{ PHYI_FAILED,		"FAILED" },
	{ PHYI_STANDBY,		"STANDBY" },
	{ PHYI_INACTIVE,	"INACTIVE" },
	{ PHYI_OFFLINE,		"OFFLINE" },
};

static uchar_t	ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };

static ip_m_t	ip_m_tbl[] = {
	{ DL_ETHER, IFT_ETHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
	    ip_ether_v6intfid },
	{ DL_CSMACD, IFT_ISO88023, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
	    ip_nodef_v6intfid },
	{ DL_TPB, IFT_ISO88024, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
	    ip_nodef_v6intfid },
	{ DL_TPR, IFT_ISO88025, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
	    ip_nodef_v6intfid },
	{ DL_FDDI, IFT_FDDI, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
	    ip_ether_v6intfid },
	{ DL_IB, IFT_IB, ip_ib_v4mapinfo, ip_ib_v6mapinfo,
	    ip_ib_v6intfid },
	{ SUNW_DL_VNI, IFT_OTHER, NULL, NULL, NULL },
	{ DL_OTHER, IFT_OTHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
	    ip_nodef_v6intfid }
};

static ill_t	ill_null;		/* Empty ILL for init. */
char	ipif_loopback_name[] = "lo0";
static char *ipv4_forward_suffix = ":ip_forwarding";
static char *ipv6_forward_suffix = ":ip6_forwarding";
static	sin6_t	sin6_null;	/* Zero address for quick clears */
static	sin_t	sin_null;	/* Zero address for quick clears */

/* When set, search for an unused ipif_seqid */
static ipif_t	ipif_zero;

/*
 * The ppa arena is created after this many
 * interfaces have been plumbed.
 */
uint_t	ill_no_arena = 12;	/* Settable in /etc/system */

/*
 * Enable soft rings if ip_squeue_soft_ring or ip_squeue_fanout
 * is set and ip_soft_rings_cnt > 0.  ip_squeue_soft_ring is
 * set through platform specific code (Niagara/Ontario).
 */
#define	SOFT_RINGS_ENABLED()	(ip_soft_rings_cnt ? \
	(ip_squeue_soft_ring || ip_squeue_fanout) : B_FALSE)

#define	ILL_CAPAB_DLS	(ILL_CAPAB_SOFT_RING | ILL_CAPAB_POLL)
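/*
 * Added note: ipif_rand() below is the classic ANSI C linear
 * congruential generator (seed = seed * 1103515245 + 12345), returning
 * 15 pseudo-random bits from the high half of the seed; a cheap,
 * non-cryptographic sequence is all that is needed here.
 */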
static uint_t
ipif_rand(ip_stack_t *ipst)
{
	ipst->ips_ipif_src_random = ipst->ips_ipif_src_random * 1103515245 +
	    12345;
	return ((ipst->ips_ipif_src_random >> 16) & 0x7fff);
}

/*
 * Allocate per-interface mibs.
 * Returns B_TRUE if ok, B_FALSE otherwise.
 * ipsq may not yet be allocated (loopback case).
 */
static boolean_t
ill_allocate_mibs(ill_t *ill)
{
	/* Already allocated? */
	if (ill->ill_ip_mib != NULL) {
		if (ill->ill_isv6)
			ASSERT(ill->ill_icmp6_mib != NULL);
		return (B_TRUE);
	}

	ill->ill_ip_mib = kmem_zalloc(sizeof (*ill->ill_ip_mib),
	    KM_NOSLEEP);
	if (ill->ill_ip_mib == NULL) {
		return (B_FALSE);
	}

	/* Setup static information */
	SET_MIB(ill->ill_ip_mib->ipIfStatsEntrySize,
	    sizeof (mib2_ipIfStatsEntry_t));
	if (ill->ill_isv6) {
		ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6;
		SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
		    sizeof (mib2_ipv6AddrEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
		    sizeof (mib2_ipv6RouteEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
		    sizeof (mib2_ipv6NetToMediaEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
		    sizeof (ipv6_member_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
		    sizeof (ipv6_grpsrc_t));
	} else {
		ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4;
		SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
		    sizeof (mib2_ipAddrEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
		    sizeof (mib2_ipRouteEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
		    sizeof (mib2_ipNetToMediaEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
		    sizeof (ip_member_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
		    sizeof (ip_grpsrc_t));

		/*
		 * For a v4 ill, we are done at this point, because per-ill
		 * icmp mibs are only used for v6.
		 */
		return (B_TRUE);
	}

	ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib),
	    KM_NOSLEEP);
	if (ill->ill_icmp6_mib == NULL) {
		kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
		ill->ill_ip_mib = NULL;
		return (B_FALSE);
	}
	/* static icmp info */
	ill->ill_icmp6_mib->ipv6IfIcmpEntrySize =
	    sizeof (mib2_ipv6IfIcmpEntry_t);
	/*
	 * The ipIfStatsIfindex and ipv6IfIcmpIndex will be assigned later
	 * after the phyint merge occurs in ipif_set_values -> ill_glist_insert
	 * -> ill_phyint_reinit
	 */
	return (B_TRUE);
}
/*
 * Common code for preparation of ARP commands.  Two points to remember:
 *	1) The ill_name is tacked on at the end of the allocated space so
 *	   the template's name_offset field must contain the total space
 *	   to allocate less the name length.
 *
 *	2) The template's name_length field should contain the *template*
 *	   length.  We use it as a parameter to bcopy() and then write
 *	   the real ill_name_length into the name_length field of the copy.
 * (Always called as writer.)
 */
mblk_t *
ill_arp_alloc(ill_t *ill, uchar_t *template, caddr_t addr)
{
	arc_t	*arc = (arc_t *)template;
	char	*cp;
	int	len;
	mblk_t	*mp;
	uint_t	name_length = ill->ill_name_length;
	uint_t	template_len = arc->arc_name_length;

	len = arc->arc_name_offset + name_length;
	mp = allocb(len, BPRI_HI);
	if (mp == NULL)
		return (NULL);
	cp = (char *)mp->b_rptr;
	mp->b_wptr = (uchar_t *)&cp[len];
	if (template_len)
		bcopy(template, cp, template_len);
	if (len > template_len)
		bzero(&cp[template_len], len - template_len);
	mp->b_datap->db_type = M_PROTO;

	arc = (arc_t *)cp;
	arc->arc_name_length = name_length;
	cp = (char *)arc + arc->arc_name_offset;
	bcopy(ill->ill_name, cp, name_length);

	if (addr) {
		area_t	*area = (area_t *)mp->b_rptr;

		cp = (char *)area + area->area_proto_addr_offset;
		bcopy(addr, cp, area->area_proto_addr_length);
		if (area->area_cmd == AR_ENTRY_ADD) {
			cp = (char *)area;
			len = area->area_proto_addr_length;
			if (area->area_proto_mask_offset)
				cp += area->area_proto_mask_offset;
			else
				cp += area->area_proto_addr_offset + len;
			while (len-- > 0)
				*cp++ = (char)~0;
		}
	}
	return (mp);
}
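/*
 * Added illustration (not in the original source): for the
 * ip_area_template above, the message ill_arp_alloc() returns is laid
 * out as
 *
 *	[ area_t header (template copy) ]
 *	[ protocol address      @ area_proto_addr_offset ]
 *	[ protocol mask (~0)    @ area_proto_mask_offset ]
 *	[ hardware address slot @ area_hw_addr_offset, zeroed ]
 *	[ ill_name              @ area_name_offset ]
 *
 * Only arc_name_length is rewritten (to the real ill_name_length);
 * all offsets come from the template itself.
 */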
mblk_t *
ipif_area_alloc(ipif_t *ipif)
{
	return (ill_arp_alloc(ipif->ipif_ill, (uchar_t *)&ip_area_template,
	    (char *)&ipif->ipif_lcl_addr));
}

mblk_t *
ipif_ared_alloc(ipif_t *ipif)
{
	return (ill_arp_alloc(ipif->ipif_ill, (uchar_t *)&ip_ared_template,
	    (char *)&ipif->ipif_lcl_addr));
}

mblk_t *
ill_ared_alloc(ill_t *ill, ipaddr_t addr)
{
	return (ill_arp_alloc(ill, (uchar_t *)&ip_ared_template,
	    (char *)&addr));
}

/*
 * Completely vaporize a lower level tap and all associated interfaces.
 * ill_delete is called only out of ip_close when the device control
 * stream is being closed.
 */
void
ill_delete(ill_t *ill)
{
	ipif_t	*ipif;
	ill_t	*prev_ill;
	ip_stack_t	*ipst = ill->ill_ipst;

	/*
	 * ill_delete may be forcibly entering the ipsq.  The previous
	 * ioctl may not have completed and may need to be aborted.
	 * ipsq_flush takes care of it.  If we don't need to enter the
	 * ipsq forcibly, the 2nd invocation of ipsq_flush in
	 * ill_delete_tail is sufficient.
	 */
	ipsq_flush(ill);

	/*
	 * Nuke all interfaces.  ipif_free will take down the interface,
	 * remove it from the list, and free the data structure.
	 * Walk down the ipif list and remove the logical interfaces
	 * first before removing the main ipif.  We can't unplumb
	 * zeroth interface first in the case of IPv6 as reset_conn_ill
	 * -> ip_ll_delmulti_v6 de-references ill_ipif for checking
	 * POINTOPOINT.
	 *
	 * If ill_ipif was not properly initialized (i.e., low on memory),
	 * then there are no interfaces to clean up.  In this case just
	 * clean up the ill.
	 */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		ipif_free(ipif);

	/*
	 * Used only by ill_arp_on and ill_arp_off, which are writers.
	 * So nobody can be using this mp now.  Free the mp allocated for
	 * honoring ILLF_NOARP.
	 */
	freemsg(ill->ill_arp_on_mp);
	ill->ill_arp_on_mp = NULL;

	/* Clean up msgs on pending upcalls for mrouted */
	reset_mrt_ill(ill);

	/*
	 * ipif_free -> reset_conn_ipif will remove all multicast
	 * references for IPv4.  For IPv6, we need to do it here as
	 * it points only at ills.
	 */
	reset_conn_ill(ill);

	/*
	 * ill_down will arrange to blow off any IREs dependent on this
	 * ILL, and shut down fragmentation reassembly.
	 */
	ill_down(ill);

	/* Let SCTP know, so that it can remove this from its list. */
	sctp_update_ill(ill, SCTP_ILL_REMOVE);

	/*
	 * If an address on this ILL is being used as a source address then
	 * clear out the pointers in other ILLs that point to this ILL.
	 */
	rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
	if (ill->ill_usesrc_grp_next != NULL) {
		if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? */
			ill_disband_usesrc_group(ill);
		} else {	/* consumer of the usesrc ILL */
			prev_ill = ill_prev_usesrc(ill);
			prev_ill->ill_usesrc_grp_next =
			    ill->ill_usesrc_grp_next;
		}
	}
	rw_exit(&ipst->ips_ill_g_usesrc_lock);
}

static void
ipif_non_duplicate(ipif_t *ipif)
{
	ill_t *ill = ipif->ipif_ill;
	mutex_enter(&ill->ill_lock);
	if (ipif->ipif_flags & IPIF_DUPLICATE) {
		ipif->ipif_flags &= ~IPIF_DUPLICATE;
		ASSERT(ill->ill_ipif_dup_count > 0);
		ill->ill_ipif_dup_count--;
	}
	mutex_exit(&ill->ill_lock);
}
/*
 * ill_delete_tail is called from ip_modclose after all references
 * to the closing ill are gone.  The wait is done in ip_modclose.
 */
void
ill_delete_tail(ill_t *ill)
{
	mblk_t	**mpp;
	ipif_t	*ipif;
	ip_stack_t	*ipst = ill->ill_ipst;

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		ipif_non_duplicate(ipif);
		ipif_down_tail(ipif);
	}

	ASSERT(ill->ill_ipif_dup_count == 0 &&
	    ill->ill_arp_down_mp == NULL &&
	    ill->ill_arp_del_mapping_mp == NULL);

	/*
	 * If polling capability is enabled (which signifies direct
	 * upcall into IP and driver has ill saved as a handle),
	 * we need to make sure that unbind has completed before we
	 * let the ill disappear and driver no longer has any reference
	 * to this ill.
	 */
	mutex_enter(&ill->ill_lock);
	while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS)
		cv_wait(&ill->ill_cv, &ill->ill_lock);
	mutex_exit(&ill->ill_lock);

	/*
	 * Clean up polling and soft ring capabilities
	 */
	if (ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING))
		ill_capability_dls_disable(ill);

	if (ill->ill_net_type != IRE_LOOPBACK)
		qprocsoff(ill->ill_rq);

	/*
	 * We do an ipsq_flush once again now.  New messages could have
	 * landed up from below (M_ERROR or M_HANGUP).  Similarly ioctls
	 * could also have landed up if an ioctl thread had looked up
	 * the ill before we set the ILL_CONDEMNED flag, but not yet
	 * enqueued the ioctl when we did the ipsq_flush last time.
	 */
	ipsq_flush(ill);

	/*
	 * Free capabilities.
	 */
	if (ill->ill_ipsec_capab_ah != NULL) {
		ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_AH);
		ill_ipsec_capab_free(ill->ill_ipsec_capab_ah);
		ill->ill_ipsec_capab_ah = NULL;
	}

	if (ill->ill_ipsec_capab_esp != NULL) {
		ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_ESP);
		ill_ipsec_capab_free(ill->ill_ipsec_capab_esp);
		ill->ill_ipsec_capab_esp = NULL;
	}

	if (ill->ill_mdt_capab != NULL) {
		kmem_free(ill->ill_mdt_capab, sizeof (ill_mdt_capab_t));
		ill->ill_mdt_capab = NULL;
	}

	if (ill->ill_hcksum_capab != NULL) {
		kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t));
		ill->ill_hcksum_capab = NULL;
	}

	if (ill->ill_zerocopy_capab != NULL) {
		kmem_free(ill->ill_zerocopy_capab,
		    sizeof (ill_zerocopy_capab_t));
		ill->ill_zerocopy_capab = NULL;
	}

	if (ill->ill_lso_capab != NULL) {
		kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t));
		ill->ill_lso_capab = NULL;
	}

	if (ill->ill_dls_capab != NULL) {
		CONN_DEC_REF(ill->ill_dls_capab->ill_unbind_conn);
		ill->ill_dls_capab->ill_unbind_conn = NULL;
		kmem_free(ill->ill_dls_capab,
		    sizeof (ill_dls_capab_t) +
		    (sizeof (ill_rx_ring_t) * ILL_MAX_RINGS));
		ill->ill_dls_capab = NULL;
	}

	ASSERT(!(ill->ill_capabilities & ILL_CAPAB_POLL));

	while (ill->ill_ipif != NULL)
		ipif_free_tail(ill->ill_ipif);

	ill_down_tail(ill);

	/*
	 * We have removed all references to ilm from conn and the ones joined
	 * within the kernel.
	 *
	 * We don't walk conns, mrts and ires because
	 *
	 * 1) reset_conn_ill and reset_mrt_ill cleans up conns and mrts.
	 * 2) ill_down -> ill_downi walks all the ires and cleans up
	 *    ill references.
	 */
	ASSERT(ilm_walk_ill(ill) == 0);
	/*
	 * Take us out of the list of ILLs.  ill_glist_delete ->
	 * ill_phyint_free could free the phyint.  No more reference to
	 * the phyint after this point.
	 */
	(void) ill_glist_delete(ill);

	rw_enter(&ipst->ips_ip_g_nd_lock, RW_WRITER);
	if (ill->ill_ndd_name != NULL)
		nd_unload(&ipst->ips_ip_g_nd, ill->ill_ndd_name);
	rw_exit(&ipst->ips_ip_g_nd_lock);

	if (ill->ill_frag_ptr != NULL) {
		uint_t count;

		for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
			mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock);
		}
		mi_free(ill->ill_frag_ptr);
		ill->ill_frag_ptr = NULL;
		ill->ill_frag_hash_tbl = NULL;
	}

	freemsg(ill->ill_nd_lla_mp);
	/* Free all retained control messages. */
	mpp = &ill->ill_first_mp_to_free;
	do {
		while (mpp[0]) {
			mblk_t	*mp;
			mblk_t	*mp1;

			mp = mpp[0];
			mpp[0] = mp->b_next;
			for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) {
				mp1->b_next = NULL;
				mp1->b_prev = NULL;
			}
			freemsg(mp);
		}
	} while (mpp++ != &ill->ill_last_mp_to_free);

	ill_free_mib(ill);
	/* Drop refcnt here */
	netstack_rele(ill->ill_ipst->ips_netstack);
	ill->ill_ipst = NULL;

	ILL_TRACE_CLEANUP(ill);
}
static void
ill_free_mib(ill_t *ill)
{
	ip_stack_t *ipst = ill->ill_ipst;

	/*
	 * MIB statistics must not be lost, so when an interface
	 * goes away the counter values will be added to the global
	 * MIBs.
	 */
	if (ill->ill_ip_mib != NULL) {
		if (ill->ill_isv6) {
			ip_mib2_add_ip_stats(&ipst->ips_ip6_mib,
			    ill->ill_ip_mib);
		} else {
			ip_mib2_add_ip_stats(&ipst->ips_ip_mib,
			    ill->ill_ip_mib);
		}

		kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
		ill->ill_ip_mib = NULL;
	}
	if (ill->ill_icmp6_mib != NULL) {
		ip_mib2_add_icmp6_stats(&ipst->ips_icmp6_mib,
		    ill->ill_icmp6_mib);
		kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib));
		ill->ill_icmp6_mib = NULL;
	}
}

/*
 * Concatenate together a physical address and a sap.
 *
 * Sap_lengths are interpreted as follows:
 *   sap_length == 0	==>	no sap
 *   sap_length > 0	==>	sap is at the head of the dlpi address
 *   sap_length < 0	==>	sap is at the tail of the dlpi address
 */
static void
ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length,
    t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst)
{
	uint16_t sap_addr = (uint16_t)sap_src;

	if (sap_length == 0) {
		if (phys_src == NULL)
			bzero(dst, phys_length);
		else
			bcopy(phys_src, dst, phys_length);
	} else if (sap_length < 0) {
		if (phys_src == NULL)
			bzero(dst, phys_length);
		else
			bcopy(phys_src, dst, phys_length);
		bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr));
	} else {
		bcopy(&sap_addr, dst, sizeof (sap_addr));
		if (phys_src == NULL)
			bzero((char *)dst + sap_length, phys_length);
		else
			bcopy(phys_src, (char *)dst + sap_length, phys_length);
	}
}

/*
 * Generate a dl_unitdata_req mblk for the device and address given.
 * addr_length is the length of the physical portion of the address.
 * If addr is NULL, include an all-zero address of the specified length.
 * In any case, addr_length is taken to be the entire length of the
 * dlpi address, including the absolute value of sap_length.
 */
mblk_t *
ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap,
    t_scalar_t sap_length)
{
	dl_unitdata_req_t *dlur;
	mblk_t	*mp;
	t_scalar_t	abs_sap_length;		/* absolute value */

	abs_sap_length = ABS(sap_length);
	mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length,
	    DL_UNITDATA_REQ);
	if (mp == NULL)
		return (NULL);
	dlur = (dl_unitdata_req_t *)mp->b_rptr;
	/* HACK: accommodate incompatible DLPI drivers */
	if (addr_length == 8)
		addr_length = 6;
	dlur->dl_dest_addr_length = addr_length + abs_sap_length;
	dlur->dl_dest_addr_offset = sizeof (*dlur);
	dlur->dl_priority.dl_min = 0;
	dlur->dl_priority.dl_max = 0;
	ill_dlur_copy_address(addr, addr_length, sap, sap_length,
	    (uchar_t *)&dlur[1]);
	return (mp);
}
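/*
 * Added example (assuming a typical Ethernet ill, where DLPI reports
 * dl_sap_length == -2): ill_dlur_gen(addr, 6, ETHERTYPE_IP, -2) yields
 * a DL_UNITDATA_REQ whose destination is the 6-byte MAC followed by
 * the 2-byte sap (a negative sap_length puts the sap at the tail), so
 * dl_dest_addr_length == 8.
 */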
/*
 * Add the 'mp' to the list of pending mp's headed by ill_pending_mp.
 * Return an error if we already have 1 or more ioctls in progress.
 * This is used only for non-exclusive ioctls.  Currently this is used
 * for SIOC*ARP and SIOCGTUNPARAM ioctls.  Most set ioctls are exclusive
 * and thus need to use ipsq_pending_mp_add.
 */
boolean_t
ill_pending_mp_add(ill_t *ill, conn_t *connp, mblk_t *add_mp)
{
	ASSERT(MUTEX_HELD(&ill->ill_lock));
	ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
	/*
	 * M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls.
	 */
	ASSERT((add_mp->b_datap->db_type == M_IOCDATA) ||
	    (add_mp->b_datap->db_type == M_IOCTL));

	ASSERT(MUTEX_HELD(&connp->conn_lock));
	/*
	 * Return error if the conn has started closing.  The conn
	 * could have finished cleaning up the pending mp list.  If so,
	 * we should not add another mp to the list negating the cleanup.
	 */
	if (connp->conn_state_flags & CONN_CLOSING)
		return (B_FALSE);
	/*
	 * Add the pending mp to the head of the list, chained by b_next.
	 * Note down the conn on which the ioctl request came, in b_prev.
	 * This will be used to later get the conn, when we get a response
	 * on the ill queue, from some other module (typically arp).
	 */
	add_mp->b_next = (void *)ill->ill_pending_mp;
	add_mp->b_queue = CONNP_TO_WQ(connp);
	ill->ill_pending_mp = add_mp;
	if (connp != NULL)
		connp->conn_oper_pending_ill = ill;
	return (B_TRUE);
}

/*
 * Retrieve the ill_pending_mp and return it.  We have to walk the list
 * of mblks starting at ill_pending_mp, and match based on the ioc_id.
 */
mblk_t *
ill_pending_mp_get(ill_t *ill, conn_t **connpp, uint_t ioc_id)
{
	mblk_t	*prev = NULL;
	mblk_t	*curr = NULL;
	uint_t	id;
	conn_t	*connp;

	/*
	 * When the conn closes, conn_ioctl_cleanup needs to clean
	 * up the pending mp, but it does not know the ioc_id and
	 * passes in a zero for it.
	 */
	mutex_enter(&ill->ill_lock);
	if (ioc_id != 0)
		*connpp = NULL;

	/* Search the list for the appropriate ioctl based on ioc_id */
	for (prev = NULL, curr = ill->ill_pending_mp; curr != NULL;
	    prev = curr, curr = curr->b_next) {
		id = ((struct iocblk *)curr->b_rptr)->ioc_id;
		connp = Q_TO_CONN(curr->b_queue);
		/* Match based on the ioc_id or based on the conn */
		if ((id == ioc_id) || (ioc_id == 0 && connp == *connpp))
			break;
	}

	if (curr != NULL) {
		/* Unlink the mblk from the pending mp list */
		if (prev != NULL) {
			prev->b_next = curr->b_next;
		} else {
			ASSERT(ill->ill_pending_mp == curr);
			ill->ill_pending_mp = curr->b_next;
		}

		/*
		 * conn refcnt must have been bumped up at the start of
		 * the ioctl.  So we can safely access the conn.
		 */
		ASSERT(CONN_Q(curr->b_queue));
		*connpp = Q_TO_CONN(curr->b_queue);
		curr->b_next = NULL;
		curr->b_queue = NULL;
	}

	mutex_exit(&ill->ill_lock);

	return (curr);
}
/*
 * Add the pending mp to the list.  There can be only 1 pending mp
 * in the list.  Any exclusive ioctl that needs to wait for a response
 * from another module or driver needs to use this function to set
 * the ipsq_pending_mp to the ioctl mblk and wait for the response from
 * the other module/driver.  This is also used while waiting for the
 * ipif/ill/ire refcnts to drop to zero in bringing down an ipif.
 */
boolean_t
ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp,
    int waitfor)
{
	ipsq_t	*ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq;

	ASSERT(IAM_WRITER_IPIF(ipif));
	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
	ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
	ASSERT(ipsq->ipsq_pending_mp == NULL);
	/*
	 * The caller may be using a different ipif than the one passed into
	 * ipsq_current_start() (e.g., suppose an ioctl that came in on the V4
	 * ill needs to wait for the V6 ill to quiesce).  So we can't ASSERT
	 * that `ipsq_current_ipif == ipif'.
	 */
	ASSERT(ipsq->ipsq_current_ipif != NULL);

	/*
	 * M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls,
	 * M_ERROR/M_HANGUP/M_PROTO/M_PCPROTO from the driver.
	 */
	ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_IOCTL) ||
	    (DB_TYPE(add_mp) == M_ERROR) || (DB_TYPE(add_mp) == M_HANGUP) ||
	    (DB_TYPE(add_mp) == M_PROTO) || (DB_TYPE(add_mp) == M_PCPROTO));

	if (connp != NULL) {
		ASSERT(MUTEX_HELD(&connp->conn_lock));
		/*
		 * Return error if the conn has started closing.  The conn
		 * could have finished cleaning up the pending mp list.  If
		 * so, we should not add another mp to the list negating the
		 * cleanup.
		 */
		if (connp->conn_state_flags & CONN_CLOSING)
			return (B_FALSE);
	}
	mutex_enter(&ipsq->ipsq_lock);
	ipsq->ipsq_pending_ipif = ipif;
	/*
	 * Note down the queue in b_queue.  This will be returned by
	 * ipsq_pending_mp_get.  Caller will then use these values to restart
	 * the processing.
	 */
	add_mp->b_next = NULL;
	add_mp->b_queue = q;
	ipsq->ipsq_pending_mp = add_mp;
	ipsq->ipsq_waitfor = waitfor;

	if (connp != NULL)
		connp->conn_oper_pending_ill = ipif->ipif_ill;
	mutex_exit(&ipsq->ipsq_lock);
	return (B_TRUE);
}

/*
 * Retrieve the ipsq_pending_mp and return it.  There can be only 1 mp
 * queued in the list.
 */
mblk_t *
ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp)
{
	mblk_t	*curr = NULL;

	mutex_enter(&ipsq->ipsq_lock);
	*connpp = NULL;
	if (ipsq->ipsq_pending_mp == NULL) {
		mutex_exit(&ipsq->ipsq_lock);
		return (NULL);
	}

	/* There can be only 1 such excl message */
	curr = ipsq->ipsq_pending_mp;
	ASSERT(curr != NULL && curr->b_next == NULL);
	ipsq->ipsq_pending_ipif = NULL;
	ipsq->ipsq_pending_mp = NULL;
	ipsq->ipsq_waitfor = 0;
	mutex_exit(&ipsq->ipsq_lock);

	if (CONN_Q(curr->b_queue)) {
		/*
		 * This mp did a refhold on the conn, at the start of the
		 * ioctl.  So we can safely return a pointer to the conn
		 * to the caller.
		 */
		*connpp = Q_TO_CONN(curr->b_queue);
	} else {
		*connpp = NULL;
	}
	curr->b_next = NULL;
	curr->b_prev = NULL;
	return (curr);
}
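/*
 * Added note: the two functions above are used as a pair.  For example,
 * when an ioctl brings an ipif down, ipif_down() parks the ioctl mblk
 * via ipsq_pending_mp_add() until the ipif/ill/ire references drain;
 * the completion path then uses ipsq_pending_mp_get() to recover the
 * mblk and the originating conn so the operation can be restarted.
 */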
/*
 * Cleanup the ioctl mp queued in ipsq_pending_mp
 * - Called in the ill_delete path
 * - Called in the M_ERROR or M_HANGUP path on the ill.
 * - Called in the conn close path.
 */
boolean_t
ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp)
{
	mblk_t	*mp;
	ipsq_t	*ipsq;
	queue_t	*q;
	ipif_t	*ipif;

	ASSERT(IAM_WRITER_ILL(ill));
	ipsq = ill->ill_phyint->phyint_ipsq;
	mutex_enter(&ipsq->ipsq_lock);
	/*
	 * If connp is null, unconditionally clean up the ipsq_pending_mp.
	 * This happens in M_ERROR/M_HANGUP.  We need to abort the current
	 * ioctl even if it is meant for another ill, since we have to
	 * enqueue a new mp now in ipsq_pending_mp to complete the ipif_down.
	 * If connp is non-null we are called from the conn close path.
	 */
	mp = ipsq->ipsq_pending_mp;
	if (mp == NULL || (connp != NULL &&
	    mp->b_queue != CONNP_TO_WQ(connp))) {
		mutex_exit(&ipsq->ipsq_lock);
		return (B_FALSE);
	}
	/* Now remove from the ipsq_pending_mp */
	ipsq->ipsq_pending_mp = NULL;
	q = mp->b_queue;
	mp->b_next = NULL;
	mp->b_prev = NULL;
	mp->b_queue = NULL;

	/* If MOVE was in progress, clear the move_in_progress fields also. */
	ill = ipsq->ipsq_pending_ipif->ipif_ill;
	if (ill->ill_move_in_progress) {
		ILL_CLEAR_MOVE(ill);
	} else if (ill->ill_up_ipifs) {
		ill_group_cleanup(ill);
	}

	ipif = ipsq->ipsq_pending_ipif;
	ipsq->ipsq_pending_ipif = NULL;
	ipsq->ipsq_waitfor = 0;
	ipsq->ipsq_current_ipif = NULL;
	ipsq->ipsq_current_ioctl = 0;
	mutex_exit(&ipsq->ipsq_lock);

	if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) {
		if (connp == NULL) {
			ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL);
		} else {
			ip_ioctl_finish(q, mp, ENXIO, CONN_CLOSE, NULL);
			mutex_enter(&ipif->ipif_ill->ill_lock);
			ipif->ipif_state_flags &= ~IPIF_CHANGING;
			mutex_exit(&ipif->ipif_ill->ill_lock);
		}
	} else {
		/*
		 * IP-MT XXX In the case of TLI/XTI bind / optmgmt this can't
		 * be just inet_freemsg.  We have to restart it,
		 * otherwise the thread will be stuck.
		 */
		inet_freemsg(mp);
	}
	return (B_TRUE);
}

/*
 * The ill is closing.  Cleanup all the pending mps.  Called exclusively
 * towards the end of ill_delete.  The refcount has gone to 0.  So nobody
 * knows this ill, and hence nobody can add an mp to this list.
 */
static void
ill_pending_mp_cleanup(ill_t *ill)
{
	mblk_t	*mp;
	queue_t	*q;

	ASSERT(IAM_WRITER_ILL(ill));

	mutex_enter(&ill->ill_lock);
	/*
	 * Every mp on the pending mp list originating from an ioctl
	 * added 1 to the conn refcnt, at the start of the ioctl.
	 * So bump it down now.  See comments in ip_wput_nondata().
	 */
	while (ill->ill_pending_mp != NULL) {
		mp = ill->ill_pending_mp;
		ill->ill_pending_mp = mp->b_next;
		mutex_exit(&ill->ill_lock);

		q = mp->b_queue;
		ASSERT(CONN_Q(q));
		mp->b_next = NULL;
		mp->b_prev = NULL;
		mp->b_queue = NULL;
		ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL);
		mutex_enter(&ill->ill_lock);
	}
	ill->ill_pending_ipif = NULL;

	mutex_exit(&ill->ill_lock);
}
/*
 * Called in the conn close path and ill delete path.
 */
static void
ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp)
{
	ipsq_t	*ipsq;
	mblk_t	*prev;
	mblk_t	*curr;
	mblk_t	*next;
	queue_t	*q;
	mblk_t	*tmp_list = NULL;

	ASSERT(IAM_WRITER_ILL(ill));
	if (connp != NULL)
		q = CONNP_TO_WQ(connp);
	else
		q = ill->ill_wq;

	ipsq = ill->ill_phyint->phyint_ipsq;
	/*
	 * Cleanup the ioctl mp's queued in ipsq_xopq_pending_mp if any.
	 * In the case of ioctl from a conn, there can be only 1 mp
	 * queued on the ipsq.  If an ill is being unplumbed, only messages
	 * related to this ill are flushed, like M_ERROR or M_HANGUP
	 * messages.  ioctls meant for this ill from conns are not flushed.
	 * They will be processed during ipsq_exit and will not find the
	 * ill and will return error.
	 */
	mutex_enter(&ipsq->ipsq_lock);
	for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL;
	    curr = next) {
		next = curr->b_next;
		if (curr->b_queue == q || curr->b_queue == RD(q)) {
			/* Unlink the mblk from the pending mp list */
			if (prev != NULL) {
				prev->b_next = curr->b_next;
			} else {
				ASSERT(ipsq->ipsq_xopq_mphead == curr);
				ipsq->ipsq_xopq_mphead = curr->b_next;
			}
			if (ipsq->ipsq_xopq_mptail == curr)
				ipsq->ipsq_xopq_mptail = prev;
			/*
			 * Create a temporary list and release the ipsq lock.
			 * New elements are added to the head of the tmp_list.
			 */
			curr->b_next = tmp_list;
			tmp_list = curr;
		} else {
			prev = curr;
		}
	}
	mutex_exit(&ipsq->ipsq_lock);

	while (tmp_list != NULL) {
		curr = tmp_list;
		tmp_list = curr->b_next;
		curr->b_next = NULL;
		curr->b_prev = NULL;
		curr->b_queue = NULL;
		if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) {
			ip_ioctl_finish(q, curr, ENXIO, connp != NULL ?
			    CONN_CLOSE : NO_COPYOUT, NULL);
		} else {
			/*
			 * IP-MT XXX In the case of TLI/XTI bind / optmgmt
			 * this can't be just inet_freemsg.  We have to
			 * restart it, otherwise the thread will be stuck.
			 */
			inet_freemsg(curr);
		}
	}
}

/*
 * This conn has started closing.  Cleanup any pending ioctl from this conn.
 * STREAMS ensures that there can be at most 1 ioctl pending on a stream.
 */
void
conn_ioctl_cleanup(conn_t *connp)
{
	mblk_t *curr;
	ipsq_t	*ipsq;
	ill_t	*ill;
	boolean_t refheld;

	/*
	 * Is any exclusive ioctl pending?  If so, clean it up.  If the
	 * ioctl has not yet started, the mp is pending in the list headed by
	 * ipsq_xopq_mphead.  If the ioctl has started, the mp could be
	 * present in ipsq_pending_mp.  If the ioctl timed out in the
	 * streamhead but is currently executing now, the mp is not queued
	 * anywhere but conn_oper_pending_ill is null.  The conn close will
	 * wait till the conn_ref drops to zero.
	 */
	mutex_enter(&connp->conn_lock);
	ill = connp->conn_oper_pending_ill;
	if (ill == NULL) {
		mutex_exit(&connp->conn_lock);
		return;
	}

	curr = ill_pending_mp_get(ill, &connp, 0);
	if (curr != NULL) {
		mutex_exit(&connp->conn_lock);
		CONN_DEC_REF(connp);
		inet_freemsg(curr);
		return;
	}
	/*
	 * We may not be able to refhold the ill if the ill/ipif
	 * is changing.  But we need to make sure that the ill will
	 * not vanish.  So we just bump up the ill_waiter count.
	 */
	refheld = ill_waiter_inc(ill);
	mutex_exit(&connp->conn_lock);
	if (refheld) {
		if (ipsq_enter(ill, B_TRUE)) {
			ill_waiter_dcr(ill);
			/*
			 * Check whether this ioctl has started and is
			 * pending now in ipsq_pending_mp.  If it is not
			 * found there then check whether this ioctl has
			 * not even started and is in the ipsq_xopq list.
			 */
			if (!ipsq_pending_mp_cleanup(ill, connp))
				ipsq_xopq_mp_cleanup(ill, connp);
			ipsq = ill->ill_phyint->phyint_ipsq;
			ipsq_exit(ipsq, B_TRUE, B_TRUE);
			return;
		}
	}
	/*
	 * The ill is also closing and we could not bump up the
	 * ill_waiter_count or we could not enter the ipsq.  Leave
	 * the cleanup to ill_delete.
	 */
	mutex_enter(&connp->conn_lock);
	while (connp->conn_oper_pending_ill != NULL)
		cv_wait(&connp->conn_refcv, &connp->conn_lock);
	mutex_exit(&connp->conn_lock);
	if (refheld)
		ill_waiter_dcr(ill);
}

/*
 * ipcl_walk function for cleaning up conn_*_ill fields.
 */
static void
conn_cleanup_ill(conn_t *connp, caddr_t arg)
{
	ill_t	*ill = (ill_t *)arg;
	ire_t	*ire;

	mutex_enter(&connp->conn_lock);
	if (connp->conn_multicast_ill == ill) {
		/* Revert to late binding */
		connp->conn_multicast_ill = NULL;
		connp->conn_orig_multicast_ifindex = 0;
	}
	if (connp->conn_incoming_ill == ill)
		connp->conn_incoming_ill = NULL;
	if (connp->conn_outgoing_ill == ill)
		connp->conn_outgoing_ill = NULL;
	if (connp->conn_outgoing_pill == ill)
		connp->conn_outgoing_pill = NULL;
	if (connp->conn_nofailover_ill == ill)
		connp->conn_nofailover_ill = NULL;
	if (connp->conn_xmit_if_ill == ill)
		connp->conn_xmit_if_ill = NULL;
	if (connp->conn_ire_cache != NULL) {
		ire = connp->conn_ire_cache;
		/*
		 * ip_newroute creates IRE_CACHE with ire_stq coming from
		 * interface X and ipif coming from interface Y, if interface
		 * X and Y are part of the same IPMP group.  Thus whenever
		 * interface X goes down, remove all references to it by
		 * checking both on ire_ipif and ire_stq.
		 */
		if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) ||
		    (ire->ire_type == IRE_CACHE &&
		    ire->ire_stq == ill->ill_wq)) {
			connp->conn_ire_cache = NULL;
			mutex_exit(&connp->conn_lock);
			ire_refrele_notr(ire);
			return;
		}
	}
	mutex_exit(&connp->conn_lock);
}

/* ARGSUSED */
void
ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
	ill_t	*ill = q->q_ptr;
	ipif_t	*ipif;

	ASSERT(IAM_WRITER_IPSQ(ipsq));
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		ipif_non_duplicate(ipif);
		ipif_down_tail(ipif);
	}
	ill_down_tail(ill);
	freemsg(mp);
	ipsq_current_finish(ipsq);
}

/*
 * ill_down_start is called when we want to down this ill and bring it up
 * again.  It is called when we receive an M_ERROR / M_HANGUP.  In this
 * case we shut down all interfaces, but don't tear down any plumbing.
 */
boolean_t
ill_down_start(queue_t *q, mblk_t *mp)
{
	ill_t	*ill = q->q_ptr;
	ipif_t	*ipif;

	ASSERT(IAM_WRITER_ILL(ill));

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		(void) ipif_down(ipif, NULL, NULL);

	ill_down(ill);

	(void) ipsq_pending_mp_cleanup(ill, NULL);

	ipsq_current_start(ill->ill_phyint->phyint_ipsq, ill->ill_ipif, 0);

	/*
	 * Atomically test and add the pending mp if references are active.
	 */
	mutex_enter(&ill->ill_lock);
	if (!ill_is_quiescent(ill)) {
		/* call cannot fail since `conn_t *' argument is NULL */
		(void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
		    mp, ILL_DOWN);
		mutex_exit(&ill->ill_lock);
		return (B_FALSE);
	}
	mutex_exit(&ill->ill_lock);
	return (B_TRUE);
}
static void
ill_down(ill_t *ill)
{
	ip_stack_t	*ipst = ill->ill_ipst;

	/* Blow off any IREs dependent on this ILL. */
	ire_walk(ill_downi, (char *)ill, ipst);

	mutex_enter(&ipst->ips_ire_mrtun_lock);
	if (ipst->ips_ire_mrtun_count != 0) {
		mutex_exit(&ipst->ips_ire_mrtun_lock);
		ire_walk_ill_mrtun(0, 0, ill_downi_mrtun_srcif,
		    (char *)ill, NULL, ipst);
	} else {
		mutex_exit(&ipst->ips_ire_mrtun_lock);
	}

	/*
	 * If any interface-based forwarding table exists,
	 * blow off the ires there dependent on this ill.
	 */
	mutex_enter(&ipst->ips_ire_srcif_table_lock);
	if (ipst->ips_ire_srcif_table_count > 0) {
		mutex_exit(&ipst->ips_ire_srcif_table_lock);
		ire_walk_srcif_table_v4(ill_downi_mrtun_srcif, (char *)ill,
		    ipst);
	} else {
		mutex_exit(&ipst->ips_ire_srcif_table_lock);
	}

	/* Remove any conn_*_ill depending on this ill */
	ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst);

	if (ill->ill_group != NULL) {
		illgrp_delete(ill);
	}
}

static void
ill_down_tail(ill_t *ill)
{
	int	i;

	/* Destroy ill_srcif_table if it exists */
	/* Lock not really required, since nobody else should be able */
	/* to access it at this point. */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_srcif_table != NULL) {
		ill->ill_srcif_refcnt = 0;
		for (i = 0; i < IP_SRCIF_TABLE_SIZE; i++) {
			rw_destroy(&ill->ill_srcif_table[i].irb_lock);
		}
		kmem_free(ill->ill_srcif_table,
		    IP_SRCIF_TABLE_SIZE * sizeof (irb_t));
		ill->ill_srcif_table = NULL;
		ill->ill_srcif_refcnt = 0;
		ill->ill_mrtun_refcnt = 0;
	}
	mutex_exit(&ill->ill_lock);
}

/*
 * ire_walk routine used to delete every IRE that depends on queues
 * associated with 'ill'.  (Always called as writer.)
 */
static void
ill_downi(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;

	/*
	 * ip_newroute creates IRE_CACHE with ire_stq coming from
	 * interface X and ipif coming from interface Y, if interface
	 * X and Y are part of the same IPMP group.  Thus whenever interface
	 * X goes down, remove all references to it by checking both
	 * on ire_ipif and ire_stq.
	 */
	if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) ||
	    (ire->ire_type == IRE_CACHE && ire->ire_stq == ill->ill_wq)) {
		ire_delete(ire);
	}
}

/*
 * A separate routine for deleting revtun and srcif based routes
 * is needed because these ires are only deleted when the interface
 * is unplumbed.  Also these ires have ire_in_ill non-null as well.
 * We want to keep mobile-IP specific code separate.
 */
static void
ill_downi_mrtun_srcif(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;

	ASSERT(ire->ire_in_ill != NULL);

	if ((ire->ire_in_ill != NULL && ire->ire_in_ill == ill) ||
	    (ire->ire_stq == ill->ill_wq) || (ire->ire_stq == ill->ill_rq)) {
		ire_delete(ire);
	}
}

/*
 * Remove ire/nce from the fastpath list.
 */
void
ill_fastpath_nack(ill_t *ill)
{
	nce_fastpath_list_dispatch(ill, NULL, NULL);
}
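/*
 * Added note on the fastpath handshake (a sketch, from the code below):
 * ill_fastpath_probe() sends a DL_IOC_HDR_INFO M_IOCTL downstream whose
 * data is the dl_unitdata_req_t template; a driver that supports
 * fastpath replies with an M_IOCACK carrying the template plus the
 * prepared link-layer header, which ill_fastpath_ack() hands to the
 * waiting nce entries.  A NAK leaves ill_dlpi_fastpath_state at
 * IDS_FAILED (set elsewhere), and later probes then return ENOTSUP.
 */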
/* Consume an M_IOCACK of the fastpath probe. */
void
ill_fastpath_ack(ill_t *ill, mblk_t *mp)
{
	mblk_t	*mp1 = mp;

	/*
	 * If this was the first attempt, turn on fastpath probing.
	 */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS)
		ill->ill_dlpi_fastpath_state = IDS_OK;
	mutex_exit(&ill->ill_lock);

	/* Free the M_IOCACK mblk, hold on to the data */
	mp = mp->b_cont;
	freeb(mp1);
	if (mp == NULL)
		return;
	if (mp->b_cont != NULL) {
		/*
		 * Update all IRE's or NCE's that are waiting for
		 * fastpath update.
		 */
		nce_fastpath_list_dispatch(ill, ndp_fastpath_update, mp);
		mp1 = mp->b_cont;
		freeb(mp);
		mp = mp1;
	} else {
		ip0dbg(("ill_fastpath_ack: no b_cont\n"));
	}

	freeb(mp);
}

/*
 * Throw an M_IOCTL message downstream asking "do you know fastpath?"
 * The data portion of the request is a dl_unitdata_req_t template for
 * what we would send downstream in the absence of a fastpath confirmation.
 */
int
ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp)
{
	struct iocblk	*ioc;
	mblk_t	*mp;

	if (dlur_mp == NULL)
		return (EINVAL);

	mutex_enter(&ill->ill_lock);
	switch (ill->ill_dlpi_fastpath_state) {
	case IDS_FAILED:
		/*
		 * Driver NAKed the first fastpath ioctl - assume it doesn't
		 * support it.
		 */
		mutex_exit(&ill->ill_lock);
		return (ENOTSUP);
	case IDS_UNKNOWN:
		/* This is the first probe */
		ill->ill_dlpi_fastpath_state = IDS_INPROGRESS;
		break;
	default:
		break;
	}
	mutex_exit(&ill->ill_lock);

	if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL)
		return (EAGAIN);

	mp->b_cont = copyb(dlur_mp);
	if (mp->b_cont == NULL) {
		freeb(mp);
		return (EAGAIN);
	}

	ioc = (struct iocblk *)mp->b_rptr;
	ioc->ioc_count = msgdsize(mp->b_cont);

	putnext(ill->ill_wq, mp);
	return (0);
}

void
ill_capability_probe(ill_t *ill)
{
	/*
	 * Do so only if negotiation is enabled, capabilities are unknown,
	 * and a capability negotiation is not already in progress.
	 */
	if (ill->ill_dlpi_capab_state != IDS_UNKNOWN &&
	    ill->ill_dlpi_capab_state != IDS_RENEG)
		return;

	ill->ill_dlpi_capab_state = IDS_INPROGRESS;
	ip1dbg(("ill_capability_probe: starting capability negotiation\n"));
	ill_capability_proto(ill, DL_CAPABILITY_REQ, NULL);
}
Upon returning from all capability handlers, 1837 * sc_mp will be pulled-up, before passing it downstream. 1838 */ 1839 ill_capability_mdt_reset(ill, &sc_mp); 1840 ill_capability_hcksum_reset(ill, &sc_mp); 1841 ill_capability_zerocopy_reset(ill, &sc_mp); 1842 ill_capability_ipsec_reset(ill, &sc_mp); 1843 ill_capability_dls_reset(ill, &sc_mp); 1844 ill_capability_lso_reset(ill, &sc_mp); 1845 1846 /* Nothing to send down in order to disable the capabilities? */ 1847 if (sc_mp == NULL) 1848 return; 1849 1850 tmp = msgpullup(sc_mp, -1); 1851 freemsg(sc_mp); 1852 if ((sc_mp = tmp) == NULL) { 1853 cmn_err(CE_WARN, "ill_capability_reset: unable to send down " 1854 "DL_CAPABILITY_REQ (ENOMEM)\n"); 1855 return; 1856 } 1857 1858 ip1dbg(("ill_capability_reset: resetting negotiated capabilities\n")); 1859 ill_capability_proto(ill, DL_CAPABILITY_REQ, sc_mp); 1860 } 1861 1862 /* 1863 * Request or set new-style hardware capabilities supported by DLS provider. 1864 */ 1865 static void 1866 ill_capability_proto(ill_t *ill, int type, mblk_t *reqp) 1867 { 1868 mblk_t *mp; 1869 dl_capability_req_t *capb; 1870 size_t size = 0; 1871 uint8_t *ptr; 1872 1873 if (reqp != NULL) 1874 size = MBLKL(reqp); 1875 1876 mp = ip_dlpi_alloc(sizeof (dl_capability_req_t) + size, type); 1877 if (mp == NULL) { 1878 freemsg(reqp); 1879 return; 1880 } 1881 ptr = mp->b_rptr; 1882 1883 capb = (dl_capability_req_t *)ptr; 1884 ptr += sizeof (dl_capability_req_t); 1885 1886 if (reqp != NULL) { 1887 capb->dl_sub_offset = sizeof (dl_capability_req_t); 1888 capb->dl_sub_length = size; 1889 bcopy(reqp->b_rptr, ptr, size); 1890 ptr += size; 1891 mp->b_cont = reqp->b_cont; 1892 freeb(reqp); 1893 } 1894 ASSERT(ptr == mp->b_wptr); 1895 1896 ill_dlpi_send(ill, mp); 1897 } 1898 1899 static void 1900 ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers) 1901 { 1902 dl_capab_id_t *id_ic; 1903 uint_t sub_dl_cap = outers->dl_cap; 1904 dl_capability_sub_t *inners; 1905 uint8_t *capend; 1906 1907 ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER); 1908 1909 /* 1910 * Note: range checks here are not absolutely sufficient to 1911 * make us robust against malformed messages sent by drivers; 1912 * this is in keeping with the rest of IP's dlpi handling. 1913 * (Remember, it's coming from something else in the kernel 1914 * address space) 1915 */ 1916 1917 capend = (uint8_t *)(outers + 1) + outers->dl_length; 1918 if (capend > mp->b_wptr) { 1919 cmn_err(CE_WARN, "ill_capability_id_ack: " 1920 "malformed sub-capability too long for mblk"); 1921 return; 1922 } 1923 1924 id_ic = (dl_capab_id_t *)(outers + 1); 1925 1926 if (outers->dl_length < sizeof (*id_ic) || 1927 (inners = &id_ic->id_subcap, 1928 inners->dl_length > (outers->dl_length - sizeof (*inners)))) { 1929 cmn_err(CE_WARN, "ill_capability_id_ack: malformed " 1930 "encapsulated capab type %d too long for mblk", 1931 inners->dl_cap); 1932 return; 1933 } 1934 1935 if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) { 1936 ip1dbg(("ill_capability_id_ack: mid token for capab type %d " 1937 "isn't as expected; pass-thru module(s) detected, " 1938 "discarding capability\n", inners->dl_cap)); 1939 return; 1940 } 1941 1942 /* Process the encapsulated sub-capability */ 1943 ill_capability_dispatch(ill, mp, inners, B_TRUE); 1944 } 1945 1946 /* 1947 * Process Multidata Transmit capability negotiation ack received from a 1948 * DLS Provider. isub must point to the sub-capability (DL_CAPAB_MDT) of a 1949 * DL_CAPABILITY_ACK message. 
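* (Multidata Transmit lets the transport hand the driver several packets' worth of data in a single multidata message; the limits negotiated here are recorded in the ill_mdt_capab_t fields filled in below.)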
1950 */ 1951 static void 1952 ill_capability_mdt_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 1953 { 1954 mblk_t *nmp = NULL; 1955 dl_capability_req_t *oc; 1956 dl_capab_mdt_t *mdt_ic, *mdt_oc; 1957 ill_mdt_capab_t **ill_mdt_capab; 1958 uint_t sub_dl_cap = isub->dl_cap; 1959 uint8_t *capend; 1960 1961 ASSERT(sub_dl_cap == DL_CAPAB_MDT); 1962 1963 ill_mdt_capab = (ill_mdt_capab_t **)&ill->ill_mdt_capab; 1964 1965 /* 1966 * Note: range checks here are not absolutely sufficient to 1967 * make us robust against malformed messages sent by drivers; 1968 * this is in keeping with the rest of IP's dlpi handling. 1969 * (Remember, it's coming from something else in the kernel 1970 * address space) 1971 */ 1972 1973 capend = (uint8_t *)(isub + 1) + isub->dl_length; 1974 if (capend > mp->b_wptr) { 1975 cmn_err(CE_WARN, "ill_capability_mdt_ack: " 1976 "malformed sub-capability too long for mblk"); 1977 return; 1978 } 1979 1980 mdt_ic = (dl_capab_mdt_t *)(isub + 1); 1981 1982 if (mdt_ic->mdt_version != MDT_VERSION_2) { 1983 cmn_err(CE_CONT, "ill_capability_mdt_ack: " 1984 "unsupported MDT sub-capability (version %d, expected %d)", 1985 mdt_ic->mdt_version, MDT_VERSION_2); 1986 return; 1987 } 1988 1989 if (!dlcapabcheckqid(&mdt_ic->mdt_mid, ill->ill_lmod_rq)) { 1990 ip1dbg(("ill_capability_mdt_ack: mid token for MDT " 1991 "capability isn't as expected; pass-thru module(s) " 1992 "detected, discarding capability\n")); 1993 return; 1994 } 1995 1996 if (mdt_ic->mdt_flags & DL_CAPAB_MDT_ENABLE) { 1997 1998 if (*ill_mdt_capab == NULL) { 1999 *ill_mdt_capab = kmem_zalloc(sizeof (ill_mdt_capab_t), 2000 KM_NOSLEEP); 2001 2002 if (*ill_mdt_capab == NULL) { 2003 cmn_err(CE_WARN, "ill_capability_mdt_ack: " 2004 "could not enable MDT version %d " 2005 "for %s (ENOMEM)\n", MDT_VERSION_2, 2006 ill->ill_name); 2007 return; 2008 } 2009 } 2010 2011 ip1dbg(("ill_capability_mdt_ack: interface %s supports " 2012 "MDT version %d (%d bytes leading, %d bytes trailing " 2013 "header spaces, %d max pld bufs, %d span limit)\n", 2014 ill->ill_name, MDT_VERSION_2, 2015 mdt_ic->mdt_hdr_head, mdt_ic->mdt_hdr_tail, 2016 mdt_ic->mdt_max_pld, mdt_ic->mdt_span_limit)); 2017 2018 (*ill_mdt_capab)->ill_mdt_version = MDT_VERSION_2; 2019 (*ill_mdt_capab)->ill_mdt_on = 1; 2020 /* 2021 * Round the following values up to the nearest 32 bits; ULP 2022 * may further adjust them to accommodate additional 2023 * protocol headers. We pass these values to ULP during 2024 * bind time.
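* For example, a driver-advertised head space of 18 bytes is rounded up to 20 by the roundup(.., 4) calls below.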
2025 */ 2026 (*ill_mdt_capab)->ill_mdt_hdr_head = 2027 roundup(mdt_ic->mdt_hdr_head, 4); 2028 (*ill_mdt_capab)->ill_mdt_hdr_tail = 2029 roundup(mdt_ic->mdt_hdr_tail, 4); 2030 (*ill_mdt_capab)->ill_mdt_max_pld = mdt_ic->mdt_max_pld; 2031 (*ill_mdt_capab)->ill_mdt_span_limit = mdt_ic->mdt_span_limit; 2032 2033 ill->ill_capabilities |= ILL_CAPAB_MDT; 2034 } else { 2035 uint_t size; 2036 uchar_t *rptr; 2037 2038 size = sizeof (dl_capability_req_t) + 2039 sizeof (dl_capability_sub_t) + sizeof (dl_capab_mdt_t); 2040 2041 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 2042 cmn_err(CE_WARN, "ill_capability_mdt_ack: " 2043 "could not enable MDT for %s (ENOMEM)\n", 2044 ill->ill_name); 2045 return; 2046 } 2047 2048 rptr = nmp->b_rptr; 2049 /* initialize dl_capability_req_t */ 2050 oc = (dl_capability_req_t *)nmp->b_rptr; 2051 oc->dl_sub_offset = sizeof (dl_capability_req_t); 2052 oc->dl_sub_length = sizeof (dl_capability_sub_t) + 2053 sizeof (dl_capab_mdt_t); 2054 nmp->b_rptr += sizeof (dl_capability_req_t); 2055 2056 /* initialize dl_capability_sub_t */ 2057 bcopy(isub, nmp->b_rptr, sizeof (*isub)); 2058 nmp->b_rptr += sizeof (*isub); 2059 2060 /* initialize dl_capab_mdt_t */ 2061 mdt_oc = (dl_capab_mdt_t *)nmp->b_rptr; 2062 bcopy(mdt_ic, mdt_oc, sizeof (*mdt_ic)); 2063 2064 nmp->b_rptr = rptr; 2065 2066 ip1dbg(("ill_capability_mdt_ack: asking interface %s " 2067 "to enable MDT version %d\n", ill->ill_name, 2068 MDT_VERSION_2)); 2069 2070 /* set ENABLE flag */ 2071 mdt_oc->mdt_flags |= DL_CAPAB_MDT_ENABLE; 2072 2073 /* nmp points to a DL_CAPABILITY_REQ message to enable MDT */ 2074 ill_dlpi_send(ill, nmp); 2075 } 2076 } 2077 2078 static void 2079 ill_capability_mdt_reset(ill_t *ill, mblk_t **sc_mp) 2080 { 2081 mblk_t *mp; 2082 dl_capab_mdt_t *mdt_subcap; 2083 dl_capability_sub_t *dl_subcap; 2084 int size; 2085 2086 if (!ILL_MDT_CAPABLE(ill)) 2087 return; 2088 2089 ASSERT(ill->ill_mdt_capab != NULL); 2090 /* 2091 * Clear the capability flag for MDT but retain the ill_mdt_capab 2092 * structure since it's possible that another thread is still 2093 * referring to it. The structure only gets deallocated when 2094 * we destroy the ill. 2095 */ 2096 ill->ill_capabilities &= ~ILL_CAPAB_MDT; 2097 2098 size = sizeof (*dl_subcap) + sizeof (*mdt_subcap); 2099 2100 mp = allocb(size, BPRI_HI); 2101 if (mp == NULL) { 2102 ip1dbg(("ill_capability_mdt_reset: unable to allocate " 2103 "request to disable MDT\n")); 2104 return; 2105 } 2106 2107 mp->b_wptr = mp->b_rptr + size; 2108 2109 dl_subcap = (dl_capability_sub_t *)mp->b_rptr; 2110 dl_subcap->dl_cap = DL_CAPAB_MDT; 2111 dl_subcap->dl_length = sizeof (*mdt_subcap); 2112 2113 mdt_subcap = (dl_capab_mdt_t *)(dl_subcap + 1); 2114 mdt_subcap->mdt_version = ill->ill_mdt_capab->ill_mdt_version; 2115 mdt_subcap->mdt_flags = 0; 2116 mdt_subcap->mdt_hdr_head = 0; 2117 mdt_subcap->mdt_hdr_tail = 0; 2118 2119 if (*sc_mp != NULL) 2120 linkb(*sc_mp, mp); 2121 else 2122 *sc_mp = mp; 2123 } 2124 2125 /* 2126 * Send a DL_NOTIFY_REQ to the specified ill to enable 2127 * DL_NOTE_PROMISC_ON/OFF_PHYS notifications. 2128 * Invoked by ill_capability_ipsec_ack() before enabling IPsec hardware 2129 * acceleration. 2130 * Returns B_TRUE on success, B_FALSE if the message could not be sent. 
2131 */ 2132 static boolean_t 2133 ill_enable_promisc_notify(ill_t *ill) 2134 { 2135 mblk_t *mp; 2136 dl_notify_req_t *req; 2137 2138 IPSECHW_DEBUG(IPSECHW_PKT, ("ill_enable_promisc_notify:\n")); 2139 2140 mp = ip_dlpi_alloc(sizeof (dl_notify_req_t), DL_NOTIFY_REQ); 2141 if (mp == NULL) 2142 return (B_FALSE); 2143 2144 req = (dl_notify_req_t *)mp->b_rptr; 2145 req->dl_notifications = DL_NOTE_PROMISC_ON_PHYS | 2146 DL_NOTE_PROMISC_OFF_PHYS; 2147 2148 ill_dlpi_send(ill, mp); 2149 2150 return (B_TRUE); 2151 } 2152 2153 2154 /* 2155 * Allocate an IPsec capability request which will be filled by our 2156 * caller to turn on support for one or more algorithms. 2157 */ 2158 static mblk_t * 2159 ill_alloc_ipsec_cap_req(ill_t *ill, dl_capability_sub_t *isub) 2160 { 2161 mblk_t *nmp; 2162 dl_capability_req_t *ocap; 2163 dl_capab_ipsec_t *ocip; 2164 dl_capab_ipsec_t *icip; 2165 uint8_t *ptr; 2166 icip = (dl_capab_ipsec_t *)(isub + 1); 2167 2168 /* 2169 * The first time around, we send a DL_NOTIFY_REQ to enable 2170 * PROMISC_ON/OFF notification from the provider. We need to 2171 * do this before enabling the algorithms to avoid leakage of 2172 * cleartext packets. 2173 */ 2174 2175 if (!ill_enable_promisc_notify(ill)) 2176 return (NULL); 2177 2178 /* 2179 * Allocate new mblk which will contain a new capability 2180 * request to enable the capabilities. 2181 */ 2182 2183 nmp = ip_dlpi_alloc(sizeof (dl_capability_req_t) + 2184 sizeof (dl_capability_sub_t) + isub->dl_length, DL_CAPABILITY_REQ); 2185 if (nmp == NULL) 2186 return (NULL); 2187 2188 ptr = nmp->b_rptr; 2189 2190 /* initialize dl_capability_req_t */ 2191 ocap = (dl_capability_req_t *)ptr; 2192 ocap->dl_sub_offset = sizeof (dl_capability_req_t); 2193 ocap->dl_sub_length = sizeof (dl_capability_sub_t) + isub->dl_length; 2194 ptr += sizeof (dl_capability_req_t); 2195 2196 /* initialize dl_capability_sub_t */ 2197 bcopy(isub, ptr, sizeof (*isub)); 2198 ptr += sizeof (*isub); 2199 2200 /* initialize dl_capab_ipsec_t */ 2201 ocip = (dl_capab_ipsec_t *)ptr; 2202 bcopy(icip, ocip, sizeof (*icip)); 2203 2204 nmp->b_wptr = (uchar_t *)(&ocip->cip_data[0]); 2205 return (nmp); 2206 } 2207 2208 /* 2209 * Process an IPsec capability negotiation ack received from a DLS Provider. 2210 * isub must point to the sub-capability (DL_CAPAB_IPSEC_AH or 2211 * DL_CAPAB_IPSEC_ESP) of a DL_CAPABILITY_ACK message. 2212 */ 2213 static void 2214 ill_capability_ipsec_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 2215 { 2216 dl_capab_ipsec_t *icip; 2217 dl_capab_ipsec_alg_t *ialg; /* ptr to input alg spec. */ 2218 dl_capab_ipsec_alg_t *oalg; /* ptr to output alg spec. */ 2219 uint_t cipher, nciphers; 2220 mblk_t *nmp; 2221 uint_t alg_len; 2222 boolean_t need_sadb_dump; 2223 uint_t sub_dl_cap = isub->dl_cap; 2224 ill_ipsec_capab_t **ill_capab; 2225 uint64_t ill_capab_flag; 2226 uint8_t *capend, *ciphend; 2227 boolean_t sadb_resync; 2228 2229 ASSERT(sub_dl_cap == DL_CAPAB_IPSEC_AH || 2230 sub_dl_cap == DL_CAPAB_IPSEC_ESP); 2231 2232 if (sub_dl_cap == DL_CAPAB_IPSEC_AH) { 2233 ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_ah; 2234 ill_capab_flag = ILL_CAPAB_AH; 2235 } else { 2236 ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_esp; 2237 ill_capab_flag = ILL_CAPAB_ESP; 2238 } 2239 2240 /* 2241 * If the ill capability structure exists, then this incoming 2242 * DL_CAPABILITY_ACK is a response to a "renegotiation" cycle. 2243 * If this is so, then we'd need to resynchronize the SADB 2244 * after re-enabling the offloaded ciphers. 
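* That resync is kicked off at the bottom of this routine, where sadb_resync is passed to ill_ipsec_capab_add().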
2245 */ 2246 sadb_resync = (*ill_capab != NULL); 2247 2248 /* 2249 * Note: range checks here are not absolutely sufficient to 2250 * make us robust against malformed messages sent by drivers; 2251 * this is in keeping with the rest of IP's dlpi handling. 2252 * (Remember, it's coming from something else in the kernel 2253 * address space) 2254 */ 2255 2256 capend = (uint8_t *)(isub + 1) + isub->dl_length; 2257 if (capend > mp->b_wptr) { 2258 cmn_err(CE_WARN, "ill_capability_ipsec_ack: " 2259 "malformed sub-capability too long for mblk"); 2260 return; 2261 } 2262 2263 /* 2264 * There are two types of acks we process here: 2265 * 1. acks in reply to a (first form) generic capability req 2266 * (no ENABLE flag set) 2267 * 2. acks in reply to an ENABLE capability req. 2268 * (ENABLE flag set) 2269 * 2270 * We process the subcapability passed as argument as follows: 2271 * 1 do initializations 2272 * 1.1 initialize nmp = NULL 2273 * 1.2 set need_sadb_dump to B_FALSE 2274 * 2 for each cipher in subcapability: 2275 * 2.1 if ENABLE flag is set: 2276 * 2.1.1 update per-ill ipsec capabilities info 2277 * 2.1.2 set need_sadb_dump to B_TRUE 2278 * 2.2 if ENABLE flag is not set: 2279 * 2.2.1 if nmp is NULL: 2280 * 2.2.1.1 allocate and initialize nmp 2281 * 2.2.1.2 init current pos in nmp 2282 * 2.2.2 copy current cipher to current pos in nmp 2283 * 2.2.3 set ENABLE flag in nmp 2284 * 2.2.4 update current pos 2285 * 3 if nmp is not equal to NULL, send enable request 2286 * 3.1 send capability request 2287 * 4 if need_sadb_dump is B_TRUE 2288 * 4.1 enable promiscuous on/off notifications 2289 * 4.2 call ill_dlpi_send(isub->dlcap) to send all 2290 * AH or ESP SAs to the interface. 2291 */ 2292 2293 nmp = NULL; 2294 oalg = NULL; 2295 need_sadb_dump = B_FALSE; 2296 icip = (dl_capab_ipsec_t *)(isub + 1); 2297 ialg = (dl_capab_ipsec_alg_t *)(&icip->cip_data[0]); 2298 2299 nciphers = icip->cip_nciphers; 2300 ciphend = (uint8_t *)(ialg + icip->cip_nciphers); 2301 2302 if (ciphend > capend) { 2303 cmn_err(CE_WARN, "ill_capability_ipsec_ack: " 2304 "too many ciphers for sub-capability len"); 2305 return; 2306 } 2307 2308 for (cipher = 0; cipher < nciphers; cipher++) { 2309 alg_len = sizeof (dl_capab_ipsec_alg_t); 2310 2311 if (ialg->alg_flag & DL_CAPAB_ALG_ENABLE) { 2312 /* 2313 * TBD: when we provide a way to disable capabilities 2314 * from above, we need to manage the request-pending state 2315 * and fail if we were not expecting this ACK.
2316 */ 2317 IPSECHW_DEBUG(IPSECHW_CAPAB, 2318 ("ill_capability_ipsec_ack: got ENABLE ACK\n")); 2319 2320 /* 2321 * Update IPsec capabilities for this ill 2322 */ 2323 2324 if (*ill_capab == NULL) { 2325 IPSECHW_DEBUG(IPSECHW_CAPAB, 2326 ("ill_capability_ipsec_ack: " 2327 "allocating ipsec_capab for ill\n")); 2328 *ill_capab = ill_ipsec_capab_alloc(); 2329 2330 if (*ill_capab == NULL) { 2331 cmn_err(CE_WARN, 2332 "ill_capability_ipsec_ack: " 2333 "could not enable IPsec Hardware " 2334 "acceleration for %s (ENOMEM)\n", 2335 ill->ill_name); 2336 return; 2337 } 2338 } 2339 2340 ASSERT(ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH || 2341 ialg->alg_type == DL_CAPAB_IPSEC_ALG_ENCR); 2342 2343 if (ialg->alg_prim >= MAX_IPSEC_ALGS) { 2344 cmn_err(CE_WARN, 2345 "ill_capability_ipsec_ack: " 2346 "malformed IPsec algorithm id %d", 2347 ialg->alg_prim); 2348 continue; 2349 } 2350 2351 if (ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH) { 2352 IPSEC_ALG_ENABLE((*ill_capab)->auth_hw_algs, 2353 ialg->alg_prim); 2354 } else { 2355 ipsec_capab_algparm_t *alp; 2356 2357 IPSEC_ALG_ENABLE((*ill_capab)->encr_hw_algs, 2358 ialg->alg_prim); 2359 if (!ill_ipsec_capab_resize_algparm(*ill_capab, 2360 ialg->alg_prim)) { 2361 cmn_err(CE_WARN, 2362 "ill_capability_ipsec_ack: " 2363 "no space for IPsec alg id %d", 2364 ialg->alg_prim); 2365 continue; 2366 } 2367 alp = &((*ill_capab)->encr_algparm[ 2368 ialg->alg_prim]); 2369 alp->minkeylen = ialg->alg_minbits; 2370 alp->maxkeylen = ialg->alg_maxbits; 2371 } 2372 ill->ill_capabilities |= ill_capab_flag; 2373 /* 2374 * Indicate that a capability was enabled, which 2375 * will be used below to kick off a SADB dump 2376 * to the ill. 2377 */ 2378 need_sadb_dump = B_TRUE; 2379 } else { 2380 IPSECHW_DEBUG(IPSECHW_CAPAB, 2381 ("ill_capability_ipsec_ack: enabling alg 0x%x\n", 2382 ialg->alg_prim)); 2383 2384 if (nmp == NULL) { 2385 nmp = ill_alloc_ipsec_cap_req(ill, isub); 2386 if (nmp == NULL) { 2387 /* 2388 * Sending the PROMISC_ON/OFF 2389 * notification request failed. 2390 * We cannot enable the algorithms 2391 * since the Provider will not 2392 * notify IP of promiscuous mode 2393 * changes, which could lead 2394 * to leakage of packets. 2395 */ 2396 cmn_err(CE_WARN, 2397 "ill_capability_ipsec_ack: " 2398 "could not enable IPsec Hardware " 2399 "acceleration for %s (ENOMEM)\n", 2400 ill->ill_name); 2401 return; 2402 } 2403 /* ptr to current output alg specifier */ 2404 oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr; 2405 } 2406 2407 /* 2408 * Copy current alg specifier, set ENABLE 2409 * flag, and advance to next output alg. 2410 * For now we enable all IPsec capabilities. 2411 */ 2412 ASSERT(oalg != NULL); 2413 bcopy(ialg, oalg, alg_len); 2414 oalg->alg_flag |= DL_CAPAB_ALG_ENABLE; 2415 nmp->b_wptr += alg_len; 2416 oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr; 2417 } 2418 2419 /* move to next input algorithm specifier */ 2420 ialg = (dl_capab_ipsec_alg_t *) 2421 ((char *)ialg + alg_len); 2422 } 2423 2424 if (nmp != NULL) 2425 /* 2426 * nmp points to a DL_CAPABILITY_REQ message to enable 2427 * IPsec hardware acceleration. 2428 */ 2429 ill_dlpi_send(ill, nmp); 2430 2431 if (need_sadb_dump) 2432 /* 2433 * An acknowledgement corresponding to a request to 2434 * enable acceleration was received; notify SADB.
2435 */ 2436 ill_ipsec_capab_add(ill, sub_dl_cap, sadb_resync); 2437 } 2438 2439 /* 2440 * Given an mblk with enough space in it, create sub-capability entries for 2441 * DL_CAPAB_IPSEC_{AH,ESP} types which consist of previously-advertised 2442 * offloaded ciphers (both AUTH and ENCR) with their enable flags cleared, 2443 * in preparation for the reset DL_CAPABILITY_REQ message. 2444 */ 2445 static void 2446 ill_fill_ipsec_reset(uint_t nciphers, int stype, uint_t slen, 2447 ill_ipsec_capab_t *ill_cap, mblk_t *mp) 2448 { 2449 dl_capab_ipsec_t *oipsec; 2450 dl_capab_ipsec_alg_t *oalg; 2451 dl_capability_sub_t *dl_subcap; 2452 int i, k; 2453 2454 ASSERT(nciphers > 0); 2455 ASSERT(ill_cap != NULL); 2456 ASSERT(mp != NULL); 2457 ASSERT(MBLKTAIL(mp) >= sizeof (*dl_subcap) + sizeof (*oipsec) + slen); 2458 2459 /* dl_capability_sub_t for "stype" */ 2460 dl_subcap = (dl_capability_sub_t *)mp->b_wptr; 2461 dl_subcap->dl_cap = stype; 2462 dl_subcap->dl_length = sizeof (dl_capab_ipsec_t) + slen; 2463 mp->b_wptr += sizeof (dl_capability_sub_t); 2464 2465 /* dl_capab_ipsec_t for "stype" */ 2466 oipsec = (dl_capab_ipsec_t *)mp->b_wptr; 2467 oipsec->cip_version = 1; 2468 oipsec->cip_nciphers = nciphers; 2469 mp->b_wptr = (uchar_t *)&oipsec->cip_data[0]; 2470 2471 /* create entries for "stype" AUTH ciphers */ 2472 for (i = 0; i < ill_cap->algs_size; i++) { 2473 for (k = 0; k < BITSPERBYTE; k++) { 2474 if ((ill_cap->auth_hw_algs[i] & (1 << k)) == 0) 2475 continue; 2476 2477 oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr; 2478 bzero((void *)oalg, sizeof (*oalg)); 2479 oalg->alg_type = DL_CAPAB_IPSEC_ALG_AUTH; 2480 oalg->alg_prim = k + (BITSPERBYTE * i); 2481 mp->b_wptr += sizeof (dl_capab_ipsec_alg_t); 2482 } 2483 } 2484 /* create entries for "stype" ENCR ciphers */ 2485 for (i = 0; i < ill_cap->algs_size; i++) { 2486 for (k = 0; k < BITSPERBYTE; k++) { 2487 if ((ill_cap->encr_hw_algs[i] & (1 << k)) == 0) 2488 continue; 2489 2490 oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr; 2491 bzero((void *)oalg, sizeof (*oalg)); 2492 oalg->alg_type = DL_CAPAB_IPSEC_ALG_ENCR; 2493 oalg->alg_prim = k + (BITSPERBYTE * i); 2494 mp->b_wptr += sizeof (dl_capab_ipsec_alg_t); 2495 } 2496 } 2497 } 2498 2499 /* 2500 * Macro to count the number of 1s in a byte (8-bit word). The total count is 2501 * accumulated into the passed-in argument (sum). We could use SPARCv9's 2502 * POPC instruction, but our macro is more flexible for an arbitrary length 2503 * of bytes, such as {auth,encr}_hw_algs. These variables are currently 2504 * 256 bits long (MAX_IPSEC_ALGS), so if we know for sure that the length 2505 * stays that way, we can reduce the number of iterations required.
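* For example, with val = 0xb6 (five 1 bits): the first step yields x = 0x65 (per-pair counts 1,2,1,1), the second yields x = 0x32 (per-nibble counts 3,2), and the final step adds 2 + 3 = 5 to sum.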
2506 */ 2507 #define COUNT_1S(val, sum) { \ 2508 uint8_t x = val & 0xff; \ 2509 x = (x & 0x55) + ((x >> 1) & 0x55); \ 2510 x = (x & 0x33) + ((x >> 2) & 0x33); \ 2511 sum += (x & 0xf) + ((x >> 4) & 0xf); \ 2512 } 2513 2514 /* ARGSUSED */ 2515 static void 2516 ill_capability_ipsec_reset(ill_t *ill, mblk_t **sc_mp) 2517 { 2518 mblk_t *mp; 2519 ill_ipsec_capab_t *cap_ah = ill->ill_ipsec_capab_ah; 2520 ill_ipsec_capab_t *cap_esp = ill->ill_ipsec_capab_esp; 2521 uint64_t ill_capabilities = ill->ill_capabilities; 2522 int ah_cnt = 0, esp_cnt = 0; 2523 int ah_len = 0, esp_len = 0; 2524 int i, size = 0; 2525 2526 if (!(ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP))) 2527 return; 2528 2529 ASSERT(cap_ah != NULL || !(ill_capabilities & ILL_CAPAB_AH)); 2530 ASSERT(cap_esp != NULL || !(ill_capabilities & ILL_CAPAB_ESP)); 2531 2532 /* Find out the number of ciphers for AH */ 2533 if (cap_ah != NULL) { 2534 for (i = 0; i < cap_ah->algs_size; i++) { 2535 COUNT_1S(cap_ah->auth_hw_algs[i], ah_cnt); 2536 COUNT_1S(cap_ah->encr_hw_algs[i], ah_cnt); 2537 } 2538 if (ah_cnt > 0) { 2539 size += sizeof (dl_capability_sub_t) + 2540 sizeof (dl_capab_ipsec_t); 2541 /* dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */ 2542 ah_len = (ah_cnt - 1) * sizeof (dl_capab_ipsec_alg_t); 2543 size += ah_len; 2544 } 2545 } 2546 2547 /* Find out the number of ciphers for ESP */ 2548 if (cap_esp != NULL) { 2549 for (i = 0; i < cap_esp->algs_size; i++) { 2550 COUNT_1S(cap_esp->auth_hw_algs[i], esp_cnt); 2551 COUNT_1S(cap_esp->encr_hw_algs[i], esp_cnt); 2552 } 2553 if (esp_cnt > 0) { 2554 size += sizeof (dl_capability_sub_t) + 2555 sizeof (dl_capab_ipsec_t); 2556 /* dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */ 2557 esp_len = (esp_cnt - 1) * sizeof (dl_capab_ipsec_alg_t); 2558 size += esp_len; 2559 } 2560 } 2561 2562 if (size == 0) { 2563 ip1dbg(("ill_capability_ipsec_reset: capabilities exist but " 2564 "there's nothing to reset\n")); 2565 return; 2566 } 2567 2568 mp = allocb(size, BPRI_HI); 2569 if (mp == NULL) { 2570 ip1dbg(("ill_capability_ipsec_reset: unable to allocate " 2571 "request to disable IPSEC Hardware Acceleration\n")); 2572 return; 2573 } 2574 2575 /* 2576 * Clear the capability flags for IPSec HA but retain the ill 2577 * capability structures since it's possible that another thread 2578 * is still referring to them. The structures only get deallocated 2579 * when we destroy the ill. 2580 * 2581 * Various places check the flags to see if the ill is capable of 2582 * hardware acceleration, and by clearing them we ensure that new 2583 * outbound IPSec packets are sent down encrypted. 2584 */ 2585 ill->ill_capabilities &= ~(ILL_CAPAB_AH | ILL_CAPAB_ESP); 2586 2587 /* Fill in DL_CAPAB_IPSEC_AH sub-capability entries */ 2588 if (ah_cnt > 0) { 2589 ill_fill_ipsec_reset(ah_cnt, DL_CAPAB_IPSEC_AH, ah_len, 2590 cap_ah, mp); 2591 ASSERT(mp->b_rptr + size >= mp->b_wptr); 2592 } 2593 2594 /* Fill in DL_CAPAB_IPSEC_ESP sub-capability entries */ 2595 if (esp_cnt > 0) { 2596 ill_fill_ipsec_reset(esp_cnt, DL_CAPAB_IPSEC_ESP, esp_len, 2597 cap_esp, mp); 2598 ASSERT(mp->b_rptr + size >= mp->b_wptr); 2599 } 2600 2601 /* 2602 * At this point we've composed a bunch of sub-capabilities to be 2603 * encapsulated in a DL_CAPABILITY_REQ and later sent downstream 2604 * by the caller. Upon receiving this reset message, the driver 2605 * must stop inbound decryption (by destroying all inbound SAs) 2606 * and let the corresponding packets come in encrypted. 
2607 */ 2608 2609 if (*sc_mp != NULL) 2610 linkb(*sc_mp, mp); 2611 else 2612 *sc_mp = mp; 2613 } 2614 2615 static void 2616 ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp, 2617 boolean_t encapsulated) 2618 { 2619 boolean_t legacy = B_FALSE; 2620 2621 /* 2622 * If this DL_CAPABILITY_ACK came in as a response to our "reset" 2623 * DL_CAPABILITY_REQ, ignore it during this cycle. We've just 2624 * instructed the driver to disable its advertised capabilities, 2625 * so there's no point in accepting any response at this moment. 2626 */ 2627 if (ill->ill_dlpi_capab_state == IDS_UNKNOWN) 2628 return; 2629 2630 /* 2631 * Note that only the following two sub-capabilities may be 2632 * considered as "legacy", since their original definitions 2633 * do not incorporate the dl_mid_t module ID token, and hence 2634 * may require the use of the wrapper sub-capability. 2635 */ 2636 switch (subp->dl_cap) { 2637 case DL_CAPAB_IPSEC_AH: 2638 case DL_CAPAB_IPSEC_ESP: 2639 legacy = B_TRUE; 2640 break; 2641 } 2642 2643 /* 2644 * For legacy sub-capabilities which don't incorporate a queue_t 2645 * pointer in their structures, discard them if we detect that 2646 * there are intermediate modules in between IP and the driver. 2647 */ 2648 if (!encapsulated && legacy && ill->ill_lmod_cnt > 1) { 2649 ip1dbg(("ill_capability_dispatch: unencapsulated capab type " 2650 "%d discarded; %d module(s) present below IP\n", 2651 subp->dl_cap, ill->ill_lmod_cnt)); 2652 return; 2653 } 2654 2655 switch (subp->dl_cap) { 2656 case DL_CAPAB_IPSEC_AH: 2657 case DL_CAPAB_IPSEC_ESP: 2658 ill_capability_ipsec_ack(ill, mp, subp); 2659 break; 2660 case DL_CAPAB_MDT: 2661 ill_capability_mdt_ack(ill, mp, subp); 2662 break; 2663 case DL_CAPAB_HCKSUM: 2664 ill_capability_hcksum_ack(ill, mp, subp); 2665 break; 2666 case DL_CAPAB_ZEROCOPY: 2667 ill_capability_zerocopy_ack(ill, mp, subp); 2668 break; 2669 case DL_CAPAB_POLL: 2670 if (!SOFT_RINGS_ENABLED()) 2671 ill_capability_dls_ack(ill, mp, subp); 2672 break; 2673 case DL_CAPAB_SOFT_RING: 2674 if (SOFT_RINGS_ENABLED()) 2675 ill_capability_dls_ack(ill, mp, subp); 2676 break; 2677 case DL_CAPAB_LSO: 2678 ill_capability_lso_ack(ill, mp, subp); 2679 break; 2680 default: 2681 ip1dbg(("ill_capability_dispatch: unknown capab type %d\n", 2682 subp->dl_cap)); 2683 } 2684 } 2685 2686 /* 2687 * As part of negotiating polling capability, the driver tells us 2688 * the default (or normal) blanking interval and packet threshold 2689 * (the receive timer fires if the blanking interval is reached or 2690 * the packet threshold is reached). 2691 * 2692 * As part of manipulating the polling interval, we always use our 2693 * estimated interval (avg service time * number of packets queued 2694 * on the squeue) but we try to blank for a minimum of 2695 * rr_normal_blank_time * rr_max_blank_ratio. We disable the 2696 * packet threshold during this time. When we are not in polling mode 2697 * we set the blank interval typically lower, rr_normal_blank_time * 2698 * rr_min_blank_ratio, but up the packet cnt by a ratio of 2699 * rr_min_pkt_cnt_ratio so that we are still getting chains if 2700 * possible, although for a shorter interval. 2701 */ 2702 #define RR_MAX_BLANK_RATIO 20 2703 #define RR_MIN_BLANK_RATIO 10 2704 #define RR_MAX_PKT_CNT_RATIO 3 2705 #define RR_MIN_PKT_CNT_RATIO 3 2706 2707 /* 2708 * These can be tuned via /etc/system.
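* For example, assuming these symbols live in the ip module, an /etc/system line like "set ip:rr_max_blank_ratio = 40" (illustrative value only) would double the maximum blanking ratio.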
2709 */ 2710 int rr_max_blank_ratio = RR_MAX_BLANK_RATIO; 2711 int rr_min_blank_ratio = RR_MIN_BLANK_RATIO; 2712 int rr_max_pkt_cnt_ratio = RR_MAX_PKT_CNT_RATIO; 2713 int rr_min_pkt_cnt_ratio = RR_MIN_PKT_CNT_RATIO; 2714 2715 static mac_resource_handle_t 2716 ill_ring_add(void *arg, mac_resource_t *mrp) 2717 { 2718 ill_t *ill = (ill_t *)arg; 2719 mac_rx_fifo_t *mrfp = (mac_rx_fifo_t *)mrp; 2720 ill_rx_ring_t *rx_ring; 2721 int ip_rx_index; 2722 2723 ASSERT(mrp != NULL); 2724 if (mrp->mr_type != MAC_RX_FIFO) { 2725 return (NULL); 2726 } 2727 ASSERT(ill != NULL); 2728 ASSERT(ill->ill_dls_capab != NULL); 2729 2730 mutex_enter(&ill->ill_lock); 2731 for (ip_rx_index = 0; ip_rx_index < ILL_MAX_RINGS; ip_rx_index++) { 2732 rx_ring = &ill->ill_dls_capab->ill_ring_tbl[ip_rx_index]; 2733 ASSERT(rx_ring != NULL); 2734 2735 if (rx_ring->rr_ring_state == ILL_RING_FREE) { 2736 time_t normal_blank_time = 2737 mrfp->mrf_normal_blank_time; 2738 uint_t normal_pkt_cnt = 2739 mrfp->mrf_normal_pkt_count; 2740 2741 bzero(rx_ring, sizeof (ill_rx_ring_t)); 2742 2743 rx_ring->rr_blank = mrfp->mrf_blank; 2744 rx_ring->rr_handle = mrfp->mrf_arg; 2745 rx_ring->rr_ill = ill; 2746 rx_ring->rr_normal_blank_time = normal_blank_time; 2747 rx_ring->rr_normal_pkt_cnt = normal_pkt_cnt; 2748 2749 rx_ring->rr_max_blank_time = 2750 normal_blank_time * rr_max_blank_ratio; 2751 rx_ring->rr_min_blank_time = 2752 normal_blank_time * rr_min_blank_ratio; 2753 rx_ring->rr_max_pkt_cnt = 2754 normal_pkt_cnt * rr_max_pkt_cnt_ratio; 2755 rx_ring->rr_min_pkt_cnt = 2756 normal_pkt_cnt * rr_min_pkt_cnt_ratio; 2757 2758 rx_ring->rr_ring_state = ILL_RING_INUSE; 2759 mutex_exit(&ill->ill_lock); 2760 2761 DTRACE_PROBE2(ill__ring__add, (void *), ill, 2762 (int), ip_rx_index); 2763 return ((mac_resource_handle_t)rx_ring); 2764 } 2765 } 2766 2767 /* 2768 * We ran out of ILL_MAX_RINGS worth of rx_ring structures. If 2769 * we have devices which can overwhelm this limit, ILL_MAX_RINGS 2770 * should be made configurable. Meanwhile it causes no panic, because 2771 * the driver will pass ip_input a NULL handle, which makes 2772 * IP allocate the default squeue, and polling mode will not 2773 * be used for this ring.
2774 */ 2775 cmn_err(CE_NOTE, "Reached maximum number of receiving rings (%d) " 2776 "for %s\n", ILL_MAX_RINGS, ill->ill_name); 2777 2778 mutex_exit(&ill->ill_lock); 2779 return (NULL); 2780 } 2781 2782 static boolean_t 2783 ill_capability_dls_init(ill_t *ill) 2784 { 2785 ill_dls_capab_t *ill_dls = ill->ill_dls_capab; 2786 conn_t *connp; 2787 size_t sz; 2788 ip_stack_t *ipst = ill->ill_ipst; 2789 2790 if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) { 2791 if (ill_dls == NULL) { 2792 cmn_err(CE_PANIC, "ill_capability_dls_init: " 2793 "soft_ring enabled for ill=%s (%p) but data " 2794 "structs uninitialized\n", ill->ill_name, 2795 (void *)ill); 2796 } 2797 return (B_TRUE); 2798 } else if (ill->ill_capabilities & ILL_CAPAB_POLL) { 2799 if (ill_dls == NULL) { 2800 cmn_err(CE_PANIC, "ill_capability_dls_init: " 2801 "polling enabled for ill=%s (%p) but data " 2802 "structs uninitialized\n", ill->ill_name, 2803 (void *)ill); 2804 } 2805 return (B_TRUE); 2806 } 2807 2808 if (ill_dls != NULL) { 2809 ill_rx_ring_t *rx_ring = ill_dls->ill_ring_tbl; 2810 /* Soft_Ring or polling is being re-enabled */ 2811 2812 connp = ill_dls->ill_unbind_conn; 2813 ASSERT(rx_ring != NULL); 2814 bzero((void *)ill_dls, sizeof (ill_dls_capab_t)); 2815 bzero((void *)rx_ring, 2816 sizeof (ill_rx_ring_t) * ILL_MAX_RINGS); 2817 ill_dls->ill_ring_tbl = rx_ring; 2818 ill_dls->ill_unbind_conn = connp; 2819 return (B_TRUE); 2820 } 2821 2822 if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP, 2823 ipst->ips_netstack)) == NULL) 2824 return (B_FALSE); 2825 2826 sz = sizeof (ill_dls_capab_t); 2827 sz += sizeof (ill_rx_ring_t) * ILL_MAX_RINGS; 2828 2829 ill_dls = kmem_zalloc(sz, KM_NOSLEEP); 2830 if (ill_dls == NULL) { 2831 cmn_err(CE_WARN, "ill_capability_dls_init: could not " 2832 "allocate dls_capab for %s (%p)\n", ill->ill_name, 2833 (void *)ill); 2834 CONN_DEC_REF(connp); 2835 return (B_FALSE); 2836 } 2837 2838 /* Allocate space to hold ring table */ 2839 ill_dls->ill_ring_tbl = (ill_rx_ring_t *)&ill_dls[1]; 2840 ill->ill_dls_capab = ill_dls; 2841 ill_dls->ill_unbind_conn = connp; 2842 return (B_TRUE); 2843 } 2844 2845 /* 2846 * ill_capability_dls_disable: disable soft_ring and/or polling 2847 * capability. Since any of the rings might already be in use, need 2848 * to call ip_squeue_clean_all() which gets behind the squeue to disable 2849 * direct calls if necessary. 
2850 */ 2851 static void 2852 ill_capability_dls_disable(ill_t *ill) 2853 { 2854 ill_dls_capab_t *ill_dls = ill->ill_dls_capab; 2855 2856 if (ill->ill_capabilities & ILL_CAPAB_DLS) { 2857 ip_squeue_clean_all(ill); 2858 ill_dls->ill_tx = NULL; 2859 ill_dls->ill_tx_handle = NULL; 2860 ill_dls->ill_dls_change_status = NULL; 2861 ill_dls->ill_dls_bind = NULL; 2862 ill_dls->ill_dls_unbind = NULL; 2863 } 2864 2865 ASSERT(!(ill->ill_capabilities & ILL_CAPAB_DLS)); 2866 } 2867 2868 static void 2869 ill_capability_dls_capable(ill_t *ill, dl_capab_dls_t *idls, 2870 dl_capability_sub_t *isub) 2871 { 2872 uint_t size; 2873 uchar_t *rptr; 2874 dl_capab_dls_t dls, *odls; 2875 ill_dls_capab_t *ill_dls; 2876 mblk_t *nmp = NULL; 2877 dl_capability_req_t *ocap; 2878 uint_t sub_dl_cap = isub->dl_cap; 2879 2880 if (!ill_capability_dls_init(ill)) 2881 return; 2882 ill_dls = ill->ill_dls_capab; 2883 2884 /* Copy locally to get the members aligned */ 2885 bcopy((void *)idls, (void *)&dls, 2886 sizeof (dl_capab_dls_t)); 2887 2888 /* Get the tx function and handle from dld */ 2889 ill_dls->ill_tx = (ip_dld_tx_t)dls.dls_tx; 2890 ill_dls->ill_tx_handle = (void *)dls.dls_tx_handle; 2891 2892 if (sub_dl_cap == DL_CAPAB_SOFT_RING) { 2893 ill_dls->ill_dls_change_status = 2894 (ip_dls_chg_soft_ring_t)dls.dls_ring_change_status; 2895 ill_dls->ill_dls_bind = (ip_dls_bind_t)dls.dls_ring_bind; 2896 ill_dls->ill_dls_unbind = 2897 (ip_dls_unbind_t)dls.dls_ring_unbind; 2898 ill_dls->ill_dls_soft_ring_cnt = ip_soft_rings_cnt; 2899 } 2900 2901 size = sizeof (dl_capability_req_t) + sizeof (dl_capability_sub_t) + 2902 isub->dl_length; 2903 2904 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 2905 cmn_err(CE_WARN, "ill_capability_dls_capable: could " 2906 "not allocate memory for CAPAB_REQ for %s (%p)\n", 2907 ill->ill_name, (void *)ill); 2908 return; 2909 } 2910 2911 /* initialize dl_capability_req_t */ 2912 rptr = nmp->b_rptr; 2913 ocap = (dl_capability_req_t *)rptr; 2914 ocap->dl_sub_offset = sizeof (dl_capability_req_t); 2915 ocap->dl_sub_length = sizeof (dl_capability_sub_t) + isub->dl_length; 2916 rptr += sizeof (dl_capability_req_t); 2917 2918 /* initialize dl_capability_sub_t */ 2919 bcopy(isub, rptr, sizeof (*isub)); 2920 rptr += sizeof (*isub); 2921 2922 odls = (dl_capab_dls_t *)rptr; 2923 rptr += sizeof (dl_capab_dls_t); 2924 2925 /* initialize dl_capab_dls_t to be sent down */ 2926 dls.dls_rx_handle = (uintptr_t)ill; 2927 dls.dls_rx = (uintptr_t)ip_input; 2928 dls.dls_ring_add = (uintptr_t)ill_ring_add; 2929 2930 if (sub_dl_cap == DL_CAPAB_SOFT_RING) { 2931 dls.dls_ring_cnt = ip_soft_rings_cnt; 2932 dls.dls_ring_assign = (uintptr_t)ip_soft_ring_assignment; 2933 dls.dls_flags = SOFT_RING_ENABLE; 2934 } else { 2935 dls.dls_flags = POLL_ENABLE; 2936 ip1dbg(("ill_capability_dls_capable: asking interface %s " 2937 "to enable polling\n", ill->ill_name)); 2938 } 2939 bcopy((void *)&dls, (void *)odls, 2940 sizeof (dl_capab_dls_t)); 2941 ASSERT(nmp->b_wptr == (nmp->b_rptr + size)); 2942 /* 2943 * nmp points to a DL_CAPABILITY_REQ message to 2944 * enable either soft_ring or polling 2945 */ 2946 ill_dlpi_send(ill, nmp); 2947 } 2948 2949 static void 2950 ill_capability_dls_reset(ill_t *ill, mblk_t **sc_mp) 2951 { 2952 mblk_t *mp; 2953 dl_capab_dls_t *idls; 2954 dl_capability_sub_t *dl_subcap; 2955 int size; 2956 2957 if (!(ill->ill_capabilities & ILL_CAPAB_DLS)) 2958 return; 2959 2960 ASSERT(ill->ill_dls_capab != NULL); 2961 2962 size = sizeof (*dl_subcap) + sizeof (*idls); 2963 2964 mp = allocb(size, BPRI_HI); 2965 
if (mp == NULL) { 2966 ip1dbg(("ill_capability_dls_reset: unable to allocate " 2967 "request to disable soft_ring\n")); 2968 return; 2969 } 2970 2971 mp->b_wptr = mp->b_rptr + size; 2972 2973 dl_subcap = (dl_capability_sub_t *)mp->b_rptr; 2974 dl_subcap->dl_length = sizeof (*idls); 2975 if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) 2976 dl_subcap->dl_cap = DL_CAPAB_SOFT_RING; 2977 else 2978 dl_subcap->dl_cap = DL_CAPAB_POLL; 2979 2980 idls = (dl_capab_dls_t *)(dl_subcap + 1); 2981 if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) 2982 idls->dls_flags = SOFT_RING_DISABLE; 2983 else 2984 idls->dls_flags = POLL_DISABLE; 2985 2986 if (*sc_mp != NULL) 2987 linkb(*sc_mp, mp); 2988 else 2989 *sc_mp = mp; 2990 } 2991 2992 /* 2993 * Process a soft_ring/poll capability negotiation ack received 2994 * from a DLS Provider. isub must point to the sub-capability 2995 * (DL_CAPAB_SOFT_RING/DL_CAPAB_POLL) of a DL_CAPABILITY_ACK message. 2996 */ 2997 static void 2998 ill_capability_dls_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 2999 { 3000 dl_capab_dls_t *idls; 3001 uint_t sub_dl_cap = isub->dl_cap; 3002 uint8_t *capend; 3003 3004 ASSERT(sub_dl_cap == DL_CAPAB_SOFT_RING || 3005 sub_dl_cap == DL_CAPAB_POLL); 3006 3007 if (ill->ill_isv6) 3008 return; 3009 3010 /* 3011 * Note: range checks here are not absolutely sufficient to 3012 * make us robust against malformed messages sent by drivers; 3013 * this is in keeping with the rest of IP's dlpi handling. 3014 * (Remember, it's coming from something else in the kernel 3015 * address space) 3016 */ 3017 capend = (uint8_t *)(isub + 1) + isub->dl_length; 3018 if (capend > mp->b_wptr) { 3019 cmn_err(CE_WARN, "ill_capability_dls_ack: " 3020 "malformed sub-capability too long for mblk"); 3021 return; 3022 } 3023 3024 /* 3025 * There are two types of acks we process here: 3026 * 1. acks in reply to a (first form) generic capability req 3027 * (dls_flag will be set to SOFT_RING_CAPABLE or POLL_CAPABLE) 3028 * 2. acks in reply to a SOFT_RING_ENABLE or POLL_ENABLE 3029 * capability req. 3030 */ 3031 idls = (dl_capab_dls_t *)(isub + 1); 3032 3033 if (!dlcapabcheckqid(&idls->dls_mid, ill->ill_lmod_rq)) { 3034 ip1dbg(("ill_capability_dls_ack: mid token for dls " 3035 "capability isn't as expected; pass-thru " 3036 "module(s) detected, discarding capability\n")); 3037 if (ill->ill_capabilities & ILL_CAPAB_DLS) { 3038 /* 3039 * This is a capability renegotiation case. 3040 * The interface had better be unusable at this 3041 * point; otherwise bad things will happen 3042 * if we disable direct calls on an up 3043 * and running interface. 3044 */ 3045 ill_capability_dls_disable(ill); 3046 } 3047 return; 3048 } 3049 3050 switch (idls->dls_flags) { 3051 default: 3052 /* Disable if unknown flag */ 3053 case SOFT_RING_DISABLE: 3054 case POLL_DISABLE: 3055 ill_capability_dls_disable(ill); 3056 break; 3057 case SOFT_RING_CAPABLE: 3058 case POLL_CAPABLE: 3059 /* 3060 * If the capability was already enabled, it's safe 3061 * to disable it first to get rid of stale information 3062 * and then start enabling it again.
3063 */ 3064 ill_capability_dls_disable(ill); 3065 ill_capability_dls_capable(ill, idls, isub); 3066 break; 3067 case SOFT_RING_ENABLE: 3068 case POLL_ENABLE: 3069 mutex_enter(&ill->ill_lock); 3070 if (sub_dl_cap == DL_CAPAB_SOFT_RING && 3071 !(ill->ill_capabilities & ILL_CAPAB_SOFT_RING)) { 3072 ASSERT(ill->ill_dls_capab != NULL); 3073 ill->ill_capabilities |= ILL_CAPAB_SOFT_RING; 3074 } 3075 if (sub_dl_cap == DL_CAPAB_POLL && 3076 !(ill->ill_capabilities & ILL_CAPAB_POLL)) { 3077 ASSERT(ill->ill_dls_capab != NULL); 3078 ill->ill_capabilities |= ILL_CAPAB_POLL; 3079 ip1dbg(("ill_capability_dls_ack: interface %s " 3080 "has enabled polling\n", ill->ill_name)); 3081 } 3082 mutex_exit(&ill->ill_lock); 3083 break; 3084 } 3085 } 3086 3087 /* 3088 * Process a hardware checksum offload capability negotiation ack received 3089 * from a DLS Provider. isub must point to the sub-capability (DL_CAPAB_HCKSUM) 3090 * of a DL_CAPABILITY_ACK message. 3091 */ 3092 static void 3093 ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 3094 { 3095 dl_capability_req_t *ocap; 3096 dl_capab_hcksum_t *ihck, *ohck; 3097 ill_hcksum_capab_t **ill_hcksum; 3098 mblk_t *nmp = NULL; 3099 uint_t sub_dl_cap = isub->dl_cap; 3100 uint8_t *capend; 3101 3102 ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM); 3103 3104 ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab; 3105 3106 /* 3107 * Note: range checks here are not absolutely sufficient to 3108 * make us robust against malformed messages sent by drivers; 3109 * this is in keeping with the rest of IP's dlpi handling. 3110 * (Remember, it's coming from something else in the kernel 3111 * address space) 3112 */ 3113 capend = (uint8_t *)(isub + 1) + isub->dl_length; 3114 if (capend > mp->b_wptr) { 3115 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 3116 "malformed sub-capability too long for mblk"); 3117 return; 3118 } 3119 3120 /* 3121 * There are two types of acks we process here: 3122 * 1. acks in reply to a (first form) generic capability req 3123 * (no ENABLE flag set) 3124 * 2. acks in reply to an ENABLE capability req.
3125 * (ENABLE flag set) 3126 */ 3127 ihck = (dl_capab_hcksum_t *)(isub + 1); 3128 3129 if (ihck->hcksum_version != HCKSUM_VERSION_1) { 3130 cmn_err(CE_CONT, "ill_capability_hcksum_ack: " 3131 "unsupported hardware checksum " 3132 "sub-capability (version %d, expected %d)", 3133 ihck->hcksum_version, HCKSUM_VERSION_1); 3134 return; 3135 } 3136 3137 if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) { 3138 ip1dbg(("ill_capability_hcksum_ack: mid token for hardware " 3139 "checksum capability isn't as expected; pass-thru " 3140 "module(s) detected, discarding capability\n")); 3141 return; 3142 } 3143 3144 #define CURR_HCKSUM_CAPAB \ 3145 (HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 | \ 3146 HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM) 3147 3148 if ((ihck->hcksum_txflags & HCKSUM_ENABLE) && 3149 (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) { 3150 /* do ENABLE processing */ 3151 if (*ill_hcksum == NULL) { 3152 *ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t), 3153 KM_NOSLEEP); 3154 3155 if (*ill_hcksum == NULL) { 3156 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 3157 "could not enable hcksum version %d " 3158 "for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION, 3159 ill->ill_name); 3160 return; 3161 } 3162 } 3163 3164 (*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version; 3165 (*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags; 3166 ill->ill_capabilities |= ILL_CAPAB_HCKSUM; 3167 ip1dbg(("ill_capability_hcksum_ack: interface %s " 3168 "has enabled hardware checksumming\n ", 3169 ill->ill_name)); 3170 } else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) { 3171 /* 3172 * Enabling hardware checksum offload 3173 * Currently IP supports {TCP,UDP}/IPv4 3174 * partial and full cksum offload and 3175 * IPv4 header checksum offload. 3176 * Allocate new mblk which will 3177 * contain a new capability request 3178 * to enable hardware checksum offload. 3179 */ 3180 uint_t size; 3181 uchar_t *rptr; 3182 3183 size = sizeof (dl_capability_req_t) + 3184 sizeof (dl_capability_sub_t) + isub->dl_length; 3185 3186 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 3187 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 3188 "could not enable hardware cksum for %s (ENOMEM)\n", 3189 ill->ill_name); 3190 return; 3191 } 3192 3193 rptr = nmp->b_rptr; 3194 /* initialize dl_capability_req_t */ 3195 ocap = (dl_capability_req_t *)nmp->b_rptr; 3196 ocap->dl_sub_offset = 3197 sizeof (dl_capability_req_t); 3198 ocap->dl_sub_length = 3199 sizeof (dl_capability_sub_t) + 3200 isub->dl_length; 3201 nmp->b_rptr += sizeof (dl_capability_req_t); 3202 3203 /* initialize dl_capability_sub_t */ 3204 bcopy(isub, nmp->b_rptr, sizeof (*isub)); 3205 nmp->b_rptr += sizeof (*isub); 3206 3207 /* initialize dl_capab_hcksum_t */ 3208 ohck = (dl_capab_hcksum_t *)nmp->b_rptr; 3209 bcopy(ihck, ohck, sizeof (*ihck)); 3210 3211 nmp->b_rptr = rptr; 3212 ASSERT(nmp->b_wptr == (nmp->b_rptr + size)); 3213 3214 /* Set ENABLE flag */ 3215 ohck->hcksum_txflags &= CURR_HCKSUM_CAPAB; 3216 ohck->hcksum_txflags |= HCKSUM_ENABLE; 3217 3218 /* 3219 * nmp points to a DL_CAPABILITY_REQ message to enable 3220 * hardware checksum acceleration. 
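* The provider is expected to answer with another DL_CAPABILITY_ACK carrying HCKSUM_ENABLE, which the ENABLE branch above will then process.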
3221 */ 3222 ill_dlpi_send(ill, nmp); 3223 } else { 3224 ip1dbg(("ill_capability_hcksum_ack: interface %s has " 3225 "advertised %x hardware checksum capability flags\n", 3226 ill->ill_name, ihck->hcksum_txflags)); 3227 } 3228 } 3229 3230 static void 3231 ill_capability_hcksum_reset(ill_t *ill, mblk_t **sc_mp) 3232 { 3233 mblk_t *mp; 3234 dl_capab_hcksum_t *hck_subcap; 3235 dl_capability_sub_t *dl_subcap; 3236 int size; 3237 3238 if (!ILL_HCKSUM_CAPABLE(ill)) 3239 return; 3240 3241 ASSERT(ill->ill_hcksum_capab != NULL); 3242 /* 3243 * Clear the capability flag for hardware checksum offload but 3244 * retain the ill_hcksum_capab structure since it's possible that 3245 * another thread is still referring to it. The structure only 3246 * gets deallocated when we destroy the ill. 3247 */ 3248 ill->ill_capabilities &= ~ILL_CAPAB_HCKSUM; 3249 3250 size = sizeof (*dl_subcap) + sizeof (*hck_subcap); 3251 3252 mp = allocb(size, BPRI_HI); 3253 if (mp == NULL) { 3254 ip1dbg(("ill_capability_hcksum_reset: unable to allocate " 3255 "request to disable hardware checksum offload\n")); 3256 return; 3257 } 3258 3259 mp->b_wptr = mp->b_rptr + size; 3260 3261 dl_subcap = (dl_capability_sub_t *)mp->b_rptr; 3262 dl_subcap->dl_cap = DL_CAPAB_HCKSUM; 3263 dl_subcap->dl_length = sizeof (*hck_subcap); 3264 3265 hck_subcap = (dl_capab_hcksum_t *)(dl_subcap + 1); 3266 hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version; 3267 hck_subcap->hcksum_txflags = 0; 3268 3269 if (*sc_mp != NULL) 3270 linkb(*sc_mp, mp); 3271 else 3272 *sc_mp = mp; 3273 } 3274 3275 static void 3276 ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 3277 { 3278 mblk_t *nmp = NULL; 3279 dl_capability_req_t *oc; 3280 dl_capab_zerocopy_t *zc_ic, *zc_oc; 3281 ill_zerocopy_capab_t **ill_zerocopy_capab; 3282 uint_t sub_dl_cap = isub->dl_cap; 3283 uint8_t *capend; 3284 3285 ASSERT(sub_dl_cap == DL_CAPAB_ZEROCOPY); 3286 3287 ill_zerocopy_capab = (ill_zerocopy_capab_t **)&ill->ill_zerocopy_capab; 3288 3289 /* 3290 * Note: range checks here are not absolutely sufficient to 3291 * make us robust against malformed messages sent by drivers; 3292 * this is in keeping with the rest of IP's dlpi handling. 
3293 * (Remember, it's coming from something else in the kernel 3294 * address space) 3295 */ 3296 capend = (uint8_t *)(isub + 1) + isub->dl_length; 3297 if (capend > mp->b_wptr) { 3298 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 3299 "malformed sub-capability too long for mblk"); 3300 return; 3301 } 3302 3303 zc_ic = (dl_capab_zerocopy_t *)(isub + 1); 3304 if (zc_ic->zerocopy_version != ZEROCOPY_VERSION_1) { 3305 cmn_err(CE_CONT, "ill_capability_zerocopy_ack: " 3306 "unsupported ZEROCOPY sub-capability (version %d, " 3307 "expected %d)", zc_ic->zerocopy_version, 3308 ZEROCOPY_VERSION_1); 3309 return; 3310 } 3311 3312 if (!dlcapabcheckqid(&zc_ic->zerocopy_mid, ill->ill_lmod_rq)) { 3313 ip1dbg(("ill_capability_zerocopy_ack: mid token for zerocopy " 3314 "capability isn't as expected; pass-thru module(s) " 3315 "detected, discarding capability\n")); 3316 return; 3317 } 3318 3319 if ((zc_ic->zerocopy_flags & DL_CAPAB_VMSAFE_MEM) != 0) { 3320 if (*ill_zerocopy_capab == NULL) { 3321 *ill_zerocopy_capab = 3322 kmem_zalloc(sizeof (ill_zerocopy_capab_t), 3323 KM_NOSLEEP); 3324 3325 if (*ill_zerocopy_capab == NULL) { 3326 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 3327 "could not enable Zero-copy version %d " 3328 "for %s (ENOMEM)\n", ZEROCOPY_VERSION_1, 3329 ill->ill_name); 3330 return; 3331 } 3332 } 3333 3334 ip1dbg(("ill_capability_zerocopy_ack: interface %s " 3335 "supports Zero-copy version %d\n", ill->ill_name, 3336 ZEROCOPY_VERSION_1)); 3337 3338 (*ill_zerocopy_capab)->ill_zerocopy_version = 3339 zc_ic->zerocopy_version; 3340 (*ill_zerocopy_capab)->ill_zerocopy_flags = 3341 zc_ic->zerocopy_flags; 3342 3343 ill->ill_capabilities |= ILL_CAPAB_ZEROCOPY; 3344 } else { 3345 uint_t size; 3346 uchar_t *rptr; 3347 3348 size = sizeof (dl_capability_req_t) + 3349 sizeof (dl_capability_sub_t) + 3350 sizeof (dl_capab_zerocopy_t); 3351 3352 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 3353 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 3354 "could not enable zerocopy for %s (ENOMEM)\n", 3355 ill->ill_name); 3356 return; 3357 } 3358 3359 rptr = nmp->b_rptr; 3360 /* initialize dl_capability_req_t */ 3361 oc = (dl_capability_req_t *)rptr; 3362 oc->dl_sub_offset = sizeof (dl_capability_req_t); 3363 oc->dl_sub_length = sizeof (dl_capability_sub_t) + 3364 sizeof (dl_capab_zerocopy_t); 3365 rptr += sizeof (dl_capability_req_t); 3366 3367 /* initialize dl_capability_sub_t */ 3368 bcopy(isub, rptr, sizeof (*isub)); 3369 rptr += sizeof (*isub); 3370 3371 /* initialize dl_capab_zerocopy_t */ 3372 zc_oc = (dl_capab_zerocopy_t *)rptr; 3373 *zc_oc = *zc_ic; 3374 3375 ip1dbg(("ill_capability_zerocopy_ack: asking interface %s " 3376 "to enable zero-copy version %d\n", ill->ill_name, 3377 ZEROCOPY_VERSION_1)); 3378 3379 /* set VMSAFE_MEM flag */ 3380 zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM; 3381 3382 /* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */ 3383 ill_dlpi_send(ill, nmp); 3384 } 3385 } 3386 3387 static void 3388 ill_capability_zerocopy_reset(ill_t *ill, mblk_t **sc_mp) 3389 { 3390 mblk_t *mp; 3391 dl_capab_zerocopy_t *zerocopy_subcap; 3392 dl_capability_sub_t *dl_subcap; 3393 int size; 3394 3395 if (!(ill->ill_capabilities & ILL_CAPAB_ZEROCOPY)) 3396 return; 3397 3398 ASSERT(ill->ill_zerocopy_capab != NULL); 3399 /* 3400 * Clear the capability flag for Zero-copy but retain the 3401 * ill_zerocopy_capab structure since it's possible that another 3402 * thread is still referring to it. The structure only gets 3403 * deallocated when we destroy the ill. 
3404 */ 3405 ill->ill_capabilities &= ~ILL_CAPAB_ZEROCOPY; 3406 3407 size = sizeof (*dl_subcap) + sizeof (*zerocopy_subcap); 3408 3409 mp = allocb(size, BPRI_HI); 3410 if (mp == NULL) { 3411 ip1dbg(("ill_capability_zerocopy_reset: unable to allocate " 3412 "request to disable Zero-copy\n")); 3413 return; 3414 } 3415 3416 mp->b_wptr = mp->b_rptr + size; 3417 3418 dl_subcap = (dl_capability_sub_t *)mp->b_rptr; 3419 dl_subcap->dl_cap = DL_CAPAB_ZEROCOPY; 3420 dl_subcap->dl_length = sizeof (*zerocopy_subcap); 3421 3422 zerocopy_subcap = (dl_capab_zerocopy_t *)(dl_subcap + 1); 3423 zerocopy_subcap->zerocopy_version = 3424 ill->ill_zerocopy_capab->ill_zerocopy_version; 3425 zerocopy_subcap->zerocopy_flags = 0; 3426 3427 if (*sc_mp != NULL) 3428 linkb(*sc_mp, mp); 3429 else 3430 *sc_mp = mp; 3431 } 3432 3433 /* 3434 * Process Large Segment Offload capability negotiation ack received from a 3435 * DLS Provider. isub must point to the sub-capability (DL_CAPAB_LSO) of a 3436 * DL_CAPABILITY_ACK message. 3437 */ 3438 static void 3439 ill_capability_lso_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 3440 { 3441 mblk_t *nmp = NULL; 3442 dl_capability_req_t *oc; 3443 dl_capab_lso_t *lso_ic, *lso_oc; 3444 ill_lso_capab_t **ill_lso_capab; 3445 uint_t sub_dl_cap = isub->dl_cap; 3446 uint8_t *capend; 3447 3448 ASSERT(sub_dl_cap == DL_CAPAB_LSO); 3449 3450 ill_lso_capab = (ill_lso_capab_t **)&ill->ill_lso_capab; 3451 3452 /* 3453 * Note: range checks here are not absolutely sufficient to 3454 * make us robust against malformed messages sent by drivers; 3455 * this is in keeping with the rest of IP's dlpi handling. 3456 * (Remember, it's coming from something else in the kernel 3457 * address space) 3458 */ 3459 capend = (uint8_t *)(isub + 1) + isub->dl_length; 3460 if (capend > mp->b_wptr) { 3461 cmn_err(CE_WARN, "ill_capability_lso_ack: " 3462 "malformed sub-capability too long for mblk"); 3463 return; 3464 } 3465 3466 lso_ic = (dl_capab_lso_t *)(isub + 1); 3467 3468 if (lso_ic->lso_version != LSO_VERSION_1) { 3469 cmn_err(CE_CONT, "ill_capability_lso_ack: " 3470 "unsupported LSO sub-capability (version %d, expected %d)", 3471 lso_ic->lso_version, LSO_VERSION_1); 3472 return; 3473 } 3474 3475 if (!dlcapabcheckqid(&lso_ic->lso_mid, ill->ill_lmod_rq)) { 3476 ip1dbg(("ill_capability_lso_ack: mid token for LSO " 3477 "capability isn't as expected; pass-thru module(s) " 3478 "detected, discarding capability\n")); 3479 return; 3480 } 3481 3482 if ((lso_ic->lso_flags & LSO_TX_ENABLE) && 3483 (lso_ic->lso_flags & LSO_TX_BASIC_TCP_IPV4)) { 3484 if (*ill_lso_capab == NULL) { 3485 *ill_lso_capab = kmem_zalloc(sizeof (ill_lso_capab_t), 3486 KM_NOSLEEP); 3487 3488 if (*ill_lso_capab == NULL) { 3489 cmn_err(CE_WARN, "ill_capability_lso_ack: " 3490 "could not enable LSO version %d " 3491 "for %s (ENOMEM)\n", LSO_VERSION_1, 3492 ill->ill_name); 3493 return; 3494 } 3495 } 3496 3497 (*ill_lso_capab)->ill_lso_version = lso_ic->lso_version; 3498 (*ill_lso_capab)->ill_lso_flags = lso_ic->lso_flags; 3499 (*ill_lso_capab)->ill_lso_max = lso_ic->lso_max; 3500 ill->ill_capabilities |= ILL_CAPAB_LSO; 3501 3502 ip1dbg(("ill_capability_lso_ack: interface %s " 3503 "has enabled LSO\n ", ill->ill_name)); 3504 } else if (lso_ic->lso_flags & LSO_TX_BASIC_TCP_IPV4) { 3505 uint_t size; 3506 uchar_t *rptr; 3507 3508 size = sizeof (dl_capability_req_t) + 3509 sizeof (dl_capability_sub_t) + sizeof (dl_capab_lso_t); 3510 3511 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 3512 cmn_err(CE_WARN, "ill_capability_lso_ack: " 
3513 "could not enable LSO for %s (ENOMEM)\n", 3514 ill->ill_name); 3515 return; 3516 } 3517 3518 rptr = nmp->b_rptr; 3519 /* initialize dl_capability_req_t */ 3520 oc = (dl_capability_req_t *)nmp->b_rptr; 3521 oc->dl_sub_offset = sizeof (dl_capability_req_t); 3522 oc->dl_sub_length = sizeof (dl_capability_sub_t) + 3523 sizeof (dl_capab_lso_t); 3524 nmp->b_rptr += sizeof (dl_capability_req_t); 3525 3526 /* initialize dl_capability_sub_t */ 3527 bcopy(isub, nmp->b_rptr, sizeof (*isub)); 3528 nmp->b_rptr += sizeof (*isub); 3529 3530 /* initialize dl_capab_lso_t */ 3531 lso_oc = (dl_capab_lso_t *)nmp->b_rptr; 3532 bcopy(lso_ic, lso_oc, sizeof (*lso_ic)); 3533 3534 nmp->b_rptr = rptr; 3535 ASSERT(nmp->b_wptr == (nmp->b_rptr + size)); 3536 3537 /* set ENABLE flag */ 3538 lso_oc->lso_flags |= LSO_TX_ENABLE; 3539 3540 /* nmp points to a DL_CAPABILITY_REQ message to enable LSO */ 3541 ill_dlpi_send(ill, nmp); 3542 } else { 3543 ip1dbg(("ill_capability_lso_ack: interface %s has " 3544 "advertised %x LSO capability flags\n", 3545 ill->ill_name, lso_ic->lso_flags)); 3546 } 3547 } 3548 3549 3550 static void 3551 ill_capability_lso_reset(ill_t *ill, mblk_t **sc_mp) 3552 { 3553 mblk_t *mp; 3554 dl_capab_lso_t *lso_subcap; 3555 dl_capability_sub_t *dl_subcap; 3556 int size; 3557 3558 if (!(ill->ill_capabilities & ILL_CAPAB_LSO)) 3559 return; 3560 3561 ASSERT(ill->ill_lso_capab != NULL); 3562 /* 3563 * Clear the capability flag for LSO but retain the 3564 * ill_lso_capab structure since it's possible that another 3565 * thread is still referring to it. The structure only gets 3566 * deallocated when we destroy the ill. 3567 */ 3568 ill->ill_capabilities &= ~ILL_CAPAB_LSO; 3569 3570 size = sizeof (*dl_subcap) + sizeof (*lso_subcap); 3571 3572 mp = allocb(size, BPRI_HI); 3573 if (mp == NULL) { 3574 ip1dbg(("ill_capability_lso_reset: unable to allocate " 3575 "request to disable LSO\n")); 3576 return; 3577 } 3578 3579 mp->b_wptr = mp->b_rptr + size; 3580 3581 dl_subcap = (dl_capability_sub_t *)mp->b_rptr; 3582 dl_subcap->dl_cap = DL_CAPAB_LSO; 3583 dl_subcap->dl_length = sizeof (*lso_subcap); 3584 3585 lso_subcap = (dl_capab_lso_t *)(dl_subcap + 1); 3586 lso_subcap->lso_version = ill->ill_lso_capab->ill_lso_version; 3587 lso_subcap->lso_flags = 0; 3588 3589 if (*sc_mp != NULL) 3590 linkb(*sc_mp, mp); 3591 else 3592 *sc_mp = mp; 3593 } 3594 3595 /* 3596 * Consume a new-style hardware capabilities negotiation ack. 3597 * Called from ip_rput_dlpi_writer(). 3598 */ 3599 void 3600 ill_capability_ack(ill_t *ill, mblk_t *mp) 3601 { 3602 dl_capability_ack_t *capp; 3603 dl_capability_sub_t *subp, *endp; 3604 3605 if (ill->ill_dlpi_capab_state == IDS_INPROGRESS) 3606 ill->ill_dlpi_capab_state = IDS_OK; 3607 3608 capp = (dl_capability_ack_t *)mp->b_rptr; 3609 3610 if (capp->dl_sub_length == 0) 3611 /* no new-style capabilities */ 3612 return; 3613 3614 /* make sure the driver supplied correct dl_sub_length */ 3615 if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) { 3616 ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, " 3617 "invalid dl_sub_length (%d)\n", capp->dl_sub_length)); 3618 return; 3619 } 3620 3621 #define SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset)) 3622 /* 3623 * There are sub-capabilities. Process the ones we know about. 3624 * Loop until we don't have room for another sub-cap header.. 
3625 */ 3626 for (subp = SC(capp, capp->dl_sub_offset), 3627 endp = SC(subp, capp->dl_sub_length - sizeof (*subp)); 3628 subp <= endp; 3629 subp = SC(subp, sizeof (dl_capability_sub_t) + subp->dl_length)) { 3630 3631 switch (subp->dl_cap) { 3632 case DL_CAPAB_ID_WRAPPER: 3633 ill_capability_id_ack(ill, mp, subp); 3634 break; 3635 default: 3636 ill_capability_dispatch(ill, mp, subp, B_FALSE); 3637 break; 3638 } 3639 } 3640 #undef SC 3641 } 3642 3643 /* 3644 * This routine is called to scan the fragmentation reassembly table for 3645 * the specified ILL for any packets that are starting to smell. 3646 * dead_interval is the maximum time in seconds that will be tolerated. It 3647 * will either be the value specified in ip_g_frag_timeout, or zero if the 3648 * ILL is shutting down and it is time to blow everything off. 3649 * 3650 * It returns the number of seconds (as a time_t) that the next frag timer 3651 * should be scheduled for, 0 meaning that the timer doesn't need to be 3652 * re-started. Note that the method of calculating next_timeout isn't 3653 * entirely accurate since time will flow between the time we grab 3654 * current_time and the time we schedule the next timeout. This isn't a 3655 * big problem since this is the timer for sending an ICMP reassembly time 3656 * exceeded messages, and it doesn't have to be exactly accurate. 3657 * 3658 * This function is 3659 * sometimes called as writer, although this is not required. 3660 */ 3661 time_t 3662 ill_frag_timeout(ill_t *ill, time_t dead_interval) 3663 { 3664 ipfb_t *ipfb; 3665 ipfb_t *endp; 3666 ipf_t *ipf; 3667 ipf_t *ipfnext; 3668 mblk_t *mp; 3669 time_t current_time = gethrestime_sec(); 3670 time_t next_timeout = 0; 3671 uint32_t hdr_length; 3672 mblk_t *send_icmp_head; 3673 mblk_t *send_icmp_head_v6; 3674 zoneid_t zoneid; 3675 ip_stack_t *ipst = ill->ill_ipst; 3676 3677 ipfb = ill->ill_frag_hash_tbl; 3678 if (ipfb == NULL) 3679 return (B_FALSE); 3680 endp = &ipfb[ILL_FRAG_HASH_TBL_COUNT]; 3681 /* Walk the frag hash table. */ 3682 for (; ipfb < endp; ipfb++) { 3683 send_icmp_head = NULL; 3684 send_icmp_head_v6 = NULL; 3685 mutex_enter(&ipfb->ipfb_lock); 3686 while ((ipf = ipfb->ipfb_ipf) != 0) { 3687 time_t frag_time = current_time - ipf->ipf_timestamp; 3688 time_t frag_timeout; 3689 3690 if (frag_time < dead_interval) { 3691 /* 3692 * There are some outstanding fragments 3693 * that will timeout later. Make note of 3694 * the time so that we can reschedule the 3695 * next timeout appropriately. 3696 */ 3697 frag_timeout = dead_interval - frag_time; 3698 if (next_timeout == 0 || 3699 frag_timeout < next_timeout) { 3700 next_timeout = frag_timeout; 3701 } 3702 break; 3703 } 3704 /* Time's up. Get it out of here. */ 3705 hdr_length = ipf->ipf_nf_hdr_len; 3706 ipfnext = ipf->ipf_hash_next; 3707 if (ipfnext) 3708 ipfnext->ipf_ptphn = ipf->ipf_ptphn; 3709 *ipf->ipf_ptphn = ipfnext; 3710 mp = ipf->ipf_mp->b_cont; 3711 for (; mp; mp = mp->b_cont) { 3712 /* Extra points for neatness. */ 3713 IP_REASS_SET_START(mp, 0); 3714 IP_REASS_SET_END(mp, 0); 3715 } 3716 mp = ipf->ipf_mp->b_cont; 3717 ill->ill_frag_count -= ipf->ipf_count; 3718 ASSERT(ipfb->ipfb_count >= ipf->ipf_count); 3719 ipfb->ipfb_count -= ipf->ipf_count; 3720 ASSERT(ipfb->ipfb_frag_pkts > 0); 3721 ipfb->ipfb_frag_pkts--; 3722 /* 3723 * We do not send any icmp message from here because 3724 * we currently are holding the ipfb_lock for this 3725 * hash chain. 
If we try and send any icmp messages 3726 * from here we may end up via a put back into ip 3727 * trying to get the same lock, causing a recursive 3728 * mutex panic. Instead we build a list and send all 3729 * the icmp messages after we have dropped the lock. 3730 */ 3731 if (ill->ill_isv6) { 3732 if (hdr_length != 0) { 3733 mp->b_next = send_icmp_head_v6; 3734 send_icmp_head_v6 = mp; 3735 } else { 3736 freemsg(mp); 3737 } 3738 } else { 3739 if (hdr_length != 0) { 3740 mp->b_next = send_icmp_head; 3741 send_icmp_head = mp; 3742 } else { 3743 freemsg(mp); 3744 } 3745 } 3746 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); 3747 freeb(ipf->ipf_mp); 3748 } 3749 mutex_exit(&ipfb->ipfb_lock); 3750 /* 3751 * Now need to send any icmp messages that we delayed from 3752 * above. 3753 */ 3754 while (send_icmp_head_v6 != NULL) { 3755 ip6_t *ip6h; 3756 3757 mp = send_icmp_head_v6; 3758 send_icmp_head_v6 = send_icmp_head_v6->b_next; 3759 mp->b_next = NULL; 3760 if (mp->b_datap->db_type == M_CTL) 3761 ip6h = (ip6_t *)mp->b_cont->b_rptr; 3762 else 3763 ip6h = (ip6_t *)mp->b_rptr; 3764 zoneid = ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst, 3765 ill, ipst); 3766 if (zoneid == ALL_ZONES) { 3767 freemsg(mp); 3768 } else { 3769 icmp_time_exceeded_v6(ill->ill_wq, mp, 3770 ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE, 3771 B_FALSE, zoneid, ipst); 3772 } 3773 } 3774 while (send_icmp_head != NULL) { 3775 ipaddr_t dst; 3776 3777 mp = send_icmp_head; 3778 send_icmp_head = send_icmp_head->b_next; 3779 mp->b_next = NULL; 3780 3781 if (mp->b_datap->db_type == M_CTL) 3782 dst = ((ipha_t *)mp->b_cont->b_rptr)->ipha_dst; 3783 else 3784 dst = ((ipha_t *)mp->b_rptr)->ipha_dst; 3785 3786 zoneid = ipif_lookup_addr_zoneid(dst, ill, ipst); 3787 if (zoneid == ALL_ZONES) { 3788 freemsg(mp); 3789 } else { 3790 icmp_time_exceeded(ill->ill_wq, mp, 3791 ICMP_REASSEMBLY_TIME_EXCEEDED, zoneid, 3792 ipst); 3793 } 3794 } 3795 } 3796 /* 3797 * A non-dying ILL will use the return value to decide whether to 3798 * restart the frag timer, and for how long. 3799 */ 3800 return (next_timeout); 3801 } 3802 3803 /* 3804 * This routine is called when the approximate count of mblk memory used 3805 * for the specified ILL has exceeded max_count. 3806 */ 3807 void 3808 ill_frag_prune(ill_t *ill, uint_t max_count) 3809 { 3810 ipfb_t *ipfb; 3811 ipf_t *ipf; 3812 size_t count; 3813 3814 /* 3815 * If we are here within ip_min_frag_prune_time msecs remove 3816 * ill_frag_free_num_pkts oldest packets from each bucket and increment 3817 * ill_frag_free_num_pkts. 3818 */ 3819 mutex_enter(&ill->ill_lock); 3820 if (TICK_TO_MSEC(lbolt - ill->ill_last_frag_clean_time) <= 3821 (ip_min_frag_prune_time != 0 ? 3822 ip_min_frag_prune_time : msec_per_tick)) { 3823 3824 ill->ill_frag_free_num_pkts++; 3825 3826 } else { 3827 ill->ill_frag_free_num_pkts = 0; 3828 } 3829 ill->ill_last_frag_clean_time = lbolt; 3830 mutex_exit(&ill->ill_lock); 3831 3832 /* 3833 * free ill_frag_free_num_pkts oldest packets from each bucket. 3834 */ 3835 if (ill->ill_frag_free_num_pkts != 0) { 3836 int ix; 3837 3838 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 3839 ipfb = &ill->ill_frag_hash_tbl[ix]; 3840 mutex_enter(&ipfb->ipfb_lock); 3841 if (ipfb->ipfb_ipf != NULL) { 3842 ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 3843 ill->ill_frag_free_num_pkts); 3844 } 3845 mutex_exit(&ipfb->ipfb_lock); 3846 } 3847 } 3848 /* 3849 * While the reassembly list for this ILL is too big, prune a fragment 3850 * queue by age, oldest first. 
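 * Each pass of the loop below locates the bucket holding the oldest
 * fragment chain (lowest ipf_gen), frees one packet from it via
 * ill_frag_free_pkts(), and then re-evaluates the total count.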
Note that the per ILL count is 3851 * approximate, while the per frag hash bucket counts are accurate. 3852 */ 3853 while (ill->ill_frag_count > max_count) { 3854 int ix; 3855 ipfb_t *oipfb = NULL; 3856 uint_t oldest = UINT_MAX; 3857 3858 count = 0; 3859 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 3860 ipfb = &ill->ill_frag_hash_tbl[ix]; 3861 mutex_enter(&ipfb->ipfb_lock); 3862 ipf = ipfb->ipfb_ipf; 3863 if (ipf != NULL && ipf->ipf_gen < oldest) { 3864 oldest = ipf->ipf_gen; 3865 oipfb = ipfb; 3866 } 3867 count += ipfb->ipfb_count; 3868 mutex_exit(&ipfb->ipfb_lock); 3869 } 3870 /* Refresh the per ILL count */ 3871 ill->ill_frag_count = count; 3872 if (oipfb == NULL) { 3873 ill->ill_frag_count = 0; 3874 break; 3875 } 3876 if (count <= max_count) 3877 return; /* Somebody beat us to it, nothing to do */ 3878 mutex_enter(&oipfb->ipfb_lock); 3879 ipf = oipfb->ipfb_ipf; 3880 if (ipf != NULL) { 3881 ill_frag_free_pkts(ill, oipfb, ipf, 1); 3882 } 3883 mutex_exit(&oipfb->ipfb_lock); 3884 } 3885 } 3886 3887 /* 3888 * free 'free_cnt' fragmented packets starting at ipf. 3889 */ 3890 void 3891 ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt) 3892 { 3893 size_t count; 3894 mblk_t *mp; 3895 mblk_t *tmp; 3896 ipf_t **ipfp = ipf->ipf_ptphn; 3897 3898 ASSERT(MUTEX_HELD(&ipfb->ipfb_lock)); 3899 ASSERT(ipfp != NULL); 3900 ASSERT(ipf != NULL); 3901 3902 while (ipf != NULL && free_cnt-- > 0) { 3903 count = ipf->ipf_count; 3904 mp = ipf->ipf_mp; 3905 ipf = ipf->ipf_hash_next; 3906 for (tmp = mp; tmp; tmp = tmp->b_cont) { 3907 IP_REASS_SET_START(tmp, 0); 3908 IP_REASS_SET_END(tmp, 0); 3909 } 3910 ill->ill_frag_count -= count; 3911 ASSERT(ipfb->ipfb_count >= count); 3912 ipfb->ipfb_count -= count; 3913 ASSERT(ipfb->ipfb_frag_pkts > 0); 3914 ipfb->ipfb_frag_pkts--; 3915 freemsg(mp); 3916 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); 3917 } 3918 3919 if (ipf) 3920 ipf->ipf_ptphn = ipfp; 3921 ipfp[0] = ipf; 3922 } 3923 3924 #define ND_FORWARD_WARNING "The <if>:ip*_forwarding ndd variables are " \ 3925 "obsolete and may be removed in a future release of Solaris. Use " \ 3926 "ifconfig(1M) to manipulate the forwarding status of an interface." 3927 3928 /* 3929 * For obsolete per-interface forwarding configuration; 3930 * called in response to ND_GET. 3931 */ 3932 /* ARGSUSED */ 3933 static int 3934 nd_ill_forward_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr) 3935 { 3936 ill_t *ill = (ill_t *)cp; 3937 3938 cmn_err(CE_WARN, ND_FORWARD_WARNING); 3939 3940 (void) mi_mpprintf(mp, "%d", (ill->ill_flags & ILLF_ROUTER) != 0); 3941 return (0); 3942 } 3943 3944 /* 3945 * For obsolete per-interface forwarding configuration; 3946 * called in response to ND_SET. 3947 */ 3948 /* ARGSUSED */ 3949 static int 3950 nd_ill_forward_set(queue_t *q, mblk_t *mp, char *valuestr, caddr_t cp, 3951 cred_t *ioc_cr) 3952 { 3953 long value; 3954 int retval; 3955 ip_stack_t *ipst = CONNQ_TO_IPST(q); 3956 3957 cmn_err(CE_WARN, ND_FORWARD_WARNING); 3958 3959 if (ddi_strtol(valuestr, NULL, 10, &value) != 0 || 3960 value < 0 || value > 1) { 3961 return (EINVAL); 3962 } 3963 3964 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3965 retval = ill_forward_set((ill_t *)cp, (value != 0)); 3966 rw_exit(&ipst->ips_ill_g_lock); 3967 return (retval); 3968 } 3969 3970 /* 3971 * Set an ill's ILLF_ROUTER flag appropriately. If the ill is part of an 3972 * IPMP group, make sure all ill's in the group adopt the new policy. Send 3973 * up RTS_IFINFO routing socket messages for each interface whose flags we 3974 * change. 
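 *
 * Callers must either be writer on the ill or hold ips_ill_g_lock as
 * reader (see the ASSERT below); nd_ill_forward_set() above is a
 * minimal example of the latter:
 *
 *	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
 *	retval = ill_forward_set(ill, enable);
 *	rw_exit(&ipst->ips_ill_g_lock);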
3975 */ 3976 int 3977 ill_forward_set(ill_t *ill, boolean_t enable) 3978 { 3979 ill_group_t *illgrp; 3980 ip_stack_t *ipst = ill->ill_ipst; 3981 3982 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock)); 3983 3984 if ((enable && (ill->ill_flags & ILLF_ROUTER)) || 3985 (!enable && !(ill->ill_flags & ILLF_ROUTER))) 3986 return (0); 3987 3988 if (IS_LOOPBACK(ill)) 3989 return (EINVAL); 3990 3991 /* 3992 * If the ill is in an IPMP group, set the forwarding policy on all 3993 * members of the group to the same value. 3994 */ 3995 illgrp = ill->ill_group; 3996 if (illgrp != NULL) { 3997 ill_t *tmp_ill; 3998 3999 for (tmp_ill = illgrp->illgrp_ill; tmp_ill != NULL; 4000 tmp_ill = tmp_ill->ill_group_next) { 4001 ip1dbg(("ill_forward_set: %s %s forwarding on %s", 4002 (enable ? "Enabling" : "Disabling"), 4003 (tmp_ill->ill_isv6 ? "IPv6" : "IPv4"), 4004 tmp_ill->ill_name)); 4005 mutex_enter(&tmp_ill->ill_lock); 4006 if (enable) 4007 tmp_ill->ill_flags |= ILLF_ROUTER; 4008 else 4009 tmp_ill->ill_flags &= ~ILLF_ROUTER; 4010 mutex_exit(&tmp_ill->ill_lock); 4011 if (tmp_ill->ill_isv6) 4012 ill_set_nce_router_flags(tmp_ill, enable); 4013 /* Notify routing socket listeners of this change. */ 4014 ip_rts_ifmsg(tmp_ill->ill_ipif); 4015 } 4016 } else { 4017 ip1dbg(("ill_forward_set: %s %s forwarding on %s", 4018 (enable ? "Enabling" : "Disabling"), 4019 (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name)); 4020 mutex_enter(&ill->ill_lock); 4021 if (enable) 4022 ill->ill_flags |= ILLF_ROUTER; 4023 else 4024 ill->ill_flags &= ~ILLF_ROUTER; 4025 mutex_exit(&ill->ill_lock); 4026 if (ill->ill_isv6) 4027 ill_set_nce_router_flags(ill, enable); 4028 /* Notify routing socket listeners of this change. */ 4029 ip_rts_ifmsg(ill->ill_ipif); 4030 } 4031 4032 return (0); 4033 } 4034 4035 /* 4036 * Based on the ILLF_ROUTER flag of an ill, make sure all local nce's for 4037 * addresses assigned to the ill have the NCE_F_ISROUTER flag appropriately 4038 * set or clear. 4039 */ 4040 static void 4041 ill_set_nce_router_flags(ill_t *ill, boolean_t enable) 4042 { 4043 ipif_t *ipif; 4044 nce_t *nce; 4045 4046 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 4047 nce = ndp_lookup_v6(ill, &ipif->ipif_v6lcl_addr, B_FALSE); 4048 if (nce != NULL) { 4049 mutex_enter(&nce->nce_lock); 4050 if (enable) 4051 nce->nce_flags |= NCE_F_ISROUTER; 4052 else 4053 nce->nce_flags &= ~NCE_F_ISROUTER; 4054 mutex_exit(&nce->nce_lock); 4055 NCE_REFRELE(nce); 4056 } 4057 } 4058 } 4059 4060 /* 4061 * Given an ill with a _valid_ name, add the ip_forwarding ndd variable 4062 * for this ill. Make sure the v6/v4 question has been answered about this 4063 * ill. The creation of this ndd variable is only for backwards compatibility. 4064 * The preferred way to control per-interface IP forwarding is through the 4065 * ILLF_ROUTER interface flag. 4066 */ 4067 static int 4068 ill_set_ndd_name(ill_t *ill) 4069 { 4070 char *suffix; 4071 ip_stack_t *ipst = ill->ill_ipst; 4072 4073 ASSERT(IAM_WRITER_ILL(ill)); 4074 4075 if (ill->ill_isv6) 4076 suffix = ipv6_forward_suffix; 4077 else 4078 suffix = ipv4_forward_suffix; 4079 4080 ill->ill_ndd_name = ill->ill_name + ill->ill_name_length; 4081 bcopy(ill->ill_name, ill->ill_ndd_name, ill->ill_name_length - 1); 4082 /* 4083 * Copies over the '\0'. 4084 * Note that strlen(suffix) is always bounded. 4085 */ 4086 bcopy(suffix, ill->ill_ndd_name + ill->ill_name_length - 1, 4087 strlen(suffix) + 1); 4088 4089 /* 4090 * Use of the nd table requires holding the reader lock. 
4091 * Modifying the nd table thru nd_load/nd_unload requires 4092 * the writer lock. 4093 */ 4094 rw_enter(&ipst->ips_ip_g_nd_lock, RW_WRITER); 4095 if (!nd_load(&ipst->ips_ip_g_nd, ill->ill_ndd_name, nd_ill_forward_get, 4096 nd_ill_forward_set, (caddr_t)ill)) { 4097 /* 4098 * If the nd_load failed, it only meant that it could not 4099 * allocate a new bunch of room for further NDD expansion. 4100 * Because of that, the ill_ndd_name will be set to 0, and 4101 * this interface is at the mercy of the global ip_forwarding 4102 * variable. 4103 */ 4104 rw_exit(&ipst->ips_ip_g_nd_lock); 4105 ill->ill_ndd_name = NULL; 4106 return (ENOMEM); 4107 } 4108 rw_exit(&ipst->ips_ip_g_nd_lock); 4109 return (0); 4110 } 4111 4112 /* 4113 * Initializes the context structure and returns the first ill in the list. 4114 * Currently start_list and end_list can have values: 4115 * MAX_G_HEADS Traverse both IPV4 and IPV6 lists. 4116 * IP_V4_G_HEAD Traverse IPV4 list only. 4117 * IP_V6_G_HEAD Traverse IPV6 list only. 4118 */ 4119 4120 /* 4121 * We don't check for CONDEMNED ills here. Caller must do that if 4122 * necessary under the ill lock. 4123 */ 4124 ill_t * 4125 ill_first(int start_list, int end_list, ill_walk_context_t *ctx, 4126 ip_stack_t *ipst) 4127 { 4128 ill_if_t *ifp; 4129 ill_t *ill; 4130 avl_tree_t *avl_tree; 4131 4132 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 4133 ASSERT(end_list <= MAX_G_HEADS && start_list >= 0); 4134 4135 /* 4136 * Set up the lists to search. 4137 */ 4138 if (end_list != MAX_G_HEADS) { 4139 ctx->ctx_current_list = start_list; 4140 ctx->ctx_last_list = end_list; 4141 } else { 4142 ctx->ctx_last_list = MAX_G_HEADS - 1; 4143 ctx->ctx_current_list = 0; 4144 } 4145 4146 while (ctx->ctx_current_list <= ctx->ctx_last_list) { 4147 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst); 4148 if (ifp != (ill_if_t *) 4149 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) { 4150 avl_tree = &ifp->illif_avl_by_ppa; 4151 ill = avl_first(avl_tree); 4152 /* 4153 * ill is guaranteed to be non-NULL or ifp would not 4154 * have existed. 4155 */ 4156 ASSERT(ill != NULL); 4157 return (ill); 4158 } 4159 ctx->ctx_current_list++; 4160 } 4161 4162 return (NULL); 4163 } 4164 4165 /* 4166 * Returns the next ill in the list. ill_first() must have been called 4167 * before calling ill_next() or bad things will happen. 4168 */ 4169 4170 /* 4171 * We don't check for CONDEMNED ills here. Caller must do that if 4172 * necessary under the ill lock. 4173 */ 4174 ill_t * 4175 ill_next(ill_walk_context_t *ctx, ill_t *lastill) 4176 { 4177 ill_if_t *ifp; 4178 ill_t *ill; 4179 ip_stack_t *ipst = lastill->ill_ipst; 4180 4181 ASSERT(lastill->ill_ifptr != (ill_if_t *) 4182 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)); 4183 if ((ill = avl_walk(&lastill->ill_ifptr->illif_avl_by_ppa, lastill, 4184 AVL_AFTER)) != NULL) { 4185 return (ill); 4186 } 4187 4188 /* go to the next ill_ifp in the list. */ 4189 ifp = lastill->ill_ifptr->illif_next; 4190 4191 /* make sure not at end of circular list */ 4192 while (ifp == 4193 (ill_if_t *)&IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) { 4194 if (++ctx->ctx_current_list > ctx->ctx_last_list) 4195 return (NULL); 4196 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst); 4197 } 4198 4199 return (avl_first(&ifp->illif_avl_by_ppa)); 4200 } 4201 4202 /* 4203 * Check interface name for correct format, which is name+ppa. 4204 * The name can contain letters and digits; the rightmost digits 4205 * make up the ppa number.
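 * For example, "lo0" yields ppa 0 and "e1000g11" yields ppa 11, while
 * a multi-digit ppa with a leading zero such as "lo01" is rejected;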
use of octal is not allowed, name must contain 4206 * a ppa, return pointer to the start of ppa. 4207 * In case of error return NULL. 4208 */ 4209 static char * 4210 ill_get_ppa_ptr(char *name) 4211 { 4212 int namelen = mi_strlen(name); 4213 4214 int len = namelen; 4215 4216 name += len; 4217 while (len > 0) { 4218 name--; 4219 if (*name < '0' || *name > '9') 4220 break; 4221 len--; 4222 } 4223 4224 /* empty string, all digits, or no trailing digits */ 4225 if (len == 0 || len == (int)namelen) 4226 return (NULL); 4227 4228 name++; 4229 /* check for attempted use of octal */ 4230 if (*name == '0' && len != (int)namelen - 1) 4231 return (NULL); 4232 return (name); 4233 } 4234 4235 /* 4236 * use avl tree to locate the ill. 4237 */ 4238 static ill_t * 4239 ill_find_by_name(char *name, boolean_t isv6, queue_t *q, mblk_t *mp, 4240 ipsq_func_t func, int *error, ip_stack_t *ipst) 4241 { 4242 char *ppa_ptr = NULL; 4243 int len; 4244 uint_t ppa; 4245 ill_t *ill = NULL; 4246 ill_if_t *ifp; 4247 int list; 4248 ipsq_t *ipsq; 4249 4250 if (error != NULL) 4251 *error = 0; 4252 4253 /* 4254 * get ppa ptr 4255 */ 4256 if (isv6) 4257 list = IP_V6_G_HEAD; 4258 else 4259 list = IP_V4_G_HEAD; 4260 4261 if ((ppa_ptr = ill_get_ppa_ptr(name)) == NULL) { 4262 if (error != NULL) 4263 *error = ENXIO; 4264 return (NULL); 4265 } 4266 4267 len = ppa_ptr - name + 1; 4268 4269 ppa = stoi(&ppa_ptr); 4270 4271 ifp = IP_VX_ILL_G_LIST(list, ipst); 4272 4273 while (ifp != (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) { 4274 /* 4275 * match is done on len - 1 as the name is not null 4276 * terminated it contains ppa in addition to the interface 4277 * name. 4278 */ 4279 if ((ifp->illif_name_len == len) && 4280 bcmp(ifp->illif_name, name, len - 1) == 0) { 4281 break; 4282 } else { 4283 ifp = ifp->illif_next; 4284 } 4285 } 4286 4287 4288 if (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) { 4289 /* 4290 * Even the interface type does not exist. 4291 */ 4292 if (error != NULL) 4293 *error = ENXIO; 4294 return (NULL); 4295 } 4296 4297 ill = avl_find(&ifp->illif_avl_by_ppa, (void *) &ppa, NULL); 4298 if (ill != NULL) { 4299 /* 4300 * The block comment at the start of ipif_down 4301 * explains the use of the macros used below 4302 */ 4303 GRAB_CONN_LOCK(q); 4304 mutex_enter(&ill->ill_lock); 4305 if (ILL_CAN_LOOKUP(ill)) { 4306 ill_refhold_locked(ill); 4307 mutex_exit(&ill->ill_lock); 4308 RELEASE_CONN_LOCK(q); 4309 return (ill); 4310 } else if (ILL_CAN_WAIT(ill, q)) { 4311 ipsq = ill->ill_phyint->phyint_ipsq; 4312 mutex_enter(&ipsq->ipsq_lock); 4313 mutex_exit(&ill->ill_lock); 4314 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 4315 mutex_exit(&ipsq->ipsq_lock); 4316 RELEASE_CONN_LOCK(q); 4317 *error = EINPROGRESS; 4318 return (NULL); 4319 } 4320 mutex_exit(&ill->ill_lock); 4321 RELEASE_CONN_LOCK(q); 4322 } 4323 if (error != NULL) 4324 *error = ENXIO; 4325 return (NULL); 4326 } 4327 4328 /* 4329 * comparison function for use with avl. 4330 */ 4331 static int 4332 ill_compare_ppa(const void *ppa_ptr, const void *ill_ptr) 4333 { 4334 uint_t ppa; 4335 uint_t ill_ppa; 4336 4337 ASSERT(ppa_ptr != NULL && ill_ptr != NULL); 4338 4339 ppa = *((uint_t *)ppa_ptr); 4340 ill_ppa = ((const ill_t *)ill_ptr)->ill_ppa; 4341 /* 4342 * We want the ill with the lowest ppa to be on the 4343 * top. 4344 */ 4345 if (ill_ppa < ppa) 4346 return (1); 4347 if (ill_ppa > ppa) 4348 return (-1); 4349 return (0); 4350 } 4351 4352 /* 4353 * remove an interface type from the global list. 
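 * Callers are expected to hold ips_ill_g_lock as writer, as
 * ill_glist_insert() and ill_glist_delete() do.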
4354 */ 4355 static void 4356 ill_delete_interface_type(ill_if_t *interface) 4357 { 4358 ASSERT(interface != NULL); 4359 ASSERT(avl_numnodes(&interface->illif_avl_by_ppa) == 0); 4360 4361 avl_destroy(&interface->illif_avl_by_ppa); 4362 if (interface->illif_ppa_arena != NULL) 4363 vmem_destroy(interface->illif_ppa_arena); 4364 4365 remque(interface); 4366 4367 mi_free(interface); 4368 } 4369 4370 /* Defined in ip_netinfo.c */ 4371 extern ddi_taskq_t *eventq_queue_nic; 4372 4373 /* 4374 * remove ill from the global list. 4375 */ 4376 static void 4377 ill_glist_delete(ill_t *ill) 4378 { 4379 char *nicname; 4380 size_t nicnamelen; 4381 hook_nic_event_t *info; 4382 ip_stack_t *ipst; 4383 4384 if (ill == NULL) 4385 return; 4386 ipst = ill->ill_ipst; 4387 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 4388 4389 if (ill->ill_name != NULL) { 4390 nicname = kmem_alloc(ill->ill_name_length, KM_NOSLEEP); 4391 if (nicname != NULL) { 4392 bcopy(ill->ill_name, nicname, ill->ill_name_length); 4393 nicnamelen = ill->ill_name_length; 4394 } 4395 } else { 4396 nicname = NULL; 4397 nicnamelen = 0; 4398 } 4399 4400 /* 4401 * If the ill was never inserted into the AVL tree 4402 * we skip the if branch. 4403 */ 4404 if (ill->ill_ifptr != NULL) { 4405 /* 4406 * remove from AVL tree and free ppa number 4407 */ 4408 avl_remove(&ill->ill_ifptr->illif_avl_by_ppa, ill); 4409 4410 if (ill->ill_ifptr->illif_ppa_arena != NULL) { 4411 vmem_free(ill->ill_ifptr->illif_ppa_arena, 4412 (void *)(uintptr_t)(ill->ill_ppa+1), 1); 4413 } 4414 if (avl_numnodes(&ill->ill_ifptr->illif_avl_by_ppa) == 0) { 4415 ill_delete_interface_type(ill->ill_ifptr); 4416 } 4417 4418 /* 4419 * Indicate ill is no longer in the list. 4420 */ 4421 ill->ill_ifptr = NULL; 4422 ill->ill_name_length = 0; 4423 ill->ill_name[0] = '\0'; 4424 ill->ill_ppa = UINT_MAX; 4425 } 4426 4427 /* 4428 * Run the unplumb hook after the NIC has disappeared from being 4429 * visible so that attempts to revalidate its existance will fail. 4430 * 4431 * This needs to be run inside the ill_g_lock perimeter to ensure 4432 * that the ordering of delivered events to listeners matches the 4433 * order of them in the kernel. 4434 */ 4435 if ((info = ill->ill_nic_event_info) != NULL) { 4436 if (info->hne_event != NE_DOWN) { 4437 ip2dbg(("ill_glist_delete: unexpected nic event %d " 4438 "attached for %s\n", info->hne_event, 4439 ill->ill_name)); 4440 if (info->hne_data != NULL) 4441 kmem_free(info->hne_data, info->hne_datalen); 4442 kmem_free(info, sizeof (hook_nic_event_t)); 4443 } else { 4444 if (ddi_taskq_dispatch(eventq_queue_nic, 4445 ip_ne_queue_func, (void *)info, DDI_SLEEP) 4446 == DDI_FAILURE) { 4447 ip2dbg(("ill_glist_delete: ddi_taskq_dispatch " 4448 "failed\n")); 4449 if (info->hne_data != NULL) 4450 kmem_free(info->hne_data, 4451 info->hne_datalen); 4452 kmem_free(info, sizeof (hook_nic_event_t)); 4453 } 4454 } 4455 } 4456 4457 /* Generate NE_UNPLUMB event for ill_name. */ 4458 info = kmem_alloc(sizeof (hook_nic_event_t), KM_NOSLEEP); 4459 if (info != NULL) { 4460 info->hne_nic = ill->ill_phyint->phyint_ifindex; 4461 info->hne_lif = 0; 4462 info->hne_event = NE_UNPLUMB; 4463 info->hne_data = nicname; 4464 info->hne_datalen = nicnamelen; 4465 info->hne_family = ill->ill_isv6 ? 
4466 ipst->ips_ipv6_net_data : ipst->ips_ipv4_net_data; 4467 } else { 4468 ip2dbg(("ill_glist_delete: could not attach UNPLUMB nic event " 4469 "information for %s (ENOMEM)\n", ill->ill_name)); 4470 if (nicname != NULL) 4471 kmem_free(nicname, nicnamelen); 4472 } 4473 4474 ill->ill_nic_event_info = info; 4475 4476 ill_phyint_free(ill); 4477 rw_exit(&ipst->ips_ill_g_lock); 4478 } 4479 4480 /* 4481 * Allocate a ppa. If the number of plumbed interfaces of this type is 4482 * less than ill_no_arena, do a linear search to find an unused ppa. 4483 * When the number goes beyond ill_no_arena switch to using an arena. 4484 * Note: ppa value of zero cannot be allocated from vmem_arena as it 4485 * is the return value for an error condition, so allocation starts at one 4486 * and the result is decremented by one before use. 4487 */ 4488 static int 4489 ill_alloc_ppa(ill_if_t *ifp, ill_t *ill) 4490 { 4491 ill_t *tmp_ill; 4492 uint_t start, end; 4493 int ppa; 4494 4495 if (ifp->illif_ppa_arena == NULL && 4496 (avl_numnodes(&ifp->illif_avl_by_ppa) + 1 > ill_no_arena)) { 4497 /* 4498 * Create an arena. 4499 */ 4500 ifp->illif_ppa_arena = vmem_create(ifp->illif_name, 4501 (void *)1, UINT_MAX - 1, 1, NULL, NULL, 4502 NULL, 0, VM_SLEEP | VMC_IDENTIFIER); 4503 /* allocate what has already been assigned */ 4504 for (tmp_ill = avl_first(&ifp->illif_avl_by_ppa); 4505 tmp_ill != NULL; tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, 4506 tmp_ill, AVL_AFTER)) { 4507 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena, 4508 1, /* size */ 4509 1, /* align/quantum */ 4510 0, /* phase */ 4511 0, /* nocross */ 4512 /* minaddr */ 4513 (void *)((uintptr_t)tmp_ill->ill_ppa + 1), 4514 /* maxaddr */ 4515 (void *)((uintptr_t)tmp_ill->ill_ppa + 2), 4516 VM_NOSLEEP|VM_FIRSTFIT); 4517 if (ppa == 0) { 4518 ip1dbg(("ill_alloc_ppa: ppa allocation" 4519 " failed while switching")); 4520 vmem_destroy(ifp->illif_ppa_arena); 4521 ifp->illif_ppa_arena = NULL; 4522 break; 4523 } 4524 } 4525 } 4526 4527 if (ifp->illif_ppa_arena != NULL) { 4528 if (ill->ill_ppa == UINT_MAX) { 4529 ppa = (int)(uintptr_t)vmem_alloc(ifp->illif_ppa_arena, 4530 1, VM_NOSLEEP|VM_FIRSTFIT); 4531 if (ppa == 0) 4532 return (EAGAIN); 4533 ill->ill_ppa = --ppa; 4534 } else { 4535 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena, 4536 1, /* size */ 4537 1, /* align/quantum */ 4538 0, /* phase */ 4539 0, /* nocross */ 4540 (void *)(uintptr_t)(ill->ill_ppa + 1), /* minaddr */ 4541 (void *)(uintptr_t)(ill->ill_ppa + 2), /* maxaddr */ 4542 VM_NOSLEEP|VM_FIRSTFIT); 4543 /* 4544 * Most likely the allocation failed because 4545 * the requested ppa was in use. 4546 */ 4547 if (ppa == 0) 4548 return (EEXIST); 4549 } 4550 return (0); 4551 } 4552 4553 /* 4554 * No arena is in use and not enough (>ill_no_arena) interfaces have 4555 * been plumbed to create one. Do a linear search to get an unused ppa. 4556 */ 4557 if (ill->ill_ppa == UINT_MAX) { 4558 end = UINT_MAX - 1; 4559 start = 0; 4560 } else { 4561 end = start = ill->ill_ppa; 4562 } 4563 4564 tmp_ill = avl_find(&ifp->illif_avl_by_ppa, (void *)&start, NULL); 4565 while (tmp_ill != NULL && tmp_ill->ill_ppa == start) { 4566 if (start++ >= end) { 4567 if (ill->ill_ppa == UINT_MAX) 4568 return (EAGAIN); 4569 else 4570 return (EEXIST); 4571 } 4572 tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, tmp_ill, AVL_AFTER); 4573 } 4574 ill->ill_ppa = start; 4575 return (0); 4576 } 4577 4578 /* 4579 * Insert ill into the list of configured ill's. Once this function completes, 4580 * the ill is globally visible and is available through lookups.
More precisely 4581 * this happens after the caller drops the ill_g_lock. 4582 */ 4583 static int 4584 ill_glist_insert(ill_t *ill, char *name, boolean_t isv6) 4585 { 4586 ill_if_t *ill_interface; 4587 avl_index_t where = 0; 4588 int error; 4589 int name_length; 4590 int index; 4591 boolean_t check_length = B_FALSE; 4592 ip_stack_t *ipst = ill->ill_ipst; 4593 4594 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 4595 4596 name_length = mi_strlen(name) + 1; 4597 4598 if (isv6) 4599 index = IP_V6_G_HEAD; 4600 else 4601 index = IP_V4_G_HEAD; 4602 4603 ill_interface = IP_VX_ILL_G_LIST(index, ipst); 4604 /* 4605 * Search for interface type based on name 4606 */ 4607 while (ill_interface != (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) { 4608 if ((ill_interface->illif_name_len == name_length) && 4609 (strcmp(ill_interface->illif_name, name) == 0)) { 4610 break; 4611 } 4612 ill_interface = ill_interface->illif_next; 4613 } 4614 4615 /* 4616 * Interface type not found, create one. 4617 */ 4618 if (ill_interface == (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) { 4619 4620 ill_g_head_t ghead; 4621 4622 /* 4623 * allocate ill_if_t structure 4624 */ 4625 4626 ill_interface = (ill_if_t *)mi_zalloc(sizeof (ill_if_t)); 4627 if (ill_interface == NULL) { 4628 return (ENOMEM); 4629 } 4630 4631 4632 4633 (void) strcpy(ill_interface->illif_name, name); 4634 ill_interface->illif_name_len = name_length; 4635 4636 avl_create(&ill_interface->illif_avl_by_ppa, 4637 ill_compare_ppa, sizeof (ill_t), 4638 offsetof(struct ill_s, ill_avl_byppa)); 4639 4640 /* 4641 * link the structure in at the back to maintain order 4642 * of configuration for ifconfig output. 4643 */ 4644 ghead = ipst->ips_ill_g_heads[index]; 4645 insque(ill_interface, ghead.ill_g_list_tail); 4646 4647 } 4648 4649 if (ill->ill_ppa == UINT_MAX) 4650 check_length = B_TRUE; 4651 4652 error = ill_alloc_ppa(ill_interface, ill); 4653 if (error != 0) { 4654 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0) 4655 ill_delete_interface_type(ill->ill_ifptr); 4656 return (error); 4657 } 4658 4659 /* 4660 * When the ppa is chosen by the system, check that there is 4661 * enough space to insert the ppa. If a specific ppa was passed in, this 4662 * check is not required as the interface name passed in will have 4663 * the right ppa in it. 4664 */ 4665 if (check_length) { 4666 /* 4667 * UINT_MAX - 1 should fit in 10 chars, alloc 12 chars. 4668 */ 4669 char buf[sizeof (uint_t) * 3]; 4670 4671 /* 4672 * convert ppa to string to calculate the amount of space 4673 * required for it in the name. 4674 */ 4675 numtos(ill->ill_ppa, buf); 4676 4677 /* Do we have enough space to insert the ppa?
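 * For instance (a hypothetical worst case), a name of "e1000g" with a
 * system-chosen ppa of UINT_MAX - 1 (4294967294) needs
 * mi_strlen("e1000g") + 10 + 1 bytes, which must not exceed LIFNAMSIZ.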
*/ 4678 4679 if ((mi_strlen(name) + mi_strlen(buf) + 1) > LIFNAMSIZ) { 4680 /* Free ppa and interface type struct */ 4681 if (ill_interface->illif_ppa_arena != NULL) { 4682 vmem_free(ill_interface->illif_ppa_arena, 4683 (void *)(uintptr_t)(ill->ill_ppa+1), 1); 4684 } 4685 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 4686 0) { 4687 ill_delete_interface_type(ill->ill_ifptr); 4688 } 4689 4690 return (EINVAL); 4691 } 4692 } 4693 4694 (void) sprintf(ill->ill_name, "%s%u", name, ill->ill_ppa); 4695 ill->ill_name_length = mi_strlen(ill->ill_name) + 1; 4696 4697 (void) avl_find(&ill_interface->illif_avl_by_ppa, &ill->ill_ppa, 4698 &where); 4699 ill->ill_ifptr = ill_interface; 4700 avl_insert(&ill_interface->illif_avl_by_ppa, ill, where); 4701 4702 ill_phyint_reinit(ill); 4703 return (0); 4704 } 4705 4706 /* Initialize the per phyint (per IPMP group) ipsq used for serialization */ 4707 static boolean_t 4708 ipsq_init(ill_t *ill) 4709 { 4710 ipsq_t *ipsq; 4711 4712 /* Init the ipsq and implicitly enter as writer */ 4713 ill->ill_phyint->phyint_ipsq = 4714 kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP); 4715 if (ill->ill_phyint->phyint_ipsq == NULL) 4716 return (B_FALSE); 4717 ipsq = ill->ill_phyint->phyint_ipsq; 4718 ipsq->ipsq_phyint_list = ill->ill_phyint; 4719 ill->ill_phyint->phyint_ipsq_next = NULL; 4720 mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, 0); 4721 ipsq->ipsq_refs = 1; 4722 ipsq->ipsq_writer = curthread; 4723 ipsq->ipsq_reentry_cnt = 1; 4724 ipsq->ipsq_ipst = ill->ill_ipst; /* No netstack_hold */ 4725 #ifdef ILL_DEBUG 4726 ipsq->ipsq_depth = getpcstack((pc_t *)ipsq->ipsq_stack, IP_STACK_DEPTH); 4727 #endif 4728 (void) strcpy(ipsq->ipsq_name, ill->ill_name); 4729 return (B_TRUE); 4730 } 4731 4732 /* 4733 * ill_init is called by ip_open when a device control stream is opened. 4734 * It does a few initializations, and shoots a DL_INFO_REQ message down 4735 * to the driver. The response is later picked up in ip_rput_dlpi and 4736 * used to set up default mechanisms for talking to the driver. (Always 4737 * called as writer.) 4738 * 4739 * If this function returns an error, ip_open will call ip_close which in 4740 * turn will call ill_delete to clean up any memory allocated here that 4741 * is not yet freed. 4742 */ 4743 int 4744 ill_init(queue_t *q, ill_t *ill) 4745 { 4746 int count; 4747 dl_info_req_t *dlir; 4748 mblk_t *info_mp; 4749 uchar_t *frag_ptr; 4750 4751 /* 4752 * The ill is initialized to zero by mi_alloc*(). In addition 4753 * some fields already contain valid values, initialized in 4754 * ip_open(), before we reach here. 4755 */ 4756 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, 0); 4757 4758 ill->ill_rq = q; 4759 ill->ill_wq = WR(q); 4760 4761 info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)), 4762 BPRI_HI); 4763 if (info_mp == NULL) 4764 return (ENOMEM); 4765 4766 /* 4767 * Allocate sufficient space to contain our fragment hash table and 4768 * the device name.
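 * The single allocation is carved up by the assignments below
 * as follows (a sketch):
 *
 *	frag_ptr				ipfb_t hash buckets,
 *						ILL_FRAG_HASH_TBL_SIZE bytes
 *	frag_ptr + ILL_FRAG_HASH_TBL_SIZE	ill_name, followed by the
 *						room ill_set_ndd_name() later
 *						uses for the ndd variable name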
4769 */ 4770 frag_ptr = (uchar_t *)mi_zalloc(ILL_FRAG_HASH_TBL_SIZE + 4771 2 * LIFNAMSIZ + 5 + strlen(ipv6_forward_suffix)); 4772 if (frag_ptr == NULL) { 4773 freemsg(info_mp); 4774 return (ENOMEM); 4775 } 4776 ill->ill_frag_ptr = frag_ptr; 4777 ill->ill_frag_free_num_pkts = 0; 4778 ill->ill_last_frag_clean_time = 0; 4779 ill->ill_frag_hash_tbl = (ipfb_t *)frag_ptr; 4780 ill->ill_name = (char *)(frag_ptr + ILL_FRAG_HASH_TBL_SIZE); 4781 for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) { 4782 mutex_init(&ill->ill_frag_hash_tbl[count].ipfb_lock, 4783 NULL, MUTEX_DEFAULT, NULL); 4784 } 4785 4786 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t)); 4787 if (ill->ill_phyint == NULL) { 4788 freemsg(info_mp); 4789 mi_free(frag_ptr); 4790 return (ENOMEM); 4791 } 4792 4793 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0); 4794 /* 4795 * For now pretend this is a v4 ill. We need to set phyint_ill* 4796 * at this point because of the following reason. If we can't 4797 * enter the ipsq at some point and cv_wait, the writer that 4798 * wakes us up tries to locate us using the list of all phyints 4799 * in an ipsq and the ills from the phyint thru the phyint_ill*. 4800 * If we don't set it now, we risk a missed wakeup. 4801 */ 4802 ill->ill_phyint->phyint_illv4 = ill; 4803 ill->ill_ppa = UINT_MAX; 4804 ill->ill_fastpath_list = &ill->ill_fastpath_list; 4805 4806 if (!ipsq_init(ill)) { 4807 freemsg(info_mp); 4808 mi_free(frag_ptr); 4809 mi_free(ill->ill_phyint); 4810 return (ENOMEM); 4811 } 4812 4813 ill->ill_state_flags |= ILL_LL_SUBNET_PENDING; 4814 4815 4816 /* Frag queue limit stuff */ 4817 ill->ill_frag_count = 0; 4818 ill->ill_ipf_gen = 0; 4819 4820 ill->ill_global_timer = INFINITY; 4821 ill->ill_mcast_type = IGMP_V3_ROUTER; /* == MLD_V2_ROUTER */ 4822 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0; 4823 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0; 4824 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS; 4825 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL; 4826 4827 /* 4828 * Initialize IPv6 configuration variables. The IP module is always 4829 * opened as an IPv4 module. Instead of tracking down the cases where 4830 * it switches to do IPv6, we'll just initialize the IPv6 configuration 4831 * here for convenience; this has no effect until the ill is set to do 4832 * IPv6. 4833 */ 4834 ill->ill_reachable_time = ND_REACHABLE_TIME; 4835 ill->ill_reachable_retrans_time = ND_RETRANS_TIMER; 4836 ill->ill_xmit_count = ND_MAX_MULTICAST_SOLICIT; 4837 ill->ill_max_buf = ND_MAX_Q; 4838 ill->ill_refcnt = 0; 4839 4840 /* Send down the Info Request to the driver. */ 4841 info_mp->b_datap->db_type = M_PCPROTO; 4842 dlir = (dl_info_req_t *)info_mp->b_rptr; 4843 info_mp->b_wptr = (uchar_t *)&dlir[1]; 4844 dlir->dl_primitive = DL_INFO_REQ; 4845 4846 ill->ill_dlpi_pending = DL_PRIM_INVAL; 4847 4848 qprocson(q); 4849 ill_dlpi_send(ill, info_mp); 4850 4851 return (0); 4852 } 4853 4854 /* 4855 * ill_dls_info 4856 * creates datalink socket info from the device.
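 * The interface name and the link-layer address are packed back to
 * back into sdl_data; the result looks like this (a sketch):
 *
 *	sdl_data[0 .. sdl_nlen - 1]			interface name
 *	sdl_data[sdl_nlen .. sdl_nlen + sdl_alen - 1]	physical address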
4857 */ 4858 int 4859 ill_dls_info(struct sockaddr_dl *sdl, const ipif_t *ipif) 4860 { 4861 size_t len; 4862 ill_t *ill = ipif->ipif_ill; 4863 4864 sdl->sdl_family = AF_LINK; 4865 sdl->sdl_index = ill->ill_phyint->phyint_ifindex; 4866 sdl->sdl_type = ill->ill_type; 4867 (void) ipif_get_name(ipif, sdl->sdl_data, sizeof (sdl->sdl_data)); 4868 len = strlen(sdl->sdl_data); 4869 ASSERT(len < 256); 4870 sdl->sdl_nlen = (uchar_t)len; 4871 sdl->sdl_alen = ill->ill_phys_addr_length; 4872 sdl->sdl_slen = 0; 4873 if (ill->ill_phys_addr_length != 0 && ill->ill_phys_addr != NULL) 4874 bcopy(ill->ill_phys_addr, &sdl->sdl_data[len], sdl->sdl_alen); 4875 4876 return (sizeof (struct sockaddr_dl)); 4877 } 4878 4879 /* 4880 * ill_xarp_info 4881 * creates xarp info from the device. 4882 */ 4883 static int 4884 ill_xarp_info(struct sockaddr_dl *sdl, ill_t *ill) 4885 { 4886 sdl->sdl_family = AF_LINK; 4887 sdl->sdl_index = ill->ill_phyint->phyint_ifindex; 4888 sdl->sdl_type = ill->ill_type; 4889 (void) ipif_get_name(ill->ill_ipif, sdl->sdl_data, 4890 sizeof (sdl->sdl_data)); 4891 sdl->sdl_nlen = (uchar_t)mi_strlen(sdl->sdl_data); 4892 sdl->sdl_alen = ill->ill_phys_addr_length; 4893 sdl->sdl_slen = 0; 4894 return (sdl->sdl_nlen); 4895 } 4896 4897 static int 4898 loopback_kstat_update(kstat_t *ksp, int rw) 4899 { 4900 kstat_named_t *kn; 4901 netstackid_t stackid; 4902 netstack_t *ns; 4903 ip_stack_t *ipst; 4904 4905 if (ksp == NULL || ksp->ks_data == NULL) 4906 return (EIO); 4907 4908 if (rw == KSTAT_WRITE) 4909 return (EACCES); 4910 4911 kn = KSTAT_NAMED_PTR(ksp); 4912 stackid = (zoneid_t)(uintptr_t)ksp->ks_private; 4913 4914 ns = netstack_find_by_stackid(stackid); 4915 if (ns == NULL) 4916 return (-1); 4917 4918 ipst = ns->netstack_ip; 4919 if (ipst == NULL) { 4920 netstack_rele(ns); 4921 return (-1); 4922 } 4923 kn[0].value.ui32 = ipst->ips_loopback_packets; 4924 kn[1].value.ui32 = ipst->ips_loopback_packets; 4925 netstack_rele(ns); 4926 return (0); 4927 } 4928 4929 4930 /* 4931 * Has ifindex been plumbed already. 4932 * Compares both phyint_ifindex and phyint_group_ifindex. 4933 */ 4934 static boolean_t 4935 phyint_exists(uint_t index, ip_stack_t *ipst) 4936 { 4937 phyint_t *phyi; 4938 4939 ASSERT(index != 0); 4940 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 4941 /* 4942 * Indexes are stored in the phyint - a common structure 4943 * to both IPv4 and IPv6. 4944 */ 4945 phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index); 4946 for (; phyi != NULL; 4947 phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 4948 phyi, AVL_AFTER)) { 4949 if (phyi->phyint_ifindex == index || 4950 phyi->phyint_group_ifindex == index) 4951 return (B_TRUE); 4952 } 4953 return (B_FALSE); 4954 } 4955 4956 /* Pick a unique ifindex */ 4957 boolean_t 4958 ip_assign_ifindex(uint_t *indexp, ip_stack_t *ipst) 4959 { 4960 uint_t starting_index; 4961 4962 if (!ipst->ips_ill_index_wrap) { 4963 *indexp = ipst->ips_ill_index++; 4964 if (ipst->ips_ill_index == 0) { 4965 /* Reached the uint_t limit Next time wrap */ 4966 ipst->ips_ill_index_wrap = B_TRUE; 4967 } 4968 return (B_TRUE); 4969 } 4970 4971 /* 4972 * Start reusing unused indexes. Note that we hold the ill_g_lock 4973 * at this point and don't want to call any function that attempts 4974 * to get the lock again. 
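 * (phyint_exists() below only ASSERTs that the lock is held; it does
 * not try to acquire it.)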
4975 */ 4976 starting_index = ipst->ips_ill_index++; 4977 for (; ipst->ips_ill_index != starting_index; ipst->ips_ill_index++) { 4978 if (ipst->ips_ill_index != 0 && 4979 !phyint_exists(ipst->ips_ill_index, ipst)) { 4980 /* found unused index - use it */ 4981 *indexp = ipst->ips_ill_index; 4982 return (B_TRUE); 4983 } 4984 } 4985 4986 /* 4987 * All interface indices are in use. 4988 */ 4989 return (B_FALSE); 4990 } 4991 4992 /* 4993 * Assign a unique interface index for the phyint. 4994 */ 4995 static boolean_t 4996 phyint_assign_ifindex(phyint_t *phyi, ip_stack_t *ipst) 4997 { 4998 ASSERT(phyi->phyint_ifindex == 0); 4999 return (ip_assign_ifindex(&phyi->phyint_ifindex, ipst)); 5000 } 5001 5002 /* 5003 * Return a pointer to the ill which matches the supplied name. Note that 5004 * the ill name length includes the null termination character. (May be 5005 * called as writer.) 5006 * If do_alloc and the interface is "lo0" it will be automatically created. 5007 * Cannot bump up a reference on condemned ills, so duplicate detection 5008 * can't be done using this function. 5009 */ 5010 ill_t * 5011 ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, 5012 queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, boolean_t *did_alloc, 5013 ip_stack_t *ipst) 5014 { 5015 ill_t *ill; 5016 ipif_t *ipif; 5017 kstat_named_t *kn; 5018 boolean_t isloopback; 5019 ipsq_t *old_ipsq; 5020 in6_addr_t ov6addr; 5021 5022 isloopback = mi_strcmp(name, ipif_loopback_name) == 0; 5023 5024 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5025 ill = ill_find_by_name(name, isv6, q, mp, func, error, ipst); 5026 rw_exit(&ipst->ips_ill_g_lock); 5027 if (ill != NULL || (error != NULL && *error == EINPROGRESS)) 5028 return (ill); 5029 5030 /* 5031 * Couldn't find it. Does this happen to be a lookup for the 5032 * loopback device and are we allowed to allocate it? 5033 */ 5034 if (!isloopback || !do_alloc) 5035 return (NULL); 5036 5037 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 5038 5039 ill = ill_find_by_name(name, isv6, q, mp, func, error, ipst); 5040 if (ill != NULL || (error != NULL && *error == EINPROGRESS)) { 5041 rw_exit(&ipst->ips_ill_g_lock); 5042 return (ill); 5043 } 5044 5045 /* Create the loopback device on demand */ 5046 ill = (ill_t *)(mi_alloc(sizeof (ill_t) + 5047 sizeof (ipif_loopback_name), BPRI_MED)); 5048 if (ill == NULL) 5049 goto done; 5050 5051 *ill = ill_null; 5052 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, NULL); 5053 ill->ill_ipst = ipst; 5054 netstack_hold(ipst->ips_netstack); 5055 /* 5056 * For exclusive stacks we set the zoneid to zero 5057 * to make IP operate as if in the global zone. 5058 */ 5059 ill->ill_zoneid = GLOBAL_ZONEID; 5060 5061 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t)); 5062 if (ill->ill_phyint == NULL) 5063 goto done; 5064 5065 if (isv6) 5066 ill->ill_phyint->phyint_illv6 = ill; 5067 else 5068 ill->ill_phyint->phyint_illv4 = ill; 5069 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0); 5070 ill->ill_max_frag = IP_LOOPBACK_MTU; 5071 /* Add room for tcp+ip headers */ 5072 if (isv6) { 5073 ill->ill_isv6 = B_TRUE; 5074 ill->ill_max_frag += IPV6_HDR_LEN + 20; /* for TCP */ 5075 } else { 5076 ill->ill_max_frag += IP_SIMPLE_HDR_LENGTH + 20; 5077 } 5078 if (!ill_allocate_mibs(ill)) 5079 goto done; 5080 ill->ill_max_mtu = ill->ill_max_frag; 5081 /* 5082 * ipif_loopback_name can't be pointed at directly because it's used 5083 * by both the ipv4 and ipv6 interfaces.
When the ill is removed 5084 * from the glist, ill_glist_delete() sets the first character of 5085 * ill_name to '\0'. 5086 */ 5087 ill->ill_name = (char *)ill + sizeof (*ill); 5088 (void) strcpy(ill->ill_name, ipif_loopback_name); 5089 ill->ill_name_length = sizeof (ipif_loopback_name); 5090 /* Set ill_name_set for ill_phyint_reinit to work properly */ 5091 5092 ill->ill_global_timer = INFINITY; 5093 ill->ill_mcast_type = IGMP_V3_ROUTER; /* == MLD_V2_ROUTER */ 5094 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0; 5095 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0; 5096 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS; 5097 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL; 5098 5099 /* No resolver here. */ 5100 ill->ill_net_type = IRE_LOOPBACK; 5101 5102 /* Initialize the ipsq */ 5103 if (!ipsq_init(ill)) 5104 goto done; 5105 5106 ill->ill_phyint->phyint_ipsq->ipsq_writer = NULL; 5107 ill->ill_phyint->phyint_ipsq->ipsq_reentry_cnt--; 5108 ASSERT(ill->ill_phyint->phyint_ipsq->ipsq_reentry_cnt == 0); 5109 #ifdef ILL_DEBUG 5110 ill->ill_phyint->phyint_ipsq->ipsq_depth = 0; 5111 #endif 5112 ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE); 5113 if (ipif == NULL) 5114 goto done; 5115 5116 ill->ill_flags = ILLF_MULTICAST; 5117 5118 ov6addr = ipif->ipif_v6lcl_addr; 5119 /* Set up default loopback address and mask. */ 5120 if (!isv6) { 5121 ipaddr_t inaddr_loopback = htonl(INADDR_LOOPBACK); 5122 5123 IN6_IPADDR_TO_V4MAPPED(inaddr_loopback, &ipif->ipif_v6lcl_addr); 5124 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; 5125 V4MASK_TO_V6(htonl(IN_CLASSA_NET), ipif->ipif_v6net_mask); 5126 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 5127 ipif->ipif_v6subnet); 5128 ill->ill_flags |= ILLF_IPV4; 5129 } else { 5130 ipif->ipif_v6lcl_addr = ipv6_loopback; 5131 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; 5132 ipif->ipif_v6net_mask = ipv6_all_ones; 5133 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 5134 ipif->ipif_v6subnet); 5135 ill->ill_flags |= ILLF_IPV6; 5136 } 5137 5138 /* 5139 * Chain us in at the end of the ill list. hold the ill 5140 * before we make it globally visible. 1 for the lookup. 5141 */ 5142 ill->ill_refcnt = 0; 5143 ill_refhold(ill); 5144 5145 ill->ill_frag_count = 0; 5146 ill->ill_frag_free_num_pkts = 0; 5147 ill->ill_last_frag_clean_time = 0; 5148 5149 old_ipsq = ill->ill_phyint->phyint_ipsq; 5150 5151 if (ill_glist_insert(ill, "lo", isv6) != 0) 5152 cmn_err(CE_PANIC, "cannot insert loopback interface"); 5153 5154 /* Let SCTP know so that it can add this to its list */ 5155 sctp_update_ill(ill, SCTP_ILL_INSERT); 5156 5157 /* 5158 * We have already assigned ipif_v6lcl_addr above, but we need to 5159 * call sctp_update_ipif_addr() after SCTP_ILL_INSERT, which 5160 * requires to be after ill_glist_insert() since we need the 5161 * ill_index set. Pass on ipv6_loopback as the old address. 5162 */ 5163 sctp_update_ipif_addr(ipif, ov6addr); 5164 5165 /* 5166 * If the ipsq was changed in ill_phyint_reinit free the old ipsq. 5167 */ 5168 if (old_ipsq != ill->ill_phyint->phyint_ipsq) { 5169 /* Loopback ills aren't in any IPMP group */ 5170 ASSERT(!(old_ipsq->ipsq_flags & IPSQ_GROUP)); 5171 ipsq_delete(old_ipsq); 5172 } 5173 5174 /* 5175 * Delay this till the ipif is allocated as ipif_allocate 5176 * de-references ill_phyint for getting the ifindex. We 5177 * can't do this before ipif_allocate because ill_phyint_reinit 5178 * -> phyint_assign_ifindex expects ipif to be present. 
5179 */ 5180 mutex_enter(&ill->ill_phyint->phyint_lock); 5181 ill->ill_phyint->phyint_flags |= PHYI_LOOPBACK | PHYI_VIRTUAL; 5182 mutex_exit(&ill->ill_phyint->phyint_lock); 5183 5184 if (ipst->ips_loopback_ksp == NULL) { 5185 /* Export loopback interface statistics */ 5186 ipst->ips_loopback_ksp = kstat_create_netstack("lo", 0, 5187 ipif_loopback_name, "net", 5188 KSTAT_TYPE_NAMED, 2, 0, 5189 ipst->ips_netstack->netstack_stackid); 5190 if (ipst->ips_loopback_ksp != NULL) { 5191 ipst->ips_loopback_ksp->ks_update = 5192 loopback_kstat_update; 5193 kn = KSTAT_NAMED_PTR(ipst->ips_loopback_ksp); 5194 kstat_named_init(&kn[0], "ipackets", KSTAT_DATA_UINT32); 5195 kstat_named_init(&kn[1], "opackets", KSTAT_DATA_UINT32); 5196 ipst->ips_loopback_ksp->ks_private = 5197 (void *)(uintptr_t)ipst->ips_netstack-> 5198 netstack_stackid; 5199 kstat_install(ipst->ips_loopback_ksp); 5200 } 5201 } 5202 5203 if (error != NULL) 5204 *error = 0; 5205 *did_alloc = B_TRUE; 5206 rw_exit(&ipst->ips_ill_g_lock); 5207 return (ill); 5208 done: 5209 if (ill != NULL) { 5210 if (ill->ill_phyint != NULL) { 5211 ipsq_t *ipsq; 5212 5213 ipsq = ill->ill_phyint->phyint_ipsq; 5214 if (ipsq != NULL) { 5215 ipsq->ipsq_ipst = NULL; 5216 kmem_free(ipsq, sizeof (ipsq_t)); 5217 } 5218 mi_free(ill->ill_phyint); 5219 } 5220 ill_free_mib(ill); 5221 if (ill->ill_ipst != NULL) 5222 netstack_rele(ill->ill_ipst->ips_netstack); 5223 mi_free(ill); 5224 } 5225 rw_exit(&ipst->ips_ill_g_lock); 5226 if (error != NULL) 5227 *error = ENOMEM; 5228 return (NULL); 5229 } 5230 5231 /* 5232 * For IPP calls - use the ip_stack_t for global stack. 5233 */ 5234 ill_t * 5235 ill_lookup_on_ifindex_global_instance(uint_t index, boolean_t isv6, 5236 queue_t *q, mblk_t *mp, ipsq_func_t func, int *err) 5237 { 5238 ip_stack_t *ipst; 5239 ill_t *ill; 5240 5241 ipst = netstack_find_by_stackid(GLOBAL_NETSTACKID)->netstack_ip; 5242 if (ipst == NULL) { 5243 cmn_err(CE_WARN, "No ip_stack_t for zoneid zero!\n"); 5244 return (NULL); 5245 } 5246 5247 ill = ill_lookup_on_ifindex(index, isv6, q, mp, func, err, ipst); 5248 netstack_rele(ipst->ips_netstack); 5249 return (ill); 5250 } 5251 5252 /* 5253 * Return a pointer to the ill which matches the index and IP version type. 5254 */ 5255 ill_t * 5256 ill_lookup_on_ifindex(uint_t index, boolean_t isv6, queue_t *q, mblk_t *mp, 5257 ipsq_func_t func, int *err, ip_stack_t *ipst) 5258 { 5259 ill_t *ill; 5260 ipsq_t *ipsq; 5261 phyint_t *phyi; 5262 5263 ASSERT((q == NULL && mp == NULL && func == NULL && err == NULL) || 5264 (q != NULL && mp != NULL && func != NULL && err != NULL)); 5265 5266 if (err != NULL) 5267 *err = 0; 5268 5269 /* 5270 * Indexes are stored in the phyint - a common structure 5271 * to both IPv4 and IPv6. 5272 */ 5273 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5274 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 5275 (void *) &index, NULL); 5276 if (phyi != NULL) { 5277 ill = isv6 ? 
phyi->phyint_illv6: phyi->phyint_illv4; 5278 if (ill != NULL) { 5279 /* 5280 * The block comment at the start of ipif_down 5281 * explains the use of the macros used below 5282 */ 5283 GRAB_CONN_LOCK(q); 5284 mutex_enter(&ill->ill_lock); 5285 if (ILL_CAN_LOOKUP(ill)) { 5286 ill_refhold_locked(ill); 5287 mutex_exit(&ill->ill_lock); 5288 RELEASE_CONN_LOCK(q); 5289 rw_exit(&ipst->ips_ill_g_lock); 5290 return (ill); 5291 } else if (ILL_CAN_WAIT(ill, q)) { 5292 ipsq = ill->ill_phyint->phyint_ipsq; 5293 mutex_enter(&ipsq->ipsq_lock); 5294 rw_exit(&ipst->ips_ill_g_lock); 5295 mutex_exit(&ill->ill_lock); 5296 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 5297 mutex_exit(&ipsq->ipsq_lock); 5298 RELEASE_CONN_LOCK(q); 5299 *err = EINPROGRESS; 5300 return (NULL); 5301 } 5302 RELEASE_CONN_LOCK(q); 5303 mutex_exit(&ill->ill_lock); 5304 } 5305 } 5306 rw_exit(&ipst->ips_ill_g_lock); 5307 if (err != NULL) 5308 *err = ENXIO; 5309 return (NULL); 5310 } 5311 5312 /* 5313 * Return the ifindex next in sequence after the passed in ifindex. 5314 * If there is no next ifindex for the given protocol, return 0. 5315 */ 5316 uint_t 5317 ill_get_next_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst) 5318 { 5319 phyint_t *phyi; 5320 phyint_t *phyi_initial; 5321 uint_t ifindex; 5322 5323 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5324 5325 if (index == 0) { 5326 phyi = avl_first( 5327 &ipst->ips_phyint_g_list->phyint_list_avl_by_index); 5328 } else { 5329 phyi = phyi_initial = avl_find( 5330 &ipst->ips_phyint_g_list->phyint_list_avl_by_index, 5331 (void *) &index, NULL); 5332 } 5333 5334 for (; phyi != NULL; 5335 phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 5336 phyi, AVL_AFTER)) { 5337 /* 5338 * If we're not returning the first interface in the tree 5339 * and we still haven't moved past the phyint_t that 5340 * corresponds to index, avl_walk needs to be called again 5341 */ 5342 if (!((index != 0) && (phyi == phyi_initial))) { 5343 if (isv6) { 5344 if ((phyi->phyint_illv6) && 5345 ILL_CAN_LOOKUP(phyi->phyint_illv6) && 5346 (phyi->phyint_illv6->ill_isv6 == 1)) 5347 break; 5348 } else { 5349 if ((phyi->phyint_illv4) && 5350 ILL_CAN_LOOKUP(phyi->phyint_illv4) && 5351 (phyi->phyint_illv4->ill_isv6 == 0)) 5352 break; 5353 } 5354 } 5355 } 5356 5357 rw_exit(&ipst->ips_ill_g_lock); 5358 5359 if (phyi != NULL) 5360 ifindex = phyi->phyint_ifindex; 5361 else 5362 ifindex = 0; 5363 5364 return (ifindex); 5365 } 5366 5367 5368 /* 5369 * Return the ifindex for the named interface. 5370 * If there is no next ifindex for the interface, return 0. 5371 */ 5372 uint_t 5373 ill_get_ifindex_by_name(char *name, ip_stack_t *ipst) 5374 { 5375 phyint_t *phyi; 5376 avl_index_t where = 0; 5377 uint_t ifindex; 5378 5379 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5380 5381 if ((phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 5382 name, &where)) == NULL) { 5383 rw_exit(&ipst->ips_ill_g_lock); 5384 return (0); 5385 } 5386 5387 ifindex = phyi->phyint_ifindex; 5388 5389 rw_exit(&ipst->ips_ill_g_lock); 5390 5391 return (ifindex); 5392 } 5393 5394 5395 /* 5396 * Obtain a reference to the ill. The ill_refcnt is a dynamic refcnt 5397 * that gives a running thread a reference to the ill. This reference must be 5398 * released by the thread when it is done accessing the ill and related 5399 * objects. ill_refcnt can not be used to account for static references 5400 * such as other structures pointing to an ill. 
Callers must generally 5401 * check whether an ill can be refheld by using ILL_CAN_LOOKUP macros 5402 * or be sure that the ill is not being deleted or changing state before 5403 * calling the refhold functions. A non-zero ill_refcnt ensures that the 5404 * ill won't change any of its critical state such as address, netmask etc. 5405 */ 5406 void 5407 ill_refhold(ill_t *ill) 5408 { 5409 mutex_enter(&ill->ill_lock); 5410 ill->ill_refcnt++; 5411 ILL_TRACE_REF(ill); 5412 mutex_exit(&ill->ill_lock); 5413 } 5414 5415 void 5416 ill_refhold_locked(ill_t *ill) 5417 { 5418 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5419 ill->ill_refcnt++; 5420 ILL_TRACE_REF(ill); 5421 } 5422 5423 int 5424 ill_check_and_refhold(ill_t *ill) 5425 { 5426 mutex_enter(&ill->ill_lock); 5427 if (ILL_CAN_LOOKUP(ill)) { 5428 ill_refhold_locked(ill); 5429 mutex_exit(&ill->ill_lock); 5430 return (0); 5431 } 5432 mutex_exit(&ill->ill_lock); 5433 return (ILL_LOOKUP_FAILED); 5434 } 5435 5436 /* 5437 * Must not be called while holding any locks. Otherwise if this is 5438 * the last reference to be released, there is a chance of recursive mutex 5439 * panic due to ill_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 5440 * to restart an ioctl. 5441 */ 5442 void 5443 ill_refrele(ill_t *ill) 5444 { 5445 mutex_enter(&ill->ill_lock); 5446 ASSERT(ill->ill_refcnt != 0); 5447 ill->ill_refcnt--; 5448 ILL_UNTRACE_REF(ill); 5449 if (ill->ill_refcnt != 0) { 5450 /* Every ire pointing to the ill adds 1 to ill_refcnt */ 5451 mutex_exit(&ill->ill_lock); 5452 return; 5453 } 5454 5455 /* Drops the ill_lock */ 5456 ipif_ill_refrele_tail(ill); 5457 } 5458 5459 /* 5460 * Obtain a weak reference count on the ill. This reference ensures the 5461 * ill won't be freed, but the ill may change any of its critical state 5462 * such as netmask, address etc. Returns an error if the ill has started 5463 * closing. 5464 */ 5465 boolean_t 5466 ill_waiter_inc(ill_t *ill) 5467 { 5468 mutex_enter(&ill->ill_lock); 5469 if (ill->ill_state_flags & ILL_CONDEMNED) { 5470 mutex_exit(&ill->ill_lock); 5471 return (B_FALSE); 5472 } 5473 ill->ill_waiters++; 5474 mutex_exit(&ill->ill_lock); 5475 return (B_TRUE); 5476 } 5477 5478 void 5479 ill_waiter_dcr(ill_t *ill) 5480 { 5481 mutex_enter(&ill->ill_lock); 5482 ill->ill_waiters--; 5483 if (ill->ill_waiters == 0) 5484 cv_broadcast(&ill->ill_cv); 5485 mutex_exit(&ill->ill_lock); 5486 } 5487 5488 /* 5489 * Named Dispatch routine to produce a formatted report on all ILLs. 5490 * This report is accessed by using the ndd utility to "get" ND variable 5491 * "ip_ill_status". 
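 *
 * For example, from a shell (a usage sketch):
 *
 *	# ndd -get /dev/ip ip_ill_status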
5492 */ 5493 /* ARGSUSED */ 5494 int 5495 ip_ill_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr) 5496 { 5497 ill_t *ill; 5498 ill_walk_context_t ctx; 5499 ip_stack_t *ipst; 5500 5501 ipst = CONNQ_TO_IPST(q); 5502 5503 (void) mi_mpprintf(mp, 5504 "ILL " MI_COL_HDRPAD_STR 5505 /* 01234567[89ABCDEF] */ 5506 "rq " MI_COL_HDRPAD_STR 5507 /* 01234567[89ABCDEF] */ 5508 "wq " MI_COL_HDRPAD_STR 5509 /* 01234567[89ABCDEF] */ 5510 "upcnt mxfrg err name"); 5511 /* 12345 12345 123 xxxxxxxx */ 5512 5513 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5514 ill = ILL_START_WALK_ALL(&ctx, ipst); 5515 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5516 (void) mi_mpprintf(mp, 5517 MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR 5518 "%05u %05u %03d %s", 5519 (void *)ill, (void *)ill->ill_rq, (void *)ill->ill_wq, 5520 ill->ill_ipif_up_count, 5521 ill->ill_max_frag, ill->ill_error, ill->ill_name); 5522 } 5523 rw_exit(&ipst->ips_ill_g_lock); 5524 5525 return (0); 5526 } 5527 5528 /* 5529 * Named Dispatch routine to produce a formatted report on all IPIFs. 5530 * This report is accessed by using the ndd utility to "get" ND variable 5531 * "ip_ipif_status". 5532 */ 5533 /* ARGSUSED */ 5534 int 5535 ip_ipif_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr) 5536 { 5537 char buf1[INET6_ADDRSTRLEN]; 5538 char buf2[INET6_ADDRSTRLEN]; 5539 char buf3[INET6_ADDRSTRLEN]; 5540 char buf4[INET6_ADDRSTRLEN]; 5541 char buf5[INET6_ADDRSTRLEN]; 5542 char buf6[INET6_ADDRSTRLEN]; 5543 char buf[LIFNAMSIZ]; 5544 ill_t *ill; 5545 ipif_t *ipif; 5546 nv_t *nvp; 5547 uint64_t flags; 5548 zoneid_t zoneid; 5549 ill_walk_context_t ctx; 5550 ip_stack_t *ipst = CONNQ_TO_IPST(q); 5551 5552 (void) mi_mpprintf(mp, 5553 "IPIF metric mtu in/out/forward name zone flags...\n" 5554 "\tlocal address\n" 5555 "\tsrc address\n" 5556 "\tsubnet\n" 5557 "\tmask\n" 5558 "\tbroadcast\n" 5559 "\tp-p-dst"); 5560 5561 ASSERT(q->q_next == NULL); 5562 zoneid = Q_TO_CONN(q)->conn_zoneid; /* IP is a driver */ 5563 5564 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5565 ill = ILL_START_WALK_ALL(&ctx, ipst); 5566 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5567 for (ipif = ill->ill_ipif; ipif != NULL; 5568 ipif = ipif->ipif_next) { 5569 if (zoneid != GLOBAL_ZONEID && 5570 zoneid != ipif->ipif_zoneid && 5571 ipif->ipif_zoneid != ALL_ZONES) 5572 continue; 5573 (void) mi_mpprintf(mp, 5574 MI_COL_PTRFMT_STR 5575 "%04u %05u %u/%u/%u %s %d", 5576 (void *)ipif, 5577 ipif->ipif_metric, ipif->ipif_mtu, 5578 ipif->ipif_ib_pkt_count, 5579 ipif->ipif_ob_pkt_count, 5580 ipif->ipif_fo_pkt_count, 5581 ipif_get_name(ipif, buf, sizeof (buf)), 5582 ipif->ipif_zoneid); 5583 5584 flags = ipif->ipif_flags | ipif->ipif_ill->ill_flags | 5585 ipif->ipif_ill->ill_phyint->phyint_flags; 5586 5587 /* Tack on text strings for any flags. 
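 * These are the same flag names ifconfig reports, e.g. UP, BROADCAST,
 * MULTICAST.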
*/
5588 nvp = ipif_nv_tbl;
5589 for (; nvp < A_END(ipif_nv_tbl); nvp++) {
5590 if (nvp->nv_value & flags)
5591 (void) mi_mpprintf_nr(mp, " %s",
5592 nvp->nv_name);
5593 }
5594 (void) mi_mpprintf(mp,
5595 "\t%s\n\t%s\n\t%s\n\t%s\n\t%s\n\t%s",
5596 inet_ntop(AF_INET6,
5597 &ipif->ipif_v6lcl_addr, buf1, sizeof (buf1)),
5598 inet_ntop(AF_INET6,
5599 &ipif->ipif_v6src_addr, buf2, sizeof (buf2)),
5600 inet_ntop(AF_INET6,
5601 &ipif->ipif_v6subnet, buf3, sizeof (buf3)),
5602 inet_ntop(AF_INET6,
5603 &ipif->ipif_v6net_mask, buf4, sizeof (buf4)),
5604 inet_ntop(AF_INET6,
5605 &ipif->ipif_v6brd_addr, buf5, sizeof (buf5)),
5606 inet_ntop(AF_INET6,
5607 &ipif->ipif_v6pp_dst_addr, buf6, sizeof (buf6)));
5608 }
5609 }
5610 rw_exit(&ipst->ips_ill_g_lock);
5611 return (0);
5612 }
5613
5614 /*
5615 * ip_ll_subnet_defaults is called when we get the DL_INFO_ACK back from the
5616 * driver. We construct best guess defaults for lower level information that
5617 * we need. If an interface is brought up without injection of any overriding
5618 * information from outside, we have to be ready to go with these defaults.
5619 * When we get the first DL_INFO_ACK (from ip_open() sending a DL_INFO_REQ)
5620 * we primarily want the dl_provider_style.
5621 * The subsequent DL_INFO_ACK is received after doing a DL_ATTACH and DL_BIND
5622 * at which point we assume the other part of the information is valid.
5623 */
5624 void
5625 ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp)
5626 {
5627 uchar_t *brdcst_addr;
5628 uint_t brdcst_addr_length, phys_addr_length;
5629 t_scalar_t sap_length;
5630 dl_info_ack_t *dlia;
5631 ip_m_t *ipm;
5632 dl_qos_cl_sel1_t *sel1;
5633
5634 ASSERT(IAM_WRITER_ILL(ill));
5635
5636 /*
5637 * Until the ill is fully up, ILL_CHANGING will be set and
5638 * the ill is not globally visible. So no need for a lock.
5639 */
5640 dlia = (dl_info_ack_t *)mp->b_rptr;
5641 ill->ill_mactype = dlia->dl_mac_type;
5642
5643 ipm = ip_m_lookup(dlia->dl_mac_type);
5644 if (ipm == NULL) {
5645 ipm = ip_m_lookup(DL_OTHER);
5646 ASSERT(ipm != NULL);
5647 }
5648 ill->ill_media = ipm;
5649
5650 /*
5651 * When the new DLPI stuff is ready we'll pull lengths
5652 * from dlia.
5653 */
5654 if (dlia->dl_version == DL_VERSION_2) {
5655 brdcst_addr_length = dlia->dl_brdcst_addr_length;
5656 brdcst_addr = mi_offset_param(mp, dlia->dl_brdcst_addr_offset,
5657 brdcst_addr_length);
5658 if (brdcst_addr == NULL) {
5659 brdcst_addr_length = 0;
5660 }
5661 sap_length = dlia->dl_sap_length;
5662 phys_addr_length = dlia->dl_addr_length - ABS(sap_length);
5663 ip1dbg(("ip: bcast_len %d, sap_len %d, phys_len %d\n",
5664 brdcst_addr_length, sap_length, phys_addr_length));
5665 } else {
5666 brdcst_addr_length = 6;
5667 brdcst_addr = ip_six_byte_all_ones;
5668 sap_length = -2;
5669 phys_addr_length = brdcst_addr_length;
5670 }
5671
5672 ill->ill_bcast_addr_length = brdcst_addr_length;
5673 ill->ill_phys_addr_length = phys_addr_length;
5674 ill->ill_sap_length = sap_length;
5675 ill->ill_max_frag = dlia->dl_max_sdu;
5676 ill->ill_max_mtu = ill->ill_max_frag;
5677
5678 ill->ill_type = ipm->ip_m_type;
5679
5680 if (!ill->ill_dlpi_style_set) {
5681 if (dlia->dl_provider_style == DL_STYLE2)
5682 ill->ill_needs_attach = 1;
5683
5684 /*
5685 * Allocate the first ipif on this ill. We don't delay it
5686 * further as ioctl handling assumes at least one ipif to
5687 * be present.
5688 *
5689 * At this point we don't know whether the ill is v4 or v6.
5690 * We will know this when the SIOCSLIFNAME happens and
5691 * the correct value for ill_isv6 will be assigned in
5692 * ipif_set_values(). We need to hold the ill lock and
5693 * clear the ILL_LL_SUBNET_PENDING flag and atomically do
5694 * the wakeup.
5695 */
5696 (void) ipif_allocate(ill, 0, IRE_LOCAL,
5697 dlia->dl_provider_style == DL_STYLE2 ? B_FALSE : B_TRUE);
5698 mutex_enter(&ill->ill_lock);
5699 ASSERT(ill->ill_dlpi_style_set == 0);
5700 ill->ill_dlpi_style_set = 1;
5701 ill->ill_state_flags &= ~ILL_LL_SUBNET_PENDING;
5702 cv_broadcast(&ill->ill_cv);
5703 mutex_exit(&ill->ill_lock);
5704 freemsg(mp);
5705 return;
5706 }
5707 ASSERT(ill->ill_ipif != NULL);
5708 /*
5709 * We know whether it is IPv4 or IPv6 now, as this is the
5710 * second DL_INFO_ACK we are receiving in response to the
5711 * DL_INFO_REQ sent in ipif_set_values.
5712 */
5713 if (ill->ill_isv6)
5714 ill->ill_sap = IP6_DL_SAP;
5715 else
5716 ill->ill_sap = IP_DL_SAP;
5717 /*
5718 * Set ipif_mtu which is used to set the IRE's
5719 * ire_max_frag value. The driver could have sent
5720 * a different mtu from what it sent last time. No
5721 * need to call ipif_mtu_change because IREs have
5722 * not yet been created.
5723 */
5724 ill->ill_ipif->ipif_mtu = ill->ill_max_mtu;
5725 /*
5726 * Clear all the flags that were set based on ill_bcast_addr_length
5727 * and ill_phys_addr_length (in ipif_set_values) as these could have
5728 * changed now and we need to re-evaluate.
5729 */
5730 ill->ill_flags &= ~(ILLF_MULTICAST | ILLF_NONUD | ILLF_NOARP);
5731 ill->ill_ipif->ipif_flags &= ~(IPIF_BROADCAST | IPIF_POINTOPOINT);
5732
5733 /*
5734 * Free ill_resolver_mp and ill_bcast_mp as things could have
5735 * changed now.
5736 */
5737 if (ill->ill_bcast_addr_length == 0) {
5738 if (ill->ill_resolver_mp != NULL)
5739 freemsg(ill->ill_resolver_mp);
5740 if (ill->ill_bcast_mp != NULL)
5741 freemsg(ill->ill_bcast_mp);
5742 if (ill->ill_flags & ILLF_XRESOLV)
5743 ill->ill_net_type = IRE_IF_RESOLVER;
5744 else
5745 ill->ill_net_type = IRE_IF_NORESOLVER;
5746 ill->ill_resolver_mp = ill_dlur_gen(NULL,
5747 ill->ill_phys_addr_length,
5748 ill->ill_sap,
5749 ill->ill_sap_length);
5750 ill->ill_bcast_mp = copymsg(ill->ill_resolver_mp);
5751
5752 if (ill->ill_isv6)
5753 /*
5754 * Note: xresolv interfaces will eventually need NOARP
5755 * set here as well, but that will require those
5756 * external resolvers to have some knowledge of
5757 * that flag and act appropriately. Not to be changed
5758 * at present.
5759 */
5760 ill->ill_flags |= ILLF_NONUD;
5761 else
5762 ill->ill_flags |= ILLF_NOARP;
5763
5764 if (ill->ill_phys_addr_length == 0) {
5765 if (ill->ill_media->ip_m_mac_type == SUNW_DL_VNI) {
5766 ill->ill_ipif->ipif_flags |= IPIF_NOXMIT;
5767 ill->ill_phyint->phyint_flags |= PHYI_VIRTUAL;
5768 } else {
5769 /* pt-pt supports multicast. */
5770 ill->ill_flags |= ILLF_MULTICAST;
5771 ill->ill_ipif->ipif_flags |= IPIF_POINTOPOINT;
5772 }
5773 }
5774 } else {
5775 ill->ill_net_type = IRE_IF_RESOLVER;
5776 if (ill->ill_bcast_mp != NULL)
5777 freemsg(ill->ill_bcast_mp);
5778 ill->ill_bcast_mp = ill_dlur_gen(brdcst_addr,
5779 ill->ill_bcast_addr_length, ill->ill_sap,
5780 ill->ill_sap_length);
5781 /*
5782 * Later detect lack of DLPI driver multicast
5783 * capability by catching DL_ENABMULTI errors in
5784 * ip_rput_dlpi.
5785 */ 5786 ill->ill_flags |= ILLF_MULTICAST; 5787 if (!ill->ill_isv6) 5788 ill->ill_ipif->ipif_flags |= IPIF_BROADCAST; 5789 } 5790 /* By default an interface does not support any CoS marking */ 5791 ill->ill_flags &= ~ILLF_COS_ENABLED; 5792 5793 /* 5794 * If we get QoS information in DL_INFO_ACK, the device supports 5795 * some form of CoS marking, set ILLF_COS_ENABLED. 5796 */ 5797 sel1 = (dl_qos_cl_sel1_t *)mi_offset_param(mp, dlia->dl_qos_offset, 5798 dlia->dl_qos_length); 5799 if ((sel1 != NULL) && (sel1->dl_qos_type == DL_QOS_CL_SEL1)) { 5800 ill->ill_flags |= ILLF_COS_ENABLED; 5801 } 5802 5803 /* Clear any previous error indication. */ 5804 ill->ill_error = 0; 5805 freemsg(mp); 5806 } 5807 5808 /* 5809 * Perform various checks to verify that an address would make sense as a 5810 * local, remote, or subnet interface address. 5811 */ 5812 static boolean_t 5813 ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask) 5814 { 5815 ipaddr_t net_mask; 5816 5817 /* 5818 * Don't allow all zeroes, all ones or experimental address, but allow 5819 * all ones netmask. 5820 */ 5821 if ((net_mask = ip_net_mask(addr)) == 0) 5822 return (B_FALSE); 5823 /* A given netmask overrides the "guess" netmask */ 5824 if (subnet_mask != 0) 5825 net_mask = subnet_mask; 5826 if ((net_mask != ~(ipaddr_t)0) && ((addr == (addr & net_mask)) || 5827 (addr == (addr | ~net_mask)))) { 5828 return (B_FALSE); 5829 } 5830 if (CLASSD(addr)) 5831 return (B_FALSE); 5832 5833 return (B_TRUE); 5834 } 5835 5836 #define V6_IPIF_LINKLOCAL(p) \ 5837 IN6_IS_ADDR_LINKLOCAL(&(p)->ipif_v6lcl_addr) 5838 5839 /* 5840 * Compare two given ipifs and check if the second one is better than 5841 * the first one using the order of preference (not taking deprecated 5842 * into acount) specified in ipif_lookup_multicast(). 5843 */ 5844 static boolean_t 5845 ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, boolean_t isv6) 5846 { 5847 /* Check the least preferred first. */ 5848 if (IS_LOOPBACK(old_ipif->ipif_ill)) { 5849 /* If both ipifs are the same, use the first one. */ 5850 if (IS_LOOPBACK(new_ipif->ipif_ill)) 5851 return (B_FALSE); 5852 else 5853 return (B_TRUE); 5854 } 5855 5856 /* For IPv6, check for link local address. */ 5857 if (isv6 && V6_IPIF_LINKLOCAL(old_ipif)) { 5858 if (IS_LOOPBACK(new_ipif->ipif_ill) || 5859 V6_IPIF_LINKLOCAL(new_ipif)) { 5860 /* The second one is equal or less preferred. */ 5861 return (B_FALSE); 5862 } else { 5863 return (B_TRUE); 5864 } 5865 } 5866 5867 /* Then check for point to point interface. */ 5868 if (old_ipif->ipif_flags & IPIF_POINTOPOINT) { 5869 if (IS_LOOPBACK(new_ipif->ipif_ill) || 5870 (isv6 && V6_IPIF_LINKLOCAL(new_ipif)) || 5871 (new_ipif->ipif_flags & IPIF_POINTOPOINT)) { 5872 return (B_FALSE); 5873 } else { 5874 return (B_TRUE); 5875 } 5876 } 5877 5878 /* old_ipif is a normal interface, so no need to use the new one. */ 5879 return (B_FALSE); 5880 } 5881 5882 /* 5883 * Find any non-virtual, not condemned, and up multicast capable interface 5884 * given an IP instance and zoneid. Order of preference is: 5885 * 5886 * 1. normal 5887 * 1.1 normal, but deprecated 5888 * 2. point to point 5889 * 2.1 point to point, but deprecated 5890 * 3. link local 5891 * 3.1 link local, but deprecated 5892 * 4. loopback. 
5893 */ 5894 ipif_t * 5895 ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) 5896 { 5897 ill_t *ill; 5898 ill_walk_context_t ctx; 5899 ipif_t *ipif; 5900 ipif_t *saved_ipif = NULL; 5901 ipif_t *dep_ipif = NULL; 5902 5903 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5904 if (isv6) 5905 ill = ILL_START_WALK_V6(&ctx, ipst); 5906 else 5907 ill = ILL_START_WALK_V4(&ctx, ipst); 5908 5909 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5910 mutex_enter(&ill->ill_lock); 5911 if (IS_VNI(ill) || !ILL_CAN_LOOKUP(ill) || 5912 !(ill->ill_flags & ILLF_MULTICAST)) { 5913 mutex_exit(&ill->ill_lock); 5914 continue; 5915 } 5916 for (ipif = ill->ill_ipif; ipif != NULL; 5917 ipif = ipif->ipif_next) { 5918 if (zoneid != ipif->ipif_zoneid && 5919 zoneid != ALL_ZONES && 5920 ipif->ipif_zoneid != ALL_ZONES) { 5921 continue; 5922 } 5923 if (!(ipif->ipif_flags & IPIF_UP) || 5924 !IPIF_CAN_LOOKUP(ipif)) { 5925 continue; 5926 } 5927 5928 /* 5929 * Found one candidate. If it is deprecated, 5930 * remember it in dep_ipif. If it is not deprecated, 5931 * remember it in saved_ipif. 5932 */ 5933 if (ipif->ipif_flags & IPIF_DEPRECATED) { 5934 if (dep_ipif == NULL) { 5935 dep_ipif = ipif; 5936 } else if (ipif_comp_multi(dep_ipif, ipif, 5937 isv6)) { 5938 /* 5939 * If the previous dep_ipif does not 5940 * belong to the same ill, we've done 5941 * a ipif_refhold() on it. So we need 5942 * to release it. 5943 */ 5944 if (dep_ipif->ipif_ill != ill) 5945 ipif_refrele(dep_ipif); 5946 dep_ipif = ipif; 5947 } 5948 continue; 5949 } 5950 if (saved_ipif == NULL) { 5951 saved_ipif = ipif; 5952 } else { 5953 if (ipif_comp_multi(saved_ipif, ipif, isv6)) { 5954 if (saved_ipif->ipif_ill != ill) 5955 ipif_refrele(saved_ipif); 5956 saved_ipif = ipif; 5957 } 5958 } 5959 } 5960 /* 5961 * Before going to the next ill, do a ipif_refhold() on the 5962 * saved ones. 5963 */ 5964 if (saved_ipif != NULL && saved_ipif->ipif_ill == ill) 5965 ipif_refhold_locked(saved_ipif); 5966 if (dep_ipif != NULL && dep_ipif->ipif_ill == ill) 5967 ipif_refhold_locked(dep_ipif); 5968 mutex_exit(&ill->ill_lock); 5969 } 5970 rw_exit(&ipst->ips_ill_g_lock); 5971 5972 /* 5973 * If we have only the saved_ipif, return it. But if we have both 5974 * saved_ipif and dep_ipif, check to see which one is better. 5975 */ 5976 if (saved_ipif != NULL) { 5977 if (dep_ipif != NULL) { 5978 if (ipif_comp_multi(saved_ipif, dep_ipif, isv6)) { 5979 ipif_refrele(saved_ipif); 5980 return (dep_ipif); 5981 } else { 5982 ipif_refrele(dep_ipif); 5983 return (saved_ipif); 5984 } 5985 } 5986 return (saved_ipif); 5987 } else { 5988 return (dep_ipif); 5989 } 5990 } 5991 5992 /* 5993 * This function is called when an application does not specify an interface 5994 * to be used for multicast traffic (joining a group/sending data). It 5995 * calls ire_lookup_multi() to look for an interface route for the 5996 * specified multicast group. Doing this allows the administrator to add 5997 * prefix routes for multicast to indicate which interface to be used for 5998 * multicast traffic in the above scenario. The route could be for all 5999 * multicast (224.0/4), for a single multicast group (a /32 route) or 6000 * anything in between. If there is no such multicast route, we just find 6001 * any multicast capable interface and return it. The returned ipif 6002 * is refhold'ed. 
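 *
 * For illustration (a hedged sketch; the addresses are examples), an
 * administrator might steer all multicast out one interface with
 * something like:
 *
 *	# route add 224.0.0.0 -netmask 240.0.0.0 -interface <ifaddr>
 *
 * after which the ire_lookup_multi() call below would find that route.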
6003 */
6004 ipif_t *
6005 ipif_lookup_group(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst)
6006 {
6007 ire_t *ire;
6008 ipif_t *ipif;
6009
6010 ire = ire_lookup_multi(group, zoneid, ipst);
6011 if (ire != NULL) {
6012 ipif = ire->ire_ipif;
6013 ipif_refhold(ipif);
6014 ire_refrele(ire);
6015 return (ipif);
6016 }
6017
6018 return (ipif_lookup_multicast(ipst, zoneid, B_FALSE));
6019 }
6020
6021 /*
6022 * Look for an ipif with the specified interface address and destination.
6023 * The destination address is used only for matching point-to-point interfaces.
6024 */
6025 ipif_t *
6026 ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp,
6027 ipsq_func_t func, int *error, ip_stack_t *ipst)
6028 {
6029 ipif_t *ipif;
6030 ill_t *ill;
6031 ill_walk_context_t ctx;
6032 ipsq_t *ipsq;
6033
6034 if (error != NULL)
6035 *error = 0;
6036
6037 /*
6038 * First match all the point-to-point interfaces
6039 * before looking at non-point-to-point interfaces.
6040 * This is done to avoid returning non-point-to-point
6041 * ipif instead of unnumbered point-to-point ipif.
6042 */
6043 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
6044 ill = ILL_START_WALK_V4(&ctx, ipst);
6045 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
6046 GRAB_CONN_LOCK(q);
6047 mutex_enter(&ill->ill_lock);
6048 for (ipif = ill->ill_ipif; ipif != NULL;
6049 ipif = ipif->ipif_next) {
6050 /* Allow the ipif to be down */
6051 if ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
6052 (ipif->ipif_lcl_addr == if_addr) &&
6053 (ipif->ipif_pp_dst_addr == dst)) {
6054 /*
6055 * The block comment at the start of ipif_down
6056 * explains the use of the macros used below.
6057 */
6058 if (IPIF_CAN_LOOKUP(ipif)) {
6059 ipif_refhold_locked(ipif);
6060 mutex_exit(&ill->ill_lock);
6061 RELEASE_CONN_LOCK(q);
6062 rw_exit(&ipst->ips_ill_g_lock);
6063 return (ipif);
6064 } else if (IPIF_CAN_WAIT(ipif, q)) {
6065 ipsq = ill->ill_phyint->phyint_ipsq;
6066 mutex_enter(&ipsq->ipsq_lock);
6067 mutex_exit(&ill->ill_lock);
6068 rw_exit(&ipst->ips_ill_g_lock);
6069 ipsq_enq(ipsq, q, mp, func, NEW_OP,
6070 ill);
6071 mutex_exit(&ipsq->ipsq_lock);
6072 RELEASE_CONN_LOCK(q);
6073 *error = EINPROGRESS;
6074 return (NULL);
6075 }
6076 }
6077 }
6078 mutex_exit(&ill->ill_lock);
6079 RELEASE_CONN_LOCK(q);
6080 }
6081 rw_exit(&ipst->ips_ill_g_lock);
6082
6083 /* lookup the ipif based on interface address */
6084 ipif = ipif_lookup_addr(if_addr, NULL, ALL_ZONES, q, mp, func, error,
6085 ipst);
6086 ASSERT(ipif == NULL || !ipif->ipif_isv6);
6087 return (ipif);
6088 }
6089
6090 /*
6091 * Look for an ipif with the specified address. For point-to-point links
6092 * we look for matches on either the destination address or the local
6093 * address, but we ignore the check on the local address if IPIF_UNNUMBERED
6094 * is set.
6095 * Matches on a specific ill if match_ill is set.
6096 */
6097 ipif_t *
6098 ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q,
6099 mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst)
6100 {
6101 ipif_t *ipif;
6102 ill_t *ill;
6103 boolean_t ptp = B_FALSE;
6104 ipsq_t *ipsq;
6105 ill_walk_context_t ctx;
6106
6107 if (error != NULL)
6108 *error = 0;
6109
6110 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
6111 /*
6112 * Repeat twice, first based on local addresses and
6113 * the second time for point-to-point.
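 *
 * For example (illustrative addresses): given a point-to-point ipif
 * with ipif_lcl_addr 192.0.2.1 and ipif_pp_dst_addr 192.0.2.2, a
 * lookup of 192.0.2.1 matches on the first (local address) pass,
 * while a lookup of 192.0.2.2 matches only on the second (ptp) pass.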
6114 */
6115 repeat:
6116 ill = ILL_START_WALK_V4(&ctx, ipst);
6117 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
6118 if (match_ill != NULL && ill != match_ill) {
6119 continue;
6120 }
6121 GRAB_CONN_LOCK(q);
6122 mutex_enter(&ill->ill_lock);
6123 for (ipif = ill->ill_ipif; ipif != NULL;
6124 ipif = ipif->ipif_next) {
6125 if (zoneid != ALL_ZONES &&
6126 zoneid != ipif->ipif_zoneid &&
6127 ipif->ipif_zoneid != ALL_ZONES)
6128 continue;
6129 /* Allow the ipif to be down */
6130 if ((!ptp && (ipif->ipif_lcl_addr == addr) &&
6131 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) ||
6132 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) &&
6133 (ipif->ipif_pp_dst_addr == addr))) {
6134 /*
6135 * The block comment at the start of ipif_down
6136 * explains the use of the macros used below.
6137 */
6138 if (IPIF_CAN_LOOKUP(ipif)) {
6139 ipif_refhold_locked(ipif);
6140 mutex_exit(&ill->ill_lock);
6141 RELEASE_CONN_LOCK(q);
6142 rw_exit(&ipst->ips_ill_g_lock);
6143 return (ipif);
6144 } else if (IPIF_CAN_WAIT(ipif, q)) {
6145 ipsq = ill->ill_phyint->phyint_ipsq;
6146 mutex_enter(&ipsq->ipsq_lock);
6147 mutex_exit(&ill->ill_lock);
6148 rw_exit(&ipst->ips_ill_g_lock);
6149 ipsq_enq(ipsq, q, mp, func, NEW_OP,
6150 ill);
6151 mutex_exit(&ipsq->ipsq_lock);
6152 RELEASE_CONN_LOCK(q);
6153 *error = EINPROGRESS;
6154 return (NULL);
6155 }
6156 }
6157 }
6158 mutex_exit(&ill->ill_lock);
6159 RELEASE_CONN_LOCK(q);
6160 }
6161
6162 /* If we already did the ptp case, then we are done */
6163 if (ptp) {
6164 rw_exit(&ipst->ips_ill_g_lock);
6165 if (error != NULL)
6166 *error = ENXIO;
6167 return (NULL);
6168 }
6169 ptp = B_TRUE;
6170 goto repeat;
6171 }
6172
6173 /*
6174 * Look for an ipif with the specified address. For point-to-point links
6175 * we look for matches on either the destination address or the local
6176 * address, but we ignore the check on the local address if IPIF_UNNUMBERED
6177 * is set.
6178 * Matches on a specific ill if match_ill is set.
6179 * Return the zoneid for the ipif which matches. ALL_ZONES if no match.
6180 */
6181 zoneid_t
6182 ipif_lookup_addr_zoneid(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst)
6183 {
6184 zoneid_t zoneid;
6185 ipif_t *ipif;
6186 ill_t *ill;
6187 boolean_t ptp = B_FALSE;
6188 ill_walk_context_t ctx;
6189
6190 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
6191 /*
6192 * Repeat twice, first based on local addresses and
6193 * the second time for point-to-point.
6194 */
6195 repeat:
6196 ill = ILL_START_WALK_V4(&ctx, ipst);
6197 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
6198 if (match_ill != NULL && ill != match_ill) {
6199 continue;
6200 }
6201 mutex_enter(&ill->ill_lock);
6202 for (ipif = ill->ill_ipif; ipif != NULL;
6203 ipif = ipif->ipif_next) {
6204 /* Allow the ipif to be down */
6205 if (((!ptp && (ipif->ipif_lcl_addr == addr) &&
6206 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) ||
6207 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) &&
6208 (ipif->ipif_pp_dst_addr == addr))) &&
6209 !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
6210 zoneid = ipif->ipif_zoneid;
6211 mutex_exit(&ill->ill_lock);
6212 rw_exit(&ipst->ips_ill_g_lock);
6213 /*
6214 * If ipif_zoneid was ALL_ZONES then we have
6215 * a trusted extensions shared IP address.
6216 * In that case GLOBAL_ZONEID works to send.
6217 */
6218 if (zoneid == ALL_ZONES)
6219 zoneid = GLOBAL_ZONEID;
6220 return (zoneid);
6221 }
6222 }
6223 mutex_exit(&ill->ill_lock);
6224 }
6225
6226 /* If we already did the ptp case, then we are done */
6227 if (ptp) {
6228 rw_exit(&ipst->ips_ill_g_lock);
6229 return (ALL_ZONES);
6230 }
6231 ptp = B_TRUE;
6232 goto repeat;
6233 }
6234
6235 /*
6236 * Look for an ipif that matches the specified remote address, i.e. the
6237 * ipif that would receive the specified packet.
6238 * First look for directly connected interfaces and then do a recursive
6239 * IRE lookup and pick the first ipif corresponding to the source address in the
6240 * ire.
6241 * Returns: held ipif
6242 */
6243 ipif_t *
6244 ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid)
6245 {
6246 ipif_t *ipif;
6247 ire_t *ire;
6248 ip_stack_t *ipst = ill->ill_ipst;
6249
6250 ASSERT(!ill->ill_isv6);
6251
6252 /*
6253 * Someone could be changing this ipif currently or change it
6254 * after we return this. Thus a few packets could use the old
6255 * values. However, structure updates/creates (ire, ilg, ilm etc)
6256 * will atomically be updated or cleaned up with the new value.
6257 * Thus we don't need a lock to check the flags or other attrs below.
6258 */
6259 mutex_enter(&ill->ill_lock);
6260 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
6261 if (!IPIF_CAN_LOOKUP(ipif))
6262 continue;
6263 if (zoneid != ALL_ZONES && zoneid != ipif->ipif_zoneid &&
6264 ipif->ipif_zoneid != ALL_ZONES)
6265 continue;
6266 /* Allow the ipif to be down */
6267 if (ipif->ipif_flags & IPIF_POINTOPOINT) {
6268 if ((ipif->ipif_pp_dst_addr == addr) ||
6269 (!(ipif->ipif_flags & IPIF_UNNUMBERED) &&
6270 ipif->ipif_lcl_addr == addr)) {
6271 ipif_refhold_locked(ipif);
6272 mutex_exit(&ill->ill_lock);
6273 return (ipif);
6274 }
6275 } else if (ipif->ipif_subnet == (addr & ipif->ipif_net_mask)) {
6276 ipif_refhold_locked(ipif);
6277 mutex_exit(&ill->ill_lock);
6278 return (ipif);
6279 }
6280 }
6281 mutex_exit(&ill->ill_lock);
6282 ire = ire_route_lookup(addr, 0, 0, 0, NULL, NULL, zoneid,
6283 NULL, MATCH_IRE_RECURSIVE, ipst);
6284 if (ire != NULL) {
6285 /*
6286 * The callers of this function want to know the
6287 * interface on which they have to send the replies
6288 * back. For IRE_CACHES that have ire_stq and ire_ipif
6289 * derived from different ills, we really don't care
6290 * what we return here.
6291 */
6292 ipif = ire->ire_ipif;
6293 if (ipif != NULL) {
6294 ipif_refhold(ipif);
6295 ire_refrele(ire);
6296 return (ipif);
6297 }
6298 ire_refrele(ire);
6299 }
6300 /* Pick the first interface */
6301 ipif = ipif_get_next_ipif(NULL, ill);
6302 return (ipif);
6303 }
6304
6305 /*
6306 * This func does not prevent refcnt from increasing. But if
6307 * the caller has taken steps to that effect, then this func
6308 * can be used to determine whether the ill has become quiescent
6309 */
6310 boolean_t
6311 ill_is_quiescent(ill_t *ill)
6312 {
6313 ipif_t *ipif;
6314
6315 ASSERT(MUTEX_HELD(&ill->ill_lock));
6316
6317 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
6318 if (ipif->ipif_refcnt != 0 || ipif->ipif_ire_cnt != 0) {
6319 return (B_FALSE);
6320 }
6321 }
6322 if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0 ||
6323 ill->ill_nce_cnt != 0 || ill->ill_srcif_refcnt != 0 ||
6324 ill->ill_mrtun_refcnt != 0) {
6325 return (B_FALSE);
6326 }
6327 return (B_TRUE);
6328 }
6329
6330 /*
6331 * This func does not prevent refcnt from increasing.
But if 6332 * the caller has taken steps to that effect, then this func 6333 * can be used to determine whether the ipif has become quiescent 6334 */ 6335 static boolean_t 6336 ipif_is_quiescent(ipif_t *ipif) 6337 { 6338 ill_t *ill; 6339 6340 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6341 6342 if (ipif->ipif_refcnt != 0 || ipif->ipif_ire_cnt != 0) { 6343 return (B_FALSE); 6344 } 6345 6346 ill = ipif->ipif_ill; 6347 if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 || 6348 ill->ill_logical_down) { 6349 return (B_TRUE); 6350 } 6351 6352 /* This is the last ipif going down or being deleted on this ill */ 6353 if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0) { 6354 return (B_FALSE); 6355 } 6356 6357 return (B_TRUE); 6358 } 6359 6360 /* 6361 * This func does not prevent refcnt from increasing. But if 6362 * the caller has taken steps to that effect, then this func 6363 * can be used to determine whether the ipifs marked with IPIF_MOVING 6364 * have become quiescent and can be moved in a failover/failback. 6365 */ 6366 static ipif_t * 6367 ill_quiescent_to_move(ill_t *ill) 6368 { 6369 ipif_t *ipif; 6370 6371 ASSERT(MUTEX_HELD(&ill->ill_lock)); 6372 6373 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 6374 if (ipif->ipif_state_flags & IPIF_MOVING) { 6375 if (ipif->ipif_refcnt != 0 || ipif->ipif_ire_cnt != 0) { 6376 return (ipif); 6377 } 6378 } 6379 } 6380 return (NULL); 6381 } 6382 6383 /* 6384 * The ipif/ill/ire has been refreled. Do the tail processing. 6385 * Determine if the ipif or ill in question has become quiescent and if so 6386 * wakeup close and/or restart any queued pending ioctl that is waiting 6387 * for the ipif_down (or ill_down) 6388 */ 6389 void 6390 ipif_ill_refrele_tail(ill_t *ill) 6391 { 6392 mblk_t *mp; 6393 conn_t *connp; 6394 ipsq_t *ipsq; 6395 ipif_t *ipif; 6396 dl_notify_ind_t *dlindp; 6397 6398 ASSERT(MUTEX_HELD(&ill->ill_lock)); 6399 6400 if ((ill->ill_state_flags & ILL_CONDEMNED) && 6401 ill_is_quiescent(ill)) { 6402 /* ill_close may be waiting */ 6403 cv_broadcast(&ill->ill_cv); 6404 } 6405 6406 /* ipsq can't change because ill_lock is held */ 6407 ipsq = ill->ill_phyint->phyint_ipsq; 6408 if (ipsq->ipsq_waitfor == 0) { 6409 /* Not waiting for anything, just return. */ 6410 mutex_exit(&ill->ill_lock); 6411 return; 6412 } 6413 ASSERT(ipsq->ipsq_pending_mp != NULL && 6414 ipsq->ipsq_pending_ipif != NULL); 6415 /* 6416 * ipif->ipif_refcnt must go down to zero for restarting REMOVEIF. 6417 * Last ipif going down needs to down the ill, so ill_ire_cnt must 6418 * be zero for restarting an ioctl that ends up downing the ill. 6419 */ 6420 ipif = ipsq->ipsq_pending_ipif; 6421 if (ipif->ipif_ill != ill) { 6422 /* The ioctl is pending on some other ill. */ 6423 mutex_exit(&ill->ill_lock); 6424 return; 6425 } 6426 6427 switch (ipsq->ipsq_waitfor) { 6428 case IPIF_DOWN: 6429 case IPIF_FREE: 6430 if (!ipif_is_quiescent(ipif)) { 6431 mutex_exit(&ill->ill_lock); 6432 return; 6433 } 6434 break; 6435 6436 case ILL_DOWN: 6437 case ILL_FREE: 6438 /* 6439 * case ILL_FREE arises only for loopback. 
Otherwise ill_delete
6440 * waits synchronously in ip_close, and no message is queued in
6441 * ipsq_pending_mp at all in this case.
6442 */
6443 if (!ill_is_quiescent(ill)) {
6444 mutex_exit(&ill->ill_lock);
6445 return;
6446 }
6447
6448 break;
6449
6450 case ILL_MOVE_OK:
6451 if (ill_quiescent_to_move(ill) != NULL) {
6452 mutex_exit(&ill->ill_lock);
6453 return;
6454 }
6455
6456 break;
6457 default:
6458 cmn_err(CE_PANIC, "ipsq: %p unknown ipsq_waitfor %d\n",
6459 (void *)ipsq, ipsq->ipsq_waitfor);
6460 }
6461
6462 /*
6463 * Incr refcnt for the qwriter_ip call below which
6464 * does a refrele.
6465 */
6466 ill_refhold_locked(ill);
6467 mutex_exit(&ill->ill_lock);
6468
6469 mp = ipsq_pending_mp_get(ipsq, &connp);
6470 ASSERT(mp != NULL);
6471
6472 /*
6473 * NOTE: all of the qwriter_ip() calls below use CUR_OP since
6474 * we can only get here when the current operation decides it
6475 * needs to quiesce via ipsq_pending_mp_add().
6476 */
6477 switch (mp->b_datap->db_type) {
6478 case M_PCPROTO:
6479 case M_PROTO:
6480 /*
6481 * For now, only DL_NOTIFY_IND messages can use this facility.
6482 */
6483 dlindp = (dl_notify_ind_t *)mp->b_rptr;
6484 ASSERT(dlindp->dl_primitive == DL_NOTIFY_IND);
6485
6486 switch (dlindp->dl_notification) {
6487 case DL_NOTE_PHYS_ADDR:
6488 qwriter_ip(ill, ill->ill_rq, mp,
6489 ill_set_phys_addr_tail, CUR_OP, B_TRUE);
6490 return;
6491 default:
6492 ASSERT(0);
6493 }
6494 break;
6495
6496 case M_ERROR:
6497 case M_HANGUP:
6498 qwriter_ip(ill, ill->ill_rq, mp, ipif_all_down_tail, CUR_OP,
6499 B_TRUE);
6500 return;
6501
6502 case M_IOCTL:
6503 case M_IOCDATA:
6504 qwriter_ip(ill, (connp != NULL ? CONNP_TO_WQ(connp) :
6505 ill->ill_wq), mp, ip_reprocess_ioctl, CUR_OP, B_TRUE);
6506 return;
6507
6508 default:
6509 cmn_err(CE_PANIC, "ipif_ill_refrele_tail mp %p "
6510 "db_type %d\n", (void *)mp, mp->b_datap->db_type);
6511 }
6512 }
6513
6514 #ifdef ILL_DEBUG
6515 /* Reuse trace buffer from beginning (if reached the end) and record trace */
6516 void
6517 th_trace_rrecord(th_trace_t *th_trace)
6518 {
6519 tr_buf_t *tr_buf;
6520 uint_t lastref;
6521
6522 lastref = th_trace->th_trace_lastref;
6523 lastref++;
6524 if (lastref == TR_BUF_MAX)
6525 lastref = 0;
6526 th_trace->th_trace_lastref = lastref;
6527 tr_buf = &th_trace->th_trbuf[lastref];
6528 tr_buf->tr_depth = getpcstack(tr_buf->tr_stack, IP_STACK_DEPTH);
6529 }
6530
6531 th_trace_t *
6532 th_trace_ipif_lookup(ipif_t *ipif)
6533 {
6534 int bucket_id;
6535 th_trace_t *th_trace;
6536
6537 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
6538
6539 bucket_id = IP_TR_HASH(curthread);
6540 ASSERT(bucket_id < IP_TR_HASH_MAX);
6541
6542 for (th_trace = ipif->ipif_trace[bucket_id]; th_trace != NULL;
6543 th_trace = th_trace->th_next) {
6544 if (th_trace->th_id == curthread)
6545 return (th_trace);
6546 }
6547 return (NULL);
6548 }
6549
6550 void
6551 ipif_trace_ref(ipif_t *ipif)
6552 {
6553 int bucket_id;
6554 th_trace_t *th_trace;
6555
6556 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
6557
6558 if (ipif->ipif_trace_disable)
6559 return;
6560
6561 /*
6562 * Attempt to locate the trace buffer for the curthread.
6563 * If it does not exist, then allocate a new trace buffer 6564 * and link it in list of trace bufs for this ipif, at the head 6565 */ 6566 th_trace = th_trace_ipif_lookup(ipif); 6567 if (th_trace == NULL) { 6568 bucket_id = IP_TR_HASH(curthread); 6569 th_trace = (th_trace_t *)kmem_zalloc(sizeof (th_trace_t), 6570 KM_NOSLEEP); 6571 if (th_trace == NULL) { 6572 ipif->ipif_trace_disable = B_TRUE; 6573 ipif_trace_cleanup(ipif); 6574 return; 6575 } 6576 th_trace->th_id = curthread; 6577 th_trace->th_next = ipif->ipif_trace[bucket_id]; 6578 th_trace->th_prev = &ipif->ipif_trace[bucket_id]; 6579 if (th_trace->th_next != NULL) 6580 th_trace->th_next->th_prev = &th_trace->th_next; 6581 ipif->ipif_trace[bucket_id] = th_trace; 6582 } 6583 ASSERT(th_trace->th_refcnt >= 0 && 6584 th_trace->th_refcnt < TR_BUF_MAX -1); 6585 th_trace->th_refcnt++; 6586 th_trace_rrecord(th_trace); 6587 } 6588 6589 void 6590 ipif_untrace_ref(ipif_t *ipif) 6591 { 6592 th_trace_t *th_trace; 6593 6594 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6595 6596 if (ipif->ipif_trace_disable) 6597 return; 6598 th_trace = th_trace_ipif_lookup(ipif); 6599 ASSERT(th_trace != NULL); 6600 ASSERT(th_trace->th_refcnt > 0); 6601 6602 th_trace->th_refcnt--; 6603 th_trace_rrecord(th_trace); 6604 } 6605 6606 th_trace_t * 6607 th_trace_ill_lookup(ill_t *ill) 6608 { 6609 th_trace_t *th_trace; 6610 int bucket_id; 6611 6612 ASSERT(MUTEX_HELD(&ill->ill_lock)); 6613 6614 bucket_id = IP_TR_HASH(curthread); 6615 ASSERT(bucket_id < IP_TR_HASH_MAX); 6616 6617 for (th_trace = ill->ill_trace[bucket_id]; th_trace != NULL; 6618 th_trace = th_trace->th_next) { 6619 if (th_trace->th_id == curthread) 6620 return (th_trace); 6621 } 6622 return (NULL); 6623 } 6624 6625 void 6626 ill_trace_ref(ill_t *ill) 6627 { 6628 int bucket_id; 6629 th_trace_t *th_trace; 6630 6631 ASSERT(MUTEX_HELD(&ill->ill_lock)); 6632 if (ill->ill_trace_disable) 6633 return; 6634 /* 6635 * Attempt to locate the trace buffer for the curthread. 
6636 * If it does not exist, then allocate a new trace buffer 6637 * and link it in list of trace bufs for this ill, at the head 6638 */ 6639 th_trace = th_trace_ill_lookup(ill); 6640 if (th_trace == NULL) { 6641 bucket_id = IP_TR_HASH(curthread); 6642 th_trace = (th_trace_t *)kmem_zalloc(sizeof (th_trace_t), 6643 KM_NOSLEEP); 6644 if (th_trace == NULL) { 6645 ill->ill_trace_disable = B_TRUE; 6646 ill_trace_cleanup(ill); 6647 return; 6648 } 6649 th_trace->th_id = curthread; 6650 th_trace->th_next = ill->ill_trace[bucket_id]; 6651 th_trace->th_prev = &ill->ill_trace[bucket_id]; 6652 if (th_trace->th_next != NULL) 6653 th_trace->th_next->th_prev = &th_trace->th_next; 6654 ill->ill_trace[bucket_id] = th_trace; 6655 } 6656 ASSERT(th_trace->th_refcnt >= 0 && 6657 th_trace->th_refcnt < TR_BUF_MAX - 1); 6658 6659 th_trace->th_refcnt++; 6660 th_trace_rrecord(th_trace); 6661 } 6662 6663 void 6664 ill_untrace_ref(ill_t *ill) 6665 { 6666 th_trace_t *th_trace; 6667 6668 ASSERT(MUTEX_HELD(&ill->ill_lock)); 6669 6670 if (ill->ill_trace_disable) 6671 return; 6672 th_trace = th_trace_ill_lookup(ill); 6673 ASSERT(th_trace != NULL); 6674 ASSERT(th_trace->th_refcnt > 0); 6675 6676 th_trace->th_refcnt--; 6677 th_trace_rrecord(th_trace); 6678 } 6679 6680 /* 6681 * Verify that this thread has no refs to the ipif and free 6682 * the trace buffers 6683 */ 6684 /* ARGSUSED */ 6685 void 6686 ipif_thread_exit(ipif_t *ipif, void *dummy) 6687 { 6688 th_trace_t *th_trace; 6689 6690 mutex_enter(&ipif->ipif_ill->ill_lock); 6691 6692 th_trace = th_trace_ipif_lookup(ipif); 6693 if (th_trace == NULL) { 6694 mutex_exit(&ipif->ipif_ill->ill_lock); 6695 return; 6696 } 6697 ASSERT(th_trace->th_refcnt == 0); 6698 /* unlink th_trace and free it */ 6699 *th_trace->th_prev = th_trace->th_next; 6700 if (th_trace->th_next != NULL) 6701 th_trace->th_next->th_prev = th_trace->th_prev; 6702 th_trace->th_next = NULL; 6703 th_trace->th_prev = NULL; 6704 kmem_free(th_trace, sizeof (th_trace_t)); 6705 6706 mutex_exit(&ipif->ipif_ill->ill_lock); 6707 } 6708 6709 /* 6710 * Verify that this thread has no refs to the ill and free 6711 * the trace buffers 6712 */ 6713 /* ARGSUSED */ 6714 void 6715 ill_thread_exit(ill_t *ill, void *dummy) 6716 { 6717 th_trace_t *th_trace; 6718 6719 mutex_enter(&ill->ill_lock); 6720 6721 th_trace = th_trace_ill_lookup(ill); 6722 if (th_trace == NULL) { 6723 mutex_exit(&ill->ill_lock); 6724 return; 6725 } 6726 ASSERT(th_trace->th_refcnt == 0); 6727 /* unlink th_trace and free it */ 6728 *th_trace->th_prev = th_trace->th_next; 6729 if (th_trace->th_next != NULL) 6730 th_trace->th_next->th_prev = th_trace->th_prev; 6731 th_trace->th_next = NULL; 6732 th_trace->th_prev = NULL; 6733 kmem_free(th_trace, sizeof (th_trace_t)); 6734 6735 mutex_exit(&ill->ill_lock); 6736 } 6737 #endif 6738 6739 #ifdef ILL_DEBUG 6740 void 6741 ip_thread_exit_stack(ip_stack_t *ipst) 6742 { 6743 ill_t *ill; 6744 ipif_t *ipif; 6745 ill_walk_context_t ctx; 6746 6747 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 6748 ill = ILL_START_WALK_ALL(&ctx, ipst); 6749 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 6750 for (ipif = ill->ill_ipif; ipif != NULL; 6751 ipif = ipif->ipif_next) { 6752 ipif_thread_exit(ipif, NULL); 6753 } 6754 ill_thread_exit(ill, NULL); 6755 } 6756 rw_exit(&ipst->ips_ill_g_lock); 6757 6758 ire_walk(ire_thread_exit, NULL, ipst); 6759 ndp_walk_common(ipst->ips_ndp4, NULL, nce_thread_exit, NULL, B_FALSE); 6760 ndp_walk_common(ipst->ips_ndp6, NULL, nce_thread_exit, NULL, B_FALSE); 6761 } 6762 6763 /* 6764 * This is a function which 
is called from thread_exit 6765 * that can be used to debug reference count issues in IP. See comment in 6766 * <inet/ip.h> on how it is used. 6767 */ 6768 void 6769 ip_thread_exit(void) 6770 { 6771 netstack_t *ns; 6772 6773 ns = netstack_get_current(); 6774 if (ns != NULL) { 6775 ip_thread_exit_stack(ns->netstack_ip); 6776 netstack_rele(ns); 6777 } 6778 } 6779 6780 /* 6781 * Called when ipif is unplumbed or when memory alloc fails 6782 */ 6783 void 6784 ipif_trace_cleanup(ipif_t *ipif) 6785 { 6786 int i; 6787 th_trace_t *th_trace; 6788 th_trace_t *th_trace_next; 6789 6790 for (i = 0; i < IP_TR_HASH_MAX; i++) { 6791 for (th_trace = ipif->ipif_trace[i]; th_trace != NULL; 6792 th_trace = th_trace_next) { 6793 th_trace_next = th_trace->th_next; 6794 kmem_free(th_trace, sizeof (th_trace_t)); 6795 } 6796 ipif->ipif_trace[i] = NULL; 6797 } 6798 } 6799 6800 /* 6801 * Called when ill is unplumbed or when memory alloc fails 6802 */ 6803 void 6804 ill_trace_cleanup(ill_t *ill) 6805 { 6806 int i; 6807 th_trace_t *th_trace; 6808 th_trace_t *th_trace_next; 6809 6810 for (i = 0; i < IP_TR_HASH_MAX; i++) { 6811 for (th_trace = ill->ill_trace[i]; th_trace != NULL; 6812 th_trace = th_trace_next) { 6813 th_trace_next = th_trace->th_next; 6814 kmem_free(th_trace, sizeof (th_trace_t)); 6815 } 6816 ill->ill_trace[i] = NULL; 6817 } 6818 } 6819 6820 #else 6821 void ip_thread_exit(void) {} 6822 #endif 6823 6824 void 6825 ipif_refhold_locked(ipif_t *ipif) 6826 { 6827 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6828 ipif->ipif_refcnt++; 6829 IPIF_TRACE_REF(ipif); 6830 } 6831 6832 void 6833 ipif_refhold(ipif_t *ipif) 6834 { 6835 ill_t *ill; 6836 6837 ill = ipif->ipif_ill; 6838 mutex_enter(&ill->ill_lock); 6839 ipif->ipif_refcnt++; 6840 IPIF_TRACE_REF(ipif); 6841 mutex_exit(&ill->ill_lock); 6842 } 6843 6844 /* 6845 * Must not be called while holding any locks. Otherwise if this is 6846 * the last reference to be released there is a chance of recursive mutex 6847 * panic due to ipif_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 6848 * to restart an ioctl. 6849 */ 6850 void 6851 ipif_refrele(ipif_t *ipif) 6852 { 6853 ill_t *ill; 6854 6855 ill = ipif->ipif_ill; 6856 6857 mutex_enter(&ill->ill_lock); 6858 ASSERT(ipif->ipif_refcnt != 0); 6859 ipif->ipif_refcnt--; 6860 IPIF_UNTRACE_REF(ipif); 6861 if (ipif->ipif_refcnt != 0) { 6862 mutex_exit(&ill->ill_lock); 6863 return; 6864 } 6865 6866 /* Drops the ill_lock */ 6867 ipif_ill_refrele_tail(ill); 6868 } 6869 6870 ipif_t * 6871 ipif_get_next_ipif(ipif_t *curr, ill_t *ill) 6872 { 6873 ipif_t *ipif; 6874 6875 mutex_enter(&ill->ill_lock); 6876 for (ipif = (curr == NULL ? ill->ill_ipif : curr->ipif_next); 6877 ipif != NULL; ipif = ipif->ipif_next) { 6878 if (!IPIF_CAN_LOOKUP(ipif)) 6879 continue; 6880 ipif_refhold_locked(ipif); 6881 mutex_exit(&ill->ill_lock); 6882 return (ipif); 6883 } 6884 mutex_exit(&ill->ill_lock); 6885 return (NULL); 6886 } 6887 6888 /* 6889 * TODO: make this table extendible at run time 6890 * Return a pointer to the mac type info for 'mac_type' 6891 */ 6892 static ip_m_t * 6893 ip_m_lookup(t_uscalar_t mac_type) 6894 { 6895 ip_m_t *ipm; 6896 6897 for (ipm = ip_m_tbl; ipm < A_END(ip_m_tbl); ipm++) 6898 if (ipm->ip_m_mac_type == mac_type) 6899 return (ipm); 6900 return (NULL); 6901 } 6902 6903 /* 6904 * ip_rt_add is called to add an IPv4 route to the forwarding table. 6905 * ipif_arg is passed in to associate it with the correct interface. 
6906 * We may need to restart this operation if the ipif cannot be looked up
6907 * due to an exclusive operation that is currently in progress. The restart
6908 * entry point is specified by 'func'.
6909 */
6910 int
6911 ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
6912 ipaddr_t src_addr, int flags, ipif_t *ipif_arg, ipif_t *src_ipif,
6913 ire_t **ire_arg, boolean_t ioctl_msg, queue_t *q, mblk_t *mp,
6914 ipsq_func_t func, struct rtsa_s *sp, ip_stack_t *ipst)
6915 {
6916 ire_t *ire;
6917 ire_t *gw_ire = NULL;
6918 ipif_t *ipif = NULL;
6919 boolean_t ipif_refheld = B_FALSE;
6920 uint_t type;
6921 int match_flags = MATCH_IRE_TYPE;
6922 int error;
6923 tsol_gc_t *gc = NULL;
6924 tsol_gcgrp_t *gcgrp = NULL;
6925 boolean_t gcgrp_xtraref = B_FALSE;
6926
6927 ip1dbg(("ip_rt_add:"));
6928
6929 if (ire_arg != NULL)
6930 *ire_arg = NULL;
6931
6932 /*
6933 * If this is the case of RTF_HOST being set, then we set the netmask
6934 * to all ones (regardless of whether one was supplied).
6935 */
6936 if (flags & RTF_HOST)
6937 mask = IP_HOST_MASK;
6938
6939 /*
6940 * Prevent routes with a zero gateway from being created (since
6941 * interfaces can currently be plumbed and brought up with no assigned
6942 * address).
6943 * For routes with RTA_SRCIFP, the gateway address can be 0.0.0.0.
6944 */
6945 if (gw_addr == 0 && src_ipif == NULL)
6946 return (ENETUNREACH);
6947 /*
6948 * Get the ipif, if any, corresponding to the gw_addr
6949 */
6950 if (gw_addr != 0) {
6951 ipif = ipif_lookup_interface(gw_addr, dst_addr, q, mp, func,
6952 &error, ipst);
6953 if (ipif != NULL) {
6954 if (IS_VNI(ipif->ipif_ill)) {
6955 ipif_refrele(ipif);
6956 return (EINVAL);
6957 }
6958 ipif_refheld = B_TRUE;
6959 } else if (error == EINPROGRESS) {
6960 ip1dbg(("ip_rt_add: null and EINPROGRESS"));
6961 return (EINPROGRESS);
6962 } else {
6963 error = 0;
6964 }
6965 }
6966
6967 if (ipif != NULL) {
6968 ip1dbg(("ip_rt_add: ipif_lookup_interface done ipif nonnull"));
6969 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
6970 } else {
6971 ip1dbg(("ip_rt_add: ipif_lookup_interface done ipif is null"));
6972 }
6973
6974 /*
6975 * GateD will attempt to create routes with a loopback interface
6976 * address as the gateway and with RTF_GATEWAY set. We allow
6977 * these routes to be added, but create them as interface routes
6978 * since the gateway is an interface address.
6979 */
6980 if ((ipif != NULL) && (ipif->ipif_ire_type == IRE_LOOPBACK)) {
6981 flags &= ~RTF_GATEWAY;
6982 if (gw_addr == INADDR_LOOPBACK && dst_addr == INADDR_LOOPBACK &&
6983 mask == IP_HOST_MASK) {
6984 ire = ire_ctable_lookup(dst_addr, 0, IRE_LOOPBACK, ipif,
6985 ALL_ZONES, NULL, match_flags, ipst);
6986 if (ire != NULL) {
6987 ire_refrele(ire);
6988 if (ipif_refheld)
6989 ipif_refrele(ipif);
6990 return (EEXIST);
6991 }
6992 ip1dbg(("ip_rt_add: 0x%p creating IRE 0x%x "
6993 "for 0x%x\n", (void *)ipif,
6994 ipif->ipif_ire_type,
6995 ntohl(ipif->ipif_lcl_addr)));
6996 ire = ire_create(
6997 (uchar_t *)&dst_addr, /* dest address */
6998 (uchar_t *)&mask, /* mask */
6999 (uchar_t *)&ipif->ipif_src_addr,
7000 NULL, /* no gateway */
7001 NULL,
7002 &ipif->ipif_mtu,
7003 NULL,
7004 ipif->ipif_rq, /* recv-from queue */
7005 NULL, /* no send-to queue */
7006 ipif->ipif_ire_type, /* LOOPBACK */
7007 NULL,
7008 ipif,
7009 NULL,
7010 0,
7011 0,
7012 0,
7013 (ipif->ipif_flags & IPIF_PRIVATE) ?
7014 RTF_PRIVATE : 0, 7015 &ire_uinfo_null, 7016 NULL, 7017 NULL, 7018 ipst); 7019 7020 if (ire == NULL) { 7021 if (ipif_refheld) 7022 ipif_refrele(ipif); 7023 return (ENOMEM); 7024 } 7025 error = ire_add(&ire, q, mp, func, B_FALSE); 7026 if (error == 0) 7027 goto save_ire; 7028 if (ipif_refheld) 7029 ipif_refrele(ipif); 7030 return (error); 7031 7032 } 7033 } 7034 7035 /* 7036 * Traditionally, interface routes are ones where RTF_GATEWAY isn't set 7037 * and the gateway address provided is one of the system's interface 7038 * addresses. By using the routing socket interface and supplying an 7039 * RTA_IFP sockaddr with an interface index, an alternate method of 7040 * specifying an interface route to be created is available which uses 7041 * the interface index that specifies the outgoing interface rather than 7042 * the address of an outgoing interface (which may not be able to 7043 * uniquely identify an interface). When coupled with the RTF_GATEWAY 7044 * flag, routes can be specified which not only specify the next-hop to 7045 * be used when routing to a certain prefix, but also which outgoing 7046 * interface should be used. 7047 * 7048 * Previously, interfaces would have unique addresses assigned to them 7049 * and so the address assigned to a particular interface could be used 7050 * to identify a particular interface. One exception to this was the 7051 * case of an unnumbered interface (where IPIF_UNNUMBERED was set). 7052 * 7053 * With the advent of IPv6 and its link-local addresses, this 7054 * restriction was relaxed and interfaces could share addresses between 7055 * themselves. In fact, typically all of the link-local interfaces on 7056 * an IPv6 node or router will have the same link-local address. In 7057 * order to differentiate between these interfaces, the use of an 7058 * interface index is necessary and this index can be carried inside a 7059 * RTA_IFP sockaddr (which is actually a sockaddr_dl). One restriction 7060 * of using the interface index, however, is that all of the ipif's that 7061 * are part of an ill have the same index and so the RTA_IFP sockaddr 7062 * cannot be used to differentiate between ipif's (or logical 7063 * interfaces) that belong to the same ill (physical interface). 7064 * 7065 * For example, in the following case involving IPv4 interfaces and 7066 * logical interfaces 7067 * 7068 * 192.0.2.32 255.255.255.224 192.0.2.33 U if0 7069 * 192.0.2.32 255.255.255.224 192.0.2.34 U if0:1 7070 * 192.0.2.32 255.255.255.224 192.0.2.35 U if0:2 7071 * 7072 * the ipif's corresponding to each of these interface routes can be 7073 * uniquely identified by the "gateway" (actually interface address). 7074 * 7075 * In this case involving multiple IPv6 default routes to a particular 7076 * link-local gateway, the use of RTA_IFP is necessary to specify which 7077 * default route is of interest: 7078 * 7079 * default fe80::123:4567:89ab:cdef U if0 7080 * default fe80::123:4567:89ab:cdef U if1 7081 */ 7082 7083 /* RTF_GATEWAY not set */ 7084 if (!(flags & RTF_GATEWAY)) { 7085 queue_t *stq; 7086 queue_t *rfq = NULL; 7087 ill_t *in_ill = NULL; 7088 7089 if (sp != NULL) { 7090 ip2dbg(("ip_rt_add: gateway security attributes " 7091 "cannot be set with interface route\n")); 7092 if (ipif_refheld) 7093 ipif_refrele(ipif); 7094 return (EINVAL); 7095 } 7096 7097 /* 7098 * As the interface index specified with the RTA_IFP sockaddr is 7099 * the same for all ipif's off of an ill, the matching logic 7100 * below uses MATCH_IRE_ILL if such an index was specified. 
7101 * This means that routes sharing the same prefix when added 7102 * using a RTA_IFP sockaddr must have distinct interface 7103 * indices (namely, they must be on distinct ill's). 7104 * 7105 * On the other hand, since the gateway address will usually be 7106 * different for each ipif on the system, the matching logic 7107 * uses MATCH_IRE_IPIF in the case of a traditional interface 7108 * route. This means that interface routes for the same prefix 7109 * can be created if they belong to distinct ipif's and if a 7110 * RTA_IFP sockaddr is not present. 7111 */ 7112 if (ipif_arg != NULL) { 7113 if (ipif_refheld) { 7114 ipif_refrele(ipif); 7115 ipif_refheld = B_FALSE; 7116 } 7117 ipif = ipif_arg; 7118 match_flags |= MATCH_IRE_ILL; 7119 } else { 7120 /* 7121 * Check the ipif corresponding to the gw_addr 7122 */ 7123 if (ipif == NULL) 7124 return (ENETUNREACH); 7125 match_flags |= MATCH_IRE_IPIF; 7126 } 7127 ASSERT(ipif != NULL); 7128 /* 7129 * If src_ipif is not NULL, we have to create 7130 * an ire with non-null ire_in_ill value 7131 */ 7132 if (src_ipif != NULL) { 7133 in_ill = src_ipif->ipif_ill; 7134 } 7135 7136 /* 7137 * We check for an existing entry at this point. 7138 * 7139 * Since a netmask isn't passed in via the ioctl interface 7140 * (SIOCADDRT), we don't check for a matching netmask in that 7141 * case. 7142 */ 7143 if (!ioctl_msg) 7144 match_flags |= MATCH_IRE_MASK; 7145 if (src_ipif != NULL) { 7146 /* Look up in the special table */ 7147 ire = ire_srcif_table_lookup(dst_addr, IRE_INTERFACE, 7148 ipif, src_ipif->ipif_ill, match_flags); 7149 } else { 7150 ire = ire_ftable_lookup(dst_addr, mask, 0, 7151 IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, 7152 NULL, match_flags, ipst); 7153 } 7154 if (ire != NULL) { 7155 ire_refrele(ire); 7156 if (ipif_refheld) 7157 ipif_refrele(ipif); 7158 return (EEXIST); 7159 } 7160 7161 if (src_ipif != NULL) { 7162 /* 7163 * Create the special ire for the IRE table 7164 * which hangs out of ire_in_ill. This ire 7165 * is in-between IRE_CACHE and IRE_INTERFACE. 7166 * Thus rfq is non-NULL. 7167 */ 7168 rfq = ipif->ipif_rq; 7169 } 7170 /* Create the usual interface ires */ 7171 7172 stq = (ipif->ipif_net_type == IRE_IF_RESOLVER) 7173 ? ipif->ipif_rq : ipif->ipif_wq; 7174 7175 /* 7176 * Create a copy of the IRE_LOOPBACK, 7177 * IRE_IF_NORESOLVER or IRE_IF_RESOLVER with 7178 * the modified address and netmask. 7179 */ 7180 ire = ire_create( 7181 (uchar_t *)&dst_addr, 7182 (uint8_t *)&mask, 7183 (uint8_t *)&ipif->ipif_src_addr, 7184 NULL, 7185 NULL, 7186 &ipif->ipif_mtu, 7187 NULL, 7188 rfq, 7189 stq, 7190 ipif->ipif_net_type, 7191 ipif->ipif_resolver_mp, 7192 ipif, 7193 in_ill, 7194 0, 7195 0, 7196 0, 7197 flags, 7198 &ire_uinfo_null, 7199 NULL, 7200 NULL, 7201 ipst); 7202 if (ire == NULL) { 7203 if (ipif_refheld) 7204 ipif_refrele(ipif); 7205 return (ENOMEM); 7206 } 7207 7208 /* 7209 * Some software (for example, GateD and Sun Cluster) attempts 7210 * to create (what amount to) IRE_PREFIX routes with the 7211 * loopback address as the gateway. This is primarily done to 7212 * set up prefixes with the RTF_REJECT flag set (for example, 7213 * when generating aggregate routes.) 7214 * 7215 * If the IRE type (as defined by ipif->ipif_net_type) is 7216 * IRE_LOOPBACK, then we map the request into a 7217 * IRE_IF_NORESOLVER. 7218 * 7219 * Needless to say, the real IRE_LOOPBACK is NOT created by this 7220 * routine, but rather using ire_create() directly. 
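 *
 * For illustration (a sketch; the prefix is an example only), such an
 * aggregate reject route might be requested as:
 *
 *	# route add 192.0.2.0 -netmask 255.255.255.0 127.0.0.1 -reject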
7221 * 7222 */ 7223 if (ipif->ipif_net_type == IRE_LOOPBACK) 7224 ire->ire_type = IRE_IF_NORESOLVER; 7225 7226 error = ire_add(&ire, q, mp, func, B_FALSE); 7227 if (error == 0) 7228 goto save_ire; 7229 7230 /* 7231 * In the result of failure, ire_add() will have already 7232 * deleted the ire in question, so there is no need to 7233 * do that here. 7234 */ 7235 if (ipif_refheld) 7236 ipif_refrele(ipif); 7237 return (error); 7238 } 7239 if (ipif_refheld) { 7240 ipif_refrele(ipif); 7241 ipif_refheld = B_FALSE; 7242 } 7243 7244 if (src_ipif != NULL) { 7245 /* RTA_SRCIFP is not supported on RTF_GATEWAY */ 7246 ip2dbg(("ip_rt_add: SRCIF cannot be set with gateway route\n")); 7247 return (EINVAL); 7248 } 7249 /* 7250 * Get an interface IRE for the specified gateway. 7251 * If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the 7252 * gateway, it is currently unreachable and we fail the request 7253 * accordingly. 7254 */ 7255 ipif = ipif_arg; 7256 if (ipif_arg != NULL) 7257 match_flags |= MATCH_IRE_ILL; 7258 gw_ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE, ipif_arg, NULL, 7259 ALL_ZONES, 0, NULL, match_flags, ipst); 7260 if (gw_ire == NULL) 7261 return (ENETUNREACH); 7262 7263 /* 7264 * We create one of three types of IREs as a result of this request 7265 * based on the netmask. A netmask of all ones (which is automatically 7266 * assumed when RTF_HOST is set) results in an IRE_HOST being created. 7267 * An all zeroes netmask implies a default route so an IRE_DEFAULT is 7268 * created. Otherwise, an IRE_PREFIX route is created for the 7269 * destination prefix. 7270 */ 7271 if (mask == IP_HOST_MASK) 7272 type = IRE_HOST; 7273 else if (mask == 0) 7274 type = IRE_DEFAULT; 7275 else 7276 type = IRE_PREFIX; 7277 7278 /* check for a duplicate entry */ 7279 ire = ire_ftable_lookup(dst_addr, mask, gw_addr, type, ipif_arg, 7280 NULL, ALL_ZONES, 0, NULL, 7281 match_flags | MATCH_IRE_MASK | MATCH_IRE_GW, ipst); 7282 if (ire != NULL) { 7283 ire_refrele(gw_ire); 7284 ire_refrele(ire); 7285 return (EEXIST); 7286 } 7287 7288 /* Security attribute exists */ 7289 if (sp != NULL) { 7290 tsol_gcgrp_addr_t ga; 7291 7292 /* find or create the gateway credentials group */ 7293 ga.ga_af = AF_INET; 7294 IN6_IPADDR_TO_V4MAPPED(gw_addr, &ga.ga_addr); 7295 7296 /* we hold reference to it upon success */ 7297 gcgrp = gcgrp_lookup(&ga, B_TRUE); 7298 if (gcgrp == NULL) { 7299 ire_refrele(gw_ire); 7300 return (ENOMEM); 7301 } 7302 7303 /* 7304 * Create and add the security attribute to the group; a 7305 * reference to the group is made upon allocating a new 7306 * entry successfully. If it finds an already-existing 7307 * entry for the security attribute in the group, it simply 7308 * returns it and no new reference is made to the group. 7309 */ 7310 gc = gc_create(sp, gcgrp, &gcgrp_xtraref); 7311 if (gc == NULL) { 7312 /* release reference held by gcgrp_lookup */ 7313 GCGRP_REFRELE(gcgrp); 7314 ire_refrele(gw_ire); 7315 return (ENOMEM); 7316 } 7317 } 7318 7319 /* Create the IRE. */ 7320 ire = ire_create( 7321 (uchar_t *)&dst_addr, /* dest address */ 7322 (uchar_t *)&mask, /* mask */ 7323 /* src address assigned by the caller? */ 7324 (uchar_t *)(((src_addr != INADDR_ANY) && 7325 (flags & RTF_SETSRC)) ? 
&src_addr : NULL),
7326 (uchar_t *)&gw_addr, /* gateway address */
7327 NULL, /* no in-srcaddress */
7328 &gw_ire->ire_max_frag,
7329 NULL, /* no Fast Path header */
7330 NULL, /* no recv-from queue */
7331 NULL, /* no send-to queue */
7332 (ushort_t)type, /* IRE type */
7333 NULL,
7334 ipif_arg,
7335 NULL,
7336 0,
7337 0,
7338 0,
7339 flags,
7340 &gw_ire->ire_uinfo, /* Inherit ULP info from gw */
7341 gc, /* security attribute */
7342 NULL,
7343 ipst);
7344
7345 /*
7346 * The ire holds a reference to the 'gc' and the 'gc' holds a
7347 * reference to the 'gcgrp'. We can now release the extra reference
7348 * the 'gcgrp' acquired in the gcgrp_lookup, if it was not used.
7349 */
7350 if (gcgrp_xtraref)
7351 GCGRP_REFRELE(gcgrp);
7352 if (ire == NULL) {
7353 if (gc != NULL)
7354 GC_REFRELE(gc);
7355 ire_refrele(gw_ire);
7356 return (ENOMEM);
7357 }
7358
7359 /*
7360 * POLICY: should we allow an RTF_HOST with address INADDR_ANY?
7361 * SunOS socket stuff does, but do we really want to allow 0.0.0.0?
7362 */
7363
7364 /* Add the new IRE. */
7365 error = ire_add(&ire, q, mp, func, B_FALSE);
7366 if (error != 0) {
7367 /*
7368 * In the event of failure, ire_add() will have already
7369 * deleted the ire in question, so there is no need to
7370 * do that here.
7371 */
7372 ire_refrele(gw_ire);
7373 return (error);
7374 }
7375
7376 if (flags & RTF_MULTIRT) {
7377 /*
7378 * Invoke the CGTP (multirouting) filtering module
7379 * to add the dst address in the filtering database.
7380 * Replicated inbound packets coming from that address
7381 * will be filtered to discard the duplicates.
7382 * It is not necessary to call the CGTP filter hook
7383 * when the dst address is a broadcast or multicast,
7384 * because an IP source address cannot be a broadcast
7385 * or a multicast.
7386 */
7387 ire_t *ire_dst = ire_ctable_lookup(ire->ire_addr, 0,
7388 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
7389 if (ire_dst != NULL) {
7390 ip_cgtp_bcast_add(ire, ire_dst, ipst);
7391 ire_refrele(ire_dst);
7392 goto save_ire;
7393 }
7394 if ((ip_cgtp_filter_ops != NULL) && !CLASSD(ire->ire_addr) &&
7395 ipst->ips_netstack->netstack_stackid == GLOBAL_NETSTACKID) {
7396 int res = ip_cgtp_filter_ops->cfo_add_dest_v4(
7397 ire->ire_addr,
7398 ire->ire_gateway_addr,
7399 ire->ire_src_addr,
7400 gw_ire->ire_src_addr);
7401 if (res != 0) {
7402 ire_refrele(gw_ire);
7403 ire_delete(ire);
7404 return (res);
7405 }
7406 }
7407 }
7408
7409 /*
7410 * Now that the prefix IRE entry has been created, delete any
7411 * existing gateway IRE cache entries as well as any IRE caches
7412 * using the gateway, and force them to be created through
7413 * ip_newroute.
7414 */
7415 if (gc != NULL) {
7416 ASSERT(gcgrp != NULL);
7417 ire_clookup_delete_cache_gw(gw_addr, ALL_ZONES, ipst);
7418 }
7419
7420 save_ire:
7421 if (gw_ire != NULL) {
7422 ire_refrele(gw_ire);
7423 }
7424 /*
7425 * We do not do save_ire for the routes added with RTA_SRCIFP
7426 * flag. This route is only added and deleted by mipagent.
7427 * So, for simplicity of design, we refrain from saving
7428 * ires that are created with srcif value. This may change
7429 * in future if we find more usage of the srcifp feature.
7430 */
7431 if (ipif != NULL && src_ipif == NULL) {
7432 /*
7433 * Save enough information so that we can recreate the IRE if
7434 * the interface goes down and then up. The metrics associated
7435 * with the route will be saved as well when rts_setmetrics() is
7436 * called after the IRE has been created.
In the case where
7437 * memory cannot be allocated, none of this information will be
7438 * saved.
7439 */
7440 ipif_save_ire(ipif, ire);
7441 }
7442 if (ioctl_msg)
7443 ip_rts_rtmsg(RTM_OLDADD, ire, 0, ipst);
7444 if (ire_arg != NULL) {
7445 /*
7446 * Store the ire that was successfully added into where ire_arg
7447 * points to so that callers don't have to look it up
7448 * themselves (but they are responsible for ire_refrele()ing
7449 * the ire when they are finished with it).
7450 */
7451 *ire_arg = ire;
7452 } else {
7453 ire_refrele(ire); /* Held in ire_add */
7454 }
7455 if (ipif_refheld)
7456 ipif_refrele(ipif);
7457 return (0);
7458 }
7459
7460 /*
7461 * ip_rt_delete is called to delete an IPv4 route.
7462 * ipif_arg is passed in to associate it with the correct interface.
7463 * src_ipif is passed to associate the incoming interface of the packet.
7464 * We may need to restart this operation if the ipif cannot be looked up
7465 * due to an exclusive operation that is currently in progress. The restart
7466 * entry point is specified by 'func'.
7467 */
7468 /* ARGSUSED4 */
7469 int
7470 ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
7471 uint_t rtm_addrs, int flags, ipif_t *ipif_arg, ipif_t *src_ipif,
7472 boolean_t ioctl_msg, queue_t *q, mblk_t *mp, ipsq_func_t func,
7473 ip_stack_t *ipst)
7474 {
7475 ire_t *ire = NULL;
7476 ipif_t *ipif;
7477 boolean_t ipif_refheld = B_FALSE;
7478 uint_t type;
7479 uint_t match_flags = MATCH_IRE_TYPE;
7480 int err = 0;
7481
7482 ip1dbg(("ip_rt_delete:"));
7483 /*
7484 * If this is the case of RTF_HOST being set, then we set the netmask
7485 * to all ones. Otherwise, we use the netmask if one was supplied.
7486 */
7487 if (flags & RTF_HOST) {
7488 mask = IP_HOST_MASK;
7489 match_flags |= MATCH_IRE_MASK;
7490 } else if (rtm_addrs & RTA_NETMASK) {
7491 match_flags |= MATCH_IRE_MASK;
7492 }
7493
7494 /*
7495 * Note that RTF_GATEWAY is never set on a delete, therefore
7496 * we check if the gateway address is one of our interfaces first,
7497 * and fall back on RTF_GATEWAY routes.
7498 *
7499 * This makes it possible to delete an original
7500 * IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1.
7501 *
7502 * As the interface index specified with the RTA_IFP sockaddr is the
7503 * same for all ipif's off of an ill, the matching logic below uses
7504 * MATCH_IRE_ILL if such an index was specified. This means a route
7505 * sharing the same prefix and interface index as the route
7506 * intended to be deleted might be deleted instead if a RTA_IFP sockaddr
7507 * is specified in the request.
7508 *
7509 * On the other hand, since the gateway address will usually be
7510 * different for each ipif on the system, the matching logic
7511 * uses MATCH_IRE_IPIF in the case of a traditional interface
7512 * route. This means that interface routes for the same prefix can be
7513 * uniquely identified if they belong to distinct ipif's and if a
7514 * RTA_IFP sockaddr is not present.
7515 *
7516 * For more detail on specifying routes by gateway address and by
7517 * interface index, see the comments in ip_rt_add().
7518 * gw_addr could be zero in some cases when both RTA_SRCIFP and
7519 * RTA_IFP are specified. If RTA_SRCIFP is specified and both
7520 * RTA_IFP and gateway_addr are NULL/zero, then delete will not
7521 * succeed.
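 *
 * A matching delete for a traditional interface route could thus look
 * like (illustrative addresses only):
 *
 *	# route delete 192.0.2.0 -netmask 255.255.255.0 192.0.2.1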
     */
    if (src_ipif != NULL) {
        if (ipif_arg == NULL && gw_addr != 0) {
            ipif_arg = ipif_lookup_interface(gw_addr, dst_addr,
                q, mp, func, &err, ipst);
            if (ipif_arg != NULL)
                ipif_refheld = B_TRUE;
        }
        if (ipif_arg == NULL) {
            err = (err == EINPROGRESS) ? err : ESRCH;
            return (err);
        }
        ipif = ipif_arg;
    } else {
        ipif = ipif_lookup_interface(gw_addr, dst_addr,
            q, mp, func, &err, ipst);
        if (ipif != NULL)
            ipif_refheld = B_TRUE;
        else if (err == EINPROGRESS)
            return (err);
        else
            err = 0;
    }
    if (ipif != NULL) {
        if (ipif_arg != NULL) {
            if (ipif_refheld) {
                ipif_refrele(ipif);
                ipif_refheld = B_FALSE;
            }
            ipif = ipif_arg;
            match_flags |= MATCH_IRE_ILL;
        } else {
            match_flags |= MATCH_IRE_IPIF;
        }
        if (src_ipif != NULL) {
            ire = ire_srcif_table_lookup(dst_addr, IRE_INTERFACE,
                ipif, src_ipif->ipif_ill, match_flags);
        } else {
            if (ipif->ipif_ire_type == IRE_LOOPBACK) {
                ire = ire_ctable_lookup(dst_addr, 0,
                    IRE_LOOPBACK, ipif, ALL_ZONES, NULL,
                    match_flags, ipst);
            }
            if (ire == NULL) {
                ire = ire_ftable_lookup(dst_addr, mask, 0,
                    IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0,
                    NULL, match_flags, ipst);
            }
        }
    }

    if (ire == NULL) {
        /*
         * At this point, the gateway address is not one of our own
         * addresses or a matching interface route was not found. We
         * set the IRE type to lookup based on whether
         * this is a host route, a default route or just a prefix.
         *
         * If an ipif_arg was passed in, then the lookup is based on an
         * interface index so MATCH_IRE_ILL is added to match_flags.
         * In any case, MATCH_IRE_IPIF is cleared and MATCH_IRE_GW is
         * set as the route being looked up is not a traditional
         * interface route.
         * Since we do not add a gateway route with srcipif, we don't
         * expect to find it either.
         */
        if (src_ipif != NULL) {
            if (ipif_refheld)
                ipif_refrele(ipif);
            return (ESRCH);
        } else {
            match_flags &= ~MATCH_IRE_IPIF;
            match_flags |= MATCH_IRE_GW;
            if (ipif_arg != NULL)
                match_flags |= MATCH_IRE_ILL;
            if (mask == IP_HOST_MASK)
                type = IRE_HOST;
            else if (mask == 0)
                type = IRE_DEFAULT;
            else
                type = IRE_PREFIX;
            ire = ire_ftable_lookup(dst_addr, mask, gw_addr, type,
                ipif_arg, NULL, ALL_ZONES, 0, NULL, match_flags,
                ipst);
        }
    }

    if (ipif_refheld)
        ipif_refrele(ipif);

    /* ipif is not refheld anymore */
    if (ire == NULL)
        return (ESRCH);

    if (ire->ire_flags & RTF_MULTIRT) {
        /*
         * Invoke the CGTP (multirouting) filtering module
         * to remove the dst address from the filtering database.
         * Packets coming from that address will no longer be
         * filtered to remove duplicates.
         */
        if (ip_cgtp_filter_ops != NULL &&
            ipst->ips_netstack->netstack_stackid == GLOBAL_NETSTACKID) {
            err = ip_cgtp_filter_ops->cfo_del_dest_v4(
                ire->ire_addr, ire->ire_gateway_addr);
        }
        ip_cgtp_bcast_delete(ire, ipst);
    }

    ipif = ire->ire_ipif;
    /*
     * Removing from ipif_saved_ire_mp is not necessary
     * when src_ipif is non-NULL. ip_rt_add does not
     * save the ires when src_ipif is non-NULL.
     */
    if (ipif != NULL && src_ipif == NULL) {
        ipif_remove_ire(ipif, ire);
    }
    if (ioctl_msg)
        ip_rts_rtmsg(RTM_OLDDEL, ire, 0, ipst);
    ire_delete(ire);
    ire_refrele(ire);
    return (err);
}

/*
 * ip_siocaddrt is called to complete processing of an SIOCADDRT IOCTL.
 */
/* ARGSUSED */
int
ip_siocaddrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *dummy_if_req)
{
    ipaddr_t dst_addr;
    ipaddr_t gw_addr;
    ipaddr_t mask;
    int error = 0;
    mblk_t *mp1;
    struct rtentry *rt;
    ipif_t *ipif = NULL;
    ip_stack_t *ipst;

    ASSERT(q->q_next == NULL);
    ipst = CONNQ_TO_IPST(q);

    ip1dbg(("ip_siocaddrt:"));
    /* Existence of mp1 verified in ip_wput_nondata */
    mp1 = mp->b_cont->b_cont;
    rt = (struct rtentry *)mp1->b_rptr;

    dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr;
    gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr;

    /*
     * If the RTF_HOST flag is on, this is a request to assign a gateway
     * to a particular host address. In this case, we set the netmask to
     * all ones for the particular destination address. Otherwise,
     * determine the netmask to be used based on dst_addr and the interfaces
     * in use.
     */
    if (rt->rt_flags & RTF_HOST) {
        mask = IP_HOST_MASK;
    } else {
        /*
         * Note that ip_subnet_mask returns a zero mask in the case of
         * default (an all-zeroes address).
         */
        mask = ip_subnet_mask(dst_addr, &ipif, ipst);
    }

    error = ip_rt_add(dst_addr, mask, gw_addr, 0, rt->rt_flags, NULL, NULL,
        NULL, B_TRUE, q, mp, ip_process_ioctl, NULL, ipst);
    if (ipif != NULL)
        ipif_refrele(ipif);
    return (error);
}

/*
 * ip_siocdelrt is called to complete processing of an SIOCDELRT IOCTL.
 */
/* ARGSUSED */
int
ip_siocdelrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *dummy_if_req)
{
    ipaddr_t dst_addr;
    ipaddr_t gw_addr;
    ipaddr_t mask;
    int error;
    mblk_t *mp1;
    struct rtentry *rt;
    ipif_t *ipif = NULL;
    ip_stack_t *ipst;

    ASSERT(q->q_next == NULL);
    ipst = CONNQ_TO_IPST(q);

    ip1dbg(("ip_siocdelrt:"));
    /* Existence of mp1 verified in ip_wput_nondata */
    mp1 = mp->b_cont->b_cont;
    rt = (struct rtentry *)mp1->b_rptr;

    dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr;
    gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr;

    /*
     * If the RTF_HOST flag is on, this is a request to delete a gateway
     * to a particular host address. In this case, we set the netmask to
     * all ones for the particular destination address. Otherwise,
     * determine the netmask to be used based on dst_addr and the interfaces
     * in use.
     */
    if (rt->rt_flags & RTF_HOST) {
        mask = IP_HOST_MASK;
    } else {
        /*
         * Note that ip_subnet_mask returns a zero mask in the case of
         * default (an all-zeroes address).
         */
        mask = ip_subnet_mask(dst_addr, &ipif, ipst);
    }

    error = ip_rt_delete(dst_addr, mask, gw_addr,
        RTA_DST | RTA_GATEWAY | RTA_NETMASK, rt->rt_flags, NULL, NULL,
        B_TRUE, q, mp, ip_process_ioctl, ipst);
    if (ipif != NULL)
        ipif_refrele(ipif);
    return (error);
}

/*
 * Enqueue the mp onto the ipsq, chained by b_next.
 * b_prev stores the function to be executed later, and b_queue the queue
 * where this mp originated.
 */
void
ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type,
    ill_t *pending_ill)
{
    conn_t *connp = NULL;

    ASSERT(MUTEX_HELD(&ipsq->ipsq_lock));
    ASSERT(func != NULL);

    mp->b_queue = q;
    mp->b_prev = (void *)func;
    mp->b_next = NULL;

    switch (type) {
    case CUR_OP:
        if (ipsq->ipsq_mptail != NULL) {
            ASSERT(ipsq->ipsq_mphead != NULL);
            ipsq->ipsq_mptail->b_next = mp;
        } else {
            ASSERT(ipsq->ipsq_mphead == NULL);
            ipsq->ipsq_mphead = mp;
        }
        ipsq->ipsq_mptail = mp;
        break;

    case NEW_OP:
        if (ipsq->ipsq_xopq_mptail != NULL) {
            ASSERT(ipsq->ipsq_xopq_mphead != NULL);
            ipsq->ipsq_xopq_mptail->b_next = mp;
        } else {
            ASSERT(ipsq->ipsq_xopq_mphead == NULL);
            ipsq->ipsq_xopq_mphead = mp;
        }
        ipsq->ipsq_xopq_mptail = mp;
        break;
    default:
        cmn_err(CE_PANIC, "ipsq_enq %d type \n", type);
    }

    if (CONN_Q(q) && pending_ill != NULL) {
        connp = Q_TO_CONN(q);

        ASSERT(MUTEX_HELD(&connp->conn_lock));
        connp->conn_oper_pending_ill = pending_ill;
    }
}

/*
 * Return the mp at the head of the ipsq. After emptying the ipsq,
 * look at the next ioctl if the current ioctl is complete; otherwise
 * return NULL, and we will resume when the current ioctl completes.
 * The current ioctl will wait till it gets a response from the
 * driver below.
 */
static mblk_t *
ipsq_dq(ipsq_t *ipsq)
{
    mblk_t *mp;

    ASSERT(MUTEX_HELD(&ipsq->ipsq_lock));

    mp = ipsq->ipsq_mphead;
    if (mp != NULL) {
        ipsq->ipsq_mphead = mp->b_next;
        if (ipsq->ipsq_mphead == NULL)
            ipsq->ipsq_mptail = NULL;
        mp->b_next = NULL;
        return (mp);
    }
    if (ipsq->ipsq_current_ipif != NULL)
        return (NULL);
    mp = ipsq->ipsq_xopq_mphead;
    if (mp != NULL) {
        ipsq->ipsq_xopq_mphead = mp->b_next;
        if (ipsq->ipsq_xopq_mphead == NULL)
            ipsq->ipsq_xopq_mptail = NULL;
        mp->b_next = NULL;
        return (mp);
    }
    return (NULL);
}

/*
 * Enter the ipsq corresponding to ill, by waiting synchronously till
 * we can enter the ipsq exclusively. Unless 'force' is used, the ipsq
 * will have to drain completely before ipsq_enter returns success.
 * ipsq_current_ipif will be set if some exclusive ioctl is in progress,
 * and the ipsq_exit logic will start the next enqueued ioctl after
 * completion of the current ioctl. If 'force' is used, we don't wait
 * for the enqueued ioctls. This is needed when a conn_close wants to
 * enter the ipsq and abort an ioctl that is somehow stuck. Unplumb
 * of an ill can also use this option. But we don't use it currently.
 */
#define ENTER_SQ_WAIT_TICKS 100
boolean_t
ipsq_enter(ill_t *ill, boolean_t force)
{
    ipsq_t *ipsq;
    boolean_t waited_enough = B_FALSE;

    /*
     * Holding the ill_lock prevents <ill-ipsq> assocs from changing.
     * Since the <ill-ipsq> assocs could change while we wait for the
     * writer, it is easier to wait on a fixed global rather than try to
     * cv_wait on a changing ipsq.
     */
    mutex_enter(&ill->ill_lock);
    for (;;) {
        if (ill->ill_state_flags & ILL_CONDEMNED) {
            mutex_exit(&ill->ill_lock);
            return (B_FALSE);
        }

        ipsq = ill->ill_phyint->phyint_ipsq;
        mutex_enter(&ipsq->ipsq_lock);
        if (ipsq->ipsq_writer == NULL &&
            (ipsq->ipsq_current_ipif == NULL || waited_enough)) {
            break;
        } else if (ipsq->ipsq_writer != NULL) {
            mutex_exit(&ipsq->ipsq_lock);
            cv_wait(&ill->ill_cv, &ill->ill_lock);
        } else {
            mutex_exit(&ipsq->ipsq_lock);
            if (force) {
                (void) cv_timedwait(&ill->ill_cv,
                    &ill->ill_lock,
                    lbolt + ENTER_SQ_WAIT_TICKS);
                waited_enough = B_TRUE;
                continue;
            } else {
                cv_wait(&ill->ill_cv, &ill->ill_lock);
            }
        }
    }

    ASSERT(ipsq->ipsq_mphead == NULL && ipsq->ipsq_mptail == NULL);
    ASSERT(ipsq->ipsq_reentry_cnt == 0);
    ipsq->ipsq_writer = curthread;
    ipsq->ipsq_reentry_cnt++;
#ifdef ILL_DEBUG
    ipsq->ipsq_depth = getpcstack(ipsq->ipsq_stack, IP_STACK_DEPTH);
#endif
    mutex_exit(&ipsq->ipsq_lock);
    mutex_exit(&ill->ill_lock);
    return (B_TRUE);
}

/*
 * The ipsq_t (ipsq) is the synchronization data structure used to serialize
 * certain critical operations like plumbing (i.e. most set ioctls),
 * multicast joins, igmp/mld timers, IPMP operations etc. On a non-IPMP
 * system there is 1 ipsq per phyint. On an IPMP system there is 1 ipsq per
 * IPMP group. The ipsq serializes exclusive ioctls issued by applications
 * on a per ipsq basis in ipsq_xopq_mphead. It also protects against multiple
 * threads executing in the ipsq. Responses from the driver pertain to the
 * current ioctl (say a DL_BIND_ACK in response to a DL_BIND_REQUEST initiated
 * as part of bringing up the interface) and are enqueued in ipsq_mphead.
 *
 * If a thread does not want to reenter the ipsq when it is already writer,
 * it must make sure that neither the specified reentry point, to be called
 * later when the ipsq is empty, nor any code path starting from that reentry
 * point ever tries to enter the ipsq again. Otherwise it can lead
 * to an infinite loop. The reentry point ip_rput_dlpi_writer is an example.
 * When the thread that is currently exclusive finishes, it (ipsq_exit)
 * dequeues the requests waiting to become exclusive in ipsq_mphead and calls
 * the reentry point. When the list at ipsq_mphead becomes empty ipsq_exit
 * proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next
 * ioctl if the current ioctl has completed. If the current ioctl is still
 * in progress it simply returns. The current ioctl could be waiting for
 * a response from another module (arp or the driver) or could be waiting for
 * the ipif/ill/ire refcnts to drop to zero. In such a case the ipsq_pending_mp
 * and ipsq_pending_ipif are set. ipsq_current_ipif is set throughout the
 * execution of the ioctl and ipsq_exit does not start the next ioctl unless
 * ipsq_current_ipif is clear, which happens only on ioctl completion.
 */

/*
 * Try to enter the ipsq exclusively, corresponding to ipif or ill. (only 1 of
 * ipif or ill can be specified). The caller ensures ipif or ill is valid by
 * ref-holding it if necessary. If the ipsq cannot be entered, the mp is queued
 * for completion.
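 *
 * Illustrative caller pattern (a sketch, not part of the original
 * comment; 'my_writer' and its unused fourth argument are hypothetical;
 * this mirrors what qwriter_ip() below actually does):
 *
 *	ipsq = ipsq_try_enter(NULL, ill, q, mp, my_writer, NEW_OP,
 *	    B_TRUE);
 *	if (ipsq == NULL)
 *		return;		-- mp was queued; my_writer() will be
 *				-- called back when the ipsq drains
 *	my_writer(ipsq, q, mp, NULL);
 *	ipsq_exit(ipsq, B_TRUE, B_TRUE);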
 */
ipsq_t *
ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp,
    ipsq_func_t func, int type, boolean_t reentry_ok)
{
    ipsq_t *ipsq;

    /* Only 1 of ipif or ill can be specified */
    ASSERT((ipif != NULL) ^ (ill != NULL));
    if (ipif != NULL)
        ill = ipif->ipif_ill;

    /*
     * lock ordering ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock.
     * ipsq of an ill can't change when ill_lock is held.
     */
    GRAB_CONN_LOCK(q);
    mutex_enter(&ill->ill_lock);
    ipsq = ill->ill_phyint->phyint_ipsq;
    mutex_enter(&ipsq->ipsq_lock);

    /*
     * 1. Enter the ipsq if we are already writer and reentry is ok.
     *    (Note: If the caller does not specify reentry_ok then neither
     *    'func' nor any of its callees must ever attempt to enter the ipsq
     *    again. Otherwise it can lead to an infinite loop.)
     * 2. Enter the ipsq if there is no current writer and this attempted
     *    entry is part of the current ioctl or operation.
     * 3. Enter the ipsq if there is no current writer and this is a new
     *    ioctl (or operation) and the ioctl (or operation) queue is
     *    empty and there is no ioctl (or operation) currently in progress.
     */
    if ((ipsq->ipsq_writer == NULL && ((type == CUR_OP) ||
        (type == NEW_OP && ipsq->ipsq_xopq_mphead == NULL &&
        ipsq->ipsq_current_ipif == NULL))) ||
        (ipsq->ipsq_writer == curthread && reentry_ok)) {
        /* Success. */
        ipsq->ipsq_reentry_cnt++;
        ipsq->ipsq_writer = curthread;
        mutex_exit(&ipsq->ipsq_lock);
        mutex_exit(&ill->ill_lock);
        RELEASE_CONN_LOCK(q);
#ifdef ILL_DEBUG
        ipsq->ipsq_depth = getpcstack(ipsq->ipsq_stack, IP_STACK_DEPTH);
#endif
        return (ipsq);
    }

    ipsq_enq(ipsq, q, mp, func, type, ill);

    mutex_exit(&ipsq->ipsq_lock);
    mutex_exit(&ill->ill_lock);
    RELEASE_CONN_LOCK(q);
    return (NULL);
}

/*
 * Try to enter the IPSQ corresponding to `ill' as writer. The caller ensures
 * ill is valid by refholding it if necessary; we will refrele. If the IPSQ
 * cannot be entered, the mp is queued for completion.
 */
void
qwriter_ip(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, int type,
    boolean_t reentry_ok)
{
    ipsq_t *ipsq;

    ipsq = ipsq_try_enter(NULL, ill, q, mp, func, type, reentry_ok);

    /*
     * Drop the caller's refhold on the ill. This is safe since we either
     * entered the IPSQ (and thus are exclusive), or failed to enter the
     * IPSQ, in which case we return without accessing ill anymore. This
     * is needed because func needs to see the correct refcount.
     * e.g. removeif can work only then.
     */
    ill_refrele(ill);
    if (ipsq != NULL) {
        (*func)(ipsq, q, mp, NULL);
        ipsq_exit(ipsq, B_TRUE, B_TRUE);
    }
}

/*
 * If there are more than ILL_GRP_CNT ills in a group,
 * we use kmem alloc'd buffers, else use the stack
 */
#define ILL_GRP_CNT    14
/*
 * Drain the ipsq, if there are messages on it, and then leave the ipsq.
 * Called by a thread that is currently exclusive on this ipsq.
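 *
 * Illustrative pairing with ipsq_enter() (a sketch, not part of the
 * original comment):
 *
 *	if (!ipsq_enter(ill, B_FALSE))
 *		return;		-- ill is condemned
 *	ipsq = ill->ill_phyint->phyint_ipsq;
 *	-- ... perform the exclusive operation ...
 *	ipsq_exit(ipsq, B_TRUE, B_TRUE);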
 */
void
ipsq_exit(ipsq_t *ipsq, boolean_t start_igmp_timer, boolean_t start_mld_timer)
{
    queue_t *q;
    mblk_t *mp;
    ipsq_func_t func;
    int next;
    ill_t **ill_list = NULL;
    size_t ill_list_size = 0;
    int cnt = 0;
    boolean_t need_ipsq_free = B_FALSE;
    ip_stack_t *ipst = ipsq->ipsq_ipst;

    ASSERT(IAM_WRITER_IPSQ(ipsq));
    mutex_enter(&ipsq->ipsq_lock);
    ASSERT(ipsq->ipsq_reentry_cnt >= 1);
    if (ipsq->ipsq_reentry_cnt != 1) {
        ipsq->ipsq_reentry_cnt--;
        mutex_exit(&ipsq->ipsq_lock);
        return;
    }

    mp = ipsq_dq(ipsq);
    while (mp != NULL) {
again:
        mutex_exit(&ipsq->ipsq_lock);
        func = (ipsq_func_t)mp->b_prev;
        q = (queue_t *)mp->b_queue;
        mp->b_prev = NULL;
        mp->b_queue = NULL;

        /*
         * If 'q' is a conn queue, it is valid, since we did a
         * refhold on the connp at the start of the ioctl.
         * If 'q' is an ill queue, it is valid, since close of an
         * ill will clean up the 'ipsq'.
         */
        (*func)(ipsq, q, mp, NULL);

        mutex_enter(&ipsq->ipsq_lock);
        mp = ipsq_dq(ipsq);
    }

    mutex_exit(&ipsq->ipsq_lock);

    /*
     * Need to grab the locks in the right order. Need to
     * atomically check (under ipsq_lock) that there are no
     * messages before relinquishing the ipsq. Also need to
     * atomically wakeup waiters on ill_cv while holding ill_lock.
     * Holding ill_g_lock ensures that ipsq list of ills is stable.
     * If we need to call ill_split_ipsq and change <ill-ipsq> we need
     * to grab ill_g_lock as writer.
     */
    rw_enter(&ipst->ips_ill_g_lock,
        ipsq->ipsq_split ? RW_WRITER : RW_READER);

    /* ipsq_refs can't change while ill_g_lock is held as reader */
    if (ipsq->ipsq_refs != 0) {
        /* At most 2 ills v4/v6 per phyint */
        cnt = ipsq->ipsq_refs << 1;
        ill_list_size = cnt * sizeof (ill_t *);
        /*
         * If memory allocation fails, we will do the split
         * the next time ipsq_exit is called for whatever reason.
         * As long as the ipsq_split flag is set the need to
         * split is remembered.
         */
        ill_list = kmem_zalloc(ill_list_size, KM_NOSLEEP);
        if (ill_list != NULL)
            cnt = ill_lock_ipsq_ills(ipsq, ill_list, cnt);
    }
    mutex_enter(&ipsq->ipsq_lock);
    mp = ipsq_dq(ipsq);
    if (mp != NULL) {
        /* oops, some message has landed up, we can't get out */
        if (ill_list != NULL)
            ill_unlock_ills(ill_list, cnt);
        rw_exit(&ipst->ips_ill_g_lock);
        if (ill_list != NULL)
            kmem_free(ill_list, ill_list_size);
        ill_list = NULL;
        ill_list_size = 0;
        cnt = 0;
        goto again;
    }

    /*
     * Split only if no ioctl is pending and if memory alloc succeeded
     * above.
     */
    if (ipsq->ipsq_split && ipsq->ipsq_current_ipif == NULL &&
        ill_list != NULL) {
        /*
         * No new ill can join this ipsq since we are holding the
         * ill_g_lock. Hence ill_split_ipsq can safely traverse the
         * ipsq. ill_split_ipsq may fail due to memory shortage.
         * If so we will retry on the next ipsq_exit.
         */
        ipsq->ipsq_split = ill_split_ipsq(ipsq);
    }

    /*
     * We are holding the ipsq lock, hence no new messages can
     * land up on the ipsq, and there are no messages currently.
     * Now safe to get out. Wake up waiters and relinquish ipsq
     * atomically while holding ill locks.
     */
    ipsq->ipsq_writer = NULL;
    ipsq->ipsq_reentry_cnt--;
    ASSERT(ipsq->ipsq_reentry_cnt == 0);
#ifdef ILL_DEBUG
    ipsq->ipsq_depth = 0;
#endif
    mutex_exit(&ipsq->ipsq_lock);
    /*
     * For IPMP this should wake up all ills in this ipsq.
     * We need to hold the ill_lock while waking up waiters to
     * avoid missed wakeups. But there is no need to acquire all
     * the ill locks and then wakeup. If we have not acquired all
     * the locks (due to memory failure above) ill_signal_ipsq_ills
     * wakes up ills one at a time after getting the right ill_lock.
     */
    ill_signal_ipsq_ills(ipsq, ill_list != NULL);
    if (ill_list != NULL)
        ill_unlock_ills(ill_list, cnt);
    if (ipsq->ipsq_refs == 0)
        need_ipsq_free = B_TRUE;
    rw_exit(&ipst->ips_ill_g_lock);
    if (ill_list != NULL)
        kmem_free(ill_list, ill_list_size);

    if (need_ipsq_free) {
        /*
         * Free the ipsq. ipsq_refs can't increase because ipsq can't be
         * looked up. ipsq can be looked up only thru ill or phyint
         * and there are no ills/phyint on this ipsq.
         */
        ipsq_delete(ipsq);
    }
    /*
     * Now start any igmp or mld timers that could not be started
     * while inside the ipsq. The timers can't be started while inside
     * the ipsq, since igmp_start_timers may need to call untimeout()
     * which can't be done while holding a lock i.e. the ipsq. Otherwise
     * there could be a deadlock since the timeout handlers
     * mld_timeout_handler / igmp_timeout_handler also synchronously
     * wait in ipsq_enter() trying to get the ipsq.
     *
     * However there is one exception to the above. If this thread is
     * itself the igmp/mld timeout handler thread, then we don't want
     * to start any new timer until the current handler is done. The
     * handler thread passes in B_FALSE for start_igmp/mld_timers, while
     * all others pass B_TRUE.
     */
    if (start_igmp_timer) {
        mutex_enter(&ipst->ips_igmp_timer_lock);
        next = ipst->ips_igmp_deferred_next;
        ipst->ips_igmp_deferred_next = INFINITY;
        mutex_exit(&ipst->ips_igmp_timer_lock);

        if (next != INFINITY)
            igmp_start_timers(next, ipst);
    }

    if (start_mld_timer) {
        mutex_enter(&ipst->ips_mld_timer_lock);
        next = ipst->ips_mld_deferred_next;
        ipst->ips_mld_deferred_next = INFINITY;
        mutex_exit(&ipst->ips_mld_timer_lock);

        if (next != INFINITY)
            mld_start_timers(next, ipst);
    }
}

/*
 * Start the current exclusive operation on `ipsq'; associate it with `ipif'
 * and `ioccmd'.
 */
void
ipsq_current_start(ipsq_t *ipsq, ipif_t *ipif, int ioccmd)
{
    ASSERT(IAM_WRITER_IPSQ(ipsq));

    mutex_enter(&ipsq->ipsq_lock);
    ASSERT(ipsq->ipsq_current_ipif == NULL);
    ASSERT(ipsq->ipsq_current_ioctl == 0);
    ipsq->ipsq_current_ipif = ipif;
    ipsq->ipsq_current_ioctl = ioccmd;
    mutex_exit(&ipsq->ipsq_lock);
}

/*
 * Finish the current exclusive operation on `ipsq'. Note that other
 * operations will not be able to proceed until an ipsq_exit() is done.
 */
void
ipsq_current_finish(ipsq_t *ipsq)
{
    ipif_t *ipif = ipsq->ipsq_current_ipif;

    ASSERT(IAM_WRITER_IPSQ(ipsq));

    /*
     * For SIOCLIFREMOVEIF, the ipif has already been blown away
     * (but we're careful to never set IPIF_CHANGING in that case).
     */
    if (ipsq->ipsq_current_ioctl != SIOCLIFREMOVEIF) {
        mutex_enter(&ipif->ipif_ill->ill_lock);
        ipif->ipif_state_flags &= ~IPIF_CHANGING;

        /* Send any queued event */
        ill_nic_info_dispatch(ipif->ipif_ill);
        mutex_exit(&ipif->ipif_ill->ill_lock);
    }

    mutex_enter(&ipsq->ipsq_lock);
    ASSERT(ipsq->ipsq_current_ipif != NULL);
    ipsq->ipsq_current_ipif = NULL;
    ipsq->ipsq_current_ioctl = 0;
    mutex_exit(&ipsq->ipsq_lock);
}

/*
 * The ill is closing. Flush all messages on the ipsq that originated
 * from this ill. Usually there won't be any messages on the ipsq_xopq_mphead
 * for this ill since ipsq_enter could not have entered until then.
 * New messages can't be queued since the CONDEMNED flag is set.
 */
static void
ipsq_flush(ill_t *ill)
{
    queue_t *q;
    mblk_t *prev;
    mblk_t *mp;
    mblk_t *mp_next;
    ipsq_t *ipsq;

    ASSERT(IAM_WRITER_ILL(ill));
    ipsq = ill->ill_phyint->phyint_ipsq;
    /*
     * Flush any messages sent up by the driver.
     */
    mutex_enter(&ipsq->ipsq_lock);
    for (prev = NULL, mp = ipsq->ipsq_mphead; mp != NULL; mp = mp_next) {
        mp_next = mp->b_next;
        q = mp->b_queue;
        if (q == ill->ill_rq || q == ill->ill_wq) {
            /* Remove the mp from the ipsq */
            if (prev == NULL)
                ipsq->ipsq_mphead = mp->b_next;
            else
                prev->b_next = mp->b_next;
            if (ipsq->ipsq_mptail == mp) {
                ASSERT(mp_next == NULL);
                ipsq->ipsq_mptail = prev;
            }
            inet_freemsg(mp);
        } else {
            prev = mp;
        }
    }
    mutex_exit(&ipsq->ipsq_lock);
    (void) ipsq_pending_mp_cleanup(ill, NULL);
    ipsq_xopq_mp_cleanup(ill, NULL);
    ill_pending_mp_cleanup(ill);
}

/* ARGSUSED */
int
ip_sioctl_slifoindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
    ill_t *ill;
    struct lifreq *lifr = (struct lifreq *)ifreq;
    boolean_t isv6;
    conn_t *connp;
    ip_stack_t *ipst;

    connp = Q_TO_CONN(q);
    ipst = connp->conn_netstack->netstack_ip;
    isv6 = connp->conn_af_isv6;
    /*
     * Set original index.
     * Failover and failback move logical interfaces
     * from one physical interface to another. The
     * original index indicates the parent of a logical
     * interface, in other words, the physical interface
     * the logical interface will be moved back to on
     * failback.
     */

    /*
     * Don't allow the original index to be changed
     * for non-failover addresses, autoconfigured
     * addresses, or IPv6 link local addresses.
     */
    if (((ipif->ipif_flags & (IPIF_NOFAILOVER | IPIF_ADDRCONF)) != 0) ||
        (isv6 && IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr))) {
        return (EINVAL);
    }
    /*
     * The new original index must be in use by some
     * physical interface.
     */
    ill = ill_lookup_on_ifindex(lifr->lifr_index, isv6, NULL, NULL,
        NULL, NULL, ipst);
    if (ill == NULL)
        return (ENXIO);
    ill_refrele(ill);

    ipif->ipif_orig_ifindex = lifr->lifr_index;
    /*
     * When this ipif gets failed back, don't
     * preserve the original id, as it is no
     * longer applicable.
     */
    ipif->ipif_orig_ipifid = 0;
    /*
     * For IPv4, change the original index of any
     * multicast addresses associated with the
     * ipif to the new value.
     */
    if (!isv6) {
        ilm_t *ilm;

        mutex_enter(&ipif->ipif_ill->ill_lock);
        for (ilm = ipif->ipif_ill->ill_ilm; ilm != NULL;
            ilm = ilm->ilm_next) {
            if (ilm->ilm_ipif == ipif) {
                ilm->ilm_orig_ifindex = lifr->lifr_index;
            }
        }
        mutex_exit(&ipif->ipif_ill->ill_lock);
    }
    return (0);
}

/* ARGSUSED */
int
ip_sioctl_get_oindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
    struct lifreq *lifr = (struct lifreq *)ifreq;

    /*
     * Get the original interface index i.e the one
     * before FAILOVER if it ever happened.
     */
    lifr->lifr_index = ipif->ipif_orig_ifindex;
    return (0);
}

/*
 * Parse an iftun_req structure coming down SIOC[GS]TUNPARAM ioctls,
 * refhold and return the associated ipif
 */
int
ip_extract_tunreq(queue_t *q, mblk_t *mp, ipif_t **ipifp, ipsq_func_t func)
{
    boolean_t exists;
    struct iftun_req *ta;
    ipif_t *ipif;
    ill_t *ill;
    boolean_t isv6;
    mblk_t *mp1;
    int error;
    conn_t *connp;
    ip_stack_t *ipst;

    /* Existence verified in ip_wput_nondata */
    mp1 = mp->b_cont->b_cont;
    ta = (struct iftun_req *)mp1->b_rptr;
    /*
     * Null terminate the string to protect against buffer
     * overrun. String was generated by user code and may not
     * be trusted.
     */
    ta->ifta_lifr_name[LIFNAMSIZ - 1] = '\0';

    connp = Q_TO_CONN(q);
    isv6 = connp->conn_af_isv6;
    ipst = connp->conn_netstack->netstack_ip;

    /* Disallows implicit create */
    ipif = ipif_lookup_on_name(ta->ifta_lifr_name,
        mi_strlen(ta->ifta_lifr_name), B_FALSE, &exists, isv6,
        connp->conn_zoneid, CONNP_TO_WQ(connp), mp, func, &error, ipst);
    if (ipif == NULL)
        return (error);

    if (ipif->ipif_id != 0) {
        /*
         * We really don't want to set/get tunnel parameters
         * on virtual tunnel interfaces. Only allow the
         * base tunnel to do these.
         */
        ipif_refrele(ipif);
        return (EINVAL);
    }

    /*
     * Send down to tunnel mod for ioctl processing.
     * Will finish ioctl in ip_rput_other().
     */
    ill = ipif->ipif_ill;
    if (ill->ill_net_type == IRE_LOOPBACK) {
        ipif_refrele(ipif);
        return (EOPNOTSUPP);
    }

    if (ill->ill_wq == NULL) {
        ipif_refrele(ipif);
        return (ENXIO);
    }
    /*
     * Mark the ioctl as coming from an IPv6 interface for
     * tun's convenience.
     */
    if (ill->ill_isv6)
        ta->ifta_flags |= 0x80000000;
    *ipifp = ipif;
    return (0);
}

/*
 * Parse an ifreq or lifreq struct coming down ioctls and refhold
 * and return the associated ipif.
 * Return value:
 *	Non zero: An error has occurred. ci may not be filled out.
 *	zero : ci is filled out with the ioctl cmd in ci.ci_name, and
 *	    a held ipif in ci.ci_ipif.
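 *
 * Illustrative caller pattern (a sketch, not part of the original
 * comment; 'flags' stands in for the caller's flags word and the
 * surrounding error handling is elided):
 *
 *	cmd_info_t ci;
 *	int err;
 *
 *	err = ip_extract_lifreq_cmn(q, mp, LIF_CMD, flags, &ci,
 *	    ip_process_ioctl);
 *	if (err != 0)
 *		return (err);	-- may be EINPROGRESS
 *	-- ... use ci.ci_ipif, ci.ci_sin / ci.ci_sin6, ci.ci_lifr ...
 *	ipif_refrele(ci.ci_ipif);	-- caller drops the hold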
 */
int
ip_extract_lifreq_cmn(queue_t *q, mblk_t *mp, int cmd_type, int flags,
    cmd_info_t *ci, ipsq_func_t func)
{
    sin_t *sin;
    sin6_t *sin6;
    char *name;
    struct ifreq *ifr;
    struct lifreq *lifr;
    ipif_t *ipif = NULL;
    ill_t *ill;
    conn_t *connp;
    boolean_t isv6;
    struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
    boolean_t exists;
    int err;
    mblk_t *mp1;
    zoneid_t zoneid;
    ip_stack_t *ipst;

    if (q->q_next != NULL) {
        ill = (ill_t *)q->q_ptr;
        isv6 = ill->ill_isv6;
        connp = NULL;
        zoneid = ALL_ZONES;
        ipst = ill->ill_ipst;
    } else {
        ill = NULL;
        connp = Q_TO_CONN(q);
        isv6 = connp->conn_af_isv6;
        zoneid = connp->conn_zoneid;
        if (zoneid == GLOBAL_ZONEID) {
            /* global zone can access ipifs in all zones */
            zoneid = ALL_ZONES;
        }
        ipst = connp->conn_netstack->netstack_ip;
    }

    /* Has been checked in ip_wput_nondata */
    mp1 = mp->b_cont->b_cont;

    if (cmd_type == IF_CMD) {
        /* This is an old style SIOC[GS]IF* command */
        ifr = (struct ifreq *)mp1->b_rptr;
        /*
         * Null terminate the string to protect against buffer
         * overrun. String was generated by user code and may not
         * be trusted.
         */
        ifr->ifr_name[IFNAMSIZ - 1] = '\0';
        sin = (sin_t *)&ifr->ifr_addr;
        name = ifr->ifr_name;
        ci->ci_sin = sin;
        ci->ci_sin6 = NULL;
        ci->ci_lifr = (struct lifreq *)ifr;
    } else {
        /* This is a new style SIOC[GS]LIF* command */
        ASSERT(cmd_type == LIF_CMD);
        lifr = (struct lifreq *)mp1->b_rptr;
        /*
         * Null terminate the string to protect against buffer
         * overrun. String was generated by user code and may not
         * be trusted.
         */
        lifr->lifr_name[LIFNAMSIZ - 1] = '\0';
        name = lifr->lifr_name;
        sin = (sin_t *)&lifr->lifr_addr;
        sin6 = (sin6_t *)&lifr->lifr_addr;
        if (iocp->ioc_cmd == SIOCSLIFGROUPNAME) {
            (void) strncpy(ci->ci_groupname, lifr->lifr_groupname,
                LIFNAMSIZ);
        }
        ci->ci_sin = sin;
        ci->ci_sin6 = sin6;
        ci->ci_lifr = lifr;
    }

    if (iocp->ioc_cmd == SIOCSLIFNAME) {
        /*
         * The ioctl will fail if it comes down
         * a conn stream.
         */
        if (ill == NULL) {
            /*
             * Not an ill queue, return ENXIO same as the
             * old error code.
             */
            return (ENXIO);
        }
        ipif = ill->ill_ipif;
        ipif_refhold(ipif);
    } else {
        ipif = ipif_lookup_on_name(name, mi_strlen(name), B_FALSE,
            &exists, isv6, zoneid,
            (connp == NULL) ? q : CONNP_TO_WQ(connp), mp, func, &err,
            ipst);
        if (ipif == NULL) {
            if (err == EINPROGRESS)
                return (err);
            if (iocp->ioc_cmd == SIOCLIFFAILOVER ||
                iocp->ioc_cmd == SIOCLIFFAILBACK) {
                /*
                 * Need to try both v4 and v6 since this
                 * ioctl can come down either v4 or v6
                 * socket. The lifreq.lifr_family passed
                 * down by this ioctl is AF_UNSPEC.
                 */
                ipif = ipif_lookup_on_name(name,
                    mi_strlen(name), B_FALSE, &exists, !isv6,
                    zoneid, (connp == NULL) ?
                    q : CONNP_TO_WQ(connp), mp, func, &err, ipst);
                if (err == EINPROGRESS)
                    return (err);
            }
            err = 0;    /* Ensure we don't use it below */
        }
    }

    /*
     * Old style [GS]IFCMD does not admit IPv6 ipif
     */
    if (ipif != NULL && ipif->ipif_isv6 && cmd_type == IF_CMD) {
        ipif_refrele(ipif);
        return (ENXIO);
    }

    if (ipif == NULL && ill != NULL && ill->ill_ipif != NULL &&
        name[0] == '\0') {
        /*
         * Handle a SIOC?IF* with a null name
         * during plumb (on the ill queue before the I_PLINK).
         */
        ipif = ill->ill_ipif;
        ipif_refhold(ipif);
    }

    if (ipif == NULL)
        return (ENXIO);

    /*
     * Allow only GET operations if this ipif has been created
     * temporarily due to a MOVE operation.
     */
    if (ipif->ipif_replace_zero && !(flags & IPI_REPL)) {
        ipif_refrele(ipif);
        return (EINVAL);
    }

    ci->ci_ipif = ipif;
    return (0);
}

/*
 * Return the total number of ipifs.
 */
static uint_t
ip_get_numifs(zoneid_t zoneid, ip_stack_t *ipst)
{
    uint_t numifs = 0;
    ill_t *ill;
    ill_walk_context_t ctx;
    ipif_t *ipif;

    rw_enter(&ipst->ips_ill_g_lock, RW_READER);
    ill = ILL_START_WALK_V4(&ctx, ipst);

    while (ill != NULL) {
        for (ipif = ill->ill_ipif; ipif != NULL;
            ipif = ipif->ipif_next) {
            if (ipif->ipif_zoneid == zoneid ||
                ipif->ipif_zoneid == ALL_ZONES)
                numifs++;
        }
        ill = ill_next(&ctx, ill);
    }
    rw_exit(&ipst->ips_ill_g_lock);
    return (numifs);
}

/*
 * Return the total number of ipifs that match the given family, flags,
 * and zone.
 */
static uint_t
ip_get_numlifs(int family, int lifn_flags, zoneid_t zoneid, ip_stack_t *ipst)
{
    uint_t numifs = 0;
    ill_t *ill;
    ipif_t *ipif;
    ill_walk_context_t ctx;

    ip1dbg(("ip_get_numlifs(%d %u %d)\n", family, lifn_flags, (int)zoneid));

    rw_enter(&ipst->ips_ill_g_lock, RW_READER);
    if (family == AF_INET)
        ill = ILL_START_WALK_V4(&ctx, ipst);
    else if (family == AF_INET6)
        ill = ILL_START_WALK_V6(&ctx, ipst);
    else
        ill = ILL_START_WALK_ALL(&ctx, ipst);

    for (; ill != NULL; ill = ill_next(&ctx, ill)) {
        for (ipif = ill->ill_ipif; ipif != NULL;
            ipif = ipif->ipif_next) {
            if ((ipif->ipif_flags & IPIF_NOXMIT) &&
                !(lifn_flags & LIFC_NOXMIT))
                continue;
            if ((ipif->ipif_flags & IPIF_TEMPORARY) &&
                !(lifn_flags & LIFC_TEMPORARY))
                continue;
            if (((ipif->ipif_flags &
                (IPIF_NOXMIT|IPIF_NOLOCAL|
                IPIF_DEPRECATED)) ||
                IS_LOOPBACK(ill) ||
                !(ipif->ipif_flags & IPIF_UP)) &&
                (lifn_flags & LIFC_EXTERNAL_SOURCE))
                continue;

            if (zoneid != ipif->ipif_zoneid &&
                ipif->ipif_zoneid != ALL_ZONES &&
                (zoneid != GLOBAL_ZONEID ||
                !(lifn_flags & LIFC_ALLZONES)))
                continue;

            numifs++;
        }
    }
    rw_exit(&ipst->ips_ill_g_lock);
    return (numifs);
}

uint_t
ip_get_lifsrcofnum(ill_t *ill)
{
    uint_t numifs = 0;
    ill_t *ill_head = ill;
    ip_stack_t *ipst = ill->ill_ipst;

    /*
     * ill_g_usesrc_lock protects ill_usesrc_grp_next; for example, some
     * other thread may be trying to relink the ILLs in this usesrc group
     * and adjusting the ill_usesrc_grp_next pointers
     */
    rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER);
    if ((ill->ill_usesrc_ifindex == 0) &&
        (ill->ill_usesrc_grp_next != NULL)) {
        for (; (ill != NULL) && (ill->ill_usesrc_grp_next != ill_head);
            ill = ill->ill_usesrc_grp_next)
            numifs++;
    }
    rw_exit(&ipst->ips_ill_g_usesrc_lock);

    return (numifs);
}

/* Null values are passed in for ipif, sin, and ifreq */
/* ARGSUSED */
int
ip_sioctl_get_ifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
    mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
{
    int *nump;
    conn_t *connp = Q_TO_CONN(q);

    ASSERT(q->q_next == NULL);    /* not a valid ioctl for ip as a module */

    /* Existence of b_cont->b_cont checked in ip_wput_nondata */
    nump = (int *)mp->b_cont->b_cont->b_rptr;

    *nump = ip_get_numifs(connp->conn_zoneid,
        connp->conn_netstack->netstack_ip);
    ip1dbg(("ip_sioctl_get_ifnum numifs %d", *nump));
    return (0);
}

/* Null values are passed in for ipif, sin, and ifreq */
/* ARGSUSED */
int
ip_sioctl_get_lifnum(ipif_t *dummy_ipif, sin_t *dummy_sin,
    queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
{
    struct lifnum *lifn;
    mblk_t *mp1;
    conn_t *connp = Q_TO_CONN(q);

    ASSERT(q->q_next == NULL);    /* not a valid ioctl for ip as a module */

    /* Existence checked in ip_wput_nondata */
    mp1 = mp->b_cont->b_cont;

    lifn = (struct lifnum *)mp1->b_rptr;
    switch (lifn->lifn_family) {
    case AF_UNSPEC:
    case AF_INET:
    case AF_INET6:
        break;
    default:
        return (EAFNOSUPPORT);
    }

    lifn->lifn_count = ip_get_numlifs(lifn->lifn_family, lifn->lifn_flags,
        connp->conn_zoneid, connp->conn_netstack->netstack_ip);
    ip1dbg(("ip_sioctl_get_lifnum numifs %d", lifn->lifn_count));
    return (0);
}

/* ARGSUSED */
int
ip_sioctl_get_ifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
    mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
{
    STRUCT_HANDLE(ifconf, ifc);
    mblk_t *mp1;
    struct iocblk *iocp;
    struct ifreq *ifr;
    ill_walk_context_t ctx;
    ill_t *ill;
    ipif_t *ipif;
    struct sockaddr_in *sin;
    int32_t ifclen;
    zoneid_t zoneid;
    ip_stack_t *ipst = CONNQ_TO_IPST(q);

    ASSERT(q->q_next == NULL);    /* not valid ioctls for ip as a module */

    ip1dbg(("ip_sioctl_get_ifconf"));
    /* Existence verified in ip_wput_nondata */
    mp1 = mp->b_cont->b_cont;
    iocp = (struct iocblk *)mp->b_rptr;
    zoneid = Q_TO_CONN(q)->conn_zoneid;

    /*
     * The original SIOCGIFCONF passed in a struct ifconf which specified
     * the user buffer address and length into which the list of struct
     * ifreqs was to be copied. Since AT&T Streams does not seem to
     * allow M_COPYOUT to be used in conjunction with I_STR IOCTLS,
     * the SIOCGIFCONF operation was redefined to simply provide
     * a large output buffer into which we are supposed to jam the ifreq
     * array. The same ioctl command code was used, despite the fact that
     * both the applications and the kernel code had to change, thus making
     * it impossible to support both interfaces.
     *
     * For reasons not good enough to try to explain, the following
     * algorithm is used for deciding what to do with one of these:
     * If the IOCTL comes in as an I_STR, it is assumed to be of the new
     * form with the output buffer coming down as the continuation message.
     * If it arrives as a TRANSPARENT IOCTL, it is assumed to be old style,
     * and we have to copy in the ifconf structure to find out how big the
     * output buffer is and where to copy out to. Sure no problem...
     *
     */
    STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, NULL);
    if ((mp1->b_wptr - mp1->b_rptr) == STRUCT_SIZE(ifc)) {
        int numifs = 0;
        size_t ifc_bufsize;

        /*
         * Must be (better be!) continuation of a TRANSPARENT
         * IOCTL. We just copied in the ifconf structure.
         */
        STRUCT_SET_HANDLE(ifc, iocp->ioc_flag,
            (struct ifconf *)mp1->b_rptr);

        /*
         * Allocate a buffer to hold requested information.
         *
         * If ifc_len is larger than what is needed, we only
         * allocate what we will use.
         *
         * If ifc_len is smaller than what is needed, return
         * EINVAL.
         *
         * XXX: the ill_t structure can have 2 counters, for
         * v4 and v6 (not just ill_ipif_up_count) to store the
         * number of interfaces for a device, so we don't need
         * to count them here...
         */
        numifs = ip_get_numifs(zoneid, ipst);

        ifclen = STRUCT_FGET(ifc, ifc_len);
        ifc_bufsize = numifs * sizeof (struct ifreq);
        if (ifc_bufsize > ifclen) {
            if (iocp->ioc_cmd == O_SIOCGIFCONF) {
                /* old behaviour */
                return (EINVAL);
            } else {
                ifc_bufsize = ifclen;
            }
        }

        mp1 = mi_copyout_alloc(q, mp,
            STRUCT_FGETP(ifc, ifc_buf), ifc_bufsize, B_FALSE);
        if (mp1 == NULL)
            return (ENOMEM);

        mp1->b_wptr = mp1->b_rptr + ifc_bufsize;
    }
    bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr);
    /*
     * the SIOCGIFCONF ioctl only knows about
     * IPv4 addresses, so don't try to tell
     * it about interfaces with IPv6-only
     * addresses.
     * (Last parm 'isv6' is B_FALSE)
     */

    ifr = (struct ifreq *)mp1->b_rptr;

    rw_enter(&ipst->ips_ill_g_lock, RW_READER);
    ill = ILL_START_WALK_V4(&ctx, ipst);
    for (; ill != NULL; ill = ill_next(&ctx, ill)) {
        for (ipif = ill->ill_ipif; ipif != NULL;
            ipif = ipif->ipif_next) {
            if (zoneid != ipif->ipif_zoneid &&
                ipif->ipif_zoneid != ALL_ZONES)
                continue;
            if ((uchar_t *)&ifr[1] > mp1->b_wptr) {
                if (iocp->ioc_cmd == O_SIOCGIFCONF) {
                    /* old behaviour */
                    rw_exit(&ipst->ips_ill_g_lock);
                    return (EINVAL);
                } else {
                    goto if_copydone;
                }
            }
            (void) ipif_get_name(ipif,
                ifr->ifr_name,
                sizeof (ifr->ifr_name));
            sin = (sin_t *)&ifr->ifr_addr;
            *sin = sin_null;
            sin->sin_family = AF_INET;
            sin->sin_addr.s_addr = ipif->ipif_lcl_addr;
            ifr++;
        }
    }
if_copydone:
    rw_exit(&ipst->ips_ill_g_lock);
    mp1->b_wptr = (uchar_t *)ifr;

    if (STRUCT_BUF(ifc) != NULL) {
        STRUCT_FSET(ifc, ifc_len,
            (int)((uchar_t *)ifr - mp1->b_rptr));
    }
    return (0);
}

/*
 * Get the interfaces using the address hosted on the interface passed in,
 * as a source address
 */
/* ARGSUSED */
int
ip_sioctl_get_lifsrcof(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
    mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
{
    mblk_t *mp1;
    ill_t *ill, *ill_head;
    ipif_t *ipif, *orig_ipif;
    int numlifs = 0;
    size_t lifs_bufsize, lifsmaxlen;
    struct lifreq *lifr;
    struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
    uint_t ifindex;
    zoneid_t zoneid;
    int err = 0;
    boolean_t isv6 = B_FALSE;
    struct sockaddr_in *sin;
    struct sockaddr_in6 *sin6;
    STRUCT_HANDLE(lifsrcof, lifs);
    ip_stack_t *ipst;

    ipst = CONNQ_TO_IPST(q);

    ASSERT(q->q_next == NULL);

    zoneid = Q_TO_CONN(q)->conn_zoneid;

    /* Existence verified in ip_wput_nondata */
    mp1 = mp->b_cont->b_cont;

    /*
     * Must be (better be!) continuation of a TRANSPARENT
     * IOCTL. We just copied in the lifsrcof structure.
     */
    STRUCT_SET_HANDLE(lifs, iocp->ioc_flag,
        (struct lifsrcof *)mp1->b_rptr);

    if (MBLKL(mp1) != STRUCT_SIZE(lifs))
        return (EINVAL);

    ifindex = STRUCT_FGET(lifs, lifs_ifindex);
    isv6 = (Q_TO_CONN(q))->conn_af_isv6;
    ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid, q, mp,
        ip_process_ioctl, &err, ipst);
    if (ipif == NULL) {
        ip1dbg(("ip_sioctl_get_lifsrcof: no ipif for ifindex %d\n",
            ifindex));
        return (err);
    }

    /* Allocate a buffer to hold requested information */
    numlifs = ip_get_lifsrcofnum(ipif->ipif_ill);
    lifs_bufsize = numlifs * sizeof (struct lifreq);
    lifsmaxlen = STRUCT_FGET(lifs, lifs_maxlen);
    /* The actual size needed is always returned in lifs_len */
    STRUCT_FSET(lifs, lifs_len, lifs_bufsize);

    /* If the amount we need is more than what is passed in, abort */
    if (lifs_bufsize > lifsmaxlen || lifs_bufsize == 0) {
        ipif_refrele(ipif);
        return (0);
    }

    mp1 = mi_copyout_alloc(q, mp,
        STRUCT_FGETP(lifs, lifs_buf), lifs_bufsize, B_FALSE);
    if (mp1 == NULL) {
        ipif_refrele(ipif);
        return (ENOMEM);
    }

    mp1->b_wptr = mp1->b_rptr + lifs_bufsize;
    bzero(mp1->b_rptr, lifs_bufsize);

    lifr = (struct lifreq *)mp1->b_rptr;

    ill = ill_head = ipif->ipif_ill;
    orig_ipif = ipif;

    /* ill_g_usesrc_lock protects ill_usesrc_grp_next */
    rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER);
    rw_enter(&ipst->ips_ill_g_lock, RW_READER);

    ill = ill->ill_usesrc_grp_next;    /* start from next ill */
    for (; (ill != NULL) && (ill != ill_head);
        ill = ill->ill_usesrc_grp_next) {

        if ((uchar_t *)&lifr[1] > mp1->b_wptr)
            break;

        ipif = ill->ill_ipif;
        (void) ipif_get_name(ipif,
            lifr->lifr_name, sizeof (lifr->lifr_name));
        if (ipif->ipif_isv6) {
            sin6 = (sin6_t *)&lifr->lifr_addr;
            *sin6 = sin6_null;
            sin6->sin6_family = AF_INET6;
            sin6->sin6_addr = ipif->ipif_v6lcl_addr;
            lifr->lifr_addrlen = ip_mask_to_plen_v6(
                &ipif->ipif_v6net_mask);
        } else {
            sin = (sin_t *)&lifr->lifr_addr;
            *sin = sin_null;
            sin->sin_family = AF_INET;
            sin->sin_addr.s_addr = ipif->ipif_lcl_addr;
            lifr->lifr_addrlen = ip_mask_to_plen(
                ipif->ipif_net_mask);
        }
        lifr++;
    }
    rw_exit(&ipst->ips_ill_g_usesrc_lock);
    rw_exit(&ipst->ips_ill_g_lock);
    ipif_refrele(orig_ipif);
    mp1->b_wptr = (uchar_t *)lifr;
    STRUCT_FSET(lifs, lifs_len, (int)((uchar_t *)lifr - mp1->b_rptr));

    return (0);
}

/* ARGSUSED */
int
ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
    mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
{
    mblk_t *mp1;
    int list;
    ill_t *ill;
    ipif_t *ipif;
    int flags;
    int numlifs = 0;
    size_t lifc_bufsize;
    struct lifreq *lifr;
    sa_family_t family;
    struct sockaddr_in *sin;
    struct sockaddr_in6 *sin6;
    ill_walk_context_t ctx;
    struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
    int32_t lifclen;
    zoneid_t zoneid;
    STRUCT_HANDLE(lifconf, lifc);
    ip_stack_t *ipst = CONNQ_TO_IPST(q);

    ip1dbg(("ip_sioctl_get_lifconf"));

    ASSERT(q->q_next == NULL);

    zoneid = Q_TO_CONN(q)->conn_zoneid;

    /* Existence verified in ip_wput_nondata */
    mp1 = mp->b_cont->b_cont;

    /*
     * An extended version of SIOCGIFCONF that takes an
     * additional address family and flags field.
     * AF_UNSPEC retrieves both IPv4 and IPv6.
     * Unless LIFC_NOXMIT is specified the IPIF_NOXMIT
     * interfaces are omitted.
     * Similarly, IPIF_TEMPORARY interfaces are omitted
     * unless LIFC_TEMPORARY is specified.
     * If LIFC_EXTERNAL_SOURCE is specified, IPIF_NOXMIT,
     * IPIF_NOLOCAL, PHYI_LOOPBACK, IPIF_DEPRECATED and
     * not IPIF_UP interfaces are omitted. LIFC_EXTERNAL_SOURCE
     * has priority over LIFC_NOXMIT.
     */
    STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, NULL);

    if ((mp1->b_wptr - mp1->b_rptr) != STRUCT_SIZE(lifc))
        return (EINVAL);

    /*
     * Must be (better be!) continuation of a TRANSPARENT
     * IOCTL. We just copied in the lifconf structure.
     */
    STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, (struct lifconf *)mp1->b_rptr);

    family = STRUCT_FGET(lifc, lifc_family);
    flags = STRUCT_FGET(lifc, lifc_flags);

    switch (family) {
    case AF_UNSPEC:
        /*
         * walk all ILL's.
         */
        list = MAX_G_HEADS;
        break;
    case AF_INET:
        /*
         * walk only IPV4 ILL's.
         */
        list = IP_V4_G_HEAD;
        break;
    case AF_INET6:
        /*
         * walk only IPV6 ILL's.
         */
        list = IP_V6_G_HEAD;
        break;
    default:
        return (EAFNOSUPPORT);
    }

    /*
     * Allocate a buffer to hold requested information.
     *
     * If lifc_len is larger than what is needed, we only
     * allocate what we will use.
     *
     * If lifc_len is smaller than what is needed, return
     * EINVAL.
     */
    numlifs = ip_get_numlifs(family, flags, zoneid, ipst);
    lifc_bufsize = numlifs * sizeof (struct lifreq);
    lifclen = STRUCT_FGET(lifc, lifc_len);
    if (lifc_bufsize > lifclen) {
        if (iocp->ioc_cmd == O_SIOCGLIFCONF)
            return (EINVAL);
        else
            lifc_bufsize = lifclen;
    }

    mp1 = mi_copyout_alloc(q, mp,
        STRUCT_FGETP(lifc, lifc_buf), lifc_bufsize, B_FALSE);
    if (mp1 == NULL)
        return (ENOMEM);

    mp1->b_wptr = mp1->b_rptr + lifc_bufsize;
    bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr);

    lifr = (struct lifreq *)mp1->b_rptr;

    rw_enter(&ipst->ips_ill_g_lock, RW_READER);
    ill = ill_first(list, list, &ctx, ipst);
    for (; ill != NULL; ill = ill_next(&ctx, ill)) {
        for (ipif = ill->ill_ipif; ipif != NULL;
            ipif = ipif->ipif_next) {
            if ((ipif->ipif_flags & IPIF_NOXMIT) &&
                !(flags & LIFC_NOXMIT))
                continue;

            if ((ipif->ipif_flags & IPIF_TEMPORARY) &&
                !(flags & LIFC_TEMPORARY))
                continue;

            if (((ipif->ipif_flags &
                (IPIF_NOXMIT|IPIF_NOLOCAL|
                IPIF_DEPRECATED)) ||
                IS_LOOPBACK(ill) ||
                !(ipif->ipif_flags & IPIF_UP)) &&
                (flags & LIFC_EXTERNAL_SOURCE))
                continue;

            if (zoneid != ipif->ipif_zoneid &&
                ipif->ipif_zoneid != ALL_ZONES &&
                (zoneid != GLOBAL_ZONEID ||
                !(flags & LIFC_ALLZONES)))
                continue;

            if ((uchar_t *)&lifr[1] > mp1->b_wptr) {
                if (iocp->ioc_cmd == O_SIOCGLIFCONF) {
                    rw_exit(&ipst->ips_ill_g_lock);
                    return (EINVAL);
                } else {
                    goto lif_copydone;
                }
            }

            (void) ipif_get_name(ipif, lifr->lifr_name,
                sizeof (lifr->lifr_name));
            if (ipif->ipif_isv6) {
                sin6 = (sin6_t *)&lifr->lifr_addr;
                *sin6 = sin6_null;
                sin6->sin6_family = AF_INET6;
                sin6->sin6_addr =
                    ipif->ipif_v6lcl_addr;
                lifr->lifr_addrlen =
                    ip_mask_to_plen_v6(
                    &ipif->ipif_v6net_mask);
            } else {
                sin = (sin_t *)&lifr->lifr_addr;
                *sin = sin_null;
                sin->sin_family = AF_INET;
                sin->sin_addr.s_addr =
                    ipif->ipif_lcl_addr;
                lifr->lifr_addrlen =
                    ip_mask_to_plen(
                    ipif->ipif_net_mask);
            }
            lifr++;
        }
    }
lif_copydone:
    rw_exit(&ipst->ips_ill_g_lock);

    mp1->b_wptr = (uchar_t *)lifr;
    if (STRUCT_BUF(lifc) != NULL) {
        STRUCT_FSET(lifc, lifc_len,
            (int)((uchar_t *)lifr - mp1->b_rptr));
    }
    return (0);
}

/* ARGSUSED */
int
ip_sioctl_set_ipmpfailback(ipif_t *dummy_ipif, sin_t *dummy_sin,
    queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
{
    ip_stack_t *ipst;

    if (q->q_next == NULL)
        ipst = CONNQ_TO_IPST(q);
    else
        ipst = ILLQ_TO_IPST(q);

    /* Existence of b_cont->b_cont checked in ip_wput_nondata */
    ipst->ips_ipmp_enable_failback = *(int *)mp->b_cont->b_cont->b_rptr;
    return (0);
}

static void
ip_sioctl_ip6addrpolicy(queue_t *q, mblk_t *mp)
{
    ip6_asp_t *table;
    size_t table_size;
    mblk_t *data_mp;
    struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
    ip_stack_t *ipst;

    if (q->q_next == NULL)
        ipst = CONNQ_TO_IPST(q);
    else
        ipst = ILLQ_TO_IPST(q);

    /* These two ioctls are I_STR only */
    if (iocp->ioc_count == TRANSPARENT) {
        miocnak(q, mp, 0, EINVAL);
        return;
    }

    data_mp = mp->b_cont;
    if (data_mp == NULL) {
        /* The user passed us a NULL argument */
        table = NULL;
        table_size = iocp->ioc_count;
    } else {
        /*
         * The user provided a table. The stream head
         * may have copied in the user data in chunks,
         * so make sure everything is pulled up
         * properly.
         */
        if (MBLKL(data_mp) < iocp->ioc_count) {
            mblk_t *new_data_mp;
            if ((new_data_mp = msgpullup(data_mp, -1)) ==
                NULL) {
                miocnak(q, mp, 0, ENOMEM);
                return;
            }
            freemsg(data_mp);
            data_mp = new_data_mp;
            mp->b_cont = data_mp;
        }
        table = (ip6_asp_t *)data_mp->b_rptr;
        table_size = iocp->ioc_count;
    }

    switch (iocp->ioc_cmd) {
    case SIOCGIP6ADDRPOLICY:
        iocp->ioc_rval = ip6_asp_get(table, table_size, ipst);
        if (iocp->ioc_rval == -1)
            iocp->ioc_error = EINVAL;
#if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4
        else if (table != NULL &&
            (iocp->ioc_flag & IOC_MODELS) == IOC_ILP32) {
            ip6_asp_t *src = table;
            ip6_asp32_t *dst = (void *)table;
            int count = table_size / sizeof (ip6_asp_t);
            int i;

            /*
             * We need to do an in-place shrink of the array
             * to match the alignment attributes of the
             * 32-bit ABI looking at it.
             */
            /* LINTED: logical expression always true: op "||" */
            ASSERT(sizeof (*src) > sizeof (*dst));
            for (i = 1; i < count; i++)
                bcopy(src + i, dst + i, sizeof (*dst));
        }
#endif
        break;

    case SIOCSIP6ADDRPOLICY:
        ASSERT(mp->b_prev == NULL);
        mp->b_prev = (void *)q;
#if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4
        /*
         * We pass in the datamodel here so that the ip6_asp_replace()
         * routine can handle converting from 32-bit to native formats
         * where necessary.
         *
         * A better way to handle this might be to convert the inbound
         * data structure here, and hang it off a new 'mp'; thus the
         * ip6_asp_replace() logic would always be dealing with native
         * format data structures.
9317 * 9318 * (An even simpler way to handle these ioctls is to just 9319 * add a 32-bit trailing 'pad' field to the ip6_asp_t structure 9320 * and just recompile everything that depends on it.) 9321 */ 9322 #endif 9323 ip6_asp_replace(mp, table, table_size, B_FALSE, ipst, 9324 iocp->ioc_flag & IOC_MODELS); 9325 return; 9326 } 9327 9328 DB_TYPE(mp) = (iocp->ioc_error == 0) ? M_IOCACK : M_IOCNAK; 9329 qreply(q, mp); 9330 } 9331 9332 static void 9333 ip_sioctl_dstinfo(queue_t *q, mblk_t *mp) 9334 { 9335 mblk_t *data_mp; 9336 struct dstinforeq *dir; 9337 uint8_t *end, *cur; 9338 in6_addr_t *daddr, *saddr; 9339 ipaddr_t v4daddr; 9340 ire_t *ire; 9341 char *slabel, *dlabel; 9342 boolean_t isipv4; 9343 int match_ire; 9344 ill_t *dst_ill; 9345 ipif_t *src_ipif, *ire_ipif; 9346 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 9347 zoneid_t zoneid; 9348 ip_stack_t *ipst = CONNQ_TO_IPST(q); 9349 9350 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 9351 zoneid = Q_TO_CONN(q)->conn_zoneid; 9352 9353 /* 9354 * This ioctl is I_STR only, and must have a 9355 * data mblk following the M_IOCTL mblk. 9356 */ 9357 data_mp = mp->b_cont; 9358 if (iocp->ioc_count == TRANSPARENT || data_mp == NULL) { 9359 miocnak(q, mp, 0, EINVAL); 9360 return; 9361 } 9362 9363 if (MBLKL(data_mp) < iocp->ioc_count) { 9364 mblk_t *new_data_mp; 9365 9366 if ((new_data_mp = msgpullup(data_mp, -1)) == NULL) { 9367 miocnak(q, mp, 0, ENOMEM); 9368 return; 9369 } 9370 freemsg(data_mp); 9371 data_mp = new_data_mp; 9372 mp->b_cont = data_mp; 9373 } 9374 match_ire = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_PARENT; 9375 9376 for (cur = data_mp->b_rptr, end = data_mp->b_wptr; 9377 end - cur >= sizeof (struct dstinforeq); 9378 cur += sizeof (struct dstinforeq)) { 9379 dir = (struct dstinforeq *)cur; 9380 daddr = &dir->dir_daddr; 9381 saddr = &dir->dir_saddr; 9382 9383 /* 9384 * ip_addr_scope_v6() and ip6_asp_lookup() handle 9385 * v4 mapped addresses; ire_ftable_lookup[_v6]() 9386 * and ipif_select_source[_v6]() do not. 9387 */ 9388 dir->dir_dscope = ip_addr_scope_v6(daddr); 9389 dlabel = ip6_asp_lookup(daddr, &dir->dir_precedence, ipst); 9390 9391 isipv4 = IN6_IS_ADDR_V4MAPPED(daddr); 9392 if (isipv4) { 9393 IN6_V4MAPPED_TO_IPADDR(daddr, v4daddr); 9394 ire = ire_ftable_lookup(v4daddr, NULL, NULL, 9395 0, NULL, NULL, zoneid, 0, NULL, match_ire, ipst); 9396 } else { 9397 ire = ire_ftable_lookup_v6(daddr, NULL, NULL, 9398 0, NULL, NULL, zoneid, 0, NULL, match_ire, ipst); 9399 } 9400 if (ire == NULL) { 9401 dir->dir_dreachable = 0; 9402 9403 /* move on to next dst addr */ 9404 continue; 9405 } 9406 dir->dir_dreachable = 1; 9407 9408 ire_ipif = ire->ire_ipif; 9409 if (ire_ipif == NULL) 9410 goto next_dst; 9411 9412 /* 9413 * We expect to get back an interface ire or a 9414 * gateway ire cache entry. For both types, the 9415 * output interface is ire_ipif->ipif_ill. 9416 */ 9417 dst_ill = ire_ipif->ipif_ill; 9418 dir->dir_dmactype = dst_ill->ill_mactype; 9419 9420 if (isipv4) { 9421 src_ipif = ipif_select_source(dst_ill, v4daddr, zoneid); 9422 } else { 9423 src_ipif = ipif_select_source_v6(dst_ill, 9424 daddr, RESTRICT_TO_NONE, IPV6_PREFER_SRC_DEFAULT, 9425 zoneid); 9426 } 9427 if (src_ipif == NULL) 9428 goto next_dst; 9429 9430 *saddr = src_ipif->ipif_v6lcl_addr; 9431 dir->dir_sscope = ip_addr_scope_v6(saddr); 9432 slabel = ip6_asp_lookup(saddr, NULL, ipst); 9433 dir->dir_labelmatch = ip6_asp_labelcmp(dlabel, slabel); 9434 dir->dir_sdeprecated = 9435 (src_ipif->ipif_flags & IPIF_DEPRECATED) ? 
1 : 0; 9436 ipif_refrele(src_ipif); 9437 next_dst: 9438 ire_refrele(ire); 9439 } 9440 miocack(q, mp, iocp->ioc_count, 0); 9441 } 9442 9443 9444 /* 9445 * Check if this is an address assigned to this machine. 9446 * Skips interfaces that are down by using ire checks. 9447 * Translates mapped addresses to v4 addresses and then 9448 * treats them as such, returning true if the v4 address 9449 * associated with this mapped address is configured. 9450 * Note: Applications will have to be careful what they do 9451 * with the response; use of mapped addresses limits 9452 * what can be done with the socket, especially with 9453 * respect to socket options and ioctls - neither IPv4 9454 * options nor IPv6 sticky options/ancillary data options 9455 * may be used. 9456 */ 9457 /* ARGSUSED */ 9458 int 9459 ip_sioctl_tmyaddr(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9460 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 9461 { 9462 struct sioc_addrreq *sia; 9463 sin_t *sin; 9464 ire_t *ire; 9465 mblk_t *mp1; 9466 zoneid_t zoneid; 9467 ip_stack_t *ipst; 9468 9469 ip1dbg(("ip_sioctl_tmyaddr")); 9470 9471 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 9472 zoneid = Q_TO_CONN(q)->conn_zoneid; 9473 ipst = CONNQ_TO_IPST(q); 9474 9475 /* Existence verified in ip_wput_nondata */ 9476 mp1 = mp->b_cont->b_cont; 9477 sia = (struct sioc_addrreq *)mp1->b_rptr; 9478 sin = (sin_t *)&sia->sa_addr; 9479 switch (sin->sin_family) { 9480 case AF_INET6: { 9481 sin6_t *sin6 = (sin6_t *)sin; 9482 9483 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 9484 ipaddr_t v4_addr; 9485 9486 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, 9487 v4_addr); 9488 ire = ire_ctable_lookup(v4_addr, 0, 9489 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 9490 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst); 9491 } else { 9492 in6_addr_t v6addr; 9493 9494 v6addr = sin6->sin6_addr; 9495 ire = ire_ctable_lookup_v6(&v6addr, 0, 9496 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 9497 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst); 9498 } 9499 break; 9500 } 9501 case AF_INET: { 9502 ipaddr_t v4addr; 9503 9504 v4addr = sin->sin_addr.s_addr; 9505 ire = ire_ctable_lookup(v4addr, 0, 9506 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 9507 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst); 9508 break; 9509 } 9510 default: 9511 return (EAFNOSUPPORT); 9512 } 9513 if (ire != NULL) { 9514 sia->sa_res = 1; 9515 ire_refrele(ire); 9516 } else { 9517 sia->sa_res = 0; 9518 } 9519 return (0); 9520 } 9521 9522 /* 9523 * Check if this is an address assigned on-link, i.e., a neighbor, 9524 * and make sure it's reachable from the current zone. 9525 * Returns true for my addresses as well. 9526 * Translates mapped addresses to v4 addresses and then 9527 * treats them as such, returning true if the v4 address 9528 * associated with this mapped address is configured. 9529 * Note: Applications will have to be careful what they do 9530 * with the response; use of mapped addresses limits 9531 * what can be done with the socket, especially with 9532 * respect to socket options and ioctls - neither IPv4 9533 * options nor IPv6 sticky options/ancillary data options 9534 * may be used.
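 *
 * A hedged user-level sketch of the SIOCTONLINK ioctl this implements
 * ("s" is an assumed datagram socket; struct sioc_addrreq is the same
 * structure unpacked from the mblk chain below, and "dst_sin" is a
 * hypothetical caller-filled sockaddr):
 *
 *	struct sioc_addrreq sar;
 *
 *	bzero(&sar, sizeof (sar));
 *	bcopy(&dst_sin, &sar.sa_addr, sizeof (dst_sin));
 *	if (ioctl(s, SIOCTONLINK, (caddr_t)&sar) == 0 && sar.sa_res != 0)
 *		... destination is on-link ...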
9535 */ 9536 /* ARGSUSED */ 9537 int 9538 ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9539 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 9540 { 9541 struct sioc_addrreq *sia; 9542 sin_t *sin; 9543 mblk_t *mp1; 9544 ire_t *ire = NULL; 9545 zoneid_t zoneid; 9546 ip_stack_t *ipst; 9547 9548 ip1dbg(("ip_sioctl_tonlink")); 9549 9550 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 9551 zoneid = Q_TO_CONN(q)->conn_zoneid; 9552 ipst = CONNQ_TO_IPST(q); 9553 9554 /* Existence verified in ip_wput_nondata */ 9555 mp1 = mp->b_cont->b_cont; 9556 sia = (struct sioc_addrreq *)mp1->b_rptr; 9557 sin = (sin_t *)&sia->sa_addr; 9558 9559 /* 9560 * Match addresses with a zero gateway field to avoid 9561 * routes going through a router. 9562 * Exclude broadcast and multicast addresses. 9563 */ 9564 switch (sin->sin_family) { 9565 case AF_INET6: { 9566 sin6_t *sin6 = (sin6_t *)sin; 9567 9568 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 9569 ipaddr_t v4_addr; 9570 9571 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, 9572 v4_addr); 9573 if (!CLASSD(v4_addr)) { 9574 ire = ire_route_lookup(v4_addr, 0, 0, 0, 9575 NULL, NULL, zoneid, NULL, 9576 MATCH_IRE_GW, ipst); 9577 } 9578 } else { 9579 in6_addr_t v6addr; 9580 in6_addr_t v6gw; 9581 9582 v6addr = sin6->sin6_addr; 9583 v6gw = ipv6_all_zeros; 9584 if (!IN6_IS_ADDR_MULTICAST(&v6addr)) { 9585 ire = ire_route_lookup_v6(&v6addr, 0, 9586 &v6gw, 0, NULL, NULL, zoneid, 9587 NULL, MATCH_IRE_GW, ipst); 9588 } 9589 } 9590 break; 9591 } 9592 case AF_INET: { 9593 ipaddr_t v4addr; 9594 9595 v4addr = sin->sin_addr.s_addr; 9596 if (!CLASSD(v4addr)) { 9597 ire = ire_route_lookup(v4addr, 0, 0, 0, 9598 NULL, NULL, zoneid, NULL, 9599 MATCH_IRE_GW, ipst); 9600 } 9601 break; 9602 } 9603 default: 9604 return (EAFNOSUPPORT); 9605 } 9606 sia->sa_res = 0; 9607 if (ire != NULL) { 9608 if (ire->ire_type & (IRE_INTERFACE|IRE_CACHE| 9609 IRE_LOCAL|IRE_LOOPBACK)) { 9610 sia->sa_res = 1; 9611 } 9612 ire_refrele(ire); 9613 } 9614 return (0); 9615 } 9616 9617 /* 9618 * TBD: implement when the kernel maintains a list of site prefixes. 9619 */ 9620 /* ARGSUSED */ 9621 int 9622 ip_sioctl_tmysite(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9623 ip_ioctl_cmd_t *ipip, void *ifreq) 9624 { 9625 return (ENXIO); 9626 } 9627 9628 /* ARGSUSED */ 9629 int 9630 ip_sioctl_tunparam(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9631 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 9632 { 9633 ill_t *ill; 9634 mblk_t *mp1; 9635 conn_t *connp; 9636 boolean_t success; 9637 9638 ip1dbg(("ip_sioctl_tunparam(%s:%u %p)\n", 9639 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9640 /* ioctl comes down on a conn */ 9641 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 9642 connp = Q_TO_CONN(q); 9643 9644 mp->b_datap->db_type = M_IOCTL; 9645 9646 /* 9647 * Send down a copy. (copymsg does not copy b_next/b_prev). 9648 * The original mp contains contaminated b_next values due to 'mi', 9649 * which is needed to do the mi_copy_done. Unfortunately, if we 9650 * send down the original mblk itself and we are popped due to 9651 * an unplumb before the response comes back from the tunnel, 9652 * the streamhead (which does a freemsg) will see this contaminated 9653 * message and the assertion in freemsg about non-null b_next/b_prev 9654 * will panic a DEBUG kernel.
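 *
 * In short: the copy (mp1) sent down to the tunnel is disposable,
 * while the original mp, with its mi-owned b_next/b_prev linkage,
 * is parked on the pending-mp list below until the reply (or an
 * unplumb) completes the ioctl.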
9655 */ 9656 mp1 = copymsg(mp); 9657 if (mp1 == NULL) 9658 return (ENOMEM); 9659 9660 ill = ipif->ipif_ill; 9661 mutex_enter(&connp->conn_lock); 9662 mutex_enter(&ill->ill_lock); 9663 if (ipip->ipi_cmd == SIOCSTUNPARAM || ipip->ipi_cmd == OSIOCSTUNPARAM) { 9664 success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), 9665 mp, 0); 9666 } else { 9667 success = ill_pending_mp_add(ill, connp, mp); 9668 } 9669 mutex_exit(&ill->ill_lock); 9670 mutex_exit(&connp->conn_lock); 9671 9672 if (success) { 9673 ip1dbg(("sending down tunparam request ")); 9674 putnext(ill->ill_wq, mp1); 9675 return (EINPROGRESS); 9676 } else { 9677 /* The conn has started closing */ 9678 freemsg(mp1); 9679 return (EINTR); 9680 } 9681 } 9682 9683 static int 9684 ip_sioctl_arp_common(ill_t *ill, queue_t *q, mblk_t *mp, sin_t *sin, 9685 boolean_t x_arp_ioctl, boolean_t if_arp_ioctl) 9686 { 9687 mblk_t *mp1; 9688 mblk_t *mp2; 9689 mblk_t *pending_mp; 9690 ipaddr_t ipaddr; 9691 area_t *area; 9692 struct iocblk *iocp; 9693 conn_t *connp; 9694 struct arpreq *ar; 9695 struct xarpreq *xar; 9696 boolean_t success; 9697 int flags, alength; 9698 char *lladdr; 9699 ip_stack_t *ipst; 9700 9701 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 9702 connp = Q_TO_CONN(q); 9703 ipst = connp->conn_netstack->netstack_ip; 9704 9705 iocp = (struct iocblk *)mp->b_rptr; 9706 /* 9707 * ill has already been set depending on whether 9708 * bsd style or interface style ioctl. 9709 */ 9710 ASSERT(ill != NULL); 9711 9712 /* 9713 * Is this one of the new SIOC*XARP ioctls? 9714 */ 9715 if (x_arp_ioctl) { 9716 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->XARPREQ_MBLK */ 9717 xar = (struct xarpreq *)mp->b_cont->b_cont->b_rptr; 9718 ar = NULL; 9719 9720 flags = xar->xarp_flags; 9721 lladdr = LLADDR(&xar->xarp_ha); 9722 /* 9723 * Validate against user's link layer address length 9724 * input and name and addr length limits. 9725 */ 9726 alength = ill->ill_phys_addr_length; 9727 if (iocp->ioc_cmd == SIOCSXARP) { 9728 if (alength != xar->xarp_ha.sdl_alen || 9729 (alength + xar->xarp_ha.sdl_nlen > 9730 sizeof (xar->xarp_ha.sdl_data))) 9731 return (EINVAL); 9732 } 9733 } else { 9734 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->ARPREQ_MBLK */ 9735 ar = (struct arpreq *)mp->b_cont->b_cont->b_rptr; 9736 xar = NULL; 9737 9738 flags = ar->arp_flags; 9739 lladdr = ar->arp_ha.sa_data; 9740 /* 9741 * Theoretically, the sa_family could tell us what link 9742 * layer type this operation is trying to deal with. By 9743 * common usage AF_UNSPEC means ethernet. We'll assume 9744 * any attempt to use the SIOC?ARP ioctls is for ethernet, 9745 * for now. Our new SIOC*XARP ioctls can be used more 9746 * generally. 9747 * 9748 * If the underlying media happens to have a non 6 byte 9749 * address, arp module will fail set/get, but the del 9750 * operation will succeed. 9751 */ 9752 alength = 6; 9753 if ((iocp->ioc_cmd != SIOCDARP) && 9754 (alength != ill->ill_phys_addr_length)) { 9755 return (EINVAL); 9756 } 9757 } 9758 9759 /* 9760 * We are going to pass up to ARP a packet chain that looks 9761 * like: 9762 * 9763 * M_IOCTL-->ARP_op_MBLK-->ORIG_M_IOCTL-->MI_COPY_MBLK-->[X]ARPREQ_MBLK 9764 * 9765 * Get a copy of the original IOCTL mblk to head the chain, 9766 * to be sent up (in mp1). Also get another copy to store 9767 * in the ill_pending_mp list, for matching the response 9768 * when it comes back from ARP. 
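 *
 * The match is keyed on the iocblk's ioc_id: when the M_IOCACK comes
 * back, ip_sioctl_iocack() calls ill_pending_mp_get() with that id
 * to recover this pending mp and the originating conn.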
9769 */ 9770 mp1 = copyb(mp); 9771 pending_mp = copymsg(mp); 9772 if (mp1 == NULL || pending_mp == NULL) { 9773 if (mp1 != NULL) 9774 freeb(mp1); 9775 if (pending_mp != NULL) 9776 inet_freemsg(pending_mp); 9777 return (ENOMEM); 9778 } 9779 9780 ipaddr = sin->sin_addr.s_addr; 9781 9782 mp2 = ill_arp_alloc(ill, (uchar_t *)&ip_area_template, 9783 (caddr_t)&ipaddr); 9784 if (mp2 == NULL) { 9785 freeb(mp1); 9786 inet_freemsg(pending_mp); 9787 return (ENOMEM); 9788 } 9789 /* Put together the chain. */ 9790 mp1->b_cont = mp2; 9791 mp1->b_datap->db_type = M_IOCTL; 9792 mp2->b_cont = mp; 9793 mp2->b_datap->db_type = M_DATA; 9794 9795 iocp = (struct iocblk *)mp1->b_rptr; 9796 9797 /* 9798 * An M_IOCDATA's payload (struct copyresp) is mostly the same as an 9799 * M_IOCTL's payload (struct iocblk), but 'struct copyresp' has a 9800 * cp_private field (or cp_rval on 32-bit systems) in place of the 9801 * ioc_count field; set ioc_count to be correct. 9802 */ 9803 iocp->ioc_count = MBLKL(mp1->b_cont); 9804 9805 /* 9806 * Set the proper command in the ARP message. 9807 * Convert the SIOC{G|S|D}ARP calls into our 9808 * AR_ENTRY_xxx calls. 9809 */ 9810 area = (area_t *)mp2->b_rptr; 9811 switch (iocp->ioc_cmd) { 9812 case SIOCDARP: 9813 case SIOCDXARP: 9814 /* 9815 * We defer deleting the corresponding IRE until 9816 * we return from arp. 9817 */ 9818 area->area_cmd = AR_ENTRY_DELETE; 9819 area->area_proto_mask_offset = 0; 9820 break; 9821 case SIOCGARP: 9822 case SIOCGXARP: 9823 area->area_cmd = AR_ENTRY_SQUERY; 9824 area->area_proto_mask_offset = 0; 9825 break; 9826 case SIOCSARP: 9827 case SIOCSXARP: { 9828 /* 9829 * Delete the corresponding ire to make sure IP will 9830 * pick up any change from arp. 9831 */ 9832 if (!if_arp_ioctl) { 9833 (void) ip_ire_clookup_and_delete(ipaddr, NULL, ipst); 9834 break; 9835 } else { 9836 ipif_t *ipif = ipif_get_next_ipif(NULL, ill); 9837 if (ipif != NULL) { 9838 (void) ip_ire_clookup_and_delete(ipaddr, ipif, 9839 ipst); 9840 ipif_refrele(ipif); 9841 } 9842 break; 9843 } 9844 } 9845 } 9846 iocp->ioc_cmd = area->area_cmd; 9847 9848 /* 9849 * Before sending 'mp' to ARP, we have to clear the b_next 9850 * and b_prev. Otherwise, if STREAMS encounters such a message 9851 * in freemsg() (because ARP can close at any time), it can cause 9852 * a panic. But mi code needs the b_next and b_prev values of 9853 * mp->b_cont to complete the ioctl. So we store them here 9854 * in pending_mp->b_cont, and restore them in ip_sioctl_iocack() 9855 * when the response comes down from ARP. 9856 */ 9857 pending_mp->b_cont->b_next = mp->b_cont->b_next; 9858 pending_mp->b_cont->b_prev = mp->b_cont->b_prev; 9859 mp->b_cont->b_next = NULL; 9860 mp->b_cont->b_prev = NULL; 9861 9862 mutex_enter(&connp->conn_lock); 9863 mutex_enter(&ill->ill_lock); 9864 /* conn has not yet started closing, hence this can't fail */ 9865 success = ill_pending_mp_add(ill, connp, pending_mp); 9866 ASSERT(success); 9867 mutex_exit(&ill->ill_lock); 9868 mutex_exit(&connp->conn_lock); 9869 9870 /* 9871 * Fill in the rest of the ARP operation fields. 9872 */ 9873 area->area_hw_addr_length = alength; 9874 bcopy(lladdr, 9875 (char *)area + area->area_hw_addr_offset, 9876 area->area_hw_addr_length); 9877 /* Translate the flags. */ 9878 if (flags & ATF_PERM) 9879 area->area_flags |= ACE_F_PERMANENT; 9880 if (flags & ATF_PUBL) 9881 area->area_flags |= ACE_F_PUBLISH; 9882 if (flags & ATF_AUTHORITY) 9883 area->area_flags |= ACE_F_AUTHORITY; 9884 9885 /* 9886 * Up to ARP it goes.
The response will come 9887 * back in ip_wput as an M_IOCACK message, and 9888 * will be handed to ip_sioctl_iocack for 9889 * completion. 9890 */ 9891 putnext(ill->ill_rq, mp1); 9892 return (EINPROGRESS); 9893 } 9894 9895 /* ARGSUSED */ 9896 int 9897 ip_sioctl_xarp(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9898 ip_ioctl_cmd_t *ipip, void *ifreq) 9899 { 9900 struct xarpreq *xar; 9901 boolean_t isv6; 9902 mblk_t *mp1; 9903 int err; 9904 conn_t *connp; 9905 int ifnamelen; 9906 ire_t *ire = NULL; 9907 ill_t *ill = NULL; 9908 struct sockaddr_in *sin; 9909 boolean_t if_arp_ioctl = B_FALSE; 9910 ip_stack_t *ipst; 9911 9912 /* ioctl comes down on a conn */ 9913 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 9914 connp = Q_TO_CONN(q); 9915 isv6 = connp->conn_af_isv6; 9916 ipst = connp->conn_netstack->netstack_ip; 9917 9918 /* Existence verified in ip_wput_nondata */ 9919 mp1 = mp->b_cont->b_cont; 9920 9921 ASSERT(MBLKL(mp1) >= sizeof (*xar)); 9922 xar = (struct xarpreq *)mp1->b_rptr; 9923 sin = (sin_t *)&xar->xarp_pa; 9924 9925 if (isv6 || (xar->xarp_ha.sdl_family != AF_LINK) || 9926 (xar->xarp_pa.ss_family != AF_INET)) 9927 return (ENXIO); 9928 9929 ifnamelen = xar->xarp_ha.sdl_nlen; 9930 if (ifnamelen != 0) { 9931 char *cptr, cval; 9932 9933 if (ifnamelen >= LIFNAMSIZ) 9934 return (EINVAL); 9935 9936 /* 9937 * Instead of bcopying a bunch of bytes, 9938 * null-terminate the string in-situ. 9939 */ 9940 cptr = xar->xarp_ha.sdl_data + ifnamelen; 9941 cval = *cptr; 9942 *cptr = '\0'; 9943 ill = ill_lookup_on_name(xar->xarp_ha.sdl_data, 9944 B_FALSE, isv6, CONNP_TO_WQ(connp), mp, ip_process_ioctl, 9945 &err, NULL, ipst); 9946 *cptr = cval; 9947 if (ill == NULL) 9948 return (err); 9949 if (ill->ill_net_type != IRE_IF_RESOLVER) { 9950 ill_refrele(ill); 9951 return (ENXIO); 9952 } 9953 9954 if_arp_ioctl = B_TRUE; 9955 } else { 9956 /* 9957 * PSARC 2003/088 states that if sdl_nlen == 0, it behaves 9958 * as an extended BSD ioctl. The kernel uses the IP address 9959 * to figure out the network interface. 9960 */ 9961 ire = ire_cache_lookup(sin->sin_addr.s_addr, ALL_ZONES, NULL, 9962 ipst); 9963 if ((ire == NULL) || (ire->ire_type == IRE_LOOPBACK) || 9964 ((ill = ire_to_ill(ire)) == NULL) || 9965 (ill->ill_net_type != IRE_IF_RESOLVER)) { 9966 if (ire != NULL) 9967 ire_refrele(ire); 9968 ire = ire_ftable_lookup(sin->sin_addr.s_addr, 9969 0, 0, IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0, 9970 NULL, MATCH_IRE_TYPE, ipst); 9971 if ((ire == NULL) || 9972 ((ill = ire_to_ill(ire)) == NULL)) { 9973 if (ire != NULL) 9974 ire_refrele(ire); 9975 return (ENXIO); 9976 } 9977 } 9978 ASSERT(ire != NULL && ill != NULL); 9979 } 9980 9981 err = ip_sioctl_arp_common(ill, q, mp, sin, B_TRUE, if_arp_ioctl); 9982 if (if_arp_ioctl) 9983 ill_refrele(ill); 9984 if (ire != NULL) 9985 ire_refrele(ire); 9986 9987 return (err); 9988 } 9989 9990 /* 9991 * ARP IOCTLs. 9992 * How does IP get in the business of fronting ARP configuration/queries? 9993 * Well, it's like this: the Berkeley ARP IOCTLs (SIOCGARP, SIOCDARP, SIOCSARP) 9994 * are by tradition passed in through a datagram socket. That lands in IP. 9995 * As it happens, this is just as well since the interface is quite crude in 9996 * that it passes in no information about protocol or hardware types, or 9997 * interface association. After making the protocol assumption, IP is in 9998 * the position to look up the name of the ILL, which ARP will need, and 9999 * format a request that can be handled by ARP.
The request is passed up 10000 * stream to ARP, and the original IOCTL is completed by IP when ARP passes 10001 * back a response. ARP supports its own set of more general IOCTLs, in 10002 * case anyone is interested. 10003 */ 10004 /* ARGSUSED */ 10005 int 10006 ip_sioctl_arp(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 10007 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 10008 { 10009 struct arpreq *ar; 10010 struct sockaddr_in *sin; 10011 ire_t *ire; 10012 boolean_t isv6; 10013 mblk_t *mp1; 10014 int err; 10015 conn_t *connp; 10016 ill_t *ill; 10017 ip_stack_t *ipst; 10018 10019 /* ioctl comes down on a conn */ 10020 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 10021 connp = Q_TO_CONN(q); 10022 ipst = CONNQ_TO_IPST(q); 10023 isv6 = connp->conn_af_isv6; 10024 if (isv6) 10025 return (ENXIO); 10026 10027 /* Existence verified in ip_wput_nondata */ 10028 mp1 = mp->b_cont->b_cont; 10029 10030 ar = (struct arpreq *)mp1->b_rptr; 10031 sin = (sin_t *)&ar->arp_pa; 10032 10033 /* 10034 * We need to let ARP know on which interface the IP 10035 * address has an ARP mapping. In the IPMP case, a 10036 * simple forwarding table lookup will return the 10037 * IRE_IF_RESOLVER for the first interface in the group, 10038 * which might not be the interface on which the 10039 * requested IP address was resolved due to the ill 10040 * selection algorithm (see ip_newroute_get_dst_ill()). 10041 * So we do a cache table lookup first: if the IRE cache 10042 * entry for the IP address is still there, it will 10043 * contain the ill pointer for the right interface, so 10044 * we use that. If the cache entry has been flushed, we 10045 * fall back to the forwarding table lookup. This should 10046 * be rare enough since IRE cache entries have a longer 10047 * life expectancy than ARP cache entries. 10048 */ 10049 ire = ire_cache_lookup(sin->sin_addr.s_addr, ALL_ZONES, NULL, ipst); 10050 if ((ire == NULL) || (ire->ire_type == IRE_LOOPBACK) || 10051 ((ill = ire_to_ill(ire)) == NULL)) { 10052 if (ire != NULL) 10053 ire_refrele(ire); 10054 ire = ire_ftable_lookup(sin->sin_addr.s_addr, 10055 0, 0, IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0, 10056 NULL, MATCH_IRE_TYPE, ipst); 10057 if ((ire == NULL) || ((ill = ire_to_ill(ire)) == NULL)) { 10058 if (ire != NULL) 10059 ire_refrele(ire); 10060 return (ENXIO); 10061 } 10062 } 10063 ASSERT(ire != NULL && ill != NULL); 10064 10065 err = ip_sioctl_arp_common(ill, q, mp, sin, B_FALSE, B_FALSE); 10066 ire_refrele(ire); 10067 return (err); 10068 } 10069 10070 /* 10071 * Do I_PLINK/I_LINK or I_PUNLINK/I_UNLINK with consistency checks and also 10072 * atomically set/clear the muxids. Also complete the ioctl by acking or 10073 * naking it. Note that the code is structured such that the link type, 10074 * whether it's persistent or not, is treated equally. ifconfig(1M) and 10075 * its clones use the persistent link, while pppd(1M) and perhaps many 10076 * other daemons may use a non-persistent link. When combined with some 10077 * ill_t states, linking and unlinking lower streams may be used as 10078 * indicators of dynamic re-plumbing events [see PSARC/1999/348].
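 *
 * As a hedged illustration (simplified; see ifconfig.c and arp.c for
 * the real sequence), the persistent plumb driven by ifconfig is
 * roughly:
 *
 *	ip_mux = open("/dev/udp", O_RDWR);	(arp pushed; IP mux)
 *	arp_mux = open("/dev/arp", O_RDWR);	(ARP mux)
 *	ioctl(ip_mux, I_PLINK, ip_fd);		(IP stream first ...)
 *	ioctl(arp_mux, I_PLINK, arp_fd);	(... ARP stream last)
 *
 * with the matching I_PUNLINKs issued in the reverse order at unplumb
 * time, per the LIFO scheme described in the code below.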
10079 */ 10080 /* ARGSUSED */ 10081 void 10082 ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 10083 { 10084 mblk_t *mp1; 10085 mblk_t *mp2; 10086 struct linkblk *li; 10087 queue_t *ipwq; 10088 char *name; 10089 struct qinit *qinfo; 10090 struct ipmx_s *ipmxp; 10091 ill_t *ill = NULL; 10092 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 10093 int err = 0; 10094 boolean_t entered_ipsq = B_FALSE; 10095 boolean_t islink; 10096 queue_t *dwq = NULL; 10097 ip_stack_t *ipst; 10098 10099 if (CONN_Q(q)) 10100 ipst = CONNQ_TO_IPST(q); 10101 else 10102 ipst = ILLQ_TO_IPST(q); 10103 10104 ASSERT(iocp->ioc_cmd == I_PLINK || iocp->ioc_cmd == I_PUNLINK || 10105 iocp->ioc_cmd == I_LINK || iocp->ioc_cmd == I_UNLINK); 10106 10107 islink = (iocp->ioc_cmd == I_PLINK || iocp->ioc_cmd == I_LINK) ? 10108 B_TRUE : B_FALSE; 10109 10110 mp1 = mp->b_cont; /* This is the linkblk info */ 10111 li = (struct linkblk *)mp1->b_rptr; 10112 10113 /* 10114 * ARP has added this special mblk, and the utility is asking us 10115 * to perform consistency checks, and also atomically set the 10116 * muxid. Ifconfig is an example. It achieves this by using 10117 * /dev/arp as the mux to plink the arp stream, and pushes arp on 10118 * to /dev/udp[6] stream for use as the mux when plinking the IP 10119 * stream. SIOCSLIFMUXID is not required. See ifconfig.c, arp.c 10120 * and other comments in this routine for more details. 10121 */ 10122 mp2 = mp1->b_cont; /* This is added by ARP */ 10123 10124 /* 10125 * If I_{P}LINK/I_{P}UNLINK is issued by a utility other than 10126 * ifconfig which didn't push ARP on top of the dummy mux, we won't 10127 * get the special mblk above. For backward compatibility, we just 10128 * return success. The utility will use SIOCSLIFMUXID to store 10129 * the muxids. This is not atomic, and can leave the streams 10130 * unplumbable if the utility is interrupted before it does the 10131 * SIOCSLIFMUXID. 10132 */ 10133 if (mp2 == NULL) { 10134 /* 10135 * At this point we don't know whether or not this is the 10136 * IP module stream or the ARP device stream. We need to 10137 * walk the lower stream in order to find this out, since 10138 * the capability negotiation is done only on the IP module 10139 * stream. An IP module instance is identified by the module 10140 * name IP, non-null q_next, and its wput not being ip_lwput. 10141 * STREAMS ensures that the lower stream (l_qbot) will not 10142 * vanish until this ioctl completes. So we can safely walk 10143 * the stream or refer to the q_ptr. 10144 */ 10145 ipwq = li->l_qbot; 10146 while (ipwq != NULL) { 10147 qinfo = ipwq->q_qinfo; 10148 name = qinfo->qi_minfo->mi_idname; 10149 if (name != NULL && name[0] != '\0' && 10150 (strcmp(name, ip_mod_info.mi_idname) == 0) && 10151 ((void *)(qinfo->qi_putp) != (void *)ip_lwput) && 10152 (ipwq->q_next != NULL)) { 10153 break; 10154 } 10155 ipwq = ipwq->q_next; 10156 } 10157 /* 10158 * This looks like an IP module stream, so trigger 10159 * the capability reset or re-negotiation if necessary. 10160 */ 10161 if (ipwq != NULL) { 10162 ill = ipwq->q_ptr; 10163 ASSERT(ill != NULL); 10164 10165 if (ipsq == NULL) { 10166 ipsq = ipsq_try_enter(NULL, ill, q, mp, 10167 ip_sioctl_plink, NEW_OP, B_TRUE); 10168 if (ipsq == NULL) 10169 return; 10170 entered_ipsq = B_TRUE; 10171 } 10172 ASSERT(IAM_WRITER_ILL(ill)); 10173 /* 10174 * Store the upper read queue of the module 10175 * immediately below IP, and count the total 10176 * number of lower modules. Do this only 10177 * for I_PLINK or I_LINK events.
10178 */ 10179 ill->ill_lmod_rq = NULL; 10180 ill->ill_lmod_cnt = 0; 10181 if (islink && (dwq = ipwq->q_next) != NULL) { 10182 ill->ill_lmod_rq = RD(dwq); 10183 10184 while (dwq != NULL) { 10185 ill->ill_lmod_cnt++; 10186 dwq = dwq->q_next; 10187 } 10188 } 10189 /* 10190 * There's no point in resetting or re-negotiating if 10191 * we are not bound to the driver, so only do this if 10192 * the DLPI state is idle (up); we assume such state 10193 * since ill_ipif_up_count gets incremented in 10194 * ipif_up_done(), which is after we are bound to the 10195 * driver. Note that in the case of logical 10196 * interfaces, IP won't rebind to the driver unless 10197 * the ill_ipif_up_count is 0, meaning that all other 10198 * IP interfaces (including the main ipif) are in the 10199 * down state. Because of this, we use such counter 10200 * as an indicator, instead of relying on the IPIF_UP 10201 * flag, which is per ipif instance. 10202 */ 10203 if (ill->ill_ipif_up_count > 0) { 10204 if (islink) 10205 ill_capability_probe(ill); 10206 else 10207 ill_capability_reset(ill); 10208 } 10209 } 10210 goto done; 10211 } 10212 10213 /* 10214 * This is an I_{P}LINK sent down by ifconfig on 10215 * /dev/arp. ARP has appended this last (3rd) mblk, 10216 * giving more info. STREAMS ensures that the lower 10217 * stream (l_qbot) will not vanish until this ioctl 10218 * completes. So we can safely walk the stream or refer 10219 * to the q_ptr. 10220 */ 10221 ipmxp = (struct ipmx_s *)mp2->b_rptr; 10222 if (ipmxp->ipmx_arpdev_stream) { 10223 /* 10224 * The operation is occurring on the arp-device 10225 * stream. 10226 */ 10227 ill = ill_lookup_on_name(ipmxp->ipmx_name, B_FALSE, B_FALSE, 10228 q, mp, ip_sioctl_plink, &err, NULL, ipst); 10229 if (ill == NULL) { 10230 if (err == EINPROGRESS) { 10231 return; 10232 } else { 10233 err = EINVAL; 10234 goto done; 10235 } 10236 } 10237 10238 if (ipsq == NULL) { 10239 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink, 10240 NEW_OP, B_TRUE); 10241 if (ipsq == NULL) { 10242 ill_refrele(ill); 10243 return; 10244 } 10245 entered_ipsq = B_TRUE; 10246 } 10247 ASSERT(IAM_WRITER_ILL(ill)); 10248 ill_refrele(ill); 10249 /* 10250 * To ensure consistency between IP and ARP, 10251 * the following LIFO scheme is used in 10252 * plink/punlink (IP first, ARP last). 10253 * This is because the muxids are stored 10254 * in the IP stream on the ill. 10255 * 10256 * I_{P}LINK: ifconfig plinks the IP stream before 10257 * plinking the ARP stream. On an arp-dev 10258 * stream, IP checks that it is not yet 10259 * plinked, and it also checks that the 10260 * corresponding IP stream is already plinked. 10261 * 10262 * I_{P}UNLINK: ifconfig punlinks the ARP stream 10263 * before punlinking the IP stream. IP does 10264 * not allow punlink of the IP stream unless 10265 * the arp stream has been punlinked. 10266 * 10267 */ 10268 if ((islink && 10269 (ill->ill_arp_muxid != 0 || ill->ill_ip_muxid == 0)) || 10270 (!islink && 10271 ill->ill_arp_muxid != li->l_index)) { 10272 err = EINVAL; 10273 goto done; 10274 } 10275 if (islink) { 10276 ill->ill_arp_muxid = li->l_index; 10277 } else { 10278 ill->ill_arp_muxid = 0; 10279 } 10280 } else { 10281 /* 10282 * This must be the IP module stream with or 10283 * without arp. Walk the stream and locate the 10284 * IP module. An IP module instance is 10285 * identified by the module name IP, non-null 10286 * q_next, and its wput not being ip_lwput.
10287 */ 10288 ipwq = li->l_qbot; 10289 while (ipwq != NULL) { 10290 qinfo = ipwq->q_qinfo; 10291 name = qinfo->qi_minfo->mi_idname; 10292 if (name != NULL && name[0] != '\0' && 10293 (strcmp(name, ip_mod_info.mi_idname) == 0) && 10294 ((void *)(qinfo->qi_putp) != (void *)ip_lwput) && 10295 (ipwq->q_next != NULL)) { 10296 break; 10297 } 10298 ipwq = ipwq->q_next; 10299 } 10300 if (ipwq != NULL) { 10301 ill = ipwq->q_ptr; 10302 ASSERT(ill != NULL); 10303 10304 if (ipsq == NULL) { 10305 ipsq = ipsq_try_enter(NULL, ill, q, mp, 10306 ip_sioctl_plink, NEW_OP, B_TRUE); 10307 if (ipsq == NULL) 10308 return; 10309 entered_ipsq = B_TRUE; 10310 } 10311 ASSERT(IAM_WRITER_ILL(ill)); 10312 /* 10313 * Return an error if ill_ip_muxid is 10314 * non-zero and the command is I_{P}LINK. 10315 * If the command is I_{P}UNLINK, return 10316 * an error if the arp-devstr is not 10317 * yet punlinked. 10318 */ 10319 if ((islink && ill->ill_ip_muxid != 0) || 10320 (!islink && ill->ill_arp_muxid != 0)) { 10321 err = EINVAL; 10322 goto done; 10323 } 10324 ill->ill_lmod_rq = NULL; 10325 ill->ill_lmod_cnt = 0; 10326 if (islink) { 10327 /* 10328 * Store the upper read queue of the module 10329 * immediately below IP, and count the total 10330 * number of lower modules. 10331 */ 10332 if ((dwq = ipwq->q_next) != NULL) { 10333 ill->ill_lmod_rq = RD(dwq); 10334 10335 while (dwq != NULL) { 10336 ill->ill_lmod_cnt++; 10337 dwq = dwq->q_next; 10338 } 10339 } 10340 ill->ill_ip_muxid = li->l_index; 10341 } else { 10342 ill->ill_ip_muxid = 0; 10343 } 10344 10345 /* 10346 * See comments above about resetting/re- 10347 * negotiating driver sub-capabilities. 10348 */ 10349 if (ill->ill_ipif_up_count > 0) { 10350 if (islink) 10351 ill_capability_probe(ill); 10352 else 10353 ill_capability_reset(ill); 10354 } 10355 } 10356 } 10357 done: 10358 iocp->ioc_count = 0; 10359 iocp->ioc_error = err; 10360 if (err == 0) 10361 mp->b_datap->db_type = M_IOCACK; 10362 else 10363 mp->b_datap->db_type = M_IOCNAK; 10364 qreply(q, mp); 10365 10366 /* Conn was refheld in ip_sioctl_copyin_setup */ 10367 if (CONN_Q(q)) 10368 CONN_OPER_PENDING_DONE(Q_TO_CONN(q)); 10369 if (entered_ipsq) 10370 ipsq_exit(ipsq, B_TRUE, B_TRUE); 10371 } 10372 10373 /* 10374 * Search for the ioctl command in the ioctl tables and return a pointer 10375 * to the ioctl command information. The ioctl command tables are 10376 * static and fully populated at compile time. 10377 */ 10378 ip_ioctl_cmd_t * 10379 ip_sioctl_lookup(int ioc_cmd) 10380 { 10381 int index; 10382 ip_ioctl_cmd_t *ipip; 10383 ip_ioctl_cmd_t *ipip_end; 10384 10385 if (ioc_cmd == IPI_DONTCARE) 10386 return (NULL); 10387 10388 /* 10389 * Do a two-step search. First search the indexed table 10390 * based on the least significant byte of the ioctl cmd. 10391 * If we don't find a match, then search the misc table 10392 * serially.
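 *
 * For example, a hypothetical ndx-table entry (the real tables are
 * built elsewhere, in ip.c) pairs the command with its copyin size,
 * flags, command type and handler, and is found here by indexing on
 * (ioc_cmd & 0xFF):
 *
 *	{ SIOCLIFADDIF, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
 *	    LIF_CMD, ip_sioctl_addif, NULL },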
10393 */ 10394 index = ioc_cmd & 0xFF; 10395 if (index < ip_ndx_ioctl_count) { 10396 ipip = &ip_ndx_ioctl_table[index]; 10397 if (ipip->ipi_cmd == ioc_cmd) { 10398 /* Found a match in the ndx table */ 10399 return (ipip); 10400 } 10401 } 10402 10403 /* Search the misc table */ 10404 ipip_end = &ip_misc_ioctl_table[ip_misc_ioctl_count]; 10405 for (ipip = ip_misc_ioctl_table; ipip < ipip_end; ipip++) { 10406 if (ipip->ipi_cmd == ioc_cmd) 10407 /* Found a match in the misc table */ 10408 return (ipip); 10409 } 10410 10411 return (NULL); 10412 } 10413 10414 /* 10415 * Wrapper function for resuming deferred ioctl processing. 10416 * Used for SIOCGDSTINFO, SIOCGIP6ADDRPOLICY, SIOCGMSFILTER, 10417 * SIOCSMSFILTER, SIOCGIPMSFILTER, and SIOCSIPMSFILTER currently. 10418 */ 10419 /* ARGSUSED */ 10420 void 10421 ip_sioctl_copyin_resume(ipsq_t *dummy_ipsq, queue_t *q, mblk_t *mp, 10422 void *dummy_arg) 10423 { 10424 ip_sioctl_copyin_setup(q, mp); 10425 } 10426 10427 /* 10428 * ip_sioctl_copyin_setup is called by ip_wput with any M_IOCTL message 10429 * that arrives. Most of the IOCTLs are "socket" IOCTLs which we handle 10430 * in either I_STR or TRANSPARENT form, using the mi_copy facility. 10431 * We establish here the size of the block to be copied in. mi_copyin 10432 * arranges for this to happen, and processing continues in ip_wput with 10433 * an M_IOCDATA message. 10434 */ 10435 void 10436 ip_sioctl_copyin_setup(queue_t *q, mblk_t *mp) 10437 { 10438 int copyin_size; 10439 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 10440 ip_ioctl_cmd_t *ipip; 10441 cred_t *cr; 10442 ip_stack_t *ipst; 10443 10444 if (CONN_Q(q)) 10445 ipst = CONNQ_TO_IPST(q); 10446 else 10447 ipst = ILLQ_TO_IPST(q); 10448 10449 ipip = ip_sioctl_lookup(iocp->ioc_cmd); 10450 if (ipip == NULL) { 10451 /* 10452 * The ioctl is not one we understand or own. 10453 * Pass it along to be processed down stream, 10454 * if this is a module instance of IP, else nak 10455 * the ioctl. 10456 */ 10457 if (q->q_next == NULL) { 10458 goto nak; 10459 } else { 10460 putnext(q, mp); 10461 return; 10462 } 10463 } 10464 10465 /* 10466 * If this is deferred, then we will do all the checks when we 10467 * come back. 10468 */ 10469 if ((iocp->ioc_cmd == SIOCGDSTINFO || 10470 iocp->ioc_cmd == SIOCGIP6ADDRPOLICY) && !ip6_asp_can_lookup(ipst)) { 10471 ip6_asp_pending_op(q, mp, ip_sioctl_copyin_resume); 10472 return; 10473 } 10474 10475 /* 10476 * Only allow a very small subset of IP ioctls on this stream if 10477 * IP is a module and not a driver. Allowing ioctls to be processed 10478 * in this case may cause assert failures or data corruption. 10479 * Typically G[L]IFFLAGS, SLIFNAME/IF_UNITSEL are the only few 10480 * ioctls allowed on an IP module stream, after which this stream 10481 * normally becomes a multiplexor (at which time the stream head 10482 * will fail all ioctls). 10483 */ 10484 if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) { 10485 if (ipip->ipi_flags & IPI_PASS_DOWN) { 10486 /* 10487 * Pass common Streams ioctls which the IP 10488 * module does not own or consume along to 10489 * be processed down stream. 10490 */ 10491 putnext(q, mp); 10492 return; 10493 } else { 10494 goto nak; 10495 } 10496 } 10497 10498 /* Make sure we have ioctl data to process. */ 10499 if (mp->b_cont == NULL && !(ipip->ipi_flags & IPI_NULL_BCONT)) 10500 goto nak; 10501 10502 /* 10503 * Prefer dblk credential over ioctl credential; some synthesized 10504 * ioctls have kcred set because there's no way to crhold() 10505 * a credential in some contexts.
(ioc_cr is not crfree() by 10506 * the framework; the caller of ioctl needs to hold the reference 10507 * for the duration of the call). 10508 */ 10509 cr = DB_CREDDEF(mp, iocp->ioc_cr); 10510 10511 /* Make sure normal users don't send down privileged ioctls */ 10512 if ((ipip->ipi_flags & IPI_PRIV) && 10513 (cr != NULL) && secpolicy_ip_config(cr, B_TRUE) != 0) { 10514 /* We checked the privilege earlier but log it here */ 10515 miocnak(q, mp, 0, secpolicy_ip_config(cr, B_FALSE)); 10516 return; 10517 } 10518 10519 /* 10520 * The ioctl command tables can only encode fixed length 10521 * ioctl data. If the length is variable, the table will 10522 * encode the length as zero. Such special cases are handled 10523 * below in the switch. 10524 */ 10525 if (ipip->ipi_copyin_size != 0) { 10526 mi_copyin(q, mp, NULL, ipip->ipi_copyin_size); 10527 return; 10528 } 10529 10530 switch (iocp->ioc_cmd) { 10531 case O_SIOCGIFCONF: 10532 case SIOCGIFCONF: 10533 /* 10534 * This IOCTL is hilarious. See comments in 10535 * ip_sioctl_get_ifconf for the story. 10536 */ 10537 if (iocp->ioc_count == TRANSPARENT) 10538 copyin_size = SIZEOF_STRUCT(ifconf, 10539 iocp->ioc_flag); 10540 else 10541 copyin_size = iocp->ioc_count; 10542 mi_copyin(q, mp, NULL, copyin_size); 10543 return; 10544 10545 case O_SIOCGLIFCONF: 10546 case SIOCGLIFCONF: 10547 copyin_size = SIZEOF_STRUCT(lifconf, iocp->ioc_flag); 10548 mi_copyin(q, mp, NULL, copyin_size); 10549 return; 10550 10551 case SIOCGLIFSRCOF: 10552 copyin_size = SIZEOF_STRUCT(lifsrcof, iocp->ioc_flag); 10553 mi_copyin(q, mp, NULL, copyin_size); 10554 return; 10555 case SIOCGIP6ADDRPOLICY: 10556 ip_sioctl_ip6addrpolicy(q, mp); 10557 ip6_asp_table_refrele(ipst); 10558 return; 10559 10560 case SIOCSIP6ADDRPOLICY: 10561 ip_sioctl_ip6addrpolicy(q, mp); 10562 return; 10563 10564 case SIOCGDSTINFO: 10565 ip_sioctl_dstinfo(q, mp); 10566 ip6_asp_table_refrele(ipst); 10567 return; 10568 10569 case I_PLINK: 10570 case I_PUNLINK: 10571 case I_LINK: 10572 case I_UNLINK: 10573 /* 10574 * We treat non-persistent link similarly as the persistent 10575 * link case, in terms of plumbing/unplumbing, as well as 10576 * dynamic re-plumbing events indicator. See comments 10577 * in ip_sioctl_plink() for more. 10578 * 10579 * Request can be enqueued in the 'ipsq' while waiting 10580 * to become exclusive. So bump up the conn ref. 10581 */ 10582 if (CONN_Q(q)) 10583 CONN_INC_REF(Q_TO_CONN(q)); 10584 ip_sioctl_plink(NULL, q, mp, NULL); 10585 return; 10586 10587 case ND_GET: 10588 case ND_SET: 10589 /* 10590 * Use of the nd table requires holding the reader lock. 10591 * Modifying the nd table thru nd_load/nd_unload requires 10592 * the writer lock. 10593 */ 10594 rw_enter(&ipst->ips_ip_g_nd_lock, RW_READER); 10595 if (nd_getset(q, ipst->ips_ip_g_nd, mp)) { 10596 rw_exit(&ipst->ips_ip_g_nd_lock); 10597 10598 if (iocp->ioc_error) 10599 iocp->ioc_count = 0; 10600 mp->b_datap->db_type = M_IOCACK; 10601 qreply(q, mp); 10602 return; 10603 } 10604 rw_exit(&ipst->ips_ip_g_nd_lock); 10605 /* 10606 * We don't understand this subioctl of ND_GET / ND_SET. 
10607 * Maybe intended for some driver / module below us 10608 */ 10609 if (q->q_next) { 10610 putnext(q, mp); 10611 } else { 10612 iocp->ioc_error = ENOENT; 10613 mp->b_datap->db_type = M_IOCNAK; 10614 iocp->ioc_count = 0; 10615 qreply(q, mp); 10616 } 10617 return; 10618 10619 case IP_IOCTL: 10620 ip_wput_ioctl(q, mp); 10621 return; 10622 default: 10623 cmn_err(CE_PANIC, "should not happen "); 10624 } 10625 nak: 10626 if (mp->b_cont != NULL) { 10627 freemsg(mp->b_cont); 10628 mp->b_cont = NULL; 10629 } 10630 iocp->ioc_error = EINVAL; 10631 mp->b_datap->db_type = M_IOCNAK; 10632 iocp->ioc_count = 0; 10633 qreply(q, mp); 10634 } 10635 10636 /* ip_wput hands off ARP IOCTL responses to us */ 10637 void 10638 ip_sioctl_iocack(queue_t *q, mblk_t *mp) 10639 { 10640 struct arpreq *ar; 10641 struct xarpreq *xar; 10642 area_t *area; 10643 mblk_t *area_mp; 10644 struct iocblk *iocp; 10645 mblk_t *orig_ioc_mp, *tmp; 10646 struct iocblk *orig_iocp; 10647 ill_t *ill; 10648 conn_t *connp = NULL; 10649 uint_t ioc_id; 10650 mblk_t *pending_mp; 10651 int x_arp_ioctl = B_FALSE, ifx_arp_ioctl = B_FALSE; 10652 int *flagsp; 10653 char *storage = NULL; 10654 sin_t *sin; 10655 ipaddr_t addr; 10656 int err; 10657 ip_stack_t *ipst; 10658 10659 ill = q->q_ptr; 10660 ASSERT(ill != NULL); 10661 ipst = ill->ill_ipst; 10662 10663 /* 10664 * We should get back from ARP a packet chain that looks like: 10665 * M_IOCACK-->ARP_op_MBLK-->ORIG_M_IOCTL-->MI_COPY_MBLK-->[X]ARPREQ_MBLK 10666 */ 10667 if (!(area_mp = mp->b_cont) || 10668 (area_mp->b_wptr - area_mp->b_rptr) < sizeof (ip_sock_ar_t) || 10669 !(orig_ioc_mp = area_mp->b_cont) || 10670 !orig_ioc_mp->b_cont || !orig_ioc_mp->b_cont->b_cont) { 10671 freemsg(mp); 10672 return; 10673 } 10674 10675 orig_iocp = (struct iocblk *)orig_ioc_mp->b_rptr; 10676 10677 tmp = (orig_ioc_mp->b_cont)->b_cont; 10678 if ((orig_iocp->ioc_cmd == SIOCGXARP) || 10679 (orig_iocp->ioc_cmd == SIOCSXARP) || 10680 (orig_iocp->ioc_cmd == SIOCDXARP)) { 10681 x_arp_ioctl = B_TRUE; 10682 xar = (struct xarpreq *)tmp->b_rptr; 10683 sin = (sin_t *)&xar->xarp_pa; 10684 flagsp = &xar->xarp_flags; 10685 storage = xar->xarp_ha.sdl_data; 10686 if (xar->xarp_ha.sdl_nlen != 0) 10687 ifx_arp_ioctl = B_TRUE; 10688 } else { 10689 ar = (struct arpreq *)tmp->b_rptr; 10690 sin = (sin_t *)&ar->arp_pa; 10691 flagsp = &ar->arp_flags; 10692 storage = ar->arp_ha.sa_data; 10693 } 10694 10695 iocp = (struct iocblk *)mp->b_rptr; 10696 10697 /* 10698 * Pick out the originating queue based on the ioc_id. 10699 */ 10700 ioc_id = iocp->ioc_id; 10701 pending_mp = ill_pending_mp_get(ill, &connp, ioc_id); 10702 if (pending_mp == NULL) { 10703 ASSERT(connp == NULL); 10704 inet_freemsg(mp); 10705 return; 10706 } 10707 ASSERT(connp != NULL); 10708 q = CONNP_TO_WQ(connp); 10709 10710 /* Uncouple the internally generated IOCTL from the original one */ 10711 area = (area_t *)area_mp->b_rptr; 10712 area_mp->b_cont = NULL; 10713 10714 /* 10715 * Restore the b_next and b_prev used by mi code. This is needed 10716 * to complete the ioctl using mi* functions. We stored them in 10717 * the pending mp prior to sending the request to ARP. 10718 */ 10719 orig_ioc_mp->b_cont->b_next = pending_mp->b_cont->b_next; 10720 orig_ioc_mp->b_cont->b_prev = pending_mp->b_cont->b_prev; 10721 inet_freemsg(pending_mp); 10722 10723 /* 10724 * We're done if there was an error or if this is not an SIOCG{X}ARP. 10725 * Catch the case where there is an IRE_CACHE but no entry in the 10726 * arp table.
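 * In that case ARP nak'ed the AR_ENTRY_SQUERY, but the IRE cache
 * lookup below may still recover the mac address from the
 * nce_res_mp of a reachable cache entry.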
10727 */ 10728 addr = sin->sin_addr.s_addr; 10729 if (iocp->ioc_error && iocp->ioc_cmd == AR_ENTRY_SQUERY) { 10730 ire_t *ire; 10731 dl_unitdata_req_t *dlup; 10732 mblk_t *llmp; 10733 int addr_len; 10734 ill_t *ipsqill = NULL; 10735 10736 if (ifx_arp_ioctl) { 10737 /* 10738 * There's no need to look up the ill, since 10739 * we've already done that when we started 10740 * processing the ioctl and sent the message 10741 * to ARP on that ill. So use the ill that 10742 * is stored in q->q_ptr. 10743 */ 10744 ipsqill = ill; 10745 ire = ire_ctable_lookup(addr, 0, IRE_CACHE, 10746 ipsqill->ill_ipif, ALL_ZONES, 10747 NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst); 10748 } else { 10749 ire = ire_ctable_lookup(addr, 0, IRE_CACHE, 10750 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 10751 if (ire != NULL) 10752 ipsqill = ire_to_ill(ire); 10753 } 10754 10755 if ((x_arp_ioctl) && (ipsqill != NULL)) 10756 storage += ill_xarp_info(&xar->xarp_ha, ipsqill); 10757 10758 if (ire != NULL) { 10759 /* 10760 * Since the ire obtained from cachetable is used for 10761 * mac addr copying below, treat an incomplete ire as 10762 * if we never found it. 10763 */ 10764 if (ire->ire_nce != NULL && 10765 ire->ire_nce->nce_state != ND_REACHABLE) { 10766 ire_refrele(ire); 10767 ire = NULL; 10768 ipsqill = NULL; 10769 goto errack; 10770 } 10771 *flagsp = ATF_INUSE; 10772 llmp = (ire->ire_nce != NULL ? 10773 ire->ire_nce->nce_res_mp : NULL); 10774 if (llmp != NULL && ipsqill != NULL) { 10775 uchar_t *macaddr; 10776 10777 addr_len = ipsqill->ill_phys_addr_length; 10778 if (x_arp_ioctl && ((addr_len + 10779 ipsqill->ill_name_length) > 10780 sizeof (xar->xarp_ha.sdl_data))) { 10781 ire_refrele(ire); 10782 freemsg(mp); 10783 ip_ioctl_finish(q, orig_ioc_mp, 10784 EINVAL, NO_COPYOUT, NULL); 10785 return; 10786 } 10787 *flagsp |= ATF_COM; 10788 dlup = (dl_unitdata_req_t *)llmp->b_rptr; 10789 if (ipsqill->ill_sap_length < 0) 10790 macaddr = llmp->b_rptr + 10791 dlup->dl_dest_addr_offset; 10792 else 10793 macaddr = llmp->b_rptr + 10794 dlup->dl_dest_addr_offset + 10795 ipsqill->ill_sap_length; 10796 /* 10797 * For SIOCGARP, MAC address length 10798 * validation has already been done 10799 * before the ioctl was issued to ARP to 10800 * allow it to progress only on 6 byte 10801 * addressable (ethernet like) media. Thus 10802 * the mac address copying cannot overwrite 10803 * the sa_data area below. 10804 */ 10805 bcopy(macaddr, storage, addr_len); 10806 } 10807 /* Ditch the internal IOCTL. */ 10808 freemsg(mp); 10809 ire_refrele(ire); 10810 ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, NULL); 10811 return; 10812 } 10813 } 10814 10815 /* 10816 * Delete the corresponding IRE_CACHE, if any. 10817 * Reset the error if there was one (in case there was no entry 10818 * in arp). 10819 */ 10820 if (iocp->ioc_cmd == AR_ENTRY_DELETE) { 10821 ipif_t *ipintf = NULL; 10822 10823 if (ifx_arp_ioctl) { 10824 /* 10825 * There's no need to look up the ill, since 10826 * we've already done that when we started 10827 * processing the ioctl and sent the message 10828 * to ARP on that ill. So use the ill that 10829 * is stored in q->q_ptr. 10830 */ 10831 ipintf = ill->ill_ipif; 10832 } 10833 if (ip_ire_clookup_and_delete(addr, ipintf, ipst)) { 10834 /* 10835 * The address in "addr" may be an entry for a 10836 * router. If that's true, then any off-net 10837 * IRE_CACHE entries that go through the router 10838 * with address "addr" must be clobbered. Use 10839 * ire_walk to achieve this goal.
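 * (ire_delete_cache_gw() below matches cache entries by their
 * gateway address, so a single walk removes every off-net entry
 * routed via "addr".)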
10840 */ 10841 if (ifx_arp_ioctl) 10842 ire_walk_ill_v4(MATCH_IRE_ILL, 0, 10843 ire_delete_cache_gw, (char *)&addr, ill); 10844 else 10845 ire_walk_v4(ire_delete_cache_gw, (char *)&addr, 10846 ALL_ZONES, ipst); 10847 iocp->ioc_error = 0; 10848 } 10849 } 10850 errack: 10851 if (iocp->ioc_error || iocp->ioc_cmd != AR_ENTRY_SQUERY) { 10852 err = iocp->ioc_error; 10853 freemsg(mp); 10854 ip_ioctl_finish(q, orig_ioc_mp, err, NO_COPYOUT, NULL); 10855 return; 10856 } 10857 10858 /* 10859 * Completion of an SIOCG{X}ARP. Translate the information from 10860 * the area_t into the struct {x}arpreq. 10861 */ 10862 if (x_arp_ioctl) { 10863 storage += ill_xarp_info(&xar->xarp_ha, ill); 10864 if ((ill->ill_phys_addr_length + ill->ill_name_length) > 10865 sizeof (xar->xarp_ha.sdl_data)) { 10866 freemsg(mp); 10867 ip_ioctl_finish(q, orig_ioc_mp, EINVAL, NO_COPYOUT, 10868 NULL); 10869 return; 10870 } 10871 } 10872 *flagsp = ATF_INUSE; 10873 if (area->area_flags & ACE_F_PERMANENT) 10874 *flagsp |= ATF_PERM; 10875 if (area->area_flags & ACE_F_PUBLISH) 10876 *flagsp |= ATF_PUBL; 10877 if (area->area_flags & ACE_F_AUTHORITY) 10878 *flagsp |= ATF_AUTHORITY; 10879 if (area->area_hw_addr_length != 0) { 10880 *flagsp |= ATF_COM; 10881 /* 10882 * For SIOCGARP, MAC address length validation has 10883 * already been done before the ioctl was issued to ARP 10884 * to allow it to progress only on 6 byte addressable 10885 * (ethernet like) media. Thus the mac address copying 10886 * cannot overwrite the sa_data area below. 10887 */ 10888 bcopy((char *)area + area->area_hw_addr_offset, 10889 storage, area->area_hw_addr_length); 10890 } 10891 10892 /* Ditch the internal IOCTL. */ 10893 freemsg(mp); 10894 /* Complete the original. */ 10895 ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, NULL); 10896 } 10897 10898 /* 10899 * Create a new logical interface. If ipif_id is zero (i.e. not a logical 10900 * interface) create the next available logical interface for this 10901 * physical interface. 10902 * If ipif is NULL (i.e. the lookup didn't find one) attempt to create an 10903 * ipif with the specified name. 10904 * 10905 * If the address family is not AF_UNSPEC then set the address as well. 10906 * 10907 * If ip_sioctl_addr returns EINPROGRESS then the ioctl (the copyout) 10908 * is completed when the DL_BIND_ACK arrives in ip_rput_dlpi_writer. 10909 * 10910 * Executed as a writer on the ill or ill group. 10911 * So no lock is needed to traverse the ipif chain, or examine the 10912 * phyint flags. 10913 */ 10914 /* ARGSUSED */ 10915 int 10916 ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 10917 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 10918 { 10919 mblk_t *mp1; 10920 struct lifreq *lifr; 10921 boolean_t isv6; 10922 boolean_t exists; 10923 char *name; 10924 char *endp; 10925 char *cp; 10926 int namelen; 10927 ipif_t *ipif; 10928 long id; 10929 ipsq_t *ipsq; 10930 ill_t *ill; 10931 sin_t *sin; 10932 int err = 0; 10933 boolean_t found_sep = B_FALSE; 10934 conn_t *connp; 10935 zoneid_t zoneid; 10936 int orig_ifindex = 0; 10937 ip_stack_t *ipst = CONNQ_TO_IPST(q); 10938 10939 ASSERT(q->q_next == NULL); 10940 ip1dbg(("ip_sioctl_addif\n")); 10941 /* Existence of mp1 has been checked in ip_wput_nondata */ 10942 mp1 = mp->b_cont->b_cont; 10943 /* 10944 * Null-terminate the string to protect against buffer 10945 * overrun. String was generated by user code and may not 10946 * be trusted.
10947 */ 10948 lifr = (struct lifreq *)mp1->b_rptr; 10949 lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; 10950 name = lifr->lifr_name; 10951 ASSERT(CONN_Q(q)); 10952 connp = Q_TO_CONN(q); 10953 isv6 = connp->conn_af_isv6; 10954 zoneid = connp->conn_zoneid; 10955 namelen = mi_strlen(name); 10956 if (namelen == 0) 10957 return (EINVAL); 10958 10959 exists = B_FALSE; 10960 if ((namelen + 1 == sizeof (ipif_loopback_name)) && 10961 (mi_strcmp(name, ipif_loopback_name) == 0)) { 10962 /* 10963 * Allow creating lo0 using SIOCLIFADDIF. There 10964 * can't be any other writer thread, so we can pass NULL below 10965 * for the last 4 args to ipif_lookup_on_name. 10966 */ 10967 ipif = ipif_lookup_on_name(lifr->lifr_name, namelen, B_TRUE, 10968 &exists, isv6, zoneid, NULL, NULL, NULL, NULL, ipst); 10969 /* Prevent any further action */ 10970 if (ipif == NULL) { 10971 return (ENOBUFS); 10972 } else if (!exists) { 10973 /* We created the ipif now and as writer */ 10974 ipif_refrele(ipif); 10975 return (0); 10976 } else { 10977 ill = ipif->ipif_ill; 10978 ill_refhold(ill); 10979 ipif_refrele(ipif); 10980 } 10981 } else { 10982 /* Look for a colon in the name. */ 10983 endp = &name[namelen]; 10984 for (cp = endp; --cp > name; ) { 10985 if (*cp == IPIF_SEPARATOR_CHAR) { 10986 found_sep = B_TRUE; 10987 /* 10988 * Reject any non-decimal aliases for plumbing 10989 * of logical interfaces. Aliases with leading 10990 * zeroes are also rejected as they introduce 10991 * ambiguity in the naming of the interfaces. 10992 * Comparing with "0" takes care of all such 10993 * cases. 10994 */ 10995 if ((strncmp("0", cp+1, 1)) == 0) 10996 return (EINVAL); 10997 10998 if (ddi_strtol(cp+1, &endp, 10, &id) != 0 || 10999 id <= 0 || *endp != '\0') { 11000 return (EINVAL); 11001 } 11002 *cp = '\0'; 11003 break; 11004 } 11005 } 11006 ill = ill_lookup_on_name(name, B_FALSE, isv6, 11007 CONNP_TO_WQ(connp), mp, ip_process_ioctl, &err, NULL, ipst); 11008 if (found_sep) 11009 *cp = IPIF_SEPARATOR_CHAR; 11010 if (ill == NULL) 11011 return (err); 11012 } 11013 11014 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_process_ioctl, NEW_OP, 11015 B_TRUE); 11016 11017 /* 11018 * Release the refhold due to the lookup, now that we are excl 11019 * or we are just returning. 11020 */ 11021 ill_refrele(ill); 11022 11023 if (ipsq == NULL) 11024 return (EINPROGRESS); 11025 11026 /* 11027 * If the interface is failed, inactive or offlined, look for a working 11028 * interface in the ill group and create the ipif there. If we can't 11029 * find a good interface, create the ipif anyway so that in.mpathd can 11030 * move it to the first repaired interface. 11031 */ 11032 if ((ill->ill_phyint->phyint_flags & 11033 (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) && 11034 ill->ill_phyint->phyint_groupname_len != 0) { 11035 phyint_t *phyi; 11036 char *groupname = ill->ill_phyint->phyint_groupname; 11037 11038 /* 11039 * We're looking for a working interface, but it doesn't matter 11040 * if it's up or down; so instead of following the group lists, 11041 * we look at each physical interface and compare the groupname. 11042 * We're only interested in interfaces with IPv4 (resp. IPv6) 11043 * plumbed when we're adding an IPv4 (resp. IPv6) ipif. 11044 * Otherwise we create the ipif on the failed interface.
11045 */ 11046 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 11047 phyi = avl_first(&ipst->ips_phyint_g_list-> 11048 phyint_list_avl_by_index); 11049 for (; phyi != NULL; 11050 phyi = avl_walk(&ipst->ips_phyint_g_list-> 11051 phyint_list_avl_by_index, 11052 phyi, AVL_AFTER)) { 11053 if (phyi->phyint_groupname_len == 0) 11054 continue; 11055 ASSERT(phyi->phyint_groupname != NULL); 11056 if (mi_strcmp(groupname, phyi->phyint_groupname) == 0 && 11057 !(phyi->phyint_flags & 11058 (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) && 11059 (ill->ill_isv6 ? (phyi->phyint_illv6 != NULL) : 11060 (phyi->phyint_illv4 != NULL))) { 11061 break; 11062 } 11063 } 11064 rw_exit(&ipst->ips_ill_g_lock); 11065 11066 if (phyi != NULL) { 11067 orig_ifindex = ill->ill_phyint->phyint_ifindex; 11068 ill = (ill->ill_isv6 ? phyi->phyint_illv6 : 11069 phyi->phyint_illv4); 11070 } 11071 } 11072 11073 /* 11074 * We are now exclusive on the ipsq, so an ill move will be serialized 11075 * before or after us. 11076 */ 11077 ASSERT(IAM_WRITER_ILL(ill)); 11078 ASSERT(ill->ill_move_in_progress == B_FALSE); 11079 11080 if (found_sep && orig_ifindex == 0) { 11081 /* Now see if there is an IPIF with this unit number. */ 11082 for (ipif = ill->ill_ipif; ipif != NULL; 11083 ipif = ipif->ipif_next) { 11084 if (ipif->ipif_id == id) { 11085 err = EEXIST; 11086 goto done; 11087 } 11088 } 11089 } 11090 11091 /* 11092 * We use IRE_LOCAL for lo0:1 etc. for "receive only" use 11093 * of lo0. We never come here when we plumb lo0:0. It 11094 * happens in ipif_lookup_on_name. 11095 * The specified unit number is ignored when we create the ipif on a 11096 * different interface. However, we save it in ipif_orig_ipifid below so 11097 * that the ipif fails back to the right position. 11098 */ 11099 if ((ipif = ipif_allocate(ill, (found_sep && orig_ifindex == 0) ? 11100 id : -1, IRE_LOCAL, B_TRUE)) == NULL) { 11101 err = ENOBUFS; 11102 goto done; 11103 } 11104 11105 /* Return created name with ioctl */ 11106 (void) sprintf(lifr->lifr_name, "%s%c%d", ill->ill_name, 11107 IPIF_SEPARATOR_CHAR, ipif->ipif_id); 11108 ip1dbg(("created %s\n", lifr->lifr_name)); 11109 11110 /* Set address */ 11111 sin = (sin_t *)&lifr->lifr_addr; 11112 if (sin->sin_family != AF_UNSPEC) { 11113 err = ip_sioctl_addr(ipif, sin, q, mp, 11114 &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], lifr); 11115 } 11116 11117 /* Set ifindex and unit number for failback */ 11118 if (err == 0 && orig_ifindex != 0) { 11119 ipif->ipif_orig_ifindex = orig_ifindex; 11120 if (found_sep) { 11121 ipif->ipif_orig_ipifid = id; 11122 } 11123 } 11124 11125 done: 11126 ipsq_exit(ipsq, B_TRUE, B_TRUE); 11127 return (err); 11128 } 11129 11130 /* 11131 * Remove an existing logical interface. If ipif_id is zero (i.e. not a logical 11132 * interface) delete it based on the IP address (on this physical interface). 11133 * Otherwise delete it based on the ipif_id. 11134 * Also, special handling to allow a removeif of lo0. 
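 *
 * A hedged user-level sketch of the SIOCLIFREMOVEIF ioctl handled
 * here ("s" is an assumed datagram socket; the interface name is
 * illustrative):
 *
 *	struct lifreq lifr;
 *
 *	bzero(&lifr, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "hme0:1", sizeof (lifr.lifr_name));
 *	(void) ioctl(s, SIOCLIFREMOVEIF, (caddr_t)&lifr);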
11135 */ 11136 /* ARGSUSED */ 11137 int 11138 ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11139 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 11140 { 11141 conn_t *connp; 11142 ill_t *ill = ipif->ipif_ill; 11143 boolean_t success; 11144 ip_stack_t *ipst; 11145 11146 ipst = CONNQ_TO_IPST(q); 11147 11148 ASSERT(q->q_next == NULL); 11149 ip1dbg(("ip_sioctl_remove_if(%s:%u %p)\n", 11150 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11151 ASSERT(IAM_WRITER_IPIF(ipif)); 11152 11153 connp = Q_TO_CONN(q); 11154 /* 11155 * Special case for unplumbing lo0 (the loopback physical interface). 11156 * If unplumbing lo0, the incoming address structure has been 11157 * initialized to all zeros. When unplumbing lo0, all its logical 11158 * interfaces must be removed too. 11159 * 11160 * Note that this function may be called to remove a specific 11161 * loopback logical interface (e.g., lo0:1). But in that case 11162 * ipif->ipif_id != 0 so that the code path for that case is the 11163 * same as any other interface (meaning it skips the code directly 11164 * below). 11165 */ 11166 if (ipif->ipif_id == 0 && ipif->ipif_net_type == IRE_LOOPBACK) { 11167 if (sin->sin_family == AF_UNSPEC && 11168 (IN6_IS_ADDR_UNSPECIFIED(&((sin6_t *)sin)->sin6_addr))) { 11169 /* 11170 * Mark it condemned. No new ref. will be made to ill. 11171 */ 11172 mutex_enter(&ill->ill_lock); 11173 ill->ill_state_flags |= ILL_CONDEMNED; 11174 for (ipif = ill->ill_ipif; ipif != NULL; 11175 ipif = ipif->ipif_next) { 11176 ipif->ipif_state_flags |= IPIF_CONDEMNED; 11177 } 11178 mutex_exit(&ill->ill_lock); 11179 11180 ipif = ill->ill_ipif; 11181 /* unplumb the loopback interface */ 11182 ill_delete(ill); 11183 mutex_enter(&connp->conn_lock); 11184 mutex_enter(&ill->ill_lock); 11185 ASSERT(ill->ill_group == NULL); 11186 11187 /* Are any references to this ill active? */ 11188 if (ill_is_quiescent(ill)) { 11189 mutex_exit(&ill->ill_lock); 11190 mutex_exit(&connp->conn_lock); 11191 ill_delete_tail(ill); 11192 mi_free(ill); 11193 return (0); 11194 } 11195 success = ipsq_pending_mp_add(connp, ipif, 11196 CONNP_TO_WQ(connp), mp, ILL_FREE); 11197 mutex_exit(&connp->conn_lock); 11198 mutex_exit(&ill->ill_lock); 11199 if (success) 11200 return (EINPROGRESS); 11201 else 11202 return (EINTR); 11203 } 11204 } 11205 11206 /* 11207 * We are exclusive on the ipsq, so an ill move will be serialized 11208 * before or after us. 11209 */ 11210 ASSERT(ill->ill_move_in_progress == B_FALSE); 11211 11212 if (ipif->ipif_id == 0) { 11213 /* Find based on address */ 11214 if (ipif->ipif_isv6) { 11215 sin6_t *sin6; 11216 11217 if (sin->sin_family != AF_INET6) 11218 return (EAFNOSUPPORT); 11219 11220 sin6 = (sin6_t *)sin; 11221 /* We are a writer, so we should be able to lookup */ 11222 ipif = ipif_lookup_addr_v6(&sin6->sin6_addr, 11223 ill, ALL_ZONES, NULL, NULL, NULL, NULL, ipst); 11224 if (ipif == NULL) { 11225 /* 11226 * Maybe the address is on another interface in 11227 * the same IPMP group? We check this below.
11228 */ 11229 ipif = ipif_lookup_addr_v6(&sin6->sin6_addr, 11230 NULL, ALL_ZONES, NULL, NULL, NULL, NULL, 11231 ipst); 11232 } 11233 } else { 11234 ipaddr_t addr; 11235 11236 if (sin->sin_family != AF_INET) 11237 return (EAFNOSUPPORT); 11238 11239 addr = sin->sin_addr.s_addr; 11240 /* We are a writer, so we should be able to lookup */ 11241 ipif = ipif_lookup_addr(addr, ill, ALL_ZONES, NULL, 11242 NULL, NULL, NULL, ipst); 11243 if (ipif == NULL) { 11244 /* 11245 * Maybe the address is on another interface in 11246 * the same IPMP group? We check this below. 11247 */ 11248 ipif = ipif_lookup_addr(addr, NULL, ALL_ZONES, 11249 NULL, NULL, NULL, NULL, ipst); 11250 } 11251 } 11252 if (ipif == NULL) { 11253 return (EADDRNOTAVAIL); 11254 } 11255 /* 11256 * When the address to be removed is hosted on a different 11257 * interface, we check if the interface is in the same IPMP 11258 * group as the specified one; if so we proceed with the 11259 * removal. 11260 * ill->ill_group is NULL when the ill is down, so we have to 11261 * compare the group names instead. 11262 */ 11263 if (ipif->ipif_ill != ill && 11264 (ipif->ipif_ill->ill_phyint->phyint_groupname_len == 0 || 11265 ill->ill_phyint->phyint_groupname_len == 0 || 11266 mi_strcmp(ipif->ipif_ill->ill_phyint->phyint_groupname, 11267 ill->ill_phyint->phyint_groupname) != 0)) { 11268 ipif_refrele(ipif); 11269 return (EADDRNOTAVAIL); 11270 } 11271 11272 /* This is a writer */ 11273 ipif_refrele(ipif); 11274 } 11275 11276 /* 11277 * Cannot delete instance zero since it is tied to the ill. 11278 */ 11279 if (ipif->ipif_id == 0) 11280 return (EBUSY); 11281 11282 mutex_enter(&ill->ill_lock); 11283 ipif->ipif_state_flags |= IPIF_CONDEMNED; 11284 mutex_exit(&ill->ill_lock); 11285 11286 ipif_free(ipif); 11287 11288 mutex_enter(&connp->conn_lock); 11289 mutex_enter(&ill->ill_lock); 11290 11291 /* Are any references to this ipif active? */ 11292 if (ipif->ipif_refcnt == 0 && ipif->ipif_ire_cnt == 0) { 11293 mutex_exit(&ill->ill_lock); 11294 mutex_exit(&connp->conn_lock); 11295 ipif_non_duplicate(ipif); 11296 ipif_down_tail(ipif); 11297 ipif_free_tail(ipif); 11298 return (0); 11299 } 11300 success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), mp, 11301 IPIF_FREE); 11302 mutex_exit(&ill->ill_lock); 11303 mutex_exit(&connp->conn_lock); 11304 if (success) 11305 return (EINPROGRESS); 11306 else 11307 return (EINTR); 11308 } 11309 11310 /* 11311 * Restart the removeif ioctl. The refcnt has gone down to 0. 11312 * The ipif is already condemned, so it can't be found through lookups. 11313 */ 11314 /* ARGSUSED */ 11315 int 11316 ip_sioctl_removeif_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, 11317 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_if_req) 11318 { 11319 ill_t *ill; 11320 11321 ip1dbg(("ip_sioctl_removeif_restart(%s:%u %p)\n", 11322 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11323 if (ipif->ipif_id == 0 && ipif->ipif_net_type == IRE_LOOPBACK) { 11324 ill = ipif->ipif_ill; 11325 ASSERT(IAM_WRITER_ILL(ill)); 11326 ASSERT((ipif->ipif_state_flags & IPIF_CONDEMNED) && 11327 (ill->ill_state_flags & ILL_CONDEMNED)); 11328 ill_delete_tail(ill); 11329 mi_free(ill); 11330 return (0); 11331 } 11332 11333 ill = ipif->ipif_ill; 11334 ASSERT(IAM_WRITER_IPIF(ipif)); 11335 ASSERT(ipif->ipif_state_flags & IPIF_CONDEMNED); 11336 11337 ipif_non_duplicate(ipif); 11338 ipif_down_tail(ipif); 11339 ipif_free_tail(ipif); 11340 11341 ILL_UNMARK_CHANGING(ill); 11342 return (0); 11343 } 11344 11345 /* 11346 * Set the local interface address.
11347 * Allow an address of all zero when the interface is down. 11348 */ 11349 /* ARGSUSED */ 11350 int 11351 ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11352 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 11353 { 11354 int err = 0; 11355 in6_addr_t v6addr; 11356 boolean_t need_up = B_FALSE; 11357 11358 ip1dbg(("ip_sioctl_addr(%s:%u %p)\n", 11359 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11360 11361 ASSERT(IAM_WRITER_IPIF(ipif)); 11362 11363 if (ipif->ipif_isv6) { 11364 sin6_t *sin6; 11365 ill_t *ill; 11366 phyint_t *phyi; 11367 11368 if (sin->sin_family != AF_INET6) 11369 return (EAFNOSUPPORT); 11370 11371 sin6 = (sin6_t *)sin; 11372 v6addr = sin6->sin6_addr; 11373 ill = ipif->ipif_ill; 11374 phyi = ill->ill_phyint; 11375 11376 /* 11377 * Enforce that true multicast interfaces have a link-local 11378 * address for logical unit 0. 11379 */ 11380 if (ipif->ipif_id == 0 && 11381 (ill->ill_flags & ILLF_MULTICAST) && 11382 !(ipif->ipif_flags & (IPIF_POINTOPOINT)) && 11383 !(phyi->phyint_flags & (PHYI_LOOPBACK)) && 11384 !IN6_IS_ADDR_LINKLOCAL(&v6addr)) { 11385 return (EADDRNOTAVAIL); 11386 } 11387 11388 /* 11389 * Up interfaces shouldn't have the unspecified address 11390 * unless they also have the IPIF_NOLOCAL flag set and 11391 * have a subnet assigned. 11392 */ 11393 if ((ipif->ipif_flags & IPIF_UP) && 11394 IN6_IS_ADDR_UNSPECIFIED(&v6addr) && 11395 (!(ipif->ipif_flags & IPIF_NOLOCAL) || 11396 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) { 11397 return (EADDRNOTAVAIL); 11398 } 11399 11400 if (!ip_local_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask)) 11401 return (EADDRNOTAVAIL); 11402 } else { 11403 ipaddr_t addr; 11404 11405 if (sin->sin_family != AF_INET) 11406 return (EAFNOSUPPORT); 11407 11408 addr = sin->sin_addr.s_addr; 11409 11410 /* Allow 0 as the local address. */ 11411 if (addr != 0 && !ip_addr_ok_v4(addr, ipif->ipif_net_mask)) 11412 return (EADDRNOTAVAIL); 11413 11414 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 11415 } 11416 11417 11418 /* 11419 * Even if there is no change we redo things just to rerun 11420 * ipif_set_default. 11421 */ 11422 if (ipif->ipif_flags & IPIF_UP) { 11423 /* 11424 * Setting a new local address, make sure 11425 * we have net and subnet bcast ire's for 11426 * the old address if we need them. 11427 */ 11428 if (!ipif->ipif_isv6) 11429 ipif_check_bcast_ires(ipif); 11430 /* 11431 * If the interface is already marked up, 11432 * we call ipif_down which will take care 11433 * of ditching any IREs that have been set 11434 * up based on the old interface address. 11435 */ 11436 err = ipif_logical_down(ipif, q, mp); 11437 if (err == EINPROGRESS) 11438 return (err); 11439 ipif_down_tail(ipif); 11440 need_up = B_TRUE; 11441 } 11442 11443 err = ip_sioctl_addr_tail(ipif, sin, q, mp, need_up); 11444 return (err); 11445 } 11446 11447 int 11448 ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11449 boolean_t need_up) 11450 { 11451 in6_addr_t v6addr; 11452 in6_addr_t ov6addr; 11453 ipaddr_t addr; 11454 sin6_t *sin6; 11455 int sinlen; 11456 int err = 0; 11457 ill_t *ill = ipif->ipif_ill; 11458 boolean_t need_dl_down; 11459 boolean_t need_arp_down; 11460 struct iocblk *iocp; 11461 11462 iocp = (mp != NULL) ?
(struct iocblk *)mp->b_rptr : NULL; 11463 11464 ip1dbg(("ip_sioctl_addr_tail(%s:%u %p)\n", 11465 ill->ill_name, ipif->ipif_id, (void *)ipif)); 11466 ASSERT(IAM_WRITER_IPIF(ipif)); 11467 11468 /* Must cancel any pending timer before taking the ill_lock */ 11469 if (ipif->ipif_recovery_id != 0) 11470 (void) untimeout(ipif->ipif_recovery_id); 11471 ipif->ipif_recovery_id = 0; 11472 11473 if (ipif->ipif_isv6) { 11474 sin6 = (sin6_t *)sin; 11475 v6addr = sin6->sin6_addr; 11476 sinlen = sizeof (struct sockaddr_in6); 11477 } else { 11478 addr = sin->sin_addr.s_addr; 11479 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 11480 sinlen = sizeof (struct sockaddr_in); 11481 } 11482 mutex_enter(&ill->ill_lock); 11483 ov6addr = ipif->ipif_v6lcl_addr; 11484 ipif->ipif_v6lcl_addr = v6addr; 11485 sctp_update_ipif_addr(ipif, ov6addr); 11486 if (ipif->ipif_flags & (IPIF_ANYCAST | IPIF_NOLOCAL)) { 11487 ipif->ipif_v6src_addr = ipv6_all_zeros; 11488 } else { 11489 ipif->ipif_v6src_addr = v6addr; 11490 } 11491 ipif->ipif_addr_ready = 0; 11492 11493 /* 11494 * If the interface was previously marked as a duplicate, then since 11495 * we've now got a "new" address, it should no longer be considered a 11496 * duplicate -- even if the "new" address is the same as the old one. 11497 * Note that if all ipifs are down, we may have a pending ARP down 11498 * event to handle. This is because we want to recover from duplicates 11499 * and thus delay tearing down ARP until the duplicates have been 11500 * removed or disabled. 11501 */ 11502 need_dl_down = need_arp_down = B_FALSE; 11503 if (ipif->ipif_flags & IPIF_DUPLICATE) { 11504 need_arp_down = !need_up; 11505 ipif->ipif_flags &= ~IPIF_DUPLICATE; 11506 if (--ill->ill_ipif_dup_count == 0 && !need_up && 11507 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) { 11508 need_dl_down = B_TRUE; 11509 } 11510 } 11511 11512 if (ipif->ipif_isv6 && IN6_IS_ADDR_6TO4(&v6addr) && 11513 !ill->ill_is_6to4tun) { 11514 queue_t *wqp = ill->ill_wq; 11515 11516 /* 11517 * The local address of this interface is a 6to4 address, 11518 * check if this interface is in fact a 6to4 tunnel or just 11519 * an interface configured with a 6to4 address. We are only 11520 * interested in the former. 11521 */ 11522 if (wqp != NULL) { 11523 while ((wqp->q_next != NULL) && 11524 (wqp->q_next->q_qinfo != NULL) && 11525 (wqp->q_next->q_qinfo->qi_minfo != NULL)) { 11526 11527 if (wqp->q_next->q_qinfo->qi_minfo->mi_idnum 11528 == TUN6TO4_MODID) { 11529 /* set for use in IP */ 11530 ill->ill_is_6to4tun = 1; 11531 break; 11532 } 11533 wqp = wqp->q_next; 11534 } 11535 } 11536 } 11537 11538 ipif_set_default(ipif); 11539 11540 /* 11541 * When publishing an interface address change event, we only notify 11542 * the event listeners of the new address. It is assumed that if they 11543 * actively care about the addresses assigned that they will have 11544 * already discovered the previous address assigned (if there was one.) 11545 * 11546 * Don't attach nic event message for SIOCLIFADDIF ioctl. 
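 * (The hook_nic_event_t built below is handed to the netinfo/hook
 * framework; the assumption -- not shown in this file -- is that the
 * framework then delivers the NE_ADDRESS_CHANGE event to any registered
 * observers, such as packet filters.)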
11547 */ 11548 if (iocp != NULL && iocp->ioc_cmd != SIOCLIFADDIF) { 11549 hook_nic_event_t *info; 11550 if ((info = ipif->ipif_ill->ill_nic_event_info) != NULL) { 11551 ip2dbg(("ip_sioctl_addr_tail: unexpected nic event %d " 11552 "attached for %s\n", info->hne_event, 11553 ill->ill_name)); 11554 if (info->hne_data != NULL) 11555 kmem_free(info->hne_data, info->hne_datalen); 11556 kmem_free(info, sizeof (hook_nic_event_t)); 11557 } 11558 11559 info = kmem_alloc(sizeof (hook_nic_event_t), KM_NOSLEEP); 11560 if (info != NULL) { 11561 ip_stack_t *ipst = ill->ill_ipst; 11562 11563 info->hne_nic = 11564 ipif->ipif_ill->ill_phyint->phyint_hook_ifindex; 11565 info->hne_lif = MAP_IPIF_ID(ipif->ipif_id); 11566 info->hne_event = NE_ADDRESS_CHANGE; 11567 info->hne_family = ipif->ipif_isv6 ? 11568 ipst->ips_ipv6_net_data : ipst->ips_ipv4_net_data; 11569 info->hne_data = kmem_alloc(sinlen, KM_NOSLEEP); 11570 if (info->hne_data != NULL) { 11571 info->hne_datalen = sinlen; 11572 bcopy(sin, info->hne_data, sinlen); 11573 } else { 11574 ip2dbg(("ip_sioctl_addr_tail: could not attach " 11575 "address information for ADDRESS_CHANGE nic" 11576 " event of %s (ENOMEM)\n", 11577 ipif->ipif_ill->ill_name)); 11578 kmem_free(info, sizeof (hook_nic_event_t)); 11579 } 11580 } else 11581 ip2dbg(("ip_sioctl_addr_tail: could not attach " 11582 "ADDRESS_CHANGE nic event information for %s " 11583 "(ENOMEM)\n", ipif->ipif_ill->ill_name)); 11584 11585 ipif->ipif_ill->ill_nic_event_info = info; 11586 } 11587 11588 mutex_exit(&ill->ill_lock); 11589 11590 if (need_up) { 11591 /* 11592 * Now bring the interface back up. If this 11593 * is the only IPIF for the ILL, ipif_up 11594 * will have to re-bind to the device, so 11595 * we may get back EINPROGRESS, in which 11596 * case, this IOCTL will get completed in 11597 * ip_rput_dlpi when we see the DL_BIND_ACK. 11598 */ 11599 err = ipif_up(ipif, q, mp); 11600 } 11601 11602 if (need_dl_down) 11603 ill_dl_down(ill); 11604 if (need_arp_down) 11605 ipif_arp_down(ipif); 11606 11607 return (err); 11608 } 11609 11610 11611 /* 11612 * Restart entry point to restart the address set operation after the 11613 * refcounts have dropped to zero. 11614 */ 11615 /* ARGSUSED */ 11616 int 11617 ip_sioctl_addr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11618 ip_ioctl_cmd_t *ipip, void *ifreq) 11619 { 11620 ip1dbg(("ip_sioctl_addr_restart(%s:%u %p)\n", 11621 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11622 ASSERT(IAM_WRITER_IPIF(ipif)); 11623 ipif_down_tail(ipif); 11624 return (ip_sioctl_addr_tail(ipif, sin, q, mp, B_TRUE)); 11625 } 11626 11627 /* ARGSUSED */ 11628 int 11629 ip_sioctl_get_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11630 ip_ioctl_cmd_t *ipip, void *if_req) 11631 { 11632 sin6_t *sin6 = (struct sockaddr_in6 *)sin; 11633 struct lifreq *lifr = (struct lifreq *)if_req; 11634 11635 ip1dbg(("ip_sioctl_get_addr(%s:%u %p)\n", 11636 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11637 /* 11638 * The net mask and address can't change since we have a 11639 * reference to the ipif. So no lock is necessary. 
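 * For example (illustrative values): an IPv4 netmask of 255.255.255.0 on
 * the ipif is reported back through lifr_addrlen as a prefix length of
 * 24 by the ip_mask_to_plen() call below (for the SIOCGLIF* form of the
 * ioctl).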
11640 */ 11641 if (ipif->ipif_isv6) { 11642 *sin6 = sin6_null; 11643 sin6->sin6_family = AF_INET6; 11644 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 11645 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 11646 lifr->lifr_addrlen = 11647 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 11648 } else { 11649 *sin = sin_null; 11650 sin->sin_family = AF_INET; 11651 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 11652 if (ipip->ipi_cmd_type == LIF_CMD) { 11653 lifr->lifr_addrlen = 11654 ip_mask_to_plen(ipif->ipif_net_mask); 11655 } 11656 } 11657 return (0); 11658 } 11659 11660 /* 11661 * Set the destination address for a pt-pt interface. 11662 */ 11663 /* ARGSUSED */ 11664 int 11665 ip_sioctl_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11666 ip_ioctl_cmd_t *ipip, void *if_req) 11667 { 11668 int err = 0; 11669 in6_addr_t v6addr; 11670 boolean_t need_up = B_FALSE; 11671 11672 ip1dbg(("ip_sioctl_dstaddr(%s:%u %p)\n", 11673 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11674 ASSERT(IAM_WRITER_IPIF(ipif)); 11675 11676 if (ipif->ipif_isv6) { 11677 sin6_t *sin6; 11678 11679 if (sin->sin_family != AF_INET6) 11680 return (EAFNOSUPPORT); 11681 11682 sin6 = (sin6_t *)sin; 11683 v6addr = sin6->sin6_addr; 11684 11685 if (!ip_remote_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask)) 11686 return (EADDRNOTAVAIL); 11687 } else { 11688 ipaddr_t addr; 11689 11690 if (sin->sin_family != AF_INET) 11691 return (EAFNOSUPPORT); 11692 11693 addr = sin->sin_addr.s_addr; 11694 if (!ip_addr_ok_v4(addr, ipif->ipif_net_mask)) 11695 return (EADDRNOTAVAIL); 11696 11697 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 11698 } 11699 11700 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr, &v6addr)) 11701 return (0); /* No change */ 11702 11703 if (ipif->ipif_flags & IPIF_UP) { 11704 /* 11705 * If the interface is already marked up, 11706 * we call ipif_down which will take care 11707 * of ditching any IREs that have been set 11708 * up based on the old pp dst address. 11709 */ 11710 err = ipif_logical_down(ipif, q, mp); 11711 if (err == EINPROGRESS) 11712 return (err); 11713 ipif_down_tail(ipif); 11714 need_up = B_TRUE; 11715 } 11716 /* 11717 * could return EINPROGRESS. If so ioctl will complete in 11718 * ip_rput_dlpi_writer 11719 */ 11720 err = ip_sioctl_dstaddr_tail(ipif, sin, q, mp, need_up); 11721 return (err); 11722 } 11723 11724 static int 11725 ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11726 boolean_t need_up) 11727 { 11728 in6_addr_t v6addr; 11729 ill_t *ill = ipif->ipif_ill; 11730 int err = 0; 11731 boolean_t need_dl_down; 11732 boolean_t need_arp_down; 11733 11734 ip1dbg(("ip_sioctl_dstaddr_tail(%s:%u %p)\n", ill->ill_name, 11735 ipif->ipif_id, (void *)ipif)); 11736 11737 /* Must cancel any pending timer before taking the ill_lock */ 11738 if (ipif->ipif_recovery_id != 0) 11739 (void) untimeout(ipif->ipif_recovery_id); 11740 ipif->ipif_recovery_id = 0; 11741 11742 if (ipif->ipif_isv6) { 11743 sin6_t *sin6; 11744 11745 sin6 = (sin6_t *)sin; 11746 v6addr = sin6->sin6_addr; 11747 } else { 11748 ipaddr_t addr; 11749 11750 addr = sin->sin_addr.s_addr; 11751 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 11752 } 11753 mutex_enter(&ill->ill_lock); 11754 /* Set point to point destination address. */ 11755 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 11756 /* 11757 * Allow this as a means of creating logical 11758 * pt-pt interfaces on top of e.g. an Ethernet. 11759 * XXX Undocumented HACK for testing. 11760 * pt-pt interfaces are created with NUD disabled. 
11761 */ 11762 ipif->ipif_flags |= IPIF_POINTOPOINT; 11763 ipif->ipif_flags &= ~IPIF_BROADCAST; 11764 if (ipif->ipif_isv6) 11765 ill->ill_flags |= ILLF_NONUD; 11766 } 11767 11768 /* 11769 * If the interface was previously marked as a duplicate, then since 11770 * we've now got a "new" address, it should no longer be considered a 11771 * duplicate -- even if the "new" address is the same as the old one. 11772 * Note that if all ipifs are down, we may have a pending ARP down 11773 * event to handle. 11774 */ 11775 need_dl_down = need_arp_down = B_FALSE; 11776 if (ipif->ipif_flags & IPIF_DUPLICATE) { 11777 need_arp_down = !need_up; 11778 ipif->ipif_flags &= ~IPIF_DUPLICATE; 11779 if (--ill->ill_ipif_dup_count == 0 && !need_up && 11780 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) { 11781 need_dl_down = B_TRUE; 11782 } 11783 } 11784 11785 /* Set the new address. */ 11786 ipif->ipif_v6pp_dst_addr = v6addr; 11787 /* Make sure subnet tracks pp_dst */ 11788 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 11789 mutex_exit(&ill->ill_lock); 11790 11791 if (need_up) { 11792 /* 11793 * Now bring the interface back up. If this 11794 * is the only IPIF for the ILL, ipif_up 11795 * will have to re-bind to the device, so 11796 * we may get back EINPROGRESS, in which 11797 * case, this IOCTL will get completed in 11798 * ip_rput_dlpi when we see the DL_BIND_ACK. 11799 */ 11800 err = ipif_up(ipif, q, mp); 11801 } 11802 11803 if (need_dl_down) 11804 ill_dl_down(ill); 11805 11806 if (need_arp_down) 11807 ipif_arp_down(ipif); 11808 return (err); 11809 } 11810 11811 /* 11812 * Restart entry point to restart the destination address set operation 11813 * after the refcounts have dropped to zero. 11814 */ 11815 /* ARGSUSED */ 11816 int 11817 ip_sioctl_dstaddr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11818 ip_ioctl_cmd_t *ipip, void *ifreq) 11819 { 11820 ip1dbg(("ip_sioctl_dstaddr_restart(%s:%u %p)\n", 11821 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11822 ipif_down_tail(ipif); 11823 return (ip_sioctl_dstaddr_tail(ipif, sin, q, mp, B_TRUE)); 11824 } 11825 11826 /* ARGSUSED */ 11827 int 11828 ip_sioctl_get_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11829 ip_ioctl_cmd_t *ipip, void *if_req) 11830 { 11831 sin6_t *sin6 = (struct sockaddr_in6 *)sin; 11832 11833 ip1dbg(("ip_sioctl_get_dstaddr(%s:%u %p)\n", 11834 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11835 /* 11836 * Get point to point destination address. The addresses can't 11837 * change since we hold a reference to the ipif. 11838 */ 11839 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) 11840 return (EADDRNOTAVAIL); 11841 11842 if (ipif->ipif_isv6) { 11843 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 11844 *sin6 = sin6_null; 11845 sin6->sin6_family = AF_INET6; 11846 sin6->sin6_addr = ipif->ipif_v6pp_dst_addr; 11847 } else { 11848 *sin = sin_null; 11849 sin->sin_family = AF_INET; 11850 sin->sin_addr.s_addr = ipif->ipif_pp_dst_addr; 11851 } 11852 return (0); 11853 } 11854 11855 /* 11856 * XXX part of IPMP: make this function return the active/inactive state 11857 * so the caller can set it once atomically, instead of using multiple 11858 * mutex_enter/mutex_exit pairs. 11859 */ 11860 /* 11861 * This function either sets or clears the IFF_INACTIVE flag. 11862 * 11863 * As long as there are some addresses or multicast memberships on the 11864 * IPv4 or IPv6 interface of the "phyi" that do not belong here, we 11865 * will consider it to be ACTIVE (clear IFF_INACTIVE), i.e. the interface 11865 * will be used for outbound packets.
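 * In outline, the scan below is roughly:
 *
 *	for each of the v4 and v6 ills of the phyint:
 *		for each ipif (and each ilm):
 *			if its orig_ifindex differs from this phyint's
 *			    ifindex, it was failed over here from another
 *			    interface: clear PHYI_INACTIVE and return
 *	nothing foreign found: set PHYI_INACTIVE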
11866 * 11867 * Caller needs to verify the validity of setting IFF_INACTIVE. 11868 */ 11869 static void 11870 phyint_inactive(phyint_t *phyi) 11871 { 11872 ill_t *ill_v4; 11873 ill_t *ill_v6; 11874 ipif_t *ipif; 11875 ilm_t *ilm; 11876 11877 ill_v4 = phyi->phyint_illv4; 11878 ill_v6 = phyi->phyint_illv6; 11879 11880 /* 11881 * No need for a lock while traversing the list since iam 11882 * a writer 11883 */ 11884 if (ill_v4 != NULL) { 11885 ASSERT(IAM_WRITER_ILL(ill_v4)); 11886 for (ipif = ill_v4->ill_ipif; ipif != NULL; 11887 ipif = ipif->ipif_next) { 11888 if (ipif->ipif_orig_ifindex != phyi->phyint_ifindex) { 11889 mutex_enter(&phyi->phyint_lock); 11890 phyi->phyint_flags &= ~PHYI_INACTIVE; 11891 mutex_exit(&phyi->phyint_lock); 11892 return; 11893 } 11894 } 11895 for (ilm = ill_v4->ill_ilm; ilm != NULL; 11896 ilm = ilm->ilm_next) { 11897 if (ilm->ilm_orig_ifindex != phyi->phyint_ifindex) { 11898 mutex_enter(&phyi->phyint_lock); 11899 phyi->phyint_flags &= ~PHYI_INACTIVE; 11900 mutex_exit(&phyi->phyint_lock); 11901 return; 11902 } 11903 } 11904 } 11905 if (ill_v6 != NULL) { 11906 ill_v6 = phyi->phyint_illv6; 11907 for (ipif = ill_v6->ill_ipif; ipif != NULL; 11908 ipif = ipif->ipif_next) { 11909 if (ipif->ipif_orig_ifindex != phyi->phyint_ifindex) { 11910 mutex_enter(&phyi->phyint_lock); 11911 phyi->phyint_flags &= ~PHYI_INACTIVE; 11912 mutex_exit(&phyi->phyint_lock); 11913 return; 11914 } 11915 } 11916 for (ilm = ill_v6->ill_ilm; ilm != NULL; 11917 ilm = ilm->ilm_next) { 11918 if (ilm->ilm_orig_ifindex != phyi->phyint_ifindex) { 11919 mutex_enter(&phyi->phyint_lock); 11920 phyi->phyint_flags &= ~PHYI_INACTIVE; 11921 mutex_exit(&phyi->phyint_lock); 11922 return; 11923 } 11924 } 11925 } 11926 mutex_enter(&phyi->phyint_lock); 11927 phyi->phyint_flags |= PHYI_INACTIVE; 11928 mutex_exit(&phyi->phyint_lock); 11929 } 11930 11931 /* 11932 * This function is called only when the phyint flags change. Currently 11933 * called from ip_sioctl_flags. We re-do the broadcast nomination so 11934 * that we can select a good ill. 11935 */ 11936 static void 11937 ip_redo_nomination(phyint_t *phyi) 11938 { 11939 ill_t *ill_v4; 11940 11941 ill_v4 = phyi->phyint_illv4; 11942 11943 if (ill_v4 != NULL && ill_v4->ill_group != NULL) { 11944 ASSERT(IAM_WRITER_ILL(ill_v4)); 11945 if (ill_v4->ill_group->illgrp_ill_count > 1) 11946 ill_nominate_bcast_rcv(ill_v4->ill_group); 11947 } 11948 } 11949 11950 /* 11951 * Heuristic to check if ill is INACTIVE. 11952 * Checks if ill has an ipif with an usable ip address. 11953 * 11954 * Return values: 11955 * B_TRUE - ill is INACTIVE; has no usable ipif 11956 * B_FALSE - ill is not INACTIVE; ill has at least one usable ipif 11957 */ 11958 static boolean_t 11959 ill_is_inactive(ill_t *ill) 11960 { 11961 ipif_t *ipif; 11962 11963 /* Check whether it is in an IPMP group */ 11964 if (ill->ill_phyint->phyint_groupname == NULL) 11965 return (B_FALSE); 11966 11967 if (ill->ill_ipif_up_count == 0) 11968 return (B_TRUE); 11969 11970 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 11971 uint64_t flags = ipif->ipif_flags; 11972 11973 /* 11974 * This ipif is usable if it is IPIF_UP and not a 11975 * dedicated test address. A dedicated test address 11976 * is marked IPIF_NOFAILOVER *and* IPIF_DEPRECATED 11977 * (note in particular that V6 test addresses are 11978 * link-local data addresses and thus are marked 11979 * IPIF_NOFAILOVER but not IPIF_DEPRECATED). 
11980 */ 11981 if ((flags & IPIF_UP) && 11982 ((flags & (IPIF_DEPRECATED|IPIF_NOFAILOVER)) != 11983 (IPIF_DEPRECATED|IPIF_NOFAILOVER))) 11984 return (B_FALSE); 11985 } 11986 return (B_TRUE); 11987 } 11988 11989 /* 11990 * Set interface flags. 11991 * Need to do special action for IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, 11992 * IPIF_NOLOCAL, ILLF_NONUD, ILLF_NOARP, IPIF_PRIVATE, IPIF_ANYCAST, 11993 * IPIF_PREFERRED, PHYI_STANDBY, PHYI_FAILED and PHYI_OFFLINE. 11994 * 11995 * NOTE: We really don't enforce that ipif_id zero should be used 11996 * for setting any flags other than IFF_LOGINT_FLAGS. This 11997 * is because applications generally do a SIOCGLIFFLAGS, 11998 * OR in the new flags (including those that affect the logical 11999 * interface) and then do a SIOCSLIFFLAGS. Thus, "flags" below could 12000 * contain bits other than IFF_LOGINT_FLAGS. One could check whether 12001 * "turn_on" - the flags that will be turned on - is correct with 12002 * respect to ipif_id 0. For backward compatibility reasons, it is not done. 12003 */ 12004 /* ARGSUSED */ 12005 int 12006 ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12007 ip_ioctl_cmd_t *ipip, void *if_req) 12008 { 12009 uint64_t turn_on; 12010 uint64_t turn_off; 12011 int err; 12012 boolean_t need_up = B_FALSE; 12013 phyint_t *phyi; 12014 ill_t *ill; 12015 uint64_t intf_flags; 12016 boolean_t phyint_flags_modified = B_FALSE; 12017 uint64_t flags; 12018 struct ifreq *ifr; 12019 struct lifreq *lifr; 12020 boolean_t set_linklocal = B_FALSE; 12021 boolean_t zero_source = B_FALSE; 12022 ip_stack_t *ipst; 12023 12024 ip1dbg(("ip_sioctl_flags(%s:%u %p)\n", 12025 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12026 12027 ASSERT(IAM_WRITER_IPIF(ipif)); 12028 12029 ill = ipif->ipif_ill; 12030 phyi = ill->ill_phyint; 12031 ipst = ill->ill_ipst; 12032 12033 if (ipip->ipi_cmd_type == IF_CMD) { 12034 ifr = (struct ifreq *)if_req; 12035 flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff); 12036 } else { 12037 lifr = (struct lifreq *)if_req; 12038 flags = lifr->lifr_flags; 12039 } 12040 12041 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags; 12042 12043 /* 12044 * Have the flags been set correctly till now? 12045 */ 12046 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0); 12047 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0); 12048 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0); 12049 /* 12050 * Compare the new flags to the old, and partition 12051 * into those coming on and those going off. 12052 * For the 16 bit command keep the bits above bit 16 unchanged. 12053 */ 12054 if (ipip->ipi_cmd == SIOCSIFFLAGS) 12055 flags |= intf_flags & ~0xFFFF; 12056 12057 /* 12058 * First check which bits will change and then which will 12059 * go on and off 12060 */ 12061 turn_on = (flags ^ intf_flags) & ~IFF_CANTCHANGE; 12062 if (!turn_on) 12063 return (0); /* No change */ 12064 12065 turn_off = intf_flags & turn_on; 12066 turn_on ^= turn_off; 12067 err = 0; 12068 12069 /* 12070 * Don't allow any bits belonging to the logical interface 12071 * to be set or cleared on the replacement ipif that was 12072 * created temporarily during a MOVE. 12073 */ 12074 if (ipif->ipif_replace_zero && 12075 ((turn_on|turn_off) & IFF_LOGINT_FLAGS) != 0) { 12076 return (EINVAL); 12077 } 12078 12079 /* 12080 * Only allow the IFF_XRESOLV and IFF_TEMPORARY flags to be set on 12081 * IPv6 interfaces.
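 * (Worked example for the turn_on/turn_off computation earlier in this
 * function, with illustrative flag values: if intf_flags = UP|BROADCAST
 * and the caller passes flags = BROADCAST|PRIVATE, then
 *
 *	flags ^ intf_flags		= UP|PRIVATE
 *	turn_off = intf_flags & that	= UP
 *	turn_on = that ^ turn_off	= PRIVATE
 *
 * i.e. UP is going off and PRIVATE is coming on.)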
12082 */ 12083 if ((turn_on & (IFF_XRESOLV|IFF_TEMPORARY)) && !(ipif->ipif_isv6)) 12084 return (EINVAL); 12085 12086 /* 12087 * Cannot turn off IFF_NOXMIT on VNI interfaces. 12088 */ 12089 if ((turn_off & IFF_NOXMIT) && IS_VNI(ipif->ipif_ill)) 12090 return (EINVAL); 12091 12092 /* 12093 * Don't allow the IFF_ROUTER flag to be turned on for loopback 12094 * interfaces. It makes no sense in that context. 12095 */ 12096 if ((turn_on & IFF_ROUTER) && (phyi->phyint_flags & PHYI_LOOPBACK)) 12097 return (EINVAL); 12098 12099 if (flags & (IFF_NOLOCAL|IFF_ANYCAST)) 12100 zero_source = B_TRUE; 12101 12102 /* 12103 * For IPv6 ipif_id 0, don't allow the interface to be up without 12104 * a link local address if IFF_NOLOCAL or IFF_ANYCAST are not set. 12105 * If the link local address isn't set, and can be set, it will get 12106 * set later on in this function. 12107 */ 12108 if (ipif->ipif_id == 0 && ipif->ipif_isv6 && 12109 (flags & IFF_UP) && !zero_source && 12110 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) { 12111 if (ipif_cant_setlinklocal(ipif)) 12112 return (EINVAL); 12113 set_linklocal = B_TRUE; 12114 } 12115 12116 /* 12117 * ILL cannot be part of a usesrc group and an IPMP group at the 12118 * same time. No need to grab ill_g_usesrc_lock here, see 12119 * synchronization notes in ip.c 12120 */ 12121 if (turn_on & PHYI_STANDBY && 12122 ipif->ipif_ill->ill_usesrc_grp_next != NULL) { 12123 return (EINVAL); 12124 } 12125 12126 /* 12127 * If we modify physical interface flags, we'll potentially need to 12128 * send up two routing socket messages for the changes (one for the 12129 * IPv4 ill, and another for the IPv6 ill). Note that here. 12130 */ 12131 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS) 12132 phyint_flags_modified = B_TRUE; 12133 12134 /* 12135 * If we are setting or clearing FAILED or STANDBY or OFFLINE, 12136 * we need to flush the IRE_CACHES belonging to this ill. 12137 * We handle this case here without doing the DOWN/UP dance 12138 * like it is done for other flags. If some other flags are 12139 * being turned on/off with FAILED/STANDBY/OFFLINE, the code 12140 * below will handle it by bringing it down and then 12141 * bringing it UP. 12142 */ 12143 if ((turn_on|turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE)) { 12144 ill_t *ill_v4, *ill_v6; 12145 12146 ill_v4 = phyi->phyint_illv4; 12147 ill_v6 = phyi->phyint_illv6; 12148 12149 /* 12150 * First set the INACTIVE flag if needed. Then delete the ires. 12151 * ire_add will atomically prevent creating new IRE_CACHEs 12152 * unless hidden flag is set. 12153 * PHYI_FAILED and PHYI_INACTIVE are exclusive 12154 */ 12155 if ((turn_on & PHYI_FAILED) && 12156 ((intf_flags & PHYI_STANDBY) || 12157 !ipst->ips_ipmp_enable_failback)) { 12158 /* Reset PHYI_INACTIVE when PHYI_FAILED is being set */ 12159 phyi->phyint_flags &= ~PHYI_INACTIVE; 12160 } 12161 if ((turn_off & PHYI_FAILED) && 12162 ((intf_flags & PHYI_STANDBY) || 12163 (!ipst->ips_ipmp_enable_failback && 12164 ill_is_inactive(ill)))) { 12165 phyint_inactive(phyi); 12166 } 12167 12168 if (turn_on & PHYI_STANDBY) { 12169 /* 12170 * We implicitly set INACTIVE only when STANDBY is set. 12171 * INACTIVE is also set on non-STANDBY phyint when user 12172 * disables FAILBACK using configuration file.
12173 * Do not allow STANDBY to be set on such INACTIVE 12174 * phyint 12175 */ 12176 if (phyi->phyint_flags & PHYI_INACTIVE) 12177 return (EINVAL); 12178 if (!(phyi->phyint_flags & PHYI_FAILED)) 12179 phyint_inactive(phyi); 12180 } 12181 if (turn_off & PHYI_STANDBY) { 12182 if (ipst->ips_ipmp_enable_failback) { 12183 /* 12184 * Reset PHYI_INACTIVE. 12185 */ 12186 phyi->phyint_flags &= ~PHYI_INACTIVE; 12187 } else if (ill_is_inactive(ill) && 12188 !(phyi->phyint_flags & PHYI_FAILED)) { 12189 /* 12190 * Need to set INACTIVE, when user sets 12191 * STANDBY on a non-STANDBY phyint and 12192 * later resets STANDBY 12193 */ 12194 phyint_inactive(phyi); 12195 } 12196 } 12197 /* 12198 * We should always send up a message so that the 12199 * daemons come to know of it. Note that the zeroth 12200 * interface can be down and the check below for IPIF_UP 12201 * will not make sense as we are actually setting 12202 * a phyint flag here. We assume that the ipif used 12203 * is always the zeroth ipif. (ip_rts_ifmsg does not 12204 * send up any message for non-zero ipifs). 12205 */ 12206 phyint_flags_modified = B_TRUE; 12207 12208 if (ill_v4 != NULL) { 12209 ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, 12210 IRE_CACHE, ill_stq_cache_delete, 12211 (char *)ill_v4, ill_v4); 12212 illgrp_reset_schednext(ill_v4); 12213 } 12214 if (ill_v6 != NULL) { 12215 ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, 12216 IRE_CACHE, ill_stq_cache_delete, 12217 (char *)ill_v6, ill_v6); 12218 illgrp_reset_schednext(ill_v6); 12219 } 12220 } 12221 12222 /* 12223 * If ILLF_ROUTER changes, we need to change the ip forwarding 12224 * status of the interface and, if the interface is part of an IPMP 12225 * group, all other interfaces that are part of the same IPMP 12226 * group. 12227 */ 12228 if ((turn_on | turn_off) & ILLF_ROUTER) 12229 (void) ill_forward_set(ill, ((turn_on & ILLF_ROUTER) != 0)); 12230 12231 /* 12232 * If the interface is not UP and we are not going to 12233 * bring it UP, record the flags and return. When the 12234 * interface comes UP later, the right actions will be 12235 * taken. 12236 */ 12237 if (!(ipif->ipif_flags & IPIF_UP) && 12238 !(turn_on & IPIF_UP)) { 12239 /* Record new flags in their respective places. */ 12240 mutex_enter(&ill->ill_lock); 12241 mutex_enter(&ill->ill_phyint->phyint_lock); 12242 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS); 12243 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS); 12244 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS); 12245 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS); 12246 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS); 12247 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS); 12248 mutex_exit(&ill->ill_lock); 12249 mutex_exit(&ill->ill_phyint->phyint_lock); 12250 12251 /* 12252 * We do the broadcast and nomination here rather 12253 * than waiting for a FAILOVER/FAILBACK to happen. In 12254 * the case of FAILBACK from INACTIVE standby to the 12255 * interface that has been repaired, PHYI_FAILED has not 12256 * been cleared yet. If there are only two interfaces in 12257 * that group, all we have is a FAILED and INACTIVE 12258 * interface. If we do the nomination soon after a failback, 12259 * the broadcast nomination code would select the 12260 * INACTIVE interface for receiving broadcasts as FAILED is 12261 * not yet cleared. As we don't want STANDBY/INACTIVE to 12262 * receive broadcast packets, we need to redo nomination 12263 * when the FAILED is cleared here. 
Thus, in general we 12264 * always do the nomination here for FAILED, STANDBY 12265 * and OFFLINE. 12266 */ 12267 if (((turn_on | turn_off) & 12268 (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE))) { 12269 ip_redo_nomination(phyi); 12270 } 12271 if (phyint_flags_modified) { 12272 if (phyi->phyint_illv4 != NULL) { 12273 ip_rts_ifmsg(phyi->phyint_illv4-> 12274 ill_ipif); 12275 } 12276 if (phyi->phyint_illv6 != NULL) { 12277 ip_rts_ifmsg(phyi->phyint_illv6-> 12278 ill_ipif); 12279 } 12280 } 12281 return (0); 12282 } else if (set_linklocal || zero_source) { 12283 mutex_enter(&ill->ill_lock); 12284 if (set_linklocal) 12285 ipif->ipif_state_flags |= IPIF_SET_LINKLOCAL; 12286 if (zero_source) 12287 ipif->ipif_state_flags |= IPIF_ZERO_SOURCE; 12288 mutex_exit(&ill->ill_lock); 12289 } 12290 12291 /* 12292 * Disallow IPv6 interfaces coming up that have the unspecified address, 12293 * or point-to-point interfaces with an unspecified destination. We do 12294 * allow the address to be unspecified for IPIF_NOLOCAL interfaces that 12295 * have a subnet assigned, which is how in.ndpd currently manages its 12296 * onlink prefix list when no addresses are configured with those 12297 * prefixes. 12298 */ 12299 if (ipif->ipif_isv6 && 12300 ((IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) && 12301 (!(ipif->ipif_flags & IPIF_NOLOCAL) && !(turn_on & IPIF_NOLOCAL) || 12302 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) || 12303 ((ipif->ipif_flags & IPIF_POINTOPOINT) && 12304 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr)))) { 12305 return (EINVAL); 12306 } 12307 12308 /* 12309 * Prevent IPv4 point-to-point interfaces with a 0.0.0.0 destination 12310 * from being brought up. 12311 */ 12312 if (!ipif->ipif_isv6 && 12313 ((ipif->ipif_flags & IPIF_POINTOPOINT) && 12314 ipif->ipif_pp_dst_addr == INADDR_ANY)) { 12315 return (EINVAL); 12316 } 12317 12318 /* 12319 * The only flag changes that we currently take specific action on 12320 * are IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL, 12321 * ILLF_NOARP, ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, and 12322 * IPIF_PREFERRED. This is done by bringing the ipif down, changing 12323 * the flags and bringing it back up again. 12324 */ 12325 if ((turn_on|turn_off) & 12326 (IPIF_UP|IPIF_DEPRECATED|IPIF_NOXMIT|IPIF_NOLOCAL|ILLF_NOARP| 12327 ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED)) { 12328 /* 12329 * Taking this ipif down, make sure we have 12330 * valid net and subnet bcast ire's for other 12331 * logical interfaces, if we need them.
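 * (The overall sequence for these flags is: logically take the ipif
 * down, apply the flag changes in ip_sioctl_flags_tail(), and bring it
 * back up if it was up.  ipif_down() may return EINPROGRESS, in which
 * case the ioctl completes asynchronously.)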
12332 */ 12333 if (!ipif->ipif_isv6) 12334 ipif_check_bcast_ires(ipif); 12335 12336 if (((ipif->ipif_flags | turn_on) & IPIF_UP) && 12337 !(turn_off & IPIF_UP)) { 12338 need_up = B_TRUE; 12339 if (ipif->ipif_flags & IPIF_UP) 12340 ill->ill_logical_down = 1; 12341 turn_on &= ~IPIF_UP; 12342 } 12343 err = ipif_down(ipif, q, mp); 12344 ip1dbg(("ipif_down returns %d err ", err)); 12345 if (err == EINPROGRESS) 12346 return (err); 12347 ipif_down_tail(ipif); 12348 } 12349 return (ip_sioctl_flags_tail(ipif, flags, q, mp, need_up)); 12350 } 12351 12352 static int 12353 ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp, 12354 boolean_t need_up) 12355 { 12356 ill_t *ill; 12357 phyint_t *phyi; 12358 uint64_t turn_on; 12359 uint64_t turn_off; 12360 uint64_t intf_flags; 12361 boolean_t phyint_flags_modified = B_FALSE; 12362 int err = 0; 12363 boolean_t set_linklocal = B_FALSE; 12364 boolean_t zero_source = B_FALSE; 12365 12366 ip1dbg(("ip_sioctl_flags_tail(%s:%u)\n", 12367 ipif->ipif_ill->ill_name, ipif->ipif_id)); 12368 12369 ASSERT(IAM_WRITER_IPIF(ipif)); 12370 12371 ill = ipif->ipif_ill; 12372 phyi = ill->ill_phyint; 12373 12374 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags; 12375 turn_on = (flags ^ intf_flags) & ~(IFF_CANTCHANGE | IFF_UP); 12376 12377 turn_off = intf_flags & turn_on; 12378 turn_on ^= turn_off; 12379 12380 if ((turn_on|turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE)) 12381 phyint_flags_modified = B_TRUE; 12382 12383 /* 12384 * Now we change the flags. Track current value of 12385 * other flags in their respective places. 12386 */ 12387 mutex_enter(&ill->ill_lock); 12388 mutex_enter(&phyi->phyint_lock); 12389 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS); 12390 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS); 12391 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS); 12392 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS); 12393 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS); 12394 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS); 12395 if (ipif->ipif_state_flags & IPIF_SET_LINKLOCAL) { 12396 set_linklocal = B_TRUE; 12397 ipif->ipif_state_flags &= ~IPIF_SET_LINKLOCAL; 12398 } 12399 if (ipif->ipif_state_flags & IPIF_ZERO_SOURCE) { 12400 zero_source = B_TRUE; 12401 ipif->ipif_state_flags &= ~IPIF_ZERO_SOURCE; 12402 } 12403 mutex_exit(&ill->ill_lock); 12404 mutex_exit(&phyi->phyint_lock); 12405 12406 if (((turn_on | turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE))) 12407 ip_redo_nomination(phyi); 12408 12409 if (set_linklocal) 12410 (void) ipif_setlinklocal(ipif); 12411 12412 if (zero_source) 12413 ipif->ipif_v6src_addr = ipv6_all_zeros; 12414 else 12415 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; 12416 12417 if (need_up) { 12418 /* 12419 * XXX ipif_up really does not know whether a phyint flag 12420 * was modified or not. So, it sends up information on 12421 * only one routing socket message. As we don't bring up 12422 * the interface and also set STANDBY/FAILED simultaneously, 12423 * it should be okay. 12424 */ 12425 err = ipif_up(ipif, q, mp); 12426 } else { 12427 /* 12428 * Make sure routing socket sees all changes to the flags. 12429 * ipif_up_done* handles this when we use ipif_up. 12430
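 * (ip_rts_ifmsg() below generates a routing socket message for the
 * flags change; the assumption -- not shown in this file -- is that
 * daemons such as in.mpathd and routing daemons track interface state
 * by listening for these messages.)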
12430 */ 12431 if (phyint_flags_modified) { 12432 if (phyi->phyint_illv4 != NULL) { 12433 ip_rts_ifmsg(phyi->phyint_illv4-> 12434 ill_ipif); 12435 } 12436 if (phyi->phyint_illv6 != NULL) { 12437 ip_rts_ifmsg(phyi->phyint_illv6-> 12438 ill_ipif); 12439 } 12440 } else { 12441 ip_rts_ifmsg(ipif); 12442 } 12443 /* 12444 * Update the flags in SCTP's IPIF list, ipif_up() will do 12445 * this in need_up case. 12446 */ 12447 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 12448 } 12449 return (err); 12450 } 12451 12452 /* 12453 * Restart entry point to restart the flags restart operation after the 12454 * refcounts have dropped to zero. 12455 */ 12456 /* ARGSUSED */ 12457 int 12458 ip_sioctl_flags_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12459 ip_ioctl_cmd_t *ipip, void *if_req) 12460 { 12461 int err; 12462 struct ifreq *ifr = (struct ifreq *)if_req; 12463 struct lifreq *lifr = (struct lifreq *)if_req; 12464 12465 ip1dbg(("ip_sioctl_flags_restart(%s:%u %p)\n", 12466 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12467 12468 ipif_down_tail(ipif); 12469 if (ipip->ipi_cmd_type == IF_CMD) { 12470 /* 12471 * Since ip_sioctl_flags expects an int and ifr_flags 12472 * is a short we need to cast ifr_flags into an int 12473 * to avoid having sign extension cause bits to get 12474 * set that should not be. 12475 */ 12476 err = ip_sioctl_flags_tail(ipif, 12477 (uint64_t)(ifr->ifr_flags & 0x0000ffff), 12478 q, mp, B_TRUE); 12479 } else { 12480 err = ip_sioctl_flags_tail(ipif, lifr->lifr_flags, 12481 q, mp, B_TRUE); 12482 } 12483 return (err); 12484 } 12485 12486 /* 12487 * Can operate on either a module or a driver queue. 12488 */ 12489 /* ARGSUSED */ 12490 int 12491 ip_sioctl_get_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12492 ip_ioctl_cmd_t *ipip, void *if_req) 12493 { 12494 /* 12495 * Has the flags been set correctly till now ? 12496 */ 12497 ill_t *ill = ipif->ipif_ill; 12498 phyint_t *phyi = ill->ill_phyint; 12499 12500 ip1dbg(("ip_sioctl_get_flags(%s:%u %p)\n", 12501 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12502 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0); 12503 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0); 12504 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0); 12505 12506 /* 12507 * Need a lock since some flags can be set even when there are 12508 * references to the ipif. 12509 */ 12510 mutex_enter(&ill->ill_lock); 12511 if (ipip->ipi_cmd_type == IF_CMD) { 12512 struct ifreq *ifr = (struct ifreq *)if_req; 12513 12514 /* Get interface flags (low 16 only). */ 12515 ifr->ifr_flags = ((ipif->ipif_flags | 12516 ill->ill_flags | phyi->phyint_flags) & 0xffff); 12517 } else { 12518 struct lifreq *lifr = (struct lifreq *)if_req; 12519 12520 /* Get interface flags. 
*/ 12521 lifr->lifr_flags = ipif->ipif_flags | 12522 ill->ill_flags | phyi->phyint_flags; 12523 } 12524 mutex_exit(&ill->ill_lock); 12525 return (0); 12526 } 12527 12528 /* ARGSUSED */ 12529 int 12530 ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12531 ip_ioctl_cmd_t *ipip, void *if_req) 12532 { 12533 int mtu; 12534 int ip_min_mtu; 12535 struct ifreq *ifr; 12536 struct lifreq *lifr; 12537 ire_t *ire; 12538 ip_stack_t *ipst; 12539 12540 ip1dbg(("ip_sioctl_mtu(%s:%u %p)\n", ipif->ipif_ill->ill_name, 12541 ipif->ipif_id, (void *)ipif)); 12542 if (ipip->ipi_cmd_type == IF_CMD) { 12543 ifr = (struct ifreq *)if_req; 12544 mtu = ifr->ifr_metric; 12545 } else { 12546 lifr = (struct lifreq *)if_req; 12547 mtu = lifr->lifr_mtu; 12548 } 12549 12550 if (ipif->ipif_isv6) 12551 ip_min_mtu = IPV6_MIN_MTU; 12552 else 12553 ip_min_mtu = IP_MIN_MTU; 12554 12555 if (mtu > ipif->ipif_ill->ill_max_frag || mtu < ip_min_mtu) 12556 return (EINVAL); 12557 12558 /* 12559 * Change the MTU size in all relevant ire's. 12560 * Mtu change Vs. new ire creation - protocol below. 12561 * First change ipif_mtu and the ire_max_frag of the 12562 * interface ire. Then do an ire walk and change the 12563 * ire_max_frag of all affected ires. During ire_add 12564 * under the bucket lock, set the ire_max_frag of the 12565 * new ire being created from the ipif/ire from which 12566 * it is being derived. If an mtu change happens after 12567 * the ire is added, the new ire will be cleaned up. 12568 * Conversely if the mtu change happens before the ire 12569 * is added, ire_add will see the new value of the mtu. 12570 */ 12571 ipif->ipif_mtu = mtu; 12572 ipif->ipif_flags |= IPIF_FIXEDMTU; 12573 12574 if (ipif->ipif_isv6) 12575 ire = ipif_to_ire_v6(ipif); 12576 else 12577 ire = ipif_to_ire(ipif); 12578 if (ire != NULL) { 12579 ire->ire_max_frag = ipif->ipif_mtu; 12580 ire_refrele(ire); 12581 } 12582 ipst = ipif->ipif_ill->ill_ipst; 12583 if (ipif->ipif_flags & IPIF_UP) { 12584 if (ipif->ipif_isv6) 12585 ire_walk_v6(ipif_mtu_change, (char *)ipif, ALL_ZONES, 12586 ipst); 12587 else 12588 ire_walk_v4(ipif_mtu_change, (char *)ipif, ALL_ZONES, 12589 ipst); 12590 } 12591 /* Update the MTU in SCTP's list */ 12592 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 12593 return (0); 12594 } 12595 12596 /* Get interface MTU. */ 12597 /* ARGSUSED */ 12598 int 12599 ip_sioctl_get_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12600 ip_ioctl_cmd_t *ipip, void *if_req) 12601 { 12602 struct ifreq *ifr; 12603 struct lifreq *lifr; 12604 12605 ip1dbg(("ip_sioctl_get_mtu(%s:%u %p)\n", 12606 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12607 if (ipip->ipi_cmd_type == IF_CMD) { 12608 ifr = (struct ifreq *)if_req; 12609 ifr->ifr_metric = ipif->ipif_mtu; 12610 } else { 12611 lifr = (struct lifreq *)if_req; 12612 lifr->lifr_mtu = ipif->ipif_mtu; 12613 } 12614 return (0); 12615 } 12616 12617 /* Set interface broadcast address. 
*/ 12618 /* ARGSUSED2 */ 12619 int 12620 ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12621 ip_ioctl_cmd_t *ipip, void *if_req) 12622 { 12623 ipaddr_t addr; 12624 ire_t *ire; 12625 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 12626 12627 ip1dbg(("ip_sioctl_brdaddr(%s:%u)\n", ipif->ipif_ill->ill_name, 12628 ipif->ipif_id)); 12629 12630 ASSERT(IAM_WRITER_IPIF(ipif)); 12631 if (!(ipif->ipif_flags & IPIF_BROADCAST)) 12632 return (EADDRNOTAVAIL); 12633 12634 ASSERT(!(ipif->ipif_isv6)); /* No IPv6 broadcast */ 12635 12636 if (sin->sin_family != AF_INET) 12637 return (EAFNOSUPPORT); 12638 12639 addr = sin->sin_addr.s_addr; 12640 if (ipif->ipif_flags & IPIF_UP) { 12641 /* 12642 * If we are already up, make sure the new 12643 * broadcast address makes sense. If it does, 12644 * there should be an IRE for it already. 12645 * Don't match on ipif, only on the ill 12646 * since we are sharing these now. Don't use 12647 * MATCH_IRE_ILL_GROUP as we are looking for 12648 * the broadcast ire on this ill and each ill 12649 * in the group has its own broadcast ire. 12650 */ 12651 ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, 12652 ipif, ALL_ZONES, NULL, 12653 (MATCH_IRE_ILL | MATCH_IRE_TYPE), ipst); 12654 if (ire == NULL) { 12655 return (EINVAL); 12656 } else { 12657 ire_refrele(ire); 12658 } 12659 } 12660 /* 12661 * Changing the broadcast addr for this ipif. 12662 * Make sure we have valid net and subnet bcast 12663 * ire's for other logical interfaces, if needed. 12664 */ 12665 if (addr != ipif->ipif_brd_addr) 12666 ipif_check_bcast_ires(ipif); 12667 IN6_IPADDR_TO_V4MAPPED(addr, &ipif->ipif_v6brd_addr); 12668 return (0); 12669 } 12670 12671 /* Get interface broadcast address. */ 12672 /* ARGSUSED */ 12673 int 12674 ip_sioctl_get_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12675 ip_ioctl_cmd_t *ipip, void *if_req) 12676 { 12677 ip1dbg(("ip_sioctl_get_brdaddr(%s:%u %p)\n", 12678 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12679 if (!(ipif->ipif_flags & IPIF_BROADCAST)) 12680 return (EADDRNOTAVAIL); 12681 12682 /* IPIF_BROADCAST not possible with IPv6 */ 12683 ASSERT(!ipif->ipif_isv6); 12684 *sin = sin_null; 12685 sin->sin_family = AF_INET; 12686 sin->sin_addr.s_addr = ipif->ipif_brd_addr; 12687 return (0); 12688 } 12689 12690 /* 12691 * This routine is called to handle the SIOCS*IFNETMASK IOCTL. 12692 */ 12693 /* ARGSUSED */ 12694 int 12695 ip_sioctl_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12696 ip_ioctl_cmd_t *ipip, void *if_req) 12697 { 12698 int err = 0; 12699 in6_addr_t v6mask; 12700 12701 ip1dbg(("ip_sioctl_netmask(%s:%u %p)\n", 12702 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12703 12704 ASSERT(IAM_WRITER_IPIF(ipif)); 12705 12706 if (ipif->ipif_isv6) { 12707 sin6_t *sin6; 12708 12709 if (sin->sin_family != AF_INET6) 12710 return (EAFNOSUPPORT); 12711 12712 sin6 = (sin6_t *)sin; 12713 v6mask = sin6->sin6_addr; 12714 } else { 12715 ipaddr_t mask; 12716 12717 if (sin->sin_family != AF_INET) 12718 return (EAFNOSUPPORT); 12719 12720 mask = sin->sin_addr.s_addr; 12721 V4MASK_TO_V6(mask, v6mask); 12722 } 12723 12724 /* 12725 * No big deal if the interface isn't already up, or the mask 12726 * isn't really changing, or this is pt-pt. 
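 * (For example, with illustrative values: a local address of 192.168.5.7
 * under a new mask of 255.255.255.0 yields an ipif_v6subnet equal to the
 * V4-mapped form of 192.168.5.0 via the V6_MASK_COPY() below.)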
12727 */ 12728 if (!(ipif->ipif_flags & IPIF_UP) || 12729 IN6_ARE_ADDR_EQUAL(&v6mask, &ipif->ipif_v6net_mask) || 12730 (ipif->ipif_flags & IPIF_POINTOPOINT)) { 12731 ipif->ipif_v6net_mask = v6mask; 12732 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 12733 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 12734 ipif->ipif_v6net_mask, 12735 ipif->ipif_v6subnet); 12736 } 12737 return (0); 12738 } 12739 /* 12740 * Make sure we have valid net and subnet broadcast ire's 12741 * for the old netmask, if needed by other logical interfaces. 12742 */ 12743 if (!ipif->ipif_isv6) 12744 ipif_check_bcast_ires(ipif); 12745 12746 err = ipif_logical_down(ipif, q, mp); 12747 if (err == EINPROGRESS) 12748 return (err); 12749 ipif_down_tail(ipif); 12750 err = ip_sioctl_netmask_tail(ipif, sin, q, mp); 12751 return (err); 12752 } 12753 12754 static int 12755 ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp) 12756 { 12757 in6_addr_t v6mask; 12758 int err = 0; 12759 12760 ip1dbg(("ip_sioctl_netmask_tail(%s:%u %p)\n", 12761 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12762 12763 if (ipif->ipif_isv6) { 12764 sin6_t *sin6; 12765 12766 sin6 = (sin6_t *)sin; 12767 v6mask = sin6->sin6_addr; 12768 } else { 12769 ipaddr_t mask; 12770 12771 mask = sin->sin_addr.s_addr; 12772 V4MASK_TO_V6(mask, v6mask); 12773 } 12774 12775 ipif->ipif_v6net_mask = v6mask; 12776 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 12777 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 12778 ipif->ipif_v6subnet); 12779 } 12780 err = ipif_up(ipif, q, mp); 12781 12782 if (err == 0 || err == EINPROGRESS) { 12783 /* 12784 * The interface must be DL_BOUND if this packet has to 12785 * go out on the wire. Since we only go through a logical 12786 * down and are bound with the driver during an internal 12787 * down/up that is satisfied. 12788 */ 12789 if (!ipif->ipif_isv6 && ipif->ipif_ill->ill_wq != NULL) { 12790 /* Potentially broadcast an address mask reply. */ 12791 ipif_mask_reply(ipif); 12792 } 12793 } 12794 return (err); 12795 } 12796 12797 /* ARGSUSED */ 12798 int 12799 ip_sioctl_netmask_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12800 ip_ioctl_cmd_t *ipip, void *if_req) 12801 { 12802 ip1dbg(("ip_sioctl_netmask_restart(%s:%u %p)\n", 12803 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12804 ipif_down_tail(ipif); 12805 return (ip_sioctl_netmask_tail(ipif, sin, q, mp)); 12806 } 12807 12808 /* Get interface net mask. */ 12809 /* ARGSUSED */ 12810 int 12811 ip_sioctl_get_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12812 ip_ioctl_cmd_t *ipip, void *if_req) 12813 { 12814 struct lifreq *lifr = (struct lifreq *)if_req; 12815 struct sockaddr_in6 *sin6 = (sin6_t *)sin; 12816 12817 ip1dbg(("ip_sioctl_get_netmask(%s:%u %p)\n", 12818 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12819 12820 /* 12821 * net mask can't change since we have a reference to the ipif. 
12822 */ 12823 if (ipif->ipif_isv6) { 12824 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 12825 *sin6 = sin6_null; 12826 sin6->sin6_family = AF_INET6; 12827 sin6->sin6_addr = ipif->ipif_v6net_mask; 12828 lifr->lifr_addrlen = 12829 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 12830 } else { 12831 *sin = sin_null; 12832 sin->sin_family = AF_INET; 12833 sin->sin_addr.s_addr = ipif->ipif_net_mask; 12834 if (ipip->ipi_cmd_type == LIF_CMD) { 12835 lifr->lifr_addrlen = 12836 ip_mask_to_plen(ipif->ipif_net_mask); 12837 } 12838 } 12839 return (0); 12840 } 12841 12842 /* ARGSUSED */ 12843 int 12844 ip_sioctl_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12845 ip_ioctl_cmd_t *ipip, void *if_req) 12846 { 12847 12848 ip1dbg(("ip_sioctl_metric(%s:%u %p)\n", 12849 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12850 /* 12851 * Set interface metric. We don't use this for 12852 * anything but we keep track of it in case it is 12853 * important to routing applications or such. 12854 */ 12855 if (ipip->ipi_cmd_type == IF_CMD) { 12856 struct ifreq *ifr; 12857 12858 ifr = (struct ifreq *)if_req; 12859 ipif->ipif_metric = ifr->ifr_metric; 12860 } else { 12861 struct lifreq *lifr; 12862 12863 lifr = (struct lifreq *)if_req; 12864 ipif->ipif_metric = lifr->lifr_metric; 12865 } 12866 return (0); 12867 } 12868 12869 12870 /* ARGSUSED */ 12871 int 12872 ip_sioctl_get_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12873 ip_ioctl_cmd_t *ipip, void *if_req) 12874 { 12875 12876 /* Get interface metric. */ 12877 ip1dbg(("ip_sioctl_get_metric(%s:%u %p)\n", 12878 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12879 if (ipip->ipi_cmd_type == IF_CMD) { 12880 struct ifreq *ifr; 12881 12882 ifr = (struct ifreq *)if_req; 12883 ifr->ifr_metric = ipif->ipif_metric; 12884 } else { 12885 struct lifreq *lifr; 12886 12887 lifr = (struct lifreq *)if_req; 12888 lifr->lifr_metric = ipif->ipif_metric; 12889 } 12890 12891 return (0); 12892 } 12893 12894 /* ARGSUSED */ 12895 int 12896 ip_sioctl_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12897 ip_ioctl_cmd_t *ipip, void *if_req) 12898 { 12899 12900 ip1dbg(("ip_sioctl_muxid(%s:%u %p)\n", 12901 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12902 /* 12903 * Set the muxid returned from I_PLINK. 12904 */ 12905 if (ipip->ipi_cmd_type == IF_CMD) { 12906 struct ifreq *ifr = (struct ifreq *)if_req; 12907 12908 ipif->ipif_ill->ill_ip_muxid = ifr->ifr_ip_muxid; 12909 ipif->ipif_ill->ill_arp_muxid = ifr->ifr_arp_muxid; 12910 } else { 12911 struct lifreq *lifr = (struct lifreq *)if_req; 12912 12913 ipif->ipif_ill->ill_ip_muxid = lifr->lifr_ip_muxid; 12914 ipif->ipif_ill->ill_arp_muxid = lifr->lifr_arp_muxid; 12915 } 12916 return (0); 12917 } 12918 12919 /* ARGSUSED */ 12920 int 12921 ip_sioctl_get_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12922 ip_ioctl_cmd_t *ipip, void *if_req) 12923 { 12924 12925 ip1dbg(("ip_sioctl_get_muxid(%s:%u %p)\n", 12926 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12927 /* 12928 * Get the muxid saved in ill for I_PUNLINK. 
12929 */ 12930 if (ipip->ipi_cmd_type == IF_CMD) { 12931 struct ifreq *ifr = (struct ifreq *)if_req; 12932 12933 ifr->ifr_ip_muxid = ipif->ipif_ill->ill_ip_muxid; 12934 ifr->ifr_arp_muxid = ipif->ipif_ill->ill_arp_muxid; 12935 } else { 12936 struct lifreq *lifr = (struct lifreq *)if_req; 12937 12938 lifr->lifr_ip_muxid = ipif->ipif_ill->ill_ip_muxid; 12939 lifr->lifr_arp_muxid = ipif->ipif_ill->ill_arp_muxid; 12940 } 12941 return (0); 12942 } 12943 12944 /* 12945 * Set the subnet prefix. Does not modify the broadcast address. 12946 */ 12947 /* ARGSUSED */ 12948 int 12949 ip_sioctl_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12950 ip_ioctl_cmd_t *ipip, void *if_req) 12951 { 12952 int err = 0; 12953 in6_addr_t v6addr; 12954 in6_addr_t v6mask; 12955 boolean_t need_up = B_FALSE; 12956 int addrlen; 12957 12958 ip1dbg(("ip_sioctl_subnet(%s:%u %p)\n", 12959 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12960 12961 ASSERT(IAM_WRITER_IPIF(ipif)); 12962 addrlen = ((struct lifreq *)if_req)->lifr_addrlen; 12963 12964 if (ipif->ipif_isv6) { 12965 sin6_t *sin6; 12966 12967 if (sin->sin_family != AF_INET6) 12968 return (EAFNOSUPPORT); 12969 12970 sin6 = (sin6_t *)sin; 12971 v6addr = sin6->sin6_addr; 12972 if (!ip_remote_addr_ok_v6(&v6addr, &ipv6_all_ones)) 12973 return (EADDRNOTAVAIL); 12974 } else { 12975 ipaddr_t addr; 12976 12977 if (sin->sin_family != AF_INET) 12978 return (EAFNOSUPPORT); 12979 12980 addr = sin->sin_addr.s_addr; 12981 if (!ip_addr_ok_v4(addr, 0xFFFFFFFF)) 12982 return (EADDRNOTAVAIL); 12983 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 12984 /* Add 96 bits */ 12985 addrlen += IPV6_ABITS - IP_ABITS; 12986 } 12987 12988 if (ip_plen_to_mask_v6(addrlen, &v6mask) == NULL) 12989 return (EINVAL); 12990 12991 /* Check if bits in the address is set past the mask */ 12992 if (!V6_MASK_EQ(v6addr, v6mask, v6addr)) 12993 return (EINVAL); 12994 12995 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6subnet, &v6addr) && 12996 IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6net_mask, &v6mask)) 12997 return (0); /* No change */ 12998 12999 if (ipif->ipif_flags & IPIF_UP) { 13000 /* 13001 * If the interface is already marked up, 13002 * we call ipif_down which will take care 13003 * of ditching any IREs that have been set 13004 * up based on the old interface address. 13005 */ 13006 err = ipif_logical_down(ipif, q, mp); 13007 if (err == EINPROGRESS) 13008 return (err); 13009 ipif_down_tail(ipif); 13010 need_up = B_TRUE; 13011 } 13012 13013 err = ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, need_up); 13014 return (err); 13015 } 13016 13017 static int 13018 ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t v6addr, in6_addr_t v6mask, 13019 queue_t *q, mblk_t *mp, boolean_t need_up) 13020 { 13021 ill_t *ill = ipif->ipif_ill; 13022 int err = 0; 13023 13024 ip1dbg(("ip_sioctl_subnet_tail(%s:%u %p)\n", 13025 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 13026 13027 /* Set the new address. */ 13028 mutex_enter(&ill->ill_lock); 13029 ipif->ipif_v6net_mask = v6mask; 13030 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 13031 V6_MASK_COPY(v6addr, ipif->ipif_v6net_mask, 13032 ipif->ipif_v6subnet); 13033 } 13034 mutex_exit(&ill->ill_lock); 13035 13036 if (need_up) { 13037 /* 13038 * Now bring the interface back up. If this 13039 * is the only IPIF for the ILL, ipif_up 13040 * will have to re-bind to the device, so 13041 * we may get back EINPROGRESS, in which 13042 * case, this IOCTL will get completed in 13043 * ip_rput_dlpi when we see the DL_BIND_ACK. 
13044 */ 13045 err = ipif_up(ipif, q, mp); 13046 if (err == EINPROGRESS) 13047 return (err); 13048 } 13049 return (err); 13050 } 13051 13052 /* ARGSUSED */ 13053 int 13054 ip_sioctl_subnet_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 13055 ip_ioctl_cmd_t *ipip, void *if_req) 13056 { 13057 int addrlen; 13058 in6_addr_t v6addr; 13059 in6_addr_t v6mask; 13060 struct lifreq *lifr = (struct lifreq *)if_req; 13061 13062 ip1dbg(("ip_sioctl_subnet_restart(%s:%u %p)\n", 13063 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 13064 ipif_down_tail(ipif); 13065 13066 addrlen = lifr->lifr_addrlen; 13067 if (ipif->ipif_isv6) { 13068 sin6_t *sin6; 13069 13070 sin6 = (sin6_t *)sin; 13071 v6addr = sin6->sin6_addr; 13072 } else { 13073 ipaddr_t addr; 13074 13075 addr = sin->sin_addr.s_addr; 13076 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 13077 addrlen += IPV6_ABITS - IP_ABITS; 13078 } 13079 (void) ip_plen_to_mask_v6(addrlen, &v6mask); 13080 13081 return (ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, B_TRUE)); 13082 } 13083 13084 /* ARGSUSED */ 13085 int 13086 ip_sioctl_get_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 13087 ip_ioctl_cmd_t *ipip, void *if_req) 13088 { 13089 struct lifreq *lifr = (struct lifreq *)if_req; 13090 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sin; 13091 13092 ip1dbg(("ip_sioctl_get_subnet(%s:%u %p)\n", 13093 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 13094 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 13095 13096 if (ipif->ipif_isv6) { 13097 *sin6 = sin6_null; 13098 sin6->sin6_family = AF_INET6; 13099 sin6->sin6_addr = ipif->ipif_v6subnet; 13100 lifr->lifr_addrlen = 13101 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 13102 } else { 13103 *sin = sin_null; 13104 sin->sin_family = AF_INET; 13105 sin->sin_addr.s_addr = ipif->ipif_subnet; 13106 lifr->lifr_addrlen = ip_mask_to_plen(ipif->ipif_net_mask); 13107 } 13108 return (0); 13109 } 13110 13111 /* 13112 * Set the IPv6 address token. 13113 */ 13114 /* ARGSUSED */ 13115 int 13116 ip_sioctl_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 13117 ip_ioctl_cmd_t *ipi, void *if_req) 13118 { 13119 ill_t *ill = ipif->ipif_ill; 13120 int err; 13121 in6_addr_t v6addr; 13122 in6_addr_t v6mask; 13123 boolean_t need_up = B_FALSE; 13124 int i; 13125 sin6_t *sin6 = (sin6_t *)sin; 13126 struct lifreq *lifr = (struct lifreq *)if_req; 13127 int addrlen; 13128 13129 ip1dbg(("ip_sioctl_token(%s:%u %p)\n", 13130 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 13131 ASSERT(IAM_WRITER_IPIF(ipif)); 13132 13133 addrlen = lifr->lifr_addrlen; 13134 /* Only allow for logical unit zero i.e. not on "le0:17" */ 13135 if (ipif->ipif_id != 0) 13136 return (EINVAL); 13137 13138 if (!ipif->ipif_isv6) 13139 return (EINVAL); 13140 13141 if (addrlen > IPV6_ABITS) 13142 return (EINVAL); 13143 13144 v6addr = sin6->sin6_addr; 13145 13146 /* 13147 * The length of the token is the length from the end. To get 13148 * the proper mask for this, compute the mask of the bits not 13149 * in the token; ie. the prefix, and then xor to get the mask. 
13150 */ 13151 if (ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask) == NULL) 13152 return (EINVAL); 13153 for (i = 0; i < 4; i++) { 13154 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 13155 } 13156 13157 if (V6_MASK_EQ(v6addr, v6mask, ill->ill_token) && 13158 ill->ill_token_length == addrlen) 13159 return (0); /* No change */ 13160 13161 if (ipif->ipif_flags & IPIF_UP) { 13162 err = ipif_logical_down(ipif, q, mp); 13163 if (err == EINPROGRESS) 13164 return (err); 13165 ipif_down_tail(ipif); 13166 need_up = B_TRUE; 13167 } 13168 err = ip_sioctl_token_tail(ipif, sin6, addrlen, q, mp, need_up); 13169 return (err); 13170 } 13171 13172 static int 13173 ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, queue_t *q, 13174 mblk_t *mp, boolean_t need_up) 13175 { 13176 in6_addr_t v6addr; 13177 in6_addr_t v6mask; 13178 ill_t *ill = ipif->ipif_ill; 13179 int i; 13180 int err = 0; 13181 13182 ip1dbg(("ip_sioctl_token_tail(%s:%u %p)\n", 13183 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 13184 v6addr = sin6->sin6_addr; 13185 /* 13186 * The length of the token is the length from the end. To get 13187 * the proper mask for this, compute the mask of the bits not 13188 * in the token; ie. the prefix, and then xor to get the mask. 13189 */ 13190 (void) ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask); 13191 for (i = 0; i < 4; i++) 13192 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 13193 13194 mutex_enter(&ill->ill_lock); 13195 V6_MASK_COPY(v6addr, v6mask, ill->ill_token); 13196 ill->ill_token_length = addrlen; 13197 mutex_exit(&ill->ill_lock); 13198 13199 if (need_up) { 13200 /* 13201 * Now bring the interface back up. If this 13202 * is the only IPIF for the ILL, ipif_up 13203 * will have to re-bind to the device, so 13204 * we may get back EINPROGRESS, in which 13205 * case, this IOCTL will get completed in 13206 * ip_rput_dlpi when we see the DL_BIND_ACK. 13207 */ 13208 err = ipif_up(ipif, q, mp); 13209 if (err == EINPROGRESS) 13210 return (err); 13211 } 13212 return (err); 13213 } 13214 13215 /* ARGSUSED */ 13216 int 13217 ip_sioctl_get_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 13218 ip_ioctl_cmd_t *ipi, void *if_req) 13219 { 13220 ill_t *ill; 13221 sin6_t *sin6 = (sin6_t *)sin; 13222 struct lifreq *lifr = (struct lifreq *)if_req; 13223 13224 ip1dbg(("ip_sioctl_get_token(%s:%u %p)\n", 13225 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 13226 if (ipif->ipif_id != 0) 13227 return (EINVAL); 13228 13229 ill = ipif->ipif_ill; 13230 if (!ill->ill_isv6) 13231 return (ENXIO); 13232 13233 *sin6 = sin6_null; 13234 sin6->sin6_family = AF_INET6; 13235 ASSERT(!IN6_IS_ADDR_V4MAPPED(&ill->ill_token)); 13236 sin6->sin6_addr = ill->ill_token; 13237 lifr->lifr_addrlen = ill->ill_token_length; 13238 return (0); 13239 } 13240 13241 /* 13242 * Set (hardware) link specific information that might override 13243 * what was acquired through the DL_INFO_ACK. 13244 * The logic is as follows. 13245 * 13246 * become exclusive 13247 * set CHANGING flag 13248 * change mtu on affected IREs 13249 * clear CHANGING flag 13250 * 13251 * An ire add that occurs before the CHANGING flag is set will have its mtu 13252 * changed by ip_sioctl_lnkinfo. 13253 * 13254 * During the time the CHANGING flag is set, no new ires will be added to the 13255 * bucket, and ire add will fail (due to the CHANGING flag). 13256 * 13257 * An ire add that occurs after the CHANGING flag has been cleared will have 13258 * the right mtu before it is added to the bucket.
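 *
 * An illustrative timeline:
 *	writer:		set CHANGING -> update mtu on affected IREs
 *			-> clear CHANGING
 *	ire add before:	fixed up by the writer's walk
 *	ire add during:	fails (CHANGING is set)
 *	ire add after:	created with the new mtu already in place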
13259 * 13260 * Obviously only 1 thread can set the CHANGING flag and we need to become 13261 * exclusive to set the flag. 13262 */ 13263 /* ARGSUSED */ 13264 int 13265 ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 13266 ip_ioctl_cmd_t *ipi, void *if_req) 13267 { 13268 ill_t *ill = ipif->ipif_ill; 13269 ipif_t *nipif; 13270 int ip_min_mtu; 13271 boolean_t mtu_walk = B_FALSE; 13272 struct lifreq *lifr = (struct lifreq *)if_req; 13273 lif_ifinfo_req_t *lir; 13274 ire_t *ire; 13275 13276 ip1dbg(("ip_sioctl_lnkinfo(%s:%u %p)\n", 13277 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 13278 lir = &lifr->lifr_ifinfo; 13279 ASSERT(IAM_WRITER_IPIF(ipif)); 13280 13281 /* Only allow for logical unit zero i.e. not on "le0:17" */ 13282 if (ipif->ipif_id != 0) 13283 return (EINVAL); 13284 13285 /* Set interface MTU. */ 13286 if (ipif->ipif_isv6) 13287 ip_min_mtu = IPV6_MIN_MTU; 13288 else 13289 ip_min_mtu = IP_MIN_MTU; 13290 13291 /* 13292 * Verify values before we set anything. Allow zero to 13293 * mean unspecified. 13294 */ 13295 if (lir->lir_maxmtu != 0 && 13296 (lir->lir_maxmtu > ill->ill_max_frag || 13297 lir->lir_maxmtu < ip_min_mtu)) 13298 return (EINVAL); 13299 if (lir->lir_reachtime != 0 && 13300 lir->lir_reachtime > ND_MAX_REACHTIME) 13301 return (EINVAL); 13302 if (lir->lir_reachretrans != 0 && 13303 lir->lir_reachretrans > ND_MAX_REACHRETRANSTIME) 13304 return (EINVAL); 13305 13306 mutex_enter(&ill->ill_lock); 13307 ill->ill_state_flags |= ILL_CHANGING; 13308 for (nipif = ill->ill_ipif; nipif != NULL; 13309 nipif = nipif->ipif_next) { 13310 nipif->ipif_state_flags |= IPIF_CHANGING; 13311 } 13312 13313 mutex_exit(&ill->ill_lock); 13314 13315 if (lir->lir_maxmtu != 0) { 13316 ill->ill_max_mtu = lir->lir_maxmtu; 13317 ill->ill_mtu_userspecified = 1; 13318 mtu_walk = B_TRUE; 13319 } 13320 13321 if (lir->lir_reachtime != 0) 13322 ill->ill_reachable_time = lir->lir_reachtime; 13323 13324 if (lir->lir_reachretrans != 0) 13325 ill->ill_reachable_retrans_time = lir->lir_reachretrans; 13326 13327 ill->ill_max_hops = lir->lir_maxhops; 13328 13329 ill->ill_max_buf = ND_MAX_Q; 13330 13331 if (mtu_walk) { 13332 /* 13333 * Set the MTU on all ipifs associated with this ill except 13334 * for those whose MTU was fixed via SIOCSLIFMTU. 
13335 */ 13336 for (nipif = ill->ill_ipif; nipif != NULL; 13337 nipif = nipif->ipif_next) { 13338 if (nipif->ipif_flags & IPIF_FIXEDMTU) 13339 continue; 13340 13341 nipif->ipif_mtu = ill->ill_max_mtu; 13342 13343 if (!(nipif->ipif_flags & IPIF_UP)) 13344 continue; 13345 13346 if (nipif->ipif_isv6) 13347 ire = ipif_to_ire_v6(nipif); 13348 else 13349 ire = ipif_to_ire(nipif); 13350 if (ire != NULL) { 13351 ire->ire_max_frag = nipif->ipif_mtu; 13352 ire_refrele(ire); 13353 } 13354 if (ill->ill_isv6) { 13355 ire_walk_ill_v6(MATCH_IRE_ILL, 0, 13356 ipif_mtu_change, (char *)nipif, 13357 ill); 13358 } else { 13359 ire_walk_ill_v4(MATCH_IRE_ILL, 0, 13360 ipif_mtu_change, (char *)nipif, 13361 ill); 13362 } 13363 } 13364 } 13365 13366 mutex_enter(&ill->ill_lock); 13367 for (nipif = ill->ill_ipif; nipif != NULL; 13368 nipif = nipif->ipif_next) { 13369 nipif->ipif_state_flags &= ~IPIF_CHANGING; 13370 } 13371 ILL_UNMARK_CHANGING(ill); 13372 mutex_exit(&ill->ill_lock); 13373 13374 return (0); 13375 } 13376 13377 /* ARGSUSED */ 13378 int 13379 ip_sioctl_get_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 13380 ip_ioctl_cmd_t *ipi, void *if_req) 13381 { 13382 struct lif_ifinfo_req *lir; 13383 ill_t *ill = ipif->ipif_ill; 13384 13385 ip1dbg(("ip_sioctl_get_lnkinfo(%s:%u %p)\n", 13386 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 13387 if (ipif->ipif_id != 0) 13388 return (EINVAL); 13389 13390 lir = &((struct lifreq *)if_req)->lifr_ifinfo; 13391 lir->lir_maxhops = ill->ill_max_hops; 13392 lir->lir_reachtime = ill->ill_reachable_time; 13393 lir->lir_reachretrans = ill->ill_reachable_retrans_time; 13394 lir->lir_maxmtu = ill->ill_max_mtu; 13395 13396 return (0); 13397 } 13398 13399 /* 13400 * Return best guess as to the subnet mask for the specified address. 13401 * Based on the subnet masks for all the configured interfaces. 13402 * 13403 * We end up returning a zero mask in the case of default, multicast or 13404 * experimental. 13405 */ 13406 static ipaddr_t 13407 ip_subnet_mask(ipaddr_t addr, ipif_t **ipifp, ip_stack_t *ipst) 13408 { 13409 ipaddr_t net_mask; 13410 ill_t *ill; 13411 ipif_t *ipif; 13412 ill_walk_context_t ctx; 13413 ipif_t *fallback_ipif = NULL; 13414 13415 net_mask = ip_net_mask(addr); 13416 if (net_mask == 0) { 13417 *ipifp = NULL; 13418 return (0); 13419 } 13420 13421 /* Let's check to see if this is maybe a local subnet route. */ 13422 /* This function only applies to IPv4 interfaces. */ 13423 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 13424 ill = ILL_START_WALK_V4(&ctx, ipst); 13425 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 13426 mutex_enter(&ill->ill_lock); 13427 for (ipif = ill->ill_ipif; ipif != NULL; 13428 ipif = ipif->ipif_next) { 13429 if (!IPIF_CAN_LOOKUP(ipif)) 13430 continue; 13431 if (!(ipif->ipif_flags & IPIF_UP)) 13432 continue; 13433 if ((ipif->ipif_subnet & net_mask) == 13434 (addr & net_mask)) { 13435 /* 13436 * Don't trust pt-pt interfaces if there are 13437 * other interfaces. 13438 */ 13439 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 13440 if (fallback_ipif == NULL) { 13441 ipif_refhold_locked(ipif); 13442 fallback_ipif = ipif; 13443 } 13444 continue; 13445 } 13446 13447 /* 13448 * Fine. Just assume the same net mask as the 13449 * directly attached subnet interface is using.
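 *
 * For example (illustrative addresses): with 192.168.10.7/24 configured
 * and up on some ipif, ip_subnet_mask(192.168.10.42, &ipif, ipst) would
 * return 255.255.255.0 and that ipif; if only a pt-pt ipif matches we
 * fall back to it, and if nothing matches we return the classful
 * ip_net_mask() of the address.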
13450 */ 13451 ipif_refhold_locked(ipif); 13452 mutex_exit(&ill->ill_lock); 13453 rw_exit(&ipst->ips_ill_g_lock); 13454 if (fallback_ipif != NULL) 13455 ipif_refrele(fallback_ipif); 13456 *ipifp = ipif; 13457 return (ipif->ipif_net_mask); 13458 } 13459 } 13460 mutex_exit(&ill->ill_lock); 13461 } 13462 rw_exit(&ipst->ips_ill_g_lock); 13463 13464 *ipifp = fallback_ipif; 13465 return ((fallback_ipif != NULL) ? 13466 fallback_ipif->ipif_net_mask : net_mask); 13467 } 13468 13469 /* 13470 * ip_sioctl_copyin_setup calls ip_wput_ioctl to process the IP_IOCTL ioctl. 13471 */ 13472 static void 13473 ip_wput_ioctl(queue_t *q, mblk_t *mp) 13474 { 13475 IOCP iocp; 13476 ipft_t *ipft; 13477 ipllc_t *ipllc; 13478 mblk_t *mp1; 13479 cred_t *cr; 13480 int error = 0; 13481 conn_t *connp; 13482 13483 ip1dbg(("ip_wput_ioctl")); 13484 iocp = (IOCP)mp->b_rptr; 13485 mp1 = mp->b_cont; 13486 if (mp1 == NULL) { 13487 iocp->ioc_error = EINVAL; 13488 mp->b_datap->db_type = M_IOCNAK; 13489 iocp->ioc_count = 0; 13490 qreply(q, mp); 13491 return; 13492 } 13493 13494 /* 13495 * These IOCTLs provide various control capabilities to 13496 * upstream agents such as ULPs and processes. There 13497 * are currently two such IOCTLs implemented. They 13498 * are used by TCP to provide update information for 13499 * existing IREs and to forcibly delete an IRE for a 13500 * host that is not responding, thereby forcing an 13501 * attempt at a new route. 13502 */ 13503 iocp->ioc_error = EINVAL; 13504 if (!pullupmsg(mp1, sizeof (ipllc->ipllc_cmd))) 13505 goto done; 13506 13507 ipllc = (ipllc_t *)mp1->b_rptr; 13508 for (ipft = ip_ioctl_ftbl; ipft->ipft_pfi; ipft++) { 13509 if (ipllc->ipllc_cmd == ipft->ipft_cmd) 13510 break; 13511 } 13512 /* 13513 * Prefer credential from mblk over ioctl; 13514 * see ip_sioctl_copyin_setup 13515 */ 13516 cr = DB_CREDDEF(mp, iocp->ioc_cr); 13517 13518 /* 13519 * Refhold the conn in case the request gets queued up in some lookup. 13520 */ 13521 ASSERT(CONN_Q(q)); 13522 connp = Q_TO_CONN(q); 13523 CONN_INC_REF(connp); 13524 if (ipft->ipft_pfi && 13525 ((mp1->b_wptr - mp1->b_rptr) >= ipft->ipft_min_size || 13526 pullupmsg(mp1, ipft->ipft_min_size))) { 13527 error = (*ipft->ipft_pfi)(q, 13528 (ipft->ipft_flags & IPFT_F_SELF_REPLY) ? mp : mp1, cr); 13529 } 13530 if (ipft->ipft_flags & IPFT_F_SELF_REPLY) { 13531 /* 13532 * CONN_OPER_PENDING_DONE happens in the function called 13533 * through ipft_pfi above. 13534 */ 13535 return; 13536 } 13537 13538 CONN_OPER_PENDING_DONE(connp); 13539 if (ipft->ipft_flags & IPFT_F_NO_REPLY) { 13540 freemsg(mp); 13541 return; 13542 } 13543 iocp->ioc_error = error; 13544 13545 done: 13546 mp->b_datap->db_type = M_IOCACK; 13547 if (iocp->ioc_error) 13548 iocp->ioc_count = 0; 13549 qreply(q, mp); 13550 } 13551 13552 /* 13553 * Look up an ipif using the sequence id (ipif_seqid) 13554 */ 13555 ipif_t * 13556 ipif_lookup_seqid(ill_t *ill, uint_t seqid) 13557 { 13558 ipif_t *ipif; 13559 13560 ASSERT(MUTEX_HELD(&ill->ill_lock)); 13561 13562 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 13563 if (ipif->ipif_seqid == seqid && IPIF_CAN_LOOKUP(ipif)) 13564 return (ipif); 13565 } 13566 return (NULL); 13567 } 13568 13569 /* 13570 * Assign a unique id for the ipif. This is used later when we send 13571 * IREs to ARP for resolution where we initialize ire_ipif_seqid 13572 * to the value pointed to by ire_ipif->ipif_seqid. Later when the 13573 * IRE is added, we verify that ipif has not disappeared.
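 *
 * The id generation itself is lock-free: ipif_assign_seqid() below just
 * does
 *	ipif->ipif_seqid = atomic_add_64_nv(&ipst->ips_ipif_g_seqid, 1);
 * so each call atomically bumps a per-stack 64-bit counter and two
 * concurrent allocations can never observe the same id.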
13574 */ 13575 13576 static void 13577 ipif_assign_seqid(ipif_t *ipif) 13578 { 13579 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 13580 13581 ipif->ipif_seqid = atomic_add_64_nv(&ipst->ips_ipif_g_seqid, 1); 13582 } 13583 13584 /* 13585 * Insert the ipif, so that the list of ipifs on the ill will be sorted 13586 * with respect to ipif_id. Note that an ipif with an ipif_id of -1 will 13587 * be inserted into the first space available in the list. The value of 13588 * ipif_id will then be set to the appropriate value for its position. 13589 */ 13590 static int 13591 ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock) 13592 { 13593 ill_t *ill; 13594 ipif_t *tipif; 13595 ipif_t **tipifp; 13596 int id; 13597 ip_stack_t *ipst; 13598 13599 ASSERT(ipif->ipif_ill->ill_net_type == IRE_LOOPBACK || 13600 IAM_WRITER_IPIF(ipif)); 13601 13602 ill = ipif->ipif_ill; 13603 ASSERT(ill != NULL); 13604 ipst = ill->ill_ipst; 13605 13606 /* 13607 * In the case of lo0:0 we already hold the ill_g_lock. 13608 * ill_lookup_on_name (acquires ill_g_lock) -> ipif_allocate -> 13609 * ipif_insert. Another such caller is ipif_move. 13610 */ 13611 if (acquire_g_lock) 13612 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 13613 if (acquire_ill_lock) 13614 mutex_enter(&ill->ill_lock); 13615 id = ipif->ipif_id; 13616 tipifp = &(ill->ill_ipif); 13617 if (id == -1) { /* need to find a real id */ 13618 id = 0; 13619 while ((tipif = *tipifp) != NULL) { 13620 ASSERT(tipif->ipif_id >= id); 13621 if (tipif->ipif_id != id) 13622 break; /* non-consecutive id */ 13623 id++; 13624 tipifp = &(tipif->ipif_next); 13625 } 13626 /* limit number of logical interfaces */ 13627 if (id >= ipst->ips_ip_addrs_per_if) { 13628 if (acquire_ill_lock) 13629 mutex_exit(&ill->ill_lock); 13630 if (acquire_g_lock) 13631 rw_exit(&ipst->ips_ill_g_lock); 13632 return (-1); 13633 } 13634 ipif->ipif_id = id; /* assign new id */ 13635 } else if (id < ipst->ips_ip_addrs_per_if) { 13636 /* we have a real id; insert ipif in the right place */ 13637 while ((tipif = *tipifp) != NULL) { 13638 ASSERT(tipif->ipif_id != id); 13639 if (tipif->ipif_id > id) 13640 break; /* found correct location */ 13641 tipifp = &(tipif->ipif_next); 13642 } 13643 } else { 13644 if (acquire_ill_lock) 13645 mutex_exit(&ill->ill_lock); 13646 if (acquire_g_lock) 13647 rw_exit(&ipst->ips_ill_g_lock); 13648 return (-1); 13649 } 13650 13651 ASSERT(tipifp != &(ill->ill_ipif) || id == 0); 13652 13653 ipif->ipif_next = tipif; 13654 *tipifp = ipif; 13655 if (acquire_ill_lock) 13656 mutex_exit(&ill->ill_lock); 13657 if (acquire_g_lock) 13658 rw_exit(&ipst->ips_ill_g_lock); 13659 return (0); 13660 } 13661 13662 static void 13663 ipif_remove(ipif_t *ipif, boolean_t acquire_ill_lock) 13664 { 13665 ipif_t **ipifp; 13666 ill_t *ill = ipif->ipif_ill; 13667 13668 ASSERT(RW_WRITE_HELD(&ill->ill_ipst->ips_ill_g_lock)); 13669 if (acquire_ill_lock) 13670 mutex_enter(&ill->ill_lock); 13671 else 13672 ASSERT(MUTEX_HELD(&ill->ill_lock)); 13673 13674 ipifp = &ill->ill_ipif; 13675 for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) { 13676 if (*ipifp == ipif) { 13677 *ipifp = ipif->ipif_next; 13678 break; 13679 } 13680 } 13681 13682 if (acquire_ill_lock) 13683 mutex_exit(&ill->ill_lock); 13684 } 13685 13686 /* 13687 * Allocate and initialize a new interface control structure. (Always 13688 * called as writer.) 13689 * When ipif_allocate() is called from ip_ll_subnet_defaults, the ill 13690 * is not part of the global linked list of ills. 
ipif_seqid is unique 13691 * in the system and to preserve the uniqueness, it is assigned only 13692 * when ill becomes part of the global list. At that point ill will 13693 * have a name. If it doesn't get assigned here, it will get assigned 13694 * in ipif_set_values() as part of SIOCSLIFNAME processing. 13695 * Additionally, if we come here from ip_ll_subnet_defaults, we don't set 13696 * the interface flags or any other information from the DL_INFO_ACK for 13697 * DL_STYLE2 drivers (initialize == B_FALSE), since we won't have them at 13698 * this point. The flags etc. will be set in ip_ll_subnet_defaults when the 13699 * second DL_INFO_ACK comes in from the driver. 13700 */ 13701 static ipif_t * 13702 ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize) 13703 { 13704 ipif_t *ipif; 13705 phyint_t *phyi; 13706 13707 ip1dbg(("ipif_allocate(%s:%d ill %p)\n", 13708 ill->ill_name, id, (void *)ill)); 13709 ASSERT(ire_type == IRE_LOOPBACK || IAM_WRITER_ILL(ill)); 13710 13711 if ((ipif = (ipif_t *)mi_alloc(sizeof (ipif_t), BPRI_MED)) == NULL) 13712 return (NULL); 13713 *ipif = ipif_zero; /* start clean */ 13714 13715 ipif->ipif_ill = ill; 13716 ipif->ipif_id = id; /* could be -1 */ 13717 /* 13718 * Inherit the zoneid from the ill; for the shared stack instance 13719 * this is always the global zone. 13720 */ 13721 ipif->ipif_zoneid = ill->ill_zoneid; 13722 13723 mutex_init(&ipif->ipif_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL); 13724 13725 ipif->ipif_refcnt = 0; 13726 ipif->ipif_saved_ire_cnt = 0; 13727 13728 if (ipif_insert(ipif, ire_type != IRE_LOOPBACK, B_TRUE)) { 13729 mi_free(ipif); 13730 return (NULL); 13731 } 13732 /* -1 id should have been replaced by real id */ 13733 id = ipif->ipif_id; 13734 ASSERT(id >= 0); 13735 13736 if (ill->ill_name[0] != '\0') 13737 ipif_assign_seqid(ipif); 13738 13739 /* 13740 * Keep a copy of original id in ipif_orig_ipifid. Failback 13741 * will attempt to restore the original id. The SIOCSLIFOINDEX 13742 * ioctl sets ipif_orig_ipifid to zero. 13743 */ 13744 ipif->ipif_orig_ipifid = id; 13745 13746 /* 13747 * We grab the ill_lock and phyint_lock to protect the flag changes. 13748 * The ipif is still not up and can't be looked up until the 13749 * ioctl completes and the IPIF_CHANGING flag is cleared. 13750 */ 13751 mutex_enter(&ill->ill_lock); 13752 mutex_enter(&ill->ill_phyint->phyint_lock); 13753 /* 13754 * Set the running flag when logical interface zero is created. 13755 * For subsequent logical interfaces, a DLPI link down 13756 * notification message may have cleared the running flag to 13757 * indicate the link is down, so we shouldn't just blindly set it.
13758 */ 13759 if (id == 0) 13760 ill->ill_phyint->phyint_flags |= PHYI_RUNNING; 13761 ipif->ipif_ire_type = ire_type; 13762 phyi = ill->ill_phyint; 13763 ipif->ipif_orig_ifindex = phyi->phyint_ifindex; 13764 13765 if (ipif->ipif_isv6) { 13766 ill->ill_flags |= ILLF_IPV6; 13767 } else { 13768 ipaddr_t inaddr_any = INADDR_ANY; 13769 13770 ill->ill_flags |= ILLF_IPV4; 13771 13772 /* Keep the IN6_IS_ADDR_V4MAPPED assertions happy */ 13773 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 13774 &ipif->ipif_v6lcl_addr); 13775 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 13776 &ipif->ipif_v6src_addr); 13777 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 13778 &ipif->ipif_v6subnet); 13779 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 13780 &ipif->ipif_v6net_mask); 13781 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 13782 &ipif->ipif_v6brd_addr); 13783 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 13784 &ipif->ipif_v6pp_dst_addr); 13785 } 13786 13787 /* 13788 * Don't set the interface flags etc. now, will do it in 13789 * ip_ll_subnet_defaults. 13790 */ 13791 if (!initialize) { 13792 mutex_exit(&ill->ill_lock); 13793 mutex_exit(&ill->ill_phyint->phyint_lock); 13794 return (ipif); 13795 } 13796 ipif->ipif_mtu = ill->ill_max_mtu; 13797 13798 if (ill->ill_bcast_addr_length != 0) { 13799 /* 13800 * Later detect lack of DLPI driver multicast 13801 * capability by catching DL_ENABMULTI errors in 13802 * ip_rput_dlpi. 13803 */ 13804 ill->ill_flags |= ILLF_MULTICAST; 13805 if (!ipif->ipif_isv6) 13806 ipif->ipif_flags |= IPIF_BROADCAST; 13807 } else { 13808 if (ill->ill_net_type != IRE_LOOPBACK) { 13809 if (ipif->ipif_isv6) 13810 /* 13811 * Note: xresolv interfaces will eventually need 13812 * NOARP set here as well, but that will require 13813 * those external resolvers to have some 13814 * knowledge of that flag and act appropriately. 13815 * Not to be changed at present. 13816 */ 13817 ill->ill_flags |= ILLF_NONUD; 13818 else 13819 ill->ill_flags |= ILLF_NOARP; 13820 } 13821 if (ill->ill_phys_addr_length == 0) { 13822 if (ill->ill_media && 13823 ill->ill_media->ip_m_mac_type == SUNW_DL_VNI) { 13824 ipif->ipif_flags |= IPIF_NOXMIT; 13825 phyi->phyint_flags |= PHYI_VIRTUAL; 13826 } else { 13827 /* pt-pt supports multicast. */ 13828 ill->ill_flags |= ILLF_MULTICAST; 13829 if (ill->ill_net_type == IRE_LOOPBACK) { 13830 phyi->phyint_flags |= 13831 (PHYI_LOOPBACK | PHYI_VIRTUAL); 13832 } else { 13833 ipif->ipif_flags |= IPIF_POINTOPOINT; 13834 } 13835 } 13836 } 13837 } 13838 mutex_exit(&ill->ill_lock); 13839 mutex_exit(&ill->ill_phyint->phyint_lock); 13840 return (ipif); 13841 } 13842 13843 /* 13844 * If appropriate, send a message up to the resolver to delete the entry 13845 * for the address of this interface which is going out of business. 13846 * (Always called as writer). 13847 * 13848 * NOTE: We need to check for NULL mps as some of the fields are 13849 * initialized only for some interface types. See ipif_resolver_up() 13850 * for details.
13851 */ 13852 void 13853 ipif_arp_down(ipif_t *ipif) 13854 { 13855 mblk_t *mp; 13856 ill_t *ill = ipif->ipif_ill; 13857 13858 ip1dbg(("ipif_arp_down(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 13859 ASSERT(IAM_WRITER_IPIF(ipif)); 13860 13861 /* Delete the mapping for the local address */ 13862 mp = ipif->ipif_arp_del_mp; 13863 if (mp != NULL) { 13864 ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n", 13865 *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id)); 13866 putnext(ill->ill_rq, mp); 13867 ipif->ipif_arp_del_mp = NULL; 13868 } 13869 13870 /* 13871 * If this is the last ipif that is going down and there are no 13872 * duplicate addresses we may yet attempt to re-probe, then we need to 13873 * clean up ARP completely. 13874 */ 13875 if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0) { 13876 13877 /* Send up AR_INTERFACE_DOWN message */ 13878 mp = ill->ill_arp_down_mp; 13879 if (mp != NULL) { 13880 ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n", 13881 *(unsigned *)mp->b_rptr, ill->ill_name, 13882 ipif->ipif_id)); 13883 putnext(ill->ill_rq, mp); 13884 ill->ill_arp_down_mp = NULL; 13885 } 13886 13887 /* Tell ARP to delete the multicast mappings */ 13888 mp = ill->ill_arp_del_mapping_mp; 13889 if (mp != NULL) { 13890 ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n", 13891 *(unsigned *)mp->b_rptr, ill->ill_name, 13892 ipif->ipif_id)); 13893 putnext(ill->ill_rq, mp); 13894 ill->ill_arp_del_mapping_mp = NULL; 13895 } 13896 } 13897 } 13898 13899 /* 13900 * This function sets up the multicast mappings in ARP. When ipif_resolver_up 13901 * calls this function, it passes a non-NULL arp_add_mapping_mp indicating 13902 * that it wants the add_mp allocated in this function to be returned 13903 * without sending it to ARP. When ip_rput_dlpi_writer calls this to 13904 * just re-do the multicast, it wants us to send the add_mp to ARP also. 13905 * ipif_resolver_up does not want us to do the "add" i.e. sending to ARP, 13906 * as it does an ipif_arp_down after calling this function - which will 13907 * remove what we add here. 13908 * 13909 * Returns -1 on failures and 0 on success. 13910 */ 13911 int 13912 ipif_arp_setup_multicast(ipif_t *ipif, mblk_t **arp_add_mapping_mp) 13913 { 13914 mblk_t *del_mp = NULL; 13915 mblk_t *add_mp = NULL; 13916 mblk_t *mp; 13917 ill_t *ill = ipif->ipif_ill; 13918 phyint_t *phyi = ill->ill_phyint; 13919 ipaddr_t addr, mask, extract_mask = 0; 13920 arma_t *arma; 13921 uint8_t *maddr, *bphys_addr; 13922 uint32_t hw_start; 13923 dl_unitdata_req_t *dlur; 13924 13925 ASSERT(IAM_WRITER_IPIF(ipif)); 13926 if (ipif->ipif_flags & IPIF_POINTOPOINT) 13927 return (0); 13928 13929 /* 13930 * Delete the existing mapping from ARP. Normally ipif_down 13931 * -> ipif_arp_down should send this up to ARP. The only 13932 * reason we would find this is when we are switching from 13933 * Multicast to Broadcast, where we did not do a down. 13934 */ 13935 mp = ill->ill_arp_del_mapping_mp; 13936 if (mp != NULL) { 13937 ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n", 13938 *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id)); 13939 putnext(ill->ill_rq, mp); 13940 ill->ill_arp_del_mapping_mp = NULL; 13941 } 13942 13943 if (arp_add_mapping_mp != NULL) 13944 *arp_add_mapping_mp = NULL; 13945 13946 /* 13947 * Check that the address is not too long for the constant 13948 * length reserved in the template arma_t.
13949 */ 13950 if (ill->ill_phys_addr_length > IP_MAX_HW_LEN) 13951 return (-1); 13952 13953 /* Add mapping mblk */ 13954 addr = (ipaddr_t)htonl(INADDR_UNSPEC_GROUP); 13955 mask = (ipaddr_t)htonl(IN_CLASSD_NET); 13956 add_mp = ill_arp_alloc(ill, (uchar_t *)&ip_arma_multi_template, 13957 (caddr_t)&addr); 13958 if (add_mp == NULL) 13959 return (-1); 13960 arma = (arma_t *)add_mp->b_rptr; 13961 maddr = (uint8_t *)arma + arma->arma_hw_addr_offset; 13962 bcopy(&mask, (char *)arma + arma->arma_proto_mask_offset, IP_ADDR_LEN); 13963 arma->arma_hw_addr_length = ill->ill_phys_addr_length; 13964 13965 /* 13966 * Determine the broadcast address. 13967 */ 13968 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr; 13969 if (ill->ill_sap_length < 0) 13970 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset; 13971 else 13972 bphys_addr = (uchar_t *)dlur + 13973 dlur->dl_dest_addr_offset + ill->ill_sap_length; 13974 /* 13975 * Check PHYI_MULTI_BCAST and length of physical 13976 * address to determine if we use the mapping or the 13977 * broadcast address. 13978 */ 13979 if (!(phyi->phyint_flags & PHYI_MULTI_BCAST)) 13980 if (!MEDIA_V4MINFO(ill->ill_media, ill->ill_phys_addr_length, 13981 bphys_addr, maddr, &hw_start, &extract_mask)) 13982 phyi->phyint_flags |= PHYI_MULTI_BCAST; 13983 13984 if ((phyi->phyint_flags & PHYI_MULTI_BCAST) || 13985 (ill->ill_flags & ILLF_MULTICAST)) { 13986 /* Make sure this will not match the "exact" entry. */ 13987 addr = (ipaddr_t)htonl(INADDR_ALLHOSTS_GROUP); 13988 del_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ared_template, 13989 (caddr_t)&addr); 13990 if (del_mp == NULL) { 13991 freemsg(add_mp); 13992 return (-1); 13993 } 13994 bcopy(&extract_mask, (char *)arma + 13995 arma->arma_proto_extract_mask_offset, IP_ADDR_LEN); 13996 if (phyi->phyint_flags & PHYI_MULTI_BCAST) { 13997 /* Use link-layer broadcast address for MULTI_BCAST */ 13998 bcopy(bphys_addr, maddr, ill->ill_phys_addr_length); 13999 ip2dbg(("ipif_arp_setup_multicast: adding" 14000 " MULTI_BCAST ARP setup for %s\n", ill->ill_name)); 14001 } else { 14002 arma->arma_hw_mapping_start = hw_start; 14003 ip2dbg(("ipif_arp_setup_multicast: adding multicast" 14004 " ARP setup for %s\n", ill->ill_name)); 14005 } 14006 } else { 14007 freemsg(add_mp); 14008 ASSERT(del_mp == NULL); 14009 /* It is neither MULTICAST nor MULTI_BCAST */ 14010 return (0); 14011 } 14012 ASSERT(add_mp != NULL && del_mp != NULL); 14013 ASSERT(ill->ill_arp_del_mapping_mp == NULL); 14014 ill->ill_arp_del_mapping_mp = del_mp; 14015 if (arp_add_mapping_mp != NULL) { 14016 /* The caller just wants the mblks allocated */ 14017 *arp_add_mapping_mp = add_mp; 14018 } else { 14019 /* The caller wants us to send it to arp */ 14020 putnext(ill->ill_rq, add_mp); 14021 } 14022 return (0); 14023 } 14024 14025 /* 14026 * Get the resolver set up for a new interface address. 14027 * (Always called as writer.) 14028 * Called both for IPv4 and IPv6 interfaces, 14029 * though it only sets up the resolver for v6 14030 * if it's an xresolv interface (one using an external resolver). 14031 * Honors ILLF_NOARP. 14032 * The enumerated value res_act is used to tune the behavior. 14033 * If set to Res_act_initial, then we set up all the resolver 14034 * structures for a new interface. If set to Res_act_move, then 14035 * we just send an AR_ENTRY_ADD message up to ARP for IPv4 14036 * interfaces; this is called by ip_rput_dlpi_writer() to handle 14037 * asynchronous hardware address change notification. 
If set to 14038 * Res_act_defend, then we tell ARP that it needs to send a single 14039 * gratuitous message in defense of the address. 14040 * Returns error on failure. 14041 */ 14042 int 14043 ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) 14044 { 14045 caddr_t addr; 14046 mblk_t *arp_up_mp = NULL; 14047 mblk_t *arp_down_mp = NULL; 14048 mblk_t *arp_add_mp = NULL; 14049 mblk_t *arp_del_mp = NULL; 14050 mblk_t *arp_add_mapping_mp = NULL; 14051 mblk_t *arp_del_mapping_mp = NULL; 14052 ill_t *ill = ipif->ipif_ill; 14053 uchar_t *area_p = NULL; 14054 uchar_t *ared_p = NULL; 14055 int err = ENOMEM; 14056 boolean_t was_dup; 14057 14058 ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n", 14059 ill->ill_name, ipif->ipif_id, (uint_t)ipif->ipif_flags)); 14060 ASSERT(IAM_WRITER_IPIF(ipif)); 14061 14062 was_dup = B_FALSE; 14063 if (res_act == Res_act_initial) { 14064 ipif->ipif_addr_ready = 0; 14065 /* 14066 * We're bringing an interface up here. There's no way that we 14067 * should need to shut down ARP now. 14068 */ 14069 mutex_enter(&ill->ill_lock); 14070 if (ipif->ipif_flags & IPIF_DUPLICATE) { 14071 ipif->ipif_flags &= ~IPIF_DUPLICATE; 14072 ill->ill_ipif_dup_count--; 14073 was_dup = B_TRUE; 14074 } 14075 mutex_exit(&ill->ill_lock); 14076 } 14077 if (ipif->ipif_recovery_id != 0) 14078 (void) untimeout(ipif->ipif_recovery_id); 14079 ipif->ipif_recovery_id = 0; 14080 if (ill->ill_net_type != IRE_IF_RESOLVER) { 14081 ipif->ipif_addr_ready = 1; 14082 return (0); 14083 } 14084 /* NDP will set the ipif_addr_ready flag when it's ready */ 14085 if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV)) 14086 return (0); 14087 14088 if (ill->ill_isv6) { 14089 /* 14090 * External resolver for IPv6 14091 */ 14092 ASSERT(res_act == Res_act_initial); 14093 if (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) { 14094 addr = (caddr_t)&ipif->ipif_v6lcl_addr; 14095 area_p = (uchar_t *)&ip6_area_template; 14096 ared_p = (uchar_t *)&ip6_ared_template; 14097 } 14098 } else { 14099 /* 14100 * IPv4 arp case. If the ARP stream has already started 14101 * closing, fail this request for ARP bringup. Else 14102 * record the fact that an ARP bringup is pending. 14103 */ 14104 mutex_enter(&ill->ill_lock); 14105 if (ill->ill_arp_closing) { 14106 mutex_exit(&ill->ill_lock); 14107 err = EINVAL; 14108 goto failed; 14109 } else { 14110 if (ill->ill_ipif_up_count == 0 && 14111 ill->ill_ipif_dup_count == 0 && !was_dup) 14112 ill->ill_arp_bringup_pending = 1; 14113 mutex_exit(&ill->ill_lock); 14114 } 14115 if (ipif->ipif_lcl_addr != INADDR_ANY) { 14116 addr = (caddr_t)&ipif->ipif_lcl_addr; 14117 area_p = (uchar_t *)&ip_area_template; 14118 ared_p = (uchar_t *)&ip_ared_template; 14119 } 14120 } 14121 14122 /* 14123 * Add an entry for the local address in ARP only if it 14124 * is not UNNUMBERED and the address is not INADDR_ANY. 14125 */ 14126 if (!(ipif->ipif_flags & IPIF_UNNUMBERED) && area_p != NULL) { 14127 area_t *area; 14128 14129 /* Now ask ARP to publish our address. */ 14130 arp_add_mp = ill_arp_alloc(ill, area_p, addr); 14131 if (arp_add_mp == NULL) 14132 goto failed; 14133 area = (area_t *)arp_add_mp->b_rptr; 14134 if (res_act != Res_act_initial) { 14135 /* 14136 * Copy the new hardware address and length into 14137 * arp_add_mp to be sent to ARP. 
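 * (area_t messages describe variable-length fields with offset/length
 * pairs relative to the start of the structure, which is why the bcopy()
 * below targets (char *)area + area->area_hw_addr_offset.)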
14138 */ 14139 area->area_hw_addr_length = ill->ill_phys_addr_length; 14140 bcopy(ill->ill_phys_addr, 14141 ((char *)area + area->area_hw_addr_offset), 14142 area->area_hw_addr_length); 14143 } 14144 14145 area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | 14146 ACE_F_MYADDR; 14147 14148 if (res_act == Res_act_defend) { 14149 area->area_flags |= ACE_F_DEFEND; 14150 /* 14151 * If we're just defending our address now, then 14152 * there's no need to set up ARP multicast mappings. 14153 * The publish command is enough. 14154 */ 14155 goto done; 14156 } 14157 14158 if (res_act != Res_act_initial) 14159 goto arp_setup_multicast; 14160 14161 /* 14162 * Allocate an ARP deletion message so we know we can tell ARP 14163 * when the interface goes down. 14164 */ 14165 arp_del_mp = ill_arp_alloc(ill, ared_p, addr); 14166 if (arp_del_mp == NULL) 14167 goto failed; 14168 14169 } else { 14170 if (res_act != Res_act_initial) 14171 goto done; 14172 } 14173 /* 14174 * Need to bring up ARP or setup multicast mapping only 14175 * when the first interface is coming UP. 14176 */ 14177 if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 || 14178 was_dup) { 14179 goto done; 14180 } 14181 14182 /* 14183 * Allocate an ARP down message (to be saved) and an ARP up 14184 * message. 14185 */ 14186 arp_down_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ard_template, 0); 14187 if (arp_down_mp == NULL) 14188 goto failed; 14189 14190 arp_up_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aru_template, 0); 14191 if (arp_up_mp == NULL) 14192 goto failed; 14193 14194 if (ipif->ipif_flags & IPIF_POINTOPOINT) 14195 goto done; 14196 14197 arp_setup_multicast: 14198 /* 14199 * Setup the multicast mappings. This function initializes 14200 * ill_arp_del_mapping_mp also. This does not need to be done for 14201 * IPv6. 14202 */ 14203 if (!ill->ill_isv6) { 14204 err = ipif_arp_setup_multicast(ipif, &arp_add_mapping_mp); 14205 if (err != 0) 14206 goto failed; 14207 ASSERT(ill->ill_arp_del_mapping_mp != NULL); 14208 ASSERT(arp_add_mapping_mp != NULL); 14209 } 14210 14211 done: 14212 if (arp_del_mp != NULL) { 14213 ASSERT(ipif->ipif_arp_del_mp == NULL); 14214 ipif->ipif_arp_del_mp = arp_del_mp; 14215 } 14216 if (arp_down_mp != NULL) { 14217 ASSERT(ill->ill_arp_down_mp == NULL); 14218 ill->ill_arp_down_mp = arp_down_mp; 14219 } 14220 if (arp_del_mapping_mp != NULL) { 14221 ASSERT(ill->ill_arp_del_mapping_mp == NULL); 14222 ill->ill_arp_del_mapping_mp = arp_del_mapping_mp; 14223 } 14224 if (arp_up_mp != NULL) { 14225 ip1dbg(("ipif_resolver_up: ARP_UP for %s:%u\n", 14226 ill->ill_name, ipif->ipif_id)); 14227 putnext(ill->ill_rq, arp_up_mp); 14228 } 14229 if (arp_add_mp != NULL) { 14230 ip1dbg(("ipif_resolver_up: ARP_ADD for %s:%u\n", 14231 ill->ill_name, ipif->ipif_id)); 14232 /* 14233 * If it's an extended ARP implementation, then we'll wait to 14234 * hear that DAD has finished before using the interface. 
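 * (In the extended case ipif_addr_ready stays clear here; it is set
 * once ARP reports that duplicate address detection completed, via the
 * AR_CN_READY notification mentioned in ill_restart_dad() below.)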
14235 */ 14236 if (!ill->ill_arp_extend) 14237 ipif->ipif_addr_ready = 1; 14238 putnext(ill->ill_rq, arp_add_mp); 14239 } else { 14240 ipif->ipif_addr_ready = 1; 14241 } 14242 if (arp_add_mapping_mp != NULL) { 14243 ip1dbg(("ipif_resolver_up: MAPPING_ADD for %s:%u\n", 14244 ill->ill_name, ipif->ipif_id)); 14245 putnext(ill->ill_rq, arp_add_mapping_mp); 14246 } 14247 if (res_act != Res_act_initial) 14248 return (0); 14249 14250 if (ill->ill_flags & ILLF_NOARP) 14251 err = ill_arp_off(ill); 14252 else 14253 err = ill_arp_on(ill); 14254 if (err != 0) { 14255 ip0dbg(("ipif_resolver_up: arp_on/off failed %d\n", err)); 14256 freemsg(ipif->ipif_arp_del_mp); 14257 freemsg(ill->ill_arp_down_mp); 14258 freemsg(ill->ill_arp_del_mapping_mp); 14259 ipif->ipif_arp_del_mp = NULL; 14260 ill->ill_arp_down_mp = NULL; 14261 ill->ill_arp_del_mapping_mp = NULL; 14262 return (err); 14263 } 14264 return ((ill->ill_ipif_up_count != 0 || was_dup || 14265 ill->ill_ipif_dup_count != 0) ? 0 : EINPROGRESS); 14266 14267 failed: 14268 ip1dbg(("ipif_resolver_up: FAILED\n")); 14269 freemsg(arp_add_mp); 14270 freemsg(arp_del_mp); 14271 freemsg(arp_add_mapping_mp); 14272 freemsg(arp_up_mp); 14273 freemsg(arp_down_mp); 14274 ill->ill_arp_bringup_pending = 0; 14275 return (err); 14276 } 14277 14278 /* 14279 * This routine restarts IPv4 duplicate address detection (DAD) when a link has 14280 * just gone back up. 14281 */ 14282 static void 14283 ipif_arp_start_dad(ipif_t *ipif) 14284 { 14285 ill_t *ill = ipif->ipif_ill; 14286 mblk_t *arp_add_mp; 14287 area_t *area; 14288 14289 if (ill->ill_net_type != IRE_IF_RESOLVER || ill->ill_arp_closing || 14290 (ipif->ipif_flags & IPIF_UNNUMBERED) || 14291 ipif->ipif_lcl_addr == INADDR_ANY || 14292 (arp_add_mp = ill_arp_alloc(ill, (uchar_t *)&ip_area_template, 14293 (char *)&ipif->ipif_lcl_addr)) == NULL) { 14294 /* 14295 * If we can't contact ARP for some reason, that's not really a 14296 * problem. Just send out the routing socket notification that 14297 * DAD completion would have done, and continue. 14298 */ 14299 ipif_mask_reply(ipif); 14300 ip_rts_ifmsg(ipif); 14301 ip_rts_newaddrmsg(RTM_ADD, 0, ipif); 14302 sctp_update_ipif(ipif, SCTP_IPIF_UP); 14303 ipif->ipif_addr_ready = 1; 14304 return; 14305 } 14306 14307 /* Setting the 'unverified' flag restarts DAD */ 14308 area = (area_t *)arp_add_mp->b_rptr; 14309 area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR | 14310 ACE_F_UNVERIFIED; 14311 putnext(ill->ill_rq, arp_add_mp); 14312 } 14313 14314 static void 14315 ipif_ndp_start_dad(ipif_t *ipif) 14316 { 14317 nce_t *nce; 14318 14319 nce = ndp_lookup_v6(ipif->ipif_ill, &ipif->ipif_v6lcl_addr, B_FALSE); 14320 if (nce == NULL) 14321 return; 14322 14323 if (!ndp_restart_dad(nce)) { 14324 /* 14325 * If we can't restart DAD for some reason, that's not really a 14326 * problem. Just send out the routing socket notification that 14327 * DAD completion would have done, and continue. 14328 */ 14329 ip_rts_ifmsg(ipif); 14330 ip_rts_newaddrmsg(RTM_ADD, 0, ipif); 14331 sctp_update_ipif(ipif, SCTP_IPIF_UP); 14332 ipif->ipif_addr_ready = 1; 14333 } 14334 NCE_REFRELE(nce); 14335 } 14336 14337 /* 14338 * Restart duplicate address detection on all interfaces on the given ill. 14339 * 14340 * This is called when an interface transitions from down to up 14341 * (DL_NOTE_LINK_UP) or up to down (DL_NOTE_LINK_DOWN). 
14342 * 14343 * Note that since the underlying physical link has transitioned, we must cause 14344 * at least one routing socket message to be sent here, either via DAD 14345 * completion or just by default on the first ipif. (If we don't do this, then 14346 * in.mpathd will see long delays when doing link-based failure recovery.) 14347 */ 14348 void 14349 ill_restart_dad(ill_t *ill, boolean_t went_up) 14350 { 14351 ipif_t *ipif; 14352 14353 if (ill == NULL) 14354 return; 14355 14356 /* 14357 * If layer two doesn't support duplicate address detection, then just 14358 * send the routing socket message now and be done with it. 14359 */ 14360 if ((ill->ill_isv6 && (ill->ill_flags & ILLF_XRESOLV)) || 14361 (!ill->ill_isv6 && !ill->ill_arp_extend)) { 14362 ip_rts_ifmsg(ill->ill_ipif); 14363 return; 14364 } 14365 14366 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 14367 if (went_up) { 14368 if (ipif->ipif_flags & IPIF_UP) { 14369 if (ill->ill_isv6) 14370 ipif_ndp_start_dad(ipif); 14371 else 14372 ipif_arp_start_dad(ipif); 14373 } else if (ill->ill_isv6 && 14374 (ipif->ipif_flags & IPIF_DUPLICATE)) { 14375 /* 14376 * For IPv4, the ARP module itself will 14377 * automatically start the DAD process when it 14378 * sees DL_NOTE_LINK_UP. We respond to the 14379 * AR_CN_READY at the completion of that task. 14380 * For IPv6, we must kick off the bring-up 14381 * process now. 14382 */ 14383 ndp_do_recovery(ipif); 14384 } else { 14385 /* 14386 * Unfortunately, the first ipif is "special" 14387 * and represents the underlying ill in the 14388 * routing socket messages. Thus, when this 14389 * one ipif is down, we must still notify so 14390 * that the user knows the IFF_RUNNING status 14391 * change. (If the first ipif is up, then 14392 * we'll handle eventual routing socket 14393 * notification via DAD completion.) 14394 */ 14395 if (ipif == ill->ill_ipif) 14396 ip_rts_ifmsg(ill->ill_ipif); 14397 } 14398 } else { 14399 /* 14400 * After link down, we'll need to send a new routing 14401 * message when the link comes back, so clear 14402 * ipif_addr_ready. 14403 */ 14404 ipif->ipif_addr_ready = 0; 14405 } 14406 } 14407 14408 /* 14409 * If we've torn down links, then notify the user right away. 14410 */ 14411 if (!went_up) 14412 ip_rts_ifmsg(ill->ill_ipif); 14413 } 14414 14415 /* 14416 * Wakeup all threads waiting to enter the ipsq, and sleeping 14417 * on any of the ills in this ipsq. 
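 * A waiter tests its wait condition and then blocks in cv_wait(); if it
 * could be preempted between the test and the block while this function
 * fires cv_broadcast(), the wakeup would be lost and the waiter would
 * sleep forever.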
The ill_lock of the ill 14418 * must be held so that waiters don't miss wakeups. 14419 */ 14420 static void 14421 ill_signal_ipsq_ills(ipsq_t *ipsq, boolean_t caller_holds_lock) 14422 { 14423 phyint_t *phyint; 14424 14425 phyint = ipsq->ipsq_phyint_list; 14426 while (phyint != NULL) { 14427 if (phyint->phyint_illv4) { 14428 if (!caller_holds_lock) 14429 mutex_enter(&phyint->phyint_illv4->ill_lock); 14430 ASSERT(MUTEX_HELD(&phyint->phyint_illv4->ill_lock)); 14431 cv_broadcast(&phyint->phyint_illv4->ill_cv); 14432 if (!caller_holds_lock) 14433 mutex_exit(&phyint->phyint_illv4->ill_lock); 14434 } 14435 if (phyint->phyint_illv6) { 14436 if (!caller_holds_lock) 14437 mutex_enter(&phyint->phyint_illv6->ill_lock); 14438 ASSERT(MUTEX_HELD(&phyint->phyint_illv6->ill_lock)); 14439 cv_broadcast(&phyint->phyint_illv6->ill_cv); 14440 if (!caller_holds_lock) 14441 mutex_exit(&phyint->phyint_illv6->ill_lock); 14442 } 14443 phyint = phyint->phyint_ipsq_next; 14444 } 14445 } 14446 14447 static ipsq_t * 14448 ipsq_create(char *groupname, ip_stack_t *ipst) 14449 { 14450 ipsq_t *ipsq; 14451 14452 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 14453 ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP); 14454 if (ipsq == NULL) { 14455 return (NULL); 14456 } 14457 14458 if (groupname != NULL) 14459 (void) strcpy(ipsq->ipsq_name, groupname); 14460 else 14461 ipsq->ipsq_name[0] = '\0'; 14462 14463 mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, NULL); 14464 ipsq->ipsq_flags |= IPSQ_GROUP; 14465 ipsq->ipsq_next = ipst->ips_ipsq_g_head; 14466 ipst->ips_ipsq_g_head = ipsq; 14467 ipsq->ipsq_ipst = ipst; /* No netstack_hold */ 14468 return (ipsq); 14469 } 14470 14471 /* 14472 * Return an ipsq corresponding to the groupname. If 'create' is true, 14473 * allocate a new ipsq if one does not exist. Usually an ipsq is associated 14474 * uniquely with an IPMP group. However, during IPMP groupname operations, 14475 * multiple IPMP groups may be associated with a single ipsq. But no 14476 * IPMP group can be associated with more than 1 ipsq at any time. 14477 * For example: 14478 * Interfaces IPMP grpname ipsq ipsq_name ipsq_refs 14479 * hme1, hme2 mpk17-84 ipsq1 mpk17-84 2 14480 * hme3, hme4 mpk17-85 ipsq2 mpk17-85 2 14481 * 14482 * Now the command ifconfig hme3 group mpk17-84 results in the temporary 14483 * status shown below during the execution of the above command. 14484 * hme1, hme2, hme3, hme4 mpk17-84, mpk17-85 ipsq1 mpk17-84 4 14485 * 14486 * After the completion of the above groupname command we return to the stable 14487 * state shown below. 14488 * hme1, hme2, hme3 mpk17-84 ipsq1 mpk17-84 3 14489 * hme4 mpk17-85 ipsq2 mpk17-85 1 14490 * 14491 * Because of the above, we don't search based on the ipsq_name since that 14492 * would miss the correct ipsq during certain windows as shown above. 14493 * The ipsq_name is only used during split of an ipsq to return the ipsq to its 14494 * natural state. 14495 */ 14496 static ipsq_t * 14497 ip_ipsq_lookup(char *groupname, boolean_t create, ipsq_t *exclude_ipsq, 14498 ip_stack_t *ipst) 14499 { 14500 ipsq_t *ipsq; 14501 int group_len; 14502 phyint_t *phyint; 14503 14504 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 14505 14506 group_len = strlen(groupname); 14507 ASSERT(group_len != 0); 14508 group_len++; 14509 14510 for (ipsq = ipst->ips_ipsq_g_head; 14511 ipsq != NULL; 14512 ipsq = ipsq->ipsq_next) { 14513 /* 14514 * When an ipsq is being split, and ill_split_ipsq 14515 * calls this function, we exclude it from being considered.
14516 */ 14517 if (ipsq == exclude_ipsq) 14518 continue; 14519 14520 /* 14521 * Compare against the ipsq_name. The groupname change happens 14522 * in 2 phases. The 1st phase merges the from group into 14523 * the to group's ipsq, by calling ill_merge_groups, and restarts 14524 * the ioctl. The 2nd phase then locates the ipsq again thru 14525 * ipsq_name. At this point the phyint_groupname has not been 14526 * updated. 14527 */ 14528 if ((group_len == strlen(ipsq->ipsq_name) + 1) && 14529 (bcmp(ipsq->ipsq_name, groupname, group_len) == 0)) { 14530 /* 14531 * Verify that an ipmp groupname is exactly 14532 * part of 1 ipsq and is not found in any other 14533 * ipsq. 14534 */ 14535 ASSERT(ip_ipsq_lookup(groupname, B_FALSE, ipsq, ipst) == 14536 NULL); 14537 return (ipsq); 14538 } 14539 14540 /* 14541 * Comparison against ipsq_name alone is not sufficient. 14542 * In the case when groups are currently being 14543 * merged, the ipsq could hold other IPMP groups temporarily, 14544 * so we walk the phyint list and compare against the 14545 * phyint_groupname as well. 14546 */ 14547 phyint = ipsq->ipsq_phyint_list; 14548 while (phyint != NULL) { 14549 if ((group_len == phyint->phyint_groupname_len) && 14550 (bcmp(phyint->phyint_groupname, groupname, 14551 group_len) == 0)) { 14552 /* 14553 * Verify that an ipmp groupname is exactly 14554 * part of 1 ipsq and is not found in any other 14555 * ipsq. 14556 */ 14557 ASSERT(ip_ipsq_lookup(groupname, B_FALSE, ipsq, 14558 ipst) == NULL); 14559 return (ipsq); 14560 } 14561 phyint = phyint->phyint_ipsq_next; 14562 } 14563 } 14564 if (create) 14565 ipsq = ipsq_create(groupname, ipst); 14566 return (ipsq); 14567 } 14568 14569 static void 14570 ipsq_delete(ipsq_t *ipsq) 14571 { 14572 ipsq_t *nipsq; 14573 ipsq_t *pipsq = NULL; 14574 ip_stack_t *ipst = ipsq->ipsq_ipst; 14575 14576 /* 14577 * We don't hold the ipsq lock, but we are sure no new 14578 * messages can land up, since the ipsq_refs is zero. 14579 * i.e. this ipsq is unnamed and no phyint or phyint group 14580 * is associated with this ipsq. (Lookups are based on ill_name 14581 * or phyint_groupname.) 14582 */ 14583 ASSERT(ipsq->ipsq_refs == 0); 14584 ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipsq->ipsq_mphead == NULL); 14585 ASSERT(ipsq->ipsq_pending_mp == NULL); 14586 if (!(ipsq->ipsq_flags & IPSQ_GROUP)) { 14587 /* 14588 * This is not the ipsq of an IPMP group. 14589 */ 14590 ipsq->ipsq_ipst = NULL; 14591 kmem_free(ipsq, sizeof (ipsq_t)); 14592 return; 14593 } 14594 14595 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 14596 14597 /* 14598 * Locate the ipsq before we can remove it from 14599 * the singly linked list of ipsq's.
14600 */ 14601 for (nipsq = ipst->ips_ipsq_g_head; nipsq != NULL; 14602 nipsq = nipsq->ipsq_next) { 14603 if (nipsq == ipsq) { 14604 break; 14605 } 14606 pipsq = nipsq; 14607 } 14608 14609 ASSERT(nipsq == ipsq); 14610 14611 /* unlink ipsq from the list */ 14612 if (pipsq != NULL) 14613 pipsq->ipsq_next = ipsq->ipsq_next; 14614 else 14615 ipst->ips_ipsq_g_head = ipsq->ipsq_next; 14616 ipsq->ipsq_ipst = NULL; 14617 kmem_free(ipsq, sizeof (ipsq_t)); 14618 rw_exit(&ipst->ips_ill_g_lock); 14619 } 14620 14621 static void 14622 ill_move_to_new_ipsq(ipsq_t *old_ipsq, ipsq_t *new_ipsq, mblk_t *current_mp, 14623 queue_t *q) 14624 { 14625 ASSERT(MUTEX_HELD(&new_ipsq->ipsq_lock)); 14626 ASSERT(old_ipsq->ipsq_mphead == NULL && old_ipsq->ipsq_mptail == NULL); 14627 ASSERT(old_ipsq->ipsq_pending_ipif == NULL); 14628 ASSERT(old_ipsq->ipsq_pending_mp == NULL); 14629 ASSERT(current_mp != NULL); 14630 14631 ipsq_enq(new_ipsq, q, current_mp, (ipsq_func_t)ip_process_ioctl, 14632 NEW_OP, NULL); 14633 14634 ASSERT(new_ipsq->ipsq_xopq_mptail != NULL && 14635 new_ipsq->ipsq_xopq_mphead != NULL); 14636 14637 /* 14638 * move from old ipsq to the new ipsq. 14639 */ 14640 new_ipsq->ipsq_xopq_mptail->b_next = old_ipsq->ipsq_xopq_mphead; 14641 if (old_ipsq->ipsq_xopq_mphead != NULL) 14642 new_ipsq->ipsq_xopq_mptail = old_ipsq->ipsq_xopq_mptail; 14643 14644 old_ipsq->ipsq_xopq_mphead = old_ipsq->ipsq_xopq_mptail = NULL; 14645 } 14646 14647 void 14648 ill_group_cleanup(ill_t *ill) 14649 { 14650 ill_t *ill_v4; 14651 ill_t *ill_v6; 14652 ipif_t *ipif; 14653 14654 ill_v4 = ill->ill_phyint->phyint_illv4; 14655 ill_v6 = ill->ill_phyint->phyint_illv6; 14656 14657 if (ill_v4 != NULL) { 14658 mutex_enter(&ill_v4->ill_lock); 14659 for (ipif = ill_v4->ill_ipif; ipif != NULL; 14660 ipif = ipif->ipif_next) { 14661 IPIF_UNMARK_MOVING(ipif); 14662 } 14663 ill_v4->ill_up_ipifs = B_FALSE; 14664 mutex_exit(&ill_v4->ill_lock); 14665 } 14666 14667 if (ill_v6 != NULL) { 14668 mutex_enter(&ill_v6->ill_lock); 14669 for (ipif = ill_v6->ill_ipif; ipif != NULL; 14670 ipif = ipif->ipif_next) { 14671 IPIF_UNMARK_MOVING(ipif); 14672 } 14673 ill_v6->ill_up_ipifs = B_FALSE; 14674 mutex_exit(&ill_v6->ill_lock); 14675 } 14676 } 14677 /* 14678 * This function is called when an ill has had a change in its group status 14679 * to bring up all the ipifs that were up before the change. 14680 */ 14681 int 14682 ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp) 14683 { 14684 ipif_t *ipif; 14685 ill_t *ill_v4; 14686 ill_t *ill_v6; 14687 ill_t *from_ill; 14688 int err = 0; 14689 14690 14691 ASSERT(IAM_WRITER_ILL(ill)); 14692 14693 /* 14694 * Except for ipif_state_flags and ill_state_flags the other 14695 * fields of the ipif/ill that are modified below are protected 14696 * implicitly since we are a writer. We would have tried to down 14697 * even an ipif that was already down, in ill_down_ipifs. So we 14698 * just blindly clear the IPIF_CHANGING flag here on all ipifs. 
14699 */ 14700 ill_v4 = ill->ill_phyint->phyint_illv4; 14701 ill_v6 = ill->ill_phyint->phyint_illv6; 14702 if (ill_v4 != NULL) { 14703 ill_v4->ill_up_ipifs = B_TRUE; 14704 for (ipif = ill_v4->ill_ipif; ipif != NULL; 14705 ipif = ipif->ipif_next) { 14706 mutex_enter(&ill_v4->ill_lock); 14707 ipif->ipif_state_flags &= ~IPIF_CHANGING; 14708 IPIF_UNMARK_MOVING(ipif); 14709 mutex_exit(&ill_v4->ill_lock); 14710 if (ipif->ipif_was_up) { 14711 if (!(ipif->ipif_flags & IPIF_UP)) 14712 err = ipif_up(ipif, q, mp); 14713 ipif->ipif_was_up = B_FALSE; 14714 if (err != 0) { 14715 /* 14716 * Can there be any other error? 14717 */ 14718 ASSERT(err == EINPROGRESS); 14719 return (err); 14720 } 14721 } 14722 } 14723 mutex_enter(&ill_v4->ill_lock); 14724 ill_v4->ill_state_flags &= ~ILL_CHANGING; 14725 mutex_exit(&ill_v4->ill_lock); 14726 ill_v4->ill_up_ipifs = B_FALSE; 14727 if (ill_v4->ill_move_in_progress) { 14728 ASSERT(ill_v4->ill_move_peer != NULL); 14729 ill_v4->ill_move_in_progress = B_FALSE; 14730 from_ill = ill_v4->ill_move_peer; 14731 from_ill->ill_move_in_progress = B_FALSE; 14732 from_ill->ill_move_peer = NULL; 14733 mutex_enter(&from_ill->ill_lock); 14734 from_ill->ill_state_flags &= ~ILL_CHANGING; 14735 mutex_exit(&from_ill->ill_lock); 14736 if (ill_v6 == NULL) { 14737 if (from_ill->ill_phyint->phyint_flags & 14738 PHYI_STANDBY) { 14739 phyint_inactive(from_ill->ill_phyint); 14740 } 14741 if (ill_v4->ill_phyint->phyint_flags & 14742 PHYI_STANDBY) { 14743 phyint_inactive(ill_v4->ill_phyint); 14744 } 14745 } 14746 ill_v4->ill_move_peer = NULL; 14747 } 14748 } 14749 14750 if (ill_v6 != NULL) { 14751 ill_v6->ill_up_ipifs = B_TRUE; 14752 for (ipif = ill_v6->ill_ipif; ipif != NULL; 14753 ipif = ipif->ipif_next) { 14754 mutex_enter(&ill_v6->ill_lock); 14755 ipif->ipif_state_flags &= ~IPIF_CHANGING; 14756 IPIF_UNMARK_MOVING(ipif); 14757 mutex_exit(&ill_v6->ill_lock); 14758 if (ipif->ipif_was_up) { 14759 if (!(ipif->ipif_flags & IPIF_UP)) 14760 err = ipif_up(ipif, q, mp); 14761 ipif->ipif_was_up = B_FALSE; 14762 if (err != 0) { 14763 /* 14764 * Can there be any other error? 14765 */ 14766 ASSERT(err == EINPROGRESS); 14767 return (err); 14768 } 14769 } 14770 } 14771 mutex_enter(&ill_v6->ill_lock); 14772 ill_v6->ill_state_flags &= ~ILL_CHANGING; 14773 mutex_exit(&ill_v6->ill_lock); 14774 ill_v6->ill_up_ipifs = B_FALSE; 14775 if (ill_v6->ill_move_in_progress) { 14776 ASSERT(ill_v6->ill_move_peer != NULL); 14777 ill_v6->ill_move_in_progress = B_FALSE; 14778 from_ill = ill_v6->ill_move_peer; 14779 from_ill->ill_move_in_progress = B_FALSE; 14780 from_ill->ill_move_peer = NULL; 14781 mutex_enter(&from_ill->ill_lock); 14782 from_ill->ill_state_flags &= ~ILL_CHANGING; 14783 mutex_exit(&from_ill->ill_lock); 14784 if (from_ill->ill_phyint->phyint_flags & PHYI_STANDBY) { 14785 phyint_inactive(from_ill->ill_phyint); 14786 } 14787 if (ill_v6->ill_phyint->phyint_flags & PHYI_STANDBY) { 14788 phyint_inactive(ill_v6->ill_phyint); 14789 } 14790 ill_v6->ill_move_peer = NULL; 14791 } 14792 } 14793 return (0); 14794 } 14795 14796 /* 14797 * Bring down all the appropriate ipifs.
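 * If 'index' is zero, every ipif on the ill is brought down; otherwise
 * only ipifs whose ipif_orig_ifindex matches 'index' are affected. With
 * chk_nofailover set, ipifs marked IPIF_NOFAILOVER are skipped.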
14798 */ 14799 /* ARGSUSED */ 14800 static void 14801 ill_down_ipifs(ill_t *ill, mblk_t *mp, int index, boolean_t chk_nofailover) 14802 { 14803 ipif_t *ipif; 14804 14805 ASSERT(IAM_WRITER_ILL(ill)); 14806 14807 /* 14808 * Except for ipif_state_flags the other fields of the ipif/ill that 14809 * are modified below are protected implicitly since we are a writer 14810 */ 14811 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 14812 if (chk_nofailover && (ipif->ipif_flags & IPIF_NOFAILOVER)) 14813 continue; 14814 if (index == 0 || index == ipif->ipif_orig_ifindex) { 14815 /* 14816 * We go through the ipif_down logic even if the ipif 14817 * is already down, since routes can be added based 14818 * on down ipifs. Going through ipif_down once again 14819 * will delete any IREs created based on these routes. 14820 */ 14821 if (ipif->ipif_flags & IPIF_UP) 14822 ipif->ipif_was_up = B_TRUE; 14823 /* 14824 * If called with chk_nofailover true ipif is moving. 14825 */ 14826 mutex_enter(&ill->ill_lock); 14827 if (chk_nofailover) { 14828 ipif->ipif_state_flags |= 14829 IPIF_MOVING | IPIF_CHANGING; 14830 } else { 14831 ipif->ipif_state_flags |= IPIF_CHANGING; 14832 } 14833 mutex_exit(&ill->ill_lock); 14834 /* 14835 * Need to re-create net/subnet bcast ires if 14836 * they are dependent on ipif. 14837 */ 14838 if (!ipif->ipif_isv6) 14839 ipif_check_bcast_ires(ipif); 14840 (void) ipif_logical_down(ipif, NULL, NULL); 14841 ipif_non_duplicate(ipif); 14842 ipif_down_tail(ipif); 14843 /* 14844 * We don't do ipif_multicast_down for IPv4 in 14845 * ipif_down. We need to set this so that 14846 * ipif_multicast_up will join the 14847 * ALLHOSTS_GROUP on to_ill. 14848 */ 14849 ipif->ipif_multicast_up = B_FALSE; 14850 } 14851 } 14852 } 14853 14854 #define IPSQ_INC_REF(ipsq, ipst) { \ 14855 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); \ 14856 (ipsq)->ipsq_refs++; \ 14857 } 14858 14859 #define IPSQ_DEC_REF(ipsq, ipst) { \ 14860 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); \ 14861 (ipsq)->ipsq_refs--; \ 14862 if ((ipsq)->ipsq_refs == 0) \ 14863 (ipsq)->ipsq_name[0] = '\0'; \ 14864 } 14865 14866 /* 14867 * Change the ipsq of all the ill's whose current ipsq is 'cur_ipsq' to 14868 * new_ipsq. 14869 */ 14870 static void 14871 ill_merge_ipsq(ipsq_t *cur_ipsq, ipsq_t *new_ipsq, ip_stack_t *ipst) 14872 { 14873 phyint_t *phyint; 14874 phyint_t *next_phyint; 14875 14876 /* 14877 * To change the ipsq of an ill, we need to hold the ill_g_lock as 14878 * writer and the ill_lock of the ill in question. Also the dest 14879 * ipsq can't vanish while we hold the ill_g_lock as writer. 
14880 */ 14881 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 14882 14883 phyint = cur_ipsq->ipsq_phyint_list; 14884 cur_ipsq->ipsq_phyint_list = NULL; 14885 while (phyint != NULL) { 14886 next_phyint = phyint->phyint_ipsq_next; 14887 IPSQ_DEC_REF(cur_ipsq, ipst); 14888 phyint->phyint_ipsq_next = new_ipsq->ipsq_phyint_list; 14889 new_ipsq->ipsq_phyint_list = phyint; 14890 IPSQ_INC_REF(new_ipsq, ipst); 14891 phyint->phyint_ipsq = new_ipsq; 14892 phyint = next_phyint; 14893 } 14894 } 14895 14896 #define SPLIT_SUCCESS 0 14897 #define SPLIT_NOT_NEEDED 1 14898 #define SPLIT_FAILED 2 14899 14900 int 14901 ill_split_to_grp_ipsq(phyint_t *phyint, ipsq_t *cur_ipsq, boolean_t need_retry, 14902 ip_stack_t *ipst) 14903 { 14904 ipsq_t *newipsq = NULL; 14905 14906 /* 14907 * Assertions denote prerequisites for changing the ipsq of 14908 * a phyint. 14909 */ 14910 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 14911 /* 14912 * <ill-phyint> assocs can't change while ill_g_lock 14913 * is held as writer. See ill_phyint_reinit() 14914 */ 14915 ASSERT(phyint->phyint_illv4 == NULL || 14916 MUTEX_HELD(&phyint->phyint_illv4->ill_lock)); 14917 ASSERT(phyint->phyint_illv6 == NULL || 14918 MUTEX_HELD(&phyint->phyint_illv6->ill_lock)); 14919 14920 if ((phyint->phyint_groupname_len != 14921 (strlen(cur_ipsq->ipsq_name) + 1) || 14922 bcmp(phyint->phyint_groupname, cur_ipsq->ipsq_name, 14923 phyint->phyint_groupname_len) != 0)) { 14924 /* 14925 * Once we fail in creating a new ipsq due to memory shortage, 14926 * don't attempt to create a new ipsq again based on another 14927 * phyint, since we want all phyints belonging to an IPMP group 14928 * to be in the same ipsq even in the event of memory allocation 14929 * failures. 14930 */ 14930 newipsq = ip_ipsq_lookup(phyint->phyint_groupname, !need_retry, 14931 cur_ipsq, ipst); 14932 if (newipsq == NULL) { 14933 /* Memory allocation failure */ 14934 return (SPLIT_FAILED); 14935 } else { 14936 /* ipsq_refs protected by ill_g_lock (writer) */ 14937 IPSQ_DEC_REF(cur_ipsq, ipst); 14938 phyint->phyint_ipsq = newipsq; 14939 phyint->phyint_ipsq_next = newipsq->ipsq_phyint_list; 14940 newipsq->ipsq_phyint_list = phyint; 14941 IPSQ_INC_REF(newipsq, ipst); 14942 return (SPLIT_SUCCESS); 14943 } 14944 } 14945 return (SPLIT_NOT_NEEDED); 14946 } 14947 14948 /* 14949 * The ill locks of the phyint and the ill_g_lock (writer) must be held 14950 * to do this split 14951 */ 14952 static int 14953 ill_split_to_own_ipsq(phyint_t *phyint, ipsq_t *cur_ipsq, ip_stack_t *ipst) 14954 { 14955 ipsq_t *newipsq; 14956 14957 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 14958 /* 14959 * <ill-phyint> assocs can't change while ill_g_lock 14960 * is held as writer. See ill_phyint_reinit() 14961 */ 14962 14963 ASSERT(phyint->phyint_illv4 == NULL || 14964 MUTEX_HELD(&phyint->phyint_illv4->ill_lock)); 14965 ASSERT(phyint->phyint_illv6 == NULL || 14966 MUTEX_HELD(&phyint->phyint_illv6->ill_lock)); 14967 14968 if (!ipsq_init((phyint->phyint_illv4 != NULL) ? 14969 phyint->phyint_illv4: phyint->phyint_illv6)) { 14970 /* 14971 * ipsq_init failed due to no memory; the caller will 14972 * use the same ipsq. 14973 */ 14974 return (SPLIT_FAILED); 14975 } 14976 14977 /* ipsq_ref is protected by ill_g_lock (writer) */ 14978 IPSQ_DEC_REF(cur_ipsq, ipst); 14979 14980 /* 14981 * This is a new ipsq that is unknown to the world.
static int
ill_split_to_own_ipsq(phyint_t *phyint, ipsq_t *cur_ipsq, ip_stack_t *ipst)
{
        ipsq_t *newipsq;

        ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
        /*
         * <ill-phyint> assocs can't change while ill_g_lock
         * is held as writer. See ill_phyint_reinit().
         */
        ASSERT(phyint->phyint_illv4 == NULL ||
            MUTEX_HELD(&phyint->phyint_illv4->ill_lock));
        ASSERT(phyint->phyint_illv6 == NULL ||
            MUTEX_HELD(&phyint->phyint_illv6->ill_lock));

        if (!ipsq_init((phyint->phyint_illv4 != NULL) ?
            phyint->phyint_illv4 : phyint->phyint_illv6)) {
                /*
                 * ipsq_init failed due to no memory; the caller will use
                 * the same ipsq.
                 */
                return (SPLIT_FAILED);
        }

        /* ipsq_refs is protected by ill_g_lock (writer) */
        IPSQ_DEC_REF(cur_ipsq, ipst);

        /*
         * This is a new ipsq that is unknown to the world, so we don't
         * need to hold ipsq_lock.
         */
        newipsq = phyint->phyint_ipsq;
        newipsq->ipsq_writer = NULL;
        newipsq->ipsq_reentry_cnt--;
        ASSERT(newipsq->ipsq_reentry_cnt == 0);
#ifdef ILL_DEBUG
        newipsq->ipsq_depth = 0;
#endif

        return (SPLIT_SUCCESS);
}

/*
 * Change the ipsq of all the ills whose current ipsq is 'cur_ipsq' to
 * ipsqs representing their individual groups or themselves. Return
 * whether the split needs to be retried again later.
 */
static boolean_t
ill_split_ipsq(ipsq_t *cur_ipsq)
{
        phyint_t *phyint;
        phyint_t *next_phyint;
        int error;
        boolean_t need_retry = B_FALSE;
        ip_stack_t *ipst = cur_ipsq->ipsq_ipst;

        phyint = cur_ipsq->ipsq_phyint_list;
        cur_ipsq->ipsq_phyint_list = NULL;
        while (phyint != NULL) {
                next_phyint = phyint->phyint_ipsq_next;
                /*
                 * The callee may fail to create an ipsq when memory is
                 * short; the return value tells us whether it succeeded,
                 * failed, or found a split unnecessary.
                 */
                if (phyint->phyint_groupname == NULL) {
                        error = ill_split_to_own_ipsq(phyint, cur_ipsq, ipst);
                } else {
                        error = ill_split_to_grp_ipsq(phyint, cur_ipsq,
                            need_retry, ipst);
                }

                switch (error) {
                case SPLIT_FAILED:
                        need_retry = B_TRUE;
                        /* FALLTHRU */
                case SPLIT_NOT_NEEDED:
                        /*
                         * Keep it on the list.
                         */
                        phyint->phyint_ipsq_next = cur_ipsq->ipsq_phyint_list;
                        cur_ipsq->ipsq_phyint_list = phyint;
                        break;
                case SPLIT_SUCCESS:
                        break;
                default:
                        ASSERT(0);
                }

                phyint = next_phyint;
        }
        return (need_retry);
}

/*
 * Given an ipsq 'ipsq', lock all the ills associated with this ipsq
 * and return them in 'list'. This list will be needed by the caller
 * to unlock all the ills later on, because the <ill-ipsq> associations
 * could change between the lock and the unlock: the unlock can't
 * traverse the ipsq to get the list of ills.
 */
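/*
 * Since each phyint contributes at most two ills (one IPv4, one IPv6),
 * a caller such as ill_merge_groups sizes 'list' at ipsq_refs << 1
 * entries before calling in, which is always sufficient.
 */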
static int
ill_lock_ipsq_ills(ipsq_t *ipsq, ill_t **list, int list_max)
{
        int cnt = 0;
        phyint_t *phyint;
        ip_stack_t *ipst = ipsq->ipsq_ipst;

        /*
         * The caller holds ill_g_lock to ensure that the ill memberships
         * of the ipsq don't change.
         */
        ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));

        phyint = ipsq->ipsq_phyint_list;
        while (phyint != NULL) {
                if (phyint->phyint_illv4 != NULL) {
                        ASSERT(cnt < list_max);
                        list[cnt++] = phyint->phyint_illv4;
                }
                if (phyint->phyint_illv6 != NULL) {
                        ASSERT(cnt < list_max);
                        list[cnt++] = phyint->phyint_illv6;
                }
                phyint = phyint->phyint_ipsq_next;
        }
        ill_lock_ills(list, cnt);
        return (cnt);
}

void
ill_lock_ills(ill_t **list, int cnt)
{
        int i;

        if (cnt > 1) {
                boolean_t try_again;
                do {
                        try_again = B_FALSE;
                        for (i = 0; i < cnt - 1; i++) {
                                if (list[i] < list[i + 1]) {
                                        ill_t *tmp;

                                        /* swap the elements */
                                        tmp = list[i];
                                        list[i] = list[i + 1];
                                        list[i + 1] = tmp;
                                        try_again = B_TRUE;
                                }
                        }
                } while (try_again);
        }

        for (i = 0; i < cnt; i++) {
                if (i == 0) {
                        if (list[i] != NULL)
                                mutex_enter(&list[i]->ill_lock);
                        else
                                return;
                } else if ((list[i - 1] != list[i]) && (list[i] != NULL)) {
                        mutex_enter(&list[i]->ill_lock);
                }
        }
}

void
ill_unlock_ills(ill_t **list, int cnt)
{
        int i;

        for (i = 0; i < cnt; i++) {
                if ((i == 0) && (list[i] != NULL)) {
                        mutex_exit(&list[i]->ill_lock);
                } else if ((list[i - 1] != list[i]) && (list[i] != NULL)) {
                        mutex_exit(&list[i]->ill_lock);
                }
        }
}
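/*
 * The bubble sort in ill_lock_ills orders the list by descending ill
 * pointer value before any lock is taken, so that any two threads
 * locking an overlapping set of ills always acquire the ill_locks in
 * the same global order and cannot deadlock against each other.
 * Duplicate entries are skipped via the list[i - 1] != list[i] test,
 * and ill_unlock_ills applies the same test when releasing.
 */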
/*
 * Merge all the ills from one ipsq group into another ipsq group.
 * The source ipsq group is specified by the ipsq associated with
 * 'from_ill'. The destination ipsq group is specified by the ipsq
 * associated with 'to_ill' or 'groupname' respectively.
 * Note that ipsq itself does not have a reference count mechanism
 * and functions don't look up an ipsq and pass it around. Instead
 * functions pass around an ill or groupname, and the ipsq is looked
 * up from the ill or groupname and the required operation performed
 * atomically with the lookup on the ipsq.
 */
static int
ill_merge_groups(ill_t *from_ill, ill_t *to_ill, char *groupname, mblk_t *mp,
    queue_t *q)
{
        ipsq_t *old_ipsq;
        ipsq_t *new_ipsq;
        ill_t **ill_list;
        int cnt;
        size_t ill_list_size;
        boolean_t became_writer_on_new_sq = B_FALSE;
        ip_stack_t *ipst = from_ill->ill_ipst;

        ASSERT(to_ill == NULL || ipst == to_ill->ill_ipst);
        /* Exactly 1 of 'to_ill' and 'groupname' can be specified. */
        ASSERT((to_ill != NULL) ^ (groupname != NULL));

        /*
         * Need to hold ill_g_lock as writer and also the ill_lock to
         * change the <ill-ipsq> assoc of an ill. Need to hold the
         * ipsq_lock to prevent new messages from landing on an ipsq.
         */
        rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);

        old_ipsq = from_ill->ill_phyint->phyint_ipsq;
        if (groupname != NULL)
                new_ipsq = ip_ipsq_lookup(groupname, B_TRUE, NULL, ipst);
        else {
                new_ipsq = to_ill->ill_phyint->phyint_ipsq;
        }

        ASSERT(old_ipsq != NULL && new_ipsq != NULL);

        /*
         * Both groups are on the same ipsq.
         */
        if (old_ipsq == new_ipsq) {
                rw_exit(&ipst->ips_ill_g_lock);
                return (0);
        }

        cnt = old_ipsq->ipsq_refs << 1;
        ill_list_size = cnt * sizeof (ill_t *);
        ill_list = kmem_zalloc(ill_list_size, KM_NOSLEEP);
        if (ill_list == NULL) {
                rw_exit(&ipst->ips_ill_g_lock);
                return (ENOMEM);
        }
        cnt = ill_lock_ipsq_ills(old_ipsq, ill_list, cnt);

        /* Need ipsq lock to enqueue messages on new ipsq or become writer */
        mutex_enter(&new_ipsq->ipsq_lock);
        if ((new_ipsq->ipsq_writer == NULL &&
            new_ipsq->ipsq_current_ipif == NULL) ||
            (new_ipsq->ipsq_writer == curthread)) {
                new_ipsq->ipsq_writer = curthread;
                new_ipsq->ipsq_reentry_cnt++;
                became_writer_on_new_sq = B_TRUE;
        }

        /*
         * We are holding ill_g_lock as writer and all the ill locks of
         * the old ipsq. So the old_ipsq can't be looked up, and hence no new
         * message can land on the old ipsq even though we don't hold the
         * ipsq_lock of the old_ipsq. Now move all messages to the new ipsq.
         */
        ill_move_to_new_ipsq(old_ipsq, new_ipsq, mp, q);

        /*
         * Now change the ipsq of all ills in the 'old_ipsq' to 'new_ipsq'.
         * 'new_ipsq' has been looked up, and it can't change its <ill-ipsq>
         * assocs till we release the ill_g_lock, and hence it can't vanish.
         */
        ill_merge_ipsq(old_ipsq, new_ipsq, ipst);

        /*
         * Mark the new ipsq as needing a split since it is currently
         * being shared by more than 1 IPMP group. The split will
         * occur at the end of ipsq_exit.
         */
        new_ipsq->ipsq_split = B_TRUE;

        /* Now release all the locks */
        mutex_exit(&new_ipsq->ipsq_lock);
        ill_unlock_ills(ill_list, cnt);
        rw_exit(&ipst->ips_ill_g_lock);

        kmem_free(ill_list, ill_list_size);

        /*
         * If we succeeded in becoming writer on the new ipsq, then
         * drain the new ipsq and start processing all enqueued messages
         * including the current ioctl we are processing which is either
         * a set groupname or failover/failback.
         */
        if (became_writer_on_new_sq)
                ipsq_exit(new_ipsq, B_TRUE, B_TRUE);

        /*
         * The ipsq has been changed and all the messages have been moved.
         */
        mutex_enter(&old_ipsq->ipsq_lock);
        old_ipsq->ipsq_current_ipif = NULL;
        old_ipsq->ipsq_current_ioctl = 0;
        mutex_exit(&old_ipsq->ipsq_lock);
        return (EINPROGRESS);
}

/*
 * Delete and add the loopback copy and non-loopback copy of
 * the BROADCAST ire corresponding to ill and addr. Used to
 * group broadcast ires together when ill becomes part of
 * a group.
 *
 * This function is also called when ill is leaving the group
 * so that the ires belonging to the group get re-grouped.
 */
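/*
 * For example, an ipif with address 10.1.2.3/24 contributes BROADCAST
 * ires for 10.1.2.0 and 10.1.2.255 (plus the classful 10.0.0.0 and
 * 10.255.255.255, and the group-wide 0.0.0.0 and 255.255.255.255, as
 * computed in ill_group_bcast_for_xmit). Deleting and re-adding each
 * of them lets ire_add_v4 re-insert the entries next to the matching
 * ires of the other ills in the group.
 */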
static void
ill_bcast_delete_and_add(ill_t *ill, ipaddr_t addr)
{
        ire_t *ire, *nire, *nire_next, *ire_head = NULL;
        ire_t **ire_ptpn = &ire_head;
        ip_stack_t *ipst = ill->ill_ipst;

        /*
         * The loopback and non-loopback IREs are inserted in the order in
         * which they're found, on the basis that they are correctly ordered
         * (loopback first).
         */
        for (;;) {
                ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ill->ill_ipif,
                    ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst);
                if (ire == NULL)
                        break;

                /*
                 * We are passing in KM_SLEEP because it is not easy to
                 * go back to a sane state in case of memory failure.
                 */
                nire = kmem_cache_alloc(ire_cache, KM_SLEEP);
                ASSERT(nire != NULL);
                bzero(nire, sizeof (ire_t));
                /*
                 * Don't use ire_max_frag directly since we don't
                 * hold on to 'ire' until we add the new ire 'nire' and
                 * we don't want the new ire to have a dangling reference
                 * to 'ire'. The ire_max_frag of a broadcast ire must
                 * be in sync with the ipif_mtu of the associated ipif.
                 * For example, this happens as a result of SIOCSLIFNAME,
                 * SIOCSLIFLNKINFO or a DL_NOTE_SDU_SIZE initiated by
                 * the driver. A change in ire_max_frag triggered as
                 * a result of path MTU discovery, or due to an
                 * IP_IOC_IRE_ADVISE_NOREPLY from the transport, or due
                 * to a 'route change -mtu' command does not apply to
                 * broadcast ires.
                 *
                 * XXX We need a recovery strategy here if ire_init fails.
                 */
                if (ire_init(nire,
                    (uchar_t *)&ire->ire_addr,
                    (uchar_t *)&ire->ire_mask,
                    (uchar_t *)&ire->ire_src_addr,
                    (uchar_t *)&ire->ire_gateway_addr,
                    (uchar_t *)&ire->ire_in_src_addr,
                    ire->ire_stq == NULL ? &ip_loopback_mtu :
                    &ire->ire_ipif->ipif_mtu,
                    (ire->ire_nce != NULL ? ire->ire_nce->nce_fp_mp : NULL),
                    ire->ire_rfq,
                    ire->ire_stq,
                    ire->ire_type,
                    (ire->ire_nce != NULL ? ire->ire_nce->nce_res_mp : NULL),
                    ire->ire_ipif,
                    ire->ire_in_ill,
                    ire->ire_cmask,
                    ire->ire_phandle,
                    ire->ire_ihandle,
                    ire->ire_flags,
                    &ire->ire_uinfo,
                    NULL,
                    NULL,
                    ipst) == NULL) {
                        cmn_err(CE_PANIC, "ire_init() failed");
                }
                ire_delete(ire);
                ire_refrele(ire);

                /*
                 * The newly created IREs are inserted at the tail of the
                 * list starting with ire_head. As we've just allocated
                 * them no one knows about them so it's safe.
                 */
                *ire_ptpn = nire;
                ire_ptpn = &nire->ire_next;
        }

        for (nire = ire_head; nire != NULL; nire = nire_next) {
                int error;
                ire_t *oire;

                /* unlink the IRE from our list before calling ire_add() */
                nire_next = nire->ire_next;
                nire->ire_next = NULL;

                /* ire_add adds the ire at the right place in the list */
                oire = nire;
                error = ire_add(&nire, NULL, NULL, NULL, B_FALSE);
                ASSERT(error == 0);
                ASSERT(oire == nire);
                ire_refrele(nire);      /* Held in ire_add */
        }
}
/*
 * This function is usually called when an ill is inserted in
 * a group and all the ipifs are already UP. As all the ipifs
 * are already UP, the broadcast ires have already been created
 * and inserted, but ire_add_v4 would not have grouped them properly.
 * We need to re-group for the benefit of ip_wput_ire which
 * expects BROADCAST ires to be grouped properly to avoid sending
 * more than one copy of the broadcast packet per group.
 *
 * NOTE : We don't check for ill_ipif_up_count to be non-zero here
 * because when ipif_up_done ends up calling this, ires have
 * already been added before illgrp_insert, i.e. before ill_group
 * has been initialized.
 */
static void
ill_group_bcast_for_xmit(ill_t *ill)
{
        ill_group_t *illgrp;
        ipif_t *ipif;
        ipaddr_t addr;
        ipaddr_t net_mask;
        ipaddr_t subnet_netmask;

        illgrp = ill->ill_group;

        /*
         * This function is called even when an ill is deleted from
         * the group. Hence, illgrp could be null.
         */
        if (illgrp != NULL && illgrp->illgrp_ill_count == 1)
                return;

        /*
         * Delete all the BROADCAST ires matching this ill and add
         * them back. This time, ire_add_v4 should take care of
         * grouping them with others because ill is part of the
         * group.
         */
        ill_bcast_delete_and_add(ill, 0);
        ill_bcast_delete_and_add(ill, INADDR_BROADCAST);

        for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {

                if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
                    !(ipif->ipif_flags & IPIF_NOLOCAL)) {
                        net_mask = ip_net_mask(ipif->ipif_lcl_addr);
                } else {
                        net_mask = htonl(IN_CLASSA_NET);
                }
                addr = net_mask & ipif->ipif_subnet;
                ill_bcast_delete_and_add(ill, addr);
                ill_bcast_delete_and_add(ill, ~net_mask | addr);

                subnet_netmask = ipif->ipif_net_mask;
                addr = ipif->ipif_subnet;
                ill_bcast_delete_and_add(ill, addr);
                ill_bcast_delete_and_add(ill, ~subnet_netmask | addr);
        }
}

/*
 * This function is called from illgrp_delete when ill is being deleted
 * from the group.
 *
 * As ill is not there in the group anymore, any address belonging
 * to this ill should be cleared of IRE_MARK_NORECV.
 */
static void
ill_clear_bcast_mark(ill_t *ill, ipaddr_t addr)
{
        ire_t *ire;
        irb_t *irb;
        ip_stack_t *ipst = ill->ill_ipst;

        ASSERT(ill->ill_group == NULL);

        ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ill->ill_ipif,
            ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst);

        if (ire != NULL) {
                /*
                 * IPMP and plumbing operations are serialized on the ipsq, so
                 * no one will insert or delete a broadcast ire under our feet.
                 */
                irb = ire->ire_bucket;
                rw_enter(&irb->irb_lock, RW_READER);
                ire_refrele(ire);

                for (; ire != NULL; ire = ire->ire_next) {
                        if (ire->ire_addr != addr)
                                break;
                        if (ire_to_ill(ire) != ill)
                                continue;

                        ASSERT(!(ire->ire_marks & IRE_MARK_CONDEMNED));
                        ire->ire_marks &= ~IRE_MARK_NORECV;
                }
                rw_exit(&irb->irb_lock);
        }
}

/*
 * This function must be called only after the broadcast ires
 * have been grouped together. For a given address addr, nominate
 * only one of the ires whose interface is not FAILED or OFFLINE.
 *
 * This is also called when an ipif goes down, so that we can nominate
 * a different ire with the same address for receiving.
 */
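/*
 * For example, in a two-ill group where one interface is PHYI_FAILED
 * and the other is PHYI_INACTIVE, the first pass below marks every ire
 * IRE_MARK_NORECV and records the INACTIVE one via 'fallback'; the
 * second pass (with match_flags reduced to PHYI_FAILED) then nominates
 * the INACTIVE interface, so broadcasts are still received somewhere.
 */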
static void
ill_mark_bcast(ill_group_t *illgrp, ipaddr_t addr, ip_stack_t *ipst)
{
        irb_t *irb;
        ire_t *ire;
        ire_t *ire1;
        ire_t *save_ire;
        ire_t **irep = NULL;
        boolean_t first = B_TRUE;
        ire_t *clear_ire = NULL;
        ire_t *start_ire = NULL;
        ire_t *new_lb_ire;
        ire_t *new_nlb_ire;
        boolean_t new_lb_ire_used = B_FALSE;
        boolean_t new_nlb_ire_used = B_FALSE;
        uint64_t match_flags;
        uint64_t phyi_flags;
        boolean_t fallback = B_FALSE;
        uint_t max_frag;

        ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, NULL, ALL_ZONES,
            NULL, MATCH_IRE_TYPE, ipst);
        /*
         * We may not be able to find some ires if a previous
         * ire_create failed. This happens when an ipif goes
         * down and we are unable to create BROADCAST ires due
         * to memory failure. Thus, we have to check for NULL
         * below. This should handle the case for LOOPBACK,
         * POINTOPOINT and interfaces with some POINTOPOINT
         * logicals for which there are no BROADCAST ires.
         */
        if (ire == NULL)
                return;
        /*
         * Currently IRE_BROADCASTs are deleted when an ipif
         * goes down, which runs exclusively. Thus, setting
         * IRE_MARK_NORECV should not race with ire_delete marking
         * IRE_MARK_CONDEMNED. We grab the lock below just to
         * be consistent with other parts of the code that walk
         * a given bucket.
         */
        save_ire = ire;
        irb = ire->ire_bucket;
        new_lb_ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
        if (new_lb_ire == NULL) {
                ire_refrele(ire);
                return;
        }
        new_nlb_ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
        if (new_nlb_ire == NULL) {
                ire_refrele(ire);
                kmem_cache_free(ire_cache, new_lb_ire);
                return;
        }
        IRB_REFHOLD(irb);
        rw_enter(&irb->irb_lock, RW_WRITER);
        /*
         * Get to the first ire matching the address and the
         * group. If the address does not match we are done
         * as we could not find the IRE. If the address matches
         * we should get to the first one matching the group.
         */
        while (ire != NULL) {
                if (ire->ire_addr != addr ||
                    ire->ire_ipif->ipif_ill->ill_group == illgrp) {
                        break;
                }
                ire = ire->ire_next;
        }
        match_flags = PHYI_FAILED | PHYI_INACTIVE;
        start_ire = ire;
redo:
        while (ire != NULL && ire->ire_addr == addr &&
            ire->ire_ipif->ipif_ill->ill_group == illgrp) {
                /*
                 * The first ire for any address within a group
                 * should always be the one with IRE_MARK_NORECV cleared
                 * so that ip_wput_ire can avoid searching for one.
                 * Note down the insertion point which will be used
                 * later.
                 */
                if (first && (irep == NULL))
                        irep = ire->ire_ptpn;
                /*
                 * PHYI_FAILED is set when the interface fails.
                 * This interface might have become good, but the
                 * daemon has not yet detected it. We should still
                 * not receive on this. PHYI_OFFLINE should never
                 * be picked as this has been offlined and will soon
                 * be removed.
                 */
                phyi_flags = ire->ire_ipif->ipif_ill->ill_phyint->phyint_flags;
                if (phyi_flags & PHYI_OFFLINE) {
                        ire->ire_marks |= IRE_MARK_NORECV;
                        ire = ire->ire_next;
                        continue;
                }
                if (phyi_flags & match_flags) {
                        ire->ire_marks |= IRE_MARK_NORECV;
                        ire = ire->ire_next;
                        if ((phyi_flags & (PHYI_FAILED | PHYI_INACTIVE)) ==
                            PHYI_INACTIVE) {
                                fallback = B_TRUE;
                        }
                        continue;
                }
                if (first) {
                        /*
                         * We will move this to the front of the list
                         * later on.
                         */
                        clear_ire = ire;
                        ire->ire_marks &= ~IRE_MARK_NORECV;
                } else {
                        ire->ire_marks |= IRE_MARK_NORECV;
                }
                first = B_FALSE;
                ire = ire->ire_next;
        }
        /*
         * If we never nominated anybody, try nominating at least
         * an INACTIVE, if we found one. Do it only once though.
         */
        if (first && (match_flags == (PHYI_FAILED | PHYI_INACTIVE)) &&
            fallback) {
                match_flags = PHYI_FAILED;
                ire = start_ire;
                irep = NULL;
                goto redo;
        }
        ire_refrele(save_ire);

        /*
         * irep non-NULL indicates that we entered the while loop
         * above. If clear_ire is at the insertion point, we don't
         * have to do anything. clear_ire will be NULL if all the
         * interfaces are failed.
         *
         * We cannot unlink and reinsert the ire at the right place
         * in the list since there can be other walkers of this bucket.
         * Instead we delete and recreate the ire.
         */
        if (clear_ire != NULL && irep != NULL && *irep != clear_ire) {
                ire_t *clear_ire_stq = NULL;
                mblk_t *fp_mp = NULL, *res_mp = NULL;

                bzero(new_lb_ire, sizeof (ire_t));
                if (clear_ire->ire_nce != NULL) {
                        fp_mp = clear_ire->ire_nce->nce_fp_mp;
                        res_mp = clear_ire->ire_nce->nce_res_mp;
                }
                /* XXX We need a recovery strategy here. */
                if (ire_init(new_lb_ire,
                    (uchar_t *)&clear_ire->ire_addr,
                    (uchar_t *)&clear_ire->ire_mask,
                    (uchar_t *)&clear_ire->ire_src_addr,
                    (uchar_t *)&clear_ire->ire_gateway_addr,
                    (uchar_t *)&clear_ire->ire_in_src_addr,
                    &clear_ire->ire_max_frag,
                    fp_mp,
                    clear_ire->ire_rfq,
                    clear_ire->ire_stq,
                    clear_ire->ire_type,
                    res_mp,
                    clear_ire->ire_ipif,
                    clear_ire->ire_in_ill,
                    clear_ire->ire_cmask,
                    clear_ire->ire_phandle,
                    clear_ire->ire_ihandle,
                    clear_ire->ire_flags,
                    &clear_ire->ire_uinfo,
                    NULL,
                    NULL,
                    ipst) == NULL)
                        cmn_err(CE_PANIC, "ire_init() failed");
                if (clear_ire->ire_stq == NULL) {
                        ire_t *ire_next = clear_ire->ire_next;
                        if (ire_next != NULL &&
                            ire_next->ire_stq != NULL &&
                            ire_next->ire_addr == clear_ire->ire_addr &&
                            ire_next->ire_ipif->ipif_ill ==
                            clear_ire->ire_ipif->ipif_ill) {
                                clear_ire_stq = ire_next;

                                bzero(new_nlb_ire, sizeof (ire_t));
                                if (clear_ire_stq->ire_nce != NULL) {
                                        fp_mp =
                                            clear_ire_stq->ire_nce->nce_fp_mp;
                                        res_mp =
                                            clear_ire_stq->ire_nce->nce_res_mp;
                                } else {
                                        fp_mp = res_mp = NULL;
                                }
                                /* XXX We need a recovery strategy here. */
                                if (ire_init(new_nlb_ire,
                                    (uchar_t *)&clear_ire_stq->ire_addr,
                                    (uchar_t *)&clear_ire_stq->ire_mask,
                                    (uchar_t *)&clear_ire_stq->ire_src_addr,
                                    (uchar_t *)&clear_ire_stq->ire_gateway_addr,
                                    (uchar_t *)&clear_ire_stq->ire_in_src_addr,
                                    &clear_ire_stq->ire_max_frag,
                                    fp_mp,
                                    clear_ire_stq->ire_rfq,
                                    clear_ire_stq->ire_stq,
                                    clear_ire_stq->ire_type,
                                    res_mp,
                                    clear_ire_stq->ire_ipif,
                                    clear_ire_stq->ire_in_ill,
                                    clear_ire_stq->ire_cmask,
                                    clear_ire_stq->ire_phandle,
                                    clear_ire_stq->ire_ihandle,
                                    clear_ire_stq->ire_flags,
                                    &clear_ire_stq->ire_uinfo,
                                    NULL,
                                    NULL,
                                    ipst) == NULL)
                                        cmn_err(CE_PANIC, "ire_init() failed");
                        }
                }

                /*
                 * Delete the ire. We can't call ire_delete() since
                 * we are holding the bucket lock. We can't release the
                 * bucket lock since we can't allow irep to change. So just
                 * mark it CONDEMNED. The IRB_REFRELE will delete the
                 * ire from the list and do the refrele.
                 */
                clear_ire->ire_marks |= IRE_MARK_CONDEMNED;
                irb->irb_marks |= IRB_MARK_CONDEMNED;

                if (clear_ire_stq != NULL && clear_ire_stq->ire_nce != NULL) {
                        nce_fastpath_list_delete(clear_ire_stq->ire_nce);
                        clear_ire_stq->ire_marks |= IRE_MARK_CONDEMNED;
                }

                /*
                 * Also take care of other fields like the ib/ob pkt counts
                 * etc. Need to dup them. Ditto in ill_bcast_delete_and_add.
                 */

                /* Set the max_frag before adding the ire */
                max_frag = *new_lb_ire->ire_max_fragp;
                new_lb_ire->ire_max_fragp = NULL;
                new_lb_ire->ire_max_frag = max_frag;

                /* Add the new ires. Insert at *irep */
                new_lb_ire->ire_bucket = clear_ire->ire_bucket;
                ire1 = *irep;
                if (ire1 != NULL)
                        ire1->ire_ptpn = &new_lb_ire->ire_next;
                new_lb_ire->ire_next = ire1;
                /* Link the new one in. */
                new_lb_ire->ire_ptpn = irep;
                membar_producer();
                *irep = new_lb_ire;
                new_lb_ire_used = B_TRUE;
                BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_inserted);
                new_lb_ire->ire_bucket->irb_ire_cnt++;
                new_lb_ire->ire_ipif->ipif_ire_cnt++;

                if (clear_ire_stq != NULL) {
                        /* Set the max_frag before adding the ire */
                        max_frag = *new_nlb_ire->ire_max_fragp;
                        new_nlb_ire->ire_max_fragp = NULL;
                        new_nlb_ire->ire_max_frag = max_frag;

                        new_nlb_ire->ire_bucket = clear_ire->ire_bucket;
                        irep = &new_lb_ire->ire_next;
                        /* Add the new ire. Insert at *irep */
                        ire1 = *irep;
                        if (ire1 != NULL)
                                ire1->ire_ptpn = &new_nlb_ire->ire_next;
                        new_nlb_ire->ire_next = ire1;
                        /* Link the new one in. */
                        new_nlb_ire->ire_ptpn = irep;
                        membar_producer();
                        *irep = new_nlb_ire;
                        new_nlb_ire_used = B_TRUE;
                        BUMP_IRE_STATS(ipst->ips_ire_stats_v4,
                            ire_stats_inserted);
                        new_nlb_ire->ire_bucket->irb_ire_cnt++;
                        new_nlb_ire->ire_ipif->ipif_ire_cnt++;
                        ((ill_t *)new_nlb_ire->ire_stq->q_ptr)->ill_ire_cnt++;
                }
        }
        rw_exit(&irb->irb_lock);
        if (!new_lb_ire_used)
                kmem_cache_free(ire_cache, new_lb_ire);
        if (!new_nlb_ire_used)
                kmem_cache_free(ire_cache, new_nlb_ire);
        IRB_REFRELE(irb);
}

/*
 * Whenever an ipif goes down we have to renominate a different
 * broadcast ire to receive. Whenever an ipif comes up, we need
 * to make sure that we have only one nominated to receive.
 */
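/*
 * For example, with ipif_lcl_addr 10.1.2.3, ipif_subnet 10.1.2.0 and
 * ipif_net_mask 255.255.255.0, ip_net_mask() yields the classful mask
 * 255.0.0.0, so the addresses renominated below are 10.0.0.0,
 * 10.255.255.255, 10.1.2.0 and 10.1.2.255, in addition to 0.0.0.0 and
 * 255.255.255.255 which are shared by every ipif.
 */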
static void
ipif_renominate_bcast(ipif_t *ipif)
{
        ill_t *ill = ipif->ipif_ill;
        ipaddr_t subnet_addr;
        ipaddr_t net_addr;
        ipaddr_t net_mask = 0;
        ipaddr_t subnet_netmask;
        ipaddr_t addr;
        ill_group_t *illgrp;
        ip_stack_t *ipst = ill->ill_ipst;

        illgrp = ill->ill_group;
        /*
         * If this is the last ipif going down, it might take
         * the ill out of the group. In that case ipif_down ->
         * illgrp_delete takes care of doing the nomination.
         * ipif_down does not call us in that case.
         */
        ASSERT(illgrp != NULL);

        /* There could not have been any ires associated with this */
        if (ipif->ipif_subnet == 0)
                return;

        ill_mark_bcast(illgrp, 0, ipst);
        ill_mark_bcast(illgrp, INADDR_BROADCAST, ipst);

        if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
            !(ipif->ipif_flags & IPIF_NOLOCAL)) {
                net_mask = ip_net_mask(ipif->ipif_lcl_addr);
        } else {
                net_mask = htonl(IN_CLASSA_NET);
        }
        addr = net_mask & ipif->ipif_subnet;
        ill_mark_bcast(illgrp, addr, ipst);

        net_addr = ~net_mask | addr;
        ill_mark_bcast(illgrp, net_addr, ipst);

        subnet_netmask = ipif->ipif_net_mask;
        addr = ipif->ipif_subnet;
        ill_mark_bcast(illgrp, addr, ipst);

        subnet_addr = ~subnet_netmask | addr;
        ill_mark_bcast(illgrp, subnet_addr, ipst);
}

/*
 * Whenever we form or delete ill groups, we need to nominate one set of
 * BROADCAST ires for receiving in the group.
 *
 * 1) When ipif_up_done -> illgrp_insert calls this function, BROADCAST
 *    ires have been added, but ill_ipif_up_count is 0. Thus, we don't
 *    assert for ill_ipif_up_count to be non-zero. This is the only case
 *    where ill_ipif_up_count is zero and we would still find the ires.
 *
 * 2) When ip_sioctl_group_name/ifgrp_insert calls this function, at
 *    least one ipif is UP and we just have to do the nomination.
 *
 * 3) When ill_handoff_responsibility calls us, some ill has been removed
 *    from the group. So, we have to do the nomination.
 *
 * Because of (3), there could be just one ill in the group. But we have
 * to nominate still as IRE_MARK_NORECV may have been marked on this.
 * Thus, this function does not optimize when there is only one ill as
 * it is not correct for (3).
 */
static void
ill_nominate_bcast_rcv(ill_group_t *illgrp)
{
        ill_t *ill;
        ipif_t *ipif;
        ipaddr_t subnet_addr;
        ipaddr_t prev_subnet_addr = 0;
        ipaddr_t net_addr;
        ipaddr_t prev_net_addr = 0;
        ipaddr_t net_mask = 0;
        ipaddr_t subnet_netmask;
        ipaddr_t addr;
        ip_stack_t *ipst;

        /*
         * When the last member is leaving, there is nothing to
         * nominate.
         */
        if (illgrp->illgrp_ill_count == 0) {
                ASSERT(illgrp->illgrp_ill == NULL);
                return;
        }

        ill = illgrp->illgrp_ill;
        ASSERT(!ill->ill_isv6);
        ipst = ill->ill_ipst;
        /*
         * We assume that ires with the same address belonging to the
         * same group have been grouped together.
         * Nominating a *single* ill in the group for sending and
         * receiving broadcast is done by making sure that the first
         * BROADCAST ire (which will be the one returned by
         * ire_ctable_lookup for ip_rput and the one that will be used
         * in ip_wput_ire) will be the one that will not have
         * IRE_MARK_NORECV set.
         *
         * 1) ip_rput checks and discards packets received on ires marked
         *    with IRE_MARK_NORECV. Thus, we don't send up duplicate
         *    broadcast packets. We need to clear IRE_MARK_NORECV on the
         *    first ire in the group for every broadcast address in the
         *    group. ip_rput will accept packets only on the first ire,
         *    i.e. only one copy of the broadcast packet.
         *
         * 2) ip_wput_ire needs to send out just one copy of the broadcast
         *    packet for the whole group. It needs to send out on the ill
         *    whose ire has not been marked with IRE_MARK_NORECV. If it
         *    sends on the one marked with IRE_MARK_NORECV, ip_rput will
         *    accept the copy echoed back on the other port where the ire
         *    is not marked with IRE_MARK_NORECV.
         *
         * Note that we just need to have the first IRE either loopback or
         * non-loopback (either of them may not exist if ire_create failed
         * during ipif_down) with IRE_MARK_NORECV not set. ip_rput will
         * always hit the first one and hence will always accept one copy.
         *
         * We have a broadcast ire per ill for all the unique prefixes
         * hosted on that ill. As we don't have a way of knowing the
         * unique prefixes on a given ill and hence in the whole group,
         * we just call ill_mark_bcast on all the prefixes that exist
         * in the group. For the common case of one prefix, the code
         * below optimizes by remembering the last address used for
         * marking. In the case of multiple prefixes, this will still
         * optimize depending on the order of prefixes.
         *
         * The only addresses unique across the whole group are 0.0.0.0
         * and 255.255.255.255, and thus we call ill_mark_bcast only once
         * for each of them. ill_mark_bcast enables the first ire in the
         * bucket for receiving and disables the others.
         */
        ill_mark_bcast(illgrp, 0, ipst);
        ill_mark_bcast(illgrp, INADDR_BROADCAST, ipst);
        for (; ill != NULL; ill = ill->ill_group_next) {

                for (ipif = ill->ill_ipif; ipif != NULL;
                    ipif = ipif->ipif_next) {

                        if (!(ipif->ipif_flags & IPIF_UP) ||
                            ipif->ipif_subnet == 0) {
                                continue;
                        }
                        if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
                            !(ipif->ipif_flags & IPIF_NOLOCAL)) {
                                net_mask = ip_net_mask(ipif->ipif_lcl_addr);
                        } else {
                                net_mask = htonl(IN_CLASSA_NET);
                        }
                        addr = net_mask & ipif->ipif_subnet;
                        if (prev_net_addr == 0 || prev_net_addr != addr) {
                                ill_mark_bcast(illgrp, addr, ipst);
                                net_addr = ~net_mask | addr;
                                ill_mark_bcast(illgrp, net_addr, ipst);
                        }
                        prev_net_addr = addr;

                        subnet_netmask = ipif->ipif_net_mask;
                        addr = ipif->ipif_subnet;
                        if (prev_subnet_addr == 0 ||
                            prev_subnet_addr != addr) {
                                ill_mark_bcast(illgrp, addr, ipst);
                                subnet_addr = ~subnet_netmask | addr;
                                ill_mark_bcast(illgrp, subnet_addr, ipst);
                        }
                        prev_subnet_addr = addr;
                }
        }
}
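/*
 * For example, if a group hosts ipifs 10.1.2.3/24 and 10.1.2.4/24, both
 * share the net address 10.0.0.0 and the subnet 10.1.2.0, so the
 * prev_net_addr / prev_subnet_addr checks above mark each broadcast
 * address once instead of once per ipif.
 */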
/*
 * This function is called while forming ill groups.
 *
 * Currently, we handle only allmulti groups. We want to join
 * allmulti on only one of the ills in the group. In the future,
 * when we have link aggregation, we may have to join normal
 * multicast groups on multiple ills as the switch does inbound load
 * balancing. The following are the functions that call this
 * function:
 *
 * 1) ill_recover_multicast : Interface is coming back UP.
 *    When the first ipif comes back UP, ipif_up_done/ipif_up_done_v6
 *    will call ill_recover_multicast to recover all the multicast
 *    groups. We need to make sure that only one member is joined
 *    in the ill group.
 *
 * 2) ip_addmulti/ip_addmulti_v6 : ill groups have already been formed.
 *    Somebody is joining allmulti. We need to make sure that only one
 *    member is joined in the group.
 *
 * 3) illgrp_insert : If allmulti has already been joined, we need to
 *    make sure that only one member is joined in the group.
 *
 * 4) ip_delmulti/ip_delmulti_v6 : Somebody in the group is leaving
 *    allmulti whom we have nominated. We need to pick some other ill.
 *
 * 5) illgrp_delete : The ill we nominated is leaving the group;
 *    we need to pick a new ill to join the group.
 *
 * For (1), (2), (5) - we just have to check whether there is
 * a good ill joined in the group. If we could not find any ill
 * joined in the group, we should join.
 *
 * For (4), the one that was nominated to receive left the group.
 * There could be nobody joined in the group when this function is
 * called.
 *
 * For (3) - we need to explicitly check whether there are multiple
 * ills joined in the group.
 *
 * For simplicity, we don't differentiate any of the above cases. We
 * just leave the group if it is joined on any of them and join on
 * the first good ill.
 */
int
ill_nominate_mcast_rcv(ill_group_t *illgrp)
{
        ilm_t *ilm;
        ill_t *ill;
        ill_t *fallback_inactive_ill = NULL;
        ill_t *fallback_failed_ill = NULL;
        int ret = 0;

        /*
         * Leave allmulti on all the ills and start fresh.
         */
        for (ill = illgrp->illgrp_ill; ill != NULL;
            ill = ill->ill_group_next) {
                if (ill->ill_join_allmulti)
                        (void) ip_leave_allmulti(ill->ill_ipif);
        }

        /*
         * Choose a good ill. Fall back to inactive or failed if
         * none is available. We need to fall back to FAILED in the
         * case where we have 2 interfaces in a group: one of them is
         * failed, the other is a good one, and the good one (not
         * marked inactive) is leaving the group.
         */
        ret = 0;
        for (ill = illgrp->illgrp_ill; ill != NULL;
            ill = ill->ill_group_next) {
                /* Never pick an offline interface */
                if (ill->ill_phyint->phyint_flags & PHYI_OFFLINE)
                        continue;

                if (ill->ill_phyint->phyint_flags & PHYI_FAILED) {
                        fallback_failed_ill = ill;
                        continue;
                }
                if (ill->ill_phyint->phyint_flags & PHYI_INACTIVE) {
                        fallback_inactive_ill = ill;
                        continue;
                }
                for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
                        if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
                                ret = ip_join_allmulti(ill->ill_ipif);
                                /*
                                 * ip_join_allmulti can fail because of
                                 * memory failures. So, make sure we join
                                 * at least on one ill.
                                 */
                                if (ill->ill_join_allmulti)
                                        return (0);
                        }
                }
        }
        if (ret != 0) {
                /*
                 * If we tried nominating above and failed to do so,
                 * return the error. We might have tried multiple times,
                 * but return the latest error.
                 */
                return (ret);
        }
        if ((ill = fallback_inactive_ill) != NULL) {
                for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
                        if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
                                ret = ip_join_allmulti(ill->ill_ipif);
                                return (ret);
                        }
                }
        } else if ((ill = fallback_failed_ill) != NULL) {
                for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
                        if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
                                ret = ip_join_allmulti(ill->ill_ipif);
                                return (ret);
                        }
                }
        }
        return (0);
}
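/*
 * For example, in a group of { good, INACTIVE, FAILED } ills, the loops
 * above join allmulti on the good ill; only when no good ill accepts
 * the join do we fall back to the INACTIVE one, and only after that to
 * the FAILED one.
 */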
/*
 * This function is called from illgrp_delete, after the ill has been
 * deleted from the group, to reschedule responsibilities to a
 * different ill.
 */
static void
ill_handoff_responsibility(ill_t *ill, ill_group_t *illgrp)
{
        ilm_t *ilm;
        ipif_t *ipif;
        ipaddr_t subnet_addr;
        ipaddr_t net_addr;
        ipaddr_t net_mask = 0;
        ipaddr_t subnet_netmask;
        ipaddr_t addr;
        ip_stack_t *ipst = ill->ill_ipst;

        ASSERT(ill->ill_group == NULL);
        /*
         * Broadcast Responsibility:
         *
         * 1. If this ill has been nominated for receiving broadcast
         *    packets, we need to find a new one. Before we find a new
         *    one, we need to re-group the ires that are part of this new
         *    group (assumed by ill_nominate_bcast_rcv). We do this by
         *    calling ill_group_bcast_for_xmit(ill) which will do the
         *    right thing for us.
         *
         * 2. If this ill was not nominated for receiving broadcast
         *    packets, we need to clear the IRE_MARK_NORECV flag
         *    so that we continue to send up broadcast packets.
         */
        if (!ill->ill_isv6) {
                /*
                 * Case 1 above : No optimization here. Just redo the
                 * nomination.
                 */
                ill_group_bcast_for_xmit(ill);
                ill_nominate_bcast_rcv(illgrp);

                /*
                 * Case 2 above : Lookup and clear IRE_MARK_NORECV.
                 */
                ill_clear_bcast_mark(ill, 0);
                ill_clear_bcast_mark(ill, INADDR_BROADCAST);

                for (ipif = ill->ill_ipif; ipif != NULL;
                    ipif = ipif->ipif_next) {

                        if (!(ipif->ipif_flags & IPIF_UP) ||
                            ipif->ipif_subnet == 0) {
                                continue;
                        }
                        if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
                            !(ipif->ipif_flags & IPIF_NOLOCAL)) {
                                net_mask = ip_net_mask(ipif->ipif_lcl_addr);
                        } else {
                                net_mask = htonl(IN_CLASSA_NET);
                        }
                        addr = net_mask & ipif->ipif_subnet;
                        ill_clear_bcast_mark(ill, addr);

                        net_addr = ~net_mask | addr;
                        ill_clear_bcast_mark(ill, net_addr);

                        subnet_netmask = ipif->ipif_net_mask;
                        addr = ipif->ipif_subnet;
                        ill_clear_bcast_mark(ill, addr);

                        subnet_addr = ~subnet_netmask | addr;
                        ill_clear_bcast_mark(ill, subnet_addr);
                }
        }

        /*
         * Multicast Responsibility.
         *
         * If we have joined allmulti on this one, find a new member
         * in the group to join allmulti. As this ill is already part
         * of allmulti, we don't have to join on this one.
         *
         * If we have not joined allmulti on this one, there is no
         * responsibility to hand off. But we need to take on new
         * responsibility, i.e. join allmulti on this one if we need
         * to.
         */
        if (ill->ill_join_allmulti) {
                (void) ill_nominate_mcast_rcv(illgrp);
        } else {
                for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
                        if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
                                (void) ip_join_allmulti(ill->ill_ipif);
                                break;
                        }
                }
        }

        /*
         * We intentionally do the flushing of IRE_CACHEs only matching
         * on the ill and not on groups. Note that we are already deleted
         * from the group.
         *
         * This will make sure that all IRE_CACHEs whose stq is pointing
         * at ill_wq or whose ire_ipif->ipif_ill points at this ill will
         * get deleted, and IRE_CACHEs that are not pointing at this ill
         * will be left alone.
         */
        if (ill->ill_isv6) {
                ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE,
                    IRE_CACHE, illgrp_cache_delete, (char *)ill, ill);
        } else {
                ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE,
                    IRE_CACHE, illgrp_cache_delete, (char *)ill, ill);
        }

        /*
         * Some conn may have cached one of the IREs deleted above. By
         * removing the ire reference, we clean up the extra reference
         * to the ill held in ire->ire_stq.
         */
        ipcl_walk(conn_cleanup_stale_ire, NULL, ipst);

        /*
         * Re-do source address selection for all the members in the
         * group, if they borrowed the source address from one of the
         * ipifs in this ill.
         */
        for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
                if (ill->ill_isv6) {
                        ipif_update_other_ipifs_v6(ipif, illgrp);
                } else {
                        ipif_update_other_ipifs(ipif, illgrp);
                }
        }
}

/*
 * Delete the ill from the group. The caller makes sure that it is
 * in a group and that it is okay to delete it from the group. So,
 * we always delete here.
 */
static void
illgrp_delete(ill_t *ill)
{
        ill_group_t *illgrp;
        ill_group_t *tmpg;
        ill_t *tmp_ill;
        ip_stack_t *ipst = ill->ill_ipst;

        /*
         * Reset illgrp_ill_schednext if it was pointing at us.
         * We need to do this before we set ill_group to NULL.
         */
        rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
        mutex_enter(&ill->ill_lock);

        illgrp_reset_schednext(ill);

        illgrp = ill->ill_group;

        /* Delete the ill from illgrp. */
        if (illgrp->illgrp_ill == ill) {
                illgrp->illgrp_ill = ill->ill_group_next;
        } else {
                tmp_ill = illgrp->illgrp_ill;
                while (tmp_ill->ill_group_next != ill) {
                        tmp_ill = tmp_ill->ill_group_next;
                        ASSERT(tmp_ill != NULL);
                }
                tmp_ill->ill_group_next = ill->ill_group_next;
        }
        ill->ill_group = NULL;
        ill->ill_group_next = NULL;

        illgrp->illgrp_ill_count--;
        mutex_exit(&ill->ill_lock);
        rw_exit(&ipst->ips_ill_g_lock);

        /*
         * As this ill is leaving the group, we need to hand off
         * the responsibilities to the other ills in the group, if
         * this ill had some responsibilities.
         */
        ill_handoff_responsibility(ill, illgrp);

        rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);

        if (illgrp->illgrp_ill_count == 0) {
                ASSERT(illgrp->illgrp_ill == NULL);
                if (ill->ill_isv6) {
                        if (illgrp == ipst->ips_illgrp_head_v6) {
                                ipst->ips_illgrp_head_v6 = illgrp->illgrp_next;
                        } else {
                                tmpg = ipst->ips_illgrp_head_v6;
                                while (tmpg->illgrp_next != illgrp) {
                                        tmpg = tmpg->illgrp_next;
                                        ASSERT(tmpg != NULL);
                                }
                                tmpg->illgrp_next = illgrp->illgrp_next;
                        }
                } else {
                        if (illgrp == ipst->ips_illgrp_head_v4) {
                                ipst->ips_illgrp_head_v4 = illgrp->illgrp_next;
                        } else {
                                tmpg = ipst->ips_illgrp_head_v4;
                                while (tmpg->illgrp_next != illgrp) {
                                        tmpg = tmpg->illgrp_next;
                                        ASSERT(tmpg != NULL);
                                }
                                tmpg->illgrp_next = illgrp->illgrp_next;
                        }
                }
                mutex_destroy(&illgrp->illgrp_lock);
                mi_free(illgrp);
        }
        rw_exit(&ipst->ips_ill_g_lock);

        /*
         * Even though the ill is out of the group it's not necessary
         * to set ipsq_split as TRUE, as the ipifs could be down
         * temporarily. We will split the ipsq when phyint_groupname
         * is set to NULL.
         */

        /*
         * Send a routing sockets message if we are deleting from
         * groups with names.
         */
        if (ill->ill_phyint->phyint_groupname_len != 0)
                ip_rts_ifmsg(ill->ill_ipif);
}

/*
 * Re-do source address selection. This is normally called when
 * an ill joins the group or when a non-NOLOCAL/DEPRECATED/ANYCAST
 * ipif comes up.
 */
void
ill_update_source_selection(ill_t *ill)
{
        ipif_t *ipif;

        ASSERT(IAM_WRITER_ILL(ill));

        if (ill->ill_group != NULL)
                ill = ill->ill_group->illgrp_ill;

        for (; ill != NULL; ill = ill->ill_group_next) {
                for (ipif = ill->ill_ipif; ipif != NULL;
                    ipif = ipif->ipif_next) {
                        if (ill->ill_isv6)
                                ipif_recreate_interface_routes_v6(NULL, ipif);
                        else
                                ipif_recreate_interface_routes(NULL, ipif);
                }
        }
}

/*
 * Insert ill in a group headed by illgrp_head. The caller can either
 * pass a groupname, in which case we search for a group with the
 * same name to insert in, or pass a group to insert in. This function
 * would only search groups with names.
 *
 * NOTE : The caller should make sure that there is at least one ipif
 *        UP on this ill so that illgrp_scheduler can pick this ill
 *        for outbound packets. If ill_ipif_up_count is zero, we have
 *        already sent a DL_UNBIND to the driver and we don't want to
 *        send anymore packets. We don't assert for ipif_up_count
 *        to be greater than zero, because ipif_up_done wants to call
 *        this function before bumping up the ipif_up_count. See
 *        ipif_up_done() for details.
 */
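/*
 * For example, "ifconfig hme0 group mpk17-84" eventually lands here
 * with groupname "mpk17-84"; if some other ill already carries that
 * groupname we append hme0's ill to its group, otherwise we allocate
 * a fresh singleton ill_group_t for it.
 */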
int
illgrp_insert(ill_group_t **illgrp_head, ill_t *ill, char *groupname,
    ill_group_t *grp_to_insert, boolean_t ipif_is_coming_up)
{
        ill_group_t *illgrp;
        ill_t *prev_ill;
        phyint_t *phyi;
        ip_stack_t *ipst = ill->ill_ipst;

        ASSERT(ill->ill_group == NULL);

        rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
        mutex_enter(&ill->ill_lock);

        if (groupname != NULL) {
                /*
                 * Look for a group with a matching groupname to insert.
                 */
                for (illgrp = *illgrp_head; illgrp != NULL;
                    illgrp = illgrp->illgrp_next) {

                        ill_t *tmp_ill;

                        /*
                         * If we have an ill_group_t in the list which has
                         * no ill_t assigned then we must be in the process
                         * of removing this group. We skip this as
                         * illgrp_delete() will remove it from the list.
                         */
                        if ((tmp_ill = illgrp->illgrp_ill) == NULL) {
                                ASSERT(illgrp->illgrp_ill_count == 0);
                                continue;
                        }

                        ASSERT(tmp_ill->ill_phyint != NULL);
                        phyi = tmp_ill->ill_phyint;
                        /*
                         * Look only at groups that have names.
                         */
                        if (phyi->phyint_groupname_len == 0)
                                continue;
                        /*
                         * Names are stored in the phyint common to both
                         * IPv4 and IPv6.
                         */
                        if (mi_strcmp(phyi->phyint_groupname,
                            groupname) == 0) {
                                break;
                        }
                }
        } else {
                /*
                 * If the caller passes in a NULL "grp_to_insert", we
                 * allocate one below and insert this singleton.
                 */
                illgrp = grp_to_insert;
        }

        ill->ill_group_next = NULL;

        if (illgrp == NULL) {
                illgrp = (ill_group_t *)mi_zalloc(sizeof (ill_group_t));
                if (illgrp == NULL) {
                        return (ENOMEM);
                }
                illgrp->illgrp_next = *illgrp_head;
                *illgrp_head = illgrp;
                illgrp->illgrp_ill = ill;
                illgrp->illgrp_ill_count = 1;
                ill->ill_group = illgrp;
                /*
                 * Used in illgrp_scheduler to protect multiple threads
                 * from traversing the list.
                 */
                mutex_init(&illgrp->illgrp_lock, NULL, MUTEX_DEFAULT, 0);
        } else {
                ASSERT(ill->ill_net_type ==
                    illgrp->illgrp_ill->ill_net_type);
                ASSERT(ill->ill_type == illgrp->illgrp_ill->ill_type);

                /* Insert ill at tail of this group */
                prev_ill = illgrp->illgrp_ill;
                while (prev_ill->ill_group_next != NULL)
                        prev_ill = prev_ill->ill_group_next;
                prev_ill->ill_group_next = ill;
                ill->ill_group = illgrp;
                illgrp->illgrp_ill_count++;
                /*
                 * Inherit group properties. Currently only forwarding
                 * is the property we try to keep the same with all the
                 * ills. When there are more, we will abstract this into
                 * a function.
                 */
                ill->ill_flags &= ~ILLF_ROUTER;
                ill->ill_flags |=
                    (illgrp->illgrp_ill->ill_flags & ILLF_ROUTER);
        }
        mutex_exit(&ill->ill_lock);
        rw_exit(&ipst->ips_ill_g_lock);

        /*
         * 1) When ipif_up_done() calls this function, ipif_up_count
         *    may be zero as it has not yet been bumped. But the ires
         *    have already been added. So, we do the nomination here
         *    itself. But, when ip_sioctl_groupname calls this, it checks
         *    for ill_ipif_up_count != 0. Thus we don't check for
         *    ill_ipif_up_count here while nominating broadcast ires for
         *    receive.
         *
         * 2) Similarly, we need to call ill_group_bcast_for_xmit here
         *    to group them properly as ire_add() has already happened
         *    in the ipif_up_done() case. For the ip_sioctl_groupname/
         *    ifgrp_insert case, we need to do it here anyway.
         */
        if (!ill->ill_isv6) {
                ill_group_bcast_for_xmit(ill);
                ill_nominate_bcast_rcv(illgrp);
        }

        if (!ipif_is_coming_up) {
                /*
                 * When ipif_up_done() calls this function, the multicast
                 * groups have not been joined yet. So, there is no point
                 * in nomination. ip_join_allmulti will handle groups when
                 * ill_recover_multicast is called from ipif_up_done()
                 * later.
                 */
                (void) ill_nominate_mcast_rcv(illgrp);
                /*
                 * ipif_up_done calls ill_update_source_selection
                 * anyway. Moreover, we don't want to re-create
                 * interface routes while ipif_up_done() still has a
                 * reference to them. Refer to ipif_up_done() for more
                 * details.
                 */
                ill_update_source_selection(ill);
        }

        /*
         * Send a routing sockets message if we are inserting into
         * groups with names.
         */
        if (groupname != NULL)
                ip_rts_ifmsg(ill->ill_ipif);
        return (0);
}

/*
 * Return the first phyint matching the groupname. There could
 * be more than one when there are ill groups.
 *
 * If 'usable' is set, then we exclude ones that are marked with any of
 * (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE).
 * Needs work: called only from ip_sioctl_groupname and from the
 * ipmp/netinfo emulation of ipmp.
 */
phyint_t *
phyint_lookup_group(char *groupname, boolean_t usable, ip_stack_t *ipst)
{
        phyint_t *phyi;

        ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
        /*
         * Group names are stored in the phyint - a common structure
         * to both IPv4 and IPv6.
         */
        phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index);
        for (; phyi != NULL;
            phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
            phyi, AVL_AFTER)) {
                if (phyi->phyint_groupname_len == 0)
                        continue;
                /*
                 * Skip the ones that should not be used since the callers
                 * sometimes use this for sending packets.
                 */
                if (usable && (phyi->phyint_flags &
                    (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE)))
                        continue;

                ASSERT(phyi->phyint_groupname != NULL);
                if (mi_strcmp(groupname, phyi->phyint_groupname) == 0)
                        return (phyi);
        }
        return (NULL);
}

/*
 * Return the first usable phyint matching the group index. By 'usable'
 * we exclude ones that are marked unusable with any of
 * (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE).
 *
 * Used only for the ipmp/netinfo emulation of ipmp.
 */
phyint_t *
phyint_lookup_group_ifindex(uint_t group_ifindex, ip_stack_t *ipst)
{
        phyint_t *phyi;

        ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));

        if (!ipst->ips_ipmp_hook_emulation)
                return (NULL);

        /*
         * Group indices are stored in the phyint - a common structure
         * to both IPv4 and IPv6.
         */
        phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index);
        for (; phyi != NULL;
            phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
            phyi, AVL_AFTER)) {
                /* Ignore the ones that do not have a group */
                if (phyi->phyint_groupname_len == 0)
                        continue;

                ASSERT(phyi->phyint_group_ifindex != 0);
                /*
                 * Skip the ones that should not be used since the callers
                 * sometimes use this for sending packets.
                 */
                if (phyi->phyint_flags &
                    (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE))
                        continue;
                if (phyi->phyint_group_ifindex == group_ifindex)
                        return (phyi);
        }
        return (NULL);
}
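/*
 * Both lookups above walk the global phyint AVL tree ordered by
 * ifindex, which is why the caller must hold ips_ill_g_lock (reader
 * is sufficient) across the call, as the ASSERTs verify. For example,
 * the netinfo emulation maps a group ifindex back to one usable phyint
 * of that IPMP group via phyint_lookup_group_ifindex().
 */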
/*
 * MT notes on creation and deletion of IPMP groups
 *
 * Creation and deletion of IPMP groups introduce the need to merge or
 * split the associated serialization objects, i.e. the ipsqs. Normally
 * all the ills in an IPMP group would map to a single ipsq. If IPMP is
 * not enabled, an ill pair (v4, v6), i.e. a phyint, would map to a
 * single ipsq. However during the execution of the SIOCSLIFGROUPNAME
 * command the picture changes. There is a need to change the <ill-ipsq>
 * association and we have to operate on both the source and destination
 * IPMP groups. For example, attempting to set the groupname of hme0 to
 * mpk17-85 when it already belongs to mpk17-84 has to handle 2 IPMP
 * groups and 2 ipsqs. All the ills belonging to either the source or
 * destination IPMP group are mapped to a single ipsq for executing the
 * SIOCSLIFGROUPNAME command. This is termed a merge of the ipsqs. The
 * <ill-ipsq> mapping is restored back to normal at a later point. This
 * is termed a split of the ipsq. The split of the ipsq happens while
 * unwinding from ipsq_exit. If at least 1 set groupname occurred on the
 * ipsq, then the ipsq_split flag is set. This indicates the ipsq has to
 * be examined for redoing the <ill-ipsq> associations.
 *
 * In the above example the ioctl handling code locates the current ipsq
 * of hme0 which is ipsq(mpk17-84). It then enters the above ipsq
 * immediately or eventually (after queueing the ioctl in
 * ipsq(mpk17-84)). Then it locates the destination ipsq which is
 * ipsq(mpk17-85) and merges the source ipsq into the destination ipsq.
 * If the destination ipsq is not busy, it also enters the destination
 * ipsq exclusively. Now the actual groupname setting operation can
 * proceed. If the destination ipsq is busy, the operation is enqueued
 * on the destination (merged) ipsq and will be handled in the unwind
 * from ipsq_exit.
 *
 * To prevent other threads from accessing the ill while the group name
 * change is in progress, we bring down the ipifs, which also removes
 * the ill from the group. The group is changed in phyint and when the
 * first ipif on the ill is brought up, the ill is inserted into the
 * right IPMP group by illgrp_insert.
 */
/* ARGSUSED */
int
ip_sioctl_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
        int i;
        char *tmp;
        int namelen;
        ill_t *ill = ipif->ipif_ill;
        ill_t *ill_v4, *ill_v6;
        int err = 0;
        phyint_t *phyi;
        phyint_t *phyi_tmp;
        struct lifreq *lifr;
        mblk_t *mp1;
        char *groupname;
        ipsq_t *ipsq;
        ip_stack_t *ipst = ill->ill_ipst;

        ASSERT(IAM_WRITER_IPIF(ipif));

        /* Existence verified in ip_wput_nondata */
        mp1 = mp->b_cont->b_cont;
        lifr = (struct lifreq *)mp1->b_rptr;
        groupname = lifr->lifr_groupname;

        if (ipif->ipif_id != 0)
                return (EINVAL);

        phyi = ill->ill_phyint;
        ASSERT(phyi != NULL);

        if (phyi->phyint_flags & PHYI_VIRTUAL)
                return (EINVAL);

        tmp = groupname;
        for (i = 0; i < LIFNAMSIZ && *tmp != '\0'; tmp++, i++)
                ;

        if (i == LIFNAMSIZ) {
                /* no null termination */
                return (EINVAL);
        }

        /*
         * Calculate the namelen exclusive of the null
         * termination character.
         */
        namelen = tmp - groupname;

        ill_v4 = phyi->phyint_illv4;
        ill_v6 = phyi->phyint_illv6;

        /*
         * ILL cannot be part of a usesrc group and an IPMP group at the
         * same time. No need to grab the ill_g_usesrc_lock here, see
         * synchronization notes in ip.c
         */
        if (ipif->ipif_ill->ill_usesrc_grp_next != NULL) {
                return (EINVAL);
        }

        /*
         * Mark the ill as changing.
         * This should queue all new requests on the syncq.
         */
        GRAB_ILL_LOCKS(ill_v4, ill_v6);

        if (ill_v4 != NULL)
                ill_v4->ill_state_flags |= ILL_CHANGING;
        if (ill_v6 != NULL)
                ill_v6->ill_state_flags |= ILL_CHANGING;
        RELEASE_ILL_LOCKS(ill_v4, ill_v6);

        if (namelen == 0) {
                /*
                 * Null string means remove this interface from the
                 * existing group.
                 */
                if (phyi->phyint_groupname_len == 0) {
                        /*
                         * Never was in a group.
                         */
                        err = 0;
                        goto done;
                }

                /*
                 * IPv4 or IPv6 may be temporarily out of the group when all
                 * the ipifs are down. Thus, we need to check for ill_group
                 * to be non-NULL.
                 */
                if (ill_v4 != NULL && ill_v4->ill_group != NULL) {
                        ill_down_ipifs(ill_v4, mp, 0, B_FALSE);
                        mutex_enter(&ill_v4->ill_lock);
                        if (!ill_is_quiescent(ill_v4)) {
                                /*
                                 * ipsq_pending_mp_add will not fail since
                                 * connp is NULL
                                 */
                                (void) ipsq_pending_mp_add(NULL,
                                    ill_v4->ill_ipif, q, mp, ILL_DOWN);
                                mutex_exit(&ill_v4->ill_lock);
                                err = EINPROGRESS;
                                goto done;
                        }
                        mutex_exit(&ill_v4->ill_lock);
                }

                if (ill_v6 != NULL && ill_v6->ill_group != NULL) {
                        ill_down_ipifs(ill_v6, mp, 0, B_FALSE);
                        mutex_enter(&ill_v6->ill_lock);
                        if (!ill_is_quiescent(ill_v6)) {
                                (void) ipsq_pending_mp_add(NULL,
                                    ill_v6->ill_ipif, q, mp, ILL_DOWN);
                                mutex_exit(&ill_v6->ill_lock);
                                err = EINPROGRESS;
                                goto done;
                        }
                        mutex_exit(&ill_v6->ill_lock);
                }

                rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
                GRAB_ILL_LOCKS(ill_v4, ill_v6);
                mutex_enter(&phyi->phyint_lock);
                ASSERT(phyi->phyint_groupname != NULL);
                mi_free(phyi->phyint_groupname);
                phyi->phyint_groupname = NULL;
                phyi->phyint_groupname_len = 0;

                /* Restore the ifindex used to be the per interface one */
                phyi->phyint_group_ifindex = 0;
                phyi->phyint_hook_ifindex = phyi->phyint_ifindex;
                mutex_exit(&phyi->phyint_lock);
                RELEASE_ILL_LOCKS(ill_v4, ill_v6);
                rw_exit(&ipst->ips_ill_g_lock);
                err = ill_up_ipifs(ill, q, mp);

                /*
                 * Set the split flag so that the ipsq can be split.
                 */
                mutex_enter(&phyi->phyint_ipsq->ipsq_lock);
                phyi->phyint_ipsq->ipsq_split = B_TRUE;
                mutex_exit(&phyi->phyint_ipsq->ipsq_lock);

        } else {
                if (phyi->phyint_groupname_len != 0) {
                        ASSERT(phyi->phyint_groupname != NULL);
                        /* Are we inserting in the same group ? */
                        if (mi_strcmp(groupname,
                            phyi->phyint_groupname) == 0) {
                                err = 0;
                                goto done;
                        }
                }

                rw_enter(&ipst->ips_ill_g_lock, RW_READER);
                /*
                 * Merge the ipsqs for the groups.
                 * This check is here as multiple groups/ills might be
                 * sharing the same ipsq.
                 * If we have to merge, the operation is restarted
                 * on the new ipsq.
                 */
16759 */ 16760 ipsq = ip_ipsq_lookup(groupname, B_FALSE, NULL, ipst); 16761 if (phyi->phyint_ipsq != ipsq) { 16762 rw_exit(&ipst->ips_ill_g_lock); 16763 err = ill_merge_groups(ill, NULL, groupname, mp, q); 16764 goto done; 16765 } 16766 /* 16767 * Running exclusive on new ipsq. 16768 */ 16769 16770 ASSERT(ipsq != NULL); 16771 ASSERT(ipsq->ipsq_writer == curthread); 16772 16773 /* 16774 * Check whether the ill_type and ill_net_type matches before 16775 * we allocate any memory so that the cleanup is easier. 16776 * 16777 * We can't group dissimilar ones as we can't load spread 16778 * packets across the group because of potential link-level 16779 * header differences. 16780 */ 16781 phyi_tmp = phyint_lookup_group(groupname, B_FALSE, ipst); 16782 if (phyi_tmp != NULL) { 16783 if ((ill_v4 != NULL && 16784 phyi_tmp->phyint_illv4 != NULL) && 16785 ((ill_v4->ill_net_type != 16786 phyi_tmp->phyint_illv4->ill_net_type) || 16787 (ill_v4->ill_type != 16788 phyi_tmp->phyint_illv4->ill_type))) { 16789 mutex_enter(&phyi->phyint_ipsq->ipsq_lock); 16790 phyi->phyint_ipsq->ipsq_split = B_TRUE; 16791 mutex_exit(&phyi->phyint_ipsq->ipsq_lock); 16792 rw_exit(&ipst->ips_ill_g_lock); 16793 return (EINVAL); 16794 } 16795 if ((ill_v6 != NULL && 16796 phyi_tmp->phyint_illv6 != NULL) && 16797 ((ill_v6->ill_net_type != 16798 phyi_tmp->phyint_illv6->ill_net_type) || 16799 (ill_v6->ill_type != 16800 phyi_tmp->phyint_illv6->ill_type))) { 16801 mutex_enter(&phyi->phyint_ipsq->ipsq_lock); 16802 phyi->phyint_ipsq->ipsq_split = B_TRUE; 16803 mutex_exit(&phyi->phyint_ipsq->ipsq_lock); 16804 rw_exit(&ipst->ips_ill_g_lock); 16805 return (EINVAL); 16806 } 16807 } 16808 16809 rw_exit(&ipst->ips_ill_g_lock); 16810 16811 /* 16812 * bring down all v4 ipifs. 16813 */ 16814 if (ill_v4 != NULL) { 16815 ill_down_ipifs(ill_v4, mp, 0, B_FALSE); 16816 } 16817 16818 /* 16819 * bring down all v6 ipifs. 16820 */ 16821 if (ill_v6 != NULL) { 16822 ill_down_ipifs(ill_v6, mp, 0, B_FALSE); 16823 } 16824 16825 /* 16826 * make sure all ipifs are down and there are no active 16827 * references. Call to ipsq_pending_mp_add will not fail 16828 * since connp is NULL. 16829 */ 16830 if (ill_v4 != NULL) { 16831 mutex_enter(&ill_v4->ill_lock); 16832 if (!ill_is_quiescent(ill_v4)) { 16833 (void) ipsq_pending_mp_add(NULL, 16834 ill_v4->ill_ipif, q, mp, ILL_DOWN); 16835 mutex_exit(&ill_v4->ill_lock); 16836 err = EINPROGRESS; 16837 goto done; 16838 } 16839 mutex_exit(&ill_v4->ill_lock); 16840 } 16841 16842 if (ill_v6 != NULL) { 16843 mutex_enter(&ill_v6->ill_lock); 16844 if (!ill_is_quiescent(ill_v6)) { 16845 (void) ipsq_pending_mp_add(NULL, 16846 ill_v6->ill_ipif, q, mp, ILL_DOWN); 16847 mutex_exit(&ill_v6->ill_lock); 16848 err = EINPROGRESS; 16849 goto done; 16850 } 16851 mutex_exit(&ill_v6->ill_lock); 16852 } 16853 16854 /* 16855 * allocate including space for null terminator 16856 * before we insert. 16857 */ 16858 tmp = (char *)mi_alloc(namelen + 1, BPRI_MED); 16859 if (tmp == NULL) 16860 return (ENOMEM); 16861 16862 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 16863 GRAB_ILL_LOCKS(ill_v4, ill_v6); 16864 mutex_enter(&phyi->phyint_lock); 16865 if (phyi->phyint_groupname_len != 0) { 16866 ASSERT(phyi->phyint_groupname != NULL); 16867 mi_free(phyi->phyint_groupname); 16868 } 16869 16870 /* 16871 * setup the new group name. 
16872 */ 16873 phyi->phyint_groupname = tmp; 16874 bcopy(groupname, phyi->phyint_groupname, namelen + 1); 16875 phyi->phyint_groupname_len = namelen + 1; 16876 16877 if (ipst->ips_ipmp_hook_emulation) { 16878 /* 16879 * If the group already exists we use the existing 16880 * group_ifindex, otherwise we pick a new index here. 16881 */ 16882 if (phyi_tmp != NULL) { 16883 phyi->phyint_group_ifindex = 16884 phyi_tmp->phyint_group_ifindex; 16885 } else { 16886 /* XXX We need a recovery strategy here. */ 16887 if (!ip_assign_ifindex( 16888 &phyi->phyint_group_ifindex, ipst)) 16889 cmn_err(CE_PANIC, 16890 "ip_assign_ifindex() failed"); 16891 } 16892 } 16893 /* 16894 * Select whether the netinfo and hook use the per-interface 16895 * or per-group ifindex. 16896 */ 16897 if (ipst->ips_ipmp_hook_emulation) 16898 phyi->phyint_hook_ifindex = phyi->phyint_group_ifindex; 16899 else 16900 phyi->phyint_hook_ifindex = phyi->phyint_ifindex; 16901 16902 if (ipst->ips_ipmp_hook_emulation && 16903 phyi_tmp != NULL) { 16904 /* First phyint in group - group PLUMB event */ 16905 ill_nic_info_plumb(ill, B_TRUE); 16906 } 16907 mutex_exit(&phyi->phyint_lock); 16908 RELEASE_ILL_LOCKS(ill_v4, ill_v6); 16909 rw_exit(&ipst->ips_ill_g_lock); 16910 16911 err = ill_up_ipifs(ill, q, mp); 16912 } 16913 16914 done: 16915 /* 16916 * normally ILL_CHANGING is cleared in ill_up_ipifs. 16917 */ 16918 if (err != EINPROGRESS) { 16919 GRAB_ILL_LOCKS(ill_v4, ill_v6); 16920 if (ill_v4 != NULL) 16921 ill_v4->ill_state_flags &= ~ILL_CHANGING; 16922 if (ill_v6 != NULL) 16923 ill_v6->ill_state_flags &= ~ILL_CHANGING; 16924 RELEASE_ILL_LOCKS(ill_v4, ill_v6); 16925 } 16926 return (err); 16927 } 16928 16929 /* ARGSUSED */ 16930 int 16931 ip_sioctl_get_groupname(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, 16932 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 16933 { 16934 ill_t *ill; 16935 phyint_t *phyi; 16936 struct lifreq *lifr; 16937 mblk_t *mp1; 16938 16939 /* Existence verified in ip_wput_nondata */ 16940 mp1 = mp->b_cont->b_cont; 16941 lifr = (struct lifreq *)mp1->b_rptr; 16942 ill = ipif->ipif_ill; 16943 phyi = ill->ill_phyint; 16944 16945 lifr->lifr_groupname[0] = '\0'; 16946 /* 16947 * ill_group may be null if all the interfaces 16948 * are down. But still, the phyint should always 16949 * hold the name. 16950 */ 16951 if (phyi->phyint_groupname_len != 0) { 16952 bcopy(phyi->phyint_groupname, lifr->lifr_groupname, 16953 phyi->phyint_groupname_len); 16954 } 16955 16956 return (0); 16957 } 16958 16959 16960 typedef struct conn_move_s { 16961 ill_t *cm_from_ill; 16962 ill_t *cm_to_ill; 16963 int cm_ifindex; 16964 } conn_move_t; 16965 16966 /* 16967 * ipcl_walk function for moving conn_multicast_ill for a given ill. 16968 */ 16969 static void 16970 conn_move(conn_t *connp, caddr_t arg) 16971 { 16972 conn_move_t *connm; 16973 int ifindex; 16974 int i; 16975 ill_t *from_ill; 16976 ill_t *to_ill; 16977 ilg_t *ilg; 16978 ilm_t *ret_ilm; 16979 16980 connm = (conn_move_t *)arg; 16981 ifindex = connm->cm_ifindex; 16982 from_ill = connm->cm_from_ill; 16983 to_ill = connm->cm_to_ill; 16984 16985 /* Change IP_BOUND_IF/IPV6_BOUND_IF associations. 
	 */

	/* All multicast fields protected by conn_lock */
	mutex_enter(&connp->conn_lock);
	ASSERT(connp->conn_outgoing_ill == connp->conn_incoming_ill);
	if ((connp->conn_outgoing_ill == from_ill) &&
	    (ifindex == 0 || connp->conn_orig_bound_ifindex == ifindex)) {
		connp->conn_outgoing_ill = to_ill;
		connp->conn_incoming_ill = to_ill;
	}

	/* Change IP_MULTICAST_IF/IPV6_MULTICAST_IF associations */

	if ((connp->conn_multicast_ill == from_ill) &&
	    (ifindex == 0 || connp->conn_orig_multicast_ifindex == ifindex)) {
		connp->conn_multicast_ill = connm->cm_to_ill;
	}

	/* Change IP_XMIT_IF associations */
	if ((connp->conn_xmit_if_ill == from_ill) &&
	    (ifindex == 0 || connp->conn_orig_xmit_ifindex == ifindex)) {
		connp->conn_xmit_if_ill = to_ill;
	}
	/*
	 * Change the ilg_ill to point to the new one. This assumes
	 * ilm_move_v6 has moved the ilms to new_ill and the driver
	 * has been told to receive packets on this interface.
	 * ilm_move_v6 always FAILBACKs all the ilms successfully.
	 * But when doing a FAILOVER, it might fail with ENOMEM and so
	 * some ilms may not have moved. We check to see whether
	 * the ilms have moved to to_ill. We can't check on from_ill
	 * as in the process of moving, we could have split an ilm
	 * into two - which has the same orig_ifindex and v6group.
	 *
	 * For IPv4, ilg_ipif moves implicitly. The code below really
	 * does not do anything for IPv4 as ilg_ill is NULL for IPv4.
	 */
	for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) {
		ilg = &connp->conn_ilg[i];
		if ((ilg->ilg_ill == from_ill) &&
		    (ifindex == 0 || ilg->ilg_orig_ifindex == ifindex)) {
			/* ifindex != 0 indicates failback */
			if (ifindex != 0) {
				connp->conn_ilg[i].ilg_ill = to_ill;
				continue;
			}

			ret_ilm = ilm_lookup_ill_index_v6(to_ill,
			    &ilg->ilg_v6group, ilg->ilg_orig_ifindex,
			    connp->conn_zoneid);

			if (ret_ilm != NULL)
				connp->conn_ilg[i].ilg_ill = to_ill;
		}
	}
	mutex_exit(&connp->conn_lock);
}

static void
conn_move_ill(ill_t *from_ill, ill_t *to_ill, int ifindex)
{
	conn_move_t connm;
	ip_stack_t *ipst = from_ill->ill_ipst;

	connm.cm_from_ill = from_ill;
	connm.cm_to_ill = to_ill;
	connm.cm_ifindex = ifindex;

	ipcl_walk(conn_move, (caddr_t)&connm, ipst);
}

/*
 * ilm has been moved from from_ill to to_ill.
 * Send DL_DISABMULTI_REQ on from_ill and DL_ENABMULTI_REQ on to_ill
 * as appropriate.
 *
 * NOTE : We can't reuse the code in ip_ll_addmulti/delmulti because
 *	  the code there de-references ipif_ill to get the ill to
 *	  send multicast requests. It does not work as ipif is on its
 *	  move and already moved when this function is called.
 *	  Thus, we need to use from_ill and to_ill to send down
 *	  multicast requests.
 */
static void
ilm_send_multicast_reqs(ill_t *from_ill, ill_t *to_ill)
{
	ipif_t *ipif;
	ilm_t *ilm;

	/*
	 * See whether we need to send down DL_ENABMULTI_REQ on
	 * to_ill as ilm has just been added.
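	 * Each new ilm below is first joined via igmp/mld and, when the
	 * driver has to be told about the group, a DL_ENABMULTI_REQ is
	 * sent; the from_ill side is handled at the "from" label.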
	 */
	ASSERT(IAM_WRITER_ILL(to_ill));
	ASSERT(IAM_WRITER_ILL(from_ill));

	ILM_WALKER_HOLD(to_ill);
	for (ilm = to_ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {

		if (!ilm->ilm_is_new || (ilm->ilm_flags & ILM_DELETED))
			continue;
		/*
		 * No locks held, ill/ipif cannot disappear as long
		 * as we are writer.
		 */
		ipif = to_ill->ill_ipif;
		/*
		 * No need to hold any lock as we are the writer and this
		 * can only be changed by a writer.
		 */
		ilm->ilm_is_new = B_FALSE;

		if (to_ill->ill_net_type != IRE_IF_RESOLVER ||
		    ipif->ipif_flags & IPIF_POINTOPOINT) {
			ip1dbg(("ilm_send_multicast_reqs: to_ill not "
			    "resolver\n"));
			continue;		/* Must be IRE_IF_NORESOLVER */
		}


		if (to_ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) {
			ip1dbg(("ilm_send_multicast_reqs: "
			    "to_ill MULTI_BCAST\n"));
			goto from;
		}

		if (to_ill->ill_isv6)
			mld_joingroup(ilm);
		else
			igmp_joingroup(ilm);

		if (to_ill->ill_ipif_up_count == 0) {
			/*
			 * Nobody there. All multicast addresses will be
			 * re-joined when we get the DL_BIND_ACK bringing the
			 * interface up.
			 */
			ilm->ilm_notify_driver = B_FALSE;
			ip1dbg(("ilm_send_multicast_reqs: to_ill nobody up\n"));
			goto from;
		}

		/*
		 * For the allmulti address, we want to join on only one
		 * interface. Checking for ilm_numentries_v6 is not correct
		 * as you may find an ilm with zero address on to_ill, but
		 * we may not have nominated to_ill for receiving. Thus, if
		 * we have nominated from_ill (ill_join_allmulti is set),
		 * nominate only if to_ill is not already nominated (to_ill
		 * normally should not have been nominated if "from_ill" has
		 * already been nominated. As we don't prevent failovers from
		 * happening across groups, we don't assert).
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
			/*
			 * There is no need to hold ill locks as we are
			 * writer on both ills and when ill_join_allmulti
			 * is changed the thread is always a writer.
			 */
			if (from_ill->ill_join_allmulti &&
			    !to_ill->ill_join_allmulti) {
				(void) ip_join_allmulti(to_ill->ill_ipif);
			}
		} else if (ilm->ilm_notify_driver) {

			/*
			 * This is a newly moved ilm so we need to tell the
			 * driver about the new group. There can be more than
			 * one ilm for the same group in the list, each with a
			 * different orig_ifindex. We have to inform the driver
			 * only once. In ilm_move_v[4,6] we only set the flag
			 * ilm_notify_driver for the first ilm.
			 */

			(void) ip_ll_send_enabmulti_req(to_ill,
			    &ilm->ilm_v6addr);
		}

		ilm->ilm_notify_driver = B_FALSE;

		/*
		 * See whether we need to send down DL_DISABMULTI_REQ on
		 * from_ill as ilm has just been removed.
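		 * A DL_DISABMULTI_REQ is only sent once the last ilm for
		 * the group is gone; the ilm_numentries_v6() check below
		 * ensures that.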
		 */
from:
		ipif = from_ill->ill_ipif;
		if (from_ill->ill_net_type != IRE_IF_RESOLVER ||
		    ipif->ipif_flags & IPIF_POINTOPOINT) {
			ip1dbg(("ilm_send_multicast_reqs: "
			    "from_ill not resolver\n"));
			continue;		/* Must be IRE_IF_NORESOLVER */
		}

		if (from_ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) {
			ip1dbg(("ilm_send_multicast_reqs: "
			    "from_ill MULTI_BCAST\n"));
			continue;
		}

		if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
			if (from_ill->ill_join_allmulti)
				(void) ip_leave_allmulti(from_ill->ill_ipif);
		} else if (ilm_numentries_v6(from_ill, &ilm->ilm_v6addr) == 0) {
			(void) ip_ll_send_disabmulti_req(from_ill,
			    &ilm->ilm_v6addr);
		}
	}
	ILM_WALKER_RELE(to_ill);
}

/*
 * This function is called when all multicast memberships need
 * to be moved from "from_ill" to "to_ill" for IPv6. This function is
 * called only once unlike the IPv4 counterpart where it is called after
 * every logical interface is moved. The reason is that multicast
 * memberships are joined using an interface address in IPv4 while in
 * IPv6, an interface index is used.
 */
static void
ilm_move_v6(ill_t *from_ill, ill_t *to_ill, int ifindex)
{
	ilm_t *ilm;
	ilm_t *ilm_next;
	ilm_t *new_ilm;
	ilm_t **ilmp;
	int count;
	char buf[INET6_ADDRSTRLEN];
	in6_addr_t ipv6_snm = ipv6_solicited_node_mcast;
	ip_stack_t *ipst = from_ill->ill_ipst;

	ASSERT(MUTEX_HELD(&to_ill->ill_lock));
	ASSERT(MUTEX_HELD(&from_ill->ill_lock));
	ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));

	if (ifindex == 0) {
		/*
		 * Form the solicited node mcast address which is used later.
		 */
		ipif_t *ipif;

		ipif = from_ill->ill_ipif;
		ASSERT(ipif->ipif_id == 0);

		ipv6_snm.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3];
	}

	ilmp = &from_ill->ill_ilm;
	for (ilm = from_ill->ill_ilm; ilm != NULL; ilm = ilm_next) {
		ilm_next = ilm->ilm_next;

		if (ilm->ilm_flags & ILM_DELETED) {
			ilmp = &ilm->ilm_next;
			continue;
		}

		new_ilm = ilm_lookup_ill_index_v6(to_ill, &ilm->ilm_v6addr,
		    ilm->ilm_orig_ifindex, ilm->ilm_zoneid);
		ASSERT(ilm->ilm_orig_ifindex != 0);
		if (ilm->ilm_orig_ifindex == ifindex) {
			/*
			 * We are failing back multicast memberships.
			 * If the same ilm exists in to_ill, it means somebody
			 * has joined the same group there e.g. ff02::1
			 * is joined within the kernel when the interfaces
			 * came UP.
			 */
			ASSERT(ilm->ilm_ipif == NULL);
			if (new_ilm != NULL) {
				new_ilm->ilm_refcnt += ilm->ilm_refcnt;
				if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE ||
				    !SLIST_IS_EMPTY(new_ilm->ilm_filter)) {
					new_ilm->ilm_is_new = B_TRUE;
				}
			} else {
				/*
				 * Check if we can just move the ilm.
				 */
				if (from_ill->ill_ilm_walker_cnt != 0) {
					/*
					 * We have walkers, so we cannot
					 * move the ilm. Allocate a new ilm
					 * instead; this (old) ilm will be
					 * marked ILM_DELETED at the end of
					 * the loop and will be freed when
					 * the last walker exits.
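					 * (This copy-then-mark pattern is
					 * used wherever ill_ilm_walker_cnt
					 * is nonzero in this file.)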
					 */
					new_ilm = (ilm_t *)mi_zalloc
					    (sizeof (ilm_t));
					if (new_ilm == NULL) {
						ip0dbg(("ilm_move_v6: "
						    "FAILBACK of IPv6"
						    " multicast address %s : "
						    "from %s to"
						    " %s failed : ENOMEM \n",
						    inet_ntop(AF_INET6,
						    &ilm->ilm_v6addr, buf,
						    sizeof (buf)),
						    from_ill->ill_name,
						    to_ill->ill_name));

						ilmp = &ilm->ilm_next;
						continue;
					}
					*new_ilm = *ilm;
					/*
					 * We don't want new_ilm linked to
					 * ilm's filter list.
					 */
					new_ilm->ilm_filter = NULL;
				} else {
					/*
					 * No walkers, so we can move the
					 * ilm; take it out of the list.
					 */
					*ilmp = ilm->ilm_next;
					ilm->ilm_next = NULL;
					new_ilm = ilm;
				}

				/*
				 * If this is the first ilm for the group
				 * set ilm_notify_driver so that we notify the
				 * driver in ilm_send_multicast_reqs.
				 */
				if (ilm_lookup_ill_v6(to_ill,
				    &new_ilm->ilm_v6addr, ALL_ZONES) == NULL)
					new_ilm->ilm_notify_driver = B_TRUE;

				new_ilm->ilm_ill = to_ill;
				/* Add to the to_ill's list */
				new_ilm->ilm_next = to_ill->ill_ilm;
				to_ill->ill_ilm = new_ilm;
				/*
				 * Set the flag so that mld_joingroup is
				 * called in ilm_send_multicast_reqs().
				 */
				new_ilm->ilm_is_new = B_TRUE;
			}
			goto bottom;
		} else if (ifindex != 0) {
			/*
			 * If this is FAILBACK (ifindex != 0) and the ifindex
			 * has not matched above, look at the next ilm.
			 */
			ilmp = &ilm->ilm_next;
			continue;
		}
		/*
		 * If we are here, it means ifindex is 0. Failover
		 * everything.
		 *
		 * We need to handle solicited node mcast address
		 * and all_nodes mcast address differently as they
		 * are joined within the kernel (ipif_multicast_up)
		 * and potentially from the userland. We are called
		 * after the ipifs of from_ill have been moved.
		 * If we still find ilms on ill with solicited node
		 * mcast address or all_nodes mcast address, it must
		 * belong to the UP interface that has not moved e.g.
		 * ipif_id 0 with the link local prefix does not move.
		 * We join this on the new ill accounting for all the
		 * userland memberships so that applications don't
		 * see any failure.
		 *
		 * We need to make sure that we account only for the
		 * solicited node and all node multicast addresses
		 * that were brought UP on these. In the case of
		 * a failover from A to B, we might have ilms belonging
		 * to A (ilm_orig_ifindex pointing at A) on B accounting
		 * for the membership from the userland. If we are failing
		 * over from B to C now, we will find the ones belonging
		 * to A on B. These don't account for the ill_ipif_up_count.
		 * They just move from B to C. The check below on
		 * ilm_orig_ifindex ensures that.
		 */
		if ((ilm->ilm_orig_ifindex ==
		    from_ill->ill_phyint->phyint_ifindex) &&
		    (IN6_ARE_ADDR_EQUAL(&ipv6_snm, &ilm->ilm_v6addr) ||
		    IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast,
		    &ilm->ilm_v6addr))) {
			ASSERT(ilm->ilm_refcnt > 0);
			count = ilm->ilm_refcnt - from_ill->ill_ipif_up_count;
			/*
			 * For indentation reasons, we are not using an
			 * "else" here.
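			 * count is the number of references beyond the
			 * kernel's own per-ipif joins, i.e. the userland
			 * memberships that must follow the failover.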
			 */
			if (count == 0) {
				ilmp = &ilm->ilm_next;
				continue;
			}
			ilm->ilm_refcnt -= count;
			if (new_ilm != NULL) {
				/*
				 * We can find one with the same
				 * ilm_orig_ifindex, if we are failing
				 * over to a STANDBY. This happens
				 * when somebody wants to join a group
				 * on a STANDBY interface and we
				 * internally join on a different one.
				 * If we had joined on from_ill then, a
				 * failover now will find a new ilm
				 * with this index.
				 */
				ip1dbg(("ilm_move_v6: FAILOVER, found"
				    " new ilm on %s, group address %s\n",
				    to_ill->ill_name,
				    inet_ntop(AF_INET6,
				    &ilm->ilm_v6addr, buf,
				    sizeof (buf))));
				new_ilm->ilm_refcnt += count;
				if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE ||
				    !SLIST_IS_EMPTY(new_ilm->ilm_filter)) {
					new_ilm->ilm_is_new = B_TRUE;
				}
			} else {
				new_ilm = (ilm_t *)mi_zalloc(sizeof (ilm_t));
				if (new_ilm == NULL) {
					ip0dbg(("ilm_move_v6: FAILOVER of IPv6"
					    " multicast address %s : from %s to"
					    " %s failed : ENOMEM \n",
					    inet_ntop(AF_INET6,
					    &ilm->ilm_v6addr, buf,
					    sizeof (buf)), from_ill->ill_name,
					    to_ill->ill_name));
					ilmp = &ilm->ilm_next;
					continue;
				}
				*new_ilm = *ilm;
				new_ilm->ilm_filter = NULL;
				new_ilm->ilm_refcnt = count;
				new_ilm->ilm_timer = INFINITY;
				new_ilm->ilm_rtx.rtx_timer = INFINITY;
				new_ilm->ilm_is_new = B_TRUE;
				/*
				 * If the to_ill has not joined this
				 * group we need to tell the driver in
				 * ilm_send_multicast_reqs.
				 */
				if (ilm_lookup_ill_v6(to_ill,
				    &new_ilm->ilm_v6addr, ALL_ZONES) == NULL)
					new_ilm->ilm_notify_driver = B_TRUE;

				new_ilm->ilm_ill = to_ill;
				/* Add to the to_ill's list */
				new_ilm->ilm_next = to_ill->ill_ilm;
				to_ill->ill_ilm = new_ilm;
				ASSERT(new_ilm->ilm_ipif == NULL);
			}
			if (ilm->ilm_refcnt == 0) {
				goto bottom;
			} else {
				new_ilm->ilm_fmode = MODE_IS_EXCLUDE;
				CLEAR_SLIST(new_ilm->ilm_filter);
				ilmp = &ilm->ilm_next;
			}
			continue;
		} else {
			/*
			 * ifindex = 0 means move everything pointing at
			 * from_ill. We are doing this because the ill has
			 * either FAILED or become INACTIVE.
			 *
			 * As we would like to move things later back to
			 * from_ill, we want to retain the identity of this
			 * ilm. Thus, we don't blindly increment the reference
			 * count on the ilms matching the address alone. We
			 * need to match on the ilm_orig_index also. new_ilm
			 * was obtained by matching ilm_orig_index also.
			 */
			if (new_ilm != NULL) {
				/*
				 * This is possible only if a previous restore
				 * was incomplete i.e. restore to
				 * ilm_orig_ifindex left some ilms because
				 * of some failures. Thus when we are failing
				 * again, we might find our old friends there.
				 */
				ip1dbg(("ilm_move_v6: FAILOVER, found new ilm"
				    " on %s, group address %s\n",
				    to_ill->ill_name,
				    inet_ntop(AF_INET6,
				    &ilm->ilm_v6addr, buf,
				    sizeof (buf))));
				new_ilm->ilm_refcnt += ilm->ilm_refcnt;
				if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE ||
				    !SLIST_IS_EMPTY(new_ilm->ilm_filter)) {
					new_ilm->ilm_is_new = B_TRUE;
				}
			} else {
				if (from_ill->ill_ilm_walker_cnt != 0) {
					new_ilm = (ilm_t *)
					    mi_zalloc(sizeof (ilm_t));
					if (new_ilm == NULL) {
						ip0dbg(("ilm_move_v6: "
						    "FAILOVER of IPv6"
						    " multicast address %s : "
						    "from %s to"
						    " %s failed : ENOMEM \n",
						    inet_ntop(AF_INET6,
						    &ilm->ilm_v6addr, buf,
						    sizeof (buf)),
						    from_ill->ill_name,
						    to_ill->ill_name));

						ilmp = &ilm->ilm_next;
						continue;
					}
					*new_ilm = *ilm;
					new_ilm->ilm_filter = NULL;
				} else {
					*ilmp = ilm->ilm_next;
					new_ilm = ilm;
				}
				/*
				 * If the to_ill has not joined this
				 * group we need to tell the driver in
				 * ilm_send_multicast_reqs.
				 */
				if (ilm_lookup_ill_v6(to_ill,
				    &new_ilm->ilm_v6addr, ALL_ZONES) == NULL)
					new_ilm->ilm_notify_driver = B_TRUE;

				/* Add to the to_ill's list */
				new_ilm->ilm_next = to_ill->ill_ilm;
				to_ill->ill_ilm = new_ilm;
				ASSERT(ilm->ilm_ipif == NULL);
				new_ilm->ilm_ill = to_ill;
				new_ilm->ilm_is_new = B_TRUE;
			}

		}

bottom:
		/*
		 * Revert multicast filter state to (EXCLUDE, NULL).
		 * new_ilm->ilm_is_new should already be set if needed.
		 */
		new_ilm->ilm_fmode = MODE_IS_EXCLUDE;
		CLEAR_SLIST(new_ilm->ilm_filter);
		/*
		 * We allocated/got a new ilm, free the old one.
		 */
		if (new_ilm != ilm) {
			if (from_ill->ill_ilm_walker_cnt == 0) {
				*ilmp = ilm->ilm_next;
				ilm->ilm_next = NULL;
				FREE_SLIST(ilm->ilm_filter);
				FREE_SLIST(ilm->ilm_pendsrcs);
				FREE_SLIST(ilm->ilm_rtx.rtx_allow);
				FREE_SLIST(ilm->ilm_rtx.rtx_block);
				mi_free((char *)ilm);
			} else {
				ilm->ilm_flags |= ILM_DELETED;
				from_ill->ill_ilm_cleanup_reqd = 1;
				ilmp = &ilm->ilm_next;
			}
		}
	}
}

/*
 * Move all the multicast memberships to to_ill. Called when
 * an ipif moves from "from_ill" to "to_ill". This function is slightly
 * different from its IPv6 counterpart as multicast memberships are
 * associated with ills in IPv6. This function is called after every ipif
 * is moved, unlike IPv6, where it is moved only once.
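 * (Hence ilm_move_v4() takes the single ipif being moved as an argument,
 * while ilm_move_v6() takes an ifindex.)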
 */
static void
ilm_move_v4(ill_t *from_ill, ill_t *to_ill, ipif_t *ipif)
{
	ilm_t *ilm;
	ilm_t *ilm_next;
	ilm_t *new_ilm;
	ilm_t **ilmp;
	ip_stack_t *ipst = from_ill->ill_ipst;

	ASSERT(MUTEX_HELD(&to_ill->ill_lock));
	ASSERT(MUTEX_HELD(&from_ill->ill_lock));
	ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));

	ilmp = &from_ill->ill_ilm;
	for (ilm = from_ill->ill_ilm; ilm != NULL; ilm = ilm_next) {
		ilm_next = ilm->ilm_next;

		if (ilm->ilm_flags & ILM_DELETED) {
			ilmp = &ilm->ilm_next;
			continue;
		}

		ASSERT(ilm->ilm_ipif != NULL);

		if (ilm->ilm_ipif != ipif) {
			ilmp = &ilm->ilm_next;
			continue;
		}

		if (V4_PART_OF_V6(ilm->ilm_v6addr) ==
		    htonl(INADDR_ALLHOSTS_GROUP)) {
			/*
			 * We joined this in ipif_multicast_up
			 * and we never did an ipif_multicast_down
			 * for IPv4. If nobody else from the userland
			 * has a reference, we free the ilm, and later
			 * when this ipif comes up on the new ill,
			 * we will join this again.
			 */
			if (--ilm->ilm_refcnt == 0)
				goto delete_ilm;

			new_ilm = ilm_lookup_ipif(ipif,
			    V4_PART_OF_V6(ilm->ilm_v6addr));
			if (new_ilm != NULL) {
				new_ilm->ilm_refcnt += ilm->ilm_refcnt;
				/*
				 * We still need to deal with the from_ill.
				 */
				new_ilm->ilm_is_new = B_TRUE;
				new_ilm->ilm_fmode = MODE_IS_EXCLUDE;
				CLEAR_SLIST(new_ilm->ilm_filter);
				goto delete_ilm;
			}
			/*
			 * If we could not find one e.g. ipif is
			 * still down on to_ill, we add this ilm
			 * on the new ill to preserve the reference
			 * count.
			 */
		}
		/*
		 * When ipifs move, ilms always move with it
		 * to the NEW ill. Thus we should never be
		 * able to find the ilm till we really move it here.
		 */
		ASSERT(ilm_lookup_ipif(ipif,
		    V4_PART_OF_V6(ilm->ilm_v6addr)) == NULL);

		if (from_ill->ill_ilm_walker_cnt != 0) {
			new_ilm = (ilm_t *)mi_zalloc(sizeof (ilm_t));
			if (new_ilm == NULL) {
				char buf[INET6_ADDRSTRLEN];
				ip0dbg(("ilm_move_v4: FAILBACK of IPv4"
				    " multicast address %s : "
				    "from %s to"
				    " %s failed : ENOMEM \n",
				    inet_ntop(AF_INET,
				    &ilm->ilm_v6addr, buf,
				    sizeof (buf)),
				    from_ill->ill_name,
				    to_ill->ill_name));

				ilmp = &ilm->ilm_next;
				continue;
			}
			*new_ilm = *ilm;
			/* We don't want new_ilm linked to ilm's filter list */
			new_ilm->ilm_filter = NULL;
		} else {
			/* Remove from the list */
			*ilmp = ilm->ilm_next;
			new_ilm = ilm;
		}

		/*
		 * If we have never joined this group on the to_ill
		 * make sure we tell the driver.
		 */
		if (ilm_lookup_ill_v6(to_ill, &new_ilm->ilm_v6addr,
		    ALL_ZONES) == NULL)
			new_ilm->ilm_notify_driver = B_TRUE;

		/* Add to the to_ill's list */
		new_ilm->ilm_next = to_ill->ill_ilm;
		to_ill->ill_ilm = new_ilm;
		new_ilm->ilm_is_new = B_TRUE;

		/*
		 * Revert multicast filter state to (EXCLUDE, NULL)
		 */
		new_ilm->ilm_fmode = MODE_IS_EXCLUDE;
		CLEAR_SLIST(new_ilm->ilm_filter);

		/*
		 * Delete only if we have allocated a new ilm.
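		 * (Note that delete_ilm is also reached via the gotos in
		 * the INADDR_ALLHOSTS_GROUP handling above.)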
		 */
		if (new_ilm != ilm) {
delete_ilm:
			if (from_ill->ill_ilm_walker_cnt == 0) {
				/* Remove from the list */
				*ilmp = ilm->ilm_next;
				ilm->ilm_next = NULL;
				FREE_SLIST(ilm->ilm_filter);
				FREE_SLIST(ilm->ilm_pendsrcs);
				FREE_SLIST(ilm->ilm_rtx.rtx_allow);
				FREE_SLIST(ilm->ilm_rtx.rtx_block);
				mi_free((char *)ilm);
			} else {
				ilm->ilm_flags |= ILM_DELETED;
				from_ill->ill_ilm_cleanup_reqd = 1;
				ilmp = &ilm->ilm_next;
			}
		}
	}
}

static uint_t
ipif_get_id(ill_t *ill, uint_t id)
{
	uint_t unit;
	ipif_t *tipif;
	boolean_t found = B_FALSE;
	ip_stack_t *ipst = ill->ill_ipst;

	/*
	 * During failback, we want to go back to the same id
	 * instead of the smallest id so that the original
	 * configuration is maintained. id is non-zero in that
	 * case.
	 */
	if (id != 0) {
		/*
		 * While failing back, if we still have an ipif with
		 * MAX_ADDRS_PER_IF, it means this will be replaced
		 * as soon as we return from this function. It was
		 * set to MAX_ADDRS_PER_IF by the caller so that
		 * we can choose the smallest id. Thus we return zero
		 * in that case, ignoring the hint.
		 */
		if (ill->ill_ipif->ipif_id == MAX_ADDRS_PER_IF)
			return (0);
		for (tipif = ill->ill_ipif; tipif != NULL;
		    tipif = tipif->ipif_next) {
			if (tipif->ipif_id == id) {
				found = B_TRUE;
				break;
			}
		}
		/*
		 * If somebody already plumbed another logical
		 * interface with the same id, we won't be able
		 * to find it.
		 */
		if (!found)
			return (id);
	}
	for (unit = 0; unit <= ipst->ips_ip_addrs_per_if; unit++) {
		found = B_FALSE;
		for (tipif = ill->ill_ipif; tipif != NULL;
		    tipif = tipif->ipif_next) {
			if (tipif->ipif_id == unit) {
				found = B_TRUE;
				break;
			}
		}
		if (!found)
			break;
	}
	return (unit);
}

/* ARGSUSED */
static int
ipif_move(ipif_t *ipif, ill_t *to_ill, queue_t *q, mblk_t *mp,
    ipif_t **rep_ipif_ptr)
{
	ill_t *from_ill;
	ipif_t *rep_ipif;
	uint_t unit;
	int err = 0;
	ipif_t *to_ipif;
	struct iocblk *iocp;
	boolean_t failback_cmd;
	boolean_t remove_ipif;
	int rc;
	ip_stack_t *ipst;

	ASSERT(IAM_WRITER_ILL(to_ill));
	ASSERT(IAM_WRITER_IPIF(ipif));

	iocp = (struct iocblk *)mp->b_rptr;
	failback_cmd = (iocp->ioc_cmd == SIOCLIFFAILBACK);
	remove_ipif = B_FALSE;

	from_ill = ipif->ipif_ill;
	ipst = from_ill->ill_ipst;

	ASSERT(MUTEX_HELD(&to_ill->ill_lock));
	ASSERT(MUTEX_HELD(&from_ill->ill_lock));
	ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));

	/*
	 * Don't move LINK LOCAL addresses as they are tied to
	 * the physical interface.
	 */
	if (from_ill->ill_isv6 &&
	    IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr)) {
		ipif->ipif_was_up = B_FALSE;
		IPIF_UNMARK_MOVING(ipif);
		return (0);
	}

	/*
	 * We set the ipif_id to maximum so that the search for
	 * ipif_id will pick the lowest number i.e. 0 in the
	 * following 2 cases:
	 *
	 * 1) We have a replacement ipif at the head of to_ill.
	 *    We can't remove it yet as we can exceed ip_addrs_per_if
	 *    on to_ill and hence the MOVE might fail. We want to
	 *    remove it only if we could move the ipif.
	 *    Thus, by
	 *    setting it to the MAX value, we make the search in
	 *    ipif_get_id return the zeroth id.
	 *
	 * 2) When DR pulls out the NIC and re-plumbs the interface,
	 *    we might just have a zero address plumbed on the ipif
	 *    with zero id in the case of IPv4. We remove that while
	 *    doing the failback. We want to remove it only if we
	 *    could move the ipif. Thus, by setting it to the MAX
	 *    value, we make the search in ipif_get_id return the
	 *    zeroth id.
	 *
	 * Both (1) and (2) are done only when we are moving
	 * an ipif (either due to failover/failback) which originally
	 * belonged to this interface i.e. the ipif_orig_ifindex is
	 * the same as to_ill's ifindex. This is needed so that
	 * FAILOVER from A -> B (A failed) followed by FAILOVER
	 * from B -> A (B is being removed from the group) and
	 * FAILBACK from A -> B restores the original configuration.
	 * Without the check for orig_ifindex, the second FAILOVER
	 * could make the ipif belonging to B replace A's zeroth
	 * ipif and the subsequent failback would re-create the
	 * replacement ipif again.
	 *
	 * NOTE : We created the replacement ipif when we did a
	 * FAILOVER (See below). We could check for FAILBACK and
	 * then look for the replacement ipif to be removed. But we don't
	 * want to do that because we want to allow the possibility
	 * of a FAILOVER from A -> B (which creates the replacement ipif),
	 * followed by a *FAILOVER* from B -> A instead of a FAILBACK
	 * from B -> A.
	 */
	to_ipif = to_ill->ill_ipif;
	if ((to_ill->ill_phyint->phyint_ifindex ==
	    ipif->ipif_orig_ifindex) &&
	    IPIF_REPL_CHECK(to_ipif, failback_cmd)) {
		ASSERT(to_ipif->ipif_id == 0);
		remove_ipif = B_TRUE;
		to_ipif->ipif_id = MAX_ADDRS_PER_IF;
	}
	/*
	 * Find the lowest logical unit number on the to_ill.
	 * If we are failing back, try to get the original id
	 * rather than the lowest one so that the original
	 * configuration is maintained.
	 *
	 * XXX need a better scheme for this.
	 */
	if (failback_cmd) {
		unit = ipif_get_id(to_ill, ipif->ipif_orig_ipifid);
	} else {
		unit = ipif_get_id(to_ill, 0);
	}

	/* Reset back to zero in case we fail below */
	if (to_ipif->ipif_id == MAX_ADDRS_PER_IF)
		to_ipif->ipif_id = 0;

	if (unit == ipst->ips_ip_addrs_per_if) {
		ipif->ipif_was_up = B_FALSE;
		IPIF_UNMARK_MOVING(ipif);
		return (EINVAL);
	}

	/*
	 * ipif is ready to move from "from_ill" to "to_ill".
	 *
	 * 1) If we are moving ipif with id zero, create a
	 *    replacement ipif for this ipif on from_ill. If this fails
	 *    fail the MOVE operation.
	 *
	 * 2) Remove the replacement ipif on to_ill if any.
	 *    We could remove the replacement ipif when we are moving
	 *    the ipif with id zero. But what if somebody already
	 *    unplumbed it? Thus we always remove it if it is present.
	 *    We want to do it only if we are sure we are going to
	 *    move the ipif to to_ill which is why there are no
	 *    returns due to error till ipif is linked to to_ill.
	 *    Note that the first ipif that we failback will always
	 *    be zero if it is present.
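	 * (The replacement ipif created below is a placeholder with
	 * id zero, INADDR_ANY addresses and IPIF_NOFAILOVER set, so
	 * it can never itself move.)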
	 */
	if (ipif->ipif_id == 0) {
		ipaddr_t inaddr_any = INADDR_ANY;

		rep_ipif = (ipif_t *)mi_alloc(sizeof (ipif_t), BPRI_MED);
		if (rep_ipif == NULL) {
			ipif->ipif_was_up = B_FALSE;
			IPIF_UNMARK_MOVING(ipif);
			return (ENOMEM);
		}
		*rep_ipif = ipif_zero;
		/*
		 * Before we put the ipif on the list, store the addresses
		 * as mapped addresses as some of the ioctls e.g. SIOCGIFADDR
		 * assume so. This logic is not any different from what
		 * ipif_allocate does.
		 */
		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
		    &rep_ipif->ipif_v6lcl_addr);
		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
		    &rep_ipif->ipif_v6src_addr);
		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
		    &rep_ipif->ipif_v6subnet);
		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
		    &rep_ipif->ipif_v6net_mask);
		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
		    &rep_ipif->ipif_v6brd_addr);
		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
		    &rep_ipif->ipif_v6pp_dst_addr);
		/*
		 * We mark IPIF_NOFAILOVER so that this can never
		 * move.
		 */
		rep_ipif->ipif_flags = ipif->ipif_flags | IPIF_NOFAILOVER;
		rep_ipif->ipif_flags &= ~IPIF_UP & ~IPIF_DUPLICATE;
		rep_ipif->ipif_replace_zero = B_TRUE;
		mutex_init(&rep_ipif->ipif_saved_ire_lock, NULL,
		    MUTEX_DEFAULT, NULL);
		rep_ipif->ipif_id = 0;
		rep_ipif->ipif_ire_type = ipif->ipif_ire_type;
		rep_ipif->ipif_ill = from_ill;
		rep_ipif->ipif_orig_ifindex =
		    from_ill->ill_phyint->phyint_ifindex;
		/* Insert at head */
		rep_ipif->ipif_next = from_ill->ill_ipif;
		from_ill->ill_ipif = rep_ipif;
		/*
		 * We don't really care to let apps know about
		 * this interface.
		 */
	}

	if (remove_ipif) {
		/*
		 * We set to a max value above for this case to get
		 * id zero. ASSERT that we did get one.
		 */
		ASSERT((to_ipif->ipif_id == 0) && (unit == 0));
		rep_ipif = to_ipif;
		to_ill->ill_ipif = rep_ipif->ipif_next;
		rep_ipif->ipif_next = NULL;
		/*
		 * If some apps scanned and found this interface,
		 * it is time to let them know, so that they can
		 * delete it.
		 */

		*rep_ipif_ptr = rep_ipif;
	}

	/* Get it out of the ILL interface list. */
	ipif_remove(ipif, B_FALSE);

	/* Assign the new ill */
	ipif->ipif_ill = to_ill;
	ipif->ipif_id = unit;
	/* id has already been checked */
	rc = ipif_insert(ipif, B_FALSE, B_FALSE);
	ASSERT(rc == 0);
	/* Let SCTP update its list */
	sctp_move_ipif(ipif, from_ill, to_ill);
	/*
	 * Handle the failover and failback of ipif_t between
	 * ill_t's that have differing maximum mtu values.
	 */
	if (ipif->ipif_mtu > to_ill->ill_max_mtu) {
		if (ipif->ipif_saved_mtu == 0) {
			/*
			 * As this ipif_t is moving to an ill_t
			 * that has a lower ill_max_mtu, its
			 * ipif_mtu needs to be saved so it can
			 * be restored during failback or during
			 * failover to an ill_t which has a
			 * higher ill_max_mtu.
			 */
			ipif->ipif_saved_mtu = ipif->ipif_mtu;
			ipif->ipif_mtu = to_ill->ill_max_mtu;
		} else {
			/*
			 * The ipif_t is, once again, moving to
			 * an ill_t that has a lower maximum mtu
			 * value.
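			 * ipif_saved_mtu is left as is, so a later
			 * failback can still restore the original value.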
			 */
			ipif->ipif_mtu = to_ill->ill_max_mtu;
		}
	} else if (ipif->ipif_mtu < to_ill->ill_max_mtu &&
	    ipif->ipif_saved_mtu != 0) {
		/*
		 * The mtu of this ipif_t had to be reduced
		 * during an earlier failover; this is an
		 * opportunity for it to be increased (either as
		 * part of another failover or a failback).
		 */
		if (ipif->ipif_saved_mtu <= to_ill->ill_max_mtu) {
			ipif->ipif_mtu = ipif->ipif_saved_mtu;
			ipif->ipif_saved_mtu = 0;
		} else {
			ipif->ipif_mtu = to_ill->ill_max_mtu;
		}
	}

	/*
	 * We preserve all the other fields of the ipif including
	 * ipif_saved_ire_mp. The routes that are saved here will
	 * be recreated on the new interface and back on the old
	 * interface when we move back.
	 */
	ASSERT(ipif->ipif_arp_del_mp == NULL);

	return (err);
}

static int
ipif_move_all(ill_t *from_ill, ill_t *to_ill, queue_t *q, mblk_t *mp,
    int ifindex, ipif_t **rep_ipif_ptr)
{
	ipif_t *mipif;
	ipif_t *ipif_next;
	int err;

	/*
	 * We don't really try to MOVE back things if some of the
	 * operations fail. The daemon will take care of moving again
	 * later on.
	 */
	for (mipif = from_ill->ill_ipif; mipif != NULL; mipif = ipif_next) {
		ipif_next = mipif->ipif_next;
		if (!(mipif->ipif_flags & IPIF_NOFAILOVER) &&
		    (ifindex == 0 || ifindex == mipif->ipif_orig_ifindex)) {

			err = ipif_move(mipif, to_ill, q, mp, rep_ipif_ptr);

			/*
			 * When the MOVE fails, it is the job of the
			 * application to take care of this properly
			 * i.e. try again if it is ENOMEM.
			 */
			if (mipif->ipif_ill != from_ill) {
				/*
				 * ipif has moved.
				 *
				 * Move the multicast memberships associated
				 * with this ipif to the new ill. For IPv6, we
				 * do it once after all the ipifs are moved
				 * (in ill_move) as they are not associated
				 * with ipifs.
				 *
				 * We need to move the ilms as the ipif has
				 * already been moved to a new ill even
				 * in the case of errors. If we don't move
				 * them now, neither will ilm_free(ipif)
				 * find the ilm when somebody unplumbs this
				 * ipif, nor will ilm_delete(ilm) be able
				 * to find the ilm.
				 */
				if (!from_ill->ill_isv6)
					ilm_move_v4(from_ill, to_ill, mipif);
			}

			if (err != 0)
				return (err);
		}
	}
	return (0);
}

static int
ill_move(ill_t *from_ill, ill_t *to_ill, queue_t *q, mblk_t *mp)
{
	int ifindex;
	int err;
	struct iocblk *iocp;
	ipif_t *ipif;
	ipif_t *rep_ipif_ptr = NULL;
	ipif_t *from_ipif = NULL;
	boolean_t check_rep_if = B_FALSE;
	ip_stack_t *ipst = from_ill->ill_ipst;

	iocp = (struct iocblk *)mp->b_rptr;
	if (iocp->ioc_cmd == SIOCLIFFAILOVER) {
		/*
		 * Move everything pointing at from_ill to to_ill.
		 * We achieve this by passing in 0 as ifindex.
		 */
		ifindex = 0;
	} else {
		/*
		 * Move everything pointing at from_ill whose original
		 * ifindex of connp, ipif, ilm points at to_ill's ifindex.
		 * We achieve this by passing in ifindex rather than 0.
		 * Multicast vifs, ilgs move implicitly because ipifs move.
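		 * (ifindex == 0 thus means an unconditional FAILOVER,
		 * while a nonzero ifindex selects only state that
		 * originally belonged to the FAILBACK target.)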
		 */
		ASSERT(iocp->ioc_cmd == SIOCLIFFAILBACK);
		ifindex = to_ill->ill_phyint->phyint_ifindex;
	}

	/*
	 * Determine if there is at least one ipif that would move from
	 * 'from_ill' to 'to_ill'. If so, it is possible that the replacement
	 * ipif (if it exists) on the to_ill would be consumed as a result of
	 * the move, in which case we need to quiesce the replacement ipif also.
	 */
	for (from_ipif = from_ill->ill_ipif; from_ipif != NULL;
	    from_ipif = from_ipif->ipif_next) {
		if (((ifindex == 0) ||
		    (ifindex == from_ipif->ipif_orig_ifindex)) &&
		    !(from_ipif->ipif_flags & IPIF_NOFAILOVER)) {
			check_rep_if = B_TRUE;
			break;
		}
	}


	ill_down_ipifs(from_ill, mp, ifindex, B_TRUE);

	GRAB_ILL_LOCKS(from_ill, to_ill);
	if ((ipif = ill_quiescent_to_move(from_ill)) != NULL) {
		(void) ipsq_pending_mp_add(NULL, ipif, q,
		    mp, ILL_MOVE_OK);
		RELEASE_ILL_LOCKS(from_ill, to_ill);
		return (EINPROGRESS);
	}

	/* Check if the replacement ipif is quiescent to delete */
	if (check_rep_if && IPIF_REPL_CHECK(to_ill->ill_ipif,
	    (iocp->ioc_cmd == SIOCLIFFAILBACK))) {
		to_ill->ill_ipif->ipif_state_flags |=
		    IPIF_MOVING | IPIF_CHANGING;
		if ((ipif = ill_quiescent_to_move(to_ill)) != NULL) {
			(void) ipsq_pending_mp_add(NULL, ipif, q,
			    mp, ILL_MOVE_OK);
			RELEASE_ILL_LOCKS(from_ill, to_ill);
			return (EINPROGRESS);
		}
	}
	RELEASE_ILL_LOCKS(from_ill, to_ill);

	ASSERT(!MUTEX_HELD(&to_ill->ill_lock));
	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
	GRAB_ILL_LOCKS(from_ill, to_ill);
	err = ipif_move_all(from_ill, to_ill, q, mp, ifindex, &rep_ipif_ptr);

	/* ilm_move is done inside ipif_move for IPv4 */
	if (err == 0 && from_ill->ill_isv6)
		ilm_move_v6(from_ill, to_ill, ifindex);

	RELEASE_ILL_LOCKS(from_ill, to_ill);
	rw_exit(&ipst->ips_ill_g_lock);

	/*
	 * Send RTS messages and multicast messages.
	 */
	if (rep_ipif_ptr != NULL) {
		if (rep_ipif_ptr->ipif_recovery_id != 0) {
			(void) untimeout(rep_ipif_ptr->ipif_recovery_id);
			rep_ipif_ptr->ipif_recovery_id = 0;
		}
		ip_rts_ifmsg(rep_ipif_ptr);
		ip_rts_newaddrmsg(RTM_DELETE, 0, rep_ipif_ptr);
		IPIF_TRACE_CLEANUP(rep_ipif_ptr);
		mi_free(rep_ipif_ptr);
	}

	conn_move_ill(from_ill, to_ill, ifindex);

	return (err);
}

/*
 * Used to extract arguments for FAILOVER/FAILBACK ioctls.
 * Also checks for the validity of the arguments.
 * Note: We are already exclusive inside the from group.
 * It is up to the caller to release refcnt on the to_ill's.
 */
static int
ip_extract_move_args(queue_t *q, mblk_t *mp, ill_t **ill_from_v4,
    ill_t **ill_from_v6, ill_t **ill_to_v4, ill_t **ill_to_v6)
{
	int dst_index;
	ipif_t *ipif_v4, *ipif_v6;
	struct lifreq *lifr;
	mblk_t *mp1;
	boolean_t exists;
	sin_t *sin;
	int err = 0;
	ip_stack_t *ipst;

	if (CONN_Q(q))
		ipst = CONNQ_TO_IPST(q);
	else
		ipst = ILLQ_TO_IPST(q);


	if ((mp1 = mp->b_cont) == NULL)
		return (EPROTO);

	if ((mp1 = mp1->b_cont) == NULL)
		return (EPROTO);

	lifr = (struct lifreq *)mp1->b_rptr;
	sin = (sin_t *)&lifr->lifr_addr;

	/*
	 * We operate on both IPv4 and IPv6.
	 * Thus, we don't allow IPv4/IPv6
	 * specific operations.
	 */
	if (sin->sin_family != AF_UNSPEC)
		return (EINVAL);

	/*
	 * Get ipif with id 0. We are writer on the from ill. So we can pass
	 * NULLs for the last 4 args and we know the lookup won't fail
	 * with EINPROGRESS.
	 */
	ipif_v4 = ipif_lookup_on_name(lifr->lifr_name,
	    mi_strlen(lifr->lifr_name), B_FALSE, &exists, B_FALSE,
	    ALL_ZONES, NULL, NULL, NULL, NULL, ipst);
	ipif_v6 = ipif_lookup_on_name(lifr->lifr_name,
	    mi_strlen(lifr->lifr_name), B_FALSE, &exists, B_TRUE,
	    ALL_ZONES, NULL, NULL, NULL, NULL, ipst);

	if (ipif_v4 == NULL && ipif_v6 == NULL)
		return (ENXIO);

	if (ipif_v4 != NULL) {
		ASSERT(ipif_v4->ipif_refcnt != 0);
		if (ipif_v4->ipif_id != 0) {
			err = EINVAL;
			goto done;
		}

		ASSERT(IAM_WRITER_IPIF(ipif_v4));
		*ill_from_v4 = ipif_v4->ipif_ill;
	}

	if (ipif_v6 != NULL) {
		ASSERT(ipif_v6->ipif_refcnt != 0);
		if (ipif_v6->ipif_id != 0) {
			err = EINVAL;
			goto done;
		}

		ASSERT(IAM_WRITER_IPIF(ipif_v6));
		*ill_from_v6 = ipif_v6->ipif_ill;
	}

	err = 0;
	dst_index = lifr->lifr_movetoindex;
	*ill_to_v4 = ill_lookup_on_ifindex(dst_index, B_FALSE,
	    q, mp, ip_process_ioctl, &err, ipst);
	if (err != 0) {
		/*
		 * There could be only v6.
		 */
		if (err != ENXIO)
			goto done;
		err = 0;
	}

	*ill_to_v6 = ill_lookup_on_ifindex(dst_index, B_TRUE,
	    q, mp, ip_process_ioctl, &err, ipst);
	if (err != 0) {
		if (err != ENXIO)
			goto done;
		if (*ill_to_v4 == NULL) {
			err = ENXIO;
			goto done;
		}
		err = 0;
	}

	/*
	 * If we have something to MOVE i.e. "from" not NULL,
	 * "to" should be non-NULL.
	 */
	if ((*ill_from_v4 != NULL && *ill_to_v4 == NULL) ||
	    (*ill_from_v6 != NULL && *ill_to_v6 == NULL)) {
		err = EINVAL;
	}

done:
	if (ipif_v4 != NULL)
		ipif_refrele(ipif_v4);
	if (ipif_v6 != NULL)
		ipif_refrele(ipif_v6);
	return (err);
}

/*
 * FAILOVER and FAILBACK are modelled as MOVE operations.
 *
 * We don't check whether the MOVE is within the same group or
 * not, because this ioctl can be used as a generic mechanism
 * to failover from interface A to B, though things will function
 * only if they are really part of the same group. Moreover,
 * all ipifs may be down and hence temporarily out of the group.
 *
 * ipif's that need to be moved are first brought down; V4 ipifs are brought
 * down first and then V6. For each we wait for the ipif's to become quiescent.
 * Bringing down the ipifs ensures that all ires pointing to these ipifs's
 * have been deleted and there are no active references. Once quiescent the
 * ipif's are moved and brought up on the new ill.
 *
 * Normally the source ill and destination ill belong to the same IPMP group
 * and hence the same ipsq_t. In the event they don't belong to the same
 * group the two ipsq's are first merged into one ipsq - that of the
 * to_ill. The multicast memberships on the source and destination ill cannot
 * change during the move operation since multicast joins/leaves also have to
 * execute on the same ipsq and are hence serialized.
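 *
 * As an illustration (hypothetical userland usage, not part of this
 * file; the socket fd "s" is assumed opened elsewhere): a failover of
 * everything on hme0 to hme1 names the source interface, supplies the
 * destination ifindex in lifr_movetoindex, and leaves the address
 * family unspecified, as ip_extract_move_args() requires:
 *
 *	struct lifreq lifr;
 *
 *	bzero(&lifr, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "hme0", sizeof (lifr.lifr_name));
 *	lifr.lifr_movetoindex = if_nametoindex("hme1");
 *	lifr.lifr_addr.ss_family = AF_UNSPEC;
 *	if (ioctl(s, SIOCLIFFAILOVER, (caddr_t)&lifr) < 0)
 *		err(1, "SIOCLIFFAILOVER");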
 */
/* ARGSUSED */
int
ip_sioctl_move(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
	ill_t *ill_to_v4 = NULL;
	ill_t *ill_to_v6 = NULL;
	ill_t *ill_from_v4 = NULL;
	ill_t *ill_from_v6 = NULL;
	int err = 0;

	/*
	 * Set up the from and to ill's; we can get EINPROGRESS only for
	 * the to_ill's.
	 */
	err = ip_extract_move_args(q, mp, &ill_from_v4, &ill_from_v6,
	    &ill_to_v4, &ill_to_v6);

	if (err != 0) {
		ip0dbg(("ip_sioctl_move: extract args failed\n"));
		goto done;
	}

	/*
	 * Nothing to do.
	 */
	if ((ill_from_v4 != NULL) && (ill_from_v4 == ill_to_v4)) {
		goto done;
	}

	/*
	 * Nothing to do.
	 */
	if ((ill_from_v6 != NULL) && (ill_from_v6 == ill_to_v6)) {
		goto done;
	}

	/*
	 * Mark the ill as changing.
	 * The ILL_CHANGING flag is cleared when the ipif's are brought up
	 * in ill_up_ipifs; in case of error it is cleared below.
	 */

	GRAB_ILL_LOCKS(ill_from_v4, ill_from_v6);
	if (ill_from_v4 != NULL)
		ill_from_v4->ill_state_flags |= ILL_CHANGING;
	if (ill_from_v6 != NULL)
		ill_from_v6->ill_state_flags |= ILL_CHANGING;
	RELEASE_ILL_LOCKS(ill_from_v4, ill_from_v6);

	/*
	 * Make sure that both src and dst are
	 * in the same syncq group. If not, make it happen.
	 * We are not holding any locks because we are the writer
	 * on the from_ipsq and we will hold locks in ill_merge_groups
	 * to protect to_ipsq against changing.
	 */
	if (ill_from_v4 != NULL) {
		if (ill_from_v4->ill_phyint->phyint_ipsq !=
		    ill_to_v4->ill_phyint->phyint_ipsq) {
			err = ill_merge_groups(ill_from_v4, ill_to_v4,
			    NULL, mp, q);
			goto err_ret;

		}
		ASSERT(!MUTEX_HELD(&ill_to_v4->ill_lock));
	} else {

		if (ill_from_v6->ill_phyint->phyint_ipsq !=
		    ill_to_v6->ill_phyint->phyint_ipsq) {
			err = ill_merge_groups(ill_from_v6, ill_to_v6,
			    NULL, mp, q);
			goto err_ret;

		}
		ASSERT(!MUTEX_HELD(&ill_to_v6->ill_lock));
	}

	/*
	 * Now that the ipsq's have been merged and we are the writer,
	 * let's mark to_ill as changing as well.
	 */

	GRAB_ILL_LOCKS(ill_to_v4, ill_to_v6);
	if (ill_to_v4 != NULL)
		ill_to_v4->ill_state_flags |= ILL_CHANGING;
	if (ill_to_v6 != NULL)
		ill_to_v6->ill_state_flags |= ILL_CHANGING;
	RELEASE_ILL_LOCKS(ill_to_v4, ill_to_v6);

	/*
	 * It's ok for us to proceed with the move even if
	 * ill_pending_mp is non null on one of the from ill's as the reply
	 * should not be looking at the ipif, it should only care about the
	 * ill itself.
	 */

	/*
	 * Let's move ipv4 first.
	 */
	if (ill_from_v4 != NULL) {
		ASSERT(IAM_WRITER_ILL(ill_to_v4));
		ill_from_v4->ill_move_in_progress = B_TRUE;
		ill_to_v4->ill_move_in_progress = B_TRUE;
		ill_to_v4->ill_move_peer = ill_from_v4;
		ill_from_v4->ill_move_peer = ill_to_v4;
		err = ill_move(ill_from_v4, ill_to_v4, q, mp);
	}

	/*
	 * Now let's move ipv6.
	 */
	if (err == 0 && ill_from_v6 != NULL) {
		ASSERT(IAM_WRITER_ILL(ill_to_v6));
		ill_from_v6->ill_move_in_progress = B_TRUE;
		ill_to_v6->ill_move_in_progress = B_TRUE;
		ill_to_v6->ill_move_peer = ill_from_v6;
		ill_from_v6->ill_move_peer = ill_to_v6;
		err = ill_move(ill_from_v6, ill_to_v6, q, mp);
	}

err_ret:
	/*
	 * EINPROGRESS means we are waiting for the ipif's that need to be
	 * moved to become quiescent.
	 */
	if (err == EINPROGRESS) {
		goto done;
	}

	/*
	 * If err is set, ill_up_ipifs will not be called,
	 * so clear the flags here.
	 */

	GRAB_ILL_LOCKS(ill_to_v4, ill_to_v6);
	GRAB_ILL_LOCKS(ill_from_v4, ill_from_v6);
	/*
	 * Some of the clearing may be redundant. But it is simpler
	 * not to make any extra checks.
	 */
	if (ill_from_v6 != NULL) {
		ill_from_v6->ill_move_in_progress = B_FALSE;
		ill_from_v6->ill_move_peer = NULL;
		ill_from_v6->ill_state_flags &= ~ILL_CHANGING;
	}
	if (ill_from_v4 != NULL) {
		ill_from_v4->ill_move_in_progress = B_FALSE;
		ill_from_v4->ill_move_peer = NULL;
		ill_from_v4->ill_state_flags &= ~ILL_CHANGING;
	}
	if (ill_to_v6 != NULL) {
		ill_to_v6->ill_move_in_progress = B_FALSE;
		ill_to_v6->ill_move_peer = NULL;
		ill_to_v6->ill_state_flags &= ~ILL_CHANGING;
	}
	if (ill_to_v4 != NULL) {
		ill_to_v4->ill_move_in_progress = B_FALSE;
		ill_to_v4->ill_move_peer = NULL;
		ill_to_v4->ill_state_flags &= ~ILL_CHANGING;
	}

	/*
	 * Check for setting INACTIVE, if STANDBY is set and FAILED is not set.
	 * Do this always to maintain proper state i.e. even in case of errors.
	 * As phyint_inactive looks at both v4 and v6 interfaces,
	 * we need not call it on both v4 and v6 interfaces.
	 */
	if (ill_from_v4 != NULL) {
		if ((ill_from_v4->ill_phyint->phyint_flags &
		    (PHYI_STANDBY | PHYI_FAILED)) == PHYI_STANDBY) {
			phyint_inactive(ill_from_v4->ill_phyint);
		}
	} else if (ill_from_v6 != NULL) {
		if ((ill_from_v6->ill_phyint->phyint_flags &
		    (PHYI_STANDBY | PHYI_FAILED)) == PHYI_STANDBY) {
			phyint_inactive(ill_from_v6->ill_phyint);
		}
	}

	if (ill_to_v4 != NULL) {
		if (ill_to_v4->ill_phyint->phyint_flags & PHYI_INACTIVE) {
			ill_to_v4->ill_phyint->phyint_flags &= ~PHYI_INACTIVE;
		}
	} else if (ill_to_v6 != NULL) {
		if (ill_to_v6->ill_phyint->phyint_flags & PHYI_INACTIVE) {
			ill_to_v6->ill_phyint->phyint_flags &= ~PHYI_INACTIVE;
		}
	}

	RELEASE_ILL_LOCKS(ill_to_v4, ill_to_v6);
	RELEASE_ILL_LOCKS(ill_from_v4, ill_from_v6);

no_err:
	/*
	 * Let's bring the interfaces up on the to_ill.
	 */
	if (err == 0) {
		err = ill_up_ipifs(ill_to_v4 == NULL ?
		    ill_to_v6 : ill_to_v4,
		    q, mp);
	}

	if (err == 0) {
		if (ill_from_v4 != NULL && ill_to_v4 != NULL)
			ilm_send_multicast_reqs(ill_from_v4, ill_to_v4);

		if (ill_from_v6 != NULL && ill_to_v6 != NULL)
			ilm_send_multicast_reqs(ill_from_v6, ill_to_v6);
	}
done:

	if (ill_to_v4 != NULL) {
		ill_refrele(ill_to_v4);
	}
	if (ill_to_v6 != NULL) {
		ill_refrele(ill_to_v6);
	}

	return (err);
}

static void
ill_dl_down(ill_t *ill)
{
	/*
	 * The ill is down; unbind but stay attached since we're still
	 * associated with a PPA. If we have negotiated DLPI capabilities
	 * with the data link service provider (IDS_OK) then reset them.
	 * The interval between unbinding and rebinding is potentially
	 * unbounded hence we cannot assume things will be the same.
	 * The DLPI capabilities will be probed again when the data link
	 * is brought up.
	 */
	mblk_t *mp = ill->ill_unbind_mp;
	hook_nic_event_t *info;

	ip1dbg(("ill_dl_down(%s)\n", ill->ill_name));

	ill->ill_unbind_mp = NULL;
	if (mp != NULL) {
		ip1dbg(("ill_dl_down: %s (%u) for %s\n",
		    dlpi_prim_str(*(int *)mp->b_rptr), *(int *)mp->b_rptr,
		    ill->ill_name));
		mutex_enter(&ill->ill_lock);
		ill->ill_state_flags |= ILL_DL_UNBIND_IN_PROGRESS;
		mutex_exit(&ill->ill_lock);
		if (ill->ill_dlpi_capab_state == IDS_OK)
			ill_capability_reset(ill);
		ill_dlpi_send(ill, mp);
	}

	/*
	 * Toss all of our multicast memberships. We could keep them, but
	 * then we'd have to do bookkeeping of any joins and leaves performed
	 * by the application while the interface is down (we can't just
	 * issue them because arp cannot currently process AR_ENTRY_SQUERY's
	 * on a downed interface).
	 */
	ill_leave_multicast(ill);

	mutex_enter(&ill->ill_lock);

	ill->ill_dl_up = 0;

	if ((info = ill->ill_nic_event_info) != NULL) {
		ip2dbg(("ill_dl_down: unexpected nic event %d attached for "
		    "%s\n", info->hne_event, ill->ill_name));
		if (info->hne_data != NULL)
			kmem_free(info->hne_data, info->hne_datalen);
		kmem_free(info, sizeof (hook_nic_event_t));
	}

	info = kmem_alloc(sizeof (hook_nic_event_t), KM_NOSLEEP);
	if (info != NULL) {
		ip_stack_t *ipst = ill->ill_ipst;

		info->hne_nic = ill->ill_phyint->phyint_hook_ifindex;
		info->hne_lif = 0;
		info->hne_event = NE_DOWN;
		info->hne_data = NULL;
		info->hne_datalen = 0;
		info->hne_family = ill->ill_isv6 ?
18584 ipst->ips_ipv6_net_data : ipst->ips_ipv4_net_data; 18585 } else 18586 ip2dbg(("ill_dl_down: could not attach DOWN nic event " 18587 "information for %s (ENOMEM)\n", ill->ill_name)); 18588 18589 ill->ill_nic_event_info = info; 18590 18591 mutex_exit(&ill->ill_lock); 18592 } 18593 18594 static void 18595 ill_dlpi_dispatch(ill_t *ill, mblk_t *mp) 18596 { 18597 union DL_primitives *dlp; 18598 t_uscalar_t prim; 18599 18600 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); 18601 18602 dlp = (union DL_primitives *)mp->b_rptr; 18603 prim = dlp->dl_primitive; 18604 18605 ip1dbg(("ill_dlpi_dispatch: sending %s (%u) to %s\n", 18606 dlpi_prim_str(prim), prim, ill->ill_name)); 18607 18608 switch (prim) { 18609 case DL_PHYS_ADDR_REQ: 18610 { 18611 dl_phys_addr_req_t *dlpap = (dl_phys_addr_req_t *)mp->b_rptr; 18612 ill->ill_phys_addr_pend = dlpap->dl_addr_type; 18613 break; 18614 } 18615 case DL_BIND_REQ: 18616 mutex_enter(&ill->ill_lock); 18617 ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS; 18618 mutex_exit(&ill->ill_lock); 18619 break; 18620 } 18621 18622 /* 18623 * Except for the ACKs for the M_PCPROTO messages, all other ACKs 18624 * are dropped by ip_rput() if ILL_CONDEMNED is set. Therefore 18625 * we only wait for the ACK of the DL_UNBIND_REQ. 18626 */ 18627 mutex_enter(&ill->ill_lock); 18628 if (!(ill->ill_state_flags & ILL_CONDEMNED) || 18629 (prim == DL_UNBIND_REQ)) { 18630 ill->ill_dlpi_pending = prim; 18631 } 18632 mutex_exit(&ill->ill_lock); 18633 18634 putnext(ill->ill_wq, mp); 18635 } 18636 18637 /* 18638 * Helper function for ill_dlpi_send(). 18639 */ 18640 /* ARGSUSED */ 18641 static void 18642 ill_dlpi_send_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) 18643 { 18644 ill_dlpi_send((ill_t *)q->q_ptr, mp); 18645 } 18646 18647 /* 18648 * Send a DLPI control message to the driver but make sure there 18649 * is only one outstanding message. Uses ill_dlpi_pending to tell 18650 * when it must queue. ip_rput_dlpi_writer calls ill_dlpi_done() 18651 * when an ACK or a NAK is received to process the next queued message. 18652 */ 18653 void 18654 ill_dlpi_send(ill_t *ill, mblk_t *mp) 18655 { 18656 mblk_t **mpp; 18657 18658 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); 18659 18660 /* 18661 * To ensure that any DLPI requests for current exclusive operation 18662 * are always completely sent before any DLPI messages for other 18663 * operations, require writer access before enqueuing. 18664 */ 18665 if (!IAM_WRITER_ILL(ill)) { 18666 ill_refhold(ill); 18667 /* qwriter_ip() does the ill_refrele() */ 18668 qwriter_ip(ill, ill->ill_wq, mp, ill_dlpi_send_writer, 18669 NEW_OP, B_TRUE); 18670 return; 18671 } 18672 18673 mutex_enter(&ill->ill_lock); 18674 if (ill->ill_dlpi_pending != DL_PRIM_INVAL) { 18675 /* Must queue message. Tail insertion */ 18676 mpp = &ill->ill_dlpi_deferred; 18677 while (*mpp != NULL) 18678 mpp = &((*mpp)->b_next); 18679 18680 ip1dbg(("ill_dlpi_send: deferring request for %s\n", 18681 ill->ill_name)); 18682 18683 *mpp = mp; 18684 mutex_exit(&ill->ill_lock); 18685 return; 18686 } 18687 mutex_exit(&ill->ill_lock); 18688 ill_dlpi_dispatch(ill, mp); 18689 } 18690 18691 /* 18692 * Send all deferred DLPI messages without waiting for their ACKs. 18693 */ 18694 void 18695 ill_dlpi_send_deferred(ill_t *ill) 18696 { 18697 mblk_t *mp, *nextmp; 18698 18699 /* 18700 * Clear ill_dlpi_pending so that the message is not queued in 18701 * ill_dlpi_send(). 
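 *
 * An illustrative sketch (not verbatim code): the single-outstanding-
 * message discipline that ill_dlpi_send() enforces amounts to
 *
 *	mutex_enter(&ill->ill_lock);
 *	if (ill->ill_dlpi_pending != DL_PRIM_INVAL) {
 *		append mp to ill->ill_dlpi_deferred;	wait for the ACK
 *		mutex_exit(&ill->ill_lock);
 *		return;
 *	}
 *	mutex_exit(&ill->ill_lock);
 *	ill_dlpi_dispatch(ill, mp);	records ill_dlpi_pending
 *
 * Resetting ill_dlpi_pending to DL_PRIM_INVAL up front therefore lets
 * every deferred message below bypass that check and go straight down.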
18702 */ 18703 mutex_enter(&ill->ill_lock); 18704 ill->ill_dlpi_pending = DL_PRIM_INVAL; 18705 mp = ill->ill_dlpi_deferred; 18706 ill->ill_dlpi_deferred = NULL; 18707 mutex_exit(&ill->ill_lock); 18708 18709 for (; mp != NULL; mp = nextmp) { 18710 nextmp = mp->b_next; 18711 mp->b_next = NULL; 18712 ill_dlpi_send(ill, mp); 18713 } 18714 } 18715 18716 /* 18717 * Check if the DLPI primitive `prim' is pending; print a warning if not. 18718 */ 18719 boolean_t 18720 ill_dlpi_pending(ill_t *ill, t_uscalar_t prim) 18721 { 18722 t_uscalar_t prim_pending; 18723 18724 mutex_enter(&ill->ill_lock); 18725 prim_pending = ill->ill_dlpi_pending; 18726 mutex_exit(&ill->ill_lock); 18727 18728 /* 18729 * During teardown, ill_dlpi_send_deferred() will send requests 18730 * without waiting; don't bother printing any warnings in that case. 18731 */ 18732 if (!(ill->ill_state_flags & ILL_CONDEMNED) && prim_pending != prim) { 18733 if (prim_pending == DL_PRIM_INVAL) { 18734 (void) mi_strlog(ill->ill_rq, 1, 18735 SL_CONSOLE|SL_ERROR|SL_TRACE, "ip: received " 18736 "unsolicited ack for %s on %s\n", 18737 dlpi_prim_str(prim), ill->ill_name); 18738 } else { 18739 (void) mi_strlog(ill->ill_rq, 1, 18740 SL_CONSOLE|SL_ERROR|SL_TRACE, "ip: received " 18741 "unexpected ack for %s on %s (expecting %s)\n", 18742 dlpi_prim_str(prim), ill->ill_name, 18743 dlpi_prim_str(prim_pending)); 18744 } 18745 } 18746 return (prim_pending == prim); 18747 } 18748 18749 /* 18750 * Called when a DLPI control message has been acked or nacked, to 18751 * send down the next queued message (if any). 18752 */ 18753 void 18754 ill_dlpi_done(ill_t *ill, t_uscalar_t prim) 18755 { 18756 mblk_t *mp; 18757 18758 ASSERT(IAM_WRITER_ILL(ill)); 18759 mutex_enter(&ill->ill_lock); 18760 18761 ASSERT(prim != DL_PRIM_INVAL); 18762 ASSERT(ill->ill_dlpi_pending == prim); 18763 18764 ip1dbg(("ill_dlpi_done: %s has completed %s (%u)\n", ill->ill_name, 18765 dlpi_prim_str(ill->ill_dlpi_pending), ill->ill_dlpi_pending)); 18766 18767 if ((mp = ill->ill_dlpi_deferred) == NULL) { 18768 ill->ill_dlpi_pending = DL_PRIM_INVAL; 18769 cv_signal(&ill->ill_cv); 18770 mutex_exit(&ill->ill_lock); 18771 return; 18772 } 18773 18774 ill->ill_dlpi_deferred = mp->b_next; 18775 mp->b_next = NULL; 18776 mutex_exit(&ill->ill_lock); 18777 18778 ill_dlpi_dispatch(ill, mp); 18779 } 18780 18781 void 18782 conn_delete_ire(conn_t *connp, caddr_t arg) 18783 { 18784 ipif_t *ipif = (ipif_t *)arg; 18785 ire_t *ire; 18786 18787 /* 18788 * Look at the cached ires on conns which have pointers to ipifs. 18789 * We just call ire_refrele which clears up the reference 18790 * to ire. Called when a conn closes. Also called from ipif_free 18791 * to cleanup indirect references to the stale ipif via the cached ire. 18792 */ 18793 mutex_enter(&connp->conn_lock); 18794 ire = connp->conn_ire_cache; 18795 if (ire != NULL && (ipif == NULL || ire->ire_ipif == ipif)) { 18796 connp->conn_ire_cache = NULL; 18797 mutex_exit(&connp->conn_lock); 18798 IRE_REFRELE_NOTR(ire); 18799 return; 18800 } 18801 mutex_exit(&connp->conn_lock); 18802 18803 } 18804 18805 /* 18806 * Some operations (illgrp_delete(), ipif_down()) conditionally delete a number 18807 * of IREs. Those IREs may have been previously cached in the conn structure. 18808 * This ipcl_walk() walker function releases all references to such IREs based 18809 * on the condemned flag.
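 *
 * For instance, ipif_down() below runs this walker over every conn in
 * the stack with
 *
 *	ipcl_walk(conn_cleanup_stale_ire, NULL, ipst);
 *
 * The arg is unused here; contrast conn_delete_ire() above, which takes
 * an ipif as its arg so that only IREs tied to that ipif are released.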
18810 */ 18811 /* ARGSUSED */ 18812 void 18813 conn_cleanup_stale_ire(conn_t *connp, caddr_t arg) 18814 { 18815 ire_t *ire; 18816 18817 mutex_enter(&connp->conn_lock); 18818 ire = connp->conn_ire_cache; 18819 if (ire != NULL && (ire->ire_marks & IRE_MARK_CONDEMNED)) { 18820 connp->conn_ire_cache = NULL; 18821 mutex_exit(&connp->conn_lock); 18822 IRE_REFRELE_NOTR(ire); 18823 return; 18824 } 18825 mutex_exit(&connp->conn_lock); 18826 } 18827 18828 /* 18829 * Take down a specific interface, but don't lose any information about it. 18830 * Also delete interface from its interface group (ifgrp). 18831 * (Always called as writer.) 18832 * This function goes through the down sequence even if the interface is 18833 * already down. There are 2 reasons. 18834 * a. Currently we permit interface routes that depend on down interfaces 18835 * to be added. This behaviour itself is questionable. However it appears 18836 * that both Solaris and 4.3 BSD have exhibited this behaviour for a long 18837 * time. We go through the cleanup in order to remove these routes. 18838 * b. The bringup of the interface could fail in ill_dl_up, i.e., we get 18839 * DL_ERROR_ACK in response to the DL_BIND request. The interface is 18840 * down, but we need to cleanup, i.e., do ill_dl_down and 18841 * ip_rput_dlpi_writer (DL_ERROR_ACK) -> ipif_down. 18842 * 18843 * IP-MT notes: 18844 * 18845 * Model of reference to interfaces. 18846 * 18847 * The following members in ipif_t track references to the ipif. 18848 * int ipif_refcnt; Active reference count 18849 * uint_t ipif_ire_cnt; Number of ire's referencing this ipif 18850 * The following members in ill_t track references to the ill. 18851 * int ill_refcnt; active refcnt 18852 * uint_t ill_ire_cnt; Number of ires referencing ill 18853 * uint_t ill_nce_cnt; Number of nces referencing ill 18854 * 18855 * Reference to an ipif or ill can be obtained in any of the following ways. 18856 * 18857 * Through the ipif_lookup_* / ill_lookup_* lookup functions 18858 * Pointers to ipif / ill from other data structures, viz. ire and conn. 18859 * Implicit reference to the ipif / ill by holding a reference to the ire. 18860 * 18861 * The ipif/ill lookup functions return a ref-held ipif / ill. 18862 * ipif_refcnt and ill_refcnt track the reference counts respectively. 18863 * This is a purely dynamic reference count associated with threads holding 18864 * references to the ipif / ill. Pointers from other structures do not 18865 * count towards this reference count. 18866 * 18867 * ipif_ire_cnt/ill_ire_cnt is the number of ire's associated with the 18868 * ipif/ill. This is incremented whenever a new ire is created referencing the 18869 * ipif/ill. This is done atomically inside ire_add_v[46] where the ire is 18870 * actually added to the ire hash table. The count is decremented in 18871 * ire_inactive where the ire is destroyed. 18872 * 18873 * nce's reference ill's through nce_ill, and the count of nce's associated 18874 * with an ill is recorded in ill_nce_cnt. This is incremented atomically in 18875 * ndp_add() where the nce is actually added to the table. Similarly it is 18876 * decremented in ndp_inactive where the nce is destroyed. 18877 * 18878 * Flow of ioctls involving interface down/up 18879 * 18880 * The following is the sequence of an attempt to set some critical flags on an 18881 * up interface.
* ip_sioctl_flags 18883 * ipif_down 18884 * wait for ipif to be quiescent 18885 * ipif_down_tail 18886 * ip_sioctl_flags_tail 18887 * 18888 * All set ioctls that involve down/up sequence would have a skeleton similar 18889 * to the above. All the *tail functions are called after the refcounts have 18890 * dropped to the appropriate values. 18891 * 18892 * The mechanism to quiesce an ipif is as follows. 18893 * 18894 * Mark the ipif as IPIF_CHANGING. No more lookups will be allowed 18895 * on the ipif. Callers either pass a flag requesting wait, or the lookup 18896 * functions will return NULL. 18897 * 18898 * Delete all ires referencing this ipif 18899 * 18900 * Any thread attempting to do an ipif_refhold on an ipif that has been 18901 * obtained through a cached pointer will first make sure that 18902 * the ipif can be refheld using the macro IPIF_CAN_LOOKUP and only then 18903 * increment the refcount. 18904 * 18905 * The above guarantees that the ipif refcount will eventually come down to 18906 * zero and the ipif will quiesce, once all threads that currently hold a 18907 * reference to the ipif refrelease the ipif. The ipif is quiescent after the 18908 * ipif_refcount has dropped to zero and all ire's associated with this ipif 18909 * have also been ire_inactive'd, i.e., when ipif_ire_cnt and ipif_refcnt both 18910 * drop to zero. 18911 * 18912 * Lookups during the IPIF_CHANGING/ILL_CHANGING interval. 18913 * 18914 * Threads trying to lookup an ipif or ill can pass a flag requesting 18915 * wait and restart if the ipif / ill cannot be looked up currently. 18916 * For example, bind and route operations (e.g., route add / delete) cannot 18917 * return failure if the ipif is currently undergoing an exclusive operation, 18918 * and hence pass the flag. The mblk is then enqueued in the ipsq and the 18919 * operation is restarted by ipsq_exit() when the currently exclusive ioctl 18920 * completes. The lookup and enqueue is atomic using the ill_lock and 18921 * ipsq_lock. The lookup is done holding the ill_lock. Hence the ill/ipif 18922 * state flags can't change while the ill_lock is held. Before dropping the 18923 * ill_lock we acquire the ipsq_lock and call ipsq_enq. This ensures that 18924 * ipsq_exit can't finish until we release the ipsq_lock, even though the 18925 * ill/ipif state flags can change after we drop the ill_lock. 18926 * 18927 * An attempt to send out a packet using an ipif that is currently 18928 * IPIF_CHANGING will fail. No attempt is made in this case to enqueue this 18929 * operation and restart it later when the exclusive condition on the ipif ends. 18930 * This is an example of not passing the wait flag to the lookup functions. For 18931 * example, an attempt to refhold and use conn->conn_multicast_ipif and send 18932 * out a multicast packet on that ipif will fail while the ipif is 18933 * IPIF_CHANGING. An attempt to create an IRE_CACHE using an ipif that is 18934 * currently IPIF_CHANGING will also fail.
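 *
 * As a sketch, the lookup-or-bail pattern described above is
 *
 *	mutex_enter(&ill->ill_lock);
 *	if (IPIF_CAN_LOOKUP(ipif)) {
 *		ipif_refhold_locked(ipif);
 *		mutex_exit(&ill->ill_lock);
 *		use the ipif, then ipif_refrele() it
 *	} else {
 *		mutex_exit(&ill->ill_lock);
 *		the ipif is changing or condemned; bail
 *	}
 *
 * ipif_lookup_on_name() below follows exactly this shape, with the
 * additional IPIF_CAN_WAIT() alternative of parking the operation on
 * the ipsq so that it is restarted when the exclusive operation ends.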
18935 */ 18936 int 18937 ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) 18938 { 18939 ill_t *ill = ipif->ipif_ill; 18940 phyint_t *phyi; 18941 conn_t *connp; 18942 boolean_t success; 18943 boolean_t ipif_was_up = B_FALSE; 18944 ip_stack_t *ipst = ill->ill_ipst; 18945 18946 ASSERT(IAM_WRITER_IPIF(ipif)); 18947 18948 ip1dbg(("ipif_down(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 18949 18950 if (ipif->ipif_flags & IPIF_UP) { 18951 mutex_enter(&ill->ill_lock); 18952 ipif->ipif_flags &= ~IPIF_UP; 18953 ASSERT(ill->ill_ipif_up_count > 0); 18954 --ill->ill_ipif_up_count; 18955 mutex_exit(&ill->ill_lock); 18956 ipif_was_up = B_TRUE; 18957 /* Update status in SCTP's list */ 18958 sctp_update_ipif(ipif, SCTP_IPIF_DOWN); 18959 } 18960 18961 /* 18962 * Blow away v6 memberships we established in ipif_multicast_up(); the 18963 * v4 ones are left alone (as is the ipif_multicast_up flag, so we 18964 * know not to rejoin when the interface is brought back up). 18965 */ 18966 if (ipif->ipif_isv6) 18967 ipif_multicast_down(ipif); 18968 /* 18969 * Remove from the mapping for __sin6_src_id. We insert only 18970 * when the address is not INADDR_ANY. As IPv4 addresses are 18971 * stored as mapped addresses, we need to check for mapped 18972 * INADDR_ANY also. 18973 */ 18974 if (ipif_was_up && !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) && 18975 !IN6_IS_ADDR_V4MAPPED_ANY(&ipif->ipif_v6lcl_addr) && 18976 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 18977 int err; 18978 18979 err = ip_srcid_remove(&ipif->ipif_v6lcl_addr, 18980 ipif->ipif_zoneid, ipst); 18981 if (err != 0) { 18982 ip0dbg(("ipif_down: srcid_remove %d\n", err)); 18983 } 18984 } 18985 18986 /* 18987 * Before we delete the ill from the group (if any), we need 18988 * to make sure that we delete all the routes dependent on 18989 * this and also any ipifs dependent on this ipif for 18990 * source address. We need to do this before we delete from 18991 * the group because 18992 * 18993 * 1) ipif_down_delete_ire de-references ill->ill_group. 18994 * 18995 * 2) ipif_update_other_ipifs needs to walk the whole group 18996 * for re-doing source address selection. Note that 18997 * ipif_select_source[_v6] called from 18998 * ipif_update_other_ipifs[_v6] will not pick this ipif 18999 * because we have already marked it down here, i.e., cleared 19000 * IPIF_UP. 19001 */ 19002 if (ipif->ipif_isv6) { 19003 ire_walk_v6(ipif_down_delete_ire, (char *)ipif, ALL_ZONES, 19004 ipst); 19005 } else { 19006 ire_walk_v4(ipif_down_delete_ire, (char *)ipif, ALL_ZONES, 19007 ipst); 19008 } 19009 19010 /* 19011 * These also need to be saved and restored when the 19012 * ipif is brought down and up. 19013 */ 19014 mutex_enter(&ipst->ips_ire_mrtun_lock); 19015 if (ipst->ips_ire_mrtun_count != 0) { 19016 mutex_exit(&ipst->ips_ire_mrtun_lock); 19017 ire_walk_ill_mrtun(0, 0, ipif_down_delete_ire, 19018 (char *)ipif, NULL, ipst); 19019 } else { 19020 mutex_exit(&ipst->ips_ire_mrtun_lock); 19021 } 19022 19023 mutex_enter(&ipst->ips_ire_srcif_table_lock); 19024 if (ipst->ips_ire_srcif_table_count > 0) { 19025 mutex_exit(&ipst->ips_ire_srcif_table_lock); 19026 ire_walk_srcif_table_v4(ipif_down_delete_ire, (char *)ipif, 19027 ipst); 19028 } else { 19029 mutex_exit(&ipst->ips_ire_srcif_table_lock); 19030 } 19031 19032 /* 19033 * Cleaning up the conn_ire_cache or conns must be done only after the 19034 * ires have been deleted above. Otherwise a thread could end up 19035 * caching an ire in a conn after we have finished the cleanup of the 19036 * conn.
The caching is done after making sure that the ire is not yet 19037 * condemned. Also documented in the block comment above ip_output. 19038 */ 19039 ipcl_walk(conn_cleanup_stale_ire, NULL, ipst); 19040 /* Also, delete the ires cached in SCTP */ 19041 sctp_ire_cache_flush(ipif); 19042 19043 /* Resolve any IPsec/IKE NAT-T instances that depend on this ipif. */ 19044 nattymod_clean_ipif(ipif); 19045 19046 /* 19047 * Update any other ipifs which have used "our" local address as 19048 * a source address. This entails removing and recreating IRE_INTERFACE 19049 * entries for such ipifs. 19050 */ 19051 if (ipif->ipif_isv6) 19052 ipif_update_other_ipifs_v6(ipif, ill->ill_group); 19053 else 19054 ipif_update_other_ipifs(ipif, ill->ill_group); 19055 19056 if (ipif_was_up) { 19057 /* 19058 * Check whether it is the last ipif to leave this group. 19059 * If this is the last ipif to leave, we should remove 19060 * this ill from the group as ipif_select_source will not 19061 * be able to find any useful ipifs if this ill is selected 19062 * for load balancing. 19063 * 19064 * For nameless groups, we should call illgrp_delete if this 19065 * belongs to some group. As this ipif is going down, we may 19066 * need to reconstruct groups. 19067 */ 19068 phyi = ill->ill_phyint; 19069 /* 19070 * If the phyint_groupname_len is 0, it may or may not 19071 * be in the nameless group. If the phyint_groupname_len is 19072 * not 0, then this ill should be part of some group. 19073 * As we always insert this ill in the group if 19074 * phyint_groupname_len is not zero when the first ipif 19075 * comes up (in ipif_up_done), it should be in a group 19076 * when the namelen is not 0. 19077 * 19078 * NOTE: When we delete the ill from the group, it will 19079 * blow away all the IRE_CACHES pointing either at this ipif or 19080 * ill_wq (illgrp_cache_delete does this). Thus, no IREs 19081 * should be pointing at this ill. 19082 */ 19083 ASSERT(phyi->phyint_groupname_len == 0 || 19084 (phyi->phyint_groupname != NULL && ill->ill_group != NULL)); 19085 19086 if (phyi->phyint_groupname_len != 0) { 19087 if (ill->ill_ipif_up_count == 0) 19088 illgrp_delete(ill); 19089 } 19090 19091 /* 19092 * If we have deleted some of the broadcast ires associated 19093 * with this ipif, we need to re-nominate somebody else if 19094 * the ires that we deleted were the nominated ones. 19095 */ 19096 if (ill->ill_group != NULL && !ill->ill_isv6) 19097 ipif_renominate_bcast(ipif); 19098 } 19099 19100 /* 19101 * Take down any neighbor-discovery or ARP entries for this interface. 19102 */ 19103 ipif_ndp_down(ipif); 19104 19105 /* 19106 * If mp is NULL the caller will wait for the appropriate refcnt. 19107 * E.g. ip_sioctl_removeif -> ipif_free -> ipif_down 19108 * and ill_delete -> ipif_free -> ipif_down 19109 */ 19110 if (mp == NULL) { 19111 ASSERT(q == NULL); 19112 return (0); 19113 } 19114 19115 if (CONN_Q(q)) { 19116 connp = Q_TO_CONN(q); 19117 mutex_enter(&connp->conn_lock); 19118 } else { 19119 connp = NULL; 19120 } 19121 mutex_enter(&ill->ill_lock); 19122 /* 19123 * Are there any ire's pointing to this ipif that are still active? 19124 * If this is the last ipif going down, are there any ire's pointing 19125 * to this ill that are still active?
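 * Per the IP-MT notes above, "quiescent" means that ipif_refcnt and
 * ipif_ire_cnt have both dropped to zero (and, when this is the last
 * ipif, that the ill's ire references have drained as well); that is
 * essentially the check ipif_is_quiescent() makes below under ill_lock.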
19126 */ 19127 if (ipif_is_quiescent(ipif)) { 19128 mutex_exit(&ill->ill_lock); 19129 if (connp != NULL) 19130 mutex_exit(&connp->conn_lock); 19131 return (0); 19132 } 19133 19134 ip1dbg(("ipif_down: need to wait, adding pending mp %s ill %p", 19135 ill->ill_name, (void *)ill)); 19136 /* 19137 * Enqueue the mp atomically in ipsq_pending_mp. When the refcount 19138 * drops down, the operation will be restarted by ipif_ill_refrele_tail 19139 * which in turn is called by the last refrele on the ipif/ill/ire. 19140 */ 19141 success = ipsq_pending_mp_add(connp, ipif, q, mp, IPIF_DOWN); 19142 if (!success) { 19143 /* The conn is closing. So just return */ 19144 ASSERT(connp != NULL); 19145 mutex_exit(&ill->ill_lock); 19146 mutex_exit(&connp->conn_lock); 19147 return (EINTR); 19148 } 19149 19150 mutex_exit(&ill->ill_lock); 19151 if (connp != NULL) 19152 mutex_exit(&connp->conn_lock); 19153 return (EINPROGRESS); 19154 } 19155 19156 void 19157 ipif_down_tail(ipif_t *ipif) 19158 { 19159 ill_t *ill = ipif->ipif_ill; 19160 19161 /* 19162 * Skip any loopback interface (null wq). 19163 * If this is the last logical interface on the ill, 19164 * have ill_dl_down tell the driver we are gone (unbind). 19165 * Note that lun 0 can ipif_down even though 19166 * there are other logical units that are up. 19167 * This occurs e.g. when we change a "significant" IFF_ flag. 19168 */ 19169 if (ill->ill_wq != NULL && !ill->ill_logical_down && 19170 ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 && 19171 ill->ill_dl_up) { 19172 ill_dl_down(ill); 19173 } 19174 ill->ill_logical_down = 0; 19175 19176 /* 19177 * This has to be done after removing the routes in ipif_down_delete_ire. 19178 */ 19179 if (ipif->ipif_isv6) { 19180 if (ill->ill_flags & ILLF_XRESOLV) 19181 ipif_arp_down(ipif); 19182 } else { 19183 ipif_arp_down(ipif); 19184 } 19185 19186 ip_rts_ifmsg(ipif); 19187 ip_rts_newaddrmsg(RTM_DELETE, 0, ipif); 19188 } 19189 19190 /* 19191 * Bring interface logically down without bringing the physical interface 19192 * down, e.g. when the netmask is changed. This avoids long-lasting link 19193 * negotiations between an ethernet interface and certain switches. 19194 */ 19195 static int 19196 ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp) 19197 { 19198 /* 19199 * The ill_logical_down flag is a transient flag. It is set here 19200 * and is cleared once the down has completed in ipif_down_tail. 19201 * This flag does not indicate whether the ill stream is in the 19202 * DL_BOUND state with the driver. Instead this flag is used by 19203 * ipif_down_tail to determine whether to DL_UNBIND the stream with 19204 * the driver. The state of the ill stream, i.e. whether it is 19205 * DL_BOUND with the driver or not, is indicated by the ill_dl_up flag. 19206 */ 19207 ipif->ipif_ill->ill_logical_down = 1; 19208 return (ipif_down(ipif, q, mp)); 19209 } 19210 19211 /* 19212 * This is called when the SIOCSLIFUSESRC ioctl is processed in IP. 19213 * Whether or not the usesrc client ILL is already part of a usesrc group, 19214 * an ire_stq with the matching usesrc client ILL will locate the 19215 * IREs that need to be deleted; we want new IREs to be created 19216 * with the new source address.
19217 */ 19218 static void 19219 ipif_delete_cache_ire(ire_t *ire, char *ill_arg) 19220 { 19221 ill_t *ucill = (ill_t *)ill_arg; 19222 19223 ASSERT(IAM_WRITER_ILL(ucill)); 19224 19225 if (ire->ire_stq == NULL) 19226 return; 19227 19228 if ((ire->ire_type == IRE_CACHE) && 19229 ((ill_t *)ire->ire_stq->q_ptr == ucill)) 19230 ire_delete(ire); 19231 } 19232 19233 /* 19234 * ire_walk routine to delete every IRE dependent on the interface 19235 * address that is going down. (Always called as writer.) 19236 * Works for both v4 and v6. 19237 * In addition to checking for ire_ipif matches, it also checks for 19238 * IRE_CACHE entries which have the same source address as the 19239 * disappearing ipif, since ipif_select_source might have picked 19240 * that source. Note that ipif_down/ipif_update_other_ipifs takes 19241 * care of any IRE_INTERFACE with the disappearing source address. 19242 */ 19243 static void 19244 ipif_down_delete_ire(ire_t *ire, char *ipif_arg) 19245 { 19246 ipif_t *ipif = (ipif_t *)ipif_arg; 19247 ill_t *ire_ill; 19248 ill_t *ipif_ill; 19249 19250 ASSERT(IAM_WRITER_IPIF(ipif)); 19251 if (ire->ire_ipif == NULL) 19252 return; 19253 19254 /* 19255 * For IPv4, we derive source addresses for an IRE from ipif's 19256 * belonging to the same IPMP group as the IRE's outgoing 19257 * interface. If an IRE's outgoing interface isn't in the 19258 * same IPMP group as a particular ipif, then that ipif 19259 * couldn't have been used as a source address for this IRE. 19260 * 19261 * For IPv6, source addresses are only restricted to the IPMP group 19262 * if the IRE is for a link-local address or a multicast address. 19263 * Otherwise, source addresses for an IRE can be chosen from 19264 * interfaces other than the outgoing interface for that IRE. 19265 * 19266 * For source address selection details, see ipif_select_source() 19267 * and ipif_select_source_v6(). 19268 */ 19269 if (ire->ire_ipversion == IPV4_VERSION || 19270 IN6_IS_ADDR_LINKLOCAL(&ire->ire_addr_v6) || 19271 IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6)) { 19272 ire_ill = ire->ire_ipif->ipif_ill; 19273 ipif_ill = ipif->ipif_ill; 19274 19275 if (ire_ill->ill_group != ipif_ill->ill_group) { 19276 return; 19277 } 19278 } 19279 19280 19281 if (ire->ire_ipif != ipif) { 19282 /* 19283 * Look for a matching source address. 19284 */ 19285 if (ire->ire_type != IRE_CACHE) 19286 return; 19287 if (ipif->ipif_flags & IPIF_NOLOCAL) 19288 return; 19289 19290 if (ire->ire_ipversion == IPV4_VERSION) { 19291 if (ire->ire_src_addr != ipif->ipif_src_addr) 19292 return; 19293 } else { 19294 if (!IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6, 19295 &ipif->ipif_v6lcl_addr)) 19296 return; 19297 } 19298 ire_delete(ire); 19299 return; 19300 } 19301 /* 19302 * ire_delete() will do an ire_flush_cache which will delete 19303 * all ire_ipif matches 19304 */ 19305 ire_delete(ire); 19306 } 19307 19308 /* 19309 * ire_walk_ill function for deleting all IRE_CACHE entries for an ill when 19310 * 1) an ipif (on that ill) changes the IPIF_DEPRECATED flags, or 19311 * 2) when an interface is brought up or down (on that ill). 19312 * This ensures that the IRE_CACHE entries don't retain stale source 19313 * address selection results. 19314 */ 19315 void 19316 ill_ipif_cache_delete(ire_t *ire, char *ill_arg) 19317 { 19318 ill_t *ill = (ill_t *)ill_arg; 19319 ill_t *ipif_ill; 19320 19321 ASSERT(IAM_WRITER_ILL(ill)); 19322 /* 19323 * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4. 19324 * Hence this should be IRE_CACHE.
19325 */ 19326 ASSERT(ire->ire_type == IRE_CACHE); 19327 19328 /* 19329 * We are called for IRE_CACHES whose ire_ipif matches ill. 19330 * We are only interested in IRE_CACHES that have borrowed 19331 * the source address from ill_arg, e.g. in ipif_up_done[_v6], 19332 * for which we need to check whether ire_ipif->ipif_ill 19333 * matches ill. 19334 */ 19335 ASSERT(ire->ire_ipif != NULL); 19336 ipif_ill = ire->ire_ipif->ipif_ill; 19337 if (ipif_ill == ill || (ill->ill_group != NULL && 19338 ipif_ill->ill_group == ill->ill_group)) { 19339 ire_delete(ire); 19340 } 19341 } 19342 19343 /* 19344 * Delete all the IREs whose stq references ill_arg. 19345 */ 19346 static void 19347 ill_stq_cache_delete(ire_t *ire, char *ill_arg) 19348 { 19349 ill_t *ill = (ill_t *)ill_arg; 19350 ill_t *ire_ill; 19351 19352 ASSERT(IAM_WRITER_ILL(ill)); 19353 /* 19354 * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4. 19355 * Hence this should be IRE_CACHE. 19356 */ 19357 ASSERT(ire->ire_type == IRE_CACHE); 19358 19359 /* 19360 * We are called for IRE_CACHES whose ire_stq and ire_ipif 19361 * matches ill. We are only interested in IRE_CACHES that 19362 * have ire_stq->q_ptr pointing at ill_arg. Thus we do the 19363 * filtering here. 19364 */ 19365 ire_ill = (ill_t *)ire->ire_stq->q_ptr; 19366 19367 if (ire_ill == ill) 19368 ire_delete(ire); 19369 } 19370 19371 /* 19372 * This is called when an ill leaves the group. We want to delete 19373 * all IRE_CACHES whose stq is pointing at ill_wq or whose ire_ipif is 19374 * pointing at ill. 19375 */ 19376 static void 19377 illgrp_cache_delete(ire_t *ire, char *ill_arg) 19378 { 19379 ill_t *ill = (ill_t *)ill_arg; 19380 19381 ASSERT(IAM_WRITER_ILL(ill)); 19382 ASSERT(ill->ill_group == NULL); 19383 /* 19384 * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4. 19385 * Hence this should be IRE_CACHE. 19386 */ 19387 ASSERT(ire->ire_type == IRE_CACHE); 19388 /* 19389 * We are called for IRE_CACHES whose ire_stq and ire_ipif 19390 * matches ill. We are interested in both. 19391 */ 19392 ASSERT((ill == (ill_t *)ire->ire_stq->q_ptr) || 19393 (ire->ire_ipif->ipif_ill == ill)); 19394 19395 ire_delete(ire); 19396 } 19397 19398 /* 19399 * Initiate deallocate of an IPIF. Always called as writer. Called by 19400 * ill_delete or ip_sioctl_removeif. 19401 */ 19402 static void 19403 ipif_free(ipif_t *ipif) 19404 { 19405 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 19406 19407 ASSERT(IAM_WRITER_IPIF(ipif)); 19408 19409 if (ipif->ipif_recovery_id != 0) 19410 (void) untimeout(ipif->ipif_recovery_id); 19411 ipif->ipif_recovery_id = 0; 19412 19413 /* Remove conn references */ 19414 reset_conn_ipif(ipif); 19415 19416 /* 19417 * Make sure we have valid net and subnet broadcast ire's for the 19418 * other ipif's which share them with this ipif. 19419 */ 19420 if (!ipif->ipif_isv6) 19421 ipif_check_bcast_ires(ipif); 19422 19423 /* 19424 * Take down the interface. We can be called either from ill_delete 19425 * or from ip_sioctl_removeif. 19426 */ 19427 (void) ipif_down(ipif, NULL, NULL); 19428 19429 /* 19430 * Now that the interface is down, there's no chance it can still 19431 * become a duplicate. Cancel any timer that may have been set while 19432 * tearing down.
19433 */ 19434 if (ipif->ipif_recovery_id != 0) 19435 (void) untimeout(ipif->ipif_recovery_id); 19436 ipif->ipif_recovery_id = 0; 19437 19438 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 19439 /* Remove pointers to this ill in the multicast routing tables */ 19440 reset_mrt_vif_ipif(ipif); 19441 rw_exit(&ipst->ips_ill_g_lock); 19442 } 19443 19444 /* 19445 * Warning: this is not the only function that calls mi_free on an ipif_t. See 19446 * also ill_move(). 19447 */ 19448 static void 19449 ipif_free_tail(ipif_t *ipif) 19450 { 19451 mblk_t *mp; 19452 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 19453 19454 /* 19455 * Free state for additional IRE_IF_[NO]RESOLVER ire's. 19456 */ 19457 mutex_enter(&ipif->ipif_saved_ire_lock); 19458 mp = ipif->ipif_saved_ire_mp; 19459 ipif->ipif_saved_ire_mp = NULL; 19460 mutex_exit(&ipif->ipif_saved_ire_lock); 19461 freemsg(mp); 19462 19463 /* 19464 * Need to hold both ill_g_lock and ill_lock while 19465 * inserting or removing an ipif from the linked list 19466 * of ipifs hanging off the ill. 19467 */ 19468 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 19469 /* 19470 * Remove all multicast memberships on the interface now. 19471 * This removes IPv4 multicast memberships joined within 19472 * the kernel, as ipif_down does not do ipif_multicast_down 19473 * for IPv4. IPv6 is not handled here, as the multicast memberships 19474 * are based on ill and not on ipif. 19475 */ 19476 ilm_free(ipif); 19477 19478 /* 19479 * Since we held the ill_g_lock while doing the ilm_free above, 19480 * we can assert the ilms were really deleted and not just marked 19481 * ILM_DELETED. 19482 */ 19483 ASSERT(ilm_walk_ipif(ipif) == 0); 19484 19485 IPIF_TRACE_CLEANUP(ipif); 19486 19487 /* Ask SCTP to take it out of its list */ 19488 sctp_update_ipif(ipif, SCTP_IPIF_REMOVE); 19489 19490 /* Get it out of the ILL interface list. */ 19491 ipif_remove(ipif, B_TRUE); 19492 rw_exit(&ipst->ips_ill_g_lock); 19493 19494 mutex_destroy(&ipif->ipif_saved_ire_lock); 19495 19496 ASSERT(!(ipif->ipif_flags & (IPIF_UP | IPIF_DUPLICATE))); 19497 ASSERT(ipif->ipif_recovery_id == 0); 19498 19499 /* Free the memory. */ 19500 mi_free(ipif); 19501 } 19502 19503 /* 19504 * Returns an ipif name in the form "ill_name:unit" if ipif_id is not zero, 19505 * "ill_name" otherwise. 19506 */ 19507 char * 19508 ipif_get_name(const ipif_t *ipif, char *buf, int len) 19509 { 19510 char lbuf[32]; 19511 char *name; 19512 size_t name_len; 19513 19514 buf[0] = '\0'; 19515 if (!ipif) 19516 return (buf); 19517 name = ipif->ipif_ill->ill_name; 19518 name_len = ipif->ipif_ill->ill_name_length; 19519 if (ipif->ipif_id != 0) { 19520 (void) sprintf(lbuf, "%s%c%d", name, IPIF_SEPARATOR_CHAR, 19521 ipif->ipif_id); 19522 name = lbuf; 19523 name_len = mi_strlen(name) + 1; 19524 } 19525 len -= 1; 19526 buf[len] = '\0'; 19527 len = MIN(len, name_len); 19528 bcopy(name, buf, len); 19529 return (buf); 19530 } 19531 19532 /* 19533 * Find an IPIF based on the name passed in. Names can be of the 19534 * form <phys> (e.g., le0) or <phys>:<#> (e.g., le0:1). 19535 * The <phys> string can have forms like <dev><#> (e.g., le0), 19536 * <dev><#>.<module> (e.g. le0.foo), or <dev>.<module><#> (e.g. ip.tun3). 19537 * When there is no colon, the implied unit id is zero. <phys> must 19538 * correspond to the name of an ILL. (May be called as writer.)
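 *
 * For example (hypothetical device names):
 *	"hme0"		ill "hme0",    ipif id 0
 *	"hme0:2"	ill "hme0",    ipif id 2
 *	"ip.tun3:1"	ill "ip.tun3", ipif id 1
 * whereas "hme0:02" is rejected, since ids with leading zeroes would
 * make interface naming ambiguous (see the check below).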
19539 */ 19540 static ipif_t * 19541 ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, 19542 boolean_t *exists, boolean_t isv6, zoneid_t zoneid, queue_t *q, 19543 mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst) 19544 { 19545 char *cp; 19546 char *endp; 19547 long id; 19548 ill_t *ill; 19549 ipif_t *ipif; 19550 uint_t ire_type; 19551 boolean_t did_alloc = B_FALSE; 19552 ipsq_t *ipsq; 19553 19554 if (error != NULL) 19555 *error = 0; 19556 19557 /* 19558 * If the caller wants us to create the ipif, make sure we have a 19559 * valid zoneid. 19560 */ 19561 ASSERT(!do_alloc || zoneid != ALL_ZONES); 19562 19563 if (namelen == 0) { 19564 if (error != NULL) 19565 *error = ENXIO; 19566 return (NULL); 19567 } 19568 19569 *exists = B_FALSE; 19570 /* Look for a colon in the name. */ 19571 endp = &name[namelen]; 19572 for (cp = endp; --cp > name; ) { 19573 if (*cp == IPIF_SEPARATOR_CHAR) 19574 break; 19575 } 19576 19577 if (*cp == IPIF_SEPARATOR_CHAR) { 19578 /* 19579 * Reject any non-decimal aliases for logical 19580 * interfaces. Aliases with leading zeroes 19581 * are also rejected as they introduce ambiguity 19582 * in the naming of the interfaces. 19583 * In order to conform to existing semantics, 19584 * and to not break any programs/scripts relying 19585 * on that behaviour, if<0>:0 is considered to be 19586 * a valid interface. 19587 * 19588 * If alias has two or more digits and the first 19589 * is zero, fail. 19590 */ 19591 if (&cp[2] < endp && cp[1] == '0') 19592 return (NULL); 19593 } 19594 19595 if (cp <= name) { 19596 cp = endp; 19597 } else { 19598 *cp = '\0'; 19599 } 19600 19601 /* 19602 * Look up the ILL, based on the portion of the name 19603 * before the colon. ill_lookup_on_name returns a held ill. 19604 * did_alloc is a temporary, to check whether the ill already 19605 * exists; if so, ill_lookup_on_name will clear it. 19606 */ 19607 ill = ill_lookup_on_name(name, do_alloc, isv6, 19608 q, mp, func, error, &did_alloc, ipst); 19609 if (cp != endp) 19610 *cp = IPIF_SEPARATOR_CHAR; 19611 if (ill == NULL) 19612 return (NULL); 19613 19614 /* Establish the unit number in the name. */ 19615 id = 0; 19616 if (cp < endp && *endp == '\0') { 19617 /* If there was a colon, the unit number follows. */ 19618 cp++; 19619 if (ddi_strtol(cp, NULL, 0, &id) != 0) { 19620 ill_refrele(ill); 19621 if (error != NULL) 19622 *error = ENXIO; 19623 return (NULL); 19624 } 19625 } 19626 19627 GRAB_CONN_LOCK(q); 19628 mutex_enter(&ill->ill_lock); 19629 /* Now see if there is an IPIF with this unit number. */ 19630 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 19631 if (ipif->ipif_id == id) { 19632 if (zoneid != ALL_ZONES && 19633 zoneid != ipif->ipif_zoneid && 19634 ipif->ipif_zoneid != ALL_ZONES) { 19635 mutex_exit(&ill->ill_lock); 19636 RELEASE_CONN_LOCK(q); 19637 ill_refrele(ill); 19638 if (error != NULL) 19639 *error = ENXIO; 19640 return (NULL); 19641 } 19642 /* 19643 * The block comment at the start of ipif_down 19644 * explains the use of the macros used below. 19645 */ 19646 if (IPIF_CAN_LOOKUP(ipif)) { 19647 ipif_refhold_locked(ipif); 19648 mutex_exit(&ill->ill_lock); 19649 if (!did_alloc) 19650 *exists = B_TRUE; 19651 /* 19652 * Drop locks before calling ill_refrele 19653 * since it can potentially call into 19654 * ipif_ill_refrele_tail which can end up 19655 * in trying to acquire any lock.
19656 */ 19657 RELEASE_CONN_LOCK(q); 19658 ill_refrele(ill); 19659 return (ipif); 19660 } else if (IPIF_CAN_WAIT(ipif, q)) { 19661 ipsq = ill->ill_phyint->phyint_ipsq; 19662 mutex_enter(&ipsq->ipsq_lock); 19663 mutex_exit(&ill->ill_lock); 19664 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 19665 mutex_exit(&ipsq->ipsq_lock); 19666 RELEASE_CONN_LOCK(q); 19667 ill_refrele(ill); 19668 *error = EINPROGRESS; 19669 return (NULL); 19670 } 19671 } 19672 } 19673 RELEASE_CONN_LOCK(q); 19674 19675 if (!do_alloc) { 19676 mutex_exit(&ill->ill_lock); 19677 ill_refrele(ill); 19678 if (error != NULL) 19679 *error = ENXIO; 19680 return (NULL); 19681 } 19682 19683 /* 19684 * If none found, atomically allocate and return a new one. 19685 * Historically, we used IRE_LOOPBACK only for lun 0, and IRE_LOCAL 19686 * to support "receive only" use of lo0:1 etc. as is still done 19687 * below as an initial guess. 19688 * However, this is now likely to be overriden later in ipif_up_done() 19689 * when we know for sure what address has been configured on the 19690 * interface, since we might have more than one loopback interface 19691 * with a loopback address, e.g. in the case of zones, and all the 19692 * interfaces with loopback addresses need to be marked IRE_LOOPBACK. 19693 */ 19694 if (ill->ill_net_type == IRE_LOOPBACK && id == 0) 19695 ire_type = IRE_LOOPBACK; 19696 else 19697 ire_type = IRE_LOCAL; 19698 ipif = ipif_allocate(ill, id, ire_type, B_TRUE); 19699 if (ipif != NULL) 19700 ipif_refhold_locked(ipif); 19701 else if (error != NULL) 19702 *error = ENOMEM; 19703 mutex_exit(&ill->ill_lock); 19704 ill_refrele(ill); 19705 return (ipif); 19706 } 19707 19708 /* 19709 * This routine is called whenever a new address comes up on an ipif. If 19710 * we are configured to respond to address mask requests, then we are supposed 19711 * to broadcast an address mask reply at this time. This routine is also 19712 * called if we are already up, but a netmask change is made. This is legal 19713 * but might not make the system manager very popular. (May be called 19714 * as writer.) 19715 */ 19716 void 19717 ipif_mask_reply(ipif_t *ipif) 19718 { 19719 icmph_t *icmph; 19720 ipha_t *ipha; 19721 mblk_t *mp; 19722 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 19723 19724 #define REPLY_LEN (sizeof (icmp_ipha) + sizeof (icmph_t) + IP_ADDR_LEN) 19725 19726 if (!ipst->ips_ip_respond_to_address_mask_broadcast) 19727 return; 19728 19729 /* ICMP mask reply is IPv4 only */ 19730 ASSERT(!ipif->ipif_isv6); 19731 /* ICMP mask reply is not for a loopback interface */ 19732 ASSERT(ipif->ipif_ill->ill_wq != NULL); 19733 19734 mp = allocb(REPLY_LEN, BPRI_HI); 19735 if (mp == NULL) 19736 return; 19737 mp->b_wptr = mp->b_rptr + REPLY_LEN; 19738 19739 ipha = (ipha_t *)mp->b_rptr; 19740 bzero(ipha, REPLY_LEN); 19741 *ipha = icmp_ipha; 19742 ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl; 19743 ipha->ipha_src = ipif->ipif_src_addr; 19744 ipha->ipha_dst = ipif->ipif_brd_addr; 19745 ipha->ipha_length = htons(REPLY_LEN); 19746 ipha->ipha_ident = 0; 19747 19748 icmph = (icmph_t *)&ipha[1]; 19749 icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY; 19750 bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN); 19751 icmph->icmph_checksum = IP_CSUM(mp, sizeof (ipha_t), 0); 19752 19753 put(ipif->ipif_wq, mp); 19754 19755 #undef REPLY_LEN 19756 } 19757 19758 /* 19759 * When the mtu in the ipif changes, we call this routine through ire_walk 19760 * to update all the relevant IREs. 19761 * Skip IRE_LOCAL and "loopback" IRE_BROADCAST by checking ire_stq. 
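 *
 * A sketch of the intended use, when an ioctl such as SIOCSLIFMTU
 * changes the ipif's mtu (the exact walker variant is the caller's
 * choice), would be
 *
 *	ire_walk(ipif_mtu_change, (char *)ipif, ipst);
 *
 * after which every IRE that goes out through this ipif has
 * ire_max_frag clamped to MIN(ipif->ipif_mtu, IP_MAXPACKET).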
19762 */ 19763 static void 19764 ipif_mtu_change(ire_t *ire, char *ipif_arg) 19765 { 19766 ipif_t *ipif = (ipif_t *)ipif_arg; 19767 19768 if (ire->ire_stq == NULL || ire->ire_ipif != ipif) 19769 return; 19770 ire->ire_max_frag = MIN(ipif->ipif_mtu, IP_MAXPACKET); 19771 } 19772 19773 /* 19774 * When the mtu in the ill changes, we call this routine through ire_walk 19775 * to update all the relevant IREs. 19776 * Skip IRE_LOCAL and "loopback" IRE_BROADCAST by checking ire_stq. 19777 */ 19778 void 19779 ill_mtu_change(ire_t *ire, char *ill_arg) 19780 { 19781 ill_t *ill = (ill_t *)ill_arg; 19782 19783 if (ire->ire_stq == NULL || ire->ire_ipif->ipif_ill != ill) 19784 return; 19785 ire->ire_max_frag = ire->ire_ipif->ipif_mtu; 19786 } 19787 19788 /* 19789 * Join the ipif specific multicast groups. 19790 * Must be called after a mapping has been set up in the resolver. (Always 19791 * called as writer.) 19792 */ 19793 void 19794 ipif_multicast_up(ipif_t *ipif) 19795 { 19796 int err, index; 19797 ill_t *ill; 19798 19799 ASSERT(IAM_WRITER_IPIF(ipif)); 19800 19801 ill = ipif->ipif_ill; 19802 index = ill->ill_phyint->phyint_ifindex; 19803 19804 ip1dbg(("ipif_multicast_up\n")); 19805 if (!(ill->ill_flags & ILLF_MULTICAST) || ipif->ipif_multicast_up) 19806 return; 19807 19808 if (ipif->ipif_isv6) { 19809 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) 19810 return; 19811 19812 /* Join the all hosts multicast address */ 19813 ip1dbg(("ipif_multicast_up - addmulti\n")); 19814 /* 19815 * Passing B_TRUE means we have to join the multicast 19816 * membership on this interface even though this is 19817 * FAILED. If we join on a different one in the group, 19818 * we will not be able to delete the membership later 19819 * as we currently don't track where we join when we 19820 * join within the kernel unlike applications where 19821 * we have ilg/ilg_orig_index. See ip_addmulti_v6 19822 * for more on this. 19823 */ 19824 err = ip_addmulti_v6(&ipv6_all_hosts_mcast, ill, index, 19825 ipif->ipif_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); 19826 if (err != 0) { 19827 ip0dbg(("ipif_multicast_up: " 19828 "all_hosts_mcast failed %d\n", 19829 err)); 19830 return; 19831 } 19832 /* 19833 * Enable multicast for the solicited node multicast address 19834 */ 19835 if (!(ipif->ipif_flags & IPIF_NOLOCAL)) { 19836 in6_addr_t ipv6_multi = ipv6_solicited_node_mcast; 19837 19838 ipv6_multi.s6_addr32[3] |= 19839 ipif->ipif_v6lcl_addr.s6_addr32[3]; 19840 19841 err = ip_addmulti_v6(&ipv6_multi, ill, index, 19842 ipif->ipif_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE, 19843 NULL); 19844 if (err != 0) { 19845 ip0dbg(("ipif_multicast_up: solicited MC" 19846 " failed %d\n", err)); 19847 (void) ip_delmulti_v6(&ipv6_all_hosts_mcast, 19848 ill, ill->ill_phyint->phyint_ifindex, 19849 ipif->ipif_zoneid, B_TRUE, B_TRUE); 19850 return; 19851 } 19852 } 19853 } else { 19854 if (ipif->ipif_lcl_addr == INADDR_ANY) 19855 return; 19856 19857 /* Join the all hosts multicast address */ 19858 ip1dbg(("ipif_multicast_up - addmulti\n")); 19859 err = ip_addmulti(htonl(INADDR_ALLHOSTS_GROUP), ipif, 19860 ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); 19861 if (err) { 19862 ip0dbg(("ipif_multicast_up: failed %d\n", err)); 19863 return; 19864 } 19865 } 19866 ipif->ipif_multicast_up = 1; 19867 } 19868 19869 /* 19870 * Blow away any IPv6 multicast groups that we joined in ipif_multicast_up(); 19871 * any explicit memberships are blown away in ill_leave_multicast() when the 19872 * ill is brought down. 
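 *
 * For reference, the solicited-node group handled here is formed by
 * ORing the low 32 bits of the local address into
 * ipv6_solicited_node_mcast (ff02::1:ff00:0); e.g. for a local address
 * of fe80::1:2:3:4 the group involved is ff02::1:ff03:4.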
19873 */ 19874 static void 19875 ipif_multicast_down(ipif_t *ipif) 19876 { 19877 int err; 19878 19879 ASSERT(IAM_WRITER_IPIF(ipif)); 19880 19881 ip1dbg(("ipif_multicast_down\n")); 19882 if (!ipif->ipif_multicast_up) 19883 return; 19884 19885 ASSERT(ipif->ipif_isv6); 19886 19887 ip1dbg(("ipif_multicast_down - delmulti\n")); 19888 19889 /* 19890 * Leave the all hosts multicast address. Similar to ip_addmulti_v6, 19891 * we should look for ilms on this ill rather than the ones that have 19892 * been failed over here. They are here temporarily. As 19893 * ipif_multicast_up has joined on this ill, we should delete only 19894 * from this ill. 19895 */ 19896 err = ip_delmulti_v6(&ipv6_all_hosts_mcast, ipif->ipif_ill, 19897 ipif->ipif_ill->ill_phyint->phyint_ifindex, ipif->ipif_zoneid, 19898 B_TRUE, B_TRUE); 19899 if (err != 0) { 19900 ip0dbg(("ipif_multicast_down: all_hosts_mcast failed %d\n", 19901 err)); 19902 } 19903 /* 19904 * Disable multicast for the solicited node multicast address 19905 */ 19906 if (!(ipif->ipif_flags & IPIF_NOLOCAL)) { 19907 in6_addr_t ipv6_multi = ipv6_solicited_node_mcast; 19908 19909 ipv6_multi.s6_addr32[3] |= 19910 ipif->ipif_v6lcl_addr.s6_addr32[3]; 19911 19912 err = ip_delmulti_v6(&ipv6_multi, ipif->ipif_ill, 19913 ipif->ipif_ill->ill_phyint->phyint_ifindex, 19914 ipif->ipif_zoneid, B_TRUE, B_TRUE); 19915 19916 if (err != 0) { 19917 ip0dbg(("ipif_multicast_down: sol MC failed %d\n", 19918 err)); 19919 } 19920 } 19921 19922 ipif->ipif_multicast_up = 0; 19923 } 19924 19925 /* 19926 * Used when an interface comes up to recreate any extra routes on this 19927 * interface. 19928 */ 19929 static ire_t ** 19930 ipif_recover_ire(ipif_t *ipif) 19931 { 19932 mblk_t *mp; 19933 ire_t **ipif_saved_irep; 19934 ire_t **irep; 19935 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 19936 19937 ip1dbg(("ipif_recover_ire(%s:%u)", ipif->ipif_ill->ill_name, 19938 ipif->ipif_id)); 19939 19940 mutex_enter(&ipif->ipif_saved_ire_lock); 19941 ipif_saved_irep = (ire_t **)kmem_zalloc(sizeof (ire_t *) * 19942 ipif->ipif_saved_ire_cnt, KM_NOSLEEP); 19943 if (ipif_saved_irep == NULL) { 19944 mutex_exit(&ipif->ipif_saved_ire_lock); 19945 return (NULL); 19946 } 19947 19948 irep = ipif_saved_irep; 19949 for (mp = ipif->ipif_saved_ire_mp; mp != NULL; mp = mp->b_cont) { 19950 ire_t *ire; 19951 queue_t *rfq; 19952 queue_t *stq; 19953 ifrt_t *ifrt; 19954 uchar_t *src_addr; 19955 uchar_t *gateway_addr; 19956 mblk_t *resolver_mp; 19957 ushort_t type; 19958 19959 /* 19960 * When the ire was initially created and then added in 19961 * ip_rt_add(), it was created either using ipif->ipif_net_type 19962 * in the case of a traditional interface route, or as one of 19963 * the IRE_OFFSUBNET types (with the exception of 19964 * IRE_HOST types ire which is created by icmp_redirect() and 19965 * which we don't need to save or recover). In the case where 19966 * ipif->ipif_net_type was IRE_LOOPBACK, ip_rt_add() will update 19967 * the ire_type to IRE_IF_NORESOLVER before calling ire_add() 19968 * to satisfy software like GateD and Sun Cluster which creates 19969 * routes using the the loopback interface's address as a 19970 * gateway. 
19971 * 19972 * As ifrt->ifrt_type reflects the already updated ire_type and 19973 * since ire_create() expects that IRE_IF_NORESOLVER will have 19974 * a valid nce_res_mp field (which doesn't make sense for a 19975 * IRE_LOOPBACK), ire_create() will be called in the same way 19976 * here as in ip_rt_add(), namely using ipif->ipif_net_type when 19977 * the route looks like a traditional interface route (where 19978 * ifrt->ifrt_type & IRE_INTERFACE is true) and otherwise using 19979 * the saved ifrt->ifrt_type. This means that in the case where 19980 * ipif->ipif_net_type is IRE_LOOPBACK, the ire created by 19981 * ire_create() will be an IRE_LOOPBACK, it will then be turned 19982 * into an IRE_IF_NORESOLVER and then added by ire_add(). 19983 */ 19984 ifrt = (ifrt_t *)mp->b_rptr; 19985 if (ifrt->ifrt_type & IRE_INTERFACE) { 19986 rfq = NULL; 19987 stq = (ipif->ipif_net_type == IRE_IF_RESOLVER) 19988 ? ipif->ipif_rq : ipif->ipif_wq; 19989 src_addr = (ifrt->ifrt_flags & RTF_SETSRC) 19990 ? (uint8_t *)&ifrt->ifrt_src_addr 19991 : (uint8_t *)&ipif->ipif_src_addr; 19992 gateway_addr = NULL; 19993 resolver_mp = ipif->ipif_resolver_mp; 19994 type = ipif->ipif_net_type; 19995 } else if (ifrt->ifrt_type & IRE_BROADCAST) { 19996 /* Recover multiroute broadcast IRE. */ 19997 rfq = ipif->ipif_rq; 19998 stq = ipif->ipif_wq; 19999 src_addr = (ifrt->ifrt_flags & RTF_SETSRC) 20000 ? (uint8_t *)&ifrt->ifrt_src_addr 20001 : (uint8_t *)&ipif->ipif_src_addr; 20002 gateway_addr = (uint8_t *)&ifrt->ifrt_gateway_addr; 20003 resolver_mp = ipif->ipif_bcast_mp; 20004 type = ifrt->ifrt_type; 20005 } else { 20006 rfq = NULL; 20007 stq = NULL; 20008 src_addr = (ifrt->ifrt_flags & RTF_SETSRC) 20009 ? (uint8_t *)&ifrt->ifrt_src_addr : NULL; 20010 gateway_addr = (uint8_t *)&ifrt->ifrt_gateway_addr; 20011 resolver_mp = NULL; 20012 type = ifrt->ifrt_type; 20013 } 20014 20015 /* 20016 * Create a copy of the IRE with the saved address and netmask. 20017 */ 20018 ip1dbg(("ipif_recover_ire: creating IRE %s (%d) for " 20019 "0x%x/0x%x\n", 20020 ip_nv_lookup(ire_nv_tbl, ifrt->ifrt_type), ifrt->ifrt_type, 20021 ntohl(ifrt->ifrt_addr), 20022 ntohl(ifrt->ifrt_mask))); 20023 ire = ire_create( 20024 (uint8_t *)&ifrt->ifrt_addr, 20025 (uint8_t *)&ifrt->ifrt_mask, 20026 src_addr, 20027 gateway_addr, 20028 NULL, 20029 &ifrt->ifrt_max_frag, 20030 NULL, 20031 rfq, 20032 stq, 20033 type, 20034 resolver_mp, 20035 ipif, 20036 NULL, 20037 0, 20038 0, 20039 0, 20040 ifrt->ifrt_flags, 20041 &ifrt->ifrt_iulp_info, 20042 NULL, 20043 NULL, 20044 ipst); 20045 20046 if (ire == NULL) { 20047 mutex_exit(&ipif->ipif_saved_ire_lock); 20048 kmem_free(ipif_saved_irep, 20049 ipif->ipif_saved_ire_cnt * sizeof (ire_t *)); 20050 return (NULL); 20051 } 20052 20053 /* 20054 * Some software (for example, GateD and Sun Cluster) attempts 20055 * to create (what amount to) IRE_PREFIX routes with the 20056 * loopback address as the gateway. This is primarily done to 20057 * set up prefixes with the RTF_REJECT flag set (for example, 20058 * when generating aggregate routes.) 20059 * 20060 * If the IRE type (as defined by ipif->ipif_net_type) is 20061 * IRE_LOOPBACK, then we map the request into a 20062 * IRE_IF_NORESOLVER. 
20063 */ 20064 if (ipif->ipif_net_type == IRE_LOOPBACK) 20065 ire->ire_type = IRE_IF_NORESOLVER; 20066 /* 20067 * The ire is held by ire_add and will be refrele'd towards 20068 * the end of ipif_up_done. 20069 */ 20070 (void) ire_add(&ire, NULL, NULL, NULL, B_FALSE); 20071 *irep = ire; 20072 irep++; 20073 ip1dbg(("ipif_recover_ire: added ire %p\n", (void *)ire)); 20074 } 20075 mutex_exit(&ipif->ipif_saved_ire_lock); 20076 return (ipif_saved_irep); 20077 } 20078 20079 /* 20080 * Used to set the netmask and broadcast address to default values when the 20081 * interface is brought up. (Always called as writer.) 20082 */ 20083 static void 20084 ipif_set_default(ipif_t *ipif) 20085 { 20086 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 20087 20088 if (!ipif->ipif_isv6) { 20089 /* 20090 * Interface holds an IPv4 address. Default 20091 * mask is the natural netmask. 20092 */ 20093 if (!ipif->ipif_net_mask) { 20094 ipaddr_t v4mask; 20095 20096 v4mask = ip_net_mask(ipif->ipif_lcl_addr); 20097 V4MASK_TO_V6(v4mask, ipif->ipif_v6net_mask); 20098 } 20099 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 20100 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 20101 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 20102 } else { 20103 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 20104 ipif->ipif_v6net_mask, ipif->ipif_v6subnet); 20105 } 20106 /* 20107 * NOTE: SunOS 4.X does this even if the broadcast address 20108 * has already been set, thus we do the same here. 20109 */ 20110 if (ipif->ipif_flags & IPIF_BROADCAST) { 20111 ipaddr_t v4addr; 20112 20113 v4addr = ipif->ipif_subnet | ~ipif->ipif_net_mask; 20114 IN6_IPADDR_TO_V4MAPPED(v4addr, &ipif->ipif_v6brd_addr); 20115 } 20116 } else { 20117 /* 20118 * Interface holds an IPv6-only address. Default 20119 * mask is all-ones. 20120 */ 20121 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask)) 20122 ipif->ipif_v6net_mask = ipv6_all_ones; 20123 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 20124 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 20125 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 20126 } else { 20127 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 20128 ipif->ipif_v6net_mask, ipif->ipif_v6subnet); 20129 } 20130 } 20131 } 20132 20133 /* 20134 * Return 0 if this address can be used as a local address without causing 20135 * duplicate address problems. Otherwise, return EADDRNOTAVAIL if the address 20136 * is already up on a different ill, and EADDRINUSE if it's up on the same ill. 20137 * Special checks are needed to allow the same IPv6 link-local address 20138 * on different ills. 20139 * TODO: allow the same site-local address on different ills.
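 *
 * For example (hypothetical interfaces): with 10.0.0.1 already up on
 * hme0, bringing up 10.0.0.1 on hme0:1 fails with EADDRINUSE, and
 * bringing it up on hme1 fails with EADDRNOTAVAIL, whereas the same
 * IPv6 link-local address may be up on several ills at once.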
20140 */ 20141 int 20142 ip_addr_availability_check(ipif_t *new_ipif) 20143 { 20144 in6_addr_t our_v6addr; 20145 ill_t *ill; 20146 ipif_t *ipif; 20147 ill_walk_context_t ctx; 20148 ip_stack_t *ipst = new_ipif->ipif_ill->ill_ipst; 20149 20150 ASSERT(IAM_WRITER_IPIF(new_ipif)); 20151 ASSERT(MUTEX_HELD(&ipst->ips_ip_addr_avail_lock)); 20152 ASSERT(RW_READ_HELD(&ipst->ips_ill_g_lock)); 20153 20154 new_ipif->ipif_flags &= ~IPIF_UNNUMBERED; 20155 if (IN6_IS_ADDR_UNSPECIFIED(&new_ipif->ipif_v6lcl_addr) || 20156 IN6_IS_ADDR_V4MAPPED_ANY(&new_ipif->ipif_v6lcl_addr)) 20157 return (0); 20158 20159 our_v6addr = new_ipif->ipif_v6lcl_addr; 20160 20161 if (new_ipif->ipif_isv6) 20162 ill = ILL_START_WALK_V6(&ctx, ipst); 20163 else 20164 ill = ILL_START_WALK_V4(&ctx, ipst); 20165 20166 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 20167 for (ipif = ill->ill_ipif; ipif != NULL; 20168 ipif = ipif->ipif_next) { 20169 if ((ipif == new_ipif) || 20170 !(ipif->ipif_flags & IPIF_UP) || 20171 (ipif->ipif_flags & IPIF_UNNUMBERED)) 20172 continue; 20173 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, 20174 &our_v6addr)) { 20175 if (new_ipif->ipif_flags & IPIF_POINTOPOINT) 20176 new_ipif->ipif_flags |= IPIF_UNNUMBERED; 20177 else if (ipif->ipif_flags & IPIF_POINTOPOINT) 20178 ipif->ipif_flags |= IPIF_UNNUMBERED; 20179 else if (IN6_IS_ADDR_LINKLOCAL(&our_v6addr) && 20180 new_ipif->ipif_ill != ill) 20181 continue; 20182 else if (IN6_IS_ADDR_SITELOCAL(&our_v6addr) && 20183 new_ipif->ipif_ill != ill) 20184 continue; 20185 else if (new_ipif->ipif_zoneid != 20186 ipif->ipif_zoneid && 20187 ipif->ipif_zoneid != ALL_ZONES && 20188 IS_LOOPBACK(ill)) 20189 continue; 20190 else if (new_ipif->ipif_ill == ill) 20191 return (EADDRINUSE); 20192 else 20193 return (EADDRNOTAVAIL); 20194 } 20195 } 20196 } 20197 20198 return (0); 20199 } 20200 20201 /* 20202 * Bring up an ipif: bring up arp/ndp, bring up the DLPI stream, and add 20203 * IREs for the ipif. 20204 * When the routine returns EINPROGRESS then mp has been consumed and 20205 * the ioctl will be acked from ip_rput_dlpi. 20206 */ 20207 static int 20208 ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) 20209 { 20210 ill_t *ill = ipif->ipif_ill; 20211 boolean_t isv6 = ipif->ipif_isv6; 20212 int err = 0; 20213 boolean_t success; 20214 20215 ASSERT(IAM_WRITER_IPIF(ipif)); 20216 20217 ip1dbg(("ipif_up(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 20218 20219 /* Shouldn't get here if it is already up. */ 20220 if (ipif->ipif_flags & IPIF_UP) 20221 return (EALREADY); 20222 20223 /* Skip arp/ndp for any loopback interface. */ 20224 if (ill->ill_wq != NULL) { 20225 conn_t *connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL; 20226 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 20227 20228 if (!ill->ill_dl_up) { 20229 /* 20230 * ill_dl_up is not yet set. i.e. we are yet to 20231 * DL_BIND with the driver and this is the first 20232 * logical interface on the ill to become "up". 20233 * Tell the driver to get going (via DL_BIND_REQ). 20234 * Note that changing "significant" IFF_ flags 20235 * address/netmask etc cause a down/up dance, but 20236 * does not cause an unbind (DL_UNBIND) with the driver 20237 */ 20238 return (ill_dl_up(ill, ipif, mp, q)); 20239 } 20240 20241 /* 20242 * ipif_resolver_up may end up sending an 20243 * AR_INTERFACE_UP message to ARP, which would, in 20244 * turn send a DLPI message to the driver. ioctls are 20245 * serialized and so we cannot send more than one 20246 * interface up message at a time. 
If ipif_resolver_up 20247 * does send an interface up message to ARP, we get 20248 * EINPROGRESS and we will complete in ip_arp_done. 20249 */ 20250 20251 ASSERT(connp != NULL || !CONN_Q(q)); 20252 ASSERT(ipsq->ipsq_pending_mp == NULL); 20253 if (connp != NULL) 20254 mutex_enter(&connp->conn_lock); 20255 mutex_enter(&ill->ill_lock); 20256 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0); 20257 mutex_exit(&ill->ill_lock); 20258 if (connp != NULL) 20259 mutex_exit(&connp->conn_lock); 20260 if (!success) 20261 return (EINTR); 20262 20263 /* 20264 * Crank up IPv6 neighbor discovery 20265 * Unlike ARP, this should complete when 20266 * ipif_ndp_up returns. However, for 20267 * ILLF_XRESOLV interfaces we also send a 20268 * AR_INTERFACE_UP to the external resolver. 20269 * That ioctl will complete in ip_rput. 20270 */ 20271 if (isv6) { 20272 err = ipif_ndp_up(ipif, &ipif->ipif_v6lcl_addr); 20273 if (err != 0) { 20274 if (err != EINPROGRESS) 20275 mp = ipsq_pending_mp_get(ipsq, &connp); 20276 return (err); 20277 } 20278 } 20279 /* Now, ARP */ 20280 err = ipif_resolver_up(ipif, Res_act_initial); 20281 if (err == EINPROGRESS) { 20282 /* We will complete it in ip_arp_done */ 20283 return (err); 20284 } 20285 mp = ipsq_pending_mp_get(ipsq, &connp); 20286 ASSERT(mp != NULL); 20287 if (err != 0) 20288 return (err); 20289 } else { 20290 /* 20291 * Interfaces without underlying hardware don't do duplicate 20292 * address detection. 20293 */ 20294 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); 20295 ipif->ipif_addr_ready = 1; 20296 } 20297 return (isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif)); 20298 } 20299 20300 /* 20301 * Perform a bind for the physical device. 20302 * When the routine returns EINPROGRESS then mp has been consumed and 20303 * the ioctl will be acked from ip_rput_dlpi. 20304 * Allocate an unbind message and save it until ipif_down. 20305 */ 20306 static int 20307 ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) 20308 { 20309 areq_t *areq; 20310 mblk_t *areq_mp = NULL; 20311 mblk_t *bind_mp = NULL; 20312 mblk_t *unbind_mp = NULL; 20313 conn_t *connp; 20314 boolean_t success; 20315 uint16_t sap_addr; 20316 20317 ip1dbg(("ill_dl_up(%s)\n", ill->ill_name)); 20318 ASSERT(IAM_WRITER_ILL(ill)); 20319 ASSERT(mp != NULL); 20320 20321 /* Create a resolver cookie for ARP */ 20322 if (!ill->ill_isv6 && ill->ill_net_type == IRE_IF_RESOLVER) { 20323 areq_mp = ill_arp_alloc(ill, (uchar_t *)&ip_areq_template, 0); 20324 if (areq_mp == NULL) 20325 return (ENOMEM); 20326 20327 freemsg(ill->ill_resolver_mp); 20328 ill->ill_resolver_mp = areq_mp; 20329 areq = (areq_t *)areq_mp->b_rptr; 20330 sap_addr = ill->ill_sap; 20331 bcopy(&sap_addr, areq->areq_sap, sizeof (sap_addr)); 20332 } 20333 bind_mp = ip_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long), 20334 DL_BIND_REQ); 20335 if (bind_mp == NULL) 20336 goto bad; 20337 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ill->ill_sap; 20338 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS; 20339 20340 unbind_mp = ip_dlpi_alloc(sizeof (dl_unbind_req_t), DL_UNBIND_REQ); 20341 if (unbind_mp == NULL) 20342 goto bad; 20343 20344 /* 20345 * Record state needed to complete this operation when the 20346 * DL_BIND_ACK shows up. Also remember the pre-allocated mblks. 
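 *
 * As in ipif_down(), the ioctl mblk is parked via ipsq_pending_mp_add()
 * while holding conn_lock and ill_lock, so that the operation can be
 * resumed when the ACK arrives in ip_rput_dlpi, or aborted if the conn
 * closes first.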

uint_t ip_loopback_mtuplus = IP_LOOPBACK_MTU + IP_SIMPLE_HDR_LENGTH + 20;

/*
 * DLPI and ARP are up.
 * Create all the IREs associated with an interface and bring up multicast.
 * Set the interface flag and finish other initialization
 * that potentially had to be deferred until after DL_BIND_ACK.
 */
int
ipif_up_done(ipif_t *ipif)
{
	ire_t	*ire_array[20];
	ire_t	**irep = ire_array;
	ire_t	**irep1;
	ipaddr_t net_mask = 0;
	ipaddr_t subnet_mask, route_mask;
	ill_t	*ill = ipif->ipif_ill;
	queue_t	*stq;
	ipif_t	*src_ipif;
	ipif_t	*tmp_ipif;
	boolean_t	flush_ire_cache = B_TRUE;
	int	err = 0;
	phyint_t *phyi;
	ire_t	**ipif_saved_irep = NULL;
	int	ipif_saved_ire_cnt;
	int	cnt;
	boolean_t	src_ipif_held = B_FALSE;
	boolean_t	ire_added = B_FALSE;
	boolean_t	loopback = B_FALSE;
	ip_stack_t	*ipst = ill->ill_ipst;

	ip1dbg(("ipif_up_done(%s:%u)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id));
	/* Check if this is a loopback interface */
	if (ipif->ipif_ill->ill_wq == NULL)
		loopback = B_TRUE;

	ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
	/*
	 * If all other interfaces for this ill are down or DEPRECATED,
	 * or otherwise unsuitable for source address selection, remove
	 * any IRE_CACHE entries for this ill to make sure source
	 * address selection gets to take this new ipif into account.
	 * No need to hold ill_lock while traversing the ipif list since
	 * we are writer.
	 */
	for (tmp_ipif = ill->ill_ipif; tmp_ipif;
	    tmp_ipif = tmp_ipif->ipif_next) {
		if (((tmp_ipif->ipif_flags &
		    (IPIF_NOXMIT|IPIF_ANYCAST|IPIF_NOLOCAL|IPIF_DEPRECATED)) ||
		    !(tmp_ipif->ipif_flags & IPIF_UP)) ||
		    (tmp_ipif == ipif))
			continue;
		/* first usable pre-existing interface */
		flush_ire_cache = B_FALSE;
		break;
	}
	if (flush_ire_cache)
		ire_walk_ill_v4(MATCH_IRE_ILL_GROUP | MATCH_IRE_TYPE,
		    IRE_CACHE, ill_ipif_cache_delete, (char *)ill, ill);

	/*
	 * Figure out which way the send-to queue should go.  Only
	 * IRE_IF_RESOLVER or IRE_IF_NORESOLVER or IRE_LOOPBACK
	 * should show up here.
	 */
	switch (ill->ill_net_type) {
	case IRE_IF_RESOLVER:
		stq = ill->ill_rq;
		break;
	case IRE_IF_NORESOLVER:
	case IRE_LOOPBACK:
		stq = ill->ill_wq;
		break;
	default:
		return (EINVAL);
	}

	if (IS_LOOPBACK(ill)) {
		/*
		 * lo0:1 and subsequent ipifs were marked IRE_LOCAL in
		 * ipif_lookup_on_name(), but in the case of zones we can have
		 * several loopback addresses on lo0. So all the interfaces
		 * with loopback addresses need to be marked IRE_LOOPBACK.
		 */
		if (V4_PART_OF_V6(ipif->ipif_v6lcl_addr) ==
		    htonl(INADDR_LOOPBACK))
			ipif->ipif_ire_type = IRE_LOOPBACK;
		else
			ipif->ipif_ire_type = IRE_LOCAL;
	}

	if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED)) {
		/*
		 * Can't use our source address. Select a different
		 * source address for the IRE_INTERFACE and IRE_LOCAL.
		 */
		src_ipif = ipif_select_source(ipif->ipif_ill,
		    ipif->ipif_subnet, ipif->ipif_zoneid);
		if (src_ipif == NULL)
			src_ipif = ipif;	/* Last resort */
		else
			src_ipif_held = B_TRUE;
	} else {
		src_ipif = ipif;
	}

	/* Create all the IREs associated with this interface */
	if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
	    !(ipif->ipif_flags & IPIF_NOLOCAL)) {

		/*
		 * If we're on a labeled system then make sure that zone-
		 * private addresses have proper remote host database entries.
		 */
		if (is_system_labeled() &&
		    ipif->ipif_ire_type != IRE_LOOPBACK &&
		    !tsol_check_interface_address(ipif))
			return (EINVAL);

		/* Register the source address for __sin6_src_id */
		err = ip_srcid_insert(&ipif->ipif_v6lcl_addr,
		    ipif->ipif_zoneid, ipst);
		if (err != 0) {
			ip0dbg(("ipif_up_done: srcid_insert %d\n", err));
			return (err);
		}

		/* If the interface address is set, create the local IRE. */
		ip1dbg(("ipif_up_done: 0x%p creating IRE 0x%x for 0x%x\n",
		    (void *)ipif,
		    ipif->ipif_ire_type,
		    ntohl(ipif->ipif_lcl_addr)));
		*irep++ = ire_create(
		    (uchar_t *)&ipif->ipif_lcl_addr,	/* dest address */
		    (uchar_t *)&ip_g_all_ones,		/* mask */
		    (uchar_t *)&src_ipif->ipif_src_addr, /* source address */
		    NULL,				/* no gateway */
		    NULL,
		    &ip_loopback_mtuplus,		/* max frag size */
		    NULL,
		    ipif->ipif_rq,			/* recv-from queue */
		    NULL,				/* no send-to queue */
		    ipif->ipif_ire_type,		/* LOCAL or LOOPBACK */
		    NULL,
		    ipif,
		    NULL,
		    0,
		    0,
		    0,
		    (ipif->ipif_flags & IPIF_PRIVATE) ?
		    RTF_PRIVATE : 0,
		    &ire_uinfo_null,
		    NULL,
		    NULL,
		    ipst);
	} else {
		ip1dbg((
		    "ipif_up_done: not creating IRE %d for 0x%x: flags 0x%x\n",
		    ipif->ipif_ire_type,
		    ntohl(ipif->ipif_lcl_addr),
		    (uint_t)ipif->ipif_flags));
	}
	if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
	    !(ipif->ipif_flags & IPIF_NOLOCAL)) {
		net_mask = ip_net_mask(ipif->ipif_lcl_addr);
	} else {
		net_mask = htonl(IN_CLASSA_NET);	/* fallback */
	}

	subnet_mask = ipif->ipif_net_mask;

	/*
	 * If mask was not specified, use natural netmask of
	 * interface address. Also, store this mask back into the
	 * ipif struct.
	 */
	if (subnet_mask == 0) {
		subnet_mask = net_mask;
		V4MASK_TO_V6(subnet_mask, ipif->ipif_v6net_mask);
		V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
		    ipif->ipif_v6subnet);
	}

	/* Set up the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. */
	if (stq != NULL && !(ipif->ipif_flags & IPIF_NOXMIT) &&
	    ipif->ipif_subnet != INADDR_ANY) {
		/* ipif_subnet is ipif_pp_dst_addr for pt-pt */

		if (ipif->ipif_flags & IPIF_POINTOPOINT) {
			route_mask = IP_HOST_MASK;
		} else {
			route_mask = subnet_mask;
		}

		ip1dbg(("ipif_up_done: ipif 0x%p ill 0x%p "
		    "creating if IRE ill_net_type 0x%x for 0x%x\n",
		    (void *)ipif, (void *)ill,
		    ill->ill_net_type,
		    ntohl(ipif->ipif_subnet)));
		*irep++ = ire_create(
		    (uchar_t *)&ipif->ipif_subnet,	/* dest address */
		    (uchar_t *)&route_mask,		/* mask */
		    (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */
		    NULL,				/* no gateway */
		    NULL,
		    &ipif->ipif_mtu,			/* max frag */
		    NULL,
		    NULL,				/* no recv queue */
		    stq,				/* send-to queue */
		    ill->ill_net_type,			/* IF_[NO]RESOLVER */
		    ill->ill_resolver_mp,		/* xmit header */
		    ipif,
		    NULL,
		    0,
		    0,
		    0,
		    (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE : 0,
		    &ire_uinfo_null,
		    NULL,
		    NULL,
		    ipst);
	}

	/*
	 * If the interface address is set, create the broadcast IREs.
	 *
	 * ire_create_bcast checks if the proposed new IRE matches
	 * any existing IRE's with the same physical interface (ILL).
	 * This should get rid of duplicates.
	 * ire_create_bcast also checks IPIF_NOXMIT and in that case
	 * does not create any broadcast ires.
	 */
	if ((ipif->ipif_subnet != INADDR_ANY) &&
	    (ipif->ipif_flags & IPIF_BROADCAST)) {
		ipaddr_t addr;

		ip1dbg(("ipif_up_done: creating broadcast IRE\n"));
		irep = ire_check_and_create_bcast(ipif, 0, irep,
		    (MATCH_IRE_TYPE | MATCH_IRE_ILL));
		irep = ire_check_and_create_bcast(ipif, INADDR_BROADCAST, irep,
		    (MATCH_IRE_TYPE | MATCH_IRE_ILL));

		/*
		 * For backward compatibility, we need to create net
		 * broadcast ire's based on the old "IP address class
		 * system."  The reason is that some old machines only
		 * respond to these class-derived net broadcasts.
		 *
		 * But we should not create these net broadcast ire's if
		 * the subnet_mask is shorter than the IP address class based
		 * derived netmask.  Otherwise, we may create a net
		 * broadcast address which is the same as an IP address
		 * on the subnet.  Then TCP will refuse to talk to that
		 * address.
		 *
		 * Nor do we need IRE_BROADCAST ire's for the interface
		 * with the netmask as 0xFFFFFFFF, as the IRE_LOCAL for that
		 * interface is already created.  Creating these broadcast
		 * ire's will only create confusion as the "addr" is going
		 * to be same as that of the IP address of the interface.
		 */
		if (net_mask < subnet_mask) {
			addr = net_mask & ipif->ipif_subnet;
			irep = ire_check_and_create_bcast(ipif, addr, irep,
			    (MATCH_IRE_TYPE | MATCH_IRE_ILL));
			irep = ire_check_and_create_bcast(ipif,
			    ~net_mask | addr, irep,
			    (MATCH_IRE_TYPE | MATCH_IRE_ILL));
		}

		if (subnet_mask != 0xFFFFFFFF) {
			addr = ipif->ipif_subnet;
			irep = ire_check_and_create_bcast(ipif, addr, irep,
			    (MATCH_IRE_TYPE | MATCH_IRE_ILL));
			irep = ire_check_and_create_bcast(ipif,
			    ~subnet_mask | addr, irep,
			    (MATCH_IRE_TYPE | MATCH_IRE_ILL));
		}
	}

	ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));

	/* If an earlier ire_create failed, get out now */
	for (irep1 = irep; irep1 > ire_array; ) {
		irep1--;
		if (*irep1 == NULL) {
			ip1dbg(("ipif_up_done: NULL ire found in ire_array\n"));
			err = ENOMEM;
			goto bad;
		}
	}

	/*
	 * Need to atomically run ip_addr_availability_check()
	 * under ip_addr_avail_lock; if it fails, go to bad and remove
	 * from the group as well. The ill_g_lock is grabbed as reader
	 * just to make sure no new ills or new ipifs are being added
	 * to the system while we are checking the uniqueness of addresses.
	 */
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	mutex_enter(&ipst->ips_ip_addr_avail_lock);
	/* Mark it up, and increment counters. */
	ipif->ipif_flags |= IPIF_UP;
	ill->ill_ipif_up_count++;
	err = ip_addr_availability_check(ipif);
	mutex_exit(&ipst->ips_ip_addr_avail_lock);
	rw_exit(&ipst->ips_ill_g_lock);

	if (err != 0) {
		/*
		 * Our address may already be up on the same ill. In this case,
		 * the ARP entry for our ipif replaced the one for the other
		 * ipif. So we don't want to delete it (otherwise the other
		 * ipif would be unable to send packets).
		 * ip_addr_availability_check() identifies this case for us and
		 * returns EADDRINUSE; we need to turn it into EADDRNOTAVAIL
		 * which is the expected error code.
		 */
		if (err == EADDRINUSE) {
			freemsg(ipif->ipif_arp_del_mp);
			ipif->ipif_arp_del_mp = NULL;
			err = EADDRNOTAVAIL;
		}
		ill->ill_ipif_up_count--;
		ipif->ipif_flags &= ~IPIF_UP;
		goto bad;
	}

	/*
	 * Add in all newly created IREs.  ire_create_bcast() has
	 * already checked for duplicates of the IRE_BROADCAST type.
	 * We want to add before we call illgrp_insert which wants
	 * to know whether IRE_IF_RESOLVER exists or not.
	 *
	 * NOTE : We refrele the ire though we may branch to "bad"
	 *	  later on where we do ire_delete. This is okay
	 *	  because nobody can delete it as we are running
	 *	  exclusively.
	 */
	for (irep1 = irep; irep1 > ire_array; ) {
		irep1--;
		ASSERT(!MUTEX_HELD(&((*irep1)->ire_ipif->ipif_ill->ill_lock)));
		/*
		 * refheld by ire_add; refrele towards the end of the func.
		 */
		(void) ire_add(irep1, NULL, NULL, NULL, B_FALSE);
	}
	ire_added = B_TRUE;
	/*
	 * Form groups if possible.
	 *
	 * If we are supposed to be in an ill_group with a name, insert it
	 * now as we know that at least one ipif is UP. Otherwise form
	 * nameless groups.
	 *
	 * If ip_enable_group_ifs is set and the ipif address is not 0, insert
	 * this ipif into the appropriate interface group, or create a
	 * new one. If this is already in a nameless group, we try to form
	 * a bigger group looking at other ills potentially sharing this
	 * ipif's prefix.
	 */
	phyi = ill->ill_phyint;
	if (phyi->phyint_groupname_len != 0) {
		ASSERT(phyi->phyint_groupname != NULL);
		if (ill->ill_ipif_up_count == 1) {
			ASSERT(ill->ill_group == NULL);
			err = illgrp_insert(&ipst->ips_illgrp_head_v4, ill,
			    phyi->phyint_groupname, NULL, B_TRUE);
			if (err != 0) {
				ip1dbg(("ipif_up_done: illgrp allocation "
				    "failed, error %d\n", err));
				goto bad;
			}
		}
		ASSERT(ill->ill_group != NULL);
	}

	/*
	 * When this is part of a group, we need to make sure that
	 * any broadcast ires created because of this ipif coming
	 * UP get marked/cleared with IRE_MARK_NORECV appropriately
	 * so that we don't receive duplicate broadcast packets.
	 */
	if (ill->ill_group != NULL && ill->ill_ipif_up_count != 0)
		ipif_renominate_bcast(ipif);

	/* Recover any additional IRE_IF_[NO]RESOLVER entries for this ipif */
	ipif_saved_ire_cnt = ipif->ipif_saved_ire_cnt;
	ipif_saved_irep = ipif_recover_ire(ipif);

	if (!loopback) {
		/*
		 * If the broadcast address has been set, make sure it makes
		 * sense based on the interface address.
		 * Only match on ill since we are sharing broadcast addresses.
		 */
		if ((ipif->ipif_brd_addr != INADDR_ANY) &&
		    (ipif->ipif_flags & IPIF_BROADCAST)) {
			ire_t	*ire;

			ire = ire_ctable_lookup(ipif->ipif_brd_addr, 0,
			    IRE_BROADCAST, ipif, ALL_ZONES,
			    NULL, (MATCH_IRE_TYPE | MATCH_IRE_ILL), ipst);

			if (ire == NULL) {
				/*
				 * If there isn't a matching broadcast IRE,
				 * revert to the default for this netmask.
				 */
				ipif->ipif_v6brd_addr = ipv6_all_zeros;
				mutex_enter(&ipif->ipif_ill->ill_lock);
				ipif_set_default(ipif);
				mutex_exit(&ipif->ipif_ill->ill_lock);
			} else {
				ire_refrele(ire);
			}
		}

	}

	/* This is the first interface on this ill */
	if (ill->ill_ipif_up_count == 1 && !loopback) {
		/*
		 * Need to recover all multicast memberships in the driver.
		 * This had to be deferred until we had attached.
		 */
		ill_recover_multicast(ill);
	}
	/* Join the allhosts multicast address */
	ipif_multicast_up(ipif);

	if (!loopback) {
		/*
		 * See whether anybody else would benefit from the
		 * new ipif that we added. We do this unconditionally,
		 * rather than only when adding a
		 * non-IPIF_NOLOCAL/DEPRECATED/ANYCAST ipif, for the
		 * benefit of illgrp_insert (done above), which does
		 * not do source address selection as it does not want
		 * to re-create the interface routes we hold references
		 * to here.
		 */
		ill_update_source_selection(ill);
	}

	for (irep1 = irep; irep1 > ire_array; ) {
		irep1--;
		if (*irep1 != NULL) {
			/* was held in ire_add */
			ire_refrele(*irep1);
		}
	}

	cnt = ipif_saved_ire_cnt;
	for (irep1 = ipif_saved_irep; cnt > 0; irep1++, cnt--) {
		if (*irep1 != NULL) {
			/* was held in ire_add */
			ire_refrele(*irep1);
		}
	}

	if (!loopback && ipif->ipif_addr_ready) {
		/* Broadcast an address mask reply. */
		ipif_mask_reply(ipif);
	}
	if (ipif_saved_irep != NULL) {
		kmem_free(ipif_saved_irep,
		    ipif_saved_ire_cnt * sizeof (ire_t *));
	}
	if (src_ipif_held)
		ipif_refrele(src_ipif);

	/*
	 * This had to be deferred until we had bound. Tell routing sockets and
	 * others that this interface is up if it looks like the address has
	 * been validated. Otherwise, if it isn't ready yet, wait for
	 * duplicate address detection to do its thing.
	 */
	if (ipif->ipif_addr_ready) {
		ip_rts_ifmsg(ipif);
		ip_rts_newaddrmsg(RTM_ADD, 0, ipif);
		/* Let SCTP update the status for this ipif */
		sctp_update_ipif(ipif, SCTP_IPIF_UP);
	}
	return (0);

bad:
	ip1dbg(("ipif_up_done: FAILED\n"));
	/*
	 * We don't have to bother removing from ill groups because
	 *
	 * 1) For groups with names, we insert only when the first ipif
	 *    comes up. In that case if it fails, it will not be in any
	 *    group. So, we need not try to remove for that case.
	 *
	 * 2) For groups without names, either we tried to insert ipif_ill
	 *    in a group as singleton or found some other group to become
	 *    a bigger group. For the former, if it fails we don't have
	 *    anything to do as ipif_ill is not in the group and for the
	 *    latter, there are no failures in illgrp_insert/illgrp_delete
	 *    (ENOMEM can't occur for this. Check illgrp_insert).
	 */
	while (irep > ire_array) {
		irep--;
		if (*irep != NULL) {
			ire_delete(*irep);
			if (ire_added)
				ire_refrele(*irep);
		}
	}
	(void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst);

	if (ipif_saved_irep != NULL) {
		kmem_free(ipif_saved_irep,
		    ipif_saved_ire_cnt * sizeof (ire_t *));
	}
	if (src_ipif_held)
		ipif_refrele(src_ipif);

	ipif_arp_down(ipif);
	return (err);
}
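
/*
 * Taken together, a successful ipif_up_done() leaves behind up to four
 * kinds of IREs for the new address: an IRE_LOCAL or IRE_LOOPBACK for
 * the address itself, an IRE_IF_RESOLVER/IRE_IF_NORESOLVER interface
 * route for the subnet (a host route for point-to-point), the broadcast
 * set built by ire_check_and_create_bcast(), and any saved
 * IRE_IF_[NO]RESOLVER entries recovered via ipif_recover_ire().
 */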

/*
 * Turn off ARP, as is done when the ILLF_NOARP flag is set.
 */
static int
ill_arp_off(ill_t *ill)
{
	mblk_t	*arp_off_mp = NULL;
	mblk_t	*arp_on_mp = NULL;

	ip1dbg(("ill_arp_off(%s)\n", ill->ill_name));

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);

	/*
	 * If the on message is still around we've already done
	 * an arp_off without doing an arp_on thus there is no
	 * work needed.
	 */
	if (ill->ill_arp_on_mp != NULL)
		return (0);

	/*
	 * Allocate an ARP on message (to be saved) and an ARP off message.
	 */
	arp_off_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aroff_template, 0);
	if (!arp_off_mp)
		return (ENOMEM);

	arp_on_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aron_template, 0);
	if (!arp_on_mp)
		goto failed;

	ASSERT(ill->ill_arp_on_mp == NULL);
	ill->ill_arp_on_mp = arp_on_mp;

	/* Send an AR_INTERFACE_OFF request */
	putnext(ill->ill_rq, arp_off_mp);
	return (0);
failed:

	if (arp_off_mp)
		freemsg(arp_off_mp);
	return (ENOMEM);
}

/*
 * Turn on ARP by turning off the ILLF_NOARP flag.
 */
static int
ill_arp_on(ill_t *ill)
{
	mblk_t	*mp;

	ip1dbg(("ill_arp_on(%s)\n", ill->ill_name));

	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);

	ASSERT(IAM_WRITER_ILL(ill));
	/*
	 * Send an AR_INTERFACE_ON request if we have already done
	 * an arp_off (which allocated the message).
	 */
	if (ill->ill_arp_on_mp != NULL) {
		mp = ill->ill_arp_on_mp;
		ill->ill_arp_on_mp = NULL;
		putnext(ill->ill_rq, mp);
	}
	return (0);
}

/*
 * Called either after deleting the ill from the group or when setting
 * FAILED or STANDBY on the interface.
 */
static void
illgrp_reset_schednext(ill_t *ill)
{
	ill_group_t *illgrp;
	ill_t *save_ill;

	ASSERT(IAM_WRITER_ILL(ill));
	/*
	 * When called from illgrp_delete, ill_group will be non-NULL.
	 * But when called from ip_sioctl_flags, it could be NULL if
	 * somebody is setting FAILED/INACTIVE on some interface which
	 * is not part of a group.
	 */
	illgrp = ill->ill_group;
	if (illgrp == NULL)
		return;
	if (illgrp->illgrp_ill_schednext != ill)
		return;

	illgrp->illgrp_ill_schednext = NULL;
	save_ill = ill;
	/*
	 * Choose a good ill to be the next one for
	 * outbound traffic. As the FAILED/STANDBY flags
	 * are not yet marked when called from ip_sioctl_flags,
	 * we check for the ill separately.
	 */
	for (ill = illgrp->illgrp_ill; ill != NULL;
	    ill = ill->ill_group_next) {
		if ((ill != save_ill) &&
		    !(ill->ill_phyint->phyint_flags &
		    (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE))) {
			illgrp->illgrp_ill_schednext = ill;
			return;
		}
	}
}
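
/*
 * illgrp_ill_schednext is the rotor for spreading outbound traffic
 * across the ills of a group: illgrp_scheduler() below starts its
 * circular search at this pointer and advances it past whichever ill
 * it hands out, so consecutive ip_newroute() calls land on different
 * usable interfaces.
 */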

/*
 * Given an ill, find the next ill in the group to be scheduled.
 * (This should be called by ip_newroute() before ire_create().)
 * The passed-in ill may be pulled out of the group after we have picked
 * up a different outgoing ill from the same group. However, ire_add will
 * atomically check this.
 */
ill_t *
illgrp_scheduler(ill_t *ill)
{
	ill_t *retill;
	ill_group_t *illgrp;
	int illcnt;
	int i;
	uint64_t flags;
	ip_stack_t	*ipst = ill->ill_ipst;

	/*
	 * We don't use a lock to check for the ill_group. If this ill
	 * is currently being inserted we may end up just returning this
	 * ill itself. That is ok.
	 */
	if (ill->ill_group == NULL) {
		ill_refhold(ill);
		return (ill);
	}

	/*
	 * Grab the ill_g_lock as reader to make sure we are dealing with
	 * a set of stable ills. No ill can be added or deleted or change
	 * group while we hold the reader lock.
	 */
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	if ((illgrp = ill->ill_group) == NULL) {
		rw_exit(&ipst->ips_ill_g_lock);
		ill_refhold(ill);
		return (ill);
	}

	illcnt = illgrp->illgrp_ill_count;
	mutex_enter(&illgrp->illgrp_lock);
	retill = illgrp->illgrp_ill_schednext;

	if (retill == NULL)
		retill = illgrp->illgrp_ill;

	/*
	 * We do a circular search beginning at illgrp_ill_schednext
	 * or illgrp_ill. We don't check the flags against the ill lock
	 * since it can change anytime. The ire creation will be atomic
	 * and will fail if the ill is FAILED or OFFLINE.
	 */
	for (i = 0; i < illcnt; i++) {
		flags = retill->ill_phyint->phyint_flags;

		if (!(flags & (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) &&
		    ILL_CAN_LOOKUP(retill)) {
			illgrp->illgrp_ill_schednext = retill->ill_group_next;
			ill_refhold(retill);
			break;
		}
		retill = retill->ill_group_next;
		if (retill == NULL)
			retill = illgrp->illgrp_ill;
	}
	mutex_exit(&illgrp->illgrp_lock);
	rw_exit(&ipst->ips_ill_g_lock);

	return (i == illcnt ? NULL : retill);
}
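
/*
 * Illustrative use of illgrp_scheduler() (a sketch, not a verbatim
 * excerpt of ip_newroute): the ill comes back refheld, and the caller
 * owns that reference.
 *
 *	ill_t *dst_ill = illgrp_scheduler(ill);
 *
 *	if (dst_ill == NULL)
 *		return;			(no usable ill in the group)
 *	... create the IRE_CACHE with dst_ill as the outgoing ill ...
 *	ill_refrele(dst_ill);
 */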

/*
 * Checks for the availability of a usable source address (if there is one)
 * when the destination ILL has the ill_usesrc_ifindex pointing to another
 * ILL. Note this selection is done regardless of the destination.
 */
boolean_t
ipif_usesrc_avail(ill_t *ill, zoneid_t zoneid)
{
	uint_t	ifindex;
	ipif_t	*ipif = NULL;
	ill_t	*uill;
	boolean_t isv6;
	ip_stack_t	*ipst = ill->ill_ipst;

	ASSERT(ill != NULL);

	isv6 = ill->ill_isv6;
	ifindex = ill->ill_usesrc_ifindex;
	if (ifindex != 0) {
		uill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL,
		    NULL, ipst);
		if (uill == NULL)
			return (B_FALSE);
		mutex_enter(&uill->ill_lock);
		for (ipif = uill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if (!IPIF_CAN_LOOKUP(ipif))
				continue;
			if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
				continue;
			if (!(ipif->ipif_flags & IPIF_UP))
				continue;
			if (ipif->ipif_zoneid != zoneid)
				continue;
			if ((isv6 &&
			    IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) ||
			    (ipif->ipif_lcl_addr == INADDR_ANY))
				continue;
			mutex_exit(&uill->ill_lock);
			ill_refrele(uill);
			return (B_TRUE);
		}
		mutex_exit(&uill->ill_lock);
		ill_refrele(uill);
	}
	return (B_FALSE);
}
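
/*
 * ipif_usesrc_avail() supports the usesrc facility (ill_usesrc_ifindex
 * is set when an interface is configured to borrow source addresses
 * from another, e.g. via ifconfig's usesrc keyword): a transmit-only
 * interface nominates another ill to lend it source addresses, and
 * this predicate reports whether that lender currently has any address
 * fit for use in the given zone.
 */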

/*
 * Determine the best source address given a destination address and an ill.
 * Prefers non-deprecated over deprecated but will return a deprecated
 * address if there is no other choice. If there is a usable source address
 * on the interface pointed to by ill_usesrc_ifindex then that is given
 * first preference.
 *
 * Returns NULL only when the ill has no valid source address at all.
 */
ipif_t *
ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid)
{
	ipif_t *ipif;
	ipif_t *ipif_dep = NULL;	/* Fallback to deprecated */
	ipif_t *ipif_arr[MAX_IPIF_SELECT_SOURCE];
	int index = 0;
	boolean_t wrapped = B_FALSE;
	boolean_t same_subnet_only = B_FALSE;
	boolean_t ipif_same_found, ipif_other_found;
	boolean_t specific_found;
	ill_t	*till, *usill = NULL;
	tsol_tpc_t *src_rhtp, *dst_rhtp;
	ip_stack_t	*ipst = ill->ill_ipst;

	if (ill->ill_usesrc_ifindex != 0) {
		usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex,
		    B_FALSE, NULL, NULL, NULL, NULL, ipst);
		if (usill != NULL)
			ill = usill;	/* Select source from usesrc ILL */
		else
			return (NULL);
	}

	/*
	 * If we're dealing with an unlabeled destination on a labeled system,
	 * make sure that we ignore source addresses that are incompatible with
	 * the destination's default label. That destination's default label
	 * must dominate the minimum label on the source address.
	 */
	dst_rhtp = NULL;
	if (is_system_labeled()) {
		dst_rhtp = find_tpc(&dst, IPV4_VERSION, B_FALSE);
		if (dst_rhtp == NULL)
			return (NULL);
		if (dst_rhtp->tpc_tp.host_type != UNLABELED) {
			TPC_RELE(dst_rhtp);
			dst_rhtp = NULL;
		}
	}

	/*
	 * Hold the ill_g_lock as reader. This makes sure that no ipif/ill
	 * can be deleted. But an ipif/ill can get CONDEMNED any time.
	 * After selecting the right ipif, under ill_lock make sure ipif is
	 * not condemned, and increment refcnt. If ipif is CONDEMNED,
	 * we retry. Inside the loop we still need to check for CONDEMNED,
	 * but not under a lock.
	 */
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);

retry:
	till = ill;
	ipif_arr[0] = NULL;

	if (till->ill_group != NULL)
		till = till->ill_group->illgrp_ill;

	/*
	 * Choose one good source address from each ill across the group.
	 * If possible choose a source address in the same subnet as
	 * the destination address.
	 *
	 * We don't check for PHYI_FAILED or PHYI_INACTIVE or PHYI_OFFLINE.
	 * This is okay because of the following.
	 *
	 *    If PHYI_FAILED is set and we still have non-deprecated
	 *    addresses, it means the addresses have not yet been
	 *    failed over to a different interface. We potentially
	 *    select them to create IRE_CACHES, which will be later
	 *    flushed when the addresses move over.
	 *
	 *    If PHYI_INACTIVE is set and we still have non-deprecated
	 *    addresses, it means either the user has configured them
	 *    or PHYI_INACTIVE has not been cleared after the addresses
	 *    have been moved over. For the former, in.mpathd does a failover
	 *    when the interface becomes INACTIVE and hence we should
	 *    not find them. Once INACTIVE is set, we don't allow them
	 *    to create logical interfaces anymore. For the latter, a
	 *    flush will happen when INACTIVE is cleared which will
	 *    flush the IRE_CACHES.
	 *
	 *    If PHYI_OFFLINE is set, all the addresses will be failed
	 *    over soon. We potentially select them to create IRE_CACHEs,
	 *    which will be later flushed when the addresses move over.
	 *
	 * NOTE : As ipif_select_source is called to borrow a source address
	 * for an ipif that is part of a group, source address selection
	 * will be re-done whenever the group changes, i.e. either an
	 * insertion into or a deletion from the group.
	 *
	 * Fill ipif_arr[] with source addresses, using these rules:
	 *
	 *    1. At most one source address from a given ill ends up
	 *       in ipif_arr[] -- that is, at most one of the ipif's
	 *       associated with a given ill ends up in ipif_arr[].
	 *
	 *    2. If there is at least one non-deprecated ipif in the
	 *       IPMP group with a source address on the same subnet as
	 *       our destination, then fill ipif_arr[] only with
	 *       source addresses on the same subnet as our destination.
	 *       Note that because of (1), only the first
	 *       non-deprecated ipif found with a source address
	 *       matching the destination ends up in ipif_arr[].
	 *
	 *    3. Otherwise, fill ipif_arr[] with non-deprecated source
	 *       addresses not in the same subnet as our destination.
	 *       Again, because of (1), only the first off-subnet source
	 *       address will be chosen.
	 *
	 *    4. If there are no non-deprecated ipifs, then just use
	 *       the source address associated with the last deprecated
	 *       one we find that happens to be on the same subnet,
	 *       otherwise the first one not in the same subnet.
	 */
	specific_found = B_FALSE;
	for (; till != NULL; till = till->ill_group_next) {
		ipif_same_found = B_FALSE;
		ipif_other_found = B_FALSE;
		for (ipif = till->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if (!IPIF_CAN_LOOKUP(ipif))
				continue;
			/* Always skip NOLOCAL and ANYCAST interfaces */
			if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
				continue;
			if (!(ipif->ipif_flags & IPIF_UP) ||
			    !ipif->ipif_addr_ready)
				continue;
			if (ipif->ipif_zoneid != zoneid &&
			    ipif->ipif_zoneid != ALL_ZONES)
				continue;
			/*
			 * Interfaces with 0.0.0.0 address are allowed to be
			 * UP, but are not valid as source addresses.
			 */
			if (ipif->ipif_lcl_addr == INADDR_ANY)
				continue;

			/*
			 * Check compatibility of local address for
			 * destination's default label if we're on a labeled
			 * system.  Incompatible addresses can't be used at
			 * all.
			 */
			if (dst_rhtp != NULL) {
				boolean_t incompat;

				src_rhtp = find_tpc(&ipif->ipif_lcl_addr,
				    IPV4_VERSION, B_FALSE);
				if (src_rhtp == NULL)
					continue;
				incompat =
				    src_rhtp->tpc_tp.host_type != SUN_CIPSO ||
				    src_rhtp->tpc_tp.tp_doi !=
				    dst_rhtp->tpc_tp.tp_doi ||
				    (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label,
				    &src_rhtp->tpc_tp.tp_sl_range_cipso) &&
				    !blinlset(&dst_rhtp->tpc_tp.tp_def_label,
				    src_rhtp->tpc_tp.tp_sl_set_cipso));
				TPC_RELE(src_rhtp);
				if (incompat)
					continue;
			}

			/*
			 * We prefer not to use all-zones addresses, if we
			 * can avoid it, as they pose problems with unlabeled
			 * destinations.
			 */
			if (ipif->ipif_zoneid != ALL_ZONES) {
				if (!specific_found &&
				    (!same_subnet_only ||
				    (ipif->ipif_net_mask & dst) ==
				    ipif->ipif_subnet)) {
					index = 0;
					specific_found = B_TRUE;
					ipif_other_found = B_FALSE;
				}
			} else {
				if (specific_found)
					continue;
			}
			if (ipif->ipif_flags & IPIF_DEPRECATED) {
				if (ipif_dep == NULL ||
				    (ipif->ipif_net_mask & dst) ==
				    ipif->ipif_subnet)
					ipif_dep = ipif;
				continue;
			}
			if ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet) {
				/* found a source address in the same subnet */
				if (!same_subnet_only) {
					same_subnet_only = B_TRUE;
					index = 0;
				}
				ipif_same_found = B_TRUE;
			} else {
				if (same_subnet_only || ipif_other_found)
					continue;
				ipif_other_found = B_TRUE;
			}
			ipif_arr[index++] = ipif;
			if (index == MAX_IPIF_SELECT_SOURCE) {
				wrapped = B_TRUE;
				index = 0;
			}
			if (ipif_same_found)
				break;
		}
	}

	if (ipif_arr[0] == NULL) {
		ipif = ipif_dep;
	} else {
		if (wrapped)
			index = MAX_IPIF_SELECT_SOURCE;
		ipif = ipif_arr[ipif_rand(ipst) % index];
		ASSERT(ipif != NULL);
	}

	if (ipif != NULL) {
		mutex_enter(&ipif->ipif_ill->ill_lock);
		if (!IPIF_CAN_LOOKUP(ipif)) {
			mutex_exit(&ipif->ipif_ill->ill_lock);
			goto retry;
		}
		ipif_refhold_locked(ipif);
		mutex_exit(&ipif->ipif_ill->ill_lock);
	}

	rw_exit(&ipst->ips_ill_g_lock);
	if (usill != NULL)
		ill_refrele(usill);
	if (dst_rhtp != NULL)
		TPC_RELE(dst_rhtp);

#ifdef DEBUG
	if (ipif == NULL) {
		char buf1[INET6_ADDRSTRLEN];

		ip1dbg(("ipif_select_source(%s, %s) -> NULL\n",
		    ill->ill_name,
		    inet_ntop(AF_INET, &dst, buf1, sizeof (buf1))));
	} else {
		char buf1[INET6_ADDRSTRLEN];
		char buf2[INET6_ADDRSTRLEN];

		ip1dbg(("ipif_select_source(%s, %s) -> %s\n",
		    ipif->ipif_ill->ill_name,
		    inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)),
		    inet_ntop(AF_INET, &ipif->ipif_lcl_addr,
		    buf2, sizeof (buf2))));
	}
#endif /* DEBUG */
	return (ipif);
}
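
/*
 * A worked example of the selection rules above (addresses are
 * illustrative only): suppose the group spans ill A with 10.1.1.5/24
 * (up, not deprecated) and ill B with 10.2.2.5/24 (up, not
 * deprecated), and the destination is 10.1.1.200.  A's address is on
 * the destination's subnet, so rule 2 applies: same_subnet_only is
 * set, ipif_arr[] is reset to hold only 10.1.1.5, and the random pick
 * at the end can only return it.  If A's address were DEPRECATED,
 * rule 3 would instead fill ipif_arr[] with 10.2.2.5, while 10.1.1.5
 * would be remembered in ipif_dep as the rule-4 fallback.
 */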

/*
 * If old_ipif is not NULL, see if ipif was derived from old
 * ipif and if so, recreate the interface route by re-doing
 * source address selection. This happens when ipif_down ->
 * ipif_update_other_ipifs calls us.
 *
 * If old_ipif is NULL, just redo the source address selection
 * if needed. This happens when illgrp_insert or ipif_up_done
 * calls us.
 */
static void
ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif)
{
	ire_t *ire;
	ire_t *ipif_ire;
	queue_t *stq;
	ipif_t *nipif;
	ill_t *ill;
	boolean_t need_rele = B_FALSE;
	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;

	ASSERT(old_ipif == NULL || IAM_WRITER_IPIF(old_ipif));
	ASSERT(IAM_WRITER_IPIF(ipif));

	ill = ipif->ipif_ill;
	if (!(ipif->ipif_flags &
	    (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) {
		/*
		 * Can't possibly have borrowed the source
		 * from old_ipif.
		 */
		return;
	}

	/*
	 * Is there any work to be done? No work if the address
	 * is INADDR_ANY, loopback, NOLOCAL or ANYCAST
	 * (ipif_select_source() does not borrow addresses from
	 * NOLOCAL and ANYCAST interfaces).
	 */
	if ((old_ipif != NULL) &&
	    ((old_ipif->ipif_lcl_addr == INADDR_ANY) ||
	    (old_ipif->ipif_ill->ill_wq == NULL) ||
	    (old_ipif->ipif_flags &
	    (IPIF_NOLOCAL|IPIF_ANYCAST)))) {
		return;
	}

	/*
	 * Perform the same checks as when creating the
	 * IRE_INTERFACE in ipif_up_done.
	 */
	if (!(ipif->ipif_flags & IPIF_UP))
		return;

	if ((ipif->ipif_flags & IPIF_NOXMIT) ||
	    (ipif->ipif_subnet == INADDR_ANY))
		return;

	ipif_ire = ipif_to_ire(ipif);
	if (ipif_ire == NULL)
		return;

	/*
	 * We know that ipif uses some other source for its
	 * IRE_INTERFACE. Is it using the source of this
	 * old_ipif?
	 */
	if (old_ipif != NULL &&
	    old_ipif->ipif_lcl_addr != ipif_ire->ire_src_addr) {
		ire_refrele(ipif_ire);
		return;
	}
	if (ip_debug > 2) {
		/* ip1dbg */
		pr_addr_dbg("ipif_recreate_interface_routes: deleting IRE for"
		    " src %s\n", AF_INET, &ipif_ire->ire_src_addr);
	}

	stq = ipif_ire->ire_stq;

	/*
	 * Can't use our source address. Select a different
	 * source address for the IRE_INTERFACE.
	 */
	nipif = ipif_select_source(ill, ipif->ipif_subnet, ipif->ipif_zoneid);
	if (nipif == NULL) {
		/* Last resort - all ipif's have IPIF_NOLOCAL */
		nipif = ipif;
	} else {
		need_rele = B_TRUE;
	}

	ire = ire_create(
	    (uchar_t *)&ipif->ipif_subnet,	/* dest pref */
	    (uchar_t *)&ipif->ipif_net_mask,	/* mask */
	    (uchar_t *)&nipif->ipif_src_addr,	/* src addr */
	    NULL,				/* no gateway */
	    NULL,
	    &ipif->ipif_mtu,			/* max frag */
	    NULL,				/* fast path header */
	    NULL,				/* no recv from queue */
	    stq,				/* send-to queue */
	    ill->ill_net_type,			/* IF_[NO]RESOLVER */
	    ill->ill_resolver_mp,		/* xmit header */
	    ipif,
	    NULL,
	    0,
	    0,
	    0,
	    0,
	    &ire_uinfo_null,
	    NULL,
	    NULL,
	    ipst);

	if (ire != NULL) {
		ire_t *ret_ire;
		int error;

		/*
		 * We don't need ipif_ire anymore. We need to delete
		 * before we add so that ire_add does not detect
		 * duplicates.
		 */
		ire_delete(ipif_ire);
		ret_ire = ire;
		error = ire_add(&ret_ire, NULL, NULL, NULL, B_FALSE);
		ASSERT(error == 0);
		ASSERT(ire == ret_ire);
		/* Held in ire_add */
		ire_refrele(ret_ire);
	}
	/*
	 * Either we are falling through from above or could not
	 * allocate a replacement.
	 */
	ire_refrele(ipif_ire);
	if (need_rele)
		ipif_refrele(nipif);
}
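
/*
 * Design note on ipif_recreate_interface_routes(): the stale interface
 * route is ire_delete()d before the replacement is ire_add()ed because,
 * as the comment above says, ire_add() would otherwise detect the new
 * entry as a duplicate of the old one.  The brief window with no
 * interface route is safe here because the caller runs exclusively on
 * the ipsq, so no other writer can race the swap.
 */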

/*
 * This old_ipif is going away.
 *
 * Determine if any other ipifs are using our address as
 * ipif_lcl_addr (due to those being IPIF_NOLOCAL, IPIF_ANYCAST, or
 * IPIF_DEPRECATED).
 * Find the IRE_INTERFACE for such ipifs and recreate them
 * to use a different source address following the rules in
 * ipif_up_done.
 *
 * This function takes an illgrp as an argument so that illgrp_delete
 * can call this to update source addresses even after deleting the
 * old_ipif->ipif_ill from the ill group.
 */
static void
ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp)
{
	ipif_t *ipif;
	ill_t *ill;
	char	buf[INET6_ADDRSTRLEN];

	ASSERT(IAM_WRITER_IPIF(old_ipif));
	ASSERT(illgrp == NULL || IAM_WRITER_IPIF(old_ipif));

	ill = old_ipif->ipif_ill;

	ip1dbg(("ipif_update_other_ipifs(%s, %s)\n",
	    ill->ill_name,
	    inet_ntop(AF_INET, &old_ipif->ipif_lcl_addr,
	    buf, sizeof (buf))));
	/*
	 * If this is part of a group, look at all ills as
	 * ipif_select_source borrows source addresses across
	 * all the ills in the group.
	 */
	if (illgrp != NULL)
		ill = illgrp->illgrp_ill;

	for (; ill != NULL; ill = ill->ill_group_next) {
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {

			if (ipif == old_ipif)
				continue;

			ipif_recreate_interface_routes(old_ipif, ipif);
		}
	}
}
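
/*
 * Note that ipif_update_other_ipifs() deliberately walks every ill in
 * the group (when illgrp is non-NULL) rather than just old_ipif's own
 * ill: ipif_select_source() borrows addresses group-wide, so any ill
 * in the group may hold an interface route whose source was taken
 * from the departing old_ipif.
 */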

/* ARGSUSED */
int
if_unitsel_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
{
	/*
	 * ill_phyint_reinit merged the v4 and v6 into a single
	 * ipsq. Could also have become part of an ipmp group in the
	 * process, and we might not have been able to complete the
	 * operation in ipif_set_values, if we could not become
	 * exclusive. If so restart it here.
	 */
	return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q));
}

/*
 * Can operate on either a module or a driver queue.
 * Returns an error if not a module queue.
 */
/* ARGSUSED */
int
if_unitsel(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
{
	queue_t	*q1 = q;
	char	*cp;
	char	interf_name[LIFNAMSIZ];
	uint_t	ppa = *(uint_t *)mp->b_cont->b_cont->b_rptr;

	if (q->q_next == NULL) {
		ip1dbg((
		    "if_unitsel: IF_UNITSEL: no q_next\n"));
		return (EINVAL);
	}

	if (((ill_t *)(q->q_ptr))->ill_name[0] != '\0')
		return (EALREADY);

	do {
		q1 = q1->q_next;
	} while (q1->q_next);
	cp = q1->q_qinfo->qi_minfo->mi_idname;
	(void) sprintf(interf_name, "%s%d", cp, ppa);

	/*
	 * Here we are not going to delay the ioack until after
	 * ACKs from DL_ATTACH_REQ/DL_BIND_REQ. So no need to save the
	 * original ioctl message before sending the requests.
	 */
	return (ipif_set_values(q, mp, interf_name, &ppa));
}

/* ARGSUSED */
int
ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
{
	return (ENXIO);
}
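
/*
 * Example of the name construction in if_unitsel() (illustrative
 * values only): for an IF_UNITSEL ioctl issued on a stream whose
 * bottom driver advertises mi_idname "le" and with a ppa of 3,
 * interf_name becomes "le3", which ipif_set_values() then uses as the
 * interface name when creating the ill.
 */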

/*
 * Net and subnet broadcast ire's are now specific to the particular
 * physical interface (ill) and not to any one logical interface (ipif).
 * However, if a particular logical interface is being taken down, its
 * associated ire's will be taken down as well.  Hence, when we go to
 * take down or change the local address, broadcast address or netmask
 * of a specific logical interface, we must check to make sure that we
 * have valid net and subnet broadcast ire's for the other logical
 * interfaces which may have been shared with the logical interface
 * being brought down or changed.
 *
 * There is one set of 0.0.0.0 and 255.255.255.255 per ill. Usually it
 * is tied to the first interface coming UP. If that ipif is going down,
 * we need to recreate them on the next valid ipif.
 *
 * Note: assume that the ipif passed in is still up so that its IRE
 * entries are still valid.
 */
static void
ipif_check_bcast_ires(ipif_t *test_ipif)
{
	ipif_t	*ipif;
	ire_t	*test_subnet_ire, *test_net_ire;
	ire_t	*test_allzero_ire, *test_allone_ire;
	ire_t	*ire_array[12];
	ire_t	**irep = &ire_array[0];
	ire_t	**irep1;
	ipaddr_t net_addr, subnet_addr, net_mask, subnet_mask;
	ipaddr_t test_net_addr, test_subnet_addr;
	ipaddr_t test_net_mask, test_subnet_mask;
	boolean_t	need_net_bcast_ire = B_FALSE;
	boolean_t	need_subnet_bcast_ire = B_FALSE;
	boolean_t	allzero_bcast_ire_created = B_FALSE;
	boolean_t	allone_bcast_ire_created = B_FALSE;
	boolean_t	net_bcast_ire_created = B_FALSE;
	boolean_t	subnet_bcast_ire_created = B_FALSE;

	ipif_t	*backup_ipif_net = (ipif_t *)NULL;
	ipif_t	*backup_ipif_subnet = (ipif_t *)NULL;
	ipif_t	*backup_ipif_allzeros = (ipif_t *)NULL;
	ipif_t	*backup_ipif_allones = (ipif_t *)NULL;
	uint64_t check_flags = IPIF_DEPRECATED | IPIF_NOLOCAL | IPIF_ANYCAST;
	ip_stack_t	*ipst = test_ipif->ipif_ill->ill_ipst;

	ASSERT(!test_ipif->ipif_isv6);
	ASSERT(IAM_WRITER_IPIF(test_ipif));

	/*
	 * No broadcast IREs for the LOOPBACK interface
	 * or others such as point to point and IPIF_NOXMIT.
	 */
	if (!(test_ipif->ipif_flags & IPIF_BROADCAST) ||
	    (test_ipif->ipif_flags & IPIF_NOXMIT))
		return;

	test_allzero_ire = ire_ctable_lookup(0, 0, IRE_BROADCAST,
	    test_ipif, ALL_ZONES, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF),
	    ipst);

	test_allone_ire = ire_ctable_lookup(INADDR_BROADCAST, 0, IRE_BROADCAST,
	    test_ipif, ALL_ZONES, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF),
	    ipst);

	test_net_mask = ip_net_mask(test_ipif->ipif_subnet);
	test_subnet_mask = test_ipif->ipif_net_mask;

	/*
	 * If no net mask set, assume the default based on net class.
	 */
	if (test_subnet_mask == 0)
		test_subnet_mask = test_net_mask;

	/*
	 * Check if there is a network broadcast ire associated with this ipif.
	 */
	test_net_addr = test_net_mask & test_ipif->ipif_subnet;
	test_net_ire = ire_ctable_lookup(test_net_addr, 0, IRE_BROADCAST,
	    test_ipif, ALL_ZONES, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF),
	    ipst);

	/*
	 * Check if there is a subnet broadcast IRE associated with this ipif.
	 */
	test_subnet_addr = test_subnet_mask & test_ipif->ipif_subnet;
	test_subnet_ire = ire_ctable_lookup(test_subnet_addr, 0, IRE_BROADCAST,
	    test_ipif, ALL_ZONES, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF),
	    ipst);

	/*
	 * No broadcast ire's associated with this ipif.
	 */
	if ((test_subnet_ire == NULL) && (test_net_ire == NULL) &&
	    (test_allzero_ire == NULL) && (test_allone_ire == NULL)) {
		return;
	}

	/*
	 * We have established which bcast ires have to be replaced.
	 * Next we try to locate ipifs that match their ires.
	 * The rules are simple: If we find an ipif that matches on the subnet
	 * address it will also match on the net address, the allzeros and
	 * allones address. Any ipif that matches only on the net address will
	 * also match the allzeros and allones addresses.
	 * The other criterion is the ipif_flags. We look for non-deprecated
	 * (and non-anycast and non-nolocal) ipifs as the best choice.
	 * ipifs with check_flags matching (deprecated, etc) are used only
	 * if good ipifs are not available. While looping, we save existing
	 * deprecated ipifs as backup_ipif.
	 * We loop through all the ipifs for this ill looking for ipifs
	 * whose broadcast addr match the ipif passed in, but do not have
	 * their own broadcast ires. For creating 0.0.0.0 and
	 * 255.255.255.255 we just need any ipif on this ill.
	 */
	for (ipif = test_ipif->ipif_ill->ill_ipif; ipif != NULL;
	    ipif = ipif->ipif_next) {

		ASSERT(!ipif->ipif_isv6);
		/*
		 * Already checked the ipif passed in.
		 */
		if (ipif == test_ipif) {
			continue;
		}

		/*
		 * We only need to recreate broadcast ires if another ipif in
		 * the same zone uses them. The new ires must be created in the
		 * same zone.
		 */
		if (ipif->ipif_zoneid != test_ipif->ipif_zoneid) {
			continue;
		}

		/*
		 * Only interested in logical interfaces with valid local
		 * addresses or with the ability to broadcast.
		 */
		if ((ipif->ipif_subnet == 0) ||
		    !(ipif->ipif_flags & IPIF_BROADCAST) ||
		    (ipif->ipif_flags & IPIF_NOXMIT) ||
		    !(ipif->ipif_flags & IPIF_UP)) {
			continue;
		}
		/*
		 * Check if there is a net broadcast ire for this
		 * net address.  If it turns out that the ipif we are
		 * about to take down owns this ire, we must make a
		 * new one because it is potentially going away.
		 */
		if (test_net_ire && (!net_bcast_ire_created)) {
			net_mask = ip_net_mask(ipif->ipif_subnet);
			net_addr = net_mask & ipif->ipif_subnet;
			if (net_addr == test_net_addr) {
				need_net_bcast_ire = B_TRUE;
				/*
				 * Use DEPRECATED ipif only if no good
				 * ires are available. subnet_addr is
				 * a better match than net_addr.
				 */
				if ((ipif->ipif_flags & check_flags) &&
				    (backup_ipif_net == NULL)) {
					backup_ipif_net = ipif;
				}
			}
		}
		/*
		 * Check if there is a subnet broadcast ire for this
		 * net address.  If it turns out that the ipif we are
		 * about to take down owns this ire, we must make a
		 * new one because it is potentially going away.
		 */
		if (test_subnet_ire && (!subnet_bcast_ire_created)) {
			subnet_mask = ipif->ipif_net_mask;
			subnet_addr = ipif->ipif_subnet;
			if (subnet_addr == test_subnet_addr) {
				need_subnet_bcast_ire = B_TRUE;
				if ((ipif->ipif_flags & check_flags) &&
				    (backup_ipif_subnet == NULL)) {
					backup_ipif_subnet = ipif;
				}
			}
		}


		/* Short circuit here if this ipif is deprecated */
		if (ipif->ipif_flags & check_flags) {
			if ((test_allzero_ire != NULL) &&
			    (!allzero_bcast_ire_created) &&
			    (backup_ipif_allzeros == NULL)) {
				backup_ipif_allzeros = ipif;
			}
			if ((test_allone_ire != NULL) &&
			    (!allone_bcast_ire_created) &&
			    (backup_ipif_allones == NULL)) {
				backup_ipif_allones = ipif;
			}
			continue;
		}

		/*
		 * Found an ipif which has the same broadcast ire as the
		 * ipif passed in and the ipif passed in "owns" the ire.
		 * Create new broadcast ire's for this broadcast addr.
		 */
		if (need_net_bcast_ire && !net_bcast_ire_created) {
			irep = ire_create_bcast(ipif, net_addr, irep);
			irep = ire_create_bcast(ipif,
			    ~net_mask | net_addr, irep);
			net_bcast_ire_created = B_TRUE;
		}
		if (need_subnet_bcast_ire && !subnet_bcast_ire_created) {
			irep = ire_create_bcast(ipif, subnet_addr, irep);
			irep = ire_create_bcast(ipif,
			    ~subnet_mask | subnet_addr, irep);
			subnet_bcast_ire_created = B_TRUE;
		}
		if (test_allzero_ire != NULL && !allzero_bcast_ire_created) {
			irep = ire_create_bcast(ipif, 0, irep);
			allzero_bcast_ire_created = B_TRUE;
		}
		if (test_allone_ire != NULL && !allone_bcast_ire_created) {
			irep = ire_create_bcast(ipif, INADDR_BROADCAST, irep);
			allone_bcast_ire_created = B_TRUE;
		}
		/*
		 * Once we have created all the appropriate ires, we
		 * just break out of this loop to add what we have created.
		 * This has been indented similar to ire_match_args for
		 * readability.
		 */
		if (((test_net_ire == NULL) ||
		    (net_bcast_ire_created)) &&
		    ((test_subnet_ire == NULL) ||
		    (subnet_bcast_ire_created)) &&
		    ((test_allzero_ire == NULL) ||
		    (allzero_bcast_ire_created)) &&
		    ((test_allone_ire == NULL) ||
		    (allone_bcast_ire_created))) {
			break;
		}
	}

	/*
	 * Create bcast ires on deprecated ipifs if no non-deprecated ipifs
	 * exist. 6 pairs of bcast ires are needed.
	 * Note - the old ires are deleted in ipif_down.
	 */
	if (need_net_bcast_ire && !net_bcast_ire_created && backup_ipif_net) {
		ipif = backup_ipif_net;
		irep = ire_create_bcast(ipif, net_addr, irep);
		irep = ire_create_bcast(ipif, ~net_mask | net_addr, irep);
		net_bcast_ire_created = B_TRUE;
	}
	if (need_subnet_bcast_ire && !subnet_bcast_ire_created &&
	    backup_ipif_subnet) {
		ipif = backup_ipif_subnet;
		irep = ire_create_bcast(ipif, subnet_addr, irep);
		irep = ire_create_bcast(ipif,
		    ~subnet_mask | subnet_addr, irep);
		subnet_bcast_ire_created = B_TRUE;
	}
	if (test_allzero_ire != NULL && !allzero_bcast_ire_created &&
	    backup_ipif_allzeros) {
		irep = ire_create_bcast(backup_ipif_allzeros, 0, irep);
		allzero_bcast_ire_created = B_TRUE;
	}
	if (test_allone_ire != NULL && !allone_bcast_ire_created &&
	    backup_ipif_allones) {
		irep = ire_create_bcast(backup_ipif_allones,
		    INADDR_BROADCAST, irep);
		allone_bcast_ire_created = B_TRUE;
	}

	/*
	 * If we can't create all of them, don't add any of them.
	 * Code in ip_wput_ire and ire_to_ill assumes that we
	 * always have a non-loopback copy and loopback copy
	 * for a given address.
	 */
	for (irep1 = irep; irep1 > ire_array; ) {
		irep1--;
		if (*irep1 == NULL) {
			ip0dbg(("ipif_check_bcast_ires: can't create "
			    "IRE_BROADCAST, memory allocation failure\n"));
			while (irep > ire_array) {
				irep--;
				if (*irep != NULL)
					ire_delete(*irep);
			}
			goto bad;
		}
	}
	for (irep1 = irep; irep1 > ire_array; ) {
		int error;

		irep1--;
		error = ire_add(irep1, NULL, NULL, NULL, B_FALSE);
		if (error == 0) {
			ire_refrele(*irep1);	/* Held in ire_add */
		}
	}
bad:
	if (test_allzero_ire != NULL)
		ire_refrele(test_allzero_ire);
	if (test_allone_ire != NULL)
		ire_refrele(test_allone_ire);
	if (test_net_ire != NULL)
		ire_refrele(test_net_ire);
	if (test_subnet_ire != NULL)
		ire_refrele(test_subnet_ire);
}
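
/*
 * An illustrative tally of the "6 pairs" mentioned above (example
 * addresses only): for a 10.1.2.3 ipif with netmask 255.255.255.0, the
 * candidate broadcast addresses are 0.0.0.0, 255.255.255.255, the
 * classful net pair 10.0.0.0 and 10.255.255.255, and the subnet pair
 * 10.1.2.0 and 10.1.2.255.  Each of the six may need both a loopback
 * and a non-loopback IRE_BROADCAST copy, which is why ire_array[]
 * above holds 12 slots.
 */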
no broadcast */ 22062 ip1dbg(("ip_setname: EINVAL 3\n")); 22063 return (EINVAL); 22064 } 22065 if (lifr->lifr_flags & IFF_UP) { 22066 /* Can only be set with SIOCSLIFFLAGS */ 22067 ip1dbg(("ip_setname: EINVAL 4\n")); 22068 return (EINVAL); 22069 } 22070 if ((lifr->lifr_flags & (IFF_IPV6|IFF_IPV4)) != IFF_IPV6 && 22071 (lifr->lifr_flags & (IFF_IPV6|IFF_IPV4)) != IFF_IPV4) { 22072 ip1dbg(("ip_setname: EINVAL 5\n")); 22073 return (EINVAL); 22074 } 22075 /* 22076 * Only allow the IFF_XRESOLV flag to be set on IPv6 interfaces. 22077 */ 22078 if ((lifr->lifr_flags & IFF_XRESOLV) && 22079 !(lifr->lifr_flags & IFF_IPV6) && 22080 !(ipif->ipif_isv6)) { 22081 ip1dbg(("ip_setname: EINVAL 6\n")); 22082 return (EINVAL); 22083 } 22084 22085 /* 22086 * The user has done SIOCGLIFFLAGS prior to this ioctl and hence 22087 * we have all the flags here. So, we assign rather than we OR. 22088 * We can't OR the flags here because we don't want to set 22089 * both IFF_IPV4 and IFF_IPV6. We start off as IFF_IPV4 in 22090 * ipif_allocate and become IFF_IPV4 or IFF_IPV6 here depending 22091 * on lifr_flags value here. 22092 */ 22093 /* 22094 * This ill has not been inserted into the global list. 22095 * So we are still single threaded and don't need any lock 22096 */ 22097 ipif->ipif_flags = lifr->lifr_flags & IFF_LOGINT_FLAGS & 22098 ~IFF_DUPLICATE; 22099 ill->ill_flags = lifr->lifr_flags & IFF_PHYINTINST_FLAGS; 22100 ill->ill_phyint->phyint_flags = lifr->lifr_flags & IFF_PHYINT_FLAGS; 22101 22102 /* We started off as V4. */ 22103 if (ill->ill_flags & ILLF_IPV6) { 22104 ill->ill_phyint->phyint_illv6 = ill; 22105 ill->ill_phyint->phyint_illv4 = NULL; 22106 } 22107 err = ipif_set_values(q, mp, lifr->lifr_name, &lifr->lifr_ppa); 22108 return (err); 22109 } 22110 22111 /* ARGSUSED */ 22112 int 22113 ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 22114 ip_ioctl_cmd_t *ipip, void *if_req) 22115 { 22116 /* 22117 * ill_phyint_reinit merged the v4 and v6 into a single 22118 * ipsq. Could also have become part of a ipmp group in the 22119 * process, and we might not have been able to complete the 22120 * slifname in ipif_set_values, if we could not become 22121 * exclusive. If so restart it here 22122 */ 22123 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q)); 22124 } 22125 22126 /* 22127 * Return a pointer to the ipif which matches the index, IP version type and 22128 * zoneid. 22129 */ 22130 ipif_t * 22131 ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid, 22132 queue_t *q, mblk_t *mp, ipsq_func_t func, int *err, ip_stack_t *ipst) 22133 { 22134 ill_t *ill; 22135 ipsq_t *ipsq; 22136 phyint_t *phyi; 22137 ipif_t *ipif; 22138 22139 ASSERT((q == NULL && mp == NULL && func == NULL && err == NULL) || 22140 (q != NULL && mp != NULL && func != NULL && err != NULL)); 22141 22142 if (err != NULL) 22143 *err = 0; 22144 22145 /* 22146 * Indexes are stored in the phyint - a common structure 22147 * to both IPv4 and IPv6. 22148 */ 22149 22150 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 22151 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 22152 (void *) &index, NULL); 22153 if (phyi != NULL) { 22154 ill = isv6 ? 
phyi->phyint_illv6 : phyi->phyint_illv4; 22155 if (ill == NULL) { 22156 rw_exit(&ipst->ips_ill_g_lock); 22157 if (err != NULL) 22158 *err = ENXIO; 22159 return (NULL); 22160 } 22161 GRAB_CONN_LOCK(q); 22162 mutex_enter(&ill->ill_lock); 22163 if (ILL_CAN_LOOKUP(ill)) { 22164 for (ipif = ill->ill_ipif; ipif != NULL; 22165 ipif = ipif->ipif_next) { 22166 if (IPIF_CAN_LOOKUP(ipif) && 22167 (zoneid == ALL_ZONES || 22168 zoneid == ipif->ipif_zoneid || 22169 ipif->ipif_zoneid == ALL_ZONES)) { 22170 ipif_refhold_locked(ipif); 22171 mutex_exit(&ill->ill_lock); 22172 RELEASE_CONN_LOCK(q); 22173 rw_exit(&ipst->ips_ill_g_lock); 22174 return (ipif); 22175 } 22176 } 22177 } else if (ILL_CAN_WAIT(ill, q)) { 22178 ipsq = ill->ill_phyint->phyint_ipsq; 22179 mutex_enter(&ipsq->ipsq_lock); 22180 rw_exit(&ipst->ips_ill_g_lock); 22181 mutex_exit(&ill->ill_lock); 22182 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 22183 mutex_exit(&ipsq->ipsq_lock); 22184 RELEASE_CONN_LOCK(q); 22185 *err = EINPROGRESS; 22186 return (NULL); 22187 } 22188 mutex_exit(&ill->ill_lock); 22189 RELEASE_CONN_LOCK(q); 22190 } 22191 rw_exit(&ipst->ips_ill_g_lock); 22192 if (err != NULL) 22193 *err = ENXIO; 22194 return (NULL); 22195 } 22196 22197 typedef struct conn_change_s { 22198 uint_t cc_old_ifindex; 22199 uint_t cc_new_ifindex; 22200 } conn_change_t; 22201 22202 /* 22203 * ipcl_walk function for changing interface index. 22204 */ 22205 static void 22206 conn_change_ifindex(conn_t *connp, caddr_t arg) 22207 { 22208 conn_change_t *connc; 22209 uint_t old_ifindex; 22210 uint_t new_ifindex; 22211 int i; 22212 ilg_t *ilg; 22213 22214 connc = (conn_change_t *)arg; 22215 old_ifindex = connc->cc_old_ifindex; 22216 new_ifindex = connc->cc_new_ifindex; 22217 22218 if (connp->conn_orig_bound_ifindex == old_ifindex) 22219 connp->conn_orig_bound_ifindex = new_ifindex; 22220 22221 if (connp->conn_orig_multicast_ifindex == old_ifindex) 22222 connp->conn_orig_multicast_ifindex = new_ifindex; 22223 22224 if (connp->conn_orig_xmit_ifindex == old_ifindex) 22225 connp->conn_orig_xmit_ifindex = new_ifindex; 22226 22227 for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) { 22228 ilg = &connp->conn_ilg[i]; 22229 if (ilg->ilg_orig_ifindex == old_ifindex) 22230 ilg->ilg_orig_ifindex = new_ifindex; 22231 } 22232 } 22233 22234 /* 22235 * Walk all the ipifs and ilms on this ill and change the orig_ifindex 22236 * to new_index if it matches the old_index. 22237 * 22238 * Failovers typically happen within a group of ills. But somebody 22239 * can remove an ill from the group after a failover happened. If 22240 * we are setting the ifindex after this, we potentially need to 22241 * look at all the ills rather than just the ones in the group. 22242 * We cut down the work by looking at matching ill_net_types 22243 * and ill_types as we could not possibly grouped them together. 
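 *
 * For example (hypothetical interface names and indexes): if hme0
 * (ifindex 2) has failed over to hme1, and hme0's index is then
 * changed to 5 via SIOCSLIFINDEX, every ipif, ilm and conn still
 * recording orig_ifindex == 2 must be updated to 5, or a later
 * failback would try to return state to a stale index.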
22244 */ 22245 static void 22246 ip_change_ifindex(ill_t *ill_orig, conn_change_t *connc) 22247 { 22248 ill_t *ill; 22249 ipif_t *ipif; 22250 uint_t old_ifindex; 22251 uint_t new_ifindex; 22252 ilm_t *ilm; 22253 ill_walk_context_t ctx; 22254 ip_stack_t *ipst = ill_orig->ill_ipst; 22255 22256 old_ifindex = connc->cc_old_ifindex; 22257 new_ifindex = connc->cc_new_ifindex; 22258 22259 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 22260 ill = ILL_START_WALK_ALL(&ctx, ipst); 22261 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 22262 if ((ill_orig->ill_net_type != ill->ill_net_type) || 22263 (ill_orig->ill_type != ill->ill_type)) { 22264 continue; 22265 } 22266 for (ipif = ill->ill_ipif; ipif != NULL; 22267 ipif = ipif->ipif_next) { 22268 if (ipif->ipif_orig_ifindex == old_ifindex) 22269 ipif->ipif_orig_ifindex = new_ifindex; 22270 } 22271 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 22272 if (ilm->ilm_orig_ifindex == old_ifindex) 22273 ilm->ilm_orig_ifindex = new_ifindex; 22274 } 22275 } 22276 rw_exit(&ipst->ips_ill_g_lock); 22277 } 22278 22279 /* 22280 * We first need to ensure that the new index is unique, and 22281 * then carry the change across both v4 and v6 ill representation 22282 * of the physical interface. 22283 */ 22284 /* ARGSUSED */ 22285 int 22286 ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 22287 ip_ioctl_cmd_t *ipip, void *ifreq) 22288 { 22289 ill_t *ill; 22290 ill_t *ill_other; 22291 phyint_t *phyi; 22292 int old_index; 22293 conn_change_t connc; 22294 struct ifreq *ifr = (struct ifreq *)ifreq; 22295 struct lifreq *lifr = (struct lifreq *)ifreq; 22296 uint_t index; 22297 ill_t *ill_v4; 22298 ill_t *ill_v6; 22299 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 22300 22301 if (ipip->ipi_cmd_type == IF_CMD) 22302 index = ifr->ifr_index; 22303 else 22304 index = lifr->lifr_index; 22305 22306 /* 22307 * Only allow on physical interface. Also, index zero is illegal. 22308 * 22309 * Need to check for PHYI_FAILED and PHYI_INACTIVE 22310 * 22311 * 1) If PHYI_FAILED is set, a failover could have happened which 22312 * implies a possible failback might have to happen. As failback 22313 * depends on the old index, we should fail setting the index. 22314 * 22315 * 2) If PHYI_INACTIVE is set, in.mpathd does a failover so that 22316 * any addresses or multicast memberships are failed over to 22317 * a non-STANDBY interface. As failback depends on the old 22318 * index, we should fail setting the index for this case also. 22319 * 22320 * 3) If PHYI_OFFLINE is set, a possible failover has happened. 22321 * Be consistent with PHYI_FAILED and fail the ioctl. 22322 */ 22323 ill = ipif->ipif_ill; 22324 phyi = ill->ill_phyint; 22325 if ((phyi->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) || 22326 ipif->ipif_id != 0 || index == 0) { 22327 return (EINVAL); 22328 } 22329 old_index = phyi->phyint_ifindex; 22330 22331 /* If the index is not changing, no work to do */ 22332 if (old_index == index) 22333 return (0); 22334 22335 /* 22336 * Use ill_lookup_on_ifindex to determine if the 22337 * new index is unused and if so allow the change. 22338 */ 22339 ill_v6 = ill_lookup_on_ifindex(index, B_TRUE, NULL, NULL, NULL, NULL, 22340 ipst); 22341 ill_v4 = ill_lookup_on_ifindex(index, B_FALSE, NULL, NULL, NULL, NULL, 22342 ipst); 22343 if (ill_v6 != NULL || ill_v4 != NULL) { 22344 if (ill_v4 != NULL) 22345 ill_refrele(ill_v4); 22346 if (ill_v6 != NULL) 22347 ill_refrele(ill_v6); 22348 return (EBUSY); 22349 } 22350 22351 /* 22352 * The new index is unused. 
Set it in the phyint. 22353 * Locate the other ill so that we can send a routing 22354 * sockets message. 22355 */ 22356 if (ill->ill_isv6) { 22357 ill_other = phyi->phyint_illv4; 22358 } else { 22359 ill_other = phyi->phyint_illv6; 22360 } 22361 22362 phyi->phyint_ifindex = index; 22363 22364 /* Update SCTP's ILL list */ 22365 sctp_ill_reindex(ill, old_index); 22366 22367 connc.cc_old_ifindex = old_index; 22368 connc.cc_new_ifindex = index; 22369 ip_change_ifindex(ill, &connc); 22370 ipcl_walk(conn_change_ifindex, (caddr_t)&connc, ipst); 22371 22372 /* Send the routing sockets message */ 22373 ip_rts_ifmsg(ipif); 22374 if (ill_other != NULL) 22375 ip_rts_ifmsg(ill_other->ill_ipif); 22376 22377 return (0); 22378 } 22379 22380 /* ARGSUSED */ 22381 int 22382 ip_sioctl_get_lifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 22383 ip_ioctl_cmd_t *ipip, void *ifreq) 22384 { 22385 struct ifreq *ifr = (struct ifreq *)ifreq; 22386 struct lifreq *lifr = (struct lifreq *)ifreq; 22387 22388 ip1dbg(("ip_sioctl_get_lifindex(%s:%u %p)\n", 22389 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 22390 /* Get the interface index */ 22391 if (ipip->ipi_cmd_type == IF_CMD) { 22392 ifr->ifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 22393 } else { 22394 lifr->lifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 22395 } 22396 return (0); 22397 } 22398 22399 /* ARGSUSED */ 22400 int 22401 ip_sioctl_get_lifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 22402 ip_ioctl_cmd_t *ipip, void *ifreq) 22403 { 22404 struct lifreq *lifr = (struct lifreq *)ifreq; 22405 22406 ip1dbg(("ip_sioctl_get_lifzone(%s:%u %p)\n", 22407 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 22408 /* Get the interface zone */ 22409 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 22410 lifr->lifr_zoneid = ipif->ipif_zoneid; 22411 return (0); 22412 } 22413 22414 /* 22415 * Set the zoneid of an interface. 22416 */ 22417 /* ARGSUSED */ 22418 int 22419 ip_sioctl_slifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 22420 ip_ioctl_cmd_t *ipip, void *ifreq) 22421 { 22422 struct lifreq *lifr = (struct lifreq *)ifreq; 22423 int err = 0; 22424 boolean_t need_up = B_FALSE; 22425 zone_t *zptr; 22426 zone_status_t status; 22427 zoneid_t zoneid; 22428 22429 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 22430 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) { 22431 if (!is_system_labeled()) 22432 return (ENOTSUP); 22433 zoneid = GLOBAL_ZONEID; 22434 } 22435 22436 /* cannot assign instance zero to a non-global zone */ 22437 if (ipif->ipif_id == 0 && zoneid != GLOBAL_ZONEID) 22438 return (ENOTSUP); 22439 22440 /* 22441 * Cannot assign to a zone that doesn't exist or is shutting down. In 22442 * the event of a race with the zone shutdown processing, since IP 22443 * serializes this ioctl and SIOCGLIFCONF/SIOCLIFREMOVEIF, we know the 22444 * interface will be cleaned up even if the zone is shut down 22445 * immediately after the status check. If the interface can't be brought 22446 * down right away, and the zone is shut down before the restart 22447 * function is called, we resolve the possible races by rechecking the 22448 * zone status in the restart function. 
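 * From userland this path is reached via e.g. "ifconfig hme0:1 zone
 * myzone" (interface and zone names are just examples), which issues
 * SIOCSLIFZONE.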
22449 */ 22450 if ((zptr = zone_find_by_id(zoneid)) == NULL) 22451 return (EINVAL); 22452 status = zone_status_get(zptr); 22453 zone_rele(zptr); 22454 22455 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) 22456 return (EINVAL); 22457 22458 if (ipif->ipif_flags & IPIF_UP) { 22459 /* 22460 * If the interface is already marked up, 22461 * we call ipif_down which will take care 22462 * of ditching any IREs that have been set 22463 * up based on the old interface address. 22464 */ 22465 err = ipif_logical_down(ipif, q, mp); 22466 if (err == EINPROGRESS) 22467 return (err); 22468 ipif_down_tail(ipif); 22469 need_up = B_TRUE; 22470 } 22471 22472 err = ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, need_up); 22473 return (err); 22474 } 22475 22476 static int 22477 ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid, 22478 queue_t *q, mblk_t *mp, boolean_t need_up) 22479 { 22480 int err = 0; 22481 ip_stack_t *ipst; 22482 22483 ip1dbg(("ip_sioctl_zoneid_tail(%s:%u %p)\n", 22484 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 22485 22486 if (CONN_Q(q)) 22487 ipst = CONNQ_TO_IPST(q); 22488 else 22489 ipst = ILLQ_TO_IPST(q); 22490 22491 /* 22492 * For exclusive stacks we don't allow a different zoneid than 22493 * global. 22494 */ 22495 if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID && 22496 zoneid != GLOBAL_ZONEID) 22497 return (EINVAL); 22498 22499 /* Set the new zone id. */ 22500 ipif->ipif_zoneid = zoneid; 22501 22502 /* Update sctp list */ 22503 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 22504 22505 if (need_up) { 22506 /* 22507 * Now bring the interface back up. If this 22508 * is the only IPIF for the ILL, ipif_up 22509 * will have to re-bind to the device, so 22510 * we may get back EINPROGRESS, in which 22511 * case, this IOCTL will get completed in 22512 * ip_rput_dlpi when we see the DL_BIND_ACK. 22513 */ 22514 err = ipif_up(ipif, q, mp); 22515 } 22516 return (err); 22517 } 22518 22519 /* ARGSUSED */ 22520 int 22521 ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 22522 ip_ioctl_cmd_t *ipip, void *if_req) 22523 { 22524 struct lifreq *lifr = (struct lifreq *)if_req; 22525 zoneid_t zoneid; 22526 zone_t *zptr; 22527 zone_status_t status; 22528 22529 ASSERT(ipif->ipif_id != 0); 22530 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 22531 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) 22532 zoneid = GLOBAL_ZONEID; 22533 22534 ip1dbg(("ip_sioctl_slifzone_restart(%s:%u %p)\n", 22535 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 22536 22537 /* 22538 * We recheck the zone status to resolve the following race condition: 22539 * 1) process sends SIOCSLIFZONE to put hme0:1 in zone "myzone"; 22540 * 2) hme0:1 is up and can't be brought down right away; 22541 * ip_sioctl_slifzone() returns EINPROGRESS and the request is queued; 22542 * 3) zone "myzone" is halted; the zone status switches to 22543 * 'shutting_down' and the zones framework sends SIOCGLIFCONF to list 22544 * the interfaces to remove - hme0:1 is not returned because it's not 22545 * yet in "myzone", so it won't be removed; 22546 * 4) the restart function for SIOCSLIFZONE is called; without the 22547 * status check here, we would have hme0:1 in "myzone" after it's been 22548 * destroyed. 22549 * Note that if the status check fails, we need to bring the interface 22550 * back to its state prior to ip_sioctl_slifzone(), hence the call to 22551 * ipif_up_done[_v6](). 
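 * Only a zone in the ready or running state may have an interface
 * assigned to it; the recheck below fails with EINVAL for any other
 * zone state.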
22552 */ 22553 status = ZONE_IS_UNINITIALIZED; 22554 if ((zptr = zone_find_by_id(zoneid)) != NULL) { 22555 status = zone_status_get(zptr); 22556 zone_rele(zptr); 22557 } 22558 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) { 22559 if (ipif->ipif_isv6) { 22560 (void) ipif_up_done_v6(ipif); 22561 } else { 22562 (void) ipif_up_done(ipif); 22563 } 22564 return (EINVAL); 22565 } 22566 22567 ipif_down_tail(ipif); 22568 22569 return (ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, 22570 B_TRUE)); 22571 } 22572 22573 /* ARGSUSED */ 22574 int 22575 ip_sioctl_get_lifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 22576 ip_ioctl_cmd_t *ipip, void *ifreq) 22577 { 22578 struct lifreq *lifr = ifreq; 22579 22580 ASSERT(q->q_next == NULL); 22581 ASSERT(CONN_Q(q)); 22582 22583 ip1dbg(("ip_sioctl_get_lifusesrc(%s:%u %p)\n", 22584 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 22585 lifr->lifr_index = ipif->ipif_ill->ill_usesrc_ifindex; 22586 ip1dbg(("ip_sioctl_get_lifusesrc:lifr_index = %d\n", lifr->lifr_index)); 22587 22588 return (0); 22589 } 22590 22591 22592 /* Find the previous ILL in this usesrc group */ 22593 static ill_t * 22594 ill_prev_usesrc(ill_t *uill) 22595 { 22596 ill_t *ill; 22597 22598 for (ill = uill->ill_usesrc_grp_next; 22599 ASSERT(ill), ill->ill_usesrc_grp_next != uill; 22600 ill = ill->ill_usesrc_grp_next) 22601 /* do nothing */; 22602 return (ill); 22603 } 22604 22605 /* 22606 * Release all members of the usesrc group. This routine is called 22607 * from ill_delete when the interface being unplumbed is the 22608 * group head. 22609 */ 22610 static void 22611 ill_disband_usesrc_group(ill_t *uill) 22612 { 22613 ill_t *next_ill, *tmp_ill; 22614 ip_stack_t *ipst = uill->ill_ipst; 22615 22616 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock)); 22617 next_ill = uill->ill_usesrc_grp_next; 22618 22619 do { 22620 ASSERT(next_ill != NULL); 22621 tmp_ill = next_ill->ill_usesrc_grp_next; 22622 ASSERT(tmp_ill != NULL); 22623 next_ill->ill_usesrc_grp_next = NULL; 22624 next_ill->ill_usesrc_ifindex = 0; 22625 next_ill = tmp_ill; 22626 } while (next_ill->ill_usesrc_ifindex != 0); 22627 uill->ill_usesrc_grp_next = NULL; 22628 } 22629 22630 /* 22631 * Remove the client usesrc ILL from the list and relink to a new list 22632 */ 22633 int 22634 ill_relink_usesrc_ills(ill_t *ucill, ill_t *uill, uint_t ifindex) 22635 { 22636 ill_t *ill, *tmp_ill; 22637 ip_stack_t *ipst = ucill->ill_ipst; 22638 22639 ASSERT((ucill != NULL) && (ucill->ill_usesrc_grp_next != NULL) && 22640 (uill != NULL) && RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock)); 22641 22642 /* 22643 * Check if the usesrc client ILL passed in is not already 22644 * in use as a usesrc ILL i.e one whose source address is 22645 * in use OR a usesrc ILL is not already in use as a usesrc 22646 * client ILL 22647 */ 22648 if ((ucill->ill_usesrc_ifindex == 0) || 22649 (uill->ill_usesrc_ifindex != 0)) { 22650 return (-1); 22651 } 22652 22653 ill = ill_prev_usesrc(ucill); 22654 ASSERT(ill->ill_usesrc_grp_next != NULL); 22655 22656 /* Remove from the current list */ 22657 if (ill->ill_usesrc_grp_next->ill_usesrc_grp_next == ill) { 22658 /* Only two elements in the list */ 22659 ASSERT(ill->ill_usesrc_ifindex == 0); 22660 ill->ill_usesrc_grp_next = NULL; 22661 } else { 22662 ill->ill_usesrc_grp_next = ucill->ill_usesrc_grp_next; 22663 } 22664 22665 if (ifindex == 0) { 22666 ucill->ill_usesrc_ifindex = 0; 22667 ucill->ill_usesrc_grp_next = NULL; 22668 return (0); 22669 } 22670 22671 ucill->ill_usesrc_ifindex = ifindex; 22672 
	tmp_ill = uill->ill_usesrc_grp_next;
	uill->ill_usesrc_grp_next = ucill;
	ucill->ill_usesrc_grp_next =
	    (tmp_ill != NULL) ? tmp_ill : uill;
	return (0);
}

/*
 * Set the ill_usesrc and ill_usesrc_head fields. See synchronization notes in
 * ip.c for locking details.
 */
/* ARGSUSED */
int
ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
	struct lifreq *lifr = (struct lifreq *)ifreq;
	boolean_t isv6 = B_FALSE, reset_flg = B_FALSE,
	    ill_flag_changed = B_FALSE;
	ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill;
	int err = 0, ret;
	uint_t ifindex;
	phyint_t *us_phyint, *us_cli_phyint;
	ipsq_t *ipsq = NULL;
	ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;

	ASSERT(IAM_WRITER_IPIF(ipif));
	ASSERT(q->q_next == NULL);
	ASSERT(CONN_Q(q));

	isv6 = (Q_TO_CONN(q))->conn_af_isv6;
	us_cli_phyint = usesrc_cli_ill->ill_phyint;

	ASSERT(us_cli_phyint != NULL);

	/*
	 * If the client ILL is being used for IPMP, abort.
	 * Note, this can be done before ipsq_try_enter since we are already
	 * exclusive on this ILL.
	 */
	if ((us_cli_phyint->phyint_groupname != NULL) ||
	    (us_cli_phyint->phyint_flags & PHYI_STANDBY)) {
		return (EINVAL);
	}

	ifindex = lifr->lifr_index;
	if (ifindex == 0) {
		if (usesrc_cli_ill->ill_usesrc_grp_next == NULL) {
			/* non usesrc group interface, nothing to reset */
			return (0);
		}
		ifindex = usesrc_cli_ill->ill_usesrc_ifindex;
		/* valid reset request */
		reset_flg = B_TRUE;
	}

	usesrc_ill = ill_lookup_on_ifindex(ifindex, isv6, q, mp,
	    ip_process_ioctl, &err, ipst);

	if (usesrc_ill == NULL) {
		return (err);
	}

	/*
	 * Neither the usesrc_cli_ill nor the usesrc_ill can be part of an
	 * IPMP group, nor can either of the interfaces be used for standby.
	 * So to guarantee mutual exclusion with ip_sioctl_flags (which sets
	 * PHYI_STANDBY) and ip_sioctl_groupname (which sets the groupname)
	 * we need to be exclusive on the ipsq belonging to the usesrc_ill.
	 * We are already exclusive on this ipsq, i.e. the ipsq corresponding
	 * to the usesrc_cli_ill.
	 */
	ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl,
	    NEW_OP, B_TRUE);
	if (ipsq == NULL) {
		err = EINPROGRESS;
		/* Operation enqueued on the ipsq of the usesrc ILL */
		goto done;
	}

	/* Check if the usesrc_ill is used for IPMP */
	us_phyint = usesrc_ill->ill_phyint;
	if ((us_phyint->phyint_groupname != NULL) ||
	    (us_phyint->phyint_flags & PHYI_STANDBY)) {
		err = EINVAL;
		goto done;
	}

	/*
	 * If the client is already in use as a usesrc_ill or a usesrc_ill is
	 * already a client then return EINVAL.
	 */
	if (IS_USESRC_ILL(usesrc_cli_ill) || IS_USESRC_CLI_ILL(usesrc_ill)) {
		err = EINVAL;
		goto done;
	}

	/*
	 * If the ill_usesrc_ifindex field is already set to what it needs to
	 * be then this is a duplicate operation.
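 *
 * As an illustration of the list being manipulated here: a usesrc
 * group is a circular, singly-linked list threaded through
 * ill_usesrc_grp_next, with the usesrc ill (the one whose addresses
 * are borrowed) at the head, e.g.
 *
 *	uill -> cli_ill_1 -> cli_ill_2 -> uill
 *
 * Only the head has ill_usesrc_ifindex == 0, which is how
 * ill_disband_usesrc_group() above detects that it has walked all the
 * way around.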
22772 */ 22773 if (!reset_flg && usesrc_cli_ill->ill_usesrc_ifindex == ifindex) { 22774 err = 0; 22775 goto done; 22776 } 22777 22778 ip1dbg(("ip_sioctl_slifusesrc: usesrc_cli_ill %s, usesrc_ill %s," 22779 " v6 = %d", usesrc_cli_ill->ill_name, usesrc_ill->ill_name, 22780 usesrc_ill->ill_isv6)); 22781 22782 /* 22783 * The next step ensures that no new ires will be created referencing 22784 * the client ill, until the ILL_CHANGING flag is cleared. Then 22785 * we go through an ire walk deleting all ire caches that reference 22786 * the client ill. New ires referencing the client ill that are added 22787 * to the ire table before the ILL_CHANGING flag is set, will be 22788 * cleaned up by the ire walk below. Attempt to add new ires referencing 22789 * the client ill while the ILL_CHANGING flag is set will be failed 22790 * during the ire_add in ire_atomic_start. ire_atomic_start atomically 22791 * checks (under the ill_g_usesrc_lock) that the ire being added 22792 * is not stale, i.e the ire_stq and ire_ipif are consistent and 22793 * belong to the same usesrc group. 22794 */ 22795 mutex_enter(&usesrc_cli_ill->ill_lock); 22796 usesrc_cli_ill->ill_state_flags |= ILL_CHANGING; 22797 mutex_exit(&usesrc_cli_ill->ill_lock); 22798 ill_flag_changed = B_TRUE; 22799 22800 if (ipif->ipif_isv6) 22801 ire_walk_v6(ipif_delete_cache_ire, (char *)usesrc_cli_ill, 22802 ALL_ZONES, ipst); 22803 else 22804 ire_walk_v4(ipif_delete_cache_ire, (char *)usesrc_cli_ill, 22805 ALL_ZONES, ipst); 22806 22807 /* 22808 * ill_g_usesrc_lock global lock protects the ill_usesrc_grp_next 22809 * and the ill_usesrc_ifindex fields 22810 */ 22811 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER); 22812 22813 if (reset_flg) { 22814 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 0); 22815 if (ret != 0) { 22816 err = EINVAL; 22817 } 22818 rw_exit(&ipst->ips_ill_g_usesrc_lock); 22819 goto done; 22820 } 22821 22822 /* 22823 * Four possibilities to consider: 22824 * 1. Both usesrc_ill and usesrc_cli_ill are not part of any usesrc grp 22825 * 2. usesrc_ill is part of a group but usesrc_cli_ill isn't 22826 * 3. usesrc_cli_ill is part of a group but usesrc_ill isn't 22827 * 4. Both are part of their respective usesrc groups 22828 */ 22829 if ((usesrc_ill->ill_usesrc_grp_next == NULL) && 22830 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) { 22831 ASSERT(usesrc_ill->ill_usesrc_ifindex == 0); 22832 usesrc_cli_ill->ill_usesrc_ifindex = ifindex; 22833 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill; 22834 usesrc_cli_ill->ill_usesrc_grp_next = usesrc_ill; 22835 } else if ((usesrc_ill->ill_usesrc_grp_next != NULL) && 22836 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) { 22837 usesrc_cli_ill->ill_usesrc_ifindex = ifindex; 22838 /* Insert at head of list */ 22839 usesrc_cli_ill->ill_usesrc_grp_next = 22840 usesrc_ill->ill_usesrc_grp_next; 22841 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill; 22842 } else { 22843 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 22844 ifindex); 22845 if (ret != 0) 22846 err = EINVAL; 22847 } 22848 rw_exit(&ipst->ips_ill_g_usesrc_lock); 22849 22850 done: 22851 if (ill_flag_changed) { 22852 mutex_enter(&usesrc_cli_ill->ill_lock); 22853 usesrc_cli_ill->ill_state_flags &= ~ILL_CHANGING; 22854 mutex_exit(&usesrc_cli_ill->ill_lock); 22855 } 22856 if (ipsq != NULL) 22857 ipsq_exit(ipsq, B_TRUE, B_TRUE); 22858 /* The refrele on the lifr_name ipif is done by ip_process_ioctl */ 22859 ill_refrele(usesrc_ill); 22860 return (err); 22861 } 22862 22863 /* 22864 * comparison function used by avl. 
22865 */ 22866 static int 22867 ill_phyint_compare_index(const void *index_ptr, const void *phyip) 22868 { 22869 22870 uint_t index; 22871 22872 ASSERT(phyip != NULL && index_ptr != NULL); 22873 22874 index = *((uint_t *)index_ptr); 22875 /* 22876 * let the phyint with the lowest index be on top. 22877 */ 22878 if (((phyint_t *)phyip)->phyint_ifindex < index) 22879 return (1); 22880 if (((phyint_t *)phyip)->phyint_ifindex > index) 22881 return (-1); 22882 return (0); 22883 } 22884 22885 /* 22886 * comparison function used by avl. 22887 */ 22888 static int 22889 ill_phyint_compare_name(const void *name_ptr, const void *phyip) 22890 { 22891 ill_t *ill; 22892 int res = 0; 22893 22894 ASSERT(phyip != NULL && name_ptr != NULL); 22895 22896 if (((phyint_t *)phyip)->phyint_illv4) 22897 ill = ((phyint_t *)phyip)->phyint_illv4; 22898 else 22899 ill = ((phyint_t *)phyip)->phyint_illv6; 22900 ASSERT(ill != NULL); 22901 22902 res = strcmp(ill->ill_name, (char *)name_ptr); 22903 if (res > 0) 22904 return (1); 22905 else if (res < 0) 22906 return (-1); 22907 return (0); 22908 } 22909 /* 22910 * This function is called from ill_delete when the ill is being 22911 * unplumbed. We remove the reference from the phyint and we also 22912 * free the phyint when there are no more references to it. 22913 */ 22914 static void 22915 ill_phyint_free(ill_t *ill) 22916 { 22917 phyint_t *phyi; 22918 phyint_t *next_phyint; 22919 ipsq_t *cur_ipsq; 22920 ip_stack_t *ipst = ill->ill_ipst; 22921 22922 ASSERT(ill->ill_phyint != NULL); 22923 22924 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 22925 phyi = ill->ill_phyint; 22926 ill->ill_phyint = NULL; 22927 /* 22928 * ill_init allocates a phyint always to store the copy 22929 * of flags relevant to phyint. At that point in time, we could 22930 * not assign the name and hence phyint_illv4/v6 could not be 22931 * initialized. Later in ipif_set_values, we assign the name to 22932 * the ill, at which point in time we assign phyint_illv4/v6. 22933 * Thus we don't rely on phyint_illv6 to be initialized always. 22934 */ 22935 if (ill->ill_flags & ILLF_IPV6) { 22936 phyi->phyint_illv6 = NULL; 22937 } else { 22938 phyi->phyint_illv4 = NULL; 22939 } 22940 /* 22941 * ipif_down removes it from the group when the last ipif goes 22942 * down. 22943 */ 22944 ASSERT(ill->ill_group == NULL); 22945 22946 if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) 22947 return; 22948 22949 /* 22950 * Make sure this phyint was put in the list. 22951 */ 22952 if (phyi->phyint_ifindex > 0) { 22953 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 22954 phyi); 22955 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 22956 phyi); 22957 } 22958 /* 22959 * remove phyint from the ipsq list. 
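 * The list is singly linked through phyint_ipsq_next, so the code
 * below special-cases removal at the head; otherwise it walks to the
 * predecessor and splices the phyint out (the ASSERT catches a phyint
 * that was never on the list).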
22960 */ 22961 cur_ipsq = phyi->phyint_ipsq; 22962 if (phyi == cur_ipsq->ipsq_phyint_list) { 22963 cur_ipsq->ipsq_phyint_list = phyi->phyint_ipsq_next; 22964 } else { 22965 next_phyint = cur_ipsq->ipsq_phyint_list; 22966 while (next_phyint != NULL) { 22967 if (next_phyint->phyint_ipsq_next == phyi) { 22968 next_phyint->phyint_ipsq_next = 22969 phyi->phyint_ipsq_next; 22970 break; 22971 } 22972 next_phyint = next_phyint->phyint_ipsq_next; 22973 } 22974 ASSERT(next_phyint != NULL); 22975 } 22976 IPSQ_DEC_REF(cur_ipsq, ipst); 22977 22978 if (phyi->phyint_groupname_len != 0) { 22979 ASSERT(phyi->phyint_groupname != NULL); 22980 mi_free(phyi->phyint_groupname); 22981 } 22982 mi_free(phyi); 22983 } 22984 22985 /* 22986 * Attach the ill to the phyint structure which can be shared by both 22987 * IPv4 and IPv6 ill. ill_init allocates a phyint to just hold flags. This 22988 * function is called from ipif_set_values and ill_lookup_on_name (for 22989 * loopback) where we know the name of the ill. We lookup the ill and if 22990 * there is one present already with the name use that phyint. Otherwise 22991 * reuse the one allocated by ill_init. 22992 */ 22993 static void 22994 ill_phyint_reinit(ill_t *ill) 22995 { 22996 boolean_t isv6 = ill->ill_isv6; 22997 phyint_t *phyi_old; 22998 phyint_t *phyi; 22999 avl_index_t where = 0; 23000 ill_t *ill_other = NULL; 23001 ipsq_t *ipsq; 23002 ip_stack_t *ipst = ill->ill_ipst; 23003 23004 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 23005 23006 phyi_old = ill->ill_phyint; 23007 ASSERT(isv6 || (phyi_old->phyint_illv4 == ill && 23008 phyi_old->phyint_illv6 == NULL)); 23009 ASSERT(!isv6 || (phyi_old->phyint_illv6 == ill && 23010 phyi_old->phyint_illv4 == NULL)); 23011 ASSERT(phyi_old->phyint_ifindex == 0); 23012 23013 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 23014 ill->ill_name, &where); 23015 23016 /* 23017 * 1. We grabbed the ill_g_lock before inserting this ill into 23018 * the global list of ills. So no other thread could have located 23019 * this ill and hence the ipsq of this ill is guaranteed to be empty. 23020 * 2. Now locate the other protocol instance of this ill. 23021 * 3. Now grab both ill locks in the right order, and the phyint lock of 23022 * the new ipsq. Holding ill locks + ill_g_lock ensures that the ipsq 23023 * of neither ill can change. 23024 * 4. Merge the phyint and thus the ipsq as well of this ill onto the 23025 * other ill. 23026 * 5. Release all locks. 23027 */ 23028 23029 /* 23030 * Look for IPv4 if we are initializing IPv6 or look for IPv6 if 23031 * we are initializing IPv4. 23032 */ 23033 if (phyi != NULL) { 23034 ill_other = (isv6) ? phyi->phyint_illv4 : 23035 phyi->phyint_illv6; 23036 ASSERT(ill_other->ill_phyint != NULL); 23037 ASSERT((isv6 && !ill_other->ill_isv6) || 23038 (!isv6 && ill_other->ill_isv6)); 23039 GRAB_ILL_LOCKS(ill, ill_other); 23040 /* 23041 * We are potentially throwing away phyint_flags which 23042 * could be different from the one that we obtain from 23043 * ill_other->ill_phyint. But it is okay as we are assuming 23044 * that the state maintained within IP is correct. 23045 */ 23046 mutex_enter(&phyi->phyint_lock); 23047 if (isv6) { 23048 ASSERT(phyi->phyint_illv6 == NULL); 23049 phyi->phyint_illv6 = ill; 23050 } else { 23051 ASSERT(phyi->phyint_illv4 == NULL); 23052 phyi->phyint_illv4 = ill; 23053 } 23054 /* 23055 * This is a new ill, currently undergoing SLIFNAME 23056 * So we could not have joined an IPMP group until now. 
23057 */ 23058 ASSERT(phyi_old->phyint_ipsq_next == NULL && 23059 phyi_old->phyint_groupname == NULL); 23060 23061 /* 23062 * This phyi_old is going away. Decref ipsq_refs and 23063 * assert it is zero. The ipsq itself will be freed in 23064 * ipsq_exit 23065 */ 23066 ipsq = phyi_old->phyint_ipsq; 23067 IPSQ_DEC_REF(ipsq, ipst); 23068 ASSERT(ipsq->ipsq_refs == 0); 23069 /* Get the singleton phyint out of the ipsq list */ 23070 ASSERT(phyi_old->phyint_ipsq_next == NULL); 23071 ipsq->ipsq_phyint_list = NULL; 23072 phyi_old->phyint_illv4 = NULL; 23073 phyi_old->phyint_illv6 = NULL; 23074 mi_free(phyi_old); 23075 } else { 23076 mutex_enter(&ill->ill_lock); 23077 /* 23078 * We don't need to acquire any lock, since 23079 * the ill is not yet visible globally and we 23080 * have not yet released the ill_g_lock. 23081 */ 23082 phyi = phyi_old; 23083 mutex_enter(&phyi->phyint_lock); 23084 /* XXX We need a recovery strategy here. */ 23085 if (!phyint_assign_ifindex(phyi, ipst)) 23086 cmn_err(CE_PANIC, "phyint_assign_ifindex() failed"); 23087 23088 /* No IPMP group yet, thus the hook uses the ifindex */ 23089 phyi->phyint_hook_ifindex = phyi->phyint_ifindex; 23090 23091 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 23092 (void *)phyi, where); 23093 23094 (void) avl_find(&ipst->ips_phyint_g_list-> 23095 phyint_list_avl_by_index, 23096 &phyi->phyint_ifindex, &where); 23097 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 23098 (void *)phyi, where); 23099 } 23100 23101 /* 23102 * Reassigning ill_phyint automatically reassigns the ipsq also. 23103 * pending mp is not affected because that is per ill basis. 23104 */ 23105 ill->ill_phyint = phyi; 23106 23107 /* 23108 * Keep the index on ipif_orig_index to be used by FAILOVER. 23109 * We do this here as when the first ipif was allocated, 23110 * ipif_allocate does not know the right interface index. 23111 */ 23112 23113 ill->ill_ipif->ipif_orig_ifindex = ill->ill_phyint->phyint_ifindex; 23114 /* 23115 * Now that the phyint's ifindex has been assigned, complete the 23116 * remaining 23117 */ 23118 23119 ill->ill_ip_mib->ipIfStatsIfIndex = ill->ill_phyint->phyint_ifindex; 23120 if (ill->ill_isv6) { 23121 ill->ill_icmp6_mib->ipv6IfIcmpIfIndex = 23122 ill->ill_phyint->phyint_ifindex; 23123 } 23124 23125 /* 23126 * Generate an event within the hooks framework to indicate that 23127 * a new interface has just been added to IP. For this event to 23128 * be generated, the network interface must, at least, have an 23129 * ifindex assigned to it. 23130 * 23131 * This needs to be run inside the ill_g_lock perimeter to ensure 23132 * that the ordering of delivered events to listeners matches the 23133 * order of them in the kernel. 23134 * 23135 * This function could be called from ill_lookup_on_name. In that case 23136 * the interface is loopback "lo", which will not generate a NIC event. 23137 */ 23138 if (ill->ill_name_length <= 2 || 23139 ill->ill_name[0] != 'l' || ill->ill_name[1] != 'o') { 23140 /* 23141 * Generate nic plumb event for ill_name even if 23142 * ipmp_hook_emulation is set. That avoids generating events 23143 * for the ill_names should ipmp_hook_emulation be turned on 23144 * later. 23145 */ 23146 ill_nic_info_plumb(ill, B_FALSE); 23147 } 23148 RELEASE_ILL_LOCKS(ill, ill_other); 23149 mutex_exit(&phyi->phyint_lock); 23150 } 23151 23152 /* 23153 * Allocate a NE_PLUMB nic info event and store in the ill. 23154 * If 'group' is set we do it for the group name, otherwise the ill name. 
23155 * It will be sent when we leave the ipsq. 23156 */ 23157 void 23158 ill_nic_info_plumb(ill_t *ill, boolean_t group) 23159 { 23160 phyint_t *phyi = ill->ill_phyint; 23161 ip_stack_t *ipst = ill->ill_ipst; 23162 hook_nic_event_t *info; 23163 char *name; 23164 int namelen; 23165 23166 ASSERT(MUTEX_HELD(&ill->ill_lock)); 23167 23168 if ((info = ill->ill_nic_event_info) != NULL) { 23169 ip2dbg(("ill_nic_info_plumb: unexpected nic event %d " 23170 "attached for %s\n", info->hne_event, 23171 ill->ill_name)); 23172 if (info->hne_data != NULL) 23173 kmem_free(info->hne_data, info->hne_datalen); 23174 kmem_free(info, sizeof (hook_nic_event_t)); 23175 ill->ill_nic_event_info = NULL; 23176 } 23177 23178 info = kmem_alloc(sizeof (hook_nic_event_t), KM_NOSLEEP); 23179 if (info == NULL) { 23180 ip2dbg(("ill_nic_info_plumb: could not attach PLUMB nic " 23181 "event information for %s (ENOMEM)\n", 23182 ill->ill_name)); 23183 return; 23184 } 23185 23186 if (group) { 23187 ASSERT(phyi->phyint_groupname_len != 0); 23188 namelen = phyi->phyint_groupname_len; 23189 name = phyi->phyint_groupname; 23190 } else { 23191 namelen = ill->ill_name_length; 23192 name = ill->ill_name; 23193 } 23194 23195 info->hne_nic = phyi->phyint_hook_ifindex; 23196 info->hne_lif = 0; 23197 info->hne_event = NE_PLUMB; 23198 info->hne_family = ill->ill_isv6 ? 23199 ipst->ips_ipv6_net_data : ipst->ips_ipv4_net_data; 23200 23201 info->hne_data = kmem_alloc(namelen, KM_NOSLEEP); 23202 if (info->hne_data != NULL) { 23203 info->hne_datalen = namelen; 23204 bcopy(name, info->hne_data, info->hne_datalen); 23205 } else { 23206 ip2dbg(("ill_nic_info_plumb: could not attach " 23207 "name information for PLUMB nic event " 23208 "of %s (ENOMEM)\n", name)); 23209 kmem_free(info, sizeof (hook_nic_event_t)); 23210 info = NULL; 23211 } 23212 ill->ill_nic_event_info = info; 23213 } 23214 23215 /* 23216 * Unhook the nic event message from the ill and enqueue it 23217 * into the nic event taskq. 23218 */ 23219 void 23220 ill_nic_info_dispatch(ill_t *ill) 23221 { 23222 hook_nic_event_t *info; 23223 23224 ASSERT(MUTEX_HELD(&ill->ill_lock)); 23225 23226 if ((info = ill->ill_nic_event_info) != NULL) { 23227 if (ddi_taskq_dispatch(eventq_queue_nic, 23228 ip_ne_queue_func, info, DDI_SLEEP) == DDI_FAILURE) { 23229 ip2dbg(("ill_nic_info_dispatch: " 23230 "ddi_taskq_dispatch failed\n")); 23231 if (info->hne_data != NULL) 23232 kmem_free(info->hne_data, info->hne_datalen); 23233 kmem_free(info, sizeof (hook_nic_event_t)); 23234 } 23235 ill->ill_nic_event_info = NULL; 23236 } 23237 } 23238 23239 /* 23240 * Notify any downstream modules of the name of this interface. 23241 * An M_IOCTL is used even though we don't expect a successful reply. 23242 * Any reply message from the driver (presumably an M_IOCNAK) will 23243 * eventually get discarded somewhere upstream. The message format is 23244 * simply an SIOCSLIFNAME ioctl just as might be sent from ifconfig 23245 * to IP. 
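 *
 * The message built below is a two-mblk chain, roughly:
 *
 *	mp1: M_IOCTL, iocblk with ioc_cmd == SIOCSLIFNAME and
 *	     ioc_count == sizeof (struct lifreq)
 *	mp1->b_cont: a zeroed struct lifreq carrying lifr_name,
 *	     lifr_ppa and the ILLF_IPV4/ILLF_IPV6 bits in lifr_flags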
 */
static void
ip_ifname_notify(ill_t *ill, queue_t *q)
{
	mblk_t *mp1, *mp2;
	struct iocblk *iocp;
	struct lifreq *lifr;

	mp1 = mkiocb(SIOCSLIFNAME);
	if (mp1 == NULL)
		return;
	mp2 = allocb(sizeof (struct lifreq), BPRI_HI);
	if (mp2 == NULL) {
		freeb(mp1);
		return;
	}

	mp1->b_cont = mp2;
	iocp = (struct iocblk *)mp1->b_rptr;
	iocp->ioc_count = sizeof (struct lifreq);

	lifr = (struct lifreq *)mp2->b_rptr;
	mp2->b_wptr += sizeof (struct lifreq);
	bzero(lifr, sizeof (struct lifreq));

	(void) strncpy(lifr->lifr_name, ill->ill_name, LIFNAMSIZ);
	lifr->lifr_ppa = ill->ill_ppa;
	lifr->lifr_flags = (ill->ill_flags & (ILLF_IPV4|ILLF_IPV6));

	putnext(q, mp1);
}

static int
ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
{
	int err;
	ip_stack_t *ipst = ill->ill_ipst;

	/* Set the obsolete NDD per-interface forwarding name. */
	err = ill_set_ndd_name(ill);
	if (err != 0) {
		cmn_err(CE_WARN, "ipif_set_values: ill_set_ndd_name (%d)\n",
		    err);
	}

	/* Tell downstream modules where they are. */
	ip_ifname_notify(ill, q);

	/*
	 * ill_dl_phys returns EINPROGRESS in the usual case.
	 * Error cases are ENOMEM ...
	 */
	err = ill_dl_phys(ill, ipif, mp, q);

	/*
	 * If there is no IRE expiration timer running, get one started.
	 * The igmp and mld timers will be triggered by the first multicast.
	 */
	if (ipst->ips_ip_ire_expire_id == 0) {
		/*
		 * Acquire the lock and check again.
		 */
		mutex_enter(&ipst->ips_ip_trash_timer_lock);
		if (ipst->ips_ip_ire_expire_id == 0) {
			ipst->ips_ip_ire_expire_id = timeout(
			    ip_trash_timer_expire, ipst,
			    MSEC_TO_TICK(ipst->ips_ip_timer_interval));
		}
		mutex_exit(&ipst->ips_ip_trash_timer_lock);
	}

	if (ill->ill_isv6) {
		mutex_enter(&ipst->ips_mld_slowtimeout_lock);
		if (ipst->ips_mld_slowtimeout_id == 0) {
			ipst->ips_mld_slowtimeout_id = timeout(mld_slowtimo,
			    (void *)ipst,
			    MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
		}
		mutex_exit(&ipst->ips_mld_slowtimeout_lock);
	} else {
		mutex_enter(&ipst->ips_igmp_slowtimeout_lock);
		if (ipst->ips_igmp_slowtimeout_id == 0) {
			ipst->ips_igmp_slowtimeout_id = timeout(igmp_slowtimo,
			    (void *)ipst,
			    MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
		}
		mutex_exit(&ipst->ips_igmp_slowtimeout_lock);
	}

	return (err);
}

/*
 * Common routine for ppa and ifname setting. Should be called exclusive.
 *
 * Returns EINPROGRESS when mp has been consumed by queueing it on
 * ill_pending_mp and the ioctl will complete in ip_rput.
 *
 * NOTE : If ppa is UINT_MAX, we assign the next valid ppa and return
 * the new name and new ppa in lifr_name and lifr_ppa respectively.
 * For SLIFNAME, we pass these values back to the userland.
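 *
 * For example (hypothetical names and numbers): a request naming
 * "hme" with *new_ppa_ptr == UINT_MAX when hme0 and hme1 already
 * exist results in ill_name "hme2", and both interf_name and
 * *new_ppa_ptr are rewritten below to reflect the chosen ppa of 2.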
 */
static int
ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr)
{
	ill_t *ill;
	ipif_t *ipif;
	ipsq_t *ipsq;
	char *ppa_ptr;
	char *old_ptr;
	char old_char;
	int error;
	ip_stack_t *ipst;

	ip1dbg(("ipif_set_values: interface %s\n", interf_name));
	ASSERT(q->q_next != NULL);
	ASSERT(interf_name != NULL);

	ill = (ill_t *)q->q_ptr;
	ipst = ill->ill_ipst;

	ASSERT(ill->ill_ipst != NULL);
	ASSERT(ill->ill_name[0] == '\0');
	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT((mi_strlen(interf_name) + 1) <= LIFNAMSIZ);
	ASSERT(ill->ill_ppa == UINT_MAX);

	/* The ppa is sent down by ifconfig or is chosen */
	if ((ppa_ptr = ill_get_ppa_ptr(interf_name)) == NULL) {
		return (EINVAL);
	}

	/*
	 * Make sure the ppa passed in is the same as the ppa in the name.
	 * This check is not made when ppa == UINT_MAX; in that case the ppa
	 * in the name could be anything. The system will choose a ppa and
	 * update new_ppa_ptr and interf_name to contain the chosen ppa.
	 */
	if (*new_ppa_ptr != UINT_MAX) {
		/* stoi changes the pointer */
		old_ptr = ppa_ptr;
		/*
		 * ifconfig passed in 0 for the ppa for DLPI 1 style devices
		 * (they don't have an externally visible ppa). We assign one
		 * here so that we can manage the interface. Note that in
		 * the past this value was always 0 for DLPI 1 drivers.
		 */
		if (*new_ppa_ptr == 0)
			*new_ppa_ptr = stoi(&old_ptr);
		else if (*new_ppa_ptr != (uint_t)stoi(&old_ptr))
			return (EINVAL);
	}
	/*
	 * Terminate the string before the ppa and
	 * save the char at that location.
	 */
	old_char = ppa_ptr[0];
	ppa_ptr[0] = '\0';

	ill->ill_ppa = *new_ppa_ptr;
	/*
	 * Finish as much work now as possible before calling ill_glist_insert
	 * which makes the ill globally visible and also merges it with the
	 * other protocol instance of this phyint. The remaining work is
	 * done after entering the ipsq which may happen sometime later.
	 * ill_set_ndd_name occurs after the ill has been made globally visible.
	 */
	ipif = ill->ill_ipif;

	/* We didn't do this when we allocated ipif in ip_ll_subnet_defaults */
	ipif_assign_seqid(ipif);

	if (!(ill->ill_flags & (ILLF_IPV4|ILLF_IPV6)))
		ill->ill_flags |= ILLF_IPV4;

	ASSERT(ipif->ipif_next == NULL);	/* Only one ipif on ill */
	ASSERT((ipif->ipif_flags & IPIF_UP) == 0);

	if (ill->ill_flags & ILLF_IPV6) {

		ill->ill_isv6 = B_TRUE;
		if (ill->ill_rq != NULL) {
			ill->ill_rq->q_qinfo = &rinit_ipv6;
			ill->ill_wq->q_qinfo = &winit_ipv6;
		}

		/* Keep the !IN6_IS_ADDR_V4MAPPED assertions happy */
		ipif->ipif_v6lcl_addr = ipv6_all_zeros;
		ipif->ipif_v6src_addr = ipv6_all_zeros;
		ipif->ipif_v6subnet = ipv6_all_zeros;
		ipif->ipif_v6net_mask = ipv6_all_zeros;
		ipif->ipif_v6brd_addr = ipv6_all_zeros;
		ipif->ipif_v6pp_dst_addr = ipv6_all_zeros;
		/*
		 * Point-to-point or non-multicast capable
		 * interfaces won't do NUD unless explicitly
		 * configured to do so.
23443 */ 23444 if (ipif->ipif_flags & IPIF_POINTOPOINT || 23445 !(ill->ill_flags & ILLF_MULTICAST)) { 23446 ill->ill_flags |= ILLF_NONUD; 23447 } 23448 /* Make sure IPv4 specific flag is not set on IPv6 if */ 23449 if (ill->ill_flags & ILLF_NOARP) { 23450 /* 23451 * Note: xresolv interfaces will eventually need 23452 * NOARP set here as well, but that will require 23453 * those external resolvers to have some 23454 * knowledge of that flag and act appropriately. 23455 * Not to be changed at present. 23456 */ 23457 ill->ill_flags &= ~ILLF_NOARP; 23458 } 23459 /* 23460 * Set the ILLF_ROUTER flag according to the global 23461 * IPv6 forwarding policy. 23462 */ 23463 if (ipst->ips_ipv6_forward != 0) 23464 ill->ill_flags |= ILLF_ROUTER; 23465 } else if (ill->ill_flags & ILLF_IPV4) { 23466 ill->ill_isv6 = B_FALSE; 23467 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6lcl_addr); 23468 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6src_addr); 23469 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6subnet); 23470 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6net_mask); 23471 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6brd_addr); 23472 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6pp_dst_addr); 23473 /* 23474 * Set the ILLF_ROUTER flag according to the global 23475 * IPv4 forwarding policy. 23476 */ 23477 if (ipst->ips_ip_g_forward != 0) 23478 ill->ill_flags |= ILLF_ROUTER; 23479 } 23480 23481 ASSERT(ill->ill_phyint != NULL); 23482 23483 /* 23484 * The ipIfStatsIfindex and ipv6IfIcmpIfIndex assignments will 23485 * be completed in ill_glist_insert -> ill_phyint_reinit 23486 */ 23487 if (!ill_allocate_mibs(ill)) 23488 return (ENOMEM); 23489 23490 /* 23491 * Pick a default sap until we get the DL_INFO_ACK back from 23492 * the driver. 23493 */ 23494 if (ill->ill_sap == 0) { 23495 if (ill->ill_isv6) 23496 ill->ill_sap = IP6_DL_SAP; 23497 else 23498 ill->ill_sap = IP_DL_SAP; 23499 } 23500 23501 ill->ill_ifname_pending = 1; 23502 ill->ill_ifname_pending_err = 0; 23503 23504 ill_refhold(ill); 23505 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 23506 if ((error = ill_glist_insert(ill, interf_name, 23507 (ill->ill_flags & ILLF_IPV6) == ILLF_IPV6)) > 0) { 23508 ill->ill_ppa = UINT_MAX; 23509 ill->ill_name[0] = '\0'; 23510 /* 23511 * undo null termination done above. 23512 */ 23513 ppa_ptr[0] = old_char; 23514 rw_exit(&ipst->ips_ill_g_lock); 23515 ill_refrele(ill); 23516 return (error); 23517 } 23518 23519 ASSERT(ill->ill_name_length <= LIFNAMSIZ); 23520 23521 /* 23522 * When we return the buffer pointed to by interf_name should contain 23523 * the same name as in ill_name. 23524 * If a ppa was choosen by the system (ppa passed in was UINT_MAX) 23525 * the buffer pointed to by new_ppa_ptr would not contain the right ppa 23526 * so copy full name and update the ppa ptr. 23527 * When ppa passed in != UINT_MAX all values are correct just undo 23528 * null termination, this saves a bcopy. 23529 */ 23530 if (*new_ppa_ptr == UINT_MAX) { 23531 bcopy(ill->ill_name, interf_name, ill->ill_name_length); 23532 *new_ppa_ptr = ill->ill_ppa; 23533 } else { 23534 /* 23535 * undo null termination done above. 
23536 */ 23537 ppa_ptr[0] = old_char; 23538 } 23539 23540 /* Let SCTP know about this ILL */ 23541 sctp_update_ill(ill, SCTP_ILL_INSERT); 23542 23543 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_reprocess_ioctl, NEW_OP, 23544 B_TRUE); 23545 23546 rw_exit(&ipst->ips_ill_g_lock); 23547 ill_refrele(ill); 23548 if (ipsq == NULL) 23549 return (EINPROGRESS); 23550 23551 /* 23552 * If ill_phyint_reinit() changed our ipsq, then start on the new ipsq. 23553 */ 23554 if (ipsq->ipsq_current_ipif == NULL) 23555 ipsq_current_start(ipsq, ipif, SIOCSLIFNAME); 23556 else 23557 ASSERT(ipsq->ipsq_current_ipif == ipif); 23558 23559 error = ipif_set_values_tail(ill, ipif, mp, q); 23560 ipsq_exit(ipsq, B_TRUE, B_TRUE); 23561 if (error != 0 && error != EINPROGRESS) { 23562 /* 23563 * restore previous values 23564 */ 23565 ill->ill_isv6 = B_FALSE; 23566 } 23567 return (error); 23568 } 23569 23570 23571 void 23572 ipif_init(ip_stack_t *ipst) 23573 { 23574 hrtime_t hrt; 23575 int i; 23576 23577 /* 23578 * Can't call drv_getparm here as it is too early in the boot. 23579 * As we use ipif_src_random just for picking a different 23580 * source address everytime, this need not be really random. 23581 */ 23582 hrt = gethrtime(); 23583 ipst->ips_ipif_src_random = 23584 ((hrt >> 32) & 0xffffffff) * (hrt & 0xffffffff); 23585 23586 for (i = 0; i < MAX_G_HEADS; i++) { 23587 ipst->ips_ill_g_heads[i].ill_g_list_head = 23588 (ill_if_t *)&ipst->ips_ill_g_heads[i]; 23589 ipst->ips_ill_g_heads[i].ill_g_list_tail = 23590 (ill_if_t *)&ipst->ips_ill_g_heads[i]; 23591 } 23592 23593 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 23594 ill_phyint_compare_index, 23595 sizeof (phyint_t), 23596 offsetof(struct phyint, phyint_avl_by_index)); 23597 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 23598 ill_phyint_compare_name, 23599 sizeof (phyint_t), 23600 offsetof(struct phyint, phyint_avl_by_name)); 23601 } 23602 23603 /* 23604 * This is called by ip_rt_add when src_addr value is other than zero. 23605 * src_addr signifies the source address of the incoming packet. For 23606 * reverse tunnel route we need to create a source addr based routing 23607 * table. This routine creates ip_mrtun_table if it's empty and then 23608 * it adds the route entry hashed by source address. It verifies that 23609 * the outgoing interface is always a non-resolver interface (tunnel). 23610 */ 23611 int 23612 ip_mrtun_rt_add(ipaddr_t in_src_addr, int flags, ipif_t *ipif_arg, 23613 ipif_t *src_ipif, ire_t **ire_arg, queue_t *q, mblk_t *mp, ipsq_func_t func, 23614 ip_stack_t *ipst) 23615 { 23616 ire_t *ire; 23617 ire_t *save_ire; 23618 ipif_t *ipif; 23619 ill_t *in_ill = NULL; 23620 ill_t *out_ill; 23621 queue_t *stq; 23622 mblk_t *dlureq_mp; 23623 int error; 23624 23625 if (ire_arg != NULL) 23626 *ire_arg = NULL; 23627 ASSERT(in_src_addr != INADDR_ANY); 23628 23629 ipif = ipif_arg; 23630 if (ipif != NULL) { 23631 out_ill = ipif->ipif_ill; 23632 } else { 23633 ip1dbg(("ip_mrtun_rt_add: ipif is NULL\n")); 23634 return (EINVAL); 23635 } 23636 23637 if (src_ipif == NULL) { 23638 ip1dbg(("ip_mrtun_rt_add: src_ipif is NULL\n")); 23639 return (EINVAL); 23640 } 23641 in_ill = src_ipif->ipif_ill; 23642 23643 /* 23644 * Check for duplicates. We don't need to 23645 * match out_ill, because the uniqueness of 23646 * a route is only dependent on src_addr and 23647 * in_ill. 
23648 */ 23649 ire = ire_mrtun_lookup(in_src_addr, in_ill); 23650 if (ire != NULL) { 23651 ire_refrele(ire); 23652 return (EEXIST); 23653 } 23654 if (ipif->ipif_net_type != IRE_IF_NORESOLVER) { 23655 ip2dbg(("ip_mrtun_rt_add: outgoing interface is type %d\n", 23656 ipif->ipif_net_type)); 23657 return (EINVAL); 23658 } 23659 23660 stq = ipif->ipif_wq; 23661 ASSERT(stq != NULL); 23662 23663 /* 23664 * The outgoing interface must be non-resolver 23665 * interface. 23666 */ 23667 dlureq_mp = ill_dlur_gen(NULL, 23668 out_ill->ill_phys_addr_length, out_ill->ill_sap, 23669 out_ill->ill_sap_length); 23670 23671 if (dlureq_mp == NULL) { 23672 ip1dbg(("ip_newroute: dlureq_mp NULL\n")); 23673 return (ENOMEM); 23674 } 23675 23676 /* Create the IRE. */ 23677 23678 ire = ire_create( 23679 NULL, /* Zero dst addr */ 23680 NULL, /* Zero mask */ 23681 NULL, /* Zero gateway addr */ 23682 NULL, /* Zero ipif_src addr */ 23683 (uint8_t *)&in_src_addr, /* in_src-addr */ 23684 &ipif->ipif_mtu, 23685 NULL, 23686 NULL, /* rfq */ 23687 stq, 23688 IRE_MIPRTUN, 23689 dlureq_mp, 23690 ipif, 23691 in_ill, 23692 0, 23693 0, 23694 0, 23695 flags, 23696 &ire_uinfo_null, 23697 NULL, 23698 NULL, 23699 ipst); 23700 23701 if (ire == NULL) { 23702 freeb(dlureq_mp); 23703 return (ENOMEM); 23704 } 23705 ip2dbg(("ip_mrtun_rt_add: mrtun route is created with type %d\n", 23706 ire->ire_type)); 23707 save_ire = ire; 23708 ASSERT(save_ire != NULL); 23709 error = ire_add_mrtun(&ire, q, mp, func); 23710 /* 23711 * If ire_add_mrtun() failed, the ire passed in was freed 23712 * so there is no need to do so here. 23713 */ 23714 if (error != 0) { 23715 return (error); 23716 } 23717 23718 /* Duplicate check */ 23719 if (ire != save_ire) { 23720 /* route already exists by now */ 23721 ire_refrele(ire); 23722 return (EEXIST); 23723 } 23724 23725 if (ire_arg != NULL) { 23726 /* 23727 * Store the ire that was just added. the caller 23728 * ip_rts_request responsible for doing ire_refrele() 23729 * on it. 23730 */ 23731 *ire_arg = ire; 23732 } else { 23733 ire_refrele(ire); /* held in ire_add_mrtun */ 23734 } 23735 23736 return (0); 23737 } 23738 23739 /* 23740 * It is called by ip_rt_delete() only when mipagent requests to delete 23741 * a reverse tunnel route that was added by ip_mrtun_rt_add() before. 23742 */ 23743 23744 int 23745 ip_mrtun_rt_delete(ipaddr_t in_src_addr, ipif_t *src_ipif) 23746 { 23747 ire_t *ire = NULL; 23748 23749 if (in_src_addr == INADDR_ANY) 23750 return (EINVAL); 23751 if (src_ipif == NULL) 23752 return (EINVAL); 23753 23754 /* search if this route exists in the ip_mrtun_table */ 23755 ire = ire_mrtun_lookup(in_src_addr, src_ipif->ipif_ill); 23756 if (ire == NULL) { 23757 ip2dbg(("ip_mrtun_rt_delete: ire not found\n")); 23758 return (ESRCH); 23759 } 23760 ire_delete(ire); 23761 ire_refrele(ire); 23762 return (0); 23763 } 23764 23765 /* 23766 * Lookup the ipif corresponding to the onlink destination address. For 23767 * point-to-point interfaces, it matches with remote endpoint destination 23768 * address. For point-to-multipoint interfaces it only tries to match the 23769 * destination with the interface's subnet address. The longest, most specific 23770 * match is found to take care of such rare network configurations like - 23771 * le0: 129.146.1.1/16 23772 * le1: 129.146.2.2/24 23773 * It is used only by SO_DONTROUTE at the moment. 
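 * For instance, with the two interfaces above a destination of
 * 129.146.2.5 is on-link through both, but le1 is returned because
 * its /24 mask is more specific; a destination such as 129.146.3.3
 * would instead match only le0.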
23774 */ 23775 ipif_t * 23776 ipif_lookup_onlink_addr(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst) 23777 { 23778 ipif_t *ipif, *best_ipif; 23779 ill_t *ill; 23780 ill_walk_context_t ctx; 23781 23782 ASSERT(zoneid != ALL_ZONES); 23783 best_ipif = NULL; 23784 23785 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 23786 ill = ILL_START_WALK_V4(&ctx, ipst); 23787 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 23788 mutex_enter(&ill->ill_lock); 23789 for (ipif = ill->ill_ipif; ipif != NULL; 23790 ipif = ipif->ipif_next) { 23791 if (!IPIF_CAN_LOOKUP(ipif)) 23792 continue; 23793 if (ipif->ipif_zoneid != zoneid && 23794 ipif->ipif_zoneid != ALL_ZONES) 23795 continue; 23796 /* 23797 * Point-to-point case. Look for exact match with 23798 * destination address. 23799 */ 23800 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 23801 if (ipif->ipif_pp_dst_addr == addr) { 23802 ipif_refhold_locked(ipif); 23803 mutex_exit(&ill->ill_lock); 23804 rw_exit(&ipst->ips_ill_g_lock); 23805 if (best_ipif != NULL) 23806 ipif_refrele(best_ipif); 23807 return (ipif); 23808 } 23809 } else if (ipif->ipif_subnet == (addr & 23810 ipif->ipif_net_mask)) { 23811 /* 23812 * Point-to-multipoint case. Looping through to 23813 * find the most specific match. If there are 23814 * multiple best match ipif's then prefer ipif's 23815 * that are UP. If there is only one best match 23816 * ipif and it is DOWN we must still return it. 23817 */ 23818 if ((best_ipif == NULL) || 23819 (ipif->ipif_net_mask > 23820 best_ipif->ipif_net_mask) || 23821 ((ipif->ipif_net_mask == 23822 best_ipif->ipif_net_mask) && 23823 ((ipif->ipif_flags & IPIF_UP) && 23824 (!(best_ipif->ipif_flags & IPIF_UP))))) { 23825 ipif_refhold_locked(ipif); 23826 mutex_exit(&ill->ill_lock); 23827 rw_exit(&ipst->ips_ill_g_lock); 23828 if (best_ipif != NULL) 23829 ipif_refrele(best_ipif); 23830 best_ipif = ipif; 23831 rw_enter(&ipst->ips_ill_g_lock, 23832 RW_READER); 23833 mutex_enter(&ill->ill_lock); 23834 } 23835 } 23836 } 23837 mutex_exit(&ill->ill_lock); 23838 } 23839 rw_exit(&ipst->ips_ill_g_lock); 23840 return (best_ipif); 23841 } 23842 23843 23844 /* 23845 * Save enough information so that we can recreate the IRE if 23846 * the interface goes down and then up. 
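 * Only the fields needed to re-create the route are kept: the ifrt_t
 * below snapshots the IRE's type, address, gateway, source address,
 * mask, flags and max frag, and is linked onto ipif_saved_ire_mp.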

/*
 * Save enough information so that we can recreate the IRE if
 * the interface goes down and then up.
 */
static void
ipif_save_ire(ipif_t *ipif, ire_t *ire)
{
	mblk_t	*save_mp;

	save_mp = allocb(sizeof (ifrt_t), BPRI_MED);
	if (save_mp != NULL) {
		ifrt_t	*ifrt;

		save_mp->b_wptr += sizeof (ifrt_t);
		ifrt = (ifrt_t *)save_mp->b_rptr;
		bzero(ifrt, sizeof (ifrt_t));
		ifrt->ifrt_type = ire->ire_type;
		ifrt->ifrt_addr = ire->ire_addr;
		ifrt->ifrt_gateway_addr = ire->ire_gateway_addr;
		ifrt->ifrt_src_addr = ire->ire_src_addr;
		ifrt->ifrt_mask = ire->ire_mask;
		ifrt->ifrt_flags = ire->ire_flags;
		ifrt->ifrt_max_frag = ire->ire_max_frag;
		mutex_enter(&ipif->ipif_saved_ire_lock);
		save_mp->b_cont = ipif->ipif_saved_ire_mp;
		ipif->ipif_saved_ire_mp = save_mp;
		ipif->ipif_saved_ire_cnt++;
		mutex_exit(&ipif->ipif_saved_ire_lock);
	}
}

static void
ipif_remove_ire(ipif_t *ipif, ire_t *ire)
{
	mblk_t	**mpp;
	mblk_t	*mp;
	ifrt_t	*ifrt;

	/* Remove from ipif_saved_ire_mp list if it is there */
	mutex_enter(&ipif->ipif_saved_ire_lock);
	for (mpp = &ipif->ipif_saved_ire_mp; *mpp != NULL;
	    mpp = &(*mpp)->b_cont) {
		/*
		 * On a given ipif, the triple of address, gateway and
		 * mask is unique for each saved IRE (in the case of
		 * ordinary interface routes, the gateway address is
		 * all-zeroes).
		 */
		mp = *mpp;
		ifrt = (ifrt_t *)mp->b_rptr;
		if (ifrt->ifrt_addr == ire->ire_addr &&
		    ifrt->ifrt_gateway_addr == ire->ire_gateway_addr &&
		    ifrt->ifrt_mask == ire->ire_mask) {
			*mpp = mp->b_cont;
			ipif->ipif_saved_ire_cnt--;
			freeb(mp);
			break;
		}
	}
	mutex_exit(&ipif->ipif_saved_ire_lock);
}
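
/*
 * The removal above uses the classic double-indirection idiom: walking
 * with a pointer to the link field avoids carrying a separate "previous"
 * pointer. A standalone sketch (compiled out, illustrative only) of the
 * same idiom on a simple list:
 */
#ifdef notdef
struct node {
	struct node *n_next;
	int n_key;
};

static struct node *
example_unlink(struct node **headp, int key)
{
	struct node **npp;

	for (npp = headp; *npp != NULL; npp = &(*npp)->n_next) {
		if ((*npp)->n_key == key) {
			struct node *doomed = *npp;

			*npp = doomed->n_next;	/* splice out of the list */
			return (doomed);	/* caller frees it */
		}
	}
	return (NULL);
}
#endif /* notdef */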

/*
 * IP multirouting broadcast routes handling
 * Append CGTP broadcast IREs to regular ones created
 * at ifconfig time.
 */
static void
ip_cgtp_bcast_add(ire_t *ire, ire_t *ire_dst, ip_stack_t *ipst)
{
	ire_t	*ire_prim;

	ASSERT(ire != NULL);
	ASSERT(ire_dst != NULL);

	ire_prim = ire_ctable_lookup(ire->ire_gateway_addr, 0,
	    IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
	if (ire_prim != NULL) {
		/*
		 * We are in the special case of broadcasts for
		 * CGTP. We add an IRE_BROADCAST that holds
		 * the RTF_MULTIRT flag, the destination
		 * address of ire_dst and the low level
		 * info of ire_prim. In other words, CGTP
		 * broadcast is added to the redundant ipif.
		 */
		ipif_t	*ipif_prim;
		ire_t	*bcast_ire;

		ipif_prim = ire_prim->ire_ipif;

		ip2dbg(("ip_cgtp_filter_bcast_add: "
		    "ire_dst %p, ire_prim %p, ipif_prim %p\n",
		    (void *)ire_dst, (void *)ire_prim,
		    (void *)ipif_prim));

		bcast_ire = ire_create(
		    (uchar_t *)&ire->ire_addr,
		    (uchar_t *)&ip_g_all_ones,
		    (uchar_t *)&ire_dst->ire_src_addr,
		    (uchar_t *)&ire->ire_gateway_addr,
		    NULL,
		    &ipif_prim->ipif_mtu,
		    NULL,
		    ipif_prim->ipif_rq,
		    ipif_prim->ipif_wq,
		    IRE_BROADCAST,
		    ipif_prim->ipif_bcast_mp,
		    ipif_prim,
		    NULL,
		    0,
		    0,
		    0,
		    ire->ire_flags,
		    &ire_uinfo_null,
		    NULL,
		    NULL,
		    ipst);

		if (bcast_ire != NULL) {

			if (ire_add(&bcast_ire, NULL, NULL, NULL,
			    B_FALSE) == 0) {
				ip2dbg(("ip_cgtp_filter_bcast_add: "
				    "added bcast_ire %p\n",
				    (void *)bcast_ire));

				ipif_save_ire(bcast_ire->ire_ipif,
				    bcast_ire);
				ire_refrele(bcast_ire);
			}
		}
		ire_refrele(ire_prim);
	}
}

/*
 * IP multirouting broadcast routes handling
 * Remove the broadcast ire
 */
static void
ip_cgtp_bcast_delete(ire_t *ire, ip_stack_t *ipst)
{
	ire_t	*ire_dst;

	ASSERT(ire != NULL);
	ire_dst = ire_ctable_lookup(ire->ire_addr, 0, IRE_BROADCAST,
	    NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
	if (ire_dst != NULL) {
		ire_t	*ire_prim;

		ire_prim = ire_ctable_lookup(ire->ire_gateway_addr, 0,
		    IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
		if (ire_prim != NULL) {
			ipif_t	*ipif_prim;
			ire_t	*bcast_ire;

			ipif_prim = ire_prim->ire_ipif;

			ip2dbg(("ip_cgtp_filter_bcast_delete: "
			    "ire_dst %p, ire_prim %p, ipif_prim %p\n",
			    (void *)ire_dst, (void *)ire_prim,
			    (void *)ipif_prim));

			bcast_ire = ire_ctable_lookup(ire->ire_addr,
			    ire->ire_gateway_addr,
			    IRE_BROADCAST,
			    ipif_prim, ALL_ZONES,
			    NULL,
			    MATCH_IRE_TYPE | MATCH_IRE_GW | MATCH_IRE_IPIF |
			    MATCH_IRE_MASK, ipst);

			if (bcast_ire != NULL) {
				ip2dbg(("ip_cgtp_filter_bcast_delete: "
				    "looked up bcast_ire %p\n",
				    (void *)bcast_ire));
				ipif_remove_ire(bcast_ire->ire_ipif,
				    bcast_ire);
				ire_delete(bcast_ire);
			}
			ire_refrele(ire_prim);
		}
		ire_refrele(ire_dst);
	}
}

/*
 * IPsec hardware acceleration capabilities related functions.
 */

/*
 * Free a per-ill IPsec capabilities structure.
 */
static void
ill_ipsec_capab_free(ill_ipsec_capab_t *capab)
{
	if (capab->auth_hw_algs != NULL)
		kmem_free(capab->auth_hw_algs, capab->algs_size);
	if (capab->encr_hw_algs != NULL)
		kmem_free(capab->encr_hw_algs, capab->algs_size);
	if (capab->encr_algparm != NULL)
		kmem_free(capab->encr_algparm, capab->encr_algparm_size);
	kmem_free(capab, sizeof (ill_ipsec_capab_t));
}
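
/*
 * The allocator below sizes its per-algorithm tables at one bit per
 * algorithm. A standalone sketch (compiled out, illustrative names only)
 * of the same sizing and bit set/test arithmetic, with uint32_t standing
 * in for ipsec_capab_elem_t:
 */
#ifdef notdef
#define	EXAMPLE_MAX_ALGS	256
typedef uint32_t example_elem_t;
#define	EXAMPLE_BITS	(8 * sizeof (example_elem_t))

static example_elem_t example_algs[EXAMPLE_MAX_ALGS / EXAMPLE_BITS];

static void
example_alg_enable(uint_t algid)
{
	example_algs[algid / EXAMPLE_BITS] |= 1U << (algid % EXAMPLE_BITS);
}

static boolean_t
example_alg_is_enabled(uint_t algid)
{
	return ((example_algs[algid / EXAMPLE_BITS] &
	    (1U << (algid % EXAMPLE_BITS))) != 0);
}
#endif /* notdef */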

/*
 * Allocate a new per-ill IPsec capabilities structure. This structure
 * is specific to an IPsec protocol (AH or ESP). It is implemented as
 * an array which specifies, for each algorithm, whether this algorithm
 * is supported by the ill or not.
 */
static ill_ipsec_capab_t *
ill_ipsec_capab_alloc(void)
{
	ill_ipsec_capab_t *capab;
	uint_t nelems;

	capab = kmem_zalloc(sizeof (ill_ipsec_capab_t), KM_NOSLEEP);
	if (capab == NULL)
		return (NULL);

	/* we need one bit per algorithm */
	nelems = MAX_IPSEC_ALGS / BITS(ipsec_capab_elem_t);
	capab->algs_size = nelems * sizeof (ipsec_capab_elem_t);

	/* allocate memory to store algorithm flags */
	capab->encr_hw_algs = kmem_zalloc(capab->algs_size, KM_NOSLEEP);
	if (capab->encr_hw_algs == NULL)
		goto nomem;
	capab->auth_hw_algs = kmem_zalloc(capab->algs_size, KM_NOSLEEP);
	if (capab->auth_hw_algs == NULL)
		goto nomem;
	/*
	 * Leave encr_algparm NULL for now since we won't need it half
	 * the time.
	 */
	return (capab);

nomem:
	ill_ipsec_capab_free(capab);
	return (NULL);
}

/*
 * Resize capability array. Since we're exclusive, this is OK.
 */
static boolean_t
ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *capab, int algid)
{
	ipsec_capab_algparm_t *nalp, *oalp;
	uint32_t olen, nlen;

	oalp = capab->encr_algparm;
	olen = capab->encr_algparm_size;

	if (oalp != NULL) {
		if (algid < capab->encr_algparm_end)
			return (B_TRUE);
	}

	nlen = (algid + 1) * sizeof (*nalp);
	nalp = kmem_zalloc(nlen, KM_NOSLEEP);
	if (nalp == NULL)
		return (B_FALSE);

	if (oalp != NULL) {
		bcopy(oalp, nalp, olen);
		kmem_free(oalp, olen);
	}
	capab->encr_algparm = nalp;
	capab->encr_algparm_size = nlen;
	capab->encr_algparm_end = algid + 1;

	return (B_TRUE);
}
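
/*
 * The resize above follows the classic "allocate new, copy old, free
 * old" pattern, since there is no kernel realloc here. A userland
 * analogue of the same pattern (compiled out, illustrative names only):
 */
#ifdef notdef
#include <stdlib.h>
#include <string.h>

static int
example_grow(void **arrayp, size_t *sizep, size_t needed)
{
	void *narr;

	if (needed <= *sizep)
		return (0);		/* already large enough */
	narr = calloc(1, needed);
	if (narr == NULL)
		return (-1);
	if (*arrayp != NULL) {
		memcpy(narr, *arrayp, *sizep);	/* preserve old entries */
		free(*arrayp);
	}
	*arrayp = narr;
	*sizep = needed;
	return (0);
}
#endif /* notdef */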

/*
 * Compare the capabilities of the specified ill with the protocol
 * and algorithms specified by the SA passed as argument.
 * If they match, returns B_TRUE, B_FALSE if they do not match.
 *
 * The ill can be passed as a pointer to it, or by specifying its index
 * and whether it is an IPv6 ill (ill_index and ill_isv6 arguments).
 *
 * Called by ipsec_out_is_accelerated() to decide whether an outbound
 * packet is eligible for hardware acceleration, and by
 * ill_ipsec_capab_send_all() to decide whether an SA must be sent down
 * to a particular ill.
 */
boolean_t
ipsec_capab_match(ill_t *ill, uint_t ill_index, boolean_t ill_isv6,
    ipsa_t *sa, netstack_t *ns)
{
	boolean_t sa_isv6;
	uint_t algid;
	struct ill_ipsec_capab_s *cpp;
	boolean_t need_refrele = B_FALSE;
	ip_stack_t *ipst = ns->netstack_ip;

	if (ill == NULL) {
		ill = ill_lookup_on_ifindex(ill_index, ill_isv6, NULL,
		    NULL, NULL, NULL, ipst);
		if (ill == NULL) {
			ip0dbg(("ipsec_capab_match: ill doesn't exist\n"));
			return (B_FALSE);
		}
		need_refrele = B_TRUE;
	}

	/*
	 * Use the address length specified by the SA to determine
	 * if it corresponds to an IPv6 address, and fail the matching
	 * if the isv6 flag passed as argument does not match.
	 * Note: this check is used for SADB capability checking before
	 * sending SA information to an ill.
	 */
	sa_isv6 = (sa->ipsa_addrfam == AF_INET6);
	if (sa_isv6 != ill_isv6)
		/* protocol mismatch */
		goto done;

	/*
	 * Check if the ill supports the protocol, algorithm(s) and
	 * key size(s) specified by the SA, and get the pointers to
	 * the algorithms supported by the ill.
	 */
	switch (sa->ipsa_type) {

	case SADB_SATYPE_ESP:
		if (!(ill->ill_capabilities & ILL_CAPAB_ESP))
			/* ill does not support ESP acceleration */
			goto done;
		cpp = ill->ill_ipsec_capab_esp;
		algid = sa->ipsa_auth_alg;
		if (!IPSEC_ALG_IS_ENABLED(algid, cpp->auth_hw_algs))
			goto done;
		algid = sa->ipsa_encr_alg;
		if (!IPSEC_ALG_IS_ENABLED(algid, cpp->encr_hw_algs))
			goto done;
		if (algid < cpp->encr_algparm_end) {
			ipsec_capab_algparm_t *alp = &cpp->encr_algparm[algid];
			if (sa->ipsa_encrkeybits < alp->minkeylen)
				goto done;
			if (sa->ipsa_encrkeybits > alp->maxkeylen)
				goto done;
		}
		break;

	case SADB_SATYPE_AH:
		if (!(ill->ill_capabilities & ILL_CAPAB_AH))
			/* ill does not support AH acceleration */
			goto done;
		if (!IPSEC_ALG_IS_ENABLED(sa->ipsa_auth_alg,
		    ill->ill_ipsec_capab_ah->auth_hw_algs))
			goto done;
		break;
	}

	if (need_refrele)
		ill_refrele(ill);
	return (B_TRUE);
done:
	if (need_refrele)
		ill_refrele(ill);
	return (B_FALSE);
}
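
/*
 * Minimal caller sketch (compiled out, not from the original source):
 * matching by interface index when no ill_t pointer is at hand. The
 * hold taken internally on the looked-up ill is released before
 * ipsec_capab_match() returns, so the caller has nothing to release.
 */
#ifdef notdef
static boolean_t
example_sa_accelerated(uint_t ifindex, boolean_t isv6, ipsa_t *sa,
    netstack_t *ns)
{
	/* NULL ill: ipsec_capab_match() looks it up from (ifindex, isv6) */
	return (ipsec_capab_match(NULL, ifindex, isv6, sa, ns));
}
#endif /* notdef */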

/*
 * Add a new ill to the list of IPsec capable ills.
 * Called from ill_capability_ipsec_ack() when an ACK was received
 * indicating that IPsec hardware processing was enabled for an ill.
 *
 * ill must point to the ill for which acceleration was enabled.
 * dl_cap must be set to DL_CAPAB_IPSEC_AH or DL_CAPAB_IPSEC_ESP.
 */
static void
ill_ipsec_capab_add(ill_t *ill, uint_t dl_cap, boolean_t sadb_resync)
{
	ipsec_capab_ill_t **ills, *cur_ill, *new_ill;
	uint_t sa_type;
	uint_t ipproto;
	ip_stack_t *ipst = ill->ill_ipst;

	ASSERT((dl_cap == DL_CAPAB_IPSEC_AH) ||
	    (dl_cap == DL_CAPAB_IPSEC_ESP));

	switch (dl_cap) {
	case DL_CAPAB_IPSEC_AH:
		sa_type = SADB_SATYPE_AH;
		ills = &ipst->ips_ipsec_capab_ills_ah;
		ipproto = IPPROTO_AH;
		break;
	case DL_CAPAB_IPSEC_ESP:
		sa_type = SADB_SATYPE_ESP;
		ills = &ipst->ips_ipsec_capab_ills_esp;
		ipproto = IPPROTO_ESP;
		break;
	}

	rw_enter(&ipst->ips_ipsec_capab_ills_lock, RW_WRITER);

	/*
	 * Add ill index to list of hardware accelerators. If
	 * already in list, do nothing.
	 */
	for (cur_ill = *ills; cur_ill != NULL &&
	    (cur_ill->ill_index != ill->ill_phyint->phyint_ifindex ||
	    cur_ill->ill_isv6 != ill->ill_isv6); cur_ill = cur_ill->next)
		;

	if (cur_ill == NULL) {
		/* if this is a new entry for this ill */
		new_ill = kmem_zalloc(sizeof (ipsec_capab_ill_t), KM_NOSLEEP);
		if (new_ill == NULL) {
			rw_exit(&ipst->ips_ipsec_capab_ills_lock);
			return;
		}

		new_ill->ill_index = ill->ill_phyint->phyint_ifindex;
		new_ill->ill_isv6 = ill->ill_isv6;
		new_ill->next = *ills;
		*ills = new_ill;
	} else if (!sadb_resync) {
		/* not resync'ing SADB and an entry exists for this ill */
		rw_exit(&ipst->ips_ipsec_capab_ills_lock);
		return;
	}

	rw_exit(&ipst->ips_ipsec_capab_ills_lock);

	if (ipst->ips_ipcl_proto_fanout_v6[ipproto].connf_head != NULL)
		/*
		 * IPsec module for protocol loaded, initiate dump
		 * of the SADB to this ill.
		 */
		sadb_ill_download(ill, sa_type);
}

/*
 * Remove an ill from the list of IPsec capable ills.
 */
static void
ill_ipsec_capab_delete(ill_t *ill, uint_t dl_cap)
{
	ipsec_capab_ill_t **ills, *cur_ill, *prev_ill;
	ip_stack_t *ipst = ill->ill_ipst;

	ASSERT(dl_cap == DL_CAPAB_IPSEC_AH ||
	    dl_cap == DL_CAPAB_IPSEC_ESP);

	ills = (dl_cap == DL_CAPAB_IPSEC_AH) ? &ipst->ips_ipsec_capab_ills_ah :
	    &ipst->ips_ipsec_capab_ills_esp;

	rw_enter(&ipst->ips_ipsec_capab_ills_lock, RW_WRITER);

	prev_ill = NULL;
	for (cur_ill = *ills; cur_ill != NULL && (cur_ill->ill_index !=
	    ill->ill_phyint->phyint_ifindex || cur_ill->ill_isv6 !=
	    ill->ill_isv6); prev_ill = cur_ill, cur_ill = cur_ill->next)
		;
	if (cur_ill == NULL) {
		/* entry not found */
		rw_exit(&ipst->ips_ipsec_capab_ills_lock);
		return;
	}
	if (prev_ill == NULL) {
		/*
		 * Entry at front of list; unlink just this entry so that
		 * any entries after it are preserved.
		 */
		*ills = cur_ill->next;
	} else {
		prev_ill->next = cur_ill->next;
	}
	kmem_free(cur_ill, sizeof (ipsec_capab_ill_t));
	rw_exit(&ipst->ips_ipsec_capab_ills_lock);
}
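
/*
 * The routine below collects work under ipsec_capab_ills_lock, queues a
 * copy of the message per ill on a private ship list, drops the lock,
 * and only then sends. A standalone userland analogue of that "collect
 * under lock, dispatch after unlock" pattern (compiled out; all names
 * are illustrative only):
 */
#ifdef notdef
#include <pthread.h>
#include <stdlib.h>

struct work {
	struct work *w_next;
	int w_payload;
};

static pthread_mutex_t example_lock = PTHREAD_MUTEX_INITIALIZER;

static void
example_dispatch_all(struct work *registry, void (*sendfn)(struct work *))
{
	struct work *w, *next, *ship_list = NULL;

	(void) pthread_mutex_lock(&example_lock);
	for (w = registry; w != NULL; w = w->w_next) {
		/* clone each entry so it can be sent after unlocking */
		struct work *copy = malloc(sizeof (*copy));

		if (copy == NULL)
			continue;
		copy->w_payload = w->w_payload;
		copy->w_next = ship_list;
		ship_list = copy;
	}
	(void) pthread_mutex_unlock(&example_lock);

	for (w = ship_list; w != NULL; w = next) {
		/* sendfn() may block; the registry lock is not held here */
		next = w->w_next;
		w->w_next = NULL;
		sendfn(w);
	}
}
#endif /* notdef */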

/*
 * Called by SADB to send a DL_CONTROL_REQ message to every ill
 * supporting the specified IPsec protocol acceleration.
 * sa_type must be SADB_SATYPE_AH or SADB_SATYPE_ESP.
 * We free the mblk and, if sa is non-null, release the held reference.
 */
void
ill_ipsec_capab_send_all(uint_t sa_type, mblk_t *mp, ipsa_t *sa,
    netstack_t *ns)
{
	ipsec_capab_ill_t *ici, *cur_ici;
	ill_t *ill;
	mblk_t *nmp, *mp_ship_list = NULL, *next_mp;
	ip_stack_t *ipst = ns->netstack_ip;

	ici = (sa_type == SADB_SATYPE_AH) ? ipst->ips_ipsec_capab_ills_ah :
	    ipst->ips_ipsec_capab_ills_esp;

	rw_enter(&ipst->ips_ipsec_capab_ills_lock, RW_READER);

	for (cur_ici = ici; cur_ici != NULL; cur_ici = cur_ici->next) {
		ill = ill_lookup_on_ifindex(cur_ici->ill_index,
		    cur_ici->ill_isv6, NULL, NULL, NULL, NULL, ipst);

		/*
		 * Handle the case where the ill goes away while the SADB is
		 * attempting to send messages. If it's going away, it's
		 * nuking its shadow SADB, so we don't care.
		 */

		if (ill == NULL)
			continue;

		if (sa != NULL) {
			/*
			 * Make sure capabilities match before
			 * sending SA to ill.
			 */
			if (!ipsec_capab_match(ill, cur_ici->ill_index,
			    cur_ici->ill_isv6, sa, ipst->ips_netstack)) {
				ill_refrele(ill);
				continue;
			}

			mutex_enter(&sa->ipsa_lock);
			sa->ipsa_flags |= IPSA_F_HW;
			mutex_exit(&sa->ipsa_lock);
		}

		/*
		 * Copy the template message, and add it to the front
		 * of the mblk ship list. We want to avoid holding
		 * the ipsec_capab_ills_lock while sending the
		 * message to the ills.
		 *
		 * The b_next and b_prev are temporarily used
		 * to build a list of mblks to be sent down, and to
		 * save the ill to which they must be sent.
		 */
		nmp = copymsg(mp);
		if (nmp == NULL) {
			ill_refrele(ill);
			continue;
		}
		ASSERT(nmp->b_next == NULL && nmp->b_prev == NULL);
		nmp->b_next = mp_ship_list;
		mp_ship_list = nmp;
		nmp->b_prev = (mblk_t *)ill;
	}

	rw_exit(&ipst->ips_ipsec_capab_ills_lock);

	for (nmp = mp_ship_list; nmp != NULL; nmp = next_mp) {
		/* restore the mblk to a sane state */
		next_mp = nmp->b_next;
		nmp->b_next = NULL;
		ill = (ill_t *)nmp->b_prev;
		nmp->b_prev = NULL;

		ill_dlpi_send(ill, nmp);
		ill_refrele(ill);
	}

	if (sa != NULL)
		IPSA_REFRELE(sa);
	freemsg(mp);
}

/*
 * Derive an interface id from the link layer address.
 * Knows about IEEE 802 and IEEE EUI-64 mappings.
 */
static boolean_t
ip_ether_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr)
{
	char	*addr;

	if (phys_length != ETHERADDRL)
		return (B_FALSE);

	/* Form EUI-64 like address */
	addr = (char *)&v6addr->s6_addr32[2];
	bcopy((char *)phys_addr, addr, 3);
	addr[0] ^= 0x2;		/* Toggle Universal/Local bit */
	addr[3] = (char)0xff;
	addr[4] = (char)0xfe;
	bcopy((char *)phys_addr + 3, addr + 5, 3);
	return (B_TRUE);
}
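
/*
 * Worked example of the EUI-64 expansion above (compiled out, not part
 * of the original source): the MAC address 00:14:4f:1e:22:33 becomes
 * the interface id 02:14:4f:ff:fe:1e:22:33 -- the universal/local bit
 * of the first octet is toggled and 0xfffe is inserted between the OUI
 * and the device-specific bytes.
 */
#ifdef notdef
static void
example_mac_to_eui64(const uint8_t mac[6], uint8_t ifid[8])
{
	ifid[0] = mac[0] ^ 0x02;	/* toggle the u/l bit */
	ifid[1] = mac[1];
	ifid[2] = mac[2];
	ifid[3] = 0xff;			/* EUI-64 filler bytes */
	ifid[4] = 0xfe;
	ifid[5] = mac[3];
	ifid[6] = mac[4];
	ifid[7] = mac[5];
}
#endif /* notdef */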

/* ARGSUSED */
static boolean_t
ip_nodef_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr)
{
	return (B_FALSE);
}

/* ARGSUSED */
static boolean_t
ip_ether_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr,
    uint32_t *hw_start, in6_addr_t *v6_extract_mask)
{
	/*
	 * Multicast address mappings used over Ethernet/802.X.
	 * This address is used as a base for mappings.
	 */
	static uint8_t ipv6_g_phys_multi_addr[] = {0x33, 0x33, 0x00,
	    0x00, 0x00, 0x00};

	/*
	 * Extract low order 32 bits from IPv6 multicast address.
	 * Or that into the link layer address, starting from the
	 * second byte.
	 */
	*hw_start = 2;
	v6_extract_mask->s6_addr32[0] = 0;
	v6_extract_mask->s6_addr32[1] = 0;
	v6_extract_mask->s6_addr32[2] = 0;
	v6_extract_mask->s6_addr32[3] = 0xffffffffU;
	bcopy(ipv6_g_phys_multi_addr, maddr, lla_length);
	return (B_TRUE);
}

/*
 * Indicate by return value whether multicast is supported. If not,
 * this code should not touch/change any parameters.
 */
/* ARGSUSED */
static boolean_t
ip_ether_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr,
    uint32_t *hw_start, ipaddr_t *extract_mask)
{
	/*
	 * Multicast address mappings used over Ethernet/802.X.
	 * This address is used as a base for mappings.
	 */
	static uint8_t ip_g_phys_multi_addr[] = { 0x01, 0x00, 0x5e,
	    0x00, 0x00, 0x00 };

	if (phys_length != ETHERADDRL)
		return (B_FALSE);

	*extract_mask = htonl(0x007fffff);
	*hw_start = 2;
	bcopy(ip_g_phys_multi_addr, maddr, ETHERADDRL);
	return (B_TRUE);
}
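
/*
 * Worked example of the IPv4 mapping above (compiled out, not part of
 * the original source): group 224.1.2.3 maps to 01:00:5e:01:02:03 --
 * the low 23 bits of the group address (mask 0x007fffff) are OR'd into
 * the base 01:00:5e:00:00:00 starting at byte 2, per RFC 1112.
 */
#ifdef notdef
static void
example_v4_group_to_ether(ipaddr_t group, uint8_t lladdr[6])
{
	uint32_t low23 = ntohl(group) & 0x007fffff;

	lladdr[0] = 0x01;
	lladdr[1] = 0x00;
	lladdr[2] = 0x5e;
	lladdr[3] = (low23 >> 16) & 0xff;
	lladdr[4] = (low23 >> 8) & 0xff;
	lladdr[5] = low23 & 0xff;
}
#endif /* notdef */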

/*
 * Derive IPoIB interface id from the link layer address.
 */
static boolean_t
ip_ib_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr)
{
	char	*addr;

	if (phys_length != 20)
		return (B_FALSE);
	addr = (char *)&v6addr->s6_addr32[2];
	bcopy(phys_addr + 12, addr, 8);
	/*
	 * In the IBA 1.1 timeframe, some vendors erroneously set the u/l bit
	 * in the globally assigned EUI-64 GUID to 1, in violation of IEEE
	 * rules. In these cases, the IBA considers these GUIDs to be in
	 * "Modified EUI-64" format, and thus toggling the u/l bit is not
	 * required; vendors are required not to assign global EUI-64's
	 * that differ only in u/l bit values, thus guaranteeing uniqueness
	 * of the interface identifier. Whether the GUID is in modified
	 * or proper EUI-64 format, the IPv6 identifier must have the u/l
	 * bit set to 1.
	 */
	addr[0] |= 2;			/* Set Universal/Local bit to 1 */
	return (B_TRUE);
}

/*
 * Note on mapping from multicast IP addresses to IPoIB multicast link
 * addresses. IPoIB multicast link addresses are based on IBA link addresses.
 * The format of an IPoIB multicast address is:
 *
 *	4 byte QPN        Scope  Sign.  Pkey
 * +---------------------------------------------+
 * | 00FFFFFF | FF | 1X |  X01B | Pkey | GroupID |
 * +---------------------------------------------+
 *
 * The Scope and Pkey components are properties of the IBA port and
 * network interface. They can be ascertained from the broadcast address.
 * The Sign. part is the signature, and is 401B for IPv4 and 601B for IPv6.
 */

static boolean_t
ip_ib_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr,
    uint32_t *hw_start, in6_addr_t *v6_extract_mask)
{
	/*
	 * Base IPoIB IPv6 multicast address used for mappings.
	 * Does not contain the IBA scope/Pkey values.
	 */
	static uint8_t ipv6_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff,
	    0xff, 0x10, 0x60, 0x1b, 0x00, 0x00, 0x00, 0x00,
	    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };

	/*
	 * Extract low order 80 bits from IPv6 multicast address.
	 * Or that into the link layer address, starting from the
	 * sixth byte.
	 */
	*hw_start = 6;
	bcopy(ipv6_g_phys_ibmulti_addr, maddr, lla_length);

	/*
	 * Now fill in the IBA scope/Pkey values from the broadcast address.
	 */
	*(maddr + 5) = *(bphys_addr + 5);
	*(maddr + 8) = *(bphys_addr + 8);
	*(maddr + 9) = *(bphys_addr + 9);

	v6_extract_mask->s6_addr32[0] = 0;
	v6_extract_mask->s6_addr32[1] = htonl(0x0000ffff);
	v6_extract_mask->s6_addr32[2] = 0xffffffffU;
	v6_extract_mask->s6_addr32[3] = 0xffffffffU;
	return (B_TRUE);
}

static boolean_t
ip_ib_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr,
    uint32_t *hw_start, ipaddr_t *extract_mask)
{
	/*
	 * Base IPoIB IPv4 multicast address used for mappings.
	 * Does not contain the IBA scope/Pkey values.
	 */
	static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff,
	    0xff, 0x10, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
	    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };

	if (phys_length != sizeof (ipv4_g_phys_ibmulti_addr))
		return (B_FALSE);

	/*
	 * Extract low order 28 bits from IPv4 multicast address.
	 * Or that into the link layer address, starting from the
	 * sixteenth byte.
	 */
	*extract_mask = htonl(0x0fffffff);
	*hw_start = 16;
	bcopy(ipv4_g_phys_ibmulti_addr, maddr, phys_length);

	/*
	 * Now fill in the IBA scope/Pkey values from the broadcast address.
	 */
	*(maddr + 5) = *(bphys_addr + 5);
	*(maddr + 8) = *(bphys_addr + 8);
	*(maddr + 9) = *(bphys_addr + 9);
	return (B_TRUE);
}

/*
 * Returns B_TRUE if an ipif is present in the given zone, matching some flags
 * (typically IPIF_UP). If ipifp is non-null, the held ipif is returned there.
 * This works for both IPv4 and IPv6; if the passed-in ill is v6, the ipif with
 * the link-local address is preferred.
 */
boolean_t
ipif_lookup_zoneid(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp)
{
	ipif_t	*ipif;
	ipif_t	*maybe_ipif = NULL;

	mutex_enter(&ill->ill_lock);
	if (ill->ill_state_flags & ILL_CONDEMNED) {
		mutex_exit(&ill->ill_lock);
		if (ipifp != NULL)
			*ipifp = NULL;
		return (B_FALSE);
	}
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		if (!IPIF_CAN_LOOKUP(ipif))
			continue;
		if (zoneid != ALL_ZONES && ipif->ipif_zoneid != zoneid &&
		    ipif->ipif_zoneid != ALL_ZONES)
			continue;
		if ((ipif->ipif_flags & flags) != flags)
			continue;

		if (ipifp == NULL) {
			mutex_exit(&ill->ill_lock);
			ASSERT(maybe_ipif == NULL);
			return (B_TRUE);
		}
		if (!ill->ill_isv6 ||
		    IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6src_addr)) {
			ipif_refhold_locked(ipif);
			mutex_exit(&ill->ill_lock);
			*ipifp = ipif;
			return (B_TRUE);
		}
		if (maybe_ipif == NULL)
			maybe_ipif = ipif;
	}
	if (ipifp != NULL) {
		if (maybe_ipif != NULL)
			ipif_refhold_locked(maybe_ipif);
		*ipifp = maybe_ipif;
	}
	mutex_exit(&ill->ill_lock);
	return (maybe_ipif != NULL);
}
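
/*
 * Caller sketch (compiled out, not from the original source): the ipif
 * returned through ipifp comes back held and must be released with
 * ipif_refrele() once the caller is done with it.
 */
#ifdef notdef
static void
example_zone_has_up_ipif(ill_t *ill, zoneid_t zoneid)
{
	ipif_t *ipif;

	if (ipif_lookup_zoneid(ill, zoneid, IPIF_UP, &ipif)) {
		/* ... use ipif ... */
		ipif_refrele(ipif);	/* drop the hold taken for us */
	}
}
#endif /* notdef */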

/*
 * Same as ipif_lookup_zoneid() but looks at all the ills in the same group.
 */
boolean_t
ipif_lookup_zoneid_group(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp)
{
	ill_t	*illg;
	ip_stack_t *ipst = ill->ill_ipst;

	/*
	 * We look at the passed-in ill first without grabbing ill_g_lock.
	 */
	if (ipif_lookup_zoneid(ill, zoneid, flags, ipifp)) {
		return (B_TRUE);
	}
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	if (ill->ill_group == NULL) {
		/* ill not in a group */
		rw_exit(&ipst->ips_ill_g_lock);
		return (B_FALSE);
	}

	/*
	 * There's no ipif in the zone on ill; however, ill is part of an IPMP
	 * group, so we need to look for an ipif in the zone on all the ills
	 * in the group.
	 */
	illg = ill->ill_group->illgrp_ill;
	do {
		/*
		 * We don't call ipif_lookup_zoneid() on ill as we already know
		 * that it's not there.
		 */
		if (illg != ill &&
		    ipif_lookup_zoneid(illg, zoneid, flags, ipifp)) {
			break;
		}
	} while ((illg = illg->ill_group_next) != NULL);
	rw_exit(&ipst->ips_ill_g_lock);
	return (illg != NULL);
}

/*
 * Check if this ill is only being used to send ICMP probes for IPMP.
 */
boolean_t
ill_is_probeonly(ill_t *ill)
{
	/*
	 * Check if the interface is FAILED or INACTIVE.
	 */
	if (ill->ill_phyint->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE))
		return (B_TRUE);

	return (B_FALSE);
}

/*
 * Return a pointer to an ipif_t given a combination of (ifindex, lifidx).
 * If a pointer to an ipif_t is returned then the caller will need to do
 * an ipif_refrele().
 *
 * If there is no real interface which matches the ifindex, then it looks
 * for a group that has a matching index. In the case of a group match the
 * lifidx must be zero. We don't need to emulate the logical interfaces
 * since IP Filter's use of netinfo doesn't use that.
 */
ipif_t *
ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6,
    ip_stack_t *ipst)
{
	ipif_t	*ipif;
	ill_t	*ill;

	ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL,
	    ipst);

	if (ill == NULL) {
		/* Fall back to group names only if hook_emulation is set */
		if (!ipst->ips_ipmp_hook_emulation)
			return (NULL);

		if (lifidx != 0)
			return (NULL);
		ill = ill_group_lookup_on_ifindex(ifindex, isv6, ipst);
		if (ill == NULL)
			return (NULL);
	}

	mutex_enter(&ill->ill_lock);
	if (ill->ill_state_flags & ILL_CONDEMNED) {
		mutex_exit(&ill->ill_lock);
		ill_refrele(ill);
		return (NULL);
	}

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		if (!IPIF_CAN_LOOKUP(ipif))
			continue;
		if (lifidx == ipif->ipif_id) {
			ipif_refhold_locked(ipif);
			break;
		}
	}

	mutex_exit(&ill->ill_lock);
	ill_refrele(ill);
	return (ipif);
}

/*
 * Flush the fastpath by deleting any nce's that are waiting for the fastpath.
 * There is one exception: IRE_BROADCASTs are difficult to recreate, so
 * instead we just nuke their nce_fp_mp's; see ndp_fastpath_flush()
 * for details.
 */
void
ill_fastpath_flush(ill_t *ill)
{
	ip_stack_t *ipst = ill->ill_ipst;

	nce_fastpath_list_dispatch(ill, NULL, NULL);
	ndp_walk_common((ill->ill_isv6 ? ipst->ips_ndp6 : ipst->ips_ndp4),
	    ill, (pfi_t)ndp_fastpath_flush, NULL, B_TRUE);
}
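
/*
 * Caller sketch for ipif_getby_indexes() above (compiled out, not from
 * the original source): translating an (ifindex, lifidx) pair, e.g.
 * from an IP Filter netinfo hook, into a held ipif.
 */
#ifdef notdef
static void
example_use_indexes(uint_t ifindex, uint_t lifidx, ip_stack_t *ipst)
{
	ipif_t *ipif;

	ipif = ipif_getby_indexes(ifindex, lifidx, B_FALSE, ipst);
	if (ipif != NULL) {
		/* ... use ipif ... */
		ipif_refrele(ipif);	/* release the hold taken above */
	}
}
#endif /* notdef */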

/*
 * Set the physical address information for `ill' to the contents of the
 * dl_notify_ind_t pointed to by `mp'. Must be called as writer, and will be
 * asynchronous if `ill' cannot immediately be quiesced -- in which case
 * EINPROGRESS will be returned.
 */
int
ill_set_phys_addr(ill_t *ill, mblk_t *mp)
{
	ipsq_t	*ipsq = ill->ill_phyint->phyint_ipsq;
	dl_notify_ind_t	*dlindp = (dl_notify_ind_t *)mp->b_rptr;

	ASSERT(IAM_WRITER_IPSQ(ipsq));

	if (dlindp->dl_data != DL_IPV6_LINK_LAYER_ADDR &&
	    dlindp->dl_data != DL_CURR_PHYS_ADDR) {
		/* Changing DL_IPV6_TOKEN is not yet supported */
		return (0);
	}

	/*
	 * We need to store up to two copies of `mp' in `ill'. Due to the
	 * design of ipsq_pending_mp_add(), we can't pass them as separate
	 * arguments to ill_set_phys_addr_tail(). Instead, chain them
	 * together here, then pull 'em apart in ill_set_phys_addr_tail().
	 */
	if ((mp = copyb(mp)) == NULL || (mp->b_cont = copyb(mp)) == NULL) {
		freemsg(mp);
		return (ENOMEM);
	}

	ipsq_current_start(ipsq, ill->ill_ipif, 0);

	/*
	 * If we can quiesce the ill, then set the address. If not, then
	 * ill_set_phys_addr_tail() will be called from ipif_ill_refrele_tail().
	 */
	ill_down_ipifs(ill, NULL, 0, B_FALSE);
	mutex_enter(&ill->ill_lock);
	if (!ill_is_quiescent(ill)) {
		/* call cannot fail since `conn_t *' argument is NULL */
		(void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
		    mp, ILL_DOWN);
		mutex_exit(&ill->ill_lock);
		return (EINPROGRESS);
	}
	mutex_exit(&ill->ill_lock);

	ill_set_phys_addr_tail(ipsq, ill->ill_rq, mp, NULL);
	return (0);
}

/*
 * Once the ill associated with `q' has quiesced, set its physical address
 * information to the values in `addrmp'. Note that two copies of `addrmp'
 * are passed (linked by b_cont), since we sometimes need to save two distinct
 * copies in the ill_t, and our context doesn't permit sleeping or allocation
 * failure (we'll free the other copy if it's not needed). Since the ill_t
 * is quiesced, we know any stale IREs with the old address information have
 * already been removed, so we don't need to call ill_fastpath_flush().
 */
/* ARGSUSED */
static void
ill_set_phys_addr_tail(ipsq_t *ipsq, queue_t *q, mblk_t *addrmp, void *dummy)
{
	ill_t	*ill = q->q_ptr;
	mblk_t	*addrmp2 = unlinkb(addrmp);
	dl_notify_ind_t	*dlindp = (dl_notify_ind_t *)addrmp->b_rptr;
	uint_t	addrlen, addroff;

	ASSERT(IAM_WRITER_IPSQ(ipsq));

	addroff = dlindp->dl_addr_offset;
	addrlen = dlindp->dl_addr_length - ABS(ill->ill_sap_length);

	switch (dlindp->dl_data) {
	case DL_IPV6_LINK_LAYER_ADDR:
		ill_set_ndmp(ill, addrmp, addroff, addrlen);
		freemsg(addrmp2);
		break;

	case DL_CURR_PHYS_ADDR:
		freemsg(ill->ill_phys_addr_mp);
		ill->ill_phys_addr = addrmp->b_rptr + addroff;
		ill->ill_phys_addr_mp = addrmp;
		ill->ill_phys_addr_length = addrlen;

		if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV))
			ill_set_ndmp(ill, addrmp2, addroff, addrlen);
		else
			freemsg(addrmp2);
		break;
	default:
		ASSERT(0);
	}

	/*
	 * If there are ipifs to bring up, ill_up_ipifs() will return
	 * EINPROGRESS, and ipsq_current_finish() will be called by
	 * ip_rput_dlpi_writer() or ip_arp_done() when the last ipif is
	 * brought up.
	 */
	if (ill_up_ipifs(ill, q, addrmp) != EINPROGRESS)
		ipsq_current_finish(ipsq);
}
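
/*
 * Sketch of the two-copy handoff used above (compiled out, illustrative
 * only): the sender chains two independent copies of a message through
 * b_cont so they can travel through ipsq_pending_mp_add() as one mblk,
 * and the tail routine splits them again with unlinkb().
 */
#ifdef notdef
static int
example_two_copy_chain(mblk_t *mp)
{
	mblk_t *first, *second;

	/* sender side: chain two independent copies of mp through b_cont */
	if ((first = copyb(mp)) == NULL ||
	    (first->b_cont = copyb(mp)) == NULL) {
		freemsg(first);		/* frees whatever was allocated */
		return (ENOMEM);
	}

	/* receiver side: split the chain back into two messages */
	second = unlinkb(first);
	/* ... save first and second independently ... */
	return (0);
}
#endif /* notdef */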

/*
 * Helper routine for setting the ill_nd_lla fields.
 */
void
ill_set_ndmp(ill_t *ill, mblk_t *ndmp, uint_t addroff, uint_t addrlen)
{
	freemsg(ill->ill_nd_lla_mp);
	ill->ill_nd_lla = ndmp->b_rptr + addroff;
	ill->ill_nd_lla_mp = ndmp;
	ill->ill_nd_lla_len = addrlen;
}

major_t IP_MAJ;
#define	IP	"ip"

#define	UDP6DEV	"/devices/pseudo/udp6@0:udp6"
#define	UDPDEV	"/devices/pseudo/udp@0:udp"

/*
 * Issue REMOVEIF ioctls to have the loopback interfaces
 * go away. Other interfaces are either I_LINKed or I_PLINKed;
 * the former go away when the user-level processes in the zone
 * are killed and the latter are cleaned up by the stream head
 * str_stack_shutdown callback that undoes all I_PLINKs.
 */
void
ip_loopback_cleanup(ip_stack_t *ipst)
{
	int error;
	ldi_handle_t	lh = NULL;
	ldi_ident_t	li = NULL;
	int		rval;
	cred_t		*cr;
	struct strioctl iocb;
	struct lifreq	lifreq;

	IP_MAJ = ddi_name_to_major(IP);

#ifdef NS_DEBUG
	(void) printf("ip_loopback_cleanup() stackid %d\n",
	    ipst->ips_netstack->netstack_stackid);
#endif

	bzero(&lifreq, sizeof (lifreq));
	(void) strcpy(lifreq.lifr_name, ipif_loopback_name);

	error = ldi_ident_from_major(IP_MAJ, &li);
	if (error) {
#ifdef DEBUG
		printf("ip_loopback_cleanup: lyr ident get failed error %d\n",
		    error);
#endif
		return;
	}

	cr = zone_get_kcred(netstackid_to_zoneid(
	    ipst->ips_netstack->netstack_stackid));
	ASSERT(cr != NULL);
	error = ldi_open_by_name(UDP6DEV, FREAD|FWRITE, cr, &lh, li);
	if (error) {
#ifdef DEBUG
		printf("ip_loopback_cleanup: open of UDP6DEV failed error %d\n",
		    error);
#endif
		goto out;
	}
	iocb.ic_cmd = SIOCLIFREMOVEIF;
	iocb.ic_timout = 15;
	iocb.ic_len = sizeof (lifreq);
	iocb.ic_dp = (char *)&lifreq;

	error = ldi_ioctl(lh, I_STR, (intptr_t)&iocb, FKIOCTL, cr, &rval);
	/* LINTED - statement has no consequent */
	if (error) {
#ifdef NS_DEBUG
		printf("ip_loopback_cleanup: ioctl SIOCLIFREMOVEIF failed on "
		    "UDP6 error %d\n", error);
#endif
	}
	(void) ldi_close(lh, FREAD|FWRITE, cr);
	lh = NULL;

	error = ldi_open_by_name(UDPDEV, FREAD|FWRITE, cr, &lh, li);
	if (error) {
#ifdef NS_DEBUG
		printf("ip_loopback_cleanup: open of UDPDEV failed error %d\n",
		    error);
#endif
		goto out;
	}

	iocb.ic_cmd = SIOCLIFREMOVEIF;
	iocb.ic_timout = 15;
	iocb.ic_len = sizeof (lifreq);
	iocb.ic_dp = (char *)&lifreq;

	error = ldi_ioctl(lh, I_STR, (intptr_t)&iocb, FKIOCTL, cr, &rval);
	/* LINTED - statement has no consequent */
	if (error) {
#ifdef NS_DEBUG
		printf("ip_loopback_cleanup: ioctl SIOCLIFREMOVEIF failed on "
		    "UDP error %d\n", error);
#endif
	}
	(void) ldi_close(lh, FREAD|FWRITE, cr);
	lh = NULL;

out:
	/* Close layered handles */
	if (lh)
		(void) ldi_close(lh, FREAD|FWRITE, cr);
	if (li)
		ldi_ident_release(li);

	crfree(cr);
}
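
/*
 * Userland analogue of the I_STR ioctl issued above (compiled out,
 * illustrative only): removing a logical interface with
 * SIOCLIFREMOVEIF through a socket instead of an LDI handle.
 */
#ifdef notdef
#include <sys/socket.h>
#include <sys/sockio.h>
#include <net/if.h>
#include <string.h>
#include <unistd.h>

static int
example_remove_lif(const char *name)
{
	struct lifreq lifr;
	int s, err;

	s = socket(AF_INET, SOCK_DGRAM, 0);
	if (s == -1)
		return (-1);
	(void) memset(&lifr, 0, sizeof (lifr));
	(void) strlcpy(lifr.lifr_name, name, sizeof (lifr.lifr_name));
	err = ioctl(s, SIOCLIFREMOVEIF, &lifr);	/* e.g. name = "lo0:1" */
	(void) close(s);
	return (err);
}
#endif /* notdef */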