1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 /* Copyright (c) 1990 Mentat Inc. */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * This file contains the interface control functions for IP. 
31 */ 32 33 #include <sys/types.h> 34 #include <sys/stream.h> 35 #include <sys/dlpi.h> 36 #include <sys/stropts.h> 37 #include <sys/strsun.h> 38 #include <sys/sysmacros.h> 39 #include <sys/strlog.h> 40 #include <sys/ddi.h> 41 #include <sys/sunddi.h> 42 #include <sys/cmn_err.h> 43 #include <sys/kstat.h> 44 #include <sys/debug.h> 45 #include <sys/zone.h> 46 #include <sys/sunldi.h> 47 #include <sys/file.h> 48 #include <sys/bitmap.h> 49 50 #include <sys/kmem.h> 51 #include <sys/systm.h> 52 #include <sys/param.h> 53 #include <sys/socket.h> 54 #include <sys/isa_defs.h> 55 #include <net/if.h> 56 #include <net/if_arp.h> 57 #include <net/if_types.h> 58 #include <net/if_dl.h> 59 #include <net/route.h> 60 #include <sys/sockio.h> 61 #include <netinet/in.h> 62 #include <netinet/ip6.h> 63 #include <netinet/icmp6.h> 64 #include <netinet/igmp_var.h> 65 #include <sys/strsun.h> 66 #include <sys/policy.h> 67 #include <sys/ethernet.h> 68 69 #include <inet/common.h> /* for various inet/mi.h and inet/nd.h needs */ 70 #include <inet/mi.h> 71 #include <inet/nd.h> 72 #include <inet/arp.h> 73 #include <inet/mib2.h> 74 #include <inet/ip.h> 75 #include <inet/ip6.h> 76 #include <inet/ip6_asp.h> 77 #include <inet/tcp.h> 78 #include <inet/ip_multi.h> 79 #include <inet/ip_ire.h> 80 #include <inet/ip_ftable.h> 81 #include <inet/ip_rts.h> 82 #include <inet/ip_ndp.h> 83 #include <inet/ip_if.h> 84 #include <inet/ip_impl.h> 85 #include <inet/tun.h> 86 #include <inet/sctp_ip.h> 87 #include <inet/ip_netinfo.h> 88 #include <inet/mib2.h> 89 90 #include <net/pfkeyv2.h> 91 #include <inet/ipsec_info.h> 92 #include <inet/sadb.h> 93 #include <inet/ipsec_impl.h> 94 #include <sys/iphada.h> 95 96 97 #include <netinet/igmp.h> 98 #include <inet/ip_listutils.h> 99 #include <inet/ipclassifier.h> 100 #include <sys/mac.h> 101 102 #include <sys/systeminfo.h> 103 #include <sys/bootconf.h> 104 105 #include <sys/tsol/tndb.h> 106 #include <sys/tsol/tnet.h> 107 108 /* The character which tells where the ill_name ends */ 109 
#define IPIF_SEPARATOR_CHAR ':' 110 111 /* IP ioctl function table entry */ 112 typedef struct ipft_s { 113 int ipft_cmd; 114 pfi_t ipft_pfi; 115 int ipft_min_size; 116 int ipft_flags; 117 } ipft_t; 118 #define IPFT_F_NO_REPLY 0x1 /* IP ioctl does not expect any reply */ 119 #define IPFT_F_SELF_REPLY 0x2 /* ioctl callee does the ioctl reply */ 120 121 typedef struct ip_sock_ar_s { 122 union { 123 area_t ip_sock_area; 124 ared_t ip_sock_ared; 125 areq_t ip_sock_areq; 126 } ip_sock_ar_u; 127 queue_t *ip_sock_ar_q; 128 } ip_sock_ar_t; 129 130 static int nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *); 131 static int nd_ill_forward_set(queue_t *q, mblk_t *mp, 132 char *value, caddr_t cp, cred_t *ioc_cr); 133 134 static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask); 135 static ip_m_t *ip_m_lookup(t_uscalar_t mac_type); 136 static int ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, 137 mblk_t *mp, boolean_t need_up); 138 static int ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, 139 mblk_t *mp, boolean_t need_up); 140 static int ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid, 141 queue_t *q, mblk_t *mp, boolean_t need_up); 142 static int ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, 143 mblk_t *mp, boolean_t need_up); 144 static int ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, 145 mblk_t *mp); 146 static int ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t, 147 queue_t *q, mblk_t *mp, boolean_t need_up); 148 static int ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, 149 int ioccmd, struct linkblk *li, boolean_t doconsist); 150 static ipaddr_t ip_subnet_mask(ipaddr_t addr, ipif_t **, ip_stack_t *); 151 static void ip_wput_ioctl(queue_t *q, mblk_t *mp); 152 static void ipsq_flush(ill_t *ill); 153 154 static int ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, 155 queue_t *q, mblk_t *mp, boolean_t need_up); 156 static void ipsq_delete(ipsq_t *); 157 
158 static ipif_t *ipif_allocate(ill_t *ill, int id, uint_t ire_type, 159 boolean_t initialize); 160 static void ipif_check_bcast_ires(ipif_t *test_ipif); 161 static ire_t **ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep); 162 static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, 163 boolean_t isv6); 164 static void ipif_down_delete_ire(ire_t *ire, char *ipif); 165 static void ipif_delete_cache_ire(ire_t *, char *); 166 static int ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp); 167 static void ipif_free(ipif_t *ipif); 168 static void ipif_free_tail(ipif_t *ipif); 169 static void ipif_mtu_change(ire_t *ire, char *ipif_arg); 170 static void ipif_multicast_down(ipif_t *ipif); 171 static void ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif); 172 static void ipif_set_default(ipif_t *ipif); 173 static int ipif_set_values(queue_t *q, mblk_t *mp, 174 char *interf_name, uint_t *ppa); 175 static int ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, 176 queue_t *q); 177 static ipif_t *ipif_lookup_on_name(char *name, size_t namelen, 178 boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid, 179 queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *); 180 static int ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp); 181 static void ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp); 182 183 static int ill_alloc_ppa(ill_if_t *, ill_t *); 184 static int ill_arp_off(ill_t *ill); 185 static int ill_arp_on(ill_t *ill); 186 static void ill_delete_interface_type(ill_if_t *); 187 static int ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q); 188 static void ill_dl_down(ill_t *ill); 189 static void ill_down(ill_t *ill); 190 static void ill_downi(ire_t *ire, char *ill_arg); 191 static void ill_free_mib(ill_t *ill); 192 static void ill_glist_delete(ill_t *); 193 static boolean_t ill_has_usable_ipif(ill_t *); 194 static int ill_lock_ipsq_ills(ipsq_t *sq, ill_t **list, int); 195 static void 
ill_nominate_bcast_rcv(ill_group_t *illgrp); 196 static void ill_phyint_free(ill_t *ill); 197 static void ill_phyint_reinit(ill_t *ill); 198 static void ill_set_nce_router_flags(ill_t *, boolean_t); 199 static void ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *); 200 static void ill_signal_ipsq_ills(ipsq_t *, boolean_t); 201 static boolean_t ill_split_ipsq(ipsq_t *cur_sq); 202 static void ill_stq_cache_delete(ire_t *, char *); 203 204 static boolean_t ip_ether_v6intfid(uint_t, uint8_t *, in6_addr_t *); 205 static boolean_t ip_nodef_v6intfid(uint_t, uint8_t *, in6_addr_t *); 206 static boolean_t ip_ether_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *, 207 in6_addr_t *); 208 static boolean_t ip_ether_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *, 209 ipaddr_t *); 210 static boolean_t ip_ib_v6intfid(uint_t, uint8_t *, in6_addr_t *); 211 static boolean_t ip_ib_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *, 212 in6_addr_t *); 213 static boolean_t ip_ib_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *, 214 ipaddr_t *); 215 216 static void ipif_save_ire(ipif_t *, ire_t *); 217 static void ipif_remove_ire(ipif_t *, ire_t *); 218 static void ip_cgtp_bcast_add(ire_t *, ire_t *, ip_stack_t *); 219 static void ip_cgtp_bcast_delete(ire_t *, ip_stack_t *); 220 221 /* 222 * Per-ill IPsec capabilities management. 
223 */ 224 static ill_ipsec_capab_t *ill_ipsec_capab_alloc(void); 225 static void ill_ipsec_capab_free(ill_ipsec_capab_t *); 226 static void ill_ipsec_capab_add(ill_t *, uint_t, boolean_t); 227 static void ill_ipsec_capab_delete(ill_t *, uint_t); 228 static boolean_t ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *, int); 229 static void ill_capability_proto(ill_t *, int, mblk_t *); 230 static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *, 231 boolean_t); 232 static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *); 233 static void ill_capability_mdt_ack(ill_t *, mblk_t *, dl_capability_sub_t *); 234 static void ill_capability_mdt_reset(ill_t *, mblk_t **); 235 static void ill_capability_ipsec_ack(ill_t *, mblk_t *, dl_capability_sub_t *); 236 static void ill_capability_ipsec_reset(ill_t *, mblk_t **); 237 static void ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *); 238 static void ill_capability_hcksum_reset(ill_t *, mblk_t **); 239 static void ill_capability_zerocopy_ack(ill_t *, mblk_t *, 240 dl_capability_sub_t *); 241 static void ill_capability_zerocopy_reset(ill_t *, mblk_t **); 242 static void ill_capability_lso_ack(ill_t *, mblk_t *, dl_capability_sub_t *); 243 static void ill_capability_lso_reset(ill_t *, mblk_t **); 244 static void ill_capability_dls_ack(ill_t *, mblk_t *, dl_capability_sub_t *); 245 static mac_resource_handle_t ill_ring_add(void *, mac_resource_t *); 246 static void ill_capability_dls_reset(ill_t *, mblk_t **); 247 static void ill_capability_dls_disable(ill_t *); 248 249 static void illgrp_cache_delete(ire_t *, char *); 250 static void illgrp_delete(ill_t *ill); 251 static void illgrp_reset_schednext(ill_t *ill); 252 253 static ill_t *ill_prev_usesrc(ill_t *); 254 static int ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t); 255 static void ill_disband_usesrc_group(ill_t *); 256 257 static void conn_cleanup_stale_ire(conn_t *, caddr_t); 258 259 #ifdef DEBUG 260 static void 
ill_trace_cleanup(const ill_t *); 261 static void ipif_trace_cleanup(const ipif_t *); 262 #endif 263 264 /* 265 * if we go over the memory footprint limit more than once in this msec 266 * interval, we'll start pruning aggressively. 267 */ 268 int ip_min_frag_prune_time = 0; 269 270 /* 271 * max # of IPsec algorithms supported. Limited to 1 byte by PF_KEY 272 * and the IPsec DOI 273 */ 274 #define MAX_IPSEC_ALGS 256 275 276 #define BITSPERBYTE 8 277 #define BITS(type) (BITSPERBYTE * (long)sizeof (type)) 278 279 #define IPSEC_ALG_ENABLE(algs, algid) \ 280 ((algs)[(algid) / BITS(ipsec_capab_elem_t)] |= \ 281 (1 << ((algid) % BITS(ipsec_capab_elem_t)))) 282 283 #define IPSEC_ALG_IS_ENABLED(algid, algs) \ 284 ((algs)[(algid) / BITS(ipsec_capab_elem_t)] & \ 285 (1 << ((algid) % BITS(ipsec_capab_elem_t)))) 286 287 typedef uint8_t ipsec_capab_elem_t; 288 289 /* 290 * Per-algorithm parameters. Note that at present, only encryption 291 * algorithms have variable keysize (IKE does not provide a way to negotiate 292 * auth algorithm keysize). 293 * 294 * All sizes here are in bits. 295 */ 296 typedef struct 297 { 298 uint16_t minkeylen; 299 uint16_t maxkeylen; 300 } ipsec_capab_algparm_t; 301 302 /* 303 * Per-ill capabilities. 304 */ 305 struct ill_ipsec_capab_s { 306 ipsec_capab_elem_t *encr_hw_algs; 307 ipsec_capab_elem_t *auth_hw_algs; 308 uint32_t algs_size; /* size of _hw_algs in bytes */ 309 /* algorithm key lengths */ 310 ipsec_capab_algparm_t *encr_algparm; 311 uint32_t encr_algparm_size; 312 uint32_t encr_algparm_end; 313 }; 314 315 /* 316 * The field values are larger than strictly necessary for simple 317 * AR_ENTRY_ADDs but the padding lets us accomodate the socket ioctls. 
318 */ 319 static area_t ip_area_template = { 320 AR_ENTRY_ADD, /* area_cmd */ 321 sizeof (ip_sock_ar_t) + (IP_ADDR_LEN*2) + sizeof (struct sockaddr_dl), 322 /* area_name_offset */ 323 /* area_name_length temporarily holds this structure length */ 324 sizeof (area_t), /* area_name_length */ 325 IP_ARP_PROTO_TYPE, /* area_proto */ 326 sizeof (ip_sock_ar_t), /* area_proto_addr_offset */ 327 IP_ADDR_LEN, /* area_proto_addr_length */ 328 sizeof (ip_sock_ar_t) + IP_ADDR_LEN, 329 /* area_proto_mask_offset */ 330 0, /* area_flags */ 331 sizeof (ip_sock_ar_t) + IP_ADDR_LEN + IP_ADDR_LEN, 332 /* area_hw_addr_offset */ 333 /* Zero length hw_addr_length means 'use your idea of the address' */ 334 0 /* area_hw_addr_length */ 335 }; 336 337 /* 338 * AR_ENTRY_ADD/DELETE templates have been added for IPv6 external resolver 339 * support 340 */ 341 static area_t ip6_area_template = { 342 AR_ENTRY_ADD, /* area_cmd */ 343 sizeof (ip_sock_ar_t) + (IPV6_ADDR_LEN*2) + sizeof (sin6_t), 344 /* area_name_offset */ 345 /* area_name_length temporarily holds this structure length */ 346 sizeof (area_t), /* area_name_length */ 347 IP_ARP_PROTO_TYPE, /* area_proto */ 348 sizeof (ip_sock_ar_t), /* area_proto_addr_offset */ 349 IPV6_ADDR_LEN, /* area_proto_addr_length */ 350 sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN, 351 /* area_proto_mask_offset */ 352 0, /* area_flags */ 353 sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN + IPV6_ADDR_LEN, 354 /* area_hw_addr_offset */ 355 /* Zero length hw_addr_length means 'use your idea of the address' */ 356 0 /* area_hw_addr_length */ 357 }; 358 359 static ared_t ip_ared_template = { 360 AR_ENTRY_DELETE, 361 sizeof (ared_t) + IP_ADDR_LEN, 362 sizeof (ared_t), 363 IP_ARP_PROTO_TYPE, 364 sizeof (ared_t), 365 IP_ADDR_LEN 366 }; 367 368 static ared_t ip6_ared_template = { 369 AR_ENTRY_DELETE, 370 sizeof (ared_t) + IPV6_ADDR_LEN, 371 sizeof (ared_t), 372 IP_ARP_PROTO_TYPE, 373 sizeof (ared_t), 374 IPV6_ADDR_LEN 375 }; 376 377 /* 378 * A template for an IPv6 AR_ENTRY_QUERY 
template has not been created, as 379 * as the areq doesn't include an IP address in ill_dl_up() (the only place a 380 * areq is used). 381 */ 382 static areq_t ip_areq_template = { 383 AR_ENTRY_QUERY, /* cmd */ 384 sizeof (areq_t)+(2*IP_ADDR_LEN), /* name offset */ 385 sizeof (areq_t), /* name len (filled by ill_arp_alloc) */ 386 IP_ARP_PROTO_TYPE, /* protocol, from arps perspective */ 387 sizeof (areq_t), /* target addr offset */ 388 IP_ADDR_LEN, /* target addr_length */ 389 0, /* flags */ 390 sizeof (areq_t) + IP_ADDR_LEN, /* sender addr offset */ 391 IP_ADDR_LEN, /* sender addr length */ 392 AR_EQ_DEFAULT_XMIT_COUNT, /* xmit_count */ 393 AR_EQ_DEFAULT_XMIT_INTERVAL, /* (re)xmit_interval in milliseconds */ 394 AR_EQ_DEFAULT_MAX_BUFFERED /* max # of requests to buffer */ 395 /* anything else filled in by the code */ 396 }; 397 398 static arc_t ip_aru_template = { 399 AR_INTERFACE_UP, 400 sizeof (arc_t), /* Name offset */ 401 sizeof (arc_t) /* Name length (set by ill_arp_alloc) */ 402 }; 403 404 static arc_t ip_ard_template = { 405 AR_INTERFACE_DOWN, 406 sizeof (arc_t), /* Name offset */ 407 sizeof (arc_t) /* Name length (set by ill_arp_alloc) */ 408 }; 409 410 static arc_t ip_aron_template = { 411 AR_INTERFACE_ON, 412 sizeof (arc_t), /* Name offset */ 413 sizeof (arc_t) /* Name length (set by ill_arp_alloc) */ 414 }; 415 416 static arc_t ip_aroff_template = { 417 AR_INTERFACE_OFF, 418 sizeof (arc_t), /* Name offset */ 419 sizeof (arc_t) /* Name length (set by ill_arp_alloc) */ 420 }; 421 422 423 static arma_t ip_arma_multi_template = { 424 AR_MAPPING_ADD, 425 sizeof (arma_t) + 3*IP_ADDR_LEN + IP_MAX_HW_LEN, 426 /* Name offset */ 427 sizeof (arma_t), /* Name length (set by ill_arp_alloc) */ 428 IP_ARP_PROTO_TYPE, 429 sizeof (arma_t), /* proto_addr_offset */ 430 IP_ADDR_LEN, /* proto_addr_length */ 431 sizeof (arma_t) + IP_ADDR_LEN, /* proto_mask_offset */ 432 sizeof (arma_t) + 2*IP_ADDR_LEN, /* proto_extract_mask_offset */ 433 ACE_F_PERMANENT | ACE_F_MAPPING, /* 
flags */ 434 sizeof (arma_t) + 3*IP_ADDR_LEN, /* hw_addr_offset */ 435 IP_MAX_HW_LEN, /* hw_addr_length */ 436 0, /* hw_mapping_start */ 437 }; 438 439 static ipft_t ip_ioctl_ftbl[] = { 440 { IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 }, 441 { IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t), 442 IPFT_F_NO_REPLY }, 443 { IP_IOC_IRE_ADVISE_NO_REPLY, ip_ire_advise, sizeof (ipic_t), 444 IPFT_F_NO_REPLY }, 445 { IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY }, 446 { 0 } 447 }; 448 449 /* Simple ICMP IP Header Template */ 450 static ipha_t icmp_ipha = { 451 IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP 452 }; 453 454 /* Flag descriptors for ip_ipif_report */ 455 static nv_t ipif_nv_tbl[] = { 456 { IPIF_UP, "UP" }, 457 { IPIF_BROADCAST, "BROADCAST" }, 458 { ILLF_DEBUG, "DEBUG" }, 459 { PHYI_LOOPBACK, "LOOPBACK" }, 460 { IPIF_POINTOPOINT, "POINTOPOINT" }, 461 { ILLF_NOTRAILERS, "NOTRAILERS" }, 462 { PHYI_RUNNING, "RUNNING" }, 463 { ILLF_NOARP, "NOARP" }, 464 { PHYI_PROMISC, "PROMISC" }, 465 { PHYI_ALLMULTI, "ALLMULTI" }, 466 { PHYI_INTELLIGENT, "INTELLIGENT" }, 467 { ILLF_MULTICAST, "MULTICAST" }, 468 { PHYI_MULTI_BCAST, "MULTI_BCAST" }, 469 { IPIF_UNNUMBERED, "UNNUMBERED" }, 470 { IPIF_DHCPRUNNING, "DHCP" }, 471 { IPIF_PRIVATE, "PRIVATE" }, 472 { IPIF_NOXMIT, "NOXMIT" }, 473 { IPIF_NOLOCAL, "NOLOCAL" }, 474 { IPIF_DEPRECATED, "DEPRECATED" }, 475 { IPIF_PREFERRED, "PREFERRED" }, 476 { IPIF_TEMPORARY, "TEMPORARY" }, 477 { IPIF_ADDRCONF, "ADDRCONF" }, 478 { PHYI_VIRTUAL, "VIRTUAL" }, 479 { ILLF_ROUTER, "ROUTER" }, 480 { ILLF_NONUD, "NONUD" }, 481 { IPIF_ANYCAST, "ANYCAST" }, 482 { ILLF_NORTEXCH, "NORTEXCH" }, 483 { ILLF_IPV4, "IPV4" }, 484 { ILLF_IPV6, "IPV6" }, 485 { IPIF_NOFAILOVER, "NOFAILOVER" }, 486 { PHYI_FAILED, "FAILED" }, 487 { PHYI_STANDBY, "STANDBY" }, 488 { PHYI_INACTIVE, "INACTIVE" }, 489 { PHYI_OFFLINE, "OFFLINE" }, 490 }; 491 492 static uchar_t ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; 493 494 static 
ip_m_t ip_m_tbl[] = { 495 { DL_ETHER, IFT_ETHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo, 496 ip_ether_v6intfid }, 497 { DL_CSMACD, IFT_ISO88023, ip_ether_v4mapinfo, ip_ether_v6mapinfo, 498 ip_nodef_v6intfid }, 499 { DL_TPB, IFT_ISO88024, ip_ether_v4mapinfo, ip_ether_v6mapinfo, 500 ip_nodef_v6intfid }, 501 { DL_TPR, IFT_ISO88025, ip_ether_v4mapinfo, ip_ether_v6mapinfo, 502 ip_nodef_v6intfid }, 503 { DL_FDDI, IFT_FDDI, ip_ether_v4mapinfo, ip_ether_v6mapinfo, 504 ip_ether_v6intfid }, 505 { DL_IB, IFT_IB, ip_ib_v4mapinfo, ip_ib_v6mapinfo, 506 ip_ib_v6intfid }, 507 { SUNW_DL_VNI, IFT_OTHER, NULL, NULL, NULL}, 508 { DL_OTHER, IFT_OTHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo, 509 ip_nodef_v6intfid } 510 }; 511 512 static ill_t ill_null; /* Empty ILL for init. */ 513 char ipif_loopback_name[] = "lo0"; 514 static char *ipv4_forward_suffix = ":ip_forwarding"; 515 static char *ipv6_forward_suffix = ":ip6_forwarding"; 516 static sin6_t sin6_null; /* Zero address for quick clears */ 517 static sin_t sin_null; /* Zero address for quick clears */ 518 519 /* When set search for unused ipif_seqid */ 520 static ipif_t ipif_zero; 521 522 /* 523 * ppa arena is created after these many 524 * interfaces have been plumbed. 525 */ 526 uint_t ill_no_arena = 12; /* Setable in /etc/system */ 527 528 /* 529 * Enable soft rings if ip_squeue_soft_ring or ip_squeue_fanout 530 * is set and ip_soft_rings_cnt > 0. ip_squeue_soft_ring is 531 * set through platform specific code (Niagara/Ontario). 532 */ 533 #define SOFT_RINGS_ENABLED() (ip_soft_rings_cnt ? \ 534 (ip_squeue_soft_ring || ip_squeue_fanout) : B_FALSE) 535 536 #define ILL_CAPAB_DLS (ILL_CAPAB_SOFT_RING | ILL_CAPAB_POLL) 537 538 static uint_t 539 ipif_rand(ip_stack_t *ipst) 540 { 541 ipst->ips_ipif_src_random = ipst->ips_ipif_src_random * 1103515245 + 542 12345; 543 return ((ipst->ips_ipif_src_random >> 16) & 0x7fff); 544 } 545 546 /* 547 * Allocate per-interface mibs. 548 * Returns true if ok. False otherwise. 
549 * ipsq may not yet be allocated (loopback case ). 550 */ 551 static boolean_t 552 ill_allocate_mibs(ill_t *ill) 553 { 554 /* Already allocated? */ 555 if (ill->ill_ip_mib != NULL) { 556 if (ill->ill_isv6) 557 ASSERT(ill->ill_icmp6_mib != NULL); 558 return (B_TRUE); 559 } 560 561 ill->ill_ip_mib = kmem_zalloc(sizeof (*ill->ill_ip_mib), 562 KM_NOSLEEP); 563 if (ill->ill_ip_mib == NULL) { 564 return (B_FALSE); 565 } 566 567 /* Setup static information */ 568 SET_MIB(ill->ill_ip_mib->ipIfStatsEntrySize, 569 sizeof (mib2_ipIfStatsEntry_t)); 570 if (ill->ill_isv6) { 571 ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6; 572 SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize, 573 sizeof (mib2_ipv6AddrEntry_t)); 574 SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize, 575 sizeof (mib2_ipv6RouteEntry_t)); 576 SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize, 577 sizeof (mib2_ipv6NetToMediaEntry_t)); 578 SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize, 579 sizeof (ipv6_member_t)); 580 SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize, 581 sizeof (ipv6_grpsrc_t)); 582 } else { 583 ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4; 584 SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize, 585 sizeof (mib2_ipAddrEntry_t)); 586 SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize, 587 sizeof (mib2_ipRouteEntry_t)); 588 SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize, 589 sizeof (mib2_ipNetToMediaEntry_t)); 590 SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize, 591 sizeof (ip_member_t)); 592 SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize, 593 sizeof (ip_grpsrc_t)); 594 595 /* 596 * For a v4 ill, we are done at this point, because per ill 597 * icmp mibs are only used for v6. 
598 */ 599 return (B_TRUE); 600 } 601 602 ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib), 603 KM_NOSLEEP); 604 if (ill->ill_icmp6_mib == NULL) { 605 kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib)); 606 ill->ill_ip_mib = NULL; 607 return (B_FALSE); 608 } 609 /* static icmp info */ 610 ill->ill_icmp6_mib->ipv6IfIcmpEntrySize = 611 sizeof (mib2_ipv6IfIcmpEntry_t); 612 /* 613 * The ipIfStatsIfindex and ipv6IfIcmpIndex will be assigned later 614 * after the phyint merge occurs in ipif_set_values -> ill_glist_insert 615 * -> ill_phyint_reinit 616 */ 617 return (B_TRUE); 618 } 619 620 /* 621 * Common code for preparation of ARP commands. Two points to remember: 622 * 1) The ill_name is tacked on at the end of the allocated space so 623 * the templates name_offset field must contain the total space 624 * to allocate less the name length. 625 * 626 * 2) The templates name_length field should contain the *template* 627 * length. We use it as a parameter to bcopy() and then write 628 * the real ill_name_length into the name_length field of the copy. 629 * (Always called as writer.) 
630 */ 631 mblk_t * 632 ill_arp_alloc(ill_t *ill, uchar_t *template, caddr_t addr) 633 { 634 arc_t *arc = (arc_t *)template; 635 char *cp; 636 int len; 637 mblk_t *mp; 638 uint_t name_length = ill->ill_name_length; 639 uint_t template_len = arc->arc_name_length; 640 641 len = arc->arc_name_offset + name_length; 642 mp = allocb(len, BPRI_HI); 643 if (mp == NULL) 644 return (NULL); 645 cp = (char *)mp->b_rptr; 646 mp->b_wptr = (uchar_t *)&cp[len]; 647 if (template_len) 648 bcopy(template, cp, template_len); 649 if (len > template_len) 650 bzero(&cp[template_len], len - template_len); 651 mp->b_datap->db_type = M_PROTO; 652 653 arc = (arc_t *)cp; 654 arc->arc_name_length = name_length; 655 cp = (char *)arc + arc->arc_name_offset; 656 bcopy(ill->ill_name, cp, name_length); 657 658 if (addr) { 659 area_t *area = (area_t *)mp->b_rptr; 660 661 cp = (char *)area + area->area_proto_addr_offset; 662 bcopy(addr, cp, area->area_proto_addr_length); 663 if (area->area_cmd == AR_ENTRY_ADD) { 664 cp = (char *)area; 665 len = area->area_proto_addr_length; 666 if (area->area_proto_mask_offset) 667 cp += area->area_proto_mask_offset; 668 else 669 cp += area->area_proto_addr_offset + len; 670 while (len-- > 0) 671 *cp++ = (char)~0; 672 } 673 } 674 return (mp); 675 } 676 677 mblk_t * 678 ipif_area_alloc(ipif_t *ipif) 679 { 680 return (ill_arp_alloc(ipif->ipif_ill, (uchar_t *)&ip_area_template, 681 (char *)&ipif->ipif_lcl_addr)); 682 } 683 684 mblk_t * 685 ipif_ared_alloc(ipif_t *ipif) 686 { 687 return (ill_arp_alloc(ipif->ipif_ill, (uchar_t *)&ip_ared_template, 688 (char *)&ipif->ipif_lcl_addr)); 689 } 690 691 mblk_t * 692 ill_ared_alloc(ill_t *ill, ipaddr_t addr) 693 { 694 return (ill_arp_alloc(ill, (uchar_t *)&ip_ared_template, 695 (char *)&addr)); 696 } 697 698 /* 699 * Completely vaporize a lower level tap and all associated interfaces. 700 * ill_delete is called only out of ip_close when the device control 701 * stream is being closed. 
702 */ 703 void 704 ill_delete(ill_t *ill) 705 { 706 ipif_t *ipif; 707 ill_t *prev_ill; 708 ip_stack_t *ipst = ill->ill_ipst; 709 710 /* 711 * ill_delete may be forcibly entering the ipsq. The previous 712 * ioctl may not have completed and may need to be aborted. 713 * ipsq_flush takes care of it. If we don't need to enter the 714 * the ipsq forcibly, the 2nd invocation of ipsq_flush in 715 * ill_delete_tail is sufficient. 716 */ 717 ipsq_flush(ill); 718 719 /* 720 * Nuke all interfaces. ipif_free will take down the interface, 721 * remove it from the list, and free the data structure. 722 * Walk down the ipif list and remove the logical interfaces 723 * first before removing the main ipif. We can't unplumb 724 * zeroth interface first in the case of IPv6 as reset_conn_ill 725 * -> ip_ll_delmulti_v6 de-references ill_ipif for checking 726 * POINTOPOINT. 727 * 728 * If ill_ipif was not properly initialized (i.e low on memory), 729 * then no interfaces to clean up. In this case just clean up the 730 * ill. 731 */ 732 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) 733 ipif_free(ipif); 734 735 /* 736 * Used only by ill_arp_on and ill_arp_off, which are writers. 737 * So nobody can be using this mp now. Free the mp allocated for 738 * honoring ILLF_NOARP 739 */ 740 freemsg(ill->ill_arp_on_mp); 741 ill->ill_arp_on_mp = NULL; 742 743 /* Clean up msgs on pending upcalls for mrouted */ 744 reset_mrt_ill(ill); 745 746 /* 747 * ipif_free -> reset_conn_ipif will remove all multicast 748 * references for IPv4. For IPv6, we need to do it here as 749 * it points only at ills. 750 */ 751 reset_conn_ill(ill); 752 753 /* 754 * ill_down will arrange to blow off any IRE's dependent on this 755 * ILL, and shut down fragmentation reassembly. 756 */ 757 ill_down(ill); 758 759 /* Let SCTP know, so that it can remove this from its list. 
*/ 760 sctp_update_ill(ill, SCTP_ILL_REMOVE); 761 762 /* 763 * If an address on this ILL is being used as a source address then 764 * clear out the pointers in other ILLs that point to this ILL. 765 */ 766 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER); 767 if (ill->ill_usesrc_grp_next != NULL) { 768 if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? */ 769 ill_disband_usesrc_group(ill); 770 } else { /* consumer of the usesrc ILL */ 771 prev_ill = ill_prev_usesrc(ill); 772 prev_ill->ill_usesrc_grp_next = 773 ill->ill_usesrc_grp_next; 774 } 775 } 776 rw_exit(&ipst->ips_ill_g_usesrc_lock); 777 } 778 779 static void 780 ipif_non_duplicate(ipif_t *ipif) 781 { 782 ill_t *ill = ipif->ipif_ill; 783 mutex_enter(&ill->ill_lock); 784 if (ipif->ipif_flags & IPIF_DUPLICATE) { 785 ipif->ipif_flags &= ~IPIF_DUPLICATE; 786 ASSERT(ill->ill_ipif_dup_count > 0); 787 ill->ill_ipif_dup_count--; 788 } 789 mutex_exit(&ill->ill_lock); 790 } 791 792 /* 793 * ill_delete_tail is called from ip_modclose after all references 794 * to the closing ill are gone. The wait is done in ip_modclose 795 */ 796 void 797 ill_delete_tail(ill_t *ill) 798 { 799 mblk_t **mpp; 800 ipif_t *ipif; 801 ip_stack_t *ipst = ill->ill_ipst; 802 803 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 804 ipif_non_duplicate(ipif); 805 ipif_down_tail(ipif); 806 } 807 808 ASSERT(ill->ill_ipif_dup_count == 0 && 809 ill->ill_arp_down_mp == NULL && 810 ill->ill_arp_del_mapping_mp == NULL); 811 812 /* 813 * If polling capability is enabled (which signifies direct 814 * upcall into IP and driver has ill saved as a handle), 815 * we need to make sure that unbind has completed before we 816 * let the ill disappear and driver no longer has any reference 817 * to this ill. 
818 */ 819 mutex_enter(&ill->ill_lock); 820 while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS) 821 cv_wait(&ill->ill_cv, &ill->ill_lock); 822 mutex_exit(&ill->ill_lock); 823 824 /* 825 * Clean up polling and soft ring capabilities 826 */ 827 if (ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING)) 828 ill_capability_dls_disable(ill); 829 830 if (ill->ill_net_type != IRE_LOOPBACK) 831 qprocsoff(ill->ill_rq); 832 833 /* 834 * We do an ipsq_flush once again now. New messages could have 835 * landed up from below (M_ERROR or M_HANGUP). Similarly ioctls 836 * could also have landed up if an ioctl thread had looked up 837 * the ill before we set the ILL_CONDEMNED flag, but not yet 838 * enqueued the ioctl when we did the ipsq_flush last time. 839 */ 840 ipsq_flush(ill); 841 842 /* 843 * Free capabilities. 844 */ 845 if (ill->ill_ipsec_capab_ah != NULL) { 846 ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_AH); 847 ill_ipsec_capab_free(ill->ill_ipsec_capab_ah); 848 ill->ill_ipsec_capab_ah = NULL; 849 } 850 851 if (ill->ill_ipsec_capab_esp != NULL) { 852 ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_ESP); 853 ill_ipsec_capab_free(ill->ill_ipsec_capab_esp); 854 ill->ill_ipsec_capab_esp = NULL; 855 } 856 857 if (ill->ill_mdt_capab != NULL) { 858 kmem_free(ill->ill_mdt_capab, sizeof (ill_mdt_capab_t)); 859 ill->ill_mdt_capab = NULL; 860 } 861 862 if (ill->ill_hcksum_capab != NULL) { 863 kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t)); 864 ill->ill_hcksum_capab = NULL; 865 } 866 867 if (ill->ill_zerocopy_capab != NULL) { 868 kmem_free(ill->ill_zerocopy_capab, 869 sizeof (ill_zerocopy_capab_t)); 870 ill->ill_zerocopy_capab = NULL; 871 } 872 873 if (ill->ill_lso_capab != NULL) { 874 kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t)); 875 ill->ill_lso_capab = NULL; 876 } 877 878 if (ill->ill_dls_capab != NULL) { 879 CONN_DEC_REF(ill->ill_dls_capab->ill_unbind_conn); 880 ill->ill_dls_capab->ill_unbind_conn = NULL; 881 kmem_free(ill->ill_dls_capab, 882 
sizeof (ill_dls_capab_t) + 883 (sizeof (ill_rx_ring_t) * ILL_MAX_RINGS)); 884 ill->ill_dls_capab = NULL; 885 } 886 887 ASSERT(!(ill->ill_capabilities & ILL_CAPAB_POLL)); 888 889 while (ill->ill_ipif != NULL) 890 ipif_free_tail(ill->ill_ipif); 891 892 /* 893 * We have removed all references to ilm from conn and the ones joined 894 * within the kernel. 895 * 896 * We don't walk conns, mrts and ires because 897 * 898 * 1) reset_conn_ill and reset_mrt_ill cleans up conns and mrts. 899 * 2) ill_down ->ill_downi walks all the ires and cleans up 900 * ill references. 901 */ 902 ASSERT(ilm_walk_ill(ill) == 0); 903 /* 904 * Take us out of the list of ILLs. ill_glist_delete -> ill_phyint_free 905 * could free the phyint. No more reference to the phyint after this 906 * point. 907 */ 908 (void) ill_glist_delete(ill); 909 910 rw_enter(&ipst->ips_ip_g_nd_lock, RW_WRITER); 911 if (ill->ill_ndd_name != NULL) 912 nd_unload(&ipst->ips_ip_g_nd, ill->ill_ndd_name); 913 rw_exit(&ipst->ips_ip_g_nd_lock); 914 915 916 if (ill->ill_frag_ptr != NULL) { 917 uint_t count; 918 919 for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) { 920 mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock); 921 } 922 mi_free(ill->ill_frag_ptr); 923 ill->ill_frag_ptr = NULL; 924 ill->ill_frag_hash_tbl = NULL; 925 } 926 927 freemsg(ill->ill_nd_lla_mp); 928 /* Free all retained control messages. 
*/ 929 mpp = &ill->ill_first_mp_to_free; 930 do { 931 while (mpp[0]) { 932 mblk_t *mp; 933 mblk_t *mp1; 934 935 mp = mpp[0]; 936 mpp[0] = mp->b_next; 937 for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) { 938 mp1->b_next = NULL; 939 mp1->b_prev = NULL; 940 } 941 freemsg(mp); 942 } 943 } while (mpp++ != &ill->ill_last_mp_to_free); 944 945 ill_free_mib(ill); 946 947 #ifdef DEBUG 948 ill_trace_cleanup(ill); 949 #endif 950 951 /* Drop refcnt here */ 952 netstack_rele(ill->ill_ipst->ips_netstack); 953 ill->ill_ipst = NULL; 954 } 955 956 static void 957 ill_free_mib(ill_t *ill) 958 { 959 ip_stack_t *ipst = ill->ill_ipst; 960 961 /* 962 * MIB statistics must not be lost, so when an interface 963 * goes away the counter values will be added to the global 964 * MIBs. 965 */ 966 if (ill->ill_ip_mib != NULL) { 967 if (ill->ill_isv6) { 968 ip_mib2_add_ip_stats(&ipst->ips_ip6_mib, 969 ill->ill_ip_mib); 970 } else { 971 ip_mib2_add_ip_stats(&ipst->ips_ip_mib, 972 ill->ill_ip_mib); 973 } 974 975 kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib)); 976 ill->ill_ip_mib = NULL; 977 } 978 if (ill->ill_icmp6_mib != NULL) { 979 ip_mib2_add_icmp6_stats(&ipst->ips_icmp6_mib, 980 ill->ill_icmp6_mib); 981 kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib)); 982 ill->ill_icmp6_mib = NULL; 983 } 984 } 985 986 /* 987 * Concatenate together a physical address and a sap. 
988 * 989 * Sap_lengths are interpreted as follows: 990 * sap_length == 0 ==> no sap 991 * sap_length > 0 ==> sap is at the head of the dlpi address 992 * sap_length < 0 ==> sap is at the tail of the dlpi address 993 */ 994 static void 995 ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length, 996 t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst) 997 { 998 uint16_t sap_addr = (uint16_t)sap_src; 999 1000 if (sap_length == 0) { 1001 if (phys_src == NULL) 1002 bzero(dst, phys_length); 1003 else 1004 bcopy(phys_src, dst, phys_length); 1005 } else if (sap_length < 0) { 1006 if (phys_src == NULL) 1007 bzero(dst, phys_length); 1008 else 1009 bcopy(phys_src, dst, phys_length); 1010 bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr)); 1011 } else { 1012 bcopy(&sap_addr, dst, sizeof (sap_addr)); 1013 if (phys_src == NULL) 1014 bzero((char *)dst + sap_length, phys_length); 1015 else 1016 bcopy(phys_src, (char *)dst + sap_length, phys_length); 1017 } 1018 } 1019 1020 /* 1021 * Generate a dl_unitdata_req mblk for the device and address given. 1022 * addr_length is the length of the physical portion of the address. 1023 * If addr is NULL include an all zero address of the specified length. 1024 * TRUE? In any case, addr_length is taken to be the entire length of the 1025 * dlpi address, including the absolute value of sap_length. 
1026 */ 1027 mblk_t * 1028 ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap, 1029 t_scalar_t sap_length) 1030 { 1031 dl_unitdata_req_t *dlur; 1032 mblk_t *mp; 1033 t_scalar_t abs_sap_length; /* absolute value */ 1034 1035 abs_sap_length = ABS(sap_length); 1036 mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length, 1037 DL_UNITDATA_REQ); 1038 if (mp == NULL) 1039 return (NULL); 1040 dlur = (dl_unitdata_req_t *)mp->b_rptr; 1041 /* HACK: accomodate incompatible DLPI drivers */ 1042 if (addr_length == 8) 1043 addr_length = 6; 1044 dlur->dl_dest_addr_length = addr_length + abs_sap_length; 1045 dlur->dl_dest_addr_offset = sizeof (*dlur); 1046 dlur->dl_priority.dl_min = 0; 1047 dlur->dl_priority.dl_max = 0; 1048 ill_dlur_copy_address(addr, addr_length, sap, sap_length, 1049 (uchar_t *)&dlur[1]); 1050 return (mp); 1051 } 1052 1053 /* 1054 * Add the 'mp' to the list of pending mp's headed by ill_pending_mp 1055 * Return an error if we already have 1 or more ioctls in progress. 1056 * This is used only for non-exclusive ioctls. Currently this is used 1057 * for SIOC*ARP and SIOCGTUNPARAM ioctls. Most set ioctls are exclusive 1058 * and thus need to use ipsq_pending_mp_add. 1059 */ 1060 boolean_t 1061 ill_pending_mp_add(ill_t *ill, conn_t *connp, mblk_t *add_mp) 1062 { 1063 ASSERT(MUTEX_HELD(&ill->ill_lock)); 1064 ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL)); 1065 /* 1066 * M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls. 1067 */ 1068 ASSERT((add_mp->b_datap->db_type == M_IOCDATA) || 1069 (add_mp->b_datap->db_type == M_IOCTL)); 1070 1071 ASSERT(MUTEX_HELD(&connp->conn_lock)); 1072 /* 1073 * Return error if the conn has started closing. The conn 1074 * could have finished cleaning up the pending mp list, 1075 * If so we should not add another mp to the list negating 1076 * the cleanup. 
1077 */ 1078 if (connp->conn_state_flags & CONN_CLOSING) 1079 return (B_FALSE); 1080 /* 1081 * Add the pending mp to the head of the list, chained by b_next. 1082 * Note down the conn on which the ioctl request came, in b_prev. 1083 * This will be used to later get the conn, when we get a response 1084 * on the ill queue, from some other module (typically arp) 1085 */ 1086 add_mp->b_next = (void *)ill->ill_pending_mp; 1087 add_mp->b_queue = CONNP_TO_WQ(connp); 1088 ill->ill_pending_mp = add_mp; 1089 if (connp != NULL) 1090 connp->conn_oper_pending_ill = ill; 1091 return (B_TRUE); 1092 } 1093 1094 /* 1095 * Retrieve the ill_pending_mp and return it. We have to walk the list 1096 * of mblks starting at ill_pending_mp, and match based on the ioc_id. 1097 */ 1098 mblk_t * 1099 ill_pending_mp_get(ill_t *ill, conn_t **connpp, uint_t ioc_id) 1100 { 1101 mblk_t *prev = NULL; 1102 mblk_t *curr = NULL; 1103 uint_t id; 1104 conn_t *connp; 1105 1106 /* 1107 * When the conn closes, conn_ioctl_cleanup needs to clean 1108 * up the pending mp, but it does not know the ioc_id and 1109 * passes in a zero for it. 1110 */ 1111 mutex_enter(&ill->ill_lock); 1112 if (ioc_id != 0) 1113 *connpp = NULL; 1114 1115 /* Search the list for the appropriate ioctl based on ioc_id */ 1116 for (prev = NULL, curr = ill->ill_pending_mp; curr != NULL; 1117 prev = curr, curr = curr->b_next) { 1118 id = ((struct iocblk *)curr->b_rptr)->ioc_id; 1119 connp = Q_TO_CONN(curr->b_queue); 1120 /* Match based on the ioc_id or based on the conn */ 1121 if ((id == ioc_id) || (ioc_id == 0 && connp == *connpp)) 1122 break; 1123 } 1124 1125 if (curr != NULL) { 1126 /* Unlink the mblk from the pending mp list */ 1127 if (prev != NULL) { 1128 prev->b_next = curr->b_next; 1129 } else { 1130 ASSERT(ill->ill_pending_mp == curr); 1131 ill->ill_pending_mp = curr->b_next; 1132 } 1133 1134 /* 1135 * conn refcnt must have been bumped up at the start of 1136 * the ioctl. So we can safely access the conn. 
1137 */ 1138 ASSERT(CONN_Q(curr->b_queue)); 1139 *connpp = Q_TO_CONN(curr->b_queue); 1140 curr->b_next = NULL; 1141 curr->b_queue = NULL; 1142 } 1143 1144 mutex_exit(&ill->ill_lock); 1145 1146 return (curr); 1147 } 1148 1149 /* 1150 * Add the pending mp to the list. There can be only 1 pending mp 1151 * in the list. Any exclusive ioctl that needs to wait for a response 1152 * from another module or driver needs to use this function to set 1153 * the ipsq_pending_mp to the ioctl mblk and wait for the response from 1154 * the other module/driver. This is also used while waiting for the 1155 * ipif/ill/ire refcnts to drop to zero in bringing down an ipif. 1156 */ 1157 boolean_t 1158 ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp, 1159 int waitfor) 1160 { 1161 ipsq_t *ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq; 1162 1163 ASSERT(IAM_WRITER_IPIF(ipif)); 1164 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 1165 ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL)); 1166 ASSERT(ipsq->ipsq_pending_mp == NULL); 1167 /* 1168 * The caller may be using a different ipif than the one passed into 1169 * ipsq_current_start() (e.g., suppose an ioctl that came in on the V4 1170 * ill needs to wait for the V6 ill to quiesce). So we can't ASSERT 1171 * that `ipsq_current_ipif == ipif'. 1172 */ 1173 ASSERT(ipsq->ipsq_current_ipif != NULL); 1174 1175 /* 1176 * M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls, 1177 * M_ERROR/M_HANGUP/M_PROTO/M_PCPROTO from the driver. 1178 */ 1179 ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_IOCTL) || 1180 (DB_TYPE(add_mp) == M_ERROR) || (DB_TYPE(add_mp) == M_HANGUP) || 1181 (DB_TYPE(add_mp) == M_PROTO) || (DB_TYPE(add_mp) == M_PCPROTO)); 1182 1183 if (connp != NULL) { 1184 ASSERT(MUTEX_HELD(&connp->conn_lock)); 1185 /* 1186 * Return error if the conn has started closing. 
The conn 1187 * could have finished cleaning up the pending mp list, 1188 * If so we should not add another mp to the list negating 1189 * the cleanup. 1190 */ 1191 if (connp->conn_state_flags & CONN_CLOSING) 1192 return (B_FALSE); 1193 } 1194 mutex_enter(&ipsq->ipsq_lock); 1195 ipsq->ipsq_pending_ipif = ipif; 1196 /* 1197 * Note down the queue in b_queue. This will be returned by 1198 * ipsq_pending_mp_get. Caller will then use these values to restart 1199 * the processing 1200 */ 1201 add_mp->b_next = NULL; 1202 add_mp->b_queue = q; 1203 ipsq->ipsq_pending_mp = add_mp; 1204 ipsq->ipsq_waitfor = waitfor; 1205 1206 if (connp != NULL) 1207 connp->conn_oper_pending_ill = ipif->ipif_ill; 1208 mutex_exit(&ipsq->ipsq_lock); 1209 return (B_TRUE); 1210 } 1211 1212 /* 1213 * Retrieve the ipsq_pending_mp and return it. There can be only 1 mp 1214 * queued in the list. 1215 */ 1216 mblk_t * 1217 ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp) 1218 { 1219 mblk_t *curr = NULL; 1220 1221 mutex_enter(&ipsq->ipsq_lock); 1222 *connpp = NULL; 1223 if (ipsq->ipsq_pending_mp == NULL) { 1224 mutex_exit(&ipsq->ipsq_lock); 1225 return (NULL); 1226 } 1227 1228 /* There can be only 1 such excl message */ 1229 curr = ipsq->ipsq_pending_mp; 1230 ASSERT(curr != NULL && curr->b_next == NULL); 1231 ipsq->ipsq_pending_ipif = NULL; 1232 ipsq->ipsq_pending_mp = NULL; 1233 ipsq->ipsq_waitfor = 0; 1234 mutex_exit(&ipsq->ipsq_lock); 1235 1236 if (CONN_Q(curr->b_queue)) { 1237 /* 1238 * This mp did a refhold on the conn, at the start of the ioctl. 1239 * So we can safely return a pointer to the conn to the caller. 1240 */ 1241 *connpp = Q_TO_CONN(curr->b_queue); 1242 } else { 1243 *connpp = NULL; 1244 } 1245 curr->b_next = NULL; 1246 curr->b_prev = NULL; 1247 return (curr); 1248 } 1249 1250 /* 1251 * Cleanup the ioctl mp queued in ipsq_pending_mp 1252 * - Called in the ill_delete path 1253 * - Called in the M_ERROR or M_HANGUP path on the ill. 1254 * - Called in the conn close path. 
1255 */ 1256 boolean_t 1257 ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp) 1258 { 1259 mblk_t *mp; 1260 ipsq_t *ipsq; 1261 queue_t *q; 1262 ipif_t *ipif; 1263 1264 ASSERT(IAM_WRITER_ILL(ill)); 1265 ipsq = ill->ill_phyint->phyint_ipsq; 1266 mutex_enter(&ipsq->ipsq_lock); 1267 /* 1268 * If connp is null, unconditionally clean up the ipsq_pending_mp. 1269 * This happens in M_ERROR/M_HANGUP. We need to abort the current ioctl 1270 * even if it is meant for another ill, since we have to enqueue 1271 * a new mp now in ipsq_pending_mp to complete the ipif_down. 1272 * If connp is non-null we are called from the conn close path. 1273 */ 1274 mp = ipsq->ipsq_pending_mp; 1275 if (mp == NULL || (connp != NULL && 1276 mp->b_queue != CONNP_TO_WQ(connp))) { 1277 mutex_exit(&ipsq->ipsq_lock); 1278 return (B_FALSE); 1279 } 1280 /* Now remove from the ipsq_pending_mp */ 1281 ipsq->ipsq_pending_mp = NULL; 1282 q = mp->b_queue; 1283 mp->b_next = NULL; 1284 mp->b_prev = NULL; 1285 mp->b_queue = NULL; 1286 1287 /* If MOVE was in progress, clear the move_in_progress fields also. */ 1288 ill = ipsq->ipsq_pending_ipif->ipif_ill; 1289 if (ill->ill_move_in_progress) { 1290 ILL_CLEAR_MOVE(ill); 1291 } else if (ill->ill_up_ipifs) { 1292 ill_group_cleanup(ill); 1293 } 1294 1295 ipif = ipsq->ipsq_pending_ipif; 1296 ipsq->ipsq_pending_ipif = NULL; 1297 ipsq->ipsq_waitfor = 0; 1298 ipsq->ipsq_current_ipif = NULL; 1299 ipsq->ipsq_current_ioctl = 0; 1300 mutex_exit(&ipsq->ipsq_lock); 1301 1302 if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) { 1303 if (connp == NULL) { 1304 ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL); 1305 } else { 1306 ip_ioctl_finish(q, mp, ENXIO, CONN_CLOSE, NULL); 1307 mutex_enter(&ipif->ipif_ill->ill_lock); 1308 ipif->ipif_state_flags &= ~IPIF_CHANGING; 1309 mutex_exit(&ipif->ipif_ill->ill_lock); 1310 } 1311 } else { 1312 /* 1313 * IP-MT XXX In the case of TLI/XTI bind / optmgmt this can't 1314 * be just inet_freemsg. 
we have to restart it 1315 * otherwise the thread will be stuck. 1316 */ 1317 inet_freemsg(mp); 1318 } 1319 return (B_TRUE); 1320 } 1321 1322 /* 1323 * The ill is closing. Cleanup all the pending mps. Called exclusively 1324 * towards the end of ill_delete. The refcount has gone to 0. So nobody 1325 * knows this ill, and hence nobody can add an mp to this list 1326 */ 1327 static void 1328 ill_pending_mp_cleanup(ill_t *ill) 1329 { 1330 mblk_t *mp; 1331 queue_t *q; 1332 1333 ASSERT(IAM_WRITER_ILL(ill)); 1334 1335 mutex_enter(&ill->ill_lock); 1336 /* 1337 * Every mp on the pending mp list originating from an ioctl 1338 * added 1 to the conn refcnt, at the start of the ioctl. 1339 * So bump it down now. See comments in ip_wput_nondata() 1340 */ 1341 while (ill->ill_pending_mp != NULL) { 1342 mp = ill->ill_pending_mp; 1343 ill->ill_pending_mp = mp->b_next; 1344 mutex_exit(&ill->ill_lock); 1345 1346 q = mp->b_queue; 1347 ASSERT(CONN_Q(q)); 1348 mp->b_next = NULL; 1349 mp->b_prev = NULL; 1350 mp->b_queue = NULL; 1351 ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL); 1352 mutex_enter(&ill->ill_lock); 1353 } 1354 ill->ill_pending_ipif = NULL; 1355 1356 mutex_exit(&ill->ill_lock); 1357 } 1358 1359 /* 1360 * Called in the conn close path and ill delete path 1361 */ 1362 static void 1363 ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp) 1364 { 1365 ipsq_t *ipsq; 1366 mblk_t *prev; 1367 mblk_t *curr; 1368 mblk_t *next; 1369 queue_t *q; 1370 mblk_t *tmp_list = NULL; 1371 1372 ASSERT(IAM_WRITER_ILL(ill)); 1373 if (connp != NULL) 1374 q = CONNP_TO_WQ(connp); 1375 else 1376 q = ill->ill_wq; 1377 1378 ipsq = ill->ill_phyint->phyint_ipsq; 1379 /* 1380 * Cleanup the ioctl mp's queued in ipsq_xopq_pending_mp if any. 1381 * In the case of ioctl from a conn, there can be only 1 mp 1382 * queued on the ipsq. If an ill is being unplumbed, only messages 1383 * related to this ill are flushed, like M_ERROR or M_HANGUP message. 1384 * ioctls meant for this ill form conn's are not flushed. 
They will 1385 * be processed during ipsq_exit and will not find the ill and will 1386 * return error. 1387 */ 1388 mutex_enter(&ipsq->ipsq_lock); 1389 for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL; 1390 curr = next) { 1391 next = curr->b_next; 1392 if (curr->b_queue == q || curr->b_queue == RD(q)) { 1393 /* Unlink the mblk from the pending mp list */ 1394 if (prev != NULL) { 1395 prev->b_next = curr->b_next; 1396 } else { 1397 ASSERT(ipsq->ipsq_xopq_mphead == curr); 1398 ipsq->ipsq_xopq_mphead = curr->b_next; 1399 } 1400 if (ipsq->ipsq_xopq_mptail == curr) 1401 ipsq->ipsq_xopq_mptail = prev; 1402 /* 1403 * Create a temporary list and release the ipsq lock 1404 * New elements are added to the head of the tmp_list 1405 */ 1406 curr->b_next = tmp_list; 1407 tmp_list = curr; 1408 } else { 1409 prev = curr; 1410 } 1411 } 1412 mutex_exit(&ipsq->ipsq_lock); 1413 1414 while (tmp_list != NULL) { 1415 curr = tmp_list; 1416 tmp_list = curr->b_next; 1417 curr->b_next = NULL; 1418 curr->b_prev = NULL; 1419 curr->b_queue = NULL; 1420 if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) { 1421 ip_ioctl_finish(q, curr, ENXIO, connp != NULL ? 1422 CONN_CLOSE : NO_COPYOUT, NULL); 1423 } else { 1424 /* 1425 * IP-MT XXX In the case of TLI/XTI bind / optmgmt 1426 * this can't be just inet_freemsg. we have to 1427 * restart it otherwise the thread will be stuck. 1428 */ 1429 inet_freemsg(curr); 1430 } 1431 } 1432 } 1433 1434 /* 1435 * This conn has started closing. Cleanup any pending ioctl from this conn. 1436 * STREAMS ensures that there can be at most 1 ioctl pending on a stream. 1437 */ 1438 void 1439 conn_ioctl_cleanup(conn_t *connp) 1440 { 1441 mblk_t *curr; 1442 ipsq_t *ipsq; 1443 ill_t *ill; 1444 boolean_t refheld; 1445 1446 /* 1447 * Is any exclusive ioctl pending ? If so clean it up. If the 1448 * ioctl has not yet started, the mp is pending in the list headed by 1449 * ipsq_xopq_head. 
If the ioctl has started the mp could be present in 1450 * ipsq_pending_mp. If the ioctl timed out in the streamhead but 1451 * is currently executing now the mp is not queued anywhere but 1452 * conn_oper_pending_ill is null. The conn close will wait 1453 * till the conn_ref drops to zero. 1454 */ 1455 mutex_enter(&connp->conn_lock); 1456 ill = connp->conn_oper_pending_ill; 1457 if (ill == NULL) { 1458 mutex_exit(&connp->conn_lock); 1459 return; 1460 } 1461 1462 curr = ill_pending_mp_get(ill, &connp, 0); 1463 if (curr != NULL) { 1464 mutex_exit(&connp->conn_lock); 1465 CONN_DEC_REF(connp); 1466 inet_freemsg(curr); 1467 return; 1468 } 1469 /* 1470 * We may not be able to refhold the ill if the ill/ipif 1471 * is changing. But we need to make sure that the ill will 1472 * not vanish. So we just bump up the ill_waiter count. 1473 */ 1474 refheld = ill_waiter_inc(ill); 1475 mutex_exit(&connp->conn_lock); 1476 if (refheld) { 1477 if (ipsq_enter(ill, B_TRUE)) { 1478 ill_waiter_dcr(ill); 1479 /* 1480 * Check whether this ioctl has started and is 1481 * pending now in ipsq_pending_mp. If it is not 1482 * found there then check whether this ioctl has 1483 * not even started and is in the ipsq_xopq list. 1484 */ 1485 if (!ipsq_pending_mp_cleanup(ill, connp)) 1486 ipsq_xopq_mp_cleanup(ill, connp); 1487 ipsq = ill->ill_phyint->phyint_ipsq; 1488 ipsq_exit(ipsq, B_TRUE, B_TRUE); 1489 return; 1490 } 1491 } 1492 1493 /* 1494 * The ill is also closing and we could not bump up the 1495 * ill_waiter_count or we could not enter the ipsq. Leave 1496 * the cleanup to ill_delete 1497 */ 1498 mutex_enter(&connp->conn_lock); 1499 while (connp->conn_oper_pending_ill != NULL) 1500 cv_wait(&connp->conn_refcv, &connp->conn_lock); 1501 mutex_exit(&connp->conn_lock); 1502 if (refheld) 1503 ill_waiter_dcr(ill); 1504 } 1505 1506 /* 1507 * ipcl_walk function for cleaning up conn_*_ill fields. 
1508 */ 1509 static void 1510 conn_cleanup_ill(conn_t *connp, caddr_t arg) 1511 { 1512 ill_t *ill = (ill_t *)arg; 1513 ire_t *ire; 1514 1515 mutex_enter(&connp->conn_lock); 1516 if (connp->conn_multicast_ill == ill) { 1517 /* Revert to late binding */ 1518 connp->conn_multicast_ill = NULL; 1519 connp->conn_orig_multicast_ifindex = 0; 1520 } 1521 if (connp->conn_incoming_ill == ill) 1522 connp->conn_incoming_ill = NULL; 1523 if (connp->conn_outgoing_ill == ill) 1524 connp->conn_outgoing_ill = NULL; 1525 if (connp->conn_outgoing_pill == ill) 1526 connp->conn_outgoing_pill = NULL; 1527 if (connp->conn_nofailover_ill == ill) 1528 connp->conn_nofailover_ill = NULL; 1529 if (connp->conn_xmit_if_ill == ill) 1530 connp->conn_xmit_if_ill = NULL; 1531 if (connp->conn_ire_cache != NULL) { 1532 ire = connp->conn_ire_cache; 1533 /* 1534 * ip_newroute creates IRE_CACHE with ire_stq coming from 1535 * interface X and ipif coming from interface Y, if interface 1536 * X and Y are part of the same IPMPgroup. Thus whenever 1537 * interface X goes down, remove all references to it by 1538 * checking both on ire_ipif and ire_stq. 
1539 */ 1540 if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) || 1541 (ire->ire_type == IRE_CACHE && 1542 ire->ire_stq == ill->ill_wq)) { 1543 connp->conn_ire_cache = NULL; 1544 mutex_exit(&connp->conn_lock); 1545 ire_refrele_notr(ire); 1546 return; 1547 } 1548 } 1549 mutex_exit(&connp->conn_lock); 1550 1551 } 1552 1553 /* ARGSUSED */ 1554 void 1555 ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 1556 { 1557 ill_t *ill = q->q_ptr; 1558 ipif_t *ipif; 1559 1560 ASSERT(IAM_WRITER_IPSQ(ipsq)); 1561 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 1562 ipif_non_duplicate(ipif); 1563 ipif_down_tail(ipif); 1564 } 1565 freemsg(mp); 1566 ipsq_current_finish(ipsq); 1567 } 1568 1569 /* 1570 * ill_down_start is called when we want to down this ill and bring it up again 1571 * It is called when we receive an M_ERROR / M_HANGUP. In this case we shut down 1572 * all interfaces, but don't tear down any plumbing. 1573 */ 1574 boolean_t 1575 ill_down_start(queue_t *q, mblk_t *mp) 1576 { 1577 ill_t *ill = q->q_ptr; 1578 ipif_t *ipif; 1579 1580 ASSERT(IAM_WRITER_ILL(ill)); 1581 1582 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) 1583 (void) ipif_down(ipif, NULL, NULL); 1584 1585 ill_down(ill); 1586 1587 (void) ipsq_pending_mp_cleanup(ill, NULL); 1588 1589 ipsq_current_start(ill->ill_phyint->phyint_ipsq, ill->ill_ipif, 0); 1590 1591 /* 1592 * Atomically test and add the pending mp if references are active. 1593 */ 1594 mutex_enter(&ill->ill_lock); 1595 if (!ill_is_quiescent(ill)) { 1596 /* call cannot fail since `conn_t *' argument is NULL */ 1597 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq, 1598 mp, ILL_DOWN); 1599 mutex_exit(&ill->ill_lock); 1600 return (B_FALSE); 1601 } 1602 mutex_exit(&ill->ill_lock); 1603 return (B_TRUE); 1604 } 1605 1606 static void 1607 ill_down(ill_t *ill) 1608 { 1609 ip_stack_t *ipst = ill->ill_ipst; 1610 1611 /* Blow off any IREs dependent on this ILL. 
*/ 1612 ire_walk(ill_downi, (char *)ill, ipst); 1613 1614 /* Remove any conn_*_ill depending on this ill */ 1615 ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst); 1616 1617 if (ill->ill_group != NULL) { 1618 illgrp_delete(ill); 1619 } 1620 } 1621 1622 /* 1623 * ire_walk routine used to delete every IRE that depends on queues 1624 * associated with 'ill'. (Always called as writer.) 1625 */ 1626 static void 1627 ill_downi(ire_t *ire, char *ill_arg) 1628 { 1629 ill_t *ill = (ill_t *)ill_arg; 1630 1631 /* 1632 * ip_newroute creates IRE_CACHE with ire_stq coming from 1633 * interface X and ipif coming from interface Y, if interface 1634 * X and Y are part of the same IPMP group. Thus whenever interface 1635 * X goes down, remove all references to it by checking both 1636 * on ire_ipif and ire_stq. 1637 */ 1638 if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) || 1639 (ire->ire_type == IRE_CACHE && ire->ire_stq == ill->ill_wq)) { 1640 ire_delete(ire); 1641 } 1642 } 1643 1644 /* 1645 * Remove ire/nce from the fastpath list. 1646 */ 1647 void 1648 ill_fastpath_nack(ill_t *ill) 1649 { 1650 nce_fastpath_list_dispatch(ill, NULL, NULL); 1651 } 1652 1653 /* Consume an M_IOCACK of the fastpath probe. */ 1654 void 1655 ill_fastpath_ack(ill_t *ill, mblk_t *mp) 1656 { 1657 mblk_t *mp1 = mp; 1658 1659 /* 1660 * If this was the first attempt turn on the fastpath probing. 1661 */ 1662 mutex_enter(&ill->ill_lock); 1663 if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS) 1664 ill->ill_dlpi_fastpath_state = IDS_OK; 1665 mutex_exit(&ill->ill_lock); 1666 1667 /* Free the M_IOCACK mblk, hold on to the data */ 1668 mp = mp->b_cont; 1669 freeb(mp1); 1670 if (mp == NULL) 1671 return; 1672 if (mp->b_cont != NULL) { 1673 /* 1674 * Update all IRE's or NCE's that are waiting for 1675 * fastpath update. 
1676 */ 1677 nce_fastpath_list_dispatch(ill, ndp_fastpath_update, mp); 1678 mp1 = mp->b_cont; 1679 freeb(mp); 1680 mp = mp1; 1681 } else { 1682 ip0dbg(("ill_fastpath_ack: no b_cont\n")); 1683 } 1684 1685 freeb(mp); 1686 } 1687 1688 /* 1689 * Throw an M_IOCTL message downstream asking "do you know fastpath?" 1690 * The data portion of the request is a dl_unitdata_req_t template for 1691 * what we would send downstream in the absence of a fastpath confirmation. 1692 */ 1693 int 1694 ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp) 1695 { 1696 struct iocblk *ioc; 1697 mblk_t *mp; 1698 1699 if (dlur_mp == NULL) 1700 return (EINVAL); 1701 1702 mutex_enter(&ill->ill_lock); 1703 switch (ill->ill_dlpi_fastpath_state) { 1704 case IDS_FAILED: 1705 /* 1706 * Driver NAKed the first fastpath ioctl - assume it doesn't 1707 * support it. 1708 */ 1709 mutex_exit(&ill->ill_lock); 1710 return (ENOTSUP); 1711 case IDS_UNKNOWN: 1712 /* This is the first probe */ 1713 ill->ill_dlpi_fastpath_state = IDS_INPROGRESS; 1714 break; 1715 default: 1716 break; 1717 } 1718 mutex_exit(&ill->ill_lock); 1719 1720 if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL) 1721 return (EAGAIN); 1722 1723 mp->b_cont = copyb(dlur_mp); 1724 if (mp->b_cont == NULL) { 1725 freeb(mp); 1726 return (EAGAIN); 1727 } 1728 1729 ioc = (struct iocblk *)mp->b_rptr; 1730 ioc->ioc_count = msgdsize(mp->b_cont); 1731 1732 putnext(ill->ill_wq, mp); 1733 return (0); 1734 } 1735 1736 void 1737 ill_capability_probe(ill_t *ill) 1738 { 1739 /* 1740 * Do so only if capabilities are still unknown. 
1741 */ 1742 if (ill->ill_dlpi_capab_state != IDS_UNKNOWN) 1743 return; 1744 1745 ill->ill_dlpi_capab_state = IDS_INPROGRESS; 1746 ip1dbg(("ill_capability_probe: starting capability negotiation\n")); 1747 ill_capability_proto(ill, DL_CAPABILITY_REQ, NULL); 1748 } 1749 1750 void 1751 ill_capability_reset(ill_t *ill) 1752 { 1753 mblk_t *sc_mp = NULL; 1754 mblk_t *tmp; 1755 1756 /* 1757 * Note here that we reset the state to UNKNOWN, and later send 1758 * down the DL_CAPABILITY_REQ without first setting the state to 1759 * INPROGRESS. We do this in order to distinguish the 1760 * DL_CAPABILITY_ACK response which may come back in response to 1761 * a "reset" apart from the "probe" DL_CAPABILITY_REQ. This would 1762 * also handle the case where the driver doesn't send us back 1763 * a DL_CAPABILITY_ACK in response, since the "probe" routine 1764 * requires the state to be in UNKNOWN anyway. In any case, all 1765 * features are turned off until the state reaches IDS_OK. 1766 */ 1767 ill->ill_dlpi_capab_state = IDS_UNKNOWN; 1768 ill->ill_capab_reneg = B_FALSE; 1769 1770 /* 1771 * Disable sub-capabilities and request a list of sub-capability 1772 * messages which will be sent down to the driver. Each handler 1773 * allocates the corresponding dl_capability_sub_t inside an 1774 * mblk, and links it to the existing sc_mp mblk, or return it 1775 * as sc_mp if it's the first sub-capability (the passed in 1776 * sc_mp is NULL). Upon returning from all capability handlers, 1777 * sc_mp will be pulled-up, before passing it downstream. 1778 */ 1779 ill_capability_mdt_reset(ill, &sc_mp); 1780 ill_capability_hcksum_reset(ill, &sc_mp); 1781 ill_capability_zerocopy_reset(ill, &sc_mp); 1782 ill_capability_ipsec_reset(ill, &sc_mp); 1783 ill_capability_dls_reset(ill, &sc_mp); 1784 ill_capability_lso_reset(ill, &sc_mp); 1785 1786 /* Nothing to send down in order to disable the capabilities? 
*/ 1787 if (sc_mp == NULL) 1788 return; 1789 1790 tmp = msgpullup(sc_mp, -1); 1791 freemsg(sc_mp); 1792 if ((sc_mp = tmp) == NULL) { 1793 cmn_err(CE_WARN, "ill_capability_reset: unable to send down " 1794 "DL_CAPABILITY_REQ (ENOMEM)\n"); 1795 return; 1796 } 1797 1798 ip1dbg(("ill_capability_reset: resetting negotiated capabilities\n")); 1799 ill_capability_proto(ill, DL_CAPABILITY_REQ, sc_mp); 1800 } 1801 1802 /* 1803 * Request or set new-style hardware capabilities supported by DLS provider. 1804 */ 1805 static void 1806 ill_capability_proto(ill_t *ill, int type, mblk_t *reqp) 1807 { 1808 mblk_t *mp; 1809 dl_capability_req_t *capb; 1810 size_t size = 0; 1811 uint8_t *ptr; 1812 1813 if (reqp != NULL) 1814 size = MBLKL(reqp); 1815 1816 mp = ip_dlpi_alloc(sizeof (dl_capability_req_t) + size, type); 1817 if (mp == NULL) { 1818 freemsg(reqp); 1819 return; 1820 } 1821 ptr = mp->b_rptr; 1822 1823 capb = (dl_capability_req_t *)ptr; 1824 ptr += sizeof (dl_capability_req_t); 1825 1826 if (reqp != NULL) { 1827 capb->dl_sub_offset = sizeof (dl_capability_req_t); 1828 capb->dl_sub_length = size; 1829 bcopy(reqp->b_rptr, ptr, size); 1830 ptr += size; 1831 mp->b_cont = reqp->b_cont; 1832 freeb(reqp); 1833 } 1834 ASSERT(ptr == mp->b_wptr); 1835 1836 ill_dlpi_send(ill, mp); 1837 } 1838 1839 static void 1840 ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers) 1841 { 1842 dl_capab_id_t *id_ic; 1843 uint_t sub_dl_cap = outers->dl_cap; 1844 dl_capability_sub_t *inners; 1845 uint8_t *capend; 1846 1847 ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER); 1848 1849 /* 1850 * Note: range checks here are not absolutely sufficient to 1851 * make us robust against malformed messages sent by drivers; 1852 * this is in keeping with the rest of IP's dlpi handling. 
1853 * (Remember, it's coming from something else in the kernel 1854 * address space) 1855 */ 1856 1857 capend = (uint8_t *)(outers + 1) + outers->dl_length; 1858 if (capend > mp->b_wptr) { 1859 cmn_err(CE_WARN, "ill_capability_id_ack: " 1860 "malformed sub-capability too long for mblk"); 1861 return; 1862 } 1863 1864 id_ic = (dl_capab_id_t *)(outers + 1); 1865 1866 if (outers->dl_length < sizeof (*id_ic) || 1867 (inners = &id_ic->id_subcap, 1868 inners->dl_length > (outers->dl_length - sizeof (*inners)))) { 1869 cmn_err(CE_WARN, "ill_capability_id_ack: malformed " 1870 "encapsulated capab type %d too long for mblk", 1871 inners->dl_cap); 1872 return; 1873 } 1874 1875 if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) { 1876 ip1dbg(("ill_capability_id_ack: mid token for capab type %d " 1877 "isn't as expected; pass-thru module(s) detected, " 1878 "discarding capability\n", inners->dl_cap)); 1879 return; 1880 } 1881 1882 /* Process the encapsulated sub-capability */ 1883 ill_capability_dispatch(ill, mp, inners, B_TRUE); 1884 } 1885 1886 /* 1887 * Process Multidata Transmit capability negotiation ack received from a 1888 * DLS Provider. isub must point to the sub-capability (DL_CAPAB_MDT) of a 1889 * DL_CAPABILITY_ACK message. 1890 */ 1891 static void 1892 ill_capability_mdt_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 1893 { 1894 mblk_t *nmp = NULL; 1895 dl_capability_req_t *oc; 1896 dl_capab_mdt_t *mdt_ic, *mdt_oc; 1897 ill_mdt_capab_t **ill_mdt_capab; 1898 uint_t sub_dl_cap = isub->dl_cap; 1899 uint8_t *capend; 1900 1901 ASSERT(sub_dl_cap == DL_CAPAB_MDT); 1902 1903 ill_mdt_capab = (ill_mdt_capab_t **)&ill->ill_mdt_capab; 1904 1905 /* 1906 * Note: range checks here are not absolutely sufficient to 1907 * make us robust against malformed messages sent by drivers; 1908 * this is in keeping with the rest of IP's dlpi handling. 
1909 * (Remember, it's coming from something else in the kernel 1910 * address space) 1911 */ 1912 1913 capend = (uint8_t *)(isub + 1) + isub->dl_length; 1914 if (capend > mp->b_wptr) { 1915 cmn_err(CE_WARN, "ill_capability_mdt_ack: " 1916 "malformed sub-capability too long for mblk"); 1917 return; 1918 } 1919 1920 mdt_ic = (dl_capab_mdt_t *)(isub + 1); 1921 1922 if (mdt_ic->mdt_version != MDT_VERSION_2) { 1923 cmn_err(CE_CONT, "ill_capability_mdt_ack: " 1924 "unsupported MDT sub-capability (version %d, expected %d)", 1925 mdt_ic->mdt_version, MDT_VERSION_2); 1926 return; 1927 } 1928 1929 if (!dlcapabcheckqid(&mdt_ic->mdt_mid, ill->ill_lmod_rq)) { 1930 ip1dbg(("ill_capability_mdt_ack: mid token for MDT " 1931 "capability isn't as expected; pass-thru module(s) " 1932 "detected, discarding capability\n")); 1933 return; 1934 } 1935 1936 if (mdt_ic->mdt_flags & DL_CAPAB_MDT_ENABLE) { 1937 1938 if (*ill_mdt_capab == NULL) { 1939 *ill_mdt_capab = kmem_zalloc(sizeof (ill_mdt_capab_t), 1940 KM_NOSLEEP); 1941 1942 if (*ill_mdt_capab == NULL) { 1943 cmn_err(CE_WARN, "ill_capability_mdt_ack: " 1944 "could not enable MDT version %d " 1945 "for %s (ENOMEM)\n", MDT_VERSION_2, 1946 ill->ill_name); 1947 return; 1948 } 1949 } 1950 1951 ip1dbg(("ill_capability_mdt_ack: interface %s supports " 1952 "MDT version %d (%d bytes leading, %d bytes trailing " 1953 "header spaces, %d max pld bufs, %d span limit)\n", 1954 ill->ill_name, MDT_VERSION_2, 1955 mdt_ic->mdt_hdr_head, mdt_ic->mdt_hdr_tail, 1956 mdt_ic->mdt_max_pld, mdt_ic->mdt_span_limit)); 1957 1958 (*ill_mdt_capab)->ill_mdt_version = MDT_VERSION_2; 1959 (*ill_mdt_capab)->ill_mdt_on = 1; 1960 /* 1961 * Round the following values to the nearest 32-bit; ULP 1962 * may further adjust them to accomodate for additional 1963 * protocol headers. We pass these values to ULP during 1964 * bind time. 
1965 */ 1966 (*ill_mdt_capab)->ill_mdt_hdr_head = 1967 roundup(mdt_ic->mdt_hdr_head, 4); 1968 (*ill_mdt_capab)->ill_mdt_hdr_tail = 1969 roundup(mdt_ic->mdt_hdr_tail, 4); 1970 (*ill_mdt_capab)->ill_mdt_max_pld = mdt_ic->mdt_max_pld; 1971 (*ill_mdt_capab)->ill_mdt_span_limit = mdt_ic->mdt_span_limit; 1972 1973 ill->ill_capabilities |= ILL_CAPAB_MDT; 1974 } else { 1975 uint_t size; 1976 uchar_t *rptr; 1977 1978 size = sizeof (dl_capability_req_t) + 1979 sizeof (dl_capability_sub_t) + sizeof (dl_capab_mdt_t); 1980 1981 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 1982 cmn_err(CE_WARN, "ill_capability_mdt_ack: " 1983 "could not enable MDT for %s (ENOMEM)\n", 1984 ill->ill_name); 1985 return; 1986 } 1987 1988 rptr = nmp->b_rptr; 1989 /* initialize dl_capability_req_t */ 1990 oc = (dl_capability_req_t *)nmp->b_rptr; 1991 oc->dl_sub_offset = sizeof (dl_capability_req_t); 1992 oc->dl_sub_length = sizeof (dl_capability_sub_t) + 1993 sizeof (dl_capab_mdt_t); 1994 nmp->b_rptr += sizeof (dl_capability_req_t); 1995 1996 /* initialize dl_capability_sub_t */ 1997 bcopy(isub, nmp->b_rptr, sizeof (*isub)); 1998 nmp->b_rptr += sizeof (*isub); 1999 2000 /* initialize dl_capab_mdt_t */ 2001 mdt_oc = (dl_capab_mdt_t *)nmp->b_rptr; 2002 bcopy(mdt_ic, mdt_oc, sizeof (*mdt_ic)); 2003 2004 nmp->b_rptr = rptr; 2005 2006 ip1dbg(("ill_capability_mdt_ack: asking interface %s " 2007 "to enable MDT version %d\n", ill->ill_name, 2008 MDT_VERSION_2)); 2009 2010 /* set ENABLE flag */ 2011 mdt_oc->mdt_flags |= DL_CAPAB_MDT_ENABLE; 2012 2013 /* nmp points to a DL_CAPABILITY_REQ message to enable MDT */ 2014 ill_dlpi_send(ill, nmp); 2015 } 2016 } 2017 2018 static void 2019 ill_capability_mdt_reset(ill_t *ill, mblk_t **sc_mp) 2020 { 2021 mblk_t *mp; 2022 dl_capab_mdt_t *mdt_subcap; 2023 dl_capability_sub_t *dl_subcap; 2024 int size; 2025 2026 if (!ILL_MDT_CAPABLE(ill)) 2027 return; 2028 2029 ASSERT(ill->ill_mdt_capab != NULL); 2030 /* 2031 * Clear the capability flag for MDT but retain 
the ill_mdt_capab 2032 * structure since it's possible that another thread is still 2033 * referring to it. The structure only gets deallocated when 2034 * we destroy the ill. 2035 */ 2036 ill->ill_capabilities &= ~ILL_CAPAB_MDT; 2037 2038 size = sizeof (*dl_subcap) + sizeof (*mdt_subcap); 2039 2040 mp = allocb(size, BPRI_HI); 2041 if (mp == NULL) { 2042 ip1dbg(("ill_capability_mdt_reset: unable to allocate " 2043 "request to disable MDT\n")); 2044 return; 2045 } 2046 2047 mp->b_wptr = mp->b_rptr + size; 2048 2049 dl_subcap = (dl_capability_sub_t *)mp->b_rptr; 2050 dl_subcap->dl_cap = DL_CAPAB_MDT; 2051 dl_subcap->dl_length = sizeof (*mdt_subcap); 2052 2053 mdt_subcap = (dl_capab_mdt_t *)(dl_subcap + 1); 2054 mdt_subcap->mdt_version = ill->ill_mdt_capab->ill_mdt_version; 2055 mdt_subcap->mdt_flags = 0; 2056 mdt_subcap->mdt_hdr_head = 0; 2057 mdt_subcap->mdt_hdr_tail = 0; 2058 2059 if (*sc_mp != NULL) 2060 linkb(*sc_mp, mp); 2061 else 2062 *sc_mp = mp; 2063 } 2064 2065 /* 2066 * Send a DL_NOTIFY_REQ to the specified ill to enable 2067 * DL_NOTE_PROMISC_ON/OFF_PHYS notifications. 2068 * Invoked by ill_capability_ipsec_ack() before enabling IPsec hardware 2069 * acceleration. 2070 * Returns B_TRUE on success, B_FALSE if the message could not be sent. 2071 */ 2072 static boolean_t 2073 ill_enable_promisc_notify(ill_t *ill) 2074 { 2075 mblk_t *mp; 2076 dl_notify_req_t *req; 2077 2078 IPSECHW_DEBUG(IPSECHW_PKT, ("ill_enable_promisc_notify:\n")); 2079 2080 mp = ip_dlpi_alloc(sizeof (dl_notify_req_t), DL_NOTIFY_REQ); 2081 if (mp == NULL) 2082 return (B_FALSE); 2083 2084 req = (dl_notify_req_t *)mp->b_rptr; 2085 req->dl_notifications = DL_NOTE_PROMISC_ON_PHYS | 2086 DL_NOTE_PROMISC_OFF_PHYS; 2087 2088 ill_dlpi_send(ill, mp); 2089 2090 return (B_TRUE); 2091 } 2092 2093 2094 /* 2095 * Allocate an IPsec capability request which will be filled by our 2096 * caller to turn on support for one or more algorithms. 
2097 */ 2098 static mblk_t * 2099 ill_alloc_ipsec_cap_req(ill_t *ill, dl_capability_sub_t *isub) 2100 { 2101 mblk_t *nmp; 2102 dl_capability_req_t *ocap; 2103 dl_capab_ipsec_t *ocip; 2104 dl_capab_ipsec_t *icip; 2105 uint8_t *ptr; 2106 icip = (dl_capab_ipsec_t *)(isub + 1); 2107 2108 /* 2109 * The first time around, we send a DL_NOTIFY_REQ to enable 2110 * PROMISC_ON/OFF notification from the provider. We need to 2111 * do this before enabling the algorithms to avoid leakage of 2112 * cleartext packets. 2113 */ 2114 2115 if (!ill_enable_promisc_notify(ill)) 2116 return (NULL); 2117 2118 /* 2119 * Allocate new mblk which will contain a new capability 2120 * request to enable the capabilities. 2121 */ 2122 2123 nmp = ip_dlpi_alloc(sizeof (dl_capability_req_t) + 2124 sizeof (dl_capability_sub_t) + isub->dl_length, DL_CAPABILITY_REQ); 2125 if (nmp == NULL) 2126 return (NULL); 2127 2128 ptr = nmp->b_rptr; 2129 2130 /* initialize dl_capability_req_t */ 2131 ocap = (dl_capability_req_t *)ptr; 2132 ocap->dl_sub_offset = sizeof (dl_capability_req_t); 2133 ocap->dl_sub_length = sizeof (dl_capability_sub_t) + isub->dl_length; 2134 ptr += sizeof (dl_capability_req_t); 2135 2136 /* initialize dl_capability_sub_t */ 2137 bcopy(isub, ptr, sizeof (*isub)); 2138 ptr += sizeof (*isub); 2139 2140 /* initialize dl_capab_ipsec_t */ 2141 ocip = (dl_capab_ipsec_t *)ptr; 2142 bcopy(icip, ocip, sizeof (*icip)); 2143 2144 nmp->b_wptr = (uchar_t *)(&ocip->cip_data[0]); 2145 return (nmp); 2146 } 2147 2148 /* 2149 * Process an IPsec capability negotiation ack received from a DLS Provider. 2150 * isub must point to the sub-capability (DL_CAPAB_IPSEC_AH or 2151 * DL_CAPAB_IPSEC_ESP) of a DL_CAPABILITY_ACK message. 2152 */ 2153 static void 2154 ill_capability_ipsec_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 2155 { 2156 dl_capab_ipsec_t *icip; 2157 dl_capab_ipsec_alg_t *ialg; /* ptr to input alg spec. */ 2158 dl_capab_ipsec_alg_t *oalg; /* ptr to output alg spec. 
*/ 2159 uint_t cipher, nciphers; 2160 mblk_t *nmp; 2161 uint_t alg_len; 2162 boolean_t need_sadb_dump; 2163 uint_t sub_dl_cap = isub->dl_cap; 2164 ill_ipsec_capab_t **ill_capab; 2165 uint64_t ill_capab_flag; 2166 uint8_t *capend, *ciphend; 2167 boolean_t sadb_resync; 2168 2169 ASSERT(sub_dl_cap == DL_CAPAB_IPSEC_AH || 2170 sub_dl_cap == DL_CAPAB_IPSEC_ESP); 2171 2172 if (sub_dl_cap == DL_CAPAB_IPSEC_AH) { 2173 ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_ah; 2174 ill_capab_flag = ILL_CAPAB_AH; 2175 } else { 2176 ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_esp; 2177 ill_capab_flag = ILL_CAPAB_ESP; 2178 } 2179 2180 /* 2181 * If the ill capability structure exists, then this incoming 2182 * DL_CAPABILITY_ACK is a response to a "renegotiation" cycle. 2183 * If this is so, then we'd need to resynchronize the SADB 2184 * after re-enabling the offloaded ciphers. 2185 */ 2186 sadb_resync = (*ill_capab != NULL); 2187 2188 /* 2189 * Note: range checks here are not absolutely sufficient to 2190 * make us robust against malformed messages sent by drivers; 2191 * this is in keeping with the rest of IP's dlpi handling. 2192 * (Remember, it's coming from something else in the kernel 2193 * address space) 2194 */ 2195 2196 capend = (uint8_t *)(isub + 1) + isub->dl_length; 2197 if (capend > mp->b_wptr) { 2198 cmn_err(CE_WARN, "ill_capability_ipsec_ack: " 2199 "malformed sub-capability too long for mblk"); 2200 return; 2201 } 2202 2203 /* 2204 * There are two types of acks we process here: 2205 * 1. acks in reply to a (first form) generic capability req 2206 * (no ENABLE flag set) 2207 * 2. acks in reply to a ENABLE capability req. 
2208 * (ENABLE flag set) 2209 * 2210 * We process the subcapability passed as argument as follows: 2211 * 1 do initializations 2212 * 1.1 initialize nmp = NULL 2213 * 1.2 set need_sadb_dump to B_FALSE 2214 * 2 for each cipher in subcapability: 2215 * 2.1 if ENABLE flag is set: 2216 * 2.1.1 update per-ill ipsec capabilities info 2217 * 2.1.2 set need_sadb_dump to B_TRUE 2218 * 2.2 if ENABLE flag is not set: 2219 * 2.2.1 if nmp is NULL: 2220 * 2.2.1.1 allocate and initialize nmp 2221 * 2.2.1.2 init current pos in nmp 2222 * 2.2.2 copy current cipher to current pos in nmp 2223 * 2.2.3 set ENABLE flag in nmp 2224 * 2.2.4 update current pos 2225 * 3 if nmp is not equal to NULL, send enable request 2226 * 3.1 send capability request 2227 * 4 if need_sadb_dump is B_TRUE 2228 * 4.1 enable promiscuous on/off notifications 2229 * 4.2 call ill_dlpi_send(isub->dlcap) to send all 2230 * AH or ESP SA's to interface. 2231 */ 2232 2233 nmp = NULL; 2234 oalg = NULL; 2235 need_sadb_dump = B_FALSE; 2236 icip = (dl_capab_ipsec_t *)(isub + 1); 2237 ialg = (dl_capab_ipsec_alg_t *)(&icip->cip_data[0]); 2238 2239 nciphers = icip->cip_nciphers; 2240 ciphend = (uint8_t *)(ialg + icip->cip_nciphers); 2241 2242 if (ciphend > capend) { 2243 cmn_err(CE_WARN, "ill_capability_ipsec_ack: " 2244 "too many ciphers for sub-capability len"); 2245 return; 2246 } 2247 2248 for (cipher = 0; cipher < nciphers; cipher++) { 2249 alg_len = sizeof (dl_capab_ipsec_alg_t); 2250 2251 if (ialg->alg_flag & DL_CAPAB_ALG_ENABLE) { 2252 /* 2253 * TBD: when we provide a way to disable capabilities 2254 * from above, need to manage the request-pending state 2255 * and fail if we were not expecting this ACK. 
2256 */ 2257 IPSECHW_DEBUG(IPSECHW_CAPAB, 2258 ("ill_capability_ipsec_ack: got ENABLE ACK\n")); 2259 2260 /* 2261 * Update IPsec capabilities for this ill 2262 */ 2263 2264 if (*ill_capab == NULL) { 2265 IPSECHW_DEBUG(IPSECHW_CAPAB, 2266 ("ill_capability_ipsec_ack: " 2267 "allocating ipsec_capab for ill\n")); 2268 *ill_capab = ill_ipsec_capab_alloc(); 2269 2270 if (*ill_capab == NULL) { 2271 cmn_err(CE_WARN, 2272 "ill_capability_ipsec_ack: " 2273 "could not enable IPsec Hardware " 2274 "acceleration for %s (ENOMEM)\n", 2275 ill->ill_name); 2276 return; 2277 } 2278 } 2279 2280 ASSERT(ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH || 2281 ialg->alg_type == DL_CAPAB_IPSEC_ALG_ENCR); 2282 2283 if (ialg->alg_prim >= MAX_IPSEC_ALGS) { 2284 cmn_err(CE_WARN, 2285 "ill_capability_ipsec_ack: " 2286 "malformed IPsec algorithm id %d", 2287 ialg->alg_prim); 2288 continue; 2289 } 2290 2291 if (ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH) { 2292 IPSEC_ALG_ENABLE((*ill_capab)->auth_hw_algs, 2293 ialg->alg_prim); 2294 } else { 2295 ipsec_capab_algparm_t *alp; 2296 2297 IPSEC_ALG_ENABLE((*ill_capab)->encr_hw_algs, 2298 ialg->alg_prim); 2299 if (!ill_ipsec_capab_resize_algparm(*ill_capab, 2300 ialg->alg_prim)) { 2301 cmn_err(CE_WARN, 2302 "ill_capability_ipsec_ack: " 2303 "no space for IPsec alg id %d", 2304 ialg->alg_prim); 2305 continue; 2306 } 2307 alp = &((*ill_capab)->encr_algparm[ 2308 ialg->alg_prim]); 2309 alp->minkeylen = ialg->alg_minbits; 2310 alp->maxkeylen = ialg->alg_maxbits; 2311 } 2312 ill->ill_capabilities |= ill_capab_flag; 2313 /* 2314 * indicate that a capability was enabled, which 2315 * will be used below to kick off a SADB dump 2316 * to the ill. 
2317 */ 2318 need_sadb_dump = B_TRUE; 2319 } else { 2320 IPSECHW_DEBUG(IPSECHW_CAPAB, 2321 ("ill_capability_ipsec_ack: enabling alg 0x%x\n", 2322 ialg->alg_prim)); 2323 2324 if (nmp == NULL) { 2325 nmp = ill_alloc_ipsec_cap_req(ill, isub); 2326 if (nmp == NULL) { 2327 /* 2328 * Sending the PROMISC_ON/OFF 2329 * notification request failed. 2330 * We cannot enable the algorithms 2331 * since the Provider will not 2332 * notify IP of promiscous mode 2333 * changes, which could lead 2334 * to leakage of packets. 2335 */ 2336 cmn_err(CE_WARN, 2337 "ill_capability_ipsec_ack: " 2338 "could not enable IPsec Hardware " 2339 "acceleration for %s (ENOMEM)\n", 2340 ill->ill_name); 2341 return; 2342 } 2343 /* ptr to current output alg specifier */ 2344 oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr; 2345 } 2346 2347 /* 2348 * Copy current alg specifier, set ENABLE 2349 * flag, and advance to next output alg. 2350 * For now we enable all IPsec capabilities. 2351 */ 2352 ASSERT(oalg != NULL); 2353 bcopy(ialg, oalg, alg_len); 2354 oalg->alg_flag |= DL_CAPAB_ALG_ENABLE; 2355 nmp->b_wptr += alg_len; 2356 oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr; 2357 } 2358 2359 /* move to next input algorithm specifier */ 2360 ialg = (dl_capab_ipsec_alg_t *) 2361 ((char *)ialg + alg_len); 2362 } 2363 2364 if (nmp != NULL) 2365 /* 2366 * nmp points to a DL_CAPABILITY_REQ message to enable 2367 * IPsec hardware acceleration. 2368 */ 2369 ill_dlpi_send(ill, nmp); 2370 2371 if (need_sadb_dump) 2372 /* 2373 * An acknowledgement corresponding to a request to 2374 * enable acceleration was received, notify SADB. 2375 */ 2376 ill_ipsec_capab_add(ill, sub_dl_cap, sadb_resync); 2377 } 2378 2379 /* 2380 * Given an mblk with enough space in it, create sub-capability entries for 2381 * DL_CAPAB_IPSEC_{AH,ESP} types which consist of previously-advertised 2382 * offloaded ciphers (both AUTH and ENCR) with their enable flags cleared, 2383 * in preparation for the reset the DL_CAPABILITY_REQ message. 
2384 */ 2385 static void 2386 ill_fill_ipsec_reset(uint_t nciphers, int stype, uint_t slen, 2387 ill_ipsec_capab_t *ill_cap, mblk_t *mp) 2388 { 2389 dl_capab_ipsec_t *oipsec; 2390 dl_capab_ipsec_alg_t *oalg; 2391 dl_capability_sub_t *dl_subcap; 2392 int i, k; 2393 2394 ASSERT(nciphers > 0); 2395 ASSERT(ill_cap != NULL); 2396 ASSERT(mp != NULL); 2397 ASSERT(MBLKTAIL(mp) >= sizeof (*dl_subcap) + sizeof (*oipsec) + slen); 2398 2399 /* dl_capability_sub_t for "stype" */ 2400 dl_subcap = (dl_capability_sub_t *)mp->b_wptr; 2401 dl_subcap->dl_cap = stype; 2402 dl_subcap->dl_length = sizeof (dl_capab_ipsec_t) + slen; 2403 mp->b_wptr += sizeof (dl_capability_sub_t); 2404 2405 /* dl_capab_ipsec_t for "stype" */ 2406 oipsec = (dl_capab_ipsec_t *)mp->b_wptr; 2407 oipsec->cip_version = 1; 2408 oipsec->cip_nciphers = nciphers; 2409 mp->b_wptr = (uchar_t *)&oipsec->cip_data[0]; 2410 2411 /* create entries for "stype" AUTH ciphers */ 2412 for (i = 0; i < ill_cap->algs_size; i++) { 2413 for (k = 0; k < BITSPERBYTE; k++) { 2414 if ((ill_cap->auth_hw_algs[i] & (1 << k)) == 0) 2415 continue; 2416 2417 oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr; 2418 bzero((void *)oalg, sizeof (*oalg)); 2419 oalg->alg_type = DL_CAPAB_IPSEC_ALG_AUTH; 2420 oalg->alg_prim = k + (BITSPERBYTE * i); 2421 mp->b_wptr += sizeof (dl_capab_ipsec_alg_t); 2422 } 2423 } 2424 /* create entries for "stype" ENCR ciphers */ 2425 for (i = 0; i < ill_cap->algs_size; i++) { 2426 for (k = 0; k < BITSPERBYTE; k++) { 2427 if ((ill_cap->encr_hw_algs[i] & (1 << k)) == 0) 2428 continue; 2429 2430 oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr; 2431 bzero((void *)oalg, sizeof (*oalg)); 2432 oalg->alg_type = DL_CAPAB_IPSEC_ALG_ENCR; 2433 oalg->alg_prim = k + (BITSPERBYTE * i); 2434 mp->b_wptr += sizeof (dl_capab_ipsec_alg_t); 2435 } 2436 } 2437 } 2438 2439 /* 2440 * Macro to count number of 1s in a byte (8-bit word). The total count is 2441 * accumulated into the passed-in argument (sum). 
We could use SPARCv9's 2442 * POPC instruction, but our macro is more flexible for an arbitrary length 2443 * of bytes, such as {auth,encr}_hw_algs. These variables are currently 2444 * 256-bits long (MAX_IPSEC_ALGS), so if we know for sure that the length 2445 * stays that way, we can reduce the number of iterations required. 2446 */ 2447 #define COUNT_1S(val, sum) { \ 2448 uint8_t x = val & 0xff; \ 2449 x = (x & 0x55) + ((x >> 1) & 0x55); \ 2450 x = (x & 0x33) + ((x >> 2) & 0x33); \ 2451 sum += (x & 0xf) + ((x >> 4) & 0xf); \ 2452 } 2453 2454 /* ARGSUSED */ 2455 static void 2456 ill_capability_ipsec_reset(ill_t *ill, mblk_t **sc_mp) 2457 { 2458 mblk_t *mp; 2459 ill_ipsec_capab_t *cap_ah = ill->ill_ipsec_capab_ah; 2460 ill_ipsec_capab_t *cap_esp = ill->ill_ipsec_capab_esp; 2461 uint64_t ill_capabilities = ill->ill_capabilities; 2462 int ah_cnt = 0, esp_cnt = 0; 2463 int ah_len = 0, esp_len = 0; 2464 int i, size = 0; 2465 2466 if (!(ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP))) 2467 return; 2468 2469 ASSERT(cap_ah != NULL || !(ill_capabilities & ILL_CAPAB_AH)); 2470 ASSERT(cap_esp != NULL || !(ill_capabilities & ILL_CAPAB_ESP)); 2471 2472 /* Find out the number of ciphers for AH */ 2473 if (cap_ah != NULL) { 2474 for (i = 0; i < cap_ah->algs_size; i++) { 2475 COUNT_1S(cap_ah->auth_hw_algs[i], ah_cnt); 2476 COUNT_1S(cap_ah->encr_hw_algs[i], ah_cnt); 2477 } 2478 if (ah_cnt > 0) { 2479 size += sizeof (dl_capability_sub_t) + 2480 sizeof (dl_capab_ipsec_t); 2481 /* dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */ 2482 ah_len = (ah_cnt - 1) * sizeof (dl_capab_ipsec_alg_t); 2483 size += ah_len; 2484 } 2485 } 2486 2487 /* Find out the number of ciphers for ESP */ 2488 if (cap_esp != NULL) { 2489 for (i = 0; i < cap_esp->algs_size; i++) { 2490 COUNT_1S(cap_esp->auth_hw_algs[i], esp_cnt); 2491 COUNT_1S(cap_esp->encr_hw_algs[i], esp_cnt); 2492 } 2493 if (esp_cnt > 0) { 2494 size += sizeof (dl_capability_sub_t) + 2495 sizeof (dl_capab_ipsec_t); 2496 /* 
dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */ 2497 esp_len = (esp_cnt - 1) * sizeof (dl_capab_ipsec_alg_t); 2498 size += esp_len; 2499 } 2500 } 2501 2502 if (size == 0) { 2503 ip1dbg(("ill_capability_ipsec_reset: capabilities exist but " 2504 "there's nothing to reset\n")); 2505 return; 2506 } 2507 2508 mp = allocb(size, BPRI_HI); 2509 if (mp == NULL) { 2510 ip1dbg(("ill_capability_ipsec_reset: unable to allocate " 2511 "request to disable IPSEC Hardware Acceleration\n")); 2512 return; 2513 } 2514 2515 /* 2516 * Clear the capability flags for IPsec HA but retain the ill 2517 * capability structures since it's possible that another thread 2518 * is still referring to them. The structures only get deallocated 2519 * when we destroy the ill. 2520 * 2521 * Various places check the flags to see if the ill is capable of 2522 * hardware acceleration, and by clearing them we ensure that new 2523 * outbound IPsec packets are sent down encrypted. 2524 */ 2525 ill->ill_capabilities &= ~(ILL_CAPAB_AH | ILL_CAPAB_ESP); 2526 2527 /* Fill in DL_CAPAB_IPSEC_AH sub-capability entries */ 2528 if (ah_cnt > 0) { 2529 ill_fill_ipsec_reset(ah_cnt, DL_CAPAB_IPSEC_AH, ah_len, 2530 cap_ah, mp); 2531 ASSERT(mp->b_rptr + size >= mp->b_wptr); 2532 } 2533 2534 /* Fill in DL_CAPAB_IPSEC_ESP sub-capability entries */ 2535 if (esp_cnt > 0) { 2536 ill_fill_ipsec_reset(esp_cnt, DL_CAPAB_IPSEC_ESP, esp_len, 2537 cap_esp, mp); 2538 ASSERT(mp->b_rptr + size >= mp->b_wptr); 2539 } 2540 2541 /* 2542 * At this point we've composed a bunch of sub-capabilities to be 2543 * encapsulated in a DL_CAPABILITY_REQ and later sent downstream 2544 * by the caller. Upon receiving this reset message, the driver 2545 * must stop inbound decryption (by destroying all inbound SAs) 2546 * and let the corresponding packets come in encrypted. 
2547 */ 2548 2549 if (*sc_mp != NULL) 2550 linkb(*sc_mp, mp); 2551 else 2552 *sc_mp = mp; 2553 } 2554 2555 static void 2556 ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp, 2557 boolean_t encapsulated) 2558 { 2559 boolean_t legacy = B_FALSE; 2560 2561 /* 2562 * If this DL_CAPABILITY_ACK came in as a response to our "reset" 2563 * DL_CAPABILITY_REQ, ignore it during this cycle. We've just 2564 * instructed the driver to disable its advertised capabilities, 2565 * so there's no point in accepting any response at this moment. 2566 */ 2567 if (ill->ill_dlpi_capab_state == IDS_UNKNOWN) 2568 return; 2569 2570 /* 2571 * Note that only the following two sub-capabilities may be 2572 * considered as "legacy", since their original definitions 2573 * do not incorporate the dl_mid_t module ID token, and hence 2574 * may require the use of the wrapper sub-capability. 2575 */ 2576 switch (subp->dl_cap) { 2577 case DL_CAPAB_IPSEC_AH: 2578 case DL_CAPAB_IPSEC_ESP: 2579 legacy = B_TRUE; 2580 break; 2581 } 2582 2583 /* 2584 * For legacy sub-capabilities which don't incorporate a queue_t 2585 * pointer in their structures, discard them if we detect that 2586 * there are intermediate modules in between IP and the driver. 
2587 */ 2588 if (!encapsulated && legacy && ill->ill_lmod_cnt > 1) { 2589 ip1dbg(("ill_capability_dispatch: unencapsulated capab type " 2590 "%d discarded; %d module(s) present below IP\n", 2591 subp->dl_cap, ill->ill_lmod_cnt)); 2592 return; 2593 } 2594 2595 switch (subp->dl_cap) { 2596 case DL_CAPAB_IPSEC_AH: 2597 case DL_CAPAB_IPSEC_ESP: 2598 ill_capability_ipsec_ack(ill, mp, subp); 2599 break; 2600 case DL_CAPAB_MDT: 2601 ill_capability_mdt_ack(ill, mp, subp); 2602 break; 2603 case DL_CAPAB_HCKSUM: 2604 ill_capability_hcksum_ack(ill, mp, subp); 2605 break; 2606 case DL_CAPAB_ZEROCOPY: 2607 ill_capability_zerocopy_ack(ill, mp, subp); 2608 break; 2609 case DL_CAPAB_POLL: 2610 if (!SOFT_RINGS_ENABLED()) 2611 ill_capability_dls_ack(ill, mp, subp); 2612 break; 2613 case DL_CAPAB_SOFT_RING: 2614 if (SOFT_RINGS_ENABLED()) 2615 ill_capability_dls_ack(ill, mp, subp); 2616 break; 2617 case DL_CAPAB_LSO: 2618 ill_capability_lso_ack(ill, mp, subp); 2619 break; 2620 default: 2621 ip1dbg(("ill_capability_dispatch: unknown capab type %d\n", 2622 subp->dl_cap)); 2623 } 2624 } 2625 2626 /* 2627 * As part of negotiating polling capability, the driver tells us 2628 * the default (or normal) blanking interval and packet threshold 2629 * (the receive timer fires if blanking interval is reached or 2630 * the packet threshold is reached). 2631 * 2632 * As part of manipulating the polling interval, we always use our 2633 * estimated interval (avg service time * number of packets queued 2634 * on the squeue) but we try to blank for a minimum of 2635 * rr_normal_blank_time * rr_max_blank_ratio. We disable the 2636 * packet threshold during this time. When we are not in polling mode 2637 * we set the blank interval typically lower, rr_normal_pkt_cnt * 2638 * rr_min_blank_ratio but up the packet cnt by a ratio of 2639 * rr_min_pkt_cnt_ratio so that we are still getting chains if 2640 * possible although for a shorter interval. 
2641 */ 2642 #define RR_MAX_BLANK_RATIO 20 2643 #define RR_MIN_BLANK_RATIO 10 2644 #define RR_MAX_PKT_CNT_RATIO 3 2645 #define RR_MIN_PKT_CNT_RATIO 3 2646 2647 /* 2648 * These can be tuned via /etc/system. 2649 */ 2650 int rr_max_blank_ratio = RR_MAX_BLANK_RATIO; 2651 int rr_min_blank_ratio = RR_MIN_BLANK_RATIO; 2652 int rr_max_pkt_cnt_ratio = RR_MAX_PKT_CNT_RATIO; 2653 int rr_min_pkt_cnt_ratio = RR_MIN_PKT_CNT_RATIO; 2654 2655 static mac_resource_handle_t 2656 ill_ring_add(void *arg, mac_resource_t *mrp) 2657 { 2658 ill_t *ill = (ill_t *)arg; 2659 mac_rx_fifo_t *mrfp = (mac_rx_fifo_t *)mrp; 2660 ill_rx_ring_t *rx_ring; 2661 int ip_rx_index; 2662 2663 ASSERT(mrp != NULL); 2664 if (mrp->mr_type != MAC_RX_FIFO) { 2665 return (NULL); 2666 } 2667 ASSERT(ill != NULL); 2668 ASSERT(ill->ill_dls_capab != NULL); 2669 2670 mutex_enter(&ill->ill_lock); 2671 for (ip_rx_index = 0; ip_rx_index < ILL_MAX_RINGS; ip_rx_index++) { 2672 rx_ring = &ill->ill_dls_capab->ill_ring_tbl[ip_rx_index]; 2673 ASSERT(rx_ring != NULL); 2674 2675 if (rx_ring->rr_ring_state == ILL_RING_FREE) { 2676 time_t normal_blank_time = 2677 mrfp->mrf_normal_blank_time; 2678 uint_t normal_pkt_cnt = 2679 mrfp->mrf_normal_pkt_count; 2680 2681 bzero(rx_ring, sizeof (ill_rx_ring_t)); 2682 2683 rx_ring->rr_blank = mrfp->mrf_blank; 2684 rx_ring->rr_handle = mrfp->mrf_arg; 2685 rx_ring->rr_ill = ill; 2686 rx_ring->rr_normal_blank_time = normal_blank_time; 2687 rx_ring->rr_normal_pkt_cnt = normal_pkt_cnt; 2688 2689 rx_ring->rr_max_blank_time = 2690 normal_blank_time * rr_max_blank_ratio; 2691 rx_ring->rr_min_blank_time = 2692 normal_blank_time * rr_min_blank_ratio; 2693 rx_ring->rr_max_pkt_cnt = 2694 normal_pkt_cnt * rr_max_pkt_cnt_ratio; 2695 rx_ring->rr_min_pkt_cnt = 2696 normal_pkt_cnt * rr_min_pkt_cnt_ratio; 2697 2698 rx_ring->rr_ring_state = ILL_RING_INUSE; 2699 mutex_exit(&ill->ill_lock); 2700 2701 DTRACE_PROBE2(ill__ring__add, (void *), ill, 2702 (int), ip_rx_index); 2703 return 
((mac_resource_handle_t)rx_ring); 2704 } 2705 } 2706 2707 /* 2708 * We ran out of ILL_MAX_RINGS worth rx_ring structures. If 2709 * we have devices which can overwhelm this limit, ILL_MAX_RING 2710 * should be made configurable. Meanwhile it cause no panic because 2711 * driver will pass ip_input a NULL handle which will make 2712 * IP allocate the default squeue and Polling mode will not 2713 * be used for this ring. 2714 */ 2715 cmn_err(CE_NOTE, "Reached maximum number of receiving rings (%d) " 2716 "for %s\n", ILL_MAX_RINGS, ill->ill_name); 2717 2718 mutex_exit(&ill->ill_lock); 2719 return (NULL); 2720 } 2721 2722 static boolean_t 2723 ill_capability_dls_init(ill_t *ill) 2724 { 2725 ill_dls_capab_t *ill_dls = ill->ill_dls_capab; 2726 conn_t *connp; 2727 size_t sz; 2728 ip_stack_t *ipst = ill->ill_ipst; 2729 2730 if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) { 2731 if (ill_dls == NULL) { 2732 cmn_err(CE_PANIC, "ill_capability_dls_init: " 2733 "soft_ring enabled for ill=%s (%p) but data " 2734 "structs uninitialized\n", ill->ill_name, 2735 (void *)ill); 2736 } 2737 return (B_TRUE); 2738 } else if (ill->ill_capabilities & ILL_CAPAB_POLL) { 2739 if (ill_dls == NULL) { 2740 cmn_err(CE_PANIC, "ill_capability_dls_init: " 2741 "polling enabled for ill=%s (%p) but data " 2742 "structs uninitialized\n", ill->ill_name, 2743 (void *)ill); 2744 } 2745 return (B_TRUE); 2746 } 2747 2748 if (ill_dls != NULL) { 2749 ill_rx_ring_t *rx_ring = ill_dls->ill_ring_tbl; 2750 /* Soft_Ring or polling is being re-enabled */ 2751 2752 connp = ill_dls->ill_unbind_conn; 2753 ASSERT(rx_ring != NULL); 2754 bzero((void *)ill_dls, sizeof (ill_dls_capab_t)); 2755 bzero((void *)rx_ring, 2756 sizeof (ill_rx_ring_t) * ILL_MAX_RINGS); 2757 ill_dls->ill_ring_tbl = rx_ring; 2758 ill_dls->ill_unbind_conn = connp; 2759 return (B_TRUE); 2760 } 2761 2762 if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP, 2763 ipst->ips_netstack)) == NULL) 2764 return (B_FALSE); 2765 2766 sz = sizeof 
(ill_dls_capab_t); 2767 sz += sizeof (ill_rx_ring_t) * ILL_MAX_RINGS; 2768 2769 ill_dls = kmem_zalloc(sz, KM_NOSLEEP); 2770 if (ill_dls == NULL) { 2771 cmn_err(CE_WARN, "ill_capability_dls_init: could not " 2772 "allocate dls_capab for %s (%p)\n", ill->ill_name, 2773 (void *)ill); 2774 CONN_DEC_REF(connp); 2775 return (B_FALSE); 2776 } 2777 2778 /* Allocate space to hold ring table */ 2779 ill_dls->ill_ring_tbl = (ill_rx_ring_t *)&ill_dls[1]; 2780 ill->ill_dls_capab = ill_dls; 2781 ill_dls->ill_unbind_conn = connp; 2782 return (B_TRUE); 2783 } 2784 2785 /* 2786 * ill_capability_dls_disable: disable soft_ring and/or polling 2787 * capability. Since any of the rings might already be in use, need 2788 * to call ip_squeue_clean_all() which gets behind the squeue to disable 2789 * direct calls if necessary. 2790 */ 2791 static void 2792 ill_capability_dls_disable(ill_t *ill) 2793 { 2794 ill_dls_capab_t *ill_dls = ill->ill_dls_capab; 2795 2796 if (ill->ill_capabilities & ILL_CAPAB_DLS) { 2797 ip_squeue_clean_all(ill); 2798 ill_dls->ill_tx = NULL; 2799 ill_dls->ill_tx_handle = NULL; 2800 ill_dls->ill_dls_change_status = NULL; 2801 ill_dls->ill_dls_bind = NULL; 2802 ill_dls->ill_dls_unbind = NULL; 2803 } 2804 2805 ASSERT(!(ill->ill_capabilities & ILL_CAPAB_DLS)); 2806 } 2807 2808 static void 2809 ill_capability_dls_capable(ill_t *ill, dl_capab_dls_t *idls, 2810 dl_capability_sub_t *isub) 2811 { 2812 uint_t size; 2813 uchar_t *rptr; 2814 dl_capab_dls_t dls, *odls; 2815 ill_dls_capab_t *ill_dls; 2816 mblk_t *nmp = NULL; 2817 dl_capability_req_t *ocap; 2818 uint_t sub_dl_cap = isub->dl_cap; 2819 2820 if (!ill_capability_dls_init(ill)) 2821 return; 2822 ill_dls = ill->ill_dls_capab; 2823 2824 /* Copy locally to get the members aligned */ 2825 bcopy((void *)idls, (void *)&dls, 2826 sizeof (dl_capab_dls_t)); 2827 2828 /* Get the tx function and handle from dld */ 2829 ill_dls->ill_tx = (ip_dld_tx_t)dls.dls_tx; 2830 ill_dls->ill_tx_handle = (void *)dls.dls_tx_handle; 2831 2832 if 
(sub_dl_cap == DL_CAPAB_SOFT_RING) { 2833 ill_dls->ill_dls_change_status = 2834 (ip_dls_chg_soft_ring_t)dls.dls_ring_change_status; 2835 ill_dls->ill_dls_bind = (ip_dls_bind_t)dls.dls_ring_bind; 2836 ill_dls->ill_dls_unbind = 2837 (ip_dls_unbind_t)dls.dls_ring_unbind; 2838 ill_dls->ill_dls_soft_ring_cnt = ip_soft_rings_cnt; 2839 } 2840 2841 size = sizeof (dl_capability_req_t) + sizeof (dl_capability_sub_t) + 2842 isub->dl_length; 2843 2844 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 2845 cmn_err(CE_WARN, "ill_capability_dls_capable: could " 2846 "not allocate memory for CAPAB_REQ for %s (%p)\n", 2847 ill->ill_name, (void *)ill); 2848 return; 2849 } 2850 2851 /* initialize dl_capability_req_t */ 2852 rptr = nmp->b_rptr; 2853 ocap = (dl_capability_req_t *)rptr; 2854 ocap->dl_sub_offset = sizeof (dl_capability_req_t); 2855 ocap->dl_sub_length = sizeof (dl_capability_sub_t) + isub->dl_length; 2856 rptr += sizeof (dl_capability_req_t); 2857 2858 /* initialize dl_capability_sub_t */ 2859 bcopy(isub, rptr, sizeof (*isub)); 2860 rptr += sizeof (*isub); 2861 2862 odls = (dl_capab_dls_t *)rptr; 2863 rptr += sizeof (dl_capab_dls_t); 2864 2865 /* initialize dl_capab_dls_t to be sent down */ 2866 dls.dls_rx_handle = (uintptr_t)ill; 2867 dls.dls_rx = (uintptr_t)ip_input; 2868 dls.dls_ring_add = (uintptr_t)ill_ring_add; 2869 2870 if (sub_dl_cap == DL_CAPAB_SOFT_RING) { 2871 dls.dls_ring_cnt = ip_soft_rings_cnt; 2872 dls.dls_ring_assign = (uintptr_t)ip_soft_ring_assignment; 2873 dls.dls_flags = SOFT_RING_ENABLE; 2874 } else { 2875 dls.dls_flags = POLL_ENABLE; 2876 ip1dbg(("ill_capability_dls_capable: asking interface %s " 2877 "to enable polling\n", ill->ill_name)); 2878 } 2879 bcopy((void *)&dls, (void *)odls, 2880 sizeof (dl_capab_dls_t)); 2881 ASSERT(nmp->b_wptr == (nmp->b_rptr + size)); 2882 /* 2883 * nmp points to a DL_CAPABILITY_REQ message to 2884 * enable either soft_ring or polling 2885 */ 2886 ill_dlpi_send(ill, nmp); 2887 } 2888 2889 static void 2890 
ill_capability_dls_reset(ill_t *ill, mblk_t **sc_mp) 2891 { 2892 mblk_t *mp; 2893 dl_capab_dls_t *idls; 2894 dl_capability_sub_t *dl_subcap; 2895 int size; 2896 2897 if (!(ill->ill_capabilities & ILL_CAPAB_DLS)) 2898 return; 2899 2900 ASSERT(ill->ill_dls_capab != NULL); 2901 2902 size = sizeof (*dl_subcap) + sizeof (*idls); 2903 2904 mp = allocb(size, BPRI_HI); 2905 if (mp == NULL) { 2906 ip1dbg(("ill_capability_dls_reset: unable to allocate " 2907 "request to disable soft_ring\n")); 2908 return; 2909 } 2910 2911 mp->b_wptr = mp->b_rptr + size; 2912 2913 dl_subcap = (dl_capability_sub_t *)mp->b_rptr; 2914 dl_subcap->dl_length = sizeof (*idls); 2915 if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) 2916 dl_subcap->dl_cap = DL_CAPAB_SOFT_RING; 2917 else 2918 dl_subcap->dl_cap = DL_CAPAB_POLL; 2919 2920 idls = (dl_capab_dls_t *)(dl_subcap + 1); 2921 if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) 2922 idls->dls_flags = SOFT_RING_DISABLE; 2923 else 2924 idls->dls_flags = POLL_DISABLE; 2925 2926 if (*sc_mp != NULL) 2927 linkb(*sc_mp, mp); 2928 else 2929 *sc_mp = mp; 2930 } 2931 2932 /* 2933 * Process a soft_ring/poll capability negotiation ack received 2934 * from a DLS Provider.isub must point to the sub-capability 2935 * (DL_CAPAB_SOFT_RING/DL_CAPAB_POLL) of a DL_CAPABILITY_ACK message. 2936 */ 2937 static void 2938 ill_capability_dls_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 2939 { 2940 dl_capab_dls_t *idls; 2941 uint_t sub_dl_cap = isub->dl_cap; 2942 uint8_t *capend; 2943 2944 ASSERT(sub_dl_cap == DL_CAPAB_SOFT_RING || 2945 sub_dl_cap == DL_CAPAB_POLL); 2946 2947 if (ill->ill_isv6) 2948 return; 2949 2950 /* 2951 * Note: range checks here are not absolutely sufficient to 2952 * make us robust against malformed messages sent by drivers; 2953 * this is in keeping with the rest of IP's dlpi handling. 
2954 * (Remember, it's coming from something else in the kernel 2955 * address space) 2956 */ 2957 capend = (uint8_t *)(isub + 1) + isub->dl_length; 2958 if (capend > mp->b_wptr) { 2959 cmn_err(CE_WARN, "ill_capability_dls_ack: " 2960 "malformed sub-capability too long for mblk"); 2961 return; 2962 } 2963 2964 /* 2965 * There are two types of acks we process here: 2966 * 1. acks in reply to a (first form) generic capability req 2967 * (dls_flag will be set to SOFT_RING_CAPABLE or POLL_CAPABLE) 2968 * 2. acks in reply to a SOFT_RING_ENABLE or POLL_ENABLE 2969 * capability req. 2970 */ 2971 idls = (dl_capab_dls_t *)(isub + 1); 2972 2973 if (!dlcapabcheckqid(&idls->dls_mid, ill->ill_lmod_rq)) { 2974 ip1dbg(("ill_capability_dls_ack: mid token for dls " 2975 "capability isn't as expected; pass-thru " 2976 "module(s) detected, discarding capability\n")); 2977 if (ill->ill_capabilities & ILL_CAPAB_DLS) { 2978 /* 2979 * This is a capability renegotitation case. 2980 * The interface better be unusable at this 2981 * point other wise bad things will happen 2982 * if we disable direct calls on a running 2983 * and up interface. 2984 */ 2985 ill_capability_dls_disable(ill); 2986 } 2987 return; 2988 } 2989 2990 switch (idls->dls_flags) { 2991 default: 2992 /* Disable if unknown flag */ 2993 case SOFT_RING_DISABLE: 2994 case POLL_DISABLE: 2995 ill_capability_dls_disable(ill); 2996 break; 2997 case SOFT_RING_CAPABLE: 2998 case POLL_CAPABLE: 2999 /* 3000 * If the capability was already enabled, its safe 3001 * to disable it first to get rid of stale information 3002 * and then start enabling it again. 
3003 */ 3004 ill_capability_dls_disable(ill); 3005 ill_capability_dls_capable(ill, idls, isub); 3006 break; 3007 case SOFT_RING_ENABLE: 3008 case POLL_ENABLE: 3009 mutex_enter(&ill->ill_lock); 3010 if (sub_dl_cap == DL_CAPAB_SOFT_RING && 3011 !(ill->ill_capabilities & ILL_CAPAB_SOFT_RING)) { 3012 ASSERT(ill->ill_dls_capab != NULL); 3013 ill->ill_capabilities |= ILL_CAPAB_SOFT_RING; 3014 } 3015 if (sub_dl_cap == DL_CAPAB_POLL && 3016 !(ill->ill_capabilities & ILL_CAPAB_POLL)) { 3017 ASSERT(ill->ill_dls_capab != NULL); 3018 ill->ill_capabilities |= ILL_CAPAB_POLL; 3019 ip1dbg(("ill_capability_dls_ack: interface %s " 3020 "has enabled polling\n", ill->ill_name)); 3021 } 3022 mutex_exit(&ill->ill_lock); 3023 break; 3024 } 3025 } 3026 3027 /* 3028 * Process a hardware checksum offload capability negotiation ack received 3029 * from a DLS Provider.isub must point to the sub-capability (DL_CAPAB_HCKSUM) 3030 * of a DL_CAPABILITY_ACK message. 3031 */ 3032 static void 3033 ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 3034 { 3035 dl_capability_req_t *ocap; 3036 dl_capab_hcksum_t *ihck, *ohck; 3037 ill_hcksum_capab_t **ill_hcksum; 3038 mblk_t *nmp = NULL; 3039 uint_t sub_dl_cap = isub->dl_cap; 3040 uint8_t *capend; 3041 3042 ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM); 3043 3044 ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab; 3045 3046 /* 3047 * Note: range checks here are not absolutely sufficient to 3048 * make us robust against malformed messages sent by drivers; 3049 * this is in keeping with the rest of IP's dlpi handling. 3050 * (Remember, it's coming from something else in the kernel 3051 * address space) 3052 */ 3053 capend = (uint8_t *)(isub + 1) + isub->dl_length; 3054 if (capend > mp->b_wptr) { 3055 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 3056 "malformed sub-capability too long for mblk"); 3057 return; 3058 } 3059 3060 /* 3061 * There are two types of acks we process here: 3062 * 1. 
acks in reply to a (first form) generic capability req 3063 * (no ENABLE flag set) 3064 * 2. acks in reply to a ENABLE capability req. 3065 * (ENABLE flag set) 3066 */ 3067 ihck = (dl_capab_hcksum_t *)(isub + 1); 3068 3069 if (ihck->hcksum_version != HCKSUM_VERSION_1) { 3070 cmn_err(CE_CONT, "ill_capability_hcksum_ack: " 3071 "unsupported hardware checksum " 3072 "sub-capability (version %d, expected %d)", 3073 ihck->hcksum_version, HCKSUM_VERSION_1); 3074 return; 3075 } 3076 3077 if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) { 3078 ip1dbg(("ill_capability_hcksum_ack: mid token for hardware " 3079 "checksum capability isn't as expected; pass-thru " 3080 "module(s) detected, discarding capability\n")); 3081 return; 3082 } 3083 3084 #define CURR_HCKSUM_CAPAB \ 3085 (HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 | \ 3086 HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM) 3087 3088 if ((ihck->hcksum_txflags & HCKSUM_ENABLE) && 3089 (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) { 3090 /* do ENABLE processing */ 3091 if (*ill_hcksum == NULL) { 3092 *ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t), 3093 KM_NOSLEEP); 3094 3095 if (*ill_hcksum == NULL) { 3096 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 3097 "could not enable hcksum version %d " 3098 "for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION, 3099 ill->ill_name); 3100 return; 3101 } 3102 } 3103 3104 (*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version; 3105 (*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags; 3106 ill->ill_capabilities |= ILL_CAPAB_HCKSUM; 3107 ip1dbg(("ill_capability_hcksum_ack: interface %s " 3108 "has enabled hardware checksumming\n ", 3109 ill->ill_name)); 3110 } else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) { 3111 /* 3112 * Enabling hardware checksum offload 3113 * Currently IP supports {TCP,UDP}/IPv4 3114 * partial and full cksum offload and 3115 * IPv4 header checksum offload. 
3116 * Allocate new mblk which will 3117 * contain a new capability request 3118 * to enable hardware checksum offload. 3119 */ 3120 uint_t size; 3121 uchar_t *rptr; 3122 3123 size = sizeof (dl_capability_req_t) + 3124 sizeof (dl_capability_sub_t) + isub->dl_length; 3125 3126 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 3127 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 3128 "could not enable hardware cksum for %s (ENOMEM)\n", 3129 ill->ill_name); 3130 return; 3131 } 3132 3133 rptr = nmp->b_rptr; 3134 /* initialize dl_capability_req_t */ 3135 ocap = (dl_capability_req_t *)nmp->b_rptr; 3136 ocap->dl_sub_offset = 3137 sizeof (dl_capability_req_t); 3138 ocap->dl_sub_length = 3139 sizeof (dl_capability_sub_t) + 3140 isub->dl_length; 3141 nmp->b_rptr += sizeof (dl_capability_req_t); 3142 3143 /* initialize dl_capability_sub_t */ 3144 bcopy(isub, nmp->b_rptr, sizeof (*isub)); 3145 nmp->b_rptr += sizeof (*isub); 3146 3147 /* initialize dl_capab_hcksum_t */ 3148 ohck = (dl_capab_hcksum_t *)nmp->b_rptr; 3149 bcopy(ihck, ohck, sizeof (*ihck)); 3150 3151 nmp->b_rptr = rptr; 3152 ASSERT(nmp->b_wptr == (nmp->b_rptr + size)); 3153 3154 /* Set ENABLE flag */ 3155 ohck->hcksum_txflags &= CURR_HCKSUM_CAPAB; 3156 ohck->hcksum_txflags |= HCKSUM_ENABLE; 3157 3158 /* 3159 * nmp points to a DL_CAPABILITY_REQ message to enable 3160 * hardware checksum acceleration. 
3161 */ 3162 ill_dlpi_send(ill, nmp); 3163 } else { 3164 ip1dbg(("ill_capability_hcksum_ack: interface %s has " 3165 "advertised %x hardware checksum capability flags\n", 3166 ill->ill_name, ihck->hcksum_txflags)); 3167 } 3168 } 3169 3170 static void 3171 ill_capability_hcksum_reset(ill_t *ill, mblk_t **sc_mp) 3172 { 3173 mblk_t *mp; 3174 dl_capab_hcksum_t *hck_subcap; 3175 dl_capability_sub_t *dl_subcap; 3176 int size; 3177 3178 if (!ILL_HCKSUM_CAPABLE(ill)) 3179 return; 3180 3181 ASSERT(ill->ill_hcksum_capab != NULL); 3182 /* 3183 * Clear the capability flag for hardware checksum offload but 3184 * retain the ill_hcksum_capab structure since it's possible that 3185 * another thread is still referring to it. The structure only 3186 * gets deallocated when we destroy the ill. 3187 */ 3188 ill->ill_capabilities &= ~ILL_CAPAB_HCKSUM; 3189 3190 size = sizeof (*dl_subcap) + sizeof (*hck_subcap); 3191 3192 mp = allocb(size, BPRI_HI); 3193 if (mp == NULL) { 3194 ip1dbg(("ill_capability_hcksum_reset: unable to allocate " 3195 "request to disable hardware checksum offload\n")); 3196 return; 3197 } 3198 3199 mp->b_wptr = mp->b_rptr + size; 3200 3201 dl_subcap = (dl_capability_sub_t *)mp->b_rptr; 3202 dl_subcap->dl_cap = DL_CAPAB_HCKSUM; 3203 dl_subcap->dl_length = sizeof (*hck_subcap); 3204 3205 hck_subcap = (dl_capab_hcksum_t *)(dl_subcap + 1); 3206 hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version; 3207 hck_subcap->hcksum_txflags = 0; 3208 3209 if (*sc_mp != NULL) 3210 linkb(*sc_mp, mp); 3211 else 3212 *sc_mp = mp; 3213 } 3214 3215 static void 3216 ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 3217 { 3218 mblk_t *nmp = NULL; 3219 dl_capability_req_t *oc; 3220 dl_capab_zerocopy_t *zc_ic, *zc_oc; 3221 ill_zerocopy_capab_t **ill_zerocopy_capab; 3222 uint_t sub_dl_cap = isub->dl_cap; 3223 uint8_t *capend; 3224 3225 ASSERT(sub_dl_cap == DL_CAPAB_ZEROCOPY); 3226 3227 ill_zerocopy_capab = (ill_zerocopy_capab_t 
**)&ill->ill_zerocopy_capab; 3228 3229 /* 3230 * Note: range checks here are not absolutely sufficient to 3231 * make us robust against malformed messages sent by drivers; 3232 * this is in keeping with the rest of IP's dlpi handling. 3233 * (Remember, it's coming from something else in the kernel 3234 * address space) 3235 */ 3236 capend = (uint8_t *)(isub + 1) + isub->dl_length; 3237 if (capend > mp->b_wptr) { 3238 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 3239 "malformed sub-capability too long for mblk"); 3240 return; 3241 } 3242 3243 zc_ic = (dl_capab_zerocopy_t *)(isub + 1); 3244 if (zc_ic->zerocopy_version != ZEROCOPY_VERSION_1) { 3245 cmn_err(CE_CONT, "ill_capability_zerocopy_ack: " 3246 "unsupported ZEROCOPY sub-capability (version %d, " 3247 "expected %d)", zc_ic->zerocopy_version, 3248 ZEROCOPY_VERSION_1); 3249 return; 3250 } 3251 3252 if (!dlcapabcheckqid(&zc_ic->zerocopy_mid, ill->ill_lmod_rq)) { 3253 ip1dbg(("ill_capability_zerocopy_ack: mid token for zerocopy " 3254 "capability isn't as expected; pass-thru module(s) " 3255 "detected, discarding capability\n")); 3256 return; 3257 } 3258 3259 if ((zc_ic->zerocopy_flags & DL_CAPAB_VMSAFE_MEM) != 0) { 3260 if (*ill_zerocopy_capab == NULL) { 3261 *ill_zerocopy_capab = 3262 kmem_zalloc(sizeof (ill_zerocopy_capab_t), 3263 KM_NOSLEEP); 3264 3265 if (*ill_zerocopy_capab == NULL) { 3266 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 3267 "could not enable Zero-copy version %d " 3268 "for %s (ENOMEM)\n", ZEROCOPY_VERSION_1, 3269 ill->ill_name); 3270 return; 3271 } 3272 } 3273 3274 ip1dbg(("ill_capability_zerocopy_ack: interface %s " 3275 "supports Zero-copy version %d\n", ill->ill_name, 3276 ZEROCOPY_VERSION_1)); 3277 3278 (*ill_zerocopy_capab)->ill_zerocopy_version = 3279 zc_ic->zerocopy_version; 3280 (*ill_zerocopy_capab)->ill_zerocopy_flags = 3281 zc_ic->zerocopy_flags; 3282 3283 ill->ill_capabilities |= ILL_CAPAB_ZEROCOPY; 3284 } else { 3285 uint_t size; 3286 uchar_t *rptr; 3287 3288 size = sizeof 
(dl_capability_req_t) + 3289 sizeof (dl_capability_sub_t) + 3290 sizeof (dl_capab_zerocopy_t); 3291 3292 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 3293 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 3294 "could not enable zerocopy for %s (ENOMEM)\n", 3295 ill->ill_name); 3296 return; 3297 } 3298 3299 rptr = nmp->b_rptr; 3300 /* initialize dl_capability_req_t */ 3301 oc = (dl_capability_req_t *)rptr; 3302 oc->dl_sub_offset = sizeof (dl_capability_req_t); 3303 oc->dl_sub_length = sizeof (dl_capability_sub_t) + 3304 sizeof (dl_capab_zerocopy_t); 3305 rptr += sizeof (dl_capability_req_t); 3306 3307 /* initialize dl_capability_sub_t */ 3308 bcopy(isub, rptr, sizeof (*isub)); 3309 rptr += sizeof (*isub); 3310 3311 /* initialize dl_capab_zerocopy_t */ 3312 zc_oc = (dl_capab_zerocopy_t *)rptr; 3313 *zc_oc = *zc_ic; 3314 3315 ip1dbg(("ill_capability_zerocopy_ack: asking interface %s " 3316 "to enable zero-copy version %d\n", ill->ill_name, 3317 ZEROCOPY_VERSION_1)); 3318 3319 /* set VMSAFE_MEM flag */ 3320 zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM; 3321 3322 /* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */ 3323 ill_dlpi_send(ill, nmp); 3324 } 3325 } 3326 3327 static void 3328 ill_capability_zerocopy_reset(ill_t *ill, mblk_t **sc_mp) 3329 { 3330 mblk_t *mp; 3331 dl_capab_zerocopy_t *zerocopy_subcap; 3332 dl_capability_sub_t *dl_subcap; 3333 int size; 3334 3335 if (!(ill->ill_capabilities & ILL_CAPAB_ZEROCOPY)) 3336 return; 3337 3338 ASSERT(ill->ill_zerocopy_capab != NULL); 3339 /* 3340 * Clear the capability flag for Zero-copy but retain the 3341 * ill_zerocopy_capab structure since it's possible that another 3342 * thread is still referring to it. The structure only gets 3343 * deallocated when we destroy the ill. 
3344 */ 3345 ill->ill_capabilities &= ~ILL_CAPAB_ZEROCOPY; 3346 3347 size = sizeof (*dl_subcap) + sizeof (*zerocopy_subcap); 3348 3349 mp = allocb(size, BPRI_HI); 3350 if (mp == NULL) { 3351 ip1dbg(("ill_capability_zerocopy_reset: unable to allocate " 3352 "request to disable Zero-copy\n")); 3353 return; 3354 } 3355 3356 mp->b_wptr = mp->b_rptr + size; 3357 3358 dl_subcap = (dl_capability_sub_t *)mp->b_rptr; 3359 dl_subcap->dl_cap = DL_CAPAB_ZEROCOPY; 3360 dl_subcap->dl_length = sizeof (*zerocopy_subcap); 3361 3362 zerocopy_subcap = (dl_capab_zerocopy_t *)(dl_subcap + 1); 3363 zerocopy_subcap->zerocopy_version = 3364 ill->ill_zerocopy_capab->ill_zerocopy_version; 3365 zerocopy_subcap->zerocopy_flags = 0; 3366 3367 if (*sc_mp != NULL) 3368 linkb(*sc_mp, mp); 3369 else 3370 *sc_mp = mp; 3371 } 3372 3373 /* 3374 * Process Large Segment Offload capability negotiation ack received from a 3375 * DLS Provider. isub must point to the sub-capability (DL_CAPAB_LSO) of a 3376 * DL_CAPABILITY_ACK message. 3377 */ 3378 static void 3379 ill_capability_lso_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 3380 { 3381 mblk_t *nmp = NULL; 3382 dl_capability_req_t *oc; 3383 dl_capab_lso_t *lso_ic, *lso_oc; 3384 ill_lso_capab_t **ill_lso_capab; 3385 uint_t sub_dl_cap = isub->dl_cap; 3386 uint8_t *capend; 3387 3388 ASSERT(sub_dl_cap == DL_CAPAB_LSO); 3389 3390 ill_lso_capab = (ill_lso_capab_t **)&ill->ill_lso_capab; 3391 3392 /* 3393 * Note: range checks here are not absolutely sufficient to 3394 * make us robust against malformed messages sent by drivers; 3395 * this is in keeping with the rest of IP's dlpi handling. 
3396 * (Remember, it's coming from something else in the kernel 3397 * address space) 3398 */ 3399 capend = (uint8_t *)(isub + 1) + isub->dl_length; 3400 if (capend > mp->b_wptr) { 3401 cmn_err(CE_WARN, "ill_capability_lso_ack: " 3402 "malformed sub-capability too long for mblk"); 3403 return; 3404 } 3405 3406 lso_ic = (dl_capab_lso_t *)(isub + 1); 3407 3408 if (lso_ic->lso_version != LSO_VERSION_1) { 3409 cmn_err(CE_CONT, "ill_capability_lso_ack: " 3410 "unsupported LSO sub-capability (version %d, expected %d)", 3411 lso_ic->lso_version, LSO_VERSION_1); 3412 return; 3413 } 3414 3415 if (!dlcapabcheckqid(&lso_ic->lso_mid, ill->ill_lmod_rq)) { 3416 ip1dbg(("ill_capability_lso_ack: mid token for LSO " 3417 "capability isn't as expected; pass-thru module(s) " 3418 "detected, discarding capability\n")); 3419 return; 3420 } 3421 3422 if ((lso_ic->lso_flags & LSO_TX_ENABLE) && 3423 (lso_ic->lso_flags & LSO_TX_BASIC_TCP_IPV4)) { 3424 if (*ill_lso_capab == NULL) { 3425 *ill_lso_capab = kmem_zalloc(sizeof (ill_lso_capab_t), 3426 KM_NOSLEEP); 3427 3428 if (*ill_lso_capab == NULL) { 3429 cmn_err(CE_WARN, "ill_capability_lso_ack: " 3430 "could not enable LSO version %d " 3431 "for %s (ENOMEM)\n", LSO_VERSION_1, 3432 ill->ill_name); 3433 return; 3434 } 3435 } 3436 3437 (*ill_lso_capab)->ill_lso_version = lso_ic->lso_version; 3438 (*ill_lso_capab)->ill_lso_flags = lso_ic->lso_flags; 3439 (*ill_lso_capab)->ill_lso_max = lso_ic->lso_max; 3440 ill->ill_capabilities |= ILL_CAPAB_LSO; 3441 3442 ip1dbg(("ill_capability_lso_ack: interface %s " 3443 "has enabled LSO\n ", ill->ill_name)); 3444 } else if (lso_ic->lso_flags & LSO_TX_BASIC_TCP_IPV4) { 3445 uint_t size; 3446 uchar_t *rptr; 3447 3448 size = sizeof (dl_capability_req_t) + 3449 sizeof (dl_capability_sub_t) + sizeof (dl_capab_lso_t); 3450 3451 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 3452 cmn_err(CE_WARN, "ill_capability_lso_ack: " 3453 "could not enable LSO for %s (ENOMEM)\n", 3454 ill->ill_name); 3455 
return; 3456 } 3457 3458 rptr = nmp->b_rptr; 3459 /* initialize dl_capability_req_t */ 3460 oc = (dl_capability_req_t *)nmp->b_rptr; 3461 oc->dl_sub_offset = sizeof (dl_capability_req_t); 3462 oc->dl_sub_length = sizeof (dl_capability_sub_t) + 3463 sizeof (dl_capab_lso_t); 3464 nmp->b_rptr += sizeof (dl_capability_req_t); 3465 3466 /* initialize dl_capability_sub_t */ 3467 bcopy(isub, nmp->b_rptr, sizeof (*isub)); 3468 nmp->b_rptr += sizeof (*isub); 3469 3470 /* initialize dl_capab_lso_t */ 3471 lso_oc = (dl_capab_lso_t *)nmp->b_rptr; 3472 bcopy(lso_ic, lso_oc, sizeof (*lso_ic)); 3473 3474 nmp->b_rptr = rptr; 3475 ASSERT(nmp->b_wptr == (nmp->b_rptr + size)); 3476 3477 /* set ENABLE flag */ 3478 lso_oc->lso_flags |= LSO_TX_ENABLE; 3479 3480 /* nmp points to a DL_CAPABILITY_REQ message to enable LSO */ 3481 ill_dlpi_send(ill, nmp); 3482 } else { 3483 ip1dbg(("ill_capability_lso_ack: interface %s has " 3484 "advertised %x LSO capability flags\n", 3485 ill->ill_name, lso_ic->lso_flags)); 3486 } 3487 } 3488 3489 3490 static void 3491 ill_capability_lso_reset(ill_t *ill, mblk_t **sc_mp) 3492 { 3493 mblk_t *mp; 3494 dl_capab_lso_t *lso_subcap; 3495 dl_capability_sub_t *dl_subcap; 3496 int size; 3497 3498 if (!(ill->ill_capabilities & ILL_CAPAB_LSO)) 3499 return; 3500 3501 ASSERT(ill->ill_lso_capab != NULL); 3502 /* 3503 * Clear the capability flag for LSO but retain the 3504 * ill_lso_capab structure since it's possible that another 3505 * thread is still referring to it. The structure only gets 3506 * deallocated when we destroy the ill. 
3507 */ 3508 ill->ill_capabilities &= ~ILL_CAPAB_LSO; 3509 3510 size = sizeof (*dl_subcap) + sizeof (*lso_subcap); 3511 3512 mp = allocb(size, BPRI_HI); 3513 if (mp == NULL) { 3514 ip1dbg(("ill_capability_lso_reset: unable to allocate " 3515 "request to disable LSO\n")); 3516 return; 3517 } 3518 3519 mp->b_wptr = mp->b_rptr + size; 3520 3521 dl_subcap = (dl_capability_sub_t *)mp->b_rptr; 3522 dl_subcap->dl_cap = DL_CAPAB_LSO; 3523 dl_subcap->dl_length = sizeof (*lso_subcap); 3524 3525 lso_subcap = (dl_capab_lso_t *)(dl_subcap + 1); 3526 lso_subcap->lso_version = ill->ill_lso_capab->ill_lso_version; 3527 lso_subcap->lso_flags = 0; 3528 3529 if (*sc_mp != NULL) 3530 linkb(*sc_mp, mp); 3531 else 3532 *sc_mp = mp; 3533 } 3534 3535 /* 3536 * Consume a new-style hardware capabilities negotiation ack. 3537 * Called from ip_rput_dlpi_writer(). 3538 */ 3539 void 3540 ill_capability_ack(ill_t *ill, mblk_t *mp) 3541 { 3542 dl_capability_ack_t *capp; 3543 dl_capability_sub_t *subp, *endp; 3544 3545 if (ill->ill_dlpi_capab_state == IDS_INPROGRESS) 3546 ill->ill_dlpi_capab_state = IDS_OK; 3547 3548 capp = (dl_capability_ack_t *)mp->b_rptr; 3549 3550 if (capp->dl_sub_length == 0) 3551 /* no new-style capabilities */ 3552 return; 3553 3554 /* make sure the driver supplied correct dl_sub_length */ 3555 if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) { 3556 ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, " 3557 "invalid dl_sub_length (%d)\n", capp->dl_sub_length)); 3558 return; 3559 } 3560 3561 #define SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset)) 3562 /* 3563 * There are sub-capabilities. Process the ones we know about. 3564 * Loop until we don't have room for another sub-cap header.. 
3565 */ 3566 for (subp = SC(capp, capp->dl_sub_offset), 3567 endp = SC(subp, capp->dl_sub_length - sizeof (*subp)); 3568 subp <= endp; 3569 subp = SC(subp, sizeof (dl_capability_sub_t) + subp->dl_length)) { 3570 3571 switch (subp->dl_cap) { 3572 case DL_CAPAB_ID_WRAPPER: 3573 ill_capability_id_ack(ill, mp, subp); 3574 break; 3575 default: 3576 ill_capability_dispatch(ill, mp, subp, B_FALSE); 3577 break; 3578 } 3579 } 3580 #undef SC 3581 } 3582 3583 /* 3584 * This routine is called to scan the fragmentation reassembly table for 3585 * the specified ILL for any packets that are starting to smell. 3586 * dead_interval is the maximum time in seconds that will be tolerated. It 3587 * will either be the value specified in ip_g_frag_timeout, or zero if the 3588 * ILL is shutting down and it is time to blow everything off. 3589 * 3590 * It returns the number of seconds (as a time_t) that the next frag timer 3591 * should be scheduled for, 0 meaning that the timer doesn't need to be 3592 * re-started. Note that the method of calculating next_timeout isn't 3593 * entirely accurate since time will flow between the time we grab 3594 * current_time and the time we schedule the next timeout. This isn't a 3595 * big problem since this is the timer for sending an ICMP reassembly time 3596 * exceeded messages, and it doesn't have to be exactly accurate. 3597 * 3598 * This function is 3599 * sometimes called as writer, although this is not required. 
3600 */ 3601 time_t 3602 ill_frag_timeout(ill_t *ill, time_t dead_interval) 3603 { 3604 ipfb_t *ipfb; 3605 ipfb_t *endp; 3606 ipf_t *ipf; 3607 ipf_t *ipfnext; 3608 mblk_t *mp; 3609 time_t current_time = gethrestime_sec(); 3610 time_t next_timeout = 0; 3611 uint32_t hdr_length; 3612 mblk_t *send_icmp_head; 3613 mblk_t *send_icmp_head_v6; 3614 zoneid_t zoneid; 3615 ip_stack_t *ipst = ill->ill_ipst; 3616 3617 ipfb = ill->ill_frag_hash_tbl; 3618 if (ipfb == NULL) 3619 return (B_FALSE); 3620 endp = &ipfb[ILL_FRAG_HASH_TBL_COUNT]; 3621 /* Walk the frag hash table. */ 3622 for (; ipfb < endp; ipfb++) { 3623 send_icmp_head = NULL; 3624 send_icmp_head_v6 = NULL; 3625 mutex_enter(&ipfb->ipfb_lock); 3626 while ((ipf = ipfb->ipfb_ipf) != 0) { 3627 time_t frag_time = current_time - ipf->ipf_timestamp; 3628 time_t frag_timeout; 3629 3630 if (frag_time < dead_interval) { 3631 /* 3632 * There are some outstanding fragments 3633 * that will timeout later. Make note of 3634 * the time so that we can reschedule the 3635 * next timeout appropriately. 3636 */ 3637 frag_timeout = dead_interval - frag_time; 3638 if (next_timeout == 0 || 3639 frag_timeout < next_timeout) { 3640 next_timeout = frag_timeout; 3641 } 3642 break; 3643 } 3644 /* Time's up. Get it out of here. */ 3645 hdr_length = ipf->ipf_nf_hdr_len; 3646 ipfnext = ipf->ipf_hash_next; 3647 if (ipfnext) 3648 ipfnext->ipf_ptphn = ipf->ipf_ptphn; 3649 *ipf->ipf_ptphn = ipfnext; 3650 mp = ipf->ipf_mp->b_cont; 3651 for (; mp; mp = mp->b_cont) { 3652 /* Extra points for neatness. */ 3653 IP_REASS_SET_START(mp, 0); 3654 IP_REASS_SET_END(mp, 0); 3655 } 3656 mp = ipf->ipf_mp->b_cont; 3657 ill->ill_frag_count -= ipf->ipf_count; 3658 ASSERT(ipfb->ipfb_count >= ipf->ipf_count); 3659 ipfb->ipfb_count -= ipf->ipf_count; 3660 ASSERT(ipfb->ipfb_frag_pkts > 0); 3661 ipfb->ipfb_frag_pkts--; 3662 /* 3663 * We do not send any icmp message from here because 3664 * we currently are holding the ipfb_lock for this 3665 * hash chain. 
If we try and send any icmp messages 3666 * from here we may end up via a put back into ip 3667 * trying to get the same lock, causing a recursive 3668 * mutex panic. Instead we build a list and send all 3669 * the icmp messages after we have dropped the lock. 3670 */ 3671 if (ill->ill_isv6) { 3672 if (hdr_length != 0) { 3673 mp->b_next = send_icmp_head_v6; 3674 send_icmp_head_v6 = mp; 3675 } else { 3676 freemsg(mp); 3677 } 3678 } else { 3679 if (hdr_length != 0) { 3680 mp->b_next = send_icmp_head; 3681 send_icmp_head = mp; 3682 } else { 3683 freemsg(mp); 3684 } 3685 } 3686 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); 3687 freeb(ipf->ipf_mp); 3688 } 3689 mutex_exit(&ipfb->ipfb_lock); 3690 /* 3691 * Now need to send any icmp messages that we delayed from 3692 * above. 3693 */ 3694 while (send_icmp_head_v6 != NULL) { 3695 ip6_t *ip6h; 3696 3697 mp = send_icmp_head_v6; 3698 send_icmp_head_v6 = send_icmp_head_v6->b_next; 3699 mp->b_next = NULL; 3700 if (mp->b_datap->db_type == M_CTL) 3701 ip6h = (ip6_t *)mp->b_cont->b_rptr; 3702 else 3703 ip6h = (ip6_t *)mp->b_rptr; 3704 zoneid = ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst, 3705 ill, ipst); 3706 if (zoneid == ALL_ZONES) { 3707 freemsg(mp); 3708 } else { 3709 icmp_time_exceeded_v6(ill->ill_wq, mp, 3710 ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE, 3711 B_FALSE, zoneid, ipst); 3712 } 3713 } 3714 while (send_icmp_head != NULL) { 3715 ipaddr_t dst; 3716 3717 mp = send_icmp_head; 3718 send_icmp_head = send_icmp_head->b_next; 3719 mp->b_next = NULL; 3720 3721 if (mp->b_datap->db_type == M_CTL) 3722 dst = ((ipha_t *)mp->b_cont->b_rptr)->ipha_dst; 3723 else 3724 dst = ((ipha_t *)mp->b_rptr)->ipha_dst; 3725 3726 zoneid = ipif_lookup_addr_zoneid(dst, ill, ipst); 3727 if (zoneid == ALL_ZONES) { 3728 freemsg(mp); 3729 } else { 3730 icmp_time_exceeded(ill->ill_wq, mp, 3731 ICMP_REASSEMBLY_TIME_EXCEEDED, zoneid, 3732 ipst); 3733 } 3734 } 3735 } 3736 /* 3737 * A non-dying ILL will use the return value to decide whether to 3738 * restart 
the frag timer, and for how long. 3739 */ 3740 return (next_timeout); 3741 } 3742 3743 /* 3744 * This routine is called when the approximate count of mblk memory used 3745 * for the specified ILL has exceeded max_count. 3746 */ 3747 void 3748 ill_frag_prune(ill_t *ill, uint_t max_count) 3749 { 3750 ipfb_t *ipfb; 3751 ipf_t *ipf; 3752 size_t count; 3753 3754 /* 3755 * If we are here within ip_min_frag_prune_time msecs remove 3756 * ill_frag_free_num_pkts oldest packets from each bucket and increment 3757 * ill_frag_free_num_pkts. 3758 */ 3759 mutex_enter(&ill->ill_lock); 3760 if (TICK_TO_MSEC(lbolt - ill->ill_last_frag_clean_time) <= 3761 (ip_min_frag_prune_time != 0 ? 3762 ip_min_frag_prune_time : msec_per_tick)) { 3763 3764 ill->ill_frag_free_num_pkts++; 3765 3766 } else { 3767 ill->ill_frag_free_num_pkts = 0; 3768 } 3769 ill->ill_last_frag_clean_time = lbolt; 3770 mutex_exit(&ill->ill_lock); 3771 3772 /* 3773 * free ill_frag_free_num_pkts oldest packets from each bucket. 3774 */ 3775 if (ill->ill_frag_free_num_pkts != 0) { 3776 int ix; 3777 3778 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 3779 ipfb = &ill->ill_frag_hash_tbl[ix]; 3780 mutex_enter(&ipfb->ipfb_lock); 3781 if (ipfb->ipfb_ipf != NULL) { 3782 ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 3783 ill->ill_frag_free_num_pkts); 3784 } 3785 mutex_exit(&ipfb->ipfb_lock); 3786 } 3787 } 3788 /* 3789 * While the reassembly list for this ILL is too big, prune a fragment 3790 * queue by age, oldest first. Note that the per ILL count is 3791 * approximate, while the per frag hash bucket counts are accurate. 
3792 */ 3793 while (ill->ill_frag_count > max_count) { 3794 int ix; 3795 ipfb_t *oipfb = NULL; 3796 uint_t oldest = UINT_MAX; 3797 3798 count = 0; 3799 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 3800 ipfb = &ill->ill_frag_hash_tbl[ix]; 3801 mutex_enter(&ipfb->ipfb_lock); 3802 ipf = ipfb->ipfb_ipf; 3803 if (ipf != NULL && ipf->ipf_gen < oldest) { 3804 oldest = ipf->ipf_gen; 3805 oipfb = ipfb; 3806 } 3807 count += ipfb->ipfb_count; 3808 mutex_exit(&ipfb->ipfb_lock); 3809 } 3810 /* Refresh the per ILL count */ 3811 ill->ill_frag_count = count; 3812 if (oipfb == NULL) { 3813 ill->ill_frag_count = 0; 3814 break; 3815 } 3816 if (count <= max_count) 3817 return; /* Somebody beat us to it, nothing to do */ 3818 mutex_enter(&oipfb->ipfb_lock); 3819 ipf = oipfb->ipfb_ipf; 3820 if (ipf != NULL) { 3821 ill_frag_free_pkts(ill, oipfb, ipf, 1); 3822 } 3823 mutex_exit(&oipfb->ipfb_lock); 3824 } 3825 } 3826 3827 /* 3828 * free 'free_cnt' fragmented packets starting at ipf. 3829 */ 3830 void 3831 ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt) 3832 { 3833 size_t count; 3834 mblk_t *mp; 3835 mblk_t *tmp; 3836 ipf_t **ipfp = ipf->ipf_ptphn; 3837 3838 ASSERT(MUTEX_HELD(&ipfb->ipfb_lock)); 3839 ASSERT(ipfp != NULL); 3840 ASSERT(ipf != NULL); 3841 3842 while (ipf != NULL && free_cnt-- > 0) { 3843 count = ipf->ipf_count; 3844 mp = ipf->ipf_mp; 3845 ipf = ipf->ipf_hash_next; 3846 for (tmp = mp; tmp; tmp = tmp->b_cont) { 3847 IP_REASS_SET_START(tmp, 0); 3848 IP_REASS_SET_END(tmp, 0); 3849 } 3850 ill->ill_frag_count -= count; 3851 ASSERT(ipfb->ipfb_count >= count); 3852 ipfb->ipfb_count -= count; 3853 ASSERT(ipfb->ipfb_frag_pkts > 0); 3854 ipfb->ipfb_frag_pkts--; 3855 freemsg(mp); 3856 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); 3857 } 3858 3859 if (ipf) 3860 ipf->ipf_ptphn = ipfp; 3861 ipfp[0] = ipf; 3862 } 3863 3864 #define ND_FORWARD_WARNING "The <if>:ip*_forwarding ndd variables are " \ 3865 "obsolete and may be removed in a future release of Solaris. 
Use " \ 3866 "ifconfig(1M) to manipulate the forwarding status of an interface." 3867 3868 /* 3869 * For obsolete per-interface forwarding configuration; 3870 * called in response to ND_GET. 3871 */ 3872 /* ARGSUSED */ 3873 static int 3874 nd_ill_forward_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr) 3875 { 3876 ill_t *ill = (ill_t *)cp; 3877 3878 cmn_err(CE_WARN, ND_FORWARD_WARNING); 3879 3880 (void) mi_mpprintf(mp, "%d", (ill->ill_flags & ILLF_ROUTER) != 0); 3881 return (0); 3882 } 3883 3884 /* 3885 * For obsolete per-interface forwarding configuration; 3886 * called in response to ND_SET. 3887 */ 3888 /* ARGSUSED */ 3889 static int 3890 nd_ill_forward_set(queue_t *q, mblk_t *mp, char *valuestr, caddr_t cp, 3891 cred_t *ioc_cr) 3892 { 3893 long value; 3894 int retval; 3895 ip_stack_t *ipst = CONNQ_TO_IPST(q); 3896 3897 cmn_err(CE_WARN, ND_FORWARD_WARNING); 3898 3899 if (ddi_strtol(valuestr, NULL, 10, &value) != 0 || 3900 value < 0 || value > 1) { 3901 return (EINVAL); 3902 } 3903 3904 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3905 retval = ill_forward_set((ill_t *)cp, (value != 0)); 3906 rw_exit(&ipst->ips_ill_g_lock); 3907 return (retval); 3908 } 3909 3910 /* 3911 * Set an ill's ILLF_ROUTER flag appropriately. If the ill is part of an 3912 * IPMP group, make sure all ill's in the group adopt the new policy. Send 3913 * up RTS_IFINFO routing socket messages for each interface whose flags we 3914 * change. 3915 */ 3916 int 3917 ill_forward_set(ill_t *ill, boolean_t enable) 3918 { 3919 ill_group_t *illgrp; 3920 ip_stack_t *ipst = ill->ill_ipst; 3921 3922 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock)); 3923 3924 if ((enable && (ill->ill_flags & ILLF_ROUTER)) || 3925 (!enable && !(ill->ill_flags & ILLF_ROUTER))) 3926 return (0); 3927 3928 if (IS_LOOPBACK(ill)) 3929 return (EINVAL); 3930 3931 /* 3932 * If the ill is in an IPMP group, set the forwarding policy on all 3933 * members of the group to the same value. 
3934 */ 3935 illgrp = ill->ill_group; 3936 if (illgrp != NULL) { 3937 ill_t *tmp_ill; 3938 3939 for (tmp_ill = illgrp->illgrp_ill; tmp_ill != NULL; 3940 tmp_ill = tmp_ill->ill_group_next) { 3941 ip1dbg(("ill_forward_set: %s %s forwarding on %s", 3942 (enable ? "Enabling" : "Disabling"), 3943 (tmp_ill->ill_isv6 ? "IPv6" : "IPv4"), 3944 tmp_ill->ill_name)); 3945 mutex_enter(&tmp_ill->ill_lock); 3946 if (enable) 3947 tmp_ill->ill_flags |= ILLF_ROUTER; 3948 else 3949 tmp_ill->ill_flags &= ~ILLF_ROUTER; 3950 mutex_exit(&tmp_ill->ill_lock); 3951 if (tmp_ill->ill_isv6) 3952 ill_set_nce_router_flags(tmp_ill, enable); 3953 /* Notify routing socket listeners of this change. */ 3954 ip_rts_ifmsg(tmp_ill->ill_ipif); 3955 } 3956 } else { 3957 ip1dbg(("ill_forward_set: %s %s forwarding on %s", 3958 (enable ? "Enabling" : "Disabling"), 3959 (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name)); 3960 mutex_enter(&ill->ill_lock); 3961 if (enable) 3962 ill->ill_flags |= ILLF_ROUTER; 3963 else 3964 ill->ill_flags &= ~ILLF_ROUTER; 3965 mutex_exit(&ill->ill_lock); 3966 if (ill->ill_isv6) 3967 ill_set_nce_router_flags(ill, enable); 3968 /* Notify routing socket listeners of this change. */ 3969 ip_rts_ifmsg(ill->ill_ipif); 3970 } 3971 3972 return (0); 3973 } 3974 3975 /* 3976 * Based on the ILLF_ROUTER flag of an ill, make sure all local nce's for 3977 * addresses assigned to the ill have the NCE_F_ISROUTER flag appropriately 3978 * set or clear. 
3979 */ 3980 static void 3981 ill_set_nce_router_flags(ill_t *ill, boolean_t enable) 3982 { 3983 ipif_t *ipif; 3984 nce_t *nce; 3985 3986 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 3987 nce = ndp_lookup_v6(ill, &ipif->ipif_v6lcl_addr, B_FALSE); 3988 if (nce != NULL) { 3989 mutex_enter(&nce->nce_lock); 3990 if (enable) 3991 nce->nce_flags |= NCE_F_ISROUTER; 3992 else 3993 nce->nce_flags &= ~NCE_F_ISROUTER; 3994 mutex_exit(&nce->nce_lock); 3995 NCE_REFRELE(nce); 3996 } 3997 } 3998 } 3999 4000 /* 4001 * Given an ill with a _valid_ name, add the ip_forwarding ndd variable 4002 * for this ill. Make sure the v6/v4 question has been answered about this 4003 * ill. The creation of this ndd variable is only for backwards compatibility. 4004 * The preferred way to control per-interface IP forwarding is through the 4005 * ILLF_ROUTER interface flag. 4006 */ 4007 static int 4008 ill_set_ndd_name(ill_t *ill) 4009 { 4010 char *suffix; 4011 ip_stack_t *ipst = ill->ill_ipst; 4012 4013 ASSERT(IAM_WRITER_ILL(ill)); 4014 4015 if (ill->ill_isv6) 4016 suffix = ipv6_forward_suffix; 4017 else 4018 suffix = ipv4_forward_suffix; 4019 4020 ill->ill_ndd_name = ill->ill_name + ill->ill_name_length; 4021 bcopy(ill->ill_name, ill->ill_ndd_name, ill->ill_name_length - 1); 4022 /* 4023 * Copies over the '\0'. 4024 * Note that strlen(suffix) is always bounded. 4025 */ 4026 bcopy(suffix, ill->ill_ndd_name + ill->ill_name_length - 1, 4027 strlen(suffix) + 1); 4028 4029 /* 4030 * Use of the nd table requires holding the reader lock. 4031 * Modifying the nd table thru nd_load/nd_unload requires 4032 * the writer lock. 4033 */ 4034 rw_enter(&ipst->ips_ip_g_nd_lock, RW_WRITER); 4035 if (!nd_load(&ipst->ips_ip_g_nd, ill->ill_ndd_name, nd_ill_forward_get, 4036 nd_ill_forward_set, (caddr_t)ill)) { 4037 /* 4038 * If the nd_load failed, it only meant that it could not 4039 * allocate a new bunch of room for further NDD expansion. 
4040 * Because of that, the ill_ndd_name will be set to 0, and 4041 * this interface is at the mercy of the global ip_forwarding 4042 * variable. 4043 */ 4044 rw_exit(&ipst->ips_ip_g_nd_lock); 4045 ill->ill_ndd_name = NULL; 4046 return (ENOMEM); 4047 } 4048 rw_exit(&ipst->ips_ip_g_nd_lock); 4049 return (0); 4050 } 4051 4052 /* 4053 * Intializes the context structure and returns the first ill in the list 4054 * cuurently start_list and end_list can have values: 4055 * MAX_G_HEADS Traverse both IPV4 and IPV6 lists. 4056 * IP_V4_G_HEAD Traverse IPV4 list only. 4057 * IP_V6_G_HEAD Traverse IPV6 list only. 4058 */ 4059 4060 /* 4061 * We don't check for CONDEMNED ills here. Caller must do that if 4062 * necessary under the ill lock. 4063 */ 4064 ill_t * 4065 ill_first(int start_list, int end_list, ill_walk_context_t *ctx, 4066 ip_stack_t *ipst) 4067 { 4068 ill_if_t *ifp; 4069 ill_t *ill; 4070 avl_tree_t *avl_tree; 4071 4072 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 4073 ASSERT(end_list <= MAX_G_HEADS && start_list >= 0); 4074 4075 /* 4076 * setup the lists to search 4077 */ 4078 if (end_list != MAX_G_HEADS) { 4079 ctx->ctx_current_list = start_list; 4080 ctx->ctx_last_list = end_list; 4081 } else { 4082 ctx->ctx_last_list = MAX_G_HEADS - 1; 4083 ctx->ctx_current_list = 0; 4084 } 4085 4086 while (ctx->ctx_current_list <= ctx->ctx_last_list) { 4087 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst); 4088 if (ifp != (ill_if_t *) 4089 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) { 4090 avl_tree = &ifp->illif_avl_by_ppa; 4091 ill = avl_first(avl_tree); 4092 /* 4093 * ill is guaranteed to be non NULL or ifp should have 4094 * not existed. 4095 */ 4096 ASSERT(ill != NULL); 4097 return (ill); 4098 } 4099 ctx->ctx_current_list++; 4100 } 4101 4102 return (NULL); 4103 } 4104 4105 /* 4106 * returns the next ill in the list. ill_first() must have been called 4107 * before calling ill_next() or bad things will happen. 
4108 */ 4109 4110 /* 4111 * We don't check for CONDEMNED ills here. Caller must do that if 4112 * necessary under the ill lock. 4113 */ 4114 ill_t * 4115 ill_next(ill_walk_context_t *ctx, ill_t *lastill) 4116 { 4117 ill_if_t *ifp; 4118 ill_t *ill; 4119 ip_stack_t *ipst = lastill->ill_ipst; 4120 4121 ASSERT(lastill->ill_ifptr != (ill_if_t *) 4122 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)); 4123 if ((ill = avl_walk(&lastill->ill_ifptr->illif_avl_by_ppa, lastill, 4124 AVL_AFTER)) != NULL) { 4125 return (ill); 4126 } 4127 4128 /* goto next ill_ifp in the list. */ 4129 ifp = lastill->ill_ifptr->illif_next; 4130 4131 /* make sure not at end of circular list */ 4132 while (ifp == 4133 (ill_if_t *)&IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) { 4134 if (++ctx->ctx_current_list > ctx->ctx_last_list) 4135 return (NULL); 4136 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst); 4137 } 4138 4139 return (avl_first(&ifp->illif_avl_by_ppa)); 4140 } 4141 4142 /* 4143 * Check interface name for correct format which is name+ppa. 4144 * name can contain characters and digits, the right most digits 4145 * make up the ppa number. use of octal is not allowed, name must contain 4146 * a ppa, return pointer to the start of ppa. 4147 * In case of error return NULL. 4148 */ 4149 static char * 4150 ill_get_ppa_ptr(char *name) 4151 { 4152 int namelen = mi_strlen(name); 4153 4154 int len = namelen; 4155 4156 name += len; 4157 while (len > 0) { 4158 name--; 4159 if (*name < '0' || *name > '9') 4160 break; 4161 len--; 4162 } 4163 4164 /* empty string, all digits, or no trailing digits */ 4165 if (len == 0 || len == (int)namelen) 4166 return (NULL); 4167 4168 name++; 4169 /* check for attempted use of octal */ 4170 if (*name == '0' && len != (int)namelen - 1) 4171 return (NULL); 4172 return (name); 4173 } 4174 4175 /* 4176 * use avl tree to locate the ill. 
4177 */ 4178 static ill_t * 4179 ill_find_by_name(char *name, boolean_t isv6, queue_t *q, mblk_t *mp, 4180 ipsq_func_t func, int *error, ip_stack_t *ipst) 4181 { 4182 char *ppa_ptr = NULL; 4183 int len; 4184 uint_t ppa; 4185 ill_t *ill = NULL; 4186 ill_if_t *ifp; 4187 int list; 4188 ipsq_t *ipsq; 4189 4190 if (error != NULL) 4191 *error = 0; 4192 4193 /* 4194 * get ppa ptr 4195 */ 4196 if (isv6) 4197 list = IP_V6_G_HEAD; 4198 else 4199 list = IP_V4_G_HEAD; 4200 4201 if ((ppa_ptr = ill_get_ppa_ptr(name)) == NULL) { 4202 if (error != NULL) 4203 *error = ENXIO; 4204 return (NULL); 4205 } 4206 4207 len = ppa_ptr - name + 1; 4208 4209 ppa = stoi(&ppa_ptr); 4210 4211 ifp = IP_VX_ILL_G_LIST(list, ipst); 4212 4213 while (ifp != (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) { 4214 /* 4215 * match is done on len - 1 as the name is not null 4216 * terminated it contains ppa in addition to the interface 4217 * name. 4218 */ 4219 if ((ifp->illif_name_len == len) && 4220 bcmp(ifp->illif_name, name, len - 1) == 0) { 4221 break; 4222 } else { 4223 ifp = ifp->illif_next; 4224 } 4225 } 4226 4227 4228 if (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) { 4229 /* 4230 * Even the interface type does not exist. 
4231 */ 4232 if (error != NULL) 4233 *error = ENXIO; 4234 return (NULL); 4235 } 4236 4237 ill = avl_find(&ifp->illif_avl_by_ppa, (void *) &ppa, NULL); 4238 if (ill != NULL) { 4239 /* 4240 * The block comment at the start of ipif_down 4241 * explains the use of the macros used below 4242 */ 4243 GRAB_CONN_LOCK(q); 4244 mutex_enter(&ill->ill_lock); 4245 if (ILL_CAN_LOOKUP(ill)) { 4246 ill_refhold_locked(ill); 4247 mutex_exit(&ill->ill_lock); 4248 RELEASE_CONN_LOCK(q); 4249 return (ill); 4250 } else if (ILL_CAN_WAIT(ill, q)) { 4251 ipsq = ill->ill_phyint->phyint_ipsq; 4252 mutex_enter(&ipsq->ipsq_lock); 4253 mutex_exit(&ill->ill_lock); 4254 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 4255 mutex_exit(&ipsq->ipsq_lock); 4256 RELEASE_CONN_LOCK(q); 4257 *error = EINPROGRESS; 4258 return (NULL); 4259 } 4260 mutex_exit(&ill->ill_lock); 4261 RELEASE_CONN_LOCK(q); 4262 } 4263 if (error != NULL) 4264 *error = ENXIO; 4265 return (NULL); 4266 } 4267 4268 /* 4269 * comparison function for use with avl. 4270 */ 4271 static int 4272 ill_compare_ppa(const void *ppa_ptr, const void *ill_ptr) 4273 { 4274 uint_t ppa; 4275 uint_t ill_ppa; 4276 4277 ASSERT(ppa_ptr != NULL && ill_ptr != NULL); 4278 4279 ppa = *((uint_t *)ppa_ptr); 4280 ill_ppa = ((const ill_t *)ill_ptr)->ill_ppa; 4281 /* 4282 * We want the ill with the lowest ppa to be on the 4283 * top. 4284 */ 4285 if (ill_ppa < ppa) 4286 return (1); 4287 if (ill_ppa > ppa) 4288 return (-1); 4289 return (0); 4290 } 4291 4292 /* 4293 * remove an interface type from the global list. 
4294 */ 4295 static void 4296 ill_delete_interface_type(ill_if_t *interface) 4297 { 4298 ASSERT(interface != NULL); 4299 ASSERT(avl_numnodes(&interface->illif_avl_by_ppa) == 0); 4300 4301 avl_destroy(&interface->illif_avl_by_ppa); 4302 if (interface->illif_ppa_arena != NULL) 4303 vmem_destroy(interface->illif_ppa_arena); 4304 4305 remque(interface); 4306 4307 mi_free(interface); 4308 } 4309 4310 /* Defined in ip_netinfo.c */ 4311 extern ddi_taskq_t *eventq_queue_nic; 4312 4313 /* 4314 * remove ill from the global list. 4315 */ 4316 static void 4317 ill_glist_delete(ill_t *ill) 4318 { 4319 char *nicname; 4320 size_t nicnamelen; 4321 hook_nic_event_t *info; 4322 ip_stack_t *ipst; 4323 4324 if (ill == NULL) 4325 return; 4326 ipst = ill->ill_ipst; 4327 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 4328 4329 if (ill->ill_name != NULL) { 4330 nicname = kmem_alloc(ill->ill_name_length, KM_NOSLEEP); 4331 if (nicname != NULL) { 4332 bcopy(ill->ill_name, nicname, ill->ill_name_length); 4333 nicnamelen = ill->ill_name_length; 4334 } 4335 } else { 4336 nicname = NULL; 4337 nicnamelen = 0; 4338 } 4339 4340 /* 4341 * If the ill was never inserted into the AVL tree 4342 * we skip the if branch. 4343 */ 4344 if (ill->ill_ifptr != NULL) { 4345 /* 4346 * remove from AVL tree and free ppa number 4347 */ 4348 avl_remove(&ill->ill_ifptr->illif_avl_by_ppa, ill); 4349 4350 if (ill->ill_ifptr->illif_ppa_arena != NULL) { 4351 vmem_free(ill->ill_ifptr->illif_ppa_arena, 4352 (void *)(uintptr_t)(ill->ill_ppa+1), 1); 4353 } 4354 if (avl_numnodes(&ill->ill_ifptr->illif_avl_by_ppa) == 0) { 4355 ill_delete_interface_type(ill->ill_ifptr); 4356 } 4357 4358 /* 4359 * Indicate ill is no longer in the list. 4360 */ 4361 ill->ill_ifptr = NULL; 4362 ill->ill_name_length = 0; 4363 ill->ill_name[0] = '\0'; 4364 ill->ill_ppa = UINT_MAX; 4365 } 4366 4367 /* 4368 * Run the unplumb hook after the NIC has disappeared from being 4369 * visible so that attempts to revalidate its existance will fail. 
4370 * 4371 * This needs to be run inside the ill_g_lock perimeter to ensure 4372 * that the ordering of delivered events to listeners matches the 4373 * order of them in the kernel. 4374 */ 4375 if ((info = ill->ill_nic_event_info) != NULL) { 4376 if (info->hne_event != NE_DOWN) { 4377 ip2dbg(("ill_glist_delete: unexpected nic event %d " 4378 "attached for %s\n", info->hne_event, 4379 ill->ill_name)); 4380 if (info->hne_data != NULL) 4381 kmem_free(info->hne_data, info->hne_datalen); 4382 kmem_free(info, sizeof (hook_nic_event_t)); 4383 } else { 4384 if (ddi_taskq_dispatch(eventq_queue_nic, 4385 ip_ne_queue_func, (void *)info, DDI_SLEEP) 4386 == DDI_FAILURE) { 4387 ip2dbg(("ill_glist_delete: ddi_taskq_dispatch " 4388 "failed\n")); 4389 if (info->hne_data != NULL) 4390 kmem_free(info->hne_data, 4391 info->hne_datalen); 4392 kmem_free(info, sizeof (hook_nic_event_t)); 4393 } 4394 } 4395 } 4396 4397 /* Generate NE_UNPLUMB event for ill_name. */ 4398 info = kmem_alloc(sizeof (hook_nic_event_t), KM_NOSLEEP); 4399 if (info != NULL) { 4400 info->hne_nic = ill->ill_phyint->phyint_ifindex; 4401 info->hne_lif = 0; 4402 info->hne_event = NE_UNPLUMB; 4403 info->hne_data = nicname; 4404 info->hne_datalen = nicnamelen; 4405 info->hne_family = ill->ill_isv6 ? 4406 ipst->ips_ipv6_net_data : ipst->ips_ipv4_net_data; 4407 } else { 4408 ip2dbg(("ill_glist_delete: could not attach UNPLUMB nic event " 4409 "information for %s (ENOMEM)\n", ill->ill_name)); 4410 if (nicname != NULL) 4411 kmem_free(nicname, nicnamelen); 4412 } 4413 4414 ill->ill_nic_event_info = info; 4415 4416 ill_phyint_free(ill); 4417 rw_exit(&ipst->ips_ill_g_lock); 4418 } 4419 4420 /* 4421 * allocate a ppa, if the number of plumbed interfaces of this type are 4422 * less than ill_no_arena do a linear search to find a unused ppa. 4423 * When the number goes beyond ill_no_arena switch to using an arena. 
4424 * Note: ppa value of zero cannot be allocated from vmem_arena as it 4425 * is the return value for an error condition, so allocation starts at one 4426 * and is decremented by one. 4427 */ 4428 static int 4429 ill_alloc_ppa(ill_if_t *ifp, ill_t *ill) 4430 { 4431 ill_t *tmp_ill; 4432 uint_t start, end; 4433 int ppa; 4434 4435 if (ifp->illif_ppa_arena == NULL && 4436 (avl_numnodes(&ifp->illif_avl_by_ppa) + 1 > ill_no_arena)) { 4437 /* 4438 * Create an arena. 4439 */ 4440 ifp->illif_ppa_arena = vmem_create(ifp->illif_name, 4441 (void *)1, UINT_MAX - 1, 1, NULL, NULL, 4442 NULL, 0, VM_SLEEP | VMC_IDENTIFIER); 4443 /* allocate what has already been assigned */ 4444 for (tmp_ill = avl_first(&ifp->illif_avl_by_ppa); 4445 tmp_ill != NULL; tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, 4446 tmp_ill, AVL_AFTER)) { 4447 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena, 4448 1, /* size */ 4449 1, /* align/quantum */ 4450 0, /* phase */ 4451 0, /* nocross */ 4452 /* minaddr */ 4453 (void *)((uintptr_t)tmp_ill->ill_ppa + 1), 4454 /* maxaddr */ 4455 (void *)((uintptr_t)tmp_ill->ill_ppa + 2), 4456 VM_NOSLEEP|VM_FIRSTFIT); 4457 if (ppa == 0) { 4458 ip1dbg(("ill_alloc_ppa: ppa allocation" 4459 " failed while switching")); 4460 vmem_destroy(ifp->illif_ppa_arena); 4461 ifp->illif_ppa_arena = NULL; 4462 break; 4463 } 4464 } 4465 } 4466 4467 if (ifp->illif_ppa_arena != NULL) { 4468 if (ill->ill_ppa == UINT_MAX) { 4469 ppa = (int)(uintptr_t)vmem_alloc(ifp->illif_ppa_arena, 4470 1, VM_NOSLEEP|VM_FIRSTFIT); 4471 if (ppa == 0) 4472 return (EAGAIN); 4473 ill->ill_ppa = --ppa; 4474 } else { 4475 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena, 4476 1, /* size */ 4477 1, /* align/quantum */ 4478 0, /* phase */ 4479 0, /* nocross */ 4480 (void *)(uintptr_t)(ill->ill_ppa + 1), /* minaddr */ 4481 (void *)(uintptr_t)(ill->ill_ppa + 2), /* maxaddr */ 4482 VM_NOSLEEP|VM_FIRSTFIT); 4483 /* 4484 * Most likely the allocation failed because 4485 * the requested ppa was in use. 
4486 */ 4487 if (ppa == 0) 4488 return (EEXIST); 4489 } 4490 return (0); 4491 } 4492 4493 /* 4494 * No arena is in use and not enough (>ill_no_arena) interfaces have 4495 * been plumbed to create one. Do a linear search to get a unused ppa. 4496 */ 4497 if (ill->ill_ppa == UINT_MAX) { 4498 end = UINT_MAX - 1; 4499 start = 0; 4500 } else { 4501 end = start = ill->ill_ppa; 4502 } 4503 4504 tmp_ill = avl_find(&ifp->illif_avl_by_ppa, (void *)&start, NULL); 4505 while (tmp_ill != NULL && tmp_ill->ill_ppa == start) { 4506 if (start++ >= end) { 4507 if (ill->ill_ppa == UINT_MAX) 4508 return (EAGAIN); 4509 else 4510 return (EEXIST); 4511 } 4512 tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, tmp_ill, AVL_AFTER); 4513 } 4514 ill->ill_ppa = start; 4515 return (0); 4516 } 4517 4518 /* 4519 * Insert ill into the list of configured ill's. Once this function completes, 4520 * the ill is globally visible and is available through lookups. More precisely 4521 * this happens after the caller drops the ill_g_lock. 4522 */ 4523 static int 4524 ill_glist_insert(ill_t *ill, char *name, boolean_t isv6) 4525 { 4526 ill_if_t *ill_interface; 4527 avl_index_t where = 0; 4528 int error; 4529 int name_length; 4530 int index; 4531 boolean_t check_length = B_FALSE; 4532 ip_stack_t *ipst = ill->ill_ipst; 4533 4534 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 4535 4536 name_length = mi_strlen(name) + 1; 4537 4538 if (isv6) 4539 index = IP_V6_G_HEAD; 4540 else 4541 index = IP_V4_G_HEAD; 4542 4543 ill_interface = IP_VX_ILL_G_LIST(index, ipst); 4544 /* 4545 * Search for interface type based on name 4546 */ 4547 while (ill_interface != (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) { 4548 if ((ill_interface->illif_name_len == name_length) && 4549 (strcmp(ill_interface->illif_name, name) == 0)) { 4550 break; 4551 } 4552 ill_interface = ill_interface->illif_next; 4553 } 4554 4555 /* 4556 * Interface type not found, create one. 
4557 */ 4558 if (ill_interface == (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) { 4559 4560 ill_g_head_t ghead; 4561 4562 /* 4563 * allocate ill_if_t structure 4564 */ 4565 4566 ill_interface = (ill_if_t *)mi_zalloc(sizeof (ill_if_t)); 4567 if (ill_interface == NULL) { 4568 return (ENOMEM); 4569 } 4570 4571 4572 4573 (void) strcpy(ill_interface->illif_name, name); 4574 ill_interface->illif_name_len = name_length; 4575 4576 avl_create(&ill_interface->illif_avl_by_ppa, 4577 ill_compare_ppa, sizeof (ill_t), 4578 offsetof(struct ill_s, ill_avl_byppa)); 4579 4580 /* 4581 * link the structure in the back to maintain order 4582 * of configuration for ifconfig output. 4583 */ 4584 ghead = ipst->ips_ill_g_heads[index]; 4585 insque(ill_interface, ghead.ill_g_list_tail); 4586 4587 } 4588 4589 if (ill->ill_ppa == UINT_MAX) 4590 check_length = B_TRUE; 4591 4592 error = ill_alloc_ppa(ill_interface, ill); 4593 if (error != 0) { 4594 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0) 4595 ill_delete_interface_type(ill->ill_ifptr); 4596 return (error); 4597 } 4598 4599 /* 4600 * When the ppa is choosen by the system, check that there is 4601 * enough space to insert ppa. if a specific ppa was passed in this 4602 * check is not required as the interface name passed in will have 4603 * the right ppa in it. 4604 */ 4605 if (check_length) { 4606 /* 4607 * UINT_MAX - 1 should fit in 10 chars, alloc 12 chars. 4608 */ 4609 char buf[sizeof (uint_t) * 3]; 4610 4611 /* 4612 * convert ppa to string to calculate the amount of space 4613 * required for it in the name. 4614 */ 4615 numtos(ill->ill_ppa, buf); 4616 4617 /* Do we have enough space to insert ppa ? 
*/ 4618 4619 if ((mi_strlen(name) + mi_strlen(buf) + 1) > LIFNAMSIZ) { 4620 /* Free ppa and interface type struct */ 4621 if (ill_interface->illif_ppa_arena != NULL) { 4622 vmem_free(ill_interface->illif_ppa_arena, 4623 (void *)(uintptr_t)(ill->ill_ppa+1), 1); 4624 } 4625 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 4626 0) { 4627 ill_delete_interface_type(ill->ill_ifptr); 4628 } 4629 4630 return (EINVAL); 4631 } 4632 } 4633 4634 (void) sprintf(ill->ill_name, "%s%u", name, ill->ill_ppa); 4635 ill->ill_name_length = mi_strlen(ill->ill_name) + 1; 4636 4637 (void) avl_find(&ill_interface->illif_avl_by_ppa, &ill->ill_ppa, 4638 &where); 4639 ill->ill_ifptr = ill_interface; 4640 avl_insert(&ill_interface->illif_avl_by_ppa, ill, where); 4641 4642 ill_phyint_reinit(ill); 4643 return (0); 4644 } 4645 4646 /* Initialize the per phyint (per IPMP group) ipsq used for serialization */ 4647 static boolean_t 4648 ipsq_init(ill_t *ill) 4649 { 4650 ipsq_t *ipsq; 4651 4652 /* Init the ipsq and impicitly enter as writer */ 4653 ill->ill_phyint->phyint_ipsq = 4654 kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP); 4655 if (ill->ill_phyint->phyint_ipsq == NULL) 4656 return (B_FALSE); 4657 ipsq = ill->ill_phyint->phyint_ipsq; 4658 ipsq->ipsq_phyint_list = ill->ill_phyint; 4659 ill->ill_phyint->phyint_ipsq_next = NULL; 4660 mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, 0); 4661 ipsq->ipsq_refs = 1; 4662 ipsq->ipsq_writer = curthread; 4663 ipsq->ipsq_reentry_cnt = 1; 4664 ipsq->ipsq_ipst = ill->ill_ipst; /* No netstack_hold */ 4665 #ifdef DEBUG 4666 ipsq->ipsq_depth = getpcstack((pc_t *)ipsq->ipsq_stack, 4667 IPSQ_STACK_DEPTH); 4668 #endif 4669 (void) strcpy(ipsq->ipsq_name, ill->ill_name); 4670 return (B_TRUE); 4671 } 4672 4673 /* 4674 * ill_init is called by ip_open when a device control stream is opened. 4675 * It does a few initializations, and shoots a DL_INFO_REQ message down 4676 * to the driver. 
The response is later picked up in ip_rput_dlpi and 4677 * used to set up default mechanisms for talking to the driver. (Always 4678 * called as writer.) 4679 * 4680 * If this function returns error, ip_open will call ip_close which in 4681 * turn will call ill_delete to clean up any memory allocated here that 4682 * is not yet freed. 4683 */ 4684 int 4685 ill_init(queue_t *q, ill_t *ill) 4686 { 4687 int count; 4688 dl_info_req_t *dlir; 4689 mblk_t *info_mp; 4690 uchar_t *frag_ptr; 4691 4692 /* 4693 * The ill is initialized to zero by mi_alloc*(). In addition 4694 * some fields already contain valid values, initialized in 4695 * ip_open(), before we reach here. 4696 */ 4697 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, 0); 4698 4699 ill->ill_rq = q; 4700 ill->ill_wq = WR(q); 4701 4702 info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)), 4703 BPRI_HI); 4704 if (info_mp == NULL) 4705 return (ENOMEM); 4706 4707 /* 4708 * Allocate sufficient space to contain our fragment hash table and 4709 * the device name. 4710 */ 4711 frag_ptr = (uchar_t *)mi_zalloc(ILL_FRAG_HASH_TBL_SIZE + 4712 2 * LIFNAMSIZ + 5 + strlen(ipv6_forward_suffix)); 4713 if (frag_ptr == NULL) { 4714 freemsg(info_mp); 4715 return (ENOMEM); 4716 } 4717 ill->ill_frag_ptr = frag_ptr; 4718 ill->ill_frag_free_num_pkts = 0; 4719 ill->ill_last_frag_clean_time = 0; 4720 ill->ill_frag_hash_tbl = (ipfb_t *)frag_ptr; 4721 ill->ill_name = (char *)(frag_ptr + ILL_FRAG_HASH_TBL_SIZE); 4722 for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) { 4723 mutex_init(&ill->ill_frag_hash_tbl[count].ipfb_lock, 4724 NULL, MUTEX_DEFAULT, NULL); 4725 } 4726 4727 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t)); 4728 if (ill->ill_phyint == NULL) { 4729 freemsg(info_mp); 4730 mi_free(frag_ptr); 4731 return (ENOMEM); 4732 } 4733 4734 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0); 4735 /* 4736 * For now pretend this is a v4 ill. 
We need to set phyint_ill* 4737 * at this point because of the following reason. If we can't 4738 * enter the ipsq at some point and cv_wait, the writer that 4739 * wakes us up tries to locate us using the list of all phyints 4740 * in an ipsq and the ills from the phyint thru the phyint_ill*. 4741 * If we don't set it now, we risk a missed wakeup. 4742 */ 4743 ill->ill_phyint->phyint_illv4 = ill; 4744 ill->ill_ppa = UINT_MAX; 4745 ill->ill_fastpath_list = &ill->ill_fastpath_list; 4746 4747 if (!ipsq_init(ill)) { 4748 freemsg(info_mp); 4749 mi_free(frag_ptr); 4750 mi_free(ill->ill_phyint); 4751 return (ENOMEM); 4752 } 4753 4754 ill->ill_state_flags |= ILL_LL_SUBNET_PENDING; 4755 4756 4757 /* Frag queue limit stuff */ 4758 ill->ill_frag_count = 0; 4759 ill->ill_ipf_gen = 0; 4760 4761 ill->ill_global_timer = INFINITY; 4762 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0; 4763 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0; 4764 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS; 4765 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL; 4766 4767 /* 4768 * Initialize IPv6 configuration variables. The IP module is always 4769 * opened as an IPv4 module. Instead tracking down the cases where 4770 * it switches to do ipv6, we'll just initialize the IPv6 configuration 4771 * here for convenience, this has no effect until the ill is set to do 4772 * IPv6. 4773 */ 4774 ill->ill_reachable_time = ND_REACHABLE_TIME; 4775 ill->ill_reachable_retrans_time = ND_RETRANS_TIMER; 4776 ill->ill_xmit_count = ND_MAX_MULTICAST_SOLICIT; 4777 ill->ill_max_buf = ND_MAX_Q; 4778 ill->ill_refcnt = 0; 4779 4780 /* Send down the Info Request to the driver. 
*/ 4781 info_mp->b_datap->db_type = M_PCPROTO; 4782 dlir = (dl_info_req_t *)info_mp->b_rptr; 4783 info_mp->b_wptr = (uchar_t *)&dlir[1]; 4784 dlir->dl_primitive = DL_INFO_REQ; 4785 4786 ill->ill_dlpi_pending = DL_PRIM_INVAL; 4787 4788 qprocson(q); 4789 ill_dlpi_send(ill, info_mp); 4790 4791 return (0); 4792 } 4793 4794 /* 4795 * ill_dls_info 4796 * creates datalink socket info from the device. 4797 */ 4798 int 4799 ill_dls_info(struct sockaddr_dl *sdl, const ipif_t *ipif) 4800 { 4801 size_t len; 4802 ill_t *ill = ipif->ipif_ill; 4803 4804 sdl->sdl_family = AF_LINK; 4805 sdl->sdl_index = ill->ill_phyint->phyint_ifindex; 4806 sdl->sdl_type = ill->ill_type; 4807 ipif_get_name(ipif, sdl->sdl_data, sizeof (sdl->sdl_data)); 4808 len = strlen(sdl->sdl_data); 4809 ASSERT(len < 256); 4810 sdl->sdl_nlen = (uchar_t)len; 4811 sdl->sdl_alen = ill->ill_phys_addr_length; 4812 sdl->sdl_slen = 0; 4813 if (ill->ill_phys_addr_length != 0 && ill->ill_phys_addr != NULL) 4814 bcopy(ill->ill_phys_addr, &sdl->sdl_data[len], sdl->sdl_alen); 4815 4816 return (sizeof (struct sockaddr_dl)); 4817 } 4818 4819 /* 4820 * ill_xarp_info 4821 * creates xarp info from the device. 
4822 */ 4823 static int 4824 ill_xarp_info(struct sockaddr_dl *sdl, ill_t *ill) 4825 { 4826 sdl->sdl_family = AF_LINK; 4827 sdl->sdl_index = ill->ill_phyint->phyint_ifindex; 4828 sdl->sdl_type = ill->ill_type; 4829 ipif_get_name(ill->ill_ipif, sdl->sdl_data, sizeof (sdl->sdl_data)); 4830 sdl->sdl_nlen = (uchar_t)mi_strlen(sdl->sdl_data); 4831 sdl->sdl_alen = ill->ill_phys_addr_length; 4832 sdl->sdl_slen = 0; 4833 return (sdl->sdl_nlen); 4834 } 4835 4836 static int 4837 loopback_kstat_update(kstat_t *ksp, int rw) 4838 { 4839 kstat_named_t *kn; 4840 netstackid_t stackid; 4841 netstack_t *ns; 4842 ip_stack_t *ipst; 4843 4844 if (ksp == NULL || ksp->ks_data == NULL) 4845 return (EIO); 4846 4847 if (rw == KSTAT_WRITE) 4848 return (EACCES); 4849 4850 kn = KSTAT_NAMED_PTR(ksp); 4851 stackid = (zoneid_t)(uintptr_t)ksp->ks_private; 4852 4853 ns = netstack_find_by_stackid(stackid); 4854 if (ns == NULL) 4855 return (-1); 4856 4857 ipst = ns->netstack_ip; 4858 if (ipst == NULL) { 4859 netstack_rele(ns); 4860 return (-1); 4861 } 4862 kn[0].value.ui32 = ipst->ips_loopback_packets; 4863 kn[1].value.ui32 = ipst->ips_loopback_packets; 4864 netstack_rele(ns); 4865 return (0); 4866 } 4867 4868 4869 /* 4870 * Has ifindex been plumbed already. 4871 * Compares both phyint_ifindex and phyint_group_ifindex. 4872 */ 4873 static boolean_t 4874 phyint_exists(uint_t index, ip_stack_t *ipst) 4875 { 4876 phyint_t *phyi; 4877 4878 ASSERT(index != 0); 4879 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 4880 /* 4881 * Indexes are stored in the phyint - a common structure 4882 * to both IPv4 and IPv6. 
4883 */ 4884 phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index); 4885 for (; phyi != NULL; 4886 phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 4887 phyi, AVL_AFTER)) { 4888 if (phyi->phyint_ifindex == index || 4889 phyi->phyint_group_ifindex == index) 4890 return (B_TRUE); 4891 } 4892 return (B_FALSE); 4893 } 4894 4895 /* Pick a unique ifindex */ 4896 boolean_t 4897 ip_assign_ifindex(uint_t *indexp, ip_stack_t *ipst) 4898 { 4899 uint_t starting_index; 4900 4901 if (!ipst->ips_ill_index_wrap) { 4902 *indexp = ipst->ips_ill_index++; 4903 if (ipst->ips_ill_index == 0) { 4904 /* Reached the uint_t limit Next time wrap */ 4905 ipst->ips_ill_index_wrap = B_TRUE; 4906 } 4907 return (B_TRUE); 4908 } 4909 4910 /* 4911 * Start reusing unused indexes. Note that we hold the ill_g_lock 4912 * at this point and don't want to call any function that attempts 4913 * to get the lock again. 4914 */ 4915 starting_index = ipst->ips_ill_index++; 4916 for (; ipst->ips_ill_index != starting_index; ipst->ips_ill_index++) { 4917 if (ipst->ips_ill_index != 0 && 4918 !phyint_exists(ipst->ips_ill_index, ipst)) { 4919 /* found unused index - use it */ 4920 *indexp = ipst->ips_ill_index; 4921 return (B_TRUE); 4922 } 4923 } 4924 4925 /* 4926 * all interface indicies are inuse. 4927 */ 4928 return (B_FALSE); 4929 } 4930 4931 /* 4932 * Assign a unique interface index for the phyint. 4933 */ 4934 static boolean_t 4935 phyint_assign_ifindex(phyint_t *phyi, ip_stack_t *ipst) 4936 { 4937 ASSERT(phyi->phyint_ifindex == 0); 4938 return (ip_assign_ifindex(&phyi->phyint_ifindex, ipst)); 4939 } 4940 4941 /* 4942 * Return a pointer to the ill which matches the supplied name. Note that 4943 * the ill name length includes the null termination character. (May be 4944 * called as writer.) 4945 * If do_alloc and the interface is "lo0" it will be automatically created. 4946 * Cannot bump up reference on condemned ills. So dup detect can't be done 4947 * using this func. 
4948 */ 4949 ill_t * 4950 ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, 4951 queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, boolean_t *did_alloc, 4952 ip_stack_t *ipst) 4953 { 4954 ill_t *ill; 4955 ipif_t *ipif; 4956 kstat_named_t *kn; 4957 boolean_t isloopback; 4958 ipsq_t *old_ipsq; 4959 in6_addr_t ov6addr; 4960 4961 isloopback = mi_strcmp(name, ipif_loopback_name) == 0; 4962 4963 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4964 ill = ill_find_by_name(name, isv6, q, mp, func, error, ipst); 4965 rw_exit(&ipst->ips_ill_g_lock); 4966 if (ill != NULL || (error != NULL && *error == EINPROGRESS)) 4967 return (ill); 4968 4969 /* 4970 * Couldn't find it. Does this happen to be a lookup for the 4971 * loopback device and are we allowed to allocate it? 4972 */ 4973 if (!isloopback || !do_alloc) 4974 return (NULL); 4975 4976 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 4977 4978 ill = ill_find_by_name(name, isv6, q, mp, func, error, ipst); 4979 if (ill != NULL || (error != NULL && *error == EINPROGRESS)) { 4980 rw_exit(&ipst->ips_ill_g_lock); 4981 return (ill); 4982 } 4983 4984 /* Create the loopback device on demand */ 4985 ill = (ill_t *)(mi_alloc(sizeof (ill_t) + 4986 sizeof (ipif_loopback_name), BPRI_MED)); 4987 if (ill == NULL) 4988 goto done; 4989 4990 *ill = ill_null; 4991 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, NULL); 4992 ill->ill_ipst = ipst; 4993 netstack_hold(ipst->ips_netstack); 4994 /* 4995 * For exclusive stacks we set the zoneid to zero 4996 * to make IP operate as if in the global zone. 
4997 */ 4998 ill->ill_zoneid = GLOBAL_ZONEID; 4999 5000 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t)); 5001 if (ill->ill_phyint == NULL) 5002 goto done; 5003 5004 if (isv6) 5005 ill->ill_phyint->phyint_illv6 = ill; 5006 else 5007 ill->ill_phyint->phyint_illv4 = ill; 5008 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0); 5009 ill->ill_max_frag = IP_LOOPBACK_MTU; 5010 /* Add room for tcp+ip headers */ 5011 if (isv6) { 5012 ill->ill_isv6 = B_TRUE; 5013 ill->ill_max_frag += IPV6_HDR_LEN + 20; /* for TCP */ 5014 } else { 5015 ill->ill_max_frag += IP_SIMPLE_HDR_LENGTH + 20; 5016 } 5017 if (!ill_allocate_mibs(ill)) 5018 goto done; 5019 ill->ill_max_mtu = ill->ill_max_frag; 5020 /* 5021 * ipif_loopback_name can't be pointed at directly because its used 5022 * by both the ipv4 and ipv6 interfaces. When the ill is removed 5023 * from the glist, ill_glist_delete() sets the first character of 5024 * ill_name to '\0'. 5025 */ 5026 ill->ill_name = (char *)ill + sizeof (*ill); 5027 (void) strcpy(ill->ill_name, ipif_loopback_name); 5028 ill->ill_name_length = sizeof (ipif_loopback_name); 5029 /* Set ill_name_set for ill_phyint_reinit to work properly */ 5030 5031 ill->ill_global_timer = INFINITY; 5032 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0; 5033 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0; 5034 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS; 5035 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL; 5036 5037 /* No resolver here. 
*/ 5038 ill->ill_net_type = IRE_LOOPBACK; 5039 5040 /* Initialize the ipsq */ 5041 if (!ipsq_init(ill)) 5042 goto done; 5043 5044 ill->ill_phyint->phyint_ipsq->ipsq_writer = NULL; 5045 ill->ill_phyint->phyint_ipsq->ipsq_reentry_cnt--; 5046 ASSERT(ill->ill_phyint->phyint_ipsq->ipsq_reentry_cnt == 0); 5047 #ifdef DEBUG 5048 ill->ill_phyint->phyint_ipsq->ipsq_depth = 0; 5049 #endif 5050 ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE); 5051 if (ipif == NULL) 5052 goto done; 5053 5054 ill->ill_flags = ILLF_MULTICAST; 5055 5056 ov6addr = ipif->ipif_v6lcl_addr; 5057 /* Set up default loopback address and mask. */ 5058 if (!isv6) { 5059 ipaddr_t inaddr_loopback = htonl(INADDR_LOOPBACK); 5060 5061 IN6_IPADDR_TO_V4MAPPED(inaddr_loopback, &ipif->ipif_v6lcl_addr); 5062 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; 5063 V4MASK_TO_V6(htonl(IN_CLASSA_NET), ipif->ipif_v6net_mask); 5064 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 5065 ipif->ipif_v6subnet); 5066 ill->ill_flags |= ILLF_IPV4; 5067 } else { 5068 ipif->ipif_v6lcl_addr = ipv6_loopback; 5069 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; 5070 ipif->ipif_v6net_mask = ipv6_all_ones; 5071 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 5072 ipif->ipif_v6subnet); 5073 ill->ill_flags |= ILLF_IPV6; 5074 } 5075 5076 /* 5077 * Chain us in at the end of the ill list. hold the ill 5078 * before we make it globally visible. 1 for the lookup. 
5079 */ 5080 ill->ill_refcnt = 0; 5081 ill_refhold(ill); 5082 5083 ill->ill_frag_count = 0; 5084 ill->ill_frag_free_num_pkts = 0; 5085 ill->ill_last_frag_clean_time = 0; 5086 5087 old_ipsq = ill->ill_phyint->phyint_ipsq; 5088 5089 if (ill_glist_insert(ill, "lo", isv6) != 0) 5090 cmn_err(CE_PANIC, "cannot insert loopback interface"); 5091 5092 /* Let SCTP know so that it can add this to its list */ 5093 sctp_update_ill(ill, SCTP_ILL_INSERT); 5094 5095 /* 5096 * We have already assigned ipif_v6lcl_addr above, but we need to 5097 * call sctp_update_ipif_addr() after SCTP_ILL_INSERT, which 5098 * requires to be after ill_glist_insert() since we need the 5099 * ill_index set. Pass on ipv6_loopback as the old address. 5100 */ 5101 sctp_update_ipif_addr(ipif, ov6addr); 5102 5103 /* 5104 * If the ipsq was changed in ill_phyint_reinit free the old ipsq. 5105 */ 5106 if (old_ipsq != ill->ill_phyint->phyint_ipsq) { 5107 /* Loopback ills aren't in any IPMP group */ 5108 ASSERT(!(old_ipsq->ipsq_flags & IPSQ_GROUP)); 5109 ipsq_delete(old_ipsq); 5110 } 5111 5112 /* 5113 * Delay this till the ipif is allocated as ipif_allocate 5114 * de-references ill_phyint for getting the ifindex. We 5115 * can't do this before ipif_allocate because ill_phyint_reinit 5116 * -> phyint_assign_ifindex expects ipif to be present. 
5117 */ 5118 mutex_enter(&ill->ill_phyint->phyint_lock); 5119 ill->ill_phyint->phyint_flags |= PHYI_LOOPBACK | PHYI_VIRTUAL; 5120 mutex_exit(&ill->ill_phyint->phyint_lock); 5121 5122 if (ipst->ips_loopback_ksp == NULL) { 5123 /* Export loopback interface statistics */ 5124 ipst->ips_loopback_ksp = kstat_create_netstack("lo", 0, 5125 ipif_loopback_name, "net", 5126 KSTAT_TYPE_NAMED, 2, 0, 5127 ipst->ips_netstack->netstack_stackid); 5128 if (ipst->ips_loopback_ksp != NULL) { 5129 ipst->ips_loopback_ksp->ks_update = 5130 loopback_kstat_update; 5131 kn = KSTAT_NAMED_PTR(ipst->ips_loopback_ksp); 5132 kstat_named_init(&kn[0], "ipackets", KSTAT_DATA_UINT32); 5133 kstat_named_init(&kn[1], "opackets", KSTAT_DATA_UINT32); 5134 ipst->ips_loopback_ksp->ks_private = 5135 (void *)(uintptr_t)ipst->ips_netstack-> 5136 netstack_stackid; 5137 kstat_install(ipst->ips_loopback_ksp); 5138 } 5139 } 5140 5141 if (error != NULL) 5142 *error = 0; 5143 *did_alloc = B_TRUE; 5144 rw_exit(&ipst->ips_ill_g_lock); 5145 return (ill); 5146 done: 5147 if (ill != NULL) { 5148 if (ill->ill_phyint != NULL) { 5149 ipsq_t *ipsq; 5150 5151 ipsq = ill->ill_phyint->phyint_ipsq; 5152 if (ipsq != NULL) { 5153 ipsq->ipsq_ipst = NULL; 5154 kmem_free(ipsq, sizeof (ipsq_t)); 5155 } 5156 mi_free(ill->ill_phyint); 5157 } 5158 ill_free_mib(ill); 5159 if (ill->ill_ipst != NULL) 5160 netstack_rele(ill->ill_ipst->ips_netstack); 5161 mi_free(ill); 5162 } 5163 rw_exit(&ipst->ips_ill_g_lock); 5164 if (error != NULL) 5165 *error = ENOMEM; 5166 return (NULL); 5167 } 5168 5169 /* 5170 * For IPP calls - use the ip_stack_t for global stack. 
5171 */ 5172 ill_t * 5173 ill_lookup_on_ifindex_global_instance(uint_t index, boolean_t isv6, 5174 queue_t *q, mblk_t *mp, ipsq_func_t func, int *err) 5175 { 5176 ip_stack_t *ipst; 5177 ill_t *ill; 5178 5179 ipst = netstack_find_by_stackid(GLOBAL_NETSTACKID)->netstack_ip; 5180 if (ipst == NULL) { 5181 cmn_err(CE_WARN, "No ip_stack_t for zoneid zero!\n"); 5182 return (NULL); 5183 } 5184 5185 ill = ill_lookup_on_ifindex(index, isv6, q, mp, func, err, ipst); 5186 netstack_rele(ipst->ips_netstack); 5187 return (ill); 5188 } 5189 5190 /* 5191 * Return a pointer to the ill which matches the index and IP version type. 5192 */ 5193 ill_t * 5194 ill_lookup_on_ifindex(uint_t index, boolean_t isv6, queue_t *q, mblk_t *mp, 5195 ipsq_func_t func, int *err, ip_stack_t *ipst) 5196 { 5197 ill_t *ill; 5198 ipsq_t *ipsq; 5199 phyint_t *phyi; 5200 5201 ASSERT((q == NULL && mp == NULL && func == NULL && err == NULL) || 5202 (q != NULL && mp != NULL && func != NULL && err != NULL)); 5203 5204 if (err != NULL) 5205 *err = 0; 5206 5207 /* 5208 * Indexes are stored in the phyint - a common structure 5209 * to both IPv4 and IPv6. 5210 */ 5211 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5212 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 5213 (void *) &index, NULL); 5214 if (phyi != NULL) { 5215 ill = isv6 ? 
phyi->phyint_illv6: phyi->phyint_illv4; 5216 if (ill != NULL) { 5217 /* 5218 * The block comment at the start of ipif_down 5219 * explains the use of the macros used below 5220 */ 5221 GRAB_CONN_LOCK(q); 5222 mutex_enter(&ill->ill_lock); 5223 if (ILL_CAN_LOOKUP(ill)) { 5224 ill_refhold_locked(ill); 5225 mutex_exit(&ill->ill_lock); 5226 RELEASE_CONN_LOCK(q); 5227 rw_exit(&ipst->ips_ill_g_lock); 5228 return (ill); 5229 } else if (ILL_CAN_WAIT(ill, q)) { 5230 ipsq = ill->ill_phyint->phyint_ipsq; 5231 mutex_enter(&ipsq->ipsq_lock); 5232 rw_exit(&ipst->ips_ill_g_lock); 5233 mutex_exit(&ill->ill_lock); 5234 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 5235 mutex_exit(&ipsq->ipsq_lock); 5236 RELEASE_CONN_LOCK(q); 5237 *err = EINPROGRESS; 5238 return (NULL); 5239 } 5240 RELEASE_CONN_LOCK(q); 5241 mutex_exit(&ill->ill_lock); 5242 } 5243 } 5244 rw_exit(&ipst->ips_ill_g_lock); 5245 if (err != NULL) 5246 *err = ENXIO; 5247 return (NULL); 5248 } 5249 5250 /* 5251 * Return the ifindex next in sequence after the passed in ifindex. 5252 * If there is no next ifindex for the given protocol, return 0. 
5253 */ 5254 uint_t 5255 ill_get_next_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst) 5256 { 5257 phyint_t *phyi; 5258 phyint_t *phyi_initial; 5259 uint_t ifindex; 5260 5261 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5262 5263 if (index == 0) { 5264 phyi = avl_first( 5265 &ipst->ips_phyint_g_list->phyint_list_avl_by_index); 5266 } else { 5267 phyi = phyi_initial = avl_find( 5268 &ipst->ips_phyint_g_list->phyint_list_avl_by_index, 5269 (void *) &index, NULL); 5270 } 5271 5272 for (; phyi != NULL; 5273 phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 5274 phyi, AVL_AFTER)) { 5275 /* 5276 * If we're not returning the first interface in the tree 5277 * and we still haven't moved past the phyint_t that 5278 * corresponds to index, avl_walk needs to be called again 5279 */ 5280 if (!((index != 0) && (phyi == phyi_initial))) { 5281 if (isv6) { 5282 if ((phyi->phyint_illv6) && 5283 ILL_CAN_LOOKUP(phyi->phyint_illv6) && 5284 (phyi->phyint_illv6->ill_isv6 == 1)) 5285 break; 5286 } else { 5287 if ((phyi->phyint_illv4) && 5288 ILL_CAN_LOOKUP(phyi->phyint_illv4) && 5289 (phyi->phyint_illv4->ill_isv6 == 0)) 5290 break; 5291 } 5292 } 5293 } 5294 5295 rw_exit(&ipst->ips_ill_g_lock); 5296 5297 if (phyi != NULL) 5298 ifindex = phyi->phyint_ifindex; 5299 else 5300 ifindex = 0; 5301 5302 return (ifindex); 5303 } 5304 5305 5306 /* 5307 * Return the ifindex for the named interface. 5308 * If there is no next ifindex for the interface, return 0. 
5309 */ 5310 uint_t 5311 ill_get_ifindex_by_name(char *name, ip_stack_t *ipst) 5312 { 5313 phyint_t *phyi; 5314 avl_index_t where = 0; 5315 uint_t ifindex; 5316 5317 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5318 5319 if ((phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 5320 name, &where)) == NULL) { 5321 rw_exit(&ipst->ips_ill_g_lock); 5322 return (0); 5323 } 5324 5325 ifindex = phyi->phyint_ifindex; 5326 5327 rw_exit(&ipst->ips_ill_g_lock); 5328 5329 return (ifindex); 5330 } 5331 5332 5333 /* 5334 * Obtain a reference to the ill. The ill_refcnt is a dynamic refcnt 5335 * that gives a running thread a reference to the ill. This reference must be 5336 * released by the thread when it is done accessing the ill and related 5337 * objects. ill_refcnt can not be used to account for static references 5338 * such as other structures pointing to an ill. Callers must generally 5339 * check whether an ill can be refheld by using ILL_CAN_LOOKUP macros 5340 * or be sure that the ill is not being deleted or changing state before 5341 * calling the refhold functions. A non-zero ill_refcnt ensures that the 5342 * ill won't change any of its critical state such as address, netmask etc. 5343 */ 5344 void 5345 ill_refhold(ill_t *ill) 5346 { 5347 mutex_enter(&ill->ill_lock); 5348 ill->ill_refcnt++; 5349 ILL_TRACE_REF(ill); 5350 mutex_exit(&ill->ill_lock); 5351 } 5352 5353 void 5354 ill_refhold_locked(ill_t *ill) 5355 { 5356 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5357 ill->ill_refcnt++; 5358 ILL_TRACE_REF(ill); 5359 } 5360 5361 int 5362 ill_check_and_refhold(ill_t *ill) 5363 { 5364 mutex_enter(&ill->ill_lock); 5365 if (ILL_CAN_LOOKUP(ill)) { 5366 ill_refhold_locked(ill); 5367 mutex_exit(&ill->ill_lock); 5368 return (0); 5369 } 5370 mutex_exit(&ill->ill_lock); 5371 return (ILL_LOOKUP_FAILED); 5372 } 5373 5374 /* 5375 * Must not be called while holding any locks. 
Otherwise if this is 5376 * the last reference to be released, there is a chance of recursive mutex 5377 * panic due to ill_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 5378 * to restart an ioctl. 5379 */ 5380 void 5381 ill_refrele(ill_t *ill) 5382 { 5383 mutex_enter(&ill->ill_lock); 5384 ASSERT(ill->ill_refcnt != 0); 5385 ill->ill_refcnt--; 5386 ILL_UNTRACE_REF(ill); 5387 if (ill->ill_refcnt != 0) { 5388 /* Every ire pointing to the ill adds 1 to ill_refcnt */ 5389 mutex_exit(&ill->ill_lock); 5390 return; 5391 } 5392 5393 /* Drops the ill_lock */ 5394 ipif_ill_refrele_tail(ill); 5395 } 5396 5397 /* 5398 * Obtain a weak reference count on the ill. This reference ensures the 5399 * ill won't be freed, but the ill may change any of its critical state 5400 * such as netmask, address etc. Returns an error if the ill has started 5401 * closing. 5402 */ 5403 boolean_t 5404 ill_waiter_inc(ill_t *ill) 5405 { 5406 mutex_enter(&ill->ill_lock); 5407 if (ill->ill_state_flags & ILL_CONDEMNED) { 5408 mutex_exit(&ill->ill_lock); 5409 return (B_FALSE); 5410 } 5411 ill->ill_waiters++; 5412 mutex_exit(&ill->ill_lock); 5413 return (B_TRUE); 5414 } 5415 5416 void 5417 ill_waiter_dcr(ill_t *ill) 5418 { 5419 mutex_enter(&ill->ill_lock); 5420 ill->ill_waiters--; 5421 if (ill->ill_waiters == 0) 5422 cv_broadcast(&ill->ill_cv); 5423 mutex_exit(&ill->ill_lock); 5424 } 5425 5426 /* 5427 * Named Dispatch routine to produce a formatted report on all ILLs. 5428 * This report is accessed by using the ndd utility to "get" ND variable 5429 * "ip_ill_status". 
5430 */ 5431 /* ARGSUSED */ 5432 int 5433 ip_ill_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr) 5434 { 5435 ill_t *ill; 5436 ill_walk_context_t ctx; 5437 ip_stack_t *ipst; 5438 5439 ipst = CONNQ_TO_IPST(q); 5440 5441 (void) mi_mpprintf(mp, 5442 "ILL " MI_COL_HDRPAD_STR 5443 /* 01234567[89ABCDEF] */ 5444 "rq " MI_COL_HDRPAD_STR 5445 /* 01234567[89ABCDEF] */ 5446 "wq " MI_COL_HDRPAD_STR 5447 /* 01234567[89ABCDEF] */ 5448 "upcnt mxfrg err name"); 5449 /* 12345 12345 123 xxxxxxxx */ 5450 5451 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5452 ill = ILL_START_WALK_ALL(&ctx, ipst); 5453 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5454 (void) mi_mpprintf(mp, 5455 MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR 5456 "%05u %05u %03d %s", 5457 (void *)ill, (void *)ill->ill_rq, (void *)ill->ill_wq, 5458 ill->ill_ipif_up_count, 5459 ill->ill_max_frag, ill->ill_error, ill->ill_name); 5460 } 5461 rw_exit(&ipst->ips_ill_g_lock); 5462 5463 return (0); 5464 } 5465 5466 /* 5467 * Named Dispatch routine to produce a formatted report on all IPIFs. 5468 * This report is accessed by using the ndd utility to "get" ND variable 5469 * "ip_ipif_status". 
5470 */ 5471 /* ARGSUSED */ 5472 int 5473 ip_ipif_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr) 5474 { 5475 char buf1[INET6_ADDRSTRLEN]; 5476 char buf2[INET6_ADDRSTRLEN]; 5477 char buf3[INET6_ADDRSTRLEN]; 5478 char buf4[INET6_ADDRSTRLEN]; 5479 char buf5[INET6_ADDRSTRLEN]; 5480 char buf6[INET6_ADDRSTRLEN]; 5481 char buf[LIFNAMSIZ]; 5482 ill_t *ill; 5483 ipif_t *ipif; 5484 nv_t *nvp; 5485 uint64_t flags; 5486 zoneid_t zoneid; 5487 ill_walk_context_t ctx; 5488 ip_stack_t *ipst = CONNQ_TO_IPST(q); 5489 5490 (void) mi_mpprintf(mp, 5491 "IPIF metric mtu in/out/forward name zone flags...\n" 5492 "\tlocal address\n" 5493 "\tsrc address\n" 5494 "\tsubnet\n" 5495 "\tmask\n" 5496 "\tbroadcast\n" 5497 "\tp-p-dst"); 5498 5499 ASSERT(q->q_next == NULL); 5500 zoneid = Q_TO_CONN(q)->conn_zoneid; /* IP is a driver */ 5501 5502 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5503 ill = ILL_START_WALK_ALL(&ctx, ipst); 5504 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5505 for (ipif = ill->ill_ipif; ipif != NULL; 5506 ipif = ipif->ipif_next) { 5507 if (zoneid != GLOBAL_ZONEID && 5508 zoneid != ipif->ipif_zoneid && 5509 ipif->ipif_zoneid != ALL_ZONES) 5510 continue; 5511 5512 ipif_get_name(ipif, buf, sizeof (buf)); 5513 (void) mi_mpprintf(mp, 5514 MI_COL_PTRFMT_STR 5515 "%04u %05u %u/%u/%u %s %d", 5516 (void *)ipif, 5517 ipif->ipif_metric, ipif->ipif_mtu, 5518 ipif->ipif_ib_pkt_count, 5519 ipif->ipif_ob_pkt_count, 5520 ipif->ipif_fo_pkt_count, 5521 buf, 5522 ipif->ipif_zoneid); 5523 5524 flags = ipif->ipif_flags | ipif->ipif_ill->ill_flags | 5525 ipif->ipif_ill->ill_phyint->phyint_flags; 5526 5527 /* Tack on text strings for any flags. 
*/ 5528 nvp = ipif_nv_tbl; 5529 for (; nvp < A_END(ipif_nv_tbl); nvp++) { 5530 if (nvp->nv_value & flags) 5531 (void) mi_mpprintf_nr(mp, " %s", 5532 nvp->nv_name); 5533 } 5534 (void) mi_mpprintf(mp, 5535 "\t%s\n\t%s\n\t%s\n\t%s\n\t%s\n\t%s", 5536 inet_ntop(AF_INET6, 5537 &ipif->ipif_v6lcl_addr, buf1, sizeof (buf1)), 5538 inet_ntop(AF_INET6, 5539 &ipif->ipif_v6src_addr, buf2, sizeof (buf2)), 5540 inet_ntop(AF_INET6, 5541 &ipif->ipif_v6subnet, buf3, sizeof (buf3)), 5542 inet_ntop(AF_INET6, 5543 &ipif->ipif_v6net_mask, buf4, sizeof (buf4)), 5544 inet_ntop(AF_INET6, 5545 &ipif->ipif_v6brd_addr, buf5, sizeof (buf5)), 5546 inet_ntop(AF_INET6, 5547 &ipif->ipif_v6pp_dst_addr, buf6, sizeof (buf6))); 5548 } 5549 } 5550 rw_exit(&ipst->ips_ill_g_lock); 5551 return (0); 5552 } 5553 5554 /* 5555 * ip_ll_subnet_defaults is called when we get the DL_INFO_ACK back from the 5556 * driver. We construct best guess defaults for lower level information that 5557 * we need. If an interface is brought up without injection of any overriding 5558 * information from outside, we have to be ready to go with these defaults. 5559 * When we get the first DL_INFO_ACK (from ip_open() sending a DL_INFO_REQ) 5560 * we primarely want the dl_provider_style. 5561 * The subsequent DL_INFO_ACK is received after doing a DL_ATTACH and DL_BIND 5562 * at which point we assume the other part of the information is valid. 5563 */ 5564 void 5565 ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) 5566 { 5567 uchar_t *brdcst_addr; 5568 uint_t brdcst_addr_length, phys_addr_length; 5569 t_scalar_t sap_length; 5570 dl_info_ack_t *dlia; 5571 ip_m_t *ipm; 5572 dl_qos_cl_sel1_t *sel1; 5573 5574 ASSERT(IAM_WRITER_ILL(ill)); 5575 5576 /* 5577 * Till the ill is fully up ILL_CHANGING will be set and 5578 * the ill is not globally visible. So no need for a lock. 
5579 */ 5580 dlia = (dl_info_ack_t *)mp->b_rptr; 5581 ill->ill_mactype = dlia->dl_mac_type; 5582 5583 ipm = ip_m_lookup(dlia->dl_mac_type); 5584 if (ipm == NULL) { 5585 ipm = ip_m_lookup(DL_OTHER); 5586 ASSERT(ipm != NULL); 5587 } 5588 ill->ill_media = ipm; 5589 5590 /* 5591 * When the new DLPI stuff is ready we'll pull lengths 5592 * from dlia. 5593 */ 5594 if (dlia->dl_version == DL_VERSION_2) { 5595 brdcst_addr_length = dlia->dl_brdcst_addr_length; 5596 brdcst_addr = mi_offset_param(mp, dlia->dl_brdcst_addr_offset, 5597 brdcst_addr_length); 5598 if (brdcst_addr == NULL) { 5599 brdcst_addr_length = 0; 5600 } 5601 sap_length = dlia->dl_sap_length; 5602 phys_addr_length = dlia->dl_addr_length - ABS(sap_length); 5603 ip1dbg(("ip: bcast_len %d, sap_len %d, phys_len %d\n", 5604 brdcst_addr_length, sap_length, phys_addr_length)); 5605 } else { 5606 brdcst_addr_length = 6; 5607 brdcst_addr = ip_six_byte_all_ones; 5608 sap_length = -2; 5609 phys_addr_length = brdcst_addr_length; 5610 } 5611 5612 ill->ill_bcast_addr_length = brdcst_addr_length; 5613 ill->ill_phys_addr_length = phys_addr_length; 5614 ill->ill_sap_length = sap_length; 5615 ill->ill_max_frag = dlia->dl_max_sdu; 5616 ill->ill_max_mtu = ill->ill_max_frag; 5617 5618 ill->ill_type = ipm->ip_m_type; 5619 5620 if (!ill->ill_dlpi_style_set) { 5621 if (dlia->dl_provider_style == DL_STYLE2) 5622 ill->ill_needs_attach = 1; 5623 5624 /* 5625 * Allocate the first ipif on this ill. We don't delay it 5626 * further as ioctl handling assumes atleast one ipif to 5627 * be present. 5628 * 5629 * At this point we don't know whether the ill is v4 or v6. 5630 * We will know this whan the SIOCSLIFNAME happens and 5631 * the correct value for ill_isv6 will be assigned in 5632 * ipif_set_values(). We need to hold the ill lock and 5633 * clear the ILL_LL_SUBNET_PENDING flag and atomically do 5634 * the wakeup. 5635 */ 5636 (void) ipif_allocate(ill, 0, IRE_LOCAL, 5637 dlia->dl_provider_style == DL_STYLE2 ? 
B_FALSE : B_TRUE); 5638 mutex_enter(&ill->ill_lock); 5639 ASSERT(ill->ill_dlpi_style_set == 0); 5640 ill->ill_dlpi_style_set = 1; 5641 ill->ill_state_flags &= ~ILL_LL_SUBNET_PENDING; 5642 cv_broadcast(&ill->ill_cv); 5643 mutex_exit(&ill->ill_lock); 5644 freemsg(mp); 5645 return; 5646 } 5647 ASSERT(ill->ill_ipif != NULL); 5648 /* 5649 * We know whether it is IPv4 or IPv6 now, as this is the 5650 * second DL_INFO_ACK we are recieving in response to the 5651 * DL_INFO_REQ sent in ipif_set_values. 5652 */ 5653 if (ill->ill_isv6) 5654 ill->ill_sap = IP6_DL_SAP; 5655 else 5656 ill->ill_sap = IP_DL_SAP; 5657 /* 5658 * Set ipif_mtu which is used to set the IRE's 5659 * ire_max_frag value. The driver could have sent 5660 * a different mtu from what it sent last time. No 5661 * need to call ipif_mtu_change because IREs have 5662 * not yet been created. 5663 */ 5664 ill->ill_ipif->ipif_mtu = ill->ill_max_mtu; 5665 /* 5666 * Clear all the flags that were set based on ill_bcast_addr_length 5667 * and ill_phys_addr_length (in ipif_set_values) as these could have 5668 * changed now and we need to re-evaluate. 5669 */ 5670 ill->ill_flags &= ~(ILLF_MULTICAST | ILLF_NONUD | ILLF_NOARP); 5671 ill->ill_ipif->ipif_flags &= ~(IPIF_BROADCAST | IPIF_POINTOPOINT); 5672 5673 /* 5674 * Free ill_resolver_mp and ill_bcast_mp as things could have 5675 * changed now. 
5676 */ 5677 if (ill->ill_bcast_addr_length == 0) { 5678 if (ill->ill_resolver_mp != NULL) 5679 freemsg(ill->ill_resolver_mp); 5680 if (ill->ill_bcast_mp != NULL) 5681 freemsg(ill->ill_bcast_mp); 5682 if (ill->ill_flags & ILLF_XRESOLV) 5683 ill->ill_net_type = IRE_IF_RESOLVER; 5684 else 5685 ill->ill_net_type = IRE_IF_NORESOLVER; 5686 ill->ill_resolver_mp = ill_dlur_gen(NULL, 5687 ill->ill_phys_addr_length, 5688 ill->ill_sap, 5689 ill->ill_sap_length); 5690 ill->ill_bcast_mp = copymsg(ill->ill_resolver_mp); 5691 5692 if (ill->ill_isv6) 5693 /* 5694 * Note: xresolv interfaces will eventually need NOARP 5695 * set here as well, but that will require those 5696 * external resolvers to have some knowledge of 5697 * that flag and act appropriately. Not to be changed 5698 * at present. 5699 */ 5700 ill->ill_flags |= ILLF_NONUD; 5701 else 5702 ill->ill_flags |= ILLF_NOARP; 5703 5704 if (ill->ill_phys_addr_length == 0) { 5705 if (ill->ill_media->ip_m_mac_type == SUNW_DL_VNI) { 5706 ill->ill_ipif->ipif_flags |= IPIF_NOXMIT; 5707 ill->ill_phyint->phyint_flags |= PHYI_VIRTUAL; 5708 } else { 5709 /* pt-pt supports multicast. */ 5710 ill->ill_flags |= ILLF_MULTICAST; 5711 ill->ill_ipif->ipif_flags |= IPIF_POINTOPOINT; 5712 } 5713 } 5714 } else { 5715 ill->ill_net_type = IRE_IF_RESOLVER; 5716 if (ill->ill_bcast_mp != NULL) 5717 freemsg(ill->ill_bcast_mp); 5718 ill->ill_bcast_mp = ill_dlur_gen(brdcst_addr, 5719 ill->ill_bcast_addr_length, ill->ill_sap, 5720 ill->ill_sap_length); 5721 /* 5722 * Later detect lack of DLPI driver multicast 5723 * capability by catching DL_ENABMULTI errors in 5724 * ip_rput_dlpi. 5725 */ 5726 ill->ill_flags |= ILLF_MULTICAST; 5727 if (!ill->ill_isv6) 5728 ill->ill_ipif->ipif_flags |= IPIF_BROADCAST; 5729 } 5730 /* By default an interface does not support any CoS marking */ 5731 ill->ill_flags &= ~ILLF_COS_ENABLED; 5732 5733 /* 5734 * If we get QoS information in DL_INFO_ACK, the device supports 5735 * some form of CoS marking, set ILLF_COS_ENABLED. 
5736 */ 5737 sel1 = (dl_qos_cl_sel1_t *)mi_offset_param(mp, dlia->dl_qos_offset, 5738 dlia->dl_qos_length); 5739 if ((sel1 != NULL) && (sel1->dl_qos_type == DL_QOS_CL_SEL1)) { 5740 ill->ill_flags |= ILLF_COS_ENABLED; 5741 } 5742 5743 /* Clear any previous error indication. */ 5744 ill->ill_error = 0; 5745 freemsg(mp); 5746 } 5747 5748 /* 5749 * Perform various checks to verify that an address would make sense as a 5750 * local, remote, or subnet interface address. 5751 */ 5752 static boolean_t 5753 ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask) 5754 { 5755 ipaddr_t net_mask; 5756 5757 /* 5758 * Don't allow all zeroes, all ones or experimental address, but allow 5759 * all ones netmask. 5760 */ 5761 if ((net_mask = ip_net_mask(addr)) == 0) 5762 return (B_FALSE); 5763 /* A given netmask overrides the "guess" netmask */ 5764 if (subnet_mask != 0) 5765 net_mask = subnet_mask; 5766 if ((net_mask != ~(ipaddr_t)0) && ((addr == (addr & net_mask)) || 5767 (addr == (addr | ~net_mask)))) { 5768 return (B_FALSE); 5769 } 5770 if (CLASSD(addr)) 5771 return (B_FALSE); 5772 5773 return (B_TRUE); 5774 } 5775 5776 #define V6_IPIF_LINKLOCAL(p) \ 5777 IN6_IS_ADDR_LINKLOCAL(&(p)->ipif_v6lcl_addr) 5778 5779 /* 5780 * Compare two given ipifs and check if the second one is better than 5781 * the first one using the order of preference (not taking deprecated 5782 * into acount) specified in ipif_lookup_multicast(). 5783 */ 5784 static boolean_t 5785 ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, boolean_t isv6) 5786 { 5787 /* Check the least preferred first. */ 5788 if (IS_LOOPBACK(old_ipif->ipif_ill)) { 5789 /* If both ipifs are the same, use the first one. */ 5790 if (IS_LOOPBACK(new_ipif->ipif_ill)) 5791 return (B_FALSE); 5792 else 5793 return (B_TRUE); 5794 } 5795 5796 /* For IPv6, check for link local address. 
*/ 5797 if (isv6 && V6_IPIF_LINKLOCAL(old_ipif)) { 5798 if (IS_LOOPBACK(new_ipif->ipif_ill) || 5799 V6_IPIF_LINKLOCAL(new_ipif)) { 5800 /* The second one is equal or less preferred. */ 5801 return (B_FALSE); 5802 } else { 5803 return (B_TRUE); 5804 } 5805 } 5806 5807 /* Then check for point to point interface. */ 5808 if (old_ipif->ipif_flags & IPIF_POINTOPOINT) { 5809 if (IS_LOOPBACK(new_ipif->ipif_ill) || 5810 (isv6 && V6_IPIF_LINKLOCAL(new_ipif)) || 5811 (new_ipif->ipif_flags & IPIF_POINTOPOINT)) { 5812 return (B_FALSE); 5813 } else { 5814 return (B_TRUE); 5815 } 5816 } 5817 5818 /* old_ipif is a normal interface, so no need to use the new one. */ 5819 return (B_FALSE); 5820 } 5821 5822 /* 5823 * Find any non-virtual, not condemned, and up multicast capable interface 5824 * given an IP instance and zoneid. Order of preference is: 5825 * 5826 * 1. normal 5827 * 1.1 normal, but deprecated 5828 * 2. point to point 5829 * 2.1 point to point, but deprecated 5830 * 3. link local 5831 * 3.1 link local, but deprecated 5832 * 4. loopback. 
5833 */ 5834 ipif_t * 5835 ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) 5836 { 5837 ill_t *ill; 5838 ill_walk_context_t ctx; 5839 ipif_t *ipif; 5840 ipif_t *saved_ipif = NULL; 5841 ipif_t *dep_ipif = NULL; 5842 5843 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5844 if (isv6) 5845 ill = ILL_START_WALK_V6(&ctx, ipst); 5846 else 5847 ill = ILL_START_WALK_V4(&ctx, ipst); 5848 5849 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5850 mutex_enter(&ill->ill_lock); 5851 if (IS_VNI(ill) || !ILL_CAN_LOOKUP(ill) || 5852 !(ill->ill_flags & ILLF_MULTICAST)) { 5853 mutex_exit(&ill->ill_lock); 5854 continue; 5855 } 5856 for (ipif = ill->ill_ipif; ipif != NULL; 5857 ipif = ipif->ipif_next) { 5858 if (zoneid != ipif->ipif_zoneid && 5859 zoneid != ALL_ZONES && 5860 ipif->ipif_zoneid != ALL_ZONES) { 5861 continue; 5862 } 5863 if (!(ipif->ipif_flags & IPIF_UP) || 5864 !IPIF_CAN_LOOKUP(ipif)) { 5865 continue; 5866 } 5867 5868 /* 5869 * Found one candidate. If it is deprecated, 5870 * remember it in dep_ipif. If it is not deprecated, 5871 * remember it in saved_ipif. 5872 */ 5873 if (ipif->ipif_flags & IPIF_DEPRECATED) { 5874 if (dep_ipif == NULL) { 5875 dep_ipif = ipif; 5876 } else if (ipif_comp_multi(dep_ipif, ipif, 5877 isv6)) { 5878 /* 5879 * If the previous dep_ipif does not 5880 * belong to the same ill, we've done 5881 * a ipif_refhold() on it. So we need 5882 * to release it. 5883 */ 5884 if (dep_ipif->ipif_ill != ill) 5885 ipif_refrele(dep_ipif); 5886 dep_ipif = ipif; 5887 } 5888 continue; 5889 } 5890 if (saved_ipif == NULL) { 5891 saved_ipif = ipif; 5892 } else { 5893 if (ipif_comp_multi(saved_ipif, ipif, isv6)) { 5894 if (saved_ipif->ipif_ill != ill) 5895 ipif_refrele(saved_ipif); 5896 saved_ipif = ipif; 5897 } 5898 } 5899 } 5900 /* 5901 * Before going to the next ill, do a ipif_refhold() on the 5902 * saved ones. 
5903 */ 5904 if (saved_ipif != NULL && saved_ipif->ipif_ill == ill) 5905 ipif_refhold_locked(saved_ipif); 5906 if (dep_ipif != NULL && dep_ipif->ipif_ill == ill) 5907 ipif_refhold_locked(dep_ipif); 5908 mutex_exit(&ill->ill_lock); 5909 } 5910 rw_exit(&ipst->ips_ill_g_lock); 5911 5912 /* 5913 * If we have only the saved_ipif, return it. But if we have both 5914 * saved_ipif and dep_ipif, check to see which one is better. 5915 */ 5916 if (saved_ipif != NULL) { 5917 if (dep_ipif != NULL) { 5918 if (ipif_comp_multi(saved_ipif, dep_ipif, isv6)) { 5919 ipif_refrele(saved_ipif); 5920 return (dep_ipif); 5921 } else { 5922 ipif_refrele(dep_ipif); 5923 return (saved_ipif); 5924 } 5925 } 5926 return (saved_ipif); 5927 } else { 5928 return (dep_ipif); 5929 } 5930 } 5931 5932 /* 5933 * This function is called when an application does not specify an interface 5934 * to be used for multicast traffic (joining a group/sending data). It 5935 * calls ire_lookup_multi() to look for an interface route for the 5936 * specified multicast group. Doing this allows the administrator to add 5937 * prefix routes for multicast to indicate which interface to be used for 5938 * multicast traffic in the above scenario. The route could be for all 5939 * multicast (224.0/4), for a single multicast group (a /32 route) or 5940 * anything in between. If there is no such multicast route, we just find 5941 * any multicast capable interface and return it. The returned ipif 5942 * is refhold'ed. 5943 */ 5944 ipif_t * 5945 ipif_lookup_group(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst) 5946 { 5947 ire_t *ire; 5948 ipif_t *ipif; 5949 5950 ire = ire_lookup_multi(group, zoneid, ipst); 5951 if (ire != NULL) { 5952 ipif = ire->ire_ipif; 5953 ipif_refhold(ipif); 5954 ire_refrele(ire); 5955 return (ipif); 5956 } 5957 5958 return (ipif_lookup_multicast(ipst, zoneid, B_FALSE)); 5959 } 5960 5961 /* 5962 * Look for an ipif with the specified interface address and destination. 
5963 * The destination address is used only for matching point-to-point interfaces. 5964 */ 5965 ipif_t * 5966 ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp, 5967 ipsq_func_t func, int *error, ip_stack_t *ipst) 5968 { 5969 ipif_t *ipif; 5970 ill_t *ill; 5971 ill_walk_context_t ctx; 5972 ipsq_t *ipsq; 5973 5974 if (error != NULL) 5975 *error = 0; 5976 5977 /* 5978 * First match all the point-to-point interfaces 5979 * before looking at non-point-to-point interfaces. 5980 * This is done to avoid returning non-point-to-point 5981 * ipif instead of unnumbered point-to-point ipif. 5982 */ 5983 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5984 ill = ILL_START_WALK_V4(&ctx, ipst); 5985 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5986 GRAB_CONN_LOCK(q); 5987 mutex_enter(&ill->ill_lock); 5988 for (ipif = ill->ill_ipif; ipif != NULL; 5989 ipif = ipif->ipif_next) { 5990 /* Allow the ipif to be down */ 5991 if ((ipif->ipif_flags & IPIF_POINTOPOINT) && 5992 (ipif->ipif_lcl_addr == if_addr) && 5993 (ipif->ipif_pp_dst_addr == dst)) { 5994 /* 5995 * The block comment at the start of ipif_down 5996 * explains the use of the macros used below 5997 */ 5998 if (IPIF_CAN_LOOKUP(ipif)) { 5999 ipif_refhold_locked(ipif); 6000 mutex_exit(&ill->ill_lock); 6001 RELEASE_CONN_LOCK(q); 6002 rw_exit(&ipst->ips_ill_g_lock); 6003 return (ipif); 6004 } else if (IPIF_CAN_WAIT(ipif, q)) { 6005 ipsq = ill->ill_phyint->phyint_ipsq; 6006 mutex_enter(&ipsq->ipsq_lock); 6007 mutex_exit(&ill->ill_lock); 6008 rw_exit(&ipst->ips_ill_g_lock); 6009 ipsq_enq(ipsq, q, mp, func, NEW_OP, 6010 ill); 6011 mutex_exit(&ipsq->ipsq_lock); 6012 RELEASE_CONN_LOCK(q); 6013 *error = EINPROGRESS; 6014 return (NULL); 6015 } 6016 } 6017 } 6018 mutex_exit(&ill->ill_lock); 6019 RELEASE_CONN_LOCK(q); 6020 } 6021 rw_exit(&ipst->ips_ill_g_lock); 6022 6023 /* lookup the ipif based on interface address */ 6024 ipif = ipif_lookup_addr(if_addr, NULL, ALL_ZONES, q, mp, func, error, 6025 ipst); 6026 
ASSERT(ipif == NULL || !ipif->ipif_isv6); 6027 return (ipif); 6028 } 6029 6030 /* 6031 * Look for an ipif with the specified address. For point-point links 6032 * we look for matches on either the destination address and the local 6033 * address, but we ignore the check on the local address if IPIF_UNNUMBERED 6034 * is set. 6035 * Matches on a specific ill if match_ill is set. 6036 */ 6037 ipif_t * 6038 ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q, 6039 mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst) 6040 { 6041 ipif_t *ipif; 6042 ill_t *ill; 6043 boolean_t ptp = B_FALSE; 6044 ipsq_t *ipsq; 6045 ill_walk_context_t ctx; 6046 6047 if (error != NULL) 6048 *error = 0; 6049 6050 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 6051 /* 6052 * Repeat twice, first based on local addresses and 6053 * next time for pointopoint. 6054 */ 6055 repeat: 6056 ill = ILL_START_WALK_V4(&ctx, ipst); 6057 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 6058 if (match_ill != NULL && ill != match_ill) { 6059 continue; 6060 } 6061 GRAB_CONN_LOCK(q); 6062 mutex_enter(&ill->ill_lock); 6063 for (ipif = ill->ill_ipif; ipif != NULL; 6064 ipif = ipif->ipif_next) { 6065 if (zoneid != ALL_ZONES && 6066 zoneid != ipif->ipif_zoneid && 6067 ipif->ipif_zoneid != ALL_ZONES) 6068 continue; 6069 /* Allow the ipif to be down */ 6070 if ((!ptp && (ipif->ipif_lcl_addr == addr) && 6071 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || 6072 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) && 6073 (ipif->ipif_pp_dst_addr == addr))) { 6074 /* 6075 * The block comment at the start of ipif_down 6076 * explains the use of the macros used below 6077 */ 6078 if (IPIF_CAN_LOOKUP(ipif)) { 6079 ipif_refhold_locked(ipif); 6080 mutex_exit(&ill->ill_lock); 6081 RELEASE_CONN_LOCK(q); 6082 rw_exit(&ipst->ips_ill_g_lock); 6083 return (ipif); 6084 } else if (IPIF_CAN_WAIT(ipif, q)) { 6085 ipsq = ill->ill_phyint->phyint_ipsq; 6086 mutex_enter(&ipsq->ipsq_lock); 6087 
mutex_exit(&ill->ill_lock); 6088 rw_exit(&ipst->ips_ill_g_lock); 6089 ipsq_enq(ipsq, q, mp, func, NEW_OP, 6090 ill); 6091 mutex_exit(&ipsq->ipsq_lock); 6092 RELEASE_CONN_LOCK(q); 6093 *error = EINPROGRESS; 6094 return (NULL); 6095 } 6096 } 6097 } 6098 mutex_exit(&ill->ill_lock); 6099 RELEASE_CONN_LOCK(q); 6100 } 6101 6102 /* If we already did the ptp case, then we are done */ 6103 if (ptp) { 6104 rw_exit(&ipst->ips_ill_g_lock); 6105 if (error != NULL) 6106 *error = ENXIO; 6107 return (NULL); 6108 } 6109 ptp = B_TRUE; 6110 goto repeat; 6111 } 6112 6113 /* 6114 * Look for an ipif with the specified address. For point-point links 6115 * we look for matches on either the destination address and the local 6116 * address, but we ignore the check on the local address if IPIF_UNNUMBERED 6117 * is set. 6118 * Matches on a specific ill if match_ill is set. 6119 * Return the zoneid for the ipif which matches. ALL_ZONES if no match. 6120 */ 6121 zoneid_t 6122 ipif_lookup_addr_zoneid(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst) 6123 { 6124 zoneid_t zoneid; 6125 ipif_t *ipif; 6126 ill_t *ill; 6127 boolean_t ptp = B_FALSE; 6128 ill_walk_context_t ctx; 6129 6130 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 6131 /* 6132 * Repeat twice, first based on local addresses and 6133 * next time for pointopoint. 
6134 */ 6135 repeat: 6136 ill = ILL_START_WALK_V4(&ctx, ipst); 6137 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 6138 if (match_ill != NULL && ill != match_ill) { 6139 continue; 6140 } 6141 mutex_enter(&ill->ill_lock); 6142 for (ipif = ill->ill_ipif; ipif != NULL; 6143 ipif = ipif->ipif_next) { 6144 /* Allow the ipif to be down */ 6145 if ((!ptp && (ipif->ipif_lcl_addr == addr) && 6146 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || 6147 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) && 6148 (ipif->ipif_pp_dst_addr == addr)) && 6149 !(ipif->ipif_state_flags & IPIF_CONDEMNED)) { 6150 zoneid = ipif->ipif_zoneid; 6151 mutex_exit(&ill->ill_lock); 6152 rw_exit(&ipst->ips_ill_g_lock); 6153 /* 6154 * If ipif_zoneid was ALL_ZONES then we have 6155 * a trusted extensions shared IP address. 6156 * In that case GLOBAL_ZONEID works to send. 6157 */ 6158 if (zoneid == ALL_ZONES) 6159 zoneid = GLOBAL_ZONEID; 6160 return (zoneid); 6161 } 6162 } 6163 mutex_exit(&ill->ill_lock); 6164 } 6165 6166 /* If we already did the ptp case, then we are done */ 6167 if (ptp) { 6168 rw_exit(&ipst->ips_ill_g_lock); 6169 return (ALL_ZONES); 6170 } 6171 ptp = B_TRUE; 6172 goto repeat; 6173 } 6174 6175 /* 6176 * Look for an ipif that matches the specified remote address i.e. the 6177 * ipif that would receive the specified packet. 6178 * First look for directly connected interfaces and then do a recursive 6179 * IRE lookup and pick the first ipif corresponding to the source address in the 6180 * ire. 6181 * Returns: held ipif 6182 */ 6183 ipif_t * 6184 ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid) 6185 { 6186 ipif_t *ipif; 6187 ire_t *ire; 6188 ip_stack_t *ipst = ill->ill_ipst; 6189 6190 ASSERT(!ill->ill_isv6); 6191 6192 /* 6193 * Someone could be changing this ipif currently or change it 6194 * after we return this. Thus a few packets could use the old 6195 * old values. 
However structure updates/creates (ire, ilg, ilm etc) 6196 * will atomically be updated or cleaned up with the new value 6197 * Thus we don't need a lock to check the flags or other attrs below. 6198 */ 6199 mutex_enter(&ill->ill_lock); 6200 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 6201 if (!IPIF_CAN_LOOKUP(ipif)) 6202 continue; 6203 if (zoneid != ALL_ZONES && zoneid != ipif->ipif_zoneid && 6204 ipif->ipif_zoneid != ALL_ZONES) 6205 continue; 6206 /* Allow the ipif to be down */ 6207 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 6208 if ((ipif->ipif_pp_dst_addr == addr) || 6209 (!(ipif->ipif_flags & IPIF_UNNUMBERED) && 6210 ipif->ipif_lcl_addr == addr)) { 6211 ipif_refhold_locked(ipif); 6212 mutex_exit(&ill->ill_lock); 6213 return (ipif); 6214 } 6215 } else if (ipif->ipif_subnet == (addr & ipif->ipif_net_mask)) { 6216 ipif_refhold_locked(ipif); 6217 mutex_exit(&ill->ill_lock); 6218 return (ipif); 6219 } 6220 } 6221 mutex_exit(&ill->ill_lock); 6222 ire = ire_route_lookup(addr, 0, 0, 0, NULL, NULL, zoneid, 6223 NULL, MATCH_IRE_RECURSIVE, ipst); 6224 if (ire != NULL) { 6225 /* 6226 * The callers of this function wants to know the 6227 * interface on which they have to send the replies 6228 * back. For IRE_CACHES that have ire_stq and ire_ipif 6229 * derived from different ills, we really don't care 6230 * what we return here. 6231 */ 6232 ipif = ire->ire_ipif; 6233 if (ipif != NULL) { 6234 ipif_refhold(ipif); 6235 ire_refrele(ire); 6236 return (ipif); 6237 } 6238 ire_refrele(ire); 6239 } 6240 /* Pick the first interface */ 6241 ipif = ipif_get_next_ipif(NULL, ill); 6242 return (ipif); 6243 } 6244 6245 /* 6246 * This func does not prevent refcnt from increasing. 
But if 6247 * the caller has taken steps to that effect, then this func 6248 * can be used to determine whether the ill has become quiescent 6249 */ 6250 boolean_t 6251 ill_is_quiescent(ill_t *ill) 6252 { 6253 ipif_t *ipif; 6254 6255 ASSERT(MUTEX_HELD(&ill->ill_lock)); 6256 6257 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 6258 if (ipif->ipif_refcnt != 0 || ipif->ipif_ire_cnt != 0) { 6259 return (B_FALSE); 6260 } 6261 } 6262 if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0 || 6263 ill->ill_nce_cnt != 0) { 6264 return (B_FALSE); 6265 } 6266 return (B_TRUE); 6267 } 6268 6269 /* 6270 * This func does not prevent refcnt from increasing. But if 6271 * the caller has taken steps to that effect, then this func 6272 * can be used to determine whether the ipif has become quiescent 6273 */ 6274 static boolean_t 6275 ipif_is_quiescent(ipif_t *ipif) 6276 { 6277 ill_t *ill; 6278 6279 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6280 6281 if (ipif->ipif_refcnt != 0 || ipif->ipif_ire_cnt != 0) { 6282 return (B_FALSE); 6283 } 6284 6285 ill = ipif->ipif_ill; 6286 if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 || 6287 ill->ill_logical_down) { 6288 return (B_TRUE); 6289 } 6290 6291 /* This is the last ipif going down or being deleted on this ill */ 6292 if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0) { 6293 return (B_FALSE); 6294 } 6295 6296 return (B_TRUE); 6297 } 6298 6299 /* 6300 * This func does not prevent refcnt from increasing. But if 6301 * the caller has taken steps to that effect, then this func 6302 * can be used to determine whether the ipifs marked with IPIF_MOVING 6303 * have become quiescent and can be moved in a failover/failback. 
6304 */ 6305 static ipif_t * 6306 ill_quiescent_to_move(ill_t *ill) 6307 { 6308 ipif_t *ipif; 6309 6310 ASSERT(MUTEX_HELD(&ill->ill_lock)); 6311 6312 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 6313 if (ipif->ipif_state_flags & IPIF_MOVING) { 6314 if (ipif->ipif_refcnt != 0 || ipif->ipif_ire_cnt != 0) { 6315 return (ipif); 6316 } 6317 } 6318 } 6319 return (NULL); 6320 } 6321 6322 /* 6323 * The ipif/ill/ire has been refreled. Do the tail processing. 6324 * Determine if the ipif or ill in question has become quiescent and if so 6325 * wakeup close and/or restart any queued pending ioctl that is waiting 6326 * for the ipif_down (or ill_down) 6327 */ 6328 void 6329 ipif_ill_refrele_tail(ill_t *ill) 6330 { 6331 mblk_t *mp; 6332 conn_t *connp; 6333 ipsq_t *ipsq; 6334 ipif_t *ipif; 6335 dl_notify_ind_t *dlindp; 6336 6337 ASSERT(MUTEX_HELD(&ill->ill_lock)); 6338 6339 if ((ill->ill_state_flags & ILL_CONDEMNED) && 6340 ill_is_quiescent(ill)) { 6341 /* ill_close may be waiting */ 6342 cv_broadcast(&ill->ill_cv); 6343 } 6344 6345 /* ipsq can't change because ill_lock is held */ 6346 ipsq = ill->ill_phyint->phyint_ipsq; 6347 if (ipsq->ipsq_waitfor == 0) { 6348 /* Not waiting for anything, just return. */ 6349 mutex_exit(&ill->ill_lock); 6350 return; 6351 } 6352 ASSERT(ipsq->ipsq_pending_mp != NULL && 6353 ipsq->ipsq_pending_ipif != NULL); 6354 /* 6355 * ipif->ipif_refcnt must go down to zero for restarting REMOVEIF. 6356 * Last ipif going down needs to down the ill, so ill_ire_cnt must 6357 * be zero for restarting an ioctl that ends up downing the ill. 6358 */ 6359 ipif = ipsq->ipsq_pending_ipif; 6360 if (ipif->ipif_ill != ill) { 6361 /* The ioctl is pending on some other ill. 
*/ 6362 mutex_exit(&ill->ill_lock); 6363 return; 6364 } 6365 6366 switch (ipsq->ipsq_waitfor) { 6367 case IPIF_DOWN: 6368 case IPIF_FREE: 6369 if (!ipif_is_quiescent(ipif)) { 6370 mutex_exit(&ill->ill_lock); 6371 return; 6372 } 6373 break; 6374 6375 case ILL_DOWN: 6376 case ILL_FREE: 6377 /* 6378 * case ILL_FREE arises only for loopback. otherwise ill_delete 6379 * waits synchronously in ip_close, and no message is queued in 6380 * ipsq_pending_mp at all in this case 6381 */ 6382 if (!ill_is_quiescent(ill)) { 6383 mutex_exit(&ill->ill_lock); 6384 return; 6385 } 6386 6387 break; 6388 6389 case ILL_MOVE_OK: 6390 if (ill_quiescent_to_move(ill) != NULL) { 6391 mutex_exit(&ill->ill_lock); 6392 return; 6393 } 6394 6395 break; 6396 default: 6397 cmn_err(CE_PANIC, "ipsq: %p unknown ipsq_waitfor %d\n", 6398 (void *)ipsq, ipsq->ipsq_waitfor); 6399 } 6400 6401 /* 6402 * Incr refcnt for the qwriter_ip call below which 6403 * does a refrele 6404 */ 6405 ill_refhold_locked(ill); 6406 mutex_exit(&ill->ill_lock); 6407 6408 mp = ipsq_pending_mp_get(ipsq, &connp); 6409 ASSERT(mp != NULL); 6410 6411 /* 6412 * NOTE: all of the qwriter_ip() calls below use CUR_OP since 6413 * we can only get here when the current operation decides it 6414 * it needs to quiesce via ipsq_pending_mp_add(). 6415 */ 6416 switch (mp->b_datap->db_type) { 6417 case M_PCPROTO: 6418 case M_PROTO: 6419 /* 6420 * For now, only DL_NOTIFY_IND messages can use this facility. 6421 */ 6422 dlindp = (dl_notify_ind_t *)mp->b_rptr; 6423 ASSERT(dlindp->dl_primitive == DL_NOTIFY_IND); 6424 6425 switch (dlindp->dl_notification) { 6426 case DL_NOTE_PHYS_ADDR: 6427 qwriter_ip(ill, ill->ill_rq, mp, 6428 ill_set_phys_addr_tail, CUR_OP, B_TRUE); 6429 return; 6430 default: 6431 ASSERT(0); 6432 } 6433 break; 6434 6435 case M_ERROR: 6436 case M_HANGUP: 6437 qwriter_ip(ill, ill->ill_rq, mp, ipif_all_down_tail, CUR_OP, 6438 B_TRUE); 6439 return; 6440 6441 case M_IOCTL: 6442 case M_IOCDATA: 6443 qwriter_ip(ill, (connp != NULL ? 
CONNP_TO_WQ(connp) : 6444 ill->ill_wq), mp, ip_reprocess_ioctl, CUR_OP, B_TRUE); 6445 return; 6446 6447 default: 6448 cmn_err(CE_PANIC, "ipif_ill_refrele_tail mp %p " 6449 "db_type %d\n", (void *)mp, mp->b_datap->db_type); 6450 } 6451 } 6452 6453 #ifdef DEBUG 6454 /* Reuse trace buffer from beginning (if reached the end) and record trace */ 6455 static void 6456 th_trace_rrecord(th_trace_t *th_trace) 6457 { 6458 tr_buf_t *tr_buf; 6459 uint_t lastref; 6460 6461 lastref = th_trace->th_trace_lastref; 6462 lastref++; 6463 if (lastref == TR_BUF_MAX) 6464 lastref = 0; 6465 th_trace->th_trace_lastref = lastref; 6466 tr_buf = &th_trace->th_trbuf[lastref]; 6467 tr_buf->tr_time = lbolt; 6468 tr_buf->tr_depth = getpcstack(tr_buf->tr_stack, TR_STACK_DEPTH); 6469 } 6470 6471 static void 6472 th_trace_free(void *value) 6473 { 6474 th_trace_t *th_trace = value; 6475 6476 ASSERT(th_trace->th_refcnt == 0); 6477 kmem_free(th_trace, sizeof (*th_trace)); 6478 } 6479 6480 /* 6481 * Find or create the per-thread hash table used to track object references. 6482 * The ipst argument is NULL if we shouldn't allocate. 6483 * 6484 * Accesses per-thread data, so there's no need to lock here. 6485 */ 6486 static mod_hash_t * 6487 th_trace_gethash(ip_stack_t *ipst) 6488 { 6489 th_hash_t *thh; 6490 6491 if ((thh = tsd_get(ip_thread_data)) == NULL && ipst != NULL) { 6492 mod_hash_t *mh; 6493 char name[256]; 6494 size_t objsize, rshift; 6495 int retv; 6496 6497 if ((thh = kmem_alloc(sizeof (*thh), KM_NOSLEEP)) == NULL) 6498 return (NULL); 6499 (void) snprintf(name, sizeof (name), "th_trace_%p", curthread); 6500 6501 /* 6502 * We use mod_hash_create_extended here rather than the more 6503 * obvious mod_hash_create_ptrhash because the latter has a 6504 * hard-coded KM_SLEEP, and we'd prefer to fail rather than 6505 * block. 
6506 */ 6507 objsize = MAX(MAX(sizeof (ill_t), sizeof (ipif_t)), 6508 MAX(sizeof (ire_t), sizeof (nce_t))); 6509 rshift = highbit(objsize); 6510 mh = mod_hash_create_extended(name, 64, mod_hash_null_keydtor, 6511 th_trace_free, mod_hash_byptr, (void *)rshift, 6512 mod_hash_ptrkey_cmp, KM_NOSLEEP); 6513 if (mh == NULL) { 6514 kmem_free(thh, sizeof (*thh)); 6515 return (NULL); 6516 } 6517 thh->thh_hash = mh; 6518 thh->thh_ipst = ipst; 6519 /* 6520 * We trace ills, ipifs, ires, and nces. All of these are 6521 * per-IP-stack, so the lock on the thread list is as well. 6522 */ 6523 rw_enter(&ip_thread_rwlock, RW_WRITER); 6524 list_insert_tail(&ip_thread_list, thh); 6525 rw_exit(&ip_thread_rwlock); 6526 retv = tsd_set(ip_thread_data, thh); 6527 ASSERT(retv == 0); 6528 } 6529 return (thh != NULL ? thh->thh_hash : NULL); 6530 } 6531 6532 boolean_t 6533 th_trace_ref(const void *obj, ip_stack_t *ipst) 6534 { 6535 th_trace_t *th_trace; 6536 mod_hash_t *mh; 6537 mod_hash_val_t val; 6538 6539 if ((mh = th_trace_gethash(ipst)) == NULL) 6540 return (B_FALSE); 6541 6542 /* 6543 * Attempt to locate the trace buffer for this obj and thread. 6544 * If it does not exist, then allocate a new trace buffer and 6545 * insert into the hash. 
6546 */ 6547 if (mod_hash_find(mh, (mod_hash_key_t)obj, &val) == MH_ERR_NOTFOUND) { 6548 th_trace = kmem_zalloc(sizeof (th_trace_t), KM_NOSLEEP); 6549 if (th_trace == NULL) 6550 return (B_FALSE); 6551 6552 th_trace->th_id = curthread; 6553 if (mod_hash_insert(mh, (mod_hash_key_t)obj, 6554 (mod_hash_val_t)th_trace) != 0) { 6555 kmem_free(th_trace, sizeof (th_trace_t)); 6556 return (B_FALSE); 6557 } 6558 } else { 6559 th_trace = (th_trace_t *)val; 6560 } 6561 6562 ASSERT(th_trace->th_refcnt >= 0 && 6563 th_trace->th_refcnt < TR_BUF_MAX - 1); 6564 6565 th_trace->th_refcnt++; 6566 th_trace_rrecord(th_trace); 6567 return (B_TRUE); 6568 } 6569 6570 /* 6571 * For the purpose of tracing a reference release, we assume that global 6572 * tracing is always on and that the same thread initiated the reference hold 6573 * is releasing. 6574 */ 6575 void 6576 th_trace_unref(const void *obj) 6577 { 6578 int retv; 6579 mod_hash_t *mh; 6580 th_trace_t *th_trace; 6581 mod_hash_val_t val; 6582 6583 mh = th_trace_gethash(NULL); 6584 retv = mod_hash_find(mh, (mod_hash_key_t)obj, &val); 6585 ASSERT(retv == 0); 6586 th_trace = (th_trace_t *)val; 6587 6588 ASSERT(th_trace->th_refcnt > 0); 6589 th_trace->th_refcnt--; 6590 th_trace_rrecord(th_trace); 6591 } 6592 6593 /* 6594 * If tracing has been disabled, then we assume that the reference counts are 6595 * now useless, and we clear them out before destroying the entries. 
6596 */ 6597 void 6598 th_trace_cleanup(const void *obj, boolean_t trace_disable) 6599 { 6600 th_hash_t *thh; 6601 mod_hash_t *mh; 6602 mod_hash_val_t val; 6603 th_trace_t *th_trace; 6604 int retv; 6605 6606 rw_enter(&ip_thread_rwlock, RW_READER); 6607 for (thh = list_head(&ip_thread_list); thh != NULL; 6608 thh = list_next(&ip_thread_list, thh)) { 6609 if (mod_hash_find(mh = thh->thh_hash, (mod_hash_key_t)obj, 6610 &val) == 0) { 6611 th_trace = (th_trace_t *)val; 6612 if (trace_disable) 6613 th_trace->th_refcnt = 0; 6614 retv = mod_hash_destroy(mh, (mod_hash_key_t)obj); 6615 ASSERT(retv == 0); 6616 } 6617 } 6618 rw_exit(&ip_thread_rwlock); 6619 } 6620 6621 void 6622 ipif_trace_ref(ipif_t *ipif) 6623 { 6624 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6625 6626 if (ipif->ipif_trace_disable) 6627 return; 6628 6629 if (!th_trace_ref(ipif, ipif->ipif_ill->ill_ipst)) { 6630 ipif->ipif_trace_disable = B_TRUE; 6631 ipif_trace_cleanup(ipif); 6632 } 6633 } 6634 6635 void 6636 ipif_untrace_ref(ipif_t *ipif) 6637 { 6638 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6639 6640 if (!ipif->ipif_trace_disable) 6641 th_trace_unref(ipif); 6642 } 6643 6644 void 6645 ill_trace_ref(ill_t *ill) 6646 { 6647 ASSERT(MUTEX_HELD(&ill->ill_lock)); 6648 6649 if (ill->ill_trace_disable) 6650 return; 6651 6652 if (!th_trace_ref(ill, ill->ill_ipst)) { 6653 ill->ill_trace_disable = B_TRUE; 6654 ill_trace_cleanup(ill); 6655 } 6656 } 6657 6658 void 6659 ill_untrace_ref(ill_t *ill) 6660 { 6661 ASSERT(MUTEX_HELD(&ill->ill_lock)); 6662 6663 if (!ill->ill_trace_disable) 6664 th_trace_unref(ill); 6665 } 6666 6667 /* 6668 * Called when ipif is unplumbed or when memory alloc fails. Note that on 6669 * failure, ipif_trace_disable is set. 6670 */ 6671 static void 6672 ipif_trace_cleanup(const ipif_t *ipif) 6673 { 6674 th_trace_cleanup(ipif, ipif->ipif_trace_disable); 6675 } 6676 6677 /* 6678 * Called when ill is unplumbed or when memory alloc fails. Note that on 6679 * failure, ill_trace_disable is set. 
6680 */ 6681 static void 6682 ill_trace_cleanup(const ill_t *ill) 6683 { 6684 th_trace_cleanup(ill, ill->ill_trace_disable); 6685 } 6686 #endif /* DEBUG */ 6687 6688 void 6689 ipif_refhold_locked(ipif_t *ipif) 6690 { 6691 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6692 ipif->ipif_refcnt++; 6693 IPIF_TRACE_REF(ipif); 6694 } 6695 6696 void 6697 ipif_refhold(ipif_t *ipif) 6698 { 6699 ill_t *ill; 6700 6701 ill = ipif->ipif_ill; 6702 mutex_enter(&ill->ill_lock); 6703 ipif->ipif_refcnt++; 6704 IPIF_TRACE_REF(ipif); 6705 mutex_exit(&ill->ill_lock); 6706 } 6707 6708 /* 6709 * Must not be called while holding any locks. Otherwise if this is 6710 * the last reference to be released there is a chance of recursive mutex 6711 * panic due to ipif_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 6712 * to restart an ioctl. 6713 */ 6714 void 6715 ipif_refrele(ipif_t *ipif) 6716 { 6717 ill_t *ill; 6718 6719 ill = ipif->ipif_ill; 6720 6721 mutex_enter(&ill->ill_lock); 6722 ASSERT(ipif->ipif_refcnt != 0); 6723 ipif->ipif_refcnt--; 6724 IPIF_UNTRACE_REF(ipif); 6725 if (ipif->ipif_refcnt != 0) { 6726 mutex_exit(&ill->ill_lock); 6727 return; 6728 } 6729 6730 /* Drops the ill_lock */ 6731 ipif_ill_refrele_tail(ill); 6732 } 6733 6734 ipif_t * 6735 ipif_get_next_ipif(ipif_t *curr, ill_t *ill) 6736 { 6737 ipif_t *ipif; 6738 6739 mutex_enter(&ill->ill_lock); 6740 for (ipif = (curr == NULL ? 
ill->ill_ipif : curr->ipif_next); 6741 ipif != NULL; ipif = ipif->ipif_next) { 6742 if (!IPIF_CAN_LOOKUP(ipif)) 6743 continue; 6744 ipif_refhold_locked(ipif); 6745 mutex_exit(&ill->ill_lock); 6746 return (ipif); 6747 } 6748 mutex_exit(&ill->ill_lock); 6749 return (NULL); 6750 } 6751 6752 /* 6753 * TODO: make this table extendible at run time 6754 * Return a pointer to the mac type info for 'mac_type' 6755 */ 6756 static ip_m_t * 6757 ip_m_lookup(t_uscalar_t mac_type) 6758 { 6759 ip_m_t *ipm; 6760 6761 for (ipm = ip_m_tbl; ipm < A_END(ip_m_tbl); ipm++) 6762 if (ipm->ip_m_mac_type == mac_type) 6763 return (ipm); 6764 return (NULL); 6765 } 6766 6767 /* 6768 * ip_rt_add is called to add an IPv4 route to the forwarding table. 6769 * ipif_arg is passed in to associate it with the correct interface. 6770 * We may need to restart this operation if the ipif cannot be looked up 6771 * due to an exclusive operation that is currently in progress. The restart 6772 * entry point is specified by 'func' 6773 */ 6774 int 6775 ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, 6776 ipaddr_t src_addr, int flags, ipif_t *ipif_arg, ire_t **ire_arg, 6777 boolean_t ioctl_msg, queue_t *q, mblk_t *mp, ipsq_func_t func, 6778 struct rtsa_s *sp, ip_stack_t *ipst) 6779 { 6780 ire_t *ire; 6781 ire_t *gw_ire = NULL; 6782 ipif_t *ipif = NULL; 6783 boolean_t ipif_refheld = B_FALSE; 6784 uint_t type; 6785 int match_flags = MATCH_IRE_TYPE; 6786 int error; 6787 tsol_gc_t *gc = NULL; 6788 tsol_gcgrp_t *gcgrp = NULL; 6789 boolean_t gcgrp_xtraref = B_FALSE; 6790 6791 ip1dbg(("ip_rt_add:")); 6792 6793 if (ire_arg != NULL) 6794 *ire_arg = NULL; 6795 6796 /* 6797 * If this is the case of RTF_HOST being set, then we set the netmask 6798 * to all ones (regardless if one was supplied). 
6799 */ 6800 if (flags & RTF_HOST) 6801 mask = IP_HOST_MASK; 6802 6803 /* 6804 * Prevent routes with a zero gateway from being created (since 6805 * interfaces can currently be plumbed and brought up no assigned 6806 * address). 6807 */ 6808 if (gw_addr == 0) 6809 return (ENETUNREACH); 6810 /* 6811 * Get the ipif, if any, corresponding to the gw_addr 6812 */ 6813 ipif = ipif_lookup_interface(gw_addr, dst_addr, q, mp, func, &error, 6814 ipst); 6815 if (ipif != NULL) { 6816 if (IS_VNI(ipif->ipif_ill)) { 6817 ipif_refrele(ipif); 6818 return (EINVAL); 6819 } 6820 ipif_refheld = B_TRUE; 6821 } else if (error == EINPROGRESS) { 6822 ip1dbg(("ip_rt_add: null and EINPROGRESS")); 6823 return (EINPROGRESS); 6824 } else { 6825 error = 0; 6826 } 6827 6828 if (ipif != NULL) { 6829 ip1dbg(("ip_rt_add: ipif_lookup_interface done ipif nonnull")); 6830 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6831 } else { 6832 ip1dbg(("ip_rt_add: ipif_lookup_interface done ipif is null")); 6833 } 6834 6835 /* 6836 * GateD will attempt to create routes with a loopback interface 6837 * address as the gateway and with RTF_GATEWAY set. We allow 6838 * these routes to be added, but create them as interface routes 6839 * since the gateway is an interface address. 
6840 */ 6841 if ((ipif != NULL) && (ipif->ipif_ire_type == IRE_LOOPBACK)) { 6842 flags &= ~RTF_GATEWAY; 6843 if (gw_addr == INADDR_LOOPBACK && dst_addr == INADDR_LOOPBACK && 6844 mask == IP_HOST_MASK) { 6845 ire = ire_ctable_lookup(dst_addr, 0, IRE_LOOPBACK, ipif, 6846 ALL_ZONES, NULL, match_flags, ipst); 6847 if (ire != NULL) { 6848 ire_refrele(ire); 6849 if (ipif_refheld) 6850 ipif_refrele(ipif); 6851 return (EEXIST); 6852 } 6853 ip1dbg(("ipif_up_done: 0x%p creating IRE 0x%x" 6854 "for 0x%x\n", (void *)ipif, 6855 ipif->ipif_ire_type, 6856 ntohl(ipif->ipif_lcl_addr))); 6857 ire = ire_create( 6858 (uchar_t *)&dst_addr, /* dest address */ 6859 (uchar_t *)&mask, /* mask */ 6860 (uchar_t *)&ipif->ipif_src_addr, 6861 NULL, /* no gateway */ 6862 &ipif->ipif_mtu, 6863 NULL, 6864 ipif->ipif_rq, /* recv-from queue */ 6865 NULL, /* no send-to queue */ 6866 ipif->ipif_ire_type, /* LOOPBACK */ 6867 ipif, 6868 0, 6869 0, 6870 0, 6871 (ipif->ipif_flags & IPIF_PRIVATE) ? 6872 RTF_PRIVATE : 0, 6873 &ire_uinfo_null, 6874 NULL, 6875 NULL, 6876 ipst); 6877 6878 if (ire == NULL) { 6879 if (ipif_refheld) 6880 ipif_refrele(ipif); 6881 return (ENOMEM); 6882 } 6883 error = ire_add(&ire, q, mp, func, B_FALSE); 6884 if (error == 0) 6885 goto save_ire; 6886 if (ipif_refheld) 6887 ipif_refrele(ipif); 6888 return (error); 6889 6890 } 6891 } 6892 6893 /* 6894 * Traditionally, interface routes are ones where RTF_GATEWAY isn't set 6895 * and the gateway address provided is one of the system's interface 6896 * addresses. By using the routing socket interface and supplying an 6897 * RTA_IFP sockaddr with an interface index, an alternate method of 6898 * specifying an interface route to be created is available which uses 6899 * the interface index that specifies the outgoing interface rather than 6900 * the address of an outgoing interface (which may not be able to 6901 * uniquely identify an interface). 
When coupled with the RTF_GATEWAY 6902 * flag, routes can be specified which not only specify the next-hop to 6903 * be used when routing to a certain prefix, but also which outgoing 6904 * interface should be used. 6905 * 6906 * Previously, interfaces would have unique addresses assigned to them 6907 * and so the address assigned to a particular interface could be used 6908 * to identify a particular interface. One exception to this was the 6909 * case of an unnumbered interface (where IPIF_UNNUMBERED was set). 6910 * 6911 * With the advent of IPv6 and its link-local addresses, this 6912 * restriction was relaxed and interfaces could share addresses between 6913 * themselves. In fact, typically all of the link-local interfaces on 6914 * an IPv6 node or router will have the same link-local address. In 6915 * order to differentiate between these interfaces, the use of an 6916 * interface index is necessary and this index can be carried inside a 6917 * RTA_IFP sockaddr (which is actually a sockaddr_dl). One restriction 6918 * of using the interface index, however, is that all of the ipif's that 6919 * are part of an ill have the same index and so the RTA_IFP sockaddr 6920 * cannot be used to differentiate between ipif's (or logical 6921 * interfaces) that belong to the same ill (physical interface). 6922 * 6923 * For example, in the following case involving IPv4 interfaces and 6924 * logical interfaces 6925 * 6926 * 192.0.2.32 255.255.255.224 192.0.2.33 U if0 6927 * 192.0.2.32 255.255.255.224 192.0.2.34 U if0:1 6928 * 192.0.2.32 255.255.255.224 192.0.2.35 U if0:2 6929 * 6930 * the ipif's corresponding to each of these interface routes can be 6931 * uniquely identified by the "gateway" (actually interface address). 
6932 * 6933 * In this case involving multiple IPv6 default routes to a particular 6934 * link-local gateway, the use of RTA_IFP is necessary to specify which 6935 * default route is of interest: 6936 * 6937 * default fe80::123:4567:89ab:cdef U if0 6938 * default fe80::123:4567:89ab:cdef U if1 6939 */ 6940 6941 /* RTF_GATEWAY not set */ 6942 if (!(flags & RTF_GATEWAY)) { 6943 queue_t *stq; 6944 6945 if (sp != NULL) { 6946 ip2dbg(("ip_rt_add: gateway security attributes " 6947 "cannot be set with interface route\n")); 6948 if (ipif_refheld) 6949 ipif_refrele(ipif); 6950 return (EINVAL); 6951 } 6952 6953 /* 6954 * As the interface index specified with the RTA_IFP sockaddr is 6955 * the same for all ipif's off of an ill, the matching logic 6956 * below uses MATCH_IRE_ILL if such an index was specified. 6957 * This means that routes sharing the same prefix when added 6958 * using a RTA_IFP sockaddr must have distinct interface 6959 * indices (namely, they must be on distinct ill's). 6960 * 6961 * On the other hand, since the gateway address will usually be 6962 * different for each ipif on the system, the matching logic 6963 * uses MATCH_IRE_IPIF in the case of a traditional interface 6964 * route. This means that interface routes for the same prefix 6965 * can be created if they belong to distinct ipif's and if a 6966 * RTA_IFP sockaddr is not present. 6967 */ 6968 if (ipif_arg != NULL) { 6969 if (ipif_refheld) { 6970 ipif_refrele(ipif); 6971 ipif_refheld = B_FALSE; 6972 } 6973 ipif = ipif_arg; 6974 match_flags |= MATCH_IRE_ILL; 6975 } else { 6976 /* 6977 * Check the ipif corresponding to the gw_addr 6978 */ 6979 if (ipif == NULL) 6980 return (ENETUNREACH); 6981 match_flags |= MATCH_IRE_IPIF; 6982 } 6983 ASSERT(ipif != NULL); 6984 6985 /* 6986 * We check for an existing entry at this point. 6987 * 6988 * Since a netmask isn't passed in via the ioctl interface 6989 * (SIOCADDRT), we don't check for a matching netmask in that 6990 * case. 
6991 */ 6992 if (!ioctl_msg) 6993 match_flags |= MATCH_IRE_MASK; 6994 ire = ire_ftable_lookup(dst_addr, mask, 0, IRE_INTERFACE, ipif, 6995 NULL, ALL_ZONES, 0, NULL, match_flags, ipst); 6996 if (ire != NULL) { 6997 ire_refrele(ire); 6998 if (ipif_refheld) 6999 ipif_refrele(ipif); 7000 return (EEXIST); 7001 } 7002 7003 stq = (ipif->ipif_net_type == IRE_IF_RESOLVER) 7004 ? ipif->ipif_rq : ipif->ipif_wq; 7005 7006 /* 7007 * Create a copy of the IRE_LOOPBACK, 7008 * IRE_IF_NORESOLVER or IRE_IF_RESOLVER with 7009 * the modified address and netmask. 7010 */ 7011 ire = ire_create( 7012 (uchar_t *)&dst_addr, 7013 (uint8_t *)&mask, 7014 (uint8_t *)&ipif->ipif_src_addr, 7015 NULL, 7016 &ipif->ipif_mtu, 7017 NULL, 7018 NULL, 7019 stq, 7020 ipif->ipif_net_type, 7021 ipif, 7022 0, 7023 0, 7024 0, 7025 flags, 7026 &ire_uinfo_null, 7027 NULL, 7028 NULL, 7029 ipst); 7030 if (ire == NULL) { 7031 if (ipif_refheld) 7032 ipif_refrele(ipif); 7033 return (ENOMEM); 7034 } 7035 7036 /* 7037 * Some software (for example, GateD and Sun Cluster) attempts 7038 * to create (what amount to) IRE_PREFIX routes with the 7039 * loopback address as the gateway. This is primarily done to 7040 * set up prefixes with the RTF_REJECT flag set (for example, 7041 * when generating aggregate routes.) 7042 * 7043 * If the IRE type (as defined by ipif->ipif_net_type) is 7044 * IRE_LOOPBACK, then we map the request into a 7045 * IRE_IF_NORESOLVER. 7046 * 7047 * Needless to say, the real IRE_LOOPBACK is NOT created by this 7048 * routine, but rather using ire_create() directly. 7049 * 7050 */ 7051 if (ipif->ipif_net_type == IRE_LOOPBACK) 7052 ire->ire_type = IRE_IF_NORESOLVER; 7053 7054 error = ire_add(&ire, q, mp, func, B_FALSE); 7055 if (error == 0) 7056 goto save_ire; 7057 7058 /* 7059 * In the result of failure, ire_add() will have already 7060 * deleted the ire in question, so there is no need to 7061 * do that here. 
7062 */ 7063 if (ipif_refheld) 7064 ipif_refrele(ipif); 7065 return (error); 7066 } 7067 if (ipif_refheld) { 7068 ipif_refrele(ipif); 7069 ipif_refheld = B_FALSE; 7070 } 7071 7072 /* 7073 * Get an interface IRE for the specified gateway. 7074 * If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the 7075 * gateway, it is currently unreachable and we fail the request 7076 * accordingly. 7077 */ 7078 ipif = ipif_arg; 7079 if (ipif_arg != NULL) 7080 match_flags |= MATCH_IRE_ILL; 7081 gw_ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE, ipif_arg, NULL, 7082 ALL_ZONES, 0, NULL, match_flags, ipst); 7083 if (gw_ire == NULL) 7084 return (ENETUNREACH); 7085 7086 /* 7087 * We create one of three types of IREs as a result of this request 7088 * based on the netmask. A netmask of all ones (which is automatically 7089 * assumed when RTF_HOST is set) results in an IRE_HOST being created. 7090 * An all zeroes netmask implies a default route so an IRE_DEFAULT is 7091 * created. Otherwise, an IRE_PREFIX route is created for the 7092 * destination prefix. 
7093 */ 7094 if (mask == IP_HOST_MASK) 7095 type = IRE_HOST; 7096 else if (mask == 0) 7097 type = IRE_DEFAULT; 7098 else 7099 type = IRE_PREFIX; 7100 7101 /* check for a duplicate entry */ 7102 ire = ire_ftable_lookup(dst_addr, mask, gw_addr, type, ipif_arg, 7103 NULL, ALL_ZONES, 0, NULL, 7104 match_flags | MATCH_IRE_MASK | MATCH_IRE_GW, ipst); 7105 if (ire != NULL) { 7106 ire_refrele(gw_ire); 7107 ire_refrele(ire); 7108 return (EEXIST); 7109 } 7110 7111 /* Security attribute exists */ 7112 if (sp != NULL) { 7113 tsol_gcgrp_addr_t ga; 7114 7115 /* find or create the gateway credentials group */ 7116 ga.ga_af = AF_INET; 7117 IN6_IPADDR_TO_V4MAPPED(gw_addr, &ga.ga_addr); 7118 7119 /* we hold reference to it upon success */ 7120 gcgrp = gcgrp_lookup(&ga, B_TRUE); 7121 if (gcgrp == NULL) { 7122 ire_refrele(gw_ire); 7123 return (ENOMEM); 7124 } 7125 7126 /* 7127 * Create and add the security attribute to the group; a 7128 * reference to the group is made upon allocating a new 7129 * entry successfully. If it finds an already-existing 7130 * entry for the security attribute in the group, it simply 7131 * returns it and no new reference is made to the group. 7132 */ 7133 gc = gc_create(sp, gcgrp, &gcgrp_xtraref); 7134 if (gc == NULL) { 7135 /* release reference held by gcgrp_lookup */ 7136 GCGRP_REFRELE(gcgrp); 7137 ire_refrele(gw_ire); 7138 return (ENOMEM); 7139 } 7140 } 7141 7142 /* Create the IRE. */ 7143 ire = ire_create( 7144 (uchar_t *)&dst_addr, /* dest address */ 7145 (uchar_t *)&mask, /* mask */ 7146 /* src address assigned by the caller? */ 7147 (uchar_t *)(((src_addr != INADDR_ANY) && 7148 (flags & RTF_SETSRC)) ? 
&src_addr : NULL), 7149 (uchar_t *)&gw_addr, /* gateway address */ 7150 &gw_ire->ire_max_frag, 7151 NULL, /* no src nce */ 7152 NULL, /* no recv-from queue */ 7153 NULL, /* no send-to queue */ 7154 (ushort_t)type, /* IRE type */ 7155 ipif_arg, 7156 0, 7157 0, 7158 0, 7159 flags, 7160 &gw_ire->ire_uinfo, /* Inherit ULP info from gw */ 7161 gc, /* security attribute */ 7162 NULL, 7163 ipst); 7164 7165 /* 7166 * The ire holds a reference to the 'gc' and the 'gc' holds a 7167 * reference to the 'gcgrp'. We can now release the extra reference 7168 * the 'gcgrp' acquired in the gcgrp_lookup, if it was not used. 7169 */ 7170 if (gcgrp_xtraref) 7171 GCGRP_REFRELE(gcgrp); 7172 if (ire == NULL) { 7173 if (gc != NULL) 7174 GC_REFRELE(gc); 7175 ire_refrele(gw_ire); 7176 return (ENOMEM); 7177 } 7178 7179 /* 7180 * POLICY: should we allow an RTF_HOST with address INADDR_ANY? 7181 * SUN/OS socket stuff does but do we really want to allow 0.0.0.0? 7182 */ 7183 7184 /* Add the new IRE. */ 7185 error = ire_add(&ire, q, mp, func, B_FALSE); 7186 if (error != 0) { 7187 /* 7188 * In the result of failure, ire_add() will have already 7189 * deleted the ire in question, so there is no need to 7190 * do that here. 7191 */ 7192 ire_refrele(gw_ire); 7193 return (error); 7194 } 7195 7196 if (flags & RTF_MULTIRT) { 7197 /* 7198 * Invoke the CGTP (multirouting) filtering module 7199 * to add the dst address in the filtering database. 7200 * Replicated inbound packets coming from that address 7201 * will be filtered to discard the duplicates. 7202 * It is not necessary to call the CGTP filter hook 7203 * when the dst address is a broadcast or multicast, 7204 * because an IP source address cannot be a broadcast 7205 * or a multicast. 
7206 */ 7207 ire_t *ire_dst = ire_ctable_lookup(ire->ire_addr, 0, 7208 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 7209 if (ire_dst != NULL) { 7210 ip_cgtp_bcast_add(ire, ire_dst, ipst); 7211 ire_refrele(ire_dst); 7212 goto save_ire; 7213 } 7214 if (ipst->ips_ip_cgtp_filter_ops != NULL && 7215 !CLASSD(ire->ire_addr)) { 7216 int res = ipst->ips_ip_cgtp_filter_ops->cfo_add_dest_v4( 7217 ipst->ips_netstack->netstack_stackid, 7218 ire->ire_addr, 7219 ire->ire_gateway_addr, 7220 ire->ire_src_addr, 7221 gw_ire->ire_src_addr); 7222 if (res != 0) { 7223 ire_refrele(gw_ire); 7224 ire_delete(ire); 7225 return (res); 7226 } 7227 } 7228 } 7229 7230 /* 7231 * Now that the prefix IRE entry has been created, delete any 7232 * existing gateway IRE cache entries as well as any IRE caches 7233 * using the gateway, and force them to be created through 7234 * ip_newroute. 7235 */ 7236 if (gc != NULL) { 7237 ASSERT(gcgrp != NULL); 7238 ire_clookup_delete_cache_gw(gw_addr, ALL_ZONES, ipst); 7239 } 7240 7241 save_ire: 7242 if (gw_ire != NULL) { 7243 ire_refrele(gw_ire); 7244 } 7245 if (ipif != NULL) { 7246 /* 7247 * Save enough information so that we can recreate the IRE if 7248 * the interface goes down and then up. The metrics associated 7249 * with the route will be saved as well when rts_setmetrics() is 7250 * called after the IRE has been created. In the case where 7251 * memory cannot be allocated, none of this information will be 7252 * saved. 7253 */ 7254 ipif_save_ire(ipif, ire); 7255 } 7256 if (ioctl_msg) 7257 ip_rts_rtmsg(RTM_OLDADD, ire, 0, ipst); 7258 if (ire_arg != NULL) { 7259 /* 7260 * Store the ire that was successfully added into where ire_arg 7261 * points to so that callers don't have to look it up 7262 * themselves (but they are responsible for ire_refrele()ing 7263 * the ire when they are finished with it). 
7264 */ 7265 *ire_arg = ire; 7266 } else { 7267 ire_refrele(ire); /* Held in ire_add */ 7268 } 7269 if (ipif_refheld) 7270 ipif_refrele(ipif); 7271 return (0); 7272 } 7273 7274 /* 7275 * ip_rt_delete is called to delete an IPv4 route. 7276 * ipif_arg is passed in to associate it with the correct interface. 7277 * We may need to restart this operation if the ipif cannot be looked up 7278 * due to an exclusive operation that is currently in progress. The restart 7279 * entry point is specified by 'func' 7280 */ 7281 /* ARGSUSED4 */ 7282 int 7283 ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, 7284 uint_t rtm_addrs, int flags, ipif_t *ipif_arg, boolean_t ioctl_msg, 7285 queue_t *q, mblk_t *mp, ipsq_func_t func, ip_stack_t *ipst) 7286 { 7287 ire_t *ire = NULL; 7288 ipif_t *ipif; 7289 boolean_t ipif_refheld = B_FALSE; 7290 uint_t type; 7291 uint_t match_flags = MATCH_IRE_TYPE; 7292 int err = 0; 7293 7294 ip1dbg(("ip_rt_delete:")); 7295 /* 7296 * If this is the case of RTF_HOST being set, then we set the netmask 7297 * to all ones. Otherwise, we use the netmask if one was supplied. 7298 */ 7299 if (flags & RTF_HOST) { 7300 mask = IP_HOST_MASK; 7301 match_flags |= MATCH_IRE_MASK; 7302 } else if (rtm_addrs & RTA_NETMASK) { 7303 match_flags |= MATCH_IRE_MASK; 7304 } 7305 7306 /* 7307 * Note that RTF_GATEWAY is never set on a delete, therefore 7308 * we check if the gateway address is one of our interfaces first, 7309 * and fall back on RTF_GATEWAY routes. 7310 * 7311 * This makes it possible to delete an original 7312 * IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1. 7313 * 7314 * As the interface index specified with the RTA_IFP sockaddr is the 7315 * same for all ipif's off of an ill, the matching logic below uses 7316 * MATCH_IRE_ILL if such an index was specified. 
This means a route 7317 * sharing the same prefix and interface index as the the route 7318 * intended to be deleted might be deleted instead if a RTA_IFP sockaddr 7319 * is specified in the request. 7320 * 7321 * On the other hand, since the gateway address will usually be 7322 * different for each ipif on the system, the matching logic 7323 * uses MATCH_IRE_IPIF in the case of a traditional interface 7324 * route. This means that interface routes for the same prefix can be 7325 * uniquely identified if they belong to distinct ipif's and if a 7326 * RTA_IFP sockaddr is not present. 7327 * 7328 * For more detail on specifying routes by gateway address and by 7329 * interface index, see the comments in ip_rt_add(). 7330 */ 7331 ipif = ipif_lookup_interface(gw_addr, dst_addr, q, mp, func, &err, 7332 ipst); 7333 if (ipif != NULL) 7334 ipif_refheld = B_TRUE; 7335 else if (err == EINPROGRESS) 7336 return (err); 7337 else 7338 err = 0; 7339 if (ipif != NULL) { 7340 if (ipif_arg != NULL) { 7341 if (ipif_refheld) { 7342 ipif_refrele(ipif); 7343 ipif_refheld = B_FALSE; 7344 } 7345 ipif = ipif_arg; 7346 match_flags |= MATCH_IRE_ILL; 7347 } else { 7348 match_flags |= MATCH_IRE_IPIF; 7349 } 7350 if (ipif->ipif_ire_type == IRE_LOOPBACK) { 7351 ire = ire_ctable_lookup(dst_addr, 0, IRE_LOOPBACK, ipif, 7352 ALL_ZONES, NULL, match_flags, ipst); 7353 } 7354 if (ire == NULL) { 7355 ire = ire_ftable_lookup(dst_addr, mask, 0, 7356 IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, NULL, 7357 match_flags, ipst); 7358 } 7359 } 7360 7361 if (ire == NULL) { 7362 /* 7363 * At this point, the gateway address is not one of our own 7364 * addresses or a matching interface route was not found. We 7365 * set the IRE type to lookup based on whether 7366 * this is a host route, a default route or just a prefix. 7367 * 7368 * If an ipif_arg was passed in, then the lookup is based on an 7369 * interface index so MATCH_IRE_ILL is added to match_flags. 
7370 * In any case, MATCH_IRE_IPIF is cleared and MATCH_IRE_GW is 7371 * set as the route being looked up is not a traditional 7372 * interface route. 7373 */ 7374 match_flags &= ~MATCH_IRE_IPIF; 7375 match_flags |= MATCH_IRE_GW; 7376 if (ipif_arg != NULL) 7377 match_flags |= MATCH_IRE_ILL; 7378 if (mask == IP_HOST_MASK) 7379 type = IRE_HOST; 7380 else if (mask == 0) 7381 type = IRE_DEFAULT; 7382 else 7383 type = IRE_PREFIX; 7384 ire = ire_ftable_lookup(dst_addr, mask, gw_addr, type, ipif_arg, 7385 NULL, ALL_ZONES, 0, NULL, match_flags, ipst); 7386 } 7387 7388 if (ipif_refheld) 7389 ipif_refrele(ipif); 7390 7391 /* ipif is not refheld anymore */ 7392 if (ire == NULL) 7393 return (ESRCH); 7394 7395 if (ire->ire_flags & RTF_MULTIRT) { 7396 /* 7397 * Invoke the CGTP (multirouting) filtering module 7398 * to remove the dst address from the filtering database. 7399 * Packets coming from that address will no longer be 7400 * filtered to remove duplicates. 7401 */ 7402 if (ipst->ips_ip_cgtp_filter_ops != NULL) { 7403 err = ipst->ips_ip_cgtp_filter_ops->cfo_del_dest_v4( 7404 ipst->ips_netstack->netstack_stackid, 7405 ire->ire_addr, ire->ire_gateway_addr); 7406 } 7407 ip_cgtp_bcast_delete(ire, ipst); 7408 } 7409 7410 ipif = ire->ire_ipif; 7411 if (ipif != NULL) 7412 ipif_remove_ire(ipif, ire); 7413 if (ioctl_msg) 7414 ip_rts_rtmsg(RTM_OLDDEL, ire, 0, ipst); 7415 ire_delete(ire); 7416 ire_refrele(ire); 7417 return (err); 7418 } 7419 7420 /* 7421 * ip_siocaddrt is called to complete processing of an SIOCADDRT IOCTL. 
7422 */ 7423 /* ARGSUSED */ 7424 int 7425 ip_siocaddrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 7426 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 7427 { 7428 ipaddr_t dst_addr; 7429 ipaddr_t gw_addr; 7430 ipaddr_t mask; 7431 int error = 0; 7432 mblk_t *mp1; 7433 struct rtentry *rt; 7434 ipif_t *ipif = NULL; 7435 ip_stack_t *ipst; 7436 7437 ASSERT(q->q_next == NULL); 7438 ipst = CONNQ_TO_IPST(q); 7439 7440 ip1dbg(("ip_siocaddrt:")); 7441 /* Existence of mp1 verified in ip_wput_nondata */ 7442 mp1 = mp->b_cont->b_cont; 7443 rt = (struct rtentry *)mp1->b_rptr; 7444 7445 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 7446 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 7447 7448 /* 7449 * If the RTF_HOST flag is on, this is a request to assign a gateway 7450 * to a particular host address. In this case, we set the netmask to 7451 * all ones for the particular destination address. Otherwise, 7452 * determine the netmask to be used based on dst_addr and the interfaces 7453 * in use. 7454 */ 7455 if (rt->rt_flags & RTF_HOST) { 7456 mask = IP_HOST_MASK; 7457 } else { 7458 /* 7459 * Note that ip_subnet_mask returns a zero mask in the case of 7460 * default (an all-zeroes address). 7461 */ 7462 mask = ip_subnet_mask(dst_addr, &ipif, ipst); 7463 } 7464 7465 error = ip_rt_add(dst_addr, mask, gw_addr, 0, rt->rt_flags, NULL, NULL, 7466 B_TRUE, q, mp, ip_process_ioctl, NULL, ipst); 7467 if (ipif != NULL) 7468 ipif_refrele(ipif); 7469 return (error); 7470 } 7471 7472 /* 7473 * ip_siocdelrt is called to complete processing of an SIOCDELRT IOCTL. 
7474 */ 7475 /* ARGSUSED */ 7476 int 7477 ip_siocdelrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 7478 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 7479 { 7480 ipaddr_t dst_addr; 7481 ipaddr_t gw_addr; 7482 ipaddr_t mask; 7483 int error; 7484 mblk_t *mp1; 7485 struct rtentry *rt; 7486 ipif_t *ipif = NULL; 7487 ip_stack_t *ipst; 7488 7489 ASSERT(q->q_next == NULL); 7490 ipst = CONNQ_TO_IPST(q); 7491 7492 ip1dbg(("ip_siocdelrt:")); 7493 /* Existence of mp1 verified in ip_wput_nondata */ 7494 mp1 = mp->b_cont->b_cont; 7495 rt = (struct rtentry *)mp1->b_rptr; 7496 7497 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 7498 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 7499 7500 /* 7501 * If the RTF_HOST flag is on, this is a request to delete a gateway 7502 * to a particular host address. In this case, we set the netmask to 7503 * all ones for the particular destination address. Otherwise, 7504 * determine the netmask to be used based on dst_addr and the interfaces 7505 * in use. 7506 */ 7507 if (rt->rt_flags & RTF_HOST) { 7508 mask = IP_HOST_MASK; 7509 } else { 7510 /* 7511 * Note that ip_subnet_mask returns a zero mask in the case of 7512 * default (an all-zeroes address). 7513 */ 7514 mask = ip_subnet_mask(dst_addr, &ipif, ipst); 7515 } 7516 7517 error = ip_rt_delete(dst_addr, mask, gw_addr, 7518 RTA_DST | RTA_GATEWAY | RTA_NETMASK, rt->rt_flags, NULL, B_TRUE, q, 7519 mp, ip_process_ioctl, ipst); 7520 if (ipif != NULL) 7521 ipif_refrele(ipif); 7522 return (error); 7523 } 7524 7525 /* 7526 * Enqueue the mp onto the ipsq, chained by b_next. 7527 * b_prev stores the function to be executed later, and b_queue the queue 7528 * where this mp originated. 
7529 */ 7530 void 7531 ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, 7532 ill_t *pending_ill) 7533 { 7534 conn_t *connp = NULL; 7535 7536 ASSERT(MUTEX_HELD(&ipsq->ipsq_lock)); 7537 ASSERT(func != NULL); 7538 7539 mp->b_queue = q; 7540 mp->b_prev = (void *)func; 7541 mp->b_next = NULL; 7542 7543 switch (type) { 7544 case CUR_OP: 7545 if (ipsq->ipsq_mptail != NULL) { 7546 ASSERT(ipsq->ipsq_mphead != NULL); 7547 ipsq->ipsq_mptail->b_next = mp; 7548 } else { 7549 ASSERT(ipsq->ipsq_mphead == NULL); 7550 ipsq->ipsq_mphead = mp; 7551 } 7552 ipsq->ipsq_mptail = mp; 7553 break; 7554 7555 case NEW_OP: 7556 if (ipsq->ipsq_xopq_mptail != NULL) { 7557 ASSERT(ipsq->ipsq_xopq_mphead != NULL); 7558 ipsq->ipsq_xopq_mptail->b_next = mp; 7559 } else { 7560 ASSERT(ipsq->ipsq_xopq_mphead == NULL); 7561 ipsq->ipsq_xopq_mphead = mp; 7562 } 7563 ipsq->ipsq_xopq_mptail = mp; 7564 break; 7565 default: 7566 cmn_err(CE_PANIC, "ipsq_enq %d type \n", type); 7567 } 7568 7569 if (CONN_Q(q) && pending_ill != NULL) { 7570 connp = Q_TO_CONN(q); 7571 7572 ASSERT(MUTEX_HELD(&connp->conn_lock)); 7573 connp->conn_oper_pending_ill = pending_ill; 7574 } 7575 } 7576 7577 /* 7578 * Return the mp at the head of the ipsq. After emptying the ipsq 7579 * look at the next ioctl, if this ioctl is complete. Otherwise 7580 * return, we will resume when we complete the current ioctl. 7581 * The current ioctl will wait till it gets a response from the 7582 * driver below. 
7583 */ 7584 static mblk_t * 7585 ipsq_dq(ipsq_t *ipsq) 7586 { 7587 mblk_t *mp; 7588 7589 ASSERT(MUTEX_HELD(&ipsq->ipsq_lock)); 7590 7591 mp = ipsq->ipsq_mphead; 7592 if (mp != NULL) { 7593 ipsq->ipsq_mphead = mp->b_next; 7594 if (ipsq->ipsq_mphead == NULL) 7595 ipsq->ipsq_mptail = NULL; 7596 mp->b_next = NULL; 7597 return (mp); 7598 } 7599 if (ipsq->ipsq_current_ipif != NULL) 7600 return (NULL); 7601 mp = ipsq->ipsq_xopq_mphead; 7602 if (mp != NULL) { 7603 ipsq->ipsq_xopq_mphead = mp->b_next; 7604 if (ipsq->ipsq_xopq_mphead == NULL) 7605 ipsq->ipsq_xopq_mptail = NULL; 7606 mp->b_next = NULL; 7607 return (mp); 7608 } 7609 return (NULL); 7610 } 7611 7612 /* 7613 * Enter the ipsq corresponding to ill, by waiting synchronously till 7614 * we can enter the ipsq exclusively. Unless 'force' is used, the ipsq 7615 * will have to drain completely before ipsq_enter returns success. 7616 * ipsq_current_ipif will be set if some exclusive ioctl is in progress, 7617 * and the ipsq_exit logic will start the next enqueued ioctl after 7618 * completion of the current ioctl. If 'force' is used, we don't wait 7619 * for the enqueued ioctls. This is needed when a conn_close wants to 7620 * enter the ipsq and abort an ioctl that is somehow stuck. Unplumb 7621 * of an ill can also use this option. But we dont' use it currently. 7622 */ 7623 #define ENTER_SQ_WAIT_TICKS 100 7624 boolean_t 7625 ipsq_enter(ill_t *ill, boolean_t force) 7626 { 7627 ipsq_t *ipsq; 7628 boolean_t waited_enough = B_FALSE; 7629 7630 /* 7631 * Holding the ill_lock prevents <ill-ipsq> assocs from changing. 7632 * Since the <ill-ipsq> assocs could change while we wait for the 7633 * writer, it is easier to wait on a fixed global rather than try to 7634 * cv_wait on a changing ipsq. 
7635 */ 7636 mutex_enter(&ill->ill_lock); 7637 for (;;) { 7638 if (ill->ill_state_flags & ILL_CONDEMNED) { 7639 mutex_exit(&ill->ill_lock); 7640 return (B_FALSE); 7641 } 7642 7643 ipsq = ill->ill_phyint->phyint_ipsq; 7644 mutex_enter(&ipsq->ipsq_lock); 7645 if (ipsq->ipsq_writer == NULL && 7646 (ipsq->ipsq_current_ipif == NULL || waited_enough)) { 7647 break; 7648 } else if (ipsq->ipsq_writer != NULL) { 7649 mutex_exit(&ipsq->ipsq_lock); 7650 cv_wait(&ill->ill_cv, &ill->ill_lock); 7651 } else { 7652 mutex_exit(&ipsq->ipsq_lock); 7653 if (force) { 7654 (void) cv_timedwait(&ill->ill_cv, 7655 &ill->ill_lock, 7656 lbolt + ENTER_SQ_WAIT_TICKS); 7657 waited_enough = B_TRUE; 7658 continue; 7659 } else { 7660 cv_wait(&ill->ill_cv, &ill->ill_lock); 7661 } 7662 } 7663 } 7664 7665 ASSERT(ipsq->ipsq_mphead == NULL && ipsq->ipsq_mptail == NULL); 7666 ASSERT(ipsq->ipsq_reentry_cnt == 0); 7667 ipsq->ipsq_writer = curthread; 7668 ipsq->ipsq_reentry_cnt++; 7669 #ifdef DEBUG 7670 ipsq->ipsq_depth = getpcstack(ipsq->ipsq_stack, IPSQ_STACK_DEPTH); 7671 #endif 7672 mutex_exit(&ipsq->ipsq_lock); 7673 mutex_exit(&ill->ill_lock); 7674 return (B_TRUE); 7675 } 7676 7677 /* 7678 * The ipsq_t (ipsq) is the synchronization data structure used to serialize 7679 * certain critical operations like plumbing (i.e. most set ioctls), 7680 * multicast joins, igmp/mld timers, IPMP operations etc. On a non-IPMP 7681 * system there is 1 ipsq per phyint. On an IPMP system there is 1 ipsq per 7682 * IPMP group. The ipsq serializes exclusive ioctls issued by applications 7683 * on a per ipsq basis in ipsq_xopq_mphead. It also protects against multiple 7684 * threads executing in the ipsq. Responses from the driver pertain to the 7685 * current ioctl (say a DL_BIND_ACK in response to a DL_BIND_REQUEST initiated 7686 * as part of bringing up the interface) and are enqueued in ipsq_mphead. 
 *
 * If a thread does not want to reenter the ipsq when it is already writer,
 * it must make sure that neither the specified reentry point (which is
 * called later, when the ipsq is empty) nor any code path starting from
 * that reentry point ever tries to enter the ipsq again. Otherwise it can
 * lead to an infinite loop. The reentry point ip_rput_dlpi_writer is an
 * example.
 * When the thread that is currently exclusive finishes, it (ipsq_exit)
 * dequeues the requests waiting to become exclusive in ipsq_mphead and calls
 * the reentry point. When the list at ipsq_mphead becomes empty ipsq_exit
 * proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next
 * ioctl if the current ioctl has completed. If the current ioctl is still
 * in progress it simply returns. The current ioctl could be waiting for
 * a response from another module (arp or the driver) or could be waiting for
 * the ipif/ill/ire refcnts to drop to zero. In such a case the ipsq_pending_mp
 * and ipsq_pending_ipif are set. ipsq_current_ipif is set throughout the
 * execution of the ioctl and ipsq_exit does not start the next ioctl unless
 * ipsq_current_ipif is clear which happens only on ioctl completion.
 */

/*
 * Try to enter the ipsq exclusively, corresponding to ipif or ill (only 1 of
 * ipif or ill can be specified). The caller ensures ipif or ill is valid by
 * ref-holding it if necessary. If the ipsq cannot be entered, the mp is queued
 * for completion.
7711 */ 7712 ipsq_t * 7713 ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp, 7714 ipsq_func_t func, int type, boolean_t reentry_ok) 7715 { 7716 ipsq_t *ipsq; 7717 7718 /* Only 1 of ipif or ill can be specified */ 7719 ASSERT((ipif != NULL) ^ (ill != NULL)); 7720 if (ipif != NULL) 7721 ill = ipif->ipif_ill; 7722 7723 /* 7724 * lock ordering ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock 7725 * ipsq of an ill can't change when ill_lock is held. 7726 */ 7727 GRAB_CONN_LOCK(q); 7728 mutex_enter(&ill->ill_lock); 7729 ipsq = ill->ill_phyint->phyint_ipsq; 7730 mutex_enter(&ipsq->ipsq_lock); 7731 7732 /* 7733 * 1. Enter the ipsq if we are already writer and reentry is ok. 7734 * (Note: If the caller does not specify reentry_ok then neither 7735 * 'func' nor any of its callees must ever attempt to enter the ipsq 7736 * again. Otherwise it can lead to an infinite loop 7737 * 2. Enter the ipsq if there is no current writer and this attempted 7738 * entry is part of the current ioctl or operation 7739 * 3. Enter the ipsq if there is no current writer and this is a new 7740 * ioctl (or operation) and the ioctl (or operation) queue is 7741 * empty and there is no ioctl (or operation) currently in progress 7742 */ 7743 if ((ipsq->ipsq_writer == NULL && ((type == CUR_OP) || 7744 (type == NEW_OP && ipsq->ipsq_xopq_mphead == NULL && 7745 ipsq->ipsq_current_ipif == NULL))) || 7746 (ipsq->ipsq_writer == curthread && reentry_ok)) { 7747 /* Success. 
*/ 7748 ipsq->ipsq_reentry_cnt++; 7749 ipsq->ipsq_writer = curthread; 7750 mutex_exit(&ipsq->ipsq_lock); 7751 mutex_exit(&ill->ill_lock); 7752 RELEASE_CONN_LOCK(q); 7753 #ifdef DEBUG 7754 ipsq->ipsq_depth = getpcstack(ipsq->ipsq_stack, 7755 IPSQ_STACK_DEPTH); 7756 #endif 7757 return (ipsq); 7758 } 7759 7760 ipsq_enq(ipsq, q, mp, func, type, ill); 7761 7762 mutex_exit(&ipsq->ipsq_lock); 7763 mutex_exit(&ill->ill_lock); 7764 RELEASE_CONN_LOCK(q); 7765 return (NULL); 7766 } 7767 7768 /* 7769 * Try to enter the IPSQ corresponding to `ill' as writer. The caller ensures 7770 * ill is valid by refholding it if necessary; we will refrele. If the IPSQ 7771 * cannot be entered, the mp is queued for completion. 7772 */ 7773 void 7774 qwriter_ip(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, 7775 boolean_t reentry_ok) 7776 { 7777 ipsq_t *ipsq; 7778 7779 ipsq = ipsq_try_enter(NULL, ill, q, mp, func, type, reentry_ok); 7780 7781 /* 7782 * Drop the caller's refhold on the ill. This is safe since we either 7783 * entered the IPSQ (and thus are exclusive), or failed to enter the 7784 * IPSQ, in which case we return without accessing ill anymore. This 7785 * is needed because func needs to see the correct refcount. 7786 * e.g. removeif can work only then. 7787 */ 7788 ill_refrele(ill); 7789 if (ipsq != NULL) { 7790 (*func)(ipsq, q, mp, NULL); 7791 ipsq_exit(ipsq, B_TRUE, B_TRUE); 7792 } 7793 } 7794 7795 /* 7796 * If there are more than ILL_GRP_CNT ills in a group, 7797 * we use kmem alloc'd buffers, else use the stack 7798 */ 7799 #define ILL_GRP_CNT 14 7800 /* 7801 * Drain the ipsq, if there are messages on it, and then leave the ipsq. 7802 * Called by a thread that is currently exclusive on this ipsq. 
7803 */ 7804 void 7805 ipsq_exit(ipsq_t *ipsq, boolean_t start_igmp_timer, boolean_t start_mld_timer) 7806 { 7807 queue_t *q; 7808 mblk_t *mp; 7809 ipsq_func_t func; 7810 int next; 7811 ill_t **ill_list = NULL; 7812 size_t ill_list_size = 0; 7813 int cnt = 0; 7814 boolean_t need_ipsq_free = B_FALSE; 7815 ip_stack_t *ipst = ipsq->ipsq_ipst; 7816 7817 ASSERT(IAM_WRITER_IPSQ(ipsq)); 7818 mutex_enter(&ipsq->ipsq_lock); 7819 ASSERT(ipsq->ipsq_reentry_cnt >= 1); 7820 if (ipsq->ipsq_reentry_cnt != 1) { 7821 ipsq->ipsq_reentry_cnt--; 7822 mutex_exit(&ipsq->ipsq_lock); 7823 return; 7824 } 7825 7826 mp = ipsq_dq(ipsq); 7827 while (mp != NULL) { 7828 again: 7829 mutex_exit(&ipsq->ipsq_lock); 7830 func = (ipsq_func_t)mp->b_prev; 7831 q = (queue_t *)mp->b_queue; 7832 mp->b_prev = NULL; 7833 mp->b_queue = NULL; 7834 7835 /* 7836 * If 'q' is an conn queue, it is valid, since we did a 7837 * a refhold on the connp, at the start of the ioctl. 7838 * If 'q' is an ill queue, it is valid, since close of an 7839 * ill will clean up the 'ipsq'. 7840 */ 7841 (*func)(ipsq, q, mp, NULL); 7842 7843 mutex_enter(&ipsq->ipsq_lock); 7844 mp = ipsq_dq(ipsq); 7845 } 7846 7847 mutex_exit(&ipsq->ipsq_lock); 7848 7849 /* 7850 * Need to grab the locks in the right order. Need to 7851 * atomically check (under ipsq_lock) that there are no 7852 * messages before relinquishing the ipsq. Also need to 7853 * atomically wakeup waiters on ill_cv while holding ill_lock. 7854 * Holding ill_g_lock ensures that ipsq list of ills is stable. 7855 * If we need to call ill_split_ipsq and change <ill-ipsq> we need 7856 * to grab ill_g_lock as writer. 7857 */ 7858 rw_enter(&ipst->ips_ill_g_lock, 7859 ipsq->ipsq_split ? 
RW_WRITER : RW_READER); 7860 7861 /* ipsq_refs can't change while ill_g_lock is held as reader */ 7862 if (ipsq->ipsq_refs != 0) { 7863 /* At most 2 ills v4/v6 per phyint */ 7864 cnt = ipsq->ipsq_refs << 1; 7865 ill_list_size = cnt * sizeof (ill_t *); 7866 /* 7867 * If memory allocation fails, we will do the split 7868 * the next time ipsq_exit is called for whatever reason. 7869 * As long as the ipsq_split flag is set the need to 7870 * split is remembered. 7871 */ 7872 ill_list = kmem_zalloc(ill_list_size, KM_NOSLEEP); 7873 if (ill_list != NULL) 7874 cnt = ill_lock_ipsq_ills(ipsq, ill_list, cnt); 7875 } 7876 mutex_enter(&ipsq->ipsq_lock); 7877 mp = ipsq_dq(ipsq); 7878 if (mp != NULL) { 7879 /* oops, some message has landed up, we can't get out */ 7880 if (ill_list != NULL) 7881 ill_unlock_ills(ill_list, cnt); 7882 rw_exit(&ipst->ips_ill_g_lock); 7883 if (ill_list != NULL) 7884 kmem_free(ill_list, ill_list_size); 7885 ill_list = NULL; 7886 ill_list_size = 0; 7887 cnt = 0; 7888 goto again; 7889 } 7890 7891 /* 7892 * Split only if no ioctl is pending and if memory alloc succeeded 7893 * above. 7894 */ 7895 if (ipsq->ipsq_split && ipsq->ipsq_current_ipif == NULL && 7896 ill_list != NULL) { 7897 /* 7898 * No new ill can join this ipsq since we are holding the 7899 * ill_g_lock. Hence ill_split_ipsq can safely traverse the 7900 * ipsq. ill_split_ipsq may fail due to memory shortage. 7901 * If so we will retry on the next ipsq_exit. 7902 */ 7903 ipsq->ipsq_split = ill_split_ipsq(ipsq); 7904 } 7905 7906 /* 7907 * We are holding the ipsq lock, hence no new messages can 7908 * land up on the ipsq, and there are no messages currently. 7909 * Now safe to get out. Wake up waiters and relinquish ipsq 7910 * atomically while holding ill locks. 
7911 */ 7912 ipsq->ipsq_writer = NULL; 7913 ipsq->ipsq_reentry_cnt--; 7914 ASSERT(ipsq->ipsq_reentry_cnt == 0); 7915 #ifdef DEBUG 7916 ipsq->ipsq_depth = 0; 7917 #endif 7918 mutex_exit(&ipsq->ipsq_lock); 7919 /* 7920 * For IPMP this should wake up all ills in this ipsq. 7921 * We need to hold the ill_lock while waking up waiters to 7922 * avoid missed wakeups. But there is no need to acquire all 7923 * the ill locks and then wakeup. If we have not acquired all 7924 * the locks (due to memory failure above) ill_signal_ipsq_ills 7925 * wakes up ills one at a time after getting the right ill_lock 7926 */ 7927 ill_signal_ipsq_ills(ipsq, ill_list != NULL); 7928 if (ill_list != NULL) 7929 ill_unlock_ills(ill_list, cnt); 7930 if (ipsq->ipsq_refs == 0) 7931 need_ipsq_free = B_TRUE; 7932 rw_exit(&ipst->ips_ill_g_lock); 7933 if (ill_list != 0) 7934 kmem_free(ill_list, ill_list_size); 7935 7936 if (need_ipsq_free) { 7937 /* 7938 * Free the ipsq. ipsq_refs can't increase because ipsq can't be 7939 * looked up. ipsq can be looked up only thru ill or phyint 7940 * and there are no ills/phyint on this ipsq. 7941 */ 7942 ipsq_delete(ipsq); 7943 } 7944 /* 7945 * Now start any igmp or mld timers that could not be started 7946 * while inside the ipsq. The timers can't be started while inside 7947 * the ipsq, since igmp_start_timers may need to call untimeout() 7948 * which can't be done while holding a lock i.e. the ipsq. Otherwise 7949 * there could be a deadlock since the timeout handlers 7950 * mld_timeout_handler / igmp_timeout_handler also synchronously 7951 * wait in ipsq_enter() trying to get the ipsq. 7952 * 7953 * However there is one exception to the above. If this thread is 7954 * itself the igmp/mld timeout handler thread, then we don't want 7955 * to start any new timer until the current handler is done. The 7956 * handler thread passes in B_FALSE for start_igmp/mld_timers, while 7957 * all others pass B_TRUE. 
7958 */ 7959 if (start_igmp_timer) { 7960 mutex_enter(&ipst->ips_igmp_timer_lock); 7961 next = ipst->ips_igmp_deferred_next; 7962 ipst->ips_igmp_deferred_next = INFINITY; 7963 mutex_exit(&ipst->ips_igmp_timer_lock); 7964 7965 if (next != INFINITY) 7966 igmp_start_timers(next, ipst); 7967 } 7968 7969 if (start_mld_timer) { 7970 mutex_enter(&ipst->ips_mld_timer_lock); 7971 next = ipst->ips_mld_deferred_next; 7972 ipst->ips_mld_deferred_next = INFINITY; 7973 mutex_exit(&ipst->ips_mld_timer_lock); 7974 7975 if (next != INFINITY) 7976 mld_start_timers(next, ipst); 7977 } 7978 } 7979 7980 /* 7981 * Start the current exclusive operation on `ipsq'; associate it with `ipif' 7982 * and `ioccmd'. 7983 */ 7984 void 7985 ipsq_current_start(ipsq_t *ipsq, ipif_t *ipif, int ioccmd) 7986 { 7987 ASSERT(IAM_WRITER_IPSQ(ipsq)); 7988 7989 mutex_enter(&ipsq->ipsq_lock); 7990 ASSERT(ipsq->ipsq_current_ipif == NULL); 7991 ASSERT(ipsq->ipsq_current_ioctl == 0); 7992 ipsq->ipsq_current_ipif = ipif; 7993 ipsq->ipsq_current_ioctl = ioccmd; 7994 mutex_exit(&ipsq->ipsq_lock); 7995 } 7996 7997 /* 7998 * Finish the current exclusive operation on `ipsq'. Note that other 7999 * operations will not be able to proceed until an ipsq_exit() is done. 8000 */ 8001 void 8002 ipsq_current_finish(ipsq_t *ipsq) 8003 { 8004 ipif_t *ipif = ipsq->ipsq_current_ipif; 8005 8006 ASSERT(IAM_WRITER_IPSQ(ipsq)); 8007 8008 /* 8009 * For SIOCSLIFREMOVEIF, the ipif has been already been blown away 8010 * (but we're careful to never set IPIF_CHANGING in that case). 
8011 */ 8012 if (ipsq->ipsq_current_ioctl != SIOCLIFREMOVEIF) { 8013 mutex_enter(&ipif->ipif_ill->ill_lock); 8014 ipif->ipif_state_flags &= ~IPIF_CHANGING; 8015 8016 /* Send any queued event */ 8017 ill_nic_info_dispatch(ipif->ipif_ill); 8018 mutex_exit(&ipif->ipif_ill->ill_lock); 8019 } 8020 8021 mutex_enter(&ipsq->ipsq_lock); 8022 ASSERT(ipsq->ipsq_current_ipif != NULL); 8023 ipsq->ipsq_current_ipif = NULL; 8024 ipsq->ipsq_current_ioctl = 0; 8025 mutex_exit(&ipsq->ipsq_lock); 8026 } 8027 8028 /* 8029 * The ill is closing. Flush all messages on the ipsq that originated 8030 * from this ill. Usually there wont' be any messages on the ipsq_xopq_mphead 8031 * for this ill since ipsq_enter could not have entered until then. 8032 * New messages can't be queued since the CONDEMNED flag is set. 8033 */ 8034 static void 8035 ipsq_flush(ill_t *ill) 8036 { 8037 queue_t *q; 8038 mblk_t *prev; 8039 mblk_t *mp; 8040 mblk_t *mp_next; 8041 ipsq_t *ipsq; 8042 8043 ASSERT(IAM_WRITER_ILL(ill)); 8044 ipsq = ill->ill_phyint->phyint_ipsq; 8045 /* 8046 * Flush any messages sent up by the driver. 
8047 */ 8048 mutex_enter(&ipsq->ipsq_lock); 8049 for (prev = NULL, mp = ipsq->ipsq_mphead; mp != NULL; mp = mp_next) { 8050 mp_next = mp->b_next; 8051 q = mp->b_queue; 8052 if (q == ill->ill_rq || q == ill->ill_wq) { 8053 /* Remove the mp from the ipsq */ 8054 if (prev == NULL) 8055 ipsq->ipsq_mphead = mp->b_next; 8056 else 8057 prev->b_next = mp->b_next; 8058 if (ipsq->ipsq_mptail == mp) { 8059 ASSERT(mp_next == NULL); 8060 ipsq->ipsq_mptail = prev; 8061 } 8062 inet_freemsg(mp); 8063 } else { 8064 prev = mp; 8065 } 8066 } 8067 mutex_exit(&ipsq->ipsq_lock); 8068 (void) ipsq_pending_mp_cleanup(ill, NULL); 8069 ipsq_xopq_mp_cleanup(ill, NULL); 8070 ill_pending_mp_cleanup(ill); 8071 } 8072 8073 /* ARGSUSED */ 8074 int 8075 ip_sioctl_slifoindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 8076 ip_ioctl_cmd_t *ipip, void *ifreq) 8077 { 8078 ill_t *ill; 8079 struct lifreq *lifr = (struct lifreq *)ifreq; 8080 boolean_t isv6; 8081 conn_t *connp; 8082 ip_stack_t *ipst; 8083 8084 connp = Q_TO_CONN(q); 8085 ipst = connp->conn_netstack->netstack_ip; 8086 isv6 = connp->conn_af_isv6; 8087 /* 8088 * Set original index. 8089 * Failover and failback move logical interfaces 8090 * from one physical interface to another. The 8091 * original index indicates the parent of a logical 8092 * interface, in other words, the physical interface 8093 * the logical interface will be moved back to on 8094 * failback. 8095 */ 8096 8097 /* 8098 * Don't allow the original index to be changed 8099 * for non-failover addresses, autoconfigured 8100 * addresses, or IPv6 link local addresses. 8101 */ 8102 if (((ipif->ipif_flags & (IPIF_NOFAILOVER | IPIF_ADDRCONF)) != NULL) || 8103 (isv6 && IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr))) { 8104 return (EINVAL); 8105 } 8106 /* 8107 * The new original index must be in use by some 8108 * physical interface. 
8109 */ 8110 ill = ill_lookup_on_ifindex(lifr->lifr_index, isv6, NULL, NULL, 8111 NULL, NULL, ipst); 8112 if (ill == NULL) 8113 return (ENXIO); 8114 ill_refrele(ill); 8115 8116 ipif->ipif_orig_ifindex = lifr->lifr_index; 8117 /* 8118 * When this ipif gets failed back, don't 8119 * preserve the original id, as it is no 8120 * longer applicable. 8121 */ 8122 ipif->ipif_orig_ipifid = 0; 8123 /* 8124 * For IPv4, change the original index of any 8125 * multicast addresses associated with the 8126 * ipif to the new value. 8127 */ 8128 if (!isv6) { 8129 ilm_t *ilm; 8130 8131 mutex_enter(&ipif->ipif_ill->ill_lock); 8132 for (ilm = ipif->ipif_ill->ill_ilm; ilm != NULL; 8133 ilm = ilm->ilm_next) { 8134 if (ilm->ilm_ipif == ipif) { 8135 ilm->ilm_orig_ifindex = lifr->lifr_index; 8136 } 8137 } 8138 mutex_exit(&ipif->ipif_ill->ill_lock); 8139 } 8140 return (0); 8141 } 8142 8143 /* ARGSUSED */ 8144 int 8145 ip_sioctl_get_oindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 8146 ip_ioctl_cmd_t *ipip, void *ifreq) 8147 { 8148 struct lifreq *lifr = (struct lifreq *)ifreq; 8149 8150 /* 8151 * Get the original interface index i.e the one 8152 * before FAILOVER if it ever happened. 8153 */ 8154 lifr->lifr_index = ipif->ipif_orig_ifindex; 8155 return (0); 8156 } 8157 8158 /* 8159 * Parse an iftun_req structure coming down SIOC[GS]TUNPARAM ioctls, 8160 * refhold and return the associated ipif 8161 */ 8162 /* ARGSUSED */ 8163 int 8164 ip_extract_tunreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, 8165 cmd_info_t *ci, ipsq_func_t func) 8166 { 8167 boolean_t exists; 8168 struct iftun_req *ta; 8169 ipif_t *ipif; 8170 ill_t *ill; 8171 boolean_t isv6; 8172 mblk_t *mp1; 8173 int error; 8174 conn_t *connp; 8175 ip_stack_t *ipst; 8176 8177 /* Existence verified in ip_wput_nondata */ 8178 mp1 = mp->b_cont->b_cont; 8179 ta = (struct iftun_req *)mp1->b_rptr; 8180 /* 8181 * Null terminate the string to protect against buffer 8182 * overrun. 
String was generated by user code and may not 8183 * be trusted. 8184 */ 8185 ta->ifta_lifr_name[LIFNAMSIZ - 1] = '\0'; 8186 8187 connp = Q_TO_CONN(q); 8188 isv6 = connp->conn_af_isv6; 8189 ipst = connp->conn_netstack->netstack_ip; 8190 8191 /* Disallows implicit create */ 8192 ipif = ipif_lookup_on_name(ta->ifta_lifr_name, 8193 mi_strlen(ta->ifta_lifr_name), B_FALSE, &exists, isv6, 8194 connp->conn_zoneid, CONNP_TO_WQ(connp), mp, func, &error, ipst); 8195 if (ipif == NULL) 8196 return (error); 8197 8198 if (ipif->ipif_id != 0) { 8199 /* 8200 * We really don't want to set/get tunnel parameters 8201 * on virtual tunnel interfaces. Only allow the 8202 * base tunnel to do these. 8203 */ 8204 ipif_refrele(ipif); 8205 return (EINVAL); 8206 } 8207 8208 /* 8209 * Send down to tunnel mod for ioctl processing. 8210 * Will finish ioctl in ip_rput_other(). 8211 */ 8212 ill = ipif->ipif_ill; 8213 if (ill->ill_net_type == IRE_LOOPBACK) { 8214 ipif_refrele(ipif); 8215 return (EOPNOTSUPP); 8216 } 8217 8218 if (ill->ill_wq == NULL) { 8219 ipif_refrele(ipif); 8220 return (ENXIO); 8221 } 8222 /* 8223 * Mark the ioctl as coming from an IPv6 interface for 8224 * tun's convenience. 8225 */ 8226 if (ill->ill_isv6) 8227 ta->ifta_flags |= 0x80000000; 8228 ci->ci_ipif = ipif; 8229 return (0); 8230 } 8231 8232 /* 8233 * Parse an ifreq or lifreq struct coming down ioctls and refhold 8234 * and return the associated ipif. 8235 * Return value: 8236 * Non zero: An error has occurred. ci may not be filled out. 8237 * zero : ci is filled out with the ioctl cmd in ci.ci_name, and 8238 * a held ipif in ci.ci_ipif. 
8239 */ 8240 int 8241 ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, 8242 cmd_info_t *ci, ipsq_func_t func) 8243 { 8244 sin_t *sin; 8245 sin6_t *sin6; 8246 char *name; 8247 struct ifreq *ifr; 8248 struct lifreq *lifr; 8249 ipif_t *ipif = NULL; 8250 ill_t *ill; 8251 conn_t *connp; 8252 boolean_t isv6; 8253 boolean_t exists; 8254 int err; 8255 mblk_t *mp1; 8256 zoneid_t zoneid; 8257 ip_stack_t *ipst; 8258 8259 if (q->q_next != NULL) { 8260 ill = (ill_t *)q->q_ptr; 8261 isv6 = ill->ill_isv6; 8262 connp = NULL; 8263 zoneid = ALL_ZONES; 8264 ipst = ill->ill_ipst; 8265 } else { 8266 ill = NULL; 8267 connp = Q_TO_CONN(q); 8268 isv6 = connp->conn_af_isv6; 8269 zoneid = connp->conn_zoneid; 8270 if (zoneid == GLOBAL_ZONEID) { 8271 /* global zone can access ipifs in all zones */ 8272 zoneid = ALL_ZONES; 8273 } 8274 ipst = connp->conn_netstack->netstack_ip; 8275 } 8276 8277 /* Has been checked in ip_wput_nondata */ 8278 mp1 = mp->b_cont->b_cont; 8279 8280 if (ipip->ipi_cmd_type == IF_CMD) { 8281 /* This a old style SIOC[GS]IF* command */ 8282 ifr = (struct ifreq *)mp1->b_rptr; 8283 /* 8284 * Null terminate the string to protect against buffer 8285 * overrun. String was generated by user code and may not 8286 * be trusted. 8287 */ 8288 ifr->ifr_name[IFNAMSIZ - 1] = '\0'; 8289 sin = (sin_t *)&ifr->ifr_addr; 8290 name = ifr->ifr_name; 8291 ci->ci_sin = sin; 8292 ci->ci_sin6 = NULL; 8293 ci->ci_lifr = (struct lifreq *)ifr; 8294 } else { 8295 /* This a new style SIOC[GS]LIF* command */ 8296 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 8297 lifr = (struct lifreq *)mp1->b_rptr; 8298 /* 8299 * Null terminate the string to protect against buffer 8300 * overrun. String was generated by user code and may not 8301 * be trusted. 
8302 */ 8303 lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; 8304 name = lifr->lifr_name; 8305 sin = (sin_t *)&lifr->lifr_addr; 8306 sin6 = (sin6_t *)&lifr->lifr_addr; 8307 if (ipip->ipi_cmd == SIOCSLIFGROUPNAME) { 8308 (void) strncpy(ci->ci_groupname, lifr->lifr_groupname, 8309 LIFNAMSIZ); 8310 } 8311 ci->ci_sin = sin; 8312 ci->ci_sin6 = sin6; 8313 ci->ci_lifr = lifr; 8314 } 8315 8316 if (ipip->ipi_cmd == SIOCSLIFNAME) { 8317 /* 8318 * The ioctl will be failed if the ioctl comes down 8319 * an conn stream 8320 */ 8321 if (ill == NULL) { 8322 /* 8323 * Not an ill queue, return EINVAL same as the 8324 * old error code. 8325 */ 8326 return (ENXIO); 8327 } 8328 ipif = ill->ill_ipif; 8329 ipif_refhold(ipif); 8330 } else { 8331 ipif = ipif_lookup_on_name(name, mi_strlen(name), B_FALSE, 8332 &exists, isv6, zoneid, 8333 (connp == NULL) ? q : CONNP_TO_WQ(connp), mp, func, &err, 8334 ipst); 8335 if (ipif == NULL) { 8336 if (err == EINPROGRESS) 8337 return (err); 8338 if (ipip->ipi_cmd == SIOCLIFFAILOVER || 8339 ipip->ipi_cmd == SIOCLIFFAILBACK) { 8340 /* 8341 * Need to try both v4 and v6 since this 8342 * ioctl can come down either v4 or v6 8343 * socket. The lifreq.lifr_family passed 8344 * down by this ioctl is AF_UNSPEC. 8345 */ 8346 ipif = ipif_lookup_on_name(name, 8347 mi_strlen(name), B_FALSE, &exists, !isv6, 8348 zoneid, (connp == NULL) ? q : 8349 CONNP_TO_WQ(connp), mp, func, &err, ipst); 8350 if (err == EINPROGRESS) 8351 return (err); 8352 } 8353 err = 0; /* Ensure we don't use it below */ 8354 } 8355 } 8356 8357 /* 8358 * Old style [GS]IFCMD does not admit IPv6 ipif 8359 */ 8360 if (ipif != NULL && ipif->ipif_isv6 && ipip->ipi_cmd_type == IF_CMD) { 8361 ipif_refrele(ipif); 8362 return (ENXIO); 8363 } 8364 8365 if (ipif == NULL && ill != NULL && ill->ill_ipif != NULL && 8366 name[0] == '\0') { 8367 /* 8368 * Handle a or a SIOC?IF* with a null name 8369 * during plumb (on the ill queue before the I_PLINK). 
8370 */ 8371 ipif = ill->ill_ipif; 8372 ipif_refhold(ipif); 8373 } 8374 8375 if (ipif == NULL) 8376 return (ENXIO); 8377 8378 /* 8379 * Allow only GET operations if this ipif has been created 8380 * temporarily due to a MOVE operation. 8381 */ 8382 if (ipif->ipif_replace_zero && !(ipip->ipi_flags & IPI_REPL)) { 8383 ipif_refrele(ipif); 8384 return (EINVAL); 8385 } 8386 8387 ci->ci_ipif = ipif; 8388 return (0); 8389 } 8390 8391 /* 8392 * Return the total number of ipifs. 8393 */ 8394 static uint_t 8395 ip_get_numifs(zoneid_t zoneid, ip_stack_t *ipst) 8396 { 8397 uint_t numifs = 0; 8398 ill_t *ill; 8399 ill_walk_context_t ctx; 8400 ipif_t *ipif; 8401 8402 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 8403 ill = ILL_START_WALK_V4(&ctx, ipst); 8404 8405 while (ill != NULL) { 8406 for (ipif = ill->ill_ipif; ipif != NULL; 8407 ipif = ipif->ipif_next) { 8408 if (ipif->ipif_zoneid == zoneid || 8409 ipif->ipif_zoneid == ALL_ZONES) 8410 numifs++; 8411 } 8412 ill = ill_next(&ctx, ill); 8413 } 8414 rw_exit(&ipst->ips_ill_g_lock); 8415 return (numifs); 8416 } 8417 8418 /* 8419 * Return the total number of ipifs. 
8420 */ 8421 static uint_t 8422 ip_get_numlifs(int family, int lifn_flags, zoneid_t zoneid, ip_stack_t *ipst) 8423 { 8424 uint_t numifs = 0; 8425 ill_t *ill; 8426 ipif_t *ipif; 8427 ill_walk_context_t ctx; 8428 8429 ip1dbg(("ip_get_numlifs(%d %u %d)\n", family, lifn_flags, (int)zoneid)); 8430 8431 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 8432 if (family == AF_INET) 8433 ill = ILL_START_WALK_V4(&ctx, ipst); 8434 else if (family == AF_INET6) 8435 ill = ILL_START_WALK_V6(&ctx, ipst); 8436 else 8437 ill = ILL_START_WALK_ALL(&ctx, ipst); 8438 8439 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 8440 for (ipif = ill->ill_ipif; ipif != NULL; 8441 ipif = ipif->ipif_next) { 8442 if ((ipif->ipif_flags & IPIF_NOXMIT) && 8443 !(lifn_flags & LIFC_NOXMIT)) 8444 continue; 8445 if ((ipif->ipif_flags & IPIF_TEMPORARY) && 8446 !(lifn_flags & LIFC_TEMPORARY)) 8447 continue; 8448 if (((ipif->ipif_flags & 8449 (IPIF_NOXMIT|IPIF_NOLOCAL| 8450 IPIF_DEPRECATED)) || 8451 IS_LOOPBACK(ill) || 8452 !(ipif->ipif_flags & IPIF_UP)) && 8453 (lifn_flags & LIFC_EXTERNAL_SOURCE)) 8454 continue; 8455 8456 if (zoneid != ipif->ipif_zoneid && 8457 ipif->ipif_zoneid != ALL_ZONES && 8458 (zoneid != GLOBAL_ZONEID || 8459 !(lifn_flags & LIFC_ALLZONES))) 8460 continue; 8461 8462 numifs++; 8463 } 8464 } 8465 rw_exit(&ipst->ips_ill_g_lock); 8466 return (numifs); 8467 } 8468 8469 uint_t 8470 ip_get_lifsrcofnum(ill_t *ill) 8471 { 8472 uint_t numifs = 0; 8473 ill_t *ill_head = ill; 8474 ip_stack_t *ipst = ill->ill_ipst; 8475 8476 /* 8477 * ill_g_usesrc_lock protects ill_usesrc_grp_next, for example, some 8478 * other thread may be trying to relink the ILLs in this usesrc group 8479 * and adjusting the ill_usesrc_grp_next pointers 8480 */ 8481 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER); 8482 if ((ill->ill_usesrc_ifindex == 0) && 8483 (ill->ill_usesrc_grp_next != NULL)) { 8484 for (; (ill != NULL) && (ill->ill_usesrc_grp_next != ill_head); 8485 ill = ill->ill_usesrc_grp_next) 8486 numifs++; 8487 } 8488 
rw_exit(&ipst->ips_ill_g_usesrc_lock); 8489 8490 return (numifs); 8491 } 8492 8493 /* Null values are passed in for ipif, sin, and ifreq */ 8494 /* ARGSUSED */ 8495 int 8496 ip_sioctl_get_ifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8497 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8498 { 8499 int *nump; 8500 conn_t *connp = Q_TO_CONN(q); 8501 8502 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 8503 8504 /* Existence of b_cont->b_cont checked in ip_wput_nondata */ 8505 nump = (int *)mp->b_cont->b_cont->b_rptr; 8506 8507 *nump = ip_get_numifs(connp->conn_zoneid, 8508 connp->conn_netstack->netstack_ip); 8509 ip1dbg(("ip_sioctl_get_ifnum numifs %d", *nump)); 8510 return (0); 8511 } 8512 8513 /* Null values are passed in for ipif, sin, and ifreq */ 8514 /* ARGSUSED */ 8515 int 8516 ip_sioctl_get_lifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, 8517 queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8518 { 8519 struct lifnum *lifn; 8520 mblk_t *mp1; 8521 conn_t *connp = Q_TO_CONN(q); 8522 8523 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 8524 8525 /* Existence checked in ip_wput_nondata */ 8526 mp1 = mp->b_cont->b_cont; 8527 8528 lifn = (struct lifnum *)mp1->b_rptr; 8529 switch (lifn->lifn_family) { 8530 case AF_UNSPEC: 8531 case AF_INET: 8532 case AF_INET6: 8533 break; 8534 default: 8535 return (EAFNOSUPPORT); 8536 } 8537 8538 lifn->lifn_count = ip_get_numlifs(lifn->lifn_family, lifn->lifn_flags, 8539 connp->conn_zoneid, connp->conn_netstack->netstack_ip); 8540 ip1dbg(("ip_sioctl_get_lifnum numifs %d", lifn->lifn_count)); 8541 return (0); 8542 } 8543 8544 /* ARGSUSED */ 8545 int 8546 ip_sioctl_get_ifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8547 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8548 { 8549 STRUCT_HANDLE(ifconf, ifc); 8550 mblk_t *mp1; 8551 struct iocblk *iocp; 8552 struct ifreq *ifr; 8553 ill_walk_context_t ctx; 8554 ill_t *ill; 8555 ipif_t *ipif; 8556 struct sockaddr_in *sin; 
8557 int32_t ifclen; 8558 zoneid_t zoneid; 8559 ip_stack_t *ipst = CONNQ_TO_IPST(q); 8560 8561 ASSERT(q->q_next == NULL); /* not valid ioctls for ip as a module */ 8562 8563 ip1dbg(("ip_sioctl_get_ifconf")); 8564 /* Existence verified in ip_wput_nondata */ 8565 mp1 = mp->b_cont->b_cont; 8566 iocp = (struct iocblk *)mp->b_rptr; 8567 zoneid = Q_TO_CONN(q)->conn_zoneid; 8568 8569 /* 8570 * The original SIOCGIFCONF passed in a struct ifconf which specified 8571 * the user buffer address and length into which the list of struct 8572 * ifreqs was to be copied. Since AT&T Streams does not seem to 8573 * allow M_COPYOUT to be used in conjunction with I_STR IOCTLS, 8574 * the SIOCGIFCONF operation was redefined to simply provide 8575 * a large output buffer into which we are supposed to jam the ifreq 8576 * array. The same ioctl command code was used, despite the fact that 8577 * both the applications and the kernel code had to change, thus making 8578 * it impossible to support both interfaces. 8579 * 8580 * For reasons not good enough to try to explain, the following 8581 * algorithm is used for deciding what to do with one of these: 8582 * If the IOCTL comes in as an I_STR, it is assumed to be of the new 8583 * form with the output buffer coming down as the continuation message. 8584 * If it arrives as a TRANSPARENT IOCTL, it is assumed to be old style, 8585 * and we have to copy in the ifconf structure to find out how big the 8586 * output buffer is and where to copy out to. Sure no problem... 8587 * 8588 */ 8589 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, NULL); 8590 if ((mp1->b_wptr - mp1->b_rptr) == STRUCT_SIZE(ifc)) { 8591 int numifs = 0; 8592 size_t ifc_bufsize; 8593 8594 /* 8595 * Must be (better be!) continuation of a TRANSPARENT 8596 * IOCTL. We just copied in the ifconf structure. 8597 */ 8598 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, 8599 (struct ifconf *)mp1->b_rptr); 8600 8601 /* 8602 * Allocate a buffer to hold requested information. 
8603 * 8604 * If ifc_len is larger than what is needed, we only 8605 * allocate what we will use. 8606 * 8607 * If ifc_len is smaller than what is needed, return 8608 * EINVAL. 8609 * 8610 * XXX: the ill_t structure can hava 2 counters, for 8611 * v4 and v6 (not just ill_ipif_up_count) to store the 8612 * number of interfaces for a device, so we don't need 8613 * to count them here... 8614 */ 8615 numifs = ip_get_numifs(zoneid, ipst); 8616 8617 ifclen = STRUCT_FGET(ifc, ifc_len); 8618 ifc_bufsize = numifs * sizeof (struct ifreq); 8619 if (ifc_bufsize > ifclen) { 8620 if (iocp->ioc_cmd == O_SIOCGIFCONF) { 8621 /* old behaviour */ 8622 return (EINVAL); 8623 } else { 8624 ifc_bufsize = ifclen; 8625 } 8626 } 8627 8628 mp1 = mi_copyout_alloc(q, mp, 8629 STRUCT_FGETP(ifc, ifc_buf), ifc_bufsize, B_FALSE); 8630 if (mp1 == NULL) 8631 return (ENOMEM); 8632 8633 mp1->b_wptr = mp1->b_rptr + ifc_bufsize; 8634 } 8635 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr); 8636 /* 8637 * the SIOCGIFCONF ioctl only knows about 8638 * IPv4 addresses, so don't try to tell 8639 * it about interfaces with IPv6-only 8640 * addresses. 
(Last parm 'isv6' is B_FALSE) 8641 */ 8642 8643 ifr = (struct ifreq *)mp1->b_rptr; 8644 8645 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 8646 ill = ILL_START_WALK_V4(&ctx, ipst); 8647 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 8648 for (ipif = ill->ill_ipif; ipif != NULL; 8649 ipif = ipif->ipif_next) { 8650 if (zoneid != ipif->ipif_zoneid && 8651 ipif->ipif_zoneid != ALL_ZONES) 8652 continue; 8653 if ((uchar_t *)&ifr[1] > mp1->b_wptr) { 8654 if (iocp->ioc_cmd == O_SIOCGIFCONF) { 8655 /* old behaviour */ 8656 rw_exit(&ipst->ips_ill_g_lock); 8657 return (EINVAL); 8658 } else { 8659 goto if_copydone; 8660 } 8661 } 8662 ipif_get_name(ipif, ifr->ifr_name, 8663 sizeof (ifr->ifr_name)); 8664 sin = (sin_t *)&ifr->ifr_addr; 8665 *sin = sin_null; 8666 sin->sin_family = AF_INET; 8667 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 8668 ifr++; 8669 } 8670 } 8671 if_copydone: 8672 rw_exit(&ipst->ips_ill_g_lock); 8673 mp1->b_wptr = (uchar_t *)ifr; 8674 8675 if (STRUCT_BUF(ifc) != NULL) { 8676 STRUCT_FSET(ifc, ifc_len, 8677 (int)((uchar_t *)ifr - mp1->b_rptr)); 8678 } 8679 return (0); 8680 } 8681 8682 /* 8683 * Get the interfaces using the address hosted on the interface passed in, 8684 * as a source adddress 8685 */ 8686 /* ARGSUSED */ 8687 int 8688 ip_sioctl_get_lifsrcof(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8689 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8690 { 8691 mblk_t *mp1; 8692 ill_t *ill, *ill_head; 8693 ipif_t *ipif, *orig_ipif; 8694 int numlifs = 0; 8695 size_t lifs_bufsize, lifsmaxlen; 8696 struct lifreq *lifr; 8697 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8698 uint_t ifindex; 8699 zoneid_t zoneid; 8700 int err = 0; 8701 boolean_t isv6 = B_FALSE; 8702 struct sockaddr_in *sin; 8703 struct sockaddr_in6 *sin6; 8704 STRUCT_HANDLE(lifsrcof, lifs); 8705 ip_stack_t *ipst; 8706 8707 ipst = CONNQ_TO_IPST(q); 8708 8709 ASSERT(q->q_next == NULL); 8710 8711 zoneid = Q_TO_CONN(q)->conn_zoneid; 8712 8713 /* Existence verified in ip_wput_nondata */ 8714 
mp1 = mp->b_cont->b_cont; 8715 8716 /* 8717 * Must be (better be!) continuation of a TRANSPARENT 8718 * IOCTL. We just copied in the lifsrcof structure. 8719 */ 8720 STRUCT_SET_HANDLE(lifs, iocp->ioc_flag, 8721 (struct lifsrcof *)mp1->b_rptr); 8722 8723 if (MBLKL(mp1) != STRUCT_SIZE(lifs)) 8724 return (EINVAL); 8725 8726 ifindex = STRUCT_FGET(lifs, lifs_ifindex); 8727 isv6 = (Q_TO_CONN(q))->conn_af_isv6; 8728 ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid, q, mp, 8729 ip_process_ioctl, &err, ipst); 8730 if (ipif == NULL) { 8731 ip1dbg(("ip_sioctl_get_lifsrcof: no ipif for ifindex %d\n", 8732 ifindex)); 8733 return (err); 8734 } 8735 8736 8737 /* Allocate a buffer to hold requested information */ 8738 numlifs = ip_get_lifsrcofnum(ipif->ipif_ill); 8739 lifs_bufsize = numlifs * sizeof (struct lifreq); 8740 lifsmaxlen = STRUCT_FGET(lifs, lifs_maxlen); 8741 /* The actual size needed is always returned in lifs_len */ 8742 STRUCT_FSET(lifs, lifs_len, lifs_bufsize); 8743 8744 /* If the amount we need is more than what is passed in, abort */ 8745 if (lifs_bufsize > lifsmaxlen || lifs_bufsize == 0) { 8746 ipif_refrele(ipif); 8747 return (0); 8748 } 8749 8750 mp1 = mi_copyout_alloc(q, mp, 8751 STRUCT_FGETP(lifs, lifs_buf), lifs_bufsize, B_FALSE); 8752 if (mp1 == NULL) { 8753 ipif_refrele(ipif); 8754 return (ENOMEM); 8755 } 8756 8757 mp1->b_wptr = mp1->b_rptr + lifs_bufsize; 8758 bzero(mp1->b_rptr, lifs_bufsize); 8759 8760 lifr = (struct lifreq *)mp1->b_rptr; 8761 8762 ill = ill_head = ipif->ipif_ill; 8763 orig_ipif = ipif; 8764 8765 /* ill_g_usesrc_lock protects ill_usesrc_grp_next */ 8766 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER); 8767 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 8768 8769 ill = ill->ill_usesrc_grp_next; /* start from next ill */ 8770 for (; (ill != NULL) && (ill != ill_head); 8771 ill = ill->ill_usesrc_grp_next) { 8772 8773 if ((uchar_t *)&lifr[1] > mp1->b_wptr) 8774 break; 8775 8776 ipif = ill->ill_ipif; 8777 ipif_get_name(ipif, 
lifr->lifr_name, sizeof (lifr->lifr_name)); 8778 if (ipif->ipif_isv6) { 8779 sin6 = (sin6_t *)&lifr->lifr_addr; 8780 *sin6 = sin6_null; 8781 sin6->sin6_family = AF_INET6; 8782 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 8783 lifr->lifr_addrlen = ip_mask_to_plen_v6( 8784 &ipif->ipif_v6net_mask); 8785 } else { 8786 sin = (sin_t *)&lifr->lifr_addr; 8787 *sin = sin_null; 8788 sin->sin_family = AF_INET; 8789 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 8790 lifr->lifr_addrlen = ip_mask_to_plen( 8791 ipif->ipif_net_mask); 8792 } 8793 lifr++; 8794 } 8795 rw_exit(&ipst->ips_ill_g_usesrc_lock); 8796 rw_exit(&ipst->ips_ill_g_lock); 8797 ipif_refrele(orig_ipif); 8798 mp1->b_wptr = (uchar_t *)lifr; 8799 STRUCT_FSET(lifs, lifs_len, (int)((uchar_t *)lifr - mp1->b_rptr)); 8800 8801 return (0); 8802 } 8803 8804 /* ARGSUSED */ 8805 int 8806 ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8807 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8808 { 8809 mblk_t *mp1; 8810 int list; 8811 ill_t *ill; 8812 ipif_t *ipif; 8813 int flags; 8814 int numlifs = 0; 8815 size_t lifc_bufsize; 8816 struct lifreq *lifr; 8817 sa_family_t family; 8818 struct sockaddr_in *sin; 8819 struct sockaddr_in6 *sin6; 8820 ill_walk_context_t ctx; 8821 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8822 int32_t lifclen; 8823 zoneid_t zoneid; 8824 STRUCT_HANDLE(lifconf, lifc); 8825 ip_stack_t *ipst = CONNQ_TO_IPST(q); 8826 8827 ip1dbg(("ip_sioctl_get_lifconf")); 8828 8829 ASSERT(q->q_next == NULL); 8830 8831 zoneid = Q_TO_CONN(q)->conn_zoneid; 8832 8833 /* Existence verified in ip_wput_nondata */ 8834 mp1 = mp->b_cont->b_cont; 8835 8836 /* 8837 * An extended version of SIOCGIFCONF that takes an 8838 * additional address family and flags field. 8839 * AF_UNSPEC retrieve both IPv4 and IPv6. 8840 * Unless LIFC_NOXMIT is specified the IPIF_NOXMIT 8841 * interfaces are omitted. 8842 * Similarly, IPIF_TEMPORARY interfaces are omitted 8843 * unless LIFC_TEMPORARY is specified. 
8844 * If LIFC_EXTERNAL_SOURCE is specified, IPIF_NOXMIT, 8845 * IPIF_NOLOCAL, PHYI_LOOPBACK, IPIF_DEPRECATED and 8846 * not IPIF_UP interfaces are omitted. LIFC_EXTERNAL_SOURCE 8847 * has priority over LIFC_NOXMIT. 8848 */ 8849 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, NULL); 8850 8851 if ((mp1->b_wptr - mp1->b_rptr) != STRUCT_SIZE(lifc)) 8852 return (EINVAL); 8853 8854 /* 8855 * Must be (better be!) continuation of a TRANSPARENT 8856 * IOCTL. We just copied in the lifconf structure. 8857 */ 8858 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, (struct lifconf *)mp1->b_rptr); 8859 8860 family = STRUCT_FGET(lifc, lifc_family); 8861 flags = STRUCT_FGET(lifc, lifc_flags); 8862 8863 switch (family) { 8864 case AF_UNSPEC: 8865 /* 8866 * walk all ILL's. 8867 */ 8868 list = MAX_G_HEADS; 8869 break; 8870 case AF_INET: 8871 /* 8872 * walk only IPV4 ILL's. 8873 */ 8874 list = IP_V4_G_HEAD; 8875 break; 8876 case AF_INET6: 8877 /* 8878 * walk only IPV6 ILL's. 8879 */ 8880 list = IP_V6_G_HEAD; 8881 break; 8882 default: 8883 return (EAFNOSUPPORT); 8884 } 8885 8886 /* 8887 * Allocate a buffer to hold requested information. 8888 * 8889 * If lifc_len is larger than what is needed, we only 8890 * allocate what we will use. 8891 * 8892 * If lifc_len is smaller than what is needed, return 8893 * EINVAL. 
8894 */ 8895 numlifs = ip_get_numlifs(family, flags, zoneid, ipst); 8896 lifc_bufsize = numlifs * sizeof (struct lifreq); 8897 lifclen = STRUCT_FGET(lifc, lifc_len); 8898 if (lifc_bufsize > lifclen) { 8899 if (iocp->ioc_cmd == O_SIOCGLIFCONF) 8900 return (EINVAL); 8901 else 8902 lifc_bufsize = lifclen; 8903 } 8904 8905 mp1 = mi_copyout_alloc(q, mp, 8906 STRUCT_FGETP(lifc, lifc_buf), lifc_bufsize, B_FALSE); 8907 if (mp1 == NULL) 8908 return (ENOMEM); 8909 8910 mp1->b_wptr = mp1->b_rptr + lifc_bufsize; 8911 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr); 8912 8913 lifr = (struct lifreq *)mp1->b_rptr; 8914 8915 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 8916 ill = ill_first(list, list, &ctx, ipst); 8917 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 8918 for (ipif = ill->ill_ipif; ipif != NULL; 8919 ipif = ipif->ipif_next) { 8920 if ((ipif->ipif_flags & IPIF_NOXMIT) && 8921 !(flags & LIFC_NOXMIT)) 8922 continue; 8923 8924 if ((ipif->ipif_flags & IPIF_TEMPORARY) && 8925 !(flags & LIFC_TEMPORARY)) 8926 continue; 8927 8928 if (((ipif->ipif_flags & 8929 (IPIF_NOXMIT|IPIF_NOLOCAL| 8930 IPIF_DEPRECATED)) || 8931 IS_LOOPBACK(ill) || 8932 !(ipif->ipif_flags & IPIF_UP)) && 8933 (flags & LIFC_EXTERNAL_SOURCE)) 8934 continue; 8935 8936 if (zoneid != ipif->ipif_zoneid && 8937 ipif->ipif_zoneid != ALL_ZONES && 8938 (zoneid != GLOBAL_ZONEID || 8939 !(flags & LIFC_ALLZONES))) 8940 continue; 8941 8942 if ((uchar_t *)&lifr[1] > mp1->b_wptr) { 8943 if (iocp->ioc_cmd == O_SIOCGLIFCONF) { 8944 rw_exit(&ipst->ips_ill_g_lock); 8945 return (EINVAL); 8946 } else { 8947 goto lif_copydone; 8948 } 8949 } 8950 8951 ipif_get_name(ipif, lifr->lifr_name, 8952 sizeof (lifr->lifr_name)); 8953 if (ipif->ipif_isv6) { 8954 sin6 = (sin6_t *)&lifr->lifr_addr; 8955 *sin6 = sin6_null; 8956 sin6->sin6_family = AF_INET6; 8957 sin6->sin6_addr = 8958 ipif->ipif_v6lcl_addr; 8959 lifr->lifr_addrlen = 8960 ip_mask_to_plen_v6( 8961 &ipif->ipif_v6net_mask); 8962 } else { 8963 sin = (sin_t *)&lifr->lifr_addr; 8964 
*sin = sin_null; 8965 sin->sin_family = AF_INET; 8966 sin->sin_addr.s_addr = 8967 ipif->ipif_lcl_addr; 8968 lifr->lifr_addrlen = 8969 ip_mask_to_plen( 8970 ipif->ipif_net_mask); 8971 } 8972 lifr++; 8973 } 8974 } 8975 lif_copydone: 8976 rw_exit(&ipst->ips_ill_g_lock); 8977 8978 mp1->b_wptr = (uchar_t *)lifr; 8979 if (STRUCT_BUF(lifc) != NULL) { 8980 STRUCT_FSET(lifc, lifc_len, 8981 (int)((uchar_t *)lifr - mp1->b_rptr)); 8982 } 8983 return (0); 8984 } 8985 8986 /* ARGSUSED */ 8987 int 8988 ip_sioctl_set_ipmpfailback(ipif_t *dummy_ipif, sin_t *dummy_sin, 8989 queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8990 { 8991 ip_stack_t *ipst; 8992 8993 if (q->q_next == NULL) 8994 ipst = CONNQ_TO_IPST(q); 8995 else 8996 ipst = ILLQ_TO_IPST(q); 8997 8998 /* Existence of b_cont->b_cont checked in ip_wput_nondata */ 8999 ipst->ips_ipmp_enable_failback = *(int *)mp->b_cont->b_cont->b_rptr; 9000 return (0); 9001 } 9002 9003 static void 9004 ip_sioctl_ip6addrpolicy(queue_t *q, mblk_t *mp) 9005 { 9006 ip6_asp_t *table; 9007 size_t table_size; 9008 mblk_t *data_mp; 9009 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 9010 ip_stack_t *ipst; 9011 9012 if (q->q_next == NULL) 9013 ipst = CONNQ_TO_IPST(q); 9014 else 9015 ipst = ILLQ_TO_IPST(q); 9016 9017 /* These two ioctls are I_STR only */ 9018 if (iocp->ioc_count == TRANSPARENT) { 9019 miocnak(q, mp, 0, EINVAL); 9020 return; 9021 } 9022 9023 data_mp = mp->b_cont; 9024 if (data_mp == NULL) { 9025 /* The user passed us a NULL argument */ 9026 table = NULL; 9027 table_size = iocp->ioc_count; 9028 } else { 9029 /* 9030 * The user provided a table. The stream head 9031 * may have copied in the user data in chunks, 9032 * so make sure everything is pulled up 9033 * properly. 
9034 */ 9035 if (MBLKL(data_mp) < iocp->ioc_count) { 9036 mblk_t *new_data_mp; 9037 if ((new_data_mp = msgpullup(data_mp, -1)) == 9038 NULL) { 9039 miocnak(q, mp, 0, ENOMEM); 9040 return; 9041 } 9042 freemsg(data_mp); 9043 data_mp = new_data_mp; 9044 mp->b_cont = data_mp; 9045 } 9046 table = (ip6_asp_t *)data_mp->b_rptr; 9047 table_size = iocp->ioc_count; 9048 } 9049 9050 switch (iocp->ioc_cmd) { 9051 case SIOCGIP6ADDRPOLICY: 9052 iocp->ioc_rval = ip6_asp_get(table, table_size, ipst); 9053 if (iocp->ioc_rval == -1) 9054 iocp->ioc_error = EINVAL; 9055 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4 9056 else if (table != NULL && 9057 (iocp->ioc_flag & IOC_MODELS) == IOC_ILP32) { 9058 ip6_asp_t *src = table; 9059 ip6_asp32_t *dst = (void *)table; 9060 int count = table_size / sizeof (ip6_asp_t); 9061 int i; 9062 9063 /* 9064 * We need to do an in-place shrink of the array 9065 * to match the alignment attributes of the 9066 * 32-bit ABI looking at it. 9067 */ 9068 /* LINTED: logical expression always true: op "||" */ 9069 ASSERT(sizeof (*src) > sizeof (*dst)); 9070 for (i = 1; i < count; i++) 9071 bcopy(src + i, dst + i, sizeof (*dst)); 9072 } 9073 #endif 9074 break; 9075 9076 case SIOCSIP6ADDRPOLICY: 9077 ASSERT(mp->b_prev == NULL); 9078 mp->b_prev = (void *)q; 9079 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4 9080 /* 9081 * We pass in the datamodel here so that the ip6_asp_replace() 9082 * routine can handle converting from 32-bit to native formats 9083 * where necessary. 9084 * 9085 * A better way to handle this might be to convert the inbound 9086 * data structure here, and hang it off a new 'mp'; thus the 9087 * ip6_asp_replace() logic would always be dealing with native 9088 * format data structures.. 9089 * 9090 * (An even simpler way to handle these ioctls is to just 9091 * add a 32-bit trailing 'pad' field to the ip6_asp_t structure 9092 * and just recompile everything that depends on it.) 
9093 */ 9094 #endif 9095 ip6_asp_replace(mp, table, table_size, B_FALSE, ipst, 9096 iocp->ioc_flag & IOC_MODELS); 9097 return; 9098 } 9099 9100 DB_TYPE(mp) = (iocp->ioc_error == 0) ? M_IOCACK : M_IOCNAK; 9101 qreply(q, mp); 9102 } 9103 9104 static void 9105 ip_sioctl_dstinfo(queue_t *q, mblk_t *mp) 9106 { 9107 mblk_t *data_mp; 9108 struct dstinforeq *dir; 9109 uint8_t *end, *cur; 9110 in6_addr_t *daddr, *saddr; 9111 ipaddr_t v4daddr; 9112 ire_t *ire; 9113 char *slabel, *dlabel; 9114 boolean_t isipv4; 9115 int match_ire; 9116 ill_t *dst_ill; 9117 ipif_t *src_ipif, *ire_ipif; 9118 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 9119 zoneid_t zoneid; 9120 ip_stack_t *ipst = CONNQ_TO_IPST(q); 9121 9122 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 9123 zoneid = Q_TO_CONN(q)->conn_zoneid; 9124 9125 /* 9126 * This ioctl is I_STR only, and must have a 9127 * data mblk following the M_IOCTL mblk. 9128 */ 9129 data_mp = mp->b_cont; 9130 if (iocp->ioc_count == TRANSPARENT || data_mp == NULL) { 9131 miocnak(q, mp, 0, EINVAL); 9132 return; 9133 } 9134 9135 if (MBLKL(data_mp) < iocp->ioc_count) { 9136 mblk_t *new_data_mp; 9137 9138 if ((new_data_mp = msgpullup(data_mp, -1)) == NULL) { 9139 miocnak(q, mp, 0, ENOMEM); 9140 return; 9141 } 9142 freemsg(data_mp); 9143 data_mp = new_data_mp; 9144 mp->b_cont = data_mp; 9145 } 9146 match_ire = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_PARENT; 9147 9148 for (cur = data_mp->b_rptr, end = data_mp->b_wptr; 9149 end - cur >= sizeof (struct dstinforeq); 9150 cur += sizeof (struct dstinforeq)) { 9151 dir = (struct dstinforeq *)cur; 9152 daddr = &dir->dir_daddr; 9153 saddr = &dir->dir_saddr; 9154 9155 /* 9156 * ip_addr_scope_v6() and ip6_asp_lookup() handle 9157 * v4 mapped addresses; ire_ftable_lookup[_v6]() 9158 * and ipif_select_source[_v6]() do not. 
9159 */ 9160 dir->dir_dscope = ip_addr_scope_v6(daddr); 9161 dlabel = ip6_asp_lookup(daddr, &dir->dir_precedence, ipst); 9162 9163 isipv4 = IN6_IS_ADDR_V4MAPPED(daddr); 9164 if (isipv4) { 9165 IN6_V4MAPPED_TO_IPADDR(daddr, v4daddr); 9166 ire = ire_ftable_lookup(v4daddr, NULL, NULL, 9167 0, NULL, NULL, zoneid, 0, NULL, match_ire, ipst); 9168 } else { 9169 ire = ire_ftable_lookup_v6(daddr, NULL, NULL, 9170 0, NULL, NULL, zoneid, 0, NULL, match_ire, ipst); 9171 } 9172 if (ire == NULL) { 9173 dir->dir_dreachable = 0; 9174 9175 /* move on to next dst addr */ 9176 continue; 9177 } 9178 dir->dir_dreachable = 1; 9179 9180 ire_ipif = ire->ire_ipif; 9181 if (ire_ipif == NULL) 9182 goto next_dst; 9183 9184 /* 9185 * We expect to get back an interface ire or a 9186 * gateway ire cache entry. For both types, the 9187 * output interface is ire_ipif->ipif_ill. 9188 */ 9189 dst_ill = ire_ipif->ipif_ill; 9190 dir->dir_dmactype = dst_ill->ill_mactype; 9191 9192 if (isipv4) { 9193 src_ipif = ipif_select_source(dst_ill, v4daddr, zoneid); 9194 } else { 9195 src_ipif = ipif_select_source_v6(dst_ill, 9196 daddr, RESTRICT_TO_NONE, IPV6_PREFER_SRC_DEFAULT, 9197 zoneid); 9198 } 9199 if (src_ipif == NULL) 9200 goto next_dst; 9201 9202 *saddr = src_ipif->ipif_v6lcl_addr; 9203 dir->dir_sscope = ip_addr_scope_v6(saddr); 9204 slabel = ip6_asp_lookup(saddr, NULL, ipst); 9205 dir->dir_labelmatch = ip6_asp_labelcmp(dlabel, slabel); 9206 dir->dir_sdeprecated = 9207 (src_ipif->ipif_flags & IPIF_DEPRECATED) ? 1 : 0; 9208 ipif_refrele(src_ipif); 9209 next_dst: 9210 ire_refrele(ire); 9211 } 9212 miocack(q, mp, iocp->ioc_count, 0); 9213 } 9214 9215 9216 /* 9217 * Check if this is an address assigned to this machine. 9218 * Skips interfaces that are down by using ire checks. 9219 * Translates mapped addresses to v4 addresses and then 9220 * treats them as such, returning true if the v4 address 9221 * associated with this mapped address is configured. 
9222 * Note: Applications will have to be careful what they do 9223 * with the response; use of mapped addresses limits 9224 * what can be done with the socket, especially with 9225 * respect to socket options and ioctls - neither IPv4 9226 * options nor IPv6 sticky options/ancillary data options 9227 * may be used. 9228 */ 9229 /* ARGSUSED */ 9230 int 9231 ip_sioctl_tmyaddr(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9232 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 9233 { 9234 struct sioc_addrreq *sia; 9235 sin_t *sin; 9236 ire_t *ire; 9237 mblk_t *mp1; 9238 zoneid_t zoneid; 9239 ip_stack_t *ipst; 9240 9241 ip1dbg(("ip_sioctl_tmyaddr")); 9242 9243 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 9244 zoneid = Q_TO_CONN(q)->conn_zoneid; 9245 ipst = CONNQ_TO_IPST(q); 9246 9247 /* Existence verified in ip_wput_nondata */ 9248 mp1 = mp->b_cont->b_cont; 9249 sia = (struct sioc_addrreq *)mp1->b_rptr; 9250 sin = (sin_t *)&sia->sa_addr; 9251 switch (sin->sin_family) { 9252 case AF_INET6: { 9253 sin6_t *sin6 = (sin6_t *)sin; 9254 9255 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 9256 ipaddr_t v4_addr; 9257 9258 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, 9259 v4_addr); 9260 ire = ire_ctable_lookup(v4_addr, 0, 9261 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 9262 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst); 9263 } else { 9264 in6_addr_t v6addr; 9265 9266 v6addr = sin6->sin6_addr; 9267 ire = ire_ctable_lookup_v6(&v6addr, 0, 9268 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 9269 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst); 9270 } 9271 break; 9272 } 9273 case AF_INET: { 9274 ipaddr_t v4addr; 9275 9276 v4addr = sin->sin_addr.s_addr; 9277 ire = ire_ctable_lookup(v4addr, 0, 9278 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 9279 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst); 9280 break; 9281 } 9282 default: 9283 return (EAFNOSUPPORT); 9284 } 9285 if (ire != NULL) { 9286 sia->sa_res = 1; 9287 ire_refrele(ire); 9288 } else { 9289 sia->sa_res = 0; 9290 } 
9291 return (0); 9292 } 9293 9294 /* 9295 * Check if this is an address assigned on-link i.e. neighbor, 9296 * and makes sure it's reachable from the current zone. 9297 * Returns true for my addresses as well. 9298 * Translates mapped addresses to v4 addresses and then 9299 * treats them as such, returning true if the v4 address 9300 * associated with this mapped address is configured. 9301 * Note: Applications will have to be careful what they do 9302 * with the response; use of mapped addresses limits 9303 * what can be done with the socket, especially with 9304 * respect to socket options and ioctls - neither IPv4 9305 * options nor IPv6 sticky options/ancillary data options 9306 * may be used. 9307 */ 9308 /* ARGSUSED */ 9309 int 9310 ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9311 ip_ioctl_cmd_t *ipip, void *duymmy_ifreq) 9312 { 9313 struct sioc_addrreq *sia; 9314 sin_t *sin; 9315 mblk_t *mp1; 9316 ire_t *ire = NULL; 9317 zoneid_t zoneid; 9318 ip_stack_t *ipst; 9319 9320 ip1dbg(("ip_sioctl_tonlink")); 9321 9322 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 9323 zoneid = Q_TO_CONN(q)->conn_zoneid; 9324 ipst = CONNQ_TO_IPST(q); 9325 9326 /* Existence verified in ip_wput_nondata */ 9327 mp1 = mp->b_cont->b_cont; 9328 sia = (struct sioc_addrreq *)mp1->b_rptr; 9329 sin = (sin_t *)&sia->sa_addr; 9330 9331 /* 9332 * Match addresses with a zero gateway field to avoid 9333 * routes going through a router. 9334 * Exclude broadcast and multicast addresses. 
9335 */ 9336 switch (sin->sin_family) { 9337 case AF_INET6: { 9338 sin6_t *sin6 = (sin6_t *)sin; 9339 9340 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 9341 ipaddr_t v4_addr; 9342 9343 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, 9344 v4_addr); 9345 if (!CLASSD(v4_addr)) { 9346 ire = ire_route_lookup(v4_addr, 0, 0, 0, 9347 NULL, NULL, zoneid, NULL, 9348 MATCH_IRE_GW, ipst); 9349 } 9350 } else { 9351 in6_addr_t v6addr; 9352 in6_addr_t v6gw; 9353 9354 v6addr = sin6->sin6_addr; 9355 v6gw = ipv6_all_zeros; 9356 if (!IN6_IS_ADDR_MULTICAST(&v6addr)) { 9357 ire = ire_route_lookup_v6(&v6addr, 0, 9358 &v6gw, 0, NULL, NULL, zoneid, 9359 NULL, MATCH_IRE_GW, ipst); 9360 } 9361 } 9362 break; 9363 } 9364 case AF_INET: { 9365 ipaddr_t v4addr; 9366 9367 v4addr = sin->sin_addr.s_addr; 9368 if (!CLASSD(v4addr)) { 9369 ire = ire_route_lookup(v4addr, 0, 0, 0, 9370 NULL, NULL, zoneid, NULL, 9371 MATCH_IRE_GW, ipst); 9372 } 9373 break; 9374 } 9375 default: 9376 return (EAFNOSUPPORT); 9377 } 9378 sia->sa_res = 0; 9379 if (ire != NULL) { 9380 if (ire->ire_type & (IRE_INTERFACE|IRE_CACHE| 9381 IRE_LOCAL|IRE_LOOPBACK)) { 9382 sia->sa_res = 1; 9383 } 9384 ire_refrele(ire); 9385 } 9386 return (0); 9387 } 9388 9389 /* 9390 * TBD: implement when kernel maintaines a list of site prefixes. 
9391 */ 9392 /* ARGSUSED */ 9393 int 9394 ip_sioctl_tmysite(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9395 ip_ioctl_cmd_t *ipip, void *ifreq) 9396 { 9397 return (ENXIO); 9398 } 9399 9400 /* ARGSUSED */ 9401 int 9402 ip_sioctl_tunparam(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9403 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 9404 { 9405 ill_t *ill; 9406 mblk_t *mp1; 9407 conn_t *connp; 9408 boolean_t success; 9409 9410 ip1dbg(("ip_sioctl_tunparam(%s:%u %p)\n", 9411 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9412 /* ioctl comes down on an conn */ 9413 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 9414 connp = Q_TO_CONN(q); 9415 9416 mp->b_datap->db_type = M_IOCTL; 9417 9418 /* 9419 * Send down a copy. (copymsg does not copy b_next/b_prev). 9420 * The original mp contains contaminated b_next values due to 'mi', 9421 * which is needed to do the mi_copy_done. Unfortunately if we 9422 * send down the original mblk itself and if we are popped due to an 9423 * an unplumb before the response comes back from tunnel, 9424 * the streamhead (which does a freemsg) will see this contaminated 9425 * message and the assertion in freemsg about non-null b_next/b_prev 9426 * will panic a DEBUG kernel. 9427 */ 9428 mp1 = copymsg(mp); 9429 if (mp1 == NULL) 9430 return (ENOMEM); 9431 9432 ill = ipif->ipif_ill; 9433 mutex_enter(&connp->conn_lock); 9434 mutex_enter(&ill->ill_lock); 9435 if (ipip->ipi_cmd == SIOCSTUNPARAM || ipip->ipi_cmd == OSIOCSTUNPARAM) { 9436 success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), 9437 mp, 0); 9438 } else { 9439 success = ill_pending_mp_add(ill, connp, mp); 9440 } 9441 mutex_exit(&ill->ill_lock); 9442 mutex_exit(&connp->conn_lock); 9443 9444 if (success) { 9445 ip1dbg(("sending down tunparam request ")); 9446 putnext(ill->ill_wq, mp1); 9447 return (EINPROGRESS); 9448 } else { 9449 /* The conn has started closing */ 9450 freemsg(mp1); 9451 return (EINTR); 9452 } 9453 } 9454 9455 /* 9456 * ARP IOCTLs. 
9457 * How does IP get in the business of fronting ARP configuration/queries? 9458 * Well it's like this, the Berkeley ARP IOCTLs (SIOCGARP, SIOCDARP, SIOCSARP) 9459 * are by tradition passed in through a datagram socket. That lands in IP. 9460 * As it happens, this is just as well since the interface is quite crude in 9461 * that it passes in no information about protocol or hardware types, or 9462 * interface association. After making the protocol assumption, IP is in 9463 * the position to look up the name of the ILL, which ARP will need, and 9464 * format a request that can be handled by ARP. The request is passed up 9465 * stream to ARP, and the original IOCTL is completed by IP when ARP passes 9466 * back a response. ARP supports its own set of more general IOCTLs, in 9467 * case anyone is interested. 9468 */ 9469 /* ARGSUSED */ 9470 int 9471 ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9472 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 9473 { 9474 mblk_t *mp1; 9475 mblk_t *mp2; 9476 mblk_t *pending_mp; 9477 ipaddr_t ipaddr; 9478 area_t *area; 9479 struct iocblk *iocp; 9480 conn_t *connp; 9481 struct arpreq *ar; 9482 struct xarpreq *xar; 9483 int flags, alength; 9484 char *lladdr; 9485 ip_stack_t *ipst; 9486 ill_t *ill = ipif->ipif_ill; 9487 boolean_t if_arp_ioctl = B_FALSE; 9488 9489 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 9490 connp = Q_TO_CONN(q); 9491 ipst = connp->conn_netstack->netstack_ip; 9492 9493 if (ipip->ipi_cmd_type == XARP_CMD) { 9494 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->XARPREQ_MBLK */ 9495 xar = (struct xarpreq *)mp->b_cont->b_cont->b_rptr; 9496 ar = NULL; 9497 9498 flags = xar->xarp_flags; 9499 lladdr = LLADDR(&xar->xarp_ha); 9500 if_arp_ioctl = (xar->xarp_ha.sdl_nlen != 0); 9501 /* 9502 * Validate against user's link layer address length 9503 * input and name and addr length limits. 
9504 */ 9505 alength = ill->ill_phys_addr_length; 9506 if (ipip->ipi_cmd == SIOCSXARP) { 9507 if (alength != xar->xarp_ha.sdl_alen || 9508 (alength + xar->xarp_ha.sdl_nlen > 9509 sizeof (xar->xarp_ha.sdl_data))) 9510 return (EINVAL); 9511 } 9512 } else { 9513 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->ARPREQ_MBLK */ 9514 ar = (struct arpreq *)mp->b_cont->b_cont->b_rptr; 9515 xar = NULL; 9516 9517 flags = ar->arp_flags; 9518 lladdr = ar->arp_ha.sa_data; 9519 /* 9520 * Theoretically, the sa_family could tell us what link 9521 * layer type this operation is trying to deal with. By 9522 * common usage AF_UNSPEC means ethernet. We'll assume 9523 * any attempt to use the SIOC?ARP ioctls is for ethernet, 9524 * for now. Our new SIOC*XARP ioctls can be used more 9525 * generally. 9526 * 9527 * If the underlying media happens to have a non 6 byte 9528 * address, arp module will fail set/get, but the del 9529 * operation will succeed. 9530 */ 9531 alength = 6; 9532 if ((ipip->ipi_cmd != SIOCDARP) && 9533 (alength != ill->ill_phys_addr_length)) { 9534 return (EINVAL); 9535 } 9536 } 9537 9538 /* 9539 * We are going to pass up to ARP a packet chain that looks 9540 * like: 9541 * 9542 * M_IOCTL-->ARP_op_MBLK-->ORIG_M_IOCTL-->MI_COPY_MBLK-->[X]ARPREQ_MBLK 9543 * 9544 * Get a copy of the original IOCTL mblk to head the chain, 9545 * to be sent up (in mp1). Also get another copy to store 9546 * in the ill_pending_mp list, for matching the response 9547 * when it comes back from ARP. 9548 */ 9549 mp1 = copyb(mp); 9550 pending_mp = copymsg(mp); 9551 if (mp1 == NULL || pending_mp == NULL) { 9552 if (mp1 != NULL) 9553 freeb(mp1); 9554 if (pending_mp != NULL) 9555 inet_freemsg(pending_mp); 9556 return (ENOMEM); 9557 } 9558 9559 ipaddr = sin->sin_addr.s_addr; 9560 9561 mp2 = ill_arp_alloc(ill, (uchar_t *)&ip_area_template, 9562 (caddr_t)&ipaddr); 9563 if (mp2 == NULL) { 9564 freeb(mp1); 9565 inet_freemsg(pending_mp); 9566 return (ENOMEM); 9567 } 9568 /* Put together the chain. 
*/ 9569 mp1->b_cont = mp2; 9570 mp1->b_datap->db_type = M_IOCTL; 9571 mp2->b_cont = mp; 9572 mp2->b_datap->db_type = M_DATA; 9573 9574 iocp = (struct iocblk *)mp1->b_rptr; 9575 9576 /* 9577 * An M_IOCDATA's payload (struct copyresp) is mostly the same as an 9578 * M_IOCTL's payload (struct iocblk), but 'struct copyresp' has a 9579 * cp_private field (or cp_rval on 32-bit systems) in place of the 9580 * ioc_count field; set ioc_count to be correct. 9581 */ 9582 iocp->ioc_count = MBLKL(mp1->b_cont); 9583 9584 /* 9585 * Set the proper command in the ARP message. 9586 * Convert the SIOC{G|S|D}ARP calls into our 9587 * AR_ENTRY_xxx calls. 9588 */ 9589 area = (area_t *)mp2->b_rptr; 9590 switch (iocp->ioc_cmd) { 9591 case SIOCDARP: 9592 case SIOCDXARP: 9593 /* 9594 * We defer deleting the corresponding IRE until 9595 * we return from arp. 9596 */ 9597 area->area_cmd = AR_ENTRY_DELETE; 9598 area->area_proto_mask_offset = 0; 9599 break; 9600 case SIOCGARP: 9601 case SIOCGXARP: 9602 area->area_cmd = AR_ENTRY_SQUERY; 9603 area->area_proto_mask_offset = 0; 9604 break; 9605 case SIOCSARP: 9606 case SIOCSXARP: 9607 /* 9608 * Delete the corresponding ire to make sure IP will 9609 * pick up any change from arp. 9610 */ 9611 if (!if_arp_ioctl) { 9612 (void) ip_ire_clookup_and_delete(ipaddr, NULL, ipst); 9613 } else { 9614 ipif_t *ipif = ipif_get_next_ipif(NULL, ill); 9615 if (ipif != NULL) { 9616 (void) ip_ire_clookup_and_delete(ipaddr, ipif, 9617 ipst); 9618 ipif_refrele(ipif); 9619 } 9620 } 9621 break; 9622 } 9623 iocp->ioc_cmd = area->area_cmd; 9624 9625 /* 9626 * Fill in the rest of the ARP operation fields. 9627 */ 9628 area->area_hw_addr_length = alength; 9629 bcopy(lladdr, (char *)area + area->area_hw_addr_offset, alength); 9630 9631 /* Translate the flags. 
*/ 9632 if (flags & ATF_PERM) 9633 area->area_flags |= ACE_F_PERMANENT; 9634 if (flags & ATF_PUBL) 9635 area->area_flags |= ACE_F_PUBLISH; 9636 if (flags & ATF_AUTHORITY) 9637 area->area_flags |= ACE_F_AUTHORITY; 9638 9639 /* 9640 * Before sending 'mp' to ARP, we have to clear the b_next 9641 * and b_prev. Otherwise if STREAMS encounters such a message 9642 * in freemsg(), (because ARP can close any time) it can cause 9643 * a panic. But mi code needs the b_next and b_prev values of 9644 * mp->b_cont, to complete the ioctl. So we store it here 9645 * in pending_mp->bcont, and restore it in ip_sioctl_iocack() 9646 * when the response comes down from ARP. 9647 */ 9648 pending_mp->b_cont->b_next = mp->b_cont->b_next; 9649 pending_mp->b_cont->b_prev = mp->b_cont->b_prev; 9650 mp->b_cont->b_next = NULL; 9651 mp->b_cont->b_prev = NULL; 9652 9653 mutex_enter(&connp->conn_lock); 9654 mutex_enter(&ill->ill_lock); 9655 /* conn has not yet started closing, hence this can't fail */ 9656 VERIFY(ill_pending_mp_add(ill, connp, pending_mp) != 0); 9657 mutex_exit(&ill->ill_lock); 9658 mutex_exit(&connp->conn_lock); 9659 9660 /* 9661 * Up to ARP it goes. The response will come back in ip_wput() as an 9662 * M_IOCACK, and will be handed to ip_sioctl_iocack() for completion. 9663 */ 9664 putnext(ill->ill_rq, mp1); 9665 return (EINPROGRESS); 9666 } 9667 9668 /* 9669 * Parse an [x]arpreq structure coming down SIOC[GSD][X]ARP ioctls, identify 9670 * the associated sin and refhold and return the associated ipif via `ci'. 
9671 */ 9672 int 9673 ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, 9674 cmd_info_t *ci, ipsq_func_t func) 9675 { 9676 mblk_t *mp1; 9677 int err; 9678 sin_t *sin; 9679 conn_t *connp; 9680 ipif_t *ipif; 9681 ire_t *ire = NULL; 9682 ill_t *ill = NULL; 9683 boolean_t exists; 9684 ip_stack_t *ipst; 9685 struct arpreq *ar; 9686 struct xarpreq *xar; 9687 struct sockaddr_dl *sdl; 9688 9689 /* ioctl comes down on a conn */ 9690 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 9691 connp = Q_TO_CONN(q); 9692 if (connp->conn_af_isv6) 9693 return (ENXIO); 9694 9695 ipst = connp->conn_netstack->netstack_ip; 9696 9697 /* Verified in ip_wput_nondata */ 9698 mp1 = mp->b_cont->b_cont; 9699 9700 if (ipip->ipi_cmd_type == XARP_CMD) { 9701 ASSERT(MBLKL(mp1) >= sizeof (struct xarpreq)); 9702 xar = (struct xarpreq *)mp1->b_rptr; 9703 sin = (sin_t *)&xar->xarp_pa; 9704 sdl = &xar->xarp_ha; 9705 9706 if (sdl->sdl_family != AF_LINK || sin->sin_family != AF_INET) 9707 return (ENXIO); 9708 if (sdl->sdl_nlen >= LIFNAMSIZ) 9709 return (EINVAL); 9710 } else { 9711 ASSERT(ipip->ipi_cmd_type == ARP_CMD); 9712 ASSERT(MBLKL(mp1) >= sizeof (struct arpreq)); 9713 ar = (struct arpreq *)mp1->b_rptr; 9714 sin = (sin_t *)&ar->arp_pa; 9715 } 9716 9717 if (ipip->ipi_cmd_type == XARP_CMD && sdl->sdl_nlen != 0) { 9718 ipif = ipif_lookup_on_name(sdl->sdl_data, sdl->sdl_nlen, 9719 B_FALSE, &exists, B_FALSE, ALL_ZONES, CONNP_TO_WQ(connp), 9720 mp, func, &err, ipst); 9721 if (ipif == NULL) 9722 return (err); 9723 if (ipif->ipif_id != 0 || 9724 ipif->ipif_net_type != IRE_IF_RESOLVER) { 9725 ipif_refrele(ipif); 9726 return (ENXIO); 9727 } 9728 } else { 9729 /* 9730 * Either an SIOC[DGS]ARP or an SIOC[DGS]XARP with sdl_nlen == 9731 * 0: use the IP address to figure out the ill. 
In the IPMP 9732 * case, a simple forwarding table lookup will return the 9733 * IRE_IF_RESOLVER for the first interface in the group, which 9734 * might not be the interface on which the requested IP 9735 * address was resolved due to the ill selection algorithm 9736 * (see ip_newroute_get_dst_ill()). So we do a cache table 9737 * lookup first: if the IRE cache entry for the IP address is 9738 * still there, it will contain the ill pointer for the right 9739 * interface, so we use that. If the cache entry has been 9740 * flushed, we fall back to the forwarding table lookup. This 9741 * should be rare enough since IRE cache entries have a longer 9742 * life expectancy than ARP cache entries. 9743 */ 9744 ire = ire_cache_lookup(sin->sin_addr.s_addr, ALL_ZONES, NULL, 9745 ipst); 9746 if ((ire == NULL) || (ire->ire_type == IRE_LOOPBACK) || 9747 ((ill = ire_to_ill(ire)) == NULL) || 9748 (ill->ill_net_type != IRE_IF_RESOLVER)) { 9749 if (ire != NULL) 9750 ire_refrele(ire); 9751 ire = ire_ftable_lookup(sin->sin_addr.s_addr, 9752 0, 0, IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0, 9753 NULL, MATCH_IRE_TYPE, ipst); 9754 if (ire == NULL || ((ill = ire_to_ill(ire)) == NULL)) { 9755 9756 if (ire != NULL) 9757 ire_refrele(ire); 9758 return (ENXIO); 9759 } 9760 } 9761 ASSERT(ire != NULL && ill != NULL); 9762 ipif = ill->ill_ipif; 9763 ipif_refhold(ipif); 9764 ire_refrele(ire); 9765 } 9766 ci->ci_sin = sin; 9767 ci->ci_ipif = ipif; 9768 return (0); 9769 } 9770 9771 /* 9772 * Do I_PLINK/I_LINK or I_PUNLINK/I_UNLINK with consistency checks and also 9773 * atomically set/clear the muxids. Also complete the ioctl by acking or 9774 * naking it. Note that the code is structured such that the link type, 9775 * whether it's persistent or not, is treated equally. ifconfig(1M) and 9776 * its clones use the persistent link, while pppd(1M) and perhaps many 9777 * other daemons may use non-persistent link. 
When combined with some 9778 * ill_t states, linking and unlinking lower streams may be used as 9779 * indicators of dynamic re-plumbing events [see PSARC/1999/348]. 9780 */ 9781 /* ARGSUSED */ 9782 void 9783 ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 9784 { 9785 mblk_t *mp1, *mp2; 9786 struct linkblk *li; 9787 struct ipmx_s *ipmxp; 9788 ill_t *ill; 9789 int ioccmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd; 9790 int err = 0; 9791 boolean_t entered_ipsq = B_FALSE; 9792 boolean_t islink; 9793 ip_stack_t *ipst; 9794 9795 if (CONN_Q(q)) 9796 ipst = CONNQ_TO_IPST(q); 9797 else 9798 ipst = ILLQ_TO_IPST(q); 9799 9800 ASSERT(ioccmd == I_PLINK || ioccmd == I_PUNLINK || 9801 ioccmd == I_LINK || ioccmd == I_UNLINK); 9802 9803 islink = (ioccmd == I_PLINK || ioccmd == I_LINK); 9804 9805 mp1 = mp->b_cont; /* This is the linkblk info */ 9806 li = (struct linkblk *)mp1->b_rptr; 9807 9808 /* 9809 * ARP has added this special mblk, and the utility is asking us 9810 * to perform consistency checks, and also atomically set the 9811 * muxid. Ifconfig is an example. It achieves this by using 9812 * /dev/arp as the mux to plink the arp stream, and pushes arp on 9813 * to /dev/udp[6] stream for use as the mux when plinking the IP 9814 * stream. SIOCSLIFMUXID is not required. See ifconfig.c, arp.c 9815 * and other comments in this routine for more details. 9816 */ 9817 mp2 = mp1->b_cont; /* This is added by ARP */ 9818 9819 /* 9820 * If I_{P}LINK/I_{P}UNLINK is issued by a utility other than 9821 * ifconfig which didn't push ARP on top of the dummy mux, we won't 9822 * get the special mblk above. For backward compatibility, we 9823 * request ip_sioctl_plink_ipmod() to skip the consistency checks. 9824 * The utility will use SIOCSLIFMUXID to store the muxids. This is 9825 * not atomic, and can leave the streams unplumbable if the utility 9826 * is interrupted before it does the SIOCSLIFMUXID. 
9827 */ 9828 if (mp2 == NULL) { 9829 err = ip_sioctl_plink_ipmod(ipsq, q, mp, ioccmd, li, B_FALSE); 9830 if (err == EINPROGRESS) 9831 return; 9832 goto done; 9833 } 9834 9835 /* 9836 * This is an I_{P}LINK sent down by ifconfig through the ARP module; 9837 * ARP has appended this last mblk to tell us whether the lower stream 9838 * is an arp-dev stream or an IP module stream. 9839 */ 9840 ipmxp = (struct ipmx_s *)mp2->b_rptr; 9841 if (ipmxp->ipmx_arpdev_stream) { 9842 /* 9843 * The lower stream is the arp-dev stream. 9844 */ 9845 ill = ill_lookup_on_name(ipmxp->ipmx_name, B_FALSE, B_FALSE, 9846 q, mp, ip_sioctl_plink, &err, NULL, ipst); 9847 if (ill == NULL) { 9848 if (err == EINPROGRESS) 9849 return; 9850 err = EINVAL; 9851 goto done; 9852 } 9853 9854 if (ipsq == NULL) { 9855 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink, 9856 NEW_OP, B_TRUE); 9857 if (ipsq == NULL) { 9858 ill_refrele(ill); 9859 return; 9860 } 9861 entered_ipsq = B_TRUE; 9862 } 9863 ASSERT(IAM_WRITER_ILL(ill)); 9864 ill_refrele(ill); 9865 9866 /* 9867 * To ensure consistency between IP and ARP, the following 9868 * LIFO scheme is used in plink/punlink. (IP first, ARP last). 9869 * This is because the muxid's are stored in the IP stream on 9870 * the ill. 9871 * 9872 * I_{P}LINK: ifconfig plinks the IP stream before plinking 9873 * the ARP stream. On an arp-dev stream, IP checks that it is 9874 * not yet plinked, and it also checks that the corresponding 9875 * IP stream is already plinked. 9876 * 9877 * I_{P}UNLINK: ifconfig punlinks the ARP stream before 9878 * punlinking the IP stream. IP does not allow punlink of the 9879 * IP stream unless the arp stream has been punlinked. 9880 */ 9881 if ((islink && 9882 (ill->ill_arp_muxid != 0 || ill->ill_ip_muxid == 0)) || 9883 (!islink && ill->ill_arp_muxid != li->l_index)) { 9884 err = EINVAL; 9885 goto done; 9886 } 9887 ill->ill_arp_muxid = islink ? 
li->l_index : 0; 9888 } else { 9889 /* 9890 * The lower stream is probably an IP module stream. Do 9891 * consistency checking. 9892 */ 9893 err = ip_sioctl_plink_ipmod(ipsq, q, mp, ioccmd, li, B_TRUE); 9894 if (err == EINPROGRESS) 9895 return; 9896 } 9897 done: 9898 if (err == 0) 9899 miocack(q, mp, 0, 0); 9900 else 9901 miocnak(q, mp, 0, err); 9902 9903 /* Conn was refheld in ip_sioctl_copyin_setup */ 9904 if (CONN_Q(q)) 9905 CONN_OPER_PENDING_DONE(Q_TO_CONN(q)); 9906 if (entered_ipsq) 9907 ipsq_exit(ipsq, B_TRUE, B_TRUE); 9908 } 9909 9910 /* 9911 * Process I_{P}LINK and I_{P}UNLINK requests named by `ioccmd' and pointed to 9912 * by `mp' and `li' for the IP module stream (if li->q_bot is in fact an IP 9913 * module stream). If `doconsist' is set, then do the extended consistency 9914 * checks requested by ifconfig(1M) and (atomically) set ill_ip_muxid here. 9915 * Returns zero on success, EINPROGRESS if the operation is still pending, or 9916 * an error code on failure. 9917 */ 9918 static int 9919 ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, 9920 struct linkblk *li, boolean_t doconsist) 9921 { 9922 ill_t *ill; 9923 queue_t *ipwq, *dwq; 9924 const char *name; 9925 struct qinit *qinfo; 9926 boolean_t islink = (ioccmd == I_PLINK || ioccmd == I_LINK); 9927 boolean_t entered_ipsq = B_FALSE; 9928 9929 /* 9930 * Walk the lower stream to verify it's the IP module stream. 9931 * The IP module is identified by its name, wput function, 9932 * and non-NULL q_next. STREAMS ensures that the lower stream 9933 * (li->l_qbot) will not vanish until this ioctl completes. 9934 */ 9935 for (ipwq = li->l_qbot; ipwq != NULL; ipwq = ipwq->q_next) { 9936 qinfo = ipwq->q_qinfo; 9937 name = qinfo->qi_minfo->mi_idname; 9938 if (name != NULL && strcmp(name, ip_mod_info.mi_idname) == 0 && 9939 qinfo->qi_putp != (pfi_t)ip_lwput && ipwq->q_next != NULL) { 9940 break; 9941 } 9942 } 9943 9944 /* 9945 * If this isn't an IP module stream, bail. 
9946 */ 9947 if (ipwq == NULL) 9948 return (0); 9949 9950 ill = ipwq->q_ptr; 9951 ASSERT(ill != NULL); 9952 9953 if (ipsq == NULL) { 9954 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink, 9955 NEW_OP, B_TRUE); 9956 if (ipsq == NULL) 9957 return (EINPROGRESS); 9958 entered_ipsq = B_TRUE; 9959 } 9960 ASSERT(IAM_WRITER_ILL(ill)); 9961 9962 if (doconsist) { 9963 /* 9964 * Consistency checking requires that I_{P}LINK occurs 9965 * prior to setting ill_ip_muxid, and that I_{P}UNLINK 9966 * occurs prior to clearing ill_arp_muxid. 9967 */ 9968 if ((islink && ill->ill_ip_muxid != 0) || 9969 (!islink && ill->ill_arp_muxid != 0)) { 9970 if (entered_ipsq) 9971 ipsq_exit(ipsq, B_TRUE, B_TRUE); 9972 return (EINVAL); 9973 } 9974 } 9975 9976 /* 9977 * As part of I_{P}LINKing, stash the number of downstream modules and 9978 * the read queue of the module immediately below IP in the ill. 9979 * These are used during the capability negotiation below. 9980 */ 9981 ill->ill_lmod_rq = NULL; 9982 ill->ill_lmod_cnt = 0; 9983 if (islink && ((dwq = ipwq->q_next) != NULL)) { 9984 ill->ill_lmod_rq = RD(dwq); 9985 for (; dwq != NULL; dwq = dwq->q_next) 9986 ill->ill_lmod_cnt++; 9987 } 9988 9989 if (doconsist) 9990 ill->ill_ip_muxid = islink ? li->l_index : 0; 9991 9992 /* 9993 * If there's at least one up ipif on this ill, then we're bound to 9994 * the underlying driver via DLPI. In that case, renegotiate 9995 * capabilities to account for any possible change in modules 9996 * interposed between IP and the driver. 9997 */ 9998 if (ill->ill_ipif_up_count > 0) { 9999 if (islink) 10000 ill_capability_probe(ill); 10001 else 10002 ill_capability_reset(ill); 10003 } 10004 10005 if (entered_ipsq) 10006 ipsq_exit(ipsq, B_TRUE, B_TRUE); 10007 10008 return (0); 10009 } 10010 10011 /* 10012 * Search the ioctl command in the ioctl tables and return a pointer 10013 * to the ioctl command information. The ioctl command tables are 10014 * static and fully populated at compile time. 
10015 */ 10016 ip_ioctl_cmd_t * 10017 ip_sioctl_lookup(int ioc_cmd) 10018 { 10019 int index; 10020 ip_ioctl_cmd_t *ipip; 10021 ip_ioctl_cmd_t *ipip_end; 10022 10023 if (ioc_cmd == IPI_DONTCARE) 10024 return (NULL); 10025 10026 /* 10027 * Do a 2 step search. First search the indexed table 10028 * based on the least significant byte of the ioctl cmd. 10029 * If we don't find a match, then search the misc table 10030 * serially. 10031 */ 10032 index = ioc_cmd & 0xFF; 10033 if (index < ip_ndx_ioctl_count) { 10034 ipip = &ip_ndx_ioctl_table[index]; 10035 if (ipip->ipi_cmd == ioc_cmd) { 10036 /* Found a match in the ndx table */ 10037 return (ipip); 10038 } 10039 } 10040 10041 /* Search the misc table */ 10042 ipip_end = &ip_misc_ioctl_table[ip_misc_ioctl_count]; 10043 for (ipip = ip_misc_ioctl_table; ipip < ipip_end; ipip++) { 10044 if (ipip->ipi_cmd == ioc_cmd) 10045 /* Found a match in the misc table */ 10046 return (ipip); 10047 } 10048 10049 return (NULL); 10050 } 10051 10052 /* 10053 * Wrapper function for resuming deferred ioctl processing 10054 * Used for SIOCGDSTINFO, SIOCGIP6ADDRPOLICY, SIOCGMSFILTER, 10055 * SIOCSMSFILTER, SIOCGIPMSFILTER, and SIOCSIPMSFILTER currently. 10056 */ 10057 /* ARGSUSED */ 10058 void 10059 ip_sioctl_copyin_resume(ipsq_t *dummy_ipsq, queue_t *q, mblk_t *mp, 10060 void *dummy_arg) 10061 { 10062 ip_sioctl_copyin_setup(q, mp); 10063 } 10064 10065 /* 10066 * ip_sioctl_copyin_setup is called by ip_wput with any M_IOCTL message 10067 * that arrives. Most of the IOCTLs are "socket" IOCTLs which we handle 10068 * in either I_STR or TRANSPARENT form, using the mi_copy facility. 10069 * We establish here the size of the block to be copied in. mi_copyin 10070 * arranges for this to happen, an processing continues in ip_wput with 10071 * an M_IOCDATA message. 
10072 */ 10073 void 10074 ip_sioctl_copyin_setup(queue_t *q, mblk_t *mp) 10075 { 10076 int copyin_size; 10077 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 10078 ip_ioctl_cmd_t *ipip; 10079 cred_t *cr; 10080 ip_stack_t *ipst; 10081 10082 if (CONN_Q(q)) 10083 ipst = CONNQ_TO_IPST(q); 10084 else 10085 ipst = ILLQ_TO_IPST(q); 10086 10087 ipip = ip_sioctl_lookup(iocp->ioc_cmd); 10088 if (ipip == NULL) { 10089 /* 10090 * The ioctl is not one we understand or own. 10091 * Pass it along to be processed down stream, 10092 * if this is a module instance of IP, else nak 10093 * the ioctl. 10094 */ 10095 if (q->q_next == NULL) { 10096 goto nak; 10097 } else { 10098 putnext(q, mp); 10099 return; 10100 } 10101 } 10102 10103 /* 10104 * If this is deferred, then we will do all the checks when we 10105 * come back. 10106 */ 10107 if ((iocp->ioc_cmd == SIOCGDSTINFO || 10108 iocp->ioc_cmd == SIOCGIP6ADDRPOLICY) && !ip6_asp_can_lookup(ipst)) { 10109 ip6_asp_pending_op(q, mp, ip_sioctl_copyin_resume); 10110 return; 10111 } 10112 10113 /* 10114 * Only allow a very small subset of IP ioctls on this stream if 10115 * IP is a module and not a driver. Allowing ioctls to be processed 10116 * in this case may cause assert failures or data corruption. 10117 * Typically G[L]IFFLAGS, SLIFNAME/IF_UNITSEL are the only few 10118 * ioctls allowed on an IP module stream, after which this stream 10119 * normally becomes a multiplexor (at which time the stream head 10120 * will fail all ioctls). 10121 */ 10122 if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) { 10123 if (ipip->ipi_flags & IPI_PASS_DOWN) { 10124 /* 10125 * Pass common Streams ioctls which the IP 10126 * module does not own or consume along to 10127 * be processed down stream. 10128 */ 10129 putnext(q, mp); 10130 return; 10131 } else { 10132 goto nak; 10133 } 10134 } 10135 10136 /* Make sure we have ioctl data to process. 
*/ 10137 if (mp->b_cont == NULL && !(ipip->ipi_flags & IPI_NULL_BCONT)) 10138 goto nak; 10139 10140 /* 10141 * Prefer dblk credential over ioctl credential; some synthesized 10142 * ioctls have kcred set because there's no way to crhold() 10143 * a credential in some contexts. (ioc_cr is not crfree() by 10144 * the framework; the caller of ioctl needs to hold the reference 10145 * for the duration of the call). 10146 */ 10147 cr = DB_CREDDEF(mp, iocp->ioc_cr); 10148 10149 /* Make sure normal users don't send down privileged ioctls */ 10150 if ((ipip->ipi_flags & IPI_PRIV) && 10151 (cr != NULL) && secpolicy_ip_config(cr, B_TRUE) != 0) { 10152 /* We checked the privilege earlier but log it here */ 10153 miocnak(q, mp, 0, secpolicy_ip_config(cr, B_FALSE)); 10154 return; 10155 } 10156 10157 /* 10158 * The ioctl command tables can only encode fixed length 10159 * ioctl data. If the length is variable, the table will 10160 * encode the length as zero. Such special cases are handled 10161 * below in the switch. 10162 */ 10163 if (ipip->ipi_copyin_size != 0) { 10164 mi_copyin(q, mp, NULL, ipip->ipi_copyin_size); 10165 return; 10166 } 10167 10168 switch (iocp->ioc_cmd) { 10169 case O_SIOCGIFCONF: 10170 case SIOCGIFCONF: 10171 /* 10172 * This IOCTL is hilarious. See comments in 10173 * ip_sioctl_get_ifconf for the story. 
10174 */ 10175 if (iocp->ioc_count == TRANSPARENT) 10176 copyin_size = SIZEOF_STRUCT(ifconf, 10177 iocp->ioc_flag); 10178 else 10179 copyin_size = iocp->ioc_count; 10180 mi_copyin(q, mp, NULL, copyin_size); 10181 return; 10182 10183 case O_SIOCGLIFCONF: 10184 case SIOCGLIFCONF: 10185 copyin_size = SIZEOF_STRUCT(lifconf, iocp->ioc_flag); 10186 mi_copyin(q, mp, NULL, copyin_size); 10187 return; 10188 10189 case SIOCGLIFSRCOF: 10190 copyin_size = SIZEOF_STRUCT(lifsrcof, iocp->ioc_flag); 10191 mi_copyin(q, mp, NULL, copyin_size); 10192 return; 10193 case SIOCGIP6ADDRPOLICY: 10194 ip_sioctl_ip6addrpolicy(q, mp); 10195 ip6_asp_table_refrele(ipst); 10196 return; 10197 10198 case SIOCSIP6ADDRPOLICY: 10199 ip_sioctl_ip6addrpolicy(q, mp); 10200 return; 10201 10202 case SIOCGDSTINFO: 10203 ip_sioctl_dstinfo(q, mp); 10204 ip6_asp_table_refrele(ipst); 10205 return; 10206 10207 case I_PLINK: 10208 case I_PUNLINK: 10209 case I_LINK: 10210 case I_UNLINK: 10211 /* 10212 * We treat non-persistent link similarly as the persistent 10213 * link case, in terms of plumbing/unplumbing, as well as 10214 * dynamic re-plumbing events indicator. See comments 10215 * in ip_sioctl_plink() for more. 10216 * 10217 * Request can be enqueued in the 'ipsq' while waiting 10218 * to become exclusive. So bump up the conn ref. 10219 */ 10220 if (CONN_Q(q)) 10221 CONN_INC_REF(Q_TO_CONN(q)); 10222 ip_sioctl_plink(NULL, q, mp, NULL); 10223 return; 10224 10225 case ND_GET: 10226 case ND_SET: 10227 /* 10228 * Use of the nd table requires holding the reader lock. 10229 * Modifying the nd table thru nd_load/nd_unload requires 10230 * the writer lock. 
10231 */ 10232 rw_enter(&ipst->ips_ip_g_nd_lock, RW_READER); 10233 if (nd_getset(q, ipst->ips_ip_g_nd, mp)) { 10234 rw_exit(&ipst->ips_ip_g_nd_lock); 10235 10236 if (iocp->ioc_error) 10237 iocp->ioc_count = 0; 10238 mp->b_datap->db_type = M_IOCACK; 10239 qreply(q, mp); 10240 return; 10241 } 10242 rw_exit(&ipst->ips_ip_g_nd_lock); 10243 /* 10244 * We don't understand this subioctl of ND_GET / ND_SET. 10245 * Maybe intended for some driver / module below us 10246 */ 10247 if (q->q_next) { 10248 putnext(q, mp); 10249 } else { 10250 iocp->ioc_error = ENOENT; 10251 mp->b_datap->db_type = M_IOCNAK; 10252 iocp->ioc_count = 0; 10253 qreply(q, mp); 10254 } 10255 return; 10256 10257 case IP_IOCTL: 10258 ip_wput_ioctl(q, mp); 10259 return; 10260 default: 10261 cmn_err(CE_PANIC, "should not happen "); 10262 } 10263 nak: 10264 if (mp->b_cont != NULL) { 10265 freemsg(mp->b_cont); 10266 mp->b_cont = NULL; 10267 } 10268 iocp->ioc_error = EINVAL; 10269 mp->b_datap->db_type = M_IOCNAK; 10270 iocp->ioc_count = 0; 10271 qreply(q, mp); 10272 } 10273 10274 /* ip_wput hands off ARP IOCTL responses to us */ 10275 void 10276 ip_sioctl_iocack(queue_t *q, mblk_t *mp) 10277 { 10278 struct arpreq *ar; 10279 struct xarpreq *xar; 10280 area_t *area; 10281 mblk_t *area_mp; 10282 struct iocblk *iocp; 10283 mblk_t *orig_ioc_mp, *tmp; 10284 struct iocblk *orig_iocp; 10285 ill_t *ill; 10286 conn_t *connp = NULL; 10287 uint_t ioc_id; 10288 mblk_t *pending_mp; 10289 int x_arp_ioctl = B_FALSE, ifx_arp_ioctl = B_FALSE; 10290 int *flagsp; 10291 char *storage = NULL; 10292 sin_t *sin; 10293 ipaddr_t addr; 10294 int err; 10295 ip_stack_t *ipst; 10296 10297 ill = q->q_ptr; 10298 ASSERT(ill != NULL); 10299 ipst = ill->ill_ipst; 10300 10301 /* 10302 * We should get back from ARP a packet chain that looks like: 10303 * M_IOCACK-->ARP_op_MBLK-->ORIG_M_IOCTL-->MI_COPY_MBLK-->[X]ARPREQ_MBLK 10304 */ 10305 if (!(area_mp = mp->b_cont) || 10306 (area_mp->b_wptr - area_mp->b_rptr) < sizeof (ip_sock_ar_t) || 10307 
!(orig_ioc_mp = area_mp->b_cont) || 10308 !orig_ioc_mp->b_cont || !orig_ioc_mp->b_cont->b_cont) { 10309 freemsg(mp); 10310 return; 10311 } 10312 10313 orig_iocp = (struct iocblk *)orig_ioc_mp->b_rptr; 10314 10315 tmp = (orig_ioc_mp->b_cont)->b_cont; 10316 if ((orig_iocp->ioc_cmd == SIOCGXARP) || 10317 (orig_iocp->ioc_cmd == SIOCSXARP) || 10318 (orig_iocp->ioc_cmd == SIOCDXARP)) { 10319 x_arp_ioctl = B_TRUE; 10320 xar = (struct xarpreq *)tmp->b_rptr; 10321 sin = (sin_t *)&xar->xarp_pa; 10322 flagsp = &xar->xarp_flags; 10323 storage = xar->xarp_ha.sdl_data; 10324 if (xar->xarp_ha.sdl_nlen != 0) 10325 ifx_arp_ioctl = B_TRUE; 10326 } else { 10327 ar = (struct arpreq *)tmp->b_rptr; 10328 sin = (sin_t *)&ar->arp_pa; 10329 flagsp = &ar->arp_flags; 10330 storage = ar->arp_ha.sa_data; 10331 } 10332 10333 iocp = (struct iocblk *)mp->b_rptr; 10334 10335 /* 10336 * Pick out the originating queue based on the ioc_id. 10337 */ 10338 ioc_id = iocp->ioc_id; 10339 pending_mp = ill_pending_mp_get(ill, &connp, ioc_id); 10340 if (pending_mp == NULL) { 10341 ASSERT(connp == NULL); 10342 inet_freemsg(mp); 10343 return; 10344 } 10345 ASSERT(connp != NULL); 10346 q = CONNP_TO_WQ(connp); 10347 10348 /* Uncouple the internally generated IOCTL from the original one */ 10349 area = (area_t *)area_mp->b_rptr; 10350 area_mp->b_cont = NULL; 10351 10352 /* 10353 * Restore the b_next and b_prev used by mi code. This is needed 10354 * to complete the ioctl using mi* functions. We stored them in 10355 * the pending mp prior to sending the request to ARP. 10356 */ 10357 orig_ioc_mp->b_cont->b_next = pending_mp->b_cont->b_next; 10358 orig_ioc_mp->b_cont->b_prev = pending_mp->b_cont->b_prev; 10359 inet_freemsg(pending_mp); 10360 10361 /* 10362 * We're done if there was an error or if this is not an SIOCG{X}ARP 10363 * Catch the case where there is an IRE_CACHE by no entry in the 10364 * arp table. 
10365 */ 10366 addr = sin->sin_addr.s_addr; 10367 if (iocp->ioc_error && iocp->ioc_cmd == AR_ENTRY_SQUERY) { 10368 ire_t *ire; 10369 dl_unitdata_req_t *dlup; 10370 mblk_t *llmp; 10371 int addr_len; 10372 ill_t *ipsqill = NULL; 10373 10374 if (ifx_arp_ioctl) { 10375 /* 10376 * There's no need to lookup the ill, since 10377 * we've already done that when we started 10378 * processing the ioctl and sent the message 10379 * to ARP on that ill. So use the ill that 10380 * is stored in q->q_ptr. 10381 */ 10382 ipsqill = ill; 10383 ire = ire_ctable_lookup(addr, 0, IRE_CACHE, 10384 ipsqill->ill_ipif, ALL_ZONES, 10385 NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst); 10386 } else { 10387 ire = ire_ctable_lookup(addr, 0, IRE_CACHE, 10388 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 10389 if (ire != NULL) 10390 ipsqill = ire_to_ill(ire); 10391 } 10392 10393 if ((x_arp_ioctl) && (ipsqill != NULL)) 10394 storage += ill_xarp_info(&xar->xarp_ha, ipsqill); 10395 10396 if (ire != NULL) { 10397 /* 10398 * Since the ire obtained from cachetable is used for 10399 * mac addr copying below, treat an incomplete ire as if 10400 * as if we never found it. 10401 */ 10402 if (ire->ire_nce != NULL && 10403 ire->ire_nce->nce_state != ND_REACHABLE) { 10404 ire_refrele(ire); 10405 ire = NULL; 10406 ipsqill = NULL; 10407 goto errack; 10408 } 10409 *flagsp = ATF_INUSE; 10410 llmp = (ire->ire_nce != NULL ? 
10411 ire->ire_nce->nce_res_mp : NULL); 10412 if (llmp != NULL && ipsqill != NULL) { 10413 uchar_t *macaddr; 10414 10415 addr_len = ipsqill->ill_phys_addr_length; 10416 if (x_arp_ioctl && ((addr_len + 10417 ipsqill->ill_name_length) > 10418 sizeof (xar->xarp_ha.sdl_data))) { 10419 ire_refrele(ire); 10420 freemsg(mp); 10421 ip_ioctl_finish(q, orig_ioc_mp, 10422 EINVAL, NO_COPYOUT, NULL); 10423 return; 10424 } 10425 *flagsp |= ATF_COM; 10426 dlup = (dl_unitdata_req_t *)llmp->b_rptr; 10427 if (ipsqill->ill_sap_length < 0) 10428 macaddr = llmp->b_rptr + 10429 dlup->dl_dest_addr_offset; 10430 else 10431 macaddr = llmp->b_rptr + 10432 dlup->dl_dest_addr_offset + 10433 ipsqill->ill_sap_length; 10434 /* 10435 * For SIOCGARP, MAC address length 10436 * validation has already been done 10437 * before the ioctl was issued to ARP to 10438 * allow it to progress only on 6 byte 10439 * addressable (ethernet like) media. Thus 10440 * the mac address copying can not overwrite 10441 * the sa_data area below. 10442 */ 10443 bcopy(macaddr, storage, addr_len); 10444 } 10445 /* Ditch the internal IOCTL. */ 10446 freemsg(mp); 10447 ire_refrele(ire); 10448 ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, NULL); 10449 return; 10450 } 10451 } 10452 10453 /* 10454 * Delete the coresponding IRE_CACHE if any. 10455 * Reset the error if there was one (in case there was no entry 10456 * in arp.) 10457 */ 10458 if (iocp->ioc_cmd == AR_ENTRY_DELETE) { 10459 ipif_t *ipintf = NULL; 10460 10461 if (ifx_arp_ioctl) { 10462 /* 10463 * There's no need to lookup the ill, since 10464 * we've already done that when we started 10465 * processing the ioctl and sent the message 10466 * to ARP on that ill. So use the ill that 10467 * is stored in q->q_ptr. 10468 */ 10469 ipintf = ill->ill_ipif; 10470 } 10471 if (ip_ire_clookup_and_delete(addr, ipintf, ipst)) { 10472 /* 10473 * The address in "addr" may be an entry for a 10474 * router. 
If that's true, then any off-net 10475 * IRE_CACHE entries that go through the router 10476 * with address "addr" must be clobbered. Use 10477 * ire_walk to achieve this goal. 10478 */ 10479 if (ifx_arp_ioctl) 10480 ire_walk_ill_v4(MATCH_IRE_ILL, 0, 10481 ire_delete_cache_gw, (char *)&addr, ill); 10482 else 10483 ire_walk_v4(ire_delete_cache_gw, (char *)&addr, 10484 ALL_ZONES, ipst); 10485 iocp->ioc_error = 0; 10486 } 10487 } 10488 errack: 10489 if (iocp->ioc_error || iocp->ioc_cmd != AR_ENTRY_SQUERY) { 10490 err = iocp->ioc_error; 10491 freemsg(mp); 10492 ip_ioctl_finish(q, orig_ioc_mp, err, NO_COPYOUT, NULL); 10493 return; 10494 } 10495 10496 /* 10497 * Completion of an SIOCG{X}ARP. Translate the information from 10498 * the area_t into the struct {x}arpreq. 10499 */ 10500 if (x_arp_ioctl) { 10501 storage += ill_xarp_info(&xar->xarp_ha, ill); 10502 if ((ill->ill_phys_addr_length + ill->ill_name_length) > 10503 sizeof (xar->xarp_ha.sdl_data)) { 10504 freemsg(mp); 10505 ip_ioctl_finish(q, orig_ioc_mp, EINVAL, NO_COPYOUT, 10506 NULL); 10507 return; 10508 } 10509 } 10510 *flagsp = ATF_INUSE; 10511 if (area->area_flags & ACE_F_PERMANENT) 10512 *flagsp |= ATF_PERM; 10513 if (area->area_flags & ACE_F_PUBLISH) 10514 *flagsp |= ATF_PUBL; 10515 if (area->area_flags & ACE_F_AUTHORITY) 10516 *flagsp |= ATF_AUTHORITY; 10517 if (area->area_hw_addr_length != 0) { 10518 *flagsp |= ATF_COM; 10519 /* 10520 * For SIOCGARP, MAC address length validation has 10521 * already been done before the ioctl was issued to ARP 10522 * to allow it to progress only on 6 byte addressable 10523 * (ethernet like) media. Thus the mac address copying 10524 * can not overwrite the sa_data area below. 10525 */ 10526 bcopy((char *)area + area->area_hw_addr_offset, 10527 storage, area->area_hw_addr_length); 10528 } 10529 10530 /* Ditch the internal IOCTL. */ 10531 freemsg(mp); 10532 /* Complete the original. 
*/ 10533 ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, NULL); 10534 } 10535 10536 /* 10537 * Create a new logical interface. If ipif_id is zero (i.e. not a logical 10538 * interface) create the next available logical interface for this 10539 * physical interface. 10540 * If ipif is NULL (i.e. the lookup didn't find one) attempt to create an 10541 * ipif with the specified name. 10542 * 10543 * If the address family is not AF_UNSPEC then set the address as well. 10544 * 10545 * If ip_sioctl_addr returns EINPROGRESS then the ioctl (the copyout) 10546 * is completed when the DL_BIND_ACK arrive in ip_rput_dlpi_writer. 10547 * 10548 * Executed as a writer on the ill or ill group. 10549 * So no lock is needed to traverse the ipif chain, or examine the 10550 * phyint flags. 10551 */ 10552 /* ARGSUSED */ 10553 int 10554 ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 10555 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 10556 { 10557 mblk_t *mp1; 10558 struct lifreq *lifr; 10559 boolean_t isv6; 10560 boolean_t exists; 10561 char *name; 10562 char *endp; 10563 char *cp; 10564 int namelen; 10565 ipif_t *ipif; 10566 long id; 10567 ipsq_t *ipsq; 10568 ill_t *ill; 10569 sin_t *sin; 10570 int err = 0; 10571 boolean_t found_sep = B_FALSE; 10572 conn_t *connp; 10573 zoneid_t zoneid; 10574 int orig_ifindex = 0; 10575 ip_stack_t *ipst = CONNQ_TO_IPST(q); 10576 10577 ASSERT(q->q_next == NULL); 10578 ip1dbg(("ip_sioctl_addif\n")); 10579 /* Existence of mp1 has been checked in ip_wput_nondata */ 10580 mp1 = mp->b_cont->b_cont; 10581 /* 10582 * Null terminate the string to protect against buffer 10583 * overrun. String was generated by user code and may not 10584 * be trusted. 
10585 */ 10586 lifr = (struct lifreq *)mp1->b_rptr; 10587 lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; 10588 name = lifr->lifr_name; 10589 ASSERT(CONN_Q(q)); 10590 connp = Q_TO_CONN(q); 10591 isv6 = connp->conn_af_isv6; 10592 zoneid = connp->conn_zoneid; 10593 namelen = mi_strlen(name); 10594 if (namelen == 0) 10595 return (EINVAL); 10596 10597 exists = B_FALSE; 10598 if ((namelen + 1 == sizeof (ipif_loopback_name)) && 10599 (mi_strcmp(name, ipif_loopback_name) == 0)) { 10600 /* 10601 * Allow creating lo0 using SIOCLIFADDIF. 10602 * can't be any other writer thread. So can pass null below 10603 * for the last 4 args to ipif_lookup_name. 10604 */ 10605 ipif = ipif_lookup_on_name(lifr->lifr_name, namelen, B_TRUE, 10606 &exists, isv6, zoneid, NULL, NULL, NULL, NULL, ipst); 10607 /* Prevent any further action */ 10608 if (ipif == NULL) { 10609 return (ENOBUFS); 10610 } else if (!exists) { 10611 /* We created the ipif now and as writer */ 10612 ipif_refrele(ipif); 10613 return (0); 10614 } else { 10615 ill = ipif->ipif_ill; 10616 ill_refhold(ill); 10617 ipif_refrele(ipif); 10618 } 10619 } else { 10620 /* Look for a colon in the name. */ 10621 endp = &name[namelen]; 10622 for (cp = endp; --cp > name; ) { 10623 if (*cp == IPIF_SEPARATOR_CHAR) { 10624 found_sep = B_TRUE; 10625 /* 10626 * Reject any non-decimal aliases for plumbing 10627 * of logical interfaces. Aliases with leading 10628 * zeroes are also rejected as they introduce 10629 * ambiguity in the naming of the interfaces. 10630 * Comparing with "0" takes care of all such 10631 * cases. 
10632 */ 10633 if ((strncmp("0", cp+1, 1)) == 0) 10634 return (EINVAL); 10635 10636 if (ddi_strtol(cp+1, &endp, 10, &id) != 0 || 10637 id <= 0 || *endp != '\0') { 10638 return (EINVAL); 10639 } 10640 *cp = '\0'; 10641 break; 10642 } 10643 } 10644 ill = ill_lookup_on_name(name, B_FALSE, isv6, 10645 CONNP_TO_WQ(connp), mp, ip_process_ioctl, &err, NULL, ipst); 10646 if (found_sep) 10647 *cp = IPIF_SEPARATOR_CHAR; 10648 if (ill == NULL) 10649 return (err); 10650 } 10651 10652 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_process_ioctl, NEW_OP, 10653 B_TRUE); 10654 10655 /* 10656 * Release the refhold due to the lookup, now that we are excl 10657 * or we are just returning 10658 */ 10659 ill_refrele(ill); 10660 10661 if (ipsq == NULL) 10662 return (EINPROGRESS); 10663 10664 /* 10665 * If the interface is failed, inactive or offlined, look for a working 10666 * interface in the ill group and create the ipif there. If we can't 10667 * find a good interface, create the ipif anyway so that in.mpathd can 10668 * move it to the first repaired interface. 10669 */ 10670 if ((ill->ill_phyint->phyint_flags & 10671 (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) && 10672 ill->ill_phyint->phyint_groupname_len != 0) { 10673 phyint_t *phyi; 10674 char *groupname = ill->ill_phyint->phyint_groupname; 10675 10676 /* 10677 * We're looking for a working interface, but it doesn't matter 10678 * if it's up or down; so instead of following the group lists, 10679 * we look at each physical interface and compare the groupname. 10680 * We're only interested in interfaces with IPv4 (resp. IPv6) 10681 * plumbed when we're adding an IPv4 (resp. IPv6) ipif. 10682 * Otherwise we create the ipif on the failed interface. 
10683 */ 10684 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 10685 phyi = avl_first(&ipst->ips_phyint_g_list-> 10686 phyint_list_avl_by_index); 10687 for (; phyi != NULL; 10688 phyi = avl_walk(&ipst->ips_phyint_g_list-> 10689 phyint_list_avl_by_index, 10690 phyi, AVL_AFTER)) { 10691 if (phyi->phyint_groupname_len == 0) 10692 continue; 10693 ASSERT(phyi->phyint_groupname != NULL); 10694 if (mi_strcmp(groupname, phyi->phyint_groupname) == 0 && 10695 !(phyi->phyint_flags & 10696 (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) && 10697 (ill->ill_isv6 ? (phyi->phyint_illv6 != NULL) : 10698 (phyi->phyint_illv4 != NULL))) { 10699 break; 10700 } 10701 } 10702 rw_exit(&ipst->ips_ill_g_lock); 10703 10704 if (phyi != NULL) { 10705 orig_ifindex = ill->ill_phyint->phyint_ifindex; 10706 ill = (ill->ill_isv6 ? phyi->phyint_illv6 : 10707 phyi->phyint_illv4); 10708 } 10709 } 10710 10711 /* 10712 * We are now exclusive on the ipsq, so an ill move will be serialized 10713 * before or after us. 10714 */ 10715 ASSERT(IAM_WRITER_ILL(ill)); 10716 ASSERT(ill->ill_move_in_progress == B_FALSE); 10717 10718 if (found_sep && orig_ifindex == 0) { 10719 /* Now see if there is an IPIF with this unit number. */ 10720 for (ipif = ill->ill_ipif; ipif != NULL; 10721 ipif = ipif->ipif_next) { 10722 if (ipif->ipif_id == id) { 10723 err = EEXIST; 10724 goto done; 10725 } 10726 } 10727 } 10728 10729 /* 10730 * We use IRE_LOCAL for lo0:1 etc. for "receive only" use 10731 * of lo0. We never come here when we plumb lo0:0. It 10732 * happens in ipif_lookup_on_name. 10733 * The specified unit number is ignored when we create the ipif on a 10734 * different interface. However, we save it in ipif_orig_ipifid below so 10735 * that the ipif fails back to the right position. 10736 */ 10737 if ((ipif = ipif_allocate(ill, (found_sep && orig_ifindex == 0) ? 
10738 id : -1, IRE_LOCAL, B_TRUE)) == NULL) { 10739 err = ENOBUFS; 10740 goto done; 10741 } 10742 10743 /* Return created name with ioctl */ 10744 (void) sprintf(lifr->lifr_name, "%s%c%d", ill->ill_name, 10745 IPIF_SEPARATOR_CHAR, ipif->ipif_id); 10746 ip1dbg(("created %s\n", lifr->lifr_name)); 10747 10748 /* Set address */ 10749 sin = (sin_t *)&lifr->lifr_addr; 10750 if (sin->sin_family != AF_UNSPEC) { 10751 err = ip_sioctl_addr(ipif, sin, q, mp, 10752 &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], lifr); 10753 } 10754 10755 /* Set ifindex and unit number for failback */ 10756 if (err == 0 && orig_ifindex != 0) { 10757 ipif->ipif_orig_ifindex = orig_ifindex; 10758 if (found_sep) { 10759 ipif->ipif_orig_ipifid = id; 10760 } 10761 } 10762 10763 done: 10764 ipsq_exit(ipsq, B_TRUE, B_TRUE); 10765 return (err); 10766 } 10767 10768 /* 10769 * Remove an existing logical interface. If ipif_id is zero (i.e. not a logical 10770 * interface) delete it based on the IP address (on this physical interface). 10771 * Otherwise delete it based on the ipif_id. 10772 * Also, special handling to allow a removeif of lo0. 10773 */ 10774 /* ARGSUSED */ 10775 int 10776 ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10777 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 10778 { 10779 conn_t *connp; 10780 ill_t *ill = ipif->ipif_ill; 10781 boolean_t success; 10782 ip_stack_t *ipst; 10783 10784 ipst = CONNQ_TO_IPST(q); 10785 10786 ASSERT(q->q_next == NULL); 10787 ip1dbg(("ip_sioctl_remove_if(%s:%u %p)\n", 10788 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10789 ASSERT(IAM_WRITER_IPIF(ipif)); 10790 10791 connp = Q_TO_CONN(q); 10792 /* 10793 * Special case for unplumbing lo0 (the loopback physical interface). 10794 * If unplumbing lo0, the incoming address structure has been 10795 * initialized to all zeros. When unplumbing lo0, all its logical 10796 * interfaces must be removed too. 
10797 * 10798 * Note that this interface may be called to remove a specific 10799 * loopback logical interface (eg, lo0:1). But in that case 10800 * ipif->ipif_id != 0 so that the code path for that case is the 10801 * same as any other interface (meaning it skips the code directly 10802 * below). 10803 */ 10804 if (ipif->ipif_id == 0 && ipif->ipif_net_type == IRE_LOOPBACK) { 10805 if (sin->sin_family == AF_UNSPEC && 10806 (IN6_IS_ADDR_UNSPECIFIED(&((sin6_t *)sin)->sin6_addr))) { 10807 /* 10808 * Mark it condemned. No new ref. will be made to ill. 10809 */ 10810 mutex_enter(&ill->ill_lock); 10811 ill->ill_state_flags |= ILL_CONDEMNED; 10812 for (ipif = ill->ill_ipif; ipif != NULL; 10813 ipif = ipif->ipif_next) { 10814 ipif->ipif_state_flags |= IPIF_CONDEMNED; 10815 } 10816 mutex_exit(&ill->ill_lock); 10817 10818 ipif = ill->ill_ipif; 10819 /* unplumb the loopback interface */ 10820 ill_delete(ill); 10821 mutex_enter(&connp->conn_lock); 10822 mutex_enter(&ill->ill_lock); 10823 ASSERT(ill->ill_group == NULL); 10824 10825 /* Are any references to this ill active */ 10826 if (ill_is_quiescent(ill)) { 10827 mutex_exit(&ill->ill_lock); 10828 mutex_exit(&connp->conn_lock); 10829 ill_delete_tail(ill); 10830 mi_free(ill); 10831 return (0); 10832 } 10833 success = ipsq_pending_mp_add(connp, ipif, 10834 CONNP_TO_WQ(connp), mp, ILL_FREE); 10835 mutex_exit(&connp->conn_lock); 10836 mutex_exit(&ill->ill_lock); 10837 if (success) 10838 return (EINPROGRESS); 10839 else 10840 return (EINTR); 10841 } 10842 } 10843 10844 /* 10845 * We are exclusive on the ipsq, so an ill move will be serialized 10846 * before or after us. 
10847 */ 10848 ASSERT(ill->ill_move_in_progress == B_FALSE); 10849 10850 if (ipif->ipif_id == 0) { 10851 /* Find based on address */ 10852 if (ipif->ipif_isv6) { 10853 sin6_t *sin6; 10854 10855 if (sin->sin_family != AF_INET6) 10856 return (EAFNOSUPPORT); 10857 10858 sin6 = (sin6_t *)sin; 10859 /* We are a writer, so we should be able to lookup */ 10860 ipif = ipif_lookup_addr_v6(&sin6->sin6_addr, 10861 ill, ALL_ZONES, NULL, NULL, NULL, NULL, ipst); 10862 if (ipif == NULL) { 10863 /* 10864 * Maybe the address in on another interface in 10865 * the same IPMP group? We check this below. 10866 */ 10867 ipif = ipif_lookup_addr_v6(&sin6->sin6_addr, 10868 NULL, ALL_ZONES, NULL, NULL, NULL, NULL, 10869 ipst); 10870 } 10871 } else { 10872 ipaddr_t addr; 10873 10874 if (sin->sin_family != AF_INET) 10875 return (EAFNOSUPPORT); 10876 10877 addr = sin->sin_addr.s_addr; 10878 /* We are a writer, so we should be able to lookup */ 10879 ipif = ipif_lookup_addr(addr, ill, ALL_ZONES, NULL, 10880 NULL, NULL, NULL, ipst); 10881 if (ipif == NULL) { 10882 /* 10883 * Maybe the address in on another interface in 10884 * the same IPMP group? We check this below. 10885 */ 10886 ipif = ipif_lookup_addr(addr, NULL, ALL_ZONES, 10887 NULL, NULL, NULL, NULL, ipst); 10888 } 10889 } 10890 if (ipif == NULL) { 10891 return (EADDRNOTAVAIL); 10892 } 10893 /* 10894 * When the address to be removed is hosted on a different 10895 * interface, we check if the interface is in the same IPMP 10896 * group as the specified one; if so we proceed with the 10897 * removal. 10898 * ill->ill_group is NULL when the ill is down, so we have to 10899 * compare the group names instead. 
10900 */ 10901 if (ipif->ipif_ill != ill && 10902 (ipif->ipif_ill->ill_phyint->phyint_groupname_len == 0 || 10903 ill->ill_phyint->phyint_groupname_len == 0 || 10904 mi_strcmp(ipif->ipif_ill->ill_phyint->phyint_groupname, 10905 ill->ill_phyint->phyint_groupname) != 0)) { 10906 ipif_refrele(ipif); 10907 return (EADDRNOTAVAIL); 10908 } 10909 10910 /* This is a writer */ 10911 ipif_refrele(ipif); 10912 } 10913 10914 /* 10915 * Can not delete instance zero since it is tied to the ill. 10916 */ 10917 if (ipif->ipif_id == 0) 10918 return (EBUSY); 10919 10920 mutex_enter(&ill->ill_lock); 10921 ipif->ipif_state_flags |= IPIF_CONDEMNED; 10922 mutex_exit(&ill->ill_lock); 10923 10924 ipif_free(ipif); 10925 10926 mutex_enter(&connp->conn_lock); 10927 mutex_enter(&ill->ill_lock); 10928 10929 /* Are any references to this ipif active */ 10930 if (ipif->ipif_refcnt == 0 && ipif->ipif_ire_cnt == 0) { 10931 mutex_exit(&ill->ill_lock); 10932 mutex_exit(&connp->conn_lock); 10933 ipif_non_duplicate(ipif); 10934 ipif_down_tail(ipif); 10935 ipif_free_tail(ipif); 10936 return (0); 10937 } 10938 success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), mp, 10939 IPIF_FREE); 10940 mutex_exit(&ill->ill_lock); 10941 mutex_exit(&connp->conn_lock); 10942 if (success) 10943 return (EINPROGRESS); 10944 else 10945 return (EINTR); 10946 } 10947 10948 /* 10949 * Restart the removeif ioctl. The refcnt has gone down to 0. 10950 * The ipif is already condemned. So can't find it thru lookups. 
10951 */ 10952 /* ARGSUSED */ 10953 int 10954 ip_sioctl_removeif_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, 10955 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_if_req) 10956 { 10957 ill_t *ill = ipif->ipif_ill; 10958 10959 ASSERT(IAM_WRITER_IPIF(ipif)); 10960 ASSERT(ipif->ipif_state_flags & IPIF_CONDEMNED); 10961 10962 ip1dbg(("ip_sioctl_removeif_restart(%s:%u %p)\n", 10963 ill->ill_name, ipif->ipif_id, (void *)ipif)); 10964 10965 if (ipif->ipif_id == 0 && ipif->ipif_net_type == IRE_LOOPBACK) { 10966 ASSERT(ill->ill_state_flags & ILL_CONDEMNED); 10967 ill_delete_tail(ill); 10968 mi_free(ill); 10969 return (0); 10970 } 10971 10972 ipif_non_duplicate(ipif); 10973 ipif_down_tail(ipif); 10974 ipif_free_tail(ipif); 10975 10976 ILL_UNMARK_CHANGING(ill); 10977 return (0); 10978 } 10979 10980 /* 10981 * Set the local interface address. 10982 * Allow an address of all zero when the interface is down. 10983 */ 10984 /* ARGSUSED */ 10985 int 10986 ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10987 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 10988 { 10989 int err = 0; 10990 in6_addr_t v6addr; 10991 boolean_t need_up = B_FALSE; 10992 10993 ip1dbg(("ip_sioctl_addr(%s:%u %p)\n", 10994 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10995 10996 ASSERT(IAM_WRITER_IPIF(ipif)); 10997 10998 if (ipif->ipif_isv6) { 10999 sin6_t *sin6; 11000 ill_t *ill; 11001 phyint_t *phyi; 11002 11003 if (sin->sin_family != AF_INET6) 11004 return (EAFNOSUPPORT); 11005 11006 sin6 = (sin6_t *)sin; 11007 v6addr = sin6->sin6_addr; 11008 ill = ipif->ipif_ill; 11009 phyi = ill->ill_phyint; 11010 11011 /* 11012 * Enforce that true multicast interfaces have a link-local 11013 * address for logical unit 0. 
11014 */ 11015 if (ipif->ipif_id == 0 && 11016 (ill->ill_flags & ILLF_MULTICAST) && 11017 !(ipif->ipif_flags & (IPIF_POINTOPOINT)) && 11018 !(phyi->phyint_flags & (PHYI_LOOPBACK)) && 11019 !IN6_IS_ADDR_LINKLOCAL(&v6addr)) { 11020 return (EADDRNOTAVAIL); 11021 } 11022 11023 /* 11024 * up interfaces shouldn't have the unspecified address 11025 * unless they also have the IPIF_NOLOCAL flags set and 11026 * have a subnet assigned. 11027 */ 11028 if ((ipif->ipif_flags & IPIF_UP) && 11029 IN6_IS_ADDR_UNSPECIFIED(&v6addr) && 11030 (!(ipif->ipif_flags & IPIF_NOLOCAL) || 11031 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) { 11032 return (EADDRNOTAVAIL); 11033 } 11034 11035 if (!ip_local_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask)) 11036 return (EADDRNOTAVAIL); 11037 } else { 11038 ipaddr_t addr; 11039 11040 if (sin->sin_family != AF_INET) 11041 return (EAFNOSUPPORT); 11042 11043 addr = sin->sin_addr.s_addr; 11044 11045 /* Allow 0 as the local address. */ 11046 if (addr != 0 && !ip_addr_ok_v4(addr, ipif->ipif_net_mask)) 11047 return (EADDRNOTAVAIL); 11048 11049 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 11050 } 11051 11052 11053 /* 11054 * Even if there is no change we redo things just to rerun 11055 * ipif_set_default. 11056 */ 11057 if (ipif->ipif_flags & IPIF_UP) { 11058 /* 11059 * Setting a new local address, make sure 11060 * we have net and subnet bcast ire's for 11061 * the old address if we need them. 11062 */ 11063 if (!ipif->ipif_isv6) 11064 ipif_check_bcast_ires(ipif); 11065 /* 11066 * If the interface is already marked up, 11067 * we call ipif_down which will take care 11068 * of ditching any IREs that have been set 11069 * up based on the old interface address. 
11070 */ 11071 err = ipif_logical_down(ipif, q, mp); 11072 if (err == EINPROGRESS) 11073 return (err); 11074 ipif_down_tail(ipif); 11075 need_up = 1; 11076 } 11077 11078 err = ip_sioctl_addr_tail(ipif, sin, q, mp, need_up); 11079 return (err); 11080 } 11081 11082 int 11083 ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11084 boolean_t need_up) 11085 { 11086 in6_addr_t v6addr; 11087 in6_addr_t ov6addr; 11088 ipaddr_t addr; 11089 sin6_t *sin6; 11090 int sinlen; 11091 int err = 0; 11092 ill_t *ill = ipif->ipif_ill; 11093 boolean_t need_dl_down; 11094 boolean_t need_arp_down; 11095 struct iocblk *iocp; 11096 11097 iocp = (mp != NULL) ? (struct iocblk *)mp->b_rptr : NULL; 11098 11099 ip1dbg(("ip_sioctl_addr_tail(%s:%u %p)\n", 11100 ill->ill_name, ipif->ipif_id, (void *)ipif)); 11101 ASSERT(IAM_WRITER_IPIF(ipif)); 11102 11103 /* Must cancel any pending timer before taking the ill_lock */ 11104 if (ipif->ipif_recovery_id != 0) 11105 (void) untimeout(ipif->ipif_recovery_id); 11106 ipif->ipif_recovery_id = 0; 11107 11108 if (ipif->ipif_isv6) { 11109 sin6 = (sin6_t *)sin; 11110 v6addr = sin6->sin6_addr; 11111 sinlen = sizeof (struct sockaddr_in6); 11112 } else { 11113 addr = sin->sin_addr.s_addr; 11114 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 11115 sinlen = sizeof (struct sockaddr_in); 11116 } 11117 mutex_enter(&ill->ill_lock); 11118 ov6addr = ipif->ipif_v6lcl_addr; 11119 ipif->ipif_v6lcl_addr = v6addr; 11120 sctp_update_ipif_addr(ipif, ov6addr); 11121 if (ipif->ipif_flags & (IPIF_ANYCAST | IPIF_NOLOCAL)) { 11122 ipif->ipif_v6src_addr = ipv6_all_zeros; 11123 } else { 11124 ipif->ipif_v6src_addr = v6addr; 11125 } 11126 ipif->ipif_addr_ready = 0; 11127 11128 /* 11129 * If the interface was previously marked as a duplicate, then since 11130 * we've now got a "new" address, it should no longer be considered a 11131 * duplicate -- even if the "new" address is the same as the old one. 
11132 * Note that if all ipifs are down, we may have a pending ARP down 11133 * event to handle. This is because we want to recover from duplicates 11134 * and thus delay tearing down ARP until the duplicates have been 11135 * removed or disabled. 11136 */ 11137 need_dl_down = need_arp_down = B_FALSE; 11138 if (ipif->ipif_flags & IPIF_DUPLICATE) { 11139 need_arp_down = !need_up; 11140 ipif->ipif_flags &= ~IPIF_DUPLICATE; 11141 if (--ill->ill_ipif_dup_count == 0 && !need_up && 11142 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) { 11143 need_dl_down = B_TRUE; 11144 } 11145 } 11146 11147 if (ipif->ipif_isv6 && IN6_IS_ADDR_6TO4(&v6addr) && 11148 !ill->ill_is_6to4tun) { 11149 queue_t *wqp = ill->ill_wq; 11150 11151 /* 11152 * The local address of this interface is a 6to4 address, 11153 * check if this interface is in fact a 6to4 tunnel or just 11154 * an interface configured with a 6to4 address. We are only 11155 * interested in the former. 11156 */ 11157 if (wqp != NULL) { 11158 while ((wqp->q_next != NULL) && 11159 (wqp->q_next->q_qinfo != NULL) && 11160 (wqp->q_next->q_qinfo->qi_minfo != NULL)) { 11161 11162 if (wqp->q_next->q_qinfo->qi_minfo->mi_idnum 11163 == TUN6TO4_MODID) { 11164 /* set for use in IP */ 11165 ill->ill_is_6to4tun = 1; 11166 break; 11167 } 11168 wqp = wqp->q_next; 11169 } 11170 } 11171 } 11172 11173 ipif_set_default(ipif); 11174 11175 /* 11176 * When publishing an interface address change event, we only notify 11177 * the event listeners of the new address. It is assumed that if they 11178 * actively care about the addresses assigned that they will have 11179 * already discovered the previous address assigned (if there was one.) 11180 * 11181 * Don't attach nic event message for SIOCLIFADDIF ioctl. 
11182 */ 11183 if (iocp != NULL && iocp->ioc_cmd != SIOCLIFADDIF) { 11184 hook_nic_event_t *info; 11185 if ((info = ipif->ipif_ill->ill_nic_event_info) != NULL) { 11186 ip2dbg(("ip_sioctl_addr_tail: unexpected nic event %d " 11187 "attached for %s\n", info->hne_event, 11188 ill->ill_name)); 11189 if (info->hne_data != NULL) 11190 kmem_free(info->hne_data, info->hne_datalen); 11191 kmem_free(info, sizeof (hook_nic_event_t)); 11192 } 11193 11194 info = kmem_alloc(sizeof (hook_nic_event_t), KM_NOSLEEP); 11195 if (info != NULL) { 11196 ip_stack_t *ipst = ill->ill_ipst; 11197 11198 info->hne_nic = 11199 ipif->ipif_ill->ill_phyint->phyint_hook_ifindex; 11200 info->hne_lif = MAP_IPIF_ID(ipif->ipif_id); 11201 info->hne_event = NE_ADDRESS_CHANGE; 11202 info->hne_family = ipif->ipif_isv6 ? 11203 ipst->ips_ipv6_net_data : ipst->ips_ipv4_net_data; 11204 info->hne_data = kmem_alloc(sinlen, KM_NOSLEEP); 11205 if (info->hne_data != NULL) { 11206 info->hne_datalen = sinlen; 11207 bcopy(sin, info->hne_data, sinlen); 11208 } else { 11209 ip2dbg(("ip_sioctl_addr_tail: could not attach " 11210 "address information for ADDRESS_CHANGE nic" 11211 " event of %s (ENOMEM)\n", 11212 ipif->ipif_ill->ill_name)); 11213 kmem_free(info, sizeof (hook_nic_event_t)); 11214 } 11215 } else 11216 ip2dbg(("ip_sioctl_addr_tail: could not attach " 11217 "ADDRESS_CHANGE nic event information for %s " 11218 "(ENOMEM)\n", ipif->ipif_ill->ill_name)); 11219 11220 ipif->ipif_ill->ill_nic_event_info = info; 11221 } 11222 11223 mutex_exit(&ill->ill_lock); 11224 11225 if (need_up) { 11226 /* 11227 * Now bring the interface back up. If this 11228 * is the only IPIF for the ILL, ipif_up 11229 * will have to re-bind to the device, so 11230 * we may get back EINPROGRESS, in which 11231 * case, this IOCTL will get completed in 11232 * ip_rput_dlpi when we see the DL_BIND_ACK. 
11233 */ 11234 err = ipif_up(ipif, q, mp); 11235 } 11236 11237 if (need_dl_down) 11238 ill_dl_down(ill); 11239 if (need_arp_down) 11240 ipif_arp_down(ipif); 11241 11242 return (err); 11243 } 11244 11245 11246 /* 11247 * Restart entry point to restart the address set operation after the 11248 * refcounts have dropped to zero. 11249 */ 11250 /* ARGSUSED */ 11251 int 11252 ip_sioctl_addr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11253 ip_ioctl_cmd_t *ipip, void *ifreq) 11254 { 11255 ip1dbg(("ip_sioctl_addr_restart(%s:%u %p)\n", 11256 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11257 ASSERT(IAM_WRITER_IPIF(ipif)); 11258 ipif_down_tail(ipif); 11259 return (ip_sioctl_addr_tail(ipif, sin, q, mp, B_TRUE)); 11260 } 11261 11262 /* ARGSUSED */ 11263 int 11264 ip_sioctl_get_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11265 ip_ioctl_cmd_t *ipip, void *if_req) 11266 { 11267 sin6_t *sin6 = (struct sockaddr_in6 *)sin; 11268 struct lifreq *lifr = (struct lifreq *)if_req; 11269 11270 ip1dbg(("ip_sioctl_get_addr(%s:%u %p)\n", 11271 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11272 /* 11273 * The net mask and address can't change since we have a 11274 * reference to the ipif. So no lock is necessary. 11275 */ 11276 if (ipif->ipif_isv6) { 11277 *sin6 = sin6_null; 11278 sin6->sin6_family = AF_INET6; 11279 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 11280 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 11281 lifr->lifr_addrlen = 11282 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 11283 } else { 11284 *sin = sin_null; 11285 sin->sin_family = AF_INET; 11286 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 11287 if (ipip->ipi_cmd_type == LIF_CMD) { 11288 lifr->lifr_addrlen = 11289 ip_mask_to_plen(ipif->ipif_net_mask); 11290 } 11291 } 11292 return (0); 11293 } 11294 11295 /* 11296 * Set the destination address for a pt-pt interface. 
11297 */ 11298 /* ARGSUSED */ 11299 int 11300 ip_sioctl_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11301 ip_ioctl_cmd_t *ipip, void *if_req) 11302 { 11303 int err = 0; 11304 in6_addr_t v6addr; 11305 boolean_t need_up = B_FALSE; 11306 11307 ip1dbg(("ip_sioctl_dstaddr(%s:%u %p)\n", 11308 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11309 ASSERT(IAM_WRITER_IPIF(ipif)); 11310 11311 if (ipif->ipif_isv6) { 11312 sin6_t *sin6; 11313 11314 if (sin->sin_family != AF_INET6) 11315 return (EAFNOSUPPORT); 11316 11317 sin6 = (sin6_t *)sin; 11318 v6addr = sin6->sin6_addr; 11319 11320 if (!ip_remote_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask)) 11321 return (EADDRNOTAVAIL); 11322 } else { 11323 ipaddr_t addr; 11324 11325 if (sin->sin_family != AF_INET) 11326 return (EAFNOSUPPORT); 11327 11328 addr = sin->sin_addr.s_addr; 11329 if (!ip_addr_ok_v4(addr, ipif->ipif_net_mask)) 11330 return (EADDRNOTAVAIL); 11331 11332 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 11333 } 11334 11335 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr, &v6addr)) 11336 return (0); /* No change */ 11337 11338 if (ipif->ipif_flags & IPIF_UP) { 11339 /* 11340 * If the interface is already marked up, 11341 * we call ipif_down which will take care 11342 * of ditching any IREs that have been set 11343 * up based on the old pp dst address. 11344 */ 11345 err = ipif_logical_down(ipif, q, mp); 11346 if (err == EINPROGRESS) 11347 return (err); 11348 ipif_down_tail(ipif); 11349 need_up = B_TRUE; 11350 } 11351 /* 11352 * could return EINPROGRESS. 
If so ioctl will complete in 11353 * ip_rput_dlpi_writer 11354 */ 11355 err = ip_sioctl_dstaddr_tail(ipif, sin, q, mp, need_up); 11356 return (err); 11357 } 11358 11359 static int 11360 ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11361 boolean_t need_up) 11362 { 11363 in6_addr_t v6addr; 11364 ill_t *ill = ipif->ipif_ill; 11365 int err = 0; 11366 boolean_t need_dl_down; 11367 boolean_t need_arp_down; 11368 11369 ip1dbg(("ip_sioctl_dstaddr_tail(%s:%u %p)\n", ill->ill_name, 11370 ipif->ipif_id, (void *)ipif)); 11371 11372 /* Must cancel any pending timer before taking the ill_lock */ 11373 if (ipif->ipif_recovery_id != 0) 11374 (void) untimeout(ipif->ipif_recovery_id); 11375 ipif->ipif_recovery_id = 0; 11376 11377 if (ipif->ipif_isv6) { 11378 sin6_t *sin6; 11379 11380 sin6 = (sin6_t *)sin; 11381 v6addr = sin6->sin6_addr; 11382 } else { 11383 ipaddr_t addr; 11384 11385 addr = sin->sin_addr.s_addr; 11386 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 11387 } 11388 mutex_enter(&ill->ill_lock); 11389 /* Set point to point destination address. */ 11390 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 11391 /* 11392 * Allow this as a means of creating logical 11393 * pt-pt interfaces on top of e.g. an Ethernet. 11394 * XXX Undocumented HACK for testing. 11395 * pt-pt interfaces are created with NUD disabled. 11396 */ 11397 ipif->ipif_flags |= IPIF_POINTOPOINT; 11398 ipif->ipif_flags &= ~IPIF_BROADCAST; 11399 if (ipif->ipif_isv6) 11400 ill->ill_flags |= ILLF_NONUD; 11401 } 11402 11403 /* 11404 * If the interface was previously marked as a duplicate, then since 11405 * we've now got a "new" address, it should no longer be considered a 11406 * duplicate -- even if the "new" address is the same as the old one. 11407 * Note that if all ipifs are down, we may have a pending ARP down 11408 * event to handle. 
11409 */ 11410 need_dl_down = need_arp_down = B_FALSE; 11411 if (ipif->ipif_flags & IPIF_DUPLICATE) { 11412 need_arp_down = !need_up; 11413 ipif->ipif_flags &= ~IPIF_DUPLICATE; 11414 if (--ill->ill_ipif_dup_count == 0 && !need_up && 11415 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) { 11416 need_dl_down = B_TRUE; 11417 } 11418 } 11419 11420 /* Set the new address. */ 11421 ipif->ipif_v6pp_dst_addr = v6addr; 11422 /* Make sure subnet tracks pp_dst */ 11423 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 11424 mutex_exit(&ill->ill_lock); 11425 11426 if (need_up) { 11427 /* 11428 * Now bring the interface back up. If this 11429 * is the only IPIF for the ILL, ipif_up 11430 * will have to re-bind to the device, so 11431 * we may get back EINPROGRESS, in which 11432 * case, this IOCTL will get completed in 11433 * ip_rput_dlpi when we see the DL_BIND_ACK. 11434 */ 11435 err = ipif_up(ipif, q, mp); 11436 } 11437 11438 if (need_dl_down) 11439 ill_dl_down(ill); 11440 11441 if (need_arp_down) 11442 ipif_arp_down(ipif); 11443 return (err); 11444 } 11445 11446 /* 11447 * Restart entry point to restart the dstaddress set operation after the 11448 * refcounts have dropped to zero. 11449 */ 11450 /* ARGSUSED */ 11451 int 11452 ip_sioctl_dstaddr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11453 ip_ioctl_cmd_t *ipip, void *ifreq) 11454 { 11455 ip1dbg(("ip_sioctl_dstaddr_restart(%s:%u %p)\n", 11456 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11457 ipif_down_tail(ipif); 11458 return (ip_sioctl_dstaddr_tail(ipif, sin, q, mp, B_TRUE)); 11459 } 11460 11461 /* ARGSUSED */ 11462 int 11463 ip_sioctl_get_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11464 ip_ioctl_cmd_t *ipip, void *if_req) 11465 { 11466 sin6_t *sin6 = (struct sockaddr_in6 *)sin; 11467 11468 ip1dbg(("ip_sioctl_get_dstaddr(%s:%u %p)\n", 11469 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11470 /* 11471 * Get point to point destination address. 
The addresses can't 11472 * change since we hold a reference to the ipif. 11473 */ 11474 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) 11475 return (EADDRNOTAVAIL); 11476 11477 if (ipif->ipif_isv6) { 11478 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 11479 *sin6 = sin6_null; 11480 sin6->sin6_family = AF_INET6; 11481 sin6->sin6_addr = ipif->ipif_v6pp_dst_addr; 11482 } else { 11483 *sin = sin_null; 11484 sin->sin_family = AF_INET; 11485 sin->sin_addr.s_addr = ipif->ipif_pp_dst_addr; 11486 } 11487 return (0); 11488 } 11489 11490 /* 11491 * part of ipmp, make this func return the active/inactive state and 11492 * caller can set once atomically instead of multiple mutex_enter/mutex_exit 11493 */ 11494 /* 11495 * This function either sets or clears the IFF_INACTIVE flag. 11496 * 11497 * As long as there are some addresses or multicast memberships on the 11498 * IPv4 or IPv6 interface of the "phyi" that does not belong in here, we 11499 * will consider it to be ACTIVE (clear IFF_INACTIVE) i.e the interface 11500 * will be used for outbound packets. 11501 * 11502 * Caller needs to verify the validity of setting IFF_INACTIVE. 
11503 */ 11504 static void 11505 phyint_inactive(phyint_t *phyi) 11506 { 11507 ill_t *ill_v4; 11508 ill_t *ill_v6; 11509 ipif_t *ipif; 11510 ilm_t *ilm; 11511 11512 ill_v4 = phyi->phyint_illv4; 11513 ill_v6 = phyi->phyint_illv6; 11514 11515 /* 11516 * No need for a lock while traversing the list since iam 11517 * a writer 11518 */ 11519 if (ill_v4 != NULL) { 11520 ASSERT(IAM_WRITER_ILL(ill_v4)); 11521 for (ipif = ill_v4->ill_ipif; ipif != NULL; 11522 ipif = ipif->ipif_next) { 11523 if (ipif->ipif_orig_ifindex != phyi->phyint_ifindex) { 11524 mutex_enter(&phyi->phyint_lock); 11525 phyi->phyint_flags &= ~PHYI_INACTIVE; 11526 mutex_exit(&phyi->phyint_lock); 11527 return; 11528 } 11529 } 11530 for (ilm = ill_v4->ill_ilm; ilm != NULL; 11531 ilm = ilm->ilm_next) { 11532 if (ilm->ilm_orig_ifindex != phyi->phyint_ifindex) { 11533 mutex_enter(&phyi->phyint_lock); 11534 phyi->phyint_flags &= ~PHYI_INACTIVE; 11535 mutex_exit(&phyi->phyint_lock); 11536 return; 11537 } 11538 } 11539 } 11540 if (ill_v6 != NULL) { 11541 ill_v6 = phyi->phyint_illv6; 11542 for (ipif = ill_v6->ill_ipif; ipif != NULL; 11543 ipif = ipif->ipif_next) { 11544 if (ipif->ipif_orig_ifindex != phyi->phyint_ifindex) { 11545 mutex_enter(&phyi->phyint_lock); 11546 phyi->phyint_flags &= ~PHYI_INACTIVE; 11547 mutex_exit(&phyi->phyint_lock); 11548 return; 11549 } 11550 } 11551 for (ilm = ill_v6->ill_ilm; ilm != NULL; 11552 ilm = ilm->ilm_next) { 11553 if (ilm->ilm_orig_ifindex != phyi->phyint_ifindex) { 11554 mutex_enter(&phyi->phyint_lock); 11555 phyi->phyint_flags &= ~PHYI_INACTIVE; 11556 mutex_exit(&phyi->phyint_lock); 11557 return; 11558 } 11559 } 11560 } 11561 mutex_enter(&phyi->phyint_lock); 11562 phyi->phyint_flags |= PHYI_INACTIVE; 11563 mutex_exit(&phyi->phyint_lock); 11564 } 11565 11566 /* 11567 * This function is called only when the phyint flags change. Currently 11568 * called from ip_sioctl_flags. We re-do the broadcast nomination so 11569 * that we can select a good ill. 
11570 */ 11571 static void 11572 ip_redo_nomination(phyint_t *phyi) 11573 { 11574 ill_t *ill_v4; 11575 11576 ill_v4 = phyi->phyint_illv4; 11577 11578 if (ill_v4 != NULL && ill_v4->ill_group != NULL) { 11579 ASSERT(IAM_WRITER_ILL(ill_v4)); 11580 if (ill_v4->ill_group->illgrp_ill_count > 1) 11581 ill_nominate_bcast_rcv(ill_v4->ill_group); 11582 } 11583 } 11584 11585 /* 11586 * Heuristic to check if ill is INACTIVE. 11587 * Checks if ill has an ipif with an usable ip address. 11588 * 11589 * Return values: 11590 * B_TRUE - ill is INACTIVE; has no usable ipif 11591 * B_FALSE - ill is not INACTIVE; ill has at least one usable ipif 11592 */ 11593 static boolean_t 11594 ill_is_inactive(ill_t *ill) 11595 { 11596 ipif_t *ipif; 11597 11598 /* Check whether it is in an IPMP group */ 11599 if (ill->ill_phyint->phyint_groupname == NULL) 11600 return (B_FALSE); 11601 11602 if (ill->ill_ipif_up_count == 0) 11603 return (B_TRUE); 11604 11605 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 11606 uint64_t flags = ipif->ipif_flags; 11607 11608 /* 11609 * This ipif is usable if it is IPIF_UP and not a 11610 * dedicated test address. A dedicated test address 11611 * is marked IPIF_NOFAILOVER *and* IPIF_DEPRECATED 11612 * (note in particular that V6 test addresses are 11613 * link-local data addresses and thus are marked 11614 * IPIF_NOFAILOVER but not IPIF_DEPRECATED). 11615 */ 11616 if ((flags & IPIF_UP) && 11617 ((flags & (IPIF_DEPRECATED|IPIF_NOFAILOVER)) != 11618 (IPIF_DEPRECATED|IPIF_NOFAILOVER))) 11619 return (B_FALSE); 11620 } 11621 return (B_TRUE); 11622 } 11623 11624 /* 11625 * Set interface flags. 11626 * Need to do special action for IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, 11627 * IPIF_NOLOCAL, ILLF_NONUD, ILLF_NOARP, IPIF_PRIVATE, IPIF_ANYCAST, 11628 * IPIF_PREFERRED, PHYI_STANDBY, PHYI_FAILED and PHYI_OFFLINE. 11629 * 11630 * NOTE : We really don't enforce that ipif_id zero should be used 11631 * for setting any flags other than IFF_LOGINT_FLAGS. 
This 11632 * is because applications generally does SICGLIFFLAGS and 11633 * ORs in the new flags (that affects the logical) and does a 11634 * SIOCSLIFFLAGS. Thus, "flags" below could contain bits other 11635 * than IFF_LOGINT_FLAGS. One could check whether "turn_on" - the 11636 * flags that will be turned on is correct with respect to 11637 * ipif_id 0. For backward compatibility reasons, it is not done. 11638 */ 11639 /* ARGSUSED */ 11640 int 11641 ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11642 ip_ioctl_cmd_t *ipip, void *if_req) 11643 { 11644 uint64_t turn_on; 11645 uint64_t turn_off; 11646 int err; 11647 boolean_t need_up = B_FALSE; 11648 phyint_t *phyi; 11649 ill_t *ill; 11650 uint64_t intf_flags; 11651 boolean_t phyint_flags_modified = B_FALSE; 11652 uint64_t flags; 11653 struct ifreq *ifr; 11654 struct lifreq *lifr; 11655 boolean_t set_linklocal = B_FALSE; 11656 boolean_t zero_source = B_FALSE; 11657 ip_stack_t *ipst; 11658 11659 ip1dbg(("ip_sioctl_flags(%s:%u %p)\n", 11660 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11661 11662 ASSERT(IAM_WRITER_IPIF(ipif)); 11663 11664 ill = ipif->ipif_ill; 11665 phyi = ill->ill_phyint; 11666 ipst = ill->ill_ipst; 11667 11668 if (ipip->ipi_cmd_type == IF_CMD) { 11669 ifr = (struct ifreq *)if_req; 11670 flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff); 11671 } else { 11672 lifr = (struct lifreq *)if_req; 11673 flags = lifr->lifr_flags; 11674 } 11675 11676 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags; 11677 11678 /* 11679 * Has the flags been set correctly till now ? 11680 */ 11681 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0); 11682 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0); 11683 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0); 11684 /* 11685 * Compare the new flags to the old, and partition 11686 * into those coming on and those going off. 11687 * For the 16 bit command keep the bits above bit 16 unchanged. 
11688 */ 11689 if (ipip->ipi_cmd == SIOCSIFFLAGS) 11690 flags |= intf_flags & ~0xFFFF; 11691 11692 /* 11693 * First check which bits will change and then which will 11694 * go on and off 11695 */ 11696 turn_on = (flags ^ intf_flags) & ~IFF_CANTCHANGE; 11697 if (!turn_on) 11698 return (0); /* No change */ 11699 11700 turn_off = intf_flags & turn_on; 11701 turn_on ^= turn_off; 11702 err = 0; 11703 11704 /* 11705 * Don't allow any bits belonging to the logical interface 11706 * to be set or cleared on the replacement ipif that was 11707 * created temporarily during a MOVE. 11708 */ 11709 if (ipif->ipif_replace_zero && 11710 ((turn_on|turn_off) & IFF_LOGINT_FLAGS) != 0) { 11711 return (EINVAL); 11712 } 11713 11714 /* 11715 * Only allow the IFF_XRESOLV and IFF_TEMPORARY flags to be set on 11716 * IPv6 interfaces. 11717 */ 11718 if ((turn_on & (IFF_XRESOLV|IFF_TEMPORARY)) && !(ipif->ipif_isv6)) 11719 return (EINVAL); 11720 11721 /* 11722 * cannot turn off IFF_NOXMIT on VNI interfaces. 11723 */ 11724 if ((turn_off & IFF_NOXMIT) && IS_VNI(ipif->ipif_ill)) 11725 return (EINVAL); 11726 11727 /* 11728 * Don't allow the IFF_ROUTER flag to be turned on on loopback 11729 * interfaces. It makes no sense in that context. 11730 */ 11731 if ((turn_on & IFF_ROUTER) && (phyi->phyint_flags & PHYI_LOOPBACK)) 11732 return (EINVAL); 11733 11734 if (flags & (IFF_NOLOCAL|IFF_ANYCAST)) 11735 zero_source = B_TRUE; 11736 11737 /* 11738 * For IPv6 ipif_id 0, don't allow the interface to be up without 11739 * a link local address if IFF_NOLOCAL or IFF_ANYCAST are not set. 11740 * If the link local address isn't set, and can be set, it will get 11741 * set later on in this function. 
11742 */ 11743 if (ipif->ipif_id == 0 && ipif->ipif_isv6 && 11744 (flags & IFF_UP) && !zero_source && 11745 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) { 11746 if (ipif_cant_setlinklocal(ipif)) 11747 return (EINVAL); 11748 set_linklocal = B_TRUE; 11749 } 11750 11751 /* 11752 * ILL cannot be part of a usesrc group and and IPMP group at the 11753 * same time. No need to grab ill_g_usesrc_lock here, see 11754 * synchronization notes in ip.c 11755 */ 11756 if (turn_on & PHYI_STANDBY && 11757 ipif->ipif_ill->ill_usesrc_grp_next != NULL) { 11758 return (EINVAL); 11759 } 11760 11761 /* 11762 * If we modify physical interface flags, we'll potentially need to 11763 * send up two routing socket messages for the changes (one for the 11764 * IPv4 ill, and another for the IPv6 ill). Note that here. 11765 */ 11766 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS) 11767 phyint_flags_modified = B_TRUE; 11768 11769 /* 11770 * If we are setting or clearing FAILED or STANDBY or OFFLINE, 11771 * we need to flush the IRE_CACHES belonging to this ill. 11772 * We handle this case here without doing the DOWN/UP dance 11773 * like it is done for other flags. If some other flags are 11774 * being turned on/off with FAILED/STANDBY/OFFLINE, the code 11775 * below will handle it by bringing it down and then 11776 * bringing it UP. 11777 */ 11778 if ((turn_on|turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE)) { 11779 ill_t *ill_v4, *ill_v6; 11780 11781 ill_v4 = phyi->phyint_illv4; 11782 ill_v6 = phyi->phyint_illv6; 11783 11784 /* 11785 * First set the INACTIVE flag if needed. Then delete the ires. 11786 * ire_add will atomically prevent creating new IRE_CACHEs 11787 * unless hidden flag is set. 
11788 * PHYI_FAILED and PHYI_INACTIVE are exclusive 11789 */ 11790 if ((turn_on & PHYI_FAILED) && 11791 ((intf_flags & PHYI_STANDBY) || 11792 !ipst->ips_ipmp_enable_failback)) { 11793 /* Reset PHYI_INACTIVE when PHYI_FAILED is being set */ 11794 phyi->phyint_flags &= ~PHYI_INACTIVE; 11795 } 11796 if ((turn_off & PHYI_FAILED) && 11797 ((intf_flags & PHYI_STANDBY) || 11798 (!ipst->ips_ipmp_enable_failback && 11799 ill_is_inactive(ill)))) { 11800 phyint_inactive(phyi); 11801 } 11802 11803 if (turn_on & PHYI_STANDBY) { 11804 /* 11805 * We implicitly set INACTIVE only when STANDBY is set. 11806 * INACTIVE is also set on non-STANDBY phyint when user 11807 * disables FAILBACK using configuration file. 11808 * Do not allow STANDBY to be set on such INACTIVE 11809 * phyint 11810 */ 11811 if (phyi->phyint_flags & PHYI_INACTIVE) 11812 return (EINVAL); 11813 if (!(phyi->phyint_flags & PHYI_FAILED)) 11814 phyint_inactive(phyi); 11815 } 11816 if (turn_off & PHYI_STANDBY) { 11817 if (ipst->ips_ipmp_enable_failback) { 11818 /* 11819 * Reset PHYI_INACTIVE. 11820 */ 11821 phyi->phyint_flags &= ~PHYI_INACTIVE; 11822 } else if (ill_is_inactive(ill) && 11823 !(phyi->phyint_flags & PHYI_FAILED)) { 11824 /* 11825 * Need to set INACTIVE, when user sets 11826 * STANDBY on a non-STANDBY phyint and 11827 * later resets STANDBY 11828 */ 11829 phyint_inactive(phyi); 11830 } 11831 } 11832 /* 11833 * We should always send up a message so that the 11834 * daemons come to know of it. Note that the zeroth 11835 * interface can be down and the check below for IPIF_UP 11836 * will not make sense as we are actually setting 11837 * a phyint flag here. We assume that the ipif used 11838 * is always the zeroth ipif. (ip_rts_ifmsg does not 11839 * send up any message for non-zero ipifs). 
11840 */ 11841 phyint_flags_modified = B_TRUE; 11842 11843 if (ill_v4 != NULL) { 11844 ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, 11845 IRE_CACHE, ill_stq_cache_delete, 11846 (char *)ill_v4, ill_v4); 11847 illgrp_reset_schednext(ill_v4); 11848 } 11849 if (ill_v6 != NULL) { 11850 ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, 11851 IRE_CACHE, ill_stq_cache_delete, 11852 (char *)ill_v6, ill_v6); 11853 illgrp_reset_schednext(ill_v6); 11854 } 11855 } 11856 11857 /* 11858 * If ILLF_ROUTER changes, we need to change the ip forwarding 11859 * status of the interface and, if the interface is part of an IPMP 11860 * group, all other interfaces that are part of the same IPMP 11861 * group. 11862 */ 11863 if ((turn_on | turn_off) & ILLF_ROUTER) 11864 (void) ill_forward_set(ill, ((turn_on & ILLF_ROUTER) != 0)); 11865 11866 /* 11867 * If the interface is not UP and we are not going to 11868 * bring it UP, record the flags and return. When the 11869 * interface comes UP later, the right actions will be 11870 * taken. 11871 */ 11872 if (!(ipif->ipif_flags & IPIF_UP) && 11873 !(turn_on & IPIF_UP)) { 11874 /* Record new flags in their respective places. */ 11875 mutex_enter(&ill->ill_lock); 11876 mutex_enter(&ill->ill_phyint->phyint_lock); 11877 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS); 11878 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS); 11879 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS); 11880 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS); 11881 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS); 11882 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS); 11883 mutex_exit(&ill->ill_lock); 11884 mutex_exit(&ill->ill_phyint->phyint_lock); 11885 11886 /* 11887 * We do the broadcast and nomination here rather 11888 * than waiting for a FAILOVER/FAILBACK to happen. In 11889 * the case of FAILBACK from INACTIVE standby to the 11890 * interface that has been repaired, PHYI_FAILED has not 11891 * been cleared yet. 
If there are only two interfaces in 11892 * that group, all we have is a FAILED and INACTIVE 11893 * interface. If we do the nomination soon after a failback, 11894 * the broadcast nomination code would select the 11895 * INACTIVE interface for receiving broadcasts as FAILED is 11896 * not yet cleared. As we don't want STANDBY/INACTIVE to 11897 * receive broadcast packets, we need to redo nomination 11898 * when the FAILED is cleared here. Thus, in general we 11899 * always do the nomination here for FAILED, STANDBY 11900 * and OFFLINE. 11901 */ 11902 if (((turn_on | turn_off) & 11903 (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE))) { 11904 ip_redo_nomination(phyi); 11905 } 11906 if (phyint_flags_modified) { 11907 if (phyi->phyint_illv4 != NULL) { 11908 ip_rts_ifmsg(phyi->phyint_illv4-> 11909 ill_ipif); 11910 } 11911 if (phyi->phyint_illv6 != NULL) { 11912 ip_rts_ifmsg(phyi->phyint_illv6-> 11913 ill_ipif); 11914 } 11915 } 11916 return (0); 11917 } else if (set_linklocal || zero_source) { 11918 mutex_enter(&ill->ill_lock); 11919 if (set_linklocal) 11920 ipif->ipif_state_flags |= IPIF_SET_LINKLOCAL; 11921 if (zero_source) 11922 ipif->ipif_state_flags |= IPIF_ZERO_SOURCE; 11923 mutex_exit(&ill->ill_lock); 11924 } 11925 11926 /* 11927 * Disallow IPv6 interfaces coming up that have the unspecified address, 11928 * or point-to-point interfaces with an unspecified destination. We do 11929 * allow the address to be unspecified for IPIF_NOLOCAL interfaces that 11930 * have a subnet assigned, which is how in.ndpd currently manages its 11931 * onlink prefix list when no addresses are configured with those 11932 * prefixes. 
11933 */ 11934 if (ipif->ipif_isv6 && 11935 ((IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) && 11936 (!(ipif->ipif_flags & IPIF_NOLOCAL) && !(turn_on & IPIF_NOLOCAL) || 11937 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) || 11938 ((ipif->ipif_flags & IPIF_POINTOPOINT) && 11939 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr)))) { 11940 return (EINVAL); 11941 } 11942 11943 /* 11944 * Prevent IPv4 point-to-point interfaces with a 0.0.0.0 destination 11945 * from being brought up. 11946 */ 11947 if (!ipif->ipif_isv6 && 11948 ((ipif->ipif_flags & IPIF_POINTOPOINT) && 11949 ipif->ipif_pp_dst_addr == INADDR_ANY)) { 11950 return (EINVAL); 11951 } 11952 11953 /* 11954 * The only flag changes that we currently take specific action on 11955 * is IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL, 11956 * ILLF_NOARP, ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, and 11957 * IPIF_PREFERRED. This is done by bring the ipif down, changing 11958 * the flags and bringing it back up again. 11959 */ 11960 if ((turn_on|turn_off) & 11961 (IPIF_UP|IPIF_DEPRECATED|IPIF_NOXMIT|IPIF_NOLOCAL|ILLF_NOARP| 11962 ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED)) { 11963 /* 11964 * Taking this ipif down, make sure we have 11965 * valid net and subnet bcast ire's for other 11966 * logical interfaces, if we need them. 
11967 */ 11968 if (!ipif->ipif_isv6) 11969 ipif_check_bcast_ires(ipif); 11970 11971 if (((ipif->ipif_flags | turn_on) & IPIF_UP) && 11972 !(turn_off & IPIF_UP)) { 11973 need_up = B_TRUE; 11974 if (ipif->ipif_flags & IPIF_UP) 11975 ill->ill_logical_down = 1; 11976 turn_on &= ~IPIF_UP; 11977 } 11978 err = ipif_down(ipif, q, mp); 11979 ip1dbg(("ipif_down returns %d err ", err)); 11980 if (err == EINPROGRESS) 11981 return (err); 11982 ipif_down_tail(ipif); 11983 } 11984 return (ip_sioctl_flags_tail(ipif, flags, q, mp, need_up)); 11985 } 11986 11987 static int 11988 ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp, 11989 boolean_t need_up) 11990 { 11991 ill_t *ill; 11992 phyint_t *phyi; 11993 uint64_t turn_on; 11994 uint64_t turn_off; 11995 uint64_t intf_flags; 11996 boolean_t phyint_flags_modified = B_FALSE; 11997 int err = 0; 11998 boolean_t set_linklocal = B_FALSE; 11999 boolean_t zero_source = B_FALSE; 12000 12001 ip1dbg(("ip_sioctl_flags_tail(%s:%u)\n", 12002 ipif->ipif_ill->ill_name, ipif->ipif_id)); 12003 12004 ASSERT(IAM_WRITER_IPIF(ipif)); 12005 12006 ill = ipif->ipif_ill; 12007 phyi = ill->ill_phyint; 12008 12009 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags; 12010 turn_on = (flags ^ intf_flags) & ~(IFF_CANTCHANGE | IFF_UP); 12011 12012 turn_off = intf_flags & turn_on; 12013 turn_on ^= turn_off; 12014 12015 if ((turn_on|turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE)) 12016 phyint_flags_modified = B_TRUE; 12017 12018 /* 12019 * Now we change the flags. Track current value of 12020 * other flags in their respective places. 
12021 */ 12022 mutex_enter(&ill->ill_lock); 12023 mutex_enter(&phyi->phyint_lock); 12024 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS); 12025 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS); 12026 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS); 12027 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS); 12028 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS); 12029 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS); 12030 if (ipif->ipif_state_flags & IPIF_SET_LINKLOCAL) { 12031 set_linklocal = B_TRUE; 12032 ipif->ipif_state_flags &= ~IPIF_SET_LINKLOCAL; 12033 } 12034 if (ipif->ipif_state_flags & IPIF_ZERO_SOURCE) { 12035 zero_source = B_TRUE; 12036 ipif->ipif_state_flags &= ~IPIF_ZERO_SOURCE; 12037 } 12038 mutex_exit(&ill->ill_lock); 12039 mutex_exit(&phyi->phyint_lock); 12040 12041 if (((turn_on | turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE))) 12042 ip_redo_nomination(phyi); 12043 12044 if (set_linklocal) 12045 (void) ipif_setlinklocal(ipif); 12046 12047 if (zero_source) 12048 ipif->ipif_v6src_addr = ipv6_all_zeros; 12049 else 12050 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; 12051 12052 if (need_up) { 12053 /* 12054 * XXX ipif_up really does not know whether a phyint flags 12055 * was modified or not. So, it sends up information on 12056 * only one routing sockets message. As we don't bring up 12057 * the interface and also set STANDBY/FAILED simultaneously 12058 * it should be okay. 12059 */ 12060 err = ipif_up(ipif, q, mp); 12061 } else { 12062 /* 12063 * Make sure routing socket sees all changes to the flags. 12064 * ipif_up_done* handles this when we use ipif_up. 
12065 */ 12066 if (phyint_flags_modified) { 12067 if (phyi->phyint_illv4 != NULL) { 12068 ip_rts_ifmsg(phyi->phyint_illv4-> 12069 ill_ipif); 12070 } 12071 if (phyi->phyint_illv6 != NULL) { 12072 ip_rts_ifmsg(phyi->phyint_illv6-> 12073 ill_ipif); 12074 } 12075 } else { 12076 ip_rts_ifmsg(ipif); 12077 } 12078 /* 12079 * Update the flags in SCTP's IPIF list, ipif_up() will do 12080 * this in need_up case. 12081 */ 12082 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 12083 } 12084 return (err); 12085 } 12086 12087 /* 12088 * Restart entry point to restart the flags restart operation after the 12089 * refcounts have dropped to zero. 12090 */ 12091 /* ARGSUSED */ 12092 int 12093 ip_sioctl_flags_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12094 ip_ioctl_cmd_t *ipip, void *if_req) 12095 { 12096 int err; 12097 struct ifreq *ifr = (struct ifreq *)if_req; 12098 struct lifreq *lifr = (struct lifreq *)if_req; 12099 12100 ip1dbg(("ip_sioctl_flags_restart(%s:%u %p)\n", 12101 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12102 12103 ipif_down_tail(ipif); 12104 if (ipip->ipi_cmd_type == IF_CMD) { 12105 /* 12106 * Since ip_sioctl_flags expects an int and ifr_flags 12107 * is a short we need to cast ifr_flags into an int 12108 * to avoid having sign extension cause bits to get 12109 * set that should not be. 12110 */ 12111 err = ip_sioctl_flags_tail(ipif, 12112 (uint64_t)(ifr->ifr_flags & 0x0000ffff), 12113 q, mp, B_TRUE); 12114 } else { 12115 err = ip_sioctl_flags_tail(ipif, lifr->lifr_flags, 12116 q, mp, B_TRUE); 12117 } 12118 return (err); 12119 } 12120 12121 /* 12122 * Can operate on either a module or a driver queue. 12123 */ 12124 /* ARGSUSED */ 12125 int 12126 ip_sioctl_get_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12127 ip_ioctl_cmd_t *ipip, void *if_req) 12128 { 12129 /* 12130 * Has the flags been set correctly till now ? 
12131 */ 12132 ill_t *ill = ipif->ipif_ill; 12133 phyint_t *phyi = ill->ill_phyint; 12134 12135 ip1dbg(("ip_sioctl_get_flags(%s:%u %p)\n", 12136 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12137 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0); 12138 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0); 12139 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0); 12140 12141 /* 12142 * Need a lock since some flags can be set even when there are 12143 * references to the ipif. 12144 */ 12145 mutex_enter(&ill->ill_lock); 12146 if (ipip->ipi_cmd_type == IF_CMD) { 12147 struct ifreq *ifr = (struct ifreq *)if_req; 12148 12149 /* Get interface flags (low 16 only). */ 12150 ifr->ifr_flags = ((ipif->ipif_flags | 12151 ill->ill_flags | phyi->phyint_flags) & 0xffff); 12152 } else { 12153 struct lifreq *lifr = (struct lifreq *)if_req; 12154 12155 /* Get interface flags. */ 12156 lifr->lifr_flags = ipif->ipif_flags | 12157 ill->ill_flags | phyi->phyint_flags; 12158 } 12159 mutex_exit(&ill->ill_lock); 12160 return (0); 12161 } 12162 12163 /* ARGSUSED */ 12164 int 12165 ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12166 ip_ioctl_cmd_t *ipip, void *if_req) 12167 { 12168 int mtu; 12169 int ip_min_mtu; 12170 struct ifreq *ifr; 12171 struct lifreq *lifr; 12172 ire_t *ire; 12173 ip_stack_t *ipst; 12174 12175 ip1dbg(("ip_sioctl_mtu(%s:%u %p)\n", ipif->ipif_ill->ill_name, 12176 ipif->ipif_id, (void *)ipif)); 12177 if (ipip->ipi_cmd_type == IF_CMD) { 12178 ifr = (struct ifreq *)if_req; 12179 mtu = ifr->ifr_metric; 12180 } else { 12181 lifr = (struct lifreq *)if_req; 12182 mtu = lifr->lifr_mtu; 12183 } 12184 12185 if (ipif->ipif_isv6) 12186 ip_min_mtu = IPV6_MIN_MTU; 12187 else 12188 ip_min_mtu = IP_MIN_MTU; 12189 12190 if (mtu > ipif->ipif_ill->ill_max_frag || mtu < ip_min_mtu) 12191 return (EINVAL); 12192 12193 /* 12194 * Change the MTU size in all relevant ire's. 12195 * Mtu change Vs. new ire creation - protocol below. 
12196 * First change ipif_mtu and the ire_max_frag of the 12197 * interface ire. Then do an ire walk and change the 12198 * ire_max_frag of all affected ires. During ire_add 12199 * under the bucket lock, set the ire_max_frag of the 12200 * new ire being created from the ipif/ire from which 12201 * it is being derived. If an mtu change happens after 12202 * the ire is added, the new ire will be cleaned up. 12203 * Conversely if the mtu change happens before the ire 12204 * is added, ire_add will see the new value of the mtu. 12205 */ 12206 ipif->ipif_mtu = mtu; 12207 ipif->ipif_flags |= IPIF_FIXEDMTU; 12208 12209 if (ipif->ipif_isv6) 12210 ire = ipif_to_ire_v6(ipif); 12211 else 12212 ire = ipif_to_ire(ipif); 12213 if (ire != NULL) { 12214 ire->ire_max_frag = ipif->ipif_mtu; 12215 ire_refrele(ire); 12216 } 12217 ipst = ipif->ipif_ill->ill_ipst; 12218 if (ipif->ipif_flags & IPIF_UP) { 12219 if (ipif->ipif_isv6) 12220 ire_walk_v6(ipif_mtu_change, (char *)ipif, ALL_ZONES, 12221 ipst); 12222 else 12223 ire_walk_v4(ipif_mtu_change, (char *)ipif, ALL_ZONES, 12224 ipst); 12225 } 12226 /* Update the MTU in SCTP's list */ 12227 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 12228 return (0); 12229 } 12230 12231 /* Get interface MTU. */ 12232 /* ARGSUSED */ 12233 int 12234 ip_sioctl_get_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12235 ip_ioctl_cmd_t *ipip, void *if_req) 12236 { 12237 struct ifreq *ifr; 12238 struct lifreq *lifr; 12239 12240 ip1dbg(("ip_sioctl_get_mtu(%s:%u %p)\n", 12241 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12242 if (ipip->ipi_cmd_type == IF_CMD) { 12243 ifr = (struct ifreq *)if_req; 12244 ifr->ifr_metric = ipif->ipif_mtu; 12245 } else { 12246 lifr = (struct lifreq *)if_req; 12247 lifr->lifr_mtu = ipif->ipif_mtu; 12248 } 12249 return (0); 12250 } 12251 12252 /* Set interface broadcast address. 
*/ 12253 /* ARGSUSED2 */ 12254 int 12255 ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12256 ip_ioctl_cmd_t *ipip, void *if_req) 12257 { 12258 ipaddr_t addr; 12259 ire_t *ire; 12260 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 12261 12262 ip1dbg(("ip_sioctl_brdaddr(%s:%u)\n", ipif->ipif_ill->ill_name, 12263 ipif->ipif_id)); 12264 12265 ASSERT(IAM_WRITER_IPIF(ipif)); 12266 if (!(ipif->ipif_flags & IPIF_BROADCAST)) 12267 return (EADDRNOTAVAIL); 12268 12269 ASSERT(!(ipif->ipif_isv6)); /* No IPv6 broadcast */ 12270 12271 if (sin->sin_family != AF_INET) 12272 return (EAFNOSUPPORT); 12273 12274 addr = sin->sin_addr.s_addr; 12275 if (ipif->ipif_flags & IPIF_UP) { 12276 /* 12277 * If we are already up, make sure the new 12278 * broadcast address makes sense. If it does, 12279 * there should be an IRE for it already. 12280 * Don't match on ipif, only on the ill 12281 * since we are sharing these now. Don't use 12282 * MATCH_IRE_ILL_GROUP as we are looking for 12283 * the broadcast ire on this ill and each ill 12284 * in the group has its own broadcast ire. 12285 */ 12286 ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, 12287 ipif, ALL_ZONES, NULL, 12288 (MATCH_IRE_ILL | MATCH_IRE_TYPE), ipst); 12289 if (ire == NULL) { 12290 return (EINVAL); 12291 } else { 12292 ire_refrele(ire); 12293 } 12294 } 12295 /* 12296 * Changing the broadcast addr for this ipif. 12297 * Make sure we have valid net and subnet bcast 12298 * ire's for other logical interfaces, if needed. 12299 */ 12300 if (addr != ipif->ipif_brd_addr) 12301 ipif_check_bcast_ires(ipif); 12302 IN6_IPADDR_TO_V4MAPPED(addr, &ipif->ipif_v6brd_addr); 12303 return (0); 12304 } 12305 12306 /* Get interface broadcast address. 
*/ 12307 /* ARGSUSED */ 12308 int 12309 ip_sioctl_get_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12310 ip_ioctl_cmd_t *ipip, void *if_req) 12311 { 12312 ip1dbg(("ip_sioctl_get_brdaddr(%s:%u %p)\n", 12313 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12314 if (!(ipif->ipif_flags & IPIF_BROADCAST)) 12315 return (EADDRNOTAVAIL); 12316 12317 /* IPIF_BROADCAST not possible with IPv6 */ 12318 ASSERT(!ipif->ipif_isv6); 12319 *sin = sin_null; 12320 sin->sin_family = AF_INET; 12321 sin->sin_addr.s_addr = ipif->ipif_brd_addr; 12322 return (0); 12323 } 12324 12325 /* 12326 * This routine is called to handle the SIOCS*IFNETMASK IOCTL. 12327 */ 12328 /* ARGSUSED */ 12329 int 12330 ip_sioctl_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12331 ip_ioctl_cmd_t *ipip, void *if_req) 12332 { 12333 int err = 0; 12334 in6_addr_t v6mask; 12335 12336 ip1dbg(("ip_sioctl_netmask(%s:%u %p)\n", 12337 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12338 12339 ASSERT(IAM_WRITER_IPIF(ipif)); 12340 12341 if (ipif->ipif_isv6) { 12342 sin6_t *sin6; 12343 12344 if (sin->sin_family != AF_INET6) 12345 return (EAFNOSUPPORT); 12346 12347 sin6 = (sin6_t *)sin; 12348 v6mask = sin6->sin6_addr; 12349 } else { 12350 ipaddr_t mask; 12351 12352 if (sin->sin_family != AF_INET) 12353 return (EAFNOSUPPORT); 12354 12355 mask = sin->sin_addr.s_addr; 12356 V4MASK_TO_V6(mask, v6mask); 12357 } 12358 12359 /* 12360 * No big deal if the interface isn't already up, or the mask 12361 * isn't really changing, or this is pt-pt. 
12362 */ 12363 if (!(ipif->ipif_flags & IPIF_UP) || 12364 IN6_ARE_ADDR_EQUAL(&v6mask, &ipif->ipif_v6net_mask) || 12365 (ipif->ipif_flags & IPIF_POINTOPOINT)) { 12366 ipif->ipif_v6net_mask = v6mask; 12367 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 12368 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 12369 ipif->ipif_v6net_mask, 12370 ipif->ipif_v6subnet); 12371 } 12372 return (0); 12373 } 12374 /* 12375 * Make sure we have valid net and subnet broadcast ire's 12376 * for the old netmask, if needed by other logical interfaces. 12377 */ 12378 if (!ipif->ipif_isv6) 12379 ipif_check_bcast_ires(ipif); 12380 12381 err = ipif_logical_down(ipif, q, mp); 12382 if (err == EINPROGRESS) 12383 return (err); 12384 ipif_down_tail(ipif); 12385 err = ip_sioctl_netmask_tail(ipif, sin, q, mp); 12386 return (err); 12387 } 12388 12389 static int 12390 ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp) 12391 { 12392 in6_addr_t v6mask; 12393 int err = 0; 12394 12395 ip1dbg(("ip_sioctl_netmask_tail(%s:%u %p)\n", 12396 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12397 12398 if (ipif->ipif_isv6) { 12399 sin6_t *sin6; 12400 12401 sin6 = (sin6_t *)sin; 12402 v6mask = sin6->sin6_addr; 12403 } else { 12404 ipaddr_t mask; 12405 12406 mask = sin->sin_addr.s_addr; 12407 V4MASK_TO_V6(mask, v6mask); 12408 } 12409 12410 ipif->ipif_v6net_mask = v6mask; 12411 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 12412 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 12413 ipif->ipif_v6subnet); 12414 } 12415 err = ipif_up(ipif, q, mp); 12416 12417 if (err == 0 || err == EINPROGRESS) { 12418 /* 12419 * The interface must be DL_BOUND if this packet has to 12420 * go out on the wire. Since we only go through a logical 12421 * down and are bound with the driver during an internal 12422 * down/up that is satisfied. 12423 */ 12424 if (!ipif->ipif_isv6 && ipif->ipif_ill->ill_wq != NULL) { 12425 /* Potentially broadcast an address mask reply. 
*/ 12426 ipif_mask_reply(ipif); 12427 } 12428 } 12429 return (err); 12430 } 12431 12432 /* ARGSUSED */ 12433 int 12434 ip_sioctl_netmask_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12435 ip_ioctl_cmd_t *ipip, void *if_req) 12436 { 12437 ip1dbg(("ip_sioctl_netmask_restart(%s:%u %p)\n", 12438 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12439 ipif_down_tail(ipif); 12440 return (ip_sioctl_netmask_tail(ipif, sin, q, mp)); 12441 } 12442 12443 /* Get interface net mask. */ 12444 /* ARGSUSED */ 12445 int 12446 ip_sioctl_get_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12447 ip_ioctl_cmd_t *ipip, void *if_req) 12448 { 12449 struct lifreq *lifr = (struct lifreq *)if_req; 12450 struct sockaddr_in6 *sin6 = (sin6_t *)sin; 12451 12452 ip1dbg(("ip_sioctl_get_netmask(%s:%u %p)\n", 12453 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12454 12455 /* 12456 * net mask can't change since we have a reference to the ipif. 12457 */ 12458 if (ipif->ipif_isv6) { 12459 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 12460 *sin6 = sin6_null; 12461 sin6->sin6_family = AF_INET6; 12462 sin6->sin6_addr = ipif->ipif_v6net_mask; 12463 lifr->lifr_addrlen = 12464 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 12465 } else { 12466 *sin = sin_null; 12467 sin->sin_family = AF_INET; 12468 sin->sin_addr.s_addr = ipif->ipif_net_mask; 12469 if (ipip->ipi_cmd_type == LIF_CMD) { 12470 lifr->lifr_addrlen = 12471 ip_mask_to_plen(ipif->ipif_net_mask); 12472 } 12473 } 12474 return (0); 12475 } 12476 12477 /* ARGSUSED */ 12478 int 12479 ip_sioctl_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12480 ip_ioctl_cmd_t *ipip, void *if_req) 12481 { 12482 12483 ip1dbg(("ip_sioctl_metric(%s:%u %p)\n", 12484 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12485 /* 12486 * Set interface metric. We don't use this for 12487 * anything but we keep track of it in case it is 12488 * important to routing applications or such. 
12489 */ 12490 if (ipip->ipi_cmd_type == IF_CMD) { 12491 struct ifreq *ifr; 12492 12493 ifr = (struct ifreq *)if_req; 12494 ipif->ipif_metric = ifr->ifr_metric; 12495 } else { 12496 struct lifreq *lifr; 12497 12498 lifr = (struct lifreq *)if_req; 12499 ipif->ipif_metric = lifr->lifr_metric; 12500 } 12501 return (0); 12502 } 12503 12504 12505 /* ARGSUSED */ 12506 int 12507 ip_sioctl_get_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12508 ip_ioctl_cmd_t *ipip, void *if_req) 12509 { 12510 12511 /* Get interface metric. */ 12512 ip1dbg(("ip_sioctl_get_metric(%s:%u %p)\n", 12513 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12514 if (ipip->ipi_cmd_type == IF_CMD) { 12515 struct ifreq *ifr; 12516 12517 ifr = (struct ifreq *)if_req; 12518 ifr->ifr_metric = ipif->ipif_metric; 12519 } else { 12520 struct lifreq *lifr; 12521 12522 lifr = (struct lifreq *)if_req; 12523 lifr->lifr_metric = ipif->ipif_metric; 12524 } 12525 12526 return (0); 12527 } 12528 12529 /* ARGSUSED */ 12530 int 12531 ip_sioctl_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12532 ip_ioctl_cmd_t *ipip, void *if_req) 12533 { 12534 12535 ip1dbg(("ip_sioctl_muxid(%s:%u %p)\n", 12536 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12537 /* 12538 * Set the muxid returned from I_PLINK. 
12539 */ 12540 if (ipip->ipi_cmd_type == IF_CMD) { 12541 struct ifreq *ifr = (struct ifreq *)if_req; 12542 12543 ipif->ipif_ill->ill_ip_muxid = ifr->ifr_ip_muxid; 12544 ipif->ipif_ill->ill_arp_muxid = ifr->ifr_arp_muxid; 12545 } else { 12546 struct lifreq *lifr = (struct lifreq *)if_req; 12547 12548 ipif->ipif_ill->ill_ip_muxid = lifr->lifr_ip_muxid; 12549 ipif->ipif_ill->ill_arp_muxid = lifr->lifr_arp_muxid; 12550 } 12551 return (0); 12552 } 12553 12554 /* ARGSUSED */ 12555 int 12556 ip_sioctl_get_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12557 ip_ioctl_cmd_t *ipip, void *if_req) 12558 { 12559 12560 ip1dbg(("ip_sioctl_get_muxid(%s:%u %p)\n", 12561 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12562 /* 12563 * Get the muxid saved in ill for I_PUNLINK. 12564 */ 12565 if (ipip->ipi_cmd_type == IF_CMD) { 12566 struct ifreq *ifr = (struct ifreq *)if_req; 12567 12568 ifr->ifr_ip_muxid = ipif->ipif_ill->ill_ip_muxid; 12569 ifr->ifr_arp_muxid = ipif->ipif_ill->ill_arp_muxid; 12570 } else { 12571 struct lifreq *lifr = (struct lifreq *)if_req; 12572 12573 lifr->lifr_ip_muxid = ipif->ipif_ill->ill_ip_muxid; 12574 lifr->lifr_arp_muxid = ipif->ipif_ill->ill_arp_muxid; 12575 } 12576 return (0); 12577 } 12578 12579 /* 12580 * Set the subnet prefix. Does not modify the broadcast address. 
12581 */ 12582 /* ARGSUSED */ 12583 int 12584 ip_sioctl_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12585 ip_ioctl_cmd_t *ipip, void *if_req) 12586 { 12587 int err = 0; 12588 in6_addr_t v6addr; 12589 in6_addr_t v6mask; 12590 boolean_t need_up = B_FALSE; 12591 int addrlen; 12592 12593 ip1dbg(("ip_sioctl_subnet(%s:%u %p)\n", 12594 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12595 12596 ASSERT(IAM_WRITER_IPIF(ipif)); 12597 addrlen = ((struct lifreq *)if_req)->lifr_addrlen; 12598 12599 if (ipif->ipif_isv6) { 12600 sin6_t *sin6; 12601 12602 if (sin->sin_family != AF_INET6) 12603 return (EAFNOSUPPORT); 12604 12605 sin6 = (sin6_t *)sin; 12606 v6addr = sin6->sin6_addr; 12607 if (!ip_remote_addr_ok_v6(&v6addr, &ipv6_all_ones)) 12608 return (EADDRNOTAVAIL); 12609 } else { 12610 ipaddr_t addr; 12611 12612 if (sin->sin_family != AF_INET) 12613 return (EAFNOSUPPORT); 12614 12615 addr = sin->sin_addr.s_addr; 12616 if (!ip_addr_ok_v4(addr, 0xFFFFFFFF)) 12617 return (EADDRNOTAVAIL); 12618 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 12619 /* Add 96 bits */ 12620 addrlen += IPV6_ABITS - IP_ABITS; 12621 } 12622 12623 if (ip_plen_to_mask_v6(addrlen, &v6mask) == NULL) 12624 return (EINVAL); 12625 12626 /* Check if bits in the address is set past the mask */ 12627 if (!V6_MASK_EQ(v6addr, v6mask, v6addr)) 12628 return (EINVAL); 12629 12630 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6subnet, &v6addr) && 12631 IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6net_mask, &v6mask)) 12632 return (0); /* No change */ 12633 12634 if (ipif->ipif_flags & IPIF_UP) { 12635 /* 12636 * If the interface is already marked up, 12637 * we call ipif_down which will take care 12638 * of ditching any IREs that have been set 12639 * up based on the old interface address. 
12640 */ 12641 err = ipif_logical_down(ipif, q, mp); 12642 if (err == EINPROGRESS) 12643 return (err); 12644 ipif_down_tail(ipif); 12645 need_up = B_TRUE; 12646 } 12647 12648 err = ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, need_up); 12649 return (err); 12650 } 12651 12652 static int 12653 ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t v6addr, in6_addr_t v6mask, 12654 queue_t *q, mblk_t *mp, boolean_t need_up) 12655 { 12656 ill_t *ill = ipif->ipif_ill; 12657 int err = 0; 12658 12659 ip1dbg(("ip_sioctl_subnet_tail(%s:%u %p)\n", 12660 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12661 12662 /* Set the new address. */ 12663 mutex_enter(&ill->ill_lock); 12664 ipif->ipif_v6net_mask = v6mask; 12665 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 12666 V6_MASK_COPY(v6addr, ipif->ipif_v6net_mask, 12667 ipif->ipif_v6subnet); 12668 } 12669 mutex_exit(&ill->ill_lock); 12670 12671 if (need_up) { 12672 /* 12673 * Now bring the interface back up. If this 12674 * is the only IPIF for the ILL, ipif_up 12675 * will have to re-bind to the device, so 12676 * we may get back EINPROGRESS, in which 12677 * case, this IOCTL will get completed in 12678 * ip_rput_dlpi when we see the DL_BIND_ACK. 
12679 */ 12680 err = ipif_up(ipif, q, mp); 12681 if (err == EINPROGRESS) 12682 return (err); 12683 } 12684 return (err); 12685 } 12686 12687 /* ARGSUSED */ 12688 int 12689 ip_sioctl_subnet_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12690 ip_ioctl_cmd_t *ipip, void *if_req) 12691 { 12692 int addrlen; 12693 in6_addr_t v6addr; 12694 in6_addr_t v6mask; 12695 struct lifreq *lifr = (struct lifreq *)if_req; 12696 12697 ip1dbg(("ip_sioctl_subnet_restart(%s:%u %p)\n", 12698 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12699 ipif_down_tail(ipif); 12700 12701 addrlen = lifr->lifr_addrlen; 12702 if (ipif->ipif_isv6) { 12703 sin6_t *sin6; 12704 12705 sin6 = (sin6_t *)sin; 12706 v6addr = sin6->sin6_addr; 12707 } else { 12708 ipaddr_t addr; 12709 12710 addr = sin->sin_addr.s_addr; 12711 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 12712 addrlen += IPV6_ABITS - IP_ABITS; 12713 } 12714 (void) ip_plen_to_mask_v6(addrlen, &v6mask); 12715 12716 return (ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, B_TRUE)); 12717 } 12718 12719 /* ARGSUSED */ 12720 int 12721 ip_sioctl_get_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12722 ip_ioctl_cmd_t *ipip, void *if_req) 12723 { 12724 struct lifreq *lifr = (struct lifreq *)if_req; 12725 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sin; 12726 12727 ip1dbg(("ip_sioctl_get_subnet(%s:%u %p)\n", 12728 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12729 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 12730 12731 if (ipif->ipif_isv6) { 12732 *sin6 = sin6_null; 12733 sin6->sin6_family = AF_INET6; 12734 sin6->sin6_addr = ipif->ipif_v6subnet; 12735 lifr->lifr_addrlen = 12736 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 12737 } else { 12738 *sin = sin_null; 12739 sin->sin_family = AF_INET; 12740 sin->sin_addr.s_addr = ipif->ipif_subnet; 12741 lifr->lifr_addrlen = ip_mask_to_plen(ipif->ipif_net_mask); 12742 } 12743 return (0); 12744 } 12745 12746 /* 12747 * Set the IPv6 address token. 
12748 */ 12749 /* ARGSUSED */ 12750 int 12751 ip_sioctl_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12752 ip_ioctl_cmd_t *ipi, void *if_req) 12753 { 12754 ill_t *ill = ipif->ipif_ill; 12755 int err; 12756 in6_addr_t v6addr; 12757 in6_addr_t v6mask; 12758 boolean_t need_up = B_FALSE; 12759 int i; 12760 sin6_t *sin6 = (sin6_t *)sin; 12761 struct lifreq *lifr = (struct lifreq *)if_req; 12762 int addrlen; 12763 12764 ip1dbg(("ip_sioctl_token(%s:%u %p)\n", 12765 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12766 ASSERT(IAM_WRITER_IPIF(ipif)); 12767 12768 addrlen = lifr->lifr_addrlen; 12769 /* Only allow for logical unit zero i.e. not on "le0:17" */ 12770 if (ipif->ipif_id != 0) 12771 return (EINVAL); 12772 12773 if (!ipif->ipif_isv6) 12774 return (EINVAL); 12775 12776 if (addrlen > IPV6_ABITS) 12777 return (EINVAL); 12778 12779 v6addr = sin6->sin6_addr; 12780 12781 /* 12782 * The length of the token is the length from the end. To get 12783 * the proper mask for this, compute the mask of the bits not 12784 * in the token; ie. the prefix, and then xor to get the mask. 
12785 */ 12786 if (ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask) == NULL) 12787 return (EINVAL); 12788 for (i = 0; i < 4; i++) { 12789 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 12790 } 12791 12792 if (V6_MASK_EQ(v6addr, v6mask, ill->ill_token) && 12793 ill->ill_token_length == addrlen) 12794 return (0); /* No change */ 12795 12796 if (ipif->ipif_flags & IPIF_UP) { 12797 err = ipif_logical_down(ipif, q, mp); 12798 if (err == EINPROGRESS) 12799 return (err); 12800 ipif_down_tail(ipif); 12801 need_up = B_TRUE; 12802 } 12803 err = ip_sioctl_token_tail(ipif, sin6, addrlen, q, mp, need_up); 12804 return (err); 12805 } 12806 12807 static int 12808 ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, queue_t *q, 12809 mblk_t *mp, boolean_t need_up) 12810 { 12811 in6_addr_t v6addr; 12812 in6_addr_t v6mask; 12813 ill_t *ill = ipif->ipif_ill; 12814 int i; 12815 int err = 0; 12816 12817 ip1dbg(("ip_sioctl_token_tail(%s:%u %p)\n", 12818 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12819 v6addr = sin6->sin6_addr; 12820 /* 12821 * The length of the token is the length from the end. To get 12822 * the proper mask for this, compute the mask of the bits not 12823 * in the token; ie. the prefix, and then xor to get the mask. 12824 */ 12825 (void) ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask); 12826 for (i = 0; i < 4; i++) 12827 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 12828 12829 mutex_enter(&ill->ill_lock); 12830 V6_MASK_COPY(v6addr, v6mask, ill->ill_token); 12831 ill->ill_token_length = addrlen; 12832 mutex_exit(&ill->ill_lock); 12833 12834 if (need_up) { 12835 /* 12836 * Now bring the interface back up. If this 12837 * is the only IPIF for the ILL, ipif_up 12838 * will have to re-bind to the device, so 12839 * we may get back EINPROGRESS, in which 12840 * case, this IOCTL will get completed in 12841 * ip_rput_dlpi when we see the DL_BIND_ACK. 
12842 */ 12843 err = ipif_up(ipif, q, mp); 12844 if (err == EINPROGRESS) 12845 return (err); 12846 } 12847 return (err); 12848 } 12849 12850 /* ARGSUSED */ 12851 int 12852 ip_sioctl_get_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12853 ip_ioctl_cmd_t *ipi, void *if_req) 12854 { 12855 ill_t *ill; 12856 sin6_t *sin6 = (sin6_t *)sin; 12857 struct lifreq *lifr = (struct lifreq *)if_req; 12858 12859 ip1dbg(("ip_sioctl_get_token(%s:%u %p)\n", 12860 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12861 if (ipif->ipif_id != 0) 12862 return (EINVAL); 12863 12864 ill = ipif->ipif_ill; 12865 if (!ill->ill_isv6) 12866 return (ENXIO); 12867 12868 *sin6 = sin6_null; 12869 sin6->sin6_family = AF_INET6; 12870 ASSERT(!IN6_IS_ADDR_V4MAPPED(&ill->ill_token)); 12871 sin6->sin6_addr = ill->ill_token; 12872 lifr->lifr_addrlen = ill->ill_token_length; 12873 return (0); 12874 } 12875 12876 /* 12877 * Set (hardware) link specific information that might override 12878 * what was acquired through the DL_INFO_ACK. 12879 * The logic is as follows. 12880 * 12881 * become exclusive 12882 * set CHANGING flag 12883 * change mtu on affected IREs 12884 * clear CHANGING flag 12885 * 12886 * An ire add that occurs before the CHANGING flag is set will have its mtu 12887 * changed by the ip_sioctl_lnkinfo. 12888 * 12889 * During the time the CHANGING flag is set, no new ires will be added to the 12890 * bucket, and ire add will fail (due the CHANGING flag). 12891 * 12892 * An ire add that occurs after the CHANGING flag is set will have the right mtu 12893 * before it is added to the bucket. 12894 * 12895 * Obviously only 1 thread can set the CHANGING flag and we need to become 12896 * exclusive to set the flag. 
12897 */ 12898 /* ARGSUSED */ 12899 int 12900 ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12901 ip_ioctl_cmd_t *ipi, void *if_req) 12902 { 12903 ill_t *ill = ipif->ipif_ill; 12904 ipif_t *nipif; 12905 int ip_min_mtu; 12906 boolean_t mtu_walk = B_FALSE; 12907 struct lifreq *lifr = (struct lifreq *)if_req; 12908 lif_ifinfo_req_t *lir; 12909 ire_t *ire; 12910 12911 ip1dbg(("ip_sioctl_lnkinfo(%s:%u %p)\n", 12912 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12913 lir = &lifr->lifr_ifinfo; 12914 ASSERT(IAM_WRITER_IPIF(ipif)); 12915 12916 /* Only allow for logical unit zero i.e. not on "le0:17" */ 12917 if (ipif->ipif_id != 0) 12918 return (EINVAL); 12919 12920 /* Set interface MTU. */ 12921 if (ipif->ipif_isv6) 12922 ip_min_mtu = IPV6_MIN_MTU; 12923 else 12924 ip_min_mtu = IP_MIN_MTU; 12925 12926 /* 12927 * Verify values before we set anything. Allow zero to 12928 * mean unspecified. 12929 */ 12930 if (lir->lir_maxmtu != 0 && 12931 (lir->lir_maxmtu > ill->ill_max_frag || 12932 lir->lir_maxmtu < ip_min_mtu)) 12933 return (EINVAL); 12934 if (lir->lir_reachtime != 0 && 12935 lir->lir_reachtime > ND_MAX_REACHTIME) 12936 return (EINVAL); 12937 if (lir->lir_reachretrans != 0 && 12938 lir->lir_reachretrans > ND_MAX_REACHRETRANSTIME) 12939 return (EINVAL); 12940 12941 mutex_enter(&ill->ill_lock); 12942 ill->ill_state_flags |= ILL_CHANGING; 12943 for (nipif = ill->ill_ipif; nipif != NULL; 12944 nipif = nipif->ipif_next) { 12945 nipif->ipif_state_flags |= IPIF_CHANGING; 12946 } 12947 12948 mutex_exit(&ill->ill_lock); 12949 12950 if (lir->lir_maxmtu != 0) { 12951 ill->ill_max_mtu = lir->lir_maxmtu; 12952 ill->ill_mtu_userspecified = 1; 12953 mtu_walk = B_TRUE; 12954 } 12955 12956 if (lir->lir_reachtime != 0) 12957 ill->ill_reachable_time = lir->lir_reachtime; 12958 12959 if (lir->lir_reachretrans != 0) 12960 ill->ill_reachable_retrans_time = lir->lir_reachretrans; 12961 12962 ill->ill_max_hops = lir->lir_maxhops; 12963 12964 ill->ill_max_buf = 
ND_MAX_Q; 12965 12966 if (mtu_walk) { 12967 /* 12968 * Set the MTU on all ipifs associated with this ill except 12969 * for those whose MTU was fixed via SIOCSLIFMTU. 12970 */ 12971 for (nipif = ill->ill_ipif; nipif != NULL; 12972 nipif = nipif->ipif_next) { 12973 if (nipif->ipif_flags & IPIF_FIXEDMTU) 12974 continue; 12975 12976 nipif->ipif_mtu = ill->ill_max_mtu; 12977 12978 if (!(nipif->ipif_flags & IPIF_UP)) 12979 continue; 12980 12981 if (nipif->ipif_isv6) 12982 ire = ipif_to_ire_v6(nipif); 12983 else 12984 ire = ipif_to_ire(nipif); 12985 if (ire != NULL) { 12986 ire->ire_max_frag = ipif->ipif_mtu; 12987 ire_refrele(ire); 12988 } 12989 if (ill->ill_isv6) { 12990 ire_walk_ill_v6(MATCH_IRE_ILL, 0, 12991 ipif_mtu_change, (char *)nipif, 12992 ill); 12993 } else { 12994 ire_walk_ill_v4(MATCH_IRE_ILL, 0, 12995 ipif_mtu_change, (char *)nipif, 12996 ill); 12997 } 12998 } 12999 } 13000 13001 mutex_enter(&ill->ill_lock); 13002 for (nipif = ill->ill_ipif; nipif != NULL; 13003 nipif = nipif->ipif_next) { 13004 nipif->ipif_state_flags &= ~IPIF_CHANGING; 13005 } 13006 ILL_UNMARK_CHANGING(ill); 13007 mutex_exit(&ill->ill_lock); 13008 13009 return (0); 13010 } 13011 13012 /* ARGSUSED */ 13013 int 13014 ip_sioctl_get_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 13015 ip_ioctl_cmd_t *ipi, void *if_req) 13016 { 13017 struct lif_ifinfo_req *lir; 13018 ill_t *ill = ipif->ipif_ill; 13019 13020 ip1dbg(("ip_sioctl_get_lnkinfo(%s:%u %p)\n", 13021 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 13022 if (ipif->ipif_id != 0) 13023 return (EINVAL); 13024 13025 lir = &((struct lifreq *)if_req)->lifr_ifinfo; 13026 lir->lir_maxhops = ill->ill_max_hops; 13027 lir->lir_reachtime = ill->ill_reachable_time; 13028 lir->lir_reachretrans = ill->ill_reachable_retrans_time; 13029 lir->lir_maxmtu = ill->ill_max_mtu; 13030 13031 return (0); 13032 } 13033 13034 /* 13035 * Return best guess as to the subnet mask for the specified address. 
13036 * Based on the subnet masks for all the configured interfaces. 13037 * 13038 * We end up returning a zero mask in the case of default, multicast or 13039 * experimental. 13040 */ 13041 static ipaddr_t 13042 ip_subnet_mask(ipaddr_t addr, ipif_t **ipifp, ip_stack_t *ipst) 13043 { 13044 ipaddr_t net_mask; 13045 ill_t *ill; 13046 ipif_t *ipif; 13047 ill_walk_context_t ctx; 13048 ipif_t *fallback_ipif = NULL; 13049 13050 net_mask = ip_net_mask(addr); 13051 if (net_mask == 0) { 13052 *ipifp = NULL; 13053 return (0); 13054 } 13055 13056 /* Let's check to see if this is maybe a local subnet route. */ 13057 /* this function only applies to IPv4 interfaces */ 13058 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 13059 ill = ILL_START_WALK_V4(&ctx, ipst); 13060 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 13061 mutex_enter(&ill->ill_lock); 13062 for (ipif = ill->ill_ipif; ipif != NULL; 13063 ipif = ipif->ipif_next) { 13064 if (!IPIF_CAN_LOOKUP(ipif)) 13065 continue; 13066 if (!(ipif->ipif_flags & IPIF_UP)) 13067 continue; 13068 if ((ipif->ipif_subnet & net_mask) == 13069 (addr & net_mask)) { 13070 /* 13071 * Don't trust pt-pt interfaces if there are 13072 * other interfaces. 13073 */ 13074 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 13075 if (fallback_ipif == NULL) { 13076 ipif_refhold_locked(ipif); 13077 fallback_ipif = ipif; 13078 } 13079 continue; 13080 } 13081 13082 /* 13083 * Fine. Just assume the same net mask as the 13084 * directly attached subnet interface is using. 13085 */ 13086 ipif_refhold_locked(ipif); 13087 mutex_exit(&ill->ill_lock); 13088 rw_exit(&ipst->ips_ill_g_lock); 13089 if (fallback_ipif != NULL) 13090 ipif_refrele(fallback_ipif); 13091 *ipifp = ipif; 13092 return (ipif->ipif_net_mask); 13093 } 13094 } 13095 mutex_exit(&ill->ill_lock); 13096 } 13097 rw_exit(&ipst->ips_ill_g_lock); 13098 13099 *ipifp = fallback_ipif; 13100 return ((fallback_ipif != NULL) ? 
13101 fallback_ipif->ipif_net_mask : net_mask); 13102 } 13103 13104 /* 13105 * ip_sioctl_copyin_setup calls ip_wput_ioctl to process the IP_IOCTL ioctl. 13106 */ 13107 static void 13108 ip_wput_ioctl(queue_t *q, mblk_t *mp) 13109 { 13110 IOCP iocp; 13111 ipft_t *ipft; 13112 ipllc_t *ipllc; 13113 mblk_t *mp1; 13114 cred_t *cr; 13115 int error = 0; 13116 conn_t *connp; 13117 13118 ip1dbg(("ip_wput_ioctl")); 13119 iocp = (IOCP)mp->b_rptr; 13120 mp1 = mp->b_cont; 13121 if (mp1 == NULL) { 13122 iocp->ioc_error = EINVAL; 13123 mp->b_datap->db_type = M_IOCNAK; 13124 iocp->ioc_count = 0; 13125 qreply(q, mp); 13126 return; 13127 } 13128 13129 /* 13130 * These IOCTLs provide various control capabilities to 13131 * upstream agents such as ULPs and processes. There 13132 * are currently two such IOCTLs implemented. They 13133 * are used by TCP to provide update information for 13134 * existing IREs and to forcibly delete an IRE for a 13135 * host that is not responding, thereby forcing an 13136 * attempt at a new route. 13137 */ 13138 iocp->ioc_error = EINVAL; 13139 if (!pullupmsg(mp1, sizeof (ipllc->ipllc_cmd))) 13140 goto done; 13141 13142 ipllc = (ipllc_t *)mp1->b_rptr; 13143 for (ipft = ip_ioctl_ftbl; ipft->ipft_pfi; ipft++) { 13144 if (ipllc->ipllc_cmd == ipft->ipft_cmd) 13145 break; 13146 } 13147 /* 13148 * prefer credential from mblk over ioctl; 13149 * see ip_sioctl_copyin_setup 13150 */ 13151 cr = DB_CREDDEF(mp, iocp->ioc_cr); 13152 13153 /* 13154 * Refhold the conn in case the request gets queued up in some lookup 13155 */ 13156 ASSERT(CONN_Q(q)); 13157 connp = Q_TO_CONN(q); 13158 CONN_INC_REF(connp); 13159 if (ipft->ipft_pfi && 13160 ((mp1->b_wptr - mp1->b_rptr) >= ipft->ipft_min_size || 13161 pullupmsg(mp1, ipft->ipft_min_size))) { 13162 error = (*ipft->ipft_pfi)(q, 13163 (ipft->ipft_flags & IPFT_F_SELF_REPLY) ? 
mp : mp1, cr); 13164 } 13165 if (ipft->ipft_flags & IPFT_F_SELF_REPLY) { 13166 /* 13167 * CONN_OPER_PENDING_DONE happens in the function called 13168 * through ipft_pfi above. 13169 */ 13170 return; 13171 } 13172 13173 CONN_OPER_PENDING_DONE(connp); 13174 if (ipft->ipft_flags & IPFT_F_NO_REPLY) { 13175 freemsg(mp); 13176 return; 13177 } 13178 iocp->ioc_error = error; 13179 13180 done: 13181 mp->b_datap->db_type = M_IOCACK; 13182 if (iocp->ioc_error) 13183 iocp->ioc_count = 0; 13184 qreply(q, mp); 13185 } 13186 13187 /* 13188 * Lookup an ipif using the sequence id (ipif_seqid) 13189 */ 13190 ipif_t * 13191 ipif_lookup_seqid(ill_t *ill, uint_t seqid) 13192 { 13193 ipif_t *ipif; 13194 13195 ASSERT(MUTEX_HELD(&ill->ill_lock)); 13196 13197 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 13198 if (ipif->ipif_seqid == seqid && IPIF_CAN_LOOKUP(ipif)) 13199 return (ipif); 13200 } 13201 return (NULL); 13202 } 13203 13204 /* 13205 * Assign a unique id for the ipif. This is used later when we send 13206 * IRES to ARP for resolution where we initialize ire_ipif_seqid 13207 * to the value pointed by ire_ipif->ipif_seqid. Later when the 13208 * IRE is added, we verify that ipif has not disappeared. 13209 */ 13210 13211 static void 13212 ipif_assign_seqid(ipif_t *ipif) 13213 { 13214 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 13215 13216 ipif->ipif_seqid = atomic_add_64_nv(&ipst->ips_ipif_g_seqid, 1); 13217 } 13218 13219 /* 13220 * Insert the ipif, so that the list of ipifs on the ill will be sorted 13221 * with respect to ipif_id. Note that an ipif with an ipif_id of -1 will 13222 * be inserted into the first space available in the list. The value of 13223 * ipif_id will then be set to the appropriate value for its position. 
13224 */ 13225 static int 13226 ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock) 13227 { 13228 ill_t *ill; 13229 ipif_t *tipif; 13230 ipif_t **tipifp; 13231 int id; 13232 ip_stack_t *ipst; 13233 13234 ASSERT(ipif->ipif_ill->ill_net_type == IRE_LOOPBACK || 13235 IAM_WRITER_IPIF(ipif)); 13236 13237 ill = ipif->ipif_ill; 13238 ASSERT(ill != NULL); 13239 ipst = ill->ill_ipst; 13240 13241 /* 13242 * In the case of lo0:0 we already hold the ill_g_lock. 13243 * ill_lookup_on_name (acquires ill_g_lock) -> ipif_allocate -> 13244 * ipif_insert. Another such caller is ipif_move. 13245 */ 13246 if (acquire_g_lock) 13247 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 13248 if (acquire_ill_lock) 13249 mutex_enter(&ill->ill_lock); 13250 id = ipif->ipif_id; 13251 tipifp = &(ill->ill_ipif); 13252 if (id == -1) { /* need to find a real id */ 13253 id = 0; 13254 while ((tipif = *tipifp) != NULL) { 13255 ASSERT(tipif->ipif_id >= id); 13256 if (tipif->ipif_id != id) 13257 break; /* non-consecutive id */ 13258 id++; 13259 tipifp = &(tipif->ipif_next); 13260 } 13261 /* limit number of logical interfaces */ 13262 if (id >= ipst->ips_ip_addrs_per_if) { 13263 if (acquire_ill_lock) 13264 mutex_exit(&ill->ill_lock); 13265 if (acquire_g_lock) 13266 rw_exit(&ipst->ips_ill_g_lock); 13267 return (-1); 13268 } 13269 ipif->ipif_id = id; /* assign new id */ 13270 } else if (id < ipst->ips_ip_addrs_per_if) { 13271 /* we have a real id; insert ipif in the right place */ 13272 while ((tipif = *tipifp) != NULL) { 13273 ASSERT(tipif->ipif_id != id); 13274 if (tipif->ipif_id > id) 13275 break; /* found correct location */ 13276 tipifp = &(tipif->ipif_next); 13277 } 13278 } else { 13279 if (acquire_ill_lock) 13280 mutex_exit(&ill->ill_lock); 13281 if (acquire_g_lock) 13282 rw_exit(&ipst->ips_ill_g_lock); 13283 return (-1); 13284 } 13285 13286 ASSERT(tipifp != &(ill->ill_ipif) || id == 0); 13287 13288 ipif->ipif_next = tipif; 13289 *tipifp = ipif; 13290 if (acquire_ill_lock) 
13291 mutex_exit(&ill->ill_lock); 13292 if (acquire_g_lock) 13293 rw_exit(&ipst->ips_ill_g_lock); 13294 return (0); 13295 } 13296 13297 static void 13298 ipif_remove(ipif_t *ipif, boolean_t acquire_ill_lock) 13299 { 13300 ipif_t **ipifp; 13301 ill_t *ill = ipif->ipif_ill; 13302 13303 ASSERT(RW_WRITE_HELD(&ill->ill_ipst->ips_ill_g_lock)); 13304 if (acquire_ill_lock) 13305 mutex_enter(&ill->ill_lock); 13306 else 13307 ASSERT(MUTEX_HELD(&ill->ill_lock)); 13308 13309 ipifp = &ill->ill_ipif; 13310 for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) { 13311 if (*ipifp == ipif) { 13312 *ipifp = ipif->ipif_next; 13313 break; 13314 } 13315 } 13316 13317 if (acquire_ill_lock) 13318 mutex_exit(&ill->ill_lock); 13319 } 13320 13321 /* 13322 * Allocate and initialize a new interface control structure. (Always 13323 * called as writer.) 13324 * When ipif_allocate() is called from ip_ll_subnet_defaults, the ill 13325 * is not part of the global linked list of ills. ipif_seqid is unique 13326 * in the system and to preserve the uniqueness, it is assigned only 13327 * when ill becomes part of the global list. At that point ill will 13328 * have a name. If it doesn't get assigned here, it will get assigned 13329 * in ipif_set_values() as part of SIOCSLIFNAME processing. 13330 * Aditionally, if we come here from ip_ll_subnet_defaults, we don't set 13331 * the interface flags or any other information from the DL_INFO_ACK for 13332 * DL_STYLE2 drivers (initialize == B_FALSE), since we won't have them at 13333 * this point. The flags etc. will be set in ip_ll_subnet_defaults when the 13334 * second DL_INFO_ACK comes in from the driver. 
13335 */ 13336 static ipif_t * 13337 ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize) 13338 { 13339 ipif_t *ipif; 13340 phyint_t *phyi; 13341 13342 ip1dbg(("ipif_allocate(%s:%d ill %p)\n", 13343 ill->ill_name, id, (void *)ill)); 13344 ASSERT(ire_type == IRE_LOOPBACK || IAM_WRITER_ILL(ill)); 13345 13346 if ((ipif = (ipif_t *)mi_alloc(sizeof (ipif_t), BPRI_MED)) == NULL) 13347 return (NULL); 13348 *ipif = ipif_zero; /* start clean */ 13349 13350 ipif->ipif_ill = ill; 13351 ipif->ipif_id = id; /* could be -1 */ 13352 /* 13353 * Inherit the zoneid from the ill; for the shared stack instance 13354 * this is always the global zone 13355 */ 13356 ipif->ipif_zoneid = ill->ill_zoneid; 13357 13358 mutex_init(&ipif->ipif_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL); 13359 13360 ipif->ipif_refcnt = 0; 13361 ipif->ipif_saved_ire_cnt = 0; 13362 13363 if (ipif_insert(ipif, ire_type != IRE_LOOPBACK, B_TRUE)) { 13364 mi_free(ipif); 13365 return (NULL); 13366 } 13367 /* -1 id should have been replaced by real id */ 13368 id = ipif->ipif_id; 13369 ASSERT(id >= 0); 13370 13371 if (ill->ill_name[0] != '\0') 13372 ipif_assign_seqid(ipif); 13373 13374 /* 13375 * Keep a copy of original id in ipif_orig_ipifid. Failback 13376 * will attempt to restore the original id. The SIOCSLIFOINDEX 13377 * ioctl sets ipif_orig_ipifid to zero. 13378 */ 13379 ipif->ipif_orig_ipifid = id; 13380 13381 /* 13382 * We grab the ill_lock and phyint_lock to protect the flag changes. 13383 * The ipif is still not up and can't be looked up until the 13384 * ioctl completes and the IPIF_CHANGING flag is cleared. 13385 */ 13386 mutex_enter(&ill->ill_lock); 13387 mutex_enter(&ill->ill_phyint->phyint_lock); 13388 /* 13389 * Set the running flag when logical interface zero is created. 13390 * For subsequent logical interfaces, a DLPI link down 13391 * notification message may have cleared the running flag to 13392 * indicate the link is down, so we shouldn't just blindly set it. 
13393 */ 13394 if (id == 0) 13395 ill->ill_phyint->phyint_flags |= PHYI_RUNNING; 13396 ipif->ipif_ire_type = ire_type; 13397 phyi = ill->ill_phyint; 13398 ipif->ipif_orig_ifindex = phyi->phyint_ifindex; 13399 13400 if (ipif->ipif_isv6) { 13401 ill->ill_flags |= ILLF_IPV6; 13402 } else { 13403 ipaddr_t inaddr_any = INADDR_ANY; 13404 13405 ill->ill_flags |= ILLF_IPV4; 13406 13407 /* Keep the IN6_IS_ADDR_V4MAPPED assertions happy */ 13408 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 13409 &ipif->ipif_v6lcl_addr); 13410 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 13411 &ipif->ipif_v6src_addr); 13412 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 13413 &ipif->ipif_v6subnet); 13414 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 13415 &ipif->ipif_v6net_mask); 13416 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 13417 &ipif->ipif_v6brd_addr); 13418 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 13419 &ipif->ipif_v6pp_dst_addr); 13420 } 13421 13422 /* 13423 * Don't set the interface flags etc. now, will do it in 13424 * ip_ll_subnet_defaults. 13425 */ 13426 if (!initialize) { 13427 mutex_exit(&ill->ill_lock); 13428 mutex_exit(&ill->ill_phyint->phyint_lock); 13429 return (ipif); 13430 } 13431 ipif->ipif_mtu = ill->ill_max_mtu; 13432 13433 if (ill->ill_bcast_addr_length != 0) { 13434 /* 13435 * Later detect lack of DLPI driver multicast 13436 * capability by catching DL_ENABMULTI errors in 13437 * ip_rput_dlpi. 13438 */ 13439 ill->ill_flags |= ILLF_MULTICAST; 13440 if (!ipif->ipif_isv6) 13441 ipif->ipif_flags |= IPIF_BROADCAST; 13442 } else { 13443 if (ill->ill_net_type != IRE_LOOPBACK) { 13444 if (ipif->ipif_isv6) 13445 /* 13446 * Note: xresolv interfaces will eventually need 13447 * NOARP set here as well, but that will require 13448 * those external resolvers to have some 13449 * knowledge of that flag and act appropriately. 13450 * Not to be changed at present. 
13451 */ 13452 ill->ill_flags |= ILLF_NONUD; 13453 else 13454 ill->ill_flags |= ILLF_NOARP; 13455 } 13456 if (ill->ill_phys_addr_length == 0) { 13457 if (ill->ill_media && 13458 ill->ill_media->ip_m_mac_type == SUNW_DL_VNI) { 13459 ipif->ipif_flags |= IPIF_NOXMIT; 13460 phyi->phyint_flags |= PHYI_VIRTUAL; 13461 } else { 13462 /* pt-pt supports multicast. */ 13463 ill->ill_flags |= ILLF_MULTICAST; 13464 if (ill->ill_net_type == IRE_LOOPBACK) { 13465 phyi->phyint_flags |= 13466 (PHYI_LOOPBACK | PHYI_VIRTUAL); 13467 } else { 13468 ipif->ipif_flags |= IPIF_POINTOPOINT; 13469 } 13470 } 13471 } 13472 } 13473 mutex_exit(&ill->ill_lock); 13474 mutex_exit(&ill->ill_phyint->phyint_lock); 13475 return (ipif); 13476 } 13477 13478 /* 13479 * If appropriate, send a message up to the resolver delete the entry 13480 * for the address of this interface which is going out of business. 13481 * (Always called as writer). 13482 * 13483 * NOTE : We need to check for NULL mps as some of the fields are 13484 * initialized only for some interface types. See ipif_resolver_up() 13485 * for details. 13486 */ 13487 void 13488 ipif_arp_down(ipif_t *ipif) 13489 { 13490 mblk_t *mp; 13491 ill_t *ill = ipif->ipif_ill; 13492 13493 ip1dbg(("ipif_arp_down(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 13494 ASSERT(IAM_WRITER_IPIF(ipif)); 13495 13496 /* Delete the mapping for the local address */ 13497 mp = ipif->ipif_arp_del_mp; 13498 if (mp != NULL) { 13499 ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n", 13500 *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id)); 13501 putnext(ill->ill_rq, mp); 13502 ipif->ipif_arp_del_mp = NULL; 13503 } 13504 13505 /* 13506 * If this is the last ipif that is going down and there are no 13507 * duplicate addresses we may yet attempt to re-probe, then we need to 13508 * clean up ARP completely. 
13509 */ 13510 if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0) { 13511 13512 /* Send up AR_INTERFACE_DOWN message */ 13513 mp = ill->ill_arp_down_mp; 13514 if (mp != NULL) { 13515 ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n", 13516 *(unsigned *)mp->b_rptr, ill->ill_name, 13517 ipif->ipif_id)); 13518 putnext(ill->ill_rq, mp); 13519 ill->ill_arp_down_mp = NULL; 13520 } 13521 13522 /* Tell ARP to delete the multicast mappings */ 13523 mp = ill->ill_arp_del_mapping_mp; 13524 if (mp != NULL) { 13525 ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n", 13526 *(unsigned *)mp->b_rptr, ill->ill_name, 13527 ipif->ipif_id)); 13528 putnext(ill->ill_rq, mp); 13529 ill->ill_arp_del_mapping_mp = NULL; 13530 } 13531 } 13532 } 13533 13534 /* 13535 * This function sets up the multicast mappings in ARP. When ipif_resolver_up 13536 * calls this function, it passes a non-NULL arp_add_mapping_mp indicating 13537 * that it wants the add_mp allocated in this function to be returned 13538 * wihtout sending it to arp. When ip_rput_dlpi_writer calls this to 13539 * just re-do the multicast, it wants us to send the add_mp to ARP also. 13540 * ipif_resolver_up does not want us to do the "add" i.e sending to ARP, 13541 * as it does a ipif_arp_down after calling this function - which will 13542 * remove what we add here. 13543 * 13544 * Returns -1 on failures and 0 on success. 13545 */ 13546 int 13547 ipif_arp_setup_multicast(ipif_t *ipif, mblk_t **arp_add_mapping_mp) 13548 { 13549 mblk_t *del_mp = NULL; 13550 mblk_t *add_mp = NULL; 13551 mblk_t *mp; 13552 ill_t *ill = ipif->ipif_ill; 13553 phyint_t *phyi = ill->ill_phyint; 13554 ipaddr_t addr, mask, extract_mask = 0; 13555 arma_t *arma; 13556 uint8_t *maddr, *bphys_addr; 13557 uint32_t hw_start; 13558 dl_unitdata_req_t *dlur; 13559 13560 ASSERT(IAM_WRITER_IPIF(ipif)); 13561 if (ipif->ipif_flags & IPIF_POINTOPOINT) 13562 return (0); 13563 13564 /* 13565 * Delete the existing mapping from ARP. 
Normally ipif_down 13566 * -> ipif_arp_down should send this up to ARP. The only 13567 * reason we would find this when we are switching from 13568 * Multicast to Broadcast where we did not do a down. 13569 */ 13570 mp = ill->ill_arp_del_mapping_mp; 13571 if (mp != NULL) { 13572 ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n", 13573 *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id)); 13574 putnext(ill->ill_rq, mp); 13575 ill->ill_arp_del_mapping_mp = NULL; 13576 } 13577 13578 if (arp_add_mapping_mp != NULL) 13579 *arp_add_mapping_mp = NULL; 13580 13581 /* 13582 * Check that the address is not to long for the constant 13583 * length reserved in the template arma_t. 13584 */ 13585 if (ill->ill_phys_addr_length > IP_MAX_HW_LEN) 13586 return (-1); 13587 13588 /* Add mapping mblk */ 13589 addr = (ipaddr_t)htonl(INADDR_UNSPEC_GROUP); 13590 mask = (ipaddr_t)htonl(IN_CLASSD_NET); 13591 add_mp = ill_arp_alloc(ill, (uchar_t *)&ip_arma_multi_template, 13592 (caddr_t)&addr); 13593 if (add_mp == NULL) 13594 return (-1); 13595 arma = (arma_t *)add_mp->b_rptr; 13596 maddr = (uint8_t *)arma + arma->arma_hw_addr_offset; 13597 bcopy(&mask, (char *)arma + arma->arma_proto_mask_offset, IP_ADDR_LEN); 13598 arma->arma_hw_addr_length = ill->ill_phys_addr_length; 13599 13600 /* 13601 * Determine the broadcast address. 13602 */ 13603 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr; 13604 if (ill->ill_sap_length < 0) 13605 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset; 13606 else 13607 bphys_addr = (uchar_t *)dlur + 13608 dlur->dl_dest_addr_offset + ill->ill_sap_length; 13609 /* 13610 * Check PHYI_MULTI_BCAST and length of physical 13611 * address to determine if we use the mapping or the 13612 * broadcast address. 
13613 */ 13614 if (!(phyi->phyint_flags & PHYI_MULTI_BCAST)) 13615 if (!MEDIA_V4MINFO(ill->ill_media, ill->ill_phys_addr_length, 13616 bphys_addr, maddr, &hw_start, &extract_mask)) 13617 phyi->phyint_flags |= PHYI_MULTI_BCAST; 13618 13619 if ((phyi->phyint_flags & PHYI_MULTI_BCAST) || 13620 (ill->ill_flags & ILLF_MULTICAST)) { 13621 /* Make sure this will not match the "exact" entry. */ 13622 addr = (ipaddr_t)htonl(INADDR_ALLHOSTS_GROUP); 13623 del_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ared_template, 13624 (caddr_t)&addr); 13625 if (del_mp == NULL) { 13626 freemsg(add_mp); 13627 return (-1); 13628 } 13629 bcopy(&extract_mask, (char *)arma + 13630 arma->arma_proto_extract_mask_offset, IP_ADDR_LEN); 13631 if (phyi->phyint_flags & PHYI_MULTI_BCAST) { 13632 /* Use link-layer broadcast address for MULTI_BCAST */ 13633 bcopy(bphys_addr, maddr, ill->ill_phys_addr_length); 13634 ip2dbg(("ipif_arp_setup_multicast: adding" 13635 " MULTI_BCAST ARP setup for %s\n", ill->ill_name)); 13636 } else { 13637 arma->arma_hw_mapping_start = hw_start; 13638 ip2dbg(("ipif_arp_setup_multicast: adding multicast" 13639 " ARP setup for %s\n", ill->ill_name)); 13640 } 13641 } else { 13642 freemsg(add_mp); 13643 ASSERT(del_mp == NULL); 13644 /* It is neither MULTICAST nor MULTI_BCAST */ 13645 return (0); 13646 } 13647 ASSERT(add_mp != NULL && del_mp != NULL); 13648 ASSERT(ill->ill_arp_del_mapping_mp == NULL); 13649 ill->ill_arp_del_mapping_mp = del_mp; 13650 if (arp_add_mapping_mp != NULL) { 13651 /* The caller just wants the mblks allocated */ 13652 *arp_add_mapping_mp = add_mp; 13653 } else { 13654 /* The caller wants us to send it to arp */ 13655 putnext(ill->ill_rq, add_mp); 13656 } 13657 return (0); 13658 } 13659 13660 /* 13661 * Get the resolver set up for a new interface address. 13662 * (Always called as writer.) 13663 * Called both for IPv4 and IPv6 interfaces, 13664 * though it only sets up the resolver for v6 13665 * if it's an xresolv interface (one using an external resolver). 
* Honors ILLF_NOARP.
 * The enumerated value res_act is used to tune the behavior.
 * If set to Res_act_initial, then we set up all the resolver
 * structures for a new interface.  If set to Res_act_move, then
 * we just send an AR_ENTRY_ADD message up to ARP for IPv4
 * interfaces; this is called by ip_rput_dlpi_writer() to handle
 * asynchronous hardware address change notification.  If set to
 * Res_act_defend, then we tell ARP that it needs to send a single
 * gratuitous message in defense of the address.
 *
 * Returns error on failure.  On the Res_act_initial path the success value
 * is EINPROGRESS when this is the first ipif coming up on the ill (no up or
 * duplicate ipifs and the ipif was not itself a duplicate), otherwise 0.
 * For the other actions success is always 0.
 *
 * NOTE(review): a failure of ipif_arp_setup_multicast() is passed through
 * unchanged, and that function documents a -1 failure value rather than an
 * errno -- confirm that callers tolerate -1 here.
 */
int
ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act)
{
	caddr_t	addr;
	mblk_t	*arp_up_mp = NULL;
	mblk_t	*arp_down_mp = NULL;
	mblk_t	*arp_add_mp = NULL;
	mblk_t	*arp_del_mp = NULL;
	mblk_t	*arp_add_mapping_mp = NULL;
	mblk_t	*arp_del_mapping_mp = NULL;
	ill_t	*ill = ipif->ipif_ill;
	uchar_t	*area_p = NULL;
	uchar_t	*ared_p = NULL;
	int	err = ENOMEM;	/* default for the allocation failure gotos */
	boolean_t was_dup;

	ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n",
	    ill->ill_name, ipif->ipif_id, (uint_t)ipif->ipif_flags));
	ASSERT(IAM_WRITER_IPIF(ipif));

	was_dup = B_FALSE;
	if (res_act == Res_act_initial) {
		ipif->ipif_addr_ready = 0;
		/*
		 * We're bringing an interface up here.  There's no way that we
		 * should need to shut down ARP now.
		 */
		mutex_enter(&ill->ill_lock);
		if (ipif->ipif_flags & IPIF_DUPLICATE) {
			ipif->ipif_flags &= ~IPIF_DUPLICATE;
			ill->ill_ipif_dup_count--;
			was_dup = B_TRUE;
		}
		mutex_exit(&ill->ill_lock);
	}
	/* Cancel any pending recovery timeout for this ipif. */
	if (ipif->ipif_recovery_id != 0)
		(void) untimeout(ipif->ipif_recovery_id);
	ipif->ipif_recovery_id = 0;
	if (ill->ill_net_type != IRE_IF_RESOLVER) {
		ipif->ipif_addr_ready = 1;
		return (0);
	}
	/* NDP will set the ipif_addr_ready flag when it's ready */
	if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV))
		return (0);

	if (ill->ill_isv6) {
		/*
		 * External resolver for IPv6
		 */
		ASSERT(res_act == Res_act_initial);
		if (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) {
			addr = (caddr_t)&ipif->ipif_v6lcl_addr;
			area_p = (uchar_t *)&ip6_area_template;
			ared_p = (uchar_t *)&ip6_ared_template;
		}
	} else {
		/*
		 * IPv4 arp case. If the ARP stream has already started
		 * closing, fail this request for ARP bringup. Else
		 * record the fact that an ARP bringup is pending.
		 */
		mutex_enter(&ill->ill_lock);
		if (ill->ill_arp_closing) {
			mutex_exit(&ill->ill_lock);
			err = EINVAL;
			goto failed;
		} else {
			if (ill->ill_ipif_up_count == 0 &&
			    ill->ill_ipif_dup_count == 0 && !was_dup)
				ill->ill_arp_bringup_pending = 1;
			mutex_exit(&ill->ill_lock);
		}
		if (ipif->ipif_lcl_addr != INADDR_ANY) {
			addr = (caddr_t)&ipif->ipif_lcl_addr;
			area_p = (uchar_t *)&ip_area_template;
			ared_p = (uchar_t *)&ip_ared_template;
		}
	}

	/*
	 * Add an entry for the local address in ARP only if it
	 * is not UNNUMBERED and the address is not INADDR_ANY.
	 * (area_p is only non-NULL when addr was set above.)
	 */
	if (!(ipif->ipif_flags & IPIF_UNNUMBERED) && area_p != NULL) {
		area_t *area;

		/* Now ask ARP to publish our address. */
		arp_add_mp = ill_arp_alloc(ill, area_p, addr);
		if (arp_add_mp == NULL)
			goto failed;
		area = (area_t *)arp_add_mp->b_rptr;
		if (res_act != Res_act_initial) {
			/*
			 * Copy the new hardware address and length into
			 * arp_add_mp to be sent to ARP.
			 */
			area->area_hw_addr_length = ill->ill_phys_addr_length;
			bcopy(ill->ill_phys_addr,
			    ((char *)area + area->area_hw_addr_offset),
			    area->area_hw_addr_length);
		}

		area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH |
		    ACE_F_MYADDR;

		if (res_act == Res_act_defend) {
			area->area_flags |= ACE_F_DEFEND;
			/*
			 * If we're just defending our address now, then
			 * there's no need to set up ARP multicast mappings.
			 * The publish command is enough.
			 */
			goto done;
		}

		/* Only Res_act_move reaches this goto. */
		if (res_act != Res_act_initial)
			goto arp_setup_multicast;

		/*
		 * Allocate an ARP deletion message so we know we can tell ARP
		 * when the interface goes down.
		 */
		arp_del_mp = ill_arp_alloc(ill, ared_p, addr);
		if (arp_del_mp == NULL)
			goto failed;

	} else {
		if (res_act != Res_act_initial)
			goto done;
	}
	/*
	 * Need to bring up ARP or setup multicast mapping only
	 * when the first interface is coming UP.
	 */
	if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 ||
	    was_dup) {
		goto done;
	}

	/*
	 * Allocate an ARP down message (to be saved) and an ARP up
	 * message.
	 */
	arp_down_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ard_template, 0);
	if (arp_down_mp == NULL)
		goto failed;

	arp_up_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aru_template, 0);
	if (arp_up_mp == NULL)
		goto failed;

	if (ipif->ipif_flags & IPIF_POINTOPOINT)
		goto done;

arp_setup_multicast:
	/*
	 * Setup the multicast mappings. This function initializes
	 * ill_arp_del_mapping_mp also. This does not need to be done for
	 * IPv6.
	 *
	 * NOTE(review): ipif_arp_setup_multicast() can return 0 without
	 * producing either mblk (point-to-point ipif, or an ill that is
	 * neither MULTICAST nor MULTI_BCAST).  The Res_act_move goto above
	 * does not pass the IPIF_POINTOPOINT check, so confirm the two
	 * ASSERTs below cannot fire on those paths.
	 */
	if (!ill->ill_isv6) {
		err = ipif_arp_setup_multicast(ipif, &arp_add_mapping_mp);
		if (err != 0)
			goto failed;
		ASSERT(ill->ill_arp_del_mapping_mp != NULL);
		ASSERT(arp_add_mapping_mp != NULL);
	}

done:
	/* Save the "undo" messages, then send the "do" messages to ARP. */
	if (arp_del_mp != NULL) {
		ASSERT(ipif->ipif_arp_del_mp == NULL);
		ipif->ipif_arp_del_mp = arp_del_mp;
	}
	if (arp_down_mp != NULL) {
		ASSERT(ill->ill_arp_down_mp == NULL);
		ill->ill_arp_down_mp = arp_down_mp;
	}
	/*
	 * arp_del_mapping_mp is never assigned in this function, so this
	 * branch is not taken; ipif_arp_setup_multicast() stores the delete
	 * mapping mblk in the ill itself.
	 */
	if (arp_del_mapping_mp != NULL) {
		ASSERT(ill->ill_arp_del_mapping_mp == NULL);
		ill->ill_arp_del_mapping_mp = arp_del_mapping_mp;
	}
	if (arp_up_mp != NULL) {
		ip1dbg(("ipif_resolver_up: ARP_UP for %s:%u\n",
		    ill->ill_name, ipif->ipif_id));
		putnext(ill->ill_rq, arp_up_mp);
	}
	if (arp_add_mp != NULL) {
		ip1dbg(("ipif_resolver_up: ARP_ADD for %s:%u\n",
		    ill->ill_name, ipif->ipif_id));
		/*
		 * If it's an extended ARP implementation, then we'll wait to
		 * hear that DAD has finished before using the interface.
		 */
		if (!ill->ill_arp_extend)
			ipif->ipif_addr_ready = 1;
		putnext(ill->ill_rq, arp_add_mp);
	} else {
		ipif->ipif_addr_ready = 1;
	}
	if (arp_add_mapping_mp != NULL) {
		ip1dbg(("ipif_resolver_up: MAPPING_ADD for %s:%u\n",
		    ill->ill_name, ipif->ipif_id));
		putnext(ill->ill_rq, arp_add_mapping_mp);
	}
	if (res_act != Res_act_initial)
		return (0);

	if (ill->ill_flags & ILLF_NOARP)
		err = ill_arp_off(ill);
	else
		err = ill_arp_on(ill);
	if (err != 0) {
		ip0dbg(("ipif_resolver_up: arp_on/off failed %d\n", err));
		/* Drop the saved teardown messages stored above. */
		freemsg(ipif->ipif_arp_del_mp);
		freemsg(ill->ill_arp_down_mp);
		freemsg(ill->ill_arp_del_mapping_mp);
		ipif->ipif_arp_del_mp = NULL;
		ill->ill_arp_down_mp = NULL;
		ill->ill_arp_del_mapping_mp = NULL;
		return (err);
	}
	return ((ill->ill_ipif_up_count != 0 || was_dup ||
	    ill->ill_ipif_dup_count != 0) ? 0 : EINPROGRESS);

failed:
	/* freemsg() is applied to every local mblk, allocated or still NULL. */
	ip1dbg(("ipif_resolver_up: FAILED\n"));
	freemsg(arp_add_mp);
	freemsg(arp_del_mp);
	freemsg(arp_add_mapping_mp);
	freemsg(arp_up_mp);
	freemsg(arp_down_mp);
	ill->ill_arp_bringup_pending = 0;
	return (err);
}

/*
 * This routine restarts IPv4 duplicate address detection (DAD) when a link has
 * just gone back up.
 *
 * It does so by sending ARP a publish request for the ipif's local address
 * with ACE_F_UNVERIFIED set.  If the request cannot be built or does not
 * apply (non-resolver ill, ARP closing, unnumbered ipif, INADDR_ANY address,
 * or allocation failure), the notifications are sent and ipif_addr_ready is
 * set directly instead.
 */
static void
ipif_arp_start_dad(ipif_t *ipif)
{
	ill_t *ill = ipif->ipif_ill;
	mblk_t *arp_add_mp;
	area_t *area;

	if (ill->ill_net_type != IRE_IF_RESOLVER || ill->ill_arp_closing ||
	    (ipif->ipif_flags & IPIF_UNNUMBERED) ||
	    ipif->ipif_lcl_addr == INADDR_ANY ||
	    (arp_add_mp = ill_arp_alloc(ill, (uchar_t *)&ip_area_template,
	    (char *)&ipif->ipif_lcl_addr)) == NULL) {
		/*
		 * If we can't contact ARP for some reason, that's not really a
		 * problem.  Just send out the routing socket notification that
		 * DAD completion would have done, and continue.
		 */
		ipif_mask_reply(ipif);
		ip_rts_ifmsg(ipif);
		ip_rts_newaddrmsg(RTM_ADD, 0, ipif);
		sctp_update_ipif(ipif, SCTP_IPIF_UP);
		ipif->ipif_addr_ready = 1;
		return;
	}

	/* Setting the 'unverified' flag restarts DAD */
	area = (area_t *)arp_add_mp->b_rptr;
	area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR |
	    ACE_F_UNVERIFIED;
	putnext(ill->ill_rq, arp_add_mp);
}

/*
 * IPv6 counterpart of ipif_arp_start_dad(): look up the NCE for the ipif's
 * local address and ask NDP to restart DAD on it.  Nothing is done when no
 * NCE is found.  When ndp_restart_dad() returns false, the routing socket
 * and SCTP notifications are sent and ipif_addr_ready is set here instead.
 * The reference obtained by the lookup is released before returning.
 */
static void
ipif_ndp_start_dad(ipif_t *ipif)
{
	nce_t *nce;

	nce = ndp_lookup_v6(ipif->ipif_ill, &ipif->ipif_v6lcl_addr, B_FALSE);
	if (nce == NULL)
		return;

	if (!ndp_restart_dad(nce)) {
		/*
		 * If we can't restart DAD for some reason, that's not really a
		 * problem.  Just send out the routing socket notification that
		 * DAD completion would have done, and continue.
		 */
		ip_rts_ifmsg(ipif);
		ip_rts_newaddrmsg(RTM_ADD, 0, ipif);
		sctp_update_ipif(ipif, SCTP_IPIF_UP);
		ipif->ipif_addr_ready = 1;
	}
	NCE_REFRELE(nce);
}

/*
 * Restart duplicate address detection on all interfaces on the given ill.
 *
 * This is called when an interface transitions from down to up
 * (DL_NOTE_LINK_UP) or up to down (DL_NOTE_LINK_DOWN).
 *
 * Note that since the underlying physical link has transitioned, we must cause
 * at least one routing socket message to be sent here, either via DAD
 * completion or just by default on the first ipif.  (If we don't do this, then
 * in.mpathd will see long delays when doing link-based failure recovery.)
13982 */ 13983 void 13984 ill_restart_dad(ill_t *ill, boolean_t went_up) 13985 { 13986 ipif_t *ipif; 13987 13988 if (ill == NULL) 13989 return; 13990 13991 /* 13992 * If layer two doesn't support duplicate address detection, then just 13993 * send the routing socket message now and be done with it. 13994 */ 13995 if ((ill->ill_isv6 && (ill->ill_flags & ILLF_XRESOLV)) || 13996 (!ill->ill_isv6 && !ill->ill_arp_extend)) { 13997 ip_rts_ifmsg(ill->ill_ipif); 13998 return; 13999 } 14000 14001 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 14002 if (went_up) { 14003 if (ipif->ipif_flags & IPIF_UP) { 14004 if (ill->ill_isv6) 14005 ipif_ndp_start_dad(ipif); 14006 else 14007 ipif_arp_start_dad(ipif); 14008 } else if (ill->ill_isv6 && 14009 (ipif->ipif_flags & IPIF_DUPLICATE)) { 14010 /* 14011 * For IPv4, the ARP module itself will 14012 * automatically start the DAD process when it 14013 * sees DL_NOTE_LINK_UP. We respond to the 14014 * AR_CN_READY at the completion of that task. 14015 * For IPv6, we must kick off the bring-up 14016 * process now. 14017 */ 14018 ndp_do_recovery(ipif); 14019 } else { 14020 /* 14021 * Unfortunately, the first ipif is "special" 14022 * and represents the underlying ill in the 14023 * routing socket messages. Thus, when this 14024 * one ipif is down, we must still notify so 14025 * that the user knows the IFF_RUNNING status 14026 * change. (If the first ipif is up, then 14027 * we'll handle eventual routing socket 14028 * notification via DAD completion.) 14029 */ 14030 if (ipif == ill->ill_ipif) 14031 ip_rts_ifmsg(ill->ill_ipif); 14032 } 14033 } else { 14034 /* 14035 * After link down, we'll need to send a new routing 14036 * message when the link comes back, so clear 14037 * ipif_addr_ready. 14038 */ 14039 ipif->ipif_addr_ready = 0; 14040 } 14041 } 14042 14043 /* 14044 * If we've torn down links, then notify the user right away. 
14045 */ 14046 if (!went_up) 14047 ip_rts_ifmsg(ill->ill_ipif); 14048 } 14049 14050 /* 14051 * Wakeup all threads waiting to enter the ipsq, and sleeping 14052 * on any of the ills in this ipsq. The ill_lock of the ill 14053 * must be held so that waiters don't miss wakeups 14054 */ 14055 static void 14056 ill_signal_ipsq_ills(ipsq_t *ipsq, boolean_t caller_holds_lock) 14057 { 14058 phyint_t *phyint; 14059 14060 phyint = ipsq->ipsq_phyint_list; 14061 while (phyint != NULL) { 14062 if (phyint->phyint_illv4) { 14063 if (!caller_holds_lock) 14064 mutex_enter(&phyint->phyint_illv4->ill_lock); 14065 ASSERT(MUTEX_HELD(&phyint->phyint_illv4->ill_lock)); 14066 cv_broadcast(&phyint->phyint_illv4->ill_cv); 14067 if (!caller_holds_lock) 14068 mutex_exit(&phyint->phyint_illv4->ill_lock); 14069 } 14070 if (phyint->phyint_illv6) { 14071 if (!caller_holds_lock) 14072 mutex_enter(&phyint->phyint_illv6->ill_lock); 14073 ASSERT(MUTEX_HELD(&phyint->phyint_illv6->ill_lock)); 14074 cv_broadcast(&phyint->phyint_illv6->ill_cv); 14075 if (!caller_holds_lock) 14076 mutex_exit(&phyint->phyint_illv6->ill_lock); 14077 } 14078 phyint = phyint->phyint_ipsq_next; 14079 } 14080 } 14081 14082 static ipsq_t * 14083 ipsq_create(char *groupname, ip_stack_t *ipst) 14084 { 14085 ipsq_t *ipsq; 14086 14087 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 14088 ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP); 14089 if (ipsq == NULL) { 14090 return (NULL); 14091 } 14092 14093 if (groupname != NULL) 14094 (void) strcpy(ipsq->ipsq_name, groupname); 14095 else 14096 ipsq->ipsq_name[0] = '\0'; 14097 14098 mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, NULL); 14099 ipsq->ipsq_flags |= IPSQ_GROUP; 14100 ipsq->ipsq_next = ipst->ips_ipsq_g_head; 14101 ipst->ips_ipsq_g_head = ipsq; 14102 ipsq->ipsq_ipst = ipst; /* No netstack_hold */ 14103 return (ipsq); 14104 } 14105 14106 /* 14107 * Return an ipsq correspoding to the groupname. If 'create' is true 14108 * allocate a new ipsq if one does not exist. 
Usually an ipsq is associated 14109 * uniquely with an IPMP group. However during IPMP groupname operations, 14110 * multiple IPMP groups may be associated with a single ipsq. But no 14111 * IPMP group can be associated with more than 1 ipsq at any time. 14112 * For example 14113 * Interfaces IPMP grpname ipsq ipsq_name ipsq_refs 14114 * hme1, hme2 mpk17-84 ipsq1 mpk17-84 2 14115 * hme3, hme4 mpk17-85 ipsq2 mpk17-85 2 14116 * 14117 * Now the command ifconfig hme3 group mpk17-84 results in the temporary 14118 * status shown below during the execution of the above command. 14119 * hme1, hme2, hme3, hme4 mpk17-84, mpk17-85 ipsq1 mpk17-84 4 14120 * 14121 * After the completion of the above groupname command we return to the stable 14122 * state shown below. 14123 * hme1, hme2, hme3 mpk17-84 ipsq1 mpk17-84 3 14124 * hme4 mpk17-85 ipsq2 mpk17-85 1 14125 * 14126 * Because of the above, we don't search based on the ipsq_name since that 14127 * would miss the correct ipsq during certain windows as shown above. 14128 * The ipsq_name is only used during split of an ipsq to return the ipsq to its 14129 * natural state. 14130 */ 14131 static ipsq_t * 14132 ip_ipsq_lookup(char *groupname, boolean_t create, ipsq_t *exclude_ipsq, 14133 ip_stack_t *ipst) 14134 { 14135 ipsq_t *ipsq; 14136 int group_len; 14137 phyint_t *phyint; 14138 14139 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 14140 14141 group_len = strlen(groupname); 14142 ASSERT(group_len != 0); 14143 group_len++; 14144 14145 for (ipsq = ipst->ips_ipsq_g_head; 14146 ipsq != NULL; 14147 ipsq = ipsq->ipsq_next) { 14148 /* 14149 * When an ipsq is being split, and ill_split_ipsq 14150 * calls this function, we exclude it from being considered. 14151 */ 14152 if (ipsq == exclude_ipsq) 14153 continue; 14154 14155 /* 14156 * Compare against the ipsq_name. The groupname change happens 14157 * in 2 phases. 
The 1st phase merges the from group into 14158 * the to group's ipsq, by calling ill_merge_groups and restarts 14159 * the ioctl. The 2nd phase then locates the ipsq again thru 14160 * ipsq_name. At this point the phyint_groupname has not been 14161 * updated. 14162 */ 14163 if ((group_len == strlen(ipsq->ipsq_name) + 1) && 14164 (bcmp(ipsq->ipsq_name, groupname, group_len) == 0)) { 14165 /* 14166 * Verify that an ipmp groupname is exactly 14167 * part of 1 ipsq and is not found in any other 14168 * ipsq. 14169 */ 14170 ASSERT(ip_ipsq_lookup(groupname, B_FALSE, ipsq, ipst) == 14171 NULL); 14172 return (ipsq); 14173 } 14174 14175 /* 14176 * Comparison against ipsq_name alone is not sufficient. 14177 * In the case when groups are currently being 14178 * merged, the ipsq could hold other IPMP groups temporarily. 14179 * so we walk the phyint list and compare against the 14180 * phyint_groupname as well. 14181 */ 14182 phyint = ipsq->ipsq_phyint_list; 14183 while (phyint != NULL) { 14184 if ((group_len == phyint->phyint_groupname_len) && 14185 (bcmp(phyint->phyint_groupname, groupname, 14186 group_len) == 0)) { 14187 /* 14188 * Verify that an ipmp groupname is exactly 14189 * part of 1 ipsq and is not found in any other 14190 * ipsq. 14191 */ 14192 ASSERT(ip_ipsq_lookup(groupname, B_FALSE, ipsq, 14193 ipst) == NULL); 14194 return (ipsq); 14195 } 14196 phyint = phyint->phyint_ipsq_next; 14197 } 14198 } 14199 if (create) 14200 ipsq = ipsq_create(groupname, ipst); 14201 return (ipsq); 14202 } 14203 14204 static void 14205 ipsq_delete(ipsq_t *ipsq) 14206 { 14207 ipsq_t *nipsq; 14208 ipsq_t *pipsq = NULL; 14209 ip_stack_t *ipst = ipsq->ipsq_ipst; 14210 14211 /* 14212 * We don't hold the ipsq lock, but we are sure no new 14213 * messages can land up, since the ipsq_refs is zero. 14214 * i.e. this ipsq is unnamed and no phyint or phyint group 14215 * is associated with this ipsq. 
(Lookups are based on ill_name 14216 * or phyint_groupname) 14217 */ 14218 ASSERT(ipsq->ipsq_refs == 0); 14219 ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipsq->ipsq_mphead == NULL); 14220 ASSERT(ipsq->ipsq_pending_mp == NULL); 14221 if (!(ipsq->ipsq_flags & IPSQ_GROUP)) { 14222 /* 14223 * This is not the ipsq of an IPMP group. 14224 */ 14225 ipsq->ipsq_ipst = NULL; 14226 kmem_free(ipsq, sizeof (ipsq_t)); 14227 return; 14228 } 14229 14230 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 14231 14232 /* 14233 * Locate the ipsq before we can remove it from 14234 * the singly linked list of ipsq's. 14235 */ 14236 for (nipsq = ipst->ips_ipsq_g_head; nipsq != NULL; 14237 nipsq = nipsq->ipsq_next) { 14238 if (nipsq == ipsq) { 14239 break; 14240 } 14241 pipsq = nipsq; 14242 } 14243 14244 ASSERT(nipsq == ipsq); 14245 14246 /* unlink ipsq from the list */ 14247 if (pipsq != NULL) 14248 pipsq->ipsq_next = ipsq->ipsq_next; 14249 else 14250 ipst->ips_ipsq_g_head = ipsq->ipsq_next; 14251 ipsq->ipsq_ipst = NULL; 14252 kmem_free(ipsq, sizeof (ipsq_t)); 14253 rw_exit(&ipst->ips_ill_g_lock); 14254 } 14255 14256 static void 14257 ill_move_to_new_ipsq(ipsq_t *old_ipsq, ipsq_t *new_ipsq, mblk_t *current_mp, 14258 queue_t *q) 14259 { 14260 ASSERT(MUTEX_HELD(&new_ipsq->ipsq_lock)); 14261 ASSERT(old_ipsq->ipsq_mphead == NULL && old_ipsq->ipsq_mptail == NULL); 14262 ASSERT(old_ipsq->ipsq_pending_ipif == NULL); 14263 ASSERT(old_ipsq->ipsq_pending_mp == NULL); 14264 ASSERT(current_mp != NULL); 14265 14266 ipsq_enq(new_ipsq, q, current_mp, (ipsq_func_t)ip_process_ioctl, 14267 NEW_OP, NULL); 14268 14269 ASSERT(new_ipsq->ipsq_xopq_mptail != NULL && 14270 new_ipsq->ipsq_xopq_mphead != NULL); 14271 14272 /* 14273 * move from old ipsq to the new ipsq. 
14274 */ 14275 new_ipsq->ipsq_xopq_mptail->b_next = old_ipsq->ipsq_xopq_mphead; 14276 if (old_ipsq->ipsq_xopq_mphead != NULL) 14277 new_ipsq->ipsq_xopq_mptail = old_ipsq->ipsq_xopq_mptail; 14278 14279 old_ipsq->ipsq_xopq_mphead = old_ipsq->ipsq_xopq_mptail = NULL; 14280 } 14281 14282 void 14283 ill_group_cleanup(ill_t *ill) 14284 { 14285 ill_t *ill_v4; 14286 ill_t *ill_v6; 14287 ipif_t *ipif; 14288 14289 ill_v4 = ill->ill_phyint->phyint_illv4; 14290 ill_v6 = ill->ill_phyint->phyint_illv6; 14291 14292 if (ill_v4 != NULL) { 14293 mutex_enter(&ill_v4->ill_lock); 14294 for (ipif = ill_v4->ill_ipif; ipif != NULL; 14295 ipif = ipif->ipif_next) { 14296 IPIF_UNMARK_MOVING(ipif); 14297 } 14298 ill_v4->ill_up_ipifs = B_FALSE; 14299 mutex_exit(&ill_v4->ill_lock); 14300 } 14301 14302 if (ill_v6 != NULL) { 14303 mutex_enter(&ill_v6->ill_lock); 14304 for (ipif = ill_v6->ill_ipif; ipif != NULL; 14305 ipif = ipif->ipif_next) { 14306 IPIF_UNMARK_MOVING(ipif); 14307 } 14308 ill_v6->ill_up_ipifs = B_FALSE; 14309 mutex_exit(&ill_v6->ill_lock); 14310 } 14311 } 14312 /* 14313 * This function is called when an ill has had a change in its group status 14314 * to bring up all the ipifs that were up before the change. 14315 */ 14316 int 14317 ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp) 14318 { 14319 ipif_t *ipif; 14320 ill_t *ill_v4; 14321 ill_t *ill_v6; 14322 ill_t *from_ill; 14323 int err = 0; 14324 14325 14326 ASSERT(IAM_WRITER_ILL(ill)); 14327 14328 /* 14329 * Except for ipif_state_flags and ill_state_flags the other 14330 * fields of the ipif/ill that are modified below are protected 14331 * implicitly since we are a writer. We would have tried to down 14332 * even an ipif that was already down, in ill_down_ipifs. So we 14333 * just blindly clear the IPIF_CHANGING flag here on all ipifs. 
14334 */ 14335 ill_v4 = ill->ill_phyint->phyint_illv4; 14336 ill_v6 = ill->ill_phyint->phyint_illv6; 14337 if (ill_v4 != NULL) { 14338 ill_v4->ill_up_ipifs = B_TRUE; 14339 for (ipif = ill_v4->ill_ipif; ipif != NULL; 14340 ipif = ipif->ipif_next) { 14341 mutex_enter(&ill_v4->ill_lock); 14342 ipif->ipif_state_flags &= ~IPIF_CHANGING; 14343 IPIF_UNMARK_MOVING(ipif); 14344 mutex_exit(&ill_v4->ill_lock); 14345 if (ipif->ipif_was_up) { 14346 if (!(ipif->ipif_flags & IPIF_UP)) 14347 err = ipif_up(ipif, q, mp); 14348 ipif->ipif_was_up = B_FALSE; 14349 if (err != 0) { 14350 /* 14351 * Can there be any other error ? 14352 */ 14353 ASSERT(err == EINPROGRESS); 14354 return (err); 14355 } 14356 } 14357 } 14358 mutex_enter(&ill_v4->ill_lock); 14359 ill_v4->ill_state_flags &= ~ILL_CHANGING; 14360 mutex_exit(&ill_v4->ill_lock); 14361 ill_v4->ill_up_ipifs = B_FALSE; 14362 if (ill_v4->ill_move_in_progress) { 14363 ASSERT(ill_v4->ill_move_peer != NULL); 14364 ill_v4->ill_move_in_progress = B_FALSE; 14365 from_ill = ill_v4->ill_move_peer; 14366 from_ill->ill_move_in_progress = B_FALSE; 14367 from_ill->ill_move_peer = NULL; 14368 mutex_enter(&from_ill->ill_lock); 14369 from_ill->ill_state_flags &= ~ILL_CHANGING; 14370 mutex_exit(&from_ill->ill_lock); 14371 if (ill_v6 == NULL) { 14372 if (from_ill->ill_phyint->phyint_flags & 14373 PHYI_STANDBY) { 14374 phyint_inactive(from_ill->ill_phyint); 14375 } 14376 if (ill_v4->ill_phyint->phyint_flags & 14377 PHYI_STANDBY) { 14378 phyint_inactive(ill_v4->ill_phyint); 14379 } 14380 } 14381 ill_v4->ill_move_peer = NULL; 14382 } 14383 } 14384 14385 if (ill_v6 != NULL) { 14386 ill_v6->ill_up_ipifs = B_TRUE; 14387 for (ipif = ill_v6->ill_ipif; ipif != NULL; 14388 ipif = ipif->ipif_next) { 14389 mutex_enter(&ill_v6->ill_lock); 14390 ipif->ipif_state_flags &= ~IPIF_CHANGING; 14391 IPIF_UNMARK_MOVING(ipif); 14392 mutex_exit(&ill_v6->ill_lock); 14393 if (ipif->ipif_was_up) { 14394 if (!(ipif->ipif_flags & IPIF_UP)) 14395 err = ipif_up(ipif, q, mp); 14396 
ipif->ipif_was_up = B_FALSE; 14397 if (err != 0) { 14398 /* 14399 * Can there be any other error ? 14400 */ 14401 ASSERT(err == EINPROGRESS); 14402 return (err); 14403 } 14404 } 14405 } 14406 mutex_enter(&ill_v6->ill_lock); 14407 ill_v6->ill_state_flags &= ~ILL_CHANGING; 14408 mutex_exit(&ill_v6->ill_lock); 14409 ill_v6->ill_up_ipifs = B_FALSE; 14410 if (ill_v6->ill_move_in_progress) { 14411 ASSERT(ill_v6->ill_move_peer != NULL); 14412 ill_v6->ill_move_in_progress = B_FALSE; 14413 from_ill = ill_v6->ill_move_peer; 14414 from_ill->ill_move_in_progress = B_FALSE; 14415 from_ill->ill_move_peer = NULL; 14416 mutex_enter(&from_ill->ill_lock); 14417 from_ill->ill_state_flags &= ~ILL_CHANGING; 14418 mutex_exit(&from_ill->ill_lock); 14419 if (from_ill->ill_phyint->phyint_flags & PHYI_STANDBY) { 14420 phyint_inactive(from_ill->ill_phyint); 14421 } 14422 if (ill_v6->ill_phyint->phyint_flags & PHYI_STANDBY) { 14423 phyint_inactive(ill_v6->ill_phyint); 14424 } 14425 ill_v6->ill_move_peer = NULL; 14426 } 14427 } 14428 return (0); 14429 } 14430 14431 /* 14432 * bring down all the approriate ipifs. 14433 */ 14434 /* ARGSUSED */ 14435 static void 14436 ill_down_ipifs(ill_t *ill, mblk_t *mp, int index, boolean_t chk_nofailover) 14437 { 14438 ipif_t *ipif; 14439 14440 ASSERT(IAM_WRITER_ILL(ill)); 14441 14442 /* 14443 * Except for ipif_state_flags the other fields of the ipif/ill that 14444 * are modified below are protected implicitly since we are a writer 14445 */ 14446 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 14447 if (chk_nofailover && (ipif->ipif_flags & IPIF_NOFAILOVER)) 14448 continue; 14449 if (index == 0 || index == ipif->ipif_orig_ifindex) { 14450 /* 14451 * We go through the ipif_down logic even if the ipif 14452 * is already down, since routes can be added based 14453 * on down ipifs. Going through ipif_down once again 14454 * will delete any IREs created based on these routes. 
14455 */ 14456 if (ipif->ipif_flags & IPIF_UP) 14457 ipif->ipif_was_up = B_TRUE; 14458 /* 14459 * If called with chk_nofailover true ipif is moving. 14460 */ 14461 mutex_enter(&ill->ill_lock); 14462 if (chk_nofailover) { 14463 ipif->ipif_state_flags |= 14464 IPIF_MOVING | IPIF_CHANGING; 14465 } else { 14466 ipif->ipif_state_flags |= IPIF_CHANGING; 14467 } 14468 mutex_exit(&ill->ill_lock); 14469 /* 14470 * Need to re-create net/subnet bcast ires if 14471 * they are dependent on ipif. 14472 */ 14473 if (!ipif->ipif_isv6) 14474 ipif_check_bcast_ires(ipif); 14475 (void) ipif_logical_down(ipif, NULL, NULL); 14476 ipif_non_duplicate(ipif); 14477 ipif_down_tail(ipif); 14478 } 14479 } 14480 } 14481 14482 #define IPSQ_INC_REF(ipsq, ipst) { \ 14483 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); \ 14484 (ipsq)->ipsq_refs++; \ 14485 } 14486 14487 #define IPSQ_DEC_REF(ipsq, ipst) { \ 14488 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); \ 14489 (ipsq)->ipsq_refs--; \ 14490 if ((ipsq)->ipsq_refs == 0) \ 14491 (ipsq)->ipsq_name[0] = '\0'; \ 14492 } 14493 14494 /* 14495 * Change the ipsq of all the ill's whose current ipsq is 'cur_ipsq' to 14496 * new_ipsq. 14497 */ 14498 static void 14499 ill_merge_ipsq(ipsq_t *cur_ipsq, ipsq_t *new_ipsq, ip_stack_t *ipst) 14500 { 14501 phyint_t *phyint; 14502 phyint_t *next_phyint; 14503 14504 /* 14505 * To change the ipsq of an ill, we need to hold the ill_g_lock as 14506 * writer and the ill_lock of the ill in question. Also the dest 14507 * ipsq can't vanish while we hold the ill_g_lock as writer. 
14508 */ 14509 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 14510 14511 phyint = cur_ipsq->ipsq_phyint_list; 14512 cur_ipsq->ipsq_phyint_list = NULL; 14513 while (phyint != NULL) { 14514 next_phyint = phyint->phyint_ipsq_next; 14515 IPSQ_DEC_REF(cur_ipsq, ipst); 14516 phyint->phyint_ipsq_next = new_ipsq->ipsq_phyint_list; 14517 new_ipsq->ipsq_phyint_list = phyint; 14518 IPSQ_INC_REF(new_ipsq, ipst); 14519 phyint->phyint_ipsq = new_ipsq; 14520 phyint = next_phyint; 14521 } 14522 } 14523 14524 #define SPLIT_SUCCESS 0 14525 #define SPLIT_NOT_NEEDED 1 14526 #define SPLIT_FAILED 2 14527 14528 int 14529 ill_split_to_grp_ipsq(phyint_t *phyint, ipsq_t *cur_ipsq, boolean_t need_retry, 14530 ip_stack_t *ipst) 14531 { 14532 ipsq_t *newipsq = NULL; 14533 14534 /* 14535 * Assertions denote pre-requisites for changing the ipsq of 14536 * a phyint 14537 */ 14538 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 14539 /* 14540 * <ill-phyint> assocs can't change while ill_g_lock 14541 * is held as writer. See ill_phyint_reinit() 14542 */ 14543 ASSERT(phyint->phyint_illv4 == NULL || 14544 MUTEX_HELD(&phyint->phyint_illv4->ill_lock)); 14545 ASSERT(phyint->phyint_illv6 == NULL || 14546 MUTEX_HELD(&phyint->phyint_illv6->ill_lock)); 14547 14548 if ((phyint->phyint_groupname_len != 14549 (strlen(cur_ipsq->ipsq_name) + 1) || 14550 bcmp(phyint->phyint_groupname, cur_ipsq->ipsq_name, 14551 phyint->phyint_groupname_len) != 0)) { 14552 /* 14553 * Once we fail in creating a new ipsq due to memory shortage, 14554 * don't attempt to create new ipsq again, based on another 14555 * phyint, since we want all phyints belonging to an IPMP group 14556 * to be in the same ipsq even in the event of mem alloc fails. 
14557 */ 14558 newipsq = ip_ipsq_lookup(phyint->phyint_groupname, !need_retry, 14559 cur_ipsq, ipst); 14560 if (newipsq == NULL) { 14561 /* Memory allocation failure */ 14562 return (SPLIT_FAILED); 14563 } else { 14564 /* ipsq_refs protected by ill_g_lock (writer) */ 14565 IPSQ_DEC_REF(cur_ipsq, ipst); 14566 phyint->phyint_ipsq = newipsq; 14567 phyint->phyint_ipsq_next = newipsq->ipsq_phyint_list; 14568 newipsq->ipsq_phyint_list = phyint; 14569 IPSQ_INC_REF(newipsq, ipst); 14570 return (SPLIT_SUCCESS); 14571 } 14572 } 14573 return (SPLIT_NOT_NEEDED); 14574 } 14575 14576 /* 14577 * The ill locks of the phyint and the ill_g_lock (writer) must be held 14578 * to do this split 14579 */ 14580 static int 14581 ill_split_to_own_ipsq(phyint_t *phyint, ipsq_t *cur_ipsq, ip_stack_t *ipst) 14582 { 14583 ipsq_t *newipsq; 14584 14585 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 14586 /* 14587 * <ill-phyint> assocs can't change while ill_g_lock 14588 * is held as writer. See ill_phyint_reinit() 14589 */ 14590 14591 ASSERT(phyint->phyint_illv4 == NULL || 14592 MUTEX_HELD(&phyint->phyint_illv4->ill_lock)); 14593 ASSERT(phyint->phyint_illv6 == NULL || 14594 MUTEX_HELD(&phyint->phyint_illv6->ill_lock)); 14595 14596 if (!ipsq_init((phyint->phyint_illv4 != NULL) ? 14597 phyint->phyint_illv4: phyint->phyint_illv6)) { 14598 /* 14599 * ipsq_init failed due to no memory 14600 * caller will use the same ipsq 14601 */ 14602 return (SPLIT_FAILED); 14603 } 14604 14605 /* ipsq_ref is protected by ill_g_lock (writer) */ 14606 IPSQ_DEC_REF(cur_ipsq, ipst); 14607 14608 /* 14609 * This is a new ipsq that is unknown to the world. 
14610 * So we don't need to hold ipsq_lock, 14611 */ 14612 newipsq = phyint->phyint_ipsq; 14613 newipsq->ipsq_writer = NULL; 14614 newipsq->ipsq_reentry_cnt--; 14615 ASSERT(newipsq->ipsq_reentry_cnt == 0); 14616 #ifdef DEBUG 14617 newipsq->ipsq_depth = 0; 14618 #endif 14619 14620 return (SPLIT_SUCCESS); 14621 } 14622 14623 /* 14624 * Change the ipsq of all the ill's whose current ipsq is 'cur_ipsq' to 14625 * ipsq's representing their individual groups or themselves. Return 14626 * whether split needs to be retried again later. 14627 */ 14628 static boolean_t 14629 ill_split_ipsq(ipsq_t *cur_ipsq) 14630 { 14631 phyint_t *phyint; 14632 phyint_t *next_phyint; 14633 int error; 14634 boolean_t need_retry = B_FALSE; 14635 ip_stack_t *ipst = cur_ipsq->ipsq_ipst; 14636 14637 phyint = cur_ipsq->ipsq_phyint_list; 14638 cur_ipsq->ipsq_phyint_list = NULL; 14639 while (phyint != NULL) { 14640 next_phyint = phyint->phyint_ipsq_next; 14641 /* 14642 * 'created' will tell us whether the callee actually 14643 * created an ipsq. Lack of memory may force the callee 14644 * to return without creating an ipsq. 14645 */ 14646 if (phyint->phyint_groupname == NULL) { 14647 error = ill_split_to_own_ipsq(phyint, cur_ipsq, ipst); 14648 } else { 14649 error = ill_split_to_grp_ipsq(phyint, cur_ipsq, 14650 need_retry, ipst); 14651 } 14652 14653 switch (error) { 14654 case SPLIT_FAILED: 14655 need_retry = B_TRUE; 14656 /* FALLTHRU */ 14657 case SPLIT_NOT_NEEDED: 14658 /* 14659 * Keep it on the list. 14660 */ 14661 phyint->phyint_ipsq_next = cur_ipsq->ipsq_phyint_list; 14662 cur_ipsq->ipsq_phyint_list = phyint; 14663 break; 14664 case SPLIT_SUCCESS: 14665 break; 14666 default: 14667 ASSERT(0); 14668 } 14669 14670 phyint = next_phyint; 14671 } 14672 return (need_retry); 14673 } 14674 14675 /* 14676 * given an ipsq 'ipsq' lock all ills associated with this ipsq. 14677 * and return the ills in the list. This list will be 14678 * needed to unlock all the ills later on by the caller. 
14679 * The <ill-ipsq> associations could change between the 14680 * lock and unlock. Hence the unlock can't traverse the 14681 * ipsq to get the list of ills. 14682 */ 14683 static int 14684 ill_lock_ipsq_ills(ipsq_t *ipsq, ill_t **list, int list_max) 14685 { 14686 int cnt = 0; 14687 phyint_t *phyint; 14688 ip_stack_t *ipst = ipsq->ipsq_ipst; 14689 14690 /* 14691 * The caller holds ill_g_lock to ensure that the ill memberships 14692 * of the ipsq don't change 14693 */ 14694 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 14695 14696 phyint = ipsq->ipsq_phyint_list; 14697 while (phyint != NULL) { 14698 if (phyint->phyint_illv4 != NULL) { 14699 ASSERT(cnt < list_max); 14700 list[cnt++] = phyint->phyint_illv4; 14701 } 14702 if (phyint->phyint_illv6 != NULL) { 14703 ASSERT(cnt < list_max); 14704 list[cnt++] = phyint->phyint_illv6; 14705 } 14706 phyint = phyint->phyint_ipsq_next; 14707 } 14708 ill_lock_ills(list, cnt); 14709 return (cnt); 14710 } 14711 14712 void 14713 ill_lock_ills(ill_t **list, int cnt) 14714 { 14715 int i; 14716 14717 if (cnt > 1) { 14718 boolean_t try_again; 14719 do { 14720 try_again = B_FALSE; 14721 for (i = 0; i < cnt - 1; i++) { 14722 if (list[i] < list[i + 1]) { 14723 ill_t *tmp; 14724 14725 /* swap the elements */ 14726 tmp = list[i]; 14727 list[i] = list[i + 1]; 14728 list[i + 1] = tmp; 14729 try_again = B_TRUE; 14730 } 14731 } 14732 } while (try_again); 14733 } 14734 14735 for (i = 0; i < cnt; i++) { 14736 if (i == 0) { 14737 if (list[i] != NULL) 14738 mutex_enter(&list[i]->ill_lock); 14739 else 14740 return; 14741 } else if ((list[i-1] != list[i]) && (list[i] != NULL)) { 14742 mutex_enter(&list[i]->ill_lock); 14743 } 14744 } 14745 } 14746 14747 void 14748 ill_unlock_ills(ill_t **list, int cnt) 14749 { 14750 int i; 14751 14752 for (i = 0; i < cnt; i++) { 14753 if ((i == 0) && (list[i] != NULL)) { 14754 mutex_exit(&list[i]->ill_lock); 14755 } else if ((list[i-1] != list[i]) && (list[i] != NULL)) { 14756 mutex_exit(&list[i]->ill_lock); 14757 } 
14758 } 14759 } 14760 14761 /* 14762 * Merge all the ills from 1 ipsq group into another ipsq group. 14763 * The source ipsq group is specified by the ipsq associated with 14764 * 'from_ill'. The destination ipsq group is specified by the ipsq 14765 * associated with 'to_ill' or 'groupname' respectively. 14766 * Note that ipsq itself does not have a reference count mechanism 14767 * and functions don't look up an ipsq and pass it around. Instead 14768 * functions pass around an ill or groupname, and the ipsq is looked 14769 * up from the ill or groupname and the required operation performed 14770 * atomically with the lookup on the ipsq. 14771 */ 14772 static int 14773 ill_merge_groups(ill_t *from_ill, ill_t *to_ill, char *groupname, mblk_t *mp, 14774 queue_t *q) 14775 { 14776 ipsq_t *old_ipsq; 14777 ipsq_t *new_ipsq; 14778 ill_t **ill_list; 14779 int cnt; 14780 size_t ill_list_size; 14781 boolean_t became_writer_on_new_sq = B_FALSE; 14782 ip_stack_t *ipst = from_ill->ill_ipst; 14783 14784 ASSERT(to_ill == NULL || ipst == to_ill->ill_ipst); 14785 /* Exactly 1 of 'to_ill' and groupname can be specified. */ 14786 ASSERT((to_ill != NULL) ^ (groupname != NULL)); 14787 14788 /* 14789 * Need to hold ill_g_lock as writer and also the ill_lock to 14790 * change the <ill-ipsq> assoc of an ill. Need to hold the 14791 * ipsq_lock to prevent new messages from landing on an ipsq. 14792 */ 14793 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 14794 14795 old_ipsq = from_ill->ill_phyint->phyint_ipsq; 14796 if (groupname != NULL) 14797 new_ipsq = ip_ipsq_lookup(groupname, B_TRUE, NULL, ipst); 14798 else { 14799 new_ipsq = to_ill->ill_phyint->phyint_ipsq; 14800 } 14801 14802 ASSERT(old_ipsq != NULL && new_ipsq != NULL); 14803 14804 /* 14805 * both groups are on the same ipsq. 
14806 */ 14807 if (old_ipsq == new_ipsq) { 14808 rw_exit(&ipst->ips_ill_g_lock); 14809 return (0); 14810 } 14811 14812 cnt = old_ipsq->ipsq_refs << 1; 14813 ill_list_size = cnt * sizeof (ill_t *); 14814 ill_list = kmem_zalloc(ill_list_size, KM_NOSLEEP); 14815 if (ill_list == NULL) { 14816 rw_exit(&ipst->ips_ill_g_lock); 14817 return (ENOMEM); 14818 } 14819 cnt = ill_lock_ipsq_ills(old_ipsq, ill_list, cnt); 14820 14821 /* Need ipsq lock to enque messages on new ipsq or to become writer */ 14822 mutex_enter(&new_ipsq->ipsq_lock); 14823 if ((new_ipsq->ipsq_writer == NULL && 14824 new_ipsq->ipsq_current_ipif == NULL) || 14825 (new_ipsq->ipsq_writer == curthread)) { 14826 new_ipsq->ipsq_writer = curthread; 14827 new_ipsq->ipsq_reentry_cnt++; 14828 became_writer_on_new_sq = B_TRUE; 14829 } 14830 14831 /* 14832 * We are holding ill_g_lock as writer and all the ill locks of 14833 * the old ipsq. So the old_ipsq can't be looked up, and hence no new 14834 * message can land up on the old ipsq even though we don't hold the 14835 * ipsq_lock of the old_ipsq. Now move all messages to the newipsq. 14836 */ 14837 ill_move_to_new_ipsq(old_ipsq, new_ipsq, mp, q); 14838 14839 /* 14840 * now change the ipsq of all ills in the 'old_ipsq' to 'new_ipsq'. 14841 * 'new_ipsq' has been looked up, and it can't change its <ill-ipsq> 14842 * assocs. till we release the ill_g_lock, and hence it can't vanish. 14843 */ 14844 ill_merge_ipsq(old_ipsq, new_ipsq, ipst); 14845 14846 /* 14847 * Mark the new ipsq as needing a split since it is currently 14848 * being shared by more than 1 IPMP group. 
The split will 14849 * occur at the end of ipsq_exit 14850 */ 14851 new_ipsq->ipsq_split = B_TRUE; 14852 14853 /* Now release all the locks */ 14854 mutex_exit(&new_ipsq->ipsq_lock); 14855 ill_unlock_ills(ill_list, cnt); 14856 rw_exit(&ipst->ips_ill_g_lock); 14857 14858 kmem_free(ill_list, ill_list_size); 14859 14860 /* 14861 * If we succeeded in becoming writer on the new ipsq, then 14862 * drain the new ipsq and start processing all enqueued messages 14863 * including the current ioctl we are processing which is either 14864 * a set groupname or failover/failback. 14865 */ 14866 if (became_writer_on_new_sq) 14867 ipsq_exit(new_ipsq, B_TRUE, B_TRUE); 14868 14869 /* 14870 * syncq has been changed and all the messages have been moved. 14871 */ 14872 mutex_enter(&old_ipsq->ipsq_lock); 14873 old_ipsq->ipsq_current_ipif = NULL; 14874 old_ipsq->ipsq_current_ioctl = 0; 14875 mutex_exit(&old_ipsq->ipsq_lock); 14876 return (EINPROGRESS); 14877 } 14878 14879 /* 14880 * Delete and add the loopback copy and non-loopback copy of 14881 * the BROADCAST ire corresponding to ill and addr. Used to 14882 * group broadcast ires together when ill becomes part of 14883 * a group. 14884 * 14885 * This function is also called when ill is leaving the group 14886 * so that the ires belonging to the group gets re-grouped. 14887 */ 14888 static void 14889 ill_bcast_delete_and_add(ill_t *ill, ipaddr_t addr) 14890 { 14891 ire_t *ire, *nire, *nire_next, *ire_head = NULL; 14892 ire_t **ire_ptpn = &ire_head; 14893 ip_stack_t *ipst = ill->ill_ipst; 14894 14895 /* 14896 * The loopback and non-loopback IREs are inserted in the order in which 14897 * they're found, on the basis that they are correctly ordered (loopback 14898 * first). 
14899 */ 14900 for (;;) { 14901 ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ill->ill_ipif, 14902 ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst); 14903 if (ire == NULL) 14904 break; 14905 14906 /* 14907 * we are passing in KM_SLEEP because it is not easy to 14908 * go back to a sane state in case of memory failure. 14909 */ 14910 nire = kmem_cache_alloc(ire_cache, KM_SLEEP); 14911 ASSERT(nire != NULL); 14912 bzero(nire, sizeof (ire_t)); 14913 /* 14914 * Don't use ire_max_frag directly since we don't 14915 * hold on to 'ire' until we add the new ire 'nire' and 14916 * we don't want the new ire to have a dangling reference 14917 * to 'ire'. The ire_max_frag of a broadcast ire must 14918 * be in sync with the ipif_mtu of the associate ipif. 14919 * For eg. this happens as a result of SIOCSLIFNAME, 14920 * SIOCSLIFLNKINFO or a DL_NOTE_SDU_SIZE inititated by 14921 * the driver. A change in ire_max_frag triggered as 14922 * as a result of path mtu discovery, or due to an 14923 * IP_IOC_IRE_ADVISE_NOREPLY from the transport or due a 14924 * route change -mtu command does not apply to broadcast ires. 14925 * 14926 * XXX We need a recovery strategy here if ire_init fails 14927 */ 14928 if (ire_init(nire, 14929 (uchar_t *)&ire->ire_addr, 14930 (uchar_t *)&ire->ire_mask, 14931 (uchar_t *)&ire->ire_src_addr, 14932 (uchar_t *)&ire->ire_gateway_addr, 14933 ire->ire_stq == NULL ? &ip_loopback_mtu : 14934 &ire->ire_ipif->ipif_mtu, 14935 ire->ire_nce, 14936 ire->ire_rfq, 14937 ire->ire_stq, 14938 ire->ire_type, 14939 ire->ire_ipif, 14940 ire->ire_cmask, 14941 ire->ire_phandle, 14942 ire->ire_ihandle, 14943 ire->ire_flags, 14944 &ire->ire_uinfo, 14945 NULL, 14946 NULL, 14947 ipst) == NULL) { 14948 cmn_err(CE_PANIC, "ire_init() failed"); 14949 } 14950 ire_delete(ire); 14951 ire_refrele(ire); 14952 14953 /* 14954 * The newly created IREs are inserted at the tail of the list 14955 * starting with ire_head. 
As we've just allocated them no one 14956 * knows about them so it's safe. 14957 */ 14958 *ire_ptpn = nire; 14959 ire_ptpn = &nire->ire_next; 14960 } 14961 14962 for (nire = ire_head; nire != NULL; nire = nire_next) { 14963 int error; 14964 ire_t *oire; 14965 /* unlink the IRE from our list before calling ire_add() */ 14966 nire_next = nire->ire_next; 14967 nire->ire_next = NULL; 14968 14969 /* ire_add adds the ire at the right place in the list */ 14970 oire = nire; 14971 error = ire_add(&nire, NULL, NULL, NULL, B_FALSE); 14972 ASSERT(error == 0); 14973 ASSERT(oire == nire); 14974 ire_refrele(nire); /* Held in ire_add */ 14975 } 14976 } 14977 14978 /* 14979 * This function is usually called when an ill is inserted in 14980 * a group and all the ipifs are already UP. As all the ipifs 14981 * are already UP, the broadcast ires have already been created 14982 * and been inserted. But, ire_add_v4 would not have grouped properly. 14983 * We need to re-group for the benefit of ip_wput_ire which 14984 * expects BROADCAST ires to be grouped properly to avoid sending 14985 * more than one copy of the broadcast packet per group. 14986 * 14987 * NOTE : We don't check for ill_ipif_up_count to be non-zero here 14988 * because when ipif_up_done ends up calling this, ires have 14989 * already been added before illgrp_insert i.e before ill_group 14990 * has been initialized. 14991 */ 14992 static void 14993 ill_group_bcast_for_xmit(ill_t *ill) 14994 { 14995 ill_group_t *illgrp; 14996 ipif_t *ipif; 14997 ipaddr_t addr; 14998 ipaddr_t net_mask; 14999 ipaddr_t subnet_netmask; 15000 15001 illgrp = ill->ill_group; 15002 15003 /* 15004 * This function is called even when an ill is deleted from 15005 * the group. Hence, illgrp could be null. 15006 */ 15007 if (illgrp != NULL && illgrp->illgrp_ill_count == 1) 15008 return; 15009 15010 /* 15011 * Delete all the BROADCAST ires matching this ill and add 15012 * them back. 
This time, ire_add_v4 should take care of 15013 * grouping them with others because ill is part of the 15014 * group. 15015 */ 15016 ill_bcast_delete_and_add(ill, 0); 15017 ill_bcast_delete_and_add(ill, INADDR_BROADCAST); 15018 15019 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 15020 15021 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 15022 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 15023 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 15024 } else { 15025 net_mask = htonl(IN_CLASSA_NET); 15026 } 15027 addr = net_mask & ipif->ipif_subnet; 15028 ill_bcast_delete_and_add(ill, addr); 15029 ill_bcast_delete_and_add(ill, ~net_mask | addr); 15030 15031 subnet_netmask = ipif->ipif_net_mask; 15032 addr = ipif->ipif_subnet; 15033 ill_bcast_delete_and_add(ill, addr); 15034 ill_bcast_delete_and_add(ill, ~subnet_netmask | addr); 15035 } 15036 } 15037 15038 /* 15039 * This function is called from illgrp_delete when ill is being deleted 15040 * from the group. 15041 * 15042 * As ill is not there in the group anymore, any address belonging 15043 * to this ill should be cleared of IRE_MARK_NORECV. 15044 */ 15045 static void 15046 ill_clear_bcast_mark(ill_t *ill, ipaddr_t addr) 15047 { 15048 ire_t *ire; 15049 irb_t *irb; 15050 ip_stack_t *ipst = ill->ill_ipst; 15051 15052 ASSERT(ill->ill_group == NULL); 15053 15054 ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ill->ill_ipif, 15055 ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst); 15056 15057 if (ire != NULL) { 15058 /* 15059 * IPMP and plumbing operations are serialized on the ipsq, so 15060 * no one will insert or delete a broadcast ire under our feet. 
15061 */ 15062 irb = ire->ire_bucket; 15063 rw_enter(&irb->irb_lock, RW_READER); 15064 ire_refrele(ire); 15065 15066 for (; ire != NULL; ire = ire->ire_next) { 15067 if (ire->ire_addr != addr) 15068 break; 15069 if (ire_to_ill(ire) != ill) 15070 continue; 15071 15072 ASSERT(!(ire->ire_marks & IRE_MARK_CONDEMNED)); 15073 ire->ire_marks &= ~IRE_MARK_NORECV; 15074 } 15075 rw_exit(&irb->irb_lock); 15076 } 15077 } 15078 15079 /* 15080 * This function must be called only after the broadcast ires 15081 * have been grouped together. For a given address addr, nominate 15082 * only one of the ires whose interface is not FAILED or OFFLINE. 15083 * 15084 * This is also called when an ipif goes down, so that we can nominate 15085 * a different ire with the same address for receiving. 15086 */ 15087 static void 15088 ill_mark_bcast(ill_group_t *illgrp, ipaddr_t addr, ip_stack_t *ipst) 15089 { 15090 irb_t *irb; 15091 ire_t *ire; 15092 ire_t *ire1; 15093 ire_t *save_ire; 15094 ire_t **irep = NULL; 15095 boolean_t first = B_TRUE; 15096 ire_t *clear_ire = NULL; 15097 ire_t *start_ire = NULL; 15098 ire_t *new_lb_ire; 15099 ire_t *new_nlb_ire; 15100 boolean_t new_lb_ire_used = B_FALSE; 15101 boolean_t new_nlb_ire_used = B_FALSE; 15102 uint64_t match_flags; 15103 uint64_t phyi_flags; 15104 boolean_t fallback = B_FALSE; 15105 uint_t max_frag; 15106 15107 ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, NULL, ALL_ZONES, 15108 NULL, MATCH_IRE_TYPE, ipst); 15109 /* 15110 * We may not be able to find some ires if a previous 15111 * ire_create failed. This happens when an ipif goes 15112 * down and we are unable to create BROADCAST ires due 15113 * to memory failure. Thus, we have to check for NULL 15114 * below. This should handle the case for LOOPBACK, 15115 * POINTOPOINT and interfaces with some POINTOPOINT 15116 * logicals for which there are no BROADCAST ires. 
15117 */ 15118 if (ire == NULL) 15119 return; 15120 /* 15121 * Currently IRE_BROADCASTS are deleted when an ipif 15122 * goes down which runs exclusively. Thus, setting 15123 * IRE_MARK_RCVD should not race with ire_delete marking 15124 * IRE_MARK_CONDEMNED. We grab the lock below just to 15125 * be consistent with other parts of the code that walks 15126 * a given bucket. 15127 */ 15128 save_ire = ire; 15129 irb = ire->ire_bucket; 15130 new_lb_ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); 15131 if (new_lb_ire == NULL) { 15132 ire_refrele(ire); 15133 return; 15134 } 15135 new_nlb_ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); 15136 if (new_nlb_ire == NULL) { 15137 ire_refrele(ire); 15138 kmem_cache_free(ire_cache, new_lb_ire); 15139 return; 15140 } 15141 IRB_REFHOLD(irb); 15142 rw_enter(&irb->irb_lock, RW_WRITER); 15143 /* 15144 * Get to the first ire matching the address and the 15145 * group. If the address does not match we are done 15146 * as we could not find the IRE. If the address matches 15147 * we should get to the first one matching the group. 15148 */ 15149 while (ire != NULL) { 15150 if (ire->ire_addr != addr || 15151 ire->ire_ipif->ipif_ill->ill_group == illgrp) { 15152 break; 15153 } 15154 ire = ire->ire_next; 15155 } 15156 match_flags = PHYI_FAILED | PHYI_INACTIVE; 15157 start_ire = ire; 15158 redo: 15159 while (ire != NULL && ire->ire_addr == addr && 15160 ire->ire_ipif->ipif_ill->ill_group == illgrp) { 15161 /* 15162 * The first ire for any address within a group 15163 * should always be the one with IRE_MARK_NORECV cleared 15164 * so that ip_wput_ire can avoid searching for one. 15165 * Note down the insertion point which will be used 15166 * later. 15167 */ 15168 if (first && (irep == NULL)) 15169 irep = ire->ire_ptpn; 15170 /* 15171 * PHYI_FAILED is set when the interface fails. 15172 * This interface might have become good, but the 15173 * daemon has not yet detected. We should still 15174 * not receive on this. 
PHYI_OFFLINE should never 15175 * be picked as this has been offlined and soon 15176 * be removed. 15177 */ 15178 phyi_flags = ire->ire_ipif->ipif_ill->ill_phyint->phyint_flags; 15179 if (phyi_flags & PHYI_OFFLINE) { 15180 ire->ire_marks |= IRE_MARK_NORECV; 15181 ire = ire->ire_next; 15182 continue; 15183 } 15184 if (phyi_flags & match_flags) { 15185 ire->ire_marks |= IRE_MARK_NORECV; 15186 ire = ire->ire_next; 15187 if ((phyi_flags & (PHYI_FAILED | PHYI_INACTIVE)) == 15188 PHYI_INACTIVE) { 15189 fallback = B_TRUE; 15190 } 15191 continue; 15192 } 15193 if (first) { 15194 /* 15195 * We will move this to the front of the list later 15196 * on. 15197 */ 15198 clear_ire = ire; 15199 ire->ire_marks &= ~IRE_MARK_NORECV; 15200 } else { 15201 ire->ire_marks |= IRE_MARK_NORECV; 15202 } 15203 first = B_FALSE; 15204 ire = ire->ire_next; 15205 } 15206 /* 15207 * If we never nominated anybody, try nominating at least 15208 * an INACTIVE, if we found one. Do it only once though. 15209 */ 15210 if (first && (match_flags == (PHYI_FAILED | PHYI_INACTIVE)) && 15211 fallback) { 15212 match_flags = PHYI_FAILED; 15213 ire = start_ire; 15214 irep = NULL; 15215 goto redo; 15216 } 15217 ire_refrele(save_ire); 15218 15219 /* 15220 * irep non-NULL indicates that we entered the while loop 15221 * above. If clear_ire is at the insertion point, we don't 15222 * have to do anything. clear_ire will be NULL if all the 15223 * interfaces are failed. 15224 * 15225 * We cannot unlink and reinsert the ire at the right place 15226 * in the list since there can be other walkers of this bucket. 15227 * Instead we delete and recreate the ire 15228 */ 15229 if (clear_ire != NULL && irep != NULL && *irep != clear_ire) { 15230 ire_t *clear_ire_stq = NULL; 15231 15232 bzero(new_lb_ire, sizeof (ire_t)); 15233 /* XXX We need a recovery strategy here. 
*/ 15234 if (ire_init(new_lb_ire, 15235 (uchar_t *)&clear_ire->ire_addr, 15236 (uchar_t *)&clear_ire->ire_mask, 15237 (uchar_t *)&clear_ire->ire_src_addr, 15238 (uchar_t *)&clear_ire->ire_gateway_addr, 15239 &clear_ire->ire_max_frag, 15240 NULL, /* let ire_nce_init derive the resolver info */ 15241 clear_ire->ire_rfq, 15242 clear_ire->ire_stq, 15243 clear_ire->ire_type, 15244 clear_ire->ire_ipif, 15245 clear_ire->ire_cmask, 15246 clear_ire->ire_phandle, 15247 clear_ire->ire_ihandle, 15248 clear_ire->ire_flags, 15249 &clear_ire->ire_uinfo, 15250 NULL, 15251 NULL, 15252 ipst) == NULL) 15253 cmn_err(CE_PANIC, "ire_init() failed"); 15254 if (clear_ire->ire_stq == NULL) { 15255 ire_t *ire_next = clear_ire->ire_next; 15256 if (ire_next != NULL && 15257 ire_next->ire_stq != NULL && 15258 ire_next->ire_addr == clear_ire->ire_addr && 15259 ire_next->ire_ipif->ipif_ill == 15260 clear_ire->ire_ipif->ipif_ill) { 15261 clear_ire_stq = ire_next; 15262 15263 bzero(new_nlb_ire, sizeof (ire_t)); 15264 /* XXX We need a recovery strategy here. */ 15265 if (ire_init(new_nlb_ire, 15266 (uchar_t *)&clear_ire_stq->ire_addr, 15267 (uchar_t *)&clear_ire_stq->ire_mask, 15268 (uchar_t *)&clear_ire_stq->ire_src_addr, 15269 (uchar_t *)&clear_ire_stq->ire_gateway_addr, 15270 &clear_ire_stq->ire_max_frag, 15271 NULL, 15272 clear_ire_stq->ire_rfq, 15273 clear_ire_stq->ire_stq, 15274 clear_ire_stq->ire_type, 15275 clear_ire_stq->ire_ipif, 15276 clear_ire_stq->ire_cmask, 15277 clear_ire_stq->ire_phandle, 15278 clear_ire_stq->ire_ihandle, 15279 clear_ire_stq->ire_flags, 15280 &clear_ire_stq->ire_uinfo, 15281 NULL, 15282 NULL, 15283 ipst) == NULL) 15284 cmn_err(CE_PANIC, "ire_init() failed"); 15285 } 15286 } 15287 15288 /* 15289 * Delete the ire. We can't call ire_delete() since 15290 * we are holding the bucket lock. We can't release the 15291 * bucket lock since we can't allow irep to change. So just 15292 * mark it CONDEMNED. 
The IRB_REFRELE will delete the 15293 * ire from the list and do the refrele. 15294 */ 15295 clear_ire->ire_marks |= IRE_MARK_CONDEMNED; 15296 irb->irb_marks |= IRB_MARK_CONDEMNED; 15297 15298 if (clear_ire_stq != NULL && clear_ire_stq->ire_nce != NULL) { 15299 nce_fastpath_list_delete(clear_ire_stq->ire_nce); 15300 clear_ire_stq->ire_marks |= IRE_MARK_CONDEMNED; 15301 } 15302 15303 /* 15304 * Also take care of otherfields like ib/ob pkt count 15305 * etc. Need to dup them. ditto in ill_bcast_delete_and_add 15306 */ 15307 15308 /* Set the max_frag before adding the ire */ 15309 max_frag = *new_lb_ire->ire_max_fragp; 15310 new_lb_ire->ire_max_fragp = NULL; 15311 new_lb_ire->ire_max_frag = max_frag; 15312 15313 /* Add the new ire's. Insert at *irep */ 15314 new_lb_ire->ire_bucket = clear_ire->ire_bucket; 15315 ire1 = *irep; 15316 if (ire1 != NULL) 15317 ire1->ire_ptpn = &new_lb_ire->ire_next; 15318 new_lb_ire->ire_next = ire1; 15319 /* Link the new one in. */ 15320 new_lb_ire->ire_ptpn = irep; 15321 membar_producer(); 15322 *irep = new_lb_ire; 15323 new_lb_ire_used = B_TRUE; 15324 BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_inserted); 15325 new_lb_ire->ire_bucket->irb_ire_cnt++; 15326 new_lb_ire->ire_ipif->ipif_ire_cnt++; 15327 15328 if (clear_ire_stq != NULL) { 15329 /* Set the max_frag before adding the ire */ 15330 max_frag = *new_nlb_ire->ire_max_fragp; 15331 new_nlb_ire->ire_max_fragp = NULL; 15332 new_nlb_ire->ire_max_frag = max_frag; 15333 15334 new_nlb_ire->ire_bucket = clear_ire->ire_bucket; 15335 irep = &new_lb_ire->ire_next; 15336 /* Add the new ire. Insert at *irep */ 15337 ire1 = *irep; 15338 if (ire1 != NULL) 15339 ire1->ire_ptpn = &new_nlb_ire->ire_next; 15340 new_nlb_ire->ire_next = ire1; 15341 /* Link the new one in. 
*/ 15342 new_nlb_ire->ire_ptpn = irep; 15343 membar_producer(); 15344 *irep = new_nlb_ire; 15345 new_nlb_ire_used = B_TRUE; 15346 BUMP_IRE_STATS(ipst->ips_ire_stats_v4, 15347 ire_stats_inserted); 15348 new_nlb_ire->ire_bucket->irb_ire_cnt++; 15349 new_nlb_ire->ire_ipif->ipif_ire_cnt++; 15350 ((ill_t *)new_nlb_ire->ire_stq->q_ptr)->ill_ire_cnt++; 15351 } 15352 } 15353 rw_exit(&irb->irb_lock); 15354 if (!new_lb_ire_used) 15355 kmem_cache_free(ire_cache, new_lb_ire); 15356 if (!new_nlb_ire_used) 15357 kmem_cache_free(ire_cache, new_nlb_ire); 15358 IRB_REFRELE(irb); 15359 } 15360 15361 /* 15362 * Whenever an ipif goes down we have to renominate a different 15363 * broadcast ire to receive. Whenever an ipif comes up, we need 15364 * to make sure that we have only one nominated to receive. 15365 */ 15366 static void 15367 ipif_renominate_bcast(ipif_t *ipif) 15368 { 15369 ill_t *ill = ipif->ipif_ill; 15370 ipaddr_t subnet_addr; 15371 ipaddr_t net_addr; 15372 ipaddr_t net_mask = 0; 15373 ipaddr_t subnet_netmask; 15374 ipaddr_t addr; 15375 ill_group_t *illgrp; 15376 ip_stack_t *ipst = ill->ill_ipst; 15377 15378 illgrp = ill->ill_group; 15379 /* 15380 * If this is the last ipif going down, it might take 15381 * the ill out of the group. In that case ipif_down -> 15382 * illgrp_delete takes care of doing the nomination. 15383 * ipif_down does not call for this case. 
15384 */ 15385 ASSERT(illgrp != NULL); 15386 15387 /* There could not have been any ires associated with this */ 15388 if (ipif->ipif_subnet == 0) 15389 return; 15390 15391 ill_mark_bcast(illgrp, 0, ipst); 15392 ill_mark_bcast(illgrp, INADDR_BROADCAST, ipst); 15393 15394 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 15395 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 15396 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 15397 } else { 15398 net_mask = htonl(IN_CLASSA_NET); 15399 } 15400 addr = net_mask & ipif->ipif_subnet; 15401 ill_mark_bcast(illgrp, addr, ipst); 15402 15403 net_addr = ~net_mask | addr; 15404 ill_mark_bcast(illgrp, net_addr, ipst); 15405 15406 subnet_netmask = ipif->ipif_net_mask; 15407 addr = ipif->ipif_subnet; 15408 ill_mark_bcast(illgrp, addr, ipst); 15409 15410 subnet_addr = ~subnet_netmask | addr; 15411 ill_mark_bcast(illgrp, subnet_addr, ipst); 15412 } 15413 15414 /* 15415 * Whenever we form or delete ill groups, we need to nominate one set of 15416 * BROADCAST ires for receiving in the group. 15417 * 15418 * 1) When ipif_up_done -> ilgrp_insert calls this function, BROADCAST ires 15419 * have been added, but ill_ipif_up_count is 0. Thus, we don't assert 15420 * for ill_ipif_up_count to be non-zero. This is the only case where 15421 * ill_ipif_up_count is zero and we would still find the ires. 15422 * 15423 * 2) ip_sioctl_group_name/ifgrp_insert calls this function, at least one 15424 * ipif is UP and we just have to do the nomination. 15425 * 15426 * 3) When ill_handoff_responsibility calls us, some ill has been removed 15427 * from the group. So, we have to do the nomination. 15428 * 15429 * Because of (3), there could be just one ill in the group. But we have 15430 * to nominate still as IRE_MARK_NORCV may have been marked on this. 15431 * Thus, this function does not optimize when there is only one ill as 15432 * it is not correct for (3). 
15433 */ 15434 static void 15435 ill_nominate_bcast_rcv(ill_group_t *illgrp) 15436 { 15437 ill_t *ill; 15438 ipif_t *ipif; 15439 ipaddr_t subnet_addr; 15440 ipaddr_t prev_subnet_addr = 0; 15441 ipaddr_t net_addr; 15442 ipaddr_t prev_net_addr = 0; 15443 ipaddr_t net_mask = 0; 15444 ipaddr_t subnet_netmask; 15445 ipaddr_t addr; 15446 ip_stack_t *ipst; 15447 15448 /* 15449 * When the last memeber is leaving, there is nothing to 15450 * nominate. 15451 */ 15452 if (illgrp->illgrp_ill_count == 0) { 15453 ASSERT(illgrp->illgrp_ill == NULL); 15454 return; 15455 } 15456 15457 ill = illgrp->illgrp_ill; 15458 ASSERT(!ill->ill_isv6); 15459 ipst = ill->ill_ipst; 15460 /* 15461 * We assume that ires with same address and belonging to the 15462 * same group, has been grouped together. Nominating a *single* 15463 * ill in the group for sending and receiving broadcast is done 15464 * by making sure that the first BROADCAST ire (which will be 15465 * the one returned by ire_ctable_lookup for ip_rput and the 15466 * one that will be used in ip_wput_ire) will be the one that 15467 * will not have IRE_MARK_NORECV set. 15468 * 15469 * 1) ip_rput checks and discards packets received on ires marked 15470 * with IRE_MARK_NORECV. Thus, we don't send up duplicate 15471 * broadcast packets. We need to clear IRE_MARK_NORECV on the 15472 * first ire in the group for every broadcast address in the group. 15473 * ip_rput will accept packets only on the first ire i.e only 15474 * one copy of the ill. 15475 * 15476 * 2) ip_wput_ire needs to send out just one copy of the broadcast 15477 * packet for the whole group. It needs to send out on the ill 15478 * whose ire has not been marked with IRE_MARK_NORECV. If it sends 15479 * on the one marked with IRE_MARK_NORECV, ip_rput will accept 15480 * the copy echoed back on other port where the ire is not marked 15481 * with IRE_MARK_NORECV. 
15482 * 15483 * Note that we just need to have the first IRE either loopback or 15484 * non-loopback (either of them may not exist if ire_create failed 15485 * during ipif_down) with IRE_MARK_NORECV not set. ip_rput will 15486 * always hit the first one and hence will always accept one copy. 15487 * 15488 * We have a broadcast ire per ill for all the unique prefixes 15489 * hosted on that ill. As we don't have a way of knowing the 15490 * unique prefixes on a given ill and hence in the whole group, 15491 * we just call ill_mark_bcast on all the prefixes that exist 15492 * in the group. For the common case of one prefix, the code 15493 * below optimizes by remebering the last address used for 15494 * markng. In the case of multiple prefixes, this will still 15495 * optimize depending the order of prefixes. 15496 * 15497 * The only unique address across the whole group is 0.0.0.0 and 15498 * 255.255.255.255 and thus we call only once. ill_mark_bcast enables 15499 * the first ire in the bucket for receiving and disables the 15500 * others. 
15501 */ 15502 ill_mark_bcast(illgrp, 0, ipst); 15503 ill_mark_bcast(illgrp, INADDR_BROADCAST, ipst); 15504 for (; ill != NULL; ill = ill->ill_group_next) { 15505 15506 for (ipif = ill->ill_ipif; ipif != NULL; 15507 ipif = ipif->ipif_next) { 15508 15509 if (!(ipif->ipif_flags & IPIF_UP) || 15510 ipif->ipif_subnet == 0) { 15511 continue; 15512 } 15513 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 15514 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 15515 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 15516 } else { 15517 net_mask = htonl(IN_CLASSA_NET); 15518 } 15519 addr = net_mask & ipif->ipif_subnet; 15520 if (prev_net_addr == 0 || prev_net_addr != addr) { 15521 ill_mark_bcast(illgrp, addr, ipst); 15522 net_addr = ~net_mask | addr; 15523 ill_mark_bcast(illgrp, net_addr, ipst); 15524 } 15525 prev_net_addr = addr; 15526 15527 subnet_netmask = ipif->ipif_net_mask; 15528 addr = ipif->ipif_subnet; 15529 if (prev_subnet_addr == 0 || 15530 prev_subnet_addr != addr) { 15531 ill_mark_bcast(illgrp, addr, ipst); 15532 subnet_addr = ~subnet_netmask | addr; 15533 ill_mark_bcast(illgrp, subnet_addr, ipst); 15534 } 15535 prev_subnet_addr = addr; 15536 } 15537 } 15538 } 15539 15540 /* 15541 * This function is called while forming ill groups. 15542 * 15543 * Currently, we handle only allmulti groups. We want to join 15544 * allmulti on only one of the ills in the groups. In future, 15545 * when we have link aggregation, we may have to join normal 15546 * multicast groups on multiple ills as switch does inbound load 15547 * balancing. Following are the functions that calls this 15548 * function : 15549 * 15550 * 1) ill_recover_multicast : Interface is coming back UP. 15551 * When the first ipif comes back UP, ipif_up_done/ipif_up_done_v6 15552 * will call ill_recover_multicast to recover all the multicast 15553 * groups. We need to make sure that only one member is joined 15554 * in the ill group. 15555 * 15556 * 2) ip_addmulti/ip_addmulti_v6 : ill groups has already been formed. 
15557 * Somebody is joining allmulti. We need to make sure that only one 15558 * member is joined in the group. 15559 * 15560 * 3) illgrp_insert : If allmulti has already joined, we need to make 15561 * sure that only one member is joined in the group. 15562 * 15563 * 4) ip_delmulti/ip_delmulti_v6 : Somebody in the group is leaving 15564 * allmulti who we have nominated. We need to pick someother ill. 15565 * 15566 * 5) illgrp_delete : The ill we nominated is leaving the group, 15567 * we need to pick a new ill to join the group. 15568 * 15569 * For (1), (2), (5) - we just have to check whether there is 15570 * a good ill joined in the group. If we could not find any ills 15571 * joined the group, we should join. 15572 * 15573 * For (4), the one that was nominated to receive, left the group. 15574 * There could be nobody joined in the group when this function is 15575 * called. 15576 * 15577 * For (3) - we need to explicitly check whether there are multiple 15578 * ills joined in the group. 15579 * 15580 * For simplicity, we don't differentiate any of the above cases. We 15581 * just leave the group if it is joined on any of them and join on 15582 * the first good ill. 15583 */ 15584 int 15585 ill_nominate_mcast_rcv(ill_group_t *illgrp) 15586 { 15587 ilm_t *ilm; 15588 ill_t *ill; 15589 ill_t *fallback_inactive_ill = NULL; 15590 ill_t *fallback_failed_ill = NULL; 15591 int ret = 0; 15592 15593 /* 15594 * Leave the allmulti on all the ills and start fresh. 15595 */ 15596 for (ill = illgrp->illgrp_ill; ill != NULL; 15597 ill = ill->ill_group_next) { 15598 if (ill->ill_join_allmulti) 15599 (void) ip_leave_allmulti(ill->ill_ipif); 15600 } 15601 15602 /* 15603 * Choose a good ill. Fallback to inactive or failed if 15604 * none available. We need to fallback to FAILED in the 15605 * case where we have 2 interfaces in a group - where 15606 * one of them is failed and another is a good one and 15607 * the good one (not marked inactive) is leaving the group. 
15608 */ 15609 ret = 0; 15610 for (ill = illgrp->illgrp_ill; ill != NULL; 15611 ill = ill->ill_group_next) { 15612 /* Never pick an offline interface */ 15613 if (ill->ill_phyint->phyint_flags & PHYI_OFFLINE) 15614 continue; 15615 15616 if (ill->ill_phyint->phyint_flags & PHYI_FAILED) { 15617 fallback_failed_ill = ill; 15618 continue; 15619 } 15620 if (ill->ill_phyint->phyint_flags & PHYI_INACTIVE) { 15621 fallback_inactive_ill = ill; 15622 continue; 15623 } 15624 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 15625 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 15626 ret = ip_join_allmulti(ill->ill_ipif); 15627 /* 15628 * ip_join_allmulti can fail because of memory 15629 * failures. So, make sure we join at least 15630 * on one ill. 15631 */ 15632 if (ill->ill_join_allmulti) 15633 return (0); 15634 } 15635 } 15636 } 15637 if (ret != 0) { 15638 /* 15639 * If we tried nominating above and failed to do so, 15640 * return error. We might have tried multiple times. 15641 * But, return the latest error. 15642 */ 15643 return (ret); 15644 } 15645 if ((ill = fallback_inactive_ill) != NULL) { 15646 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 15647 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 15648 ret = ip_join_allmulti(ill->ill_ipif); 15649 return (ret); 15650 } 15651 } 15652 } else if ((ill = fallback_failed_ill) != NULL) { 15653 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 15654 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 15655 ret = ip_join_allmulti(ill->ill_ipif); 15656 return (ret); 15657 } 15658 } 15659 } 15660 return (0); 15661 } 15662 15663 /* 15664 * This function is called from illgrp_delete after it is 15665 * deleted from the group to reschedule responsibilities 15666 * to a different ill. 
15667 */ 15668 static void 15669 ill_handoff_responsibility(ill_t *ill, ill_group_t *illgrp) 15670 { 15671 ilm_t *ilm; 15672 ipif_t *ipif; 15673 ipaddr_t subnet_addr; 15674 ipaddr_t net_addr; 15675 ipaddr_t net_mask = 0; 15676 ipaddr_t subnet_netmask; 15677 ipaddr_t addr; 15678 ip_stack_t *ipst = ill->ill_ipst; 15679 15680 ASSERT(ill->ill_group == NULL); 15681 /* 15682 * Broadcast Responsibility: 15683 * 15684 * 1. If this ill has been nominated for receiving broadcast 15685 * packets, we need to find a new one. Before we find a new 15686 * one, we need to re-group the ires that are part of this new 15687 * group (assumed by ill_nominate_bcast_rcv). We do this by 15688 * calling ill_group_bcast_for_xmit(ill) which will do the right 15689 * thing for us. 15690 * 15691 * 2. If this ill was not nominated for receiving broadcast 15692 * packets, we need to clear the IRE_MARK_NORECV flag 15693 * so that we continue to send up broadcast packets. 15694 */ 15695 if (!ill->ill_isv6) { 15696 /* 15697 * Case 1 above : No optimization here. Just redo the 15698 * nomination. 15699 */ 15700 ill_group_bcast_for_xmit(ill); 15701 ill_nominate_bcast_rcv(illgrp); 15702 15703 /* 15704 * Case 2 above : Lookup and clear IRE_MARK_NORECV. 
15705 */ 15706 ill_clear_bcast_mark(ill, 0); 15707 ill_clear_bcast_mark(ill, INADDR_BROADCAST); 15708 15709 for (ipif = ill->ill_ipif; ipif != NULL; 15710 ipif = ipif->ipif_next) { 15711 15712 if (!(ipif->ipif_flags & IPIF_UP) || 15713 ipif->ipif_subnet == 0) { 15714 continue; 15715 } 15716 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 15717 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 15718 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 15719 } else { 15720 net_mask = htonl(IN_CLASSA_NET); 15721 } 15722 addr = net_mask & ipif->ipif_subnet; 15723 ill_clear_bcast_mark(ill, addr); 15724 15725 net_addr = ~net_mask | addr; 15726 ill_clear_bcast_mark(ill, net_addr); 15727 15728 subnet_netmask = ipif->ipif_net_mask; 15729 addr = ipif->ipif_subnet; 15730 ill_clear_bcast_mark(ill, addr); 15731 15732 subnet_addr = ~subnet_netmask | addr; 15733 ill_clear_bcast_mark(ill, subnet_addr); 15734 } 15735 } 15736 15737 /* 15738 * Multicast Responsibility. 15739 * 15740 * If we have joined allmulti on this one, find a new member 15741 * in the group to join allmulti. As this ill is already part 15742 * of allmulti, we don't have to join on this one. 15743 * 15744 * If we have not joined allmulti on this one, there is no 15745 * responsibility to handoff. But we need to take new 15746 * responsibility i.e, join allmulti on this one if we need 15747 * to. 15748 */ 15749 if (ill->ill_join_allmulti) { 15750 (void) ill_nominate_mcast_rcv(illgrp); 15751 } else { 15752 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 15753 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 15754 (void) ip_join_allmulti(ill->ill_ipif); 15755 break; 15756 } 15757 } 15758 } 15759 15760 /* 15761 * We intentionally do the flushing of IRE_CACHES only matching 15762 * on the ill and not on groups. Note that we are already deleted 15763 * from the group. 
15764 * 15765 * This will make sure that all IRE_CACHES whose stq is pointing 15766 * at ill_wq or ire_ipif->ipif_ill pointing at this ill will get 15767 * deleted and IRE_CACHES that are not pointing at this ill will 15768 * be left alone. 15769 */ 15770 if (ill->ill_isv6) { 15771 ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, 15772 IRE_CACHE, illgrp_cache_delete, (char *)ill, ill); 15773 } else { 15774 ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, 15775 IRE_CACHE, illgrp_cache_delete, (char *)ill, ill); 15776 } 15777 15778 /* 15779 * Some conn may have cached one of the IREs deleted above. By removing 15780 * the ire reference, we clean up the extra reference to the ill held in 15781 * ire->ire_stq. 15782 */ 15783 ipcl_walk(conn_cleanup_stale_ire, NULL, ipst); 15784 15785 /* 15786 * Re-do source address selection for all the members in the 15787 * group, if they borrowed source address from one of the ipifs 15788 * in this ill. 15789 */ 15790 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 15791 if (ill->ill_isv6) { 15792 ipif_update_other_ipifs_v6(ipif, illgrp); 15793 } else { 15794 ipif_update_other_ipifs(ipif, illgrp); 15795 } 15796 } 15797 } 15798 15799 /* 15800 * Delete the ill from the group. The caller makes sure that it is 15801 * in a group and it okay to delete from the group. So, we always 15802 * delete here. 15803 */ 15804 static void 15805 illgrp_delete(ill_t *ill) 15806 { 15807 ill_group_t *illgrp; 15808 ill_group_t *tmpg; 15809 ill_t *tmp_ill; 15810 ip_stack_t *ipst = ill->ill_ipst; 15811 15812 /* 15813 * Reset illgrp_ill_schednext if it was pointing at us. 15814 * We need to do this before we set ill_group to NULL. 15815 */ 15816 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 15817 mutex_enter(&ill->ill_lock); 15818 15819 illgrp_reset_schednext(ill); 15820 15821 illgrp = ill->ill_group; 15822 15823 /* Delete the ill from illgrp. 
*/ 15824 if (illgrp->illgrp_ill == ill) { 15825 illgrp->illgrp_ill = ill->ill_group_next; 15826 } else { 15827 tmp_ill = illgrp->illgrp_ill; 15828 while (tmp_ill->ill_group_next != ill) { 15829 tmp_ill = tmp_ill->ill_group_next; 15830 ASSERT(tmp_ill != NULL); 15831 } 15832 tmp_ill->ill_group_next = ill->ill_group_next; 15833 } 15834 ill->ill_group = NULL; 15835 ill->ill_group_next = NULL; 15836 15837 illgrp->illgrp_ill_count--; 15838 mutex_exit(&ill->ill_lock); 15839 rw_exit(&ipst->ips_ill_g_lock); 15840 15841 /* 15842 * As this ill is leaving the group, we need to hand off 15843 * the responsibilities to the other ills in the group, if 15844 * this ill had some responsibilities. 15845 */ 15846 15847 ill_handoff_responsibility(ill, illgrp); 15848 15849 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 15850 15851 if (illgrp->illgrp_ill_count == 0) { 15852 15853 ASSERT(illgrp->illgrp_ill == NULL); 15854 if (ill->ill_isv6) { 15855 if (illgrp == ipst->ips_illgrp_head_v6) { 15856 ipst->ips_illgrp_head_v6 = illgrp->illgrp_next; 15857 } else { 15858 tmpg = ipst->ips_illgrp_head_v6; 15859 while (tmpg->illgrp_next != illgrp) { 15860 tmpg = tmpg->illgrp_next; 15861 ASSERT(tmpg != NULL); 15862 } 15863 tmpg->illgrp_next = illgrp->illgrp_next; 15864 } 15865 } else { 15866 if (illgrp == ipst->ips_illgrp_head_v4) { 15867 ipst->ips_illgrp_head_v4 = illgrp->illgrp_next; 15868 } else { 15869 tmpg = ipst->ips_illgrp_head_v4; 15870 while (tmpg->illgrp_next != illgrp) { 15871 tmpg = tmpg->illgrp_next; 15872 ASSERT(tmpg != NULL); 15873 } 15874 tmpg->illgrp_next = illgrp->illgrp_next; 15875 } 15876 } 15877 mutex_destroy(&illgrp->illgrp_lock); 15878 mi_free(illgrp); 15879 } 15880 rw_exit(&ipst->ips_ill_g_lock); 15881 15882 /* 15883 * Even though the ill is out of the group its not necessary 15884 * to set ipsq_split as TRUE as the ipifs could be down temporarily 15885 * We will split the ipsq when phyint_groupname is set to NULL. 
15886 */ 15887 15888 /* 15889 * Send a routing sockets message if we are deleting from 15890 * groups with names. 15891 */ 15892 if (ill->ill_phyint->phyint_groupname_len != 0) 15893 ip_rts_ifmsg(ill->ill_ipif); 15894 } 15895 15896 /* 15897 * Re-do source address selection. This is normally called when 15898 * an ill joins the group or when a non-NOLOCAL/DEPRECATED/ANYCAST 15899 * ipif comes up. 15900 */ 15901 void 15902 ill_update_source_selection(ill_t *ill) 15903 { 15904 ipif_t *ipif; 15905 15906 ASSERT(IAM_WRITER_ILL(ill)); 15907 15908 if (ill->ill_group != NULL) 15909 ill = ill->ill_group->illgrp_ill; 15910 15911 for (; ill != NULL; ill = ill->ill_group_next) { 15912 for (ipif = ill->ill_ipif; ipif != NULL; 15913 ipif = ipif->ipif_next) { 15914 if (ill->ill_isv6) 15915 ipif_recreate_interface_routes_v6(NULL, ipif); 15916 else 15917 ipif_recreate_interface_routes(NULL, ipif); 15918 } 15919 } 15920 } 15921 15922 /* 15923 * Insert ill in a group headed by illgrp_head. The caller can either 15924 * pass a groupname in which case we search for a group with the 15925 * same name to insert in or pass a group to insert in. This function 15926 * would only search groups with names. 15927 * 15928 * NOTE : The caller should make sure that there is at least one ipif 15929 * UP on this ill so that illgrp_scheduler can pick this ill 15930 * for outbound packets. If ill_ipif_up_count is zero, we have 15931 * already sent a DL_UNBIND to the driver and we don't want to 15932 * send anymore packets. We don't assert for ipif_up_count 15933 * to be greater than zero, because ipif_up_done wants to call 15934 * this function before bumping up the ipif_up_count. See 15935 * ipif_up_done() for details. 
15936 */ 15937 int 15938 illgrp_insert(ill_group_t **illgrp_head, ill_t *ill, char *groupname, 15939 ill_group_t *grp_to_insert, boolean_t ipif_is_coming_up) 15940 { 15941 ill_group_t *illgrp; 15942 ill_t *prev_ill; 15943 phyint_t *phyi; 15944 ip_stack_t *ipst = ill->ill_ipst; 15945 15946 ASSERT(ill->ill_group == NULL); 15947 15948 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 15949 mutex_enter(&ill->ill_lock); 15950 15951 if (groupname != NULL) { 15952 /* 15953 * Look for a group with a matching groupname to insert. 15954 */ 15955 for (illgrp = *illgrp_head; illgrp != NULL; 15956 illgrp = illgrp->illgrp_next) { 15957 15958 ill_t *tmp_ill; 15959 15960 /* 15961 * If we have an ill_group_t in the list which has 15962 * no ill_t assigned then we must be in the process of 15963 * removing this group. We skip this as illgrp_delete() 15964 * will remove it from the list. 15965 */ 15966 if ((tmp_ill = illgrp->illgrp_ill) == NULL) { 15967 ASSERT(illgrp->illgrp_ill_count == 0); 15968 continue; 15969 } 15970 15971 ASSERT(tmp_ill->ill_phyint != NULL); 15972 phyi = tmp_ill->ill_phyint; 15973 /* 15974 * Look at groups which has names only. 15975 */ 15976 if (phyi->phyint_groupname_len == 0) 15977 continue; 15978 /* 15979 * Names are stored in the phyint common to both 15980 * IPv4 and IPv6. 15981 */ 15982 if (mi_strcmp(phyi->phyint_groupname, 15983 groupname) == 0) { 15984 break; 15985 } 15986 } 15987 } else { 15988 /* 15989 * If the caller passes in a NULL "grp_to_insert", we 15990 * allocate one below and insert this singleton. 
15991 */ 15992 illgrp = grp_to_insert; 15993 } 15994 15995 ill->ill_group_next = NULL; 15996 15997 if (illgrp == NULL) { 15998 illgrp = (ill_group_t *)mi_zalloc(sizeof (ill_group_t)); 15999 if (illgrp == NULL) { 16000 return (ENOMEM); 16001 } 16002 illgrp->illgrp_next = *illgrp_head; 16003 *illgrp_head = illgrp; 16004 illgrp->illgrp_ill = ill; 16005 illgrp->illgrp_ill_count = 1; 16006 ill->ill_group = illgrp; 16007 /* 16008 * Used in illgrp_scheduler to protect multiple threads 16009 * from traversing the list. 16010 */ 16011 mutex_init(&illgrp->illgrp_lock, NULL, MUTEX_DEFAULT, 0); 16012 } else { 16013 ASSERT(ill->ill_net_type == 16014 illgrp->illgrp_ill->ill_net_type); 16015 ASSERT(ill->ill_type == illgrp->illgrp_ill->ill_type); 16016 16017 /* Insert ill at tail of this group */ 16018 prev_ill = illgrp->illgrp_ill; 16019 while (prev_ill->ill_group_next != NULL) 16020 prev_ill = prev_ill->ill_group_next; 16021 prev_ill->ill_group_next = ill; 16022 ill->ill_group = illgrp; 16023 illgrp->illgrp_ill_count++; 16024 /* 16025 * Inherit group properties. Currently only forwarding 16026 * is the property we try to keep the same with all the 16027 * ills. When there are more, we will abstract this into 16028 * a function. 16029 */ 16030 ill->ill_flags &= ~ILLF_ROUTER; 16031 ill->ill_flags |= (illgrp->illgrp_ill->ill_flags & ILLF_ROUTER); 16032 } 16033 mutex_exit(&ill->ill_lock); 16034 rw_exit(&ipst->ips_ill_g_lock); 16035 16036 /* 16037 * 1) When ipif_up_done() calls this function, ipif_up_count 16038 * may be zero as it has not yet been bumped. But the ires 16039 * have already been added. So, we do the nomination here 16040 * itself. But, when ip_sioctl_groupname calls this, it checks 16041 * for ill_ipif_up_count != 0. Thus we don't check for 16042 * ill_ipif_up_count here while nominating broadcast ires for 16043 * receive. 
16044 * 16045 * 2) Similarly, we need to call ill_group_bcast_for_xmit here 16046 * to group them properly as ire_add() has already happened 16047 * in the ipif_up_done() case. For ip_sioctl_groupname/ifgrp_insert 16048 * case, we need to do it here anyway. 16049 */ 16050 if (!ill->ill_isv6) { 16051 ill_group_bcast_for_xmit(ill); 16052 ill_nominate_bcast_rcv(illgrp); 16053 } 16054 16055 if (!ipif_is_coming_up) { 16056 /* 16057 * When ipif_up_done() calls this function, the multicast 16058 * groups have not been joined yet. So, there is no point in 16059 * nomination. ip_join_allmulti will handle groups when 16060 * ill_recover_multicast is called from ipif_up_done() later. 16061 */ 16062 (void) ill_nominate_mcast_rcv(illgrp); 16063 /* 16064 * ipif_up_done calls ill_update_source_selection 16065 * anyway. Moreover, we don't want to re-create 16066 * interface routes while ipif_up_done() still has reference 16067 * to them. Refer to ipif_up_done() for more details. 16068 */ 16069 ill_update_source_selection(ill); 16070 } 16071 16072 /* 16073 * Send a routing sockets message if we are inserting into 16074 * groups with names. 16075 */ 16076 if (groupname != NULL) 16077 ip_rts_ifmsg(ill->ill_ipif); 16078 return (0); 16079 } 16080 16081 /* 16082 * Return the first phyint matching the groupname. There could 16083 * be more than one when there are ill groups. 16084 * 16085 * If 'usable' is set, then we exclude ones that are marked with any of 16086 * (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE). 16087 * Needs work: called only from ip_sioctl_groupname and from the ipmp/netinfo 16088 * emulation of ipmp. 16089 */ 16090 phyint_t * 16091 phyint_lookup_group(char *groupname, boolean_t usable, ip_stack_t *ipst) 16092 { 16093 phyint_t *phyi; 16094 16095 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 16096 /* 16097 * Group names are stored in the phyint - a common structure 16098 * to both IPv4 and IPv6. 
16099 */ 16100 phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index); 16101 for (; phyi != NULL; 16102 phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 16103 phyi, AVL_AFTER)) { 16104 if (phyi->phyint_groupname_len == 0) 16105 continue; 16106 /* 16107 * Skip the ones that should not be used since the callers 16108 * sometime use this for sending packets. 16109 */ 16110 if (usable && (phyi->phyint_flags & 16111 (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE))) 16112 continue; 16113 16114 ASSERT(phyi->phyint_groupname != NULL); 16115 if (mi_strcmp(groupname, phyi->phyint_groupname) == 0) 16116 return (phyi); 16117 } 16118 return (NULL); 16119 } 16120 16121 16122 /* 16123 * Return the first usable phyint matching the group index. By 'usable' 16124 * we exclude ones that are marked ununsable with any of 16125 * (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE). 16126 * 16127 * Used only for the ipmp/netinfo emulation of ipmp. 16128 */ 16129 phyint_t * 16130 phyint_lookup_group_ifindex(uint_t group_ifindex, ip_stack_t *ipst) 16131 { 16132 phyint_t *phyi; 16133 16134 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 16135 16136 if (!ipst->ips_ipmp_hook_emulation) 16137 return (NULL); 16138 16139 /* 16140 * Group indicies are stored in the phyint - a common structure 16141 * to both IPv4 and IPv6. 16142 */ 16143 phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index); 16144 for (; phyi != NULL; 16145 phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 16146 phyi, AVL_AFTER)) { 16147 /* Ignore the ones that do not have a group */ 16148 if (phyi->phyint_groupname_len == 0) 16149 continue; 16150 16151 ASSERT(phyi->phyint_group_ifindex != 0); 16152 /* 16153 * Skip the ones that should not be used since the callers 16154 * sometime use this for sending packets. 
16155 */ 16156 if (phyi->phyint_flags & 16157 (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE)) 16158 continue; 16159 if (phyi->phyint_group_ifindex == group_ifindex) 16160 return (phyi); 16161 } 16162 return (NULL); 16163 } 16164 16165 16166 /* 16167 * MT notes on creation and deletion of IPMP groups 16168 * 16169 * Creation and deletion of IPMP groups introduce the need to merge or 16170 * split the associated serialization objects i.e the ipsq's. Normally all 16171 * the ills in an IPMP group would map to a single ipsq. If IPMP is not enabled 16172 * an ill-pair(v4, v6) i.e. phyint would map to a single ipsq. However during 16173 * the execution of the SIOCSLIFGROUPNAME command the picture changes. There 16174 * is a need to change the <ill-ipsq> association and we have to operate on both 16175 * the source and destination IPMP groups. For eg. attempting to set the 16176 * groupname of hme0 to mpk17-85 when it already belongs to mpk17-84 has to 16177 * handle 2 IPMP groups and 2 ipsqs. All the ills belonging to either of the 16178 * source or destination IPMP group are mapped to a single ipsq for executing 16179 * the SIOCSLIFGROUPNAME command. This is termed as a merge of the ipsq's. 16180 * The <ill-ipsq> mapping is restored back to normal at a later point. This is 16181 * termed as a split of the ipsq. The converse of the merge i.e. a split of the 16182 * ipsq happens while unwinding from ipsq_exit. If at least 1 set groupname 16183 * occurred on the ipsq, then the ipsq_split flag is set. This indicates the 16184 * ipsq has to be examined for redoing the <ill-ipsq> associations. 16185 * 16186 * In the above example the ioctl handling code locates the current ipsq of hme0 16187 * which is ipsq(mpk17-84). It then enters the above ipsq immediately or 16188 * eventually (after queueing the ioctl in ipsq(mpk17-84)). Then it locates 16189 * the destination ipsq which is ipsq(mpk17-85) and merges the source ipsq into 16190 * the destination ipsq. 
If the destination ipsq is not busy, it also enters 16191 * the destination ipsq exclusively. Now the actual groupname setting operation 16192 * can proceed. If the destination ipsq is busy, the operation is enqueued 16193 * on the destination (merged) ipsq and will be handled in the unwind from 16194 * ipsq_exit. 16195 * 16196 * To prevent other threads accessing the ill while the group name change is 16197 * in progres, we bring down the ipifs which also removes the ill from the 16198 * group. The group is changed in phyint and when the first ipif on the ill 16199 * is brought up, the ill is inserted into the right IPMP group by 16200 * illgrp_insert. 16201 */ 16202 /* ARGSUSED */ 16203 int 16204 ip_sioctl_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 16205 ip_ioctl_cmd_t *ipip, void *ifreq) 16206 { 16207 int i; 16208 char *tmp; 16209 int namelen; 16210 ill_t *ill = ipif->ipif_ill; 16211 ill_t *ill_v4, *ill_v6; 16212 int err = 0; 16213 phyint_t *phyi; 16214 phyint_t *phyi_tmp; 16215 struct lifreq *lifr; 16216 mblk_t *mp1; 16217 char *groupname; 16218 ipsq_t *ipsq; 16219 ip_stack_t *ipst = ill->ill_ipst; 16220 16221 ASSERT(IAM_WRITER_IPIF(ipif)); 16222 16223 /* Existance verified in ip_wput_nondata */ 16224 mp1 = mp->b_cont->b_cont; 16225 lifr = (struct lifreq *)mp1->b_rptr; 16226 groupname = lifr->lifr_groupname; 16227 16228 if (ipif->ipif_id != 0) 16229 return (EINVAL); 16230 16231 phyi = ill->ill_phyint; 16232 ASSERT(phyi != NULL); 16233 16234 if (phyi->phyint_flags & PHYI_VIRTUAL) 16235 return (EINVAL); 16236 16237 tmp = groupname; 16238 for (i = 0; i < LIFNAMSIZ && *tmp != '\0'; tmp++, i++) 16239 ; 16240 16241 if (i == LIFNAMSIZ) { 16242 /* no null termination */ 16243 return (EINVAL); 16244 } 16245 16246 /* 16247 * Calculate the namelen exclusive of the null 16248 * termination character. 
16249 */ 16250 namelen = tmp - groupname; 16251 16252 ill_v4 = phyi->phyint_illv4; 16253 ill_v6 = phyi->phyint_illv6; 16254 16255 /* 16256 * ILL cannot be part of a usesrc group and and IPMP group at the 16257 * same time. No need to grab the ill_g_usesrc_lock here, see 16258 * synchronization notes in ip.c 16259 */ 16260 if (ipif->ipif_ill->ill_usesrc_grp_next != NULL) { 16261 return (EINVAL); 16262 } 16263 16264 /* 16265 * mark the ill as changing. 16266 * this should queue all new requests on the syncq. 16267 */ 16268 GRAB_ILL_LOCKS(ill_v4, ill_v6); 16269 16270 if (ill_v4 != NULL) 16271 ill_v4->ill_state_flags |= ILL_CHANGING; 16272 if (ill_v6 != NULL) 16273 ill_v6->ill_state_flags |= ILL_CHANGING; 16274 RELEASE_ILL_LOCKS(ill_v4, ill_v6); 16275 16276 if (namelen == 0) { 16277 /* 16278 * Null string means remove this interface from the 16279 * existing group. 16280 */ 16281 if (phyi->phyint_groupname_len == 0) { 16282 /* 16283 * Never was in a group. 16284 */ 16285 err = 0; 16286 goto done; 16287 } 16288 16289 /* 16290 * IPv4 or IPv6 may be temporarily out of the group when all 16291 * the ipifs are down. Thus, we need to check for ill_group to 16292 * be non-NULL. 
16293 */ 16294 if (ill_v4 != NULL && ill_v4->ill_group != NULL) { 16295 ill_down_ipifs(ill_v4, mp, 0, B_FALSE); 16296 mutex_enter(&ill_v4->ill_lock); 16297 if (!ill_is_quiescent(ill_v4)) { 16298 /* 16299 * ipsq_pending_mp_add will not fail since 16300 * connp is NULL 16301 */ 16302 (void) ipsq_pending_mp_add(NULL, 16303 ill_v4->ill_ipif, q, mp, ILL_DOWN); 16304 mutex_exit(&ill_v4->ill_lock); 16305 err = EINPROGRESS; 16306 goto done; 16307 } 16308 mutex_exit(&ill_v4->ill_lock); 16309 } 16310 16311 if (ill_v6 != NULL && ill_v6->ill_group != NULL) { 16312 ill_down_ipifs(ill_v6, mp, 0, B_FALSE); 16313 mutex_enter(&ill_v6->ill_lock); 16314 if (!ill_is_quiescent(ill_v6)) { 16315 (void) ipsq_pending_mp_add(NULL, 16316 ill_v6->ill_ipif, q, mp, ILL_DOWN); 16317 mutex_exit(&ill_v6->ill_lock); 16318 err = EINPROGRESS; 16319 goto done; 16320 } 16321 mutex_exit(&ill_v6->ill_lock); 16322 } 16323 16324 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 16325 GRAB_ILL_LOCKS(ill_v4, ill_v6); 16326 mutex_enter(&phyi->phyint_lock); 16327 ASSERT(phyi->phyint_groupname != NULL); 16328 mi_free(phyi->phyint_groupname); 16329 phyi->phyint_groupname = NULL; 16330 phyi->phyint_groupname_len = 0; 16331 16332 /* Restore the ifindex used to be the per interface one */ 16333 phyi->phyint_group_ifindex = 0; 16334 phyi->phyint_hook_ifindex = phyi->phyint_ifindex; 16335 mutex_exit(&phyi->phyint_lock); 16336 RELEASE_ILL_LOCKS(ill_v4, ill_v6); 16337 rw_exit(&ipst->ips_ill_g_lock); 16338 err = ill_up_ipifs(ill, q, mp); 16339 16340 /* 16341 * set the split flag so that the ipsq can be split 16342 */ 16343 mutex_enter(&phyi->phyint_ipsq->ipsq_lock); 16344 phyi->phyint_ipsq->ipsq_split = B_TRUE; 16345 mutex_exit(&phyi->phyint_ipsq->ipsq_lock); 16346 16347 } else { 16348 if (phyi->phyint_groupname_len != 0) { 16349 ASSERT(phyi->phyint_groupname != NULL); 16350 /* Are we inserting in the same group ? 
*/ 16351 if (mi_strcmp(groupname, 16352 phyi->phyint_groupname) == 0) { 16353 err = 0; 16354 goto done; 16355 } 16356 } 16357 16358 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 16359 /* 16360 * Merge ipsq for the group's. 16361 * This check is here as multiple groups/ills might be 16362 * sharing the same ipsq. 16363 * If we have to merege than the operation is restarted 16364 * on the new ipsq. 16365 */ 16366 ipsq = ip_ipsq_lookup(groupname, B_FALSE, NULL, ipst); 16367 if (phyi->phyint_ipsq != ipsq) { 16368 rw_exit(&ipst->ips_ill_g_lock); 16369 err = ill_merge_groups(ill, NULL, groupname, mp, q); 16370 goto done; 16371 } 16372 /* 16373 * Running exclusive on new ipsq. 16374 */ 16375 16376 ASSERT(ipsq != NULL); 16377 ASSERT(ipsq->ipsq_writer == curthread); 16378 16379 /* 16380 * Check whether the ill_type and ill_net_type matches before 16381 * we allocate any memory so that the cleanup is easier. 16382 * 16383 * We can't group dissimilar ones as we can't load spread 16384 * packets across the group because of potential link-level 16385 * header differences. 
16386 */ 16387 phyi_tmp = phyint_lookup_group(groupname, B_FALSE, ipst); 16388 if (phyi_tmp != NULL) { 16389 if ((ill_v4 != NULL && 16390 phyi_tmp->phyint_illv4 != NULL) && 16391 ((ill_v4->ill_net_type != 16392 phyi_tmp->phyint_illv4->ill_net_type) || 16393 (ill_v4->ill_type != 16394 phyi_tmp->phyint_illv4->ill_type))) { 16395 mutex_enter(&phyi->phyint_ipsq->ipsq_lock); 16396 phyi->phyint_ipsq->ipsq_split = B_TRUE; 16397 mutex_exit(&phyi->phyint_ipsq->ipsq_lock); 16398 rw_exit(&ipst->ips_ill_g_lock); 16399 return (EINVAL); 16400 } 16401 if ((ill_v6 != NULL && 16402 phyi_tmp->phyint_illv6 != NULL) && 16403 ((ill_v6->ill_net_type != 16404 phyi_tmp->phyint_illv6->ill_net_type) || 16405 (ill_v6->ill_type != 16406 phyi_tmp->phyint_illv6->ill_type))) { 16407 mutex_enter(&phyi->phyint_ipsq->ipsq_lock); 16408 phyi->phyint_ipsq->ipsq_split = B_TRUE; 16409 mutex_exit(&phyi->phyint_ipsq->ipsq_lock); 16410 rw_exit(&ipst->ips_ill_g_lock); 16411 return (EINVAL); 16412 } 16413 } 16414 16415 rw_exit(&ipst->ips_ill_g_lock); 16416 16417 /* 16418 * bring down all v4 ipifs. 16419 */ 16420 if (ill_v4 != NULL) { 16421 ill_down_ipifs(ill_v4, mp, 0, B_FALSE); 16422 } 16423 16424 /* 16425 * bring down all v6 ipifs. 16426 */ 16427 if (ill_v6 != NULL) { 16428 ill_down_ipifs(ill_v6, mp, 0, B_FALSE); 16429 } 16430 16431 /* 16432 * make sure all ipifs are down and there are no active 16433 * references. Call to ipsq_pending_mp_add will not fail 16434 * since connp is NULL. 
16435 */ 16436 if (ill_v4 != NULL) { 16437 mutex_enter(&ill_v4->ill_lock); 16438 if (!ill_is_quiescent(ill_v4)) { 16439 (void) ipsq_pending_mp_add(NULL, 16440 ill_v4->ill_ipif, q, mp, ILL_DOWN); 16441 mutex_exit(&ill_v4->ill_lock); 16442 err = EINPROGRESS; 16443 goto done; 16444 } 16445 mutex_exit(&ill_v4->ill_lock); 16446 } 16447 16448 if (ill_v6 != NULL) { 16449 mutex_enter(&ill_v6->ill_lock); 16450 if (!ill_is_quiescent(ill_v6)) { 16451 (void) ipsq_pending_mp_add(NULL, 16452 ill_v6->ill_ipif, q, mp, ILL_DOWN); 16453 mutex_exit(&ill_v6->ill_lock); 16454 err = EINPROGRESS; 16455 goto done; 16456 } 16457 mutex_exit(&ill_v6->ill_lock); 16458 } 16459 16460 /* 16461 * allocate including space for null terminator 16462 * before we insert. 16463 */ 16464 tmp = (char *)mi_alloc(namelen + 1, BPRI_MED); 16465 if (tmp == NULL) 16466 return (ENOMEM); 16467 16468 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 16469 GRAB_ILL_LOCKS(ill_v4, ill_v6); 16470 mutex_enter(&phyi->phyint_lock); 16471 if (phyi->phyint_groupname_len != 0) { 16472 ASSERT(phyi->phyint_groupname != NULL); 16473 mi_free(phyi->phyint_groupname); 16474 } 16475 16476 /* 16477 * setup the new group name. 16478 */ 16479 phyi->phyint_groupname = tmp; 16480 bcopy(groupname, phyi->phyint_groupname, namelen + 1); 16481 phyi->phyint_groupname_len = namelen + 1; 16482 16483 if (ipst->ips_ipmp_hook_emulation) { 16484 /* 16485 * If the group already exists we use the existing 16486 * group_ifindex, otherwise we pick a new index here. 16487 */ 16488 if (phyi_tmp != NULL) { 16489 phyi->phyint_group_ifindex = 16490 phyi_tmp->phyint_group_ifindex; 16491 } else { 16492 /* XXX We need a recovery strategy here. */ 16493 if (!ip_assign_ifindex( 16494 &phyi->phyint_group_ifindex, ipst)) 16495 cmn_err(CE_PANIC, 16496 "ip_assign_ifindex() failed"); 16497 } 16498 } 16499 /* 16500 * Select whether the netinfo and hook use the per-interface 16501 * or per-group ifindex. 
16502 */ 16503 if (ipst->ips_ipmp_hook_emulation) 16504 phyi->phyint_hook_ifindex = phyi->phyint_group_ifindex; 16505 else 16506 phyi->phyint_hook_ifindex = phyi->phyint_ifindex; 16507 16508 if (ipst->ips_ipmp_hook_emulation && 16509 phyi_tmp != NULL) { 16510 /* First phyint in group - group PLUMB event */ 16511 ill_nic_info_plumb(ill, B_TRUE); 16512 } 16513 mutex_exit(&phyi->phyint_lock); 16514 RELEASE_ILL_LOCKS(ill_v4, ill_v6); 16515 rw_exit(&ipst->ips_ill_g_lock); 16516 16517 err = ill_up_ipifs(ill, q, mp); 16518 } 16519 16520 done: 16521 /* 16522 * normally ILL_CHANGING is cleared in ill_up_ipifs. 16523 */ 16524 if (err != EINPROGRESS) { 16525 GRAB_ILL_LOCKS(ill_v4, ill_v6); 16526 if (ill_v4 != NULL) 16527 ill_v4->ill_state_flags &= ~ILL_CHANGING; 16528 if (ill_v6 != NULL) 16529 ill_v6->ill_state_flags &= ~ILL_CHANGING; 16530 RELEASE_ILL_LOCKS(ill_v4, ill_v6); 16531 } 16532 return (err); 16533 } 16534 16535 /* ARGSUSED */ 16536 int 16537 ip_sioctl_get_groupname(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, 16538 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 16539 { 16540 ill_t *ill; 16541 phyint_t *phyi; 16542 struct lifreq *lifr; 16543 mblk_t *mp1; 16544 16545 /* Existence verified in ip_wput_nondata */ 16546 mp1 = mp->b_cont->b_cont; 16547 lifr = (struct lifreq *)mp1->b_rptr; 16548 ill = ipif->ipif_ill; 16549 phyi = ill->ill_phyint; 16550 16551 lifr->lifr_groupname[0] = '\0'; 16552 /* 16553 * ill_group may be null if all the interfaces 16554 * are down. But still, the phyint should always 16555 * hold the name. 16556 */ 16557 if (phyi->phyint_groupname_len != 0) { 16558 bcopy(phyi->phyint_groupname, lifr->lifr_groupname, 16559 phyi->phyint_groupname_len); 16560 } 16561 16562 return (0); 16563 } 16564 16565 16566 typedef struct conn_move_s { 16567 ill_t *cm_from_ill; 16568 ill_t *cm_to_ill; 16569 int cm_ifindex; 16570 } conn_move_t; 16571 16572 /* 16573 * ipcl_walk function for moving conn_multicast_ill for a given ill. 
16574 */ 16575 static void 16576 conn_move(conn_t *connp, caddr_t arg) 16577 { 16578 conn_move_t *connm; 16579 int ifindex; 16580 int i; 16581 ill_t *from_ill; 16582 ill_t *to_ill; 16583 ilg_t *ilg; 16584 ilm_t *ret_ilm; 16585 16586 connm = (conn_move_t *)arg; 16587 ifindex = connm->cm_ifindex; 16588 from_ill = connm->cm_from_ill; 16589 to_ill = connm->cm_to_ill; 16590 16591 /* Change IP_BOUND_IF/IPV6_BOUND_IF associations. */ 16592 16593 /* All multicast fields protected by conn_lock */ 16594 mutex_enter(&connp->conn_lock); 16595 ASSERT(connp->conn_outgoing_ill == connp->conn_incoming_ill); 16596 if ((connp->conn_outgoing_ill == from_ill) && 16597 (ifindex == 0 || connp->conn_orig_bound_ifindex == ifindex)) { 16598 connp->conn_outgoing_ill = to_ill; 16599 connp->conn_incoming_ill = to_ill; 16600 } 16601 16602 /* Change IP_MULTICAST_IF/IPV6_MULTICAST_IF associations */ 16603 16604 if ((connp->conn_multicast_ill == from_ill) && 16605 (ifindex == 0 || connp->conn_orig_multicast_ifindex == ifindex)) { 16606 connp->conn_multicast_ill = connm->cm_to_ill; 16607 } 16608 16609 /* Change IP_XMIT_IF associations */ 16610 if ((connp->conn_xmit_if_ill == from_ill) && 16611 (ifindex == 0 || connp->conn_orig_xmit_ifindex == ifindex)) { 16612 connp->conn_xmit_if_ill = to_ill; 16613 } 16614 /* 16615 * Change the ilg_ill to point to the new one. This assumes 16616 * ilm_move_v6 has moved the ilms to new_ill and the driver 16617 * has been told to receive packets on this interface. 16618 * ilm_move_v6 FAILBACKS all the ilms successfully always. 16619 * But when doing a FAILOVER, it might fail with ENOMEM and so 16620 * some ilms may not have moved. We check to see whether 16621 * the ilms have moved to to_ill. We can't check on from_ill 16622 * as in the process of moving, we could have split an ilm 16623 * in to two - which has the same orig_ifindex and v6group. 16624 * 16625 * For IPv4, ilg_ipif moves implicitly. 
The code below really 16626 * does not do anything for IPv4 as ilg_ill is NULL for IPv4. 16627 */ 16628 for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) { 16629 ilg = &connp->conn_ilg[i]; 16630 if ((ilg->ilg_ill == from_ill) && 16631 (ifindex == 0 || ilg->ilg_orig_ifindex == ifindex)) { 16632 /* ifindex != 0 indicates failback: repoint the ilg unconditionally */ 16633 if (ifindex != 0) { 16634 connp->conn_ilg[i].ilg_ill = to_ill; 16635 continue; 16636 } 16637 /* Failover: repoint the ilg only if the matching ilm is found on to_ill. */ 16638 ret_ilm = ilm_lookup_ill_index_v6(to_ill, 16639 &ilg->ilg_v6group, ilg->ilg_orig_ifindex, 16640 connp->conn_zoneid); 16641 16642 if (ret_ilm != NULL) 16643 connp->conn_ilg[i].ilg_ill = to_ill; 16644 } 16645 } 16646 mutex_exit(&connp->conn_lock); 16647 } 16648 /* * Run conn_move() over the conns of from_ill's IP stack via ipcl_walk(), * switching associations from from_ill to to_ill. ifindex is passed * through unchanged: 0 moves everything, non-zero restricts the move to * entries whose original ifindex matches. */ 16649 static void 16650 conn_move_ill(ill_t *from_ill, ill_t *to_ill, int ifindex) 16651 { 16652 conn_move_t connm; 16653 ip_stack_t *ipst = from_ill->ill_ipst; 16654 16655 connm.cm_from_ill = from_ill; 16656 connm.cm_to_ill = to_ill; 16657 connm.cm_ifindex = ifindex; 16658 16659 ipcl_walk(conn_move, (caddr_t)&connm, ipst); 16660 } 16661 16662 /* 16663 * ilms have been moved from from_ill to to_ill. 16664 * Send DL_DISABMULTI_REQ on from_ill and DL_ENABMULTI_REQ on to_ill 16665 * as appropriate. 16666 * 16667 * NOTE : We can't reuse the code in ip_ll_addmulti/delmulti because 16668 * the code there de-references ipif_ill to get the ill to 16669 * send multicast requests. That does not work here, as the ipif 16670 * has already been moved when this function is called. 16671 * Thus, we need to use from_ill and to_ill to send down multicast 16672 * requests. 16673 */ 16674 static void 16675 ilm_send_multicast_reqs(ill_t *from_ill, ill_t *to_ill) 16676 { 16677 ipif_t *ipif; 16678 ilm_t *ilm; 16679 16680 /* 16681 * See whether we need to send down DL_ENABMULTI_REQ on 16682 * to_ill as ilm has just been added.
16683 */ 16684 ASSERT(IAM_WRITER_ILL(to_ill)); 16685 ASSERT(IAM_WRITER_ILL(from_ill)); 16686 16687 ILM_WALKER_HOLD(to_ill); 16688 for (ilm = to_ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 16689 /* Only ilms flagged ilm_is_new (set in ilm_move_v4/ilm_move_v6) and not deleted are processed. */ 16690 if (!ilm->ilm_is_new || (ilm->ilm_flags & ILM_DELETED)) 16691 continue; 16692 /* 16693 * no locks held, ill/ipif cannot disappear as long 16694 * as we are writer. 16695 */ 16696 ipif = to_ill->ill_ipif; 16697 /* 16698 * No need to hold any lock as we are the writer and this 16699 * can only be changed by a writer. 16700 */ 16701 ilm->ilm_is_new = B_FALSE; 16702 16703 if (to_ill->ill_net_type != IRE_IF_RESOLVER || 16704 ipif->ipif_flags & IPIF_POINTOPOINT) { 16705 ip1dbg(("ilm_send_multicast_reqs: to_ill not " 16706 "resolver\n")); 16707 continue; /* Must be IRE_IF_NORESOLVER */ 16708 } 16709 16710 /* Nothing is sent on to_ill in this case; go straight to the from_ill handling. */ 16711 if (to_ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) { 16712 ip1dbg(("ilm_send_multicast_reqs: " 16713 "to_ill MULTI_BCAST\n")); 16714 goto from; 16715 } 16716 16717 if (to_ill->ill_isv6) 16718 mld_joingroup(ilm); 16719 else 16720 igmp_joingroup(ilm); 16721 16722 if (to_ill->ill_ipif_up_count == 0) { 16723 /* 16724 * Nobody there. All multicast addresses will be 16725 * re-joined when we get the DL_BIND_ACK bringing the 16726 * interface up. 16727 */ 16728 ilm->ilm_notify_driver = B_FALSE; 16729 ip1dbg(("ilm_send_multicast_reqs: to_ill nobody up\n")); 16730 goto from; 16731 } 16732 16733 /* 16734 * For allmulti address, we want to join on only one interface. 16735 * Checking for ilm_numentries_v6 is not correct as you may 16736 * find an ilm with zero address on to_ill, but we may not 16737 * have nominated to_ill for receiving. Thus, if we have 16738 * nominated from_ill (ill_join_allmulti is set), nominate 16739 * only if to_ill is not already nominated (to_ill normally 16740 * should not have been nominated if "from_ill" has already 16741 * been nominated. As we don't prevent failovers from happening 16742 * across groups, we don't assert).
16743 */ 16744 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 16745 /* 16746 * There is no need to hold ill locks as we are 16747 * writer on both ills and when ill_join_allmulti 16748 * is changed the thread is always a writer. 16749 */ 16750 if (from_ill->ill_join_allmulti && 16751 !to_ill->ill_join_allmulti) { 16752 (void) ip_join_allmulti(to_ill->ill_ipif); 16753 } 16754 } else if (ilm->ilm_notify_driver) { 16755 16756 /* 16757 * This is a newly moved ilm so we need to tell the 16758 * driver about the new group. There can be more than 16759 * one ilm's for the same group in the list each with a 16760 * different orig_ifindex. We have to inform the driver 16761 * once. In ilm_move_v[4,6] we only set the flag 16762 * ilm_notify_driver for the first ilm. 16763 */ 16764 16765 (void) ip_ll_send_enabmulti_req(to_ill, 16766 &ilm->ilm_v6addr); 16767 } 16768 16769 ilm->ilm_notify_driver = B_FALSE; 16770 16771 /* 16772 * See whether we need to send down DL_DISABMULTI_REQ on 16773 * from_ill as ilm has just been removed. 16774 */ 16775 from: 16776 ipif = from_ill->ill_ipif; 16777 if (from_ill->ill_net_type != IRE_IF_RESOLVER || 16778 ipif->ipif_flags & IPIF_POINTOPOINT) { 16779 ip1dbg(("ilm_send_multicast_reqs: " 16780 "from_ill not resolver\n")); 16781 continue; /* Must be IRE_IF_NORESOLVER */ 16782 } 16783 16784 if (from_ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) { 16785 ip1dbg(("ilm_send_multicast_reqs: " 16786 "from_ill MULTI_BCAST\n")); 16787 continue; 16788 } 16789 16790 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 16791 if (from_ill->ill_join_allmulti) 16792 (void) ip_leave_allmulti(from_ill->ill_ipif); 16793 } else if (ilm_numentries_v6(from_ill, &ilm->ilm_v6addr) == 0) { 16794 (void) ip_ll_send_disabmulti_req(from_ill, 16795 &ilm->ilm_v6addr); 16796 } 16797 } 16798 ILM_WALKER_RELE(to_ill); 16799 } 16800 16801 /* 16802 * This function is called when all multicast memberships needs 16803 * to be moved from "from_ill" to "to_ill" for IPv6. 
This function is 16804 * called only once unlike the IPv4 counterpart where it is called after 16805 * every logical interface is moved. The reason is due to multicast 16806 * memberships are joined using an interface address in IPv4 while in 16807 * IPv6, interface index is used. 16808 */ 16809 static void 16810 ilm_move_v6(ill_t *from_ill, ill_t *to_ill, int ifindex) 16811 { 16812 ilm_t *ilm; 16813 ilm_t *ilm_next; 16814 ilm_t *new_ilm; 16815 ilm_t **ilmp; 16816 int count; 16817 char buf[INET6_ADDRSTRLEN]; 16818 in6_addr_t ipv6_snm = ipv6_solicited_node_mcast; 16819 ip_stack_t *ipst = from_ill->ill_ipst; 16820 16821 ASSERT(MUTEX_HELD(&to_ill->ill_lock)); 16822 ASSERT(MUTEX_HELD(&from_ill->ill_lock)); 16823 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 16824 16825 if (ifindex == 0) { 16826 /* 16827 * Form the solicited node mcast address which is used later. 16828 */ 16829 ipif_t *ipif; 16830 16831 ipif = from_ill->ill_ipif; 16832 ASSERT(ipif->ipif_id == 0); 16833 16834 ipv6_snm.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3]; 16835 } 16836 16837 ilmp = &from_ill->ill_ilm; 16838 for (ilm = from_ill->ill_ilm; ilm != NULL; ilm = ilm_next) { 16839 ilm_next = ilm->ilm_next; 16840 16841 if (ilm->ilm_flags & ILM_DELETED) { 16842 ilmp = &ilm->ilm_next; 16843 continue; 16844 } 16845 16846 new_ilm = ilm_lookup_ill_index_v6(to_ill, &ilm->ilm_v6addr, 16847 ilm->ilm_orig_ifindex, ilm->ilm_zoneid); 16848 ASSERT(ilm->ilm_orig_ifindex != 0); 16849 if (ilm->ilm_orig_ifindex == ifindex) { 16850 /* 16851 * We are failing back multicast memberships. 16852 * If the same ilm exists in to_ill, it means somebody 16853 * has joined the same group there e.g. ff02::1 16854 * is joined within the kernel when the interfaces 16855 * came UP. 
16856 */ 16857 ASSERT(ilm->ilm_ipif == NULL); 16858 if (new_ilm != NULL) { 16859 new_ilm->ilm_refcnt += ilm->ilm_refcnt; 16860 if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE || 16861 !SLIST_IS_EMPTY(new_ilm->ilm_filter)) { 16862 new_ilm->ilm_is_new = B_TRUE; 16863 } 16864 } else { 16865 /* 16866 * check if we can just move the ilm 16867 */ 16868 if (from_ill->ill_ilm_walker_cnt != 0) { 16869 /* 16870 * We have walkers we cannot move 16871 * the ilm, so allocate a new ilm, 16872 * this (old) ilm will be marked 16873 * ILM_DELETED at the end of the loop 16874 * and will be freed when the 16875 * last walker exits. 16876 */ 16877 new_ilm = (ilm_t *)mi_zalloc 16878 (sizeof (ilm_t)); 16879 if (new_ilm == NULL) { 16880 ip0dbg(("ilm_move_v6: " 16881 "FAILBACK of IPv6" 16882 " multicast address %s : " 16883 "from %s to" 16884 " %s failed : ENOMEM \n", 16885 inet_ntop(AF_INET6, 16886 &ilm->ilm_v6addr, buf, 16887 sizeof (buf)), 16888 from_ill->ill_name, 16889 to_ill->ill_name)); 16890 16891 ilmp = &ilm->ilm_next; 16892 continue; 16893 } 16894 *new_ilm = *ilm; 16895 /* 16896 * we don't want new_ilm linked to 16897 * ilm's filter list. 16898 */ 16899 new_ilm->ilm_filter = NULL; 16900 } else { 16901 /* 16902 * No walkers we can move the ilm. 16903 * lets take it out of the list. 16904 */ 16905 *ilmp = ilm->ilm_next; 16906 ilm->ilm_next = NULL; 16907 new_ilm = ilm; 16908 } 16909 16910 /* 16911 * if this is the first ilm for the group 16912 * set ilm_notify_driver so that we notify the 16913 * driver in ilm_send_multicast_reqs. 16914 */ 16915 if (ilm_lookup_ill_v6(to_ill, 16916 &new_ilm->ilm_v6addr, ALL_ZONES) == NULL) 16917 new_ilm->ilm_notify_driver = B_TRUE; 16918 16919 new_ilm->ilm_ill = to_ill; 16920 /* Add to the to_ill's list */ 16921 new_ilm->ilm_next = to_ill->ill_ilm; 16922 to_ill->ill_ilm = new_ilm; 16923 /* 16924 * set the flag so that mld_joingroup is 16925 * called in ilm_send_multicast_reqs(). 
16926 */ 16927 new_ilm->ilm_is_new = B_TRUE; 16928 } 16929 goto bottom; 16930 } else if (ifindex != 0) { 16931 /* 16932 * If this is FAILBACK (ifindex != 0) and the ifindex 16933 * has not matched above, look at the next ilm. 16934 */ 16935 ilmp = &ilm->ilm_next; 16936 continue; 16937 } 16938 /* 16939 * If we are here, it means ifindex is 0. Failover 16940 * everything. 16941 * 16942 * We need to handle solicited node mcast address 16943 * and all_nodes mcast address differently as they 16944 * are joined witin the kenrel (ipif_multicast_up) 16945 * and potentially from the userland. We are called 16946 * after the ipifs of from_ill has been moved. 16947 * If we still find ilms on ill with solicited node 16948 * mcast address or all_nodes mcast address, it must 16949 * belong to the UP interface that has not moved e.g. 16950 * ipif_id 0 with the link local prefix does not move. 16951 * We join this on the new ill accounting for all the 16952 * userland memberships so that applications don't 16953 * see any failure. 16954 * 16955 * We need to make sure that we account only for the 16956 * solicited node and all node multicast addresses 16957 * that was brought UP on these. In the case of 16958 * a failover from A to B, we might have ilms belonging 16959 * to A (ilm_orig_ifindex pointing at A) on B accounting 16960 * for the membership from the userland. If we are failing 16961 * over from B to C now, we will find the ones belonging 16962 * to A on B. These don't account for the ill_ipif_up_count. 16963 * They just move from B to C. The check below on 16964 * ilm_orig_ifindex ensures that. 
16965 */ 16966 if ((ilm->ilm_orig_ifindex == 16967 from_ill->ill_phyint->phyint_ifindex) && 16968 (IN6_ARE_ADDR_EQUAL(&ipv6_snm, &ilm->ilm_v6addr) || 16969 IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, 16970 &ilm->ilm_v6addr))) { 16971 ASSERT(ilm->ilm_refcnt > 0); /* count = references beyond the one-per-UP-ipif joins left on from_ill; only these are moved. */ 16972 count = ilm->ilm_refcnt - from_ill->ill_ipif_up_count; 16973 /* 16974 * For indentation reasons, we are not using a 16975 * "else" here. 16976 */ 16977 if (count == 0) { 16978 ilmp = &ilm->ilm_next; 16979 continue; 16980 } 16981 ilm->ilm_refcnt -= count; 16982 if (new_ilm != NULL) { 16983 /* 16984 * Can find one with the same 16985 * ilm_orig_ifindex, if we are failing 16986 * over to a STANDBY. This happens 16987 * when somebody wants to join a group 16988 * on a STANDBY interface and we 16989 * internally join on a different one. 16990 * If we had joined on from_ill then, a 16991 * failover now will find a new ilm 16992 * with this index. 16993 */ 16994 ip1dbg(("ilm_move_v6: FAILOVER, found" 16995 " new ilm on %s, group address %s\n", 16996 to_ill->ill_name, 16997 inet_ntop(AF_INET6, 16998 &ilm->ilm_v6addr, buf, 16999 sizeof (buf)))); 17000 new_ilm->ilm_refcnt += count; 17001 if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE || 17002 !SLIST_IS_EMPTY(new_ilm->ilm_filter)) { 17003 new_ilm->ilm_is_new = B_TRUE; 17004 } 17005 } else { 17006 new_ilm = (ilm_t *)mi_zalloc(sizeof (ilm_t)); 17007 if (new_ilm == NULL) { 17008 ip0dbg(("ilm_move_v6: FAILOVER of IPv6" 17009 " multicast address %s : from %s to" 17010 " %s failed : ENOMEM \n", 17011 inet_ntop(AF_INET6, 17012 &ilm->ilm_v6addr, buf, 17013 sizeof (buf)), from_ill->ill_name, 17014 to_ill->ill_name)); /* Nothing was moved: give back the references deducted above so they stay accounted for on from_ill instead of being lost. */ ilm->ilm_refcnt += count; 17015 ilmp = &ilm->ilm_next; 17016 continue; 17017 } /* NOTE(review): the structure copy below also duplicates the ilm_pendsrcs and ilm_rtx.rtx_allow/rtx_block pointers; only ilm_filter is reset, while the old ilm's lists are passed to FREE_SLIST() at "bottom" when it is released - confirm the copy cannot be left holding stale list pointers. */ 17018 *new_ilm = *ilm; 17019 new_ilm->ilm_filter = NULL; 17020 new_ilm->ilm_refcnt = count; 17021 new_ilm->ilm_timer = INFINITY; 17022 new_ilm->ilm_rtx.rtx_timer = INFINITY; 17023 new_ilm->ilm_is_new = B_TRUE; 17024 /* 17025 * If the to_ill has not joined this 17026 * group we need to tell the driver in 17027 *
ill_send_multicast_reqs. 17028 */ 17029 if (ilm_lookup_ill_v6(to_ill, 17030 &new_ilm->ilm_v6addr, ALL_ZONES) == NULL) 17031 new_ilm->ilm_notify_driver = B_TRUE; 17032 17033 new_ilm->ilm_ill = to_ill; 17034 /* Add to the to_ill's list */ 17035 new_ilm->ilm_next = to_ill->ill_ilm; 17036 to_ill->ill_ilm = new_ilm; 17037 ASSERT(new_ilm->ilm_ipif == NULL); 17038 } 17039 if (ilm->ilm_refcnt == 0) { 17040 goto bottom; 17041 } else { 17042 new_ilm->ilm_fmode = MODE_IS_EXCLUDE; 17043 CLEAR_SLIST(new_ilm->ilm_filter); 17044 ilmp = &ilm->ilm_next; 17045 } 17046 continue; 17047 } else { 17048 /* 17049 * ifindex = 0 means, move everything pointing at 17050 * from_ill. We are doing this becuase ill has 17051 * either FAILED or became INACTIVE. 17052 * 17053 * As we would like to move things later back to 17054 * from_ill, we want to retain the identity of this 17055 * ilm. Thus, we don't blindly increment the reference 17056 * count on the ilms matching the address alone. We 17057 * need to match on the ilm_orig_index also. new_ilm 17058 * was obtained by matching ilm_orig_index also. 17059 */ 17060 if (new_ilm != NULL) { 17061 /* 17062 * This is possible only if a previous restore 17063 * was incomplete i.e restore to 17064 * ilm_orig_ifindex left some ilms because 17065 * of some failures. Thus when we are failing 17066 * again, we might find our old friends there. 
17067 */ 17068 ip1dbg(("ilm_move_v6: FAILOVER, found new ilm" 17069 " on %s, group address %s\n", 17070 to_ill->ill_name, 17071 inet_ntop(AF_INET6, 17072 &ilm->ilm_v6addr, buf, 17073 sizeof (buf)))); 17074 new_ilm->ilm_refcnt += ilm->ilm_refcnt; 17075 if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE || 17076 !SLIST_IS_EMPTY(new_ilm->ilm_filter)) { 17077 new_ilm->ilm_is_new = B_TRUE; 17078 } 17079 } else { 17080 if (from_ill->ill_ilm_walker_cnt != 0) { 17081 new_ilm = (ilm_t *) 17082 mi_zalloc(sizeof (ilm_t)); 17083 if (new_ilm == NULL) { 17084 ip0dbg(("ilm_move_v6: " 17085 "FAILOVER of IPv6" 17086 " multicast address %s : " 17087 "from %s to" 17088 " %s failed : ENOMEM \n", 17089 inet_ntop(AF_INET6, 17090 &ilm->ilm_v6addr, buf, 17091 sizeof (buf)), 17092 from_ill->ill_name, 17093 to_ill->ill_name)); 17094 17095 ilmp = &ilm->ilm_next; 17096 continue; 17097 } 17098 *new_ilm = *ilm; 17099 new_ilm->ilm_filter = NULL; 17100 } else { 17101 *ilmp = ilm->ilm_next; 17102 new_ilm = ilm; 17103 } 17104 /* 17105 * If the to_ill has not joined this 17106 * group we need to tell the driver in 17107 * ill_send_multicast_reqs. 17108 */ 17109 if (ilm_lookup_ill_v6(to_ill, 17110 &new_ilm->ilm_v6addr, ALL_ZONES) == NULL) 17111 new_ilm->ilm_notify_driver = B_TRUE; 17112 17113 /* Add to the to_ill's list */ 17114 new_ilm->ilm_next = to_ill->ill_ilm; 17115 to_ill->ill_ilm = new_ilm; 17116 ASSERT(ilm->ilm_ipif == NULL); 17117 new_ilm->ilm_ill = to_ill; 17118 new_ilm->ilm_is_new = B_TRUE; 17119 } 17120 17121 } 17122 17123 bottom: 17124 /* 17125 * Revert multicast filter state to (EXCLUDE, NULL). 17126 * new_ilm->ilm_is_new should already be set if needed. 17127 */ 17128 new_ilm->ilm_fmode = MODE_IS_EXCLUDE; 17129 CLEAR_SLIST(new_ilm->ilm_filter); 17130 /* 17131 * We allocated/got a new ilm, free the old one. 
17132 */ 17133 if (new_ilm != ilm) { 17134 if (from_ill->ill_ilm_walker_cnt == 0) { 17135 *ilmp = ilm->ilm_next; 17136 ilm->ilm_next = NULL; 17137 FREE_SLIST(ilm->ilm_filter); 17138 FREE_SLIST(ilm->ilm_pendsrcs); 17139 FREE_SLIST(ilm->ilm_rtx.rtx_allow); 17140 FREE_SLIST(ilm->ilm_rtx.rtx_block); 17141 mi_free((char *)ilm); 17142 } else { 17143 ilm->ilm_flags |= ILM_DELETED; 17144 from_ill->ill_ilm_cleanup_reqd = 1; 17145 ilmp = &ilm->ilm_next; 17146 } 17147 } 17148 } 17149 } 17150 17151 /* 17152 * Move all the multicast memberships to to_ill. Called when 17153 * an ipif moves from "from_ill" to "to_ill". This function is slightly 17154 * different from IPv6 counterpart as multicast memberships are associated 17155 * with ills in IPv6. This function is called after every ipif is moved 17156 * unlike IPv6, where it is moved only once. 17157 */ 17158 static void 17159 ilm_move_v4(ill_t *from_ill, ill_t *to_ill, ipif_t *ipif) 17160 { 17161 ilm_t *ilm; 17162 ilm_t *ilm_next; 17163 ilm_t *new_ilm; 17164 ilm_t **ilmp; 17165 ip_stack_t *ipst = from_ill->ill_ipst; 17166 17167 ASSERT(MUTEX_HELD(&to_ill->ill_lock)); 17168 ASSERT(MUTEX_HELD(&from_ill->ill_lock)); 17169 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 17170 17171 ilmp = &from_ill->ill_ilm; 17172 for (ilm = from_ill->ill_ilm; ilm != NULL; ilm = ilm_next) { 17173 ilm_next = ilm->ilm_next; 17174 17175 if (ilm->ilm_flags & ILM_DELETED) { 17176 ilmp = &ilm->ilm_next; 17177 continue; 17178 } 17179 17180 ASSERT(ilm->ilm_ipif != NULL); 17181 17182 if (ilm->ilm_ipif != ipif) { 17183 ilmp = &ilm->ilm_next; 17184 continue; 17185 } 17186 17187 if (V4_PART_OF_V6(ilm->ilm_v6addr) == 17188 htonl(INADDR_ALLHOSTS_GROUP)) { 17189 new_ilm = ilm_lookup_ipif(ipif, 17190 V4_PART_OF_V6(ilm->ilm_v6addr)); 17191 if (new_ilm != NULL) { 17192 new_ilm->ilm_refcnt += ilm->ilm_refcnt; 17193 /* 17194 * We still need to deal with the from_ill. 
17195 */ 17196 new_ilm->ilm_is_new = B_TRUE; 17197 new_ilm->ilm_fmode = MODE_IS_EXCLUDE; 17198 CLEAR_SLIST(new_ilm->ilm_filter); 17199 goto delete_ilm; 17200 } 17201 /* 17202 * If we could not find one e.g. ipif is 17203 * still down on to_ill, we add this ilm 17204 * on to_ill to preserve the reference 17205 * count. 17206 */ 17207 } 17208 /* 17209 * When ipifs move, ilms always move with it 17210 * to the NEW ill. Thus we should never be 17211 * able to find ilm till we really move it here. 17212 */ 17213 ASSERT(ilm_lookup_ipif(ipif, 17214 V4_PART_OF_V6(ilm->ilm_v6addr)) == NULL); 17215 /* With walkers active the old ilm cannot be unlinked, so a copy is moved instead. */ 17216 if (from_ill->ill_ilm_walker_cnt != 0) { 17217 new_ilm = (ilm_t *)mi_zalloc(sizeof (ilm_t)); 17218 if (new_ilm == NULL) { 17219 char buf[INET6_ADDRSTRLEN]; /* Format the IPv4 group itself: AF_INET reads 4 bytes, and the group is the V4_PART_OF_V6() word of ilm_v6addr, not its first word. */ ipaddr_t v4group = V4_PART_OF_V6(ilm->ilm_v6addr); 17220 ip0dbg(("ilm_move_v4: FAILBACK of IPv4" 17221 " multicast address %s : " 17222 "from %s to" 17223 " %s failed : ENOMEM \n", 17224 inet_ntop(AF_INET, 17225 &v4group, buf, 17226 sizeof (buf)), 17227 from_ill->ill_name, 17228 to_ill->ill_name)); 17229 17230 ilmp = &ilm->ilm_next; 17231 continue; 17232 } 17233 *new_ilm = *ilm; 17234 /* We don't want new_ilm linked to ilm's filter list */ 17235 new_ilm->ilm_filter = NULL; 17236 } else { 17237 /* Remove from the list */ 17238 *ilmp = ilm->ilm_next; 17239 new_ilm = ilm; 17240 } 17241 17242 /* 17243 * If we have never joined this group on the to_ill 17244 * make sure we tell the driver. 17245 */ 17246 if (ilm_lookup_ill_v6(to_ill, &new_ilm->ilm_v6addr, 17247 ALL_ZONES) == NULL) 17248 new_ilm->ilm_notify_driver = B_TRUE; 17249 17250 /* Add to the to_ill's list */ 17251 new_ilm->ilm_next = to_ill->ill_ilm; 17252 to_ill->ill_ilm = new_ilm; 17253 new_ilm->ilm_is_new = B_TRUE; 17254 17255 /* 17256 * Revert multicast filter state to (EXCLUDE, NULL) 17257 */ 17258 new_ilm->ilm_fmode = MODE_IS_EXCLUDE; 17259 CLEAR_SLIST(new_ilm->ilm_filter); 17260 17261 /* 17262 * Delete only if we have allocated a new ilm.
17263 */ 17264 if (new_ilm != ilm) { 17265 delete_ilm: 17266 if (from_ill->ill_ilm_walker_cnt == 0) { 17267 /* Remove from the list */ 17268 *ilmp = ilm->ilm_next; 17269 ilm->ilm_next = NULL; 17270 FREE_SLIST(ilm->ilm_filter); 17271 FREE_SLIST(ilm->ilm_pendsrcs); 17272 FREE_SLIST(ilm->ilm_rtx.rtx_allow); 17273 FREE_SLIST(ilm->ilm_rtx.rtx_block); 17274 mi_free((char *)ilm); 17275 } else { /* Walkers are active: only mark the ilm deleted and flag the ill for cleanup. */ 17276 ilm->ilm_flags |= ILM_DELETED; 17277 from_ill->ill_ilm_cleanup_reqd = 1; 17278 ilmp = &ilm->ilm_next; 17279 } 17280 } 17281 } 17282 } 17283 /* * Choose an ipif id for use on ill. A non-zero id is a hint: it is * returned when no ipif on ill currently has that id, except that 0 is * returned when the head ipif's id is MAX_ADDRS_PER_IF. Otherwise the * lowest id, searching from 0 through ips_ip_addrs_per_if, that no ipif * on ill uses is returned. */ 17284 static uint_t 17285 ipif_get_id(ill_t *ill, uint_t id) 17286 { 17287 uint_t unit; 17288 ipif_t *tipif; 17289 boolean_t found = B_FALSE; 17290 ip_stack_t *ipst = ill->ill_ipst; 17291 17292 /* 17293 * During failback, we want to go back to the same id 17294 * instead of the smallest id so that the original 17295 * configuration is maintained. id is non-zero in that 17296 * case. 17297 */ 17298 if (id != 0) { 17299 /* 17300 * While failing back, if we still have an ipif with 17301 * MAX_ADDRS_PER_IF, it means this will be replaced 17302 * as soon as we return from this function. It was 17303 * set to MAX_ADDRS_PER_IF by the caller so that 17304 * we can choose the smallest id. Thus we return zero 17305 * in that case ignoring the hint. 17306 */ 17307 if (ill->ill_ipif->ipif_id == MAX_ADDRS_PER_IF) 17308 return (0); 17309 for (tipif = ill->ill_ipif; tipif != NULL; 17310 tipif = tipif->ipif_next) { 17311 if (tipif->ipif_id == id) { 17312 found = B_TRUE; 17313 break; 17314 } 17315 } 17316 /* 17317 * If somebody already plumbed another logical 17318 * with the same id, we won't be able to find it.
17319 */ 17320 if (!found) 17321 return (id); 17322 } /* No usable hint: take the lowest id that no ipif on ill is using. */ 17323 for (unit = 0; unit <= ipst->ips_ip_addrs_per_if; unit++) { 17324 found = B_FALSE; 17325 for (tipif = ill->ill_ipif; tipif != NULL; 17326 tipif = tipif->ipif_next) { 17327 if (tipif->ipif_id == unit) { 17328 found = B_TRUE; 17329 break; 17330 } 17331 } 17332 if (!found) 17333 break; 17334 } 17335 return (unit); 17336 } 17337 /* * Move ipif from its current ill (ipif->ipif_ill) to to_ill. * * Returns 0 when the ipif was moved, and also when it is an IPv6 * link-local address, which is deliberately left where it is; EINVAL * when the id picked by ipif_get_id() equals ips_ip_addrs_per_if; * ENOMEM when the replacement ipif for id 0 cannot be allocated. On * each of those early returns ipif_was_up is cleared and * IPIF_UNMARK_MOVING() is applied to ipif. A replacement ipif that is * unlinked from to_ill is handed back through *rep_ipif_ptr. * * The caller must be writer on both ills and hold both ill_locks and * ips_ill_g_lock as writer (see the ASSERTs below). */ 17338 /* ARGSUSED */ 17339 static int 17340 ipif_move(ipif_t *ipif, ill_t *to_ill, queue_t *q, mblk_t *mp, 17341 ipif_t **rep_ipif_ptr) 17342 { 17343 ill_t *from_ill; 17344 ipif_t *rep_ipif; 17345 uint_t unit; 17346 int err = 0; 17347 ipif_t *to_ipif; 17348 struct iocblk *iocp; 17349 boolean_t failback_cmd; 17350 boolean_t remove_ipif; 17351 int rc; 17352 ip_stack_t *ipst; 17353 17354 ASSERT(IAM_WRITER_ILL(to_ill)); 17355 ASSERT(IAM_WRITER_IPIF(ipif)); 17356 /* The ioctl command in mp tells failback apart from failover. */ 17357 iocp = (struct iocblk *)mp->b_rptr; 17358 failback_cmd = (iocp->ioc_cmd == SIOCLIFFAILBACK); 17359 remove_ipif = B_FALSE; 17360 17361 from_ill = ipif->ipif_ill; 17362 ipst = from_ill->ill_ipst; 17363 17364 ASSERT(MUTEX_HELD(&to_ill->ill_lock)); 17365 ASSERT(MUTEX_HELD(&from_ill->ill_lock)); 17366 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 17367 17368 /* 17369 * Don't move LINK LOCAL addresses as they are tied to 17370 * physical interface. 17371 */ 17372 if (from_ill->ill_isv6 && 17373 IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr)) { 17374 ipif->ipif_was_up = B_FALSE; 17375 IPIF_UNMARK_MOVING(ipif); 17376 return (0); 17377 } 17378 17379 /* 17380 * We set the ipif_id to maximum so that the search for 17381 * ipif_id will pick the lowest number i.e 0 in the 17382 * following 2 cases : 17383 * 17384 * 1) We have a replacement ipif at the head of to_ill. 17385 * We can't remove it yet as we can exceed ip_addrs_per_if 17386 * on to_ill and hence the MOVE might fail. We want to 17387 * remove it only if we could move the ipif.
Thus, by 17388 * setting it to the MAX value, we make the search in 17389 * ipif_get_id return the zeroth id. 17390 * 17391 * 2) When DR pulls out the NIC and re-plumbs the interface, 17392 * we might just have a zero address plumbed on the ipif 17393 * with zero id in the case of IPv4. We remove that while 17394 * doing the failback. We want to remove it only if we 17395 * could move the ipif. Thus, by setting it to the MAX 17396 * value, we make the search in ipif_get_id return the 17397 * zeroth id. 17398 * 17399 * Both (1) and (2) are done only when when we are moving 17400 * an ipif (either due to failover/failback) which originally 17401 * belonged to this interface i.e the ipif_orig_ifindex is 17402 * the same as to_ill's ifindex. This is needed so that 17403 * FAILOVER from A -> B ( A failed) followed by FAILOVER 17404 * from B -> A (B is being removed from the group) and 17405 * FAILBACK from A -> B restores the original configuration. 17406 * Without the check for orig_ifindex, the second FAILOVER 17407 * could make the ipif belonging to B replace the A's zeroth 17408 * ipif and the subsequent failback re-creating the replacement 17409 * ipif again. 17410 * 17411 * NOTE : We created the replacement ipif when we did a 17412 * FAILOVER (See below). We could check for FAILBACK and 17413 * then look for replacement ipif to be removed. But we don't 17414 * want to do that because we wan't to allow the possibility 17415 * of a FAILOVER from A -> B (which creates the replacement ipif), 17416 * followed by a *FAILOVER* from B -> A instead of a FAILBACK 17417 * from B -> A. 17418 */ 17419 to_ipif = to_ill->ill_ipif; 17420 if ((to_ill->ill_phyint->phyint_ifindex == 17421 ipif->ipif_orig_ifindex) && 17422 IPIF_REPL_CHECK(to_ipif, failback_cmd)) { 17423 ASSERT(to_ipif->ipif_id == 0); 17424 remove_ipif = B_TRUE; 17425 to_ipif->ipif_id = MAX_ADDRS_PER_IF; 17426 } 17427 /* 17428 * Find the lowest logical unit number on the to_ill. 
17429 * If we are failing back, try to get the original id 17430 * rather than the lowest one so that the original 17431 * configuration is maintained. 17432 * 17433 * XXX need a better scheme for this. 17434 */ 17435 if (failback_cmd) { 17436 unit = ipif_get_id(to_ill, ipif->ipif_orig_ipifid); 17437 } else { 17438 unit = ipif_get_id(to_ill, 0); 17439 } 17440 17441 /* Reset back to zero in case we fail below */ 17442 if (to_ipif->ipif_id == MAX_ADDRS_PER_IF) 17443 to_ipif->ipif_id = 0; 17444 17445 if (unit == ipst->ips_ip_addrs_per_if) { 17446 ipif->ipif_was_up = B_FALSE; 17447 IPIF_UNMARK_MOVING(ipif); 17448 return (EINVAL); 17449 } 17450 17451 /* 17452 * ipif is ready to move from "from_ill" to "to_ill". 17453 * 17454 * 1) If we are moving ipif with id zero, create a 17455 * replacement ipif for this ipif on from_ill. If this fails 17456 * fail the MOVE operation. 17457 * 17458 * 2) Remove the replacement ipif on to_ill if any. 17459 * We could remove the replacement ipif when we are moving 17460 * the ipif with id zero. But what if somebody already 17461 * unplumbed it ? Thus we always remove it if it is present. 17462 * We want to do it only if we are sure we are going to 17463 * move the ipif to to_ill which is why there are no 17464 * returns due to error till ipif is linked to to_ill. 17465 * Note that the first ipif that we failback will always 17466 * be zero if it is present. 17467 */ 17468 if (ipif->ipif_id == 0) { 17469 ipaddr_t inaddr_any = INADDR_ANY; 17470 17471 rep_ipif = (ipif_t *)mi_alloc(sizeof (ipif_t), BPRI_MED); 17472 if (rep_ipif == NULL) { 17473 ipif->ipif_was_up = B_FALSE; 17474 IPIF_UNMARK_MOVING(ipif); 17475 return (ENOMEM); 17476 } 17477 *rep_ipif = ipif_zero; 17478 /* 17479 * Before we put the ipif on the list, store the addresses 17480 * as mapped addresses as some of the ioctls e.g SIOCGIFADDR 17481 * assumes so. This logic is not any different from what 17482 * ipif_allocate does. 
17483 */ 17484 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 17485 &rep_ipif->ipif_v6lcl_addr); 17486 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 17487 &rep_ipif->ipif_v6src_addr); 17488 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 17489 &rep_ipif->ipif_v6subnet); 17490 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 17491 &rep_ipif->ipif_v6net_mask); 17492 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 17493 &rep_ipif->ipif_v6brd_addr); 17494 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 17495 &rep_ipif->ipif_v6pp_dst_addr); 17496 /* 17497 * We mark IPIF_NOFAILOVER so that this can never 17498 * move. 17499 */ 17500 rep_ipif->ipif_flags = ipif->ipif_flags | IPIF_NOFAILOVER; 17501 rep_ipif->ipif_flags &= ~IPIF_UP & ~IPIF_DUPLICATE; 17502 rep_ipif->ipif_replace_zero = B_TRUE; 17503 mutex_init(&rep_ipif->ipif_saved_ire_lock, NULL, 17504 MUTEX_DEFAULT, NULL); 17505 rep_ipif->ipif_id = 0; 17506 rep_ipif->ipif_ire_type = ipif->ipif_ire_type; 17507 rep_ipif->ipif_ill = from_ill; 17508 rep_ipif->ipif_orig_ifindex = 17509 from_ill->ill_phyint->phyint_ifindex; 17510 /* Insert at head */ 17511 rep_ipif->ipif_next = from_ill->ill_ipif; 17512 from_ill->ill_ipif = rep_ipif; 17513 /* 17514 * We don't really care to let apps know about 17515 * this interface. 17516 */ 17517 } 17518 17519 if (remove_ipif) { 17520 /* 17521 * We set to a max value above for this case to get 17522 * id zero. ASSERT that we did get one. 17523 */ 17524 ASSERT((to_ipif->ipif_id == 0) && (unit == 0)); 17525 rep_ipif = to_ipif; 17526 to_ill->ill_ipif = rep_ipif->ipif_next; 17527 rep_ipif->ipif_next = NULL; 17528 /* 17529 * If some apps scanned and find this interface, 17530 * it is time to let them know, so that they can 17531 * delete it. 17532 */ 17533 17534 *rep_ipif_ptr = rep_ipif; 17535 } 17536 17537 /* Get it out of the ILL interface list. 
*/ 17538 ipif_remove(ipif, B_FALSE); 17539 17540 /* Assign the new ill */ 17541 ipif->ipif_ill = to_ill; 17542 ipif->ipif_id = unit; 17543 /* id has already been checked */ 17544 rc = ipif_insert(ipif, B_FALSE, B_FALSE); 17545 ASSERT(rc == 0); 17546 /* Let SCTP update its list */ 17547 sctp_move_ipif(ipif, from_ill, to_ill); 17548 /* 17549 * Handle the failover and failback of ipif_t between 17550 * ill_t that have differing maximum mtu values. 17551 */ 17552 if (ipif->ipif_mtu > to_ill->ill_max_mtu) { 17553 if (ipif->ipif_saved_mtu == 0) { 17554 /* 17555 * As this ipif_t is moving to an ill_t 17556 * that has a lower ill_max_mtu, its 17557 * ipif_mtu needs to be saved so it can 17558 * be restored during failback or during 17559 * failover to an ill_t which has a 17560 * higher ill_max_mtu. 17561 */ 17562 ipif->ipif_saved_mtu = ipif->ipif_mtu; 17563 ipif->ipif_mtu = to_ill->ill_max_mtu; 17564 } else { 17565 /* 17566 * The ipif_t is, once again, moving to 17567 * an ill_t that has a lower maximum mtu 17568 * value. 17569 */ 17570 ipif->ipif_mtu = to_ill->ill_max_mtu; 17571 } 17572 } else if (ipif->ipif_mtu < to_ill->ill_max_mtu && 17573 ipif->ipif_saved_mtu != 0) { 17574 /* 17575 * The mtu of this ipif_t had to be reduced 17576 * during an earlier failover; this is an 17577 * opportunity for it to be increased (either as 17578 * part of another failover or a failback). 17579 */ 17580 if (ipif->ipif_saved_mtu <= to_ill->ill_max_mtu) { 17581 ipif->ipif_mtu = ipif->ipif_saved_mtu; 17582 ipif->ipif_saved_mtu = 0; 17583 } else { 17584 ipif->ipif_mtu = to_ill->ill_max_mtu; 17585 } 17586 } 17587 17588 /* 17589 * We preserve all the other fields of the ipif including 17590 * ipif_saved_ire_mp. The routes that are saved here will 17591 * be recreated on the new interface and back on the old 17592 * interface when we move back. 
*/
	ASSERT(ipif->ipif_arp_del_mp == NULL);

	return (err);
}

/*
 * Move the ipifs hanging off from_ill over to to_ill by calling ipif_move()
 * on each candidate.  An ipif is a candidate when it is not marked
 * IPIF_NOFAILOVER and either `ifindex' is 0 (no filtering) or its
 * ipif_orig_ifindex equals `ifindex'.  `q', `mp' and `rep_ipif_ptr' are
 * passed through to ipif_move() unchanged.
 *
 * Returns 0 if every ipif_move() call returned 0; otherwise returns the
 * first non-zero value from ipif_move() and stops walking.  ipifs that were
 * moved before the failure are not moved back.
 */
static int
ipif_move_all(ill_t *from_ill, ill_t *to_ill, queue_t *q, mblk_t *mp,
    int ifindex, ipif_t **rep_ipif_ptr)
{
	ipif_t *mipif;
	ipif_t *ipif_next;
	int err;

	/*
	 * We don't really try to MOVE back things if some of the
	 * operations fail. The daemon will take care of moving again
	 * later on.
	 */
	for (mipif = from_ill->ill_ipif; mipif != NULL; mipif = ipif_next) {
		/*
		 * Sample the successor first: ipif_move() unlinks mipif from
		 * this list and relinks it on to_ill.
		 */
		ipif_next = mipif->ipif_next;
		if (!(mipif->ipif_flags & IPIF_NOFAILOVER) &&
		    (ifindex == 0 || ifindex == mipif->ipif_orig_ifindex)) {

			err = ipif_move(mipif, to_ill, q, mp, rep_ipif_ptr);

			/*
			 * When the MOVE fails, it is the job of the
			 * application to take care of this properly
			 * i.e. try again if it is ENOMEM.
			 */
			if (mipif->ipif_ill != from_ill) {
				/*
				 * ipif has moved.
				 *
				 * Move the multicast memberships associated
				 * with this ipif to the new ill. For IPv6, we
				 * do it once after all the ipifs are moved
				 * (in ill_move) as they are not associated
				 * with ipifs.
				 *
				 * We need to move the ilms as the ipif has
				 * already been moved to a new ill even
				 * in the case of errors. Neither
				 * ilm_free(ipif) will find the ilm
				 * when somebody unplumbs this ipif nor
				 * ilm_delete(ilm) will be able to find the
				 * ilm, if we don't move now.
				 */
				if (!from_ill->ill_isv6)
					ilm_move_v4(from_ill, to_ill, mipif);
			}

			/* Checked only after the ilms have been dealt with. */
			if (err != 0)
				return (err);
		}
	}
	return (0);
}

/*
 * Perform the move of one address family's ill for the SIOCLIFFAILOVER or
 * SIOCLIFFAILBACK ioctl carried in `mp'.
 *
 * The matching ipifs on from_ill are first brought down with
 * ill_down_ipifs().  If from_ill (or, when a replacement ipif on to_ill may
 * be consumed, to_ill) is not yet quiescent, `mp' is queued with
 * ipsq_pending_mp_add() and EINPROGRESS is returned.  Otherwise the ipifs
 * are moved with ipif_move_all(), IPv6 ilms are moved with ilm_move_v6(),
 * any replacement ipif handed back through rep_ipif_ptr is announced as
 * deleted and freed, and conn_move_ill() is called.
 *
 * Returns EINPROGRESS as described above, otherwise the value returned by
 * ipif_move_all().
 */
static int
ill_move(ill_t *from_ill, ill_t *to_ill, queue_t *q, mblk_t *mp)
{
	int ifindex;
	int err;
	struct iocblk *iocp;
	ipif_t	*ipif;
	ipif_t *rep_ipif_ptr = NULL;
	ipif_t	*from_ipif = NULL;
	boolean_t check_rep_if = B_FALSE;
	ip_stack_t	*ipst = from_ill->ill_ipst;

	iocp = (struct iocblk *)mp->b_rptr;
	if (iocp->ioc_cmd == SIOCLIFFAILOVER) {
		/*
		 * Move everything pointing at from_ill to to_ill.
		 * We achieve this by passing in 0 as ifindex.
		 */
		ifindex = 0;
	} else {
		/*
		 * Move everything pointing at from_ill whose original
		 * ifindex of connp, ipif, ilm points at to_ill->ill_index.
		 * We achieve this by passing in ifindex rather than 0.
		 * Multicast vifs, ilgs move implicitly because ipifs move.
		 */
		ASSERT(iocp->ioc_cmd == SIOCLIFFAILBACK);
		ifindex = to_ill->ill_phyint->phyint_ifindex;
	}

	/*
	 * Determine if there is at least one ipif that would move from
	 * 'from_ill' to 'to_ill'. If so, it is possible that the replacement
	 * ipif (if it exists) on the to_ill would be consumed as a result of
	 * the move, in which case we need to quiesce the replacement ipif also.
	 */
	for (from_ipif = from_ill->ill_ipif; from_ipif != NULL;
	    from_ipif = from_ipif->ipif_next) {
		if (((ifindex == 0) ||
		    (ifindex == from_ipif->ipif_orig_ifindex)) &&
		    !(from_ipif->ipif_flags & IPIF_NOFAILOVER)) {
			check_rep_if = B_TRUE;
			break;
		}
	}


	ill_down_ipifs(from_ill, mp, ifindex, B_TRUE);

	GRAB_ILL_LOCKS(from_ill, to_ill);
	if ((ipif = ill_quiescent_to_move(from_ill)) != NULL) {
		/* Not quiescent yet: park the ioctl and wait. */
		(void) ipsq_pending_mp_add(NULL, ipif, q,
		    mp, ILL_MOVE_OK);
		RELEASE_ILL_LOCKS(from_ill, to_ill);
		return (EINPROGRESS);
	}

	/* Check if the replacement ipif is quiescent to delete */
	if (check_rep_if && IPIF_REPL_CHECK(to_ill->ill_ipif,
	    (iocp->ioc_cmd == SIOCLIFFAILBACK))) {
		to_ill->ill_ipif->ipif_state_flags |=
		    IPIF_MOVING | IPIF_CHANGING;
		if ((ipif = ill_quiescent_to_move(to_ill)) != NULL) {
			(void) ipsq_pending_mp_add(NULL, ipif, q,
			    mp, ILL_MOVE_OK);
			RELEASE_ILL_LOCKS(from_ill, to_ill);
			return (EINPROGRESS);
		}
	}
	RELEASE_ILL_LOCKS(from_ill, to_ill);

	/*
	 * The ill locks were dropped above and are re-taken only after
	 * ips_ill_g_lock has been entered as writer.
	 */
	ASSERT(!MUTEX_HELD(&to_ill->ill_lock));
	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
	GRAB_ILL_LOCKS(from_ill, to_ill);
	err = ipif_move_all(from_ill, to_ill, q, mp, ifindex, &rep_ipif_ptr);

	/* For IPv4 the ilms are moved per ipif, in ipif_move_all(). */
	if (err == 0 && from_ill->ill_isv6)
		ilm_move_v6(from_ill, to_ill, ifindex);

	RELEASE_ILL_LOCKS(from_ill, to_ill);
	rw_exit(&ipst->ips_ill_g_lock);

	/*
	 * Send rts messages and multicast messages.
	 */
	if (rep_ipif_ptr != NULL) {
		/* Cancel any outstanding recovery timeout before freeing. */
		if (rep_ipif_ptr->ipif_recovery_id != 0) {
			(void) untimeout(rep_ipif_ptr->ipif_recovery_id);
			rep_ipif_ptr->ipif_recovery_id = 0;
		}
		ip_rts_ifmsg(rep_ipif_ptr);
		ip_rts_newaddrmsg(RTM_DELETE, 0, rep_ipif_ptr);
#ifdef DEBUG
		ipif_trace_cleanup(rep_ipif_ptr);
#endif
		mi_free(rep_ipif_ptr);
	}

	/* Note: this runs whether or not ipif_move_all() reported an error. */
	conn_move_ill(from_ill, to_ill, ifindex);

	return (err);
}

/*
 * Used to extract arguments for FAILOVER/FAILBACK ioctls.
 * Also checks for the validity of the arguments.
 * Note: We are already exclusive inside the from group.
 * It is up to the caller to release refcnt on the to_ill's.
 *
 * The lifreq is read from the third mblk of the chain (mp->b_cont->b_cont).
 * On return *ill_from_v4 / *ill_from_v6 are set from the id-0 ipifs found
 * under lifr_name (each is left untouched when no such ipif exists), and
 * *ill_to_v4 / *ill_to_v6 hold whatever ill_lookup_on_ifindex() returned
 * for lifr_movetoindex.
 *
 * Returns 0 on success, or:
 *	EPROTO	the mblk chain is shorter than three blocks
 *	EINVAL	sin_family is not AF_UNSPEC, a named ipif has a non-zero
 *		id, or a "from" ill was found without a matching "to" ill
 *	ENXIO	no ipif matches lifr_name, or no ill matches the index
 *	other	any non-ENXIO error reported by ill_lookup_on_ifindex()
 */
static int
ip_extract_move_args(queue_t *q, mblk_t *mp, ill_t **ill_from_v4,
    ill_t **ill_from_v6, ill_t **ill_to_v4, ill_t **ill_to_v6)
{
	int dst_index;
	ipif_t *ipif_v4, *ipif_v6;
	struct lifreq *lifr;
	mblk_t *mp1;
	boolean_t exists;	/* filled in by the lookups; not examined */
	sin_t	*sin;
	int	err = 0;
	ip_stack_t	*ipst;

	if (CONN_Q(q))
		ipst = CONNQ_TO_IPST(q);
	else
		ipst = ILLQ_TO_IPST(q);


	if ((mp1 = mp->b_cont) == NULL)
		return (EPROTO);

	if ((mp1 = mp1->b_cont) == NULL)
		return (EPROTO);

	lifr = (struct lifreq *)mp1->b_rptr;
	sin = (sin_t *)&lifr->lifr_addr;

	/*
	 * We operate on both IPv4 and IPv6. Thus, we don't allow IPv4/IPv6
	 * specific operations.
	 */
	if (sin->sin_family != AF_UNSPEC)
		return (EINVAL);

	/*
	 * Get ipif with id 0. We are writer on the from ill. So we can pass
	 * NULLs for the four queue/mblk/callback/error arguments and we know
	 * the lookup won't fail with EINPROGRESS.
	 */
	ipif_v4 = ipif_lookup_on_name(lifr->lifr_name,
	    mi_strlen(lifr->lifr_name), B_FALSE, &exists, B_FALSE,
	    ALL_ZONES, NULL, NULL, NULL, NULL, ipst);
	ipif_v6 = ipif_lookup_on_name(lifr->lifr_name,
	    mi_strlen(lifr->lifr_name), B_FALSE, &exists, B_TRUE,
	    ALL_ZONES, NULL, NULL, NULL, NULL, ipst);

	if (ipif_v4 == NULL && ipif_v6 == NULL)
		return (ENXIO);

	if (ipif_v4 != NULL) {
		ASSERT(ipif_v4->ipif_refcnt != 0);
		if (ipif_v4->ipif_id != 0) {
			err = EINVAL;
			goto done;
		}

		ASSERT(IAM_WRITER_IPIF(ipif_v4));
		*ill_from_v4 = ipif_v4->ipif_ill;
	}

	if (ipif_v6 != NULL) {
		ASSERT(ipif_v6->ipif_refcnt != 0);
		if (ipif_v6->ipif_id != 0) {
			err = EINVAL;
			goto done;
		}

		ASSERT(IAM_WRITER_IPIF(ipif_v6));
		*ill_from_v6 = ipif_v6->ipif_ill;
	}

	err = 0;
	dst_index = lifr->lifr_movetoindex;
	*ill_to_v4 = ill_lookup_on_ifindex(dst_index, B_FALSE,
	    q, mp, ip_process_ioctl, &err, ipst);
	if (err != 0) {
		/*
		 * There could be only v6.
		 */
		if (err != ENXIO)
			goto done;
		err = 0;
	}

	*ill_to_v6 = ill_lookup_on_ifindex(dst_index, B_TRUE,
	    q, mp, ip_process_ioctl, &err, ipst);
	if (err != 0) {
		if (err != ENXIO)
			goto done;
		/* ENXIO for v6 is fatal only if v4 was missing as well. */
		if (*ill_to_v4 == NULL) {
			err = ENXIO;
			goto done;
		}
		err = 0;
	}

	/*
	 * If we have something to MOVE i.e "from" not NULL,
	 * "to" should be non-NULL.
	 */
	if ((*ill_from_v4 != NULL && *ill_to_v4 == NULL) ||
	    (*ill_from_v6 != NULL && *ill_to_v6 == NULL)) {
		err = EINVAL;
	}

done:
	/*
	 * Release the ipifs obtained from the name lookups.  The to_ill
	 * references are deliberately left for the caller to release.
	 */
	if (ipif_v4 != NULL)
		ipif_refrele(ipif_v4);
	if (ipif_v6 != NULL)
		ipif_refrele(ipif_v6);
	return (err);
}

/*
 * FAILOVER and FAILBACK are modelled as MOVE operations.
*
 * We don't check whether the MOVE is within the same group or
 * not, because this ioctl can be used as a generic mechanism
 * to failover from interface A to B, though things will function
 * only if they are really part of the same group. Moreover,
 * all ipifs may be down and hence temporarily out of the group.
 *
 * ipif's that need to be moved are first brought down; V4 ipifs are brought
 * down first and then V6. For each we wait for the ipif's to become quiescent.
 * Bringing down the ipifs ensures that all ires pointing to these ipifs
 * have been deleted and there are no active references. Once quiescent the
 * ipif's are moved and brought up on the new ill.
 *
 * Normally the source ill and destination ill belong to the same IPMP group
 * and hence the same ipsq_t. In the event they don't belong to the same
 * group the two ipsq's are first merged into one ipsq - that of the
 * to_ill. The multicast memberships on the source and destination ill cannot
 * change during the move operation since multicast joins/leaves also have to
 * execute on the same ipsq and are hence serialized.
 *
 * The `ipif', `sin', `ipip' and `ifreq' arguments are not used; everything
 * is taken from `mp' by ip_extract_move_args().  Returns 0, EINPROGRESS
 * when the operation has been queued to finish later, or an error value.
 * The references on the to_ill's are released before returning on every
 * path.
 */
/* ARGSUSED */
int
ip_sioctl_move(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
	ill_t *ill_to_v4 = NULL;
	ill_t *ill_to_v6 = NULL;
	ill_t *ill_from_v4 = NULL;
	ill_t *ill_from_v6 = NULL;
	int err = 0;

	/*
	 * setup from and to ill's, we can get EINPROGRESS only for
	 * to_ill's.
	 */
	err = ip_extract_move_args(q, mp, &ill_from_v4, &ill_from_v6,
	    &ill_to_v4, &ill_to_v6);

	if (err != 0) {
		ip0dbg(("ip_sioctl_move: extract args failed\n"));
		goto done;
	}

	/*
	 * Source and destination are the same v4 ill: nothing to do.
	 */
	if ((ill_from_v4 != NULL) && (ill_from_v4 == ill_to_v4)) {
		goto done;
	}

	/*
	 * Source and destination are the same v6 ill: nothing to do.
	 */
	if ((ill_from_v6 != NULL) && (ill_from_v6 == ill_to_v6)) {
		goto done;
	}

	/*
	 * Mark the ill as changing.
	 * ILL_CHANGING flag is cleared when the ipif's are brought up
	 * in ill_up_ipifs; in case of error they are cleared below.
	 */

	GRAB_ILL_LOCKS(ill_from_v4, ill_from_v6);
	if (ill_from_v4 != NULL)
		ill_from_v4->ill_state_flags |= ILL_CHANGING;
	if (ill_from_v6 != NULL)
		ill_from_v6->ill_state_flags |= ILL_CHANGING;
	RELEASE_ILL_LOCKS(ill_from_v4, ill_from_v6);

	/*
	 * Make sure that both src and dst are
	 * in the same syncq group. If not make it happen.
	 * We are not holding any locks because we are the writer
	 * on the from_ipsq and we will hold locks in ill_merge_groups
	 * to protect to_ipsq against changing.
	 *
	 * Whatever ill_merge_groups() returns, the move itself is not
	 * attempted in this pass: control goes straight to err_ret.
	 */
	if (ill_from_v4 != NULL) {
		if (ill_from_v4->ill_phyint->phyint_ipsq !=
		    ill_to_v4->ill_phyint->phyint_ipsq) {
			err = ill_merge_groups(ill_from_v4, ill_to_v4,
			    NULL, mp, q);
			goto err_ret;

		}
		ASSERT(!MUTEX_HELD(&ill_to_v4->ill_lock));
	} else {

		if (ill_from_v6->ill_phyint->phyint_ipsq !=
		    ill_to_v6->ill_phyint->phyint_ipsq) {
			err = ill_merge_groups(ill_from_v6, ill_to_v6,
			    NULL, mp, q);
			goto err_ret;

		}
		ASSERT(!MUTEX_HELD(&ill_to_v6->ill_lock));
	}

	/*
	 * Now that the ipsq's have been merged and we are the writer
	 * lets mark to_ill as changing as well.
	 */

	GRAB_ILL_LOCKS(ill_to_v4, ill_to_v6);
	if (ill_to_v4 != NULL)
		ill_to_v4->ill_state_flags |= ILL_CHANGING;
	if (ill_to_v6 != NULL)
		ill_to_v6->ill_state_flags |= ILL_CHANGING;
	RELEASE_ILL_LOCKS(ill_to_v4, ill_to_v6);

	/*
	 * It's ok for us to proceed with the move even if
	 * ill_pending_mp is non null on one of the from ill's as the reply
	 * should not be looking at the ipif, it should only care about the
	 * ill itself.
	 */

	/*
	 * lets move ipv4 first.
	 */
	if (ill_from_v4 != NULL) {
		ASSERT(IAM_WRITER_ILL(ill_to_v4));
		ill_from_v4->ill_move_in_progress = B_TRUE;
		ill_to_v4->ill_move_in_progress = B_TRUE;
		ill_to_v4->ill_move_peer = ill_from_v4;
		ill_from_v4->ill_move_peer = ill_to_v4;
		err = ill_move(ill_from_v4, ill_to_v4, q, mp);
	}

	/*
	 * Now lets move ipv6.  Skipped if the v4 move failed or is pending.
	 */
	if (err == 0 && ill_from_v6 != NULL) {
		ASSERT(IAM_WRITER_ILL(ill_to_v6));
		ill_from_v6->ill_move_in_progress = B_TRUE;
		ill_to_v6->ill_move_in_progress = B_TRUE;
		ill_to_v6->ill_move_peer = ill_from_v6;
		ill_from_v6->ill_move_peer = ill_to_v6;
		err = ill_move(ill_from_v6, ill_to_v6, q, mp);
	}

err_ret:
	/*
	 * EINPROGRESS means we are waiting for the ipif's that need to be
	 * moved to become quiescent.  The move state set above is left in
	 * place in that case.
	 */
	if (err == EINPROGRESS) {
		goto done;
	}

	/*
	 * if err is set ill_up_ipifs will not be called
	 * lets clear the flags.
	 */

	GRAB_ILL_LOCKS(ill_to_v4, ill_to_v6);
	GRAB_ILL_LOCKS(ill_from_v4, ill_from_v6);
	/*
	 * Some of the clearing may be redundant. But it is simple
	 * not making any extra checks.
	 */
	if (ill_from_v6 != NULL) {
		ill_from_v6->ill_move_in_progress = B_FALSE;
		ill_from_v6->ill_move_peer = NULL;
		ill_from_v6->ill_state_flags &= ~ILL_CHANGING;
	}
	if (ill_from_v4 != NULL) {
		ill_from_v4->ill_move_in_progress = B_FALSE;
		ill_from_v4->ill_move_peer = NULL;
		ill_from_v4->ill_state_flags &= ~ILL_CHANGING;
	}
	if (ill_to_v6 != NULL) {
		ill_to_v6->ill_move_in_progress = B_FALSE;
		ill_to_v6->ill_move_peer = NULL;
		ill_to_v6->ill_state_flags &= ~ILL_CHANGING;
	}
	if (ill_to_v4 != NULL) {
		ill_to_v4->ill_move_in_progress = B_FALSE;
		ill_to_v4->ill_move_peer = NULL;
		ill_to_v4->ill_state_flags &= ~ILL_CHANGING;
	}

	/*
	 * Check for setting INACTIVE, if STANDBY is set and FAILED is not set.
	 * Do this always to maintain proper state i.e even in case of errors.
	 * As phyint_inactive looks at both v4 and v6 interfaces,
	 * we need not call on both v4 and v6 interfaces.
	 */
	if (ill_from_v4 != NULL) {
		if ((ill_from_v4->ill_phyint->phyint_flags &
		    (PHYI_STANDBY | PHYI_FAILED)) == PHYI_STANDBY) {
			phyint_inactive(ill_from_v4->ill_phyint);
		}
	} else if (ill_from_v6 != NULL) {
		if ((ill_from_v6->ill_phyint->phyint_flags &
		    (PHYI_STANDBY | PHYI_FAILED)) == PHYI_STANDBY) {
			phyint_inactive(ill_from_v6->ill_phyint);
		}
	}

	/* The destination phyint is in use now: clear PHYI_INACTIVE on it. */
	if (ill_to_v4 != NULL) {
		if (ill_to_v4->ill_phyint->phyint_flags & PHYI_INACTIVE) {
			ill_to_v4->ill_phyint->phyint_flags &= ~PHYI_INACTIVE;
		}
	} else if (ill_to_v6 != NULL) {
		if (ill_to_v6->ill_phyint->phyint_flags & PHYI_INACTIVE) {
			ill_to_v6->ill_phyint->phyint_flags &= ~PHYI_INACTIVE;
		}
	}

	RELEASE_ILL_LOCKS(ill_to_v4, ill_to_v6);
	RELEASE_ILL_LOCKS(ill_from_v4, ill_from_v6);

	/* NOTE(review): no goto in this function targets the label below. */
no_err:
	/*
	 * lets bring the interfaces up on the to_ill.
	 */
	if (err == 0) {
		err = ill_up_ipifs(ill_to_v4 == NULL ? ill_to_v6:ill_to_v4,
		    q, mp);
	}

	if (err == 0) {
		if (ill_from_v4 != NULL && ill_to_v4 != NULL)
			ilm_send_multicast_reqs(ill_from_v4, ill_to_v4);

		if (ill_from_v6 != NULL && ill_to_v6 != NULL)
			ilm_send_multicast_reqs(ill_from_v6, ill_to_v6);
	}
done:

	/* Drop the to_ill references left held by ip_extract_move_args(). */
	if (ill_to_v4 != NULL) {
		ill_refrele(ill_to_v4);
	}
	if (ill_to_v6 != NULL) {
		ill_refrele(ill_to_v6);
	}

	return (err);
}

/*
 * Take the data link under `ill' down: send the saved unbind message (if
 * there is one), leave all multicast groups, clear ill_dl_up and replace
 * ill_nic_event_info with a freshly allocated NE_DOWN event (or NULL when
 * the allocation fails).
 */
static void
ill_dl_down(ill_t *ill)
{
	/*
	 * The ill is down; unbind but stay attached since we're still
	 * associated with a PPA. If we have negotiated DLPI capabilities
	 * with the data link service provider (IDS_OK) then reset them.
	 * The interval between unbinding and rebinding is potentially
	 * unbounded hence we cannot assume things will be the same.
	 * The DLPI capabilities will be probed again when the data link
	 * is brought up.
	 */
	mblk_t	*mp = ill->ill_unbind_mp;
	hook_nic_event_t *info;

	ip1dbg(("ill_dl_down(%s)\n", ill->ill_name));

	/* The message is consumed here; detach it from the ill. */
	ill->ill_unbind_mp = NULL;
	if (mp != NULL) {
		ip1dbg(("ill_dl_down: %s (%u) for %s\n",
		    dlpi_prim_str(*(int *)mp->b_rptr), *(int *)mp->b_rptr,
		    ill->ill_name));
		mutex_enter(&ill->ill_lock);
		ill->ill_state_flags |= ILL_DL_UNBIND_IN_PROGRESS;
		mutex_exit(&ill->ill_lock);
		/*
		 * Reset the capabilities if the negotiation is done or is
		 * still in progress.  Note that ill_capability_reset() will
		 * set ill_dlpi_capab_state to IDS_UNKNOWN, so the subsequent
		 * DL_CAPABILITY_ACK and DL_NOTE_CAPAB_RENEG will be ignored.
		 *
		 * Further, reset ill_capab_reneg to be B_FALSE so that the
		 * subsequent DL_CAPABILITY_ACK can be ignored, to prevent
		 * the capabilities renegotiation from happening.
		 */
		if (ill->ill_dlpi_capab_state != IDS_UNKNOWN)
			ill_capability_reset(ill);
		ill->ill_capab_reneg = B_FALSE;

		ill_dlpi_send(ill, mp);
	}

	/*
	 * Toss all of our multicast memberships.  We could keep them, but
	 * then we'd have to do bookkeeping of any joins and leaves performed
	 * by the application while the interface is down (we can't just
	 * issue them because arp cannot currently process AR_ENTRY_SQUERY's
	 * on a downed interface).
	 */
	ill_leave_multicast(ill);

	mutex_enter(&ill->ill_lock);

	ill->ill_dl_up = 0;

	/* Discard any event still attached before attaching the new one. */
	if ((info = ill->ill_nic_event_info) != NULL) {
		ip2dbg(("ill_dl_down:unexpected nic event %d attached for %s\n",
		    info->hne_event, ill->ill_name));
		if (info->hne_data != NULL)
			kmem_free(info->hne_data, info->hne_datalen);
		kmem_free(info, sizeof (hook_nic_event_t));
	}

	/* KM_NOSLEEP: ill_lock is held across this allocation. */
	info = kmem_alloc(sizeof (hook_nic_event_t), KM_NOSLEEP);
	if (info != NULL) {
		ip_stack_t	*ipst = ill->ill_ipst;

		info->hne_nic = ill->ill_phyint->phyint_hook_ifindex;
		info->hne_lif = 0;
		info->hne_event = NE_DOWN;
		info->hne_data = NULL;
		info->hne_datalen = 0;
		info->hne_family = ill->ill_isv6 ?
		    ipst->ips_ipv6_net_data : ipst->ips_ipv4_net_data;
	} else
		ip2dbg(("ill_dl_down: could not attach DOWN nic event "
		    "information for %s (ENOMEM)\n", ill->ill_name));

	/* Stores NULL when the allocation above failed. */
	ill->ill_nic_event_info = info;

	mutex_exit(&ill->ill_lock);
}

/*
 * Pass the DLPI message `mp' down ill_wq with putnext(), first doing the
 * per-primitive bookkeeping and (unless the ill is condemned and the
 * primitive is something other than DL_UNBIND_REQ) recording the primitive
 * in ill_dlpi_pending.
 */
static void
ill_dlpi_dispatch(ill_t *ill, mblk_t *mp)
{
	union DL_primitives *dlp;
	t_uscalar_t prim;

	ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);

	dlp = (union DL_primitives *)mp->b_rptr;
	prim = dlp->dl_primitive;

	ip1dbg(("ill_dlpi_dispatch: sending %s (%u) to %s\n",
	    dlpi_prim_str(prim), prim, ill->ill_name));

	switch (prim) {
	case DL_PHYS_ADDR_REQ:
	{
		/* Remember which address type was requested. */
		dl_phys_addr_req_t *dlpap = (dl_phys_addr_req_t *)mp->b_rptr;
		ill->ill_phys_addr_pend = dlpap->dl_addr_type;
		break;
	}
	case DL_BIND_REQ:
		/* A bind request ends any unbind-in-progress state. */
		mutex_enter(&ill->ill_lock);
		ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS;
		mutex_exit(&ill->ill_lock);
		break;
	}

	/*
	 * Except for the ACKs for the M_PCPROTO messages, all other ACKs
	 * are dropped by ip_rput() if ILL_CONDEMNED is set.  Therefore
	 * we only wait for the ACK of the DL_UNBIND_REQ.
	 */
	mutex_enter(&ill->ill_lock);
	if (!(ill->ill_state_flags & ILL_CONDEMNED) ||
	    (prim == DL_UNBIND_REQ)) {
		ill->ill_dlpi_pending = prim;
	}
	mutex_exit(&ill->ill_lock);

	putnext(ill->ill_wq, mp);
}

/*
 * Helper function for ill_dlpi_send(): the callback handed to qwriter_ip(),
 * which re-issues the send using the ill stored in q->q_ptr.
 */
/* ARGSUSED */
static void
ill_dlpi_send_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
{
	ill_dlpi_send((ill_t *)q->q_ptr, mp);
}

/*
 * Send a DLPI control message to the driver but make sure there
 * is only one outstanding message. Uses ill_dlpi_pending to tell
 * when it must queue.
ip_rput_dlpi_writer calls ill_dlpi_done() 18260 * when an ACK or a NAK is received to process the next queued message. 18261 */ 18262 void 18263 ill_dlpi_send(ill_t *ill, mblk_t *mp) 18264 { 18265 mblk_t **mpp; 18266 18267 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); 18268 18269 /* 18270 * To ensure that any DLPI requests for current exclusive operation 18271 * are always completely sent before any DLPI messages for other 18272 * operations, require writer access before enqueuing. 18273 */ 18274 if (!IAM_WRITER_ILL(ill)) { 18275 ill_refhold(ill); 18276 /* qwriter_ip() does the ill_refrele() */ 18277 qwriter_ip(ill, ill->ill_wq, mp, ill_dlpi_send_writer, 18278 NEW_OP, B_TRUE); 18279 return; 18280 } 18281 18282 mutex_enter(&ill->ill_lock); 18283 if (ill->ill_dlpi_pending != DL_PRIM_INVAL) { 18284 /* Must queue message. Tail insertion */ 18285 mpp = &ill->ill_dlpi_deferred; 18286 while (*mpp != NULL) 18287 mpp = &((*mpp)->b_next); 18288 18289 ip1dbg(("ill_dlpi_send: deferring request for %s\n", 18290 ill->ill_name)); 18291 18292 *mpp = mp; 18293 mutex_exit(&ill->ill_lock); 18294 return; 18295 } 18296 mutex_exit(&ill->ill_lock); 18297 ill_dlpi_dispatch(ill, mp); 18298 } 18299 18300 /* 18301 * Send all deferred DLPI messages without waiting for their ACKs. 18302 */ 18303 void 18304 ill_dlpi_send_deferred(ill_t *ill) 18305 { 18306 mblk_t *mp, *nextmp; 18307 18308 /* 18309 * Clear ill_dlpi_pending so that the message is not queued in 18310 * ill_dlpi_send(). 18311 */ 18312 mutex_enter(&ill->ill_lock); 18313 ill->ill_dlpi_pending = DL_PRIM_INVAL; 18314 mp = ill->ill_dlpi_deferred; 18315 ill->ill_dlpi_deferred = NULL; 18316 mutex_exit(&ill->ill_lock); 18317 18318 for (; mp != NULL; mp = nextmp) { 18319 nextmp = mp->b_next; 18320 mp->b_next = NULL; 18321 ill_dlpi_send(ill, mp); 18322 } 18323 } 18324 18325 /* 18326 * Check if the DLPI primitive `prim' is pending; print a warning if not. 
18327 */ 18328 boolean_t 18329 ill_dlpi_pending(ill_t *ill, t_uscalar_t prim) 18330 { 18331 t_uscalar_t pending; 18332 18333 mutex_enter(&ill->ill_lock); 18334 if (ill->ill_dlpi_pending == prim) { 18335 mutex_exit(&ill->ill_lock); 18336 return (B_TRUE); 18337 } 18338 18339 /* 18340 * During teardown, ill_dlpi_dispatch() will send DLPI requests 18341 * without waiting, so don't print any warnings in that case. 18342 */ 18343 if (ill->ill_state_flags & ILL_CONDEMNED) { 18344 mutex_exit(&ill->ill_lock); 18345 return (B_FALSE); 18346 } 18347 pending = ill->ill_dlpi_pending; 18348 mutex_exit(&ill->ill_lock); 18349 18350 if (pending == DL_PRIM_INVAL) { 18351 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE, 18352 "received unsolicited ack for %s on %s\n", 18353 dlpi_prim_str(prim), ill->ill_name); 18354 } else { 18355 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE, 18356 "received unexpected ack for %s on %s (expecting %s)\n", 18357 dlpi_prim_str(prim), ill->ill_name, dlpi_prim_str(pending)); 18358 } 18359 return (B_FALSE); 18360 } 18361 18362 /* 18363 * Called when an DLPI control message has been acked or nacked to 18364 * send down the next queued message (if any). 
18365 */ 18366 void 18367 ill_dlpi_done(ill_t *ill, t_uscalar_t prim) 18368 { 18369 mblk_t *mp; 18370 18371 ASSERT(IAM_WRITER_ILL(ill)); 18372 mutex_enter(&ill->ill_lock); 18373 18374 ASSERT(prim != DL_PRIM_INVAL); 18375 ASSERT(ill->ill_dlpi_pending == prim); 18376 18377 ip1dbg(("ill_dlpi_done: %s has completed %s (%u)\n", ill->ill_name, 18378 dlpi_prim_str(ill->ill_dlpi_pending), ill->ill_dlpi_pending)); 18379 18380 if ((mp = ill->ill_dlpi_deferred) == NULL) { 18381 ill->ill_dlpi_pending = DL_PRIM_INVAL; 18382 cv_signal(&ill->ill_cv); 18383 mutex_exit(&ill->ill_lock); 18384 return; 18385 } 18386 18387 ill->ill_dlpi_deferred = mp->b_next; 18388 mp->b_next = NULL; 18389 mutex_exit(&ill->ill_lock); 18390 18391 ill_dlpi_dispatch(ill, mp); 18392 } 18393 18394 void 18395 conn_delete_ire(conn_t *connp, caddr_t arg) 18396 { 18397 ipif_t *ipif = (ipif_t *)arg; 18398 ire_t *ire; 18399 18400 /* 18401 * Look at the cached ires on conns which has pointers to ipifs. 18402 * We just call ire_refrele which clears up the reference 18403 * to ire. Called when a conn closes. Also called from ipif_free 18404 * to cleanup indirect references to the stale ipif via the cached ire. 18405 */ 18406 mutex_enter(&connp->conn_lock); 18407 ire = connp->conn_ire_cache; 18408 if (ire != NULL && (ipif == NULL || ire->ire_ipif == ipif)) { 18409 connp->conn_ire_cache = NULL; 18410 mutex_exit(&connp->conn_lock); 18411 IRE_REFRELE_NOTR(ire); 18412 return; 18413 } 18414 mutex_exit(&connp->conn_lock); 18415 18416 } 18417 18418 /* 18419 * Some operations (illgrp_delete(), ipif_down()) conditionally delete a number 18420 * of IREs. Those IREs may have been previously cached in the conn structure. 18421 * This ipcl_walk() walker function releases all references to such IREs based 18422 * on the condemned flag. 
18423 */ 18424 /* ARGSUSED */ 18425 void 18426 conn_cleanup_stale_ire(conn_t *connp, caddr_t arg) 18427 { 18428 ire_t *ire; 18429 18430 mutex_enter(&connp->conn_lock); 18431 ire = connp->conn_ire_cache; 18432 if (ire != NULL && (ire->ire_marks & IRE_MARK_CONDEMNED)) { 18433 connp->conn_ire_cache = NULL; 18434 mutex_exit(&connp->conn_lock); 18435 IRE_REFRELE_NOTR(ire); 18436 return; 18437 } 18438 mutex_exit(&connp->conn_lock); 18439 } 18440 18441 /* 18442 * Take down a specific interface, but don't lose any information about it. 18443 * Also delete interface from its interface group (ifgrp). 18444 * (Always called as writer.) 18445 * This function goes through the down sequence even if the interface is 18446 * already down. There are 2 reasons. 18447 * a. Currently we permit interface routes that depend on down interfaces 18448 * to be added. This behaviour itself is questionable. However it appears 18449 * that both Solaris and 4.3 BSD have exhibited this behaviour for a long 18450 * time. We go thru the cleanup in order to remove these routes. 18451 * b. The bringup of the interface could fail in ill_dl_up i.e. we get 18452 * DL_ERROR_ACK in response to the the DL_BIND request. The interface is 18453 * down, but we need to cleanup i.e. do ill_dl_down and 18454 * ip_rput_dlpi_writer (DL_ERROR_ACK) -> ipif_down. 18455 * 18456 * IP-MT notes: 18457 * 18458 * Model of reference to interfaces. 18459 * 18460 * The following members in ipif_t track references to the ipif. 18461 * int ipif_refcnt; Active reference count 18462 * uint_t ipif_ire_cnt; Number of ire's referencing this ipif 18463 * The following members in ill_t track references to the ill. 18464 * int ill_refcnt; active refcnt 18465 * uint_t ill_ire_cnt; Number of ires referencing ill 18466 * uint_t ill_nce_cnt; Number of nces referencing ill 18467 * 18468 * Reference to an ipif or ill can be obtained in any of the following ways. 
18469 * 18470 * Through the lookup functions ipif_lookup_* / ill_lookup_* functions 18471 * Pointers to ipif / ill from other data structures viz ire and conn. 18472 * Implicit reference to the ipif / ill by holding a reference to the ire. 18473 * 18474 * The ipif/ill lookup functions return a reference held ipif / ill. 18475 * ipif_refcnt and ill_refcnt track the reference counts respectively. 18476 * This is a purely dynamic reference count associated with threads holding 18477 * references to the ipif / ill. Pointers from other structures do not 18478 * count towards this reference count. 18479 * 18480 * ipif_ire_cnt/ill_ire_cnt is the number of ire's associated with the 18481 * ipif/ill. This is incremented whenever a new ire is created referencing the 18482 * ipif/ill. This is done atomically inside ire_add_v[46] where the ire is 18483 * actually added to the ire hash table. The count is decremented in 18484 * ire_inactive where the ire is destroyed. 18485 * 18486 * nce's reference ill's thru nce_ill and the count of nce's associated with 18487 * an ill is recorded in ill_nce_cnt. This is incremented atomically in 18488 * ndp_add_v4()/ndp_add_v6() where the nce is actually added to the 18489 * table. Similarly it is decremented in ndp_inactive() where the nce 18490 * is destroyed. 18491 * 18492 * Flow of ioctls involving interface down/up 18493 * 18494 * The following is the sequence of an attempt to set some critical flags on an 18495 * up interface. 18496 * ip_sioctl_flags 18497 * ipif_down 18498 * wait for ipif to be quiescent 18499 * ipif_down_tail 18500 * ip_sioctl_flags_tail 18501 * 18502 * All set ioctls that involve down/up sequence would have a skeleton similar 18503 * to the above. All the *tail functions are called after the refcounts have 18504 * dropped to the appropriate values. 18505 * 18506 * The mechanism to quiesce an ipif is as follows. 18507 * 18508 * Mark the ipif as IPIF_CHANGING. No more lookups will be allowed 18509 * on the ipif. 
Callers either pass a flag requesting wait or the lookup
 *	functions will return NULL.
 *
 *	Delete all ires referencing this ipif
 *
 *	Any thread attempting to do an ipif_refhold on an ipif that has been
 *	obtained thru a cached pointer will first make sure that
 *	the ipif can be refheld using the macro IPIF_CAN_LOOKUP and only then
 *	increment the refcount.
 *
 *	The above guarantees that the ipif refcount will eventually come down to
 *	zero and the ipif will quiesce, once all threads that currently hold a
 *	reference to the ipif refrelease the ipif. The ipif is quiescent after the
 *	ipif_refcount has dropped to zero and all ire's associated with this ipif
 *	have also been ire_inactive'd. i.e. when ipif_ire_cnt and ipif_refcnt both
 *	drop to zero.
 *
 * Lookups during the IPIF_CHANGING/ILL_CHANGING interval.
 *
 * Threads trying to lookup an ipif or ill can pass a flag requesting
 * wait and restart if the ipif / ill cannot be looked up currently.
 * For example, bind and route operations (e.g. route add / delete) cannot
 * return failure if the ipif is currently undergoing an exclusive operation,
 * and hence pass the flag. The mblk is then enqueued in the ipsq and the
 * operation is restarted by ipsq_exit() when the currently exclusive ioctl
 * completes.
 * The lookup and enqueue is atomic using the ill_lock and ipsq_lock. The
 * lookup is done holding the ill_lock. Hence the ill/ipif state flags can't
 * change while the ill_lock is held. Before dropping the ill_lock we acquire
 * the ipsq_lock and call ipsq_enq. This ensures that ipsq_exit can't finish
 * until we release the ipsq_lock, even though the ill/ipif state flags
 * can change after we drop the ill_lock.
 *
 * An attempt to send out a packet using an ipif that is currently
 * IPIF_CHANGING will fail.
No attempt is made in this case to enqueue this 18543 * operation and restart it later when the exclusive condition on the ipif ends. 18544 * This is an example of not passing the wait flag to the lookup functions. For 18545 * example an attempt to refhold and use conn->conn_multicast_ipif and send 18546 * out a multicast packet on that ipif will fail while the ipif is 18547 * IPIF_CHANGING. An attempt to create an IRE_CACHE using an ipif that is 18548 * currently IPIF_CHANGING will also fail. 18549 */ 18550 int 18551 ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) 18552 { 18553 ill_t *ill = ipif->ipif_ill; 18554 phyint_t *phyi; 18555 conn_t *connp; 18556 boolean_t success; 18557 boolean_t ipif_was_up = B_FALSE; 18558 ip_stack_t *ipst = ill->ill_ipst; 18559 18560 ASSERT(IAM_WRITER_IPIF(ipif)); 18561 18562 ip1dbg(("ipif_down(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 18563 18564 if (ipif->ipif_flags & IPIF_UP) { 18565 mutex_enter(&ill->ill_lock); 18566 ipif->ipif_flags &= ~IPIF_UP; 18567 ASSERT(ill->ill_ipif_up_count > 0); 18568 --ill->ill_ipif_up_count; 18569 mutex_exit(&ill->ill_lock); 18570 ipif_was_up = B_TRUE; 18571 /* Update status in SCTP's list */ 18572 sctp_update_ipif(ipif, SCTP_IPIF_DOWN); 18573 } 18574 18575 /* 18576 * Blow away memberships we established in ipif_multicast_up(). 18577 */ 18578 ipif_multicast_down(ipif); 18579 18580 /* 18581 * Remove from the mapping for __sin6_src_id. We insert only 18582 * when the address is not INADDR_ANY. As IPv4 addresses are 18583 * stored as mapped addresses, we need to check for mapped 18584 * INADDR_ANY also. 
18585 */ 18586 if (ipif_was_up && !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) && 18587 !IN6_IS_ADDR_V4MAPPED_ANY(&ipif->ipif_v6lcl_addr) && 18588 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 18589 int err; 18590 18591 err = ip_srcid_remove(&ipif->ipif_v6lcl_addr, 18592 ipif->ipif_zoneid, ipst); 18593 if (err != 0) { 18594 ip0dbg(("ipif_down: srcid_remove %d\n", err)); 18595 } 18596 } 18597 18598 /* 18599 * Before we delete the ill from the group (if any), we need 18600 * to make sure that we delete all the routes dependent on 18601 * this and also any ipifs dependent on this ipif for 18602 * source address. We need to do before we delete from 18603 * the group because 18604 * 18605 * 1) ipif_down_delete_ire de-references ill->ill_group. 18606 * 18607 * 2) ipif_update_other_ipifs needs to walk the whole group 18608 * for re-doing source address selection. Note that 18609 * ipif_select_source[_v6] called from 18610 * ipif_update_other_ipifs[_v6] will not pick this ipif 18611 * because we have already marked down here i.e cleared 18612 * IPIF_UP. 18613 */ 18614 if (ipif->ipif_isv6) { 18615 ire_walk_v6(ipif_down_delete_ire, (char *)ipif, ALL_ZONES, 18616 ipst); 18617 } else { 18618 ire_walk_v4(ipif_down_delete_ire, (char *)ipif, ALL_ZONES, 18619 ipst); 18620 } 18621 18622 /* 18623 * Cleaning up the conn_ire_cache or conns must be done only after the 18624 * ires have been deleted above. Otherwise a thread could end up 18625 * caching an ire in a conn after we have finished the cleanup of the 18626 * conn. The caching is done after making sure that the ire is not yet 18627 * condemned. Also documented in the block comment above ip_output 18628 */ 18629 ipcl_walk(conn_cleanup_stale_ire, NULL, ipst); 18630 /* Also, delete the ires cached in SCTP */ 18631 sctp_ire_cache_flush(ipif); 18632 18633 /* 18634 * Update any other ipifs which have used "our" local address as 18635 * a source address. This entails removing and recreating IRE_INTERFACE 18636 * entries for such ipifs. 
18637 */ 18638 if (ipif->ipif_isv6) 18639 ipif_update_other_ipifs_v6(ipif, ill->ill_group); 18640 else 18641 ipif_update_other_ipifs(ipif, ill->ill_group); 18642 18643 if (ipif_was_up) { 18644 /* 18645 * Check whether it is last ipif to leave this group. 18646 * If this is the last ipif to leave, we should remove 18647 * this ill from the group as ipif_select_source will not 18648 * be able to find any useful ipifs if this ill is selected 18649 * for load balancing. 18650 * 18651 * For nameless groups, we should call ifgrp_delete if this 18652 * belongs to some group. As this ipif is going down, we may 18653 * need to reconstruct groups. 18654 */ 18655 phyi = ill->ill_phyint; 18656 /* 18657 * If the phyint_groupname_len is 0, it may or may not 18658 * be in the nameless group. If the phyint_groupname_len is 18659 * not 0, then this ill should be part of some group. 18660 * As we always insert this ill in the group if 18661 * phyint_groupname_len is not zero when the first ipif 18662 * comes up (in ipif_up_done), it should be in a group 18663 * when the namelen is not 0. 18664 * 18665 * NOTE : When we delete the ill from the group,it will 18666 * blow away all the IRE_CACHES pointing either at this ipif or 18667 * ill_wq (illgrp_cache_delete does this). Thus, no IRES 18668 * should be pointing at this ill. 18669 */ 18670 ASSERT(phyi->phyint_groupname_len == 0 || 18671 (phyi->phyint_groupname != NULL && ill->ill_group != NULL)); 18672 18673 if (phyi->phyint_groupname_len != 0) { 18674 if (ill->ill_ipif_up_count == 0) 18675 illgrp_delete(ill); 18676 } 18677 18678 /* 18679 * If we have deleted some of the broadcast ires associated 18680 * with this ipif, we need to re-nominate somebody else if 18681 * the ires that we deleted were the nominated ones. 18682 */ 18683 if (ill->ill_group != NULL && !ill->ill_isv6) 18684 ipif_renominate_bcast(ipif); 18685 } 18686 18687 /* 18688 * neighbor-discovery or arp entries for this interface. 
18689 */ 18690 ipif_ndp_down(ipif); 18691 18692 /* 18693 * If mp is NULL the caller will wait for the appropriate refcnt. 18694 * Eg. ip_sioctl_removeif -> ipif_free -> ipif_down 18695 * and ill_delete -> ipif_free -> ipif_down 18696 */ 18697 if (mp == NULL) { 18698 ASSERT(q == NULL); 18699 return (0); 18700 } 18701 18702 if (CONN_Q(q)) { 18703 connp = Q_TO_CONN(q); 18704 mutex_enter(&connp->conn_lock); 18705 } else { 18706 connp = NULL; 18707 } 18708 mutex_enter(&ill->ill_lock); 18709 /* 18710 * Are there any ire's pointing to this ipif that are still active ? 18711 * If this is the last ipif going down, are there any ire's pointing 18712 * to this ill that are still active ? 18713 */ 18714 if (ipif_is_quiescent(ipif)) { 18715 mutex_exit(&ill->ill_lock); 18716 if (connp != NULL) 18717 mutex_exit(&connp->conn_lock); 18718 return (0); 18719 } 18720 18721 ip1dbg(("ipif_down: need to wait, adding pending mp %s ill %p", 18722 ill->ill_name, (void *)ill)); 18723 /* 18724 * Enqueue the mp atomically in ipsq_pending_mp. When the refcount 18725 * drops down, the operation will be restarted by ipif_ill_refrele_tail 18726 * which in turn is called by the last refrele on the ipif/ill/ire. 18727 */ 18728 success = ipsq_pending_mp_add(connp, ipif, q, mp, IPIF_DOWN); 18729 if (!success) { 18730 /* The conn is closing. So just return */ 18731 ASSERT(connp != NULL); 18732 mutex_exit(&ill->ill_lock); 18733 mutex_exit(&connp->conn_lock); 18734 return (EINTR); 18735 } 18736 18737 mutex_exit(&ill->ill_lock); 18738 if (connp != NULL) 18739 mutex_exit(&connp->conn_lock); 18740 return (EINPROGRESS); 18741 } 18742 18743 void 18744 ipif_down_tail(ipif_t *ipif) 18745 { 18746 ill_t *ill = ipif->ipif_ill; 18747 18748 /* 18749 * Skip any loopback interface (null wq). 18750 * If this is the last logical interface on the ill 18751 * have ill_dl_down tell the driver we are gone (unbind) 18752 * Note that lun 0 can ipif_down even though 18753 * there are other logical units that are up. 
18754 * This occurs e.g. when we change a "significant" IFF_ flag. 18755 */ 18756 if (ill->ill_wq != NULL && !ill->ill_logical_down && 18757 ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 && 18758 ill->ill_dl_up) { 18759 ill_dl_down(ill); 18760 } 18761 ill->ill_logical_down = 0; 18762 18763 /* 18764 * Have to be after removing the routes in ipif_down_delete_ire. 18765 */ 18766 if (ipif->ipif_isv6) { 18767 if (ill->ill_flags & ILLF_XRESOLV) 18768 ipif_arp_down(ipif); 18769 } else { 18770 ipif_arp_down(ipif); 18771 } 18772 18773 ip_rts_ifmsg(ipif); 18774 ip_rts_newaddrmsg(RTM_DELETE, 0, ipif); 18775 } 18776 18777 /* 18778 * Bring interface logically down without bringing the physical interface 18779 * down e.g. when the netmask is changed. This avoids long lasting link 18780 * negotiations between an ethernet interface and a certain switches. 18781 */ 18782 static int 18783 ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp) 18784 { 18785 /* 18786 * The ill_logical_down flag is a transient flag. It is set here 18787 * and is cleared once the down has completed in ipif_down_tail. 18788 * This flag does not indicate whether the ill stream is in the 18789 * DL_BOUND state with the driver. Instead this flag is used by 18790 * ipif_down_tail to determine whether to DL_UNBIND the stream with 18791 * the driver. The state of the ill stream i.e. whether it is 18792 * DL_BOUND with the driver or not is indicated by the ill_dl_up flag. 18793 */ 18794 ipif->ipif_ill->ill_logical_down = 1; 18795 return (ipif_down(ipif, q, mp)); 18796 } 18797 18798 /* 18799 * This is called when the SIOCSLIFUSESRC ioctl is processed in IP. 18800 * If the usesrc client ILL is already part of a usesrc group or not, 18801 * in either case a ire_stq with the matching usesrc client ILL will 18802 * locate the IRE's that need to be deleted. We want IREs to be created 18803 * with the new source address. 
18804 */ 18805 static void 18806 ipif_delete_cache_ire(ire_t *ire, char *ill_arg) 18807 { 18808 ill_t *ucill = (ill_t *)ill_arg; 18809 18810 ASSERT(IAM_WRITER_ILL(ucill)); 18811 18812 if (ire->ire_stq == NULL) 18813 return; 18814 18815 if ((ire->ire_type == IRE_CACHE) && 18816 ((ill_t *)ire->ire_stq->q_ptr == ucill)) 18817 ire_delete(ire); 18818 } 18819 18820 /* 18821 * ire_walk routine to delete every IRE dependent on the interface 18822 * address that is going down. (Always called as writer.) 18823 * Works for both v4 and v6. 18824 * In addition for checking for ire_ipif matches it also checks for 18825 * IRE_CACHE entries which have the same source address as the 18826 * disappearing ipif since ipif_select_source might have picked 18827 * that source. Note that ipif_down/ipif_update_other_ipifs takes 18828 * care of any IRE_INTERFACE with the disappearing source address. 18829 */ 18830 static void 18831 ipif_down_delete_ire(ire_t *ire, char *ipif_arg) 18832 { 18833 ipif_t *ipif = (ipif_t *)ipif_arg; 18834 ill_t *ire_ill; 18835 ill_t *ipif_ill; 18836 18837 ASSERT(IAM_WRITER_IPIF(ipif)); 18838 if (ire->ire_ipif == NULL) 18839 return; 18840 18841 /* 18842 * For IPv4, we derive source addresses for an IRE from ipif's 18843 * belonging to the same IPMP group as the IRE's outgoing 18844 * interface. If an IRE's outgoing interface isn't in the 18845 * same IPMP group as a particular ipif, then that ipif 18846 * couldn't have been used as a source address for this IRE. 18847 * 18848 * For IPv6, source addresses are only restricted to the IPMP group 18849 * if the IRE is for a link-local address or a multicast address. 18850 * Otherwise, source addresses for an IRE can be chosen from 18851 * interfaces other than the the outgoing interface for that IRE. 18852 * 18853 * For source address selection details, see ipif_select_source() 18854 * and ipif_select_source_v6(). 
18855 */ 18856 if (ire->ire_ipversion == IPV4_VERSION || 18857 IN6_IS_ADDR_LINKLOCAL(&ire->ire_addr_v6) || 18858 IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6)) { 18859 ire_ill = ire->ire_ipif->ipif_ill; 18860 ipif_ill = ipif->ipif_ill; 18861 18862 if (ire_ill->ill_group != ipif_ill->ill_group) { 18863 return; 18864 } 18865 } 18866 18867 18868 if (ire->ire_ipif != ipif) { 18869 /* 18870 * Look for a matching source address. 18871 */ 18872 if (ire->ire_type != IRE_CACHE) 18873 return; 18874 if (ipif->ipif_flags & IPIF_NOLOCAL) 18875 return; 18876 18877 if (ire->ire_ipversion == IPV4_VERSION) { 18878 if (ire->ire_src_addr != ipif->ipif_src_addr) 18879 return; 18880 } else { 18881 if (!IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6, 18882 &ipif->ipif_v6lcl_addr)) 18883 return; 18884 } 18885 ire_delete(ire); 18886 return; 18887 } 18888 /* 18889 * ire_delete() will do an ire_flush_cache which will delete 18890 * all ire_ipif matches 18891 */ 18892 ire_delete(ire); 18893 } 18894 18895 /* 18896 * ire_walk_ill function for deleting all IRE_CACHE entries for an ill when 18897 * 1) an ipif (on that ill) changes the IPIF_DEPRECATED flags, or 18898 * 2) when an interface is brought up or down (on that ill). 18899 * This ensures that the IRE_CACHE entries don't retain stale source 18900 * address selection results. 18901 */ 18902 void 18903 ill_ipif_cache_delete(ire_t *ire, char *ill_arg) 18904 { 18905 ill_t *ill = (ill_t *)ill_arg; 18906 ill_t *ipif_ill; 18907 18908 ASSERT(IAM_WRITER_ILL(ill)); 18909 /* 18910 * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4. 18911 * Hence this should be IRE_CACHE. 18912 */ 18913 ASSERT(ire->ire_type == IRE_CACHE); 18914 18915 /* 18916 * We are called for IRE_CACHES whose ire_ipif matches ill. 18917 * We are only interested in IRE_CACHES that has borrowed 18918 * the source address from ill_arg e.g. ipif_up_done[_v6] 18919 * for which we need to look at ire_ipif->ipif_ill match 18920 * with ill. 
18921 */ 18922 ASSERT(ire->ire_ipif != NULL); 18923 ipif_ill = ire->ire_ipif->ipif_ill; 18924 if (ipif_ill == ill || (ill->ill_group != NULL && 18925 ipif_ill->ill_group == ill->ill_group)) { 18926 ire_delete(ire); 18927 } 18928 } 18929 18930 /* 18931 * Delete all the ire whose stq references ill_arg. 18932 */ 18933 static void 18934 ill_stq_cache_delete(ire_t *ire, char *ill_arg) 18935 { 18936 ill_t *ill = (ill_t *)ill_arg; 18937 ill_t *ire_ill; 18938 18939 ASSERT(IAM_WRITER_ILL(ill)); 18940 /* 18941 * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4. 18942 * Hence this should be IRE_CACHE. 18943 */ 18944 ASSERT(ire->ire_type == IRE_CACHE); 18945 18946 /* 18947 * We are called for IRE_CACHES whose ire_stq and ire_ipif 18948 * matches ill. We are only interested in IRE_CACHES that 18949 * has ire_stq->q_ptr pointing at ill_arg. Thus we do the 18950 * filtering here. 18951 */ 18952 ire_ill = (ill_t *)ire->ire_stq->q_ptr; 18953 18954 if (ire_ill == ill) 18955 ire_delete(ire); 18956 } 18957 18958 /* 18959 * This is called when an ill leaves the group. We want to delete 18960 * all IRE_CACHES whose stq is pointing at ill_wq or ire_ipif is 18961 * pointing at ill. 18962 */ 18963 static void 18964 illgrp_cache_delete(ire_t *ire, char *ill_arg) 18965 { 18966 ill_t *ill = (ill_t *)ill_arg; 18967 18968 ASSERT(IAM_WRITER_ILL(ill)); 18969 ASSERT(ill->ill_group == NULL); 18970 /* 18971 * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4. 18972 * Hence this should be IRE_CACHE. 18973 */ 18974 ASSERT(ire->ire_type == IRE_CACHE); 18975 /* 18976 * We are called for IRE_CACHES whose ire_stq and ire_ipif 18977 * matches ill. We are interested in both. 18978 */ 18979 ASSERT((ill == (ill_t *)ire->ire_stq->q_ptr) || 18980 (ire->ire_ipif->ipif_ill == ill)); 18981 18982 ire_delete(ire); 18983 } 18984 18985 /* 18986 * Initiate deallocate of an IPIF. Always called as writer. Called by 18987 * ill_delete or ip_sioctl_removeif. 
18988 */ 18989 static void 18990 ipif_free(ipif_t *ipif) 18991 { 18992 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 18993 18994 ASSERT(IAM_WRITER_IPIF(ipif)); 18995 18996 if (ipif->ipif_recovery_id != 0) 18997 (void) untimeout(ipif->ipif_recovery_id); 18998 ipif->ipif_recovery_id = 0; 18999 19000 /* Remove conn references */ 19001 reset_conn_ipif(ipif); 19002 19003 /* 19004 * Make sure we have valid net and subnet broadcast ire's for the 19005 * other ipif's which share them with this ipif. 19006 */ 19007 if (!ipif->ipif_isv6) 19008 ipif_check_bcast_ires(ipif); 19009 19010 /* 19011 * Take down the interface. We can be called either from ill_delete 19012 * or from ip_sioctl_removeif. 19013 */ 19014 (void) ipif_down(ipif, NULL, NULL); 19015 19016 /* 19017 * Now that the interface is down, there's no chance it can still 19018 * become a duplicate. Cancel any timer that may have been set while 19019 * tearing down. 19020 */ 19021 if (ipif->ipif_recovery_id != 0) 19022 (void) untimeout(ipif->ipif_recovery_id); 19023 ipif->ipif_recovery_id = 0; 19024 19025 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 19026 /* Remove pointers to this ill in the multicast routing tables */ 19027 reset_mrt_vif_ipif(ipif); 19028 rw_exit(&ipst->ips_ill_g_lock); 19029 } 19030 19031 /* 19032 * Warning: this is not the only function that calls mi_free on an ipif_t. See 19033 * also ill_move(). 19034 */ 19035 static void 19036 ipif_free_tail(ipif_t *ipif) 19037 { 19038 mblk_t *mp; 19039 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 19040 19041 /* 19042 * Free state for addition IRE_IF_[NO]RESOLVER ire's. 19043 */ 19044 mutex_enter(&ipif->ipif_saved_ire_lock); 19045 mp = ipif->ipif_saved_ire_mp; 19046 ipif->ipif_saved_ire_mp = NULL; 19047 mutex_exit(&ipif->ipif_saved_ire_lock); 19048 freemsg(mp); 19049 19050 /* 19051 * Need to hold both ill_g_lock and ill_lock while 19052 * inserting or removing an ipif from the linked list 19053 * of ipifs hanging off the ill. 
19054 */ 19055 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 19056 /* 19057 * Remove all IPv4 multicast memberships on the interface now. 19058 * IPv6 is not handled here as the multicast memberships are 19059 * tied to the ill rather than the ipif. 19060 */ 19061 ilm_free(ipif); 19062 19063 /* 19064 * Since we held the ill_g_lock while doing the ilm_free above, 19065 * we can assert the ilms were really deleted and not just marked 19066 * ILM_DELETED. 19067 */ 19068 ASSERT(ilm_walk_ipif(ipif) == 0); 19069 19070 #ifdef DEBUG 19071 ipif_trace_cleanup(ipif); 19072 #endif 19073 19074 /* Ask SCTP to take it out of it list */ 19075 sctp_update_ipif(ipif, SCTP_IPIF_REMOVE); 19076 19077 /* Get it out of the ILL interface list. */ 19078 ipif_remove(ipif, B_TRUE); 19079 rw_exit(&ipst->ips_ill_g_lock); 19080 19081 mutex_destroy(&ipif->ipif_saved_ire_lock); 19082 19083 ASSERT(!(ipif->ipif_flags & (IPIF_UP | IPIF_DUPLICATE))); 19084 ASSERT(ipif->ipif_recovery_id == 0); 19085 19086 /* Free the memory. */ 19087 mi_free(ipif); 19088 } 19089 19090 /* 19091 * Sets `buf' to an ipif name of the form "ill_name:id", or "ill_name" if "id" 19092 * is zero. 19093 */ 19094 void 19095 ipif_get_name(const ipif_t *ipif, char *buf, int len) 19096 { 19097 char lbuf[LIFNAMSIZ]; 19098 char *name; 19099 size_t name_len; 19100 19101 buf[0] = '\0'; 19102 name = ipif->ipif_ill->ill_name; 19103 name_len = ipif->ipif_ill->ill_name_length; 19104 if (ipif->ipif_id != 0) { 19105 (void) sprintf(lbuf, "%s%c%d", name, IPIF_SEPARATOR_CHAR, 19106 ipif->ipif_id); 19107 name = lbuf; 19108 name_len = mi_strlen(name) + 1; 19109 } 19110 len -= 1; 19111 buf[len] = '\0'; 19112 len = MIN(len, name_len); 19113 bcopy(name, buf, len); 19114 } 19115 19116 /* 19117 * Find an IPIF based on the name passed in. Names can be of the 19118 * form <phys> (e.g., le0), <phys>:<#> (e.g., le0:1), 19119 * The <phys> string can have forms like <dev><#> (e.g., le0), 19120 * <dev><#>.<module> (e.g. le0.foo), or <dev>.<module><#> (e.g. 
ip.tun3). 19121 * When there is no colon, the implied unit id is zero. <phys> must 19122 * correspond to the name of an ILL. (May be called as writer.) 19123 */ 19124 static ipif_t * 19125 ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, 19126 boolean_t *exists, boolean_t isv6, zoneid_t zoneid, queue_t *q, 19127 mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst) 19128 { 19129 char *cp; 19130 char *endp; 19131 long id; 19132 ill_t *ill; 19133 ipif_t *ipif; 19134 uint_t ire_type; 19135 boolean_t did_alloc = B_FALSE; 19136 ipsq_t *ipsq; 19137 19138 if (error != NULL) 19139 *error = 0; 19140 19141 /* 19142 * If the caller wants to us to create the ipif, make sure we have a 19143 * valid zoneid 19144 */ 19145 ASSERT(!do_alloc || zoneid != ALL_ZONES); 19146 19147 if (namelen == 0) { 19148 if (error != NULL) 19149 *error = ENXIO; 19150 return (NULL); 19151 } 19152 19153 *exists = B_FALSE; 19154 /* Look for a colon in the name. */ 19155 endp = &name[namelen]; 19156 for (cp = endp; --cp > name; ) { 19157 if (*cp == IPIF_SEPARATOR_CHAR) 19158 break; 19159 } 19160 19161 if (*cp == IPIF_SEPARATOR_CHAR) { 19162 /* 19163 * Reject any non-decimal aliases for logical 19164 * interfaces. Aliases with leading zeroes 19165 * are also rejected as they introduce ambiguity 19166 * in the naming of the interfaces. 19167 * In order to confirm with existing semantics, 19168 * and to not break any programs/script relying 19169 * on that behaviour, if<0>:0 is considered to be 19170 * a valid interface. 19171 * 19172 * If alias has two or more digits and the first 19173 * is zero, fail. 19174 */ 19175 if (&cp[2] < endp && cp[1] == '0') 19176 return (NULL); 19177 } 19178 19179 if (cp <= name) { 19180 cp = endp; 19181 } else { 19182 *cp = '\0'; 19183 } 19184 19185 /* 19186 * Look up the ILL, based on the portion of the name 19187 * before the slash. ill_lookup_on_name returns a held ill. 19188 * Temporary to check whether ill exists already. 
If so 19189 * ill_lookup_on_name will clear it. 19190 */ 19191 ill = ill_lookup_on_name(name, do_alloc, isv6, 19192 q, mp, func, error, &did_alloc, ipst); 19193 if (cp != endp) 19194 *cp = IPIF_SEPARATOR_CHAR; 19195 if (ill == NULL) 19196 return (NULL); 19197 19198 /* Establish the unit number in the name. */ 19199 id = 0; 19200 if (cp < endp && *endp == '\0') { 19201 /* If there was a colon, the unit number follows. */ 19202 cp++; 19203 if (ddi_strtol(cp, NULL, 0, &id) != 0) { 19204 ill_refrele(ill); 19205 if (error != NULL) 19206 *error = ENXIO; 19207 return (NULL); 19208 } 19209 } 19210 19211 GRAB_CONN_LOCK(q); 19212 mutex_enter(&ill->ill_lock); 19213 /* Now see if there is an IPIF with this unit number. */ 19214 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 19215 if (ipif->ipif_id == id) { 19216 if (zoneid != ALL_ZONES && 19217 zoneid != ipif->ipif_zoneid && 19218 ipif->ipif_zoneid != ALL_ZONES) { 19219 mutex_exit(&ill->ill_lock); 19220 RELEASE_CONN_LOCK(q); 19221 ill_refrele(ill); 19222 if (error != NULL) 19223 *error = ENXIO; 19224 return (NULL); 19225 } 19226 /* 19227 * The block comment at the start of ipif_down 19228 * explains the use of the macros used below 19229 */ 19230 if (IPIF_CAN_LOOKUP(ipif)) { 19231 ipif_refhold_locked(ipif); 19232 mutex_exit(&ill->ill_lock); 19233 if (!did_alloc) 19234 *exists = B_TRUE; 19235 /* 19236 * Drop locks before calling ill_refrele 19237 * since it can potentially call into 19238 * ipif_ill_refrele_tail which can end up 19239 * in trying to acquire any lock. 
19240 */ 19241 RELEASE_CONN_LOCK(q); 19242 ill_refrele(ill); 19243 return (ipif); 19244 } else if (IPIF_CAN_WAIT(ipif, q)) { 19245 ipsq = ill->ill_phyint->phyint_ipsq; 19246 mutex_enter(&ipsq->ipsq_lock); 19247 mutex_exit(&ill->ill_lock); 19248 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 19249 mutex_exit(&ipsq->ipsq_lock); 19250 RELEASE_CONN_LOCK(q); 19251 ill_refrele(ill); 19252 *error = EINPROGRESS; 19253 return (NULL); 19254 } 19255 } 19256 } 19257 RELEASE_CONN_LOCK(q); 19258 19259 if (!do_alloc) { 19260 mutex_exit(&ill->ill_lock); 19261 ill_refrele(ill); 19262 if (error != NULL) 19263 *error = ENXIO; 19264 return (NULL); 19265 } 19266 19267 /* 19268 * If none found, atomically allocate and return a new one. 19269 * Historically, we used IRE_LOOPBACK only for lun 0, and IRE_LOCAL 19270 * to support "receive only" use of lo0:1 etc. as is still done 19271 * below as an initial guess. 19272 * However, this is now likely to be overriden later in ipif_up_done() 19273 * when we know for sure what address has been configured on the 19274 * interface, since we might have more than one loopback interface 19275 * with a loopback address, e.g. in the case of zones, and all the 19276 * interfaces with loopback addresses need to be marked IRE_LOOPBACK. 19277 */ 19278 if (ill->ill_net_type == IRE_LOOPBACK && id == 0) 19279 ire_type = IRE_LOOPBACK; 19280 else 19281 ire_type = IRE_LOCAL; 19282 ipif = ipif_allocate(ill, id, ire_type, B_TRUE); 19283 if (ipif != NULL) 19284 ipif_refhold_locked(ipif); 19285 else if (error != NULL) 19286 *error = ENOMEM; 19287 mutex_exit(&ill->ill_lock); 19288 ill_refrele(ill); 19289 return (ipif); 19290 } 19291 19292 /* 19293 * This routine is called whenever a new address comes up on an ipif. If 19294 * we are configured to respond to address mask requests, then we are supposed 19295 * to broadcast an address mask reply at this time. This routine is also 19296 * called if we are already up, but a netmask change is made. 
This is legal 19297 * but might not make the system manager very popular. (May be called 19298 * as writer.) 19299 */ 19300 void 19301 ipif_mask_reply(ipif_t *ipif) 19302 { 19303 icmph_t *icmph; 19304 ipha_t *ipha; 19305 mblk_t *mp; 19306 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 19307 19308 #define REPLY_LEN (sizeof (icmp_ipha) + sizeof (icmph_t) + IP_ADDR_LEN) 19309 19310 if (!ipst->ips_ip_respond_to_address_mask_broadcast) 19311 return; 19312 19313 /* ICMP mask reply is IPv4 only */ 19314 ASSERT(!ipif->ipif_isv6); 19315 /* ICMP mask reply is not for a loopback interface */ 19316 ASSERT(ipif->ipif_ill->ill_wq != NULL); 19317 19318 mp = allocb(REPLY_LEN, BPRI_HI); 19319 if (mp == NULL) 19320 return; 19321 mp->b_wptr = mp->b_rptr + REPLY_LEN; 19322 19323 ipha = (ipha_t *)mp->b_rptr; 19324 bzero(ipha, REPLY_LEN); 19325 *ipha = icmp_ipha; 19326 ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl; 19327 ipha->ipha_src = ipif->ipif_src_addr; 19328 ipha->ipha_dst = ipif->ipif_brd_addr; 19329 ipha->ipha_length = htons(REPLY_LEN); 19330 ipha->ipha_ident = 0; 19331 19332 icmph = (icmph_t *)&ipha[1]; 19333 icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY; 19334 bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN); 19335 icmph->icmph_checksum = IP_CSUM(mp, sizeof (ipha_t), 0); 19336 19337 put(ipif->ipif_wq, mp); 19338 19339 #undef REPLY_LEN 19340 } 19341 19342 /* 19343 * When the mtu in the ipif changes, we call this routine through ire_walk 19344 * to update all the relevant IREs. 19345 * Skip IRE_LOCAL and "loopback" IRE_BROADCAST by checking ire_stq. 19346 */ 19347 static void 19348 ipif_mtu_change(ire_t *ire, char *ipif_arg) 19349 { 19350 ipif_t *ipif = (ipif_t *)ipif_arg; 19351 19352 if (ire->ire_stq == NULL || ire->ire_ipif != ipif) 19353 return; 19354 ire->ire_max_frag = MIN(ipif->ipif_mtu, IP_MAXPACKET); 19355 } 19356 19357 /* 19358 * When the mtu in the ill changes, we call this routine through ire_walk 19359 * to update all the relevant IREs. 
19360 * Skip IRE_LOCAL and "loopback" IRE_BROADCAST by checking ire_stq. 19361 */ 19362 void 19363 ill_mtu_change(ire_t *ire, char *ill_arg) 19364 { 19365 ill_t *ill = (ill_t *)ill_arg; 19366 19367 if (ire->ire_stq == NULL || ire->ire_ipif->ipif_ill != ill) 19368 return; 19369 ire->ire_max_frag = ire->ire_ipif->ipif_mtu; 19370 } 19371 19372 /* 19373 * Join the ipif specific multicast groups. 19374 * Must be called after a mapping has been set up in the resolver. (Always 19375 * called as writer.) 19376 */ 19377 void 19378 ipif_multicast_up(ipif_t *ipif) 19379 { 19380 int err, index; 19381 ill_t *ill; 19382 19383 ASSERT(IAM_WRITER_IPIF(ipif)); 19384 19385 ill = ipif->ipif_ill; 19386 index = ill->ill_phyint->phyint_ifindex; 19387 19388 ip1dbg(("ipif_multicast_up\n")); 19389 if (!(ill->ill_flags & ILLF_MULTICAST) || ipif->ipif_multicast_up) 19390 return; 19391 19392 if (ipif->ipif_isv6) { 19393 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) 19394 return; 19395 19396 /* Join the all hosts multicast address */ 19397 ip1dbg(("ipif_multicast_up - addmulti\n")); 19398 /* 19399 * Passing B_TRUE means we have to join the multicast 19400 * membership on this interface even though this is 19401 * FAILED. If we join on a different one in the group, 19402 * we will not be able to delete the membership later 19403 * as we currently don't track where we join when we 19404 * join within the kernel unlike applications where 19405 * we have ilg/ilg_orig_index. See ip_addmulti_v6 19406 * for more on this. 
19407 */ 19408 err = ip_addmulti_v6(&ipv6_all_hosts_mcast, ill, index, 19409 ipif->ipif_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); 19410 if (err != 0) { 19411 ip0dbg(("ipif_multicast_up: " 19412 "all_hosts_mcast failed %d\n", 19413 err)); 19414 return; 19415 } 19416 /* 19417 * Enable multicast for the solicited node multicast address 19418 */ 19419 if (!(ipif->ipif_flags & IPIF_NOLOCAL)) { 19420 in6_addr_t ipv6_multi = ipv6_solicited_node_mcast; 19421 19422 ipv6_multi.s6_addr32[3] |= 19423 ipif->ipif_v6lcl_addr.s6_addr32[3]; 19424 19425 err = ip_addmulti_v6(&ipv6_multi, ill, index, 19426 ipif->ipif_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE, 19427 NULL); 19428 if (err != 0) { 19429 ip0dbg(("ipif_multicast_up: solicited MC" 19430 " failed %d\n", err)); 19431 (void) ip_delmulti_v6(&ipv6_all_hosts_mcast, 19432 ill, ill->ill_phyint->phyint_ifindex, 19433 ipif->ipif_zoneid, B_TRUE, B_TRUE); 19434 return; 19435 } 19436 } 19437 } else { 19438 if (ipif->ipif_lcl_addr == INADDR_ANY) 19439 return; 19440 19441 /* Join the all hosts multicast address */ 19442 ip1dbg(("ipif_multicast_up - addmulti\n")); 19443 err = ip_addmulti(htonl(INADDR_ALLHOSTS_GROUP), ipif, 19444 ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); 19445 if (err) { 19446 ip0dbg(("ipif_multicast_up: failed %d\n", err)); 19447 return; 19448 } 19449 } 19450 ipif->ipif_multicast_up = 1; 19451 } 19452 19453 /* 19454 * Blow away any multicast groups that we joined in ipif_multicast_up(). 19455 * (Explicit memberships are blown away in ill_leave_multicast() when the 19456 * ill is brought down.) 
19457 */ 19458 static void 19459 ipif_multicast_down(ipif_t *ipif) 19460 { 19461 int err; 19462 19463 ASSERT(IAM_WRITER_IPIF(ipif)); 19464 19465 ip1dbg(("ipif_multicast_down\n")); 19466 if (!ipif->ipif_multicast_up) 19467 return; 19468 19469 ip1dbg(("ipif_multicast_down - delmulti\n")); 19470 19471 if (!ipif->ipif_isv6) { 19472 err = ip_delmulti(htonl(INADDR_ALLHOSTS_GROUP), ipif, B_TRUE, 19473 B_TRUE); 19474 if (err != 0) 19475 ip0dbg(("ipif_multicast_down: failed %d\n", err)); 19476 19477 ipif->ipif_multicast_up = 0; 19478 return; 19479 } 19480 19481 /* 19482 * Leave the all hosts multicast address. Similar to ip_addmulti_v6, 19483 * we should look for ilms on this ill rather than the ones that have 19484 * been failed over here. They are here temporarily. As 19485 * ipif_multicast_up has joined on this ill, we should delete only 19486 * from this ill. 19487 */ 19488 err = ip_delmulti_v6(&ipv6_all_hosts_mcast, ipif->ipif_ill, 19489 ipif->ipif_ill->ill_phyint->phyint_ifindex, ipif->ipif_zoneid, 19490 B_TRUE, B_TRUE); 19491 if (err != 0) { 19492 ip0dbg(("ipif_multicast_down: all_hosts_mcast failed %d\n", 19493 err)); 19494 } 19495 /* 19496 * Disable multicast for the solicited node multicast address 19497 */ 19498 if (!(ipif->ipif_flags & IPIF_NOLOCAL)) { 19499 in6_addr_t ipv6_multi = ipv6_solicited_node_mcast; 19500 19501 ipv6_multi.s6_addr32[3] |= 19502 ipif->ipif_v6lcl_addr.s6_addr32[3]; 19503 19504 err = ip_delmulti_v6(&ipv6_multi, ipif->ipif_ill, 19505 ipif->ipif_ill->ill_phyint->phyint_ifindex, 19506 ipif->ipif_zoneid, B_TRUE, B_TRUE); 19507 19508 if (err != 0) { 19509 ip0dbg(("ipif_multicast_down: sol MC failed %d\n", 19510 err)); 19511 } 19512 } 19513 19514 ipif->ipif_multicast_up = 0; 19515 } 19516 19517 /* 19518 * Used when an interface comes up to recreate any extra routes on this 19519 * interface. 
19520 */ 19521 static ire_t ** 19522 ipif_recover_ire(ipif_t *ipif) 19523 { 19524 mblk_t *mp; 19525 ire_t **ipif_saved_irep; 19526 ire_t **irep; 19527 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 19528 19529 ip1dbg(("ipif_recover_ire(%s:%u)", ipif->ipif_ill->ill_name, 19530 ipif->ipif_id)); 19531 19532 mutex_enter(&ipif->ipif_saved_ire_lock); 19533 ipif_saved_irep = (ire_t **)kmem_zalloc(sizeof (ire_t *) * 19534 ipif->ipif_saved_ire_cnt, KM_NOSLEEP); 19535 if (ipif_saved_irep == NULL) { 19536 mutex_exit(&ipif->ipif_saved_ire_lock); 19537 return (NULL); 19538 } 19539 19540 irep = ipif_saved_irep; 19541 for (mp = ipif->ipif_saved_ire_mp; mp != NULL; mp = mp->b_cont) { 19542 ire_t *ire; 19543 queue_t *rfq; 19544 queue_t *stq; 19545 ifrt_t *ifrt; 19546 uchar_t *src_addr; 19547 uchar_t *gateway_addr; 19548 ushort_t type; 19549 19550 /* 19551 * When the ire was initially created and then added in 19552 * ip_rt_add(), it was created either using ipif->ipif_net_type 19553 * in the case of a traditional interface route, or as one of 19554 * the IRE_OFFSUBNET types (with the exception of 19555 * IRE_HOST types ire which is created by icmp_redirect() and 19556 * which we don't need to save or recover). In the case where 19557 * ipif->ipif_net_type was IRE_LOOPBACK, ip_rt_add() will update 19558 * the ire_type to IRE_IF_NORESOLVER before calling ire_add() 19559 * to satisfy software like GateD and Sun Cluster which creates 19560 * routes using the the loopback interface's address as a 19561 * gateway. 19562 * 19563 * As ifrt->ifrt_type reflects the already updated ire_type, 19564 * ire_create() will be called in the same way here as 19565 * in ip_rt_add(), namely using ipif->ipif_net_type when 19566 * the route looks like a traditional interface route (where 19567 * ifrt->ifrt_type & IRE_INTERFACE is true) and otherwise using 19568 * the saved ifrt->ifrt_type. 
This means that in the case where 19569 * ipif->ipif_net_type is IRE_LOOPBACK, the ire created by 19570 * ire_create() will be an IRE_LOOPBACK, it will then be turned 19571 * into an IRE_IF_NORESOLVER and then added by ire_add(). 19572 */ 19573 ifrt = (ifrt_t *)mp->b_rptr; 19574 ASSERT(ifrt->ifrt_type != IRE_CACHE); 19575 if (ifrt->ifrt_type & IRE_INTERFACE) { 19576 rfq = NULL; 19577 stq = (ipif->ipif_net_type == IRE_IF_RESOLVER) 19578 ? ipif->ipif_rq : ipif->ipif_wq; 19579 src_addr = (ifrt->ifrt_flags & RTF_SETSRC) 19580 ? (uint8_t *)&ifrt->ifrt_src_addr 19581 : (uint8_t *)&ipif->ipif_src_addr; 19582 gateway_addr = NULL; 19583 type = ipif->ipif_net_type; 19584 } else if (ifrt->ifrt_type & IRE_BROADCAST) { 19585 /* Recover multiroute broadcast IRE. */ 19586 rfq = ipif->ipif_rq; 19587 stq = ipif->ipif_wq; 19588 src_addr = (ifrt->ifrt_flags & RTF_SETSRC) 19589 ? (uint8_t *)&ifrt->ifrt_src_addr 19590 : (uint8_t *)&ipif->ipif_src_addr; 19591 gateway_addr = (uint8_t *)&ifrt->ifrt_gateway_addr; 19592 type = ifrt->ifrt_type; 19593 } else { 19594 rfq = NULL; 19595 stq = NULL; 19596 src_addr = (ifrt->ifrt_flags & RTF_SETSRC) 19597 ? (uint8_t *)&ifrt->ifrt_src_addr : NULL; 19598 gateway_addr = (uint8_t *)&ifrt->ifrt_gateway_addr; 19599 type = ifrt->ifrt_type; 19600 } 19601 19602 /* 19603 * Create a copy of the IRE with the saved address and netmask. 
19604 */ 19605 ip1dbg(("ipif_recover_ire: creating IRE %s (%d) for " 19606 "0x%x/0x%x\n", 19607 ip_nv_lookup(ire_nv_tbl, ifrt->ifrt_type), ifrt->ifrt_type, 19608 ntohl(ifrt->ifrt_addr), 19609 ntohl(ifrt->ifrt_mask))); 19610 ire = ire_create( 19611 (uint8_t *)&ifrt->ifrt_addr, 19612 (uint8_t *)&ifrt->ifrt_mask, 19613 src_addr, 19614 gateway_addr, 19615 &ifrt->ifrt_max_frag, 19616 NULL, 19617 rfq, 19618 stq, 19619 type, 19620 ipif, 19621 0, 19622 0, 19623 0, 19624 ifrt->ifrt_flags, 19625 &ifrt->ifrt_iulp_info, 19626 NULL, 19627 NULL, 19628 ipst); 19629 19630 if (ire == NULL) { 19631 mutex_exit(&ipif->ipif_saved_ire_lock); 19632 kmem_free(ipif_saved_irep, 19633 ipif->ipif_saved_ire_cnt * sizeof (ire_t *)); 19634 return (NULL); 19635 } 19636 19637 /* 19638 * Some software (for example, GateD and Sun Cluster) attempts 19639 * to create (what amount to) IRE_PREFIX routes with the 19640 * loopback address as the gateway. This is primarily done to 19641 * set up prefixes with the RTF_REJECT flag set (for example, 19642 * when generating aggregate routes.) 19643 * 19644 * If the IRE type (as defined by ipif->ipif_net_type) is 19645 * IRE_LOOPBACK, then we map the request into a 19646 * IRE_IF_NORESOLVER. 19647 */ 19648 if (ipif->ipif_net_type == IRE_LOOPBACK) 19649 ire->ire_type = IRE_IF_NORESOLVER; 19650 /* 19651 * ire held by ire_add, will be refreled' towards the 19652 * the end of ipif_up_done 19653 */ 19654 (void) ire_add(&ire, NULL, NULL, NULL, B_FALSE); 19655 *irep = ire; 19656 irep++; 19657 ip1dbg(("ipif_recover_ire: added ire %p\n", (void *)ire)); 19658 } 19659 mutex_exit(&ipif->ipif_saved_ire_lock); 19660 return (ipif_saved_irep); 19661 } 19662 19663 /* 19664 * Used to set the netmask and broadcast address to default values when the 19665 * interface is brought up. (Always called as writer.) 
19666 */ 19667 static void 19668 ipif_set_default(ipif_t *ipif) 19669 { 19670 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 19671 19672 if (!ipif->ipif_isv6) { 19673 /* 19674 * Interface holds an IPv4 address. Default 19675 * mask is the natural netmask. 19676 */ 19677 if (!ipif->ipif_net_mask) { 19678 ipaddr_t v4mask; 19679 19680 v4mask = ip_net_mask(ipif->ipif_lcl_addr); 19681 V4MASK_TO_V6(v4mask, ipif->ipif_v6net_mask); 19682 } 19683 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 19684 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 19685 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 19686 } else { 19687 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 19688 ipif->ipif_v6net_mask, ipif->ipif_v6subnet); 19689 } 19690 /* 19691 * NOTE: SunOS 4.X does this even if the broadcast address 19692 * has been already set thus we do the same here. 19693 */ 19694 if (ipif->ipif_flags & IPIF_BROADCAST) { 19695 ipaddr_t v4addr; 19696 19697 v4addr = ipif->ipif_subnet | ~ipif->ipif_net_mask; 19698 IN6_IPADDR_TO_V4MAPPED(v4addr, &ipif->ipif_v6brd_addr); 19699 } 19700 } else { 19701 /* 19702 * Interface holds an IPv6-only address. Default 19703 * mask is all-ones. 19704 */ 19705 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask)) 19706 ipif->ipif_v6net_mask = ipv6_all_ones; 19707 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 19708 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 19709 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 19710 } else { 19711 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 19712 ipif->ipif_v6net_mask, ipif->ipif_v6subnet); 19713 } 19714 } 19715 } 19716 19717 /* 19718 * Return 0 if this address can be used as local address without causing 19719 * duplicate address problems. Otherwise, return EADDRNOTAVAIL if the address 19720 * is already up on a different ill, and EADDRINUSE if it's up on the same ill. 19721 * Special checks are needed to allow the same IPv6 link-local address 19722 * on different ills. 19723 * TODO: allowing the same site-local address on different ill's. 
19724 */ 19725 int 19726 ip_addr_availability_check(ipif_t *new_ipif) 19727 { 19728 in6_addr_t our_v6addr; 19729 ill_t *ill; 19730 ipif_t *ipif; 19731 ill_walk_context_t ctx; 19732 ip_stack_t *ipst = new_ipif->ipif_ill->ill_ipst; 19733 19734 ASSERT(IAM_WRITER_IPIF(new_ipif)); 19735 ASSERT(MUTEX_HELD(&ipst->ips_ip_addr_avail_lock)); 19736 ASSERT(RW_READ_HELD(&ipst->ips_ill_g_lock)); 19737 19738 new_ipif->ipif_flags &= ~IPIF_UNNUMBERED; 19739 if (IN6_IS_ADDR_UNSPECIFIED(&new_ipif->ipif_v6lcl_addr) || 19740 IN6_IS_ADDR_V4MAPPED_ANY(&new_ipif->ipif_v6lcl_addr)) 19741 return (0); 19742 19743 our_v6addr = new_ipif->ipif_v6lcl_addr; 19744 19745 if (new_ipif->ipif_isv6) 19746 ill = ILL_START_WALK_V6(&ctx, ipst); 19747 else 19748 ill = ILL_START_WALK_V4(&ctx, ipst); 19749 19750 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 19751 for (ipif = ill->ill_ipif; ipif != NULL; 19752 ipif = ipif->ipif_next) { 19753 if ((ipif == new_ipif) || 19754 !(ipif->ipif_flags & IPIF_UP) || 19755 (ipif->ipif_flags & IPIF_UNNUMBERED)) 19756 continue; 19757 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, 19758 &our_v6addr)) { 19759 if (new_ipif->ipif_flags & IPIF_POINTOPOINT) 19760 new_ipif->ipif_flags |= IPIF_UNNUMBERED; 19761 else if (ipif->ipif_flags & IPIF_POINTOPOINT) 19762 ipif->ipif_flags |= IPIF_UNNUMBERED; 19763 else if (IN6_IS_ADDR_LINKLOCAL(&our_v6addr) && 19764 new_ipif->ipif_ill != ill) 19765 continue; 19766 else if (IN6_IS_ADDR_SITELOCAL(&our_v6addr) && 19767 new_ipif->ipif_ill != ill) 19768 continue; 19769 else if (new_ipif->ipif_zoneid != 19770 ipif->ipif_zoneid && 19771 ipif->ipif_zoneid != ALL_ZONES && 19772 IS_LOOPBACK(ill)) 19773 continue; 19774 else if (new_ipif->ipif_ill == ill) 19775 return (EADDRINUSE); 19776 else 19777 return (EADDRNOTAVAIL); 19778 } 19779 } 19780 } 19781 19782 return (0); 19783 } 19784 19785 /* 19786 * Bring up an ipif: bring up arp/ndp, bring up the DLPI stream, and add 19787 * IREs for the ipif. 
19788 * When the routine returns EINPROGRESS then mp has been consumed and 19789 * the ioctl will be acked from ip_rput_dlpi. 19790 */ 19791 static int 19792 ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) 19793 { 19794 ill_t *ill = ipif->ipif_ill; 19795 boolean_t isv6 = ipif->ipif_isv6; 19796 int err = 0; 19797 boolean_t success; 19798 19799 ASSERT(IAM_WRITER_IPIF(ipif)); 19800 19801 ip1dbg(("ipif_up(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 19802 19803 /* Shouldn't get here if it is already up. */ 19804 if (ipif->ipif_flags & IPIF_UP) 19805 return (EALREADY); 19806 19807 /* Skip arp/ndp for any loopback interface. */ 19808 if (ill->ill_wq != NULL) { 19809 conn_t *connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL; 19810 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 19811 19812 if (!ill->ill_dl_up) { 19813 /* 19814 * ill_dl_up is not yet set. i.e. we are yet to 19815 * DL_BIND with the driver and this is the first 19816 * logical interface on the ill to become "up". 19817 * Tell the driver to get going (via DL_BIND_REQ). 19818 * Note that changing "significant" IFF_ flags 19819 * address/netmask etc cause a down/up dance, but 19820 * does not cause an unbind (DL_UNBIND) with the driver 19821 */ 19822 return (ill_dl_up(ill, ipif, mp, q)); 19823 } 19824 19825 /* 19826 * ipif_resolver_up may end up sending an 19827 * AR_INTERFACE_UP message to ARP, which would, in 19828 * turn send a DLPI message to the driver. ioctls are 19829 * serialized and so we cannot send more than one 19830 * interface up message at a time. If ipif_resolver_up 19831 * does send an interface up message to ARP, we get 19832 * EINPROGRESS and we will complete in ip_arp_done. 
19833 */ 19834 19835 ASSERT(connp != NULL || !CONN_Q(q)); 19836 ASSERT(ipsq->ipsq_pending_mp == NULL); 19837 if (connp != NULL) 19838 mutex_enter(&connp->conn_lock); 19839 mutex_enter(&ill->ill_lock); 19840 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0); 19841 mutex_exit(&ill->ill_lock); 19842 if (connp != NULL) 19843 mutex_exit(&connp->conn_lock); 19844 if (!success) 19845 return (EINTR); 19846 19847 /* 19848 * Crank up IPv6 neighbor discovery 19849 * Unlike ARP, this should complete when 19850 * ipif_ndp_up returns. However, for 19851 * ILLF_XRESOLV interfaces we also send a 19852 * AR_INTERFACE_UP to the external resolver. 19853 * That ioctl will complete in ip_rput. 19854 */ 19855 if (isv6) { 19856 err = ipif_ndp_up(ipif); 19857 if (err != 0) { 19858 if (err != EINPROGRESS) 19859 mp = ipsq_pending_mp_get(ipsq, &connp); 19860 return (err); 19861 } 19862 } 19863 /* Now, ARP */ 19864 err = ipif_resolver_up(ipif, Res_act_initial); 19865 if (err == EINPROGRESS) { 19866 /* We will complete it in ip_arp_done */ 19867 return (err); 19868 } 19869 mp = ipsq_pending_mp_get(ipsq, &connp); 19870 ASSERT(mp != NULL); 19871 if (err != 0) 19872 return (err); 19873 } else { 19874 /* 19875 * Interfaces without underlying hardware don't do duplicate 19876 * address detection. 19877 */ 19878 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); 19879 ipif->ipif_addr_ready = 1; 19880 } 19881 return (isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif)); 19882 } 19883 19884 /* 19885 * Perform a bind for the physical device. 19886 * When the routine returns EINPROGRESS then mp has been consumed and 19887 * the ioctl will be acked from ip_rput_dlpi. 19888 * Allocate an unbind message and save it until ipif_down. 
19889 */ 19890 static int 19891 ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) 19892 { 19893 areq_t *areq; 19894 mblk_t *areq_mp = NULL; 19895 mblk_t *bind_mp = NULL; 19896 mblk_t *unbind_mp = NULL; 19897 conn_t *connp; 19898 boolean_t success; 19899 uint16_t sap_addr; 19900 19901 ip1dbg(("ill_dl_up(%s)\n", ill->ill_name)); 19902 ASSERT(IAM_WRITER_ILL(ill)); 19903 ASSERT(mp != NULL); 19904 19905 /* Create a resolver cookie for ARP */ 19906 if (!ill->ill_isv6 && ill->ill_net_type == IRE_IF_RESOLVER) { 19907 areq_mp = ill_arp_alloc(ill, (uchar_t *)&ip_areq_template, 0); 19908 if (areq_mp == NULL) 19909 return (ENOMEM); 19910 19911 freemsg(ill->ill_resolver_mp); 19912 ill->ill_resolver_mp = areq_mp; 19913 areq = (areq_t *)areq_mp->b_rptr; 19914 sap_addr = ill->ill_sap; 19915 bcopy(&sap_addr, areq->areq_sap, sizeof (sap_addr)); 19916 } 19917 bind_mp = ip_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long), 19918 DL_BIND_REQ); 19919 if (bind_mp == NULL) 19920 goto bad; 19921 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ill->ill_sap; 19922 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS; 19923 19924 unbind_mp = ip_dlpi_alloc(sizeof (dl_unbind_req_t), DL_UNBIND_REQ); 19925 if (unbind_mp == NULL) 19926 goto bad; 19927 19928 /* 19929 * Record state needed to complete this operation when the 19930 * DL_BIND_ACK shows up. Also remember the pre-allocated mblks. 19931 */ 19932 ASSERT(WR(q)->q_next == NULL); 19933 connp = Q_TO_CONN(q); 19934 19935 mutex_enter(&connp->conn_lock); 19936 mutex_enter(&ipif->ipif_ill->ill_lock); 19937 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0); 19938 mutex_exit(&ipif->ipif_ill->ill_lock); 19939 mutex_exit(&connp->conn_lock); 19940 if (!success) 19941 goto bad; 19942 19943 /* 19944 * Save the unbind message for ill_dl_down(); it will be consumed when 19945 * the interface goes down. 
19946 */ 19947 ASSERT(ill->ill_unbind_mp == NULL); 19948 ill->ill_unbind_mp = unbind_mp; 19949 19950 ill_dlpi_send(ill, bind_mp); 19951 /* Send down link-layer capabilities probe if not already done. */ 19952 ill_capability_probe(ill); 19953 19954 /* 19955 * Sysid used to rely on the fact that netboots set domainname 19956 * and the like. Now that miniroot boots aren't strictly netboots 19957 * and miniroot network configuration is driven from userland 19958 * these things still need to be set. This situation can be detected 19959 * by comparing the interface being configured here to the one 19960 * dhcack was set to reference by the boot loader. Once sysid is 19961 * converted to use dhcp_ipc_getinfo() this call can go away. 19962 */ 19963 if ((ipif->ipif_flags & IPIF_DHCPRUNNING) && (dhcack != NULL) && 19964 (strcmp(ill->ill_name, dhcack) == 0) && 19965 (strlen(srpc_domain) == 0)) { 19966 if (dhcpinit() != 0) 19967 cmn_err(CE_WARN, "no cached dhcp response"); 19968 } 19969 19970 /* 19971 * This operation will complete in ip_rput_dlpi with either 19972 * a DL_BIND_ACK or DL_ERROR_ACK. 19973 */ 19974 return (EINPROGRESS); 19975 bad: 19976 ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name)); 19977 /* 19978 * We don't have to check for possible removal from illgrp 19979 * as we have not yet inserted in illgrp. For groups 19980 * without names, this ipif is still not UP and hence 19981 * this could not have possibly had any influence in forming 19982 * groups. 19983 */ 19984 19985 freemsg(bind_mp); 19986 freemsg(unbind_mp); 19987 return (ENOMEM); 19988 } 19989 19990 uint_t ip_loopback_mtuplus = IP_LOOPBACK_MTU + IP_SIMPLE_HDR_LENGTH + 20; 19991 19992 /* 19993 * DLPI and ARP is up. 19994 * Create all the IREs associated with an interface bring up multicast. 19995 * Set the interface flag and finish other initialization 19996 * that potentially had to be differed to after DL_BIND_ACK. 
19997 */ 19998 int 19999 ipif_up_done(ipif_t *ipif) 20000 { 20001 ire_t *ire_array[20]; 20002 ire_t **irep = ire_array; 20003 ire_t **irep1; 20004 ipaddr_t net_mask = 0; 20005 ipaddr_t subnet_mask, route_mask; 20006 ill_t *ill = ipif->ipif_ill; 20007 queue_t *stq; 20008 ipif_t *src_ipif; 20009 ipif_t *tmp_ipif; 20010 boolean_t flush_ire_cache = B_TRUE; 20011 int err = 0; 20012 phyint_t *phyi; 20013 ire_t **ipif_saved_irep = NULL; 20014 int ipif_saved_ire_cnt; 20015 int cnt; 20016 boolean_t src_ipif_held = B_FALSE; 20017 boolean_t ire_added = B_FALSE; 20018 boolean_t loopback = B_FALSE; 20019 ip_stack_t *ipst = ill->ill_ipst; 20020 20021 ip1dbg(("ipif_up_done(%s:%u)\n", 20022 ipif->ipif_ill->ill_name, ipif->ipif_id)); 20023 /* Check if this is a loopback interface */ 20024 if (ipif->ipif_ill->ill_wq == NULL) 20025 loopback = B_TRUE; 20026 20027 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 20028 /* 20029 * If all other interfaces for this ill are down or DEPRECATED, 20030 * or otherwise unsuitable for source address selection, remove 20031 * any IRE_CACHE entries for this ill to make sure source 20032 * address selection gets to take this new ipif into account. 20033 * No need to hold ill_lock while traversing the ipif list since 20034 * we are writer 20035 */ 20036 for (tmp_ipif = ill->ill_ipif; tmp_ipif; 20037 tmp_ipif = tmp_ipif->ipif_next) { 20038 if (((tmp_ipif->ipif_flags & 20039 (IPIF_NOXMIT|IPIF_ANYCAST|IPIF_NOLOCAL|IPIF_DEPRECATED)) || 20040 !(tmp_ipif->ipif_flags & IPIF_UP)) || 20041 (tmp_ipif == ipif)) 20042 continue; 20043 /* first useable pre-existing interface */ 20044 flush_ire_cache = B_FALSE; 20045 break; 20046 } 20047 if (flush_ire_cache) 20048 ire_walk_ill_v4(MATCH_IRE_ILL_GROUP | MATCH_IRE_TYPE, 20049 IRE_CACHE, ill_ipif_cache_delete, (char *)ill, ill); 20050 20051 /* 20052 * Figure out which way the send-to queue should go. Only 20053 * IRE_IF_RESOLVER or IRE_IF_NORESOLVER or IRE_LOOPBACK 20054 * should show up here. 
20055 */ 20056 switch (ill->ill_net_type) { 20057 case IRE_IF_RESOLVER: 20058 stq = ill->ill_rq; 20059 break; 20060 case IRE_IF_NORESOLVER: 20061 case IRE_LOOPBACK: 20062 stq = ill->ill_wq; 20063 break; 20064 default: 20065 return (EINVAL); 20066 } 20067 20068 if (IS_LOOPBACK(ill)) { 20069 /* 20070 * lo0:1 and subsequent ipifs were marked IRE_LOCAL in 20071 * ipif_lookup_on_name(), but in the case of zones we can have 20072 * several loopback addresses on lo0. So all the interfaces with 20073 * loopback addresses need to be marked IRE_LOOPBACK. 20074 */ 20075 if (V4_PART_OF_V6(ipif->ipif_v6lcl_addr) == 20076 htonl(INADDR_LOOPBACK)) 20077 ipif->ipif_ire_type = IRE_LOOPBACK; 20078 else 20079 ipif->ipif_ire_type = IRE_LOCAL; 20080 } 20081 20082 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED)) { 20083 /* 20084 * Can't use our source address. Select a different 20085 * source address for the IRE_INTERFACE and IRE_LOCAL 20086 */ 20087 src_ipif = ipif_select_source(ipif->ipif_ill, 20088 ipif->ipif_subnet, ipif->ipif_zoneid); 20089 if (src_ipif == NULL) 20090 src_ipif = ipif; /* Last resort */ 20091 else 20092 src_ipif_held = B_TRUE; 20093 } else { 20094 src_ipif = ipif; 20095 } 20096 20097 /* Create all the IREs associated with this interface */ 20098 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 20099 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 20100 20101 /* 20102 * If we're on a labeled system then make sure that zone- 20103 * private addresses have proper remote host database entries. 20104 */ 20105 if (is_system_labeled() && 20106 ipif->ipif_ire_type != IRE_LOOPBACK && 20107 !tsol_check_interface_address(ipif)) 20108 return (EINVAL); 20109 20110 /* Register the source address for __sin6_src_id */ 20111 err = ip_srcid_insert(&ipif->ipif_v6lcl_addr, 20112 ipif->ipif_zoneid, ipst); 20113 if (err != 0) { 20114 ip0dbg(("ipif_up_done: srcid_insert %d\n", err)); 20115 return (err); 20116 } 20117 20118 /* If the interface address is set, create the local IRE. 
*/ 20119 ip1dbg(("ipif_up_done: 0x%p creating IRE 0x%x for 0x%x\n", 20120 (void *)ipif, 20121 ipif->ipif_ire_type, 20122 ntohl(ipif->ipif_lcl_addr))); 20123 *irep++ = ire_create( 20124 (uchar_t *)&ipif->ipif_lcl_addr, /* dest address */ 20125 (uchar_t *)&ip_g_all_ones, /* mask */ 20126 (uchar_t *)&src_ipif->ipif_src_addr, /* source address */ 20127 NULL, /* no gateway */ 20128 &ip_loopback_mtuplus, /* max frag size */ 20129 NULL, 20130 ipif->ipif_rq, /* recv-from queue */ 20131 NULL, /* no send-to queue */ 20132 ipif->ipif_ire_type, /* LOCAL or LOOPBACK */ 20133 ipif, 20134 0, 20135 0, 20136 0, 20137 (ipif->ipif_flags & IPIF_PRIVATE) ? 20138 RTF_PRIVATE : 0, 20139 &ire_uinfo_null, 20140 NULL, 20141 NULL, 20142 ipst); 20143 } else { 20144 ip1dbg(( 20145 "ipif_up_done: not creating IRE %d for 0x%x: flags 0x%x\n", 20146 ipif->ipif_ire_type, 20147 ntohl(ipif->ipif_lcl_addr), 20148 (uint_t)ipif->ipif_flags)); 20149 } 20150 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 20151 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 20152 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 20153 } else { 20154 net_mask = htonl(IN_CLASSA_NET); /* fallback */ 20155 } 20156 20157 subnet_mask = ipif->ipif_net_mask; 20158 20159 /* 20160 * If mask was not specified, use natural netmask of 20161 * interface address. Also, store this mask back into the 20162 * ipif struct. 20163 */ 20164 if (subnet_mask == 0) { 20165 subnet_mask = net_mask; 20166 V4MASK_TO_V6(subnet_mask, ipif->ipif_v6net_mask); 20167 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 20168 ipif->ipif_v6subnet); 20169 } 20170 20171 /* Set up the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. 
*/ 20172 if (stq != NULL && !(ipif->ipif_flags & IPIF_NOXMIT) && 20173 ipif->ipif_subnet != INADDR_ANY) { 20174 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 20175 20176 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 20177 route_mask = IP_HOST_MASK; 20178 } else { 20179 route_mask = subnet_mask; 20180 } 20181 20182 ip1dbg(("ipif_up_done: ipif 0x%p ill 0x%p " 20183 "creating if IRE ill_net_type 0x%x for 0x%x\n", 20184 (void *)ipif, (void *)ill, 20185 ill->ill_net_type, 20186 ntohl(ipif->ipif_subnet))); 20187 *irep++ = ire_create( 20188 (uchar_t *)&ipif->ipif_subnet, /* dest address */ 20189 (uchar_t *)&route_mask, /* mask */ 20190 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 20191 NULL, /* no gateway */ 20192 &ipif->ipif_mtu, /* max frag */ 20193 NULL, 20194 NULL, /* no recv queue */ 20195 stq, /* send-to queue */ 20196 ill->ill_net_type, /* IF_[NO]RESOLVER */ 20197 ipif, 20198 0, 20199 0, 20200 0, 20201 (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE: 0, 20202 &ire_uinfo_null, 20203 NULL, 20204 NULL, 20205 ipst); 20206 } 20207 20208 /* 20209 * Create any necessary broadcast IREs. 20210 */ 20211 if ((ipif->ipif_subnet != INADDR_ANY) && 20212 (ipif->ipif_flags & IPIF_BROADCAST)) 20213 irep = ipif_create_bcast_ires(ipif, irep); 20214 20215 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 20216 20217 /* If an earlier ire_create failed, get out now */ 20218 for (irep1 = irep; irep1 > ire_array; ) { 20219 irep1--; 20220 if (*irep1 == NULL) { 20221 ip1dbg(("ipif_up_done: NULL ire found in ire_array\n")); 20222 err = ENOMEM; 20223 goto bad; 20224 } 20225 } 20226 20227 /* 20228 * Need to atomically check for ip_addr_availablity_check 20229 * under ip_addr_avail_lock, and if it fails got bad, and remove 20230 * from group also.The ill_g_lock is grabbed as reader 20231 * just to make sure no new ills or new ipifs are being added 20232 * to the system while we are checking the uniqueness of addresses. 
20233 */ 20234 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 20235 mutex_enter(&ipst->ips_ip_addr_avail_lock); 20236 /* Mark it up, and increment counters. */ 20237 ipif->ipif_flags |= IPIF_UP; 20238 ill->ill_ipif_up_count++; 20239 err = ip_addr_availability_check(ipif); 20240 mutex_exit(&ipst->ips_ip_addr_avail_lock); 20241 rw_exit(&ipst->ips_ill_g_lock); 20242 20243 if (err != 0) { 20244 /* 20245 * Our address may already be up on the same ill. In this case, 20246 * the ARP entry for our ipif replaced the one for the other 20247 * ipif. So we don't want to delete it (otherwise the other ipif 20248 * would be unable to send packets). 20249 * ip_addr_availability_check() identifies this case for us and 20250 * returns EADDRINUSE; we need to turn it into EADDRNOTAVAIL 20251 * which is the expected error code. 20252 */ 20253 if (err == EADDRINUSE) { 20254 freemsg(ipif->ipif_arp_del_mp); 20255 ipif->ipif_arp_del_mp = NULL; 20256 err = EADDRNOTAVAIL; 20257 } 20258 ill->ill_ipif_up_count--; 20259 ipif->ipif_flags &= ~IPIF_UP; 20260 goto bad; 20261 } 20262 20263 /* 20264 * Add in all newly created IREs. ire_create_bcast() has 20265 * already checked for duplicates of the IRE_BROADCAST type. 20266 * We want to add before we call ifgrp_insert which wants 20267 * to know whether IRE_IF_RESOLVER exists or not. 20268 * 20269 * NOTE : We refrele the ire though we may branch to "bad" 20270 * later on where we do ire_delete. This is okay 20271 * because nobody can delete it as we are running 20272 * exclusively. 20273 */ 20274 for (irep1 = irep; irep1 > ire_array; ) { 20275 irep1--; 20276 ASSERT(!MUTEX_HELD(&((*irep1)->ire_ipif->ipif_ill->ill_lock))); 20277 /* 20278 * refheld by ire_add. refele towards the end of the func 20279 */ 20280 (void) ire_add(irep1, NULL, NULL, NULL, B_FALSE); 20281 } 20282 ire_added = B_TRUE; 20283 /* 20284 * Form groups if possible. 
20285 * 20286 * If we are supposed to be in a ill_group with a name, insert it 20287 * now as we know that at least one ipif is UP. Otherwise form 20288 * nameless groups. 20289 * 20290 * If ip_enable_group_ifs is set and ipif address is not 0, insert 20291 * this ipif into the appropriate interface group, or create a 20292 * new one. If this is already in a nameless group, we try to form 20293 * a bigger group looking at other ills potentially sharing this 20294 * ipif's prefix. 20295 */ 20296 phyi = ill->ill_phyint; 20297 if (phyi->phyint_groupname_len != 0) { 20298 ASSERT(phyi->phyint_groupname != NULL); 20299 if (ill->ill_ipif_up_count == 1) { 20300 ASSERT(ill->ill_group == NULL); 20301 err = illgrp_insert(&ipst->ips_illgrp_head_v4, ill, 20302 phyi->phyint_groupname, NULL, B_TRUE); 20303 if (err != 0) { 20304 ip1dbg(("ipif_up_done: illgrp allocation " 20305 "failed, error %d\n", err)); 20306 goto bad; 20307 } 20308 } 20309 ASSERT(ill->ill_group != NULL); 20310 } 20311 20312 /* 20313 * When this is part of group, we need to make sure that 20314 * any broadcast ires created because of this ipif coming 20315 * UP gets marked/cleared with IRE_MARK_NORECV appropriately 20316 * so that we don't receive duplicate broadcast packets. 20317 */ 20318 if (ill->ill_group != NULL && ill->ill_ipif_up_count != 0) 20319 ipif_renominate_bcast(ipif); 20320 20321 /* Recover any additional IRE_IF_[NO]RESOLVER entries for this ipif */ 20322 ipif_saved_ire_cnt = ipif->ipif_saved_ire_cnt; 20323 ipif_saved_irep = ipif_recover_ire(ipif); 20324 20325 if (!loopback) { 20326 /* 20327 * If the broadcast address has been set, make sure it makes 20328 * sense based on the interface address. 20329 * Only match on ill since we are sharing broadcast addresses. 
20330 */ 20331 if ((ipif->ipif_brd_addr != INADDR_ANY) && 20332 (ipif->ipif_flags & IPIF_BROADCAST)) { 20333 ire_t *ire; 20334 20335 ire = ire_ctable_lookup(ipif->ipif_brd_addr, 0, 20336 IRE_BROADCAST, ipif, ALL_ZONES, 20337 NULL, (MATCH_IRE_TYPE | MATCH_IRE_ILL), ipst); 20338 20339 if (ire == NULL) { 20340 /* 20341 * If there isn't a matching broadcast IRE, 20342 * revert to the default for this netmask. 20343 */ 20344 ipif->ipif_v6brd_addr = ipv6_all_zeros; 20345 mutex_enter(&ipif->ipif_ill->ill_lock); 20346 ipif_set_default(ipif); 20347 mutex_exit(&ipif->ipif_ill->ill_lock); 20348 } else { 20349 ire_refrele(ire); 20350 } 20351 } 20352 20353 } 20354 20355 /* This is the first interface on this ill */ 20356 if (ipif->ipif_ipif_up_count == 1 && !loopback) { 20357 /* 20358 * Need to recover all multicast memberships in the driver. 20359 * This had to be deferred until we had attached. 20360 */ 20361 ill_recover_multicast(ill); 20362 } 20363 /* Join the allhosts multicast address */ 20364 ipif_multicast_up(ipif); 20365 20366 if (!loopback) { 20367 /* 20368 * See whether anybody else would benefit from the 20369 * new ipif that we added. We call this always rather 20370 * than while adding a non-IPIF_NOLOCAL/DEPRECATED/ANYCAST 20371 * ipif is for the benefit of illgrp_insert (done above) 20372 * which does not do source address selection as it does 20373 * not want to re-create interface routes that we are 20374 * having reference to it here. 20375 */ 20376 ill_update_source_selection(ill); 20377 } 20378 20379 for (irep1 = irep; irep1 > ire_array; ) { 20380 irep1--; 20381 if (*irep1 != NULL) { 20382 /* was held in ire_add */ 20383 ire_refrele(*irep1); 20384 } 20385 } 20386 20387 cnt = ipif_saved_ire_cnt; 20388 for (irep1 = ipif_saved_irep; cnt > 0; irep1++, cnt--) { 20389 if (*irep1 != NULL) { 20390 /* was held in ire_add */ 20391 ire_refrele(*irep1); 20392 } 20393 } 20394 20395 if (!loopback && ipif->ipif_addr_ready) { 20396 /* Broadcast an address mask reply. 
*/ 20397 ipif_mask_reply(ipif); 20398 } 20399 if (ipif_saved_irep != NULL) { 20400 kmem_free(ipif_saved_irep, 20401 ipif_saved_ire_cnt * sizeof (ire_t *)); 20402 } 20403 if (src_ipif_held) 20404 ipif_refrele(src_ipif); 20405 20406 /* 20407 * This had to be deferred until we had bound. Tell routing sockets and 20408 * others that this interface is up if it looks like the address has 20409 * been validated. Otherwise, if it isn't ready yet, wait for 20410 * duplicate address detection to do its thing. 20411 */ 20412 if (ipif->ipif_addr_ready) { 20413 ip_rts_ifmsg(ipif); 20414 ip_rts_newaddrmsg(RTM_ADD, 0, ipif); 20415 /* Let SCTP update the status for this ipif */ 20416 sctp_update_ipif(ipif, SCTP_IPIF_UP); 20417 } 20418 return (0); 20419 20420 bad: 20421 ip1dbg(("ipif_up_done: FAILED \n")); 20422 /* 20423 * We don't have to bother removing from ill groups because 20424 * 20425 * 1) For groups with names, we insert only when the first ipif 20426 * comes up. In that case if it fails, it will not be in any 20427 * group. So, we need not try to remove for that case. 20428 * 20429 * 2) For groups without names, either we tried to insert ipif_ill 20430 * in a group as singleton or found some other group to become 20431 * a bigger group. For the former, if it fails we don't have 20432 * anything to do as ipif_ill is not in the group and for the 20433 * latter, there are no failures in illgrp_insert/illgrp_delete 20434 * (ENOMEM can't occur for this. Check ifgrp_insert). 
20435 */ 20436 while (irep > ire_array) { 20437 irep--; 20438 if (*irep != NULL) { 20439 ire_delete(*irep); 20440 if (ire_added) 20441 ire_refrele(*irep); 20442 } 20443 } 20444 (void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst); 20445 20446 if (ipif_saved_irep != NULL) { 20447 kmem_free(ipif_saved_irep, 20448 ipif_saved_ire_cnt * sizeof (ire_t *)); 20449 } 20450 if (src_ipif_held) 20451 ipif_refrele(src_ipif); 20452 20453 ipif_arp_down(ipif); 20454 return (err); 20455 } 20456 20457 /* 20458 * Turn off the ARP with the ILLF_NOARP flag. 20459 */ 20460 static int 20461 ill_arp_off(ill_t *ill) 20462 { 20463 mblk_t *arp_off_mp = NULL; 20464 mblk_t *arp_on_mp = NULL; 20465 20466 ip1dbg(("ill_arp_off(%s)\n", ill->ill_name)); 20467 20468 ASSERT(IAM_WRITER_ILL(ill)); 20469 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); 20470 20471 /* 20472 * If the on message is still around we've already done 20473 * an arp_off without doing an arp_on thus there is no 20474 * work needed. 20475 */ 20476 if (ill->ill_arp_on_mp != NULL) 20477 return (0); 20478 20479 /* 20480 * Allocate an ARP on message (to be saved) and an ARP off message 20481 */ 20482 arp_off_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aroff_template, 0); 20483 if (!arp_off_mp) 20484 return (ENOMEM); 20485 20486 arp_on_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aron_template, 0); 20487 if (!arp_on_mp) 20488 goto failed; 20489 20490 ASSERT(ill->ill_arp_on_mp == NULL); 20491 ill->ill_arp_on_mp = arp_on_mp; 20492 20493 /* Send an AR_INTERFACE_OFF request */ 20494 putnext(ill->ill_rq, arp_off_mp); 20495 return (0); 20496 failed: 20497 20498 if (arp_off_mp) 20499 freemsg(arp_off_mp); 20500 return (ENOMEM); 20501 } 20502 20503 /* 20504 * Turn on ARP by turning off the ILLF_NOARP flag. 
20505 */ 20506 static int 20507 ill_arp_on(ill_t *ill) 20508 { 20509 mblk_t *mp; 20510 20511 ip1dbg(("ipif_arp_on(%s)\n", ill->ill_name)); 20512 20513 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); 20514 20515 ASSERT(IAM_WRITER_ILL(ill)); 20516 /* 20517 * Send an AR_INTERFACE_ON request if we have already done 20518 * an arp_off (which allocated the message). 20519 */ 20520 if (ill->ill_arp_on_mp != NULL) { 20521 mp = ill->ill_arp_on_mp; 20522 ill->ill_arp_on_mp = NULL; 20523 putnext(ill->ill_rq, mp); 20524 } 20525 return (0); 20526 } 20527 20528 /* 20529 * Called after either deleting ill from the group or when setting 20530 * FAILED or STANDBY on the interface. 20531 */ 20532 static void 20533 illgrp_reset_schednext(ill_t *ill) 20534 { 20535 ill_group_t *illgrp; 20536 ill_t *save_ill; 20537 20538 ASSERT(IAM_WRITER_ILL(ill)); 20539 /* 20540 * When called from illgrp_delete, ill_group will be non-NULL. 20541 * But when called from ip_sioctl_flags, it could be NULL if 20542 * somebody is setting FAILED/INACTIVE on some interface which 20543 * is not part of a group. 20544 */ 20545 illgrp = ill->ill_group; 20546 if (illgrp == NULL) 20547 return; 20548 if (illgrp->illgrp_ill_schednext != ill) 20549 return; 20550 20551 illgrp->illgrp_ill_schednext = NULL; 20552 save_ill = ill; 20553 /* 20554 * Choose a good ill to be the next one for 20555 * outbound traffic. As the flags FAILED/STANDBY is 20556 * not yet marked when called from ip_sioctl_flags, 20557 * we check for ill separately. 20558 */ 20559 for (ill = illgrp->illgrp_ill; ill != NULL; 20560 ill = ill->ill_group_next) { 20561 if ((ill != save_ill) && 20562 !(ill->ill_phyint->phyint_flags & 20563 (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE))) { 20564 illgrp->illgrp_ill_schednext = ill; 20565 return; 20566 } 20567 } 20568 } 20569 20570 /* 20571 * Given an ill, find the next ill in the group to be scheduled. 20572 * (This should be called by ip_newroute() before ire_create().) 
20573 * The passed in ill may be pulled out of the group, after we have picked 20574 * up a different outgoing ill from the same group. However ire add will 20575 * atomically check this. 20576 */ 20577 ill_t * 20578 illgrp_scheduler(ill_t *ill) 20579 { 20580 ill_t *retill; 20581 ill_group_t *illgrp; 20582 int illcnt; 20583 int i; 20584 uint64_t flags; 20585 ip_stack_t *ipst = ill->ill_ipst; 20586 20587 /* 20588 * We don't use a lock to check for the ill_group. If this ill 20589 * is currently being inserted we may end up just returning this 20590 * ill itself. That is ok. 20591 */ 20592 if (ill->ill_group == NULL) { 20593 ill_refhold(ill); 20594 return (ill); 20595 } 20596 20597 /* 20598 * Grab the ill_g_lock as reader to make sure we are dealing with 20599 * a set of stable ills. No ill can be added or deleted or change 20600 * group while we hold the reader lock. 20601 */ 20602 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 20603 if ((illgrp = ill->ill_group) == NULL) { 20604 rw_exit(&ipst->ips_ill_g_lock); 20605 ill_refhold(ill); 20606 return (ill); 20607 } 20608 20609 illcnt = illgrp->illgrp_ill_count; 20610 mutex_enter(&illgrp->illgrp_lock); 20611 retill = illgrp->illgrp_ill_schednext; 20612 20613 if (retill == NULL) 20614 retill = illgrp->illgrp_ill; 20615 20616 /* 20617 * We do a circular search beginning at illgrp_ill_schednext 20618 * or illgrp_ill. We don't check the flags against the ill lock 20619 * since it can change anytime. The ire creation will be atomic 20620 * and will fail if the ill is FAILED or OFFLINE. 
20621 */ 20622 for (i = 0; i < illcnt; i++) { 20623 flags = retill->ill_phyint->phyint_flags; 20624 20625 if (!(flags & (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) && 20626 ILL_CAN_LOOKUP(retill)) { 20627 illgrp->illgrp_ill_schednext = retill->ill_group_next; 20628 ill_refhold(retill); 20629 break; 20630 } 20631 retill = retill->ill_group_next; 20632 if (retill == NULL) 20633 retill = illgrp->illgrp_ill; 20634 } 20635 mutex_exit(&illgrp->illgrp_lock); 20636 rw_exit(&ipst->ips_ill_g_lock); 20637 20638 return (i == illcnt ? NULL : retill); 20639 } 20640 20641 /* 20642 * Checks for availbility of a usable source address (if there is one) when the 20643 * destination ILL has the ill_usesrc_ifindex pointing to another ILL. Note 20644 * this selection is done regardless of the destination. 20645 */ 20646 boolean_t 20647 ipif_usesrc_avail(ill_t *ill, zoneid_t zoneid) 20648 { 20649 uint_t ifindex; 20650 ipif_t *ipif = NULL; 20651 ill_t *uill; 20652 boolean_t isv6; 20653 ip_stack_t *ipst = ill->ill_ipst; 20654 20655 ASSERT(ill != NULL); 20656 20657 isv6 = ill->ill_isv6; 20658 ifindex = ill->ill_usesrc_ifindex; 20659 if (ifindex != 0) { 20660 uill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, 20661 NULL, ipst); 20662 if (uill == NULL) 20663 return (NULL); 20664 mutex_enter(&uill->ill_lock); 20665 for (ipif = uill->ill_ipif; ipif != NULL; 20666 ipif = ipif->ipif_next) { 20667 if (!IPIF_CAN_LOOKUP(ipif)) 20668 continue; 20669 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 20670 continue; 20671 if (!(ipif->ipif_flags & IPIF_UP)) 20672 continue; 20673 if (ipif->ipif_zoneid != zoneid) 20674 continue; 20675 if ((isv6 && 20676 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) || 20677 (ipif->ipif_lcl_addr == INADDR_ANY)) 20678 continue; 20679 mutex_exit(&uill->ill_lock); 20680 ill_refrele(uill); 20681 return (B_TRUE); 20682 } 20683 mutex_exit(&uill->ill_lock); 20684 ill_refrele(uill); 20685 } 20686 return (B_FALSE); 20687 } 20688 20689 /* 20690 * Determine the best 
source address given a destination address and an ill. 20691 * Prefers non-deprecated over deprecated but will return a deprecated 20692 * address if there is no other choice. If there is a usable source address 20693 * on the interface pointed to by ill_usesrc_ifindex then that is given 20694 * first preference. 20695 * 20696 * Returns NULL if there is no suitable source address for the ill. 20697 * This only occurs when there is no valid source address for the ill. 20698 */ 20699 ipif_t * 20700 ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid) 20701 { 20702 ipif_t *ipif; 20703 ipif_t *ipif_dep = NULL; /* Fallback to deprecated */ 20704 ipif_t *ipif_arr[MAX_IPIF_SELECT_SOURCE]; 20705 int index = 0; 20706 boolean_t wrapped = B_FALSE; 20707 boolean_t same_subnet_only = B_FALSE; 20708 boolean_t ipif_same_found, ipif_other_found; 20709 boolean_t specific_found; 20710 ill_t *till, *usill = NULL; 20711 tsol_tpc_t *src_rhtp, *dst_rhtp; 20712 ip_stack_t *ipst = ill->ill_ipst; 20713 20714 if (ill->ill_usesrc_ifindex != 0) { 20715 usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex, 20716 B_FALSE, NULL, NULL, NULL, NULL, ipst); 20717 if (usill != NULL) 20718 ill = usill; /* Select source from usesrc ILL */ 20719 else 20720 return (NULL); 20721 } 20722 20723 /* 20724 * If we're dealing with an unlabeled destination on a labeled system, 20725 * make sure that we ignore source addresses that are incompatible with 20726 * the destination's default label. That destination's default label 20727 * must dominate the minimum label on the source address. 20728 */ 20729 dst_rhtp = NULL; 20730 if (is_system_labeled()) { 20731 dst_rhtp = find_tpc(&dst, IPV4_VERSION, B_FALSE); 20732 if (dst_rhtp == NULL) 20733 return (NULL); 20734 if (dst_rhtp->tpc_tp.host_type != UNLABELED) { 20735 TPC_RELE(dst_rhtp); 20736 dst_rhtp = NULL; 20737 } 20738 } 20739 20740 /* 20741 * Holds the ill_g_lock as reader. This makes sure that no ipif/ill 20742 * can be deleted. 
But an ipif/ill can get CONDEMNED any time. 20743 * After selecting the right ipif, under ill_lock make sure ipif is 20744 * not condemned, and increment refcnt. If ipif is CONDEMNED, 20745 * we retry. Inside the loop we still need to check for CONDEMNED, 20746 * but not under a lock. 20747 */ 20748 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 20749 20750 retry: 20751 till = ill; 20752 ipif_arr[0] = NULL; 20753 20754 if (till->ill_group != NULL) 20755 till = till->ill_group->illgrp_ill; 20756 20757 /* 20758 * Choose one good source address from each ill across the group. 20759 * If possible choose a source address in the same subnet as 20760 * the destination address. 20761 * 20762 * We don't check for PHYI_FAILED or PHYI_INACTIVE or PHYI_OFFLINE 20763 * This is okay because of the following. 20764 * 20765 * If PHYI_FAILED is set and we still have non-deprecated 20766 * addresses, it means the addresses have not yet been 20767 * failed over to a different interface. We potentially 20768 * select them to create IRE_CACHES, which will be later 20769 * flushed when the addresses move over. 20770 * 20771 * If PHYI_INACTIVE is set and we still have non-deprecated 20772 * addresses, it means either the user has configured them 20773 * or PHYI_INACTIVE has not been cleared after the addresses 20774 * been moved over. For the former, in.mpathd does a failover 20775 * when the interface becomes INACTIVE and hence we should 20776 * not find them. Once INACTIVE is set, we don't allow them 20777 * to create logical interfaces anymore. For the latter, a 20778 * flush will happen when INACTIVE is cleared which will 20779 * flush the IRE_CACHES. 20780 * 20781 * If PHYI_OFFLINE is set, all the addresses will be failed 20782 * over soon. We potentially select them to create IRE_CACHEs, 20783 * which will be later flushed when the addresses move over. 
20784 * 20785 * NOTE : As ipif_select_source is called to borrow source address 20786 * for an ipif that is part of a group, source address selection 20787 * will be re-done whenever the group changes i.e either an 20788 * insertion/deletion in the group. 20789 * 20790 * Fill ipif_arr[] with source addresses, using these rules: 20791 * 20792 * 1. At most one source address from a given ill ends up 20793 * in ipif_arr[] -- that is, at most one of the ipif's 20794 * associated with a given ill ends up in ipif_arr[]. 20795 * 20796 * 2. If there is at least one non-deprecated ipif in the 20797 * IPMP group with a source address on the same subnet as 20798 * our destination, then fill ipif_arr[] only with 20799 * source addresses on the same subnet as our destination. 20800 * Note that because of (1), only the first 20801 * non-deprecated ipif found with a source address 20802 * matching the destination ends up in ipif_arr[]. 20803 * 20804 * 3. Otherwise, fill ipif_arr[] with non-deprecated source 20805 * addresses not in the same subnet as our destination. 20806 * Again, because of (1), only the first off-subnet source 20807 * address will be chosen. 20808 * 20809 * 4. If there are no non-deprecated ipifs, then just use 20810 * the source address associated with the last deprecated 20811 * one we find that happens to be on the same subnet, 20812 * otherwise the first one not in the same subnet. 
20813 */ 20814 specific_found = B_FALSE; 20815 for (; till != NULL; till = till->ill_group_next) { 20816 ipif_same_found = B_FALSE; 20817 ipif_other_found = B_FALSE; 20818 for (ipif = till->ill_ipif; ipif != NULL; 20819 ipif = ipif->ipif_next) { 20820 if (!IPIF_CAN_LOOKUP(ipif)) 20821 continue; 20822 /* Always skip NOLOCAL and ANYCAST interfaces */ 20823 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 20824 continue; 20825 if (!(ipif->ipif_flags & IPIF_UP) || 20826 !ipif->ipif_addr_ready) 20827 continue; 20828 if (ipif->ipif_zoneid != zoneid && 20829 ipif->ipif_zoneid != ALL_ZONES) 20830 continue; 20831 /* 20832 * Interfaces with 0.0.0.0 address are allowed to be UP, 20833 * but are not valid as source addresses. 20834 */ 20835 if (ipif->ipif_lcl_addr == INADDR_ANY) 20836 continue; 20837 20838 /* 20839 * Check compatibility of local address for 20840 * destination's default label if we're on a labeled 20841 * system. Incompatible addresses can't be used at 20842 * all. 20843 */ 20844 if (dst_rhtp != NULL) { 20845 boolean_t incompat; 20846 20847 src_rhtp = find_tpc(&ipif->ipif_lcl_addr, 20848 IPV4_VERSION, B_FALSE); 20849 if (src_rhtp == NULL) 20850 continue; 20851 incompat = 20852 src_rhtp->tpc_tp.host_type != SUN_CIPSO || 20853 src_rhtp->tpc_tp.tp_doi != 20854 dst_rhtp->tpc_tp.tp_doi || 20855 (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label, 20856 &src_rhtp->tpc_tp.tp_sl_range_cipso) && 20857 !blinlset(&dst_rhtp->tpc_tp.tp_def_label, 20858 src_rhtp->tpc_tp.tp_sl_set_cipso)); 20859 TPC_RELE(src_rhtp); 20860 if (incompat) 20861 continue; 20862 } 20863 20864 /* 20865 * We prefer not to use all all-zones addresses, if we 20866 * can avoid it, as they pose problems with unlabeled 20867 * destinations. 
20868 */ 20869 if (ipif->ipif_zoneid != ALL_ZONES) { 20870 if (!specific_found && 20871 (!same_subnet_only || 20872 (ipif->ipif_net_mask & dst) == 20873 ipif->ipif_subnet)) { 20874 index = 0; 20875 specific_found = B_TRUE; 20876 ipif_other_found = B_FALSE; 20877 } 20878 } else { 20879 if (specific_found) 20880 continue; 20881 } 20882 if (ipif->ipif_flags & IPIF_DEPRECATED) { 20883 if (ipif_dep == NULL || 20884 (ipif->ipif_net_mask & dst) == 20885 ipif->ipif_subnet) 20886 ipif_dep = ipif; 20887 continue; 20888 } 20889 if ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet) { 20890 /* found a source address in the same subnet */ 20891 if (!same_subnet_only) { 20892 same_subnet_only = B_TRUE; 20893 index = 0; 20894 } 20895 ipif_same_found = B_TRUE; 20896 } else { 20897 if (same_subnet_only || ipif_other_found) 20898 continue; 20899 ipif_other_found = B_TRUE; 20900 } 20901 ipif_arr[index++] = ipif; 20902 if (index == MAX_IPIF_SELECT_SOURCE) { 20903 wrapped = B_TRUE; 20904 index = 0; 20905 } 20906 if (ipif_same_found) 20907 break; 20908 } 20909 } 20910 20911 if (ipif_arr[0] == NULL) { 20912 ipif = ipif_dep; 20913 } else { 20914 if (wrapped) 20915 index = MAX_IPIF_SELECT_SOURCE; 20916 ipif = ipif_arr[ipif_rand(ipst) % index]; 20917 ASSERT(ipif != NULL); 20918 } 20919 20920 if (ipif != NULL) { 20921 mutex_enter(&ipif->ipif_ill->ill_lock); 20922 if (!IPIF_CAN_LOOKUP(ipif)) { 20923 mutex_exit(&ipif->ipif_ill->ill_lock); 20924 goto retry; 20925 } 20926 ipif_refhold_locked(ipif); 20927 mutex_exit(&ipif->ipif_ill->ill_lock); 20928 } 20929 20930 rw_exit(&ipst->ips_ill_g_lock); 20931 if (usill != NULL) 20932 ill_refrele(usill); 20933 if (dst_rhtp != NULL) 20934 TPC_RELE(dst_rhtp); 20935 20936 #ifdef DEBUG 20937 if (ipif == NULL) { 20938 char buf1[INET6_ADDRSTRLEN]; 20939 20940 ip1dbg(("ipif_select_source(%s, %s) -> NULL\n", 20941 ill->ill_name, 20942 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)))); 20943 } else { 20944 char buf1[INET6_ADDRSTRLEN]; 20945 char 
buf2[INET6_ADDRSTRLEN]; 20946 20947 ip1dbg(("ipif_select_source(%s, %s) -> %s\n", 20948 ipif->ipif_ill->ill_name, 20949 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)), 20950 inet_ntop(AF_INET, &ipif->ipif_lcl_addr, 20951 buf2, sizeof (buf2)))); 20952 } 20953 #endif /* DEBUG */ 20954 return (ipif); 20955 } 20956 20957 20958 /* 20959 * If old_ipif is not NULL, see if ipif was derived from old 20960 * ipif and if so, recreate the interface route by re-doing 20961 * source address selection. This happens when ipif_down -> 20962 * ipif_update_other_ipifs calls us. 20963 * 20964 * If old_ipif is NULL, just redo the source address selection 20965 * if needed. This happens when illgrp_insert or ipif_up_done 20966 * calls us. 20967 */ 20968 static void 20969 ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif) 20970 { 20971 ire_t *ire; 20972 ire_t *ipif_ire; 20973 queue_t *stq; 20974 ipif_t *nipif; 20975 ill_t *ill; 20976 boolean_t need_rele = B_FALSE; 20977 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 20978 20979 ASSERT(old_ipif == NULL || IAM_WRITER_IPIF(old_ipif)); 20980 ASSERT(IAM_WRITER_IPIF(ipif)); 20981 20982 ill = ipif->ipif_ill; 20983 if (!(ipif->ipif_flags & 20984 (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) { 20985 /* 20986 * Can't possibly have borrowed the source 20987 * from old_ipif. 20988 */ 20989 return; 20990 } 20991 20992 /* 20993 * Is there any work to be done? No work if the address 20994 * is INADDR_ANY, loopback or NOLOCAL or ANYCAST ( 20995 * ipif_select_source() does not borrow addresses from 20996 * NOLOCAL and ANYCAST interfaces). 20997 */ 20998 if ((old_ipif != NULL) && 20999 ((old_ipif->ipif_lcl_addr == INADDR_ANY) || 21000 (old_ipif->ipif_ill->ill_wq == NULL) || 21001 (old_ipif->ipif_flags & 21002 (IPIF_NOLOCAL|IPIF_ANYCAST)))) { 21003 return; 21004 } 21005 21006 /* 21007 * Perform the same checks as when creating the 21008 * IRE_INTERFACE in ipif_up_done. 
21009 */ 21010 if (!(ipif->ipif_flags & IPIF_UP)) 21011 return; 21012 21013 if ((ipif->ipif_flags & IPIF_NOXMIT) || 21014 (ipif->ipif_subnet == INADDR_ANY)) 21015 return; 21016 21017 ipif_ire = ipif_to_ire(ipif); 21018 if (ipif_ire == NULL) 21019 return; 21020 21021 /* 21022 * We know that ipif uses some other source for its 21023 * IRE_INTERFACE. Is it using the source of this 21024 * old_ipif? 21025 */ 21026 if (old_ipif != NULL && 21027 old_ipif->ipif_lcl_addr != ipif_ire->ire_src_addr) { 21028 ire_refrele(ipif_ire); 21029 return; 21030 } 21031 if (ip_debug > 2) { 21032 /* ip1dbg */ 21033 pr_addr_dbg("ipif_recreate_interface_routes: deleting IRE for" 21034 " src %s\n", AF_INET, &ipif_ire->ire_src_addr); 21035 } 21036 21037 stq = ipif_ire->ire_stq; 21038 21039 /* 21040 * Can't use our source address. Select a different 21041 * source address for the IRE_INTERFACE. 21042 */ 21043 nipif = ipif_select_source(ill, ipif->ipif_subnet, ipif->ipif_zoneid); 21044 if (nipif == NULL) { 21045 /* Last resort - all ipif's have IPIF_NOLOCAL */ 21046 nipif = ipif; 21047 } else { 21048 need_rele = B_TRUE; 21049 } 21050 21051 ire = ire_create( 21052 (uchar_t *)&ipif->ipif_subnet, /* dest pref */ 21053 (uchar_t *)&ipif->ipif_net_mask, /* mask */ 21054 (uchar_t *)&nipif->ipif_src_addr, /* src addr */ 21055 NULL, /* no gateway */ 21056 &ipif->ipif_mtu, /* max frag */ 21057 NULL, /* no src nce */ 21058 NULL, /* no recv from queue */ 21059 stq, /* send-to queue */ 21060 ill->ill_net_type, /* IF_[NO]RESOLVER */ 21061 ipif, 21062 0, 21063 0, 21064 0, 21065 0, 21066 &ire_uinfo_null, 21067 NULL, 21068 NULL, 21069 ipst); 21070 21071 if (ire != NULL) { 21072 ire_t *ret_ire; 21073 int error; 21074 21075 /* 21076 * We don't need ipif_ire anymore. We need to delete 21077 * before we add so that ire_add does not detect 21078 * duplicates. 
21079 */ 21080 ire_delete(ipif_ire); 21081 ret_ire = ire; 21082 error = ire_add(&ret_ire, NULL, NULL, NULL, B_FALSE); 21083 ASSERT(error == 0); 21084 ASSERT(ire == ret_ire); 21085 /* Held in ire_add */ 21086 ire_refrele(ret_ire); 21087 } 21088 /* 21089 * Either we are falling through from above or could not 21090 * allocate a replacement. 21091 */ 21092 ire_refrele(ipif_ire); 21093 if (need_rele) 21094 ipif_refrele(nipif); 21095 } 21096 21097 /* 21098 * This old_ipif is going away. 21099 * 21100 * Determine if any other ipif's is using our address as 21101 * ipif_lcl_addr (due to those being IPIF_NOLOCAL, IPIF_ANYCAST, or 21102 * IPIF_DEPRECATED). 21103 * Find the IRE_INTERFACE for such ipifs and recreate them 21104 * to use an different source address following the rules in 21105 * ipif_up_done. 21106 * 21107 * This function takes an illgrp as an argument so that illgrp_delete 21108 * can call this to update source address even after deleting the 21109 * old_ipif->ipif_ill from the ill group. 21110 */ 21111 static void 21112 ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp) 21113 { 21114 ipif_t *ipif; 21115 ill_t *ill; 21116 char buf[INET6_ADDRSTRLEN]; 21117 21118 ASSERT(IAM_WRITER_IPIF(old_ipif)); 21119 ASSERT(illgrp == NULL || IAM_WRITER_IPIF(old_ipif)); 21120 21121 ill = old_ipif->ipif_ill; 21122 21123 ip1dbg(("ipif_update_other_ipifs(%s, %s)\n", 21124 ill->ill_name, 21125 inet_ntop(AF_INET, &old_ipif->ipif_lcl_addr, 21126 buf, sizeof (buf)))); 21127 /* 21128 * If this part of a group, look at all ills as ipif_select_source 21129 * borrows source address across all the ills in the group. 
21130 */ 21131 if (illgrp != NULL) 21132 ill = illgrp->illgrp_ill; 21133 21134 for (; ill != NULL; ill = ill->ill_group_next) { 21135 for (ipif = ill->ill_ipif; ipif != NULL; 21136 ipif = ipif->ipif_next) { 21137 21138 if (ipif == old_ipif) 21139 continue; 21140 21141 ipif_recreate_interface_routes(old_ipif, ipif); 21142 } 21143 } 21144 } 21145 21146 /* ARGSUSED */ 21147 int 21148 if_unitsel_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 21149 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 21150 { 21151 /* 21152 * ill_phyint_reinit merged the v4 and v6 into a single 21153 * ipsq. Could also have become part of a ipmp group in the 21154 * process, and we might not have been able to complete the 21155 * operation in ipif_set_values, if we could not become 21156 * exclusive. If so restart it here. 21157 */ 21158 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q)); 21159 } 21160 21161 21162 /* 21163 * Can operate on either a module or a driver queue. 21164 * Returns an error if not a module queue. 21165 */ 21166 /* ARGSUSED */ 21167 int 21168 if_unitsel(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 21169 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 21170 { 21171 queue_t *q1 = q; 21172 char *cp; 21173 char interf_name[LIFNAMSIZ]; 21174 uint_t ppa = *(uint_t *)mp->b_cont->b_cont->b_rptr; 21175 21176 if (q->q_next == NULL) { 21177 ip1dbg(( 21178 "if_unitsel: IF_UNITSEL: no q_next\n")); 21179 return (EINVAL); 21180 } 21181 21182 if (((ill_t *)(q->q_ptr))->ill_name[0] != '\0') 21183 return (EALREADY); 21184 21185 do { 21186 q1 = q1->q_next; 21187 } while (q1->q_next); 21188 cp = q1->q_qinfo->qi_minfo->mi_idname; 21189 (void) sprintf(interf_name, "%s%d", cp, ppa); 21190 21191 /* 21192 * Here we are not going to delay the ioack until after 21193 * ACKs from DL_ATTACH_REQ/DL_BIND_REQ. So no need to save the 21194 * original ioctl message before sending the requests. 
21195 */ 21196 return (ipif_set_values(q, mp, interf_name, &ppa)); 21197 } 21198 21199 /* ARGSUSED */ 21200 int 21201 ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 21202 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 21203 { 21204 return (ENXIO); 21205 } 21206 21207 /* 21208 * Create any IRE_BROADCAST entries for `ipif', and store those entries in 21209 * `irep'. Returns a pointer to the next free `irep' entry (just like 21210 * ire_check_and_create_bcast()). 21211 */ 21212 static ire_t ** 21213 ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep) 21214 { 21215 ipaddr_t addr; 21216 ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr); 21217 ipaddr_t subnetmask = ipif->ipif_net_mask; 21218 int flags = MATCH_IRE_TYPE | MATCH_IRE_ILL; 21219 21220 ip1dbg(("ipif_create_bcast_ires: creating broadcast IREs\n")); 21221 21222 ASSERT(ipif->ipif_flags & IPIF_BROADCAST); 21223 21224 if (ipif->ipif_lcl_addr == INADDR_ANY || 21225 (ipif->ipif_flags & IPIF_NOLOCAL)) 21226 netmask = htonl(IN_CLASSA_NET); /* fallback */ 21227 21228 irep = ire_check_and_create_bcast(ipif, 0, irep, flags); 21229 irep = ire_check_and_create_bcast(ipif, INADDR_BROADCAST, irep, flags); 21230 21231 /* 21232 * For backward compatibility, we create net broadcast IREs based on 21233 * the old "IP address class system", since some old machines only 21234 * respond to these class derived net broadcast. However, we must not 21235 * create these net broadcast IREs if the subnetmask is shorter than 21236 * the IP address class based derived netmask. Otherwise, we may 21237 * create a net broadcast address which is the same as an IP address 21238 * on the subnet -- and then TCP will refuse to talk to that address. 
21239 */ 21240 if (netmask < subnetmask) { 21241 addr = netmask & ipif->ipif_subnet; 21242 irep = ire_check_and_create_bcast(ipif, addr, irep, flags); 21243 irep = ire_check_and_create_bcast(ipif, ~netmask | addr, irep, 21244 flags); 21245 } 21246 21247 /* 21248 * Don't create IRE_BROADCAST IREs for the interface if the subnetmask 21249 * is 0xFFFFFFFF, as an IRE_LOCAL for that interface is already 21250 * created. Creating these broadcast IREs will only create confusion 21251 * as `addr' will be the same as the IP address. 21252 */ 21253 if (subnetmask != 0xFFFFFFFF) { 21254 addr = ipif->ipif_subnet; 21255 irep = ire_check_and_create_bcast(ipif, addr, irep, flags); 21256 irep = ire_check_and_create_bcast(ipif, ~subnetmask | addr, 21257 irep, flags); 21258 } 21259 21260 return (irep); 21261 } 21262 21263 /* 21264 * Broadcast IRE info structure used in the functions below. Since we 21265 * allocate BCAST_COUNT of them on the stack, keep the bit layout compact. 21266 */ 21267 typedef struct bcast_ireinfo { 21268 uchar_t bi_type; /* BCAST_* value from below */ 21269 uchar_t bi_willdie:1, /* will this IRE be going away? */ 21270 bi_needrep:1, /* do we need to replace it? */ 21271 bi_haverep:1, /* have we replaced it? */ 21272 bi_pad:5; 21273 ipaddr_t bi_addr; /* IRE address */ 21274 ipif_t *bi_backup; /* last-ditch ipif to replace it on */ 21275 } bcast_ireinfo_t; 21276 21277 enum { BCAST_ALLONES, BCAST_ALLZEROES, BCAST_NET, BCAST_SUBNET, BCAST_COUNT }; 21278 21279 /* 21280 * Check if `ipif' needs the dying broadcast IRE described by `bireinfop', and 21281 * return B_TRUE if it should immediately be used to recreate the IRE. 
21282 */ 21283 static boolean_t 21284 ipif_consider_bcast(ipif_t *ipif, bcast_ireinfo_t *bireinfop) 21285 { 21286 ipaddr_t addr; 21287 21288 ASSERT(!bireinfop->bi_haverep && bireinfop->bi_willdie); 21289 21290 switch (bireinfop->bi_type) { 21291 case BCAST_NET: 21292 addr = ipif->ipif_subnet & ip_net_mask(ipif->ipif_subnet); 21293 if (addr != bireinfop->bi_addr) 21294 return (B_FALSE); 21295 break; 21296 case BCAST_SUBNET: 21297 if (ipif->ipif_subnet != bireinfop->bi_addr) 21298 return (B_FALSE); 21299 break; 21300 } 21301 21302 bireinfop->bi_needrep = 1; 21303 if (ipif->ipif_flags & (IPIF_DEPRECATED|IPIF_NOLOCAL|IPIF_ANYCAST)) { 21304 if (bireinfop->bi_backup == NULL) 21305 bireinfop->bi_backup = ipif; 21306 return (B_FALSE); 21307 } 21308 return (B_TRUE); 21309 } 21310 21311 /* 21312 * Create the broadcast IREs described by `bireinfop' on `ipif', and return 21313 * them ala ire_check_and_create_bcast(). 21314 */ 21315 static ire_t ** 21316 ipif_create_bcast(ipif_t *ipif, bcast_ireinfo_t *bireinfop, ire_t **irep) 21317 { 21318 ipaddr_t mask, addr; 21319 21320 ASSERT(!bireinfop->bi_haverep && bireinfop->bi_needrep); 21321 21322 addr = bireinfop->bi_addr; 21323 irep = ire_create_bcast(ipif, addr, irep); 21324 21325 switch (bireinfop->bi_type) { 21326 case BCAST_NET: 21327 mask = ip_net_mask(ipif->ipif_subnet); 21328 irep = ire_create_bcast(ipif, addr | ~mask, irep); 21329 break; 21330 case BCAST_SUBNET: 21331 mask = ipif->ipif_net_mask; 21332 irep = ire_create_bcast(ipif, addr | ~mask, irep); 21333 break; 21334 } 21335 21336 bireinfop->bi_haverep = 1; 21337 return (irep); 21338 } 21339 21340 /* 21341 * Walk through all of the ipifs on `ill' that will be affected by `test_ipif' 21342 * going away, and determine if any of the broadcast IREs (named by `bireinfop') 21343 * that are going away are still needed. If so, have ipif_create_bcast() 21344 * recreate them (except for the deprecated case, as explained below). 
21345 */ 21346 static ire_t ** 21347 ill_create_bcast(ill_t *ill, ipif_t *test_ipif, bcast_ireinfo_t *bireinfo, 21348 ire_t **irep) 21349 { 21350 int i; 21351 ipif_t *ipif; 21352 21353 ASSERT(!ill->ill_isv6); 21354 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 21355 /* 21356 * Skip this ipif if it's (a) the one being taken down, (b) 21357 * not in the same zone, or (c) has no valid local address. 21358 */ 21359 if (ipif == test_ipif || 21360 ipif->ipif_zoneid != test_ipif->ipif_zoneid || 21361 ipif->ipif_subnet == 0 || 21362 (ipif->ipif_flags & (IPIF_UP|IPIF_BROADCAST|IPIF_NOXMIT)) != 21363 (IPIF_UP|IPIF_BROADCAST)) 21364 continue; 21365 21366 /* 21367 * For each dying IRE that hasn't yet been replaced, see if 21368 * `ipif' needs it and whether the IRE should be recreated on 21369 * `ipif'. If `ipif' is deprecated, ipif_consider_bcast() 21370 * will return B_FALSE even if `ipif' needs the IRE on the 21371 * hopes that we'll later find a needy non-deprecated ipif. 21372 * However, the ipif is recorded in bi_backup for possible 21373 * subsequent use by ipif_check_bcast_ires(). 21374 */ 21375 for (i = 0; i < BCAST_COUNT; i++) { 21376 if (!bireinfo[i].bi_willdie || bireinfo[i].bi_haverep) 21377 continue; 21378 if (!ipif_consider_bcast(ipif, &bireinfo[i])) 21379 continue; 21380 irep = ipif_create_bcast(ipif, &bireinfo[i], irep); 21381 } 21382 21383 /* 21384 * If we've replaced all of the broadcast IREs that are going 21385 * to be taken down, we know we're done. 21386 */ 21387 for (i = 0; i < BCAST_COUNT; i++) { 21388 if (bireinfo[i].bi_willdie && !bireinfo[i].bi_haverep) 21389 break; 21390 } 21391 if (i == BCAST_COUNT) 21392 break; 21393 } 21394 return (irep); 21395 } 21396 21397 /* 21398 * Check if `test_ipif' (which is going away) is associated with any existing 21399 * broadcast IREs, and whether any other ipifs (e.g., on the same ill) were 21400 * using those broadcast IREs. 
If so, recreate the broadcast IREs on one or 21401 * more of those other ipifs. (The old IREs will be deleted in ipif_down().) 21402 * 21403 * This is necessary because broadcast IREs are shared. In particular, a 21404 * given ill has one set of all-zeroes and all-ones broadcast IREs (for every 21405 * zone), plus one set of all-subnet-ones, all-subnet-zeroes, all-net-ones, 21406 * and all-net-zeroes for every net/subnet (and every zone) it has IPIF_UP 21407 * ipifs on. Thus, if there are two IPIF_UP ipifs on the same subnet with the 21408 * same zone, they will share the same set of broadcast IREs. 21409 * 21410 * Note: the upper bound of 12 IREs comes from the worst case of replacing all 21411 * six pairs (loopback and non-loopback) of broadcast IREs (all-zeroes, 21412 * all-ones, subnet-zeroes, subnet-ones, net-zeroes, and net-ones). 21413 */ 21414 static void 21415 ipif_check_bcast_ires(ipif_t *test_ipif) 21416 { 21417 ill_t *ill = test_ipif->ipif_ill; 21418 ire_t *ire, *ire_array[12]; /* see note above */ 21419 ire_t **irep1, **irep = &ire_array[0]; 21420 uint_t i, willdie; 21421 ipaddr_t mask = ip_net_mask(test_ipif->ipif_subnet); 21422 bcast_ireinfo_t bireinfo[BCAST_COUNT]; 21423 21424 ASSERT(!test_ipif->ipif_isv6); 21425 ASSERT(IAM_WRITER_IPIF(test_ipif)); 21426 21427 /* 21428 * No broadcast IREs for the LOOPBACK interface 21429 * or others such as point to point and IPIF_NOXMIT. 
21430 */ 21431 if (!(test_ipif->ipif_flags & IPIF_BROADCAST) || 21432 (test_ipif->ipif_flags & IPIF_NOXMIT)) 21433 return; 21434 21435 bzero(bireinfo, sizeof (bireinfo)); 21436 bireinfo[0].bi_type = BCAST_ALLZEROES; 21437 bireinfo[0].bi_addr = 0; 21438 21439 bireinfo[1].bi_type = BCAST_ALLONES; 21440 bireinfo[1].bi_addr = INADDR_BROADCAST; 21441 21442 bireinfo[2].bi_type = BCAST_NET; 21443 bireinfo[2].bi_addr = test_ipif->ipif_subnet & mask; 21444 21445 if (test_ipif->ipif_net_mask != 0) 21446 mask = test_ipif->ipif_net_mask; 21447 bireinfo[3].bi_type = BCAST_SUBNET; 21448 bireinfo[3].bi_addr = test_ipif->ipif_subnet & mask; 21449 21450 /* 21451 * Figure out what (if any) broadcast IREs will die as a result of 21452 * `test_ipif' going away. If none will die, we're done. 21453 */ 21454 for (i = 0, willdie = 0; i < BCAST_COUNT; i++) { 21455 ire = ire_ctable_lookup(bireinfo[i].bi_addr, 0, IRE_BROADCAST, 21456 test_ipif, ALL_ZONES, NULL, 21457 (MATCH_IRE_TYPE | MATCH_IRE_IPIF), ill->ill_ipst); 21458 if (ire != NULL) { 21459 willdie++; 21460 bireinfo[i].bi_willdie = 1; 21461 ire_refrele(ire); 21462 } 21463 } 21464 21465 if (willdie == 0) 21466 return; 21467 21468 /* 21469 * Walk through all the ipifs that will be affected by the dying IREs, 21470 * and recreate the IREs as necessary. 21471 */ 21472 irep = ill_create_bcast(ill, test_ipif, bireinfo, irep); 21473 21474 /* 21475 * Scan through the set of broadcast IREs and see if there are any 21476 * that we need to replace that have not yet been replaced. If so, 21477 * replace them using the appropriate backup ipif. 21478 */ 21479 for (i = 0; i < BCAST_COUNT; i++) { 21480 if (bireinfo[i].bi_needrep && !bireinfo[i].bi_haverep) 21481 irep = ipif_create_bcast(bireinfo[i].bi_backup, 21482 &bireinfo[i], irep); 21483 } 21484 21485 /* 21486 * If we can't create all of them, don't add any of them. 
(Code in 21487 * ip_wput_ire() and ire_to_ill() assumes that we always have a 21488 * non-loopback copy and loopback copy for a given address.) 21489 */ 21490 for (irep1 = irep; irep1 > ire_array; ) { 21491 irep1--; 21492 if (*irep1 == NULL) { 21493 ip0dbg(("ipif_check_bcast_ires: can't create " 21494 "IRE_BROADCAST, memory allocation failure\n")); 21495 while (irep > ire_array) { 21496 irep--; 21497 if (*irep != NULL) 21498 ire_delete(*irep); 21499 } 21500 return; 21501 } 21502 } 21503 21504 for (irep1 = irep; irep1 > ire_array; ) { 21505 irep1--; 21506 if (ire_add(irep1, NULL, NULL, NULL, B_FALSE) == 0) 21507 ire_refrele(*irep1); /* Held in ire_add */ 21508 } 21509 } 21510 21511 /* 21512 * Extract both the flags (including IFF_CANTCHANGE) such as IFF_IPV* 21513 * from lifr_flags and the name from lifr_name. 21514 * Set IFF_IPV* and ill_isv6 prior to doing the lookup 21515 * since ipif_lookup_on_name uses the _isv6 flags when matching. 21516 * Returns EINPROGRESS when mp has been consumed by queueing it on 21517 * ill_pending_mp and the ioctl will complete in ip_rput. 21518 * 21519 * Can operate on either a module or a driver queue. 21520 * Returns an error if not a module queue. 21521 */ 21522 /* ARGSUSED */ 21523 int 21524 ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21525 ip_ioctl_cmd_t *ipip, void *if_req) 21526 { 21527 int err; 21528 ill_t *ill; 21529 struct lifreq *lifr = (struct lifreq *)if_req; 21530 21531 ASSERT(ipif != NULL); 21532 ip1dbg(("ip_sioctl_slifname %s\n", lifr->lifr_name)); 21533 21534 if (q->q_next == NULL) { 21535 ip1dbg(( 21536 "if_sioctl_slifname: SIOCSLIFNAME: no q_next\n")); 21537 return (EINVAL); 21538 } 21539 21540 ill = (ill_t *)q->q_ptr; 21541 /* 21542 * If we are not writer on 'q' then this interface exists already 21543 * and previous lookups (ipif_extract_lifreq()) found this ipif. 
21544 * So return EALREADY 21545 */ 21546 if (ill != ipif->ipif_ill) 21547 return (EALREADY); 21548 21549 if (ill->ill_name[0] != '\0') 21550 return (EALREADY); 21551 21552 /* 21553 * Set all the flags. Allows all kinds of override. Provide some 21554 * sanity checking by not allowing IFF_BROADCAST and IFF_MULTICAST 21555 * unless there is either multicast/broadcast support in the driver 21556 * or it is a pt-pt link. 21557 */ 21558 if (lifr->lifr_flags & (IFF_PROMISC|IFF_ALLMULTI)) { 21559 /* Meaningless to IP thus don't allow them to be set. */ 21560 ip1dbg(("ip_setname: EINVAL 1\n")); 21561 return (EINVAL); 21562 } 21563 /* 21564 * For a DL_STYLE2 driver (ill_needs_attach), we would not have the 21565 * ill_bcast_addr_length info. 21566 */ 21567 if (!ill->ill_needs_attach && 21568 ((lifr->lifr_flags & IFF_MULTICAST) && 21569 !(lifr->lifr_flags & IFF_POINTOPOINT) && 21570 ill->ill_bcast_addr_length == 0)) { 21571 /* Link not broadcast/pt-pt capable i.e. no multicast */ 21572 ip1dbg(("ip_setname: EINVAL 2\n")); 21573 return (EINVAL); 21574 } 21575 if ((lifr->lifr_flags & IFF_BROADCAST) && 21576 ((lifr->lifr_flags & IFF_IPV6) || 21577 (!ill->ill_needs_attach && ill->ill_bcast_addr_length == 0))) { 21578 /* Link not broadcast capable or IPv6 i.e. no broadcast */ 21579 ip1dbg(("ip_setname: EINVAL 3\n")); 21580 return (EINVAL); 21581 } 21582 if (lifr->lifr_flags & IFF_UP) { 21583 /* Can only be set with SIOCSLIFFLAGS */ 21584 ip1dbg(("ip_setname: EINVAL 4\n")); 21585 return (EINVAL); 21586 } 21587 if ((lifr->lifr_flags & (IFF_IPV6|IFF_IPV4)) != IFF_IPV6 && 21588 (lifr->lifr_flags & (IFF_IPV6|IFF_IPV4)) != IFF_IPV4) { 21589 ip1dbg(("ip_setname: EINVAL 5\n")); 21590 return (EINVAL); 21591 } 21592 /* 21593 * Only allow the IFF_XRESOLV flag to be set on IPv6 interfaces. 
21594 */ 21595 if ((lifr->lifr_flags & IFF_XRESOLV) && 21596 !(lifr->lifr_flags & IFF_IPV6) && 21597 !(ipif->ipif_isv6)) { 21598 ip1dbg(("ip_setname: EINVAL 6\n")); 21599 return (EINVAL); 21600 } 21601 21602 /* 21603 * The user has done SIOCGLIFFLAGS prior to this ioctl and hence 21604 * we have all the flags here. So, we assign rather than we OR. 21605 * We can't OR the flags here because we don't want to set 21606 * both IFF_IPV4 and IFF_IPV6. We start off as IFF_IPV4 in 21607 * ipif_allocate and become IFF_IPV4 or IFF_IPV6 here depending 21608 * on lifr_flags value here. 21609 */ 21610 /* 21611 * This ill has not been inserted into the global list. 21612 * So we are still single threaded and don't need any lock 21613 */ 21614 ipif->ipif_flags = lifr->lifr_flags & IFF_LOGINT_FLAGS & 21615 ~IFF_DUPLICATE; 21616 ill->ill_flags = lifr->lifr_flags & IFF_PHYINTINST_FLAGS; 21617 ill->ill_phyint->phyint_flags = lifr->lifr_flags & IFF_PHYINT_FLAGS; 21618 21619 /* We started off as V4. */ 21620 if (ill->ill_flags & ILLF_IPV6) { 21621 ill->ill_phyint->phyint_illv6 = ill; 21622 ill->ill_phyint->phyint_illv4 = NULL; 21623 } 21624 err = ipif_set_values(q, mp, lifr->lifr_name, &lifr->lifr_ppa); 21625 return (err); 21626 } 21627 21628 /* ARGSUSED */ 21629 int 21630 ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21631 ip_ioctl_cmd_t *ipip, void *if_req) 21632 { 21633 /* 21634 * ill_phyint_reinit merged the v4 and v6 into a single 21635 * ipsq. Could also have become part of a ipmp group in the 21636 * process, and we might not have been able to complete the 21637 * slifname in ipif_set_values, if we could not become 21638 * exclusive. If so restart it here 21639 */ 21640 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q)); 21641 } 21642 21643 /* 21644 * Return a pointer to the ipif which matches the index, IP version type and 21645 * zoneid. 
21646 */ 21647 ipif_t * 21648 ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid, 21649 queue_t *q, mblk_t *mp, ipsq_func_t func, int *err, ip_stack_t *ipst) 21650 { 21651 ill_t *ill; 21652 ipsq_t *ipsq; 21653 phyint_t *phyi; 21654 ipif_t *ipif; 21655 21656 ASSERT((q == NULL && mp == NULL && func == NULL && err == NULL) || 21657 (q != NULL && mp != NULL && func != NULL && err != NULL)); 21658 21659 if (err != NULL) 21660 *err = 0; 21661 21662 /* 21663 * Indexes are stored in the phyint - a common structure 21664 * to both IPv4 and IPv6. 21665 */ 21666 21667 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 21668 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 21669 (void *) &index, NULL); 21670 if (phyi != NULL) { 21671 ill = isv6 ? phyi->phyint_illv6 : phyi->phyint_illv4; 21672 if (ill == NULL) { 21673 rw_exit(&ipst->ips_ill_g_lock); 21674 if (err != NULL) 21675 *err = ENXIO; 21676 return (NULL); 21677 } 21678 GRAB_CONN_LOCK(q); 21679 mutex_enter(&ill->ill_lock); 21680 if (ILL_CAN_LOOKUP(ill)) { 21681 for (ipif = ill->ill_ipif; ipif != NULL; 21682 ipif = ipif->ipif_next) { 21683 if (IPIF_CAN_LOOKUP(ipif) && 21684 (zoneid == ALL_ZONES || 21685 zoneid == ipif->ipif_zoneid || 21686 ipif->ipif_zoneid == ALL_ZONES)) { 21687 ipif_refhold_locked(ipif); 21688 mutex_exit(&ill->ill_lock); 21689 RELEASE_CONN_LOCK(q); 21690 rw_exit(&ipst->ips_ill_g_lock); 21691 return (ipif); 21692 } 21693 } 21694 } else if (ILL_CAN_WAIT(ill, q)) { 21695 ipsq = ill->ill_phyint->phyint_ipsq; 21696 mutex_enter(&ipsq->ipsq_lock); 21697 rw_exit(&ipst->ips_ill_g_lock); 21698 mutex_exit(&ill->ill_lock); 21699 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 21700 mutex_exit(&ipsq->ipsq_lock); 21701 RELEASE_CONN_LOCK(q); 21702 *err = EINPROGRESS; 21703 return (NULL); 21704 } 21705 mutex_exit(&ill->ill_lock); 21706 RELEASE_CONN_LOCK(q); 21707 } 21708 rw_exit(&ipst->ips_ill_g_lock); 21709 if (err != NULL) 21710 *err = ENXIO; 21711 return (NULL); 21712 } 21713 21714 typedef 
struct conn_change_s { 21715 uint_t cc_old_ifindex; 21716 uint_t cc_new_ifindex; 21717 } conn_change_t; 21718 21719 /* 21720 * ipcl_walk function for changing interface index. 21721 */ 21722 static void 21723 conn_change_ifindex(conn_t *connp, caddr_t arg) 21724 { 21725 conn_change_t *connc; 21726 uint_t old_ifindex; 21727 uint_t new_ifindex; 21728 int i; 21729 ilg_t *ilg; 21730 21731 connc = (conn_change_t *)arg; 21732 old_ifindex = connc->cc_old_ifindex; 21733 new_ifindex = connc->cc_new_ifindex; 21734 21735 if (connp->conn_orig_bound_ifindex == old_ifindex) 21736 connp->conn_orig_bound_ifindex = new_ifindex; 21737 21738 if (connp->conn_orig_multicast_ifindex == old_ifindex) 21739 connp->conn_orig_multicast_ifindex = new_ifindex; 21740 21741 if (connp->conn_orig_xmit_ifindex == old_ifindex) 21742 connp->conn_orig_xmit_ifindex = new_ifindex; 21743 21744 for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) { 21745 ilg = &connp->conn_ilg[i]; 21746 if (ilg->ilg_orig_ifindex == old_ifindex) 21747 ilg->ilg_orig_ifindex = new_ifindex; 21748 } 21749 } 21750 21751 /* 21752 * Walk all the ipifs and ilms on this ill and change the orig_ifindex 21753 * to new_index if it matches the old_index. 21754 * 21755 * Failovers typically happen within a group of ills. But somebody 21756 * can remove an ill from the group after a failover happened. If 21757 * we are setting the ifindex after this, we potentially need to 21758 * look at all the ills rather than just the ones in the group. 21759 * We cut down the work by looking at matching ill_net_types 21760 * and ill_types as we could not possibly grouped them together. 
21761 */ 21762 static void 21763 ip_change_ifindex(ill_t *ill_orig, conn_change_t *connc) 21764 { 21765 ill_t *ill; 21766 ipif_t *ipif; 21767 uint_t old_ifindex; 21768 uint_t new_ifindex; 21769 ilm_t *ilm; 21770 ill_walk_context_t ctx; 21771 ip_stack_t *ipst = ill_orig->ill_ipst; 21772 21773 old_ifindex = connc->cc_old_ifindex; 21774 new_ifindex = connc->cc_new_ifindex; 21775 21776 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 21777 ill = ILL_START_WALK_ALL(&ctx, ipst); 21778 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 21779 if ((ill_orig->ill_net_type != ill->ill_net_type) || 21780 (ill_orig->ill_type != ill->ill_type)) { 21781 continue; 21782 } 21783 for (ipif = ill->ill_ipif; ipif != NULL; 21784 ipif = ipif->ipif_next) { 21785 if (ipif->ipif_orig_ifindex == old_ifindex) 21786 ipif->ipif_orig_ifindex = new_ifindex; 21787 } 21788 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 21789 if (ilm->ilm_orig_ifindex == old_ifindex) 21790 ilm->ilm_orig_ifindex = new_ifindex; 21791 } 21792 } 21793 rw_exit(&ipst->ips_ill_g_lock); 21794 } 21795 21796 /* 21797 * We first need to ensure that the new index is unique, and 21798 * then carry the change across both v4 and v6 ill representation 21799 * of the physical interface. 21800 */ 21801 /* ARGSUSED */ 21802 int 21803 ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21804 ip_ioctl_cmd_t *ipip, void *ifreq) 21805 { 21806 ill_t *ill; 21807 ill_t *ill_other; 21808 phyint_t *phyi; 21809 int old_index; 21810 conn_change_t connc; 21811 struct ifreq *ifr = (struct ifreq *)ifreq; 21812 struct lifreq *lifr = (struct lifreq *)ifreq; 21813 uint_t index; 21814 ill_t *ill_v4; 21815 ill_t *ill_v6; 21816 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 21817 21818 if (ipip->ipi_cmd_type == IF_CMD) 21819 index = ifr->ifr_index; 21820 else 21821 index = lifr->lifr_index; 21822 21823 /* 21824 * Only allow on physical interface. Also, index zero is illegal. 
21825 * 21826 * Need to check for PHYI_FAILED and PHYI_INACTIVE 21827 * 21828 * 1) If PHYI_FAILED is set, a failover could have happened which 21829 * implies a possible failback might have to happen. As failback 21830 * depends on the old index, we should fail setting the index. 21831 * 21832 * 2) If PHYI_INACTIVE is set, in.mpathd does a failover so that 21833 * any addresses or multicast memberships are failed over to 21834 * a non-STANDBY interface. As failback depends on the old 21835 * index, we should fail setting the index for this case also. 21836 * 21837 * 3) If PHYI_OFFLINE is set, a possible failover has happened. 21838 * Be consistent with PHYI_FAILED and fail the ioctl. 21839 */ 21840 ill = ipif->ipif_ill; 21841 phyi = ill->ill_phyint; 21842 if ((phyi->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) || 21843 ipif->ipif_id != 0 || index == 0) { 21844 return (EINVAL); 21845 } 21846 old_index = phyi->phyint_ifindex; 21847 21848 /* If the index is not changing, no work to do */ 21849 if (old_index == index) 21850 return (0); 21851 21852 /* 21853 * Use ill_lookup_on_ifindex to determine if the 21854 * new index is unused and if so allow the change. 21855 */ 21856 ill_v6 = ill_lookup_on_ifindex(index, B_TRUE, NULL, NULL, NULL, NULL, 21857 ipst); 21858 ill_v4 = ill_lookup_on_ifindex(index, B_FALSE, NULL, NULL, NULL, NULL, 21859 ipst); 21860 if (ill_v6 != NULL || ill_v4 != NULL) { 21861 if (ill_v4 != NULL) 21862 ill_refrele(ill_v4); 21863 if (ill_v6 != NULL) 21864 ill_refrele(ill_v6); 21865 return (EBUSY); 21866 } 21867 21868 /* 21869 * The new index is unused. Set it in the phyint. 21870 * Locate the other ill so that we can send a routing 21871 * sockets message. 
21872 */ 21873 if (ill->ill_isv6) { 21874 ill_other = phyi->phyint_illv4; 21875 } else { 21876 ill_other = phyi->phyint_illv6; 21877 } 21878 21879 phyi->phyint_ifindex = index; 21880 21881 /* Update SCTP's ILL list */ 21882 sctp_ill_reindex(ill, old_index); 21883 21884 connc.cc_old_ifindex = old_index; 21885 connc.cc_new_ifindex = index; 21886 ip_change_ifindex(ill, &connc); 21887 ipcl_walk(conn_change_ifindex, (caddr_t)&connc, ipst); 21888 21889 /* Send the routing sockets message */ 21890 ip_rts_ifmsg(ipif); 21891 if (ill_other != NULL) 21892 ip_rts_ifmsg(ill_other->ill_ipif); 21893 21894 return (0); 21895 } 21896 21897 /* ARGSUSED */ 21898 int 21899 ip_sioctl_get_lifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21900 ip_ioctl_cmd_t *ipip, void *ifreq) 21901 { 21902 struct ifreq *ifr = (struct ifreq *)ifreq; 21903 struct lifreq *lifr = (struct lifreq *)ifreq; 21904 21905 ip1dbg(("ip_sioctl_get_lifindex(%s:%u %p)\n", 21906 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 21907 /* Get the interface index */ 21908 if (ipip->ipi_cmd_type == IF_CMD) { 21909 ifr->ifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 21910 } else { 21911 lifr->lifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 21912 } 21913 return (0); 21914 } 21915 21916 /* ARGSUSED */ 21917 int 21918 ip_sioctl_get_lifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21919 ip_ioctl_cmd_t *ipip, void *ifreq) 21920 { 21921 struct lifreq *lifr = (struct lifreq *)ifreq; 21922 21923 ip1dbg(("ip_sioctl_get_lifzone(%s:%u %p)\n", 21924 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 21925 /* Get the interface zone */ 21926 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 21927 lifr->lifr_zoneid = ipif->ipif_zoneid; 21928 return (0); 21929 } 21930 21931 /* 21932 * Set the zoneid of an interface. 
21933 */ 21934 /* ARGSUSED */ 21935 int 21936 ip_sioctl_slifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21937 ip_ioctl_cmd_t *ipip, void *ifreq) 21938 { 21939 struct lifreq *lifr = (struct lifreq *)ifreq; 21940 int err = 0; 21941 boolean_t need_up = B_FALSE; 21942 zone_t *zptr; 21943 zone_status_t status; 21944 zoneid_t zoneid; 21945 21946 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 21947 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) { 21948 if (!is_system_labeled()) 21949 return (ENOTSUP); 21950 zoneid = GLOBAL_ZONEID; 21951 } 21952 21953 /* cannot assign instance zero to a non-global zone */ 21954 if (ipif->ipif_id == 0 && zoneid != GLOBAL_ZONEID) 21955 return (ENOTSUP); 21956 21957 /* 21958 * Cannot assign to a zone that doesn't exist or is shutting down. In 21959 * the event of a race with the zone shutdown processing, since IP 21960 * serializes this ioctl and SIOCGLIFCONF/SIOCLIFREMOVEIF, we know the 21961 * interface will be cleaned up even if the zone is shut down 21962 * immediately after the status check. If the interface can't be brought 21963 * down right away, and the zone is shut down before the restart 21964 * function is called, we resolve the possible races by rechecking the 21965 * zone status in the restart function. 21966 */ 21967 if ((zptr = zone_find_by_id(zoneid)) == NULL) 21968 return (EINVAL); 21969 status = zone_status_get(zptr); 21970 zone_rele(zptr); 21971 21972 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) 21973 return (EINVAL); 21974 21975 if (ipif->ipif_flags & IPIF_UP) { 21976 /* 21977 * If the interface is already marked up, 21978 * we call ipif_down which will take care 21979 * of ditching any IREs that have been set 21980 * up based on the old interface address. 
21981 */ 21982 err = ipif_logical_down(ipif, q, mp); 21983 if (err == EINPROGRESS) 21984 return (err); 21985 ipif_down_tail(ipif); 21986 need_up = B_TRUE; 21987 } 21988 21989 err = ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, need_up); 21990 return (err); 21991 } 21992 21993 static int 21994 ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid, 21995 queue_t *q, mblk_t *mp, boolean_t need_up) 21996 { 21997 int err = 0; 21998 ip_stack_t *ipst; 21999 22000 ip1dbg(("ip_sioctl_zoneid_tail(%s:%u %p)\n", 22001 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 22002 22003 if (CONN_Q(q)) 22004 ipst = CONNQ_TO_IPST(q); 22005 else 22006 ipst = ILLQ_TO_IPST(q); 22007 22008 /* 22009 * For exclusive stacks we don't allow a different zoneid than 22010 * global. 22011 */ 22012 if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID && 22013 zoneid != GLOBAL_ZONEID) 22014 return (EINVAL); 22015 22016 /* Set the new zone id. */ 22017 ipif->ipif_zoneid = zoneid; 22018 22019 /* Update sctp list */ 22020 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 22021 22022 if (need_up) { 22023 /* 22024 * Now bring the interface back up. If this 22025 * is the only IPIF for the ILL, ipif_up 22026 * will have to re-bind to the device, so 22027 * we may get back EINPROGRESS, in which 22028 * case, this IOCTL will get completed in 22029 * ip_rput_dlpi when we see the DL_BIND_ACK. 
22030 */ 22031 err = ipif_up(ipif, q, mp); 22032 } 22033 return (err); 22034 } 22035 22036 /* ARGSUSED */ 22037 int 22038 ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 22039 ip_ioctl_cmd_t *ipip, void *if_req) 22040 { 22041 struct lifreq *lifr = (struct lifreq *)if_req; 22042 zoneid_t zoneid; 22043 zone_t *zptr; 22044 zone_status_t status; 22045 22046 ASSERT(ipif->ipif_id != 0); 22047 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 22048 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) 22049 zoneid = GLOBAL_ZONEID; 22050 22051 ip1dbg(("ip_sioctl_slifzone_restart(%s:%u %p)\n", 22052 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 22053 22054 /* 22055 * We recheck the zone status to resolve the following race condition: 22056 * 1) process sends SIOCSLIFZONE to put hme0:1 in zone "myzone"; 22057 * 2) hme0:1 is up and can't be brought down right away; 22058 * ip_sioctl_slifzone() returns EINPROGRESS and the request is queued; 22059 * 3) zone "myzone" is halted; the zone status switches to 22060 * 'shutting_down' and the zones framework sends SIOCGLIFCONF to list 22061 * the interfaces to remove - hme0:1 is not returned because it's not 22062 * yet in "myzone", so it won't be removed; 22063 * 4) the restart function for SIOCSLIFZONE is called; without the 22064 * status check here, we would have hme0:1 in "myzone" after it's been 22065 * destroyed. 22066 * Note that if the status check fails, we need to bring the interface 22067 * back to its state prior to ip_sioctl_slifzone(), hence the call to 22068 * ipif_up_done[_v6](). 
22069 */ 22070 status = ZONE_IS_UNINITIALIZED; 22071 if ((zptr = zone_find_by_id(zoneid)) != NULL) { 22072 status = zone_status_get(zptr); 22073 zone_rele(zptr); 22074 } 22075 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) { 22076 if (ipif->ipif_isv6) { 22077 (void) ipif_up_done_v6(ipif); 22078 } else { 22079 (void) ipif_up_done(ipif); 22080 } 22081 return (EINVAL); 22082 } 22083 22084 ipif_down_tail(ipif); 22085 22086 return (ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, 22087 B_TRUE)); 22088 } 22089 22090 /* ARGSUSED */ 22091 int 22092 ip_sioctl_get_lifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 22093 ip_ioctl_cmd_t *ipip, void *ifreq) 22094 { 22095 struct lifreq *lifr = ifreq; 22096 22097 ASSERT(q->q_next == NULL); 22098 ASSERT(CONN_Q(q)); 22099 22100 ip1dbg(("ip_sioctl_get_lifusesrc(%s:%u %p)\n", 22101 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 22102 lifr->lifr_index = ipif->ipif_ill->ill_usesrc_ifindex; 22103 ip1dbg(("ip_sioctl_get_lifusesrc:lifr_index = %d\n", lifr->lifr_index)); 22104 22105 return (0); 22106 } 22107 22108 22109 /* Find the previous ILL in this usesrc group */ 22110 static ill_t * 22111 ill_prev_usesrc(ill_t *uill) 22112 { 22113 ill_t *ill; 22114 22115 for (ill = uill->ill_usesrc_grp_next; 22116 ASSERT(ill), ill->ill_usesrc_grp_next != uill; 22117 ill = ill->ill_usesrc_grp_next) 22118 /* do nothing */; 22119 return (ill); 22120 } 22121 22122 /* 22123 * Release all members of the usesrc group. This routine is called 22124 * from ill_delete when the interface being unplumbed is the 22125 * group head. 
22126 */ 22127 static void 22128 ill_disband_usesrc_group(ill_t *uill) 22129 { 22130 ill_t *next_ill, *tmp_ill; 22131 ip_stack_t *ipst = uill->ill_ipst; 22132 22133 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock)); 22134 next_ill = uill->ill_usesrc_grp_next; 22135 22136 do { 22137 ASSERT(next_ill != NULL); 22138 tmp_ill = next_ill->ill_usesrc_grp_next; 22139 ASSERT(tmp_ill != NULL); 22140 next_ill->ill_usesrc_grp_next = NULL; 22141 next_ill->ill_usesrc_ifindex = 0; 22142 next_ill = tmp_ill; 22143 } while (next_ill->ill_usesrc_ifindex != 0); 22144 uill->ill_usesrc_grp_next = NULL; 22145 } 22146 22147 /* 22148 * Remove the client usesrc ILL from the list and relink to a new list 22149 */ 22150 int 22151 ill_relink_usesrc_ills(ill_t *ucill, ill_t *uill, uint_t ifindex) 22152 { 22153 ill_t *ill, *tmp_ill; 22154 ip_stack_t *ipst = ucill->ill_ipst; 22155 22156 ASSERT((ucill != NULL) && (ucill->ill_usesrc_grp_next != NULL) && 22157 (uill != NULL) && RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock)); 22158 22159 /* 22160 * Check if the usesrc client ILL passed in is not already 22161 * in use as a usesrc ILL i.e one whose source address is 22162 * in use OR a usesrc ILL is not already in use as a usesrc 22163 * client ILL 22164 */ 22165 if ((ucill->ill_usesrc_ifindex == 0) || 22166 (uill->ill_usesrc_ifindex != 0)) { 22167 return (-1); 22168 } 22169 22170 ill = ill_prev_usesrc(ucill); 22171 ASSERT(ill->ill_usesrc_grp_next != NULL); 22172 22173 /* Remove from the current list */ 22174 if (ill->ill_usesrc_grp_next->ill_usesrc_grp_next == ill) { 22175 /* Only two elements in the list */ 22176 ASSERT(ill->ill_usesrc_ifindex == 0); 22177 ill->ill_usesrc_grp_next = NULL; 22178 } else { 22179 ill->ill_usesrc_grp_next = ucill->ill_usesrc_grp_next; 22180 } 22181 22182 if (ifindex == 0) { 22183 ucill->ill_usesrc_ifindex = 0; 22184 ucill->ill_usesrc_grp_next = NULL; 22185 return (0); 22186 } 22187 22188 ucill->ill_usesrc_ifindex = ifindex; 22189 tmp_ill = uill->ill_usesrc_grp_next; 
22190 uill->ill_usesrc_grp_next = ucill; 22191 ucill->ill_usesrc_grp_next = 22192 (tmp_ill != NULL) ? tmp_ill : uill; 22193 return (0); 22194 } 22195 22196 /* 22197 * Set the ill_usesrc and ill_usesrc_head fields. See synchronization notes in 22198 * ip.c for locking details. 22199 */ 22200 /* ARGSUSED */ 22201 int 22202 ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 22203 ip_ioctl_cmd_t *ipip, void *ifreq) 22204 { 22205 struct lifreq *lifr = (struct lifreq *)ifreq; 22206 boolean_t isv6 = B_FALSE, reset_flg = B_FALSE, 22207 ill_flag_changed = B_FALSE; 22208 ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill; 22209 int err = 0, ret; 22210 uint_t ifindex; 22211 phyint_t *us_phyint, *us_cli_phyint; 22212 ipsq_t *ipsq = NULL; 22213 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 22214 22215 ASSERT(IAM_WRITER_IPIF(ipif)); 22216 ASSERT(q->q_next == NULL); 22217 ASSERT(CONN_Q(q)); 22218 22219 isv6 = (Q_TO_CONN(q))->conn_af_isv6; 22220 us_cli_phyint = usesrc_cli_ill->ill_phyint; 22221 22222 ASSERT(us_cli_phyint != NULL); 22223 22224 /* 22225 * If the client ILL is being used for IPMP, abort. 
22226 * Note, this can be done before ipsq_try_enter since we are already 22227 * exclusive on this ILL 22228 */ 22229 if ((us_cli_phyint->phyint_groupname != NULL) || 22230 (us_cli_phyint->phyint_flags & PHYI_STANDBY)) { 22231 return (EINVAL); 22232 } 22233 22234 ifindex = lifr->lifr_index; 22235 if (ifindex == 0) { 22236 if (usesrc_cli_ill->ill_usesrc_grp_next == NULL) { 22237 /* non usesrc group interface, nothing to reset */ 22238 return (0); 22239 } 22240 ifindex = usesrc_cli_ill->ill_usesrc_ifindex; 22241 /* valid reset request */ 22242 reset_flg = B_TRUE; 22243 } 22244 22245 usesrc_ill = ill_lookup_on_ifindex(ifindex, isv6, q, mp, 22246 ip_process_ioctl, &err, ipst); 22247 22248 if (usesrc_ill == NULL) { 22249 return (err); 22250 } 22251 22252 /* 22253 * The usesrc_cli_ill or the usesrc_ill cannot be part of an IPMP 22254 * group nor can either of the interfaces be used for standy. So 22255 * to guarantee mutual exclusion with ip_sioctl_flags (which sets 22256 * PHYI_STANDBY) and ip_sioctl_groupname (which sets the groupname) 22257 * we need to be exclusive on the ipsq belonging to the usesrc_ill. 
22258 * We are already exlusive on this ipsq i.e ipsq corresponding to 22259 * the usesrc_cli_ill 22260 */ 22261 ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl, 22262 NEW_OP, B_TRUE); 22263 if (ipsq == NULL) { 22264 err = EINPROGRESS; 22265 /* Operation enqueued on the ipsq of the usesrc ILL */ 22266 goto done; 22267 } 22268 22269 /* Check if the usesrc_ill is used for IPMP */ 22270 us_phyint = usesrc_ill->ill_phyint; 22271 if ((us_phyint->phyint_groupname != NULL) || 22272 (us_phyint->phyint_flags & PHYI_STANDBY)) { 22273 err = EINVAL; 22274 goto done; 22275 } 22276 22277 /* 22278 * If the client is already in use as a usesrc_ill or a usesrc_ill is 22279 * already a client then return EINVAL 22280 */ 22281 if (IS_USESRC_ILL(usesrc_cli_ill) || IS_USESRC_CLI_ILL(usesrc_ill)) { 22282 err = EINVAL; 22283 goto done; 22284 } 22285 22286 /* 22287 * If the ill_usesrc_ifindex field is already set to what it needs to 22288 * be then this is a duplicate operation. 22289 */ 22290 if (!reset_flg && usesrc_cli_ill->ill_usesrc_ifindex == ifindex) { 22291 err = 0; 22292 goto done; 22293 } 22294 22295 ip1dbg(("ip_sioctl_slifusesrc: usesrc_cli_ill %s, usesrc_ill %s," 22296 " v6 = %d", usesrc_cli_ill->ill_name, usesrc_ill->ill_name, 22297 usesrc_ill->ill_isv6)); 22298 22299 /* 22300 * The next step ensures that no new ires will be created referencing 22301 * the client ill, until the ILL_CHANGING flag is cleared. Then 22302 * we go through an ire walk deleting all ire caches that reference 22303 * the client ill. New ires referencing the client ill that are added 22304 * to the ire table before the ILL_CHANGING flag is set, will be 22305 * cleaned up by the ire walk below. Attempt to add new ires referencing 22306 * the client ill while the ILL_CHANGING flag is set will be failed 22307 * during the ire_add in ire_atomic_start. 
ire_atomic_start atomically 22308 * checks (under the ill_g_usesrc_lock) that the ire being added 22309 * is not stale, i.e the ire_stq and ire_ipif are consistent and 22310 * belong to the same usesrc group. 22311 */ 22312 mutex_enter(&usesrc_cli_ill->ill_lock); 22313 usesrc_cli_ill->ill_state_flags |= ILL_CHANGING; 22314 mutex_exit(&usesrc_cli_ill->ill_lock); 22315 ill_flag_changed = B_TRUE; 22316 22317 if (ipif->ipif_isv6) 22318 ire_walk_v6(ipif_delete_cache_ire, (char *)usesrc_cli_ill, 22319 ALL_ZONES, ipst); 22320 else 22321 ire_walk_v4(ipif_delete_cache_ire, (char *)usesrc_cli_ill, 22322 ALL_ZONES, ipst); 22323 22324 /* 22325 * ill_g_usesrc_lock global lock protects the ill_usesrc_grp_next 22326 * and the ill_usesrc_ifindex fields 22327 */ 22328 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER); 22329 22330 if (reset_flg) { 22331 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 0); 22332 if (ret != 0) { 22333 err = EINVAL; 22334 } 22335 rw_exit(&ipst->ips_ill_g_usesrc_lock); 22336 goto done; 22337 } 22338 22339 /* 22340 * Four possibilities to consider: 22341 * 1. Both usesrc_ill and usesrc_cli_ill are not part of any usesrc grp 22342 * 2. usesrc_ill is part of a group but usesrc_cli_ill isn't 22343 * 3. usesrc_cli_ill is part of a group but usesrc_ill isn't 22344 * 4. 
Both are part of their respective usesrc groups 22345 */ 22346 if ((usesrc_ill->ill_usesrc_grp_next == NULL) && 22347 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) { 22348 ASSERT(usesrc_ill->ill_usesrc_ifindex == 0); 22349 usesrc_cli_ill->ill_usesrc_ifindex = ifindex; 22350 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill; 22351 usesrc_cli_ill->ill_usesrc_grp_next = usesrc_ill; 22352 } else if ((usesrc_ill->ill_usesrc_grp_next != NULL) && 22353 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) { 22354 usesrc_cli_ill->ill_usesrc_ifindex = ifindex; 22355 /* Insert at head of list */ 22356 usesrc_cli_ill->ill_usesrc_grp_next = 22357 usesrc_ill->ill_usesrc_grp_next; 22358 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill; 22359 } else { 22360 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 22361 ifindex); 22362 if (ret != 0) 22363 err = EINVAL; 22364 } 22365 rw_exit(&ipst->ips_ill_g_usesrc_lock); 22366 22367 done: 22368 if (ill_flag_changed) { 22369 mutex_enter(&usesrc_cli_ill->ill_lock); 22370 usesrc_cli_ill->ill_state_flags &= ~ILL_CHANGING; 22371 mutex_exit(&usesrc_cli_ill->ill_lock); 22372 } 22373 if (ipsq != NULL) 22374 ipsq_exit(ipsq, B_TRUE, B_TRUE); 22375 /* The refrele on the lifr_name ipif is done by ip_process_ioctl */ 22376 ill_refrele(usesrc_ill); 22377 return (err); 22378 } 22379 22380 /* 22381 * comparison function used by avl. 22382 */ 22383 static int 22384 ill_phyint_compare_index(const void *index_ptr, const void *phyip) 22385 { 22386 22387 uint_t index; 22388 22389 ASSERT(phyip != NULL && index_ptr != NULL); 22390 22391 index = *((uint_t *)index_ptr); 22392 /* 22393 * let the phyint with the lowest index be on top. 22394 */ 22395 if (((phyint_t *)phyip)->phyint_ifindex < index) 22396 return (1); 22397 if (((phyint_t *)phyip)->phyint_ifindex > index) 22398 return (-1); 22399 return (0); 22400 } 22401 22402 /* 22403 * comparison function used by avl. 
22404 */ 22405 static int 22406 ill_phyint_compare_name(const void *name_ptr, const void *phyip) 22407 { 22408 ill_t *ill; 22409 int res = 0; 22410 22411 ASSERT(phyip != NULL && name_ptr != NULL); 22412 22413 if (((phyint_t *)phyip)->phyint_illv4) 22414 ill = ((phyint_t *)phyip)->phyint_illv4; 22415 else 22416 ill = ((phyint_t *)phyip)->phyint_illv6; 22417 ASSERT(ill != NULL); 22418 22419 res = strcmp(ill->ill_name, (char *)name_ptr); 22420 if (res > 0) 22421 return (1); 22422 else if (res < 0) 22423 return (-1); 22424 return (0); 22425 } 22426 /* 22427 * This function is called from ill_delete when the ill is being 22428 * unplumbed. We remove the reference from the phyint and we also 22429 * free the phyint when there are no more references to it. 22430 */ 22431 static void 22432 ill_phyint_free(ill_t *ill) 22433 { 22434 phyint_t *phyi; 22435 phyint_t *next_phyint; 22436 ipsq_t *cur_ipsq; 22437 ip_stack_t *ipst = ill->ill_ipst; 22438 22439 ASSERT(ill->ill_phyint != NULL); 22440 22441 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 22442 phyi = ill->ill_phyint; 22443 ill->ill_phyint = NULL; 22444 /* 22445 * ill_init allocates a phyint always to store the copy 22446 * of flags relevant to phyint. At that point in time, we could 22447 * not assign the name and hence phyint_illv4/v6 could not be 22448 * initialized. Later in ipif_set_values, we assign the name to 22449 * the ill, at which point in time we assign phyint_illv4/v6. 22450 * Thus we don't rely on phyint_illv6 to be initialized always. 22451 */ 22452 if (ill->ill_flags & ILLF_IPV6) { 22453 phyi->phyint_illv6 = NULL; 22454 } else { 22455 phyi->phyint_illv4 = NULL; 22456 } 22457 /* 22458 * ipif_down removes it from the group when the last ipif goes 22459 * down. 22460 */ 22461 ASSERT(ill->ill_group == NULL); 22462 22463 if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) 22464 return; 22465 22466 /* 22467 * Make sure this phyint was put in the list. 
22468 */ 22469 if (phyi->phyint_ifindex > 0) { 22470 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 22471 phyi); 22472 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 22473 phyi); 22474 } 22475 /* 22476 * remove phyint from the ipsq list. 22477 */ 22478 cur_ipsq = phyi->phyint_ipsq; 22479 if (phyi == cur_ipsq->ipsq_phyint_list) { 22480 cur_ipsq->ipsq_phyint_list = phyi->phyint_ipsq_next; 22481 } else { 22482 next_phyint = cur_ipsq->ipsq_phyint_list; 22483 while (next_phyint != NULL) { 22484 if (next_phyint->phyint_ipsq_next == phyi) { 22485 next_phyint->phyint_ipsq_next = 22486 phyi->phyint_ipsq_next; 22487 break; 22488 } 22489 next_phyint = next_phyint->phyint_ipsq_next; 22490 } 22491 ASSERT(next_phyint != NULL); 22492 } 22493 IPSQ_DEC_REF(cur_ipsq, ipst); 22494 22495 if (phyi->phyint_groupname_len != 0) { 22496 ASSERT(phyi->phyint_groupname != NULL); 22497 mi_free(phyi->phyint_groupname); 22498 } 22499 mi_free(phyi); 22500 } 22501 22502 /* 22503 * Attach the ill to the phyint structure which can be shared by both 22504 * IPv4 and IPv6 ill. ill_init allocates a phyint to just hold flags. This 22505 * function is called from ipif_set_values and ill_lookup_on_name (for 22506 * loopback) where we know the name of the ill. We lookup the ill and if 22507 * there is one present already with the name use that phyint. Otherwise 22508 * reuse the one allocated by ill_init. 
22509 */ 22510 static void 22511 ill_phyint_reinit(ill_t *ill) 22512 { 22513 boolean_t isv6 = ill->ill_isv6; 22514 phyint_t *phyi_old; 22515 phyint_t *phyi; 22516 avl_index_t where = 0; 22517 ill_t *ill_other = NULL; 22518 ipsq_t *ipsq; 22519 ip_stack_t *ipst = ill->ill_ipst; 22520 22521 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 22522 22523 phyi_old = ill->ill_phyint; 22524 ASSERT(isv6 || (phyi_old->phyint_illv4 == ill && 22525 phyi_old->phyint_illv6 == NULL)); 22526 ASSERT(!isv6 || (phyi_old->phyint_illv6 == ill && 22527 phyi_old->phyint_illv4 == NULL)); 22528 ASSERT(phyi_old->phyint_ifindex == 0); 22529 22530 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 22531 ill->ill_name, &where); 22532 22533 /* 22534 * 1. We grabbed the ill_g_lock before inserting this ill into 22535 * the global list of ills. So no other thread could have located 22536 * this ill and hence the ipsq of this ill is guaranteed to be empty. 22537 * 2. Now locate the other protocol instance of this ill. 22538 * 3. Now grab both ill locks in the right order, and the phyint lock of 22539 * the new ipsq. Holding ill locks + ill_g_lock ensures that the ipsq 22540 * of neither ill can change. 22541 * 4. Merge the phyint and thus the ipsq as well of this ill onto the 22542 * other ill. 22543 * 5. Release all locks. 22544 */ 22545 22546 /* 22547 * Look for IPv4 if we are initializing IPv6 or look for IPv6 if 22548 * we are initializing IPv4. 22549 */ 22550 if (phyi != NULL) { 22551 ill_other = (isv6) ? phyi->phyint_illv4 : 22552 phyi->phyint_illv6; 22553 ASSERT(ill_other->ill_phyint != NULL); 22554 ASSERT((isv6 && !ill_other->ill_isv6) || 22555 (!isv6 && ill_other->ill_isv6)); 22556 GRAB_ILL_LOCKS(ill, ill_other); 22557 /* 22558 * We are potentially throwing away phyint_flags which 22559 * could be different from the one that we obtain from 22560 * ill_other->ill_phyint. But it is okay as we are assuming 22561 * that the state maintained within IP is correct. 
22562 */ 22563 mutex_enter(&phyi->phyint_lock); 22564 if (isv6) { 22565 ASSERT(phyi->phyint_illv6 == NULL); 22566 phyi->phyint_illv6 = ill; 22567 } else { 22568 ASSERT(phyi->phyint_illv4 == NULL); 22569 phyi->phyint_illv4 = ill; 22570 } 22571 /* 22572 * This is a new ill, currently undergoing SLIFNAME 22573 * So we could not have joined an IPMP group until now. 22574 */ 22575 ASSERT(phyi_old->phyint_ipsq_next == NULL && 22576 phyi_old->phyint_groupname == NULL); 22577 22578 /* 22579 * This phyi_old is going away. Decref ipsq_refs and 22580 * assert it is zero. The ipsq itself will be freed in 22581 * ipsq_exit 22582 */ 22583 ipsq = phyi_old->phyint_ipsq; 22584 IPSQ_DEC_REF(ipsq, ipst); 22585 ASSERT(ipsq->ipsq_refs == 0); 22586 /* Get the singleton phyint out of the ipsq list */ 22587 ASSERT(phyi_old->phyint_ipsq_next == NULL); 22588 ipsq->ipsq_phyint_list = NULL; 22589 phyi_old->phyint_illv4 = NULL; 22590 phyi_old->phyint_illv6 = NULL; 22591 mi_free(phyi_old); 22592 } else { 22593 mutex_enter(&ill->ill_lock); 22594 /* 22595 * We don't need to acquire any lock, since 22596 * the ill is not yet visible globally and we 22597 * have not yet released the ill_g_lock. 22598 */ 22599 phyi = phyi_old; 22600 mutex_enter(&phyi->phyint_lock); 22601 /* XXX We need a recovery strategy here. */ 22602 if (!phyint_assign_ifindex(phyi, ipst)) 22603 cmn_err(CE_PANIC, "phyint_assign_ifindex() failed"); 22604 22605 /* No IPMP group yet, thus the hook uses the ifindex */ 22606 phyi->phyint_hook_ifindex = phyi->phyint_ifindex; 22607 22608 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 22609 (void *)phyi, where); 22610 22611 (void) avl_find(&ipst->ips_phyint_g_list-> 22612 phyint_list_avl_by_index, 22613 &phyi->phyint_ifindex, &where); 22614 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 22615 (void *)phyi, where); 22616 } 22617 22618 /* 22619 * Reassigning ill_phyint automatically reassigns the ipsq also. 
22620 * pending mp is not affected because that is per ill basis. 22621 */ 22622 ill->ill_phyint = phyi; 22623 22624 /* 22625 * Keep the index on ipif_orig_index to be used by FAILOVER. 22626 * We do this here as when the first ipif was allocated, 22627 * ipif_allocate does not know the right interface index. 22628 */ 22629 22630 ill->ill_ipif->ipif_orig_ifindex = ill->ill_phyint->phyint_ifindex; 22631 /* 22632 * Now that the phyint's ifindex has been assigned, complete the 22633 * remaining 22634 */ 22635 22636 ill->ill_ip_mib->ipIfStatsIfIndex = ill->ill_phyint->phyint_ifindex; 22637 if (ill->ill_isv6) { 22638 ill->ill_icmp6_mib->ipv6IfIcmpIfIndex = 22639 ill->ill_phyint->phyint_ifindex; 22640 ill->ill_mcast_type = ipst->ips_mld_max_version; 22641 } else { 22642 ill->ill_mcast_type = ipst->ips_igmp_max_version; 22643 } 22644 22645 /* 22646 * Generate an event within the hooks framework to indicate that 22647 * a new interface has just been added to IP. For this event to 22648 * be generated, the network interface must, at least, have an 22649 * ifindex assigned to it. 22650 * 22651 * This needs to be run inside the ill_g_lock perimeter to ensure 22652 * that the ordering of delivered events to listeners matches the 22653 * order of them in the kernel. 22654 * 22655 * This function could be called from ill_lookup_on_name. In that case 22656 * the interface is loopback "lo", which will not generate a NIC event. 22657 */ 22658 if (ill->ill_name_length <= 2 || 22659 ill->ill_name[0] != 'l' || ill->ill_name[1] != 'o') { 22660 /* 22661 * Generate nic plumb event for ill_name even if 22662 * ipmp_hook_emulation is set. That avoids generating events 22663 * for the ill_names should ipmp_hook_emulation be turned on 22664 * later. 22665 */ 22666 ill_nic_info_plumb(ill, B_FALSE); 22667 } 22668 RELEASE_ILL_LOCKS(ill, ill_other); 22669 mutex_exit(&phyi->phyint_lock); 22670 } 22671 22672 /* 22673 * Allocate a NE_PLUMB nic info event and store in the ill. 
22674 * If 'group' is set we do it for the group name, otherwise the ill name. 22675 * It will be sent when we leave the ipsq. 22676 */ 22677 void 22678 ill_nic_info_plumb(ill_t *ill, boolean_t group) 22679 { 22680 phyint_t *phyi = ill->ill_phyint; 22681 ip_stack_t *ipst = ill->ill_ipst; 22682 hook_nic_event_t *info; 22683 char *name; 22684 int namelen; 22685 22686 ASSERT(MUTEX_HELD(&ill->ill_lock)); 22687 22688 if ((info = ill->ill_nic_event_info) != NULL) { 22689 ip2dbg(("ill_nic_info_plumb: unexpected nic event %d " 22690 "attached for %s\n", info->hne_event, 22691 ill->ill_name)); 22692 if (info->hne_data != NULL) 22693 kmem_free(info->hne_data, info->hne_datalen); 22694 kmem_free(info, sizeof (hook_nic_event_t)); 22695 ill->ill_nic_event_info = NULL; 22696 } 22697 22698 info = kmem_alloc(sizeof (hook_nic_event_t), KM_NOSLEEP); 22699 if (info == NULL) { 22700 ip2dbg(("ill_nic_info_plumb: could not attach PLUMB nic " 22701 "event information for %s (ENOMEM)\n", 22702 ill->ill_name)); 22703 return; 22704 } 22705 22706 if (group) { 22707 ASSERT(phyi->phyint_groupname_len != 0); 22708 namelen = phyi->phyint_groupname_len; 22709 name = phyi->phyint_groupname; 22710 } else { 22711 namelen = ill->ill_name_length; 22712 name = ill->ill_name; 22713 } 22714 22715 info->hne_nic = phyi->phyint_hook_ifindex; 22716 info->hne_lif = 0; 22717 info->hne_event = NE_PLUMB; 22718 info->hne_family = ill->ill_isv6 ? 
22719 ipst->ips_ipv6_net_data : ipst->ips_ipv4_net_data; 22720 22721 info->hne_data = kmem_alloc(namelen, KM_NOSLEEP); 22722 if (info->hne_data != NULL) { 22723 info->hne_datalen = namelen; 22724 bcopy(name, info->hne_data, info->hne_datalen); 22725 } else { 22726 ip2dbg(("ill_nic_info_plumb: could not attach " 22727 "name information for PLUMB nic event " 22728 "of %s (ENOMEM)\n", name)); 22729 kmem_free(info, sizeof (hook_nic_event_t)); 22730 info = NULL; 22731 } 22732 ill->ill_nic_event_info = info; 22733 } 22734 22735 /* 22736 * Unhook the nic event message from the ill and enqueue it 22737 * into the nic event taskq. 22738 */ 22739 void 22740 ill_nic_info_dispatch(ill_t *ill) 22741 { 22742 hook_nic_event_t *info; 22743 22744 ASSERT(MUTEX_HELD(&ill->ill_lock)); 22745 22746 if ((info = ill->ill_nic_event_info) != NULL) { 22747 if (ddi_taskq_dispatch(eventq_queue_nic, 22748 ip_ne_queue_func, info, DDI_SLEEP) == DDI_FAILURE) { 22749 ip2dbg(("ill_nic_info_dispatch: " 22750 "ddi_taskq_dispatch failed\n")); 22751 if (info->hne_data != NULL) 22752 kmem_free(info->hne_data, info->hne_datalen); 22753 kmem_free(info, sizeof (hook_nic_event_t)); 22754 } 22755 ill->ill_nic_event_info = NULL; 22756 } 22757 } 22758 22759 /* 22760 * Notify any downstream modules of the name of this interface. 22761 * An M_IOCTL is used even though we don't expect a successful reply. 22762 * Any reply message from the driver (presumably an M_IOCNAK) will 22763 * eventually get discarded somewhere upstream. The message format is 22764 * simply an SIOCSLIFNAME ioctl just as might be sent from ifconfig 22765 * to IP. 
22766 */ 22767 static void 22768 ip_ifname_notify(ill_t *ill, queue_t *q) 22769 { 22770 mblk_t *mp1, *mp2; 22771 struct iocblk *iocp; 22772 struct lifreq *lifr; 22773 22774 mp1 = mkiocb(SIOCSLIFNAME); 22775 if (mp1 == NULL) 22776 return; 22777 mp2 = allocb(sizeof (struct lifreq), BPRI_HI); 22778 if (mp2 == NULL) { 22779 freeb(mp1); 22780 return; 22781 } 22782 22783 mp1->b_cont = mp2; 22784 iocp = (struct iocblk *)mp1->b_rptr; 22785 iocp->ioc_count = sizeof (struct lifreq); 22786 22787 lifr = (struct lifreq *)mp2->b_rptr; 22788 mp2->b_wptr += sizeof (struct lifreq); 22789 bzero(lifr, sizeof (struct lifreq)); 22790 22791 (void) strncpy(lifr->lifr_name, ill->ill_name, LIFNAMSIZ); 22792 lifr->lifr_ppa = ill->ill_ppa; 22793 lifr->lifr_flags = (ill->ill_flags & (ILLF_IPV4|ILLF_IPV6)); 22794 22795 putnext(q, mp1); 22796 } 22797 22798 static int 22799 ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) 22800 { 22801 int err; 22802 ip_stack_t *ipst = ill->ill_ipst; 22803 22804 /* Set the obsolete NDD per-interface forwarding name. */ 22805 err = ill_set_ndd_name(ill); 22806 if (err != 0) { 22807 cmn_err(CE_WARN, "ipif_set_values: ill_set_ndd_name (%d)\n", 22808 err); 22809 } 22810 22811 /* Tell downstream modules where they are. */ 22812 ip_ifname_notify(ill, q); 22813 22814 /* 22815 * ill_dl_phys returns EINPROGRESS in the usual case. 22816 * Error cases are ENOMEM ... 22817 */ 22818 err = ill_dl_phys(ill, ipif, mp, q); 22819 22820 /* 22821 * If there is no IRE expiration timer running, get one started. 22822 * igmp and mld timers will be triggered by the first multicast 22823 */ 22824 if (ipst->ips_ip_ire_expire_id == 0) { 22825 /* 22826 * acquire the lock and check again. 
22827 */ 22828 mutex_enter(&ipst->ips_ip_trash_timer_lock); 22829 if (ipst->ips_ip_ire_expire_id == 0) { 22830 ipst->ips_ip_ire_expire_id = timeout( 22831 ip_trash_timer_expire, ipst, 22832 MSEC_TO_TICK(ipst->ips_ip_timer_interval)); 22833 } 22834 mutex_exit(&ipst->ips_ip_trash_timer_lock); 22835 } 22836 22837 if (ill->ill_isv6) { 22838 mutex_enter(&ipst->ips_mld_slowtimeout_lock); 22839 if (ipst->ips_mld_slowtimeout_id == 0) { 22840 ipst->ips_mld_slowtimeout_id = timeout(mld_slowtimo, 22841 (void *)ipst, 22842 MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL)); 22843 } 22844 mutex_exit(&ipst->ips_mld_slowtimeout_lock); 22845 } else { 22846 mutex_enter(&ipst->ips_igmp_slowtimeout_lock); 22847 if (ipst->ips_igmp_slowtimeout_id == 0) { 22848 ipst->ips_igmp_slowtimeout_id = timeout(igmp_slowtimo, 22849 (void *)ipst, 22850 MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL)); 22851 } 22852 mutex_exit(&ipst->ips_igmp_slowtimeout_lock); 22853 } 22854 22855 return (err); 22856 } 22857 22858 /* 22859 * Common routine for ppa and ifname setting. Should be called exclusive. 22860 * 22861 * Returns EINPROGRESS when mp has been consumed by queueing it on 22862 * ill_pending_mp and the ioctl will complete in ip_rput. 22863 * 22864 * NOTE : If ppa is UNIT_MAX, we assign the next valid ppa and return 22865 * the new name and new ppa in lifr_name and lifr_ppa respectively. 22866 * For SLIFNAME, we pass these values back to the userland. 
22867 */ 22868 static int 22869 ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr) 22870 { 22871 ill_t *ill; 22872 ipif_t *ipif; 22873 ipsq_t *ipsq; 22874 char *ppa_ptr; 22875 char *old_ptr; 22876 char old_char; 22877 int error; 22878 ip_stack_t *ipst; 22879 22880 ip1dbg(("ipif_set_values: interface %s\n", interf_name)); 22881 ASSERT(q->q_next != NULL); 22882 ASSERT(interf_name != NULL); 22883 22884 ill = (ill_t *)q->q_ptr; 22885 ipst = ill->ill_ipst; 22886 22887 ASSERT(ill->ill_ipst != NULL); 22888 ASSERT(ill->ill_name[0] == '\0'); 22889 ASSERT(IAM_WRITER_ILL(ill)); 22890 ASSERT((mi_strlen(interf_name) + 1) <= LIFNAMSIZ); 22891 ASSERT(ill->ill_ppa == UINT_MAX); 22892 22893 /* The ppa is sent down by ifconfig or is chosen */ 22894 if ((ppa_ptr = ill_get_ppa_ptr(interf_name)) == NULL) { 22895 return (EINVAL); 22896 } 22897 22898 /* 22899 * make sure ppa passed in is same as ppa in the name. 22900 * This check is not made when ppa == UINT_MAX in that case ppa 22901 * in the name could be anything. System will choose a ppa and 22902 * update new_ppa_ptr and inter_name to contain the choosen ppa. 22903 */ 22904 if (*new_ppa_ptr != UINT_MAX) { 22905 /* stoi changes the pointer */ 22906 old_ptr = ppa_ptr; 22907 /* 22908 * ifconfig passed in 0 for the ppa for DLPI 1 style devices 22909 * (they don't have an externally visible ppa). We assign one 22910 * here so that we can manage the interface. Note that in 22911 * the past this value was always 0 for DLPI 1 drivers. 22912 */ 22913 if (*new_ppa_ptr == 0) 22914 *new_ppa_ptr = stoi(&old_ptr); 22915 else if (*new_ppa_ptr != (uint_t)stoi(&old_ptr)) 22916 return (EINVAL); 22917 } 22918 /* 22919 * terminate string before ppa 22920 * save char at that location. 
22921 */ 22922 old_char = ppa_ptr[0]; 22923 ppa_ptr[0] = '\0'; 22924 22925 ill->ill_ppa = *new_ppa_ptr; 22926 /* 22927 * Finish as much work now as possible before calling ill_glist_insert 22928 * which makes the ill globally visible and also merges it with the 22929 * other protocol instance of this phyint. The remaining work is 22930 * done after entering the ipsq which may happen sometime later. 22931 * ill_set_ndd_name occurs after the ill has been made globally visible. 22932 */ 22933 ipif = ill->ill_ipif; 22934 22935 /* We didn't do this when we allocated ipif in ip_ll_subnet_defaults */ 22936 ipif_assign_seqid(ipif); 22937 22938 if (!(ill->ill_flags & (ILLF_IPV4|ILLF_IPV6))) 22939 ill->ill_flags |= ILLF_IPV4; 22940 22941 ASSERT(ipif->ipif_next == NULL); /* Only one ipif on ill */ 22942 ASSERT((ipif->ipif_flags & IPIF_UP) == 0); 22943 22944 if (ill->ill_flags & ILLF_IPV6) { 22945 22946 ill->ill_isv6 = B_TRUE; 22947 if (ill->ill_rq != NULL) { 22948 ill->ill_rq->q_qinfo = &rinit_ipv6; 22949 ill->ill_wq->q_qinfo = &winit_ipv6; 22950 } 22951 22952 /* Keep the !IN6_IS_ADDR_V4MAPPED assertions happy */ 22953 ipif->ipif_v6lcl_addr = ipv6_all_zeros; 22954 ipif->ipif_v6src_addr = ipv6_all_zeros; 22955 ipif->ipif_v6subnet = ipv6_all_zeros; 22956 ipif->ipif_v6net_mask = ipv6_all_zeros; 22957 ipif->ipif_v6brd_addr = ipv6_all_zeros; 22958 ipif->ipif_v6pp_dst_addr = ipv6_all_zeros; 22959 /* 22960 * point-to-point or Non-mulicast capable 22961 * interfaces won't do NUD unless explicitly 22962 * configured to do so. 22963 */ 22964 if (ipif->ipif_flags & IPIF_POINTOPOINT || 22965 !(ill->ill_flags & ILLF_MULTICAST)) { 22966 ill->ill_flags |= ILLF_NONUD; 22967 } 22968 /* Make sure IPv4 specific flag is not set on IPv6 if */ 22969 if (ill->ill_flags & ILLF_NOARP) { 22970 /* 22971 * Note: xresolv interfaces will eventually need 22972 * NOARP set here as well, but that will require 22973 * those external resolvers to have some 22974 * knowledge of that flag and act appropriately. 
22975 * Not to be changed at present. 22976 */ 22977 ill->ill_flags &= ~ILLF_NOARP; 22978 } 22979 /* 22980 * Set the ILLF_ROUTER flag according to the global 22981 * IPv6 forwarding policy. 22982 */ 22983 if (ipst->ips_ipv6_forward != 0) 22984 ill->ill_flags |= ILLF_ROUTER; 22985 } else if (ill->ill_flags & ILLF_IPV4) { 22986 ill->ill_isv6 = B_FALSE; 22987 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6lcl_addr); 22988 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6src_addr); 22989 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6subnet); 22990 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6net_mask); 22991 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6brd_addr); 22992 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6pp_dst_addr); 22993 /* 22994 * Set the ILLF_ROUTER flag according to the global 22995 * IPv4 forwarding policy. 22996 */ 22997 if (ipst->ips_ip_g_forward != 0) 22998 ill->ill_flags |= ILLF_ROUTER; 22999 } 23000 23001 ASSERT(ill->ill_phyint != NULL); 23002 23003 /* 23004 * The ipIfStatsIfindex and ipv6IfIcmpIfIndex assignments will 23005 * be completed in ill_glist_insert -> ill_phyint_reinit 23006 */ 23007 if (!ill_allocate_mibs(ill)) 23008 return (ENOMEM); 23009 23010 /* 23011 * Pick a default sap until we get the DL_INFO_ACK back from 23012 * the driver. 23013 */ 23014 if (ill->ill_sap == 0) { 23015 if (ill->ill_isv6) 23016 ill->ill_sap = IP6_DL_SAP; 23017 else 23018 ill->ill_sap = IP_DL_SAP; 23019 } 23020 23021 ill->ill_ifname_pending = 1; 23022 ill->ill_ifname_pending_err = 0; 23023 23024 ill_refhold(ill); 23025 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 23026 if ((error = ill_glist_insert(ill, interf_name, 23027 (ill->ill_flags & ILLF_IPV6) == ILLF_IPV6)) > 0) { 23028 ill->ill_ppa = UINT_MAX; 23029 ill->ill_name[0] = '\0'; 23030 /* 23031 * undo null termination done above. 
23032 */ 23033 ppa_ptr[0] = old_char; 23034 rw_exit(&ipst->ips_ill_g_lock); 23035 ill_refrele(ill); 23036 return (error); 23037 } 23038 23039 ASSERT(ill->ill_name_length <= LIFNAMSIZ); 23040 23041 /* 23042 * When we return the buffer pointed to by interf_name should contain 23043 * the same name as in ill_name. 23044 * If a ppa was choosen by the system (ppa passed in was UINT_MAX) 23045 * the buffer pointed to by new_ppa_ptr would not contain the right ppa 23046 * so copy full name and update the ppa ptr. 23047 * When ppa passed in != UINT_MAX all values are correct just undo 23048 * null termination, this saves a bcopy. 23049 */ 23050 if (*new_ppa_ptr == UINT_MAX) { 23051 bcopy(ill->ill_name, interf_name, ill->ill_name_length); 23052 *new_ppa_ptr = ill->ill_ppa; 23053 } else { 23054 /* 23055 * undo null termination done above. 23056 */ 23057 ppa_ptr[0] = old_char; 23058 } 23059 23060 /* Let SCTP know about this ILL */ 23061 sctp_update_ill(ill, SCTP_ILL_INSERT); 23062 23063 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_reprocess_ioctl, NEW_OP, 23064 B_TRUE); 23065 23066 rw_exit(&ipst->ips_ill_g_lock); 23067 ill_refrele(ill); 23068 if (ipsq == NULL) 23069 return (EINPROGRESS); 23070 23071 /* 23072 * If ill_phyint_reinit() changed our ipsq, then start on the new ipsq. 23073 */ 23074 if (ipsq->ipsq_current_ipif == NULL) 23075 ipsq_current_start(ipsq, ipif, SIOCSLIFNAME); 23076 else 23077 ASSERT(ipsq->ipsq_current_ipif == ipif); 23078 23079 error = ipif_set_values_tail(ill, ipif, mp, q); 23080 ipsq_exit(ipsq, B_TRUE, B_TRUE); 23081 if (error != 0 && error != EINPROGRESS) { 23082 /* 23083 * restore previous values 23084 */ 23085 ill->ill_isv6 = B_FALSE; 23086 } 23087 return (error); 23088 } 23089 23090 23091 void 23092 ipif_init(ip_stack_t *ipst) 23093 { 23094 hrtime_t hrt; 23095 int i; 23096 23097 /* 23098 * Can't call drv_getparm here as it is too early in the boot. 
23099 * As we use ipif_src_random just for picking a different 23100 * source address everytime, this need not be really random. 23101 */ 23102 hrt = gethrtime(); 23103 ipst->ips_ipif_src_random = 23104 ((hrt >> 32) & 0xffffffff) * (hrt & 0xffffffff); 23105 23106 for (i = 0; i < MAX_G_HEADS; i++) { 23107 ipst->ips_ill_g_heads[i].ill_g_list_head = 23108 (ill_if_t *)&ipst->ips_ill_g_heads[i]; 23109 ipst->ips_ill_g_heads[i].ill_g_list_tail = 23110 (ill_if_t *)&ipst->ips_ill_g_heads[i]; 23111 } 23112 23113 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 23114 ill_phyint_compare_index, 23115 sizeof (phyint_t), 23116 offsetof(struct phyint, phyint_avl_by_index)); 23117 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 23118 ill_phyint_compare_name, 23119 sizeof (phyint_t), 23120 offsetof(struct phyint, phyint_avl_by_name)); 23121 } 23122 23123 /* 23124 * Lookup the ipif corresponding to the onlink destination address. For 23125 * point-to-point interfaces, it matches with remote endpoint destination 23126 * address. For point-to-multipoint interfaces it only tries to match the 23127 * destination with the interface's subnet address. The longest, most specific 23128 * match is found to take care of such rare network configurations like - 23129 * le0: 129.146.1.1/16 23130 * le1: 129.146.2.2/24 23131 * It is used only by SO_DONTROUTE at the moment. 
23132 */ 23133 ipif_t * 23134 ipif_lookup_onlink_addr(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst) 23135 { 23136 ipif_t *ipif, *best_ipif; 23137 ill_t *ill; 23138 ill_walk_context_t ctx; 23139 23140 ASSERT(zoneid != ALL_ZONES); 23141 best_ipif = NULL; 23142 23143 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 23144 ill = ILL_START_WALK_V4(&ctx, ipst); 23145 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 23146 mutex_enter(&ill->ill_lock); 23147 for (ipif = ill->ill_ipif; ipif != NULL; 23148 ipif = ipif->ipif_next) { 23149 if (!IPIF_CAN_LOOKUP(ipif)) 23150 continue; 23151 if (ipif->ipif_zoneid != zoneid && 23152 ipif->ipif_zoneid != ALL_ZONES) 23153 continue; 23154 /* 23155 * Point-to-point case. Look for exact match with 23156 * destination address. 23157 */ 23158 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 23159 if (ipif->ipif_pp_dst_addr == addr) { 23160 ipif_refhold_locked(ipif); 23161 mutex_exit(&ill->ill_lock); 23162 rw_exit(&ipst->ips_ill_g_lock); 23163 if (best_ipif != NULL) 23164 ipif_refrele(best_ipif); 23165 return (ipif); 23166 } 23167 } else if (ipif->ipif_subnet == (addr & 23168 ipif->ipif_net_mask)) { 23169 /* 23170 * Point-to-multipoint case. Looping through to 23171 * find the most specific match. If there are 23172 * multiple best match ipif's then prefer ipif's 23173 * that are UP. If there is only one best match 23174 * ipif and it is DOWN we must still return it. 
23175 */ 23176 if ((best_ipif == NULL) || 23177 (ipif->ipif_net_mask > 23178 best_ipif->ipif_net_mask) || 23179 ((ipif->ipif_net_mask == 23180 best_ipif->ipif_net_mask) && 23181 ((ipif->ipif_flags & IPIF_UP) && 23182 (!(best_ipif->ipif_flags & IPIF_UP))))) { 23183 ipif_refhold_locked(ipif); 23184 mutex_exit(&ill->ill_lock); 23185 rw_exit(&ipst->ips_ill_g_lock); 23186 if (best_ipif != NULL) 23187 ipif_refrele(best_ipif); 23188 best_ipif = ipif; 23189 rw_enter(&ipst->ips_ill_g_lock, 23190 RW_READER); 23191 mutex_enter(&ill->ill_lock); 23192 } 23193 } 23194 } 23195 mutex_exit(&ill->ill_lock); 23196 } 23197 rw_exit(&ipst->ips_ill_g_lock); 23198 return (best_ipif); 23199 } 23200 23201 23202 /* 23203 * Save enough information so that we can recreate the IRE if 23204 * the interface goes down and then up. 23205 */ 23206 static void 23207 ipif_save_ire(ipif_t *ipif, ire_t *ire) 23208 { 23209 mblk_t *save_mp; 23210 23211 save_mp = allocb(sizeof (ifrt_t), BPRI_MED); 23212 if (save_mp != NULL) { 23213 ifrt_t *ifrt; 23214 23215 save_mp->b_wptr += sizeof (ifrt_t); 23216 ifrt = (ifrt_t *)save_mp->b_rptr; 23217 bzero(ifrt, sizeof (ifrt_t)); 23218 ifrt->ifrt_type = ire->ire_type; 23219 ifrt->ifrt_addr = ire->ire_addr; 23220 ifrt->ifrt_gateway_addr = ire->ire_gateway_addr; 23221 ifrt->ifrt_src_addr = ire->ire_src_addr; 23222 ifrt->ifrt_mask = ire->ire_mask; 23223 ifrt->ifrt_flags = ire->ire_flags; 23224 ifrt->ifrt_max_frag = ire->ire_max_frag; 23225 mutex_enter(&ipif->ipif_saved_ire_lock); 23226 save_mp->b_cont = ipif->ipif_saved_ire_mp; 23227 ipif->ipif_saved_ire_mp = save_mp; 23228 ipif->ipif_saved_ire_cnt++; 23229 mutex_exit(&ipif->ipif_saved_ire_lock); 23230 } 23231 } 23232 23233 23234 static void 23235 ipif_remove_ire(ipif_t *ipif, ire_t *ire) 23236 { 23237 mblk_t **mpp; 23238 mblk_t *mp; 23239 ifrt_t *ifrt; 23240 23241 /* Remove from ipif_saved_ire_mp list if it is there */ 23242 mutex_enter(&ipif->ipif_saved_ire_lock); 23243 for (mpp = &ipif->ipif_saved_ire_mp; *mpp != NULL; 
23244 mpp = &(*mpp)->b_cont) { 23245 /* 23246 * On a given ipif, the triple of address, gateway and 23247 * mask is unique for each saved IRE (in the case of 23248 * ordinary interface routes, the gateway address is 23249 * all-zeroes). 23250 */ 23251 mp = *mpp; 23252 ifrt = (ifrt_t *)mp->b_rptr; 23253 if (ifrt->ifrt_addr == ire->ire_addr && 23254 ifrt->ifrt_gateway_addr == ire->ire_gateway_addr && 23255 ifrt->ifrt_mask == ire->ire_mask) { 23256 *mpp = mp->b_cont; 23257 ipif->ipif_saved_ire_cnt--; 23258 freeb(mp); 23259 break; 23260 } 23261 } 23262 mutex_exit(&ipif->ipif_saved_ire_lock); 23263 } 23264 23265 23266 /* 23267 * IP multirouting broadcast routes handling 23268 * Append CGTP broadcast IREs to regular ones created 23269 * at ifconfig time. 23270 */ 23271 static void 23272 ip_cgtp_bcast_add(ire_t *ire, ire_t *ire_dst, ip_stack_t *ipst) 23273 { 23274 ire_t *ire_prim; 23275 23276 ASSERT(ire != NULL); 23277 ASSERT(ire_dst != NULL); 23278 23279 ire_prim = ire_ctable_lookup(ire->ire_gateway_addr, 0, 23280 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 23281 if (ire_prim != NULL) { 23282 /* 23283 * We are in the special case of broadcasts for 23284 * CGTP. We add an IRE_BROADCAST that holds 23285 * the RTF_MULTIRT flag, the destination 23286 * address of ire_dst and the low level 23287 * info of ire_prim. In other words, CGTP 23288 * broadcast is added to the redundant ipif. 
23289 */ 23290 ipif_t *ipif_prim; 23291 ire_t *bcast_ire; 23292 23293 ipif_prim = ire_prim->ire_ipif; 23294 23295 ip2dbg(("ip_cgtp_filter_bcast_add: " 23296 "ire_dst %p, ire_prim %p, ipif_prim %p\n", 23297 (void *)ire_dst, (void *)ire_prim, 23298 (void *)ipif_prim)); 23299 23300 bcast_ire = ire_create( 23301 (uchar_t *)&ire->ire_addr, 23302 (uchar_t *)&ip_g_all_ones, 23303 (uchar_t *)&ire_dst->ire_src_addr, 23304 (uchar_t *)&ire->ire_gateway_addr, 23305 &ipif_prim->ipif_mtu, 23306 NULL, 23307 ipif_prim->ipif_rq, 23308 ipif_prim->ipif_wq, 23309 IRE_BROADCAST, 23310 ipif_prim, 23311 0, 23312 0, 23313 0, 23314 ire->ire_flags, 23315 &ire_uinfo_null, 23316 NULL, 23317 NULL, 23318 ipst); 23319 23320 if (bcast_ire != NULL) { 23321 23322 if (ire_add(&bcast_ire, NULL, NULL, NULL, 23323 B_FALSE) == 0) { 23324 ip2dbg(("ip_cgtp_filter_bcast_add: " 23325 "added bcast_ire %p\n", 23326 (void *)bcast_ire)); 23327 23328 ipif_save_ire(bcast_ire->ire_ipif, 23329 bcast_ire); 23330 ire_refrele(bcast_ire); 23331 } 23332 } 23333 ire_refrele(ire_prim); 23334 } 23335 } 23336 23337 23338 /* 23339 * IP multirouting broadcast routes handling 23340 * Remove the broadcast ire 23341 */ 23342 static void 23343 ip_cgtp_bcast_delete(ire_t *ire, ip_stack_t *ipst) 23344 { 23345 ire_t *ire_dst; 23346 23347 ASSERT(ire != NULL); 23348 ire_dst = ire_ctable_lookup(ire->ire_addr, 0, IRE_BROADCAST, 23349 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 23350 if (ire_dst != NULL) { 23351 ire_t *ire_prim; 23352 23353 ire_prim = ire_ctable_lookup(ire->ire_gateway_addr, 0, 23354 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 23355 if (ire_prim != NULL) { 23356 ipif_t *ipif_prim; 23357 ire_t *bcast_ire; 23358 23359 ipif_prim = ire_prim->ire_ipif; 23360 23361 ip2dbg(("ip_cgtp_filter_bcast_delete: " 23362 "ire_dst %p, ire_prim %p, ipif_prim %p\n", 23363 (void *)ire_dst, (void *)ire_prim, 23364 (void *)ipif_prim)); 23365 23366 bcast_ire = ire_ctable_lookup(ire->ire_addr, 23367 ire->ire_gateway_addr, 
23368 IRE_BROADCAST, 23369 ipif_prim, ALL_ZONES, 23370 NULL, 23371 MATCH_IRE_TYPE | MATCH_IRE_GW | MATCH_IRE_IPIF | 23372 MATCH_IRE_MASK, ipst); 23373 23374 if (bcast_ire != NULL) { 23375 ip2dbg(("ip_cgtp_filter_bcast_delete: " 23376 "looked up bcast_ire %p\n", 23377 (void *)bcast_ire)); 23378 ipif_remove_ire(bcast_ire->ire_ipif, 23379 bcast_ire); 23380 ire_delete(bcast_ire); 23381 } 23382 ire_refrele(ire_prim); 23383 } 23384 ire_refrele(ire_dst); 23385 } 23386 } 23387 23388 /* 23389 * IPsec hardware acceleration capabilities related functions. 23390 */ 23391 23392 /* 23393 * Free a per-ill IPsec capabilities structure. 23394 */ 23395 static void 23396 ill_ipsec_capab_free(ill_ipsec_capab_t *capab) 23397 { 23398 if (capab->auth_hw_algs != NULL) 23399 kmem_free(capab->auth_hw_algs, capab->algs_size); 23400 if (capab->encr_hw_algs != NULL) 23401 kmem_free(capab->encr_hw_algs, capab->algs_size); 23402 if (capab->encr_algparm != NULL) 23403 kmem_free(capab->encr_algparm, capab->encr_algparm_size); 23404 kmem_free(capab, sizeof (ill_ipsec_capab_t)); 23405 } 23406 23407 /* 23408 * Allocate a new per-ill IPsec capabilities structure. This structure 23409 * is specific to an IPsec protocol (AH or ESP). It is implemented as 23410 * an array which specifies, for each algorithm, whether this algorithm 23411 * is supported by the ill or not. 
23412 */ 23413 static ill_ipsec_capab_t * 23414 ill_ipsec_capab_alloc(void) 23415 { 23416 ill_ipsec_capab_t *capab; 23417 uint_t nelems; 23418 23419 capab = kmem_zalloc(sizeof (ill_ipsec_capab_t), KM_NOSLEEP); 23420 if (capab == NULL) 23421 return (NULL); 23422 23423 /* we need one bit per algorithm */ 23424 nelems = MAX_IPSEC_ALGS / BITS(ipsec_capab_elem_t); 23425 capab->algs_size = nelems * sizeof (ipsec_capab_elem_t); 23426 23427 /* allocate memory to store algorithm flags */ 23428 capab->encr_hw_algs = kmem_zalloc(capab->algs_size, KM_NOSLEEP); 23429 if (capab->encr_hw_algs == NULL) 23430 goto nomem; 23431 capab->auth_hw_algs = kmem_zalloc(capab->algs_size, KM_NOSLEEP); 23432 if (capab->auth_hw_algs == NULL) 23433 goto nomem; 23434 /* 23435 * Leave encr_algparm NULL for now since we won't need it half 23436 * the time 23437 */ 23438 return (capab); 23439 23440 nomem: 23441 ill_ipsec_capab_free(capab); 23442 return (NULL); 23443 } 23444 23445 /* 23446 * Resize capability array. Since we're exclusive, this is OK. 23447 */ 23448 static boolean_t 23449 ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *capab, int algid) 23450 { 23451 ipsec_capab_algparm_t *nalp, *oalp; 23452 uint32_t olen, nlen; 23453 23454 oalp = capab->encr_algparm; 23455 olen = capab->encr_algparm_size; 23456 23457 if (oalp != NULL) { 23458 if (algid < capab->encr_algparm_end) 23459 return (B_TRUE); 23460 } 23461 23462 nlen = (algid + 1) * sizeof (*nalp); 23463 nalp = kmem_zalloc(nlen, KM_NOSLEEP); 23464 if (nalp == NULL) 23465 return (B_FALSE); 23466 23467 if (oalp != NULL) { 23468 bcopy(oalp, nalp, olen); 23469 kmem_free(oalp, olen); 23470 } 23471 capab->encr_algparm = nalp; 23472 capab->encr_algparm_size = nlen; 23473 capab->encr_algparm_end = algid + 1; 23474 23475 return (B_TRUE); 23476 } 23477 23478 /* 23479 * Compare the capabilities of the specified ill with the protocol 23480 * and algorithms specified by the SA passed as argument. 
23481 * If they match, returns B_TRUE, B_FALSE if they do not match. 23482 * 23483 * The ill can be passed as a pointer to it, or by specifying its index 23484 * and whether it is an IPv6 ill (ill_index and ill_isv6 arguments). 23485 * 23486 * Called by ipsec_out_is_accelerated() do decide whether an outbound 23487 * packet is eligible for hardware acceleration, and by 23488 * ill_ipsec_capab_send_all() to decide whether a SA must be sent down 23489 * to a particular ill. 23490 */ 23491 boolean_t 23492 ipsec_capab_match(ill_t *ill, uint_t ill_index, boolean_t ill_isv6, 23493 ipsa_t *sa, netstack_t *ns) 23494 { 23495 boolean_t sa_isv6; 23496 uint_t algid; 23497 struct ill_ipsec_capab_s *cpp; 23498 boolean_t need_refrele = B_FALSE; 23499 ip_stack_t *ipst = ns->netstack_ip; 23500 23501 if (ill == NULL) { 23502 ill = ill_lookup_on_ifindex(ill_index, ill_isv6, NULL, 23503 NULL, NULL, NULL, ipst); 23504 if (ill == NULL) { 23505 ip0dbg(("ipsec_capab_match: ill doesn't exist\n")); 23506 return (B_FALSE); 23507 } 23508 need_refrele = B_TRUE; 23509 } 23510 23511 /* 23512 * Use the address length specified by the SA to determine 23513 * if it corresponds to a IPv6 address, and fail the matching 23514 * if the isv6 flag passed as argument does not match. 23515 * Note: this check is used for SADB capability checking before 23516 * sending SA information to an ill. 23517 */ 23518 sa_isv6 = (sa->ipsa_addrfam == AF_INET6); 23519 if (sa_isv6 != ill_isv6) 23520 /* protocol mismatch */ 23521 goto done; 23522 23523 /* 23524 * Check if the ill supports the protocol, algorithm(s) and 23525 * key size(s) specified by the SA, and get the pointers to 23526 * the algorithms supported by the ill. 
23527 */ 23528 switch (sa->ipsa_type) { 23529 23530 case SADB_SATYPE_ESP: 23531 if (!(ill->ill_capabilities & ILL_CAPAB_ESP)) 23532 /* ill does not support ESP acceleration */ 23533 goto done; 23534 cpp = ill->ill_ipsec_capab_esp; 23535 algid = sa->ipsa_auth_alg; 23536 if (!IPSEC_ALG_IS_ENABLED(algid, cpp->auth_hw_algs)) 23537 goto done; 23538 algid = sa->ipsa_encr_alg; 23539 if (!IPSEC_ALG_IS_ENABLED(algid, cpp->encr_hw_algs)) 23540 goto done; 23541 if (algid < cpp->encr_algparm_end) { 23542 ipsec_capab_algparm_t *alp = &cpp->encr_algparm[algid]; 23543 if (sa->ipsa_encrkeybits < alp->minkeylen) 23544 goto done; 23545 if (sa->ipsa_encrkeybits > alp->maxkeylen) 23546 goto done; 23547 } 23548 break; 23549 23550 case SADB_SATYPE_AH: 23551 if (!(ill->ill_capabilities & ILL_CAPAB_AH)) 23552 /* ill does not support AH acceleration */ 23553 goto done; 23554 if (!IPSEC_ALG_IS_ENABLED(sa->ipsa_auth_alg, 23555 ill->ill_ipsec_capab_ah->auth_hw_algs)) 23556 goto done; 23557 break; 23558 } 23559 23560 if (need_refrele) 23561 ill_refrele(ill); 23562 return (B_TRUE); 23563 done: 23564 if (need_refrele) 23565 ill_refrele(ill); 23566 return (B_FALSE); 23567 } 23568 23569 23570 /* 23571 * Add a new ill to the list of IPsec capable ills. 23572 * Called from ill_capability_ipsec_ack() when an ACK was received 23573 * indicating that IPsec hardware processing was enabled for an ill. 23574 * 23575 * ill must point to the ill for which acceleration was enabled. 23576 * dl_cap must be set to DL_CAPAB_IPSEC_AH or DL_CAPAB_IPSEC_ESP. 
23577 */ 23578 static void 23579 ill_ipsec_capab_add(ill_t *ill, uint_t dl_cap, boolean_t sadb_resync) 23580 { 23581 ipsec_capab_ill_t **ills, *cur_ill, *new_ill; 23582 uint_t sa_type; 23583 uint_t ipproto; 23584 ip_stack_t *ipst = ill->ill_ipst; 23585 23586 ASSERT((dl_cap == DL_CAPAB_IPSEC_AH) || 23587 (dl_cap == DL_CAPAB_IPSEC_ESP)); 23588 23589 switch (dl_cap) { 23590 case DL_CAPAB_IPSEC_AH: 23591 sa_type = SADB_SATYPE_AH; 23592 ills = &ipst->ips_ipsec_capab_ills_ah; 23593 ipproto = IPPROTO_AH; 23594 break; 23595 case DL_CAPAB_IPSEC_ESP: 23596 sa_type = SADB_SATYPE_ESP; 23597 ills = &ipst->ips_ipsec_capab_ills_esp; 23598 ipproto = IPPROTO_ESP; 23599 break; 23600 } 23601 23602 rw_enter(&ipst->ips_ipsec_capab_ills_lock, RW_WRITER); 23603 23604 /* 23605 * Add ill index to list of hardware accelerators. If 23606 * already in list, do nothing. 23607 */ 23608 for (cur_ill = *ills; cur_ill != NULL && 23609 (cur_ill->ill_index != ill->ill_phyint->phyint_ifindex || 23610 cur_ill->ill_isv6 != ill->ill_isv6); cur_ill = cur_ill->next) 23611 ; 23612 23613 if (cur_ill == NULL) { 23614 /* if this is a new entry for this ill */ 23615 new_ill = kmem_zalloc(sizeof (ipsec_capab_ill_t), KM_NOSLEEP); 23616 if (new_ill == NULL) { 23617 rw_exit(&ipst->ips_ipsec_capab_ills_lock); 23618 return; 23619 } 23620 23621 new_ill->ill_index = ill->ill_phyint->phyint_ifindex; 23622 new_ill->ill_isv6 = ill->ill_isv6; 23623 new_ill->next = *ills; 23624 *ills = new_ill; 23625 } else if (!sadb_resync) { 23626 /* not resync'ing SADB and an entry exists for this ill */ 23627 rw_exit(&ipst->ips_ipsec_capab_ills_lock); 23628 return; 23629 } 23630 23631 rw_exit(&ipst->ips_ipsec_capab_ills_lock); 23632 23633 if (ipst->ips_ipcl_proto_fanout_v6[ipproto].connf_head != NULL) 23634 /* 23635 * IPsec module for protocol loaded, initiate dump 23636 * of the SADB to this ill. 23637 */ 23638 sadb_ill_download(ill, sa_type); 23639 } 23640 23641 /* 23642 * Remove an ill from the list of IPsec capable ills. 
23643 */ 23644 static void 23645 ill_ipsec_capab_delete(ill_t *ill, uint_t dl_cap) 23646 { 23647 ipsec_capab_ill_t **ills, *cur_ill, *prev_ill; 23648 ip_stack_t *ipst = ill->ill_ipst; 23649 23650 ASSERT(dl_cap == DL_CAPAB_IPSEC_AH || 23651 dl_cap == DL_CAPAB_IPSEC_ESP); 23652 23653 ills = (dl_cap == DL_CAPAB_IPSEC_AH) ? &ipst->ips_ipsec_capab_ills_ah : 23654 &ipst->ips_ipsec_capab_ills_esp; 23655 23656 rw_enter(&ipst->ips_ipsec_capab_ills_lock, RW_WRITER); 23657 23658 prev_ill = NULL; 23659 for (cur_ill = *ills; cur_ill != NULL && (cur_ill->ill_index != 23660 ill->ill_phyint->phyint_ifindex || cur_ill->ill_isv6 != 23661 ill->ill_isv6); prev_ill = cur_ill, cur_ill = cur_ill->next) 23662 ; 23663 if (cur_ill == NULL) { 23664 /* entry not found */ 23665 rw_exit(&ipst->ips_ipsec_capab_ills_lock); 23666 return; 23667 } 23668 if (prev_ill == NULL) { 23669 /* entry at front of list */ 23670 *ills = NULL; 23671 } else { 23672 prev_ill->next = cur_ill->next; 23673 } 23674 kmem_free(cur_ill, sizeof (ipsec_capab_ill_t)); 23675 rw_exit(&ipst->ips_ipsec_capab_ills_lock); 23676 } 23677 23678 /* 23679 * Called by SADB to send a DL_CONTROL_REQ message to every ill 23680 * supporting the specified IPsec protocol acceleration. 23681 * sa_type must be SADB_SATYPE_AH or SADB_SATYPE_ESP. 23682 * We free the mblk and, if sa is non-null, release the held referece. 23683 */ 23684 void 23685 ill_ipsec_capab_send_all(uint_t sa_type, mblk_t *mp, ipsa_t *sa, 23686 netstack_t *ns) 23687 { 23688 ipsec_capab_ill_t *ici, *cur_ici; 23689 ill_t *ill; 23690 mblk_t *nmp, *mp_ship_list = NULL, *next_mp; 23691 ip_stack_t *ipst = ns->netstack_ip; 23692 23693 ici = (sa_type == SADB_SATYPE_AH) ? 
ipst->ips_ipsec_capab_ills_ah : 23694 ipst->ips_ipsec_capab_ills_esp; 23695 23696 rw_enter(&ipst->ips_ipsec_capab_ills_lock, RW_READER); 23697 23698 for (cur_ici = ici; cur_ici != NULL; cur_ici = cur_ici->next) { 23699 ill = ill_lookup_on_ifindex(cur_ici->ill_index, 23700 cur_ici->ill_isv6, NULL, NULL, NULL, NULL, ipst); 23701 23702 /* 23703 * Handle the case where the ill goes away while the SADB is 23704 * attempting to send messages. If it's going away, it's 23705 * nuking its shadow SADB, so we don't care.. 23706 */ 23707 23708 if (ill == NULL) 23709 continue; 23710 23711 if (sa != NULL) { 23712 /* 23713 * Make sure capabilities match before 23714 * sending SA to ill. 23715 */ 23716 if (!ipsec_capab_match(ill, cur_ici->ill_index, 23717 cur_ici->ill_isv6, sa, ipst->ips_netstack)) { 23718 ill_refrele(ill); 23719 continue; 23720 } 23721 23722 mutex_enter(&sa->ipsa_lock); 23723 sa->ipsa_flags |= IPSA_F_HW; 23724 mutex_exit(&sa->ipsa_lock); 23725 } 23726 23727 /* 23728 * Copy template message, and add it to the front 23729 * of the mblk ship list. We want to avoid holding 23730 * the ipsec_capab_ills_lock while sending the 23731 * message to the ills. 23732 * 23733 * The b_next and b_prev are temporarily used 23734 * to build a list of mblks to be sent down, and to 23735 * save the ill to which they must be sent. 
23736 */ 23737 nmp = copymsg(mp); 23738 if (nmp == NULL) { 23739 ill_refrele(ill); 23740 continue; 23741 } 23742 ASSERT(nmp->b_next == NULL && nmp->b_prev == NULL); 23743 nmp->b_next = mp_ship_list; 23744 mp_ship_list = nmp; 23745 nmp->b_prev = (mblk_t *)ill; 23746 } 23747 23748 rw_exit(&ipst->ips_ipsec_capab_ills_lock); 23749 23750 for (nmp = mp_ship_list; nmp != NULL; nmp = next_mp) { 23751 /* restore the mblk to a sane state */ 23752 next_mp = nmp->b_next; 23753 nmp->b_next = NULL; 23754 ill = (ill_t *)nmp->b_prev; 23755 nmp->b_prev = NULL; 23756 23757 ill_dlpi_send(ill, nmp); 23758 ill_refrele(ill); 23759 } 23760 23761 if (sa != NULL) 23762 IPSA_REFRELE(sa); 23763 freemsg(mp); 23764 } 23765 23766 /* 23767 * Derive an interface id from the link layer address. 23768 * Knows about IEEE 802 and IEEE EUI-64 mappings. 23769 */ 23770 static boolean_t 23771 ip_ether_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr) 23772 { 23773 char *addr; 23774 23775 if (phys_length != ETHERADDRL) 23776 return (B_FALSE); 23777 23778 /* Form EUI-64 like address */ 23779 addr = (char *)&v6addr->s6_addr32[2]; 23780 bcopy((char *)phys_addr, addr, 3); 23781 addr[0] ^= 0x2; /* Toggle Universal/Local bit */ 23782 addr[3] = (char)0xff; 23783 addr[4] = (char)0xfe; 23784 bcopy((char *)phys_addr + 3, addr + 5, 3); 23785 return (B_TRUE); 23786 } 23787 23788 /* ARGSUSED */ 23789 static boolean_t 23790 ip_nodef_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr) 23791 { 23792 return (B_FALSE); 23793 } 23794 23795 /* ARGSUSED */ 23796 static boolean_t 23797 ip_ether_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr, 23798 uint32_t *hw_start, in6_addr_t *v6_extract_mask) 23799 { 23800 /* 23801 * Multicast address mappings used over Ethernet/802.X. 23802 * This address is used as a base for mappings. 
23803 */ 23804 static uint8_t ipv6_g_phys_multi_addr[] = {0x33, 0x33, 0x00, 23805 0x00, 0x00, 0x00}; 23806 23807 /* 23808 * Extract low order 32 bits from IPv6 multicast address. 23809 * Or that into the link layer address, starting from the 23810 * second byte. 23811 */ 23812 *hw_start = 2; 23813 v6_extract_mask->s6_addr32[0] = 0; 23814 v6_extract_mask->s6_addr32[1] = 0; 23815 v6_extract_mask->s6_addr32[2] = 0; 23816 v6_extract_mask->s6_addr32[3] = 0xffffffffU; 23817 bcopy(ipv6_g_phys_multi_addr, maddr, lla_length); 23818 return (B_TRUE); 23819 } 23820 23821 /* 23822 * Indicate by return value whether multicast is supported. If not, 23823 * this code should not touch/change any parameters. 23824 */ 23825 /* ARGSUSED */ 23826 static boolean_t 23827 ip_ether_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr, 23828 uint32_t *hw_start, ipaddr_t *extract_mask) 23829 { 23830 /* 23831 * Multicast address mappings used over Ethernet/802.X. 23832 * This address is used as a base for mappings. 23833 */ 23834 static uint8_t ip_g_phys_multi_addr[] = { 0x01, 0x00, 0x5e, 23835 0x00, 0x00, 0x00 }; 23836 23837 if (phys_length != ETHERADDRL) 23838 return (B_FALSE); 23839 23840 *extract_mask = htonl(0x007fffff); 23841 *hw_start = 2; 23842 bcopy(ip_g_phys_multi_addr, maddr, ETHERADDRL); 23843 return (B_TRUE); 23844 } 23845 23846 /* 23847 * Derive IPoIB interface id from the link layer address. 23848 */ 23849 static boolean_t 23850 ip_ib_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr) 23851 { 23852 char *addr; 23853 23854 if (phys_length != 20) 23855 return (B_FALSE); 23856 addr = (char *)&v6addr->s6_addr32[2]; 23857 bcopy(phys_addr + 12, addr, 8); 23858 /* 23859 * In IBA 1.1 timeframe, some vendors erroneously set the u/l bit 23860 * in the globally assigned EUI-64 GUID to 1, in violation of IEEE 23861 * rules. 
In these cases, the IBA considers these GUIDs to be in 23862 * "Modified EUI-64" format, and thus toggling the u/l bit is not 23863 * required; vendors are required not to assign global EUI-64's 23864 * that differ only in u/l bit values, thus guaranteeing uniqueness 23865 * of the interface identifier. Whether the GUID is in modified 23866 * or proper EUI-64 format, the ipv6 identifier must have the u/l 23867 * bit set to 1. 23868 */ 23869 addr[0] |= 2; /* Set Universal/Local bit to 1 */ 23870 return (B_TRUE); 23871 } 23872 23873 /* 23874 * Note on mapping from multicast IP addresses to IPoIB multicast link 23875 * addresses. IPoIB multicast link addresses are based on IBA link addresses. 23876 * The format of an IPoIB multicast address is: 23877 * 23878 * 4 byte QPN Scope Sign. Pkey 23879 * +--------------------------------------------+ 23880 * | 00FFFFFF | FF | 1X | X01B | Pkey | GroupID | 23881 * +--------------------------------------------+ 23882 * 23883 * The Scope and Pkey components are properties of the IBA port and 23884 * network interface. They can be ascertained from the broadcast address. 23885 * The Sign. part is the signature, and is 401B for IPv4 and 601B for IPv6. 23886 */ 23887 23888 static boolean_t 23889 ip_ib_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr, 23890 uint32_t *hw_start, in6_addr_t *v6_extract_mask) 23891 { 23892 /* 23893 * Base IPoIB IPv6 multicast address used for mappings. 23894 * Does not contain the IBA scope/Pkey values. 23895 */ 23896 static uint8_t ipv6_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff, 23897 0xff, 0x10, 0x60, 0x1b, 0x00, 0x00, 0x00, 0x00, 23898 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; 23899 23900 /* 23901 * Extract low order 80 bits from IPv6 multicast address. 23902 * Or that into the link layer address, starting from the 23903 * sixth byte. 
23904 */ 23905 *hw_start = 6; 23906 bcopy(ipv6_g_phys_ibmulti_addr, maddr, lla_length); 23907 23908 /* 23909 * Now fill in the IBA scope/Pkey values from the broadcast address. 23910 */ 23911 *(maddr + 5) = *(bphys_addr + 5); 23912 *(maddr + 8) = *(bphys_addr + 8); 23913 *(maddr + 9) = *(bphys_addr + 9); 23914 23915 v6_extract_mask->s6_addr32[0] = 0; 23916 v6_extract_mask->s6_addr32[1] = htonl(0x0000ffff); 23917 v6_extract_mask->s6_addr32[2] = 0xffffffffU; 23918 v6_extract_mask->s6_addr32[3] = 0xffffffffU; 23919 return (B_TRUE); 23920 } 23921 23922 static boolean_t 23923 ip_ib_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr, 23924 uint32_t *hw_start, ipaddr_t *extract_mask) 23925 { 23926 /* 23927 * Base IPoIB IPv4 multicast address used for mappings. 23928 * Does not contain the IBA scope/Pkey values. 23929 */ 23930 static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff, 23931 0xff, 0x10, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, 23932 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; 23933 23934 if (phys_length != sizeof (ipv4_g_phys_ibmulti_addr)) 23935 return (B_FALSE); 23936 23937 /* 23938 * Extract low order 28 bits from IPv4 multicast address. 23939 * Or that into the link layer address, starting from the 23940 * sixteenth byte. 23941 */ 23942 *extract_mask = htonl(0x0fffffff); 23943 *hw_start = 16; 23944 bcopy(ipv4_g_phys_ibmulti_addr, maddr, phys_length); 23945 23946 /* 23947 * Now fill in the IBA scope/Pkey values from the broadcast address. 23948 */ 23949 *(maddr + 5) = *(bphys_addr + 5); 23950 *(maddr + 8) = *(bphys_addr + 8); 23951 *(maddr + 9) = *(bphys_addr + 9); 23952 return (B_TRUE); 23953 } 23954 23955 /* 23956 * Returns B_TRUE if an ipif is present in the given zone, matching some flags 23957 * (typically IPIF_UP). If ipifp is non-null, the held ipif is returned there. 23958 * This works for both IPv4 and IPv6; if the passed-in ill is v6, the ipif with 23959 * the link-local address is preferred. 
23960 */ 23961 boolean_t 23962 ipif_lookup_zoneid(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp) 23963 { 23964 ipif_t *ipif; 23965 ipif_t *maybe_ipif = NULL; 23966 23967 mutex_enter(&ill->ill_lock); 23968 if (ill->ill_state_flags & ILL_CONDEMNED) { 23969 mutex_exit(&ill->ill_lock); 23970 if (ipifp != NULL) 23971 *ipifp = NULL; 23972 return (B_FALSE); 23973 } 23974 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 23975 if (!IPIF_CAN_LOOKUP(ipif)) 23976 continue; 23977 if (zoneid != ALL_ZONES && ipif->ipif_zoneid != zoneid && 23978 ipif->ipif_zoneid != ALL_ZONES) 23979 continue; 23980 if ((ipif->ipif_flags & flags) != flags) 23981 continue; 23982 23983 if (ipifp == NULL) { 23984 mutex_exit(&ill->ill_lock); 23985 ASSERT(maybe_ipif == NULL); 23986 return (B_TRUE); 23987 } 23988 if (!ill->ill_isv6 || 23989 IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6src_addr)) { 23990 ipif_refhold_locked(ipif); 23991 mutex_exit(&ill->ill_lock); 23992 *ipifp = ipif; 23993 return (B_TRUE); 23994 } 23995 if (maybe_ipif == NULL) 23996 maybe_ipif = ipif; 23997 } 23998 if (ipifp != NULL) { 23999 if (maybe_ipif != NULL) 24000 ipif_refhold_locked(maybe_ipif); 24001 *ipifp = maybe_ipif; 24002 } 24003 mutex_exit(&ill->ill_lock); 24004 return (maybe_ipif != NULL); 24005 } 24006 24007 /* 24008 * Same as ipif_lookup_zoneid() but looks at all the ills in the same group. 24009 */ 24010 boolean_t 24011 ipif_lookup_zoneid_group(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp) 24012 { 24013 ill_t *illg; 24014 ip_stack_t *ipst = ill->ill_ipst; 24015 24016 /* 24017 * We look at the passed-in ill first without grabbing ill_g_lock. 
24018 */ 24019 if (ipif_lookup_zoneid(ill, zoneid, flags, ipifp)) { 24020 return (B_TRUE); 24021 } 24022 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 24023 if (ill->ill_group == NULL) { 24024 /* ill not in a group */ 24025 rw_exit(&ipst->ips_ill_g_lock); 24026 return (B_FALSE); 24027 } 24028 24029 /* 24030 * There's no ipif in the zone on ill, however ill is part of an IPMP 24031 * group. We need to look for an ipif in the zone on all the ills in the 24032 * group. 24033 */ 24034 illg = ill->ill_group->illgrp_ill; 24035 do { 24036 /* 24037 * We don't call ipif_lookup_zoneid() on ill as we already know 24038 * that it's not there. 24039 */ 24040 if (illg != ill && 24041 ipif_lookup_zoneid(illg, zoneid, flags, ipifp)) { 24042 break; 24043 } 24044 } while ((illg = illg->ill_group_next) != NULL); 24045 rw_exit(&ipst->ips_ill_g_lock); 24046 return (illg != NULL); 24047 } 24048 24049 /* 24050 * Check if this ill is only being used to send ICMP probes for IPMP 24051 */ 24052 boolean_t 24053 ill_is_probeonly(ill_t *ill) 24054 { 24055 /* 24056 * Check if the interface is FAILED, or INACTIVE 24057 */ 24058 if (ill->ill_phyint->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE)) 24059 return (B_TRUE); 24060 24061 return (B_FALSE); 24062 } 24063 24064 /* 24065 * Return a pointer to an ipif_t given a combination of (ill_idx,ipif_id) 24066 * If a pointer to an ipif_t is returned then the caller will need to do 24067 * an ill_refrele(). 24068 * 24069 * If there is no real interface which matches the ifindex, then it looks 24070 * for a group that has a matching index. In the case of a group match the 24071 * lifidx must be zero. We don't need emulate the logical interfaces 24072 * since IP Filter's use of netinfo doesn't use that. 
24073 */ 24074 ipif_t * 24075 ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6, 24076 ip_stack_t *ipst) 24077 { 24078 ipif_t *ipif; 24079 ill_t *ill; 24080 24081 ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL, 24082 ipst); 24083 24084 if (ill == NULL) { 24085 /* Fallback to group names only if hook_emulation set */ 24086 if (!ipst->ips_ipmp_hook_emulation) 24087 return (NULL); 24088 24089 if (lifidx != 0) 24090 return (NULL); 24091 ill = ill_group_lookup_on_ifindex(ifindex, isv6, ipst); 24092 if (ill == NULL) 24093 return (NULL); 24094 } 24095 24096 mutex_enter(&ill->ill_lock); 24097 if (ill->ill_state_flags & ILL_CONDEMNED) { 24098 mutex_exit(&ill->ill_lock); 24099 ill_refrele(ill); 24100 return (NULL); 24101 } 24102 24103 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 24104 if (!IPIF_CAN_LOOKUP(ipif)) 24105 continue; 24106 if (lifidx == ipif->ipif_id) { 24107 ipif_refhold_locked(ipif); 24108 break; 24109 } 24110 } 24111 24112 mutex_exit(&ill->ill_lock); 24113 ill_refrele(ill); 24114 return (ipif); 24115 } 24116 24117 /* 24118 * Flush the fastpath by deleting any nce's that are waiting for the fastpath, 24119 * There is one exceptions IRE_BROADCAST are difficult to recreate, 24120 * so instead we just nuke their nce_fp_mp's; see ndp_fastpath_flush() 24121 * for details. 24122 */ 24123 void 24124 ill_fastpath_flush(ill_t *ill) 24125 { 24126 ip_stack_t *ipst = ill->ill_ipst; 24127 24128 nce_fastpath_list_dispatch(ill, NULL, NULL); 24129 ndp_walk_common((ill->ill_isv6 ? ipst->ips_ndp6 : ipst->ips_ndp4), 24130 ill, (pfi_t)ndp_fastpath_flush, NULL, B_TRUE); 24131 } 24132 24133 /* 24134 * Set the physical address information for `ill' to the contents of the 24135 * dl_notify_ind_t pointed to by `mp'. Must be called as writer, and will be 24136 * asynchronous if `ill' cannot immediately be quiesced -- in which case 24137 * EINPROGRESS will be returned. 
24138 */ 24139 int 24140 ill_set_phys_addr(ill_t *ill, mblk_t *mp) 24141 { 24142 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 24143 dl_notify_ind_t *dlindp = (dl_notify_ind_t *)mp->b_rptr; 24144 24145 ASSERT(IAM_WRITER_IPSQ(ipsq)); 24146 24147 if (dlindp->dl_data != DL_IPV6_LINK_LAYER_ADDR && 24148 dlindp->dl_data != DL_CURR_PHYS_ADDR) { 24149 /* Changing DL_IPV6_TOKEN is not yet supported */ 24150 return (0); 24151 } 24152 24153 /* 24154 * We need to store up to two copies of `mp' in `ill'. Due to the 24155 * design of ipsq_pending_mp_add(), we can't pass them as separate 24156 * arguments to ill_set_phys_addr_tail(). Instead, chain them 24157 * together here, then pull 'em apart in ill_set_phys_addr_tail(). 24158 */ 24159 if ((mp = copyb(mp)) == NULL || (mp->b_cont = copyb(mp)) == NULL) { 24160 freemsg(mp); 24161 return (ENOMEM); 24162 } 24163 24164 ipsq_current_start(ipsq, ill->ill_ipif, 0); 24165 24166 /* 24167 * If we can quiesce the ill, then set the address. If not, then 24168 * ill_set_phys_addr_tail() will be called from ipif_ill_refrele_tail(). 24169 */ 24170 ill_down_ipifs(ill, NULL, 0, B_FALSE); 24171 mutex_enter(&ill->ill_lock); 24172 if (!ill_is_quiescent(ill)) { 24173 /* call cannot fail since `conn_t *' argument is NULL */ 24174 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq, 24175 mp, ILL_DOWN); 24176 mutex_exit(&ill->ill_lock); 24177 return (EINPROGRESS); 24178 } 24179 mutex_exit(&ill->ill_lock); 24180 24181 ill_set_phys_addr_tail(ipsq, ill->ill_rq, mp, NULL); 24182 return (0); 24183 } 24184 24185 /* 24186 * Once the ill associated with `q' has quiesced, set its physical address 24187 * information to the values in `addrmp'. Note that two copies of `addrmp' 24188 * are passed (linked by b_cont), since we sometimes need to save two distinct 24189 * copies in the ill_t, and our context doesn't permit sleeping or allocation 24190 * failure (we'll free the other copy if it's not needed). 
Since the ill_t 24191 * is quiesced, we know any stale IREs with the old address information have 24192 * already been removed, so we don't need to call ill_fastpath_flush(). 24193 */ 24194 /* ARGSUSED */ 24195 static void 24196 ill_set_phys_addr_tail(ipsq_t *ipsq, queue_t *q, mblk_t *addrmp, void *dummy) 24197 { 24198 ill_t *ill = q->q_ptr; 24199 mblk_t *addrmp2 = unlinkb(addrmp); 24200 dl_notify_ind_t *dlindp = (dl_notify_ind_t *)addrmp->b_rptr; 24201 uint_t addrlen, addroff; 24202 24203 ASSERT(IAM_WRITER_IPSQ(ipsq)); 24204 24205 addroff = dlindp->dl_addr_offset; 24206 addrlen = dlindp->dl_addr_length - ABS(ill->ill_sap_length); 24207 24208 switch (dlindp->dl_data) { 24209 case DL_IPV6_LINK_LAYER_ADDR: 24210 ill_set_ndmp(ill, addrmp, addroff, addrlen); 24211 freemsg(addrmp2); 24212 break; 24213 24214 case DL_CURR_PHYS_ADDR: 24215 freemsg(ill->ill_phys_addr_mp); 24216 ill->ill_phys_addr = addrmp->b_rptr + addroff; 24217 ill->ill_phys_addr_mp = addrmp; 24218 ill->ill_phys_addr_length = addrlen; 24219 24220 if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV)) 24221 ill_set_ndmp(ill, addrmp2, addroff, addrlen); 24222 else 24223 freemsg(addrmp2); 24224 break; 24225 default: 24226 ASSERT(0); 24227 } 24228 24229 /* 24230 * If there are ipifs to bring up, ill_up_ipifs() will return 24231 * EINPROGRESS, and ipsq_current_finish() will be called by 24232 * ip_rput_dlpi_writer() or ip_arp_done() when the last ipif is 24233 * brought up. 24234 */ 24235 if (ill_up_ipifs(ill, q, addrmp) != EINPROGRESS) 24236 ipsq_current_finish(ipsq); 24237 } 24238 24239 /* 24240 * Helper routine for setting the ill_nd_lla fields. 
24241 */ 24242 void 24243 ill_set_ndmp(ill_t *ill, mblk_t *ndmp, uint_t addroff, uint_t addrlen) 24244 { 24245 freemsg(ill->ill_nd_lla_mp); 24246 ill->ill_nd_lla = ndmp->b_rptr + addroff; 24247 ill->ill_nd_lla_mp = ndmp; 24248 ill->ill_nd_lla_len = addrlen; 24249 } 24250 24251 major_t IP_MAJ; 24252 #define IP "ip" 24253 24254 #define UDP6DEV "/devices/pseudo/udp6@0:udp6" 24255 #define UDPDEV "/devices/pseudo/udp@0:udp" 24256 24257 /* 24258 * Issue REMOVEIF ioctls to have the loopback interfaces 24259 * go away. Other interfaces are either I_LINKed or I_PLINKed; 24260 * the former going away when the user-level processes in the zone 24261 * are killed * and the latter are cleaned up by the stream head 24262 * str_stack_shutdown callback that undoes all I_PLINKs. 24263 */ 24264 void 24265 ip_loopback_cleanup(ip_stack_t *ipst) 24266 { 24267 int error; 24268 ldi_handle_t lh = NULL; 24269 ldi_ident_t li = NULL; 24270 int rval; 24271 cred_t *cr; 24272 struct strioctl iocb; 24273 struct lifreq lifreq; 24274 24275 IP_MAJ = ddi_name_to_major(IP); 24276 24277 #ifdef NS_DEBUG 24278 (void) printf("ip_loopback_cleanup() stackid %d\n", 24279 ipst->ips_netstack->netstack_stackid); 24280 #endif 24281 24282 bzero(&lifreq, sizeof (lifreq)); 24283 (void) strcpy(lifreq.lifr_name, ipif_loopback_name); 24284 24285 error = ldi_ident_from_major(IP_MAJ, &li); 24286 if (error) { 24287 #ifdef DEBUG 24288 printf("ip_loopback_cleanup: lyr ident get failed error %d\n", 24289 error); 24290 #endif 24291 return; 24292 } 24293 24294 cr = zone_get_kcred(netstackid_to_zoneid( 24295 ipst->ips_netstack->netstack_stackid)); 24296 ASSERT(cr != NULL); 24297 error = ldi_open_by_name(UDP6DEV, FREAD|FWRITE, cr, &lh, li); 24298 if (error) { 24299 #ifdef DEBUG 24300 printf("ip_loopback_cleanup: open of UDP6DEV failed error %d\n", 24301 error); 24302 #endif 24303 goto out; 24304 } 24305 iocb.ic_cmd = SIOCLIFREMOVEIF; 24306 iocb.ic_timout = 15; 24307 iocb.ic_len = sizeof (lifreq); 24308 iocb.ic_dp = (char 
*)&lifreq; 24309 24310 error = ldi_ioctl(lh, I_STR, (intptr_t)&iocb, FKIOCTL, cr, &rval); 24311 /* LINTED - statement has no consequent */ 24312 if (error) { 24313 #ifdef NS_DEBUG 24314 printf("ip_loopback_cleanup: ioctl SIOCLIFREMOVEIF failed on " 24315 "UDP6 error %d\n", error); 24316 #endif 24317 } 24318 (void) ldi_close(lh, FREAD|FWRITE, cr); 24319 lh = NULL; 24320 24321 error = ldi_open_by_name(UDPDEV, FREAD|FWRITE, cr, &lh, li); 24322 if (error) { 24323 #ifdef NS_DEBUG 24324 printf("ip_loopback_cleanup: open of UDPDEV failed error %d\n", 24325 error); 24326 #endif 24327 goto out; 24328 } 24329 24330 iocb.ic_cmd = SIOCLIFREMOVEIF; 24331 iocb.ic_timout = 15; 24332 iocb.ic_len = sizeof (lifreq); 24333 iocb.ic_dp = (char *)&lifreq; 24334 24335 error = ldi_ioctl(lh, I_STR, (intptr_t)&iocb, FKIOCTL, cr, &rval); 24336 /* LINTED - statement has no consequent */ 24337 if (error) { 24338 #ifdef NS_DEBUG 24339 printf("ip_loopback_cleanup: ioctl SIOCLIFREMOVEIF failed on " 24340 "UDP error %d\n", error); 24341 #endif 24342 } 24343 (void) ldi_close(lh, FREAD|FWRITE, cr); 24344 lh = NULL; 24345 24346 out: 24347 /* Close layered handles */ 24348 if (lh) 24349 (void) ldi_close(lh, FREAD|FWRITE, cr); 24350 if (li) 24351 ldi_ident_release(li); 24352 24353 crfree(cr); 24354 } 24355